From 96663defdfd5d09bb0c28d37f9e27600e47b5bb8 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 22 Dec 2025 15:00:01 +0000 Subject: [PATCH] feat: Refactor TEI processing for improved file handling and reference parsing, enhance CI cleanup --- .github/workflows/ci-build.yml | 6 ++++- grobid_client/format/TEI2LossyJSON.py | 26 +++++++++++--------- grobid_client/format/TEI2Markdown.py | 34 ++++++++++++++++++--------- 3 files changed, 43 insertions(+), 23 deletions(-) diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml index 2c7cd25..595a2d3 100644 --- a/.github/workflows/ci-build.yml +++ b/.github/workflows/ci-build.yml @@ -22,7 +22,11 @@ jobs: python-version: ${{ matrix.python-version }} cache: 'pip' - name: Cleanup more disk space - run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY" + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/grobid_client/format/TEI2LossyJSON.py b/grobid_client/format/TEI2LossyJSON.py index 76154ee..41d194a 100644 --- a/grobid_client/format/TEI2LossyJSON.py +++ b/grobid_client/format/TEI2LossyJSON.py @@ -9,6 +9,8 @@ import uuid from collections import OrderedDict from concurrent.futures import ProcessPoolExecutor, as_completed +import html +import re from pathlib import Path from typing import Dict, Union, BinaryIO, Iterator @@ -41,8 +43,15 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO], stream: bool = False If stream=False returns the full document dict (same shape as original function). """ # Load with BeautifulSoup but avoid building huge structures when streaming - with open(tei_file, 'r') as f: - content = f.read() + if hasattr(tei_file, 'read'): + # File-like object (BinaryIO/StringIO) + content = tei_file.read() + if isinstance(content, bytes): + content = content.decode('utf-8') + else: + # Path-like object + with open(tei_file, 'r', encoding='utf-8') as f: + content = f.read() soup = BeautifulSoup(content, 'xml') if soup.TEI is None: @@ -222,7 +231,6 @@ def _extract_comprehensive_reference_data(self, bibl_struct: Tag, index: int) -> Extract detailed bibliographic information from TEI biblStruct elements. Implements comprehensive parsing for all standard TEI bibliographic components. """ - import re citation_data = OrderedDict() citation_data['id'] = f"b{index}" @@ -430,7 +438,6 @@ def _process_pointer_element(self, pointer_element: Tag, link_references: list): def _process_imprint_details(self, imprint_element: Tag, publication_metadata: Dict): """Extract and process imprint information including publisher, dates, and page ranges.""" - import re # Extract publisher information publisher_elements = imprint_element.find_all("publisher") @@ -557,7 +564,6 @@ def _extract_person_data(self, person_element: Tag) -> Dict: Extract person data (author/editor) from TEI persName or author elements. Handles various name formats and affiliations. 
""" - import re person_data = {} @@ -628,11 +634,9 @@ def _clean_text(self, text: str) -> str: text = text.decode('utf-8', errors='ignore') # Normalize whitespace and strip - import re text = re.sub(r'\s+', ' ', text.strip()) # Remove any potential XML/HTML entities - import html text = html.unescape(text) return text @@ -740,15 +744,15 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, sentence) if self.validate_refs: for ref in struct['refs']: - assert "Wrong offsets", ref['offset_start'] < ref['offset_end'] - assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'] + assert ref['offset_start'] < ref['offset_end'], "Wrong offsets" + assert struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'], "Cannot apply offsets" yield struct else: struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, p) if self.validate_refs: for ref in struct['refs']: - assert "Wrong offsets", ref['offset_start'] < ref['offset_end'] - assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'] + assert ref['offset_start'] < ref['offset_end'], "Wrong offsets" + assert struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'], "Cannot apply offsets" yield struct # Update head_paragraph for potential next div diff --git a/grobid_client/format/TEI2Markdown.py b/grobid_client/format/TEI2Markdown.py index 167231e..228f527 100644 --- a/grobid_client/format/TEI2Markdown.py +++ b/grobid_client/format/TEI2Markdown.py @@ -11,8 +11,7 @@ - Annex - References """ -import os -import uuid +import re from pathlib import Path from typing import List, Dict, Union, Optional, BinaryIO from bs4 import BeautifulSoup, NavigableString, Tag @@ -44,9 +43,12 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO]) -> Optional[str]: try: # Load with BeautifulSoup if isinstance(tei_file, (str, Path)): - content = open(tei_file, 'r', encoding='utf-8').read() + with open(tei_file, 'r', encoding='utf-8') as f: + content = f.read() else: content = tei_file.read() + if isinstance(content, bytes): + content = content.decode('utf-8') soup = BeautifulSoup(content, 'xml') @@ -77,7 +79,7 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO]) -> Optional[str]: # Extract publication date pub_date = self._extract_publication_date(soup) if pub_date: - markdown_sections.append(f"Publishd on {pub_date}\n\n") + markdown_sections.append(f"Published on {pub_date}\n\n") # Extract abstract abstract = self._extract_abstract(soup) @@ -511,17 +513,25 @@ def _process_imprint_section(self, imprint: Tag, bib_data: dict) -> None: unit = bibl_scope.get("unit", "").lower() text = bibl_scope.get_text().strip() - if unit == "vol" and text: + if unit in ["vol", "volume"] and text: bib_data['volume'] = text elif unit == "issue" and text: bib_data['issue'] = text elif unit == "page" and text: # Handle page ranges - if "from" in bibl_scope.attrs: - bib_data['pages'] = f"{text}-" - elif "to" in bibl_scope.attrs and bib_data.get('pages'): - bib_data['pages'] += text - else: + from_val = bibl_scope.get("from") + to_val = bibl_scope.get("to") + if from_val and to_val: + # Both from and to in same element + bib_data['pages'] = f"{from_val}-{to_val}" + elif from_val: + # Only from specified, may get combined with another element + bib_data['pages'] = f"{from_val}-" + elif to_val and 
bib_data.get('pages'): + # Only to specified, append to existing from + bib_data['pages'] = bib_data['pages'] + to_val + elif text and not bib_data.get('pages'): + # Plain text, no from/to attributes bib_data['pages'] = text def _extract_author_info(self, author: Tag) -> dict: @@ -629,6 +639,9 @@ def _build_publication_details(self, ref_data: dict) -> str: """Build publication details string from extracted data.""" details = [] + if ref_data.get('year'): + details.append(f"({ref_data['year']})") + if ref_data.get('volume'): details.append(ref_data['volume']) @@ -684,7 +697,6 @@ def _extract_raw_reference(self, bibl_struct: Tag) -> str: raw_text = bibl_struct.get_text().strip() # Remove reference number if present - import re raw_text = re.sub(r'^\[\d+\]\s*', '', raw_text) # Clean up excessive whitespace
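
Note on the assert reordering in TEI2LossyJSON.py: the old form `assert "Wrong offsets", cond`
asserts a non-empty string literal (always truthy) and treats cond as the failure message, so the
check could never fire; `assert cond, "Wrong offsets"` restores the intended validation.

Below is a minimal usage sketch of the reworked file handling, which now accepts either a path or
an open file-like object and decodes bytes payloads as UTF-8. The class name TEI2Markdown and the
file article.tei.xml are assumptions for illustration only; the diff shows the convert_tei_file
method bodies, not the class definitions or constructors.

    from io import BytesIO
    from pathlib import Path

    # Assumed class name/constructor; the patch shows only the method bodies.
    from grobid_client.format.TEI2Markdown import TEI2Markdown

    converter = TEI2Markdown()

    # Path-like input: opened internally with encoding='utf-8'.
    md_from_path = converter.convert_tei_file(Path("article.tei.xml"))

    # File-like input: read() is used directly; bytes are decoded as UTF-8.
    with open("article.tei.xml", "rb") as fh:
        md_from_stream = converter.convert_tei_file(BytesIO(fh.read()))

With the new page-range handling, a text-bearing element such as
<biblScope unit="page" from="12" to="19">12-19</biblScope> yields pages "12-19" taken from its
attributes, while a bare <biblScope unit="page">233</biblScope> falls through to the plain-text
branch and yields "233".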