Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .github/workflows/ci-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,11 @@ jobs:
python-version: ${{ matrix.python-version }}
cache: 'pip'
- name: Cleanup more disk space
run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY"
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
26 changes: 15 additions & 11 deletions grobid_client/format/TEI2LossyJSON.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
import uuid
from collections import OrderedDict
from concurrent.futures import ProcessPoolExecutor, as_completed
import html
import re
from pathlib import Path
from typing import Dict, Union, BinaryIO, Iterator

Expand Down Expand Up @@ -41,8 +43,15 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO], stream: bool = False
If stream=False returns the full document dict (same shape as original function).
"""
# Load with BeautifulSoup but avoid building huge structures when streaming
with open(tei_file, 'r') as f:
content = f.read()
if hasattr(tei_file, 'read'):
# File-like object (BinaryIO/StringIO)
content = tei_file.read()
if isinstance(content, bytes):
content = content.decode('utf-8')
else:
# Path-like object
with open(tei_file, 'r', encoding='utf-8') as f:
content = f.read()
soup = BeautifulSoup(content, 'xml')

if soup.TEI is None:
Expand Down Expand Up @@ -222,7 +231,6 @@ def _extract_comprehensive_reference_data(self, bibl_struct: Tag, index: int) ->
Extract detailed bibliographic information from TEI biblStruct elements.
Implements comprehensive parsing for all standard TEI bibliographic components.
"""
import re

citation_data = OrderedDict()
citation_data['id'] = f"b{index}"
Expand Down Expand Up @@ -430,7 +438,6 @@ def _process_pointer_element(self, pointer_element: Tag, link_references: list):

def _process_imprint_details(self, imprint_element: Tag, publication_metadata: Dict):
"""Extract and process imprint information including publisher, dates, and page ranges."""
import re

# Extract publisher information
publisher_elements = imprint_element.find_all("publisher")
Expand Down Expand Up @@ -557,7 +564,6 @@ def _extract_person_data(self, person_element: Tag) -> Dict:
Extract person data (author/editor) from TEI persName or author elements.
Handles various name formats and affiliations.
"""
import re

person_data = {}

Expand Down Expand Up @@ -628,11 +634,9 @@ def _clean_text(self, text: str) -> str:
text = text.decode('utf-8', errors='ignore')

# Normalize whitespace and strip
import re
text = re.sub(r'\s+', ' ', text.strip())

# Remove any potential XML/HTML entities
import html
text = html.unescape(text)

return text
Expand Down Expand Up @@ -740,15 +744,15 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, sentence)
if self.validate_refs:
for ref in struct['refs']:
assert "Wrong offsets", ref['offset_start'] < ref['offset_end']
assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text']
assert ref['offset_start'] < ref['offset_end'], "Wrong offsets"
assert struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'], "Cannot apply offsets"
yield struct
else:
struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, p)
if self.validate_refs:
for ref in struct['refs']:
assert "Wrong offsets", ref['offset_start'] < ref['offset_end']
assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text']
assert ref['offset_start'] < ref['offset_end'], "Wrong offsets"
assert struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'], "Cannot apply offsets"
yield struct

# Update head_paragraph for potential next div
Expand Down
34 changes: 23 additions & 11 deletions grobid_client/format/TEI2Markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@
- Annex
- References
"""
import os
import uuid
import re
from pathlib import Path
from typing import List, Dict, Union, Optional, BinaryIO
from bs4 import BeautifulSoup, NavigableString, Tag
Expand Down Expand Up @@ -44,9 +43,12 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO]) -> Optional[str]:
try:
# Load with BeautifulSoup
if isinstance(tei_file, (str, Path)):
content = open(tei_file, 'r', encoding='utf-8').read()
with open(tei_file, 'r', encoding='utf-8') as f:
content = f.read()
else:
content = tei_file.read()
if isinstance(content, bytes):
content = content.decode('utf-8')

soup = BeautifulSoup(content, 'xml')

Expand Down Expand Up @@ -77,7 +79,7 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO]) -> Optional[str]:
# Extract publication date
pub_date = self._extract_publication_date(soup)
if pub_date:
markdown_sections.append(f"Publishd on {pub_date}\n\n")
markdown_sections.append(f"Published on {pub_date}\n\n")

# Extract abstract
abstract = self._extract_abstract(soup)
Expand Down Expand Up @@ -511,17 +513,25 @@ def _process_imprint_section(self, imprint: Tag, bib_data: dict) -> None:
unit = bibl_scope.get("unit", "").lower()
text = bibl_scope.get_text().strip()

if unit == "vol" and text:
if unit in ["vol", "volume"] and text:
bib_data['volume'] = text
elif unit == "issue" and text:
bib_data['issue'] = text
elif unit == "page" and text:
# Handle page ranges
if "from" in bibl_scope.attrs:
bib_data['pages'] = f"{text}-"
elif "to" in bibl_scope.attrs and bib_data.get('pages'):
bib_data['pages'] += text
else:
from_val = bibl_scope.get("from")
to_val = bibl_scope.get("to")
if from_val and to_val:
# Both from and to in same element
bib_data['pages'] = f"{from_val}-{to_val}"
elif from_val:
# Only from specified, may get combined with another element
bib_data['pages'] = f"{from_val}-"
elif to_val and bib_data.get('pages'):
# Only to specified, append to existing from
bib_data['pages'] = bib_data['pages'] + to_val
elif text and not bib_data.get('pages'):
# Plain text, no from/to attributes
bib_data['pages'] = text

def _extract_author_info(self, author: Tag) -> dict:
Expand Down Expand Up @@ -629,6 +639,9 @@ def _build_publication_details(self, ref_data: dict) -> str:
"""Build publication details string from extracted data."""
details = []

if ref_data.get('year'):
details.append(f"({ref_data['year']})")

if ref_data.get('volume'):
details.append(ref_data['volume'])

Expand Down Expand Up @@ -684,7 +697,6 @@ def _extract_raw_reference(self, bibl_struct: Tag) -> str:
raw_text = bibl_struct.get_text().strip()

# Remove reference number if present
import re
raw_text = re.sub(r'^\[\d+\]\s*', '', raw_text)

# Clean up excessive whitespace
Expand Down