Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 47 additions & 11 deletions grobid_client/format/TEI2LossyJSON.py
Original file line number Diff line number Diff line change
Expand Up @@ -677,6 +677,7 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
"""
Process a div and its nested content, handling various back section types.
Supports nested divs for complex back sections like annex with multiple subsections.
Also handles formula elements that are direct children of divs.
"""
head = div.find("head")
p_nodes = div.find_all("p")
Expand All @@ -691,10 +692,12 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
if child.name == "div" or child.name.endswith(":div"):
nested_divs.append(child)

# Count only direct child paragraphs, not those in nested divs
# Count only direct child paragraphs and formulas, not those in nested divs
direct_p_nodes = [child for child in div.children if hasattr(child, 'name') and child.name == "p"]
direct_formula_nodes = [child for child in div.children if hasattr(child, 'name') and child.name == "formula"]
has_direct_content = len(direct_p_nodes) > 0 or len(direct_formula_nodes) > 0

if len(nested_divs) > 0 and len(direct_p_nodes) == 0:
if len(nested_divs) > 0 and not has_direct_content:
# This is a container div - process each nested div independently
for nested_div in nested_divs:
# Skip references divs
Expand All @@ -707,11 +710,11 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa

# Determine the section header and content type for divs with content
if head:
if len(direct_p_nodes) == 0:
# This div has only a head, no paragraphs (standalone head)
if not has_direct_content:
# This div has only a head, no paragraphs or formulas (standalone head)
current_head_paragraph = self._clean_text(head.get_text())
else:
# This div has both head and paragraphs - head is the section header
# This div has both head and content - head is the section header
head_section = self._clean_text(head.get_text())
else:
# If no head element, try to use the type attribute as head_section
Expand All @@ -726,35 +729,68 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
head_section = "Author Contributions"
elif div_type == "availability":
# Only set as default if this div has its own content
if len(direct_p_nodes) > 0:
if has_direct_content:
head_section = "Data Availability"
elif div_type == "annex":
head_section = "Annex"
else:
# Generic handling - capitalize and format
head_section = div_type.replace("_", " ").title()

# Process paragraphs in this div
if len(direct_p_nodes) > 0:
for id_p, p in enumerate(direct_p_nodes):
# Process direct children (paragraphs and formulas) in document order
for child in div.children:
if not hasattr(child, 'name') or not child.name:
continue

if child.name == "p":
paragraph_id = get_random_id(prefix="p_")

if passage_level == "sentence":
for id_s, sentence in enumerate(p.find_all("s")):
for id_s, sentence in enumerate(child.find_all("s")):
struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, sentence)
if self.validate_refs:
for ref in struct['refs']:
assert ref['offset_start'] < ref['offset_end'], "Wrong offsets"
assert struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'], "Cannot apply offsets"
yield struct
else:
struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, p)
struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, child)
if self.validate_refs:
for ref in struct['refs']:
assert ref['offset_start'] < ref['offset_end'], "Wrong offsets"
assert struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'], "Cannot apply offsets"
yield struct

elif child.name == "formula":
# Process formula elements as passages
formula_id = get_random_id(prefix="f_")
formula_text = self._clean_text(child.get_text())

if formula_text:
# Create a passage structure for the formula
formula_passage = {
"id": formula_id,
"text": formula_text,
"coords": [
box_to_dict(coord.split(","))
for coord in child.get("coords", "").split(";")
] if child.has_attr("coords") else [],
"refs": [],
"type": "formula"
}

if current_head_paragraph or head_paragraph:
formula_passage["head_paragraph"] = current_head_paragraph or head_paragraph
if head_section:
formula_passage["head_section"] = head_section

# Extract formula label if present
label = child.find("label")
if label:
formula_passage["label"] = self._clean_text(label.get_text())

yield formula_passage

# Update head_paragraph for potential next div
if current_head_paragraph is not None:
head_paragraph = current_head_paragraph
Expand Down
73 changes: 59 additions & 14 deletions grobid_client/format/TEI2Markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,14 +213,24 @@ def _extract_fulltext(self, soup: BeautifulSoup) -> str:
head = div.find("head")
if head:
section_title = head.get_text().strip()
fulltext_sections.append(f"### {section_title}\n")

# Get paragraphs
paragraphs = div.find_all("p")
for p in paragraphs:
paragraph_text = self._process_paragraph(p)
if paragraph_text.strip():
fulltext_sections.append(f"{paragraph_text}\n\n")
if section_title:
fulltext_sections.append(f"### {section_title}\n")

# Process direct children of the div in document order
# This captures paragraphs, formulas, and other elements as they appear
for child in div.children:
if not hasattr(child, 'name') or not child.name:
continue

if child.name == "p":
paragraph_text = self._process_paragraph(child)
if paragraph_text.strip():
fulltext_sections.append(f"{paragraph_text}\n\n")
elif child.name == "formula":
# Handle formula elements - extract text and optional label
formula_text = self._process_formula(child)
if formula_text.strip():
fulltext_sections.append(f"{formula_text}\n\n")

return "".join(fulltext_sections)

Expand Down Expand Up @@ -272,16 +282,23 @@ def _process_div_and_nested_divs(self, div: Tag, annex_sections: list) -> None:
if header_text not in annex_sections:
annex_sections.append(header_text)

# Process paragraphs that are direct children of this div (not in nested divs)
# Process direct children of this div in document order
# This captures paragraphs, formulas, and other elements as they appear
for child in div.children:
if hasattr(child, 'name') and child.name == "p":
if not hasattr(child, 'name') or not child.name:
continue

if child.name == "p":
paragraph_text = self._process_paragraph(child)
if paragraph_text.strip():
annex_sections.append(f"{paragraph_text}\n\n")

# Process nested div elements
for child in div.children:
if hasattr(child, 'name') and child.name == "div":
elif child.name == "formula":
# Handle formula elements
formula_text = self._process_formula(child)
if formula_text.strip():
annex_sections.append(f"{formula_text}\n\n")
elif child.name == "div":
# Process nested div elements
self._process_div_and_nested_divs(child, annex_sections)

def _extract_references(self, soup: BeautifulSoup) -> str:
Expand Down Expand Up @@ -338,6 +355,34 @@ def _process_paragraph(self, p_element: Tag) -> str:

return "".join(text_parts).strip()

def _process_formula(self, formula_element: Tag) -> str:
    """Render a TEI <formula> element as markdown.

    The formula body is emitted as italicized text; an optional <label>
    child (e.g. an equation number such as "(1)") is appended after it.
    Returns an empty string when the formula carries no textual content.
    """
    body_parts = []
    label = ""

    for node in formula_element.children:
        if hasattr(node, 'name') and node.name == "label":
            # Equation label — kept out of the formula body itself.
            label = node.get_text().strip()
        elif isinstance(node, NavigableString):
            body_parts.append(str(node))
        else:
            # Any other nested element contributes its plain text.
            body_parts.append(node.get_text())

    body = "".join(body_parts).strip()
    if not body:
        return ""
    # Emit as: *formula text* (label), dropping the label part when absent.
    return f"*{body}* {label}" if label else f"*{body}*"

def _table_to_markdown(self, table_element: Tag) -> str:
"""Convert a table element to simple markdown."""
markdown_lines = []
Expand Down
87 changes: 87 additions & 0 deletions tests/test_conversions.py
Original file line number Diff line number Diff line change
Expand Up @@ -622,6 +622,92 @@ def test_offset_validation_for_specific_references(self):
else:
print("No offset differences detected between conversion and expected output")

def test_formula_extraction_in_json(self):
    """Formula elements should appear as dedicated passages in the JSON output."""
    from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter

    # This TEI resource is known to contain (at least) two formulas.
    tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')

    json_data = TEI2LossyJSONConverter().convert_tei_file(tei_file, stream=False)

    formulas = [
        passage for passage in json_data.get('body_text', [])
        if passage.get('type') == 'formula'
    ]

    assert len(formulas) >= 2, "Should extract at least 2 formulas from test file"

    for passage in formulas:
        # Every formula passage must carry an id and non-empty text.
        assert 'text' in passage, "Formula should have text"
        assert 'id' in passage, "Formula should have id"
        assert passage['text'].strip(), "Formula text should not be empty"
        # Labels are optional, but when present they must be non-empty.
        if 'label' in passage:
            assert passage['label'].strip(), "Formula label should not be empty"

    # Spot-check a known formula fragment from the resource.
    assert any(
        'Fext' in passage.get('text', '') for passage in formulas
    ), "Should extract formula containing 'Fext'"

def test_formula_extraction_in_markdown(self):
    """Formula content should survive conversion to Markdown."""
    from grobid_client.format.TEI2Markdown import TEI2MarkdownConverter

    # This TEI resource is known to contain formulas.
    tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')

    markdown = TEI2MarkdownConverter().convert_tei_file(tei_file)

    # The formula text must be present and rendered in italics (*...*).
    assert 'Fext' in markdown, "Markdown should contain formula text 'Fext'"
    assert '*Fext' in markdown, "Formula should be italicized in Markdown"

def test_header_only_div_in_json(self):
    """Section headers from divs without paragraphs should reach the JSON output."""
    from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter

    tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')

    json_data = TEI2LossyJSONConverter().convert_tei_file(tei_file, stream=False)

    # Gather every distinct, non-empty head_section seen in the body passages.
    section_headers = {
        passage['head_section']
        for passage in json_data.get('body_text', [])
        if passage.get('head_section')
    }

    # The resource contains a header-only "Competing interests" div.
    assert 'Competing interests' in section_headers, "Should include 'Competing interests' header"

    # Sanity-check that section extraction covers the document broadly.
    assert len(section_headers) >= 10, f"Should extract many section headers, got {len(section_headers)}"

def test_header_only_div_in_markdown(self):
    """Header-only divs should be rendered as Markdown section headers."""
    from grobid_client.format.TEI2Markdown import TEI2MarkdownConverter

    tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')

    markdown = TEI2MarkdownConverter().convert_tei_file(tei_file)

    # The header text must be present...
    assert 'Acknowledgements' in markdown, "Markdown should contain 'Acknowledgements' header"
    # ...and emitted as an actual level-3 Markdown heading, not plain text.
    assert '### Acknowledgements' in markdown, "Acknowledgements should be formatted as Markdown header"

def test_conversion_JSON(self):
from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter
Expand All @@ -648,3 +734,4 @@ def test_conversion_JSON(self):
actual_text = paragraph_text[offset_start:offset_end]
assert actual_text == ref_text, f"Reference text at offsets ({offset_start}-{offset_end}) should match '{ref_text}' but got '{actual_text}'"