From 4ebff29a532c13add0e445a44c1a6483fc277c11 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 23 Dec 2025 08:50:37 +0000 Subject: [PATCH] feat: Add support for processing and converting TEI formula elements to LossyJSON and Markdown, including new tests. --- grobid_client/format/TEI2LossyJSON.py | 58 ++++++++++++++---- grobid_client/format/TEI2Markdown.py | 73 +++++++++++++++++----- tests/test_conversions.py | 87 +++++++++++++++++++++++++++ 3 files changed, 193 insertions(+), 25 deletions(-) diff --git a/grobid_client/format/TEI2LossyJSON.py b/grobid_client/format/TEI2LossyJSON.py index 41d194a..2e40ae1 100644 --- a/grobid_client/format/TEI2LossyJSON.py +++ b/grobid_client/format/TEI2LossyJSON.py @@ -677,6 +677,7 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa """ Process a div and its nested content, handling various back section types. Supports nested divs for complex back sections like annex with multiple subsections. + Also handles formula elements that are direct children of divs. 
""" head = div.find("head") p_nodes = div.find_all("p") @@ -691,10 +692,12 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa if child.name == "div" or child.name.endswith(":div"): nested_divs.append(child) - # Count only direct child paragraphs, not those in nested divs + # Count only direct child paragraphs and formulas, not those in nested divs direct_p_nodes = [child for child in div.children if hasattr(child, 'name') and child.name == "p"] + direct_formula_nodes = [child for child in div.children if hasattr(child, 'name') and child.name == "formula"] + has_direct_content = len(direct_p_nodes) > 0 or len(direct_formula_nodes) > 0 - if len(nested_divs) > 0 and len(direct_p_nodes) == 0: + if len(nested_divs) > 0 and not has_direct_content: # This is a container div - process each nested div independently for nested_div in nested_divs: # Skip references divs @@ -707,11 +710,11 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa # Determine the section header and content type for divs with content if head: - if len(direct_p_nodes) == 0: - # This div has only a head, no paragraphs (standalone head) + if not has_direct_content: + # This div has only a head, no paragraphs or formulas (standalone head) current_head_paragraph = self._clean_text(head.get_text()) else: - # This div has both head and paragraphs - head is the section header + # This div has both head and content - head is the section header head_section = self._clean_text(head.get_text()) else: # If no head element, try to use the type attribute as head_section @@ -726,7 +729,7 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa head_section = "Author Contributions" elif div_type == "availability": # Only set as default if this div has its own content - if len(direct_p_nodes) > 0: + if has_direct_content: head_section = "Data Availability" elif div_type == "annex": head_section = "Annex" @@ -734,13 +737,16 @@ 
def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa # Generic handling - capitalize and format head_section = div_type.replace("_", " ").title() - # Process paragraphs in this div - if len(direct_p_nodes) > 0: - for id_p, p in enumerate(direct_p_nodes): + # Process direct children (paragraphs and formulas) in document order + for child in div.children: + if not hasattr(child, 'name') or not child.name: + continue + + if child.name == "p": paragraph_id = get_random_id(prefix="p_") if passage_level == "sentence": - for id_s, sentence in enumerate(p.find_all("s")): + for id_s, sentence in enumerate(child.find_all("s")): struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, sentence) if self.validate_refs: for ref in struct['refs']: @@ -748,13 +754,43 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa assert struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'], "Cannot apply offsets" yield struct else: - struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, p) + struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, child) if self.validate_refs: for ref in struct['refs']: assert ref['offset_start'] < ref['offset_end'], "Wrong offsets" assert struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'], "Cannot apply offsets" yield struct + elif child.name == "formula": + # Process formula elements as passages + formula_id = get_random_id(prefix="f_") + formula_text = self._clean_text(child.get_text()) + + if formula_text: + # Create a passage structure for the formula + formula_passage = { + "id": formula_id, + "text": formula_text, + "coords": [ + box_to_dict(coord.split(",")) + for coord in child.get("coords", "").split(";") + ] if child.has_attr("coords") else [], + "refs": [], + "type": "formula" + } + + if current_head_paragraph or 
head_paragraph: + formula_passage["head_paragraph"] = current_head_paragraph or head_paragraph + if head_section: + formula_passage["head_section"] = head_section + + # Extract formula label if present + label = child.find("label") + if label: + formula_passage["label"] = self._clean_text(label.get_text()) + + yield formula_passage + # Update head_paragraph for potential next div if current_head_paragraph is not None: head_paragraph = current_head_paragraph diff --git a/grobid_client/format/TEI2Markdown.py b/grobid_client/format/TEI2Markdown.py index 228f527..9e60ba3 100644 --- a/grobid_client/format/TEI2Markdown.py +++ b/grobid_client/format/TEI2Markdown.py @@ -213,14 +213,24 @@ def _extract_fulltext(self, soup: BeautifulSoup) -> str: head = div.find("head") if head: section_title = head.get_text().strip() - fulltext_sections.append(f"### {section_title}\n") - - # Get paragraphs - paragraphs = div.find_all("p") - for p in paragraphs: - paragraph_text = self._process_paragraph(p) - if paragraph_text.strip(): - fulltext_sections.append(f"{paragraph_text}\n\n") + if section_title: + fulltext_sections.append(f"### {section_title}\n") + + # Process direct children of the div in document order + # This captures paragraphs, formulas, and other elements as they appear + for child in div.children: + if not hasattr(child, 'name') or not child.name: + continue + + if child.name == "p": + paragraph_text = self._process_paragraph(child) + if paragraph_text.strip(): + fulltext_sections.append(f"{paragraph_text}\n\n") + elif child.name == "formula": + # Handle formula elements - extract text and optional label + formula_text = self._process_formula(child) + if formula_text.strip(): + fulltext_sections.append(f"{formula_text}\n\n") return "".join(fulltext_sections) @@ -272,16 +282,23 @@ def _process_div_and_nested_divs(self, div: Tag, annex_sections: list) -> None: if header_text not in annex_sections: annex_sections.append(header_text) - # Process paragraphs that are direct 
children of this div (not in nested divs) + # Process direct children of this div in document order + # This captures paragraphs, formulas, and other elements as they appear for child in div.children: - if hasattr(child, 'name') and child.name == "p": + if not hasattr(child, 'name') or not child.name: + continue + + if child.name == "p": paragraph_text = self._process_paragraph(child) if paragraph_text.strip(): annex_sections.append(f"{paragraph_text}\n\n") - - # Process nested div elements - for child in div.children: - if hasattr(child, 'name') and child.name == "div": + elif child.name == "formula": + # Handle formula elements + formula_text = self._process_formula(child) + if formula_text.strip(): + annex_sections.append(f"{formula_text}\n\n") + elif child.name == "div": + # Process nested div elements self._process_div_and_nested_divs(child, annex_sections) def _extract_references(self, soup: BeautifulSoup) -> str: @@ -338,6 +355,34 @@ def _process_paragraph(self, p_element: Tag) -> str: return "".join(text_parts).strip() + def _process_formula(self, formula_element: Tag) -> str: + """Process a formula element and convert to markdown. + + Formulas are rendered as italicized text with optional equation label. 
+ """ + # Get the main formula text (excluding the label) + formula_text_parts = [] + label_text = "" + + for child in formula_element.children: + if hasattr(child, 'name') and child.name == "label": + # Extract equation label (e.g., "(1)", "(2)") + label_text = child.get_text().strip() + elif isinstance(child, NavigableString): + formula_text_parts.append(str(child)) + else: + # Other elements within formula - get their text + formula_text_parts.append(child.get_text()) + + formula_text = "".join(formula_text_parts).strip() + + if formula_text: + # Format as: *formula text* (label) if label exists + if label_text: + return f"*{formula_text}* {label_text}" + return f"*{formula_text}*" + return "" + def _table_to_markdown(self, table_element: Tag) -> str: """Convert a table element to simple markdown.""" markdown_lines = [] diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 62666ac..ccb8527 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -622,6 +622,92 @@ def test_offset_validation_for_specific_references(self): else: print("No offset differences detected between conversion and expected output") + def test_formula_extraction_in_json(self): + """Test that formula elements are extracted as passages in JSON conversion.""" + from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter + + # Use the actual TEI file from test resources which contains formulas + tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml') + + converter = TEI2LossyJSONConverter() + json_data = converter.convert_tei_file(tei_file, stream=False) + + # Find formula passages + body_text = json_data.get('body_text', []) + formula_passages = [p for p in body_text if p.get('type') == 'formula'] + + # The test file contains 2 formulas + assert len(formula_passages) >= 2, "Should extract at least 2 formulas from test file" + + # Check formula structure + for formula in formula_passages: + assert 'text' in formula, 
"Formula should have text" + assert 'id' in formula, "Formula should have id" + assert formula['text'].strip(), "Formula text should not be empty" + + # The test formulas have labels + if 'label' in formula: + assert formula['label'].strip(), "Formula label should not be empty" + + # Check for specific formula content from test file + formula_texts = [f.get('text', '') for f in formula_passages] + assert any('Fext' in t for t in formula_texts), "Should extract formula containing 'Fext'" + + def test_formula_extraction_in_markdown(self): + """Test that formula elements are included in Markdown conversion.""" + from grobid_client.format.TEI2Markdown import TEI2MarkdownConverter + + # Use the actual TEI file from test resources which contains formulas + tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml') + + converter = TEI2MarkdownConverter() + markdown = converter.convert_tei_file(tei_file) + + # Check that formula content is present + assert 'Fext' in markdown, "Markdown should contain formula text 'Fext'" + + # Check that formula is italicized (surrounded by asterisks) + assert '*Fext' in markdown, "Formula should be italicized in Markdown" + + def test_header_only_div_in_json(self): + """Test that headers without paragraphs are included in JSON conversion.""" + from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter + + # Use the actual TEI file from test resources + tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml') + + converter = TEI2LossyJSONConverter() + json_data = converter.convert_tei_file(tei_file, stream=False) + + # Collect all section headers from the body_text + body_text = json_data.get('body_text', []) + section_headers = set() + for passage in body_text: + if 'head_section' in passage and passage['head_section']: + section_headers.add(passage['head_section']) + + # Check that common sections are present + # The test file has Acknowledgements and Competing 
def test_header_only_div_in_markdown(self):
    """Check that the heading of a paragraph-less div reaches the Markdown output."""
    from grobid_client.format.TEI2Markdown import TEI2MarkdownConverter

    # Real TEI sample shipped with the test resources
    sample_path = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')
    rendered = TEI2MarkdownConverter().convert_tei_file(sample_path)

    # The heading text has to be present at all ...
    assert 'Acknowledgements' in rendered, "Markdown should contain 'Acknowledgements' header"

    # ... and it has to be emitted as a level-3 Markdown heading
    assert '### Acknowledgements' in rendered, "Acknowledgements should be formatted as Markdown header"