Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions grobid_client/format/TEI2LossyJSON.py
Original file line number Diff line number Diff line change
Expand Up @@ -669,9 +669,27 @@ def _iter_passages_from_soup_for_text(self, text_node: Tag, passage_level: str)

div_type = div.get("type")

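# Check whether this is a header-only div: it has a <head> but no direct <p> or
# <formula> children and no nested <div> elements (including namespaced ":div").
# If so, capture its header text as context for the next div that is not header-only.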
head = div.find("head")
direct_p_nodes = [c for c in div.children if hasattr(c, 'name') and c.name == "p"]
direct_formula_nodes = [c for c in div.children if hasattr(c, 'name') and c.name == "formula"]
nested_divs = [c for c in div.children if hasattr(c, 'name') and (c.name == "div" or (c.name and c.name.endswith(":div")))]
has_direct_content = len(direct_p_nodes) > 0 or len(direct_formula_nodes) > 0

if head and not has_direct_content and len(nested_divs) == 0:
# Header-only div: remember its cleaned header text for the next div.
# Consecutive header-only divs overwrite each other, so only the latest header is kept.
head_paragraph = self._clean_text(head.get_text())
continue # Skip to next div, the header will be used by subsequent sibling

# Process this div and potentially nested divs
for passage in self._process_div_with_nested_content(div, passage_level, head_paragraph):
yield passage

# Reset head_paragraph after it's been used by a content-bearing div
head_paragraph = None


def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_paragraph: str = None) -> Iterator[Dict[str, Union[str, Dict[str, str]]]]:
"""
Expand Down
Loading