Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,7 @@ client.process(
markdown_output=True
)

```python
# Process citation lists
client.process(
service="processCitationList",
Expand All @@ -232,6 +233,32 @@ client.process(
)
```

### Standalone Conversion Tools

The library includes standalone scripts to convert TEI XML files to other formats without using the main client or server.

#### TEI to JSON Converter

Converts TEI XML files to the structured JSON format (similar to the `--json` option).

```bash
# Convert a single file
python -m grobid_client.format.TEI2LossyJSON_cli --input path/to/file.tei.xml --output path/to/output.json

# Convert with verbose logging (no --output given, so the JSON is printed to stdout)
python -m grobid_client.format.TEI2LossyJSON_cli --input path/to/file.tei.xml --verbose
```

#### TEI to Markdown Converter

Converts TEI XML files to Markdown format (similar to the `--markdown` option).

```bash
# Convert a single file
python -m grobid_client.format.TEI2Markdown_cli --input path/to/file.tei.xml --output path/to/output.md
```


## ⚙️ Configuration

Configuration can be provided via a JSON file. When using the CLI, the `--server` argument overrides the config file
Expand Down
131 changes: 131 additions & 0 deletions grobid_client/format/TEI2LossyJSON_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
#!/usr/bin/env python3
"""
Standalone CLI for TEI2LossyJSON converter.

This script provides a command-line interface for converting TEI XML files to JSON format
using the TEI2LossyJSONConverter.
"""
import argparse
import json
import logging
import sys
from pathlib import Path

from .TEI2LossyJSON import TEI2LossyJSONConverter


def setup_logging(verbose: bool = False):
    """Configure root logging for the CLI.

    INFO-level messages are enabled when ``verbose`` is true; otherwise only
    WARNING and above are emitted.
    """
    if verbose:
        threshold = logging.INFO
    else:
        threshold = logging.WARNING

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        level=threshold,
    )


def convert_single_file(input_file: Path, output_file: Path, verbose: bool = False) -> bool:
    """Convert one TEI XML file and write the result to ``output_file`` as JSON.

    Args:
        input_file: Path of the TEI XML file handed to the converter.
        output_file: Destination JSON file; missing parent directories are
            created.
        verbose: When true, progress messages are logged at INFO level.

    Returns:
        True if the JSON file was written. False if the converter returned
        ``None`` or any exception occurred; the error is logged, not raised.
    """
    try:
        if verbose:
            logging.info(f"Converting {input_file} to {output_file}")

        converter = TEI2LossyJSONConverter()
        result = converter.convert_tei_file(input_file, stream=False)

        if result is None:
            logging.error(f"Failed to convert {input_file}: TEI file is not well-formed or empty")
            return False

        # Serialize before touching the filesystem: if the result cannot be
        # encoded, no truncated JSON file is left behind at the destination.
        payload = json.dumps(result, indent=2, ensure_ascii=False)

        # Ensure output directory exists
        output_file.parent.mkdir(parents=True, exist_ok=True)

        # Write JSON output
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(payload)

        if verbose:
            logging.info(f"Successfully converted {input_file} to {output_file}")

        return True

    except Exception as e:
        # Top-level boundary for the CLI: report the failure and signal it
        # through the return value instead of propagating.
        logging.error(f"Error converting {input_file}: {str(e)}")
        return False


def main():
    """Parse command-line arguments and run the TEI-to-JSON conversion.

    With ``--output`` the JSON is written to that file and the process exits
    with status 0 on success or 1 on failure. Without ``--output`` the JSON is
    printed to stdout. The process exits with status 1 when the input path
    does not exist, is not a file, or cannot be converted.
    """
    parser = argparse.ArgumentParser(
        description="Convert TEI XML files to JSON format using TEI2LossyJSON converter",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # The examples must name this CLI module (TEI2LossyJSON_cli); the
        # TEI2LossyJSON module is the converter library imported above.
        epilog="""
Examples:
  # Convert a single TEI file
  python -m grobid_client.format.TEI2LossyJSON_cli --input input.tei.xml --output output.json

  # Convert with verbose logging
  python -m grobid_client.format.TEI2LossyJSON_cli --input input.tei.xml --output output.json --verbose

  # Convert and output to stdout
  python -m grobid_client.format.TEI2LossyJSON_cli --input input.tei.xml
        """
    )

    parser.add_argument(
        "--input", "-i",
        type=Path,
        required=True,
        help="Input TEI XML file to convert"
    )

    parser.add_argument(
        "--output", "-o",
        type=Path,
        help="Output JSON file (if not specified, prints to stdout)"
    )

    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose logging"
    )

    args = parser.parse_args()

    # Setup logging
    setup_logging(args.verbose)

    # Validate input file
    if not args.input.exists():
        logging.error(f"Input file does not exist: {args.input}")
        sys.exit(1)

    if not args.input.is_file():
        logging.error(f"Input path is not a file: {args.input}")
        sys.exit(1)

    # Convert the file
    if args.output:
        success = convert_single_file(args.input, args.output, args.verbose)
        sys.exit(0 if success else 1)
    else:
        # Output to stdout
        try:
            converter = TEI2LossyJSONConverter()
            result = converter.convert_tei_file(args.input, stream=False)

            if result is None:
                logging.error(f"Failed to convert {args.input}: TEI file is not well-formed or empty")
                # SystemExit is not an Exception subclass, so the handler
                # below does not intercept this exit.
                sys.exit(1)

            # Print JSON to stdout
            print(json.dumps(result, indent=2, ensure_ascii=False))

        except Exception as e:
            logging.error(f"Error converting {args.input}: {str(e)}")
            sys.exit(1)


# Run the CLI only when this module is executed as a script, not on import.
if __name__ == "__main__":
    main()
130 changes: 130 additions & 0 deletions grobid_client/format/TEI2Markdown_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
#!/usr/bin/env python3
"""
Standalone CLI for TEI2Markdown converter.

This script provides a command-line interface for converting TEI XML files to Markdown format
using the TEI2MarkdownConverter.
"""
import argparse
import logging
import sys
from pathlib import Path

from .TEI2Markdown import TEI2MarkdownConverter


def setup_logging(verbose: bool = False):
    """Initialise logging output for the Markdown CLI.

    The level is WARNING by default and is lowered to INFO when ``verbose``
    is true.
    """
    chosen_level = logging.WARNING
    if verbose:
        chosen_level = logging.INFO

    log_options = {
        'level': chosen_level,
        'format': '%(asctime)s - %(levelname)s - %(message)s',
        'datefmt': '%Y-%m-%d %H:%M:%S',
    }
    logging.basicConfig(**log_options)


def convert_single_file(input_file: Path, output_file: Path, verbose: bool = False) -> bool:
    """Render one TEI XML file as Markdown and save it to ``output_file``.

    Returns True once the file has been written. Returns False, after logging
    the error, when the converter yields ``None`` or any exception is raised.
    Missing parent directories of ``output_file`` are created. Progress
    messages are logged at INFO level only when ``verbose`` is true.
    """
    try:
        if verbose:
            logging.info(f"Converting {input_file} to {output_file}")

        markdown_text = TEI2MarkdownConverter().convert_tei_file(input_file)
        if markdown_text is None:
            logging.error(f"Failed to convert {input_file}: TEI file is not well-formed or empty")
            return False

        # Make sure the destination directory is there before writing.
        output_file.parent.mkdir(parents=True, exist_ok=True)
        with output_file.open('w', encoding='utf-8') as sink:
            sink.write(markdown_text)

        if verbose:
            logging.info(f"Successfully converted {input_file} to {output_file}")
    except Exception as exc:
        logging.error(f"Error converting {input_file}: {exc}")
        return False
    else:
        return True


def main():
    """Parse command-line arguments and run the TEI-to-Markdown conversion.

    With ``--output`` the Markdown is written to that file and the process
    exits with status 0 on success or 1 on failure. Without ``--output`` the
    Markdown is printed to stdout. The process exits with status 1 when the
    input path does not exist, is not a file, or cannot be converted.
    """
    parser = argparse.ArgumentParser(
        description="Convert TEI XML files to Markdown format using TEI2Markdown converter",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # The examples must name this CLI module (TEI2Markdown_cli); the
        # TEI2Markdown module is the converter library imported above.
        epilog="""
Examples:
  # Convert a single TEI file
  python -m grobid_client.format.TEI2Markdown_cli --input input.tei.xml --output output.md

  # Convert with verbose logging
  python -m grobid_client.format.TEI2Markdown_cli --input input.tei.xml --output output.md --verbose

  # Convert and output to stdout
  python -m grobid_client.format.TEI2Markdown_cli --input input.tei.xml
        """
    )

    parser.add_argument(
        "--input", "-i",
        type=Path,
        required=True,
        help="Input TEI XML file to convert"
    )

    parser.add_argument(
        "--output", "-o",
        type=Path,
        help="Output Markdown file (if not specified, prints to stdout)"
    )

    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose logging"
    )

    args = parser.parse_args()

    # Setup logging
    setup_logging(args.verbose)

    # Validate input file
    if not args.input.exists():
        logging.error(f"Input file does not exist: {args.input}")
        sys.exit(1)

    if not args.input.is_file():
        logging.error(f"Input path is not a file: {args.input}")
        sys.exit(1)

    # Convert the file
    if args.output:
        success = convert_single_file(args.input, args.output, args.verbose)
        sys.exit(0 if success else 1)
    else:
        # Output to stdout
        try:
            converter = TEI2MarkdownConverter()
            result = converter.convert_tei_file(args.input)

            if result is None:
                logging.error(f"Failed to convert {args.input}: TEI file is not well-formed or empty")
                # SystemExit is not an Exception subclass, so the handler
                # below does not intercept this exit.
                sys.exit(1)

            # Print Markdown to stdout
            print(result)

        except Exception as e:
            logging.error(f"Error converting {args.input}: {str(e)}")
            sys.exit(1)


# Run the CLI only when this module is executed as a script, not on import.
if __name__ == "__main__":
    main()
48 changes: 48 additions & 0 deletions grobid_client/format/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""
Package entry point for format converters.

This dispatches to the TEI2LossyJSON or TEI2Markdown converter named as the first command-line argument.
"""
import argparse
import sys


def main():
    """Dispatch to the converter CLI named by the first command-line argument.

    With no arguments the usage text is printed and the process exits with
    status 1. ``-h``/``--help`` as the first argument prints the same usage
    text and exits with status 0. An unrecognised converter name is reported
    and the process exits with status 1. Otherwise ``sys.argv`` is rewritten
    so the selected converter's own ``main`` parses the remaining arguments.
    """
    # Check if a converter was specified
    if len(sys.argv) < 2:
        _print_usage()
        sys.exit(1)

    converter = sys.argv[1]

    # A leading help flag asks about the dispatcher itself; without this
    # check it would be reported as an unknown converter.
    if converter in ("-h", "--help"):
        _print_usage()
        sys.exit(0)

    if converter == "TEI2LossyJSON":
        from .TEI2LossyJSON_cli import main as lossy_main
        # Replace sys.argv to pass remaining args to the converter
        sys.argv = ["TEI2LossyJSON"] + sys.argv[2:]
        lossy_main()
    elif converter == "TEI2Markdown":
        from .TEI2Markdown_cli import main as markdown_main
        # Replace sys.argv to pass remaining args to the converter
        sys.argv = ["TEI2Markdown"] + sys.argv[2:]
        markdown_main()
    else:
        print(f"Unknown converter: {converter}")
        print("Available converters: TEI2LossyJSON, TEI2Markdown")
        sys.exit(1)


def _print_usage():
    """Print the list of available converters and usage examples to stdout."""
    print("GROBID format converters - Choose a converter to run")
    print("\nUsage:")
    print("  python -m grobid_client.format <converter> [options]")
    print("\nAvailable converters:")
    print("  TEI2LossyJSON  - Convert TEI XML to JSON format")
    print("  TEI2Markdown   - Convert TEI XML to Markdown format")
    print("\nExamples:")
    print("  python -m grobid_client.format TEI2LossyJSON --input file.tei.xml --output output.json")
    print("  python -m grobid_client.format TEI2Markdown --input file.tei.xml --output output.md")
    print("\nGet help for specific converter:")
    print("  python -m grobid_client.format TEI2LossyJSON --help")
    print("  python -m grobid_client.format TEI2Markdown --help")


# Entry point for `python -m grobid_client.format`.
if __name__ == "__main__":
    main()
Loading