Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions docs/undate/converters.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,20 @@ Converters
Overview
--------


.. automodule:: undate.converters

-----

.. automodule:: undate.converters.base
:members:
:undoc-members:


.. autoclass:: undate.converters.combined.OmnibusDateConverter
:members:


Formats
--------

Expand All @@ -33,6 +43,8 @@ Extended Date-Time Format (EDTF)
Calendars
---------

.. automodule:: undate.converters.calendars

Gregorian
^^^^^^^^^

Expand Down
30 changes: 29 additions & 1 deletion src/undate/converters/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,29 @@
from undate.converters.base import BaseDateConverter as BaseDateConverter
"""
Converter classes add support for parsing and serializing dates
in a variety of formats. A subset of these are calendar converters
(:mod:`undate.converters.calendars`), which means they support both parsing
and conversion from an alternate calendar to a common Gregorian calendar
for comparison across dates.

To parse a date with a supported converter, use the ``Undate`` class method
:meth:`~undate.undate.Undate.parse` and specify the date as a string
with the desired format or calendar, e.g.

.. code-block::

Undate.parse("2001-05", "EDTF")
Undate.parse("7 Heshvan 5425", "Hebrew")

For converters that support it, you can also serialize a date in a specified
format with the ``Undate`` method :meth:`~undate.undate.Undate.format`:

.. code-block::

Undate.parse("Rabīʿ ath-Thānī 343", "Islamic").format("EDTF")


"""

from undate.converters.base import BaseDateConverter, GRAMMAR_FILE_PATH

__all__ = ["BaseDateConverter", "GRAMMAR_FILE_PATH"]
5 changes: 5 additions & 0 deletions src/undate/converters/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@

import importlib
import logging
import pathlib
import pkgutil
from functools import cache
from typing import Dict, Type
Expand All @@ -53,6 +54,10 @@
logger = logging.getLogger(__name__)


#: Path to parser grammar files
GRAMMAR_FILE_PATH = pathlib.Path(__file__).parent / "grammars"


class BaseDateConverter:
"""Base class for parsing, formatting, and converting dates to handle
specific formats and different calendars."""
Expand Down
6 changes: 3 additions & 3 deletions src/undate/converters/calendars/hebrew/parser.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import pathlib

from lark import Lark

grammar_path = pathlib.Path(__file__).parent / "hebrew.lark"
from undate.converters import GRAMMAR_FILE_PATH

grammar_path = GRAMMAR_FILE_PATH / "hebrew.lark"

with open(grammar_path) as grammar:
# NOTE: LALR parser is faster but can't be used due to ambiguity between years and days
Expand Down
9 changes: 5 additions & 4 deletions src/undate/converters/calendars/hebrew/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,13 @@ def hebrew_date(self, items):

# initialize and return an undate with year, month, day and
# configured calendar (hebrew by default)
# NOTE: use self.calendar so Seleucid can extend more easily
return Undate(**parts, calendar=self.calendar)

# year translation is not needed since we want a tree with name year
# this is equivalent to a no-op
# def year(self, items):
# return Tree(data="year", children=[items[0]])
def year(self, items):
    """Return a ``year`` tree whose single child is all year parts joined as one string."""
    # stringify every part and concatenate them into one value
    joined = "".join(map(str, items))
    return Tree(data="year", children=[joined])

def month(self, items):
# month has a nested tree for the rule and the value
Expand Down
6 changes: 3 additions & 3 deletions src/undate/converters/calendars/islamic/parser.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import pathlib

from lark import Lark

grammar_path = pathlib.Path(__file__).parent / "islamic.lark"
from undate.converters import GRAMMAR_FILE_PATH

grammar_path = GRAMMAR_FILE_PATH / "islamic.lark"

with open(grammar_path) as grammar:
# NOTE: LALR parser is faster but can't be used due to ambiguity between years and days
Expand Down
13 changes: 11 additions & 2 deletions src/undate/converters/calendars/islamic/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,17 @@ def islamic_date(self, items):

# year and day may arrive as multiple tokens (e.g. from the combined parser);
# join the parts into a single string so each resulting tree has one value
# def year(self, items):
# return Tree(data="year", children=[items[0]])
def year(self, items):
    """Return a ``year`` tree whose single child is all year parts joined as one string."""
    # the combined parser can yield an extra anonymous token, so every
    # part is stringified and concatenated into one value
    joined = "".join(map(str, items))
    return Tree(data="year", children=[joined])

def day(self, items):
    """Return a ``day`` tree whose single child is all day parts joined as one string."""
    # the combined parser can yield an extra anonymous token, so every
    # part is stringified and concatenated into one value
    joined = "".join(map(str, items))
    return Tree(data="day", children=[joined])

def month(self, items):
# month has a nested tree for the rule and the value
Expand Down
85 changes: 85 additions & 0 deletions src/undate/converters/combined.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
"""
**Experimental** combined parser. Supports EDTF, Hebrew, and Hijri
where dates are unambiguous. (Year-only dates are parsed as EDTF in
Gregorian calendar.)
"""

from typing import Union

from lark import Lark
from lark.exceptions import UnexpectedCharacters
from lark.visitors import Transformer, merge_transformers

from undate import Undate, UndateInterval
from undate.converters import BaseDateConverter, GRAMMAR_FILE_PATH
from undate.converters.edtf.transformer import EDTFTransformer
from undate.converters.calendars.hebrew.transformer import HebrewDateTransformer
from undate.converters.calendars.islamic.transformer import IslamicDateTransformer


class CombinedDateTransformer(Transformer):
    """Transformer for the ``start`` rule of the combined grammar."""

    def start(self, children):
        """Return the children of the ``start`` rule unchanged."""
        # trigger the transformer for the appropriate part of the grammar
        return children


# NOTE: currently year-only dates in combined parser are interpreted as
# EDTF and use Gregorian calendar.
# In future, we could refine by adding calendar names & abbreviations
# to the parser in order to recognize years from other calendars.

# Merge the per-format transformers into one transformer. Each keyword name
# matches the namespace prefix given to that grammar's rules in
# combined.lark (e.g. ``edtf__start``, ``hebrew__hebrew_date``).
combined_transformer = merge_transformers(
    CombinedDateTransformer(),
    edtf=EDTFTransformer(),
    hebrew=HebrewDateTransformer(),
    islamic=IslamicDateTransformer(),
)


# open based on filename so we can specify relative import path based on grammar file
parser = Lark.open(
    str(GRAMMAR_FILE_PATH / "combined.lark"), rel_to=__file__, strict=True
)


class OmnibusDateConverter(BaseDateConverter):
    """
    Combination parser that aggregates existing parser grammars.
    Currently supports EDTF, Hebrew, and Hijri where dates are unambiguous.
    (Year-only dates are parsed as EDTF in Gregorian calendar.)

    Does not support serialization.

    Example usage::

        Undate.parse("Tammuz 4816", "omnibus")

    """

    #: converter name: omnibus
    name: str = "omnibus"

    def __init__(self):
        self.transformer = combined_transformer

    def parse(self, value: str) -> Union[Undate, UndateInterval]:
        """
        Parse a string in a supported format and return an :class:`~undate.undate.Undate`
        or :class:`~undate.undate.UndateInterval`.

        :raises ValueError: if ``value`` is empty or is not in a recognized
            date format
        """
        # Imported here so this block is self-contained. UnexpectedInput is
        # the common base of Lark's input errors (unexpected characters,
        # unexpected end of input), so incomplete strings are also reported
        # as ValueError instead of leaking a Lark exception.
        from lark.exceptions import UnexpectedInput

        if not value:
            raise ValueError("Parsing empty/unset string is not supported")

        # parse the input string, then transform to undate object
        try:
            parsetree = parser.parse(value)
            # transform returns a list; we want the first item in the list
            return self.transformer.transform(parsetree)[0]
        except UnexpectedInput as err:
            # chain the original error so the parse position is not lost
            raise ValueError(
                "Parsing failed: '%s' is not in a recognized date format" % value
            ) from err

    def to_string(self, undate: Union[Undate, UndateInterval]) -> str:
        """Not supported by this converter. Will raise :class:`ValueError`"""
        raise ValueError("Omnibus converter does not support serialization")
6 changes: 3 additions & 3 deletions src/undate/converters/edtf/parser.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import pathlib

from lark import Lark

grammar_path = pathlib.Path(__file__).parent / "edtf.lark"
from undate.converters import GRAMMAR_FILE_PATH

grammar_path = GRAMMAR_FILE_PATH / "edtf.lark"

# read the grammar explicitly as UTF-8 rather than depending on the
# platform's default text encoding
with open(grammar_path, encoding="utf-8") as grammar:
    edtf_parser = Lark(grammar.read(), start="edtf")
5 changes: 4 additions & 1 deletion src/undate/converters/edtf/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,10 @@ def day_unspecified(self, items):
def date_level1(self, items):
return self.date(items)

# year (including negative years) use default transformation
def year(self, items):
    """Return a ``year`` tree whose single child joins all year parts into one string."""
    # parts may be numeric or unknown-digit values; merge them into one string
    parts = self.get_values(items)
    return Tree(data="year", children=["".join(parts)])

def year_fivedigitsplus(self, items):
# strip off the leading Y and convert to integer
Expand Down
32 changes: 32 additions & 0 deletions src/undate/converters/grammars/combined.lark
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
%import common.WS
%ignore WS

start: (edtf__start | hebrew__hebrew_date | islamic__islamic_date )

// Renaming of the import variables is required, as they receive the namespace of this file.
// See: https://github.com/lark-parser/lark/pull/973#issuecomment-907287565

// All grammars are in the same directory, so we can use relative imports

// relative import from edtf.lark
%import .edtf.edtf -> edtf__start

// relative import from hebrew.lark
%import .hebrew.hebrew_date -> hebrew__hebrew_date
%import .hebrew.day -> hebrew__day
%import .hebrew.month -> hebrew__month
%import .hebrew.year -> hebrew__year

// relative import from islamic.lark
%import .islamic.islamic_date -> islamic__islamic_date
%import .islamic.day -> islamic__day
%import .islamic.month -> islamic__month
%import .islamic.year -> islamic__year


// override hebrew date to omit year-only, since year without calendar is ambiguous
// NOTE: potentially support year with calendar label
%override hebrew__hebrew_date: hebrew__day hebrew__month hebrew__year | hebrew__month hebrew__year

// same for islamic date, year alone is ambiguous
%override islamic__islamic_date: islamic__day islamic__month islamic__year | islamic__month islamic__year
Original file line number Diff line number Diff line change
Expand Up @@ -11,23 +11,23 @@ hebrew_date: weekday? day month comma? year | month year | year
// PGP dates use qualifiers like "first decade of" (for beginning of month)
// "first third of", seasons (can look for more examples)

// Hebrew calendar starts with year 1 in 3761 BCE
// Hebrew calendar starts with year 1 in 3761 BCE
year: /\d+/

// months
month: month_1
| month_2
| month_3
| month_4
| month_5
| month_6
| month_7
| month_8
| month_9
| month_10
| month_11
| month_12
| month_13
| month_3
| month_4
| month_5
| month_6
| month_7
| month_8
| month_9
| month_10
| month_11
| month_12
| month_13
// months have 29 or 30 days; we do not expect leading zeroes
day: /[1-9]/ | /[12][0-9]/ | /30/

Expand Down
54 changes: 54 additions & 0 deletions tests/test_converters/test_combined_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import pytest

from undate.converters.combined import parser, combined_transformer

from undate import Undate, UndateInterval

# test that valid dates can be parsed
# each entry is a tuple of (input string, expected Undate or UndateInterval)

testcases = [
    # EDTF
    ("1984", Undate(1984)),
    ("201X", Undate("201X")),
    ("20XX", Undate("20XX")),
    ("2004-XX", Undate(2004, "XX")),
    ("1000/2000", UndateInterval(Undate(1000), Undate(2000))),
    # Hebrew / Anno Mundi calendar
    ("Tammuz 4816", Undate(4816, 4, calendar="Hebrew")),
    # Islamic / Hijri calendar
    ("Jumādā I 1243", Undate(1243, 5, calendar="Islamic")),
    ("7 Jumādā I 1243", Undate(1243, 5, 7, calendar="Islamic")),
    ("14 Rabīʿ I 901", Undate(901, 3, 14, calendar="Islamic")),
]


@pytest.mark.parametrize("date_string,expected", testcases)
def test_transform(date_string, expected):
    """Parse and transform each test string directly with the combined transformer."""
    tree = parser.parse(date_string)
    results = combined_transformer.transform(tree)
    # the same unknown date is not considered strictly equal, so the
    # first transformed item is compared by object representation
    assert repr(results[0]) == repr(expected)


@pytest.mark.parametrize("date_string,expected", testcases)
def test_converter(date_string, expected):
    """Parsing through the ``omnibus`` converter gives the same results as the transformer."""
    parsed = Undate.parse(date_string, "omnibus")
    assert repr(parsed) == repr(expected)


def test_parse_errors():
    """Unparseable input raises ``ValueError`` with the expected message."""
    failing_inputs = [
        # empty string not supported
        ("", "not supported"),
        # text the grammar does not recognize
        ("Monday 2023", "not in a recognized date format"),
    ]
    for bad_value, message in failing_inputs:
        with pytest.raises(ValueError, match=message):
            Undate.parse(bad_value, "omnibus")


def test_no_serialize():
    """Formatting with the ``omnibus`` converter raises ``ValueError``."""
    # the omnibus converter is parse-only
    with pytest.raises(ValueError, match="does not support"):
        Undate("2022").format("omnibus")
Loading