Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Change Log

## 0.7

- Add parsing to Gregorian date converter; supports month names (full or abbreviated)
in English, French, German, Spanish, Kinyarwanda, Ganda, and Tigrinya
- Include Gregorian dates in omnibus parser

## 0.6

- Experimental omnibus date converter + parser (EDTF, Hebrew, Hijri)
Expand Down
16 changes: 15 additions & 1 deletion DEVELOPER_NOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,4 +88,18 @@ pip install -e ".[docs]"
sphinx-build docs docs/_build
```

HTML documentation will be generated in `docs/_build/html`
HTML documentation will be generated in `docs/_build/html`


### Regenerating multilingual Gregorian month name parse file

The Gregorian Lark parser includes a script-generated file, which
populates month names based on a list of language codes using the Babel
library. To regenerate, run the script with hatch (which should
be installed globally):
```sh
hatch run codegen:generate
```

When the `.lark` file is modified by the script, it must be committed to git.

6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,12 @@ path = "src/undate/__init__.py"
[tool.hatch.build.targets.sdist]
include = ["src/undate/**/*.py", "src/undate/**/*.lark", "tests/**"]

[tool.hatch.envs.codegen]
dependencies = ["babel"]

[tool.hatch.envs.codegen.scripts]
generate = "python scripts/generate_gregorian_grammar.py"

[tool.pytest.ini_options]
pythonpath = "src/"
markers = [
Expand Down
76 changes: 76 additions & 0 deletions scripts/generate_gregorian_grammar.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/usr/bin/env python
"""
This script generates the gregorian_multilang.lark file
with month names (full and abbreviated) based on the list of
target languages.

Run this script with hatch to regenerate the file::

hatch run codegen:generate

"""

from collections import defaultdict
import pathlib

from babel.dates import get_month_names

# lark grammar path relative to this script
GRAMMAR_DIR_PATH = (
pathlib.Path(__file__).parent.parent / "src" / "undate" / "converters" / "grammars"
)
# file that is generated by this script, in that directory
MONTH_GRAMMAR_FILE = GRAMMAR_DIR_PATH / "gregorian_multilang.lark"

# include month names in the following languages
languages = [
"en", # English
"es", # Spanish
"fr", # French
"de", # German
"rw", # Kinyarwanda
"lg", # Ganda
"ti", # Tigrinya
]

# warning to include at top of generated file
warning_text = """// WARNING: This file is auto-generated. DO NOT EDIT.
// To regenerate: hatch run codegen:generate

"""


def main():
    """Regenerate the multilingual Gregorian month-name grammar file.

    Collects the wide and abbreviated month names for every locale listed
    in ``languages`` using Babel, lowercases them, strips leading/trailing
    periods, and drops duplicates. One case-insensitive Lark rule per
    month number (``month_1`` ... ) is then written to
    ``MONTH_GRAMMAR_FILE``, preceded by the auto-generated warning header.
    """
    # month number -> list of unique name variants, in first-seen order
    all_month_names = defaultdict(list)

    for lang in languages:
        for width in ["wide", "abbreviated"]:
            for month_num, month_name in get_month_names(width, locale=lang).items():
                # some locales use a . on the shortened month; let's ignore that
                month_name = month_name.strip(".").lower()
                # In some cases different languages have the same abbreviations;
                # in some cases, abbreviated and full are the same.
                # Only add if not already present, to avoid redundancy
                if month_name not in all_month_names[month_num]:
                    all_month_names[month_num].append(month_name)

    # Month names can contain non-ASCII characters (accented letters,
    # non-Latin scripts), so write UTF-8 explicitly instead of relying on
    # the platform default encoding, which may fail or produce a file the
    # grammar loader cannot read back.
    with MONTH_GRAMMAR_FILE.open("w", encoding="utf-8") as outfile:
        outfile.write(warning_text)

        # for each numeric month, generate a rule with all variant names:
        #   month_1: /(january|jan)/i
        for i, names in all_month_names.items():
            # combine all names in a case-insensitive OR regex;
            # longest variants go first so that a short name which is a
            # prefix of a longer one does not match before the longer one
            or_names = "|".join(sorted(names, key=len, reverse=True))
            outfile.write(f"month_{i}: /({or_names})/i\n")

    # Report the path relative to the working directory when possible.
    # relative_to() raises ValueError if the file is not under the current
    # directory (e.g. script run from elsewhere); fall back to the full
    # path rather than crashing after the file was written successfully.
    try:
        display_path = MONTH_GRAMMAR_FILE.relative_to(pathlib.Path.cwd())
    except ValueError:
        display_path = MONTH_GRAMMAR_FILE

    print(f"Successfully regenerated {display_path}")
    print("If the file has changed, make sure to commit the new version.")


if __name__ == "__main__":
main()
3 changes: 3 additions & 0 deletions src/undate/converters/calendars/gregorian/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from undate.converters.calendars.gregorian.converter import GregorianDateConverter

__all__ = ["GregorianDateConverter"]
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
from calendar import monthrange, isleap

from lark.exceptions import UnexpectedInput

from undate.undate import Undate
from undate.converters.base import BaseCalendarConverter
from undate.converters.calendars.gregorian.parser import gregorian_parser
from undate.converters.calendars.gregorian.transformer import GregorianDateTransformer


class GregorianDateConverter(BaseCalendarConverter):
Expand All @@ -18,6 +23,9 @@ class GregorianDateConverter(BaseCalendarConverter):
#: arbitrary known leap year
LEAP_YEAR: int = 2024

def __init__(self):
"""Create the transformer instance this converter uses when parsing."""
self.transformer = GregorianDateTransformer()

def min_month(self) -> int:
"""Return the number of the first month in the Gregorian calendar (always 1)."""
return 1
Expand Down Expand Up @@ -79,3 +87,25 @@ def to_gregorian(self, year, month, day) -> tuple[int, int, int]:
a common point of comparison.
"""
return (year, month, day)

def parse(self, value: str) -> Undate:
    """
    Convert a Gregorian date string into an :class:`~undate.undate.Undate`.

    Any precision and language supported by the Gregorian grammar is
    accepted. The original input string is stored as the label of the
    returned Undate object.

    :raises ValueError: if ``value`` is empty or cannot be parsed
    """
    # guard clause: nothing to parse
    if not value:
        raise ValueError("Parsing empty string is not supported")

    try:
        # build the parse tree, then turn it into an undate object
        undate_obj = self.transformer.transform(gregorian_parser.parse(value))
    except UnexpectedInput as err:
        raise ValueError(f"Could not parse '{value}' as a Gregorian date") from err

    # keep the original date string as the label
    undate_obj.label = value
    return undate_obj
10 changes: 10 additions & 0 deletions src/undate/converters/calendars/gregorian/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from lark import Lark

from undate.converters import GRAMMAR_FILE_PATH

grammar_path = GRAMMAR_FILE_PATH / "gregorian.lark"

# open based on filename to allow relative imports based on grammar file
gregorian_parser = Lark.open(
str(grammar_path), rel_to=__file__, start="gregorian_date", strict=True
)
42 changes: 42 additions & 0 deletions src/undate/converters/calendars/gregorian/transformer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from lark import Transformer, Tree

from undate import Undate, Calendar


class GregorianDateTransformer(Transformer):
    """Turn a Gregorian date parse tree into an :class:`Undate`."""

    # NOTE: the parser is not currently expected to produce intervals

    calendar = Calendar.GREGORIAN

    def gregorian_date(self, items):
        """Build an Undate from the year / month / day subtrees in ``items``."""
        # each relevant subtree carries exactly one value, which is cast
        # to int and keyed by the subtree name (year, month or day)
        date_fields = {
            str(node.data): int(node.children[0])
            for node in items
            if node.data in ["year", "month", "day"]
        }
        # construct the undate with whichever fields were found,
        # in the Gregorian calendar
        return Undate(**date_fields, calendar=self.calendar)

    def year(self, items):
        """Join the year parts into one string wrapped in a ``year`` tree."""
        joined = "".join(str(part) for part in items)
        return Tree("year", [joined])

    def month(self, items):
        """Return a ``month`` tree holding the numeric month as a string."""
        # the month value is a nested tree named after its grammar rule
        # (month_1, month_2, etc); the text after the last underscore
        # is the month number needed to build the date
        rule_name = items[0].data
        month_number = rule_name.rpartition("_")[2]
        return Tree("month", [month_number])

    def day(self, items):
        """Join the day parts into one string wrapped in a ``day`` tree."""
        joined = "".join(str(part) for part in items)
        return Tree("day", [joined])
4 changes: 2 additions & 2 deletions src/undate/converters/calendars/hebrew/converter.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import Union

from convertdate import hebrew # type: ignore
from lark.exceptions import UnexpectedCharacters
from lark.exceptions import UnexpectedInput

from undate import Undate, UndateInterval
from undate.converters.base import BaseCalendarConverter
Expand Down Expand Up @@ -111,7 +111,7 @@ def parse(self, value: str) -> Union[Undate, UndateInterval]:
# set the original date as a label, with the calendar name
undate_obj.label = f"{value} {self.calendar_name}"
return undate_obj
except UnexpectedCharacters as err:
except UnexpectedInput as err:
raise ValueError(f"Could not parse '{value}' as a Hebrew date") from err

# do we need to support conversion the other direction?
Expand Down
7 changes: 4 additions & 3 deletions src/undate/converters/calendars/hebrew/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

grammar_path = GRAMMAR_FILE_PATH / "hebrew.lark"

with open(grammar_path) as grammar:
# NOTE: LALR parser is faster but can't be used due to ambiguity between years and dates
hebrew_parser = Lark(grammar.read(), start="hebrew_date", strict=True)
# open based on filename to allow relative imports based on grammar file
hebrew_parser = Lark.open(
str(grammar_path), rel_to=__file__, start="hebrew_date", strict=True
)
4 changes: 2 additions & 2 deletions src/undate/converters/calendars/islamic/converter.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import Union

from convertdate import islamic # type: ignore
from lark.exceptions import UnexpectedCharacters
from lark.exceptions import UnexpectedInput

from undate import Undate, UndateInterval
from undate.converters.base import BaseCalendarConverter
Expand Down Expand Up @@ -97,7 +97,7 @@ def parse(self, value: str) -> Union[Undate, UndateInterval]:
# set the original date as a label, with the calendar name
undate_obj.label = f"{value} {self.calendar_name}"
return undate_obj
except UnexpectedCharacters as err:
except UnexpectedInput as err:
raise ValueError(f"Could not parse '{value}' as an Islamic date") from err

# do we need to support conversion the other direction?
Expand Down
7 changes: 4 additions & 3 deletions src/undate/converters/calendars/islamic/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

grammar_path = GRAMMAR_FILE_PATH / "islamic.lark"

with open(grammar_path) as grammar:
# NOTE: LALR parser is faster but can't be used due to ambiguity between years and days
islamic_parser = Lark(grammar.read(), start="islamic_date", strict=True)
# open based on filename to allow relative imports based on grammar file
islamic_parser = Lark.open(
str(grammar_path), rel_to=__file__, start="islamic_date", strict=True
)
8 changes: 5 additions & 3 deletions src/undate/converters/combined.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@
from typing import Union

from lark import Lark
from lark.exceptions import UnexpectedCharacters
from lark.exceptions import UnexpectedInput
from lark.visitors import Transformer, merge_transformers

from undate import Undate, UndateInterval
from undate.converters import BaseDateConverter, GRAMMAR_FILE_PATH
from undate.converters.edtf.transformer import EDTFTransformer
from undate.converters.calendars.gregorian.transformer import GregorianDateTransformer
from undate.converters.calendars.hebrew.transformer import HebrewDateTransformer
from undate.converters.calendars.islamic.transformer import IslamicDateTransformer

Expand All @@ -33,6 +34,7 @@ def start(self, children):
edtf=EDTFTransformer(),
hebrew=HebrewDateTransformer(),
islamic=IslamicDateTransformer(),
gregorian=GregorianDateTransformer(),
)


Expand All @@ -45,7 +47,7 @@ def start(self, children):
class OmnibusDateConverter(BaseDateConverter):
"""
Combination parser that aggregates existing parser grammars.
Currently supports EDTF, Hebrew, and Hijri where dates are unambiguous.
Currently supports EDTF, Gregorian, Hebrew, and Hijri where dates are unambiguous.
(Year-only dates are parsed as EDTF in Gregorian calendar.)

Does not support serialization.
Expand Down Expand Up @@ -75,7 +77,7 @@ def parse(self, value: str) -> Union[Undate, UndateInterval]:
parsetree = parser.parse(value)
# transform returns a list; we want the first item in the list
return self.transformer.transform(parsetree)[0]
except UnexpectedCharacters:
except UnexpectedInput:
raise ValueError(
"Parsing failed: '%s' is not in a recognized date format" % value
)
Expand Down
8 changes: 4 additions & 4 deletions src/undate/converters/edtf/converter.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import Optional, Union

from lark.exceptions import UnexpectedCharacters
from lark.exceptions import UnexpectedInput

from undate import Undate, UndateInterval
from undate.converters.base import BaseDateConverter
Expand Down Expand Up @@ -40,10 +40,10 @@ def parse(self, value: str) -> Union[Undate, UndateInterval]:
try:
parsetree = edtf_parser.parse(value)
return self.transformer.transform(parsetree)
except UnexpectedCharacters:
except UnexpectedInput as err:
raise ValueError(
"Parsing failed: '%s' is not a supported EDTF date format" % value
)
f"Parsing failed: '{value}' is not a supported EDTF date format"
) from err

def _convert_missing_digits(
self, value: Optional[str], old_missing_digit: str
Expand Down
17 changes: 14 additions & 3 deletions src/undate/converters/grammars/combined.lark
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
%import common.WS
%ignore WS

start: (edtf__start | hebrew__hebrew_date | islamic__islamic_date )
// Ignore periods and commas in dates
%import .undate_common.DATE_PUNCTUATION
%ignore DATE_PUNCTUATION

start: (edtf__start | hebrew__hebrew_date | islamic__islamic_date | gregorian__gregorian_date )

// Renaming of the import variables is required, as they receive the namespace of this file.
// See: https://github.com/lark-parser/lark/pull/973#issuecomment-907287565
Expand All @@ -23,10 +27,17 @@ start: (edtf__start | hebrew__hebrew_date | islamic__islamic_date )
%import .islamic.month -> islamic__month
%import .islamic.year -> islamic__year

// gregorian calendar, in multiple languages
%import .gregorian.gregorian_date -> gregorian__gregorian_date


// override hebrew date to omit year-only, since year without calendar is ambiguous
// NOTE: potentially support year with calendar label
%override hebrew__hebrew_date: hebrew__day hebrew__month hebrew__year | hebrew__month hebrew__year
%override hebrew__hebrew_date: hebrew__day hebrew__month hebrew__year | hebrew__month hebrew__year

// same for islamic date, year alone is ambiguous
%override islamic__islamic_date: islamic__day islamic__month islamic__year | islamic__month islamic__year
%override islamic__islamic_date: islamic__day islamic__month islamic__year | islamic__month islamic__year

// same as above. omit year only, since covered by EDTF
// %override gregorian__gregorian_date: day month year | month day year | year month day | month year | year month | day month | month day

Loading