From c37a1fe2a273207ed0c3802a3728208d26a8ab0d Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 12 Feb 2026 12:30:22 -0500 Subject: [PATCH 01/17] Preliminary gregorian grammar, parser, and tests --- pyproject.toml | 6 +++ scripts/generate_gregorian_grammar.py | 26 ++++++++++ .../calendars/gregorian/__init__.py | 3 ++ .../{gregorian.py => gregorian/converter.py} | 0 .../converters/calendars/gregorian/parser.py | 10 ++++ .../calendars/gregorian/transformer.py | 37 +++++++++++++ src/undate/converters/grammars/gregorian.lark | 30 +++++++++++ .../grammars/gregorian_multilang.lark | 4 ++ .../test_gregorian_converter.py} | 0 .../test_gregorian/test_gregorian_parser.py | 52 +++++++++++++++++++ 10 files changed, 168 insertions(+) create mode 100644 scripts/generate_gregorian_grammar.py create mode 100644 src/undate/converters/calendars/gregorian/__init__.py rename src/undate/converters/calendars/{gregorian.py => gregorian/converter.py} (100%) create mode 100644 src/undate/converters/calendars/gregorian/parser.py create mode 100644 src/undate/converters/calendars/gregorian/transformer.py create mode 100644 src/undate/converters/grammars/gregorian.lark create mode 100644 src/undate/converters/grammars/gregorian_multilang.lark rename tests/test_converters/test_calendars/{test_gregorian.py => test_gregorian/test_gregorian_converter.py} (100%) create mode 100644 tests/test_converters/test_calendars/test_gregorian/test_gregorian_parser.py diff --git a/pyproject.toml b/pyproject.toml index fcebbbd..e7b3e77 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,6 +81,12 @@ path = "src/undate/__init__.py" [tool.hatch.build.targets.sdist] include = ["src/undate/**/*.py", "src/undate/**/*.lark", "tests/**"] +[tool.hatch.envs.codegen] +dependencies = ["babel"] + +[tool.hatch.envs.codegen.scripts] +generate = "python scripts/generate_gregorian_grammar.py" + [tool.pytest.ini_options] pythonpath = "src/" markers = [ diff --git a/scripts/generate_gregorian_grammar.py 
b/scripts/generate_gregorian_grammar.py new file mode 100644 index 0000000..87dd7d0 --- /dev/null +++ b/scripts/generate_gregorian_grammar.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python + +from babel.dates import get_month_names + + +languages = { + "rw": "Kinyarwanda", + "lg": "Ganda", + "ti": "Tigrinya", + "fr": "French", + "en": "English", +} +# for locale_code in ["fr_FR", "de_DE", "rw_rw", "ti_ET", "lg_UG"]: + + +def main(): + for lang, name in languages.items(): + print(f"\n### {name} (`{lang}`)") + for width in ["abbreviated", "wide"]: + print( + f"- {width}: " + ", ".join(get_month_names(width, locale=lang).values()) + ) + + +if __name__ == "__main__": + main() diff --git a/src/undate/converters/calendars/gregorian/__init__.py b/src/undate/converters/calendars/gregorian/__init__.py new file mode 100644 index 0000000..f08896b --- /dev/null +++ b/src/undate/converters/calendars/gregorian/__init__.py @@ -0,0 +1,3 @@ +from undate.converters.calendars.gregorian.converter import GregorianDateConverter + +__all__ = ["GregorianDateConverter"] diff --git a/src/undate/converters/calendars/gregorian.py b/src/undate/converters/calendars/gregorian/converter.py similarity index 100% rename from src/undate/converters/calendars/gregorian.py rename to src/undate/converters/calendars/gregorian/converter.py diff --git a/src/undate/converters/calendars/gregorian/parser.py b/src/undate/converters/calendars/gregorian/parser.py new file mode 100644 index 0000000..cfcea53 --- /dev/null +++ b/src/undate/converters/calendars/gregorian/parser.py @@ -0,0 +1,10 @@ +from lark import Lark + +from undate.converters import GRAMMAR_FILE_PATH + +grammar_path = GRAMMAR_FILE_PATH / "gregorian.lark" + +# open based on filename to allow relative imports based on grammar file +gregorian_parser = Lark.open( + str(grammar_path), rel_to=__file__, start="gregorian_date", strict=True +) diff --git a/src/undate/converters/calendars/gregorian/transformer.py 
b/src/undate/converters/calendars/gregorian/transformer.py new file mode 100644 index 0000000..7f24e02 --- /dev/null +++ b/src/undate/converters/calendars/gregorian/transformer.py @@ -0,0 +1,37 @@ +from lark import Transformer, Tree + +from undate import Undate, Calendar + + +class GregorianDateTransformer(Transformer): + """Transform a Gregorian date parse tree and return an Undate.""" + + # Currently parser should not result in intervals + + calendar = Calendar.GREGORIAN + + def gregorian_date(self, items): + parts = {} + for child in items: + if child.data in ["year", "month", "day"]: + # in each case we expect one integer value; + # anonymous tokens convert to their value and cast as int + value = int(child.children[0]) + parts[str(child.data)] = value + + # initialize and return an undate with year, month, day and + # Gregorian calendar + return Undate(**parts, calendar=self.calendar) + + def year(self, items): + # combine multiple parts into a single string + value = "".join([str(i) for i in items]) + return Tree(data="year", children=[value]) + + def month(self, items): + # month has a nested tree for the rule and the value + # the name of the rule (month_1, month_2, etc) gives us the + # number of the month needed for converting the date + tree = items[0] + month_n = tree.data.split("_")[-1] + return Tree(data="month", children=[month_n]) diff --git a/src/undate/converters/grammars/gregorian.lark b/src/undate/converters/grammars/gregorian.lark new file mode 100644 index 0000000..308aee8 --- /dev/null +++ b/src/undate/converters/grammars/gregorian.lark @@ -0,0 +1,30 @@ +%import common.WS +%ignore WS + +%import .gregorian_multilang (month_1, month_2) + +// no weekday support for now +gregorian_date: day month year | month day year | year month day | month year | year month | year | day month | month day + +// months have 29 to 30 days; we do not expect leading zeroes +day: /[1-9]/ | /[12][0-9]/ | /3[0-1]/ + +// Gregorian calendar started in 1582; assume years 
with 3 or more digits for now, +// so we can support mixed day / year order unambiguously +year: /\d{3,}/ + +// months +month: month_1 + | month_2 + + // | month_3 + // | month_4 + // | month_5 + // | month_6 + // | month_7 + // | month_8 + // | month_9 + // | month_10 + // | month_11 + // | month_12 + diff --git a/src/undate/converters/grammars/gregorian_multilang.lark b/src/undate/converters/grammars/gregorian_multilang.lark new file mode 100644 index 0000000..42e59bf --- /dev/null +++ b/src/undate/converters/grammars/gregorian_multilang.lark @@ -0,0 +1,4 @@ + + +month_1: "Jan" | "January" +month_2: "Feb" | "February" \ No newline at end of file diff --git a/tests/test_converters/test_calendars/test_gregorian.py b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_converter.py similarity index 100% rename from tests/test_converters/test_calendars/test_gregorian.py rename to tests/test_converters/test_calendars/test_gregorian/test_gregorian_converter.py diff --git a/tests/test_converters/test_calendars/test_gregorian/test_gregorian_parser.py b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_parser.py new file mode 100644 index 0000000..83df118 --- /dev/null +++ b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_parser.py @@ -0,0 +1,52 @@ +import pytest +from lark.exceptions import UnexpectedCharacters, UnexpectedEOF + +from undate.converters.calendars.gregorian.parser import gregorian_parser + + +# for now, just test that valid dates can be parsed + +testcases = [ + # year + "2012", + # month + year + "Jan 1960", + "Feb 1801", + "1900 Feb", + # day + month + year + "Feb 5 1602", + # "1602 February 5", + # day + month + "January 5", + "5 Jan", + # three digit year +] + + +@pytest.mark.parametrize("date_string", testcases) +def test_should_parse(date_string): + assert gregorian_parser.parse(date_string) + + +error_cases = [ + # # invalid days + ("0 Tammuz 5403", UnexpectedCharacters), + # ("31 Tishri 5403", 
UnexpectedCharacters), + # # month alone + # ("Tishri", UnexpectedEOF), + # # month day only + ("12 ", UnexpectedEOF), + # # invalid month + # ("Foo 383", UnexpectedCharacters), + # # wrong format + # ("2024-10-02", UnexpectedCharacters), + # # year month day not supported + # ("5403 Adar", UnexpectedCharacters), + # ("5403 Adar 14", UnexpectedCharacters), +] + + +@pytest.mark.parametrize("date_string,exception", error_cases) +def test_should_error(date_string, exception): + with pytest.raises(exception): + gregorian_parser.parse(date_string) From 3c58c2e2a1246c74e08cd9c946215f591436dd2d Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 12 Feb 2026 13:37:36 -0500 Subject: [PATCH 02/17] Fully implement script to generate month names for Gregorian parser --- scripts/generate_gregorian_grammar.py | 62 +++++++++++++++++++++------ 1 file changed, 48 insertions(+), 14 deletions(-) diff --git a/scripts/generate_gregorian_grammar.py b/scripts/generate_gregorian_grammar.py index 87dd7d0..9b5f770 100644 --- a/scripts/generate_gregorian_grammar.py +++ b/scripts/generate_gregorian_grammar.py @@ -1,25 +1,59 @@ #!/usr/bin/env python +from collections import defaultdict +import pathlib + from babel.dates import get_month_names +# lark grammar path relative to this script +GRAMMAR_DIR_PATH = ( + pathlib.Path(__file__).parent.parent / "src" / "undate" / "converters" / "grammars" +) +# file that is generated by this script, in that directory +MONTH_GRAMMAR_FILE = GRAMMAR_DIR_PATH / "gregorian_multilang.lark" + +# include month names in the following languages +languages = [ + "en", # English + "es", # Spanish + "fr", # French + "de", # German + "rw", # Kinyarwanda + "lg", # Ganda + "ti", # Tigrinya +] -languages = { - "rw": "Kinyarwanda", - "lg": "Ganda", - "ti": "Tigrinya", - "fr": "French", - "en": "English", -} -# for locale_code in ["fr_FR", "de_DE", "rw_rw", "ti_ET", "lg_UG"]: +# warning to include at top of generated file +warning_text = """// WARNING: This file is 
auto-generated. DO NOT EDIT. +// To regenerate: uvx hatch run codegen:generate + +""" def main(): - for lang, name in languages.items(): - print(f"\n### {name} (`{lang}`)") - for width in ["abbreviated", "wide"]: - print( - f"- {width}: " + ", ".join(get_month_names(width, locale=lang).values()) - ) + # create a dictionary of lists to hold the names for each month + all_month_names = defaultdict(list) + + for lang in languages: + for width in ["wide", "abbreviated"]: + for month_num, month_name in get_month_names(width, locale=lang).items(): + # some locales use a . on the shortened month; let's ignore that + all_month_names[month_num].append(month_name.strip(".")) + + with MONTH_GRAMMAR_FILE.open("w") as outfile: + outfile.write(warning_text) + + # for each numeric month, generate a rule with all variant names: + # month_1: "January" | "Jan" ... + for i, names in all_month_names.items(): + # combine all names in an OR string + or_names = " | ".join(f'"{m}"' for m in names) + outfile.write(f"month_{i}: {or_names}\n") + + print( + f"Successfully regenerated {MONTH_GRAMMAR_FILE.relative_to(pathlib.Path.cwd())}" + ) + print("If the file has changed, make sure to commit the new version.") if __name__ == "__main__": From 01ebe5e93ea75c4b9e79dc12a15fcf59b5263b0e Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 12 Feb 2026 13:38:04 -0500 Subject: [PATCH 03/17] Grammar with month names in multiple languages --- .../grammars/gregorian_multilang.lark | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/undate/converters/grammars/gregorian_multilang.lark b/src/undate/converters/grammars/gregorian_multilang.lark index 42e59bf..6505be7 100644 --- a/src/undate/converters/grammars/gregorian_multilang.lark +++ b/src/undate/converters/grammars/gregorian_multilang.lark @@ -1,4 +1,15 @@ +// WARNING: This file is auto-generated. DO NOT EDIT. 
+// To regenerate: uvx hatch run codegen:generate - -month_1: "Jan" | "January" -month_2: "Feb" | "February" \ No newline at end of file +month_1: "January" | "Jan" | "enero" | "ene" | "janvier" | "janv" | "Januar" | "Jan" | "Mutarama" | "mut" | "Janwaliyo" | "Jan" | "ጥሪ" | "ጥሪ" +month_2: "February" | "Feb" | "febrero" | "feb" | "février" | "févr" | "Februar" | "Feb" | "Gashyantare" | "gas" | "Febwaliyo" | "Feb" | "ለካቲት" | "ለካ" +month_3: "March" | "Mar" | "marzo" | "mar" | "mars" | "mars" | "März" | "März" | "Werurwe" | "wer" | "Marisi" | "Mar" | "መጋቢት" | "መጋ" +month_4: "April" | "Apr" | "abril" | "abr" | "avril" | "avr" | "April" | "Apr" | "Mata" | "mat" | "Apuli" | "Apu" | "ሚያዝያ" | "ሚያ" +month_5: "May" | "May" | "mayo" | "may" | "mai" | "mai" | "Mai" | "Mai" | "Gicurasi" | "gic" | "Maayi" | "Maa" | "ጉንበት" | "ግን" +month_6: "June" | "Jun" | "junio" | "jun" | "juin" | "juin" | "Juni" | "Juni" | "Kamena" | "kam" | "Juuni" | "Juu" | "ሰነ" | "ሰነ" +month_7: "July" | "Jul" | "julio" | "jul" | "juillet" | "juil" | "Juli" | "Juli" | "Nyakanga" | "nya" | "Julaayi" | "Jul" | "ሓምለ" | "ሓም" +month_8: "August" | "Aug" | "agosto" | "ago" | "août" | "août" | "August" | "Aug" | "Kanama" | "kan" | "Agusito" | "Agu" | "ነሓሰ" | "ነሓ" +month_9: "September" | "Sep" | "septiembre" | "sept" | "septembre" | "sept" | "September" | "Sept" | "Nzeri" | "nze" | "Sebuttemba" | "Seb" | "መስከረም" | "መስ" +month_10: "October" | "Oct" | "octubre" | "oct" | "octobre" | "oct" | "Oktober" | "Okt" | "Ukwakira" | "ukw" | "Okitobba" | "Oki" | "ጥቅምቲ" | "ጥቅ" +month_11: "November" | "Nov" | "noviembre" | "nov" | "novembre" | "nov" | "November" | "Nov" | "Ugushyingo" | "ugu" | "Novemba" | "Nov" | "ሕዳር" | "ሕዳ" +month_12: "December" | "Dec" | "diciembre" | "dic" | "décembre" | "déc" | "Dezember" | "Dez" | "Ukuboza" | "uku" | "Desemba" | "Des" | "ታሕሳስ" | "ታሕ" From fb2906125acc99273437fed0cba1ff52289918b3 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 12 Feb 2026 13:39:36 -0500 Subject: [PATCH 04/17] Import and 
use all month names --- src/undate/converters/grammars/gregorian.lark | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/undate/converters/grammars/gregorian.lark b/src/undate/converters/grammars/gregorian.lark index 308aee8..b30a594 100644 --- a/src/undate/converters/grammars/gregorian.lark +++ b/src/undate/converters/grammars/gregorian.lark @@ -1,12 +1,13 @@ %import common.WS %ignore WS -%import .gregorian_multilang (month_1, month_2) +%import .gregorian_multilang (month_1, month_2, month_3, month_4, month_5, \ + month_6, month_7, month_8, month_9, month_10, month_11, month_12) // no weekday support for now gregorian_date: day month year | month day year | year month day | month year | year month | year | day month | month day -// months have 29 to 30 days; we do not expect leading zeroes +// months have 28 to 31 days; we do not expect leading zeroes day: /[1-9]/ | /[12][0-9]/ | /3[0-1]/ // Gregorian calendar started in 1582; assume years with 3 or more digits for now, @@ -15,16 +16,15 @@ year: /\d{3,}/ // months month: month_1 - | month_2 - - // | month_3 - // | month_4 - // | month_5 - // | month_6 - // | month_7 - // | month_8 - // | month_9 - // | month_10 - // | month_11 - // | month_12 + | month_2 + | month_3 + | month_4 + | month_5 + | month_6 + | month_7 + | month_8 + | month_9 + | month_10 + | month_11 + | month_12 From 6d6259b50cab0b20dae30a204257ce14a7c77157 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 12 Feb 2026 13:46:41 -0500 Subject: [PATCH 05/17] Don't repeat month names / abbreviations --- scripts/generate_gregorian_grammar.py | 7 +++++- .../grammars/gregorian_multilang.lark | 22 +++++++++---------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/scripts/generate_gregorian_grammar.py b/scripts/generate_gregorian_grammar.py index 9b5f770..a5f4563 100644 --- a/scripts/generate_gregorian_grammar.py +++ b/scripts/generate_gregorian_grammar.py @@ -38,7 +38,12 @@ def main(): for width in 
["wide", "abbreviated"]: for month_num, month_name in get_month_names(width, locale=lang).items(): # some locales use a . on the shortened month; let's ignore that - all_month_names[month_num].append(month_name.strip(".")) + month_name = month_name.strip(".") + # In some cases different languages have the same abbreviations; + # in some cases, abbreviated and full are the same. + # Only add if not already present, to avoid redundancy + if month_name not in all_month_names[month_num]: + all_month_names[month_num].append(month_name) with MONTH_GRAMMAR_FILE.open("w") as outfile: outfile.write(warning_text) diff --git a/src/undate/converters/grammars/gregorian_multilang.lark b/src/undate/converters/grammars/gregorian_multilang.lark index 6505be7..83dcbb1 100644 --- a/src/undate/converters/grammars/gregorian_multilang.lark +++ b/src/undate/converters/grammars/gregorian_multilang.lark @@ -1,15 +1,15 @@ // WARNING: This file is auto-generated. DO NOT EDIT. // To regenerate: uvx hatch run codegen:generate -month_1: "January" | "Jan" | "enero" | "ene" | "janvier" | "janv" | "Januar" | "Jan" | "Mutarama" | "mut" | "Janwaliyo" | "Jan" | "ጥሪ" | "ጥሪ" -month_2: "February" | "Feb" | "febrero" | "feb" | "février" | "févr" | "Februar" | "Feb" | "Gashyantare" | "gas" | "Febwaliyo" | "Feb" | "ለካቲት" | "ለካ" -month_3: "March" | "Mar" | "marzo" | "mar" | "mars" | "mars" | "März" | "März" | "Werurwe" | "wer" | "Marisi" | "Mar" | "መጋቢት" | "መጋ" -month_4: "April" | "Apr" | "abril" | "abr" | "avril" | "avr" | "April" | "Apr" | "Mata" | "mat" | "Apuli" | "Apu" | "ሚያዝያ" | "ሚያ" -month_5: "May" | "May" | "mayo" | "may" | "mai" | "mai" | "Mai" | "Mai" | "Gicurasi" | "gic" | "Maayi" | "Maa" | "ጉንበት" | "ግን" -month_6: "June" | "Jun" | "junio" | "jun" | "juin" | "juin" | "Juni" | "Juni" | "Kamena" | "kam" | "Juuni" | "Juu" | "ሰነ" | "ሰነ" -month_7: "July" | "Jul" | "julio" | "jul" | "juillet" | "juil" | "Juli" | "Juli" | "Nyakanga" | "nya" | "Julaayi" | "Jul" | "ሓምለ" | "ሓም" -month_8: "August" | "Aug" | 
"agosto" | "ago" | "août" | "août" | "August" | "Aug" | "Kanama" | "kan" | "Agusito" | "Agu" | "ነሓሰ" | "ነሓ" -month_9: "September" | "Sep" | "septiembre" | "sept" | "septembre" | "sept" | "September" | "Sept" | "Nzeri" | "nze" | "Sebuttemba" | "Seb" | "መስከረም" | "መስ" -month_10: "October" | "Oct" | "octubre" | "oct" | "octobre" | "oct" | "Oktober" | "Okt" | "Ukwakira" | "ukw" | "Okitobba" | "Oki" | "ጥቅምቲ" | "ጥቅ" -month_11: "November" | "Nov" | "noviembre" | "nov" | "novembre" | "nov" | "November" | "Nov" | "Ugushyingo" | "ugu" | "Novemba" | "Nov" | "ሕዳር" | "ሕዳ" +month_1: "January" | "Jan" | "enero" | "ene" | "janvier" | "janv" | "Januar" | "Mutarama" | "mut" | "Janwaliyo" | "ጥሪ" +month_2: "February" | "Feb" | "febrero" | "feb" | "février" | "févr" | "Februar" | "Gashyantare" | "gas" | "Febwaliyo" | "ለካቲት" | "ለካ" +month_3: "March" | "Mar" | "marzo" | "mar" | "mars" | "März" | "Werurwe" | "wer" | "Marisi" | "መጋቢት" | "መጋ" +month_4: "April" | "Apr" | "abril" | "abr" | "avril" | "avr" | "Mata" | "mat" | "Apuli" | "Apu" | "ሚያዝያ" | "ሚያ" +month_5: "May" | "mayo" | "may" | "mai" | "Mai" | "Gicurasi" | "gic" | "Maayi" | "Maa" | "ጉንበት" | "ግን" +month_6: "June" | "Jun" | "junio" | "jun" | "juin" | "Juni" | "Kamena" | "kam" | "Juuni" | "Juu" | "ሰነ" +month_7: "July" | "Jul" | "julio" | "jul" | "juillet" | "juil" | "Juli" | "Nyakanga" | "nya" | "Julaayi" | "ሓምለ" | "ሓም" +month_8: "August" | "Aug" | "agosto" | "ago" | "août" | "Kanama" | "kan" | "Agusito" | "Agu" | "ነሓሰ" | "ነሓ" +month_9: "September" | "Sep" | "septiembre" | "sept" | "septembre" | "Sept" | "Nzeri" | "nze" | "Sebuttemba" | "Seb" | "መስከረም" | "መስ" +month_10: "October" | "Oct" | "octubre" | "oct" | "octobre" | "Oktober" | "Okt" | "Ukwakira" | "ukw" | "Okitobba" | "Oki" | "ጥቅምቲ" | "ጥቅ" +month_11: "November" | "Nov" | "noviembre" | "nov" | "novembre" | "Ugushyingo" | "ugu" | "Novemba" | "ሕዳር" | "ሕዳ" month_12: "December" | "Dec" | "diciembre" | "dic" | "décembre" | "déc" | "Dezember" | "Dez" | "Ukuboza" | "uku" | "Desemba" | 
"Des" | "ታሕሳስ" | "ታሕ" From 294e573ee283d292a198884ed8ec659a75578a8c Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 12 Feb 2026 13:52:59 -0500 Subject: [PATCH 06/17] Add more test cases in multiple languages --- .../test_gregorian/test_gregorian_parser.py | 58 ++++++++++++------- 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/tests/test_converters/test_calendars/test_gregorian/test_gregorian_parser.py b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_parser.py index 83df118..f4f5af8 100644 --- a/tests/test_converters/test_calendars/test_gregorian/test_gregorian_parser.py +++ b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_parser.py @@ -4,22 +4,40 @@ from undate.converters.calendars.gregorian.parser import gregorian_parser -# for now, just test that valid dates can be parsed +# test that valid dates can be parsed to confirm parser is working correctly testcases = [ # year "2012", + # three digit year + "566", # month + year "Jan 1960", "Feb 1801", "1900 Feb", - # day + month + year - "Feb 5 1602", - # "1602 February 5", + # day + month + year in any order + "May 5 1602", + "5 May 1602", + "1602 October 5", # day + month - "January 5", - "5 Jan", - # three digit year + "December 5", + "5 December", + # Kinyarwanda (rw) + "2025 ugu 4", # Babel renders as "2025 ugu. 4" + "2025 Ugushyingo 4", + "2025 ugu", + "2025 Ugushyingo", + # Ganda (lg) + "4 Novemba 2025", + "4 Nov 2025", + "Novemba 2025", + "4 Novemba", + # Tigrinya (ti) + "ሕዳ 4 2025", # Babel renders with a comma after the day + "ሕዳር 4 2025", + # French + "18 avril 2025", + "18 avr 2025", # Babel renders as 18 avr. 
2025 ] @@ -29,20 +47,20 @@ def test_should_parse(date_string): error_cases = [ - # # invalid days - ("0 Tammuz 5403", UnexpectedCharacters), - # ("31 Tishri 5403", UnexpectedCharacters), - # # month alone - # ("Tishri", UnexpectedEOF), - # # month day only + # invalid days + ("0 June 1006", UnexpectedCharacters), + ("42 March 1206", UnexpectedCharacters), + # month alone + ("Juin", UnexpectedCharacters), + # day only ("12 ", UnexpectedEOF), - # # invalid month - # ("Foo 383", UnexpectedCharacters), - # # wrong format - # ("2024-10-02", UnexpectedCharacters), - # # year month day not supported - # ("5403 Adar", UnexpectedCharacters), - # ("5403 Adar 14", UnexpectedCharacters), + # non-Gregorian month + ("5 Tammuz 5403", UnexpectedCharacters), + ("31 Tishri 5403", UnexpectedCharacters), + # invalid month + ("Foo 383", UnexpectedCharacters), + # wrong format + ("2024-10-02", UnexpectedCharacters), ] From f654e830d241ee9629146072cc6a53b37cf3320b Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 12 Feb 2026 14:12:28 -0500 Subject: [PATCH 07/17] Test gregorian parser transformer; refine parsing logic --- src/undate/converters/grammars/gregorian.lark | 10 +++++- .../test_gregorian/test_gregorian_parser.py | 6 ++-- .../test_gregorian_transformer.py | 31 +++++++++++++++++++ 3 files changed, 43 insertions(+), 4 deletions(-) create mode 100644 tests/test_converters/test_calendars/test_gregorian/test_gregorian_transformer.py diff --git a/src/undate/converters/grammars/gregorian.lark b/src/undate/converters/grammars/gregorian.lark index b30a594..d70b0db 100644 --- a/src/undate/converters/grammars/gregorian.lark +++ b/src/undate/converters/grammars/gregorian.lark @@ -1,9 +1,15 @@ %import common.WS %ignore WS +// Some abbreviations use periods; some default date formats +// include commas. Ignore both +PUNCTUATION: "." 
| "," +%ignore PUNCTUATION + %import .gregorian_multilang (month_1, month_2, month_3, month_4, month_5, \ month_6, month_7, month_8, month_9, month_10, month_11, month_12) + // no weekday support for now gregorian_date: day month year | month day year | year month day | month year | year month | year | day month | month day @@ -12,7 +18,9 @@ day: /[1-9]/ | /[12][0-9]/ | /3[0-1]/ // Gregorian calendar started in 1582; assume years with 3 or more digits for now, // so we can support mixed day / year order unambiguously -year: /\d{3,}/ +year: /\b\d{3,}\b/ +// Use word boundaries to separate from other tokens (esp. numeric day), +// since we otherwise ignore whitespace // months month: month_1 diff --git a/tests/test_converters/test_calendars/test_gregorian/test_gregorian_parser.py b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_parser.py index f4f5af8..6cb7ba6 100644 --- a/tests/test_converters/test_calendars/test_gregorian/test_gregorian_parser.py +++ b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_parser.py @@ -23,7 +23,7 @@ "December 5", "5 December", # Kinyarwanda (rw) - "2025 ugu 4", # Babel renders as "2025 ugu. 4" + "2025 ugu. 4", "2025 Ugushyingo 4", "2025 ugu", "2025 Ugushyingo", @@ -33,11 +33,11 @@ "Novemba 2025", "4 Novemba", # Tigrinya (ti) - "ሕዳ 4 2025", # Babel renders with a comma after the day + "ሕዳ 4, 2025", "ሕዳር 4 2025", # French "18 avril 2025", - "18 avr 2025", # Babel renders as 18 avr. 2025 + "18 avr. 
2025", ] diff --git a/tests/test_converters/test_calendars/test_gregorian/test_gregorian_transformer.py b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_transformer.py new file mode 100644 index 0000000..114a713 --- /dev/null +++ b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_transformer.py @@ -0,0 +1,31 @@ +import pytest +from undate.converters.calendars.gregorian.parser import gregorian_parser +from undate.converters.calendars.gregorian.transformer import GregorianDateTransformer +from undate.undate import Undate, Calendar +from undate.date import DatePrecision + + +testcases = [ + ("2012", Undate(2012), DatePrecision.YEAR), + ("May 13 1602", Undate(1602, 5, 13), DatePrecision.DAY), + ("Jan 1960", Undate(1960, 1), DatePrecision.MONTH), + ("2022 ugu. 4", Undate(2022, 11, 4), DatePrecision.DAY), + ("2022 Ugushyingo", Undate(2022, 11), DatePrecision.MONTH), + ("4 Novemba", Undate(month=11, day=4), DatePrecision.DAY), + # ignores whitespace, comma, period + ("4Novemba", Undate(month=11, day=4), DatePrecision.DAY), + ("18 avril, 2025", Undate(2025, 4, 18), DatePrecision.DAY), +] + + +@pytest.mark.parametrize("date_string,expected,expected_precision", testcases) +def test_transform(date_string, expected, expected_precision): + transformer = GregorianDateTransformer(visit_tokens=True) + # parse the input string, then transform to undate object + parsetree = gregorian_parser.parse(date_string) + transformed_date = transformer.transform(parsetree) + # use EDTF to compare so we can check dates with unknown years + assert transformed_date.format("EDTF") == expected.format("EDTF") + # currently only returns undate, parser doesn't support intervals + assert transformed_date.precision == expected_precision + assert transformed_date.calendar == Calendar.GREGORIAN From 5b5d89f0e6246ac428b1af4e80504d3b8a395c3c Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 12 Feb 2026 14:20:21 -0500 Subject: [PATCH 08/17] Connect parsing to gregorian 
converter class and test --- .../calendars/gregorian/converter.py | 30 +++++++++++++++++++ .../test_gregorian_converter.py | 27 +++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/src/undate/converters/calendars/gregorian/converter.py b/src/undate/converters/calendars/gregorian/converter.py index b3b103b..30e2dfc 100644 --- a/src/undate/converters/calendars/gregorian/converter.py +++ b/src/undate/converters/calendars/gregorian/converter.py @@ -1,6 +1,11 @@ from calendar import monthrange, isleap +from lark.exceptions import UnexpectedCharacters + +from undate.undate import Undate from undate.converters.base import BaseCalendarConverter +from undate.converters.calendars.gregorian.parser import gregorian_parser +from undate.converters.calendars.gregorian.transformer import GregorianDateTransformer class GregorianDateConverter(BaseCalendarConverter): @@ -18,6 +23,9 @@ class GregorianDateConverter(BaseCalendarConverter): #: arbitrary known leap year LEAP_YEAR: int = 2024 + def __init__(self): + self.transformer = GregorianDateTransformer() + def min_month(self) -> int: """First month for the Gregorian calendar.""" return 1 @@ -79,3 +87,25 @@ def to_gregorian(self, year, month, day) -> tuple[int, int, int]: a common point of comparison. """ return (year, month, day) + + def parse(self, value: str) -> Undate: + """ + Parse a Gregorian date string of any supported precision in any + supported language and return an :class:`~undate.undate.Undate`. + The input date string is preserved in the label of the resulting + Undate object. 
+ """ + if not value: + raise ValueError("Parsing empty string is not supported") + + # parse the input string, then transform to undate object + try: + # parse the string with our Gregorian date parser + parsetree = gregorian_parser.parse(value) + # transform the parse tree into an undate object + undate_obj = self.transformer.transform(parsetree) + # set the original date string as the label + undate_obj.label = value + return undate_obj + except UnexpectedCharacters as err: + raise ValueError(f"Could not parse '{value}' as a Gregorian date") from err diff --git a/tests/test_converters/test_calendars/test_gregorian/test_gregorian_converter.py b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_converter.py index e0bf5ef..f506d23 100644 --- a/tests/test_converters/test_calendars/test_gregorian/test_gregorian_converter.py +++ b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_converter.py @@ -1,3 +1,5 @@ +from undate.date import DatePrecision +from undate.undate import Undate, Calendar from undate.converters.calendars import GregorianDateConverter @@ -38,3 +40,28 @@ def test_representative_years(self): converter.LEAP_YEAR, converter.NON_LEAP_YEAR, ] + + def test_parse(self): + # day + date_str = "2022 Ugushyingo 26" + date = GregorianDateConverter().parse(date_str) + assert date == Undate(2022, 11, 26) # Ugushyingo = November + assert date.calendar == Calendar.GREGORIAN + assert date.precision == DatePrecision.DAY + assert date.label == date_str + + # month + date_str = "avril 1362" + date = GregorianDateConverter().parse(date_str) + assert date == Undate(1362, 4) + assert date.calendar == Calendar.GREGORIAN + assert date.precision == DatePrecision.MONTH + assert date.label == date_str + + # year + date_str = "932" + date = GregorianDateConverter().parse(date_str) + assert date == Undate(932) + assert date.calendar == Calendar.GREGORIAN + assert date.precision == DatePrecision.YEAR + assert date.label == date_str From 
6600f58d0f89a69df79926edc08b65705546956f Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 12 Feb 2026 14:38:14 -0500 Subject: [PATCH 09/17] Add Gregorian to omnibus parser --- .../calendars/gregorian/transformer.py | 5 +++++ src/undate/converters/combined.py | 4 +++- src/undate/converters/grammars/combined.lark | 18 +++++++++++++++--- src/undate/converters/grammars/gregorian.lark | 6 +++--- tests/test_converters/test_combined_parser.py | 5 +++++ 5 files changed, 31 insertions(+), 7 deletions(-) diff --git a/src/undate/converters/calendars/gregorian/transformer.py b/src/undate/converters/calendars/gregorian/transformer.py index 7f24e02..a8e7048 100644 --- a/src/undate/converters/calendars/gregorian/transformer.py +++ b/src/undate/converters/calendars/gregorian/transformer.py @@ -35,3 +35,8 @@ def month(self, items): tree = items[0] month_n = tree.data.split("_")[-1] return Tree(data="month", children=[month_n]) + + def day(self, items): + # combine multiple parts into a single string + value = "".join([str(i) for i in items]) + return Tree(data="day", children=[value]) diff --git a/src/undate/converters/combined.py b/src/undate/converters/combined.py index 54d66a5..c74beb9 100644 --- a/src/undate/converters/combined.py +++ b/src/undate/converters/combined.py @@ -13,6 +13,7 @@ from undate import Undate, UndateInterval from undate.converters import BaseDateConverter, GRAMMAR_FILE_PATH from undate.converters.edtf.transformer import EDTFTransformer +from undate.converters.calendars.gregorian.transformer import GregorianDateTransformer from undate.converters.calendars.hebrew.transformer import HebrewDateTransformer from undate.converters.calendars.islamic.transformer import IslamicDateTransformer @@ -33,6 +34,7 @@ def start(self, children): edtf=EDTFTransformer(), hebrew=HebrewDateTransformer(), islamic=IslamicDateTransformer(), + gregorian=GregorianDateTransformer(), ) @@ -45,7 +47,7 @@ def start(self, children): class OmnibusDateConverter(BaseDateConverter): """ 
Combination parser that aggregates existing parser grammars. - Currently supports EDTF, Hebrew, and Hijri where dates are unambiguous. + Currently supports EDTF, Gregorian, Hebrew, and Hijri where dates are unambiguous. (Year-only dates are parsed as EDTF in Gregorian calendar.) Does not support serialization. diff --git a/src/undate/converters/grammars/combined.lark b/src/undate/converters/grammars/combined.lark index 0e77b5c..1f87208 100644 --- a/src/undate/converters/grammars/combined.lark +++ b/src/undate/converters/grammars/combined.lark @@ -1,7 +1,12 @@ %import common.WS %ignore WS -start: (edtf__start | hebrew__hebrew_date | islamic__islamic_date ) +// Some abbreviations use periods; some default date formats +// include commas. Ignore both. (Copied from gregorian.lark) +PUNCTUATION: "." | "," +%ignore PUNCTUATION + +start: (edtf__start | hebrew__hebrew_date | islamic__islamic_date | gregorian__gregorian_date ) // Renaming of the import variables is required, as they receive the namespace of this file. 
// See: https://github.com/lark-parser/lark/pull/973#issuecomment-907287565 @@ -23,10 +28,17 @@ start: (edtf__start | hebrew__hebrew_date | islamic__islamic_date ) %import .islamic.month -> islamic__month %import .islamic.year -> islamic__year +// gregorian calendar, in multiple languages +%import .gregorian.gregorian_date -> gregorian__gregorian_date + // override hebrew date to omit year-only, since year without calendar is ambiguous // NOTE: potentially support year with calendar label -%override hebrew__hebrew_date: hebrew__day hebrew__month hebrew__year | hebrew__month hebrew__year +%override hebrew__hebrew_date: hebrew__day hebrew__month hebrew__year | hebrew__month hebrew__year // same for islamic date, year alone is ambiguous -%override islamic__islamic_date: islamic__day islamic__month islamic__year | islamic__month islamic__year +%override islamic__islamic_date: islamic__day islamic__month islamic__year | islamic__month islamic__year + +// same as above. omit year only, since covered by EDTF +// %override gregorian__gregorian_date: day month year | month day year | year month day | month year | year month | day month | month day + diff --git a/src/undate/converters/grammars/gregorian.lark b/src/undate/converters/grammars/gregorian.lark index d70b0db..74cff9b 100644 --- a/src/undate/converters/grammars/gregorian.lark +++ b/src/undate/converters/grammars/gregorian.lark @@ -11,15 +11,15 @@ PUNCTUATION: "." 
| "," // no weekday support for now -gregorian_date: day month year | month day year | year month day | month year | year month | year | day month | month day +gregorian_date: day month year | month day year | year month day | month year | year month | year | day month | month day // months have 28 to 31 days; we do not expect leading zeroes -day: /[1-9]/ | /[12][0-9]/ | /3[0-1]/ +day: /[1-9]/ | /[12][0-9]/ | /3[0-1]/ // Gregorian calendar started in 1582; assume years with 3 or more digits for now, // so we can support mixed day / year order unambiguously year: /\b\d{3,}\b/ -// Use word boundaries to separate from other tokens (esp. numeric day), +// Use word boundaries to separate from other tokens (esp. numeric day), // since we otherwise ignore whitespace // months diff --git a/tests/test_converters/test_combined_parser.py b/tests/test_converters/test_combined_parser.py index 717a16e..d6e0621 100644 --- a/tests/test_converters/test_combined_parser.py +++ b/tests/test_converters/test_combined_parser.py @@ -19,6 +19,11 @@ ("Jumādā I 1243", Undate(1243, 5, calendar="Islamic")), ("7 Jumādā I 1243", Undate(1243, 5, 7, calendar="Islamic")), ("14 Rabīʿ I 901", Undate(901, 3, 14, calendar="Islamic")), + # Gregorian with non-numeric month (full or abbreviated) + ("June 1602", Undate(1602, 6, calendar="Gregorian")), + ("13 Jan 1602", Undate(1602, 1, 13, calendar="Gregorian")), + ("2022 ugu. 
4", Undate(2022, 11, 4, calendar="Gregorian")), + ("18 avril", Undate(month=4, day=18, calendar="Gregorian")), ] From 2bd8c23cd32435cd76788aefd0257ada645622ad Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 12 Feb 2026 14:41:59 -0500 Subject: [PATCH 10/17] Document Gregorian parser & languages in change log --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 51746ff..5375a5b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Change Log +## 0.7 + +- Add parsing to Gregorian date converter; supports month names (full or abbreviated) + in English, French, German, Spanish, Kinyarwanda, Ganda, and Tigrinya +- Include Gregorian dates in omnibus parser + ## 0.6 - Experimental omnibus date converter + parser (EDTF, Hebrew, Hijri) From 9ca84244068960fbdad55d6f99c0e30de71d85cc Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 12 Feb 2026 14:59:27 -0500 Subject: [PATCH 11/17] Add dev notes for codegen script; drop uvx from hatch run command --- DEVELOPER_NOTES.md | 17 ++++++++++++++++- scripts/generate_gregorian_grammar.py | 12 +++++++++++- .../grammars/gregorian_multilang.lark | 2 +- 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/DEVELOPER_NOTES.md b/DEVELOPER_NOTES.md index 6d4918c..e74c145 100644 --- a/DEVELOPER_NOTES.md +++ b/DEVELOPER_NOTES.md @@ -88,4 +88,19 @@ pip install -e ".[docs]" sphinx-build docs docs/_build ``` -HTML documentation will be generated in `docs/_build/html` \ No newline at end of file +HTML documentation will be generated in `docs/_build/html` + + +### Regenerating multilingual Gregorian month name parse file + +The Gregorian Lark parser includes a script-generated file, which +populates month names based on a list of language codes using the Babel +library. To regenerate, run the script with hatch (which should +be installed globally):: + + hatch run codegen:generate + +When the `.lark` file is modified by the script, it must be committed to git. 
+ + + diff --git a/scripts/generate_gregorian_grammar.py b/scripts/generate_gregorian_grammar.py index a5f4563..ce68362 100644 --- a/scripts/generate_gregorian_grammar.py +++ b/scripts/generate_gregorian_grammar.py @@ -1,4 +1,14 @@ #!/usr/bin/env python +""" +This script generates the gregorian_multilang.lark file +with month names (full and abbreviated) based on the list of +target languages. + +Run this script with hatch to regeneate the file:: + + hatch run codegen:generate + +""" from collections import defaultdict import pathlib @@ -25,7 +35,7 @@ # warning to include at top of generated file warning_text = """// WARNING: This file is auto-generated. DO NOT EDIT. -// To regenerate: uvx hatch run codegen:generate +// To regenerate: hatch run codegen:generate """ diff --git a/src/undate/converters/grammars/gregorian_multilang.lark b/src/undate/converters/grammars/gregorian_multilang.lark index 83dcbb1..a4d6045 100644 --- a/src/undate/converters/grammars/gregorian_multilang.lark +++ b/src/undate/converters/grammars/gregorian_multilang.lark @@ -1,5 +1,5 @@ // WARNING: This file is auto-generated. DO NOT EDIT. 
-// To regenerate: uvx hatch run codegen:generate +// To regenerate: hatch run codegen:generate month_1: "January" | "Jan" | "enero" | "ene" | "janvier" | "janv" | "Januar" | "Mutarama" | "mut" | "Janwaliyo" | "ጥሪ" month_2: "February" | "Feb" | "febrero" | "feb" | "février" | "févr" | "Februar" | "Gashyantare" | "gas" | "Febwaliyo" | "ለካቲት" | "ለካ" From e4c468dc2bd07597fd4989cdc8374d63ac41caa6 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 12 Feb 2026 15:17:15 -0500 Subject: [PATCH 12/17] Make Gregorian parser case-insensitive --- scripts/generate_gregorian_grammar.py | 13 +++++----- .../calendars/gregorian/converter.py | 2 +- .../grammars/gregorian_multilang.lark | 24 +++++++++---------- .../test_gregorian/test_gregorian_parser.py | 5 +++- 4 files changed, 24 insertions(+), 20 deletions(-) diff --git a/scripts/generate_gregorian_grammar.py b/scripts/generate_gregorian_grammar.py index ce68362..822bbf7 100644 --- a/scripts/generate_gregorian_grammar.py +++ b/scripts/generate_gregorian_grammar.py @@ -4,7 +4,7 @@ with month names (full and abbreviated) based on the list of target languages. -Run this script with hatch to regeneate the file:: +Run this script with hatch to regenerate the file:: hatch run codegen:generate @@ -48,7 +48,7 @@ def main(): for width in ["wide", "abbreviated"]: for month_num, month_name in get_month_names(width, locale=lang).items(): # some locales use a . on the shortened month; let's ignore that - month_name = month_name.strip(".") + month_name = month_name.strip(".").lower() # In some cases different languages have the same abbreviations; # in some cases, abbreviated and full are the same. # Only add if not already present, to avoid redundancy @@ -59,11 +59,12 @@ def main(): outfile.write(warning_text) # for each numeric month, generate a rule with all variant names: - # month_1: "January" | "Jan" ... 
+ # month_1: /January|Jan/i for i, names in all_month_names.items(): - # combine all names in an OR string - or_names = " | ".join(f'"{m}"' for m in names) - outfile.write(f"month_{i}: {or_names}\n") + # combine all names in a case-insensitive OR regex + # sort shortest variants last to avoid partial matches hitting first + or_names = "|".join(sorted(names, key=len, reverse=True)) + outfile.write(f"month_{i}: /({or_names})/i\n") print( f"Successfully regenerated {MONTH_GRAMMAR_FILE.relative_to(pathlib.Path.cwd())}" diff --git a/src/undate/converters/calendars/gregorian/converter.py b/src/undate/converters/calendars/gregorian/converter.py index 30e2dfc..f51c0dc 100644 --- a/src/undate/converters/calendars/gregorian/converter.py +++ b/src/undate/converters/calendars/gregorian/converter.py @@ -100,7 +100,7 @@ def parse(self, value: str) -> Undate: # parse the input string, then transform to undate object try: - # parse the string with our Hebrew date parser + # parse the string with our Gregorian date parser parsetree = gregorian_parser.parse(value) # transform the parse tree into an undate object undate_obj = self.transformer.transform(parsetree) diff --git a/src/undate/converters/grammars/gregorian_multilang.lark b/src/undate/converters/grammars/gregorian_multilang.lark index a4d6045..5cd2927 100644 --- a/src/undate/converters/grammars/gregorian_multilang.lark +++ b/src/undate/converters/grammars/gregorian_multilang.lark @@ -1,15 +1,15 @@ // WARNING: This file is auto-generated. DO NOT EDIT. 
// To regenerate: hatch run codegen:generate -month_1: "January" | "Jan" | "enero" | "ene" | "janvier" | "janv" | "Januar" | "Mutarama" | "mut" | "Janwaliyo" | "ጥሪ" -month_2: "February" | "Feb" | "febrero" | "feb" | "février" | "févr" | "Februar" | "Gashyantare" | "gas" | "Febwaliyo" | "ለካቲት" | "ለካ" -month_3: "March" | "Mar" | "marzo" | "mar" | "mars" | "März" | "Werurwe" | "wer" | "Marisi" | "መጋቢት" | "መጋ" -month_4: "April" | "Apr" | "abril" | "abr" | "avril" | "avr" | "Mata" | "mat" | "Apuli" | "Apu" | "ሚያዝያ" | "ሚያ" -month_5: "May" | "mayo" | "may" | "mai" | "Mai" | "Gicurasi" | "gic" | "Maayi" | "Maa" | "ጉንበት" | "ግን" -month_6: "June" | "Jun" | "junio" | "jun" | "juin" | "Juni" | "Kamena" | "kam" | "Juuni" | "Juu" | "ሰነ" -month_7: "July" | "Jul" | "julio" | "jul" | "juillet" | "juil" | "Juli" | "Nyakanga" | "nya" | "Julaayi" | "ሓምለ" | "ሓም" -month_8: "August" | "Aug" | "agosto" | "ago" | "août" | "Kanama" | "kan" | "Agusito" | "Agu" | "ነሓሰ" | "ነሓ" -month_9: "September" | "Sep" | "septiembre" | "sept" | "septembre" | "Sept" | "Nzeri" | "nze" | "Sebuttemba" | "Seb" | "መስከረም" | "መስ" -month_10: "October" | "Oct" | "octubre" | "oct" | "octobre" | "Oktober" | "Okt" | "Ukwakira" | "ukw" | "Okitobba" | "Oki" | "ጥቅምቲ" | "ጥቅ" -month_11: "November" | "Nov" | "noviembre" | "nov" | "novembre" | "Ugushyingo" | "ugu" | "Novemba" | "ሕዳር" | "ሕዳ" -month_12: "December" | "Dec" | "diciembre" | "dic" | "décembre" | "déc" | "Dezember" | "Dez" | "Ukuboza" | "uku" | "Desemba" | "Des" | "ታሕሳስ" | "ታሕ" +month_1: /(janwaliyo|mutarama|january|janvier|januar|enero|janv|jan|ene|mut|ጥሪ)/i +month_2: /(gashyantare|febwaliyo|february|febrero|février|februar|févr|ለካቲት|feb|gas|ለካ)/i +month_3: /(werurwe|marisi|march|marzo|mars|märz|መጋቢት|mar|wer|መጋ)/i +month_4: /(april|abril|avril|apuli|mata|ሚያዝያ|apr|abr|avr|mat|apu|ሚያ)/i +month_5: /(gicurasi|maayi|mayo|ጉንበት|may|mai|gic|maa|ግን)/i +month_6: /(kamena|junio|juuni|june|juin|juni|jun|kam|juu|ሰነ)/i +month_7: 
/(nyakanga|juillet|julaayi|julio|july|juil|juli|jul|nya|ሓምለ|ሓም)/i +month_8: /(agusito|august|agosto|kanama|août|aug|ago|kan|agu|ነሓሰ|ነሓ)/i +month_9: /(septiembre|sebuttemba|september|septembre|nzeri|መስከረም|sept|sep|nze|seb|መስ)/i +month_10: /(ukwakira|okitobba|october|octubre|octobre|oktober|ጥቅምቲ|oct|okt|ukw|oki|ጥቅ)/i +month_11: /(ugushyingo|noviembre|november|novembre|novemba|nov|ugu|ሕዳር|ሕዳ)/i +month_12: /(diciembre|december|décembre|dezember|ukuboza|desemba|ታሕሳስ|dec|dic|déc|dez|uku|des|ታሕ)/i diff --git a/tests/test_converters/test_calendars/test_gregorian/test_gregorian_parser.py b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_parser.py index 6cb7ba6..3938bad 100644 --- a/tests/test_converters/test_calendars/test_gregorian/test_gregorian_parser.py +++ b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_parser.py @@ -38,6 +38,9 @@ # French "18 avril 2025", "18 avr. 2025", + # case-insensitive + "18 JUNE 2025", + "Avril 2025", ] @@ -51,7 +54,7 @@ def test_should_parse(date_string): ("0 June 1006", UnexpectedCharacters), ("42 March 1206", UnexpectedCharacters), # month alone - ("Juin", UnexpectedCharacters), + ("Juin", UnexpectedEOF), # day only ("12 ", UnexpectedEOF), # non-Gregorian month From a29a5a4e338fbd66a9f27e71d00d669d7907a4a1 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 12 Feb 2026 15:21:09 -0500 Subject: [PATCH 13/17] Test error handling in gregorian converter parse method --- .../test_gregorian/test_gregorian_converter.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_converters/test_calendars/test_gregorian/test_gregorian_converter.py b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_converter.py index f506d23..9839b34 100644 --- a/tests/test_converters/test_calendars/test_gregorian/test_gregorian_converter.py +++ b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_converter.py @@ -1,3 +1,6 @@ +import pytest + + from undate.date import DatePrecision 
from undate.undate import Undate, Calendar from undate.converters.calendars import GregorianDateConverter @@ -65,3 +68,10 @@ def test_parse(self): assert date.calendar == Calendar.GREGORIAN assert date.precision == DatePrecision.YEAR assert date.label == date_str + + def test_parse_errors(self): + with pytest.raises(ValueError, match="empty string is not supported"): + GregorianDateConverter().parse("") + + with pytest.raises(ValueError, match="Could not parse"): + GregorianDateConverter().parse("Foo 1920") From b9c2bf6d1eeda935203883b388ada7e3f838f29e Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 12 Feb 2026 15:29:03 -0500 Subject: [PATCH 14/17] Catch more generic Lark exception per @coderabbitai --- src/undate/converters/calendars/gregorian/converter.py | 4 ++-- src/undate/converters/calendars/hebrew/converter.py | 4 ++-- src/undate/converters/calendars/islamic/converter.py | 4 ++-- src/undate/converters/combined.py | 4 ++-- src/undate/converters/edtf/converter.py | 8 ++++---- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/undate/converters/calendars/gregorian/converter.py b/src/undate/converters/calendars/gregorian/converter.py index f51c0dc..9aa954d 100644 --- a/src/undate/converters/calendars/gregorian/converter.py +++ b/src/undate/converters/calendars/gregorian/converter.py @@ -1,6 +1,6 @@ from calendar import monthrange, isleap -from lark.exceptions import UnexpectedCharacters +from lark.exceptions import UnexpectedInput from undate.undate import Undate from undate.converters.base import BaseCalendarConverter @@ -107,5 +107,5 @@ def parse(self, value: str) -> Undate: # set the original date string as the label undate_obj.label = value return undate_obj - except UnexpectedCharacters as err: + except UnexpectedInput as err: raise ValueError(f"Could not parse '{value}' as a Gregorian date") from err diff --git a/src/undate/converters/calendars/hebrew/converter.py b/src/undate/converters/calendars/hebrew/converter.py index 
a8fdfe7..dc8ad19 100644 --- a/src/undate/converters/calendars/hebrew/converter.py +++ b/src/undate/converters/calendars/hebrew/converter.py @@ -1,7 +1,7 @@ from typing import Union from convertdate import hebrew # type: ignore -from lark.exceptions import UnexpectedCharacters +from lark.exceptions import UnexpectedInput from undate import Undate, UndateInterval from undate.converters.base import BaseCalendarConverter @@ -111,7 +111,7 @@ def parse(self, value: str) -> Union[Undate, UndateInterval]: # set the original date as a label, with the calendar name undate_obj.label = f"{value} {self.calendar_name}" return undate_obj - except UnexpectedCharacters as err: + except UnexpectedInput as err: raise ValueError(f"Could not parse '{value}' as a Hebrew date") from err # do we need to support conversion the other direction? diff --git a/src/undate/converters/calendars/islamic/converter.py b/src/undate/converters/calendars/islamic/converter.py index 67f2a64..fae7f7f 100644 --- a/src/undate/converters/calendars/islamic/converter.py +++ b/src/undate/converters/calendars/islamic/converter.py @@ -1,7 +1,7 @@ from typing import Union from convertdate import islamic # type: ignore -from lark.exceptions import UnexpectedCharacters +from lark.exceptions import UnexpectedInput from undate import Undate, UndateInterval from undate.converters.base import BaseCalendarConverter @@ -97,7 +97,7 @@ def parse(self, value: str) -> Union[Undate, UndateInterval]: # set the original date as a label, with the calendar name undate_obj.label = f"{value} {self.calendar_name}" return undate_obj - except UnexpectedCharacters as err: + except UnexpectedInput as err: raise ValueError(f"Could not parse '{value}' as an Islamic date") from err # do we need to support conversion the other direction? 
diff --git a/src/undate/converters/combined.py b/src/undate/converters/combined.py index c74beb9..ec08c31 100644 --- a/src/undate/converters/combined.py +++ b/src/undate/converters/combined.py @@ -7,7 +7,7 @@ from typing import Union from lark import Lark -from lark.exceptions import UnexpectedCharacters +from lark.exceptions import UnexpectedInput from lark.visitors import Transformer, merge_transformers from undate import Undate, UndateInterval @@ -77,7 +77,7 @@ def parse(self, value: str) -> Union[Undate, UndateInterval]: parsetree = parser.parse(value) # transform returns a list; we want the first item in the list return self.transformer.transform(parsetree)[0] - except UnexpectedCharacters: + except UnexpectedInput: raise ValueError( "Parsing failed: '%s' is not in a recognized date format" % value ) diff --git a/src/undate/converters/edtf/converter.py b/src/undate/converters/edtf/converter.py index d0b742f..e5eddac 100644 --- a/src/undate/converters/edtf/converter.py +++ b/src/undate/converters/edtf/converter.py @@ -1,6 +1,6 @@ from typing import Optional, Union -from lark.exceptions import UnexpectedCharacters +from lark.exceptions import UnexpectedInput from undate import Undate, UndateInterval from undate.converters.base import BaseDateConverter @@ -40,10 +40,10 @@ def parse(self, value: str) -> Union[Undate, UndateInterval]: try: parsetree = edtf_parser.parse(value) return self.transformer.transform(parsetree) - except UnexpectedCharacters: + except UnexpectedInput as err: raise ValueError( - "Parsing failed: '%s' is not a supported EDTF date format" % value - ) + f"Parsing failed: '{value}' is not a supported EDTF date format" + ) from err def _convert_missing_digits( self, value: Optional[str], old_missing_digit: str From bb1d724d01b803b0ad3e04d851e0ecc28240f71c Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 12 Feb 2026 15:39:04 -0500 Subject: [PATCH 15/17] Ignore commas and periods across all grammars --- 
src/undate/converters/calendars/hebrew/parser.py | 7 ++++--- src/undate/converters/calendars/islamic/parser.py | 7 ++++--- src/undate/converters/grammars/combined.lark | 7 +++---- src/undate/converters/grammars/gregorian.lark | 7 +++---- src/undate/converters/grammars/hebrew.lark | 9 ++++++--- src/undate/converters/grammars/islamic.lark | 4 ++++ 6 files changed, 24 insertions(+), 17 deletions(-) diff --git a/src/undate/converters/calendars/hebrew/parser.py b/src/undate/converters/calendars/hebrew/parser.py index 3056f85..074d2c5 100644 --- a/src/undate/converters/calendars/hebrew/parser.py +++ b/src/undate/converters/calendars/hebrew/parser.py @@ -4,6 +4,7 @@ grammar_path = GRAMMAR_FILE_PATH / "hebrew.lark" -with open(grammar_path) as grammar: - # NOTE: LALR parser is faster but can't be used to ambiguity between years and dates - hebrew_parser = Lark(grammar.read(), start="hebrew_date", strict=True) +# open based on filename to allow relative imports based on grammar file +hebrew_parser = Lark.open( + str(grammar_path), rel_to=__file__, start="hebrew_date", strict=True +) diff --git a/src/undate/converters/calendars/islamic/parser.py b/src/undate/converters/calendars/islamic/parser.py index 61a0cf0..d753a7a 100644 --- a/src/undate/converters/calendars/islamic/parser.py +++ b/src/undate/converters/calendars/islamic/parser.py @@ -4,6 +4,7 @@ grammar_path = GRAMMAR_FILE_PATH / "islamic.lark" -with open(grammar_path) as grammar: - # NOTE: LALR parser is faster but can't be used due to ambiguity between years and days - islamic_parser = Lark(grammar.read(), start="islamic_date", strict=True) +# open based on filename to allow relative imports based on grammar file +islamic_parser = Lark.open( + str(grammar_path), rel_to=__file__, start="islamic_date", strict=True +) diff --git a/src/undate/converters/grammars/combined.lark b/src/undate/converters/grammars/combined.lark index 1f87208..3f6a568 100644 --- a/src/undate/converters/grammars/combined.lark +++ 
b/src/undate/converters/grammars/combined.lark @@ -1,10 +1,9 @@ %import common.WS %ignore WS -// Some abbreviations use periods; some default date formats -// include commas. Ignore both. (Copied from gregorian.lark) -PUNCTUATION: "." | "," -%ignore PUNCTUATION +// Ignore periods and commas in dates +%import .undate_common.DATE_PUNCTUATION +%ignore DATE_PUNCTUATION start: (edtf__start | hebrew__hebrew_date | islamic__islamic_date | gregorian__gregorian_date ) diff --git a/src/undate/converters/grammars/gregorian.lark b/src/undate/converters/grammars/gregorian.lark index 74cff9b..93338f9 100644 --- a/src/undate/converters/grammars/gregorian.lark +++ b/src/undate/converters/grammars/gregorian.lark @@ -1,10 +1,9 @@ %import common.WS %ignore WS -// Some abbreviations use periods; some default date formats -// include commas. Ignore both -PUNCTUATION: "." | "," -%ignore PUNCTUATION +// Ignore periods and commas in dates +%import .undate_common.DATE_PUNCTUATION +%ignore DATE_PUNCTUATION %import .gregorian_multilang (month_1, month_2, month_3, month_4, month_5, \ month_6, month_7, month_8, month_9, month_10, month_11, month_12) diff --git a/src/undate/converters/grammars/hebrew.lark b/src/undate/converters/grammars/hebrew.lark index 118ed98..1b28d19 100644 --- a/src/undate/converters/grammars/hebrew.lark +++ b/src/undate/converters/grammars/hebrew.lark @@ -1,9 +1,13 @@ %import common.WS %ignore WS +// Ignore periods and commas in dates +%import .undate_common.DATE_PUNCTUATION +%ignore DATE_PUNCTUATION + // only support day month year format for now // parser requires numeric day and year to be distinguished based on order -hebrew_date: weekday? day month comma? year | month year | year +hebrew_date: weekday? day month year | month year | year // TODO: handle date ranges? 
@@ -31,8 +35,7 @@ month: month_1 // months have 29 or 30 days; we do not expect leading zeroes day: /[1-9]/ | /[12][0-9]/ | /30/ -comma: "," -weekday: ("Monday" | "Tuesday" | "Wednesday" | "Thursday" | "Friday" | "Saturday" | "Sunday") comma? +weekday: ("Monday" | "Tuesday" | "Wednesday" | "Thursday" | "Friday" | "Saturday" | "Sunday") // months, in order; from convertdate list diff --git a/src/undate/converters/grammars/islamic.lark b/src/undate/converters/grammars/islamic.lark index 1e4940b..530116a 100644 --- a/src/undate/converters/grammars/islamic.lark +++ b/src/undate/converters/grammars/islamic.lark @@ -1,6 +1,10 @@ %import common.WS %ignore WS +// Ignore periods and commas in dates +%import .undate_common.DATE_PUNCTUATION +%ignore DATE_PUNCTUATION + // only support day month year format for now // parser requires numeric day and year to be distinguished based on order islamic_date: weekday? day month year | month year | year From e16f4d26f9702c856c168a081ddaa25f2f931907 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 12 Feb 2026 15:40:03 -0500 Subject: [PATCH 16/17] Use markdown formatting instead of rst for hatch run command --- DEVELOPER_NOTES.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/DEVELOPER_NOTES.md b/DEVELOPER_NOTES.md index e74c145..f77f330 100644 --- a/DEVELOPER_NOTES.md +++ b/DEVELOPER_NOTES.md @@ -97,10 +97,9 @@ The Gregorian Lark parser includes a script-generated file, which populates month names based on a list of language codes using the Babel library. To regenerate, run the script with hatch (which should be installed globally):: - - hatch run codegen:generate +```sh +hatch run codegen:generate +``` When the `.lark` file is modified by the script, it must be committed to git. 
- - From 3efce6de651f31f6e9674413ca9f9e4d4528b61e Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 12 Feb 2026 15:41:15 -0500 Subject: [PATCH 17/17] Add new undate_common lark grammar to version control --- src/undate/converters/grammars/undate_common.lark | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 src/undate/converters/grammars/undate_common.lark diff --git a/src/undate/converters/grammars/undate_common.lark b/src/undate/converters/grammars/undate_common.lark new file mode 100644 index 0000000..ac42b47 --- /dev/null +++ b/src/undate/converters/grammars/undate_common.lark @@ -0,0 +1,3 @@ +// Some abbreviations use periods; some default date formats +// include commas. Ignore both +DATE_PUNCTUATION: "." | ","