diff --git a/CHANGELOG.md b/CHANGELOG.md index 51746ff..5375a5b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Change Log +## 0.7 + +- Add parsing to Gregorian date converter; supports month names (full or abbreviated) + in English, French, German, Spanish, Kinyarwanda, Ganda, and Tigrinya +- Include Gregorian dates in omnibus parser + ## 0.6 - Experimental omnibus date converter + parser (EDTF, Hebrew, Hijri) diff --git a/DEVELOPER_NOTES.md b/DEVELOPER_NOTES.md index 6d4918c..f77f330 100644 --- a/DEVELOPER_NOTES.md +++ b/DEVELOPER_NOTES.md @@ -88,4 +88,18 @@ pip install -e ".[docs]" sphinx-build docs docs/_build ``` -HTML documentation will be generated in `docs/_build/html` \ No newline at end of file +HTML documentation will be generated in `docs/_build/html` + + +### Regenerating multilingual Gregorian month name parse file + +The Gregorian Lark parser includes a script-generated file, which +populates month names based on a list of language codes using the Babel +library. To regenerate, run the script with hatch (which should +be installed globally): +```sh +hatch run codegen:generate +``` + +When the `.lark` file is modified by the script, it must be committed to git.
+ diff --git a/pyproject.toml b/pyproject.toml index fcebbbd..e7b3e77 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,6 +81,12 @@ path = "src/undate/__init__.py" [tool.hatch.build.targets.sdist] include = ["src/undate/**/*.py", "src/undate/**/*.lark", "tests/**"] +[tool.hatch.envs.codegen] +dependencies = ["babel"] + +[tool.hatch.envs.codegen.scripts] +generate = "python scripts/generate_gregorian_grammar.py" + [tool.pytest.ini_options] pythonpath = "src/" markers = [ diff --git a/scripts/generate_gregorian_grammar.py b/scripts/generate_gregorian_grammar.py new file mode 100644 index 0000000..822bbf7 --- /dev/null +++ b/scripts/generate_gregorian_grammar.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python +""" +This script generates the gregorian_multilang.lark file +with month names (full and abbreviated) based on the list of +target languages. + +Run this script with hatch to regenerate the file:: + + hatch run codegen:generate + +""" + +from collections import defaultdict +import pathlib + +from babel.dates import get_month_names + +# lark grammar path relative to this script +GRAMMAR_DIR_PATH = ( + pathlib.Path(__file__).parent.parent / "src" / "undate" / "converters" / "grammars" +) +# file that is generated by this script, in that directory +MONTH_GRAMMAR_FILE = GRAMMAR_DIR_PATH / "gregorian_multilang.lark" + +# include month names in the following languages +languages = [ + "en", # English + "es", # Spanish + "fr", # French + "de", # German + "rw", # Kinyarwanda + "lg", # Ganda + "ti", # Tigrinya +] + +# warning to include at top of generated file +warning_text = """// WARNING: This file is auto-generated. DO NOT EDIT. +// To regenerate: hatch run codegen:generate + +""" + + +def main(): + # create a dictionary of lists to hold the names for each month + all_month_names = defaultdict(list) + + for lang in languages: + for width in ["wide", "abbreviated"]: + for month_num, month_name in get_month_names(width, locale=lang).items(): + # some locales use a . 
on the shortened month; let's ignore that + month_name = month_name.strip(".").lower() + # In some cases different languages have the same abbreviations; + # in some cases, abbreviated and full are the same. + # Only add if not already present, to avoid redundancy + if month_name not in all_month_names[month_num]: + all_month_names[month_num].append(month_name) + + with MONTH_GRAMMAR_FILE.open("w", encoding="utf-8") as outfile: + outfile.write(warning_text) + + # for each numeric month, generate a rule with all variant names: + # month_1: /(january|jan)/i + for i, names in all_month_names.items(): + # combine all names in a case-insensitive OR regex + # sort shortest variants last to avoid partial matches hitting first + or_names = "|".join(sorted(names, key=len, reverse=True)) + outfile.write(f"month_{i}: /({or_names})/i\n") + + print( + f"Successfully regenerated {MONTH_GRAMMAR_FILE.relative_to(pathlib.Path.cwd())}" + ) + print("If the file has changed, make sure to commit the new version.") + + +if __name__ == "__main__": + main() diff --git a/src/undate/converters/calendars/gregorian/__init__.py b/src/undate/converters/calendars/gregorian/__init__.py new file mode 100644 index 0000000..f08896b --- /dev/null +++ b/src/undate/converters/calendars/gregorian/__init__.py @@ -0,0 +1,3 @@ +from undate.converters.calendars.gregorian.converter import GregorianDateConverter + +__all__ = ["GregorianDateConverter"] diff --git a/src/undate/converters/calendars/gregorian.py b/src/undate/converters/calendars/gregorian/converter.py similarity index 68% rename from src/undate/converters/calendars/gregorian.py rename to src/undate/converters/calendars/gregorian/converter.py index b3b103b..9aa954d 100644 --- a/src/undate/converters/calendars/gregorian.py +++ b/src/undate/converters/calendars/gregorian/converter.py @@ -1,6 +1,11 @@ from calendar import monthrange, isleap +from lark.exceptions import UnexpectedInput + +from undate.undate import Undate from undate.converters.base import
BaseCalendarConverter +from undate.converters.calendars.gregorian.parser import gregorian_parser +from undate.converters.calendars.gregorian.transformer import GregorianDateTransformer class GregorianDateConverter(BaseCalendarConverter): @@ -18,6 +23,9 @@ class GregorianDateConverter(BaseCalendarConverter): #: arbitrary known leap year LEAP_YEAR: int = 2024 + def __init__(self): + self.transformer = GregorianDateTransformer() + def min_month(self) -> int: """First month for the Gregorian calendar.""" return 1 @@ -79,3 +87,25 @@ def to_gregorian(self, year, month, day) -> tuple[int, int, int]: a common point of comparison. """ return (year, month, day) + + def parse(self, value: str) -> Undate: + """ + Parse a Gregorian date string of any supported precision in any + supported language and return an :class:`~undate.undate.Undate`. + The input date string is preserved in the label of the resulting + Undate object. + """ + if not value: + raise ValueError("Parsing empty string is not supported") + + # parse the input string, then transform to undate object + try: + # parse the string with our Gregorian date parser + parsetree = gregorian_parser.parse(value) + # transform the parse tree into an undate object + undate_obj = self.transformer.transform(parsetree) + # set the original date string as the label + undate_obj.label = value + return undate_obj + except UnexpectedInput as err: + raise ValueError(f"Could not parse '{value}' as a Gregorian date") from err diff --git a/src/undate/converters/calendars/gregorian/parser.py b/src/undate/converters/calendars/gregorian/parser.py new file mode 100644 index 0000000..cfcea53 --- /dev/null +++ b/src/undate/converters/calendars/gregorian/parser.py @@ -0,0 +1,10 @@ +from lark import Lark + +from undate.converters import GRAMMAR_FILE_PATH + +grammar_path = GRAMMAR_FILE_PATH / "gregorian.lark" + +# open based on filename to allow relative imports based on grammar file +gregorian_parser = Lark.open( + str(grammar_path), 
rel_to=__file__, start="gregorian_date", strict=True +) diff --git a/src/undate/converters/calendars/gregorian/transformer.py b/src/undate/converters/calendars/gregorian/transformer.py new file mode 100644 index 0000000..a8e7048 --- /dev/null +++ b/src/undate/converters/calendars/gregorian/transformer.py @@ -0,0 +1,42 @@ +from lark import Transformer, Tree + +from undate import Undate, Calendar + + +class GregorianDateTransformer(Transformer): + """Transform a Gregorian date parse tree and return an Undate.""" + + # Currently parser should not result in intervals + + calendar = Calendar.GREGORIAN + + def gregorian_date(self, items): + parts = {} + for child in items: + if child.data in ["year", "month", "day"]: + # in each case we expect one integer value; + # anonymous tokens convert to their value and cast as int + value = int(child.children[0]) + parts[str(child.data)] = value + + # initialize and return an undate with year, month, day and + # Gregorian calendar + return Undate(**parts, calendar=self.calendar) + + def year(self, items): + # combine multiple parts into a single string + value = "".join([str(i) for i in items]) + return Tree(data="year", children=[value]) + + def month(self, items): + # month has a nested tree for the rule and the value + # the name of the rule (month_1, month_2, etc) gives us the + # number of the month needed for converting the date + tree = items[0] + month_n = tree.data.split("_")[-1] + return Tree(data="month", children=[month_n]) + + def day(self, items): + # combine multiple parts into a single string + value = "".join([str(i) for i in items]) + return Tree(data="day", children=[value]) diff --git a/src/undate/converters/calendars/hebrew/converter.py b/src/undate/converters/calendars/hebrew/converter.py index a8fdfe7..dc8ad19 100644 --- a/src/undate/converters/calendars/hebrew/converter.py +++ b/src/undate/converters/calendars/hebrew/converter.py @@ -1,7 +1,7 @@ from typing import Union from convertdate import hebrew # 
type: ignore -from lark.exceptions import UnexpectedCharacters +from lark.exceptions import UnexpectedInput from undate import Undate, UndateInterval from undate.converters.base import BaseCalendarConverter @@ -111,7 +111,7 @@ def parse(self, value: str) -> Union[Undate, UndateInterval]: # set the original date as a label, with the calendar name undate_obj.label = f"{value} {self.calendar_name}" return undate_obj - except UnexpectedCharacters as err: + except UnexpectedInput as err: raise ValueError(f"Could not parse '{value}' as a Hebrew date") from err # do we need to support conversion the other direction? diff --git a/src/undate/converters/calendars/hebrew/parser.py b/src/undate/converters/calendars/hebrew/parser.py index 3056f85..074d2c5 100644 --- a/src/undate/converters/calendars/hebrew/parser.py +++ b/src/undate/converters/calendars/hebrew/parser.py @@ -4,6 +4,7 @@ grammar_path = GRAMMAR_FILE_PATH / "hebrew.lark" -with open(grammar_path) as grammar: - # NOTE: LALR parser is faster but can't be used to ambiguity between years and dates - hebrew_parser = Lark(grammar.read(), start="hebrew_date", strict=True) +# open based on filename to allow relative imports based on grammar file +hebrew_parser = Lark.open( + str(grammar_path), rel_to=__file__, start="hebrew_date", strict=True +) diff --git a/src/undate/converters/calendars/islamic/converter.py b/src/undate/converters/calendars/islamic/converter.py index 67f2a64..fae7f7f 100644 --- a/src/undate/converters/calendars/islamic/converter.py +++ b/src/undate/converters/calendars/islamic/converter.py @@ -1,7 +1,7 @@ from typing import Union from convertdate import islamic # type: ignore -from lark.exceptions import UnexpectedCharacters +from lark.exceptions import UnexpectedInput from undate import Undate, UndateInterval from undate.converters.base import BaseCalendarConverter @@ -97,7 +97,7 @@ def parse(self, value: str) -> Union[Undate, UndateInterval]: # set the original date as a label, with the calendar name 
undate_obj.label = f"{value} {self.calendar_name}" return undate_obj - except UnexpectedCharacters as err: + except UnexpectedInput as err: raise ValueError(f"Could not parse '{value}' as an Islamic date") from err # do we need to support conversion the other direction? diff --git a/src/undate/converters/calendars/islamic/parser.py b/src/undate/converters/calendars/islamic/parser.py index 61a0cf0..d753a7a 100644 --- a/src/undate/converters/calendars/islamic/parser.py +++ b/src/undate/converters/calendars/islamic/parser.py @@ -4,6 +4,7 @@ grammar_path = GRAMMAR_FILE_PATH / "islamic.lark" -with open(grammar_path) as grammar: - # NOTE: LALR parser is faster but can't be used due to ambiguity between years and days - islamic_parser = Lark(grammar.read(), start="islamic_date", strict=True) +# open based on filename to allow relative imports based on grammar file +islamic_parser = Lark.open( + str(grammar_path), rel_to=__file__, start="islamic_date", strict=True +) diff --git a/src/undate/converters/combined.py b/src/undate/converters/combined.py index 54d66a5..ec08c31 100644 --- a/src/undate/converters/combined.py +++ b/src/undate/converters/combined.py @@ -7,12 +7,13 @@ from typing import Union from lark import Lark -from lark.exceptions import UnexpectedCharacters +from lark.exceptions import UnexpectedInput from lark.visitors import Transformer, merge_transformers from undate import Undate, UndateInterval from undate.converters import BaseDateConverter, GRAMMAR_FILE_PATH from undate.converters.edtf.transformer import EDTFTransformer +from undate.converters.calendars.gregorian.transformer import GregorianDateTransformer from undate.converters.calendars.hebrew.transformer import HebrewDateTransformer from undate.converters.calendars.islamic.transformer import IslamicDateTransformer @@ -33,6 +34,7 @@ def start(self, children): edtf=EDTFTransformer(), hebrew=HebrewDateTransformer(), islamic=IslamicDateTransformer(), + gregorian=GregorianDateTransformer(), ) @@ -45,7 
+47,7 @@ def start(self, children): class OmnibusDateConverter(BaseDateConverter): """ Combination parser that aggregates existing parser grammars. - Currently supports EDTF, Hebrew, and Hijri where dates are unambiguous. + Currently supports EDTF, Gregorian, Hebrew, and Hijri where dates are unambiguous. (Year-only dates are parsed as EDTF in Gregorian calendar.) Does not support serialization. @@ -75,7 +77,7 @@ def parse(self, value: str) -> Union[Undate, UndateInterval]: parsetree = parser.parse(value) # transform returns a list; we want the first item in the list return self.transformer.transform(parsetree)[0] - except UnexpectedCharacters: + except UnexpectedInput: raise ValueError( "Parsing failed: '%s' is not in a recognized date format" % value ) diff --git a/src/undate/converters/edtf/converter.py b/src/undate/converters/edtf/converter.py index d0b742f..e5eddac 100644 --- a/src/undate/converters/edtf/converter.py +++ b/src/undate/converters/edtf/converter.py @@ -1,6 +1,6 @@ from typing import Optional, Union -from lark.exceptions import UnexpectedCharacters +from lark.exceptions import UnexpectedInput from undate import Undate, UndateInterval from undate.converters.base import BaseDateConverter @@ -40,10 +40,10 @@ def parse(self, value: str) -> Union[Undate, UndateInterval]: try: parsetree = edtf_parser.parse(value) return self.transformer.transform(parsetree) - except UnexpectedCharacters: + except UnexpectedInput as err: raise ValueError( - "Parsing failed: '%s' is not a supported EDTF date format" % value - ) + f"Parsing failed: '{value}' is not a supported EDTF date format" + ) from err def _convert_missing_digits( self, value: Optional[str], old_missing_digit: str diff --git a/src/undate/converters/grammars/combined.lark b/src/undate/converters/grammars/combined.lark index 0e77b5c..3f6a568 100644 --- a/src/undate/converters/grammars/combined.lark +++ b/src/undate/converters/grammars/combined.lark @@ -1,7 +1,11 @@ %import common.WS %ignore WS -start: 
(edtf__start | hebrew__hebrew_date | islamic__islamic_date ) +// Ignore periods and commas in dates +%import .undate_common.DATE_PUNCTUATION +%ignore DATE_PUNCTUATION + +start: (edtf__start | hebrew__hebrew_date | islamic__islamic_date | gregorian__gregorian_date ) // Renaming of the import variables is required, as they receive the namespace of this file. // See: https://github.com/lark-parser/lark/pull/973#issuecomment-907287565 @@ -23,10 +27,17 @@ start: (edtf__start | hebrew__hebrew_date | islamic__islamic_date ) %import .islamic.month -> islamic__month %import .islamic.year -> islamic__year +// gregorian calendar, in multiple languages +%import .gregorian.gregorian_date -> gregorian__gregorian_date + // override hebrew date to omit year-only, since year without calendar is ambiguous // NOTE: potentially support year with calendar label -%override hebrew__hebrew_date: hebrew__day hebrew__month hebrew__year | hebrew__month hebrew__year +%override hebrew__hebrew_date: hebrew__day hebrew__month hebrew__year | hebrew__month hebrew__year // same for islamic date, year alone is ambiguous -%override islamic__islamic_date: islamic__day islamic__month islamic__year | islamic__month islamic__year +%override islamic__islamic_date: islamic__day islamic__month islamic__year | islamic__month islamic__year + +// same as above. 
omit year only, since covered by EDTF (TODO: the override below is currently commented out) +// %override gregorian__gregorian_date: day month year | month day year | year month day | month year | year month | day month | month day + diff --git a/src/undate/converters/grammars/gregorian.lark b/src/undate/converters/grammars/gregorian.lark new file mode 100644 index 0000000..93338f9 --- /dev/null +++ b/src/undate/converters/grammars/gregorian.lark @@ -0,0 +1,37 @@ +%import common.WS +%ignore WS + +// Ignore periods and commas in dates +%import .undate_common.DATE_PUNCTUATION +%ignore DATE_PUNCTUATION + +%import .gregorian_multilang (month_1, month_2, month_3, month_4, month_5, \ + month_6, month_7, month_8, month_9, month_10, month_11, month_12) + + +// no weekday support for now +gregorian_date: day month year | month day year | year month day | month year | year month | year | day month | month day + +// months have 28 to 31 days; we do not expect leading zeroes +day: /[1-9]/ | /[12][0-9]/ | /3[0-1]/ + +// Gregorian calendar started in 1582; assume years with 3 or more digits for now, +// so we can support mixed day / year order unambiguously +year: /\b\d{3,}\b/ +// Use word boundaries to separate from other tokens (esp. numeric day), +// since we otherwise ignore whitespace + +// months +month: month_1 + | month_2 + | month_3 + | month_4 + | month_5 + | month_6 + | month_7 + | month_8 + | month_9 + | month_10 + | month_11 + | month_12 + diff --git a/src/undate/converters/grammars/gregorian_multilang.lark b/src/undate/converters/grammars/gregorian_multilang.lark new file mode 100644 index 0000000..5cd2927 --- /dev/null +++ b/src/undate/converters/grammars/gregorian_multilang.lark @@ -0,0 +1,15 @@ +// WARNING: This file is auto-generated. DO NOT EDIT.
+// To regenerate: hatch run codegen:generate + +month_1: /(janwaliyo|mutarama|january|janvier|januar|enero|janv|jan|ene|mut|ጥሪ)/i +month_2: /(gashyantare|febwaliyo|february|febrero|février|februar|févr|ለካቲት|feb|gas|ለካ)/i +month_3: /(werurwe|marisi|march|marzo|mars|märz|መጋቢት|mar|wer|መጋ)/i +month_4: /(april|abril|avril|apuli|mata|ሚያዝያ|apr|abr|avr|mat|apu|ሚያ)/i +month_5: /(gicurasi|maayi|mayo|ጉንበት|may|mai|gic|maa|ግን)/i +month_6: /(kamena|junio|juuni|june|juin|juni|jun|kam|juu|ሰነ)/i +month_7: /(nyakanga|juillet|julaayi|julio|july|juil|juli|jul|nya|ሓምለ|ሓም)/i +month_8: /(agusito|august|agosto|kanama|août|aug|ago|kan|agu|ነሓሰ|ነሓ)/i +month_9: /(septiembre|sebuttemba|september|septembre|nzeri|መስከረም|sept|sep|nze|seb|መስ)/i +month_10: /(ukwakira|okitobba|october|octubre|octobre|oktober|ጥቅምቲ|oct|okt|ukw|oki|ጥቅ)/i +month_11: /(ugushyingo|noviembre|november|novembre|novemba|nov|ugu|ሕዳር|ሕዳ)/i +month_12: /(diciembre|december|décembre|dezember|ukuboza|desemba|ታሕሳስ|dec|dic|déc|dez|uku|des|ታሕ)/i diff --git a/src/undate/converters/grammars/hebrew.lark b/src/undate/converters/grammars/hebrew.lark index 118ed98..1b28d19 100644 --- a/src/undate/converters/grammars/hebrew.lark +++ b/src/undate/converters/grammars/hebrew.lark @@ -1,9 +1,13 @@ %import common.WS %ignore WS +// Ignore periods and commas in dates +%import .undate_common.DATE_PUNCTUATION +%ignore DATE_PUNCTUATION + // only support day month year format for now // parser requires numeric day and year to be distinguished based on order -hebrew_date: weekday? day month comma? year | month year | year +hebrew_date: weekday? day month year | month year | year // TODO: handle date ranges? @@ -31,8 +35,7 @@ month: month_1 // months have 29 or 30 days; we do not expect leading zeroes day: /[1-9]/ | /[12][0-9]/ | /30/ -comma: "," -weekday: ("Monday" | "Tuesday" | "Wednesday" | "Thursday" | "Friday" | "Saturday" | "Sunday") comma? 
+weekday: ("Monday" | "Tuesday" | "Wednesday" | "Thursday" | "Friday" | "Saturday" | "Sunday") // months, in order; from convertdate list diff --git a/src/undate/converters/grammars/islamic.lark b/src/undate/converters/grammars/islamic.lark index 1e4940b..530116a 100644 --- a/src/undate/converters/grammars/islamic.lark +++ b/src/undate/converters/grammars/islamic.lark @@ -1,6 +1,10 @@ %import common.WS %ignore WS +// Ignore periods and commas in dates +%import .undate_common.DATE_PUNCTUATION +%ignore DATE_PUNCTUATION + // only support day month year format for now // parser requires numeric day and year to be distinguished based on order islamic_date: weekday? day month year | month year | year diff --git a/src/undate/converters/grammars/undate_common.lark b/src/undate/converters/grammars/undate_common.lark new file mode 100644 index 0000000..ac42b47 --- /dev/null +++ b/src/undate/converters/grammars/undate_common.lark @@ -0,0 +1,3 @@ +// Some abbreviations use periods; some default date formats +// include commas. Ignore both +DATE_PUNCTUATION: "." 
| "," diff --git a/tests/test_converters/test_calendars/test_gregorian.py b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_converter.py similarity index 55% rename from tests/test_converters/test_calendars/test_gregorian.py rename to tests/test_converters/test_calendars/test_gregorian/test_gregorian_converter.py index e0bf5ef..9839b34 100644 --- a/tests/test_converters/test_calendars/test_gregorian.py +++ b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_converter.py @@ -1,3 +1,8 @@ +import pytest + + +from undate.date import DatePrecision +from undate.undate import Undate, Calendar from undate.converters.calendars import GregorianDateConverter @@ -38,3 +43,35 @@ def test_representative_years(self): converter.LEAP_YEAR, converter.NON_LEAP_YEAR, ] + + def test_parse(self): + # day + date_str = "2022 Ugushyingo 26" + date = GregorianDateConverter().parse(date_str) + assert date == Undate(2022, 11, 26) # Ugushyingo = November + assert date.calendar == Calendar.GREGORIAN + assert date.precision == DatePrecision.DAY + assert date.label == date_str + + # month + date_str = "avril 1362" + date = GregorianDateConverter().parse(date_str) + assert date == Undate(1362, 4) + assert date.calendar == Calendar.GREGORIAN + assert date.precision == DatePrecision.MONTH + assert date.label == date_str + + # year + date_str = "932" + date = GregorianDateConverter().parse(date_str) + assert date == Undate(932) + assert date.calendar == Calendar.GREGORIAN + assert date.precision == DatePrecision.YEAR + assert date.label == date_str + + def test_parse_errors(self): + with pytest.raises(ValueError, match="empty string is not supported"): + GregorianDateConverter().parse("") + + with pytest.raises(ValueError, match="Could not parse"): + GregorianDateConverter().parse("Foo 1920") diff --git a/tests/test_converters/test_calendars/test_gregorian/test_gregorian_parser.py b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_parser.py new 
file mode 100644 index 0000000..3938bad --- /dev/null +++ b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_parser.py @@ -0,0 +1,73 @@ +import pytest +from lark.exceptions import UnexpectedCharacters, UnexpectedEOF + +from undate.converters.calendars.gregorian.parser import gregorian_parser + + +# test that valid dates can be parsed to confirm parser is working correctly + +testcases = [ + # year + "2012", + # three digit year + "566", + # month + year + "Jan 1960", + "Feb 1801", + "1900 Feb", + # day + month + year in any order + "May 5 1602", + "5 May 1602", + "1602 October 5", + # day + month + "December 5", + "5 December", + # Kinyarwanda (rw) + "2025 ugu. 4", + "2025 Ugushyingo 4", + "2025 ugu", + "2025 Ugushyingo", + # Ganda (lg) + "4 Novemba 2025", + "4 Nov 2025", + "Novemba 2025", + "4 Novemba", + # Tigrinya (ti) + "ሕዳ 4, 2025", + "ሕዳር 4 2025", + # French + "18 avril 2025", + "18 avr. 2025", + # case-insensitive + "18 JUNE 2025", + "Avril 2025", +] + + +@pytest.mark.parametrize("date_string", testcases) +def test_should_parse(date_string): + assert gregorian_parser.parse(date_string) + + +error_cases = [ + # invalid days + ("0 June 1006", UnexpectedCharacters), + ("42 March 1206", UnexpectedCharacters), + # month alone + ("Juin", UnexpectedEOF), + # day only + ("12 ", UnexpectedEOF), + # non-Gregorian month + ("5 Tammuz 5403", UnexpectedCharacters), + ("31 Tishri 5403", UnexpectedCharacters), + # invalid month + ("Foo 383", UnexpectedCharacters), + # wrong format + ("2024-10-02", UnexpectedCharacters), +] + + +@pytest.mark.parametrize("date_string,exception", error_cases) +def test_should_error(date_string, exception): + with pytest.raises(exception): + gregorian_parser.parse(date_string) diff --git a/tests/test_converters/test_calendars/test_gregorian/test_gregorian_transformer.py b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_transformer.py new file mode 100644 index 0000000..114a713 --- /dev/null +++ 
b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_transformer.py @@ -0,0 +1,31 @@ +import pytest +from undate.converters.calendars.gregorian.parser import gregorian_parser +from undate.converters.calendars.gregorian.transformer import GregorianDateTransformer +from undate.undate import Undate, Calendar +from undate.date import DatePrecision + + +testcases = [ + ("2012", Undate(2012), DatePrecision.YEAR), + ("May 13 1602", Undate(1602, 5, 13), DatePrecision.DAY), + ("Jan 1960", Undate(1960, 1), DatePrecision.MONTH), + ("2022 ugu. 4", Undate(2022, 11, 4), DatePrecision.DAY), + ("2022 Ugushyingo", Undate(2022, 11), DatePrecision.MONTH), + ("4 Novemba", Undate(month=11, day=4), DatePrecision.DAY), + # ignores whitespace, comma, period + ("4Novemba", Undate(month=11, day=4), DatePrecision.DAY), + ("18 avril, 2025", Undate(2025, 4, 18), DatePrecision.DAY), +] + + +@pytest.mark.parametrize("date_string,expected,expected_precision", testcases) +def test_transform(date_string, expected, expected_precision): + transformer = GregorianDateTransformer(visit_tokens=True) + # parse the input string, then transform to undate object + parsetree = gregorian_parser.parse(date_string) + transformed_date = transformer.transform(parsetree) + # use EDTF to compare so we can check dates with unknown years + assert transformed_date.format("EDTF") == expected.format("EDTF") + # currently only returns undate, parser doesn't support intervals + assert transformed_date.precision == expected_precision + assert transformed_date.calendar == Calendar.GREGORIAN diff --git a/tests/test_converters/test_combined_parser.py b/tests/test_converters/test_combined_parser.py index 717a16e..d6e0621 100644 --- a/tests/test_converters/test_combined_parser.py +++ b/tests/test_converters/test_combined_parser.py @@ -19,6 +19,11 @@ ("Jumādā I 1243", Undate(1243, 5, calendar="Islamic")), ("7 Jumādā I 1243", Undate(1243, 5, 7, calendar="Islamic")), ("14 Rabīʿ I 901", Undate(901, 3, 14, 
calendar="Islamic")), + # Gregorian with non-numeric month (full or abbreviated) + ("June 1602", Undate(1602, 6, calendar="Gregorian")), + ("13 Jan 1602", Undate(1602, 1, 13, calendar="Gregorian")), + ("2022 ugu. 4", Undate(2022, 11, 4, calendar="Gregorian")), + ("18 avril", Undate(month=4, day=18, calendar="Gregorian")), ]