diff --git a/Jenkinsfile b/Jenkinsfile index 51ce37a10..f1bab1c59 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-22-25-0' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-01-25-1' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv index da69e23eb..9369023e0 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv +++ b/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv @@ -1,3 +1,5 @@ ई.पू. ईसा पूर्व ई. ईस्वी ई. ईसवी +वर्ष पूर्व वर्ष पूर्व +शताब्दी शताब्दी \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/date/date_days.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/date/date_days.tsv deleted file mode 100644 index 41201ae18..000000000 --- a/nemo_text_processing/inverse_text_normalization/hi/data/date/date_days.tsv +++ /dev/null @@ -1,34 +0,0 @@ -१ एक -२ दो -३ तीन -४ चार -५ पाँच -६ छः -६ छ: -६ छह -६ छे -७ सात -८ आठ -९ नौ -१० दस -११ ग्यारह -१२ बारह -१३ तेरह -१४ चौदह -१५ पन्द्रह -१६ सोलह -१७ सत्रह -१८ अठारह -१९ उन्नीस -२० बीस -२१ इक्कीस -२२ बाईस -२३ तेईस -२४ चौबीस -२५ पच्चीस -२६ छब्बीस -२७ सत्ताईस -२८ अट्ठाईस -२९ उनतीस -३० तीस -३१ इकतीस diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/measure/measurements.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/measure/measurements.tsv index a1ab32da0..d472e15df 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/data/measure/measurements.tsv +++ b/nemo_text_processing/inverse_text_normalization/hi/data/measure/measurements.tsv @@ -263,6 +263,7 @@ yr सालों yr वर्ष yr वर्षों hp हॉर्स पावर +hp हॉर्सपॉवर hp हॉर्सपावर hp अश्वशक्ति hp अश्वशक्त @@ -284,4 +285,11 @@ mi/s मील प्रति सेकेंड mi/h मील प्रति घंटा mi/h मील प्रति घंटे mi/m मील प्रति मिनट -₹/ac रुपए प्रति एकड़ \ No newline at end of file +₹/ac रुपए प्रति एकड़ +w हफ़्ते +w हफ़्ता +w सप्ताह +सदियां सदियां +सदियाँ सदियाँ +सदियों सदियों +सदी सदी diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/money/currency.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/money/currency.tsv index 0ca503bb1..3ee478688 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/data/money/currency.tsv +++ b/nemo_text_processing/inverse_text_normalization/hi/data/money/currency.tsv @@ -22,7 +22,6 @@ km बोस्निया और हर्जेगोविना का म p बोत्सवाना पुला r$ ब्राजीलियाई रियाल £ ब्रिटिश पाउंड -£ पाउंड b$ ब्रुनेई डॉलर лв बुल्गारियाई लेव fbu बुरुंडी फ्रैंक @@ -179,4 +178,4 @@ bs. वेनेजुएलन बोलिवार ₺ लीरा ₦ नाइरा ¢ सेंट्स -¢ सेंट \ No newline at end of file +¢ सेंट diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/numbers/paune.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/numbers/paune.tsv new file mode 100644 index 000000000..f56b256e6 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/numbers/paune.tsv @@ -0,0 +1,231 @@ +१ दो +२ तीन +३ चार +४ पाँच +४ पांच +५ छः +५ छह +५ छे +६ सात +७ आठ +८ नौ +९ दस +१० ग्यारह +११ बारह +१२ तेरह +१३ चौदह +१४ पन्द्रह +१४ पंद्रह +१५ सोलह +१६ सत्रह +१६ सतरह +१७ अठारह +१७ अट्ठारह +१८ उन्नीस +१८ उनीस +१९ बीस +२० इक्कीस +२० इकीस +२० ईकीस +२१ बाईस +२१ बाइस +२२ तेईस +२२ तेइस +२३ चौबीस +२४ पच्चीस +२४ पचीस +२५ छब्बीस +२५ छबीस +२६ सत्ताईस +२६ सत्ताइस +२६ सताईस +२६ सताइस +२७ अट्ठाईस +२७ अट्ठाइस +२७ अठाईस +२७ अठाइस +२८ उनतीस +२८ उन्तीस +२९ तीस +३० इकतीस +३० इकतिस +३० इकत्तीस +३० इकत्तिस +३१ बत्तीस +३१ बत्तिस +३१ बतीस +३१ बतिस +३२ तैंतीस +३२ तैंतिस +३२ तैंत्तीस +३२ तैंत्तिस +३२ तेतीस +३२ तेंतीस +३३ चौंतीस +३३ चौंतिस +३३ चौंत्तीस +३३ चौंत्तिस +३४ पैंतीस +३४ पैंतिस +३४ पैंत्तीस +३४ पैंत्तिस +३५ छत्तीस +३५ छत्तिस +३५ छतीस +३५ छतिस +३६ सैंतीस +३६ सैंतिस +३६ सैंत्तीस +३६ सैंत्तिस +३७ अड़तीस +३७ अड़तिस +३७ अड़त्तीस +३७ अड़त्तिस +३८ उनतालीस +३८ उनतालिस +३८ उनत्तालीस +३८ उनत्तालिस +३८ उन्तालीस +३८ उन्तालिस +३९ चालीस +४० इकतालीस +४० इकतालिस +४० इक्तालीस +४१ बयालीस +४१ बयालिस +४१ ब्यालीस +४२ तैंतालीस +४२ तैंतालिस +४३ चौवालीस +४३ चौवालिस +४३ चवालीस +४३ चवालिस +४३ चौंतालीस +४४ पैंतालीस +४४ पैंतालिस +४५ छियालीस +४५ छियालिस +४५ छयालीस +४६ सैंतालीस +४६ सैंतालिस +४६ सैतालिस +४७ अड़तालीस +४७ अड़तालिस +४८ उनचास +४९ पचास +५० इक्यावन +५० इकयावन +५१ बावन +५२ तिरपन +५२ तिरेपन +५३ चौवन +५४ पचपन +५५ छप्पन +५५ छपन +५६ सत्तावन +५६ सतावन +५७ अट्ठावन +५७ अठावन +५८ उनसठ +५८ उनसठ +५९ साठ +६० इकसठ +६१ बासठ +६१ बासट +६२ तिरसठ +६२ तिरेसठ +६३ चौंसठ +६४ पैंसठ +६५ छियासठ +६५ छयासठ +६६ सड़सठ +६७ अड़सठ +६८ उनहत्तर +६८ उनहतर +६९ सत्तर +६९ सतर +७० इकहत्तर +७० इकहतर +७० इक्हत्तर +७० इकत्तर +७१ बहत्तर +७१ बहतर +७२ तिहत्तर +७२ तिहतर +७३ चौहत्तर +७३ चौहतर +७४ पचहत्तर +७४ पचहतर +७४ पिछत्तर +७४ पिछतर +७५ छिहत्तर +७५ छिहतर +७५ छियत्तर +७६ सतहत्तर +७६ सतहतर +७६ सतत्तर +७७ अठहत्तर +७७ अठहतर +७८ उन्यासी +७८ उन्यासि +७८ उनासी +७८ उनासि +७९ अस्सी +७९ अस्सि +८० इक्यासी +८० इक्यासि +८१ बयासी +८१ बयासि +८१ ब्यासी +८१ ब्यासि +८१ बिरासी +८२ तिरासी +८२ तिरासि +८२ तेरासी +८३ चौरासी +८३ चौरासि +८४ पचासी +८४ पचासि +८४ पिचयासी +८४ पिचयासि +८४ पिचासी +८५ छियासी +८५ छियासि +८६ सत्तासी +८६ सत्तासि +८६ सतासी +८६ सतासि +८७ अट्ठासी +८७ अट्ठासि +८७ अठासी +८७ अठासि +८८ नवासी +८८ नवासि +८९ नब्बे +९० इक्यानबे +९० इक्यानवे +९१ बानबे +९१ बानवे +९२ तिरानबे +९२ तिरानवे +९३ चौरानबे +९३ चौरानवे +९४ पंचानबे +९४ पंचानवे +९४ पचानवे +९४ पिचयानवे +९४ पिचयानबे +९४ पिच्यानवे +९४ पिच्यानबे +९५ छियानबे +९५ छियानवे +९६ सत्तानबे +९६ सत्तानवे +९७ अट्ठानबे +९७ अट्ठानवे +९७ अठानवे +९७ अठानबे +९८ निन्यान्बे +९८ निन्यानबे +९८ निन्यानवे +९८ निन्यान्वे diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/numbers/teens_and_ties.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/numbers/teens_and_ties.tsv index 91f656cfd..3968a1320 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/data/numbers/teens_and_ties.tsv +++ b/nemo_text_processing/inverse_text_normalization/hi/data/numbers/teens_and_ties.tsv @@ -9,6 +9,7 @@ १७ सत्रह १७ सतरह १८ अठारह +१८ अठाहर १८ अट्ठारह १९ उन्नीस १९ उनीस @@ -216,4 +217,4 @@ ९९ निन्यान्बे ९९ निन्यानबे ९९ निन्यानवे -९९ निन्यान्वे \ No newline at end of file +९९ निन्यान्वे diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/time/hour.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/time/hour.tsv deleted file mode 100644 index 7e0c8628e..000000000 --- a/nemo_text_processing/inverse_text_normalization/hi/data/time/hour.tsv +++ /dev/null @@ -1,27 +0,0 @@ -१ एक -२ दो -३ तीन -४ चार -५ पाँच -६ छः -६ छह -६ छे -७ सात -८ आठ -९ नौ -१० दस -११ ग्यारह -१२ बारह -१३ तेरह -१४ चौदह -१५ पन्द्रह -१५ पंद्रह -१६ सोलह -१७ सत्रह -१८ अठारह -१९ उन्नीस -२० बीस -२१ इक्कीस -२२ बाईस -२३ तेईस -२४ चौबीस \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/time/hour_for_paune.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/time/hour_for_paune.tsv new file mode 100644 index 000000000..8bb4c67ca --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/time/hour_for_paune.tsv @@ -0,0 +1,15 @@ +१२ एक +१ दो +२ तीन +३ चार +४ पाँच +४ पांच +५ छः +५ छह +५ छे +६ सात +७ आठ +८ नौ +९ दस +१० ग्यारह +११ बारह diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/time/minute_and_second.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/time/minute_and_second.tsv deleted file mode 100644 index 2bd8d6c61..000000000 --- a/nemo_text_processing/inverse_text_normalization/hi/data/time/minute_and_second.tsv +++ /dev/null @@ -1,128 +0,0 @@ -०१ एक -०२ दो -०३ तीन -०४ चार -०५ पाँच -०६ छः -०६ छह -०६ छे -०७ सात -०८ आठ -०९ नौ -१० दस -११ ग्यारह -१२ बारह -१३ तेरह -१४ चौदह -१५ पन्द्रह -१५ पंद्रह -१६ सोलह -१७ सत्रह -१८ अठारह -१८ अट्ठारह -१९ उन्नीस -१९ उनीस -२० बीस -२१ इक्कीस -२१ इकीस -२१ ईकीस -२२ बाईस -२२ बाइस -२३ तेईस -२३ तेइस -२४ चौबीस -२५ पच्चीस -२५ पचीस -२६ छब्बीस -२६ छबीस -२७ सत्ताईस -२७ सत्ताइस -२७ सताईस -२७ सताइस -२८ अट्ठाईस -२८ अट्ठाइस -२८ अठाईस -२८ अठाइस -२९ उनतीस -२९ उन्तीस -३० तीस -३१ इकतीस -३१ इकतिस -३१ इकत्तीस -३१ इकत्तिस -३२ बत्तीस -३२ बत्तिस -३२ बतीस -३२ बतिस -३३ तैंतीस -३३ तैंतिस -३३ तैंत्तीस -३३ तैंत्तिस -३३ तेतीस -३४ चौंतीस -३४ चौंतिस -३४ चौंत्तीस -३४ चौंत्तिस -३५ पैंतीस -३५ पैंतिस -३५ पैंत्तीस -३५ पैंत्तिस -३६ छत्तीस -३६ छत्तिस -३६ छतीस -३६ छतिस -३७ सैंतीस -३७ सैंतिस -३७ सैंत्तीस -३७ सैंत्तिस -३८ अड़तीस -३८ अड़तिस -३८ अड़त्तीस -३८ अड़त्तिस -३९ उनतालीस -३९ उनतालिस -३९ उनत्तालीस -३९ उनत्तालिस -३९ उन्तालीस -३९ उन्तालिस -४० चालीस -४१ इकतालीस -४१ इकतालिस -४१ इक्तालीस -४२ बयालीस -४२ बयालिस -४२ ब्यालीस -४३ तैंतालीस -४३ तैंतालिस -४४ चौवालीस -४४ चौवालिस -४४ चवालीस -४४ चवालिस -४४ चौंतालीस -४५ पैंतालीस -४५ पैंतालिस -४६ छियालीस -४६ छियालिस -४६ छयालीस -४७ सैंतालीस -४७ सैंतालिस -४७ सैतालिस -४८ अड़तालीस -४८ अड़तालिस -४९ उनचास -५० पचास -५१ इक्यावन -५१ इकयावन -५२ बावन -५३ तिरपन -५३ तिरेपन -५४ चौवन -५५ पचपन -५६ छप्पन -५६ छपन -५७ सत्तावन -५७ सतावन -५८ अट्ठावन -५८ अठावन -५९ उनसठ -६० साठ \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv index f9eb081b9..8cfd0e19f 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv +++ b/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv @@ -1,13 +1,7 @@ १/४ पाव -१/२ आधा -३/४ पौन -१:३० डेढ़ बजे -२:३० ढाई बजे -१.५ डेढ़ -२.५ ढाई कु. कुमारी स्मि. श्रीमती श्री. श्री श्री. श्रीमान मा. मास्टर -डॉ. डॉक्टर \ No newline at end of file +डॉ. डॉक्टर diff --git a/nemo_text_processing/inverse_text_normalization/hi/graph_utils.py b/nemo_text_processing/inverse_text_normalization/hi/graph_utils.py index 96cbc58bb..b002efa52 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/hi/graph_utils.py @@ -32,6 +32,7 @@ graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) NEMO_HI_DIGIT = pynini.union("०", "१", "२", "३", "४", "५", "६", "७", "८", "९").optimize() +DEVANAGARI_DIGIT = ["०", "१", "२", "३", "४", "५", "६", "७", "८", "९"] NEMO_HEX = pynini.union(*string.hexdigits).optimize() NEMO_NON_BREAKING_SPACE = u"\u00a0" @@ -63,6 +64,10 @@ MINUS = pynini.union("ऋणात्मक", "नकारात्मक").optimize() +def integer_to_devanagari(n: int) -> str: + return ''.join(DEVANAGARI_DIGIT[int(d)] for d in str(n)) + + def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']): """ Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name. diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/cardinal.py index 63b055bef..7fcdcf348 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/cardinal.py @@ -44,8 +44,14 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert() graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert() graph_teens_and_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")).invert() + graph_paune = pynini.string_file(get_abs_path("data/numbers/paune.tsv")).invert() + self.graph_zero = graph_zero + self.graph_digit = graph_digit + self.graph_single_digit_with_zero = pynutil.insert("०") + graph_digit + self.graph_teens_and_ties = graph_teens_and_ties self.graph_two_digit = graph_teens_and_ties | (pynutil.insert("०") + graph_digit) graph_hundred = pynini.cross("सौ", "") + delete_hundred = pynutil.delete("सौ") delete_thousand = pynutil.delete("हज़ार") | pynutil.delete("हजार") graph_hundred_component = pynini.union(graph_digit + delete_space + graph_hundred, pynutil.insert("०")) graph_hundred_component += delete_space @@ -64,11 +70,93 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): ) graph_hundred_as_thousand += delete_space graph_hundred_as_thousand += self.graph_two_digit | pynutil.insert("००") + graph_hundred_as_thousand |= pynutil.add_weight( + pynutil.delete("साढ़े") + + delete_space + + graph_digit + + pynutil.insert("५००", weight=-0.1) + + delete_space + + delete_thousand, + -0.1, + ) + graph_hundred_as_thousand |= pynutil.add_weight( + pynutil.delete("सवा") + + delete_space + + graph_digit + + pynutil.insert("२५०", weight=-0.1) + + delete_space + + delete_thousand, + -0.1, + ) + graph_hundred_as_thousand |= pynutil.add_weight( + pynutil.delete("पौने") + + delete_space + + graph_paune + + pynutil.insert("७५०", weight=-0.1) + + delete_space + + delete_thousand, + -0.1, + ) + graph_hundred_as_thousand |= pynutil.add_weight( + pynini.union(pynutil.delete("डेढ़") | pynutil.delete("डेढ़")) + + delete_space + + pynutil.insert("१५००", weight=-0.1) + + delete_space + + delete_thousand, + -0.1, + ) + graph_hundred_as_thousand |= pynutil.add_weight( + pynutil.delete("ढाई") + + delete_space + + pynutil.insert("२५००", weight=-0.1) + + delete_space + + delete_thousand, + -0.1, + ) - self.graph_hundreds = graph_hundred_component | graph_hundred_as_thousand + graph_in_hundreds = pynutil.add_weight( + pynutil.delete("साढ़े") + + delete_space + + (graph_digit | self.graph_two_digit) + + pynutil.insert("५०", weight=-0.1) + + delete_space + + delete_hundred, + -0.1, + ) + graph_in_hundreds |= pynutil.add_weight( + pynutil.delete("सवा") + + delete_space + + (graph_digit | self.graph_two_digit) + + pynutil.insert("२५", weight=-0.1) + + delete_space + + delete_hundred, + -0.1, + ) + graph_in_hundreds |= pynutil.add_weight( + pynutil.delete("पौने") + + delete_space + + graph_paune + + pynutil.insert("७५", weight=-0.1) + + delete_space + + delete_hundred, + -0.1, + ) + graph_in_hundreds |= pynutil.add_weight( + pynini.union(pynutil.delete("डेढ़") | pynutil.delete("डेढ़")) + + delete_space + + pynutil.insert("१५०", weight=-0.1) + + delete_space + + delete_hundred, + -0.1, + ) + graph_in_hundreds |= pynutil.add_weight( + pynutil.delete("ढाई") + delete_space + pynutil.insert("२५०", weight=-0.1) + delete_space + delete_hundred, + -0.1, + ) + self.graph_hundreds = graph_hundred_component | graph_hundred_as_thousand | graph_in_hundreds graph_teens_and_ties_component = pynini.union( - graph_teens_and_ties | pynutil.insert("00") + delete_space + (graph_digit | pynutil.insert("0")), + graph_teens_and_ties | pynutil.insert("००") + delete_space + (graph_digit | pynutil.insert("०")), ) graph_ties_component_at_least_one_none_zero_digit = self.graph_two_digit @ ( pynini.closure(NEMO_HI_DIGIT) + pynini.closure(NEMO_HI_DIGIT) @@ -139,6 +227,7 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): graph_no_prefix = pynutil.add_weight( pynini.cross("सौ", "१००") | pynini.cross("हज़ार", "१०००") + | pynini.cross("हजार", "१०००") | pynini.cross("लाख", "१०००००") | pynini.cross("करोड़", "१०००००००"), 2, diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py index 817b1b86a..27e6eec32 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py @@ -16,11 +16,13 @@ from pynini.lib import pynutil from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( + DEVANAGARI_DIGIT, NEMO_HI_DIGIT, GraphFst, delete_extra_space, delete_space, insert_space, + integer_to_devanagari, ) from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path @@ -35,16 +37,21 @@ class DateFst(GraphFst): date: DateFst """ - def __init__(self, cardinal: GraphFst): + def __init__(self, cardinal: GraphFst, ordinal: GraphFst): super().__init__(name="date", kind="classify") graph_year = pynutil.add_weight( pynini.compose(cardinal.graph_no_exception, pynini.closure(NEMO_HI_DIGIT, 1, 4)), 0.03 ) + cardinal_graph = cardinal.graph_no_exception month_graph = pynini.string_file(get_abs_path("data/date/months.tsv")) - graph_date_days = pynini.string_file(get_abs_path("data/date/date_days.tsv")).invert() + graph_date_days = cardinal.graph_digit | cardinal.graph_teens_and_ties + date_days = pynini.union(*[integer_to_devanagari(i) for i in range(1, 32)]).optimize() + graph_date_days = graph_date_days @ date_days + graph_century = pynini.string_file(get_abs_path("data/date/century.tsv")).invert() + graph_morph_features = pynini.string_file(get_abs_path("data/ordinals/morph_features.tsv")) self.day = pynutil.insert("day: \"") + graph_date_days + pynutil.insert("\" ") self.month = pynutil.insert("month: \"") + month_graph + pynutil.insert("\" ") @@ -60,8 +67,10 @@ def __init__(self, cardinal: GraphFst): + pynutil.insert("\" ") ) self.century = pynutil.insert("text: \"") + graph_century + pynutil.insert("\" ") - insert_comma = pynutil.insert(", ") - + self.ordinal_century = pynutil.insert("era: \"") + cardinal_graph + pynutil.insert("\" ") + self.morpho_graph = ( + pynutil.insert("morphosyntactic_features: \"") + graph_morph_features + pynutil.insert("\"") + ) graph_day_month = self.day + delete_space + self.month graph_month_day = self.month + delete_space + self.day graph_month_day += pynutil.insert(" preserve_order: true") @@ -76,7 +85,9 @@ def __init__(self, cardinal: GraphFst): ) graph_month_year_century = self.month + delete_space + self.year + delete_space + self.century graph_year_range = self.year_range + graph_year_range_century = self.year_range + delete_space + self.century + graph_ordinal_century = self.ordinal_century + self.morpho_graph + delete_extra_space + self.century graph_date_exceptions = self.month + delete_space + pynutil.delete("की") + delete_space + self.day graph_date_exceptions += pynutil.insert("preserve_order: true") @@ -91,6 +102,8 @@ def __init__(self, cardinal: GraphFst): | graph_day_month_year_century | graph_month_year_century | graph_year_range + | graph_year_range_century + | graph_ordinal_century | graph_date_exceptions ) final_graph = self.add_tokens(graph) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/fraction.py index 1e44f59e8..970bf7313 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/fraction.py @@ -16,13 +16,14 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path -from nemo_text_processing.text_normalization.en.graph_utils import ( +from nemo_text_processing.inverse_text_normalization.hi.utils import apply_fst, get_abs_path +from nemo_text_processing.text_normalization.en.utils import load_labels +from nemo_text_processing.text_normalization.hi.graph_utils import ( INPUT_CASED, INPUT_LOWER_CASED, MIN_NEG_WEIGHT, MINUS, - NEMO_DIGIT, + NEMO_HI_DIGIT, NEMO_SIGMA, TO_LOWER, GraphFst, @@ -30,7 +31,6 @@ delete_extra_space, delete_space, ) -from nemo_text_processing.text_normalization.en.utils import load_labels class FractionFst(GraphFst): @@ -59,9 +59,99 @@ def __init__(self, cardinal: GraphFst): numerator = pynutil.insert("numerator: \"") + graph_cardinal + pynutil.insert("\"") denominator = pynutil.insert(" denominator: \"") + graph_cardinal + pynutil.insert("\"") + graph_fraction = numerator + delete_bata + denominator + graph_mixed_fraction = integer + delete_extra_space + pynutil.delete("सही") + delete_space + graph_fraction + + graph_saade = pynutil.add_weight( + pynutil.delete("साढ़े") + + delete_space + + integer + + pynutil.insert("numerator: \"१\"") + + delete_space + + pynutil.insert(" denominator: \"२\""), + -0.01, + ) + graph_sava = pynutil.add_weight( + pynutil.delete("सवा") + + delete_space + + integer + + pynutil.insert("numerator: \"१\"") + + delete_space + + pynutil.insert(" denominator: \"४\""), + -0.001, + ) + graph_paune = pynutil.add_weight( + pynutil.delete("पौने") + + delete_space + + integer + + pynutil.insert("numerator: \"३\"") + + delete_space + + pynutil.insert(" denominator: \"४\""), + -0.01, + ) + graph_dedh = pynutil.add_weight( + pynini.union(pynutil.delete("डेढ़") | pynutil.delete("डेढ़")) + + delete_space + + pynutil.insert("integer_part: \"१\"") + + pynutil.insert(" numerator: \"१\"") + + delete_space + + pynutil.insert(" denominator: \"२\""), + -0.01, + ) + graph_dhaai = pynutil.add_weight( + pynutil.delete("ढाई") + + delete_space + + pynutil.insert("integer_part: \"२\"") + + pynutil.insert(" numerator: \"१\"") + + delete_space + + pynutil.insert(" denominator: \"२\""), + -0.1, + ) + + graph_aadha_and_saade_only = ( + pynini.union(pynutil.delete("आधा") | pynutil.delete("साढ़े")) + + delete_space + + pynutil.insert(" numerator: \"१\"") + + delete_space + + pynutil.insert(" denominator: \"२\"") + ) + graph_sava_only = ( + pynutil.delete("सवा") + + delete_space + + pynutil.insert(" numerator: \"१\"") + + delete_space + + pynutil.insert(" denominator: \"४\"") + ) + graph_paune_only = ( + pynini.union(pynutil.delete("पौन") | pynutil.delete("पौना")) + + delete_space + + pynutil.insert("numerator: \"३\"") + + delete_space + + pynutil.insert(" denominator: \"४\"") + ) + + graph_tihaai = ( + numerator + delete_space + pynutil.delete("तिहाई") + delete_space + pynutil.insert(" denominator: \"३\"") + ) + graph_chauthaai = ( + numerator + delete_space + pynutil.delete("चौथाई") + delete_space + pynutil.insert(" denominator: \"४\"") + ) + + graph_quarterly_exceptions = ( + graph_saade + | graph_sava + | graph_paune + | graph_dedh + | graph_dhaai + | graph_aadha_and_saade_only + | graph_sava_only + | graph_paune_only + | graph_tihaai + | graph_chauthaai + ) - graph = graph_fraction + graph = graph_fraction | graph_mixed_fraction | graph_quarterly_exceptions self.graph = graph.optimize() self.final_graph_wo_negative = graph optional_graph_negative = pynini.closure( diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/measure.py index 15d8e4eb8..192fc88d3 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/measure.py @@ -23,7 +23,7 @@ delete_space, insert_space, ) -from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path +from nemo_text_processing.inverse_text_normalization.hi.utils import apply_fst, get_abs_path class MeasureFst(GraphFst): @@ -51,7 +51,60 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): ) measurements_graph = pynini.string_file(get_abs_path("data/measure/measurements.tsv")).invert() + paune_graph = pynini.string_file(get_abs_path("data/numbers/paune.tsv")).invert() + self.measurements = pynutil.insert("units: \"") + measurements_graph + pynutil.insert("\" ") + graph_integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") + graph_integer_paune = pynutil.insert("integer_part: \"") + paune_graph + pynutil.insert("\"") + + graph_saade_single_digit = pynutil.add_weight( + pynutil.delete("साढ़े") + + delete_space + + graph_integer + + delete_space + + pynutil.insert(" fractional_part: \"५\""), + 0.1, + ) + graph_sava_single_digit = pynutil.add_weight( + pynutil.delete("सवा") + + delete_space + + graph_integer + + delete_space + + pynutil.insert(" fractional_part: \"२५\""), + 0.1, + ) + graph_paune_single_digit = pynutil.add_weight( + pynutil.delete("पौने") + + delete_space + + graph_integer_paune + + delete_space + + pynutil.insert(" fractional_part: \"७५\""), + 1, + ) + graph_dedh_single_digit = pynutil.add_weight( + pynini.union(pynutil.delete("डेढ़") | pynutil.delete("डेढ़")) + + delete_space + + pynutil.insert("integer_part: \"१\"") + + delete_space + + pynutil.insert(" fractional_part: \"५\""), + 0.1, + ) + graph_dhaai_single_digit = pynutil.add_weight( + pynutil.delete("ढाई") + + delete_space + + pynutil.insert("integer_part: \"२\"") + + delete_space + + pynutil.insert(" fractional_part: \"५\""), + 1, + ) + + graph_exceptions = ( + graph_saade_single_digit + | graph_sava_single_digit + | graph_paune_single_digit + | graph_dedh_single_digit + | graph_dhaai_single_digit + ) graph_measurements = ( pynutil.insert("decimal { ") @@ -71,8 +124,29 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + delete_extra_space + self.measurements ) + graph_quarterly_measurements = ( + pynutil.insert("decimal { ") + + optional_graph_negative + + graph_exceptions + + pynutil.insert(" }") + + delete_extra_space + + self.measurements + ) + graph_exception_bai = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + pynutil.insert("integer: \"") + + cardinal_graph + + delete_space + + pynini.cross("बाई", "x") + + delete_space + + cardinal_graph + + pynutil.insert("\"") + + pynutil.insert(" }") + + pynini.closure(delete_extra_space + self.measurements) + ) - graph = graph_measurements + graph = graph_measurements | graph_quarterly_measurements | graph_exception_bai self.graph = graph.optimize() final_graph = self.add_tokens(graph) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/money.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/money.py index 7fa59ee26..e79c9d0b3 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/money.py @@ -21,7 +21,7 @@ delete_space, insert_space, ) -from nemo_text_processing.inverse_text_normalization.hi.utils import apply_fst, get_abs_path +from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path class MoneyFst(GraphFst): @@ -40,14 +40,22 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): super().__init__(name="money", kind="classify") cardinal_graph = cardinal.graph_no_exception + cardinal_single_and_double_digit_graph = cardinal.graph_digit | cardinal.graph_teens_and_ties decimal_graph = decimal.final_graph_wo_negative currency_graph = pynini.string_file(get_abs_path("data/money/currency.tsv")).invert() + paune_graph = pynini.string_file(get_abs_path("data/numbers/paune.tsv")).invert() self.integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") + self.integer_quarterly_measures = pynutil.insert("integer_part: \"") + cardinal_single_and_double_digit_graph + self.integer_paune = pynutil.insert("integer_part: \"") + paune_graph self.paise = pynutil.insert("fractional_part: \"") + cardinal_graph + pynutil.insert("\"") self.fraction = decimal_graph self.currency = pynutil.insert("currency: \"") + currency_graph + pynutil.insert("\" ") aur = pynutil.delete("और") + delete_hundred = pynutil.delete("सौ") + delete_lakh = pynutil.delete("लाख") + delete_hazar = pynutil.delete("हजार") | pynutil.delete("हज़ार") + delete_crore = pynutil.delete("करोड़") | pynutil.delete("करोड़") graph_currency_decimal = self.fraction + delete_extra_space + self.currency graph_currency_cardinal = self.integer + delete_extra_space + self.currency @@ -60,8 +68,186 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + delete_extra_space + pynutil.delete(currency_graph) ) + # cases for saade,sava with teens and ties + graph_saade_teens_ties = ( + pynutil.delete("साढ़े") + + delete_space + + self.integer_quarterly_measures + + pynutil.insert("\"") + + delete_space + + pynutil.insert(" fractional_part: \"५०\"") + + delete_extra_space + + self.currency + ) + graph_sava_teens_ties = ( + pynutil.delete("सवा") + + delete_space + + self.integer_quarterly_measures + + pynutil.insert("\"") + + delete_space + + pynutil.insert(" fractional_part: \"२५\"") + + delete_extra_space + + self.currency + ) + graph_dedh = ( + pynini.union(pynutil.delete("डेढ़") | pynutil.delete("डेढ़")) + + delete_space + + pynutil.insert("integer_part: \"१\"") + + delete_space + + pynutil.insert(" fractional_part: \"५०\"") + + delete_extra_space + + self.currency + ) + graph_dhaai = ( + pynutil.delete("ढाई") + + delete_space + + pynutil.insert("integer_part: \"२\"") + + delete_space + + pynutil.insert(" fractional_part: \"५०\"") + + delete_extra_space + + self.currency + ) + + graph_exceptions_teens_ties = graph_saade_teens_ties | graph_sava_teens_ties | graph_dedh | graph_dhaai + + # cases for saade,sava,paune,dedh and dhaai with hundreds and thousands + graph_exceptions = self.integer + delete_extra_space + self.currency + + # exceptions with lakhs + graph_saade_lakh = pynutil.add_weight( + pynutil.delete("साढ़े") + + delete_space + + self.integer_quarterly_measures + + delete_space + + pynutil.insert("५००००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_lakh + + delete_extra_space + + self.currency, + 0.01, + ) + graph_sava_lakh = pynutil.add_weight( + pynutil.delete("सवा") + + delete_space + + self.integer_quarterly_measures + + delete_space + + pynutil.insert("२५०००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_lakh + + delete_extra_space + + self.currency, + 0.01, + ) + graph_paune_lakh = ( + pynutil.delete("पौने") + + delete_space + + self.integer_paune + + delete_space + + pynutil.insert("७५०००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_lakh + + delete_extra_space + + self.currency + ) + graph_dedh_lakh = ( + pynini.union(pynutil.delete("डेढ़") | pynutil.delete("डेढ़")) + + delete_space + + pynutil.insert("integer_part: \"") + + pynutil.insert("१५००००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_lakh + + delete_extra_space + + self.currency + ) + graph_dhaai_lakh = ( + pynutil.delete("ढाई") + + delete_space + + pynutil.insert("integer_part: \"") + + pynutil.insert("२५००००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_lakh + + delete_extra_space + + self.currency + ) + + graph_exceptions_lakhs = ( + graph_saade_lakh | graph_sava_lakh | graph_paune_lakh | graph_dedh_lakh | graph_dhaai_lakh + ) + + # exceptions with crores + graph_saade_crore = ( + pynutil.delete("साढ़े") + + delete_space + + self.integer_quarterly_measures + + delete_space + + pynutil.insert("५००००००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_crore + + delete_extra_space + + self.currency + ) + graph_sava_crore = ( + pynutil.delete("सवा") + + delete_space + + self.integer_quarterly_measures + + delete_space + + pynutil.insert("२५०००००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_crore + + delete_extra_space + + self.currency + ) + graph_paune_crore = ( + pynutil.delete("पौने") + + delete_space + + self.integer_paune + + delete_space + + pynutil.insert("७५०००००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_crore + + delete_extra_space + + self.currency + ) + graph_dhaai_crore = ( + pynutil.delete("ढाई") + + delete_space + + pynutil.insert("integer_part: \"") + + pynutil.insert("२५००००००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_crore + + delete_extra_space + + self.currency + ) + graph_dedh_crore = ( + pynini.union(pynutil.delete("डेढ़") | pynutil.delete("डेढ़")) + + delete_space + + pynutil.insert("integer_part: \"") + + pynutil.insert("१५००००००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_crore + + delete_extra_space + + self.currency + ) + + graph_exceptions_crores = ( + graph_saade_crore | graph_sava_crore | graph_paune_crore | graph_dedh_crore | graph_dhaai_crore + ) + + graph_quarterly_measures = ( + graph_exceptions_teens_ties | graph_exceptions | graph_exceptions_lakhs | graph_exceptions_crores + ) - graph = graph_currency_decimal | graph_currency_cardinal | graph_rupay_and_paisa + graph = graph_currency_decimal | graph_currency_cardinal | graph_rupay_and_paisa | graph_quarterly_measures self.graph = graph.optimize() final_graph = self.add_tokens(graph) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/ordinal.py index d6f4d59ac..473055891 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/ordinal.py @@ -83,5 +83,6 @@ def __init__(self, cardinal: GraphFst): rule = pynini.cdrewrite(morpho_graph, pynini.closure(NEMO_HI_DIGIT), pynini.union("[EOS]", " "), NEMO_SIGMA) final_graph = pynutil.insert("integer: \"") + graph @ rule - final_graph = self.add_tokens(final_graph) + self.final_graph = self.add_tokens(final_graph) + final_graph = self.final_graph self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/time.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/time.py index 6bfc51af7..942b5022b 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/time.py @@ -15,7 +15,14 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.hi.graph_utils import GraphFst, delete_space, insert_space +from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( + DEVANAGARI_DIGIT, + GraphFst, + delete_extra_space, + delete_space, + insert_space, + integer_to_devanagari, +) from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path @@ -29,12 +36,15 @@ class TimeFst(GraphFst): time: TimeFst """ - def __init__(self): + def __init__(self, cardinal: GraphFst): super().__init__(name="time", kind="classify") - hour_graph = pynini.string_file(get_abs_path("data/time/hour.tsv")).invert() - minute_graph = pynini.string_file(get_abs_path("data/time/minute_and_second.tsv")).invert() - second_graph = pynini.string_file(get_abs_path("data/time/minute_and_second.tsv")).invert() + hour_graph = cardinal.graph_digit | cardinal.graph_teens_and_ties + time_hours = pynini.union(*[integer_to_devanagari(i) for i in range(1, 25)]).optimize() + hour_graph = hour_graph @ time_hours + + cardinal_graph = cardinal.graph_single_digit_with_zero | cardinal.graph_teens_and_ties + paune_hour_graph = pynini.string_file(get_abs_path("data/time/hour_for_paune.tsv")).invert() delete_baje = pynini.union( pynutil.delete("बजके") | pynutil.delete("बजकर") | pynutil.delete("बजे") | pynutil.delete("घंटा") @@ -44,8 +54,9 @@ def __init__(self): delete_second = pynutil.delete("सेकंड") self.hour = pynutil.insert("hours: \"") + hour_graph + pynutil.insert("\" ") - self.minute = pynutil.insert("minutes: \"") + minute_graph + pynutil.insert("\" ") - self.second = pynutil.insert("seconds: \"") + second_graph + pynutil.insert("\" ") + self.paune_hour = pynutil.insert("hours: \"") + paune_hour_graph + pynutil.insert("\" ") + self.minute = pynutil.insert("minutes: \"") + cardinal_graph + pynutil.insert("\" ") + self.second = pynutil.insert("seconds: \"") + cardinal_graph + pynutil.insert("\" ") # hour minute second graph_hms = ( @@ -63,17 +74,20 @@ def __init__(self): ) # hour minute and hour minute without "baje and minat" - graph_hm = ( + graph_hm = pynutil.add_weight( self.hour + delete_space + pynini.closure(delete_baje, 0, 1) + delete_space + self.minute - + pynini.closure(delete_space + delete_minute, 0, 1) + + pynini.closure(delete_space + delete_minute, 0, 1), + 0.01, ) # hour second - graph_hs = self.hour + delete_space + delete_baje + delete_space + self.second + delete_space + delete_second + graph_hs = pynutil.add_weight( + self.hour + delete_space + delete_baje + delete_space + self.second + delete_space + delete_second, 0.01 + ) # minute second graph_ms = ( @@ -83,7 +97,61 @@ def __init__(self): # hour graph_hour = self.hour + delete_space + delete_baje - graph = graph_hms | graph_hm | graph_hs | graph_ms | graph_hour + graph_saade = pynutil.add_weight( + pynutil.delete("साढ़े") + + delete_space + + self.hour + + delete_space + + pynutil.insert(" minutes: \"३०\"") + + delete_space + + pynini.closure(delete_baje), + 0.01, + ) + graph_sava = pynutil.add_weight( + pynutil.delete("सवा") + + delete_space + + self.hour + + delete_space + + pynutil.insert(" minutes: \"१५\"") + + delete_space + + pynini.closure(delete_baje), + 0.01, + ) + graph_paune = pynutil.add_weight( + pynutil.delete("पौने") + + delete_space + + self.paune_hour + + delete_space + + pynutil.insert(" minutes: \"४५\"") + + delete_space + + pynini.closure(delete_baje), + 0.01, + ) + graph_dedh = pynutil.add_weight( + pynini.union(pynutil.delete("डेढ़") | pynutil.delete("डेढ़")) + + delete_space + + delete_baje + + pynutil.insert("hours: \"१\"") + + delete_space + + pynutil.insert(" minutes: \"३०\""), + 0.01, + ) + graph_dhaai = pynutil.add_weight( + pynutil.delete("ढाई") + + delete_space + + delete_baje + + pynutil.insert("hours: \"२\"") + + delete_space + + pynutil.insert(" minutes: \"३०\""), + 0.01, + ) + graph_quarterly_measures = ( + graph_dedh + | graph_dhaai + | ((graph_saade | graph_sava | graph_paune) + pynini.closure(delete_space + delete_baje)) + ) + + graph = graph_hms | graph_hm | graph_hs | graph_ms | graph_hour | graph_quarterly_measures self.graph = graph.optimize() final_graph = self.add_tokens(graph) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py index b3fcb0c2d..50abab0e5 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py @@ -79,9 +79,9 @@ def __init__( decimal_graph = decimal.fst fraction = FractionFst(cardinal) fraction_graph = fraction.fst - date = DateFst(cardinal) + date = DateFst(cardinal, ordinal) date_graph = date.fst - time = TimeFst() + time = TimeFst(cardinal) time_graph = time.fst measure = MeasureFst(cardinal, decimal) measure_graph = measure.fst diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py index eacfb5765..7a5c10c4c 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py @@ -30,7 +30,7 @@ class DateFst(GraphFst): date { day: "५" month: "जनवरी" year: "२०१२" preserve_order: true } -> ५ जनवरी २०१२ """ - def __init__(self): + def __init__(self, cardinal: GraphFst, ordinal: GraphFst): super().__init__(name="date", kind="verbalize") month = ( pynutil.delete("month:") @@ -61,6 +61,21 @@ def __init__(self): + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") ) + era = ( + pynutil.delete("era:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + morpho_features = ( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + graph_fy = year graph_fy |= period + delete_space + year @@ -100,6 +115,11 @@ def __init__(self): # year range graph_year_range = year + # ordinal century + graph_ordinal_century = era + delete_space + morpho_features + delete_extra_space + period + + # graph_ordinal_range = graph_ordinal + delete_extra_space + period + optional_preserve_order = pynini.closure( pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space | pynutil.delete("field_order:") @@ -122,6 +142,7 @@ def __init__(self): | graph_dmyc | graph_myc | graph_year_range + | graph_ordinal_century ) + delete_space + optional_preserve_order diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/fraction.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/fraction.py index 0fa7e97bd..45b5832b5 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/fraction.py @@ -16,6 +16,7 @@ import pynini from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.hi.utils import apply_fst from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst, delete_space diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/measure.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/measure.py index 1fc9ba373..d6d8f72f8 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/measure.py @@ -52,7 +52,17 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): graph_decimal = ( pynutil.delete("decimal {") + delete_space + decimal.numbers + delete_space + pynutil.delete("}") ) + graph_exception_bai = ( + pynutil.delete("cardinal {") + + delete_space + + optional_sign + + delete_space + + cardinal.numbers + + delete_space + + pynutil.delete("}") + ) graph = (graph_cardinal | graph_decimal) + delete_space + pynutil.insert(" ") + unit + graph |= graph_exception_bai + pynini.closure(delete_space + pynutil.insert(" ") + unit) delete_tokens = self.delete_tokens(graph) self.decimal = graph_decimal self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/ordinal.py index d6c4e0025..94f280798 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/ordinal.py @@ -40,6 +40,7 @@ def __init__(self): + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") ) - + self.numbers = graph + graph = graph.optimize() delete_tokens = self.delete_tokens(graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py index 165fe7a7e..f1a6c55a3 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py @@ -38,11 +38,12 @@ def __init__(self): super().__init__(name="verbalize", kind="verbalize") cardinal = CardinalFst() cardinal_graph = cardinal.fst - ordinal_graph = OrdinalFst().fst + ordinal = OrdinalFst() + ordinal_graph = ordinal.fst decimal = DecimalFst() decimal_graph = decimal.fst fraction_graph = FractionFst().fst - date_graph = DateFst().fst + date_graph = DateFst(cardinal, ordinal).fst time_graph = TimeFst().fst measure_graph = MeasureFst(cardinal, decimal).fst money_graph = MoneyFst(cardinal, decimal).fst diff --git a/nemo_text_processing/text_normalization/data_loader_utils.py b/nemo_text_processing/text_normalization/data_loader_utils.py index 5e7fa1892..6a46230cf 100644 --- a/nemo_text_processing/text_normalization/data_loader_utils.py +++ b/nemo_text_processing/text_normalization/data_loader_utils.py @@ -162,8 +162,8 @@ def training_data_to_tokens( for instance in data: if instance.token_type != EOS_TYPE: if category is None or instance.token_type == category: - result[instance.token_type][0].append(instance.un_normalized) - result[instance.token_type][1].append(instance.normalized) + result[instance.token_type][0].append(unicodedata.normalize(NFC, instance.un_normalized)) + result[instance.token_type][1].append(unicodedata.normalize(NFC, instance.normalized)) return result diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_cardinal.txt index a72ad4183..4a7221675 100644 --- a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_cardinal.txt +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_cardinal.txt @@ -43,4 +43,12 @@ एक अरब बारह करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ~११२२३४५५६७ एक अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ~१०२२३४५५६७ ग्यारह अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ~११०२२३४५५६७ -इक्यावन अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ~५१०२२३४५५६७ \ No newline at end of file +इक्यावन अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ~५१०२२३४५५६७ +सवा सात सौ~७२५ +साढ़े सात सौ~७५० +साढ़े सात हज़ार~७५०० +सवा सात हज़ार~७२५० +डेढ़ सौ~१५० +ढाई सौ~२५० +साढ़े सोलह सौ~१६५० +सवा सोलह सौ~१६२५ diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt index 6d570a9c5..402361d71 100644 --- a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt @@ -33,3 +33,10 @@ पच्चीस ईसा पूर्व~२५ ई.पू. मार्च की दो~मार्च २ फ़रवरी की बीस~फ़रवरी २० +उन्नीस सौ नब्बे से उन्नीस सौ इक्यानबे ईसवी~१९९०-१९९१ ई. +दो हज़ार पाँच से दो हज़ार उन्नीस ईसा पूर्व~२००५-२०१९ ई.पू. +दसवें शताब्दी~१०वें शताब्दी +अठाहरवीं शताब्दी~१८वीं शताब्दी +एक हज़ार एकवीं शताब्दी~१००१वीं शताब्दी +एक सौ उन्नीसवां शताब्दी~११९वां शताब्दी +उन्नीस सौ बीस से छब्बीस तक~१९२०-२६ तक \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_decimal.txt index 23bef1a85..5b8d86602 100644 --- a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_decimal.txt +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_decimal.txt @@ -4,4 +4,10 @@ एक सौ आठ दशमलव सात पाँच~१०८.७५ एक सौ दस दशमलव सात पाँच~११०.७५ एक सौ दो दशमलव तीन~१०२.३ -एक सौ छह दशमलव पाँच~१०६.५ \ No newline at end of file +एक सौ छह दशमलव पाँच~१०६.५ +साढ़े तीन सौ दशमलव दो दो~३५०.२२ +सवा तीन सौ दशमलव दो~३२५.२ +साढ़े चार सौ दशमलव सात पाँच~४५०.७५ +सवा चार सौ दशमलव सात पाँच~४२५.७५ +ढाई सौ दशमलव छह~२५०.६ +डेढ़ सौ दशमलव सात पाँच~१५०.७५ diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_fraction.txt index 12ee24d61..21ceff6c6 100644 --- a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_fraction.txt +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_fraction.txt @@ -7,4 +7,25 @@ एक सौ तेईस बटा एक सौ पच्चीस~१२३/१२५ छह सौ बासठ बटा एक~६६२/१ एक सौ पाँच बटा सात~१०५/७ -छह सौ चौवन बटा तीन~६५४/३ \ No newline at end of file +छह सौ चौवन बटा तीन~६५४/३ +एक सौ तैंतीस सही एक बटा दो~१३३ १/२ +एक सौ तैंतीस सही दो बटा तीन~१३३ २/३ +एक सही छह बटा छह~१ ६/६ +दो सही एक बटा छह~२ १/६ +तीन सही तीन बटा चार~३ ३/४ +एक सौ बीस सही तीन बटा चार~१२० ३/४ +एक सौ बीस सही पिछत्तर बटा नब्बे~१२० ७५/९० +तीन सही तीन बटा चार~३ ३/४ +सवा चौरासी~८४ १/४ +डेढ़~१ १/२ +ढाई~२ १/२ +आधा~१/२ +साढ़े~१/२ +सवा~१/४ +पौन~३/४ +पौना~३/४ +सवा पैंतीस~३५ १/४ +साढ़े चार सौ बटा दस~४५०/१० +तीन चौथाई~३/४ +दो तिहाई~२/३ +एक चौथाई~१/४ diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_measure.txt index 3bd860f08..21615f1c5 100644 --- a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_measure.txt @@ -29,4 +29,20 @@ तैंतीस दशमलव तीन तीन किलोमीटर प्रति घंटा~३३.३३ km/h चौदह हज़ार इकहत्तर दशमलव नौ नौ पिंट~१४०७१.९९ pt बहत्तर दशमलव आठ तीन मील प्रति घंटा~७२.८३ mi/h -बहत्तर मील प्रति घंटा~७२ mi/h \ No newline at end of file +बहत्तर मील प्रति घंटा~७२ mi/h +पौने ग्यारह घंटे~१०.७५ h +साढ़े सात वर्ष~७.५ yr +सवा ग्यारह सौ मीटर~११२५ m +पौने चार सौ हेक्टेयर~३७५ ha +साढ़े दस घन फीट~१०.५ ft³ +पौने पांच सौ किलोमीटर~४७५ km +ढाई सौ गैलन~२५० gal +डेढ़ दर्जन~१.५ doz +साढ़े सात ऐंपीयर~७.५ A +पौने तीन हजार एकड़~२७५० ac +साढ़े बारह वर्ग माइक्रोमीटर~१२.५ µm² +ढाई महीने~२.५ mo +दो बाई दो~२x२ +दो बाई दो~२x२ +पाँच बाई पाँच~५x५ +बाईस बाई पाँच घन फीट~२२x५ ft³ diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_money.txt index 8cc06397b..8821940c3 100644 --- a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_money.txt +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_money.txt @@ -21,4 +21,30 @@ इकहत्तर हज़ार इकहत्तर बिटकॉइन~₿७१०७१ बत्तीस बुरुंडी फ्रैंक~fbu३२ पन्द्रह सौ कैमन आइलैंड्स डॉलर~ci$१५०० -छह सौ पच्चीस रुपये दो पैसे~₹६२५.२ \ No newline at end of file +छह सौ पच्चीस रुपये दो पैसे~₹६२५.२ +साढ़े सात सौ डॉलर~$७५० +सवा दो सौ यूक्रेनी ग्रिव्ना~₴२२५ +साढ़े छः लाख रुपए~₹६५०००० +सवा छः लाख अल्जीरियाई दिनार~دج६२५००० +सवा पंद्रह लाख युगांडा शिलिंग~ush१५२५००० +साढ़े पंद्रह लाख रुपए~₹१५५०००० +साढ़े पाँच हज़ार लीरा~₺५५०० +ढाई सौ यूरो~€२५० +ढाई हजार बुरुंडी फ्रैंक~fbu२५०० +ढाई करोड़ रुपए~₹२५०००००० +ढाई लाख रुपए~₹२५०००० +डेढ़ सौ यूरो~€१५० +डेढ़ हजार रुपए~₹१५०० +डेढ़ करोड़ रुपए~₹१५०००००० +डेढ़ लाख रुपए~₹१५०००० +पौने तीन सौ रुपए~₹२७५ +पौने पंद्रह सौ रुपए~₹१४७५ +पौने तीन हजार रुपए~₹२७५० +पौने पंद्रह हजार यूरो~€१४७५० +पौने पैंतालिस हजार यूरो~€४४७५० +पौने तीन लाख रुपए~₹२७५००० +पौने पंद्रह लाख रुपए~₹१४७५००० +पौने पैंतालिस लाख रुपए~₹४४७५००० +पौने तीन करोड़ रुपए~₹२७५००००० +पौने पंद्रह करोड़ रुपए~₹१४७५००००० +पौने पैंतालिस करोड़ रुपए~₹४४७५००००० diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_time.txt index c1edb837d..8ec5e4df3 100644 --- a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_time.txt +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_time.txt @@ -14,4 +14,12 @@ नौ घंटा दो सेकंड~९:००:०२ सोलह घंटा एक मिनट सत्ताईस सेकंड~१६:०१:२७ दस बजकर चौवन मिनट आठ सेकंड~१०:५४:०८ -तीन मिनट उन्नीस सेकंड~००:०३:१९ \ No newline at end of file +तीन मिनट उन्नीस सेकंड~००:०३:१९ +ढाई बजे~२:३० +डेढ़ बजे~१:३० +डेढ़ घंटा~१:३० +साढ़े पाँच बजे~५:३० +सवा चार बजे~४:१५ +साढ़े ग्यारह~११:३० +पौने पाँच~४:४५ +पौने तीन घंटा~२:४५ diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_whitelist.txt index 30824fced..68f4fd775 100644 --- a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_whitelist.txt +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_whitelist.txt @@ -1,5 +1,3 @@ -डेढ़ बजे~१:३० -ढाई बजे~२:३० मास्टर निखिल तनिष~मा. निखिल तनिष पाव~१/४ श्रीमती ज्योत्सना~स्मि. ज्योत्सना @@ -7,6 +5,4 @@ आधा कप चाय~१/२ कप चाय श्रीमान भारत कुमार~श्री. भारत कुमार डॉक्टर प्रशांत~डॉ. प्रशांत -डेढ़~१.५ कुमारी~कु. -ढाई~२.५ \ No newline at end of file