From b13b6a0cd3dc586ea4234acd4bc7b41794668120 Mon Sep 17 00:00:00 2001 From: tarushi2k2 Date: Thu, 3 Apr 2025 15:43:26 +0530 Subject: [PATCH 1/4] Future implementations to date.py - Hindi ITN (#265) * Addition of whitelist and word classes Signed-off-by: Tarushi V * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Updation of Jenkins date Signed-off-by: Tarushi V * Cleanup Signed-off-by: Tarushi V * Updation Signed-off-by: Tarushi V * Updation Signed-off-by: Tarushi V * Future implementations for date Signed-off-by: Tarushi V * pushing rough date code for ref Signed-off-by: Tarushi V * Future implementations date.py Signed-off-by: Tarushi V * Cleanup Signed-off-by: Tarushi V * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Updation of Jenkinsfile Signed-off-by: Tarushi V * Telephone.py-hindi itn Signed-off-by: Tarushi V * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Telephone.py - Hindi ITN Signed-off-by: Tarushi V * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Telephone modified tagger and verbalizer Signed-off-by: Tarushi V * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * telephone tagger with 3,4,5 digit std codes Signed-off-by: Tarushi V * Further additions - telephone.py Signed-off-by: Tarushi V * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Jenkins update Signed-off-by: Tarushi V * Telephone.py Signed-off-by: Tarushi V * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Updated tagger-telephone.py Signed-off-by: Tarushi V * Telephone and Jenkinsfile cleanup Signed-off-by: Tarushi V * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see 
https://pre-commit.ci * Update Jenkins Signed-off-by: Tarushi V --------- Signed-off-by: Tarushi V Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Anand Joseph --- Jenkinsfile | 2 +- .../hi/data/date/century.tsv | 3 + .../hi/data/telephone/eng_to_hindi_digit.tsv | 10 ++ .../telephone/teens_and_ties_eng_to_hin.tsv | 90 ++++++++++ .../hi/taggers/date.py | 35 +++- .../hi/taggers/telephone.py | 158 ++++++++++++++++++ .../hi/taggers/tokenize_and_classify.py | 4 + .../hi/verbalizers/date.py | 38 ++++- .../hi/verbalizers/telephone.py | 55 ++++++ .../hi/verbalizers/verbalize.py | 3 + .../test_cases_date.txt | 12 +- .../test_cases_telephone.txt | 37 ++++ ..._sparrowhawk_inverse_text_normalization.sh | 5 + .../nemo_text_processing/hi/test_telephone.py | 31 ++++ 14 files changed, 477 insertions(+), 6 deletions(-) create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_to_hindi_digit.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/telephone/teens_and_ties_eng_to_hin.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py create mode 100644 nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py create mode 100644 tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt create mode 100644 tests/nemo_text_processing/hi/test_telephone.py diff --git a/Jenkinsfile b/Jenkinsfile index 6edad14a2..82a0a4799 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-29-24-1' + 
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-03-25-1' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv new file mode 100644 index 000000000..da69e23eb --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv @@ -0,0 +1,3 @@ +ई.पू. ईसा पूर्व +ई. ईस्वी +ई. ईसवी diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_to_hindi_digit.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_to_hindi_digit.tsv new file mode 100644 index 000000000..53c5e36cb --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_to_hindi_digit.tsv @@ -0,0 +1,10 @@ +० zero +१ one +२ two +३ three +४ four +५ five +६ six +७ seven +८ eight +९ nine diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/telephone/teens_and_ties_eng_to_hin.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/teens_and_ties_eng_to_hin.tsv new file mode 100644 index 000000000..ac37b55f2 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/teens_and_ties_eng_to_hin.tsv @@ -0,0 +1,90 @@ +१० ten +११ eleven +१२ twelve +१३ thirteen +१४ fourteen +१५ fifteen +१६ sixteen +१७ seventeen +१८ eighteen +१९ nineteen +२० twenty +२१ twenty one +२२ twenty two +२३ twenty three +२४ twenty four +२५ twenty five +२६ twenty six +२७ twenty seven +२८ twenty eight +२९ twenty nine +३० thirty +३१ thirty one +३२ thirty two +३३ thirty three +३४ thirty four +३५ thirty five +३६ thirty six +३७ thirty seven +३८ thirty eight +३९ thirty nine +४० forty +४१ forty one +४२ forty two +४३ forty three +४४ forty four +४५ forty five +४६ forty six +४७ forty seven +४८ forty eight +४९ forty nine +५० fifty +५१ fifty one +५२ fifty two +५३ fifty three +५४ fifty four +५५ 
fifty five +५६ fifty six +५७ fifty seven +५८ fifty eight +५९ fifty nine +६० sixty +६१ sixty one +६२ sixty two +६३ sixty three +६४ sixty four +६५ sixty five +६६ sixty six +६७ sixty seven +६८ sixty eight +६९ sixty nine +७० seventy +७१ seventy one +७२ seventy two +७३ seventy three +७४ seventy four +७५ seventy five +७६ seventy six +७७ seventy seven +७८ seventy eight +७९ seventy nine +८० eighty +८१ eighty one +८२ eighty two +८३ eighty three +८४ eighty four +८५ eighty five +८६ eighty six +८७ eighty seven +८८ eighty eight +८९ eighty nine +९० ninety +९१ ninety one +९२ ninety two +९३ ninety three +९४ ninety four +९५ ninety five +९६ ninety six +९७ ninety seven +९८ ninety eight +९९ ninety nine diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py index 61183ae72..6859f0834 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py @@ -44,10 +44,22 @@ def __init__(self, cardinal: GraphFst): month_graph = pynini.string_file(get_abs_path("data/date/months.tsv")) graph_date_days = pynini.string_file(get_abs_path("data/date/date_days.tsv")).invert() + graph_century = pynini.string_file(get_abs_path("data/date/century.tsv")).invert() self.day = pynutil.insert("day: \"") + graph_date_days + pynutil.insert("\" ") self.month = pynutil.insert("month: \"") + month_graph + pynutil.insert("\" ") self.year = pynutil.insert("year: \"") + graph_year + pynutil.insert("\" ") + self.year_range = ( + pynutil.insert("year: \"") + + graph_year + + delete_space + + pynini.cross("से", "-") + + delete_space + + graph_year + + delete_space + + pynutil.insert("\" ") + ) + self.century = pynutil.insert("text: \"") + graph_century + pynutil.insert("\" ") insert_comma = pynutil.insert(", ") graph_day_month = self.day + delete_space + self.month @@ -58,9 +70,28 @@ def __init__(self, cardinal: GraphFst): 
graph_month_day_year += pynutil.insert(" preserve_order: true") graph_month_year = self.month + delete_space + self.year graph_saal = self.year + graph_AD_BC = self.year + delete_space + self.century + graph_day_month_year_century = ( + self.day + delete_space + self.month + delete_space + self.year + delete_space + self.century + ) + graph_month_year_century = self.month + delete_space + self.year + delete_space + self.century + graph_year_range = self.year_range - graph = graph_day_month | graph_month_day | graph_day_month_year | graph_month_day_year | graph_month_year - self.graph = graph.optimize() + graph_date_exceptions = self.month + delete_space + pynutil.delete("की") + delete_space + self.day + graph_date_exceptions += pynutil.insert("preserve_order: true") + graph = ( + graph_day_month + | graph_month_day + | graph_day_month_year + | graph_month_day_year + | graph_month_year + | graph_saal + | graph_AD_BC + | graph_day_month_year_century + | graph_month_year_century + | graph_year_range + | graph_date_exceptions + ) final_graph = self.add_tokens(graph) self.fst = final_graph diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py new file mode 100644 index 000000000..1d1d3c875 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py @@ -0,0 +1,158 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.hi.graph_utils import GraphFst, delete_space +from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path + + +class TelephoneFst(GraphFst): + """ + Finite state transducer for classifying telephone numbers, e.g. + प्लस इक्यानवे नौ आठ सात छह पांच चार तीन दो एक शून्य => tokens { name: "+९१ ९८७६५ ४३२१०" } + + Args: + cardinal: CardinalFst + """ + + def __init__(self, cardinal: GraphFst): + super().__init__(name="telephone", kind="classify") + + hindi_digit_graph = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert() + hindi_digit_graph |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert() + + english_digit_graph = pynini.string_file(get_abs_path("data/telephone/eng_to_hindi_digit.tsv")).invert() + + country_code_graph_single_digits = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert() + country_code_graph_single_digits |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert() + country_code_graph_single_digits |= pynini.string_file( + get_abs_path("data/telephone/eng_to_hindi_digit.tsv") + ).invert() + + country_code_graph_double_digits = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")).invert() + country_code_graph_double_digits |= pynini.string_file( + get_abs_path("data/telephone/teens_and_ties_eng_to_hin.tsv") + ).invert() + + self.hindi_digit = ( + pynutil.insert("number_part: \"") + + pynini.closure(hindi_digit_graph + delete_space, 0, 9) + + hindi_digit_graph + + pynutil.insert("\" ") + ) + self.english_digit = ( + pynutil.insert("number_part: \"") + + pynini.closure(english_digit_graph + delete_space, 0, 9) + + english_digit_graph + + delete_space + + pynutil.insert("\" ") + ) + + self.country_code_with_single_digits = ( +
pynutil.insert("country_code: \"") + + pynini.closure(country_code_graph_single_digits + delete_space, 0, 2) + + pynutil.insert("\" ") + ) + self.country_code_with_double_digits = ( + pynutil.insert("country_code: \"") + + pynini.closure(country_code_graph_double_digits + delete_space, 0, 1) + + pynutil.insert("\" ") + ) + self.country_code = self.country_code_with_single_digits | self.country_code_with_double_digits + + # two, three, four-digit extension code with zero + self.city_code_hindi = ( + pynutil.insert("extension: \"") + + pynini.closure(hindi_digit_graph + delete_space, 2, 5) + + pynutil.insert("\" ") + ) + self.city_code_english = ( + pynutil.insert("extension: \"") + + pynini.closure(english_digit_graph + delete_space, 2, 5) + + pynutil.insert("\" ") + ) + + self.city_extension = self.city_code_hindi | self.city_code_english + + # 7-digit landline graph in hindi and english digits + self.landline_hindi = ( + pynutil.insert("number_part: \"") + + pynini.closure(hindi_digit_graph + delete_space, 7, 7) + + pynutil.insert("\" ") + ) + self.landline_english = ( + pynutil.insert("number_part: \"") + + pynini.closure(english_digit_graph + delete_space, 7, 7) + + pynutil.insert("\" ") + ) + + self.landline = self.landline_hindi | self.landline_english + + self.pincode_in_hindi = ( + pynutil.insert("number_part: \"") + + pynini.closure(hindi_digit_graph + delete_space, 0, 5) + + hindi_digit_graph + + pynutil.insert("\" ") + ) + self.pincode_in_english = ( + pynutil.insert("number_part: \"") + + pynini.closure(english_digit_graph + delete_space, 0, 5) + + english_digit_graph + + pynutil.insert("\" ") + ) + + self.credit_card_last_digits_hindi = ( + pynutil.insert("number_part: \"") + + pynini.closure(hindi_digit_graph + delete_space, 0, 3) + + hindi_digit_graph + + pynutil.insert("\" ") + ) + self.credit_card_last_digits_english = ( + pynutil.insert("number_part: \"") + + pynini.closure(english_digit_graph + delete_space, 0, 3) + + english_digit_graph + + 
pynutil.insert("\" ") + ) + + delete_plus = pynini.union( + pynutil.delete("प्लस") | pynutil.delete("plus") | pynutil.delete("Plus") | pynutil.delete("PLUS") + ) + + delete_zero = pynini.union( + pynutil.delete("शून्य") | pynutil.delete("zero") | pynutil.delete("Zero") | pynutil.delete("ZERO") + ) + + graph_number_with_hindi_digit = ( + delete_plus + delete_space + self.country_code + delete_space + self.hindi_digit + ) + graph_number_with_english_digit = delete_plus + delete_space + self.country_code + self.english_digit + + graph_landline_with_extension = delete_zero + delete_space + self.city_extension + delete_space + self.landline + + graph_pincode = self.pincode_in_hindi | self.pincode_in_english + + graph_credit_card_last_digits = self.credit_card_last_digits_hindi | self.credit_card_last_digits_english + + graph = ( + graph_number_with_hindi_digit + | graph_number_with_english_digit + | graph_landline_with_extension + | graph_pincode + | graph_credit_card_last_digits + ) + + final_graph = self.add_tokens(graph) + self.fst = final_graph diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py index a5a371d90..62554bd14 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py @@ -33,6 +33,7 @@ from nemo_text_processing.inverse_text_normalization.hi.taggers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.hi.taggers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.hi.taggers.punctuation import PunctuationFst +from nemo_text_processing.inverse_text_normalization.hi.taggers.telephone import TelephoneFst from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst from 
nemo_text_processing.inverse_text_normalization.hi.taggers.whitelist import WhiteListFst from nemo_text_processing.inverse_text_normalization.hi.taggers.word import WordFst @@ -82,6 +83,8 @@ def __init__( measure_graph = measure.fst money = MoneyFst(cardinal, decimal) money_graph = money.fst + telephone = TelephoneFst(cardinal) + telephone_graph = telephone.fst punct_graph = PunctuationFst().fst whitelist_graph = WhiteListFst().fst word_graph = WordFst().fst @@ -95,6 +98,7 @@ def __init__( | pynutil.add_weight(time_graph, 1.1) | pynutil.add_weight(measure_graph, 1.1) | pynutil.add_weight(money_graph, 1.1) + | pynutil.add_weight(telephone_graph, 1.1) | pynutil.add_weight(word_graph, 100) | pynutil.add_weight(whitelist_graph, 1.01) ) diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py index 5442777da..eacfb5765 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py @@ -61,22 +61,45 @@ def __init__(self): + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") ) - graph_fy = period + delete_space + year + graph_fy = year + graph_fy |= period + delete_space + year + + # century + graph_century = year + delete_extra_space + period + # month (day) year graph_mdy = month + delete_extra_space + day + pynutil.insert(",") + delete_extra_space + year # (day) month year graph_dmy = day + delete_extra_space + month + pynutil.insert(",") + delete_extra_space + year + # day month year century + graph_dmyc = ( + day + + delete_extra_space + + month + + pynutil.insert(",") + + delete_extra_space + + year + + delete_extra_space + + period + ) + # month year graph_my = month + pynini.closure(delete_extra_space + year, 0, 1) + # month year century + graph_myc = month + pynutil.insert(",") + delete_extra_space + year + delete_extra_space + period + # month day graph_md = 
month + pynini.closure(delete_extra_space + day, 0, 1) # day month graph_dm = day + pynini.closure(delete_extra_space + month, 0, 1) + # year range + graph_year_range = year + optional_preserve_order = pynini.closure( pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space | pynutil.delete("field_order:") @@ -88,7 +111,18 @@ def __init__(self): ) final_graph = ( - (graph_fy | graph_mdy | graph_dmy | graph_my | graph_md | graph_dm) + ( + graph_fy + | graph_mdy + | graph_dmy + | graph_my + | graph_md + | graph_dm + | graph_century + | graph_dmyc + | graph_myc + | graph_year_range + ) + delete_space + optional_preserve_order ) diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py new file mode 100644 index 000000000..3f4b4de1f --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py @@ -0,0 +1,55 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2025 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + + +class TelephoneFst(GraphFst): + """ + Finite state transducer for verbalizing telephone, e.g. 
+ telephone { number_part: "123-123-5678" } + -> 123-123-5678 + """ + + def __init__(self, cardinal: GraphFst): + super().__init__(name="telephone", kind="verbalize") + + number_part = pynutil.delete("number_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + optional_country_code = pynini.closure( + pynutil.delete("country_code: \"") + + pynutil.insert("+") + + delete_space + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + + pynini.accep(" "), + 0, + 1, + ) + optional_city_code = pynini.closure( + pynutil.delete("extension: \"") + + pynutil.insert("०") + + delete_space + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + + pynini.accep(" "), + 0, + 1, + ) + delete_tokens = self.delete_tokens(optional_country_code + number_part) + delete_tokens |= self.delete_tokens(optional_city_code + number_part) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py index d88bd25d9..165fe7a7e 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py @@ -21,6 +21,7 @@ from nemo_text_processing.inverse_text_normalization.hi.verbalizers.measure import MeasureFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.ordinal import OrdinalFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.telephone import TelephoneFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.time import TimeFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.word import WordFst @@ -45,6 +46,7 @@ def __init__(self): time_graph = TimeFst().fst 
measure_graph = MeasureFst(cardinal, decimal).fst money_graph = MoneyFst(cardinal, decimal).fst + telephone_graph = TelephoneFst(cardinal).fst word_graph = WordFst().fst whitelist_graph = WhiteListFst().fst @@ -59,5 +61,6 @@ def __init__(self): | time_graph | measure_graph | money_graph + | telephone_graph ) self.fst = graph diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt index bdc450fdd..6d570a9c5 100644 --- a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt @@ -22,4 +22,14 @@ सत्ताईस जुलाई दो हज़ार ग्यारह~२७ जुलाई, २०११ जुलाई सत्ताईस~जुलाई २७ वर्ष दो हज़ार उन्नीस~वर्ष २०१९ -सन उन्नीस सौ नब्बे~सन १९९० \ No newline at end of file +सन उन्नीस सौ नब्बे~सन १९९० +उन्नीस सौ नब्बे से उन्नीस सौ इक्यानबे~१९९०-१९९१ +दो हज़ार पाँच से दो हज़ार उन्नीस~२००५-२०१९ +दो हज़ार पाँच से उन्नीस~२००५-१९ +चौंतीस सौ ईसा पूर्व~३४०० ई.पू. +उन्नीस सौ बीस ईस्वी~१९२० ई. +पच्चीस जनवरी अठारह सौ तिरेपन ईसवी~२५ जनवरी, १८५३ ई. +इकत्तीस मई उन्नीस सौ नब्बे ईसवी~३१ मई, १९९० ई. +पच्चीस ईसा पूर्व~२५ ई.पू. 
+मार्च की दो~मार्च २ +फ़रवरी की बीस~फ़रवरी २० diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt new file mode 100644 index 000000000..0c51d8df0 --- /dev/null +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt @@ -0,0 +1,37 @@ +प्लस इक्यानवे nine four one one one two three four one two~+९१ ९४१११२३४१२ +प्लस इक्यानवे नौ आठ सात छह पांच चार तीन दो एक शून्य~+९१ ९८७६५४३२१० +plus nine eight nine four one one one two three four zero one~+९८ ९४१११२३४०१ +प्लस नौ एक नौ आठ सात छह पांच चार तीन दो एक शून्य~+९१ ९८७६५४३२१० +plus eleven nine four one one one two three~+११ ९४१११२३ +zero eight zero two nine four one one one two~०८० २९४१११२ +शून्य आठ शून्य दो नौ चार एक एक एक दो~०८० २९४१११२ +zero four zero two seven eight one eight three nine~०४० २७८१८३९ +शून्य चार शून्य दो सात आठ एक आठ तीन नौ~०४० २७८१८३९ +शून्य सात नौ एक नौ आठ सात छह पांच चार~०७९ १९८७६५४ +प्लस नौ एक नौ तीन आठ दो सात एक चार छह पांच शून्य~+९१ ९३८२७१४६५० +प्लस नौ एक नौ शून्य पांच एक तीन चार आठ दो सात छह~+९१ ९०५१३४८२७६ +प्लस नौ एक नौ चार तीन सात दो शून्य पांच छह एक आठ~+९१ ९४३७२०५६१८ +PLUS ninety one nine three eight two seven one four six five zero~+९१ ९३८२७१४६५० +plus nine one nine zero five one three four eight two seven six~+९१ ९०५१३४८२७६ +plus ninety one nine four three seven two zero five six one eight~+९१ ९४३७२०५६१८ +ZERO seven three चार पाँच छह सात आठ नौ शून्य~०७३ ४५६७८९० +शून्य चार शून्य पाँच चार एक दो सात तीन आठ~०४० ५४१२७३८ +ZERO seven three four five six seven eight nine zero~०७३ ४५६७८९० +zero two eight seven six five four three two seven~०२८ ७६५४३२७ +PLUS eighty one nine seven four seven two zero zero one one eight~+८१ ९७४७२००११८ +zero eight zero two two nine four one one one~०८० २२९४१११ +शून्य सात नौ एक नौ आठ सात छह पांच चार~०७९ १९८७६५४ +zero eight zero nine two two nine four one one one~०८०९ २२९४१११ +शून्य सात नौ नौ एक नौ आठ सात छह 
पांच चार~०७९९ १९८७६५४ +zero three one nine two two two nine four one one one~०३१९२ २२९४१११ +शून्य सात नौ एक एक एक नौ आठ सात छह पांच चार~०७९११ १९८७६५४ +एक एक शून्य शून्य सात शून्य दिल्ली के वसंत कुंज का पिनकोड है~११००७० दिल्ली के वसंत कुंज का पिनकोड है +बंगलौर के बैयापानहली का पिनकोड पाँच छह शून्य शून्य तीन आठ है~बंगलौर के बैयापानहली का पिनकोड ५६००३८ है +दिल्ली के वसंत कुंज का पिनकोड one one zero zero seven zero है~दिल्ली के वसंत कुंज का पिनकोड ११००७० है +five six zero zero three eight बंगलौर के बैयापानहली का पिनकोड है~५६००३८ बंगलौर के बैयापानहली का पिनकोड है +मेरे क्रेडिट कार्ड के आखिरी डिजिट शून्य शून्य तीन आठ हैं~मेरे क्रेडिट कार्ड के आखिरी डिजिट ००३८ हैं +क्रेडिट कार्ड के आखिरी डिजिट four three seven two हैं~क्रेडिट कार्ड के आखिरी डिजिट ४३७२ हैं +दिल्ली के वसंत कुंज का पिनकोड one one zero zero seven zero है~दिल्ली के वसंत कुंज का पिनकोड ११००७० है +five six zero zero three eight बंगलौर के बैयापानहली का पिनकोड है~५६००३८ बंगलौर के बैयापानहली का पिनकोड है +मेरे क्रेडिट कार्ड के आखिरी डिजिट शून्य शून्य तीन आठ हैं~मेरे क्रेडिट कार्ड के आखिरी डिजिट ००३८ हैं +क्रेडिट कार्ड के आखिरी डिजिट four three seven two हैं~क्रेडिट कार्ड के आखिरी डिजिट ४३७२ हैं diff --git a/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh index aec7299d5..a365a834d 100644 --- a/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh @@ -63,6 +63,11 @@ testITNMoney() { runtest $input } +testITNTelephone() { + input=$PROJECT_DIR/hi/data_inverse_text_normalization/test_cases_telephone.txt + runtest $input +} + testITNWord() { input=$PROJECT_DIR/hi/data_inverse_text_normalization/test_cases_word.txt runtest $input diff --git a/tests/nemo_text_processing/hi/test_telephone.py b/tests/nemo_text_processing/hi/test_telephone.py new file mode 100644 index 000000000..7e43f7e82 --- /dev/null +++ 
b/tests/nemo_text_processing/hi/test_telephone.py @@ -0,0 +1,31 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestTelephone: + inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_telephone.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred.strip() == expected.strip() From 10776d8c5161d692f7bcabf549e113d00ce87a68 Mon Sep 17 00:00:00 2001 From: Anand Joseph Date: Thu, 3 Apr 2025 15:56:18 +0530 Subject: [PATCH 2/4] Add missing __init__.py file Signed-off-by: Anand Joseph --- .../hi/data/telephone/__init__.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/telephone/__init__.py diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/telephone/__init__.py b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/__init__.py new file mode 100644 index 000000000..d9155f923 --- /dev/null +++ 
b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. From c933a2661d8385fbd7bd22a0aa81045c1616e53d Mon Sep 17 00:00:00 2001 From: tarushi2k2 Date: Tue, 12 Aug 2025 02:16:49 +0530 Subject: [PATCH 3/4] Hindi 2.0: Quarterly Measures, Fraction Exceptions, Changes to Date (#306) * Addition of whitelist and word classes Signed-off-by: Tarushi V * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Updation of Jenkins date Signed-off-by: Tarushi V * Cleanup Signed-off-by: Tarushi V * Updation Signed-off-by: Tarushi V * Updation Signed-off-by: Tarushi V * Hindi 2.0 Signed-off-by: Tarushi V * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Tarushi V Signed-off-by: tarushi2k2 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- Jenkinsfile | 2 +- .../hi/data/date/century.tsv | 2 + .../hi/data/date/date_days.tsv | 34 --- .../hi/data/measure/measurements.tsv | 10 +- .../hi/data/money/currency.tsv | 3 +- .../hi/data/numbers/paune.tsv | 231 ++++++++++++++++++ .../hi/data/numbers/teens_and_ties.tsv | 3 +- .../hi/data/time/hour.tsv | 27 -- .../hi/data/time/hour_for_paune.tsv | 15 ++ .../hi/data/time/minute_and_second.tsv | 128 ---------- 
.../hi/data/whitelist/whitelist.tsv | 8 +- .../hi/graph_utils.py | 5 + .../hi/taggers/cardinal.py | 93 ++++++- .../hi/taggers/date.py | 22 +- .../hi/taggers/fraction.py | 100 +++++++- .../hi/taggers/measure.py | 78 +++++- .../hi/taggers/money.py | 190 +++++++++++++- .../hi/taggers/ordinal.py | 3 +- .../hi/taggers/time.py | 90 ++++++- .../hi/taggers/tokenize_and_classify.py | 4 +- .../hi/verbalizers/date.py | 23 +- .../hi/verbalizers/fraction.py | 1 + .../hi/verbalizers/measure.py | 10 + .../hi/verbalizers/ordinal.py | 3 +- .../hi/verbalizers/verbalize.py | 5 +- .../text_normalization/data_loader_utils.py | 4 +- .../test_cases_cardinal.txt | 10 +- .../test_cases_date.txt | 7 + .../test_cases_decimal.txt | 8 +- .../test_cases_fraction.txt | 23 +- .../test_cases_measure.txt | 18 +- .../test_cases_money.txt | 28 ++- .../test_cases_time.txt | 10 +- .../test_cases_whitelist.txt | 4 - 34 files changed, 957 insertions(+), 245 deletions(-) delete mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/date/date_days.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/numbers/paune.tsv delete mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/time/hour.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/time/hour_for_paune.tsv delete mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/time/minute_and_second.tsv diff --git a/Jenkinsfile b/Jenkinsfile index 82a0a4799..2bedb7922 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-03-25-1' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-01-25-1' 
DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv index da69e23eb..9369023e0 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv +++ b/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv @@ -1,3 +1,5 @@ ई.पू. ईसा पूर्व ई. ईस्वी ई. ईसवी +वर्ष पूर्व वर्ष पूर्व +शताब्दी शताब्दी \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/date/date_days.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/date/date_days.tsv deleted file mode 100644 index 41201ae18..000000000 --- a/nemo_text_processing/inverse_text_normalization/hi/data/date/date_days.tsv +++ /dev/null @@ -1,34 +0,0 @@ -१ एक -२ दो -३ तीन -४ चार -५ पाँच -६ छः -६ छ: -६ छह -६ छे -७ सात -८ आठ -९ नौ -१० दस -११ ग्यारह -१२ बारह -१३ तेरह -१४ चौदह -१५ पन्द्रह -१६ सोलह -१७ सत्रह -१८ अठारह -१९ उन्नीस -२० बीस -२१ इक्कीस -२२ बाईस -२३ तेईस -२४ चौबीस -२५ पच्चीस -२६ छब्बीस -२७ सत्ताईस -२८ अट्ठाईस -२९ उनतीस -३० तीस -३१ इकतीस diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/measure/measurements.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/measure/measurements.tsv index a1ab32da0..d472e15df 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/data/measure/measurements.tsv +++ b/nemo_text_processing/inverse_text_normalization/hi/data/measure/measurements.tsv @@ -263,6 +263,7 @@ yr सालों yr वर्ष yr वर्षों hp हॉर्स पावर +hp हॉर्सपॉवर hp हॉर्सपावर hp अश्वशक्ति hp अश्वशक्त @@ -284,4 +285,11 @@ mi/s मील प्रति सेकेंड mi/h मील प्रति घंटा mi/h मील प्रति घंटे mi/m मील प्रति मिनट -₹/ac रुपए प्रति एकड़ \ No newline at end of file +₹/ac रुपए प्रति एकड़ +w हफ़्ते +w हफ़्ता +w सप्ताह +सदियां सदियां +सदियाँ सदियाँ +सदियों सदियों +सदी सदी diff --git 
a/nemo_text_processing/inverse_text_normalization/hi/data/money/currency.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/money/currency.tsv index 0ca503bb1..3ee478688 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/data/money/currency.tsv +++ b/nemo_text_processing/inverse_text_normalization/hi/data/money/currency.tsv @@ -22,7 +22,6 @@ km बोस्निया और हर्जेगोविना का म p बोत्सवाना पुला r$ ब्राजीलियाई रियाल £ ब्रिटिश पाउंड -£ पाउंड b$ ब्रुनेई डॉलर лв बुल्गारियाई लेव fbu बुरुंडी फ्रैंक @@ -179,4 +178,4 @@ bs. वेनेजुएलन बोलिवार ₺ लीरा ₦ नाइरा ¢ सेंट्स -¢ सेंट \ No newline at end of file +¢ सेंट diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/numbers/paune.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/numbers/paune.tsv new file mode 100644 index 000000000..f56b256e6 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/numbers/paune.tsv @@ -0,0 +1,231 @@ +१ दो +२ तीन +३ चार +४ पाँच +४ पांच +५ छः +५ छह +५ छे +६ सात +७ आठ +८ नौ +९ दस +१० ग्यारह +११ बारह +१२ तेरह +१३ चौदह +१४ पन्द्रह +१४ पंद्रह +१५ सोलह +१६ सत्रह +१६ सतरह +१७ अठारह +१७ अट्ठारह +१८ उन्नीस +१८ उनीस +१९ बीस +२० इक्कीस +२० इकीस +२० ईकीस +२१ बाईस +२१ बाइस +२२ तेईस +२२ तेइस +२३ चौबीस +२४ पच्चीस +२४ पचीस +२५ छब्बीस +२५ छबीस +२६ सत्ताईस +२६ सत्ताइस +२६ सताईस +२६ सताइस +२७ अट्ठाईस +२७ अट्ठाइस +२७ अठाईस +२७ अठाइस +२८ उनतीस +२८ उन्तीस +२९ तीस +३० इकतीस +३० इकतिस +३० इकत्तीस +३० इकत्तिस +३१ बत्तीस +३१ बत्तिस +३१ बतीस +३१ बतिस +३२ तैंतीस +३२ तैंतिस +३२ तैंत्तीस +३२ तैंत्तिस +३२ तेतीस +३२ तेंतीस +३३ चौंतीस +३३ चौंतिस +३३ चौंत्तीस +३३ चौंत्तिस +३४ पैंतीस +३४ पैंतिस +३४ पैंत्तीस +३४ पैंत्तिस +३५ छत्तीस +३५ छत्तिस +३५ छतीस +३५ छतिस +३६ सैंतीस +३६ सैंतिस +३६ सैंत्तीस +३६ सैंत्तिस +३७ अड़तीस +३७ अड़तिस +३७ अड़त्तीस +३७ अड़त्तिस +३८ उनतालीस +३८ उनतालिस +३८ उनत्तालीस +३८ उनत्तालिस +३८ उन्तालीस +३८ उन्तालिस +३९ चालीस +४० इकतालीस +४० इकतालिस +४० इक्तालीस +४१ बयालीस +४१ बयालिस +४१ ब्यालीस +४२ तैंतालीस +४२ तैंतालिस +४३ चौवालीस +४३ चौवालिस +४३ 
चवालीस +४३ चवालिस +४३ चौंतालीस +४४ पैंतालीस +४४ पैंतालिस +४५ छियालीस +४५ छियालिस +४५ छयालीस +४६ सैंतालीस +४६ सैंतालिस +४६ सैतालिस +४७ अड़तालीस +४७ अड़तालिस +४८ उनचास +४९ पचास +५० इक्यावन +५० इकयावन +५१ बावन +५२ तिरपन +५२ तिरेपन +५३ चौवन +५४ पचपन +५५ छप्पन +५५ छपन +५६ सत्तावन +५६ सतावन +५७ अट्ठावन +५७ अठावन +५८ उनसठ +५८ उनसठ +५९ साठ +६० इकसठ +६१ बासठ +६१ बासट +६२ तिरसठ +६२ तिरेसठ +६३ चौंसठ +६४ पैंसठ +६५ छियासठ +६५ छयासठ +६६ सड़सठ +६७ अड़सठ +६८ उनहत्तर +६८ उनहतर +६९ सत्तर +६९ सतर +७० इकहत्तर +७० इकहतर +७० इक्हत्तर +७० इकत्तर +७१ बहत्तर +७१ बहतर +७२ तिहत्तर +७२ तिहतर +७३ चौहत्तर +७३ चौहतर +७४ पचहत्तर +७४ पचहतर +७४ पिछत्तर +७४ पिछतर +७५ छिहत्तर +७५ छिहतर +७५ छियत्तर +७६ सतहत्तर +७६ सतहतर +७६ सतत्तर +७७ अठहत्तर +७७ अठहतर +७८ उन्यासी +७८ उन्यासि +७८ उनासी +७८ उनासि +७९ अस्सी +७९ अस्सि +८० इक्यासी +८० इक्यासि +८१ बयासी +८१ बयासि +८१ ब्यासी +८१ ब्यासि +८१ बिरासी +८२ तिरासी +८२ तिरासि +८२ तेरासी +८३ चौरासी +८३ चौरासि +८४ पचासी +८४ पचासि +८४ पिचयासी +८४ पिचयासि +८४ पिचासी +८५ छियासी +८५ छियासि +८६ सत्तासी +८६ सत्तासि +८६ सतासी +८६ सतासि +८७ अट्ठासी +८७ अट्ठासि +८७ अठासी +८७ अठासि +८८ नवासी +८८ नवासि +८९ नब्बे +९० इक्यानबे +९० इक्यानवे +९१ बानबे +९१ बानवे +९२ तिरानबे +९२ तिरानवे +९३ चौरानबे +९३ चौरानवे +९४ पंचानबे +९४ पंचानवे +९४ पचानवे +९४ पिचयानवे +९४ पिचयानबे +९४ पिच्यानवे +९४ पिच्यानबे +९५ छियानबे +९५ छियानवे +९६ सत्तानबे +९६ सत्तानवे +९७ अट्ठानबे +९७ अट्ठानवे +९७ अठानवे +९७ अठानबे +९८ निन्यान्बे +९८ निन्यानबे +९८ निन्यानवे +९८ निन्यान्वे diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/numbers/teens_and_ties.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/numbers/teens_and_ties.tsv index 91f656cfd..3968a1320 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/data/numbers/teens_and_ties.tsv +++ b/nemo_text_processing/inverse_text_normalization/hi/data/numbers/teens_and_ties.tsv @@ -9,6 +9,7 @@ १७ सत्रह १७ सतरह १८ अठारह +१८ अठाहर १८ अट्ठारह १९ उन्नीस १९ उनीस @@ -216,4 +217,4 @@ ९९ निन्यान्बे ९९ निन्यानबे ९९ निन्यानवे -९९ 
निन्यान्वे \ No newline at end of file +९९ निन्यान्वे diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/time/hour.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/time/hour.tsv deleted file mode 100644 index 7e0c8628e..000000000 --- a/nemo_text_processing/inverse_text_normalization/hi/data/time/hour.tsv +++ /dev/null @@ -1,27 +0,0 @@ -१ एक -२ दो -३ तीन -४ चार -५ पाँच -६ छः -६ छह -६ छे -७ सात -८ आठ -९ नौ -१० दस -११ ग्यारह -१२ बारह -१३ तेरह -१४ चौदह -१५ पन्द्रह -१५ पंद्रह -१६ सोलह -१७ सत्रह -१८ अठारह -१९ उन्नीस -२० बीस -२१ इक्कीस -२२ बाईस -२३ तेईस -२४ चौबीस \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/time/hour_for_paune.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/time/hour_for_paune.tsv new file mode 100644 index 000000000..8bb4c67ca --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/time/hour_for_paune.tsv @@ -0,0 +1,15 @@ +१२ एक +१ दो +२ तीन +३ चार +४ पाँच +४ पांच +५ छः +५ छह +५ छे +६ सात +७ आठ +८ नौ +९ दस +१० ग्यारह +११ बारह diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/time/minute_and_second.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/time/minute_and_second.tsv deleted file mode 100644 index 2bd8d6c61..000000000 --- a/nemo_text_processing/inverse_text_normalization/hi/data/time/minute_and_second.tsv +++ /dev/null @@ -1,128 +0,0 @@ -०१ एक -०२ दो -०३ तीन -०४ चार -०५ पाँच -०६ छः -०६ छह -०६ छे -०७ सात -०८ आठ -०९ नौ -१० दस -११ ग्यारह -१२ बारह -१३ तेरह -१४ चौदह -१५ पन्द्रह -१५ पंद्रह -१६ सोलह -१७ सत्रह -१८ अठारह -१८ अट्ठारह -१९ उन्नीस -१९ उनीस -२० बीस -२१ इक्कीस -२१ इकीस -२१ ईकीस -२२ बाईस -२२ बाइस -२३ तेईस -२३ तेइस -२४ चौबीस -२५ पच्चीस -२५ पचीस -२६ छब्बीस -२६ छबीस -२७ सत्ताईस -२७ सत्ताइस -२७ सताईस -२७ सताइस -२८ अट्ठाईस -२८ अट्ठाइस -२८ अठाईस -२८ अठाइस -२९ उनतीस -२९ उन्तीस -३० तीस -३१ इकतीस -३१ इकतिस -३१ इकत्तीस -३१ इकत्तिस -३२ बत्तीस -३२ बत्तिस -३२ बतीस -३२ बतिस -३३ तैंतीस -३३ तैंतिस -३३ तैंत्तीस -३३ 
तैंत्तिस -३३ तेतीस -३४ चौंतीस -३४ चौंतिस -३४ चौंत्तीस -३४ चौंत्तिस -३५ पैंतीस -३५ पैंतिस -३५ पैंत्तीस -३५ पैंत्तिस -३६ छत्तीस -३६ छत्तिस -३६ छतीस -३६ छतिस -३७ सैंतीस -३७ सैंतिस -३७ सैंत्तीस -३७ सैंत्तिस -३८ अड़तीस -३८ अड़तिस -३८ अड़त्तीस -३८ अड़त्तिस -३९ उनतालीस -३९ उनतालिस -३९ उनत्तालीस -३९ उनत्तालिस -३९ उन्तालीस -३९ उन्तालिस -४० चालीस -४१ इकतालीस -४१ इकतालिस -४१ इक्तालीस -४२ बयालीस -४२ बयालिस -४२ ब्यालीस -४३ तैंतालीस -४३ तैंतालिस -४४ चौवालीस -४४ चौवालिस -४४ चवालीस -४४ चवालिस -४४ चौंतालीस -४५ पैंतालीस -४५ पैंतालिस -४६ छियालीस -४६ छियालिस -४६ छयालीस -४७ सैंतालीस -४७ सैंतालिस -४७ सैतालिस -४८ अड़तालीस -४८ अड़तालिस -४९ उनचास -५० पचास -५१ इक्यावन -५१ इकयावन -५२ बावन -५३ तिरपन -५३ तिरेपन -५४ चौवन -५५ पचपन -५६ छप्पन -५६ छपन -५७ सत्तावन -५७ सतावन -५८ अट्ठावन -५८ अठावन -५९ उनसठ -६० साठ \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv index f9eb081b9..8cfd0e19f 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv +++ b/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv @@ -1,13 +1,7 @@ १/४ पाव -१/२ आधा -३/४ पौन -१:३० डेढ़ बजे -२:३० ढाई बजे -१.५ डेढ़ -२.५ ढाई कु. कुमारी स्मि. श्रीमती श्री. श्री श्री. श्रीमान मा. मास्टर -डॉ. डॉक्टर \ No newline at end of file +डॉ. 
डॉक्टर diff --git a/nemo_text_processing/inverse_text_normalization/hi/graph_utils.py b/nemo_text_processing/inverse_text_normalization/hi/graph_utils.py index 8454fc139..d8f2a95a0 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/hi/graph_utils.py @@ -32,6 +32,7 @@ graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) NEMO_HI_DIGIT = pynini.union("०", "१", "२", "३", "४", "५", "६", "७", "८", "९").optimize() +DEVANAGARI_DIGIT = ["०", "१", "२", "३", "४", "५", "६", "७", "८", "९"] NEMO_HEX = pynini.union(*string.hexdigits).optimize() NEMO_NON_BREAKING_SPACE = u"\u00A0" @@ -63,6 +64,10 @@ MINUS = pynini.union("ऋणात्मक", "नकारात्मक").optimize() +def integer_to_devanagari(n: int) -> str: + return ''.join(DEVANAGARI_DIGIT[int(d)] for d in str(n)) + + def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']): """ Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name. 
diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/cardinal.py index f1e4da381..0f21f9aed 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/cardinal.py @@ -44,8 +44,14 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert() graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert() graph_teens_and_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")).invert() + graph_paune = pynini.string_file(get_abs_path("data/numbers/paune.tsv")).invert() + self.graph_zero = graph_zero + self.graph_digit = graph_digit + self.graph_single_digit_with_zero = pynutil.insert("०") + graph_digit + self.graph_teens_and_ties = graph_teens_and_ties self.graph_two_digit = graph_teens_and_ties | (pynutil.insert("०") + graph_digit) graph_hundred = pynini.cross("सौ", "") + delete_hundred = pynutil.delete("सौ") delete_thousand = pynutil.delete("हज़ार") | pynutil.delete("हजार") graph_hundred_component = pynini.union(graph_digit + delete_space + graph_hundred, pynutil.insert("०")) graph_hundred_component += delete_space @@ -64,11 +70,93 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): ) graph_hundred_as_thousand += delete_space graph_hundred_as_thousand += self.graph_two_digit | pynutil.insert("००") + graph_hundred_as_thousand |= pynutil.add_weight( + pynutil.delete("साढ़े") + + delete_space + + graph_digit + + pynutil.insert("५००", weight=-0.1) + + delete_space + + delete_thousand, + -0.1, + ) + graph_hundred_as_thousand |= pynutil.add_weight( + pynutil.delete("सवा") + + delete_space + + graph_digit + + pynutil.insert("२५०", weight=-0.1) + + delete_space + + delete_thousand, + -0.1, + ) + graph_hundred_as_thousand |= pynutil.add_weight( + pynutil.delete("पौने") + + 
delete_space + + graph_paune + + pynutil.insert("७५०", weight=-0.1) + + delete_space + + delete_thousand, + -0.1, + ) + graph_hundred_as_thousand |= pynutil.add_weight( + pynini.union(pynutil.delete("डेढ़") | pynutil.delete("डेढ़")) + + delete_space + + pynutil.insert("१५००", weight=-0.1) + + delete_space + + delete_thousand, + -0.1, + ) + graph_hundred_as_thousand |= pynutil.add_weight( + pynutil.delete("ढाई") + + delete_space + + pynutil.insert("२५००", weight=-0.1) + + delete_space + + delete_thousand, + -0.1, + ) - self.graph_hundreds = graph_hundred_component | graph_hundred_as_thousand + graph_in_hundreds = pynutil.add_weight( + pynutil.delete("साढ़े") + + delete_space + + (graph_digit | self.graph_two_digit) + + pynutil.insert("५०", weight=-0.1) + + delete_space + + delete_hundred, + -0.1, + ) + graph_in_hundreds |= pynutil.add_weight( + pynutil.delete("सवा") + + delete_space + + (graph_digit | self.graph_two_digit) + + pynutil.insert("२५", weight=-0.1) + + delete_space + + delete_hundred, + -0.1, + ) + graph_in_hundreds |= pynutil.add_weight( + pynutil.delete("पौने") + + delete_space + + graph_paune + + pynutil.insert("७५", weight=-0.1) + + delete_space + + delete_hundred, + -0.1, + ) + graph_in_hundreds |= pynutil.add_weight( + pynini.union(pynutil.delete("डेढ़") | pynutil.delete("डेढ़")) + + delete_space + + pynutil.insert("१५०", weight=-0.1) + + delete_space + + delete_hundred, + -0.1, + ) + graph_in_hundreds |= pynutil.add_weight( + pynutil.delete("ढाई") + delete_space + pynutil.insert("२५०", weight=-0.1) + delete_space + delete_hundred, + -0.1, + ) + self.graph_hundreds = graph_hundred_component | graph_hundred_as_thousand | graph_in_hundreds graph_teens_and_ties_component = pynini.union( - graph_teens_and_ties | pynutil.insert("00") + delete_space + (graph_digit | pynutil.insert("0")), + graph_teens_and_ties | pynutil.insert("००") + delete_space + (graph_digit | pynutil.insert("०")), ) graph_ties_component_at_least_one_none_zero_digit = 
self.graph_two_digit @ ( pynini.closure(NEMO_HI_DIGIT) + pynini.closure(NEMO_HI_DIGIT) @@ -132,6 +220,7 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): graph_no_prefix = pynutil.add_weight( pynini.cross("सौ", "१००") | pynini.cross("हज़ार", "१०००") + | pynini.cross("हजार", "१०००") | pynini.cross("लाख", "१०००००") | pynini.cross("करोड़", "१०००००००"), 2, diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py index 6859f0834..c03d8c718 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py @@ -16,11 +16,13 @@ from pynini.lib import pynutil from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( + DEVANAGARI_DIGIT, NEMO_HI_DIGIT, GraphFst, delete_extra_space, delete_space, insert_space, + integer_to_devanagari, ) from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path @@ -35,16 +37,22 @@ class DateFst(GraphFst): date: DateFst """ - def __init__(self, cardinal: GraphFst): + def __init__(self, cardinal: GraphFst, ordinal: GraphFst): super().__init__(name="date", kind="classify") graph_year = pynutil.add_weight( pynini.compose(cardinal.graph_no_exception, pynini.closure(NEMO_HI_DIGIT, 1, 4)), 0.03 ) + cardinal_graph = cardinal.graph_no_exception month_graph = pynini.string_file(get_abs_path("data/date/months.tsv")) - graph_date_days = pynini.string_file(get_abs_path("data/date/date_days.tsv")).invert() + + graph_date_days = cardinal.graph_digit | cardinal.graph_teens_and_ties + date_days = pynini.union(*[integer_to_devanagari(i) for i in range(1, 32)]).optimize() + graph_date_days = graph_date_days @ date_days + graph_century = pynini.string_file(get_abs_path("data/date/century.tsv")).invert() + graph_morph_features = pynini.string_file(get_abs_path("data/ordinals/morph_features.tsv")) self.day = pynutil.insert("day: \"") + 
graph_date_days + pynutil.insert("\" ") self.month = pynutil.insert("month: \"") + month_graph + pynutil.insert("\" ") @@ -60,7 +68,10 @@ def __init__(self, cardinal: GraphFst): + pynutil.insert("\" ") ) self.century = pynutil.insert("text: \"") + graph_century + pynutil.insert("\" ") - insert_comma = pynutil.insert(", ") + self.ordinal_century = pynutil.insert("era: \"") + cardinal_graph + pynutil.insert("\" ") + self.morpho_graph = ( + pynutil.insert("morphosyntactic_features: \"") + graph_morph_features + pynutil.insert("\"") + ) graph_day_month = self.day + delete_space + self.month graph_month_day = self.month + delete_space + self.day @@ -76,6 +87,9 @@ def __init__(self, cardinal: GraphFst): ) graph_month_year_century = self.month + delete_space + self.year + delete_space + self.century graph_year_range = self.year_range + graph_year_range_century = self.year_range + delete_space + self.century + + graph_ordinal_century = self.ordinal_century + self.morpho_graph + delete_extra_space + self.century graph_date_exceptions = self.month + delete_space + pynutil.delete("की") + delete_space + self.day graph_date_exceptions += pynutil.insert("preserve_order: true") @@ -91,6 +105,8 @@ def __init__(self, cardinal: GraphFst): | graph_day_month_year_century | graph_month_year_century | graph_year_range + | graph_year_range_century + | graph_ordinal_century | graph_date_exceptions ) final_graph = self.add_tokens(graph) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/fraction.py index 56b2c63e3..cf490e5b9 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/fraction.py @@ -16,13 +16,14 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path -from nemo_text_processing.text_normalization.en.graph_utils import ( 
+from nemo_text_processing.inverse_text_normalization.hi.utils import apply_fst, get_abs_path +from nemo_text_processing.text_normalization.en.utils import load_labels +from nemo_text_processing.text_normalization.hi.graph_utils import ( INPUT_CASED, INPUT_LOWER_CASED, MIN_NEG_WEIGHT, MINUS, - NEMO_DIGIT, + NEMO_HI_DIGIT, NEMO_SIGMA, TO_LOWER, GraphFst, @@ -30,7 +31,6 @@ delete_extra_space, delete_space, ) -from nemo_text_processing.text_normalization.en.utils import load_labels class FractionFst(GraphFst): @@ -59,9 +59,99 @@ def __init__(self, cardinal: GraphFst): numerator = pynutil.insert("numerator: \"") + graph_cardinal + pynutil.insert("\"") denominator = pynutil.insert(" denominator: \"") + graph_cardinal + pynutil.insert("\"") + graph_fraction = numerator + delete_bata + denominator + graph_mixed_fraction = integer + delete_extra_space + pynutil.delete("सही") + delete_space + graph_fraction + + graph_saade = pynutil.add_weight( + pynutil.delete("साढ़े") + + delete_space + + integer + + pynutil.insert("numerator: \"१\"") + + delete_space + + pynutil.insert(" denominator: \"२\""), + -0.01, + ) + graph_sava = pynutil.add_weight( + pynutil.delete("सवा") + + delete_space + + integer + + pynutil.insert("numerator: \"१\"") + + delete_space + + pynutil.insert(" denominator: \"४\""), + -0.001, + ) + graph_paune = pynutil.add_weight( + pynutil.delete("पौने") + + delete_space + + integer + + pynutil.insert("numerator: \"३\"") + + delete_space + + pynutil.insert(" denominator: \"४\""), + -0.01, + ) + graph_dedh = pynutil.add_weight( + pynini.union(pynutil.delete("डेढ़") | pynutil.delete("डेढ़")) + + delete_space + + pynutil.insert("integer_part: \"१\"") + + pynutil.insert(" numerator: \"१\"") + + delete_space + + pynutil.insert(" denominator: \"२\""), + -0.01, + ) + graph_dhaai = pynutil.add_weight( + pynutil.delete("ढाई") + + delete_space + + pynutil.insert("integer_part: \"२\"") + + pynutil.insert(" numerator: \"१\"") + + delete_space + + pynutil.insert(" 
denominator: \"२\""), + -0.1, + ) + + graph_aadha_and_saade_only = ( + pynini.union(pynutil.delete("आधा") | pynutil.delete("साढ़े")) + + delete_space + + pynutil.insert(" numerator: \"१\"") + + delete_space + + pynutil.insert(" denominator: \"२\"") + ) + graph_sava_only = ( + pynutil.delete("सवा") + + delete_space + + pynutil.insert(" numerator: \"१\"") + + delete_space + + pynutil.insert(" denominator: \"४\"") + ) + graph_paune_only = ( + pynini.union(pynutil.delete("पौन") | pynutil.delete("पौना")) + + delete_space + + pynutil.insert("numerator: \"३\"") + + delete_space + + pynutil.insert(" denominator: \"४\"") + ) + + graph_tihaai = ( + numerator + delete_space + pynutil.delete("तिहाई") + delete_space + pynutil.insert(" denominator: \"३\"") + ) + graph_chauthaai = ( + numerator + delete_space + pynutil.delete("चौथाई") + delete_space + pynutil.insert(" denominator: \"४\"") + ) + + graph_quarterly_exceptions = ( + graph_saade + | graph_sava + | graph_paune + | graph_dedh + | graph_dhaai + | graph_aadha_and_saade_only + | graph_sava_only + | graph_paune_only + | graph_tihaai + | graph_chauthaai + ) - graph = graph_fraction + graph = graph_fraction | graph_mixed_fraction | graph_quarterly_exceptions self.graph = graph.optimize() self.final_graph_wo_negative = graph optional_graph_negative = pynini.closure( diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/measure.py index d7e9ba562..e3d985a0d 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/measure.py @@ -23,7 +23,7 @@ delete_space, insert_space, ) -from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path +from nemo_text_processing.inverse_text_normalization.hi.utils import apply_fst, get_abs_path class MeasureFst(GraphFst): @@ -49,7 +49,60 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): ) 
measurements_graph = pynini.string_file(get_abs_path("data/measure/measurements.tsv")).invert() + paune_graph = pynini.string_file(get_abs_path("data/numbers/paune.tsv")).invert() + self.measurements = pynutil.insert("units: \"") + measurements_graph + pynutil.insert("\" ") + graph_integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") + graph_integer_paune = pynutil.insert("integer_part: \"") + paune_graph + pynutil.insert("\"") + + graph_saade_single_digit = pynutil.add_weight( + pynutil.delete("साढ़े") + + delete_space + + graph_integer + + delete_space + + pynutil.insert(" fractional_part: \"५\""), + 0.1, + ) + graph_sava_single_digit = pynutil.add_weight( + pynutil.delete("सवा") + + delete_space + + graph_integer + + delete_space + + pynutil.insert(" fractional_part: \"२५\""), + 0.1, + ) + graph_paune_single_digit = pynutil.add_weight( + pynutil.delete("पौने") + + delete_space + + graph_integer_paune + + delete_space + + pynutil.insert(" fractional_part: \"७५\""), + 1, + ) + graph_dedh_single_digit = pynutil.add_weight( + pynini.union(pynutil.delete("डेढ़") | pynutil.delete("डेढ़")) + + delete_space + + pynutil.insert("integer_part: \"१\"") + + delete_space + + pynutil.insert(" fractional_part: \"५\""), + 0.1, + ) + graph_dhaai_single_digit = pynutil.add_weight( + pynutil.delete("ढाई") + + delete_space + + pynutil.insert("integer_part: \"२\"") + + delete_space + + pynutil.insert(" fractional_part: \"५\""), + 1, + ) + + graph_exceptions = ( + graph_saade_single_digit + | graph_sava_single_digit + | graph_paune_single_digit + | graph_dedh_single_digit + | graph_dhaai_single_digit + ) graph_measurements = ( pynutil.insert("decimal { ") @@ -69,8 +122,29 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + delete_extra_space + self.measurements ) + graph_quarterly_measurements = ( + pynutil.insert("decimal { ") + + optional_graph_negative + + graph_exceptions + + pynutil.insert(" }") + + delete_extra_space + + 
self.measurements + ) + graph_exception_bai = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + pynutil.insert("integer: \"") + + cardinal_graph + + delete_space + + pynini.cross("बाई", "x") + + delete_space + + cardinal_graph + + pynutil.insert("\"") + + pynutil.insert(" }") + + pynini.closure(delete_extra_space + self.measurements) + ) - graph = graph_measurements + graph = graph_measurements | graph_quarterly_measurements | graph_exception_bai self.graph = graph.optimize() final_graph = self.add_tokens(graph) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/money.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/money.py index 7fa59ee26..e79c9d0b3 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/money.py @@ -21,7 +21,7 @@ delete_space, insert_space, ) -from nemo_text_processing.inverse_text_normalization.hi.utils import apply_fst, get_abs_path +from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path class MoneyFst(GraphFst): @@ -40,14 +40,22 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): super().__init__(name="money", kind="classify") cardinal_graph = cardinal.graph_no_exception + cardinal_single_and_double_digit_graph = cardinal.graph_digit | cardinal.graph_teens_and_ties decimal_graph = decimal.final_graph_wo_negative currency_graph = pynini.string_file(get_abs_path("data/money/currency.tsv")).invert() + paune_graph = pynini.string_file(get_abs_path("data/numbers/paune.tsv")).invert() self.integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") + self.integer_quarterly_measures = pynutil.insert("integer_part: \"") + cardinal_single_and_double_digit_graph + self.integer_paune = pynutil.insert("integer_part: \"") + paune_graph self.paise = pynutil.insert("fractional_part: \"") + cardinal_graph + pynutil.insert("\"") self.fraction = decimal_graph 
self.currency = pynutil.insert("currency: \"") + currency_graph + pynutil.insert("\" ") aur = pynutil.delete("और") + delete_hundred = pynutil.delete("सौ") + delete_lakh = pynutil.delete("लाख") + delete_hazar = pynutil.delete("हजार") | pynutil.delete("हज़ार") + delete_crore = pynutil.delete("करोड़") | pynutil.delete("करोड़") graph_currency_decimal = self.fraction + delete_extra_space + self.currency graph_currency_cardinal = self.integer + delete_extra_space + self.currency @@ -60,8 +68,186 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + delete_extra_space + pynutil.delete(currency_graph) ) + # cases for saade,sava with teens and ties + graph_saade_teens_ties = ( + pynutil.delete("साढ़े") + + delete_space + + self.integer_quarterly_measures + + pynutil.insert("\"") + + delete_space + + pynutil.insert(" fractional_part: \"५०\"") + + delete_extra_space + + self.currency + ) + graph_sava_teens_ties = ( + pynutil.delete("सवा") + + delete_space + + self.integer_quarterly_measures + + pynutil.insert("\"") + + delete_space + + pynutil.insert(" fractional_part: \"२५\"") + + delete_extra_space + + self.currency + ) + graph_dedh = ( + pynini.union(pynutil.delete("डेढ़") | pynutil.delete("डेढ़")) + + delete_space + + pynutil.insert("integer_part: \"१\"") + + delete_space + + pynutil.insert(" fractional_part: \"५०\"") + + delete_extra_space + + self.currency + ) + graph_dhaai = ( + pynutil.delete("ढाई") + + delete_space + + pynutil.insert("integer_part: \"२\"") + + delete_space + + pynutil.insert(" fractional_part: \"५०\"") + + delete_extra_space + + self.currency + ) + + graph_exceptions_teens_ties = graph_saade_teens_ties | graph_sava_teens_ties | graph_dedh | graph_dhaai + + # cases for saade,sava,paune,dedh and dhaai with hundreds and thousands + graph_exceptions = self.integer + delete_extra_space + self.currency + + # exceptions with lakhs + graph_saade_lakh = pynutil.add_weight( + pynutil.delete("साढ़े") + + delete_space + + 
self.integer_quarterly_measures + + delete_space + + pynutil.insert("५००००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_lakh + + delete_extra_space + + self.currency, + 0.01, + ) + graph_sava_lakh = pynutil.add_weight( + pynutil.delete("सवा") + + delete_space + + self.integer_quarterly_measures + + delete_space + + pynutil.insert("२५०००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_lakh + + delete_extra_space + + self.currency, + 0.01, + ) + graph_paune_lakh = ( + pynutil.delete("पौने") + + delete_space + + self.integer_paune + + delete_space + + pynutil.insert("७५०००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_lakh + + delete_extra_space + + self.currency + ) + graph_dedh_lakh = ( + pynini.union(pynutil.delete("डेढ़") | pynutil.delete("डेढ़")) + + delete_space + + pynutil.insert("integer_part: \"") + + pynutil.insert("१५००००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_lakh + + delete_extra_space + + self.currency + ) + graph_dhaai_lakh = ( + pynutil.delete("ढाई") + + delete_space + + pynutil.insert("integer_part: \"") + + pynutil.insert("२५००००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_lakh + + delete_extra_space + + self.currency + ) + + graph_exceptions_lakhs = ( + graph_saade_lakh | graph_sava_lakh | graph_paune_lakh | graph_dedh_lakh | graph_dhaai_lakh + ) + + # exceptions with crores + graph_saade_crore = ( + pynutil.delete("साढ़े") + + delete_space + + self.integer_quarterly_measures + + delete_space + + pynutil.insert("५००००००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_crore + + delete_extra_space + + self.currency + ) + graph_sava_crore = ( + pynutil.delete("सवा") + + delete_space + + self.integer_quarterly_measures + + delete_space + + pynutil.insert("२५०००००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_crore + + delete_extra_space + + self.currency + ) + graph_paune_crore = ( + 
pynutil.delete("पौने") + + delete_space + + self.integer_paune + + delete_space + + pynutil.insert("७५०००००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_crore + + delete_extra_space + + self.currency + ) + graph_dhaai_crore = ( + pynutil.delete("ढाई") + + delete_space + + pynutil.insert("integer_part: \"") + + pynutil.insert("२५००००००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_crore + + delete_extra_space + + self.currency + ) + graph_dedh_crore = ( + pynini.union(pynutil.delete("डेढ़") | pynutil.delete("डेढ़")) + + delete_space + + pynutil.insert("integer_part: \"") + + pynutil.insert("१५००००००", weight=-0.1) + + pynutil.insert("\"") + + delete_space + + delete_crore + + delete_extra_space + + self.currency + ) + + graph_exceptions_crores = ( + graph_saade_crore | graph_sava_crore | graph_paune_crore | graph_dedh_crore | graph_dhaai_crore + ) + + graph_quarterly_measures = ( + graph_exceptions_teens_ties | graph_exceptions | graph_exceptions_lakhs | graph_exceptions_crores + ) - graph = graph_currency_decimal | graph_currency_cardinal | graph_rupay_and_paisa + graph = graph_currency_decimal | graph_currency_cardinal | graph_rupay_and_paisa | graph_quarterly_measures self.graph = graph.optimize() final_graph = self.add_tokens(graph) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/ordinal.py index d6f4d59ac..473055891 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/ordinal.py @@ -83,5 +83,6 @@ def __init__(self, cardinal: GraphFst): rule = pynini.cdrewrite(morpho_graph, pynini.closure(NEMO_HI_DIGIT), pynini.union("[EOS]", " "), NEMO_SIGMA) final_graph = pynutil.insert("integer: \"") + graph @ rule - final_graph = self.add_tokens(final_graph) + self.final_graph = self.add_tokens(final_graph) + final_graph = self.final_graph 
self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/time.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/time.py index ac539966d..74caaf6db 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/time.py @@ -15,7 +15,14 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.hi.graph_utils import GraphFst, delete_space, insert_space +from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( + DEVANAGARI_DIGIT, + GraphFst, + delete_extra_space, + delete_space, + insert_space, + integer_to_devanagari, +) from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path @@ -29,12 +36,15 @@ class TimeFst(GraphFst): time: TimeFst """ - def __init__(self): + def __init__(self, cardinal: GraphFst): super().__init__(name="time", kind="classify") - hour_graph = pynini.string_file(get_abs_path("data/time/hour.tsv")).invert() - minute_graph = pynini.string_file(get_abs_path("data/time/minute_and_second.tsv")).invert() - second_graph = pynini.string_file(get_abs_path("data/time/minute_and_second.tsv")).invert() + hour_graph = cardinal.graph_digit | cardinal.graph_teens_and_ties + time_hours = pynini.union(*[integer_to_devanagari(i) for i in range(1, 25)]).optimize() + hour_graph = hour_graph @ time_hours + + cardinal_graph = cardinal.graph_single_digit_with_zero | cardinal.graph_teens_and_ties + paune_hour_graph = pynini.string_file(get_abs_path("data/time/hour_for_paune.tsv")).invert() delete_baje = pynini.union( pynutil.delete("बजके") | pynutil.delete("बजकर") | pynutil.delete("बजे") | pynutil.delete("घंटा") @@ -44,8 +54,9 @@ def __init__(self): delete_second = pynutil.delete("सेकंड") self.hour = pynutil.insert("hours: \"") + hour_graph + pynutil.insert("\" ") - self.minute = pynutil.insert("minutes: \"") + minute_graph + pynutil.insert("\" 
") - self.second = pynutil.insert("seconds: \"") + second_graph + pynutil.insert("\" ") + self.paune_hour = pynutil.insert("hours: \"") + paune_hour_graph + pynutil.insert("\" ") + self.minute = pynutil.insert("minutes: \"") + cardinal_graph + pynutil.insert("\" ") + self.second = pynutil.insert("seconds: \"") + cardinal_graph + pynutil.insert("\" ") # hour minute second graph_hms = ( @@ -63,17 +74,20 @@ def __init__(self): ) # hour minute and hour minute without "baje and minat" - graph_hm = ( + graph_hm = pynutil.add_weight( self.hour + delete_space + pynini.closure(delete_baje, 0, 1) + delete_space + self.minute - + pynini.closure(delete_space + delete_minute, 0, 1) + + pynini.closure(delete_space + delete_minute, 0, 1), + 0.01, ) # hour second - graph_hs = self.hour + delete_space + delete_baje + delete_space + self.second + delete_space + delete_second + graph_hs = pynutil.add_weight( + self.hour + delete_space + delete_baje + delete_space + self.second + delete_space + delete_second, 0.01 + ) # minute second graph_ms = ( @@ -83,7 +97,61 @@ def __init__(self): # hour graph_hour = self.hour + delete_space + delete_baje - graph = graph_hms | graph_hm | graph_hs | graph_ms | graph_hour + graph_saade = pynutil.add_weight( + pynutil.delete("साढ़े") + + delete_space + + self.hour + + delete_space + + pynutil.insert(" minutes: \"३०\"") + + delete_space + + pynini.closure(delete_baje), + 0.01, + ) + graph_sava = pynutil.add_weight( + pynutil.delete("सवा") + + delete_space + + self.hour + + delete_space + + pynutil.insert(" minutes: \"१५\"") + + delete_space + + pynini.closure(delete_baje), + 0.01, + ) + graph_paune = pynutil.add_weight( + pynutil.delete("पौने") + + delete_space + + self.paune_hour + + delete_space + + pynutil.insert(" minutes: \"४५\"") + + delete_space + + pynini.closure(delete_baje), + 0.01, + ) + graph_dedh = pynutil.add_weight( + pynini.union(pynutil.delete("डेढ़") | pynutil.delete("डेढ़")) + + delete_space + + delete_baje + + 
pynutil.insert("hours: \"१\"") + + delete_space + + pynutil.insert(" minutes: \"३०\""), + 0.01, + ) + graph_dhaai = pynutil.add_weight( + pynutil.delete("ढाई") + + delete_space + + delete_baje + + pynutil.insert("hours: \"२\"") + + delete_space + + pynutil.insert(" minutes: \"३०\""), + 0.01, + ) + graph_quarterly_measures = ( + graph_dedh + | graph_dhaai + | ((graph_saade | graph_sava | graph_paune) + pynini.closure(delete_space + delete_baje)) + ) + + graph = graph_hms | graph_hm | graph_hs | graph_ms | graph_hour | graph_quarterly_measures self.graph = graph.optimize() final_graph = self.add_tokens(graph) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py index 62554bd14..4ba10de0a 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py @@ -75,9 +75,9 @@ def __init__( decimal_graph = decimal.fst fraction = FractionFst(cardinal) fraction_graph = fraction.fst - date = DateFst(cardinal) + date = DateFst(cardinal, ordinal) date_graph = date.fst - time = TimeFst() + time = TimeFst(cardinal) time_graph = time.fst measure = MeasureFst(cardinal, decimal) measure_graph = measure.fst diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py index eacfb5765..7a5c10c4c 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py @@ -30,7 +30,7 @@ class DateFst(GraphFst): date { day: "५" month: "जनवरी" year: "२०१२" preserve_order: true } -> ५ जनवरी २०१२ """ - def __init__(self): + def __init__(self, cardinal: GraphFst, ordinal: GraphFst): super().__init__(name="date", kind="verbalize") month = ( pynutil.delete("month:") @@ 
-61,6 +61,21 @@ def __init__(self): + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") ) + era = ( + pynutil.delete("era:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + morpho_features = ( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + graph_fy = year graph_fy |= period + delete_space + year @@ -100,6 +115,11 @@ def __init__(self): # year range graph_year_range = year + # ordinal century + graph_ordinal_century = era + delete_space + morpho_features + delete_extra_space + period + + # graph_ordinal_range = graph_ordinal + delete_extra_space + period + optional_preserve_order = pynini.closure( pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space | pynutil.delete("field_order:") @@ -122,6 +142,7 @@ def __init__(self): | graph_dmyc | graph_myc | graph_year_range + | graph_ordinal_century ) + delete_space + optional_preserve_order diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/fraction.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/fraction.py index 0fa7e97bd..45b5832b5 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/fraction.py @@ -16,6 +16,7 @@ import pynini from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.hi.utils import apply_fst from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst, delete_space diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/measure.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/measure.py index 1fc9ba373..d6d8f72f8 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/measure.py +++ 
b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/measure.py @@ -52,7 +52,17 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): graph_decimal = ( pynutil.delete("decimal {") + delete_space + decimal.numbers + delete_space + pynutil.delete("}") ) + graph_exception_bai = ( + pynutil.delete("cardinal {") + + delete_space + + optional_sign + + delete_space + + cardinal.numbers + + delete_space + + pynutil.delete("}") + ) graph = (graph_cardinal | graph_decimal) + delete_space + pynutil.insert(" ") + unit + graph |= graph_exception_bai + pynini.closure(delete_space + pynutil.insert(" ") + unit) delete_tokens = self.delete_tokens(graph) self.decimal = graph_decimal self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/ordinal.py index d6c4e0025..94f280798 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/ordinal.py @@ -40,6 +40,7 @@ def __init__(self): + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") ) - + self.numbers = graph + graph = graph.optimize() delete_tokens = self.delete_tokens(graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py index 165fe7a7e..f1a6c55a3 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py @@ -38,11 +38,12 @@ def __init__(self): super().__init__(name="verbalize", kind="verbalize") cardinal = CardinalFst() cardinal_graph = cardinal.fst - ordinal_graph = OrdinalFst().fst + ordinal = OrdinalFst() + ordinal_graph = ordinal.fst decimal = DecimalFst() decimal_graph = decimal.fst fraction_graph = 
FractionFst().fst - date_graph = DateFst().fst + date_graph = DateFst(cardinal, ordinal).fst time_graph = TimeFst().fst measure_graph = MeasureFst(cardinal, decimal).fst money_graph = MoneyFst(cardinal, decimal).fst diff --git a/nemo_text_processing/text_normalization/data_loader_utils.py b/nemo_text_processing/text_normalization/data_loader_utils.py index 5e7fa1892..6a46230cf 100644 --- a/nemo_text_processing/text_normalization/data_loader_utils.py +++ b/nemo_text_processing/text_normalization/data_loader_utils.py @@ -162,8 +162,8 @@ def training_data_to_tokens( for instance in data: if instance.token_type != EOS_TYPE: if category is None or instance.token_type == category: - result[instance.token_type][0].append(instance.un_normalized) - result[instance.token_type][1].append(instance.normalized) + result[instance.token_type][0].append(unicodedata.normalize(NFC, instance.un_normalized)) + result[instance.token_type][1].append(unicodedata.normalize(NFC, instance.normalized)) return result diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_cardinal.txt index a72ad4183..4a7221675 100644 --- a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_cardinal.txt +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_cardinal.txt @@ -43,4 +43,12 @@ एक अरब बारह करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ~११२२३४५५६७ एक अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ~१०२२३४५५६७ ग्यारह अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ~११०२२३४५५६७ -इक्यावन अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ~५१०२२३४५५६७ \ No newline at end of file +इक्यावन अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ~५१०२२३४५५६७ +सवा सात सौ~७२५ +साढ़े सात सौ~७५० +साढ़े सात हज़ार~७५०० +सवा सात हज़ार~७२५० +डेढ़ सौ~१५० +ढाई सौ~२५० +साढ़े सोलह सौ~१६५० +सवा सोलह सौ~१६२५ diff --git 
a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt index 6d570a9c5..402361d71 100644 --- a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt @@ -33,3 +33,10 @@ पच्चीस ईसा पूर्व~२५ ई.पू. मार्च की दो~मार्च २ फ़रवरी की बीस~फ़रवरी २० +उन्नीस सौ नब्बे से उन्नीस सौ इक्यानबे ईसवी~१९९०-१९९१ ई. +दो हज़ार पाँच से दो हज़ार उन्नीस ईसा पूर्व~२००५-२०१९ ई.पू. +दसवें शताब्दी~१०वें शताब्दी +अठाहरवीं शताब्दी~१८वीं शताब्दी +एक हज़ार एकवीं शताब्दी~१००१वीं शताब्दी +एक सौ उन्नीसवां शताब्दी~११९वां शताब्दी +उन्नीस सौ बीस से छब्बीस तक~१९२०-२६ तक \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_decimal.txt index 23bef1a85..5b8d86602 100644 --- a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_decimal.txt +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_decimal.txt @@ -4,4 +4,10 @@ एक सौ आठ दशमलव सात पाँच~१०८.७५ एक सौ दस दशमलव सात पाँच~११०.७५ एक सौ दो दशमलव तीन~१०२.३ -एक सौ छह दशमलव पाँच~१०६.५ \ No newline at end of file +एक सौ छह दशमलव पाँच~१०६.५ +साढ़े तीन सौ दशमलव दो दो~३५०.२२ +सवा तीन सौ दशमलव दो~३२५.२ +साढ़े चार सौ दशमलव सात पाँच~४५०.७५ +सवा चार सौ दशमलव सात पाँच~४२५.७५ +ढाई सौ दशमलव छह~२५०.६ +डेढ़ सौ दशमलव सात पाँच~१५०.७५ diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_fraction.txt index 12ee24d61..21ceff6c6 100644 --- a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_fraction.txt +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_fraction.txt @@ -7,4 +7,25 @@ एक सौ तेईस बटा 
एक सौ पच्चीस~१२३/१२५ छह सौ बासठ बटा एक~६६२/१ एक सौ पाँच बटा सात~१०५/७ -छह सौ चौवन बटा तीन~६५४/३ \ No newline at end of file +छह सौ चौवन बटा तीन~६५४/३ +एक सौ तैंतीस सही एक बटा दो~१३३ १/२ +एक सौ तैंतीस सही दो बटा तीन~१३३ २/३ +एक सही छह बटा छह~१ ६/६ +दो सही एक बटा छह~२ १/६ +तीन सही तीन बटा चार~३ ३/४ +एक सौ बीस सही तीन बटा चार~१२० ३/४ +एक सौ बीस सही पिछत्तर बटा नब्बे~१२० ७५/९० +तीन सही तीन बटा चार~३ ३/४ +सवा चौरासी~८४ १/४ +डेढ़~१ १/२ +ढाई~२ १/२ +आधा~१/२ +साढ़े~१/२ +सवा~१/४ +पौन~३/४ +पौना~३/४ +सवा पैंतीस~३५ १/४ +साढ़े चार सौ बटा दस~४५०/१० +तीन चौथाई~३/४ +दो तिहाई~२/३ +एक चौथाई~१/४ diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_measure.txt index 3bd860f08..21615f1c5 100644 --- a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_measure.txt @@ -29,4 +29,20 @@ तैंतीस दशमलव तीन तीन किलोमीटर प्रति घंटा~३३.३३ km/h चौदह हज़ार इकहत्तर दशमलव नौ नौ पिंट~१४०७१.९९ pt बहत्तर दशमलव आठ तीन मील प्रति घंटा~७२.८३ mi/h -बहत्तर मील प्रति घंटा~७२ mi/h \ No newline at end of file +बहत्तर मील प्रति घंटा~७२ mi/h +पौने ग्यारह घंटे~१०.७५ h +साढ़े सात वर्ष~७.५ yr +सवा ग्यारह सौ मीटर~११२५ m +पौने चार सौ हेक्टेयर~३७५ ha +साढ़े दस घन फीट~१०.५ ft³ +पौने पांच सौ किलोमीटर~४७५ km +ढाई सौ गैलन~२५० gal +डेढ़ दर्जन~१.५ doz +साढ़े सात ऐंपीयर~७.५ A +पौने तीन हजार एकड़~२७५० ac +साढ़े बारह वर्ग माइक्रोमीटर~१२.५ µm² +ढाई महीने~२.५ mo +दो बाई दो~२x२ +दो बाई दो~२x२ +पाँच बाई पाँच~५x५ +बाईस बाई पाँच घन फीट~२२x५ ft³ diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_money.txt index 8cc06397b..8821940c3 100644 --- a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_money.txt +++ 
b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_money.txt @@ -21,4 +21,30 @@ इकहत्तर हज़ार इकहत्तर बिटकॉइन~₿७१०७१ बत्तीस बुरुंडी फ्रैंक~fbu३२ पन्द्रह सौ कैमन आइलैंड्स डॉलर~ci$१५०० -छह सौ पच्चीस रुपये दो पैसे~₹६२५.२ \ No newline at end of file +छह सौ पच्चीस रुपये दो पैसे~₹६२५.२ +साढ़े सात सौ डॉलर~$७५० +सवा दो सौ यूक्रेनी ग्रिव्ना~₴२२५ +साढ़े छः लाख रुपए~₹६५०००० +सवा छः लाख अल्जीरियाई दिनार~دج६२५००० +सवा पंद्रह लाख युगांडा शिलिंग~ush१५२५००० +साढ़े पंद्रह लाख रुपए~₹१५५०००० +साढ़े पाँच हज़ार लीरा~₺५५०० +ढाई सौ यूरो~€२५० +ढाई हजार बुरुंडी फ्रैंक~fbu२५०० +ढाई करोड़ रुपए~₹२५०००००० +ढाई लाख रुपए~₹२५०००० +डेढ़ सौ यूरो~€१५० +डेढ़ हजार रुपए~₹१५०० +डेढ़ करोड़ रुपए~₹१५०००००० +डेढ़ लाख रुपए~₹१५०००० +पौने तीन सौ रुपए~₹२७५ +पौने पंद्रह सौ रुपए~₹१४७५ +पौने तीन हजार रुपए~₹२७५० +पौने पंद्रह हजार यूरो~€१४७५० +पौने पैंतालिस हजार यूरो~€४४७५० +पौने तीन लाख रुपए~₹२७५००० +पौने पंद्रह लाख रुपए~₹१४७५००० +पौने पैंतालिस लाख रुपए~₹४४७५००० +पौने तीन करोड़ रुपए~₹२७५००००० +पौने पंद्रह करोड़ रुपए~₹१४७५००००० +पौने पैंतालिस करोड़ रुपए~₹४४७५००००० diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_time.txt index c1edb837d..8ec5e4df3 100644 --- a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_time.txt +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_time.txt @@ -14,4 +14,12 @@ नौ घंटा दो सेकंड~९:००:०२ सोलह घंटा एक मिनट सत्ताईस सेकंड~१६:०१:२७ दस बजकर चौवन मिनट आठ सेकंड~१०:५४:०८ -तीन मिनट उन्नीस सेकंड~००:०३:१९ \ No newline at end of file +तीन मिनट उन्नीस सेकंड~००:०३:१९ +ढाई बजे~२:३० +डेढ़ बजे~१:३० +डेढ़ घंटा~१:३० +साढ़े पाँच बजे~५:३० +सवा चार बजे~४:१५ +साढ़े ग्यारह~११:३० +पौने पाँच~४:४५ +पौने तीन घंटा~२:४५ diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_whitelist.txt index 
30824fced..68f4fd775 100644 --- a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_whitelist.txt +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_whitelist.txt @@ -1,5 +1,3 @@ -डेढ़ बजे~१:३० -ढाई बजे~२:३० मास्टर निखिल तनिष~मा. निखिल तनिष पाव~१/४ श्रीमती ज्योत्सना~स्मि. ज्योत्सना @@ -7,6 +5,4 @@ आधा कप चाय~१/२ कप चाय श्रीमान भारत कुमार~श्री. भारत कुमार डॉक्टर प्रशांत~डॉ. प्रशांत -डेढ़~१.५ कुमारी~कु. -ढाई~२.५ \ No newline at end of file From 2e8ff658de9212cc2f972beee63ab3e9917c6a16 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 19 Aug 2025 16:13:49 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../inverse_text_normalization/hi/taggers/date.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py index b39670ace..27e6eec32 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py @@ -70,7 +70,7 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst): self.ordinal_century = pynutil.insert("era: \"") + cardinal_graph + pynutil.insert("\" ") self.morpho_graph = ( pynutil.insert("morphosyntactic_features: \"") + graph_morph_features + pynutil.insert("\"") - ) + ) graph_day_month = self.day + delete_space + self.month graph_month_day = self.month + delete_space + self.day graph_month_day += pynutil.insert(" preserve_order: true")