From 034d05b9b3bdfc730384c0dc05159b20c75bb65f Mon Sep 17 00:00:00 2001
From: Jinwoo Bae
Date: Wed, 21 May 2025 14:58:52 -0700
Subject: [PATCH 1/6] Add Korean TN support for cardinal numbers and
 postprocessing

Signed-off-by: Jinwoo Bae
---
 Jenkinsfile                                        |  22 ++
 .../text_normalization/ko/__init__.py              |  17 ++
 .../ko/data/number/__init__.py                     |  13 +
 .../ko/data/number/digit.tsv                       |   9 +
 .../ko/data/number/teen.tsv                        |  10 +
 .../text_normalization/ko/data/number/ty.tsv       |   8 +
 .../ko/data/number/zero.tsv                        |   1 +
 .../text_normalization/ko/graph_utils.py           | 173 ++++++++++++
 .../text_normalization/ko/taggers/__init__.py      |  13 +
 .../text_normalization/ko/taggers/cardinal.py      | 267 ++++++++++++++++++
 .../ko/taggers/tokenize_and_classify.py            |  75 +++++
 .../text_normalization/ko/utils.py                 |  60 ++++
 .../ko/verbalizers/__init__.py                     |  13 +
 .../ko/verbalizers/cardinal.py                     |  48 ++++
 .../ko/verbalizers/post_processing.py              | 113 ++++++++
 .../ko/verbalizers/verbalize.py                    |  38 +++
 .../ko/verbalizers/verbalize_final.py              |  74 +++++
 .../text_normalization/normalize.py                |   6 +-
 tests/nemo_text_processing/ko/__init__.py          |  13 +
 .../test_cases_cardinal.txt                        |  19 ++
 .../nemo_text_processing/ko/test_cardinal.py       |  34 +++
 .../ko/test_sparrowhawk_normalization.sh           | 123 ++++++++
 .../pynini_export.py                               |  11 +
 23 files changed, 1159 insertions(+), 1 deletion(-)
 create mode 100644 nemo_text_processing/text_normalization/ko/__init__.py
 create mode 100644 nemo_text_processing/text_normalization/ko/data/number/__init__.py
 create mode 100644 nemo_text_processing/text_normalization/ko/data/number/digit.tsv
 create mode 100644 nemo_text_processing/text_normalization/ko/data/number/teen.tsv
 create mode 100644 nemo_text_processing/text_normalization/ko/data/number/ty.tsv
 create mode 100644 nemo_text_processing/text_normalization/ko/data/number/zero.tsv
 create mode 100644 nemo_text_processing/text_normalization/ko/graph_utils.py
 create mode 100644 nemo_text_processing/text_normalization/ko/taggers/__init__.py
 create mode 100644 nemo_text_processing/text_normalization/ko/taggers/cardinal.py
 create mode 100644 nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py
 create mode 100644 nemo_text_processing/text_normalization/ko/utils.py
 create mode 100644 nemo_text_processing/text_normalization/ko/verbalizers/__init__.py
 create mode 100644 nemo_text_processing/text_normalization/ko/verbalizers/cardinal.py
 create mode 100644 nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py
 create mode 100644 nemo_text_processing/text_normalization/ko/verbalizers/verbalize.py
 create mode 100644 nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py
 create mode 100644 tests/nemo_text_processing/ko/__init__.py
 create mode 100644 tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt
 create mode 100644 tests/nemo_text_processing/ko/test_cardinal.py
 create mode 100644 tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh

diff --git a/Jenkinsfile b/Jenkinsfile
index 51ce37a10..c3339c7bc 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -28,6 +28,7 @@ pipeline {
     MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'
     JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1'
     HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-22-25-0'
+    KO_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/05-21-25-0'
     DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
   }
   stages {
@@ -318,6 +319,22 @@ pipeline {
         }
       }
     }
+    stage('L0: Create KO TN Grammars') {
+      when {
+        anyOf {
+          branch 'main'
+          changeRequest target: 'main'
+        }
+      }
+      failFast true
+      parallel {
+        stage('L0: KO TN grammars') {
+          steps {
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=ko --text="100" --cache_dir ${KO_TN_CACHE}'
+          }
+        }
+      }
+    }
 
     // L1 Tests starts here
@@ -406,6 +423,11 @@ pipeline {
             sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/hy/ -m "not pleasefixme" --cpu --tn_cache_dir ${HY_TN_CACHE}'
           }
         }
+        stage('L1: Run all KO TN/ITN tests (restore grammars from cache)') {
+          steps {
+            sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/ko/ -m "not pleasefixme" --cpu --tn_cache_dir ${KO_TN_CACHE}'
+          }
+        }
       }
     }
 
diff --git a/nemo_text_processing/text_normalization/ko/__init__.py b/nemo_text_processing/text_normalization/ko/__init__.py
new file mode 100644
index 000000000..dd0e509b3
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemo_text_processing.text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst
+from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import VerbalizeFst
+from nemo_text_processing.text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst
diff --git a/nemo_text_processing/text_normalization/ko/data/number/__init__.py b/nemo_text_processing/text_normalization/ko/data/number/__init__.py
new file mode 100644
index 000000000..341a77c5b
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/data/number/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/nemo_text_processing/text_normalization/ko/data/number/digit.tsv b/nemo_text_processing/text_normalization/ko/data/number/digit.tsv
new file mode 100644
index 000000000..61a7dddcf
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/data/number/digit.tsv
@@ -0,0 +1,9 @@
+1	일
+2	이
+3	삼
+4	사
+5	오
+6	육
+7	칠
+8	팔
+9	구
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/ko/data/number/teen.tsv b/nemo_text_processing/text_normalization/ko/data/number/teen.tsv
new file mode 100644
index 000000000..432fe5eb6
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/data/number/teen.tsv
@@ -0,0 +1,10 @@
+10	십
+11	십일
+12	십이
+13	십삼
+14	십사
+15	십오
+16	십육
+17	십칠
+18	십팔
+19	십구
diff --git a/nemo_text_processing/text_normalization/ko/data/number/ty.tsv b/nemo_text_processing/text_normalization/ko/data/number/ty.tsv
new file mode 100644
index 000000000..02623c44c
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/data/number/ty.tsv
@@ -0,0 +1,8 @@
+2	이십
+3	삼십
+4	사십
+5	오십
+6	육십
+7	칠십
+8	팔십
+9	구십
diff --git a/nemo_text_processing/text_normalization/ko/data/number/zero.tsv b/nemo_text_processing/text_normalization/ko/data/number/zero.tsv
new file mode 100644
index 000000000..7024c0534
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/data/number/zero.tsv
@@ -0,0 +1 @@
+0	영
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/ko/graph_utils.py b/nemo_text_processing/text_normalization/ko/graph_utils.py
new file mode 100644
index 000000000..a7ffdd2b1
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/graph_utils.py
@@ -0,0 +1,173 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# Copyright 2015 and onwards Google, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import string
+from pathlib import Path
+from typing import Dict
+
+import pynini
+from pynini import Far
+from pynini.export import export
+from pynini.lib import byte, pynutil, utf8
+
+from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels
+from nemo_text_processing.utils.logging import logger
+
+NEMO_CHAR = utf8.VALID_UTF8_CHAR
+
+NEMO_DIGIT = byte.DIGIT
+NEMO_ALPHA = pynini.union(*[chr(i) for i in range(ord('가'), ord('힣') + 1)]).optimize()
+NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize()
+NEMO_HEX = pynini.union(*string.hexdigits).optimize()
+NEMO_SPACE = " "
+NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00a0").optimize()
+NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
+NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()
+
+NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize()
+NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize()
+
+NEMO_SIGMA = pynini.closure(NEMO_CHAR)
+
+delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))
+delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1))
+insert_space = pynutil.insert(" ")
+delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")
+delete_preserve_order = pynini.closure(
+    pynutil.delete(" preserve_order: true")
+    | (pynutil.delete(' field_order: "') + NEMO_NOT_QUOTE + pynutil.delete('"'))
+)
+
+
+# Common string literals; expand as you see fit.
+username_string = "username"
+double_quotes = '"'
+domain_string = "domain"
+protocol_string = "protocol"
+slash = "/"
+double_slash = "//"
+triple_slash = "///"
+file = "file"
+period = "."
+at = "@"
+colon = ":"
+https = "https"
+http = "http"
+www = "www"
+
+
+def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]):
+    """
+    Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name.
+
+    Args:
+        file_name: exported file name
+        graphs: Mapping of a rule name and Pynini WFST graph to be exported
+    """
+    exporter = export.Exporter(file_name)
+    for rule, graph in graphs.items():
+        exporter[rule] = graph.optimize()
+    exporter.close()
+    logger.info(f"Created {file_name}")
+
+
+def convert_space(fst) -> "pynini.FstLike":
+    """
+    Converts space to nonbreaking space.
+    Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty".
+    This makes the transducer significantly slower, so only use it when there could be spaces within quotes; otherwise leave it out.
+
+    Args:
+        fst: input fst
+
+    Returns output fst where breaking spaces are converted to non-breaking spaces
+    """
+    return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, "\u00a0"), "", "", NEMO_SIGMA)
+
+
+def string_map_cased(input_file: str, input_case: str = "lower_cased"):
+    labels = load_labels(input_file)
+    whitelist = pynini.string_map(labels).invert().optimize()
+    return whitelist
+
+
+class GraphFst:
+    """
+    Base class for all grammar fsts.
+
+    Args:
+        name: name of grammar class
+        kind: either 'classify' or 'verbalize'
+        deterministic: if True will provide a single transduction option,
+            for False multiple transductions are generated (used for audio-based normalization)
+    """
+
+    def __init__(self, name: str, kind: str, deterministic: bool = True):
+        self.name = name
+        self.kind = kind
+        self._fst = None
+        self.deterministic = deterministic
+
+        self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far")
+        if self.far_exist():
+            self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst()
+
+    def far_exist(self) -> bool:
+        """
+        Returns true if FAR can be loaded
+        """
+        return self.far_path.exists()
+
+    @property
+    def fst(self) -> "pynini.FstLike":
+        return self._fst
+
+    @fst.setter
+    def fst(self, fst):
+        self._fst = fst
+
+    def add_tokens(self, fst) -> "pynini.FstLike":
+        """
+        Wraps class name around the given fst
+
+        Args:
+            fst: input fst
+
+        Returns:
+            Fst: fst
+        """
+        return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")
+
+    def delete_tokens(self, fst) -> "pynini.FstLike":
+        """
+        Deletes the class name wrapped around the output of the given fst
+
+        Args:
+            fst: input fst
+
+        Returns:
+            Fst: fst
+        """
+        res = (
+            pynutil.delete(f"{self.name}")
+            + delete_space
+            + pynutil.delete("{")
+            + delete_space
+            + fst
+            + delete_space
+            + pynutil.delete("}")
+        )
+        return res @ pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA)
diff --git a/nemo_text_processing/text_normalization/ko/taggers/__init__.py b/nemo_text_processing/text_normalization/ko/taggers/__init__.py
new file mode 100644
index 000000000..341a77c5b
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/taggers/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py
new file mode 100644
index 000000000..187ebd419
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py
@@ -0,0 +1,267 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst
+from nemo_text_processing.text_normalization.ko.utils import get_abs_path
+
+
+class CardinalFst(GraphFst):
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="cardinal", kind="classify", deterministic=deterministic)
+        # Load base .tsv files
+        graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv"))
+        graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv"))
+
+        digit_except_one = pynini.difference(NEMO_DIGIT, "1")
+        digit_except_zero_one = pynini.difference(digit_except_one, "0")
+
+        graph_digit_alt = digit_except_zero_one @ graph_digit
+        graph_ty = pynini.string_file(get_abs_path("data/number/ty.tsv"))
+        graph_teen = pynini.string_file(get_abs_path("data/number/teen.tsv"))
+
+        # Compose all basic number forms
+        graph_all = (graph_ty + (graph_digit | pynutil.delete('0'))) | graph_teen | graph_digit
+
+        hundreds = NEMO_DIGIT**3
+        graph_hundred_component = (pynini.cross('1', '백') | (graph_digit_alt + pynutil.insert('백'))) + pynini.union(
+            pynini.closure(pynutil.delete('0')), (pynini.closure(pynutil.delete('0')) + graph_all)
+        )
+        graph_hundred = hundreds @ graph_hundred_component
+
+        thousands = NEMO_DIGIT**4
+        graph_thousand_component = (pynini.cross('1', '천') | (graph_digit_alt + pynutil.insert('천'))) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_hundred_component,
+            (pynini.closure(pynutil.delete('0')) + graph_all),
+        )
+        graph_thousand = thousands @ graph_thousand_component
+
+        ten_thousands = NEMO_DIGIT**5
+        graph_ten_thousand_component = (pynini.cross('1', '만') | (graph_digit + pynutil.insert('만'))) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_thousand_component,
+            (pynutil.delete('0') + graph_hundred_component),
+            (pynini.closure(pynutil.delete('0')) + graph_all),
+        )
+        graph_ten_thousand = ten_thousands @ graph_ten_thousand_component
+
+        hundred_thousands = NEMO_DIGIT**6
+        graph_hundred_thousand_component = ((NEMO_DIGIT ** 2 @ graph_all) + pynutil.insert('만')) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_thousand_component,
+            (pynutil.delete('0') + graph_hundred_component),
+            (pynini.closure(pynutil.delete('0')) + graph_all),
+        )
+        graph_hundred_thousand = hundred_thousands @ graph_hundred_thousand_component
+
+        millions = NEMO_DIGIT**7
+        graph_million_component = ((NEMO_DIGIT**3 @ graph_hundred_component) + pynutil.insert('만')) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_thousand_component,
+            (pynutil.delete('0') + graph_hundred_component),
+            (pynini.closure(pynutil.delete('0')) + graph_all),
+        )
+        graph_million = millions @ graph_million_component
+
+        ten_millions = NEMO_DIGIT**8
+        graph_ten_million_component = ((NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('만')) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_thousand_component,
+            (pynutil.delete('0') + graph_hundred_component),
+            (pynini.closure(pynutil.delete('0')) + graph_all),
+        )
+        graph_ten_million = ten_millions @ graph_ten_million_component
+
+        hundred_millions = NEMO_DIGIT ** 9
+        graph_hundred_million_component = (graph_digit + pynutil.insert('억')) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_ten_million_component,
+            (pynutil.delete('0') + graph_million_component),
+            (pynutil.delete('00') + graph_hundred_thousand_component),
+            (pynutil.delete('000') + graph_ten_thousand_component),
+            (pynutil.delete('0000') + graph_thousand_component),
+            ((pynutil.delete('00000') + graph_hundred_component)),
+            (pynini.closure(pynutil.delete('0')) + graph_all),
+        )
+        graph_hundred_million = hundred_millions @ graph_hundred_million_component
+
+        thousand_millions = NEMO_DIGIT**10
+        graph_thousand_million_component = ((NEMO_DIGIT**2 @ graph_all) + pynutil.insert('억')) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_ten_million_component,
+            (pynutil.delete('0') + graph_million_component),
+            (pynutil.delete('00') + graph_hundred_thousand_component),
+            (pynutil.delete('000') + graph_ten_thousand_component),
+            (pynutil.delete('0000') + graph_thousand_component),
+            ((pynutil.delete('00000') + graph_hundred_component)),
+            (pynini.closure(pynutil.delete('0')) + graph_all),
+        )
+        graph_thousand_million = thousand_millions @ graph_thousand_million_component
+
+        billions = NEMO_DIGIT**11
+        graph_billions_component = ((NEMO_DIGIT**3 @ graph_hundred_component) + pynutil.insert('억')) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_ten_million_component,
+            (pynutil.delete('0') + graph_million_component),
+            (pynutil.delete('00') + graph_hundred_thousand_component),
+            (pynutil.delete('000') + graph_ten_thousand_component),
+            (pynutil.delete('0000') + graph_thousand_component),
+            ((pynutil.delete('00000') + graph_hundred_component)),
+            (pynini.closure(pynutil.delete('0')) + graph_all),
+        )
+        graph_billions = billions @ graph_billions_component
+
+        ten_billions = NEMO_DIGIT**12
+        graph_ten_billions_component = ((NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('억')) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_ten_million_component,
+            (pynutil.delete('0') + graph_million_component),
+            (pynutil.delete('00') + graph_hundred_thousand_component),
+            (pynutil.delete('000') + graph_ten_thousand_component),
+            (pynutil.delete('0000') + graph_thousand_component),
+            ((pynutil.delete('00000') + graph_hundred_component)),
+            (pynini.closure(pynutil.delete('0')) + graph_all),
+        )
+        graph_ten_billions = ten_billions @ graph_ten_billions_component
+
+        hundred_billions = NEMO_DIGIT**13
+        graph_hundred_billions_component = (graph_digit + pynutil.insert('조')) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_ten_billions_component,
+            pynutil.delete('0') + graph_billions_component,
+            pynutil.delete('00') + graph_thousand_million_component,
+            pynutil.delete('000') + graph_hundred_million_component,
+            pynutil.delete('0000') + graph_ten_million_component,
+            pynutil.delete('00000') + graph_million_component,
+            pynutil.delete('000000') + graph_hundred_thousand_component,
+            pynutil.delete('0000000') + graph_ten_thousand_component,
+            pynutil.delete('00000000') + graph_thousand_component,
+            pynutil.delete('000000000') + graph_hundred_component,
+            (pynini.closure(pynutil.delete('0')) + graph_all),
+        )
+        graph_hundred_billions = hundred_billions @ graph_hundred_billions_component
+
+        trillion = NEMO_DIGIT**14
+        graph_trillion_component = ((NEMO_DIGIT**2 @ graph_all) + pynutil.insert('조') + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_ten_billions_component,
+            pynutil.delete('0') + graph_billions_component,
+            pynutil.delete('00') + graph_thousand_million_component,
+            pynutil.delete('000') + graph_hundred_million_component,
+            pynutil.delete('0000') + graph_ten_million_component,
+            pynutil.delete('00000') + graph_million_component,
+            pynutil.delete('000000') + graph_hundred_thousand_component,
+            pynutil.delete('0000000') + graph_ten_thousand_component,
+            pynutil.delete('00000000') + graph_thousand_component,
+            pynutil.delete('000000000') + graph_hundred_component,
+            (pynini.closure(pynutil.delete('0')) + graph_all)
+            )
+        )
+        graph_trillions = trillion @ graph_trillion_component
+
+        ten_trillions = NEMO_DIGIT**15
+        graph_ten_trillions_component = ((NEMO_DIGIT**3 @ graph_hundred_component) + pynutil.insert('조') + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_ten_billions_component,
+            pynutil.delete('0') + graph_billions_component,
+            pynutil.delete('00') + graph_thousand_million_component,
+            pynutil.delete('000') + graph_hundred_million_component,
+            pynutil.delete('0000') + graph_ten_million_component,
+            pynutil.delete('00000') + graph_million_component,
+            pynutil.delete('000000') + graph_hundred_thousand_component,
+            pynutil.delete('0000000') + graph_ten_thousand_component,
+            pynutil.delete('00000000') + graph_thousand_component,
+            pynutil.delete('000000000') + graph_hundred_component,
+            (pynini.closure(pynutil.delete('0')) + graph_all)
+            )
+        )
+        graph_ten_trillions = ten_trillions @ graph_ten_trillions_component
+
+        hundred_trillions = NEMO_DIGIT**16
+        graph_hundred_trillions_component = ((NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('조') + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_ten_billions_component,
+            pynutil.delete('0') + graph_billions_component,
+            pynutil.delete('00') + graph_thousand_million_component,
+            pynutil.delete('000') + graph_hundred_million_component,
+            pynutil.delete('0000') + graph_ten_million_component,
+            pynutil.delete('00000') + graph_million_component,
+            pynutil.delete('000000') + graph_hundred_thousand_component,
+            pynutil.delete('0000000') + graph_ten_thousand_component,
+            pynutil.delete('00000000') + graph_thousand_component,
+            pynutil.delete('000000000') + graph_hundred_component,
+            (pynini.closure(pynutil.delete('0')) + graph_all)
+            )
+        )
+        graph_hundred_trillions = hundred_trillions @ graph_hundred_trillions_component
+
+        thousand_trillions = NEMO_DIGIT**17
+        graph_thousand_trillions_component = (graph_digit + pynutil.insert('경') + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_hundred_trillions_component,
+            pynutil.delete('0') + graph_ten_trillions_component,
+            pynutil.delete('00') + graph_trillion_component,
+            pynutil.delete('000') + graph_hundred_billions_component,
+            pynutil.delete('0000') + graph_ten_billions_component,
+            pynutil.delete('00000') + graph_billions_component,
+            pynutil.delete('000000') + graph_thousand_million_component,
+            pynutil.delete('0000000') + graph_hundred_million_component,
+            pynutil.delete('00000000') + graph_ten_million_component,
+            pynutil.delete('000000000') + graph_million_component,
+            pynutil.delete('0000000000') + graph_hundred_thousand_component,
+            pynutil.delete('00000000000') + graph_ten_thousand_component,
+            pynutil.delete('000000000000') + graph_thousand_component,
+            pynutil.delete('0000000000000') + graph_hundred_component,
+            (pynini.closure(pynutil.delete('0')) + graph_all)
+            )
+        )
+        graph_thousand_trillions = thousand_trillions @ graph_thousand_trillions_component
+
+        # FST
+        graph_num = pynini.union(
+            graph_thousand_trillions,
+            graph_hundred_trillions,
+            graph_ten_trillions,
+            graph_trillions,
+            graph_hundred_billions,
+            graph_ten_billions,
+            graph_billions,
+            graph_thousand_million,
+            graph_hundred_million,
+            graph_ten_million,
+            graph_million,
+            graph_hundred_thousand,
+            graph_ten_thousand,
+            graph_thousand,
+            graph_hundred,
+            graph_all,
+            graph_zero,
+        ).optimize()
+
+        # Sign and final formatting
+        optional_sign = pynini.closure(
+            pynutil.insert('negative: "true" ') + pynini.cross("-", ""), 0, 1
+        )
+        final_graph = (
+            optional_sign
+            + pynutil.insert('integer: "')
+            + graph_num
+            + pynutil.insert('"')
+        )
+        final_graph = self.add_tokens(final_graph)
+        self.fst = final_graph.optimize()
diff --git a/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py
new file mode 100644
index 000000000..2b22da370
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.ko.graph_utils import (
+    NEMO_WHITE_SPACE,
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+    generator_main,
+)
+
+from nemo_text_processing.text_normalization.ko.taggers.cardinal import CardinalFst
+from nemo_text_processing.utils.logging import logger
+
+
+class ClassifyFst(GraphFst):
+    """
+    Final class that composes all other classification grammars. This class can process an entire sentence including punctuation.
+    For deployment, this grammar will be compiled and exported to an OpenFst Finite State Archive (FAR) file.
+    More details on deployment at NeMo/tools/text_processing_deployment.
+
+    Args:
+        input_case: accepting either "lower_cased" or "cased" input.
+        deterministic: if True will provide a single transduction option,
+            for False multiple options (used for audio-based normalization)
+        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
+        overwrite_cache: set to True to overwrite .far files
+        whitelist: path to a file with whitelist replacements
+    """
+
+    def __init__(
+        self,
+        input_case: str = "cased",
+        deterministic: bool = True,
+        cache_dir: str = None,
+        overwrite_cache: bool = False,
+        whitelist: str = None,
+    ):
+        super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)
+
+        far_file = None
+        if cache_dir is not None and cache_dir != "None":
+            os.makedirs(cache_dir, exist_ok=True)
+            far_file = os.path.join(cache_dir, f"ko_tn_{deterministic}_tokenize.far")
+        if not overwrite_cache and far_file and os.path.exists(far_file):
+            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
+            logger.info(f"ClassifyFst.fst was restored from {far_file}.")
+        else:
+            cardinal = CardinalFst(deterministic=deterministic)
+
+            classify = pynini.union(pynutil.add_weight(cardinal.fst, 1.1))
+
+            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
+            tagger = pynini.closure(token, 1)
+
+            self.fst = tagger.optimize()
+
+            if far_file:
+                generator_main(far_file, {"tokenize_and_classify": self.fst})
diff --git a/nemo_text_processing/text_normalization/ko/utils.py b/nemo_text_processing/text_normalization/ko/utils.py
new file mode 100644
index 000000000..51aaea3e8
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/utils.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import csv
+import os
+
+
+def get_abs_path(rel_path):
+    """
+    Get absolute path
+
+    Args:
+        rel_path: relative path to this file
+
+    Returns absolute path
+    """
+    return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path
+
+
+def load_labels(abs_path):
+    """
+    Loads a label .tsv file as a list of label pairs
+
+    Args:
+        abs_path: absolute path to the file
+
+    Returns list of mappings
+    """
+    with open(abs_path, encoding="utf-8") as label_tsv:
+        labels = list(csv.reader(label_tsv, delimiter="\t"))
+    return labels
+
+
+def augment_labels_with_punct_at_end(labels):
+    """
+    Augments labels: if a key ends on a punctuation mark that the value does not have, add a new label
+    where the value maintains the punctuation
+
+    Args:
+        labels: input labels
+    Returns:
+        additional labels
+    """
+    res = []
+    for label in labels:
+        if len(label) > 1:
+            if label[0][-1] == "." and label[1][-1] != ".":
+                res.append([label[0], label[1] + "."] + label[2:])
+    return res
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/__init__.py b/nemo_text_processing/text_normalization/ko/verbalizers/__init__.py
new file mode 100644
index 000000000..341a77c5b
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/ko/verbalizers/cardinal.py
new file mode 100644
index 000000000..c6a48ab33
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/cardinal.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space
+
+
+class CardinalFst(GraphFst):
+    """
+    Finite state transducer for verbalizing cardinal, e.g.
+        cardinal { negative: "true" integer: "23" } -> 마이너스 이십삼
+
+    Args:
+        deterministic: if True will provide a single transduction option,
+            for False multiple options (used for audio-based normalization)
+    """
+
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="cardinal", kind="verbalize", deterministic=deterministic)
+
+        self.optional_sign = pynini.cross("negative: \"true\"", "마이너스 ")
+        if not deterministic:
+            self.optional_sign |= pynini.cross("negative: \"true\"", "음수 ")
+            self.optional_sign |= pynini.cross("negative: \"true\"", "- ")
+
+        self.optional_sign = pynini.closure(self.optional_sign + delete_space, 0, 1)
+
+        integer = pynini.closure(NEMO_NOT_QUOTE)
+
+        self.integer = delete_space + pynutil.delete("\"") + integer + pynutil.delete("\"")
+        integer = pynutil.delete("integer:") + self.integer
+
+        self.numbers = self.optional_sign + integer
+        delete_tokens = self.delete_tokens(self.numbers)
+        self.fst = delete_tokens.optimize()
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py b/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py
new file mode 100644
index 000000000..09ec216c2
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py
@@ -0,0 +1,113 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+
+import pynini
+
+from nemo_text_processing.text_normalization.en.graph_utils import (
+    NEMO_NOT_SPACE,
+    NEMO_SIGMA,
+    delete_space,
+    generator_main,
+)
+from nemo_text_processing.utils.logging import logger
+
+
+class PostProcessingFst:
+    """
+    Finite state transducer that post-processes an entire sentence after verbalization is complete, e.g.
+    removes extra spaces around punctuation marks: " ( one hundred and twenty three ) " -> "(one hundred and twenty three)"
+
+    Args:
+        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
+        overwrite_cache: set to True to overwrite .far files
+    """
+
+    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
+
+        far_file = None
+        if cache_dir is not None and cache_dir != "None":
+            os.makedirs(cache_dir, exist_ok=True)
+            far_file = os.path.join(cache_dir, "ko_tn_post_processing.far")
+        if not overwrite_cache and far_file and os.path.exists(far_file):
+            self.fst = pynini.Far(far_file, mode="r")["post_process_graph"]
+            logger.info(f'Post processing graph was restored from {far_file}.')
+        else:
+            self.set_punct_dict()
+            self.fst = self.get_punct_postprocess_graph()
+
+            if far_file:
+                generator_main(far_file, {"post_process_graph": self.fst})
+
+    def set_punct_dict(self):
+        self.punct_marks = {
+            "'": [
+                "'",
+                '´',
+                'ʹ',
+                'ʻ',
+                'ʼ',
+                'ʽ',
+                'ʾ',
+                'ˈ',
+                'ˊ',
+                'ˋ',
+                '˴',
+                'ʹ',
+                '΄',
+                '՚',
+                '՝',
+                'י',
+                '׳',
+                'ߴ',
+                'ߵ',
+                'ᑊ',
+                'ᛌ',
+                '᾽',
+                '᾿',
+                '`',
+                '´',
+                '῾',
+                '‘',
+                '’',
+                '‛',
+                '′',
+                '‵',
+                'ꞌ',
+                '＇',
+                '｀',
+                '𖽑',
+                '𖽒',
+            ],
+        }
+
+    def get_punct_postprocess_graph(self):
+        """
+        Returns graph to post process punctuation marks.
+
+        {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept.
+        By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks.
+        """
+
+        remove_space_around_single_quote = pynini.cdrewrite(
+            delete_space, NEMO_NOT_SPACE, NEMO_NOT_SPACE, pynini.closure(NEMO_SIGMA)
+        )
+        # this works if spaces are in between (good)
+        # deletes a space between two NEMO_NOT_SPACE symbols (left and right of the space) within a content of NEMO_SIGMA
+
+        graph = remove_space_around_single_quote.optimize()
+
+        return graph
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize.py
new file mode 100644
index 000000000..9753db347
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+
+from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst
+from nemo_text_processing.text_normalization.ko.verbalizers.cardinal import CardinalFst
+
+
+class VerbalizeFst(GraphFst):
+    """
+    Composes other verbalizer grammars.
+    For deployment, this grammar will be compiled and exported to an OpenFst Finite State Archive (FAR) file.
+    More details on deployment at NeMo/tools/text_processing_deployment.
+
+    Args:
+        deterministic: if True will provide a single transduction option,
+            for False multiple options (used for audio-based normalization)
+    """
+
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="verbalize", kind="verbalize", deterministic=deterministic)
+
+        cardinal = CardinalFst(deterministic=deterministic)
+        cardinal_graph = cardinal.fst
+
+        self.fst = cardinal_graph.optimize()
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py
new file mode 100644
index 000000000..9a4e2f7bf
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.ko.graph_utils import (
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+    generator_main,
+)
+from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import VerbalizeFst
+from nemo_text_processing.utils.logging import logger
+
+
+class VerbalizeFinalFst(GraphFst):
+    """
+    Finite state transducer that verbalizes an entire sentence, e.g.
+    tokens { name: "its" } tokens { time { hours: "twelve" minutes: "thirty" } } tokens { name: "now" } tokens { name: "." } -> its twelve thirty now .
+
+    Args:
+        deterministic: if True will provide a single transduction option,
+            for False multiple options (used for audio-based normalization)
+        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
+        overwrite_cache: set to True to overwrite .far files
+    """
+
+    def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False):
+        super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic)
+
+        far_file = None
+        if cache_dir is not None and cache_dir != "None":
+            os.makedirs(cache_dir, exist_ok=True)
+            far_file = os.path.join(cache_dir, f"ko_tn_{deterministic}_verbalizer.far")
+        if not overwrite_cache and far_file and os.path.exists(far_file):
+            self.fst = pynini.Far(far_file, mode="r")["verbalize"]
+            logger.info(f'VerbalizeFinalFst graph was restored from {far_file}.')
+        else:
+            verbalize = VerbalizeFst(deterministic=deterministic).fst
+            # word = WordFst(deterministic=deterministic).fst
+            types = verbalize
+
+            if deterministic:
+                graph = (
+                    pynutil.delete("tokens")
+                    + delete_space
+                    + pynutil.delete("{")
+                    + delete_space
+                    + types
+                    + delete_space
+                    + pynutil.delete("}")
+                )
+            else:
+                graph = delete_space + types + delete_space
+
+            graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space
+
+            self.fst = graph.optimize()
+            if far_file:
+                generator_main(far_file, {"verbalize": self.fst})
diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py
index 82f8f43d2..1d90903e1 100644
--- a/nemo_text_processing/text_normalization/normalize.py
+++ b/nemo_text_processing/text_normalization/normalize.py
@@ -174,6 +174,9 @@ def __init__(
         elif lang == 'ja':
             from nemo_text_processing.text_normalization.ja.taggers.tokenize_and_classify import ClassifyFst
             from nemo_text_processing.text_normalization.ja.verbalizers.verbalize_final import VerbalizeFinalFst
+        elif lang == 'ko':
+            from nemo_text_processing.text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst
+            from nemo_text_processing.text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst
         else:
             raise NotImplementedError(f"Language {lang} has not been supported yet.")
 
@@ -720,7 +723,7 @@ def parse_args():
     parser.add_argument(
         "--language",
         help="language",
-        choices=["en", "de", "es", "fr", "hu", "sv", "zh", "ar", "it", "hy", "ja", "hi"],
+        choices=["en", "de", "es", "fr", "hu", "sv", "zh", "ar", "it", "hy", "ja", "hi", "ko"],
         default="en",
         type=str,
     )
@@ -765,6 +768,7 @@ def parse_args():
     parser.add_argument("--n_jobs", default=-2, type=int, help="The maximum number of concurrently running jobs")
     parser.add_argument("--batch_size", default=200, type=int, help="Number of examples for each process")
     parser.add_argument(
+
         "--max_number_of_permutations_per_split",
         default=729,
         type=int,
diff --git a/tests/nemo_text_processing/ko/__init__.py b/tests/nemo_text_processing/ko/__init__.py
new file mode 100644
index 000000000..341a77c5b
--- /dev/null
+++ b/tests/nemo_text_processing/ko/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
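As a quick sanity check of the pieces above, the new grammars are driven through the shared Normalizer front end. A minimal sketch (illustrative only, not part of the patch; the expected outputs follow the cardinal test cases added in the next file):

    from nemo_text_processing.text_normalization.normalize import Normalizer

    # Builds the Korean tagger (producing, e.g., tokens { cardinal { integer: "백이십삼" } })
    # and the verbalizer FSTs defined above, then applies them in sequence.
    normalizer = Normalizer(input_case="cased", lang="ko")

    print(normalizer.normalize("123"))  # -> 백이십삼
    print(normalizer.normalize("-23"))  # -> 마이너스 이십삼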
diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt
new file mode 100644
index 000000000..25dd560d1
--- /dev/null
+++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt
@@ -0,0 +1,19 @@
+1~일
+2~이
+3~삼
+123~백이십삼
+13000~만삼천
+9000~구천
+123000~십이만삼천
+123000012~일억이천삼백만십이
+1000000~백만
+100000000~일억
+1000000000000~일조
+100000000000000~백조
+20000000000001~이십조일
+800000000001001~팔백조천일
+82345670123135111~팔경이천삼백사십오조육천칠백일억이천삼백십삼만오천백십일
+9999999999999~구조구천구백구십구억구천구백구십구만구천구백구십구
+99999999999999~구십구조구천구백구십구억구천구백구십구만구천구백구십구
+999999999999999~구백구십구조구천구백구십구억구천구백구십구만구천구백구십구
+9999999999999999~구천구백구십구조구천구백구십구억구천구백구십구만구천구백구십구
\ No newline at end of file
diff --git a/tests/nemo_text_processing/ko/test_cardinal.py b/tests/nemo_text_processing/ko/test_cardinal.py
new file mode 100644
index 000000000..ed422e13e
--- /dev/null
+++ b/tests/nemo_text_processing/ko/test_cardinal.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+from nemo_text_processing.text_normalization.normalize import Normalizer
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestCardinal:
+    normalizer_ko = Normalizer(
+        lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='lower_cased'
+    )
+
+    @parameterized.expand(parse_test_case_file('ko/data_text_normalization/test_cases_cardinal.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_norm(self, test_input, expected):
+        preds = self.normalizer_ko.normalize(test_input)
+        assert expected == preds
diff --git a/tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh
new file mode 100644
index 000000000..9a50509cf
--- /dev/null
+++ b/tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh
@@ -0,0 +1,123 @@
+#! /bin/sh
+GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"}
+TEST_DIR=${2:-"/workspace/tests"}
+
+runtest () {
+  input=$1
+  echo "INPUT is $input"
+  cd ${GRAMMARS_DIR}
+
+  # read test file
+  while read testcase; do
+    IFS='~' read written spoken <<< $testcase
+    # replace non-breaking space with breaking space
+    # Use below if postprocessor is not used. Comment if it is used
+    #denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g')
+    # Use below if postprocessor is used. Comment if it is not used
Comment if it is not used + denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration_pp.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') + + # trim white space + spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + + # input expected actual + assertEquals "$written" "$spoken" "$denorm_pred" + done < "$input" +} + + +testTNCardinal() { + input=$TEST_DIR/ko/data_text_normalization/test_cases_cardinal.txt + runtest $input +} + +#testTNSpecialText() { +# input=$TEST_DIR/data_text_normalization/test_cases_special_text.txt +# runtest $input +#} + +#testTNDate() { +# input=$TEST_DIR/data_text_normalization/test_cases_date.txt +# runtest $input +#} + +#testTNDecimal() { +# input=$TEST_DIR/data_text_normalization/test_cases_decimal.txt +# runtest $input +#} + +#testTNRange() { +# input=$TEST_DIR/data_text_normalization/test_cases_range.txt +# runtest $input +#} + +#testTNSerial() { +# input=$TEST_DIR/data_text_normalization/test_cases_serial.txt +# runtest $input +#} + +#testTNRoman() { +# input=$TEST_DIR/data_text_normalization/test_cases_roman.txt +# runtest $input +#} + +#testTNElectronic() { +# input=$TEST_DIR/data_text_normalization/test_cases_electronic.txt +# runtest $input +#} + +#testTNFraction() { +# input=$TEST_DIR/data_text_normalization/test_cases_fraction.txt +# runtest $input +#} + +#testTNMoney() { +# input=$TEST_DIR/data_text_normalization/test_cases_money.txt +# runtest $input +#} + +#testTNOrdinal() { +# input=$TEST_DIR/data_text_normalization/test_cases_ordinal.txt +# runtest $input +#} + +#testTNTelephone() { +# input=$TEST_DIR/data_text_normalization/test_cases_telephone.txt +# runtest $input +#} + +#testTNTime() { +# input=$TEST_DIR/data_text_normalization/test_cases_time.txt +# runtest $input +#} + +#testTNMeasure() { +# input=$TEST_DIR/data_text_normalization/test_cases_measure.txt +# runtest $input +#} + +#testTNWhitelist() { +# input=$TEST_DIR/data_text_normalization/test_cases_whitelist.txt +# runtest $input +#} + +#testTNWord() { +# input=$TEST_DIR/data_text_normalization/test_cases_word.txt +# runtest $input +#} + +#testTNAddress() { +# input=$TEST_DIR/data_text_normalization/test_cases_address.txt +# runtest $input +#} + +#testTNMath() { +# input=$TEST_DIR/data_text_normalization/test_cases_math.txt +# runtest $input +#} + +# Remove all command-line arguments +shift $# + +# Load shUnit2 +. 
/workspace/shunit2/shunit2 \ No newline at end of file diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 6b82dfbec..0885f19c0 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -106,6 +106,7 @@ def parse_args(): 'mr', 'ja', 'rw', + 'ko', ], type=str, default='en', @@ -312,6 +313,16 @@ def parse_args(): ClassifyFst as TNClassifyFst, ) from nemo_text_processing.text_normalization.rw.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst + elif args.language == 'ko': + from nemo_text_processing.text_normalization.ko.taggers.tokenize_and_classify import ( + ClassifyFst as TNClassifyFst, + ) + from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import ( + VerbalizeFst as TNVerbalizeFst, + ) + from nemo_text_processing.text_normalization.ko.verbalizers.post_processing import ( + PostProcessingFst as TNPostProcessingFst, + ) output_dir = os.path.join(args.output_dir, f"{args.language}_{args.grammars}_{args.input_case}") export_grammars( output_dir=output_dir, From eb6a8c07784852331f2f3b8ad34b92ac94d32d82 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 21 May 2025 22:21:40 +0000 Subject: [PATCH 2/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/ko/taggers/cardinal.py | 171 +++++++++--------- .../text_normalization/normalize.py | 3 +- .../pynini_export.py | 4 +- 3 files changed, 92 insertions(+), 86 deletions(-) diff --git a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py index 187ebd419..51c82e213 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py @@ -24,15 +24,15 @@ class CardinalFst(GraphFst): def __init__(self, deterministic: bool = True): super().__init__(name="cardinal", kind="classify", deterministic=deterministic) # Load base .tsv files - graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv")) - graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv")) - + graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv")) + graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv")) + digit_except_one = pynini.difference(NEMO_DIGIT, "1") digit_except_zero_one = pynini.difference(digit_except_one, "0") - + graph_digit_alt = digit_except_zero_one @ graph_digit graph_ty = pynini.string_file(get_abs_path("data/number/ty.tsv")) - graph_teen = pynini.string_file(get_abs_path("data/number/teen.tsv")) + graph_teen = pynini.string_file(get_abs_path("data/number/teen.tsv")) # Compose all basic number forms graph_all = (graph_ty + (graph_digit | pynutil.delete('0'))) | graph_teen | graph_digit @@ -50,7 +50,7 @@ def __init__(self, deterministic: bool = True): (pynini.closure(pynutil.delete('0')) + graph_all), ) graph_thousand = thousands @ graph_thousand_component - + ten_thousands = NEMO_DIGIT**5 graph_ten_thousand_component = (pynini.cross('1', '만') | (graph_digit + pynutil.insert('만'))) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -59,16 +59,16 @@ def __init__(self, deterministic: bool = True): (pynini.closure(pynutil.delete('0')) + graph_all), ) graph_ten_thousand = ten_thousands @ graph_ten_thousand_component - + hundred_thousands = NEMO_DIGIT**6 - 
graph_hundred_thousand_component = ((NEMO_DIGIT ** 2 @ graph_all) + pynutil.insert('만')) + pynini.union( + graph_hundred_thousand_component = ((NEMO_DIGIT**2 @ graph_all) + pynutil.insert('만')) + pynini.union( pynini.closure(pynutil.delete('0')), graph_thousand_component, (pynutil.delete('0') + graph_hundred_component), (pynini.closure(pynutil.delete('0')) + graph_all), ) graph_hundred_thousand = hundred_thousands @ graph_hundred_thousand_component - + millions = NEMO_DIGIT**7 graph_million_component = ((NEMO_DIGIT**3 @ graph_hundred_component) + pynutil.insert('만')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -79,15 +79,17 @@ def __init__(self, deterministic: bool = True): graph_million = millions @ graph_million_component ten_millions = NEMO_DIGIT**8 - graph_ten_million_component = ((NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('만')) + pynini.union( + graph_ten_million_component = ( + (NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('만') + ) + pynini.union( pynini.closure(pynutil.delete('0')), graph_thousand_component, (pynutil.delete('0') + graph_hundred_component), (pynini.closure(pynutil.delete('0')) + graph_all), ) graph_ten_million = ten_millions @ graph_ten_million_component - - hundred_millions = NEMO_DIGIT ** 9 + + hundred_millions = NEMO_DIGIT**9 graph_hundred_million_component = (graph_digit + pynutil.insert('억')) + pynini.union( pynini.closure(pynutil.delete('0')), graph_ten_million_component, @@ -127,7 +129,9 @@ def __init__(self, deterministic: bool = True): graph_billions = billions @ graph_billions_component ten_billions = NEMO_DIGIT**12 - graph_ten_billions_component = ((NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('억')) + pynini.union( + graph_ten_billions_component = ( + (NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('억') + ) + pynini.union( pynini.closure(pynutil.delete('0')), graph_ten_million_component, (pynutil.delete('0') + graph_million_component), @@ -138,7 +142,7 @@ def __init__(self, deterministic: bool = True): (pynini.closure(pynutil.delete('0')) + graph_all), ) graph_ten_billions = ten_billions @ graph_ten_billions_component - + hundred_billions = NEMO_DIGIT**13 graph_hundred_billions_component = (graph_digit + pynutil.insert('조')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -155,79 +159,91 @@ def __init__(self, deterministic: bool = True): (pynini.closure(pynutil.delete('0')) + graph_all), ) graph_hundred_billions = hundred_billions @ graph_hundred_billions_component - + trillion = NEMO_DIGIT**14 - graph_trillion_component = ((NEMO_DIGIT**2 @ graph_all) + pynutil.insert('조') + pynini.union( - pynini.closure(pynutil.delete('0')), - graph_ten_billions_component, - pynutil.delete('0') + graph_billions_component, - pynutil.delete('00') + graph_thousand_million_component, - pynutil.delete('000') + graph_hundred_million_component, - pynutil.delete('0000') + graph_ten_million_component, - pynutil.delete('00000') + graph_million_component, - pynutil.delete('000000') + graph_hundred_thousand_component, - pynutil.delete('0000000') + graph_ten_thousand_component, - pynutil.delete('00000000') + graph_thousand_component, - pynutil.delete('000000000') + graph_hundred_component, - (pynini.closure(pynutil.delete('0')) + graph_all) + graph_trillion_component = ( + (NEMO_DIGIT**2 @ graph_all) + + pynutil.insert('조') + + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_billions_component, + pynutil.delete('0') + graph_billions_component, + pynutil.delete('00') + 
+                pynutil.delete('000') + graph_hundred_million_component,
+                pynutil.delete('0000') + graph_ten_million_component,
+                pynutil.delete('00000') + graph_million_component,
+                pynutil.delete('000000') + graph_hundred_thousand_component,
+                pynutil.delete('0000000') + graph_ten_thousand_component,
+                pynutil.delete('00000000') + graph_thousand_component,
+                pynutil.delete('000000000') + graph_hundred_component,
+                (pynini.closure(pynutil.delete('0')) + graph_all),
             )
         )
         graph_trillions = trillion @ graph_trillion_component
 
         ten_trillions = NEMO_DIGIT**15
-        graph_ten_trillions_component = ((NEMO_DIGIT**3 @ graph_hundred_component) + pynutil.insert('조') + pynini.union(
-            pynini.closure(pynutil.delete('0')),
-            graph_ten_billions_component,
-            pynutil.delete('0') + graph_billions_component,
-            pynutil.delete('00') + graph_thousand_million_component,
-            pynutil.delete('000') + graph_hundred_million_component,
-            pynutil.delete('0000') + graph_ten_million_component,
-            pynutil.delete('00000') + graph_million_component,
-            pynutil.delete('000000') + graph_hundred_thousand_component,
-            pynutil.delete('0000000') + graph_ten_thousand_component,
-            pynutil.delete('00000000') + graph_thousand_component,
-            pynutil.delete('000000000') + graph_hundred_component,
-            (pynini.closure(pynutil.delete('0')) + graph_all)
-        )
+        graph_ten_trillions_component = (
+            (NEMO_DIGIT**3 @ graph_hundred_component)
+            + pynutil.insert('조')
+            + pynini.union(
+                pynini.closure(pynutil.delete('0')),
+                graph_ten_billions_component,
+                pynutil.delete('0') + graph_billions_component,
+                pynutil.delete('00') + graph_thousand_million_component,
+                pynutil.delete('000') + graph_hundred_million_component,
+                pynutil.delete('0000') + graph_ten_million_component,
+                pynutil.delete('00000') + graph_million_component,
+                pynutil.delete('000000') + graph_hundred_thousand_component,
+                pynutil.delete('0000000') + graph_ten_thousand_component,
+                pynutil.delete('00000000') + graph_thousand_component,
+                pynutil.delete('000000000') + graph_hundred_component,
+                (pynini.closure(pynutil.delete('0')) + graph_all),
+            )
         )
         graph_ten_trillions = ten_trillions @ graph_ten_trillions_component
 
         hundred_trillions = NEMO_DIGIT**16
-        graph_hundred_trillions_component = ((NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('조') + pynini.union(
-            pynini.closure(pynutil.delete('0')),
-            graph_ten_billions_component,
-            pynutil.delete('0') + graph_billions_component,
-            pynutil.delete('00') + graph_thousand_million_component,
-            pynutil.delete('000') + graph_hundred_million_component,
-            pynutil.delete('0000') + graph_ten_million_component,
-            pynutil.delete('00000') + graph_million_component,
-            pynutil.delete('000000') + graph_hundred_thousand_component,
-            pynutil.delete('0000000') + graph_ten_thousand_component,
-            pynutil.delete('00000000') + graph_thousand_component,
-            pynutil.delete('000000000') + graph_hundred_component,
-            (pynini.closure(pynutil.delete('0')) + graph_all)
+        graph_hundred_trillions_component = (
+            (NEMO_DIGIT**4 @ graph_thousand_component)
+            + pynutil.insert('조')
+            + pynini.union(
+                pynini.closure(pynutil.delete('0')),
+                graph_ten_billions_component,
+                pynutil.delete('0') + graph_billions_component,
+                pynutil.delete('00') + graph_thousand_million_component,
+                pynutil.delete('000') + graph_hundred_million_component,
+                pynutil.delete('0000') + graph_ten_million_component,
+                pynutil.delete('00000') + graph_million_component,
+                pynutil.delete('000000') + graph_hundred_thousand_component,
+                pynutil.delete('0000000') + graph_ten_thousand_component,
+                pynutil.delete('00000000') + graph_thousand_component,
+                pynutil.delete('000000000') + graph_hundred_component,
+                (pynini.closure(pynutil.delete('0')) + graph_all),
             )
         )
         graph_hundred_trillions = hundred_trillions @ graph_hundred_trillions_component
 
         thousand_trillions = NEMO_DIGIT**17
-        graph_thousand_trillions_component = (graph_digit + pynutil.insert('경') + pynini.union(
-            pynini.closure(pynutil.delete('0')),
-            graph_hundred_trillions_component,
-            pynutil.delete('0') + graph_ten_trillions_component,
-            pynutil.delete('00') + graph_trillion_component,
-            pynutil.delete('000') + graph_hundred_billions_component,
-            pynutil.delete('0000') + graph_ten_billions_component,
-            pynutil.delete('00000') + graph_billions_component,
-            pynutil.delete('000000') + graph_thousand_million_component,
-            pynutil.delete('0000000') + graph_hundred_million_component,
-            pynutil.delete('00000000') + graph_ten_million_component,
-            pynutil.delete('000000000') + graph_million_component,
-            pynutil.delete('0000000000') + graph_hundred_thousand_component,
-            pynutil.delete('00000000000') + graph_ten_thousand_component,
-            pynutil.delete('000000000000') + graph_thousand_component,
-            pynutil.delete('0000000000000') + graph_hundred_component,
-            (pynini.closure(pynutil.delete('0')) + graph_all)
+        graph_thousand_trillions_component = (
+            graph_digit
+            + pynutil.insert('경')
+            + pynini.union(
+                pynini.closure(pynutil.delete('0')),
+                graph_hundred_trillions_component,
+                pynutil.delete('0') + graph_ten_trillions_component,
+                pynutil.delete('00') + graph_trillion_component,
+                pynutil.delete('000') + graph_hundred_billions_component,
+                pynutil.delete('0000') + graph_ten_billions_component,
+                pynutil.delete('00000') + graph_billions_component,
+                pynutil.delete('000000') + graph_thousand_million_component,
+                pynutil.delete('0000000') + graph_hundred_million_component,
+                pynutil.delete('00000000') + graph_ten_million_component,
+                pynutil.delete('000000000') + graph_million_component,
+                pynutil.delete('0000000000') + graph_hundred_thousand_component,
+                pynutil.delete('00000000000') + graph_ten_thousand_component,
+                pynutil.delete('000000000000') + graph_thousand_component,
+                pynutil.delete('0000000000000') + graph_hundred_component,
+                (pynini.closure(pynutil.delete('0')) + graph_all),
             )
         )
         graph_thousand_trillions = thousand_trillions @ graph_thousand_trillions_component
@@ -254,14 +270,7 @@ def __init__(self, deterministic: bool = True):
         ).optimize()
 
         # Sign and final formatting
-        optional_sign = pynini.closure(
-            pynutil.insert('negative: "true" ') + pynini.cross("-", ""), 0, 1
-        )
-        final_graph = (
-            optional_sign
-            + pynutil.insert('integer: "')
-            + graph_num
-            + pynutil.insert('"')
-        )
+        optional_sign = pynini.closure(pynutil.insert('negative: "true" ') + pynini.cross("-", ""), 0, 1)
+        final_graph = optional_sign + pynutil.insert('integer: "') + graph_num + pynutil.insert('"')
         final_graph = self.add_tokens(final_graph)
         self.fst = final_graph.optimize()
diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py
index 1d90903e1..1a9219574 100644
--- a/nemo_text_processing/text_normalization/normalize.py
+++ b/nemo_text_processing/text_normalization/normalize.py
@@ -176,7 +176,7 @@ def __init__(
             from nemo_text_processing.text_normalization.ja.verbalizers.verbalize_final import VerbalizeFinalFst
         elif lang == 'ko':
             from nemo_text_processing.text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst
-            from nemo_text_processing.text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst
+            from nemo_text_processing.text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst
         else:
             raise NotImplementedError(f"Language {lang} has not been supported yet.")
@@ -768,7 +768,6 @@ def parse_args():
     parser.add_argument("--n_jobs", default=-2, type=int, help="The maximum number of concurrently running jobs")
     parser.add_argument("--batch_size", default=200, type=int, help="Number of examples for each process")
     parser.add_argument(
-
         "--max_number_of_permutations_per_split",
         default=729,
         type=int,
diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py
index 0885f19c0..fe6a9ff7e 100644
--- a/tools/text_processing_deployment/pynini_export.py
+++ b/tools/text_processing_deployment/pynini_export.py
@@ -317,12 +317,10 @@ def parse_args():
         from nemo_text_processing.text_normalization.ko.taggers.tokenize_and_classify import (
             ClassifyFst as TNClassifyFst,
         )
-        from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import (
-            VerbalizeFst as TNVerbalizeFst,
-        )
         from nemo_text_processing.text_normalization.ko.verbalizers.post_processing import (
             PostProcessingFst as TNPostProcessingFst,
         )
+        from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst
 
     output_dir = os.path.join(args.output_dir, f"{args.language}_{args.grammars}_{args.input_case}")
     export_grammars(
         output_dir=output_dir,

From 4c104f0792c281cfd27d583834dd071d1eb88c8a Mon Sep 17 00:00:00 2001
From: Jinwoo Bae
Date: Fri, 23 May 2025 20:11:55 -0700
Subject: [PATCH 3/6] Refactor Korean TN cardinal and postprocessing logic based on review feedback

Signed-off-by: Jinwoo Bae
---
 .../ko/data/number/teen.tsv                   | 10 -
 .../text_normalization/ko/data/number/ty.tsv  |  1 +
 .../text_normalization/ko/graph_utils.py      |  2 +-
 .../text_normalization/ko/taggers/cardinal.py | 180 ++++++++----------
 .../ko/taggers/tokenize_and_classify.py       |  3 -
 .../ko/verbalizers/post_processing.py         | 70 +------
 .../ko/verbalizers/verbalize.py               |  2 -
 .../test_cases_cardinal.txt                   | 31 ++-
 .../nemo_text_processing/ko/test_cardinal.py  |  3 +-
 .../ko/test_sparrowhawk_normalization.sh      | 85 ---------
 10 files changed, 120 insertions(+), 267 deletions(-)
 delete mode 100644 nemo_text_processing/text_normalization/ko/data/number/teen.tsv

diff --git a/nemo_text_processing/text_normalization/ko/data/number/teen.tsv b/nemo_text_processing/text_normalization/ko/data/number/teen.tsv
deleted file mode 100644
index 432fe5eb6..000000000
--- a/nemo_text_processing/text_normalization/ko/data/number/teen.tsv
+++ /dev/null
@@ -1,10 +0,0 @@
-10	십
-11	십일
-12	십이
-13	십삼
-14	십사
-15	십오
-16	십육
-17	십칠
-18	십팔
-19	십구
diff --git a/nemo_text_processing/text_normalization/ko/data/number/ty.tsv b/nemo_text_processing/text_normalization/ko/data/number/ty.tsv
index 02623c44c..3d7bb221d 100644
--- a/nemo_text_processing/text_normalization/ko/data/number/ty.tsv
+++ b/nemo_text_processing/text_normalization/ko/data/number/ty.tsv
@@ -1,3 +1,4 @@
+1	십
 2	이십
 3	삼십
 4	사십
diff --git a/nemo_text_processing/text_normalization/ko/graph_utils.py b/nemo_text_processing/text_normalization/ko/graph_utils.py
index a7ffdd2b1..9db51238f 100644
--- a/nemo_text_processing/text_normalization/ko/graph_utils.py
+++ b/nemo_text_processing/text_normalization/ko/graph_utils.py
@@ -23,7 +23,7 @@
 from pynini.export import export
 from pynini.lib import byte, pynutil, utf8
 
-from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels
+from nemo_text_processing.text_normalization.en.utils import load_labels
 from nemo_text_processing.utils.logging import logger
 
 NEMO_CHAR = utf8.VALID_UTF8_CHAR
diff --git a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py
index 51c82e213..32b53855f 100644
--- a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py
+++ b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py
@@ -29,25 +29,24 @@ def __init__(self, deterministic: bool = True):
         digit_except_one = pynini.difference(NEMO_DIGIT, "1")
         digit_except_zero_one = pynini.difference(digit_except_one, "0")
-
-        graph_digit_alt = digit_except_zero_one @ graph_digit
-        graph_ty = pynini.string_file(get_abs_path("data/number/ty.tsv"))
-        graph_teen = pynini.string_file(get_abs_path("data/number/teen.tsv"))
+
+        graph_digit_no_zero_one = digit_except_zero_one @ graph_digit
+        graph_ty = pynini.string_file(get_abs_path("data/number/ty.tsv"))
 
         # Compose all basic number forms
-        graph_all = (graph_ty + (graph_digit | pynutil.delete('0'))) | graph_teen | graph_digit
+        graph_1_to_99 = (graph_ty + (graph_digit | pynutil.delete('0'))) | graph_digit
 
         hundreds = NEMO_DIGIT**3
-        graph_hundred_component = (pynini.cross('1', '백') | (graph_digit_alt + pynutil.insert('백'))) + pynini.union(
-            pynini.closure(pynutil.delete('0')), (pynini.closure(pynutil.delete('0')) + graph_all)
+        graph_hundred_component = (pynini.cross('1', '백') | (graph_digit_no_zero_one + pynutil.insert('백'))) + pynini.union(
+            pynini.closure(pynutil.delete('0')), (pynini.closure(pynutil.delete('0')) + graph_1_to_99)
         )
         graph_hundred = hundreds @ graph_hundred_component
 
         thousands = NEMO_DIGIT**4
-        graph_thousand_component = (pynini.cross('1', '천') | (graph_digit_alt + pynutil.insert('천'))) + pynini.union(
+        graph_thousand_component = (pynini.cross('1', '천') | (graph_digit_no_zero_one + pynutil.insert('천'))) + pynini.union(
             pynini.closure(pynutil.delete('0')),
             graph_hundred_component,
-            (pynini.closure(pynutil.delete('0')) + graph_all),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
         )
         graph_thousand = thousands @ graph_thousand_component
 
@@ -56,36 +55,35 @@ def __init__(self, deterministic: bool = True):
             pynini.closure(pynutil.delete('0')),
             graph_thousand_component,
             (pynutil.delete('0') + graph_hundred_component),
-            (pynini.closure(pynutil.delete('0')) + graph_all),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
         )
         graph_ten_thousand = ten_thousands @ graph_ten_thousand_component
 
         hundred_thousands = NEMO_DIGIT**6
-
-        graph_hundred_thousand_component = ((NEMO_DIGIT**2 @ graph_all) + pynutil.insert('만')) + pynini.union(
+
+        graph_hundred_thousand_component = ((NEMO_DIGIT ** 2 @ graph_1_to_99) + pynutil.insert('만')) + pynini.union(
             pynini.closure(pynutil.delete('0')),
             graph_thousand_component,
             (pynutil.delete('0') + graph_hundred_component),
-            (pynini.closure(pynutil.delete('0')) + graph_all),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
         )
         graph_hundred_thousand = hundred_thousands @ graph_hundred_thousand_component
 
         millions = NEMO_DIGIT**7
-        graph_million_component = ((NEMO_DIGIT**3 @ graph_hundred_component) + pynutil.insert('만')) + pynini.union(
+        graph_million_component = ((graph_hundred) + pynutil.insert('만')) + pynini.union(
             pynini.closure(pynutil.delete('0')),
             graph_thousand_component,
             (pynutil.delete('0') + graph_hundred_component),
-            (pynini.closure(pynutil.delete('0')) + graph_all),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
         )
         graph_million = millions @ graph_million_component
 
         ten_millions = NEMO_DIGIT**8
-        graph_ten_million_component = (
-            (NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('만')
-        ) + pynini.union(
+        graph_ten_million_component = ((graph_thousand) + pynutil.insert('만')) + pynini.union(
             pynini.closure(pynutil.delete('0')),
             graph_thousand_component,
             (pynutil.delete('0') + graph_hundred_component),
-            (pynini.closure(pynutil.delete('0')) + graph_all),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
         )
         graph_ten_million = ten_millions @ graph_ten_million_component
 
@@ -98,12 +96,12 @@ def __init__(self, deterministic: bool = True):
             (pynutil.delete('000') + graph_ten_thousand_component),
             (pynutil.delete('0000') + graph_thousand_component),
             ((pynutil.delete('00000') + graph_hundred_component)),
-            (pynini.closure(pynutil.delete('0')) + graph_all),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
         )
         graph_hundred_million = hundred_millions @ graph_hundred_million_component
 
         thousand_millions = NEMO_DIGIT**10
-        graph_thousand_million_component = ((NEMO_DIGIT**2 @ graph_all) + pynutil.insert('억')) + pynini.union(
+        graph_thousand_million_component = ((NEMO_DIGIT**2 @ graph_1_to_99) + pynutil.insert('억')) + pynini.union(
             pynini.closure(pynutil.delete('0')),
             graph_ten_million_component,
             (pynutil.delete('0') + graph_million_component),
@@ -111,12 +109,12 @@ def __init__(self, deterministic: bool = True):
             (pynutil.delete('000') + graph_ten_thousand_component),
             (pynutil.delete('0000') + graph_thousand_component),
             ((pynutil.delete('00000') + graph_hundred_component)),
-            (pynini.closure(pynutil.delete('0')) + graph_all),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
         )
         graph_thousand_million = thousand_millions @ graph_thousand_million_component
 
         billions = NEMO_DIGIT**11
-        graph_billions_component = ((NEMO_DIGIT**3 @ graph_hundred_component) + pynutil.insert('억')) + pynini.union(
+        graph_billions_component = ((graph_hundred) + pynutil.insert('억')) + pynini.union(
             pynini.closure(pynutil.delete('0')),
             graph_ten_million_component,
             (pynutil.delete('0') + graph_million_component),
@@ -124,14 +122,12 @@ def __init__(self, deterministic: bool = True):
             (pynutil.delete('000') + graph_ten_thousand_component),
             (pynutil.delete('0000') + graph_thousand_component),
             ((pynutil.delete('00000') + graph_hundred_component)),
-            (pynini.closure(pynutil.delete('0')) + graph_all),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
         )
         graph_billions = billions @ graph_billions_component
 
         ten_billions = NEMO_DIGIT**12
-        graph_ten_billions_component = (
-            (NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('억')
-        ) + pynini.union(
+        graph_ten_billions_component = ((graph_thousand) + pynutil.insert('억')) + pynini.union(
             pynini.closure(pynutil.delete('0')),
             graph_ten_million_component,
             (pynutil.delete('0') + graph_million_component),
@@ -139,7 +135,7 @@ def __init__(self, deterministic: bool = True):
             (pynutil.delete('000') + graph_ten_thousand_component),
             (pynutil.delete('0000') + graph_thousand_component),
             ((pynutil.delete('00000') + graph_hundred_component)),
-            (pynini.closure(pynutil.delete('0')) + graph_all),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
         )
         graph_ten_billions = ten_billions @ graph_ten_billions_component
 
@@ -156,94 +152,82 @@ def __init__(self, deterministic: bool = True):
             pynutil.delete('0000000') + graph_ten_thousand_component,
             pynutil.delete('00000000') + graph_thousand_component,
             pynutil.delete('000000000') + graph_hundred_component,
-            (pynini.closure(pynutil.delete('0')) + graph_all),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
         )
         graph_hundred_billions = hundred_billions @ graph_hundred_billions_component
-
+
         trillion = NEMO_DIGIT**14
-        graph_trillion_component = (
-            (NEMO_DIGIT**2 @ graph_all)
-            + pynutil.insert('조')
-            + pynini.union(
-                pynini.closure(pynutil.delete('0')),
-                graph_ten_billions_component,
-                pynutil.delete('0') + graph_billions_component,
-                pynutil.delete('00') + graph_thousand_million_component,
-                pynutil.delete('000') + graph_hundred_million_component,
-                pynutil.delete('0000') + graph_ten_million_component,
-                pynutil.delete('00000') + graph_million_component,
-                pynutil.delete('000000') + graph_hundred_thousand_component,
-                pynutil.delete('0000000') + graph_ten_thousand_component,
-                pynutil.delete('00000000') + graph_thousand_component,
-                pynutil.delete('000000000') + graph_hundred_component,
-                (pynini.closure(pynutil.delete('0')) + graph_all),
+        graph_trillion_component = ((NEMO_DIGIT**2 @ graph_1_to_99) + pynutil.insert('조') + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_ten_billions_component,
+            pynutil.delete('0') + graph_billions_component,
+            pynutil.delete('00') + graph_thousand_million_component,
+            pynutil.delete('000') + graph_hundred_million_component,
+            pynutil.delete('0000') + graph_ten_million_component,
+            pynutil.delete('00000') + graph_million_component,
+            pynutil.delete('000000') + graph_hundred_thousand_component,
+            pynutil.delete('0000000') + graph_ten_thousand_component,
+            pynutil.delete('00000000') + graph_thousand_component,
+            pynutil.delete('000000000') + graph_hundred_component,
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99)
             )
         )
         graph_trillions = trillion @ graph_trillion_component
 
         ten_trillions = NEMO_DIGIT**15
-        graph_ten_trillions_component = (
-            (NEMO_DIGIT**3 @ graph_hundred_component)
-            + pynutil.insert('조')
-            + pynini.union(
-                pynini.closure(pynutil.delete('0')),
-                graph_ten_billions_component,
-                pynutil.delete('0') + graph_billions_component,
-                pynutil.delete('00') + graph_thousand_million_component,
-                pynutil.delete('000') + graph_hundred_million_component,
-                pynutil.delete('0000') + graph_ten_million_component,
-                pynutil.delete('00000') + graph_million_component,
-                pynutil.delete('000000') + graph_hundred_thousand_component,
-                pynutil.delete('0000000') + graph_ten_thousand_component,
-                pynutil.delete('00000000') + graph_thousand_component,
-                pynutil.delete('000000000') + graph_hundred_component,
-                (pynini.closure(pynutil.delete('0')) + graph_all),
-            )
+        graph_ten_trillions_component = ((graph_hundred) + pynutil.insert('조') + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_ten_billions_component,
+            pynutil.delete('0') + graph_billions_component,
+            pynutil.delete('00') + graph_thousand_million_component,
+            pynutil.delete('000') + graph_hundred_million_component,
+            pynutil.delete('0000') + graph_ten_million_component,
+            pynutil.delete('00000') + graph_million_component,
+            pynutil.delete('000000') + graph_hundred_thousand_component,
+            pynutil.delete('0000000') + graph_ten_thousand_component,
+            pynutil.delete('00000000') + graph_thousand_component,
+            pynutil.delete('000000000') + graph_hundred_component,
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99)
+            )
         )
         graph_ten_trillions = ten_trillions @ graph_ten_trillions_component
 
         hundred_trillions = NEMO_DIGIT**16
-        graph_hundred_trillions_component = (
-            (NEMO_DIGIT**4 @ graph_thousand_component)
-            + pynutil.insert('조')
-            + pynini.union(
-                pynini.closure(pynutil.delete('0')),
-                graph_ten_billions_component,
-                pynutil.delete('0') + graph_billions_component,
-                pynutil.delete('00') + graph_thousand_million_component,
-                pynutil.delete('000') + graph_hundred_million_component,
-                pynutil.delete('0000') + graph_ten_million_component,
-                pynutil.delete('00000') + graph_million_component,
-                pynutil.delete('000000') + graph_hundred_thousand_component,
-                pynutil.delete('0000000') + graph_ten_thousand_component,
-                pynutil.delete('00000000') + graph_thousand_component,
-                pynutil.delete('000000000') + graph_hundred_component,
-                (pynini.closure(pynutil.delete('0')) + graph_all),
+        graph_hundred_trillions_component = ((graph_thousand) + pynutil.insert('조') + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_ten_billions_component,
+            pynutil.delete('0') + graph_billions_component,
+            pynutil.delete('00') + graph_thousand_million_component,
+            pynutil.delete('000') + graph_hundred_million_component,
+            pynutil.delete('0000') + graph_ten_million_component,
+            pynutil.delete('00000') + graph_million_component,
+            pynutil.delete('000000') + graph_hundred_thousand_component,
+            pynutil.delete('0000000') + graph_ten_thousand_component,
+            pynutil.delete('00000000') + graph_thousand_component,
+            pynutil.delete('000000000') + graph_hundred_component,
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99)
             )
         )
         graph_hundred_trillions = hundred_trillions @ graph_hundred_trillions_component
 
         thousand_trillions = NEMO_DIGIT**17
-        graph_thousand_trillions_component = (
-            graph_digit
-            + pynutil.insert('경')
-            + pynini.union(
-                pynini.closure(pynutil.delete('0')),
-                graph_hundred_trillions_component,
-                pynutil.delete('0') + graph_ten_trillions_component,
-                pynutil.delete('00') + graph_trillion_component,
-                pynutil.delete('000') + graph_hundred_billions_component,
-                pynutil.delete('0000') + graph_ten_billions_component,
-                pynutil.delete('00000') + graph_billions_component,
-                pynutil.delete('000000') + graph_thousand_million_component,
-                pynutil.delete('0000000') + graph_hundred_million_component,
-                pynutil.delete('00000000') + graph_ten_million_component,
-                pynutil.delete('000000000') + graph_million_component,
-                pynutil.delete('0000000000') + graph_hundred_thousand_component,
-                pynutil.delete('00000000000') + graph_ten_thousand_component,
-                pynutil.delete('000000000000') + graph_thousand_component,
-                pynutil.delete('0000000000000') + graph_hundred_component,
-                (pynini.closure(pynutil.delete('0')) + graph_all),
+        graph_thousand_trillions_component = (graph_digit + pynutil.insert('경') + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_hundred_trillions_component,
+            pynutil.delete('0') + graph_ten_trillions_component,
+            pynutil.delete('00') + graph_trillion_component,
+            pynutil.delete('000') + graph_hundred_billions_component,
+            pynutil.delete('0000') + graph_ten_billions_component,
+            pynutil.delete('00000') + graph_billions_component,
+            pynutil.delete('000000') + graph_thousand_million_component,
+            pynutil.delete('0000000') + graph_hundred_million_component,
+            pynutil.delete('00000000') + graph_ten_million_component,
+            pynutil.delete('000000000') + graph_million_component,
+            pynutil.delete('0000000000') + graph_hundred_thousand_component,
+            pynutil.delete('00000000000') + graph_ten_thousand_component,
+            pynutil.delete('000000000000') + graph_thousand_component,
+            pynutil.delete('0000000000000') + graph_hundred_component,
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99)
            )
         )
         graph_thousand_trillions = thousand_trillions @ graph_thousand_trillions_component
@@ -265,7 +249,7 @@ def __init__(self, deterministic: bool = True):
             graph_ten_thousand,
             graph_thousand,
             graph_hundred,
-            graph_all,
+            graph_1_to_99,
             graph_zero,
         ).optimize()
 
diff --git a/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py
index 2b22da370..f9f868953 100644
--- a/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py
+++ b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py
@@ -18,10 +18,7 @@
 from pynini.lib import pynutil
 
 from nemo_text_processing.text_normalization.ko.graph_utils import (
-    NEMO_WHITE_SPACE,
     GraphFst,
-    delete_extra_space,
-    delete_space,
     generator_main,
 )
 
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py b/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py
index 09ec216c2..f5cc8298d 100644
--- a/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py
@@ -18,9 +18,7 @@
 import pynini
 
 from nemo_text_processing.text_normalization.en.graph_utils import (
-    NEMO_NOT_SPACE,
     NEMO_SIGMA,
-    delete_space,
     generator_main,
 )
 from nemo_text_processing.utils.logging import logger
@@ -41,73 +39,15 @@ def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
         far_file = None
         if cache_dir is not None and cache_dir != "None":
             os.makedirs(cache_dir, exist_ok=True)
-            far_file = os.path.join(cache_dir, "zh_tn_post_processing.far")
+            far_file = os.path.join(cache_dir, "ko_tn_post_processing.far")
         if not overwrite_cache and far_file and os.path.exists(far_file):
             self.fst = pynini.Far(far_file, mode="r")["post_process_graph"]
             logger.info(f'Post processing graph was restored from {far_file}.')
         else:
-            self.set_punct_dict()
-            self.fst = self.get_punct_postprocess_graph()
+            self.fst = self.get_postprocess_graph()
 
             if far_file:
                 generator_main(far_file, {"post_process_graph": self.fst})
-
-    def set_punct_dict(self):
-        self.punct_marks = {
-            "'": [
-                "'",
-                '´',
-                'ʹ',
-                'ʻ',
-                'ʼ',
-                'ʽ',
-                'ʾ',
-                'ˈ',
-                'ˊ',
-                'ˋ',
-                '˴',
-                'ʹ',
-                '΄',
-                '՚',
-                '՝',
-                'י',
-                '׳',
-                'ߴ',
-                'ߵ',
-                'ᑊ',
-                'ᛌ',
-                '᾽',
-                '᾿',
-                '`',
-                '´',
-                '῾',
-                '‘',
-                '’',
-                '‛',
-                '′',
-                '‵',
-                'ꞌ',
-                ''',
-                '`',
-                '𖽑',
-                '𖽒',
-            ],
-        }
-
-    def get_punct_postprocess_graph(self):
-        """
-        Returns graph to post process punctuation marks.
-
-        {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept.
-        By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks.
-        """
-
-        remove_space_around_single_quote = pynini.cdrewrite(
-            delete_space, NEMO_NOT_SPACE, NEMO_NOT_SPACE, pynini.closure(NEMO_SIGMA)
-        )
-        # this works if spaces in between (good)
-        # delete space between 2 NEMO_NOT_SPACE(left and right to the space) that are with in a content of NEMO_SIGMA
-
-        graph = remove_space_around_single_quote.optimize()
-
-        return graph
+
+    def get_postprocess_graph(self):
+        return pynini.cdrewrite(pynini.cross("", ""), "", "", pynini.closure(NEMO_SIGMA)).optimize()
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize.py
index 9753db347..8f38048f1 100644
--- a/nemo_text_processing/text_normalization/ko/verbalizers/verbalize.py
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import pynini
-
 from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst
 from nemo_text_processing.text_normalization.ko.verbalizers.cardinal import CardinalFst
 
diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt
index 25dd560d1..40187f74e 100644
--- a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt
+++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt
@@ -16,4 +16,33 @@
 9999999999999~구조구천구백구십구억구천구백구십구만구천구백구십구
 99999999999999~구십구조구천구백구십구억구천구백구십구만구천구백구십구
 999999999999999~구백구십구조구천구백구십구억구천구백구십구만구천구백구십구
-9999999999999999~구천구백구십구조구천구백구십구억구천구백구십구만구천구백구십구
\ No newline at end of file
+9999999999999999~구천구백구십구조구천구백구십구억구천구백구십구만구천구백구십구
+19~십구
+76~칠십육
+379~삼백칠십구
+850~팔백오십
+1004~천사
+8326~팔천삼백이십육
+10383~만삼백팔십삼
+34892~삼만사천팔백구십이
+573234~오십칠만삼천이백삼십사
+982010~구십팔만이천십
+2349023~이백삼십사만구천이십삼
+4303189~사백삼십만삼천백팔십구
+60321589~육천삼십이만천오백팔십구
+88234568~팔천팔백이십삼만사천오백육십팔
+792133923~칠억구천이백십삼만삼천구백이십삼
+187624689~일억팔천칠백육십이만사천육백팔십구
+2304050708~이십삼억사백오만칠백팔
+6436789729~육십사억삼천육백칠십팔만구천칠백이십구
+78234580257~칠백팔십이억삼천사백오십팔만이백오십칠
+987654321345~구천팔백칠십육억오천사백삼십이만천삼백사십오
+2345678901234~이조삼천사백오십육억칠천팔백구십만천이백삼십사
+35791357913579~삼십오조칠천구백십삼억오천칠백구십일만삼천오백칠십구
+470369258147036~사백칠십조삼천육백구십이억오천팔백십사만칠천삼십육
+5048258149517395~오천사십팔조이천오백팔십일억사천구백오십일만칠천삼백구십오
+67890123045607890~육경칠천팔백구십조천이백삼십억사천오백육십만칠천팔백구십
+-2~마이너스 이
+-93~마이너스 구십삼
+-90325~마이너스 구만삼백이십오
+-3234567~마이너스 삼백이십삼만사천오백육십칠
\ No newline at end of file
diff --git a/tests/nemo_text_processing/ko/test_cardinal.py b/tests/nemo_text_processing/ko/test_cardinal.py
index ed422e13e..763b7e607 100644
--- a/tests/nemo_text_processing/ko/test_cardinal.py
+++ b/tests/nemo_text_processing/ko/test_cardinal.py
@@ -15,10 +15,9 @@
 import pytest
 from parameterized import parameterized
 
-from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
 from nemo_text_processing.text_normalization.normalize import Normalizer
 
-from ..utils import CACHE_DIR, parse_test_case_file
+from ..utils import parse_test_case_file
 
 
 class TestCardinal:
diff --git a/tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh
index 9a50509cf..8c14c0336 100644
--- a/tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh
+++ b/tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh
@@ -31,91 +31,6 @@ testTNCardinal() {
   runtest $input
 }
 
-#testTNSpecialText() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_special_text.txt
-#  runtest $input
-#}
-
-#testTNDate() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_date.txt
-#  runtest $input
-#}
-
-#testTNDecimal() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_decimal.txt
-#  runtest $input
-#}
-
-#testTNRange() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_range.txt
-#  runtest $input
-#}
-
-#testTNSerial() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_serial.txt
-#  runtest $input
-#}
-
-#testTNRoman() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_roman.txt
-#  runtest $input
-#}
-
-#testTNElectronic() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_electronic.txt
-#  runtest $input
-#}
-
-#testTNFraction() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_fraction.txt
-#  runtest $input
-#}
-
-#testTNMoney() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_money.txt
-#  runtest $input
-#}
-
-#testTNOrdinal() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_ordinal.txt
-#  runtest $input
-#}
-
-#testTNTelephone() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_telephone.txt
-#  runtest $input
-#}
-
-#testTNTime() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_time.txt
-#  runtest $input
-#}
-
-#testTNMeasure() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_measure.txt
-#  runtest $input
-#}
-
-#testTNWhitelist() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_whitelist.txt
-#  runtest $input
-#}
-
-#testTNWord() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_word.txt
-#  runtest $input
-#}
-
-#testTNAddress() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_address.txt
-#  runtest $input
-#}
-
-#testTNMath() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_math.txt
-#  runtest $input
-#}
-
 # Remove all command-line arguments
 shift $#
 

From 90513790bcdf47b1cd03dd6061cb5974ed0aa10c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 24 May 2025 03:36:29 +0000
Subject: [PATCH 4/6] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../text_normalization/ko/taggers/cardinal.py | 144 ++++++++++--------
 .../ko/taggers/tokenize_and_classify.py       |   6 +-
 .../ko/verbalizers/post_processing.py         |   7 +-
 3 files changed, 82 insertions(+), 75 deletions(-)

diff --git a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py
index 32b53855f..db530c931 100644
--- a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py
+++ b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py
@@ -29,21 +29,23 @@ def __init__(self, deterministic: bool = True):
         digit_except_one = pynini.difference(NEMO_DIGIT, "1")
         digit_except_zero_one = pynini.difference(digit_except_one, "0")
-
+
         graph_digit_no_zero_one = digit_except_zero_one @ graph_digit
-        graph_ty = pynini.string_file(get_abs_path("data/number/ty.tsv")) 
+        graph_ty = pynini.string_file(get_abs_path("data/number/ty.tsv"))
 
         # Compose all basic number forms
         graph_1_to_99 = (graph_ty + (graph_digit | pynutil.delete('0'))) | graph_digit
 
         hundreds = NEMO_DIGIT**3
-        graph_hundred_component = (pynini.cross('1', '백') | (graph_digit_no_zero_one + pynutil.insert('백'))) + pynini.union(
-            pynini.closure(pynutil.delete('0')), (pynini.closure(pynutil.delete('0')) + graph_1_to_99)
-        )
+        graph_hundred_component = (
+            pynini.cross('1', '백') | (graph_digit_no_zero_one + pynutil.insert('백'))
+        ) + pynini.union(pynini.closure(pynutil.delete('0')), (pynini.closure(pynutil.delete('0')) + graph_1_to_99))
         graph_hundred = hundreds @ graph_hundred_component
 
         thousands = NEMO_DIGIT**4
-        graph_thousand_component = (pynini.cross('1', '천') | (graph_digit_no_zero_one + pynutil.insert('천'))) + pynini.union(
+        graph_thousand_component = (
+            pynini.cross('1', '천') | (graph_digit_no_zero_one + pynutil.insert('천'))
+        ) + pynini.union(
             pynini.closure(pynutil.delete('0')),
             graph_hundred_component,
             (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
@@ -60,8 +62,8 @@ def __init__(self, deterministic: bool = True):
         graph_ten_thousand = ten_thousands @ graph_ten_thousand_component
 
         hundred_thousands = NEMO_DIGIT**6
-
-        graph_hundred_thousand_component = ((NEMO_DIGIT ** 2 @ graph_1_to_99) + pynutil.insert('만')) + pynini.union(
+
+        graph_hundred_thousand_component = ((NEMO_DIGIT**2 @ graph_1_to_99) + pynutil.insert('만')) + pynini.union(
@@ -157,77 +159,89 @@ def __init__(self, deterministic: bool = True):
         graph_hundred_billions = hundred_billions @ graph_hundred_billions_component
 
         trillion = NEMO_DIGIT**14
-        graph_trillion_component = ((NEMO_DIGIT**2 @ graph_1_to_99) + pynutil.insert('조') + pynini.union(
-            pynini.closure(pynutil.delete('0')),
-            graph_ten_billions_component,
-            pynutil.delete('0') + graph_billions_component,
-            pynutil.delete('00') + graph_thousand_million_component,
-            pynutil.delete('000') + graph_hundred_million_component,
-            pynutil.delete('0000') + graph_ten_million_component,
-            pynutil.delete('00000') + graph_million_component,
-            pynutil.delete('000000') + graph_hundred_thousand_component,
-            pynutil.delete('0000000') + graph_ten_thousand_component,
-            pynutil.delete('00000000') + graph_thousand_component,
-            pynutil.delete('000000000') + graph_hundred_component,
-            (pynini.closure(pynutil.delete('0')) + graph_1_to_99)
+        graph_trillion_component = (
+            (NEMO_DIGIT**2 @ graph_1_to_99)
+            + pynutil.insert('조')
+            + pynini.union(
+                pynini.closure(pynutil.delete('0')),
+                graph_ten_billions_component,
+                pynutil.delete('0') + graph_billions_component,
+                pynutil.delete('00') + graph_thousand_million_component,
+                pynutil.delete('000') + graph_hundred_million_component,
+                pynutil.delete('0000') + graph_ten_million_component,
+                pynutil.delete('00000') + graph_million_component,
+                pynutil.delete('000000') + graph_hundred_thousand_component,
+                pynutil.delete('0000000') + graph_ten_thousand_component,
+                pynutil.delete('00000000') + graph_thousand_component,
+                pynutil.delete('000000000') + graph_hundred_component,
+                (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
             )
         )
         graph_trillions = trillion @ graph_trillion_component
 
         ten_trillions = NEMO_DIGIT**15
-        graph_ten_trillions_component = ((graph_hundred) + pynutil.insert('조') + pynini.union(
-            pynini.closure(pynutil.delete('0')),
-            graph_ten_billions_component,
-            pynutil.delete('0') + graph_billions_component,
-            pynutil.delete('00') + graph_thousand_million_component,
-            pynutil.delete('000') + graph_hundred_million_component,
-            pynutil.delete('0000') + graph_ten_million_component,
-            pynutil.delete('00000') + graph_million_component,
-            pynutil.delete('000000') + graph_hundred_thousand_component,
-            pynutil.delete('0000000') + graph_ten_thousand_component,
-            pynutil.delete('00000000') + graph_thousand_component,
-            pynutil.delete('000000000') + graph_hundred_component,
-            (pynini.closure(pynutil.delete('0')) + graph_1_to_99)
-            )
+        graph_ten_trillions_component = (
+            (graph_hundred)
+            + pynutil.insert('조')
+            + pynini.union(
+                pynini.closure(pynutil.delete('0')),
+                graph_ten_billions_component,
+                pynutil.delete('0') + graph_billions_component,
+                pynutil.delete('00') + graph_thousand_million_component,
+                pynutil.delete('000') + graph_hundred_million_component,
+                pynutil.delete('0000') + graph_ten_million_component,
+                pynutil.delete('00000') + graph_million_component,
+                pynutil.delete('000000') + graph_hundred_thousand_component,
+                pynutil.delete('0000000') + graph_ten_thousand_component,
+                pynutil.delete('00000000') + graph_thousand_component,
+                pynutil.delete('000000000') + graph_hundred_component,
+                (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
graph_1_to_99) - ) + graph_ten_trillions_component = ( + (graph_hundred) + + pynutil.insert('조') + + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_billions_component, + pynutil.delete('0') + graph_billions_component, + pynutil.delete('00') + graph_thousand_million_component, + pynutil.delete('000') + graph_hundred_million_component, + pynutil.delete('0000') + graph_ten_million_component, + pynutil.delete('00000') + graph_million_component, + pynutil.delete('000000') + graph_hundred_thousand_component, + pynutil.delete('0000000') + graph_ten_thousand_component, + pynutil.delete('00000000') + graph_thousand_component, + pynutil.delete('000000000') + graph_hundred_component, + (pynini.closure(pynutil.delete('0')) + graph_1_to_99), + ) ) graph_ten_trillions = ten_trillions @ graph_ten_trillions_component hundred_trillions = NEMO_DIGIT**16 - graph_hundred_trillions_component = ((graph_thousand) + pynutil.insert('조') + pynini.union( - pynini.closure(pynutil.delete('0')), - graph_ten_billions_component, - pynutil.delete('0') + graph_billions_component, - pynutil.delete('00') + graph_thousand_million_component, - pynutil.delete('000') + graph_hundred_million_component, - pynutil.delete('0000') + graph_ten_million_component, - pynutil.delete('00000') + graph_million_component, - pynutil.delete('000000') + graph_hundred_thousand_component, - pynutil.delete('0000000') + graph_ten_thousand_component, - pynutil.delete('00000000') + graph_thousand_component, - pynutil.delete('000000000') + graph_hundred_component, - (pynini.closure(pynutil.delete('0')) + graph_1_to_99) + graph_hundred_trillions_component = ( + (graph_thousand) + + pynutil.insert('조') + + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_billions_component, + pynutil.delete('0') + graph_billions_component, + pynutil.delete('00') + graph_thousand_million_component, + pynutil.delete('000') + graph_hundred_million_component, + pynutil.delete('0000') + graph_ten_million_component, + pynutil.delete('00000') + graph_million_component, + pynutil.delete('000000') + graph_hundred_thousand_component, + pynutil.delete('0000000') + graph_ten_thousand_component, + pynutil.delete('00000000') + graph_thousand_component, + pynutil.delete('000000000') + graph_hundred_component, + (pynini.closure(pynutil.delete('0')) + graph_1_to_99), ) ) graph_hundred_trillions = hundred_trillions @ graph_hundred_trillions_component thousand_trillions = NEMO_DIGIT**17 - graph_thousand_trillions_component = (graph_digit + pynutil.insert('경') + pynini.union( - pynini.closure(pynutil.delete('0')), - graph_hundred_trillions_component, - pynutil.delete('0') + graph_ten_trillions_component, - pynutil.delete('00') + graph_trillion_component, - pynutil.delete('000') + graph_hundred_billions_component, - pynutil.delete('0000') + graph_ten_billions_component, - pynutil.delete('00000') + graph_billions_component, - pynutil.delete('000000') + graph_thousand_million_component, - pynutil.delete('0000000') + graph_hundred_million_component, - pynutil.delete('00000000') + graph_ten_million_component, - pynutil.delete('000000000') + graph_million_component, - pynutil.delete('0000000000') + graph_hundred_thousand_component, - pynutil.delete('00000000000') + graph_ten_thousand_component, - pynutil.delete('000000000000') + graph_thousand_component, - pynutil.delete('0000000000000') + graph_hundred_component, - (pynini.closure(pynutil.delete('0')) + graph_1_to_99) + graph_thousand_trillions_component = ( + graph_digit + + pynutil.insert('경') + + 
+                pynini.closure(pynutil.delete('0')),
+                graph_hundred_trillions_component,
+                pynutil.delete('0') + graph_ten_trillions_component,
+                pynutil.delete('00') + graph_trillion_component,
+                pynutil.delete('000') + graph_hundred_billions_component,
+                pynutil.delete('0000') + graph_ten_billions_component,
+                pynutil.delete('00000') + graph_billions_component,
+                pynutil.delete('000000') + graph_thousand_million_component,
+                pynutil.delete('0000000') + graph_hundred_million_component,
+                pynutil.delete('00000000') + graph_ten_million_component,
+                pynutil.delete('000000000') + graph_million_component,
+                pynutil.delete('0000000000') + graph_hundred_thousand_component,
+                pynutil.delete('00000000000') + graph_ten_thousand_component,
+                pynutil.delete('000000000000') + graph_thousand_component,
+                pynutil.delete('0000000000000') + graph_hundred_component,
+                (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
             )
         )
         graph_thousand_trillions = thousand_trillions @ graph_thousand_trillions_component
diff --git a/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py
index f9f868953..0676446e5 100644
--- a/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py
+++ b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py
@@ -17,11 +17,7 @@
 import pynini
 from pynini.lib import pynutil
 
-from nemo_text_processing.text_normalization.ko.graph_utils import (
-    GraphFst,
-    generator_main,
-)
-
+from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst, generator_main
 from nemo_text_processing.text_normalization.ko.taggers.cardinal import CardinalFst
 from nemo_text_processing.utils.logging import logger
 
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py b/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py
index f5cc8298d..7ba146cff 100644
--- a/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py
@@ -17,10 +17,7 @@
 
 import pynini
 
-from nemo_text_processing.text_normalization.en.graph_utils import (
-    NEMO_SIGMA,
-    generator_main,
-)
+from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, generator_main
 from nemo_text_processing.utils.logging import logger
 
 
@@ -48,6 +45,6 @@ def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
 
             if far_file:
                 generator_main(far_file, {"post_process_graph": self.fst})
-
+
     def get_postprocess_graph(self):
         return pynini.cdrewrite(pynini.cross("", ""), "", "", pynini.closure(NEMO_SIGMA)).optimize()

From 54781489cd1e69ae83e387f85602c74efc9e05b8 Mon Sep 17 00:00:00 2001
From: Jinwoo Bae
Date: Sun, 25 May 2025 12:52:50 -0700
Subject: [PATCH 5/6] Add __init__.py to ko/data directory

Signed-off-by: Jinwoo Bae
---
 .../text_normalization/ko/data/__init__.py    | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 nemo_text_processing/text_normalization/ko/data/__init__.py

diff --git a/nemo_text_processing/text_normalization/ko/data/__init__.py b/nemo_text_processing/text_normalization/ko/data/__init__.py
new file mode 100644
index 000000000..341a77c5b
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/data/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. From 833c7b90c90a4e05bb54e1dc1e6fbec1b0efef4e Mon Sep 17 00:00:00 2001 From: Jinwoo Bae Date: Tue, 3 Jun 2025 10:49:57 -0700 Subject: [PATCH 6/6] Update KO_TN_CACHE to trigger Korean CI run Signed-off-by: Jinwoo Bae --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index c3339c7bc..253af49c2 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -28,7 +28,7 @@ pipeline { MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-22-25-0' - KO_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/05-21-25-0' + KO_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-03-25-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages {
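Usage note: once this series is applied, the Korean grammars can be exercised end to end through the same Normalizer entry point the Jenkins stages invoke above. A minimal sketch in Python — the cache directory below is illustrative, not the CI path, and building the grammars from scratch on the first run can take a while:

    from nemo_text_processing.text_normalization.normalize import Normalizer

    # Build the Korean TN grammars, or restore them from cache_dir
    # (the CI stages do the same thing with KO_TN_CACHE).
    normalizer = Normalizer(input_case='cased', lang='ko', cache_dir='/tmp/ko_tn_cache')

    # Per the test cases added in PATCH 3/6, '-90325' should verbalize
    # to '마이너스 구만삼백이십오'.
    print(normalizer.normalize('-90325'))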