diff --git a/nemo_text_processing/text_normalization/ko/__init__.py b/nemo_text_processing/text_normalization/ko/__init__.py new file mode 100644 index 000000000..dd0e509b3 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst +from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/text_normalization/ko/data/number/__init__.py b/nemo_text_processing/text_normalization/ko/data/number/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/number/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/ko/data/number/digit.tsv b/nemo_text_processing/text_normalization/ko/data/number/digit.tsv new file mode 100644 index 000000000..61a7dddcf --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/number/digit.tsv @@ -0,0 +1,9 @@ +1 일 +2 이 +3 삼 +4 사 +5 오 +6 육 +7 칠 +8 팔 +9 구 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/number/teen.tsv b/nemo_text_processing/text_normalization/ko/data/number/teen.tsv new file mode 100644 index 000000000..432fe5eb6 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/number/teen.tsv @@ -0,0 +1,10 @@ +10 십 +11 십일 +12 십이 +13 십삼 +14 십사 +15 십오 +16 십육 +17 십칠 +18 십팔 +19 십구 diff --git a/nemo_text_processing/text_normalization/ko/data/number/ty.tsv b/nemo_text_processing/text_normalization/ko/data/number/ty.tsv new file mode 100644 index 000000000..02623c44c --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/number/ty.tsv @@ -0,0 +1,8 @@ +2 이십 +3 삼십 +4 사십 +5 오십 +6 육십 +7 칠십 +8 팔십 +9 구십 diff --git a/nemo_text_processing/text_normalization/ko/data/number/zero.tsv b/nemo_text_processing/text_normalization/ko/data/number/zero.tsv new file mode 100644 index 000000000..7024c0534 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/number/zero.tsv @@ -0,0 +1 @@ +0 영 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/graph_utils.py b/nemo_text_processing/text_normalization/ko/graph_utils.py new file mode 100644 index 000000000..a7ffdd2b1 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/graph_utils.py @@ -0,0 +1,173 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import string +from pathlib import Path +from typing import Dict + +import pynini +from pynini import Far +from pynini.export import export +from pynini.lib import byte, pynutil, utf8 + +from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels +from nemo_text_processing.utils.logging import logger + +NEMO_CHAR = utf8.VALID_UTF8_CHAR + +NEMO_DIGIT = byte.DIGIT +NEMO_ALPHA = pynini.union(*[chr(i) for i in range(ord('가'), ord('힣') + 1)]).optimize() +NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize() +NEMO_HEX = pynini.union(*string.hexdigits).optimize() +NEMO_SPACE = " " +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00a0").optimize() +NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() +NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() + +NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize() +NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize() + +NEMO_SIGMA = pynini.closure(NEMO_CHAR) + +delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE)) +delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1)) +insert_space = pynutil.insert(" ") +delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ") +delete_preserve_order = pynini.closure( + pynutil.delete(" preserve_order: true") + | (pynutil.delete(' field_order: "') + NEMO_NOT_QUOTE + 
pynutil.delete('"')) +) + + +# Common string literals; expand as you see fit. +username_string = "username" +double_quotes = '"' +domain_string = "domain" +protocol_string = "protocol" +slash = "/" +double_slash = "//" +triple_slash = "///" +file = "file" +period = "." +at = "@" +colon = ":" +https = "https" +http = "http" +www = "www" + + +def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]): + """ + Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name. + + Args: + file_name: exported file name + graphs: Mapping of a rule name and Pynini WFST graph to be exported + """ + exporter = export.Exporter(file_name) + for rule, graph in graphs.items(): + exporter[rule] = graph.optimize() + exporter.close() + logger.info(f"Created {file_name}") + + +def convert_space(fst) -> "pynini.FstLike": + """ + Converts space to nonbreaking space. + Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty" + This is making transducer significantly slower, so only use when there could be potential spaces within quotes, otherwise leave it. + + Args: + fst: input fst + + Returns output fst where breaking spaces are converted to non breaking spaces + """ + return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, "\u00a0"), "", "", NEMO_SIGMA) + + +def string_map_cased(input_file: str, input_case: str = "lower_cased"): + labels = load_labels(input_file) + whitelist = pynini.string_map(labels).invert().optimize() + return whitelist + + +class GraphFst: + """ + Base class for all grammar fsts. 
+ + Args: + name: name of grammar class + kind: either 'classify' or 'verbalize' + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, name: str, kind: str, deterministic: bool = True): + self.name = name + self.kind = kind + self._fst = None + self.deterministic = deterministic + + self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far") + if self.far_exist(): + self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst() + + def far_exist(self) -> bool: + """ + Returns true if FAR can be loaded + """ + return self.far_path.exists() + + @property + def fst(self) -> "pynini.FstLike": + return self._fst + + @fst.setter + def fst(self, fst): + self._fst = fst + + def add_tokens(self, fst) -> "pynini.FstLike": + """ + Wraps class name around to given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }") + + def delete_tokens(self, fst) -> "pynini.FstLike": + """ + Deletes class name wrap around output of given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + res = ( + pynutil.delete(f"{self.name}") + + delete_space + + pynutil.delete("{") + + delete_space + + fst + + delete_space + + pynutil.delete("}") + ) + return res @ pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/text_normalization/ko/taggers/__init__.py b/nemo_text_processing/text_normalization/ko/taggers/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/taggers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py new file mode 100644 index 000000000..51c82e213 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py @@ -0,0 +1,276 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst +from nemo_text_processing.text_normalization.ko.utils import get_abs_path + + +class CardinalFst(GraphFst): + def __init__(self, deterministic: bool = True): + super().__init__(name="cardinal", kind="classify", deterministic=deterministic) + # Load base .tsv files + graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv")) + graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv")) + + digit_except_one = pynini.difference(NEMO_DIGIT, "1") + digit_except_zero_one = pynini.difference(digit_except_one, "0") + + graph_digit_alt = digit_except_zero_one @ graph_digit + graph_ty = pynini.string_file(get_abs_path("data/number/ty.tsv")) + graph_teen = pynini.string_file(get_abs_path("data/number/teen.tsv")) + + # Compose all basic number forms + graph_all = (graph_ty + (graph_digit | pynutil.delete('0'))) | graph_teen | graph_digit + + hundreds = NEMO_DIGIT**3 + graph_hundred_component = (pynini.cross('1', '백') | (graph_digit_alt + pynutil.insert('백'))) + pynini.union( + pynini.closure(pynutil.delete('0')), (pynini.closure(pynutil.delete('0')) + graph_all) + ) + graph_hundred = hundreds @ graph_hundred_component + + thousands = NEMO_DIGIT**4 + graph_thousand_component = (pynini.cross('1', '천') | (graph_digit_alt + pynutil.insert('천'))) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_hundred_component, + (pynini.closure(pynutil.delete('0')) + graph_all), + ) + graph_thousand = thousands @ graph_thousand_component + + ten_thousands = NEMO_DIGIT**5 + graph_ten_thousand_component = (pynini.cross('1', '만') | (graph_digit + pynutil.insert('만'))) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_thousand_component, + (pynutil.delete('0') + graph_hundred_component), + (pynini.closure(pynutil.delete('0')) + graph_all), + ) + graph_ten_thousand = ten_thousands @ graph_ten_thousand_component 
+ + hundred_thousands = NEMO_DIGIT**6 + graph_hundred_thousand_component = ((NEMO_DIGIT**2 @ graph_all) + pynutil.insert('만')) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_thousand_component, + (pynutil.delete('0') + graph_hundred_component), + (pynini.closure(pynutil.delete('0')) + graph_all), + ) + graph_hundred_thousand = hundred_thousands @ graph_hundred_thousand_component + + millions = NEMO_DIGIT**7 + graph_million_component = ((NEMO_DIGIT**3 @ graph_hundred_component) + pynutil.insert('만')) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_thousand_component, + (pynutil.delete('0') + graph_hundred_component), + (pynini.closure(pynutil.delete('0')) + graph_all), + ) + graph_million = millions @ graph_million_component + + ten_millions = NEMO_DIGIT**8 + graph_ten_million_component = ( + (NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('만') + ) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_thousand_component, + (pynutil.delete('0') + graph_hundred_component), + (pynini.closure(pynutil.delete('0')) + graph_all), + ) + graph_ten_million = ten_millions @ graph_ten_million_component + + hundred_millions = NEMO_DIGIT**9 + graph_hundred_million_component = (graph_digit + pynutil.insert('억')) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_million_component, + (pynutil.delete('0') + graph_million_component), + (pynutil.delete('00') + graph_hundred_thousand_component), + (pynutil.delete('000') + graph_ten_thousand_component), + (pynutil.delete('0000') + graph_thousand_component), + ((pynutil.delete('00000') + graph_hundred_component)), + (pynini.closure(pynutil.delete('0')) + graph_all), + ) + graph_hundred_million = hundred_millions @ graph_hundred_million_component + + thousand_millions = NEMO_DIGIT**10 + graph_thousand_million_component = ((NEMO_DIGIT**2 @ graph_all) + pynutil.insert('억')) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_million_component, + 
(pynutil.delete('0') + graph_million_component), + (pynutil.delete('00') + graph_hundred_thousand_component), + (pynutil.delete('000') + graph_ten_thousand_component), + (pynutil.delete('0000') + graph_thousand_component), + ((pynutil.delete('00000') + graph_hundred_component)), + (pynini.closure(pynutil.delete('0')) + graph_all), + ) + graph_thousand_million = thousand_millions @ graph_thousand_million_component + + billions = NEMO_DIGIT**11 + graph_billions_component = ((NEMO_DIGIT**3 @ graph_hundred_component) + pynutil.insert('억')) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_million_component, + (pynutil.delete('0') + graph_million_component), + (pynutil.delete('00') + graph_hundred_thousand_component), + (pynutil.delete('000') + graph_ten_thousand_component), + (pynutil.delete('0000') + graph_thousand_component), + ((pynutil.delete('00000') + graph_hundred_component)), + (pynini.closure(pynutil.delete('0')) + graph_all), + ) + graph_billions = billions @ graph_billions_component + + ten_billions = NEMO_DIGIT**12 + graph_ten_billions_component = ( + (NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('억') + ) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_million_component, + (pynutil.delete('0') + graph_million_component), + (pynutil.delete('00') + graph_hundred_thousand_component), + (pynutil.delete('000') + graph_ten_thousand_component), + (pynutil.delete('0000') + graph_thousand_component), + ((pynutil.delete('00000') + graph_hundred_component)), + (pynini.closure(pynutil.delete('0')) + graph_all), + ) + graph_ten_billions = ten_billions @ graph_ten_billions_component + + hundred_billions = NEMO_DIGIT**13 + graph_hundred_billions_component = (graph_digit + pynutil.insert('조')) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_billions_component, + pynutil.delete('0') + graph_billions_component, + pynutil.delete('00') + graph_thousand_million_component, + pynutil.delete('000') + 
graph_hundred_million_component, + pynutil.delete('0000') + graph_ten_million_component, + pynutil.delete('00000') + graph_million_component, + pynutil.delete('000000') + graph_hundred_thousand_component, + pynutil.delete('0000000') + graph_ten_thousand_component, + pynutil.delete('00000000') + graph_thousand_component, + pynutil.delete('000000000') + graph_hundred_component, + (pynini.closure(pynutil.delete('0')) + graph_all), + ) + graph_hundred_billions = hundred_billions @ graph_hundred_billions_component + + trillion = NEMO_DIGIT**14 + graph_trillion_component = ( + (NEMO_DIGIT**2 @ graph_all) + + pynutil.insert('조') + + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_billions_component, + pynutil.delete('0') + graph_billions_component, + pynutil.delete('00') + graph_thousand_million_component, + pynutil.delete('000') + graph_hundred_million_component, + pynutil.delete('0000') + graph_ten_million_component, + pynutil.delete('00000') + graph_million_component, + pynutil.delete('000000') + graph_hundred_thousand_component, + pynutil.delete('0000000') + graph_ten_thousand_component, + pynutil.delete('00000000') + graph_thousand_component, + pynutil.delete('000000000') + graph_hundred_component, + (pynini.closure(pynutil.delete('0')) + graph_all), + ) + ) + graph_trillions = trillion @ graph_trillion_component + + ten_trillions = NEMO_DIGIT**15 + graph_ten_trillions_component = ( + (NEMO_DIGIT**3 @ graph_hundred_component) + + pynutil.insert('조') + + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_billions_component, + pynutil.delete('0') + graph_billions_component, + pynutil.delete('00') + graph_thousand_million_component, + pynutil.delete('000') + graph_hundred_million_component, + pynutil.delete('0000') + graph_ten_million_component, + pynutil.delete('00000') + graph_million_component, + pynutil.delete('000000') + graph_hundred_thousand_component, + pynutil.delete('0000000') + graph_ten_thousand_component, + 
pynutil.delete('00000000') + graph_thousand_component, + pynutil.delete('000000000') + graph_hundred_component, + (pynini.closure(pynutil.delete('0')) + graph_all), + ) + ) + graph_ten_trillions = ten_trillions @ graph_ten_trillions_component + + hundred_trillions = NEMO_DIGIT**16 + graph_hundred_trillions_component = ( + (NEMO_DIGIT**4 @ graph_thousand_component) + + pynutil.insert('조') + + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_billions_component, + pynutil.delete('0') + graph_billions_component, + pynutil.delete('00') + graph_thousand_million_component, + pynutil.delete('000') + graph_hundred_million_component, + pynutil.delete('0000') + graph_ten_million_component, + pynutil.delete('00000') + graph_million_component, + pynutil.delete('000000') + graph_hundred_thousand_component, + pynutil.delete('0000000') + graph_ten_thousand_component, + pynutil.delete('00000000') + graph_thousand_component, + pynutil.delete('000000000') + graph_hundred_component, + (pynini.closure(pynutil.delete('0')) + graph_all), + ) + ) + graph_hundred_trillions = hundred_trillions @ graph_hundred_trillions_component + + thousand_trillions = NEMO_DIGIT**17 + graph_thousand_trillions_component = ( + graph_digit + + pynutil.insert('경') + + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_hundred_trillions_component, + pynutil.delete('0') + graph_ten_trillions_component, + pynutil.delete('00') + graph_trillion_component, + pynutil.delete('000') + graph_hundred_billions_component, + pynutil.delete('0000') + graph_ten_billions_component, + pynutil.delete('00000') + graph_billions_component, + pynutil.delete('000000') + graph_thousand_million_component, + pynutil.delete('0000000') + graph_hundred_million_component, + pynutil.delete('00000000') + graph_ten_million_component, + pynutil.delete('000000000') + graph_million_component, + pynutil.delete('0000000000') + graph_hundred_thousand_component, + pynutil.delete('00000000000') + 
graph_ten_thousand_component, + pynutil.delete('000000000000') + graph_thousand_component, + pynutil.delete('0000000000000') + graph_hundred_component, + (pynini.closure(pynutil.delete('0')) + graph_all), + ) + ) + graph_thousand_trillions = thousand_trillions @ graph_thousand_trillions_component + + # FST + graph_num = pynini.union( + graph_thousand_trillions, + graph_hundred_trillions, + graph_ten_trillions, + graph_trillions, + graph_hundred_billions, + graph_ten_billions, + graph_billions, + graph_thousand_million, + graph_hundred_million, + graph_ten_million, + graph_million, + graph_hundred_thousand, + graph_ten_thousand, + graph_thousand, + graph_hundred, + graph_all, + graph_zero, + ).optimize() + + # Sign and final formatting + optional_sign = pynini.closure(pynutil.insert('negative: "true" ') + pynini.cross("-", ""), 0, 1) + final_graph = optional_sign + pynutil.insert('integer: "') + graph_num + pynutil.insert('"') + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py new file mode 100644 index 000000000..2b22da370 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py @@ -0,0 +1,75 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import ( + NEMO_WHITE_SPACE, + GraphFst, + delete_extra_space, + delete_space, + generator_main, +) + +from nemo_text_processing.text_normalization.ko.taggers.cardinal import CardinalFst +from nemo_text_processing.utils.logging import logger + + +class ClassifyFst(GraphFst): + """ + Final class that composes all other classification grammars. This class can process an entire sentence including punctuation. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + More details to deployment at NeMo/tools/text_processing_deployment. + + Args: + input_case: accepting either "lower_cased" or "cased" input. + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. 
+ overwrite_cache: set to True to overwrite .far files + whitelist: path to a file with whitelist replacements + """ + + def __init__( + self, + input_case: str = "cased", + deterministic: bool = True, + cache_dir: str = None, + overwrite_cache: bool = False, + whitelist: str = None, + ): + super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic) + + far_file = None + if cache_dir is not None and cache_dir != "None": + os.makedirs(cache_dir, exist_ok=True) + far_file = os.path.join(cache_dir, f"ko_tn_{deterministic}_tokenize.far") + if not overwrite_cache and far_file and os.path.exists(far_file): + self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] + logger.info(f"ClassifyFst.fst was restored from {far_file}.") + else: + cardinal = CardinalFst(deterministic=deterministic) + + classify = pynini.union(pynutil.add_weight(cardinal.fst, 1.1)) + + token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") + tagger = pynini.closure(token, 1) + + self.fst = tagger.optimize() + + if far_file: + generator_main(far_file, {"tokenize_and_classify": self.fst}) diff --git a/nemo_text_processing/text_normalization/ko/utils.py b/nemo_text_processing/text_normalization/ko/utils.py new file mode 100644 index 000000000..51aaea3e8 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/utils.py @@ -0,0 +1,60 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import csv +import os + + +def get_abs_path(rel_path): + """ + Get absolute path + + Args: + rel_path: relative path to this file + + Returns absolute path + """ + return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path + + +def load_labels(abs_path): + """ + loads relative path file as dictionary + + Args: + abs_path: absolute path + + Returns dictionary of mappings + """ + with open(abs_path, encoding="utf-8") as label_tsv: + labels = list(csv.reader(label_tsv, delimiter="\t")) + return labels + + +def augment_labels_with_punct_at_end(labels): + """ + augments labels: if key ends on a punctuation that value does not have, add a new label + where the value maintains the punctuation + + Args: + labels : input labels + Returns: + additional labels + """ + res = [] + for label in labels: + if len(label) > 1: + if label[0][-1] == "." and label[1][-1] != ".": + res.append([label[0], label[1] + "."] + label[2:]) + return res diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/__init__.py b/nemo_text_processing/text_normalization/ko/verbalizers/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/verbalizers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/ko/verbalizers/cardinal.py new file mode 100644 index 000000000..c6a48ab33 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/verbalizers/cardinal.py @@ -0,0 +1,48 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + + +class CardinalFst(GraphFst): + """ + Finite state transducer for verbalizing cardinal, e.g. 
+ cardinal { negative: "true" integer: "이십삼" } -> 마이너스 이십삼 + + Args: + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="cardinal", kind="verbalize", deterministic=deterministic) + + self.optional_sign = pynini.cross("negative: \"true\"", "마이너스 ") + if not deterministic: + self.optional_sign |= pynini.cross("negative: \"true\"", "음수 ") + self.optional_sign |= pynini.cross("negative: \"true\"", "- ") + + self.optional_sign = pynini.closure(self.optional_sign + delete_space, 0, 1) + + integer = pynini.closure(NEMO_NOT_QUOTE) + + self.integer = delete_space + pynutil.delete("\"") + integer + pynutil.delete("\"") + integer = pynutil.delete("integer:") + self.integer + + self.numbers = self.optional_sign + integer + delete_tokens = self.delete_tokens(self.numbers) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize.py new file mode 100644 index 000000000..9753db347 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize.py @@ -0,0 +1,38 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini + +from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst +from nemo_text_processing.text_normalization.ko.verbalizers.cardinal import CardinalFst + + +class VerbalizeFst(GraphFst): + """ + Composes other verbalizer grammars. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + More details to deployment at NeMo/tools/text_processing_deployment. + + Args: + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="verbalize", kind="verbalize", deterministic=deterministic) + + cardinal = CardinalFst(deterministic=deterministic) + cardinal_graph = cardinal.fst + + self.fst = cardinal_graph.optimize() diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py new file mode 100644 index 000000000..9a4e2f7bf --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py @@ -0,0 +1,74 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import ( + GraphFst, + delete_extra_space, + delete_space, + generator_main, +) +from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.utils.logging import logger + + +class VerbalizeFinalFst(GraphFst): + """ + Finite state transducer that verbalizes an entire sentence, e.g. + tokens { name: "its" } tokens { time { hours: "twelve" minutes: "thirty" } } tokens { name: "now" } tokens { name: "." } -> its twelve thirty now . + + Args: + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. + overwrite_cache: set to True to overwrite .far files + """ + + def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False): + super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic) + + far_file = None + if cache_dir is not None and cache_dir != "None": + os.makedirs(cache_dir, exist_ok=True) + far_file = os.path.join(cache_dir, f"ko_tn_{deterministic}_verbalizer.far") + if not overwrite_cache and far_file and os.path.exists(far_file): + self.fst = pynini.Far(far_file, mode="r")["verbalize"] + logger.info(f'VerbalizeFinalFst graph was restored from {far_file}.') + else: + verbalize = VerbalizeFst(deterministic=deterministic).fst + # word = WordFst(deterministic=deterministic).fst + types = verbalize + + if deterministic: + graph = ( + pynutil.delete("tokens") + + delete_space + + pynutil.delete("{") + + delete_space + + types + + delete_space + + pynutil.delete("}") + ) + else: + graph = delete_space + types + delete_space + + graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space + + self.fst = graph.optimize() + if far_file: + 
generator_main(far_file, {"verbalize": self.fst}) diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py index 82f8f43d2..1a9219574 100644 --- a/nemo_text_processing/text_normalization/normalize.py +++ b/nemo_text_processing/text_normalization/normalize.py @@ -174,6 +174,9 @@ def __init__( elif lang == 'ja': from nemo_text_processing.text_normalization.ja.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.text_normalization.ja.verbalizers.verbalize_final import VerbalizeFinalFst + elif lang == 'ko': + from nemo_text_processing.text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst + from nemo_text_processing.text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst else: raise NotImplementedError(f"Language {lang} has not been supported yet.") @@ -720,7 +723,7 @@ def parse_args(): parser.add_argument( "--language", help="language", - choices=["en", "de", "es", "fr", "hu", "sv", "zh", "ar", "it", "hy", "ja", "hi"], + choices=["en", "de", "es", "fr", "hu", "sv", "zh", "ar", "it", "hy", "ja", "hi", "ko"], default="en", type=str, ) diff --git a/nemo_text_processing/text_normalization/run_evaluate.py b/nemo_text_processing/text_normalization/run_evaluate.py index 0438579a7..fc9b21c29 100644 --- a/nemo_text_processing/text_normalization/run_evaluate.py +++ b/nemo_text_processing/text_normalization/run_evaluate.py @@ -35,7 +35,7 @@ def parse_args(): parser.add_argument( "--lang", help="language", - choices=['ar', 'de', 'en', 'es', 'fr', 'hu', 'it', 'ru', 'sv', 'zh', 'hy', 'hi'], + choices=['ar', 'de', 'en', 'es', 'fr', 'hu', 'it', 'ru', 'sv', 'zh', 'hy', 'hi', 'ko'], default="en", type=str, ) diff --git a/tests/nemo_text_processing/ko/__init__.py b/tests/nemo_text_processing/ko/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/tests/nemo_text_processing/ko/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA 
CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt new file mode 100644 index 000000000..05164093b --- /dev/null +++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt @@ -0,0 +1,20 @@ +1~일 +2~이 +-2~마이너스 이 +3~삼 +123~백이십삼 +13000~만삼천 +9000~구천 +123000~십이만삼천 +123000012~일억이천삼백만십이 +1000000~백만 +100000000~일억 +1000000000000~일조 +100000000000000~백조 +20000000000001~이십조일 +800000000001001~팔백조천일 +82345670123135111~팔경이천삼백사십오조육천칠백일억이천삼백십삼만오천백십일 +9999999999999~구조구천구백구십구억구천구백구십구만구천구백구십구 +99999999999999~구십구조구천구백구십구억구천구백구십구만구천구백구십구 +999999999999999~구백구십구조구천구백구십구억구천구백구십구만구천구백구십구 +9999999999999999~구천구백구십구조구천구백구십구억구천구백구십구만구천구백구십구 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/test_cardinal.py b/tests/nemo_text_processing/ko/test_cardinal.py new file mode 100644 index 000000000..ed422e13e --- /dev/null +++ b/tests/nemo_text_processing/ko/test_cardinal.py @@ -0,0 +1,34 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+from nemo_text_processing.text_normalization.normalize import Normalizer
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestCardinal:
+    normalizer_ko = Normalizer(
+        lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='lower_cased'
+    )
+
+    @parameterized.expand(parse_test_case_file('ko/data_text_normalization/test_cases_cardinal.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_norm(self, test_input, expected):
+        preds = self.normalizer_ko.normalize(test_input)
+        assert expected == preds
diff --git a/tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh
new file mode 100644
index 000000000..63242e150
--- /dev/null
+++ b/tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh
@@ -0,0 +1,123 @@
+#! /bin/sh
+GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"}
+TEST_DIR=${2:-"/workspace/tests"}
+
+runtest () {
+  input=$1
+  echo "INPUT is $input"
+  cd ${GRAMMARS_DIR}
+
+  # read test file
+  while read testcase; do
+    IFS='~' read written spoken <<< $testcase
+    # replace non breaking space with breaking space
+    # Use below if postprocessor is not used.
Comment if it is used + #denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') + # Use below if postprocessor is used. Comment if it is not used + denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration_pp.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') + + # trim white space + spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + + # input expected actual + assertEquals "$written" "$spoken" "$denorm_pred" + done < "$input" +} + + +testTNCardinal() { + input=$TEST_DIR/data_text_normalization/test_cases_cardinal.txt + runtest $input +} + +#testTNSpecialText() { +# input=$TEST_DIR/data_text_normalization/test_cases_special_text.txt +# runtest $input +#} + +#testTNDate() { +# input=$TEST_DIR/data_text_normalization/test_cases_date.txt +# runtest $input +#} + +#testTNDecimal() { +# input=$TEST_DIR/data_text_normalization/test_cases_decimal.txt +# runtest $input +#} + +#testTNRange() { +# input=$TEST_DIR/data_text_normalization/test_cases_range.txt +# runtest $input +#} + +#testTNSerial() { +# input=$TEST_DIR/data_text_normalization/test_cases_serial.txt +# runtest $input +#} + +#testTNRoman() { +# input=$TEST_DIR/data_text_normalization/test_cases_roman.txt +# runtest $input +#} + +#testTNElectronic() { +# input=$TEST_DIR/data_text_normalization/test_cases_electronic.txt +# runtest $input +#} + +#testTNFraction() { +# input=$TEST_DIR/data_text_normalization/test_cases_fraction.txt +# runtest $input +#} + +#testTNMoney() { +# input=$TEST_DIR/data_text_normalization/test_cases_money.txt +# runtest $input +#} + +#testTNOrdinal() { +# input=$TEST_DIR/data_text_normalization/test_cases_ordinal.txt +# runtest $input +#} + +#testTNTelephone() { +# input=$TEST_DIR/data_text_normalization/test_cases_telephone.txt +# runtest $input +#} + 
+#testTNTime() { +# input=$TEST_DIR/data_text_normalization/test_cases_time.txt +# runtest $input +#} + +#testTNMeasure() { +# input=$TEST_DIR/data_text_normalization/test_cases_measure.txt +# runtest $input +#} + +#testTNWhitelist() { +# input=$TEST_DIR/data_text_normalization/test_cases_whitelist.txt +# runtest $input +#} + +#testTNWord() { +# input=$TEST_DIR/data_text_normalization/test_cases_word.txt +# runtest $input +#} + +#testTNAddress() { +# input=$TEST_DIR/data_text_normalization/test_cases_address.txt +# runtest $input +#} + +#testTNMath() { +# input=$TEST_DIR/data_text_normalization/test_cases_math.txt +# runtest $input +#} + +# Remove all command-line arguments +shift $# + +# Load shUnit2 +. /workspace/shunit2/shunit2 \ No newline at end of file diff --git a/tests/nemo_text_processing/utils.py b/tests/nemo_text_processing/utils.py index 5326784e9..0e06d5945 100644 --- a/tests/nemo_text_processing/utils.py +++ b/tests/nemo_text_processing/utils.py @@ -39,7 +39,7 @@ def parse_test_case_file(file_name: str): Prepares tests pairs for ITN and TN tests """ test_pairs = [] - with open(os.path.dirname(os.path.abspath(__file__)) + os.path.sep + file_name, 'r') as f: + with open(os.path.dirname(os.path.abspath(__file__)) + os.path.sep + file_name, 'r', encoding='utf-8') as f: for line in f: components = line.strip("\n").split("~") spoken = components[0] diff --git a/tools/text_processing_deployment/export_grammars.sh b/tools/text_processing_deployment/export_grammars.sh index 017472ae9..21c6d0833 100644 --- a/tools/text_processing_deployment/export_grammars.sh +++ b/tools/text_processing_deployment/export_grammars.sh @@ -89,7 +89,7 @@ fi if [[ ${OVERWRITE_CACHE} != "" ]] ; then echo "[I] Exporting grammars" python3 pynini_export.py --output_dir=${FAR_PATH} --grammars=${GRAMMARS} --input_case=${INPUT_CASE} \ - --language=${LANGUAGE} --cache_dir=${CACHE_DIR} ${WHITELIST} ${OVERWRITE_CACHE} || exit 1 + --language=${LANGUAGE} --cache_dir=${CACHE_DIR} ${WHITELIST} 
${OVERWRITE_CACHE} || exit 1
 fi
 
 if [[ ${FORCE_REBUILD,,} == "true" ]]; then
diff --git a/tools/text_processing_deployment/fst_test.py b/tools/text_processing_deployment/fst_test.py
new file mode 100644
index 000000000..156c30279
--- /dev/null
+++ b/tools/text_processing_deployment/fst_test.py
@@ -0,0 +1,41 @@
+import argparse
+
+from nemo_text_processing.text_normalization.normalize import Normalizer
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input_data_file", type=str, required=True)
+    parser.add_argument("--mode", type=str, required=True)
+    parser.add_argument("--grammars_dir", type=str, required=True)
+    parser.add_argument("--language", type=str, required=True)
+    args = parser.parse_args()
+
+    print(f"[INFO] Loading normalizer with grammars from {args.grammars_dir}")
+    normalizer = Normalizer(
+        input_case="lower_cased", lang=args.language, cache_dir=args.grammars_dir, overwrite_cache=False
+    )
+
+    print(f"[INFO] Reading input test file: {args.input_data_file}")
+    with open(args.input_data_file, encoding="utf-8") as f:
+        lines = f.readlines()
+
+    total = 0
+    passed = 0
+
+    for line in lines:
+        if "~" not in line:
+            continue
+        input_text, expected_output = line.strip().split("~")
+        pred = normalizer.normalize(input_text)
+
+        if pred.strip() == expected_output.strip():
+            print(f" ALLOW: {input_text} → {pred}")
+            passed += 1
+        else:
+            print(f" DENY: {input_text} → {pred} (expected: {expected_output})")
+        total += 1
+
+    if total:
+        print(f"\n[RESULT] Passed {passed}/{total} ({(passed/total)*100:.2f}%)")
+    else:
+        print("\n[RESULT] No test cases found in input file.")
diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py
index 6b82dfbec..4190c05f0 100644
--- a/tools/text_processing_deployment/pynini_export.py
+++ b/tools/text_processing_deployment/pynini_export.py
@@ -106,6 +106,7 @@ def parse_args():
         'mr',
         'ja',
         'rw',
+        'ko',
     ],
     type=str,
     default='en',
@@ -308,10 +309,15 @@ def parse_args():
         )
         from nemo_text_processing.text_normalization.ja.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst
     elif args.language == 'rw':
         from nemo_text_processing.text_normalization.rw.taggers.tokenize_and_classify import (
             ClassifyFst as TNClassifyFst,
         )
         from nemo_text_processing.text_normalization.rw.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst
+    elif args.language == 'ko':
+        from nemo_text_processing.text_normalization.ko.taggers.tokenize_and_classify import (
+            ClassifyFst as TNClassifyFst,
+        )
+        from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst
     output_dir = os.path.join(args.output_dir, f"{args.language}_{args.grammars}_{args.input_case}")
     export_grammars(
         output_dir=output_dir,