diff --git a/Jenkinsfile b/Jenkinsfile
index 51ce37a10..253af49c2 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -28,6 +28,7 @@ pipeline {
     MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'
     JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1'
     HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-22-25-0'
+    KO_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-03-25-0'
     DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
   }
   stages {
@@ -318,6 +319,22 @@ pipeline {
         }
       }
     }
+    stage('L0: Create KO TN Grammars') {
+      when {
+        anyOf {
+          branch 'main'
+          changeRequest target: 'main'
+        }
+      }
+      failFast true
+      parallel {
+        stage('L0: KO TN grammars') {
+          steps {
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=ko --text="100" --cache_dir ${KO_TN_CACHE}'
+          }
+        }
+      }
+    }
 
     // L1 Tests starts here
 
@@ -406,6 +423,11 @@ pipeline {
             sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/hy/ -m "not pleasefixme" --cpu --tn_cache_dir ${HY_TN_CACHE}'
           }
         }
+        stage('L1: Run all KO TN/ITN tests (restore grammars from cache)') {
+          steps {
+            sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/ko/ -m "not pleasefixme" --cpu --tn_cache_dir ${KO_TN_CACHE}'
+          }
+        }
       }
     }
diff --git a/nemo_text_processing/text_normalization/ko/__init__.py b/nemo_text_processing/text_normalization/ko/__init__.py
new file mode 100644
index 000000000..dd0e509b3
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemo_text_processing.text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst
+from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import VerbalizeFst
+from nemo_text_processing.text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst
diff --git a/nemo_text_processing/text_normalization/ko/data/__init__.py b/nemo_text_processing/text_normalization/ko/data/__init__.py
new file mode 100644
index 000000000..341a77c5b
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/data/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
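The new L0 stage above exercises the grammars through `normalize.py`, which goes through the public `Normalizer` API. A minimal sanity check along the same lines, assuming the grammars compile locally (the `백` output follows from the cardinal grammar added below):

```python
# Mirrors the CI command above from Python; cache_dir=None skips FAR caching.
from nemo_text_processing.text_normalization.normalize import Normalizer

normalizer = Normalizer(input_case='cased', lang='ko', cache_dir=None)
print(normalizer.normalize("100"))  # expected: 백
```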
diff --git a/nemo_text_processing/text_normalization/ko/data/number/__init__.py b/nemo_text_processing/text_normalization/ko/data/number/__init__.py
new file mode 100644
index 000000000..341a77c5b
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/data/number/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/nemo_text_processing/text_normalization/ko/data/number/digit.tsv b/nemo_text_processing/text_normalization/ko/data/number/digit.tsv
new file mode 100644
index 000000000..61a7dddcf
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/data/number/digit.tsv
@@ -0,0 +1,9 @@
+1	일
+2	이
+3	삼
+4	사
+5	오
+6	육
+7	칠
+8	팔
+9	구
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/ko/data/number/ty.tsv b/nemo_text_processing/text_normalization/ko/data/number/ty.tsv
new file mode 100644
index 000000000..3d7bb221d
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/data/number/ty.tsv
@@ -0,0 +1,9 @@
+1	십
+2	이십
+3	삼십
+4	사십
+5	오십
+6	육십
+7	칠십
+8	팔십
+9	구십
diff --git a/nemo_text_processing/text_normalization/ko/data/number/zero.tsv b/nemo_text_processing/text_normalization/ko/data/number/zero.tsv
new file mode 100644
index 000000000..7024c0534
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/data/number/zero.tsv
@@ -0,0 +1 @@
+0	영
\ No newline at end of file
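These three TSVs are the only lexical data the cardinal grammar needs: `digit.tsv` covers 1-9, `ty.tsv` the tens, and `zero.tsv` standalone zero. An illustrative, self-contained sketch of how they combine for 1-99 (mappings inlined and abridged here instead of loading via `pynini.string_file`):

```python
import pynini
from pynini.lib import pynutil

# Inlined stand-ins for digit.tsv and ty.tsv (abridged to three entries each)
digit = pynini.string_map([("1", "일"), ("2", "이"), ("4", "사")])
ty = pynini.string_map([("1", "십"), ("2", "이십"), ("4", "사십")])

# Same shape as graph_1_to_99 in taggers/cardinal.py below
graph_1_to_99 = (ty + (digit | pynutil.delete("0"))) | digit

print(pynini.shortestpath("42" @ graph_1_to_99).string())  # 사십이
print(pynini.shortestpath("40" @ graph_1_to_99).string())  # 사십
```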
diff --git a/nemo_text_processing/text_normalization/ko/graph_utils.py b/nemo_text_processing/text_normalization/ko/graph_utils.py
new file mode 100644
index 000000000..9db51238f
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/graph_utils.py
@@ -0,0 +1,173 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# Copyright 2015 and onwards Google, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import string
+from pathlib import Path
+from typing import Dict
+
+import pynini
+from pynini import Far
+from pynini.export import export
+from pynini.lib import byte, pynutil, utf8
+
+from nemo_text_processing.text_normalization.en.utils import load_labels
+from nemo_text_processing.utils.logging import logger
+
+NEMO_CHAR = utf8.VALID_UTF8_CHAR
+
+NEMO_DIGIT = byte.DIGIT
+NEMO_ALPHA = pynini.union(*[chr(i) for i in range(ord('가'), ord('힣') + 1)]).optimize()  # Hangul syllable block
+NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize()
+NEMO_HEX = pynini.union(*string.hexdigits).optimize()
+NEMO_SPACE = " "
+NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00a0").optimize()
+NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
+NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()
+
+NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize()
+NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize()
+
+NEMO_SIGMA = pynini.closure(NEMO_CHAR)
+
+delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))
+delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1))
+insert_space = pynutil.insert(" ")
+delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")
+delete_preserve_order = pynini.closure(
+    pynutil.delete(" preserve_order: true")
+    | (pynutil.delete(' field_order: "') + NEMO_NOT_QUOTE + pynutil.delete('"'))
+)
+
+
+# Common string literals; expand as you see fit.
+username_string = "username"
+double_quotes = '"'
+domain_string = "domain"
+protocol_string = "protocol"
+slash = "/"
+double_slash = "//"
+triple_slash = "///"
+file = "file"
+period = "."
+at = "@"
+colon = ":"
+https = "https"
+http = "http"
+www = "www"
+
+
+def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]):
+    """
+    Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name.
+
+    Args:
+        file_name: exported file name
+        graphs: Mapping of a rule name and Pynini WFST graph to be exported
+    """
+    exporter = export.Exporter(file_name)
+    for rule, graph in graphs.items():
+        exporter[rule] = graph.optimize()
+    exporter.close()
+    logger.info(f"Created {file_name}")
+
+
+def convert_space(fst) -> "pynini.FstLike":
+    """
+    Converts space to nonbreaking space.
+    Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty".
+    This makes the transducer significantly slower, so use it only when spaces can occur within quotes; otherwise leave it out.
+
+    Args:
+        fst: input fst
+
+    Returns an output fst where breaking spaces are converted to non-breaking spaces
+    """
+    return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, "\u00a0"), "", "", NEMO_SIGMA)
+
+
+def string_map_cased(input_file: str, input_case: str = "lower_cased"):
+    # Loads a whitelist TSV and inverts it so the spoken form maps to the written form.
+    labels = load_labels(input_file)
+    whitelist = pynini.string_map(labels).invert().optimize()
+    return whitelist
+
+
+class GraphFst:
+    """
+    Base class for all grammar fsts.
+
+    Args:
+        name: name of grammar class
+        kind: either 'classify' or 'verbalize'
+        deterministic: if True, provides a single transduction option;
+            if False, multiple transductions are generated (used for audio-based normalization)
+    """
+
+    def __init__(self, name: str, kind: str, deterministic: bool = True):
+        self.name = name
+        self.kind = kind
+        self._fst = None
+        self.deterministic = deterministic
+
+        self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far")
+        if self.far_exist():
+            self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst()
+
+    def far_exist(self) -> bool:
+        """
+        Returns true if FAR can be loaded
+        """
+        return self.far_path.exists()
+
+    @property
+    def fst(self) -> "pynini.FstLike":
+        return self._fst
+
+    @fst.setter
+    def fst(self, fst):
+        self._fst = fst
+
+    def add_tokens(self, fst) -> "pynini.FstLike":
+        """
+        Wraps the class name around the given fst
+
+        Args:
+            fst: input fst
+
+        Returns:
+            Fst: fst
+        """
+        return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")
+
+    def delete_tokens(self, fst) -> "pynini.FstLike":
+        """
+        Deletes the class-name wrapper around the output of the given fst
+
+        Args:
+            fst: input fst
+
+        Returns:
+            Fst: fst
+        """
+        res = (
+            pynutil.delete(f"{self.name}")
+            + delete_space
+            + pynutil.delete("{")
+            + delete_space
+            + fst
+            + delete_space
+            + pynutil.delete("}")
+        )
+        return res @ pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA)
diff --git a/nemo_text_processing/text_normalization/ko/taggers/__init__.py b/nemo_text_processing/text_normalization/ko/taggers/__init__.py
new file mode 100644
index 000000000..341a77c5b
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/taggers/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
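`GraphFst.add_tokens` and `delete_tokens` implement the `name { ... }` token wrapping that connects taggers to verbalizers. A toy illustration of the same insert/delete pattern (not the actual grammar):

```python
import pynini
from pynini.lib import pynutil

# add_tokens-style wrapping: tag the digit "1" as a cardinal token
inner = pynutil.insert('integer: "') + pynini.cross("1", "일") + pynutil.insert('"')
tagged = pynutil.insert("cardinal { ") + inner + pynutil.insert(" }")
print(pynini.shortestpath("1" @ tagged).string())  # cardinal { integer: "일" }
```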
diff --git a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py
new file mode 100644
index 000000000..db530c931
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py
@@ -0,0 +1,274 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst
+from nemo_text_processing.text_normalization.ko.utils import get_abs_path
+
+
+class CardinalFst(GraphFst):
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="cardinal", kind="classify", deterministic=deterministic)
+        # Load base .tsv files: digits (1-9), tens (십...구십), and zero
+        graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv"))
+        graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv"))
+
+        digit_except_one = pynini.difference(NEMO_DIGIT, "1")
+        digit_except_zero_one = pynini.difference(digit_except_one, "0")
+
+        graph_digit_no_zero_one = digit_except_zero_one @ graph_digit
+        graph_ty = pynini.string_file(get_abs_path("data/number/ty.tsv"))
+
+        # Compose all basic number forms (1-99); larger numbers follow the Korean myriad grouping 만/억/조/경 (10^4/10^8/10^12/10^16)
+        graph_1_to_99 = (graph_ty + (graph_digit | pynutil.delete('0'))) | graph_digit
+
+        hundreds = NEMO_DIGIT**3
+        graph_hundred_component = (
+            pynini.cross('1', '백') | (graph_digit_no_zero_one + pynutil.insert('백'))
+        ) + pynini.union(pynini.closure(pynutil.delete('0')), (pynini.closure(pynutil.delete('0')) + graph_1_to_99))
+        graph_hundred = hundreds @ graph_hundred_component
+
+        thousands = NEMO_DIGIT**4
+        graph_thousand_component = (
+            pynini.cross('1', '천') | (graph_digit_no_zero_one + pynutil.insert('천'))
+        ) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_hundred_component,
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
+        )
+        graph_thousand = thousands @ graph_thousand_component
+
+        ten_thousands = NEMO_DIGIT**5
+        graph_ten_thousand_component = (pynini.cross('1', '만') | (graph_digit + pynutil.insert('만'))) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_thousand_component,
+            (pynutil.delete('0') + graph_hundred_component),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
+        )
+        graph_ten_thousand = ten_thousands @ graph_ten_thousand_component
+
+        hundred_thousands = NEMO_DIGIT**6
+
+        graph_hundred_thousand_component = ((NEMO_DIGIT**2 @ graph_1_to_99) + pynutil.insert('만')) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_thousand_component,
+            (pynutil.delete('0') + graph_hundred_component),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
+        )
+        graph_hundred_thousand = hundred_thousands @ graph_hundred_thousand_component
+
+        millions = NEMO_DIGIT**7
+        graph_million_component = ((graph_hundred) + pynutil.insert('만')) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_thousand_component,
+            (pynutil.delete('0') + graph_hundred_component),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
+        )
+        graph_million = millions @ graph_million_component
+
+        ten_millions = NEMO_DIGIT**8
+        graph_ten_million_component = ((graph_thousand) + pynutil.insert('만')) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_thousand_component,
+            (pynutil.delete('0') + graph_hundred_component),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
+        )
+        graph_ten_million = ten_millions @ graph_ten_million_component
+
+        hundred_millions = NEMO_DIGIT**9
+        graph_hundred_million_component = (graph_digit + pynutil.insert('억')) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_ten_million_component,
+            (pynutil.delete('0') + graph_million_component),
+            (pynutil.delete('00') + graph_hundred_thousand_component),
+            (pynutil.delete('000') + graph_ten_thousand_component),
+            (pynutil.delete('0000') + graph_thousand_component),
+            ((pynutil.delete('00000') + graph_hundred_component)),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
+        )
+        graph_hundred_million = hundred_millions @ graph_hundred_million_component
+
+        thousand_millions = NEMO_DIGIT**10
+        graph_thousand_million_component = ((NEMO_DIGIT**2 @ graph_1_to_99) + pynutil.insert('억')) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_ten_million_component,
+            (pynutil.delete('0') + graph_million_component),
+            (pynutil.delete('00') + graph_hundred_thousand_component),
+            (pynutil.delete('000') + graph_ten_thousand_component),
+            (pynutil.delete('0000') + graph_thousand_component),
+            ((pynutil.delete('00000') + graph_hundred_component)),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
+        )
+        graph_thousand_million = thousand_millions @ graph_thousand_million_component
+
+        billions = NEMO_DIGIT**11
+        graph_billions_component = ((graph_hundred) + pynutil.insert('억')) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_ten_million_component,
+            (pynutil.delete('0') + graph_million_component),
+            (pynutil.delete('00') + graph_hundred_thousand_component),
+            (pynutil.delete('000') + graph_ten_thousand_component),
+            (pynutil.delete('0000') + graph_thousand_component),
+            ((pynutil.delete('00000') + graph_hundred_component)),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
+        )
+        graph_billions = billions @ graph_billions_component
+
+        ten_billions = NEMO_DIGIT**12
+        graph_ten_billions_component = ((graph_thousand) + pynutil.insert('억')) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_ten_million_component,
+            (pynutil.delete('0') + graph_million_component),
+            (pynutil.delete('00') + graph_hundred_thousand_component),
+            (pynutil.delete('000') + graph_ten_thousand_component),
+            (pynutil.delete('0000') + graph_thousand_component),
+            ((pynutil.delete('00000') + graph_hundred_component)),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
+        )
+        graph_ten_billions = ten_billions @ graph_ten_billions_component
+
+        hundred_billions = NEMO_DIGIT**13
+        graph_hundred_billions_component = (graph_digit + pynutil.insert('조')) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_ten_billions_component,
+            pynutil.delete('0') + graph_billions_component,
+            pynutil.delete('00') + graph_thousand_million_component,
+            pynutil.delete('000') + graph_hundred_million_component,
+            pynutil.delete('0000') + graph_ten_million_component,
+            pynutil.delete('00000') + graph_million_component,
+            pynutil.delete('000000') + graph_hundred_thousand_component,
+            pynutil.delete('0000000') + graph_ten_thousand_component,
+            pynutil.delete('00000000') + graph_thousand_component,
+            pynutil.delete('000000000') + graph_hundred_component,
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
+        )
+        graph_hundred_billions = hundred_billions @ graph_hundred_billions_component
+
+        trillion = NEMO_DIGIT**14
+        graph_trillion_component = (
+            (NEMO_DIGIT**2 @ graph_1_to_99)
+            + pynutil.insert('조')
+            + pynini.union(
+                pynini.closure(pynutil.delete('0')),
+                graph_ten_billions_component,
+                pynutil.delete('0') + graph_billions_component,
+                pynutil.delete('00') + graph_thousand_million_component,
+                pynutil.delete('000') + graph_hundred_million_component,
+                pynutil.delete('0000') + graph_ten_million_component,
+                pynutil.delete('00000') + graph_million_component,
+                pynutil.delete('000000') + graph_hundred_thousand_component,
+                pynutil.delete('0000000') + graph_ten_thousand_component,
+                pynutil.delete('00000000') + graph_thousand_component,
+                pynutil.delete('000000000') + graph_hundred_component,
+                (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
+            )
+        )
+        graph_trillions = trillion @ graph_trillion_component
+
+        ten_trillions = NEMO_DIGIT**15
+        graph_ten_trillions_component = (
+            (graph_hundred)
+            + pynutil.insert('조')
+            + pynini.union(
+                pynini.closure(pynutil.delete('0')),
+                graph_ten_billions_component,
+                pynutil.delete('0') + graph_billions_component,
+                pynutil.delete('00') + graph_thousand_million_component,
+                pynutil.delete('000') + graph_hundred_million_component,
+                pynutil.delete('0000') + graph_ten_million_component,
+                pynutil.delete('00000') + graph_million_component,
+                pynutil.delete('000000') + graph_hundred_thousand_component,
+                pynutil.delete('0000000') + graph_ten_thousand_component,
+                pynutil.delete('00000000') + graph_thousand_component,
+                pynutil.delete('000000000') + graph_hundred_component,
+                (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
+            )
+        )
+        graph_ten_trillions = ten_trillions @ graph_ten_trillions_component
+
+        hundred_trillions = NEMO_DIGIT**16
+        graph_hundred_trillions_component = (
+            (graph_thousand)
+            + pynutil.insert('조')
+            + pynini.union(
+                pynini.closure(pynutil.delete('0')),
+                graph_ten_billions_component,
+                pynutil.delete('0') + graph_billions_component,
+                pynutil.delete('00') + graph_thousand_million_component,
+                pynutil.delete('000') + graph_hundred_million_component,
+                pynutil.delete('0000') + graph_ten_million_component,
+                pynutil.delete('00000') + graph_million_component,
+                pynutil.delete('000000') + graph_hundred_thousand_component,
+                pynutil.delete('0000000') + graph_ten_thousand_component,
+                pynutil.delete('00000000') + graph_thousand_component,
+                pynutil.delete('000000000') + graph_hundred_component,
+                (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
+            )
+        )
+        graph_hundred_trillions = hundred_trillions @ graph_hundred_trillions_component
+
+        thousand_trillions = NEMO_DIGIT**17
+        graph_thousand_trillions_component = (
+            graph_digit
+            + pynutil.insert('경')
+            + pynini.union(
+                pynini.closure(pynutil.delete('0')),
+                graph_hundred_trillions_component,
+                pynutil.delete('0') + graph_ten_trillions_component,
+                pynutil.delete('00') + graph_trillion_component,
+                pynutil.delete('000') + graph_hundred_billions_component,
+                pynutil.delete('0000') + graph_ten_billions_component,
+                pynutil.delete('00000') + graph_billions_component,
+                pynutil.delete('000000') + graph_thousand_million_component,
+                pynutil.delete('0000000') + graph_hundred_million_component,
+                pynutil.delete('00000000') + graph_ten_million_component,
+                pynutil.delete('000000000') + graph_million_component,
+                pynutil.delete('0000000000') + graph_hundred_thousand_component,
+                pynutil.delete('00000000000') + graph_ten_thousand_component,
+                pynutil.delete('000000000000') + graph_thousand_component,
+                pynutil.delete('0000000000000') + graph_hundred_component,
+                (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
+            )
+        )
+        graph_thousand_trillions = thousand_trillions @ graph_thousand_trillions_component
+
+        # Union of all magnitudes into a single cardinal FST
+        graph_num = pynini.union(
+            graph_thousand_trillions,
+            graph_hundred_trillions,
+            graph_ten_trillions,
+            graph_trillions,
+            graph_hundred_billions,
+            graph_ten_billions,
+            graph_billions,
+            graph_thousand_million,
+            graph_hundred_million,
+            graph_ten_million,
+            graph_million,
+            graph_hundred_thousand,
+            graph_ten_thousand,
+            graph_thousand,
+            graph_hundred,
+            graph_1_to_99,
+            graph_zero,
+        ).optimize()
+
+        # Sign and final formatting
+        optional_sign = pynini.closure(pynutil.insert('negative: "true" ') + pynini.cross("-", ""), 0, 1)
+        final_graph = optional_sign + pynutil.insert('integer: "') + graph_num + pynutil.insert('"')
+        final_graph = self.add_tokens(final_graph)
+        self.fst = final_graph.optimize()
diff --git a/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py
new file mode 100644
index 000000000..0676446e5
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py
@@ -0,0 +1,68 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst, generator_main
+from nemo_text_processing.text_normalization.ko.taggers.cardinal import CardinalFst
+from nemo_text_processing.utils.logging import logger
+
+
+class ClassifyFst(GraphFst):
+    """
+    Final class that composes all other classification grammars. This class can process an entire sentence including punctuation.
+    For deployment, this grammar will be compiled and exported to an OpenFst Finite State Archive (FAR) file.
+    More details on deployment can be found at NeMo/tools/text_processing_deployment.
+
+    Args:
+        input_case: accepting either "lower_cased" or "cased" input.
+        deterministic: if True, provides a single transduction option;
+            if False, multiple options are generated (used for audio-based normalization)
+        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
+        overwrite_cache: set to True to overwrite .far files
+        whitelist: path to a file with whitelist replacements
+    """
+
+    def __init__(
+        self,
+        input_case: str = "cased",
+        deterministic: bool = True,
+        cache_dir: str = None,
+        overwrite_cache: bool = False,
+        whitelist: str = None,
+    ):
+        super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)
+
+        far_file = None
+        if cache_dir is not None and cache_dir != "None":
+            os.makedirs(cache_dir, exist_ok=True)
+            far_file = os.path.join(cache_dir, f"ko_tn_{deterministic}_tokenize.far")
+        if not overwrite_cache and far_file and os.path.exists(far_file):
+            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
+            logger.info(f"ClassifyFst.fst was restored from {far_file}.")
+        else:
+            cardinal = CardinalFst(deterministic=deterministic)
+
+            classify = pynini.union(pynutil.add_weight(cardinal.fst, 1.1))
+
+            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
+            tagger = pynini.closure(token, 1)
+
+            self.fst = tagger.optimize()
+
+            if far_file:
+                generator_main(far_file, {"tokenize_and_classify": self.fst})
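With the tagger in place, the intermediate token string can be inspected directly. A quick sketch, assuming the grammars compile in the current environment (`pynini.escape` guards special characters in the input):

```python
import pynini
from nemo_text_processing.text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst

tagger = ClassifyFst(cache_dir=None)
lattice = pynini.escape("-100") @ tagger.fst
print(pynini.shortestpath(lattice).string())
# tokens { cardinal { negative: "true" integer: "백" } }
```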
diff --git a/nemo_text_processing/text_normalization/ko/utils.py b/nemo_text_processing/text_normalization/ko/utils.py
new file mode 100644
index 000000000..51aaea3e8
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/utils.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import csv
+import os
+
+
+def get_abs_path(rel_path):
+    """
+    Get absolute path
+
+    Args:
+        rel_path: relative path to this file
+
+    Returns absolute path
+    """
+    return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path
+
+
+def load_labels(abs_path):
+    """
+    Loads a tab-separated label file as a list of mappings
+
+    Args:
+        abs_path: absolute path
+
+    Returns a list of mappings
+    """
+    with open(abs_path, encoding="utf-8") as label_tsv:
+        labels = list(csv.reader(label_tsv, delimiter="\t"))
+        return labels
+
+
+def augment_labels_with_punct_at_end(labels):
+    """
+    Augments labels: if a key ends in a punctuation mark that its value lacks, add a new label
+    where the value keeps the punctuation
+
+    Args:
+        labels : input labels
+    Returns:
+        additional labels
+    """
+    res = []
+    for label in labels:
+        if len(label) > 1:
+            if label[0][-1] == "." and label[1][-1] != ".":
+                res.append([label[0], label[1] + "."] + label[2:])
+    return res
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/__init__.py b/nemo_text_processing/text_normalization/ko/verbalizers/__init__.py
new file mode 100644
index 000000000..341a77c5b
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
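The two helpers above are typically chained, e.g. to load one of the TSVs added earlier in this PR:

```python
from nemo_text_processing.text_normalization.ko.utils import get_abs_path, load_labels

labels = load_labels(get_abs_path("data/number/digit.tsv"))
print(labels[:2])  # [['1', '일'], ['2', '이']]
```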
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/ko/verbalizers/cardinal.py
new file mode 100644
index 000000000..c6a48ab33
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/cardinal.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space
+
+
+class CardinalFst(GraphFst):
+    """
+    Finite state transducer for verbalizing cardinals, e.g.
+    cardinal { negative: "true" integer: "이십삼" } -> 마이너스 이십삼
+
+    Args:
+        deterministic: if True, provides a single transduction option;
+            if False, multiple options are generated (used for audio-based normalization)
+    """
+
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="cardinal", kind="verbalize", deterministic=deterministic)
+
+        self.optional_sign = pynini.cross("negative: \"true\"", "마이너스 ")
+        if not deterministic:
+            self.optional_sign |= pynini.cross("negative: \"true\"", "음수 ")
+            self.optional_sign |= pynini.cross("negative: \"true\"", "- ")
+
+        self.optional_sign = pynini.closure(self.optional_sign + delete_space, 0, 1)
+
+        integer = pynini.closure(NEMO_NOT_QUOTE)
+
+        self.integer = delete_space + pynutil.delete("\"") + integer + pynutil.delete("\"")
+        integer = pynutil.delete("integer:") + self.integer
+
+        self.numbers = self.optional_sign + integer
+        delete_tokens = self.delete_tokens(self.numbers)
+        self.fst = delete_tokens.optimize()
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py b/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py
new file mode 100644
index 000000000..7ba146cff
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+
+import pynini
+
+from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, generator_main
+from nemo_text_processing.utils.logging import logger
+
+
+class PostProcessingFst:
+    """
+    Finite state transducer that post-processes an entire sentence after verbalization is complete, e.g.
+    removes extra spaces around punctuation marks: " ( one hundred and twenty three ) " -> "(one hundred and twenty three)"
+
+    Args:
+        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
+        overwrite_cache: set to True to overwrite .far files
+    """
+
+    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
+
+        far_file = None
+        if cache_dir is not None and cache_dir != "None":
+            os.makedirs(cache_dir, exist_ok=True)
+            far_file = os.path.join(cache_dir, "ko_tn_post_processing.far")
+        if not overwrite_cache and far_file and os.path.exists(far_file):
+            self.fst = pynini.Far(far_file, mode="r")["post_process_graph"]
+            logger.info(f'Post processing graph was restored from {far_file}.')
+        else:
+            self.fst = self.get_postprocess_graph()
+
+            if far_file:
+                generator_main(far_file, {"post_process_graph": self.fst})
+
+    def get_postprocess_graph(self):
+        return pynini.cdrewrite(pynini.cross("", ""), "", "", pynini.closure(NEMO_SIGMA)).optimize()  # identity rewrite for now; placeholder for KO-specific rules
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize.py
new file mode 100644
index 000000000..8f38048f1
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst
+from nemo_text_processing.text_normalization.ko.verbalizers.cardinal import CardinalFst
+
+
+class VerbalizeFst(GraphFst):
+    """
+    Composes other verbalizer grammars.
+    For deployment, this grammar will be compiled and exported to an OpenFst Finite State Archive (FAR) file.
+    More details on deployment can be found at NeMo/tools/text_processing_deployment.
+
+    Args:
+        deterministic: if True, provides a single transduction option;
+            if False, multiple options are generated (used for audio-based normalization)
+    """
+
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="verbalize", kind="verbalize", deterministic=deterministic)
+
+        cardinal = CardinalFst(deterministic=deterministic)
+        cardinal_graph = cardinal.fst
+
+        self.fst = cardinal_graph.optimize()
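The verbalizer consumes the tagger's token body and strips the markup. A round-trip sketch matching the docstring example, assuming the grammars compile locally:

```python
import pynini
from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import VerbalizeFst

verbalizer = VerbalizeFst()
tagged = pynini.escape('cardinal { negative: "true" integer: "이십삼" }')
print(pynini.shortestpath(tagged @ verbalizer.fst).string())  # 마이너스 이십삼
```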
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py
new file mode 100644
index 000000000..9a4e2f7bf
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.ko.graph_utils import (
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+    generator_main,
+)
+from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import VerbalizeFst
+from nemo_text_processing.utils.logging import logger
+
+
+class VerbalizeFinalFst(GraphFst):
+    """
+    Finite state transducer that verbalizes an entire sentence, e.g.
+    tokens { name: "its" } tokens { time { hours: "twelve" minutes: "thirty" } } tokens { name: "now" } tokens { name: "." } -> its twelve thirty now .
+
+    Args:
+        deterministic: if True, provides a single transduction option;
+            if False, multiple options are generated (used for audio-based normalization)
+        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
+        overwrite_cache: set to True to overwrite .far files
+    """
+
+    def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False):
+        super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic)
+
+        far_file = None
+        if cache_dir is not None and cache_dir != "None":
+            os.makedirs(cache_dir, exist_ok=True)
+            far_file = os.path.join(cache_dir, f"ko_tn_{deterministic}_verbalizer.far")
+        if not overwrite_cache and far_file and os.path.exists(far_file):
+            self.fst = pynini.Far(far_file, mode="r")["verbalize"]
+            logger.info(f'VerbalizeFinalFst graph was restored from {far_file}.')
+        else:
+            verbalize = VerbalizeFst(deterministic=deterministic).fst
+            # word = WordFst(deterministic=deterministic).fst
+            types = verbalize
+
+            if deterministic:
+                graph = (
+                    pynutil.delete("tokens")
+                    + delete_space
+                    + pynutil.delete("{")
+                    + delete_space
+                    + types
+                    + delete_space
+                    + pynutil.delete("}")
+                )
+            else:
+                graph = delete_space + types + delete_space
+
+            graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space
+
+            self.fst = graph.optimize()
+            if far_file:
+                generator_main(far_file, {"verbalize": self.fst})
diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py
index 82f8f43d2..1a9219574 100644
--- a/nemo_text_processing/text_normalization/normalize.py
+++ b/nemo_text_processing/text_normalization/normalize.py
@@ -174,6 +174,9 @@ def __init__(
         elif lang == 'ja':
             from nemo_text_processing.text_normalization.ja.taggers.tokenize_and_classify import ClassifyFst
             from nemo_text_processing.text_normalization.ja.verbalizers.verbalize_final import VerbalizeFinalFst
+        elif lang == 'ko':
+            from nemo_text_processing.text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst
+            from nemo_text_processing.text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst
         else:
             raise NotImplementedError(f"Language {lang} has not been supported yet.")
 
@@ -720,7 +723,7 @@ def parse_args():
     parser.add_argument(
         "--language",
         help="language",
-        choices=["en", "de", "es", "fr", "hu", "sv", "zh", "ar", "it", "hy", "ja", "hi"],
+        choices=["en", "de", "es", "fr", "hu", "sv", "zh", "ar", "it", "hy", "ja", "hi", "ko"],
         default="en",
         type=str,
     )
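Both final FSTs write their FAR archives when given a cache directory, which is how the CI cache at `KO_TN_CACHE` gets populated. A sketch with an illustrative local path (the FAR file names come from the constructors above):

```python
from nemo_text_processing.text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst
from nemo_text_processing.text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst

cache = "export/ko_tn_cache"  # illustrative path
ClassifyFst(deterministic=True, cache_dir=cache, overwrite_cache=True)  # writes ko_tn_True_tokenize.far
VerbalizeFinalFst(deterministic=True, cache_dir=cache, overwrite_cache=True)  # writes ko_tn_True_verbalizer.far
```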
diff --git a/tests/nemo_text_processing/ko/__init__.py b/tests/nemo_text_processing/ko/__init__.py
new file mode 100644
index 000000000..341a77c5b
--- /dev/null
+++ b/tests/nemo_text_processing/ko/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt
new file mode 100644
index 000000000..40187f74e
--- /dev/null
+++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt
@@ -0,0 +1,48 @@
+1~일
+2~이
+3~삼
+123~백이십삼
+13000~만삼천
+9000~구천
+123000~십이만삼천
+123000012~일억이천삼백만십이
+1000000~백만
+100000000~일억
+1000000000000~일조
+100000000000000~백조
+20000000000001~이십조일
+800000000001001~팔백조천일
+82345670123135111~팔경이천삼백사십오조육천칠백일억이천삼백십삼만오천백십일
+9999999999999~구조구천구백구십구억구천구백구십구만구천구백구십구
+99999999999999~구십구조구천구백구십구억구천구백구십구만구천구백구십구
+999999999999999~구백구십구조구천구백구십구억구천구백구십구만구천구백구십구
+9999999999999999~구천구백구십구조구천구백구십구억구천구백구십구만구천구백구십구
+19~십구
+76~칠십육
+379~삼백칠십구
+850~팔백오십
+1004~천사
+8326~팔천삼백이십육
+10383~만삼백팔십삼
+34892~삼만사천팔백구십이
+573234~오십칠만삼천이백삼십사
+982010~구십팔만이천십
+2349023~이백삼십사만구천이십삼
+4303189~사백삼십만삼천백팔십구
+60321589~육천삼십이만천오백팔십구
+88234568~팔천팔백이십삼만사천오백육십팔
+792133923~칠억구천이백십삼만삼천구백이십삼
+187624689~일억팔천칠백육십이만사천육백팔십구
+2304050708~이십삼억사백오만칠백팔
+6436789729~육십사억삼천육백칠십팔만구천칠백이십구
+78234580257~칠백팔십이억삼천사백오십팔만이백오십칠
+987654321345~구천팔백칠십육억오천사백삼십이만천삼백사십오
+2345678901234~이조삼천사백오십육억칠천팔백구십만천이백삼십사
+35791357913579~삼십오조칠천구백십삼억오천칠백구십일만삼천오백칠십구
+470369258147036~사백칠십조삼천육백구십이억오천팔백십사만칠천삼십육
+5048258149517395~오천사십팔조이천오백팔십일억사천구백오십일만칠천삼백구십오
+67890123045607890~육경칠천팔백구십조천이백삼십억사천오백육십만칠천팔백구십
+-2~마이너스 이
+-93~마이너스 구십삼
+-90325~마이너스 구만삼백이십오
+-3234567~마이너스 삼백이십삼만사천오백육십칠
\ No newline at end of file
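Each line pairs the written input and the expected spoken form around a `~` separator, the same convention the Sparrowhawk script below splits on with `IFS='~'`. The pytest suite reads the file through `parse_test_case_file`; conceptually it amounts to something like this simplified stand-in (the actual helper lives in tests/nemo_text_processing/utils.py):

```python
def parse_cases(path):
    # Simplified stand-in for parse_test_case_file: one (input, expected) pair per line.
    with open(path, encoding="utf-8") as f:
        return [line.rstrip("\n").split("~", 1) for line in f if line.strip()]

# parse_cases("test_cases_cardinal.txt")[0] -> ['1', '일']
```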
diff --git a/tests/nemo_text_processing/ko/test_cardinal.py b/tests/nemo_text_processing/ko/test_cardinal.py
new file mode 100644
index 000000000..763b7e607
--- /dev/null
+++ b/tests/nemo_text_processing/ko/test_cardinal.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.text_normalization.normalize import Normalizer
+
+from ..utils import parse_test_case_file
+
+
+class TestCardinal:
+    normalizer_ko = Normalizer(
+        lang='ko', cache_dir='export/ko_tn_grammars_lower_cased', overwrite_cache=False, input_case='lower_cased'
+    )
+
+    @parameterized.expand(parse_test_case_file('ko/data_text_normalization/test_cases_cardinal.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_norm(self, test_input, expected):
+        preds = self.normalizer_ko.normalize(test_input)
+        assert expected == preds
diff --git a/tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh
new file mode 100644
index 000000000..8c14c0336
--- /dev/null
+++ b/tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"}
+TEST_DIR=${2:-"/workspace/tests"}
+
+runtest () {
+  input=$1
+  echo "INPUT is $input"
+  cd ${GRAMMARS_DIR}
+
+  # read test file
+  while read testcase; do
+    IFS='~' read written spoken <<< $testcase
+    # replace non-breaking space with breaking space
+    # Use the line below if the postprocessor is not used; comment it out if it is used
+    #denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g')
+    # Use the line below if the postprocessor is used; comment it out if it is not used
+    denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration_pp.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g')
+
+    # trim white space
+    spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')"
+    denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')"
+
+    # input expected actual
+    assertEquals "$written" "$spoken" "$denorm_pred"
+  done < "$input"
+}
+
+
+testTNCardinal() {
+  input=$TEST_DIR/ko/data_text_normalization/test_cases_cardinal.txt
+  runtest $input
+}
+
+# Remove all command-line arguments so shUnit2 does not consume them
+shift $#
+
+# Load shUnit2
+. /workspace/shunit2/shunit2
\ No newline at end of file
diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py
index 6b82dfbec..fe6a9ff7e 100644
--- a/tools/text_processing_deployment/pynini_export.py
+++ b/tools/text_processing_deployment/pynini_export.py
@@ -106,6 +106,7 @@ def parse_args():
         'mr',
         'ja',
         'rw',
+        'ko',
     ],
     type=str,
     default='en',
@@ -312,6 +313,14 @@ def parse_args():
             ClassifyFst as TNClassifyFst,
         )
         from nemo_text_processing.text_normalization.rw.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst
+    elif args.language == 'ko':
+        from nemo_text_processing.text_normalization.ko.taggers.tokenize_and_classify import (
+            ClassifyFst as TNClassifyFst,
+        )
+        from nemo_text_processing.text_normalization.ko.verbalizers.post_processing import (
+            PostProcessingFst as TNPostProcessingFst,
+        )
+        from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst
     output_dir = os.path.join(args.output_dir, f"{args.language}_{args.grammars}_{args.input_case}")
     export_grammars(
         output_dir=output_dir,