From 034d05b9b3bdfc730384c0dc05159b20c75bb65f Mon Sep 17 00:00:00 2001
From: Jinwoo Bae
Date: Wed, 21 May 2025 14:58:52 -0700
Subject: [PATCH 1/6] Add Korean TN support for cardinal numbers and
 postprocessing

Signed-off-by: Jinwoo Bae
---
 Jenkinsfile                                        |  22 ++
 .../text_normalization/ko/__init__.py              |  17 ++
 .../ko/data/number/__init__.py                     |  13 +
 .../ko/data/number/digit.tsv                       |   9 +
 .../ko/data/number/teen.tsv                        |  10 +
 .../text_normalization/ko/data/number/ty.tsv       |   8 +
 .../ko/data/number/zero.tsv                        |   1 +
 .../text_normalization/ko/graph_utils.py           | 173 ++++++++++++
 .../text_normalization/ko/taggers/__init__.py      |  13 +
 .../text_normalization/ko/taggers/cardinal.py      | 267 ++++++++++++++++++
 .../ko/taggers/tokenize_and_classify.py            |  75 +++++
 .../text_normalization/ko/utils.py                 |  60 ++++
 .../ko/verbalizers/__init__.py                     |  13 +
 .../ko/verbalizers/cardinal.py                     |  48 ++++
 .../ko/verbalizers/post_processing.py              | 113 ++++++++
 .../ko/verbalizers/verbalize.py                    |  38 +++
 .../ko/verbalizers/verbalize_final.py              |  74 +++++
 .../text_normalization/normalize.py                |   6 +-
 tests/nemo_text_processing/ko/__init__.py          |  13 +
 .../test_cases_cardinal.txt                        |  19 ++
 .../nemo_text_processing/ko/test_cardinal.py       |  34 +++
 .../ko/test_sparrowhawk_normalization.sh           | 123 ++++++++
 .../pynini_export.py                               |  11 +
 23 files changed, 1159 insertions(+), 1 deletion(-)
 create mode 100644 nemo_text_processing/text_normalization/ko/__init__.py
 create mode 100644 nemo_text_processing/text_normalization/ko/data/number/__init__.py
 create mode 100644 nemo_text_processing/text_normalization/ko/data/number/digit.tsv
 create mode 100644 nemo_text_processing/text_normalization/ko/data/number/teen.tsv
 create mode 100644 nemo_text_processing/text_normalization/ko/data/number/ty.tsv
 create mode 100644 nemo_text_processing/text_normalization/ko/data/number/zero.tsv
 create mode 100644 nemo_text_processing/text_normalization/ko/graph_utils.py
 create mode 100644 nemo_text_processing/text_normalization/ko/taggers/__init__.py
 create mode 100644 nemo_text_processing/text_normalization/ko/taggers/cardinal.py
 create mode 100644 nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py
 create mode 100644 nemo_text_processing/text_normalization/ko/utils.py
 create mode 100644 nemo_text_processing/text_normalization/ko/verbalizers/__init__.py
 create mode 100644 nemo_text_processing/text_normalization/ko/verbalizers/cardinal.py
 create mode 100644 nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py
 create mode 100644 nemo_text_processing/text_normalization/ko/verbalizers/verbalize.py
 create mode 100644 nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py
 create mode 100644 tests/nemo_text_processing/ko/__init__.py
 create mode 100644 tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt
 create mode 100644 tests/nemo_text_processing/ko/test_cardinal.py
 create mode 100644 tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh

diff --git a/Jenkinsfile b/Jenkinsfile
index 51ce37a10..c3339c7bc 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -28,6 +28,7 @@ pipeline {
     MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'
     JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1'
     HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-22-25-0'
+    KO_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/05-21-25-0'
     DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
   }
   stages {
@@ -318,6 +319,22 @@ pipeline {
         }
       }
     }
+    stage('L0: Create KO TN Grammars') {
+      when {
+        anyOf {
+          branch 'main'
+          changeRequest target: 'main'
+        }
+      }
+      failFast true
+      parallel {
+        stage('L0: KO TN grammars') {
+          steps {
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=ko --text="100" --cache_dir ${KO_TN_CACHE}'
+          }
+        }
+      }
+    }
 
     // L1 Tests starts here
@@ -406,6 +423,11 @@ pipeline {
             sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/hy/ -m "not pleasefixme" --cpu --tn_cache_dir ${HY_TN_CACHE}'
           }
         }
+        stage('L1: Run all KO TN/ITN tests (restore grammars from cache)') {
+          steps {
+            sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/ko/ -m "not pleasefixme" --cpu --tn_cache_dir ${KO_TN_CACHE}'
+          }
+        }
       }
     }
 
diff --git a/nemo_text_processing/text_normalization/ko/__init__.py b/nemo_text_processing/text_normalization/ko/__init__.py
new file mode 100644
index 000000000..dd0e509b3
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemo_text_processing.text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst
+from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import VerbalizeFst
+from nemo_text_processing.text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst
diff --git a/nemo_text_processing/text_normalization/ko/data/number/__init__.py b/nemo_text_processing/text_normalization/ko/data/number/__init__.py
new file mode 100644
index 000000000..341a77c5b
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/data/number/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/nemo_text_processing/text_normalization/ko/data/number/digit.tsv b/nemo_text_processing/text_normalization/ko/data/number/digit.tsv
new file mode 100644
index 000000000..61a7dddcf
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/data/number/digit.tsv
@@ -0,0 +1,9 @@
+1	일
+2	이
+3	삼
+4	사
+5	오
+6	육
+7	칠
+8	팔
+9	구
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/ko/data/number/teen.tsv b/nemo_text_processing/text_normalization/ko/data/number/teen.tsv
new file mode 100644
index 000000000..432fe5eb6
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/data/number/teen.tsv
@@ -0,0 +1,10 @@
+10	십
+11	십일
+12	십이
+13	십삼
+14	십사
+15	십오
+16	십육
+17	십칠
+18	십팔
+19	십구
diff --git a/nemo_text_processing/text_normalization/ko/data/number/ty.tsv b/nemo_text_processing/text_normalization/ko/data/number/ty.tsv
new file mode 100644
index 000000000..02623c44c
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/data/number/ty.tsv
@@ -0,0 +1,8 @@
+2	이십
+3	삼십
+4	사십
+5	오십
+6	육십
+7	칠십
+8	팔십
+9	구십
diff --git a/nemo_text_processing/text_normalization/ko/data/number/zero.tsv b/nemo_text_processing/text_normalization/ko/data/number/zero.tsv
new file mode 100644
index 000000000..7024c0534
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/data/number/zero.tsv
@@ -0,0 +1 @@
+0	영
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/ko/graph_utils.py b/nemo_text_processing/text_normalization/ko/graph_utils.py
new file mode 100644
index 000000000..a7ffdd2b1
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/graph_utils.py
@@ -0,0 +1,173 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# Copyright 2015 and onwards Google, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import string
+from pathlib import Path
+from typing import Dict
+
+import pynini
+from pynini import Far
+from pynini.export import export
+from pynini.lib import byte, pynutil, utf8
+
+from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels
+from nemo_text_processing.utils.logging import logger
+
+NEMO_CHAR = utf8.VALID_UTF8_CHAR
+
+NEMO_DIGIT = byte.DIGIT
+NEMO_ALPHA = pynini.union(*[chr(i) for i in range(ord('가'), ord('힣') + 1)]).optimize()
+NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize()
+NEMO_HEX = pynini.union(*string.hexdigits).optimize()
+NEMO_SPACE = " "
+NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00a0").optimize()
+NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
+NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()
+
+NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize()
+NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize()
+
+NEMO_SIGMA = pynini.closure(NEMO_CHAR)
+
+delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))
+delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1))
+insert_space = pynutil.insert(" ")
+delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")
+delete_preserve_order = pynini.closure(
+    pynutil.delete(" preserve_order: true")
+    | (pynutil.delete(' field_order: "') + NEMO_NOT_QUOTE + pynutil.delete('"'))
+)
+
+
+# Common string literals; expand as you see fit.
+username_string = "username"
+double_quotes = '"'
+domain_string = "domain"
+protocol_string = "protocol"
+slash = "/"
+double_slash = "//"
+triple_slash = "///"
+file = "file"
+period = "."
+at = "@"
+colon = ":"
+https = "https"
+http = "http"
+www = "www"
+
+
+def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]):
+    """
+    Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name.
+
+    Args:
+        file_name: exported file name
+        graphs: Mapping of a rule name and Pynini WFST graph to be exported
+    """
+    exporter = export.Exporter(file_name)
+    for rule, graph in graphs.items():
+        exporter[rule] = graph.optimize()
+    exporter.close()
+    logger.info(f"Created {file_name}")
+
+
+def convert_space(fst) -> "pynini.FstLike":
+    """
+    Converts space to nonbreaking space.
+    Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty".
+    This makes the transducer significantly slower, so only use it when there could be spaces within quotes; otherwise leave it out.
+
+    Args:
+        fst: input fst
+
+    Returns output fst where breaking spaces are converted to non-breaking spaces
+    """
+    return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, "\u00a0"), "", "", NEMO_SIGMA)
+
+
+def string_map_cased(input_file: str, input_case: str = "lower_cased"):
+    labels = load_labels(input_file)
+    whitelist = pynini.string_map(labels).invert().optimize()
+    return whitelist
+
+
+class GraphFst:
+    """
+    Base class for all grammar fsts.
+
+    Args:
+        name: name of grammar class
+        kind: either 'classify' or 'verbalize'
+        deterministic: if True will provide a single transduction option,
+            for False multiple transductions are generated (used for audio-based normalization)
+    """
+
+    def __init__(self, name: str, kind: str, deterministic: bool = True):
+        self.name = name
+        self.kind = kind
+        self._fst = None
+        self.deterministic = deterministic
+
+        self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far")
+        if self.far_exist():
+            self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst()
+
+    def far_exist(self) -> bool:
+        """
+        Returns true if FAR can be loaded
+        """
+        return self.far_path.exists()
+
+    @property
+    def fst(self) -> "pynini.FstLike":
+        return self._fst
+
+    @fst.setter
+    def fst(self, fst):
+        self._fst = fst
+
+    def add_tokens(self, fst) -> "pynini.FstLike":
+        """
+        Wraps class name around the given fst
+
+        Args:
+            fst: input fst
+
+        Returns:
+            Fst: fst
+        """
+        return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")
+
+    def delete_tokens(self, fst) -> "pynini.FstLike":
+        """
+        Deletes the class name wrapped around the output of the given fst
+
+        Args:
+            fst: input fst
+
+        Returns:
+            Fst: fst
+        """
+        res = (
+            pynutil.delete(f"{self.name}")
+            + delete_space
+            + pynutil.delete("{")
+            + delete_space
+            + fst
+            + delete_space
+            + pynutil.delete("}")
+        )
+        return res @ pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA)
diff --git a/nemo_text_processing/text_normalization/ko/taggers/__init__.py b/nemo_text_processing/text_normalization/ko/taggers/__init__.py
new file mode 100644
index 000000000..341a77c5b
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/taggers/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py
new file mode 100644
index 000000000..187ebd419
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py
@@ -0,0 +1,267 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst
+from nemo_text_processing.text_normalization.ko.utils import get_abs_path
+
+
+class CardinalFst(GraphFst):
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="cardinal", kind="classify", deterministic=deterministic)
+        # Load base .tsv files
+        graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv"))
+        graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv"))
+
+        digit_except_one = pynini.difference(NEMO_DIGIT, "1")
+        digit_except_zero_one = pynini.difference(digit_except_one, "0")
+
+        graph_digit_alt = digit_except_zero_one @ graph_digit
+        graph_ty = pynini.string_file(get_abs_path("data/number/ty.tsv"))
+        graph_teen = pynini.string_file(get_abs_path("data/number/teen.tsv"))
+
+        # Compose all basic number forms
+        graph_all = (graph_ty + (graph_digit | pynutil.delete('0'))) | graph_teen | graph_digit
+
+        hundreds = NEMO_DIGIT**3
+        graph_hundred_component = (pynini.cross('1', '백') | (graph_digit_alt + pynutil.insert('백'))) + pynini.union(
+            pynini.closure(pynutil.delete('0')), (pynini.closure(pynutil.delete('0')) + graph_all)
+        )
+        graph_hundred = hundreds @ graph_hundred_component
+
+        thousands = NEMO_DIGIT**4
+        graph_thousand_component = (pynini.cross('1', '천') | (graph_digit_alt + pynutil.insert('천'))) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_hundred_component,
+            (pynini.closure(pynutil.delete('0')) + graph_all),
+        )
+        graph_thousand = thousands @ graph_thousand_component
+
+        ten_thousands = NEMO_DIGIT**5
+        graph_ten_thousand_component = (pynini.cross('1', '만') | (graph_digit + pynutil.insert('만'))) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_thousand_component,
+            (pynutil.delete('0') + graph_hundred_component),
+            (pynini.closure(pynutil.delete('0')) + graph_all),
+        )
+        graph_ten_thousand = ten_thousands @ graph_ten_thousand_component
+
+        hundred_thousands = NEMO_DIGIT**6
+        graph_hundred_thousand_component = ((NEMO_DIGIT ** 2 @ graph_all) + pynutil.insert('만')) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_thousand_component,
+            (pynutil.delete('0') + graph_hundred_component),
+            (pynini.closure(pynutil.delete('0')) + graph_all),
+        )
+        graph_hundred_thousand = hundred_thousands @ graph_hundred_thousand_component
+
+        millions = NEMO_DIGIT**7
+        graph_million_component = ((NEMO_DIGIT**3 @ graph_hundred_component) + pynutil.insert('만')) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_thousand_component,
+            (pynutil.delete('0') + graph_hundred_component),
+            (pynini.closure(pynutil.delete('0')) + graph_all),
+        )
+        graph_million = millions @ graph_million_component
+
+        ten_millions = NEMO_DIGIT**8
+        graph_ten_million_component = ((NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('만')) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_thousand_component,
+            (pynutil.delete('0') + graph_hundred_component),
+            (pynini.closure(pynutil.delete('0')) + graph_all),
+        )
+        graph_ten_million = ten_millions @ graph_ten_million_component
+
+        hundred_millions = NEMO_DIGIT ** 9
+        graph_hundred_million_component = (graph_digit + pynutil.insert('억')) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_ten_million_component,
+            (pynutil.delete('0') + graph_million_component),
+            (pynutil.delete('00') + graph_hundred_thousand_component),
+            (pynutil.delete('000') + graph_ten_thousand_component),
+            (pynutil.delete('0000') + graph_thousand_component),
+            ((pynutil.delete('00000') + graph_hundred_component)),
+            (pynini.closure(pynutil.delete('0')) + graph_all),
+        )
+        graph_hundred_million = hundred_millions @ graph_hundred_million_component
+
+        thousand_millions = NEMO_DIGIT**10
+        graph_thousand_million_component = ((NEMO_DIGIT**2 @ graph_all) + pynutil.insert('억')) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_ten_million_component,
+            (pynutil.delete('0') + graph_million_component),
+            (pynutil.delete('00') + graph_hundred_thousand_component),
+            (pynutil.delete('000') + graph_ten_thousand_component),
+            (pynutil.delete('0000') + graph_thousand_component),
+            ((pynutil.delete('00000') + graph_hundred_component)),
+            (pynini.closure(pynutil.delete('0')) + graph_all),
+        )
+        graph_thousand_million = thousand_millions @ graph_thousand_million_component
+
+        billions = NEMO_DIGIT**11
+        graph_billions_component = ((NEMO_DIGIT**3 @ graph_hundred_component) + pynutil.insert('억')) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_ten_million_component,
+            (pynutil.delete('0') + graph_million_component),
+            (pynutil.delete('00') + graph_hundred_thousand_component),
+            (pynutil.delete('000') + graph_ten_thousand_component),
+            (pynutil.delete('0000') + graph_thousand_component),
+            ((pynutil.delete('00000') + graph_hundred_component)),
+            (pynini.closure(pynutil.delete('0')) + graph_all),
+        )
+        graph_billions = billions @ graph_billions_component
+
+        ten_billions = NEMO_DIGIT**12
+        graph_ten_billions_component = ((NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('억')) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_ten_million_component,
+            (pynutil.delete('0') + graph_million_component),
+            (pynutil.delete('00') + graph_hundred_thousand_component),
+            (pynutil.delete('000') + graph_ten_thousand_component),
+            (pynutil.delete('0000') + graph_thousand_component),
+            ((pynutil.delete('00000') + graph_hundred_component)),
+            (pynini.closure(pynutil.delete('0')) + graph_all),
+        )
+        graph_ten_billions = ten_billions @ graph_ten_billions_component
+
+        hundred_billions = NEMO_DIGIT**13
+        graph_hundred_billions_component = (graph_digit + pynutil.insert('조')) + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_ten_billions_component,
+            pynutil.delete('0') + graph_billions_component,
+            pynutil.delete('00') + graph_thousand_million_component,
+            pynutil.delete('000') + graph_hundred_million_component,
+            pynutil.delete('0000') + graph_ten_million_component,
+            pynutil.delete('00000') + graph_million_component,
+            pynutil.delete('000000') + graph_hundred_thousand_component,
+            pynutil.delete('0000000') + graph_ten_thousand_component,
+            pynutil.delete('00000000') + graph_thousand_component,
+            pynutil.delete('000000000') + graph_hundred_component,
+            (pynini.closure(pynutil.delete('0')) + graph_all),
+        )
+        graph_hundred_billions = hundred_billions @ graph_hundred_billions_component
+
+        trillion = NEMO_DIGIT**14
+        graph_trillion_component = ((NEMO_DIGIT**2 @ graph_all) + pynutil.insert('조') + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_ten_billions_component,
+            pynutil.delete('0') + graph_billions_component,
+            pynutil.delete('00') + graph_thousand_million_component,
+            pynutil.delete('000') + graph_hundred_million_component,
+            pynutil.delete('0000') + graph_ten_million_component,
+            pynutil.delete('00000') + graph_million_component,
+            pynutil.delete('000000') + graph_hundred_thousand_component,
+            pynutil.delete('0000000') + graph_ten_thousand_component,
+            pynutil.delete('00000000') + graph_thousand_component,
+            pynutil.delete('000000000') + graph_hundred_component,
+            (pynini.closure(pynutil.delete('0')) + graph_all)
+            )
+        )
+        graph_trillions = trillion @ graph_trillion_component
+
+        ten_trillions = NEMO_DIGIT**15
+        graph_ten_trillions_component = ((NEMO_DIGIT**3 @ graph_hundred_component) + pynutil.insert('조') + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_ten_billions_component,
+            pynutil.delete('0') + graph_billions_component,
+            pynutil.delete('00') + graph_thousand_million_component,
+            pynutil.delete('000') + graph_hundred_million_component,
+            pynutil.delete('0000') + graph_ten_million_component,
+            pynutil.delete('00000') + graph_million_component,
+            pynutil.delete('000000') + graph_hundred_thousand_component,
+            pynutil.delete('0000000') + graph_ten_thousand_component,
+            pynutil.delete('00000000') + graph_thousand_component,
+            pynutil.delete('000000000') + graph_hundred_component,
+            (pynini.closure(pynutil.delete('0')) + graph_all)
+            )
+        )
+        graph_ten_trillions = ten_trillions @ graph_ten_trillions_component
+
+        hundred_trillions = NEMO_DIGIT**16
+        graph_hundred_trillions_component = ((NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('조') + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_ten_billions_component,
+            pynutil.delete('0') + graph_billions_component,
+            pynutil.delete('00') + graph_thousand_million_component,
+            pynutil.delete('000') + graph_hundred_million_component,
+            pynutil.delete('0000') + graph_ten_million_component,
+            pynutil.delete('00000') + graph_million_component,
+            pynutil.delete('000000') + graph_hundred_thousand_component,
+            pynutil.delete('0000000') + graph_ten_thousand_component,
+            pynutil.delete('00000000') + graph_thousand_component,
+            pynutil.delete('000000000') + graph_hundred_component,
+            (pynini.closure(pynutil.delete('0')) + graph_all)
+            )
+        )
+        graph_hundred_trillions = hundred_trillions @ graph_hundred_trillions_component
+
+        thousand_trillions = NEMO_DIGIT**17
+        graph_thousand_trillions_component = (graph_digit + pynutil.insert('경') + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_hundred_trillions_component,
+            pynutil.delete('0') + graph_ten_trillions_component,
+            pynutil.delete('00') + graph_trillion_component,
+            pynutil.delete('000') + graph_hundred_billions_component,
+            pynutil.delete('0000') + graph_ten_billions_component,
+            pynutil.delete('00000') + graph_billions_component,
+            pynutil.delete('000000') + graph_thousand_million_component,
+            pynutil.delete('0000000') + graph_hundred_million_component,
+            pynutil.delete('00000000') + graph_ten_million_component,
+            pynutil.delete('000000000') + graph_million_component,
+            pynutil.delete('0000000000') + graph_hundred_thousand_component,
+            pynutil.delete('00000000000') + graph_ten_thousand_component,
+            pynutil.delete('000000000000') + graph_thousand_component,
+            pynutil.delete('0000000000000') + graph_hundred_component,
+            (pynini.closure(pynutil.delete('0')) + graph_all)
+            )
+        )
+        graph_thousand_trillions = thousand_trillions @ graph_thousand_trillions_component
+
+        # FST
+        graph_num = pynini.union(
+            graph_thousand_trillions,
+            graph_hundred_trillions,
+            graph_ten_trillions,
+            graph_trillions,
+            graph_hundred_billions,
+            graph_ten_billions,
+            graph_billions,
+            graph_thousand_million,
+            graph_hundred_million,
+            graph_ten_million,
+            graph_million,
+            graph_hundred_thousand,
+            graph_ten_thousand,
+            graph_thousand,
+            graph_hundred,
+            graph_all,
+            graph_zero,
+        ).optimize()
+
+        # Sign and final formatting
+        optional_sign = pynini.closure(
+            pynutil.insert('negative: "true" ') + pynini.cross("-", ""), 0, 1
+        )
+        final_graph = (
+            optional_sign
+            + pynutil.insert('integer: "')
+            + graph_num
+            + pynutil.insert('"')
+        )
+        final_graph = self.add_tokens(final_graph)
+        self.fst = final_graph.optimize()
diff --git a/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py
new file mode 100644
index 000000000..2b22da370
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.ko.graph_utils import (
+    NEMO_WHITE_SPACE,
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+    generator_main,
+)
+
+from nemo_text_processing.text_normalization.ko.taggers.cardinal import CardinalFst
+from nemo_text_processing.utils.logging import logger
+
+
+class ClassifyFst(GraphFst):
+    """
+    Final class that composes all other classification grammars. This class can process an entire sentence including punctuation.
+    For deployment, this grammar will be compiled and exported to an OpenFst Finite State Archive (FAR) file.
+    More details on deployment at NeMo/tools/text_processing_deployment.
+
+    Args:
+        input_case: accepting either "lower_cased" or "cased" input.
+        deterministic: if True will provide a single transduction option,
+            for False multiple options (used for audio-based normalization)
+        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
+        overwrite_cache: set to True to overwrite .far files
+        whitelist: path to a file with whitelist replacements
+    """
+
+    def __init__(
+        self,
+        input_case: str = "cased",
+        deterministic: bool = True,
+        cache_dir: str = None,
+        overwrite_cache: bool = False,
+        whitelist: str = None,
+    ):
+        super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)
+
+        far_file = None
+        if cache_dir is not None and cache_dir != "None":
+            os.makedirs(cache_dir, exist_ok=True)
+            far_file = os.path.join(cache_dir, f"ko_tn_{deterministic}_tokenize.far")
+        if not overwrite_cache and far_file and os.path.exists(far_file):
+            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
+            logger.info(f"ClassifyFst.fst was restored from {far_file}.")
+        else:
+            cardinal = CardinalFst(deterministic=deterministic)
+
+            classify = pynini.union(pynutil.add_weight(cardinal.fst, 1.1))
+
+            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
+            tagger = pynini.closure(token, 1)
+
+            self.fst = tagger.optimize()
+
+            if far_file:
+                generator_main(far_file, {"tokenize_and_classify": self.fst})
diff --git a/nemo_text_processing/text_normalization/ko/utils.py b/nemo_text_processing/text_normalization/ko/utils.py
new file mode 100644
index 000000000..51aaea3e8
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/utils.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import csv
+import os
+
+
+def get_abs_path(rel_path):
+    """
+    Get absolute path
+
+    Args:
+        rel_path: relative path to this file
+
+    Returns absolute path
+    """
+    return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path
+
+
+def load_labels(abs_path):
+    """
+    Loads a label .tsv file as a list of label pairs
+
+    Args:
+        abs_path: absolute path to the file
+
+    Returns list of mappings
+    """
+    with open(abs_path, encoding="utf-8") as label_tsv:
+        labels = list(csv.reader(label_tsv, delimiter="\t"))
+    return labels
+
+
+def augment_labels_with_punct_at_end(labels):
+    """
+    Augments labels: if a key ends on a punctuation mark that the value does not have, add a new label
+    where the value maintains the punctuation
+
+    Args:
+        labels: input labels
+    Returns:
+        additional labels
+    """
+    res = []
+    for label in labels:
+        if len(label) > 1:
+            if label[0][-1] == "." and label[1][-1] != ".":
+                res.append([label[0], label[1] + "."] + label[2:])
+    return res
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/__init__.py b/nemo_text_processing/text_normalization/ko/verbalizers/__init__.py
new file mode 100644
index 000000000..341a77c5b
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/ko/verbalizers/cardinal.py
new file mode 100644
index 000000000..c6a48ab33
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/cardinal.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space
+
+
+class CardinalFst(GraphFst):
+    """
+    Finite state transducer for verbalizing cardinal, e.g.
+        cardinal { negative: "true" integer: "23" } -> 마이너스 이십삼
+
+    Args:
+        deterministic: if True will provide a single transduction option,
+            for False multiple options (used for audio-based normalization)
+    """
+
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="cardinal", kind="verbalize", deterministic=deterministic)
+
+        self.optional_sign = pynini.cross("negative: \"true\"", "마이너스 ")
+        if not deterministic:
+            self.optional_sign |= pynini.cross("negative: \"true\"", "음수 ")
+            self.optional_sign |= pynini.cross("negative: \"true\"", "- ")
+
+        self.optional_sign = pynini.closure(self.optional_sign + delete_space, 0, 1)
+
+        integer = pynini.closure(NEMO_NOT_QUOTE)
+
+        self.integer = delete_space + pynutil.delete("\"") + integer + pynutil.delete("\"")
+        integer = pynutil.delete("integer:") + self.integer
+
+        self.numbers = self.optional_sign + integer
+        delete_tokens = self.delete_tokens(self.numbers)
+        self.fst = delete_tokens.optimize()
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py b/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py
new file mode 100644
index 000000000..09ec216c2
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py
@@ -0,0 +1,113 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+
+import pynini
+
+from nemo_text_processing.text_normalization.en.graph_utils import (
+    NEMO_NOT_SPACE,
+    NEMO_SIGMA,
+    delete_space,
+    generator_main,
+)
+from nemo_text_processing.utils.logging import logger
+
+
+class PostProcessingFst:
+    """
+    Finite state transducer that post-processes an entire sentence after verbalization is complete, e.g.
+    removes extra spaces around punctuation marks: " ( one hundred and twenty three ) " -> "(one hundred and twenty three)"
+
+    Args:
+        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
+        overwrite_cache: set to True to overwrite .far files
+    """
+
+    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
+
+        far_file = None
+        if cache_dir is not None and cache_dir != "None":
+            os.makedirs(cache_dir, exist_ok=True)
+            far_file = os.path.join(cache_dir, "ko_tn_post_processing.far")
+        if not overwrite_cache and far_file and os.path.exists(far_file):
+            self.fst = pynini.Far(far_file, mode="r")["post_process_graph"]
+            logger.info(f'Post processing graph was restored from {far_file}.')
+        else:
+            self.set_punct_dict()
+            self.fst = self.get_punct_postprocess_graph()
+
+            if far_file:
+                generator_main(far_file, {"post_process_graph": self.fst})
+
+    def set_punct_dict(self):
+        self.punct_marks = {
+            "'": [
+                "'",
+                '´',
+                'ʹ',
+                'ʻ',
+                'ʼ',
+                'ʽ',
+                'ʾ',
+                'ˈ',
+                'ˊ',
+                'ˋ',
+                '˴',
+                'ʹ',
+                '΄',
+                '՚',
+                '՝',
+                'י',
+                '׳',
+                'ߴ',
+                'ߵ',
+                'ᑊ',
+                'ᛌ',
+                '᾽',
+                '᾿',
+                '`',
+                '´',
+                '῾',
+                '‘',
+                '’',
+                '‛',
+                '′',
+                '‵',
+                'ꞌ',
+                '＇',
+                '｀',
+                '𖽑',
+                '𖽒',
+            ],
+        }
+
+    def get_punct_postprocess_graph(self):
+        """
+        Returns graph to post process punctuation marks.
+
+        {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept.
+        By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks.
+        """
+
+        remove_space_around_single_quote = pynini.cdrewrite(
+            delete_space, NEMO_NOT_SPACE, NEMO_NOT_SPACE, pynini.closure(NEMO_SIGMA)
+        )
+        # this works if spaces are in between (good)
+        # deletes a space between two NEMO_NOT_SPACE symbols (left and right of the space) within a content of NEMO_SIGMA
+
+        graph = remove_space_around_single_quote.optimize()
+
+        return graph
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize.py
new file mode 100644
index 000000000..9753db347
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+
+from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst
+from nemo_text_processing.text_normalization.ko.verbalizers.cardinal import CardinalFst
+
+
+class VerbalizeFst(GraphFst):
+    """
+    Composes other verbalizer grammars.
+    For deployment, this grammar will be compiled and exported to an OpenFst Finite State Archive (FAR) file.
+    More details on deployment at NeMo/tools/text_processing_deployment.
+
+    Args:
+        deterministic: if True will provide a single transduction option,
+            for False multiple options (used for audio-based normalization)
+    """
+
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="verbalize", kind="verbalize", deterministic=deterministic)
+
+        cardinal = CardinalFst(deterministic=deterministic)
+        cardinal_graph = cardinal.fst
+
+        self.fst = cardinal_graph.optimize()
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py
new file mode 100644
index 000000000..9a4e2f7bf
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.ko.graph_utils import (
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+    generator_main,
+)
+from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import VerbalizeFst
+from nemo_text_processing.utils.logging import logger
+
+
+class VerbalizeFinalFst(GraphFst):
+    """
+    Finite state transducer that verbalizes an entire sentence, e.g.
+    tokens { name: "its" } tokens { time { hours: "twelve" minutes: "thirty" } } tokens { name: "now" } tokens { name: "." } -> its twelve thirty now .
+
+    Args:
+        deterministic: if True will provide a single transduction option,
+            for False multiple options (used for audio-based normalization)
+        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
+        overwrite_cache: set to True to overwrite .far files
+    """
+
+    def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False):
+        super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic)
+
+        far_file = None
+        if cache_dir is not None and cache_dir != "None":
+            os.makedirs(cache_dir, exist_ok=True)
+            far_file = os.path.join(cache_dir, f"ko_tn_{deterministic}_verbalizer.far")
+        if not overwrite_cache and far_file and os.path.exists(far_file):
+            self.fst = pynini.Far(far_file, mode="r")["verbalize"]
+            logger.info(f'VerbalizeFinalFst graph was restored from {far_file}.')
+        else:
+            verbalize = VerbalizeFst(deterministic=deterministic).fst
+            # word = WordFst(deterministic=deterministic).fst
+            types = verbalize
+
+            if deterministic:
+                graph = (
+                    pynutil.delete("tokens")
+                    + delete_space
+                    + pynutil.delete("{")
+                    + delete_space
+                    + types
+                    + delete_space
+                    + pynutil.delete("}")
+                )
+            else:
+                graph = delete_space + types + delete_space
+
+            graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space
+
+            self.fst = graph.optimize()
+            if far_file:
+                generator_main(far_file, {"verbalize": self.fst})
diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py
index 82f8f43d2..1d90903e1 100644
--- a/nemo_text_processing/text_normalization/normalize.py
+++ b/nemo_text_processing/text_normalization/normalize.py
@@ -174,6 +174,9 @@ def __init__(
         elif lang == 'ja':
             from nemo_text_processing.text_normalization.ja.taggers.tokenize_and_classify import ClassifyFst
             from nemo_text_processing.text_normalization.ja.verbalizers.verbalize_final import VerbalizeFinalFst
+        elif lang == 'ko':
+            from nemo_text_processing.text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst
+            from nemo_text_processing.text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst
         else:
             raise NotImplementedError(f"Language {lang} has not been supported yet.")
 
@@ -720,7 +723,7 @@ def parse_args():
     parser.add_argument(
         "--language",
         help="language",
-        choices=["en", "de", "es", "fr", "hu", "sv", "zh", "ar", "it", "hy", "ja", "hi"],
+        choices=["en", "de", "es", "fr", "hu", "sv", "zh", "ar", "it", "hy", "ja", "hi", "ko"],
         default="en",
         type=str,
     )
@@ -765,6 +768,7 @@ def parse_args():
     parser.add_argument("--n_jobs", default=-2, type=int, help="The maximum number of concurrently running jobs")
     parser.add_argument("--batch_size", default=200, type=int, help="Number of examples for each process")
     parser.add_argument(
+
         "--max_number_of_permutations_per_split",
         default=729,
         type=int,
diff --git a/tests/nemo_text_processing/ko/__init__.py b/tests/nemo_text_processing/ko/__init__.py
new file mode 100644
index 000000000..341a77c5b
--- /dev/null
+++ b/tests/nemo_text_processing/ko/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
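As a quick sanity check of the pieces above, the new grammars are driven through the shared Normalizer front end. A minimal sketch (illustrative only, not part of the patch; the expected outputs follow the cardinal test cases added in the next file):

    from nemo_text_processing.text_normalization.normalize import Normalizer

    # Builds the Korean tagger (producing, e.g., tokens { cardinal { integer: "백이십삼" } })
    # and the verbalizer FSTs defined above, then applies them in sequence.
    normalizer = Normalizer(input_case="cased", lang="ko")

    print(normalizer.normalize("123"))  # -> 백이십삼
    print(normalizer.normalize("-23"))  # -> 마이너스 이십삼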
diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt
new file mode 100644
index 000000000..25dd560d1
--- /dev/null
+++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt
@@ -0,0 +1,19 @@
+1~일
+2~이
+3~삼
+123~백이십삼
+13000~만삼천
+9000~구천
+123000~십이만삼천
+123000012~일억이천삼백만십이
+1000000~백만
+100000000~일억
+1000000000000~일조
+100000000000000~백조
+20000000000001~이십조일
+800000000001001~팔백조천일
+82345670123135111~팔경이천삼백사십오조육천칠백일억이천삼백십삼만오천백십일
+9999999999999~구조구천구백구십구억구천구백구십구만구천구백구십구
+99999999999999~구십구조구천구백구십구억구천구백구십구만구천구백구십구
+999999999999999~구백구십구조구천구백구십구억구천구백구십구만구천구백구십구
+9999999999999999~구천구백구십구조구천구백구십구억구천구백구십구만구천구백구십구
\ No newline at end of file
diff --git a/tests/nemo_text_processing/ko/test_cardinal.py b/tests/nemo_text_processing/ko/test_cardinal.py
new file mode 100644
index 000000000..ed422e13e
--- /dev/null
+++ b/tests/nemo_text_processing/ko/test_cardinal.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+from nemo_text_processing.text_normalization.normalize import Normalizer
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestCardinal:
+    normalizer_ko = Normalizer(
+        lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='lower_cased'
+    )
+
+    @parameterized.expand(parse_test_case_file('ko/data_text_normalization/test_cases_cardinal.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_norm(self, test_input, expected):
+        preds = self.normalizer_ko.normalize(test_input)
+        assert expected == preds
diff --git a/tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh
new file mode 100644
index 000000000..9a50509cf
--- /dev/null
+++ b/tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh
@@ -0,0 +1,123 @@
+#! /bin/sh
+GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"}
+TEST_DIR=${2:-"/workspace/tests"}
+
+runtest () {
+  input=$1
+  echo "INPUT is $input"
+  cd ${GRAMMARS_DIR}
+
+  # read test file
+  while read testcase; do
+    IFS='~' read written spoken <<< $testcase
+    # replace non-breaking space with breaking space
+    # Use below if postprocessor is not used. Comment if it is used
+    #denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g')
+    # Use below if postprocessor is used. Comment if it is not used
Comment if it is not used + denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration_pp.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') + + # trim white space + spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + + # input expected actual + assertEquals "$written" "$spoken" "$denorm_pred" + done < "$input" +} + + +testTNCardinal() { + input=$TEST_DIR/ko/data_text_normalization/test_cases_cardinal.txt + runtest $input +} + +#testTNSpecialText() { +# input=$TEST_DIR/data_text_normalization/test_cases_special_text.txt +# runtest $input +#} + +#testTNDate() { +# input=$TEST_DIR/data_text_normalization/test_cases_date.txt +# runtest $input +#} + +#testTNDecimal() { +# input=$TEST_DIR/data_text_normalization/test_cases_decimal.txt +# runtest $input +#} + +#testTNRange() { +# input=$TEST_DIR/data_text_normalization/test_cases_range.txt +# runtest $input +#} + +#testTNSerial() { +# input=$TEST_DIR/data_text_normalization/test_cases_serial.txt +# runtest $input +#} + +#testTNRoman() { +# input=$TEST_DIR/data_text_normalization/test_cases_roman.txt +# runtest $input +#} + +#testTNElectronic() { +# input=$TEST_DIR/data_text_normalization/test_cases_electronic.txt +# runtest $input +#} + +#testTNFraction() { +# input=$TEST_DIR/data_text_normalization/test_cases_fraction.txt +# runtest $input +#} + +#testTNMoney() { +# input=$TEST_DIR/data_text_normalization/test_cases_money.txt +# runtest $input +#} + +#testTNOrdinal() { +# input=$TEST_DIR/data_text_normalization/test_cases_ordinal.txt +# runtest $input +#} + +#testTNTelephone() { +# input=$TEST_DIR/data_text_normalization/test_cases_telephone.txt +# runtest $input +#} + +#testTNTime() { +# input=$TEST_DIR/data_text_normalization/test_cases_time.txt +# runtest $input +#} + +#testTNMeasure() { +# input=$TEST_DIR/data_text_normalization/test_cases_measure.txt +# runtest $input +#} + +#testTNWhitelist() { +# input=$TEST_DIR/data_text_normalization/test_cases_whitelist.txt +# runtest $input +#} + +#testTNWord() { +# input=$TEST_DIR/data_text_normalization/test_cases_word.txt +# runtest $input +#} + +#testTNAddress() { +# input=$TEST_DIR/data_text_normalization/test_cases_address.txt +# runtest $input +#} + +#testTNMath() { +# input=$TEST_DIR/data_text_normalization/test_cases_math.txt +# runtest $input +#} + +# Remove all command-line arguments +shift $# + +# Load shUnit2 +. 
/workspace/shunit2/shunit2 \ No newline at end of file diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 6b82dfbec..0885f19c0 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -106,6 +106,7 @@ def parse_args(): 'mr', 'ja', 'rw', + 'ko', ], type=str, default='en', @@ -312,6 +313,16 @@ def parse_args(): ClassifyFst as TNClassifyFst, ) from nemo_text_processing.text_normalization.rw.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst + elif args.language == 'ko': + from nemo_text_processing.text_normalization.ko.taggers.tokenize_and_classify import ( + ClassifyFst as TNClassifyFst, + ) + from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import ( + VerbalizeFst as TNVerbalizeFst, + ) + from nemo_text_processing.text_normalization.ko.verbalizers.post_processing import ( + PostProcessingFst as TNPostProcessingFst, + ) output_dir = os.path.join(args.output_dir, f"{args.language}_{args.grammars}_{args.input_case}") export_grammars( output_dir=output_dir, From eb6a8c07784852331f2f3b8ad34b92ac94d32d82 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 21 May 2025 22:21:40 +0000 Subject: [PATCH 2/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/ko/taggers/cardinal.py | 171 +++++++++--------- .../text_normalization/normalize.py | 3 +- .../pynini_export.py | 4 +- 3 files changed, 92 insertions(+), 86 deletions(-) diff --git a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py index 187ebd419..51c82e213 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py @@ -24,15 +24,15 @@ class CardinalFst(GraphFst): def __init__(self, deterministic: bool = True): super().__init__(name="cardinal", kind="classify", deterministic=deterministic) # Load base .tsv files - graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv")) - graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv")) - + graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv")) + graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv")) + digit_except_one = pynini.difference(NEMO_DIGIT, "1") digit_except_zero_one = pynini.difference(digit_except_one, "0") - + graph_digit_alt = digit_except_zero_one @ graph_digit graph_ty = pynini.string_file(get_abs_path("data/number/ty.tsv")) - graph_teen = pynini.string_file(get_abs_path("data/number/teen.tsv")) + graph_teen = pynini.string_file(get_abs_path("data/number/teen.tsv")) # Compose all basic number forms graph_all = (graph_ty + (graph_digit | pynutil.delete('0'))) | graph_teen | graph_digit @@ -50,7 +50,7 @@ def __init__(self, deterministic: bool = True): (pynini.closure(pynutil.delete('0')) + graph_all), ) graph_thousand = thousands @ graph_thousand_component - + ten_thousands = NEMO_DIGIT**5 graph_ten_thousand_component = (pynini.cross('1', '만') | (graph_digit + pynutil.insert('만'))) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -59,16 +59,16 @@ def __init__(self, deterministic: bool = True): (pynini.closure(pynutil.delete('0')) + graph_all), ) graph_ten_thousand = ten_thousands @ graph_ten_thousand_component - + hundred_thousands = NEMO_DIGIT**6 - 
graph_hundred_thousand_component = ((NEMO_DIGIT ** 2 @ graph_all) + pynutil.insert('만')) + pynini.union( + graph_hundred_thousand_component = ((NEMO_DIGIT**2 @ graph_all) + pynutil.insert('만')) + pynini.union( pynini.closure(pynutil.delete('0')), graph_thousand_component, (pynutil.delete('0') + graph_hundred_component), (pynini.closure(pynutil.delete('0')) + graph_all), ) graph_hundred_thousand = hundred_thousands @ graph_hundred_thousand_component - + millions = NEMO_DIGIT**7 graph_million_component = ((NEMO_DIGIT**3 @ graph_hundred_component) + pynutil.insert('만')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -79,15 +79,17 @@ def __init__(self, deterministic: bool = True): graph_million = millions @ graph_million_component ten_millions = NEMO_DIGIT**8 - graph_ten_million_component = ((NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('만')) + pynini.union( + graph_ten_million_component = ( + (NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('만') + ) + pynini.union( pynini.closure(pynutil.delete('0')), graph_thousand_component, (pynutil.delete('0') + graph_hundred_component), (pynini.closure(pynutil.delete('0')) + graph_all), ) graph_ten_million = ten_millions @ graph_ten_million_component - - hundred_millions = NEMO_DIGIT ** 9 + + hundred_millions = NEMO_DIGIT**9 graph_hundred_million_component = (graph_digit + pynutil.insert('억')) + pynini.union( pynini.closure(pynutil.delete('0')), graph_ten_million_component, @@ -127,7 +129,9 @@ def __init__(self, deterministic: bool = True): graph_billions = billions @ graph_billions_component ten_billions = NEMO_DIGIT**12 - graph_ten_billions_component = ((NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('억')) + pynini.union( + graph_ten_billions_component = ( + (NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('억') + ) + pynini.union( pynini.closure(pynutil.delete('0')), graph_ten_million_component, (pynutil.delete('0') + graph_million_component), @@ -138,7 +142,7 @@ def __init__(self, deterministic: bool = True): (pynini.closure(pynutil.delete('0')) + graph_all), ) graph_ten_billions = ten_billions @ graph_ten_billions_component - + hundred_billions = NEMO_DIGIT**13 graph_hundred_billions_component = (graph_digit + pynutil.insert('조')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -155,79 +159,91 @@ def __init__(self, deterministic: bool = True): (pynini.closure(pynutil.delete('0')) + graph_all), ) graph_hundred_billions = hundred_billions @ graph_hundred_billions_component - + trillion = NEMO_DIGIT**14 - graph_trillion_component = ((NEMO_DIGIT**2 @ graph_all) + pynutil.insert('조') + pynini.union( - pynini.closure(pynutil.delete('0')), - graph_ten_billions_component, - pynutil.delete('0') + graph_billions_component, - pynutil.delete('00') + graph_thousand_million_component, - pynutil.delete('000') + graph_hundred_million_component, - pynutil.delete('0000') + graph_ten_million_component, - pynutil.delete('00000') + graph_million_component, - pynutil.delete('000000') + graph_hundred_thousand_component, - pynutil.delete('0000000') + graph_ten_thousand_component, - pynutil.delete('00000000') + graph_thousand_component, - pynutil.delete('000000000') + graph_hundred_component, - (pynini.closure(pynutil.delete('0')) + graph_all) + graph_trillion_component = ( + (NEMO_DIGIT**2 @ graph_all) + + pynutil.insert('조') + + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_billions_component, + pynutil.delete('0') + graph_billions_component, + pynutil.delete('00') + 
+                pynutil.delete('000') + graph_hundred_million_component,
+                pynutil.delete('0000') + graph_ten_million_component,
+                pynutil.delete('00000') + graph_million_component,
+                pynutil.delete('000000') + graph_hundred_thousand_component,
+                pynutil.delete('0000000') + graph_ten_thousand_component,
+                pynutil.delete('00000000') + graph_thousand_component,
+                pynutil.delete('000000000') + graph_hundred_component,
+                (pynini.closure(pynutil.delete('0')) + graph_all),
             )
         )
         graph_trillions = trillion @ graph_trillion_component
 
         ten_trillions = NEMO_DIGIT**15
-        graph_ten_trillions_component = ((NEMO_DIGIT**3 @ graph_hundred_component) + pynutil.insert('조') + pynini.union(
-            pynini.closure(pynutil.delete('0')),
-            graph_ten_billions_component,
-            pynutil.delete('0') + graph_billions_component,
-            pynutil.delete('00') + graph_thousand_million_component,
-            pynutil.delete('000') + graph_hundred_million_component,
-            pynutil.delete('0000') + graph_ten_million_component,
-            pynutil.delete('00000') + graph_million_component,
-            pynutil.delete('000000') + graph_hundred_thousand_component,
-            pynutil.delete('0000000') + graph_ten_thousand_component,
-            pynutil.delete('00000000') + graph_thousand_component,
-            pynutil.delete('000000000') + graph_hundred_component,
-            (pynini.closure(pynutil.delete('0')) + graph_all)
-        )
+        graph_ten_trillions_component = (
+            (NEMO_DIGIT**3 @ graph_hundred_component)
+            + pynutil.insert('조')
+            + pynini.union(
+                pynini.closure(pynutil.delete('0')),
+                graph_ten_billions_component,
+                pynutil.delete('0') + graph_billions_component,
+                pynutil.delete('00') + graph_thousand_million_component,
+                pynutil.delete('000') + graph_hundred_million_component,
+                pynutil.delete('0000') + graph_ten_million_component,
+                pynutil.delete('00000') + graph_million_component,
+                pynutil.delete('000000') + graph_hundred_thousand_component,
+                pynutil.delete('0000000') + graph_ten_thousand_component,
+                pynutil.delete('00000000') + graph_thousand_component,
+                pynutil.delete('000000000') + graph_hundred_component,
+                (pynini.closure(pynutil.delete('0')) + graph_all),
+            )
         )
         graph_ten_trillions = ten_trillions @ graph_ten_trillions_component
 
         hundred_trillions = NEMO_DIGIT**16
-        graph_hundred_trillions_component = ((NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('조') + pynini.union(
-            pynini.closure(pynutil.delete('0')),
-            graph_ten_billions_component,
-            pynutil.delete('0') + graph_billions_component,
-            pynutil.delete('00') + graph_thousand_million_component,
-            pynutil.delete('000') + graph_hundred_million_component,
-            pynutil.delete('0000') + graph_ten_million_component,
-            pynutil.delete('00000') + graph_million_component,
-            pynutil.delete('000000') + graph_hundred_thousand_component,
-            pynutil.delete('0000000') + graph_ten_thousand_component,
-            pynutil.delete('00000000') + graph_thousand_component,
-            pynutil.delete('000000000') + graph_hundred_component,
-            (pynini.closure(pynutil.delete('0')) + graph_all)
+        graph_hundred_trillions_component = (
+            (NEMO_DIGIT**4 @ graph_thousand_component)
+            + pynutil.insert('조')
+            + pynini.union(
+                pynini.closure(pynutil.delete('0')),
+                graph_ten_billions_component,
+                pynutil.delete('0') + graph_billions_component,
+                pynutil.delete('00') + graph_thousand_million_component,
+                pynutil.delete('000') + graph_hundred_million_component,
+                pynutil.delete('0000') + graph_ten_million_component,
+                pynutil.delete('00000') + graph_million_component,
+                pynutil.delete('000000') + graph_hundred_thousand_component,
+                pynutil.delete('0000000') + graph_ten_thousand_component,
+                pynutil.delete('00000000') + graph_thousand_component,
+                pynutil.delete('000000000') + graph_hundred_component,
+                (pynini.closure(pynutil.delete('0')) + graph_all),
             )
         )
         graph_hundred_trillions = hundred_trillions @ graph_hundred_trillions_component
 
         thousand_trillions = NEMO_DIGIT**17
-        graph_thousand_trillions_component = (graph_digit + pynutil.insert('경') + pynini.union(
-            pynini.closure(pynutil.delete('0')),
-            graph_hundred_trillions_component,
-            pynutil.delete('0') + graph_ten_trillions_component,
-            pynutil.delete('00') + graph_trillion_component,
-            pynutil.delete('000') + graph_hundred_billions_component,
-            pynutil.delete('0000') + graph_ten_billions_component,
-            pynutil.delete('00000') + graph_billions_component,
-            pynutil.delete('000000') + graph_thousand_million_component,
-            pynutil.delete('0000000') + graph_hundred_million_component,
-            pynutil.delete('00000000') + graph_ten_million_component,
-            pynutil.delete('000000000') + graph_million_component,
-            pynutil.delete('0000000000') + graph_hundred_thousand_component,
-            pynutil.delete('00000000000') + graph_ten_thousand_component,
-            pynutil.delete('000000000000') + graph_thousand_component,
-            pynutil.delete('0000000000000') + graph_hundred_component,
-            (pynini.closure(pynutil.delete('0')) + graph_all)
+        graph_thousand_trillions_component = (
+            graph_digit
+            + pynutil.insert('경')
+            + pynini.union(
+                pynini.closure(pynutil.delete('0')),
+                graph_hundred_trillions_component,
+                pynutil.delete('0') + graph_ten_trillions_component,
+                pynutil.delete('00') + graph_trillion_component,
+                pynutil.delete('000') + graph_hundred_billions_component,
+                pynutil.delete('0000') + graph_ten_billions_component,
+                pynutil.delete('00000') + graph_billions_component,
+                pynutil.delete('000000') + graph_thousand_million_component,
+                pynutil.delete('0000000') + graph_hundred_million_component,
+                pynutil.delete('00000000') + graph_ten_million_component,
+                pynutil.delete('000000000') + graph_million_component,
+                pynutil.delete('0000000000') + graph_hundred_thousand_component,
+                pynutil.delete('00000000000') + graph_ten_thousand_component,
+                pynutil.delete('000000000000') + graph_thousand_component,
+                pynutil.delete('0000000000000') + graph_hundred_component,
+                (pynini.closure(pynutil.delete('0')) + graph_all),
             )
         )
         graph_thousand_trillions = thousand_trillions @ graph_thousand_trillions_component
@@ -254,14 +270,7 @@ def __init__(self, deterministic: bool = True):
         ).optimize()
 
         # Sign and final formatting
-        optional_sign = pynini.closure(
-            pynutil.insert('negative: "true" ') + pynini.cross("-", ""), 0, 1
-        )
-        final_graph = (
-            optional_sign
-            + pynutil.insert('integer: "')
-            + graph_num
-            + pynutil.insert('"')
-        )
+        optional_sign = pynini.closure(pynutil.insert('negative: "true" ') + pynini.cross("-", ""), 0, 1)
+        final_graph = optional_sign + pynutil.insert('integer: "') + graph_num + pynutil.insert('"')
         final_graph = self.add_tokens(final_graph)
         self.fst = final_graph.optimize()
diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py
index 1d90903e1..1a9219574 100644
--- a/nemo_text_processing/text_normalization/normalize.py
+++ b/nemo_text_processing/text_normalization/normalize.py
@@ -176,7 +176,7 @@ def __init__(
             from nemo_text_processing.text_normalization.ja.verbalizers.verbalize_final import VerbalizeFinalFst
         elif lang == 'ko':
             from nemo_text_processing.text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst
-            from nemo_text_processing.text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst
+            from nemo_text_processing.text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst
         else:
             raise NotImplementedError(f"Language {lang} has not been supported yet.")
@@ -768,7 +768,6 @@ def parse_args():
     parser.add_argument("--n_jobs", default=-2, type=int, help="The maximum number of concurrently running jobs")
     parser.add_argument("--batch_size", default=200, type=int, help="Number of examples for each process")
     parser.add_argument(
-
         "--max_number_of_permutations_per_split",
         default=729,
         type=int,
diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py
index 0885f19c0..fe6a9ff7e 100644
--- a/tools/text_processing_deployment/pynini_export.py
+++ b/tools/text_processing_deployment/pynini_export.py
@@ -317,12 +317,10 @@ def parse_args():
         from nemo_text_processing.text_normalization.ko.taggers.tokenize_and_classify import (
             ClassifyFst as TNClassifyFst,
         )
-        from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import (
-            VerbalizeFst as TNVerbalizeFst,
-        )
         from nemo_text_processing.text_normalization.ko.verbalizers.post_processing import (
             PostProcessingFst as TNPostProcessingFst,
         )
+        from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst
 
     output_dir = os.path.join(args.output_dir, f"{args.language}_{args.grammars}_{args.input_case}")
     export_grammars(
         output_dir=output_dir,

From 4c104f0792c281cfd27d583834dd071d1eb88c8a Mon Sep 17 00:00:00 2001
From: Jinwoo Bae
Date: Fri, 23 May 2025 20:11:55 -0700
Subject: [PATCH 3/6] Refactor Korean TN cardinal and postprocessing logic based on review feedback

Signed-off-by: Jinwoo Bae
---
 .../ko/data/number/teen.tsv                   | 10 -
 .../text_normalization/ko/data/number/ty.tsv  |  1 +
 .../text_normalization/ko/graph_utils.py      |  2 +-
 .../text_normalization/ko/taggers/cardinal.py | 180 ++++++++----------
 .../ko/taggers/tokenize_and_classify.py       |  3 -
 .../ko/verbalizers/post_processing.py         | 70 +------
 .../ko/verbalizers/verbalize.py               |  2 -
 .../test_cases_cardinal.txt                   | 31 ++-
 .../nemo_text_processing/ko/test_cardinal.py  |  3 +-
 .../ko/test_sparrowhawk_normalization.sh      | 85 ---------
 10 files changed, 120 insertions(+), 267 deletions(-)
 delete mode 100644 nemo_text_processing/text_normalization/ko/data/number/teen.tsv

diff --git a/nemo_text_processing/text_normalization/ko/data/number/teen.tsv b/nemo_text_processing/text_normalization/ko/data/number/teen.tsv
deleted file mode 100644
index 432fe5eb6..000000000
--- a/nemo_text_processing/text_normalization/ko/data/number/teen.tsv
+++ /dev/null
@@ -1,10 +0,0 @@
-10	십
-11	십일
-12	십이
-13	십삼
-14	십사
-15	십오
-16	십육
-17	십칠
-18	십팔
-19	십구
diff --git a/nemo_text_processing/text_normalization/ko/data/number/ty.tsv b/nemo_text_processing/text_normalization/ko/data/number/ty.tsv
index 02623c44c..3d7bb221d 100644
--- a/nemo_text_processing/text_normalization/ko/data/number/ty.tsv
+++ b/nemo_text_processing/text_normalization/ko/data/number/ty.tsv
@@ -1,3 +1,4 @@
+1	십
 2	이십
 3	삼십
 4	사십
diff --git a/nemo_text_processing/text_normalization/ko/graph_utils.py b/nemo_text_processing/text_normalization/ko/graph_utils.py
index a7ffdd2b1..9db51238f 100644
--- a/nemo_text_processing/text_normalization/ko/graph_utils.py
+++ b/nemo_text_processing/text_normalization/ko/graph_utils.py
@@ -23,7 +23,7 @@
 from pynini.export import export
 from pynini.lib import byte, pynutil, utf8
 
-from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels
+from nemo_text_processing.text_normalization.en.utils import load_labels
 from nemo_text_processing.utils.logging import logger
 
 NEMO_CHAR = utf8.VALID_UTF8_CHAR
diff --git a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py
index 51c82e213..32b53855f 100644
--- a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py
+++ b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py
@@ -29,25 +29,24 @@ def __init__(self, deterministic: bool = True):
         digit_except_one = pynini.difference(NEMO_DIGIT, "1")
         digit_except_zero_one = pynini.difference(digit_except_one, "0")
-
-        graph_digit_alt = digit_except_zero_one @ graph_digit
-        graph_ty = pynini.string_file(get_abs_path("data/number/ty.tsv"))
-        graph_teen = pynini.string_file(get_abs_path("data/number/teen.tsv"))
+
+        graph_digit_no_zero_one = digit_except_zero_one @ graph_digit
+        graph_ty = pynini.string_file(get_abs_path("data/number/ty.tsv"))
 
         # Compose all basic number forms
-        graph_all = (graph_ty + (graph_digit | pynutil.delete('0'))) | graph_teen | graph_digit
+        graph_1_to_99 = (graph_ty + (graph_digit | pynutil.delete('0'))) | graph_digit
 
         hundreds = NEMO_DIGIT**3
-        graph_hundred_component = (pynini.cross('1', '백') | (graph_digit_alt + pynutil.insert('백'))) + pynini.union(
-            pynini.closure(pynutil.delete('0')), (pynini.closure(pynutil.delete('0')) + graph_all)
+        graph_hundred_component = (pynini.cross('1', '백') | (graph_digit_no_zero_one + pynutil.insert('백'))) + pynini.union(
+            pynini.closure(pynutil.delete('0')), (pynini.closure(pynutil.delete('0')) + graph_1_to_99)
         )
         graph_hundred = hundreds @ graph_hundred_component
 
         thousands = NEMO_DIGIT**4
-        graph_thousand_component = (pynini.cross('1', '천') | (graph_digit_alt + pynutil.insert('천'))) + pynini.union(
+        graph_thousand_component = (pynini.cross('1', '천') | (graph_digit_no_zero_one + pynutil.insert('천'))) + pynini.union(
             pynini.closure(pynutil.delete('0')),
             graph_hundred_component,
-            (pynini.closure(pynutil.delete('0')) + graph_all),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
         )
         graph_thousand = thousands @ graph_thousand_component
 
@@ -56,36 +55,35 @@ def __init__(self, deterministic: bool = True):
             pynini.closure(pynutil.delete('0')),
             graph_thousand_component,
             (pynutil.delete('0') + graph_hundred_component),
-            (pynini.closure(pynutil.delete('0')) + graph_all),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
         )
         graph_ten_thousand = ten_thousands @ graph_ten_thousand_component
 
         hundred_thousands = NEMO_DIGIT**6
-
-        graph_hundred_thousand_component = ((NEMO_DIGIT**2 @ graph_all) + pynutil.insert('만')) + pynini.union(
+
+        graph_hundred_thousand_component = ((NEMO_DIGIT ** 2 @ graph_1_to_99) + pynutil.insert('만')) + pynini.union(
             pynini.closure(pynutil.delete('0')),
             graph_thousand_component,
             (pynutil.delete('0') + graph_hundred_component),
-            (pynini.closure(pynutil.delete('0')) + graph_all),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
         )
         graph_hundred_thousand = hundred_thousands @ graph_hundred_thousand_component
 
         millions = NEMO_DIGIT**7
-        graph_million_component = ((NEMO_DIGIT**3 @ graph_hundred_component) + pynutil.insert('만')) + pynini.union(
+        graph_million_component = ((graph_hundred) + pynutil.insert('만')) + pynini.union(
             pynini.closure(pynutil.delete('0')),
             graph_thousand_component,
             (pynutil.delete('0') + graph_hundred_component),
-            (pynini.closure(pynutil.delete('0')) + graph_all),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
         )
         graph_million = millions @ graph_million_component
 
         ten_millions = NEMO_DIGIT**8
-        graph_ten_million_component = (
-            (NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('만')
-        ) + pynini.union(
+        graph_ten_million_component = ((graph_thousand) + pynutil.insert('만')) + pynini.union(
             pynini.closure(pynutil.delete('0')),
             graph_thousand_component,
             (pynutil.delete('0') + graph_hundred_component),
-            (pynini.closure(pynutil.delete('0')) + graph_all),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
         )
         graph_ten_million = ten_millions @ graph_ten_million_component
 
@@ -98,12 +96,12 @@ def __init__(self, deterministic: bool = True):
             (pynutil.delete('000') + graph_ten_thousand_component),
             (pynutil.delete('0000') + graph_thousand_component),
             ((pynutil.delete('00000') + graph_hundred_component)),
-            (pynini.closure(pynutil.delete('0')) + graph_all),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
         )
         graph_hundred_million = hundred_millions @ graph_hundred_million_component
 
         thousand_millions = NEMO_DIGIT**10
-        graph_thousand_million_component = ((NEMO_DIGIT**2 @ graph_all) + pynutil.insert('억')) + pynini.union(
+        graph_thousand_million_component = ((NEMO_DIGIT**2 @ graph_1_to_99) + pynutil.insert('억')) + pynini.union(
             pynini.closure(pynutil.delete('0')),
             graph_ten_million_component,
             (pynutil.delete('0') + graph_million_component),
@@ -111,12 +109,12 @@ def __init__(self, deterministic: bool = True):
             (pynutil.delete('000') + graph_ten_thousand_component),
             (pynutil.delete('0000') + graph_thousand_component),
             ((pynutil.delete('00000') + graph_hundred_component)),
-            (pynini.closure(pynutil.delete('0')) + graph_all),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
         )
         graph_thousand_million = thousand_millions @ graph_thousand_million_component
 
         billions = NEMO_DIGIT**11
-        graph_billions_component = ((NEMO_DIGIT**3 @ graph_hundred_component) + pynutil.insert('억')) + pynini.union(
+        graph_billions_component = ((graph_hundred) + pynutil.insert('억')) + pynini.union(
             pynini.closure(pynutil.delete('0')),
             graph_ten_million_component,
             (pynutil.delete('0') + graph_million_component),
@@ -124,14 +122,12 @@ def __init__(self, deterministic: bool = True):
             (pynutil.delete('000') + graph_ten_thousand_component),
             (pynutil.delete('0000') + graph_thousand_component),
             ((pynutil.delete('00000') + graph_hundred_component)),
-            (pynini.closure(pynutil.delete('0')) + graph_all),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
         )
         graph_billions = billions @ graph_billions_component
 
         ten_billions = NEMO_DIGIT**12
-        graph_ten_billions_component = (
-            (NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('억')
-        ) + pynini.union(
+        graph_ten_billions_component = ((graph_thousand) + pynutil.insert('억')) + pynini.union(
             pynini.closure(pynutil.delete('0')),
             graph_ten_million_component,
             (pynutil.delete('0') + graph_million_component),
@@ -139,7 +135,7 @@ def __init__(self, deterministic: bool = True):
             (pynutil.delete('000') + graph_ten_thousand_component),
             (pynutil.delete('0000') + graph_thousand_component),
             ((pynutil.delete('00000') + graph_hundred_component)),
-            (pynini.closure(pynutil.delete('0')) + graph_all),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
         )
         graph_ten_billions = ten_billions @ graph_ten_billions_component
 
@@ -156,94 +152,82 @@ def __init__(self, deterministic: bool = True):
             pynutil.delete('0000000') + graph_ten_thousand_component,
             pynutil.delete('00000000') + graph_thousand_component,
             pynutil.delete('000000000') + graph_hundred_component,
-            (pynini.closure(pynutil.delete('0')) + graph_all),
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
         )
         graph_hundred_billions = hundred_billions @ graph_hundred_billions_component
-
+
         trillion = NEMO_DIGIT**14
-        graph_trillion_component = (
-            (NEMO_DIGIT**2 @ graph_all)
-            + pynutil.insert('조')
-            + pynini.union(
-                pynini.closure(pynutil.delete('0')),
-                graph_ten_billions_component,
-                pynutil.delete('0') + graph_billions_component,
-                pynutil.delete('00') + graph_thousand_million_component,
-                pynutil.delete('000') + graph_hundred_million_component,
-                pynutil.delete('0000') + graph_ten_million_component,
-                pynutil.delete('00000') + graph_million_component,
-                pynutil.delete('000000') + graph_hundred_thousand_component,
-                pynutil.delete('0000000') + graph_ten_thousand_component,
-                pynutil.delete('00000000') + graph_thousand_component,
-                pynutil.delete('000000000') + graph_hundred_component,
-                (pynini.closure(pynutil.delete('0')) + graph_all),
+        graph_trillion_component = ((NEMO_DIGIT**2 @ graph_1_to_99) + pynutil.insert('조') + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_ten_billions_component,
+            pynutil.delete('0') + graph_billions_component,
+            pynutil.delete('00') + graph_thousand_million_component,
+            pynutil.delete('000') + graph_hundred_million_component,
+            pynutil.delete('0000') + graph_ten_million_component,
+            pynutil.delete('00000') + graph_million_component,
+            pynutil.delete('000000') + graph_hundred_thousand_component,
+            pynutil.delete('0000000') + graph_ten_thousand_component,
+            pynutil.delete('00000000') + graph_thousand_component,
+            pynutil.delete('000000000') + graph_hundred_component,
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99)
             )
         )
         graph_trillions = trillion @ graph_trillion_component
 
         ten_trillions = NEMO_DIGIT**15
-        graph_ten_trillions_component = (
-            (NEMO_DIGIT**3 @ graph_hundred_component)
-            + pynutil.insert('조')
-            + pynini.union(
-                pynini.closure(pynutil.delete('0')),
-                graph_ten_billions_component,
-                pynutil.delete('0') + graph_billions_component,
-                pynutil.delete('00') + graph_thousand_million_component,
-                pynutil.delete('000') + graph_hundred_million_component,
-                pynutil.delete('0000') + graph_ten_million_component,
-                pynutil.delete('00000') + graph_million_component,
-                pynutil.delete('000000') + graph_hundred_thousand_component,
-                pynutil.delete('0000000') + graph_ten_thousand_component,
-                pynutil.delete('00000000') + graph_thousand_component,
-                pynutil.delete('000000000') + graph_hundred_component,
-                (pynini.closure(pynutil.delete('0')) + graph_all),
-            )
+        graph_ten_trillions_component = ((graph_hundred) + pynutil.insert('조') + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_ten_billions_component,
+            pynutil.delete('0') + graph_billions_component,
+            pynutil.delete('00') + graph_thousand_million_component,
+            pynutil.delete('000') + graph_hundred_million_component,
+            pynutil.delete('0000') + graph_ten_million_component,
+            pynutil.delete('00000') + graph_million_component,
+            pynutil.delete('000000') + graph_hundred_thousand_component,
+            pynutil.delete('0000000') + graph_ten_thousand_component,
+            pynutil.delete('00000000') + graph_thousand_component,
+            pynutil.delete('000000000') + graph_hundred_component,
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99)
+            )
         )
         graph_ten_trillions = ten_trillions @ graph_ten_trillions_component
 
         hundred_trillions = NEMO_DIGIT**16
-        graph_hundred_trillions_component = (
-            (NEMO_DIGIT**4 @ graph_thousand_component)
-            + pynutil.insert('조')
-            + pynini.union(
-                pynini.closure(pynutil.delete('0')),
-                graph_ten_billions_component,
-                pynutil.delete('0') + graph_billions_component,
-                pynutil.delete('00') + graph_thousand_million_component,
-                pynutil.delete('000') + graph_hundred_million_component,
-                pynutil.delete('0000') + graph_ten_million_component,
-                pynutil.delete('00000') + graph_million_component,
-                pynutil.delete('000000') + graph_hundred_thousand_component,
-                pynutil.delete('0000000') + graph_ten_thousand_component,
-                pynutil.delete('00000000') + graph_thousand_component,
-                pynutil.delete('000000000') + graph_hundred_component,
-                (pynini.closure(pynutil.delete('0')) + graph_all),
+        graph_hundred_trillions_component = ((graph_thousand) + pynutil.insert('조') + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_ten_billions_component,
+            pynutil.delete('0') + graph_billions_component,
+            pynutil.delete('00') + graph_thousand_million_component,
+            pynutil.delete('000') + graph_hundred_million_component,
+            pynutil.delete('0000') + graph_ten_million_component,
+            pynutil.delete('00000') + graph_million_component,
+            pynutil.delete('000000') + graph_hundred_thousand_component,
+            pynutil.delete('0000000') + graph_ten_thousand_component,
+            pynutil.delete('00000000') + graph_thousand_component,
+            pynutil.delete('000000000') + graph_hundred_component,
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99)
             )
         )
         graph_hundred_trillions = hundred_trillions @ graph_hundred_trillions_component
 
         thousand_trillions = NEMO_DIGIT**17
-        graph_thousand_trillions_component = (
-            graph_digit
-            + pynutil.insert('경')
-            + pynini.union(
-                pynini.closure(pynutil.delete('0')),
-                graph_hundred_trillions_component,
-                pynutil.delete('0') + graph_ten_trillions_component,
-                pynutil.delete('00') + graph_trillion_component,
-                pynutil.delete('000') + graph_hundred_billions_component,
-                pynutil.delete('0000') + graph_ten_billions_component,
-                pynutil.delete('00000') + graph_billions_component,
-                pynutil.delete('000000') + graph_thousand_million_component,
-                pynutil.delete('0000000') + graph_hundred_million_component,
-                pynutil.delete('00000000') + graph_ten_million_component,
-                pynutil.delete('000000000') + graph_million_component,
-                pynutil.delete('0000000000') + graph_hundred_thousand_component,
-                pynutil.delete('00000000000') + graph_ten_thousand_component,
-                pynutil.delete('000000000000') + graph_thousand_component,
-                pynutil.delete('0000000000000') + graph_hundred_component,
-                (pynini.closure(pynutil.delete('0')) + graph_all),
+        graph_thousand_trillions_component = (graph_digit + pynutil.insert('경') + pynini.union(
+            pynini.closure(pynutil.delete('0')),
+            graph_hundred_trillions_component,
+            pynutil.delete('0') + graph_ten_trillions_component,
+            pynutil.delete('00') + graph_trillion_component,
+            pynutil.delete('000') + graph_hundred_billions_component,
+            pynutil.delete('0000') + graph_ten_billions_component,
+            pynutil.delete('00000') + graph_billions_component,
+            pynutil.delete('000000') + graph_thousand_million_component,
+            pynutil.delete('0000000') + graph_hundred_million_component,
+            pynutil.delete('00000000') + graph_ten_million_component,
+            pynutil.delete('000000000') + graph_million_component,
+            pynutil.delete('0000000000') + graph_hundred_thousand_component,
+            pynutil.delete('00000000000') + graph_ten_thousand_component,
+            pynutil.delete('000000000000') + graph_thousand_component,
+            pynutil.delete('0000000000000') + graph_hundred_component,
+            (pynini.closure(pynutil.delete('0')) + graph_1_to_99)
            )
         )
         graph_thousand_trillions = thousand_trillions @ graph_thousand_trillions_component
@@ -265,7 +249,7 @@ def __init__(self, deterministic: bool = True):
             graph_ten_thousand,
             graph_thousand,
             graph_hundred,
-            graph_all,
+            graph_1_to_99,
             graph_zero,
         ).optimize()
 
diff --git a/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py
index 2b22da370..f9f868953 100644
--- a/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py
+++ b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py
@@ -18,10 +18,7 @@
 from pynini.lib import pynutil
 
 from nemo_text_processing.text_normalization.ko.graph_utils import (
-    NEMO_WHITE_SPACE,
     GraphFst,
-    delete_extra_space,
-    delete_space,
     generator_main,
 )
 
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py b/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py
index 09ec216c2..f5cc8298d 100644
--- a/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py
@@ -18,9 +18,7 @@
 import pynini
 
 from nemo_text_processing.text_normalization.en.graph_utils import (
-    NEMO_NOT_SPACE,
     NEMO_SIGMA,
-    delete_space,
     generator_main,
 )
 from nemo_text_processing.utils.logging import logger
@@ -41,73 +39,15 @@ def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
         far_file = None
         if cache_dir is not None and cache_dir != "None":
             os.makedirs(cache_dir, exist_ok=True)
-            far_file = os.path.join(cache_dir, "zh_tn_post_processing.far")
+            far_file = os.path.join(cache_dir, "ko_tn_post_processing.far")
         if not overwrite_cache and far_file and os.path.exists(far_file):
             self.fst = pynini.Far(far_file, mode="r")["post_process_graph"]
             logger.info(f'Post processing graph was restored from {far_file}.')
         else:
-            self.set_punct_dict()
-            self.fst = self.get_punct_postprocess_graph()
+            self.fst = self.get_postprocess_graph()
 
             if far_file:
                 generator_main(far_file, {"post_process_graph": self.fst})
-
-    def set_punct_dict(self):
-        self.punct_marks = {
-            "'": [
-                "'",
-                '´',
-                'ʹ',
-                'ʻ',
-                'ʼ',
-                'ʽ',
-                'ʾ',
-                'ˈ',
-                'ˊ',
-                'ˋ',
-                '˴',
-                'ʹ',
-                '΄',
-                '՚',
-                '՝',
-                'י',
-                '׳',
-                'ߴ',
-                'ߵ',
-                'ᑊ',
-                'ᛌ',
-                '᾽',
-                '᾿',
-                '`',
-                '´',
-                '῾',
-                '‘',
-                '’',
-                '‛',
-                '′',
-                '‵',
-                'ꞌ',
-                ''',
-                '`',
-                '𖽑',
-                '𖽒',
-            ],
-        }
-
-    def get_punct_postprocess_graph(self):
-        """
-        Returns graph to post process punctuation marks.
-
-        {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept.
-        By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks.
-        """
-
-        remove_space_around_single_quote = pynini.cdrewrite(
-            delete_space, NEMO_NOT_SPACE, NEMO_NOT_SPACE, pynini.closure(NEMO_SIGMA)
-        )
-        # this works if spaces in between (good)
-        # delete space between 2 NEMO_NOT_SPACE(left and right to the space) that are with in a content of NEMO_SIGMA
-
-        graph = remove_space_around_single_quote.optimize()
-
-        return graph
+
+    def get_postprocess_graph(self):
+        return pynini.cdrewrite(pynini.cross("", ""), "", "", pynini.closure(NEMO_SIGMA)).optimize()
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize.py
index 9753db347..8f38048f1 100644
--- a/nemo_text_processing/text_normalization/ko/verbalizers/verbalize.py
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import pynini
-
 from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst
 from nemo_text_processing.text_normalization.ko.verbalizers.cardinal import CardinalFst
 
diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt
index 25dd560d1..40187f74e 100644
--- a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt
+++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt
@@ -16,4 +16,33 @@
 9999999999999~구조구천구백구십구억구천구백구십구만구천구백구십구
 99999999999999~구십구조구천구백구십구억구천구백구십구만구천구백구십구
 999999999999999~구백구십구조구천구백구십구억구천구백구십구만구천구백구십구
-9999999999999999~구천구백구십구조구천구백구십구억구천구백구십구만구천구백구십구
\ No newline at end of file
+9999999999999999~구천구백구십구조구천구백구십구억구천구백구십구만구천구백구십구
+19~십구
+76~칠십육
+379~삼백칠십구
+850~팔백오십
+1004~천사
+8326~팔천삼백이십육
+10383~만삼백팔십삼
+34892~삼만사천팔백구십이
+573234~오십칠만삼천이백삼십사
+982010~구십팔만이천십
+2349023~이백삼십사만구천이십삼
+4303189~사백삼십만삼천백팔십구
+60321589~육천삼십이만천오백팔십구
+88234568~팔천팔백이십삼만사천오백육십팔
+792133923~칠억구천이백십삼만삼천구백이십삼
+187624689~일억팔천칠백육십이만사천육백팔십구
+2304050708~이십삼억사백오만칠백팔
+6436789729~육십사억삼천육백칠십팔만구천칠백이십구
+78234580257~칠백팔십이억삼천사백오십팔만이백오십칠
+987654321345~구천팔백칠십육억오천사백삼십이만천삼백사십오
+2345678901234~이조삼천사백오십육억칠천팔백구십만천이백삼십사
+35791357913579~삼십오조칠천구백십삼억오천칠백구십일만삼천오백칠십구
+470369258147036~사백칠십조삼천육백구십이억오천팔백십사만칠천삼십육
+5048258149517395~오천사십팔조이천오백팔십일억사천구백오십일만칠천삼백구십오
+67890123045607890~육경칠천팔백구십조천이백삼십억사천오백육십만칠천팔백구십
+-2~마이너스 이
+-93~마이너스 구십삼
+-90325~마이너스 구만삼백이십오
+-3234567~마이너스 삼백이십삼만사천오백육십칠
\ No newline at end of file
diff --git a/tests/nemo_text_processing/ko/test_cardinal.py b/tests/nemo_text_processing/ko/test_cardinal.py
index ed422e13e..763b7e607 100644
--- a/tests/nemo_text_processing/ko/test_cardinal.py
+++ b/tests/nemo_text_processing/ko/test_cardinal.py
@@ -15,10 +15,9 @@
 import pytest
 from parameterized import parameterized
 
-from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
 from nemo_text_processing.text_normalization.normalize import Normalizer
 
-from ..utils import CACHE_DIR, parse_test_case_file
+from ..utils import parse_test_case_file
 
 
 class TestCardinal:
diff --git a/tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh
index 9a50509cf..8c14c0336 100644
--- a/tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh
+++ b/tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh
@@ -31,91 +31,6 @@ testTNCardinal() {
   runtest $input
 }
 
-#testTNSpecialText() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_special_text.txt
-#  runtest $input
-#}
-
-#testTNDate() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_date.txt
-#  runtest $input
-#}
-
-#testTNDecimal() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_decimal.txt
-#  runtest $input
-#}
-
-#testTNRange() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_range.txt
-#  runtest $input
-#}
-
-#testTNSerial() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_serial.txt
-#  runtest $input
-#}
-
-#testTNRoman() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_roman.txt
-#  runtest $input
-#}
-
-#testTNElectronic() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_electronic.txt
-#  runtest $input
-#}
-
-#testTNFraction() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_fraction.txt
-#  runtest $input
-#}
-
-#testTNMoney() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_money.txt
-#  runtest $input
-#}
-
-#testTNOrdinal() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_ordinal.txt
-#  runtest $input
-#}
-
-#testTNTelephone() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_telephone.txt
-#  runtest $input
-#}
-
-#testTNTime() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_time.txt
-#  runtest $input
-#}
-
-#testTNMeasure() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_measure.txt
-#  runtest $input
-#}
-
-#testTNWhitelist() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_whitelist.txt
-#  runtest $input
-#}
-
-#testTNWord() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_word.txt
-#  runtest $input
-#}
-
-#testTNAddress() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_address.txt
-#  runtest $input
-#}
-
-#testTNMath() {
-#  input=$TEST_DIR/data_text_normalization/test_cases_math.txt
-#  runtest $input
-#}
-
 # Remove all command-line arguments
 shift $#
 

From 90513790bcdf47b1cd03dd6061cb5974ed0aa10c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 24 May 2025 03:36:29 +0000
Subject: [PATCH 4/6] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../text_normalization/ko/taggers/cardinal.py | 144 ++++++++++--------
 .../ko/taggers/tokenize_and_classify.py       |   6 +-
 .../ko/verbalizers/post_processing.py         |   7 +-
 3 files changed, 82 insertions(+), 75 deletions(-)

diff --git a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py
index 32b53855f..db530c931 100644
--- a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py
+++ b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py
@@ -29,21 +29,23 @@ def __init__(self, deterministic: bool = True):
         digit_except_one = pynini.difference(NEMO_DIGIT, "1")
         digit_except_zero_one = pynini.difference(digit_except_one, "0")
-
+
         graph_digit_no_zero_one = digit_except_zero_one @ graph_digit
-        graph_ty = pynini.string_file(get_abs_path("data/number/ty.tsv")) 
+        graph_ty = pynini.string_file(get_abs_path("data/number/ty.tsv"))
 
         # Compose all basic number forms
         graph_1_to_99 = (graph_ty + (graph_digit | pynutil.delete('0'))) | graph_digit
 
         hundreds = NEMO_DIGIT**3
-        graph_hundred_component = (pynini.cross('1', '백') | (graph_digit_no_zero_one + pynutil.insert('백'))) + pynini.union(
-            pynini.closure(pynutil.delete('0')), (pynini.closure(pynutil.delete('0')) + graph_1_to_99)
-        )
+        graph_hundred_component = (
+            pynini.cross('1', '백') | (graph_digit_no_zero_one + pynutil.insert('백'))
+        ) + pynini.union(pynini.closure(pynutil.delete('0')), (pynini.closure(pynutil.delete('0')) + graph_1_to_99))
         graph_hundred = hundreds @ graph_hundred_component
 
         thousands = NEMO_DIGIT**4
-        graph_thousand_component = (pynini.cross('1', '천') | (graph_digit_no_zero_one + pynutil.insert('천'))) + pynini.union(
+        graph_thousand_component = (
+            pynini.cross('1', '천') | (graph_digit_no_zero_one + pynutil.insert('천'))
+        ) + pynini.union(
             pynini.closure(pynutil.delete('0')),
             graph_hundred_component,
             (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
@@ -60,8 +62,8 @@ def __init__(self, deterministic: bool = True):
         graph_ten_thousand = ten_thousands @ graph_ten_thousand_component
 
         hundred_thousands = NEMO_DIGIT**6
-
-        graph_hundred_thousand_component = ((NEMO_DIGIT ** 2 @ graph_1_to_99) + pynutil.insert('만')) + pynini.union(
+
+        graph_hundred_thousand_component = ((NEMO_DIGIT**2 @ graph_1_to_99) + pynutil.insert('만')) + pynini.union(
@@ -157,77 +159,89 @@ def __init__(self, deterministic: bool = True):
         graph_hundred_billions = hundred_billions @ graph_hundred_billions_component
 
         trillion = NEMO_DIGIT**14
-        graph_trillion_component = ((NEMO_DIGIT**2 @ graph_1_to_99) + pynutil.insert('조') + pynini.union(
-            pynini.closure(pynutil.delete('0')),
-            graph_ten_billions_component,
-            pynutil.delete('0') + graph_billions_component,
-            pynutil.delete('00') + graph_thousand_million_component,
-            pynutil.delete('000') + graph_hundred_million_component,
-            pynutil.delete('0000') + graph_ten_million_component,
-            pynutil.delete('00000') + graph_million_component,
-            pynutil.delete('000000') + graph_hundred_thousand_component,
-            pynutil.delete('0000000') + graph_ten_thousand_component,
-            pynutil.delete('00000000') + graph_thousand_component,
-            pynutil.delete('000000000') + graph_hundred_component,
-            (pynini.closure(pynutil.delete('0')) + graph_1_to_99)
+        graph_trillion_component = (
+            (NEMO_DIGIT**2 @ graph_1_to_99)
+            + pynutil.insert('조')
+            + pynini.union(
+                pynini.closure(pynutil.delete('0')),
+                graph_ten_billions_component,
+                pynutil.delete('0') + graph_billions_component,
+                pynutil.delete('00') + graph_thousand_million_component,
+                pynutil.delete('000') + graph_hundred_million_component,
+                pynutil.delete('0000') + graph_ten_million_component,
+                pynutil.delete('00000') + graph_million_component,
+                pynutil.delete('000000') + graph_hundred_thousand_component,
+                pynutil.delete('0000000') + graph_ten_thousand_component,
+                pynutil.delete('00000000') + graph_thousand_component,
+                pynutil.delete('000000000') + graph_hundred_component,
+                (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
             )
         )
         graph_trillions = trillion @ graph_trillion_component
 
         ten_trillions = NEMO_DIGIT**15
-        graph_ten_trillions_component = ((graph_hundred) + pynutil.insert('조') + pynini.union(
-            pynini.closure(pynutil.delete('0')),
-            graph_ten_billions_component,
-            pynutil.delete('0') + graph_billions_component,
-            pynutil.delete('00') + graph_thousand_million_component,
-            pynutil.delete('000') + graph_hundred_million_component,
-            pynutil.delete('0000') + graph_ten_million_component,
-            pynutil.delete('00000') + graph_million_component,
-            pynutil.delete('000000') + graph_hundred_thousand_component,
-            pynutil.delete('0000000') + graph_ten_thousand_component,
-            pynutil.delete('00000000') + graph_thousand_component,
-            pynutil.delete('000000000') + graph_hundred_component,
-            (pynini.closure(pynutil.delete('0')) + graph_1_to_99)
-            )
+        graph_ten_trillions_component = (
+            (graph_hundred)
+            + pynutil.insert('조')
+            + pynini.union(
+                pynini.closure(pynutil.delete('0')),
+                graph_ten_billions_component,
+                pynutil.delete('0') + graph_billions_component,
+                pynutil.delete('00') + graph_thousand_million_component,
+                pynutil.delete('000') + graph_hundred_million_component,
+                pynutil.delete('0000') + graph_ten_million_component,
+                pynutil.delete('00000') + graph_million_component,
+                pynutil.delete('000000') + graph_hundred_thousand_component,
+                pynutil.delete('0000000') + graph_ten_thousand_component,
+                pynutil.delete('00000000') + graph_thousand_component,
+                pynutil.delete('000000000') + graph_hundred_component,
+                (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
graph_1_to_99) - ) + graph_ten_trillions_component = ( + (graph_hundred) + + pynutil.insert('조') + + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_billions_component, + pynutil.delete('0') + graph_billions_component, + pynutil.delete('00') + graph_thousand_million_component, + pynutil.delete('000') + graph_hundred_million_component, + pynutil.delete('0000') + graph_ten_million_component, + pynutil.delete('00000') + graph_million_component, + pynutil.delete('000000') + graph_hundred_thousand_component, + pynutil.delete('0000000') + graph_ten_thousand_component, + pynutil.delete('00000000') + graph_thousand_component, + pynutil.delete('000000000') + graph_hundred_component, + (pynini.closure(pynutil.delete('0')) + graph_1_to_99), + ) ) graph_ten_trillions = ten_trillions @ graph_ten_trillions_component hundred_trillions = NEMO_DIGIT**16 - graph_hundred_trillions_component = ((graph_thousand) + pynutil.insert('조') + pynini.union( - pynini.closure(pynutil.delete('0')), - graph_ten_billions_component, - pynutil.delete('0') + graph_billions_component, - pynutil.delete('00') + graph_thousand_million_component, - pynutil.delete('000') + graph_hundred_million_component, - pynutil.delete('0000') + graph_ten_million_component, - pynutil.delete('00000') + graph_million_component, - pynutil.delete('000000') + graph_hundred_thousand_component, - pynutil.delete('0000000') + graph_ten_thousand_component, - pynutil.delete('00000000') + graph_thousand_component, - pynutil.delete('000000000') + graph_hundred_component, - (pynini.closure(pynutil.delete('0')) + graph_1_to_99) + graph_hundred_trillions_component = ( + (graph_thousand) + + pynutil.insert('조') + + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_billions_component, + pynutil.delete('0') + graph_billions_component, + pynutil.delete('00') + graph_thousand_million_component, + pynutil.delete('000') + graph_hundred_million_component, + pynutil.delete('0000') + graph_ten_million_component, + pynutil.delete('00000') + graph_million_component, + pynutil.delete('000000') + graph_hundred_thousand_component, + pynutil.delete('0000000') + graph_ten_thousand_component, + pynutil.delete('00000000') + graph_thousand_component, + pynutil.delete('000000000') + graph_hundred_component, + (pynini.closure(pynutil.delete('0')) + graph_1_to_99), ) ) graph_hundred_trillions = hundred_trillions @ graph_hundred_trillions_component thousand_trillions = NEMO_DIGIT**17 - graph_thousand_trillions_component = (graph_digit + pynutil.insert('경') + pynini.union( - pynini.closure(pynutil.delete('0')), - graph_hundred_trillions_component, - pynutil.delete('0') + graph_ten_trillions_component, - pynutil.delete('00') + graph_trillion_component, - pynutil.delete('000') + graph_hundred_billions_component, - pynutil.delete('0000') + graph_ten_billions_component, - pynutil.delete('00000') + graph_billions_component, - pynutil.delete('000000') + graph_thousand_million_component, - pynutil.delete('0000000') + graph_hundred_million_component, - pynutil.delete('00000000') + graph_ten_million_component, - pynutil.delete('000000000') + graph_million_component, - pynutil.delete('0000000000') + graph_hundred_thousand_component, - pynutil.delete('00000000000') + graph_ten_thousand_component, - pynutil.delete('000000000000') + graph_thousand_component, - pynutil.delete('0000000000000') + graph_hundred_component, - (pynini.closure(pynutil.delete('0')) + graph_1_to_99) + graph_thousand_trillions_component = ( + graph_digit + + pynutil.insert('경') + + 
+                pynini.closure(pynutil.delete('0')),
+                graph_hundred_trillions_component,
+                pynutil.delete('0') + graph_ten_trillions_component,
+                pynutil.delete('00') + graph_trillion_component,
+                pynutil.delete('000') + graph_hundred_billions_component,
+                pynutil.delete('0000') + graph_ten_billions_component,
+                pynutil.delete('00000') + graph_billions_component,
+                pynutil.delete('000000') + graph_thousand_million_component,
+                pynutil.delete('0000000') + graph_hundred_million_component,
+                pynutil.delete('00000000') + graph_ten_million_component,
+                pynutil.delete('000000000') + graph_million_component,
+                pynutil.delete('0000000000') + graph_hundred_thousand_component,
+                pynutil.delete('00000000000') + graph_ten_thousand_component,
+                pynutil.delete('000000000000') + graph_thousand_component,
+                pynutil.delete('0000000000000') + graph_hundred_component,
+                (pynini.closure(pynutil.delete('0')) + graph_1_to_99),
             )
         )
         graph_thousand_trillions = thousand_trillions @ graph_thousand_trillions_component
diff --git a/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py
index f9f868953..0676446e5 100644
--- a/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py
+++ b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py
@@ -17,11 +17,7 @@
 import pynini
 from pynini.lib import pynutil
 
-from nemo_text_processing.text_normalization.ko.graph_utils import (
-    GraphFst,
-    generator_main,
-)
-
+from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst, generator_main
 from nemo_text_processing.text_normalization.ko.taggers.cardinal import CardinalFst
 from nemo_text_processing.utils.logging import logger
 
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py b/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py
index f5cc8298d..7ba146cff 100644
--- a/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py
@@ -17,10 +17,7 @@
 
 import pynini
 
-from nemo_text_processing.text_normalization.en.graph_utils import (
-    NEMO_SIGMA,
-    generator_main,
-)
+from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, generator_main
 from nemo_text_processing.utils.logging import logger
 
 
@@ -48,6 +45,6 @@ def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
 
             if far_file:
                 generator_main(far_file, {"post_process_graph": self.fst})
-
+
     def get_postprocess_graph(self):
         return pynini.cdrewrite(pynini.cross("", ""), "", "", pynini.closure(NEMO_SIGMA)).optimize()

From 54781489cd1e69ae83e387f85602c74efc9e05b8 Mon Sep 17 00:00:00 2001
From: Jinwoo Bae
Date: Sun, 25 May 2025 12:52:50 -0700
Subject: [PATCH 5/6] Add __init__.py to ko/data directory

Signed-off-by: Jinwoo Bae
---
 .../text_normalization/ko/data/__init__.py    | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 nemo_text_processing/text_normalization/ko/data/__init__.py

diff --git a/nemo_text_processing/text_normalization/ko/data/__init__.py b/nemo_text_processing/text_normalization/ko/data/__init__.py
new file mode 100644
index 000000000..341a77c5b
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/data/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. From 833c7b90c90a4e05bb54e1dc1e6fbec1b0efef4e Mon Sep 17 00:00:00 2001 From: Jinwoo Bae Date: Tue, 3 Jun 2025 10:49:57 -0700 Subject: [PATCH 6/6] Update KO_TN_CACHE to trigger Korean CI run Signed-off-by: Jinwoo Bae --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index c3339c7bc..253af49c2 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -28,7 +28,7 @@ pipeline { MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-22-25-0' - KO_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/05-21-25-0' + KO_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-03-25-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages {
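Usage note: once this series is applied, the Korean grammars can be exercised end to end through the same Normalizer entry point the Jenkins stages invoke above. A minimal sketch in Python — the cache directory below is illustrative, not the CI path, and building the grammars from scratch on the first run can take a while:

    from nemo_text_processing.text_normalization.normalize import Normalizer

    # Build the Korean TN grammars, or restore them from cache_dir
    # (the CI stages do the same thing with KO_TN_CACHE).
    normalizer = Normalizer(input_case='cased', lang='ko', cache_dir='/tmp/ko_tn_cache')

    # Per the test cases added in PATCH 3/6, '-90325' should verbalize
    # to '마이너스 구만삼백이십오'.
    print(normalizer.normalize('-90325'))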