Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
2 changes: 1 addition & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ pipeline {
HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0'
MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'
JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1'
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-03-25-1'
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-22-25-0'
DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
}
stages {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ar.graph_utils import NEMO_SPACE, GraphFst
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SPACE, GraphFst


class CardinalFst(GraphFst):
Expand All @@ -27,8 +27,8 @@ class CardinalFst(GraphFst):
tn_cardinal: cardinal FST for TN
"""

def __init__(self, tn_cardinal):
super().__init__(name="cardinal", kind="classify")
def __init__(self, tn_cardinal, project_input: bool = False):
super().__init__(name="cardinal", kind="classify", project_input=project_input)

self.graph = pynini.invert(tn_cardinal.cardinal_numbers).optimize()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,7 @@
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ar.graph_utils import (
NEMO_SPACE,
GraphFst,
delete_extra_space,
insert_space,
)
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SPACE, GraphFst, delete_extra_space


class DecimalFst(GraphFst):
Expand All @@ -32,8 +27,8 @@ class DecimalFst(GraphFst):
tn_decimal: Text normalization Decimal graph
"""

def __init__(self, tn_decimal):
super().__init__(name="decimal", kind="classify")
def __init__(self, tn_decimal, project_input: bool = False):
super().__init__(name="decimal", kind="classify", project_input=project_input)

optional_graph_negative = pynini.closure(
pynutil.insert("negative: ") + pynini.cross("سالب", '"true"') + delete_extra_space, 0, 1,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,8 @@
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ar.graph_utils import (
NEMO_NOT_QUOTE,
GraphFst,
delete_space,
delete_zero_or_one_space,
insert_space,
)
from nemo_text_processing.text_normalization.ar.utils import get_abs_path
from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_zero_or_one_space, insert_space


class FractionFst(GraphFst):
Expand All @@ -35,8 +29,8 @@ class FractionFst(GraphFst):

"""

def __init__(self, tn_cardinal: GraphFst):
super().__init__(name="fraction", kind="classify")
def __init__(self, tn_cardinal: GraphFst, project_input: bool = False):
super().__init__(name="fraction", kind="classify", project_input=project_input)

optional_graph_negative = pynini.closure(
pynutil.insert("negative: ") + pynini.cross("سالب", "\"true\" "), 0, 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ar.graph_utils import GraphFst, convert_space, delete_extra_space
from nemo_text_processing.text_normalization.ar.taggers.measure import unit_singular
from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space, delete_extra_space


class MeasureFst(GraphFst):
Expand All @@ -35,9 +35,9 @@ def __init__(
itn_cardinal_tagger: GraphFst,
itn_decimal_tagger: GraphFst,
itn_fraction_tagger: GraphFst,
deterministic: bool = True,
project_input: bool = False,
):
super().__init__(name="measure", kind="classify", deterministic=deterministic)
super().__init__(name="measure", kind="classify", project_input=project_input)

cardinal_graph = itn_cardinal_tagger.graph

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,15 @@
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ar.graph_utils import (
from nemo_text_processing.text_normalization.ar.taggers.money import ar_cur, maj_singular, min_plural, min_singular
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_DIGIT,
NEMO_SIGMA,
GraphFst,
convert_space,
delete_extra_space,
delete_space,
insert_space,
)
from nemo_text_processing.text_normalization.ar.taggers.money import ar_cur, maj_singular, min_plural, min_singular


class MoneyFst(GraphFst):
Expand All @@ -37,8 +36,8 @@ class MoneyFst(GraphFst):
itn_cardinal_tagger: ITN Cardinal Tagger
"""

def __init__(self, itn_cardinal_tagger: GraphFst, deterministic: bool = True):
super().__init__(name="money", kind="classify", deterministic=deterministic)
def __init__(self, itn_cardinal_tagger: GraphFst, project_input: bool = False):
super().__init__(name="money", kind="classify", project_input=project_input)

cardinal_graph = itn_cardinal_tagger.graph

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ar.graph_utils import GraphFst
from nemo_text_processing.text_normalization.en.graph_utils import GraphFst


class PunctuationFst(GraphFst):
Expand All @@ -23,8 +23,8 @@ class PunctuationFst(GraphFst):
e.g. a, -> tokens { name: "a" } tokens { name: "," }
"""

def __init__(self):
super().__init__(name="punctuation", kind="classify")
def __init__(self, project_input: bool = False):
super().__init__(name="punctuation", kind="classify", project_input=project_input)

s = "!#$%&\'()*+,-./:;<=>?@^_`{|}~"
punct = pynini.union(*s)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,15 @@
from nemo_text_processing.inverse_text_normalization.ar.taggers.money import MoneyFst
from nemo_text_processing.inverse_text_normalization.ar.taggers.punctuation import PunctuationFst
from nemo_text_processing.inverse_text_normalization.ar.taggers.word import WordFst
from nemo_text_processing.text_normalization.ar.graph_utils import (
from nemo_text_processing.text_normalization.ar.taggers.tokenize_and_classify import ClassifyFst as TNClassifyFst
from nemo_text_processing.text_normalization.en.graph_utils import (
INPUT_LOWER_CASED,
GraphFst,
delete_extra_space,
delete_space,
generate_far_filename,
generator_main,
)
from nemo_text_processing.text_normalization.ar.taggers.tokenize_and_classify import ClassifyFst as TNClassifyFst
from nemo_text_processing.text_normalization.en.graph_utils import INPUT_LOWER_CASED
from nemo_text_processing.utils.logging import logger


Expand All @@ -52,6 +53,7 @@ def __init__(
self,
cache_dir: str = None,
overwrite_cache: bool = False,
project_input: bool = False,
whitelist: str = None,
input_case: str = INPUT_LOWER_CASED,
):
Expand All @@ -60,33 +62,45 @@ def __init__(
far_file = None
if cache_dir is not None and cache_dir != "None":
os.makedirs(cache_dir, exist_ok=True)
far_file = os.path.join(cache_dir, f"ar_itn_{input_case}.far")
far_file = generate_far_filename(
language="ar",
mode="itn",
cache_dir=cache_dir,
operation="tokenize_and_classify",
project_input=project_input,
input_case=input_case,
whitelist_file=whitelist,
)
if not overwrite_cache and far_file and os.path.exists(far_file):
self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
logger.info(f"ClassifyFst.fst was restored from {far_file}.")
else:
logger.info(f"Creating ClassifyFst grammars.")
tn_classify = TNClassifyFst(
input_case='cased', deterministic=True, cache_dir=cache_dir, overwrite_cache=True
input_case='cased',
deterministic=True,
project_input=project_input,
cache_dir=cache_dir,
overwrite_cache=True,
)

cardinal = CardinalFst(tn_cardinal=tn_classify.cardinal)
cardinal = CardinalFst(tn_cardinal=tn_classify.cardinal, project_input=project_input)
cardinal_graph = cardinal.fst
decimal = DecimalFst(tn_decimal=tn_classify.decimal)
decimal = DecimalFst(tn_decimal=tn_classify.decimal, project_input=project_input)
decimal_graph = decimal.fst
fraction = FractionFst(tn_cardinal=tn_classify.cardinal)
fraction = FractionFst(tn_cardinal=tn_classify.cardinal, project_input=project_input)
fraction_graph = fraction.fst
money = MoneyFst(itn_cardinal_tagger=cardinal)
money = MoneyFst(itn_cardinal_tagger=cardinal, project_input=project_input)
money_graph = money.fst
measure = MeasureFst(
itn_cardinal_tagger=cardinal,
itn_decimal_tagger=decimal,
itn_fraction_tagger=fraction,
deterministic=True,
project_input=project_input,
)
measure_graph = measure.fst
word_graph = WordFst().fst
punct_graph = PunctuationFst().fst
word_graph = WordFst(project_input=project_input).fst
punct_graph = PunctuationFst(project_input=project_input).fst

classify = (
pynutil.add_weight(cardinal_graph, 1.1)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ar.graph_utils import NEMO_NOT_SPACE, GraphFst
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst


class WordFst(GraphFst):
Expand All @@ -23,7 +23,7 @@ class WordFst(GraphFst):
e.g. sleep -> tokens { name: "sleep" }
"""

def __init__(self):
super().__init__(name="word", kind="classify")
def __init__(self, project_input: bool = False):
super().__init__(name="word", kind="classify", project_input=project_input)
word = pynutil.insert("name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"")
self.fst = word.optimize()
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ class CardinalFst(GraphFst):
e.g. cardinal { integer: "23" negative: "-" } -> -23
"""

def __init__(self):
super().__init__(name="cardinal", kind="verbalize")
def __init__(self, project_input: bool = False):
super().__init__(name="cardinal", kind="verbalize", project_input=project_input)
optional_sign = pynini.closure(
pynutil.delete("negative:")
+ delete_space
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ar.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space


class DecimalFst(GraphFst):
Expand All @@ -24,8 +24,8 @@ class DecimalFst(GraphFst):
decimal { negative: "true" integer_part: "12" fractional_part: "5006" quantity: "billion" } -> -12.5006 billion
"""

def __init__(self):
super().__init__(name="decimal", kind="verbalize")
def __init__(self, project_input: bool = False):
super().__init__(name="decimal", kind="verbalize", project_input=project_input)
optionl_sign = pynini.closure(pynini.cross("negative: \"true\"", "-") + delete_space, 0, 1)
integer = (
pynutil.delete("integer_part:")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ar.graph_utils import (
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_NOT_QUOTE,
NEMO_SPACE,
GraphFst,
Expand All @@ -31,8 +31,8 @@ class FractionFst(GraphFst):
e.g. fraction { numerator: "8" denominator: "3" } -> "8/3"
"""

def __init__(self):
super().__init__(name="fraction", kind="verbalize")
def __init__(self, project_input: bool = False):
super().__init__(name="fraction", kind="verbalize", project_input=project_input)

optional_negative = pynutil.delete("negative: \"") + pynini.cross("True", "-") + pynutil.delete("\"")
optional_negative = pynini.closure(optional_negative + delete_space, 0, 1)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ar.graph_utils import NEMO_CHAR, GraphFst, delete_space
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, GraphFst, delete_space


class MeasureFst(GraphFst):
Expand All @@ -28,8 +28,8 @@ class MeasureFst(GraphFst):
cardinal: ITN Cardinal verbalizer
"""

def __init__(self, decimal: GraphFst, cardinal: GraphFst, deterministic: bool = True):
super().__init__(name="measure", kind="verbalize", deterministic=deterministic)
def __init__(self, decimal: GraphFst, cardinal: GraphFst, project_input: bool = False):
super().__init__(name="measure", kind="verbalize", project_input=project_input)
optional_sign = pynini.closure(pynini.cross("negative: \"true\"", "-"), 0, 1)
unit = (
pynutil.delete("units:")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ar.graph_utils import NEMO_CHAR, GraphFst, delete_space
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, GraphFst, delete_space


class MoneyFst(GraphFst):
Expand All @@ -27,8 +27,8 @@ class MoneyFst(GraphFst):
decimal: ITN Decimal verbalizer
"""

def __init__(self, decimal: GraphFst, deterministic: bool = True):
super().__init__(name="money", kind="verbalize", deterministic=deterministic)
def __init__(self, decimal: GraphFst, project_input: bool = False):
super().__init__(name="money", kind="verbalize", project_input=project_input)
unit = (
pynutil.delete("currency:")
+ delete_space
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from nemo_text_processing.inverse_text_normalization.ar.verbalizers.fraction import FractionFst
from nemo_text_processing.inverse_text_normalization.ar.verbalizers.measure import MeasureFst
from nemo_text_processing.inverse_text_normalization.ar.verbalizers.money import MoneyFst
from nemo_text_processing.text_normalization.ar.graph_utils import GraphFst
from nemo_text_processing.text_normalization.en.graph_utils import GraphFst


class VerbalizeFst(GraphFst):
Expand All @@ -28,17 +28,17 @@ class VerbalizeFst(GraphFst):
More details to deployment at NeMo/tools/text_processing_deployment.
"""

def __init__(self):
def __init__(self, project_input: bool = False):
super().__init__(name="verbalize", kind="verbalize")
cardinal = CardinalFst()
cardinal = CardinalFst(project_input=project_input)
cardinal_graph = cardinal.fst
decimal = DecimalFst()
decimal = DecimalFst(project_input=project_input)
decimal_graph = decimal.fst
fraction = FractionFst()
fraction = FractionFst(project_input=project_input)
fraction_graph = fraction.fst
money = MoneyFst(decimal, deterministic=True)
money = MoneyFst(decimal, project_input=project_input)
money_graph = money.fst
measure = MeasureFst(decimal=decimal, cardinal=cardinal, deterministic=True)
measure = MeasureFst(decimal=decimal, cardinal=cardinal, project_input=project_input)
measure_graph = measure.fst
graph = cardinal_graph | decimal_graph | fraction_graph | money_graph | measure_graph
self.fst = graph
Loading