diff --git a/nemo_text_processing/text_normalization/ar/taggers/money.py b/nemo_text_processing/text_normalization/ar/taggers/money.py index 925fa348e..7762804d4 100644 --- a/nemo_text_processing/text_normalization/ar/taggers/money.py +++ b/nemo_text_processing/text_normalization/ar/taggers/money.py @@ -80,14 +80,14 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): pynutil.insert("integer_part: \"") + ((NEMO_SIGMA - "1") @ cardinal_graph) + pynutil.insert("\"") ) - graph_integer_only = graph_maj_singular + insert_space + graph_integer_one - graph_integer_only |= graph_maj_plural + insert_space + graph_integer + currency_first = pynutil.insert(' morphosyntactic_features: "currency_first"') + # Currency-first tagging for exactly one major unit (e.g. $1 -> دولار واحد). + graph_integer_one_unit = graph_maj_singular + insert_space + graph_integer_one + currency_first - # For local currency "9د.ك" + # For local currency "5د.ك" graph_integer_only_ar = graph_integer + insert_space + graph_ar_cur - # graph_decimal_ar = graph_decimal_final + insert_space + graph_ar_cur - graph = (graph_integer_only + optional_delete_fractional_zeros) | graph_integer_only_ar + graph = (graph_integer_one_unit + optional_delete_fractional_zeros) | graph_integer_only_ar # remove trailing zeros of non zero number in the first 2 digits and fill up to 2 digits # e.g. 
2000 -> 20, 0200->02, 01 -> 01, 10 -> 10 @@ -112,9 +112,12 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): preserve_order = pynutil.insert(" preserve_order: true") integer_plus_maj = graph_integer + insert_space + pynutil.insert(curr_symbol) @ graph_maj_plural - integer_plus_maj |= graph_integer_one + insert_space + pynutil.insert(curr_symbol) @ graph_maj_singular - # non zero integer part - integer_plus_maj = (pynini.closure(NEMO_DIGIT) - "0") @ integer_plus_maj + integer_plus_maj_with_one = integer_plus_maj | ( + graph_integer_one + insert_space + pynutil.insert(curr_symbol) @ graph_maj_singular + ) + # Amount == 1 without fractional part uses graph_integer_one_unit / graph_one_prefix. + integer_plus_maj_no_minor = (pynini.closure(NEMO_DIGIT) - "0") @ integer_plus_maj + integer_plus_maj_with_minor = (pynini.closure(NEMO_DIGIT) - "0") @ integer_plus_maj_with_one graph_fractional_one = two_digits_fractional_part @ pynini.cross("1", "") graph_fractional_one = pynutil.insert("fractional_part: \"") + graph_fractional_one + pynutil.insert("\"") @@ -141,11 +144,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): graph_fractional_up_to_ten + insert_space + pynutil.insert(curr_symbol) @ graph_min_plural ) - graph_with_no_minor_curr = integer_plus_maj - graph_with_no_minor_curr |= pynutil.add_weight( - integer_plus_maj, - weight=0.0001, - ) + graph_with_no_minor_curr = integer_plus_maj_no_minor graph_with_no_minor_curr = pynutil.delete(curr_symbol) + graph_with_no_minor_curr + preserve_order @@ -154,9 +153,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): if graph_with_no_minor is None else pynini.union(graph_with_no_minor, graph_with_no_minor_curr) ) - decimal_graph_with_minor_curr = integer_plus_maj + pynini.cross(".", " ") + fractional_plus_min + decimal_graph_with_minor_curr = integer_plus_maj_with_minor + pynini.cross(".", " ") + fractional_plus_min decimal_graph_with_minor_curr |= pynutil.add_weight( 
- integer_plus_maj + integer_plus_maj_with_minor + pynini.cross(".", " ") + pynutil.insert("fractional_part: \"") + two_digits_fractional_part @ cardinal_graph diff --git a/nemo_text_processing/text_normalization/ar/verbalizers/money.py b/nemo_text_processing/text_normalization/ar/verbalizers/money.py index 46da10742..9f5041b13 100644 --- a/nemo_text_processing/text_normalization/ar/verbalizers/money.py +++ b/nemo_text_processing/text_normalization/ar/verbalizers/money.py @@ -28,6 +28,7 @@ class MoneyFst(GraphFst): Finite state transducer for verbalizing money, e.g. money { integer_part: "تسعة" currency_maj: "يورو" preserve_order: true} -> "تسعة يورو" money { integer_part: "تسعة" currency_maj: "دولار" preserve_order: true} -> "تسعة دولار" + money { currency_maj: "دولار" integer_part: "واحد" morphosyntactic_features: "currency_first"} -> "دولار واحد" money { integer_part: "خمسة" currency_maj: "دينار كويتي"} -> "خمسة دينار كويتي" Args: @@ -49,9 +50,10 @@ def __init__(self, deterministic: bool = True): integer_part = pynutil.delete("integer_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") add_and = pynutil.insert(" و") + morph_currency_first = pynutil.delete(' morphosyntactic_features: "currency_first"') - # *** currency_maj - graph_integer = maj + keep_space + integer_part + # currency_maj before integer_part; disambiguated via morphosyntactic_features for Sparrowhawk. 
+ graph_currency_first = maj + keep_space + integer_part + delete_space + morph_currency_first # *** currency_maj + (***) (و) *** current_min graph_integer_with_minor = ( @@ -65,12 +67,10 @@ def __init__(self, deterministic: bool = True): + pynini.closure(keep_space + min, 0, 1) + delete_preserve_order ) - # this graph fix word order from dollar three (دولار تسعة)--> three dollar (تسعة دولار) graph_integer_no_minor = integer_part + keep_space + maj + delete_space + delete_preserve_order - # *** current_min graph_minor = fractional_part + keep_space + delete_space + min + delete_preserve_order - graph = graph_integer | graph_integer_with_minor | graph_minor | graph_integer_no_minor + graph = graph_currency_first | graph_integer_with_minor | graph_minor | graph_integer_no_minor delete_tokens = self.delete_tokens(graph) self.fst = delete_tokens.optimize() diff --git a/tests/nemo_text_processing/ar/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/ar/test_sparrowhawk_normalization.sh new file mode 100755 index 000000000..6998a6fbc --- /dev/null +++ b/tests/nemo_text_processing/ar/test_sparrowhawk_normalization.sh @@ -0,0 +1,74 @@ +#!/bin/bash +# bash is required: the here-strings and the echo -e calls below are not POSIX sh. +# Usage: test_sparrowhawk_normalization.sh [GRAMMARS_DIR] [TEST_DIR] +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +TEST_DIR=${2:-"/workspace/tests/ar"} + +# For test files stored as input~expected (written~spoken). +runtest () { + input=$1 + echo "INPUT is $input" + cd "${GRAMMARS_DIR}" + + while IFS= read -r testcase; do + IFS='~' read -r written spoken <<< "$testcase" + + escaped_written=$(printf '%s' "$written" | sed 's/\\/\\\\/g') + denorm_pred=$(echo "$escaped_written" | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') + + spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + + assertEquals "$written" "$spoken" "$denorm_pred" + done < "$input" +} + +# For test files stored as expected~input (spoken~written). 
+runtest_swapped () { + input=$1 + echo "INPUT is $input" + cd "${GRAMMARS_DIR}" + + while IFS= read -r testcase; do + IFS='~' read -r spoken written <<< "$testcase" + + escaped_written=$(printf '%s' "$written" | sed 's/\\/\\\\/g') + denorm_pred=$(echo "$escaped_written" | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') + + spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + + assertEquals "$written" "$spoken" "$denorm_pred" + done < "$input" +} + +testTNCardinal() { + input=$TEST_DIR/data_text_normalization/test_cases_cardinal.txt + runtest "$input" +} + +testTNDecimal() { + input=$TEST_DIR/data_text_normalization/test_cases_decimal.txt + runtest "$input" +} + +testTNFraction() { + input=$TEST_DIR/data_text_normalization/test_cases_fraction.txt + runtest_swapped "$input" +} + +testTNMeasure() { + input=$TEST_DIR/data_text_normalization/test_cases_measure.txt + runtest_swapped "$input" +} + +testTNMoney() { + input=$TEST_DIR/data_text_normalization/test_cases_money.txt + runtest "$input" +} + +# Remove all command-line arguments +shift $# + +# Load shUnit2 +. /workspace/shunit2/shunit2 diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 03705f2b6..73a4fc138 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -278,6 +278,7 @@ def parse_args(): from nemo_text_processing.text_normalization.ar.taggers.tokenize_and_classify import ( ClassifyFst as TNClassifyFst, ) + from nemo_text_processing.text_normalization.ar.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst elif args.language == 'it': from nemo_text_processing.text_normalization.it.taggers.tokenize_and_classify import ( ClassifyFst as TNClassifyFst,