Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 14 additions & 15 deletions nemo_text_processing/text_normalization/ar/taggers/money.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,14 +80,14 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
pynutil.insert("integer_part: \"") + ((NEMO_SIGMA - "1") @ cardinal_graph) + pynutil.insert("\"")
)

graph_integer_only = graph_maj_singular + insert_space + graph_integer_one
graph_integer_only |= graph_maj_plural + insert_space + graph_integer
currency_first = pynutil.insert(' morphosyntactic_features: "currency_first"')
# Currency-first tagging for exactly one major unit (e.g. $1 -> دولار واحد).
graph_integer_one_unit = graph_maj_singular + insert_space + graph_integer_one + currency_first

# For local currency ".ك"
# For local currency ".ك"
graph_integer_only_ar = graph_integer + insert_space + graph_ar_cur
# graph_decimal_ar = graph_decimal_final + insert_space + graph_ar_cur

graph = (graph_integer_only + optional_delete_fractional_zeros) | graph_integer_only_ar
graph = (graph_integer_one_unit + optional_delete_fractional_zeros) | graph_integer_only_ar

# remove trailing zeros of non zero number in the first 2 digits and fill up to 2 digits
# e.g. 2000 -> 20, 0200->02, 01 -> 01, 10 -> 10
Expand All @@ -112,9 +112,12 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):

preserve_order = pynutil.insert(" preserve_order: true")
integer_plus_maj = graph_integer + insert_space + pynutil.insert(curr_symbol) @ graph_maj_plural
integer_plus_maj |= graph_integer_one + insert_space + pynutil.insert(curr_symbol) @ graph_maj_singular
# non zero integer part
integer_plus_maj = (pynini.closure(NEMO_DIGIT) - "0") @ integer_plus_maj
integer_plus_maj_with_one = integer_plus_maj | (
graph_integer_one + insert_space + pynutil.insert(curr_symbol) @ graph_maj_singular
)
# Amount == 1 without fractional part uses graph_integer_one_unit / graph_one_prefix.
integer_plus_maj_no_minor = (pynini.closure(NEMO_DIGIT) - "0") @ integer_plus_maj
integer_plus_maj_with_minor = (pynini.closure(NEMO_DIGIT) - "0") @ integer_plus_maj_with_one

graph_fractional_one = two_digits_fractional_part @ pynini.cross("1", "")
graph_fractional_one = pynutil.insert("fractional_part: \"") + graph_fractional_one + pynutil.insert("\"")
Expand All @@ -141,11 +144,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
graph_fractional_up_to_ten + insert_space + pynutil.insert(curr_symbol) @ graph_min_plural
)

graph_with_no_minor_curr = integer_plus_maj
graph_with_no_minor_curr |= pynutil.add_weight(
integer_plus_maj,
weight=0.0001,
)
graph_with_no_minor_curr = integer_plus_maj_no_minor

graph_with_no_minor_curr = pynutil.delete(curr_symbol) + graph_with_no_minor_curr + preserve_order

Expand All @@ -154,9 +153,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
if graph_with_no_minor is None
else pynini.union(graph_with_no_minor, graph_with_no_minor_curr)
)
decimal_graph_with_minor_curr = integer_plus_maj + pynini.cross(".", " ") + fractional_plus_min
decimal_graph_with_minor_curr = integer_plus_maj_with_minor + pynini.cross(".", " ") + fractional_plus_min
decimal_graph_with_minor_curr |= pynutil.add_weight(
integer_plus_maj
integer_plus_maj_with_minor
+ pynini.cross(".", " ")
+ pynutil.insert("fractional_part: \"")
+ two_digits_fractional_part @ cardinal_graph
Expand Down
10 changes: 5 additions & 5 deletions nemo_text_processing/text_normalization/ar/verbalizers/money.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ class MoneyFst(GraphFst):
Finite state transducer for verbalizing money, e.g.
money { integer_part: "تسعة" currency_maj: "يورو" preserve_order: true} -> "تسعة يورو"
money { integer_part: "تسعة" currency_maj: "دولار" preserve_order: true} -> "تسعة دولار"
money { currency_maj: "دولار" integer_part: "واحد" morphosyntactic_features: "currency_first"} -> "دولار واحد"
money { integer_part: "خمسة" currency_maj: "دينار كويتي"} -> "خمسة دينار كويتي"

Args:
Expand All @@ -49,9 +50,10 @@ def __init__(self, deterministic: bool = True):

integer_part = pynutil.delete("integer_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
add_and = pynutil.insert(" و")
morph_currency_first = pynutil.delete(' morphosyntactic_features: "currency_first"')

# *** currency_maj
graph_integer = maj + keep_space + integer_part
# currency_maj before integer_part; disambiguated via morphosyntactic_features for Sparrowhawk.
graph_currency_first = maj + keep_space + integer_part + delete_space + morph_currency_first

# *** currency_maj + (***) (و) *** current_min
graph_integer_with_minor = (
Expand All @@ -65,12 +67,10 @@ def __init__(self, deterministic: bool = True):
+ pynini.closure(keep_space + min, 0, 1)
+ delete_preserve_order
)
# this graph fixes the word order from "dollar nine" (دولار تسعة) --> "nine dollar" (تسعة دولار)
graph_integer_no_minor = integer_part + keep_space + maj + delete_space + delete_preserve_order
# *** current_min
graph_minor = fractional_part + keep_space + delete_space + min + delete_preserve_order

graph = graph_integer | graph_integer_with_minor | graph_minor | graph_integer_no_minor
graph = graph_currency_first | graph_integer_with_minor | graph_minor | graph_integer_no_minor

delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
71 changes: 71 additions & 0 deletions tests/nemo_text_processing/ar/test_sparrowhawk_normalization.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#!/usr/bin/env bash
# Sparrowhawk text-normalization regression tests for Arabic (shUnit2 suite).
#
# This script relies on Bash-only features (`<<<` here-strings, `echo -e`),
# so it must be interpreted by bash rather than a strict POSIX /bin/sh.
#
# Positional arguments (both optional):
#   $1  Directory that the test runners `cd` into before invoking
#       `normalizer_main --config=sparrowhawk_configuration.ascii_proto`.
#   $2  Root directory holding `data_text_normalization/test_cases_*.txt`.
GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"}
TEST_DIR=${2:-"/workspace/tests/ar"}

# Shared driver behind runtest and runtest_swapped.
#
# Arguments:
#   $1  Path to a test-case file with one `<field>~<field>` pair per line.
#   $2  Field order of each line: "written_first" (written~spoken) or
#       "spoken_first" (spoken~written).
#
# For every line, the written form is piped through `normalizer_main` (run
# from GRAMMARS_DIR) and the last line of its output is compared against the
# expected spoken form with shUnit2's assertEquals.
_run_sparrowhawk_cases () {
  local input=$1
  local order=$2
  local testcase written spoken escaped_written denorm_pred

  echo "INPUT is $input"

  # Quote the directory and stop early if it is missing; otherwise every case
  # would run from the wrong directory and fail with a misleading diff.
  if ! cd "${GRAMMARS_DIR}"; then
    fail "cannot cd to GRAMMARS_DIR: ${GRAMMARS_DIR}"
    return 1
  fi

  # `|| [ -n "$testcase" ]` keeps the final case when the file does not end
  # with a newline (plain `read` returns non-zero for that last line).
  while IFS= read -r testcase || [ -n "$testcase" ]; do
    if [ "$order" = "spoken_first" ]; then
      IFS='~' read -r spoken written <<< "$testcase"
    else
      IFS='~' read -r written spoken <<< "$testcase"
    fi

    # Double every backslash before handing the text to the normalizer.
    # NOTE(review): presumably normalizer_main treats backslash as an escape
    # character in its input -- confirm.
    escaped_written=$(printf '%s' "$written" | sed 's/\\/\\\\/g')

    # Keep only the last line of combined stdout/stderr as the prediction and
    # turn UTF-8 non-breaking spaces (bytes C2 A0) into plain spaces.
    denorm_pred=$(echo "$escaped_written" | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g')

    # Strip leading and trailing whitespace from both sides before comparing.
    spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')"
    denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')"

    # The written form doubles as the assertion message.
    assertEquals "$written" "$spoken" "$denorm_pred"
  done < "$input"
}

# Run a test file whose lines are stored as input~expected (written~spoken).
#   $1  Path to the test-case file.
runtest () {
  _run_sparrowhawk_cases "$1" "written_first"
}

# For test files stored as expected~input (spoken~written).
#   $1  Path to the test-case file.
runtest_swapped () {
  _run_sparrowhawk_cases "$1" "spoken_first"
}

# shUnit2 test: cardinal cases, read by runtest as written~spoken lines.
# The path is quoted so a TEST_DIR containing spaces is passed as one argument.
testTNCardinal() {
  runtest "$TEST_DIR/data_text_normalization/test_cases_cardinal.txt"
}

# shUnit2 test: decimal cases, read by runtest as written~spoken lines.
# The path is quoted so a TEST_DIR containing spaces is passed as one argument.
testTNDecimal() {
  runtest "$TEST_DIR/data_text_normalization/test_cases_decimal.txt"
}

# shUnit2 test: fraction cases, read by runtest_swapped as spoken~written lines.
# The path is quoted so a TEST_DIR containing spaces is passed as one argument.
testTNFraction() {
  runtest_swapped "$TEST_DIR/data_text_normalization/test_cases_fraction.txt"
}

# shUnit2 test: measure cases, read by runtest_swapped as spoken~written lines.
# The path is quoted so a TEST_DIR containing spaces is passed as one argument.
testTNMeasure() {
  runtest_swapped "$TEST_DIR/data_text_normalization/test_cases_measure.txt"
}

# shUnit2 test: money cases, read by runtest as written~spoken lines.
# The path is quoted so a TEST_DIR containing spaces is passed as one argument.
testTNMoney() {
  runtest "$TEST_DIR/data_text_normalization/test_cases_money.txt"
}

# Remove all command-line arguments: $1 and $2 were already captured into
# GRAMMARS_DIR and TEST_DIR at the top of the script.
# NOTE(review): presumably needed so shUnit2 does not interpret the script's
# positional arguments as its own when it is sourced below -- confirm.
shift $#

# Load shUnit2 by sourcing it from its hard-coded /workspace location.
# NOTE(review): presumably sourcing is what discovers and runs the test*
# functions defined above, so this must stay the last statement -- confirm.
. /workspace/shunit2/shunit2
1 change: 1 addition & 0 deletions tools/text_processing_deployment/pynini_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,7 @@ def parse_args():
from nemo_text_processing.text_normalization.ar.taggers.tokenize_and_classify import (
ClassifyFst as TNClassifyFst,
)
from nemo_text_processing.text_normalization.ar.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst
elif args.language == 'it':
from nemo_text_processing.text_normalization.it.taggers.tokenize_and_classify import (
ClassifyFst as TNClassifyFst,
Expand Down
Loading