From d0a46f46665a75c2e167ec39fd152bcc9bad9a62 Mon Sep 17 00:00:00 2001 From: miro Date: Mon, 4 Aug 2025 12:43:13 +0100 Subject: [PATCH 1/6] feat: port normalization from phoonnx --- ovos_dialog_normalizer_plugin/__init__.py | 287 +-------- ovos_dialog_normalizer_plugin/util.py | 718 ++++++++++++++++++++++ requirements.txt | 5 +- 3 files changed, 725 insertions(+), 285 deletions(-) create mode 100644 ovos_dialog_normalizer_plugin/util.py diff --git a/ovos_dialog_normalizer_plugin/__init__.py b/ovos_dialog_normalizer_plugin/__init__.py index 7c3447b..3a3e65e 100644 --- a/ovos_dialog_normalizer_plugin/__init__.py +++ b/ovos_dialog_normalizer_plugin/__init__.py @@ -1,11 +1,10 @@ -import re from typing import Optional, Tuple from ovos_bus_client.session import Session, SessionManager -from ovos_number_parser import pronounce_number from ovos_plugin_manager.templates.transformers import DialogTransformer from ovos_utils.log import LOG -from unicode_rbnf import RbnfEngine, FormatPurpose + +from ovos_dialog_normalizer_plugin.util import normalize class DialogNormalizerTransformer(DialogTransformer): @@ -14,244 +13,6 @@ class DialogNormalizerTransformer(DialogTransformer): - Handles common abbreviations - Supports multiple languages """ - CONTRACTIONS = { - "en": { - "I'd": "I would", - "I'll": "I will", - "I'm": "I am", - "I've": "I have", - "ain't": "is not", - "aren't": "are not", - "can't": "can not", - "could've": "could have", - "couldn't": "could not", - "didn't": "did not", - "doesn't": "does not", - "don't": "do not", - "gonna": "going to", - "gotta": "got to", - "hadn't": "had not", - "hasn't": "has not", - "haven't": "have not", - "he'd": "he would", - "he'll": "he will", - "he's": "he is", - "how'd": "how did", - "how'll": "how will", - "how's": "how is", - "isn't": "is not", - "it'd": "it would", - "it'll": "it will", - "it's": "it is", - "might've": "might have", - "mightn't": "might not", - "must've": "must have", - "mustn't": "must not", - "needn't": "need not", - "oughtn't": "ought not", - "shan't": "shall not", - "she'd": "she would", - "she'll": "she will", - "she's": "she is", - "should've": "should have", - "shouldn't": "should not", - "somebody's": "somebody is", - "someone'd": "someone would", - "someone'll": "someone will", - "someone's": "someone is", - "that'd": "that would", - "that'll": "that will", - "that's": "that is", - "there'd": "there would", - "there're": "there are", - "there's": "there is", - "they'd": "they would", - "they'll": "they will", - "they're": "they are", - "they've": "they have", - "wasn't": "was not", - "we'd": "we would", - "we'll": "we will", - "we're": "we are", - "we've": "we have", - "weren't": "were not", - "what'd": "what did", - "what'll": "what will", - "what're": "what are", - "what's": "what is", - "what've": "what have", - "whats": "what is", - "when'd": "when did", - "when's": "when is", - "where'd": "where did", - "where's": "where is", - "where've": "where have", - "who'd": "who would", - "who'd've": "who would have", - "who'll": "who will", - "who're": "who are", - "who's": "who is", - "who've": "who have", - "why'd": "why did", - "why're": "why are", - "why's": "why is", - "won't": "will not", - "won't've": "will not have", - "would've": "would have", - "wouldn't": "would not", - "wouldn't've": "would not have", - "y'ain't": "you are not", - "y'aint": "you are not", - "y'all": "you all", - "ya'll": "you all", - "you'd": "you would", - "you'd've": "you would have", - "you'll": "you will", - "you're": "you are", - "you've": "you have", - "I'm'a": "I am going to", - "I'm'o": "I am going to", - "I'll've": "I will have", - "I'd've": "I would have", - "Whatcha": "What are you", - "amn't": "am not", - "'cause": "because", - "can't've": "cannot have", - "couldn't've": "could not have", - "daren't": "dare not", - "daresn't": "dare not", - "dasn't": "dare not", - "everyone's": "everyone is", - "gimme": "give me", - "gon't": "go not", - "hadn't've": "had not have", - "he've": "he would have", - "he'll've": "he will have", - "he'd've": "he would have", - "here's": "here is", - "how're": "how are", - "how'd'y": "how do you do", - "howd'y": "how do you do", - "howdy": "how do you do", - "'tis": "it is", - "'twas": "it was", - "it'll've": "it will have", - "it'd've": "it would have", - "kinda": "kind of", - "let's": "let us", - "ma'am": "madam", - "may've": "may have", - "mayn't": "may not", - "mightn't've": "might not have", - "mustn't've": "must not have", - "needn't've": "need not have", - "ol'": "old", - "oughtn't've": "ought not have", - "sha'n't": "shall not", - "shan't": "shall not", - "shalln't": "shall not", - "shan't've": "shall not have", - "she'd've": "she would have", - "shouldn't've": "should not have", - "so've": "so have", - "so's": "so is", - "something's": "something is", - "that're": "that are", - "that'd've": "that would have", - "there'll": "there will", - "there'd've": "there would have", - "these're": "these are", - "they'll've": "they will have", - "they'd've": "they would have", - "this's": "this is", - "this'll": "this will", - "this'd": "this would", - "those're": "those are", - "to've": "to have", - "wanna": "want to", - "we'll've": "we will have", - "we'd've": "we would have", - "what'll've": "what will have", - "when've": "when have", - "where're": "where are", - "which's": "which is", - "who'll've": "who will have", - "why've": "why have", - "will've": "will have", - "y'all're": "you all are", - "y'all've": "you all have", - "y'all'd": "you all would", - "y'all'd've": "you all would have", - "you'll've": "you will have" - } - } - - TITLES = { - "en": { - "Dr.": "Doctor", - "Mr.": "Mister", - "Prof.": "Professor" - }, - "ca": { - "Dr.": "Doctor", - "Sr.": "Senyor", - "Sra.": "Senyora", - "Prof.": "Professor" - }, - "es": { - "Dr.": "Doctor", - "Sr.": "Señor", - "Sra.": "Señora", - "Prof.": "Profesor", - "D.": "Don", - "Dña.": "Doña" - }, - "pt": { - "Dr.": "Doutor", - "Sr.": "Senhor", - "Sra.": "Senhora", - "Prof.": "Professor", - "Drª.": "Doutora", - "Eng.": "Engenheiro", - "D.": "Dom", - "Dª": "Dona" - }, - "gl": { - "Dr.": "Doutor", - "Sr.": "Señor", - "Sra.": "Señora", - "Prof.": "Profesor", - "Srta.": "Señorita" - }, - "fr": { - "Dr.": "Docteur", - "M.": "Monsieur", - "Mme": "Madame", - "Mlle": "Mademoiselle", - "Prof.": "Professeur", - "Pr.": "Professeur" - }, - "it": { - "Dr.": "Dottore", - "Sig.": "Signore", - "Sig.ra": "Signora", - "Prof.": "Professore", - "Dott.ssa": "Dottoressa", - "Sig.na": "Signorina" - }, - "nl": { - "Dr.": "Dokter", - "Dhr.": "De Heer", - "Mevr.": "Mevrouw", - "Prof.": "Professor", - "Drs.": "Dokterandus", - "Ing.": "Ingenieur" - }, - "de": { - "Dr.": "Doktor", - "Prof.": "Professor" - } - } def __init__(self, name="ovos-dialog-normalizer-plugin", priority=5, config=None): super().__init__(name=name, priority=priority, config=config) @@ -260,51 +21,9 @@ def transform(self, dialog: str, context: Optional[dict] = None) -> Tuple[str, d """Normalize dialog text.""" context = context or {} sess = Session.deserialize(context["session"]) if "session" in context else SessionManager.get() - lang = sess.lang.split("-")[0] - original = dialog try: - rbnf_engine = RbnfEngine.for_language(lang) - except: # doesnt support lang - rbnf_engine = None - - # substitute ' €' by 'euros' and 'someword€' by 'someword euros' - dialog = re.sub(r"(\w+)\s*€", r"\1 euros", dialog) - - try: - # TODO - add language specific code here if needed - if lang == "gl": - # substitute ' ºC' by 'graos centígrados' and 'somewordºC' by 'someword graos centígrados' - dialog = re.sub(r"(\w+)\s*ºC", r"\1 graos centígrados", dialog) - - words = dialog.split() - for idx, word in enumerate(words): - - if word in self.CONTRACTIONS.get(lang, {}): - words[idx] = self.CONTRACTIONS[lang][word] - continue - - if word in self.TITLES.get(lang, {}): - words[idx] = self.TITLES[lang][word] - continue - - if word.isdigit(): - try: - words[idx] = pronounce_number(int(word), lang=sess.lang) - except Exception as e: - LOG.error(f"ovos-number-parser failed to pronounce number: {word} - ({e})") - - # NOTE: pronounce_digit may return the digit itself again for some languages (upstream bug) - # we recheck if isdigit() to handle this - if rbnf_engine and words[idx].isdigit(): - # fallback to unicode RBNF - try: - words[idx] = rbnf_engine.format_number(word, FormatPurpose.CARDINAL).text - except Exception as e: - LOG.error(f"unicode-rbnf failed to pronounce number: {word} - ({e})") - - dialog = " ".join(words) - + dialog = normalize(original, sess.lang) LOG.debug(f"normalized dialog: '{original}' -> '{dialog}'") except Exception as e: LOG.error(f"Failed to normalize dialog: {e}") diff --git a/ovos_dialog_normalizer_plugin/util.py b/ovos_dialog_normalizer_plugin/util.py new file mode 100644 index 0000000..324e33e --- /dev/null +++ b/ovos_dialog_normalizer_plugin/util.py @@ -0,0 +1,718 @@ +import datetime +import logging +import re +import string +from datetime import date + +from ovos_date_parser import nice_time, nice_date +from ovos_number_parser import pronounce_number, pronounce_fraction +from ovos_number_parser.util import is_numeric +from unicode_rbnf import RbnfEngine, FormatPurpose + +LOG = logging.getLogger("normalize") + +# A dictionary of common contractions and their expanded forms. +# This list is very comprehensive for English. +CONTRACTIONS = { + "en": { + "I'd": "I would", + "I'll": "I will", + "I'm": "I am", + "I've": "I have", + "ain't": "is not", + "aren't": "are not", + "can't": "can not", + "could've": "could have", + "couldn't": "could not", + "didn't": "did not", + "doesn't": "does not", + "don't": "do not", + "gonna": "going to", + "gotta": "got to", + "hadn't": "had not", + "hasn't": "has not", + "haven't": "have not", + "he'd": "he would", + "he'll": "he will", + "he's": "he is", + "how'd": "how did", + "how'll": "how will", + "how's": "how is", + "isn't": "is not", + "it'd": "it would", + "it'll": "it will", + "it's": "it is", + "might've": "might have", + "mightn't": "might not", + "must've": "must have", + "mustn't": "must not", + "needn't": "need not", + "oughtn't": "ought not", + "shan't": "shall not", + "she'd": "she would", + "she'll": "she will", + "she's": "she is", + "should've": "should have", + "shouldn't": "should not", + "somebody's": "somebody is", + "someone'd": "someone would", + "someone'll": "someone will", + "someone's": "someone is", + "that'd": "that would", + "that'll": "that will", + "that's": "that is", + "there'd": "there would", + "there're": "there are", + "there's": "there is", + "they'd": "they would", + "they'll": "they will", + "they're": "they are", + "they've": "they have", + "wasn't": "was not", + "we'd": "we would", + "we'll": "we will", + "we're": "we are", + "we've": "we have", + "weren't": "were not", + "what'd": "what did", + "what'll": "what will", + "what're": "what are", + "what's": "what is", + "what've": "what have", + "whats": "what is", + "when'd": "when did", + "when's": "when is", + "where'd": "where did", + "where's": "where is", + "where've": "where have", + "who'd": "who would", + "who'd've": "who would have", + "who'll": "who will", + "who're": "who are", + "who's": "who is", + "who've": "who have", + "why'd": "why did", + "why're": "why are", + "why's": "why is", + "won't": "will not", + "won't've": "will not have", + "would've": "would have", + "wouldn't": "would not", + "wouldn't've": "would not have", + "y'ain't": "you are not", + "y'aint": "you are not", + "y'all": "you all", + "ya'll": "you all", + "you'd": "you would", + "you'd've": "you would have", + "you'll": "you will", + "you're": "you are", + "you've": "you have", + "I'm'a": "I am going to", + "I'm'o": "I am going to", + "I'll've": "I will have", + "I'd've": "I would have", + "Whatcha": "What are you", + "amn't": "am not", + "'cause": "because", + "can't've": "cannot have", + "couldn't've": "could not have", + "daren't": "dare not", + "daresn't": "dare not", + "dasn't": "dare not", + "everyone's": "everyone is", + "gimme": "give me", + "gon't": "go not", + "hadn't've": "had not have", + "he've": "he would have", + "he'll've": "he will have", + "he'd've": "he would have", + "here's": "here is", + "how're": "how are", + "how'd'y": "how do you do", + "howd'y": "how do you do", + "howdy": "how do you do", + "'tis": "it is", + "'twas": "it was", + "it'll've": "it will have", + "it'd've": "it would have", + "kinda": "kind of", + "let's": "let us", + "ma'am": "madam", + "may've": "may have", + "mayn't": "may not", + "mightn't've": "might not have", + "mustn't've": "must not have", + "needn't've": "need not have", + "ol'": "old", + "oughtn't've": "ought not have", + "sha'n't": "shall not", + "shan't": "shall not", + "shalln't": "shall not", + "shan't've": "shall not have", + "she'd've": "she would have", + "shouldn't've": "should not have", + "so've": "so have", + "so's": "so is", + "something's": "something is", + "that're": "that are", + "that'd've": "that would have", + "there'll": "there will", + "there'd've": "there would have", + "these're": "these are", + "they'll've": "they will have", + "they'd've": "they would have", + "this's": "this is", + "this'll": "this will", + "this'd": "this would", + "those're": "those are", + "to've": "to have", + "wanna": "want to", + "we'll've": "we will have", + "we'd've": "we would have", + "what'll've": "what will have", + "when've": "when have", + "where're": "where are", + "which's": "which is", + "who'll've": "who will have", + "why've": "why have", + "will've": "will have", + "y'all're": "you all are", + "y'all've": "you all have", + "y'all'd": "you all would", + "y'all'd've": "you all would have", + "you'll've": "you will have" + } +} + +# Dictionaries for titles, units, and their full word equivalents. +TITLES = { + "en": { + "Dr.": "Doctor", + "Mr.": "Mister", + "Prof.": "Professor" + }, + "ca": { + "Dr.": "Doctor", + "Sr.": "Senyor", + "Sra.": "Senyora", + "Prof.": "Professor" + }, + "es": { + "Dr.": "Doctor", + "Sr.": "Señor", + "Sra.": "Señora", + "Prof.": "Profesor", + "D.": "Don", + "Dña.": "Doña" + }, + "pt": { + "Dr.": "Doutor", + "Sr.": "Senhor", + "Sra.": "Senhora", + "Prof.": "Professor", + "Drª.": "Doutora", + "Eng.": "Engenheiro", + "D.": "Dom", + "Dª": "Dona" + }, + "gl": { + "Dr.": "Doutor", + "Sr.": "Señor", + "Sra.": "Señora", + "Prof.": "Profesor", + "Srta.": "Señorita" + }, + "fr": { + "Dr.": "Docteur", + "M.": "Monsieur", + "Mme": "Madame", + "Mlle": "Mademoiselle", + "Prof.": "Professeur", + "Pr.": "Professeur" + }, + "it": { + "Dr.": "Dottore", + "Sig.": "Signore", + "Sig.ra": "Signora", + "Prof.": "Professore", + "Dott.ssa": "Dottoressa", + "Sig.na": "Signorina" + }, + "nl": { + "Dr.": "Dokter", + "Dhr.": "De Heer", + "Mevr.": "Mevrouw", + "Prof.": "Professor", + "Drs.": "Dokterandus", + "Ing.": "Ingenieur" + }, + "de": { + "Dr.": "Doktor", + "Prof.": "Professor" + } +} + +UNITS = { + "en": { + "€": "euros", + "%": "per cent", + "°C": "degrees celsius", + "°F": "degrees fahrenheit", + "°K": "degrees kelvin", + "°": "degrees", + "$": "dollars", + "£": "pounds", + "km": "kilometers", + "m": "meters", + "cm": "centimeters", + "mm": "millimeters", + "ft": "feet", + "in": "inches", + "yd": "yards", + "mi": "miles", + "kg": "kilograms", + "g": "grams", + "lb": "pounds", + "oz": "ounces", + "L": "liters", + "mL": "milliliters", + "gal": "gallons", + "qt": "quarts", + "pt": "pints", + "hr": "hours", + "min": "minutes", + "s": "seconds" + }, + "pt": { + "€": "euros", + "%": "por cento", + "°C": "graus celsius", + "°F": "graus fahrenheit", + "°K": "graus kelvin", + "°": "graus", + "$": "dólares", + "£": "libras", + "km": "quilômetros", + "m": "metros", + "cm": "centímetros", + "mm": "milímetros", + "kg": "quilogramas", + "g": "gramas", + "L": "litros", + "mL": "mililitros", + "h": "horas", + "min": "minutos", + "s": "segundos" + }, + "es": { + "€": "euros", + "%": "por ciento", + "°C": "grados celsius", + "°F": "grados fahrenheit", + "°K": "grados kelvin", + "°": "grados", + "$": "dólares", + "£": "libras", + "km": "kilómetros", + "m": "metros", + "cm": "centímetros", + "kg": "kilogramos", + "g": "gramos", + "L": "litros", + "mL": "millilitros" + }, + "fr": { + "€": "euros", + "%": "pour cent", + "°C": "degrés celsius", + "°F": "degrés fahrenheit", + "°K": "degrés kelvin", + "°": "degrés", + "$": "dollars", + "£": "livres", + "km": "kilomètres", + "m": "mètres", + "cm": "centimètres", + "kg": "kilogrammes", + "g": "grammes", + "L": "litres", + "mL": "millilitres" + }, + "de": { + "€": "Euro", + "%": "Prozent", + "°C": "Grad Celsius", + "°F": "Grad Fahrenheit", + "°K": "Grad Kelvin", + "°": "Grad", + "$": "Dollar", + "£": "Pfund", + "km": "Kilometer", + "m": "Meter", + "cm": "Zentimeter", + "kg": "Kilogramm", + "g": "Gramm", + "L": "Liter", + "mL": "Milliliter" + } +} + + +def _get_number_separators(full_lang: str) -> tuple[str, str]: + """ + Determines decimal and thousands separators based on language. + Defaults to '.' decimal and ',' thousands for most languages. + Special cases: + - 'pt', 'es', 'fr', 'de': ',' decimal and '.' thousands. + """ + lang_code = full_lang.split("-")[0] + decimal_separator = '.' + thousands_separator = ',' + if lang_code in ["pt", "es", "fr", "de"]: + decimal_separator = ',' + thousands_separator = '.' + return decimal_separator, thousands_separator + + +def _normalize_number_word(word: str, full_lang: str, rbnf_engine) -> str: + """ + Helper function to normalize a single word that is a number, handling + decimal and thousands separators based on locale. + """ + cleaned_word = word.rstrip(string.punctuation) + + # Handle fractions like '3/3' + if is_fraction(cleaned_word): + try: + return pronounce_fraction(cleaned_word, full_lang) + word[len(cleaned_word):] + except Exception as e: + LOG.error(f"ovos-number-parser failed to pronounce fraction: {word} - ({e})") + return word + + # Handle numbers with locale-specific separators + decimal_separator, thousands_separator = _get_number_separators(full_lang) + temp_cleaned_word = cleaned_word + + # Check if the word contains a thousands separator followed by digits and a decimal separator + # This is a specific check for formats like '123.456,78' + has_thousands_and_decimal = ( + thousands_separator in temp_cleaned_word and + decimal_separator in temp_cleaned_word and + temp_cleaned_word.index(thousands_separator) < temp_cleaned_word.index(decimal_separator) + ) + + if has_thousands_and_decimal: + temp_cleaned_word = temp_cleaned_word.replace(thousands_separator, "") + temp_cleaned_word = temp_cleaned_word.replace(decimal_separator, ".") + elif decimal_separator in temp_cleaned_word and is_numeric(temp_cleaned_word.replace(decimal_separator, ".", 1)): + # Handle cases like '1,2' -> '1.2' + temp_cleaned_word = temp_cleaned_word.replace(decimal_separator, ".") + elif thousands_separator in temp_cleaned_word and is_numeric(temp_cleaned_word.replace(thousands_separator, "", 1)): + # Handle cases like '1.234' -> '1234' + temp_cleaned_word = temp_cleaned_word.replace(thousands_separator, "") + + # Check if the word is a valid number after processing + if is_numeric(temp_cleaned_word): + try: + num = float(temp_cleaned_word) if "." in temp_cleaned_word else int(temp_cleaned_word) + return pronounce_number(num, lang=full_lang) + word[len(cleaned_word):] + except Exception as e: + LOG.error(f"ovos-number-parser failed to pronounce number: {word} - ({e})") + return word + + elif rbnf_engine and cleaned_word.isdigit(): + try: + pronounced_number = rbnf_engine.format_number(cleaned_word, FormatPurpose.CARDINAL).text + return pronounced_number + word[len(cleaned_word):] + except Exception as e: + LOG.error(f"unicode-rbnf failed to pronounce number: {word} - ({e})") + return word + + return word + + +# --- Date and Time Pronunciation --- +def pronounce_date(date_obj: date, full_lang: str) -> str: + """ + Pronounces a date object using ovos-date-parser. + """ + return nice_date(date_obj, full_lang) + + +def pronounce_time(time_string: str, full_lang: str) -> str: + """ + Pronounces a time string using ovos-date-parser. + Handles military time like "15h01" and converts it to a + datetime.time object before passing it to nice_time. + """ + try: + hours, mins = time_string.split("h") + time_obj = datetime.time(int(hours), int(mins)) + # Use nice_time from ovos-date-parser + return nice_time(time_obj, full_lang, speech=True, use_24hour=True, use_ampm=False) + except Exception as e: + LOG.warning(f"Failed to parse time string '{time_string}': {e}") + return time_string.replace("h", " ") + + +def _normalize_dates_and_times(text: str, full_lang: str, date_format: str = "DMY") -> str: + """ + Helper function to normalize dates and times using regular expressions. + This prepares the strings for pronunciation. + """ + lang_code = full_lang.split("-")[0] + # Pre-process with regex to handle English am/pm times + if lang_code == "en": + text = re.sub(r"(?i)(\d+)(am|pm)", r"\1 \2", text) + # Handle the pronunciation for TTS + text = text.replace("am", "A M").replace("pm", "P M") + + # Normalize times like "15h01" to words + time_pattern = re.compile(r"(\d{1,2})h(\d{2})", re.IGNORECASE) + + def replace_time(match): + time_str = match.group(0) + return pronounce_time(time_str, full_lang) + + text = time_pattern.sub(replace_time, text) + + # Find dates like "DD/MM/YYYY" or "YYYY/MM/DD" + date_pattern = re.compile(r"(\d{1,4})[/-](\d{1,2})[/-](\d{1,4})") + + match = date_pattern.search(text) + + if match: + # Get the three parts of the date string + part1_str, part2_str, part3_str = match.groups() + p1, p2, p3 = int(part1_str), int(part2_str), int(part3_str) + + # Initialize month, day, and year + month, day, year = None, None, None + + # Determine year first based on length (4 digits) + if len(part1_str) == 4: + year, rest_parts = p1, [p2, p3] + elif len(part3_str) == 4: + year, rest_parts = p3, [p1, p2] + else: + # If no 4-digit year, it's ambiguous, assume a 2-digit year. + # We'll assume the last part is the year based on common patterns. + year = p3 + # Expand 2-digit year to 4-digit year + if year < 100: + # Assume years 00-29 are 2000-2029, 30-99 are 1930-1999 + year = 2000 + year if year < 30 else 1900 + year + rest_parts = [p1, p2] + + # From the remaining parts, try to determine day and month + if day is None and any(p > 12 and len(str(p)) == 2 for p in rest_parts): + # If a two-digit number is > 12, it's a day + day_candidate = next((p for p in rest_parts if p > 12), None) + if day_candidate: + day = day_candidate + rest_parts.remove(day_candidate) + month = rest_parts[0] + + # Fallback to date_format if day/month are still ambiguous + if day is None or month is None: + if date_format.lower() == "mdy": + month, day = rest_parts[0], rest_parts[1] + else: # default to DD/MM/YY + day, month = rest_parts[0], rest_parts[1] + + try: + date_obj = date(year, month, day) + pronounced_date_str = pronounce_date(date_obj, full_lang) + text = text.replace(match.group(0), pronounced_date_str) + except (ValueError, IndexError) as e: + LOG.warning(f"Could not parse date from '{match.group(0)}': {e}") + + return text + + +def _normalize_word_hyphen_digit(text: str) -> str: + """ + Helper function to normalize words attached to digits with a hyphen, + such as 'sub-23' -> 'sub 23'. + """ + # Regex to find a word (\w+) followed by a hyphen and a digit (\d+) + pattern = re.compile(r"(\w+)-(\d+)") + text = pattern.sub(r"\1 \2", text) + return text + + +def _normalize_units(text: str, full_lang: str) -> str: + """ + Helper function to normalize units attached to numbers. + This function handles symbolic and alphanumeric units separately + to avoid issues with word boundaries. + """ + text = text.replace("º", "°") # these characters look the same... but... + lang_code = full_lang.split("-")[0] + if lang_code in UNITS: + # Determine number separators for the language + decimal_separator, thousands_separator = _get_number_separators(full_lang) + + # Separate units into symbolic and alphanumeric + symbolic_units = {k: v for k, v in UNITS[lang_code].items() if not k.isalnum()} + alphanumeric_units = {k: v for k, v in UNITS[lang_code].items() if k.isalnum()} + + # Create regex pattern for symbolic units and replace them first + sorted_symbolic = sorted(symbolic_units.keys(), key=len, reverse=True) + symbolic_pattern_str = "|".join(re.escape(unit) for unit in sorted_symbolic) + if symbolic_pattern_str: + # Pattern to match numbers with optional thousands and decimal separators + number_pattern_str = rf"(\d+[{re.escape(thousands_separator)}]?\d*[{re.escape(decimal_separator)}]?\d*)" + symbolic_pattern = re.compile(number_pattern_str + r"\s*(" + symbolic_pattern_str + r")", re.IGNORECASE) + + def replace_symbolic(match): + number = match.group(1) + # Remove thousands separator and replace decimal separator for parsing + if thousands_separator in number and decimal_separator in number: + number = number.replace(thousands_separator, "").replace(decimal_separator, ".") + elif decimal_separator != "." and decimal_separator in number: + number = number.replace(decimal_separator, ".") + unit_symbol = match.group(2) + unit_word = symbolic_units[unit_symbol] + try: + return f"{pronounce_number(float(number) if '.' in number else int(number), full_lang)} {unit_word}" + except Exception as e: + LOG.error(f"Failed to pronounce number with unit: {number}{unit_symbol} - ({e})") + return match.group(0) + + text = symbolic_pattern.sub(replace_symbolic, text) + + # Create regex pattern for alphanumeric units and replace them next + sorted_alphanumeric = sorted(alphanumeric_units.keys(), key=len, reverse=True) + alphanumeric_pattern_str = "|".join(re.escape(unit) for unit in sorted_alphanumeric) + if alphanumeric_pattern_str: + number_pattern_str = rf"(\d+[{re.escape(thousands_separator)}]?\d*[{re.escape(decimal_separator)}]?\d*)" + alphanumeric_pattern = re.compile(number_pattern_str + r"\s*(" + alphanumeric_pattern_str + r")\b", + re.IGNORECASE) + + def replace_alphanumeric(match): + number = match.group(1) + # Remove thousands separator and replace decimal separator for parsing + if thousands_separator in number and decimal_separator in number: + number = number.replace(thousands_separator, "").replace(decimal_separator, ".") + elif decimal_separator != "." and decimal_separator in number: + number = number.replace(decimal_separator, ".") + unit_symbol = match.group(2) + unit_word = alphanumeric_units[unit_symbol] + return f"{pronounce_number(float(number) if '.' in number else int(number), full_lang)} {unit_word}" + + text = alphanumeric_pattern.sub(replace_alphanumeric, text) + return text + + +def _normalize_word(word: str, full_lang: str, rbnf_engine) -> str: + """ + Helper function to normalize a single word. + """ + lang_code = full_lang.split("-")[0] + + if word in CONTRACTIONS.get(lang_code, {}): + return CONTRACTIONS[lang_code][word] + + if word in TITLES.get(lang_code, {}): + return TITLES[lang_code][word] + + # Delegate number parsing to the new helper function + normalized_number = _normalize_number_word(word, full_lang, rbnf_engine) + if normalized_number != word: + return normalized_number + + return word + + +def is_fraction(word: str) -> bool: + """Checks if a word is a fraction like '3/3'.""" + if "/" in word: + parts = word.split("/") + if len(parts) == 2: + n1, n2 = parts + return n1.isdigit() and n2.isdigit() + return False + + +def normalize(text: str, lang: str) -> str: + """ + Normalizes a text string by expanding contractions, titles, and pronouncing + numbers, units, and fractions. + """ + full_lang = lang + lang_code = full_lang.split("-")[0] + dialog = text + + # Step 1: Handle dates and times with ovos-date-parser + date_format = "MDY" if full_lang.lower() == "en-us" else "DMY" + dialog = _normalize_dates_and_times(dialog, full_lang, date_format) + + # Step 2: Normalize words with hyphens and digits + dialog = _normalize_word_hyphen_digit(dialog) + + # Step 3: Expand units attached to numbers + dialog = _normalize_units(dialog, full_lang) + + # Step 4: Normalize word-by-word + words = dialog.split() + rbnf_engine = None + try: + rbnf_engine = RbnfEngine.for_language(lang_code) + except (ValueError, KeyError) as e: + LOG.debug(f"RBNF engine not available for language '{lang_code}': {e}") + + normalized_words = [_normalize_word(word, full_lang, rbnf_engine) for word in words] + dialog = " ".join(normalized_words) + + return dialog + + +if __name__ == "__main__": + # --- Example usage for demonstration purposes --- + + # General normalization examples + print("General English example: " + normalize('I\'m Dr. Prof. 3/3 0.5% of 12345€, 5ft, and 10kg', 'en')) + print( + f"Word Salad Portuguese (Dr. Prof. 3/3 0,5% de 12345€, 5m, e 10kg): {normalize('Dr. Prof. 3/3 0,5% de 12345€, 5m, e 10kg', 'pt')}") + print( + f"Word Salad Portuguese (Dr. Prof. 3/3 0.5% de 12345€, 5m, e 10kg): {normalize('Dr. Prof. 3/3 0.5% de 12345€, 5m, e 10kg', 'pt')}") + + # Portuguese examples with comma decimal separator + print("\n--- Portuguese Decimal Separator Examples ---") + print( + f"Original: 'A coima aplicada é de 1,2 milhões de euros.' Normalized: '{normalize('A coima aplicada é de 1,2 milhões de euros.', 'pt')}'") + print( + f"Original: 'Agora, tem 1,88 metros e muito para contar.' Normalized: '{normalize('Agora, tem 1,88 metros e muito para contar.', 'pt')}'") + print( + f"Original: 'Ainda temos 1,7 milhões de pobres!' Normalized: '{normalize('Ainda temos 1,7 milhões de pobres!', 'pt')}'") + print(f"Original: 'O lucro foi de 123.456,78€.' Normalized: '{normalize('O lucro foi de 123.456,78€.', 'pt')}'") + print(f"Normalized: '{normalize('O lucro foi de 123.456,78€.', 'pt-PT')}'") + + # English dates and times + print("\n--- English Date & Time Examples ---") + print(f"English date (MDY format): {normalize('The date is 08/03/2025', 'en-US')}") + print(f"English ambiguous date (MDY assumed): {normalize('The report is due 15/05/2025', 'en-US')}") + print(f"English date with dashes: {normalize('The event is on 11-04-2025', 'en-US')}") + print(f"English AM/PM time: {normalize('The meeting is at 10am', 'en-US')}") + print(f"English military time: {normalize('The party is at 19h30', 'en-US')}") + print(f"English month name: {normalize('The report is due 15 May 2025', 'en-US')}") + + # Portuguese dates and times + print("\n--- Portuguese Date & Time Examples ---") + print(f"Portuguese date (A data é 03/08/2025): {normalize('A data é 03/08/2025', 'pt')}") + print( + f"Portuguese ambiguous date (O relatório é para 15/05/2025): {normalize('O relatório é para 15/05/2025', 'pt')}") + print( + f"Portuguese date with dashes (O evento é no dia 25-10-2024): {normalize('O evento é no dia 25-10-2024', 'pt')}") + print(f"Portuguese military time (O encontro é às 14h30): {normalize('O encontro é às 14h30', 'pt')}") + + # Other examples + print(f"\n--- Other Examples ---") + print(f"English fraction: {normalize('The fraction is 1/2', 'en')}") + print(f"English plural fraction: {normalize('There are 3/4 of a cup', 'en')}") + print(f"Spanish example with units: {normalize('The temperature is 25ºC', 'es')}") + print(f"Portuguese with punctuation: {normalize('12345€, 5m e 10kg', 'pt')}") + print( + f"Portuguese word-digit: {normalize('Esta temporada leva oito jogos ao serviço da equipa sub-23 leonina.', 'pt')}") diff --git a/requirements.txt b/requirements.txt index 726ac59..0bb67ef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,5 @@ ovos-plugin-manager -ovos-number-parser>=0.3.0 \ No newline at end of file +langcodes +ovos-number-parser>=0.4.0 +ovos-date-parser>=0.6.4a1 +unicode_rbnf \ No newline at end of file From d1d53a938da1ff526c441ef05e269d6657e7a7e1 Mon Sep 17 00:00:00 2001 From: "coderabbitai[bot]" <136622811+coderabbitai[bot]@users.noreply.github.com> Date: Mon, 4 Aug 2025 12:48:36 +0100 Subject: [PATCH 2/6] =?UTF-8?q?=F0=9F=93=9D=20Add=20docstrings=20to=20`pho?= =?UTF-8?q?onnx`=20(#3)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Docstrings generation was requested by @JarbasAl. * https://github.com/OpenVoiceOS/ovos-dialog-normalizer-plugin/pull/2#issuecomment-3150270624 The following files were modified: * `ovos_dialog_normalizer_plugin/__init__.py` * `ovos_dialog_normalizer_plugin/util.py` Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- ovos_dialog_normalizer_plugin/__init__.py | 14 ++- ovos_dialog_normalizer_plugin/util.py | 102 +++++++++++++++++----- 2 files changed, 94 insertions(+), 22 deletions(-) diff --git a/ovos_dialog_normalizer_plugin/__init__.py b/ovos_dialog_normalizer_plugin/__init__.py index 3a3e65e..817f086 100644 --- a/ovos_dialog_normalizer_plugin/__init__.py +++ b/ovos_dialog_normalizer_plugin/__init__.py @@ -15,10 +15,22 @@ class DialogNormalizerTransformer(DialogTransformer): """ def __init__(self, name="ovos-dialog-normalizer-plugin", priority=5, config=None): + """ + Initialize the dialog normalizer transformer with a name, priority, and optional configuration. + """ super().__init__(name=name, priority=priority, config=config) def transform(self, dialog: str, context: Optional[dict] = None) -> Tuple[str, dict]: - """Normalize dialog text.""" + """ + Normalizes the input dialog text according to the session's language settings. + + Parameters: + dialog (str): The dialog text to be normalized. + context (dict, optional): Optional context containing session information. + + Returns: + tuple: A tuple containing the normalized dialog string and the (unchanged) context dictionary. + """ context = context or {} sess = Session.deserialize(context["session"]) if "session" in context else SessionManager.get() original = dialog diff --git a/ovos_dialog_normalizer_plugin/util.py b/ovos_dialog_normalizer_plugin/util.py index 324e33e..f7bb3a7 100644 --- a/ovos_dialog_normalizer_plugin/util.py +++ b/ovos_dialog_normalizer_plugin/util.py @@ -361,10 +361,13 @@ def _get_number_separators(full_lang: str) -> tuple[str, str]: """ - Determines decimal and thousands separators based on language. - Defaults to '.' decimal and ',' thousands for most languages. - Special cases: - - 'pt', 'es', 'fr', 'de': ',' decimal and '.' thousands. + Return the decimal and thousands separators appropriate for the specified language. + + Parameters: + full_lang (str): The full language code (e.g., "en-US", "pt-BR"). + + Returns: + tuple[str, str]: A tuple containing the decimal separator and thousands separator for the language. """ lang_code = full_lang.split("-")[0] decimal_separator = '.' @@ -377,8 +380,9 @@ def _get_number_separators(full_lang: str) -> tuple[str, str]: def _normalize_number_word(word: str, full_lang: str, rbnf_engine) -> str: """ - Helper function to normalize a single word that is a number, handling - decimal and thousands separators based on locale. + Normalizes a word representing a number or fraction, converting it to its spoken form according to locale conventions. + + Handles locale-specific decimal and thousands separators, expands fractions, and uses available pronunciation engines to generate the spoken equivalent. If normalization fails, returns the original word. """ cleaned_word = word.rstrip(string.punctuation) @@ -435,16 +439,23 @@ def _normalize_number_word(word: str, full_lang: str, rbnf_engine) -> str: # --- Date and Time Pronunciation --- def pronounce_date(date_obj: date, full_lang: str) -> str: """ - Pronounces a date object using ovos-date-parser. + Return the spoken form of a date object in the specified language. + + Parameters: + date_obj (date): The date to be pronounced. + full_lang (str): The language code for pronunciation. + + Returns: + str: The spoken representation of the date. """ return nice_date(date_obj, full_lang) def pronounce_time(time_string: str, full_lang: str) -> str: """ - Pronounces a time string using ovos-date-parser. - Handles military time like "15h01" and converts it to a - datetime.time object before passing it to nice_time. + Convert a time string in "HHhMM" format to its spoken form in the specified language. + + If parsing fails, returns the input string with "h" replaced by a space. """ try: hours, mins = time_string.split("h") @@ -458,8 +469,17 @@ def pronounce_time(time_string: str, full_lang: str) -> str: def _normalize_dates_and_times(text: str, full_lang: str, date_format: str = "DMY") -> str: """ - Helper function to normalize dates and times using regular expressions. - This prepares the strings for pronunciation. + Normalizes dates and times in a text string, converting them to their spoken equivalents for the specified language. + + This function identifies and processes time expressions (e.g., "15h01") and date patterns (e.g., "DD/MM/YYYY", "YYYY/MM/DD") using regular expressions. It handles locale-specific formats, expands ambiguous years, and replaces recognized dates and times with their pronounced forms suitable for text-to-speech. For English, it also separates and expands "am"/"pm" time markers. + + Parameters: + text (str): The input text containing dates and times to normalize. + full_lang (str): The language code specifying the locale for normalization. + date_format (str, optional): The expected date format ("DMY" or "MDY"). Defaults to "DMY". + + Returns: + str: The text with dates and times replaced by their spoken equivalents. """ lang_code = full_lang.split("-")[0] # Pre-process with regex to handle English am/pm times @@ -472,6 +492,15 @@ def _normalize_dates_and_times(text: str, full_lang: str, date_format: str = "DM time_pattern = re.compile(r"(\d{1,2})h(\d{2})", re.IGNORECASE) def replace_time(match): + """ + Replaces a matched time string with its spoken equivalent in the specified language. + + Parameters: + match: A regex match object containing the time string to be pronounced. + + Returns: + A string with the time expressed in spoken form for the target language. + """ time_str = match.group(0) return pronounce_time(time_str, full_lang) @@ -533,8 +562,9 @@ def replace_time(match): def _normalize_word_hyphen_digit(text: str) -> str: """ - Helper function to normalize words attached to digits with a hyphen, - such as 'sub-23' -> 'sub 23'. + Replaces occurrences of a word followed by a hyphen and digits with the word and number separated by a space. + + For example, transforms 'sub-23' into 'sub 23'. """ # Regex to find a word (\w+) followed by a hyphen and a digit (\d+) pattern = re.compile(r"(\w+)-(\d+)") @@ -544,9 +574,12 @@ def _normalize_word_hyphen_digit(text: str) -> str: def _normalize_units(text: str, full_lang: str) -> str: """ - Helper function to normalize units attached to numbers. - This function handles symbolic and alphanumeric units separately - to avoid issues with word boundaries. + Expands and pronounces units attached to numbers in the text according to the specified language. + + This function detects numbers followed by unit symbols or abbreviations (e.g., "50kg", "100€"), converts the number to its spoken form, and replaces the unit with its full word equivalent based on language-specific mappings. Handles both symbolic (non-alphanumeric) and alphanumeric units, accounting for locale-specific decimal and thousands separators. + + Returns: + str: The text with numbers and units normalized to their spoken forms. """ text = text.replace("º", "°") # these characters look the same... but... lang_code = full_lang.split("-")[0] @@ -567,6 +600,11 @@ def _normalize_units(text: str, full_lang: str) -> str: symbolic_pattern = re.compile(number_pattern_str + r"\s*(" + symbolic_pattern_str + r")", re.IGNORECASE) def replace_symbolic(match): + """ + Replaces a matched symbolic unit expression with its spoken number and unit word equivalent. + + The function is intended for use as a regex replacement callback, converting patterns like "50%" or "1.5€" into their spoken forms (e.g., "fifty percent" or "one point five euros") according to the specified language. If pronunciation fails, returns the original matched string. + """ number = match.group(1) # Remove thousands separator and replace decimal separator for parsing if thousands_separator in number and decimal_separator in number: @@ -592,6 +630,15 @@ def replace_symbolic(match): re.IGNORECASE) def replace_alphanumeric(match): + """ + Replaces a matched alphanumeric unit expression with its spoken number and full unit name. + + Parameters: + match: A regex match object containing a number and an alphanumeric unit symbol. + + Returns: + A string with the number pronounced in the specified language followed by the expanded unit name. + """ number = match.group(1) # Remove thousands separator and replace decimal separator for parsing if thousands_separator in number and decimal_separator in number: @@ -608,7 +655,9 @@ def replace_alphanumeric(match): def _normalize_word(word: str, full_lang: str, rbnf_engine) -> str: """ - Helper function to normalize a single word. + Normalizes a single word by expanding contractions, titles, or pronouncing numbers and fractions. + + If the word matches a known contraction or title in the specified language, it is expanded to its full form. If the word represents a number or fraction, it is converted to its spoken equivalent. Returns the original word if no normalization applies. """ lang_code = full_lang.split("-")[0] @@ -627,7 +676,12 @@ def _normalize_word(word: str, full_lang: str, rbnf_engine) -> str: def is_fraction(word: str) -> bool: - """Checks if a word is a fraction like '3/3'.""" + """ + Determine if the input string represents a numeric fraction in the form 'n1/n2'. + + Returns: + bool: True if the string is a fraction with two integer components separated by '/', otherwise False. + """ if "/" in word: parts = word.split("/") if len(parts) == 2: @@ -638,8 +692,14 @@ def is_fraction(word: str) -> bool: def normalize(text: str, lang: str) -> str: """ - Normalizes a text string by expanding contractions, titles, and pronouncing - numbers, units, and fractions. + Normalize a text string for spoken output by expanding contractions, titles, numbers, units, fractions, dates, and times according to the specified language. + + Parameters: + text (str): The input text to normalize. + lang (str): The language code (e.g., "en-US", "pt-PT") used for locale-specific normalization. + + Returns: + str: The normalized text with contractions expanded, numbers and units pronounced, and dates and times converted to spoken form. """ full_lang = lang lang_code = full_lang.split("-")[0] From 01456654ce79e8efdc9750b4a20152cb3a1f6aa7 Mon Sep 17 00:00:00 2001 From: miro Date: Mon, 4 Aug 2025 13:10:29 +0100 Subject: [PATCH 3/6] move to .json files for easy localization --- .github/workflows/unit_tests.yml | 45 ++ .../locale/ca/titles.json | 6 + .../locale/de/titles.json | 4 + .../locale/de/units.json | 17 + .../locale/en/contractions.json | 168 +++++++ .../locale/en/titles.json | 5 + .../locale/en/units.json | 30 ++ .../locale/es/titles.json | 8 + .../locale/es/units.json | 17 + .../locale/fr/titles.json | 8 + .../locale/fr/units.json | 17 + .../locale/gl/titles.json | 7 + .../locale/it/titles.json | 8 + .../locale/nl/titles.json | 8 + .../locale/pt/titles.json | 10 + .../locale/pt/units.json | 21 + ovos_dialog_normalizer_plugin/util.py | 425 +++--------------- requirements.txt | 4 +- setup.py | 10 + tests/__init__.py | 1 + 20 files changed, 459 insertions(+), 360 deletions(-) create mode 100644 .github/workflows/unit_tests.yml create mode 100644 ovos_dialog_normalizer_plugin/locale/ca/titles.json create mode 100644 ovos_dialog_normalizer_plugin/locale/de/titles.json create mode 100644 ovos_dialog_normalizer_plugin/locale/de/units.json create mode 100644 ovos_dialog_normalizer_plugin/locale/en/contractions.json create mode 100644 ovos_dialog_normalizer_plugin/locale/en/titles.json create mode 100644 ovos_dialog_normalizer_plugin/locale/en/units.json create mode 100644 ovos_dialog_normalizer_plugin/locale/es/titles.json create mode 100644 ovos_dialog_normalizer_plugin/locale/es/units.json create mode 100644 ovos_dialog_normalizer_plugin/locale/fr/titles.json create mode 100644 ovos_dialog_normalizer_plugin/locale/fr/units.json create mode 100644 ovos_dialog_normalizer_plugin/locale/gl/titles.json create mode 100644 ovos_dialog_normalizer_plugin/locale/it/titles.json create mode 100644 ovos_dialog_normalizer_plugin/locale/nl/titles.json create mode 100644 ovos_dialog_normalizer_plugin/locale/pt/titles.json create mode 100644 ovos_dialog_normalizer_plugin/locale/pt/units.json create mode 100644 tests/__init__.py diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml new file mode 100644 index 0000000..291030a --- /dev/null +++ b/.github/workflows/unit_tests.yml @@ -0,0 +1,45 @@ +# This workflow will run unit tests + +name: Run Unit Tests +on: + pull_request: + branches: + - dev + paths-ignore: + - 'ovos_dialog_normalizer_plugin/version.py' + - '.github/**' + - '.gitignore' + - 'LICENSE' + - 'CHANGELOG.md' + - 'MANIFEST.in' + - 'README.md' + workflow_dispatch: + +jobs: + unit_tests: + strategy: + max-parallel: 3 + matrix: + python-version: [ "3.10", "3.11", "3.12" ] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install System Dependencies + run: | + sudo apt-get update + sudo apt install python3-dev + python -m pip install build wheel unicode-rbnf + - name: Install core repo + run: | + pip install -e . + - name: Install test dependencies + run: | + pip install pytest pytest-timeout pytest-cov + - name: Run unittests + run: | + pytest --cov=ovos_dialog_normalizer_plugin --cov-report xml tests + diff --git a/ovos_dialog_normalizer_plugin/locale/ca/titles.json b/ovos_dialog_normalizer_plugin/locale/ca/titles.json new file mode 100644 index 0000000..6beb307 --- /dev/null +++ b/ovos_dialog_normalizer_plugin/locale/ca/titles.json @@ -0,0 +1,6 @@ +{ + "Dr.": "Doctor", + "Sr.": "Senyor", + "Sra.": "Senyora", + "Prof.": "Professor" +} \ No newline at end of file diff --git a/ovos_dialog_normalizer_plugin/locale/de/titles.json b/ovos_dialog_normalizer_plugin/locale/de/titles.json new file mode 100644 index 0000000..770de9d --- /dev/null +++ b/ovos_dialog_normalizer_plugin/locale/de/titles.json @@ -0,0 +1,4 @@ +{ + "Dr.": "Doktor", + "Prof.": "Professor" +} \ No newline at end of file diff --git a/ovos_dialog_normalizer_plugin/locale/de/units.json b/ovos_dialog_normalizer_plugin/locale/de/units.json new file mode 100644 index 0000000..2234346 --- /dev/null +++ b/ovos_dialog_normalizer_plugin/locale/de/units.json @@ -0,0 +1,17 @@ +{ + "€": "Euro", + "%": "Prozent", + "°C": "Grad Celsius", + "°F": "Grad Fahrenheit", + "°K": "Grad Kelvin", + "°": "Grad", + "$": "Dollar", + "£": "Pfund", + "km": "Kilometer", + "m": "Meter", + "cm": "Zentimeter", + "kg": "Kilogramm", + "g": "Gramm", + "L": "Liter", + "mL": "Milliliter" +} \ No newline at end of file diff --git a/ovos_dialog_normalizer_plugin/locale/en/contractions.json b/ovos_dialog_normalizer_plugin/locale/en/contractions.json new file mode 100644 index 0000000..376c354 --- /dev/null +++ b/ovos_dialog_normalizer_plugin/locale/en/contractions.json @@ -0,0 +1,168 @@ +{ + "I'd": "I would", + "I'll": "I will", + "I'm": "I am", + "I've": "I have", + "ain't": "is not", + "aren't": "are not", + "can't": "can not", + "could've": "could have", + "couldn't": "could not", + "didn't": "did not", + "doesn't": "does not", + "don't": "do not", + "gonna": "going to", + "gotta": "got to", + "hadn't": "had not", + "hasn't": "has not", + "haven't": "have not", + "he'd": "he would", + "he'll": "he will", + "he's": "he is", + "how'd": "how did", + "how'll": "how will", + "how's": "how is", + "isn't": "is not", + "it'd": "it would", + "it'll": "it will", + "it's": "it is", + "might've": "might have", + "mightn't": "might not", + "must've": "must have", + "mustn't": "must not", + "needn't": "need not", + "oughtn't": "ought not", + "she'd": "she would", + "she'll": "she will", + "she's": "she is", + "should've": "should have", + "shouldn't": "should not", + "somebody's": "somebody is", + "someone'd": "someone would", + "someone'll": "someone will", + "someone's": "someone is", + "that'd": "that would", + "that'll": "that will", + "that's": "that is", + "there'd": "there would", + "there're": "there are", + "there's": "there is", + "they'd": "they would", + "they'll": "they will", + "they're": "they are", + "they've": "they have", + "wasn't": "was not", + "we'd": "we would", + "we'll": "we will", + "we're": "we are", + "we've": "we have", + "weren't": "were not", + "what'd": "what did", + "what'll": "what will", + "what're": "what are", + "what's": "what is", + "what've": "what have", + "whats": "what is", + "when'd": "when did", + "when's": "when is", + "where'd": "where did", + "where's": "where is", + "where've": "where have", + "who'd": "who would", + "who'd've": "who would have", + "who'll": "who will", + "who're": "who are", + "who's": "who is", + "who've": "who have", + "why'd": "why did", + "why're": "why are", + "why's": "why is", + "won't": "will not", + "won't've": "will not have", + "would've": "would have", + "wouldn't": "would not", + "wouldn't've": "would not have", + "y'ain't": "you are not", + "y'aint": "you are not", + "y'all": "you all", + "ya'll": "you all", + "you'd": "you would", + "you'd've": "you would have", + "you'll": "you will", + "you're": "you are", + "you've": "you have", + "I'm'a": "I am going to", + "I'm'o": "I am going to", + "I'll've": "I will have", + "I'd've": "I would have", + "Whatcha": "What are you", + "amn't": "am not", + "'cause": "because", + "can't've": "cannot have", + "couldn't've": "could not have", + "daren't": "dare not", + "daresn't": "dare not", + "dasn't": "dare not", + "everyone's": "everyone is", + "gimme": "give me", + "gon't": "go not", + "hadn't've": "had not have", + "he've": "he would have", + "he'll've": "he will have", + "he'd've": "he would have", + "here's": "here is", + "how're": "how are", + "how'd'y": "how do you do", + "howd'y": "how do you do", + "howdy": "how do you do", + "'tis": "it is", + "'twas": "it was", + "it'll've": "it will have", + "it'd've": "it would have", + "kinda": "kind of", + "let's": "let us", + "ma'am": "madam", + "may've": "may have", + "mayn't": "may not", + "mightn't've": "might not have", + "mustn't've": "must not have", + "needn't've": "need not have", + "ol'": "old", + "oughtn't've": "ought not have", + "sha'n't": "shall not", + "shan't": "shall not", + "shalln't": "shall not", + "shan't've": "shall not have", + "she'd've": "she would have", + "shouldn't've": "should not have", + "so've": "so have", + "so's": "so is", + "something's": "something is", + "that're": "that are", + "that'd've": "that would have", + "there'll": "there will", + "there'd've": "there would have", + "these're": "these are", + "they'll've": "they will have", + "they'd've": "they would have", + "this's": "this is", + "this'll": "this will", + "this'd": "this would", + "those're": "those are", + "to've": "to have", + "wanna": "want to", + "we'll've": "we will have", + "we'd've": "we would have", + "what'll've": "what will have", + "when've": "when have", + "where're": "where are", + "which's": "which is", + "who'll've": "who will have", + "why've": "why have", + "will've": "will have", + "y'all're": "you all are", + "y'all've": "you all have", + "y'all'd": "you all would", + "y'all'd've": "you all would have", + "you'll've": "you will have" +} \ No newline at end of file diff --git a/ovos_dialog_normalizer_plugin/locale/en/titles.json b/ovos_dialog_normalizer_plugin/locale/en/titles.json new file mode 100644 index 0000000..30666a9 --- /dev/null +++ b/ovos_dialog_normalizer_plugin/locale/en/titles.json @@ -0,0 +1,5 @@ +{ + "Dr.": "Doctor", + "Mr.": "Mister", + "Prof.": "Professor" +} \ No newline at end of file diff --git a/ovos_dialog_normalizer_plugin/locale/en/units.json b/ovos_dialog_normalizer_plugin/locale/en/units.json new file mode 100644 index 0000000..924489e --- /dev/null +++ b/ovos_dialog_normalizer_plugin/locale/en/units.json @@ -0,0 +1,30 @@ +{ + "€": "euros", + "%": "per cent", + "°C": "degrees celsius", + "°F": "degrees fahrenheit", + "°K": "degrees kelvin", + "°": "degrees", + "$": "dollars", + "£": "pounds", + "km": "kilometers", + "m": "meters", + "cm": "centimeters", + "mm": "millimeters", + "ft": "feet", + "in": "inches", + "yd": "yards", + "mi": "miles", + "kg": "kilograms", + "g": "grams", + "lb": "pounds", + "oz": "ounces", + "L": "liters", + "mL": "milliliters", + "gal": "gallons", + "qt": "quarts", + "pt": "pints", + "hr": "hours", + "min": "minutes", + "s": "seconds" +} \ No newline at end of file diff --git a/ovos_dialog_normalizer_plugin/locale/es/titles.json b/ovos_dialog_normalizer_plugin/locale/es/titles.json new file mode 100644 index 0000000..f7ba7c9 --- /dev/null +++ b/ovos_dialog_normalizer_plugin/locale/es/titles.json @@ -0,0 +1,8 @@ +{ + "Dr.": "Doctor", + "Sr.": "Señor", + "Sra.": "Señora", + "Prof.": "Profesor", + "D.": "Don", + "Dña.": "Doña" +} \ No newline at end of file diff --git a/ovos_dialog_normalizer_plugin/locale/es/units.json b/ovos_dialog_normalizer_plugin/locale/es/units.json new file mode 100644 index 0000000..3bafba3 --- /dev/null +++ b/ovos_dialog_normalizer_plugin/locale/es/units.json @@ -0,0 +1,17 @@ +{ + "€": "euros", + "%": "por ciento", + "°C": "grados celsius", + "°F": "grados fahrenheit", + "°K": "grados kelvin", + "°": "grados", + "$": "dólares", + "£": "libras", + "km": "kilómetros", + "m": "metros", + "cm": "centímetros", + "kg": "kilogramos", + "g": "gramos", + "L": "litros", + "mL": "millilitros" +} \ No newline at end of file diff --git a/ovos_dialog_normalizer_plugin/locale/fr/titles.json b/ovos_dialog_normalizer_plugin/locale/fr/titles.json new file mode 100644 index 0000000..3160db5 --- /dev/null +++ b/ovos_dialog_normalizer_plugin/locale/fr/titles.json @@ -0,0 +1,8 @@ +{ + "Dr.": "Docteur", + "M.": "Monsieur", + "Mme": "Madame", + "Mlle": "Mademoiselle", + "Prof.": "Professeur", + "Pr.": "Professeur" +} \ No newline at end of file diff --git a/ovos_dialog_normalizer_plugin/locale/fr/units.json b/ovos_dialog_normalizer_plugin/locale/fr/units.json new file mode 100644 index 0000000..b17482e --- /dev/null +++ b/ovos_dialog_normalizer_plugin/locale/fr/units.json @@ -0,0 +1,17 @@ +{ + "€": "euros", + "%": "pour cent", + "°C": "degrés celsius", + "°F": "degrés fahrenheit", + "°K": "degrés kelvin", + "°": "degrés", + "$": "dollars", + "£": "livres", + "km": "kilomètres", + "m": "mètres", + "cm": "centimètres", + "kg": "kilogrammes", + "g": "grammes", + "L": "litres", + "mL": "millilitres" +} \ No newline at end of file diff --git a/ovos_dialog_normalizer_plugin/locale/gl/titles.json b/ovos_dialog_normalizer_plugin/locale/gl/titles.json new file mode 100644 index 0000000..b5ac190 --- /dev/null +++ b/ovos_dialog_normalizer_plugin/locale/gl/titles.json @@ -0,0 +1,7 @@ +{ + "Dr.": "Doutor", + "Sr.": "Señor", + "Sra.": "Señora", + "Prof.": "Profesor", + "Srta.": "Señorita" +} \ No newline at end of file diff --git a/ovos_dialog_normalizer_plugin/locale/it/titles.json b/ovos_dialog_normalizer_plugin/locale/it/titles.json new file mode 100644 index 0000000..263f9b0 --- /dev/null +++ b/ovos_dialog_normalizer_plugin/locale/it/titles.json @@ -0,0 +1,8 @@ +{ + "Dr.": "Dottore", + "Sig.": "Signore", + "Sig.ra": "Signora", + "Prof.": "Professore", + "Dott.ssa": "Dottoressa", + "Sig.na": "Signorina" +} \ No newline at end of file diff --git a/ovos_dialog_normalizer_plugin/locale/nl/titles.json b/ovos_dialog_normalizer_plugin/locale/nl/titles.json new file mode 100644 index 0000000..dc10e29 --- /dev/null +++ b/ovos_dialog_normalizer_plugin/locale/nl/titles.json @@ -0,0 +1,8 @@ +{ + "Dr.": "Dokter", + "Dhr.": "De Heer", + "Mevr.": "Mevrouw", + "Prof.": "Professor", + "Drs.": "Dokterandus", + "Ing.": "Ingenieur" +} \ No newline at end of file diff --git a/ovos_dialog_normalizer_plugin/locale/pt/titles.json b/ovos_dialog_normalizer_plugin/locale/pt/titles.json new file mode 100644 index 0000000..38b76a3 --- /dev/null +++ b/ovos_dialog_normalizer_plugin/locale/pt/titles.json @@ -0,0 +1,10 @@ +{ + "Dr.": "Doutor", + "Sr.": "Senhor", + "Sra.": "Senhora", + "Prof.": "Professor", + "Drª.": "Doutora", + "Eng.": "Engenheiro", + "D.": "Dom", + "Dª": "Dona" +} \ No newline at end of file diff --git a/ovos_dialog_normalizer_plugin/locale/pt/units.json b/ovos_dialog_normalizer_plugin/locale/pt/units.json new file mode 100644 index 0000000..4424c79 --- /dev/null +++ b/ovos_dialog_normalizer_plugin/locale/pt/units.json @@ -0,0 +1,21 @@ +{ + "€": "euros", + "%": "por cento", + "°C": "graus celsius", + "°F": "graus fahrenheit", + "°K": "graus kelvin", + "°": "graus", + "$": "dólares", + "£": "libras", + "km": "quilômetros", + "m": "metros", + "cm": "centímetros", + "mm": "milímetros", + "kg": "quilogramas", + "g": "gramas", + "L": "litros", + "mL": "mililitros", + "h": "horas", + "min": "minutos", + "s": "segundos" +} \ No newline at end of file diff --git a/ovos_dialog_normalizer_plugin/util.py b/ovos_dialog_normalizer_plugin/util.py index f7bb3a7..195dfa8 100644 --- a/ovos_dialog_normalizer_plugin/util.py +++ b/ovos_dialog_normalizer_plugin/util.py @@ -1,5 +1,6 @@ import datetime -import logging +import json +import os import re import string from datetime import date @@ -7,356 +8,59 @@ from ovos_date_parser import nice_time, nice_date from ovos_number_parser import pronounce_number, pronounce_fraction from ovos_number_parser.util import is_numeric +from ovos_utils.log import LOG from unicode_rbnf import RbnfEngine, FormatPurpose -LOG = logging.getLogger("normalize") - -# A dictionary of common contractions and their expanded forms. -# This list is very comprehensive for English. -CONTRACTIONS = { - "en": { - "I'd": "I would", - "I'll": "I will", - "I'm": "I am", - "I've": "I have", - "ain't": "is not", - "aren't": "are not", - "can't": "can not", - "could've": "could have", - "couldn't": "could not", - "didn't": "did not", - "doesn't": "does not", - "don't": "do not", - "gonna": "going to", - "gotta": "got to", - "hadn't": "had not", - "hasn't": "has not", - "haven't": "have not", - "he'd": "he would", - "he'll": "he will", - "he's": "he is", - "how'd": "how did", - "how'll": "how will", - "how's": "how is", - "isn't": "is not", - "it'd": "it would", - "it'll": "it will", - "it's": "it is", - "might've": "might have", - "mightn't": "might not", - "must've": "must have", - "mustn't": "must not", - "needn't": "need not", - "oughtn't": "ought not", - "shan't": "shall not", - "she'd": "she would", - "she'll": "she will", - "she's": "she is", - "should've": "should have", - "shouldn't": "should not", - "somebody's": "somebody is", - "someone'd": "someone would", - "someone'll": "someone will", - "someone's": "someone is", - "that'd": "that would", - "that'll": "that will", - "that's": "that is", - "there'd": "there would", - "there're": "there are", - "there's": "there is", - "they'd": "they would", - "they'll": "they will", - "they're": "they are", - "they've": "they have", - "wasn't": "was not", - "we'd": "we would", - "we'll": "we will", - "we're": "we are", - "we've": "we have", - "weren't": "were not", - "what'd": "what did", - "what'll": "what will", - "what're": "what are", - "what's": "what is", - "what've": "what have", - "whats": "what is", - "when'd": "when did", - "when's": "when is", - "where'd": "where did", - "where's": "where is", - "where've": "where have", - "who'd": "who would", - "who'd've": "who would have", - "who'll": "who will", - "who're": "who are", - "who's": "who is", - "who've": "who have", - "why'd": "why did", - "why're": "why are", - "why's": "why is", - "won't": "will not", - "won't've": "will not have", - "would've": "would have", - "wouldn't": "would not", - "wouldn't've": "would not have", - "y'ain't": "you are not", - "y'aint": "you are not", - "y'all": "you all", - "ya'll": "you all", - "you'd": "you would", - "you'd've": "you would have", - "you'll": "you will", - "you're": "you are", - "you've": "you have", - "I'm'a": "I am going to", - "I'm'o": "I am going to", - "I'll've": "I will have", - "I'd've": "I would have", - "Whatcha": "What are you", - "amn't": "am not", - "'cause": "because", - "can't've": "cannot have", - "couldn't've": "could not have", - "daren't": "dare not", - "daresn't": "dare not", - "dasn't": "dare not", - "everyone's": "everyone is", - "gimme": "give me", - "gon't": "go not", - "hadn't've": "had not have", - "he've": "he would have", - "he'll've": "he will have", - "he'd've": "he would have", - "here's": "here is", - "how're": "how are", - "how'd'y": "how do you do", - "howd'y": "how do you do", - "howdy": "how do you do", - "'tis": "it is", - "'twas": "it was", - "it'll've": "it will have", - "it'd've": "it would have", - "kinda": "kind of", - "let's": "let us", - "ma'am": "madam", - "may've": "may have", - "mayn't": "may not", - "mightn't've": "might not have", - "mustn't've": "must not have", - "needn't've": "need not have", - "ol'": "old", - "oughtn't've": "ought not have", - "sha'n't": "shall not", - "shan't": "shall not", - "shalln't": "shall not", - "shan't've": "shall not have", - "she'd've": "she would have", - "shouldn't've": "should not have", - "so've": "so have", - "so's": "so is", - "something's": "something is", - "that're": "that are", - "that'd've": "that would have", - "there'll": "there will", - "there'd've": "there would have", - "these're": "these are", - "they'll've": "they will have", - "they'd've": "they would have", - "this's": "this is", - "this'll": "this will", - "this'd": "this would", - "those're": "those are", - "to've": "to have", - "wanna": "want to", - "we'll've": "we will have", - "we'd've": "we would have", - "what'll've": "what will have", - "when've": "when have", - "where're": "where are", - "which's": "which is", - "who'll've": "who will have", - "why've": "why have", - "will've": "will have", - "y'all're": "you all are", - "y'all've": "you all have", - "y'all'd": "you all would", - "y'all'd've": "you all would have", - "you'll've": "you will have" - } -} - -# Dictionaries for titles, units, and their full word equivalents. -TITLES = { - "en": { - "Dr.": "Doctor", - "Mr.": "Mister", - "Prof.": "Professor" - }, - "ca": { - "Dr.": "Doctor", - "Sr.": "Senyor", - "Sra.": "Senyora", - "Prof.": "Professor" - }, - "es": { - "Dr.": "Doctor", - "Sr.": "Señor", - "Sra.": "Señora", - "Prof.": "Profesor", - "D.": "Don", - "Dña.": "Doña" - }, - "pt": { - "Dr.": "Doutor", - "Sr.": "Senhor", - "Sra.": "Senhora", - "Prof.": "Professor", - "Drª.": "Doutora", - "Eng.": "Engenheiro", - "D.": "Dom", - "Dª": "Dona" - }, - "gl": { - "Dr.": "Doutor", - "Sr.": "Señor", - "Sra.": "Señora", - "Prof.": "Profesor", - "Srta.": "Señorita" - }, - "fr": { - "Dr.": "Docteur", - "M.": "Monsieur", - "Mme": "Madame", - "Mlle": "Mademoiselle", - "Prof.": "Professeur", - "Pr.": "Professeur" - }, - "it": { - "Dr.": "Dottore", - "Sig.": "Signore", - "Sig.ra": "Signora", - "Prof.": "Professore", - "Dott.ssa": "Dottoressa", - "Sig.na": "Signorina" - }, - "nl": { - "Dr.": "Dokter", - "Dhr.": "De Heer", - "Mevr.": "Mevrouw", - "Prof.": "Professor", - "Drs.": "Dokterandus", - "Ing.": "Ingenieur" - }, - "de": { - "Dr.": "Doktor", - "Prof.": "Professor" - } -} - -UNITS = { - "en": { - "€": "euros", - "%": "per cent", - "°C": "degrees celsius", - "°F": "degrees fahrenheit", - "°K": "degrees kelvin", - "°": "degrees", - "$": "dollars", - "£": "pounds", - "km": "kilometers", - "m": "meters", - "cm": "centimeters", - "mm": "millimeters", - "ft": "feet", - "in": "inches", - "yd": "yards", - "mi": "miles", - "kg": "kilograms", - "g": "grams", - "lb": "pounds", - "oz": "ounces", - "L": "liters", - "mL": "milliliters", - "gal": "gallons", - "qt": "quarts", - "pt": "pints", - "hr": "hours", - "min": "minutes", - "s": "seconds" - }, - "pt": { - "€": "euros", - "%": "por cento", - "°C": "graus celsius", - "°F": "graus fahrenheit", - "°K": "graus kelvin", - "°": "graus", - "$": "dólares", - "£": "libras", - "km": "quilômetros", - "m": "metros", - "cm": "centímetros", - "mm": "milímetros", - "kg": "quilogramas", - "g": "gramas", - "L": "litros", - "mL": "mililitros", - "h": "horas", - "min": "minutos", - "s": "segundos" - }, - "es": { - "€": "euros", - "%": "por ciento", - "°C": "grados celsius", - "°F": "grados fahrenheit", - "°K": "grados kelvin", - "°": "grados", - "$": "dólares", - "£": "libras", - "km": "kilómetros", - "m": "metros", - "cm": "centímetros", - "kg": "kilogramos", - "g": "gramos", - "L": "litros", - "mL": "millilitros" - }, - "fr": { - "€": "euros", - "%": "pour cent", - "°C": "degrés celsius", - "°F": "degrés fahrenheit", - "°K": "degrés kelvin", - "°": "degrés", - "$": "dollars", - "£": "livres", - "km": "kilomètres", - "m": "mètres", - "cm": "centimètres", - "kg": "kilogrammes", - "g": "grammes", - "L": "litres", - "mL": "millilitres" - }, - "de": { - "€": "Euro", - "%": "Prozent", - "°C": "Grad Celsius", - "°F": "Grad Fahrenheit", - "°K": "Grad Kelvin", - "°": "Grad", - "$": "Dollar", - "£": "Pfund", - "km": "Kilometer", - "m": "Meter", - "cm": "Zentimeter", - "kg": "Kilogramm", - "g": "Gramm", - "L": "Liter", - "mL": "Milliliter" - } -} +RESOURCES_DIR = os.path.join(os.path.dirname(__file__), "locale") + + +# --- Locale Data Management Class --- +class LocaleDataManager: + """ + A helper class to lazy-load and cache locale-specific data from JSON files. + The data is not hardcoded and will be loaded from a 'locale' directory + containing language-specific JSON files on first use. + """ + + def __init__(self): + """Initializes an empty cache for locale data.""" + self.cache = {} + + def _load_data(self, lang_code: str, file_name: str) -> dict: + """Loads a single JSON file and caches it.""" + file_path = os.path.join(RESOURCES_DIR, lang_code, f"{file_name}.json") + try: + with open(file_path, "r", encoding="utf-8") as f: + data = json.load(f) + self.cache.setdefault(lang_code, {})[file_name] = data + return data + except FileNotFoundError: + LOG.debug(f"Locale file not found: {file_path}. Using empty dictionary.") + self.cache.setdefault(lang_code, {})[file_name] = {} + return {} + except json.JSONDecodeError as e: + LOG.error(f"Error decoding JSON from {file_path}: {e}") + self.cache.setdefault(lang_code, {})[file_name] = {} + return {} + + def get_data(self, lang_code: str, file_name: str) -> dict: + """Retrieves data for a given language and file, using the cache.""" + if lang_code in self.cache and file_name in self.cache[lang_code]: + return self.cache[lang_code][file_name] + return self._load_data(lang_code, file_name) + + def get_contractions(self, lang_code: str) -> dict: + return self.get_data(lang_code, "contractions") + + def get_units(self, lang_code: str) -> dict: + return self.get_data(lang_code, "units") + + def get_titles(self, lang_code: str) -> dict: + return self.get_data(lang_code, "titles") + + +# Instantiate the manager to be used by the normalization functions +locale_data_manager = LocaleDataManager() def _get_number_separators(full_lang: str) -> tuple[str, str]: @@ -364,14 +68,15 @@ def _get_number_separators(full_lang: str) -> tuple[str, str]: Return the decimal and thousands separators appropriate for the specified language. Parameters: - full_lang (str): The full language code (e.g., "en-US", "pt-BR"). + full_lang (str): The full language code (e.g., "en-US", "pt-BR"). Returns: - tuple[str, str]: A tuple containing the decimal separator and thousands separator for the language. + tuple[str, str]: A tuple containing the decimal separator and thousands separator for the language. """ lang_code = full_lang.split("-")[0] decimal_separator = '.' thousands_separator = ',' + # TODO This logic can also be moved to a JSON file if lang_code in ["pt", "es", "fr", "de"]: decimal_separator = ',' thousands_separator = '.' @@ -583,13 +288,15 @@ def _normalize_units(text: str, full_lang: str) -> str: """ text = text.replace("º", "°") # these characters look the same... but... lang_code = full_lang.split("-")[0] - if lang_code in UNITS: + units_data = locale_data_manager.get_units(lang_code) + + if units_data: # Determine number separators for the language decimal_separator, thousands_separator = _get_number_separators(full_lang) # Separate units into symbolic and alphanumeric - symbolic_units = {k: v for k, v in UNITS[lang_code].items() if not k.isalnum()} - alphanumeric_units = {k: v for k, v in UNITS[lang_code].items() if k.isalnum()} + symbolic_units = {k: v for k, v in units_data.items() if not k.isalnum()} + alphanumeric_units = {k: v for k, v in units_data.items() if k.isalnum()} # Create regex pattern for symbolic units and replace them first sorted_symbolic = sorted(symbolic_units.keys(), key=len, reverse=True) @@ -660,12 +367,14 @@ def _normalize_word(word: str, full_lang: str, rbnf_engine) -> str: If the word matches a known contraction or title in the specified language, it is expanded to its full form. If the word represents a number or fraction, it is converted to its spoken equivalent. Returns the original word if no normalization applies. """ lang_code = full_lang.split("-")[0] + contractions = locale_data_manager.get_contractions(lang_code) + titles = locale_data_manager.get_titles(lang_code) - if word in CONTRACTIONS.get(lang_code, {}): - return CONTRACTIONS[lang_code][word] + if word in contractions: + return contractions[word] - if word in TITLES.get(lang_code, {}): - return TITLES[lang_code][word] + if word in titles: + return titles[word] # Delegate number parsing to the new helper function normalized_number = _normalize_number_word(word, full_lang, rbnf_engine) diff --git a/requirements.txt b/requirements.txt index 0bb67ef..7689f93 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ ovos-plugin-manager langcodes -ovos-number-parser>=0.4.0 -ovos-date-parser>=0.6.4a1 +ovos-number-parser>=0.4.0,<1.0.0 +ovos-date-parser>=0.6.4a1,<1.0.0 unicode_rbnf \ No newline at end of file diff --git a/setup.py b/setup.py index 04d5728..ff32270 100755 --- a/setup.py +++ b/setup.py @@ -48,6 +48,14 @@ def get_version(): ENTRY_POINT = 'ovos-dialog-normalizer-plugin=ovos_dialog_normalizer_plugin:DialogNormalizerTransformer' +def package_files(directory): + paths = [] + for (path, directories, filenames) in os.walk(directory): + for filename in filenames: + paths.append(os.path.join('..', path, filename)) + return paths + + setup( name='ovos-dialog-normalizer-plugin', @@ -58,6 +66,8 @@ def get_version(): author_email='jarbasai@mailfence.com', license='MIT', packages=['ovos_dialog_normalizer_plugin'], + include_package_data=True, + package_data={'': package_files('ovos_dialog_normalizer_plugin')}, zip_safe=True, keywords='ovos plugin utterance dialog TTS normalization', entry_points={ diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..aa02126 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +"""Test package for ovos_dialog_normalizer_plugin.""" \ No newline at end of file From 98a541b2edf41cd326c2f1e29aa997ed25697b41 Mon Sep 17 00:00:00 2001 From: JarbasAI <33701864+JarbasAl@users.noreply.github.com> Date: Mon, 4 Aug 2025 13:11:33 +0100 Subject: [PATCH 4/6] Update ovos_dialog_normalizer_plugin/util.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- ovos_dialog_normalizer_plugin/util.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/ovos_dialog_normalizer_plugin/util.py b/ovos_dialog_normalizer_plugin/util.py index 195dfa8..9304cc4 100644 --- a/ovos_dialog_normalizer_plugin/util.py +++ b/ovos_dialog_normalizer_plugin/util.py @@ -341,10 +341,10 @@ def replace_alphanumeric(match): Replaces a matched alphanumeric unit expression with its spoken number and full unit name. Parameters: - match: A regex match object containing a number and an alphanumeric unit symbol. + match: A regex match object containing a number and an alphanumeric unit symbol. Returns: - A string with the number pronounced in the specified language followed by the expanded unit name. + A string with the number pronounced in the specified language followed by the expanded unit name. """ number = match.group(1) # Remove thousands separator and replace decimal separator for parsing @@ -354,8 +354,11 @@ def replace_alphanumeric(match): number = number.replace(decimal_separator, ".") unit_symbol = match.group(2) unit_word = alphanumeric_units[unit_symbol] - return f"{pronounce_number(float(number) if '.' in number else int(number), full_lang)} {unit_word}" - + try: + return f"{pronounce_number(float(number) if '.' in number else int(number), full_lang)} {unit_word}" + except Exception as e: + LOG.error(f"Failed to pronounce number with unit: {number}{unit_symbol} - ({e})") + return match.group(0) text = alphanumeric_pattern.sub(replace_alphanumeric, text) return text From 2c7af67abf6c623bae6c68df1f5b3ed8d21ef75b Mon Sep 17 00:00:00 2001 From: miro Date: Mon, 4 Aug 2025 13:17:38 +0100 Subject: [PATCH 5/6] update README.md --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 55c2d85..f2729b2 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # OVOS Dialog Normalizer -a dialog transformer plugins for OVOS +a dialog transformer plugins for OpenVoiceOS ## Description @@ -29,7 +29,9 @@ All you need to do is add a entry in your `mycroft.conf` under `"dialog_transfor Pull Requests welcome! -Adding new expansions should be straightforward, to improve number handling please refer to [ovos-number-parser](https://github.com/OpenVoiceOS/ovos-number-parser) +- to support new languages translate the `.json` files in the `locale` folder +- to improve number handling please refer to [ovos-number-parser](https://github.com/OpenVoiceOS/ovos-number-parser) +- to improve date/time handling please refer to [ovos-date-parser](https://github.com/OpenVoiceOS/ovos-date-parser) ## Credits From b29eeb95b269df14cd620dbda09d53857e0882de Mon Sep 17 00:00:00 2001 From: miro Date: Mon, 4 Aug 2025 15:14:54 +0100 Subject: [PATCH 6/6] error handlin --- ovos_dialog_normalizer_plugin/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ovos_dialog_normalizer_plugin/util.py b/ovos_dialog_normalizer_plugin/util.py index 9304cc4..1bf423d 100644 --- a/ovos_dialog_normalizer_plugin/util.py +++ b/ovos_dialog_normalizer_plugin/util.py @@ -353,8 +353,8 @@ def replace_alphanumeric(match): elif decimal_separator != "." and decimal_separator in number: number = number.replace(decimal_separator, ".") unit_symbol = match.group(2) - unit_word = alphanumeric_units[unit_symbol] try: + unit_word = alphanumeric_units[unit_symbol] return f"{pronounce_number(float(number) if '.' in number else int(number), full_lang)} {unit_word}" except Exception as e: LOG.error(f"Failed to pronounce number with unit: {number}{unit_symbol} - ({e})")