From d0a46f46665a75c2e167ec39fd152bcc9bad9a62 Mon Sep 17 00:00:00 2001
From: miro <jarbasai@mailfence.com>
Date: Mon, 4 Aug 2025 12:43:13 +0100
Subject: [PATCH 1/6] feat: port normalization from phoonnx

---
 ovos_dialog_normalizer_plugin/__init__.py | 287 +--------
 ovos_dialog_normalizer_plugin/util.py     | 718 ++++++++++++++++++++++
 requirements.txt                          |   5 +-
 3 files changed, 725 insertions(+), 285 deletions(-)
 create mode 100644 ovos_dialog_normalizer_plugin/util.py

diff --git a/ovos_dialog_normalizer_plugin/__init__.py b/ovos_dialog_normalizer_plugin/__init__.py
index 7c3447b..3a3e65e 100644
--- a/ovos_dialog_normalizer_plugin/__init__.py
+++ b/ovos_dialog_normalizer_plugin/__init__.py
@@ -1,11 +1,10 @@
-import re
 from typing import Optional, Tuple
 
 from ovos_bus_client.session import Session, SessionManager
-from ovos_number_parser import pronounce_number
 from ovos_plugin_manager.templates.transformers import DialogTransformer
 from ovos_utils.log import LOG
-from unicode_rbnf import RbnfEngine, FormatPurpose
+
+from ovos_dialog_normalizer_plugin.util import normalize
 
 
 class DialogNormalizerTransformer(DialogTransformer):
@@ -14,244 +13,6 @@ class DialogNormalizerTransformer(DialogTransformer):
     - Handles common abbreviations
     - Supports multiple languages
     """
-    CONTRACTIONS = {
-        "en": {
-            "I'd": "I would",
-            "I'll": "I will",
-            "I'm": "I am",
-            "I've": "I have",
-            "ain't": "is not",
-            "aren't": "are not",
-            "can't": "can not",
-            "could've": "could have",
-            "couldn't": "could not",
-            "didn't": "did not",
-            "doesn't": "does not",
-            "don't": "do not",
-            "gonna": "going to",
-            "gotta": "got to",
-            "hadn't": "had not",
-            "hasn't": "has not",
-            "haven't": "have not",
-            "he'd": "he would",
-            "he'll": "he will",
-            "he's": "he is",
-            "how'd": "how did",
-            "how'll": "how will",
-            "how's": "how is",
-            "isn't": "is not",
-            "it'd": "it would",
-            "it'll": "it will",
-            "it's": "it is",
-            "might've": "might have",
-            "mightn't": "might not",
-            "must've": "must have",
-            "mustn't": "must not",
-            "needn't": "need not",
-            "oughtn't": "ought not",
-            "shan't": "shall not",
-            "she'd": "she would",
-            "she'll": "she will",
-            "she's": "she is",
-            "should've": "should have",
-            "shouldn't": "should not",
-            "somebody's": "somebody is",
-            "someone'd": "someone would",
-            "someone'll": "someone will",
-            "someone's": "someone is",
-            "that'd": "that would",
-            "that'll": "that will",
-            "that's": "that is",
-            "there'd": "there would",
-            "there're": "there are",
-            "there's": "there is",
-            "they'd": "they would",
-            "they'll": "they will",
-            "they're": "they are",
-            "they've": "they have",
-            "wasn't": "was not",
-            "we'd": "we would",
-            "we'll": "we will",
-            "we're": "we are",
-            "we've": "we have",
-            "weren't": "were not",
-            "what'd": "what did",
-            "what'll": "what will",
-            "what're": "what are",
-            "what's": "what is",
-            "what've": "what have",
-            "whats": "what is",
-            "when'd": "when did",
-            "when's": "when is",
-            "where'd": "where did",
-            "where's": "where is",
-            "where've": "where have",
-            "who'd": "who would",
-            "who'd've": "who would have",
-            "who'll": "who will",
-            "who're": "who are",
-            "who's": "who is",
-            "who've": "who have",
-            "why'd": "why did",
-            "why're": "why are",
-            "why's": "why is",
-            "won't": "will not",
-            "won't've": "will not have",
-            "would've": "would have",
-            "wouldn't": "would not",
-            "wouldn't've": "would not have",
-            "y'ain't": "you are not",
-            "y'aint": "you are not",
-            "y'all": "you all",
-            "ya'll": "you all",
-            "you'd": "you would",
-            "you'd've": "you would have",
-            "you'll": "you will",
-            "you're": "you are",
-            "you've": "you have",
-            "I'm'a": "I am going to",
-            "I'm'o": "I am going to",
-            "I'll've": "I will have",
-            "I'd've": "I would have",
-            "Whatcha": "What are you",
-            "amn't": "am not",
-            "'cause": "because",
-            "can't've": "cannot have",
-            "couldn't've": "could not have",
-            "daren't": "dare not",
-            "daresn't": "dare not",
-            "dasn't": "dare not",
-            "everyone's": "everyone is",
-            "gimme": "give me",
-            "gon't": "go not",
-            "hadn't've": "had not have",
-            "he've": "he would have",
-            "he'll've": "he will have",
-            "he'd've": "he would have",
-            "here's": "here is",
-            "how're": "how are",
-            "how'd'y": "how do you do",
-            "howd'y": "how do you do",
-            "howdy": "how do you do",
-            "'tis": "it is",
-            "'twas": "it was",
-            "it'll've": "it will have",
-            "it'd've": "it would have",
-            "kinda": "kind of",
-            "let's": "let us",
-            "ma'am": "madam",
-            "may've": "may have",
-            "mayn't": "may not",
-            "mightn't've": "might not have",
-            "mustn't've": "must not have",
-            "needn't've": "need not have",
-            "ol'": "old",
-            "oughtn't've": "ought not have",
-            "sha'n't": "shall not",
-            "shan't": "shall not",
-            "shalln't": "shall not",
-            "shan't've": "shall not have",
-            "she'd've": "she would have",
-            "shouldn't've": "should not have",
-            "so've": "so have",
-            "so's": "so is",
-            "something's": "something is",
-            "that're": "that are",
-            "that'd've": "that would have",
-            "there'll": "there will",
-            "there'd've": "there would have",
-            "these're": "these are",
-            "they'll've": "they will have",
-            "they'd've": "they would have",
-            "this's": "this is",
-            "this'll": "this will",
-            "this'd": "this would",
-            "those're": "those are",
-            "to've": "to have",
-            "wanna": "want to",
-            "we'll've": "we will have",
-            "we'd've": "we would have",
-            "what'll've": "what will have",
-            "when've": "when have",
-            "where're": "where are",
-            "which's": "which is",
-            "who'll've": "who will have",
-            "why've": "why have",
-            "will've": "will have",
-            "y'all're": "you all are",
-            "y'all've": "you all have",
-            "y'all'd": "you all would",
-            "y'all'd've": "you all would have",
-            "you'll've": "you will have"
-        }
-    }
-
-    TITLES = {
-        "en": {
-            "Dr.": "Doctor",
-            "Mr.": "Mister",
-            "Prof.": "Professor"
-        },
-        "ca": {
-            "Dr.": "Doctor",
-            "Sr.": "Senyor",
-            "Sra.": "Senyora",
-            "Prof.": "Professor"
-        },
-        "es": {
-            "Dr.": "Doctor",
-            "Sr.": "Señor",
-            "Sra.": "Señora",
-            "Prof.": "Profesor",
-            "D.": "Don",
-            "Dña.": "Doña"
-        },
-        "pt": {
-            "Dr.": "Doutor",
-            "Sr.": "Senhor",
-            "Sra.": "Senhora",
-            "Prof.": "Professor",
-            "Drª.": "Doutora",
-            "Eng.": "Engenheiro",
-            "D.": "Dom",
-            "Dª": "Dona"
-        },
-        "gl": {
-            "Dr.": "Doutor",
-            "Sr.": "Señor",
-            "Sra.": "Señora",
-            "Prof.": "Profesor",
-            "Srta.": "Señorita"
-        },
-        "fr": {
-            "Dr.": "Docteur",
-            "M.": "Monsieur",
-            "Mme": "Madame",
-            "Mlle": "Mademoiselle",
-            "Prof.": "Professeur",
-            "Pr.": "Professeur"
-        },
-        "it": {
-            "Dr.": "Dottore",
-            "Sig.": "Signore",
-            "Sig.ra": "Signora",
-            "Prof.": "Professore",
-            "Dott.ssa": "Dottoressa",
-            "Sig.na": "Signorina"
-        },
-        "nl": {
-            "Dr.": "Dokter",
-            "Dhr.": "De Heer",
-            "Mevr.": "Mevrouw",
-            "Prof.": "Professor",
-            "Drs.": "Dokterandus",
-            "Ing.": "Ingenieur"
-        },
-        "de": {
-            "Dr.": "Doktor",
-            "Prof.": "Professor"
-        }
-    }
 
     def __init__(self, name="ovos-dialog-normalizer-plugin", priority=5, config=None):
         super().__init__(name=name, priority=priority, config=config)
@@ -260,51 +21,9 @@ def transform(self, dialog: str, context: Optional[dict] = None) -> Tuple[str, d
         """Normalize dialog text."""
         context = context or {}
         sess = Session.deserialize(context["session"]) if "session" in context else SessionManager.get()
-        lang = sess.lang.split("-")[0]
-
         original = dialog
         try:
-            rbnf_engine = RbnfEngine.for_language(lang)
-        except:  # doesnt support lang
-            rbnf_engine = None
-
-        # substitute ' €' by 'euros' and 'someword€' by 'someword euros'
-        dialog = re.sub(r"(\w+)\s*€", r"\1 euros", dialog)
-
-        try:
-            # TODO - add language specific code here if needed
-            if lang == "gl":
-                # substitute ' ºC' by 'graos centígrados' and 'somewordºC' by 'someword graos centígrados'
-                dialog = re.sub(r"(\w+)\s*ºC", r"\1 graos centígrados", dialog)
-
-            words = dialog.split()
-            for idx, word in enumerate(words):
-
-                if word in self.CONTRACTIONS.get(lang, {}):
-                    words[idx] = self.CONTRACTIONS[lang][word]
-                    continue
-
-                if word in self.TITLES.get(lang, {}):
-                    words[idx] = self.TITLES[lang][word]
-                    continue
-
-                if word.isdigit():
-                    try:
-                        words[idx] = pronounce_number(int(word), lang=sess.lang)
-                    except Exception as e:
-                        LOG.error(f"ovos-number-parser failed to pronounce number: {word} - ({e})")
-
-                # NOTE: pronounce_digit may return the digit itself again for some languages (upstream bug)
-                # we recheck if isdigit() to handle this
-                if rbnf_engine and words[idx].isdigit():
-                    # fallback to unicode RBNF
-                    try:
-                        words[idx] = rbnf_engine.format_number(word, FormatPurpose.CARDINAL).text
-                    except Exception as e:
-                        LOG.error(f"unicode-rbnf failed to pronounce number: {word} - ({e})")
-
-            dialog = " ".join(words)
-
+            dialog = normalize(original, sess.lang)
             LOG.debug(f"normalized dialog: '{original}' -> '{dialog}'")
         except Exception as e:
             LOG.error(f"Failed to normalize dialog: {e}")
diff --git a/ovos_dialog_normalizer_plugin/util.py b/ovos_dialog_normalizer_plugin/util.py
new file mode 100644
index 0000000..324e33e
--- /dev/null
+++ b/ovos_dialog_normalizer_plugin/util.py
@@ -0,0 +1,718 @@
+import datetime
+import logging
+import re
+import string
+from datetime import date
+
+from ovos_date_parser import nice_time, nice_date
+from ovos_number_parser import pronounce_number, pronounce_fraction
+from ovos_number_parser.util import is_numeric
+from unicode_rbnf import RbnfEngine, FormatPurpose
+
+LOG = logging.getLogger("normalize")
+
+# Contractions mapped to their expanded forms, keyed by language code.
+# Only English is populated; other languages get an empty lookup via .get().
+CONTRACTIONS = {
+    "en": {
+        "I'd": "I would",
+        "I'll": "I will",
+        "I'm": "I am",
+        "I've": "I have",
+        "ain't": "is not",
+        "aren't": "are not",
+        "can't": "can not",
+        "could've": "could have",
+        "couldn't": "could not",
+        "didn't": "did not",
+        "doesn't": "does not",
+        "don't": "do not",
+        "gonna": "going to",
+        "gotta": "got to",
+        "hadn't": "had not",
+        "hasn't": "has not",
+        "haven't": "have not",
+        "he'd": "he would",
+        "he'll": "he will",
+        "he's": "he is",
+        "how'd": "how did",
+        "how'll": "how will",
+        "how's": "how is",
+        "isn't": "is not",
+        "it'd": "it would",
+        "it'll": "it will",
+        "it's": "it is",
+        "might've": "might have",
+        "mightn't": "might not",
+        "must've": "must have",
+        "mustn't": "must not",
+        "needn't": "need not",
+        "oughtn't": "ought not",
+        "shan't": "shall not",
+        "she'd": "she would",
+        "she'll": "she will",
+        "she's": "she is",
+        "should've": "should have",
+        "shouldn't": "should not",
+        "somebody's": "somebody is",
+        "someone'd": "someone would",
+        "someone'll": "someone will",
+        "someone's": "someone is",
+        "that'd": "that would",
+        "that'll": "that will",
+        "that's": "that is",
+        "there'd": "there would",
+        "there're": "there are",
+        "there's": "there is",
+        "they'd": "they would",
+        "they'll": "they will",
+        "they're": "they are",
+        "they've": "they have",
+        "wasn't": "was not",
+        "we'd": "we would",
+        "we'll": "we will",
+        "we're": "we are",
+        "we've": "we have",
+        "weren't": "were not",
+        "what'd": "what did",
+        "what'll": "what will",
+        "what're": "what are",
+        "what's": "what is",
+        "what've": "what have",
+        "whats": "what is",
+        "when'd": "when did",
+        "when's": "when is",
+        "where'd": "where did",
+        "where's": "where is",
+        "where've": "where have",
+        "who'd": "who would",
+        "who'd've": "who would have",
+        "who'll": "who will",
+        "who're": "who are",
+        "who's": "who is",
+        "who've": "who have",
+        "why'd": "why did",
+        "why're": "why are",
+        "why's": "why is",
+        "won't": "will not",
+        "won't've": "will not have",
+        "would've": "would have",
+        "wouldn't": "would not",
+        "wouldn't've": "would not have",
+        "y'ain't": "you are not",
+        "y'aint": "you are not",
+        "y'all": "you all",
+        "ya'll": "you all",
+        "you'd": "you would",
+        "you'd've": "you would have",
+        "you'll": "you will",
+        "you're": "you are",
+        "you've": "you have",
+        "I'm'a": "I am going to",
+        "I'm'o": "I am going to",
+        "I'll've": "I will have",
+        "I'd've": "I would have",
+        "Whatcha": "What are you",
+        "amn't": "am not",
+        "'cause": "because",
+        "can't've": "cannot have",
+        "couldn't've": "could not have",
+        "daren't": "dare not",
+        "daresn't": "dare not",
+        "dasn't": "dare not",
+        "everyone's": "everyone is",
+        "gimme": "give me",
+        "gon't": "go not",
+        "hadn't've": "had not have",
+        "he've": "he would have",
+        "he'll've": "he will have",
+        "he'd've": "he would have",
+        "here's": "here is",
+        "how're": "how are",
+        "how'd'y": "how do you do",
+        "howd'y": "how do you do",
+        "howdy": "how do you do",
+        "'tis": "it is",
+        "'twas": "it was",
+        "it'll've": "it will have",
+        "it'd've": "it would have",
+        "kinda": "kind of",
+        "let's": "let us",
+        "ma'am": "madam",
+        "may've": "may have",
+        "mayn't": "may not",
+        "mightn't've": "might not have",
+        "mustn't've": "must not have",
+        "needn't've": "need not have",
+        "ol'": "old",
+        "oughtn't've": "ought not have",
+        "sha'n't": "shall not",
+        "shan't": "shall not",  # NOTE(review): duplicate key, already defined earlier with the same value
+        "shalln't": "shall not",
+        "shan't've": "shall not have",
+        "she'd've": "she would have",
+        "shouldn't've": "should not have",
+        "so've": "so have",
+        "so's": "so is",
+        "something's": "something is",
+        "that're": "that are",
+        "that'd've": "that would have",
+        "there'll": "there will",
+        "there'd've": "there would have",
+        "these're": "these are",
+        "they'll've": "they will have",
+        "they'd've": "they would have",
+        "this's": "this is",
+        "this'll": "this will",
+        "this'd": "this would",
+        "those're": "those are",
+        "to've": "to have",
+        "wanna": "want to",
+        "we'll've": "we will have",
+        "we'd've": "we would have",
+        "what'll've": "what will have",
+        "when've": "when have",
+        "where're": "where are",
+        "which's": "which is",
+        "who'll've": "who will have",
+        "why've": "why have",
+        "will've": "will have",
+        "y'all're": "you all are",
+        "y'all've": "you all have",
+        "y'all'd": "you all would",
+        "y'all'd've": "you all would have",
+        "you'll've": "you will have"
+    }
+}
+
+# Abbreviation -> spoken-form tables keyed by language code: titles here, units further down.
+TITLES = {
+    "en": {
+        "Dr.": "Doctor",
+        "Mr.": "Mister",
+        "Prof.": "Professor"
+    },
+    "ca": {
+        "Dr.": "Doctor",
+        "Sr.": "Senyor",
+        "Sra.": "Senyora",
+        "Prof.": "Professor"
+    },
+    "es": {
+        "Dr.": "Doctor",
+        "Sr.": "Señor",
+        "Sra.": "Señora",
+        "Prof.": "Profesor",
+        "D.": "Don",
+        "Dña.": "Doña"
+    },
+    "pt": {
+        "Dr.": "Doutor",
+        "Sr.": "Senhor",
+        "Sra.": "Senhora",
+        "Prof.": "Professor",
+        "Drª.": "Doutora",
+        "Eng.": "Engenheiro",
+        "D.": "Dom",
+        "Dª": "Dona"
+    },
+    "gl": {
+        "Dr.": "Doutor",
+        "Sr.": "Señor",
+        "Sra.": "Señora",
+        "Prof.": "Profesor",
+        "Srta.": "Señorita"
+    },
+    "fr": {
+        "Dr.": "Docteur",
+        "M.": "Monsieur",
+        "Mme": "Madame",
+        "Mlle": "Mademoiselle",
+        "Prof.": "Professeur",
+        "Pr.": "Professeur"
+    },
+    "it": {
+        "Dr.": "Dottore",
+        "Sig.": "Signore",
+        "Sig.ra": "Signora",
+        "Prof.": "Professore",
+        "Dott.ssa": "Dottoressa",
+        "Sig.na": "Signorina"
+    },
+    "nl": {
+        "Dr.": "Dokter",
+        "Dhr.": "De Heer",
+        "Mevr.": "Mevrouw",
+        "Prof.": "Professor",
+        "Drs.": "Dokterandus",
+        "Ing.": "Ingenieur"
+    },
+    "de": {
+        "Dr.": "Doktor",
+        "Prof.": "Professor"
+    }
+}
+
+UNITS = {  # language code -> {unit symbol as written: spoken form}
+    "en": {
+        "€": "euros",
+        "%": "per cent",
+        "°C": "degrees celsius",
+        "°F": "degrees fahrenheit",
+        "°K": "degrees kelvin",
+        "°": "degrees",
+        "$": "dollars",
+        "£": "pounds",
+        "km": "kilometers",
+        "m": "meters",
+        "cm": "centimeters",
+        "mm": "millimeters",
+        "ft": "feet",
+        "in": "inches",
+        "yd": "yards",
+        "mi": "miles",
+        "kg": "kilograms",
+        "g": "grams",
+        "lb": "pounds",
+        "oz": "ounces",
+        "L": "liters",
+        "mL": "milliliters",
+        "gal": "gallons",
+        "qt": "quarts",
+        "pt": "pints",
+        "hr": "hours",
+        "min": "minutes",
+        "s": "seconds"
+    },
+    "pt": {
+        "€": "euros",
+        "%": "por cento",
+        "°C": "graus celsius",
+        "°F": "graus fahrenheit",
+        "°K": "graus kelvin",
+        "°": "graus",
+        "$": "dólares",
+        "£": "libras",
+        "km": "quilômetros",
+        "m": "metros",
+        "cm": "centímetros",
+        "mm": "milímetros",
+        "kg": "quilogramas",
+        "g": "gramas",
+        "L": "litros",
+        "mL": "mililitros",
+        "h": "horas",
+        "min": "minutos",
+        "s": "segundos"
+    },
+    "es": {
+        "€": "euros",
+        "%": "por ciento",
+        "°C": "grados celsius",
+        "°F": "grados fahrenheit",
+        "°K": "grados kelvin",
+        "°": "grados",
+        "$": "dólares",
+        "£": "libras",
+        "km": "kilómetros",
+        "m": "metros",
+        "cm": "centímetros",
+        "kg": "kilogramos",
+        "g": "gramos",
+        "L": "litros",
+        "mL": "mililitros"
+    },
+    "fr": {
+        "€": "euros",
+        "%": "pour cent",
+        "°C": "degrés celsius",
+        "°F": "degrés fahrenheit",
+        "°K": "degrés kelvin",
+        "°": "degrés",
+        "$": "dollars",
+        "£": "livres",
+        "km": "kilomètres",
+        "m": "mètres",
+        "cm": "centimètres",
+        "kg": "kilogrammes",
+        "g": "grammes",
+        "L": "litres",
+        "mL": "millilitres"
+    },
+    "de": {
+        "€": "Euro",
+        "%": "Prozent",
+        "°C": "Grad Celsius",
+        "°F": "Grad Fahrenheit",
+        "°K": "Grad Kelvin",
+        "°": "Grad",
+        "$": "Dollar",
+        "£": "Pfund",
+        "km": "Kilometer",
+        "m": "Meter",
+        "cm": "Zentimeter",
+        "kg": "Kilogramm",
+        "g": "Gramm",
+        "L": "Liter",
+        "mL": "Milliliter"
+    }
+}
+
+
+def _get_number_separators(full_lang: str) -> tuple[str, str]:
+    """
+    Return the ``(decimal, thousands)`` separator pair for a language tag.
+    Only the primary subtag of ``full_lang`` (text before the first '-') is used.
+    Comma-decimal languages ('pt', 'es', 'fr', 'de', 'it', 'nl', 'ca', 'gl')
+    yield ``(',', '.')``; every other language yields ``('.', ',')``.
+    """
+    lang_code = full_lang.split("-")[0]
+    # Written conventions: these locales use ',' for decimals and '.' for grouping.
+    comma_decimal_langs = ("pt", "es", "fr", "de", "it", "nl", "ca", "gl")
+    if lang_code in comma_decimal_langs:
+        return ",", "."
+    # Everything else falls back to the English-style convention.
+    return ".", ","
+
+
+def _normalize_number_word(word: str, full_lang: str, rbnf_engine) -> str:
+    """
+    Spell out a single token that looks like a number or a fraction.
+
+    Trailing punctuation is split off before parsing and re-attached to the
+    spoken form. Locale-specific decimal/thousands separators are resolved
+    via ``_get_number_separators``. ``rbnf_engine`` (may be ``None``) is the
+    fallback for plain integers that ovos-number-parser could not pronounce.
+    Returns ``word`` unchanged when it is not a number or nothing could
+    pronounce it; pronunciation errors are logged, not raised.
+    """
+    cleaned_word = word.rstrip(string.punctuation)
+    suffix = word[len(cleaned_word):]  # stripped punctuation, restored on return
+
+    # Handle fractions like '3/3'
+    if is_fraction(cleaned_word):
+        try:
+            return pronounce_fraction(cleaned_word, full_lang) + suffix
+        except Exception as e:
+            LOG.error(f"ovos-number-parser failed to pronounce fraction: {word} - ({e})")
+            return word
+
+    # Convert the locale's written form into something int()/float() accepts
+    decimal_separator, thousands_separator = _get_number_separators(full_lang)
+    candidate = cleaned_word
+    thousands_pos = candidate.find(thousands_separator)
+    decimal_pos = candidate.find(decimal_separator)
+    if -1 < thousands_pos < decimal_pos:
+        # Grouping before the decimal mark, e.g. '123.456,78' -> '123456.78'
+        candidate = candidate.replace(thousands_separator, "").replace(decimal_separator, ".")
+    elif decimal_pos != -1 and is_numeric(candidate.replace(decimal_separator, ".", 1)):
+        # Handle cases like '1,2' -> '1.2'
+        candidate = candidate.replace(decimal_separator, ".", 1)
+    elif thousands_pos != -1 and is_numeric(candidate.replace(thousands_separator, "")):
+        # Validate with ALL separators removed so multi-group values such as
+        # '1,234,567' are accepted, not just single-group ones like '1.234'
+        candidate = candidate.replace(thousands_separator, "")
+
+    if is_numeric(candidate):
+        try:
+            num = float(candidate) if "." in candidate else int(candidate)
+            spoken = pronounce_number(num, lang=full_lang)
+            # A result that is still digits means nothing was pronounced; fall through to RBNF
+            if not spoken.isdigit():
+                return spoken + suffix
+        except Exception as e:
+            LOG.error(f"ovos-number-parser failed to pronounce number: {word} - ({e})")
+
+    # Fallback to unicode RBNF for plain integers the parser could not handle
+    if rbnf_engine and cleaned_word.isdigit():
+        try:
+            return rbnf_engine.format_number(cleaned_word, FormatPurpose.CARDINAL).text + suffix
+        except Exception as e:
+            LOG.error(f"unicode-rbnf failed to pronounce number: {word} - ({e})")
+    return word
+
+
+# --- Date and Time Pronunciation ---
+def pronounce_date(date_obj: date, full_lang: str) -> str:
+    """Return ``date_obj`` spoken in ``full_lang`` (delegates to ``nice_date``)."""
+    spoken_date = nice_date(date_obj, full_lang)
+    # ovos-date-parser produces the text; nothing is post-processed here.
+    return spoken_date
+
+
+def pronounce_time(time_string: str, full_lang: str) -> str:
+    """
+    Pronounce a military-style time string such as "15h01" via ovos-date-parser.
+    The hour/minute separator may be 'h' or 'H'. If the string cannot be parsed
+    or pronounced, a warning is logged and the separator is replaced by a space.
+    """
+    try:
+        hours, mins = time_string.lower().split("h")  # lower(): the separator may arrive as 'H'
+        time_obj = datetime.time(int(hours), int(mins))
+        # Use nice_time from ovos-date-parser
+        return nice_time(time_obj, full_lang, speech=True, use_24hour=True, use_ampm=False)
+    except Exception as e:
+        LOG.warning(f"Failed to parse time string '{time_string}': {e}")
+        return re.sub(r"[hH]", " ", time_string)
+
+
+def _normalize_dates_and_times(text: str, full_lang: str, date_format: str = "DMY") -> str:
+    """
+    Replace clock times and numeric dates in ``text`` with their spoken form.
+
+    Three passes run in order:
+    - English only: a number followed by am/pm ("5pm", "5 PM") becomes "5 P M"
+      so the marker is spelled out; other occurrences of "am"/"pm" are left alone.
+    - Times written like "15h01" are handed to ``pronounce_time``.
+    - Dates made of three numeric parts joined by '/' or '-' are handed to
+      ``pronounce_date``. A 4-digit part is the year, and a year-first date is
+      read as year/month/day. Otherwise the last part is the year, a part
+      above 12 is the day, and ``date_format`` ("DMY" default, or "MDY")
+      settles what is still ambiguous. 2-digit years: 00-29 -> 20xx, 30-99 -> 19xx.
+
+    Dates that do not form a valid calendar date are logged and left untouched.
+    """
+    lang_code = full_lang.split("-")[0]
+    if lang_code == "en":
+        def spell_meridiem(match):
+            # "pm" -> "P M" so TTS reads the letters instead of a word
+            return f"{match.group(1)} {' '.join(match.group(2).upper())}"
+
+        # Only rewrite am/pm right after a number: a plain substring replace
+        # would also mangle ordinary words such as "am", "name" or "camping".
+        text = re.sub(r"(?i)(\d+)[ \t]*(am|pm)\b", spell_meridiem, text)
+
+    # Normalize times like "15h01" to words
+    time_pattern = re.compile(r"(\d{1,2})h(\d{2})", re.IGNORECASE)
+
+    def replace_time(match):
+        return pronounce_time(match.group(0), full_lang)
+
+    text = time_pattern.sub(replace_time, text)
+
+    # Find dates like "DD/MM/YYYY" or "YYYY/MM/DD"
+    date_pattern = re.compile(r"(\d{1,4})[/-](\d{1,2})[/-](\d{1,4})")
+
+    def replace_date(match):
+        """Return the spoken form of one matched date, or the match itself if invalid."""
+        part1_str, part2_str, part3_str = match.groups()
+        p1, p2, p3 = int(part1_str), int(part2_str), int(part3_str)
+
+        # Determine year first based on length (4 digits)
+        year_first = len(part1_str) == 4
+        if year_first:
+            year, rest_parts = p1, [p2, p3]
+        else:
+            # No leading 4-digit year: the last part is taken as the year
+            year, rest_parts = p3, [p1, p2]
+            if len(part3_str) != 4 and year < 100:
+                # Assume years 00-29 are 2000-2029, 30-99 are 1930-1999
+                year = 2000 + year if year < 30 else 1900 + year
+
+        # A two-digit value above 12 cannot be a month, so it must be the day
+        day_candidate = next((p for p in rest_parts if 12 < p < 100), None)
+        if day_candidate is not None:
+            day = day_candidate
+            rest_parts.remove(day_candidate)
+            month = rest_parts[0]
+        elif year_first or date_format.lower() == "mdy":
+            # Year-first dates are month-then-day (ISO 8601 order), not day-first
+            month, day = rest_parts
+        else:  # default to DD/MM/YY
+            day, month = rest_parts
+
+        try:
+            return pronounce_date(date(year, month, day), full_lang)
+        except (ValueError, IndexError) as e:
+            LOG.warning(f"Could not parse date from '{match.group(0)}': {e}")
+            return match.group(0)
+
+    # sub() visits every date in the text, not only the first one
+    return date_pattern.sub(replace_date, text)
+
+
+def _normalize_word_hyphen_digit(text: str) -> str:
+    """
+    Split tokens where a word is hyphenated to a number, replacing the
+    hyphen with a space, e.g. 'sub-23' -> 'sub 23'.
+    """
+    # One or more word characters, a hyphen, then one or more digits;
+    # both captured groups are kept and only the hyphen is swapped out.
+    hyphen_digit_re = r"(\w+)-(\d+)"
+    return re.sub(hyphen_digit_re, r"\1 \2", text)
+
+
+def _normalize_units(text: str, full_lang: str) -> str:
+    """
+    Expand "<number><unit>" pairs into words using the UNITS table.
+
+    'º' is always rewritten to '°'. For languages present in UNITS, symbolic
+    units (keys that are not alphanumeric, e.g. '€', '°C') are replaced first,
+    then alphanumeric ones (e.g. 'km'), which also require a word boundary
+    after the unit. Unit symbols are matched case-insensitively. The number is
+    parsed with the language's thousands/decimal separators and spoken via
+    ``pronounce_number``; if that fails the error is logged and the original
+    text is kept. Languages without a UNITS entry only get the 'º' rewrite.
+    """
+    text = text.replace("º", "°")  # these characters look the same... but...
+    lang_code = full_lang.split("-")[0]
+    if lang_code not in UNITS:
+        return text
+
+    # Determine number separators for the language
+    decimal_separator, thousands_separator = _get_number_separators(full_lang)
+    # Pattern to match numbers with optional thousands and decimal separators
+    number_pattern_str = rf"(\d+[{re.escape(thousands_separator)}]?\d*[{re.escape(decimal_separator)}]?\d*)"
+
+    def expand(source: str, units: dict, boundary: str) -> str:
+        """Replace every number+unit occurrence in ``source`` for one group of units."""
+        if not units:
+            return source
+
+        # Longest symbols first so '°C' is tried before '°' and 'mm' before 'm'
+        alternatives = "|".join(re.escape(unit) for unit in sorted(units, key=len, reverse=True))
+        pattern = re.compile(number_pattern_str + r"\s*(" + alternatives + r")" + boundary,
+                             re.IGNORECASE)
+        # Matching ignores case, so the lookup needs a case-insensitive fallback
+        # (otherwise '5 KM' or '20°c' would raise KeyError)
+        by_lower = {symbol.lower(): spoken for symbol, spoken in units.items()}
+
+        def replace(match):
+            number = match.group(1)
+            unit_symbol = match.group(2)
+            # Drop a grouping separator only when exactly three digits follow it
+            number = re.sub(re.escape(thousands_separator) + r"(?=\d{3}(?!\d))", "", number)
+            number = number.replace(decimal_separator, ".")
+            unit_word = units.get(unit_symbol)
+            if unit_word is None:
+                unit_word = by_lower[unit_symbol.lower()]
+
+            try:
+                value = float(number) if "." in number else int(number)
+                return f"{pronounce_number(value, full_lang)} {unit_word}"
+            except Exception as e:
+                LOG.error(f"Failed to pronounce number with unit: {number}{unit_symbol} - ({e})")
+                return match.group(0)
+
+        return pattern.sub(replace, source)
+
+    # Separate units into symbolic and alphanumeric
+    all_units = UNITS[lang_code]
+    symbolic_units = {k: v for k, v in all_units.items() if not k.isalnum()}
+    alphanumeric_units = {k: v for k, v in all_units.items() if k.isalnum()}
+    # Symbolic units need no trailing word boundary; alphanumeric ones do
+    text = expand(text, symbolic_units, "")
+    text = expand(text, alphanumeric_units, r"\b")
+    return text
+
+
+def _normalize_word(word: str, full_lang: str, rbnf_engine) -> str:
+    """
+    Helper function to normalize a single word.
+    """
+    lang_code = full_lang.split("-")[0]
+
+    if word in CONTRACTIONS.get(lang_code, {}):
+        return CONTRACTIONS[lang_code][word]
+
+    if word in TITLES.get(lang_code, {}):
+        return TITLES[lang_code][word]
+
+    # Delegate number parsing to the new helper function
+    normalized_number = _normalize_number_word(word, full_lang, rbnf_engine)
+    if normalized_number != word:
+        return normalized_number
+
+    return word
+
+
+def is_fraction(word: str) -> bool:
+    """Checks if a word is a fraction like '3/3'."""
+    if "/" in word:
+        parts = word.split("/")
+        if len(parts) == 2:
+            n1, n2 = parts
+            return n1.isdigit() and n2.isdigit()
+    return False
+
+
+def normalize(text: str, lang: str) -> str:
+    """
+    Normalizes a text string by expanding contractions, titles, and pronouncing
+    numbers, units, and fractions.
+    """
+    full_lang = lang
+    lang_code = full_lang.split("-")[0]
+    dialog = text
+
+    # Step 1: Handle dates and times with ovos-date-parser
+    date_format = "MDY" if full_lang.lower() == "en-us" else "DMY"
+    dialog = _normalize_dates_and_times(dialog, full_lang, date_format)
+
+    # Step 2: Normalize words with hyphens and digits
+    dialog = _normalize_word_hyphen_digit(dialog)
+
+    # Step 3: Expand units attached to numbers
+    dialog = _normalize_units(dialog, full_lang)
+
+    # Step 4: Normalize word-by-word
+    words = dialog.split()
+    rbnf_engine = None
+    try:
+        rbnf_engine = RbnfEngine.for_language(lang_code)
+    except (ValueError, KeyError) as e:
+        LOG.debug(f"RBNF engine not available for language '{lang_code}': {e}")
+
+    normalized_words = [_normalize_word(word, full_lang, rbnf_engine) for word in words]
+    dialog = " ".join(normalized_words)
+
+    return dialog
+
+
+if __name__ == "__main__":
+    # --- Example usage for demonstration purposes ---
+
+    # General normalization examples
+    print("General English example: " + normalize('I\'m Dr. Prof. 3/3 0.5% of 12345€, 5ft, and 10kg', 'en'))
+    print(
+        f"Word Salad Portuguese (Dr. Prof. 3/3 0,5% de 12345€, 5m, e 10kg): {normalize('Dr. Prof. 3/3 0,5% de 12345€, 5m, e 10kg', 'pt')}")
+    print(
+        f"Word Salad Portuguese (Dr. Prof. 3/3 0.5% de 12345€, 5m, e 10kg): {normalize('Dr. Prof. 3/3 0.5% de 12345€, 5m, e 10kg', 'pt')}")
+
+    # Portuguese examples with comma decimal separator
+    print("\n--- Portuguese Decimal Separator Examples ---")
+    print(
+        f"Original: 'A coima aplicada é de 1,2 milhões de euros.' Normalized: '{normalize('A coima aplicada é de 1,2 milhões de euros.', 'pt')}'")
+    print(
+        f"Original: 'Agora, tem 1,88 metros e muito para contar.' Normalized: '{normalize('Agora, tem 1,88 metros e muito para contar.', 'pt')}'")
+    print(
+        f"Original: 'Ainda temos 1,7 milhões de pobres!' Normalized: '{normalize('Ainda temos 1,7 milhões de pobres!', 'pt')}'")
+    print(f"Original: 'O lucro foi de 123.456,78€.' Normalized: '{normalize('O lucro foi de 123.456,78€.', 'pt')}'")
+    print(f"Normalized: '{normalize('O lucro foi de 123.456,78€.', 'pt-PT')}'")
+
+    # English dates and times
+    print("\n--- English Date & Time Examples ---")
+    print(f"English date (MDY format): {normalize('The date is 08/03/2025', 'en-US')}")
+    print(f"English ambiguous date (MDY assumed): {normalize('The report is due 15/05/2025', 'en-US')}")
+    print(f"English date with dashes: {normalize('The event is on 11-04-2025', 'en-US')}")
+    print(f"English AM/PM time: {normalize('The meeting is at 10am', 'en-US')}")
+    print(f"English military time: {normalize('The party is at 19h30', 'en-US')}")
+    print(f"English month name: {normalize('The report is due 15 May 2025', 'en-US')}")
+
+    # Portuguese dates and times
+    print("\n--- Portuguese Date & Time Examples ---")
+    print(f"Portuguese date (A data é 03/08/2025): {normalize('A data é 03/08/2025', 'pt')}")
+    print(
+        f"Portuguese ambiguous date (O relatório é para 15/05/2025): {normalize('O relatório é para 15/05/2025', 'pt')}")
+    print(
+        f"Portuguese date with dashes (O evento é no dia 25-10-2024): {normalize('O evento é no dia 25-10-2024', 'pt')}")
+    print(f"Portuguese military time (O encontro é às 14h30): {normalize('O encontro é às 14h30', 'pt')}")
+
+    # Other examples
+    print(f"\n--- Other Examples ---")
+    print(f"English fraction: {normalize('The fraction is 1/2', 'en')}")
+    print(f"English plural fraction: {normalize('There are 3/4 of a cup', 'en')}")
+    print(f"Spanish example with units: {normalize('The temperature is 25ºC', 'es')}")
+    print(f"Portuguese with punctuation: {normalize('12345€, 5m e 10kg', 'pt')}")
+    print(
+        f"Portuguese word-digit: {normalize('Esta temporada leva oito jogos ao serviço da equipa sub-23 leonina.', 'pt')}")
diff --git a/requirements.txt b/requirements.txt
index 726ac59..0bb67ef 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,5 @@
 ovos-plugin-manager
-ovos-number-parser>=0.3.0
\ No newline at end of file
+langcodes
+ovos-number-parser>=0.4.0
+ovos-date-parser>=0.6.4a1
+unicode_rbnf
\ No newline at end of file

From d1d53a938da1ff526c441ef05e269d6657e7a7e1 Mon Sep 17 00:00:00 2001
From: "coderabbitai[bot]"
 <136622811+coderabbitai[bot]@users.noreply.github.com>
Date: Mon, 4 Aug 2025 12:48:36 +0100
Subject: [PATCH 2/6] =?UTF-8?q?=F0=9F=93=9D=20Add=20docstrings=20to=20`pho?=
 =?UTF-8?q?onnx`=20(#3)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Docstrings generation was requested by @JarbasAl.

* https://github.com/OpenVoiceOS/ovos-dialog-normalizer-plugin/pull/2#issuecomment-3150270624

The following files were modified:

* `ovos_dialog_normalizer_plugin/__init__.py`
* `ovos_dialog_normalizer_plugin/util.py`

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
---
 ovos_dialog_normalizer_plugin/__init__.py |  14 ++-
 ovos_dialog_normalizer_plugin/util.py     | 102 +++++++++++++++++-----
 2 files changed, 94 insertions(+), 22 deletions(-)

diff --git a/ovos_dialog_normalizer_plugin/__init__.py b/ovos_dialog_normalizer_plugin/__init__.py
index 3a3e65e..817f086 100644
--- a/ovos_dialog_normalizer_plugin/__init__.py
+++ b/ovos_dialog_normalizer_plugin/__init__.py
@@ -15,10 +15,22 @@ class DialogNormalizerTransformer(DialogTransformer):
     """
 
     def __init__(self, name="ovos-dialog-normalizer-plugin", priority=5, config=None):
+        """
+        Initialize the dialog normalizer transformer with a name, priority, and optional configuration.
+        """
         super().__init__(name=name, priority=priority, config=config)
 
     def transform(self, dialog: str, context: Optional[dict] = None) -> Tuple[str, dict]:
-        """Normalize dialog text."""
+        """
+        Normalizes the input dialog text according to the session's language settings.
+        
+        Parameters:
+            dialog (str): The dialog text to be normalized.
+            context (dict, optional): Optional context containing session information.
+        
+        Returns:
+            tuple: A tuple containing the normalized dialog string and the (unchanged) context dictionary.
+        """
         context = context or {}
         sess = Session.deserialize(context["session"]) if "session" in context else SessionManager.get()
         original = dialog
diff --git a/ovos_dialog_normalizer_plugin/util.py b/ovos_dialog_normalizer_plugin/util.py
index 324e33e..f7bb3a7 100644
--- a/ovos_dialog_normalizer_plugin/util.py
+++ b/ovos_dialog_normalizer_plugin/util.py
@@ -361,10 +361,13 @@
 
 def _get_number_separators(full_lang: str) -> tuple[str, str]:
     """
-    Determines decimal and thousands separators based on language.
-    Defaults to '.' decimal and ',' thousands for most languages.
-    Special cases:
-    - 'pt', 'es', 'fr', 'de': ',' decimal and '.' thousands.
+    Return the decimal and thousands separators appropriate for the specified language.
+    
+    Parameters:
+    	full_lang (str): The full language code (e.g., "en-US", "pt-BR").
+    
+    Returns:
+    	tuple[str, str]: A tuple containing the decimal separator and thousands separator for the language.
     """
     lang_code = full_lang.split("-")[0]
     decimal_separator = '.'
@@ -377,8 +380,9 @@ def _get_number_separators(full_lang: str) -> tuple[str, str]:
 
 def _normalize_number_word(word: str, full_lang: str, rbnf_engine) -> str:
     """
-    Helper function to normalize a single word that is a number, handling
-    decimal and thousands separators based on locale.
+    Normalizes a word representing a number or fraction, converting it to its spoken form according to locale conventions.
+    
+    Handles locale-specific decimal and thousands separators, expands fractions, and uses available pronunciation engines to generate the spoken equivalent. If normalization fails, returns the original word.
     """
     cleaned_word = word.rstrip(string.punctuation)
 
@@ -435,16 +439,23 @@ def _normalize_number_word(word: str, full_lang: str, rbnf_engine) -> str:
 # --- Date and Time Pronunciation ---
 def pronounce_date(date_obj: date, full_lang: str) -> str:
     """
-    Pronounces a date object using ovos-date-parser.
+    Return the spoken form of a date object in the specified language.
+    
+    Parameters:
+        date_obj (date): The date to be pronounced.
+        full_lang (str): The language code for pronunciation.
+    
+    Returns:
+        str: The spoken representation of the date.
     """
     return nice_date(date_obj, full_lang)
 
 
 def pronounce_time(time_string: str, full_lang: str) -> str:
     """
-    Pronounces a time string using ovos-date-parser.
-    Handles military time like "15h01" and converts it to a
-    datetime.time object before passing it to nice_time.
+    Convert a time string in "HHhMM" format to its spoken form in the specified language.
+    
+    If parsing fails, returns the input string with "h" replaced by a space.
     """
     try:
         hours, mins = time_string.split("h")
@@ -458,8 +469,17 @@ def pronounce_time(time_string: str, full_lang: str) -> str:
 
 def _normalize_dates_and_times(text: str, full_lang: str, date_format: str = "DMY") -> str:
     """
-    Helper function to normalize dates and times using regular expressions.
-    This prepares the strings for pronunciation.
+    Normalizes dates and times in a text string, converting them to their spoken equivalents for the specified language.
+    
+    This function identifies and processes time expressions (e.g., "15h01") and date patterns (e.g., "DD/MM/YYYY", "YYYY/MM/DD") using regular expressions. It handles locale-specific formats, expands ambiguous years, and replaces recognized dates and times with their pronounced forms suitable for text-to-speech. For English, it also separates and expands "am"/"pm" time markers.
+    
+    Parameters:
+        text (str): The input text containing dates and times to normalize.
+        full_lang (str): The language code specifying the locale for normalization.
+        date_format (str, optional): The expected date format ("DMY" or "MDY"). Defaults to "DMY".
+    
+    Returns:
+        str: The text with dates and times replaced by their spoken equivalents.
     """
     lang_code = full_lang.split("-")[0]
     # Pre-process with regex to handle English am/pm times
@@ -472,6 +492,15 @@ def _normalize_dates_and_times(text: str, full_lang: str, date_format: str = "DM
     time_pattern = re.compile(r"(\d{1,2})h(\d{2})", re.IGNORECASE)
 
     def replace_time(match):
+        """
+        Replaces a matched time string with its spoken equivalent in the specified language.
+        
+        Parameters:
+        	match: A regex match object containing the time string to be pronounced.
+        
+        Returns:
+        	A string with the time expressed in spoken form for the target language.
+        """
         time_str = match.group(0)
         return pronounce_time(time_str, full_lang)
 
@@ -533,8 +562,9 @@ def replace_time(match):
 
 def _normalize_word_hyphen_digit(text: str) -> str:
     """
-    Helper function to normalize words attached to digits with a hyphen,
-    such as 'sub-23' -> 'sub 23'.
+    Replaces occurrences of a word followed by a hyphen and digits with the word and number separated by a space.
+    
+    For example, transforms 'sub-23' into 'sub 23'.
     """
     # Regex to find a word (\w+) followed by a hyphen and a digit (\d+)
     pattern = re.compile(r"(\w+)-(\d+)")
@@ -544,9 +574,12 @@ def _normalize_word_hyphen_digit(text: str) -> str:
 
 def _normalize_units(text: str, full_lang: str) -> str:
     """
-    Helper function to normalize units attached to numbers.
-    This function handles symbolic and alphanumeric units separately
-    to avoid issues with word boundaries.
+    Expands and pronounces units attached to numbers in the text according to the specified language.
+    
+    This function detects numbers followed by unit symbols or abbreviations (e.g., "50kg", "100€"), converts the number to its spoken form, and replaces the unit with its full word equivalent based on language-specific mappings. Handles both symbolic (non-alphanumeric) and alphanumeric units, accounting for locale-specific decimal and thousands separators.
+    
+    Returns:
+        str: The text with numbers and units normalized to their spoken forms.
     """
     text = text.replace("º", "°")  # these characters look the same... but...
     lang_code = full_lang.split("-")[0]
@@ -567,6 +600,11 @@ def _normalize_units(text: str, full_lang: str) -> str:
             symbolic_pattern = re.compile(number_pattern_str + r"\s*(" + symbolic_pattern_str + r")", re.IGNORECASE)
 
             def replace_symbolic(match):
+                """
+                Replaces a matched symbolic unit expression with its spoken number and unit word equivalent.
+                
+                The function is intended for use as a regex replacement callback, converting patterns like "50%" or "1.5€" into their spoken forms (e.g., "fifty percent" or "one point five euros") according to the specified language. If pronunciation fails, returns the original matched string.
+                """
                 number = match.group(1)
                 # Remove thousands separator and replace decimal separator for parsing
                 if thousands_separator in number and decimal_separator in number:
@@ -592,6 +630,15 @@ def replace_symbolic(match):
                                               re.IGNORECASE)
 
             def replace_alphanumeric(match):
+                """
+                Replaces a matched alphanumeric unit expression with its spoken number and full unit name.
+                
+                Parameters:
+                	match: A regex match object containing a number and an alphanumeric unit symbol.
+                
+                Returns:
+                	A string with the number pronounced in the specified language followed by the expanded unit name.
+                """
                 number = match.group(1)
                 # Remove thousands separator and replace decimal separator for parsing
                 if thousands_separator in number and decimal_separator in number:
@@ -608,7 +655,9 @@ def replace_alphanumeric(match):
 
 def _normalize_word(word: str, full_lang: str, rbnf_engine) -> str:
     """
-    Helper function to normalize a single word.
+    Normalizes a single word by expanding contractions, titles, or pronouncing numbers and fractions.
+    
+    If the word matches a known contraction or title in the specified language, it is expanded to its full form. If the word represents a number or fraction, it is converted to its spoken equivalent. Returns the original word if no normalization applies.
     """
     lang_code = full_lang.split("-")[0]
 
@@ -627,7 +676,12 @@ def _normalize_word(word: str, full_lang: str, rbnf_engine) -> str:
 
 
 def is_fraction(word: str) -> bool:
-    """Checks if a word is a fraction like '3/3'."""
+    """
+    Determine if the input string represents a numeric fraction in the form 'n1/n2'.
+    
+    Returns:
+        bool: True if the string is a fraction with two integer components separated by '/', otherwise False.
+    """
     if "/" in word:
         parts = word.split("/")
         if len(parts) == 2:
@@ -638,8 +692,14 @@ def is_fraction(word: str) -> bool:
 
 def normalize(text: str, lang: str) -> str:
     """
-    Normalizes a text string by expanding contractions, titles, and pronouncing
-    numbers, units, and fractions.
+    Normalize a text string for spoken output by expanding contractions, titles, numbers, units, fractions, dates, and times according to the specified language.
+    
+    Parameters:
+        text (str): The input text to normalize.
+        lang (str): The language code (e.g., "en-US", "pt-PT") used for locale-specific normalization.
+    
+    Returns:
+        str: The normalized text with contractions expanded, numbers and units pronounced, and dates and times converted to spoken form.
     """
     full_lang = lang
     lang_code = full_lang.split("-")[0]

From 01456654ce79e8efdc9750b4a20152cb3a1f6aa7 Mon Sep 17 00:00:00 2001
From: miro <jarbasai@mailfence.com>
Date: Mon, 4 Aug 2025 13:10:29 +0100
Subject: [PATCH 3/6] move to .json files for easy localization

---
 .github/workflows/unit_tests.yml              |  45 ++
 .../locale/ca/titles.json                     |   6 +
 .../locale/de/titles.json                     |   4 +
 .../locale/de/units.json                      |  17 +
 .../locale/en/contractions.json               | 168 +++++++
 .../locale/en/titles.json                     |   5 +
 .../locale/en/units.json                      |  30 ++
 .../locale/es/titles.json                     |   8 +
 .../locale/es/units.json                      |  17 +
 .../locale/fr/titles.json                     |   8 +
 .../locale/fr/units.json                      |  17 +
 .../locale/gl/titles.json                     |   7 +
 .../locale/it/titles.json                     |   8 +
 .../locale/nl/titles.json                     |   8 +
 .../locale/pt/titles.json                     |  10 +
 .../locale/pt/units.json                      |  21 +
 ovos_dialog_normalizer_plugin/util.py         | 425 +++---------------
 requirements.txt                              |   4 +-
 setup.py                                      |  10 +
 tests/__init__.py                             |   1 +
 20 files changed, 459 insertions(+), 360 deletions(-)
 create mode 100644 .github/workflows/unit_tests.yml
 create mode 100644 ovos_dialog_normalizer_plugin/locale/ca/titles.json
 create mode 100644 ovos_dialog_normalizer_plugin/locale/de/titles.json
 create mode 100644 ovos_dialog_normalizer_plugin/locale/de/units.json
 create mode 100644 ovos_dialog_normalizer_plugin/locale/en/contractions.json
 create mode 100644 ovos_dialog_normalizer_plugin/locale/en/titles.json
 create mode 100644 ovos_dialog_normalizer_plugin/locale/en/units.json
 create mode 100644 ovos_dialog_normalizer_plugin/locale/es/titles.json
 create mode 100644 ovos_dialog_normalizer_plugin/locale/es/units.json
 create mode 100644 ovos_dialog_normalizer_plugin/locale/fr/titles.json
 create mode 100644 ovos_dialog_normalizer_plugin/locale/fr/units.json
 create mode 100644 ovos_dialog_normalizer_plugin/locale/gl/titles.json
 create mode 100644 ovos_dialog_normalizer_plugin/locale/it/titles.json
 create mode 100644 ovos_dialog_normalizer_plugin/locale/nl/titles.json
 create mode 100644 ovos_dialog_normalizer_plugin/locale/pt/titles.json
 create mode 100644 ovos_dialog_normalizer_plugin/locale/pt/units.json
 create mode 100644 tests/__init__.py

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
new file mode 100644
index 0000000..291030a
--- /dev/null
+++ b/.github/workflows/unit_tests.yml
@@ -0,0 +1,45 @@
+# This workflow will run unit tests
+
+name: Run Unit Tests
+on:
+  pull_request:
+    branches:
+      - dev
+    paths-ignore:
+      - 'ovos_dialog_normalizer_plugin/version.py'
+      - '.github/**'
+      - '.gitignore'
+      - 'LICENSE'
+      - 'CHANGELOG.md'
+      - 'MANIFEST.in'
+      - 'README.md'
+  workflow_dispatch:
+
+jobs:
+  unit_tests:
+    strategy:
+      max-parallel: 3
+      matrix:
+        python-version: [ "3.10", "3.11", "3.12" ]
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install System Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt install python3-dev
+          python -m pip install build wheel unicode-rbnf
+      - name: Install core repo
+        run: |
+          pip install -e .
+      - name: Install test dependencies
+        run: |
+          pip install pytest pytest-timeout pytest-cov
+      - name: Run unittests
+        run: |
+          pytest --cov=ovos_dialog_normalizer_plugin --cov-report xml tests
+
diff --git a/ovos_dialog_normalizer_plugin/locale/ca/titles.json b/ovos_dialog_normalizer_plugin/locale/ca/titles.json
new file mode 100644
index 0000000..6beb307
--- /dev/null
+++ b/ovos_dialog_normalizer_plugin/locale/ca/titles.json
@@ -0,0 +1,6 @@
+{
+  "Dr.": "Doctor",
+  "Sr.": "Senyor",
+  "Sra.": "Senyora",
+  "Prof.": "Professor"
+}
\ No newline at end of file
diff --git a/ovos_dialog_normalizer_plugin/locale/de/titles.json b/ovos_dialog_normalizer_plugin/locale/de/titles.json
new file mode 100644
index 0000000..770de9d
--- /dev/null
+++ b/ovos_dialog_normalizer_plugin/locale/de/titles.json
@@ -0,0 +1,4 @@
+{
+  "Dr.": "Doktor",
+  "Prof.": "Professor"
+}
\ No newline at end of file
diff --git a/ovos_dialog_normalizer_plugin/locale/de/units.json b/ovos_dialog_normalizer_plugin/locale/de/units.json
new file mode 100644
index 0000000..2234346
--- /dev/null
+++ b/ovos_dialog_normalizer_plugin/locale/de/units.json
@@ -0,0 +1,17 @@
+{
+  "€": "Euro",
+  "%": "Prozent",
+  "°C": "Grad Celsius",
+  "°F": "Grad Fahrenheit",
+  "°K": "Grad Kelvin",
+  "°": "Grad",
+  "$": "Dollar",
+  "£": "Pfund",
+  "km": "Kilometer",
+  "m": "Meter",
+  "cm": "Zentimeter",
+  "kg": "Kilogramm",
+  "g": "Gramm",
+  "L": "Liter",
+  "mL": "Milliliter"
+}
\ No newline at end of file
diff --git a/ovos_dialog_normalizer_plugin/locale/en/contractions.json b/ovos_dialog_normalizer_plugin/locale/en/contractions.json
new file mode 100644
index 0000000..376c354
--- /dev/null
+++ b/ovos_dialog_normalizer_plugin/locale/en/contractions.json
@@ -0,0 +1,168 @@
+{
+  "I'd": "I would",
+  "I'll": "I will",
+  "I'm": "I am",
+  "I've": "I have",
+  "ain't": "is not",
+  "aren't": "are not",
+  "can't": "can not",
+  "could've": "could have",
+  "couldn't": "could not",
+  "didn't": "did not",
+  "doesn't": "does not",
+  "don't": "do not",
+  "gonna": "going to",
+  "gotta": "got to",
+  "hadn't": "had not",
+  "hasn't": "has not",
+  "haven't": "have not",
+  "he'd": "he would",
+  "he'll": "he will",
+  "he's": "he is",
+  "how'd": "how did",
+  "how'll": "how will",
+  "how's": "how is",
+  "isn't": "is not",
+  "it'd": "it would",
+  "it'll": "it will",
+  "it's": "it is",
+  "might've": "might have",
+  "mightn't": "might not",
+  "must've": "must have",
+  "mustn't": "must not",
+  "needn't": "need not",
+  "oughtn't": "ought not",
+  "she'd": "she would",
+  "she'll": "she will",
+  "she's": "she is",
+  "should've": "should have",
+  "shouldn't": "should not",
+  "somebody's": "somebody is",
+  "someone'd": "someone would",
+  "someone'll": "someone will",
+  "someone's": "someone is",
+  "that'd": "that would",
+  "that'll": "that will",
+  "that's": "that is",
+  "there'd": "there would",
+  "there're": "there are",
+  "there's": "there is",
+  "they'd": "they would",
+  "they'll": "they will",
+  "they're": "they are",
+  "they've": "they have",
+  "wasn't": "was not",
+  "we'd": "we would",
+  "we'll": "we will",
+  "we're": "we are",
+  "we've": "we have",
+  "weren't": "were not",
+  "what'd": "what did",
+  "what'll": "what will",
+  "what're": "what are",
+  "what's": "what is",
+  "what've": "what have",
+  "whats": "what is",
+  "when'd": "when did",
+  "when's": "when is",
+  "where'd": "where did",
+  "where's": "where is",
+  "where've": "where have",
+  "who'd": "who would",
+  "who'd've": "who would have",
+  "who'll": "who will",
+  "who're": "who are",
+  "who's": "who is",
+  "who've": "who have",
+  "why'd": "why did",
+  "why're": "why are",
+  "why's": "why is",
+  "won't": "will not",
+  "won't've": "will not have",
+  "would've": "would have",
+  "wouldn't": "would not",
+  "wouldn't've": "would not have",
+  "y'ain't": "you are not",
+  "y'aint": "you are not",
+  "y'all": "you all",
+  "ya'll": "you all",
+  "you'd": "you would",
+  "you'd've": "you would have",
+  "you'll": "you will",
+  "you're": "you are",
+  "you've": "you have",
+  "I'm'a": "I am going to",
+  "I'm'o": "I am going to",
+  "I'll've": "I will have",
+  "I'd've": "I would have",
+  "Whatcha": "What are you",
+  "amn't": "am not",
+  "'cause": "because",
+  "can't've": "cannot have",
+  "couldn't've": "could not have",
+  "daren't": "dare not",
+  "daresn't": "dare not",
+  "dasn't": "dare not",
+  "everyone's": "everyone is",
+  "gimme": "give me",
+  "gon't": "go not",
+  "hadn't've": "had not have",
+  "he've": "he would have",
+  "he'll've": "he will have",
+  "he'd've": "he would have",
+  "here's": "here is",
+  "how're": "how are",
+  "how'd'y": "how do you do",
+  "howd'y": "how do you do",
+  "howdy": "how do you do",
+  "'tis": "it is",
+  "'twas": "it was",
+  "it'll've": "it will have",
+  "it'd've": "it would have",
+  "kinda": "kind of",
+  "let's": "let us",
+  "ma'am": "madam",
+  "may've": "may have",
+  "mayn't": "may not",
+  "mightn't've": "might not have",
+  "mustn't've": "must not have",
+  "needn't've": "need not have",
+  "ol'": "old",
+  "oughtn't've": "ought not have",
+  "sha'n't": "shall not",
+  "shan't": "shall not",
+  "shalln't": "shall not",
+  "shan't've": "shall not have",
+  "she'd've": "she would have",
+  "shouldn't've": "should not have",
+  "so've": "so have",
+  "so's": "so is",
+  "something's": "something is",
+  "that're": "that are",
+  "that'd've": "that would have",
+  "there'll": "there will",
+  "there'd've": "there would have",
+  "these're": "these are",
+  "they'll've": "they will have",
+  "they'd've": "they would have",
+  "this's": "this is",
+  "this'll": "this will",
+  "this'd": "this would",
+  "those're": "those are",
+  "to've": "to have",
+  "wanna": "want to",
+  "we'll've": "we will have",
+  "we'd've": "we would have",
+  "what'll've": "what will have",
+  "when've": "when have",
+  "where're": "where are",
+  "which's": "which is",
+  "who'll've": "who will have",
+  "why've": "why have",
+  "will've": "will have",
+  "y'all're": "you all are",
+  "y'all've": "you all have",
+  "y'all'd": "you all would",
+  "y'all'd've": "you all would have",
+  "you'll've": "you will have"
+}
\ No newline at end of file
diff --git a/ovos_dialog_normalizer_plugin/locale/en/titles.json b/ovos_dialog_normalizer_plugin/locale/en/titles.json
new file mode 100644
index 0000000..30666a9
--- /dev/null
+++ b/ovos_dialog_normalizer_plugin/locale/en/titles.json
@@ -0,0 +1,5 @@
+{
+  "Dr.": "Doctor",
+  "Mr.": "Mister",
+  "Prof.": "Professor"
+}
\ No newline at end of file
diff --git a/ovos_dialog_normalizer_plugin/locale/en/units.json b/ovos_dialog_normalizer_plugin/locale/en/units.json
new file mode 100644
index 0000000..924489e
--- /dev/null
+++ b/ovos_dialog_normalizer_plugin/locale/en/units.json
@@ -0,0 +1,30 @@
+{
+  "€": "euros",
+  "%": "per cent",
+  "°C": "degrees celsius",
+  "°F": "degrees fahrenheit",
+  "°K": "degrees kelvin",
+  "°": "degrees",
+  "$": "dollars",
+  "£": "pounds",
+  "km": "kilometers",
+  "m": "meters",
+  "cm": "centimeters",
+  "mm": "millimeters",
+  "ft": "feet",
+  "in": "inches",
+  "yd": "yards",
+  "mi": "miles",
+  "kg": "kilograms",
+  "g": "grams",
+  "lb": "pounds",
+  "oz": "ounces",
+  "L": "liters",
+  "mL": "milliliters",
+  "gal": "gallons",
+  "qt": "quarts",
+  "pt": "pints",
+  "hr": "hours",
+  "min": "minutes",
+  "s": "seconds"
+}
\ No newline at end of file
diff --git a/ovos_dialog_normalizer_plugin/locale/es/titles.json b/ovos_dialog_normalizer_plugin/locale/es/titles.json
new file mode 100644
index 0000000..f7ba7c9
--- /dev/null
+++ b/ovos_dialog_normalizer_plugin/locale/es/titles.json
@@ -0,0 +1,8 @@
+{
+  "Dr.": "Doctor",
+  "Sr.": "Señor",
+  "Sra.": "Señora",
+  "Prof.": "Profesor",
+  "D.": "Don",
+  "Dña.": "Doña"
+}
\ No newline at end of file
diff --git a/ovos_dialog_normalizer_plugin/locale/es/units.json b/ovos_dialog_normalizer_plugin/locale/es/units.json
new file mode 100644
index 0000000..3bafba3
--- /dev/null
+++ b/ovos_dialog_normalizer_plugin/locale/es/units.json
@@ -0,0 +1,17 @@
+{
+  "€": "euros",
+  "%": "por ciento",
+  "°C": "grados celsius",
+  "°F": "grados fahrenheit",
+  "°K": "grados kelvin",
+  "°": "grados",
+  "$": "dólares",
+  "£": "libras",
+  "km": "kilómetros",
+  "m": "metros",
+  "cm": "centímetros",
+  "kg": "kilogramos",
+  "g": "gramos",
+  "L": "litros",
+  "mL": "mililitros"
+}
\ No newline at end of file
diff --git a/ovos_dialog_normalizer_plugin/locale/fr/titles.json b/ovos_dialog_normalizer_plugin/locale/fr/titles.json
new file mode 100644
index 0000000..3160db5
--- /dev/null
+++ b/ovos_dialog_normalizer_plugin/locale/fr/titles.json
@@ -0,0 +1,8 @@
+{
+  "Dr.": "Docteur",
+  "M.": "Monsieur",
+  "Mme": "Madame",
+  "Mlle": "Mademoiselle",
+  "Prof.": "Professeur",
+  "Pr.": "Professeur"
+}
\ No newline at end of file
diff --git a/ovos_dialog_normalizer_plugin/locale/fr/units.json b/ovos_dialog_normalizer_plugin/locale/fr/units.json
new file mode 100644
index 0000000..b17482e
--- /dev/null
+++ b/ovos_dialog_normalizer_plugin/locale/fr/units.json
@@ -0,0 +1,17 @@
+{
+  "€": "euros",
+  "%": "pour cent",
+  "°C": "degrés celsius",
+  "°F": "degrés fahrenheit",
+  "°K": "degrés kelvin",
+  "°": "degrés",
+  "$": "dollars",
+  "£": "livres",
+  "km": "kilomètres",
+  "m": "mètres",
+  "cm": "centimètres",
+  "kg": "kilogrammes",
+  "g": "grammes",
+  "L": "litres",
+  "mL": "millilitres"
+}
\ No newline at end of file
diff --git a/ovos_dialog_normalizer_plugin/locale/gl/titles.json b/ovos_dialog_normalizer_plugin/locale/gl/titles.json
new file mode 100644
index 0000000..b5ac190
--- /dev/null
+++ b/ovos_dialog_normalizer_plugin/locale/gl/titles.json
@@ -0,0 +1,7 @@
+{
+  "Dr.": "Doutor",
+  "Sr.": "Señor",
+  "Sra.": "Señora",
+  "Prof.": "Profesor",
+  "Srta.": "Señorita"
+}
\ No newline at end of file
diff --git a/ovos_dialog_normalizer_plugin/locale/it/titles.json b/ovos_dialog_normalizer_plugin/locale/it/titles.json
new file mode 100644
index 0000000..263f9b0
--- /dev/null
+++ b/ovos_dialog_normalizer_plugin/locale/it/titles.json
@@ -0,0 +1,8 @@
+{
+  "Dr.": "Dottore",
+  "Sig.": "Signore",
+  "Sig.ra": "Signora",
+  "Prof.": "Professore",
+  "Dott.ssa": "Dottoressa",
+  "Sig.na": "Signorina"
+}
\ No newline at end of file
diff --git a/ovos_dialog_normalizer_plugin/locale/nl/titles.json b/ovos_dialog_normalizer_plugin/locale/nl/titles.json
new file mode 100644
index 0000000..dc10e29
--- /dev/null
+++ b/ovos_dialog_normalizer_plugin/locale/nl/titles.json
@@ -0,0 +1,8 @@
+{
+  "Dr.": "Dokter",
+  "Dhr.": "De Heer",
+  "Mevr.": "Mevrouw",
+  "Prof.": "Professor",
+  "Drs.": "Doctorandus",
+  "Ing.": "Ingenieur"
+}
\ No newline at end of file
diff --git a/ovos_dialog_normalizer_plugin/locale/pt/titles.json b/ovos_dialog_normalizer_plugin/locale/pt/titles.json
new file mode 100644
index 0000000..38b76a3
--- /dev/null
+++ b/ovos_dialog_normalizer_plugin/locale/pt/titles.json
@@ -0,0 +1,10 @@
+{
+  "Dr.": "Doutor",
+  "Sr.": "Senhor",
+  "Sra.": "Senhora",
+  "Prof.": "Professor",
+  "Drª.": "Doutora",
+  "Eng.": "Engenheiro",
+  "D.": "Dom",
+  "Dª": "Dona"
+}
\ No newline at end of file
diff --git a/ovos_dialog_normalizer_plugin/locale/pt/units.json b/ovos_dialog_normalizer_plugin/locale/pt/units.json
new file mode 100644
index 0000000..4424c79
--- /dev/null
+++ b/ovos_dialog_normalizer_plugin/locale/pt/units.json
@@ -0,0 +1,21 @@
+{
+  "€": "euros",
+  "%": "por cento",
+  "°C": "graus celsius",
+  "°F": "graus fahrenheit",
+  "°K": "graus kelvin",
+  "°": "graus",
+  "$": "dólares",
+  "£": "libras",
+  "km": "quilômetros",
+  "m": "metros",
+  "cm": "centímetros",
+  "mm": "milímetros",
+  "kg": "quilogramas",
+  "g": "gramas",
+  "L": "litros",
+  "mL": "mililitros",
+  "h": "horas",
+  "min": "minutos",
+  "s": "segundos"
+}
\ No newline at end of file
diff --git a/ovos_dialog_normalizer_plugin/util.py b/ovos_dialog_normalizer_plugin/util.py
index f7bb3a7..195dfa8 100644
--- a/ovos_dialog_normalizer_plugin/util.py
+++ b/ovos_dialog_normalizer_plugin/util.py
@@ -1,5 +1,6 @@
 import datetime
-import logging
+import json
+import os
 import re
 import string
 from datetime import date
@@ -7,356 +8,59 @@
 from ovos_date_parser import nice_time, nice_date
 from ovos_number_parser import pronounce_number, pronounce_fraction
 from ovos_number_parser.util import is_numeric
+from ovos_utils.log import LOG
 from unicode_rbnf import RbnfEngine, FormatPurpose
 
-LOG = logging.getLogger("normalize")
-
-# A dictionary of common contractions and their expanded forms.
-# This list is very comprehensive for English.
-CONTRACTIONS = {
-    "en": {
-        "I'd": "I would",
-        "I'll": "I will",
-        "I'm": "I am",
-        "I've": "I have",
-        "ain't": "is not",
-        "aren't": "are not",
-        "can't": "can not",
-        "could've": "could have",
-        "couldn't": "could not",
-        "didn't": "did not",
-        "doesn't": "does not",
-        "don't": "do not",
-        "gonna": "going to",
-        "gotta": "got to",
-        "hadn't": "had not",
-        "hasn't": "has not",
-        "haven't": "have not",
-        "he'd": "he would",
-        "he'll": "he will",
-        "he's": "he is",
-        "how'd": "how did",
-        "how'll": "how will",
-        "how's": "how is",
-        "isn't": "is not",
-        "it'd": "it would",
-        "it'll": "it will",
-        "it's": "it is",
-        "might've": "might have",
-        "mightn't": "might not",
-        "must've": "must have",
-        "mustn't": "must not",
-        "needn't": "need not",
-        "oughtn't": "ought not",
-        "shan't": "shall not",
-        "she'd": "she would",
-        "she'll": "she will",
-        "she's": "she is",
-        "should've": "should have",
-        "shouldn't": "should not",
-        "somebody's": "somebody is",
-        "someone'd": "someone would",
-        "someone'll": "someone will",
-        "someone's": "someone is",
-        "that'd": "that would",
-        "that'll": "that will",
-        "that's": "that is",
-        "there'd": "there would",
-        "there're": "there are",
-        "there's": "there is",
-        "they'd": "they would",
-        "they'll": "they will",
-        "they're": "they are",
-        "they've": "they have",
-        "wasn't": "was not",
-        "we'd": "we would",
-        "we'll": "we will",
-        "we're": "we are",
-        "we've": "we have",
-        "weren't": "were not",
-        "what'd": "what did",
-        "what'll": "what will",
-        "what're": "what are",
-        "what's": "what is",
-        "what've": "what have",
-        "whats": "what is",
-        "when'd": "when did",
-        "when's": "when is",
-        "where'd": "where did",
-        "where's": "where is",
-        "where've": "where have",
-        "who'd": "who would",
-        "who'd've": "who would have",
-        "who'll": "who will",
-        "who're": "who are",
-        "who's": "who is",
-        "who've": "who have",
-        "why'd": "why did",
-        "why're": "why are",
-        "why's": "why is",
-        "won't": "will not",
-        "won't've": "will not have",
-        "would've": "would have",
-        "wouldn't": "would not",
-        "wouldn't've": "would not have",
-        "y'ain't": "you are not",
-        "y'aint": "you are not",
-        "y'all": "you all",
-        "ya'll": "you all",
-        "you'd": "you would",
-        "you'd've": "you would have",
-        "you'll": "you will",
-        "you're": "you are",
-        "you've": "you have",
-        "I'm'a": "I am going to",
-        "I'm'o": "I am going to",
-        "I'll've": "I will have",
-        "I'd've": "I would have",
-        "Whatcha": "What are you",
-        "amn't": "am not",
-        "'cause": "because",
-        "can't've": "cannot have",
-        "couldn't've": "could not have",
-        "daren't": "dare not",
-        "daresn't": "dare not",
-        "dasn't": "dare not",
-        "everyone's": "everyone is",
-        "gimme": "give me",
-        "gon't": "go not",
-        "hadn't've": "had not have",
-        "he've": "he would have",
-        "he'll've": "he will have",
-        "he'd've": "he would have",
-        "here's": "here is",
-        "how're": "how are",
-        "how'd'y": "how do you do",
-        "howd'y": "how do you do",
-        "howdy": "how do you do",
-        "'tis": "it is",
-        "'twas": "it was",
-        "it'll've": "it will have",
-        "it'd've": "it would have",
-        "kinda": "kind of",
-        "let's": "let us",
-        "ma'am": "madam",
-        "may've": "may have",
-        "mayn't": "may not",
-        "mightn't've": "might not have",
-        "mustn't've": "must not have",
-        "needn't've": "need not have",
-        "ol'": "old",
-        "oughtn't've": "ought not have",
-        "sha'n't": "shall not",
-        "shan't": "shall not",
-        "shalln't": "shall not",
-        "shan't've": "shall not have",
-        "she'd've": "she would have",
-        "shouldn't've": "should not have",
-        "so've": "so have",
-        "so's": "so is",
-        "something's": "something is",
-        "that're": "that are",
-        "that'd've": "that would have",
-        "there'll": "there will",
-        "there'd've": "there would have",
-        "these're": "these are",
-        "they'll've": "they will have",
-        "they'd've": "they would have",
-        "this's": "this is",
-        "this'll": "this will",
-        "this'd": "this would",
-        "those're": "those are",
-        "to've": "to have",
-        "wanna": "want to",
-        "we'll've": "we will have",
-        "we'd've": "we would have",
-        "what'll've": "what will have",
-        "when've": "when have",
-        "where're": "where are",
-        "which's": "which is",
-        "who'll've": "who will have",
-        "why've": "why have",
-        "will've": "will have",
-        "y'all're": "you all are",
-        "y'all've": "you all have",
-        "y'all'd": "you all would",
-        "y'all'd've": "you all would have",
-        "you'll've": "you will have"
-    }
-}
-
-# Dictionaries for titles, units, and their full word equivalents.
-TITLES = {
-    "en": {
-        "Dr.": "Doctor",
-        "Mr.": "Mister",
-        "Prof.": "Professor"
-    },
-    "ca": {
-        "Dr.": "Doctor",
-        "Sr.": "Senyor",
-        "Sra.": "Senyora",
-        "Prof.": "Professor"
-    },
-    "es": {
-        "Dr.": "Doctor",
-        "Sr.": "Señor",
-        "Sra.": "Señora",
-        "Prof.": "Profesor",
-        "D.": "Don",
-        "Dña.": "Doña"
-    },
-    "pt": {
-        "Dr.": "Doutor",
-        "Sr.": "Senhor",
-        "Sra.": "Senhora",
-        "Prof.": "Professor",
-        "Drª.": "Doutora",
-        "Eng.": "Engenheiro",
-        "D.": "Dom",
-        "Dª": "Dona"
-    },
-    "gl": {
-        "Dr.": "Doutor",
-        "Sr.": "Señor",
-        "Sra.": "Señora",
-        "Prof.": "Profesor",
-        "Srta.": "Señorita"
-    },
-    "fr": {
-        "Dr.": "Docteur",
-        "M.": "Monsieur",
-        "Mme": "Madame",
-        "Mlle": "Mademoiselle",
-        "Prof.": "Professeur",
-        "Pr.": "Professeur"
-    },
-    "it": {
-        "Dr.": "Dottore",
-        "Sig.": "Signore",
-        "Sig.ra": "Signora",
-        "Prof.": "Professore",
-        "Dott.ssa": "Dottoressa",
-        "Sig.na": "Signorina"
-    },
-    "nl": {
-        "Dr.": "Dokter",
-        "Dhr.": "De Heer",
-        "Mevr.": "Mevrouw",
-        "Prof.": "Professor",
-        "Drs.": "Dokterandus",
-        "Ing.": "Ingenieur"
-    },
-    "de": {
-        "Dr.": "Doktor",
-        "Prof.": "Professor"
-    }
-}
-
-UNITS = {
-    "en": {
-        "€": "euros",
-        "%": "per cent",
-        "°C": "degrees celsius",
-        "°F": "degrees fahrenheit",
-        "°K": "degrees kelvin",
-        "°": "degrees",
-        "$": "dollars",
-        "£": "pounds",
-        "km": "kilometers",
-        "m": "meters",
-        "cm": "centimeters",
-        "mm": "millimeters",
-        "ft": "feet",
-        "in": "inches",
-        "yd": "yards",
-        "mi": "miles",
-        "kg": "kilograms",
-        "g": "grams",
-        "lb": "pounds",
-        "oz": "ounces",
-        "L": "liters",
-        "mL": "milliliters",
-        "gal": "gallons",
-        "qt": "quarts",
-        "pt": "pints",
-        "hr": "hours",
-        "min": "minutes",
-        "s": "seconds"
-    },
-    "pt": {
-        "€": "euros",
-        "%": "por cento",
-        "°C": "graus celsius",
-        "°F": "graus fahrenheit",
-        "°K": "graus kelvin",
-        "°": "graus",
-        "$": "dólares",
-        "£": "libras",
-        "km": "quilômetros",
-        "m": "metros",
-        "cm": "centímetros",
-        "mm": "milímetros",
-        "kg": "quilogramas",
-        "g": "gramas",
-        "L": "litros",
-        "mL": "mililitros",
-        "h": "horas",
-        "min": "minutos",
-        "s": "segundos"
-    },
-    "es": {
-        "€": "euros",
-        "%": "por ciento",
-        "°C": "grados celsius",
-        "°F": "grados fahrenheit",
-        "°K": "grados kelvin",
-        "°": "grados",
-        "$": "dólares",
-        "£": "libras",
-        "km": "kilómetros",
-        "m": "metros",
-        "cm": "centímetros",
-        "kg": "kilogramos",
-        "g": "gramos",
-        "L": "litros",
-        "mL": "millilitros"
-    },
-    "fr": {
-        "€": "euros",
-        "%": "pour cent",
-        "°C": "degrés celsius",
-        "°F": "degrés fahrenheit",
-        "°K": "degrés kelvin",
-        "°": "degrés",
-        "$": "dollars",
-        "£": "livres",
-        "km": "kilomètres",
-        "m": "mètres",
-        "cm": "centimètres",
-        "kg": "kilogrammes",
-        "g": "grammes",
-        "L": "litres",
-        "mL": "millilitres"
-    },
-    "de": {
-        "€": "Euro",
-        "%": "Prozent",
-        "°C": "Grad Celsius",
-        "°F": "Grad Fahrenheit",
-        "°K": "Grad Kelvin",
-        "°": "Grad",
-        "$": "Dollar",
-        "£": "Pfund",
-        "km": "Kilometer",
-        "m": "Meter",
-        "cm": "Zentimeter",
-        "kg": "Kilogramm",
-        "g": "Gramm",
-        "L": "Liter",
-        "mL": "Milliliter"
-    }
-}
+RESOURCES_DIR = os.path.join(os.path.dirname(__file__), "locale")
+
+
+# --- Locale Data Management Class ---
+class LocaleDataManager:
+    """
+    Lazy-loading, caching accessor for per-language JSON resources.
+    Files are read from RESOURCES_DIR/<lang_code>/<file_name>.json on first
+    request; a missing or undecodable file is cached as an empty dict.
+    """
+
+    def __init__(self):
+        """Create the manager with an empty cache; no files are read here."""
+        self.cache = {}  # lang_code -> {file_name -> parsed JSON content}
+
+    def _load_data(self, lang_code: str, file_name: str) -> dict:
+        """Read RESOURCES_DIR/<lang_code>/<file_name>.json, cache and return it ({} if missing or invalid JSON)."""
+        file_path = os.path.join(RESOURCES_DIR, lang_code, f"{file_name}.json")
+        try:
+            with open(file_path, "r", encoding="utf-8") as f:
+                data = json.load(f)
+                self.cache.setdefault(lang_code, {})[file_name] = data
+                return data
+        except FileNotFoundError:
+            LOG.debug(f"Locale file not found: {file_path}. Using empty dictionary.")
+            self.cache.setdefault(lang_code, {})[file_name] = {}  # cache the miss so the file is not probed again
+            return {}
+        except json.JSONDecodeError as e:
+            LOG.error(f"Error decoding JSON from {file_path}: {e}")
+            self.cache.setdefault(lang_code, {})[file_name] = {}  # cache the failure so the file is not re-parsed
+            return {}
+
+    def get_data(self, lang_code: str, file_name: str) -> dict:
+        """Return the cached content for (lang_code, file_name), loading it from disk on the first request."""
+        if lang_code in self.cache and file_name in self.cache[lang_code]:
+            return self.cache[lang_code][file_name]
+        return self._load_data(lang_code, file_name)
+
+    def get_contractions(self, lang_code: str) -> dict:  # content of <lang_code>/contractions.json, or {}
+        return self.get_data(lang_code, "contractions")
+
+    def get_units(self, lang_code: str) -> dict:  # content of <lang_code>/units.json, or {}
+        return self.get_data(lang_code, "units")
+
+    def get_titles(self, lang_code: str) -> dict:  # content of <lang_code>/titles.json, or {}
+        return self.get_data(lang_code, "titles")
+
+
+# Instantiate the manager to be used by the normalization functions
+locale_data_manager = LocaleDataManager()
 
 
 def _get_number_separators(full_lang: str) -> tuple[str, str]:
@@ -364,14 +68,15 @@ def _get_number_separators(full_lang: str) -> tuple[str, str]:
     Return the decimal and thousands separators appropriate for the specified language.
     
     Parameters:
-    	full_lang (str): The full language code (e.g., "en-US", "pt-BR").
+        full_lang (str): The full language code (e.g., "en-US", "pt-BR").
     
     Returns:
-    	tuple[str, str]: A tuple containing the decimal separator and thousands separator for the language.
+        tuple[str, str]: A tuple containing the decimal separator and thousands separator for the language.
     """
     lang_code = full_lang.split("-")[0]
     decimal_separator = '.'
     thousands_separator = ','
+    # TODO This logic can also be moved to a JSON file
     if lang_code in ["pt", "es", "fr", "de"]:
         decimal_separator = ','
         thousands_separator = '.'
@@ -583,13 +288,15 @@ def _normalize_units(text: str, full_lang: str) -> str:
     """
     text = text.replace("º", "°")  # these characters look the same... but...
     lang_code = full_lang.split("-")[0]
-    if lang_code in UNITS:
+    units_data = locale_data_manager.get_units(lang_code)
+
+    if units_data:
         # Determine number separators for the language
         decimal_separator, thousands_separator = _get_number_separators(full_lang)
 
         # Separate units into symbolic and alphanumeric
-        symbolic_units = {k: v for k, v in UNITS[lang_code].items() if not k.isalnum()}
-        alphanumeric_units = {k: v for k, v in UNITS[lang_code].items() if k.isalnum()}
+        symbolic_units = {k: v for k, v in units_data.items() if not k.isalnum()}
+        alphanumeric_units = {k: v for k, v in units_data.items() if k.isalnum()}
 
         # Create regex pattern for symbolic units and replace them first
         sorted_symbolic = sorted(symbolic_units.keys(), key=len, reverse=True)
@@ -660,12 +367,14 @@ def _normalize_word(word: str, full_lang: str, rbnf_engine) -> str:
     If the word matches a known contraction or title in the specified language, it is expanded to its full form. If the word represents a number or fraction, it is converted to its spoken equivalent. Returns the original word if no normalization applies.
     """
     lang_code = full_lang.split("-")[0]
+    contractions = locale_data_manager.get_contractions(lang_code)
+    titles = locale_data_manager.get_titles(lang_code)
 
-    if word in CONTRACTIONS.get(lang_code, {}):
-        return CONTRACTIONS[lang_code][word]
+    if word in contractions:
+        return contractions[word]
 
-    if word in TITLES.get(lang_code, {}):
-        return TITLES[lang_code][word]
+    if word in titles:
+        return titles[word]
 
     # Delegate number parsing to the new helper function
     normalized_number = _normalize_number_word(word, full_lang, rbnf_engine)
diff --git a/requirements.txt b/requirements.txt
index 0bb67ef..7689f93 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 ovos-plugin-manager
 langcodes
-ovos-number-parser>=0.4.0
-ovos-date-parser>=0.6.4a1
+ovos-number-parser>=0.4.0,<1.0.0
+ovos-date-parser>=0.6.4a1,<1.0.0
 unicode_rbnf
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 04d5728..ff32270 100755
--- a/setup.py
+++ b/setup.py
@@ -48,6 +48,14 @@ def get_version():
 
 ENTRY_POINT = 'ovos-dialog-normalizer-plugin=ovos_dialog_normalizer_plugin:DialogNormalizerTransformer'
 
+def package_files(directory):  # list every file found under `directory`, recursively
+    paths = []
+    for (path, directories, filenames) in os.walk(directory):
+        for filename in filenames:
+            paths.append(os.path.join('..', path, filename))  # NOTE(review): '..' prefix presumably makes the entry resolve relative to the package dir used by package_data - confirm
+    return paths  # used below as the package_data file list
+
+
 
 setup(
     name='ovos-dialog-normalizer-plugin',
@@ -58,6 +66,8 @@ def get_version():
     author_email='jarbasai@mailfence.com',
     license='MIT',
     packages=['ovos_dialog_normalizer_plugin'],
+    include_package_data=True,
+    package_data={'': package_files('ovos_dialog_normalizer_plugin')},
     zip_safe=True,
     keywords='ovos plugin utterance dialog TTS normalization',
     entry_points={
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..aa02126
--- /dev/null
+++ b/tests/__init__.py
@@ -0,0 +1 @@
+"""Test package for ovos_dialog_normalizer_plugin."""
\ No newline at end of file

From 98a541b2edf41cd326c2f1e29aa997ed25697b41 Mon Sep 17 00:00:00 2001
From: JarbasAI <33701864+JarbasAl@users.noreply.github.com>
Date: Mon, 4 Aug 2025 13:11:33 +0100
Subject: [PATCH 4/6] Update ovos_dialog_normalizer_plugin/util.py

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
---
 ovos_dialog_normalizer_plugin/util.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/ovos_dialog_normalizer_plugin/util.py b/ovos_dialog_normalizer_plugin/util.py
index 195dfa8..9304cc4 100644
--- a/ovos_dialog_normalizer_plugin/util.py
+++ b/ovos_dialog_normalizer_plugin/util.py
@@ -341,10 +341,10 @@ def replace_alphanumeric(match):
                 Replaces a matched alphanumeric unit expression with its spoken number and full unit name.
                 
                 Parameters:
-                	match: A regex match object containing a number and an alphanumeric unit symbol.
+                    match: A regex match object containing a number and an alphanumeric unit symbol.
                 
                 Returns:
-                	A string with the number pronounced in the specified language followed by the expanded unit name.
+                    A string with the number pronounced in the specified language followed by the expanded unit name.
                 """
                 number = match.group(1)
                 # Remove thousands separator and replace decimal separator for parsing
@@ -354,8 +354,11 @@ def replace_alphanumeric(match):
                     number = number.replace(decimal_separator, ".")
                 unit_symbol = match.group(2)
                 unit_word = alphanumeric_units[unit_symbol]
-                return f"{pronounce_number(float(number) if '.' in number else int(number), full_lang)} {unit_word}"
-
+                try:
+                    return f"{pronounce_number(float(number) if '.' in number else int(number), full_lang)} {unit_word}"
+                except Exception as e:
+                    LOG.error(f"Failed to pronounce number with unit: {number}{unit_symbol} - ({e})")
+                    return match.group(0)
             text = alphanumeric_pattern.sub(replace_alphanumeric, text)
     return text
 

From 2c7af67abf6c623bae6c68df1f5b3ed8d21ef75b Mon Sep 17 00:00:00 2001
From: miro <jarbasai@mailfence.com>
Date: Mon, 4 Aug 2025 13:17:38 +0100
Subject: [PATCH 5/6] update README.md

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 55c2d85..f2729b2 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # OVOS Dialog Normalizer
 
-a dialog transformer plugins for OVOS
+a dialog transformer plugin for OpenVoiceOS
 
 ## Description
 
@@ -29,7 +29,9 @@ All you need to do is add a entry in your `mycroft.conf` under `"dialog_transfor
 
 Pull Requests welcome! 
 
-Adding new expansions should be straightforward, to improve number handling please refer to [ovos-number-parser](https://github.com/OpenVoiceOS/ovos-number-parser)
+- to support new languages translate the `.json` files in the `locale` folder
+- to improve number handling please refer to [ovos-number-parser](https://github.com/OpenVoiceOS/ovos-number-parser)
+- to improve date/time handling please refer to [ovos-date-parser](https://github.com/OpenVoiceOS/ovos-date-parser)
 
 ## Credits
 

From b29eeb95b269df14cd620dbda09d53857e0882de Mon Sep 17 00:00:00 2001
From: miro <jarbasai@mailfence.com>
Date: Mon, 4 Aug 2025 15:14:54 +0100
Subject: [PATCH 6/6] error handling

---
 ovos_dialog_normalizer_plugin/util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ovos_dialog_normalizer_plugin/util.py b/ovos_dialog_normalizer_plugin/util.py
index 9304cc4..1bf423d 100644
--- a/ovos_dialog_normalizer_plugin/util.py
+++ b/ovos_dialog_normalizer_plugin/util.py
@@ -353,8 +353,8 @@ def replace_alphanumeric(match):
                 elif decimal_separator != "." and decimal_separator in number:
                     number = number.replace(decimal_separator, ".")
                 unit_symbol = match.group(2)
-                unit_word = alphanumeric_units[unit_symbol]
                 try:
+                    unit_word = alphanumeric_units[unit_symbol]
                     return f"{pronounce_number(float(number) if '.' in number else int(number), full_lang)} {unit_word}"
                 except Exception as e:
                     LOG.error(f"Failed to pronounce number with unit: {number}{unit_symbol} - ({e})")