diff --git a/zeeguu/api/test/test_verbal_flashcards.py b/zeeguu/api/test/test_verbal_flashcards.py
index 17623c10..022577f6 100644
--- a/zeeguu/api/test/test_verbal_flashcards.py
+++ b/zeeguu/api/test/test_verbal_flashcards.py
@@ -304,16 +304,17 @@ def test_score_word_match_accepts_common_danish_asr_variants():
 
 
 @pytest.mark.parametrize(
-    "user_word, expected_word",
+    "user_word, expected_word, expected_allowed_distance",
     [
-        ("hat", "kat"),
-        ("hond", "hund"),
-        ("pange", "penge"),
+        ("hat", "kat", 1),
+        ("hond", "hund", 1),
+        ("pange", "penge", 2),
     ],
 )
-def test_score_word_match_accepts_one_optimal_string_alignment_edit(
+def test_score_word_match_accepts_words_within_length_based_edit_budget(
     user_word,
     expected_word,
+    expected_allowed_distance,
 ):
     from zeeguu.core.verbal_flashcards.fuzzy_match import score_word_match
 
@@ -322,21 +323,22 @@ def test_score_word_match_accepts_one_optimal_string_alignment_edit(
     assert result["isMatch"] is True
     assert result["matchType"] == "fuzzy"
     assert result["optimalStringAlignmentDistance"] == 1
-    assert result["allowedOptimalStringAlignmentDistance"] == 1
+    assert result["allowedOptimalStringAlignmentDistance"] == expected_allowed_distance
     assert result["jaroWinkler"] > 0
 
 
 @pytest.mark.parametrize(
-    "user_word, expected_word",
+    "user_word, expected_word, expected_allowed_distance",
     [
-        ("hot", "kat"),
-        ("hd", "hund"),
-        ("pen", "penge"),
+        ("hot", "kat", 1),
+        ("zzzz", "hund", 1),
+        ("xxxxx", "penge", 2),
     ],
 )
-def test_score_word_match_rejects_multiple_optimal_string_alignment_edits(
+def test_score_word_match_rejects_words_outside_length_based_edit_budget(
     user_word,
     expected_word,
+    expected_allowed_distance,
 ):
     from zeeguu.core.verbal_flashcards.fuzzy_match import score_word_match
 
@@ -344,11 +346,14 @@ def test_score_word_match_rejects_multiple_optimal_string_alignment_edits(
 
     assert result["isMatch"] is False
     assert result["matchType"] == "close"
-    assert result["optimalStringAlignmentDistance"] > 1
-    assert result["allowedOptimalStringAlignmentDistance"] == 1
+    assert (
+        result["optimalStringAlignmentDistance"]
+        > result["allowedOptimalStringAlignmentDistance"]
+    )
+    assert result["allowedOptimalStringAlignmentDistance"] == expected_allowed_distance
 
 
-def test_score_word_match_requires_exact_match_for_two_letter_words():
+def test_score_word_match_allows_one_edit_for_two_letter_words():
     from zeeguu.core.verbal_flashcards.fuzzy_match import score_word_match
 
     result = score_word_match("og", "ok", language_code="da")
diff --git a/zeeguu/core/verbal_flashcards/fuzzy_match.py b/zeeguu/core/verbal_flashcards/fuzzy_match.py
index 75949cc3..d4cc1630 100644
--- a/zeeguu/core/verbal_flashcards/fuzzy_match.py
+++ b/zeeguu/core/verbal_flashcards/fuzzy_match.py
@@ -152,14 +152,22 @@ def allowed_optimal_string_alignment_distance(expected_word, language_code=None)
     Return the maximum edit distance accepted for a spoken flashcard answer.
 
     Acceptance is based on edit distance, not a blended similarity score:
-    after language-specific normalization, words of length >= 3 may differ by
-    one optimal string alignment edit. Jaro-Winkler is still returned as a
-    diagnostic signal for debugging and future analysis, but it does not decide
-    correctness.
+    after language-specific normalization, longer words get a larger edit
+    budget because ASR approximations often drift more on longer Danish words.
+    Jaro-Winkler is still returned as a diagnostic signal for debugging and
+    future analysis, but it does not decide correctness.
     """
     normalizer = normalizer_for(language_code)
     normalized_length = len(normalizer.canonical_form(expected_word))
-    return 0 if normalized_length <= 2 else 1
+    if normalized_length <= 2:
+        return 0
+    if normalized_length <= 4:
+        return 1
+    if normalized_length <= 6:
+        return 2
+    if normalized_length <= 9:
+        return 3
+    return 4