Give Wtp linktrailing_re attribute

kristian-clausal · kristian-clausal · commit db2859ecdca5 · 2026-03-09T11:46:03.000+02:00
See wiktextract issue #1604 (tatuylonen/wiktextract#1604) and the description of blended links at https://en.wikipedia.org/wiki/Help:Wikitext#Blend_link for background. This adds a new attribute to Wtp that contains a `re.Pattern` object used for pattern-matching these kinds of suffixed links. Modify `Wtp.linktrailing_re` to change the behavior based on how the parsed Wikimedia project handles linktrailing. English uses `[a-z]+`. Our default implementation uses `\w+`, which should be fine most of the time. Languages without spaces seem to use the English `[a-z]+`, which seems to make sense: with `[a-z]+`, `[[englishword]]KANJI` does not have its kanji characters consumed into the link, whereas `\w+` would consume them and break this.
diff --git a/src/wikitextprocessor/core.py b/src/wikitextprocessor/core.py
@@ -282,6 +282,7 @@ class Wtp:
         "notes",  # NOTE error messages
         "wiki_notices",  # WIKI error messages
         "wikidata_session",
+        "linktrailing_re",
     )
 
     def __init__(
@@ -355,6 +356,14 @@ def __init__(
         if not quiet:
             logger.setLevel(logging.DEBUG)
         self.wikidata_session: Session | None = None
+        # Default regex pattern, will sometimes cause trouble.
+        # Linktrailing is when you have [[a li]]nk that consumes the
+        # trailing suffix so that the whole word is blue. Languages
+        # without spaces, like Japanese, should use the English
+        # [a-z] pattern, other languages their own if `\w+` actually
+        # causes problems in them.
+        # Will be modified later in wiktextract wxr through WiktionaryConfig.
+        self.linktrailing_re = re.compile(r"(?s)(\w+)(.*)")
 
     def create_db(self) -> None:
         from .wikidata import init_wikidata_cache
diff --git a/src/wikitextprocessor/parser.py b/src/wikitextprocessor/parser.py
@@ -1029,7 +1029,7 @@ def text_fn(ctx: "Wtp", token: str) -> None:
         and not node.children[-1].children
         and not ctx.suppress_special
     ):
-        m = re.match(r"(?s)([a-z]+)(.*)", token)
+        m = ctx.linktrailing_re.match(token)
         if m:
             node.children[-1].children.append(m.group(1))
             token = m.group(2)
diff --git a/tests/test_parser.py b/tests/test_parser.py
@@ -2,6 +2,7 @@
 #
 # Copyright (c) 2020-2022 Tatu Ylonen.  See file LICENSE and https://ylonen.org
 
+import re
 import unittest
 
 from wikitextprocessor import Wtp
@@ -1111,7 +1112,12 @@ def test_link_trailing_1(self):
         self.assertEqual(b, " heal")
 
     def test_link_trailing_not_latin(self):
+        _linktrailing_re = self.ctx.linktrailing_re
+        # Normally this alternative pattern would be provided by Wiktextract's
+        # WiktionaryConfig or something similar.
+        self.ctx.linktrailing_re = re.compile(r"(?s)([a-z]+)(.*)")
         tree = self.parse("test", "[[appellāre]]の直説法所相現在第 foo")
+        self.ctx.linktrailing_re = _linktrailing_re
         self.assertEqual(len(tree.children), 2)
         a, b = tree.children
         self.assertEqual(a.kind, NodeKind.LINK)