From 941751550c4839485df44939113fefac103df1f3 Mon Sep 17 00:00:00 2001
From: daxida
Date: Fri, 6 Mar 2026 11:32:29 +0100
Subject: [PATCH] Localize linktrail regex following wikimedia

---
 src/wikitextprocessor/parser.py | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/src/wikitextprocessor/parser.py b/src/wikitextprocessor/parser.py
index 217557b4..d8034591 100644
--- a/src/wikitextprocessor/parser.py
+++ b/src/wikitextprocessor/parser.py
@@ -924,6 +924,37 @@ def pop_until_nth_list(ctx: "Wtp", list_token: str) -> None:
         _parser_pop(ctx, True)
 
 
+# Link trail patterns, localized following MediaWiki's $linkTrail setting.
+# text_fn() appends group 1 to the preceding node and keeps group 2 as the
+# remaining token.
+#
+# English (also the fallback for languages without an entry below):
+# https://github.com/wikimedia/mediawiki/blob/0f67e1045f35a8c854f3c6a3e2712c2d8b0b54d6/languages/messages/MessagesEn.php#L571-L575
+LINKTRAIL_RE = re.compile(r"(?s)([a-z]+)(.*)")
+# Greek:
+# https://github.com/wikimedia/mediawiki/blob/0f67e1045f35a8c854f3c6a3e2712c2d8b0b54d6/languages/messages/MessagesEl.php#L309
+LINKTRAIL_RE_EL = re.compile(
+    r"(?s)([a-zαβγδεζηθικλμνξοπρστυφχψωςΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίόύώϊϋΐΰΆΈΉΊΌΎΏΪΫ]+)(.*)"
+)
+# Russian; see $linkTrail in MediaWiki's languages/messages/MessagesRu.php.
+LINKTRAIL_RE_RU = re.compile(
+    r"(?s)([a-zабвгдеёжзийклмнопрстуфхцчшщъыьэюя]+)(.*)"
+)
+# Language code -> localized pattern.
+_LINKTRAIL_RE_BY_LANG: dict[str, re.Pattern[str]] = {
+    "el": LINKTRAIL_RE_EL,
+    "ru": LINKTRAIL_RE_RU,
+}
+
+
+def linktrail_re(lang: str) -> re.Pattern[str]:
+    """Return the compiled link trail pattern for language code ``lang``.
+
+    Language codes without a localized pattern get the English one.
+    """
+    return _LINKTRAIL_RE_BY_LANG.get(lang, LINKTRAIL_RE)
+
+
 def text_fn(ctx: "Wtp", token: str) -> None:
     """Inserts the token as raw text into the parse tree."""
     close_begline_lists(ctx)
@@ -1029,7 +1060,7 @@ def text_fn(ctx: "Wtp", token: str) -> None:
         and not node.children[-1].children
         and not ctx.suppress_special
     ):
-        m = re.match(r"(?s)([a-z]+)(.*)", token)
+        m = linktrail_re(ctx.lang_code).match(token)
         if m:
             node.children[-1].children.append(m.group(1))
             token = m.group(2)