Skip to content

Commit db2859e

Browse files
Give Wtp linktrailing_re attribute
See wiktextract issue #1604 tatuylonen/wiktextract#1604 https://en.wikipedia.org/wiki/Help:Wikitext#Blend_link This adds a new attribute to Wtp that contains a `re.Pattern` object used for pattern-matching these kinds of suffixed links. Modify `Wtp.linktrailing_re` to change the behavior based on how the parsed Wikimedia project handles linktrailing. English uses `[a-z]+`. Our default implementation uses `\w+`, which should be fine most of the time. Languages without spaces seem to use the English `[a-z]+`, which makes sense: with `[a-z]+`, the kanji in `[[englishword]]KANJI` are not consumed into the link, whereas `\w+` would consume them and break this.
1 parent ecb885e commit db2859e

3 files changed

Lines changed: 16 additions & 1 deletion

File tree

src/wikitextprocessor/core.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,7 @@ class Wtp:
282282
"notes", # NOTE error messages
283283
"wiki_notices", # WIKI error messages
284284
"wikidata_session",
285+
"linktrailing_re",
285286
)
286287

287288
def __init__(
@@ -355,6 +356,14 @@ def __init__(
355356
if not quiet:
356357
logger.setLevel(logging.DEBUG)
357358
self.wikidata_session: Session | None = None
359+
# Default regex pattern, will sometimes cause trouble.
360+
# Linktrailing is when you have [[a li]]nk that consumes the
361+
# trailing suffix so that the whole word is blue. Languages
362+
# without spaces, like Japanese, should use the English
363+
# [a-z] pattern, other languages their own if `\w+` actually
364+
# causes problems in them.
365+
# Will be modified later in wiktextract wxr through WiktionaryConfig.
366+
self.linktrailing_re = re.compile(r"(?s)(\w+)(.*)")
358367

359368
def create_db(self) -> None:
360369
from .wikidata import init_wikidata_cache

src/wikitextprocessor/parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1029,7 +1029,7 @@ def text_fn(ctx: "Wtp", token: str) -> None:
10291029
and not node.children[-1].children
10301030
and not ctx.suppress_special
10311031
):
1032-
m = re.match(r"(?s)([a-z]+)(.*)", token)
1032+
m = ctx.linktrailing_re.match(token)
10331033
if m:
10341034
node.children[-1].children.append(m.group(1))
10351035
token = m.group(2)

tests/test_parser.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#
33
# Copyright (c) 2020-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
44

5+
import re
56
import unittest
67

78
from wikitextprocessor import Wtp
@@ -1111,7 +1112,12 @@ def test_link_trailing_1(self):
11111112
self.assertEqual(b, " heal")
11121113

11131114
def test_link_trailing_not_latin(self):
1115+
_linktrailing_re = self.ctx.linktrailing_re
1116+
# Normally this alternative pattern would be provided by Wiktextract's
1117+
# WiktionaryConfig or something similar.
1118+
self.ctx.linktrailing_re = re.compile(r"(?s)([a-z]+)(.*)")
11141119
tree = self.parse("test", "[[appellāre]]の直説法所相現在第 foo")
1120+
self.ctx.linktrailing_re = _linktrailing_re
11151121
self.assertEqual(len(tree.children), 2)
11161122
a, b = tree.children
11171123
self.assertEqual(a.kind, NodeKind.LINK)

0 commit comments

Comments
 (0)