-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathresolve_md_links.py
More file actions
120 lines (98 loc) · 5.07 KB
/
Copy pathresolve_md_links.py
File metadata and controls
120 lines (98 loc) · 5.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python3
"""Resolve relative links in generated Markdown (.md) output to absolute URLs.
Hugo emits the per-page Markdown output as public/<path>/index.md (pretty URLs) carrying the
page's RAW content. The auto-generated API-reference pages use RELATIVE links, e.g.
`](../ns/class)` or `](../../ns/class)` (variable depth), which Hugo templates cannot resolve
and which break once the .md is flattened to an ugly URL by move_md_to_ugly_urls.sh (the page
depth changes). This script resolves each relative link against the page's own URL and rewrites
it to an absolute URL, matching the theme partial themes/docs/layouts/partials/md/abs-content.txt
(which already absolutizes ROOT-relative links). The two are disjoint: abs-content.txt handles
`/...` links, this script handles `../`, `./`, and bare relative links.
Run it BEFORE move_md_to_ugly_urls.sh: resolution relies on the pretty path
public/<a>/<b>/index.md => page URL /<a>/<b>/.
Usage:
python resolve_md_links.py [output_dir] --base-url <BASE>
output_dir Hugo output directory (default: "public").
--base-url The build's Hugo .Site.BaseURL, e.g.
production: https://reference.groupdocs.com/<product>
staging: /<product>
local: http://localhost:1313/<product>
(Trailing slash is ignored.)
"""
import argparse
import os
import posixpath
import re
import sys
# Targets that are already absolute / not page-relative -> leave untouched.
# Root-relative ("/...") links are handled by the theme's md/abs-content.txt.
SKIP_PREFIXES = ("http://", "https://", "//", "mailto:", "tel:", "data:", "#", "/")
# Final-segment extensions treated as assets (NOT given a .md suffix). Namespace segments
# like "com.groupdocs.annotation" contain dots but are pages, so use an explicit whitelist
# rather than "has any extension".
ASSET_EXTS = {
".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp", ".ico",
".css", ".js", ".json", ".xml", ".txt", ".pdf", ".zip", ".csv",
}
# Markdown inline/image link target: ](target) — also matches the image form ![alt](target),
# because only the trailing "](target)" part is matched. Group 1 is the target: a run of
# characters containing neither ")" nor whitespace.
MD_LINK_RE = re.compile(r"\]\(([^)\s]+)\)")
# Raw-HTML attribute: src="target" / href="target" (also matches the href of xlink:href=).
# Only double-quoted values are matched. Groups: 1 = attribute name plus opening quote,
# 2 = the target, 3 = the closing quote.
HTML_ATTR_RE = re.compile(r'(\b(?:src|href)=")([^"]*)(")')
# RFC 3986 scheme at the start of a reference: a letter followed by letters, digits,
# "+", "-" or ".", terminated by ":". Anything starting with a scheme is an absolute URI.
_URI_SCHEME_RE = re.compile(r"[A-Za-z][A-Za-z0-9+.\-]*:")


def is_relative(target):
    """Return True when *target* is a page-relative link that must be resolved.

    A target is NOT relative (and is left untouched by the caller) when it is:

    * empty;
    * one of the SKIP_PREFIXES forms (absolute URL, protocol-relative "//",
      fragment-only "#...", or root-relative "/..."); or
    * any other absolute URI, i.e. it begins with a URI scheme such as
      "ftp:", "javascript:" or an upper/mixed-case "HTTP://" / "Mailto:".

    The scheme check is what keeps links like "ftp://host/file" from being
    joined onto the page directory and given a ".md" suffix.
    """
    if not target:
        return False
    if target.startswith(SKIP_PREFIXES):
        return False
    # Schemes are case-insensitive and not limited to the handful listed in
    # SKIP_PREFIXES, so recognise the generic "<scheme>:" form as well.
    return _URI_SCHEME_RE.match(target) is None
def resolve(target, url_dir, base):
    """Resolve a page-relative target against url_dir and return an absolute URL.

    Args:
        target: The relative link as written in the page, optionally followed
            by a "?query" and/or "#fragment".
        url_dir: URL directory of the page that contains the link, e.g.
            "/java/ns/class/".
        base: Base URL prefix without a trailing slash.

    Returns:
        ``base`` + the normalised absolute path + the original query/fragment.
        Page links (final segment whose extension is not in ASSET_EXTS) get a
        ".md" suffix; asset links and links already ending in ".md" keep their
        path as is. A target with no path part (only a query and/or fragment)
        is returned unchanged.
    """
    # Peel off the fragment first, then the query, so neither takes part in
    # path normalisation or extension detection ("page?x=1" must become
    # "page.md?x=1", not "page?x=1.md").
    path, hash_sep, fragment = target.partition("#")
    path, query_sep, query = path.partition("?")
    if not path:
        return target
    suffix = query_sep + query + hash_sep + fragment

    # normpath collapses "../" and "./" segments and drops a trailing slash.
    resolved = posixpath.normpath(posixpath.join(url_dir, path))
    ext = posixpath.splitext(posixpath.basename(resolved))[1].lower()
    # Dotted namespace segments are pages, so only whitelisted asset extensions
    # (and an existing ".md") suppress the ".md" suffix.
    if ext != ".md" and ext not in ASSET_EXTS:
        resolved += ".md"  # page link -> .md sibling
    return base + resolved + suffix
def rewrite(content, url_dir, base):
    """Return *content* with every page-relative link target made absolute.

    Markdown ``](target)`` links are rewritten first, then raw-HTML
    ``src="target"`` / ``href="target"`` attributes. Targets for which
    ``is_relative`` is false are left exactly as they appear.

    Args:
        content: Full text of one Markdown file.
        url_dir: URL directory of the page the text belongs to.
        base: Base URL prefix passed through to ``resolve``.
    """

    def absolutize(link):
        # None means "not a relative link: keep the match verbatim".
        if not is_relative(link):
            return None
        return resolve(link, url_dir, base)

    def replace_md_link(match):
        absolute = absolutize(match.group(1))
        if absolute is None:
            return match.group(0)
        return "](" + absolute + ")"

    def replace_html_attr(match):
        opening, link, closing = match.groups()
        absolute = absolutize(link)
        if absolute is None:
            return match.group(0)
        return opening + absolute + closing

    markdown_pass = MD_LINK_RE.sub(replace_md_link, content)
    return HTML_ATTR_RE.sub(replace_html_attr, markdown_pass)
def main():
    """Command-line entry point: absolutize relative links in every .md file.

    Parses ``[output_dir] --base-url <BASE>``, walks ``output_dir`` recursively
    and passes each ``.md`` file through ``rewrite``. A file is written back
    only when its content actually changed. Exits with an error message when
    ``output_dir`` is not a directory, and prints a one-line summary at the end.
    """
    parser = argparse.ArgumentParser(description="Resolve relative .md links to absolute URLs.")
    parser.add_argument("output_dir", nargs="?", default="public")
    parser.add_argument(
        "--base-url",
        required=True,
        help="Build .Site.BaseURL, e.g. https://reference.groupdocs.com/annotation or /annotation",
    )
    options = parser.parse_args()
    output_dir = options.output_dir
    base = options.base_url.rstrip("/")

    if not os.path.isdir(output_dir):
        sys.exit("Error: directory '%s' does not exist. Run 'hugo' first." % output_dir)

    # errors="surrogateescape" + newline="" so any non-UTF-8 byte or CRLF in the
    # generated content round-trips byte-for-byte (some source .md carry stray
    # Windows-1252 bytes, e.g. 0xae); the link regexes only touch ASCII syntax.
    text_io = {"encoding": "utf-8", "errors": "surrogateescape", "newline": ""}

    rewritten = 0
    for dirpath, _subdirs, filenames in os.walk(output_dir):
        # Page URL dir from the directory's location relative to output_dir:
        #   public/java/ns/class/index.md -> /java/ns/class/
        #   public/index.md               -> /
        relative_dir = os.path.relpath(dirpath, output_dir).replace(os.sep, "/")
        if relative_dir == ".":
            url_dir = "/"
        else:
            url_dir = "/" + relative_dir + "/"

        for filename in filenames:
            if not filename.endswith(".md"):
                continue
            md_path = os.path.join(dirpath, filename)
            with open(md_path, "r", **text_io) as handle:
                original = handle.read()
            updated = rewrite(original, url_dir, base)
            if updated == original:
                continue
            with open(md_path, "w", **text_io) as handle:
                handle.write(updated)
            rewritten += 1

    print("Resolved relative links in %d .md file(s) under '%s'." % (rewritten, output_dir))


if __name__ == "__main__":
    main()