diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 00000000..5aa7d68c
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,17 @@
+# Changelog - PR 170
+
+## ✨ New Features
+
+- **Slug Normalization**: Accented slugs are now normalized and locale-prefixed link references are supported.
+
+## 🐛 Fixes
+
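+- **Doc Paths**: Nested `/docs/...` link targets are flattened to their final path segment so they match the single-level slugs written to frontmatter.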
+- **Link Normalization**: Links inside code blocks and indented code fences are now properly skipped during link normalization.
+- **Heading IDs**: Existing explicit heading IDs are reserved before auto-generated IDs are assigned, preventing heading ID collisions; titles that produce an empty slug now fall back to the filename `untitled`.
+- **Slug Generation**: Preserved CJK and Unicode letters in slug generation.
+- **Code Fences**: Aligned code-fence regex with CommonMark standard.
+
+## 🧪 Testing
+
+- **Normalization**: Aligned tests with new normalization expectations.
diff --git a/bun-tests/vitest-bridge.test.ts b/bun-tests/vitest-bridge.test.ts
index a096b2fb..8e57ab74 100644
--- a/bun-tests/vitest-bridge.test.ts
+++ b/bun-tests/vitest-bridge.test.ts
@@ -24,5 +24,7 @@ test(
() => {
runVitest();
},
- { timeout: 120_000 }
+ // The full Vitest suite can take just under two minutes on this repo, and
+ // Bun's own test harness adds enough overhead that 120s is too tight.
+ { timeout: 300_000 }
);
diff --git a/eslint.config.mjs b/eslint.config.mjs
index d03fda21..83430e4d 100644
--- a/eslint.config.mjs
+++ b/eslint.config.mjs
@@ -32,7 +32,7 @@ const eslintConfig = [
// Docusaurus specific configurations
{
files: ["**/*.{js,mjs,cjs,ts,jsx,tsx}"],
- ignores: ["scripts/**", "api-server/**"], // Ignore scripts and api-server directories for docusaurus rules
+ ignores: ["scripts/**", "api-server/**", "bun-tests/**"], // Ignore non-Docusaurus runtime directories for docusaurus/react rules
plugins: {
"@docusaurus": docusaurusPlugin,
react: pluginReact,
@@ -74,7 +74,11 @@ const eslintConfig = [
// Scripts and API server specific configurations
{
- files: ["scripts/**/*.{js,mjs,cjs,ts}", "api-server/**/*.{js,mjs,cjs,ts}"],
+ files: [
+ "scripts/**/*.{js,mjs,cjs,ts}",
+ "api-server/**/*.{js,mjs,cjs,ts}",
+ "bun-tests/**/*.{js,mjs,cjs,ts}",
+ ],
plugins: {
import: importPlugin,
promise: promisePlugin,
diff --git a/scripts/notion-fetch/__tests__/retry-loop-behavior.test.ts b/scripts/notion-fetch/__tests__/retry-loop-behavior.test.ts
index 61cdb372..622c6456 100644
--- a/scripts/notion-fetch/__tests__/retry-loop-behavior.test.ts
+++ b/scripts/notion-fetch/__tests__/retry-loop-behavior.test.ts
@@ -93,6 +93,7 @@ vi.mock("../imageProcessor", () => ({
vi.mock("../utils", () => ({
sanitizeMarkdownContent: vi.fn((content) => content),
+ injectExplicitHeadingIds: vi.fn((content) => content),
compressImageToFileWithFallback: vi.fn().mockResolvedValue({
finalSize: 512,
usedFallback: false,
diff --git a/scripts/notion-fetch/contentSanitizer.test.ts b/scripts/notion-fetch/contentSanitizer.test.ts
index 5354c120..9b5d3e1c 100644
--- a/scripts/notion-fetch/contentSanitizer.test.ts
+++ b/scripts/notion-fetch/contentSanitizer.test.ts
@@ -50,6 +50,18 @@ describe("contentSanitizer", () => {
expect(result).toBe(input); // Should remain unchanged
});
+ it("should preserve tilde fenced code blocks", () => {
+ const input = "~~~md\nconst obj = { key: 'value' };\n~~~";
+ const result = scriptModule.sanitizeMarkdownContent(input);
+ expect(result).toBe(input);
+ });
+
+ it("should preserve multi-backtick inline code spans", () => {
+ const input = "Use ```` and ``{foo}``.";
+ const result = scriptModule.sanitizeMarkdownContent(input);
+ expect(result).toBe(input);
+ });
+
it("should fix malformed patterns", () => {
const input = "Check for details.";
const result = scriptModule.sanitizeMarkdownContent(input);
@@ -226,4 +238,80 @@ echo "# Not a heading"
});
});
});
+
+ describe("injectExplicitHeadingIds", () => {
+ it("should normalize accented headings and append stable duplicate suffixes", () => {
+ const input = [
+ "# Título Único",
+ "## Título Único",
+ "### Niño & Acción",
+ ].join("\n");
+
+ const result = scriptModule.injectExplicitHeadingIds(input);
+
+ expect(result).toContain("# Título Único {#titulo-unico}");
+ expect(result).toContain("## Título Único {#titulo-unico-1}");
+ expect(result).toContain("### Niño & Acción {#nino-accion}");
+ });
+
+ it("should preserve existing explicit heading ids and code fences", () => {
+ const input = [
+ "# Encabezado {#custom-id}",
+ "```md",
+ "## Código Único",
+ "```",
+ "## Otro Título",
+ ].join("\n");
+
+ const result = scriptModule.injectExplicitHeadingIds(input);
+
+ expect(result).toContain("# Encabezado {#custom-id}");
+ expect(result).toContain("```md\n## Código Único\n```");
+ expect(result).toContain("## Otro Título {#otro-titulo}");
+ expect(result).not.toContain("## Código Único {#codigo-unico}");
+ });
+
+ it("should preserve headings inside tilde fenced code blocks", () => {
+ const input = ["~~~md", "## Código Único", "~~~", "## Otro Título"].join(
+ "\n"
+ );
+
+ const result = scriptModule.injectExplicitHeadingIds(input);
+
+ expect(result).toContain("~~~md\n## Código Único\n~~~");
+ expect(result).toContain("## Otro Título {#otro-titulo}");
+ expect(result).not.toContain("## Código Único {#codigo-unico}");
+ });
+
+ it("should avoid collisions between auto-incremented and explicit IDs", () => {
+ const input = ["## Título", "## Heading {#titulo-1}", "## Título"].join(
+ "\n"
+ );
+
+ const result = scriptModule.injectExplicitHeadingIds(input);
+
+ expect(result).toContain("## Título {#titulo}");
+ expect(result).toContain("## Heading {#titulo-1}");
+ // The second "Título" must NOT get titulo-1 (already claimed), should get titulo-2
+ expect(result).toContain("## Título {#titulo-2}");
+ });
+
+ it("should reserve later explicit ids before assigning earlier auto-generated headings", () => {
+ const input = ["## My Id", "## Custom {#my-id}"].join("\n");
+
+ const result = scriptModule.injectExplicitHeadingIds(input);
+
+ expect(result).toContain("## My Id {#my-id-1}");
+ expect(result).toContain("## Custom {#my-id}");
+ });
+
+ it("should not reserve a natural slug when a later explicit id is custom", () => {
+ const input = ["## My Id", "## My Id {#custom}"].join("\n");
+
+ const result = scriptModule.injectExplicitHeadingIds(input);
+
+ expect(result).toContain("## My Id {#my-id}");
+ expect(result).toContain("## My Id {#custom}");
+ });
+ });
});
diff --git a/scripts/notion-fetch/contentSanitizer.ts b/scripts/notion-fetch/contentSanitizer.ts
index f652a60c..c30a341f 100644
--- a/scripts/notion-fetch/contentSanitizer.ts
+++ b/scripts/notion-fetch/contentSanitizer.ts
@@ -3,6 +3,13 @@
* that cause MDX compilation errors in Docusaurus.
*/
+import { createSafeSlug } from "./slugUtils";
+import {
+ maskFencedCodeBlocks,
+ maskInlineCodeSpans,
+ restoreCodeMasks,
+} from "./markdownUtils";
+
const EMOJI_STYLE_MARKERS = ["display:", "height:", "margin:"];
const isEmojiStyleObject = (snippet: string): boolean =>
@@ -68,6 +75,83 @@ function fixHeadingHierarchy(
return fixedLines.join("\n");
}
+/**
+ * Appends an explicit `{#id}` suffix to every markdown heading that lacks one.
+ *
+ * Fenced code blocks are masked first so headings inside them are untouched.
+ * The remaining lines are processed in two passes: the first reserves every
+ * `{#id}` already present, the second gives each other heading the slug of its
+ * text, adding a numeric `-N` suffix whenever that slug is reserved or was
+ * already handed out. Headings whose text produces an empty slug are left as-is.
+ *
+ * @param content - Markdown content to process.
+ * @returns The content with heading IDs injected; falsy input is returned unchanged.
+ */
+export function injectExplicitHeadingIds(content: string): string {
+  if (!content) {
+    return content;
+  }
+
+  const {
+    content: maskedContent,
+    codeBlocks,
+    placeholders,
+  } = maskFencedCodeBlocks(content);
+  // Explicit element types keep `counter` below a `number` instead of `any`.
+  const reservedIds = new Set<string>();
+  const headingCounts = new Map<string, number>();
+
+  // Built once rather than once per line; none is global, so none keeps state.
+  const explicitHeadingPattern =
+    /^(\s{0,3})(#{1,6})\s+(.+?)\s*\{#([^}]+)\}\s*$/;
+  const trailingIdPattern = /\s\{#([^}]+)\}\s*$/;
+  const headingPattern = /^(\s{0,3})(#{1,6})\s+(.+?)\s*$/;
+  const isMaskedLine = (line: string): boolean =>
+    placeholders.some((placeholder) => line.includes(placeholder));
+
+  const lines = maskedContent.split("\n");
+
+  // Pass 1: reserve every explicit id, whether it sits on a heading or on any
+  // other line that ends with ` {#id}`, so later headings cannot claim it.
+  for (const line of lines) {
+    if (isMaskedLine(line)) {
+      continue;
+    }
+
+    const explicitId =
+      explicitHeadingPattern.exec(line)?.[4] ??
+      trailingIdPattern.exec(line)?.[1];
+    if (explicitId) {
+      reservedIds.add(explicitId);
+    }
+  }
+
+  // Pass 2: assign ids to the headings that do not carry one yet.
+  const updatedLines = lines.map((line) => {
+    if (isMaskedLine(line) || explicitHeadingPattern.test(line)) {
+      return line;
+    }
+
+    const headingMatch = headingPattern.exec(line);
+    if (!headingMatch) {
+      return line;
+    }
+
+    const [, leadingWhitespace, hashes, headingText] = headingMatch;
+    const baseId = createSafeSlug(headingText);
+    if (!baseId) {
+      return line;
+    }
+
+    // `headingCounts` holds both the next suffix for each base slug and every
+    // id already handed out, so a generated `foo-1` also blocks a later
+    // heading whose own slug is `foo-1`.
+    let counter = headingCounts.get(baseId) ?? 0;
+    let headingId = counter === 0 ? baseId : `${baseId}-${counter}`;
+    while (reservedIds.has(headingId) || headingCounts.has(headingId)) {
+      counter++;
+      headingId = `${baseId}-${counter}`;
+    }
+    headingCounts.set(baseId, counter + 1);
+    if (headingId !== baseId) {
+      headingCounts.set(headingId, 1);
+    }
+
+    return `${leadingWhitespace}${hashes} ${headingText} {#${headingId}}`;
+  });
+
+  // Inline code spans were never masked here, hence the empty span list.
+  return restoreCodeMasks(updatedLines.join("\n"), codeBlocks, []);
+}
+
/**
* Sanitizes markdown content to fix malformed HTML/JSX tags that cause MDX compilation errors
* @param content - The markdown content string
@@ -77,23 +161,17 @@ export function sanitizeMarkdownContent(content: string): string {
// Fix specific malformed patterns that cause MDX errors
// 0. Mask code fences (```...```) and inline code (`...`) to avoid altering them
- const codeBlocks: string[] = [];
- const codeSpans: string[] = [];
- const codeBlockPlaceholders: string[] = [];
-
- content = content.replace(/```[\s\S]*?```/g, (m) => {
- codeBlocks.push(m);
- const placeholder = `__CODEBLOCK_${codeBlocks.length - 1}__`;
- codeBlockPlaceholders.push(placeholder);
- return placeholder;
- });
- content = content.replace(/`[^`\n]*`/g, (m) => {
- codeSpans.push(m);
- return `__CODESPAN_${codeSpans.length - 1}__`;
- });
+ const {
+ content: maskedContent,
+ codeBlocks,
+ placeholders,
+ } = maskFencedCodeBlocks(content);
+ const { content: maskedWithCodeSpans, codeSpans } =
+ maskInlineCodeSpans(maskedContent);
+ content = maskedWithCodeSpans;
// 1. Fix heading hierarchy for proper TOC generation (after masking code blocks)
- content = fixHeadingHierarchy(content, codeBlockPlaceholders);
+ content = fixHeadingHierarchy(content, placeholders);
// 2. Aggressively strip all curly-brace expressions by unwrapping to inner text
// BUT preserve JSX style objects for emoji images
@@ -157,14 +235,7 @@ export function sanitizeMarkdownContent(content: string): string {
}
// 9. Restore masked code blocks and inline code
- content = content.replace(
- /__CODEBLOCK_(\d+)__/g,
- (_m, i) => codeBlocks[Number(i)]
- );
- content = content.replace(
- /__CODESPAN_(\d+)__/g,
- (_m, i) => codeSpans[Number(i)]
- );
+ content = restoreCodeMasks(content, codeBlocks, codeSpans);
return content;
}
diff --git a/scripts/notion-fetch/generateBlocks.test.ts b/scripts/notion-fetch/generateBlocks.test.ts
index b5b0e88d..5d131524 100644
--- a/scripts/notion-fetch/generateBlocks.test.ts
+++ b/scripts/notion-fetch/generateBlocks.test.ts
@@ -112,6 +112,7 @@ vi.mock("./imageProcessor", () => ({
vi.mock("./utils", () => ({
sanitizeMarkdownContent: vi.fn((content) => content),
+ injectExplicitHeadingIds: vi.fn((content) => content),
compressImageToFileWithFallback: vi.fn(),
detectFormatFromBuffer: vi.fn(() => "jpeg"),
formatFromContentType: vi.fn(() => "jpeg"),
@@ -198,6 +199,7 @@ describe("generateBlocks", () => {
let fetchNotionBlocks: Mock;
let processImage: Mock;
let compressImageToFileWithFallback: Mock;
+ let injectExplicitHeadingIds: Mock;
beforeEach(async () => {
restoreEnv = installTestNotionEnv();
@@ -223,6 +225,7 @@ describe("generateBlocks", () => {
const utils = await import("./utils");
compressImageToFileWithFallback =
utils.compressImageToFileWithFallback as Mock;
+ injectExplicitHeadingIds = utils.injectExplicitHeadingIds as Mock;
// Setup default mock implementations
processImage.mockResolvedValue(mockProcessedImageResult);
@@ -377,6 +380,155 @@ describe("generateBlocks", () => {
});
});
+ describe("Localized slug and link normalization", () => {
+ it("should derive the shared ASCII slug from the grouped title for every locale", async () => {
+ const { generateBlocks } = await import("./generateBlocks");
+ const mockWriteFileSync = fs.writeFileSync as Mock;
+
+ const mainPage = createMockNotionPage({
+ id: "main-accented",
+ title: "Título con acentos",
+ elementType: "Page",
+ subItems: ["en-accented", "es-accented", "pt-accented"],
+ });
+ const englishPage = createMockNotionPage({
+ id: "en-accented",
+ title: "Título con acentos",
+ language: "English",
+ elementType: "Page",
+ });
+ const spanishPage = createMockNotionPage({
+ id: "es-accented",
+ title: "Título con acentos",
+ language: "Spanish",
+ elementType: "Page",
+ });
+ const portuguesePage = createMockNotionPage({
+ id: "pt-accented",
+ title: "Título con acentos",
+ language: "Portuguese",
+ elementType: "Page",
+ });
+
+ n2m.pageToMarkdown.mockResolvedValue([]);
+ n2m.toMarkdownString.mockReturnValue({ parent: "Body content" });
+
+ await generateBlocks(
+ [mainPage, englishPage, spanishPage, portuguesePage],
+ vi.fn()
+ );
+
+ const markdownPaths = mockWriteFileSync.mock.calls
+ .map((call) => call[0])
+ .filter(
+ (value): value is string =>
+ typeof value === "string" && value.endsWith(".md")
+ );
+
+ expect(markdownPaths).toEqual(
+ expect.arrayContaining([
+ expect.stringContaining("titulo-con-acentos.md"),
+ expect.stringContaining(
+ "i18n/pt/docusaurus-plugin-content-docs/current/titulo-con-acentos.md"
+ ),
+ expect.stringContaining(
+ "i18n/es/docusaurus-plugin-content-docs/current/titulo-con-acentos.md"
+ ),
+ ])
+ );
+ });
+
+ it("should normalize localized internal docs links before writing markdown", async () => {
+ const { generateBlocks } = await import("./generateBlocks");
+ const mockWriteFileSync = fs.writeFileSync as Mock;
+
+ const pageFamily = createMockPageFamily("Página de prueba", "Page");
+ n2m.pageToMarkdown.mockResolvedValue([]);
+ n2m.toMarkdownString
+ .mockReturnValueOnce({
+ parent:
+ "[doc](/docs/Guía Rápida#Título Uno) [external](https://example.com/Árbol) [relative](./Guía Local#Título)",
+ })
+ .mockReturnValueOnce({
+ parent:
+ "[doc](/docs/Guía Rápida#Título Uno) [nested](/docs/Category Name/Sub Página#Título Dos)",
+ })
+ .mockReturnValueOnce({
+ parent: "[doc](/docs/Guía Rápida#Título Uno)",
+ });
+
+ await generateBlocks(pageFamily.pages, vi.fn());
+
+ const markdownWrites = mockWriteFileSync.mock.calls.filter(
+ (call) => typeof call[0] === "string" && call[0].endsWith(".md")
+ );
+
+ const englishOutput = markdownWrites.find(
+ (call) =>
+ typeof call[0] === "string" &&
+ !call[0].includes("/i18n/") &&
+ call[1].includes("/docs/guia-rapida#titulo-uno")
+ );
+ const portugueseOutput = markdownWrites.find(
+ (call) =>
+ typeof call[0] === "string" &&
+ call[0].includes("/i18n/pt/") &&
+ call[1].includes("/pt/docs/guia-rapida#titulo-uno")
+ );
+ const spanishOutput = markdownWrites.find(
+ (call) =>
+ typeof call[0] === "string" &&
+ call[0].includes("/i18n/es/") &&
+ call[1].includes("/es/docs/guia-rapida#titulo-uno")
+ );
+
+ expect(englishOutput?.[1]).toContain(
+ "[doc](/docs/guia-rapida#titulo-uno)"
+ );
+ expect(englishOutput?.[1]).toContain(
+ "[external](https://example.com/Árbol)"
+ );
+ expect(englishOutput?.[1]).toContain("[relative](./Guía Local#Título)");
+ expect(portugueseOutput?.[1]).toContain(
+ "[nested](/pt/docs/sub-pagina#titulo-dos)"
+ );
+ expect(spanishOutput?.[1]).toContain(
+ "[doc](/es/docs/guia-rapida#titulo-uno)"
+ );
+ });
+
+    it("should pass the de-duplicated content through heading ID injection before writing", async () => {
+      const { generateBlocks } = await import("./generateBlocks");
+      const mockWriteFileSync = fs.writeFileSync as Mock;
+      // Distinctive marker: it can only reach the written file if the return
+      // value of injectExplicitHeadingIds is what gets written. Asserting on
+      // an empty string would pass for any output.
+      const injectionMarker = "<!-- heading-ids-injected -->";
+
+      const page = createMockNotionPage({
+        id: "heading-page",
+        title: "Heading Title",
+        elementType: "Page",
+        language: "English",
+      });
+
+      n2m.pageToMarkdown.mockResolvedValue([]);
+      n2m.toMarkdownString.mockReturnValue({
+        parent: "# Heading Title\n\n## Título Único\nContent body",
+      });
+      injectExplicitHeadingIds.mockImplementation(
+        (content: string) => `${content}\n${injectionMarker}`
+      );
+
+      await generateBlocks([page], vi.fn());
+
+      // The duplicate H1 title must already be removed when injection runs.
+      expect(injectExplicitHeadingIds).toHaveBeenCalledWith(
+        "## Título Único\nContent body"
+      );
+
+      const markdownWrite = mockWriteFileSync.mock.calls.find(
+        (call) => typeof call[0] === "string" && call[0].endsWith(".md")
+      );
+      expect(markdownWrite?.[1]).toContain(injectionMarker);
+    });
+ });
+
describe("Title fallbacks", () => {
it("should fallback to legacy Title property when Content elements is missing", async () => {
const { generateBlocks } = await import("./generateBlocks");
diff --git a/scripts/notion-fetch/generateBlocks.ts b/scripts/notion-fetch/generateBlocks.ts
index 4dda0cd7..e514676f 100644
--- a/scripts/notion-fetch/generateBlocks.ts
+++ b/scripts/notion-fetch/generateBlocks.ts
@@ -9,7 +9,9 @@ import type {
import { n2m } from "../notionClient";
import { NOTION_PROPERTIES } from "../constants";
import chalk from "chalk";
-import { sanitizeMarkdownContent } from "./utils";
+import { sanitizeMarkdownContent, injectExplicitHeadingIds } from "./utils";
+import { createSafeSlug } from "./slugUtils";
+import { normalizeInternalDocLinks } from "./linkNormalizer";
import config from "../../docusaurus.config";
import SpinnerManager from "./spinnerManager";
import { convertCalloutToAdmonition, isCalloutBlock } from "./calloutProcessor";
@@ -528,6 +530,10 @@ async function processSinglePage(
emojiCount += result.fallbackEmojiCount;
contentHasS3 = result.containsS3;
+ markdownString.parent = normalizeInternalDocLinks(
+ markdownString.parent,
+ lang
+ );
markdownString.parent = sanitizeMarkdownContent(markdownString.parent);
markdownString.parent = ensureBlankLineAfterStandaloneBold(
@@ -538,18 +544,19 @@ async function processSinglePage(
markdownString.parent,
pageTitle
);
+ const finalContentBody = injectExplicitHeadingIds(contentBody);
const sectionFolderForWrite: Record = {};
sectionFolderForWrite[lang] = currentSectionFolderForLang;
- const finalDiagnostics = getImageDiagnostics(markdownString.parent ?? "");
+ const finalDiagnostics = getImageDiagnostics(finalContentBody ?? "");
contentHasS3 = finalDiagnostics.s3Matches > 0;
writeMarkdownFile(
filePath,
frontmatter,
- contentBody,
+ finalContentBody,
pageTitle,
pageProcessingIndex - 1,
totalPages,
@@ -887,10 +894,7 @@ export async function generateBlocks(
? sectionTypeRaw.trim()
: String(sectionTypeRaw ?? "").trim();
const normalizedSectionType = sectionTypeString.toLowerCase();
- const filename = title
- .toLowerCase()
- .replace(/\s+/g, "-")
- .replace(/[^a-z0-9-]/g, "");
+ const filename = createSafeSlug(title) || "untitled";
const orderedLocales = getOrderedLocales(Object.keys(pageByLang.content));
for (const lang of orderedLocales) {
diff --git a/scripts/notion-fetch/linkNormalizer.test.ts b/scripts/notion-fetch/linkNormalizer.test.ts
new file mode 100644
index 00000000..f2fe5706
--- /dev/null
+++ b/scripts/notion-fetch/linkNormalizer.test.ts
@@ -0,0 +1,151 @@
+import { describe, it, expect, vi } from "vitest";
+
+// Mock the docusaurus config before importing the module under test,
+// mirroring the pattern used in generateBlocks.test.ts.
+vi.mock("../../docusaurus.config", () => ({
+ default: {
+ i18n: {
+ locales: ["en", "pt", "es"],
+ defaultLocale: "en",
+ },
+ },
+}));
+
+import { normalizeInternalDocLinks } from "./linkNormalizer";
+
+describe("linkNormalizer", () => {
+ describe("normalizeInternalDocLinks", () => {
+ it("should normalize a docs link for the default locale (en) without a locale prefix", () => {
+ const input = "[link](/docs/Guía Rápida)";
+ const result = normalizeInternalDocLinks(input, "en");
+ expect(result).toBe("[link](/docs/guia-rapida)");
+ });
+
+ it("should add a locale prefix for a non-default locale (es)", () => {
+ const input = "[link](/docs/Guía Rápida)";
+ const result = normalizeInternalDocLinks(input, "es");
+ expect(result).toBe("[link](/es/docs/guia-rapida)");
+ });
+
+ it("should normalize both the path and the fragment", () => {
+ const input = "[link](/docs/Page#Título Uno)";
+ const result = normalizeInternalDocLinks(input, "en");
+ expect(result).toBe("[link](/docs/page#titulo-uno)");
+ });
+
+ it("should leave external links untouched", () => {
+ const input = "[link](https://example.com/Árbol)";
+ const result = normalizeInternalDocLinks(input, "en");
+ expect(result).toBe(input);
+ });
+
+ it("should leave relative links untouched", () => {
+ const input = "[link](./local)";
+ const result = normalizeInternalDocLinks(input, "en");
+ expect(result).toBe(input);
+ });
+
+    it("should not alter image links (lines starting with !)", () => {
+      // The target would be rewritten if this were a plain link, so an
+      // unchanged result shows that the leading "!" is what protects it.
+      const input = "![image](/docs/Guía Rápida)";
+      const result = normalizeInternalDocLinks(input, "en");
+      expect(result).toBe(input);
+    });
+
+ it("should flatten a nested docs path to only the last segment (slug shape)", () => {
+ const input = "[link](/docs/Category Name/Sub Page)";
+ const result = normalizeInternalDocLinks(input, "pt");
+ // buildFrontmatter() writes slug: /${safeSlug} (single level), so the
+ // public URL is /pt/docs/sub-page, not /pt/docs/category-name/sub-page.
+ expect(result).toBe("[link](/pt/docs/sub-page)");
+ });
+
+ it("should not rewrite links inside a fenced code block", () => {
+ const input = "```\n[example](/docs/Guía Rápida)\n```";
+ const result = normalizeInternalDocLinks(input, "en");
+ expect(result).toBe(input);
+ });
+
+ it("should not rewrite links inside an indented fenced code block", () => {
+ const input = " ```\n [example](/docs/Guía Rápida)\n ```";
+ const result = normalizeInternalDocLinks(input, "en");
+ expect(result).toBe(input);
+ });
+
+ it("should not rewrite links inside a tilde fenced code block", () => {
+ const input = "~~~\n[example](/docs/Guía Rápida)\n~~~";
+ const result = normalizeInternalDocLinks(input, "en");
+ expect(result).toBe(input);
+ });
+
+ it("should not rewrite links inside an indented tilde fenced code block", () => {
+ const input = " ~~~\n [example](/docs/Guía Rápida)\n ~~~";
+ const result = normalizeInternalDocLinks(input, "en");
+ expect(result).toBe(input);
+ });
+
+ it("should not rewrite links inside an unclosed tilde fenced code block", () => {
+ const input = "~~~\n[example](/docs/Guía Rápida)";
+ const result = normalizeInternalDocLinks(input, "en");
+ expect(result).toBe(input);
+ });
+
+ it("should not rewrite links inside inline code", () => {
+ const input = "Use `[link](/docs/Guía Rápida)` as an example.";
+ const result = normalizeInternalDocLinks(input, "en");
+ expect(result).toBe(input);
+ });
+
+ it("should not rewrite links inside multi-backtick inline code", () => {
+ const input = "Use ``[link](/docs/Guía Rápida)`` as an example.";
+ const result = normalizeInternalDocLinks(input, "en");
+ expect(result).toBe(input);
+ });
+
+ it("should normalize multiple docs links on a single line", () => {
+ const input = "[a](/docs/Foo) and [b](/docs/Bar)";
+ const result = normalizeInternalDocLinks(input, "en");
+ expect(result).toBe("[a](/docs/foo) and [b](/docs/bar)");
+ });
+
+ it("should normalize both links when two docs links are directly adjacent (no separator)", () => {
+ const input = "[Link 1](/docs/Foo)[Link 2](/docs/Bar)";
+ const result = normalizeInternalDocLinks(input, "en");
+ expect(result).toBe("[Link 1](/docs/foo)[Link 2](/docs/bar)");
+ });
+
+ it("should return empty string for empty content", () => {
+ const result = normalizeInternalDocLinks("", "en");
+ expect(result).toBe("");
+ });
+
+ it("should leave plain text with only external links unchanged", () => {
+ const input = "plain text with [link](https://example.com)";
+ const result = normalizeInternalDocLinks(input, "en");
+ expect(result).toBe(input);
+ });
+
+ it("should normalize a link to exactly /docs (en, no locale prefix)", () => {
+ const input = "[link](/docs)";
+ const result = normalizeInternalDocLinks(input, "en");
+ expect(result).toBe("[link](/docs)");
+ });
+
+ it("should normalize a link to exactly /docs with locale prefix (es)", () => {
+ const input = "[link](/docs)";
+ const result = normalizeInternalDocLinks(input, "es");
+ expect(result).toBe("[link](/es/docs)");
+ });
+
+ it("should normalize a link to /docs#fragment (en, no locale prefix)", () => {
+ const input = "[link](/docs#Sección Uno)";
+ const result = normalizeInternalDocLinks(input, "en");
+ expect(result).toBe("[link](/docs#seccion-uno)");
+ });
+
+ it("should normalize a link to /docs#fragment with locale prefix (pt)", () => {
+ const input = "[link](/docs#Sección Uno)";
+ const result = normalizeInternalDocLinks(input, "pt");
+ expect(result).toBe("[link](/pt/docs#seccion-uno)");
+ });
+ });
+});
diff --git a/scripts/notion-fetch/linkNormalizer.ts b/scripts/notion-fetch/linkNormalizer.ts
new file mode 100644
index 00000000..7a02948f
--- /dev/null
+++ b/scripts/notion-fetch/linkNormalizer.ts
@@ -0,0 +1,84 @@
+import config from "../../docusaurus.config";
+import { createSafeSlug } from "./slugUtils";
+import {
+ maskFencedCodeBlocks,
+ maskInlineCodeSpans,
+ restoreCodeMasks,
+} from "./markdownUtils";
+
+const DEFAULT_LOCALE = config.i18n.defaultLocale;
+const MARKDOWN_LINK_REGEX = /(? {
+ const trimmedTarget = rawTarget.trim();
+ const titleMatch = trimmedTarget.match(/^(\/docs\/[^\n]*?)(\s+"[^"]*")$/);
+ const target = titleMatch ? titleMatch[1] : trimmedTarget;
+ const titleSuffix = titleMatch?.[2] ?? "";
+
+ if (
+ target !== "/docs" &&
+ !target.startsWith("/docs/") &&
+ !target.startsWith("/docs#")
+ ) {
+ return match;
+ }
+
+ return `[${text}](${normalizeDocTarget(target, lang)}${titleSuffix})`;
+ }
+ );
+
+ return restoreCodeMasks(normalizedContent, codeBlocks, codeSpans);
+}
diff --git a/scripts/notion-fetch/markdownUtils.ts b/scripts/notion-fetch/markdownUtils.ts
new file mode 100644
index 00000000..e11675f8
--- /dev/null
+++ b/scripts/notion-fetch/markdownUtils.ts
@@ -0,0 +1,203 @@
+/**
+ * Shared markdown code-block masking utilities.
+ * Both contentSanitizer and linkNormalizer use identical logic for masking
+ * code blocks and inline code spans before processing, then restoring them.
+ */
+
+/** Placeholder prefix used for fenced code blocks. */
+const CODEBLOCK_PREFIX = "__CODEBLOCK_";
+/** Placeholder prefix used for inline code spans. */
+const CODESPAN_PREFIX = "__CODESPAN_";
+const PLACEHOLDER_SUFFIX = "__";
+
+/**
+ * Tells whether `line` closes a fenced code block that was opened with a run
+ * of `fenceLength` `fenceChar` characters.
+ *
+ * A closing line has at most three leading spaces, then at least `fenceLength`
+ * consecutive `fenceChar` characters, then nothing but spaces or tabs.
+ */
+function isClosingFenceLine(
+  line: string,
+  fenceChar: string,
+  fenceLength: number
+): boolean {
+  // Length of the run of `char` that begins at position `start` of `line`.
+  const runLength = (start: number, char: string): number => {
+    let end = start;
+    while (end < line.length && line.charAt(end) === char) {
+      end++;
+    }
+    return end - start;
+  };
+
+  const indentWidth = runLength(0, " ");
+  if (indentWidth > 3) {
+    return false;
+  }
+
+  const fenceRun = runLength(indentWidth, fenceChar);
+  if (fenceRun < fenceLength) {
+    return false;
+  }
+
+  // Whatever follows the fence run must be blank (spaces and tabs only).
+  return /^[ \t]*$/.test(line.slice(indentWidth + fenceRun));
+}
+
+/**
+ * Masks all fenced code blocks in `content`, replacing each with a placeholder token.
+ *
+ * A block opens on a line with at most three leading spaces followed by three
+ * or more backticks or tildes, and closes on the first later line accepted by
+ * `isClosingFenceLine` for the same fence character and length. A block that
+ * is never closed runs to the end of the content. Each masked block becomes a
+ * single placeholder line.
+ *
+ * @param content - Markdown content to mask.
+ * @returns The masked content, the original code blocks in order of
+ *   appearance, and the placeholder string that stands in for each of them.
+ */
+export function maskFencedCodeBlocks(content: string): {
+  content: string;
+  codeBlocks: string[];
+  placeholders: string[];
+} {
+  const openingFencePattern = /^ {0,3}(`{3,}|~{3,})(.*)$/;
+  const codeBlocks: string[] = [];
+  const placeholders: string[] = [];
+  const maskedLines: string[] = [];
+
+  let inFence = false;
+  let fenceChar = "";
+  let fenceLength = 0;
+  let blockLines: string[] = [];
+
+  // Stores the block collected so far and emits its placeholder as one line.
+  const flushBlock = (): void => {
+    codeBlocks.push(blockLines.join("\n"));
+    const placeholder = `${CODEBLOCK_PREFIX}${codeBlocks.length - 1}${PLACEHOLDER_SUFFIX}`;
+    placeholders.push(placeholder);
+    maskedLines.push(placeholder);
+    inFence = false;
+    blockLines = [];
+  };
+
+  for (const line of content.split("\n")) {
+    if (inFence) {
+      blockLines.push(line);
+      if (isClosingFenceLine(line, fenceChar, fenceLength)) {
+        flushBlock();
+      }
+      continue;
+    }
+
+    const openingMatch = openingFencePattern.exec(line);
+    const fence = openingMatch?.[1];
+    const infoString = openingMatch?.[2] ?? "";
+    // CommonMark: the info string of a backtick fence may not contain a
+    // backtick. Without this check a one-line span such as "```code```" would
+    // open a fence and swallow everything up to the next closing fence, or
+    // the rest of the document. Tilde fences have no such restriction.
+    if (!fence || (fence.startsWith("`") && infoString.includes("`"))) {
+      maskedLines.push(line);
+      continue;
+    }
+
+    fenceChar = fence.charAt(0);
+    fenceLength = fence.length;
+    blockLines = [line];
+    inFence = true;
+  }
+
+  // An unclosed fence extends to the end of the content.
+  if (inFence) {
+    flushBlock();
+  }
+
+  return {
+    content: maskedLines.join("\n"),
+    codeBlocks,
+    placeholders,
+  };
+}
+
+/**
+ * Masks all inline code spans in `content`, replacing each with a placeholder token.
+ *
+ * A span starts at a run of backticks and ends at the next run of exactly the
+ * same length within the same paragraph. A blank line ends the paragraph, so a
+ * stray backtick never pairs with one in a later paragraph and hides the text
+ * in between. Backtick runs without a matching closer are copied through
+ * unchanged.
+ *
+ * @param content - Markdown content to mask (fenced blocks are expected to be
+ *   masked already by the caller).
+ * @returns The masked content and the original code spans in order of appearance.
+ */
+export function maskInlineCodeSpans(content: string): {
+  content: string;
+  codeSpans: string[];
+} {
+  const codeSpans: string[] = [];
+  const output: string[] = [];
+
+  // A line holding only spaces/tabs (or nothing) separates paragraphs.
+  const paragraphBreak = /\n[ \t]*\r?\n/g;
+  // Index where the paragraph containing the current backtick ends; it is
+  // recomputed lazily, once per paragraph that actually contains a backtick.
+  let paragraphEnd = -1;
+
+  // Length of the backtick run that begins at `start`.
+  const backtickRunLength = (start: number): number => {
+    let end = start;
+    while (end < content.length && content.charAt(end) === "`") {
+      end++;
+    }
+    return end - start;
+  };
+
+  let i = 0;
+  while (i < content.length) {
+    const currentChar = content.charAt(i);
+    if (currentChar !== "`") {
+      output.push(currentChar);
+      i++;
+      continue;
+    }
+
+    if (i >= paragraphEnd) {
+      paragraphBreak.lastIndex = i;
+      const breakMatch = paragraphBreak.exec(content);
+      paragraphEnd = breakMatch ? breakMatch.index : content.length;
+    }
+
+    const openingLength = backtickRunLength(i);
+
+    // Look for a closing run of exactly the same length before the paragraph ends.
+    let scanIndex = i + openingLength;
+    let closingIndex = -1;
+    while (scanIndex < paragraphEnd) {
+      const nextBacktick = content.indexOf("`", scanIndex);
+      if (nextBacktick === -1 || nextBacktick >= paragraphEnd) {
+        break;
+      }
+
+      const closingLength = backtickRunLength(nextBacktick);
+      if (closingLength === openingLength) {
+        closingIndex = nextBacktick;
+        break;
+      }
+
+      scanIndex = nextBacktick + closingLength;
+    }
+
+    if (closingIndex === -1) {
+      // No closer: the backticks are literal text.
+      output.push(content.slice(i, i + openingLength));
+      i += openingLength;
+      continue;
+    }
+
+    const codeSpan = content.slice(i, closingIndex + openingLength);
+    codeSpans.push(codeSpan);
+    output.push(
+      `${CODESPAN_PREFIX}${codeSpans.length - 1}${PLACEHOLDER_SUFFIX}`
+    );
+    i = closingIndex + openingLength;
+  }
+
+  return {
+    content: output.join(""),
+    codeSpans,
+  };
+}
+
+/**
+ * Restores previously masked fenced code blocks and inline code spans.
+ *
+ * Inline spans are restored first and fenced blocks second, so a block
+ * placeholder that ended up inside a restored span is expanded as well.
+ * A placeholder whose index has no stored value is left in the text untouched.
+ *
+ * @param content - Content containing `__CODESPAN_n__` / `__CODEBLOCK_n__` tokens.
+ * @param codeBlocks - Original fenced code blocks, indexed by placeholder number.
+ * @param codeSpans - Original inline code spans, indexed by placeholder number.
+ * @returns The content with every known placeholder replaced by its original text.
+ */
+export function restoreCodeMasks(
+  content: string,
+  codeBlocks: string[],
+  codeSpans: string[]
+): string {
+  // Falling back to the matched token (instead of "") means text that merely
+  // looks like a placeholder, or a span token when no spans were masked, is
+  // never silently deleted. `??` keeps a stored empty string as a valid value.
+  const restoreByIndex = (
+    values: string[],
+    rawIndex: string,
+    token: string
+  ): string => values.at(Number(rawIndex)) ?? token;
+
+  return content
+    .replace(/__CODESPAN_(\d+)__/g, (match: string, index: string) => {
+      return restoreByIndex(codeSpans, index, match);
+    })
+    .replace(/__CODEBLOCK_(\d+)__/g, (match: string, index: string) => {
+      return restoreByIndex(codeBlocks, index, match);
+    });
+}
diff --git a/scripts/notion-fetch/page-ordering.test.ts b/scripts/notion-fetch/page-ordering.test.ts
index 5bb96c83..4d0fa461 100644
--- a/scripts/notion-fetch/page-ordering.test.ts
+++ b/scripts/notion-fetch/page-ordering.test.ts
@@ -103,6 +103,7 @@ vi.mock("./imageProcessor", () => ({
vi.mock("./utils", () => ({
sanitizeMarkdownContent: vi.fn((content) => content),
+ injectExplicitHeadingIds: vi.fn((content) => content),
compressImageToFileWithFallback: vi.fn(),
detectFormatFromBuffer: vi.fn(() => "jpeg"),
formatFromContentType: vi.fn(() => "jpeg"),
diff --git a/scripts/notion-fetch/slugUtils.test.ts b/scripts/notion-fetch/slugUtils.test.ts
new file mode 100644
index 00000000..813b3478
--- /dev/null
+++ b/scripts/notion-fetch/slugUtils.test.ts
@@ -0,0 +1,54 @@
+import { describe, it, expect } from "vitest";
+import { createSafeSlug } from "./slugUtils";
+
+ /**
+  * Unit tests for createSafeSlug: Latin diacritic stripping, hyphen handling,
+  * digits, empty input and CJK preservation.
+  */
+ describe("slugUtils", () => {
+   describe("createSafeSlug", () => {
+     // Each row is [test title, raw input, expected slug]; rows run in order.
+     const cases: ReadonlyArray<readonly [string, string, string]> = [
+       [
+         "should convert basic Latin text to lowercase hyphenated slug",
+         "Hello World",
+         "hello-world",
+       ],
+       [
+         "should strip accented Latin characters",
+         "Título con acentos",
+         "titulo-con-acentos",
+       ],
+       ["should handle Spanish accented characters", "Guía Rápida", "guia-rapida"],
+       ["should handle Portuguese characters", "Instalação", "instalacao"],
+       [
+         "should handle ñ and accented vowels in Spanish words",
+         "Niño & Acción",
+         "nino-accion",
+       ],
+       ["should return an empty string for empty input", "", ""],
+       ["should strip diacritics from accented letters", "éàü", "eau"],
+       ["should preserve numbers in the slug", "FAQ Section 2", "faq-section-2"],
+       [
+         "should collapse multiple spaces and hyphens into a single hyphen",
+         "hello --- world",
+         "hello-world",
+       ],
+       ["should strip leading and trailing hyphens", "--hello--", "hello"],
+       ["should preserve CJK input", "安装指南", "安装指南"],
+       [
+         "should extract both CJK and Latin from mixed input",
+         "安装 Setup 指南",
+         "安装-setup-指南",
+       ],
+     ];
+
+     for (const [title, input, expected] of cases) {
+       it(title, () => {
+         expect(createSafeSlug(input)).toBe(expected);
+       });
+     }
+   });
+ });
diff --git a/scripts/notion-fetch/slugUtils.ts b/scripts/notion-fetch/slugUtils.ts
new file mode 100644
index 00000000..d0f473ab
--- /dev/null
+++ b/scripts/notion-fetch/slugUtils.ts
@@ -0,0 +1,11 @@
+ /**
+  * Converts arbitrary text (for example a page title) into a lowercase,
+  * hyphen-separated slug.
+  *
+  * Processing order:
+  * 1. Diacritics attached to Latin letters are removed ("é" becomes "e").
+  * 2. The text is recomposed (NFC), lowercased and trimmed.
+  * 3. Whitespace runs become a single hyphen.
+  * 4. Every character that is not a Unicode letter, a Unicode number or a
+  *    hyphen is dropped.
+  * 5. Hyphen runs are collapsed and leading/trailing hyphens are removed.
+  *
+  * Only marks that follow a Latin base letter are stripped. Marks that are part
+  * of a letter in other scripts (e.g. the voicing mark in Japanese "ガ" or the
+  * breve in Cyrillic "й") survive and are recomposed, and Hangul syllables are
+  * returned precomposed rather than as decomposed jamo.
+  *
+  * @param text - Source text.
+  * @returns The slug; an empty string when `text` contains no letters or numbers.
+  */
+ export function createSafeSlug(text: string): string {
+   return (
+     text
+       // Decompose so accents become separate combining marks.
+       .normalize("NFD")
+       // Drop combining marks only when they decorate a Latin letter.
+       .replace(/(\p{Script=Latin})\p{M}+/gu, "$1")
+       // Recompose whatever was decomposed but not stripped (kana, Hangul, ...).
+       .normalize("NFC")
+       .toLowerCase()
+       .trim()
+       .replace(/\s+/g, "-")
+       .replace(/[^\p{L}\p{N}-]/gu, "")
+       .replace(/-+/g, "-")
+       .replace(/^-+|-+$/g, "")
+   );
+ }
diff --git a/scripts/notion-fetch/utils.ts b/scripts/notion-fetch/utils.ts
index 4b0a9415..4471e445 100644
--- a/scripts/notion-fetch/utils.ts
+++ b/scripts/notion-fetch/utils.ts
@@ -6,7 +6,10 @@ import { compressImage } from "./imageCompressor";
import { withTimeoutFallback } from "./timeoutUtils";
// Re-export sanitize so callers have a single utils entrypoint
-export { sanitizeMarkdownContent } from "./contentSanitizer";
+export {
+ sanitizeMarkdownContent,
+ injectExplicitHeadingIds,
+} from "./contentSanitizer";
// Fail-open toggle: defaults to true unless explicitly set to 'false'
export const SOFT_FAIL: boolean =
diff --git a/scripts/notion-fetch/verifyExportCoverage.ts b/scripts/notion-fetch/verifyExportCoverage.ts
index 80e4a78e..ade557c9 100644
--- a/scripts/notion-fetch/verifyExportCoverage.ts
+++ b/scripts/notion-fetch/verifyExportCoverage.ts
@@ -4,17 +4,13 @@ import path from "node:path";
import { glob } from "glob";
import { NOTION_PROPERTIES } from "../constants";
+import { createSafeSlug } from "./slugUtils";
type NotionPage = Record;
const EXPORT_FILENAME = "notion_db.json";
-const slugify = (title: string): string =>
- title
- .toLowerCase()
- .replace(/\s+/g, "-")
- .replace(/[^a-z0-9-]/g, "")
- .trim();
+ /** Slug form of a page title; delegates to the shared createSafeSlug helper. */
+ const slugify = (title: string): string => {
+   return createSafeSlug(title);
+ };
const getTitle = (page: NotionPage): string | undefined =>
page?.properties?.[NOTION_PROPERTIES.TITLE]?.title?.[0]?.plain_text;
@@ -50,14 +46,12 @@ export interface VerificationResult {
export function verifyExportCoverage(
exportPath: string = path.resolve(process.cwd(), EXPORT_FILENAME)
): VerificationResult {
- // eslint-disable-next-line security/detect-non-literal-fs-filename
if (!fs.existsSync(exportPath)) {
throw new Error(
`Notion export file not found at ${exportPath}. Run bun notion:export first.`
);
}
- // eslint-disable-next-line security/detect-non-literal-fs-filename
const payload = JSON.parse(fs.readFileSync(exportPath, "utf8"));
const results: NotionPage[] = payload.results ?? [];
const readyPages = results.filter(isReadyToPublish);
diff --git a/scripts/notion-translate/imageStabilization.test.ts b/scripts/notion-translate/imageStabilization.test.ts
index 514a5d92..946c9194 100644
--- a/scripts/notion-translate/imageStabilization.test.ts
+++ b/scripts/notion-translate/imageStabilization.test.ts
@@ -906,7 +906,7 @@ describe("image stabilization in translation pipeline", () => {
expect(mockProcessAndReplaceImages).toHaveBeenCalledWith(
expect.any(String),
- "hllo-wrld-pageid1"
+ "hello-world-pageid1"
);
});
diff --git a/scripts/notion-translate/index.ts b/scripts/notion-translate/index.ts
index 03773771..c2c2c4c2 100644
--- a/scripts/notion-translate/index.ts
+++ b/scripts/notion-translate/index.ts
@@ -38,6 +38,7 @@ import {
validateAndFixRemainingImages,
extractImageMatches,
} from "../notion-fetch/imageReplacer.js";
+import { createSafeSlug } from "../notion-fetch/slugUtils.js";
const LEGACY_SECTION_PROPERTY = "Section";
const PARENT_ITEM_PROPERTY = "Parent item";
@@ -584,11 +585,7 @@ const NOTION_IMAGE_URL_FAMILY_REGEX = new RegExp(
* image filenames remain consistent with markdown filenames.
*/
function generateSafeFilename(title: string, pageId: string): string {
- const baseSlug = title
- .toLowerCase()
- .replace(/\s+/g, "-")
- .replace(/[^a-z0-9-]/g, "")
- .substring(0, MAX_SLUG_LENGTH);
+ const baseSlug = createSafeSlug(title).substring(0, MAX_SLUG_LENGTH);
const stablePageId = pageId.toLowerCase().replace(/[^a-z0-9]/g, "");
const deterministicBase = baseSlug || "untitled";
return `${deterministicBase}-${stablePageId}`;