From d6df31b944347c13a3c5050124eca1edf919e1a3 Mon Sep 17 00:00:00 2001 From: luandro Date: Thu, 19 Mar 2026 17:38:23 -0300 Subject: [PATCH 01/15] feat(slugs): normalize accented slugs and add locale-prefixed link resolution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #164 - Add createSafeSlug() using NFD decomposition to strip diacritics (á→a, é→e, ñ→n, ã→a, ç→c, etc.) from filenames and anchor IDs - Add normalizeInternalDocLinks() to rewrite /docs/ links with the correct locale prefix (/es/docs/..., /pt/docs/...) and slugify path segments and fragments - Add injectExplicitHeadingIds() to append stable {#id} anchors to headings, deduplicated with -1/-2 suffixes, skipping code fences - Replace three inline slugify implementations with createSafeSlug - Fix code fence regex to be line-anchored (prevented heading ID injection inside fenced blocks) - Wrap decodeURIComponent with safeDecode to avoid URIError on percent signs in page titles (e.g. "100% complete") - Add unit tests for slugUtils (12 cases) and linkNormalizer (10 cases) --- scripts/notion-fetch/contentSanitizer.test.ts | 33 ++++ scripts/notion-fetch/contentSanitizer.ts | 78 ++++++++- scripts/notion-fetch/generateBlocks.test.ts | 152 ++++++++++++++++++ scripts/notion-fetch/generateBlocks.ts | 18 ++- scripts/notion-fetch/linkNormalizer.test.ts | 77 +++++++++ scripts/notion-fetch/linkNormalizer.ts | 66 ++++++++ scripts/notion-fetch/slugUtils.test.ts | 54 +++++++ scripts/notion-fetch/slugUtils.ts | 11 ++ scripts/notion-fetch/utils.ts | 5 +- scripts/notion-fetch/verifyExportCoverage.ts | 10 +- scripts/notion-translate/index.ts | 7 +- 11 files changed, 489 insertions(+), 22 deletions(-) create mode 100644 scripts/notion-fetch/linkNormalizer.test.ts create mode 100644 scripts/notion-fetch/linkNormalizer.ts create mode 100644 scripts/notion-fetch/slugUtils.test.ts create mode 100644 scripts/notion-fetch/slugUtils.ts diff --git a/scripts/notion-fetch/contentSanitizer.test.ts b/scripts/notion-fetch/contentSanitizer.test.ts index 5354c120..f1d3315c 100644 --- a/scripts/notion-fetch/contentSanitizer.test.ts +++ b/scripts/notion-fetch/contentSanitizer.test.ts @@ -226,4 +226,37 @@ echo "# Not a heading" }); }); }); + + describe("injectExplicitHeadingIds", () => { + it("should normalize accented headings and append stable duplicate suffixes", () => { + const input = [ + "# Título Único", + "## Título Único", + "### Niño & Acción", + ].join("\n"); + + const result = scriptModule.injectExplicitHeadingIds(input); + + expect(result).toContain("# Título Único {#titulo-unico}"); + expect(result).toContain("## Título Único {#titulo-unico-1}"); + expect(result).toContain("### Niño & Acción {#nino-accion}"); + }); + + it("should preserve existing explicit heading ids and code fences", () => { + const input = [ + "# Encabezado {#custom-id}", + "```md", + "## Código Único", + "```", + "## Otro Título", + ].join("\n"); + + const result = scriptModule.injectExplicitHeadingIds(input); + + expect(result).toContain("# Encabezado {#custom-id}"); + expect(result).toContain("```md\n## Código Único\n```"); + expect(result).toContain("## Otro Título {#otro-titulo}"); + expect(result).not.toContain("## Código Único {#codigo-unico}"); + }); + }); }); diff --git a/scripts/notion-fetch/contentSanitizer.ts b/scripts/notion-fetch/contentSanitizer.ts index f652a60c..070fb1ff 100644 --- a/scripts/notion-fetch/contentSanitizer.ts +++ b/scripts/notion-fetch/contentSanitizer.ts @@ -3,6 +3,8 @@ * that cause MDX compilation errors in Docusaurus. */ +import { createSafeSlug } from "./slugUtils"; + const EMOJI_STYLE_MARKERS = ["display:", "height:", "margin:"]; const isEmojiStyleObject = (snippet: string): boolean => @@ -68,6 +70,80 @@ function fixHeadingHierarchy( return fixedLines.join("\n"); } +function maskCodeFences(content: string): { + content: string; + codeBlocks: string[]; + codeBlockPlaceholders: string[]; +} { + const codeBlocks: string[] = []; + const codeBlockPlaceholders: string[] = []; + + const maskedContent = content.replace( + /^```[^\n]*\n[\s\S]*?^```/gm, + (match) => { + codeBlocks.push(match); + const placeholder = `__CODEBLOCK_${codeBlocks.length - 1}__`; + codeBlockPlaceholders.push(placeholder); + return placeholder; + } + ); + + return { + content: maskedContent, + codeBlocks, + codeBlockPlaceholders, + }; +} + +function restoreCodeFences(content: string, codeBlocks: string[]): string { + return content.replace( + /__CODEBLOCK_(\d+)__/g, + (_match, index) => codeBlocks[Number(index)] + ); +} + +export function injectExplicitHeadingIds(content: string): string { + if (!content) { + return content; + } + + const { + content: maskedContent, + codeBlocks, + codeBlockPlaceholders, + } = maskCodeFences(content); + const headingCounts = new Map(); + + const lines = maskedContent.split("\n"); + const updatedLines = lines.map((line) => { + if ( + codeBlockPlaceholders.some((placeholder) => line.includes(placeholder)) || + /\s\{#[^}]+\}\s*$/.test(line) + ) { + return line; + } + + const headingMatch = line.match(/^(\s{0,3})(#{1,6})\s+(.+?)\s*$/); + if (!headingMatch) { + return line; + } + + const [, leadingWhitespace, hashes, headingText] = headingMatch; + const baseId = createSafeSlug(headingText); + if (!baseId) { + return line; + } + + const currentCount = headingCounts.get(baseId) ?? 0; + headingCounts.set(baseId, currentCount + 1); + const headingId = currentCount === 0 ? baseId : `${baseId}-${currentCount}`; + + return `${leadingWhitespace}${hashes} ${headingText} {#${headingId}}`; + }); + + return restoreCodeFences(updatedLines.join("\n"), codeBlocks); +} + /** * Sanitizes markdown content to fix malformed HTML/JSX tags that cause MDX compilation errors * @param content - The markdown content string @@ -81,7 +157,7 @@ export function sanitizeMarkdownContent(content: string): string { const codeSpans: string[] = []; const codeBlockPlaceholders: string[] = []; - content = content.replace(/```[\s\S]*?```/g, (m) => { + content = content.replace(/^```[^\n]*\n[\s\S]*?^```/gm, (m) => { codeBlocks.push(m); const placeholder = `__CODEBLOCK_${codeBlocks.length - 1}__`; codeBlockPlaceholders.push(placeholder); diff --git a/scripts/notion-fetch/generateBlocks.test.ts b/scripts/notion-fetch/generateBlocks.test.ts index b5b0e88d..c2c937c6 100644 --- a/scripts/notion-fetch/generateBlocks.test.ts +++ b/scripts/notion-fetch/generateBlocks.test.ts @@ -112,6 +112,7 @@ vi.mock("./imageProcessor", () => ({ vi.mock("./utils", () => ({ sanitizeMarkdownContent: vi.fn((content) => content), + injectExplicitHeadingIds: vi.fn((content) => content), compressImageToFileWithFallback: vi.fn(), detectFormatFromBuffer: vi.fn(() => "jpeg"), formatFromContentType: vi.fn(() => "jpeg"), @@ -198,6 +199,7 @@ describe("generateBlocks", () => { let fetchNotionBlocks: Mock; let processImage: Mock; let compressImageToFileWithFallback: Mock; + let injectExplicitHeadingIds: Mock; beforeEach(async () => { restoreEnv = installTestNotionEnv(); @@ -223,6 +225,7 @@ describe("generateBlocks", () => { const utils = await import("./utils"); compressImageToFileWithFallback = utils.compressImageToFileWithFallback as Mock; + injectExplicitHeadingIds = utils.injectExplicitHeadingIds as Mock; // Setup default mock implementations processImage.mockResolvedValue(mockProcessedImageResult); @@ -377,6 +380,155 @@ describe("generateBlocks", () => { }); }); + describe("Localized slug and link normalization", () => { + it("should derive the shared ASCII slug from the grouped title for every locale", async () => { + const { generateBlocks } = await import("./generateBlocks"); + const mockWriteFileSync = fs.writeFileSync as Mock; + + const mainPage = createMockNotionPage({ + id: "main-accented", + title: "Título con acentos", + elementType: "Page", + subItems: ["en-accented", "es-accented", "pt-accented"], + }); + const englishPage = createMockNotionPage({ + id: "en-accented", + title: "Título con acentos", + language: "English", + elementType: "Page", + }); + const spanishPage = createMockNotionPage({ + id: "es-accented", + title: "Título con acentos", + language: "Spanish", + elementType: "Page", + }); + const portuguesePage = createMockNotionPage({ + id: "pt-accented", + title: "Título con acentos", + language: "Portuguese", + elementType: "Page", + }); + + n2m.pageToMarkdown.mockResolvedValue([]); + n2m.toMarkdownString.mockReturnValue({ parent: "Body content" }); + + await generateBlocks( + [mainPage, englishPage, spanishPage, portuguesePage], + vi.fn() + ); + + const markdownPaths = mockWriteFileSync.mock.calls + .map((call) => call[0]) + .filter( + (value): value is string => + typeof value === "string" && value.endsWith(".md") + ); + + expect(markdownPaths).toEqual( + expect.arrayContaining([ + expect.stringContaining("titulo-con-acentos.md"), + expect.stringContaining( + "i18n/pt/docusaurus-plugin-content-docs/current/titulo-con-acentos.md" + ), + expect.stringContaining( + "i18n/es/docusaurus-plugin-content-docs/current/titulo-con-acentos.md" + ), + ]) + ); + }); + + it("should normalize localized internal docs links before writing markdown", async () => { + const { generateBlocks } = await import("./generateBlocks"); + const mockWriteFileSync = fs.writeFileSync as Mock; + + const pageFamily = createMockPageFamily("Página de prueba", "Page"); + n2m.pageToMarkdown.mockResolvedValue([]); + n2m.toMarkdownString + .mockReturnValueOnce({ + parent: + "[doc](/docs/Guía Rápida#Título Uno) [external](https://example.com/Árbol) [relative](./Guía Local#Título)", + }) + .mockReturnValueOnce({ + parent: + "[doc](/docs/Guía Rápida#Título Uno) [nested](/docs/Category Name/Sub Página#Título Dos)", + }) + .mockReturnValueOnce({ + parent: "[doc](/docs/Guía Rápida#Título Uno)", + }); + + await generateBlocks(pageFamily.pages, vi.fn()); + + const markdownWrites = mockWriteFileSync.mock.calls.filter( + (call) => typeof call[0] === "string" && call[0].endsWith(".md") + ); + + const englishOutput = markdownWrites.find( + (call) => + typeof call[0] === "string" && + !call[0].includes("/i18n/") && + call[1].includes("/docs/guia-rapida#titulo-uno") + ); + const portugueseOutput = markdownWrites.find( + (call) => + typeof call[0] === "string" && + call[0].includes("/i18n/pt/") && + call[1].includes("/pt/docs/guia-rapida#titulo-uno") + ); + const spanishOutput = markdownWrites.find( + (call) => + typeof call[0] === "string" && + call[0].includes("/i18n/es/") && + call[1].includes("/es/docs/guia-rapida#titulo-uno") + ); + + expect(englishOutput?.[1]).toContain( + "[doc](/docs/guia-rapida#titulo-uno)" + ); + expect(englishOutput?.[1]).toContain( + "[external](https://example.com/Árbol)" + ); + expect(englishOutput?.[1]).toContain("[relative](./Guía Local#Título)"); + expect(portugueseOutput?.[1]).toContain( + "[nested](/pt/docs/category-name/sub-pagina#titulo-dos)" + ); + expect(spanishOutput?.[1]).toContain( + "[doc](/es/docs/guia-rapida#titulo-uno)" + ); + }); + + it("should pass the de-duplicated content through heading ID injection before writing", async () => { + const { generateBlocks } = await import("./generateBlocks"); + const mockWriteFileSync = fs.writeFileSync as Mock; + + const page = createMockNotionPage({ + id: "heading-page", + title: "Heading Title", + elementType: "Page", + language: "English", + }); + + n2m.pageToMarkdown.mockResolvedValue([]); + n2m.toMarkdownString.mockReturnValue({ + parent: "# Heading Title\n\n## Título Único\nContent body", + }); + injectExplicitHeadingIds.mockImplementation( + (content: string) => `${content}\n` + ); + + await generateBlocks([page], vi.fn()); + + expect(injectExplicitHeadingIds).toHaveBeenCalledWith( + "## Título Único\nContent body" + ); + + const markdownWrite = mockWriteFileSync.mock.calls.find( + (call) => typeof call[0] === "string" && call[0].endsWith(".md") + ); + expect(markdownWrite?.[1]).toContain(""); + }); + }); + describe("Title fallbacks", () => { it("should fallback to legacy Title property when Content elements is missing", async () => { const { generateBlocks } = await import("./generateBlocks"); diff --git a/scripts/notion-fetch/generateBlocks.ts b/scripts/notion-fetch/generateBlocks.ts index 4dda0cd7..94d4ef6d 100644 --- a/scripts/notion-fetch/generateBlocks.ts +++ b/scripts/notion-fetch/generateBlocks.ts @@ -9,7 +9,9 @@ import type { import { n2m } from "../notionClient"; import { NOTION_PROPERTIES } from "../constants"; import chalk from "chalk"; -import { sanitizeMarkdownContent } from "./utils"; +import { sanitizeMarkdownContent, injectExplicitHeadingIds } from "./utils"; +import { createSafeSlug } from "./slugUtils"; +import { normalizeInternalDocLinks } from "./linkNormalizer"; import config from "../../docusaurus.config"; import SpinnerManager from "./spinnerManager"; import { convertCalloutToAdmonition, isCalloutBlock } from "./calloutProcessor"; @@ -528,6 +530,10 @@ async function processSinglePage( emojiCount += result.fallbackEmojiCount; contentHasS3 = result.containsS3; + markdownString.parent = normalizeInternalDocLinks( + markdownString.parent, + lang + ); markdownString.parent = sanitizeMarkdownContent(markdownString.parent); markdownString.parent = ensureBlankLineAfterStandaloneBold( @@ -538,18 +544,19 @@ async function processSinglePage( markdownString.parent, pageTitle ); + const finalContentBody = injectExplicitHeadingIds(contentBody); const sectionFolderForWrite: Record = {}; sectionFolderForWrite[lang] = currentSectionFolderForLang; - const finalDiagnostics = getImageDiagnostics(markdownString.parent ?? ""); + const finalDiagnostics = getImageDiagnostics(finalContentBody ?? ""); contentHasS3 = finalDiagnostics.s3Matches > 0; writeMarkdownFile( filePath, frontmatter, - contentBody, + finalContentBody, pageTitle, pageProcessingIndex - 1, totalPages, @@ -887,10 +894,7 @@ export async function generateBlocks( ? sectionTypeRaw.trim() : String(sectionTypeRaw ?? "").trim(); const normalizedSectionType = sectionTypeString.toLowerCase(); - const filename = title - .toLowerCase() - .replace(/\s+/g, "-") - .replace(/[^a-z0-9-]/g, ""); + const filename = createSafeSlug(title); const orderedLocales = getOrderedLocales(Object.keys(pageByLang.content)); for (const lang of orderedLocales) { diff --git a/scripts/notion-fetch/linkNormalizer.test.ts b/scripts/notion-fetch/linkNormalizer.test.ts new file mode 100644 index 00000000..e9b7c88e --- /dev/null +++ b/scripts/notion-fetch/linkNormalizer.test.ts @@ -0,0 +1,77 @@ +import { describe, it, expect, vi } from "vitest"; + +// Mock the docusaurus config before importing the module under test, +// mirroring the pattern used in generateBlocks.test.ts. +vi.mock("../../docusaurus.config", () => ({ + default: { + i18n: { + locales: ["en", "pt", "es"], + defaultLocale: "en", + }, + }, +})); + +import { normalizeInternalDocLinks } from "./linkNormalizer"; + +describe("linkNormalizer", () => { + describe("normalizeInternalDocLinks", () => { + it("should normalize a docs link for the default locale (en) without a locale prefix", () => { + const input = "[link](/docs/Guía Rápida)"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe("[link](/docs/guia-rapida)"); + }); + + it("should add a locale prefix for a non-default locale (es)", () => { + const input = "[link](/docs/Guía Rápida)"; + const result = normalizeInternalDocLinks(input, "es"); + expect(result).toBe("[link](/es/docs/guia-rapida)"); + }); + + it("should normalize both the path and the fragment", () => { + const input = "[link](/docs/Page#Título Uno)"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe("[link](/docs/page#titulo-uno)"); + }); + + it("should leave external links untouched", () => { + const input = "[link](https://example.com/Árbol)"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe(input); + }); + + it("should leave relative links untouched", () => { + const input = "[link](./local)"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe(input); + }); + + it("should not alter image links (lines starting with !)", () => { + const input = "![img](/docs/Accented Page)"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe(input); + }); + + it("should normalize each path segment in a nested docs path and add a locale prefix", () => { + const input = "[link](/docs/Category Name/Sub Page)"; + const result = normalizeInternalDocLinks(input, "pt"); + expect(result).toBe("[link](/pt/docs/category-name/sub-page)"); + }); + + it("should normalize multiple docs links on a single line", () => { + const input = "[a](/docs/Foo) and [b](/docs/Bar)"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe("[a](/docs/foo) and [b](/docs/bar)"); + }); + + it("should return empty string for empty content", () => { + const result = normalizeInternalDocLinks("", "en"); + expect(result).toBe(""); + }); + + it("should leave plain text with only external links unchanged", () => { + const input = "plain text with [link](https://example.com)"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe(input); + }); + }); +}); diff --git a/scripts/notion-fetch/linkNormalizer.ts b/scripts/notion-fetch/linkNormalizer.ts new file mode 100644 index 00000000..4e384f2e --- /dev/null +++ b/scripts/notion-fetch/linkNormalizer.ts @@ -0,0 +1,66 @@ +import config from "../../docusaurus.config"; +import { createSafeSlug } from "./slugUtils"; + +const DEFAULT_LOCALE = config.i18n.defaultLocale; +const MARKDOWN_LINK_REGEX = /(^|[^!])\[([^\]]+)\]\(([^)\n]+)\)/gm; + +function safeDecode(s: string): string { + try { + return decodeURIComponent(s); + } catch { + return s; + } +} + +function normalizeDocPathname(pathname: string): string { + const hasTrailingSlash = pathname.endsWith("/") && pathname !== "/docs/"; + const rawSegments = pathname + .slice("/docs/".length) + .split("/") + .filter(Boolean); + const normalizedSegments = rawSegments.map((segment) => + createSafeSlug(safeDecode(segment)) + ); + + const normalizedPath = normalizedSegments.length + ? `/docs/${normalizedSegments.join("/")}` + : "/docs"; + + return hasTrailingSlash ? `${normalizedPath}/` : normalizedPath; +} + +function normalizeDocTarget(target: string, lang: string): string { + const [pathname, rawFragment] = target.split("#", 2); + const localePrefix = lang === DEFAULT_LOCALE ? "" : `/${lang}`; + const normalizedPath = normalizeDocPathname(pathname); + const normalizedFragment = rawFragment + ? `#${createSafeSlug(safeDecode(rawFragment))}` + : ""; + + return `${localePrefix}${normalizedPath}${normalizedFragment}`; +} + +export function normalizeInternalDocLinks( + content: string, + lang: string +): string { + if (!content) { + return content; + } + + return content.replace( + MARKDOWN_LINK_REGEX, + (match, prefix: string, text: string, rawTarget: string) => { + const trimmedTarget = rawTarget.trim(); + const titleMatch = trimmedTarget.match(/^(\/docs\/[^\n]*?)(\s+"[^"]*")$/); + const target = titleMatch ? titleMatch[1] : trimmedTarget; + const titleSuffix = titleMatch?.[2] ?? ""; + + if (!target.startsWith("/docs/")) { + return match; + } + + return `${prefix}[${text}](${normalizeDocTarget(target, lang)}${titleSuffix})`; + } + ); +} diff --git a/scripts/notion-fetch/slugUtils.test.ts b/scripts/notion-fetch/slugUtils.test.ts new file mode 100644 index 00000000..35fc0bde --- /dev/null +++ b/scripts/notion-fetch/slugUtils.test.ts @@ -0,0 +1,54 @@ +import { describe, it, expect } from "vitest"; +import { createSafeSlug } from "./slugUtils"; + +describe("slugUtils", () => { + describe("createSafeSlug", () => { + it("should convert basic Latin text to lowercase hyphenated slug", () => { + expect(createSafeSlug("Hello World")).toBe("hello-world"); + }); + + it("should strip accented Latin characters", () => { + expect(createSafeSlug("Título con acentos")).toBe("titulo-con-acentos"); + }); + + it("should handle Spanish accented characters", () => { + expect(createSafeSlug("Guía Rápida")).toBe("guia-rapida"); + }); + + it("should handle Portuguese characters", () => { + expect(createSafeSlug("Instalação")).toBe("instalacao"); + }); + + it("should handle ñ and accented vowels in Spanish words", () => { + expect(createSafeSlug("Niño & Acción")).toBe("nino-accion"); + }); + + it("should return an empty string for empty input", () => { + expect(createSafeSlug("")).toBe(""); + }); + + it("should strip diacritics from accented letters", () => { + expect(createSafeSlug("éàü")).toBe("eau"); + }); + + it("should preserve numbers in the slug", () => { + expect(createSafeSlug("FAQ Section 2")).toBe("faq-section-2"); + }); + + it("should collapse multiple spaces and hyphens into a single hyphen", () => { + expect(createSafeSlug("hello --- world")).toBe("hello-world"); + }); + + it("should strip leading and trailing hyphens", () => { + expect(createSafeSlug("--hello--")).toBe("hello"); + }); + + it("should produce an empty string for CJK-only input (known limitation)", () => { + expect(createSafeSlug("安装指南")).toBe(""); + }); + + it("should extract only the Latin portion from mixed CJK and Latin input", () => { + expect(createSafeSlug("安装 Setup 指南")).toBe("setup"); + }); + }); +}); diff --git a/scripts/notion-fetch/slugUtils.ts b/scripts/notion-fetch/slugUtils.ts new file mode 100644 index 00000000..a4547422 --- /dev/null +++ b/scripts/notion-fetch/slugUtils.ts @@ -0,0 +1,11 @@ +export function createSafeSlug(text: string): string { + return text + .normalize("NFD") + .replace(/\p{M}/gu, "") + .toLowerCase() + .trim() + .replace(/\s+/g, "-") + .replace(/[^a-z0-9-]/g, "") + .replace(/-+/g, "-") + .replace(/^-+|-+$/g, ""); +} diff --git a/scripts/notion-fetch/utils.ts b/scripts/notion-fetch/utils.ts index 4b0a9415..4471e445 100644 --- a/scripts/notion-fetch/utils.ts +++ b/scripts/notion-fetch/utils.ts @@ -6,7 +6,10 @@ import { compressImage } from "./imageCompressor"; import { withTimeoutFallback } from "./timeoutUtils"; // Re-export sanitize so callers have a single utils entrypoint -export { sanitizeMarkdownContent } from "./contentSanitizer"; +export { + sanitizeMarkdownContent, + injectExplicitHeadingIds, +} from "./contentSanitizer"; // Fail-open toggle: defaults to true unless explicitly set to 'false' export const SOFT_FAIL: boolean = diff --git a/scripts/notion-fetch/verifyExportCoverage.ts b/scripts/notion-fetch/verifyExportCoverage.ts index 80e4a78e..ade557c9 100644 --- a/scripts/notion-fetch/verifyExportCoverage.ts +++ b/scripts/notion-fetch/verifyExportCoverage.ts @@ -4,17 +4,13 @@ import path from "node:path"; import { glob } from "glob"; import { NOTION_PROPERTIES } from "../constants"; +import { createSafeSlug } from "./slugUtils"; type NotionPage = Record; const EXPORT_FILENAME = "notion_db.json"; -const slugify = (title: string): string => - title - .toLowerCase() - .replace(/\s+/g, "-") - .replace(/[^a-z0-9-]/g, "") - .trim(); +const slugify = (title: string): string => createSafeSlug(title); const getTitle = (page: NotionPage): string | undefined => page?.properties?.[NOTION_PROPERTIES.TITLE]?.title?.[0]?.plain_text; @@ -50,14 +46,12 @@ export interface VerificationResult { export function verifyExportCoverage( exportPath: string = path.resolve(process.cwd(), EXPORT_FILENAME) ): VerificationResult { - // eslint-disable-next-line security/detect-non-literal-fs-filename if (!fs.existsSync(exportPath)) { throw new Error( `Notion export file not found at ${exportPath}. Run bun notion:export first.` ); } - // eslint-disable-next-line security/detect-non-literal-fs-filename const payload = JSON.parse(fs.readFileSync(exportPath, "utf8")); const results: NotionPage[] = payload.results ?? []; const readyPages = results.filter(isReadyToPublish); diff --git a/scripts/notion-translate/index.ts b/scripts/notion-translate/index.ts index 03773771..c2c2c4c2 100644 --- a/scripts/notion-translate/index.ts +++ b/scripts/notion-translate/index.ts @@ -38,6 +38,7 @@ import { validateAndFixRemainingImages, extractImageMatches, } from "../notion-fetch/imageReplacer.js"; +import { createSafeSlug } from "../notion-fetch/slugUtils.js"; const LEGACY_SECTION_PROPERTY = "Section"; const PARENT_ITEM_PROPERTY = "Parent item"; @@ -584,11 +585,7 @@ const NOTION_IMAGE_URL_FAMILY_REGEX = new RegExp( * image filenames remain consistent with markdown filenames. */ function generateSafeFilename(title: string, pageId: string): string { - const baseSlug = title - .toLowerCase() - .replace(/\s+/g, "-") - .replace(/[^a-z0-9-]/g, "") - .substring(0, MAX_SLUG_LENGTH); + const baseSlug = createSafeSlug(title).substring(0, MAX_SLUG_LENGTH); const stablePageId = pageId.toLowerCase().replace(/[^a-z0-9]/g, ""); const deterministicBase = baseSlug || "untitled"; return `${deterministicBase}-${stablePageId}`; From 37a372e611575ebc414fcff8caaffcadbbb3cdd8 Mon Sep 17 00:00:00 2001 From: luandro Date: Thu, 19 Mar 2026 17:52:29 -0300 Subject: [PATCH 02/15] fix(slugs): flatten nested doc paths and skip links in code blocks - normalizeDocPathname now uses only the last path segment, matching the flat slug shape that buildFrontmatter() generates (slug: /${safeSlug}). Multi-segment paths like /docs/Category/Page previously resolved to /docs/category/page which does not exist, causing 404s. - normalizeInternalDocLinks now masks fenced code blocks and inline code before rewriting links, so Markdown link examples inside code fences are no longer altered. - Update test for nested path to expect flat slug output. - Add tests for code-fence and inline-code protection. --- scripts/notion-fetch/linkNormalizer.test.ts | 18 +++++++++-- scripts/notion-fetch/linkNormalizer.ts | 34 ++++++++++++++++----- 2 files changed, 43 insertions(+), 9 deletions(-) diff --git a/scripts/notion-fetch/linkNormalizer.test.ts b/scripts/notion-fetch/linkNormalizer.test.ts index e9b7c88e..855a6a98 100644 --- a/scripts/notion-fetch/linkNormalizer.test.ts +++ b/scripts/notion-fetch/linkNormalizer.test.ts @@ -51,10 +51,24 @@ describe("linkNormalizer", () => { expect(result).toBe(input); }); - it("should normalize each path segment in a nested docs path and add a locale prefix", () => { + it("should flatten a nested docs path to only the last segment (slug shape)", () => { const input = "[link](/docs/Category Name/Sub Page)"; const result = normalizeInternalDocLinks(input, "pt"); - expect(result).toBe("[link](/pt/docs/category-name/sub-page)"); + // buildFrontmatter() writes slug: /${safeSlug} (single level), so the + // public URL is /pt/docs/sub-page, not /pt/docs/category-name/sub-page. + expect(result).toBe("[link](/pt/docs/sub-page)"); + }); + + it("should not rewrite links inside a fenced code block", () => { + const input = "```\n[example](/docs/Guía Rápida)\n```"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe(input); + }); + + it("should not rewrite links inside inline code", () => { + const input = "Use `[link](/docs/Guía Rápida)` as an example."; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe(input); }); it("should normalize multiple docs links on a single line", () => { diff --git a/scripts/notion-fetch/linkNormalizer.ts b/scripts/notion-fetch/linkNormalizer.ts index 4e384f2e..96348c0f 100644 --- a/scripts/notion-fetch/linkNormalizer.ts +++ b/scripts/notion-fetch/linkNormalizer.ts @@ -18,14 +18,17 @@ function normalizeDocPathname(pathname: string): string { .slice("/docs/".length) .split("/") .filter(Boolean); - const normalizedSegments = rawSegments.map((segment) => - createSafeSlug(safeDecode(segment)) - ); - const normalizedPath = normalizedSegments.length - ? `/docs/${normalizedSegments.join("/")}` - : "/docs"; + // buildFrontmatter() always writes slug: /${safeSlug} (single level), so + // parent folder segments do not appear in the public URL. Only the last + // segment is the actual page slug; preserving parent segments produces a + // path that does not exist and results in a 404. + const lastSegment = rawSegments[rawSegments.length - 1]; + if (!lastSegment) { + return "/docs"; + } + const normalizedPath = `/docs/${createSafeSlug(safeDecode(lastSegment))}`; return hasTrailingSlash ? `${normalizedPath}/` : normalizedPath; } @@ -48,7 +51,20 @@ export function normalizeInternalDocLinks( return content; } - return content.replace( + // Mask code fences and inline code so links inside literal examples are not + // rewritten. Uses the same placeholder strategy as sanitizeMarkdownContent. + const codeBlocks: string[] = []; + const codeSpans: string[] = []; + let masked = content.replace(/^```[^\n]*\n[\s\S]*?^```/gm, (m) => { + codeBlocks.push(m); + return `__CODEBLOCK_${codeBlocks.length - 1}__`; + }); + masked = masked.replace(/`[^`\n]*`/g, (m) => { + codeSpans.push(m); + return `__CODESPAN_${codeSpans.length - 1}__`; + }); + + const result = masked.replace( MARKDOWN_LINK_REGEX, (match, prefix: string, text: string, rawTarget: string) => { const trimmedTarget = rawTarget.trim(); @@ -63,4 +79,8 @@ export function normalizeInternalDocLinks( return `${prefix}[${text}](${normalizeDocTarget(target, lang)}${titleSuffix})`; } ); + + return result + .replace(/__CODESPAN_(\d+)__/g, (_, i) => codeSpans[parseInt(i, 10)]) + .replace(/__CODEBLOCK_(\d+)__/g, (_, i) => codeBlocks[parseInt(i, 10)]); } From d85d885f06c990ad57799f904154edb43e8b3eac Mon Sep 17 00:00:00 2001 From: luandro Date: Thu, 19 Mar 2026 17:57:09 -0300 Subject: [PATCH 03/15] fix(slugs): flatten nested doc paths and skip links in code blocks - normalizeDocPathname now uses only the last path segment, matching the flat slug shape buildFrontmatter() generates (slug: /${safeSlug}). Multi-segment paths like /docs/Category/Page previously resolved to /docs/category/page which does not exist, causing 404s. - normalizeInternalDocLinks now masks fenced code blocks and inline code before rewriting links, so Markdown link examples inside code fences are no longer altered. - Refactor mask/restore logic into dedicated maskCode/restoreCode helpers. - Update test for nested path to expect flat slug output. - Add tests for code-fence and inline-code protection. --- scripts/notion-fetch/linkNormalizer.ts | 57 ++++++++++++++++++-------- 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/scripts/notion-fetch/linkNormalizer.ts b/scripts/notion-fetch/linkNormalizer.ts index 96348c0f..648c5e94 100644 --- a/scripts/notion-fetch/linkNormalizer.ts +++ b/scripts/notion-fetch/linkNormalizer.ts @@ -12,6 +12,44 @@ function safeDecode(s: string): string { } } +function maskCode(content: string): { + maskedContent: string; + codeBlocks: string[]; + codeSpans: string[]; +} { + const codeBlocks: string[] = []; + const codeSpans: string[] = []; + + const maskedBlocks = content.replace( + /^```[^\n]*\n[\s\S]*?^```/gm, + (match) => { + codeBlocks.push(match); + return `__LINK_NORMALIZER_CODEBLOCK_${codeBlocks.length - 1}__`; + } + ); + + const maskedContent = maskedBlocks.replace(/`[^`\n]*`/g, (match) => { + codeSpans.push(match); + return `__LINK_NORMALIZER_CODESPAN_${codeSpans.length - 1}__`; + }); + + return { maskedContent, codeBlocks, codeSpans }; +} + +function restoreCode( + content: string, + codeBlocks: string[], + codeSpans: string[] +): string { + return content + .replace(/__LINK_NORMALIZER_CODESPAN_(\d+)__/g, (_match, index) => { + return codeSpans[Number(index)]; + }) + .replace(/__LINK_NORMALIZER_CODEBLOCK_(\d+)__/g, (_match, index) => { + return codeBlocks[Number(index)]; + }); +} + function normalizeDocPathname(pathname: string): string { const hasTrailingSlash = pathname.endsWith("/") && pathname !== "/docs/"; const rawSegments = pathname @@ -51,20 +89,9 @@ export function normalizeInternalDocLinks( return content; } - // Mask code fences and inline code so links inside literal examples are not - // rewritten. Uses the same placeholder strategy as sanitizeMarkdownContent. - const codeBlocks: string[] = []; - const codeSpans: string[] = []; - let masked = content.replace(/^```[^\n]*\n[\s\S]*?^```/gm, (m) => { - codeBlocks.push(m); - return `__CODEBLOCK_${codeBlocks.length - 1}__`; - }); - masked = masked.replace(/`[^`\n]*`/g, (m) => { - codeSpans.push(m); - return `__CODESPAN_${codeSpans.length - 1}__`; - }); + const { maskedContent, codeBlocks, codeSpans } = maskCode(content); - const result = masked.replace( + const normalizedContent = maskedContent.replace( MARKDOWN_LINK_REGEX, (match, prefix: string, text: string, rawTarget: string) => { const trimmedTarget = rawTarget.trim(); @@ -80,7 +107,5 @@ export function normalizeInternalDocLinks( } ); - return result - .replace(/__CODESPAN_(\d+)__/g, (_, i) => codeSpans[parseInt(i, 10)]) - .replace(/__CODEBLOCK_(\d+)__/g, (_, i) => codeBlocks[parseInt(i, 10)]); + return restoreCode(normalizedContent, codeBlocks, codeSpans); } From c2f22f1e6665c674892110b4f2cb3b861fcdcc7b Mon Sep 17 00:00:00 2001 From: luandro Date: Thu, 19 Mar 2026 18:05:21 -0300 Subject: [PATCH 04/15] fix(slugs): mask indented code fences before link normalization and heading injection Fix code-fence masking regex in linkNormalizer and contentSanitizer to allow leading whitespace (^[ \t]*```) so indented fences (e.g. inside list items or admonitions) are also protected before link normalization and heading ID injection. Add test for indented code fence protection. --- scripts/notion-fetch/contentSanitizer.ts | 4 ++-- scripts/notion-fetch/linkNormalizer.test.ts | 6 ++++++ scripts/notion-fetch/linkNormalizer.ts | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/scripts/notion-fetch/contentSanitizer.ts b/scripts/notion-fetch/contentSanitizer.ts index 070fb1ff..bd783e84 100644 --- a/scripts/notion-fetch/contentSanitizer.ts +++ b/scripts/notion-fetch/contentSanitizer.ts @@ -79,7 +79,7 @@ function maskCodeFences(content: string): { const codeBlockPlaceholders: string[] = []; const maskedContent = content.replace( - /^```[^\n]*\n[\s\S]*?^```/gm, + /^[ \t]*```[^\n]*\n[\s\S]*?^[ \t]*```/gm, (match) => { codeBlocks.push(match); const placeholder = `__CODEBLOCK_${codeBlocks.length - 1}__`; @@ -157,7 +157,7 @@ export function sanitizeMarkdownContent(content: string): string { const codeSpans: string[] = []; const codeBlockPlaceholders: string[] = []; - content = content.replace(/^```[^\n]*\n[\s\S]*?^```/gm, (m) => { + content = content.replace(/^[ \t]*```[^\n]*\n[\s\S]*?^[ \t]*```/gm, (m) => { codeBlocks.push(m); const placeholder = `__CODEBLOCK_${codeBlocks.length - 1}__`; codeBlockPlaceholders.push(placeholder); diff --git a/scripts/notion-fetch/linkNormalizer.test.ts b/scripts/notion-fetch/linkNormalizer.test.ts index 855a6a98..ff93150a 100644 --- a/scripts/notion-fetch/linkNormalizer.test.ts +++ b/scripts/notion-fetch/linkNormalizer.test.ts @@ -65,6 +65,12 @@ describe("linkNormalizer", () => { expect(result).toBe(input); }); + it("should not rewrite links inside an indented fenced code block", () => { + const input = " ```\n [example](/docs/Guía Rápida)\n ```"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe(input); + }); + it("should not rewrite links inside inline code", () => { const input = "Use `[link](/docs/Guía Rápida)` as an example."; const result = normalizeInternalDocLinks(input, "en"); diff --git a/scripts/notion-fetch/linkNormalizer.ts b/scripts/notion-fetch/linkNormalizer.ts index 648c5e94..87303207 100644 --- a/scripts/notion-fetch/linkNormalizer.ts +++ b/scripts/notion-fetch/linkNormalizer.ts @@ -21,7 +21,7 @@ function maskCode(content: string): { const codeSpans: string[] = []; const maskedBlocks = content.replace( - /^```[^\n]*\n[\s\S]*?^```/gm, + /^[ \t]*```[^\n]*\n[\s\S]*?^[ \t]*```/gm, (match) => { codeBlocks.push(match); return `__LINK_NORMALIZER_CODEBLOCK_${codeBlocks.length - 1}__`; From c6ddcaa00c63bfa67d0cc97bd85e1926218ee37d Mon Sep 17 00:00:00 2001 From: CoMapeo Content Bot Date: Thu, 19 Mar 2026 18:37:06 -0300 Subject: [PATCH 05/15] test(notion): align normalization expectations --- bun-tests/vitest-bridge.test.ts | 4 +++- eslint.config.mjs | 2 +- scripts/notion-fetch/__tests__/retry-loop-behavior.test.ts | 1 + scripts/notion-fetch/generateBlocks.test.ts | 2 +- scripts/notion-fetch/page-ordering.test.ts | 1 + scripts/notion-translate/imageStabilization.test.ts | 2 +- 6 files changed, 8 insertions(+), 4 deletions(-) diff --git a/bun-tests/vitest-bridge.test.ts b/bun-tests/vitest-bridge.test.ts index a096b2fb..8e57ab74 100644 --- a/bun-tests/vitest-bridge.test.ts +++ b/bun-tests/vitest-bridge.test.ts @@ -24,5 +24,7 @@ test( () => { runVitest(); }, - { timeout: 120_000 } + // The full Vitest suite can take just under two minutes on this repo, and + // Bun's own test harness adds enough overhead that 120s is too tight. + { timeout: 300_000 } ); diff --git a/eslint.config.mjs b/eslint.config.mjs index d03fda21..51a77904 100644 --- a/eslint.config.mjs +++ b/eslint.config.mjs @@ -32,7 +32,7 @@ const eslintConfig = [ // Docusaurus specific configurations { files: ["**/*.{js,mjs,cjs,ts,jsx,tsx}"], - ignores: ["scripts/**", "api-server/**"], // Ignore scripts and api-server directories for docusaurus rules + ignores: ["scripts/**", "api-server/**", "bun-tests/**"], // Ignore non-Docusaurus runtime directories for docusaurus/react rules plugins: { "@docusaurus": docusaurusPlugin, react: pluginReact, diff --git a/scripts/notion-fetch/__tests__/retry-loop-behavior.test.ts b/scripts/notion-fetch/__tests__/retry-loop-behavior.test.ts index 61cdb372..622c6456 100644 --- a/scripts/notion-fetch/__tests__/retry-loop-behavior.test.ts +++ b/scripts/notion-fetch/__tests__/retry-loop-behavior.test.ts @@ -93,6 +93,7 @@ vi.mock("../imageProcessor", () => ({ vi.mock("../utils", () => ({ sanitizeMarkdownContent: vi.fn((content) => content), + injectExplicitHeadingIds: vi.fn((content) => content), compressImageToFileWithFallback: vi.fn().mockResolvedValue({ finalSize: 512, usedFallback: false, diff --git a/scripts/notion-fetch/generateBlocks.test.ts b/scripts/notion-fetch/generateBlocks.test.ts index c2c937c6..5d131524 100644 --- a/scripts/notion-fetch/generateBlocks.test.ts +++ b/scripts/notion-fetch/generateBlocks.test.ts @@ -490,7 +490,7 @@ describe("generateBlocks", () => { ); expect(englishOutput?.[1]).toContain("[relative](./Guía Local#Título)"); expect(portugueseOutput?.[1]).toContain( - "[nested](/pt/docs/category-name/sub-pagina#titulo-dos)" + "[nested](/pt/docs/sub-pagina#titulo-dos)" ); expect(spanishOutput?.[1]).toContain( "[doc](/es/docs/guia-rapida#titulo-uno)" diff --git a/scripts/notion-fetch/page-ordering.test.ts b/scripts/notion-fetch/page-ordering.test.ts index 5bb96c83..4d0fa461 100644 --- a/scripts/notion-fetch/page-ordering.test.ts +++ b/scripts/notion-fetch/page-ordering.test.ts @@ -103,6 +103,7 @@ vi.mock("./imageProcessor", () => ({ vi.mock("./utils", () => ({ sanitizeMarkdownContent: vi.fn((content) => content), + injectExplicitHeadingIds: vi.fn((content) => content), compressImageToFileWithFallback: vi.fn(), detectFormatFromBuffer: vi.fn(() => "jpeg"), formatFromContentType: vi.fn(() => "jpeg"), diff --git a/scripts/notion-translate/imageStabilization.test.ts b/scripts/notion-translate/imageStabilization.test.ts index 514a5d92..946c9194 100644 --- a/scripts/notion-translate/imageStabilization.test.ts +++ b/scripts/notion-translate/imageStabilization.test.ts @@ -906,7 +906,7 @@ describe("image stabilization in translation pipeline", () => { expect(mockProcessAndReplaceImages).toHaveBeenCalledWith( expect.any(String), - "hllo-wrld-pageid1" + "hello-world-pageid1" ); }); From 62724316e34d713cd8b22fa991b0259560aee104 Mon Sep 17 00:00:00 2001 From: luandro Date: Thu, 19 Mar 2026 19:02:20 -0300 Subject: [PATCH 06/15] fix(notion-fetch): handle explicit heading IDs and empty filenames Co-authored-by: Junie --- scripts/notion-fetch/contentSanitizer.ts | 10 ++++++++-- scripts/notion-fetch/generateBlocks.ts | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/scripts/notion-fetch/contentSanitizer.ts b/scripts/notion-fetch/contentSanitizer.ts index bd783e84..d772fa3b 100644 --- a/scripts/notion-fetch/contentSanitizer.ts +++ b/scripts/notion-fetch/contentSanitizer.ts @@ -117,12 +117,18 @@ export function injectExplicitHeadingIds(content: string): string { const lines = maskedContent.split("\n"); const updatedLines = lines.map((line) => { if ( - codeBlockPlaceholders.some((placeholder) => line.includes(placeholder)) || - /\s\{#[^}]+\}\s*$/.test(line) + codeBlockPlaceholders.some((placeholder) => line.includes(placeholder)) ) { return line; } + const explicitIdMatch = line.match(/\s\{#([^}]+)\}\s*$/); + if (explicitIdMatch) { + const explicitId = explicitIdMatch[1]; + headingCounts.set(explicitId, (headingCounts.get(explicitId) ?? 0) + 1); + return line; + } + const headingMatch = line.match(/^(\s{0,3})(#{1,6})\s+(.+?)\s*$/); if (!headingMatch) { return line; diff --git a/scripts/notion-fetch/generateBlocks.ts b/scripts/notion-fetch/generateBlocks.ts index 94d4ef6d..e514676f 100644 --- a/scripts/notion-fetch/generateBlocks.ts +++ b/scripts/notion-fetch/generateBlocks.ts @@ -894,7 +894,7 @@ export async function generateBlocks( ? sectionTypeRaw.trim() : String(sectionTypeRaw ?? "").trim(); const normalizedSectionType = sectionTypeString.toLowerCase(); - const filename = createSafeSlug(title); + const filename = createSafeSlug(title) || "untitled"; const orderedLocales = getOrderedLocales(Object.keys(pageByLang.content)); for (const lang of orderedLocales) { From ef855789ff28c4a257dde261d5d6ce1b72c82bf0 Mon Sep 17 00:00:00 2001 From: luandro Date: Thu, 19 Mar 2026 21:38:17 -0300 Subject: [PATCH 07/15] fix(slugs): preserve CJK and Unicode letters in slug generation Replace ASCII-only regex with Unicode property escapes (\p{L}\p{N}) so CJK and accented characters are retained in slugs instead of stripped. Update tests to reflect corrected behavior. Extend ESLint config to cover bun-tests/. --- eslint.config.mjs | 6 +++++- scripts/notion-fetch/slugUtils.test.ts | 8 ++++---- scripts/notion-fetch/slugUtils.ts | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/eslint.config.mjs b/eslint.config.mjs index 51a77904..83430e4d 100644 --- a/eslint.config.mjs +++ b/eslint.config.mjs @@ -74,7 +74,11 @@ const eslintConfig = [ // Scripts and API server specific configurations { - files: ["scripts/**/*.{js,mjs,cjs,ts}", "api-server/**/*.{js,mjs,cjs,ts}"], + files: [ + "scripts/**/*.{js,mjs,cjs,ts}", + "api-server/**/*.{js,mjs,cjs,ts}", + "bun-tests/**/*.{js,mjs,cjs,ts}", + ], plugins: { import: importPlugin, promise: promisePlugin, diff --git a/scripts/notion-fetch/slugUtils.test.ts b/scripts/notion-fetch/slugUtils.test.ts index 35fc0bde..813b3478 100644 --- a/scripts/notion-fetch/slugUtils.test.ts +++ b/scripts/notion-fetch/slugUtils.test.ts @@ -43,12 +43,12 @@ describe("slugUtils", () => { expect(createSafeSlug("--hello--")).toBe("hello"); }); - it("should produce an empty string for CJK-only input (known limitation)", () => { - expect(createSafeSlug("安装指南")).toBe(""); + it("should preserve CJK input", () => { + expect(createSafeSlug("安装指南")).toBe("安装指南"); }); - it("should extract only the Latin portion from mixed CJK and Latin input", () => { - expect(createSafeSlug("安装 Setup 指南")).toBe("setup"); + it("should extract both CJK and Latin from mixed input", () => { + expect(createSafeSlug("安装 Setup 指南")).toBe("安装-setup-指南"); }); }); }); diff --git a/scripts/notion-fetch/slugUtils.ts b/scripts/notion-fetch/slugUtils.ts index a4547422..d0f473ab 100644 --- a/scripts/notion-fetch/slugUtils.ts +++ b/scripts/notion-fetch/slugUtils.ts @@ -5,7 +5,7 @@ export function createSafeSlug(text: string): string { .toLowerCase() .trim() .replace(/\s+/g, "-") - .replace(/[^a-z0-9-]/g, "") + .replace(/[^\p{L}\p{N}-]/gu, "") .replace(/-+/g, "-") .replace(/^-+|-+$/g, ""); } From 8f9ea2790e5828ff6d63d823960e853016e2a93c Mon Sep 17 00:00:00 2001 From: luandro Date: Fri, 20 Mar 2026 08:27:39 -0300 Subject: [PATCH 08/15] fix(notion-fetch): align code-fence regex with CommonMark and fix heading counter for explicit IDs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace `[ \t]*` with ` {0,3}` in code-fence masks across contentSanitizer and linkNormalizer, matching CommonMark's 0–3 space rule for fenced blocks - Register the text-derived baseId in headingCounts when a heading already carries an explicit {#id}, preventing incorrect -0 suffixes on subsequent duplicate headings - Suppress security/detect-non-literal-fs-filename ESLint warnings in verifyExportCoverage where the path parameter is already validated --- scripts/notion-fetch/contentSanitizer.ts | 19 +++++++++++++++++-- scripts/notion-fetch/linkNormalizer.ts | 2 +- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/scripts/notion-fetch/contentSanitizer.ts b/scripts/notion-fetch/contentSanitizer.ts index d772fa3b..5efb11e0 100644 --- a/scripts/notion-fetch/contentSanitizer.ts +++ b/scripts/notion-fetch/contentSanitizer.ts @@ -79,7 +79,7 @@ function maskCodeFences(content: string): { const codeBlockPlaceholders: string[] = []; const maskedContent = content.replace( - /^[ \t]*```[^\n]*\n[\s\S]*?^[ \t]*```/gm, + /^ {0,3}```[^\n]*\n[\s\S]*?^ {0,3}```/gm, (match) => { codeBlocks.push(match); const placeholder = `__CODEBLOCK_${codeBlocks.length - 1}__`; @@ -122,6 +122,21 @@ export function injectExplicitHeadingIds(content: string): string { return line; } + const fullMatch = line.match( + /^(\s{0,3})(#{1,6})\s+(.+?)\s*\{#([^}]+)\}\s*$/ + ); + if (fullMatch) { + const [, , , headingText, explicitId] = fullMatch; + const baseId = createSafeSlug(headingText); + if (baseId) { + headingCounts.set(baseId, (headingCounts.get(baseId) ?? 0) + 1); + } + if (explicitId !== baseId) { + headingCounts.set(explicitId, (headingCounts.get(explicitId) ?? 0) + 1); + } + return line; + } + const explicitIdMatch = line.match(/\s\{#([^}]+)\}\s*$/); if (explicitIdMatch) { const explicitId = explicitIdMatch[1]; @@ -163,7 +178,7 @@ export function sanitizeMarkdownContent(content: string): string { const codeSpans: string[] = []; const codeBlockPlaceholders: string[] = []; - content = content.replace(/^[ \t]*```[^\n]*\n[\s\S]*?^[ \t]*```/gm, (m) => { + content = content.replace(/^ {0,3}```[^\n]*\n[\s\S]*?^ {0,3}```/gm, (m) => { codeBlocks.push(m); const placeholder = `__CODEBLOCK_${codeBlocks.length - 1}__`; codeBlockPlaceholders.push(placeholder); diff --git a/scripts/notion-fetch/linkNormalizer.ts b/scripts/notion-fetch/linkNormalizer.ts index 87303207..0d2a864c 100644 --- a/scripts/notion-fetch/linkNormalizer.ts +++ b/scripts/notion-fetch/linkNormalizer.ts @@ -21,7 +21,7 @@ function maskCode(content: string): { const codeSpans: string[] = []; const maskedBlocks = content.replace( - /^[ \t]*```[^\n]*\n[\s\S]*?^[ \t]*```/gm, + /^ {0,3}```[^\n]*\n[\s\S]*?^ {0,3}```/gm, (match) => { codeBlocks.push(match); return `__LINK_NORMALIZER_CODEBLOCK_${codeBlocks.length - 1}__`; From 6c80e00b080e87220ad49a84d757a890686289ea Mon Sep 17 00:00:00 2001 From: luandro Date: Fri, 20 Mar 2026 08:48:58 -0300 Subject: [PATCH 09/15] fix(notion-fetch): avoid heading ID collisions with explicit or natural slugs Heading ID generator now skips IDs already claimed by explicit headings or naturally-occurring slugs, preventing duplicate anchors. --- scripts/notion-fetch/contentSanitizer.test.ts | 13 +++++++++++++ scripts/notion-fetch/contentSanitizer.ts | 15 ++++++++++++--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/scripts/notion-fetch/contentSanitizer.test.ts b/scripts/notion-fetch/contentSanitizer.test.ts index f1d3315c..cd52bc9d 100644 --- a/scripts/notion-fetch/contentSanitizer.test.ts +++ b/scripts/notion-fetch/contentSanitizer.test.ts @@ -258,5 +258,18 @@ echo "# Not a heading" expect(result).toContain("## Otro Título {#otro-titulo}"); expect(result).not.toContain("## Código Único {#codigo-unico}"); }); + + it("should avoid collisions between auto-incremented and explicit IDs", () => { + const input = ["## Título", "## Heading {#titulo-1}", "## Título"].join( + "\n" + ); + + const result = scriptModule.injectExplicitHeadingIds(input); + + expect(result).toContain("## Título {#titulo}"); + expect(result).toContain("## Heading {#titulo-1}"); + // The second "Título" must NOT get titulo-1 (already claimed), should get titulo-2 + expect(result).toContain("## Título {#titulo-2}"); + }); }); }); diff --git a/scripts/notion-fetch/contentSanitizer.ts b/scripts/notion-fetch/contentSanitizer.ts index 5efb11e0..b51d0bcf 100644 --- a/scripts/notion-fetch/contentSanitizer.ts +++ b/scripts/notion-fetch/contentSanitizer.ts @@ -155,9 +155,18 @@ export function injectExplicitHeadingIds(content: string): string { return line; } - const currentCount = headingCounts.get(baseId) ?? 0; - headingCounts.set(baseId, currentCount + 1); - const headingId = currentCount === 0 ? baseId : `${baseId}-${currentCount}`; + let counter = headingCounts.get(baseId) ?? 0; + let headingId = counter === 0 ? baseId : `${baseId}-${counter}`; + // Skip IDs already claimed by explicit headings or natural slugs + while (counter > 0 && headingCounts.has(headingId)) { + counter++; + headingId = `${baseId}-${counter}`; + } + headingCounts.set(baseId, counter + 1); + // Also register the generated ID so future headings won't collide with it + if (headingId !== baseId) { + headingCounts.set(headingId, (headingCounts.get(headingId) ?? 0) + 1); + } return `${leadingWhitespace}${hashes} ${headingText} {#${headingId}}`; }); From ebb61734a6c19fcdb5feba8d6f7f7a0d1de68b8f Mon Sep 17 00:00:00 2001 From: luandro Date: Fri, 20 Mar 2026 09:06:27 -0300 Subject: [PATCH 10/15] docs: add CHANGELOG.md for PR 170 Co-authored-by: Junie --- CHANGELOG.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..5aa7d68c --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,17 @@ +# Changelog - PR 170 + +## ✨ New Features + +- **Slug Normalization**: Accented slugs are now normalized and locale-prefixed link references are supported. + +## 🐛 Fixes + +- **Doc Paths**: Flattened nested document paths. +- **Link Normalization**: Links inside code blocks and indented code fences are now properly skipped during link normalization. +- **Heading IDs**: Explicit heading IDs and empty filenames are handled correctly to prevent heading ID collisions. +- **Slug Generation**: Preserved CJK and Unicode letters in slug generation. +- **Code Fences**: Aligned code-fence regex with CommonMark standard. + +## 🧪 Testing + +- **Normalization**: Aligned tests with new normalization expectations. From 9145ce4a878f877fc47ea9dbb3ea1f123aed52b0 Mon Sep 17 00:00:00 2001 From: luandro Date: Thu, 26 Mar 2026 07:25:33 -0300 Subject: [PATCH 11/15] fix(notion-fetch): harden link normalization masking --- scripts/notion-fetch/linkNormalizer.test.ts | 18 +++ scripts/notion-fetch/linkNormalizer.ts | 124 +++++++++++++++++--- 2 files changed, 126 insertions(+), 16 deletions(-) diff --git a/scripts/notion-fetch/linkNormalizer.test.ts b/scripts/notion-fetch/linkNormalizer.test.ts index ff93150a..75b54d4e 100644 --- a/scripts/notion-fetch/linkNormalizer.test.ts +++ b/scripts/notion-fetch/linkNormalizer.test.ts @@ -71,12 +71,30 @@ describe("linkNormalizer", () => { expect(result).toBe(input); }); + it("should not rewrite links inside a tilde fenced code block", () => { + const input = "~~~\n[example](/docs/Guía Rápida)\n~~~"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe(input); + }); + + it("should not rewrite links inside an indented tilde fenced code block", () => { + const input = " ~~~\n [example](/docs/Guía Rápida)\n ~~~"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe(input); + }); + it("should not rewrite links inside inline code", () => { const input = "Use `[link](/docs/Guía Rápida)` as an example."; const result = normalizeInternalDocLinks(input, "en"); expect(result).toBe(input); }); + it("should not rewrite links inside multi-backtick inline code", () => { + const input = "Use ``[link](/docs/Guía Rápida)`` as an example."; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe(input); + }); + it("should normalize multiple docs links on a single line", () => { const input = "[a](/docs/Foo) and [b](/docs/Bar)"; const result = normalizeInternalDocLinks(input, "en"); diff --git a/scripts/notion-fetch/linkNormalizer.ts b/scripts/notion-fetch/linkNormalizer.ts index 0d2a864c..0ee01c08 100644 --- a/scripts/notion-fetch/linkNormalizer.ts +++ b/scripts/notion-fetch/linkNormalizer.ts @@ -12,28 +12,113 @@ function safeDecode(s: string): string { } } -function maskCode(content: string): { +function maskFencedCodeBlocks(content: string): { maskedContent: string; codeBlocks: string[]; - codeSpans: string[]; } { const codeBlocks: string[] = []; + const lines = content.split("\n"); + const output: string[] = []; + + let inFence = false; + let fenceChar = ""; + let fenceLength = 0; + let fencedBlock: string[] = []; + + for (const line of lines) { + if (!inFence) { + const openMatch = /^ {0,3}(`{3,}|~{3,})(.*)$/.exec(line); + if (openMatch) { + inFence = true; + fenceChar = openMatch[1][0]; + fenceLength = openMatch[1].length; + fencedBlock = [line]; + continue; + } + + output.push(line); + continue; + } + + fencedBlock.push(line); + + const closeMatch = /^ {0,3}([`~]{3,})\s*$/.exec(line); + if ( + closeMatch && + closeMatch[1][0] === fenceChar && + closeMatch[1].length >= fenceLength + ) { + codeBlocks.push(fencedBlock.join("\n")); + output.push(`__LINK_NORMALIZER_CODEBLOCK_${codeBlocks.length - 1}__`); + inFence = false; + fenceChar = ""; + fenceLength = 0; + fencedBlock = []; + } + } + + if (inFence) { + output.push(fencedBlock.join("\n")); + } + + return { maskedContent: output.join("\n"), codeBlocks }; +} + +function maskInlineCode(content: string): { + maskedContent: string; + codeSpans: string[]; +} { const codeSpans: string[] = []; + const output: string[] = []; + + let index = 0; - const maskedBlocks = content.replace( - /^ {0,3}```[^\n]*\n[\s\S]*?^ {0,3}```/gm, - (match) => { - codeBlocks.push(match); - return `__LINK_NORMALIZER_CODEBLOCK_${codeBlocks.length - 1}__`; + while (index < content.length) { + const char = content.charAt(index); + if (char !== "`") { + output.push(char); + index++; + continue; } - ); - const maskedContent = maskedBlocks.replace(/`[^`\n]*`/g, (match) => { - codeSpans.push(match); - return `__LINK_NORMALIZER_CODESPAN_${codeSpans.length - 1}__`; - }); + let openerLength = 1; + while (content.charAt(index + openerLength) === "`") { + openerLength++; + } + + let cursor = index + openerLength; + let closingIndex = -1; + while (cursor < content.length) { + if (content.charAt(cursor) !== "`") { + cursor++; + continue; + } + + let runLength = 1; + while (content.charAt(cursor + runLength) === "`") { + runLength++; + } + + if (runLength === openerLength) { + closingIndex = cursor; + break; + } - return { maskedContent, codeBlocks, codeSpans }; + cursor += runLength; + } + + if (closingIndex === -1) { + output.push(content.slice(index)); + break; + } + + const codeSpan = content.slice(index, closingIndex + openerLength); + codeSpans.push(codeSpan); + output.push(`__LINK_NORMALIZER_CODESPAN_${codeSpans.length - 1}__`); + index = closingIndex + openerLength; + } + + return { maskedContent: output.join(""), codeSpans }; } function restoreCode( @@ -41,12 +126,17 @@ function restoreCode( codeBlocks: string[], codeSpans: string[] ): string { + const restoreByIndex = (values: string[], rawIndex: string) => { + const index = Number(rawIndex); + return Number.isInteger(index) ? (values.at(index) ?? "") : ""; + }; + return content .replace(/__LINK_NORMALIZER_CODESPAN_(\d+)__/g, (_match, index) => { - return codeSpans[Number(index)]; + return restoreByIndex(codeSpans, index); }) .replace(/__LINK_NORMALIZER_CODEBLOCK_(\d+)__/g, (_match, index) => { - return codeBlocks[Number(index)]; + return restoreByIndex(codeBlocks, index); }); } @@ -89,7 +179,9 @@ export function normalizeInternalDocLinks( return content; } - const { maskedContent, codeBlocks, codeSpans } = maskCode(content); + const { maskedContent: maskedBlocks, codeBlocks } = + maskFencedCodeBlocks(content); + const { maskedContent, codeSpans } = maskInlineCode(maskedBlocks); const normalizedContent = maskedContent.replace( MARKDOWN_LINK_REGEX, From 513c1a538538a56ce9ce74250d4b81afc52a16b6 Mon Sep 17 00:00:00 2001 From: luandro Date: Thu, 26 Mar 2026 07:29:32 -0300 Subject: [PATCH 12/15] fix(notion-fetch): harden content sanitization masking --- scripts/notion-fetch/contentSanitizer.test.ts | 33 +++ scripts/notion-fetch/contentSanitizer.ts | 244 ++++++++++++++---- 2 files changed, 232 insertions(+), 45 deletions(-) diff --git a/scripts/notion-fetch/contentSanitizer.test.ts b/scripts/notion-fetch/contentSanitizer.test.ts index cd52bc9d..cba54ea7 100644 --- a/scripts/notion-fetch/contentSanitizer.test.ts +++ b/scripts/notion-fetch/contentSanitizer.test.ts @@ -50,6 +50,18 @@ describe("contentSanitizer", () => { expect(result).toBe(input); // Should remain unchanged }); + it("should preserve tilde fenced code blocks", () => { + const input = "~~~md\nconst obj = { key: 'value' };\n~~~"; + const result = scriptModule.sanitizeMarkdownContent(input); + expect(result).toBe(input); + }); + + it("should preserve multi-backtick inline code spans", () => { + const input = "Use ```` and ``{foo}``."; + const result = scriptModule.sanitizeMarkdownContent(input); + expect(result).toBe(input); + }); + it("should fix malformed patterns", () => { const input = "Check for details."; const result = scriptModule.sanitizeMarkdownContent(input); @@ -259,6 +271,18 @@ echo "# Not a heading" expect(result).not.toContain("## Código Único {#codigo-unico}"); }); + it("should preserve headings inside tilde fenced code blocks", () => { + const input = ["~~~md", "## Código Único", "~~~", "## Otro Título"].join( + "\n" + ); + + const result = scriptModule.injectExplicitHeadingIds(input); + + expect(result).toContain("~~~md\n## Código Único\n~~~"); + expect(result).toContain("## Otro Título {#otro-titulo}"); + expect(result).not.toContain("## Código Único {#codigo-unico}"); + }); + it("should avoid collisions between auto-incremented and explicit IDs", () => { const input = ["## Título", "## Heading {#titulo-1}", "## Título"].join( "\n" @@ -271,5 +295,14 @@ echo "# Not a heading" // The second "Título" must NOT get titulo-1 (already claimed), should get titulo-2 expect(result).toContain("## Título {#titulo-2}"); }); + + it("should reserve later explicit ids before assigning earlier auto-generated headings", () => { + const input = ["## My Id", "## Custom {#my-id}"].join("\n"); + + const result = scriptModule.injectExplicitHeadingIds(input); + + expect(result).toContain("## My Id {#my-id-1}"); + expect(result).toContain("## Custom {#my-id}"); + }); }); }); diff --git a/scripts/notion-fetch/contentSanitizer.ts b/scripts/notion-fetch/contentSanitizer.ts index b51d0bcf..e5857970 100644 --- a/scripts/notion-fetch/contentSanitizer.ts +++ b/scripts/notion-fetch/contentSanitizer.ts @@ -70,36 +70,182 @@ function fixHeadingHierarchy( return fixedLines.join("\n"); } -function maskCodeFences(content: string): { +function createCodeBlockMasker(content: string): { content: string; codeBlocks: string[]; codeBlockPlaceholders: string[]; } { + const lines = content.split("\n"); const codeBlocks: string[] = []; const codeBlockPlaceholders: string[] = []; + const maskedLines: string[] = []; + + let inFence = false; + let fenceChar = ""; + let fenceLength = 0; + let blockLines: string[] = []; + + for (const line of lines) { + if (!inFence) { + const openingMatch = line.match(/^ {0,3}(`{3,}|~{3,})(.*)$/); - const maskedContent = content.replace( - /^ {0,3}```[^\n]*\n[\s\S]*?^ {0,3}```/gm, - (match) => { - codeBlocks.push(match); + if (!openingMatch) { + maskedLines.push(line); + continue; + } + + const fence = openingMatch[1]; + fenceChar = fence[0]; + fenceLength = fence.length; + blockLines = [line]; + inFence = true; + continue; + } + + blockLines.push(line); + + if (isClosingFenceLine(line, fenceChar, fenceLength)) { + codeBlocks.push(blockLines.join("\n")); const placeholder = `__CODEBLOCK_${codeBlocks.length - 1}__`; codeBlockPlaceholders.push(placeholder); - return placeholder; + maskedLines.push(placeholder); + inFence = false; + blockLines = []; } - ); + } + + if (inFence) { + codeBlocks.push(blockLines.join("\n")); + const placeholder = `__CODEBLOCK_${codeBlocks.length - 1}__`; + codeBlockPlaceholders.push(placeholder); + maskedLines.push(placeholder); + } return { - content: maskedContent, + content: maskedLines.join("\n"), codeBlocks, codeBlockPlaceholders, }; } +function isClosingFenceLine( + line: string, + fenceChar: string, + fenceLength: number +): boolean { + let i = 0; + + while (i < line.length && line.charAt(i) === " ") { + i++; + } + + if (i > 3) { + return false; + } + + let fenceCount = 0; + while (i < line.length && line.charAt(i) === fenceChar) { + fenceCount++; + i++; + } + + if (fenceCount < fenceLength) { + return false; + } + + while (i < line.length) { + if (line.charAt(i) !== " " && line.charAt(i) !== "\t") { + return false; + } + i++; + } + + return true; +} + +function maskCodeFences(content: string): { + content: string; + codeBlocks: string[]; + codeBlockPlaceholders: string[]; +} { + return createCodeBlockMasker(content); +} + function restoreCodeFences(content: string, codeBlocks: string[]): string { - return content.replace( - /__CODEBLOCK_(\d+)__/g, - (_match, index) => codeBlocks[Number(index)] - ); + let restoredContent = content; + for (const [index, codeBlock] of codeBlocks.entries()) { + restoredContent = restoredContent.replaceAll( + `__CODEBLOCK_${index}__`, + codeBlock + ); + } + return restoredContent; +} + +function maskInlineCodeSpans(content: string): { + content: string; + codeSpans: string[]; +} { + const codeSpans: string[] = []; + const output: string[] = []; + + let i = 0; + while (i < content.length) { + const currentChar = content.charAt(i); + if (currentChar !== "`") { + output.push(currentChar); + i++; + continue; + } + + let openingLength = 0; + while ( + i + openingLength < content.length && + content.charAt(i + openingLength) === "`" + ) { + openingLength++; + } + + let scanIndex = i + openingLength; + let closingIndex = -1; + while (scanIndex < content.length) { + const nextBacktick = content.indexOf("`", scanIndex); + if (nextBacktick === -1) { + break; + } + + let closingLength = 0; + while ( + nextBacktick + closingLength < content.length && + content.charAt(nextBacktick + closingLength) === "`" + ) { + closingLength++; + } + + if (closingLength === openingLength) { + closingIndex = nextBacktick; + break; + } + + scanIndex = nextBacktick + closingLength; + } + + if (closingIndex === -1) { + output.push(content.slice(i, i + openingLength)); + i += openingLength; + continue; + } + + const codeSpan = content.slice(i, closingIndex + openingLength); + codeSpans.push(codeSpan); + output.push(`__CODESPAN_${codeSpans.length - 1}__`); + i = closingIndex + openingLength; + } + + return { + content: output.join(""), + codeSpans, + }; } export function injectExplicitHeadingIds(content: string): string { @@ -112,14 +258,15 @@ export function injectExplicitHeadingIds(content: string): string { codeBlocks, codeBlockPlaceholders, } = maskCodeFences(content); + const reservedIds = new Set(); const headingCounts = new Map(); const lines = maskedContent.split("\n"); - const updatedLines = lines.map((line) => { + for (const line of lines) { if ( codeBlockPlaceholders.some((placeholder) => line.includes(placeholder)) ) { - return line; + continue; } const fullMatch = line.match( @@ -129,18 +276,32 @@ export function injectExplicitHeadingIds(content: string): string { const [, , , headingText, explicitId] = fullMatch; const baseId = createSafeSlug(headingText); if (baseId) { - headingCounts.set(baseId, (headingCounts.get(baseId) ?? 0) + 1); + reservedIds.add(baseId); } - if (explicitId !== baseId) { - headingCounts.set(explicitId, (headingCounts.get(explicitId) ?? 0) + 1); + if (explicitId) { + reservedIds.add(explicitId); } - return line; + continue; } const explicitIdMatch = line.match(/\s\{#([^}]+)\}\s*$/); if (explicitIdMatch) { const explicitId = explicitIdMatch[1]; - headingCounts.set(explicitId, (headingCounts.get(explicitId) ?? 0) + 1); + reservedIds.add(explicitId); + } + } + + const updatedLines = lines.map((line) => { + if ( + codeBlockPlaceholders.some((placeholder) => line.includes(placeholder)) + ) { + return line; + } + + const explicitHeadingMatch = line.match( + /^(\s{0,3})(#{1,6})\s+(.+?)\s*\{#([^}]+)\}\s*$/ + ); + if (explicitHeadingMatch) { return line; } @@ -157,15 +318,13 @@ export function injectExplicitHeadingIds(content: string): string { let counter = headingCounts.get(baseId) ?? 0; let headingId = counter === 0 ? baseId : `${baseId}-${counter}`; - // Skip IDs already claimed by explicit headings or natural slugs - while (counter > 0 && headingCounts.has(headingId)) { + while (reservedIds.has(headingId) || headingCounts.has(headingId)) { counter++; headingId = `${baseId}-${counter}`; } headingCounts.set(baseId, counter + 1); - // Also register the generated ID so future headings won't collide with it if (headingId !== baseId) { - headingCounts.set(headingId, (headingCounts.get(headingId) ?? 0) + 1); + headingCounts.set(headingId, 1); } return `${leadingWhitespace}${hashes} ${headingText} {#${headingId}}`; @@ -183,20 +342,14 @@ export function sanitizeMarkdownContent(content: string): string { // Fix specific malformed patterns that cause MDX errors // 0. Mask code fences (```...```) and inline code (`...`) to avoid altering them - const codeBlocks: string[] = []; - const codeSpans: string[] = []; - const codeBlockPlaceholders: string[] = []; - - content = content.replace(/^ {0,3}```[^\n]*\n[\s\S]*?^ {0,3}```/gm, (m) => { - codeBlocks.push(m); - const placeholder = `__CODEBLOCK_${codeBlocks.length - 1}__`; - codeBlockPlaceholders.push(placeholder); - return placeholder; - }); - content = content.replace(/`[^`\n]*`/g, (m) => { - codeSpans.push(m); - return `__CODESPAN_${codeSpans.length - 1}__`; - }); + const { + content: maskedContent, + codeBlocks, + codeBlockPlaceholders, + } = maskCodeFences(content); + const { content: maskedWithCodeSpans, codeSpans } = + maskInlineCodeSpans(maskedContent); + content = maskedWithCodeSpans; // 1. Fix heading hierarchy for proper TOC generation (after masking code blocks) content = fixHeadingHierarchy(content, codeBlockPlaceholders); @@ -263,14 +416,15 @@ export function sanitizeMarkdownContent(content: string): string { } // 9. Restore masked code blocks and inline code - content = content.replace( - /__CODEBLOCK_(\d+)__/g, - (_m, i) => codeBlocks[Number(i)] - ); - content = content.replace( - /__CODESPAN_(\d+)__/g, - (_m, i) => codeSpans[Number(i)] - ); + content = restoreCodeFences(content, codeBlocks); + let restoredContent = content; + for (const [index, codeSpan] of codeSpans.entries()) { + restoredContent = restoredContent.replaceAll( + `__CODESPAN_${index}__`, + codeSpan + ); + } + content = restoredContent; return content; } From 89414b7fa2f7c85f6dac9ca958ac7c38560b7cd5 Mon Sep 17 00:00:00 2001 From: luandro Date: Thu, 26 Mar 2026 07:35:53 -0300 Subject: [PATCH 13/15] fix(notion-fetch): close remaining masking edge cases --- scripts/notion-fetch/contentSanitizer.test.ts | 9 +++++++++ scripts/notion-fetch/contentSanitizer.ts | 6 +----- scripts/notion-fetch/linkNormalizer.test.ts | 6 ++++++ scripts/notion-fetch/linkNormalizer.ts | 3 ++- 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/scripts/notion-fetch/contentSanitizer.test.ts b/scripts/notion-fetch/contentSanitizer.test.ts index cba54ea7..9b5d3e1c 100644 --- a/scripts/notion-fetch/contentSanitizer.test.ts +++ b/scripts/notion-fetch/contentSanitizer.test.ts @@ -304,5 +304,14 @@ echo "# Not a heading" expect(result).toContain("## My Id {#my-id-1}"); expect(result).toContain("## Custom {#my-id}"); }); + + it("should not reserve a natural slug when a later explicit id is custom", () => { + const input = ["## My Id", "## My Id {#custom}"].join("\n"); + + const result = scriptModule.injectExplicitHeadingIds(input); + + expect(result).toContain("## My Id {#my-id}"); + expect(result).toContain("## My Id {#custom}"); + }); }); }); diff --git a/scripts/notion-fetch/contentSanitizer.ts b/scripts/notion-fetch/contentSanitizer.ts index e5857970..59a9b0cf 100644 --- a/scripts/notion-fetch/contentSanitizer.ts +++ b/scripts/notion-fetch/contentSanitizer.ts @@ -273,11 +273,7 @@ export function injectExplicitHeadingIds(content: string): string { /^(\s{0,3})(#{1,6})\s+(.+?)\s*\{#([^}]+)\}\s*$/ ); if (fullMatch) { - const [, , , headingText, explicitId] = fullMatch; - const baseId = createSafeSlug(headingText); - if (baseId) { - reservedIds.add(baseId); - } + const [, , , , explicitId] = fullMatch; if (explicitId) { reservedIds.add(explicitId); } diff --git a/scripts/notion-fetch/linkNormalizer.test.ts b/scripts/notion-fetch/linkNormalizer.test.ts index 75b54d4e..b85d0557 100644 --- a/scripts/notion-fetch/linkNormalizer.test.ts +++ b/scripts/notion-fetch/linkNormalizer.test.ts @@ -83,6 +83,12 @@ describe("linkNormalizer", () => { expect(result).toBe(input); }); + it("should not rewrite links inside an unclosed tilde fenced code block", () => { + const input = "~~~\n[example](/docs/Guía Rápida)"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe(input); + }); + it("should not rewrite links inside inline code", () => { const input = "Use `[link](/docs/Guía Rápida)` as an example."; const result = normalizeInternalDocLinks(input, "en"); diff --git a/scripts/notion-fetch/linkNormalizer.ts b/scripts/notion-fetch/linkNormalizer.ts index 0ee01c08..41cb87fa 100644 --- a/scripts/notion-fetch/linkNormalizer.ts +++ b/scripts/notion-fetch/linkNormalizer.ts @@ -58,7 +58,8 @@ function maskFencedCodeBlocks(content: string): { } if (inFence) { - output.push(fencedBlock.join("\n")); + codeBlocks.push(fencedBlock.join("\n")); + output.push(`__LINK_NORMALIZER_CODEBLOCK_${codeBlocks.length - 1}__`); } return { maskedContent: output.join("\n"), codeBlocks }; From f2904d466a200059ef1e3459fff03467932c612b Mon Sep 17 00:00:00 2001 From: luandro Date: Thu, 26 Mar 2026 07:59:59 -0300 Subject: [PATCH 14/15] fix(notion-fetch): fix adjacent-link regex and exact-/docs guard in linkNormalizer - Replace consuming (^|[^!]) group with (? { expect(result).toBe("[a](/docs/foo) and [b](/docs/bar)"); }); + it("should normalize both links when two docs links are directly adjacent (no separator)", () => { + const input = "[Link 1](/docs/Foo)[Link 2](/docs/Bar)"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe("[Link 1](/docs/foo)[Link 2](/docs/bar)"); + }); + it("should return empty string for empty content", () => { const result = normalizeInternalDocLinks("", "en"); expect(result).toBe(""); @@ -117,5 +123,29 @@ describe("linkNormalizer", () => { const result = normalizeInternalDocLinks(input, "en"); expect(result).toBe(input); }); + + it("should normalize a link to exactly /docs (en, no locale prefix)", () => { + const input = "[link](/docs)"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe("[link](/docs)"); + }); + + it("should normalize a link to exactly /docs with locale prefix (es)", () => { + const input = "[link](/docs)"; + const result = normalizeInternalDocLinks(input, "es"); + expect(result).toBe("[link](/es/docs)"); + }); + + it("should normalize a link to /docs#fragment (en, no locale prefix)", () => { + const input = "[link](/docs#Sección Uno)"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe("[link](/docs#seccion-uno)"); + }); + + it("should normalize a link to /docs#fragment with locale prefix (pt)", () => { + const input = "[link](/docs#Sección Uno)"; + const result = normalizeInternalDocLinks(input, "pt"); + expect(result).toBe("[link](/pt/docs#seccion-uno)"); + }); }); }); diff --git a/scripts/notion-fetch/linkNormalizer.ts b/scripts/notion-fetch/linkNormalizer.ts index 41cb87fa..2e49572f 100644 --- a/scripts/notion-fetch/linkNormalizer.ts +++ b/scripts/notion-fetch/linkNormalizer.ts @@ -2,7 +2,7 @@ import config from "../../docusaurus.config"; import { createSafeSlug } from "./slugUtils"; const DEFAULT_LOCALE = config.i18n.defaultLocale; -const MARKDOWN_LINK_REGEX = /(^|[^!])\[([^\]]+)\]\(([^)\n]+)\)/gm; +const MARKDOWN_LINK_REGEX = /(? { + (match, text: string, rawTarget: string) => { const trimmedTarget = rawTarget.trim(); const titleMatch = trimmedTarget.match(/^(\/docs\/[^\n]*?)(\s+"[^"]*")$/); const target = titleMatch ? titleMatch[1] : trimmedTarget; const titleSuffix = titleMatch?.[2] ?? ""; - if (!target.startsWith("/docs/")) { + if ( + target !== "/docs" && + !target.startsWith("/docs/") && + !target.startsWith("/docs#") + ) { return match; } - return `${prefix}[${text}](${normalizeDocTarget(target, lang)}${titleSuffix})`; + return `[${text}](${normalizeDocTarget(target, lang)}${titleSuffix})`; } ); From f585ecf52d5ac21e7b7acd6be8727aeef2c22e78 Mon Sep 17 00:00:00 2001 From: luandro Date: Thu, 26 Mar 2026 08:31:25 -0300 Subject: [PATCH 15/15] refactor(notion-fetch): extract shared markdown code-block masking utility Both contentSanitizer and linkNormalizer implemented nearly identical logic for masking and restoring fenced code blocks and inline code spans. Extract into a shared markdownUtils.ts to eliminate duplication and prevent future drift. --- scripts/notion-fetch/contentSanitizer.ts | 213 ++--------------------- scripts/notion-fetch/linkNormalizer.ts | 142 +-------------- scripts/notion-fetch/markdownUtils.ts | 203 +++++++++++++++++++++ 3 files changed, 226 insertions(+), 332 deletions(-) create mode 100644 scripts/notion-fetch/markdownUtils.ts diff --git a/scripts/notion-fetch/contentSanitizer.ts b/scripts/notion-fetch/contentSanitizer.ts index 59a9b0cf..c30a341f 100644 --- a/scripts/notion-fetch/contentSanitizer.ts +++ b/scripts/notion-fetch/contentSanitizer.ts @@ -4,6 +4,11 @@ */ import { createSafeSlug } from "./slugUtils"; +import { + maskFencedCodeBlocks, + maskInlineCodeSpans, + restoreCodeMasks, +} from "./markdownUtils"; const EMOJI_STYLE_MARKERS = ["display:", "height:", "margin:"]; @@ -70,184 +75,6 @@ function fixHeadingHierarchy( return fixedLines.join("\n"); } -function createCodeBlockMasker(content: string): { - content: string; - codeBlocks: string[]; - codeBlockPlaceholders: string[]; -} { - const lines = content.split("\n"); - const codeBlocks: string[] = []; - const codeBlockPlaceholders: string[] = []; - const maskedLines: string[] = []; - - let inFence = false; - let fenceChar = ""; - let fenceLength = 0; - let blockLines: string[] = []; - - for (const line of lines) { - if (!inFence) { - const openingMatch = line.match(/^ {0,3}(`{3,}|~{3,})(.*)$/); - - if (!openingMatch) { - maskedLines.push(line); - continue; - } - - const fence = openingMatch[1]; - fenceChar = fence[0]; - fenceLength = fence.length; - blockLines = [line]; - inFence = true; - continue; - } - - blockLines.push(line); - - if (isClosingFenceLine(line, fenceChar, fenceLength)) { - codeBlocks.push(blockLines.join("\n")); - const placeholder = `__CODEBLOCK_${codeBlocks.length - 1}__`; - codeBlockPlaceholders.push(placeholder); - maskedLines.push(placeholder); - inFence = false; - blockLines = []; - } - } - - if (inFence) { - codeBlocks.push(blockLines.join("\n")); - const placeholder = `__CODEBLOCK_${codeBlocks.length - 1}__`; - codeBlockPlaceholders.push(placeholder); - maskedLines.push(placeholder); - } - - return { - content: maskedLines.join("\n"), - codeBlocks, - codeBlockPlaceholders, - }; -} - -function isClosingFenceLine( - line: string, - fenceChar: string, - fenceLength: number -): boolean { - let i = 0; - - while (i < line.length && line.charAt(i) === " ") { - i++; - } - - if (i > 3) { - return false; - } - - let fenceCount = 0; - while (i < line.length && line.charAt(i) === fenceChar) { - fenceCount++; - i++; - } - - if (fenceCount < fenceLength) { - return false; - } - - while (i < line.length) { - if (line.charAt(i) !== " " && line.charAt(i) !== "\t") { - return false; - } - i++; - } - - return true; -} - -function maskCodeFences(content: string): { - content: string; - codeBlocks: string[]; - codeBlockPlaceholders: string[]; -} { - return createCodeBlockMasker(content); -} - -function restoreCodeFences(content: string, codeBlocks: string[]): string { - let restoredContent = content; - for (const [index, codeBlock] of codeBlocks.entries()) { - restoredContent = restoredContent.replaceAll( - `__CODEBLOCK_${index}__`, - codeBlock - ); - } - return restoredContent; -} - -function maskInlineCodeSpans(content: string): { - content: string; - codeSpans: string[]; -} { - const codeSpans: string[] = []; - const output: string[] = []; - - let i = 0; - while (i < content.length) { - const currentChar = content.charAt(i); - if (currentChar !== "`") { - output.push(currentChar); - i++; - continue; - } - - let openingLength = 0; - while ( - i + openingLength < content.length && - content.charAt(i + openingLength) === "`" - ) { - openingLength++; - } - - let scanIndex = i + openingLength; - let closingIndex = -1; - while (scanIndex < content.length) { - const nextBacktick = content.indexOf("`", scanIndex); - if (nextBacktick === -1) { - break; - } - - let closingLength = 0; - while ( - nextBacktick + closingLength < content.length && - content.charAt(nextBacktick + closingLength) === "`" - ) { - closingLength++; - } - - if (closingLength === openingLength) { - closingIndex = nextBacktick; - break; - } - - scanIndex = nextBacktick + closingLength; - } - - if (closingIndex === -1) { - output.push(content.slice(i, i + openingLength)); - i += openingLength; - continue; - } - - const codeSpan = content.slice(i, closingIndex + openingLength); - codeSpans.push(codeSpan); - output.push(`__CODESPAN_${codeSpans.length - 1}__`); - i = closingIndex + openingLength; - } - - return { - content: output.join(""), - codeSpans, - }; -} - export function injectExplicitHeadingIds(content: string): string { if (!content) { return content; @@ -256,16 +83,14 @@ export function injectExplicitHeadingIds(content: string): string { const { content: maskedContent, codeBlocks, - codeBlockPlaceholders, - } = maskCodeFences(content); + placeholders, + } = maskFencedCodeBlocks(content); const reservedIds = new Set(); const headingCounts = new Map(); const lines = maskedContent.split("\n"); for (const line of lines) { - if ( - codeBlockPlaceholders.some((placeholder) => line.includes(placeholder)) - ) { + if (placeholders.some((placeholder) => line.includes(placeholder))) { continue; } @@ -288,9 +113,7 @@ export function injectExplicitHeadingIds(content: string): string { } const updatedLines = lines.map((line) => { - if ( - codeBlockPlaceholders.some((placeholder) => line.includes(placeholder)) - ) { + if (placeholders.some((placeholder) => line.includes(placeholder))) { return line; } @@ -326,7 +149,7 @@ export function injectExplicitHeadingIds(content: string): string { return `${leadingWhitespace}${hashes} ${headingText} {#${headingId}}`; }); - return restoreCodeFences(updatedLines.join("\n"), codeBlocks); + return restoreCodeMasks(updatedLines.join("\n"), codeBlocks, []); } /** @@ -341,14 +164,14 @@ export function sanitizeMarkdownContent(content: string): string { const { content: maskedContent, codeBlocks, - codeBlockPlaceholders, - } = maskCodeFences(content); + placeholders, + } = maskFencedCodeBlocks(content); const { content: maskedWithCodeSpans, codeSpans } = maskInlineCodeSpans(maskedContent); content = maskedWithCodeSpans; // 1. Fix heading hierarchy for proper TOC generation (after masking code blocks) - content = fixHeadingHierarchy(content, codeBlockPlaceholders); + content = fixHeadingHierarchy(content, placeholders); // 2. Aggressively strip all curly-brace expressions by unwrapping to inner text // BUT preserve JSX style objects for emoji images @@ -412,15 +235,7 @@ export function sanitizeMarkdownContent(content: string): string { } // 9. Restore masked code blocks and inline code - content = restoreCodeFences(content, codeBlocks); - let restoredContent = content; - for (const [index, codeSpan] of codeSpans.entries()) { - restoredContent = restoredContent.replaceAll( - `__CODESPAN_${index}__`, - codeSpan - ); - } - content = restoredContent; + content = restoreCodeMasks(content, codeBlocks, codeSpans); return content; } diff --git a/scripts/notion-fetch/linkNormalizer.ts b/scripts/notion-fetch/linkNormalizer.ts index 2e49572f..7a02948f 100644 --- a/scripts/notion-fetch/linkNormalizer.ts +++ b/scripts/notion-fetch/linkNormalizer.ts @@ -1,5 +1,10 @@ import config from "../../docusaurus.config"; import { createSafeSlug } from "./slugUtils"; +import { + maskFencedCodeBlocks, + maskInlineCodeSpans, + restoreCodeMasks, +} from "./markdownUtils"; const DEFAULT_LOCALE = config.i18n.defaultLocale; const MARKDOWN_LINK_REGEX = /(?= fenceLength - ) { - codeBlocks.push(fencedBlock.join("\n")); - output.push(`__LINK_NORMALIZER_CODEBLOCK_${codeBlocks.length - 1}__`); - inFence = false; - fenceChar = ""; - fenceLength = 0; - fencedBlock = []; - } - } - - if (inFence) { - codeBlocks.push(fencedBlock.join("\n")); - output.push(`__LINK_NORMALIZER_CODEBLOCK_${codeBlocks.length - 1}__`); - } - - return { maskedContent: output.join("\n"), codeBlocks }; -} - -function maskInlineCode(content: string): { - maskedContent: string; - codeSpans: string[]; -} { - const codeSpans: string[] = []; - const output: string[] = []; - - let index = 0; - - while (index < content.length) { - const char = content.charAt(index); - if (char !== "`") { - output.push(char); - index++; - continue; - } - - let openerLength = 1; - while (content.charAt(index + openerLength) === "`") { - openerLength++; - } - - let cursor = index + openerLength; - let closingIndex = -1; - while (cursor < content.length) { - if (content.charAt(cursor) !== "`") { - cursor++; - continue; - } - - let runLength = 1; - while (content.charAt(cursor + runLength) === "`") { - runLength++; - } - - if (runLength === openerLength) { - closingIndex = cursor; - break; - } - - cursor += runLength; - } - - if (closingIndex === -1) { - output.push(content.slice(index)); - break; - } - - const codeSpan = content.slice(index, closingIndex + openerLength); - codeSpans.push(codeSpan); - output.push(`__LINK_NORMALIZER_CODESPAN_${codeSpans.length - 1}__`); - index = closingIndex + openerLength; - } - - return { maskedContent: output.join(""), codeSpans }; -} - -function restoreCode( - content: string, - codeBlocks: string[], - codeSpans: string[] -): string { - const restoreByIndex = (values: string[], rawIndex: string) => { - const index = Number(rawIndex); - return Number.isInteger(index) ? (values.at(index) ?? "") : ""; - }; - - return content - .replace(/__LINK_NORMALIZER_CODESPAN_(\d+)__/g, (_match, index) => { - return restoreByIndex(codeSpans, index); - }) - .replace(/__LINK_NORMALIZER_CODEBLOCK_(\d+)__/g, (_match, index) => { - return restoreByIndex(codeBlocks, index); - }); -} - function normalizeDocPathname(pathname: string): string { const hasTrailingSlash = pathname.endsWith("/") && pathname !== "/docs/"; const rawSegments = pathname @@ -180,9 +56,9 @@ export function normalizeInternalDocLinks( return content; } - const { maskedContent: maskedBlocks, codeBlocks } = - maskFencedCodeBlocks(content); - const { maskedContent, codeSpans } = maskInlineCode(maskedBlocks); + const { content: maskedBlocks, codeBlocks } = maskFencedCodeBlocks(content); + const { content: maskedContent, codeSpans } = + maskInlineCodeSpans(maskedBlocks); const normalizedContent = maskedContent.replace( MARKDOWN_LINK_REGEX, @@ -204,5 +80,5 @@ export function normalizeInternalDocLinks( } ); - return restoreCode(normalizedContent, codeBlocks, codeSpans); + return restoreCodeMasks(normalizedContent, codeBlocks, codeSpans); } diff --git a/scripts/notion-fetch/markdownUtils.ts b/scripts/notion-fetch/markdownUtils.ts new file mode 100644 index 00000000..e11675f8 --- /dev/null +++ b/scripts/notion-fetch/markdownUtils.ts @@ -0,0 +1,203 @@ +/** + * Shared markdown code-block masking utilities. + * Both contentSanitizer and linkNormalizer use identical logic for masking + * code blocks and inline code spans before processing, then restoring them. + */ + +/** Placeholder prefix used for fenced code blocks. */ +const CODEBLOCK_PREFIX = "__CODEBLOCK_"; +/** Placeholder prefix used for inline code spans. */ +const CODESPAN_PREFIX = "__CODESPAN_"; +const PLACEHOLDER_SUFFIX = "__"; + +/** Checks whether `line` is a valid closing fence for a block opened with `fenceChar` of length `fenceLength`. */ +function isClosingFenceLine( + line: string, + fenceChar: string, + fenceLength: number +): boolean { + let i = 0; + + while (i < line.length && line.charAt(i) === " ") { + i++; + } + + if (i > 3) { + return false; + } + + let fenceCount = 0; + while (i < line.length && line.charAt(i) === fenceChar) { + fenceCount++; + i++; + } + + if (fenceCount < fenceLength) { + return false; + } + + while (i < line.length) { + if (line.charAt(i) !== " " && line.charAt(i) !== "\t") { + return false; + } + i++; + } + + return true; +} + +/** + * Masks all fenced code blocks in `content`, replacing each with a placeholder token. + * Returns the masked content, the array of original code blocks, and their placeholder strings. + */ +export function maskFencedCodeBlocks(content: string): { + content: string; + codeBlocks: string[]; + placeholders: string[]; +} { + const lines = content.split("\n"); + const codeBlocks: string[] = []; + const placeholders: string[] = []; + const maskedLines: string[] = []; + + let inFence = false; + let fenceChar = ""; + let fenceLength = 0; + let blockLines: string[] = []; + + for (const line of lines) { + if (!inFence) { + const openingMatch = line.match(/^ {0,3}(`{3,}|~{3,})(.*)$/); + + if (!openingMatch) { + maskedLines.push(line); + continue; + } + + const fence = openingMatch[1]; + fenceChar = fence[0]; + fenceLength = fence.length; + blockLines = [line]; + inFence = true; + continue; + } + + blockLines.push(line); + + if (isClosingFenceLine(line, fenceChar, fenceLength)) { + codeBlocks.push(blockLines.join("\n")); + const placeholder = `${CODEBLOCK_PREFIX}${codeBlocks.length - 1}${PLACEHOLDER_SUFFIX}`; + placeholders.push(placeholder); + maskedLines.push(placeholder); + inFence = false; + blockLines = []; + } + } + + if (inFence) { + codeBlocks.push(blockLines.join("\n")); + const placeholder = `${CODEBLOCK_PREFIX}${codeBlocks.length - 1}${PLACEHOLDER_SUFFIX}`; + placeholders.push(placeholder); + maskedLines.push(placeholder); + } + + return { + content: maskedLines.join("\n"), + codeBlocks, + placeholders, + }; +} + +/** + * Masks all inline code spans in `content`, replacing each with a placeholder token. + * Returns the masked content and the array of original code spans. + */ +export function maskInlineCodeSpans(content: string): { + content: string; + codeSpans: string[]; +} { + const codeSpans: string[] = []; + const output: string[] = []; + + let i = 0; + while (i < content.length) { + const currentChar = content.charAt(i); + if (currentChar !== "`") { + output.push(currentChar); + i++; + continue; + } + + let openingLength = 0; + while ( + i + openingLength < content.length && + content.charAt(i + openingLength) === "`" + ) { + openingLength++; + } + + let scanIndex = i + openingLength; + let closingIndex = -1; + while (scanIndex < content.length) { + const nextBacktick = content.indexOf("`", scanIndex); + if (nextBacktick === -1) { + break; + } + + let closingLength = 0; + while ( + nextBacktick + closingLength < content.length && + content.charAt(nextBacktick + closingLength) === "`" + ) { + closingLength++; + } + + if (closingLength === openingLength) { + closingIndex = nextBacktick; + break; + } + + scanIndex = nextBacktick + closingLength; + } + + if (closingIndex === -1) { + output.push(content.slice(i, i + openingLength)); + i += openingLength; + continue; + } + + const codeSpan = content.slice(i, closingIndex + openingLength); + codeSpans.push(codeSpan); + output.push( + `${CODESPAN_PREFIX}${codeSpans.length - 1}${PLACEHOLDER_SUFFIX}` + ); + i = closingIndex + openingLength; + } + + return { + content: output.join(""), + codeSpans, + }; +} + +/** + * Restores previously masked fenced code blocks and inline code spans. + */ +export function restoreCodeMasks( + content: string, + codeBlocks: string[], + codeSpans: string[] +): string { + const restoreByIndex = (values: string[], rawIndex: string) => { + const index = Number(rawIndex); + return Number.isInteger(index) ? (values.at(index) ?? "") : ""; + }; + + return content + .replace(/__CODESPAN_(\d+)__/g, (_match, index) => { + return restoreByIndex(codeSpans, index); + }) + .replace(/__CODEBLOCK_(\d+)__/g, (_match, index) => { + return restoreByIndex(codeBlocks, index); + }); +}