diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..5aa7d68c --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,17 @@ +# Changelog - PR 170 + +## ✨ New Features + +- **Slug Normalization**: Accented slugs are now normalized and locale-prefixed link references are supported. + +## 🐛 Fixes + +- **Doc Paths**: Flattened nested document paths. +- **Link Normalization**: Links inside code blocks and indented code fences are now properly skipped during link normalization. +- **Heading IDs**: Explicit heading IDs and empty filenames are handled correctly to prevent heading ID collisions. +- **Slug Generation**: Preserved CJK and Unicode letters in slug generation. +- **Code Fences**: Aligned code-fence regex with CommonMark standard. + +## 🧪 Testing + +- **Normalization**: Aligned tests with new normalization expectations. diff --git a/bun-tests/vitest-bridge.test.ts b/bun-tests/vitest-bridge.test.ts index a096b2fb..8e57ab74 100644 --- a/bun-tests/vitest-bridge.test.ts +++ b/bun-tests/vitest-bridge.test.ts @@ -24,5 +24,7 @@ test( () => { runVitest(); }, - { timeout: 120_000 } + // The full Vitest suite can take just under two minutes on this repo, and + // Bun's own test harness adds enough overhead that 120s is too tight. 
+ { timeout: 300_000 } ); diff --git a/eslint.config.mjs b/eslint.config.mjs index d03fda21..83430e4d 100644 --- a/eslint.config.mjs +++ b/eslint.config.mjs @@ -32,7 +32,7 @@ const eslintConfig = [ // Docusaurus specific configurations { files: ["**/*.{js,mjs,cjs,ts,jsx,tsx}"], - ignores: ["scripts/**", "api-server/**"], // Ignore scripts and api-server directories for docusaurus rules + ignores: ["scripts/**", "api-server/**", "bun-tests/**"], // Ignore non-Docusaurus runtime directories for docusaurus/react rules plugins: { "@docusaurus": docusaurusPlugin, react: pluginReact, @@ -74,7 +74,11 @@ const eslintConfig = [ // Scripts and API server specific configurations { - files: ["scripts/**/*.{js,mjs,cjs,ts}", "api-server/**/*.{js,mjs,cjs,ts}"], + files: [ + "scripts/**/*.{js,mjs,cjs,ts}", + "api-server/**/*.{js,mjs,cjs,ts}", + "bun-tests/**/*.{js,mjs,cjs,ts}", + ], plugins: { import: importPlugin, promise: promisePlugin, diff --git a/scripts/notion-fetch/__tests__/retry-loop-behavior.test.ts b/scripts/notion-fetch/__tests__/retry-loop-behavior.test.ts index 61cdb372..622c6456 100644 --- a/scripts/notion-fetch/__tests__/retry-loop-behavior.test.ts +++ b/scripts/notion-fetch/__tests__/retry-loop-behavior.test.ts @@ -93,6 +93,7 @@ vi.mock("../imageProcessor", () => ({ vi.mock("../utils", () => ({ sanitizeMarkdownContent: vi.fn((content) => content), + injectExplicitHeadingIds: vi.fn((content) => content), compressImageToFileWithFallback: vi.fn().mockResolvedValue({ finalSize: 512, usedFallback: false, diff --git a/scripts/notion-fetch/contentSanitizer.test.ts b/scripts/notion-fetch/contentSanitizer.test.ts index 5354c120..9b5d3e1c 100644 --- a/scripts/notion-fetch/contentSanitizer.test.ts +++ b/scripts/notion-fetch/contentSanitizer.test.ts @@ -50,6 +50,18 @@ describe("contentSanitizer", () => { expect(result).toBe(input); // Should remain unchanged }); + it("should preserve tilde fenced code blocks", () => { + const input = "~~~md\nconst obj = { key: 'value' 
};\n~~~"; + const result = scriptModule.sanitizeMarkdownContent(input); + expect(result).toBe(input); + }); + + it("should preserve multi-backtick inline code spans", () => { + const input = "Use ```` and ``{foo}``."; + const result = scriptModule.sanitizeMarkdownContent(input); + expect(result).toBe(input); + }); + it("should fix malformed patterns", () => { const input = "Check for details."; const result = scriptModule.sanitizeMarkdownContent(input); @@ -226,4 +238,80 @@ echo "# Not a heading" }); }); }); + + describe("injectExplicitHeadingIds", () => { + it("should normalize accented headings and append stable duplicate suffixes", () => { + const input = [ + "# Título Único", + "## Título Único", + "### Niño & Acción", + ].join("\n"); + + const result = scriptModule.injectExplicitHeadingIds(input); + + expect(result).toContain("# Título Único {#titulo-unico}"); + expect(result).toContain("## Título Único {#titulo-unico-1}"); + expect(result).toContain("### Niño & Acción {#nino-accion}"); + }); + + it("should preserve existing explicit heading ids and code fences", () => { + const input = [ + "# Encabezado {#custom-id}", + "```md", + "## Código Único", + "```", + "## Otro Título", + ].join("\n"); + + const result = scriptModule.injectExplicitHeadingIds(input); + + expect(result).toContain("# Encabezado {#custom-id}"); + expect(result).toContain("```md\n## Código Único\n```"); + expect(result).toContain("## Otro Título {#otro-titulo}"); + expect(result).not.toContain("## Código Único {#codigo-unico}"); + }); + + it("should preserve headings inside tilde fenced code blocks", () => { + const input = ["~~~md", "## Código Único", "~~~", "## Otro Título"].join( + "\n" + ); + + const result = scriptModule.injectExplicitHeadingIds(input); + + expect(result).toContain("~~~md\n## Código Único\n~~~"); + expect(result).toContain("## Otro Título {#otro-titulo}"); + expect(result).not.toContain("## Código Único {#codigo-unico}"); + }); + + it("should avoid collisions between 
auto-incremented and explicit IDs", () => { + const input = ["## Título", "## Heading {#titulo-1}", "## Título"].join( + "\n" + ); + + const result = scriptModule.injectExplicitHeadingIds(input); + + expect(result).toContain("## Título {#titulo}"); + expect(result).toContain("## Heading {#titulo-1}"); + // The second "Título" must NOT get titulo-1 (already claimed), should get titulo-2 + expect(result).toContain("## Título {#titulo-2}"); + }); + + it("should reserve later explicit ids before assigning earlier auto-generated headings", () => { + const input = ["## My Id", "## Custom {#my-id}"].join("\n"); + + const result = scriptModule.injectExplicitHeadingIds(input); + + expect(result).toContain("## My Id {#my-id-1}"); + expect(result).toContain("## Custom {#my-id}"); + }); + + it("should not reserve a natural slug when a later explicit id is custom", () => { + const input = ["## My Id", "## My Id {#custom}"].join("\n"); + + const result = scriptModule.injectExplicitHeadingIds(input); + + expect(result).toContain("## My Id {#my-id}"); + expect(result).toContain("## My Id {#custom}"); + }); + }); }); diff --git a/scripts/notion-fetch/contentSanitizer.ts b/scripts/notion-fetch/contentSanitizer.ts index f652a60c..c30a341f 100644 --- a/scripts/notion-fetch/contentSanitizer.ts +++ b/scripts/notion-fetch/contentSanitizer.ts @@ -3,6 +3,13 @@ * that cause MDX compilation errors in Docusaurus. 
*/ +import { createSafeSlug } from "./slugUtils"; +import { + maskFencedCodeBlocks, + maskInlineCodeSpans, + restoreCodeMasks, +} from "./markdownUtils"; + const EMOJI_STYLE_MARKERS = ["display:", "height:", "margin:"]; const isEmojiStyleObject = (snippet: string): boolean => @@ -68,6 +75,83 @@ function fixHeadingHierarchy( return fixedLines.join("\n"); } +export function injectExplicitHeadingIds(content: string): string { + if (!content) { + return content; + } + + const { + content: maskedContent, + codeBlocks, + placeholders, + } = maskFencedCodeBlocks(content); + const reservedIds = new Set(); + const headingCounts = new Map(); + + const lines = maskedContent.split("\n"); + for (const line of lines) { + if (placeholders.some((placeholder) => line.includes(placeholder))) { + continue; + } + + const fullMatch = line.match( + /^(\s{0,3})(#{1,6})\s+(.+?)\s*\{#([^}]+)\}\s*$/ + ); + if (fullMatch) { + const [, , , , explicitId] = fullMatch; + if (explicitId) { + reservedIds.add(explicitId); + } + continue; + } + + const explicitIdMatch = line.match(/\s\{#([^}]+)\}\s*$/); + if (explicitIdMatch) { + const explicitId = explicitIdMatch[1]; + reservedIds.add(explicitId); + } + } + + const updatedLines = lines.map((line) => { + if (placeholders.some((placeholder) => line.includes(placeholder))) { + return line; + } + + const explicitHeadingMatch = line.match( + /^(\s{0,3})(#{1,6})\s+(.+?)\s*\{#([^}]+)\}\s*$/ + ); + if (explicitHeadingMatch) { + return line; + } + + const headingMatch = line.match(/^(\s{0,3})(#{1,6})\s+(.+?)\s*$/); + if (!headingMatch) { + return line; + } + + const [, leadingWhitespace, hashes, headingText] = headingMatch; + const baseId = createSafeSlug(headingText); + if (!baseId) { + return line; + } + + let counter = headingCounts.get(baseId) ?? 0; + let headingId = counter === 0 ? 
baseId : `${baseId}-${counter}`; + while (reservedIds.has(headingId) || headingCounts.has(headingId)) { + counter++; + headingId = `${baseId}-${counter}`; + } + headingCounts.set(baseId, counter + 1); + if (headingId !== baseId) { + headingCounts.set(headingId, 1); + } + + return `${leadingWhitespace}${hashes} ${headingText} {#${headingId}}`; + }); + + return restoreCodeMasks(updatedLines.join("\n"), codeBlocks, []); +} + /** * Sanitizes markdown content to fix malformed HTML/JSX tags that cause MDX compilation errors * @param content - The markdown content string @@ -77,23 +161,17 @@ export function sanitizeMarkdownContent(content: string): string { // Fix specific malformed patterns that cause MDX errors // 0. Mask code fences (```...```) and inline code (`...`) to avoid altering them - const codeBlocks: string[] = []; - const codeSpans: string[] = []; - const codeBlockPlaceholders: string[] = []; - - content = content.replace(/```[\s\S]*?```/g, (m) => { - codeBlocks.push(m); - const placeholder = `__CODEBLOCK_${codeBlocks.length - 1}__`; - codeBlockPlaceholders.push(placeholder); - return placeholder; - }); - content = content.replace(/`[^`\n]*`/g, (m) => { - codeSpans.push(m); - return `__CODESPAN_${codeSpans.length - 1}__`; - }); + const { + content: maskedContent, + codeBlocks, + placeholders, + } = maskFencedCodeBlocks(content); + const { content: maskedWithCodeSpans, codeSpans } = + maskInlineCodeSpans(maskedContent); + content = maskedWithCodeSpans; // 1. Fix heading hierarchy for proper TOC generation (after masking code blocks) - content = fixHeadingHierarchy(content, codeBlockPlaceholders); + content = fixHeadingHierarchy(content, placeholders); // 2. Aggressively strip all curly-brace expressions by unwrapping to inner text // BUT preserve JSX style objects for emoji images @@ -157,14 +235,7 @@ export function sanitizeMarkdownContent(content: string): string { } // 9. 
Restore masked code blocks and inline code - content = content.replace( - /__CODEBLOCK_(\d+)__/g, - (_m, i) => codeBlocks[Number(i)] - ); - content = content.replace( - /__CODESPAN_(\d+)__/g, - (_m, i) => codeSpans[Number(i)] - ); + content = restoreCodeMasks(content, codeBlocks, codeSpans); return content; } diff --git a/scripts/notion-fetch/generateBlocks.test.ts b/scripts/notion-fetch/generateBlocks.test.ts index b5b0e88d..5d131524 100644 --- a/scripts/notion-fetch/generateBlocks.test.ts +++ b/scripts/notion-fetch/generateBlocks.test.ts @@ -112,6 +112,7 @@ vi.mock("./imageProcessor", () => ({ vi.mock("./utils", () => ({ sanitizeMarkdownContent: vi.fn((content) => content), + injectExplicitHeadingIds: vi.fn((content) => content), compressImageToFileWithFallback: vi.fn(), detectFormatFromBuffer: vi.fn(() => "jpeg"), formatFromContentType: vi.fn(() => "jpeg"), @@ -198,6 +199,7 @@ describe("generateBlocks", () => { let fetchNotionBlocks: Mock; let processImage: Mock; let compressImageToFileWithFallback: Mock; + let injectExplicitHeadingIds: Mock; beforeEach(async () => { restoreEnv = installTestNotionEnv(); @@ -223,6 +225,7 @@ describe("generateBlocks", () => { const utils = await import("./utils"); compressImageToFileWithFallback = utils.compressImageToFileWithFallback as Mock; + injectExplicitHeadingIds = utils.injectExplicitHeadingIds as Mock; // Setup default mock implementations processImage.mockResolvedValue(mockProcessedImageResult); @@ -377,6 +380,155 @@ describe("generateBlocks", () => { }); }); + describe("Localized slug and link normalization", () => { + it("should derive the shared ASCII slug from the grouped title for every locale", async () => { + const { generateBlocks } = await import("./generateBlocks"); + const mockWriteFileSync = fs.writeFileSync as Mock; + + const mainPage = createMockNotionPage({ + id: "main-accented", + title: "Título con acentos", + elementType: "Page", + subItems: ["en-accented", "es-accented", "pt-accented"], + }); + const 
englishPage = createMockNotionPage({ + id: "en-accented", + title: "Título con acentos", + language: "English", + elementType: "Page", + }); + const spanishPage = createMockNotionPage({ + id: "es-accented", + title: "Título con acentos", + language: "Spanish", + elementType: "Page", + }); + const portuguesePage = createMockNotionPage({ + id: "pt-accented", + title: "Título con acentos", + language: "Portuguese", + elementType: "Page", + }); + + n2m.pageToMarkdown.mockResolvedValue([]); + n2m.toMarkdownString.mockReturnValue({ parent: "Body content" }); + + await generateBlocks( + [mainPage, englishPage, spanishPage, portuguesePage], + vi.fn() + ); + + const markdownPaths = mockWriteFileSync.mock.calls + .map((call) => call[0]) + .filter( + (value): value is string => + typeof value === "string" && value.endsWith(".md") + ); + + expect(markdownPaths).toEqual( + expect.arrayContaining([ + expect.stringContaining("titulo-con-acentos.md"), + expect.stringContaining( + "i18n/pt/docusaurus-plugin-content-docs/current/titulo-con-acentos.md" + ), + expect.stringContaining( + "i18n/es/docusaurus-plugin-content-docs/current/titulo-con-acentos.md" + ), + ]) + ); + }); + + it("should normalize localized internal docs links before writing markdown", async () => { + const { generateBlocks } = await import("./generateBlocks"); + const mockWriteFileSync = fs.writeFileSync as Mock; + + const pageFamily = createMockPageFamily("Página de prueba", "Page"); + n2m.pageToMarkdown.mockResolvedValue([]); + n2m.toMarkdownString + .mockReturnValueOnce({ + parent: + "[doc](/docs/Guía Rápida#Título Uno) [external](https://example.com/Árbol) [relative](./Guía Local#Título)", + }) + .mockReturnValueOnce({ + parent: + "[doc](/docs/Guía Rápida#Título Uno) [nested](/docs/Category Name/Sub Página#Título Dos)", + }) + .mockReturnValueOnce({ + parent: "[doc](/docs/Guía Rápida#Título Uno)", + }); + + await generateBlocks(pageFamily.pages, vi.fn()); + + const markdownWrites = 
mockWriteFileSync.mock.calls.filter( + (call) => typeof call[0] === "string" && call[0].endsWith(".md") + ); + + const englishOutput = markdownWrites.find( + (call) => + typeof call[0] === "string" && + !call[0].includes("/i18n/") && + call[1].includes("/docs/guia-rapida#titulo-uno") + ); + const portugueseOutput = markdownWrites.find( + (call) => + typeof call[0] === "string" && + call[0].includes("/i18n/pt/") && + call[1].includes("/pt/docs/guia-rapida#titulo-uno") + ); + const spanishOutput = markdownWrites.find( + (call) => + typeof call[0] === "string" && + call[0].includes("/i18n/es/") && + call[1].includes("/es/docs/guia-rapida#titulo-uno") + ); + + expect(englishOutput?.[1]).toContain( + "[doc](/docs/guia-rapida#titulo-uno)" + ); + expect(englishOutput?.[1]).toContain( + "[external](https://example.com/Árbol)" + ); + expect(englishOutput?.[1]).toContain("[relative](./Guía Local#Título)"); + expect(portugueseOutput?.[1]).toContain( + "[nested](/pt/docs/sub-pagina#titulo-dos)" + ); + expect(spanishOutput?.[1]).toContain( + "[doc](/es/docs/guia-rapida#titulo-uno)" + ); + }); + + it("should pass the de-duplicated content through heading ID injection before writing", async () => { + const { generateBlocks } = await import("./generateBlocks"); + const mockWriteFileSync = fs.writeFileSync as Mock; + + const page = createMockNotionPage({ + id: "heading-page", + title: "Heading Title", + elementType: "Page", + language: "English", + }); + + n2m.pageToMarkdown.mockResolvedValue([]); + n2m.toMarkdownString.mockReturnValue({ + parent: "# Heading Title\n\n## Título Único\nContent body", + }); + injectExplicitHeadingIds.mockImplementation( + (content: string) => `${content}\n` + ); + + await generateBlocks([page], vi.fn()); + + expect(injectExplicitHeadingIds).toHaveBeenCalledWith( + "## Título Único\nContent body" + ); + + const markdownWrite = mockWriteFileSync.mock.calls.find( + (call) => typeof call[0] === "string" && call[0].endsWith(".md") + ); + 
expect(markdownWrite?.[1]).toContain(""); + }); + }); + describe("Title fallbacks", () => { it("should fallback to legacy Title property when Content elements is missing", async () => { const { generateBlocks } = await import("./generateBlocks"); diff --git a/scripts/notion-fetch/generateBlocks.ts b/scripts/notion-fetch/generateBlocks.ts index 4dda0cd7..e514676f 100644 --- a/scripts/notion-fetch/generateBlocks.ts +++ b/scripts/notion-fetch/generateBlocks.ts @@ -9,7 +9,9 @@ import type { import { n2m } from "../notionClient"; import { NOTION_PROPERTIES } from "../constants"; import chalk from "chalk"; -import { sanitizeMarkdownContent } from "./utils"; +import { sanitizeMarkdownContent, injectExplicitHeadingIds } from "./utils"; +import { createSafeSlug } from "./slugUtils"; +import { normalizeInternalDocLinks } from "./linkNormalizer"; import config from "../../docusaurus.config"; import SpinnerManager from "./spinnerManager"; import { convertCalloutToAdmonition, isCalloutBlock } from "./calloutProcessor"; @@ -528,6 +530,10 @@ async function processSinglePage( emojiCount += result.fallbackEmojiCount; contentHasS3 = result.containsS3; + markdownString.parent = normalizeInternalDocLinks( + markdownString.parent, + lang + ); markdownString.parent = sanitizeMarkdownContent(markdownString.parent); markdownString.parent = ensureBlankLineAfterStandaloneBold( @@ -538,18 +544,19 @@ async function processSinglePage( markdownString.parent, pageTitle ); + const finalContentBody = injectExplicitHeadingIds(contentBody); const sectionFolderForWrite: Record = {}; sectionFolderForWrite[lang] = currentSectionFolderForLang; - const finalDiagnostics = getImageDiagnostics(markdownString.parent ?? ""); + const finalDiagnostics = getImageDiagnostics(finalContentBody ?? 
""); contentHasS3 = finalDiagnostics.s3Matches > 0; writeMarkdownFile( filePath, frontmatter, - contentBody, + finalContentBody, pageTitle, pageProcessingIndex - 1, totalPages, @@ -887,10 +894,7 @@ export async function generateBlocks( ? sectionTypeRaw.trim() : String(sectionTypeRaw ?? "").trim(); const normalizedSectionType = sectionTypeString.toLowerCase(); - const filename = title - .toLowerCase() - .replace(/\s+/g, "-") - .replace(/[^a-z0-9-]/g, ""); + const filename = createSafeSlug(title) || "untitled"; const orderedLocales = getOrderedLocales(Object.keys(pageByLang.content)); for (const lang of orderedLocales) { diff --git a/scripts/notion-fetch/linkNormalizer.test.ts b/scripts/notion-fetch/linkNormalizer.test.ts new file mode 100644 index 00000000..f2fe5706 --- /dev/null +++ b/scripts/notion-fetch/linkNormalizer.test.ts @@ -0,0 +1,151 @@ +import { describe, it, expect, vi } from "vitest"; + +// Mock the docusaurus config before importing the module under test, +// mirroring the pattern used in generateBlocks.test.ts. 
+vi.mock("../../docusaurus.config", () => ({ + default: { + i18n: { + locales: ["en", "pt", "es"], + defaultLocale: "en", + }, + }, +})); + +import { normalizeInternalDocLinks } from "./linkNormalizer"; + +describe("linkNormalizer", () => { + describe("normalizeInternalDocLinks", () => { + it("should normalize a docs link for the default locale (en) without a locale prefix", () => { + const input = "[link](/docs/Guía Rápida)"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe("[link](/docs/guia-rapida)"); + }); + + it("should add a locale prefix for a non-default locale (es)", () => { + const input = "[link](/docs/Guía Rápida)"; + const result = normalizeInternalDocLinks(input, "es"); + expect(result).toBe("[link](/es/docs/guia-rapida)"); + }); + + it("should normalize both the path and the fragment", () => { + const input = "[link](/docs/Page#Título Uno)"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe("[link](/docs/page#titulo-uno)"); + }); + + it("should leave external links untouched", () => { + const input = "[link](https://example.com/Árbol)"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe(input); + }); + + it("should leave relative links untouched", () => { + const input = "[link](./local)"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe(input); + }); + + it("should not alter image links (lines starting with !)", () => { + const input = "![img](/docs/Accented Page)"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe(input); + }); + + it("should flatten a nested docs path to only the last segment (slug shape)", () => { + const input = "[link](/docs/Category Name/Sub Page)"; + const result = normalizeInternalDocLinks(input, "pt"); + // buildFrontmatter() writes slug: /${safeSlug} (single level), so the + // public URL is /pt/docs/sub-page, not /pt/docs/category-name/sub-page. 
+ expect(result).toBe("[link](/pt/docs/sub-page)"); + }); + + it("should not rewrite links inside a fenced code block", () => { + const input = "```\n[example](/docs/Guía Rápida)\n```"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe(input); + }); + + it("should not rewrite links inside an indented fenced code block", () => { + const input = " ```\n [example](/docs/Guía Rápida)\n ```"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe(input); + }); + + it("should not rewrite links inside a tilde fenced code block", () => { + const input = "~~~\n[example](/docs/Guía Rápida)\n~~~"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe(input); + }); + + it("should not rewrite links inside an indented tilde fenced code block", () => { + const input = " ~~~\n [example](/docs/Guía Rápida)\n ~~~"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe(input); + }); + + it("should not rewrite links inside an unclosed tilde fenced code block", () => { + const input = "~~~\n[example](/docs/Guía Rápida)"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe(input); + }); + + it("should not rewrite links inside inline code", () => { + const input = "Use `[link](/docs/Guía Rápida)` as an example."; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe(input); + }); + + it("should not rewrite links inside multi-backtick inline code", () => { + const input = "Use ``[link](/docs/Guía Rápida)`` as an example."; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe(input); + }); + + it("should normalize multiple docs links on a single line", () => { + const input = "[a](/docs/Foo) and [b](/docs/Bar)"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe("[a](/docs/foo) and [b](/docs/bar)"); + }); + + it("should normalize both links when two docs links are directly adjacent 
(no separator)", () => { + const input = "[Link 1](/docs/Foo)[Link 2](/docs/Bar)"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe("[Link 1](/docs/foo)[Link 2](/docs/bar)"); + }); + + it("should return empty string for empty content", () => { + const result = normalizeInternalDocLinks("", "en"); + expect(result).toBe(""); + }); + + it("should leave plain text with only external links unchanged", () => { + const input = "plain text with [link](https://example.com)"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe(input); + }); + + it("should normalize a link to exactly /docs (en, no locale prefix)", () => { + const input = "[link](/docs)"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe("[link](/docs)"); + }); + + it("should normalize a link to exactly /docs with locale prefix (es)", () => { + const input = "[link](/docs)"; + const result = normalizeInternalDocLinks(input, "es"); + expect(result).toBe("[link](/es/docs)"); + }); + + it("should normalize a link to /docs#fragment (en, no locale prefix)", () => { + const input = "[link](/docs#Sección Uno)"; + const result = normalizeInternalDocLinks(input, "en"); + expect(result).toBe("[link](/docs#seccion-uno)"); + }); + + it("should normalize a link to /docs#fragment with locale prefix (pt)", () => { + const input = "[link](/docs#Sección Uno)"; + const result = normalizeInternalDocLinks(input, "pt"); + expect(result).toBe("[link](/pt/docs#seccion-uno)"); + }); + }); +}); diff --git a/scripts/notion-fetch/linkNormalizer.ts b/scripts/notion-fetch/linkNormalizer.ts new file mode 100644 index 00000000..7a02948f --- /dev/null +++ b/scripts/notion-fetch/linkNormalizer.ts @@ -0,0 +1,84 @@ +import config from "../../docusaurus.config"; +import { createSafeSlug } from "./slugUtils"; +import { + maskFencedCodeBlocks, + maskInlineCodeSpans, + restoreCodeMasks, +} from "./markdownUtils"; + +const DEFAULT_LOCALE = 
config.i18n.defaultLocale; +const MARKDOWN_LINK_REGEX = /(? { + const trimmedTarget = rawTarget.trim(); + const titleMatch = trimmedTarget.match(/^(\/docs\/[^\n]*?)(\s+"[^"]*")$/); + const target = titleMatch ? titleMatch[1] : trimmedTarget; + const titleSuffix = titleMatch?.[2] ?? ""; + + if ( + target !== "/docs" && + !target.startsWith("/docs/") && + !target.startsWith("/docs#") + ) { + return match; + } + + return `[${text}](${normalizeDocTarget(target, lang)}${titleSuffix})`; + } + ); + + return restoreCodeMasks(normalizedContent, codeBlocks, codeSpans); +} diff --git a/scripts/notion-fetch/markdownUtils.ts b/scripts/notion-fetch/markdownUtils.ts new file mode 100644 index 00000000..e11675f8 --- /dev/null +++ b/scripts/notion-fetch/markdownUtils.ts @@ -0,0 +1,203 @@ +/** + * Shared markdown code-block masking utilities. + * Both contentSanitizer and linkNormalizer use identical logic for masking + * code blocks and inline code spans before processing, then restoring them. + */ + +/** Placeholder prefix used for fenced code blocks. */ +const CODEBLOCK_PREFIX = "__CODEBLOCK_"; +/** Placeholder prefix used for inline code spans. */ +const CODESPAN_PREFIX = "__CODESPAN_"; +const PLACEHOLDER_SUFFIX = "__"; + +/** Checks whether `line` is a valid closing fence for a block opened with `fenceChar` of length `fenceLength`. */ +function isClosingFenceLine( + line: string, + fenceChar: string, + fenceLength: number +): boolean { + let i = 0; + + while (i < line.length && line.charAt(i) === " ") { + i++; + } + + if (i > 3) { + return false; + } + + let fenceCount = 0; + while (i < line.length && line.charAt(i) === fenceChar) { + fenceCount++; + i++; + } + + if (fenceCount < fenceLength) { + return false; + } + + while (i < line.length) { + if (line.charAt(i) !== " " && line.charAt(i) !== "\t") { + return false; + } + i++; + } + + return true; +} + +/** + * Masks all fenced code blocks in `content`, replacing each with a placeholder token. 
+ * Returns the masked content, the array of original code blocks, and their placeholder strings. + */ +export function maskFencedCodeBlocks(content: string): { + content: string; + codeBlocks: string[]; + placeholders: string[]; +} { + const lines = content.split("\n"); + const codeBlocks: string[] = []; + const placeholders: string[] = []; + const maskedLines: string[] = []; + + let inFence = false; + let fenceChar = ""; + let fenceLength = 0; + let blockLines: string[] = []; + + for (const line of lines) { + if (!inFence) { + const openingMatch = line.match(/^ {0,3}(`{3,}|~{3,})(.*)$/); + + if (!openingMatch) { + maskedLines.push(line); + continue; + } + + const fence = openingMatch[1]; + fenceChar = fence[0]; + fenceLength = fence.length; + blockLines = [line]; + inFence = true; + continue; + } + + blockLines.push(line); + + if (isClosingFenceLine(line, fenceChar, fenceLength)) { + codeBlocks.push(blockLines.join("\n")); + const placeholder = `${CODEBLOCK_PREFIX}${codeBlocks.length - 1}${PLACEHOLDER_SUFFIX}`; + placeholders.push(placeholder); + maskedLines.push(placeholder); + inFence = false; + blockLines = []; + } + } + + if (inFence) { + codeBlocks.push(blockLines.join("\n")); + const placeholder = `${CODEBLOCK_PREFIX}${codeBlocks.length - 1}${PLACEHOLDER_SUFFIX}`; + placeholders.push(placeholder); + maskedLines.push(placeholder); + } + + return { + content: maskedLines.join("\n"), + codeBlocks, + placeholders, + }; +} + +/** + * Masks all inline code spans in `content`, replacing each with a placeholder token. + * Returns the masked content and the array of original code spans. 
+ */ +export function maskInlineCodeSpans(content: string): { + content: string; + codeSpans: string[]; +} { + const codeSpans: string[] = []; + const output: string[] = []; + + let i = 0; + while (i < content.length) { + const currentChar = content.charAt(i); + if (currentChar !== "`") { + output.push(currentChar); + i++; + continue; + } + + let openingLength = 0; + while ( + i + openingLength < content.length && + content.charAt(i + openingLength) === "`" + ) { + openingLength++; + } + + let scanIndex = i + openingLength; + let closingIndex = -1; + while (scanIndex < content.length) { + const nextBacktick = content.indexOf("`", scanIndex); + if (nextBacktick === -1) { + break; + } + + let closingLength = 0; + while ( + nextBacktick + closingLength < content.length && + content.charAt(nextBacktick + closingLength) === "`" + ) { + closingLength++; + } + + if (closingLength === openingLength) { + closingIndex = nextBacktick; + break; + } + + scanIndex = nextBacktick + closingLength; + } + + if (closingIndex === -1) { + output.push(content.slice(i, i + openingLength)); + i += openingLength; + continue; + } + + const codeSpan = content.slice(i, closingIndex + openingLength); + codeSpans.push(codeSpan); + output.push( + `${CODESPAN_PREFIX}${codeSpans.length - 1}${PLACEHOLDER_SUFFIX}` + ); + i = closingIndex + openingLength; + } + + return { + content: output.join(""), + codeSpans, + }; +} + +/** + * Restores previously masked fenced code blocks and inline code spans. + */ +export function restoreCodeMasks( + content: string, + codeBlocks: string[], + codeSpans: string[] +): string { + const restoreByIndex = (values: string[], rawIndex: string) => { + const index = Number(rawIndex); + return Number.isInteger(index) ? (values.at(index) ?? 
"") : ""; + }; + + return content + .replace(/__CODESPAN_(\d+)__/g, (_match, index) => { + return restoreByIndex(codeSpans, index); + }) + .replace(/__CODEBLOCK_(\d+)__/g, (_match, index) => { + return restoreByIndex(codeBlocks, index); + }); +} diff --git a/scripts/notion-fetch/page-ordering.test.ts b/scripts/notion-fetch/page-ordering.test.ts index 5bb96c83..4d0fa461 100644 --- a/scripts/notion-fetch/page-ordering.test.ts +++ b/scripts/notion-fetch/page-ordering.test.ts @@ -103,6 +103,7 @@ vi.mock("./imageProcessor", () => ({ vi.mock("./utils", () => ({ sanitizeMarkdownContent: vi.fn((content) => content), + injectExplicitHeadingIds: vi.fn((content) => content), compressImageToFileWithFallback: vi.fn(), detectFormatFromBuffer: vi.fn(() => "jpeg"), formatFromContentType: vi.fn(() => "jpeg"), diff --git a/scripts/notion-fetch/slugUtils.test.ts b/scripts/notion-fetch/slugUtils.test.ts new file mode 100644 index 00000000..813b3478 --- /dev/null +++ b/scripts/notion-fetch/slugUtils.test.ts @@ -0,0 +1,54 @@ +import { describe, it, expect } from "vitest"; +import { createSafeSlug } from "./slugUtils"; + +describe("slugUtils", () => { + describe("createSafeSlug", () => { + it("should convert basic Latin text to lowercase hyphenated slug", () => { + expect(createSafeSlug("Hello World")).toBe("hello-world"); + }); + + it("should strip accented Latin characters", () => { + expect(createSafeSlug("Título con acentos")).toBe("titulo-con-acentos"); + }); + + it("should handle Spanish accented characters", () => { + expect(createSafeSlug("Guía Rápida")).toBe("guia-rapida"); + }); + + it("should handle Portuguese characters", () => { + expect(createSafeSlug("Instalação")).toBe("instalacao"); + }); + + it("should handle ñ and accented vowels in Spanish words", () => { + expect(createSafeSlug("Niño & Acción")).toBe("nino-accion"); + }); + + it("should return an empty string for empty input", () => { + expect(createSafeSlug("")).toBe(""); + }); + + it("should strip diacritics from 
/**
 * Builds a lowercase, hyphen-separated slug from arbitrary text.
 *
 * Processing order:
 *  1. Decompose (NFD) and drop combining diacritical marks in
 *     U+0300–U+036F, so accented letters lose their accents ("é" → "e").
 *  2. Recompose (NFC). Without this step Hangul syllables would stay split
 *     into conjoining jamo, and kana voicing marks (U+3099/U+309A) would be
 *     left detached and removed in step 4, turning e.g. "ガ" into "カ".
 *  3. Lowercase, trim, and turn each whitespace run into a single hyphen.
 *  4. Remove everything that is not a Unicode letter, a Unicode number, or
 *     a hyphen.
 *  5. Collapse hyphen runs and strip leading/trailing hyphens.
 *
 * @param text - Source text, e.g. a page title.
 * @returns The slug; an empty string when no letters or numbers remain.
 */
export function createSafeSlug(text: string): string {
  return (
    text
      .normalize("NFD")
      // Only the generic combining-diacritics block is stripped; marks that
      // belong to other scripts must survive until recomposition.
      .replace(/[\u0300-\u036f]/g, "")
      .normalize("NFC")
      .toLowerCase()
      .trim()
      .replace(/\s+/g, "-")
      .replace(/[^\p{L}\p{N}-]/gu, "")
      .replace(/-+/g, "-")
      .replace(/^-+|-+$/g, "")
  );
}
/**
 * Turns a title into its slug form by delegating to `createSafeSlug`.
 *
 * @param title - Title text to convert.
 * @returns Whatever `createSafeSlug` produces for `title`.
 */
const slugify = (title: string): string => {
  return createSafeSlug(title);
};
[]; const readyPages = results.filter(isReadyToPublish); diff --git a/scripts/notion-translate/imageStabilization.test.ts b/scripts/notion-translate/imageStabilization.test.ts index 514a5d92..946c9194 100644 --- a/scripts/notion-translate/imageStabilization.test.ts +++ b/scripts/notion-translate/imageStabilization.test.ts @@ -906,7 +906,7 @@ describe("image stabilization in translation pipeline", () => { expect(mockProcessAndReplaceImages).toHaveBeenCalledWith( expect.any(String), - "hllo-wrld-pageid1" + "hello-world-pageid1" ); }); diff --git a/scripts/notion-translate/index.ts b/scripts/notion-translate/index.ts index 03773771..c2c2c4c2 100644 --- a/scripts/notion-translate/index.ts +++ b/scripts/notion-translate/index.ts @@ -38,6 +38,7 @@ import { validateAndFixRemainingImages, extractImageMatches, } from "../notion-fetch/imageReplacer.js"; +import { createSafeSlug } from "../notion-fetch/slugUtils.js"; const LEGACY_SECTION_PROPERTY = "Section"; const PARENT_ITEM_PROPERTY = "Parent item"; @@ -584,11 +585,7 @@ const NOTION_IMAGE_URL_FAMILY_REGEX = new RegExp( * image filenames remain consistent with markdown filenames. */ function generateSafeFilename(title: string, pageId: string): string { - const baseSlug = title - .toLowerCase() - .replace(/\s+/g, "-") - .replace(/[^a-z0-9-]/g, "") - .substring(0, MAX_SLUG_LENGTH); + const baseSlug = createSafeSlug(title).substring(0, MAX_SLUG_LENGTH); const stablePageId = pageId.toLowerCase().replace(/[^a-z0-9]/g, ""); const deterministicBase = baseSlug || "untitled"; return `${deterministicBase}-${stablePageId}`;