Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Changelog - PR 170

## ✨ New Features

- **Slug Normalization**: Accented slugs are now normalized and locale-prefixed link references are supported.

## 🐛 Fixes

- **Doc Paths**: Flattened nested document paths.
- **Link Normalization**: Links inside code blocks and indented code fences are now properly skipped during link normalization.
- **Heading IDs**: Explicit heading IDs and empty filenames are handled correctly to prevent heading ID collisions.
- **Slug Generation**: Preserved CJK and Unicode letters in slug generation.
- **Code Fences**: Aligned the code-fence regex with the CommonMark standard.

## 🧪 Testing

- **Normalization**: Aligned tests with new normalization expectations.
4 changes: 3 additions & 1 deletion bun-tests/vitest-bridge.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,7 @@ test(
() => {
runVitest();
},
{ timeout: 120_000 }
// The full Vitest suite can take just under two minutes on this repo, and
// Bun's own test harness adds enough overhead that 120s is too tight.
{ timeout: 300_000 }
);
8 changes: 6 additions & 2 deletions eslint.config.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ const eslintConfig = [
// Docusaurus specific configurations
{
files: ["**/*.{js,mjs,cjs,ts,jsx,tsx}"],
ignores: ["scripts/**", "api-server/**"], // Ignore scripts and api-server directories for docusaurus rules
ignores: ["scripts/**", "api-server/**", "bun-tests/**"], // Ignore non-Docusaurus runtime directories for docusaurus/react rules
plugins: {
"@docusaurus": docusaurusPlugin,
react: pluginReact,
Expand Down Expand Up @@ -74,7 +74,11 @@ const eslintConfig = [

// Scripts and API server specific configurations
{
files: ["scripts/**/*.{js,mjs,cjs,ts}", "api-server/**/*.{js,mjs,cjs,ts}"],
files: [
"scripts/**/*.{js,mjs,cjs,ts}",
"api-server/**/*.{js,mjs,cjs,ts}",
"bun-tests/**/*.{js,mjs,cjs,ts}",
],
plugins: {
import: importPlugin,
promise: promisePlugin,
Expand Down
1 change: 1 addition & 0 deletions scripts/notion-fetch/__tests__/retry-loop-behavior.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ vi.mock("../imageProcessor", () => ({

vi.mock("../utils", () => ({
sanitizeMarkdownContent: vi.fn((content) => content),
injectExplicitHeadingIds: vi.fn((content) => content),
compressImageToFileWithFallback: vi.fn().mockResolvedValue({
finalSize: 512,
usedFallback: false,
Expand Down
88 changes: 88 additions & 0 deletions scripts/notion-fetch/contentSanitizer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,18 @@ describe("contentSanitizer", () => {
expect(result).toBe(input); // Should remain unchanged
});

// A `~~~` fence must be left untouched by the sanitizer: the input is expected
// back verbatim, including the `{ key: 'value' }` object literal inside it.
it("should preserve tilde fenced code blocks", () => {
const input = "~~~md\nconst obj = { key: 'value' };\n~~~";
const result = scriptModule.sanitizeMarkdownContent(input);
// Exact equality: nothing inside the tilde fence may be rewritten.
expect(result).toBe(input);
});

// Inline code delimited by double backticks must be left untouched as well:
// both the `<link to section.>` text and the `{foo}` expression inside the
// spans are expected to come back unchanged.
it("should preserve multi-backtick inline code spans", () => {
const input = "Use ``<link to section.>`` and ``{foo}``.";
const result = scriptModule.sanitizeMarkdownContent(input);
// Exact equality: the content of both code spans must survive sanitization.
expect(result).toBe(input);
});

it("should fix malformed <link to section.> patterns", () => {
const input = "Check <link to section.> for details.";
const result = scriptModule.sanitizeMarkdownContent(input);
Expand Down Expand Up @@ -226,4 +238,80 @@ echo "# Not a heading"
});
});
});

// Behavioural contract for `injectExplicitHeadingIds`: every case feeds a small
// Markdown snippet through the function and checks the `{#id}` suffixes that
// end up on the heading lines.
describe("injectExplicitHeadingIds", () => {
// Accented text is expected to yield an unaccented, lower-case id, and a
// repeated heading text gets a numeric suffix (`-1`) regardless of its level.
it("should normalize accented headings and append stable duplicate suffixes", () => {
const input = [
"# Título Único",
"## Título Único",
"### Niño & Acción",
].join("\n");

const result = scriptModule.injectExplicitHeadingIds(input);

expect(result).toContain("# Título Único {#titulo-unico}");
expect(result).toContain("## Título Único {#titulo-unico-1}");
expect(result).toContain("### Niño & Acción {#nino-accion}");
});

// Headings that already carry `{#...}` stay as written, and heading-looking
// lines inside a backtick fence must not receive an id.
it("should preserve existing explicit heading ids and code fences", () => {
const input = [
"# Encabezado {#custom-id}",
"```md",
"## Código Único",
"```",
"## Otro Título",
].join("\n");

const result = scriptModule.injectExplicitHeadingIds(input);

expect(result).toContain("# Encabezado {#custom-id}");
// The fenced block must come back byte-for-byte.
expect(result).toContain("```md\n## Código Único\n```");
expect(result).toContain("## Otro Título {#otro-titulo}");
expect(result).not.toContain("## Código Único {#codigo-unico}");
});

// Same guarantee as above, but for `~~~` fences.
it("should preserve headings inside tilde fenced code blocks", () => {
const input = ["~~~md", "## Código Único", "~~~", "## Otro Título"].join(
"\n"
);

const result = scriptModule.injectExplicitHeadingIds(input);

expect(result).toContain("~~~md\n## Código Único\n~~~");
expect(result).toContain("## Otro Título {#otro-titulo}");
expect(result).not.toContain("## Código Único {#codigo-unico}");
});

// An explicit id that happens to look like an auto-generated suffix
// (`titulo-1`) must be skipped when numbering duplicates.
it("should avoid collisions between auto-incremented and explicit IDs", () => {
const input = ["## Título", "## Heading {#titulo-1}", "## Título"].join(
"\n"
);

const result = scriptModule.injectExplicitHeadingIds(input);

expect(result).toContain("## Título {#titulo}");
expect(result).toContain("## Heading {#titulo-1}");
// The second "Título" must NOT get titulo-1 (already claimed), should get titulo-2
expect(result).toContain("## Título {#titulo-2}");
});

// Explicit ids are reserved for the whole document, so an explicit id that
// appears later still wins over an earlier heading with the same slug.
it("should reserve later explicit ids before assigning earlier auto-generated headings", () => {
const input = ["## My Id", "## Custom {#my-id}"].join("\n");

const result = scriptModule.injectExplicitHeadingIds(input);

expect(result).toContain("## My Id {#my-id-1}");
expect(result).toContain("## Custom {#my-id}");
});

// Only the explicit id itself is reserved, not the slug of the heading text
// it is attached to, so the earlier "My Id" keeps the plain `my-id`.
it("should not reserve a natural slug when a later explicit id is custom", () => {
const input = ["## My Id", "## My Id {#custom}"].join("\n");

const result = scriptModule.injectExplicitHeadingIds(input);

expect(result).toContain("## My Id {#my-id}");
expect(result).toContain("## My Id {#custom}");
});
});
});
117 changes: 94 additions & 23 deletions scripts/notion-fetch/contentSanitizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,13 @@
* that cause MDX compilation errors in Docusaurus.
*/

import { createSafeSlug } from "./slugUtils";
import {
maskFencedCodeBlocks,
maskInlineCodeSpans,
restoreCodeMasks,
} from "./markdownUtils";

const EMOJI_STYLE_MARKERS = ["display:", "height:", "margin:"];

const isEmojiStyleObject = (snippet: string): boolean =>
Expand Down Expand Up @@ -68,6 +75,83 @@ function fixHeadingHierarchy(
return fixedLines.join("\n");
}

/**
 * Appends an explicit `{#id}` marker to every ATX heading (`#` through
 * `######`, indented by at most three whitespace characters) that does not
 * already end with one.
 *
 * The function works in two passes over the content returned by
 * `maskFencedCodeBlocks`:
 *
 * 1. Collect every `{#id}` that already terminates a line, so those ids are
 *    reserved for the whole document no matter where they appear.
 * 2. Give each remaining heading an id built from `createSafeSlug(text)`,
 *    adding `-1`, `-2`, ... until the id is neither reserved nor already
 *    handed out.
 *
 * Lines containing one of the mask placeholders are never inspected or
 * modified. Rewritten headings are emitted as
 * `<indent><hashes> <text> {#id}`, so whitespace around the text is collapsed
 * and trailing whitespace is dropped. Headings whose slug is empty are left
 * unchanged.
 *
 * @param content - Markdown source. Falsy input is returned as-is.
 * @returns The content with explicit heading ids injected, after the masked
 *   code blocks have been put back via `restoreCodeMasks`.
 */
export function injectExplicitHeadingIds(content: string): string {
  if (!content) {
    return content;
  }

  // Heading line that already ends with an explicit `{#id}`.
  const headingWithIdPattern = /^(\s{0,3})(#{1,6})\s+(.+?)\s*\{#([^}]+)\}\s*$/;
  // Any line that ends with a whitespace-separated `{#id}`.
  const trailingIdPattern = /\s\{#([^}]+)\}\s*$/;
  // Heading line without regard to an id; capture groups: indent, hashes, text.
  const plainHeadingPattern = /^(\s{0,3})(#{1,6})\s+(.+?)\s*$/;

  const masked = maskFencedCodeBlocks(content);
  const { codeBlocks, placeholders } = masked;
  const sourceLines = masked.content.split("\n");

  // True for lines that stand in for a masked code block.
  const holdsPlaceholder = (line: string): boolean =>
    placeholders.some((placeholder) => line.includes(placeholder));

  // Pass 1: reserve every explicit id before any id is generated, so an
  // explicit id later in the document still blocks an earlier auto id.
  const reservedIds = new Set<string>();
  for (const line of sourceLines) {
    if (holdsPlaceholder(line)) {
      continue;
    }

    // The heading pattern is tried first; the looser trailing pattern is only
    // a fallback for lines the heading pattern rejects.
    const headingHit = headingWithIdPattern.exec(line);
    const declaredId = headingHit
      ? headingHit[4]
      : trailingIdPattern.exec(line)?.[1];
    if (declaredId) {
      reservedIds.add(declaredId);
    }
  }

  // Pass 2: rewrite headings. `headingCounts` holds both the next suffix to
  // try for each base slug and every generated id that is already in use.
  const headingCounts = new Map<string, number>();
  const rewritten: string[] = [];
  for (const line of sourceLines) {
    // Masked code and headings with an explicit id pass through untouched.
    if (holdsPlaceholder(line) || headingWithIdPattern.test(line)) {
      rewritten.push(line);
      continue;
    }

    const parts = plainHeadingPattern.exec(line);
    if (!parts) {
      rewritten.push(line);
      continue;
    }

    const [, indent, marker, title] = parts;
    const baseId = createSafeSlug(title);
    if (!baseId) {
      // Nothing usable to build an id from; keep the heading as written.
      rewritten.push(line);
      continue;
    }

    const idForSuffix = (n: number): string =>
      n === 0 ? baseId : `${baseId}-${n}`;

    let suffix = headingCounts.get(baseId) ?? 0;
    let chosenId = idForSuffix(suffix);
    while (reservedIds.has(chosenId) || headingCounts.has(chosenId)) {
      suffix += 1;
      chosenId = idForSuffix(suffix);
    }

    // Remember where to resume for this slug, and mark a suffixed id as taken
    // so a heading whose own slug equals it cannot reuse it.
    headingCounts.set(baseId, suffix + 1);
    if (chosenId !== baseId) {
      headingCounts.set(chosenId, 1);
    }

    rewritten.push(`${indent}${marker} ${title} {#${chosenId}}`);
  }

  return restoreCodeMasks(rewritten.join("\n"), codeBlocks, []);
}

/**
* Sanitizes markdown content to fix malformed HTML/JSX tags that cause MDX compilation errors
* @param content - The markdown content string
Expand All @@ -77,23 +161,17 @@ export function sanitizeMarkdownContent(content: string): string {
// Fix specific malformed patterns that cause MDX errors

// 0. Mask code fences (```...```) and inline code (`...`) to avoid altering them
const codeBlocks: string[] = [];
const codeSpans: string[] = [];
const codeBlockPlaceholders: string[] = [];

content = content.replace(/```[\s\S]*?```/g, (m) => {
codeBlocks.push(m);
const placeholder = `__CODEBLOCK_${codeBlocks.length - 1}__`;
codeBlockPlaceholders.push(placeholder);
return placeholder;
});
content = content.replace(/`[^`\n]*`/g, (m) => {
codeSpans.push(m);
return `__CODESPAN_${codeSpans.length - 1}__`;
});
const {
content: maskedContent,
codeBlocks,
placeholders,
} = maskFencedCodeBlocks(content);
const { content: maskedWithCodeSpans, codeSpans } =
maskInlineCodeSpans(maskedContent);
content = maskedWithCodeSpans;

// 1. Fix heading hierarchy for proper TOC generation (after masking code blocks)
content = fixHeadingHierarchy(content, codeBlockPlaceholders);
content = fixHeadingHierarchy(content, placeholders);

// 2. Aggressively strip all curly-brace expressions by unwrapping to inner text
// BUT preserve JSX style objects for emoji images
Expand Down Expand Up @@ -157,14 +235,7 @@ export function sanitizeMarkdownContent(content: string): string {
}

// 9. Restore masked code blocks and inline code
content = content.replace(
/__CODEBLOCK_(\d+)__/g,
(_m, i) => codeBlocks[Number(i)]
);
content = content.replace(
/__CODESPAN_(\d+)__/g,
(_m, i) => codeSpans[Number(i)]
);
content = restoreCodeMasks(content, codeBlocks, codeSpans);

return content;
}
Loading
Loading