From 33c1581bba94d0e1d3e1517813be6b777d7fdb89 Mon Sep 17 00:00:00 2001 From: luandro Date: Thu, 19 Mar 2026 17:00:07 -0300 Subject: [PATCH 01/18] fix(translate): prevent content loss in long-form translation (#166) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Long-form Notion pages (troubleshooting, create/edit observation, etc.) were silently dropping sections during automatic translation. The likely cause: feeding very large chunks to the model saturated its effective attention window, causing it to omit headings and paragraphs without raising an error. Changes: - Lower proactive chunk ceiling from 500 K → 120 K chars so each translation request stays well within reliable model attention range - Add structural completeness validation after every translation call: checks heading count, fenced code blocks, bullet/numbered lists, table lines, and severe length shrinkage (< 55 % of source) - Retry with progressively smaller chunks (halved each attempt, floor 8 K chars) up to TRANSLATION_COMPLETENESS_MAX_RETRIES (2) times when incompleteness is detected, then surface a non-critical error Closes #166 --- scripts/constants.ts | 14 +- .../translateFrontMatter.test.ts | 268 ++++++++++++++++-- .../notion-translate/translateFrontMatter.ts | 254 ++++++++++++++--- 3 files changed, 473 insertions(+), 63 deletions(-) diff --git a/scripts/constants.ts b/scripts/constants.ts index 020f4c9c..bc64be72 100644 --- a/scripts/constants.ts +++ b/scripts/constants.ts @@ -182,10 +182,16 @@ export const ENGLISH_DIR_SAVE_ERROR = // Translation retry configuration export const TRANSLATION_MAX_RETRIES = 3; export const TRANSLATION_RETRY_BASE_DELAY_MS = 750; -/** Max characters per translation chunk. - * Targets ~143K tokens (500K chars / 3.5 chars per token). - * Leaves generous buffer within OpenAI's 272K structured-output limit. 
*/ -export const TRANSLATION_CHUNK_MAX_CHARS = 500_000; +/** + * Reliability-oriented cap for proactive markdown translation chunking. + * This keeps long-form docs away from the model's theoretical context ceiling, + * even when the model advertises a much larger maximum context window. + */ +export const TRANSLATION_CHUNK_MAX_CHARS = 120_000; +/** Smallest total-budget chunk size used when retrying incomplete translations. */ +export const TRANSLATION_MIN_CHUNK_MAX_CHARS = 8_000; +/** Maximum times to retry with smaller chunks after completeness checks fail. */ +export const TRANSLATION_COMPLETENESS_MAX_RETRIES = 2; // URL handling export const INVALID_URL_PLACEHOLDER = diff --git a/scripts/notion-translate/translateFrontMatter.test.ts b/scripts/notion-translate/translateFrontMatter.test.ts index f1351017..25922e2a 100644 --- a/scripts/notion-translate/translateFrontMatter.test.ts +++ b/scripts/notion-translate/translateFrontMatter.test.ts @@ -5,6 +5,58 @@ import { } from "./test-openai-mock"; import { installTestNotionEnv } from "../test-utils"; +type MockOpenAIRequest = { + messages?: Array<{ role: string; content: string }>; +}; + +function extractPromptMarkdown(request: MockOpenAIRequest): { + title: string; + markdown: string; +} { + const userPrompt = + request.messages?.find((message) => message.role === "user")?.content ?? ""; + const titleMatch = userPrompt.match(/^title:\s*(.*)$/m); + const markdownMarker = "\nmarkdown: "; + const markdownIndex = userPrompt.indexOf(markdownMarker); + + return { + title: titleMatch?.[1] ?? "", + markdown: + markdownIndex >= 0 + ? 
userPrompt.slice(markdownIndex + markdownMarker.length) + : "", + }; +} + +function installStructuredTranslationMock( + mapResponse?: (payload: { title: string; markdown: string }) => { + title: string; + markdown: string; + } +) { + mockOpenAIChatCompletionCreate.mockImplementation( + async (request: MockOpenAIRequest) => { + const payload = extractPromptMarkdown(request); + const translated = mapResponse + ? mapResponse(payload) + : { + title: payload.title ? `Translated ${payload.title}` : "", + markdown: payload.markdown, + }; + + return { + choices: [ + { + message: { + content: JSON.stringify(translated), + }, + }, + ], + }; + } + ); +} + describe("notion-translate translateFrontMatter", () => { let restoreEnv: () => void; @@ -55,7 +107,186 @@ describe("notion-translate translateFrontMatter", () => { ); }); - it("classifies token overflow errors as non-critical token_overflow code", async () => { + it("chunks long-form content proactively below model-derived maximums", async () => { + const { translateText } = await import("./translateFrontMatter"); + installStructuredTranslationMock(); + + const largeContent = + "# Section One\n\n" + + "word ".repeat(14_000) + + "\n# Section Two\n\n" + + "word ".repeat(14_000); + + const result = await translateText(largeContent, "Large Page", "pt-BR"); + + expect(mockOpenAIChatCompletionCreate.mock.calls.length).toBeGreaterThan(1); + expect(result.markdown).toContain("# Section Two"); + }); + + it("retries with smaller chunks when a valid response omits a section", async () => { + const { translateText } = await import("./translateFrontMatter"); + + const source = + "# Section One\n\n" + + "Alpha paragraph.\n\n" + + "# Section Two\n\n" + + "Beta paragraph.\n\n" + + "# Section Three\n\n" + + "Gamma paragraph."; + + mockOpenAIChatCompletionCreate + .mockResolvedValueOnce({ + choices: [ + { + message: { + content: JSON.stringify({ + markdown: + "# Seção Um\n\nParágrafo alfa.\n\n# Seção Três\n\nParágrafo gama.", + title: 
"Título Traduzido", + }), + }, + }, + ], + }) + .mockResolvedValue({ + choices: [ + { + message: { + content: JSON.stringify({ + markdown: + "# Seção Um\n\nParágrafo alfa.\n\n# Seção Dois\n\nParágrafo beta.\n\n# Seção Três\n\nParágrafo gama.", + title: "Título Traduzido", + }), + }, + }, + ], + }); + + const result = await translateText(source, "Original Title", "pt-BR", { + chunkLimit: 8_500, + }); + + expect(mockOpenAIChatCompletionCreate).toHaveBeenCalledTimes(2); + expect(result.markdown).toContain("# Seção Dois"); + expect(result.title).toBe("Título Traduzido"); + }); + + it("fails when repeated completeness retries still return incomplete content", async () => { + const { translateText } = await import("./translateFrontMatter"); + + const source = + "# Section One\n\n" + + "Alpha paragraph.\n\n" + + "# Section Two\n\n" + + "Beta paragraph.\n\n" + + "# Section Three\n\n" + + "Gamma paragraph."; + + mockOpenAIChatCompletionCreate.mockImplementation(async () => ({ + choices: [ + { + message: { + content: JSON.stringify({ + markdown: + "# Seção Um\n\nParágrafo alfa.\n\n# Seção Três\n\nParágrafo gama.", + title: "Título Traduzido", + }), + }, + }, + ], + })); + + await expect( + translateText(source, "Original Title", "pt-BR", { + chunkLimit: 8_500, + }) + ).rejects.toEqual( + expect.objectContaining({ + code: "unexpected_error", + isCritical: false, + }) + ); + expect(mockOpenAIChatCompletionCreate.mock.calls.length).toBeGreaterThan(1); + }); + + it("treats heavy structural shrinkage as incomplete long-form translation", async () => { + const { translateText } = await import("./translateFrontMatter"); + + const source = + "# Long Section\n\n" + + Array.from( + { length: 160 }, + (_, index) => `Paragraph ${index} with repeated explanatory content.` + ).join("\n\n"); + + mockOpenAIChatCompletionCreate + .mockResolvedValueOnce({ + choices: [ + { + message: { + content: JSON.stringify({ + markdown: "# Seção Longa\n\nResumo curto.", + title: "Título Traduzido", + }), 
+ }, + }, + ], + }) + .mockImplementation(async (request: MockOpenAIRequest) => { + const payload = extractPromptMarkdown(request); + return { + choices: [ + { + message: { + content: JSON.stringify({ + markdown: payload.markdown.replace(/Paragraph/g, "Parágrafo"), + title: "Título Traduzido", + }), + }, + }, + ], + }; + }); + + const result = await translateText(source, "Original Title", "pt-BR", { + chunkLimit: 25_000, + }); + + expect(mockOpenAIChatCompletionCreate).toHaveBeenCalledTimes(2); + expect(result.markdown.length).toBeGreaterThan(4_000); + }); + + it("preserves complete heading structures when chunking by sections", async () => { + const { translateText } = await import("./translateFrontMatter"); + installStructuredTranslationMock(({ title, markdown }) => ({ + title: title ? `Translated ${title}` : "", + markdown: markdown + .replace("# Section One", "# Seção Um") + .replace("# Section Two", "# Seção Dois") + .replace("# Section Three", "# Seção Três") + .replace(/Alpha/g, "Alfa") + .replace(/Gamma/g, "Gama"), + })); + + const source = + "# Section One\n\n" + + "Alpha ".repeat(60) + + "\n\n# Section Two\n\n" + + "Beta ".repeat(60) + + "\n\n# Section Three\n\n" + + "Gamma ".repeat(60); + + const result = await translateText(source, "Original Title", "pt-BR", { + chunkLimit: 500, + }); + + expect(mockOpenAIChatCompletionCreate).toHaveBeenCalledTimes(3); + expect(result.markdown).toContain("# Seção Um"); + expect(result.markdown).toContain("# Seção Dois"); + expect(result.markdown).toContain("# Seção Três"); + }); + + it("continues to classify token overflow errors as non-critical token_overflow code", async () => { const { translateText } = await import("./translateFrontMatter"); mockOpenAIChatCompletionCreate.mockRejectedValue({ @@ -91,6 +322,7 @@ describe("notion-translate translateFrontMatter", () => { it("takes the single-call fast path for small content", async () => { const { translateText } = await import("./translateFrontMatter"); + 
installStructuredTranslationMock(); const result = await translateText( "# Small page\n\nJust a paragraph.", @@ -99,14 +331,15 @@ describe("notion-translate translateFrontMatter", () => { ); expect(mockOpenAIChatCompletionCreate).toHaveBeenCalledTimes(1); - expect(result.title).toBe("Mock Title"); - expect(result.markdown).toBe("# translated\n\nMock content"); + expect(result.title).toBe("Translated Small"); + expect(result.markdown).toBe("# Small page\n\nJust a paragraph."); }); it("chunks large content and calls the API once per chunk", async () => { const { translateText, splitMarkdownIntoChunks } = await import( "./translateFrontMatter" ); + installStructuredTranslationMock(); // Build content that is larger than the chunk threshold const bigSection1 = "# Section One\n\n" + "word ".repeat(100_000); @@ -123,8 +356,8 @@ describe("notion-translate translateFrontMatter", () => { expect( mockOpenAIChatCompletionCreate.mock.calls.length ).toBeGreaterThanOrEqual(2); - expect(result.title).toBe("Mock Title"); // taken from first chunk - expect(typeof result.markdown).toBe("string"); + expect(result.title).toBe("Translated Big Page"); + expect(result.markdown).toContain("# Section Two"); expect(result.markdown.length).toBeGreaterThan(0); }); @@ -137,17 +370,20 @@ describe("notion-translate translateFrontMatter", () => { message: "This model's maximum context length is 131072 tokens. 
However, you requested 211603 tokens (211603 in the messages, 0 in the completion).", }) - .mockResolvedValue({ - choices: [ - { - message: { - content: JSON.stringify({ - markdown: "translated chunk", - title: "Translated Title", - }), + .mockImplementation(async (request: MockOpenAIRequest) => { + const payload = extractPromptMarkdown(request); + return { + choices: [ + { + message: { + content: JSON.stringify({ + markdown: payload.markdown, + title: "Translated Title", + }), + }, }, - }, - ], + ], + }; }); const result = await translateText( @@ -158,7 +394,7 @@ describe("notion-translate translateFrontMatter", () => { expect(mockOpenAIChatCompletionCreate.mock.calls.length).toBeGreaterThan(1); expect(result.title).toBe("Translated Title"); - expect(result.markdown.length).toBeGreaterThan(0); + expect(result.markdown).toContain("Just a paragraph."); }); it("masks and restores data URL images during translation", async () => { diff --git a/scripts/notion-translate/translateFrontMatter.ts b/scripts/notion-translate/translateFrontMatter.ts index 4f69d9a7..aa462d2f 100644 --- a/scripts/notion-translate/translateFrontMatter.ts +++ b/scripts/notion-translate/translateFrontMatter.ts @@ -12,6 +12,9 @@ import { OPENAI_BASE_URL, IS_CUSTOM_OPENAI_API, getMaxChunkChars, + TRANSLATION_CHUNK_MAX_CHARS, + TRANSLATION_MIN_CHUNK_MAX_CHARS, + TRANSLATION_COMPLETENESS_MAX_RETRIES, } from "../constants.js"; // Load environment variables @@ -473,6 +476,101 @@ function isPlaceholderIntegrityError( ); } +type MarkdownStructureMetrics = { + headingCount: number; + fencedCodeBlockCount: number; + bulletListCount: number; + numberedListCount: number; + tableLineCount: number; + contentLength: number; +}; + +function collectMarkdownStructureMetrics( + markdown: string +): MarkdownStructureMetrics { + const headingMatches = markdown.match(/^#{1,6}\s.+$/gm) ?? []; + const fencedCodeMatches = markdown.match(/^(`{3,}|~{3,})/gm) ?? 
[]; + const bulletListMatches = markdown.match(/^\s*[-*+]\s+/gm) ?? []; + const numberedListMatches = markdown.match(/^\s*\d+\.\s+/gm) ?? []; + const tableLineMatches = markdown.match(/^\|.*\|\s*$/gm) ?? []; + + return { + headingCount: headingMatches.length, + fencedCodeBlockCount: Math.floor(fencedCodeMatches.length / 2), + bulletListCount: bulletListMatches.length, + numberedListCount: numberedListMatches.length, + tableLineCount: tableLineMatches.length, + contentLength: markdown.trim().length, + }; +} + +function isSuspiciouslyIncompleteTranslation( + sourceMarkdown: string, + translatedMarkdown: string +): boolean { + const sourceMetrics = collectMarkdownStructureMetrics(sourceMarkdown); + const translatedMetrics = collectMarkdownStructureMetrics(translatedMarkdown); + + if (sourceMetrics.contentLength === 0) { + return false; + } + + const lengthRatio = + translatedMetrics.contentLength / Math.max(sourceMetrics.contentLength, 1); + const headingLoss = + sourceMetrics.headingCount > 0 && + translatedMetrics.headingCount < sourceMetrics.headingCount; + const fencedBlockLoss = + sourceMetrics.fencedCodeBlockCount > 0 && + translatedMetrics.fencedCodeBlockCount < sourceMetrics.fencedCodeBlockCount; + const bulletListLoss = + sourceMetrics.bulletListCount >= 3 && + translatedMetrics.bulletListCount === 0; + const numberedListLoss = + sourceMetrics.numberedListCount >= 3 && + translatedMetrics.numberedListCount === 0; + const tableLoss = + sourceMetrics.tableLineCount >= 2 && translatedMetrics.tableLineCount === 0; + const severeLengthShrinkage = + sourceMetrics.contentLength >= 4_000 && lengthRatio < 0.55; + + return ( + headingLoss || + fencedBlockLoss || + bulletListLoss || + numberedListLoss || + tableLoss || + severeLengthShrinkage + ); +} + +function getProactiveChunkCharLimit(modelName: string): number { + return Math.min(getMaxChunkChars(modelName), TRANSLATION_CHUNK_MAX_CHARS); +} + +function getChunkContentBudget(totalChunkLimit: number, title: string): 
number { + const minimumBudget = Math.min( + totalChunkLimit, + TRANSLATION_MIN_CHUNK_MAX_CHARS + ); + + return Math.max( + totalChunkLimit - TRANSLATION_PROMPT.length - title.length - 20, + minimumBudget + ); +} + +function splitMarkdownForTranslation( + markdown: string, + title: string, + totalChunkLimit: number +): string[] { + return splitMarkdownIntoChunks( + markdown, + getChunkContentBudget(totalChunkLimit, title) + ); +} + /** * Translates a markdown file using OpenAI * @param filePath Path to the markdown file to translate @@ -622,7 +720,8 @@ async function translateChunkWithOverflowFallback( text: string, title: string, targetLanguage: string, - placeholderGuardAttempt = 0 + placeholderGuardAttempt = 0, + chunkBudgetForRetry = getProactiveChunkCharLimit(model) ): Promise<{ markdown: string; title: string }> { const requiredPlaceholders = extractDataUrlPlaceholders(text); @@ -655,8 +754,11 @@ async function translateChunkWithOverflowFallback( throw err; } - const splitTarget = Math.max(Math.floor(text.length / 2), 1); - let subChunks = splitMarkdownIntoChunks(text, splitTarget); + const splitTarget = Math.max( + Math.floor(Math.min(text.length, chunkBudgetForRetry) / 2), + TRANSLATION_MIN_CHUNK_MAX_CHARS + ); + let subChunks = splitMarkdownForTranslation(text, title, splitTarget); if (subChunks.length <= 1) { const midpoint = Math.floor(text.length / 2); if (midpoint < 1 || midpoint >= text.length) { @@ -688,6 +790,11 @@ async function translateChunkWithOverflowFallback( } } +type TranslateTextOptions = { + chunkLimit?: number; + completenessRetryDepth?: number; +}; + /** * Translates text using OpenAI * @param text Text to translate @@ -698,7 +805,8 @@ async function translateChunkWithOverflowFallback( export async function translateText( text: string, title: string, - targetLanguage: string + targetLanguage: string, + options: TranslateTextOptions = {} ): Promise<{ markdown: string; title: string }> { const safeText = typeof text === "string" && 
text.length > 0 @@ -706,59 +814,119 @@ export async function translateText( : "# Empty Content\n\nThis page has no content to translate."; const { maskedText, placeholders } = maskDataUrlImages(safeText); - // Get model-specific chunk size - const maxChunkChars = getMaxChunkChars(model); + const effectiveChunkLimit = + options.chunkLimit ?? getProactiveChunkCharLimit(model); + const completenessRetryDepth = options.completenessRetryDepth ?? 0; + + const translateAndValidate = async ( + sourceMarkdown: string, + translatedChunk: Promise<{ markdown: string; title: string }> + ) => { + const translated = await translatedChunk; + if ( + isSuspiciouslyIncompleteTranslation(sourceMarkdown, translated.markdown) + ) { + throw new TranslationError( + "Translated markdown appears incomplete compared to source structure", + "unexpected_error", + false + ); + } + return translated; + }; // Include system prompt overhead (~1800 chars) + title prefix + "markdown: " prefix const estimatedTotalChars = TRANSLATION_PROMPT.length + title.length + 20 + maskedText.length; - if (estimatedTotalChars <= maxChunkChars) { - // Fast path: content fits in a single call - const translated = await translateChunkWithOverflowFallback( + try { + if (estimatedTotalChars <= effectiveChunkLimit) { + // Fast path: content fits in a single call + const translated = await translateAndValidate( + maskedText, + translateChunkWithOverflowFallback( + maskedText, + title, + targetLanguage, + 0, + effectiveChunkLimit + ) + ); + return { + markdown: restoreDataUrlPlaceholders(translated.markdown, placeholders), + title: restoreDataUrlPlaceholders(translated.title, placeholders), + }; + } + + // Slow path: content too large — split into chunks + const chunks = splitMarkdownForTranslation( maskedText, title, - targetLanguage + effectiveChunkLimit ); - return { - markdown: restoreDataUrlPlaceholders(translated.markdown, placeholders), - title: restoreDataUrlPlaceholders(translated.title, placeholders), - }; - } - 
// Slow path: content too large — split into chunks - const contentBudget = - maxChunkChars - TRANSLATION_PROMPT.length - title.length - 20; - const chunks = splitMarkdownIntoChunks( - maskedText, - Math.max(contentBudget, 50_000) - ); + let translatedTitle = title; + const translatedChunks: string[] = []; - let translatedTitle = title; - const translatedChunks: string[] = []; + for (const [i, chunk] of chunks.entries()) { + const chunkTitle = i === 0 ? title : ""; + const result = await translateAndValidate( + chunk, + translateChunkWithOverflowFallback( + chunk, + chunkTitle, + targetLanguage, + 0, + effectiveChunkLimit + ) + ); - for (const [i, chunk] of chunks.entries()) { - const chunkTitle = i === 0 ? title : ""; - const result = await translateChunkWithOverflowFallback( - chunk, - chunkTitle, - targetLanguage - ); + if (i === 0) { + translatedTitle = result.title; + } + translatedChunks.push(result.markdown); + } - if (i === 0) { - translatedTitle = result.title; + const joinedMarkdown = translatedChunks.join(""); + if (isSuspiciouslyIncompleteTranslation(maskedText, joinedMarkdown)) { + throw new TranslationError( + "Translated markdown appears incomplete after chunk reassembly", + "unexpected_error", + false + ); } - translatedChunks.push(result.markdown); - } - // Sections already end with "\n"; join with "" to avoid extra blank lines - return { - markdown: restoreDataUrlPlaceholders( - translatedChunks.join(""), - placeholders - ), - title: restoreDataUrlPlaceholders(translatedTitle, placeholders), - }; + // Sections already end with "\n"; join with "" to avoid extra blank lines + return { + markdown: restoreDataUrlPlaceholders(joinedMarkdown, placeholders), + title: restoreDataUrlPlaceholders(translatedTitle, placeholders), + }; + } catch (error) { + const isRecoverableCompletenessFailure = + error instanceof TranslationError && + error.code === "unexpected_error" && + error.isCritical === false && + /incomplete/.test(error.message); + + if ( + 
isRecoverableCompletenessFailure && + completenessRetryDepth < TRANSLATION_COMPLETENESS_MAX_RETRIES + ) { + const nextChunkLimit = Math.max( + Math.floor(effectiveChunkLimit / 2), + TRANSLATION_MIN_CHUNK_MAX_CHARS + ); + + if (nextChunkLimit < effectiveChunkLimit) { + return translateText(text, title, targetLanguage, { + chunkLimit: nextChunkLimit, + completenessRetryDepth: completenessRetryDepth + 1, + }); + } + } + + throw error; + } } /** From fd3b4d24b7a389de1c807c8ca37f5bcd4bbb389b Mon Sep 17 00:00:00 2001 From: luandro Date: Thu, 19 Mar 2026 17:29:12 -0300 Subject: [PATCH 02/18] =?UTF-8?q?fix(translate):=20address=20Codex=20revie?= =?UTF-8?q?w=20=E2=80=94=20reachable=208k=20floor=20and=20correct=20overhe?= =?UTF-8?q?ad?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P1: TRANSLATION_COMPLETENESS_MAX_RETRIES was 2, so halving from 120k only reached 60k → 30k before giving up. Reaching the 8k floor requires 4 halvings (120k→60k→30k→15k→8k), so raise the constant to 4. P2: getChunkContentBudget was flooring the *content* budget at TRANSLATION_MIN_CHUNK_MAX_CHARS (8k), ignoring prompt overhead (~2.6k). This made the actual request larger than the documented 8k minimum. Fix: subtract overhead from the total limit and floor the content budget at 1; the 8k total-request floor is already enforced by the retry caller. Update the "preserves heading structures" test to use a chunkLimit that reflects a realistic total-request budget (3_200 chars) rather than a raw content size (500 chars), which the old incorrect floor had masked. 
--- scripts/constants.ts | 10 ++++++++-- .../notion-translate/translateFrontMatter.test.ts | 6 +++++- scripts/notion-translate/translateFrontMatter.ts | 14 +++++--------- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/scripts/constants.ts b/scripts/constants.ts index bc64be72..33c40e3e 100644 --- a/scripts/constants.ts +++ b/scripts/constants.ts @@ -190,8 +190,14 @@ export const TRANSLATION_RETRY_BASE_DELAY_MS = 750; export const TRANSLATION_CHUNK_MAX_CHARS = 120_000; /** Smallest total-budget chunk size used when retrying incomplete translations. */ export const TRANSLATION_MIN_CHUNK_MAX_CHARS = 8_000; -/** Maximum times to retry with smaller chunks after completeness checks fail. */ -export const TRANSLATION_COMPLETENESS_MAX_RETRIES = 2; +/** + * Maximum times to retry with smaller chunks after completeness checks fail. + * Each retry halves the chunk limit. Starting from 120 K chars: + * 120k → 60k → 30k → 15k → 8k (floor) + * Four halvings are needed to descend from the default cap to the 8k floor, + * so this must be at least 4. + */ +export const TRANSLATION_COMPLETENESS_MAX_RETRIES = 4; // URL handling export const INVALID_URL_PLACEHOLDER = diff --git a/scripts/notion-translate/translateFrontMatter.test.ts b/scripts/notion-translate/translateFrontMatter.test.ts index 25922e2a..bd116e54 100644 --- a/scripts/notion-translate/translateFrontMatter.test.ts +++ b/scripts/notion-translate/translateFrontMatter.test.ts @@ -276,8 +276,12 @@ describe("notion-translate translateFrontMatter", () => { "\n\n# Section Three\n\n" + "Gamma ".repeat(60); + // chunkLimit is the *total* request budget (prompt overhead + markdown). + // Prompt overhead is ~2.6 K chars; a 3_200 limit leaves ~587 chars of + // markdown per chunk, which fits one 375-char section but not two — so + // the three sections produce exactly three API calls. 
const result = await translateText(source, "Original Title", "pt-BR", { - chunkLimit: 500, + chunkLimit: 3_200, }); expect(mockOpenAIChatCompletionCreate).toHaveBeenCalledTimes(3); diff --git a/scripts/notion-translate/translateFrontMatter.ts b/scripts/notion-translate/translateFrontMatter.ts index aa462d2f..abcb357f 100644 --- a/scripts/notion-translate/translateFrontMatter.ts +++ b/scripts/notion-translate/translateFrontMatter.ts @@ -549,15 +549,11 @@ function getProactiveChunkCharLimit(modelName: string): number { } function getChunkContentBudget(totalChunkLimit: number, title: string): number { - const minimumBudget = Math.min( - totalChunkLimit, - TRANSLATION_MIN_CHUNK_MAX_CHARS - ); - - return Math.max( - totalChunkLimit - TRANSLATION_PROMPT.length - title.length - 20, - minimumBudget - ); + // Subtract prompt overhead so the *total* request stays within totalChunkLimit. + // The minimum content budget is 1; the retry-level floor (TRANSLATION_MIN_CHUNK_MAX_CHARS) + // is enforced as a total-request budget by the caller, not as a markdown payload floor. + const overhead = TRANSLATION_PROMPT.length + title.length + 20; + return Math.max(totalChunkLimit - overhead, 1); } function splitMarkdownForTranslation( From c36dfe9b160b527fd1cccc203adef69620d31426 Mon Sep 17 00:00:00 2001 From: luandro Date: Thu, 19 Mar 2026 17:38:44 -0300 Subject: [PATCH 03/18] fix(translate): count setext headings in structure metrics collectMarkdownStructureMetrics only matched ATX headings (# Heading). CommonMark/Docusaurus also accept setext headings (underline with === or ---). If the model reformats a heading into setext style the count would drop and translateText would incorrectly treat the translation as incomplete. Add a multiline regex for setext headings and include them in headingCount. 
--- scripts/notion-translate/translateFrontMatter.ts | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/notion-translate/translateFrontMatter.ts b/scripts/notion-translate/translateFrontMatter.ts index abcb357f..2f3d59b9 100644 --- a/scripts/notion-translate/translateFrontMatter.ts +++ b/scripts/notion-translate/translateFrontMatter.ts @@ -488,14 +488,17 @@ type MarkdownStructureMetrics = { function collectMarkdownStructureMetrics( markdown: string ): MarkdownStructureMetrics { - const headingMatches = markdown.match(/^#{1,6}\s.+$/gm) ?? []; + // ATX headings: "# Heading" + const atxHeadingMatches = markdown.match(/^#{1,6}\s.+$/gm) ?? []; + // Setext headings: a non-empty line followed by a "===" or "---" underline + const setextHeadingMatches = markdown.match(/^.+\n[=\-]{2,}\s*$/gm) ?? []; const fencedCodeMatches = markdown.match(/^(`{3,}|~{3,})/gm) ?? []; const bulletListMatches = markdown.match(/^\s*[-*+]\s+/gm) ?? []; const numberedListMatches = markdown.match(/^\s*\d+\.\s+/gm) ?? []; const tableLineMatches = markdown.match(/^\|.*\|\s*$/gm) ?? []; return { - headingCount: headingMatches.length, + headingCount: atxHeadingMatches.length + setextHeadingMatches.length, fencedCodeBlockCount: Math.floor(fencedCodeMatches.length / 2), bulletListCount: bulletListMatches.length, numberedListCount: numberedListMatches.length, From 4459a152a07320fc423eaf2298f91b80f64b2e9c Mon Sep 17 00:00:00 2001 From: luandro Date: Thu, 19 Mar 2026 17:46:19 -0300 Subject: [PATCH 04/18] fix(translate): restrict setext heading detection to H1 (===) only The previous regex /^.+\n[=\-]{2,}\s*/gm matched any non-empty line followed by --- or ===, which also matches list items before thematic breaks (e.g. "- Item\n---"). This caused isSuspiciouslyIncompleteTranslation to count spurious headings in the source and falsely flag complete translations as incomplete. Fix: only match === underlines (setext H1). 
The = character has no other CommonMark meaning, so this is unambiguous. Setext H2 (--- underline) is skipped because it cannot be distinguished from a thematic break without a full parser. Notion content uses ATX headings exclusively anyway. --- scripts/notion-translate/translateFrontMatter.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/notion-translate/translateFrontMatter.ts b/scripts/notion-translate/translateFrontMatter.ts index 2f3d59b9..ce5c74de 100644 --- a/scripts/notion-translate/translateFrontMatter.ts +++ b/scripts/notion-translate/translateFrontMatter.ts @@ -490,8 +490,10 @@ function collectMarkdownStructureMetrics( ): MarkdownStructureMetrics { // ATX headings: "# Heading" const atxHeadingMatches = markdown.match(/^#{1,6}\s.+$/gm) ?? []; - // Setext headings: a non-empty line followed by a "===" or "---" underline - const setextHeadingMatches = markdown.match(/^.+\n[=\-]{2,}\s*$/gm) ?? []; + // Setext H1 headings only ("===" underline): unambiguous because "=" has no + // other CommonMark meaning. We deliberately skip "---" underlines (setext H2) + // since they are indistinguishable from thematic breaks without a full parser. + const setextHeadingMatches = markdown.match(/^.+\n=+\s*$/gm) ?? []; const fencedCodeMatches = markdown.match(/^(`{3,}|~{3,})/gm) ?? []; const bulletListMatches = markdown.match(/^\s*[-*+]\s+/gm) ?? []; const numberedListMatches = markdown.match(/^\s*\d+\.\s+/gm) ?? 
[]; From ecdf12235ac69091c6f4d0365224d6f70ee45990 Mon Sep 17 00:00:00 2001 From: luandro Date: Thu, 19 Mar 2026 17:57:02 -0300 Subject: [PATCH 05/18] fix(translate): add setext H2 and admonition tracking to completeness check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Setext H2 headings (Heading\n---): re-introduce detection with a negative lookahead that excludes lines starting with list markers or block-level prefixes, which avoids the thematic-break false-positive while still catching genuine section headings. Admonitions (:::type … :::): Docusaurus callout blocks can be silently dropped by the model without triggering any of the existing checks. Count opening+closing ::: pairs (like fenced code blocks) and treat a drop in admonitionCount as an incompleteness signal. --- .../notion-translate/translateFrontMatter.ts | 28 +++++++++++++++---- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/scripts/notion-translate/translateFrontMatter.ts b/scripts/notion-translate/translateFrontMatter.ts index ce5c74de..9969d25f 100644 --- a/scripts/notion-translate/translateFrontMatter.ts +++ b/scripts/notion-translate/translateFrontMatter.ts @@ -479,6 +479,7 @@ function isPlaceholderIntegrityError( type MarkdownStructureMetrics = { headingCount: number; fencedCodeBlockCount: number; + admonitionCount: number; bulletListCount: number; numberedListCount: number; tableLineCount: number; @@ -490,18 +491,31 @@ function collectMarkdownStructureMetrics( ): MarkdownStructureMetrics { // ATX headings: "# Heading" const atxHeadingMatches = markdown.match(/^#{1,6}\s.+$/gm) ?? []; - // Setext H1 headings only ("===" underline): unambiguous because "=" has no - // other CommonMark meaning. We deliberately skip "---" underlines (setext H2) - // since they are indistinguishable from thematic breaks without a full parser. - const setextHeadingMatches = markdown.match(/^.+\n=+\s*$/gm) ?? 
[]; + // Setext H1 headings ("===" underline): unambiguous — "=" has no other + // CommonMark meaning, so these can never be confused with thematic breaks. + const setextH1Matches = markdown.match(/^.+\n=+\s*$/gm) ?? []; + // Setext H2 headings ("---" underline): a thematic break uses the same + // syntax, but only when the preceding line is a block-level marker (list + // item, blockquote, ATX heading, etc.). A setext H2 content line is a + // plain paragraph — so we exclude lines starting with list/block markers. + const setextH2Matches = + markdown.match(/^(?![ \t]*(?:[-*+]|\d+\.)\s|[ \t]*[>#]).+\n-{2,}\s*$/gm) ?? + []; + // Fenced code blocks (backtick or tilde, opening + closing = pairs) const fencedCodeMatches = markdown.match(/^(`{3,}|~{3,})/gm) ?? []; + // Docusaurus / MDX admonition markers (:::type … :::) + const admonitionMatches = markdown.match(/^:::/gm) ?? []; const bulletListMatches = markdown.match(/^\s*[-*+]\s+/gm) ?? []; const numberedListMatches = markdown.match(/^\s*\d+\.\s+/gm) ?? []; const tableLineMatches = markdown.match(/^\|.*\|\s*$/gm) ?? 
[]; return { - headingCount: atxHeadingMatches.length + setextHeadingMatches.length, + headingCount: + atxHeadingMatches.length + + setextH1Matches.length + + setextH2Matches.length, fencedCodeBlockCount: Math.floor(fencedCodeMatches.length / 2), + admonitionCount: Math.floor(admonitionMatches.length / 2), bulletListCount: bulletListMatches.length, numberedListCount: numberedListMatches.length, tableLineCount: tableLineMatches.length, @@ -528,6 +542,9 @@ function isSuspiciouslyIncompleteTranslation( const fencedBlockLoss = sourceMetrics.fencedCodeBlockCount > 0 && translatedMetrics.fencedCodeBlockCount < sourceMetrics.fencedCodeBlockCount; + const admonitionLoss = + sourceMetrics.admonitionCount > 0 && + translatedMetrics.admonitionCount < sourceMetrics.admonitionCount; const bulletListLoss = sourceMetrics.bulletListCount >= 3 && translatedMetrics.bulletListCount === 0; @@ -542,6 +559,7 @@ function isSuspiciouslyIncompleteTranslation( return ( headingLoss || fencedBlockLoss || + admonitionLoss || bulletListLoss || numberedListLoss || tableLoss || From c67c0472457ad74fd778a27768f488143c2927eb Mon Sep 17 00:00:00 2001 From: luandro Date: Thu, 19 Mar 2026 18:08:55 -0300 Subject: [PATCH 06/18] fix(translate): strip fenced content before metrics and fix table detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P1 — table detection: the previous regex only matched GFM table rows with outer pipes (| A | B |). Models sometimes emit pipeless form (A | B | C). Switch to matching GFM table *separator* rows instead — these are the unambiguous per-spec indicator of a table and work regardless of outer-pipe style. Threshold lowered to 1 separator (from 2 data lines). Regex uses a simple character-class + .filter() to avoid ReDoS-unsafe nested quantifiers. P2 — fenced code content: structural markers inside fenced code blocks (headings, list items, table rows, admonitions) were counted as real document structure. 
Strip fenced block interiors before running all regex checks so that code samples do not inflate source counts and cause false-positive incompleteness failures. --- .../notion-translate/translateFrontMatter.ts | 70 +++++++++++++++---- 1 file changed, 58 insertions(+), 12 deletions(-) diff --git a/scripts/notion-translate/translateFrontMatter.ts b/scripts/notion-translate/translateFrontMatter.ts index 9969d25f..8f7345c3 100644 --- a/scripts/notion-translate/translateFrontMatter.ts +++ b/scripts/notion-translate/translateFrontMatter.ts @@ -482,32 +482,78 @@ type MarkdownStructureMetrics = { admonitionCount: number; bulletListCount: number; numberedListCount: number; - tableLineCount: number; + tableCount: number; contentLength: number; }; +/** + * Returns a copy of the markdown with the *content* of fenced code blocks + * removed (the opening/closing fence markers are kept so that fenced block + * counts remain accurate). This prevents structural markers inside code + * samples — headings, list items, table rows, etc. — from inflating counts. + */ +function stripFencedCodeContent(markdown: string): string { + const lines = markdown.split("\n"); + const result: string[] = []; + let inFence = false; + let fenceMarker = ""; + + for (const line of lines) { + if (!inFence) { + const match = line.match(/^(`{3,}|~{3,})/); + if (match) { + inFence = true; + fenceMarker = match[1]; + result.push(line); // keep opening marker + } else { + result.push(line); + } + } else { + if (line.startsWith(fenceMarker)) { + inFence = false; + fenceMarker = ""; + result.push(line); // keep closing marker + } + // drop content lines inside the fence + } + } + + return result.join("\n"); +} + function collectMarkdownStructureMetrics( markdown: string ): MarkdownStructureMetrics { + // Fenced code blocks must be counted on raw markdown (before stripping). + const fencedCodeMatches = markdown.match(/^(`{3,}|~{3,})/gm) ?? 
[]; + + // All other structural markers are measured on the stripped version so that + // examples inside code blocks do not inflate the counts. + const stripped = stripFencedCodeContent(markdown); + // ATX headings: "# Heading" - const atxHeadingMatches = markdown.match(/^#{1,6}\s.+$/gm) ?? []; + const atxHeadingMatches = stripped.match(/^#{1,6}\s.+$/gm) ?? []; // Setext H1 headings ("===" underline): unambiguous — "=" has no other // CommonMark meaning, so these can never be confused with thematic breaks. - const setextH1Matches = markdown.match(/^.+\n=+\s*$/gm) ?? []; + const setextH1Matches = stripped.match(/^.+\n=+\s*$/gm) ?? []; // Setext H2 headings ("---" underline): a thematic break uses the same // syntax, but only when the preceding line is a block-level marker (list // item, blockquote, ATX heading, etc.). A setext H2 content line is a // plain paragraph — so we exclude lines starting with list/block markers. const setextH2Matches = - markdown.match(/^(?![ \t]*(?:[-*+]|\d+\.)\s|[ \t]*[>#]).+\n-{2,}\s*$/gm) ?? + stripped.match(/^(?![ \t]*(?:[-*+]|\d+\.)\s|[ \t]*[>#]).+\n-{2,}\s*$/gm) ?? []; - // Fenced code blocks (backtick or tilde, opening + closing = pairs) - const fencedCodeMatches = markdown.match(/^(`{3,}|~{3,})/gm) ?? []; // Docusaurus / MDX admonition markers (:::type … :::) - const admonitionMatches = markdown.match(/^:::/gm) ?? []; - const bulletListMatches = markdown.match(/^\s*[-*+]\s+/gm) ?? []; - const numberedListMatches = markdown.match(/^\s*\d+\.\s+/gm) ?? []; - const tableLineMatches = markdown.match(/^\|.*\|\s*$/gm) ?? []; + const admonitionMatches = stripped.match(/^:::/gm) ?? []; + const bulletListMatches = stripped.match(/^\s*[-*+]\s+/gm) ?? []; + const numberedListMatches = stripped.match(/^\s*\d+\.\s+/gm) ?? []; + // GFM table separator rows (---|---|---) are the unambiguous indicator of a + // table and work regardless of whether the model uses outer pipes or not. 
+ // A separator line contains only "-", ":", "|", space, and tab characters, + // and must include both a "|" (distinguishes from thematic break) and a "-". + const tableMatches = (stripped.match(/^[ \t:|-]+\s*$/gm) ?? []).filter( + (line) => line.includes("|") && line.includes("-") + ); return { headingCount: @@ -518,7 +564,7 @@ function collectMarkdownStructureMetrics( admonitionCount: Math.floor(admonitionMatches.length / 2), bulletListCount: bulletListMatches.length, numberedListCount: numberedListMatches.length, - tableLineCount: tableLineMatches.length, + tableCount: tableMatches.length, contentLength: markdown.trim().length, }; } @@ -552,7 +598,7 @@ function isSuspiciouslyIncompleteTranslation( sourceMetrics.numberedListCount >= 3 && translatedMetrics.numberedListCount === 0; const tableLoss = - sourceMetrics.tableLineCount >= 2 && translatedMetrics.tableLineCount === 0; + sourceMetrics.tableCount >= 1 && translatedMetrics.tableCount === 0; const severeLengthShrinkage = sourceMetrics.contentLength >= 4_000 && lengthRatio < 0.55; From ece8b76c0474e354b7815ec6bc00692120239235 Mon Sep 17 00:00:00 2001 From: luandro Date: Thu, 19 Mar 2026 19:02:07 -0300 Subject: [PATCH 07/18] fix(scripts): resolve typescript compilation and markdown parsing bugs Fixes a TypeScript 'includes does not exist on type never' error and allows up to 3 spaces of optional indentation for fenced code blocks in stripFencedCodeContent. 
Co-authored-by: Junie --- scripts/notion-translate/translateFrontMatter.ts | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/notion-translate/translateFrontMatter.ts b/scripts/notion-translate/translateFrontMatter.ts index 8f7345c3..4b9ecf2d 100644 --- a/scripts/notion-translate/translateFrontMatter.ts +++ b/scripts/notion-translate/translateFrontMatter.ts @@ -500,7 +500,7 @@ function stripFencedCodeContent(markdown: string): string { for (const line of lines) { if (!inFence) { - const match = line.match(/^(`{3,}|~{3,})/); + const match = line.match(/^[ \t]{0,3}(`{3,}|~{3,})/); if (match) { inFence = true; fenceMarker = match[1]; @@ -509,7 +509,7 @@ function stripFencedCodeContent(markdown: string): string { result.push(line); } } else { - if (line.startsWith(fenceMarker)) { + if (line.trimStart().startsWith(fenceMarker)) { inFence = false; fenceMarker = ""; result.push(line); // keep closing marker @@ -551,9 +551,9 @@ function collectMarkdownStructureMetrics( // table and work regardless of whether the model uses outer pipes or not. // A separator line contains only "-", ":", "|", space, and tab characters, // and must include both a "|" (distinguishes from thematic break) and a "-". - const tableMatches = (stripped.match(/^[ \t:|-]+\s*$/gm) ?? []).filter( - (line) => line.includes("|") && line.includes("-") - ); + const tableMatches = ( + (stripped.match(/^[ \t:|-]+\s*$/gm) ?? []) as string[] + ).filter((line) => line.includes("|") && line.includes("-")); return { headingCount: From 0c418f17a2d67788b565a5561229c11f8f31a61a Mon Sep 17 00:00:00 2001 From: luandro Date: Thu, 19 Mar 2026 21:49:50 -0300 Subject: [PATCH 08/18] fix(translate): exclude YAML frontmatter from structure metrics Bullet lists inside YAML frontmatter (e.g. keywords lists) were being counted as structural elements, causing false-positive incomplete translation detections when the model reformatted them as inline arrays. 
Strip frontmatter before collecting structure metrics so that only translatable content body is evaluated. --- .../translateFrontMatter.test.ts | 41 +++++++++++++++++++ .../notion-translate/translateFrontMatter.ts | 23 +++++++++-- 2 files changed, 61 insertions(+), 3 deletions(-) diff --git a/scripts/notion-translate/translateFrontMatter.test.ts b/scripts/notion-translate/translateFrontMatter.test.ts index bd116e54..11bef903 100644 --- a/scripts/notion-translate/translateFrontMatter.test.ts +++ b/scripts/notion-translate/translateFrontMatter.test.ts @@ -209,6 +209,47 @@ describe("notion-translate translateFrontMatter", () => { expect(mockOpenAIChatCompletionCreate.mock.calls.length).toBeGreaterThan(1); }); + it("does not count bullet lists inside YAML frontmatter towards structure validation", async () => { + const { translateText } = await import("./translateFrontMatter"); + + const source = + "---\n" + + "title: Page\n" + + "keywords:\n" + + " - one\n" + + " - two\n" + + " - three\n" + + " - four\n" + + "---\n\n" + + "# Section One\n\n" + + "Body paragraph."; + + // The translated version turns the keywords list into an inline array + mockOpenAIChatCompletionCreate.mockResolvedValueOnce({ + choices: [ + { + message: { + content: JSON.stringify({ + markdown: + "---\n" + + "title: Page\n" + + "keywords: [one, two, three, four]\n" + + "---\n\n" + + "# Seção Um\n\n" + + "Parágrafo do corpo.", + title: "Página", + }), + }, + }, + ], + }); + + const result = await translateText(source, "Original Title", "pt-BR"); + + expect(mockOpenAIChatCompletionCreate).toHaveBeenCalledTimes(1); + expect(result.markdown).toContain("Seção Um"); + }); + it("treats heavy structural shrinkage as incomplete long-form translation", async () => { const { translateText } = await import("./translateFrontMatter"); diff --git a/scripts/notion-translate/translateFrontMatter.ts b/scripts/notion-translate/translateFrontMatter.ts index 4b9ecf2d..f7080a73 100644 --- 
a/scripts/notion-translate/translateFrontMatter.ts +++ b/scripts/notion-translate/translateFrontMatter.ts @@ -521,15 +521,32 @@ function stripFencedCodeContent(markdown: string): string { return result.join("\n"); } +function stripYamlFrontmatter(markdown: string): string { + if (markdown.startsWith("---\n") || markdown.startsWith("---\r\n")) { + const endFrontmatterIndex = markdown.indexOf("\n---", 3); + if (endFrontmatterIndex !== -1) { + const endOfLineIndex = markdown.indexOf("\n", endFrontmatterIndex + 1); + if (endOfLineIndex !== -1) { + return markdown.substring(endOfLineIndex + 1); + } + return ""; + } + } + return markdown; +} + function collectMarkdownStructureMetrics( markdown: string ): MarkdownStructureMetrics { + // Remove frontmatter before stripping fenced code content + const withoutFrontmatter = stripYamlFrontmatter(markdown); + // Fenced code blocks must be counted on raw markdown (before stripping). - const fencedCodeMatches = markdown.match(/^(`{3,}|~{3,})/gm) ?? []; + const fencedCodeMatches = withoutFrontmatter.match(/^(`{3,}|~{3,})/gm) ?? []; // All other structural markers are measured on the stripped version so that // examples inside code blocks do not inflate the counts. - const stripped = stripFencedCodeContent(markdown); + const stripped = stripFencedCodeContent(withoutFrontmatter); // ATX headings: "# Heading" const atxHeadingMatches = stripped.match(/^#{1,6}\s.+$/gm) ?? 
[]; @@ -565,7 +582,7 @@ function collectMarkdownStructureMetrics( bulletListCount: bulletListMatches.length, numberedListCount: numberedListMatches.length, tableCount: tableMatches.length, - contentLength: markdown.trim().length, + contentLength: withoutFrontmatter.trim().length, }; } From d1b5ff28f2732a771f0205949ff1529d18e7d030 Mon Sep 17 00:00:00 2001 From: luandro Date: Thu, 19 Mar 2026 22:01:46 -0300 Subject: [PATCH 09/18] fix(translate): tolerate one missing heading and restore unclosed fence content - Relax heading-loss threshold from strict (< source) to (< source - 1) so a single reformatted heading no longer triggers a spurious retry - Buffer lines inside unclosed fenced code blocks and restore them on EOF instead of silently dropping remaining content, preventing false positive completeness failures on malformed markdown - Update retries/failure tests to use 4-section documents that still demonstrate retry behaviour under the new heading tolerance --- .../notion-translate/translateFrontMatter.test.ts | 14 +++++++++----- scripts/notion-translate/translateFrontMatter.ts | 13 +++++++++++-- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/scripts/notion-translate/translateFrontMatter.test.ts b/scripts/notion-translate/translateFrontMatter.test.ts index 11bef903..db0d8148 100644 --- a/scripts/notion-translate/translateFrontMatter.test.ts +++ b/scripts/notion-translate/translateFrontMatter.test.ts @@ -132,7 +132,9 @@ describe("notion-translate translateFrontMatter", () => { "# Section Two\n\n" + "Beta paragraph.\n\n" + "# Section Three\n\n" + - "Gamma paragraph."; + "Gamma paragraph.\n\n" + + "# Section Four\n\n" + + "Delta paragraph."; mockOpenAIChatCompletionCreate .mockResolvedValueOnce({ @@ -141,7 +143,7 @@ describe("notion-translate translateFrontMatter", () => { message: { content: JSON.stringify({ markdown: - "# Seção Um\n\nParágrafo alfa.\n\n# Seção Três\n\nParágrafo gama.", + "# Seção Um\n\nParágrafo alfa.\n\n# Seção Quatro\n\nParágrafo 
delta.", title: "Título Traduzido", }), }, @@ -154,7 +156,7 @@ describe("notion-translate translateFrontMatter", () => { message: { content: JSON.stringify({ markdown: - "# Seção Um\n\nParágrafo alfa.\n\n# Seção Dois\n\nParágrafo beta.\n\n# Seção Três\n\nParágrafo gama.", + "# Seção Um\n\nParágrafo alfa.\n\n# Seção Dois\n\nParágrafo beta.\n\n# Seção Três\n\nParágrafo gama.\n\n# Seção Quatro\n\nParágrafo delta.", title: "Título Traduzido", }), }, @@ -180,7 +182,9 @@ describe("notion-translate translateFrontMatter", () => { "# Section Two\n\n" + "Beta paragraph.\n\n" + "# Section Three\n\n" + - "Gamma paragraph."; + "Gamma paragraph.\n\n" + + "# Section Four\n\n" + + "Delta paragraph."; mockOpenAIChatCompletionCreate.mockImplementation(async () => ({ choices: [ @@ -188,7 +192,7 @@ describe("notion-translate translateFrontMatter", () => { message: { content: JSON.stringify({ markdown: - "# Seção Um\n\nParágrafo alfa.\n\n# Seção Três\n\nParágrafo gama.", + "# Seção Um\n\nParágrafo alfa.\n\n# Seção Quatro\n\nParágrafo delta.", title: "Título Traduzido", }), }, diff --git a/scripts/notion-translate/translateFrontMatter.ts b/scripts/notion-translate/translateFrontMatter.ts index f7080a73..b0f1692e 100644 --- a/scripts/notion-translate/translateFrontMatter.ts +++ b/scripts/notion-translate/translateFrontMatter.ts @@ -497,6 +497,7 @@ function stripFencedCodeContent(markdown: string): string { const result: string[] = []; let inFence = false; let fenceMarker = ""; + let fenceBuffer: string[] = []; for (const line of lines) { if (!inFence) { @@ -505,6 +506,7 @@ function stripFencedCodeContent(markdown: string): string { inFence = true; fenceMarker = match[1]; result.push(line); // keep opening marker + fenceBuffer = []; } else { result.push(line); } @@ -513,11 +515,18 @@ function stripFencedCodeContent(markdown: string): string { inFence = false; fenceMarker = ""; result.push(line); // keep closing marker + fenceBuffer = []; + } else { + fenceBuffer.push(line); } - // drop 
content lines inside the fence } } + // Failsafe: restore lines if the block was never closed + if (inFence && fenceBuffer.length > 0) { + result.push(...fenceBuffer); + } + return result.join("\n"); } @@ -601,7 +610,7 @@ function isSuspiciouslyIncompleteTranslation( translatedMetrics.contentLength / Math.max(sourceMetrics.contentLength, 1); const headingLoss = sourceMetrics.headingCount > 0 && - translatedMetrics.headingCount < sourceMetrics.headingCount; + translatedMetrics.headingCount < sourceMetrics.headingCount - 1; const fencedBlockLoss = sourceMetrics.fencedCodeBlockCount > 0 && translatedMetrics.fencedCodeBlockCount < sourceMetrics.fencedCodeBlockCount; From a5db86f820c6c3f174a917da2875f6d6b6e19ed1 Mon Sep 17 00:00:00 2001 From: luandro Date: Fri, 20 Mar 2026 08:47:09 -0300 Subject: [PATCH 10/18] docs: add initial CHANGELOG.md file --- CHANGELOG.md | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..ab63fbab --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,28 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). + +## [Unreleased] + +### Added +- **Targeted Notion Fetching:** Added the ability to fetch data for a single Notion page. +- **Pre-Release Safety:** Added validation checks to ensure all translations (locales) are complete. + +### Changed +- **Simplified Data Fetching:** Cleaned up and simplified the logic for fetching all pages from Notion. +- **Docker Tests:** Updated the Docker integration tests to work correctly with the newly added fetch-job types. + +### Removed +- **Code Cleanup:** Removed redundant code from the API schemas to make the codebase cleaner. + +### Fixed +- **Translation Completeness:** Fixed several issues with how the system measures if a page is fully translated. 
+- **Long-form Content Translation:** Prevented issues where content could be lost when translating very long pages. +- **Language Switcher (Locale Dropdown):** + - Fixed a bug where the language switcher would sometimes point to the wrong page. + - Corrected an issue that caused "double" language codes in URLs. + - Fixed navigation issues when switching languages on category index pages. + - Fixed a display issue where the language dropdown might be hidden behind other menu items. +- **Build Scripts:** Resolved bugs in the TypeScript compilation and Markdown parsing scripts. \ No newline at end of file From b040df4f55d6229ea03dc2250a1a527d95abe74d Mon Sep 17 00:00:00 2001 From: luandro Date: Wed, 25 Mar 2026 20:04:20 -0300 Subject: [PATCH 11/18] fix(translate): flag any heading loss as incomplete translation The headingLoss condition tolerated losing one heading silently (`< headingCount - 1`), weakening the regression guard for long-form content loss. Align it with all other structural checks (zero-tolerance) by changing to `< headingCount`. Addresses review feedback on PR #169. 
--- scripts/notion-translate/translateFrontMatter.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/notion-translate/translateFrontMatter.ts b/scripts/notion-translate/translateFrontMatter.ts index b0f1692e..00e7ebfb 100644 --- a/scripts/notion-translate/translateFrontMatter.ts +++ b/scripts/notion-translate/translateFrontMatter.ts @@ -610,7 +610,7 @@ function isSuspiciouslyIncompleteTranslation( translatedMetrics.contentLength / Math.max(sourceMetrics.contentLength, 1); const headingLoss = sourceMetrics.headingCount > 0 && - translatedMetrics.headingCount < sourceMetrics.headingCount - 1; + translatedMetrics.headingCount < sourceMetrics.headingCount; const fencedBlockLoss = sourceMetrics.fencedCodeBlockCount > 0 && translatedMetrics.fencedCodeBlockCount < sourceMetrics.fencedCodeBlockCount; From 014bd8120eb0ca520f5551db671c95c97e916c42 Mon Sep 17 00:00:00 2001 From: luandro Date: Wed, 25 Mar 2026 20:40:02 -0300 Subject: [PATCH 12/18] fix(translate): detect finish_reason:length as token_overflow When OpenAI truncates output due to hitting the token budget it signals finish_reason: "length". Previously the code would fall through and try to parse truncated JSON, producing parse errors or garbage. Now it throws a non-critical token_overflow TranslationError immediately, which the existing translateChunkWithOverflowFallback retry path handles by splitting the chunk and retrying. Adds two tests: one verifying the error classification, one verifying the end-to-end retry-with-smaller-chunks behaviour. 
--- .../translateFrontMatter.test.ts | 64 +++++++++++++++++++ .../notion-translate/translateFrontMatter.ts | 12 +++- 2 files changed, 75 insertions(+), 1 deletion(-) diff --git a/scripts/notion-translate/translateFrontMatter.test.ts b/scripts/notion-translate/translateFrontMatter.test.ts index db0d8148..6bba803b 100644 --- a/scripts/notion-translate/translateFrontMatter.test.ts +++ b/scripts/notion-translate/translateFrontMatter.test.ts @@ -369,6 +369,70 @@ describe("notion-translate translateFrontMatter", () => { ); }); + it("classifies finish_reason:length as non-critical token_overflow", async () => { + const { translateText } = await import("./translateFrontMatter"); + + mockOpenAIChatCompletionCreate.mockResolvedValue({ + choices: [ + { + finish_reason: "length", + message: { + content: '{"markdown":"partial content', + }, + }, + ], + }); + + await expect(translateText("# Body", "Title", "pt-BR")).rejects.toEqual( + expect.objectContaining({ + code: "token_overflow", + isCritical: false, + }) + ); + }); + + it("retries with smaller chunks when finish_reason:length is returned", async () => { + const { translateText } = await import("./translateFrontMatter"); + + mockOpenAIChatCompletionCreate + .mockResolvedValueOnce({ + choices: [ + { + finish_reason: "length", + message: { + content: '{"markdown":"partial content', + }, + }, + ], + }) + .mockImplementation(async (request: MockOpenAIRequest) => { + const payload = extractPromptMarkdown(request); + return { + choices: [ + { + finish_reason: "stop", + message: { + content: JSON.stringify({ + markdown: payload.markdown, + title: "Translated Title", + }), + }, + }, + ], + }; + }); + + const result = await translateText( + "# Small page\n\nJust a paragraph.", + "Small", + "pt-BR" + ); + + expect(mockOpenAIChatCompletionCreate.mock.calls.length).toBeGreaterThan(1); + expect(result.title).toBe("Translated Title"); + expect(result.markdown).toContain("Just a paragraph."); + }); + it("takes the single-call fast path 
for small content", async () => { const { translateText } = await import("./translateFrontMatter"); installStructuredTranslationMock(); diff --git a/scripts/notion-translate/translateFrontMatter.ts b/scripts/notion-translate/translateFrontMatter.ts index 00e7ebfb..2a69236a 100644 --- a/scripts/notion-translate/translateFrontMatter.ts +++ b/scripts/notion-translate/translateFrontMatter.ts @@ -759,7 +759,17 @@ async function translateTextSingleCall( ...modelParams, }); - const content = response.choices[0]?.message?.content; + const choice = response.choices[0]; + const finishReason = choice?.finish_reason; + if (finishReason === "length") { + throw new TranslationError( + "OpenAI output was truncated (finish_reason: length) — chunk too large for model output budget", + "token_overflow", + false + ); + } + + const content = choice?.message?.content; if (!content) { throw new TranslationError( "OpenAI returned an empty translation response", From 71689886935e59052c07d67596bb976b10408d21 Mon Sep 17 00:00:00 2001 From: luandro Date: Wed, 25 Mar 2026 23:30:11 -0300 Subject: [PATCH 13/18] fix(notion-translate): harden translation integrity checks --- i18n/es/code.json | 454 +++++++++-------- i18n/pt/code.json | 463 ++++++++++-------- scripts/notion-translate/index.test.ts | 40 +- .../notion-translate/translateBlocks.test.ts | 45 ++ .../translateFrontMatter.test.ts | 141 ++++++ .../notion-translate/translateFrontMatter.ts | 68 ++- 6 files changed, 791 insertions(+), 420 deletions(-) diff --git a/i18n/es/code.json b/i18n/es/code.json index 92b9f565..da4f2c4b 100644 --- a/i18n/es/code.json +++ b/i18n/es/code.json @@ -1,201 +1,255 @@ { - "theme.TOC.title": { - "message": "En esta página", - "description": "Title for the table of contents section" - }, - "Introduction": { - "message": "Introducción" - }, - "Preparing to Use CoMapeo": { - "message": "Preparación para el uso de CoMapeo" - }, - "Understanding CoMapeo's Core Concepts and Functions": { - "message": "Nueva Página" - 
}, - "Getting Started Essentials": { - "message": "Nuevo título de sección" - }, - "Gathering the Right Equipment for CoMapeo": { - "message": "Reunir el Equipo Adecuado para CoMapeo" - }, - "Device Setup and Maintenance for CoMapeo": { - "message": "Nueva Página" - }, - "Installing CoMapeo & Onboarding": { - "message": "Nueva Página" - }, - "Initial Use and CoMapeo Settings": { - "message": "Nueva Página" - }, - "Uninstalling CoMapeo": { - "message": "Desinstalar CoMapeo" - }, - "Customizing CoMapeo": { - "message": "Nueva Palanca" - }, - "Organizing Key Materials for Projects": { - "message": "Nueva Página" - }, - "Building a Custom Categories Set": { - "message": "Nueva Página" - }, - "Building Custom Background Maps": { - "message": "Nueva Página" - }, - "Observations & Tracks": { - "message": "Nuevo título de sección" - }, - "Gathering Observations & Tracks": { - "message": "Recopilación de observaciones" - }, - "Creating a New Observation": { - "message": "Nueva Página" - }, - "Creating a New Track": { - "message": "Nueva Página" - }, - "Reviewing Observations": { - "message": "Revisión de observaciones" - }, - "Exploring the Observations List": { - "message": "Nueva Página" - }, - "Reviewing an Observation": { - "message": "Nueva Página" - }, - "Editing Observations": { - "message": "Nueva Página" - }, - "Data Privacy & Security": { - "message": "Nuevo título de sección" - }, - "Encryption and Security": { - "message": "Nueva Página" - }, - "Managing Data Privacy & Security": { - "message": "Gestión de datos y privacidad" - }, - "Using an App Passcode for Security": { - "message": "Nueva Página" - }, - "Adjusting Data Sharing and Privacy": { - "message": "Nueva Página" - }, - "Mapping with Collaborators": { - "message": "Nueva Página" - }, - "Managing Projects": { - "message": "Gestión de proyectos" - }, - "Understanding Projects": { - "message": "Nueva Página" - }, - "Creating a New Project": { - "message": "Nueva Página" - }, - "Changing Categories Set": { 
- "message": "Nueva Página" - }, - "Managing a Team": { - "message": "Nueva Página" - }, - "Inviting Collaborators": { - "message": "Nueva Página" - }, - "Ending a Project": { - "message": "Nueva Página" - }, - "Exchanging Project Data": { - "message": "Intercambio de Datos del Proyecto" - }, - "Understanding How Exchange Works": { - "message": "Nueva Página A" - }, - "Using Exchange Offline": { - "message": "Nueva Página" - }, - "Using a Remote Archive": { - "message": "Nueva Página" - }, - "Moving Observations & Tracks Outside of CoMapeo": { - "message": "Compartir observaciones fuera de CoMapeo" - }, - "Sharing a Single Observation and Metadata": { - "message": "Nueva Página" - }, - "Exporting all Observations": { - "message": "Nueva Página" - }, - "Using Observations outside of CoMapeo": { - "message": "Nueva Página" - }, - "Miscellaneous": { - "message": "Misceláneas" - }, - "FAQ": { - "message": "Preguntas frecuentes" - }, - "Glossary": { - "message": "Glosario" - }, - "Troubleshooting": { - "message": "Nueva Palanca" - }, - "Common Solutions": { - "message": "Nueva Página" - }, - "Troubleshooting: Setup and Customization": { - "message": "Nueva Página" - }, - "Troubleshooting: Observations and Tracks": { - "message": "Nueva Página" - }, - "Troubleshooting: Data Privacy and Security": { - "message": "Nueva Página" - }, - "Troubleshooting: Mapping with Collaborators": { - "message": "Nueva Página" - }, - "Troubleshooting: Moving Observations and Tracks outside of CoMapeo": { - "message": "Nueva Página" - }, - "Elementos de contenido de prueba": { - "message": "Elementos de contenido de prueba" - }, - "Testing links": { - "message": "Nueva Página" - }, - "Understanding CoMapeo's Core Concepts and Functions": { - "message": "Nueva Página" - }, - "Installing CoMapeo and Onboarding": { - "message": "Nueva Página" - }, - "Planning and Preparing for a Project": { - "message": "Nueva Página" - }, - "Observations and Tracks": { - "message": "Nuevo título de sección" - 
}, - "Gathering Observations and Tracks": { - "message": "Recopilación de observaciones" - }, - "Data Privacy and Security": { - "message": "Nuevo título de sección" - }, - "Managing Data Privacy and Security": { - "message": "Gestión de datos y privacidad" - }, - "Moving Observations and Tracks Outside of CoMapeo": { - "message": "Compartir observaciones fuera de CoMapeo" - }, - "Developer Tools": { - "message": "Herramientas de desarrollador" - }, - "API Reference": { - "message": "Referencia de API" - }, - "CLI Reference": { - "message": "Referencia de CLI" - } -} + "theme.TOC.title": { + "message": "En esta página", + "description": "Title for the table of contents section" + }, + "Introduction": { + "message": "Introducción" + }, + "Preparing to Use CoMapeo": { + "message": "Preparación para usar " + }, + "Understanding CoMapeo's Core Concepts and Functions": { + "message": "Nueva Página" + }, + "Getting Started Essentials": { + "message": "Nuevo título de sección" + }, + "Gathering the Right Equipment for CoMapeo": { + "message": "Reúne el equipo adecuado para CoMapeo" + }, + "Device Setup and Maintenance for CoMapeo": { + "message": "Nueva Página" + }, + "Installing CoMapeo & Onboarding": { + "message": "Instalación de CoMapeo y primeros pasos" + }, + "Initial Use and CoMapeo Settings": { + "message": "Nueva Página" + }, + "Uninstalling CoMapeo": { + "message": "Desinstalar CoMapeo" + }, + "Customizing CoMapeo": { + "message": "Personaliza CoMapeo" + }, + "Organizing Key Materials for Projects": { + "message": "Nueva Página" + }, + "Building a Custom Categories Set": { + "message": "Nueva Página" + }, + "Building Custom Background Maps": { + "message": "Nueva Página" + }, + "Observations & Tracks": { + "message": "Observaciones y Trayectos" + }, + "Gathering Observations & Tracks": { + "message": "Registra Observaciones y Trayectos" + }, + "Creating a New Observation": { + "message": "Crea una Nueva Observación" + }, + "Creating a New Track": { + "message": 
"Crea un nuevo Trayecto" + }, + "Reviewing Observations": { + "message": "Revisa Observaciones" + }, + "Exploring the Observations List": { + "message": "Explora la Lista de Observaciones" + }, + "Reviewing an Observation": { + "message": "Revisa una Observación" + }, + "Editing Observations": { + "message": "Edita Observaciones" + }, + "Data Privacy & Security": { + "message": "Privacidad y Seguridad de Datos" + }, + "Encryption and Security": { + "message": "Nueva Página" + }, + "Managing Data Privacy & Security": { + "message": "Gestión de Privacidad y Seguridad de Datos" + }, + "Using an App Passcode for Security": { + "message": "Usa una Contraseña para CoMapeo por Seguridad" + }, + "Adjusting Data Sharing and Privacy": { + "message": "Nueva Página" + }, + "Mapping with Collaborators": { + "message": "Mapea con Colaboradores" + }, + "Managing Projects": { + "message": "Gestión de Proyectos" + }, + "Understanding Projects": { + "message": "Comprende las Bases Sobre Proyectos" + }, + "Creating a New Project": { + "message": "Crea un Nuevo Proyecto" + }, + "Changing Categories Set": { + "message": "Nueva Página" + }, + "Managing a Team": { + "message": "Nueva Página" + }, + "Inviting Collaborators": { + "message": "Invita Colaboradores" + }, + "Ending a Project": { + "message": "Nueva Página" + }, + "Exchanging Project Data": { + "message": "Intercambio de Datos del Proyecto" + }, + "Understanding How Exchange Works": { + "message": "Nueva Página A" + }, + "Using Exchange Offline": { + "message": "Nueva Página" + }, + "Using a Remote Archive": { + "message": "Usa un Archivo Remoto" + }, + "Moving Observations & Tracks Outside of CoMapeo": { + "message": "Mueve Observaciones y Trayectos fuera de CoMapeo" + }, + "Sharing a Single Observation and Metadata": { + "message": "Nueva Página" + }, + "Exporting all Observations": { + "message": "Exporta todas las Observaciones" + }, + "Using Observations outside of CoMapeo": { + "message": "Usa Observaciones fuera de 
CoMapeo" + }, + "Miscellaneous": { + "message": "Misceláneas" + }, + "FAQ": { + "message": "Preguntas frecuentes" + }, + "Glossary": { + "message": "Glosario" + }, + "Troubleshooting": { + "message": "Solución de problemas" + }, + "Common Solutions": { + "message": "Nueva Página" + }, + "Troubleshooting: Setup and Customization": { + "message": "Nueva Página" + }, + "Troubleshooting: Observations and Tracks": { + "message": "Nueva Página" + }, + "Troubleshooting: Data Privacy and Security": { + "message": "Nueva Página" + }, + "Troubleshooting: Mapping with Collaborators": { + "message": "Nueva Página" + }, + "Troubleshooting: Moving Observations and Tracks outside of CoMapeo": { + "message": "Nueva Página" + }, + "Elementos de contenido de prueba": { + "message": "Elementos de contenido de prueba" + }, + "Testing links": { + "message": "Nueva Página" + }, + "Installing CoMapeo and Onboarding": { + "message": "Nueva Página" + }, + "Planning and Preparing for a Project": { + "message": "Nueva Página" + }, + "Observations and Tracks": { + "message": "Nuevo título de sección" + }, + "Gathering Observations and Tracks": { + "message": "Recopilación de observaciones" + }, + "Data Privacy and Security": { + "message": "Nuevo título de sección" + }, + "Managing Data Privacy and Security": { + "message": "Gestión de datos y privacidad" + }, + "Moving Observations and Tracks Outside of CoMapeo": { + "message": "Compartir observaciones fuera de CoMapeo" + }, + "Developer Tools": { + "message": "Herramientas de desarrollador" + }, + "API Reference": { + "message": "Referencia de API" + }, + "CLI Reference": { + "message": "Referencia de CLI" + }, + "Understanding CoMapeo’s Core Concepts & Functions": { + "message": "Conoce los conceptos y funciones principales de CoMapeo" + }, + "Getting Started - Essentials": { + "message": "Introducción - Conceptos básicos" + }, + "Device Setup & Maintenance for CoMapeo": { + "message": "Configuración y mantenimiento del dispositivo para 
CoMapeo" + }, + "Initial Use & CoMapeo Settings": { + "message": "Uso Inicial y Ajustes de CoMapeo" + }, + "Creating Custom Background Maps": { + "message": "Crea Mapas de Fondo personalizados" + }, + "Reviewing and Editing Tracks": { + "message": "Revisión y Edición de Trayectos" + }, + "Encryption & Security": { + "message": "Encriptación y Seguridad" + }, + "Adjusting Data Sharing & Privacy": { + "message": "Ajusta el Intercambio de Datos y Privacidad" + }, + "Selecting Device Roles & Teams": { + "message": "Selección de roles y equipos de dispositivos" + }, + "Leave a project": { + "message": "Abandona un proyecto" + }, + "Removing a device from a Project": { + "message": "Eliminar un dispositivo de un proyecto" + }, + "Completing or Ending a Project": { + "message": "Finaliza un Proyecto" + }, + "Exchanging Observations": { + "message": "Intercambia Observaciones" + }, + "Sharing Background Map": { + "message": "Comparte el Mapa de Fondo" + }, + "Sharing a Single Observation & Metadata": { + "message": "Comparte una sola Observación y Metadatos" + }, + "Site Map": { + "message": "Mapa del sitio" + }, + "CoMapeo Data & Privacy (translating for public page)": { + "message": "Nueva Página" + }, + "[TEST] Installation Guide": { + "message": "[PRUEBA] Guía de Instalación" + }, + "Changing Backgroud Maps": { + "message": "Cambia Mapas de Fondo" + } +} \ No newline at end of file diff --git a/i18n/pt/code.json b/i18n/pt/code.json index 7c22c3c1..c3984d3f 100644 --- a/i18n/pt/code.json +++ b/i18n/pt/code.json @@ -1,201 +1,264 @@ { - "theme.TOC.title": { - "message": "Nesta página", - "description": "Title for the table of contents section" - }, - "Introduction": { - "message": "Introdução" - }, - "Preparing to Use CoMapeo": { - "message": "Preparando para usar do CoMapeo (Mobile)" - }, - "Understanding CoMapeo's Core Concepts and Functions": { - "message": "Nova Página" - }, - "Getting Started Essentials": { - "message": "Novo título da seção" - }, - "Gathering the 
Right Equipment for CoMapeo": { - "message": "Reunindo o Equipamento Certo para o CoMapeo" - }, - "Device Setup and Maintenance for CoMapeo": { - "message": "Nova Página" - }, - "Installing CoMapeo & Onboarding": { - "message": "Nova Página" - }, - "Initial Use and CoMapeo Settings": { - "message": "Nova Página" - }, - "Uninstalling CoMapeo": { - "message": "Nova Página" - }, - "Customizing CoMapeo": { - "message": "Novo Alternar" - }, - "Organizing Key Materials for Projects": { - "message": "Nova Página" - }, - "Building a Custom Categories Set": { - "message": "Nova Página" - }, - "Building Custom Background Maps": { - "message": "Nova Página" - }, - "Observations & Tracks": { - "message": "Novo título da seção" - }, - "Gathering Observations & Tracks": { - "message": "Coletando Observações" - }, - "Creating a New Observation": { - "message": "Nova Página" - }, - "Creating a New Track": { - "message": "Nova Página" - }, - "Reviewing Observations": { - "message": "Revisando Observações" - }, - "Exploring the Observations List": { - "message": "Nova Página" - }, - "Reviewing an Observation": { - "message": "Nova Página" - }, - "Editing Observations": { - "message": "Nova Página" - }, - "Data Privacy & Security": { - "message": "Novo título da seção" - }, - "Encryption and Security": { - "message": "Nova Página" - }, - "Managing Data Privacy & Security": { - "message": "Gerenciamento de dados e privacidade" - }, - "Using an App Passcode for Security": { - "message": "Nova Página" - }, - "Adjusting Data Sharing and Privacy": { - "message": "Nova Página" - }, - "Mapping with Collaborators": { - "message": "Nova Página" - }, - "Managing Projects": { - "message": "Gerenciando Projetos" - }, - "Understanding Projects": { - "message": "Nova Página" - }, - "Creating a New Project": { - "message": "Nova Página" - }, - "Changing Categories Set": { - "message": "Nova Página" - }, - "Managing a Team": { - "message": "Nova Página" - }, - "Inviting Collaborators": { - 
"message": "Nova Página" - }, - "Ending a Project": { - "message": "Nova Página" - }, - "Exchanging Project Data": { - "message": "Troca de Dados do Projeto" - }, - "Understanding How Exchange Works": { - "message": "Nova Página A" - }, - "Using Exchange Offline": { - "message": "Nova Página" - }, - "Using a Remote Archive": { - "message": "Nova Página" - }, - "Moving Observations & Tracks Outside of CoMapeo": { - "message": "Compartilhando observações fora do CoMapeo" - }, - "Sharing a Single Observation and Metadata": { - "message": "Nova Página" - }, - "Exporting all Observations": { - "message": "Nova Página" - }, - "Using Observations outside of CoMapeo": { - "message": "Nova Página" - }, - "Miscellaneous": { - "message": "Variado" - }, - "FAQ": { - "message": "Perguntas frequentes" - }, - "Glossary": { - "message": "Glossário" - }, - "Troubleshooting": { - "message": "Resolução de Problemas" - }, - "Common Solutions": { - "message": "Nova Página" - }, - "Troubleshooting: Setup and Customization": { - "message": "Nova Página" - }, - "Troubleshooting: Observations and Tracks": { - "message": "Nova Página" - }, - "Troubleshooting: Data Privacy and Security": { - "message": "Nova Página" - }, - "Troubleshooting: Mapping with Collaborators": { - "message": "Nova Página" - }, - "Troubleshooting: Moving Observations and Tracks outside of CoMapeo": { - "message": "Nova Página" - }, - "Elementos de Conteúdo de Teste": { - "message": "Elementos de Conteúdo de Teste" - }, - "Testing links": { - "message": "Nova Página" - }, - "Understanding CoMapeo's Core Concepts and Functions": { - "message": "Nova Página" - }, - "Installing CoMapeo and Onboarding": { - "message": "Nova Página" - }, - "Planning and Preparing for a Project": { - "message": "Nova Página" - }, - "Observations and Tracks": { - "message": "Novo título da seção" - }, - "Gathering Observations and Tracks": { - "message": "Coletando Observações" - }, - "Data Privacy and Security": { - "message": "Novo título 
da seção" - }, - "Managing Data Privacy and Security": { - "message": "Gerenciamento de dados e privacidade" - }, - "Moving Observations and Tracks Outside of CoMapeo": { - "message": "Compartilhando observações fora do CoMapeo" - }, - "Developer Tools": { - "message": "Ferramentas de desenvolvedor" - }, - "API Reference": { - "message": "Referência de API" - }, - "CLI Reference": { - "message": "Referência de CLI" - } -} + "theme.TOC.title": { + "message": "Nesta página", + "description": "Title for the table of contents section" + }, + "Introduction": { + "message": "Introdução" + }, + "Preparing to Use CoMapeo": { + "message": "Preparação para usar CoMapeo" + }, + "Understanding CoMapeo's Core Concepts and Functions": { + "message": "Nova Página" + }, + "Getting Started Essentials": { + "message": "Novo título da seção" + }, + "Gathering the Right Equipment for CoMapeo": { + "message": "Reunindo o Equipamento Adequado para CoMapeo" + }, + "Device Setup and Maintenance for CoMapeo": { + "message": "Nova Página" + }, + "Installing CoMapeo & Onboarding": { + "message": "Instalando o CoMapeo e Integração" + }, + "Initial Use and CoMapeo Settings": { + "message": "Nova Página" + }, + "Uninstalling CoMapeo": { + "message": "Desinstalando o CoMapeo" + }, + "Customizing CoMapeo": { + "message": "Personalizando CoMapeo" + }, + "Organizing Key Materials for Projects": { + "message": "Nova Página" + }, + "Building a Custom Categories Set": { + "message": "Nova Página" + }, + "Building Custom Background Maps": { + "message": "Nova Página" + }, + "Observations & Tracks": { + "message": "Observações e Trilhas" + }, + "Gathering Observations & Tracks": { + "message": "Coletando Observações e Trilhas" + }, + "Creating a New Observation": { + "message": "Criando uma Nova Observação" + }, + "Creating a New Track": { + "message": "Criando uma Nova Trilha" + }, + "Reviewing Observations": { + "message": "Revisando Observações" + }, + "Exploring the Observations List": { + 
"message": "Explorando a Lista de Observações" + }, + "Reviewing an Observation": { + "message": "Revisando uma observação" + }, + "Editing Observations": { + "message": "Editando observações" + }, + "Data Privacy & Security": { + "message": "Privacidade e segurança de dados" + }, + "Encryption and Security": { + "message": "Nova Página" + }, + "Managing Data Privacy & Security": { + "message": "Gestão de Privacidade de Dados e Segurança" + }, + "Using an App Passcode for Security": { + "message": "Utilize uma senha para o CoMapeo por motivos de segurança" + }, + "Adjusting Data Sharing and Privacy": { + "message": "Nova Página" + }, + "Mapping with Collaborators": { + "message": "Mapeamento com Colaboradores" + }, + "Managing Projects": { + "message": "Gerenciando Projetos" + }, + "Understanding Projects": { + "message": "Entenda os Fundamentos de Projetos" + }, + "Creating a New Project": { + "message": "Criar um novo projeto" + }, + "Changing Categories Set": { + "message": "Alterando o Conjunto de Categorias" + }, + "Managing a Team": { + "message": "Nova Página" + }, + "Inviting Collaborators": { + "message": "Convidar colaboradores" + }, + "Ending a Project": { + "message": "Nova Página" + }, + "Exchanging Project Data": { + "message": "Troca de Dados do Projeto" + }, + "Understanding How Exchange Works": { + "message": "Entendendo Como a Troca Funciona" + }, + "Using Exchange Offline": { + "message": "Trocar informações sem conexão com a internet" + }, + "Using a Remote Archive": { + "message": "Usar um arquivo remoto" + }, + "Moving Observations & Tracks Outside of CoMapeo": { + "message": "Compartilhando observações fora do CoMapeo" + }, + "Sharing a Single Observation and Metadata": { + "message": "Nova Página" + }, + "Exporting all Observations": { + "message": "Exportar todas as observações" + }, + "Using Observations outside of CoMapeo": { + "message": "Utilizando observações fora do CoMapeo" + }, + "Miscellaneous": { + "message": "Variado" + }, + 
"FAQ": { + "message": "Perguntas frequentes" + }, + "Glossary": { + "message": "Glossário" + }, + "Troubleshooting": { + "message": "Solução de problemas" + }, + "Common Solutions": { + "message": "Soluções Comuns" + }, + "Troubleshooting: Setup and Customization": { + "message": "Nova Página" + }, + "Troubleshooting: Observations and Tracks": { + "message": "Nova Página" + }, + "Troubleshooting: Data Privacy and Security": { + "message": "Nova Página" + }, + "Troubleshooting: Mapping with Collaborators": { + "message": "Nova Página" + }, + "Troubleshooting: Moving Observations and Tracks outside of CoMapeo": { + "message": "Nova Página" + }, + "Elementos de Conteúdo de Teste": { + "message": "Elementos de Conteúdo de Teste" + }, + "Testing links": { + "message": "Nova Página" + }, + "Installing CoMapeo and Onboarding": { + "message": "Nova Página" + }, + "Planning and Preparing for a Project": { + "message": "Nova Página" + }, + "Observations and Tracks": { + "message": "Novo título da seção" + }, + "Gathering Observations and Tracks": { + "message": "Coletando Observações" + }, + "Data Privacy and Security": { + "message": "Novo título da seção" + }, + "Managing Data Privacy and Security": { + "message": "Gerenciamento de dados e privacidade" + }, + "Moving Observations and Tracks Outside of CoMapeo": { + "message": "Compartilhando observações fora do CoMapeo" + }, + "Developer Tools": { + "message": "Ferramentas de desenvolvedor" + }, + "API Reference": { + "message": "Referência de API" + }, + "CLI Reference": { + "message": "Referência de CLI" + }, + "Understanding CoMapeo’s Core Concepts & Functions": { + "message": "Entendendo os Conceitos e Funções Principais do CoMapeo" + }, + "Getting Started - Essentials": { + "message": "Introdução - Noções básicas" + }, + "Device Setup & Maintenance for CoMapeo": { + "message": "Configuração e manutenção do dispositivo para o CoMapeo" + }, + "Initial Use & CoMapeo Settings": { + "message": "Uso inicial e Configurações 
do CoMapeo" + }, + "Planning & Preparing for a Project": { + "message": "Planejamento e Preparação para um Projeto" + }, + "Creating a Custom Categories Set": { + "message": "Construindo um Conjunto de Categorias Personalizado" + }, + "Creating Custom Background Maps": { + "message": "Criando mapas de fundo personalizados" + }, + "Reviewing and Editing Tracks": { + "message": "Revisão e Edição de Trilha" + }, + "Encryption & Security": { + "message": "Criptografia e Segurança" + }, + "Adjusting Data Sharing & Privacy": { + "message": "Ajuste o compartilhamento e a privacidade dos dados" + }, + "Selecting Device Roles & Teams": { + "message": "Seleção de funções e equipes de dispositivos" + }, + "Leave a project": { + "message": "Abandonar um projeto" + }, + "Removing a device from a Project": { + "message": "Remover um dispositivo de um projeto" + }, + "Completing or Ending a Project": { + "message": "Concluir um projeto" + }, + "Exchanging Observations": { + "message": "Tracar Observao…" + }, + "Sharing Background Map": { + "message": "Compartilhe o mapa de fundo" + }, + "Sharing a Single Observation & Metadata": { + "message": "Compartilhe uma única observação e metadados." 
+ }, + "Site Map": { + "message": "Mapa do site" + }, + "Troubleshooting: Setup & Customization": { + "message": "Solução de Problemas: Configuração e Personalização" + }, + "CoMapeo Data & Privacy (translating for public page)": { + "message": "Nova Página" + }, + "[TEST] Installation Guide": { + "message": "[TESTE] Guia de Instalação" + }, + "Changing Backgroud Maps": { + "message": "Alterando mapas de fundo" + } +} \ No newline at end of file diff --git a/scripts/notion-translate/index.test.ts b/scripts/notion-translate/index.test.ts index 4cdcad63..ef66d2ae 100644 --- a/scripts/notion-translate/index.test.ts +++ b/scripts/notion-translate/index.test.ts @@ -1,3 +1,4 @@ +import path from "path"; import { describe, it, expect, beforeEach, afterEach, vi } from "vitest"; import { createMockNotionPage, installTestNotionEnv } from "../test-utils"; @@ -685,8 +686,16 @@ describe("notion-translate index", () => { it("exits with failure on partial doc translation failures and reports counts", async () => { mockTranslateText.mockImplementation( async (_markdown: string, _title: string, targetLanguage: string) => { - if (targetLanguage === "es") { - throw new Error("es translation failed"); + if (targetLanguage === "pt-BR") { + throw Object.assign( + new Error( + "Translated markdown appears incomplete after chunk reassembly" + ), + { + code: "unexpected_error", + isCritical: false, + } + ); } return { markdown: "# translated", @@ -708,10 +717,37 @@ describe("notion-translate index", () => { totalEnglishPages: 1, processedLanguages: 2, failedTranslations: 1, + newTranslations: 1, + updatedTranslations: 0, + skippedTranslations: 0, codeJsonFailures: 0, themeFailures: 0, }); expect(loggedSummary.failures).toHaveLength(1); + expect(loggedSummary.failures[0]).toMatchObject({ + language: "pt-BR", + title: "Hello World", + pageId: "english-page-1", + error: "Translated markdown appears incomplete after chunk reassembly", + isCritical: false, + }); + + const failedDocPath = 
path.join( + "i18n", + "pt", + "docusaurus-plugin-content-docs", + "current", + "hello-world-englishpage1.md" + ); + expect( + mockNotionPagesCreate.mock.calls.length + + mockNotionPagesUpdate.mock.calls.length + ).toBe(1); + expect( + mockWriteFile.mock.calls.some( + ([filePath]) => String(filePath) === failedDocPath + ) + ).toBe(false); }); it("does not block translation for generic signed amazonaws links outside Notion image URL families", async () => { diff --git a/scripts/notion-translate/translateBlocks.test.ts b/scripts/notion-translate/translateBlocks.test.ts index 0dd0c957..322cb175 100644 --- a/scripts/notion-translate/translateBlocks.test.ts +++ b/scripts/notion-translate/translateBlocks.test.ts @@ -206,6 +206,51 @@ describe("translateNotionBlocksDirectly", () => { expect(callout.rich_text[0].text.content).toBe("static/images/block.png"); }); + it("keeps short rich-text paragraph translation intact", async () => { + mockBlocksChildrenList.mockResolvedValue( + blocksResponse([ + { + id: "b7", + type: "paragraph", + paragraph: { + rich_text: [ + { + type: "text", + text: { content: "Short paragraph content" }, + plain_text: "Short paragraph content", + }, + ], + }, + has_children: false, + }, + ]) + ); + + mockTranslateText.mockResolvedValue({ + markdown: "Parágrafo curto traduzido", + title: "", + }); + + const { translateNotionBlocksDirectly } = await import("./translateBlocks"); + const result = await translateNotionBlocksDirectly("page-id", "pt-BR"); + + const block = result[0] as Record<string, unknown>; + expect(block.type).toBe("paragraph"); + const paragraph = block.paragraph as { + rich_text: Array<{ text: { content: string }; plain_text: string }>; + }; + expect(paragraph.rich_text[0].text.content).toBe( + "Parágrafo curto traduzido" + ); + expect(paragraph.rich_text[0].plain_text).toBe("Parágrafo curto traduzido"); + expect(mockTranslateText).toHaveBeenCalledTimes(1); + expect(mockTranslateText).toHaveBeenCalledWith( + "Short paragraph content", + "", + "pt-BR"
+ ); + }); + it("strips Notion-internal metadata fields from output blocks", async () => { mockBlocksChildrenList.mockResolvedValue( blocksResponse([ diff --git a/scripts/notion-translate/translateFrontMatter.test.ts b/scripts/notion-translate/translateFrontMatter.test.ts index 6bba803b..90044efe 100644 --- a/scripts/notion-translate/translateFrontMatter.test.ts +++ b/scripts/notion-translate/translateFrontMatter.test.ts @@ -301,6 +301,110 @@ describe("notion-translate translateFrontMatter", () => { expect(result.markdown.length).toBeGreaterThan(4_000); }); + it("does not count marker-like text inside fenced code blocks toward completeness checks", async () => { + const { translateText } = await import("./translateFrontMatter"); + + const source = + "# Section One\n\n" + + "```md\n" + + "# not a real heading\n" + + "- fake bullet\n" + + "1. fake number\n" + + ":::note\n" + + "table | row\n" + + "```\n\n" + + "Plain paragraph."; + + mockOpenAIChatCompletionCreate.mockResolvedValueOnce({ + choices: [ + { + message: { + content: JSON.stringify({ + markdown: + "# Seção Um\n\n```md\n" + + "not a real heading\n" + + "fake bullet\n" + + "fake number\n" + + ":::note\n" + + "table | row\n" + + "```\n\n" + + "Parágrafo simples.", + title: "Título Traduzido", + }), + }, + }, + ], + }); + + const result = await translateText(source, "Original Title", "pt-BR"); + + expect(mockOpenAIChatCompletionCreate).toHaveBeenCalledTimes(1); + expect(result.markdown).toContain("Parágrafo simples."); + expect(result.markdown).toContain("not a real heading"); + }); + + it("retries chunked translations when the reassembled markdown is structurally incomplete", async () => { + const { translateText } = await import("./translateFrontMatter"); + + const source = + "# Section One\n\n" + + "- Item one A\n" + + "- Item one B\n\n" + + "Alpha ".repeat(500) + + "\n\n# Section Two\n\n" + + "- Item two A\n" + + "- Item two B\n\n" + + "Beta ".repeat(500); + + let callCount = 0; + 
mockOpenAIChatCompletionCreate.mockImplementation( + async (request: MockOpenAIRequest) => { + callCount++; + const payload = extractPromptMarkdown(request); + const translated = + callCount <= 2 + ? { + title: "Título Traduzido", + markdown: payload.markdown + .replace("# Section One", "# Seção Um") + .replace("# Section Two", "# Seção Dois") + .replace(/^- /gm, "") + .replace(/Alpha/g, "Alfa") + .replace(/Beta/g, "Beta") + .replace(/Gamma/g, "Gama"), + } + : { + title: "Título Traduzido", + markdown: payload.markdown + .replace("# Section One", "# Seção Um") + .replace("# Section Two", "# Seção Dois") + .replace(/Alpha/g, "Alfa") + .replace(/Beta/g, "Beta") + .replace(/Gamma/g, "Gama"), + }; + + return { + choices: [ + { + message: { + content: JSON.stringify(translated), + }, + }, + ], + }; + } + ); + + const result = await translateText(source, "Original Title", "pt-BR", { + chunkLimit: 8_500, + }); + + expect(callCount).toBeGreaterThan(2); + expect(result.markdown).toContain("Item one A"); + expect(result.markdown).toContain("Item two B"); + expect(result.markdown).toContain("# Seção Dois"); + }); + it("preserves complete heading structures when chunking by sections", async () => { const { translateText } = await import("./translateFrontMatter"); installStructuredTranslationMock(({ title, markdown }) => ({ @@ -579,6 +683,43 @@ describe("notion-translate translateFrontMatter", () => { expect(result.markdown).toContain(dataUrl); }); + it("retries when a canonical /images path is rewritten", async () => { + const { translateText } = await import("./translateFrontMatter"); + const canonicalImagePath = "/images/example.png"; + + mockOpenAIChatCompletionCreate + .mockResolvedValueOnce({ + choices: [ + { + message: { + content: JSON.stringify({ + markdown: "![image](/images/changed-path.png)\n\nTranslated", + title: "Translated Title", + }), + }, + }, + ], + }) + .mockResolvedValueOnce({ + choices: [ + { + message: { + content: JSON.stringify({ + markdown: 
`![image](${canonicalImagePath})\n\nTranslated`, + title: "Translated Title", + }), + }, + }, + ], + }); + + const source = `![image](${canonicalImagePath})\n\nBody text`; + const result = await translateText(source, "Title", "pt-BR"); + + expect(mockOpenAIChatCompletionCreate).toHaveBeenCalledTimes(2); + expect(result.markdown).toContain(canonicalImagePath); + }); + it("splitMarkdownIntoChunks does not split on headings inside fenced code blocks", async () => { const { splitMarkdownIntoChunks } = await import("./translateFrontMatter"); diff --git a/scripts/notion-translate/translateFrontMatter.ts b/scripts/notion-translate/translateFrontMatter.ts index 2a69236a..56c1fa20 100644 --- a/scripts/notion-translate/translateFrontMatter.ts +++ b/scripts/notion-translate/translateFrontMatter.ts @@ -31,7 +31,11 @@ const MAX_RETRIES = TRANSLATION_MAX_RETRIES; const RETRY_BASE_DELAY_MS = TRANSLATION_RETRY_BASE_DELAY_MS; const DATA_URL_PLACEHOLDER_REGEX = /\/images\/__data_url_placeholder_\d+__\.png/g; +const CANONICAL_IMAGE_PATH_REGEX = /\/images\/[^\s)"'<>]+/g; const MAX_PLACEHOLDER_INTEGRITY_RETRIES = 2; + +const isDataUrlPlaceholderPath = (path: string): boolean => + /\/images\/__data_url_placeholder_\d+__\.png/.test(path); // Translation prompt template const TRANSLATION_PROMPT = ` # Role: Translation Assistant @@ -457,22 +461,30 @@ function extractDataUrlPlaceholders(text: string): string[] { return Array.from(new Set(matches)); } -function getMissingPlaceholders( +function extractCanonicalImagePaths(text: string): string[] { + const matches = text.match(CANONICAL_IMAGE_PATH_REGEX) ?? 
[]; + + return Array.from( + new Set(matches.filter((match) => !isDataUrlPlaceholderPath(match))) + ); +} + +function getMissingProtectedPaths( text: string, - requiredPlaceholders: string[] + requiredPaths: string[] ): string[] { - return requiredPlaceholders.filter( - (placeholder) => !text.includes(placeholder) - ); + return requiredPaths.filter((requiredPath) => !text.includes(requiredPath)); } -function isPlaceholderIntegrityError( +function isProtectedPathIntegrityError( error: unknown ): error is TranslationError { return ( error instanceof TranslationError && error.code === "schema_invalid" && - /Data URL placeholder integrity check failed/.test(error.message) + /(Data URL placeholder|Canonical image path) integrity check failed/.test( + error.message + ) ); } @@ -711,12 +723,12 @@ async function translateTextSingleCall( text: string, title: string, targetLanguage: string, - requiredPlaceholders: string[] = [], + requiredProtectedPaths: string[] = [], strictPlaceholderGuard = false ): Promise<{ markdown: string; title: string }> { const placeholderGuard = - requiredPlaceholders.length > 0 - ? `\n\n${strictPlaceholderGuard ? "CRITICAL REQUIREMENT" : "Placeholder paths to preserve exactly"}:\n${requiredPlaceholders.map((placeholder) => `- ${placeholder}`).join("\n")}\n` + requiredProtectedPaths.length > 0 + ? `\n\n${strictPlaceholderGuard ? 
"CRITICAL REQUIREMENT" : "Image paths to preserve exactly"}:\n${requiredProtectedPaths.map((requiredPath) => `- ${requiredPath}`).join("\n")}\n` : ""; const textWithTitle = `title: ${title}\n${placeholderGuard}\nmarkdown: ${text}`; @@ -780,14 +792,29 @@ async function translateTextSingleCall( const parsed = parseTranslationPayload(content); - if (requiredPlaceholders.length > 0) { - const missingPlaceholders = getMissingPlaceholders( + if (requiredProtectedPaths.length > 0) { + const missingProtectedPaths = getMissingProtectedPaths( parsed.markdown, - requiredPlaceholders + requiredProtectedPaths ); - if (missingPlaceholders.length > 0) { + if (missingProtectedPaths.length > 0) { + const missingPlaceholderPaths = missingProtectedPaths.filter( + isDataUrlPlaceholderPath + ); + const missingCanonicalImagePaths = missingProtectedPaths.filter( + (path) => !isDataUrlPlaceholderPath(path) + ); + + if (missingPlaceholderPaths.length > 0) { + throw new TranslationError( + `Data URL placeholder integrity check failed: missing ${missingPlaceholderPaths.length} placeholder(s): ${missingPlaceholderPaths.slice(0, 3).join(", ")}`, + "schema_invalid", + true + ); + } + throw new TranslationError( - `Data URL placeholder integrity check failed: missing ${missingPlaceholders.length} placeholder(s): ${missingPlaceholders.slice(0, 3).join(", ")}`, + `Canonical image path integrity check failed: missing ${missingCanonicalImagePaths.length} path(s): ${missingCanonicalImagePaths.slice(0, 3).join(", ")}`, "schema_invalid", true ); @@ -824,19 +851,24 @@ async function translateChunkWithOverflowFallback( placeholderGuardAttempt = 0, chunkBudgetForRetry = getProactiveChunkCharLimit(model) ): Promise<{ markdown: string; title: string }> { - const requiredPlaceholders = extractDataUrlPlaceholders(text); + const requiredProtectedPaths = Array.from( + new Set([ + ...extractDataUrlPlaceholders(text), + ...extractCanonicalImagePaths(text), + ]) + ); try { return await translateTextSingleCall( 
text, title, targetLanguage, - requiredPlaceholders, + requiredProtectedPaths, placeholderGuardAttempt > 0 ); } catch (err) { if ( - isPlaceholderIntegrityError(err) && + isProtectedPathIntegrityError(err) && placeholderGuardAttempt < MAX_PLACEHOLDER_INTEGRITY_RETRIES ) { return translateChunkWithOverflowFallback( From 3252c661f5b695d62b02ce3b92a5ec0f5ab76b2b Mon Sep 17 00:00:00 2001 From: luandro Date: Thu, 26 Mar 2026 07:09:43 -0300 Subject: [PATCH 14/18] fix(translate): handle indented fenced code blocks --- .../translateFrontMatter.test.ts | 72 +++++++++++++++++++ .../notion-translate/translateFrontMatter.ts | 8 ++- 2 files changed, 77 insertions(+), 3 deletions(-) diff --git a/scripts/notion-translate/translateFrontMatter.test.ts b/scripts/notion-translate/translateFrontMatter.test.ts index 90044efe..c4a46646 100644 --- a/scripts/notion-translate/translateFrontMatter.test.ts +++ b/scripts/notion-translate/translateFrontMatter.test.ts @@ -343,6 +343,57 @@ describe("notion-translate translateFrontMatter", () => { expect(result.markdown).toContain("not a real heading"); }); + it("retries when an indented fenced block is dropped during translation", async () => { + const { translateText } = await import("./translateFrontMatter"); + + const source = + "# Section One\n\n" + + "- Item one\n\n" + + " ```js\n" + + " console.log('keep me');\n" + + " ```\n\n" + + "Plain paragraph."; + + mockOpenAIChatCompletionCreate + .mockResolvedValueOnce({ + choices: [ + { + message: { + content: JSON.stringify({ + markdown: + "# Seção Um\n\n" + "- Item um\n\n" + "Plain paragraph.", + title: "Título Traduzido", + }), + }, + }, + ], + }) + .mockResolvedValueOnce({ + choices: [ + { + message: { + content: JSON.stringify({ + markdown: + "# Seção Um\n\n" + + "- Item um\n\n" + + " ```js\n" + + " console.log('keep me');\n" + + " ```\n\n" + + "Parágrafo simples.", + title: "Título Traduzido", + }), + }, + }, + ], + }); + + const result = await translateText(source, "Original Title", 
"pt-BR"); + + expect(mockOpenAIChatCompletionCreate).toHaveBeenCalledTimes(2); + expect(result.markdown).toContain("console.log('keep me');"); + expect(result.markdown).toContain("Parágrafo simples."); + }); + it("retries chunked translations when the reassembled markdown is structurally incomplete", async () => { const { translateText } = await import("./translateFrontMatter"); @@ -737,6 +788,27 @@ describe("notion-translate translateFrontMatter", () => { expect(fenceChunk).toContain("# not a heading"); }); + it("splitMarkdownIntoChunks does not split on headings inside indented fenced code blocks", async () => { + const { splitMarkdownIntoChunks } = await import("./translateFrontMatter"); + + const content = + "# Real Heading\n\n" + + "- Item one\n\n" + + " ```\n" + + " # not a heading\n" + + " ```\n\n" + + "# Another Heading\n\n" + + "text\n"; + + const chunks = splitMarkdownIntoChunks(content, 55); + + const joined = chunks.join(""); + expect(joined).toBe(content); + const fenceChunk = chunks.find((c) => c.includes(" ```")); + expect(fenceChunk).toBeDefined(); + expect(fenceChunk).toContain("# not a heading"); + }); + it("splitMarkdownIntoChunks reassembly is lossless", async () => { const { splitMarkdownIntoChunks } = await import("./translateFrontMatter"); diff --git a/scripts/notion-translate/translateFrontMatter.ts b/scripts/notion-translate/translateFrontMatter.ts index 56c1fa20..cbcef433 100644 --- a/scripts/notion-translate/translateFrontMatter.ts +++ b/scripts/notion-translate/translateFrontMatter.ts @@ -303,8 +303,8 @@ function splitBySections(markdown: string): string[] { const lineWithNewline = idx < lastIdx ? line + "\n" : line.length > 0 ? line : ""; - // Toggle fence state on ``` or ~~~ lines - if (/^(`{3,}|~{3,})/.test(line)) { + // Toggle fence state on fenced code markers, including up to 3 leading spaces. 
+ if (/^[ \t]{0,3}(`{3,}|~{3,})/.test(line)) { inFence = !inFence; } // Start a new section before any ATX heading (outside fences) @@ -563,7 +563,9 @@ function collectMarkdownStructureMetrics( const withoutFrontmatter = stripYamlFrontmatter(markdown); // Fenced code blocks must be counted on raw markdown (before stripping). - const fencedCodeMatches = withoutFrontmatter.match(/^(`{3,}|~{3,})/gm) ?? []; + // Allow up to 3 leading spaces so the metric matches CommonMark fence rules. + const fencedCodeMatches = + withoutFrontmatter.match(/^[ \t]{0,3}(`{3,}|~{3,})/gm) ?? []; // All other structural markers are measured on the stripped version so that // examples inside code blocks do not inflate the counts. From 93ba4559b3ac023a7746f00f7de2d7a28d8f35c5 Mon Sep 17 00:00:00 2001 From: luandro Date: Thu, 26 Mar 2026 07:20:56 -0300 Subject: [PATCH 15/18] fix(i18n): restore translation strings and changelog formatting --- CHANGELOG.md | 8 ++++++-- i18n/es/code.json | 4 ++-- i18n/pt/code.json | 4 ++-- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ab63fbab..87eeded5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,22 +7,26 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). ## [Unreleased] ### Added + - **Targeted Notion Fetching:** Added the ability to fetch data for a single Notion page. - **Pre-Release Safety:** Added validation checks to ensure all translations (locales) are complete. ### Changed + - **Simplified Data Fetching:** Cleaned up and simplified the logic for fetching all pages from Notion. - **Docker Tests:** Updated the Docker integration tests to work correctly with the newly added fetch-job types. ### Removed + - **Code Cleanup:** Removed redundant code from the API schemas to make the codebase cleaner. ### Fixed + - **Translation Completeness:** Fixed several issues with how the system measures if a page is fully translated. 
- **Long-form Content Translation:** Prevented issues where content could be lost when translating very long pages. -- **Language Switcher (Locale Dropdown):** +- **Language Switcher (Locale Dropdown):** - Fixed a bug where the language switcher would sometimes point to the wrong page. - Corrected an issue that caused "double" language codes in URLs. - Fixed navigation issues when switching languages on category index pages. - Fixed a display issue where the language dropdown might be hidden behind other menu items. -- **Build Scripts:** Resolved bugs in the TypeScript compilation and Markdown parsing scripts. \ No newline at end of file +- **Build Scripts:** Resolved bugs in the TypeScript compilation and Markdown parsing scripts. diff --git a/i18n/es/code.json b/i18n/es/code.json index da4f2c4b..71950028 100644 --- a/i18n/es/code.json +++ b/i18n/es/code.json @@ -7,7 +7,7 @@ "message": "Introducción" }, "Preparing to Use CoMapeo": { - "message": "Preparación para usar " + "message": "Preparación para el uso de CoMapeo" }, "Understanding CoMapeo's Core Concepts and Functions": { "message": "Nueva Página" @@ -252,4 +252,4 @@ "Changing Backgroud Maps": { "message": "Cambia Mapas de Fondo" } -} \ No newline at end of file +} diff --git a/i18n/pt/code.json b/i18n/pt/code.json index c3984d3f..f0cf606f 100644 --- a/i18n/pt/code.json +++ b/i18n/pt/code.json @@ -238,7 +238,7 @@ "message": "Concluir um projeto" }, "Exchanging Observations": { - "message": "Tracar Observao…" + "message": "Troca de Observações" }, "Sharing Background Map": { "message": "Compartilhe o mapa de fundo" @@ -261,4 +261,4 @@ "Changing Backgroud Maps": { "message": "Alterando mapas de fundo" } -} \ No newline at end of file +} From 2a5bb8740e33826ecfefd85bbe18c6b7ad7b1885 Mon Sep 17 00:00:00 2001 From: luandro Date: Thu, 26 Mar 2026 07:30:45 -0300 Subject: [PATCH 16/18] revert(i18n): remove locale files from issue-166 --- i18n/es/code.json | 452 ++++++++++++++++++++------------------------- 
i18n/pt/code.json | 461 ++++++++++++++++++++-------------------------- 2 files changed, 398 insertions(+), 515 deletions(-) diff --git a/i18n/es/code.json b/i18n/es/code.json index 71950028..92b9f565 100644 --- a/i18n/es/code.json +++ b/i18n/es/code.json @@ -1,255 +1,201 @@ { - "theme.TOC.title": { - "message": "En esta página", - "description": "Title for the table of contents section" - }, - "Introduction": { - "message": "Introducción" - }, - "Preparing to Use CoMapeo": { - "message": "Preparación para el uso de CoMapeo" - }, - "Understanding CoMapeo's Core Concepts and Functions": { - "message": "Nueva Página" - }, - "Getting Started Essentials": { - "message": "Nuevo título de sección" - }, - "Gathering the Right Equipment for CoMapeo": { - "message": "Reúne el equipo adecuado para CoMapeo" - }, - "Device Setup and Maintenance for CoMapeo": { - "message": "Nueva Página" - }, - "Installing CoMapeo & Onboarding": { - "message": "Instalación de CoMapeo y primeros pasos" - }, - "Initial Use and CoMapeo Settings": { - "message": "Nueva Página" - }, - "Uninstalling CoMapeo": { - "message": "Desinstalar CoMapeo" - }, - "Customizing CoMapeo": { - "message": "Personaliza CoMapeo" - }, - "Organizing Key Materials for Projects": { - "message": "Nueva Página" - }, - "Building a Custom Categories Set": { - "message": "Nueva Página" - }, - "Building Custom Background Maps": { - "message": "Nueva Página" - }, - "Observations & Tracks": { - "message": "Observaciones y Trayectos" - }, - "Gathering Observations & Tracks": { - "message": "Registra Observaciones y Trayectos" - }, - "Creating a New Observation": { - "message": "Crea una Nueva Observación" - }, - "Creating a New Track": { - "message": "Crea un nuevo Trayecto" - }, - "Reviewing Observations": { - "message": "Revisa Observaciones" - }, - "Exploring the Observations List": { - "message": "Explora la Lista de Observaciones" - }, - "Reviewing an Observation": { - "message": "Revisa una Observación" - }, - "Editing 
Observations": { - "message": "Edita Observaciones" - }, - "Data Privacy & Security": { - "message": "Privacidad y Seguridad de Datos" - }, - "Encryption and Security": { - "message": "Nueva Página" - }, - "Managing Data Privacy & Security": { - "message": "Gestión de Privacidad y Seguridad de Datos" - }, - "Using an App Passcode for Security": { - "message": "Usa una Contraseña para CoMapeo por Seguridad" - }, - "Adjusting Data Sharing and Privacy": { - "message": "Nueva Página" - }, - "Mapping with Collaborators": { - "message": "Mapea con Colaboradores" - }, - "Managing Projects": { - "message": "Gestión de Proyectos" - }, - "Understanding Projects": { - "message": "Comprende las Bases Sobre Proyectos" - }, - "Creating a New Project": { - "message": "Crea un Nuevo Proyecto" - }, - "Changing Categories Set": { - "message": "Nueva Página" - }, - "Managing a Team": { - "message": "Nueva Página" - }, - "Inviting Collaborators": { - "message": "Invita Colaboradores" - }, - "Ending a Project": { - "message": "Nueva Página" - }, - "Exchanging Project Data": { - "message": "Intercambio de Datos del Proyecto" - }, - "Understanding How Exchange Works": { - "message": "Nueva Página A" - }, - "Using Exchange Offline": { - "message": "Nueva Página" - }, - "Using a Remote Archive": { - "message": "Usa un Archivo Remoto" - }, - "Moving Observations & Tracks Outside of CoMapeo": { - "message": "Mueve Observaciones y Trayectos fuera de CoMapeo" - }, - "Sharing a Single Observation and Metadata": { - "message": "Nueva Página" - }, - "Exporting all Observations": { - "message": "Exporta todas las Observaciones" - }, - "Using Observations outside of CoMapeo": { - "message": "Usa Observaciones fuera de CoMapeo" - }, - "Miscellaneous": { - "message": "Misceláneas" - }, - "FAQ": { - "message": "Preguntas frecuentes" - }, - "Glossary": { - "message": "Glosario" - }, - "Troubleshooting": { - "message": "Solución de problemas" - }, - "Common Solutions": { - "message": "Nueva Página" - }, 
- "Troubleshooting: Setup and Customization": { - "message": "Nueva Página" - }, - "Troubleshooting: Observations and Tracks": { - "message": "Nueva Página" - }, - "Troubleshooting: Data Privacy and Security": { - "message": "Nueva Página" - }, - "Troubleshooting: Mapping with Collaborators": { - "message": "Nueva Página" - }, - "Troubleshooting: Moving Observations and Tracks outside of CoMapeo": { - "message": "Nueva Página" - }, - "Elementos de contenido de prueba": { - "message": "Elementos de contenido de prueba" - }, - "Testing links": { - "message": "Nueva Página" - }, - "Installing CoMapeo and Onboarding": { - "message": "Nueva Página" - }, - "Planning and Preparing for a Project": { - "message": "Nueva Página" - }, - "Observations and Tracks": { - "message": "Nuevo título de sección" - }, - "Gathering Observations and Tracks": { - "message": "Recopilación de observaciones" - }, - "Data Privacy and Security": { - "message": "Nuevo título de sección" - }, - "Managing Data Privacy and Security": { - "message": "Gestión de datos y privacidad" - }, - "Moving Observations and Tracks Outside of CoMapeo": { - "message": "Compartir observaciones fuera de CoMapeo" - }, - "Developer Tools": { - "message": "Herramientas de desarrollador" - }, - "API Reference": { - "message": "Referencia de API" - }, - "CLI Reference": { - "message": "Referencia de CLI" - }, - "Understanding CoMapeo’s Core Concepts & Functions": { - "message": "Conoce los conceptos y funciones principales de CoMapeo" - }, - "Getting Started - Essentials": { - "message": "Introducción - Conceptos básicos" - }, - "Device Setup & Maintenance for CoMapeo": { - "message": "Configuración y mantenimiento del dispositivo para CoMapeo" - }, - "Initial Use & CoMapeo Settings": { - "message": "Uso Inicial y Ajustes de CoMapeo" - }, - "Creating Custom Background Maps": { - "message": "Crea Mapas de Fondo personalizados" - }, - "Reviewing and Editing Tracks": { - "message": "Revisión y Edición de Trayectos" - }, - 
"Encryption & Security": { - "message": "Encriptación y Seguridad" - }, - "Adjusting Data Sharing & Privacy": { - "message": "Ajusta el Intercambio de Datos y Privacidad" - }, - "Selecting Device Roles & Teams": { - "message": "Selección de roles y equipos de dispositivos" - }, - "Leave a project": { - "message": "Abandona un proyecto" - }, - "Removing a device from a Project": { - "message": "Eliminar un dispositivo de un proyecto" - }, - "Completing or Ending a Project": { - "message": "Finaliza un Proyecto" - }, - "Exchanging Observations": { - "message": "Intercambia Observaciones" - }, - "Sharing Background Map": { - "message": "Comparte el Mapa de Fondo" - }, - "Sharing a Single Observation & Metadata": { - "message": "Comparte una sola Observación y Metadatos" - }, - "Site Map": { - "message": "Mapa del sitio" - }, - "CoMapeo Data & Privacy (translating for public page)": { - "message": "Nueva Página" - }, - "[TEST] Installation Guide": { - "message": "[PRUEBA] Guía de Instalación" - }, - "Changing Backgroud Maps": { - "message": "Cambia Mapas de Fondo" - } + "theme.TOC.title": { + "message": "En esta página", + "description": "Title for the table of contents section" + }, + "Introduction": { + "message": "Introducción" + }, + "Preparing to Use CoMapeo": { + "message": "Preparación para el uso de CoMapeo" + }, + "Understanding CoMapeo's Core Concepts and Functions": { + "message": "Nueva Página" + }, + "Getting Started Essentials": { + "message": "Nuevo título de sección" + }, + "Gathering the Right Equipment for CoMapeo": { + "message": "Reunir el Equipo Adecuado para CoMapeo" + }, + "Device Setup and Maintenance for CoMapeo": { + "message": "Nueva Página" + }, + "Installing CoMapeo & Onboarding": { + "message": "Nueva Página" + }, + "Initial Use and CoMapeo Settings": { + "message": "Nueva Página" + }, + "Uninstalling CoMapeo": { + "message": "Desinstalar CoMapeo" + }, + "Customizing CoMapeo": { + "message": "Nueva Palanca" + }, + "Organizing Key Materials 
for Projects": { + "message": "Nueva Página" + }, + "Building a Custom Categories Set": { + "message": "Nueva Página" + }, + "Building Custom Background Maps": { + "message": "Nueva Página" + }, + "Observations & Tracks": { + "message": "Nuevo título de sección" + }, + "Gathering Observations & Tracks": { + "message": "Recopilación de observaciones" + }, + "Creating a New Observation": { + "message": "Nueva Página" + }, + "Creating a New Track": { + "message": "Nueva Página" + }, + "Reviewing Observations": { + "message": "Revisión de observaciones" + }, + "Exploring the Observations List": { + "message": "Nueva Página" + }, + "Reviewing an Observation": { + "message": "Nueva Página" + }, + "Editing Observations": { + "message": "Nueva Página" + }, + "Data Privacy & Security": { + "message": "Nuevo título de sección" + }, + "Encryption and Security": { + "message": "Nueva Página" + }, + "Managing Data Privacy & Security": { + "message": "Gestión de datos y privacidad" + }, + "Using an App Passcode for Security": { + "message": "Nueva Página" + }, + "Adjusting Data Sharing and Privacy": { + "message": "Nueva Página" + }, + "Mapping with Collaborators": { + "message": "Nueva Página" + }, + "Managing Projects": { + "message": "Gestión de proyectos" + }, + "Understanding Projects": { + "message": "Nueva Página" + }, + "Creating a New Project": { + "message": "Nueva Página" + }, + "Changing Categories Set": { + "message": "Nueva Página" + }, + "Managing a Team": { + "message": "Nueva Página" + }, + "Inviting Collaborators": { + "message": "Nueva Página" + }, + "Ending a Project": { + "message": "Nueva Página" + }, + "Exchanging Project Data": { + "message": "Intercambio de Datos del Proyecto" + }, + "Understanding How Exchange Works": { + "message": "Nueva Página A" + }, + "Using Exchange Offline": { + "message": "Nueva Página" + }, + "Using a Remote Archive": { + "message": "Nueva Página" + }, + "Moving Observations & Tracks Outside of CoMapeo": { + "message": 
"Compartir observaciones fuera de CoMapeo" + }, + "Sharing a Single Observation and Metadata": { + "message": "Nueva Página" + }, + "Exporting all Observations": { + "message": "Nueva Página" + }, + "Using Observations outside of CoMapeo": { + "message": "Nueva Página" + }, + "Miscellaneous": { + "message": "Misceláneas" + }, + "FAQ": { + "message": "Preguntas frecuentes" + }, + "Glossary": { + "message": "Glosario" + }, + "Troubleshooting": { + "message": "Nueva Palanca" + }, + "Common Solutions": { + "message": "Nueva Página" + }, + "Troubleshooting: Setup and Customization": { + "message": "Nueva Página" + }, + "Troubleshooting: Observations and Tracks": { + "message": "Nueva Página" + }, + "Troubleshooting: Data Privacy and Security": { + "message": "Nueva Página" + }, + "Troubleshooting: Mapping with Collaborators": { + "message": "Nueva Página" + }, + "Troubleshooting: Moving Observations and Tracks outside of CoMapeo": { + "message": "Nueva Página" + }, + "Elementos de contenido de prueba": { + "message": "Elementos de contenido de prueba" + }, + "Testing links": { + "message": "Nueva Página" + }, + "Understanding CoMapeo's Core Concepts and Functions": { + "message": "Nueva Página" + }, + "Installing CoMapeo and Onboarding": { + "message": "Nueva Página" + }, + "Planning and Preparing for a Project": { + "message": "Nueva Página" + }, + "Observations and Tracks": { + "message": "Nuevo título de sección" + }, + "Gathering Observations and Tracks": { + "message": "Recopilación de observaciones" + }, + "Data Privacy and Security": { + "message": "Nuevo título de sección" + }, + "Managing Data Privacy and Security": { + "message": "Gestión de datos y privacidad" + }, + "Moving Observations and Tracks Outside of CoMapeo": { + "message": "Compartir observaciones fuera de CoMapeo" + }, + "Developer Tools": { + "message": "Herramientas de desarrollador" + }, + "API Reference": { + "message": "Referencia de API" + }, + "CLI Reference": { + "message": "Referencia de 
CLI" + } } diff --git a/i18n/pt/code.json b/i18n/pt/code.json index f0cf606f..7c22c3c1 100644 --- a/i18n/pt/code.json +++ b/i18n/pt/code.json @@ -1,264 +1,201 @@ { - "theme.TOC.title": { - "message": "Nesta página", - "description": "Title for the table of contents section" - }, - "Introduction": { - "message": "Introdução" - }, - "Preparing to Use CoMapeo": { - "message": "Preparação para usar CoMapeo" - }, - "Understanding CoMapeo's Core Concepts and Functions": { - "message": "Nova Página" - }, - "Getting Started Essentials": { - "message": "Novo título da seção" - }, - "Gathering the Right Equipment for CoMapeo": { - "message": "Reunindo o Equipamento Adequado para CoMapeo" - }, - "Device Setup and Maintenance for CoMapeo": { - "message": "Nova Página" - }, - "Installing CoMapeo & Onboarding": { - "message": "Instalando o CoMapeo e Integração" - }, - "Initial Use and CoMapeo Settings": { - "message": "Nova Página" - }, - "Uninstalling CoMapeo": { - "message": "Desinstalando o CoMapeo" - }, - "Customizing CoMapeo": { - "message": "Personalizando CoMapeo" - }, - "Organizing Key Materials for Projects": { - "message": "Nova Página" - }, - "Building a Custom Categories Set": { - "message": "Nova Página" - }, - "Building Custom Background Maps": { - "message": "Nova Página" - }, - "Observations & Tracks": { - "message": "Observações e Trilhas" - }, - "Gathering Observations & Tracks": { - "message": "Coletando Observações e Trilhas" - }, - "Creating a New Observation": { - "message": "Criando uma Nova Observação" - }, - "Creating a New Track": { - "message": "Criando uma Nova Trilha" - }, - "Reviewing Observations": { - "message": "Revisando Observações" - }, - "Exploring the Observations List": { - "message": "Explorando a Lista de Observações" - }, - "Reviewing an Observation": { - "message": "Revisando uma observação" - }, - "Editing Observations": { - "message": "Editando observações" - }, - "Data Privacy & Security": { - "message": "Privacidade e segurança de 
dados" - }, - "Encryption and Security": { - "message": "Nova Página" - }, - "Managing Data Privacy & Security": { - "message": "Gestão de Privacidade de Dados e Segurança" - }, - "Using an App Passcode for Security": { - "message": "Utilize uma senha para o CoMapeo por motivos de segurança" - }, - "Adjusting Data Sharing and Privacy": { - "message": "Nova Página" - }, - "Mapping with Collaborators": { - "message": "Mapeamento com Colaboradores" - }, - "Managing Projects": { - "message": "Gerenciando Projetos" - }, - "Understanding Projects": { - "message": "Entenda os Fundamentos de Projetos" - }, - "Creating a New Project": { - "message": "Criar um novo projeto" - }, - "Changing Categories Set": { - "message": "Alterando o Conjunto de Categorias" - }, - "Managing a Team": { - "message": "Nova Página" - }, - "Inviting Collaborators": { - "message": "Convidar colaboradores" - }, - "Ending a Project": { - "message": "Nova Página" - }, - "Exchanging Project Data": { - "message": "Troca de Dados do Projeto" - }, - "Understanding How Exchange Works": { - "message": "Entendendo Como a Troca Funciona" - }, - "Using Exchange Offline": { - "message": "Trocar informações sem conexão com a internet" - }, - "Using a Remote Archive": { - "message": "Usar um arquivo remoto" - }, - "Moving Observations & Tracks Outside of CoMapeo": { - "message": "Compartilhando observações fora do CoMapeo" - }, - "Sharing a Single Observation and Metadata": { - "message": "Nova Página" - }, - "Exporting all Observations": { - "message": "Exportar todas as observações" - }, - "Using Observations outside of CoMapeo": { - "message": "Utilizando observações fora do CoMapeo" - }, - "Miscellaneous": { - "message": "Variado" - }, - "FAQ": { - "message": "Perguntas frequentes" - }, - "Glossary": { - "message": "Glossário" - }, - "Troubleshooting": { - "message": "Solução de problemas" - }, - "Common Solutions": { - "message": "Soluções Comuns" - }, - "Troubleshooting: Setup and Customization": { - 
"message": "Nova Página" - }, - "Troubleshooting: Observations and Tracks": { - "message": "Nova Página" - }, - "Troubleshooting: Data Privacy and Security": { - "message": "Nova Página" - }, - "Troubleshooting: Mapping with Collaborators": { - "message": "Nova Página" - }, - "Troubleshooting: Moving Observations and Tracks outside of CoMapeo": { - "message": "Nova Página" - }, - "Elementos de Conteúdo de Teste": { - "message": "Elementos de Conteúdo de Teste" - }, - "Testing links": { - "message": "Nova Página" - }, - "Installing CoMapeo and Onboarding": { - "message": "Nova Página" - }, - "Planning and Preparing for a Project": { - "message": "Nova Página" - }, - "Observations and Tracks": { - "message": "Novo título da seção" - }, - "Gathering Observations and Tracks": { - "message": "Coletando Observações" - }, - "Data Privacy and Security": { - "message": "Novo título da seção" - }, - "Managing Data Privacy and Security": { - "message": "Gerenciamento de dados e privacidade" - }, - "Moving Observations and Tracks Outside of CoMapeo": { - "message": "Compartilhando observações fora do CoMapeo" - }, - "Developer Tools": { - "message": "Ferramentas de desenvolvedor" - }, - "API Reference": { - "message": "Referência de API" - }, - "CLI Reference": { - "message": "Referência de CLI" - }, - "Understanding CoMapeo’s Core Concepts & Functions": { - "message": "Entendendo os Conceitos e Funções Principais do CoMapeo" - }, - "Getting Started - Essentials": { - "message": "Introdução - Noções básicas" - }, - "Device Setup & Maintenance for CoMapeo": { - "message": "Configuração e manutenção do dispositivo para o CoMapeo" - }, - "Initial Use & CoMapeo Settings": { - "message": "Uso inicial e Configurações do CoMapeo" - }, - "Planning & Preparing for a Project": { - "message": "Planejamento e Preparação para um Projeto" - }, - "Creating a Custom Categories Set": { - "message": "Construindo um Conjunto de Categorias Personalizado" - }, - "Creating Custom Background Maps": 
{ - "message": "Criando mapas de fundo personalizados" - }, - "Reviewing and Editing Tracks": { - "message": "Revisão e Edição de Trilha" - }, - "Encryption & Security": { - "message": "Criptografia e Segurança" - }, - "Adjusting Data Sharing & Privacy": { - "message": "Ajuste o compartilhamento e a privacidade dos dados" - }, - "Selecting Device Roles & Teams": { - "message": "Seleção de funções e equipes de dispositivos" - }, - "Leave a project": { - "message": "Abandonar um projeto" - }, - "Removing a device from a Project": { - "message": "Remover um dispositivo de um projeto" - }, - "Completing or Ending a Project": { - "message": "Concluir um projeto" - }, - "Exchanging Observations": { - "message": "Troca de Observações" - }, - "Sharing Background Map": { - "message": "Compartilhe o mapa de fundo" - }, - "Sharing a Single Observation & Metadata": { - "message": "Compartilhe uma única observação e metadados." - }, - "Site Map": { - "message": "Mapa do site" - }, - "Troubleshooting: Setup & Customization": { - "message": "Solução de Problemas: Configuração e Personalização" - }, - "CoMapeo Data & Privacy (translating for public page)": { - "message": "Nova Página" - }, - "[TEST] Installation Guide": { - "message": "[TESTE] Guia de Instalação" - }, - "Changing Backgroud Maps": { - "message": "Alterando mapas de fundo" - } + "theme.TOC.title": { + "message": "Nesta página", + "description": "Title for the table of contents section" + }, + "Introduction": { + "message": "Introdução" + }, + "Preparing to Use CoMapeo": { + "message": "Preparando para usar do CoMapeo (Mobile)" + }, + "Understanding CoMapeo's Core Concepts and Functions": { + "message": "Nova Página" + }, + "Getting Started Essentials": { + "message": "Novo título da seção" + }, + "Gathering the Right Equipment for CoMapeo": { + "message": "Reunindo o Equipamento Certo para o CoMapeo" + }, + "Device Setup and Maintenance for CoMapeo": { + "message": "Nova Página" + }, + "Installing CoMapeo & 
Onboarding": { + "message": "Nova Página" + }, + "Initial Use and CoMapeo Settings": { + "message": "Nova Página" + }, + "Uninstalling CoMapeo": { + "message": "Nova Página" + }, + "Customizing CoMapeo": { + "message": "Novo Alternar" + }, + "Organizing Key Materials for Projects": { + "message": "Nova Página" + }, + "Building a Custom Categories Set": { + "message": "Nova Página" + }, + "Building Custom Background Maps": { + "message": "Nova Página" + }, + "Observations & Tracks": { + "message": "Novo título da seção" + }, + "Gathering Observations & Tracks": { + "message": "Coletando Observações" + }, + "Creating a New Observation": { + "message": "Nova Página" + }, + "Creating a New Track": { + "message": "Nova Página" + }, + "Reviewing Observations": { + "message": "Revisando Observações" + }, + "Exploring the Observations List": { + "message": "Nova Página" + }, + "Reviewing an Observation": { + "message": "Nova Página" + }, + "Editing Observations": { + "message": "Nova Página" + }, + "Data Privacy & Security": { + "message": "Novo título da seção" + }, + "Encryption and Security": { + "message": "Nova Página" + }, + "Managing Data Privacy & Security": { + "message": "Gerenciamento de dados e privacidade" + }, + "Using an App Passcode for Security": { + "message": "Nova Página" + }, + "Adjusting Data Sharing and Privacy": { + "message": "Nova Página" + }, + "Mapping with Collaborators": { + "message": "Nova Página" + }, + "Managing Projects": { + "message": "Gerenciando Projetos" + }, + "Understanding Projects": { + "message": "Nova Página" + }, + "Creating a New Project": { + "message": "Nova Página" + }, + "Changing Categories Set": { + "message": "Nova Página" + }, + "Managing a Team": { + "message": "Nova Página" + }, + "Inviting Collaborators": { + "message": "Nova Página" + }, + "Ending a Project": { + "message": "Nova Página" + }, + "Exchanging Project Data": { + "message": "Troca de Dados do Projeto" + }, + "Understanding How Exchange Works": { + 
"message": "Nova Página A" + }, + "Using Exchange Offline": { + "message": "Nova Página" + }, + "Using a Remote Archive": { + "message": "Nova Página" + }, + "Moving Observations & Tracks Outside of CoMapeo": { + "message": "Compartilhando observações fora do CoMapeo" + }, + "Sharing a Single Observation and Metadata": { + "message": "Nova Página" + }, + "Exporting all Observations": { + "message": "Nova Página" + }, + "Using Observations outside of CoMapeo": { + "message": "Nova Página" + }, + "Miscellaneous": { + "message": "Variado" + }, + "FAQ": { + "message": "Perguntas frequentes" + }, + "Glossary": { + "message": "Glossário" + }, + "Troubleshooting": { + "message": "Resolução de Problemas" + }, + "Common Solutions": { + "message": "Nova Página" + }, + "Troubleshooting: Setup and Customization": { + "message": "Nova Página" + }, + "Troubleshooting: Observations and Tracks": { + "message": "Nova Página" + }, + "Troubleshooting: Data Privacy and Security": { + "message": "Nova Página" + }, + "Troubleshooting: Mapping with Collaborators": { + "message": "Nova Página" + }, + "Troubleshooting: Moving Observations and Tracks outside of CoMapeo": { + "message": "Nova Página" + }, + "Elementos de Conteúdo de Teste": { + "message": "Elementos de Conteúdo de Teste" + }, + "Testing links": { + "message": "Nova Página" + }, + "Understanding CoMapeo's Core Concepts and Functions": { + "message": "Nova Página" + }, + "Installing CoMapeo and Onboarding": { + "message": "Nova Página" + }, + "Planning and Preparing for a Project": { + "message": "Nova Página" + }, + "Observations and Tracks": { + "message": "Novo título da seção" + }, + "Gathering Observations and Tracks": { + "message": "Coletando Observações" + }, + "Data Privacy and Security": { + "message": "Novo título da seção" + }, + "Managing Data Privacy and Security": { + "message": "Gerenciamento de dados e privacidade" + }, + "Moving Observations and Tracks Outside of CoMapeo": { + "message": "Compartilhando 
observações fora do CoMapeo" + }, + "Developer Tools": { + "message": "Ferramentas de desenvolvedor" + }, + "API Reference": { + "message": "Referência de API" + }, + "CLI Reference": { + "message": "Referência de CLI" + } } From 23e0754497ed363495ffb4a9b9d82132798ecd4b Mon Sep 17 00:00:00 2001 From: luandro Date: Thu, 26 Mar 2026 08:00:23 -0300 Subject: [PATCH 17/18] fix(translate): track CommonMark fence length to prevent nested-fence misparse The fence state machine now tracks the opening fence character and length per CommonMark spec. A fence is only closed by a closing marker of the same character with at least the opening length and no info string. fix(translate): add frontmatter integrity check for critical Docusaurus fields Added parseFrontmatterKeys() to extract top-level YAML keys and assertFrontmatterIntegrity() to verify that translated markdown preserves all frontmatter keys present in the source. Detects missing or unexpectedly added critical fields (slug, sidebar_position, etc.) and fails the translation with a non-critical error so retries are possible. chore(changelog): scope Unreleased entries to translation-integrity work only Removed unrelated entries from CHANGELOG.md to focus on translation-integrity improvements. 
Kept only the entries relevant to this fix: - Translation Completeness - Long-form Content Translation - Build Scripts --- .../20260325-pr-critical-reviewer-CONTEXT.md | 129 +++++ .output2.txt | 470 ++++++++++++++++++ CHANGELOG.md | 15 - plans/2026-03-19-PLAN-v1.md | 160 ++++++ ...fix-output-truncation-token-overflow-v1.md | 48 ++ ...26-03-25-robust-translation-chunking-v1.md | 424 ++++++++++++++++ .../translateFrontMatter.test.ts | 200 ++++++++ .../notion-translate/translateFrontMatter.ts | 127 ++++- 8 files changed, 1551 insertions(+), 22 deletions(-) create mode 100644 .claude/agents/context/20260325-pr-critical-reviewer-CONTEXT.md create mode 100644 .output2.txt create mode 100644 plans/2026-03-19-PLAN-v1.md create mode 100644 plans/2026-03-25-fix-output-truncation-token-overflow-v1.md create mode 100644 plans/2026-03-25-robust-translation-chunking-v1.md diff --git a/.claude/agents/context/20260325-pr-critical-reviewer-CONTEXT.md b/.claude/agents/context/20260325-pr-critical-reviewer-CONTEXT.md new file mode 100644 index 00000000..eb1978c2 --- /dev/null +++ b/.claude/agents/context/20260325-pr-critical-reviewer-CONTEXT.md @@ -0,0 +1,129 @@ +--- +agent: pr-critical-reviewer +timestamp: 2026-03-25T00:00:00Z +session_id: issue-166-heading-loss-fix +prior_context: [] +next_agents: [] +--- + +# Agent Context: PR Critical Reviewer + +## Mission Summary +**PR Reviewed:** Fix for heading loss tolerance in isSuspiciouslyIncompleteTranslation (issue-166) +**Review Status:** Approved +**Critical Issues:** 0 + +## Key Findings from Prior Context +**Expected from Spec:** N/A — no prior spec context files found +**Expected from Plan:** N/A — no prior implementation planner context found +**Actual vs Expected:** Fix matches the reviewer's stated intent + +## Analysis Results +**Code Changes Reviewed:** +- Files changed: 1 +- Lines changed: 1 (single operator fix) +- Complexity assessment: Low + +**File reviewed:** 
+`/home/luandro/Dev/digidem/comapeo-docs/.worktrees/issue-166/scripts/notion-translate/translateFrontMatter.ts` +Lines 598-640 (isSuspiciouslyIncompleteTranslation function) + +**Test file reviewed:** +`/home/luandro/Dev/digidem/comapeo-docs/.worktrees/issue-166/scripts/notion-translate/translateFrontMatter.test.ts` + +## Fix Details + +**Before:** +```typescript +const headingLoss = + sourceMetrics.headingCount > 0 && + translatedMetrics.headingCount < sourceMetrics.headingCount - 1; +``` + +**After:** +```typescript +const headingLoss = + sourceMetrics.headingCount > 0 && + translatedMetrics.headingCount < sourceMetrics.headingCount; +``` + +## Analysis Results + +**Critical Issues Identified:** None + +**Severity Breakdown:** +| Type | Count | Severity | +|------|-------|----------| +| Bugs | 0 | - | +| Security | 0 | - | +| Performance | 0 | - | +| Correctness | 0 | - | + +## Edge Case Analysis + +**headingCount = 0:** +Safe. The `sourceMetrics.headingCount > 0` guard on line 612 short-circuits +the entire headingLoss sub-expression to false. Zero-heading documents are +unaffected. + +**headingCount = 1:** +Correct. If the source has one heading and the translation has zero, headingLoss +is now true and a retry fires. This is the right behavior — dropping the only +heading is a genuine structural loss. + +**Legitimate LLM heading merging (false-positive risk):** +Low risk. If an LLM merges two headings into one, the new strict check will +trigger a retry. The retry path (lines 995-1016) uses isCritical: false and +retries with smaller chunks up to TRANSLATION_COMPLETENESS_MAX_RETRIES times. +The cost is extra API calls only; the final translation is not broken. Silent +heading loss is a more severe failure mode than a spurious retry, so the +trade-off is acceptable and consistent with the design intent for all other +structural checks in the same function. 
+ +**Consistency with sibling checks:** +fencedBlockLoss, admonitionLoss, and tableLoss all use strict zero-tolerance +comparisons. The old headingLoss tolerance of -1 was the only outlier. The fix +makes headingLoss consistent with the rest of the function. + +## Test Coverage Assessment + +The test "retries with smaller chunks when a valid response omits a section" +(translateFrontMatter.test.ts line 126) is the primary coverage for headingLoss. +It uses a 4-heading source and a 2-heading response. + +- Old condition: 2 < (4 - 1) = 2 < 3 = true — test passed before the fix +- New condition: 2 < 4 = true — test still passes after the fix + +The test remains valid. No test updates are required. + +**Coverage gap (non-blocking):** There is no test that specifically covers the +boundary case the fix addresses — source with N headings, translation returning +exactly N-1 headings. This gap existed before and still exists. It is not a +blocker since the existing test exercises the core path correctly. 
+ +## Actions Taken +**Review Process:** +- Read full translateFrontMatter.ts (1044 lines) +- Read full translateFrontMatter.test.ts (572 lines) +- Analyzed isSuspiciouslyIncompleteTranslation logic and all sibling checks +- Analyzed retry/recovery path in translateText +- Verified edge cases for headingCount 0, 1, and N +- Verified test coverage adequacy + +**Sub-Agents Spawned:** None — fix is approved, no fixer needed + +## Recommendations + +**Before Merge:** +- No blocking items + +**Optional follow-up (not blocking):** +- Add a targeted unit test for the N-1 heading boundary case that was the + subject of the fix, to prevent future regressions on the exact threshold + +## Handoff Notes + +**For Developer:** +- Fix is correct and complete as written +- No test changes required +- Re-review not required diff --git a/.output2.txt b/.output2.txt new file mode 100644 index 00000000..bff54e94 --- /dev/null +++ b/.output2.txt @@ -0,0 +1,470 @@ +diff --git a/i18n/pt/code.json b/i18n/pt/code.json +index 7c22c3c..c3984d3 100644 +--- a/i18n/pt/code.json ++++ b/i18n/pt/code.json +@@ -1,201 +1,264 @@ + { +- "theme.TOC.title": { +- "message": "Nesta página", +- "description": "Title for the table of contents section" +- }, +- "Introduction": { +- "message": "Introdução" +- }, +- "Preparing to Use CoMapeo": { +- "message": "Preparando para usar do CoMapeo (Mobile)" +- }, +- "Understanding CoMapeo's Core Concepts and Functions": { +- "message": "Nova Página" +- }, +- "Getting Started Essentials": { +- "message": "Novo título da seção" +- }, +- "Gathering the Right Equipment for CoMapeo": { +- "message": "Reunindo o Equipamento Certo para o CoMapeo" +- }, +- "Device Setup and Maintenance for CoMapeo": { +- "message": "Nova Página" +- }, +- "Installing CoMapeo & Onboarding": { +- "message": "Nova Página" +- }, +- "Initial Use and CoMapeo Settings": { +- "message": "Nova Página" +- }, +- "Uninstalling CoMapeo": { +- "message": "Nova Página" +- }, +- "Customizing CoMapeo": { +- 
"message": "Novo Alternar" +- }, +- "Organizing Key Materials for Projects": { +- "message": "Nova Página" +- }, +- "Building a Custom Categories Set": { +- "message": "Nova Página" +- }, +- "Building Custom Background Maps": { +- "message": "Nova Página" +- }, +- "Observations & Tracks": { +- "message": "Novo título da seção" +- }, +- "Gathering Observations & Tracks": { +- "message": "Coletando Observações" +- }, +- "Creating a New Observation": { +- "message": "Nova Página" +- }, +- "Creating a New Track": { +- "message": "Nova Página" +- }, +- "Reviewing Observations": { +- "message": "Revisando Observações" +- }, +- "Exploring the Observations List": { +- "message": "Nova Página" +- }, +- "Reviewing an Observation": { +- "message": "Nova Página" +- }, +- "Editing Observations": { +- "message": "Nova Página" +- }, +- "Data Privacy & Security": { +- "message": "Novo título da seção" +- }, +- "Encryption and Security": { +- "message": "Nova Página" +- }, +- "Managing Data Privacy & Security": { +- "message": "Gerenciamento de dados e privacidade" +- }, +- "Using an App Passcode for Security": { +- "message": "Nova Página" +- }, +- "Adjusting Data Sharing and Privacy": { +- "message": "Nova Página" +- }, +- "Mapping with Collaborators": { +- "message": "Nova Página" +- }, +- "Managing Projects": { +- "message": "Gerenciando Projetos" +- }, +- "Understanding Projects": { +- "message": "Nova Página" +- }, +- "Creating a New Project": { +- "message": "Nova Página" +- }, +- "Changing Categories Set": { +- "message": "Nova Página" +- }, +- "Managing a Team": { +- "message": "Nova Página" +- }, +- "Inviting Collaborators": { +- "message": "Nova Página" +- }, +- "Ending a Project": { +- "message": "Nova Página" +- }, +- "Exchanging Project Data": { +- "message": "Troca de Dados do Projeto" +- }, +- "Understanding How Exchange Works": { +- "message": "Nova Página A" +- }, +- "Using Exchange Offline": { +- "message": "Nova Página" +- }, +- "Using a Remote Archive": { +- 
"message": "Nova Página" +- }, +- "Moving Observations & Tracks Outside of CoMapeo": { +- "message": "Compartilhando observações fora do CoMapeo" +- }, +- "Sharing a Single Observation and Metadata": { +- "message": "Nova Página" +- }, +- "Exporting all Observations": { +- "message": "Nova Página" +- }, +- "Using Observations outside of CoMapeo": { +- "message": "Nova Página" +- }, +- "Miscellaneous": { +- "message": "Variado" +- }, +- "FAQ": { +- "message": "Perguntas frequentes" +- }, +- "Glossary": { +- "message": "Glossário" +- }, +- "Troubleshooting": { +- "message": "Resolução de Problemas" +- }, +- "Common Solutions": { +- "message": "Nova Página" +- }, +- "Troubleshooting: Setup and Customization": { +- "message": "Nova Página" +- }, +- "Troubleshooting: Observations and Tracks": { +- "message": "Nova Página" +- }, +- "Troubleshooting: Data Privacy and Security": { +- "message": "Nova Página" +- }, +- "Troubleshooting: Mapping with Collaborators": { +- "message": "Nova Página" +- }, +- "Troubleshooting: Moving Observations and Tracks outside of CoMapeo": { +- "message": "Nova Página" +- }, +- "Elementos de Conteúdo de Teste": { +- "message": "Elementos de Conteúdo de Teste" +- }, +- "Testing links": { +- "message": "Nova Página" +- }, +- "Understanding CoMapeo's Core Concepts and Functions": { +- "message": "Nova Página" +- }, +- "Installing CoMapeo and Onboarding": { +- "message": "Nova Página" +- }, +- "Planning and Preparing for a Project": { +- "message": "Nova Página" +- }, +- "Observations and Tracks": { +- "message": "Novo título da seção" +- }, +- "Gathering Observations and Tracks": { +- "message": "Coletando Observações" +- }, +- "Data Privacy and Security": { +- "message": "Novo título da seção" +- }, +- "Managing Data Privacy and Security": { +- "message": "Gerenciamento de dados e privacidade" +- }, +- "Moving Observations and Tracks Outside of CoMapeo": { +- "message": "Compartilhando observações fora do CoMapeo" +- }, +- "Developer Tools": { 
+- "message": "Ferramentas de desenvolvedor" +- }, +- "API Reference": { +- "message": "Referência de API" +- }, +- "CLI Reference": { +- "message": "Referência de CLI" +- } +-} ++ "theme.TOC.title": { ++ "message": "Nesta página", ++ "description": "Title for the table of contents section" ++ }, ++ "Introduction": { ++ "message": "Introdução" ++ }, ++ "Preparing to Use CoMapeo": { ++ "message": "Preparação para usar CoMapeo" ++ }, ++ "Understanding CoMapeo's Core Concepts and Functions": { ++ "message": "Nova Página" ++ }, ++ "Getting Started Essentials": { ++ "message": "Novo título da seção" ++ }, ++ "Gathering the Right Equipment for CoMapeo": { ++ "message": "Reunindo o Equipamento Adequado para CoMapeo" ++ }, ++ "Device Setup and Maintenance for CoMapeo": { ++ "message": "Nova Página" ++ }, ++ "Installing CoMapeo & Onboarding": { ++ "message": "Instalando o CoMapeo e Integração" ++ }, ++ "Initial Use and CoMapeo Settings": { ++ "message": "Nova Página" ++ }, ++ "Uninstalling CoMapeo": { ++ "message": "Desinstalando o CoMapeo" ++ }, ++ "Customizing CoMapeo": { ++ "message": "Personalizando CoMapeo" ++ }, ++ "Organizing Key Materials for Projects": { ++ "message": "Nova Página" ++ }, ++ "Building a Custom Categories Set": { ++ "message": "Nova Página" ++ }, ++ "Building Custom Background Maps": { ++ "message": "Nova Página" ++ }, ++ "Observations & Tracks": { ++ "message": "Observações e Trilhas" ++ }, ++ "Gathering Observations & Tracks": { ++ "message": "Coletando Observações e Trilhas" ++ }, ++ "Creating a New Observation": { ++ "message": "Criando uma Nova Observação" ++ }, ++ "Creating a New Track": { ++ "message": "Criando uma Nova Trilha" ++ }, ++ "Reviewing Observations": { ++ "message": "Revisando Observações" ++ }, ++ "Exploring the Observations List": { ++ "message": "Explorando a Lista de Observações" ++ }, ++ "Reviewing an Observation": { ++ "message": "Revisando uma observação" ++ }, ++ "Editing Observations": { ++ "message": "Editando 
observações" ++ }, ++ "Data Privacy & Security": { ++ "message": "Privacidade e segurança de dados" ++ }, ++ "Encryption and Security": { ++ "message": "Nova Página" ++ }, ++ "Managing Data Privacy & Security": { ++ "message": "Gestão de Privacidade de Dados e Segurança" ++ }, ++ "Using an App Passcode for Security": { ++ "message": "Utilize uma senha para o CoMapeo por motivos de segurança" ++ }, ++ "Adjusting Data Sharing and Privacy": { ++ "message": "Nova Página" ++ }, ++ "Mapping with Collaborators": { ++ "message": "Mapeamento com Colaboradores" ++ }, ++ "Managing Projects": { ++ "message": "Gerenciando Projetos" ++ }, ++ "Understanding Projects": { ++ "message": "Entenda os Fundamentos de Projetos" ++ }, ++ "Creating a New Project": { ++ "message": "Criar um novo projeto" ++ }, ++ "Changing Categories Set": { ++ "message": "Alterando o Conjunto de Categorias" ++ }, ++ "Managing a Team": { ++ "message": "Nova Página" ++ }, ++ "Inviting Collaborators": { ++ "message": "Convidar colaboradores" ++ }, ++ "Ending a Project": { ++ "message": "Nova Página" ++ }, ++ "Exchanging Project Data": { ++ "message": "Troca de Dados do Projeto" ++ }, ++ "Understanding How Exchange Works": { ++ "message": "Entendendo Como a Troca Funciona" ++ }, ++ "Using Exchange Offline": { ++ "message": "Trocar informações sem conexão com a internet" ++ }, ++ "Using a Remote Archive": { ++ "message": "Usar um arquivo remoto" ++ }, ++ "Moving Observations & Tracks Outside of CoMapeo": { ++ "message": "Compartilhando observações fora do CoMapeo" ++ }, ++ "Sharing a Single Observation and Metadata": { ++ "message": "Nova Página" ++ }, ++ "Exporting all Observations": { ++ "message": "Exportar todas as observações" ++ }, ++ "Using Observations outside of CoMapeo": { ++ "message": "Utilizando observações fora do CoMapeo" ++ }, ++ "Miscellaneous": { ++ "message": "Variado" ++ }, ++ "FAQ": { ++ "message": "Perguntas frequentes" ++ }, ++ "Glossary": { ++ "message": "Glossário" ++ }, ++ 
"Troubleshooting": { ++ "message": "Solução de problemas" ++ }, ++ "Common Solutions": { ++ "message": "Soluções Comuns" ++ }, ++ "Troubleshooting: Setup and Customization": { ++ "message": "Nova Página" ++ }, ++ "Troubleshooting: Observations and Tracks": { ++ "message": "Nova Página" ++ }, ++ "Troubleshooting: Data Privacy and Security": { ++ "message": "Nova Página" ++ }, ++ "Troubleshooting: Mapping with Collaborators": { ++ "message": "Nova Página" ++ }, ++ "Troubleshooting: Moving Observations and Tracks outside of CoMapeo": { ++ "message": "Nova Página" ++ }, ++ "Elementos de Conteúdo de Teste": { ++ "message": "Elementos de Conteúdo de Teste" ++ }, ++ "Testing links": { ++ "message": "Nova Página" ++ }, ++ "Installing CoMapeo and Onboarding": { ++ "message": "Nova Página" ++ }, ++ "Planning and Preparing for a Project": { ++ "message": "Nova Página" ++ }, ++ "Observations and Tracks": { ++ "message": "Novo título da seção" ++ }, ++ "Gathering Observations and Tracks": { ++ "message": "Coletando Observações" ++ }, ++ "Data Privacy and Security": { ++ "message": "Novo título da seção" ++ }, ++ "Managing Data Privacy and Security": { ++ "message": "Gerenciamento de dados e privacidade" ++ }, ++ "Moving Observations and Tracks Outside of CoMapeo": { ++ "message": "Compartilhando observações fora do CoMapeo" ++ }, ++ "Developer Tools": { ++ "message": "Ferramentas de desenvolvedor" ++ }, ++ "API Reference": { ++ "message": "Referência de API" ++ }, ++ "CLI Reference": { ++ "message": "Referência de CLI" ++ }, ++ "Understanding CoMapeo’s Core Concepts & Functions": { ++ "message": "Entendendo os Conceitos e Funções Principais do CoMapeo" ++ }, ++ "Getting Started - Essentials": { ++ "message": "Introdução - Noções básicas" ++ }, ++ "Device Setup & Maintenance for CoMapeo": { ++ "message": "Configuração e manutenção do dispositivo para o CoMapeo" ++ }, ++ "Initial Use & CoMapeo Settings": { ++ "message": "Uso inicial e Configurações do CoMapeo" ++ }, ++ "Planning 
& Preparing for a Project": { ++ "message": "Planejamento e Preparação para um Projeto" ++ }, ++ "Creating a Custom Categories Set": { ++ "message": "Construindo um Conjunto de Categorias Personalizado" ++ }, ++ "Creating Custom Background Maps": { ++ "message": "Criando mapas de fundo personalizados" ++ }, ++ "Reviewing and Editing Tracks": { ++ "message": "Revisão e Edição de Trilha" ++ }, ++ "Encryption & Security": { ++ "message": "Criptografia e Segurança" ++ }, ++ "Adjusting Data Sharing & Privacy": { ++ "message": "Ajuste o compartilhamento e a privacidade dos dados" ++ }, ++ "Selecting Device Roles & Teams": { ++ "message": "Seleção de funções e equipes de dispositivos" ++ }, ++ "Leave a project": { ++ "message": "Abandonar um projeto" ++ }, ++ "Removing a device from a Project": { ++ "message": "Remover um dispositivo de um projeto" ++ }, ++ "Completing or Ending a Project": { ++ "message": "Concluir um projeto" ++ }, ++ "Exchanging Observations": { ++ "message": "Tracar Observao…" ++ }, ++ "Sharing Background Map": { ++ "message": "Compartilhe o mapa de fundo" ++ }, ++ "Sharing a Single Observation & Metadata": { ++ "message": "Compartilhe uma única observação e metadados." ++ }, ++ "Site Map": { ++ "message": "Mapa do site" ++ }, ++ "Troubleshooting: Setup & Customization": { ++ "message": "Solução de Problemas: Configuração e Personalização" ++ }, ++ "CoMapeo Data & Privacy (translating for public page)": { ++ "message": "Nova Página" ++ }, ++ "[TEST] Installation Guide": { ++ "message": "[TESTE] Guia de Instalação" ++ }, ++ "Changing Backgroud Maps": { ++ "message": "Alterando mapas de fundo" ++ } ++} +\ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 87eeded5..491840e0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,25 +8,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). ### Added -- **Targeted Notion Fetching:** Added the ability to fetch data for a single Notion page. 
- **Pre-Release Safety:** Added validation checks to ensure all translations (locales) are complete. -### Changed - -- **Simplified Data Fetching:** Cleaned up and simplified the logic for fetching all pages from Notion. -- **Docker Tests:** Updated the Docker integration tests to work correctly with the newly added fetch-job types. - -### Removed - -- **Code Cleanup:** Removed redundant code from the API schemas to make the codebase cleaner. - ### Fixed - **Translation Completeness:** Fixed several issues with how the system measures if a page is fully translated. - **Long-form Content Translation:** Prevented issues where content could be lost when translating very long pages. -- **Language Switcher (Locale Dropdown):** - - Fixed a bug where the language switcher would sometimes point to the wrong page. - - Corrected an issue that caused "double" language codes in URLs. - - Fixed navigation issues when switching languages on category index pages. - - Fixed a display issue where the language dropdown might be hidden behind other menu items. - **Build Scripts:** Resolved bugs in the TypeScript compilation and Markdown parsing scripts. diff --git a/plans/2026-03-19-PLAN-v1.md b/plans/2026-03-19-PLAN-v1.md new file mode 100644 index 00000000..e69e81ec --- /dev/null +++ b/plans/2026-03-19-PLAN-v1.md @@ -0,0 +1,160 @@ +# Long-Form Translation Reliability Plan + +## Objective + +Improve automatic Notion translation reliability for long-form documentation by proactively chunking markdown before unsafe request sizes, detecting structurally incomplete model responses, retrying with smaller chunks, and ensuring failures prevent both localized markdown writes and Notion-side translated page creation. 
+ +## Scope + +### In Scope + +- Localized markdown generation under `i18n/...` via `scripts/notion-translate/translateFrontMatter.ts` and the save path in `scripts/notion-translate/index.ts` +- Notion-side translated page creation in `scripts/notion-translate/index.ts` and `scripts/notion-translate/translateBlocks.ts`, specifically the workflow gating that must prevent page creation when full-page markdown translation is incomplete +- Focused unit and workflow tests in `scripts/notion-translate/translateFrontMatter.test.ts`, `scripts/notion-translate/index.test.ts`, and `scripts/notion-translate/translateBlocks.test.ts` + +### Out of Scope + +- Redesigning `scripts/notion-translate/translateBlocks.ts` into a new translation architecture +- Unifying markdown and block translation into a single pipeline +- Changes outside the translation scripts and their focused tests + +## Current Workflow Summary + +- `bun run notion:translate` creates translation pages in Notion, updates `code.json`, translates theme strings, and saves localized markdown. +- For each non-title page, `processSinglePageTranslation()` first translates full markdown with `translateText()`, then builds Notion-side translated blocks with `translateNotionBlocksDirectly()`, then creates or updates the Notion page with `createNotionPageWithBlocks()`, then writes translated markdown with `saveTranslatedContentToDisk()`. +- `translateNotionBlocksDirectly()` is not independent of the markdown reliability work: `translateRichTextArray()` also routes rich-text translation through `translateText()`. Chunking, completeness validation, and retry behavior therefore affect both localized markdown and block translation behavior. +- Because both outputs are produced in the same workflow, an incomplete full-page translation must fail the page before either output is persisted. No partial success is acceptable where only one output is written. 
+ +## Problem Statement + +Long-form markdown can remain on a single large model call, and the pipeline currently accepts structurally partial but schema-valid responses. That allows missing sections to propagate silently into generated locale files and into the translated Notion page created for the same source page. + +## Concrete Decisions + +- Proactive chunk cap: `120_000` total request characters per markdown translation call, regardless of larger model context windows. +- Retry floor: `8_000` total request characters. +- Completeness retry depth: `4` retries maximum, halving the chunk limit on each retry until the floor is reached. +- Title handling: the first chunk owns the translated title; later chunks send an empty title. +- Error classification: persistent incompleteness surfaces as a non-critical translation failure (`isCritical: false`) after completeness retries are exhausted. +- Workflow continuation semantics: the run continues processing remaining pages and languages after a page-level incompleteness failure, records the failure in the summary, and exits non-zero at the end if any document translation failed. +- Completeness validation signals: + - heading count loss + - fenced code block count loss + - admonition count loss + - table disappearance + - complete loss of bullet list items when the source has at least 3 bullet items + - complete loss of numbered list items when the source has at least 3 numbered items + - severe length shrinkage when source content length is at least `4_000` characters and translated/source ratio is below `0.55` +- Placeholder and image-path integrity checks remain mandatory and must continue to run alongside completeness validation. + +## Requirements + +### Functional Requirements + +- Add a reliability-oriented markdown chunk cap independent of model-advertised context limits. +- Validate translated markdown structure before accepting single-call responses and after chunk reassembly. 
+- Retry suspiciously incomplete translations with smaller chunks before surfacing failure. +- Treat persistent incompleteness as a document translation failure that: + - prevents create/update of the translated Notion page + - prevents writing localized markdown to disk + - increments `failedTranslations` for the language + - appears in `failures` and `TRANSLATION_SUMMARY` + - records `language`, `title`, `pageId`, `error`, and `isCritical` in the failure entry + - causes `bun run notion:translate` to exit non-zero per the workflow contract +- Apply workflow-level failure handling only after completeness retries are exhausted inside `translateFrontMatter.ts`. +- Continue processing remaining pages and languages after a page-level incompleteness failure, then fail the overall run from the final summary if any document translation failed. +- Preserve existing Notion block behavior for URL sanitization, image mapping, inline-image path consumption, recursive child block translation, and metadata stripping. +- Preserve existing behavior for short pages, title handling, placeholder integrity, and post-translation image validation. +- Do not add new dependencies. + +### Non-Functional Requirements + +- Keep changes localized to `scripts/constants.ts`, `scripts/notion-translate/translateFrontMatter.ts`, `scripts/notion-translate/index.ts`, and focused tests. +- Prefer deterministic structural checks over semantic or language-specific heuristics. +- Preserve the current block-translation architecture; this iteration adds workflow gating for both outputs, not a block-translator redesign. + +## Implementation Plan + +1. Add translation reliability constants in `scripts/constants.ts`. + - `TRANSLATION_CHUNK_MAX_CHARS = 120_000` + - `TRANSLATION_MIN_CHUNK_MAX_CHARS = 8_000` + - `TRANSLATION_COMPLETENESS_MAX_RETRIES = 4` + - Keep existing model-context helpers, but stop using them as the only practical chunk ceiling for long-form markdown. + +2. 
Update `translateText()` in `scripts/notion-translate/translateFrontMatter.ts`. + - Compute an effective chunk limit from the new cap. + - Use the fast path only when the full request fits within that limit. + - Reuse the existing section, paragraph, and line splitters for proactive chunking. + +3. Add completeness detection in `scripts/notion-translate/translateFrontMatter.ts`. + - Collect structural metrics on source and translated markdown. + - Ignore YAML frontmatter content when counting list and structure markers. + - Ignore marker-like text inside fenced code blocks. + - Validate both single-call responses and reassembled chunked responses. + +4. Add recoverable completeness retries in `scripts/notion-translate/translateFrontMatter.ts`. + - Classify incompleteness as a non-critical retryable translation failure. + - Halve the chunk limit on each retry until the `8_000`-character floor. + - Stop retrying after 4 completeness retries and rethrow the failure as a non-critical document translation error. + +5. Ensure page-level workflow gating in `scripts/notion-translate/index.ts`. + - Fail the page before `createNotionPageWithBlocks()` and `saveTranslatedContentToDisk()` if full-page markdown translation fails completeness checks. + - Keep localized markdown and Notion-side page creation behavior consistent for the same source page. + - Continue processing remaining pages and languages, then fail the overall run from the summary when `failedTranslations > 0`. + +6. Add focused unit coverage in `scripts/notion-translate/translateFrontMatter.test.ts`. + - Proactive chunking for long-form content below model-derived maxima + - Retry when a valid response omits a middle section + - Failure after repeated incomplete responses + - Severe length shrinkage detection + - Frontmatter and fenced-code false-positive guards + - No regression for placeholder integrity and token-overflow fallback + +7. 
Add block-translation regression coverage in `scripts/notion-translate/translateBlocks.test.ts`. + - Keep existing coverage for URL sanitization, image mapping, inline-image path consumption, metadata stripping, and recursive child-block handling. + - Add one regression test proving the new completeness logic does not falsely reject normal short rich-text block translations routed through `translateText()`. + +8. Add workflow coverage in `scripts/notion-translate/index.test.ts`. + - Add a dedicated incompleteness test instead of relying only on the generic translation API error case. + - Mock `translateText()` to fail only after completeness retries are exhausted. + - Verify `failedTranslations` increments for each affected language and `failures.length` matches. + - Verify each failure entry includes `language`, source page `title`, `pageId`, the incompleteness error text, and `isCritical: false`. + - Verify `TRANSLATION_SUMMARY` is still emitted on failure. + - Verify `main()` rejects so the CLI exits non-zero. + - Verify Notion page creation/update and localized markdown save are not executed for the failed page. + +9. Run targeted validation on touched files only. 
+ - `bunx eslint scripts/constants.ts scripts/notion-translate/translateFrontMatter.ts scripts/notion-translate/index.ts scripts/notion-translate/translateFrontMatter.test.ts scripts/notion-translate/index.test.ts scripts/notion-translate/translateBlocks.test.ts --fix` + - `bunx prettier --write scripts/constants.ts scripts/notion-translate/translateFrontMatter.ts scripts/notion-translate/index.ts scripts/notion-translate/translateFrontMatter.test.ts scripts/notion-translate/index.test.ts scripts/notion-translate/translateBlocks.test.ts` + - `bunx vitest run scripts/notion-translate/translateFrontMatter.test.ts scripts/notion-translate/index.test.ts scripts/notion-translate/translateBlocks.test.ts` + +## Verification Criteria + +- Long-form markdown that previously fit the theoretical model limit is chunked once it exceeds the `120_000`-character cap. +- A structurally partial but schema-valid translation is retried instead of silently accepted. +- Persistent incompleteness surfaces as a page failure in `TRANSLATION_SUMMARY` and causes non-zero exit. +- A page that fails completeness checks produces neither a translated Notion page nor a saved localized markdown file. +- The workflow continues processing other pages and languages after the failed page, but the final run still exits non-zero when any document translation failed. +- Existing Notion block behavior remains intact, and short rich-text block translations do not regress under the shared `translateText()` reliability changes. +- Existing placeholder, image integrity, and overflow fallback behavior still pass after the change. + +## Risks and Mitigations + +1. **Risk: More aggressive chunking may reduce cross-section consistency.** + Mitigation: Keep structure-aware chunking and source the translated title from the first chunk only. + +2. 
**Risk: Structural checks may reject valid translations.** + Mitigation: Use conservative, deterministic signals and add false-positive regression tests for frontmatter and fenced code blocks. + +3. **Risk: Retry logic increases runtime and API cost.** + Mitigation: Only retry on completeness failures and keep the short-page fast path unchanged. + +4. **Risk: The root cause may later prove to exist in the block-translator path itself.** + Mitigation: This iteration gates both outputs on the validated full-page translation. A follow-up can add block-level completeness checks if evidence warrants it. + +## Ready-to-Implement Definition + +- Scope explicitly covers both localized markdown and Notion-side translated pages. +- Constants, heuristics, retry behavior, and failure semantics are fixed in the plan. +- The test matrix includes `translateFrontMatter.ts`, `translateBlocks.ts`, and `index.ts` coverage. +- No open design questions remain for the first implementation pass. diff --git a/plans/2026-03-25-fix-output-truncation-token-overflow-v1.md b/plans/2026-03-25-fix-output-truncation-token-overflow-v1.md new file mode 100644 index 00000000..97391366 --- /dev/null +++ b/plans/2026-03-25-fix-output-truncation-token-overflow-v1.md @@ -0,0 +1,48 @@ +# Fix: Output Truncation Classified as Non-Retryable Error + +## Objective + +When the OpenAI API returns a response with `finish_reason: "length"` (output token limit hit), the current code passes the truncated string to `JSON.parse`, which throws and is caught as a critical `schema_invalid` error — permanently killing translation for that page with no retry. The fix intercepts `finish_reason: "length"` before parsing and re-classifies it as a non-critical `token_overflow` error, so the existing overflow retry machinery in `translateChunkWithOverflowFallback` can re-attempt with a smaller chunk automatically. + +## Implementation Plan + +- [ ] Task 1. 
**Check `finish_reason` before calling `parseTranslationPayload` in `translateTextSingleCall`** + + In `scripts/notion-translate/translateFrontMatter.ts`, inside `translateTextSingleCall`, after the `response.choices[0]?.message?.content` read (currently around line 762), add a check for `finish_reason` on the same choice object. If it equals `"length"`, throw a `TranslationError` with code `"token_overflow"` and `isCritical: false`. The `token_overflow` code is the correct signal here: the existing handler in `translateChunkWithOverflowFallback` already detects this code and triggers a recursive split-and-retry. No new retry path needs to be written — the fix is purely a re-classification. + + The check must be placed **before** the `if (!content)` guard and the `parseTranslationPayload` call, so that a truncated-but-non-empty response is caught before `JSON.parse` sees it. + + Error message should be descriptive: `"OpenAI output was truncated (finish_reason: length) — chunk too large for model output budget"`. + +- [ ] Task 2. **Add two tests in `translateFrontMatter.test.ts` covering the new behaviour** + + **Test A — classification:** Mock `openai.chat.completions.create` to return an HTTP-200 response where `choices[0].finish_reason` is `"length"` and `choices[0].message.content` is a truncated JSON string (e.g. `'{"markdown":"partial content'`). Assert that `translateText` rejects with a `TranslationError` whose `code` is `"token_overflow"` and `isCritical` is `false`. This mirrors the pattern used in the existing `"continues to classify token overflow errors..."` test at line 338. + + **Test B — retry integration:** Mock the first call to return `finish_reason: "length"`, then let subsequent calls succeed (using `installStructuredTranslationMock` or a similar inline mock). Assert that `mockOpenAIChatCompletionCreate` is called more than once and that the final result contains the expected translated content. 
This mirrors the existing `"retries the fast path with adaptive splitting on token overflow"` test at line 413. + + Both tests go in the existing `describe("notion-translate translateFrontMatter", ...)` block, alongside the other classification and retry tests. + +- [ ] Task 3. **Run the test file and typecheck** + + Execute `bunx vitest run scripts/notion-translate/translateFrontMatter.test.ts` and confirm all tests pass, including the two new ones. Then run `bun run typecheck --noEmit` scoped to the changed files to confirm no TypeScript regressions. + +## Verification Criteria + +- `finish_reason: "length"` responses produce a `TranslationError` with `code: "token_overflow"` and `isCritical: false` — not `schema_invalid`. +- A subsequent retry is triggered automatically (call count > 1) when the first call returns `finish_reason: "length"`, without any changes to the retry orchestration logic. +- All existing tests in `translateFrontMatter.test.ts` continue to pass. +- TypeScript compilation produces no errors. + +## Potential Risks and Mitigations + +1. **Mock shape divergence**: The existing test mocks omit `finish_reason` (the field is `undefined`). The new check must only fire when `finish_reason === "length"` exactly, not when it is `undefined` or `"stop"`. A strict equality check (`=== "length"`) ensures backward compatibility with all existing mock responses. + Mitigation: Use strict equality; verify existing tests still pass after the change. + +2. **Infinite retry loop if chunk floor is already reached**: If `effectiveChunkLimit` is already at `TRANSLATION_MIN_CHUNK_MAX_CHARS` (8,000 chars) and the model still truncates output, the overflow fallback in `translateChunkWithOverflowFallback` detects that the chunk cannot be halved further and re-throws. This existing guard already handles the edge case correctly — no additional logic needed. 
+ Mitigation: Confirm by reading `translateChunkWithOverflowFallback` lines 844–858, which already enforce the floor before re-throwing. + +## Alternative Approaches + +1. **Pass `max_tokens` explicitly in the API call**: Setting a large `max_tokens` (e.g., 32,768) would make truncation less likely at the API level rather than handling it after the fact; output that exceeds the cap is still cut off. This is complementary but not a substitute — the `finish_reason` check is still needed for robustness against future model changes or misconfiguration, and adding `max_tokens` to `getModelParams` is a separate, independent concern that would affect all models and require its own testing. + +2. **Attempt JSON repair before throwing**: Libraries like `jsonrepair` can reconstruct truncated JSON. This would avoid a retry API call entirely but introduces a dependency, silently accepts partial translations (content after the truncation point is lost), and masks the real problem rather than triggering the retry+completeness-validation path that already exists. diff --git a/plans/2026-03-25-robust-translation-chunking-v1.md b/plans/2026-03-25-robust-translation-chunking-v1.md new file mode 100644 index 00000000..f7fa7aff --- /dev/null +++ b/plans/2026-03-25-robust-translation-chunking-v1.md @@ -0,0 +1,424 @@ +# Robust Translation Chunking & Completion-Budget Hardening + +## Objective + +Harden markdown translation against output truncation, unsafe chunk sizing, and configuration drift by making completion-budget control explicit, tightening chunk-size derivation, handling frontmatter without breaking chunk 0 budgeting, and adding runtime validation that matches the repo's parity requirements. + +This revision intentionally narrows scope to changes that are safe to implement on top of the current translation loop. Speculative mechanisms that would require a larger refactor are deferred. 
+ +## Status: Revised Before Implementation + +**Already completed in prior work:** +- Proactive chunking lowered from 500K to 120K chars +- Structural completeness validation (heading, code block, table, list, admonition checks) +- Completeness retry with halving chunk limits (up to 4 retries, 8K floor) +- `finish_reason: "length"` detection reclassified as `token_overflow` + +**In scope for this revision:** +- Explicit completion-budget configuration and request parameters +- Safer chunk-size derivation backed by small verified defaults plus env overrides +- Frontmatter-aware budgeting that does not break chunk 0 +- End-to-end propagation of `TRANSLATION_*` config through CLI, CI, and API job execution +- Stronger validation using targeted translation tests plus parity checks on real EN/PT/ES content + +**Deferred from v1:** +- Cross-chunk context injection +- Ratio-based targeted retry of individual chunks + +## Evidence Status + +There is no standalone saved research document for this plan. + +This revision is based on: +- current implementation review in `scripts/notion-translate/translateFrontMatter.ts` +- existing translation parity tracker and research map under `context/development/` +- current runtime/config paths in the API server and GitHub workflows +- current official model docs for the provider defaults this repo relies on + +Any mechanism not supported by those sources is treated as exploratory and kept out of the critical path. + +## Scope Boundaries + +**This plan hardens the markdown translation path only.** + +It does **not** claim full translation-parity closure across the entire pipeline. 
Full parity still depends on the broader backlog tracked in `context/development/translation-improvements-progress.md`, including: +- deterministic parity checker script +- broader `markdownToNotion` hardening +- broader `scripts/notion-fetch/*` locale-consistency work +- regression-gate coverage + +--- + +## Configuration Strategy + +### Authoritative Runtime Controls + +Use explicit runtime overrides first, then small verified model defaults, then conservative fallback with warning. + +| Variable | Default | Purpose | +|---|---|---| +| `OPENAI_MODEL` | current repo runtime default | Model selection only | +| `OPENAI_BASE_URL` | unset | Distinguishes OpenAI default API from custom/provider-compatible endpoints | +| `TRANSLATION_MAX_COMPLETION_TOKENS` | model-derived if verified, else conservative fallback | First-class output budget control | +| `TRANSLATION_CONTEXT_LIMIT` | model-derived if verified, else conservative fallback | Optional input-context override | +| `TRANSLATION_CHUNK_MAX_CHARS` | derived | Optional hard ceiling override | +| `TRANSLATION_MIN_CHUNK_CHARS` | `8000` | Retry floor for chunk halving | +| `TRANSLATION_COMPLETENESS_MAX_RETRIES` | `4` | Max completeness retry rounds | +| `TRANSLATION_JSON_ESCAPE_OVERHEAD` | `0.5` | Estimated JSON/escaping overhead | +| `TRANSLATION_CHARS_PER_TOKEN` | `3.5` | Conservative chars-per-token estimate | + +### Defaulting Policy + +1. `TRANSLATION_MAX_COMPLETION_TOKENS` is the primary control. +2. For the repo's known default models, maintain a **small verified table** in code. +3. For custom/self-hosted/OpenAI-compatible providers, require explicit overrides when the model is not in the verified table. +4. When falling back to a guessed/conservative default, log a warning to stderr so operators know the budget is not authoritative. 
+ +### Concrete Conservative Fallbacks + +Use these exact values when no verified model default or explicit override exists: + +- `conservativeCompletionFallback = 8192` +- `conservativeContextFallback = 128000` + +Rationale: +- `8192` is conservative relative to the repo's current OpenAI defaults, but still large enough to avoid absurdly small proactive chunks. With the plan's default `3.5` chars/token and `0.5` escape overhead, it yields about `14,336` safe output chars before the context guardrail applies. +- `128000` matches the repo's current conservative unknown-model context fallback and common modern model context windows, while still being secondary to the output-budget cap for chunk sizing. + +Required warning rules: +- If either fallback is used, log the active model name and the fallback values being applied. +- If `OPENAI_BASE_URL` is set and either fallback is used, log a stronger warning that custom/provider-compatible deployments should set `TRANSLATION_MAX_COMPLETION_TOKENS` and `TRANSLATION_CONTEXT_LIMIT` explicitly. +- Do not silently treat fallback-derived limits as verified provider capabilities. + +### Minimal Verified Model Defaults + +Do not ship a large speculative table. + +Keep the built-in defaults limited to models the repo actively documents or defaults to, and annotate each entry as verified. Everything else must rely on explicit overrides. + +### Budget Derivation + +When `TRANSLATION_CHUNK_MAX_CHARS` is not set explicitly: + +```ts +completionCap = env.TRANSLATION_MAX_COMPLETION_TOKENS + ?? getVerifiedModelCompletionCap(model) + ?? conservativeCompletionFallback; + +contextLimit = env.TRANSLATION_CONTEXT_LIMIT + ?? getVerifiedModelContextLimit(model) + ?? conservativeContextFallback; + +charsPerToken = env.TRANSLATION_CHARS_PER_TOKEN ?? 3.5; +escapeOverhead = env.TRANSLATION_JSON_ESCAPE_OVERHEAD ?? 
0.5; + +safeOutputChars = Math.floor( + completionCap * charsPerToken * (1 - escapeOverhead) +); + +inputBudget = Math.floor(contextLimit * charsPerToken / 2); + +chunkMaxChars = Math.min(safeOutputChars, inputBudget); +``` + +This derived limit must then still be capped by any explicit `TRANSLATION_CHUNK_MAX_CHARS` override. + +--- + +## Implementation Plan + +### Phase 0: Default-Model Decision & Operator Clarity + +- [ ] **0.1** Choose one authoritative runtime default model and document it consistently + + The code currently defaults to `gpt-5-mini`, but repo docs disagree in multiple places. Before relying on model-derived defaults, update all operator-facing references so the default model is unambiguous. + +- [ ] **0.2** Add an evidence note to the implementation PR/task summary + + State explicitly that this revision is based on code inspection, current provider docs, and parity requirements, not on a saved standalone research memo. + +### Phase 1: Explicit Completion Budget & Provider-Aware Request Params + +- [ ] **1.1** Add completion-budget helpers in `scripts/constants.ts` + + Replace the planned large `MODEL_OUTPUT_LIMITS` table with: + - a small verified completion-cap lookup for the repo's documented/default models + - a context-limit lookup + - env-first helpers for `TRANSLATION_MAX_COMPLETION_TOKENS` and `TRANSLATION_CONTEXT_LIMIT` + - warning-backed conservative fallbacks for unknown/custom models + +- [ ] **1.2** Pass explicit completion-budget params in the OpenAI request path + + In `scripts/notion-translate/translateFrontMatter.ts`, update the request builder so the API call sets an explicit output cap: + - OpenAI default API: `max_completion_tokens` + - custom/OpenAI-compatible providers: provider-compatible equivalent where supported (`max_tokens` if that is the documented parameter for the target path) + + This must be wired from the same effective completion-budget helper used by chunk-size derivation. 
+ +- [ ] **1.3** Rewrite `getMaxChunkChars` around completion cap, not context alone + + The current formula is context-heavy and does not reflect response-size risk. Replace it with the completion-aware derivation above. + +- [ ] **1.4** Validate env parsing and fallback behavior + + Parse all numeric env vars with validation: + - positive integers for token/char/retry limits + - 0-1 range for `TRANSLATION_JSON_ESCAPE_OVERHEAD` + - positive float for `TRANSLATION_CHARS_PER_TOKEN` + + Invalid values should warn and fall back safely. + +- [ ] **1.5** Add/update tests for completion-budget behavior + + Add focused tests that prove: + - env overrides win over model defaults + - unknown/custom models warn and use conservative fallbacks + - `getMaxChunkChars` is derived from completion budget, not context alone + - request params include explicit completion-budget controls + +### Phase 2: Frontmatter-Aware Budgeting Without Changing Translation Semantics + +- [ ] **2.1** Replace destructive stripping with explicit extraction + + Add an `extractYamlFrontmatter()` helper that returns: + + ```ts + { frontmatter: string; body: string } + ``` + + Do not reuse `stripYamlFrontmatter()` for this purpose, because it discards the data needed for reconstruction. + +- [ ] **2.2** Reserve frontmatter budget before body splitting + + Keep frontmatter in the first translation request, but subtract its size from chunk 0's content budget **before** splitting the body. 
+ + Required behavior: + - body-only content is split using chunk budgets that account for chunk 0 frontmatter overhead + - chunk 0 is reconstructed as `frontmatter + firstBodyChunk` + - later chunks remain body-only + - completeness validation continues to evaluate body content, not frontmatter noise + +- [ ] **2.3** Define the oversize-frontmatter edge case explicitly + + If frontmatter alone consumes nearly all of chunk 0's budget: + - log a warning + - bypass frontmatter-aware proactive splitting for that document + - rely on the existing overflow/completeness fallback rather than introducing a second special-case splitter + +- [ ] **2.4** Add focused tests for frontmatter-aware budgeting + + Verify: + - frontmatter appears only in the first API request + - chunk 0 stays within the intended total request budget + - the final output retains translated frontmatter and complete body content + - frontmatter-only validation noise does not trigger false incompleteness + +### Phase 3: Runtime Propagation of Translation Tuning + +**Supported execution paths for this revision:** +- local CLI translation runs +- GitHub Actions translation workflow +- API-triggered translation jobs launched from this repo + +**Out of scope for this revision:** +- deployed API-service runtime translation on Fly/production infrastructure + +That deployed runtime can be revisited later, but it should not block this markdown-translation hardening pass. + +- [ ] **3.1** Propagate `TRANSLATION_*` vars through API-triggered translation jobs + + Update `api-server/job-executor.ts` so child process env whitelisting includes the new translation-tuning vars. + +- [ ] **3.2** Add API env propagation tests + + Update `api-server/job-executor-env.test.ts` to verify the new vars are preserved for child jobs. 
+ +- [ ] **3.3** Support translation overrides in GitHub Actions via non-secret workflow config + + Update `.github/workflows/translate-docs.yml` so `TRANSLATION_*` tuning values can be supplied from workflow/repository variables or workflow env. These values are configuration, not secrets. + +- [ ] **3.4** Explicitly exclude deployed API-service runtime translation from this plan + + Update docs/scope notes so this revision guarantees `TRANSLATION_*` support for CLI, GitHub Actions, and repo-local API jobs only. Do not expand Fly/deployed runtime secret propagation in this change set. + +### Phase 4: Diagnostics Instead of New Chunk-Control Heuristics + +- [ ] **4.1** Add per-chunk ratio telemetry only + + Record per-chunk input/output ratios for diagnostics, but do **not** add ratio-based control flow in v1. + + This telemetry can be logged in a debug-friendly structure for failed/incomplete translations and used to inform future research. + +- [ ] **4.2** Keep recovery behavior unchanged + + Continue using the existing whole-document completeness retry with smaller chunk limits. Do not add targeted retry of individual chunks in this revision. + +- [ ] **4.3** Explicitly defer cross-chunk context injection + + Do not prepend synthetic context to chunks in v1. That would require a cleaner separation between: + - content to translate + - context supplied to the model + - placeholder integrity checks + - overflow fallback splitting + + That refactor is larger than this plan. 
+ +### Phase 5: Validation That Matches the Repo's Parity Contract + +- [ ] **5.1** Extend targeted translation tests + + Add/update focused tests in `scripts/notion-translate/translateFrontMatter.test.ts` for: + - explicit completion-budget request parameters + - env override precedence and invalid-value fallback + - frontmatter-aware budgeting + - no regression in overflow fallback and completeness retry + +- [ ] **5.2** Add an executable parity checker path + + The repo already has parity logic in `scripts/locale-parity.test.ts`, but it is trapped inside test-only fixtures. Extract or wrap the parity collector into an executable checker that can run against real repo content. + +- [ ] **5.3** Run one targeted parity validation on real content + + Validation must include: + - one targeted family known to have failed before + - one sampled family that currently succeeds + - frontmatter parity enabled if frontmatter handling changes + + Record results in `context/development/translation-improvements-progress.md`. + +- [ ] **5.4** Keep success claims narrow + + This plan is complete when markdown translation hardening is validated. Do not claim full pipeline parity closure unless the remaining backlog in `translation-improvements-progress.md` is also addressed. + +### Phase 6: Documentation Sweep + +- [ ] **6.1** Update `env-file` with an OpenAI/translation section + + `env-file` currently has no OpenAI section. Add one and include commented `TRANSLATION_*` examples in the same operator-facing location as `OPENAI_*`. + +- [ ] **6.2** Update operator docs + + Update at least: + - `SETUP.md` + - `context/workflows/translation-process.md` + - `context/api-server/reference.md` + + Deployed API-runtime translation is out of scope for this revision (see 3.4), so deployment docs need only that scope note. + +- [ ] **6.3** Reconcile all default-model references + + After Phase 0's decision, update every operator-facing mention so docs stop disagreeing about the default model. 
+ +### Phase 7: Verification Commands + +- [ ] **7.1** Run targeted checks on touched files + + Execute only the checks relevant to the files changed by implementation, for example: + + ```bash + bunx vitest run scripts/notion-translate/translateFrontMatter.test.ts + bunx vitest run scripts/constants.test.ts + bunx vitest run api-server/job-executor-env.test.ts + bunx vitest run scripts/locale-parity.test.ts + bun run typecheck --noEmit + bunx eslint --fix <changed files> + bunx prettier --write <changed files> + ``` + +--- + +## Verification Criteria + +The revised implementation is acceptable only if all of the following are true: + +- Translation requests use an explicit completion-budget parameter appropriate for the active provider path. +- Chunk sizing is derived from effective completion budget plus context guardrails, not from context size alone. +- Unknown/custom models do not silently rely on speculative limits; they use explicit overrides, or warn and fall back to the documented conservative defaults. +- Frontmatter-aware chunking keeps frontmatter in the first request without allowing chunk 0 to exceed the intended budget. +- Existing overflow fallback and completeness retry continue to work. +- `TRANSLATION_*` overrides work in local CLI runs, the GitHub Actions translation workflow, and API-triggered translation jobs launched from this repo. +- Targeted translation tests pass. +- Parity validation is run on real EN/PT/ES content and recorded in `translation-improvements-progress.md`. +- Success claims remain limited to markdown translation hardening unless broader parity backlog items are also completed. + +--- + +## Risks and Mitigations + +1. **Provider parameter differences** + Mitigation: keep request-param handling explicit and provider-aware rather than assuming one universal output-token field. + +2. **Model tables become stale** + Mitigation: keep the built-in table intentionally small, verified, and override-friendly. + +3. 
**Frontmatter changes still create edge-case budget pressure** + Mitigation: reserve chunk 0 budget before splitting and define a fallback path for oversize frontmatter instead of inventing a second complex splitter. + +4. **Parity claims outpace actual pipeline coverage** + Mitigation: require one real parity run and record it, but keep the scope statement narrow. + +--- + +## Deferred Follow-Ups + +These are intentionally **not** part of this implementation: + +1. **Cross-chunk context injection** + Requires a cleaner translation contract so synthetic context is not treated as chunk content by placeholder validation, overflow fallback, or completeness checks. + +2. **Ratio-based targeted retry** + Requires a chunk-manifest model and replace-in-place reassembly logic that does not exist today. + +3. **Full pipeline parity closure** + Still depends on backlog work outside `translateFrontMatter.ts`. + +--- + +## File Change Map + +| File | Changes | +|---|---| +| `scripts/constants.ts` | Add effective completion/context budget helpers and env parsing | +| `scripts/constants.test.ts` | Add tests for env overrides, fallbacks, and completion-aware sizing | +| `scripts/notion-translate/translateFrontMatter.ts` | Add explicit completion-budget params, frontmatter-aware budgeting, ratio telemetry only | +| `scripts/notion-translate/translateFrontMatter.test.ts` | Add tests for provider-budget params and frontmatter-aware budgeting | +| `api-server/job-executor.ts` | Propagate `TRANSLATION_*` env vars to child jobs | +| `api-server/job-executor-env.test.ts` | Verify env propagation for translation tuning | +| `scripts/locale-parity.test.ts` or extracted shared module | Reuse parity logic through an executable path | +| `env-file` | Add OpenAI/translation env examples | +| `SETUP.md` | Update translation model/default and `TRANSLATION_*` docs | +| `context/workflows/translation-process.md` | Document translation tuning and validation expectations | +| 
`context/api-server/reference.md` | Document translation-related child-env/runtime config | + +--- + +## Progress Tracking + +| Phase | Task | Status | +|---|---|---| +| **0 — Defaults** | 0.1 Authoritative default model decision | Not Started | +| | 0.2 Evidence note in implementation summary | Not Started | +| **1 — Budget** | 1.1 Completion-budget helpers | Not Started | +| | 1.2 Explicit request completion params | Not Started | +| | 1.3 Rewrite `getMaxChunkChars` | Not Started | +| | 1.4 Env validation | Not Started | +| | 1.5 Budget tests | Not Started | +| **2 — Frontmatter** | 2.1 Extract helper | Not Started | +| | 2.2 Frontmatter-aware budgeting | Not Started | +| | 2.3 Oversize-frontmatter fallback | Not Started | +| | 2.4 Frontmatter tests | Not Started | +| **3 — Runtime** | 3.1 API env propagation | Not Started | +| | 3.2 API env tests | Not Started | +| | 3.3 GitHub Actions `TRANSLATION_*` overrides | Not Started | +| | 3.4 Exclude deployed API-runtime translation | Not Started | +| **4 — Diagnostics** | 4.1 Ratio telemetry | Not Started | +| | 4.2 Keep existing recovery path | Not Started | +| | 4.3 Defer context injection | Not Started | +| **5 — Validation** | 5.1 Targeted translation tests | Not Started | +| | 5.2 Executable parity checker path | Not Started | +| | 5.3 Real parity run + tracker update | Not Started | +| | 5.4 Keep success claims narrow | Not Started | +| **6 — Docs** | 6.1 Update `env-file` | Not Started | +| | 6.2 Update operator docs | Not Started | +| | 6.3 Reconcile default-model references | Not Started | +| **7 — Checks** | 7.1 Targeted verification commands | Not Started | diff --git a/scripts/notion-translate/translateFrontMatter.test.ts b/scripts/notion-translate/translateFrontMatter.test.ts index c4a46646..4233f651 100644 --- a/scripts/notion-translate/translateFrontMatter.test.ts +++ b/scripts/notion-translate/translateFrontMatter.test.ts @@ -846,4 +846,204 @@ describe("notion-translate translateFrontMatter", () => { } 
expect(chunks.join("")).toBe(longLine); }); + + // parseFrontmatterKeys unit tests + + it("parseFrontmatterKeys returns empty array when no frontmatter is present", async () => { + const { parseFrontmatterKeys } = await import("./translateFrontMatter"); + expect(parseFrontmatterKeys("# Heading\n\nBody.")).toEqual([]); + }); + + it("parseFrontmatterKeys extracts top-level keys from frontmatter", async () => { + const { parseFrontmatterKeys } = await import("./translateFrontMatter"); + const md = + "---\n" + + "title: My Page\n" + + "slug: /my-page\n" + + "sidebar_position: 2\n" + + "---\n\n" + + "# Body"; + expect(parseFrontmatterKeys(md)).toEqual([ + "title", + "slug", + "sidebar_position", + ]); + }); + + it("parseFrontmatterKeys ignores indented lines (nested values)", async () => { + const { parseFrontmatterKeys } = await import("./translateFrontMatter"); + const md = + "---\n" + + "title: My Page\n" + + "keywords:\n" + + " - one\n" + + " - two\n" + + "---\n\n" + + "# Body"; + expect(parseFrontmatterKeys(md)).toEqual(["title", "keywords"]); + }); + + it("parseFrontmatterKeys returns empty array when frontmatter closing marker is missing", async () => { + const { parseFrontmatterKeys } = await import("./translateFrontMatter"); + const md = "---\ntitle: My Page\n# Body"; + expect(parseFrontmatterKeys(md)).toEqual([]); + }); + + // Frontmatter integrity integration tests + + it("fails when a critical frontmatter field is dropped by translation", async () => { + const { translateText } = await import("./translateFrontMatter"); + + const source = + "---\n" + + "title: My Page\n" + + "slug: /my-page\n" + + "sidebar_position: 2\n" + + "---\n\n" + + "# Body\n\nSome content."; + + // Translation drops slug from the frontmatter + mockOpenAIChatCompletionCreate.mockResolvedValue({ + choices: [ + { + message: { + content: JSON.stringify({ + markdown: + "---\n" + + "title: Minha Página\n" + + "sidebar_position: 2\n" + + "---\n\n" + + "# Corpo\n\nAlgum conteúdo.", + title: 
"Minha Página", + }), + }, + }, + ], + }); + + await expect( + translateText(source, "My Page", "pt-BR") + ).rejects.toMatchObject({ + code: "schema_invalid", + isCritical: false, + message: expect.stringContaining("slug"), + }); + }); + + it("fails when a non-critical frontmatter key is dropped by translation", async () => { + const { translateText } = await import("./translateFrontMatter"); + + const source = + "---\n" + + "title: My Page\n" + + "description: A description\n" + + "---\n\n" + + "# Body\n\nSome content."; + + // Translation drops description + mockOpenAIChatCompletionCreate.mockResolvedValue({ + choices: [ + { + message: { + content: JSON.stringify({ + markdown: + "---\n" + + "title: Minha Página\n" + + "---\n\n" + + "# Corpo\n\nAlgum conteúdo.", + title: "Minha Página", + }), + }, + }, + ], + }); + + await expect( + translateText(source, "My Page", "pt-BR") + ).rejects.toMatchObject({ + code: "schema_invalid", + isCritical: false, + message: expect.stringContaining("description"), + }); + }); + + it("fails when translation adds an unexpected critical frontmatter field", async () => { + const { translateText } = await import("./translateFrontMatter"); + + const source = "---\ntitle: My Page\n---\n\n# Body\n\nSome content."; + + // Translation invents a slug field + mockOpenAIChatCompletionCreate.mockResolvedValue({ + choices: [ + { + message: { + content: JSON.stringify({ + markdown: + "---\n" + + "title: Minha Página\n" + + "slug: /invented\n" + + "---\n\n" + + "# Corpo\n\nAlgum conteúdo.", + title: "Minha Página", + }), + }, + }, + ], + }); + + await expect( + translateText(source, "My Page", "pt-BR") + ).rejects.toMatchObject({ + code: "schema_invalid", + isCritical: false, + message: expect.stringContaining("slug"), + }); + }); + + it("passes when all frontmatter keys are preserved in translation", async () => { + const { translateText } = await import("./translateFrontMatter"); + + const source = + "---\n" + + "title: My Page\n" + + "slug: 
/my-page\n" + + "sidebar_position: 2\n" + + "---\n\n" + + "# Body\n\nSome content."; + + mockOpenAIChatCompletionCreate.mockResolvedValue({ + choices: [ + { + message: { + content: JSON.stringify({ + markdown: + "---\n" + + "title: Minha Página\n" + + "slug: /my-page\n" + + "sidebar_position: 2\n" + + "---\n\n" + + "# Corpo\n\nAlgum conteúdo.", + title: "Minha Página", + }), + }, + }, + ], + }); + + const result = await translateText(source, "My Page", "pt-BR"); + expect(result.markdown).toContain("slug: /my-page"); + expect(result.markdown).toContain("sidebar_position: 2"); + }); + + it("passes when markdown has no frontmatter and translation has none either", async () => { + const { translateText } = await import("./translateFrontMatter"); + installStructuredTranslationMock(); + + const result = await translateText( + "# No Frontmatter\n\nJust body.", + "Title", + "pt-BR" + ); + expect(result).toBeDefined(); + }); }); diff --git a/scripts/notion-translate/translateFrontMatter.ts b/scripts/notion-translate/translateFrontMatter.ts index cbcef433..8e3acc9b 100644 --- a/scripts/notion-translate/translateFrontMatter.ts +++ b/scripts/notion-translate/translateFrontMatter.ts @@ -297,15 +297,33 @@ function splitBySections(markdown: string): string[] { const lastIdx = lines.length - 1; let current = ""; let inFence = false; + let fenceChar = ""; + let fenceLen = 0; for (const [idx, line] of lines.entries()) { // Reconstruct original text: all lines except the last trailing empty get "\n" appended const lineWithNewline = idx < lastIdx ? line + "\n" : line.length > 0 ? line : ""; - // Toggle fence state on fenced code markers, including up to 3 leading spaces. - if (/^[ \t]{0,3}(`{3,}|~{3,})/.test(line)) { - inFence = !inFence; + // Track fence state per CommonMark spec: a fence of N backticks/tildes is closed + // only by a closing fence of >= N of the same character (and no info string on close). 
+ const fenceMatch = line.match(/^[ \t]{0,3}(`{3,}|~{3,})/); + if (fenceMatch) { + const ch = fenceMatch[1][0]; + const len = fenceMatch[1].length; + if (!inFence) { + inFence = true; + fenceChar = ch; + fenceLen = len; + } else if (ch === fenceChar && len >= fenceLen) { + // Closing fence: same character, at least as long, no info string + const afterFence = line.trimStart().slice(len); + if (/^\s*$/.test(afterFence)) { + inFence = false; + fenceChar = ""; + fenceLen = 0; + } + } } // Start a new section before any ATX heading (outside fences) if (!inFence && /^#{1,6}\s/.test(line) && current.length > 0) { @@ -488,6 +506,90 @@ function isProtectedPathIntegrityError( ); } +const CRITICAL_FRONTMATTER_FIELDS = new Set([ + "slug", + "sidebar_position", + "sidebar_label", + "id", + "title", +]); + +/** + * Extracts the top-level YAML keys from a frontmatter block. + * Only recognises simple `key:` entries (no nested parsing) — enough to + * detect dropped or added keys without pulling in a YAML parser dependency. + * @internal exported for testing + */ +export function parseFrontmatterKeys(markdown: string): string[] { + if (!markdown.startsWith("---\n") && !markdown.startsWith("---\r\n")) { + return []; + } + const endFrontmatterIndex = markdown.indexOf("\n---", 3); + if (endFrontmatterIndex === -1) { + return []; + } + const frontmatterBody = markdown.slice(4, endFrontmatterIndex); + const keys: string[] = []; + for (const line of frontmatterBody.split("\n")) { + // Top-level keys: start at column 0, followed by optional spaces and ":" + const match = line.match(/^([A-Za-z_][\w-]*)[\s]*:/); + if (match) { + keys.push(match[1]); + } + } + return keys; +} + +/** + * Checks that the translated markdown preserves all frontmatter keys that + * were present in the source, and that no critical routing/sidebar fields + * have been added or removed. 
+ * + * Throws a non-critical `TranslationError` when an integrity violation is + * detected so the caller can retry (same pattern as completeness checks). + */ +function assertFrontmatterIntegrity( + sourceMarkdown: string, + translatedMarkdown: string +): void { + const sourceKeys = parseFrontmatterKeys(sourceMarkdown); + if (sourceKeys.length === 0) { + // No frontmatter in source — nothing to validate. + return; + } + + const translatedKeys = new Set(parseFrontmatterKeys(translatedMarkdown)); + + const missingKeys = sourceKeys.filter((key) => !translatedKeys.has(key)); + if (missingKeys.length > 0) { + const criticalMissing = missingKeys.filter((key) => + CRITICAL_FRONTMATTER_FIELDS.has(key) + ); + const label = + criticalMissing.length > 0 + ? `critical frontmatter key(s) missing: ${criticalMissing.join(", ")}` + : `frontmatter key(s) missing: ${missingKeys.join(", ")}`; + throw new TranslationError( + `Frontmatter integrity check failed — ${label}`, + "schema_invalid", + false + ); + } + + // Also flag if the translation invented new critical keys not in the source + const sourceKeySet = new Set(sourceKeys); + const addedCriticalKeys = [...translatedKeys].filter( + (key) => CRITICAL_FRONTMATTER_FIELDS.has(key) && !sourceKeySet.has(key) + ); + if (addedCriticalKeys.length > 0) { + throw new TranslationError( + `Frontmatter integrity check failed — unexpected critical key(s) added: ${addedCriticalKeys.join(", ")}`, + "schema_invalid", + false + ); + } +} + type MarkdownStructureMetrics = { headingCount: number; fencedCodeBlockCount: number; @@ -508,7 +610,8 @@ function stripFencedCodeContent(markdown: string): string { const lines = markdown.split("\n"); const result: string[] = []; let inFence = false; - let fenceMarker = ""; + let fenceChar = ""; + let fenceLen = 0; let fenceBuffer: string[] = []; for (const line of lines) { @@ -516,16 +619,25 @@ function stripFencedCodeContent(markdown: string): string { const match = line.match(/^[ \t]{0,3}(`{3,}|~{3,})/); 
if (match) { inFence = true; - fenceMarker = match[1]; + fenceChar = match[1][0]; + fenceLen = match[1].length; result.push(line); // keep opening marker fenceBuffer = []; } else { result.push(line); } } else { - if (line.trimStart().startsWith(fenceMarker)) { + // Closing fence per CommonMark spec: same character, >= opening length, no info string + const closeMatch = line.match(/^[ \t]{0,3}(`{3,}|~{3,})/); + if ( + closeMatch && + closeMatch[1][0] === fenceChar && + closeMatch[1].length >= fenceLen && + /^\s*$/.test(line.trimStart().slice(closeMatch[1].length)) + ) { inFence = false; - fenceMarker = ""; + fenceChar = ""; + fenceLen = 0; result.push(line); // keep closing marker fenceBuffer = []; } else { @@ -967,6 +1079,7 @@ export async function translateText( false ); } + assertFrontmatterIntegrity(sourceMarkdown, translated.markdown); return translated; }; From d15da434f36eb95e3e820d621b302e9d96703547 Mon Sep 17 00:00:00 2001 From: luandro Date: Thu, 26 Mar 2026 08:26:01 -0300 Subject: [PATCH 18/18] fix(translate): wire frontmatter integrity failures into chunk-halving retry assertFrontmatterIntegrity threw schema_invalid / isCritical:false and its docstring promised the caller would retry, but translateText only retried on unexpected_error + /incomplete/ messages. Frontmatter violations therefore bypassed all retry logic and aborted translation immediately. Extend isRecoverableCompletenessFailure to also match schema_invalid errors whose message contains "Frontmatter integrity check failed", so they share the same chunk-halving retry path as completeness failures. Add a test that verifies a first-attempt frontmatter drop triggers a retry and succeeds when the subsequent attempt preserves all keys. 
--- .../translateFrontMatter.test.ts | 51 +++++++++++++++++++ .../notion-translate/translateFrontMatter.ts | 6 ++- 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/scripts/notion-translate/translateFrontMatter.test.ts b/scripts/notion-translate/translateFrontMatter.test.ts index 4233f651..3b577570 100644 --- a/scripts/notion-translate/translateFrontMatter.test.ts +++ b/scripts/notion-translate/translateFrontMatter.test.ts @@ -1000,6 +1000,57 @@ describe("notion-translate translateFrontMatter", () => { }); }); + it("retries and succeeds when frontmatter integrity fails on first attempt but passes on retry", async () => { + const { translateText } = await import("./translateFrontMatter"); + + const source = + "---\n" + + "title: My Page\n" + + "slug: /my-page\n" + + "---\n\n" + + "# Body\n\nSome content."; + + // First call drops slug (integrity failure); second call preserves it. + mockOpenAIChatCompletionCreate + .mockResolvedValueOnce({ + choices: [ + { + message: { + content: JSON.stringify({ + markdown: + "---\n" + + "title: Minha Página\n" + + "---\n\n" + + "# Corpo\n\nAlgum conteúdo.", + title: "Minha Página", + }), + }, + }, + ], + }) + .mockResolvedValue({ + choices: [ + { + message: { + content: JSON.stringify({ + markdown: + "---\n" + + "title: Minha Página\n" + + "slug: /my-page\n" + + "---\n\n" + + "# Corpo\n\nAlgum conteúdo.", + title: "Minha Página", + }), + }, + }, + ], + }); + + const result = await translateText(source, "My Page", "pt-BR"); + expect(result.markdown).toContain("slug: /my-page"); + expect(result.markdown).toContain("title: Minha Página"); + }); + it("passes when all frontmatter keys are preserved in translation", async () => { const { translateText } = await import("./translateFrontMatter"); diff --git a/scripts/notion-translate/translateFrontMatter.ts b/scripts/notion-translate/translateFrontMatter.ts index 8e3acc9b..2638d4d9 100644 --- a/scripts/notion-translate/translateFrontMatter.ts +++ 
b/scripts/notion-translate/translateFrontMatter.ts @@ -1152,9 +1152,11 @@ export async function translateText( } catch (error) { const isRecoverableCompletenessFailure = error instanceof TranslationError && - error.code === "unexpected_error" && error.isCritical === false && - /incomplete/.test(error.message); + ((error.code === "unexpected_error" && + /incomplete/.test(error.message)) || + (error.code === "schema_invalid" && + /Frontmatter integrity check failed/.test(error.message))); if ( isRecoverableCompletenessFailure &&