Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 53 additions & 9 deletions services/auto-triage-infra/src/parsers/classification-parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -99,13 +99,7 @@ const parseFromCodeBlock = (
text: string,
availableLabels: string[]
): ClassificationResult | null => {
const codeBlockRegex = /```(?:json|JSON)?\s*\r?\n([\s\S]*?)\r?\n\s*```/g;
const codeBlocks: string[] = [];
let match;

while ((match = codeBlockRegex.exec(text)) !== null) {
codeBlocks.push(match[1]);
}
const codeBlocks = extractCodeBlocks(text);

// Try code blocks from last to first (most recent)
for (let i = codeBlocks.length - 1; i >= 0; i--) {
Expand All @@ -119,8 +113,7 @@ const parseFromCodeBlock = (
}

// Fallback: direct tail search for the last code fence pair.
// The regex with lazy quantifier can miss the final block in very large texts
// with many ``` markers, so search backwards from the end instead.
// Search backwards from the end to recover from unmatched earlier fences.
const lastFenceEnd = text.lastIndexOf('```');
if (lastFenceEnd !== -1) {
const searchStart = Math.max(0, lastFenceEnd - 10_000);
Expand All @@ -145,6 +138,57 @@ const parseFromCodeBlock = (
return null;
};

/**
 * Collect the contents of fenced code blocks that may hold JSON.
 *
 * The text is scanned left to right for ``` markers. The rest of the marker's
 * line is read as the fence info string; only fences whose trimmed info string
 * is empty, `json` or `JSON` are collected. A block's content runs from the
 * line after the opening fence up to (not including) the line reported by
 * `findClosingFenceLine`, so it keeps the line break that precedes the closing
 * fence.
 *
 * Caveat: a fence with any other info string is skipped by advancing a single
 * line, not by skipping to its closing fence. The closing fence of such a
 * block may therefore be picked up as an opening fence on a later iteration.
 *
 * @param text - Raw text to scan.
 * @returns Block contents in order of appearance; empty when none are found.
 */
const extractCodeBlocks = (text: string): string[] => {
  const codeBlocks: string[] = [];
  let searchIndex = 0;

  while (searchIndex < text.length) {
    const openFenceIndex = text.indexOf('```', searchIndex);
    if (openFenceIndex === -1) break;

    // The info string is whatever follows the backticks on the same line.
    // Without a line break after the marker there is no content to extract.
    const infoStartIndex = openFenceIndex + 3;
    const contentStartIndex = text.indexOf('\n', infoStartIndex);
    if (contentStartIndex === -1) break;

    const fenceInfo = text.substring(infoStartIndex, contentStartIndex).trim();
    if (fenceInfo !== '' && fenceInfo !== 'json' && fenceInfo !== 'JSON') {
      // Not a JSON candidate: resume scanning on the next line.
      searchIndex = contentStartIndex + 1;
      continue;
    }

    const closeFenceLineIndex = findClosingFenceLine(text, contentStartIndex + 1);
    if (closeFenceLineIndex === -1) {
      // No line from here to the end of the text closes a fence. Any later
      // opening fence would search a subset of those same lines, so it cannot
      // find one either. Stop now instead of rescanning the tail of the text
      // once per remaining marker.
      break;
    }

    codeBlocks.push(text.substring(contentStartIndex + 1, closeFenceLineIndex));

    // Continue after the closing fence line so it is not re-read as an opener.
    const closeFenceLineEndIndex = text.indexOf('\n', closeFenceLineIndex);
    searchIndex = closeFenceLineEndIndex === -1 ? text.length : closeFenceLineEndIndex + 1;
  }

  return codeBlocks;
};

const findClosingFenceLine = (text: string, startIndex: number): number => {
let lineStartIndex = startIndex;

while (lineStartIndex < text.length) {
const lineEndIndex = text.indexOf('\n', lineStartIndex);
const line = text.substring(lineStartIndex, lineEndIndex === -1 ? text.length : lineEndIndex);

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WARNING: Windows \r\n line endings are not stripped — extracted JSON content will contain trailing \r on every line, breaking JSON.parse.

The old regex used \r?\n at both boundaries to strip carriage returns. Here, text.substring(..., lineEndIndex) keeps the \r before the \n, so when the extracted block content is passed to JSON.parse it will fail on Windows-generated or mixed-EOL text.

Fix: strip the trailing \r from each line:

Suggested change
- const line = text.substring(lineStartIndex, lineEndIndex === -1 ? text.length : lineEndIndex);
+ const line = text.substring(lineStartIndex, lineEndIndex === -1 ? text.length : lineEndIndex).replace(/\r$/, '');

const leadingWhitespaceLength = line.length - line.trimStart().length;

if (line.startsWith('```', leadingWhitespaceLength)) {

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WARNING: The check line.startsWith('```', leadingWhitespaceLength) matches any line whose trimmed content begins with three backticks — including ```end, ```json, ``` some text, etc. This means a non-closing fence (e.g. a nested opening fence or an annotated fence) would be incorrectly treated as the closing fence, causing the block content to be truncated early.

The old regex implicitly required the closing fence to consist of only backticks (possibly with surrounding whitespace). The fix is to require the remainder of the line after the backticks to be empty:

Suggested change
- if (line.startsWith('```', leadingWhitespaceLength)) {
+ if (line.trimStart().startsWith('```') && line.trimStart().slice(3).trim() === '') {

return lineStartIndex;
}

if (lineEndIndex === -1) break;
lineStartIndex = lineEndIndex + 1;
}

return -1;
};

/**
* Extract classification from plain JSON objects in text
* Uses balanced brace matching to find JSON objects
Expand Down