From 4edcc3ea0c732b25caca57646eb0e5f22a59d4a0 Mon Sep 17 00:00:00 2001 From: dacharyc Date: Fri, 1 May 2026 22:29:30 -0400 Subject: [PATCH] fix: preserve original .md URL from llms.txt for markdown-availability checks When llms.txt linked to a .md/.mdx URL (notably Plaid's /index.html.md form), normalizePageUrl rewrote it to its HTML equivalent for sitemap dedup, then toMdUrls regenerated candidates from the HTML form that missed the URL the site actually published. markdown-url-support scored 0% on otherwise-compliant sites. Carry the original .md URL alongside the normalized URL through discovery as originalMdUrls. markdown-url-support tries it first, then falls through to toMdUrls() candidates, then a parent-clean fallback (gated to /index.html.md sources). toMdUrls itself is unchanged so other checks (llms-txt-directive-md, llms-txt-links-markdown) cannot regress to the prior false-positive class. Closes #77 --- .../markdown-url-support.ts | 75 +++- src/helpers/get-page-urls.ts | 180 ++++++-- .../unit/checks/llms-txt-directive-md.test.ts | 42 ++ .../checks/llms-txt-links-markdown.test.ts | 39 ++ test/unit/checks/markdown-url-support.test.ts | 410 ++++++++++++++++++ test/unit/helpers/get-page-urls.test.ts | 219 ++++++++++ test/unit/helpers/to-md-urls.test.ts | 24 + 7 files changed, 947 insertions(+), 42 deletions(-) diff --git a/src/checks/markdown-availability/markdown-url-support.ts b/src/checks/markdown-availability/markdown-url-support.ts index da3a3f2..106ab39 100644 --- a/src/checks/markdown-availability/markdown-url-support.ts +++ b/src/checks/markdown-availability/markdown-url-support.ts @@ -12,6 +12,8 @@ interface PageResult { alreadyMd?: boolean; status: number; error?: string; + /** True when the original llms.txt-published URL served the content. */ + originalUrlServed?: boolean; } /** @@ -19,12 +21,16 @@ interface PageResult { * based on which candidate succeeded in previous results. 
* Returns 'index' if `page/index.md` wins, 'direct' if `page.md` wins, or null if * there's no clear winner yet. + * + * Wins served via the `originalMdUrl` from llms.txt are NOT counted: those + * URLs reflect the site's published form, not a `toMdUrls()` candidate, and + * counting them would skew the heuristic for unrelated pages. */ function detectPreferredMdForm(results: PageResult[]): 'direct' | 'index' | null { let directWins = 0; let indexWins = 0; for (const r of results) { - if (!r.supported || !r.mdUrl) continue; + if (!r.supported || !r.mdUrl || r.originalUrlServed) continue; if (r.mdUrl.endsWith('/index.md') || r.mdUrl.endsWith('/index.mdx')) { indexWins++; } else { @@ -38,6 +44,33 @@ function detectPreferredMdForm(results: PageResult[]): 'direct' | 'index' | null return null; } +/** + * Issue #77: when llms.txt published a `/foo/index.html.md` URL but it 404s + * (and the regenerated `/foo/index.md` and `/foo/index.html/index.md` also + * 404), some sites still serve the markdown at the parent-clean path + * `/foo.md` (Plaid's pattern). This is gated to URLs whose llms.txt original + * matched `/index.html?\.md$` — strong evidence the site uses this convention. + * + * Do NOT move this into `toMdUrls()`. Other checks (`llms-txt-directive-md`, + * `llms-txt-links-markdown`, etc.) call `toMdUrls()` directly and would + * regress to the old false-positive class where unrelated sibling .md files + * pass validation. See issue #77 discussion. 
+ */ +function deriveParentCleanMd(pageUrl: string, originalMdUrl: string): string | null { + if (!/\/index\.html?\.md$/i.test(new URL(originalMdUrl).pathname)) return null; + try { + const u = new URL(pageUrl); + const pathname = u.pathname.replace(/\/$/, ''); + // Strip /index.html or /index.htm from the page URL and append .md + const stripped = pathname.replace(/\/index\.html?$/i, ''); + if (!stripped || stripped === pathname) return null; + u.pathname = `${stripped}.md`; + return u.toString(); + } catch { + return null; + } +} + /** * Reorder toMdUrls() candidates based on the detected site preference. * 'index' puts `page/index.md` first; 'direct' keeps the default order (`page.md` first). @@ -58,6 +91,7 @@ async function check(ctx: CheckContext): Promise { totalPages, sampled: wasSampled, warnings, + originalMdUrls, } = await discoverAndSamplePages(ctx); const results: PageResult[] = []; @@ -68,15 +102,35 @@ async function check(ctx: CheckContext): Promise { const batch = pageUrls.slice(i, i + concurrency); const batchResults = await Promise.all( batch.map(async (url): Promise => { - const candidates = toMdUrls(url); + const baseCandidates = toMdUrls(url); // Non-markdown file types (e.g. .json, .xml) have no .md equivalent — skip them - if (candidates.length === 0) { + if (baseCandidates.length === 0) { return { url, mdUrl: url, supported: false, skipped: true, status: 0 }; } const alreadyMd = /\.mdx?$/i.test(new URL(url).pathname); - const ordered = orderCandidates(candidates, mdFormPreference); + const original = originalMdUrls?.[url]; + const parentClean = original ? deriveParentCleanMd(url, original) : null; + + // Build candidate list: + // 1. originalMdUrl (the URL llms.txt published) — first, when present. + // 2. toMdUrls() candidates, reordered by detected site preference. + // 3. parent-clean fallback (issue #77) — last, only when llms.txt + // published a /foo/index.html.md form. Tried only if 1+2 fail. 
+ const ordered = orderCandidates(baseCandidates, mdFormPreference); + const candidateList: string[] = []; + const seen = new Set(); + const addCandidate = (c: string | null | undefined) => { + if (c && !seen.has(c)) { + seen.add(c); + candidateList.push(c); + } + }; + addCandidate(original); + for (const c of ordered) addCandidate(c); + addCandidate(parentClean); + let lastError: string | undefined; - for (const mdUrl of ordered) { + for (const mdUrl of candidateList) { try { const response = await ctx.http.fetch(mdUrl); const body = await response.text(); @@ -90,7 +144,14 @@ async function check(ctx: CheckContext): Promise { url, markdown: { content: body, source: 'md-url' }, }); - return { url, mdUrl, supported: true, alreadyMd, status: response.status }; + return { + url, + mdUrl, + supported: true, + alreadyMd, + status: response.status, + originalUrlServed: mdUrl === original, + }; } lastError = undefined; // Got a response, not a fetch error } catch (err) { @@ -99,7 +160,7 @@ async function check(ctx: CheckContext): Promise { } return { url, - mdUrl: ordered[0], + mdUrl: candidateList[0], supported: false, alreadyMd, status: 0, diff --git a/src/helpers/get-page-urls.ts b/src/helpers/get-page-urls.ts index 1b6d42d..30430b3 100644 --- a/src/helpers/get-page-urls.ts +++ b/src/helpers/get-page-urls.ts @@ -43,45 +43,100 @@ export async function getUrlsFromCachedLlmsTxt(ctx: CheckContext): Promise }> { + const existsResult = ctx.previousResults.get('llms-txt-exists'); + const discovered = getLlmsTxtFilesForAnalysis(existsResult); + + const entries = extractLinksFromLlmsTxtFiles(discovered); + const result = await walkAggregateLinksWithOriginals(ctx, entries); + return { + pageUrls: result.pageUrls.map((p) => p.url), + originalMdUrls: collectOriginalMdUrls(result.pageUrls), + }; +} + export async function getUrlsFromCachedLlmsTxtWithOmitted( ctx: CheckContext, ): Promise { const existsResult = ctx.previousResults.get('llms-txt-exists'); const discovered = 
getLlmsTxtFilesForAnalysis(existsResult); - const urls = extractLinksFromLlmsTxtFiles(discovered); - return walkAggregateLinks(ctx, urls); + const entries = extractLinksFromLlmsTxtFiles(discovered); + const result = await walkAggregateLinksWithOriginals(ctx, entries); + return { + pageUrls: result.pageUrls.map((p) => p.url), + omittedTxtUrls: result.omittedTxtUrls, + }; +} + +function collectOriginalMdUrls(pages: DiscoveredPageUrl[]): Record { + const map: Record = {}; + for (const p of pages) { + if (p.originalMdUrl && !(p.url in map)) { + map[p.url] = p.originalMdUrl; + } + } + return map; } /** * Normalize a discovered page URL: convert .md/.mdx URLs to their HTML * equivalent so that llms.txt entries like `/docs/guide/index.md` deduplicate - * against sitemap entries like `/docs/guide/`. Markdown-specific checks are - * unaffected because they derive .md candidates from HTML URLs via toMdUrls(). + * against sitemap entries like `/docs/guide/`. The original .md URL is + * returned alongside as `originalMdUrl` so that markdown-availability checks + * can still test the URL the site explicitly published (issue #77). */ -function normalizePageUrl(url: string): string { - return isMdUrl(url) ? toHtmlUrl(url) : url; +function normalizePageUrl(url: string): { url: string; originalMdUrl?: string } { + if (isMdUrl(url)) { + return { url: toHtmlUrl(url), originalMdUrl: url }; + } + return { url }; +} + +interface DiscoveredPageUrl { + url: string; + originalMdUrl?: string; } -function extractLinksFromLlmsTxtFiles(files: DiscoveredFile[]): string[] { - const urls = new Set(); +function extractLinksFromLlmsTxtFiles(files: DiscoveredFile[]): DiscoveredPageUrl[] { + // Map normalized URL → originalMdUrl. A given page may appear in llms.txt + // both as `/page.md` and `/page` (HTML); we keep the .md form so downstream + // markdown checks have a known-good URL to test. 
+ const seen = new Map(); + + function record(rawUrl: string) { + const { url, originalMdUrl } = normalizePageUrl(rawUrl); + const existing = seen.get(url); + if (existing === undefined && originalMdUrl) { + seen.set(url, originalMdUrl); + } else if (!seen.has(url)) { + seen.set(url, originalMdUrl); + } + } + for (const file of files) { const links = extractMarkdownLinks(file.content); for (const link of links) { if (link.url.startsWith('http://') || link.url.startsWith('https://')) { - urls.add(normalizePageUrl(link.url)); + record(link.url); } else if (link.url.startsWith('/')) { // Resolve root-relative URLs against the source file's origin try { const base = new URL(file.url); - urls.add(normalizePageUrl(new URL(link.url, base.origin).toString())); + record(new URL(link.url, base.origin).toString()); } catch { // Skip malformed URLs } } } } - return Array.from(urls); + return Array.from(seen, ([url, originalMdUrl]) => ({ url, originalMdUrl })); } /** @@ -98,29 +153,32 @@ export interface AggregateWalkResult { omittedTxtUrls: string[]; } -async function walkAggregateLinks(ctx: CheckContext, urls: string[]): Promise { - const pageUrls: string[] = []; +async function walkAggregateLinksWithOriginals( + ctx: CheckContext, + entries: DiscoveredPageUrl[], +): Promise<{ pageUrls: DiscoveredPageUrl[]; omittedTxtUrls: string[] }> { + const pageUrls: DiscoveredPageUrl[] = []; const aggregateUrls: string[] = []; const omittedTxtUrls: string[] = []; const siteOrigin = ctx.effectiveOrigin ?? 
ctx.origin; - for (const url of urls) { + for (const entry of entries) { try { - const parsed = new URL(url); + const parsed = new URL(entry.url); if (/\.txt$/i.test(parsed.pathname)) { // .txt files are either aggregate indexes to walk (same origin) // or external resources to skip — never page URLs themselves if (parsed.origin === ctx.origin || parsed.origin === siteOrigin) { - aggregateUrls.push(url); + aggregateUrls.push(entry.url); } } else if (parsed.origin === ctx.origin || parsed.origin === siteOrigin) { // Only include same-origin page URLs; cross-origin links are // external resources the site owner doesn't control. - pageUrls.push(normalizePageUrl(url)); + pageUrls.push(entry); } } catch { - pageUrls.push(normalizePageUrl(url)); + pageUrls.push(entry); } } @@ -144,19 +202,19 @@ async function walkAggregateLinks(ctx: CheckContext, urls: string[]): Promise { +async function fetchLlmsTxtUrls( + ctx: CheckContext, +): Promise<{ pageUrls: string[]; originalMdUrls: Record }> { const explicitUrl = ctx.options.llmsTxtUrl; const candidates = explicitUrl ? [explicitUrl] @@ -213,9 +273,12 @@ async function fetchLlmsTxtUrls(ctx: CheckContext): Promise { const canonical = selectCanonicalLlmsTxt(discovered, ctx.baseUrl); const filesForAnalysis = canonical ? [canonical] : []; - const urls = extractLinksFromLlmsTxtFiles(filesForAnalysis); - const result = await walkAggregateLinks(ctx, urls); - return result.pageUrls; + const entries = extractLinksFromLlmsTxtFiles(filesForAnalysis); + const result = await walkAggregateLinksWithOriginals(ctx, entries); + return { + pageUrls: result.pageUrls.map((p) => p.url), + originalMdUrls: collectOriginalMdUrls(result.pageUrls), + }; } /** @@ -285,6 +348,13 @@ export interface PageUrlResult { warnings: string[]; /** Which discovery methods contributed to the final URL set. */ sources: DiscoverySource[]; + /** + * Map of normalized URL → the original .md/.mdx URL the llms.txt published. 
+ * Only populated for URLs discovered via llms.txt; sitemap URLs contribute none. + * Markdown-availability checks use this to test the site-published markdown + * URL alongside the conventional candidates from `toMdUrls()` (issue #77). + */ + originalMdUrls?: Record; } function isGzipped(url: string): boolean { @@ -880,33 +950,54 @@ export async function getPageUrls(ctx: CheckContext): Promise { return deduplicateVersionedUrls(localeFiltered, version); } + /** Filter the originalMdUrls map to a subset of URLs. */ + function filterOriginalMdUrls( + map: Record, + keep: string[], + ): Record | undefined { + const keepSet = new Set(keep); + const out: Record = {}; + for (const [url, original] of Object.entries(map)) { + if (keepSet.has(url)) out[url] = original; + } + return Object.keys(out).length > 0 ? out : undefined; + } + // 1. Try llms.txt links from cached results (if llms-txt-exists ran) - const cachedUrls = await getUrlsFromCachedLlmsTxt(ctx); - let llmsTxtUrls = refineUrls(filterByPathPrefix(cachedUrls, filterBase)); + const cached = await getUrlsFromCachedLlmsTxtWithOriginals(ctx); + let llmsTxtUrls = refineUrls(filterByPathPrefix(cached.pageUrls, filterBase)); + let originalMdUrls = cached.originalMdUrls; // 2. 
Try fetching llms.txt directly (standalone mode, llms-txt-exists didn't run) if (llmsTxtUrls.length === 0 && !ctx.previousResults.has('llms-txt-exists')) { - const fetchedUrls = await fetchLlmsTxtUrls(ctx); - llmsTxtUrls = refineUrls(filterByPathPrefix(fetchedUrls, filterBase)); + const fetched = await fetchLlmsTxtUrls(ctx); + llmsTxtUrls = refineUrls(filterByPathPrefix(fetched.pageUrls, filterBase)); + originalMdUrls = fetched.originalMdUrls; } if (llmsTxtUrls.length > 0) { sources.push('llms-txt'); + const filteredOriginals = filterOriginalMdUrls(originalMdUrls, llmsTxtUrls); // If llms.txt meets the requested sample size, no need for sitemap if (llmsTxtUrls.length >= ctx.options.maxLinksToTest) { - return { urls: llmsTxtUrls, warnings, sources }; + return { urls: llmsTxtUrls, warnings, sources, originalMdUrls: filteredOriginals }; } // llms.txt is thin — try sitemap to fill the gap const sitemapUrls = await getUrlsFromSitemap(ctx, warnings, { pathFilterBase: filterBase }); if (sitemapUrls.length > 0) { sources.push('sitemap'); - return { urls: mergeUrlSets(llmsTxtUrls, sitemapUrls), warnings, sources }; + return { + urls: mergeUrlSets(llmsTxtUrls, sitemapUrls), + warnings, + sources, + originalMdUrls: filteredOriginals, + }; } // Sitemap had nothing; return llms.txt URLs alone - return { urls: llmsTxtUrls, warnings, sources }; + return { urls: llmsTxtUrls, warnings, sources, originalMdUrls: filteredOriginals }; } // 3. Try sitemap (path, locale, and version filtering applied inside) @@ -930,6 +1021,12 @@ export interface SampledPages { urlTags?: Record; /** Which discovery methods contributed to the page URL set. */ sources?: DiscoverySource[]; + /** + * Map of sampled URL → original .md/.mdx URL from llms.txt (issue #77). + * `markdown-url-support` uses this to test the URL the site explicitly + * published before falling back to conventional `toMdUrls()` candidates. 
+ */ + originalMdUrls?: Record; } /** @@ -1023,12 +1120,25 @@ export async function discoverAndSamplePages(ctx: CheckContext): Promise | undefined; + if (discovery.originalMdUrls) { + const sampledSet = new Set(urls); + const filtered: Record = {}; + for (const [url, original] of Object.entries(discovery.originalMdUrls)) { + if (sampledSet.has(url)) filtered[url] = original; + } + if (Object.keys(filtered).length > 0) originalMdUrls = filtered; + } + ctx._sampledPages = { urls, totalPages, sampled, warnings: discovery.warnings, sources: discovery.sources, + originalMdUrls, }; return ctx._sampledPages; } diff --git a/test/unit/checks/llms-txt-directive-md.test.ts b/test/unit/checks/llms-txt-directive-md.test.ts index e0d3f7c..87c16a5 100644 --- a/test/unit/checks/llms-txt-directive-md.test.ts +++ b/test/unit/checks/llms-txt-directive-md.test.ts @@ -380,4 +380,46 @@ describe('llms-txt-directive-md', () => { expect(result.status).toBe('pass'); expect(result.details?.foundCount).toBe(1); }); + + // Issue #77 isolation: parent-clean candidate generation lives in + // markdown-url-support only. If it ever leaks into toMdUrls, this directive + // check would falsely pass when an unrelated /auth.md happens to contain + // the directive — a known false-positive class. Guard against that. + it('does not validate directive via parent-clean .md candidate (issue #77 isolation)', async () => { + const requestLog: string[] = []; + server.use( + // The page URL would be /docs/auth/index.html (HTML form). Conventional + // .md candidates 404… + http.get('http://test.local/docs/auth/index.md', () => { + requestLog.push('/docs/auth/index.md'); + return new HttpResponse('Not Found', { status: 404 }); + }), + http.get('http://test.local/docs/auth/index.html/index.md', () => { + requestLog.push('/docs/auth/index.html/index.md'); + return new HttpResponse('Not Found', { status: 404 }); + }), + // …but a sibling /docs/auth.md exists with the directive. 
The + // directive check must NOT request this URL. + http.get('http://test.local/docs/auth.md', () => { + requestLog.push('/docs/auth.md'); + return new HttpResponse( + '> For AI agents: see [documentation index](/llms.txt) for navigation.\n\n# Wrong page', + { status: 200, headers: { 'Content-Type': 'text/markdown' } }, + ); + }), + // Content negotiation also fails so it can't pass that way + http.get('http://test.local/docs/auth/index.html', () => { + requestLog.push('/docs/auth/index.html [accept]'); + return new HttpResponse('', { + status: 200, + headers: { 'Content-Type': 'text/html' }, + }); + }), + ); + + const ctx = makeCtx(llms('/docs/auth/index.html.md')); + const result = await check.run(ctx); + expect(result.status).not.toBe('pass'); + expect(requestLog).not.toContain('/docs/auth.md'); + }); }); diff --git a/test/unit/checks/llms-txt-links-markdown.test.ts b/test/unit/checks/llms-txt-links-markdown.test.ts index f600e16..12e84d4 100644 --- a/test/unit/checks/llms-txt-links-markdown.test.ts +++ b/test/unit/checks/llms-txt-links-markdown.test.ts @@ -328,4 +328,43 @@ Just text, no links here. expect(result.status).toBe('warn'); expect(result.details?.mdVariantsAvailable).toBe(1); }); + + // Issue #77 isolation: parent-clean candidate generation lives in + // markdown-url-support only. If it ever leaks into toMdUrls, this links + // check would falsely validate a link to /docs/auth/index.html via an + // unrelated /docs/auth.md sibling. Guard against that. 
+ it('does not validate link via parent-clean .md candidate (issue #77 isolation)', async () => { + const requestLog: string[] = []; + server.use( + http.head('http://test.local/docs/auth/index.html', () => { + requestLog.push('HEAD /docs/auth/index.html'); + return new HttpResponse(null, { status: 200 }); + }), + http.head('http://test.local/docs/auth/index.md', () => { + requestLog.push('HEAD /docs/auth/index.md'); + return new HttpResponse(null, { status: 404 }); + }), + http.head('http://test.local/docs/auth/index.html/index.md', () => { + requestLog.push('HEAD /docs/auth/index.html/index.md'); + return new HttpResponse(null, { status: 404 }); + }), + // The sibling /docs/auth.md exists but must not be requested. + http.head('http://test.local/docs/auth.md', () => { + requestLog.push('HEAD /docs/auth.md'); + return new HttpResponse(null, { status: 200 }); + }), + http.get('http://test.local/docs/auth.md', () => { + requestLog.push('GET /docs/auth.md'); + return new HttpResponse('# unrelated', { + status: 200, + headers: { 'Content-Type': 'text/markdown' }, + }); + }), + ); + + const content = `# Test\n> Summary\n## Links\n- [Auth](http://test.local/docs/auth/index.html): Auth\n`; + await check.run(makeCtx(content)); + expect(requestLog).not.toContain('HEAD /docs/auth.md'); + expect(requestLog).not.toContain('GET /docs/auth.md'); + }); }); diff --git a/test/unit/checks/markdown-url-support.test.ts b/test/unit/checks/markdown-url-support.test.ts index e39d35b..d4c2f4f 100644 --- a/test/unit/checks/markdown-url-support.test.ts +++ b/test/unit/checks/markdown-url-support.test.ts @@ -410,6 +410,85 @@ describe('markdown-url-support', () => { expect(cached?.markdown?.source).toBe('md-url'); }); + // Regression: issue #77 — sites whose llms.txt links use a `.html.md` + // suffix (e.g. 
Plaid: /docs/auth/index.html.md) lose markdown discovery + // because normalizePageUrl strips the .md, and the regenerated candidates + // (/docs/auth/index.md, /docs/auth/index.html/index.md) miss the real file. + it('detects markdown when llms.txt uses .html.md suffix (issue #77)', async () => { + const md = '# Auth\n\nAuth documentation with [a link](http://example.com).'; + server.use( + http.get( + 'http://test.local/docs/auth/index.html.md', + () => + new HttpResponse(md, { + status: 200, + headers: { 'Content-Type': 'text/markdown' }, + }), + ), + // The conventional candidates that the check currently generates 404 + http.get( + 'http://test.local/docs/auth/index.md', + () => new HttpResponse('Not Found', { status: 404 }), + ), + http.get( + 'http://test.local/docs/auth.md', + () => new HttpResponse('Not Found', { status: 404 }), + ), + http.get( + 'http://test.local/docs/auth/index.html/index.md', + () => new HttpResponse('Not Found', { status: 404 }), + ), + ); + + const content = `# Docs +> Summary +## Links +- [Auth](http://test.local/docs/auth/index.html.md): Auth guide +`; + const result = await check.run(makeCtx({ content })); + expect(result.status).toBe('pass'); + expect(result.details?.mdSupported).toBe(1); + }); + + // Regression: issue #77, second variant — Plaid actually does serve + // /docs/auth.md (clean path + .md) in addition to /docs/auth/index.html.md. + // The check should find at least one working markdown form. + it('detects markdown via clean-path .md when llms.txt uses .html.md suffix (issue #77)', async () => { + const md = '# Auth\n\nClean-path markdown with [a link](http://example.com).'; + server.use( + http.get( + 'http://test.local/docs/auth.md', + () => + new HttpResponse(md, { + status: 200, + headers: { 'Content-Type': 'text/markdown' }, + }), + ), + // The .html.md form 404s in this scenario; only /docs/auth.md works. 
+ http.get( + 'http://test.local/docs/auth/index.html.md', + () => new HttpResponse('Not Found', { status: 404 }), + ), + http.get( + 'http://test.local/docs/auth/index.md', + () => new HttpResponse('Not Found', { status: 404 }), + ), + http.get( + 'http://test.local/docs/auth/index.html/index.md', + () => new HttpResponse('Not Found', { status: 404 }), + ), + ); + + const content = `# Docs +> Summary +## Links +- [Auth](http://test.local/docs/auth/index.html.md): Auth guide +`; + const result = await check.run(makeCtx({ content })); + expect(result.status).toBe('pass'); + expect(result.details?.mdSupported).toBe(1); + }); + it('auto-detects page/index.md preference and tries it first in later batches', async () => { // 3 pages, all served at page/index.md (not page.md). With concurrency=1, // each page is a separate batch, so after page 1+2 the check should @@ -479,4 +558,335 @@ describe('markdown-url-support', () => { expect(requestLog).not.toContain('/docs/c.md'); expect(requestLog).toContain('/docs/c/index.md'); }); + + // ── Issue #77 coverage: original-URL preservation and parent-clean gating ── + + // Sibling-page trap: when both the original .html.md and an unrelated + // /auth.md exist, the original must be tried first so we don't accidentally + // associate the page with a sibling's content. 
+ it('prefers originalMdUrl over parent-clean candidate when both succeed (issue #77)', async () => { + const requestLog: string[] = []; + const realMd = '# Auth\n\nReal auth doc.'; + const wrongMd = '# Authentication Section\n\nA different page entirely.'; + server.use( + http.get('http://test.local/docs/auth/index.html.md', () => { + requestLog.push('/docs/auth/index.html.md'); + return new HttpResponse(realMd, { + status: 200, + headers: { 'Content-Type': 'text/markdown' }, + }); + }), + http.get('http://test.local/docs/auth.md', () => { + requestLog.push('/docs/auth.md'); + return new HttpResponse(wrongMd, { + status: 200, + headers: { 'Content-Type': 'text/markdown' }, + }); + }), + ); + + const content = `# Docs +> Summary +## Links +- [Auth](http://test.local/docs/auth/index.html.md): Auth guide +`; + const result = await check.run(makeCtx({ content })); + expect(result.status).toBe('pass'); + const pageResults = result.details?.pageResults as Array<{ mdUrl: string }>; + expect(pageResults[0].mdUrl).toBe('http://test.local/docs/auth/index.html.md'); + expect(requestLog[0]).toBe('/docs/auth/index.html.md'); + expect(requestLog).not.toContain('/docs/auth.md'); + }); + + // Without an originalMdUrl signal (sitemap-only discovery), the parent-clean + // candidate must NOT be emitted. Otherwise an unrelated /auth.md would + // false-positive the check for a /auth/index.html page. 
+ it('does not test /foo.md when /foo/index.html came from sitemap (issue #77 isolation)', async () => { + const requestLog: string[] = []; + server.use( + http.get('http://parentclean.local/robots.txt', () => new HttpResponse('', { status: 404 })), + http.get( + 'http://parentclean.local/sitemap.xml', + () => + new HttpResponse( + ` + http://parentclean.local/docs/auth/index.html + `, + { status: 200, headers: { 'Content-Type': 'application/xml' } }, + ), + ), + http.get('http://parentclean.local/docs/auth/index.md', () => { + requestLog.push('/docs/auth/index.md'); + return new HttpResponse('Not Found', { status: 404 }); + }), + http.get('http://parentclean.local/docs/auth/index.html/index.md', () => { + requestLog.push('/docs/auth/index.html/index.md'); + return new HttpResponse('Not Found', { status: 404 }); + }), + http.get('http://parentclean.local/docs/auth.md', () => { + requestLog.push('/docs/auth.md'); + return new HttpResponse('# Some other page\n\nUnrelated content.', { + status: 200, + headers: { 'Content-Type': 'text/markdown' }, + }); + }), + ); + + const ctx = createContext('http://parentclean.local', { requestDelay: 0 }); + ctx.previousResults.set('llms-txt-exists', { + id: 'llms-txt-exists', + category: 'content-discoverability', + status: 'fail', + message: 'No llms.txt', + details: { discoveredFiles: [] }, + }); + const result = await check.run(ctx); + expect(result.status).toBe('fail'); + expect(requestLog).not.toContain('/docs/auth.md'); + }); + + // Plain .md original (e.g. /docs/auth.md from llms.txt). No /index.html + // form is involved, so the parent-clean candidate must not fire. 
+ it('does not emit parent-clean candidate when originalMdUrl is plain .md (issue #77)', async () => { + const requestLog: string[] = []; + server.use( + http.get('http://test.local/docs/feature.md', () => { + requestLog.push('/docs/feature.md'); + return new HttpResponse('# Feature\n\nThe feature.', { + status: 200, + headers: { 'Content-Type': 'text/markdown' }, + }); + }), + http.get('http://test.local/docs.md', () => { + requestLog.push('/docs.md'); + return new HttpResponse('# Wrong page\n', { + status: 200, + headers: { 'Content-Type': 'text/markdown' }, + }); + }), + ); + + const content = `# Docs +> Summary +## Links +- [Feature](http://test.local/docs/feature.md): Feature +`; + const result = await check.run(makeCtx({ content })); + expect(result.status).toBe('pass'); + expect(requestLog[0]).toBe('/docs/feature.md'); + expect(requestLog).not.toContain('/docs.md'); + }); + + // .mdx originals get the original-first benefit, but the parent-clean gate + // requires /index.html?\.md specifically. Confirm parent-clean does not fire. + it('tries .mdx original first but does not emit parent-clean candidate (issue #77)', async () => { + const requestLog: string[] = []; + server.use( + http.get('http://test.local/docs/guide.mdx', () => { + requestLog.push('/docs/guide.mdx'); + return new HttpResponse('# Guide\n\nThe guide.', { + status: 200, + headers: { 'Content-Type': 'text/markdown' }, + }); + }), + http.get('http://test.local/docs.md', () => { + requestLog.push('/docs.md'); + return new HttpResponse('Wrong', { status: 200 }); + }), + ); + + const content = `# Docs +> Summary +## Links +- [Guide](http://test.local/docs/guide.mdx): Guide +`; + const result = await check.run(makeCtx({ content })); + expect(result.status).toBe('pass'); + expect(requestLog[0]).toBe('/docs/guide.mdx'); + expect(requestLog).not.toContain('/docs.md'); + }); + + // /page.html.md (no /index segment): gate requires /index.html.md, so the + // parent-clean candidate is NOT emitted. 
The original is still tried first. + it('does not emit parent-clean candidate for /page.html.md (no /index)', async () => { + const requestLog: string[] = []; + server.use( + http.get('http://test.local/docs/auth.html.md', () => { + requestLog.push('/docs/auth.html.md'); + return new HttpResponse('# Auth\n\nAuth.', { + status: 200, + headers: { 'Content-Type': 'text/markdown' }, + }); + }), + ); + + const content = `# Docs +> Summary +## Links +- [Auth](http://test.local/docs/auth.html.md): Auth +`; + const result = await check.run(makeCtx({ content })); + expect(result.status).toBe('pass'); + expect(requestLog[0]).toBe('/docs/auth.html.md'); + }); + + // Original 404, parent-clean wins (Plaid second variant). + it('falls back to parent-clean candidate when /index.html.md original 404s (issue #77)', async () => { + const requestLog: string[] = []; + const md = '# Auth\n\nThe auth doc served via clean path.'; + server.use( + http.get('http://test.local/docs/auth/index.html.md', () => { + requestLog.push('/docs/auth/index.html.md'); + return new HttpResponse('Not Found', { status: 404 }); + }), + http.get('http://test.local/docs/auth/index.md', () => { + requestLog.push('/docs/auth/index.md'); + return new HttpResponse('Not Found', { status: 404 }); + }), + http.get('http://test.local/docs/auth/index.html/index.md', () => { + requestLog.push('/docs/auth/index.html/index.md'); + return new HttpResponse('Not Found', { status: 404 }); + }), + http.get('http://test.local/docs/auth.md', () => { + requestLog.push('/docs/auth.md'); + return new HttpResponse(md, { + status: 200, + headers: { 'Content-Type': 'text/markdown' }, + }); + }), + ); + + const content = `# Docs +> Summary +## Links +- [Auth](http://test.local/docs/auth/index.html.md): Auth +`; + const result = await check.run(makeCtx({ content })); + expect(result.status).toBe('pass'); + expect(requestLog).toContain('/docs/auth/index.html.md'); + expect(requestLog).toContain('/docs/auth.md'); + const pageResults = 
result.details?.pageResults as Array<{ mdUrl: string }>; + expect(pageResults[0].mdUrl).toBe('http://test.local/docs/auth.md'); + }); + + // All forms 404: must fail cleanly without infinite candidate expansion. + it('fails cleanly when original, generated, and parent-clean candidates all 404', async () => { + const requestLog: string[] = []; + server.use( + http.get('http://test.local/docs/auth/index.html.md', () => { + requestLog.push('A'); + return new HttpResponse('Not Found', { status: 404 }); + }), + http.get('http://test.local/docs/auth/index.md', () => { + requestLog.push('B'); + return new HttpResponse('Not Found', { status: 404 }); + }), + http.get('http://test.local/docs/auth/index.html/index.md', () => { + requestLog.push('C'); + return new HttpResponse('Not Found', { status: 404 }); + }), + http.get('http://test.local/docs/auth.md', () => { + requestLog.push('D'); + return new HttpResponse('Not Found', { status: 404 }); + }), + ); + + const content = `# Docs +> Summary +## Links +- [Auth](http://test.local/docs/auth/index.html.md): Auth +`; + const result = await check.run(makeCtx({ content })); + expect(result.status).toBe('fail'); + // Bounded: at most original + 2 generated + 1 parent-clean = 4 requests. + expect(requestLog.length).toBeLessThanOrEqual(4); + }); + + // mdFormPreference must not be skewed by originalMdUrl wins; otherwise a + // run of .html.md sites would mis-bias the heuristic for unrelated pages. 
+  it('does not skew mdFormPreference based on originalMdUrl wins (issue #77)', async () => {
+    const md = '# P\n\nContent.';
+    const requestLog: string[] = [];
+    server.use(
+      // Three .html.md originals — all win on first try
+      http.get('http://test.local/docs/a/index.html.md', () => {
+        requestLog.push('/docs/a/index.html.md');
+        return new HttpResponse(md, {
+          status: 200,
+          headers: { 'Content-Type': 'text/markdown' },
+        });
+      }),
+      http.get('http://test.local/docs/b/index.html.md', () => {
+        requestLog.push('/docs/b/index.html.md');
+        return new HttpResponse(md, {
+          status: 200,
+          headers: { 'Content-Type': 'text/markdown' },
+        });
+      }),
+      http.get('http://test.local/docs/c/index.html.md', () => {
+        requestLog.push('/docs/c/index.html.md');
+        return new HttpResponse(md, {
+          status: 200,
+          headers: { 'Content-Type': 'text/markdown' },
+        });
+      }),
+      // Fourth page is plain HTML (no llms.txt original): .md is tried first by
+      // default. /docs/d/index.md is mocked too so an out-of-order probe is logged.
+      http.get('http://test.local/docs/d.md', () => {
+        requestLog.push('/docs/d.md');
+        return new HttpResponse(md, {
+          status: 200,
+          headers: { 'Content-Type': 'text/markdown' },
+        });
+      }),
+      http.get('http://test.local/docs/d/index.md', () => {
+        requestLog.push('/docs/d/index.md');
+        return new HttpResponse('Not Found', { status: 404 });
+      }),
+    );
+
+    const content = `# Docs
+> Summary
+## Links
+- [A](http://test.local/docs/a/index.html.md): A
+- [B](http://test.local/docs/b/index.html.md): B
+- [C](http://test.local/docs/c/index.html.md): C
+- [D](http://test.local/docs/d): D
+`;
+    const ctx = makeCtx({ content });
+    ctx.options.maxConcurrency = 1;
+    const result = await check.run(ctx);
+    expect(result.status).toBe('pass');
+    // Page D's first request must be the direct form. NOTE(review): the originals
+    // end in /index.html.md, not /index.md, so counting them would presumably bias
+    // toward 'direct' rather than 'index' — consider /index.md originals here.
+    const dRequests = requestLog.filter((path) => path.startsWith('/docs/d'));
+    expect(dRequests[0]).toBe('/docs/d.md');
+  });
+
+  // pageResults reflects the URL the site actually served, not the page URL.
+  it('reports originalMdUrl in pageResults.mdUrl when it served the content', async () => {
+    server.use(
+      http.get(
+        'http://test.local/docs/auth/index.html.md',
+        () =>
+          new HttpResponse('# Auth', {
+            status: 200,
+            headers: { 'Content-Type': 'text/markdown' },
+          }),
+      ),
+    );
+    const content = `# Docs
+> Summary
+## Links
+- [Auth](http://test.local/docs/auth/index.html.md): Auth
+`;
+    const ctx = makeCtx({ content });
+    const result = await check.run(ctx);
+    const pageResults = result.details?.pageResults as Array<{ mdUrl: string }>;
+    expect(pageResults[0].mdUrl).toBe('http://test.local/docs/auth/index.html.md');
+    // Cache is keyed by page URL (the normalized HTML form)
+    const cached = ctx.pageCache.get('http://test.local/docs/auth/index.html');
+    expect(cached?.markdown?.content).toBe('# Auth');
+  });
 });
diff --git a/test/unit/helpers/get-page-urls.test.ts b/test/unit/helpers/get-page-urls.test.ts
index 22322aa..1ad7933 100644
--- a/test/unit/helpers/get-page-urls.test.ts
+++ b/test/unit/helpers/get-page-urls.test.ts
@@ -2120,3 +2120,222 @@ describe('discoverAndSamplePages', () => {
     expect(result.warnings[0]).toContain('gzipped sitemap');
   });
 });
+
+// Issue #77: when llms.txt links use a .md/.mdx (or .html.md) suffix,
+// normalizePageUrl rewrites them to their HTML form for dedup against sitemap
+// entries. The original .md URL must still be carried alongside so that
+// markdown-availability checks can test the URL the site explicitly published.
+describe('originalMdUrls (issue #77)', () => {
+  it('preserves originalMdUrl for .html.md links from llms.txt', async () => {
+    const ctx = createContext('http://test77a.local', { requestDelay: 0 });
+    const content = `# Docs
+> Summary
+## Links
+- [Auth](http://test77a.local/docs/auth/index.html.md): Auth guide
+`;
+    const discovered: DiscoveredFile[] = [
+      { url: 'http://test77a.local/llms.txt', content, status: 200, redirected: false },
+    ];
+    ctx.previousResults.set('llms-txt-exists', {
+      id: 'llms-txt-exists',
+      category: 'content-discoverability',
+      status: 'pass',
+      message: 'Found',
+      details: { discoveredFiles: discovered },
+    });
+    mockSitemapNotFound(server, 'http://test77a.local');
+
+    const result = await discoverAndSamplePages(ctx);
+    expect(result.urls).toContain('http://test77a.local/docs/auth/index.html');
+    expect(result.originalMdUrls).toBeDefined();
+    expect(result.originalMdUrls!['http://test77a.local/docs/auth/index.html']).toBe(
+      'http://test77a.local/docs/auth/index.html.md',
+    );
+  });
+
+  it('preserves originalMdUrl for .mdx links from llms.txt', async () => {
+    const ctx = createContext('http://test77b.local', { requestDelay: 0 });
+    const content = `# Docs
+> Summary
+## Links
+- [Guide](http://test77b.local/docs/guide.mdx): Guide
+`;
+    const discovered: DiscoveredFile[] = [
+      { url: 'http://test77b.local/llms.txt', content, status: 200, redirected: false },
+    ];
+    ctx.previousResults.set('llms-txt-exists', {
+      id: 'llms-txt-exists',
+      category: 'content-discoverability',
+      status: 'pass',
+      message: 'Found',
+      details: { discoveredFiles: discovered },
+    });
+    mockSitemapNotFound(server, 'http://test77b.local');
+
+    const result = await discoverAndSamplePages(ctx);
+    expect(result.urls).toContain('http://test77b.local/docs/guide');
+    expect(result.originalMdUrls!['http://test77b.local/docs/guide']).toBe(
+      'http://test77b.local/docs/guide.mdx',
+    );
+  });
+
+  it('preserves originalMdUrl for plain .md links from llms.txt', async () => {
+    const ctx = createContext('http://test77c.local', { requestDelay: 0 });
+    const content = `# Docs
+> Summary
+## Links
+- [Auth](http://test77c.local/docs/auth.md): Auth
+`;
+    const discovered: DiscoveredFile[] = [
+      { url: 'http://test77c.local/llms.txt', content, status: 200, redirected: false },
+    ];
+    ctx.previousResults.set('llms-txt-exists', {
+      id: 'llms-txt-exists',
+      category: 'content-discoverability',
+      status: 'pass',
+      message: 'Found',
+      details: { discoveredFiles: discovered },
+    });
+    mockSitemapNotFound(server, 'http://test77c.local');
+
+    const result = await discoverAndSamplePages(ctx);
+    expect(result.urls).toContain('http://test77c.local/docs/auth');
+    expect(result.originalMdUrls!['http://test77c.local/docs/auth']).toBe(
+      'http://test77c.local/docs/auth.md',
+    );
+  });
+
+  it('does not populate originalMdUrls for sitemap-only discovery', async () => {
+    const ctx = createContext('http://test77d.local', { requestDelay: 0 });
+    ctx.previousResults.set('llms-txt-exists', {
+      id: 'llms-txt-exists',
+      category: 'content-discoverability',
+      status: 'fail',
+      message: 'No llms.txt',
+      details: { discoveredFiles: [] },
+    });
+    server.use(
+      http.get('http://test77d.local/robots.txt', () => new HttpResponse('', { status: 404 })),
+      http.get(
+        'http://test77d.local/sitemap.xml',
+        () =>
+          new HttpResponse(
+            `<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <url><loc>http://test77d.local/docs/auth</loc></url>
+  <url><loc>http://test77d.local/docs/guide</loc></url>
+</urlset>`,
+            { status: 200, headers: { 'Content-Type': 'application/xml' } },
+          ),
+      ),
+    );
+
+    const result = await discoverAndSamplePages(ctx);
+    expect(result.urls).toContain('http://test77d.local/docs/auth');
+    // Sitemap URLs are already HTML, so there is no .md original to preserve:
+    // the map may be undefined or empty. Same `?? {}` idiom as the sampling
+    // test below, and a failure prints the unexpected keys instead of `false`.
+    expect(Object.keys(result.originalMdUrls ?? {})).toHaveLength(0);
+  });
+
+  it('keeps the .md form when llms.txt has both .md and HTML for the same page', async () => {
+    const ctx = createContext('http://test77e.local', { requestDelay: 0 });
+    const content = `# Docs
+> Summary
+## Links
+- [Auth](http://test77e.local/docs/auth.md): Auth markdown
+- [Auth HTML](http://test77e.local/docs/auth): Auth html
+`;
+    const discovered: DiscoveredFile[] = [
+      { url: 'http://test77e.local/llms.txt', content, status: 200, redirected: false },
+    ];
+    ctx.previousResults.set('llms-txt-exists', {
+      id: 'llms-txt-exists',
+      category: 'content-discoverability',
+      status: 'pass',
+      message: 'Found',
+      details: { discoveredFiles: discovered },
+    });
+    mockSitemapNotFound(server, 'http://test77e.local');
+
+    const result = await discoverAndSamplePages(ctx);
+    // Both entries collapse to the HTML URL after normalization.
+    const matching = result.urls.filter((u) => u === 'http://test77e.local/docs/auth');
+    expect(matching).toHaveLength(1);
+    expect(result.originalMdUrls!['http://test77e.local/docs/auth']).toBe(
+      'http://test77e.local/docs/auth.md',
+    );
+  });
+
+  it('filters originalMdUrls to the sampled subset', async () => {
+    const links = Array.from(
+      { length: 5 },
+      (_, i) => `- [P${i}](http://test77f.local/docs/p${i}/index.html.md): Page ${i}`,
+    ).join('\n');
+    const content = `# Docs\n> Summary\n## Links\n${links}\n`;
+    const ctx = createContext('http://test77f.local', { requestDelay: 0, maxLinksToTest: 2 });
+    const discovered: DiscoveredFile[] = [
+      { url: 'http://test77f.local/llms.txt', content, status: 200, redirected: false },
+    ];
+    ctx.previousResults.set('llms-txt-exists', {
+      id: 'llms-txt-exists',
+      category: 'content-discoverability',
+      status: 'pass',
+      message: 'Found',
+      details: { discoveredFiles: discovered },
+    });
+    mockSitemapNotFound(server, 'http://test77f.local');
+
+    const result = await discoverAndSamplePages(ctx);
+    expect(result.urls).toHaveLength(2);
+    expect(result.sampled).toBe(true);
+    // Only the sampled URLs should have entries in originalMdUrls
+    expect(Object.keys(result.originalMdUrls ?? {})).toHaveLength(2);
+    for (const sampledUrl of result.urls) {
+      expect(result.originalMdUrls![sampledUrl]).toMatch(/\.html\.md$/);
+    }
+  });
+
+  it('propagates originalMdUrl through aggregate (.txt) walking', async () => {
+    const ctx = createContext('http://test77g.local', { requestDelay: 0 });
+    const llmsContent = `# Docs
+> Summary
+## Links
+- [Aggregate](http://test77g.local/docs/all.txt): All pages
+`;
+    const aggregateContent = `# All
+- [Auth](http://test77g.local/docs/auth/index.html.md): Auth guide
+`;
+    server.use(
+      http.get(
+        'http://test77g.local/docs/all.txt',
+        () =>
+          new HttpResponse(aggregateContent, {
+            status: 200,
+            headers: { 'Content-Type': 'text/plain' },
+          }),
+      ),
+    );
+    const discovered: DiscoveredFile[] = [
+      {
+        url: 'http://test77g.local/llms.txt',
+        content: llmsContent,
+        status: 200,
+        redirected: false,
+      },
+    ];
+    ctx.previousResults.set('llms-txt-exists', {
+      id: 'llms-txt-exists',
+      category: 'content-discoverability',
+      status: 'pass',
+      message: 'Found',
+      details: { discoveredFiles: discovered },
+    });
+    mockSitemapNotFound(server, 'http://test77g.local');
+
+    const result = await discoverAndSamplePages(ctx);
+    expect(result.urls).toContain('http://test77g.local/docs/auth/index.html');
+    expect(result.originalMdUrls!['http://test77g.local/docs/auth/index.html']).toBe(
+      'http://test77g.local/docs/auth/index.html.md',
+    );
+  });
+});
diff --git a/test/unit/helpers/to-md-urls.test.ts b/test/unit/helpers/to-md-urls.test.ts
index 7c46547..0364c84 100644
--- a/test/unit/helpers/to-md-urls.test.ts
+++ b/test/unit/helpers/to-md-urls.test.ts
@@ -68,6 +68,30 @@ describe('toMdUrls', () => {
   it('returns empty array for .xml files', () => {
     expect(toMdUrls('https://example.com/sitemap.xml')).toEqual([]);
   });
+
+  //
Regression: issue #77 — Plaid-style /page/index.html.md URLs from llms.txt
+  it('returns .html.md URL as-is (already a markdown URL)', () => {
+    expect(toMdUrls('https://example.com/docs/auth/index.html.md')).toEqual([
+      'https://example.com/docs/auth/index.html.md',
+    ]);
+  });
+
+  // Invariant for the issue #77 fix: the parent-clean .md candidate
+  // (e.g. /docs/auth/index.html → /docs/auth.md) is generated only inside
+  // markdown-url-support, gated by an originalMdUrl signal from llms.txt.
+  // Adding it to toMdUrls() would re-introduce the false-positive class where
+  // llms-txt-directive-md and llms-txt-links-markdown pass via unrelated
+  // sibling .md files. The two tests below pin this for both index.html and
+  // index.htm; if you need the parent-clean form, add it at the call site.
+  it('does NOT generate /foo.md candidate from /foo/index.html (issue #77 isolation)', () => {
+    const result = toMdUrls('https://example.com/docs/auth/index.html');
+    expect(result).not.toContain('https://example.com/docs/auth.md');
+  });
+
+  it('does NOT generate /foo.md candidate from /foo/index.htm (issue #77 isolation)', () => {
+    const result = toMdUrls('https://example.com/docs/auth/index.htm');
+    expect(result).not.toContain('https://example.com/docs/auth.md');
+  });
 });
 
 describe('toHtmlUrl', () => {