From 4edcc3ea0c732b25caca57646eb0e5f22a59d4a0 Mon Sep 17 00:00:00 2001
From: dacharyc <dc@dacharycarey.com>
Date: Fri, 1 May 2026 22:29:30 -0400
Subject: [PATCH] fix: preserve original .md URL from llms.txt for
 markdown-availability checks

When llms.txt linked to a .md/.mdx URL (notably Plaid's /index.html.md
form), normalizePageUrl rewrote it to its HTML equivalent for sitemap
dedup, then toMdUrls regenerated candidates from the HTML form, none
of which matched the URL the site actually published. As a result,
markdown-url-support scored 0% on otherwise-compliant sites.

Carry the original .md URL alongside the normalized URL through
discovery as originalMdUrls. markdown-url-support tries it first,
then falls through to toMdUrls() candidates, then a parent-clean
fallback (gated to /index.html.md sources). toMdUrls itself is
unchanged so other checks (llms-txt-directive-md,
llms-txt-links-markdown) cannot regress to the prior false-positive
class.

Closes #77
---
 .../markdown-url-support.ts                   |  75 +++-
 src/helpers/get-page-urls.ts                  | 180 ++++++--
 .../unit/checks/llms-txt-directive-md.test.ts |  42 ++
 .../checks/llms-txt-links-markdown.test.ts    |  39 ++
 test/unit/checks/markdown-url-support.test.ts | 410 ++++++++++++++++++
 test/unit/helpers/get-page-urls.test.ts       | 219 ++++++++++
 test/unit/helpers/to-md-urls.test.ts          |  24 +
 7 files changed, 947 insertions(+), 42 deletions(-)

diff --git a/src/checks/markdown-availability/markdown-url-support.ts b/src/checks/markdown-availability/markdown-url-support.ts
index da3a3f2..106ab39 100644
--- a/src/checks/markdown-availability/markdown-url-support.ts
+++ b/src/checks/markdown-availability/markdown-url-support.ts
@@ -12,6 +12,8 @@ interface PageResult {
   alreadyMd?: boolean;
   status: number;
   error?: string;
+  /** True when the original llms.txt-published URL served the content. */
+  originalUrlServed?: boolean;
 }
 
 /**
@@ -19,12 +21,16 @@ interface PageResult {
  * based on which candidate succeeded in previous results.
  * Returns 'index' if `page/index.md` wins, 'direct' if `page.md` wins, or null if
  * there's no clear winner yet.
+ *
+ * Wins served via the `originalMdUrl` from llms.txt are NOT counted: those
+ * URLs reflect the site's published form, not a `toMdUrls()` candidate, and
+ * counting them would skew the heuristic for unrelated pages.
  */
 function detectPreferredMdForm(results: PageResult[]): 'direct' | 'index' | null {
   let directWins = 0;
   let indexWins = 0;
   for (const r of results) {
-    if (!r.supported || !r.mdUrl) continue;
+    if (!r.supported || !r.mdUrl || r.originalUrlServed) continue;
     if (r.mdUrl.endsWith('/index.md') || r.mdUrl.endsWith('/index.mdx')) {
       indexWins++;
     } else {
@@ -38,6 +44,33 @@ function detectPreferredMdForm(results: PageResult[]): 'direct' | 'index' | null
   return null;
 }
 
+/**
+ * Issue #77: when llms.txt published a `/foo/index.html.md` URL but it 404s
+ * (and the regenerated `/foo/index.md` and `/foo/index.html/index.md` also
+ * 404), some sites still serve the markdown at the parent-clean path
+ * `/foo.md` (Plaid's pattern). This is gated to URLs whose llms.txt original
+ * matched `/index.html?\.md$` — strong evidence the site uses this convention.
+ *
+ * Do NOT move this into `toMdUrls()`. Other checks (`llms-txt-directive-md`,
+ * `llms-txt-links-markdown`, etc.) call `toMdUrls()` directly and would
+ * regress to the old false-positive class where unrelated sibling .md files
+ * pass validation. See issue #77 discussion.
+ */
+function deriveParentCleanMd(pageUrl: string, originalMdUrl: string): string | null {
+  if (!/\/index\.html?\.md$/i.test(new URL(originalMdUrl).pathname)) return null;
+  try {
+    const u = new URL(pageUrl);
+    const pathname = u.pathname.replace(/\/$/, '');
+    // Strip /index.html or /index.htm from the page URL and append .md
+    const stripped = pathname.replace(/\/index\.html?$/i, '');
+    if (!stripped || stripped === pathname) return null;
+    u.pathname = `${stripped}.md`;
+    return u.toString();
+  } catch {
+    return null;
+  }
+}
+
 /**
  * Reorder toMdUrls() candidates based on the detected site preference.
  * 'index' puts `page/index.md` first; 'direct' keeps the default order (`page.md` first).
@@ -58,6 +91,7 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
     totalPages,
     sampled: wasSampled,
     warnings,
+    originalMdUrls,
   } = await discoverAndSamplePages(ctx);
 
   const results: PageResult[] = [];
@@ -68,15 +102,35 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
     const batch = pageUrls.slice(i, i + concurrency);
     const batchResults = await Promise.all(
       batch.map(async (url): Promise<PageResult> => {
-        const candidates = toMdUrls(url);
+        const baseCandidates = toMdUrls(url);
         // Non-markdown file types (e.g. .json, .xml) have no .md equivalent — skip them
-        if (candidates.length === 0) {
+        if (baseCandidates.length === 0) {
           return { url, mdUrl: url, supported: false, skipped: true, status: 0 };
         }
         const alreadyMd = /\.mdx?$/i.test(new URL(url).pathname);
-        const ordered = orderCandidates(candidates, mdFormPreference);
+        const original = originalMdUrls?.[url];
+        const parentClean = original ? deriveParentCleanMd(url, original) : null;
+
+        // Build candidate list:
+        //   1. originalMdUrl (the URL llms.txt published) — first, when present.
+        //   2. toMdUrls() candidates, reordered by detected site preference.
+        //   3. parent-clean fallback (issue #77) — last, only when llms.txt
+        //      published a /foo/index.html.md form. Tried only if 1+2 fail.
+        const ordered = orderCandidates(baseCandidates, mdFormPreference);
+        const candidateList: string[] = [];
+        const seen = new Set<string>();
+        const addCandidate = (c: string | null | undefined) => {
+          if (c && !seen.has(c)) {
+            seen.add(c);
+            candidateList.push(c);
+          }
+        };
+        addCandidate(original);
+        for (const c of ordered) addCandidate(c);
+        addCandidate(parentClean);
+
         let lastError: string | undefined;
-        for (const mdUrl of ordered) {
+        for (const mdUrl of candidateList) {
           try {
             const response = await ctx.http.fetch(mdUrl);
             const body = await response.text();
@@ -90,7 +144,14 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
                 url,
                 markdown: { content: body, source: 'md-url' },
               });
-              return { url, mdUrl, supported: true, alreadyMd, status: response.status };
+              return {
+                url,
+                mdUrl,
+                supported: true,
+                alreadyMd,
+                status: response.status,
+                originalUrlServed: mdUrl === original,
+              };
             }
             lastError = undefined; // Got a response, not a fetch error
           } catch (err) {
@@ -99,7 +160,7 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
         }
         return {
           url,
-          mdUrl: ordered[0],
+          mdUrl: candidateList[0],
           supported: false,
           alreadyMd,
           status: 0,
diff --git a/src/helpers/get-page-urls.ts b/src/helpers/get-page-urls.ts
index 1b6d42d..30430b3 100644
--- a/src/helpers/get-page-urls.ts
+++ b/src/helpers/get-page-urls.ts
@@ -43,45 +43,100 @@ export async function getUrlsFromCachedLlmsTxt(ctx: CheckContext): Promise<strin
   return result.pageUrls;
 }
 
+/**
+ * Variant of `getUrlsFromCachedLlmsTxt` that also returns the originalMdUrl
+ * mapping for URLs whose llms.txt form ended in `.md`/`.mdx` (issue #77).
+ */
+export async function getUrlsFromCachedLlmsTxtWithOriginals(
+  ctx: CheckContext,
+): Promise<{ pageUrls: string[]; originalMdUrls: Record<string, string> }> {
+  const existsResult = ctx.previousResults.get('llms-txt-exists');
+  const discovered = getLlmsTxtFilesForAnalysis(existsResult);
+
+  const entries = extractLinksFromLlmsTxtFiles(discovered);
+  const result = await walkAggregateLinksWithOriginals(ctx, entries);
+  return {
+    pageUrls: result.pageUrls.map((p) => p.url),
+    originalMdUrls: collectOriginalMdUrls(result.pageUrls),
+  };
+}
+
 export async function getUrlsFromCachedLlmsTxtWithOmitted(
   ctx: CheckContext,
 ): Promise<AggregateWalkResult> {
   const existsResult = ctx.previousResults.get('llms-txt-exists');
   const discovered = getLlmsTxtFilesForAnalysis(existsResult);
 
-  const urls = extractLinksFromLlmsTxtFiles(discovered);
-  return walkAggregateLinks(ctx, urls);
+  const entries = extractLinksFromLlmsTxtFiles(discovered);
+  const result = await walkAggregateLinksWithOriginals(ctx, entries);
+  return {
+    pageUrls: result.pageUrls.map((p) => p.url),
+    omittedTxtUrls: result.omittedTxtUrls,
+  };
+}
+
+function collectOriginalMdUrls(pages: DiscoveredPageUrl[]): Record<string, string> {
+  const map: Record<string, string> = {};
+  for (const p of pages) {
+    if (p.originalMdUrl && !(p.url in map)) {
+      map[p.url] = p.originalMdUrl;
+    }
+  }
+  return map;
 }
 
 /**
  * Normalize a discovered page URL: convert .md/.mdx URLs to their HTML
  * equivalent so that llms.txt entries like `/docs/guide/index.md` deduplicate
- * against sitemap entries like `/docs/guide/`. Markdown-specific checks are
- * unaffected because they derive .md candidates from HTML URLs via toMdUrls().
+ * against sitemap entries like `/docs/guide/`. The original .md URL is
+ * returned alongside as `originalMdUrl` so that markdown-availability checks
+ * can still test the URL the site explicitly published (issue #77).
  */
-function normalizePageUrl(url: string): string {
-  return isMdUrl(url) ? toHtmlUrl(url) : url;
+function normalizePageUrl(url: string): { url: string; originalMdUrl?: string } {
+  if (isMdUrl(url)) {
+    return { url: toHtmlUrl(url), originalMdUrl: url };
+  }
+  return { url };
+}
+
+interface DiscoveredPageUrl {
+  url: string;
+  originalMdUrl?: string;
 }
 
-function extractLinksFromLlmsTxtFiles(files: DiscoveredFile[]): string[] {
-  const urls = new Set<string>();
+function extractLinksFromLlmsTxtFiles(files: DiscoveredFile[]): DiscoveredPageUrl[] {
+  // Map normalized URL → originalMdUrl. A given page may appear in llms.txt
+  // both as `/page.md` and `/page` (HTML); we keep the .md form so downstream
+  // markdown checks have a known-good URL to test.
+  const seen = new Map<string, string | undefined>();
+
+  function record(rawUrl: string) {
+    const { url, originalMdUrl } = normalizePageUrl(rawUrl);
+    const existing = seen.get(url);
+    if (existing === undefined && originalMdUrl) {
+      seen.set(url, originalMdUrl);
+    } else if (!seen.has(url)) {
+      seen.set(url, originalMdUrl);
+    }
+  }
+
   for (const file of files) {
     const links = extractMarkdownLinks(file.content);
     for (const link of links) {
       if (link.url.startsWith('http://') || link.url.startsWith('https://')) {
-        urls.add(normalizePageUrl(link.url));
+        record(link.url);
       } else if (link.url.startsWith('/')) {
         // Resolve root-relative URLs against the source file's origin
         try {
           const base = new URL(file.url);
-          urls.add(normalizePageUrl(new URL(link.url, base.origin).toString()));
+          record(new URL(link.url, base.origin).toString());
         } catch {
           // Skip malformed URLs
         }
       }
     }
   }
-  return Array.from(urls);
+  return Array.from(seen, ([url, originalMdUrl]) => ({ url, originalMdUrl }));
 }
 
 /**
@@ -98,29 +153,32 @@ export interface AggregateWalkResult {
   omittedTxtUrls: string[];
 }
 
-async function walkAggregateLinks(ctx: CheckContext, urls: string[]): Promise<AggregateWalkResult> {
-  const pageUrls: string[] = [];
+async function walkAggregateLinksWithOriginals(
+  ctx: CheckContext,
+  entries: DiscoveredPageUrl[],
+): Promise<{ pageUrls: DiscoveredPageUrl[]; omittedTxtUrls: string[] }> {
+  const pageUrls: DiscoveredPageUrl[] = [];
   const aggregateUrls: string[] = [];
   const omittedTxtUrls: string[] = [];
 
   const siteOrigin = ctx.effectiveOrigin ?? ctx.origin;
 
-  for (const url of urls) {
+  for (const entry of entries) {
     try {
-      const parsed = new URL(url);
+      const parsed = new URL(entry.url);
       if (/\.txt$/i.test(parsed.pathname)) {
         // .txt files are either aggregate indexes to walk (same origin)
         // or external resources to skip — never page URLs themselves
         if (parsed.origin === ctx.origin || parsed.origin === siteOrigin) {
-          aggregateUrls.push(url);
+          aggregateUrls.push(entry.url);
         }
       } else if (parsed.origin === ctx.origin || parsed.origin === siteOrigin) {
         // Only include same-origin page URLs; cross-origin links are
         // external resources the site owner doesn't control.
-        pageUrls.push(normalizePageUrl(url));
+        pageUrls.push(entry);
       }
     } catch {
-      pageUrls.push(normalizePageUrl(url));
+      pageUrls.push(entry);
     }
   }
 
@@ -144,19 +202,19 @@ async function walkAggregateLinks(ctx: CheckContext, urls: string[]): Promise<Ag
         status: response.status,
         redirected: response.redirected,
       };
-      const subUrls = extractLinksFromLlmsTxtFiles([subFile]);
+      const subEntries = extractLinksFromLlmsTxtFiles([subFile]);
 
-      for (const subUrl of subUrls) {
+      for (const subEntry of subEntries) {
         try {
-          const parsed = new URL(subUrl);
+          const parsed = new URL(subEntry.url);
           const isSameOrigin = parsed.origin === ctx.origin || parsed.origin === siteOrigin;
           if (!isSameOrigin) continue;
 
           if (/\.txt$/i.test(parsed.pathname)) {
             // Depth-1 .txt link: record as omitted rather than descending
-            omittedTxtUrls.push(subUrl);
-          } else if (!isNonPageUrl(subUrl)) {
-            pageUrls.push(subUrl);
+            omittedTxtUrls.push(subEntry.url);
+          } else if (!isNonPageUrl(subEntry.url)) {
+            pageUrls.push(subEntry);
           }
         } catch {
           // Skip malformed URLs
@@ -177,7 +235,9 @@ async function walkAggregateLinks(ctx: CheckContext, urls: string[]): Promise<Ag
  * Mirrors the canonical-selection logic in `llms-txt-exists` so that the same
  * single source of truth drives sampling whether or not `llms-txt-exists` ran.
  */
-async function fetchLlmsTxtUrls(ctx: CheckContext): Promise<string[]> {
+async function fetchLlmsTxtUrls(
+  ctx: CheckContext,
+): Promise<{ pageUrls: string[]; originalMdUrls: Record<string, string> }> {
   const explicitUrl = ctx.options.llmsTxtUrl;
   const candidates = explicitUrl
     ? [explicitUrl]
@@ -213,9 +273,12 @@ async function fetchLlmsTxtUrls(ctx: CheckContext): Promise<string[]> {
 
   const canonical = selectCanonicalLlmsTxt(discovered, ctx.baseUrl);
   const filesForAnalysis = canonical ? [canonical] : [];
-  const urls = extractLinksFromLlmsTxtFiles(filesForAnalysis);
-  const result = await walkAggregateLinks(ctx, urls);
-  return result.pageUrls;
+  const entries = extractLinksFromLlmsTxtFiles(filesForAnalysis);
+  const result = await walkAggregateLinksWithOriginals(ctx, entries);
+  return {
+    pageUrls: result.pageUrls.map((p) => p.url),
+    originalMdUrls: collectOriginalMdUrls(result.pageUrls),
+  };
 }
 
 /**
@@ -285,6 +348,13 @@ export interface PageUrlResult {
   warnings: string[];
   /** Which discovery methods contributed to the final URL set. */
   sources: DiscoverySource[];
+  /**
+   * Map of normalized URL → the original .md/.mdx URL the llms.txt published.
+   * Only populated for URLs discovered via llms.txt; sitemap URLs contribute none.
+   * Markdown-availability checks use this to test the site-published markdown
+   * URL alongside the conventional candidates from `toMdUrls()` (issue #77).
+   */
+  originalMdUrls?: Record<string, string>;
 }
 
 function isGzipped(url: string): boolean {
@@ -880,33 +950,54 @@ export async function getPageUrls(ctx: CheckContext): Promise<PageUrlResult> {
     return deduplicateVersionedUrls(localeFiltered, version);
   }
 
+  /** Filter the originalMdUrls map to a subset of URLs. */
+  function filterOriginalMdUrls(
+    map: Record<string, string>,
+    keep: string[],
+  ): Record<string, string> | undefined {
+    const keepSet = new Set(keep);
+    const out: Record<string, string> = {};
+    for (const [url, original] of Object.entries(map)) {
+      if (keepSet.has(url)) out[url] = original;
+    }
+    return Object.keys(out).length > 0 ? out : undefined;
+  }
+
   // 1. Try llms.txt links from cached results (if llms-txt-exists ran)
-  const cachedUrls = await getUrlsFromCachedLlmsTxt(ctx);
-  let llmsTxtUrls = refineUrls(filterByPathPrefix(cachedUrls, filterBase));
+  const cached = await getUrlsFromCachedLlmsTxtWithOriginals(ctx);
+  let llmsTxtUrls = refineUrls(filterByPathPrefix(cached.pageUrls, filterBase));
+  let originalMdUrls = cached.originalMdUrls;
 
   // 2. Try fetching llms.txt directly (standalone mode, llms-txt-exists didn't run)
   if (llmsTxtUrls.length === 0 && !ctx.previousResults.has('llms-txt-exists')) {
-    const fetchedUrls = await fetchLlmsTxtUrls(ctx);
-    llmsTxtUrls = refineUrls(filterByPathPrefix(fetchedUrls, filterBase));
+    const fetched = await fetchLlmsTxtUrls(ctx);
+    llmsTxtUrls = refineUrls(filterByPathPrefix(fetched.pageUrls, filterBase));
+    originalMdUrls = fetched.originalMdUrls;
   }
 
   if (llmsTxtUrls.length > 0) {
     sources.push('llms-txt');
+    const filteredOriginals = filterOriginalMdUrls(originalMdUrls, llmsTxtUrls);
 
     // If llms.txt meets the requested sample size, no need for sitemap
     if (llmsTxtUrls.length >= ctx.options.maxLinksToTest) {
-      return { urls: llmsTxtUrls, warnings, sources };
+      return { urls: llmsTxtUrls, warnings, sources, originalMdUrls: filteredOriginals };
     }
 
     // llms.txt is thin — try sitemap to fill the gap
     const sitemapUrls = await getUrlsFromSitemap(ctx, warnings, { pathFilterBase: filterBase });
     if (sitemapUrls.length > 0) {
       sources.push('sitemap');
-      return { urls: mergeUrlSets(llmsTxtUrls, sitemapUrls), warnings, sources };
+      return {
+        urls: mergeUrlSets(llmsTxtUrls, sitemapUrls),
+        warnings,
+        sources,
+        originalMdUrls: filteredOriginals,
+      };
     }
 
     // Sitemap had nothing; return llms.txt URLs alone
-    return { urls: llmsTxtUrls, warnings, sources };
+    return { urls: llmsTxtUrls, warnings, sources, originalMdUrls: filteredOriginals };
   }
 
   // 3. Try sitemap (path, locale, and version filtering applied inside)
@@ -930,6 +1021,12 @@ export interface SampledPages {
   urlTags?: Record<string, string>;
   /** Which discovery methods contributed to the page URL set. */
   sources?: DiscoverySource[];
+  /**
+   * Map of sampled URL → original .md/.mdx URL from llms.txt (issue #77).
+   * `markdown-url-support` uses this to test the URL the site explicitly
+   * published before falling back to conventional `toMdUrls()` candidates.
+   */
+  originalMdUrls?: Record<string, string>;
 }
 
 /**
@@ -1023,12 +1120,25 @@ export async function discoverAndSamplePages(ctx: CheckContext): Promise<Sampled
     }
   }
 
+  // Filter originalMdUrls to the sampled subset so downstream checks
+  // don't see entries for URLs that were filtered out.
+  let originalMdUrls: Record<string, string> | undefined;
+  if (discovery.originalMdUrls) {
+    const sampledSet = new Set(urls);
+    const filtered: Record<string, string> = {};
+    for (const [url, original] of Object.entries(discovery.originalMdUrls)) {
+      if (sampledSet.has(url)) filtered[url] = original;
+    }
+    if (Object.keys(filtered).length > 0) originalMdUrls = filtered;
+  }
+
   ctx._sampledPages = {
     urls,
     totalPages,
     sampled,
     warnings: discovery.warnings,
     sources: discovery.sources,
+    originalMdUrls,
   };
   return ctx._sampledPages;
 }
diff --git a/test/unit/checks/llms-txt-directive-md.test.ts b/test/unit/checks/llms-txt-directive-md.test.ts
index e0d3f7c..87c16a5 100644
--- a/test/unit/checks/llms-txt-directive-md.test.ts
+++ b/test/unit/checks/llms-txt-directive-md.test.ts
@@ -380,4 +380,46 @@ describe('llms-txt-directive-md', () => {
     expect(result.status).toBe('pass');
     expect(result.details?.foundCount).toBe(1);
   });
+
+  // Issue #77 isolation: parent-clean candidate generation lives in
+  // markdown-url-support only. If it ever leaks into toMdUrls, this directive
+  // check would falsely pass when an unrelated /auth.md happens to contain
+  // the directive — a known false-positive class. Guard against that.
+  it('does not validate directive via parent-clean .md candidate (issue #77 isolation)', async () => {
+    const requestLog: string[] = [];
+    server.use(
+      // The page URL would be /docs/auth/index.html (HTML form). Conventional
+      // .md candidates 404…
+      http.get('http://test.local/docs/auth/index.md', () => {
+        requestLog.push('/docs/auth/index.md');
+        return new HttpResponse('Not Found', { status: 404 });
+      }),
+      http.get('http://test.local/docs/auth/index.html/index.md', () => {
+        requestLog.push('/docs/auth/index.html/index.md');
+        return new HttpResponse('Not Found', { status: 404 });
+      }),
+      // …but a sibling /docs/auth.md exists with the directive. The
+      // directive check must NOT request this URL.
+      http.get('http://test.local/docs/auth.md', () => {
+        requestLog.push('/docs/auth.md');
+        return new HttpResponse(
+          '> For AI agents: see [documentation index](/llms.txt) for navigation.\n\n# Wrong page',
+          { status: 200, headers: { 'Content-Type': 'text/markdown' } },
+        );
+      }),
+      // Content negotiation also fails so it can't pass that way
+      http.get('http://test.local/docs/auth/index.html', () => {
+        requestLog.push('/docs/auth/index.html [accept]');
+        return new HttpResponse('<!DOCTYPE html><html></html>', {
+          status: 200,
+          headers: { 'Content-Type': 'text/html' },
+        });
+      }),
+    );
+
+    const ctx = makeCtx(llms('/docs/auth/index.html.md'));
+    const result = await check.run(ctx);
+    expect(result.status).not.toBe('pass');
+    expect(requestLog).not.toContain('/docs/auth.md');
+  });
 });
diff --git a/test/unit/checks/llms-txt-links-markdown.test.ts b/test/unit/checks/llms-txt-links-markdown.test.ts
index f600e16..12e84d4 100644
--- a/test/unit/checks/llms-txt-links-markdown.test.ts
+++ b/test/unit/checks/llms-txt-links-markdown.test.ts
@@ -328,4 +328,43 @@ Just text, no links here.
     expect(result.status).toBe('warn');
     expect(result.details?.mdVariantsAvailable).toBe(1);
   });
+
+  // Issue #77 isolation: parent-clean candidate generation lives in
+  // markdown-url-support only. If it ever leaks into toMdUrls, this links
+  // check would falsely validate a link to /docs/auth/index.html via an
+  // unrelated /docs/auth.md sibling. Guard against that.
+  it('does not validate link via parent-clean .md candidate (issue #77 isolation)', async () => {
+    const requestLog: string[] = [];
+    server.use(
+      http.head('http://test.local/docs/auth/index.html', () => {
+        requestLog.push('HEAD /docs/auth/index.html');
+        return new HttpResponse(null, { status: 200 });
+      }),
+      http.head('http://test.local/docs/auth/index.md', () => {
+        requestLog.push('HEAD /docs/auth/index.md');
+        return new HttpResponse(null, { status: 404 });
+      }),
+      http.head('http://test.local/docs/auth/index.html/index.md', () => {
+        requestLog.push('HEAD /docs/auth/index.html/index.md');
+        return new HttpResponse(null, { status: 404 });
+      }),
+      // The sibling /docs/auth.md exists but must not be requested.
+      http.head('http://test.local/docs/auth.md', () => {
+        requestLog.push('HEAD /docs/auth.md');
+        return new HttpResponse(null, { status: 200 });
+      }),
+      http.get('http://test.local/docs/auth.md', () => {
+        requestLog.push('GET /docs/auth.md');
+        return new HttpResponse('# unrelated', {
+          status: 200,
+          headers: { 'Content-Type': 'text/markdown' },
+        });
+      }),
+    );
+
+    const content = `# Test\n> Summary\n## Links\n- [Auth](http://test.local/docs/auth/index.html): Auth\n`;
+    await check.run(makeCtx(content));
+    expect(requestLog).not.toContain('HEAD /docs/auth.md');
+    expect(requestLog).not.toContain('GET /docs/auth.md');
+  });
 });
diff --git a/test/unit/checks/markdown-url-support.test.ts b/test/unit/checks/markdown-url-support.test.ts
index e39d35b..d4c2f4f 100644
--- a/test/unit/checks/markdown-url-support.test.ts
+++ b/test/unit/checks/markdown-url-support.test.ts
@@ -410,6 +410,85 @@ describe('markdown-url-support', () => {
     expect(cached?.markdown?.source).toBe('md-url');
   });
 
+  // Regression: issue #77 — sites whose llms.txt links use a `.html.md`
+  // suffix (e.g. Plaid: /docs/auth/index.html.md) lose markdown discovery
+  // because normalizePageUrl strips the .md, and the regenerated candidates
+  // (/docs/auth/index.md, /docs/auth/index.html/index.md) miss the real file.
+  it('detects markdown when llms.txt uses .html.md suffix (issue #77)', async () => {
+    const md = '# Auth\n\nAuth documentation with [a link](http://example.com).';
+    server.use(
+      http.get(
+        'http://test.local/docs/auth/index.html.md',
+        () =>
+          new HttpResponse(md, {
+            status: 200,
+            headers: { 'Content-Type': 'text/markdown' },
+          }),
+      ),
+      // Every fallback candidate (toMdUrls() forms and parent-clean) 404s
+      http.get(
+        'http://test.local/docs/auth/index.md',
+        () => new HttpResponse('Not Found', { status: 404 }),
+      ),
+      http.get(
+        'http://test.local/docs/auth.md',
+        () => new HttpResponse('Not Found', { status: 404 }),
+      ),
+      http.get(
+        'http://test.local/docs/auth/index.html/index.md',
+        () => new HttpResponse('Not Found', { status: 404 }),
+      ),
+    );
+
+    const content = `# Docs
+> Summary
+## Links
+- [Auth](http://test.local/docs/auth/index.html.md): Auth guide
+`;
+    const result = await check.run(makeCtx({ content }));
+    expect(result.status).toBe('pass');
+    expect(result.details?.mdSupported).toBe(1);
+  });
+
+  // Regression: issue #77, second variant — Plaid actually does serve
+  // /docs/auth.md (clean path + .md) in addition to /docs/auth/index.html.md.
+  // The check should find at least one working markdown form.
+  it('detects markdown via clean-path .md when llms.txt uses .html.md suffix (issue #77)', async () => {
+    const md = '# Auth\n\nClean-path markdown with [a link](http://example.com).';
+    server.use(
+      http.get(
+        'http://test.local/docs/auth.md',
+        () =>
+          new HttpResponse(md, {
+            status: 200,
+            headers: { 'Content-Type': 'text/markdown' },
+          }),
+      ),
+      // The .html.md form 404s in this scenario; only /docs/auth.md works.
+      http.get(
+        'http://test.local/docs/auth/index.html.md',
+        () => new HttpResponse('Not Found', { status: 404 }),
+      ),
+      http.get(
+        'http://test.local/docs/auth/index.md',
+        () => new HttpResponse('Not Found', { status: 404 }),
+      ),
+      http.get(
+        'http://test.local/docs/auth/index.html/index.md',
+        () => new HttpResponse('Not Found', { status: 404 }),
+      ),
+    );
+
+    const content = `# Docs
+> Summary
+## Links
+- [Auth](http://test.local/docs/auth/index.html.md): Auth guide
+`;
+    const result = await check.run(makeCtx({ content }));
+    expect(result.status).toBe('pass');
+    expect(result.details?.mdSupported).toBe(1);
+  });
+
   it('auto-detects page/index.md preference and tries it first in later batches', async () => {
     // 3 pages, all served at page/index.md (not page.md). With concurrency=1,
     // each page is a separate batch, so after page 1+2 the check should
@@ -479,4 +558,335 @@ describe('markdown-url-support', () => {
     expect(requestLog).not.toContain('/docs/c.md');
     expect(requestLog).toContain('/docs/c/index.md');
   });
+
+  // ── Issue #77 coverage: original-URL preservation and parent-clean gating ──
+
+  // Sibling-page trap: both the published /docs/auth/index.html.md and an
+  // unrelated /docs/auth.md answer 200. The published URL must be tried first
+  // so we don't accidentally associate the page with a sibling's content.
+  it('prefers originalMdUrl over parent-clean candidate when both succeed (issue #77)', async () => {
+    const requestLog: string[] = [];
+    const realMd = '# Auth\n\nReal auth doc.';
+    const wrongMd = '# Authentication Section\n\nA different page entirely.';
+    server.use(
+      http.get('http://test.local/docs/auth/index.html.md', () => {
+        requestLog.push('/docs/auth/index.html.md');
+        return new HttpResponse(realMd, {
+          status: 200,
+          headers: { 'Content-Type': 'text/markdown' },
+        });
+      }),
+      http.get('http://test.local/docs/auth.md', () => {
+        requestLog.push('/docs/auth.md');
+        return new HttpResponse(wrongMd, {
+          status: 200,
+          headers: { 'Content-Type': 'text/markdown' },
+        });
+      }),
+    );
+
+    const content = `# Docs
+> Summary
+## Links
+- [Auth](http://test.local/docs/auth/index.html.md): Auth guide
+`;
+    const result = await check.run(makeCtx({ content }));
+    expect(result.status).toBe('pass');
+    const pageResults = result.details?.pageResults as Array<{ mdUrl: string }>;
+    expect(pageResults[0].mdUrl).toBe('http://test.local/docs/auth/index.html.md');
+    expect(requestLog[0]).toBe('/docs/auth/index.html.md');
+    expect(requestLog).not.toContain('/docs/auth.md'); // the sibling must never even be fetched
+  });
+
+  // Without an originalMdUrl signal (sitemap-only discovery), the parent-clean
+  // candidate must NOT be emitted. Otherwise an unrelated /auth.md would
+  // false-positive the check for a /auth/index.html page.
+  it('does not test /foo.md when /foo/index.html came from sitemap (issue #77 isolation)', async () => {
+    const requestLog: string[] = [];
+    server.use(
+      http.get('http://parentclean.local/robots.txt', () => new HttpResponse('', { status: 404 })),
+      http.get(
+        'http://parentclean.local/sitemap.xml',
+        () =>
+          new HttpResponse(
+            `<?xml version="1.0"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+              <url><loc>http://parentclean.local/docs/auth/index.html</loc></url>
+            </urlset>`,
+            { status: 200, headers: { 'Content-Type': 'application/xml' } },
+          ),
+      ),
+      http.get('http://parentclean.local/docs/auth/index.md', () => {
+        requestLog.push('/docs/auth/index.md');
+        return new HttpResponse('Not Found', { status: 404 });
+      }),
+      http.get('http://parentclean.local/docs/auth/index.html/index.md', () => {
+        requestLog.push('/docs/auth/index.html/index.md');
+        return new HttpResponse('Not Found', { status: 404 });
+      }),
+      http.get('http://parentclean.local/docs/auth.md', () => { // unrelated sibling; answers 200 if ever requested
+        requestLog.push('/docs/auth.md');
+        return new HttpResponse('# Some other page\n\nUnrelated content.', {
+          status: 200,
+          headers: { 'Content-Type': 'text/markdown' },
+        });
+      }),
+    );
+
+    const ctx = createContext('http://parentclean.local', { requestDelay: 0 });
+    ctx.previousResults.set('llms-txt-exists', {
+      id: 'llms-txt-exists',
+      category: 'content-discoverability',
+      status: 'fail',
+      message: 'No llms.txt',
+      details: { discoveredFiles: [] }, // no llms.txt files, so no page carries a published .md URL
+    });
+    const result = await check.run(ctx);
+    expect(result.status).toBe('fail'); // the two index-style handlers above both 404
+    expect(requestLog).not.toContain('/docs/auth.md'); // a request here would mean the gate leaked
+  });
+
+  // Plain .md original (e.g. /docs/auth.md from llms.txt). No /index.html
+  // form is involved, so the parent-clean candidate must not fire.
+  it('does not emit parent-clean candidate when originalMdUrl is plain .md (issue #77)', async () => {
+    const requestLog: string[] = [];
+    server.use(
+      http.get('http://test.local/docs/feature.md', () => {
+        requestLog.push('/docs/feature.md');
+        return new HttpResponse('# Feature\n\nThe feature.', {
+          status: 200,
+          headers: { 'Content-Type': 'text/markdown' },
+        });
+      }),
+      http.get('http://test.local/docs.md', () => { // presumably where an ungated parent-clean rewrite would land
+        requestLog.push('/docs.md');
+        return new HttpResponse('# Wrong page\n', {
+          status: 200,
+          headers: { 'Content-Type': 'text/markdown' },
+        });
+      }),
+    );
+
+    const content = `# Docs
+> Summary
+## Links
+- [Feature](http://test.local/docs/feature.md): Feature
+`;
+    const result = await check.run(makeCtx({ content }));
+    expect(result.status).toBe('pass');
+    expect(requestLog[0]).toBe('/docs/feature.md');
+    expect(requestLog).not.toContain('/docs.md'); // NOTE(review): original 200s first, so later candidates look unreachable — confirm this can fail
+  });
+
+  // .mdx originals get the original-first benefit, but the parent-clean gate
+  // requires /index.html?\.md specifically. Confirm parent-clean does not fire.
+  it('tries .mdx original first but does not emit parent-clean candidate (issue #77)', async () => {
+    const requestLog: string[] = [];
+    server.use(
+      http.get('http://test.local/docs/guide.mdx', () => {
+        requestLog.push('/docs/guide.mdx');
+        return new HttpResponse('# Guide\n\nThe guide.', {
+          status: 200,
+          headers: { 'Content-Type': 'text/markdown' },
+        });
+      }),
+      http.get('http://test.local/docs.md', () => { // presumably where an ungated parent-clean rewrite would land
+        requestLog.push('/docs.md');
+        return new HttpResponse('Wrong', { status: 200 });
+      }),
+    );
+
+    const content = `# Docs
+> Summary
+## Links
+- [Guide](http://test.local/docs/guide.mdx): Guide
+`;
+    const result = await check.run(makeCtx({ content }));
+    expect(result.status).toBe('pass');
+    expect(requestLog[0]).toBe('/docs/guide.mdx');
+    expect(requestLog).not.toContain('/docs.md'); // NOTE(review): original 200s first, so later candidates look unreachable — confirm this can fail
+  });
+
+  // /page.html.md (no /index segment): gate requires /index.html.md, so the
+  // parent-clean candidate is NOT emitted. The original is still tried first.
+  it('does not emit parent-clean candidate for /page.html.md (no /index)', async () => {
+    const requestLog: string[] = [];
+    server.use(
+      http.get('http://test.local/docs/auth.html.md', () => {
+        requestLog.push('/docs/auth.html.md');
+        return new HttpResponse('# Auth\n\nAuth.', {
+          status: 200,
+          headers: { 'Content-Type': 'text/markdown' },
+        });
+      }),
+    );
+
+    const content = `# Docs
+> Summary
+## Links
+- [Auth](http://test.local/docs/auth.html.md): Auth
+`;
+    const result = await check.run(makeCtx({ content }));
+    expect(result.status).toBe('pass');
+    expect(requestLog[0]).toBe('/docs/auth.html.md'); // NOTE(review): only ordering is asserted; no parent-clean URL is handled or checked
+  });
+
+  // Original 404s, both index-style forms 404, parent-clean /docs/auth.md wins (Plaid second variant).
+  it('falls back to parent-clean candidate when /index.html.md original 404s (issue #77)', async () => {
+    const requestLog: string[] = [];
+    const md = '# Auth\n\nThe auth doc served via clean path.';
+    server.use(
+      http.get('http://test.local/docs/auth/index.html.md', () => {
+        requestLog.push('/docs/auth/index.html.md');
+        return new HttpResponse('Not Found', { status: 404 });
+      }),
+      http.get('http://test.local/docs/auth/index.md', () => {
+        requestLog.push('/docs/auth/index.md');
+        return new HttpResponse('Not Found', { status: 404 });
+      }),
+      http.get('http://test.local/docs/auth/index.html/index.md', () => {
+        requestLog.push('/docs/auth/index.html/index.md');
+        return new HttpResponse('Not Found', { status: 404 });
+      }),
+      http.get('http://test.local/docs/auth.md', () => {
+        requestLog.push('/docs/auth.md');
+        return new HttpResponse(md, {
+          status: 200,
+          headers: { 'Content-Type': 'text/markdown' },
+        });
+      }),
+    );
+
+    const content = `# Docs
+> Summary
+## Links
+- [Auth](http://test.local/docs/auth/index.html.md): Auth
+`;
+    const result = await check.run(makeCtx({ content }));
+    expect(result.status).toBe('pass');
+    expect(requestLog).toContain('/docs/auth/index.html.md'); // the published URL was still attempted
+    expect(requestLog).toContain('/docs/auth.md');
+    const pageResults = result.details?.pageResults as Array<{ mdUrl: string }>;
+    expect(pageResults[0].mdUrl).toBe('http://test.local/docs/auth.md'); // reports the URL that actually served markdown
+  });
+
+  // All forms 404: must fail cleanly without infinite candidate expansion.
+  it('fails cleanly when original, generated, and parent-clean candidates all 404', async () => {
+    const requestLog: string[] = []; // one letter per handled request: A = original, B/C = index forms, D = parent-clean
+    server.use(
+      http.get('http://test.local/docs/auth/index.html.md', () => {
+        requestLog.push('A');
+        return new HttpResponse('Not Found', { status: 404 });
+      }),
+      http.get('http://test.local/docs/auth/index.md', () => {
+        requestLog.push('B');
+        return new HttpResponse('Not Found', { status: 404 });
+      }),
+      http.get('http://test.local/docs/auth/index.html/index.md', () => {
+        requestLog.push('C');
+        return new HttpResponse('Not Found', { status: 404 });
+      }),
+      http.get('http://test.local/docs/auth.md', () => {
+        requestLog.push('D');
+        return new HttpResponse('Not Found', { status: 404 });
+      }),
+    );
+
+    const content = `# Docs
+> Summary
+## Links
+- [Auth](http://test.local/docs/auth/index.html.md): Auth
+`;
+    const result = await check.run(makeCtx({ content }));
+    expect(result.status).toBe('fail');
+    // Bounded: at most original + 2 generated + 1 parent-clean = 4 requests.
+    expect(requestLog.length).toBeLessThanOrEqual(4); // counts only the four handled URLs above; also catches repeats
+  });
+
+  // mdFormPreference must not be skewed by originalMdUrl wins; otherwise a
+  // run of .html.md sites would mis-bias the heuristic for unrelated pages.
+  it('does not skew mdFormPreference based on originalMdUrl wins (issue #77)', async () => {
+    const md = '# P\n\nContent.';
+    const requestLog: string[] = [];
+    server.use(
+      // Three .html.md originals — all win on first try
+      http.get('http://test.local/docs/a/index.html.md', () => {
+        requestLog.push('/docs/a/index.html.md');
+        return new HttpResponse(md, {
+          status: 200,
+          headers: { 'Content-Type': 'text/markdown' },
+        });
+      }),
+      http.get('http://test.local/docs/b/index.html.md', () => {
+        requestLog.push('/docs/b/index.html.md');
+        return new HttpResponse(md, {
+          status: 200,
+          headers: { 'Content-Type': 'text/markdown' },
+        });
+      }),
+      http.get('http://test.local/docs/c/index.html.md', () => {
+        requestLog.push('/docs/c/index.html.md');
+        return new HttpResponse(md, {
+          status: 200,
+          headers: { 'Content-Type': 'text/markdown' },
+        });
+      }),
+      // Fourth page is plain HTML (no llms.txt original); should try .md
+      // first by default (preference null because original wins didn't count).
+      http.get('http://test.local/docs/d.md', () => {
+        requestLog.push('/docs/d.md');
+        return new HttpResponse(md, {
+          status: 200,
+          headers: { 'Content-Type': 'text/markdown' },
+        });
+      }),
+      http.get('http://test.local/docs/d/index.md', () => {
+        requestLog.push('/docs/d/index.md'); // only reached if the index form is tried before /docs/d.md
+        return new HttpResponse('Not Found', { status: 404 });
+      }),
+    );
+
+    const content = `# Docs
+> Summary
+## Links
+- [A](http://test.local/docs/a/index.html.md): A
+- [B](http://test.local/docs/b/index.html.md): B
+- [C](http://test.local/docs/c/index.html.md): C
+- [D](http://test.local/docs/d): D
+`;
+    const ctx = makeCtx({ content });
+    ctx.options.maxConcurrency = 1;
+    const result = await check.run(ctx);
+    expect(result.status).toBe('pass');
+    // /docs/d.md must be tried in the default order. If preference were
+    // skewed toward 'index' (because the .html.md originals end with index),
+    // the logging 404 handler for /docs/d/index.md would be hit before it.
+    expect(requestLog).toContain('/docs/d.md');
+    expect(requestLog).not.toContain('/docs/d/index.md');
+  });
+
+  // pageResults.mdUrl reports the URL that served the markdown, not the normalized page URL.
+  it('reports originalMdUrl in pageResults.mdUrl when it served the content', async () => {
+    server.use(
+      http.get(
+        'http://test.local/docs/auth/index.html.md',
+        () =>
+          new HttpResponse('# Auth', {
+            status: 200,
+            headers: { 'Content-Type': 'text/markdown' },
+          }),
+      ),
+    );
+    const content = `# Docs
+> Summary
+## Links
+- [Auth](http://test.local/docs/auth/index.html.md): Auth
+`;
+    const ctx = makeCtx({ content }); // kept in a local so the page cache can be inspected after the run
+    const result = await check.run(ctx);
+    const pageResults = result.details?.pageResults as Array<{ mdUrl: string }>;
+    expect(pageResults[0].mdUrl).toBe('http://test.local/docs/auth/index.html.md');
+    // Cache is keyed by page URL (the normalized HTML form)
+    const cached = ctx.pageCache.get('http://test.local/docs/auth/index.html');
+    expect(cached?.markdown?.content).toBe('# Auth');
+  });
 });
diff --git a/test/unit/helpers/get-page-urls.test.ts b/test/unit/helpers/get-page-urls.test.ts
index 22322aa..1ad7933 100644
--- a/test/unit/helpers/get-page-urls.test.ts
+++ b/test/unit/helpers/get-page-urls.test.ts
@@ -2120,3 +2120,222 @@ describe('discoverAndSamplePages', () => {
     expect(result.warnings[0]).toContain('gzipped sitemap');
   });
 });
+
+// Issue #77: when llms.txt links use a .md/.mdx (or .html.md) suffix,
+// normalizePageUrl rewrites them to their HTML form for dedup against sitemap
+// entries. The original .md URL must still be carried alongside so that
+// markdown-availability checks can test the URL the site explicitly published.
+describe('originalMdUrls (issue #77)', () => {
+  it('preserves originalMdUrl for .html.md links from llms.txt', async () => {
+    const ctx = createContext('http://test77a.local', { requestDelay: 0 });
+    const content = `# Docs
+> Summary
+## Links
+- [Auth](http://test77a.local/docs/auth/index.html.md): Auth guide
+`;
+    const discovered: DiscoveredFile[] = [
+      { url: 'http://test77a.local/llms.txt', content, status: 200, redirected: false },
+    ];
+    ctx.previousResults.set('llms-txt-exists', {
+      id: 'llms-txt-exists',
+      category: 'content-discoverability',
+      status: 'pass',
+      message: 'Found',
+      details: { discoveredFiles: discovered },
+    });
+    mockSitemapNotFound(server, 'http://test77a.local');
+
+    const result = await discoverAndSamplePages(ctx);
+    expect(result.urls).toContain('http://test77a.local/docs/auth/index.html'); // normalized HTML form
+    expect(result.originalMdUrls).toBeDefined();
+    expect(result.originalMdUrls?.['http://test77a.local/docs/auth/index.html']).toBe(
+      'http://test77a.local/docs/auth/index.html.md',
+    ); // map is keyed by the normalized URL; value is the URL exactly as published
+  });
+
+  it('preserves originalMdUrl for .mdx links from llms.txt', async () => {
+    const ctx = createContext('http://test77b.local', { requestDelay: 0 });
+    const content = `# Docs
+> Summary
+## Links
+- [Guide](http://test77b.local/docs/guide.mdx): Guide
+`;
+    const discovered: DiscoveredFile[] = [
+      { url: 'http://test77b.local/llms.txt', content, status: 200, redirected: false },
+    ];
+    ctx.previousResults.set('llms-txt-exists', {
+      id: 'llms-txt-exists',
+      category: 'content-discoverability',
+      status: 'pass',
+      message: 'Found',
+      details: { discoveredFiles: discovered },
+    });
+    mockSitemapNotFound(server, 'http://test77b.local');
+
+    const result = await discoverAndSamplePages(ctx);
+    expect(result.urls).toContain('http://test77b.local/docs/guide'); // .mdx suffix stripped in the page URL
+    expect(result.originalMdUrls?.['http://test77b.local/docs/guide']).toBe(
+      'http://test77b.local/docs/guide.mdx',
+    ); // optional chaining: a missing map fails this assertion instead of throwing a TypeError
+  });
+
+  it('preserves originalMdUrl for plain .md links from llms.txt', async () => {
+    const ctx = createContext('http://test77c.local', { requestDelay: 0 });
+    const content = `# Docs
+> Summary
+## Links
+- [Auth](http://test77c.local/docs/auth.md): Auth
+`;
+    const discovered: DiscoveredFile[] = [
+      { url: 'http://test77c.local/llms.txt', content, status: 200, redirected: false },
+    ];
+    ctx.previousResults.set('llms-txt-exists', {
+      id: 'llms-txt-exists',
+      category: 'content-discoverability',
+      status: 'pass',
+      message: 'Found',
+      details: { discoveredFiles: discovered },
+    });
+    mockSitemapNotFound(server, 'http://test77c.local');
+
+    const result = await discoverAndSamplePages(ctx);
+    expect(result.urls).toContain('http://test77c.local/docs/auth'); // .md suffix stripped in the page URL
+    expect(result.originalMdUrls?.['http://test77c.local/docs/auth']).toBe(
+      'http://test77c.local/docs/auth.md',
+    ); // optional chaining: a missing map fails this assertion instead of throwing a TypeError
+  });
+
+  it('does not populate originalMdUrls for sitemap-only discovery', async () => {
+    const ctx = createContext('http://test77d.local', { requestDelay: 0 });
+    ctx.previousResults.set('llms-txt-exists', {
+      id: 'llms-txt-exists',
+      category: 'content-discoverability',
+      status: 'fail',
+      message: 'No llms.txt',
+      details: { discoveredFiles: [] },
+    });
+    server.use(
+      http.get('http://test77d.local/robots.txt', () => new HttpResponse('', { status: 404 })),
+      http.get(
+        'http://test77d.local/sitemap.xml',
+        () =>
+          new HttpResponse(
+            `<?xml version="1.0"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+              <url><loc>http://test77d.local/docs/auth</loc></url>
+              <url><loc>http://test77d.local/docs/guide</loc></url>
+            </urlset>`,
+            { status: 200, headers: { 'Content-Type': 'application/xml' } },
+          ),
+      ),
+    );
+
+    const result = await discoverAndSamplePages(ctx);
+    expect(result.urls).toContain('http://test77d.local/docs/auth');
+    // Sitemap URLs are HTML; nothing to preserve. An absent map and an empty
+    // map are both acceptable, so normalize to {} before counting keys; a
+    // failure then shows the unexpected keys rather than a bare boolean.
+    expect(Object.keys(result.originalMdUrls ?? {})).toHaveLength(0);
+  });
+
+  it('keeps the .md form when llms.txt has both .md and HTML for the same page', async () => {
+    const ctx = createContext('http://test77e.local', { requestDelay: 0 });
+    const content = `# Docs
+> Summary
+## Links
+- [Auth](http://test77e.local/docs/auth.md): Auth markdown
+- [Auth HTML](http://test77e.local/docs/auth): Auth html
+`;
+    const discovered: DiscoveredFile[] = [
+      { url: 'http://test77e.local/llms.txt', content, status: 200, redirected: false },
+    ];
+    ctx.previousResults.set('llms-txt-exists', {
+      id: 'llms-txt-exists',
+      category: 'content-discoverability',
+      status: 'pass',
+      message: 'Found',
+      details: { discoveredFiles: discovered },
+    });
+    mockSitemapNotFound(server, 'http://test77e.local');
+
+    const result = await discoverAndSamplePages(ctx);
+    // Both entries collapse to the HTML URL after normalization.
+    const matching = result.urls.filter((u) => u === 'http://test77e.local/docs/auth');
+    expect(matching).toHaveLength(1);
+    expect(result.originalMdUrls?.['http://test77e.local/docs/auth']).toBe(
+      'http://test77e.local/docs/auth.md',
+    ); // the later plain-HTML link for the same page must not erase the .md original
+  });
+
+  it('filters originalMdUrls to the sampled subset', async () => {
+    const links = Array.from(
+      { length: 5 },
+      (_, i) => `- [P${i}](http://test77f.local/docs/p${i}/index.html.md): Page ${i}`,
+    ).join('\n');
+    const content = `# Docs\n> Summary\n## Links\n${links}\n`;
+    const ctx = createContext('http://test77f.local', { requestDelay: 0, maxLinksToTest: 2 }); // 5 links, sample of 2
+    const discovered: DiscoveredFile[] = [
+      { url: 'http://test77f.local/llms.txt', content, status: 200, redirected: false },
+    ];
+    ctx.previousResults.set('llms-txt-exists', {
+      id: 'llms-txt-exists',
+      category: 'content-discoverability',
+      status: 'pass',
+      message: 'Found',
+      details: { discoveredFiles: discovered },
+    });
+    mockSitemapNotFound(server, 'http://test77f.local');
+
+    const result = await discoverAndSamplePages(ctx);
+    expect(result.urls).toHaveLength(2);
+    expect(result.sampled).toBe(true);
+    // Only the sampled URLs should have entries in originalMdUrls
+    expect(Object.keys(result.originalMdUrls ?? {})).toHaveLength(2);
+    for (const sampledUrl of result.urls) {
+      expect(result.originalMdUrls?.[sampledUrl]).toMatch(/\.html\.md$/); // every sampled page keeps its published URL
+    }
+  });
+
+  it('propagates originalMdUrl through aggregate (.txt) walking', async () => {
+    const ctx = createContext('http://test77g.local', { requestDelay: 0 });
+    const llmsContent = `# Docs
+> Summary
+## Links
+- [Aggregate](http://test77g.local/docs/all.txt): All pages
+`;
+    const aggregateContent = `# All
+- [Auth](http://test77g.local/docs/auth/index.html.md): Auth guide
+`;
+    server.use(
+      http.get(
+        'http://test77g.local/docs/all.txt', // the aggregate file that llms.txt links to
+        () =>
+          new HttpResponse(aggregateContent, {
+            status: 200,
+            headers: { 'Content-Type': 'text/plain' },
+          }),
+      ),
+    );
+    const discovered: DiscoveredFile[] = [
+      {
+        url: 'http://test77g.local/llms.txt',
+        content: llmsContent,
+        status: 200,
+        redirected: false,
+      },
+    ];
+    ctx.previousResults.set('llms-txt-exists', {
+      id: 'llms-txt-exists',
+      category: 'content-discoverability',
+      status: 'pass',
+      message: 'Found',
+      details: { discoveredFiles: discovered },
+    });
+    mockSitemapNotFound(server, 'http://test77g.local');
+
+    const result = await discoverAndSamplePages(ctx);
+    expect(result.urls).toContain('http://test77g.local/docs/auth/index.html');
+    expect(result.originalMdUrls?.['http://test77g.local/docs/auth/index.html']).toBe(
+      'http://test77g.local/docs/auth/index.html.md',
+    ); // the .html.md link appears only inside all.txt, not in llms.txt itself
+  });
+});
diff --git a/test/unit/helpers/to-md-urls.test.ts b/test/unit/helpers/to-md-urls.test.ts
index 7c46547..0364c84 100644
--- a/test/unit/helpers/to-md-urls.test.ts
+++ b/test/unit/helpers/to-md-urls.test.ts
@@ -68,6 +68,30 @@ describe('toMdUrls', () => {
   it('returns empty array for .xml files', () => {
     expect(toMdUrls('https://example.com/sitemap.xml')).toEqual([]);
   });
+
+  // Regression (issue #77): a Plaid-style /page/index.html.md URL from llms.txt is its own sole candidate.
+  it('returns .html.md URL as-is (already a markdown URL)', () => {
+    expect(toMdUrls('https://example.com/docs/auth/index.html.md')).toEqual([
+      'https://example.com/docs/auth/index.html.md',
+    ]);
+  });
+
+  // Invariant for issue #77 fix: parent-clean .md candidate generation
+  // (e.g. /docs/auth/index.html → /docs/auth.md) lives in markdown-url-support
+  // only, gated by an originalMdUrl signal from llms.txt. It must NOT be added
+  // to toMdUrls() — that would re-introduce the false-positive class where
+  // checks like llms-txt-directive-md and llms-txt-links-markdown pass via
+  // unrelated sibling .md files. If you need the parent-clean form, add it
+  // at the call site, not here.
+  it('does NOT generate /foo.md candidate from /foo/index.html (issue #77 isolation)', () => {
+    const result = toMdUrls('https://example.com/docs/auth/index.html');
+    expect(result).not.toContain('https://example.com/docs/auth.md'); // the parent-clean form must be absent
+  });
+
+  it('does NOT generate /foo.md candidate from /foo/index.htm (issue #77 isolation)', () => {
+    const result = toMdUrls('https://example.com/docs/auth/index.htm'); // same check for the .htm extension
+    expect(result).not.toContain('https://example.com/docs/auth.md'); // the parent-clean form must be absent
+  });
 });
 
 describe('toHtmlUrl', () => {