eigenpal · jedrazb · Jun 21, 2026 · Jun 20, 2026
diff --git a/.changeset/clipboard-html-hardening.md b/.changeset/clipboard-html-hardening.md
@@ -0,0 +1,5 @@
+---
+'@eigenpal/docx-editor-core': patch
+---
+
+Harden clipboard HTML paste against script injection and slow-input denial of service. Pasted HTML is now sanitized (via DOMPurify) and parsed into an inert document instead of being assigned to `innerHTML`, so embedded scripts, event handlers, and `javascript:` URLs cannot run. Word comment stripping and Office/Word namespace-tag removal now use linear scans that cannot backtrack on hostile input or leave a stray comment opener behind.
diff --git a/bun.lock b/bun.lock
diff --git a/packages/core/package.json b/packages/core/package.json
@@ -608,6 +608,7 @@
   },
   "dependencies": {
     "docxtemplater": "^3.50.0",
+    "dompurify": "^3.2.0",
     "jszip": "^3.10.1",
     "pizzip": "^3.1.7",
     "xml-js": "^1.6.11"

diff --git a/packages/core/src/utils/__tests__/clipboard-html.test.ts b/packages/core/src/utils/__tests__/clipboard-html.test.ts
@@ -0,0 +1,104 @@
+import { GlobalRegistrator } from '@happy-dom/global-registrator';
+import { afterAll, beforeAll, describe, expect, test } from 'bun:test';
+
+import { cleanWordHtml, htmlToRuns } from '../clipboard';
+
+// htmlToRuns binds DOMPurify to the live window lazily on first call, so a
+// window registered before any test runs is sufficient.
+beforeAll(() => GlobalRegistrator.register());
+afterAll(() => GlobalRegistrator.unregister());
+
+describe('cleanWordHtml comment stripping', () => {
+  test('removes plain HTML comments', () => {
+    expect(cleanWordHtml('a<!-- hidden -->b')).toBe('ab');
+  });
+
+  test('removes Word downlevel conditional comments', () => {
+    const html = 'x<!--[if gte mso 9]><xml>junk</xml><![endif]-->y';
+    expect(cleanWordHtml(html)).toBe('xy');
+  });
+
+  test('leaves no stray "<!--" when a comment is unterminated', () => {
+    // Incomplete-multi-character-sanitization regression: a lone, unterminated
+    // comment opener must not survive the cleanup.
+    expect(cleanWordHtml('safe<!--dangling')).not.toContain('<!--');
+  });
+
+  test('stays linear on adversarial conditional-comment input (no ReDoS)', () => {
+    // Polynomial-ReDoS regression: a long run of conditional-comment openers
+    // used to backtrack quadratically. This must finish near-instantly.
+    const evil = '<!--[if '.repeat(50_000);
+    const start = performance.now();
+    cleanWordHtml(evil);
+    expect(performance.now() - start).toBeLessThan(5_000);
+  });
+});
+
+describe('cleanWordHtml namespace-tag stripping', () => {
+  test('removes paired o:/w: namespace blocks and their content', () => {
+    expect(cleanWordHtml('a<o:p>junk</o:p>b')).toBe('ab');
+    expect(cleanWordHtml('a<w:p>junk</w:p>b')).toBe('ab');
+    // First-close-wins (same as the prior lazy regex): the inner close pairs
+    // with the opener, leaving the trailing close tag behind.
+    expect(cleanWordHtml('a<o:x>1</o:y>2</o:z>b')).toBe('a2</o:z>b');
+  });
+
+  test('strips namespace tags case-insensitively (matches the prior /gi regex)', () => {
+    expect(cleanWordHtml('a<O:P>junk</O:P>b')).toBe('ab');
+    expect(cleanWordHtml('a<W:sdt>junk</w:Sdt>b')).toBe('ab');
+  });
+
+  test('removes self-closing o:/w: tags', () => {
+    expect(cleanWordHtml('a<o:p/>b')).toBe('ab');
+  });
+
+  test('keeps content when a namespace opener has no closing tag', () => {
+    // Matches the prior lazy-regex behavior: an unmatched opener is left as-is.
+    expect(cleanWordHtml('keep<o:p>this')).toContain('this');
+  });
+
+  test('stays linear on many unterminated namespace openers (no ReDoS)', () => {
+    // `/<o:[^>]*>[\s\S]*?<\/o:[^>]*>/` backtracked quadratically here.
+    const evil = '<o:p>'.repeat(200_000);
+    const start = performance.now();
+    cleanWordHtml(evil);
+    expect(performance.now() - start).toBeLessThan(5_000);
+  });
+});
+
+describe('htmlToRuns sanitizes and does not execute markup', () => {
+  const runText = (runs: ReturnType<typeof htmlToRuns>): string =>
+    runs
+      .flatMap((r) => r.content ?? [])
+      .map((c) => ('text' in c ? c.text : ''))
+      .join('');
+
+  // NOTE on coverage: these tests verify text extraction and the security
+  // boundary (no script/handler execution, no global mutation). They do NOT
+  // assert that visual formatting (bold/color/font) survives, because under
+  // happy-dom DOMPurify strips formatting tags/attributes. In a real browser
+  // (and jsdom) DOMPurify's default allowlist keeps `<b>/<i>/<u>/<span>` and
+  // the `style` attribute, so consumers calling htmlToRuns get formatting
+  // intact; that path is the DOMPurify default, not asserted here.
+  test('extracts text from pasted HTML', () => {
+    const runs = htmlToRuns('<p>Hello <b>world</b></p>', 'Hello world');
+    expect(runText(runs)).toContain('Hello');
+    expect(runText(runs)).toContain('world');
+  });
+
+  test('does not execute or leak injected handlers/scripts', () => {
+    const g = globalThis as Record<string, unknown>;
+    delete g.__pwned;
+    for (const payload of [
+      '<img src=x onerror="globalThis.__pwned=1">hi',
+      '<svg onload="globalThis.__pwned=1"></svg>hi',
+      '<a href="javascript:globalThis.__pwned=1">hi</a>',
+    ]) {
+      const runs = htmlToRuns(payload, 'hi');
+      // No handler fires (DOMPurify strips them; parsing is inert anyway), and
+      // benign text still comes through.
+      expect(g.__pwned).toBeUndefined();
+      expect(runText(runs)).toContain('hi');
+    }
+  });
+});
diff --git a/packages/core/src/utils/clipboard.ts b/packages/core/src/utils/clipboard.ts
@@ -8,9 +8,21 @@
  * - Ctrl+C, Ctrl+V, Ctrl+X keyboard shortcuts
  */
 
+import createDOMPurify from 'dompurify';
+
 import type { Run, TextFormatting, Paragraph, Theme } from '../types/document';
 import { resolveColorToHex } from './colorResolver';
 
+// dompurify's default export only auto-binds `sanitize` to a `window` that
+// exists at import time (always true in the browser). Bind explicitly to the
+// live window on first use so it also works when a DOM is installed after this
+// module is evaluated (e.g. test environments). The default export is callable
+// as a factory whether it is the bare factory or an already-bound instance.
+let domPurify: ReturnType<typeof createDOMPurify> | undefined;
+function getDomPurify(): ReturnType<typeof createDOMPurify> {
+  return (domPurify ??= createDOMPurify(window));
+}
+
 // ============================================================================
 // TYPES
 // ============================================================================
@@ -412,25 +424,96 @@ export function isEditorHtml(html: string): boolean {
   );
 }
 
+/**
+ * Strip every HTML comment, including downlevel conditional comments
+ * (`<!--[if ...]> ... <![endif]-->`).
+ *
+ * Uses a single linear scan instead of a regex: clipboard HTML is
+ * attacker-controlled, and a lazy `<!--[\s\S]*?-->` against a multi-character
+ * terminator backtracks polynomially. The scan also guarantees no stray
+ * `<!--` survives (an unterminated comment is dropped through end-of-string),
+ * which a single regex pass cannot promise.
+ */
+function stripHtmlComments(html: string): string {
+  let result = '';
+  let i = 0;
+  while (i < html.length) {
+    const start = html.indexOf('<!--', i);
+    if (start === -1) {
+      result += html.slice(i);
+      break;
+    }
+    result += html.slice(i, start);
+    const end = html.indexOf('-->', start + 4);
+    if (end === -1) {
+      // Unterminated comment: drop the remainder so no `<!--` can leak through.
+      break;
+    }
+    i = end + 3;
+  }
+  return result;
+}
+
+/**
+ * Remove `<prefix...> ... </prefix...>` element blocks (e.g. Office `<o:...>`
+ * and Word `<w:...>` namespaced tags) from attacker-controlled clipboard HTML.
+ *
+ * Linear scan instead of `/<prefix[^>]*>[\s\S]*?<\/prefix[^>]*>/gi`: that lazy
+ * pattern backtracks polynomially (O(n^2)) on hostile input with many openers
+ * and no close tag. Mirrors the regex's first-close-wins semantics — each
+ * opener is paired with the next close tag anywhere ahead; an opener with no
+ * close tag anywhere is left intact (exactly what the lazy regex did).
+ */
+function stripPairedNamespaceTags(html: string, prefix: string): string {
+  const open = '<' + prefix;
+  const close = '</' + prefix;
+  // Search case-insensitively to match the `/gi` regex this replaced (Word can
+  // emit `<o:p>` in any case). Tag markers are located in a lowercased copy;
+  // slices are taken from the original `html` so kept text keeps its casing.
+  const lower = html.toLowerCase();
+  let result = '';
+  let i = 0;
+  while (i < html.length) {
+    const start = lower.indexOf(open, i);
+    if (start === -1) {
+      result += html.slice(i);
+      break;
+    }
+    const openTagEnd = html.indexOf('>', start);
+    const closeStart = openTagEnd === -1 ? -1 : lower.indexOf(close, openTagEnd + 1);
+    const closeTagEnd = closeStart === -1 ? -1 : html.indexOf('>', closeStart);
+    if (closeTagEnd === -1) {
+      // No complete `<prefix...>...</prefix...>` remains anywhere ahead, so no
+      // later opener can be paired either. Keep the rest verbatim and stop —
+      // this is what makes the scan linear (no rescanning for every opener).
+      result += html.slice(i);
+      break;
+    }
+    // Drop the whole block, keeping the text before this opener.
+    result += html.slice(i, start);
+    i = closeTagEnd + 1;
+  }
+  return result;
+}
+
 /**
  * Clean Microsoft Word HTML
  */
 export function cleanWordHtml(html: string): string {
   let cleaned = html;
 
-  // Remove Word-specific comments
-  cleaned = cleaned.replace(/<!--\[if[\s\S]*?<!\[endif\]-->/gi, '');
-  cleaned = cleaned.replace(/<!--[\s\S]*?-->/g, '');
+  // Remove Word-specific (and all other) HTML comments
+  cleaned = stripHtmlComments(cleaned);
 
   // Remove XML declarations
   cleaned = cleaned.replace(/<\?xml[^>]*>/gi, '');
 
-  // Remove o: (Office) namespace tags
-  cleaned = cleaned.replace(/<o:[^>]*>[\s\S]*?<\/o:[^>]*>/gi, '');
+  // Remove o: (Office) namespace tags (linear scan; see stripPairedNamespaceTags)
+  cleaned = stripPairedNamespaceTags(cleaned, 'o:');
   cleaned = cleaned.replace(/<o:[^>]*\/>/gi, '');
 
   // Remove w: (Word) namespace tags
-  cleaned = cleaned.replace(/<w:[^>]*>[\s\S]*?<\/w:[^>]*>/gi, '');
+  cleaned = stripPairedNamespaceTags(cleaned, 'w:');
   cleaned = cleaned.replace(/<w:[^>]*\/>/gi, '');
 
   // Remove mso styles but keep other styles
@@ -463,8 +546,14 @@ export function htmlToRuns(html: string, plainTextFallback: string): Run[] {
     return plainTextFallback ? [createTextRun(plainTextFallback)] : [];
   }
 
-  const container = document.createElement('div');
-  container.innerHTML = html;
+  // Sanitize the attacker-controlled clipboard HTML at this trust boundary
+  // (scripts, event handlers, javascript: URLs, dangerous tags all stripped),
+  // then parse the cleaned markup into an inert document. We only walk the
+  // resulting node tree for text and formatting — nothing is ever inserted
+  // into the live DOM.
+  const sanitized = getDomPurify().sanitize(html);
+  const parsed = new DOMParser().parseFromString(sanitized, 'text/html');
+  const container = parsed.body;
 
   const runs: Run[] = [];
   processNode(container, runs, {});