Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/clipboard-html-hardening.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
'@eigenpal/docx-editor-core': patch
---

Harden clipboard HTML paste against script injection and slow-input denial of service. Pasted HTML is now sanitized (via DOMPurify) and parsed into an inert document instead of being assigned to `innerHTML`, so embedded scripts, event handlers, and `javascript:` URLs cannot run. Word comment stripping and Office/Word namespace-tag removal now use linear scans that cannot backtrack on hostile input or leave a stray comment opener behind.
17 changes: 11 additions & 6 deletions bun.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions packages/core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -608,6 +608,7 @@
},
"dependencies": {
"docxtemplater": "^3.50.0",
"dompurify": "^3.2.0",
"jszip": "^3.10.1",
"pizzip": "^3.1.7",
"xml-js": "^1.6.11"
Expand Down
104 changes: 104 additions & 0 deletions packages/core/src/utils/__tests__/clipboard-html.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import { GlobalRegistrator } from '@happy-dom/global-registrator';
import { afterAll, beforeAll, describe, expect, test } from 'bun:test';

import { cleanWordHtml, htmlToRuns } from '../clipboard';

// htmlToRuns binds DOMPurify to the live window lazily on first call, so a
// window registered before any test runs is sufficient.
beforeAll(() => GlobalRegistrator.register());
afterAll(() => GlobalRegistrator.unregister());

// Exercises the comment-removal step of cleanWordHtml: plain comments, Word
// downlevel conditional comments, unterminated comments, and a timing guard
// against quadratic behavior on hostile input.
//
// NOTE(review): none of the cases below covers a comment opener that only
// comes into existence once a removal splices the text on either side of it
// together (for example '<!<!---->--x'). Consider adding such a case so the
// "no stray opener" property is pinned for spliced input as well.
describe('cleanWordHtml comment stripping', () => {
test('removes plain HTML comments', () => {
expect(cleanWordHtml('a<!-- hidden -->b')).toBe('ab');
});

test('removes Word downlevel conditional comments', () => {
// The whole `<!--[if ...]> ... <![endif]-->` span, including the markup
// inside it, is expected to disappear.
const html = 'x<!--[if gte mso 9]><xml>junk</xml><![endif]-->y';
expect(cleanWordHtml(html)).toBe('xy');
});

test('leaves no stray "<!--" when a comment is unterminated', () => {
// Incomplete-multi-character-sanitization regression: a lone, unterminated
// comment opener must not survive the cleanup.
expect(cleanWordHtml('safe<!--dangling')).not.toContain('<!--');
});

test('stays linear on adversarial conditional-comment input (no ReDoS)', () => {
// Polynomial-ReDoS regression: a long run of conditional-comment openers
// used to backtrack quadratically. This must finish near-instantly.
// The 5 000 ms bound is a generous ceiling so slow CI machines do not flake;
// it is far above what a linear scan of this input should need.
const evil = '<!--[if '.repeat(50_000);
const start = performance.now();
cleanWordHtml(evil);
expect(performance.now() - start).toBeLessThan(5_000);
});
});

// Exercises removal of Office (`o:`) and Word (`w:`) namespaced elements by
// cleanWordHtml: paired blocks, case-insensitive matching, self-closing tags,
// unmatched openers, and a timing guard against quadratic behavior.
//
// NOTE(review): the case-insensitivity cases use ASCII-only input. Text that
// contains characters whose lowercase form has a different length (for example
// U+0130) ahead of a namespaced tag is not covered here.
describe('cleanWordHtml namespace-tag stripping', () => {
test('removes paired o:/w: namespace blocks and their content', () => {
expect(cleanWordHtml('a<o:p>junk</o:p>b')).toBe('ab');
expect(cleanWordHtml('a<w:p>junk</w:p>b')).toBe('ab');
// First-close-wins (same as the prior lazy regex): the inner close pairs
// with the opener, leaving the trailing close tag behind.
expect(cleanWordHtml('a<o:x>1</o:y>2</o:z>b')).toBe('a2</o:z>b');
});

test('strips namespace tags case-insensitively (matches the prior /gi regex)', () => {
expect(cleanWordHtml('a<O:P>junk</O:P>b')).toBe('ab');
// Opener and closer may differ in case from each other as well.
expect(cleanWordHtml('a<W:sdt>junk</w:Sdt>b')).toBe('ab');
});

test('removes self-closing o:/w: tags', () => {
expect(cleanWordHtml('a<o:p/>b')).toBe('ab');
});

test('keeps content when a namespace opener has no closing tag', () => {
// Matches the prior lazy-regex behavior: an unmatched opener is left as-is.
expect(cleanWordHtml('keep<o:p>this')).toContain('this');
});

test('stays linear on many unterminated namespace openers (no ReDoS)', () => {
// `/<o:[^>]*>[\s\S]*?<\/o:[^>]*>/` backtracked quadratically here.
// The 5 000 ms bound is a loose ceiling chosen to avoid CI flakiness.
const evil = '<o:p>'.repeat(200_000);
const start = performance.now();
cleanWordHtml(evil);
expect(performance.now() - start).toBeLessThan(5_000);
});
});

// Exercises htmlToRuns as a trust boundary: text must still be extracted from
// pasted HTML, and hostile markup must not be able to set a global flag.
describe('htmlToRuns sanitizes and does not execute markup', () => {
// Flattens the runs into one string by concatenating the `text` of every
// content item that has one; items without a `text` field contribute ''.
const runText = (runs: ReturnType<typeof htmlToRuns>): string =>
runs
.flatMap((r) => r.content ?? [])
.map((c) => ('text' in c ? c.text : ''))
.join('');

// NOTE on coverage: these tests verify text extraction and the security
// boundary (no script/handler execution, no global mutation). They do NOT
// assert that visual formatting (bold/color/font) survives, because under
// happy-dom DOMPurify strips formatting tags/attributes. In a real browser
// (and jsdom) DOMPurify's default allowlist keeps `<b>/<i>/<u>/<span>` and
// the `style` attribute, so consumers calling htmlToRuns get formatting
// intact; that path is the DOMPurify default, not asserted here.
test('extracts text from pasted HTML', () => {
const runs = htmlToRuns('<p>Hello <b>world</b></p>', 'Hello world');
expect(runText(runs)).toContain('Hello');
expect(runText(runs)).toContain('world');
});

test('does not execute or leak injected handlers/scripts', () => {
const g = globalThis as Record<string, unknown>;
// Start from a clean slate so a flag left by an earlier test cannot mask
// or fake a result. The flag is cleared once, before the first payload.
delete g.__pwned;
for (const payload of [
'<img src=x onerror="globalThis.__pwned=1">hi',
'<svg onload="globalThis.__pwned=1"></svg>hi',
'<a href="javascript:globalThis.__pwned=1">hi</a>',
]) {
const runs = htmlToRuns(payload, 'hi');
// No handler fires (DOMPurify strips them; parsing is inert anyway), and
// benign text still comes through.
// NOTE(review): `__pwned` is read synchronously right after htmlToRuns
// returns, so a handler that fired on a later task would not be observed
// here — TODO confirm whether awaiting a tick is needed for this check to
// be meaningful under the test DOM.
expect(g.__pwned).toBeUndefined();
expect(runText(runs)).toContain('hi');
}
});
});
105 changes: 97 additions & 8 deletions packages/core/src/utils/clipboard.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,21 @@
* - Ctrl+C, Ctrl+V, Ctrl+X keyboard shortcuts
*/

import createDOMPurify from 'dompurify';

import type { Run, TextFormatting, Paragraph, Theme } from '../types/document';
import { resolveColorToHex } from './colorResolver';

// The `dompurify` default export only wires up `sanitize` by itself when a
// `window` is already present while the module is being imported (always the
// case in a browser). To also support a DOM that is installed later (e.g. test
// environments), the instance is created against the live `window` the first
// time it is requested and cached from then on. The default export can be
// called as a factory regardless of whether it is the bare factory or an
// instance that has already been bound.
let domPurify: ReturnType<typeof createDOMPurify> | undefined;

/** Returns the cached DOMPurify instance, creating it on first use. */
function getDomPurify(): ReturnType<typeof createDOMPurify> {
  if (domPurify === undefined) {
    domPurify = createDOMPurify(window);
  }
  return domPurify;
}

// ============================================================================
// TYPES
// ============================================================================
Expand Down Expand Up @@ -412,25 +424,96 @@ export function isEditorHtml(html: string): boolean {
);
}

/**
 * Strip every HTML comment, including downlevel conditional comments
 * (`<!--[if ...]> ... <![endif]-->`), from attacker-controlled clipboard HTML.
 *
 * Implemented as a single forward scan instead of a regex: a lazy
 * `<!--[\s\S]*?-->` against a multi-character terminator backtracks
 * polynomially on hostile input.
 *
 * The opener is detected on the *output* being built rather than on the raw
 * input. That matters because removing a comment joins the text on either
 * side of it, and the join itself can spell a new opener (for example
 * `<!<!---->--x` would otherwise become `<!--x`). Checking the output tail
 * after every appended character catches such spliced openers too, so the
 * returned string never contains `<!--`.
 *
 * Each input character is appended at most once and removed at most once,
 * and the terminator search only ever moves forward, so the scan is linear.
 *
 * @param html Raw HTML, possibly hostile.
 * @returns `html` with all comments removed. An unterminated comment is
 *   dropped through end-of-string.
 */
function stripHtmlComments(html: string): string {
  const OPENER = '<!--';
  const TERMINATOR = '-->';

  // Without an opener nothing is removed, so nothing can be spliced either.
  if (!html.includes(OPENER)) {
    return html;
  }

  // Output stack of single UTF-16 code units; joined once at the end.
  const kept: string[] = [];
  let i = 0;
  while (i < html.length) {
    const ch = html.charAt(i);
    i += 1;
    kept.push(ch);

    // An opener can only be completed by its final '-', so only then inspect
    // the last four kept characters.
    const n = kept.length;
    const tailIsOpener =
      ch === '-' &&
      n >= OPENER.length &&
      kept[n - 2] === '-' &&
      kept[n - 3] === '!' &&
      kept[n - 4] === '<';
    if (!tailIsOpener) {
      continue;
    }

    // Discard the opener from the output, then skip the comment body.
    kept.length = n - OPENER.length;
    const end = html.indexOf(TERMINATOR, i);
    if (end === -1) {
      // Unterminated comment: drop the remainder so no `<!--` can leak through.
      break;
    }
    i = end + TERMINATOR.length;
  }
  return kept.join('');
}

/**
 * Remove `<prefix...> ... </prefix...>` element blocks (e.g. Office `<o:...>`
 * and Word `<w:...>` namespaced tags) from attacker-controlled clipboard HTML.
 *
 * Linear scan instead of `/<prefix[^>]*>[\s\S]*?<\/prefix[^>]*>/gi`: that lazy
 * pattern backtracks polynomially (O(n^2)) on hostile input with many openers
 * and no close tag. The scan keeps the regex's first-close-wins semantics:
 * each opener is paired with the next close tag anywhere ahead, and an opener
 * with no complete close tag ahead is left intact.
 *
 * Matching is case-insensitive for ASCII letters. Tag positions are located
 * in a case-folded copy and then used to slice the original string, so the
 * copy MUST have exactly the same length as the original. The built-in
 * `String.prototype.toLowerCase` does not guarantee that (U+0130 lowercases
 * to two code units), which would shift every later offset and cut the wrong
 * span. Only `A-Z` are folded here, which preserves length.
 *
 * @param html Raw HTML, possibly hostile.
 * @param prefix Namespace prefix including the colon, e.g. `'o:'`; matched
 *   without regard to ASCII case.
 * @returns `html` with every paired block for `prefix` removed; text outside
 *   the removed blocks keeps its original casing.
 */
function stripPairedNamespaceTags(html: string, prefix: string): string {
  // Length-preserving case fold (ASCII only); see the doc comment for why
  // `toLowerCase()` on the whole string is not safe here.
  const toAsciiLower = (s: string): string =>
    s.replace(/[A-Z]/g, (c) => c.toLowerCase());

  const foldedPrefix = toAsciiLower(prefix);
  const open = '<' + foldedPrefix;
  const close = '</' + foldedPrefix;
  const folded = toAsciiLower(html);

  let result = '';
  let i = 0;
  while (i < html.length) {
    const start = folded.indexOf(open, i);
    if (start === -1) {
      result += html.slice(i);
      break;
    }

    // '>' is unaffected by case folding, so it can be searched in either copy.
    const openTagEnd = html.indexOf('>', start);
    const closeStart = openTagEnd === -1 ? -1 : folded.indexOf(close, openTagEnd + 1);
    const closeTagEnd = closeStart === -1 ? -1 : html.indexOf('>', closeStart);
    if (closeTagEnd === -1) {
      // No complete `<prefix...>...</prefix...>` remains anywhere ahead, so no
      // later opener can be paired either. Keep the rest verbatim and stop;
      // this is what keeps the scan linear (no rescanning for every opener).
      result += html.slice(i);
      break;
    }

    // Drop the whole block, keeping the text before this opener.
    result += html.slice(i, start);
    i = closeTagEnd + 1;
  }
  return result;
}

/**
* Clean Microsoft Word HTML
*/
export function cleanWordHtml(html: string): string {
let cleaned = html;

// Remove Word-specific comments
cleaned = cleaned.replace(/<!--\[if[\s\S]*?<!\[endif\]-->/gi, '');
cleaned = cleaned.replace(/<!--[\s\S]*?-->/g, '');
// Remove Word-specific (and all other) HTML comments
cleaned = stripHtmlComments(cleaned);

// Remove XML declarations
cleaned = cleaned.replace(/<\?xml[^>]*>/gi, '');

// Remove o: (Office) namespace tags
cleaned = cleaned.replace(/<o:[^>]*>[\s\S]*?<\/o:[^>]*>/gi, '');
// Remove o: (Office) namespace tags (linear scan; see stripPairedNamespaceTags)
cleaned = stripPairedNamespaceTags(cleaned, 'o:');
cleaned = cleaned.replace(/<o:[^>]*\/>/gi, '');

// Remove w: (Word) namespace tags
cleaned = cleaned.replace(/<w:[^>]*>[\s\S]*?<\/w:[^>]*>/gi, '');
cleaned = stripPairedNamespaceTags(cleaned, 'w:');
cleaned = cleaned.replace(/<w:[^>]*\/>/gi, '');

// Remove mso styles but keep other styles
Expand Down Expand Up @@ -463,8 +546,14 @@ export function htmlToRuns(html: string, plainTextFallback: string): Run[] {
return plainTextFallback ? [createTextRun(plainTextFallback)] : [];
}

const container = document.createElement('div');
container.innerHTML = html;
// Sanitize the attacker-controlled clipboard HTML at this trust boundary
// (scripts, event handlers, javascript: URLs, dangerous tags all stripped),
// then parse the cleaned markup into an inert document. We only walk the
// resulting node tree for text and formatting — nothing is ever inserted
// into the live DOM.
const sanitized = getDomPurify().sanitize(html);
const parsed = new DOMParser().parseFromString(sanitized, 'text/html');
const container = parsed.body;

const runs: Run[] = [];
processNode(container, runs, {});
Expand Down
Loading