Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions .changeset/fix-email-htmltotext-sanitization.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
---
'@objectstack/plugin-email': patch
---

Harden `htmlToText` against double-escaping and incomplete tag stripping

Fixes two CodeQL high-severity alerts in `template-engine.ts`:

- `js/double-escaping`: the order-dependent chain of single-entity
`.replace()` calls could double-unescape (e.g. `&amp;lt;` → `&lt;` → `<`).
Entities are now decoded in a single left-to-right pass via one alternation
regex, so each entity decodes exactly once.
- `js/incomplete-multi-character-sanitization`: the single `<[^>]+>` strip
could leave a live tag behind on crafted/overlapping input
(e.g. `<scr<script>ipt>`). Tag stripping now loops until the string is
stable, and runs before entity decoding so decoding cannot re-introduce a
tag.

Adds adversarial unit tests covering nested entities and overlapping tags.
33 changes: 33 additions & 0 deletions packages/plugins/plugin-email/src/template-engine.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -72,5 +72,38 @@ describe('template-engine', () => {
it('collapses 3+ newlines to 2', () => {
expect(htmlToText('<p>a</p><p>b</p>')).toBe('a\nb');
});

describe('adversarial sanitization', () => {
it('does not double-unescape entities', () => {
// &amp;lt; must decode ONCE to the literal text "&lt;", never to "<".
const out = htmlToText('&amp;lt;script&amp;gt;');
expect(out).toBe('&lt;script&gt;');
expect(out).not.toContain('<');
expect(out).not.toContain('>');
});

it('decodes single-escaped entities exactly once', () => {
// Sanity counterpart: single-escaped sequences still decode normally.
expect(htmlToText('a &amp;&amp; b')).toBe('a && b');
});

it('strips overlapping/nested tags so no tag survives', () => {
const out = htmlToText('<scr<script>ipt>alert(1)</script>');
expect(out).not.toContain('<');
expect(out.toLowerCase()).not.toContain('<script');
});

it('strips tags that re-form after a single pass', () => {
const out = htmlToText('<<script>script>alert(1)<</p>/p>');
expect(out).not.toContain('<');
expect(out.toLowerCase()).not.toContain('<script');
});

it('handles deeply nested entities without producing a live tag', () => {
const out = htmlToText('&amp;amp;lt;img src=x onerror=alert(1)&amp;amp;gt;');
expect(out).not.toContain('<');
expect(out).not.toContain('>');
});
});
});
});
60 changes: 51 additions & 9 deletions packages/plugins/plugin-email/src/template-engine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -69,24 +69,66 @@ export function requireVars(
}
}

/**
* Decode the small set of HTML entities `escapeHtml` can emit, in a
* SINGLE left-to-right pass. Doing this with one alternation regex —
* rather than a chain of `.replace('&amp;','&').replace('&lt;','<')…`
* — is deliberate: a sequential chain is order-dependent and can
* double-unescape (e.g. `&amp;lt;` → `&lt;` → `<`). Because each match
* is consumed and the scan resumes *after* it, `&amp;lt;` decodes to
* the literal text `&lt;` and stops, never to `<`.
*/
const ENTITIES: Record<string, string> = {
'&nbsp;': ' ',
'&amp;': '&',
'&lt;': '<',
'&gt;': '>',
'&quot;': '"',
'&#39;': "'",
};
const ENTITY_RE = /&(?:nbsp|amp|lt|gt|quot|#39);/g;

function decodeEntities(s: string): string {
return s.replace(ENTITY_RE, (m) => ENTITIES[m] ?? m);
}

const TAG_RE = /<[^>]*>/g;

/**
* Remove HTML tags robustly. A single `.replace(/<[^>]*>/g, '')` pass
* is not enough: stripping a tag can splice the surrounding text into a
* fresh tag (e.g. `<scr<script>ipt>` → `<script>`), so we loop until the
* string stops changing. This closes the "incomplete multi-character
* sanitization" gap where crafted/overlapping input leaves a `<…>` tag
* behind.
*/
function stripTags(s: string): string {
let prev: string;
let out = s;
do {
prev = out;
out = out.replace(TAG_RE, '');
} while (out !== prev);
return out;
}

/**
* Strip HTML tags + collapse whitespace to derive a plain-text body
* from an HTML template. Conservative: keeps line breaks at block
* boundaries (<br>, </p>, </div>) so the resulting text is at least
* paragraph-shaped.
*
* Order matters for safety: tags are stripped (looping until stable)
* *before* entities are decoded, and entities are decoded in a single
* pass — so neither tag removal nor entity decoding can re-introduce a
* live tag or double-unescape an entity.
*/
export function htmlToText(html: string): string {
if (!html) return '';
return html
const withBreaks = html
.replace(/<br\s*\/?>/gi, '\n')
.replace(/<\/(p|div|h[1-6]|li|tr)>/gi, '\n')
.replace(/<[^>]+>/g, '')
.replace(/&nbsp;/g, ' ')
.replace(/&amp;/g, '&')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'")
.replace(/<\/(p|div|h[1-6]|li|tr)>/gi, '\n');
return decodeEntities(stripTags(withBreaks))
.replace(/[ \t]+/g, ' ')
.replace(/\n[ \t]+/g, '\n')
.replace(/\n{3,}/g, '\n\n')
Expand Down