objectstack-ai · hotlong · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026
diff --git a/.changeset/fix-email-htmltotext-sanitization.md b/.changeset/fix-email-htmltotext-sanitization.md
@@ -0,0 +1,19 @@
+---
+'@objectstack/plugin-email': patch
+---
+
+Harden `htmlToText` against double-escaping and incomplete tag stripping
+
+Fixes two CodeQL high-severity alerts in `template-engine.ts`:
+
+- `js/double-escaping`: the order-dependent chain of single-entity
+  `.replace()` calls could double-unescape (e.g. `&amp;lt;` → `&lt;` → `<`).
+  Entities are now decoded in a single left-to-right pass via one alternation
+  regex, so each entity decodes exactly once.
+- `js/incomplete-multi-character-sanitization`: the single `<[^>]+>` strip
+  could leave a live tag behind on crafted/overlapping input
+  (e.g. `<scr<script>ipt>`). Tag stripping now loops until the string is
+  stable, and runs before entity decoding so decoding cannot re-introduce a
+  tag.
+
+Adds adversarial unit tests covering nested entities and overlapping tags.
diff --git a/packages/plugins/plugin-email/src/template-engine.test.ts b/packages/plugins/plugin-email/src/template-engine.test.ts
@@ -72,5 +72,38 @@ describe('template-engine', () => {
     it('collapses 3+ newlines to 2', () => {
       expect(htmlToText('<p>a</p><p>b</p>')).toBe('a\nb');
     });
+
+    describe('adversarial sanitization', () => {
+      it('does not double-unescape entities', () => {
+        // &amp;lt; must decode ONCE to the literal text "&lt;", never to "<".
+        const out = htmlToText('&amp;lt;script&amp;gt;');
+        expect(out).toBe('&lt;script&gt;');
+        expect(out).not.toContain('<');
+        expect(out).not.toContain('>');
+      });
+
+      it('decodes single-escaped entities exactly once', () => {
+        // Sanity counterpart: single-escaped sequences still decode normally.
+        expect(htmlToText('a &amp;&amp; b')).toBe('a && b');
+      });
+
+      it('strips overlapping/nested tags so no tag survives', () => {
+        const out = htmlToText('<scr<script>ipt>alert(1)</script>');
+        expect(out).not.toContain('<');
+        expect(out.toLowerCase()).not.toContain('<script');
+      });
+
+      it('strips tags that re-form after a single pass', () => {
+        const out = htmlToText('<<script>script>alert(1)<</p>/p>');
+        expect(out).not.toContain('<');
+        expect(out.toLowerCase()).not.toContain('<script');
+      });
+
+      it('handles deeply nested entities without producing a live tag', () => {
+        const out = htmlToText('&amp;amp;lt;img src=x onerror=alert(1)&amp;amp;gt;');
+        expect(out).not.toContain('<');
+        expect(out).not.toContain('>');
+      });
+    });
   });
 });
diff --git a/packages/plugins/plugin-email/src/template-engine.ts b/packages/plugins/plugin-email/src/template-engine.ts
@@ -69,24 +69,66 @@ export function requireVars(
   }
 }
 
+/**
+ * Decode the small set of HTML entities `escapeHtml` can emit, in a
+ * SINGLE left-to-right pass. Doing this with one alternation regex —
+ * rather than a chain of `.replace('&amp;','&').replace('&lt;','<')…`
+ * — is deliberate: a sequential chain is order-dependent and can
+ * double-unescape (e.g. `&amp;lt;` → `&lt;` → `<`). Because each match
+ * is consumed and the scan resumes *after* it, `&amp;lt;` decodes to
+ * the literal text `&lt;` and stops, never to `<`.
+ */
+const ENTITIES: Record<string, string> = {
+  '&nbsp;': ' ',
+  '&amp;': '&',
+  '&lt;': '<',
+  '&gt;': '>',
+  '&quot;': '"',
+  '&#39;': "'",
+};
+const ENTITY_RE = /&(?:nbsp|amp|lt|gt|quot|#39);/g;
+
+function decodeEntities(s: string): string {
+  return s.replace(ENTITY_RE, (m) => ENTITIES[m] ?? m);
+}
+
+const TAG_RE = /<[^>]*>/g;
+
+/**
+ * Remove HTML tags robustly. A single `.replace(/<[^>]*>/g, '')` pass
+ * is not enough: stripping a tag can splice the surrounding text into a
+ * fresh tag (e.g. `<scr<script>ipt>` → `<script>`), so we loop until the
+ * string stops changing. This closes the "incomplete multi-character
+ * sanitization" gap where crafted/overlapping input leaves a `<…>` tag
+ * behind.
+ */
+function stripTags(s: string): string {
+  let prev: string;
+  let out = s;
+  do {
+    prev = out;
+    out = out.replace(TAG_RE, '');
+  } while (out !== prev);
+  return out;
+}
+
 /**
  * Strip HTML tags + collapse whitespace to derive a plain-text body
  * from an HTML template. Conservative: keeps line breaks at block
  * boundaries (<br>, </p>, </div>) so the resulting text is at least
  * paragraph-shaped.
+ *
+ * Order matters for safety: tags are stripped (looping until stable)
+ * *before* entities are decoded, and entities are decoded in a single
+ * pass — so neither tag removal nor entity decoding can re-introduce a
+ * live tag or double-unescape an entity.
  */
 export function htmlToText(html: string): string {
   if (!html) return '';
-  return html
+  const withBreaks = html
     .replace(/<br\s*\/?>/gi, '\n')
-    .replace(/<\/(p|div|h[1-6]|li|tr)>/gi, '\n')
-    .replace(/<[^>]+>/g, '')
-    .replace(/&nbsp;/g, ' ')
-    .replace(/&amp;/g, '&')
-    .replace(/&lt;/g, '<')
-    .replace(/&gt;/g, '>')
-    .replace(/&quot;/g, '"')
-    .replace(/&#39;/g, "'")
+    .replace(/<\/(p|div|h[1-6]|li|tr)>/gi, '\n');
+  return decodeEntities(stripTags(withBreaks))
     .replace(/[ \t]+/g, ' ')
     .replace(/\n[ \t]+/g, '\n')
     .replace(/\n{3,}/g, '\n\n')