python · gaborbernat · Jun 3, 2026 · Jun 3, 2026 · Jun 9, 2026 · Jun 10, 2026
diff --git a/Lib/test/test_json/test_unicode.py b/Lib/test/test_json/test_unicode.py
@@ -133,6 +133,42 @@ def test_object_pairs_hook_with_unicode(self):
                                     object_hook = lambda x: None),
                          OrderedDict(p))
 
+    def test_ensure_ascii_false_long_string_paths(self):
+        # Cover the SWAR scan in _json escape_size(): it inspects eight bytes
+        # per iteration, so put a special character at every offset of runs
+        # that span both the short-string guard and several 8-byte windows.
+        dumps, loads = self.dumps, self.loads
+
+        def is_optimized(s):
+            # Nothing to escape: the output is the string verbatim in quotes.
+            self.assertEqual(dumps(s, ensure_ascii=False), f'"{s}"')
+
+        # Bytes that are kept as-is, including Latin-1 and 0x7f, stay verbatim.
+        for s in ("abc", "\xe9", "kept latin1 \xe9\xff \x7f text"):
+            is_optimized(s)
+            is_optimized(s * 8)  # at least eight characters long
+
+        def need_escape(s, expected):
+            encoded = dumps(s, ensure_ascii=False)
+            self.assertEqual(encoded, expected)
+            self.assertEqual(loads(encoded), s)  # and it round-trips
+
+        tail = "tail"
+        for n in range(40):  # special character at offsets 0..39
+            run = "a" * n
+            for char, escaped in (('"', '\\"'), ("\\", "\\\\"), ("\n", "\\n"),
+                                  ("\x00", "\\u0000"), ("\x1f", "\\u001f")):
+                need_escape(run + char + tail, f'"{run}{escaped}{tail}"')
+            for char in ("\x7f", "\xe9", "中", "\U0001f600"):  # kept verbatim
+                s = run + char + tail
+                need_escape(s, f'"{s}"')
+
+        # Structural escapes and control characters are still escaped when they
+        # come last, right after a long no-escape run.
+        base = "a" * 20
+        for char, escaped in (('"', '\\"'), ("\\", "\\\\"), ("\x01", "\\u0001")):
+            need_escape(base + char, f'"{base}{escaped}"')
+
 
 class TestPyUnicode(TestUnicode, PyTest): pass
 class TestCUnicode(TestUnicode, CTest): pass
diff --git a/Misc/NEWS.d/next/Library/2026-06-03-11-49-35.gh-issue-150878.ZCL1T0.rst b/Misc/NEWS.d/next/Library/2026-06-03-11-49-35.gh-issue-150878.ZCL1T0.rst
@@ -0,0 +1,5 @@
+Speed up :func:`json.dumps` with ``ensure_ascii=False`` for strings made up of
+long runs of characters that need no escaping, by scanning eight bytes at a
+time (roughly 1.5x faster for long ASCII or Latin-1 strings). Short strings,
+strings that need escaping, and strings with characters above U+00FF are
+unaffected. Patch by Bernát Gábor.
diff --git a/Modules/_json.c b/Modules/_json.c
@@ -281,6 +281,57 @@ escape_size(const void *input, int kind, Py_ssize_t input_chars)
     Py_ssize_t i;
     Py_ssize_t output_size;
 
+    /* SWAR no-escape fast path (1-byte): in this 1-byte (Latin-1) mode a code
+       point needs escaping only when c == '"', c == '\\', or c < 0x20; any other
+       byte, including non-ASCII (>= 0x80), is copied verbatim.  Scan eight bytes
+       per iteration and drop to the per-character loop at the first byte that
+       needs escaping.  The loop reads one 8-byte word at a time, so strings
+       shorter than a word stay on that per-character loop, where the setup
+       below would not pay off. */
+    if (kind == PyUnicode_1BYTE_KIND && input_chars >= 8
+            /* the output is input_chars + 2 (the surrounding quotes); keep that
+               addition below from overflowing Py_ssize_t */
+            && input_chars < PY_SSIZE_T_MAX - 2) {
+        const Py_UCS1 *p = (const Py_UCS1 *)input;
+        const uint64_t ones = 0x0101010101010101ULL;  /* 1 in every byte lane */
+        const uint64_t high = 0x8080808080808080ULL;  /* high bit of every lane */
+        const uint64_t bq = 0x22ULL * ones;   /* '"' broadcast to all 8 lanes */
+        const uint64_t bs = 0x5cULL * ones;   /* '\\' broadcast to all 8 lanes */
+        const uint64_t bc = 0xE0ULL * ones;   /* 0xE0 per lane; w & bc is zero in
+                                                 a lane exactly when its byte is
+                                                 < 0x20 (top three bits clear) */
+        Py_ssize_t j = 0;
+        int needs_escape = 0;
+        for (; j + 8 <= input_chars; j += 8) {
+            uint64_t w;
+            memcpy(&w, p + j, 8);
+            /* (v - ones) & ~v & high is nonzero exactly when some lane of v is
+               zero (a borrow may also light a 0x01 lane above a zero lane). */
+            uint64_t mq = w ^ bq;
+            mq = (mq - ones) & ~mq & high;            /* lanes equal to '"'  */
+            uint64_t ms = w ^ bs;
+            ms = (ms - ones) & ~ms & high;            /* lanes equal to '\\' */
+            uint64_t vc = w & bc;
+            uint64_t mlo = (vc - ones) & ~vc & high;  /* lanes < 0x20 */
+            if (mq | ms | mlo) {
+                needs_escape = 1;
+                break;
+            }
+        }
+        if (!needs_escape) {
+            for (; j < input_chars; j++) {
+                Py_UCS1 c = p[j];
+                if (c == '"' || c == '\\' || c < 0x20) {
+                    needs_escape = 1;
+                    break;
+                }
+            }
+        }
+        if (!needs_escape) {
+            return input_chars + 2;
+        }
+    }
+
     /* Compute the output size */
     for (i = 0, output_size = 2; i < input_chars; i++) {
         Py_UCS4 c = PyUnicode_READ(kind, input, i);