diff --git a/docs/project/changelog.rst b/docs/project/changelog.rst index 3d637a9f..a691cd4f 100644 --- a/docs/project/changelog.rst +++ b/docs/project/changelog.rst @@ -40,6 +40,12 @@ Improvements * Added wheels for ARMv7, PowerPC, RISC-V, and S/390. +Bug fixes +......... + +* Prevented an exception when logging a text frame that splits a multi-byte + UTF-8 sequence across fragments. + .. _16.0: 16.0 diff --git a/src/websockets/frames.py b/src/websockets/frames.py index 7716e7a2..a2fa0e29 100644 --- a/src/websockets/frames.py +++ b/src/websockets/frames.py @@ -147,6 +147,15 @@ class Frame: # Configure if you want to see more in logs. Should be a multiple of 3. MAX_LOG_SIZE = int(os.environ.get("WEBSOCKETS_MAX_LOG_SIZE", "75")) + def _format_binary(self) -> str: + # We'll show at most the first 16 bytes and the last 8 bytes. + # Encode just what we need, plus two dummy bytes to elide later. + binary = self.data + if len(binary) > self.MAX_LOG_SIZE // 3: + cut = (self.MAX_LOG_SIZE // 3 - 1) // 3 # by default cut = 8 + binary = b"".join([binary[: 2 * cut], b"\x00\x00", binary[-cut:]]) + return " ".join(f"{byte:02x}" for byte in binary) + def __str__(self) -> str: """ Return a human-readable representation of a frame. @@ -159,15 +168,14 @@ def __str__(self) -> str: if self.opcode is OP_TEXT: # Decoding only the beginning and the end is needlessly hard. # Decode the entire payload then elide later if necessary. - data = repr(bytes(self.data).decode()) + # Fragmentation may split a multi-byte UTF-8 sequence; fall back to + # a binary representation when the payload doesn't decode cleanly. + try: + data = repr(bytes(self.data).decode()) + except UnicodeDecodeError: + data = self._format_binary() elif self.opcode is OP_BINARY: - # We'll show at most the first 16 bytes and the last 8 bytes. - # Encode just what we need, plus two dummy bytes to elide later. - binary = self.data - if len(binary) > self.MAX_LOG_SIZE // 3: - cut = (self.MAX_LOG_SIZE // 3 - 1) // 3 # by default cut = 8 - binary = b"".join([binary[: 2 * cut], b"\x00\x00", binary[-cut:]]) - data = " ".join(f"{byte:02x}" for byte in binary) + data = self._format_binary() elif self.opcode is OP_CLOSE: data = str(Close.parse(self.data)) elif self.data: @@ -180,11 +188,7 @@ def __str__(self) -> str: data = repr(bytes(self.data).decode()) coding = "text" except (UnicodeDecodeError, AttributeError): - binary = self.data - if len(binary) > self.MAX_LOG_SIZE // 3: - cut = (self.MAX_LOG_SIZE // 3 - 1) // 3 # by default cut = 8 - binary = b"".join([binary[: 2 * cut], b"\x00\x00", binary[-cut:]]) - data = " ".join(f"{byte:02x}" for byte in binary) + data = self._format_binary() coding = "binary" else: data = "''" diff --git a/tests/test_frames.py b/tests/test_frames.py index 1c372b5d..267b3088 100644 --- a/tests/test_frames.py +++ b/tests/test_frames.py @@ -275,6 +275,19 @@ def test_text_with_newline(self): "TEXT 'Hello\\nworld!' [12 bytes]", ) + def test_text_fragment_with_partial_utf8(self): + self.assertEqual( + str(Frame(OP_TEXT, b" cr\xc3", fin=False)), + "TEXT 20 63 72 c3 [4 bytes, continued]", + ) + + def test_text_fragment_with_partial_utf8_truncated(self): + self.assertEqual( + str(Frame(OP_TEXT, "café ".encode() * 16 + b"\xc3", fin=False)), + "TEXT 63 61 66 c3 a9 20 63 61 66 c3 a9 20 63 61 66 c3 ..." + " 20 63 61 66 c3 a9 20 c3 [97 bytes, continued]", + ) + def test_binary(self): self.assertEqual( str(Frame(OP_BINARY, b"\x00\x01\x02\x03")),