From 5ffbda72ce46bd07d0ae50191d0d9afd8efa41bc Mon Sep 17 00:00:00 2001 From: nitrajen <58795594+nitrajen@users.noreply.github.com> Date: Thu, 9 Apr 2026 11:41:28 -0500 Subject: [PATCH] GH-49058: [Python] Disallow non-UTF-8 bytes in custom metadata Schema.fbs defines metadata keys and values as FlatBuffers strings, which are required to be valid UTF-8. PyArrow was silently accepting arbitrary byte sequences, producing schemas that violate the spec and break cross-language interoperability (e.g. Rust enforces UTF-8 via String). Add a UTF-8 check in KeyValueMetadata.__init__ before handing bytes to the C++ layer. The check runs only when the input is bytes, so the existing TypeError behaviour for invalid types (e.g. integers) is unchanged. --- python/pyarrow/tests/test_schema.py | 26 +++++++++++++++++--------- python/pyarrow/types.pxi | 19 +++++++++++++++++-- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index 029e14ca1628..53f422d05e9d 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -785,12 +785,20 @@ def test_schema_merge(): pa.unify_schemas([a, 1]) -def test_undecodable_metadata(): - # ARROW-10214: undecodable metadata shouldn't fail repr() - data1 = b'abcdef\xff\x00' - data2 = b'ghijkl\xff\x00' - schema = pa.schema( - [pa.field('ints', pa.int16(), metadata={'key': data1})], - metadata={'key': data2}) - assert 'abcdef' in str(schema) - assert 'ghijkl' in str(schema) +def test_non_utf8_metadata_rejected(): + # GH-49058: non-UTF-8 bytes in metadata keys/values must be rejected + # because Schema.fbs requires metadata strings to be valid UTF-8. 
+ invalid = b'\xff\xfe\xfa' + + with pytest.raises(ValueError, match="Metadata values must be valid UTF-8"): + pa.schema([pa.field('ints', pa.int16())], metadata={'key': invalid}) + + with pytest.raises(ValueError, match="Metadata keys must be valid UTF-8"): + pa.schema([pa.field('ints', pa.int16())], metadata={invalid: b'value'}) + + with pytest.raises(ValueError, match="Metadata values must be valid UTF-8"): + pa.field('ints', pa.int16(), metadata={'key': invalid}) + + # valid UTF-8 (including plain ASCII) must continue to work + pa.schema([pa.field('ints', pa.int16())], metadata={b'key': b'value'}) + pa.schema([pa.field('ints', pa.int16())], metadata={'key': 'value \u00e9'}) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index e9eef8965153..b2f68fdd4249 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -2314,8 +2314,23 @@ cdef class KeyValueMetadata(_Metadata, Mapping): keys.reserve(len(items)) for key, value in items: - keys.push_back(tobytes(key)) - values.push_back(tobytes(value)) + v = tobytes(value) + if isinstance(key, bytes): + try: + key.decode('utf-8') + except UnicodeDecodeError: + raise ValueError( + f"Metadata keys must be valid UTF-8, got {key!r}" + ) + if isinstance(v, bytes): + try: + v.decode('utf-8') + except UnicodeDecodeError: + raise ValueError( + f"Metadata values must be valid UTF-8, got {value!r}" + ) + keys.push_back(key) + values.push_back(v) result.reset(new CKeyValueMetadata(move(keys), move(values))) self.init(result)