diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py
index 029e14ca162..53f422d05e9 100644
--- a/python/pyarrow/tests/test_schema.py
+++ b/python/pyarrow/tests/test_schema.py
@@ -785,12 +785,20 @@ def test_schema_merge():
         pa.unify_schemas([a, 1])
 
 
-def test_undecodable_metadata():
-    # ARROW-10214: undecodable metadata shouldn't fail repr()
-    data1 = b'abcdef\xff\x00'
-    data2 = b'ghijkl\xff\x00'
-    schema = pa.schema(
-        [pa.field('ints', pa.int16(), metadata={'key': data1})],
-        metadata={'key': data2})
-    assert 'abcdef' in str(schema)
-    assert 'ghijkl' in str(schema)
+def test_non_utf8_metadata_rejected():
+    # GH-49058: non-UTF-8 bytes in metadata keys/values must be rejected
+    # because Schema.fbs requires metadata strings to be valid UTF-8.
+    invalid = b'\xff\xfe\xfa'
+
+    with pytest.raises(ValueError, match="Metadata values must be valid UTF-8"):
+        pa.schema([pa.field('ints', pa.int16())], metadata={'key': invalid})
+
+    with pytest.raises(ValueError, match="Metadata keys must be valid UTF-8"):
+        pa.schema([pa.field('ints', pa.int16())], metadata={invalid: b'value'})
+
+    with pytest.raises(ValueError, match="Metadata values must be valid UTF-8"):
+        pa.field('ints', pa.int16(), metadata={'key': invalid})
+
+    # valid UTF-8 (including plain ASCII) must continue to work
+    pa.schema([pa.field('ints', pa.int16())], metadata={b'key': b'value'})
+    pa.schema([pa.field('ints', pa.int16())], metadata={'key': 'value \u00e9'})
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index e9eef896515..b2f68fdd424 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -2314,8 +2314,30 @@ cdef class KeyValueMetadata(_Metadata, Mapping):
 
         keys.reserve(len(items))
         for key, value in items:
-            keys.push_back(tobytes(key))
-            values.push_back(tobytes(value))
+            # Schema.fbs requires metadata strings to be valid UTF-8, so
+            # normalize key and value to bytes and validate both the same
+            # way before building the CKeyValueMetadata.
+            k = tobytes(key)
+            v = tobytes(value)
+            # NOTE(review): the isinstance guards assume tobytes() may pass
+            # non-str objects through unchanged -- confirm against its
+            # definition.
+            if isinstance(k, bytes):
+                try:
+                    k.decode('utf-8')
+                except UnicodeDecodeError as exc:
+                    raise ValueError(
+                        f"Metadata keys must be valid UTF-8, got {key!r}"
+                    ) from exc
+            if isinstance(v, bytes):
+                try:
+                    v.decode('utf-8')
+                except UnicodeDecodeError as exc:
+                    raise ValueError(
+                        f"Metadata values must be valid UTF-8, got {value!r}"
+                    ) from exc
+            keys.push_back(k)
+            values.push_back(v)
         result.reset(new CKeyValueMetadata(move(keys), move(values)))
         self.init(result)
 