diff --git a/Cargo.toml b/Cargo.toml index 90436b4f1a9..12433bb83cd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -97,7 +97,7 @@ arrow-cast = "58" arrow-data = "58" arrow-ipc = "58" arrow-ord = "58" -arrow-schema = "58" +arrow-schema = { version = "58", features = ["canonical_extension_types"] } arrow-select = "58" arrow-string = "58" async-fs = "2.2.0" diff --git a/vortex-array/public-api.lock b/vortex-array/public-api.lock index c193f10da92..b865156802d 100644 --- a/vortex-array/public-api.lock +++ b/vortex-array/public-api.lock @@ -14858,6 +14858,14 @@ pub mod vortex_array::extension::uuid pub struct vortex_array::extension::uuid::Uuid +impl vortex_array::extension::uuid::Uuid + +pub fn vortex_array::extension::uuid::Uuid::default(nullability: vortex_array::dtype::Nullability) -> vortex_array::dtype::extension::ExtDType + +pub fn vortex_array::extension::uuid::Uuid::new(metadata: vortex_array::extension::uuid::UuidMetadata, nullability: vortex_array::dtype::Nullability) -> vortex_array::dtype::extension::ExtDType + +pub fn vortex_array::extension::uuid::Uuid::storage_dtype(nullability: vortex_array::dtype::Nullability) -> vortex_array::dtype::DType + impl core::clone::Clone for vortex_array::extension::uuid::Uuid pub fn vortex_array::extension::uuid::Uuid::clone(&self) -> vortex_array::extension::uuid::Uuid diff --git a/vortex-array/src/arrow/executor/fixed_size_binary.rs b/vortex-array/src/arrow/executor/fixed_size_binary.rs new file mode 100644 index 00000000000..680735ce4ac --- /dev/null +++ b/vortex-array/src/arrow/executor/fixed_size_binary.rs @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::sync::Arc; + +use arrow_array::ArrayRef as ArrowArrayRef; +use arrow_array::FixedSizeBinaryArray; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; + +use crate::ArrayRef; +use crate::ExecutionCtx; +use crate::arrays::ExtensionArray; +use crate::arrays::FixedSizeListArray; +use crate::arrays::PrimitiveArray; +use crate::arrow::executor::validity::to_arrow_null_buffer; +use crate::dtype::DType; +use crate::dtype::PType; +use crate::vtable::ValidityHelper; + +/// Convert a Vortex array to an Arrow `FixedSizeBinaryArray`. +/// +/// Accepts either an extension array (e.g. UUID) or a plain `FixedSizeList(Primitive(U8), size)`. +pub(super) fn to_arrow_fixed_size_binary( + array: ArrayRef, + size: i32, + ctx: &mut ExecutionCtx, +) -> VortexResult { + let storage = if array.dtype().is_extension() { + array + .execute::(ctx)? + .storage_array() + .clone() + } else { + array + }; + + let fsl = storage.execute::(ctx)?; + + match fsl.dtype() { + DType::FixedSizeList(elem, list_size, _) + if *list_size == size as u32 + && matches!(elem.as_ref(), DType::Primitive(PType::U8, _)) => {} + other => { + vortex_bail!("FixedSizeBinary({size}) requires FixedSizeList(U8, {size}), got {other}"); + } + } + + let elements = fsl.elements().clone().execute::(ctx)?; + let values = elements.into_buffer::().into_arrow_buffer(); + let null_buffer = to_arrow_null_buffer(fsl.validity(), fsl.len(), ctx)?; + + Ok(Arc::new(FixedSizeBinaryArray::new( + size, + values, + null_buffer, + ))) +} + +#[cfg(test)] +mod tests { + use arrow_array::FixedSizeBinaryArray; + use arrow_schema::DataType; + use vortex_buffer::BitBuffer; + use vortex_buffer::Buffer; + + use crate::IntoArray; + use crate::LEGACY_SESSION; + use crate::VortexSessionExecute; + use crate::arrays::ExtensionArray; + use crate::arrays::FixedSizeListArray; + use crate::arrays::PrimitiveArray; + use crate::arrow::ArrowArrayExecutor; + use crate::dtype::Nullability; + use crate::extension::uuid::Uuid; + use crate::extension::uuid::vtable::UUID_BYTE_LEN; + use crate::validity::Validity; + + #[expect( + clippy::cast_possible_truncation, + reason = "UUID_BYTE_LEN always fits u32/i32" + )] + #[test] + fn test_uuid_to_fixed_size_binary() { + let u1 = uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000").unwrap(); + let u2 = uuid::Uuid::parse_str("f47ac10b-58cc-4372-a567-0e02b2c3d479").unwrap(); + + let flat: Vec = [u1.as_bytes(), &[0u8; 16], u2.as_bytes()] + .into_iter() + .flatten() + .copied() + .collect(); + let elements = PrimitiveArray::new(Buffer::from(flat), Validity::NonNullable).into_array(); + let validity = Validity::from(BitBuffer::from_iter([true, false, true])); + let fsl = FixedSizeListArray::try_new(elements, UUID_BYTE_LEN as u32, validity, 3) + .unwrap() + .into_array(); + let uuid_array = ExtensionArray::new(Uuid::default(Nullability::Nullable).erased(), fsl); + + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let arrow = uuid_array + .into_array() + .execute_arrow( + Some(&DataType::FixedSizeBinary(UUID_BYTE_LEN as i32)), + &mut ctx, + ) + .unwrap(); + + let expected = FixedSizeBinaryArray::try_from_sparse_iter_with_size( + [Some(u1.as_bytes().as_slice()), None, Some(u2.as_bytes())].into_iter(), + UUID_BYTE_LEN as i32, + ) + .unwrap(); + assert_eq!(arrow.as_ref(), &expected as &dyn arrow_array::Array); + } +} diff --git a/vortex-array/src/arrow/executor/mod.rs b/vortex-array/src/arrow/executor/mod.rs index edaa244d967..62d1b59ff4b 100644 --- a/vortex-array/src/arrow/executor/mod.rs +++ b/vortex-array/src/arrow/executor/mod.rs @@ -6,6 +6,7 @@ mod byte; pub mod byte_view; mod decimal; mod dictionary; +mod fixed_size_binary; mod fixed_size_list; mod list; mod list_view; @@ -38,6 +39,7 @@ use crate::arrow::executor::byte::to_arrow_byte_array; use crate::arrow::executor::byte_view::to_arrow_byte_view; use crate::arrow::executor::decimal::to_arrow_decimal; use crate::arrow::executor::dictionary::to_arrow_dictionary; +use crate::arrow::executor::fixed_size_binary::to_arrow_fixed_size_binary; use crate::arrow::executor::fixed_size_list::to_arrow_fixed_list; use crate::arrow::executor::list::to_arrow_list; use crate::arrow::executor::list_view::to_arrow_list_view; @@ -156,8 +158,8 @@ impl ArrowArrayExecutor for ArrayRef { DataType::RunEndEncoded(ends_type, values_type) => { to_arrow_run_end(self, ends_type.data_type(), values_type, ctx) } - DataType::FixedSizeBinary(_) - | DataType::Map(..) + DataType::FixedSizeBinary(size) => to_arrow_fixed_size_binary(self, *size, ctx), + DataType::Map(..) | DataType::Duration(_) | DataType::Interval(_) | DataType::Union(..) => { diff --git a/vortex-array/src/dtype/arrow.rs b/vortex-array/src/dtype/arrow.rs index 17af749cfc0..a3faa194e7e 100644 --- a/vortex-array/src/dtype/arrow.rs +++ b/vortex-array/src/dtype/arrow.rs @@ -23,6 +23,7 @@ use arrow_schema::Schema; use arrow_schema::SchemaBuilder; use arrow_schema::SchemaRef; use arrow_schema::TimeUnit as ArrowTimeUnit; +use arrow_schema::extension::ExtensionType as _; use vortex_error::VortexError; use vortex_error::VortexExpect; use vortex_error::VortexResult; @@ -42,6 +43,8 @@ use crate::extension::datetime::TemporalMetadata; use crate::extension::datetime::Time; use crate::extension::datetime::TimeUnit; use crate::extension::datetime::Timestamp; +use crate::extension::uuid::Uuid; +use crate::extension::uuid::vtable::UUID_BYTE_LEN; /// Trait for converting Arrow types to Vortex types. pub trait FromArrowType: Sized { @@ -210,15 +213,22 @@ impl FromArrowType<(&DataType, Nullability)> for DType { impl FromArrowType<&Field> for DType { fn from_arrow(field: &Field) -> Self { + let nullability = Nullability::from(field.is_nullable()); + if field .metadata() .get("ARROW:extension:name") .map(|s| s.as_str()) == Some("arrow.parquet.variant") { - return DType::Variant(field.is_nullable().into()); + return DType::Variant(nullability); + } + + if field.extension_type_name() == Some(arrow_schema::extension::Uuid::NAME) { + return DType::Extension(Uuid::default(nullability).erased()); } - Self::from_arrow((field.data_type(), field.is_nullable().into())) + + Self::from_arrow((field.data_type(), nullability)) } } @@ -245,11 +255,17 @@ impl DType { .into(), ) } else { - Field::new( + let mut field = Field::new( field_name.as_ref(), field_dtype.to_arrow_dtype()?, field_dtype.is_nullable(), - ) + ); + if let DType::Extension(ext) = field_dtype + && ext.is::() + { + field = field.with_extension_type(arrow_schema::extension::Uuid); + } + field }; builder.push(field); } @@ -349,6 +365,14 @@ impl DType { }); }; + if ext_dtype.is::() { + #[expect( + clippy::cast_possible_truncation, + reason = "UUID_BYTE_LEN always fits i32" + )] + return Ok(DataType::FixedSizeBinary(UUID_BYTE_LEN as i32)); + } + vortex_bail!("Unsupported extension type \"{}\"", ext_dtype.id()) } }) @@ -561,4 +585,25 @@ mod test { assert_eq!(original_dtype, roundtripped_dtype); } + + #[test] + fn test_uuid_schema_roundtrip() { + let original = DType::struct_( + [( + "id", + DType::Extension(Uuid::default(Nullability::Nullable).erased()), + )], + Nullability::NonNullable, + ); + let schema = original.to_arrow_schema().unwrap(); + + let field = schema.field(0); + assert_eq!(field.data_type(), &DataType::FixedSizeBinary(16)); + assert_eq!( + field.extension_type_name(), + Some(arrow_schema::extension::Uuid::NAME) + ); + + assert_eq!(DType::from_arrow(&schema), original); + } } diff --git a/vortex-array/src/extension/uuid/mod.rs b/vortex-array/src/extension/uuid/mod.rs index e4347c2513c..74beeeffdeb 100644 --- a/vortex-array/src/extension/uuid/mod.rs +++ b/vortex-array/src/extension/uuid/mod.rs @@ -15,6 +15,42 @@ pub use metadata::UuidMetadata; pub(crate) mod vtable; +use std::sync::Arc; + +use vortex_error::VortexExpect; + +use crate::dtype::DType; +use crate::dtype::Nullability; +use crate::dtype::PType; +use crate::dtype::extension::ExtDType; + /// The VTable for the UUID extension type. #[derive(Clone, Debug, Default, PartialEq, Eq, Hash)] pub struct Uuid; + +#[expect( + clippy::cast_possible_truncation, + reason = "UUID_BYTE_LEN always fits u32" +)] +#[allow(clippy::same_name_method)] +impl Uuid { + /// Returns the canonical UUID storage dtype: `FixedSizeList(Primitive(U8, NonNullable), 16)`. + pub fn storage_dtype(nullability: Nullability) -> DType { + DType::FixedSizeList( + Arc::new(DType::Primitive(PType::U8, Nullability::NonNullable)), + vtable::UUID_BYTE_LEN as u32, + nullability, + ) + } + + /// Creates a new UUID extension dtype with the given metadata and nullability. + pub fn new(metadata: UuidMetadata, nullability: Nullability) -> ExtDType { + ExtDType::try_new(metadata, Self::storage_dtype(nullability)) + .vortex_expect("valid UUID storage dtype") + } + + /// Creates a new UUID extension dtype with default metadata. + pub fn default(nullability: Nullability) -> ExtDType { + Self::new(UuidMetadata::default(), nullability) + } +} diff --git a/vortex-array/src/extension/uuid/vtable.rs b/vortex-array/src/extension/uuid/vtable.rs index c76c7352942..ab5dd3efbb0 100644 --- a/vortex-array/src/extension/uuid/vtable.rs +++ b/vortex-array/src/extension/uuid/vtable.rs @@ -182,9 +182,11 @@ mod tests { #[case::non_nullable(Nullability::NonNullable)] #[case::nullable(Nullability::Nullable)] fn validate_correct_storage_dtype(#[case] nullability: Nullability) -> VortexResult<()> { - let metadata = UuidMetadata::default(); - let storage_dtype = uuid_storage_dtype(nullability); - ExtDType::try_with_vtable(Uuid, metadata, storage_dtype)?; + ExtDType::try_with_vtable( + Uuid, + UuidMetadata::default(), + Uuid::storage_dtype(nullability), + )?; Ok(()) } @@ -229,10 +231,7 @@ mod tests { let expected = uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000") .map_err(|e| vortex_error::vortex_err!("{e}"))?; - let ext_dtype = ExtDType::try_new( - UuidMetadata::default(), - uuid_storage_dtype(Nullability::NonNullable), - )?; + let ext_dtype = Uuid::default(Nullability::NonNullable); let children: Vec = expected .as_bytes() .iter() @@ -261,13 +260,12 @@ mod tests { assert_eq!(v4_uuid.get_version(), Some(Version::Random)); // Metadata says v7, but the UUID is v4. - let ext_dtype = ExtDType::try_with_vtable( - Uuid, + let ext_dtype = Uuid::new( UuidMetadata { version: Some(Version::SortRand), }, - uuid_storage_dtype(Nullability::NonNullable), - )?; + Nullability::NonNullable, + ); let children: Vec = v4_uuid .as_bytes() .iter() @@ -307,13 +305,12 @@ mod tests { let v4_uuid = uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000") .map_err(|e| vortex_error::vortex_err!("{e}"))?; - let ext_dtype = ExtDType::try_new( + let ext_dtype = Uuid::new( UuidMetadata { version: Some(Version::Random), }, - uuid_storage_dtype(Nullability::NonNullable), - ) - .unwrap(); + Nullability::NonNullable, + ); let storage_value = uuid_storage_scalar(&v4_uuid); let result = Uuid::unpack_native(&ext_dtype, &storage_value)?; @@ -327,23 +324,11 @@ mod tests { let v4_uuid = uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000") .map_err(|e| vortex_error::vortex_err!("{e}"))?; - let ext_dtype = ExtDType::try_new( - UuidMetadata::default(), - uuid_storage_dtype(Nullability::NonNullable), - ) - .unwrap(); + let ext_dtype = Uuid::default(Nullability::NonNullable); let storage_value = uuid_storage_scalar(&v4_uuid); let result = Uuid::unpack_native(&ext_dtype, &storage_value)?; assert_eq!(result, v4_uuid); Ok(()) } - - fn uuid_storage_dtype(nullability: Nullability) -> DType { - DType::FixedSizeList( - Arc::new(DType::Primitive(PType::U8, Nullability::NonNullable)), - UUID_BYTE_LEN as u32, - nullability, - ) - } }