diff --git a/vortex-btrblocks/public-api.lock b/vortex-btrblocks/public-api.lock index 21b2c55cb04..7742516a2c0 100644 --- a/vortex-btrblocks/public-api.lock +++ b/vortex-btrblocks/public-api.lock @@ -2,6 +2,8 @@ pub mod vortex_btrblocks pub use vortex_btrblocks::ArrayAndStats +pub use vortex_btrblocks::BoolStats + pub use vortex_btrblocks::CascadingCompressor pub use vortex_btrblocks::CompressorContext @@ -28,6 +30,12 @@ pub use vortex_btrblocks::integer_dictionary_encode pub mod vortex_btrblocks::schemes +pub mod vortex_btrblocks::schemes::bool + +pub use vortex_btrblocks::schemes::bool::BoolConstantScheme + +pub use vortex_btrblocks::schemes::bool::BoolStats + pub mod vortex_btrblocks::schemes::decimal pub struct vortex_btrblocks::schemes::decimal::DecimalScheme diff --git a/vortex-btrblocks/src/builder.rs b/vortex-btrblocks/src/builder.rs index e96dd6a7af3..6127f1e3910 100644 --- a/vortex-btrblocks/src/builder.rs +++ b/vortex-btrblocks/src/builder.rs @@ -10,6 +10,7 @@ use crate::CascadingCompressor; use crate::Scheme; use crate::SchemeExt; use crate::SchemeId; +use crate::schemes::bool; use crate::schemes::decimal; use crate::schemes::float; use crate::schemes::integer; @@ -22,6 +23,10 @@ use crate::schemes::temporal; /// This list is order-sensitive: the builder preserves this order when constructing /// the final scheme list, so that tie-breaking is deterministic. pub const ALL_SCHEMES: &[&dyn Scheme] = &[ + //////////////////////////////////////////////////////////////////////////////////////////////// + // Bool schemes. + //////////////////////////////////////////////////////////////////////////////////////////////// + &bool::BoolConstantScheme, //////////////////////////////////////////////////////////////////////////////////////////////// // Integer schemes. //////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/vortex-btrblocks/src/canonical_compressor.rs b/vortex-btrblocks/src/canonical_compressor.rs index 4ba118defc9..70a005cdbd4 100644 --- a/vortex-btrblocks/src/canonical_compressor.rs +++ b/vortex-btrblocks/src/canonical_compressor.rs @@ -62,11 +62,14 @@ mod tests { use rstest::rstest; use vortex_array::DynArray; use vortex_array::IntoArray; + use vortex_array::arrays::BoolArray; + use vortex_array::arrays::Constant; use vortex_array::arrays::List; use vortex_array::arrays::ListView; use vortex_array::arrays::ListViewArray; use vortex_array::assert_arrays_eq; use vortex_array::validity::Validity; + use vortex_buffer::BitBuffer; use vortex_buffer::buffer; use vortex_error::VortexResult; @@ -107,4 +110,61 @@ mod tests { assert_arrays_eq!(result, input); Ok(()) } + + #[test] + fn test_constant_all_true() -> VortexResult<()> { + let array = BoolArray::new(BitBuffer::from(vec![true; 100]), Validity::NonNullable); + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress(&array.clone().into_array())?; + assert!(compressed.is::()); + assert_arrays_eq!(compressed, array); + Ok(()) + } + + #[test] + fn test_constant_all_false() -> VortexResult<()> { + let array = BoolArray::new(BitBuffer::from(vec![false; 100]), Validity::NonNullable); + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress(&array.clone().into_array())?; + assert!(compressed.is::()); + assert_arrays_eq!(compressed, array); + Ok(()) + } + + #[test] + fn test_nullable_all_valid_compressed() -> VortexResult<()> { + let array = BoolArray::new( + BitBuffer::from(vec![true; 100]), + Validity::from(BitBuffer::from(vec![true; 100])), + ); + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress(&array.clone().into_array())?; + assert!(compressed.is::()); + assert_arrays_eq!(compressed, array); + Ok(()) + } + + #[test] + fn test_nullable_with_nulls_not_compressed() -> VortexResult<()> { + let validity = Validity::from(BitBuffer::from_iter((0..100).map(|i| i % 3 != 0))); + let array = BoolArray::new(BitBuffer::from(vec![true; 100]), validity); + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress(&array.clone().into_array())?; + assert!(!compressed.is::()); + assert_arrays_eq!(compressed, array); + Ok(()) + } + + #[test] + fn test_mixed_not_constant() -> VortexResult<()> { + let array = BoolArray::new( + BitBuffer::from(vec![true, false, true, false, true]), + Validity::NonNullable, + ); + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress(&array.clone().into_array())?; + assert!(!compressed.is::()); + assert_arrays_eq!(compressed, array); + Ok(()) + } } diff --git a/vortex-btrblocks/src/lib.rs b/vortex-btrblocks/src/lib.rs index 43b48f2668d..1ae23251a1c 100644 --- a/vortex-btrblocks/src/lib.rs +++ b/vortex-btrblocks/src/lib.rs @@ -76,6 +76,7 @@ pub use vortex_compressor::scheme::SchemeExt; pub use vortex_compressor::scheme::SchemeId; pub use vortex_compressor::scheme::estimate_compression_ratio_with_sampling; pub use vortex_compressor::stats::ArrayAndStats; +pub use vortex_compressor::stats::BoolStats; pub use vortex_compressor::stats::FloatStats; pub use vortex_compressor::stats::GenerateStatsOptions; pub use vortex_compressor::stats::IntegerStats; diff --git a/vortex-btrblocks/src/schemes/bool.rs b/vortex-btrblocks/src/schemes/bool.rs new file mode 100644 index 00000000000..c27251a8599 --- /dev/null +++ b/vortex-btrblocks/src/schemes/bool.rs @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Bool compression schemes. + +pub use vortex_compressor::builtins::BoolConstantScheme; +pub use vortex_compressor::stats::BoolStats; diff --git a/vortex-btrblocks/src/schemes/mod.rs b/vortex-btrblocks/src/schemes/mod.rs index 13f1bfecd25..10d99fea475 100644 --- a/vortex-btrblocks/src/schemes/mod.rs +++ b/vortex-btrblocks/src/schemes/mod.rs @@ -3,6 +3,7 @@ //! Compression scheme implementations. +pub mod bool; pub mod float; pub mod integer; pub mod string; diff --git a/vortex-compressor/public-api.lock b/vortex-compressor/public-api.lock index 3fbc28076eb..a2e1dd47677 100644 --- a/vortex-compressor/public-api.lock +++ b/vortex-compressor/public-api.lock @@ -2,6 +2,46 @@ pub mod vortex_compressor pub mod vortex_compressor::builtins +pub struct vortex_compressor::builtins::BoolConstantScheme + +impl core::clone::Clone for vortex_compressor::builtins::BoolConstantScheme + +pub fn vortex_compressor::builtins::BoolConstantScheme::clone(&self) -> vortex_compressor::builtins::BoolConstantScheme + +impl core::cmp::Eq for vortex_compressor::builtins::BoolConstantScheme + +impl core::cmp::PartialEq for vortex_compressor::builtins::BoolConstantScheme + +pub fn vortex_compressor::builtins::BoolConstantScheme::eq(&self, other: &vortex_compressor::builtins::BoolConstantScheme) -> bool + +impl core::fmt::Debug for vortex_compressor::builtins::BoolConstantScheme + +pub fn vortex_compressor::builtins::BoolConstantScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::builtins::BoolConstantScheme + +impl core::marker::StructuralPartialEq for vortex_compressor::builtins::BoolConstantScheme + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::BoolConstantScheme + +pub fn vortex_compressor::builtins::BoolConstantScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::BoolConstantScheme::compress(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::BoolConstantScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::BoolConstantScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::BoolConstantScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::BoolConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::BoolConstantScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::BoolConstantScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::BoolConstantScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + pub struct vortex_compressor::builtins::FloatConstantScheme impl core::clone::Clone for vortex_compressor::builtins::FloatConstantScheme @@ -246,6 +286,8 @@ pub fn vortex_compressor::builtins::float_dictionary_encode(stats: &vortex_compr pub fn vortex_compressor::builtins::integer_dictionary_encode(stats: &vortex_compressor::stats::IntegerStats) -> vortex_array::arrays::dict::array::DictArray +pub fn vortex_compressor::builtins::is_bool(canonical: &vortex_array::canonical::Canonical) -> bool + pub fn vortex_compressor::builtins::is_float_primitive(canonical: &vortex_array::canonical::Canonical) -> bool pub fn vortex_compressor::builtins::is_integer_primitive(canonical: &vortex_array::canonical::Canonical) -> bool @@ -386,6 +428,26 @@ pub fn vortex_compressor::scheme::Scheme::scheme_name(&self) -> &'static str pub fn vortex_compressor::scheme::Scheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::BoolConstantScheme + +pub fn vortex_compressor::builtins::BoolConstantScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::BoolConstantScheme::compress(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::BoolConstantScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::BoolConstantScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::BoolConstantScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::BoolConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::BoolConstantScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::BoolConstantScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::BoolConstantScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::FloatConstantScheme pub fn vortex_compressor::builtins::FloatConstantScheme::ancestor_exclusions(&self) -> alloc::vec::Vec @@ -624,6 +686,8 @@ impl vortex_compressor::stats::ArrayAndStats pub fn vortex_compressor::stats::ArrayAndStats::array(&self) -> &vortex_array::array::ArrayRef +pub fn vortex_compressor::stats::ArrayAndStats::bool_stats(&mut self) -> &vortex_compressor::stats::BoolStats + pub fn vortex_compressor::stats::ArrayAndStats::float_stats(&mut self) -> &vortex_compressor::stats::FloatStats pub fn vortex_compressor::stats::ArrayAndStats::get_or_insert_with(&mut self, f: impl core::ops::function::FnOnce() -> T) -> &T @@ -636,6 +700,30 @@ pub fn vortex_compressor::stats::ArrayAndStats::new(array: vortex_array::array:: pub fn vortex_compressor::stats::ArrayAndStats::string_stats(&mut self) -> &vortex_compressor::stats::StringStats +pub struct vortex_compressor::stats::BoolStats + +impl vortex_compressor::stats::BoolStats + +pub fn vortex_compressor::stats::BoolStats::generate(input: &vortex_array::arrays::bool::array::BoolArray) -> vortex_error::VortexResult + +pub fn vortex_compressor::stats::BoolStats::is_constant(&self) -> bool + +pub fn vortex_compressor::stats::BoolStats::null_count(&self) -> u32 + +pub fn vortex_compressor::stats::BoolStats::source(&self) -> &vortex_array::arrays::bool::array::BoolArray + +pub fn vortex_compressor::stats::BoolStats::true_count(&self) -> u32 + +pub fn vortex_compressor::stats::BoolStats::value_count(&self) -> u32 + +impl core::clone::Clone for vortex_compressor::stats::BoolStats + +pub fn vortex_compressor::stats::BoolStats::clone(&self) -> vortex_compressor::stats::BoolStats + +impl core::fmt::Debug for vortex_compressor::stats::BoolStats + +pub fn vortex_compressor::stats::BoolStats::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + pub struct vortex_compressor::stats::FloatDistinctInfo impl vortex_compressor::stats::FloatDistinctInfo diff --git a/vortex-compressor/src/builtins/constant.rs b/vortex-compressor/src/builtins/constant.rs index 178f67e3e9d..ac38aee732c 100644 --- a/vortex-compressor/src/builtins/constant.rs +++ b/vortex-compressor/src/builtins/constant.rs @@ -14,6 +14,7 @@ use vortex_array::scalar::Scalar; use vortex_array::vtable::ValidityHelper; use vortex_error::VortexResult; +use super::is_bool; use super::is_float_primitive; use super::is_integer_primitive; use super::is_utf8_string; @@ -22,6 +23,58 @@ use crate::ctx::CompressorContext; use crate::scheme::Scheme; use crate::stats::ArrayAndStats; +/// Constant encoding for bool arrays where all valid values are the same. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct BoolConstantScheme; + +impl Scheme for BoolConstantScheme { + fn scheme_name(&self) -> &'static str { + "vortex.bool.constant" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_bool(canonical) + } + + fn detects_constant(&self) -> bool { + true + } + + fn expected_compression_ratio( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + if ctx.is_sample() { + return Ok(0.0); + } + + let stats = data.bool_stats(); + + // Only compress non-nullable or all-valid nullable arrays. + if stats.source().dtype().is_nullable() && stats.null_count() > 0 { + return Ok(0.0); + } + + if !stats.is_constant() { + return Ok(0.0); + } + + Ok(stats.value_count() as f64) + } + + fn compress( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + let stats = data.bool_stats(); + Ok(ConstantArray::new(stats.source().scalar_at(0)?, stats.source().len()).into_array()) + } +} + /// Constant encoding for integer arrays with a single distinct value. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct IntConstantScheme; diff --git a/vortex-compressor/src/builtins/mod.rs b/vortex-compressor/src/builtins/mod.rs index 704453fb40b..59609a6afa3 100644 --- a/vortex-compressor/src/builtins/mod.rs +++ b/vortex-compressor/src/builtins/mod.rs @@ -10,6 +10,7 @@ //! [`DictArray`]: vortex_array::arrays::DictArray //! [`MaskedArray`]: vortex_array::arrays::MaskedArray +pub use constant::BoolConstantScheme; pub use constant::FloatConstantScheme; pub use constant::IntConstantScheme; pub use constant::StringConstantScheme; @@ -26,6 +27,11 @@ use vortex_array::Canonical; use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; +/// Returns `true` if the canonical array is a bool type. +pub fn is_bool(canonical: &Canonical) -> bool { + matches!(canonical, Canonical::Bool(_)) +} + /// Returns `true` if the canonical array is a primitive with an integer ptype. pub fn is_integer_primitive(canonical: &Canonical) -> bool { matches!(canonical, Canonical::Primitive(p) if p.ptype().is_int()) diff --git a/vortex-compressor/src/compressor.rs b/vortex-compressor/src/compressor.rs index 37940130487..1937cc83273 100644 --- a/vortex-compressor/src/compressor.rs +++ b/vortex-compressor/src/compressor.rs @@ -172,7 +172,9 @@ impl CascadingCompressor { ) -> VortexResult { match array { Canonical::Null(null_array) => Ok(null_array.into_array()), - Canonical::Bool(bool_array) => Ok(bool_array.into_array()), + Canonical::Bool(bool_array) => { + self.choose_and_compress(Canonical::Bool(bool_array), ctx) + } Canonical::Primitive(primitive) => { self.choose_and_compress(Canonical::Primitive(primitive), ctx) } diff --git a/vortex-compressor/src/stats/bool.rs b/vortex-compressor/src/stats/bool.rs new file mode 100644 index 00000000000..0f85d8f52b2 --- /dev/null +++ b/vortex-compressor/src/stats/bool.rs @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Bool compression statistics. + +use vortex_array::arrays::BoolArray; +use vortex_error::VortexResult; +use vortex_mask::AllOr; + +/// Array of booleans and relevant stats for compression. +#[derive(Clone, Debug)] +pub struct BoolStats { + /// The underlying source array. + src: BoolArray, + /// Number of null values. + null_count: u32, + /// Number of `true` values among valid (non-null) elements. + true_count: u32, + /// Number of non-null values. + value_count: u32, +} + +impl BoolStats { + /// Generates stats, returning an error on failure. + /// + /// # Errors + /// + /// Returns an error if getting validity mask fails or values exceed `u32` bounds. + pub fn generate(input: &BoolArray) -> VortexResult { + if input.is_empty() { + return Ok(Self { + src: input.clone(), + null_count: 0, + value_count: 0, + true_count: 0, + }); + } + + if input.all_invalid()? { + return Ok(Self { + src: input.clone(), + null_count: u32::try_from(input.len())?, + value_count: 0, + true_count: 0, + }); + } + + let validity = input.validity_mask()?; + let null_count = validity.false_count(); + let value_count = validity.true_count(); + + let bits = input.to_bit_buffer(); + + // Count how many true values exist among valid elements. + let true_count = match validity.bit_buffer() { + AllOr::All => bits.true_count(), + AllOr::None => unreachable!("all-invalid handled above"), + AllOr::Some(v) => { + // AND the bits with validity to only count valid trues. + (&bits & v).true_count() + } + }; + + Ok(Self { + src: input.clone(), + null_count: u32::try_from(null_count)?, + value_count: u32::try_from(value_count)?, + true_count: u32::try_from(true_count)?, + }) + } + + /// Returns the underlying source array. + pub fn source(&self) -> &BoolArray { + &self.src + } + + /// Returns the number of null values. + pub fn null_count(&self) -> u32 { + self.null_count + } + + /// Returns the number of non-null values. + pub fn value_count(&self) -> u32 { + self.value_count + } + + /// Returns the number of `true` values among valid elements. + pub fn true_count(&self) -> u32 { + self.true_count + } + + /// Returns `true` if all valid values are the same (all-true or all-false). + pub fn is_constant(&self) -> bool { + self.value_count > 0 && (self.true_count == 0 || self.true_count == self.value_count) + } +} + +#[cfg(test)] +mod tests { + use vortex_array::arrays::BoolArray; + use vortex_array::validity::Validity; + use vortex_buffer::BitBuffer; + use vortex_error::VortexResult; + + use super::BoolStats; + + #[test] + fn test_all_true() -> VortexResult<()> { + let array = BoolArray::new( + BitBuffer::from(vec![true, true, true]), + Validity::NonNullable, + ); + let stats = BoolStats::generate(&array)?; + assert_eq!(stats.value_count, 3); + assert_eq!(stats.null_count, 0); + assert_eq!(stats.true_count, 3); + assert!(stats.is_constant()); + Ok(()) + } + + #[test] + fn test_all_false() -> VortexResult<()> { + let array = BoolArray::new( + BitBuffer::from(vec![false, false, false]), + Validity::NonNullable, + ); + let stats = BoolStats::generate(&array)?; + assert_eq!(stats.value_count, 3); + assert_eq!(stats.null_count, 0); + assert_eq!(stats.true_count, 0); + assert!(stats.is_constant()); + Ok(()) + } + + #[test] + fn test_mixed() -> VortexResult<()> { + let array = BoolArray::new( + BitBuffer::from(vec![true, false, true]), + Validity::NonNullable, + ); + let stats = BoolStats::generate(&array)?; + assert_eq!(stats.value_count, 3); + assert_eq!(stats.null_count, 0); + assert_eq!(stats.true_count, 2); + assert!(!stats.is_constant()); + Ok(()) + } + + #[test] + fn test_with_nulls() -> VortexResult<()> { + let array = BoolArray::new( + BitBuffer::from(vec![true, false, true]), + Validity::from_iter([true, false, true]), + ); + let stats = BoolStats::generate(&array)?; + assert_eq!(stats.value_count, 2); + assert_eq!(stats.null_count, 1); + assert_eq!(stats.true_count, 2); + assert!(stats.is_constant()); + Ok(()) + } +} diff --git a/vortex-compressor/src/stats/cache.rs b/vortex-compressor/src/stats/cache.rs index bbb6522337f..c83bf044b03 100644 --- a/vortex-compressor/src/stats/cache.rs +++ b/vortex-compressor/src/stats/cache.rs @@ -10,6 +10,7 @@ use vortex_array::ArrayRef; use vortex_array::ToCanonical; use vortex_error::VortexExpect; +use super::BoolStats; use super::FloatStats; use super::GenerateStatsOptions; use super::IntegerStats; @@ -96,6 +97,15 @@ impl ArrayAndStats { self.array } + /// Returns bool stats, generating them lazily on first access. + pub fn bool_stats(&mut self) -> &BoolStats { + let array = self.array.clone(); + + self.cache.get_or_insert_with::(|| { + BoolStats::generate(&array.to_bool()).vortex_expect("BoolStats shouldn't fail") + }) + } + /// Returns integer stats, generating them lazily on first access. pub fn integer_stats(&mut self) -> &IntegerStats { let array = self.array.clone(); diff --git a/vortex-compressor/src/stats/mod.rs b/vortex-compressor/src/stats/mod.rs index e4417b66b3d..276fa8f056c 100644 --- a/vortex-compressor/src/stats/mod.rs +++ b/vortex-compressor/src/stats/mod.rs @@ -3,12 +3,14 @@ //! Compression statistics types and caching. +mod bool; mod cache; mod float; mod integer; mod options; mod string; +pub use bool::BoolStats; pub use cache::ArrayAndStats; pub use float::DistinctInfo as FloatDistinctInfo; pub use float::ErasedStats as FloatErasedStats;