From 201aa4cde9fe7ea93ac707d347d23f17c59db3dc Mon Sep 17 00:00:00 2001 From: Carson Date: Wed, 18 Mar 2026 20:06:16 -0500 Subject: [PATCH] fix: include all mapped columns in stat transform GROUP BY (#239) Integer columns mapped to non-positional aesthetics like fill/color were dropped by stat transforms because only String/Boolean/Categorical columns were added to partition_by. Now all non-positional, non-stat-consumed mapped columns are included in GROUP BY, matching ggplot2's grouping behavior. Co-Authored-By: Claude Opus 4.6 --- src/execute/layer.rs | 2 +- src/execute/mod.rs | 142 ++++++++++++++++++++-------------------- src/execute/position.rs | 2 +- 3 files changed, 72 insertions(+), 74 deletions(-) diff --git a/src/execute/layer.rs b/src/execute/layer.rs index 1c67dce3..a7d59919 100644 --- a/src/execute/layer.rs +++ b/src/execute/layer.rs @@ -466,7 +466,7 @@ where ); // Build group_by columns from partition_by - // Note: Facet aesthetics are already in partition_by via add_discrete_columns_to_partition_by, + // Note: Facet aesthetics are already in partition_by via add_mapped_columns_to_partition_by, // so we don't add facet.get_variables() here (which would add original column names // instead of aesthetic column names, breaking pre-stat transforms like domain censoring). let mut group_by: Vec = Vec::new(); diff --git a/src/execute/mod.rs b/src/execute/mod.rs index c8c7475f..01ff0530 100644 --- a/src/execute/mod.rs +++ b/src/execute/mod.rs @@ -24,10 +24,10 @@ pub use schema::TypeInfo; use crate::naming; use crate::parser; -use crate::plot::aesthetic::{is_positional_aesthetic, AestheticContext}; +use crate::plot::aesthetic::is_positional_aesthetic; use crate::plot::facet::{resolve_properties as resolve_facet_properties, FacetDataContext}; use crate::plot::layer::is_transposed; -use crate::plot::{AestheticValue, Layer, Scale, ScaleTypeKind, Schema}; +use crate::plot::{AestheticValue, Layer, ScaleTypeKind, Schema}; use crate::{DataFrame, DataSource, GgsqlError, Plot, Result}; use std::collections::{HashMap, HashSet}; @@ -656,41 +656,21 @@ fn resolve_facet( } // ============================================================================= -// Discrete Column Handling +// Mapped Column Grouping // ============================================================================= -/// Add discrete mapped columns to partition_by for all layers +/// Add all non-positional mapped columns to partition_by for all layers. /// -/// For each layer, examines all aesthetic mappings and adds any that map to -/// discrete columns to the layer's partition_by. This ensures proper grouping -/// for all layers, not just stat geoms. -/// -/// Discreteness is determined by: -/// 1. If the aesthetic has an explicit scale with a scale_type: -/// - ScaleTypeKind::Discrete or Binned → discrete (add to partition_by) -/// - ScaleTypeKind::Continuous → not discrete (skip) -/// - ScaleTypeKind::Identity → fall back to schema -/// 2. Otherwise, use schema's is_discrete flag (based on column data type) +/// This ensures mapped columns survive stat transforms (GROUP BY), matching +/// ggplot2's behavior where all mapped aesthetics contribute to grouping +/// regardless of data type. Without this, integer columns mapped to +/// fill/color/etc. would be dropped by stat transforms like bar's COUNT. /// /// Columns already in partition_by (from explicit PARTITION BY clause) are skipped. /// Stat-consumed aesthetics (x for bar, x for histogram) are also skipped. -fn add_discrete_columns_to_partition_by( - layers: &mut [Layer], - layer_schemas: &[Schema], - scales: &[Scale], - aesthetic_ctx: &AestheticContext, -) { - // Build a map of aesthetic -> scale for quick lookup - let scale_map: HashMap<&str, &Scale> = - scales.iter().map(|s| (s.aesthetic.as_str(), s)).collect(); - +fn add_mapped_columns_to_partition_by(layers: &mut [Layer], layer_schemas: &[Schema]) { for (layer, schema) in layers.iter_mut().zip(layer_schemas.iter()) { let schema_columns: HashSet<&str> = schema.iter().map(|c| c.name.as_str()).collect(); - let discrete_columns: HashSet<&str> = schema - .iter() - .filter(|c| c.is_discrete) - .map(|c| c.name.as_str()) - .collect(); // Build set of excluded aesthetics that should not trigger auto-grouping: // - Stat-consumed aesthetics (transformed, not grouped) @@ -718,40 +698,11 @@ fn add_discrete_columns_to_partition_by( continue; } - // Determine if this aesthetic is discrete: - // 1. Check if there's an explicit scale with a scale_type - // 2. Fall back to schema's is_discrete - // - // Discrete and Binned scales produce categorical groupings. - // Continuous scales don't group. Identity defers to column type. - let primary_aes = aesthetic_ctx - .primary_internal_positional(aesthetic) - .unwrap_or(aesthetic); - let is_discrete = if let Some(scale) = scale_map.get(primary_aes) { - if let Some(ref scale_type) = scale.scale_type { - match scale_type.scale_type_kind() { - ScaleTypeKind::Discrete - | ScaleTypeKind::Binned - | ScaleTypeKind::Ordinal => true, - ScaleTypeKind::Continuous => false, - ScaleTypeKind::Identity => discrete_columns.contains(col), - } - } else { - // Scale exists but no explicit type - use schema - discrete_columns.contains(col) - } - } else { - // No scale for this aesthetic - use schema - discrete_columns.contains(col) - }; - - // Skip if not discrete - if !is_discrete { - continue; - } - - // Use the prefixed aesthetic column name, since the query renames - // columns to prefixed names (e.g., island → __ggsql_aes_fill__) + // Add all non-positional, non-consumed mapped columns to partition_by. + // This ensures they survive stat transforms (GROUP BY), matching ggplot2 + // where all mapped aesthetics contribute to grouping regardless of + // data type. Without this, integer columns mapped to fill/color/etc. + // would be dropped by stat transforms like bar's COUNT. let aes_col_name = naming::aesthetic_column(aesthetic); // Skip if already in partition_by @@ -1083,15 +1034,9 @@ pub fn prepare_data_with_reader(query: &str, reader: &R) -> Result = layer_df + .get_column_names_str() + .iter() + .map(|s| s.to_string()) + .collect(); + assert!( + col_names.contains(&fill_col), + "Expected fill column '{}' in {:?}", + fill_col, + col_names + ); + } + #[cfg(feature = "duckdb")] #[test] fn test_bar_count_stat_transform() { diff --git a/src/execute/position.rs b/src/execute/position.rs index 7202d8da..6227a1d5 100644 --- a/src/execute/position.rs +++ b/src/execute/position.rs @@ -106,7 +106,7 @@ mod tests { ); m }; - // Add fill to partition_by (simulates what add_discrete_columns_to_partition_by does) + // Add fill to partition_by (simulates what add_mapped_columns_to_partition_by does) layer.partition_by = vec!["__ggsql_aes_fill__".to_string()]; layer }