Skip to content

Commit 201aa4c

Browse files
cpsievertclaude
andcommitted
fix: include all mapped columns in stat transform GROUP BY (#239)
Integer columns mapped to non-positional aesthetics like fill/color were dropped by stat transforms because only String/Boolean/Categorical columns were added to partition_by. Now all non-positional, non-stat-consumed mapped columns are included in GROUP BY, matching ggplot2's grouping behavior. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 3bbed9d commit 201aa4c

3 files changed

Lines changed: 72 additions & 74 deletions

File tree

src/execute/layer.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -466,7 +466,7 @@ where
466466
);
467467

468468
// Build group_by columns from partition_by
469-
// Note: Facet aesthetics are already in partition_by via add_discrete_columns_to_partition_by,
469+
// Note: Facet aesthetics are already in partition_by via add_mapped_columns_to_partition_by,
470470
// so we don't add facet.get_variables() here (which would add original column names
471471
// instead of aesthetic column names, breaking pre-stat transforms like domain censoring).
472472
let mut group_by: Vec<String> = Vec::new();

src/execute/mod.rs

Lines changed: 70 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,10 @@ pub use schema::TypeInfo;
2424

2525
use crate::naming;
2626
use crate::parser;
27-
use crate::plot::aesthetic::{is_positional_aesthetic, AestheticContext};
27+
use crate::plot::aesthetic::is_positional_aesthetic;
2828
use crate::plot::facet::{resolve_properties as resolve_facet_properties, FacetDataContext};
2929
use crate::plot::layer::is_transposed;
30-
use crate::plot::{AestheticValue, Layer, Scale, ScaleTypeKind, Schema};
30+
use crate::plot::{AestheticValue, Layer, ScaleTypeKind, Schema};
3131
use crate::{DataFrame, DataSource, GgsqlError, Plot, Result};
3232
use std::collections::{HashMap, HashSet};
3333

@@ -656,41 +656,21 @@ fn resolve_facet(
656656
}
657657

658658
// =============================================================================
659-
// Discrete Column Handling
659+
// Mapped Column Grouping
660660
// =============================================================================
661661

662-
/// Add discrete mapped columns to partition_by for all layers
662+
/// Add all non-positional mapped columns to partition_by for all layers.
663663
///
664-
/// For each layer, examines all aesthetic mappings and adds any that map to
665-
/// discrete columns to the layer's partition_by. This ensures proper grouping
666-
/// for all layers, not just stat geoms.
667-
///
668-
/// Discreteness is determined by:
669-
/// 1. If the aesthetic has an explicit scale with a scale_type:
670-
/// - ScaleTypeKind::Discrete or Binned → discrete (add to partition_by)
671-
/// - ScaleTypeKind::Continuous → not discrete (skip)
672-
/// - ScaleTypeKind::Identity → fall back to schema
673-
/// 2. Otherwise, use schema's is_discrete flag (based on column data type)
664+
/// This ensures mapped columns survive stat transforms (GROUP BY), matching
665+
/// ggplot2's behavior where all mapped aesthetics contribute to grouping
666+
/// regardless of data type. Without this, integer columns mapped to
667+
/// fill/color/etc. would be dropped by stat transforms like bar's COUNT.
674668
///
675669
/// Columns already in partition_by (from explicit PARTITION BY clause) are skipped.
676670
/// Stat-consumed aesthetics (x for bar, x for histogram) are also skipped.
677-
fn add_discrete_columns_to_partition_by(
678-
layers: &mut [Layer],
679-
layer_schemas: &[Schema],
680-
scales: &[Scale],
681-
aesthetic_ctx: &AestheticContext,
682-
) {
683-
// Build a map of aesthetic -> scale for quick lookup
684-
let scale_map: HashMap<&str, &Scale> =
685-
scales.iter().map(|s| (s.aesthetic.as_str(), s)).collect();
686-
671+
fn add_mapped_columns_to_partition_by(layers: &mut [Layer], layer_schemas: &[Schema]) {
687672
for (layer, schema) in layers.iter_mut().zip(layer_schemas.iter()) {
688673
let schema_columns: HashSet<&str> = schema.iter().map(|c| c.name.as_str()).collect();
689-
let discrete_columns: HashSet<&str> = schema
690-
.iter()
691-
.filter(|c| c.is_discrete)
692-
.map(|c| c.name.as_str())
693-
.collect();
694674

695675
// Build set of excluded aesthetics that should not trigger auto-grouping:
696676
// - Stat-consumed aesthetics (transformed, not grouped)
@@ -718,40 +698,11 @@ fn add_discrete_columns_to_partition_by(
718698
continue;
719699
}
720700

721-
// Determine if this aesthetic is discrete:
722-
// 1. Check if there's an explicit scale with a scale_type
723-
// 2. Fall back to schema's is_discrete
724-
//
725-
// Discrete and Binned scales produce categorical groupings.
726-
// Continuous scales don't group. Identity defers to column type.
727-
let primary_aes = aesthetic_ctx
728-
.primary_internal_positional(aesthetic)
729-
.unwrap_or(aesthetic);
730-
let is_discrete = if let Some(scale) = scale_map.get(primary_aes) {
731-
if let Some(ref scale_type) = scale.scale_type {
732-
match scale_type.scale_type_kind() {
733-
ScaleTypeKind::Discrete
734-
| ScaleTypeKind::Binned
735-
| ScaleTypeKind::Ordinal => true,
736-
ScaleTypeKind::Continuous => false,
737-
ScaleTypeKind::Identity => discrete_columns.contains(col),
738-
}
739-
} else {
740-
// Scale exists but no explicit type - use schema
741-
discrete_columns.contains(col)
742-
}
743-
} else {
744-
// No scale for this aesthetic - use schema
745-
discrete_columns.contains(col)
746-
};
747-
748-
// Skip if not discrete
749-
if !is_discrete {
750-
continue;
751-
}
752-
753-
// Use the prefixed aesthetic column name, since the query renames
754-
// columns to prefixed names (e.g., island → __ggsql_aes_fill__)
701+
// Add all non-positional, non-consumed mapped columns to partition_by.
702+
// This ensures they survive stat transforms (GROUP BY), matching ggplot2
703+
// where all mapped aesthetics contribute to grouping regardless of
704+
// data type. Without this, integer columns mapped to fill/color/etc.
705+
// would be dropped by stat transforms like bar's COUNT.
755706
let aes_col_name = naming::aesthetic_column(aesthetic);
756707

757708
// Skip if already in partition_by
@@ -1083,15 +1034,9 @@ pub fn prepare_data_with_reader<R: Reader>(query: &str, reader: &R) -> Result<Pr
10831034
// This must happen before apply_layer_transforms so pre_stat_transform_sql has resolved breaks
10841035
scale::apply_pre_stat_resolve(&mut specs[0], &layer_schemas)?;
10851036

1086-
// Add discrete mapped columns to partition_by for all layers
1087-
let scales = specs[0].scales.clone();
1088-
let aesthetic_ctx = specs[0].get_aesthetic_context();
1089-
add_discrete_columns_to_partition_by(
1090-
&mut specs[0].layers,
1091-
&layer_schemas,
1092-
&scales,
1093-
&aesthetic_ctx,
1094-
);
1037+
// Add all non-positional mapped columns to partition_by so they survive
1038+
// stat transforms (GROUP BY), matching ggplot2's grouping behavior.
1039+
add_mapped_columns_to_partition_by(&mut specs[0].layers, &layer_schemas);
10951040

10961041
// Clone scales for apply_layer_transforms
10971042
let scales = specs[0].scales.clone();
@@ -1519,6 +1464,59 @@ mod tests {
15191464
assert!(layer_df.height() < 100);
15201465
}
15211466

1467+
#[cfg(feature = "duckdb")]
1468+
#[test]
1469+
fn test_bar_with_integer_fill_column() {
1470+
// Integer columns mapped to non-positional aesthetics like fill should
1471+
// survive stat transforms (be included in GROUP BY), matching ggplot2
1472+
// behavior where all mapped aesthetics contribute to grouping.
1473+
let reader = DuckDBReader::from_connection_string("duckdb://memory").unwrap();
1474+
1475+
reader
1476+
.connection()
1477+
.execute(
1478+
"CREATE TABLE int_fill_test AS SELECT * FROM (VALUES
1479+
('Male', 0), ('Male', 1), ('Female', 0), ('Female', 1),
1480+
('Male', 0), ('Male', 0), ('Female', 1), ('Female', 1)
1481+
) AS t(sex, survived)",
1482+
duckdb::params![],
1483+
)
1484+
.unwrap();
1485+
1486+
let query = r#"
1487+
SELECT * FROM int_fill_test
1488+
VISUALISE
1489+
DRAW bar MAPPING sex AS x, survived AS fill
1490+
"#;
1491+
1492+
let result = prepare_data_with_reader(query, &reader);
1493+
assert!(
1494+
result.is_ok(),
1495+
"Integer column mapped to fill should not cause error: {:?}",
1496+
result.err()
1497+
);
1498+
1499+
let result = result.unwrap();
1500+
let layer_df = result.data.get(&naming::layer_key(0)).unwrap();
1501+
1502+
// Should have 4 rows: Male/0, Male/1, Female/0, Female/1
1503+
assert_eq!(layer_df.height(), 4);
1504+
1505+
// fill column should be present
1506+
let fill_col = naming::aesthetic_column("fill");
1507+
let col_names: Vec<String> = layer_df
1508+
.get_column_names_str()
1509+
.iter()
1510+
.map(|s| s.to_string())
1511+
.collect();
1512+
assert!(
1513+
col_names.contains(&fill_col),
1514+
"Expected fill column '{}' in {:?}",
1515+
fill_col,
1516+
col_names
1517+
);
1518+
}
1519+
15221520
#[cfg(feature = "duckdb")]
15231521
#[test]
15241522
fn test_bar_count_stat_transform() {

src/execute/position.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ mod tests {
106106
);
107107
m
108108
};
109-
// Add fill to partition_by (simulates what add_discrete_columns_to_partition_by does)
109+
// Add fill to partition_by (simulates what add_mapped_columns_to_partition_by does)
110110
layer.partition_by = vec!["__ggsql_aes_fill__".to_string()];
111111
layer
112112
}

0 commit comments

Comments
 (0)