diff --git a/feature_engine/encoding/one_hot.py b/feature_engine/encoding/one_hot.py index e94432a3d..3e8b1a7d2 100644 --- a/feature_engine/encoding/one_hot.py +++ b/feature_engine/encoding/one_hot.py @@ -1,6 +1,7 @@ # Authors: Soledad Galli # License: BSD 3 clause +import warnings from typing import List, Optional, Union import numpy as np @@ -80,13 +81,31 @@ class OneHotEncoder(CategoricalMethodsMixin, CategoricalInitMixin): categories to encode. In this case, dummy variables will be created only for those popular categories and the rest will be ignored, i.e., they will show the value 0 in all the binary variables. Note that if `top_categories` is not None, - the parameter `drop_last` is ignored. + the parameters `drop_last` and `drop` are ignored. drop_last: boolean, default=False Only used if `top_categories = None`. It indicates whether to create dummy variables for all the categories (k dummies), or if set to `True`, it will ignore the last binary variable and return k-1 dummies. + .. deprecated:: + `drop_last` is deprecated. Use the `drop` parameter instead, which + provides more flexibility. If both `drop_last` and `drop` are set, + a warning will be raised and `drop` will take precedence. + + drop: str, default=None + Only used if `top_categories = None`. Determines which category to drop + to return k-1 binary variables. Options are: + + - None: No category is dropped (k dummies are returned), unless + `drop_last` is True. + - ``'last'``: Drops the last category alphabetically. + - ``'first'``: Drops the first category alphabetically. + - ``'most_frequent'``: Drops the most frequent category observed during + fit(). If multiple categories share the highest frequency, a UserWarning + is raised and the first category alphabetically among the tied ones is + dropped. + drop_last_binary: boolean, default=False Whether to return 1 or 2 dummy variables for binary categorical variables. When a categorical variable has only 2 categories, then the second dummy variable @@ -164,6 +183,7 @@ def __init__( drop_last_binary: bool = False, variables: Union[None, int, str, List[Union[str, int]]] = None, ignore_format: bool = False, + drop: Optional[str] = None, ) -> None: if top_categories and ( @@ -185,10 +205,26 @@ def __init__( f"Got {drop_last_binary} instead." ) + if drop is not None and drop not in ("last", "first", "most_frequent"): + raise ValueError( + "drop takes only values 'last', 'first', 'most_frequent' " + f"or None. Got {drop} instead." + ) + + if drop_last is True and drop is not None: + warnings.warn( + "Both 'drop_last' and 'drop' were set. 'drop_last' is deprecated " + "in favour of 'drop'. The 'drop' parameter will take precedence. " + "Please use only 'drop' going forward.", + FutureWarning, + stacklevel=2, + ) + super().__init__(variables, ignore_format) self.top_categories = top_categories self.drop_last = drop_last self.drop_last_binary = drop_last_binary + self.drop = drop def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ @@ -228,15 +264,48 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): ] else: - category_ls = list(X[var].unique()) - - # return k-1 dummies - if self.drop_last: - self.encoder_dict_[var] = category_ls[:-1] - - # return k dummies + # Determine the effective drop strategy. + # New 'drop' param takes precedence over old 'drop_last'. + drop_strategy = self.drop + if drop_strategy is None and self.drop_last: + drop_strategy = "last" + + if drop_strategy is not None: + # Sort alphabetically so "first"/"last" are deterministic + category_ls = sorted(X[var].unique()) + + # Figure out which single category to remove + if drop_strategy == "last": + cat_to_drop = category_ls[-1] + + elif drop_strategy == "first": + cat_to_drop = category_ls[0] + + elif drop_strategy == "most_frequent": + freq = X[var].value_counts() + max_freq = freq.max() + most_frequent_cats = sorted( + freq[freq == max_freq].index.tolist() + ) + if len(most_frequent_cats) > 1: + warnings.warn( + f"Variable '{var}': multiple categories share " + f"the highest frequency ({max_freq}): " + f"{most_frequent_cats}. Dropping the first " + f"alphabetically: '{most_frequent_cats[0]}'.", + UserWarning, + stacklevel=2, + ) + cat_to_drop = most_frequent_cats[0] + + # Remove that one category from the list + self.encoder_dict_[var] = [ + c for c in category_ls if c != cat_to_drop + ] + + # return k dummies (no category dropped) — preserve insertion order else: - self.encoder_dict_[var] = category_ls + self.encoder_dict_[var] = list(X[var].unique()) self.variables_binary_ = [var for var in variables_ if X[var].nunique() == 2] diff --git a/tests/test_encoding/test_onehot_encoder.py b/tests/test_encoding/test_onehot_encoder.py index aca3448be..48aa1755f 100644 --- a/tests/test_encoding/test_onehot_encoder.py +++ b/tests/test_encoding/test_onehot_encoder.py @@ -1,3 +1,5 @@ +import warnings + import pandas as pd import pytest from sklearn.pipeline import Pipeline @@ -458,13 +460,18 @@ def test_get_feature_names_out(df_enc_binary): tr = OneHotEncoder(drop_last=True) tr.fit(df_enc_binary) + # With drop_last=True, categories are sorted alphabetically and last is dropped. + # var_A: [A, B, C] -> drop C -> keep A, B + # var_B: [A, B, C] -> drop C -> keep A, B + # var_C: [AHA, UHU] -> drop UHU -> keep AHA + # var_D: [EHE, OHO] -> drop OHO -> keep EHE out = [ "var_A_A", "var_A_B", "var_B_A", "var_B_B", "var_C_AHA", - "var_D_OHO", + "var_D_EHE", ] feat_out = original_features + out @@ -534,3 +541,124 @@ def test_inverse_transform_raises_not_implemented_error(df_enc_binary): enc = OneHotEncoder().fit(df_enc_binary) with pytest.raises(NotImplementedError): enc.inverse_transform(df_enc_binary) + + +# ================================================================ +# Tests for the new `drop` parameter +# ================================================================ + + +def test_drop_last_drops_last_category_alphabetically(): + """drop='last' should sort categories alphabetically and drop the last one.""" + df = pd.DataFrame({"color": ["Red", "Blue", "Green", "Red", "Blue", "Green"]}) + encoder = OneHotEncoder(drop="last") + encoder.fit(df) + + # Alphabetically: Blue, Green, Red -> drop "Red" + assert encoder.encoder_dict_ == {"color": ["Blue", "Green"]} + + X = encoder.transform(df) + assert "color_Red" not in X.columns + assert "color_Blue" in X.columns + assert "color_Green" in X.columns + + +def test_drop_first_drops_first_category_alphabetically(): + """drop='first' should sort categories alphabetically and drop the first one.""" + df = pd.DataFrame({"color": ["Red", "Blue", "Green", "Red", "Blue", "Green"]}) + encoder = OneHotEncoder(drop="first") + encoder.fit(df) + + # Alphabetically: Blue, Green, Red -> drop "Blue" + assert encoder.encoder_dict_ == {"color": ["Green", "Red"]} + + X = encoder.transform(df) + assert "color_Blue" not in X.columns + assert "color_Green" in X.columns + assert "color_Red" in X.columns + + +def test_drop_most_frequent_drops_most_common_category(): + """drop='most_frequent' should drop the category with the highest count.""" + df = pd.DataFrame({ + "animal": ["Cat"] * 10 + ["Dog"] * 5 + ["Fish"] * 3 + }) + encoder = OneHotEncoder(drop="most_frequent") + encoder.fit(df) + + # Cat appears 10 times (most frequent) -> it should be dropped + assert "Cat" not in encoder.encoder_dict_["animal"] + assert "Dog" in encoder.encoder_dict_["animal"] + assert "Fish" in encoder.encoder_dict_["animal"] + + X = encoder.transform(df) + assert "animal_Cat" not in X.columns + assert "animal_Dog" in X.columns + assert "animal_Fish" in X.columns + + +def test_drop_most_frequent_with_tie_raises_warning(): + """When multiple categories share the highest frequency, a warning should be + raised and the first one alphabetically among the tied should be dropped.""" + df = pd.DataFrame({ + "fruit": ["Apple"] * 5 + ["Banana"] * 5 + ["Cherry"] * 3 + }) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + encoder = OneHotEncoder(drop="most_frequent") + encoder.fit(df) + + # Apple and Banana are tied at 5 each -> warning expected + user_warnings = [ + x for x in w + if issubclass(x.category, UserWarning) + and "multiple categories" in str(x.message).lower() + ] + assert len(user_warnings) == 1 + + # Should drop "Apple" (first alphabetically among tied) + assert "Apple" not in encoder.encoder_dict_["fruit"] + assert "Banana" in encoder.encoder_dict_["fruit"] + assert "Cherry" in encoder.encoder_dict_["fruit"] + + +def test_deprecation_warning_when_drop_last_and_drop_both_set(): + """Using both drop_last=True and drop='first' should emit a FutureWarning, + and the 'drop' parameter should take precedence.""" + df = pd.DataFrame({"color": ["Red", "Blue", "Green", "Red", "Blue", "Green"]}) + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + encoder = OneHotEncoder(drop_last=True, drop="first") + + future_warnings = [x for x in w if issubclass(x.category, FutureWarning)] + assert len(future_warnings) == 1 + assert "deprecated" in str(future_warnings[0].message).lower() + + encoder.fit(df) + + # drop='first' takes precedence -> drops "Blue" (first alphabetically) + assert "Blue" not in encoder.encoder_dict_["color"] + assert "Green" in encoder.encoder_dict_["color"] + assert "Red" in encoder.encoder_dict_["color"] + + +@pytest.mark.parametrize("bad_value", ["middle", "random", 123, True]) +def test_error_if_drop_not_valid(bad_value): + """Invalid values for 'drop' should raise a ValueError.""" + with pytest.raises(ValueError): + OneHotEncoder(drop=bad_value) + + +def test_drop_last_backward_compatibility(df_enc_big): + """Using drop_last=True without the new drop parameter should still work.""" + encoder = OneHotEncoder( + top_categories=None, variables=["var_A", "var_B"], drop_last=True + ) + X = encoder.fit_transform(df_enc_big) + + # Alphabetically last category is "G" -> should be dropped + assert "var_A_G" not in X.columns + assert "var_B_G" not in X.columns + assert "var_A_A" in X.columns + assert "var_B_A" in X.columns