Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 78 additions & 9 deletions feature_engine/encoding/one_hot.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Authors: Soledad Galli <solegalli@protonmail.com>
# License: BSD 3 clause

import warnings
from typing import List, Optional, Union

import numpy as np
Expand Down Expand Up @@ -80,13 +81,31 @@ class OneHotEncoder(CategoricalMethodsMixin, CategoricalInitMixin):
categories to encode. In this case, dummy variables will be created only for
those popular categories and the rest will be ignored, i.e., they will show the
value 0 in all the binary variables. Note that if `top_categories` is not None,
the parameter `drop_last` is ignored.
the parameters `drop_last` and `drop` are ignored.

drop_last: boolean, default=False
Only used if `top_categories = None`. It indicates whether to create dummy
variables for all the categories (k dummies), or if set to `True`, it will
drop the dummy of the last category in sorted order and return k-1 dummies.

.. deprecated::
`drop_last` is deprecated. Use the `drop` parameter instead, which
provides more flexibility. If both `drop_last` and `drop` are set,
a warning will be raised and `drop` will take precedence.

drop: str, default=None
Only used if `top_categories = None`. Determines which category to drop
to return k-1 binary variables. Options are:

- None: No category is dropped (k dummies are returned), unless
`drop_last` is True.
- ``'last'``: Drops the last category in sorted order (alphabetical for
strings).
- ``'first'``: Drops the first category in sorted order (alphabetical for
strings).
- ``'most_frequent'``: Drops the most frequent category observed during
fit(). If multiple categories share the highest frequency, a UserWarning
is raised and the first category alphabetically among the tied ones is
dropped.

drop_last_binary: boolean, default=False
Whether to return 1 or 2 dummy variables for binary categorical variables. When
a categorical variable has only 2 categories, then the second dummy variable
Expand Down Expand Up @@ -164,6 +183,7 @@ def __init__(
drop_last_binary: bool = False,
variables: Union[None, int, str, List[Union[str, int]]] = None,
ignore_format: bool = False,
drop: Optional[str] = None,
) -> None:

if top_categories and (
Expand All @@ -185,10 +205,26 @@ def __init__(
f"Got {drop_last_binary} instead."
)

if drop is not None and drop not in ("last", "first", "most_frequent"):
raise ValueError(
"drop takes only values 'last', 'first', 'most_frequent' "
f"or None. Got {drop} instead."
)

if drop_last is True and drop is not None:
warnings.warn(
"Both 'drop_last' and 'drop' were set. 'drop_last' is deprecated "
"in favour of 'drop'. The 'drop' parameter will take precedence. "
"Please use only 'drop' going forward.",
FutureWarning,
stacklevel=2,
)

super().__init__(variables, ignore_format)
self.top_categories = top_categories
self.drop_last = drop_last
self.drop_last_binary = drop_last_binary
self.drop = drop

def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
"""
Expand Down Expand Up @@ -228,15 +264,48 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
]

else:
category_ls = list(X[var].unique())

# return k-1 dummies
if self.drop_last:
self.encoder_dict_[var] = category_ls[:-1]

# return k dummies
# Determine the effective drop strategy.
# New 'drop' param takes precedence over old 'drop_last'.
drop_strategy = self.drop
if drop_strategy is None and self.drop_last:
drop_strategy = "last"

if drop_strategy is not None:
# Sort alphabetically so "first"/"last" are deterministic
category_ls = sorted(X[var].unique())

# Figure out which single category to remove
if drop_strategy == "last":
cat_to_drop = category_ls[-1]

elif drop_strategy == "first":
cat_to_drop = category_ls[0]

elif drop_strategy == "most_frequent":
freq = X[var].value_counts()
max_freq = freq.max()
most_frequent_cats = sorted(
freq[freq == max_freq].index.tolist()
)
if len(most_frequent_cats) > 1:
warnings.warn(
f"Variable '{var}': multiple categories share "
f"the highest frequency ({max_freq}): "
f"{most_frequent_cats}. Dropping the first "
f"alphabetically: '{most_frequent_cats[0]}'.",
UserWarning,
stacklevel=2,
)
cat_to_drop = most_frequent_cats[0]

# Remove that one category from the list
self.encoder_dict_[var] = [
c for c in category_ls if c != cat_to_drop
]

# return k dummies (no category dropped) — preserve insertion order
else:
self.encoder_dict_[var] = category_ls
self.encoder_dict_[var] = list(X[var].unique())

self.variables_binary_ = [var for var in variables_ if X[var].nunique() == 2]

Expand Down
130 changes: 129 additions & 1 deletion tests/test_encoding/test_onehot_encoder.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import warnings

import pandas as pd
import pytest
from sklearn.pipeline import Pipeline
Expand Down Expand Up @@ -458,13 +460,18 @@ def test_get_feature_names_out(df_enc_binary):
tr = OneHotEncoder(drop_last=True)
tr.fit(df_enc_binary)

# With drop_last=True, categories are sorted alphabetically and last is dropped.
# var_A: [A, B, C] -> drop C -> keep A, B
# var_B: [A, B, C] -> drop C -> keep A, B
# var_C: [AHA, UHU] -> drop UHU -> keep AHA
# var_D: [EHE, OHO] -> drop OHO -> keep EHE
out = [
"var_A_A",
"var_A_B",
"var_B_A",
"var_B_B",
"var_C_AHA",
"var_D_OHO",
"var_D_EHE",
]
feat_out = original_features + out

Expand Down Expand Up @@ -534,3 +541,124 @@ def test_inverse_transform_raises_not_implemented_error(df_enc_binary):
enc = OneHotEncoder().fit(df_enc_binary)
with pytest.raises(NotImplementedError):
enc.inverse_transform(df_enc_binary)


# ================================================================
# Tests for the new `drop` parameter
# ================================================================


def test_drop_last_drops_last_category_alphabetically():
    """With drop='last', the alphabetically last category gets no dummy."""
    colors = ["Red", "Blue", "Green", "Red", "Blue", "Green"]
    data = pd.DataFrame({"color": colors})

    enc = OneHotEncoder(drop="last")
    enc.fit(data)

    # Sorted categories are Blue, Green, Red, so "Red" is the one removed.
    expected_mapping = {"color": ["Blue", "Green"]}
    assert enc.encoder_dict_ == expected_mapping

    out_cols = enc.transform(data).columns
    assert "color_Red" not in out_cols
    for kept in ("color_Blue", "color_Green"):
        assert kept in out_cols


def test_drop_first_drops_first_category_alphabetically():
    """With drop='first', the alphabetically first category gets no dummy."""
    colors = ["Red", "Blue", "Green", "Red", "Blue", "Green"]
    data = pd.DataFrame({"color": colors})

    enc = OneHotEncoder(drop="first")
    enc.fit(data)

    # Sorted categories are Blue, Green, Red, so "Blue" is the one removed.
    expected_mapping = {"color": ["Green", "Red"]}
    assert enc.encoder_dict_ == expected_mapping

    out_cols = enc.transform(data).columns
    assert "color_Blue" not in out_cols
    for kept in ("color_Green", "color_Red"):
        assert kept in out_cols


def test_drop_most_frequent_drops_most_common_category():
    """Check that drop='most_frequent' removes the category with the top count."""
    # Cat is the clear winner (10 rows), so there is no tie to resolve.
    counts = {"Cat": 10, "Dog": 5, "Fish": 3}
    animals = [name for name, n in counts.items() for _ in range(n)]
    data = pd.DataFrame({"animal": animals})

    enc = OneHotEncoder(drop="most_frequent")
    enc.fit(data)

    kept = enc.encoder_dict_["animal"]
    assert "Cat" not in kept
    assert "Dog" in kept
    assert "Fish" in kept

    out_cols = enc.transform(data).columns
    assert "animal_Cat" not in out_cols
    assert "animal_Dog" in out_cols
    assert "animal_Fish" in out_cols


def test_drop_most_frequent_with_tie_raises_warning():
    """A tie for the highest frequency warns and drops the first tied category.

    Apple and Banana both appear 5 times, so fit() must emit exactly one
    UserWarning about the tie and drop "Apple", the first of the tied
    categories in sorted order.
    """
    df = pd.DataFrame({"fruit": ["Apple"] * 5 + ["Banana"] * 5 + ["Cherry"] * 3})
    encoder = OneHotEncoder(drop="most_frequent")

    # The tie is detected while learning the categories, so the warning is
    # expected from fit(); pytest.warns fails the test if none matches.
    with pytest.warns(UserWarning, match="multiple categories share") as record:
        encoder.fit(df)

    # pytest.warns only guarantees "at least one"; pin it to exactly one so a
    # duplicated warning would be caught.
    tie_warnings = [
        w
        for w in record
        if issubclass(w.category, UserWarning)
        and "multiple categories share" in str(w.message)
    ]
    assert len(tie_warnings) == 1

    # Exact comparison also pins the sorted order of the remaining categories.
    assert encoder.encoder_dict_ == {"fruit": ["Banana", "Cherry"]}


def test_deprecation_warning_when_drop_last_and_drop_both_set():
    """Setting drop_last=True together with drop warns, and drop wins.

    The constructor must emit a single FutureWarning mentioning the
    deprecation, and the fitted encoder must follow drop='first'.
    """
    colors = ["Red", "Blue", "Green", "Red", "Blue", "Green"]
    data = pd.DataFrame({"color": colors})

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        enc = OneHotEncoder(drop_last=True, drop="first")

    future_messages = [
        str(rec.message).lower()
        for rec in caught
        if issubclass(rec.category, FutureWarning)
    ]
    assert len(future_messages) == 1
    assert "deprecated" in future_messages[0]

    enc.fit(data)

    # drop='first' overrides drop_last, so "Blue" (first when sorted) is removed.
    kept = enc.encoder_dict_["color"]
    assert "Blue" not in kept
    assert "Green" in kept
    assert "Red" in kept


@pytest.mark.parametrize("bad_value", ["middle", "random", 123, True])
def test_error_if_drop_not_valid(bad_value):
    """Unsupported values for 'drop' are rejected at construction time.

    The match pattern ties the test to the 'drop' validation message, so an
    unrelated ValueError raised by the constructor cannot make it pass.
    """
    with pytest.raises(ValueError, match="drop takes only values"):
        OneHotEncoder(drop=bad_value)


def test_drop_last_backward_compatibility(df_enc_big):
    """drop_last=True on its own (no 'drop' argument) still yields k-1 dummies."""
    variables = ["var_A", "var_B"]
    enc = OneHotEncoder(top_categories=None, variables=variables, drop_last=True)
    out_cols = enc.fit_transform(df_enc_big).columns

    # NOTE(review): assumes "G" is the last category of both variables in the
    # df_enc_big fixture (defined outside this file) -- confirm against conftest.
    for var in variables:
        assert f"{var}_G" not in out_cols
    for var in variables:
        assert f"{var}_A" in out_cols