feature-engine · karen-elisha · Jun 26, 2026
diff --git a/feature_engine/encoding/one_hot.py b/feature_engine/encoding/one_hot.py
@@ -1,6 +1,7 @@
 # Authors: Soledad Galli <solegalli@protonmail.com>
 # License: BSD 3 clause
 
+import warnings
 from typing import List, Optional, Union
 
 import numpy as np
@@ -80,13 +81,31 @@ class OneHotEncoder(CategoricalMethodsMixin, CategoricalInitMixin):
         categories to encode. In this case, dummy variables will be created only for
         those popular categories and the rest will be ignored, i.e., they will show the
         value 0 in all the binary variables. Note that if `top_categories` is not None,
-        the parameter `drop_last` is ignored.
+        the parameters `drop_last` and `drop` are ignored.
 
     drop_last: boolean, default=False
         Only used if `top_categories = None`. It indicates whether to create dummy
         variables for all the categories (k dummies), or if set to `True`, it will
         ignore the last binary variable and return k-1 dummies.
 
+        .. deprecated::
+            `drop_last` is deprecated in favour of `drop`. `drop_last=True` is
+            equivalent to `drop='last'`. If both `drop_last` and `drop` are set,
+            a FutureWarning is issued and `drop` takes precedence.
+
+    drop: str, default=None
+        Only used if `top_categories = None`. Determines which category to drop
+        to return k-1 binary variables. Options are:
+
+        - None: No category is dropped (k dummies are returned), unless
+          `drop_last` is True.
+        - ``'last'``: Drops the last category alphabetically.
+        - ``'first'``: Drops the first category alphabetically.
+        - ``'most_frequent'``: Drops the most frequent category observed during
+          fit(). If multiple categories share the highest frequency, a UserWarning
+          is raised and the first category alphabetically among the tied ones is
+          dropped.
+
     drop_last_binary: boolean, default=False
         Whether to return 1 or 2 dummy variables for binary categorical variables. When
         a categorical variable has only 2 categories, then the second dummy variable
@@ -164,6 +183,7 @@ def __init__(
         drop_last_binary: bool = False,
         variables: Union[None, int, str, List[Union[str, int]]] = None,
         ignore_format: bool = False,
+        drop: Optional[str] = None,
     ) -> None:
 
         if top_categories and (
@@ -185,10 +205,26 @@ def __init__(
                 f"Got {drop_last_binary} instead."
             )
 
+        if drop is not None and drop not in ("last", "first", "most_frequent"):
+            raise ValueError(
+                "drop takes only values 'last', 'first', 'most_frequent' "
+                f"or None. Got {drop} instead."
+            )
+
+        if drop_last is True and drop is not None:
+            warnings.warn(
+                "Both 'drop_last' and 'drop' were set. 'drop_last' is deprecated "
+                "in favour of 'drop'. The 'drop' parameter will take precedence. "
+                "Please use only 'drop' going forward.",
+                FutureWarning,
+                stacklevel=2,
+            )
+
         super().__init__(variables, ignore_format)
         self.top_categories = top_categories
         self.drop_last = drop_last
         self.drop_last_binary = drop_last_binary
+        self.drop = drop
 
     def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
         """
@@ -228,15 +264,48 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
                 ]
 
             else:
-                category_ls = list(X[var].unique())
-
-                # return k-1 dummies
-                if self.drop_last:
-                    self.encoder_dict_[var] = category_ls[:-1]
-
-                # return k dummies
+                # Determine the effective drop strategy.
+                # New 'drop' param takes precedence over old 'drop_last'.
+                drop_strategy = self.drop
+                if drop_strategy is None and self.drop_last:
+                    drop_strategy = "last"
+
+                if drop_strategy is not None:
+                    # Sort alphabetically so "first"/"last" are deterministic
+                    category_ls = sorted(X[var].unique())
+
+                    # Figure out which single category to remove
+                    if drop_strategy == "last":
+                        cat_to_drop = category_ls[-1]
+
+                    elif drop_strategy == "first":
+                        cat_to_drop = category_ls[0]
+
+                    elif drop_strategy == "most_frequent":
+                        freq = X[var].value_counts()
+                        max_freq = freq.max()
+                        most_frequent_cats = sorted(
+                            freq[freq == max_freq].index.tolist()
+                        )
+                        if len(most_frequent_cats) > 1:
+                            warnings.warn(
+                                f"Variable '{var}': multiple categories share "
+                                f"the highest frequency ({max_freq}): "
+                                f"{most_frequent_cats}. Dropping the first "
+                                f"alphabetically: '{most_frequent_cats[0]}'.",
+                                UserWarning,
+                                stacklevel=2,
+                            )
+                        cat_to_drop = most_frequent_cats[0]
+
+                    # Remove that one category from the list
+                    self.encoder_dict_[var] = [
+                        c for c in category_ls if c != cat_to_drop
+                    ]
+
+                # return k dummies (no category dropped) — preserve insertion order
                 else:
-                    self.encoder_dict_[var] = category_ls
+                    self.encoder_dict_[var] = list(X[var].unique())
 
         self.variables_binary_ = [var for var in variables_ if X[var].nunique() == 2]
 

diff --git a/tests/test_encoding/test_onehot_encoder.py b/tests/test_encoding/test_onehot_encoder.py
@@ -1,3 +1,5 @@
+import warnings
+
 import pandas as pd
 import pytest
 from sklearn.pipeline import Pipeline
@@ -458,13 +460,18 @@ def test_get_feature_names_out(df_enc_binary):
     tr = OneHotEncoder(drop_last=True)
     tr.fit(df_enc_binary)
 
+    # With drop_last=True, categories are sorted alphabetically and last is dropped.
+    # var_A: [A, B, C] -> drop C -> keep A, B
+    # var_B: [A, B, C] -> drop C -> keep A, B
+    # var_C: [AHA, UHU] -> drop UHU -> keep AHA
+    # var_D: [EHE, OHO] -> drop OHO -> keep EHE
     out = [
         "var_A_A",
         "var_A_B",
         "var_B_A",
         "var_B_B",
         "var_C_AHA",
-        "var_D_OHO",
+        "var_D_EHE",
     ]
     feat_out = original_features + out
 
@@ -534,3 +541,124 @@ def test_inverse_transform_raises_not_implemented_error(df_enc_binary):
     enc = OneHotEncoder().fit(df_enc_binary)
     with pytest.raises(NotImplementedError):
         enc.inverse_transform(df_enc_binary)
+
+
+# ================================================================
+# Tests for the new `drop` parameter
+# ================================================================
+
+
+def test_drop_last_drops_last_category_alphabetically():
+    """With drop='last', the alphabetically last category gets no dummy."""
+    data = pd.DataFrame({"color": ["Red", "Blue", "Green", "Red", "Blue", "Green"]})
+    enc = OneHotEncoder(drop="last").fit(data)
+
+    # Sorted categories are Blue, Green, Red, so "Red" is the one removed.
+    assert enc.encoder_dict_ == {"color": ["Blue", "Green"]}
+
+    # The transformed frame has dummies for the kept categories only.
+    columns = enc.transform(data).columns
+    assert "color_Blue" in columns
+    assert "color_Green" in columns
+    assert "color_Red" not in columns
+
+
+def test_drop_first_drops_first_category_alphabetically():
+    """With drop='first', the alphabetically first category gets no dummy."""
+    data = pd.DataFrame({"color": ["Red", "Blue", "Green", "Red", "Blue", "Green"]})
+    enc = OneHotEncoder(drop="first").fit(data)
+
+    # Sorted categories are Blue, Green, Red, so "Blue" is the one removed.
+    assert enc.encoder_dict_ == {"color": ["Green", "Red"]}
+
+    # The transformed frame has dummies for the kept categories only.
+    columns = enc.transform(data).columns
+    assert "color_Green" in columns
+    assert "color_Red" in columns
+    assert "color_Blue" not in columns
+
+
+def test_drop_most_frequent_drops_most_common_category():
+    """drop='most_frequent' removes the category with the highest count.
+
+    The remaining categories are stored in sorted order.
+    """
+    df = pd.DataFrame({"animal": ["Cat"] * 10 + ["Dog"] * 5 + ["Fish"] * 3})
+    encoder = OneHotEncoder(drop="most_frequent")
+    encoder.fit(df)
+
+    # "Cat" appears 10 times, more than "Dog" (5) or "Fish" (3) -> dropped.
+    assert encoder.encoder_dict_ == {"animal": ["Dog", "Fish"]}
+
+    # Only the retained categories get a dummy column.
+    X = encoder.transform(df)
+    assert "animal_Cat" not in X.columns
+    assert "animal_Dog" in X.columns
+    assert "animal_Fish" in X.columns
+
+
+def test_drop_most_frequent_with_tie_raises_warning():
+    """A tie for the highest frequency warns and drops the first tied category.
+
+    Tied categories are sorted, so the alphabetically first one is removed.
+    """
+    df = pd.DataFrame({"fruit": ["Apple"] * 5 + ["Banana"] * 5 + ["Cherry"] * 3})
+    with warnings.catch_warnings(record=True) as w:
+        warnings.simplefilter("always")
+        encoder = OneHotEncoder(drop="most_frequent")
+        encoder.fit(df)
+
+    # "Apple" and "Banana" are tied at 5 each -> exactly one tie warning.
+    tie_warnings = [
+        x
+        for x in w
+        if issubclass(x.category, UserWarning)
+        and "multiple categories" in str(x.message).lower()
+    ]
+    assert len(tie_warnings) == 1
+
+    # "Apple" is the first of the tied categories in sorted order, so it is
+    # dropped; the remaining categories keep their sorted order.
+    assert encoder.encoder_dict_ == {"fruit": ["Banana", "Cherry"]}
+
+
+def test_deprecation_warning_when_drop_last_and_drop_both_set():
+    """Setting drop_last=True together with drop emits one FutureWarning.
+
+    The warning is raised at construction time and `drop` takes precedence.
+    """
+    df = pd.DataFrame({"color": ["Red", "Blue", "Green", "Red", "Blue", "Green"]})
+
+    with warnings.catch_warnings(record=True) as w:
+        warnings.simplefilter("always")
+        encoder = OneHotEncoder(drop_last=True, drop="first")
+
+    future_warnings = [x for x in w if issubclass(x.category, FutureWarning)]
+    assert len(future_warnings) == 1
+    assert "deprecated" in str(future_warnings[0].message).lower()
+
+    encoder.fit(df)
+
+    # drop="first" wins over drop_last=True: "Blue" is dropped, not "Red".
+    assert encoder.encoder_dict_ == {"color": ["Green", "Red"]}
+
+
+@pytest.mark.parametrize("bad_value", ["middle", "random", 123, True])
+def test_error_if_drop_not_valid(bad_value):
+    """Invalid `drop` values are rejected with the ValueError of the drop check."""
+    with pytest.raises(ValueError, match="drop takes only values"):
+        OneHotEncoder(drop=bad_value)
+
+
+def test_drop_last_backward_compatibility(df_enc_big):
+    """The legacy drop_last=True flag still works when `drop` is not passed."""
+    enc = OneHotEncoder(
+        top_categories=None, variables=["var_A", "var_B"], drop_last=True
+    )
+    columns = enc.fit_transform(df_enc_big).columns
+
+    # "G" is expected to sort last in both variables of the fixture, so its
+    # dummy is dropped while the dummy for "A" is kept.
+    assert "var_A_A" in columns and "var_B_A" in columns
+    assert "var_A_G" not in columns
+    assert "var_B_G" not in columns