Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
fb230fe
feat(CategoricalImputer): add errors param to handle multimodal varia…
direkkakkar319-ops Mar 8, 2026
81be348
style: fix flake8 line length in CategoricalImputer
direkkakkar319-ops Mar 8, 2026
4fb5b7a
style: fix import order and duplicate pandas import
direkkakkar319-ops Mar 8, 2026
835133f
test: add coverage for errors='ignore' branches
direkkakkar319-ops Mar 8, 2026
81f31d8
style: add missing newline at end of test file
direkkakkar319-ops Mar 8, 2026
657de1f
Changes for codedev tests
direkkakkar319-ops Mar 9, 2026
a0ea71d
added space at last of test_categorical_imputer.py
direkkakkar319-ops Mar 16, 2026
0cdcf03
Revert docs/whats_new/v_190.rst to upstream version
direkkakkar319-ops Mar 26, 2026
cf7670e
changes done to `feature_engine/imputation/categorical.py`
direkkakkar319-ops Mar 26, 2026
fb2f8db
changes made to `tests/test_imputation/test_categorical_imputer.py`
direkkakkar319-ops Mar 26, 2026
97d6053
resolved comment done on R15
direkkakkar319-ops Mar 26, 2026
c454edd
reformatted the error tests to match the error from within pytest
direkkakkar319-ops Mar 26, 2026
5992d09
made three tests into one test
direkkakkar319-ops Mar 26, 2026
85b1974
left change
direkkakkar319-ops Mar 26, 2026
09429f3
refactored the multimodal tests
direkkakkar319-ops Mar 26, 2026
0b86cfa
refactored test_errors_invalid_value_raises
direkkakkar319-ops Mar 26, 2026
45f4e2f
changed the function `test_errors_param_ignored_when_imputation_metho…
direkkakkar319-ops Mar 26, 2026
cda93e7
removed `test_errors_ignore_single_variable` `test_errors_ignore_mult…
direkkakkar319-ops Mar 26, 2026
04be1a0
remove the commented block
direkkakkar319-ops Mar 26, 2026
94643d8
last few changes made
direkkakkar319-ops Mar 26, 2026
ab6ba66
test case style updated
direkkakkar319-ops Mar 26, 2026
6ba7fce
Renamed `errors` to `multimodal` in CategoricalImputer and add missin…
direkkakkar319-ops Mar 27, 2026
1a3fde2
Apply suggestion from @solegalli
direkkakkar319-ops Mar 27, 2026
36eb1dc
Apply suggestion from @solegalli
direkkakkar319-ops Mar 27, 2026
aa37d19
Update categorical.py
direkkakkar319-ops Mar 27, 2026
3e58d8b
removed comments and added tests
direkkakkar319-ops Mar 27, 2026
6746429
Merge branch 'issue-904-categorical-imputer-multimodal' of https://gi…
direkkakkar319-ops Mar 27, 2026
c77e8f1
Update .gitignore
direkkakkar319-ops Mar 27, 2026
a22f586
removed the spaces
direkkakkar319-ops Mar 27, 2026
51f8276
Merge branch 'issue-904-categorical-imputer-multimodal' of https://gi…
direkkakkar319-ops Mar 27, 2026
7156d28
removed the spaces
direkkakkar319-ops Mar 27, 2026
5d65fe8
simplified the test case as asked
direkkakkar319-ops Mar 27, 2026
a95f5e0
simplified the test case as asked
direkkakkar319-ops Mar 27, 2026
6f5b4da
simplified the test case as asked
direkkakkar319-ops Mar 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -111,4 +111,4 @@ venv.bak/
*.csv
*.DS_Store
*.db
*.pptx
*.pptx
78 changes: 52 additions & 26 deletions feature_engine/imputation/categorical.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Authors: Soledad Galli <solegalli@protonmail.com>
# License: BSD 3 clause

import warnings
from typing import List, Optional, Union

import pandas as pd
Expand All @@ -12,7 +13,7 @@
_feature_names_in_docstring,
_imputer_dict_docstring,
_n_features_in_docstring,
_variables_attribute_docstring,
_variables_attribute_docstring
)
from feature_engine._docstrings.methods import (
_fit_transform_docstring,
Expand Down Expand Up @@ -88,6 +89,18 @@ class CategoricalImputer(BaseImputer):
type object or categorical. If True, the imputer will select all variables or
accept all variables entered by the user, including those cast as numeric.

multimodal : str, default='raise'
Indicates what to do when imputation_method='frequent'
and a variable has more than 1 mode.

If 'raise', raises a ValueError and stops the fit.

If 'warn', raises a UserWarning and continues the imputation using the
first most frequent category found.

If 'ignore', continues without warnings, imputing using the first
most frequent category found.

Attributes
----------
{imputer_dict_}
Expand Down Expand Up @@ -135,6 +148,7 @@ def __init__(
variables: Union[None, int, str, List[Union[str, int]]] = None,
return_object: bool = False,
ignore_format: bool = False,
multimodal: str = "raise",
) -> None:
if imputation_method not in ["missing", "frequent"]:
raise ValueError(
Expand All @@ -144,11 +158,18 @@ def __init__(
if not isinstance(ignore_format, bool):
raise ValueError("ignore_format takes only booleans True and False")

if multimodal not in ["raise", "warn", "ignore"]:
raise ValueError(
"multimodal takes only values 'raise', 'warn', or 'ignore'. "
f"Got {multimodal} instead."
)

self.imputation_method = imputation_method
self.fill_value = fill_value
self.variables = _check_variables_input_value(variables)
self.return_object = return_object
self.ignore_format = ignore_format
self.multimodal = multimodal

def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
"""
Expand All @@ -163,10 +184,8 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
y is not needed in this imputation. You can pass None or y.
"""

# check input dataframe
X = check_X(X)

# select variables to encode
if self.ignore_format is True:
if self.variables is None:
self.variables_ = find_all_variables(X)
Expand All @@ -182,54 +201,64 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
self.imputer_dict_ = {var: self.fill_value for var in self.variables_}

elif self.imputation_method == "frequent":
# if imputing only 1 variable:
if len(self.variables_) == 1:
var = self.variables_[0]
mode_vals = X[var].mode()

# Some variables may contain more than 1 mode:
if len(mode_vals) > 1:
raise ValueError(
f"The variable {var} contains multiple frequent categories."
)
if self.multimodal == "raise":
raise ValueError(
f"The variable {var} contains multiple "
f"frequent categories. Set multimodal='warn' or "
f"multimodal='ignore' to allow imputation using "
f"the first most frequent category found."
)
elif self.multimodal == "warn":
warnings.warn(
f"Variable {var} has multiple frequent "
f"categories. The first category found, "
f"{mode_vals[0]}, will be used for imputation.",
UserWarning,
)

self.imputer_dict_ = {var: mode_vals[0]}

# imputing multiple variables:
else:
# Returns a dataframe with 1 row if there is one mode per
# variable, or more rows if there are more modes:
mode_vals = X[self.variables_].mode()

# Careful: some variables contain multiple modes
if len(mode_vals) > 1:
varnames = mode_vals.dropna(axis=1).columns.to_list()
if len(varnames) > 1:
varnames_str = ", ".join(varnames)
else:
varnames_str = varnames[0]
raise ValueError(
f"The variable(s) {varnames_str} contain(s) multiple frequent "
f"categories."
)

if self.multimodal == "raise":
raise ValueError(
f"The variable(s) {varnames_str} contain(s) "
f"multiple frequent categories. Set "
f"multimodal='warn' or multimodal='ignore' to allow "
f"imputation using the first most frequent "
f"category found."
)
elif self.multimodal == "warn":
warnings.warn(
f"Variable(s) {varnames_str} have multiple "
f"frequent categories. The first category "
f"found will be used for imputation.",
UserWarning,
)

self.imputer_dict_ = mode_vals.iloc[0].to_dict()

self._get_feature_names_in(X)

return self

def transform(self, X: pd.DataFrame) -> pd.DataFrame:
# Frequent category imputation
if self.imputation_method == "frequent":
X = super().transform(X)

# Imputation with string
else:
X = self._transform(X)

# if variable is of type category, we need to add the new
# category, before filling in the nan
add_cats = {}
for variable in self.variables_:
if X[variable].dtype.name == "category":
Expand All @@ -243,13 +272,10 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:

X = X.assign(**add_cats).fillna(self.imputer_dict_)

# add additional step to return variables cast as object
if self.return_object:
X[self.variables_] = X[self.variables_].astype("O")

return X

# Get docstring from BaseClass
transform.__doc__ = BaseImputer.transform.__doc__

def _more_tags(self):
Expand Down
Loading