Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
7d23f87
added test case `def test_raises_non_fitted_error_when_error_during_f…
direkkakkar319-ops Mar 21, 2026
e1439cb
fixing test case test_style
direkkakkar319-ops Mar 21, 2026
f33a85a
fixing test case test_style
direkkakkar319-ops Mar 21, 2026
7a0e8f5
fixing test case test_style
direkkakkar319-ops Mar 21, 2026
59c8bd1
fix: defer trailing underscore attribute assignment in fit() for impu…
direkkakkar319-ops Mar 22, 2026
fef5a3f
base transformers
direkkakkar319-ops Mar 26, 2026
2d3f734
discretisation
direkkakkar319-ops Mar 26, 2026
f95cb7f
scaling
direkkakkar319-ops Mar 26, 2026
9b2fa4c
creation
direkkakkar319-ops Mar 26, 2026
64c9e74
imputation
direkkakkar319-ops Mar 26, 2026
50075fb
transformation
direkkakkar319-ops Mar 26, 2026
f2e944f
tests
direkkakkar319-ops Mar 26, 2026
ca9241e
creation
direkkakkar319-ops Mar 26, 2026
d813e2d
verified changes for checks
direkkakkar319-ops Mar 26, 2026
519ca1d
verified changes for checks
direkkakkar319-ops Mar 26, 2026
44f75c0
value error
direkkakkar319-ops Mar 26, 2026
7b3c592
ADDED:`test_raises_non_fitted_error_when_error_during_fit`
direkkakkar319-ops Mar 27, 2026
6babea2
added:`test_raises_non_fitted_error_when_error_during_fit`
direkkakkar319-ops Mar 27, 2026
c945dee
addEd:`test_raises_non_fitted_error_when_error_during_fit`
direkkakkar319-ops Mar 27, 2026
b631f63
transformers
direkkakkar319-ops Mar 27, 2026
0c681f5
left
direkkakkar319-ops Mar 27, 2026
6898a3d
Updated the `DecisionTreeFeatures` and `GeoDistanceFeatures`
direkkakkar319-ops Mar 28, 2026
b807ef4
fixed `geo_features.py`
direkkakkar319-ops Mar 28, 2026
3de9968
Improved failure triggers
direkkakkar319-ops Mar 28, 2026
9a5973d
Improved failure triggers
direkkakkar319-ops Mar 28, 2026
aa75e97
fixed
direkkakkar319-ops Mar 28, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 14 additions & 33 deletions feature_engine/_base_transformers/base_numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,53 +28,34 @@ class BaseNumericalTransformer(
variable transformers, discretisers, math combination.
"""

def fit(self, X: pd.DataFrame) -> pd.DataFrame:
def _fit_setup(self, X: pd.DataFrame):
"""
Checks that input is a dataframe, finds numerical variables, or alternatively
checks that variables entered by the user are of type numerical.

Parameters
----------
X : Pandas DataFrame

y : Pandas Series, np.array. Default = None
Parameter is necessary for compatibility with sklearn Pipeline.

Raises
------
TypeError
If the input is not a Pandas DataFrame or a numpy array
If any of the user provided variables are not numerical
ValueError
If there are no numerical variables in the df or the df is empty
If the variable(s) contain null values

Returns
-------
X : Pandas DataFrame
The same dataframe entered as parameter
Check dataframe, find numerical variables, check for NA and Inf.
Returns the checked dataframe and the correctly identified numerical variables.
"""

# check input dataframe
X = check_X(X)

# find or check for numerical variables
if self.variables is None:
self.variables_ = find_numerical_variables(X)
variables_ = find_numerical_variables(X)
else:
self.variables_ = check_numerical_variables(X, self.variables)
variables_ = check_numerical_variables(X, self.variables)

# check if dataset contains na or inf
_check_contains_na(X, self.variables_)
_check_contains_inf(X, self.variables_)
_check_contains_na(X, variables_)
_check_contains_inf(X, variables_)

# save input features
self.feature_names_in_ = X.columns.tolist()
return X, variables_

# save train set shape
def _get_feature_names_in(self, X):
"""Get the names and number of features in the train set (the dataframe
used during fit)."""

self.feature_names_in_ = X.columns.to_list()
self.n_features_in_ = X.shape[1]

return X
return self

def _check_transform_input_and_state(self, X: pd.DataFrame) -> pd.DataFrame:
"""
Expand Down
23 changes: 11 additions & 12 deletions feature_engine/_base_transformers/mixins.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Dict, List, Union
from typing import Dict, List, Tuple, Union

import pandas as pd
from numpy import ndarray
Expand Down Expand Up @@ -46,7 +46,9 @@ def transform_x_y(self, X: pd.DataFrame, y: pd.Series):


class FitFromDictMixin:
def _fit_from_dict(self, X: pd.DataFrame, user_dict_: Dict) -> pd.DataFrame:
def _fit_from_dict(
self, X: pd.DataFrame, user_dict_: Dict
) -> Tuple[pd.DataFrame, List[Union[str, int]]]:
"""
Checks that input is a dataframe, checks that variables in the dictionary
entered by the user are of type numerical.
Expand All @@ -71,25 +73,22 @@ def _fit_from_dict(self, X: pd.DataFrame, user_dict_: Dict) -> pd.DataFrame:
-------
X : Pandas DataFrame
The same dataframe entered as parameter

variables_ : List
The variables in the dictionary.
"""
# check input dataframe
X = check_X(X)

# find or check for numerical variables
variables = list(user_dict_.keys())
self.variables_ = check_numerical_variables(X, variables)
variables_ = check_numerical_variables(X, variables)

# check if dataset contains na or inf
_check_contains_na(X, self.variables_)
_check_contains_inf(X, self.variables_)

# save input features
self.feature_names_in_ = X.columns.tolist()

# save train set shape
self.n_features_in_ = X.shape[1]
_check_contains_na(X, variables_)
_check_contains_inf(X, variables_)

return X
return X, variables_


class GetFeatureNamesOutMixin:
Expand Down
12 changes: 8 additions & 4 deletions feature_engine/creation/cyclical_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,11 +147,15 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
It is not needed in this transformer. You can pass y or None.
"""
if self.max_values is None:
X = super().fit(X)
self.max_values_ = X[self.variables_].max().to_dict()
X, variables_ = self._fit_setup(X)
max_values_ = X[variables_].max().to_dict()
else:
super()._fit_from_dict(X, self.max_values)
self.max_values_ = self.max_values
X, variables_ = super()._fit_from_dict(X, self.max_values)
max_values_ = self.max_values

self.variables_ = variables_
self.max_values_ = max_values_
self._get_feature_names_in(X)

return self

Expand Down
23 changes: 4 additions & 19 deletions feature_engine/creation/decision_tree_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,27 +260,25 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
y: pandas Series or np.array = [n_samples,]
The target variable that is used to train the decision tree.
"""
# confirm model type and target variables are compatible.
X, y = check_X_y(X, y)
if self.regression is True:
if type_of_target(y) == "binary":
raise ValueError(
"Trying to fit a regression to a binary target is not "
"allowed by this transformer. Check the target values "
"or set regression to False."
)
is_binary = None
else:
check_classification_targets(y)
self._is_binary = type_of_target(y)

X, y = check_X_y(X, y)
is_binary = type_of_target(y)

# find or check for numerical variables
if self.variables is None:
variables_ = find_numerical_variables(X)
else:
variables_ = check_numerical_variables(X, self.variables)

# check if dataset contains na or inf
_check_contains_na(X, variables_)
_check_contains_inf(X, variables_)

Expand All @@ -289,7 +287,6 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
else:
param_grid = {"max_depth": [1, 2, 3, 4]}

# get the sets of variables that will be used to create new features
input_features = self._create_variable_combinations(
how_to_combine=self.features_to_combine, variables=variables_
)
Expand All @@ -298,7 +295,6 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
for features in input_features:
estimator = self._make_decision_tree(param_grid=param_grid)

# single feature models
if isinstance(features, str):
estimator.fit(X[features].to_frame(), y)
# multi feature models
Expand All @@ -310,6 +306,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
self.variables_ = variables_
self.input_features_ = input_features
self.estimators_ = estimators_
self._is_binary = is_binary
self.feature_names_in_ = X.columns.tolist()
self.n_features_in_ = X.shape[1]

Expand All @@ -330,24 +327,17 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
Either the original dataframe plus the new features or
a dataframe of only the new features.
"""
# Check method fit has been called
check_is_fitted(self)

# check that input is a dataframe
X = check_X(X)

# Check if input data contains same number of columns as dataframe used to fit.
_check_X_matches_training_df(X, self.n_features_in_)

# check if dataset contains na or inf
_check_contains_na(X, self.variables_)
_check_contains_inf(X, self.variables_)

# reorder variables to match train set
X = X[self.feature_names_in_]

# create new features and add them to the original dataframe
# if regression or multiclass, we return the output of predict()
if self.regression is True:
for features, estimator in zip(self.input_features_, self.estimators_):
if isinstance(features, str):
Expand All @@ -361,7 +351,6 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
preds = np.round(preds, self.precision)
X.loc[:, f"tree({features})"] = preds

# if binary classification, we return the probability
elif self._is_binary == "binary":
for features, estimator in zip(self.input_features_, self.estimators_):
if isinstance(features, str):
Expand All @@ -375,7 +364,6 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
preds = np.round(preds, self.precision)
X.loc[:, f"tree({features})"] = preds[:, 1]

# if multiclass, we return the output of predict()
else:
for features, estimator in zip(self.input_features_, self.estimators_):
if isinstance(features, str):
Expand Down Expand Up @@ -437,7 +425,6 @@ def _create_variable_combinations(
else:
combos.append(list(feature))

# if output_features is None, int or list.
else:
if how_to_combine is None:
if len(variables) == 1:
Expand All @@ -452,7 +439,6 @@ def _create_variable_combinations(
els = [list(x) for x in itertools.combinations(variables, i)]
combos += els

# output_feature is a list
else:
for i in how_to_combine:
els = [list(x) for x in itertools.combinations(variables, i)]
Expand All @@ -465,7 +451,6 @@ def _get_new_features_name(self) -> List:
feature_names = [f"tree({combo})" for combo in self.input_features_]
return feature_names

# for the check_estimator tests
def _more_tags(self):
tags_dict = _return_tags()
tags_dict["requires_y"] = True
Expand Down
12 changes: 7 additions & 5 deletions feature_engine/creation/geo_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,26 +234,25 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
# check input dataframe
X = check_X(X)

# Store coordinate variables
self.variables_: List[Union[str, int]] = [
variables: List[Union[str, int]] = [
self.lat1,
self.lon1,
self.lat2,
self.lon2,
]

# Check all coordinate columns exist
missing = set(self.variables_) - set(X.columns)
missing = set(variables) - set(X.columns)
if missing:
raise ValueError(
f"Coordinate columns {missing} are not present in the dataframe."
)

# Check coordinate columns are numerical
check_numerical_variables(X, self.variables_)
check_numerical_variables(X, variables)

# Check for missing values
_check_contains_na(X, self.variables_)
_check_contains_na(X, variables)

# Validate coordinate ranges if enabled
if self.validate_ranges:
Expand All @@ -269,6 +268,9 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
f"Longitude values in '{lon_col}' must be between -180 and 180."
)

# save coordinate variables
self.variables_ = variables

# save input features
self.feature_names_in_ = X.columns.tolist()

Expand Down
4 changes: 3 additions & 1 deletion feature_engine/discretisation/arbitrary.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,10 +151,12 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
y is not needed in this transformer. You can pass y or None.
"""
# check input dataframe
X = super()._fit_from_dict(X, self.binning_dict)
X, variables_ = super()._fit_from_dict(X, self.binning_dict)

# for consistency with the rest of the discretisers, we add this attribute
self.variables_ = variables_
self.binner_dict_ = self.binning_dict
self._get_feature_names_in(X)

return self

Expand Down
11 changes: 7 additions & 4 deletions feature_engine/discretisation/decision_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ def __init__(
self.param_grid = param_grid
self.random_state = random_state

def fit(self, X: pd.DataFrame, y: pd.Series): # type: ignore
def fit(self, X: pd.DataFrame, y: pd.Series):
"""
Fit one decision tree per variable to discretize with cross-validation and
grid-search for hyperparameters.
Expand All @@ -241,7 +241,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series): # type: ignore
check_classification_targets(y)

# check input dataframe
X = super().fit(X)
X, variables_ = self._fit_setup(X)

if self.param_grid:
param_grid = self.param_grid
Expand All @@ -251,7 +251,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series): # type: ignore
binner_dict_ = {}
scores_dict_ = {}

for var in self.variables_:
for var in variables_:

if self.regression:
model = DecisionTreeRegressor(random_state=self.random_state)
Expand All @@ -269,7 +269,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series): # type: ignore
scores_dict_[var] = tree_model.score(X[var].to_frame(), y)

if self.bin_output != "prediction":
for var in self.variables_:
for var in variables_:
clf = binner_dict_[var].best_estimator_
threshold = clf.tree_.threshold
feature = clf.tree_.feature
Expand All @@ -280,6 +280,9 @@ def fit(self, X: pd.DataFrame, y: pd.Series): # type: ignore

self.binner_dict_ = binner_dict_
self.scores_dict_ = scores_dict_
self.variables_ = variables_
self._get_feature_names_in(X)

return self

def transform(self, X: pd.DataFrame) -> pd.DataFrame:
Expand Down
11 changes: 7 additions & 4 deletions feature_engine/discretisation/equal_frequency.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,17 +159,20 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
"""

# check input dataframe
X = super().fit(X)
X, variables_ = self._fit_setup(X)

self.binner_dict_ = {}
binner_dict_ = {}

for var in self.variables_:
for var in variables_:
tmp, bins = pd.qcut(x=X[var], q=self.q, retbins=True, duplicates="drop")

# Prepend/Append infinities to accommodate outliers
bins = list(bins)
bins[0] = float("-inf")
bins[len(bins) - 1] = float("inf")
self.binner_dict_[var] = bins
binner_dict_[var] = bins

self.binner_dict_ = binner_dict_
self.variables_ = variables_
self._get_feature_names_in(X)
return self
Loading