Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
7d23f87
added test case `def test_raises_non_fitted_error_when_error_during_f…
direkkakkar319-ops Mar 21, 2026
e1439cb
fixing test case test_style
direkkakkar319-ops Mar 21, 2026
f33a85a
fixing test case test_style
direkkakkar319-ops Mar 21, 2026
7a0e8f5
fixing test case test_style
direkkakkar319-ops Mar 21, 2026
59c8bd1
fix: defer trailing underscore attribute assignment in fit() for impu…
direkkakkar319-ops Mar 22, 2026
fef5a3f
base transformers
direkkakkar319-ops Mar 26, 2026
2d3f734
discretisation
direkkakkar319-ops Mar 26, 2026
f95cb7f
scaling
direkkakkar319-ops Mar 26, 2026
9b2fa4c
creation
direkkakkar319-ops Mar 26, 2026
64c9e74
imputation
direkkakkar319-ops Mar 26, 2026
50075fb
transformation
direkkakkar319-ops Mar 26, 2026
f2e944f
tests
direkkakkar319-ops Mar 26, 2026
ca9241e
creation
direkkakkar319-ops Mar 26, 2026
d813e2d
verified changes for checks
direkkakkar319-ops Mar 26, 2026
519ca1d
verified changes for checks
direkkakkar319-ops Mar 26, 2026
44f75c0
value error
direkkakkar319-ops Mar 26, 2026
7b3c592
ADDED:`test_raises_non_fitted_error_when_error_during_fit`
direkkakkar319-ops Mar 27, 2026
6babea2
added:`test_raises_non_fitted_error_when_error_during_fit`
direkkakkar319-ops Mar 27, 2026
c945dee
addEd:`test_raises_non_fitted_error_when_error_during_fit`
direkkakkar319-ops Mar 27, 2026
b631f63
transformers
direkkakkar319-ops Mar 27, 2026
0c681f5
left
direkkakkar319-ops Mar 27, 2026
6898a3d
Updated the `DecisionTreeFeatures` and `GeoDistanceFeatures`
direkkakkar319-ops Mar 28, 2026
b807ef4
fixed `geo_features.py`
direkkakkar319-ops Mar 28, 2026
3de9968
Improved failure triggers
direkkakkar319-ops Mar 28, 2026
9a5973d
Improved failure triggers
direkkakkar319-ops Mar 28, 2026
aa75e97
fixed
direkkakkar319-ops Mar 28, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 14 additions & 33 deletions feature_engine/_base_transformers/base_numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,53 +28,34 @@ class BaseNumericalTransformer(
variable transformers, discretisers, math combination.
"""

def fit(self, X: pd.DataFrame) -> pd.DataFrame:
def _fit_setup(self, X: pd.DataFrame):
"""
Checks that input is a dataframe, finds numerical variables, or alternatively
checks that variables entered by the user are of type numerical.

Parameters
----------
X : Pandas DataFrame

y : Pandas Series, np.array. Default = None
Parameter is necessary for compatibility with sklearn Pipeline.

Raises
------
TypeError
If the input is not a Pandas DataFrame or a numpy array
If any of the user provided variables are not numerical
ValueError
If there are no numerical variables in the df or the df is empty
If the variable(s) contain null values

Returns
-------
X : Pandas DataFrame
The same dataframe entered as parameter
Check dataframe, find numerical variables, check for NA and Inf.
Returns the checked dataframe and the correctly identified numerical variables.
"""

# check input dataframe
X = check_X(X)

# find or check for numerical variables
if self.variables is None:
self.variables_ = find_numerical_variables(X)
variables_ = find_numerical_variables(X)
else:
self.variables_ = check_numerical_variables(X, self.variables)
variables_ = check_numerical_variables(X, self.variables)

# check if dataset contains na or inf
_check_contains_na(X, self.variables_)
_check_contains_inf(X, self.variables_)
_check_contains_na(X, variables_)
_check_contains_inf(X, variables_)

# save input features
self.feature_names_in_ = X.columns.tolist()
return X, variables_

# save train set shape
def _get_feature_names_in(self, X):
"""Get the names and number of features in the train set (the dataframe
used during fit)."""

self.feature_names_in_ = X.columns.to_list()
self.n_features_in_ = X.shape[1]

return X
return self

def _check_transform_input_and_state(self, X: pd.DataFrame) -> pd.DataFrame:
"""
Expand Down
23 changes: 11 additions & 12 deletions feature_engine/_base_transformers/mixins.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Dict, List, Union
from typing import Dict, List, Tuple, Union

import pandas as pd
from numpy import ndarray
Expand Down Expand Up @@ -46,7 +46,9 @@ def transform_x_y(self, X: pd.DataFrame, y: pd.Series):


class FitFromDictMixin:
def _fit_from_dict(self, X: pd.DataFrame, user_dict_: Dict) -> pd.DataFrame:
def _fit_from_dict(
self, X: pd.DataFrame, user_dict_: Dict
) -> Tuple[pd.DataFrame, List[Union[str, int]]]:
"""
Checks that input is a dataframe, checks that variables in the dictionary
entered by the user are of type numerical.
Expand All @@ -71,25 +73,22 @@ def _fit_from_dict(self, X: pd.DataFrame, user_dict_: Dict) -> pd.DataFrame:
-------
X : Pandas DataFrame
The same dataframe entered as parameter

variables_ : List
The variables in the dictionary.
"""
# check input dataframe
X = check_X(X)

# find or check for numerical variables
variables = list(user_dict_.keys())
self.variables_ = check_numerical_variables(X, variables)
variables_ = check_numerical_variables(X, variables)

# check if dataset contains na or inf
_check_contains_na(X, self.variables_)
_check_contains_inf(X, self.variables_)

# save input features
self.feature_names_in_ = X.columns.tolist()

# save train set shape
self.n_features_in_ = X.shape[1]
_check_contains_na(X, variables_)
_check_contains_inf(X, variables_)

return X
return X, variables_


class GetFeatureNamesOutMixin:
Expand Down
12 changes: 8 additions & 4 deletions feature_engine/creation/cyclical_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,11 +147,15 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
It is not needed in this transformer. You can pass y or None.
"""
if self.max_values is None:
X = super().fit(X)
self.max_values_ = X[self.variables_].max().to_dict()
X, variables_ = self._fit_setup(X)
max_values_ = X[variables_].max().to_dict()
else:
super()._fit_from_dict(X, self.max_values)
self.max_values_ = self.max_values
X, variables_ = super()._fit_from_dict(X, self.max_values)
max_values_ = self.max_values

self.variables_ = variables_
self.max_values_ = max_values_
self._get_feature_names_in(X)

return self

Expand Down
23 changes: 4 additions & 19 deletions feature_engine/creation/decision_tree_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,27 +260,25 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
y: pandas Series or np.array = [n_samples,]
The target variable that is used to train the decision tree.
"""
# confirm model type and target variables are compatible.
X, y = check_X_y(X, y)
if self.regression is True:
if type_of_target(y) == "binary":
raise ValueError(
"Trying to fit a regression to a binary target is not "
"allowed by this transformer. Check the target values "
"or set regression to False."
)
is_binary = None
else:
check_classification_targets(y)
self._is_binary = type_of_target(y)

X, y = check_X_y(X, y)
is_binary = type_of_target(y)

# find or check for numerical variables
if self.variables is None:
variables_ = find_numerical_variables(X)
else:
variables_ = check_numerical_variables(X, self.variables)

# check if dataset contains na or inf
_check_contains_na(X, variables_)
_check_contains_inf(X, variables_)

Expand All @@ -289,7 +287,6 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
else:
param_grid = {"max_depth": [1, 2, 3, 4]}

# get the sets of variables that will be used to create new features
input_features = self._create_variable_combinations(
how_to_combine=self.features_to_combine, variables=variables_
)
Expand All @@ -298,7 +295,6 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
for features in input_features:
estimator = self._make_decision_tree(param_grid=param_grid)

# single feature models
if isinstance(features, str):
estimator.fit(X[features].to_frame(), y)
# multi feature models
Expand All @@ -310,6 +306,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
self.variables_ = variables_
self.input_features_ = input_features
self.estimators_ = estimators_
self._is_binary = is_binary
self.feature_names_in_ = X.columns.tolist()
self.n_features_in_ = X.shape[1]

Expand All @@ -330,24 +327,17 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
Either the original dataframe plus the new features or
a dataframe of only the new features.
"""
# Check method fit has been called
check_is_fitted(self)

# check that input is a dataframe
X = check_X(X)

# Check if input data contains same number of columns as dataframe used to fit.
_check_X_matches_training_df(X, self.n_features_in_)

# check if dataset contains na or inf
_check_contains_na(X, self.variables_)
_check_contains_inf(X, self.variables_)

# reorder variables to match train set
X = X[self.feature_names_in_]

# create new features and add them to the original dataframe
# if regression or multiclass, we return the output of predict()
if self.regression is True:
for features, estimator in zip(self.input_features_, self.estimators_):
if isinstance(features, str):
Expand All @@ -361,7 +351,6 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
preds = np.round(preds, self.precision)
X.loc[:, f"tree({features})"] = preds

# if binary classification, we return the probability
elif self._is_binary == "binary":
for features, estimator in zip(self.input_features_, self.estimators_):
if isinstance(features, str):
Expand All @@ -375,7 +364,6 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
preds = np.round(preds, self.precision)
X.loc[:, f"tree({features})"] = preds[:, 1]

# if multiclass, we return the output of predict()
else:
for features, estimator in zip(self.input_features_, self.estimators_):
if isinstance(features, str):
Expand Down Expand Up @@ -437,7 +425,6 @@ def _create_variable_combinations(
else:
combos.append(list(feature))

# if output_features is None, int or list.
else:
if how_to_combine is None:
if len(variables) == 1:
Expand All @@ -452,7 +439,6 @@ def _create_variable_combinations(
els = [list(x) for x in itertools.combinations(variables, i)]
combos += els

# output_feature is a list
else:
for i in how_to_combine:
els = [list(x) for x in itertools.combinations(variables, i)]
Expand All @@ -465,7 +451,6 @@ def _get_new_features_name(self) -> List:
feature_names = [f"tree({combo})" for combo in self.input_features_]
return feature_names

# for the check_estimator tests
def _more_tags(self):
tags_dict = _return_tags()
tags_dict["requires_y"] = True
Expand Down
12 changes: 7 additions & 5 deletions feature_engine/creation/geo_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,26 +234,25 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
# check input dataframe
X = check_X(X)

# Store coordinate variables
self.variables_: List[Union[str, int]] = [
variables: List[Union[str, int]] = [
self.lat1,
self.lon1,
self.lat2,
self.lon2,
]

# Check all coordinate columns exist
missing = set(self.variables_) - set(X.columns)
missing = set(variables) - set(X.columns)
if missing:
raise ValueError(
f"Coordinate columns {missing} are not present in the dataframe."
)

# Check coordinate columns are numerical
check_numerical_variables(X, self.variables_)
check_numerical_variables(X, variables)

# Check for missing values
_check_contains_na(X, self.variables_)
_check_contains_na(X, variables)

# Validate coordinate ranges if enabled
if self.validate_ranges:
Expand All @@ -269,6 +268,9 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
f"Longitude values in '{lon_col}' must be between -180 and 180."
)

# save coordinate variables
self.variables_ = variables

# save input features
self.feature_names_in_ = X.columns.tolist()

Expand Down
4 changes: 3 additions & 1 deletion feature_engine/discretisation/arbitrary.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,10 +151,12 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
y is not needed in this transformer. You can pass y or None.
"""
# check input dataframe
X = super()._fit_from_dict(X, self.binning_dict)
X, variables_ = super()._fit_from_dict(X, self.binning_dict)

# for consistency with the rest of the discretisers, we add this attribute
self.variables_ = variables_
self.binner_dict_ = self.binning_dict
self._get_feature_names_in(X)

return self

Expand Down
11 changes: 7 additions & 4 deletions feature_engine/discretisation/decision_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ def __init__(
self.param_grid = param_grid
self.random_state = random_state

def fit(self, X: pd.DataFrame, y: pd.Series): # type: ignore
def fit(self, X: pd.DataFrame, y: pd.Series):
"""
Fit one decision tree per variable to discretize with cross-validation and
grid-search for hyperparameters.
Expand All @@ -241,7 +241,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series): # type: ignore
check_classification_targets(y)

# check input dataframe
X = super().fit(X)
X, variables_ = self._fit_setup(X)

if self.param_grid:
param_grid = self.param_grid
Expand All @@ -251,7 +251,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series): # type: ignore
binner_dict_ = {}
scores_dict_ = {}

for var in self.variables_:
for var in variables_:

if self.regression:
model = DecisionTreeRegressor(random_state=self.random_state)
Expand All @@ -269,7 +269,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series): # type: ignore
scores_dict_[var] = tree_model.score(X[var].to_frame(), y)

if self.bin_output != "prediction":
for var in self.variables_:
for var in variables_:
clf = binner_dict_[var].best_estimator_
threshold = clf.tree_.threshold
feature = clf.tree_.feature
Expand All @@ -280,6 +280,9 @@ def fit(self, X: pd.DataFrame, y: pd.Series): # type: ignore

self.binner_dict_ = binner_dict_
self.scores_dict_ = scores_dict_
self.variables_ = variables_
self._get_feature_names_in(X)

return self

def transform(self, X: pd.DataFrame) -> pd.DataFrame:
Expand Down
11 changes: 7 additions & 4 deletions feature_engine/discretisation/equal_frequency.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,17 +159,20 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
"""

# check input dataframe
X = super().fit(X)
X, variables_ = self._fit_setup(X)

self.binner_dict_ = {}
binner_dict_ = {}

for var in self.variables_:
for var in variables_:
tmp, bins = pd.qcut(x=X[var], q=self.q, retbins=True, duplicates="drop")

# Prepend/Append infinities to accommodate outliers
bins = list(bins)
bins[0] = float("-inf")
bins[len(bins) - 1] = float("inf")
self.binner_dict_[var] = bins
binner_dict_[var] = bins

self.binner_dict_ = binner_dict_
self.variables_ = variables_
self._get_feature_names_in(X)
return self
Loading