Skip to content

bool dtype not correctly supported #38

@simone-mangiante

Description

@simone-mangiante
  • DataSynthesizer version: 0.1.10
  • Python version: 3.7.12
  • Operating System: Debian 10

Description

DataDescriber does not handle bool dtypes well in the source dataset. When the CSV file has columns with only TRUE and FALSE as values, pandas reads such columns as bool dtype (not object); when DataDescriber infers attribute types, it ends up testing these boolean values as dates and fails.

What I Did

The source dataset is the telco-customer-churn dataset from Kaggle, after being imported in Google BigQuery and exported back to CSV, generating those TRUE and FALSE values instead of Yes and No. Below is my code:

from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.ModelInspector import ModelInspector
from DataSynthesizer.lib.utils import read_json_file, display_bayesian_network
import pandas as pd

# Input dataset. This CSV file has columns containing only TRUE and FALSE
# values, which pandas reads as bool dtype (not object) — the trigger for
# the crash reported below in DataDescriber's type inference.
input_data = "./out/from_bq.csv"

mode = 'correlated_attribute_mode'

# Location of the two output files.
description_file = f'./out/{mode}/description.json'
synthetic_data = f'./out/{mode}/synthetic_data.csv'

# An attribute is categorical if its domain size is less than this threshold.
threshold_value = 20

# List of discrete (categorical) columns and the primary key.
categorical_columns = ["gender", "Partner", "Dependents", "PhoneService", "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies", "Contract", "PaperlessBilling", "PaymentMethod", "TotalCharges", "Churn"]
primary_key_column = "customerID"

# Mark every listed column as categorical.
categorical_attributes = {column: True for column in categorical_columns}

# Specify which attributes are candidate keys of the input dataset.
candidate_keys = {primary_key_column: True}

# A parameter in Differential Privacy. It roughly means that removing a row in the input dataset will not
# change the probability of getting the same output more than a multiplicative difference of exp(epsilon).
# Increase epsilon value to reduce the injected noises. Set epsilon=0 to turn off differential privacy.
epsilon = 0

# The maximum number of parents in Bayesian network, i.e., the maximum number of incoming edges.
degree_of_bayesian_network = 2

# Number of tuples generated in synthetic dataset.
num_tuples_to_generate = 1000

# Build the Bayesian network. Reuse input_data here rather than repeating
# the path literal, so the path is defined in exactly one place.
describer = DataDescriber(category_threshold=threshold_value)
describer.describe_dataset_in_correlated_attribute_mode(dataset_file=input_data,
                                                        epsilon=epsilon,
                                                        k=degree_of_bayesian_network,
                                                        attribute_to_is_categorical=categorical_attributes,
                                                        attribute_to_is_candidate_key=candidate_keys)

# Save the output.
describer.save_dataset_description_to_file(description_file)

Here is the output:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
/tmp/ipykernel_4364/1321006366.py in <module>
     46                                                         k=degree_of_bayesian_network,
     47                                                         attribute_to_is_categorical=categorical_attributes,
---> 48                                                         attribute_to_is_candidate_key=candidate_keys)
     49 
     50 # save the output

/opt/conda/lib/python3.7/site-packages/DataSynthesizer/DataDescriber.py in describe_dataset_in_correlated_attribute_mode(self, dataset_file, k, epsilon, attribute_to_datatype, attribute_to_is_categorical, attribute_to_is_candidate_key, categorical_attribute_domain_file, numerical_attribute_ranges, seed)
    170                                                             categorical_attribute_domain_file,
    171                                                             numerical_attribute_ranges,
--> 172                                                             seed)
    173         self.df_encoded = self.encode_dataset_into_binning_indices()
    174         if self.df_encoded.shape[1] < 2:

/opt/conda/lib/python3.7/site-packages/DataSynthesizer/DataDescriber.py in describe_dataset_in_independent_attribute_mode(self, dataset_file, epsilon, attribute_to_datatype, attribute_to_is_categorical, attribute_to_is_candidate_key, categorical_attribute_domain_file, numerical_attribute_ranges, seed)
    118                                              categorical_attribute_domain_file,
    119                                              numerical_attribute_ranges,
--> 120                                              seed=seed)
    121 
    122         for column in self.attr_to_column.values():

/opt/conda/lib/python3.7/site-packages/DataSynthesizer/DataDescriber.py in describe_dataset_in_random_mode(self, dataset_file, attribute_to_datatype, attribute_to_is_categorical, attribute_to_is_candidate_key, categorical_attribute_domain_file, numerical_attribute_ranges, seed)
     85         self.attr_to_is_candidate_key = attribute_to_is_candidate_key
     86         self.read_dataset_from_csv(dataset_file)
---> 87         self.infer_attribute_data_types()
     88         self.analyze_dataset_meta()
     89         self.represent_input_dataset_by_columns()

/opt/conda/lib/python3.7/site-packages/DataSynthesizer/DataDescriber.py in infer_attribute_data_types(self)
    213                 # Sample 20 values to test its data_type.
    214                 samples = column_dropna.sample(20, replace=True)
--> 215                 if all(samples.map(is_datetime)):
    216                     self.attr_to_datatype[attr] = DataType.DATETIME
    217                 else:

/opt/conda/lib/python3.7/site-packages/pandas/core/series.py in map(self, arg, na_action)
   3980         dtype: object
   3981         """
-> 3982         new_values = super()._map_values(arg, na_action=na_action)
   3983         return self._constructor(new_values, index=self.index).__finalize__(
   3984             self, method="map"

/opt/conda/lib/python3.7/site-packages/pandas/core/base.py in _map_values(self, mapper, na_action)
   1158 
   1159         # mapper is a function
-> 1160         new_values = map_f(values, mapper)
   1161 
   1162         return new_values

pandas/_libs/lib.pyx in pandas._libs.lib.map_infer()

/opt/conda/lib/python3.7/site-packages/DataSynthesizer/datatypes/DateTimeAttribute.py in is_datetime(value)
     19               'dec', 'december'}
     20 
---> 21     value_lower = value.lower()
     22     if (value_lower in weekdays) or (value_lower in months):
     23         return False

AttributeError: 'bool' object has no attribute 'lower'

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions