Skip to content

bool dtype not correctly supported #38

@simone-mangiante

Description

@simone-mangiante
  • DataSynthesizer version: 0.1.10
  • Python version: 3.7.12
  • Operating System: Debian 10

Description

DataDescriber does not handle bool dtypes well in the source dataset. When the CSV file has columns with only TRUE and FALSE as values, pandas reads such columns as bool dtype (not object); when DataDescriber infers attribute types, it ends up testing these boolean values as dates and fails.

What I Did

The source dataset is the telco-customer-churn dataset from Kaggle, after being imported in Google BigQuery and exported back to CSV, generating those TRUE and FALSE values instead of Yes and No. Below is my code:

from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.ModelInspector import ModelInspector
from DataSynthesizer.lib.utils import read_json_file, display_bayesian_network
import pandas as pd

# Input dataset. This CSV file has columns containing only TRUE and FALSE
# values, which pandas reads as bool dtype (not object) — the trigger for
# the crash reported below in DataDescriber's type inference.
input_data = "./out/from_bq.csv"

mode = 'correlated_attribute_mode'

# Location of the two output files.
description_file = f'./out/{mode}/description.json'
synthetic_data = f'./out/{mode}/synthetic_data.csv'

# An attribute is categorical if its domain size is less than this threshold.
threshold_value = 20

# List of discrete (categorical) columns and the primary key.
categorical_columns = ["gender", "Partner", "Dependents", "PhoneService", "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies", "Contract", "PaperlessBilling", "PaymentMethod", "TotalCharges", "Churn"]
primary_key_column = "customerID"

# Mark every listed column as categorical.
categorical_attributes = {column: True for column in categorical_columns}

# Specify which attributes are candidate keys of the input dataset.
candidate_keys = {primary_key_column: True}

# A parameter in Differential Privacy. It roughly means that removing a row in the input dataset will not
# change the probability of getting the same output more than a multiplicative difference of exp(epsilon).
# Increase epsilon value to reduce the injected noises. Set epsilon=0 to turn off differential privacy.
epsilon = 0

# The maximum number of parents in Bayesian network, i.e., the maximum number of incoming edges.
degree_of_bayesian_network = 2

# Number of tuples generated in synthetic dataset.
num_tuples_to_generate = 1000

# Build the Bayesian network. Reuse input_data here rather than repeating
# the path literal, so the path is defined in exactly one place.
describer = DataDescriber(category_threshold=threshold_value)
describer.describe_dataset_in_correlated_attribute_mode(dataset_file=input_data,
                                                        epsilon=epsilon,
                                                        k=degree_of_bayesian_network,
                                                        attribute_to_is_categorical=categorical_attributes,
                                                        attribute_to_is_candidate_key=candidate_keys)

# Save the output.
describer.save_dataset_description_to_file(description_file)

Here is the output:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
/tmp/ipykernel_4364/1321006366.py in <module>
     46                                                         k=degree_of_bayesian_network,
     47                                                         attribute_to_is_categorical=categorical_attributes,
---> 48                                                         attribute_to_is_candidate_key=candidate_keys)
     49 
     50 # save the output

/opt/conda/lib/python3.7/site-packages/DataSynthesizer/DataDescriber.py in describe_dataset_in_correlated_attribute_mode(self, dataset_file, k, epsilon, attribute_to_datatype, attribute_to_is_categorical, attribute_to_is_candidate_key, categorical_attribute_domain_file, numerical_attribute_ranges, seed)
    170                                                             categorical_attribute_domain_file,
    171                                                             numerical_attribute_ranges,
--> 172                                                             seed)
    173         self.df_encoded = self.encode_dataset_into_binning_indices()
    174         if self.df_encoded.shape[1] < 2:

/opt/conda/lib/python3.7/site-packages/DataSynthesizer/DataDescriber.py in describe_dataset_in_independent_attribute_mode(self, dataset_file, epsilon, attribute_to_datatype, attribute_to_is_categorical, attribute_to_is_candidate_key, categorical_attribute_domain_file, numerical_attribute_ranges, seed)
    118                                              categorical_attribute_domain_file,
    119                                              numerical_attribute_ranges,
--> 120                                              seed=seed)
    121 
    122         for column in self.attr_to_column.values():

/opt/conda/lib/python3.7/site-packages/DataSynthesizer/DataDescriber.py in describe_dataset_in_random_mode(self, dataset_file, attribute_to_datatype, attribute_to_is_categorical, attribute_to_is_candidate_key, categorical_attribute_domain_file, numerical_attribute_ranges, seed)
     85         self.attr_to_is_candidate_key = attribute_to_is_candidate_key
     86         self.read_dataset_from_csv(dataset_file)
---> 87         self.infer_attribute_data_types()
     88         self.analyze_dataset_meta()
     89         self.represent_input_dataset_by_columns()

/opt/conda/lib/python3.7/site-packages/DataSynthesizer/DataDescriber.py in infer_attribute_data_types(self)
    213                 # Sample 20 values to test its data_type.
    214                 samples = column_dropna.sample(20, replace=True)
--> 215                 if all(samples.map(is_datetime)):
    216                     self.attr_to_datatype[attr] = DataType.DATETIME
    217                 else:

/opt/conda/lib/python3.7/site-packages/pandas/core/series.py in map(self, arg, na_action)
   3980         dtype: object
   3981         """
-> 3982         new_values = super()._map_values(arg, na_action=na_action)
   3983         return self._constructor(new_values, index=self.index).__finalize__(
   3984             self, method="map"

/opt/conda/lib/python3.7/site-packages/pandas/core/base.py in _map_values(self, mapper, na_action)
   1158 
   1159         # mapper is a function
-> 1160         new_values = map_f(values, mapper)
   1161 
   1162         return new_values

pandas/_libs/lib.pyx in pandas._libs.lib.map_infer()

/opt/conda/lib/python3.7/site-packages/DataSynthesizer/datatypes/DateTimeAttribute.py in is_datetime(value)
     19               'dec', 'december'}
     20 
---> 21     value_lower = value.lower()
     22     if (value_lower in weekdays) or (value_lower in months):
     23         return False

AttributeError: 'bool' object has no attribute 'lower'

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions