-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_set_preparations.py
More file actions
93 lines (83 loc) · 3.23 KB
/
data_set_preparations.py
File metadata and controls
93 lines (83 loc) · 3.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import normalize
def prepare_data_set(number_of_data_set):
    """
    Prepare the requested data set for clustering.

    :param number_of_data_set: number of the data set to prepare (1 or 2)
    :return: prepared data (pandas DataFrame)
    :raises ValueError: if number_of_data_set is not 1 or 2
    """
    if number_of_data_set == 1:
        return prepare_data_set1()
    if number_of_data_set == 2:
        return prepare_data_set2()
    # ValueError is the precise exception for a bad argument value; it is
    # still caught by any caller handling the old bare Exception.
    raise ValueError("No such data set")
def prepare_data_set1():
    """
    Prepare the first data set for clustering.

    Reads every 10th row of dataset/allUsers.lcl.csv, treats '?' as
    missing, imputes missing values with the column median, drops the
    'Class' label column, then scales and normalizes the features.

    :return: prepared data as a DataFrame keeping the original feature names
    """
    data = pd.read_csv("dataset/allUsers.lcl.csv", skiprows=lambda x: x % 10 != 0)
    # '?' marks missing values in this data set; replace so pandas sees NaN
    data = data.replace({'?': None})
    # coerce object-typed columns to float64 so medians can be computed
    for column in data.columns:
        if data.dtypes[column] == 'object':
            data[column] = data[column].astype('float64')
    print("Missing values")
    print(data.isna().sum())
    print("Impute missing values with the median value and check again for missing values:")
    # impute with median
    for column in data.columns:
        data.loc[data[column].isnull(), column] = data[column].median()
    print(data.isna().sum())
    print("There are no missing values now")
    # the class label must not influence unsupervised clustering
    data = data.drop(columns=['Class'])
    print(data.columns)
    # BUG FIX: the original printed the bound method (data.info); call it
    data.info()
    print(data.dtypes)
    # remember the feature names: scaling/normalizing returns a bare ndarray
    feature_names = data.columns
    # scale the data
    scaled = scale_the_data(data)
    # normalize the data
    normalized = normalize(scaled)
    # BUG FIX: the original built the DataFrame with columns=data[0, 0:],
    # i.e. the first data row's float values became the column labels,
    # discarding the real feature names. Restore them instead.
    data = pd.DataFrame(data=normalized, columns=feature_names)
    print("data after scale info")
    data.info()
    return data
def prepare_data_set2():
    """
    Prepare the second data set (HTRU_2) for clustering.

    Reads every 3rd row of dataset/HTRU_2.csv, drops the 'Class' label
    column, then scales and normalizes the features.

    :return: prepared data as a DataFrame keeping the original feature names
    """
    data = pd.read_csv("dataset/HTRU_2.csv",
                       names=['Mean of the integrated profile', 'Standard deviation of the integrated profile',
                              'Excess kurtosis of the integrated profile', 'Skewness of the integrated profile',
                              'Mean of the DM-SNR curve', 'Standard deviation of the DM-SNR curve',
                              'Excess kurtosis of the DM-SNR curve', 'Skewness of the DM-SNR curve',
                              'Class'], skiprows=lambda x: x % 3 != 0)
    # the class label must not influence unsupervised clustering
    data = data.drop(columns=['Class'])
    print(data.columns)
    # BUG FIX: the original printed the bound method (data.info); call it
    data.info()
    print(data.dtypes)
    # remember the feature names: scaling/normalizing returns a bare ndarray
    feature_names = data.columns
    # scale the data
    scaled = scale_the_data(data)
    # normalize the data
    normalized = normalize(scaled)
    # BUG FIX: the original built the DataFrame with columns=data[0, 0:],
    # i.e. the first data row's float values became the column labels,
    # discarding the real feature names. Restore them instead.
    data = pd.DataFrame(data=normalized, columns=feature_names)
    print("data after scale info")
    data.info()
    print(data.dtypes)
    return data
def scale_the_data(data):
    """
    Scale the features of the given data with a robust scaler.

    :param data: data to scale
    :return: scaled data
    """
    # RobustScaler was chosen over MinMaxScaler (see the commented
    # alternative in history) — fit and transform in one step.
    return RobustScaler().fit_transform(data)
# Script entry point: when run directly, prepare the second data set
# (HTRU_2) — primarily useful for eyeballing the printed diagnostics.
if __name__ == '__main__':
    prepare_data_set2()