# main.py
from sklearn.preprocessing import normalize, MinMaxScaler
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import autoencoder
import data_set_preparations
import matplotlib.pyplot as plt
import clustering
import fit_to_external_classification
import predict_nuber_of_clusters
import seaborn as sns
def main():
    """
    Entry point. Clusters the data and compares the clustering methods.
    Please note that, in order to avoid many figures on the screen at once,
    the next figure will not appear until the current figure is closed.
    :return: None
    """
    data_set_number = 1
    ext = fit_to_external_classification
    estimator = predict_nuber_of_clusters
    linkage_options = ['ward', 'average', 'complete', 'single']

    def show_clusters(n_clusters, method, title, **options):
        # cluster the regular points and plot the resulting assignment
        assignment = clustering.cluster(reg_points, n_clusters, method, **options)
        clustering.plot_clustering(reg_points, assignment, title)

    def nmi_scores(n_clusters, method, **options):
        # NMI scores of one method against the external labels of the regular points
        real_labels = ext.get_real_labels_without_anomalies(data_set_number, is_anomaly)
        return ext.nmi_score(real_labels, reg_points, n_clusters=n_clusters, method=method, **options)

    # read and prepare the data (the returned value is not used further in this function)
    data_set_preparations.prepare_data_set(data_set_number)

    # plot a boxplot of each data set to visualize anomalies
    for number in (1, 2):
        plot_boxplot(number)

    # anomaly detection; the four values returned here are superseded just below
    # by the per-data-set getters
    _all_points, _anomalies, _regular, _anomalous = autoencoder.main()
    reg_points = autoencoder.get_reg_points(data_set_number)
    print('len of reg ', len(reg_points))
    anomalous_points = autoencoder.get_anomalous_points(data_set_number)
    print('len of anomalous ', len(anomalous_points))
    points = autoencoder.get_all_points(data_set_number)
    is_anomaly = autoencoder.get_is_anomaly_array(data_set_number)

    # scatter plot: regular points in cyan, anomalous points in red
    figure = plt.figure()
    axes = figure.add_subplot(111)
    plt.title('Anomalies detected by Autoencoder')
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')
    for cloud, colour in ((reg_points, 'c'), (anomalous_points, 'r')):
        axes.scatter(cloud[:, 0], cloud[:, 1], c=colour, alpha=0.8, s=8)
    plt.show()

    ext.plot_external_tag_distribution(data_set_number=data_set_number, points=points)

    # number of real labels (always reported for both data sets)
    for caption, number in (('data 1 real labels', 1), ('data 2 real labels', 2)):
        print(caption, len(np.unique(ext.get_real_labels(number))))

    # estimate the number of clusters for every method
    for method in ('K means', 'Hierarchical'):
        estimator.perform_elbow_method(reg_points, method)
    for method in ('GMM', 'Fuzzy C Means', 'Spectral'):
        estimator.perform_silhouette_method(reg_points, method)

    # cluster and plot
    for method, n_clusters in (('K means', 4), ('GMM', 2), ('Fuzzy C Means', 2)):
        show_clusters(n_clusters, method, method)
    for linkage in linkage_options:
        show_clusters(3, 'Hierarchical', 'Hierarchical ' + linkage, linkage=linkage)
    show_clusters(2, 'Spectral', 'Spectral')

    # statistical tests
    # map every method name to its list of NMI scores
    nmi_by_algorithm = {}
    for method, n_clusters in (('K means', 4), ('GMM', 2), ('Fuzzy C Means', 2), ('Spectral', 2)):
        nmi_by_algorithm[method] = nmi_scores(n_clusters, method)
    for linkage in linkage_options:
        nmi_by_algorithm['Hierarchical' + linkage] = nmi_scores(3, 'Hierarchical', linkage=linkage)

    print('u test')
    # NOTE(review): the message claims significance for every ordered pair; the
    # p-value is printed but never compared with 0.05 here - confirm this is intended.
    for first in nmi_by_algorithm:
        for second in nmi_by_algorithm:
            if first == second:
                continue
            p_value = ext.u_test(nmi_by_algorithm[first], nmi_by_algorithm[second])
            print(first, 'is significantly better than ', second, 'with p-value =$', p_value, ' <<0.05$')

    print('Average NMI Scores:')
    for name, scores in nmi_by_algorithm.items():
        print('for', name, 'the average NMI Score is ', sum(scores) / len(scores))

    # silhouette comparisons on all points (anomalies included)
    for method in ('K means', 'GMM', 'Fuzzy C Means', 'Spectral'):
        estimator.compare_silhouette_scores(points, method)
    for linkage in linkage_options:
        estimator.compare_silhouette_scores(points, 'Hierarchical', linkage)
    estimator.compare_silhouette_scores_between_different_methods(points)
def perform_pca(data, n_components=2):
    """
    Performs PCA algorithm for dimension reduction.
    :param data: data to perform the algorithm on
    :param n_components: number of dimensions to reduce to; defaults to 2,
                         the value that used to be hard-coded
    :return: points after dimension reduction
    """
    # fit the projection on the data and transform it in a single step
    return PCA(n_components=n_components).fit_transform(data)
def plot_boxplot(data_set_number):
    """
    Draw a box plot of a prepared data set so that anomalies can be seen.
    For data set 1 the last two columns are left out of the plot.
    :param data_set_number: number of data set - 1 or 2
    :return: None
    """
    frame = data_set_preparations.prepare_data_set(data_set_number)
    if data_set_number == 1:
        # remove the last and the second-to-last column before plotting
        trailing_columns = frame.columns[[-1, -2]]
        frame = frame.drop(columns=trailing_columns)

    plt.figure(figsize=(10, 10))
    plt.title("Box Plot")
    sns.boxplot(data=frame)
    # slanted, right-aligned tick labels
    plt.xticks(rotation=20, ha='right')
    plt.tight_layout()
    plt.show()
# Run the pipeline only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()