# C5.0-in-Python/C5.0.py at master · tkhan11/C5.0-in-Python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#Importing packages
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import  DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

# Load the wine-quality data set into a DataFrame.
# NOTE(review): read_csv is called with its default comma delimiter. If the
# local file is a semicolon-delimited copy, pass sep=";" -- confirm against
# the CSV that ships next to this script.
wine = pd.read_csv("./winequality-white.csv")
print("Printing the first 20 tuples of the wine dataset:\n", wine.head(20), "\n")

# Data processing
x = wine.iloc[:, 0:11]  # features: every row, first 11 columns
y = wine['quality']     # target: the quality label of each row


print("Features:\n", x, "\n")
print("Quality Labels:\n", y, "\n")

print("Different Quality labels present in the wine dataset:", wine['quality'].unique(), "\n")

print("Labels counts:\n", wine.quality.value_counts(), "\n")

colnames = list(wine.columns)
print("Column names:\n", colnames, "\n")


# Split into training (80%) and test (20%) sets; the seed is fixed so the
# same rows land in each set on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=40)


# Report the shapes, as the label says, rather than dumping the four
# datasets themselves.
print("Shape of train and test datasets:\n",
      x_train.shape, x_test.shape, y_train.shape, y_test.shape, "\n")


#### Building an entropy-based Decision Tree Classifier (C5.0-style)

# NOTE: scikit-learn does not implement C5.0. DecisionTreeClassifier is an
# optimised CART; criterion='entropy' only switches the split-quality measure
# to the entropy-based one that the ID3/C4.5/C5.0 family relies on. The
# result is therefore an approximation of a C5.0 tree, not C5.0 itself.
#
# random_state is fixed because scikit-learn permutes the features at every
# split, so equally good splits could otherwise yield a different tree on
# each run. The train/test split already uses a fixed seed for the same
# reason.
model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    max_leaf_nodes=7,
    random_state=40,
)
model.fit(x_train, y_train)


# Plotting the decision tree

# tree.plot_tree(model) followed by plt.show() would draw the default
# version of the tree, without feature or class names.

# Feature names are taken from the frame the model was fitted on.
attributes = list(x.columns)

# Class names must line up with the classes the model actually learned, in
# ascending order. model.classes_ holds exactly those labels; they are
# converted to strings for plot_tree. A hard-coded list would mislabel nodes
# whenever a quality value is absent from the training split.
quality_values = [str(label) for label in model.classes_]

fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(5, 3), dpi=370)
tree.plot_tree(model,
               feature_names=attributes,
               class_names=quality_values,
               filled=True,
               ax=axes)  # draw on the axes created above
plt.show()


# Predict a quality label for every row of the held-out test set.
preds = model.predict(x_test)

# Class distribution of the true test labels, for reading the metrics below.
print("Printing Quality labels test data value counts:\n", y_test.value_counts(), "\n")

# Result evaluation: per-class precision/recall/F1, the confusion matrix
# (rows = true labels, columns = predicted labels) and overall accuracy.
print("Classification report:\n", classification_report(y_test, preds))

print("Confusion Matrix:\n", confusion_matrix(y_test, preds))

print("\nAccuracy:", accuracy_score(y_test, preds))