-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtree.py
112 lines (85 loc) · 3.75 KB
/
tree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import pandas as pd
from pybrain.utilities import percentError
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.legend_handler import HandlerLine2D
from utils import to_xy, make_sure_path_exists
from sklearn import tree
# --- Configuration ---------------------------------------------------------
input_dir = 'data/'
base_file_name = 'data_banknote_authentication'
# Wavelet-transform features of the banknote images, plus entropy.
attributes = ['varianceWT', 'skewnessWT', 'curtosisWT', 'entropy']
label = 'class'
class_names = ['Forged', 'Genuine']
# Baseline tree hyperparameters (max_depth=None grows the tree fully).
max_depth = None
min_samples_split = 2
min_samples_leaf = 2

# --- Load the pre-split train/test CSVs and convert to feature/label arrays
train_data = pd.read_csv(input_dir + 'train_' + base_file_name + '.csv', sep=',')
test_data = pd.read_csv(input_dir + 'test_' + base_file_name + '.csv', sep=',')
train_x, train_y = to_xy(train_data, attributes, label)
test_x, test_y = to_xy(test_data, attributes, label)

# --- Train a decision tree and evaluate on the held-out test set -----------
clf = tree.DecisionTreeClassifier(max_depth=max_depth,
                                  min_samples_split=min_samples_split,
                                  min_samples_leaf=min_samples_leaf)
clf.fit(train_x, train_y)
out = clf.predict(test_x)
print('Output for test dataset:')
print(out)
# percentError reports misclassification as a percentage (0-100).
error = percentError(out, test_y)
print('Error rate: %.4f%%' % error)

# Export the fitted tree for rendering with Graphviz
# (e.g. https://dreampuf.github.io/GraphvizOnline/).
# FIX: export_graphviz returns None when out_file is given, so the former
# `dot_data = ...` assignment was always None; the dead binding is dropped.
make_sure_path_exists('tree_output')
tree.export_graphviz(clf, out_file="tree_output/tree.dot",
                     feature_names=attributes,
                     class_names=class_names,
                     filled=True, rounded=True,
                     special_characters=True)
# --- Sweep max_depth to visualize over-/underfitting -----------------------
# Train one tree per depth 1..10 and plot train vs. test error; a widening
# gap between the curves at larger depths indicates overfitting.
max_depths = np.linspace(1, 10, 10, endpoint=True)
train_results = []
test_results = []
for val in max_depths:
    # FIX: np.linspace yields floats, but scikit-learn requires max_depth to
    # be an int (recent versions raise on floats); cast as the other sweeps do.
    dt = tree.DecisionTreeClassifier(max_depth=int(val))
    dt.fit(train_x, train_y)
    train_out = dt.predict(train_x)
    test_out = dt.predict(test_x)
    # percentError returns a percentage; divide by 100 to plot a fraction.
    train_results.append(percentError(train_out, train_y) / 100)
    test_results.append(percentError(test_out, test_y) / 100)
line1, = plt.plot(max_depths, train_results, 'b', label='Train error')
line2, = plt.plot(max_depths, test_results, 'r', label='Test error')
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel('Error')
plt.xlabel('Max tree depth')
plt.show()
# --- Sweep min_samples_split to check underfitting -------------------------
# FIX: np.linspace(2, 10, 10) yields non-integer values (2.0, 2.89, 3.78, ...)
# which int(val) then truncated, so the setting 2 was trained twice while the
# x-axis showed the un-truncated floats — the plotted x did not match the
# parameter actually used. Sweep exact integers 2..11 instead (10 points).
min_samples_splits = np.arange(2, 12)
train_results = []
test_results = []
for val in min_samples_splits:
    dt = tree.DecisionTreeClassifier(min_samples_split=int(val))
    dt.fit(train_x, train_y)
    train_out = dt.predict(train_x)
    test_out = dt.predict(test_x)
    # percentError returns a percentage; divide by 100 to plot a fraction.
    train_results.append(percentError(train_out, train_y) / 100)
    test_results.append(percentError(test_out, test_y) / 100)
line1, = plt.plot(min_samples_splits, train_results, 'b', label='Train error')
line2, = plt.plot(min_samples_splits, test_results, 'r', label='Test error')
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel('Error')
plt.xlabel('Min samples split')
plt.show()
# Sweep min_samples_leaf over 1..10 and plot train vs. test error curves to
# gauge underfitting as the leaf-size constraint tightens.
min_samples_leafs = np.linspace(1, 10, 10, endpoint=True)
train_results, test_results = [], []
for leaf_size in min_samples_leafs:
    model = tree.DecisionTreeClassifier(min_samples_leaf=int(leaf_size))
    model.fit(train_x, train_y)
    # percentError yields a percentage; scale to a fraction for the plot.
    train_results.append(percentError(model.predict(train_x), train_y) / 100)
    test_results.append(percentError(model.predict(test_x), test_y) / 100)
line1, = plt.plot(min_samples_leafs, train_results, 'b', label='Train error')
line2, = plt.plot(min_samples_leafs, test_results, 'r', label='Test error')
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel('Error')
plt.xlabel('Min samples leaf')
plt.show()