决策树(Decision Trees)
- 简介
- 基本思想
先从X轴观察,在X = 3时,样本点有一次明显的“突变”,我们以X = 3作为一次决策,进行一次划分:
再从Y轴观察,两类样本点在Y = 4 和Y = 2处可以进行划分,进而进行两次划分:
- 熵(entropy)
熵的定义:它是一系列样本中的不纯度的测量值(measure of impurity in a bunch of examples)
熵描述了数据的混乱程度,熵越大,混乱程度越高,也就是纯度越低;反之,熵越小,混乱程度越低,纯度越高。 熵的计算公式如下所示:$H(S) = -\sum_{i} p_i \log_2 p_i$,其中 $p_i$ 是第 $i$ 类样本在样本集 $S$ 中所占的比例。
- 信息增益
- 偏差(bias)与方差(variance)
- 代码实现
环境:macOS Mojave 10.14.3
Python 3.7.0
使用库:scikit-learn 0.19.2
>>> from sklearn import tree
>>> X = [[0, 0], [1, 1]] #两个样本点
>>> Y = [0, 1] #分别属于两个标签
>>> clf = tree.DecisionTreeClassifier() #进行分类
>>> clf = clf.fit(X, Y)
>>> clf.predict([[2., 2.]]) #预测新点
array([1]) #新点通过分类属于标签1
Main.py 主程序
import sys
from class_vis import prettyPicture, output_image
from prep_terrain_data import makeTerrainData

import matplotlib.pyplot as plt
import numpy as np
import pylab as pl

from classifyDT import classify

# NOTE(review): output_image is imported but only referenced by the
# commented-out call further down.

# makeTerrainData() returns the training and testing features/labels.
features_train, labels_train, features_test, labels_test = makeTerrainData()

### the classify() function in classifyDT is where the magic
### happens--fill in this function in the file 'classifyDT.py'!
clf = classify(features_train, labels_train)

#### grader code, do not modify below this line

# Plot the classifier's decision boundary together with the test points.
prettyPicture(clf, features_test, labels_test)

# Score the trained classifier on the held-out test data.
accuracy = clf.score(features_test, labels_test)
# output_image("test.png", "png", open("test.png", "rb").read())
print (accuracy)

acc = accuracy ### you fill this in!
classifyDT.py 决策树分类
def classify(features_train, labels_train):
    """Train a decision tree classifier on the given training data.

    Args:
        features_train: Training samples, one feature vector per sample.
        labels_train: Class label of each training sample.

    Returns:
        The fitted ``sklearn.tree.DecisionTreeClassifier``.
    """
    # Imported inside the function, as in the original listing.
    from sklearn.tree import DecisionTreeClassifier

    # random_state is pinned so repeated runs build the same tree.
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(features_train, labels_train)
    return clf
prep_terrain_data.py 生成训练点
import random


def makeTerrainData(n_points=1000):
    """Generate a random two-feature toy dataset and split it 75/25.

    Each sample is ``[grade, bumpy]`` with both values drawn from
    ``random.random()``. The label is ``round(grade*bumpy + 0.3 + 0.1*error)``
    where ``error`` is a third random draw, and it is forced to ``1.0``
    whenever ``grade`` or ``bumpy`` exceeds 0.8. In the original listing
    label 0 was grouped under "fast" and label 1 under "slow".

    Args:
        n_points: Total number of samples to generate.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)``; the first
        ``int(0.75 * n_points)`` samples form the training set and the
        remainder the test set.
    """
    ### make the toy dataset
    # Three separate passes keep the order of random draws unchanged.
    grade = [random.random() for _ in range(n_points)]
    bumpy = [random.random() for _ in range(n_points)]
    error = [random.random() for _ in range(n_points)]

    y = [round(g * b + 0.3 + 0.1 * e) for g, b, e in zip(grade, bumpy, error)]
    # Very steep or very bumpy terrain is always labelled 1.
    for ii, (g, b) in enumerate(zip(grade, bumpy)):
        if g > 0.8 or b > 0.8:
            y[ii] = 1.0

    ### split into train/test sets
    X = [[gg, ss] for gg, ss in zip(grade, bumpy)]
    split = int(0.75 * n_points)
    X_train, X_test = X[:split], X[split:]
    y_train, y_test = y[:split], y[split:]

    return X_train, y_train, X_test, y_test
class_vis.py 绘图与保存图像
import warnings
warnings.filterwarnings("ignore")

import matplotlib
# Select the non-interactive 'agg' backend before pyplot is imported.
matplotlib.use('agg')

import matplotlib.pyplot as plt
import pylab as pl
import numpy as np


def prettyPicture(clf, X_test, y_test):
    """Plot a classifier's decision regions with the test points on top.

    The classifier is evaluated on a mesh over [0, 1) x [0, 1) with step
    0.01, the predictions are drawn as a colour map, the test points are
    scattered over it, and the figure is written to ``test.png``.

    Args:
        clf: Fitted classifier; ``clf.predict`` is called with a two-column
            array and its result is reshaped to the mesh shape.
        X_test: Test samples, each indexable as ``[grade, bumpy]``.
        y_test: Label of each test sample; 0 is drawn as "fast" (blue) and
            1 as "slow" (red).
    """
    x_min, x_max = 0.0, 1.0
    y_min, y_max = 0.0, 1.0

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    h = .01  # step size in the mesh
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())

    plt.pcolormesh(xx, yy, Z, cmap=pl.cm.seismic)

    # Plot also the test points
    grade_sig = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==0]
    bumpy_sig = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==0]
    grade_bkg = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==1]
    bumpy_bkg = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==1]

    plt.scatter(grade_sig, bumpy_sig, color = "b", label="fast")
    plt.scatter(grade_bkg, bumpy_bkg, color = "r", label="slow")
    # Without a legend the "fast"/"slow" labels above are never displayed.
    plt.legend()
    # Feature 0 (grade) is on the x-axis and feature 1 (bumpiness) on the
    # y-axis, so the axes are labelled accordingly.
    plt.xlabel("grade")
    plt.ylabel("bumpiness")

    plt.savefig("test.png")
- 决策树的参数
acc_min_samples.py 对比 min_samples_split 取 2 与 50 时的准确率
import sys
from class_vis import prettyPicture
from prep_terrain_data import makeTerrainData

import matplotlib.pyplot as plt
import numpy as np
import pylab as pl

features_train, labels_train, features_test, labels_test = makeTerrainData()

########################## DECISION TREE #################################

### your code goes here--now create 2 decision tree classifiers,
### one with min_samples_split=2 and one with min_samples_split=50
### compute the accuracies on the testing data and store
### the accuracy numbers to acc_min_samples_split_2 and
### acc_min_samples_split_50, respectively
from sklearn.tree import DecisionTreeClassifier

clf1 = DecisionTreeClassifier(min_samples_split=2)
clf2 = DecisionTreeClassifier(min_samples_split=50)

# Both trees are trained on the same data; only min_samples_split differs.
clf1.fit(features_train, labels_train)
clf2.fit(features_train, labels_train)

acc_min_samples_split_2 = clf1.score(features_test, labels_test)
acc_min_samples_split_50 = clf2.score(features_test, labels_test)

print (acc_min_samples_split_2)
print (acc_min_samples_split_50)

#choose one of two
prettyPicture(clf1, features_test, labels_test)
# prettyPicture(clf2, features_test, labels_test)
上图,min_samples_split分别为2 和50
- 决策树的优点与缺点
