ID3决策树算法实现（Python版）

 # -*- coding:utf-8 -*-

 from numpy import *

 import numpy as np

 import pandas as pd

 from math import log

 import operator

 #计算数据集的香农熵

 def calcShannonEnt(dataSet):
     """Return the base-2 Shannon entropy of the class labels in dataSet.

     The class label of every sample is read from its last element.
     An empty dataSet yields 0.0 because no label contributes a term.
     """
     total=len(dataSet)
     # Tally how many samples carry each class label.
     tally={}
     for sample in dataSet:
         label=sample[-1]
         tally[label]=tally.get(label,0)+1
     # Accumulate -p*log2(p) over the labels that were observed.
     entropy=0.0
     for count in tally.values():
         p=float(count)/total
         entropy-=p*log(p,2)
     return entropy

 #对离散变量划分数据集，取出该特征取值为value的所有样本

 def splitDataSet(dataSet,axis,value):
     """Select the samples whose feature at index axis equals value.

     Each returned sample is a new list with the column at axis removed;
     the samples in dataSet themselves are left untouched.
     """
     return [
         sample[:axis]+sample[axis+1:]
         for sample in dataSet
         if sample[axis]==value
     ]

 #对连续变量划分数据集，direction规定划分的方向，

 #决定是划分出小于value的数据样本还是大于value的数据样本集

 def splitContinuousDataSet(dataSet,axis,value,direction):
     """Select samples on one side of a threshold for a continuous feature.

     direction == 0 keeps the samples whose feature at index axis is
     strictly greater than value; any other direction keeps the samples
     whose feature is less than or equal to value.  Each returned sample
     is a new list with the column at axis removed.
     """
     keepGreater=(direction==0)
     selected=[]
     for sample in dataSet:
         if keepGreater:
             matches=sample[axis]>value
         else:
             matches=sample[axis]<=value
         if not matches:
             continue
         trimmed=sample[:axis]
         trimmed.extend(sample[axis+1:])
         selected.append(trimmed)
     return selected

 #选择最好的数据集划分方式

 def chooseBestFeatureToSplit(dataSet,labels):
     """Pick the feature whose split yields the largest information gain.

     A feature whose value in the first sample is an int or float is
     treated as continuous: the midpoints of consecutive sorted values
     are tried as thresholds and the one with the lowest weighted entropy
     is kept (the first such threshold wins ties).  Every other feature
     is treated as discrete and split on each distinct value.

     Side effects, only when the winning feature is continuous: its
     column in dataSet is overwritten in place with 1 (value <= threshold)
     or 0, and '<=' plus the threshold is appended to labels[bestFeature].

     Args:
         dataSet: non-empty list of samples (lists); the last element of
             each sample is its class label.
         labels: list of feature names, indexed like the feature columns.

     Returns:
         The index of the best feature, or -1 when no feature has a
         strictly positive information gain.  In the -1 case neither
         dataSet nor labels is modified.
     """
     numFeatures=len(dataSet[0])-1
     numSamples=float(len(dataSet))
     baseEntropy=calcShannonEnt(dataSet)
     bestInfoGain=0.0
     bestFeature=-1
     # Best threshold found for each continuous feature, keyed by label.
     bestSplitDict={}
     for i in range(numFeatures):
         featList=[example[i] for example in dataSet]
         if _isContinuousValue(featList[0]):
             # n samples give n-1 candidate thresholds (midpoints).
             sortfeatList=sorted(featList)
             splitList=[(low+high)/2.0
                        for low,high in zip(sortfeatList,sortfeatList[1:])]
             if not splitList:
                 # A single sample offers no threshold to evaluate.
                 continue
             bestSplitEntropy=float('inf')
             bestSplitValue=splitList[0]
             for value in splitList:
                 subDataSet0=splitContinuousDataSet(dataSet,i,value,0)
                 subDataSet1=splitContinuousDataSet(dataSet,i,value,1)
                 newEntropy=len(subDataSet0)/numSamples*calcShannonEnt(subDataSet0)
                 newEntropy+=len(subDataSet1)/numSamples*calcShannonEnt(subDataSet1)
                 if newEntropy<bestSplitEntropy:
                     bestSplitEntropy=newEntropy
                     bestSplitValue=value
             bestSplitDict[labels[i]]=bestSplitValue
             infoGain=baseEntropy-bestSplitEntropy
         else:
             # Discrete feature: weighted entropy over every distinct value.
             newEntropy=0.0
             for value in set(featList):
                 subDataSet=splitDataSet(dataSet,i,value)
                 newEntropy+=len(subDataSet)/numSamples*calcShannonEnt(subDataSet)
             infoGain=baseEntropy-newEntropy
         if infoGain>bestInfoGain:
             bestInfoGain=infoGain
             bestFeature=i
     # Binarize the winning continuous feature around its threshold.
     # The -1 guard matters: index -1 is the class-label column, which
     # must never be binarized or looked up in bestSplitDict.
     if bestFeature!=-1 and _isContinuousValue(dataSet[0][bestFeature]):
         bestSplitValue=bestSplitDict[labels[bestFeature]]
         labels[bestFeature]=labels[bestFeature]+'<='+str(bestSplitValue)
         for featVec in dataSet:
             featVec[bestFeature]=1 if featVec[bestFeature]<=bestSplitValue else 0
     return bestFeature

 def _isContinuousValue(value):
     """Return True when value is an int or float; bool does not count."""
     return isinstance(value,(int,float)) and not isinstance(value,bool)

 #特征若已经划分完，节点下的样本还没有统一取值，则需要进行投票

 def majorityCnt(classList):
     """Return the class label that occurs most often in classList.

     Used for majority voting when a node cannot be split any further
     but its samples still disagree on the class.  When several labels
     share the highest count, the one that appears first in classList
     is returned.

     Raises:
         ValueError: if classList is empty.
     """
     classCount={}
     for vote in classList:
         classCount[vote]=classCount.get(vote,0)+1
     # Compare by count: a plain max(classCount) would compare the labels
     # themselves and return the largest label instead of the majority.
     return max(classCount,key=classCount.get)

 #主程序，递归产生决策树

 def createTree(dataSet,labels,data_full,labels_full):
     """Recursively build the decision tree as nested dicts.

     Args:
         dataSet: non-empty list of samples (lists) for the current node;
             the last element of each sample is its class label.
         labels: feature names for the columns of dataSet.  This list is
             modified: the chosen feature's name is deleted from it (and
             chooseBestFeatureToSplit may rename a continuous feature).
         data_full: the complete training data, used to find values of a
             string-valued feature that do not occur in dataSet.
         labels_full: feature names for the columns of data_full.

     Returns:
         A class label when the node is a leaf, otherwise a dict of the
         form {featureLabel: {featureValue: subtree_or_class_label}}.
     """
     classList=[example[-1] for example in dataSet]
     # All samples agree on the class: this node is a leaf.
     if classList.count(classList[0])==len(classList):
         return classList[0]
     # No feature columns are left: fall back to a majority vote.
     if len(dataSet[0])==1:
         return majorityCnt(classList)
     bestFeat=chooseBestFeatureToSplit(dataSet,labels)
     # -1 means no feature gives a positive information gain.  Using it as
     # a column index would split on the class-label column itself, so
     # the node becomes a majority-vote leaf instead.
     if bestFeat==-1:
         return majorityCnt(classList)
     bestFeatLabel=labels[bestFeat]
     myTree={bestFeatLabel:{}}
     uniqueVals=set(example[bestFeat] for example in dataSet)
     # For a string-valued feature, collect the values present in the
     # full data but absent from this node's samples.
     unseenVals=set()
     if isinstance(dataSet[0][bestFeat],str):
         fullIndex=labels_full.index(bestFeatLabel)
         unseenVals=set(example[fullIndex] for example in data_full)-uniqueVals
     del(labels[bestFeat])
     # Grow one subtree for every value observed at this node.
     for value in uniqueVals:
         subLabels=labels[:]
         myTree[bestFeatLabel][value]=createTree(
             splitDataSet(dataSet,bestFeat,value),subLabels,data_full,labels_full)
     # Values never seen at this node get this node's majority class.
     if unseenVals:
         fallback=majorityCnt(classList)
         for value in unseenVals:
             myTree[bestFeatLabel][value]=fallback
     return myTree

 import matplotlib.pyplot as plt

 # Box style for internal (decision) nodes.
 decisionNode={"boxstyle":"sawtooth","fc":"0.8"}

 # Box style for leaf nodes.
 leafNode={"boxstyle":"round4","fc":"0.8"}

 # Arrow style for the connectors drawn by plotNode.
 arrow_args={"arrowstyle":"<-"}

 #计算树的叶子节点数量

 def getNumLeafs(myTree):
     """Count the leaves of a nested-dict decision tree.

     myTree has the form {featureLabel: {featureValue: child}}; a child
     that is a dict is a subtree, anything else counts as one leaf.
     """
     rootLabel=list(myTree)[0]
     branches=myTree[rootLabel]
     return sum(
         getNumLeafs(child) if type(child).__name__=='dict' else 1
         for child in branches.values()
     )

 #计算树的最大深度

 def getTreeDepth(myTree):
     """Return the maximum depth of a nested-dict decision tree.

     A node whose children are all leaves has depth 1; a node with no
     children at all has depth 0.
     """
     rootLabel=list(myTree)[0]
     branches=myTree[rootLabel]
     return max(
         (1+getTreeDepth(child) if type(child).__name__=='dict' else 1
          for child in branches.values()),
         default=0,
     )

 #画节点

 def plotNode(nodeTxt,centerPt,parentPt,nodeType):
     """Draw one tree node as an annotation on createPlot.ax1.

     The text box (styled by nodeType) is placed at centerPt and an
     arrow styled by arrow_args connects it with parentPt; both points
     are given in axes-fraction coordinates.
     """
     axes=createPlot.ax1
     axes.annotate(
         nodeTxt,
         xy=parentPt,
         xytext=centerPt,
         xycoords='axes fraction',
         textcoords='axes fraction',
         ha="center",
         va="center",
         bbox=nodeType,
         arrowprops=arrow_args,
     )

 #画箭头上的文字

 def plotMidText(cntrPt,parentPt,txtString):
     """Write txtString halfway between parentPt and cntrPt on createPlot.ax1.

     The x position is shifted left by 0.002 per character of txtString.
     """
     shift=len(txtString)*0.002
     midX=(parentPt[0]+cntrPt[0])/2.0-shift
     midY=(parentPt[1]+cntrPt[1])/2.0
     createPlot.ax1.text(midX,midY,txtString)

 def plotTree(myTree,parentPt,nodeTxt):
     """Recursively draw myTree below parentPt.

     Layout state lives in function attributes that createPlot sets
     before the first call: plotTree.totalW (leaf count of the whole
     tree), plotTree.totalD (depth of the whole tree), and the running
     cursor plotTree.x0ff / plotTree.y0ff.

     Args:
         myTree: nested dict {featureLabel: {featureValue: child}}.
         parentPt: (x, y) of the parent node in axes-fraction coordinates.
         nodeTxt: text written on the edge coming from the parent.
     """
     numLeafs=getNumLeafs(myTree)
     firstStr=list(myTree.keys())[0]
     # Center this node horizontally above the leaves of its own subtree.
     cntrPt=(plotTree.x0ff+(1.0+float(numLeafs))/2.0/plotTree.totalW,plotTree.y0ff)
     plotMidText(cntrPt,parentPt,nodeTxt)
     plotNode(firstStr,cntrPt,parentPt,decisionNode)
     secondDict=myTree[firstStr]
     # Move the cursor one level down while drawing the children.
     plotTree.y0ff=plotTree.y0ff-1.0/plotTree.totalD
     for key,child in secondDict.items():
         if isinstance(child,dict):
             plotTree(child,cntrPt,str(key))
         else:
             # Each leaf takes the next horizontal slot.
             plotTree.x0ff=plotTree.x0ff+1.0/plotTree.totalW
             plotNode(child,(plotTree.x0ff,plotTree.y0ff),cntrPt,leafNode)
             plotMidText((plotTree.x0ff,plotTree.y0ff),cntrPt,str(key))
     # Restore the vertical cursor for the caller's remaining siblings.
     plotTree.y0ff=plotTree.y0ff+1.0/plotTree.totalD

 def createPlot(inTree):
     """Draw the decision tree inTree in matplotlib figure 1 and show it.

     Creates a frameless, tick-free axes stored as createPlot.ax1,
     initializes the layout attributes that plotTree reads, and then
     draws the tree starting from the top center of the axes.
     """
     figure=plt.figure(1,facecolor='white')
     figure.clf()
     createPlot.ax1=plt.subplot(111,frameon=False,xticks=[],yticks=[])
     leafTotal=float(getNumLeafs(inTree))
     plotTree.totalW=leafTotal
     plotTree.totalD=float(getTreeDepth(inTree))
     # Start half a leaf slot to the left so the first leaf lands centered
     # in its slot once plotTree advances the cursor.
     plotTree.x0ff=-0.5/leafTotal
     plotTree.y0ff=1.0
     plotTree(inTree,(0.5,1.0),'')
     plt.show()

 # Load the data set; the first CSV column is dropped and the last
 # remaining column is used as the class label.
 df=pd.read_csv('watermelon_4_3.csv')
 labels=df.columns.values[1:-1].tolist()
 data=df.values[:,1:].tolist()

 # createTree modifies the list passed as labels, so separate copies of
 # the label list and the row list are passed as the "full" references.
 labels_full=list(labels)
 data_full=list(data)

 myTree=createTree(data,labels,data_full,labels_full)
 print(myTree)
 createPlot(myTree)

最终结果如下：

{'texture': {'blur': 0, 'little_blur': {'touch': {'soft_stick': 1, 'hard_smooth': 0}}, 'distinct': {'density<=0.38149999999999995': {0: 1, 1: 0}}}}

得到的决策树如下：

参考资料：

《机器学习实战》

《机器学习》周志华著

ID3决策树算法实现（Python版）的更多相关文章

python机器学习笔记 ID3决策树算法实战
前面学习了决策树的算法原理,这里继续对代码进行深入学习,并掌握ID3的算法实践过程. ID3算法是一种贪心算法,用来构造决策树,ID3算法起源于概念学习系统(CLS),以信息熵的下降速度为选取测试属性 ...
day-8 python自带库实现ID3决策树算法
前一天,我们基于sklearn科学库实现了ID3的决策树程序,本文将基于python自带库实现ID3决策树算法. 一.代码涉及基本知识 1. 为了绘图方便,引入了一个第三方treePlotter模块进 ...
机器学习-ID3决策树算法（附matlab/octave代码）
ID3决策树算法是基于信息增益来构建的,信息增益可以由训练集的信息熵算得,这里举一个简单的例子 data=[心情好天气好出门心情好天气不好出门心情不好天气好出门心情不好天气不好 ...
ID3决策树算法原理及C++实现(其中代码转自别人的博客)
分类是数据挖掘中十分重要的组成部分.分类作为一种有监督学习方式被广泛地使用. 在之前的"数据挖掘中十大经典算法"一文中,基于ID3核心思想的分类算法C4.5榜上有名.所以不难看出ID3 ...
Kaggle竞赛入门：决策树算法的Python实现
本文翻译自kaggle learn,也就是kaggle官方最快入门kaggle竞赛的教程,强调python编程实践和数学思想(而没有涉及数学细节),笔者在不影响算法和程序理解的基础上删除了一些不必要的 ...
决策树算法的Python实现—基于金融场景实操
决策树是最经常使用的数据挖掘算法,本次分享jacky带你深入浅出,走进决策树的世界基本概念决策树(Decision Tree) 它通过对训练样本的学习,并建立分类规则,然后依据分类规则,对新样本数 ...
决策树算法——ID3
决策树算法是一种有监督的分类学习算法.利用经验数据建立最优分类树,再用分类树预测未知数据. 例子:利用学生上课与作业状态预测考试成绩. 上述例子包含两个可以观测的属性:上课是否认真,作业是否认真,并以 ...
机器学习回顾篇（7）：决策树算法（ID3、C4.5）
.caret, .dropup > .btn > .caret { border-top-color: #000 !important; } .label { border: 1px so ...
决策树算法原理(ID3，C4.5)
决策树算法原理(CART分类树) CART回归树决策树的剪枝决策树可以作为分类算法,也可以作为回归算法,同时特别适合集成学习比如随机森林. 1. 决策树ID3算法的信息论基础 1970年昆兰找 ...

随机推荐

Ajax与XMLHttpRequest随笔
1.XMLHttpRequest对象创建XHR对象:let xhr = new XMLHttpRequest(); open():启动一个请求准备发送 open()接收3个参数:请求类型('GET' ...
【BZOJ1859】【ZJOI2006】碗的叠放
题目大意:给你n个碗,求如何堆叠,使得它们的总高度最低. 首先,我们枚举碗的叠放顺序. 假设我们已经堆好了前i个碗,那么在堆第i+1个碗时,我们要将第i+1个碗与前i个碗比较,确定第i+1个碗的离地高 ...
SLPA（Speaker-Listener Label Propagation Algorithm）社区发现算法
其中部分转载的社区发现SLPA算法文章一.概念社区(community)定义:同一社区内的节点与节点之间关系紧密,而社区与社区之间的关系稀疏. 设图G=G(V,E),所谓社区发现是指在图G中确定n ...
Java8-Guava实战示例
示例一: 跟示例三对比一下,尽量用示例三 List<InvoiceQueryBean> invoiceQueryBeanList = new ArrayList<>(); ...
JDBC链接oracle数据库
package test; import java.sql.* ; public class JDBC_Test { //orcl为oracle数据库中的数据库名,localhost表示连接本机的or ...
Android_设置全屏的方法
在开发的过程中,我们有时候需要让我们应用程序全屏或者是让某个页面全屏,在今天的android小技巧中我们来讲讲如何设置我们的应用程序全屏: 通常我们有三种方式: 1.在onCreate方法中添加代码 ...
CART树
算法概述 CART(Classification And Regression Tree)算法是一种决策树分类方法. 它采用一种二分递归分割的技术,分割方法采用基于最小距离的基尼指数估计函数,将当前的 ...
查看本机的ip
1.Windows 1.1.查看ip地址 ipconfig 1.2.查看MAC地址 ipconfig /all 2.Linux 2.1.查看ip地址 ifconfig 2.2.查看MAC地址 ip l ...
Integer.parseInt() 和 valueOf()
parseInt("1")返回的是int类型,所以如果想要将一个String类型的数字串转为原始类型int ,建议使用这个方法, 而不是使用 valueOf("1&quo ...
java.lang 类String
方法摘要1 charcharAt(int index) 返回指定索引处的 char 值. index - char 值的索引.2 string concat( ...

ID3决策树算法实现（Python版）

ID3决策树算法实现（Python版）的更多相关文章

随机推荐

热门专题