环境:ubuntu 16.04 python 3.6

数据来源:UCI wine_data(比较经典的酒数据)

决策树要点:

1、 如何确定分裂点(CART ID3 C4.5算法有着对应的分裂计算方式)

2、 如何处理不连续的数据,如何处理缺失的数据

3、 剪枝处理

尝试实现算法一是为了熟悉python,二是为了更好的去理解算法的一个流程以及一些要点的处理。

  1. from math import log
  2. import operator
  3. import pickle
  4. import os
  5. import numpy as np
  6.  
  7. def debug(value_name,value):
  8. print("debuging for %s" % value_name)
  9. print(value)
  10.  
  11. # feature map and wind_label
  12.  
  13. def loadDateset():
  14. with open('./wine.data') as f:
  15. wine = [eaxm.strip().split(',') for eaxm in f.readlines()]
  16.  
  17. #for i in range(len(wine)):
  18. # wine[i] = list(map(float,wine[i]))
  19.  
  20. wine = np.array(wine)
  21. wine_label = wine[...,:1]
  22. wine_data = wine[...,1:]
  23.  
  24. # get the map of wine_feature
  25. featLabels = []
  26.  
  27. for i in range(len(wine_data)):
  28. #print(i)
  29. featLabels.append(i)
  30.  
  31. #
  32. wine_data = np.concatenate((wine_data,wine_label),axis=1)
  33. # 这里的label需要做一定的修改 需要的label是属性对应的字典
  34. return wine_data,featLabels
  35.  
  36. # wine_data = dateset[:-1] wine_label = dateset[-1:]
  37. def informationEntropy(dataSet):
  38. m = len(dataSet)
  39. labelMap = {}
  40. for wine in dataSet:
  41. nowLabel = wine[-1]
  42. if nowLabel not in labelMap.keys():
  43. labelMap[nowLabel] = 0
  44. labelMap[nowLabel] += 1
  45. shannoEnt = 0.0
  46. for key in labelMap.keys():
  47. prop = float(labelMap[key]/m)
  48. shannoEnt -= prop*(log(prop,2))
  49.  
  50. return shannoEnt
  51.  
  52. # split the subDataSet Improve reusability
  53. def splitDataSet(dataSet,axis,feature):
  54. subDataSet = []
  55. # date type
  56. for featVec in dataSet:
  57. if(featVec[axis] == feature):
  58. reduceVec = featVec[:axis]
  59. if(isinstance(reduceVec,np.ndarray)):
  60. reduceVec = np.ndarray.tolist(reduceVec)
  61. reduceVec.extend(featVec[axis+1:])
  62. subDataSet.append(reduceVec)
  63. return subDataSet
  64.  
  65. # choose the best Feature to split
  66. def chooseFeature(dataSet):
  67. numFeature = len(dataSet[0])-1
  68. baseEntorpy = informationEntropy(dataSet)
  69. bestInfoGain = 0.0
  70. bestFeature = -1
  71.  
  72. for i in range(numFeature):
  73. #valueList = wine_data[:,i:i+1]
  74. valueList = [value[i] for value in dataSet]
  75.  
  76. # debug
  77. # print("valueList is:")
  78. # print(len(valueList))
  79.  
  80. uniqueVals = set(valueList)
  81. newEntropy = 0.0
  82. for value in uniqueVals:
  83. subDataSet = splitDataSet(dataSet,i,value)
  84.  
  85. #debug
  86. #print("subDataSet is :")
  87. #print(subDataSet)
  88. #print(len(subDataSet[0]))
  89.  
  90. # 数值部分要注意
  91. prop = len(subDataSet)/float(len(dataSet))
  92. newEntropy += prop*informationEntropy(subDataSet)
  93.  
  94. infoGain = baseEntorpy - newEntropy
  95. if(infoGain > bestInfoGain):
  96. bestInfoGain = infoGain
  97. bestFeature = i
  98.  
  99. return bestFeature
  100.  
  101. def majorityCnt(classList):
  102. classMap = {}
  103. for vote in classList:
  104. if vote not in classMap.keys():
  105. classMap[vote] = 0
  106. classMap[vote] += 1
  107.  
  108. #tempMap = sorted(classMap.items(),key = operator.itemgetter(1),reverse = True)
  109. tempMap = sorted(classMap.items(), key=lambda x:x[1], reverse=True)
  110. return tempMap[0][0]
  111.  
  112. # labels for map of Feature
  113. def createTree(dataSet,Featlabels):
  114. classList = [example[-1] for example in dataSet]
  115. # if all of the attribute of classList is same
  116.  
  117. if(classList.count(classList[0])) == len(classList):
  118. #print("all is same")
  119. return classList[0]
  120. # print("debug after")
  121. # feature is empty
  122. if len(dataSet[0]) == 1:
  123. print("len is zero")
  124. return majorityCnt(classList)
  125. # print("debug pre")
  126. bestFeat = chooseFeature(dataSet)
  127. #debug
  128. #print("debug")
  129. #print(bestFeat)
  130.  
  131. bestFeatLabel = Featlabels[bestFeat]
  132. # print(bestFeatLabel)
  133. # python tree use dict for index of feature to build the tree
  134. myTree = {bestFeatLabel:{}}
  135.  
  136. # del redundant label
  137. del(Featlabels[bestFeat])
  138.  
  139. valueList = [example[bestFeat] for example in dataSet]
  140. uniqueVals = set(valueList)
  141.  
  142. # print(uniqueVals)
  143. # 取值都一样的话就没有必要继续划分
  144. if(len(uniqueVals) == 1):
  145. return majorityCnt(dataSet)
  146.  
  147. for value in uniqueVals:
  148. #if(bestFeat == 6):
  149. # print(value)
  150. subFeatLabels = Featlabels[:]
  151. # print(sublabels)
  152. subdataSet = splitDataSet(dataSet,bestFeat,value)
  153.  
  154. if(bestFeatLabel == 6 and value == '3.06'):
  155. #print("debuging ")
  156. myTree[bestFeatLabel][value] = createTree(subdataSet, subFeatLabels)
  157. #print(myTree[bestFeatLabel][value])
  158. #print("len of build")
  159. #print(len(uniqueVals))
  160. # print(value)
  161. else:
  162. myTree[bestFeatLabel][value] = createTree(subdataSet,subFeatLabels)
  163.  
  164. return myTree
  165.  
  166. # classity fuction featLabel and testVes is used to get featvalue of test
  167. def classify(inputTree,featLabels,testVec):
  168. # get the node
  169. nowNode = list(inputTree.keys())[0]
  170.  
  171. # debug
  172. #debug(nowNode)
  173. # print(featLabels)
  174. featIndex = featLabels.index(nowNode)
  175.  
  176. # print(featIndex)
  177. #find the value of testVec in feature
  178. keyValue = testVec[featIndex]
  179.  
  180. #print("len of input")
  181. #print(len(inputTree[nowNode].keys()))
  182. keyValue = str(keyValue)
  183. subTree = inputTree[nowNode][keyValue]
  184. if(isinstance(subTree,dict)):
  185. classLabel = classify(subTree,featLabels,testVec)
  186. else:
  187. classLabel = subTree
  188.  
  189. return classLabel
  190.  
  191. if __name__ == '__main__':
  192. wine_data, featLabels = loadDateset()
  193. #print(featLabels)
  194. #print(wine_data)
  195. myTree = createTree(wine_data,featLabels.copy())
  196.  
  197. #print(type(myTree))
  198. # the type of value
  199. test = [14.23,1.71,2.43,15.6,127,2.8,3.06,.28,2.29,5.64,1.04,3.92,1065]
  200. #print(featLabels)
  201. print(classify(myTree,featLabels,test))

静下来,你想要的东西才能看见

python ID3决策树实现的更多相关文章

  1. Python3实现机器学习经典算法(三)ID3决策树

    一.ID3决策树概述 ID3决策树是另一种非常重要的用来处理分类问题的结构,它形似一个嵌套N层的IF…ELSE结构,但是它的判断标准不再是一个关系表达式,而是对应的模块的信息增益.它通过信息增益的大小 ...

  2. ID3决策树预测的java实现

    刚才写了ID3决策树的建立,这个是通过决策树来进行预测.这里主要用到的就是XML的遍历解析,比较简单. 关于xml的解析,参考了: http://blog.csdn.net/soszou/articl ...

  3. python利用决策树进行特征选择

    python利用决策树进行特征选择(注释部分为绘图功能),最后输出特征排序: import numpy as np import tflearn from tflearn.layers.core im ...

  4. Python实现决策树ID3算法

    主要思想: 0.训练集格式:特征1,特征2,...特征n,类别 1.采用Python自带的数据结构字典递归的表示数据 2.ID3计算的信息增益是指类别的信息增益,因此每次都是计算类别的熵 3.ID3每 ...

  5. python实现决策树C4.5算法(在ID3基础上改进)

    一.概论 C4.5主要是在ID3的基础上改进,ID3选择(属性)树节点是选择信息增益值最大的属性作为节点.而C4.5引入了新概念"信息增益率",C4.5是选择信息增益率最大的属性作 ...

  6. python 之 决策树分类算法

    发现帮助新手入门机器学习的一篇好文,首先感谢博主!:用Python开始机器学习(2:决策树分类算法) J. Ross Quinlan在1975提出将信息熵的概念引入决策树的构建,这就是鼎鼎大名的ID3 ...

  7. python画决策树

    1.安装graphviz.下载地址在:http://www.graphviz.org/.如果你是linux,可以用apt-get或者yum的方法安装.如果是windows,就在官网下载msi文件安装. ...

  8. ID3决策树的Java实现

    package DecisionTree; import java.io.*; import java.util.*; public class ID3 { //节点类 public class DT ...

  9. python实现决策树

    1.决策树的简介 http://www.cnblogs.com/lufangtao/archive/2013/05/30/3103588.html 2.决策是实现的伪代码 “读入训练数据” “找出每个 ...

随机推荐

  1. Comparison of SIFT Encoded and Deep Learning Features for the Classification and Detection of Esca Disease in Bordeaux Vineyards(分类MobileNet,目标检测 RetinaNet)

    识别葡萄的一种虫害,比较了传统SIFT和深度学习分类,最后还做了目标检测 分类用的 MobileNet,目标检测 RetinaNet MobileNet 是将传统深度可分离卷积分成了两步,深度卷积和逐 ...

  2. PL/SQL Developer插入数据到数据库出现数据中文乱码

    问题描述: 使用PL/SQL Developer往Oracle数据库插入数据,出现中文乱码! 解决办法: 1.执行脚本 select userenv('language') from dual; 结果 ...

  3. 前后端通信—CORS(支持跨域)

    根据前端跨域的那些事这篇文章中的跨域的理解这一块,我们重新创建两个服务,第一个服务使用了test.html const http = require('http') const fs = requir ...

  4. 软件工程 “校园汇” 个人IDEA竞选分析与总结

    IDEA竞选 19/10/8软件工程课上举行了一次IDEA竞选: 我的竞选IDEA是"校友汇",大学生的在线活动中心. 投票结果: 可以看到,校友会(汇)IDEA竞选结果十分惨淡, ...

  5. 学习opencv(1)

    目录 CV_8UC3 Scalar--颜色赋值 using namespace cv找不到命名空间 waitKey() getTickCount() 引用 CV_8UC3 a) 存放单通道图像中像素: ...

  6. Remind Me

    创建2d人物:live2d 创建3d人物:adobe fuse

  7. WEB API 的设计与开发

  8. docker 进程管理

    详文:理解Docker容器的进程管理:https://yq.aliyun.com/articles/5545 在Docker中,每个Container都是Docker Daemon的子进程. dock ...

  9. tomcat启动慢的解决办法

    SessionIdGeneratorBase.createSecureRandom Creation of SecureRandom instance for session ID generatio ...

  10. 【spring源码分析】IOC容器解析

    参考: https://www.iteye.com/topic/1121913(自动注入bean的扫描器) https://m.imooc.com/mip/article/34150(循环依赖的解决方 ...