python ID3决策树实现
环境:ubuntu 16.04 python 3.6
数据来源:UCI wine_data(比较经典的酒数据)
决策树要点:
1、 如何确定分裂点(CART ID3 C4.5算法有着对应的分裂计算方式)
2、 如何处理不连续的数据,如何处理缺失的数据
3、 剪枝处理
尝试实现算法一是为了熟悉python,二是为了更好的去理解算法的一个流程以及一些要点的处理。
- from math import log
- import operator
- import pickle
- import os
- import numpy as np
def debug(value_name, value):
    """Print a labelled dump of *value* for quick debugging."""
    print("debuging for %s" % value_name, value, sep="\n")
# feature map and wine_label
def loadDateset():
    """Load the UCI wine dataset from ./wine.data.

    Returns:
        wine_data: ndarray of strings, one row per sample; 13 feature
            columns followed by the class label as the last column
            (the layout createTree expects).
        featLabels: list of feature indices [0..n_features-1] used as
            node names in the tree.
    """
    with open('./wine.data') as f:
        wine = [line.strip().split(',') for line in f.readlines()]
    wine = np.array(wine)
    # file layout: class label in the first column, 13 features after it
    wine_label = wine[..., :1]
    wine_data = wine[..., 1:]
    # BUG FIX: the original iterated range(len(wine_data)) — the number of
    # ROWS (samples) — producing 178 labels instead of one per feature
    # column. Only the first 13 were ever used, so this is backward
    # compatible.
    featLabels = list(range(wine_data.shape[1]))
    # move the label to the last column so example[-1] is the class
    wine_data = np.concatenate((wine_data, wine_label), axis=1)
    return wine_data, featLabels
- # wine_data = dateset[:-1] wine_label = dateset[-1:]
def informationEntropy(dataSet):
    """Return the Shannon entropy of the class labels (last column) of dataSet."""
    total = len(dataSet)
    # tally occurrences of each class label
    counts = {}
    for row in dataSet:
        label = row[-1]
        counts[label] = counts.get(label, 0) + 1
    # H = -sum(p * log2(p)) over all observed classes
    entropy = 0.0
    for count in counts.values():
        p = float(count / total)
        entropy -= p * log(p, 2)
    return entropy
- # split the subDataSet Improve reusability
# split the subDataSet  Improve reusability
def splitDataSet(dataSet, axis, feature):
    """Return rows whose value at *axis* equals *feature*, with that column removed."""
    result = []
    for row in dataSet:
        if row[axis] != feature:
            continue
        # drop the split column: keep everything before and after axis
        trimmed = row[:axis]
        if isinstance(trimmed, np.ndarray):
            # numpy slices don't support extend(); convert to a plain list
            trimmed = trimmed.tolist()
        trimmed.extend(row[axis + 1:])
        result.append(trimmed)
    return result
- # choose the best Feature to split
def chooseFeature(dataSet):
    """Return the index of the feature with the highest information gain (ID3).

    Returns -1 when no feature yields a strictly positive gain.
    """
    featureCount = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = informationEntropy(dataSet)
    bestGain, bestIndex = 0.0, -1
    for idx in range(featureCount):
        # conditional entropy after partitioning on this feature's values
        uniqueVals = {row[idx] for row in dataSet}
        condEntropy = 0.0
        for val in uniqueVals:
            subset = splitDataSet(dataSet, idx, val)
            weight = len(subset) / float(len(dataSet))
            condEntropy += weight * informationEntropy(subset)
        gain = baseEntropy - condEntropy
        if gain > bestGain:
            bestGain, bestIndex = gain, idx
    return bestIndex
def majorityCnt(classList):
    """Return the most frequent label in classList (first-seen wins ties)."""
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    # max() returns the first-inserted key among equal counts, matching the
    # original stable sorted(..., reverse=True)[0] behavior
    return max(tally, key=tally.get)
- # labels for map of Feature
def createTree(dataSet, Featlabels):
    """Recursively build an ID3 decision tree.

    The tree is a nested dict: {feature_label: {feature_value: subtree_or_class}}.
    Leaves are class labels. *Featlabels* is consumed (mutated) as features
    are used; callers should pass a copy.
    """
    classList = [example[-1] for example in dataSet]
    # all samples share one class -> pure leaf
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # no features left (only the class column) -> majority-vote leaf
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseFeature(dataSet)
    # FIX: guard the "no feature has positive gain" case; the original would
    # index Featlabels[-1] and split on the class column
    if bestFeat == -1:
        return majorityCnt(classList)
    bestFeatLabel = Featlabels[bestFeat]
    # python tree: use a dict keyed by the feature label to build the node
    myTree = {bestFeatLabel: {}}
    # this feature is consumed; remove it so child calls index correctly
    del Featlabels[bestFeat]
    valueList = [example[bestFeat] for example in dataSet]
    uniqueVals = set(valueList)
    # every sample has the same value for this feature: splitting is useless
    if len(uniqueVals) == 1:
        # BUG FIX: the original called majorityCnt(dataSet) — rows are
        # unhashable lists and would crash; vote over the class labels
        return majorityCnt(classList)
    for value in uniqueVals:
        # each child gets its own label list so siblings don't interfere
        subFeatLabels = Featlabels[:]
        subDataSet = splitDataSet(dataSet, bestFeat, value)
        myTree[bestFeatLabel][value] = createTree(subDataSet, subFeatLabels)
    return myTree
- # classity fuction featLabel and testVes is used to get featvalue of test
# classify function: featLabels and testVec are used to get the feature value of the test sample
def classify(inputTree, featLabels, testVec):
    """Walk the decision tree for one sample and return its predicted class.

    Tree branch keys are strings (values came from the data file), so the
    sample's value is stringified before lookup.
    """
    rootFeat = next(iter(inputTree))          # the node's feature label
    featIndex = featLabels.index(rootFeat)    # position of that feature in testVec
    branchKey = str(testVec[featIndex])
    branch = inputTree[rootFeat][branchKey]
    # descend while the branch is an internal node; otherwise it's the class
    if isinstance(branch, dict):
        return classify(branch, featLabels, testVec)
    return branch
if __name__ == '__main__':
    # load the wine data, build the tree, then classify one known sample
    wine_data, featLabels = loadDateset()
    # createTree mutates its label list, so hand it a copy and keep the
    # original for classification
    myTree = createTree(wine_data, featLabels.copy())
    sample = [14.23, 1.71, 2.43, 15.6, 127, 2.8, 3.06, .28, 2.29, 5.64, 1.04, 3.92, 1065]
    print(classify(myTree, featLabels, sample))
静下来,你想要的东西才能看见
python ID3决策树实现的更多相关文章
- Python3实现机器学习经典算法(三)ID3决策树
一.ID3决策树概述 ID3决策树是另一种非常重要的用来处理分类问题的结构,它形似一个嵌套N层的IF…ELSE结构,但是它的判断标准不再是一个关系表达式,而是对应的模块的信息增益.它通过信息增益的大小 ...
- ID3决策树预测的java实现
刚才写了ID3决策树的建立,这个是通过决策树来进行预测.这里主要用到的就是XML的遍历解析,比较简单. 关于xml的解析,参考了: http://blog.csdn.net/soszou/articl ...
- python利用决策树进行特征选择
python利用决策树进行特征选择(注释部分为绘图功能),最后输出特征排序: import numpy as np import tflearn from tflearn.layers.core im ...
- Python实现决策树ID3算法
主要思想: 0.训练集格式:特征1,特征2,...特征n,类别 1.采用Python自带的数据结构字典递归的表示数据 2.ID3计算的信息增益是指类别的信息增益,因此每次都是计算类别的熵 3.ID3每 ...
- python实现决策树C4.5算法(在ID3基础上改进)
一.概论 C4.5主要是在ID3的基础上改进,ID3选择(属性)树节点是选择信息增益值最大的属性作为节点.而C4.5引入了新概念"信息增益率",C4.5是选择信息增益率最大的属性作 ...
- python 之 决策树分类算法
发现帮助新手入门机器学习的一篇好文,首先感谢博主!:用Python开始机器学习(2:决策树分类算法) J. Ross Quinlan在1975提出将信息熵的概念引入决策树的构建,这就是鼎鼎大名的ID3 ...
- python画决策树
1.安装graphviz.下载地址在:http://www.graphviz.org/.如果你是linux,可以用apt-get或者yum的方法安装.如果是windows,就在官网下载msi文件安装. ...
- ID3决策树的Java实现
package DecisionTree; import java.io.*; import java.util.*; public class ID3 { //节点类 public class DT ...
- python实现决策树
1.决策树的简介 http://www.cnblogs.com/lufangtao/archive/2013/05/30/3103588.html 2.决策是实现的伪代码 “读入训练数据” “找出每个 ...
随机推荐
- Comparison of SIFT Encoded and Deep Learning Features for the Classification and Detection of Esca Disease in Bordeaux Vineyards(分类MobileNet,目标检测 RetinaNet)
识别葡萄的一种虫害,比较了传统SIFT和深度学习分类,最后还做了目标检测 分类用的 MobileNet,目标检测 RetinaNet MobileNet 是将传统深度可分离卷积分成了两步,深度卷积和逐 ...
- PL/SQL Developer插入数据到数据库出现数据中文乱码
问题描述: 使用PL/SQL Developer往Oracle数据库插入数据,出现中文乱码! 解决办法: 1.执行脚本 select userenv('language') from dual; 结果 ...
- 前后端通信—CORS(支持跨域)
根据前端跨域的那些事这篇文章中的跨域的理解这一块,我们重新创建两个服务,第一个服务使用了test.html const http = require('http') const fs = requir ...
- 软件工程 “校园汇” 个人IDEA竞选分析与总结
IDEA竞选 19/10/8软件工程课上举行了一次IDEA竞选: 我的竞选IDEA是"校友汇",大学生的在线活动中心. 投票结果: 可以看到,校友会(汇)IDEA竞选结果十分惨淡, ...
- 学习opencv(1)
目录 CV_8UC3 Scalar--颜色赋值 using namespace cv找不到命名空间 waitKey() getTickCount() 引用 CV_8UC3 a) 存放单通道图像中像素: ...
- Remind Me
创建2d人物:live2d 创建3d人物:adobe fuse
- WEB API 的设计与开发
- docker 进程管理
详文:理解Docker容器的进程管理:https://yq.aliyun.com/articles/5545 在Docker中,每个Container都是Docker Daemon的子进程. dock ...
- tomcat启动慢的解决办法
SessionIdGeneratorBase.createSecureRandom Creation of SecureRandom instance for session ID generatio ...
- 【spring源码分析】IOC容器解析
参考: https://www.iteye.com/topic/1121913(自动注入bean的扫描器) https://m.imooc.com/mip/article/34150(循环依赖的解决方 ...