thon_sklearn机器学习库学习笔记（四）decision

# 决策树

import pandas as pd

from sklearn.tree import DecisionTreeClassifier

from sklearn.cross_validation import train_test_split

from sklearn.metrics import classification_report

from sklearn.pipeline import Pipeline

from sklearn.grid_search import GridSearchCV

import zipfile

#压缩节省空间

z=zipfile.ZipFile('ad-dataset.zip')

# df=pd.read_csv(z.open(z.namelist()[0]),header=None,low_memory=False)

# df = pd.read_csv(z.open(z.namelist()[0]), header=None, low_memory=False)

df=pd.read_csv('.\\tree_data\\ad.data',header=None)

explanatory_variable_columns=set(df.columns.values)

response_variable_column=df[len(df.columns.values)-1]

#最后一列是代表的标签类型

explanatory_variable_columns.remove(len(df.columns)-1)

y=[1 if e =='ad.' else 0 for e in response_variable_column]

X=df.loc[:,list(explanatory_variable_columns)]

#匹配？字符，并把值转化为-1

X.replace(to_replace=' *\?', value=-1, regex=True, inplace=True)

X_train,X_test,y_train,y_test=train_test_split(X,y)

#用信息增益启发式算法建立决策树

pipeline=Pipeline([('clf',DecisionTreeClassifier(criterion='entropy'))])

parameters = {

'clf__max_depth': (150, 155, 160),

'clf__min_samples_split': (1, 2, 3),

'clf__min_samples_leaf': (1, 2, 3)

}

#f1查全率和查准率的调和平均

grid_search=GridSearchCV(pipeline,parameters,n_jobs=-1,

                         verbose=1,scoring='f1')

grid_search.fit(X_train,y_train)

print '最佳效果：%0.3f'%grid_search.best_score_

print '最优参数'

best_parameters=grid_search.best_estimator_.get_params()

best_parameters

输出结果：

Fitting 3 folds for each of 27 candidates, totalling 81 fits

[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   21.0s

[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   34.7s finished

最佳效果：0.888

最优参数

Out[123]:

{'clf': DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=160,

             max_features=None, max_leaf_nodes=None, min_samples_leaf=1,

             min_samples_split=3, min_weight_fraction_leaf=0.0,

             presort=False, random_state=None, splitter='best'),

 'clf__class_weight': None,

 'clf__criterion': 'entropy',

 'clf__max_depth': 160,

 'clf__max_features': None,

 'clf__max_leaf_nodes': None,

 'clf__min_samples_leaf': 1,

 'clf__min_samples_split': 3,

 'clf__min_weight_fraction_leaf': 0.0,

 'clf__presort': False,

 'clf__random_state': None,

 'clf__splitter': 'best',

 'steps': [('clf',

   DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=160,

               max_features=None, max_leaf_nodes=None, min_samples_leaf=1,

               min_samples_split=3, min_weight_fraction_leaf=0.0,

               presort=False, random_state=None, splitter='best'))]}

for param_name in sorted(parameters.keys()):

    print ('\t%s:%r'%(param_name,best_parameters[param_name]))

predictions=grid_search.predict(X_test)

print classification_report(y_test,predictions)

输出结果：

clf__max_depth:150
clf__min_samples_leaf:1
clf__min_samples_split:1
precision recall f1-score support

0 0.97 0.99 0.98 703
1 0.91 0.84 0.87 117

avg / total 0.96 0.96 0.96 820

df.head()

输出结果;

	0	1	2	3	...	1558
0	125	125	1.0	1	...	ad.
1	57	468	8.2105	1	...	ad.
2	33	230	6.9696	1	...	ad.
3	60	468	7.8	1	...	ad.
4	60	468	7.8	1	...	ad.

# 决策树集成

#coding:utf-8

import pandas as pd

from sklearn.ensemble import RandomForestClassifier

from sklearn.cross_validation import train_test_split

from sklearn.metrics import classification_report

from sklearn.pipeline import Pipeline

from sklearn.grid_search import GridSearchCV

df=pd.read_csv('.\\tree_data\\ad.data',header=None,low_memory=False)

explanatory_variable_columns=set(df.columns.values)

response_variable_column=df[len(df.columns.values)-1]

df.head()

	0	1	2	3	...	1558
0	125	125	1.0	1	...	ad.
1	57	468	8.2105	1	...	ad.
2	33	230	6.9696	1	...	ad.
3	60	468	7.8	1	...	ad.
4	60	468	7.8	1	...	ad.

#The last column describes the targets(去掉最后一列)

explanatory_variable_columns.remove(len(df.columns.values)-1)

y=[1 if e=='ad.' else 0 for e in response_variable_column]

X=df.loc[:,list(explanatory_variable_columns)]

#置换有？的为-1

X.replace(to_replace=' *\?', value=-1, regex=True, inplace=True)

X_train,X_test,y_train,y_test=train_test_split(X,y)

pipeline=Pipeline([('clf',RandomForestClassifier(criterion='entropy'))])

parameters = {

'clf__n_estimators': (5, 10, 20, 50),

'clf__max_depth': (50, 150, 250),

'clf__min_samples_split': (1, 2, 3),

'clf__min_samples_leaf': (1, 2, 3)

}

grid_search = GridSearchCV(pipeline,parameters,n_jobs=-1,verbose=1,scoring='f1')

grid_search.fit(X_train,y_train)

print(u'最佳效果：%0.3f'%grid_search.best_score_)

print u'最优的参数：'

best_parameters=grid_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):

    print('\t%s:%r'%(param_name,best_parameters[param_name]))

输出结果：

最佳效果：0.929 最优的参数： clf__max_depth:250 clf__min_samples_leaf:1 clf__min_samples_split:3 clf__n_estimators:50

predictions=grid_search.predict(X_test)

print classification_report(y_test,predictions)

输出结果：

precision recall f1-score support

0 0.98 1.00 0.99 705
1 0.97 0.90 0.93 115

avg / total 0.98 0.98 0.98 820

thon_sklearn机器学习库学习笔记（四）decision_tree（决策树）的更多相关文章

muduo网络库学习笔记(四) 通过eventfd实现的事件通知机制
目录 muduo网络库学习笔记(四) 通过eventfd实现的事件通知机制 eventfd的使用 eventfd系统函数使用示例 EventLoop对eventfd的封装工作时序 runInLoo ...
Python_sklearn机器学习库学习笔记（四）decision_tree（决策树）
# 决策树 import pandas as pd from sklearn.tree import DecisionTreeClassifier from sklearn.cross_validat ...
【机器学习实战学习笔记(2-2)】决策树python3.6实现及简单应用
文章目录 1.ID3及C4.5算法基础 1.1 计算香农熵 1.2 按照给定特征划分数据集 1.3 选择最优特征 1.4 多数表决实现 2.基于ID3.C4.5生成算法创建决策树 3.使用决策树进行分 ...
Python_sklearn机器学习库学习笔记（一）_Feature Extraction and Preprocessing(特征提取与预处理）
# Extracting features from categorical variables #Extracting features from categorical variables 独热编 ...
Python_sklearn机器学习库学习笔记（七）the perceptron(感知器）
一.感知器感知器是Frank Rosenblatt在1957年就职于Cornell航空实验室时发明的,其灵感来自于对人脑的仿真,大脑是处理信息的神经元(neurons)细胞和链接神经元细胞进行信息传 ...
Python_sklearn机器学习库学习笔记（一）_一元回归
一.引入相关库 %matplotlib inline import matplotlib.pyplot as plt from matplotlib.font_manager import FontP ...
Python_sklearn机器学习库学习笔记（三）logistic regression（逻辑回归）
# 逻辑回归 ## 逻辑回归处理二元分类 %matplotlib inline import matplotlib.pyplot as plt #显示中文 from matplotlib.font_m ...
Python_sklearn机器学习库学习笔记（五）k-means（聚类）
# K的选择:肘部法则如果问题中没有指定的值,可以通过肘部法则这一技术来估计聚类数量.肘部法则会把不同值的成本函数值画出来.随着值的增大,平均畸变程度会减小:每个类包含的样本数会减少,于是样本 ...
Python_sklearn机器学习库学习笔记（六） dimensionality-reduction-with-pca
# 用PCA降维 #计算协方差矩阵 import numpy as np X=[[2,0,-1.4], [2.2,0.2,-1.5], [2.4,0.1,-1], [1.9,0,-1.2]] np.c ...

随机推荐

LCA
2016.1.28 LCA,就是最近公共祖先,这里介绍倍增的算法. 首先我们要预处理,设f[i][j]为编号为i的节点的2j级祖先,所谓2j级祖先,就是从i节点开始往树的上层数2j个节点.如下图所示 ...
HBase Shell操作
Hbase 是一个分布式的.面向列的开源数据库,其实现是建立在google 的bigTable 理论之上,并基于hadoop HDFS文件系统. Hbase不同于一般的关系型数据库(RDBMS ...
Hibernate之即时更新
昨天工作中遇到了一个简单的问题,弄了好久,都怪自己没有好好的去了解hibernate,导致了这样的问题弄了两三个小时. 问题是这样的:我想即时更改数据,然后再查询 (1)用Spring的getHibe ...
在大于32GB或64GB容量的SD卡上使用NOOB安装树莓派 - Using NOOB on SD cards larger than 32GB or 64GB for Raspberry Pi
在树莓派上玩了一小段时间了,因为装的软件包越来越多,所以越来越感觉16G的SD卡没办法长期使用下去.于是采购了几张64G的SD卡,打算周末装上系统.可是按照一般的流程,在Windows下用SD For ...
onethink入门笔记(一)
由于公司需求所以大概花了一个星期搞了一个一个基于onethink的数据管理平台demo不得不说onethink这个基于thinkphp3.2.3的框架还是很棒的让我这个没基础过php的人也能在一星期 ...
JAVA 通过LDAP获取AD域用户及组织信息
因为工作需求近期做过一个从客户AD域获取数据实现单点登录的功能,在此整理分享. 前提:用户可能有很多系统的情况下,为了方便账号的统一管理使用AD域验证登录,所以不需要我们的系统登录,就需要获取用户的A ...
R12将银行和分行都使用TCA管理
R12将银行和分行都使用TCA管理,后台保存在HZ_PARTIES . 银行帐号:如果是付款或者是收款(本公司的帐号,内部帐号),都保存在ce_bank_accounts,ce_bank_acct_u ...
etcd api 接口
etcd api接口基本操作api: https://github.com/coreos/etcd/blob/6acb3d67fbe131b3b2d5d010e00ec80182be4628/Doc ...
Res_Orders_01
一.燃尽图展示二.项目进展 1.讨论选题内容 2.确定项目的版本(Web版) 3.讨论能达到的效果和内容 4.确定编程方面的难点 5.开始制作大概的框架三.遇到问题 1.不知道怎么部署能达到最好的 ...
mobx源码解读2
我们将上节用到的几个类的构造器列举一下吧: function Reaction(name, onInvalidate) { if (name === void 0) { name = "Re ...

thon_sklearn机器学习库学习笔记（四）decision_tree（决策树）

thon_sklearn机器学习库学习笔记（四）decision_tree（决策树）的更多相关文章

随机推荐

热门专题