Python机器学习实践与Kaggle实战(转)
https://mlnote.wordpress.com/2015/12/16/python%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E5%AE%9E%E8%B7%B5%E4%B8%8Ekaggle%E5%AE%9E%E6%88%98-machine-learning-for-kaggle-competition-in-python/
Author: Miao Fan (范淼), Ph.D. candidate on Computer Science.
Affiliation: Tsinghua University / New York University
[C.V.] [Google Scholar] [Special Talk in NYU]
Email: fanmiao.cslt.thu@gmail.com
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing

# Load the data.
iris = load_iris()

# Pick out the features and the labels.
X_iris, y_iris = iris.data, iris.target

# Use only the first two columns as features.
X, y = X_iris[:, :2], y_iris

# Hold out 25% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

# Standardize the raw features. The scaler is fitted on the training split
# only and then applied to both splits. (Author's note: this step matters,
# yet contestants often overlook it.)
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

from sklearn.linear_model import SGDClassifier

# SGD classifier: parameters are estimated by stochastic gradient descent,
# which suits large-scale data.
clf = SGDClassifier()
clf.fit(X_train, y_train)

# Import the evaluation package.
from sklearn import metrics

# In-sample check: accuracy measured on the training samples.
y_train_predict = clf.predict(X_train)
print(metrics.accuracy_score(y_train, y_train_predict))

# Standard out-of-sample check: accuracy measured on the test samples.
y_predict = clf.predict(X_test)
print(metrics.accuracy_score(y_test, y_predict))
0.660714285714
0.684210526316
# 如果需要更加详细的性能报告,比如precision, recall, accuracy,可以使用如下的函数。
print metrics.classification_report(y_test, y_predict, target_names = iris.target_names)
precision recall f1-score support setosa 1.00 1.00 1.00 8
versicolor 0.43 0.27 0.33 11
virginica 0.65 0.79 0.71 19 avg / total 0.66 0.68 0.66 38
# To examine SGDClassifier more thoroughly we want to make full use of the
# data: split it into N parts and let every part serve once as the test fold.
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# A Pipeline keeps model construction compact: before fit, data usually has
# to pass through steps such as feature extraction and preprocessing.
# Default parameters are used here.
clf = Pipeline([('scaler', StandardScaler()), ('sgd_classifier', SGDClassifier())])

# 5-fold cross-validation over the whole data set.
cv = KFold(X.shape[0], 5, shuffle=True, random_state=33)
scores = cross_val_score(clf, X, y, cv=cv)
print(scores)

# Overall model performance: mean accuracy and standard deviation.
print('%s %s' % (scores.mean(), scores.std()))

from scipy.stats import sem
import numpy as np

# The spread is computed with a slightly different function here (sem, the
# standard error of the mean, instead of the standard deviation); see
# http://www.graphpad.com/guides/prism/6/statistics/index.htm?stat_semandsdnotsame.htm
print('%s %s' % (np.mean(scores), sem(scores)))
[ 0.56666667 0.73333333 0.83333333 0.76666667 0.8 ]
0.74 0.0928559218479
0.74 0.0464279609239
from sklearn.datasets import fetch_olivetti_faces # 这部分数据没有直接存储在现有包中,都是通过这类函数在线下载
faces = fetch_olivetti_faces()
# 这里证明,数据是以Dict的形式存储的,与多数实验性数据的格式一致
faces.keys()
['images', 'data', 'target', 'DESCR']
# 使用shape属性检验数据规模
print faces.data.shape
print faces.target.shape
(400L, 4096L)
(400L,)
from sklearn.cross_validation import train_test_split
from sklearn.svm import SVC # 同样是分割数据 25%用于测试
X_train, X_test, y_train, y_test = train_test_split(faces.data, faces.target, test_size=0.25, random_state=0)
from sklearn.cross_validation import cross_val_score, KFold
from scipy.stats import sem # 构造一个便于交叉验证模型性能的函数(模块)
def evaluate_cross_validation(clf, X, y, K):
    """Cross-validate ``clf`` on ``(X, y)`` with K shuffled folds and print the result.

    Prints the array of per-fold scores, then their mean together with the
    standard error of the mean. Nothing is returned.

    Parameters
    ----------
    clf : estimator passed straight to ``cross_val_score``.
    X, y : samples and labels; only ``len(y)`` is inspected here.
    K : number of folds.
    """
    # Legacy KFold signature: number of samples, number of folds, shuffle flag.
    # The fixed random_state pins the seed used for shuffling.
    cv = KFold(len(y), K, shuffle=True, random_state=0)
    # Evaluate with the folds defined above. Author's note: for classification
    # the default score is accuracy, and another metric can be chosen.
    scores = cross_val_score(clf, X, y, cv=cv)
    print(scores)
    print('Mean score: %.3f (+/-%.3f)' % (scores.mean(), sem(scores)))


# Use an SVC with a linear kernel (other kernels, discussed later, can give
# very different results).
svc_linear = SVC(kernel='linear')
# 五折交叉验证 K = 5
evaluate_cross_validation(svc_linear, X_train, y_train, 5)
[ 0.93333333 0.86666667 0.91666667 0.93333333 0.91666667]
Mean score: 0.913 (+/-0.012)
from sklearn.datasets import fetch_20newsgroups
# 与之前的人脸数据集一样,20类新闻数据同样需要临时下载函数的帮忙
news = fetch_20newsgroups(subset='all')
# 查验数据,依然采用dict格式,共有18846条样本
print len(news.data), len(news.target)
print news.target
18846 18846
[10 3 17 ..., 3 1 7]
# 查验一下新闻类别和种数
print news.target_names
print news.target_names.__len__()
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
20
# As before, hold out 25% of the data to test model performance.
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.25)
print(len(X_train))
print(len(y_train))
print(len(X_test))
14134
14134
4712
# 许多原始数据无法直接被分类器所使用,图像可以直接使用pixel信息,文本则需要进一步处理成数值化的信息
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import *
from scipy.stats import sem
# 我们在NB_Classifier的基础上,对比几种特征抽取方法的性能。并且使用Pipline简化构建训练流程
clf_1 = Pipeline([('count_vec', CountVectorizer()), ('mnb', MultinomialNB())])
clf_2 = Pipeline([('hash_vec', HashingVectorizer(non_negative=True)), ('mnb', MultinomialNB())])
clf_3 = Pipeline([('tfidf_vec', TfidfVectorizer()), ('mnb', MultinomialNB())]) # 构造一个便于交叉验证模型性能的函数(模块)
def evaluate_cross_validation(clf, X, y, K):
    """Cross-validate ``clf`` on ``(X, y)`` with K shuffled folds and print the result.

    Prints the array of per-fold scores, then their mean together with the
    standard error of the mean. Nothing is returned.

    Parameters
    ----------
    clf : estimator (here a Pipeline) passed straight to ``cross_val_score``.
    X, y : samples and labels; only ``len(y)`` is inspected here.
    K : number of folds.
    """
    # Legacy KFold signature: number of samples, K, shuffle flag.
    # The fixed random_state pins the seed used for shuffling.
    cv = KFold(len(y), K, shuffle=True, random_state=0)
    # Evaluate with the folds defined above. Author's note: for classification
    # the default score is accuracy, and another metric can be chosen.
    scores = cross_val_score(clf, X, y, cv=cv)
    print(scores)
    print('Mean score: %.3f (+/-%.3f)' % (scores.mean(), sem(scores)))
# Run the same 5-fold evaluation for each of the three feature-extraction
# pipelines so their scores can be compared.
clfs = [clf_1, clf_2, clf_3]
for clf in clfs:
    evaluate_cross_validation(clf, X_train, y_train, 5)
[ 0.83516095 0.83374602 0.84471171 0.83622214 0.83227176]
Mean score: 0.836 (+/-0.002)
[ 0.76052352 0.72727273 0.77538026 0.74778918 0.75194621]
Mean score: 0.753 (+/-0.008)
[ 0.84435798 0.83409975 0.85496993 0.84082066 0.83227176]
Mean score: 0.841 (+/-0.004)
# 从上述结果中,我们发现常用的两个特征提取方法得到的性能相当。 让我们选取其中之一,进一步靠特征的精细筛选提升性能。
clf_4 = Pipeline([('tfidf_vec_adv', TfidfVectorizer(stop_words='english')), ('mnb', MultinomialNB())])
evaluate_cross_validation(clf_4, X_train, y_train, 5)
[ 0.87053414 0.86664308 0.887867 0.87371772 0.86553432]
Mean score: 0.873 (+/-0.004)
# 如果再尝试修改贝叶斯分类器的平滑参数,也许性能会更上一层楼。
clf_5 = Pipeline([('tfidf_vec_adv', TfidfVectorizer(stop_words='english')), ('mnb', MultinomialNB(alpha=0.01))])
evaluate_cross_validation(clf_5, X_train, y_train, 5)
[ 0.90060134 0.89741776 0.91651928 0.90909091 0.90410474]
Mean score: 0.906 (+/-0.003)
# pandas is introduced here to make the data handling easier.
import pandas as pd
import numpy as np

titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
# Take a look at the data: it contains all kinds of features, numeric,
# categorical, strings, and even missing values.
titanic.head()
# With pandas the data lives in a DataFrame (a two-dimensional table);
# info() shows its basic characteristics.
titanic.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1313 entries, 0 to 1312
Data columns (total 11 columns):
row.names 1313 non-null int64
pclass 1313 non-null object
survived 1313 non-null int64
name 1313 non-null object
age 633 non-null float64
embarked 821 non-null object
home.dest 754 non-null object
room 77 non-null object
ticket 69 non-null object
boat 347 non-null object
sex 1313 non-null object
dtypes: float64(1), int64(2), object(8)
memory usage: 123.1+ KB
# This survey data is the real personal and boarding information of the
# Titanic passengers; it lets us predict whether a passenger survived.
# There are 1313 records. Some features are complete (e.g. pclass, survived,
# name), others have missing values; some are numeric (age: float64), others
# are strings.
# Feature selection is a time-consuming step that beginners tend to
# underrate, yet it is very important and relies on background knowledge.
# From what we know about the accident, sex, age and pclass are all likely
# to be key factors for survival.
# We keep pclass, age, sex.
X = titanic[['pclass', 'age', 'sex']]
y = titanic['survived']
X.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1313 entries, 0 to 1312
Data columns (total 3 columns):
pclass 1313 non-null object
age 633 non-null float64
sex 1313 non-null object
dtypes: float64(1), object(2)
memory usage: 41.0+ KB
# The data still needs some preparation:
# 1) the age column has only 633 non-null values;
# 2) sex and pclass are categorical and must be turned into numeric 0/1
#    features.
# Fill the missing ages first. Author's note: the mean or the median are the
# strategies that distort the model least.
#
# A new frame is built here rather than calling fillna(inplace=True) on
# X['age']. With X being a column selection of titanic, the in-place form
# produced this warning in the original run:
#   C:\Anaconda2\lib\site-packages\pandas\core\generic.py:2748: SettingWithCopyWarning:
#   A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
#   self._update_inplace(new_data)
X = X.fillna({'age': X['age'].mean()})
X.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1313 entries, 0 to 1312
Data columns (total 3 columns):
pclass 1313 non-null object
age 1313 non-null float64
sex 1313 non-null object
dtypes: float64(1), object(2)
memory usage: 41.0+ KB
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

# Use scikit-learn's feature_extraction module to turn the rows into
# numeric feature vectors.
from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer(sparse=False)
# 'records' is the full name of the orient value; the abbreviated 'record'
# is not accepted by newer pandas releases.
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
print(vec.feature_names_)
# 我们发现,凡是类别型的特征都单独剥离出来,独成一列特征,数值型的则保持不变
['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'sex=female', 'sex=male']
# Reuse the vectorizer fitted on the training rows for the test rows.
# 'records' is the full name of the orient value; the abbreviated 'record'
# is not accepted by newer pandas releases.
X_test = vec.transform(X_test.to_dict(orient='records'))

from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5)
dtc.fit(X_train, y_train)
# Score on the held-out split (echoed when run interactively).
dtc.score(X_test, y_test)
0.79331306990881456
from sklearn.ensemble import RandomForestClassifier

# Same depth and leaf-size limits as the single decision tree.
rfc = RandomForestClassifier(max_depth=3, min_samples_leaf=5)
rfc.fit(X_train, y_train)
# Score on the held-out split (echoed when run interactively).
rfc.score(X_test, y_test)
0.77203647416413379
from sklearn.ensemble import GradientBoostingClassifier

# Same depth and leaf-size limits as the single decision tree.
gbc = GradientBoostingClassifier(max_depth=3, min_samples_leaf=5)
gbc.fit(X_train, y_train)
# Score on the held-out split (echoed when run interactively).
gbc.score(X_test, y_test)
0.79027355623100304
from sklearn.metrics import classification_report

y_predict = gbc.predict(X_test)
# classification_report takes (y_true, y_pred). The original call passed
# (y_predict, y_test), which exchanges precision and recall for every class
# and counts support over the predicted labels. The report recorded below in
# this transcript came from that swapped call.
print(classification_report(y_test, y_predict))
# This function conveniently produces a classifier performance report
# (precision, recall, f1-score and support for each class).
precision recall f1-score support 0 0.93 0.78 0.84 241
1 0.57 0.83 0.68 88 avg / total 0.83 0.79 0.80 329
# First load the house-price data.
from sklearn.datasets import load_boston

boston = load_boston()
# Check the size of the data.
print(boston.data.shape)
(506L, 13L)
# 多多弄懂数据特征的含义也是一个好习惯
print boston.feature_names
print boston.DESCR
['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
'B' 'LSTAT']
Boston House Prices dataset Notes
------
Data Set Characteristics: :Number of Instances: 506 :Number of Attributes: 13 numeric/categorical predictive :Median Value (attribute 14) is usually the target :Attribute Information (in order):
- CRIM per capita crime rate by town
- ZN proportion of residential land zoned for lots over 25,000 sq.ft.
- INDUS proportion of non-retail business acres per town
- CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
- NOX nitric oxides concentration (parts per 10 million)
- RM average number of rooms per dwelling
- AGE proportion of owner-occupied units built prior to 1940
- DIS weighted distances to five Boston employment centres
- RAD index of accessibility to radial highways
- TAX full-value property-tax rate per $10,000
- PTRATIO pupil-teacher ratio by town
- B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
- LSTAT % lower status of the population
- MEDV Median value of owner-occupied homes in $1000's :Missing Attribute Values: None :Creator: Harrison, D. and Rubinfeld, D.L. This is a copy of UCI ML housing dataset.
http://archive.ics.uci.edu/ml/datasets/Housing This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University. The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980. N.B. Various transformations are used in the table on
pages 244-261 of the latter. The Boston house-price data has been used in many machine learning papers that address regression
problems. **References** - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
- Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.
- many more! (see http://archive.ics.uci.edu/ml/datasets/Housing)
# One extra step: check whether the data is already normalized; usually it
# is not.
import numpy as np

print(np.max(boston.target))
print(np.min(boston.target))
print(np.mean(boston.target))
50.0
5.0
22.5328063241
from sklearn.cross_validation import train_test_split

# As before, split the data.
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size=0.25, random_state=33)

from sklearn.preprocessing import StandardScaler

# Standardize so that widely differing raw feature scales do not end up as
# unevenly weighted parameters after training.
scalerX = StandardScaler().fit(X_train)
X_train = scalerX.transform(X_train)
X_test = scalerX.transform(X_test)

# StandardScaler works on 2-D (n_samples, n_features) input, so the 1-D
# target is presented as a single column and flattened back afterwards.
# Both scalers are fitted on the training split only.
scalery = StandardScaler().fit(y_train.reshape(-1, 1))
y_train = scalery.transform(y_train.reshape(-1, 1)).ravel()
y_test = scalery.transform(y_test.reshape(-1, 1)).ravel()
# 先把评价模块写好,依然是默认5折交叉验证,只是这里的评价指标不再是精度,而是另一个函数R2,大体上,这个得分多少代表有多大百分比的回归结果可以被训练器覆盖和解释
from sklearn.cross_validation import *


def train_and_evaluate(clf, X_train, y_train):
    """Cross-validate ``clf`` with 5 shuffled folds and print the mean score.

    Parameters
    ----------
    clf : estimator passed straight to ``cross_val_score``.
    X_train : training samples; ``X_train.shape[0]`` gives the sample count.
    y_train : training targets.

    Nothing is returned. The printed label calls the score the coefficient of
    determination (R2), which is what the author expects for these regressors.
    """
    # Legacy KFold signature: number of samples, number of folds, shuffle flag.
    cv = KFold(X_train.shape[0], 5, shuffle=True, random_state=33)
    scores = cross_val_score(clf, X_train, y_train, cv=cv)
    label = 'Average coefficient of determination using 5-fold cross validation:'
    # Label and value separated by a single space, as a two-argument print
    # statement would emit them.
    print('%s %s' % (label, np.mean(scores)))


# Finally, let us see how many kinds of regression models can be used
# (there are in fact more).
# 比较有代表性的有3种
# 先用线性模型尝试, SGD_Regressor
from sklearn import linear_model
# 这里有一个正则化的选项penalty,目前13维特征也许不会有太大影响
clf_sgd = linear_model.SGDRegressor(loss='squared_loss', penalty=None, random_state=42)
train_and_evaluate(clf_sgd, X_train, y_train)
Average coefficient of determination using 5-fold cross validation: 0.710809853468
# 再换一个SGD_Regressor的penalty参数为l2,结果貌似影响不大,因为特征太少,正则化意义不大
clf_sgd_l2 = linear_model.SGDRegressor(loss='squared_loss', penalty='l2', random_state=42)
train_and_evaluate(clf_sgd_l2, X_train, y_train)
Average coefficient of determination using 5-fold cross validation: 0.71081206667
# 再看看SVM的regressor怎么样(都是默认参数),
from sklearn.svm import SVR
# 使用线性核没有啥子提升,但是因为特征少,所以可以考虑升高维度
clf_svr = SVR(kernel='linear')
train_and_evaluate(clf_svr, X_train, y_train)
Average coefficient of determination using 5-fold cross validation: 0.707838419194
clf_svr_poly = SVR(kernel='poly')
# 升高维度,效果明显,但是此招慎用@@,特征高的话, CPU还是受不了,内存倒是小事。其实到了现在,连我们自己都没办法直接解释这些特征的具体含义了。
train_and_evaluate(clf_svr_poly, X_train, y_train)
Average coefficient of determination using 5-fold cross validation: 0.779288545488
clf_svr_rbf = SVR(kernel='rbf')
# RBF (径向基核更是牛逼!)
train_and_evaluate(clf_svr_rbf, X_train, y_train)
Average coefficient of determination using 5-fold cross validation: 0.833662221567
# 再来个更猛的! 极限回归森林,放大招了!!!
from sklearn import ensemble
clf_et = ensemble.ExtraTreesRegressor()
train_and_evaluate(clf_et, X_train, y_train)
Average coefficient of determination using 5-fold cross validation: 0.853006383633
# 最后看看在测试集上的表现
clf_et.fit(X_train, y_train)
clf_et.score(X_test, y_test)
0.83781467779895469
import numpy as np
# 先热个身,牛刀小试
M = np.array([[1, 2], [2, 4]])
M
array([[1, 2],
[2, 4]])
np.linalg.matrix_rank(M, tol=None)
# 获取M矩阵的秩=1
1
# 载入手写数字的图像像素数据。对于图像处理,除了后续的各种启发式提取有效特征以外,
# 最直接常用的就是像素数据,每个像素都是一个数值,反映颜色。
from sklearn.datasets import load_digits
digits = load_digits()
# 这些经典数据的存储格式非常统一。这是好习惯,统一了接口,也便于快速使用。
digits
{'DESCR': " Optical Recognition of Handwritten Digits Data Set\n\nNotes\n-----\nData Set Characteristics:\n :Number of Instances: 5620\n :Number of Attributes: 64\n :Attribute Information: 8x8 image of integer pixels in the range 0..16.\n :Missing Attribute Values: None\n :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)\n :Date: July; 1998\n\nThis is a copy of the test set of the UCI ML hand-written digits datasets\nhttp://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits\n\nThe data set contains images of hand-written digits: 10 classes where\neach class refers to a digit.\n\nPreprocessing programs made available by NIST were used to extract\nnormalized bitmaps of handwritten digits from a preprinted form. From a\ntotal of 43 people, 30 contributed to the training set and different 13\nto the test set. 32x32 bitmaps are divided into nonoverlapping blocks of\n4x4 and the number of on pixels are counted in each block. This generates\nan input matrix of 8x8 where each element is an integer in the range\n0..16. This reduces dimensionality and gives invariance to small\ndistortions.\n\nFor info on NIST preprocessing routines, see M. D. Garris, J. L. Blue, G.\nT. Candela, D. L. Dimmick, J. Geist, P. J. Grother, S. A. Janet, and C.\nL. Wilson, NIST Form-Based Handprint Recognition System, NISTIR 5469,\n1994.\n\nReferences\n----------\n - C. Kaynak (1995) Methods of Combining Multiple Classifiers and Their\n Applications to Handwritten Digit Recognition, MSc Thesis, Institute of\n Graduate Studies in Science and Engineering, Bogazici University.\n - E. Alpaydin, C. Kaynak (1998) Cascading Classifiers, Kybernetika.\n - Ken Tang and Ponnuthurai N. Suganthan and Xi Yao and A. Kai Qin.\n Linear dimensionalityreduction using relevance weighted LDA. School of\n Electrical and Electronic Engineering Nanyang Technological University.\n 2005.\n - Claudio Gentile. A New Approximate Maximal Margin Classification\n Algorithm. NIPS. 2000.\n",
'data': array([[ 0., 0., 5., ..., 0., 0., 0.],
[ 0., 0., 0., ..., 10., 0., 0.],
[ 0., 0., 0., ..., 16., 9., 0.],
...,
[ 0., 0., 1., ..., 6., 0., 0.],
[ 0., 0., 2., ..., 12., 0., 0.],
[ 0., 0., 10., ..., 12., 1., 0.]]),
'images': array([[[ 0., 0., 5., ..., 1., 0., 0.],
[ 0., 0., 13., ..., 15., 5., 0.],
[ 0., 3., 15., ..., 11., 8., 0.],
...,
[ 0., 4., 11., ..., 12., 7., 0.],
[ 0., 2., 14., ..., 12., 0., 0.],
[ 0., 0., 6., ..., 0., 0., 0.]], [[ 0., 0., 0., ..., 5., 0., 0.],
[ 0., 0., 0., ..., 9., 0., 0.],
[ 0., 0., 3., ..., 6., 0., 0.],
...,
[ 0., 0., 1., ..., 6., 0., 0.],
[ 0., 0., 1., ..., 6., 0., 0.],
[ 0., 0., 0., ..., 10., 0., 0.]], [[ 0., 0., 0., ..., 12., 0., 0.],
[ 0., 0., 3., ..., 14., 0., 0.],
[ 0., 0., 8., ..., 16., 0., 0.],
...,
[ 0., 9., 16., ..., 0., 0., 0.],
[ 0., 3., 13., ..., 11., 5., 0.],
[ 0., 0., 0., ..., 16., 9., 0.]], ...,
[[ 0., 0., 1., ..., 1., 0., 0.],
[ 0., 0., 13., ..., 2., 1., 0.],
[ 0., 0., 16., ..., 16., 5., 0.],
...,
[ 0., 0., 16., ..., 15., 0., 0.],
[ 0., 0., 15., ..., 16., 0., 0.],
[ 0., 0., 2., ..., 6., 0., 0.]], [[ 0., 0., 2., ..., 0., 0., 0.],
[ 0., 0., 14., ..., 15., 1., 0.],
[ 0., 4., 16., ..., 16., 7., 0.],
...,
[ 0., 0., 0., ..., 16., 2., 0.],
[ 0., 0., 4., ..., 16., 2., 0.],
[ 0., 0., 5., ..., 12., 0., 0.]], [[ 0., 0., 10., ..., 1., 0., 0.],
[ 0., 2., 16., ..., 1., 0., 0.],
[ 0., 0., 15., ..., 15., 0., 0.],
...,
[ 0., 4., 16., ..., 16., 6., 0.],
[ 0., 8., 16., ..., 16., 8., 0.],
[ 0., 1., 8., ..., 12., 1., 0.]]]),
'target': array([0, 1, 2, ..., 8, 9, 8]),
'target_names': array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])}
# 老套路
X_digits, y_digits = digits.data, digits.target
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
# The key parameter is n_components: keep 2 principal components.
estimator = PCA(n_components=2)
X_pca = estimator.fit_transform(X_digits)
# (scikit-learn's interfaces are designed very uniformly.)

# Clustering problems often call for a direct view of the data, and making
# that possible is one immediate purpose of dimensionality reduction, so a
# few plots are shown here.


def plot_pca_scatter():
    """Scatter-plot the 2-D PCA projection of the digits, one colour per class.

    Reads the module-level ``X_pca``, ``y_digits`` and ``digits``; draws with
    ``plt`` and shows the figure. Nothing is returned.
    """
    colors = ['black', 'blue', 'purple', 'yellow', 'white', 'red', 'lime', 'cyan', 'orange', 'gray']
    # The position in the colour list doubles as the digit label to select.
    for i, color in enumerate(colors):
        px = X_pca[:, 0][y_digits == i]
        py = X_pca[:, 1][y_digits == i]
        plt.scatter(px, py, c=color)
    plt.legend(digits.target_names)
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')
    plt.show()


plot_pca_scatter()
# This part has the same effect as chapter 4 of the original book, but
# making full use of pandas expresses it more concisely, so the code was
# rewritten to be clearer and shorter.
import pandas as pd
import numpy as np

titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
# info() writes its summary itself; printing its return value adds the
# trailing "None" seen in the recorded output.
print(titanic.info())
# Still the same data set.
titanic.head()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1313 entries, 0 to 1312
Data columns (total 11 columns):
row.names 1313 non-null int64
pclass 1313 non-null object
survived 1313 non-null int64
name 1313 non-null object
age 633 non-null float64
embarked 821 non-null object
home.dest 754 non-null object
room 77 non-null object
ticket 69 non-null object
boat 347 non-null object
sex 1313 non-null object
dtypes: float64(1), int64(2), object(8)
memory usage: 123.1+ KB
None
# Drop the columns that are too specific to help find common patterns
# (row.names, name) and split off the column to predict.
y = titanic['survived']
X = titanic.drop(['row.names', 'name', 'survived'], axis=1)

# The continuous numeric feature (age) is completed with its mean. The
# result is assigned back rather than relying on an in-place fillna on the
# selected column, which does not reliably update X.
X['age'] = X['age'].fillna(X['age'].mean())
# Every remaining gap is marked 'UNKNOWN'.
X = X.fillna('UNKNOWN')

# The remaining categorical data is vectorized directly; this way a blank
# value in a column also becomes a feature of its own.
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer()
# 'records' is the full name of the orient value; the abbreviated 'record'
# is not accepted by newer pandas releases.
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
X_test = vec.transform(X_test.to_dict(orient='records'))
print(len(vec.feature_names_))
474
X_train.toarray()
array([[ 31.19418104, 0. , 0. , ..., 0. ,
0. , 1. ],
[ 31.19418104, 0. , 0. , ..., 0. ,
0. , 0. ],
[ 31.19418104, 0. , 0. , ..., 0. ,
0. , 1. ],
...,
[ 12. , 0. , 0. , ..., 0. ,
0. , 1. ],
[ 18. , 0. , 0. , ..., 0. ,
0. , 1. ],
[ 31.19418104, 0. , 0. , ..., 0. ,
0. , 1. ]])
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train, y_train)
# Score on the held-out split (echoed when run interactively).
dt.score(X_test, y_test)
# 采用所有特征的测试精度
0.81762917933130697
from sklearn import feature_selection

# Keep the top 20% of features as ranked by the chi-squared score function.
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=20)
X_train_fs = fs.fit_transform(X_train, y_train)
dt.fit(X_train_fs, y_train)
# The selector fitted on the training split is reused for the test split.
X_test_fs = fs.transform(X_test)
dt.score(X_test_fs, y_test)
# 采用20%高预测性特征的测试精度
0.82370820668693012
from sklearn.cross_validation import cross_val_score

# Try every odd percentile from 1 to 99 and record the mean 5-fold score.
percentiles = range(1, 100, 2)
mean_scores = []
for i in percentiles:
    fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=i)
    # NOTE(review): the selector is fitted on the whole training split before
    # cross-validation, so each validation fold has influenced the selection.
    X_train_fs = fs.fit_transform(X_train, y_train)
    scores = cross_val_score(dt, X_train_fs, y_train, cv=5)
    mean_scores.append(scores.mean())
# Collected in a list and converted once instead of np.append per iteration.
results = np.array(mean_scores)
print(results)

opt = np.where(results == results.max())[0]
# opt is an index array; its first entry is used so a tie still yields one
# value. The number printed is the best percentile.
print('Optimal number of features %d' % percentiles[opt[0]])

import pylab as pl

pl.plot(percentiles, results)
pl.show()
[ 0.85063904 0.85673057 0.87501546 0.88622964 0.86590394 0.87097506
0.87303649 0.86997526 0.87097506 0.87300557 0.86997526 0.86893424
0.87098536 0.86490414 0.86385281 0.86791383 0.86488353 0.86892393
0.86791383 0.86284271 0.86487322 0.86792414 0.86894455 0.87303649
0.86892393 0.86998557 0.86689342 0.86488353 0.86895485 0.86689342
0.87198516 0.8638322 0.86488353 0.87402597 0.87299526 0.87098536
0.86997526 0.86892393 0.86794475 0.86486291 0.87096475 0.86587302
0.86387343 0.86083282 0.86589363 0.8608019 0.86492476 0.85774067
0.8608122 0.85779221]
Optimal number of features 7
from sklearn import feature_selection

# Refit with the percentile found by the search (7).
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=7)
X_train_fs = fs.fit_transform(X_train, y_train)
dt.fit(X_train_fs, y_train)
# The selector fitted on the training split is reused for the test split.
X_test_fs = fs.transform(X_test)
dt.score(X_test_fs, y_test)
# 选取搜索到的最好特征比例的测试精度
0.8571428571428571
# 由此可见,这个技术对于工程上提升精度还是非常有帮助的。
from sklearn.datasets import fetch_20newsgroups
import numpy as np
news = fetch_20newsgroups(subset='all')
# First the single-process version of grid search.
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(news.data[:3000], news.target[:3000], test_size=0.25, random_state=33)

from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

clf = Pipeline([('vect', TfidfVectorizer(stop_words='english', analyzer='word')), ('svc', SVC())])

# Two hyper-parameters are searched, with 4 and 3 candidate values:
# svc__gamma takes 10^-2, 10^-1, 10^0, 10^1 and svc__C takes 10^-1, 10^0,
# 10^1. That gives 12 combinations, i.e. 12 differently configured models.
parameters = {'svc__gamma': np.logspace(-2, 1, 4), 'svc__C': np.logspace(-1, 1, 3)}

# Each model is cross-validated 3 times, so 36 fits in total; according to
# the log that follows, a fit takes about 5 seconds single-threaded.
gs = GridSearchCV(clf, parameters, verbose=2, refit=True, cv=3)

# Plain-Python replacement for the IPython "%time" magic used in the
# original notebook cell.
import time

start = time.time()
_ = gs.fit(X_train, y_train)
print('Wall time: %.1f s' % (time.time() - start))

# Best parameter combination and its cross-validated score (only displayed
# when evaluated interactively).
gs.best_params_, gs.best_score_
print(gs.score(X_test, y_test))
Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] svc__gamma=0.01, svc__C=0.1 .....................................
[CV] ............................ svc__gamma=0.01, svc__C=0.1 - 5.1s
[CV] svc__gamma=0.01, svc__C=0.1 .....................................
[CV] ............................ svc__gamma=0.01, svc__C=0.1 - 5.3s
[CV] svc__gamma=0.01, svc__C=0.1 .....................................
[CV] ............................ svc__gamma=0.01, svc__C=0.1 - 5.2s
[CV] svc__gamma=0.1, svc__C=0.1 ......................................
[CV] ............................. svc__gamma=0.1, svc__C=0.1 - 5.1s
[CV] svc__gamma=0.1, svc__C=0.1 ......................................
[CV] ............................. svc__gamma=0.1, svc__C=0.1 - 5.2s
[CV] svc__gamma=0.1, svc__C=0.1 ......................................
[CV] ............................. svc__gamma=0.1, svc__C=0.1 - 5.3s
[CV] svc__gamma=1.0, svc__C=0.1 ......................................
[CV] ............................. svc__gamma=1.0, svc__C=0.1 - 5.7s
[CV] svc__gamma=1.0, svc__C=0.1 ......................................
[CV] ............................. svc__gamma=1.0, svc__C=0.1 - 5.8s
[CV] svc__gamma=1.0, svc__C=0.1 ......................................
[CV] ............................. svc__gamma=1.0, svc__C=0.1 - 5.9s
[CV] svc__gamma=10.0, svc__C=0.1 .....................................
[CV] ............................ svc__gamma=10.0, svc__C=0.1 - 5.4s
[CV] svc__gamma=10.0, svc__C=0.1 .....................................
[CV] ............................ svc__gamma=10.0, svc__C=0.1 - 5.5s
[CV] svc__gamma=10.0, svc__C=0.1 .....................................
[CV] ............................ svc__gamma=10.0, svc__C=0.1 - 5.5s
[CV] svc__gamma=0.01, svc__C=1.0 .....................................
[CV] ............................ svc__gamma=0.01, svc__C=1.0 - 5.2s
[CV] svc__gamma=0.01, svc__C=1.0 .....................................
[CV] ............................ svc__gamma=0.01, svc__C=1.0 - 5.3s
[CV] svc__gamma=0.01, svc__C=1.0 .....................................
[CV] ............................ svc__gamma=0.01, svc__C=1.0 - 5.3s
[CV] svc__gamma=0.1, svc__C=1.0 ......................................
[CV] ............................. svc__gamma=0.1, svc__C=1.0 - 5.2s
[CV] svc__gamma=0.1, svc__C=1.0 ......................................
[CV] ............................. svc__gamma=0.1, svc__C=1.0 - 5.3s
[CV] svc__gamma=0.1, svc__C=1.0 ......................................
[CV] ............................. svc__gamma=0.1, svc__C=1.0 - 5.4s
[CV] svc__gamma=1.0, svc__C=1.0 ......................................
[CV] ............................. svc__gamma=1.0, svc__C=1.0 - 5.3s
[CV] svc__gamma=1.0, svc__C=1.0 ......................................
[CV] ............................. svc__gamma=1.0, svc__C=1.0 - 5.4s
[CV] svc__gamma=1.0, svc__C=1.0 ......................................
[CV] ............................. svc__gamma=1.0, svc__C=1.0 - 5.5s
[CV] svc__gamma=10.0, svc__C=1.0 .....................................
[CV] ............................ svc__gamma=10.0, svc__C=1.0 - 5.4s
[CV] svc__gamma=10.0, svc__C=1.0 .....................................
[CV] ............................ svc__gamma=10.0, svc__C=1.0 - 5.3s
[CV] svc__gamma=10.0, svc__C=1.0 .....................................
[CV] ............................ svc__gamma=10.0, svc__C=1.0 - 5.4s
[CV] svc__gamma=0.01, svc__C=10.0 ....................................
[CV] ........................... svc__gamma=0.01, svc__C=10.0 - 5.2s
[CV] svc__gamma=0.01, svc__C=10.0 ....................................
[CV] ........................... svc__gamma=0.01, svc__C=10.0 - 5.2s
[CV] svc__gamma=0.01, svc__C=10.0 ....................................
[CV] ........................... svc__gamma=0.01, svc__C=10.0 - 5.3s
[CV] svc__gamma=0.1, svc__C=10.0 .....................................
[CV] ............................ svc__gamma=0.1, svc__C=10.0 - 5.3s
[CV] svc__gamma=0.1, svc__C=10.0 .....................................
[CV] ............................ svc__gamma=0.1, svc__C=10.0 - 5.4s
[CV] svc__gamma=0.1, svc__C=10.0 .....................................
[CV] ............................ svc__gamma=0.1, svc__C=10.0 - 5.4s
[CV] svc__gamma=1.0, svc__C=10.0 .....................................
[CV] ............................ svc__gamma=1.0, svc__C=10.0 - 5.3s
[CV] svc__gamma=1.0, svc__C=10.0 .....................................
[CV] ............................ svc__gamma=1.0, svc__C=10.0 - 5.5s
[CV] svc__gamma=1.0, svc__C=10.0 .....................................
[CV] ............................ svc__gamma=1.0, svc__C=10.0 - 5.7s
[CV] svc__gamma=10.0, svc__C=10.0 ....................................
[CV] ........................... svc__gamma=10.0, svc__C=10.0 - 5.6s
[CV] svc__gamma=10.0, svc__C=10.0 ....................................
[CV] ........................... svc__gamma=10.0, svc__C=10.0 - 5.6s
[CV] svc__gamma=10.0, svc__C=10.0 ....................................
[CV] ........................... svc__gamma=10.0, svc__C=10.0 - 5.9s
[Parallel(n_jobs=1)]: Done 1 jobs | elapsed: 5.1s
[Parallel(n_jobs=1)]: Done 36 out of 36 | elapsed: 3.3min finished
Wall time: 3min 27s
0.822666666667
# Now search in parallel and observe how the running time improves.
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(news.data[:3000], news.target[:3000], test_size=0.25, random_state=33)

from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

clf = Pipeline([('vect', TfidfVectorizer(stop_words='english', analyzer='word')), ('svc', SVC())])

parameters = {'svc__gamma': np.logspace(-2, 1, 4), 'svc__C': np.logspace(-1, 1, 3)}

# Same search as the single-job run; n_jobs=-1 is the only difference.
gs = GridSearchCV(clf, parameters, verbose=2, refit=True, cv=3, n_jobs=-1)

# Plain-Python replacement for the IPython "%time" magic used in the
# original notebook cell.
import time

start = time.time()
_ = gs.fit(X_train, y_train)
print('Wall time: %.1f s' % (time.time() - start))

# Best parameter combination and its cross-validated score (only displayed
# when evaluated interactively).
gs.best_params_, gs.best_score_
print(gs.score(X_test, y_test))
# 并行化寻找最优的超参数配置,同样获得相同的最优解,但是训练耗时基本上随着CPU核的数量成倍减少。
[Parallel(n_jobs=-1)]: Done 1 jobs | elapsed: 8.4s
[Parallel(n_jobs=-1)]: Done 22 out of 36 | elapsed: 30.3s remaining: 19.2s
[Parallel(n_jobs=-1)]: Done 36 out of 36 | elapsed: 46.8s finished
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Wall time: 56.5 s
0.822666666667
# 这里需要补充的是得到这个结果的机器的配置,好让读者有一个对并行计算更好的了解。
'''
CPU: i7 四核 2.4Ghz
Memory: DDR3 1600 32GB '''
Python机器学习实践与Kaggle实战(转)的更多相关文章
- 机器学习实践:《Python机器学习实践指南》中文PDF+英文PDF+代码
机器学习是近年来渐趋热门的一个领域,同时Python 语言经过一段时间的发展也已逐渐成为主流的编程语言之一。《Python机器学习实践指南》结合了机器学习和Python 语言两个热门的领域 ...
- Python机器学习实践指南pdf (中文版带书签)、原书代码、数据集
Python机器学习实践指南 目 录 第1章Python机器学习的生态系统 1 1.1 数据科学/机器学习的工作 流程 2 1.1.1 获取 2 1.1.2 检查和探索 2 1.1.3 清理和准备 3 ...
- Python机器学习实践:决策树判别汽车金融违约用户
文章发布于公号[数智物语] (ID:decision_engine),关注公号不错过每一篇干货. 转自 | 法纳斯特(公众号ID:walker398) 作者 | 小F 决策树呈树形结构,是一种基本的回 ...
- Python并发实践_03_并发实战之一
16S数据质控流程,一次下机lane包括很多的项目,每个项目有独立的合同号,一个项目可能包含16S或者ITS两种,通过一个完整的pipeline,将上游拆分好的数据全部整理成可以直接分析的数据.原本这 ...
- python 机器学习实践入门
机器学习概念概念 机器 学习是计算机科学的一个分支,从模式识别.人工智能和计算学习理论发展而来,我们可以将其作为数据挖掘的工具 侧重用于数据分析方法理解给定的数据 目的是:开发能够从先前观测的数据,通 ...
- 《Python机器学习及实践:从零开始通往Kaggle竞赛之路》
《Python 机器学习及实践——从零开始通往Kaggle竞赛之路》很基础,主要介绍了Scikit-learn,顺带介绍了pandas、numpy、matplotlib、scipy。本书代 ...
- Python机器学习及实践_从零开始通往KAGGLE竞赛之路PDF高清完整版免费下载|百度云盘|Python基础教程免费电子书
点击获取提取码:i5nw Python机器学习及实践面向所有对机器学习与数据挖掘的实践及竞赛感兴趣的读者,从零开始,以Python编程语言为基础,在不涉及大量数学模型与复杂编程知识的前提下,逐步带领读 ...
- 《PYTHON机器学习及实践-从零开始通往KAGGLE竞赛之路》 分享下载
转: <PYTHON机器学习及实践-从零开始通往KAGGLE竞赛之路> 分享下载 书籍信息 书名: PYTHON机器学习及实践-从零开始通往KAGGLE竞赛之路 标签: PYTHON机器学 ...
- Python机器学习及实践 课后小题
目录 第二章 2.3章末小结 @(Python机器学习及实践-----从零开始通往Kaggle竞赛之路) 第二章 2.3章末小结 1 机器学习模型按照使用的数据类型,可分为监督学习和无监督学习两大类. ...
随机推荐
- Codeforces Round #397 by Kaspersky Lab and Barcelona Bootcamp (Div. 1 + Div. 2 combined) A. Neverending competitions 水题
A. Neverending competitions 题目连接: http://codeforces.com/contest/765/problem/A Description There are ...
- API测试利器postMan 使用教程
自从开始做API开发之后,我就在寻找合适的API测试工具.一开始不是很想用Chrome扩展,用的 WizTools 的工具,后来试过一次 Postman 之后就停不下来了,还买了付费的Jetpacks ...
- 使用 IntraWeb (2) - Hello IntraWeb
IntraWeb 比我相像中的更贴近 VCL, 传统的非可视组件在这里大都可用(其内部很多复合属性是 TStringList 类型的), 它的诸多可视控件也是从 TControl 继承下来的. 这或许 ...
- .net core中的System.Buffers名字空间
最近研究了一下.net core 2.1的基础类库,发现它引入了一个System.Buffers名字空间,里面提供了一系列比较实用的对象,便简单的管中窥豹浏览一下. ArrayPool<T> ...
- Cocos2d-x 3.0游戏开发之虚拟机IOS环境:匹配才是好,莫要随便升级软件
尊重开发人员的劳动成果.转载的时候请务必注明出处:http://blog.csdn.net/haomengzhu/article/details/34110449 做为一个买不起MAC的Coder,仅 ...
- 读写分离,读写分离死锁解决方案,事务发布死锁解决方案,发布订阅死锁解决方案|事务(进程 ID *)与另一个进程被死锁在 锁 资源上,并且已被选作死锁牺牲品。请重新运行该事务
前言: 由于网站访问压力的问题,综合分析各种因素后结合实际情况,采用数据库读写分离模式来解决当前问题.实际方案中采用“事务发布”模式实现主数据库和只读数据库的同步,其中: 发布服务器1 ...
- Revit API判断直线相交关系移动风管
start ) ); )) )) ); XYZ xyz12 = lCurve1.Curve.get_EndPoint(); XY ...
- ZooKeeper_客户端工具zkCli.sh使用
#一.命令 [root@VM_31_182_centos bin]# ./zkCli.sh -server 127.0.0.1:2181 #二.帮助命令 help #三.创建.修改.删除.退出de ...
- Oracle初级索引学习总结
前言 索引是常见的数据库对象,建立索引的目的是为了提高记录的检索速度.它的设置好坏,使用是否得当,极大地影响数据库应用程序和Database的性能.虽然有许多资料讲索引的用法,DBA和Develop ...
- 使用模拟对象(Mock Object)技术进行测试驱动开发
敏捷开发 敏捷软件开发又称敏捷开发,是一种从上世纪 90 年代开始引起开发人员注意的新型软件开发方法.和传统瀑布式开发方法对比,敏捷开发强调的是在几周或者几个月很短的时间周期,完成相对较小功能,并交付 ...