Python Machine Learning in Practice and Kaggle in Action (repost)
https://mlnote.wordpress.com/2015/12/16/python%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E5%AE%9E%E8%B7%B5%E4%B8%8Ekaggle%E5%AE%9E%E6%88%98-machine-learning-for-kaggle-competition-in-python/
Author: Miao Fan (范淼), Ph.D. candidate in Computer Science.
Affiliation: Tsinghua University / New York University
[C.V.] [Google Scholar] [Special Talk in NYU]
Email: fanmiao.cslt.thu@gmail.com
- from sklearn.datasets import load_iris
- from sklearn.cross_validation import train_test_split
- from sklearn import preprocessing
- # Load the iris dataset
- iris = load_iris()
- # Separate the features and the labels
- X_iris, y_iris = iris.data, iris.target
- # Use only the first two columns as features
- X, y = X_iris[:, :2], y_iris
- # Hold out 25% of the data as the test set
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 33)
- # Standardize the raw features; this step matters quite a bit but is often overlooked by competitors
- scaler = preprocessing.StandardScaler()
- X_train = scaler.fit_transform(X_train)
- X_test = scaler.transform(X_test)
- from sklearn.linear_model import SGDClassifier
- # Use an SGD classifier: its parameters are estimated by stochastic gradient descent, which suits large-scale data
- clf = SGDClassifier()
- clf.fit(X_train, y_train)
- # Import the evaluation metrics module
- from sklearn import metrics
- y_train_predict = clf.predict(X_train)
- # In-sample check: accuracy measured on the training samples
- print metrics.accuracy_score(y_train, y_train_predict)
- # Proper out-of-sample test: accuracy measured on the held-out test samples
- y_predict = clf.predict(X_test)
- print metrics.accuracy_score(y_test, y_predict)
- 0.660714285714
- 0.684210526316
- # For a more detailed performance report (per-class precision, recall and f1-score), use the following function.
- print metrics.classification_report(y_test, y_predict, target_names = iris.target_names)
- precision recall f1-score support
- setosa 1.00 1.00 1.00 8
- versicolor 0.43 0.27 0.33 11
- virginica 0.65 0.79 0.71 19
- avg / total 0.66 0.68 0.66 38
- # To probe the SGDClassifier's performance more thoroughly we should make full use of the data: split it into N folds and let each fold serve once as the test set.
- from sklearn.cross_validation import cross_val_score, KFold
- from sklearn.pipeline import Pipeline
- from sklearn.preprocessing import StandardScaler
- # A Pipeline keeps the model construction compact; in general, data needs steps such as feature_extraction and preprocessing before the model is fit.
- # Default parameter settings are used here
- clf = Pipeline([('scaler', StandardScaler()), ('sgd_classifier', SGDClassifier())])
- # 5-fold cross-validation over the whole dataset
- cv = KFold(X.shape[0], 5, shuffle=True, random_state = 33)
- scores = cross_val_score(clf, X, y, cv=cv)
- print scores
- # Summarize overall performance: mean accuracy and its standard deviation
- print scores.mean(), scores.std()
- from scipy.stats import sem
- import numpy as np
- # The dispersion measure used here, the standard error of the mean, is not the same thing as the standard deviation above; see
- # http://www.graphpad.com/guides/prism/6/statistics/index.htm?stat_semandsdnotsame.htm
- print np.mean(scores), sem(scores)
- [ 0.56666667 0.73333333 0.83333333 0.76666667 0.8 ]
- 0.74 0.0928559218479
- 0.74 0.0464279609239
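- # scipy's sem is the standard error of the mean: the ddof=1 standard deviation divided by sqrt(n). With
- # n = 5 folds this works out to exactly half of the population standard deviation printed above
- # (0.0464 vs 0.0929). A minimal sanity-check sketch, reusing the five scores shown above:
- import numpy as np
- from scipy.stats import sem
- cv_scores = np.array([0.56666667, 0.73333333, 0.83333333, 0.76666667, 0.8])
- print(sem(cv_scores))                                    # ~0.0464
- print(cv_scores.std(ddof=1) / np.sqrt(len(cv_scores)))   # the same value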
- from sklearn.datasets import fetch_olivetti_faces
- # This dataset is not bundled with the package; functions like this download it on first use
- faces = fetch_olivetti_faces()
- # As shown here, the data is stored as a dict, the same layout as most of the bundled example datasets
- faces.keys()
- ['images', 'data', 'target', 'DESCR']
- # Check the data size via the shape attribute
- print faces.data.shape
- print faces.target.shape
- (400L, 4096L)
- (400L,)
- from sklearn.cross_validation import train_test_split
- from sklearn.svm import SVC
- # Split the data as before, 25% for testing
- X_train, X_test, y_train, y_test = train_test_split(faces.data, faces.target, test_size=0.25, random_state=0)
- from sklearn.cross_validation import cross_val_score, KFold
- from scipy.stats import sem
- # Build a helper function for cross-validating model performance
- def evaluate_cross_validation(clf, X, y, K):
- # KFold takes the following arguments: number of samples, number of folds, whether to shuffle
- cv = KFold(len(y), K, shuffle=True, random_state = 0)
- # Cross-validate with the split defined above; for classification the default score is accuracy, but other metrics can be specified
- scores = cross_val_score(clf, X, y, cv=cv)
- print scores
- print 'Mean score: %.3f (+/-%.3f)' % (scores.mean(), sem(scores))
- # SVC with a linear kernel (different kernels, discussed later, can give very different results)
- svc_linear = SVC(kernel='linear')
- # 5-fold cross-validation, K = 5
- evaluate_cross_validation(svc_linear, X_train, y_train, 5)
- [ 0.93333333 0.86666667 0.91666667 0.93333333 0.91666667]
- Mean score: 0.913 (+/-0.012)
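- # Version note: sklearn.cross_validation is the module name in the scikit-learn releases used in this post;
- # from 0.18 onward the same tools live in sklearn.model_selection, and KFold no longer takes the sample count
- # as an argument. A rough sketch of the helper above for a newer installation (the function name is illustrative):
- from sklearn.model_selection import KFold, cross_val_score
- from scipy.stats import sem
- def evaluate_cross_validation_new(clf, X, y, K):
-     # n_splits replaces the old (n, K) positional arguments
-     cv = KFold(n_splits=K, shuffle=True, random_state=0)
-     scores = cross_val_score(clf, X, y, cv=cv)
-     print(scores)
-     print('Mean score: %.3f (+/-%.3f)' % (scores.mean(), sem(scores)))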
- from sklearn.datasets import fetch_20newsgroups
- # Like the faces dataset above, the 20 newsgroups data is downloaded on demand by this fetch function
- news = fetch_20newsgroups(subset='all')
- # Inspect the data: again a dict-like container, with 18846 samples in total
- print len(news.data), len(news.target)
- print news.target
- 18846 18846
- [10 3 17 ..., 3 1 7]
- # Check the news category names and how many there are
- print news.target_names
- print news.target_names.__len__()
- ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
- 20
- # As before, hold out 25% of the data for testing
- from sklearn.cross_validation import train_test_split
- X_train, X_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.25)
- print X_train.__len__()
- print y_train.__len__()
- print X_test.__len__()
- 14134
- 14134
- 4712
- # Much raw data cannot be fed to a classifier directly: images can use pixel values as-is, but text must first be converted into numerical features
- from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
- from sklearn.naive_bayes import MultinomialNB
- from sklearn.pipeline import Pipeline
- from sklearn.cross_validation import *
- from scipy.stats import sem
- # With a Naive Bayes classifier fixed, compare the performance of several text feature extraction methods, using Pipeline to keep the training workflow compact
- clf_1 = Pipeline([('count_vec', CountVectorizer()), ('mnb', MultinomialNB())])
- clf_2 = Pipeline([('hash_vec', HashingVectorizer(non_negative=True)), ('mnb', MultinomialNB())])
- clf_3 = Pipeline([('tfidf_vec', TfidfVectorizer()), ('mnb', MultinomialNB())])
- # Build a helper function for cross-validating model performance
- def evaluate_cross_validation(clf, X, y, K):
- # KFold takes the following arguments: number of samples, K, whether to shuffle
- cv = KFold(len(y), K, shuffle=True, random_state = 0)
- # Cross-validate with the split defined above; for classification the default score is accuracy, but other metrics can be specified
- scores = cross_val_score(clf, X, y, cv=cv)
- print scores
- print 'Mean score: %.3f (+/-%.3f)' % (scores.mean(), sem(scores))
- clfs = [clf_1, clf_2, clf_3]
- for clf in clfs:
- evaluate_cross_validation(clf, X_train, y_train, 5)
- [ 0.83516095 0.83374602 0.84471171 0.83622214 0.83227176]
- Mean score: 0.836 (+/-0.002)
- [ 0.76052352 0.72727273 0.77538026 0.74778918 0.75194621]
- Mean score: 0.753 (+/-0.008)
- [ 0.84435798 0.83409975 0.85496993 0.84082066 0.83227176]
- Mean score: 0.841 (+/-0.004)
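- # To make the difference between these vectorizers concrete, here is a toy sketch with two made-up sentences:
- # CountVectorizer stores raw term counts, while TfidfVectorizer rescales them so that words occurring in every
- # document (like 'the') are down-weighted.
- from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
- toy_docs = ['the cat sat on the mat', 'the dog sat on the log']
- print(CountVectorizer().fit_transform(toy_docs).toarray())
- print(TfidfVectorizer().fit_transform(toy_docs).toarray())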
- # The results above show that the two commonly used feature extraction methods perform comparably. Let's keep one of them and try to improve performance through finer feature filtering.
- clf_4 = Pipeline([('tfidf_vec_adv', TfidfVectorizer(stop_words='english')), ('mnb', MultinomialNB())])
- evaluate_cross_validation(clf_4, X_train, y_train, 5)
- [ 0.87053414 0.86664308 0.887867 0.87371772 0.86553432]
- Mean score: 0.873 (+/-0.004)
- # Tuning the Naive Bayes smoothing parameter as well may push performance up another notch.
- clf_5 = Pipeline([('tfidf_vec_adv', TfidfVectorizer(stop_words='english')), ('mnb', MultinomialNB(alpha=0.01))])
- evaluate_cross_validation(clf_5, X_train, y_train, 5)
- [ 0.90060134 0.89741776 0.91651928 0.90909091 0.90410474]
- Mean score: 0.906 (+/-0.003)
- # To make data handling easier, we bring in a new toolkit: pandas
- import pandas as pd
- import numpy as np
- titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
- # Take a look at the data: it has a bit of everything, numeric features, categorical ones, strings, and even missing values.
- titanic.head()
- # With pandas the data lives in its DataFrame format (a 2-D table); calling info() gives a quick profile of the columns
- titanic.info()
- <class 'pandas.core.frame.DataFrame'>
- Int64Index: 1313 entries, 0 to 1312
- Data columns (total 11 columns):
- row.names 1313 non-null int64
- pclass 1313 non-null object
- survived 1313 non-null int64
- name 1313 non-null object
- age 633 non-null float64
- embarked 821 non-null object
- home.dest 754 non-null object
- room 77 non-null object
- ticket 69 non-null object
- boat 347 non-null object
- sex 1313 non-null object
- dtypes: float64(1), int64(2), object(8)
- memory usage: 123.1+ KB
- # This is real personal and boarding information for Titanic passengers, which we can use to predict whether each passenger survived.
- # There are 1313 records in total; some columns are complete (e.g. pclass, survived, name) and some have missing values; some are numeric (age: float64) and others are strings.
- # Feature selection is a step that beginners tend to underrate; it is time-consuming but very important and relies on domain knowledge. From what we know of the disaster, sex, age and pclass are all likely to be key factors in survival.
- # we keep pclass, age, sex.
- X = titanic[['pclass', 'age', 'sex']]
- y = titanic['survived']
- X.info()
- <class 'pandas.core.frame.DataFrame'>
- Int64Index: 1313 entries, 0 to 1312
- Data columns (total 3 columns):
- pclass 1313 non-null object
- age 633 non-null float64
- sex 1313 non-null object
- dtypes: float64(1), object(2)
- memory usage: 41.0+ KB
- # A few data preparation tasks follow:
- # 1) the age column has only 633 values
- # 2) sex and pclass are categorical and must be converted into numeric 0/1 features
- # First fill in the missing age values; using the mean or the median is the strategy that biases the model least
- X['age'].fillna(X['age'].mean(), inplace=True)
- C:\Anaconda2\lib\site-packages\pandas\core\generic.py:2748: SettingWithCopyWarning:
- A value is trying to be set on a copy of a slice from a DataFrame
- See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
- self._update_inplace(new_data)
- X.info()
- <class 'pandas.core.frame.DataFrame'>
- Int64Index: 1313 entries, 0 to 1312
- Data columns (total 3 columns):
- pclass 1313 non-null object
- age 1313 non-null float64
- sex 1313 non-null object
- dtypes: float64(1), object(2)
- memory usage: 41.0+ KB
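- # The SettingWithCopyWarning above is raised because X is a slice of the titanic DataFrame. The fillna still
- # takes effect here, but a cleaner pattern is to take an explicit copy before modifying; a sketch
- # (X_clean is just an illustrative name, not used elsewhere in this post):
- X_clean = titanic[['pclass', 'age', 'sex']].copy()
- X_clean['age'] = X_clean['age'].fillna(X_clean['age'].mean())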
- from sklearn.cross_validation import train_test_split
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 33)
- # Use scikit-learn's feature_extraction module
- from sklearn.feature_extraction import DictVectorizer
- vec = DictVectorizer(sparse=False)
- X_train = vec.fit_transform(X_train.to_dict(orient='record'))
- print vec.feature_names_
- # Notice that every categorical value is split out into its own column, while numeric features are left untouched
- ['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'sex=female', 'sex=male']
- X_test = vec.transform(X_test.to_dict(orient='record'))
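- # A tiny made-up example of what DictVectorizer does: every categorical value gets its own 0/1 column, while
- # numeric values pass through unchanged (toy_rows below is illustrative data, not from the Titanic set):
- from sklearn.feature_extraction import DictVectorizer
- toy_vec = DictVectorizer(sparse=False)
- toy_rows = [{'pclass': '1st', 'age': 29.0, 'sex': 'female'}, {'pclass': '3rd', 'age': 2.0, 'sex': 'male'}]
- print(toy_vec.fit_transform(toy_rows))
- print(toy_vec.feature_names_)  # ['age', 'pclass=1st', 'pclass=3rd', 'sex=female', 'sex=male']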
- from sklearn.tree import DecisionTreeClassifier
- dtc = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5)
- dtc.fit(X_train, y_train)
- dtc.score(X_test, y_test)
- 0.79331306990881456
- from sklearn.ensemble import RandomForestClassifier
- rfc = RandomForestClassifier(max_depth=3, min_samples_leaf=5)
- rfc.fit(X_train, y_train)
- rfc.score(X_test, y_test)
- 0.77203647416413379
- from sklearn.ensemble import GradientBoostingClassifier
- gbc = GradientBoostingClassifier(max_depth=3, min_samples_leaf=5)
- gbc.fit(X_train, y_train)
- gbc.score(X_test, y_test)
- 0.79027355623100304
- from sklearn.metrics import classification_report
- y_predict = gbc.predict(X_test)
- print classification_report(y_predict, y_test)
- # This function produces a per-class performance report (precision, recall); the metrics come from the binary setting, where one class is treated as positive. Note that classification_report conventionally expects the ground-truth labels as its first argument.
- precision recall f1-score support
- 0 0.93 0.78 0.84 241
- 1 0.57 0.83 0.68 88
- avg / total 0.83 0.79 0.80 329
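- # For reference, precision and recall can be recomputed by hand from the confusion counts; a minimal sketch
- # for the positive class "survived" (1), assuming y_test and y_predict from above are still in scope:
- import numpy as np
- y_true_arr = np.asarray(y_test)
- tp = np.sum((y_predict == 1) & (y_true_arr == 1))  # true positives
- fp = np.sum((y_predict == 1) & (y_true_arr == 0))  # false positives
- fn = np.sum((y_predict == 0) & (y_true_arr == 1))  # false negatives
- print('precision = %.2f' % (float(tp) / (tp + fp)))
- print('recall    = %.2f' % (float(tp) / (tp + fn)))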
- # First, load the house-price data
- from sklearn.datasets import load_boston
- boston = load_boston()
- # Check the data size
- print boston.data.shape
- (506L, 13L)
- # Getting familiar with what each feature means is another good habit
- print boston.feature_names
- print boston.DESCR
- ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
- 'B' 'LSTAT']
- Boston House Prices dataset
- Notes
- ------
- Data Set Characteristics:
- :Number of Instances: 506
- :Number of Attributes: 13 numeric/categorical predictive
- :Median Value (attribute 14) is usually the target
- :Attribute Information (in order):
- - CRIM per capita crime rate by town
- - ZN proportion of residential land zoned for lots over 25,000 sq.ft.
- - INDUS proportion of non-retail business acres per town
- - CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
- - NOX nitric oxides concentration (parts per 10 million)
- - RM average number of rooms per dwelling
- - AGE proportion of owner-occupied units built prior to 1940
- - DIS weighted distances to five Boston employment centres
- - RAD index of accessibility to radial highways
- - TAX full-value property-tax rate per $10,000
- - PTRATIO pupil-teacher ratio by town
- - B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
- - LSTAT % lower status of the population
- - MEDV Median value of owner-occupied homes in $1000's
- :Missing Attribute Values: None
- :Creator: Harrison, D. and Rubinfeld, D.L.
- This is a copy of UCI ML housing dataset.
- http://archive.ics.uci.edu/ml/datasets/Housing
- This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.
- The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
- prices and the demand for clean air', J. Environ. Economics & Management,
- vol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics
- ...', Wiley, 1980. N.B. Various transformations are used in the table on
- pages 244-261 of the latter.
- The Boston house-price data has been used in many machine learning papers that address regression
- problems.
- **References**
- - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
- - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.
- - many more! (see http://archive.ics.uci.edu/ml/datasets/Housing)
- # One extra step here: check whether the data is already normalized (usually it is not)
- import numpy as np
- print np.max(boston.target)
- print np.min(boston.target)
- print np.mean(boston.target)
- 50.0
- 5.0
- 22.5328063241
- from sklearn.cross_validation import train_test_split
- # As usual, split the data
- X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size = 0.25, random_state=33)
- from sklearn.preprocessing import StandardScaler
- # Standardization prevents features with very different scales from distorting the learned parameter weights
- scalerX = StandardScaler().fit(X_train)
- X_train = scalerX.transform(X_train)
- X_test = scalerX.transform(X_test)
- scalery = StandardScaler().fit(y_train)
- y_train = scalery.transform(y_train)
- y_test = scalery.transform(y_test)
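- # Version note: newer scikit-learn releases require 2-D input to StandardScaler, so the target-scaling lines
- # above would raise an error there. A hedged sketch of the equivalent (these three lines replace, not follow,
- # the y-scaling above):
- scalery = StandardScaler().fit(y_train.reshape(-1, 1))
- y_train = scalery.transform(y_train.reshape(-1, 1)).ravel()
- y_test = scalery.transform(y_test.reshape(-1, 1)).ravel()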
- # Write the evaluation helper first: still 5-fold cross-validation by default, but the metric is no longer accuracy; it is the R^2 score, which roughly tells what fraction of the target's variance the regressor can explain
- from sklearn.cross_validation import *
- def train_and_evaluate(clf, X_train, y_train):
- cv = KFold(X_train.shape[0], 5, shuffle=True, random_state=33)
- scores = cross_val_score(clf, X_train, y_train, cv=cv)
- print 'Average coefficient of determination using 5-fold cross validation:', np.mean(scores)
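- # R^2 is 1 minus the ratio of the residual sum of squares to the total sum of squares; a quick sketch of
- # computing it by hand (r2_by_hand is an illustrative helper, not part of the original post):
- import numpy as np
- def r2_by_hand(y_true, y_pred):
-     ss_res = np.sum((np.asarray(y_true) - np.asarray(y_pred)) ** 2)
-     ss_tot = np.sum((np.asarray(y_true) - np.mean(y_true)) ** 2)
-     return 1.0 - ss_res / ss_tot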
- # Finally, let's see some of the regression models that can be used (there are many more).
- # Three representative families are tried here
- # Start with a linear model, SGDRegressor
- from sklearn import linear_model
- # The penalty option controls regularization; with only 13 features it probably makes little difference
- clf_sgd = linear_model.SGDRegressor(loss='squared_loss', penalty=None, random_state=42)
- train_and_evaluate(clf_sgd, X_train, y_train)
- Average coefficient of determination using 5-fold cross validation: 0.710809853468
- # Switch the SGDRegressor penalty to l2: the result barely changes, since with so few features regularization adds little
- clf_sgd_l2 = linear_model.SGDRegressor(loss='squared_loss', penalty='l2', random_state=42)
- train_and_evaluate(clf_sgd_l2, X_train, y_train)
- Average coefficient of determination using 5-fold cross validation: 0.71081206667
- # Now try SVM regressors (all with default parameters)
- from sklearn.svm import SVR
- # The linear kernel gives no real improvement, but since there are few features, mapping to a higher-dimensional space is worth trying
- clf_svr = SVR(kernel='linear')
- train_and_evaluate(clf_svr, X_train, y_train)
- Average coefficient of determination using 5-fold cross validation: 0.707838419194
- clf_svr_poly = SVR(kernel='poly')
- # Raising the dimensionality clearly helps, but use this trick with care: with many features the CPU cost explodes (memory is the lesser worry). By this point even we can no longer directly interpret what the transformed features mean.
- train_and_evaluate(clf_svr_poly, X_train, y_train)
- Average coefficient of determination using 5-fold cross validation: 0.779288545488
- clf_svr_rbf = SVR(kernel='rbf')
- # The RBF (radial basis function) kernel does even better!
- train_and_evaluate(clf_svr_rbf, X_train, y_train)
- Average coefficient of determination using 5-fold cross validation: 0.833662221567
- # One more heavyweight: extremely randomized trees for regression!
- from sklearn import ensemble
- clf_et = ensemble.ExtraTreesRegressor()
- train_and_evaluate(clf_et, X_train, y_train)
- Average coefficient of determination using 5-fold cross validation: 0.853006383633
- # Finally, check performance on the test set
- clf_et.fit(X_train, y_train)
- clf_et.score(X_test, y_test)
- 0.83781467779895469
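- # Tree ensembles also expose per-feature importances, which helps interpret an otherwise black-box model;
- # a short sketch using the fitted regressor above:
- import numpy as np
- order = np.argsort(clf_et.feature_importances_)[::-1]
- for idx in order:
-     print('%s: %.3f' % (boston.feature_names[idx], clf_et.feature_importances_[idx]))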
- import numpy as np
- # A quick warm-up first
- M = np.array([[1, 2], [2, 4]])
- M
- array([[1, 2],
- [2, 4]])
- np.linalg.matrix_rank(M, tol=None)
- # The rank of M is 1 (the second row is just twice the first)
- 1
- # Load the handwritten-digit image data. For image tasks, besides the many heuristic feature-extraction schemes,
- # the most direct and common representation is the raw pixels, where each pixel is a number reflecting its intensity.
- from sklearn.datasets import load_digits
- digits = load_digits()
- # These classic datasets share a uniform storage format: a good habit that unifies the interface and makes them quick to use.
- digits
- {'DESCR': " Optical Recognition of Handwritten Digits Data Set\n\nNotes\n-----\nData Set Characteristics:\n :Number of Instances: 5620\n :Number of Attributes: 64\n :Attribute Information: 8x8 image of integer pixels in the range 0..16.\n :Missing Attribute Values: None\n :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)\n :Date: July; 1998\n\nThis is a copy of the test set of the UCI ML hand-written digits datasets\nhttp://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits\n\nThe data set contains images of hand-written digits: 10 classes where\neach class refers to a digit.\n\nPreprocessing programs made available by NIST were used to extract\nnormalized bitmaps of handwritten digits from a preprinted form. From a\ntotal of 43 people, 30 contributed to the training set and different 13\nto the test set. 32x32 bitmaps are divided into nonoverlapping blocks of\n4x4 and the number of on pixels are counted in each block. This generates\nan input matrix of 8x8 where each element is an integer in the range\n0..16. This reduces dimensionality and gives invariance to small\ndistortions.\n\nFor info on NIST preprocessing routines, see M. D. Garris, J. L. Blue, G.\nT. Candela, D. L. Dimmick, J. Geist, P. J. Grother, S. A. Janet, and C.\nL. Wilson, NIST Form-Based Handprint Recognition System, NISTIR 5469,\n1994.\n\nReferences\n----------\n - C. Kaynak (1995) Methods of Combining Multiple Classifiers and Their\n Applications to Handwritten Digit Recognition, MSc Thesis, Institute of\n Graduate Studies in Science and Engineering, Bogazici University.\n - E. Alpaydin, C. Kaynak (1998) Cascading Classifiers, Kybernetika.\n - Ken Tang and Ponnuthurai N. Suganthan and Xi Yao and A. Kai Qin.\n Linear dimensionalityreduction using relevance weighted LDA. School of\n Electrical and Electronic Engineering Nanyang Technological University.\n 2005.\n - Claudio Gentile. A New Approximate Maximal Margin Classification\n Algorithm. NIPS. 2000.\n",
- 'data': array([[ 0., 0., 5., ..., 0., 0., 0.],
- [ 0., 0., 0., ..., 10., 0., 0.],
- [ 0., 0., 0., ..., 16., 9., 0.],
- ...,
- [ 0., 0., 1., ..., 6., 0., 0.],
- [ 0., 0., 2., ..., 12., 0., 0.],
- [ 0., 0., 10., ..., 12., 1., 0.]]),
- 'images': array([[[ 0., 0., 5., ..., 1., 0., 0.],
- [ 0., 0., 13., ..., 15., 5., 0.],
- [ 0., 3., 15., ..., 11., 8., 0.],
- ...,
- [ 0., 4., 11., ..., 12., 7., 0.],
- [ 0., 2., 14., ..., 12., 0., 0.],
- [ 0., 0., 6., ..., 0., 0., 0.]],
- [[ 0., 0., 0., ..., 5., 0., 0.],
- [ 0., 0., 0., ..., 9., 0., 0.],
- [ 0., 0., 3., ..., 6., 0., 0.],
- ...,
- [ 0., 0., 1., ..., 6., 0., 0.],
- [ 0., 0., 1., ..., 6., 0., 0.],
- [ 0., 0., 0., ..., 10., 0., 0.]],
- [[ 0., 0., 0., ..., 12., 0., 0.],
- [ 0., 0., 3., ..., 14., 0., 0.],
- [ 0., 0., 8., ..., 16., 0., 0.],
- ...,
- [ 0., 9., 16., ..., 0., 0., 0.],
- [ 0., 3., 13., ..., 11., 5., 0.],
- [ 0., 0., 0., ..., 16., 9., 0.]],
- ...,
- [[ 0., 0., 1., ..., 1., 0., 0.],
- [ 0., 0., 13., ..., 2., 1., 0.],
- [ 0., 0., 16., ..., 16., 5., 0.],
- ...,
- [ 0., 0., 16., ..., 15., 0., 0.],
- [ 0., 0., 15., ..., 16., 0., 0.],
- [ 0., 0., 2., ..., 6., 0., 0.]],
- [[ 0., 0., 2., ..., 0., 0., 0.],
- [ 0., 0., 14., ..., 15., 1., 0.],
- [ 0., 4., 16., ..., 16., 7., 0.],
- ...,
- [ 0., 0., 0., ..., 16., 2., 0.],
- [ 0., 0., 4., ..., 16., 2., 0.],
- [ 0., 0., 5., ..., 12., 0., 0.]],
- [[ 0., 0., 10., ..., 1., 0., 0.],
- [ 0., 2., 16., ..., 1., 0., 0.],
- [ 0., 0., 15., ..., 15., 0., 0.],
- ...,
- [ 0., 4., 16., ..., 16., 6., 0.],
- [ 0., 8., 16., ..., 16., 8., 0.],
- [ 0., 1., 8., ..., 12., 1., 0.]]]),
- 'target': array([0, 1, 2, ..., 8, 9, 8]),
- 'target_names': array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])}
- # The usual routine
- X_digits, y_digits = digits.data, digits.target
- from sklearn.decomposition import PCA
- from matplotlib import pyplot as plt
- # The key parameter is n_components = 2 principal components
- estimator = PCA(n_components=2)
- X_pca = estimator.fit_transform(X_digits)
- # scikit-learn's interfaces are designed very consistently.
- # Clustering problems often call for a visual look at the data, which is one direct purpose of dimensionality reduction; so let's plot the projected points to build intuition.
- def plot_pca_scatter():
- colors = ['black', 'blue', 'purple', 'yellow', 'white', 'red', 'lime', 'cyan', 'orange', 'gray']
- for i in xrange(len(colors)):
- px = X_pca[:, 0][y_digits == i]
- py = X_pca[:, 1][y_digits == i]
- plt.scatter(px, py, c=colors[i])
- plt.legend(digits.target_names)
- plt.xlabel('First Principal Component')
- plt.ylabel('Second Principal Component')
- plt.show()
- plot_pca_scatter()
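- # How much of the 64-dimensional variance do the two components keep? The fitted PCA object reports this
- # directly (the exact numbers may vary slightly with the scikit-learn version):
- print(estimator.explained_variance_ratio_)
- print(estimator.explained_variance_ratio_.sum())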

- # This part reproduces the effect of Chapter 4 of the original book, but making full use of pandas allows a more concise expression, so I rewrote it in clearer, more compact code.
- import pandas as pd
- import numpy as np
- titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
- print titanic.info()
- # The same dataset again
- titanic.head()
- <class 'pandas.core.frame.DataFrame'>
- Int64Index: 1313 entries, 0 to 1312
- Data columns (total 11 columns):
- row.names 1313 non-null int64
- pclass 1313 non-null object
- survived 1313 non-null int64
- name 1313 non-null object
- age 633 non-null float64
- embarked 821 non-null object
- home.dest 754 non-null object
- room 77 non-null object
- ticket 69 non-null object
- boat 347 non-null object
- sex 1313 non-null object
- dtypes: float64(1), int64(2), object(8)
- memory usage: 123.1+ KB
- None

- # Drop columns that are too specific to individual passengers to generalize from (row.names, name), and separate out the prediction target.
- y = titanic['survived']
- X = titanic.drop(['row.names', 'name', 'survived'], axis = 1)
- # For the continuous numeric feature, fill in the missing values
- X['age'].fillna(X['age'].mean(), inplace=True)
- X.fillna('UNKNOWN', inplace=True)
- # The remaining categorical columns are vectorized directly; this way a blank value in a column also becomes its own feature
- from sklearn.cross_validation import train_test_split
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
- from sklearn.feature_extraction import DictVectorizer
- vec = DictVectorizer()
- X_train = vec.fit_transform(X_train.to_dict(orient='record'))
- X_test = vec.transform(X_test.to_dict(orient='record'))
- print len(vec.feature_names_)
- 474
- X_train.toarray()
- array([[ 31.19418104, 0. , 0. , ..., 0. ,
- 0. , 1. ],
- [ 31.19418104, 0. , 0. , ..., 0. ,
- 0. , 0. ],
- [ 31.19418104, 0. , 0. , ..., 0. ,
- 0. , 1. ],
- ...,
- [ 12. , 0. , 0. , ..., 0. ,
- 0. , 1. ],
- [ 18. , 0. , 0. , ..., 0. ,
- 0. , 1. ],
- [ 31.19418104, 0. , 0. , ..., 0. ,
- 0. , 1. ]])
- from sklearn.tree import DecisionTreeClassifier
- dt = DecisionTreeClassifier(criterion='entropy')
- dt.fit(X_train, y_train)
- dt.score(X_test, y_test)
- # Test accuracy using all features
- 0.81762917933130697
- from sklearn import feature_selection
- fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=20)
- X_train_fs = fs.fit_transform(X_train, y_train)
- dt.fit(X_train_fs, y_train)
- X_test_fs = fs.transform(X_test)
- dt.score(X_test_fs, y_test)
- # Test accuracy using the 20% most predictive features
- 0.82370820668693012
- from sklearn.cross_validation import cross_val_score
- percentiles = range(1, 100, 2)
- results = []
- for i in percentiles:
- fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile = i)
- X_train_fs = fs.fit_transform(X_train, y_train)
- scores = cross_val_score(dt, X_train_fs, y_train, cv=5)
- results = np.append(results, scores.mean())
- print results
- opt = np.where(results == results.max())[0]
- print 'Optimal number of features %d' %percentiles[opt]
- import pylab as pl
- pl.plot(percentiles, results)
- pl.show()
- [ 0.85063904 0.85673057 0.87501546 0.88622964 0.86590394 0.87097506
- 0.87303649 0.86997526 0.87097506 0.87300557 0.86997526 0.86893424
- 0.87098536 0.86490414 0.86385281 0.86791383 0.86488353 0.86892393
- 0.86791383 0.86284271 0.86487322 0.86792414 0.86894455 0.87303649
- 0.86892393 0.86998557 0.86689342 0.86488353 0.86895485 0.86689342
- 0.87198516 0.8638322 0.86488353 0.87402597 0.87299526 0.87098536
- 0.86997526 0.86892393 0.86794475 0.86486291 0.87096475 0.86587302
- 0.86387343 0.86083282 0.86589363 0.8608019 0.86492476 0.85774067
- 0.8608122 0.85779221]
- Optimal number of features 7

- from sklearn import feature_selection
- fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=7)
- X_train_fs = fs.fit_transform(X_train, y_train)
- dt.fit(X_train_fs, y_train)
- X_test_fs = fs.transform(X_test)
- dt.score(X_test_fs, y_test)
- # Test accuracy with the best feature percentile found by the search
- 0.8571428571428571
- # As we can see, this technique is quite helpful for squeezing out extra accuracy in practice.
- from sklearn.datasets import fetch_20newsgroups
- import numpy as np
- news = fetch_20newsgroups(subset='all')
- # First, the single-core version of grid search
- from sklearn.cross_validation import train_test_split
- from sklearn.grid_search import GridSearchCV
- X_train, X_test, y_train, y_test = train_test_split(news.data[:3000], news.target[:3000], test_size=0.25, random_state=33)
- from sklearn.svm import SVC
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.pipeline import Pipeline
- clf = Pipeline([('vect', TfidfVectorizer(stop_words='english', analyzer='word')), ('svc', SVC())])
- # The two hyperparameters take 4 and 3 candidate values respectively; svc__gamma ranges over 10^-2, 10^-1, 10^0, 10^1
- # so there are 4 x 3 = 12 hyperparameter combinations, i.e. 12 differently configured models
- parameters = {'svc__gamma': np.logspace(-2, 1, 4), 'svc__C': np.logspace(-1, 1, 3)}
- # Each configuration is cross-validated 3 times, so 36 models are trained in total; from the log below, each fit takes about 5 seconds on a single thread.
- gs = GridSearchCV(clf, parameters, verbose=2, refit=True, cv=3)
- %time _=gs.fit(X_train, y_train)
- gs.best_params_, gs.best_score_
- print gs.score(X_test, y_test)
- Fitting 3 folds for each of 12 candidates, totalling 36 fits
- [CV] svc__gamma=0.01, svc__C=0.1 .....................................
- [CV] ............................ svc__gamma=0.01, svc__C=0.1 - 5.1s
- [CV] svc__gamma=0.01, svc__C=0.1 .....................................
- [CV] ............................ svc__gamma=0.01, svc__C=0.1 - 5.3s
- [CV] svc__gamma=0.01, svc__C=0.1 .....................................
- [CV] ............................ svc__gamma=0.01, svc__C=0.1 - 5.2s
- [CV] svc__gamma=0.1, svc__C=0.1 ......................................
- [CV] ............................. svc__gamma=0.1, svc__C=0.1 - 5.1s
- [CV] svc__gamma=0.1, svc__C=0.1 ......................................
- [CV] ............................. svc__gamma=0.1, svc__C=0.1 - 5.2s
- [CV] svc__gamma=0.1, svc__C=0.1 ......................................
- [CV] ............................. svc__gamma=0.1, svc__C=0.1 - 5.3s
- [CV] svc__gamma=1.0, svc__C=0.1 ......................................
- [CV] ............................. svc__gamma=1.0, svc__C=0.1 - 5.7s
- [CV] svc__gamma=1.0, svc__C=0.1 ......................................
- [CV] ............................. svc__gamma=1.0, svc__C=0.1 - 5.8s
- [CV] svc__gamma=1.0, svc__C=0.1 ......................................
- [CV] ............................. svc__gamma=1.0, svc__C=0.1 - 5.9s
- [CV] svc__gamma=10.0, svc__C=0.1 .....................................
- [CV] ............................ svc__gamma=10.0, svc__C=0.1 - 5.4s
- [CV] svc__gamma=10.0, svc__C=0.1 .....................................
- [CV] ............................ svc__gamma=10.0, svc__C=0.1 - 5.5s
- [CV] svc__gamma=10.0, svc__C=0.1 .....................................
- [CV] ............................ svc__gamma=10.0, svc__C=0.1 - 5.5s
- [CV] svc__gamma=0.01, svc__C=1.0 .....................................
- [CV] ............................ svc__gamma=0.01, svc__C=1.0 - 5.2s
- [CV] svc__gamma=0.01, svc__C=1.0 .....................................
- [CV] ............................ svc__gamma=0.01, svc__C=1.0 - 5.3s
- [CV] svc__gamma=0.01, svc__C=1.0 .....................................
- [CV] ............................ svc__gamma=0.01, svc__C=1.0 - 5.3s
- [CV] svc__gamma=0.1, svc__C=1.0 ......................................
- [CV] ............................. svc__gamma=0.1, svc__C=1.0 - 5.2s
- [CV] svc__gamma=0.1, svc__C=1.0 ......................................
- [CV] ............................. svc__gamma=0.1, svc__C=1.0 - 5.3s
- [CV] svc__gamma=0.1, svc__C=1.0 ......................................
- [CV] ............................. svc__gamma=0.1, svc__C=1.0 - 5.4s
- [CV] svc__gamma=1.0, svc__C=1.0 ......................................
- [CV] ............................. svc__gamma=1.0, svc__C=1.0 - 5.3s
- [CV] svc__gamma=1.0, svc__C=1.0 ......................................
- [CV] ............................. svc__gamma=1.0, svc__C=1.0 - 5.4s
- [CV] svc__gamma=1.0, svc__C=1.0 ......................................
- [CV] ............................. svc__gamma=1.0, svc__C=1.0 - 5.5s
- [CV] svc__gamma=10.0, svc__C=1.0 .....................................
- [CV] ............................ svc__gamma=10.0, svc__C=1.0 - 5.4s
- [CV] svc__gamma=10.0, svc__C=1.0 .....................................
- [CV] ............................ svc__gamma=10.0, svc__C=1.0 - 5.3s
- [CV] svc__gamma=10.0, svc__C=1.0 .....................................
- [CV] ............................ svc__gamma=10.0, svc__C=1.0 - 5.4s
- [CV] svc__gamma=0.01, svc__C=10.0 ....................................
- [CV] ........................... svc__gamma=0.01, svc__C=10.0 - 5.2s
- [CV] svc__gamma=0.01, svc__C=10.0 ....................................
- [CV] ........................... svc__gamma=0.01, svc__C=10.0 - 5.2s
- [CV] svc__gamma=0.01, svc__C=10.0 ....................................
- [CV] ........................... svc__gamma=0.01, svc__C=10.0 - 5.3s
- [CV] svc__gamma=0.1, svc__C=10.0 .....................................
- [CV] ............................ svc__gamma=0.1, svc__C=10.0 - 5.3s
- [CV] svc__gamma=0.1, svc__C=10.0 .....................................
- [CV] ............................ svc__gamma=0.1, svc__C=10.0 - 5.4s
- [CV] svc__gamma=0.1, svc__C=10.0 .....................................
- [CV] ............................ svc__gamma=0.1, svc__C=10.0 - 5.4s
- [CV] svc__gamma=1.0, svc__C=10.0 .....................................
- [CV] ............................ svc__gamma=1.0, svc__C=10.0 - 5.3s
- [CV] svc__gamma=1.0, svc__C=10.0 .....................................
- [CV] ............................ svc__gamma=1.0, svc__C=10.0 - 5.5s
- [CV] svc__gamma=1.0, svc__C=10.0 .....................................
- [CV] ............................ svc__gamma=1.0, svc__C=10.0 - 5.7s
- [CV] svc__gamma=10.0, svc__C=10.0 ....................................
- [CV] ........................... svc__gamma=10.0, svc__C=10.0 - 5.6s
- [CV] svc__gamma=10.0, svc__C=10.0 ....................................
- [CV] ........................... svc__gamma=10.0, svc__C=10.0 - 5.6s
- [CV] svc__gamma=10.0, svc__C=10.0 ....................................
- [CV] ........................... svc__gamma=10.0, svc__C=10.0 - 5.9s
- [Parallel(n_jobs=1)]: Done 1 jobs | elapsed: 5.1s
- [Parallel(n_jobs=1)]: Done 36 out of 36 | elapsed: 3.3min finished
- Wall time: 3min 27s
- 0.822666666667
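- # Version note: sklearn.grid_search and sklearn.cross_validation were later merged into sklearn.model_selection;
- # on a newer installation the equivalent imports and search would look like this (a sketch reusing the same
- # pipeline and parameter grid as above):
- from sklearn.model_selection import train_test_split, GridSearchCV
- gs = GridSearchCV(clf, parameters, verbose=2, refit=True, cv=3)
- %time _=gs.fit(X_train, y_train)
- print(gs.score(X_test, y_test))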
- # Now run the search in parallel across cores and see how much the wall-clock time improves
- from sklearn.cross_validation import train_test_split
- from sklearn.grid_search import GridSearchCV
- X_train, X_test, y_train, y_test = train_test_split(news.data[:3000], news.target[:3000], test_size=0.25, random_state=33)
- from sklearn.svm import SVC
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.pipeline import Pipeline
- clf = Pipeline([('vect', TfidfVectorizer(stop_words='english', analyzer='word')), ('svc', SVC())])
- parameters = {'svc__gamma': np.logspace(-2, 1, 4), 'svc__C': np.logspace(-1, 1, 3)}
- gs = GridSearchCV(clf, parameters, verbose=2, refit=True, cv=3, n_jobs=-1)
- %time _=gs.fit(X_train, y_train)
- gs.best_params_, gs.best_score_
- print gs.score(X_test, y_test)
- # The parallel search finds the same optimal configuration, but the training time drops roughly in proportion to the number of CPU cores.
- [Parallel(n_jobs=-1)]: Done 1 jobs | elapsed: 8.4s
- [Parallel(n_jobs=-1)]: Done 22 out of 36 | elapsed: 30.3s remaining: 19.2s
- [Parallel(n_jobs=-1)]: Done 36 out of 36 | elapsed: 46.8s finished
- Fitting 3 folds for each of 12 candidates, totalling 36 fits
- Wall time: 56.5 s
- 0.822666666667
- # For completeness, here is the configuration of the machine that produced these timings, to give readers a better sense of the parallel speed-up.
- '''
- CPU: i7, 4 cores, 2.4 GHz
- Memory: DDR3 1600 32GB
- '''