My recent work has involved sklearn, mostly its classification side, so here are a few notes.

Classification work roughly involves three topics:

I. Classifiers

II. Feature selection

III. Model selection

I. Classifiers

Example 1: plot_classifier_comparison.py

# Code source: Gaël Varoquaux
#              Andreas Müller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

h = .02  # step size in the mesh

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
         "Random Forest", "AdaBoost", "Naive Bayes",
         "Linear Discriminant Analysis", "Quadratic Discriminant Analysis"]
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis()]

X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [make_moons(noise=0.3, random_state=0),
            make_circles(noise=0.2, factor=0.5, random_state=1),
            linearly_separable
            ]

figure = plt.figure(figsize=(27, 9))
i = 1
# iterate over datasets
for ds in datasets:
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # just plot the dataset first
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
    # and testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Plot also the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        # and testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   alpha=0.6)
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title(name)
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1

figure.subplots_adjust(left=.02, right=.98)
plt.show()
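
A quick aside before moving on: the reason one loop can train nine different models is that every scikit-learn classifier exposes the same estimator API — fit, predict, and score. Below is a minimal standalone sketch of that shared interface (a toy of my own, not part of the script above; any of the nine classifiers could be swapped in for KNeighborsClassifier):

from sklearn.cross_validation import train_test_split
from sklearn.datasets import make_moons
from sklearn.neighbors import KNeighborsClassifier

# toy two-class dataset, same generator the comparison script uses
X, y = make_moons(noise=0.3, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)

clf = KNeighborsClassifier(3)
clf.fit(X_train, y_train)           # learn from the training split
print(clf.predict(X_test[:5]))      # predicted labels for new points
print(clf.score(X_test, y_test))    # mean accuracy on the held-out split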

II. Feature Selection

This mainly involves one module:
    >>> sklearn.feature_selection

Example 1: feature_selection_pipeline.py

from sklearn.datasets import samples_generator
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn import svm
from sklearn.pipeline import make_pipeline

# generate data
X, y = samples_generator.make_classification(
    n_features=20, n_informative=3, n_redundant=0, n_classes=4,
    n_clusters_per_class=2)

# two steps:
# 1) ANOVA filtering, keep the 3 best features
anova_filter = SelectKBest(f_regression, k=3)
# 2) SVM classification
clf = svm.SVC(kernel='linear')

# chain the two into a single classifier
anova_svm = make_pipeline(anova_filter, clf)
anova_svm.fit(X, y)
anova_svm.predict(X)
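
After fitting, the pipeline can be inspected step by step. A small sketch, assuming anova_svm was fit as above (make_pipeline names each step after its lowercased class name, so the selector should be reachable as 'selectkbest'):

# get_support() returns a boolean mask over the 20 input features,
# True for the 3 that the ANOVA filter kept.
mask = anova_svm.named_steps['selectkbest'].get_support()
print("selected feature indices:", mask.nonzero()[0])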

Example 2: plot_rbm_logistic_classification.py

# Authors: Yann N. Dauphin, Vlad Niculae, Gabriel Synnaeve
# License: BSD

import numpy as np
import matplotlib.pyplot as plt

from scipy.ndimage import convolve
from sklearn import linear_model, datasets, metrics
from sklearn.cross_validation import train_test_split
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline


###############################################################################
# Setting up

def nudge_dataset(X, Y):
    """
    This produces a dataset 5 times bigger than the original one,
    by moving the 8x8 images in X around by 1px to left, right, down, up
    """
    direction_vectors = [
        [[0, 1, 0],
         [0, 0, 0],
         [0, 0, 0]],

        [[0, 0, 0],
         [1, 0, 0],
         [0, 0, 0]],

        [[0, 0, 0],
         [0, 0, 1],
         [0, 0, 0]],

        [[0, 0, 0],
         [0, 0, 0],
         [0, 1, 0]]]

    shift = lambda x, w: convolve(x.reshape((8, 8)), mode='constant',
                                  weights=w).ravel()
    X = np.concatenate([X] +
                       [np.apply_along_axis(shift, 1, X, vector)
                        for vector in direction_vectors])
    Y = np.concatenate([Y for _ in range(5)], axis=0)
    return X, Y

# Load Data
digits = datasets.load_digits()
X = np.asarray(digits.data, 'float32')
X, Y = nudge_dataset(X, digits.target)
X = (X - np.min(X, 0)) / (np.max(X, 0) + 0.0001)  # 0-1 scaling

X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=0.2,
                                                    random_state=0)

# Models we will use
logistic = linear_model.LogisticRegression()
rbm = BernoulliRBM(random_state=0, verbose=True)

classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

###############################################################################
# Training

# Hyper-parameters. These were set by cross-validation,
# using a GridSearchCV. Here we are not performing cross-validation to
# save time.
rbm.learning_rate = 0.06
rbm.n_iter = 20
# More components tend to give better prediction performance, but larger
# fitting time
rbm.n_components = 100
logistic.C = 6000.0

# Training RBM-Logistic Pipeline
classifier.fit(X_train, Y_train)

# Training Logistic regression
logistic_classifier = linear_model.LogisticRegression(C=100.0)
logistic_classifier.fit(X_train, Y_train)

###############################################################################
# Evaluation

print()
print("Logistic regression using RBM features:\n%s\n" % (
    metrics.classification_report(
        Y_test,
        classifier.predict(X_test))))

print("Logistic regression using raw pixel features:\n%s\n" % (
    metrics.classification_report(
        Y_test,
        logistic_classifier.predict(X_test))))

###############################################################################
# Plotting

plt.figure(figsize=(4.2, 4))
for i, comp in enumerate(rbm.components_):
    plt.subplot(10, 10, i + 1)
    plt.imshow(comp.reshape((8, 8)), cmap=plt.cm.gray_r,
               interpolation='nearest')
    plt.xticks(())
    plt.yticks(())
plt.suptitle('100 components extracted by RBM', fontsize=16)
plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)

plt.show()

III. Model Selection

This mainly involves two modules:

>>> sklearn.grid_search

>>> sklearn.cross_validation
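
The quickest entry point in sklearn.cross_validation is cross_val_score, which runs k-fold cross-validation in a single call. A minimal sketch of my own against the same (since-deprecated) module used throughout these notes:

from sklearn.cross_validation import cross_val_score
from sklearn.datasets import load_digits
from sklearn.svm import SVC

# 5-fold cross-validated accuracy for a linear SVM on the digits data
digits = load_digits()
scores = cross_val_score(SVC(kernel='linear', C=1), digits.data,
                         digits.target, cv=5)
print(scores.mean(), scores.std())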

Example 1: randomized_search.py

import numpy as np

from time import time
from operator import itemgetter
from scipy.stats import randint as sp_randint

from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

# get some data
digits = load_digits()
X, y = digits.data, digits.target

# build a classifier
clf = RandomForestClassifier(n_estimators=20)


# Utility function to report best scores
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            score.mean_validation_score,
            np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")


# specify parameters and distributions to sample from
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)

start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)

# use a full grid over all parameters
param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [1, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(X, y)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.grid_scores_)))
report(grid_search.grid_scores_)
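
The report() helper above walks the full grid_scores_ list; if only the winning configuration is needed, the fitted search object also exposes it directly. A short sketch, assuming random_search was fit as above:

# best mean CV score and the parameter setting that achieved it
print("Best CV score: %.3f" % random_search.best_score_)
print("Best parameters: %s" % random_search.best_params_)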

Example 2: grid_search_text_feature_extraction.py

from pprint import pprint
from time import time
import logging

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline

print(__doc__)

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')


###############################################################################
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
]
# Uncomment the following to do the analysis on all the categories
#categories = None

print("Loading 20 newsgroups dataset for categories:")
print(categories)

data = fetch_20newsgroups(subset='train', categories=categories)
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names))
print()

###############################################################################
# define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    #'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    #'clf__n_iter': (10, 50, 80),
}

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(data.data, data.target)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
