from pyspark import SparkContext, SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator # Load the data stored in LIBSVM format as a DataFrame.
data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") # Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data) # Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3]) # Train a DecisionTree model.
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures") # Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt]) # Train model. This also runs the indexers.
model = pipeline.fit(trainingData) # Make predictions.
predictions = model.transform(testData) # Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").show(5) # Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy)) treeModel = model.stages[2]
# summary only
print(treeModel) ############################# from pyspark.ml.tuning import ParamGridBuilder, CrossValidator # Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
.addGrid(lr.regParam, [0.01, 0.5, 2.0])
.addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
.addGrid(lr.maxIter, [1, 5, 10])
.build())
Copy to clipboardCopy
# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5) # Run cross validations
cvModel = cv.fit(trainingData)
# this will likely take a fair amount of time because of the amount of models that we're creating and testing # Use test set here so we can measure the accuracy of our model on new data
predictions = cvModel.transform(testData) # cvModel uses the best model found from the Cross Validation
# Evaluate best model
evaluator.evaluate(predictions) #We can also access the model’s feature weights and intercepts easily print 'Model Intercept: ', cvModel.bestModel.intercept
ML provides CrossValidator class which can be used to perform cross-validation and parameter search. Assuming your data is already preprocessed you can add cross-validation as follows:

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator // [label: double, features: vector]
trainingData org.apache.spark.sql.DataFrame = ???
val nFolds: Int = ???
val NumTrees: Int = ???
val metric: String = ??? val rf = new RandomForestClassifier()
.setLabelCol("label")
.setFeaturesCol("features")
.setNumTrees(NumTrees) val pipeline = new Pipeline().setStages(Array(rf)) val paramGrid = new ParamGridBuilder().build() // No parameter search val evaluator = new MulticlassClassificationEvaluator()
.setLabelCol("label")
.setPredictionCol("prediction")
// "f1" (default), "weightedPrecision", "weightedRecall", "accuracy"
.setMetricName(metric) val cv = new CrossValidator()
// ml.Pipeline with ml.classification.RandomForestClassifier
.setEstimator(pipeline)
// ml.evaluation.MulticlassClassificationEvaluator
.setEvaluator(evaluator)
.setEstimatorParamMaps(paramGrid)
.setNumFolds(nFolds) val model = cv.fit(trainingData) // trainingData: DataFrame
Using PySpark: from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator trainingData = ... # DataFrame[label: double, features: vector]
numFolds = ... # Integer rf = RandomForestClassifier(labelCol="label", featuresCol="features")
evaluator = MulticlassClassificationEvaluator() # + other params as in Scala pipeline = Pipeline(stages=[rf]) crossval = CrossValidator(
estimator=pipeline,
estimatorParamMaps=paramGrid,
evaluator=evaluator,
numFolds=numFolds) model = crossval.fit(trainingData)

spark 针对决策树进行交叉验证的更多相关文章

  1. Spark机器学习——模型选择与参数调优之交叉验证

    spark 模型选择与超参调优 机器学习可以简单的归纳为 通过数据训练y = f(x) 的过程,因此定义完训练模型之后,就需要考虑如何选择最终我们认为最优的模型. 如何选择最优的模型,就是本篇的主要内 ...

  2. 什么是机器学习的分类算法?【K-近邻算法(KNN)、交叉验证、朴素贝叶斯算法、决策树、随机森林】

    1.K-近邻算法(KNN) 1.1 定义 (KNN,K-NearestNeighbor) 如果一个样本在特征空间中的k个最相似(即特征空间中最邻近)的样本中的大多数属于某一个类别,则该样本也属于这个类 ...

  3. Spark2.0机器学习系列之2:基于Pipeline、交叉验证、ParamMap的模型选择和超参数调优

    Spark中的CrossValidation Spark中采用是k折交叉验证 (k-fold cross validation).举个例子,例如10折交叉验证(10-fold cross valida ...

  4. 几种交叉验证(cross validation)方式的比较

    模型评价的目的:通过模型评价,我们知道当前训练模型的好坏,泛化能力如何?从而知道是否可以应用在解决问题上,如果不行,那又是哪里出了问题? train_test_split 在分类问题中,我们通常通过对 ...

  5. sklearn 中的交叉验证

    sklearn中的交叉验证(Cross-Validation) sklearn是利用python进行机器学习中一个非常全面和好用的第三方库,用过的都说好.今天主要记录一下sklearn中关于交叉验证的 ...

  6. sklearn中的交叉验证(Cross-Validation)

    这个repo 用来记录一些python技巧.书籍.学习链接等,欢迎stargithub地址sklearn是利用python进行机器学习中一个非常全面和好用的第三方库,用过的都说好.今天主要记录一下sk ...

  7. 【scikit-learn】交叉验证及其用于參数选择、模型选择、特征选择的样例

     内容概要¶ 训练集/測试集切割用于模型验证的缺点 K折交叉验证是怎样克服之前的不足 交叉验证怎样用于选择调节參数.选择模型.选择特征 改善交叉验证 1. 模型验证回想¶ 进行模型验证的一个重要目 ...

  8. 十折交叉验证10-fold cross validation, 数据集划分 训练集 验证集 测试集

    机器学习 数据挖掘 数据集划分 训练集 验证集 测试集 Q:如何将数据集划分为测试数据集和训练数据集? A:three ways: 1.像sklearn一样,提供一个将数据集切分成训练集和测试集的函数 ...

  9. 机器学习- Sklearn (交叉验证和Pipeline)

    前面一节咱们已经介绍了决策树的原理已经在sklearn中的应用.那么这里还有两个数据处理和sklearn应用中的小知识点咱们还没有讲,但是在实践中却会经常要用到的,那就是交叉验证cross_valid ...

随机推荐

  1. 不用@Value从Spring的ApplicationContext中获取一个或全部配置

    获取一个配置: applicationContext.getEnvironment().resolvePlaceholders("${propertyKey}"); // 方法1 ...

  2. Maximum Bipartite Matching

    算法旨在用尽可能简单的思路解决这个问题.理解算法也应该是一个越看越简单的过程,当你看到算法里的一串概念,或者一大坨代码,第一感觉是复杂,此时最好还是从样例入手.通过一个简单的样例,并编程实现,这个过程 ...

  3. Binary operations #1

    https://www.codewars.com/kata/binary-operations-number-1/train/csharp Your work is to write a method ...

  4. [JZOJ 5875] [NOIP2018提高组模拟9.20] 听我说,海蜗牛 解题报告(BFS+二分)

    题目链接: http://172.16.0.132/senior/#main/show/5875 题目: 题解: 注意这题只能经过开放的港口 我们考虑用vector存下每个点不能到的点,并把并让vec ...

  5. 解决django.db.utils.InternalError: (1049, "Unknown database 'exam_db'")

    先检查seeting数据库配置DATABASES = { 'default': { 'ENGINE': 'django.db.backends.mysql', 'NAME': 'eaxm_db', ' ...

  6. CSS3的常用属性(二)

    边框 边框圆角 border-radius: 100px 每个角可以设置两个值,x和y 补充: 可分别设置长,短半径,以“/”进行分隔,遵循顺时针的顺序,“/”之前为横轴半径,“/”之后为纵轴半径,如 ...

  7. 关于github里readme编辑的方法

    实验室的老师昨天改完论文发我后,说按照例子改.于是才发现github里readme编辑满满的极客思维. 看了一下csdn给的教程 https://blog.csdn.net/Kaitiren/arti ...

  8. 学习es6 setter/getter研究

    1.背景 在ES6中,我们对类的定义如下 class Person { // 构造函数 constructor (name) { // 属性初始化 this.name = name; } // 成员方 ...

  9. 织梦DEDECMS系统中文章内容为空 用SQL语句如何删除?

    织梦后台里提供了清空内容为空的文章,可是发现并不好用,有些空文章还是删除不了,而有些文章不是空的,只是采到了几个字,这些无法清除,于是就手动来清除这个文章.开始是一个一个文章找,一个一个来删除,后来觉 ...

  10. 2013-11-02 【webrebuild广州站】分享会纪要

    为了不让自己沉浸个人的技术研究当中,也为了多去接触业界新技术新思想,今天去参加了webrebuild广州站的一个分享交流会,效果不错,有一些获益.听了四个主题,依据个人获取信息的情况来做个纪要(比较粗 ...