Spark 决策树--分类模型
package Spark_MLlib import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, DecisionTreeClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.sql.SparkSession /**
* Created by soyo on 17-11-5.
*/
// One input record: a feature vector paired with its class label.
case class data_schemas(
  features: Vector, // feature values of the sample
  label: String     // class label, kept as a string
)
/**
 * Trains and evaluates a decision-tree classifier using the Spark ML pipeline API.
 *
 * The input is a comma-separated text file in which every row holds four numeric
 * feature values followed by a string class label. The program prints the loaded
 * data, the predictions on a held-out split, the accuracy / error rate, and the
 * structure of the learned tree.
 */
object 决策树 {
  val spark=SparkSession.builder().master("local").appName("决策树").getOrCreate()
  import spark.implicits._

  /**
   * Entry point: loads the data, fits the pipeline, evaluates it and prints the results.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    try {
      // Parse each line into (features, label): columns 0-3 are the features, column 4 the label.
      val source_DF = spark.sparkContext
        .textFile("file:///home/soyo/桌面/spark编程测试数据/soyo2.txt")
        .map(_.split(","))
        .map(x => data_schemas(
          Vectors.dense(x(0).toDouble, x(1).toDouble, x(2).toDouble, x(3).toDouble),
          x(4)))
        .toDF()
      source_DF.createOrReplaceTempView("decisonTree")
      val DF = spark.sql("select * from decisonTree")
      DF.show()

      // Index the string label into numeric form so the learning algorithm can consume it.
      val labelIndexer = new StringIndexer()
        .setInputCol("label")
        .setOutputCol("indexedLabel")
        .fit(DF)
      // Features with more than maxCategories distinct values are treated as continuous.
      // NOTE(review): 4 is the value used in the Spark documentation example; the literal
      // was missing in the original listing - confirm it suits the data set.
      val featureIndexer = new VectorIndexer()
        .setInputCol("features")
        .setOutputCol("indexedFeatures")
        .setMaxCategories(4)
        .fit(DF)
      // Maps the numeric prediction back to the original string label.
      val labelConverter = new IndexToString()
        .setInputCol("prediction")
        .setOutputCol("predictedLabel")
        .setLabels(labelIndexer.labels)

      // Training data and test data (70% / 30%).
      val Array(trainData, testData) = DF.randomSplit(Array(0.7, 0.3))

      val decisionTreeClassifier = new DecisionTreeClassifier()
        .setLabelCol("indexedLabel")
        .setFeaturesCol("indexedFeatures")

      // Build the machine-learning workflow; the classifier is stage index 2.
      val dt_pipeline = new Pipeline()
        .setStages(Array(labelIndexer, featureIndexer, decisionTreeClassifier, labelConverter))
      val dt_model = dt_pipeline.fit(trainData)

      // Predict on the held-out data.
      val dtprediction = dt_model.transform(testData)
      dtprediction.show()

      // Evaluate the decision-tree model.
      val evaluatorClassifier = new MulticlassClassificationEvaluator()
        .setLabelCol("indexedLabel")
        .setPredictionCol("prediction")
        .setMetricName("accuracy")
      val accuracy = evaluatorClassifier.evaluate(dtprediction)
      println(s"准确率为: $accuracy")
      // The error rate is the complement of the accuracy.
      val error = 1 - accuracy
      println(s"错误率为: $error")

      // Stage 2 of the fitted pipeline is the trained tree (see setStages above).
      val treeModelClassifier = dt_model.stages(2).asInstanceOf[DecisionTreeClassificationModel]
      val schema_DecisionTree = treeModelClassifier.toDebugString
      println(s"决策树的模型结构为: $schema_DecisionTree")
    } finally {
      // Release the local Spark context even if the job fails.
      spark.stop()
    }
  }
}
结果为:
+-----------------+------+
| features| label|
+-----------------+------+
|[5.1,3.5,1.4,0.2]|hadoop|
|[4.9,3.0,1.4,0.2]|hadoop|
|[4.7,3.2,1.3,0.2]|hadoop|
|[4.6,3.1,1.5,0.2]|hadoop|
|[5.0,3.6,1.4,0.2]|hadoop|
|[5.4,3.9,1.7,0.4]|hadoop|
|[4.6,3.4,1.4,0.3]|hadoop|
|[5.0,3.4,1.5,0.2]|hadoop|
|[4.4,2.9,1.4,0.2]|hadoop|
|[4.9,3.1,1.5,0.1]|hadoop|
|[5.4,3.7,1.5,0.2]|hadoop|
|[4.8,3.4,1.6,0.2]|hadoop|
|[4.8,3.0,1.4,0.1]|hadoop|
|[4.3,3.0,1.1,0.1]|hadoop|
|[5.8,4.0,1.2,0.2]|hadoop|
|[5.7,4.4,1.5,0.4]|hadoop|
|[5.4,3.9,1.3,0.4]|hadoop|
|[5.1,3.5,1.4,0.3]|hadoop|
|[5.7,3.8,1.7,0.3]|hadoop|
|[5.1,3.8,1.5,0.3]|hadoop|
+-----------------+------+
only showing top 20 rows
+-----------------+------+------------+-----------------+--------------+-------------+----------+--------------+
| features| label|indexedLabel| indexedFeatures| rawPrediction| probability|prediction|predictedLabel|
+-----------------+------+------------+-----------------+--------------+-------------+----------+--------------+
|[4.4,3.0,1.3,0.2]|hadoop| 1.0|[4.4,3.0,1.3,0.2]|[0.0,36.0,0.0]|[0.0,1.0,0.0]| 1.0| hadoop|
|[4.6,3.4,1.4,0.3]|hadoop| 1.0|[4.6,3.4,1.4,0.3]|[0.0,36.0,0.0]|[0.0,1.0,0.0]| 1.0| hadoop|
|[4.6,3.6,1.0,0.2]|hadoop| 1.0|[4.6,3.6,1.0,0.2]|[0.0,36.0,0.0]|[0.0,1.0,0.0]| 1.0| hadoop|
|[4.9,2.4,3.3,1.0]| spark| 0.0|[4.9,2.4,3.3,1.0]| [0.0,0.0,1.0]|[0.0,0.0,1.0]| 2.0| Scala|
|[5.0,2.0,3.5,1.0]| spark| 0.0|[5.0,2.0,3.5,1.0]| [1.0,0.0,0.0]|[1.0,0.0,0.0]| 0.0| spark|
|[5.0,2.3,3.3,1.0]| spark| 0.0|[5.0,2.3,3.3,1.0]|[29.0,0.0,0.0]|[1.0,0.0,0.0]| 0.0| spark|
|[5.0,3.2,1.2,0.2]|hadoop| 1.0|[5.0,3.2,1.2,0.2]|[0.0,36.0,0.0]|[0.0,1.0,0.0]| 1.0| hadoop|
|[5.0,3.3,1.4,0.2]|hadoop| 1.0|[5.0,3.3,1.4,0.2]|[0.0,36.0,0.0]|[0.0,1.0,0.0]| 1.0| hadoop|
|[5.0,3.4,1.6,0.4]|hadoop| 1.0|[5.0,3.4,1.6,0.4]|[0.0,36.0,0.0]|[0.0,1.0,0.0]| 1.0| hadoop|
|[5.0,3.6,1.4,0.2]|hadoop| 1.0|[5.0,3.6,1.4,0.2]|[0.0,36.0,0.0]|[0.0,1.0,0.0]| 1.0| hadoop|
|[5.1,3.5,1.4,0.2]|hadoop| 1.0|[5.1,3.5,1.4,0.2]|[0.0,36.0,0.0]|[0.0,1.0,0.0]| 1.0| hadoop|
|[5.1,3.7,1.5,0.4]|hadoop| 1.0|[5.1,3.7,1.5,0.4]|[0.0,36.0,0.0]|[0.0,1.0,0.0]| 1.0| hadoop|
|[5.2,3.4,1.4,0.2]|hadoop| 1.0|[5.2,3.4,1.4,0.2]|[0.0,36.0,0.0]|[0.0,1.0,0.0]| 1.0| hadoop|
|[5.2,4.1,1.5,0.1]|hadoop| 1.0|[5.2,4.1,1.5,0.1]|[0.0,36.0,0.0]|[0.0,1.0,0.0]| 1.0| hadoop|
|[5.4,3.0,4.5,1.5]| spark| 0.0|[5.4,3.0,4.5,1.5]|[29.0,0.0,0.0]|[1.0,0.0,0.0]| 0.0| spark|
|[5.4,3.9,1.7,0.4]|hadoop| 1.0|[5.4,3.9,1.7,0.4]|[0.0,36.0,0.0]|[0.0,1.0,0.0]| 1.0| hadoop|
|[5.5,2.4,3.7,1.0]| spark| 0.0|[5.5,2.4,3.7,1.0]|[29.0,0.0,0.0]|[1.0,0.0,0.0]| 0.0| spark|
|[5.5,2.4,3.8,1.1]| spark| 0.0|[5.5,2.4,3.8,1.1]|[29.0,0.0,0.0]|[1.0,0.0,0.0]| 0.0| spark|
|[5.5,2.5,4.0,1.3]| spark| 0.0|[5.5,2.5,4.0,1.3]|[29.0,0.0,0.0]|[1.0,0.0,0.0]| 0.0| spark|
|[5.5,2.6,4.4,1.2]| spark| 0.0|[5.5,2.6,4.4,1.2]|[29.0,0.0,0.0]|[1.0,0.0,0.0]| 0.0| spark|
|[5.5,4.2,1.4,0.2]|hadoop| 1.0|[5.5,4.2,1.4,0.2]|[0.0,36.0,0.0]|[0.0,1.0,0.0]| 1.0| hadoop|
|[5.6,2.5,3.9,1.1]| spark| 0.0|[5.6,2.5,3.9,1.1]|[29.0,0.0,0.0]|[1.0,0.0,0.0]| 0.0| spark|
|[5.6,2.7,4.2,1.3]| spark| 0.0|[5.6,2.7,4.2,1.3]|[29.0,0.0,0.0]|[1.0,0.0,0.0]| 0.0| spark|
|[5.6,3.0,4.1,1.3]| spark| 0.0|[5.6,3.0,4.1,1.3]|[29.0,0.0,0.0]|[1.0,0.0,0.0]| 0.0| spark|
|[5.7,2.6,3.5,1.0]| spark| 0.0|[5.7,2.6,3.5,1.0]|[29.0,0.0,0.0]|[1.0,0.0,0.0]| 0.0| spark|
|[5.8,2.6,4.0,1.2]| spark| 0.0|[5.8,2.6,4.0,1.2]|[29.0,0.0,0.0]|[1.0,0.0,0.0]| 0.0| spark|
|[5.8,4.0,1.2,0.2]|hadoop| 1.0|[5.8,4.0,1.2,0.2]|[0.0,36.0,0.0]|[0.0,1.0,0.0]| 1.0| hadoop|
|[6.1,2.6,5.6,1.4]| Scala| 2.0|[6.1,2.6,5.6,1.4]|[29.0,0.0,0.0]|[1.0,0.0,0.0]| 0.0| spark|
|[6.2,2.2,4.5,1.5]| spark| 0.0|[6.2,2.2,4.5,1.5]| [0.0,0.0,1.0]|[0.0,0.0,1.0]| 2.0| Scala|
|[6.2,3.4,5.4,2.3]| Scala| 2.0|[6.2,3.4,5.4,2.3]|[0.0,0.0,31.0]|[0.0,0.0,1.0]| 2.0| Scala|
|[6.3,2.5,5.0,1.9]| Scala| 2.0|[6.3,2.5,5.0,1.9]|[0.0,0.0,31.0]|[0.0,0.0,1.0]| 2.0| Scala|
|[6.3,2.8,5.1,1.5]| Scala| 2.0|[6.3,2.8,5.1,1.5]|[29.0,0.0,0.0]|[1.0,0.0,0.0]| 0.0| spark|
|[6.4,2.8,5.6,2.1]| Scala| 2.0|[6.4,2.8,5.6,2.1]|[0.0,0.0,31.0]|[0.0,0.0,1.0]| 2.0| Scala|
|[6.4,2.8,5.6,2.2]| Scala| 2.0|[6.4,2.8,5.6,2.2]|[0.0,0.0,31.0]|[0.0,0.0,1.0]| 2.0| Scala|
|[6.4,3.2,4.5,1.5]| spark| 0.0|[6.4,3.2,4.5,1.5]|[29.0,0.0,0.0]|[1.0,0.0,0.0]| 0.0| spark|
|[6.4,3.2,5.3,2.3]| Scala| 2.0|[6.4,3.2,5.3,2.3]|[0.0,0.0,31.0]|[0.0,0.0,1.0]| 2.0| Scala|
|[6.5,2.8,4.6,1.5]| spark| 0.0|[6.5,2.8,4.6,1.5]|[29.0,0.0,0.0]|[1.0,0.0,0.0]| 0.0| spark|
|[6.6,2.9,4.6,1.3]| spark| 0.0|[6.6,2.9,4.6,1.3]|[29.0,0.0,0.0]|[1.0,0.0,0.0]| 0.0| spark|
|[6.6,3.0,4.4,1.4]| spark| 0.0|[6.6,3.0,4.4,1.4]|[29.0,0.0,0.0]|[1.0,0.0,0.0]| 0.0| spark|
|[6.8,3.2,5.9,2.3]| Scala| 2.0|[6.8,3.2,5.9,2.3]|[0.0,0.0,31.0]|[0.0,0.0,1.0]| 2.0| Scala|
|[6.9,3.1,4.9,1.5]| spark| 0.0|[6.9,3.1,4.9,1.5]|[29.0,0.0,0.0]|[1.0,0.0,0.0]| 0.0| spark|
|[6.9,3.2,5.7,2.3]| Scala| 2.0|[6.9,3.2,5.7,2.3]|[0.0,0.0,31.0]|[0.0,0.0,1.0]| 2.0| Scala|
|[7.2,3.0,5.8,1.6]| Scala| 2.0|[7.2,3.0,5.8,1.6]|[29.0,0.0,0.0]|[1.0,0.0,0.0]| 0.0| spark|
|[7.2,3.2,6.0,1.8]| Scala| 2.0|[7.2,3.2,6.0,1.8]|[0.0,0.0,31.0]|[0.0,0.0,1.0]| 2.0| Scala|
|[7.6,3.0,6.6,2.1]| Scala| 2.0|[7.6,3.0,6.6,2.1]|[0.0,0.0,31.0]|[0.0,0.0,1.0]| 2.0| Scala|
|[7.7,3.0,6.1,2.3]| Scala| 2.0|[7.7,3.0,6.1,2.3]|[0.0,0.0,31.0]|[0.0,0.0,1.0]| 2.0| Scala|
|[7.7,3.8,6.7,2.2]| Scala| 2.0|[7.7,3.8,6.7,2.2]|[0.0,0.0,31.0]|[0.0,0.0,1.0]| 2.0| Scala|
|[7.9,3.8,6.4,2.0]| Scala| 2.0|[7.9,3.8,6.4,2.0]|[0.0,0.0,31.0]|[0.0,0.0,1.0]| 2.0| Scala|
+-----------------+------+------------+-----------------+--------------+-------------+----------+--------------+
准确率为: 0.8958333333333334
错误率为: 0.10416666666666663
决策树的模型结构为: DecisionTreeClassificationModel (uid=dtc_218264842cd2) of depth 5 with 15 nodes
If (feature 2 <= 1.9)
Predict: 1.0
Else (feature 2 > 1.9)
If (feature 3 <= 1.7)
If (feature 0 <= 4.9)
Predict: 2.0
Else (feature 0 > 4.9)
If (feature 1 <= 2.2)
If (feature 2 <= 4.0)
Predict: 0.0
Else (feature 2 > 4.0)
Predict: 2.0
Else (feature 1 > 2.2)
Predict: 0.0
Else (feature 3 > 1.7)
If (feature 2 <= 4.8)
If (feature 0 <= 5.9)
Predict: 0.0
Else (feature 0 > 5.9)
Predict: 2.0
Else (feature 2 > 4.8)
Predict: 2.0
Spark 决策树--分类模型的更多相关文章
- Spark 决策树--回归模型
package Spark_MLlib import org.apache.spark.ml.Pipeline import org.apache.spark.ml.evaluation.Regres ...
- spark 决策树分类算法demo
分类(Classification) 下面的例子说明了怎样导入LIBSVM 数据文件,解析成RDD[LabeledPoint],然后使用决策树进行分类.GINI不纯度作为不纯度衡量标准并且树的最大深度 ...
- R语言决策树分类模型
rm(list=ls()) gc() memory.limit(4000) library(corrplot) library(rpart) data_health<-read.csv(&quo ...
- Spark学习笔记——构建分类模型
Spark中常见的三种分类模型:线性模型.决策树和朴素贝叶斯模型. 线性模型,简单而且相对容易扩展到非常大的数据集:线性模型又可以分成:1.逻辑回归:2.线性支持向量机 决策树是一个强大的非线性技术, ...
- Spark机器学习4·分类模型(spark-shell)
线性模型 逻辑回归--逻辑损失(logistic loss) 线性支持向量机(Support Vector Machine, SVM)--合页损失(hinge loss) 朴素贝叶斯(Naive Ba ...
- 笔记︱风控分类模型种类(决策、排序)比较与模型评估体系(ROC/gini/KS/lift)
每每以为攀得众山小,可.每每又切实来到起点,大牛们,缓缓脚步来俺笔记葩分享一下吧,please~ --------------------------- 本笔记源于CDA-DSC课程,由常国珍老师主讲 ...
- 初识spark的MLP模型
初识Spark的MLP模型 1. MLP介绍 Multi-layer Perceptron(MLP),即多层感知器,是一个前馈式的.具有监督的人工神经网络结构.通过多层感知器可包含多个隐藏层,实现对非 ...
- sklearn CART决策树分类
sklearn CART决策树分类 决策树是一种常用的机器学习方法,可以用于分类和回归.同时,决策树的训练结果非常容易理解,而且对于数据预处理的要求也不是很高. 理论部分 比较经典的决策树是ID3.C ...
- ML(4): 决策树分类
决策树(Decision Tree)是用于分类和预测的主要技术,它着眼于从一组无规则的事例推理出决策树表示形式的分类规则,采用自顶向下的递归方式,在决策树的内部节点进行属性值的比较,并根据不同属性判断 ...
随机推荐
- PHP 真值与空值
本文参考 http://php.net/manual/en/types.comparisons.php. 1. isset bool isset ( mixed $var [, mixed $... ...
- axios增加的自定义header,后端request取不到
1.拦截器配置 <!--拦截器--> <mvc:interceptors> <!-- web端增加头部接口 --> <mvc:interceptor> ...
- 商业研究(20):滴滴出行,进军海外包车?与OTA携程和包车创业公司,共演“三国杀”?看看分析师、投资人和权威人士等10个人的观点碰撞
小雷友情提示:创业有风险,投资需谨慎. 前一篇文章,在探讨境外游创业公司-皇包车和易途8的时候,提到"滴滴如果进军海外包车,为海外华人提供打车和包车服务,有较大可能对海外包车公司 ...
- 用二分法计算a的n次幂<算法分析>
实验目的:1.复习java编程:2.掌握二分法的基本原理:3.掌握使用java程序进行二分法计算a的n次幂.实验步骤:1.由用户输入a及n(均为整数):2.利用二分法完成计算,并将中间结果打印出来. ...
- 前端开发:JavaScript---ECMAScript
JavaScript:JavaScript是一种web前端的描述语言,也是一种基于对象(object)和事件驱动(Event Driven)的脚本语言.它运行在客户端从而减轻服务器的负担. js是一种 ...
- android开发里跳过的坑——调用已安装视频播放器在有些机器上无效
调用已安装视频播放器播放未修改之前的代码 private void startPlay(String fileName){ File file = new File(fileName); Intent ...
- cdq分治入门--BZOJ1176: [Balkan2007]Mokia
对w*w,w<=2000000的矩形,一开始全是0(或一开始全是s),n<=170000个操作,每次操作:矩阵内某点加上一个数,查某一个子矩阵的和,保证修改数<=160000,询问数 ...
- HTML5学习之语义化标签
一.为什么HTML5要引入新语义标签 在HTML5出现之前,我们一般采用DIV+CSS布局我们的页面.但是这样的布局方式不仅使我们的文档结构不够清晰,而且不利于搜索引擎爬虫对我们页面的爬取.为了解决上 ...
- [bzoj4826][Hnoi2017]影魔_单调栈_主席树
影魔 bzoj-4826 Hnoi-2017 题目大意:给定一个$n$个数的序列$a$,求满足一下情况的点对个数: 注释:$1\le n,m\le 2\cdot 10^5$,$1\le p1,p2\l ...
- Ubuntu 16.04利用SecureCRT上传/下载文件(sz/rz命令)
说明:XShell同样也是支持的. 一.安装软件 sudo apt-get install lrzsz 二.sz下载文件用法: #下载一个文件 sz filename #下载多个文件 sz filen ...