数据源 data 的说明请参考我的博客:http://www.cnblogs.com/wwxbi/p/6063613.html

import org.apache.spark.sql.DataFrameStatFunctions

import org.apache.spark.sql.functions._

相关系数

val df = Range(0,10,step=1).toDF("id").withColumn("rand1", rand(seed=10)).withColumn("rand2", rand(seed=27))
df: org.apache.spark.sql.DataFrame = [id: int, rand1: double ... 1 more field]

df.show
+---+-------------------+-------------------+
| id| rand1| rand2|
+---+-------------------+-------------------+
| 0|0.41371264720975787| 0.714105256846827|
| 1| 0.7311719281896606| 0.8143487574232506|
| 2| 0.9031701155118229| 0.5282207324381174|
| 3|0.09430205113458567| 0.4420100497826609|
| 4|0.38340505276222947| 0.9387162206758006|
| 5| 0.5569246135523511| 0.6398126862647711|
| 6| 0.4977441406613893| 0.9895498513115722|
| 7| 0.2076666106201438| 0.3398720242725498|
| 8| 0.9571919406508957|0.15042237695815963|
| 9| 0.7429395461204413| 0.7302723457066639|
+---+-------------------+-------------------+

df.stat.corr("rand1", "rand2", "pearson")
res24: Double = -0.10993962467082698

查看数据的统计分布情况

val colArray = Array("age", "yearsmarried", "religiousness", "education", "occupation", "rating")

// 查看数据的统计分布情况
val descrDF = data.describe("age", "yearsmarried", "religiousness", "education", "occupation", "rating")
descrDF: org.apache.spark.sql.DataFrame = [summary: string, age: string ... 5 more fields]

descrDF.selectExpr("summary",
"round(age,2) as age",
"round(yearsmarried,2) as yearsmarried",
"round(religiousness,2) as religiousness",
"round(education,2) as education",
"round(occupation,2) as occupation",
"round(rating,2) as rating").show(10, truncate = false)
+-------+-----+------------+-------------+---------+----------+------+
|summary|age |yearsmarried|religiousness|education|occupation|rating|
+-------+-----+------------+-------------+---------+----------+------+
|count |601.0|601.0 |601.0 |601.0 |601.0 |601.0 |
|mean |32.49|8.18 |3.12 |16.17 |4.19 |3.93 |
|stddev |9.29 |5.57 |1.17 |2.4 |1.82 |1.1 |
|min |17.5 |0.13 |1.0 |9.0 |1.0 |1.0 |
|max |57.0 |15.0 |5.0 |20.0 |7.0 |5.0 |
+-------+-----+------------+-------------+---------+----------+------+

统计字段中元素的个数

// 统计字段中元素的个数
val fi = data.stat.freqItems(colArray)
fi: org.apache.spark.sql.DataFrame = [age_freqItems: array<double>, yearsmarried_freqItems: array<double> ... 4 more fields]

fi.printSchema()
root
|-- age_freqItems: array (nullable = true)
| |-- element: double (containsNull = false)
|-- yearsmarried_freqItems: array (nullable = true)
| |-- element: double (containsNull = false)
|-- religiousness_freqItems: array (nullable = true)
| |-- element: double (containsNull = false)
|-- education_freqItems: array (nullable = true)
| |-- element: double (containsNull = false)
|-- occupation_freqItems: array (nullable = true)
| |-- element: double (containsNull = false)
|-- rating_freqItems: array (nullable = true)
| |-- element: double (containsNull = false)

val f = fi.selectExpr(
"size(age_freqItems)",
"size(yearsmarried_freqItems)",
"size(religiousness_freqItems)",
"size(education_freqItems)",
"size(occupation_freqItems)",
"size(rating_freqItems)")
f: org.apache.spark.sql.DataFrame = [size(age_freqItems): int, size(yearsmarried_freqItems): int ... 4 more fields]

f.show(10, truncate = false)
+-------------------+----------------------------+-----------------------------+-------------------------+--------------------------+----------------------+
|size(age_freqItems)|size(yearsmarried_freqItems)|size(religiousness_freqItems)|size(education_freqItems)|size(occupation_freqItems)|size(rating_freqItems)|
+-------------------+----------------------------+-----------------------------+-------------------------+--------------------------+----------------------+
|9 |8 |5 |7 |7 |5 |
+-------------------+----------------------------+-----------------------------+-------------------------+--------------------------+----------------------+

集合字段的元素

// 集合字段的元素
val f1 = data.stat.freqItems(Array("age", "yearsmarried", "religiousness"))
f1: org.apache.spark.sql.DataFrame = [age_freqItems: array<double>, yearsmarried_freqItems: array<double> ... 1 more field]

f1.show(10, truncate = false)
+------------------------------------------------------+-----------------------------------------------+-------------------------+
|age_freqItems |yearsmarried_freqItems |religiousness_freqItems |
+------------------------------------------------------+-----------------------------------------------+-------------------------+
|[32.0, 47.0, 22.0, 52.0, 37.0, 17.5, 27.0, 57.0, 42.0]|[0.75, 0.125, 1.5, 0.417, 4.0, 7.0, 10.0, 15.0]|[2.0, 5.0, 4.0, 1.0, 3.0]|
+------------------------------------------------------+-----------------------------------------------+-------------------------+

// 对数组的元素排序
f1.selectExpr("sort_array(age_freqItems)", "sort_array(yearsmarried_freqItems)", "sort_array(religiousness_freqItems)").show(10, truncate = false)
+------------------------------------------------------+-----------------------------------------------+-----------------------------------------+
|sort_array(age_freqItems, true) |sort_array(yearsmarried_freqItems, true) |sort_array(religiousness_freqItems, true)|
+------------------------------------------------------+-----------------------------------------------+-----------------------------------------+
|[17.5, 22.0, 27.0, 32.0, 37.0, 42.0, 47.0, 52.0, 57.0]|[0.125, 0.417, 0.75, 1.5, 4.0, 7.0, 10.0, 15.0]|[1.0, 2.0, 3.0, 4.0, 5.0] |
+------------------------------------------------------+-----------------------------------------------+-----------------------------------------+

// 集合字段的元素
val f2 = data.stat.freqItems(Array("education", "occupation", "rating"))
f2: org.apache.spark.sql.DataFrame = [education_freqItems: array<double>, occupation_freqItems: array<double> ... 1 more field]

f2.show(10, truncate = false)
+-----------------------------------------+-----------------------------------+-------------------------+
|education_freqItems |occupation_freqItems |rating_freqItems |
+-----------------------------------------+-----------------------------------+-------------------------+
|[17.0, 20.0, 14.0, 16.0, 9.0, 18.0, 12.0]|[2.0, 5.0, 4.0, 7.0, 1.0, 3.0, 6.0]|[2.0, 5.0, 4.0, 1.0, 3.0]|
+-----------------------------------------+-----------------------------------+-------------------------+

// 对数组的元素排序
f2.selectExpr("sort_array(education_freqItems)", "sort_array(occupation_freqItems)", "sort_array(rating_freqItems)").show(10, truncate = false)
+-----------------------------------------+--------------------------------------+----------------------------------+
|sort_array(education_freqItems, true) |sort_array(occupation_freqItems, true)|sort_array(rating_freqItems, true)|
+-----------------------------------------+--------------------------------------+----------------------------------+
|[9.0, 12.0, 14.0, 16.0, 17.0, 18.0, 20.0]|[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0] |[1.0, 2.0, 3.0, 4.0, 5.0] |
+-----------------------------------------+--------------------------------------+----------------------------------+

Spark2 探索性数据统计分析的更多相关文章

  1. 初识Spark2.0之Spark SQL

    内存计算平台spark在今年6月份的时候正式发布了spark2.0,相比上一版本的spark1.6版本,在内存优化,数据组织,流计算等方面都做出了较大的改变,同时更加注重基于DataFrame数据组织 ...

  2. Hadoop 3.1.2(HA)+Zookeeper3.4.13+Hbase1.4.9(HA)+Hive2.3.4+Spark2.4.0(HA)高可用集群搭建

    目录 目录 1.前言 1.1.什么是 Hadoop? 1.1.1.什么是 YARN? 1.2.什么是 Zookeeper? 1.3.什么是 Hbase? 1.4.什么是 Hive 1.5.什么是 Sp ...

  3. spark2.3.0 配置spark sql 操作hive

    spark可以通过读取hive的元数据来兼容hive,读取hive的表数据,然后在spark引擎中进行sql统计分析,从而,通过spark sql与hive结合实现数据分析将成为一种最佳实践.配置步骤 ...

  4. geotrellis使用(二十五)将Geotrellis移植到spark2.0

    目录 前言 升级spark到2.0 将geotrellis最新版部署到spark2.0(CDH) 总结 一.前言        事情总是变化这么快,前面刚写了一篇博客介绍如何将geotrellis移植 ...

  5. 统计分析中Type I Error与Type II Error的区别

    统计分析中Type I Error与Type II Error的区别 在统计分析中,经常提到Type I Error和Type II Error.他们的基本概念是什么?有什么区别? 下面的表格显示 b ...

  6. Ubuntu14.04或16.04下安装JDK1.8+Scala+Hadoop2.7.3+Spark2.0.2

    为了将Hadoop和Spark的安装简单化,今日写下此帖. 首先,要看手头有多少机器,要安装伪分布式的Hadoop+Spark还是完全分布式的,这里分别记录. 1. 伪分布式安装 伪分布式的Hadoo ...

  7. Hadoop学习笔记—20.网站日志分析项目案例(三)统计分析

    网站日志分析项目案例(一)项目介绍:http://www.cnblogs.com/edisonchou/p/4449082.html 网站日志分析项目案例(二)数据清洗:http://www.cnbl ...

  8. maven+spark2.0.0最大连通分量

    运用到了spark2.0.0的graphx包,要手动地在pom.xml里面添加依赖包,要什么就在里面添加依赖,然后在run->maven install

  9. Eclipse+maven+scala2.11.8+spark2.0.0的环境部署

    主要在maven-for-scalaIDE纠结了,因为在eclipse版本是luna4.x 里面有自己带有的maven. 根据网上面无脑的下一步下一步,出现了错误,在此讲解各个插件的用途,以此新人看见 ...

随机推荐

  1. BarTender连接不上数据库怎么办

    由于各种原因,在使用BarTender连接到数据库时,有可能会出现无法连接的问题,下面小编就针对两种BarTender无法连接到数据库的问题,来教大家解决的方法. 第一种 BarTender无权打开文 ...

  2. 一个非常好的C#字符串操作处理类StringHelper.cs

    /// <summary> /// 类说明:Assistant /// 编 码 人:苏飞 /// 联系方式:361983679 /// 更新网站:http://www.sufeinet.c ...

  3. Git学习笔记(三)

    Git提交相关内容 在Git提交时,会保存一个提交对象,该对象包括一个指向暂存区内容快照的指针,包括本次提交作者等相关附属信息,包括零个或多个指向该提交对象的父对象指针:首次提交时是没有祖先,普通提交 ...

  4. DropDownListFor的种种纠结(禁止转载)

    严重禁止转载,好多爬虫软件为了浏览到处抓东西,真缺德 具有键“CorpType”的 ViewData 项属于类型“System.Int64”,但它必须属于类型“IEnumerable<Selec ...

  5. Tomcat catalina-deamon.out 日志切割 每天生成一个文件

    Tomcat 使用 jsvc 以守护进程的方式启动(daemon.sh ).这样tomcat自身将会生成另外一个日志文件(catalina-daemon.out),而不是之前的catalina.out ...

  6. 品鉴同事发来的炸金花的PHP程序代码

    今天同事发来了一个炸金花的PHP程序,这个代码实现了两个人通过各自的三张牌进行权重计算,得到分数进行比较得到谁的牌大,我觉得里面还有一些问题,代码如下: <?php /** 每张牌的分值为一个2 ...

  7. ios开发之--数组的一些操作

    1,创建数组 NSMutableArray * array =[[NSMutableArray alloc] initWithObjects:@"a",@"b" ...

  8. iOS 图片加载速度极限优化—FastImageCache解析

    FastImageCache是Path团队开发的一个开源库,用于提升图片的加载和渲染速度,让基于图片的列表滑动起来更顺畅,来看看它是怎么做的.优化点iOS从磁盘加载一张图片,使用UIImageVIew ...

  9. 【AI】图像识别-物体检测-百度AI-EasyDL-NodeJS

    var https = require('https') var express = require('express'); var app = express(); var bodyParser = ...

  10. 使用 urllib 处理 HTTP 异常

    (1) 我们发起 HTTP 请求,有时会发生异常,如请求超时,登录密码错误,请求链接不存在等等,使用 urllib.request.URLError 可以捕获这些与 URL 相关的异常(2) urll ...