一, jar依赖,jsc创建。

package ML.BasicStatistics;

import com.google.common.collect.Lists;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.DoubleFlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.mllib.linalg.Matrices;
import org.apache.spark.mllib.linalg.Matrix;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.stat.KernelDensity;
import org.apache.spark.mllib.stat.MultivariateStatisticalSummary;
import org.apache.spark.mllib.stat.Statistics;
import org.apache.spark.mllib.stat.test.ChiSqTestResult;
import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult;
import org.apache.spark.mllib.util.MLUtils;
import org.apache.spark.rdd.RDD;
import scala.Tuple2;
import scala.runtime.Statics;
import static org.apache.spark.mllib.random.RandomRDDs.*; import java.util.*; /**
* Entry point for a collection of Spark MLlib basic-statistics examples.
*
* The opening of {@code main} shown here only prepares the Spark context
* {@code jsc}; the example statements that use it are presented section by
* section afterwards.
*
* @ClassName: BasicStatistics
* @author: DingH
* @since: 2019/4/3 16:11
*/
public class BasicStatistics {
public static void main(String[] args) {
// Sets the "hadoop.home.dir" system property to a hard-coded Windows path.
// NOTE(review): machine-specific value; presumably required for running
// Spark locally on Windows - adjust or remove for other environments.
System.setProperty("hadoop.home.dir","E:\\hadoop-2.6.5");
// Configure Spark with the application name "BasicStatistics" and the "local" master.
SparkConf conf = new SparkConf().setAppName("BasicStatistics").setMaster("local");
// Java-facing Spark context used to create the RDDs in the examples.
JavaSparkContext jsc = new JavaSparkContext(conf);

二。Summary statistics

        /**
         * Summary statistics.
         *
         * {@code Statistics.colStats} returns a {@code MultivariateStatisticalSummary}
         * instance that contains the column-wise max, min, mean, variance and number
         * of non-zeros, as well as the total count.
         */
        List<Vector> observations = Arrays.asList(
                Vectors.dense(1, 0, 3),
                Vectors.dense(2, 0, 4),
                Vectors.dense(3, 0, 5));
        JavaRDD<Vector> observationRdd = jsc.parallelize(observations);

        // colStats works on the underlying Scala RDD, hence the rdd() call.
        MultivariateStatisticalSummary columnStats = Statistics.colStats(observationRdd.rdd());

        // Print the per-column mean, variance and non-zero count, in that order.
        System.out.println(columnStats.mean());
        System.out.println(columnStats.variance());
        System.out.println(columnStats.numNonzeros());

三。Correlations:相关性

        /**
         * Correlations.
         *
         * Builds two double series from the numeric second field of each tuple
         * (seriesY is seriesX with 1 added to every value), prints the Pearson
         * correlation between them, and then prints the Spearman correlation
         * matrix of a small RDD of row vectors.
         */
        JavaRDD<Tuple2<String, String>> pairs = jsc.parallelize(Lists.newArrayList(
                new Tuple2<String, String>("cat", "11"),
                new Tuple2<String, String>("dog", "22"),
                new Tuple2<String, String>("cat", "33"),
                new Tuple2<String, String>("pig", "44")));

        // seriesX: the second tuple field parsed as a double.
        JavaDoubleRDD seriesX = pairs.mapPartitionsToDouble(
                new DoubleFlatMapFunction<Iterator<Tuple2<String, String>>>() {
                    @Override
                    public Iterable<Double> call(Iterator<Tuple2<String, String>> tuples) throws Exception {
                        List<Double> values = new ArrayList<Double>();
                        while (tuples.hasNext()) {
                            values.add(Double.parseDouble(tuples.next()._2));
                        }
                        return values;
                    }
                });

        // seriesY: the same values shifted by +1. It is derived from the same RDD,
        // so both series have the same partitioning and the same number of elements.
        JavaDoubleRDD seriesY = pairs.mapPartitionsToDouble(
                new DoubleFlatMapFunction<Iterator<Tuple2<String, String>>>() {
                    @Override
                    public Iterable<Double> call(Iterator<Tuple2<String, String>> tuples) throws Exception {
                        List<Double> values = new ArrayList<Double>();
                        while (tuples.hasNext()) {
                            values.add(Double.parseDouble(tuples.next()._2) + 1);
                        }
                        return values;
                    }
                });

        // Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
        // If a method is not specified, Pearson's method will be used by default.
        double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
        // The value used to be computed and then dropped; print it so the example shows a result.
        System.out.println(correlation);

        // Note that each Vector is a row and not a column.
        JavaRDD<Vector> rows = jsc.parallelize(Arrays.asList(
                Vectors.dense(1, 0, 3),
                Vectors.dense(2, 0, 4),
                Vectors.dense(3, 0, 5)));
        Matrix spearmanMatrix = Statistics.corr(rows.rdd(), "spearman");
        System.out.println(spearmanMatrix);

三,Stratified sampling:分层抽样

        /**
         * Stratified sampling.
         *
         * Turns a small list of (key, value) tuples into a pair RDD, samples it per
         * key with the fractions given below, and prints the approximate sample.
         */
        JavaRDD<Tuple2<String, String>> animals = jsc.parallelize(Lists.newArrayList(
                new Tuple2<String, String>("cat", "11"),
                new Tuple2<String, String>("dog", "22"),
                new Tuple2<String, String>("cat", "33"),
                new Tuple2<String, String>("pig", "44")));

        // Copies every tuple unchanged; only needed to obtain a pair RDD.
        PairFunction<Tuple2<String, String>, String, String> toKeyValue =
                new PairFunction<Tuple2<String, String>, String, String>() {
                    public Tuple2<String, String> call(Tuple2<String, String> entry) throws Exception {
                        return new Tuple2<String, String>(entry._1, entry._2);
                    }
                };

        // An RDD of any key value pairs.
        // NOTE(review): the raw JavaPairRDD type is kept as in the original; with type
        // arguments, Spark 1.x appears to expect Map<K, Object> for the fractions -
        // confirm against the Spark version in use before parameterizing.
        JavaPairRDD keyedAnimals = animals.mapToPair(toKeyValue);

        // Specify the exact fraction desired from each key
        // (the probability with which values of that key are taken).
        Map<String, Double> samplingFractions = new HashMap<String, Double>();
        samplingFractions.put("cat", 0.5);
        samplingFractions.put("dog", 0.8);
        samplingFractions.put("pig", 0.8);

        // Get a sample from each stratum: sampleByKey for the approximate variant,
        // sampleByKeyExact for the exact one. Both are drawn without replacement.
        JavaPairRDD approxSample = keyedAnimals.sampleByKey(false, samplingFractions);
        JavaPairRDD exactSample = keyedAnimals.sampleByKeyExact(false, samplingFractions);

        // Only the approximate sample is printed.
        VoidFunction printElement = new VoidFunction() {
            public void call(Object element) throws Exception {
                System.out.println(element);
            }
        };
        approxSample.foreach(printElement);

四。Hypothesis testing  假设检验

        /**
         * Hypothesis testing.
         *
         * Runs chi-squared tests on a vector, on a matrix and on an RDD of labeled
         * points, then a Kolmogorov-Smirnov test on a small double RDD, printing
         * every result.
         */
        // A vector composed of the frequencies of events.
        Vector eventFrequencies = Vectors.dense(1, 2, 3, 4);
        // Compute the goodness of fit. If a second vector to test against is not supplied
        // as a parameter, the test runs against a uniform distribution.
        ChiSqTestResult goodnessOfFit = Statistics.chiSqTest(eventFrequencies);
        // Summary of the test including the p-value, degrees of freedom, test statistic,
        // the method used, and the null hypothesis.
        System.out.println(goodnessOfFit);

        // A contingency matrix with 3 rows and 2 columns.
        double[] contingencyValues = {1, 2, 3, 4, 5, 6};
        Matrix contingency = Matrices.dense(3, 2, contingencyValues);
        // Conduct Pearson's independence test on the input contingency matrix.
        ChiSqTestResult independence = Statistics.chiSqTest(contingency);
        // Summary of the test including the p-value, degrees of freedom...
        System.out.println(independence);

        // An RDD of labeled points.
        // NOTE(review): "/data..." looks like a placeholder path - point it at a real
        // LibSVM file before running this part.
        JavaRDD<LabeledPoint> labeledPoints = MLUtils.loadLibSVMFile(jsc.sc(), "/data...").toJavaRDD();
        // The contingency table is constructed from the raw (feature, label) pairs and used
        // to conduct the independence test. Returns an array containing the
        // ChiSquaredTestResult for every feature against the label.
        ChiSqTestResult[] perFeatureResults = Statistics.chiSqTest(labeledPoints.rdd());
        for (int column = 0; column < perFeatureResults.length; column++) {
            // Columns are reported starting from 1.
            System.out.println("Column " + (column + 1) + ":");
            System.out.println(perFeatureResults[column]); // summary of the test
        }

        JavaDoubleRDD ksSample = jsc.parallelizeDoubles(Arrays.asList(0.2, 1.0, 0.3));
        KolmogorovSmirnovTestResult ksResult = Statistics.kolmogorovSmirnovTest(ksSample, "norm");
        // Summary of the test including the p-value, test statistic, and null hypothesis.
        // If our p-value indicates significance, we can reject the null hypothesis.
        System.out.println(ksResult);

五。Random data generation

         /**
          * Random data generation: uniform, standard normal, or Poisson.
          *
          * Generates a random double RDD with {@code normalJavaRDD}, rescales every
          * value and prints the results.
          */
        // NOTE(review): the arguments 100 and 2 are presumably the number of values
        // and the number of partitions - confirm against the RandomRDDs API.
        JavaDoubleRDD standardNormal = normalJavaRDD(jsc, 100, 2);

        // Apply a transform to get a random double RDD following `N(1, 4)`.
        Function<Double, Double> shiftAndScale = new Function<Double, Double>() {
            public Double call(Double value) throws Exception {
                return 1.0 + 2.0 * value;
            }
        };

        // Prints one element per line.
        VoidFunction<Double> printValue = new VoidFunction<Double>() {
            public void call(Double value) throws Exception {
                System.out.println(value);
            }
        };

        standardNormal.map(shiftAndScale).foreach(printValue);

六。Kernel density estimation

        /**
         * Kernel density estimation.
         *
         * Builds a {@code KernelDensity} estimator from three sample values and
         * prints the estimated density at three evaluation points.
         */
        // An RDD of sample data.
        JavaRDD<Double> samples = jsc.parallelize(Arrays.asList(1.0, 2.0, 3.0));

        // Construct the density estimator with the sample data and a standard deviation
        // for the Gaussian kernels.
        KernelDensity estimator = new KernelDensity()
                .setSample(samples)
                .setBandwidth(3.0);

        // Find density estimates for the given values.
        double[] evaluationPoints = {-1.0, 2.0, 5.0};
        double[] densities = estimator.estimate(evaluationPoints);
        for (double density : densities) {
            System.out.println(density);
        }

spark MLlib BasicStatistics 统计学基础的更多相关文章

  1. spark MLLib的基础统计部分学习

    参考学习链接:http://www.itnose.net/detail/6269425.html 机器学习相关算法,建议初学者去看看斯坦福的机器学习课程视频:http://open.163.com/s ...

  2. 【原创 Hadoop&Spark 动手实践 12】Spark MLLib 基础、应用与信用卡欺诈检测系统动手实践

    【原创 Hadoop&Spark 动手实践 12】Spark MLLib 基础、应用与信用卡欺诈检测系统动手实践

  3. Spark MLlib 机器学习

    本章导读 机器学习(machine learning, ML)是一门涉及概率论、统计学、逼近论、凸分析、算法复杂度理论等多领域的交叉学科。ML专注于研究计算机模拟或实现人类的学习行为,以获取新知识、新 ...

  4. 《Spark MLlib机器学习实践》内容简介、目录

      http://product.dangdang.com/23829918.html Spark作为新兴的.应用范围最为广泛的大数据处理开源框架引起了广泛的关注,它吸引了大量程序设计和开发人员进行相 ...

  5. Spark MLlib 之 Basic Statistics

    Spark MLlib提供了一些基本的统计学的算法,下面主要说明一下: 1.Summary statistics 对于RDD[Vector]类型,Spark MLlib提供了colStats的统计方法 ...

  6. Spark MLlib - Decision Tree源码分析

    http://spark.apache.org/docs/latest/mllib-decision-tree.html 以决策树作为开始,因为简单,而且也比较容易用到,当前的boosting或ran ...

  7. Spark入门实战系列--8.Spark MLlib(上)--机器学习及SparkMLlib简介

    [注]该系列文章以及使用到安装包/测试数据 可以在<倾情大奉送--Spark入门实战系列>获取 .机器学习概念 1.1 机器学习的定义 在维基百科上对机器学习提出以下几种定义: l“机器学 ...

  8. Spark入门实战系列--8.Spark MLlib(下)--机器学习库SparkMLlib实战

    [注]该系列文章以及使用到安装包/测试数据 可以在<倾情大奉送--Spark入门实战系列>获取 .MLlib实例 1.1 聚类实例 1.1.1 算法说明 聚类(Cluster analys ...

  9. Spark MLlib知识点学习整理

    MLlib的设计原理:把数据以RDD的形式表示,然后在分布式数据集上调用各种算法。MLlib就是RDD上一系列可供调用的函数的集合。 操作步骤: 1.用字符串RDD来表示信息。 2.运行MLlib中的 ...

随机推荐

  1. python之面向对象初识

    一.面向对象初识 1.结构上 面向对象分成两部分:属性.方法 class A: name = 'xiaoming' # 静态属性.静态变量.静态字段. def func1(self): # 函数.动态 ...

  2. C. Multi-Subject Competition 思维+前缀和+填表加减复杂度(复杂度计算错误)

    题意: 给出n个学生 m类题目 每个人会做s[i]类的题 并且做这个题的能力为r[i]  组成一个竞赛队 要求可以选择一些题目  在竞赛队中 擅长每一个题目的 人数要均等  求max(sigma(r[ ...

  3. Vivado寄存器初始值问题

    前言 本复位只针对Vivado中的寄存器复位. 什么时候需要复位?到底要不要复位?怎么复位?复位有什么卵用? 该复位的寄存器需要复位,复位使得寄存器恢复初始值,有的寄存器并不需要复位(数据流路径上). ...

  4. 【BZOJ5507】[GXOI/GZOI2019]旧词(树链剖分,线段树)

    [BZOJ5507][GXOI/GZOI2019]旧词(树链剖分,线段树) 题面 BZOJ 洛谷 题解 如果\(k=1\)就是链并裸题了... 其实\(k>1\)发现还是可以用类似链并的思想,这 ...

  5. hdu 3480 Division(四边形不等式优化)

    Problem Description Little D is really interested in the theorem of sets recently. There’s a problem ...

  6. [FJOI2018]领导集团问题

    [FJOI2018]领导集团问题 dp[i][j],i为根子树,最上面的值是j,选择的最大值 观察dp方程 1.整体Dp已经可以做了. 2.考虑优美一些的做法: dp[i]如果对j取后缀最大值,显然是 ...

  7. BZOJ4036 按位或

    解:有两种做法...... 第一种,按照秘密袭击coat的套路,我们只需要求出即可.因为一种操作了i次的方案会被恰好计数i次. 那么这个东西怎么求呢?直接用FWT的思想,对于一个状态s,求出选择s所有 ...

  8. GWAS分析基本流程及分析思路

    数据预处理(DNA genotyping.Quality control.Imputation) QC的工作可以做PLINK上完成Imputation的工作用IMPUTE2完成 2. 表型数据统计分析 ...

  9. LFYZ-OJ ID: 1008 求A/B高精度值

    思路 小数点前的部分可以通过m/n直接计算得出 小数点后的20位可通过循环进行快速计算,计算方法如下: m%=n m*=10 小数点后第i位为m/n,回到第1步 第3步后,如果m%n为0,说明已经除净 ...

  10. 第二节:比较DateTime和DateTimeOffset两种时间类型并介绍Quartz.Net中用到的几类时间形式(定点、四舍五入、倍数、递增)

    一. 时间的类型 1. 背景 这里为什么要介绍时间类型呢,明明是定时调度篇,原因是在定时任务中,任务什么时间开始执行,什么时间结束执行,要用到各种各样的时间模式,虽然这不能算是一个复杂的问题,但在正式 ...