Spark实现销量统计
package com.mengyao.examples.spark.core; import java.io.Serializable; import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction; import scala.Tuple2; /**
* 国内乘用车4月、1-4月销量数据统计
* @author mengyao
*
*/
@SuppressWarnings("all")
public class CarSaleStatistics { static class Sale implements Serializable {
private static final long serialVersionUID = -5393067134730174480L;
//排名
private int no;
//车型
private String model;
//车企
private String brand;
//4月销量
private int fourSale;
//1-4月累计销量
private int totalSale;
public Sale(int no, String model, String brand, int fourSale, int totalSale) {
this.no = no;
this.model = model;
this.brand = brand;
this.fourSale = fourSale;
this.totalSale = totalSale;
}
public int getNo() {
return no;
}
public void setNo(int no) {
this.no = no;
}
public String getModel() {
return model;
}
public void setModel(String model) {
this.model = model;
}
public String getBrand() {
return brand;
}
public void setBrand(String brand) {
this.brand = brand;
}
public int getFourSale() {
return fourSale;
}
public void setFourSale(int fourSale) {
this.fourSale = fourSale;
}
public int getTotalSale() {
return totalSale;
}
public void setTotalSale(int totalSale) {
this.totalSale = totalSale;
}
@Override
public String toString() {
return no + "\t" + model + "\t" + brand + "\t" + fourSale + "\t" + totalSale;
}
} /**
* 集群模式:spark-submit --class com.mengyao.examples.spark.core.CarSaleStatistics --master yarn --deploy-mode cluster --driver-memory 2048m --executor-memory 1024m --executor-cores 1 --queue default examples-0.0.1-SNAPSHOT.jar /data/carsales_data/2018.4-china-car-sales_volume.txt /data/carsales_data/statistics/
* 本地模式:Run As > Java Application
* @param args [in,out]
*/
public static void main(String[] args) {
SparkConf conf = new SparkConf()
.setAppName(CarSaleStatistics.class.getName());
if (null==args||args.length==0) {
args = new String[]{"./src/main/resources/data/2018.4-china-car-sales_volume.txt", "D:/"};
System.setProperty("hadoop.home.dir", "D:/softs/dev/apache/hadoop-2.7.5");
conf.setMaster("local");
}
JavaSparkContext sc = new JavaSparkContext(conf);
//中国市场合资、国产乘用车4月分销量数据
JavaRDD<String> linesRDD = sc.textFile(args[0]);
//按品牌分组
JavaPairRDD<String, Sale> brandSalesRDD = linesRDD.mapToPair(new PairFunction<String, String, Sale>() {
private static final long serialVersionUID = -3023653638555855696L;
@Override
public Tuple2<String, Sale> call(String line) throws Exception {
String[] fields = line.split("\t");
Sale sale = new Sale(Integer.parseInt(fields[0]), fields[1], fields[2], Integer.parseInt(fields[3]), Integer.parseInt(fields[4]));
return new Tuple2<String, Sale>(sale.getBrand(), sale);
}
});
//同品牌4月总销量、1-4月总销量
JavaPairRDD<String, Sale> brandTotalSalesRDD = brandSalesRDD.reduceByKey(new Function2<Sale, Sale, Sale>() {
private static final long serialVersionUID = 1L;
@Override
public Sale call(Sale item1, Sale item2) throws Exception {
item2.setFourSale(item1.getFourSale()+item2.getFourSale());
item2.setTotalSale(item1.getTotalSale()+item2.getTotalSale());
item2.setModel(item1.getModel()+","+item2.getModel());
return item2;
}
});
//4月份销量排名,转换key为4月销量
JavaPairRDD<Integer, Sale> fourSaleRankRDD = brandTotalSalesRDD.mapToPair(new PairFunction<Tuple2<String,Sale>, Integer, Sale>() {
private static final long serialVersionUID = 2012736852338064223L;
@Override
public Tuple2<Integer, Sale> call(Tuple2<String, Sale> t) throws Exception {
return new Tuple2<Integer, Sale>(t._2.getFourSale(), t._2);
}
});
//4月份销量排名降序
JavaPairRDD<Integer, Sale> fourSaleRankDescRDD = fourSaleRankRDD.sortByKey(false);
fourSaleRankDescRDD.foreach(new VoidFunction<Tuple2<Integer,Sale>>() {
private static final long serialVersionUID = -8110929872210046547L;
@Override
public void call(Tuple2<Integer, Sale> t) throws Exception {
Sale sale = t._2;
System.out.println("==== 4月份销量排名:"+sale.getBrand()+" = "+sale.getFourSale());
}
});
fourSaleRankDescRDD.saveAsNewAPIHadoopFile(args[1]+"fourSaleRank", NullWritable.class, Text.class, TextOutputFormat.class); //1-4月份累计销量排名,转换key为1-4月销量
JavaPairRDD<Integer, Sale> totalSaleRankRDD = brandTotalSalesRDD.mapToPair(new PairFunction<Tuple2<String,Sale>, Integer, Sale>() {
private static final long serialVersionUID = 2012736852338064223L;
@Override
public Tuple2<Integer, Sale> call(Tuple2<String, Sale> t) throws Exception {
return new Tuple2<Integer, Sale>(t._2.getTotalSale(), t._2);
}
});
//1-4月份累计销量排名降序
JavaPairRDD<Integer, Sale> totalSaleRankDescRDD = totalSaleRankRDD.sortByKey(false);
totalSaleRankDescRDD.foreach(new VoidFunction<Tuple2<Integer,Sale>>() {
private static final long serialVersionUID = -8110929872210046547L;
@Override
public void call(Tuple2<Integer, Sale> t) throws Exception {
Sale sale = t._2;
System.out.println("==== 1-4月份累计销量排名:"+sale.getBrand()+" = "+sale.getTotalSale());
}
});
fourSaleRankDescRDD.saveAsNewAPIHadoopFile(args[1]+"oneTofourSaleRank", NullWritable.class, Text.class, TextOutputFormat.class);
//关闭
sc.close();
} }
查看HDP Spark的HistoryServer(IP,18081),如下图表示成功:
Spark实现销量统计的更多相关文章
- Spark MLib 基本统计汇总 2
4. 假设检验 基础回顾: 假设检验,用于判断一个结果是否在统计上是显著的.这个结果是否有机会发生. 显著性检验 原假设与备择假设 常把一个要检验的假设记作 H0,称为原假设(或零假设) (null ...
- Spark MLib 基本统计汇总 1
1. 概括统计 summary statistics MLlib支持RDD[Vector]列式的概括统计,它通过调用 Statistics 的 colStats方法实现. colStats返回一个 ...
- Spark Streaming 002 统计单词的例子
1.准备 事先在hdfs上创建两个目录: 保存上传数据的目录:hdfs://alamps:9000/library/SparkStreaming/data checkpoint的目录:hdfs://a ...
- [Spark Core] Spark 实现气温统计
0. 说明 聚合气温数据,聚合出 MAX . MIN . AVG 1. Spark Shell 实现 1.1 MAX 分步实现 # 加载文档 val rdd1 = sc.textFile(" ...
- spark 累加历史 + 统计全部 + 行转列
spark 累加历史主要用到了窗口函数,而进行全部统计,则需要用到rollup函数 1 应用场景: 1.我们需要统计用户的总使用时长(累加历史) 2.前台展现页面需要对多个维度进行查询,如:产品.地 ...
- spark 省份次数统计实例
//统计access.log文件里面IP地址对应的省份,并把结果存入到mysql package access1 import java.sql.DriverManager import org.ap ...
- spark复习笔记(3):使用spark实现单词统计
wordcount是spark入门级的demo,不难但是很有趣.接下来我用命令行.scala.Java和python这三种语言来实现单词统计. 一.使用命令行实现单词的统计 1.首先touch一个a. ...
- spark jdk8 单词统计示例
在github上有spark-java8 实例地址: https://github.com/ypriverol/spark-java8 https://github.com/ihr/java8-spa ...
- Spark入门案例 - 统计单词个数 / wordcount
Scala版 import org.apache.spark.{SparkConf, SparkContext} object WordCountScala { def main(args: Arra ...
随机推荐
- WPF中DataGrid的应用-绑定,增改删,分页,样式
参考以下网址: http://www.cnblogs.com/fwbnet/archive/2012/05/08/2490974.html
- 在html在添加cookie和读取cookie
1.保存cookie var oDate = new Date(); oDate.setDate(oDate.getDate() + );//有效期为30天 document.cookie = &qu ...
- 文件上传C:\fakepath\解决方案
1.设置IE:工具 -> Internet选项 -> 安全 -> 自定义级别 -> 找到“其他”中的“将本地文件上载至服务器时包含本地目录路径”,选中“启用”即可 2.利用js ...
- Luogu1041 NOIP2003传染病控制(搜索)
暴搜加个最优性剪枝即可.一直觉得正式比赛出这种不能一眼看出来暴搜就行了的搜索题的出题人都是毒瘤. #include<iostream> #include<cstdio> #in ...
- RESTful Webservice
1,REST和RESTFUL是什么? REST ( REpresentational State Transfer ),State Transfer 为 "状态传输" 或 &quo ...
- 2015 EC L - Multiplication Table
/************************************************************************* > File Name: L.cpp > ...
- [洛谷P3975][TJOI2015]弦论
题目大意:求一个字符串的第$k$大字串,$t$表示长得一样位置不同的字串是否算多个 题解:$SAM$,先求出每个位置可以到达多少个字串($Right$数组),然后在转移图上$DP$,若$t=1$,初始 ...
- [NOI.AC省选模拟赛3.31] 星辰大海 [半平面交]
题面 传送门 思路 懒得解释了......也是比较简单的结论 但是自己看到几何就退缩了...... 下周之内写一个计算几何的学习笔记! Code #include<iostream> #i ...
- POJ.2739 Sum of Consecutive Prime Numbers(水)
POJ.2739 Sum of Consecutive Prime Numbers(水) 代码总览 #include <cstdio> #include <cstring> # ...
- 20181022 考试记录&高级数据结构
题目 W神爷的题解 高级数据结构 T1: 其实是一道easy题,$O(n^3log n)$ 也是能卡过去的,本着要的70分的心态,最后尽然A了. 如果是正解则是$O(n^3)$,当确定你要选择的列时, ...