10、spark高级编程
一、基于排序机制的wordcount程序
1、要求
1、对文本文件内的每个单词都统计出其出现的次数。 2、按照每个单词出现的次数,降序排序。
2、代码实现
------java实现-------
package cn.spark.study.core;

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/**
 * Word count whose output is ordered by word frequency, descending.
 */
public class SortWordCount {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("SortWordCount").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.textFile("D:\\test-file\\spark.txt");

        // One RDD element per word.
        JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Iterable<String> call(String t) throws Exception {
                return Arrays.asList(t.split(" "));
            }
        });

        // Pair every word with an initial count of 1.
        JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<String, Integer> call(String t) throws Exception {
                return new Tuple2<String, Integer>(t, 1);
            }
        });

        // Sum the counts per word: the plain (word, count) word count.
        JavaPairRDD<String, Integer> wordCounts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });

        // The requirement is to order by occurrence count, descending.
        // wordCounts holds elements like (hello, 3) (you, 2); flip them to
        // (3, hello) (2, you) so sortByKey can sort on the count.
        JavaPairRDD<Integer, String> countWords = wordCounts.mapToPair(
                new PairFunction<Tuple2<String, Integer>, Integer, String>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<Integer, String> call(Tuple2<String, Integer> t) throws Exception {
                return new Tuple2<Integer, String>(t._2, t._1);
            }
        });

        // Sort on the key (the count), descending.
        JavaPairRDD<Integer, String> sortedCountWords = countWords.sortByKey(false);

        // Flip (count, word) back to (word, count).
        JavaPairRDD<String, Integer> sortedWordCounts = sortedCountWords.mapToPair(
                new PairFunction<Tuple2<Integer, String>, String, Integer>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<String, Integer> call(Tuple2<Integer, String> t) throws Exception {
                return new Tuple2<String, Integer>(t._2, t._1);
            }
        });

        // Print the word counts, now ordered by frequency.
        sortedWordCounts.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<String, Integer> t) throws Exception {
                System.out.println(t._1 + " appears " + t._2 + " times.");
            }
        });

        sc.close();
    }
}

---------scala实现---------
package cn.spark.study.core

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Word count whose output is ordered by word frequency, descending.
 * @author Administrator
 */
object SortWordCount {

  def main(args: Array[String]) {
    val conf = new SparkConf()
      .setAppName("SortWordCount")
      .setMaster("local")
    val sc = new SparkContext(conf)

    val lines = sc.textFile("D:\\test-file\\spark.txt", 1)
    val words = lines.flatMap { line => line.split(" ") }
    val pairs = words.map { word => (word, 1) }
    val wordCounts = pairs.reduceByKey(_ + _)

    // Flip (word, count) to (count, word) so sortByKey orders by count.
    val countWords = wordCounts.map(wc => (wc._2, wc._1))
    val sortedCountWords = countWords.sortByKey(false)
    // Flip back to (word, count) after the descending sort.
    val sortedWordCounts = sortedCountWords.map(cw => (cw._2, cw._1))

    sortedWordCounts.foreach(swc => println(
      swc._1 + " appear " + swc._2 + " times."))
  }
}
二、二次排序
1、要求
1、按照文件中的第一列排序。 2、如果第一列相同,则按照第二列排序。
2、java代码
###SecondarySortKey
package cn.spark.study.core;

import java.io.Serializable;

import scala.math.Ordered;

/**
 * Custom secondary-sort key: orders by {@code first}, breaking ties
 * with {@code second}.
 * @author Administrator
 */
public class SecondarySortKey implements Ordered<SecondarySortKey>, Serializable {

    private static final long serialVersionUID = -2366006422945129991L;

    // The columns the sort is based on.
    private int first;
    private int second;

    public SecondarySortKey(int first, int second) {
        this.first = first;
        this.second = second;
    }

    // All relational operators delegate to compare() so the ordering is
    // defined in exactly one place and cannot drift out of sync.
    @Override
    public boolean $greater(SecondarySortKey other) {
        return compare(other) > 0;
    }

    @Override
    public boolean $greater$eq(SecondarySortKey other) {
        return compare(other) >= 0;
    }

    @Override
    public boolean $less(SecondarySortKey other) {
        return compare(other) < 0;
    }

    @Override
    public boolean $less$eq(SecondarySortKey other) {
        return compare(other) <= 0;
    }

    @Override
    public int compare(SecondarySortKey other) {
        // Integer.compare avoids the overflow that the original
        // `this.first - other.getFirst()` subtraction hits when the operands
        // are far apart (e.g. Integer.MIN_VALUE vs. a positive value).
        int cmp = Integer.compare(this.first, other.getFirst());
        return cmp != 0 ? cmp : Integer.compare(this.second, other.getSecond());
    }

    @Override
    public int compareTo(SecondarySortKey other) {
        return compare(other);
    }

    // Getters and setters for the sort columns, plus hashCode/equals.
    public int getFirst() {
        return first;
    }

    public void setFirst(int first) {
        this.first = first;
    }

    public int getSecond() {
        return second;
    }

    public void setSecond(int second) {
        this.second = second;
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + first;
        result = prime * result + second;
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (getClass() != obj.getClass())
            return false;
        SecondarySortKey other = (SecondarySortKey) obj;
        return first == other.first && second == other.second;
    }
}

###SecondarySort
package cn.spark.study.core;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/**
 * Secondary sort:
 * 1. Implement a custom key (Ordered + Serializable) that encodes the
 *    multi-column ordering.
 * 2. Map every text line to a (customKey, line) pair RDD.
 * 3. sortByKey, which delegates to the custom key's ordering.
 * 4. Map again to drop the key and keep only the sorted lines.
 * @author Administrator
 */
public class SecondarySort {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("SecondarySort")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.textFile("D:\\test-file\\sort.txt");

        // Pair each line with a SecondarySortKey built from its two columns.
        JavaPairRDD<SecondarySortKey, String> pairs = lines.mapToPair(
                new PairFunction<String, SecondarySortKey, String>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<SecondarySortKey, String> call(String line) throws Exception {
                String[] lineSplited = line.split(" ");
                SecondarySortKey key = new SecondarySortKey(
                        Integer.valueOf(lineSplited[0]),
                        Integer.valueOf(lineSplited[1]));
                return new Tuple2<SecondarySortKey, String>(key, line);
            }
        });

        JavaPairRDD<SecondarySortKey, String> sortedPairs = pairs.sortByKey();

        // Drop the key; keep only the original line text.
        JavaRDD<String> sortedLines = sortedPairs.map(
                new Function<Tuple2<SecondarySortKey, String>, String>() {
            private static final long serialVersionUID = 1L;

            @Override
            public String call(Tuple2<SecondarySortKey, String> v1) throws Exception {
                return v1._2;
            }
        });

        sortedLines.foreach(new VoidFunction<String>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void call(String t) throws Exception {
                System.out.println(t);
            }
        });

        sc.close();
    }
}
3、scala代码
###SecondSortKey
package cn.spark.study.core

/**
 * Custom secondary-sort key: orders by `first`, breaking ties with `second`.
 * @author Administrator
 */
class SecondSortKey(val first: Int, val second: Int)
    extends Ordered[SecondSortKey] with Serializable {

  def compare(that: SecondSortKey): Int = {
    // Integer.compare avoids the overflow that the original
    // `this.first - that.first` subtraction hits when the operands are
    // far apart (e.g. Int.MinValue vs. a positive value).
    if (this.first != that.first) Integer.compare(this.first, that.first)
    else Integer.compare(this.second, that.second)
  }
}

###SecondSort
package cn.spark.study.core

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Driver for the secondary sort: keys each line with a SecondSortKey,
 * sorts by that key, then drops the key and prints the lines.
 * @author Administrator
 */
object SecondSort {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("SecondSort")
      .setMaster("local")
    val sc = new SparkContext(conf)

    val lines = sc.textFile("D:\\test-file\\sort.txt", 1)
    val pairs = lines.map { line =>
      // Split once per line (the original called line.split(" ") twice).
      val cols = line.split(" ")
      (new SecondSortKey(cols(0).toInt, cols(1).toInt), line)
    }
    val sortedPairs = pairs.sortByKey()
    val sortedLines = sortedPairs.map(sortedPair => sortedPair._2)

    sortedLines.foreach { sortedLine => println(sortedLine) }
  }
}
三、topn
1、要求
1、对文本文件内的数字,取最大的前3个。 2、对每个班级内的学生成绩,取出前3名。(分组取topn) 3、课后作业:用Scala来实现分组取topn。
2、获取文本内最大的前三个数
---------java实现----------
package cn.spark.study.core;

import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/**
 * Prints the three largest numbers found in a text file (one per line).
 */
public class Top3 {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("Top3Java").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.textFile("D:\\test-file\\top.txt");

        // Key each line by its numeric value so sortByKey orders numerically.
        JavaPairRDD<Integer, String> pairs = lines.mapToPair(new PairFunction<String, Integer, String>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<Integer, String> call(String t) throws Exception {
                return new Tuple2<Integer, String>(Integer.valueOf(t), t);
            }
        });

        // Descending numeric sort.
        JavaPairRDD<Integer, String> sortedPairs = pairs.sortByKey(false);

        JavaRDD<Integer> sortedNumbers = sortedPairs.map(new Function<Tuple2<Integer, String>, Integer>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Integer call(Tuple2<Integer, String> v1) throws Exception {
                return v1._1;
            }
        });

        // e.g. [9, 7, 6]
        List<Integer> sortedNumberList = sortedNumbers.take(3);
        for (Integer num : sortedNumberList) {
            System.out.println(num);
        }

        sc.close();
    }
}

---------scala实现----------
package cn.spark.study.core

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Prints the three largest numbers found in a text file (one per line).
 * @author Administrator
 */
object Top3 {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("Top3")
      .setMaster("local")
    val sc = new SparkContext(conf)

    val lines = sc.textFile("D:\\test-file\\top.txt", 1)
    // Key each line by its numeric value so sortByKey orders numerically.
    val pairs = lines.map { line => (line.toInt, line) }
    val sortedPairs = pairs.sortByKey(false)
    val sortedNumbers = sortedPairs.map(sortedPair => sortedPair._1)

    val top3Number = sortedNumbers.take(3)
    for (num <- top3Number) {
      println(num)
    }
  }
}
3、对每个班级内的学生成绩,取出前3名。(分组取topn)
----java实现-----
package cn.spark.study.core;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/**
 * Grouped top-3: for every class, print the three highest scores.
 * @author Administrator
 */
public class GroupTop3 {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("Top3")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.textFile("D:\\test-file\\score.txt");

        // Each line is "<className> <score>".
        JavaPairRDD<String, Integer> pairs = lines.mapToPair(
                new PairFunction<String, String, Integer>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<String, Integer> call(String line) throws Exception {
                String[] lineSplited = line.split(" ");
                // Integer.valueOf converts int to Integer or parses a String;
                // it throws NumberFormatException on null or "" input.
                return new Tuple2<String, Integer>(lineSplited[0],
                        Integer.valueOf(lineSplited[1]));
            }
        });

        JavaPairRDD<String, Iterable<Integer>> groupedPairs = pairs.groupByKey();

        JavaPairRDD<String, Iterable<Integer>> top3Score = groupedPairs.mapToPair(
                new PairFunction<Tuple2<String, Iterable<Integer>>, String, Iterable<Integer>>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<String, Iterable<Integer>> call(
                    Tuple2<String, Iterable<Integer>> classScores) throws Exception {
                // Fixed-size buffer kept in descending order; unfilled slots
                // remain null until enough scores have been seen.
                Integer[] top3 = new Integer[3];
                String className = classScores._1;

                Iterator<Integer> scores = classScores._2.iterator();
                while (scores.hasNext()) {
                    Integer score = scores.next();
                    for (int i = 0; i < 3; i++) {
                        if (top3[i] == null) {
                            top3[i] = score;
                            break;
                        } else if (score > top3[i]) {
                            // Shift smaller entries down before inserting.
                            for (int j = 2; j > i; j--) {
                                top3[j] = top3[j - 1];
                            }
                            top3[i] = score;
                            break;
                        }
                    }
                }

                // Drop unfilled slots so a class with fewer than three scores
                // does not emit "null" lines (the original returned
                // Arrays.asList(top3) with the nulls included).
                List<Integer> top3List = new ArrayList<Integer>();
                for (Integer score : top3) {
                    if (score != null) {
                        top3List.add(score);
                    }
                }
                return new Tuple2<String, Iterable<Integer>>(className, top3List);
            }
        });

        top3Score.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<String, Iterable<Integer>> t) throws Exception {
                System.out.println("class: " + t._1);
                Iterator<Integer> scoreIterator = t._2.iterator();
                while (scoreIterator.hasNext()) {
                    Integer score = scoreIterator.next();
                    System.out.println(score);
                }
                System.out.println("=======================================");
            }
        });

        sc.close();
    }
}

-----scala实现------
package cn.spark.study.core

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Grouped top-3 in Scala: for every class, print the three highest scores.
 */
object GroupTop3 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("GroupTop3Scala").setMaster("local")
    val context = new SparkContext(conf)

    val linesRDD = context.textFile("D:\\test-file\\score.txt")
    // Each line is "<className> <score>"; split once per line.
    val studentScores = linesRDD.map { line =>
      val cols = line.split(" ")
      (cols(0), cols(1).toInt)
    }
    val groupStudentScores = studentScores.groupByKey()

    // Keep the three highest scores per class, descending.
    // BUG FIX: the original hand-rolled insertion compared an Int slot
    // against Nil (`maxScore(i) == Nil`), which is always false, so the
    // "empty slot" branch never ran and the code only worked by accident
    // for positive scores (the array's default 0 lost every comparison).
    // Sorting makes the intent explicit and also handles classes with
    // fewer than three scores instead of padding with zeros.
    val result = groupStudentScores.map(student =>
      (student._1, student._2.toList.sortWith(_ > _).take(3)))

    result.foreach(result => {
      // Typo fixed in the output label: "前三明" -> "前三名".
      print(result._1 + "班级前三名成绩为:")
      for (i <- 0 until result._2.length) {
        if (i == 0) print(result._2(i))
        else print("," + result._2(i))
      }
      println()
    })
  }
}
10、spark高级编程的更多相关文章
- Learning Spark中文版--第六章--Spark高级编程(2)
Working on a Per-Partition Basis(基于分区的操作) 以每个分区为基础处理数据使我们可以避免为每个数据项重做配置工作.如打开数据库连接或者创建随机数生成器这样的操作,我们 ...
- Learning Spark中文版--第六章--Spark高级编程(1)
Introduction(介绍) 本章介绍了之前章节没有涵盖的高级Spark编程特性.我们介绍两种类型的共享变量:用来聚合信息的累加器和能有效分配较大值的广播变量.基于对RDD现有的transform ...
- spark高级编程
启动spark-shell 如果你有一个Hadoop 集群, 并且Hadoop 版本支持YARN, 通过为Spark master 设定yarn-client 参数值,就可以在集群上启动Spark 作 ...
- C#高级编程笔记 (6至10章节)运算符/委托/字符/正则/集合
数学的复习,4^-2即是1/4/4的意思, 4^2是1*2*2的意思,而10^-2为0.01! 7.2运算符 符号 说明 例 ++ 操作数加1 int i=3; j=i++; 运算后i的值为4,j ...
- Spark Graphx编程指南
问题导读1.GraphX提供了几种方式从RDD或者磁盘上的顶点和边集合构造图?2.PageRank算法在图中发挥什么作用?3.三角形计数算法的作用是什么?Spark中文手册-编程指南Spark之一个快 ...
- Apache Spark 2.2.0 中文文档 - Spark Streaming 编程指南 | ApacheCN
Spark Streaming 编程指南 概述 一个入门示例 基础概念 依赖 初始化 StreamingContext Discretized Streams (DStreams)(离散化流) Inp ...
- Apache Spark 2.2.0 中文文档 - Spark Streaming 编程指南
Spark Streaming 编程指南 概述 一个入门示例 基础概念 依赖 初始化 StreamingContext Discretized Streams (DStreams)(离散化流) Inp ...
- jQuery高级编程
jquery高级编程1.jquery入门2.Javascript基础3.jQuery核心技术 3.1 jQuery脚本的结构 3.2 非侵扰事JavaScript 3.3 jQuery框架的结构 3. ...
- unix环境高级编程基础知识之第二篇(3)
看了unix环境高级编程第三章,把代码也都自己敲了一遍,另主要讲解了一些IO函数,read/write/fseek/fcntl:这里主要是c函数,比较容易,看多了就熟悉了.对fcntl函数讲解比较到位 ...
随机推荐
- Eclipse 安装反编译插件 Eclipse Class Decompiler
Eclipse Class Decompiler在线安装方法 https://blog.csdn.net/tangjinquan1157/article/details/77506015 Eclips ...
- 使用activiti的designer插件记录
1.activiti添加排他网,条件下载condition中 2.activiti添加监听Listener,知道3种方法 1.实现taskListener 通过加载java class的方式去加载实现 ...
- 私有属性和私有方法l
class Woman: def __init__(self, name): self.name=name self.__age=18 def __secret(self): print(" ...
- 前端开发 Vue Vue.js和Nodejs的关系
首先vue.js 是库,不是框架,不是框架,不是框架. Vue.js 使用了基于 HTML 的模版语法,允许开发者声明式地将 DOM 绑定至底层 Vue 实例的数据. Vue.js 的核心是一个允许你 ...
- BUAA_OO第四单元总结性博客作业——UML(Floyd实现规则检查?)
一.架构设计 1.UML第一次作业——类图 第一次作业基于不同element在UML规格中的从属关系来设计架构.继承了UmlInteraction接口的MyUmlInteraction类是主要的交互层 ...
- netaddr网络地址工具python
print("==========1==========") from netaddr import IPNetwork # IPNetwork('192.168.7.80/30' ...
- expect脚本远程登录、远程执行命令和脚本传参简单用法
expect介绍: 最近想写一个自动化安装脚本,涉及到远程登录.分发文件包.远程执行命令等,其中少不了来回输入登录密码,交互式输入命令等,这样就大大降低了效率,那么有什么方法能解决呢?不妨试试expe ...
- es6 javascript的Class 类的继承
原文链接:https://blog.csdn.net/qq_30100043/article/details/53542531 1 基本用法 Class 之间可以通过extends关键字实现继承, 这 ...
- Linux路由:CentOS6的多种玩法
将一台Linux主机作路由器使用,这本是件很容易的事情,利用Linux主机强大的网络功能,很轻松就实现了.这里在虚拟机环境下设定一台CentOS主机通过另一台CentOS主机路由接入Internet网 ...
- 数据分析常用shell命令
目录 0.vim编辑器 1.awk命令(重要) 1.1 基本语法 1.2 基本用法 1.3 运算符 1.4 内建变量 1.5 其他 1.6 awk是一门变成语言,支持条件判断.数组.循环等功能.所以我 ...