spark算子
1.map
一条一条读取
// map: transforms the RDD one element at a time, producing one output per input.
def map(): Unit = {
  val names = List("张无忌", "赵敏", "周芷若")
  val namesRDD = sc.parallelize(names)
  // Prefix every name with a greeting.
  val greetingsRDD = namesRDD.map("Hello " + _)
  greetingsRDD.foreach(println)
}
2.flatMap
扁平化
// flatMap: splits each line into words and flattens all the word lists
// into a single RDD, then greets every word.
def flatMap(): Unit = {
  val lines = List("张无忌 赵敏", "宋青书 周芷若")
  val linesRDD = sc.parallelize(lines)
  val greetingsRDD = linesRDD
    .flatMap(_.split(" "))
    .map("Hello " + _)
  greetingsRDD.foreach(println)
}
3.mapPartitions
一次读取一个分区数据
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

import scala.collection.mutable.ListBuffer

// mapPartitions demo: processes one whole partition per function call
// instead of one element at a time.
object Demo {
  val conf = new SparkConf().setAppName("Demo").setMaster("local")
  // val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
  val spark = new SparkContext(conf)

  def main(args: Array[String]): Unit = {
    val numbers = List(1, 2, 3, 4, 5, 6)
    // Two partitions, so mapPartitions is invoked twice.
    val numbersRDD = spark.parallelize(numbers, 2)
    numbersRDD.foreach(println)
    val greetedRDD = numbersRDD.mapPartitions { partition =>
      // Lazily prefix every element in the partition.
      partition.map("hello" + _)
    }
    greetedRDD.foreach(println)
  }
}
4.mapPartitionsWithIndex
一次读取一个分区数据,并且知道是哪个分区的
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

import scala.collection.mutable.ListBuffer

// mapPartitionsWithIndex demo: like mapPartitions, but the function also
// receives the index of the partition being processed.
object Demo {
  val conf = new SparkConf().setAppName("Demo").setMaster("local")
  // val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
  val spark = new SparkContext(conf)

  def main(args: Array[String]): Unit = {
    val numbers = List(1, 2, 3, 4, 5, 6)
    val numbersRDD = spark.parallelize(numbers, 2)
    // Tag every element with the partition it came from, e.g. "0_1".
    val taggedRDD = numbersRDD.mapPartitionsWithIndex { (partIdx, partition) =>
      partition.map(value => partIdx + "_" + value)
    }
    taggedRDD.foreach(println)
  }
}
5.reduce
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

import scala.collection.mutable.ListBuffer

// reduce demo: an action that folds all RDD elements into a single value
// on the driver.
object Demo {
  val conf = new SparkConf().setAppName("Demo").setMaster("local")
  // val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
  val spark = new SparkContext(conf)

  def main(args: Array[String]): Unit = {
    val numbers = List(1, 2, 3, 4, 5, 6)
    val numbersRDD = spark.parallelize(numbers)
    // Sum of all elements.
    val total = numbersRDD.reduce(_ + _)
    println(total)
  }
}
6.reduceByKey
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

import scala.collection.mutable.ListBuffer

// reduceByKey demo: aggregates values that share the same key.
object Demo {
  val conf = new SparkConf().setAppName("Demo").setMaster("local")
  // val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
  val spark = new SparkContext(conf)

  def main(args: Array[String]): Unit = {
    val scores = List(("武当", 99), ("少林", 97), ("武当", 89), ("少林", 77))
    val scoresRDD = spark.parallelize(scores)
    // Sum the scores per key.
    val totalsRDD = scoresRDD.reduceByKey(_ + _)
    totalsRDD.foreach { case (school, total) =>
      println(school + ":" + total)
    }
  }
}
7.union
合并,但不去重
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

import scala.collection.mutable.ListBuffer

// union demo: concatenates two RDDs; duplicates are NOT removed.
object Demo {
  val conf = new SparkConf().setAppName("Demo").setMaster("local")
  // val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
  val spark = new SparkContext(conf)

  def main(args: Array[String]): Unit = {
    val left = spark.parallelize(List(1, 2, 3, 4))
    val right = spark.parallelize(List(3, 4, 5, 6))
    // 3 and 4 will appear twice in the output.
    left.union(right).foreach(println)
  }
}
8.join
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

import scala.collection.mutable.ListBuffer

// join demo: inner-joins two pair RDDs on their keys, yielding
// (key, (leftValue, rightValue)).
object Demo {
  val conf = new SparkConf().setAppName("Demo").setMaster("local")
  // val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
  val spark = new SparkContext(conf)

  def main(args: Array[String]): Unit = {
    val names = List((1, "东方不败"), (2, "令狐冲"), (3, "林平之"))
    val scores = List((1, 99), (2, 98), (3, 97))
    val namesRDD = spark.parallelize(names)
    val scoresRDD = spark.parallelize(scores)
    val joinedRDD = namesRDD.join(scoresRDD)
    // Destructure each joined record directly in the pattern.
    joinedRDD.foreach { case (id, (name, score)) =>
      println("学号:" + id + " 姓名:" + name + " 成绩:" + score)
    }
  }
}
9.groupByKey
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

import scala.collection.mutable.ListBuffer

// groupByKey demo: groups all values sharing a key into one Iterable.
object Demo {
  val conf = new SparkConf().setAppName("Demo").setMaster("local")
  // val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
  val spark = new SparkContext(conf)

  def main(args: Array[String]): Unit = {
    val roster = List(("武当", "张三丰"), ("峨眉", "灭绝师太"), ("武当", "宋青书"), ("峨眉", "周芷若"))
    val rosterRDD = spark.parallelize(roster)
    val groupedRDD = rosterRDD.groupByKey()
    groupedRDD.foreach { case (school, members) =>
      // Each member is followed by a space, matching the original output.
      val memberList = members.map(_ + " ").mkString
      println("门派:" + school + "人员:" + memberList)
    }
  }
}
10.cartesian
笛卡尔积
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

import scala.collection.mutable.ListBuffer

// cartesian demo: produces every possible pairing of elements from the
// two RDDs (the Cartesian product).
object Demo {
  val conf = new SparkConf().setAppName("Demo").setMaster("local")
  // val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
  val spark = new SparkContext(conf)

  def main(args: Array[String]): Unit = {
    val lettersRDD = spark.parallelize(List("A", "B"))
    val numbersRDD = spark.parallelize(List(1, 2, 3))
    lettersRDD.cartesian(numbersRDD).foreach { case (letter, number) =>
      println(letter + "->" + number)
    }
  }
}
11.filter
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

import scala.collection.mutable.ListBuffer

// filter demo: keeps only the elements for which the predicate is true.
object Demo {
  val conf = new SparkConf().setAppName("Demo").setMaster("local")
  // val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
  val spark = new SparkContext(conf)

  def main(args: Array[String]): Unit = {
    val numbers = List(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
    val numbersRDD = spark.parallelize(numbers)
    // Keep only the even numbers.
    numbersRDD.filter(_ % 2 == 0).foreach(print)
  }
}
12.distinct
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

import scala.collection.mutable.ListBuffer

// distinct demo: removes duplicate elements from the RDD.
object Demo {
  val conf = new SparkConf().setAppName("Demo").setMaster("local")
  // val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
  val spark = new SparkContext(conf)

  def main(args: Array[String]): Unit = {
    val numbers = List(1, 1, 2, 2, 3, 3, 4, 5)
    val numbersRDD = spark.parallelize(numbers)
    numbersRDD.distinct().foreach(println)
  }
}
13.intersection
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

import scala.collection.mutable.ListBuffer

// intersection demo: keeps only the elements present in BOTH RDDs.
object Demo {
  val conf = new SparkConf().setAppName("Demo").setMaster("local")
  // val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
  val spark = new SparkContext(conf)

  def main(args: Array[String]): Unit = {
    val left = spark.parallelize(List(1, 2, 3, 4))
    val right = spark.parallelize(List(3, 4, 5, 6))
    // Prints 3 and 4 only.
    left.intersection(right).foreach(println)
  }
}
14.coalesce
分区有多-->少
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

import scala.collection.mutable.ListBuffer

// coalesce demo: reduces the number of partitions (here 3 -> 1)
// without a full shuffle.
object Demo {
  val conf = new SparkConf().setAppName("Demo").setMaster("local")
  // val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
  val spark = new SparkContext(conf)

  def main(args: Array[String]): Unit = {
    val numbers = List(1, 2, 3, 4, 5)
    spark.parallelize(numbers, 3).coalesce(1).foreach(println)
  }
}
15.repartition
进行重分区
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

import scala.collection.mutable.ListBuffer

// repartition demo: reshuffles the RDD into a new number of
// partitions (here 1 -> 2).
object Demo {
  val conf = new SparkConf().setAppName("Demo").setMaster("local")
  // val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
  val spark = new SparkContext(conf)

  def main(args: Array[String]): Unit = {
    val numbers = List(1, 2, 3, 4)
    val numbersRDD = spark.parallelize(numbers, 1)
    numbersRDD.repartition(2).foreach(println)
  }
}
16.repartitionAndSortWithinPartitions
在给定的partitioner内部进行排序,性能比repartition要高。
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

import scala.collection.mutable.ListBuffer

// repartitionAndSortWithinPartitions demo: repartitions a pair RDD with the
// given partitioner and sorts records by key inside each partition in the
// same shuffle, which is cheaper than repartition followed by a sort.
object Demo {
  val conf = new SparkConf().setAppName("Demo").setMaster("local")
  // val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
  val spark = new SparkContext(conf)

  def main(args: Array[String]): Unit = {
    val numbers = List(1, 4, 55, 66, 33, 48, 23)
    val numbersRDD = spark.parallelize(numbers, 1)
    numbersRDD
      // Pair each number with itself so it has a key to partition/sort by.
      .map(n => (n, n))
      .repartitionAndSortWithinPartitions(new HashPartitioner(2))
      // Tag each (key, value) pair with its partition index, e.g. "0_(4,4)".
      .mapPartitionsWithIndex((partIdx, partition) => partition.map(partIdx + "_" + _), false)
      .foreach(println)
  }
}
17.cogroup
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

import scala.collection.mutable.ListBuffer

// cogroup demo: groups the values of three pair RDDs by key, yielding
// (key, (valuesFromFirst, valuesFromSecond, valuesFromThird)).
object Demo {
  val conf = new SparkConf().setAppName("Demo").setMaster("local")
  // val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
  val spark = new SparkContext(conf)

  def main(args: Array[String]): Unit = {
    val firstRDD = spark.parallelize(List((1, "www"), (2, "bbs")))
    val secondRDD = spark.parallelize(List((1, "cnblog"), (2, "cnblog"), (3, "very")))
    val thirdRDD = spark.parallelize(List((1, "com"), (2, "com"), (3, "good")))
    firstRDD.cogroup(secondRDD, thirdRDD).foreach { case (key, (first, second, third)) =>
      println(key + " " + first + " " + second + " " + third)
    }
  }
}
18.sortByKey
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

import scala.collection.mutable.ListBuffer

// sortByKey demo: sorts a pair RDD by key; the `false` argument
// requests descending order.
object Demo {
  val conf = new SparkConf().setAppName("Demo").setMaster("local")
  // val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
  val spark = new SparkContext(conf)

  def main(args: Array[String]): Unit = {
    val scores = List((99, "张三丰"), (96, "东方不败"), (66, "林平之"), (98, "聂风"))
    spark
      .parallelize(scores)
      .sortByKey(false)
      .foreach { case (score, name) => println(name + "->" + score) }
  }
}
19.aggregateByKey
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

import scala.collection.mutable.ListBuffer

// aggregateByKey demo: word count where the zero value, the within-partition
// combiner, and the cross-partition combiner are given separately.
object Demo {
  val conf = new SparkConf().setAppName("Demo").setMaster("local")
  // val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
  val spark = new SparkContext(conf)

  def main(args: Array[String]): Unit = {
    val lines = List("you,jump", "i,jump")
    spark
      .parallelize(lines)
      .flatMap(line => line.split(","))
      .map(word => (word, 1))
      // Zero value 0; both the seqOp and combOp simply add counts.
      .aggregateByKey(0)((acc, count) => acc + count, (left, right) => left + right)
      .foreach { case (word, total) => println(word + "->" + total) }
  }
}
// NOTE(review): the first line of this snippet was truncated in the paste —
// it read `apache.spark.{...}` without the leading `import org.`; restored here.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

import scala.collection.mutable.ListBuffer

// groupByKey demo (duplicate of example 9): groups all values sharing a key
// into one Iterable and prints each group.
object Demo {
  val conf = new SparkConf().setAppName("Demo").setMaster("local")
  // val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
  val spark = new SparkContext(conf)

  def main(args: Array[String]): Unit = {
    val list = List(("武当", "张三丰"), ("峨眉", "灭绝师太"), ("武当", "宋青书"), ("峨眉", "周芷若"))
    val rdd1 = spark.parallelize(list)
    val rdd2 = rdd1.groupByKey()
    rdd2.foreach(t => {
      val menpai = t._1
      val iterator = t._2.iterator
      // Build a space-separated member list (each name followed by a space).
      var people = ""
      while (iterator.hasNext) people = people + iterator.next + " "
      println("门派:" + menpai + "人员:" + people)
    })
  }
}
spark算子的更多相关文章
- (转)Spark 算子系列文章
http://lxw1234.com/archives/2015/07/363.htm Spark算子:RDD基本转换操作(1)–map.flatMap.distinct Spark算子:RDD创建操 ...
- Spark算子总结及案例
spark算子大致上可分三大类算子: 1.Value数据类型的Transformation算子,这种变换不触发提交作业,针对处理的数据项是Value型的数据. 2.Key-Value数据类型的Tran ...
- UserView--第二种方式(避免第一种方式Set饱和),基于Spark算子的java代码实现
UserView--第二种方式(避免第一种方式Set饱和),基于Spark算子的java代码实现 测试数据 java代码 package com.hzf.spark.study; import ...
- UserView--第一种方式set去重,基于Spark算子的java代码实现
UserView--第一种方式set去重,基于Spark算子的java代码实现 测试数据 java代码 package com.hzf.spark.study; import java.util.Ha ...
- spark算子之DataFrame和DataSet
前言 传统的RDD相对于mapreduce和storm提供了丰富强大的算子.在spark慢慢步入DataFrame到DataSet的今天,在算子的类型基本不变的情况下,这两个数据集提供了更为强大的的功 ...
- Spark算子总结(带案例)
Spark算子总结(带案例) spark算子大致上可分三大类算子: 1.Value数据类型的Transformation算子,这种变换不触发提交作业,针对处理的数据项是Value型的数据. 2.Key ...
- Spark算子---实战应用
Spark算子实战应用 数据集 :http://grouplens.org/datasets/movielens/ MovieLens 1M Datase 相关数据文件 : users.dat --- ...
- spark算子集锦
Spark 是大数据领域的一大利器,花时间总结了一下 Spark 常用算子,正所谓温故而知新. Spark 算子按照功能分,可以分成两大类:transform 和 action.Transform 不 ...
- Spark算子使用
一.spark的算子分类 转换算子和行动算子 转换算子:在使用的时候,spark是不会真正执行,直到需要行动算子之后才会执行.在spark中每一个算子在计算之后就会产生一个新的RDD. 二.在编写sp ...
- Spark:常用transformation及action,spark算子详解
常用transformation及action介绍,spark算子详解 一.常用transformation介绍 1.1 transformation操作实例 二.常用action介绍 2.1 act ...
随机推荐
- 10.3 Vue 路由系统
Vue 路由系统 简单示例 main.js import Vue from 'vue' import App from './App.vue' //https://router.vuejs.or ...
- 爬虫 BeatifulSoup 模块
BeatifulSoup 模块 介绍 Beautiful Soup 是一个可以从HTML或XML文件中提取数据的Python库 安装 pip install beautifulsoup4 解析器下载 ...
- Awesome CLI
请移步https://github.com/zhuxiaoxi/awesome-online-tools 欢迎一同维护这个列表 jq JSON工具 shellcheck 更好用的Shell语法检查 c ...
- 关于JavaScript(脚本语言)
1.typeof运算符:判断一个对象是否是什么类型,返回“” 一.数字类型(Number) 1.javascript不擅长计算,不能用于浮点数的计算.如:var a = 0.2; var b = 0. ...
- Ubuntu16.04 安装g++6
https://blog.csdn.net/qq_34877350/article/details/81182022 1.安装gcc-6: sudo apt-get update && ...
- PHP之道 - php各方面的知识汇总
看到一个PHP的知识各方面的汇总,写的很有借鉴意义,搬过来了 转自: https://laravel-china.github.io/php-the-right-way/ 欢迎阅读 其他语言版本 参与 ...
- Go-常用库的介绍
一.Go常用包介绍 fmt.它实现了格式化的输入输出操作,其中的fmt.Printf()和fmt.Println()是开 发者使用最为频繁的函数. io.它实现了一系列非平台相关的IO相关接口 ...
- 【VS】VS2013 未找到与约束contractname 匹配的导出
#事故现场 今天win10更新后,打开vs2013新建项目报错: #解决方案: 1.控制面板->程序->程序和功能,找到 Entity Framework Tools for Visual ...
- CMDB资产管理系统开发【day26】:linux客户端开发
客户端疑难点及获取流程 1.linux客户端支持2就可以,python3就是很麻烦 难道你要求所有的客户端都上pytho3吗? 现在从bin的入口进去 HouseStark.ArgvHandler(s ...
- React 记录(4)
React文档:https://www.reactjscn.com/docs/components-and-props.html 慢慢学习:对照教程文档,逐句猜解,截图 React官网:https:/ ...