Prepare the test data source (each line is a userId and a rating, separated by a space):

c1 85
c2 77
c3 88
c1 22
c1 66
c3 95
c3 54
c2 91
c2 66
c1 54
c1 65
c2 41
c4 65

Spark Scala implementation:

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object GroupTopN1 {
  System.setProperty("hadoop.home.dir", "D:\\Java_Study\\hadoop-common-2.2.0-bin-master")

  case class Rating(userId: String, rating: Long)

  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("ALS with ML Pipeline")
    val spark = SparkSession
      .builder()
      .config(sparkConf)
      .master("local")
      .config("spark.sql.warehouse.dir", "/")
      .getOrCreate()
    import spark.implicits._
    import spark.sql

    // Parse each "userId rating" line into a Rating and register it as a temp view
    val lines = spark.read.textFile("C:\\Users\\Administrator\\Desktop\\group.txt")
    val classScores = lines.map(line => Rating(line.split(" ")(0), line.split(" ")(1).toLong))
    classScores.createOrReplaceTempView("tb_test")

    // Rank ratings within each userId and keep the top 3 rows per group
    val df = sql(
      """|select
         |  userId,
         |  rating,
         |  row_number() over(partition by userId order by rating desc) rn
         |from tb_test
         |having(rn <= 3)
         |""".stripMargin)
    df.show()
    spark.stop()
  }
}

Output:

+------+------+---+
|userId|rating| rn|
+------+------+---+
| c1| 85| 1|
| c1| 66| 2|
| c1| 65| 3|
| c4| 65| 1|
| c3| 95| 1|
| c3| 88| 2|
| c3| 54| 3|
| c2| 91| 1|
| c2| 77| 2|
| c2| 66| 3|
+------+------+---+
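
The same group-wise top 3 can also be computed without SQL through the DataFrame Window API. A minimal sketch, assuming the classScores Dataset and the spark.implicits._ import from the code above:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.row_number

// Rank rows within each userId by descending rating, then keep ranks 1..3
val byUserDesc = Window.partitionBy("userId").orderBy($"rating".desc)
classScores.toDF()
  .withColumn("rn", row_number().over(byUserDesc))
  .where($"rn" <= 3)
  .show()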

Spark Java implementation:

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.*;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.util.ArrayList;
import java.util.List;

public class Test {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("ALS with ML Pipeline");
        SparkSession spark = SparkSession
                .builder()
                .config(sparkConf)
                .master("local")
                .config("spark.sql.warehouse.dir", "/")
                .getOrCreate();

        // Create an RDD from the text file
        JavaRDD<String> peopleRDD = spark.sparkContext()
                .textFile("C:\\Users\\Administrator\\Desktop\\group.txt", 1)
                .toJavaRDD();

        // Build the schema explicitly: userId (string), rating (long)
        List<StructField> fields = new ArrayList<>();
        fields.add(DataTypes.createStructField("userId", DataTypes.StringType, true));
        fields.add(DataTypes.createStructField("rating", DataTypes.LongType, true));
        StructType schema = DataTypes.createStructType(fields);

        // Convert records of the RDD to Rows
        JavaRDD<Row> rowRDD = peopleRDD.map((Function<String, Row>) record -> {
            String[] attributes = record.split(" ");
            if (attributes.length != 2) {
                throw new RuntimeException("Invalid record: " + record);
            }
            return RowFactory.create(attributes[0], Long.valueOf(attributes[1].trim()));
        });

        // Apply the schema to the RDD and register a temp view
        Dataset<Row> peopleDataFrame = spark.createDataFrame(rowRDD, schema);
        peopleDataFrame.createOrReplaceTempView("tb_test");

        Dataset<Row> items = spark.sql("select userId,rating,row_number() over(partition by userId order by rating desc) rn " +
                "from tb_test " +
                "having(rn<=3)");
        items.show();
        spark.stop();
    }
}

The output is the same as shown above.
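
One caveat: having(rn<=3) filters a window-function alias through a HAVING clause with no GROUP BY, and how that is resolved differs between Spark versions. A more portable formulation ranks in a subquery and filters with WHERE - a sketch assuming the same tb_test view:

val top3 = spark.sql(
  """|select userId, rating, rn from (
     |  select userId, rating,
     |         row_number() over(partition by userId order by rating desc) rn
     |  from tb_test
     |) ranked
     |where rn <= 3
     |""".stripMargin)
top3.show()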

Implementing TopN in Java with combineByKey:

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;

import java.util.*;

public class SparkJava {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]").appName("Spark").getOrCreate();
        final JavaSparkContext ctx = JavaSparkContext.fromSparkContext(spark.sparkContext());

        List<String> data = Arrays.asList("a,110,a1", "b,122,b1", "c,123,c1", "a,210,a2", "b,212,b2",
                "a,310,a3", "b,312,b3", "a,410,a4", "b,412,b4");
        JavaRDD<String> javaRDD = ctx.parallelize(data);

        // Map each record to a (key, value) pair
        JavaPairRDD<String, Integer> javaPairRDD = javaRDD.mapToPair(new PairFunction<String, String, Integer>() {
            public Tuple2<String, Integer> call(String key) throws Exception {
                return new Tuple2<String, Integer>(key.split(",")[0], Integer.valueOf(key.split(",")[1]));
            }
        });

        final int topN = 3;
        JavaPairRDD<String, List<Integer>> combineByKeyRDD2 = javaPairRDD.combineByKey(
                // createCombiner: start a list with the first value seen for a key
                new Function<Integer, List<Integer>>() {
                    public List<Integer> call(Integer v1) throws Exception {
                        List<Integer> items = new ArrayList<Integer>();
                        items.add(v1);
                        return items;
                    }
                },
                // mergeValue: add the new value, then drop the minimum once the list exceeds topN
                new Function2<List<Integer>, Integer, List<Integer>>() {
                    public List<Integer> call(List<Integer> v1, Integer v2) throws Exception {
                        v1.add(v2);
                        if (v1.size() > topN) {
                            v1.remove(Collections.min(v1));
                        }
                        return v1;
                    }
                },
                // mergeCombiners: merge two partial lists and trim back down to topN
                new Function2<List<Integer>, List<Integer>, List<Integer>>() {
                    public List<Integer> call(List<Integer> v1, List<Integer> v2) throws Exception {
                        v1.addAll(v2);
                        while (v1.size() > topN) {
                            v1.remove(Collections.min(v1));
                        }
                        return v1;
                    }
                });

        // Flatten K: String, V: List<Integer> into K: String, V: Integer
        // old: [(a,[210, 310, 410]), (b,[212, 312, 412]), (c,[123])]
        // new: [(a,210), (a,310), (a,410), (b,212), (b,312), (b,412), (c,123)]
        JavaRDD<Tuple2<String, Integer>> javaTupleRDD = combineByKeyRDD2.flatMap(
                new FlatMapFunction<Tuple2<String, List<Integer>>, Tuple2<String, Integer>>() {
                    public Iterator<Tuple2<String, Integer>> call(Tuple2<String, List<Integer>> stringListTuple2) throws Exception {
                        List<Tuple2<String, Integer>> items = new ArrayList<Tuple2<String, Integer>>();
                        for (Integer v : stringListTuple2._2) {
                            items.add(new Tuple2<String, Integer>(stringListTuple2._1, v));
                        }
                        return items.iterator();
                    }
                });

        // Convert the tuples to Rows and build a DataFrame with an explicit schema
        JavaRDD<Row> rowRDD = javaTupleRDD.map(new Function<Tuple2<String, Integer>, Row>() {
            public Row call(Tuple2<String, Integer> kv) throws Exception {
                return RowFactory.create(kv._1, kv._2);
            }
        });

        ArrayList<StructField> fields = new ArrayList<StructField>();
        fields.add(DataTypes.createStructField("key", DataTypes.StringType, true));
        fields.add(DataTypes.createStructField("TopN_values", DataTypes.IntegerType, true));
        StructType schema = DataTypes.createStructType(fields);

        Dataset<Row> df = spark.createDataFrame(rowRDD, schema);
        df.printSchema();
        df.show();
        spark.stop();
    }
}

Output:

root
 |-- key: string (nullable = true)
 |-- TopN_values: integer (nullable = true)

+---+-----------+
|key|TopN_values|
+---+-----------+
| a| 210|
| a| 310|
| a| 410|
| b| 212|
| b| 312|
| b| 412|
| c| 123|
+---+-----------+

Implementing TopN in Spark with combineByKeyWithClassTag

The combineByKeyWithClassTag function relies on the ordering of a TreeSet; this example keeps the largest N elements within each group. The code is below:

  • createCombiner simply puts the first element into the TreeSet and returns it;
  • mergeValue inserts the new element and, if the set then holds more than N elements, removes the smallest one;
  • mergeCombiners merges the two sets and, if the total exceeds N, removes the smallest element repeatedly until only N elements remain.
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

import scala.collection.mutable

object Main {
  val N = 3

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[*]")
      .appName("Spark")
      .getOrCreate()
    val sc = spark.sparkContext

    val sampleDataset = List(
      ("apple.com", 3L),
      ("apple.com", 4L),
      ("apple.com", 1L),
      ("apple.com", 9L),
      ("google.com", 4L),
      ("google.com", 1L),
      ("google.com", 2L),
      ("google.com", 3L),
      ("google.com", 11L),
      ("google.com", 32L),
      ("slashdot.org", 11L),
      ("slashdot.org", 12L),
      ("slashdot.org", 13L),
      ("slashdot.org", 14L),
      ("slashdot.org", 15L),
      ("slashdot.org", 16L),
      ("slashdot.org", 17L),
      ("slashdot.org", 18L),
      ("microsoft.com", 5L),
      ("microsoft.com", 2L),
      ("microsoft.com", 6L),
      ("microsoft.com", 9L),
      ("google.com", 4L))

    val urdd: RDD[(String, Long)] = sc.parallelize(sampleDataset).map((t) => (t._1, t._2))

    val topNs = urdd.combineByKeyWithClassTag(
      // createCombiner
      (firstInt: Long) => {
        var uset = new mutable.TreeSet[Long]()
        uset += firstInt
      },
      // mergeValue
      (uset: mutable.TreeSet[Long], value: Long) => {
        uset += value
        while (uset.size > N) {
          uset.remove(uset.min)
        }
        uset
      },
      // mergeCombiners
      (uset1: mutable.TreeSet[Long], uset2: mutable.TreeSet[Long]) => {
        var resultSet = uset1 ++ uset2
        while (resultSet.size > N) {
          resultSet.remove(resultSet.min)
        }
        resultSet
      }
    )

    import spark.implicits._
    topNs.flatMap(rdd => {
      var uset = new mutable.HashSet[String]()
      for (i <- rdd._2.toList) {
        uset += rdd._1 + "/" + i.toString
      }
      uset
    }).map(rdd => {
      (rdd.split("/")(0), rdd.split("/")(1))
    }).toDF("key", "TopN_values").show()
  }
}

Reference: https://blog.csdn.net/gpwner/article/details/78455234

Output:

+-------------+-----------+
| key|TopN_values|
+-------------+-----------+
| google.com| 4|
| google.com| 11|
| google.com| 32|
|microsoft.com| 9|
|microsoft.com| 6|
|microsoft.com| 5|
| apple.com| 4|
| apple.com| 9|
| apple.com| 3|
| slashdot.org| 16|
| slashdot.org| 17|
| slashdot.org| 18|
+-------------+-----------+
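
As a side note, the final flatMap above concatenates each key and value into a "key/value" string only to split it apart again. A sketch of flattening straight to tuples (assuming the same topNs RDD and the spark.implicits._ import) avoids that round trip:

// Flatten each (key, TreeSet of values) pair directly into (key, value) tuples
topNs
  .flatMap { case (key, values) => values.toList.map(v => (key, v)) }
  .toDF("key", "TopN_values")
  .show()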
