Spark_JGroupByKey
package core.java;
import java.util.Arrays;
import java.util.List; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext; import scala.Tuple2; public class JGroupByKey {
public static void main(String[] args) {
SparkConf conf = new SparkConf();
conf.setMaster("local");
conf.setAppName("union");
System.setProperty("hadoop.home.dir", "C:/hadoop-2.8.2");
JavaSparkContext sc = new JavaSparkContext(conf);
List<Tuple2<String, Integer>> list = Arrays.asList(new Tuple2<String, Integer>("cl1", ),
new Tuple2<String, Integer>("cl2", ),new Tuple2<String, Integer>("cl3", ),
new Tuple2<String, Integer>("cl1", ),new Tuple2<String, Integer>("cl1", ),
new Tuple2<String, Integer>("cl3", ),new Tuple2<String, Integer>("cl2", ));
JavaPairRDD<String, Integer> listRDD = sc.parallelizePairs(list);
JavaPairRDD<String, Iterable<Integer>> results = listRDD.groupByKey();
System.out.println(results.collect());
sc.close();
}
} //SLF4J: Class path contains multiple SLF4J bindings.
//SLF4J: Found binding in [jar:file:/E:/bigdata/spark-1.4.0-bin-hadoop2.6/lib/spark-assembly-1.4.0-hadoop2.6.0.jar!/org/slf4j/impl/StaticLoggerBinder.class]
//SLF4J: Found binding in [jar:file:/E:/bigdata/spark-1.4.0-bin-hadoop2.6/lib/spark-examples-1.4.0-hadoop2.6.0.jar!/org/slf4j/impl/StaticLoggerBinder.class]
//SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
//SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
//Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
//17/12/27 20:23:41 INFO SparkContext: Running Spark version 1.4.0
//17/12/27 20:23:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
//17/12/27 20:23:44 INFO SecurityManager: Changing view acls to:
//17/12/27 20:23:44 INFO SecurityManager: Changing modify acls to:
//17/12/27 20:23:44 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(); users with modify permissions: Set()
//17/12/27 20:23:46 INFO Slf4jLogger: Slf4jLogger started
//17/12/27 20:23:47 INFO Remoting: Starting remoting
//17/12/27 20:23:48 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkDriver@172.18.3.6:58955]
//17/12/27 20:23:48 INFO Utils: Successfully started service 'sparkDriver' on port 58955.
//17/12/27 20:23:48 INFO SparkEnv: Registering MapOutputTracker
//17/12/27 20:23:49 INFO SparkEnv: Registering BlockManagerMaster
//17/12/27 20:23:49 INFO DiskBlockManager: Created local directory at C:\Users\\AppData\Local\Temp\spark-c1db5ccf-8e4b-4ef9-9a7f-c6ec66d46664\blockmgr-a60ebb60-8b7c-433c-b035-eded748b261b
//17/12/27 20:23:49 INFO MemoryStore: MemoryStore started with capacity 467.6 MB
//17/12/27 20:23:49 INFO HttpFileServer: HTTP File server directory is C:\Users\\AppData\Local\Temp\spark-c1db5ccf-8e4b-4ef9-9a7f-c6ec66d46664\httpd-f45db3a6-b75c-46a5-a1d1-3539b1698cd0
//17/12/27 20:23:49 INFO HttpServer: Starting HTTP Server
//17/12/27 20:23:49 INFO Utils: Successfully started service 'HTTP file server' on port 58959.
//17/12/27 20:23:49 INFO SparkEnv: Registering OutputCommitCoordinator
//17/12/27 20:23:50 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
//17/12/27 20:23:50 INFO Utils: Successfully started service 'SparkUI' on port 4041.
//17/12/27 20:23:50 INFO SparkUI: Started SparkUI at http://172.18.3.6:4041
//17/12/27 20:23:50 INFO Executor: Starting executor ID driver on host localhost
//17/12/27 20:23:51 INFO Utils: Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 58978.
//17/12/27 20:23:51 INFO NettyBlockTransferService: Server created on 58978
//17/12/27 20:23:51 INFO BlockManagerMaster: Trying to register BlockManager
//17/12/27 20:23:51 INFO BlockManagerMasterEndpoint: Registering block manager localhost:58978 with 467.6 MB RAM, BlockManagerId(driver, localhost, 58978)
//17/12/27 20:23:51 INFO BlockManagerMaster: Registered BlockManager
//17/12/27 20:23:57 INFO SparkContext: Starting job: collect at JGroupByKey.java:27
//17/12/27 20:23:57 INFO DAGScheduler: Registering RDD 0 (parallelizePairs at JGroupByKey.java:25)
//17/12/27 20:23:57 INFO DAGScheduler: Got job 0 (collect at JGroupByKey.java:27) with 1 output partitions (allowLocal=false)
//17/12/27 20:23:57 INFO DAGScheduler: Final stage: ResultStage 1(collect at JGroupByKey.java:27)
//17/12/27 20:23:57 INFO DAGScheduler: Parents of final stage: List(ShuffleMapStage 0)
//17/12/27 20:23:57 INFO DAGScheduler: Missing parents: List(ShuffleMapStage 0)
//17/12/27 20:23:57 INFO DAGScheduler: Submitting ShuffleMapStage 0 (ParallelCollectionRDD[0] at parallelizePairs at JGroupByKey.java:25), which has no missing parents
//17/12/27 20:23:58 INFO MemoryStore: ensureFreeSpace(2832) called with curMem=0, maxMem=490356080
//17/12/27 20:23:58 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 2.8 KB, free 467.6 MB)
//17/12/27 20:23:58 INFO MemoryStore: ensureFreeSpace(1553) called with curMem=2832, maxMem=490356080
//17/12/27 20:23:58 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 1553.0 B, free 467.6 MB)
//17/12/27 20:23:58 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on localhost:58978 (size: 1553.0 B, free: 467.6 MB)
//17/12/27 20:23:58 INFO SparkContext: Created broadcast 0 from broadcast at DAGScheduler.scala:874
//17/12/27 20:23:58 INFO DAGScheduler: Submitting 1 missing tasks from ShuffleMapStage 0 (ParallelCollectionRDD[0] at parallelizePairs at JGroupByKey.java:25)
//17/12/27 20:23:58 INFO TaskSchedulerImpl: Adding task set 0.0 with 1 tasks
//17/12/27 20:23:58 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, localhost, PROCESS_LOCAL, 1627 bytes)
//17/12/27 20:23:58 INFO Executor: Running task 0.0 in stage 0.0 (TID 0)
//17/12/27 20:23:58 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 879 bytes result sent to driver
//17/12/27 20:23:58 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 427 ms on localhost (1/1)
//17/12/27 20:23:58 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
//17/12/27 20:23:58 INFO DAGScheduler: ShuffleMapStage 0 (parallelizePairs at JGroupByKey.java:25) finished in 0.559 s
//17/12/27 20:23:58 INFO DAGScheduler: looking for newly runnable stages
//17/12/27 20:23:58 INFO DAGScheduler: running: Set()
//17/12/27 20:23:58 INFO DAGScheduler: waiting: Set(ResultStage 1)
//17/12/27 20:23:58 INFO DAGScheduler: failed: Set()
//17/12/27 20:23:58 INFO DAGScheduler: Missing parents for ResultStage 1: List()
//17/12/27 20:23:58 INFO DAGScheduler: Submitting ResultStage 1 (MapPartitionsRDD[2] at groupByKey at JGroupByKey.java:26), which is now runnable
//17/12/27 20:23:58 INFO MemoryStore: ensureFreeSpace(4000) called with curMem=4385, maxMem=490356080
//17/12/27 20:23:58 INFO MemoryStore: Block broadcast_1 stored as values in memory (estimated size 3.9 KB, free 467.6 MB)
//17/12/27 20:23:58 INFO MemoryStore: ensureFreeSpace(2129) called with curMem=8385, maxMem=490356080
//17/12/27 20:23:58 INFO MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 2.1 KB, free 467.6 MB)
//17/12/27 20:23:58 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on localhost:58978 (size: 2.1 KB, free: 467.6 MB)
//17/12/27 20:23:58 INFO SparkContext: Created broadcast 1 from broadcast at DAGScheduler.scala:874
//17/12/27 20:23:58 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 1 (MapPartitionsRDD[2] at groupByKey at JGroupByKey.java:26)
//17/12/27 20:23:58 INFO TaskSchedulerImpl: Adding task set 1.0 with 1 tasks
//17/12/27 20:23:58 INFO TaskSetManager: Starting task 0.0 in stage 1.0 (TID 1, localhost, PROCESS_LOCAL, 1165 bytes)
//17/12/27 20:23:58 INFO Executor: Running task 0.0 in stage 1.0 (TID 1)
//17/12/27 20:23:59 INFO ShuffleBlockFetcherIterator: Getting 1 non-empty blocks out of 1 blocks
//17/12/27 20:23:59 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 57 ms
//17/12/27 20:23:59 INFO Executor: Finished task 0.0 in stage 1.0 (TID 1). 5294 bytes result sent to driver
//17/12/27 20:23:59 INFO TaskSetManager: Finished task 0.0 in stage 1.0 (TID 1) in 231 ms on localhost (1/1)
//17/12/27 20:23:59 INFO TaskSchedulerImpl: Removed TaskSet 1.0, whose tasks have all completed, from pool
//17/12/27 20:23:59 INFO DAGScheduler: ResultStage 1 (collect at JGroupByKey.java:27) finished in 0.236 s
//17/12/27 20:23:59 INFO DAGScheduler: Job 0 finished: collect at JGroupByKey.java:27, took 1.963917 s
//[(cl3,[97, 90]), (cl1,[90, 96, 89]), (cl2,[91, 60])]
//17/12/27 20:23:59 INFO SparkUI: Stopped Spark web UI at http://172.18.3.6:4041
//17/12/27 20:23:59 INFO DAGScheduler: Stopping DAGScheduler
//17/12/27 20:23:59 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
//17/12/27 20:23:59 INFO Utils: path = C:\Users\\AppData\Local\Temp\spark-c1db5ccf-8e4b-4ef9-9a7f-c6ec66d46664\blockmgr-a60ebb60-8b7c-433c-b035-eded748b261b, already present as root for deletion.
//17/12/27 20:23:59 INFO MemoryStore: MemoryStore cleared
//17/12/27 20:23:59 INFO BlockManager: BlockManager stopped
//17/12/27 20:23:59 INFO BlockManagerMaster: BlockManagerMaster stopped
//17/12/27 20:23:59 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
//17/12/27 20:23:59 INFO SparkContext: Successfully stopped SparkContext
//17/12/27 20:23:59 INFO RemoteActorRefProvider$RemotingTerminator: Shutting down remote daemon.
//17/12/27 20:23:59 INFO RemoteActorRefProvider$RemotingTerminator: Remote daemon shut down; proceeding with flushing remote transports.
//17/12/27 20:23:59 INFO Utils: Shutdown hook called
//17/12/27 20:23:59 INFO Utils: Deleting directory C:\Users\\AppData\Local\Temp\spark-c1db5ccf-8e4b-4ef9-9a7f-c6ec66d46664
Spark_JGroupByKey的更多相关文章
随机推荐
- [about remote controller]--mstsc-teamviewer-vnc,nomachine
https://www.jianshu.com/p/c80db368ed8a https://www.nomachine.com/download Ubuntu安装VNC,VNC却无法随系统启动,遂换 ...
- TableView刷新跳动问题
https://juejin.im/post/5aca1a04f265da2391486533 解决办法: 将估算高度设置为0即可: tableView.estimatedRowHeight = 0; ...
- delphi传递变量给fastreport
delphi传递变量给fastreport 1.打开frReport报表设计.2.打开file->data dictionary加变量.这里比如加title,bm,zbr,gj,zrs3.在 ...
- STS maven build 访问 jsp页面报错
STS 版本:spring-tool-suite-3.8.1.RELEASE-e4.6-win32-x86_64 maven版本:apache-maven-3.3.9 报错信息如图(图片解决方案来源博 ...
- 20165336 2017-2018-2 《Java程序设计》第4周学习总结
20165336 2017-2018-2 《Java程序设计》第4周学习总结 教材学习内容总结 第五章 使用extends来定义一个子类. Object类是所有类的祖先类. 当子类和父类不 ...
- Es6 的类(class)
首先根据es5的类(原型对象)的基本点做参照. 序号 基本点 es5 >es6 1 实例属性(方法) √ √ 2 原型属性(方法) 或 公共属性(方法) √ √ 3 es5的私有变量 或 私有属 ...
- nginx 、springMvc(非分布式)相应的限流、消峰
互联网服务赖以生存的根本是流量, 产品和运营会经常通过各种方式来为应用倒流,比如淘宝的双十一等,如何让系统在处理高并发的同时还是保证自身系统的稳定, 通常在最短时间内提高并发的做法就是加机器, 但是如 ...
- jquery实现简单的弹出框
弹出框本身是一个div,默认是隐藏不展示的,在需要弹框的时候使其显示,并浮在当前页面之上 弹框样式: .tanchuang { width: 100%; height: 100%; display: ...
- WordPress如何屏蔽恶意关键词搜索
我们在用WordPress建站比较方便,但如果网站有一定的权重后,一些不怀好意的人就会过来制作恶意内容,比如故意搜索邪恶的关键词.垃圾评论等,那我们如何屏蔽恶意搜索关键词呢?不会很难,会写点代码的朋友 ...
- 时间序列分析工具箱——sweep
版权声明:本文为博主原创文章,未经博主同意不得转载. https://blog.csdn.net/kMD8d5R/article/details/81977856 作者:徐瑞龙.量化分析师,R语言中文 ...