不多说,直接上干货!

spark-1.6.1-bin-hadoop2.6里Basic包下的JavaPageRank.java

  1. /*
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17.  
  18. //package org.apache.spark.examples;
  19. package zhouls.bigdata.Basic;
  20.  
  21. import scala.Tuple2;//scala里的元组
  22. import com.google.common.collect.Iterables;
  23. import org.apache.spark.SparkConf;
  24. import org.apache.spark.api.java.JavaPairRDD;
  25. import org.apache.spark.api.java.JavaRDD;
  26. import org.apache.spark.api.java.JavaSparkContext;
  27. import org.apache.spark.api.java.function.Function;
  28. import org.apache.spark.api.java.function.Function2;
  29. import org.apache.spark.api.java.function.PairFlatMapFunction;
  30. import org.apache.spark.api.java.function.PairFunction;
  31. import java.util.ArrayList;
  32. import java.util.List;
  33. import java.util.Iterator;
  34. import java.util.regex.Pattern;
  35.  
  36. /**
  37. * Computes the PageRank of URLs from an input file. Input file should
  38. * be in format of:
  39. * URL neighbor URL
  40. * URL neighbor URL
  41. * URL neighbor URL
  42. * ...
  43. * where URL and their neighbors are separated by space(s).
  44. *
  45. * This is an example implementation for learning how to use Spark. For more conventional use,
  46. * please refer to org.apache.spark.graphx.lib.PageRank
  47. */
  48. public final class JavaPageRank {
  49. private static final Pattern SPACES = Pattern.compile("\\s+");
  50.  
  51. /*
  52. * 显示警告函数
  53. */
  54. static void showWarning() {
  55. String warning = "WARN: This is a naive implementation of PageRank " +
  56. "and is given as an example! \n" +
  57. "Please use the PageRank implementation found in " +
  58. "org.apache.spark.graphx.lib.PageRank for more conventional use.";
  59. System.err.println(warning);
  60. }
  61.  
  62. private static class Sum implements Function2<Double, Double, Double> {
  63. @Override
  64. public Double call(Double a, Double b) {
  65. return a + b;
  66. }
  67. }
  68.  
  69. /*
  70. * 主函数
  71. */
  72. public static void main(String[] args) throws Exception {
  73. if (args.length < 2) {
  74. System.err.println("Usage: JavaPageRank <file> <number_of_iterations>");
  75. System.exit(1);
  76. }
  77.  
  78. showWarning();
  79.  
  80. SparkConf sparkConf = new SparkConf().setAppName("JavaPageRank").setMaster("local");
  81. JavaSparkContext ctx = new JavaSparkContext(sparkConf);
  82.  
  83. // Loads in input file. It should be in format of:
  84. // URL neighbor URL
  85. // URL neighbor URL
  86. // URL neighbor URL
  87. // ...
  88. // JavaRDD<String> lines = ctx.textFile(args[0], 1);//这是官网发行包里写的
  89. JavaRDD<String> lines = ctx.textFile("data/input/mllib/pagerank_data.txt", 1);
  90.  
  91. // Loads all URLs from input file and initialize their neighbors.
  92. //根据边关系数据生成 邻接表 如:(1,(2,3,4,5)) (2,(1,5))...
  93. JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(new PairFunction<String, String, String>() {
  94. @Override
  95. public Tuple2<String, String> call(String s) {
  96. String[] parts = SPACES.split(s);
  97. return new Tuple2<String, String>(parts[0], parts[1]);
  98. }
  99. }).distinct().groupByKey().cache();
  100.  
  101. //初始化 ranks, 每一个url初始分值为1
  102. // Loads all URLs with other URL(s) link to from input file and initialize ranks of them to one.
  103. JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {
  104. @Override
  105. public Double call(Iterable<String> rs) {
  106. return 1.0;
  107. }
  108. });
  109.  
  110. /*
  111. * 迭代iters次; 每次迭代中做如下处理, links(urlKey, neighborUrls) join (urlKey, rank(分值));
  112. * 对neighborUrls以及初始 rank,每一个neighborUrl , neighborUrlKey, 初始rank/size(新的rank贡献值);
  113. * 然后再进行reduceByKey相加 并对分值 做调整 0.15 + 0.85 * _
  114. */
  115. // Calculates and updates URL ranks continuously using PageRank algorithm.
  116. for (int current = 0; current < Integer.parseInt(args[1]); current++) {
  117. // Calculates URL contributions to the rank of other URLs.
  118. JavaPairRDD<String, Double> contribs = links.join(ranks).values()
  119. .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
  120. @Override
  121. public Iterable<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
  122. int urlCount = Iterables.size(s._1);
  123. List<Tuple2<String, Double>> results = new ArrayList<Tuple2<String, Double>>();
  124. for (String n : s._1) {
  125. results.add(new Tuple2<String, Double>(n, s._2() / urlCount));
  126. }
  127. return results;
  128. }
  129. });
  130.  
  131. // Re-calculates URL ranks based on neighbor contributions.
  132. ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
  133. @Override
  134. public Double call(Double sum) {
  135. return 0.15 + sum * 0.85;
  136. }
  137. });
  138. }
  139.  
  140. //输出排名
  141. // Collects all URL ranks and dump them to console.
  142. List<Tuple2<String, Double>> output = ranks.collect();
  143. for (Tuple2<?,?> tuple : output) {
  144. System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
  145. }
  146.  
  147. ctx.stop();
  148. }
  149. }

  没结果,暂时

spark-2.2.0-bin-hadoop2.6里Basic包下的JavaPageRank.java

  1. /*
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17.  
  18. //package org.apache.spark.examples;
  19. package zhouls.bigdata.Basic;
  20.  
  21. import java.util.ArrayList;
  22. import java.util.List;
  23. import java.util.regex.Pattern;
  24. import scala.Tuple2;
  25. import com.google.common.collect.Iterables;
  26. import org.apache.spark.api.java.JavaPairRDD;
  27. import org.apache.spark.api.java.JavaRDD;
  28. import org.apache.spark.api.java.function.Function2;
  29. import org.apache.spark.sql.SparkSession;
  30.  
  31. /**
  32. * Computes the PageRank of URLs from an input file. Input file should
  33. * be in format of:
  34. * URL neighbor URL
  35. * URL neighbor URL
  36. * URL neighbor URL
  37. * ...
  38. * where URL and their neighbors are separated by space(s).
  39. *
  40. * This is an example implementation for learning how to use Spark. For more conventional use,
  41. * please refer to org.apache.spark.graphx.lib.PageRank
  42. *
  43. * Example Usage:
  44. * <pre>
  45. * bin/run-example JavaPageRank data/mllib/pagerank_data.txt 10
  46. * </pre>
  47. */
  48. public final class JavaPageRank {
  49. private static final Pattern SPACES = Pattern.compile("\\s+");
  50.  
  51. /*
  52. * 显示警告函数
  53. */
  54. static void showWarning() {
  55. String warning = "WARN: This is a naive implementation of PageRank " +
  56. "and is given as an example! \n" +
  57. "Please use the PageRank implementation found in " +
  58. "org.apache.spark.graphx.lib.PageRank for more conventional use.";
  59. System.err.println(warning);
  60. }
  61.  
  62. private static class Sum implements Function2<Double, Double, Double> {
  63. @Override
  64. public Double call(Double a, Double b) {
  65. return a + b;
  66. }
  67. }
  68.  
  69. /*
  70. * 主函数
  71. */
  72. public static void main(String[] args) throws Exception {
  73. if (args.length < 2) {
  74. System.err.println("Usage: JavaPageRank <file> <number_of_iterations>");
  75. System.exit(1);
  76. }
  77.  
  78. showWarning();
  79.  
  80. SparkSession spark = SparkSession
  81. .builder()
  82. .master("local")
  83. .appName("JavaPageRank")
  84. .getOrCreate();
  85.  
  86. // Loads in input file. It should be in format of:
  87. // URL neighbor URL
  88. // URL neighbor URL
  89. // URL neighbor URL
  90. // ...
  91. // JavaRDD<String> lines = spark.read().textFile(args[0]).javaRDD();
  92. JavaRDD<String> lines = spark.read().textFile("data/input/mllib/pagerank_data.txt").javaRDD();
  93.  
  94. // Loads all URLs from input file and initialize their neighbors.
  95. //根据边关系数据生成 邻接表 如:(1,(2,3,4,5)) (2,(1,5))...
  96. JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(s -> {
  97. String[] parts = SPACES.split(s);
  98. return new Tuple2<>(parts[0], parts[1]);
  99. }).distinct().groupByKey().cache();
  100.  
  101. // Loads all URLs with other URL(s) link to from input file and initialize ranks of them to one.
  102. //初始化 ranks, 每一个url初始分值为1
  103. JavaPairRDD<String, Double> ranks = links.mapValues(rs -> 1.0);
  104.  
  105. /*
  106. * 迭代iters次; 每次迭代中做如下处理, links(urlKey, neighborUrls) join (urlKey, rank(分值));
  107. * 对neighborUrls以及初始 rank,每一个neighborUrl , neighborUrlKey, 初始rank/size(新的rank贡献值);
  108. * 然后再进行reduceByKey相加 并对分值 做调整 0.15 + 0.85 * _
  109. */
  110. // Calculates and updates URL ranks continuously using PageRank algorithm.
  111. for (int current = 0; current < Integer.parseInt(args[1]); current++) {
  112. // Calculates URL contributions to the rank of other URLs.
  113. JavaPairRDD<String, Double> contribs = links.join(ranks).values()
  114. .flatMapToPair(s -> {
  115. int urlCount = Iterables.size(s._1());
  116. List<Tuple2<String, Double>> results = new ArrayList<>();
  117. for (String n : s._1) {
  118. results.add(new Tuple2<>(n, s._2() / urlCount));
  119. }
  120. return results.iterator();
  121. });
  122.  
  123. // Re-calculates URL ranks based on neighbor contributions.
  124. ranks = contribs.reduceByKey(new Sum()).mapValues(sum -> 0.15 + sum * 0.85);
  125. }
  126.  
  127. //输出排名
  128. // Collects all URL ranks and dump them to console.
  129. List<Tuple2<String, Double>> output = ranks.collect();
  130. for (Tuple2<?,?> tuple : output) {
  131. System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
  132. }
  133.  
  134. spark.stop();
  135. }
  136. }

  没结果,暂时

spark-2.2.0-bin-hadoop2.6和spark-1.6.1-bin-hadoop2.6发行包自带案例全面详解(java、python、r和scala)之Basic包下的JavaPageRank.java(图文详解)的更多相关文章

  1. (转)CentOS 6下配置软RAID图文详解

    CentOS 6下配置软RAID图文详解 原文:http://blog.51cto.com/hujiangtao/1929620 一.RAID 简介 RAID 是英文Redundant Array o ...

  2. 反射实现Model修改前后的内容对比 【API调用】腾讯云短信 Windows操作系统下Redis服务安装图文详解 Redis入门学习

    反射实现Model修改前后的内容对比   在开发过程中,我们会遇到这样一个问题,编辑了一个对象之后,我们想要把这个对象修改了哪些内容保存下来,以便将来查看和追责. 首先我们要创建一个User类 1 p ...

  3. java扫描某个包下的所有java类并加载

    最近在学习java的反射和注解,实际情景中需要扫描某个包下的所有java类,然后使用类加载器加载类. 基本思路,获得程序的路径扫描src下某个包内的子包和java类,实现也比较简单. 运行环境:win ...

  4. spark最新源码下载并导入到开发环境下助推高质量代码(Scala IDEA for Eclipse和IntelliJ IDEA皆适用)(以spark2.2.0源码包为例)(图文详解)

    不多说,直接上干货! 前言   其实啊,无论你是初学者还是具备了有一定spark编程经验,都需要对spark源码足够重视起来. 本人,肺腑之己见,想要成为大数据的大牛和顶尖专家,多结合源码和操练编程. ...

  5. 如何用R来处理数据表的长宽转换(图文详解)

    不多说,直接上干货! 很多地方都需用到这个知识点,比如Tableau里.   通常可以采取如python 和 r来作为数据处理的前期. Tableau学习系列之Tableau如何通过数据透视表方式读取 ...

  6. iOS使用Charles(青花瓷)抓包并篡改返回数据图文详解

    写本文的契机主要是前段时间有次用青花瓷抓包有一步忘了,在网上查了半天也没找到写的完整的教程,于是待问题解决后抽时间截了图,自己写一遍封存在博客园中以便以后随时查阅. charles又名青花瓷,在iOS ...

  7. java.util.regex包下的Pattern和Matcher详解(正则匹配)

    java正则表达式通过java.util.regex包下的Pattern类与Matcher类实现(建议在阅读本文时,打开java API文档,当介绍到哪个方法时,查看java API中的方法说明,效果 ...

  8. Java并发机制(8)--concurrent包下辅助类的使用

    Java并发编程:concurrent包下辅助类的使用 整理自:博客园-海子-http://www.cnblogs.com/dolphin0520/p/3920397.html 1.CountDown ...

  9. 执行Hive时出现org.apache.hadoop.util.RunJar.main(RunJar.java:136) Caused by: java.lang.NumberFormatException: For input string: "1s"错误的解决办法(图文详解)

    不多说,直接上干货 问题详情 [kfk@bigdata-pro01 apache-hive--bin]$ bin/hive Logging initialized -bin/conf/hive-log ...

随机推荐

  1. linux命令学习笔记(23):Linux 目录结构

    对于每一个Linux学习者来说,了解Linux文件系统的目录结构,是学好Linux的至关重要的一步,深入了解linux文件 目录结构的标准和每个目录的详细功能,对于我们用好linux系统至关重要,下 ...

  2. OpenCV——PS滤镜算法之Spherize 球面化(凸出效果)

    // define head function #ifndef PS_ALGORITHM_H_INCLUDED #define PS_ALGORITHM_H_INCLUDED #include < ...

  3. Java之动态代理简介

    图截于<大话设计模式> Proxy模式是常用的设计模式,其特征是代理类与委托类有同样的接口,代理类主要负责为委托类预处理消息.过滤消息.把消息转发给委托类,以及事后处理消息等. 用户可以更 ...

  4. javacpp-FFmpeg系列补充:FFmpeg解决avformat_find_stream_info检索时间过长问题

    javacpp-ffmpeg系列: javacpp-FFmpeg系列之1:视频拉流解码成YUVJ420P,并保存为jpg图片 javacpp-FFmpeg系列之2:通用拉流解码器,支持视频拉流解码并转 ...

  5. Redis 客户端安装与远程连接图解

    Linux环境:Centos 6.8 Redis服务端版本:3.2.6 Redis客户端下载链接:https://redisdesktop.com/download 省略Linux系统安装Redis教 ...

  6. 主备角色switch

    理论知识:Switchover 切换允许primary 和一个备库进行切换,并且这种切换没有数据丢失. 前提条件: 1) 主备库相关参数 fal_client.fal_server .standby_ ...

  7. SQL 优化总结(三) SQL子句

    SQL子句 尽可能编写优化器可以优化的语句. 1. SELECT子句 (1) 在查询Select语句中用Where字句限制返回的行数,避免表扫描,如果返回不必要的数据,浪费了服务器的I/O资源,加重了 ...

  8. 0001_第一个测试小程序Login

    # -*- coding:utf-8 -*- user = raw_input("Username:") password = raw_input("Password:& ...

  9. sass安装方法,绝对好用的方式

    系统重做了,很多东西都重装,sass也一样,结果在安装的过程中遇到了问题,这里记录下,也给同样遇到问题的朋友们一个思路.本方法是参照http://www.w3cplus.com/sassguide/i ...

  10. Spring入门第十一课

    IOC容器中Bean的生命周期 Spring IOC容器可以管理Bean的生命周期,Spring允许在Bean生命周期的特定点执行定制的任务. Spring IOC容器对Bean的生命周期进行管理的过 ...