不多说,直接上干货!

spark-1.6.1-bin-hadoop2.6里Basic包下的JavaPageRank.java

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/ //package org.apache.spark.examples;
package zhouls.bigdata.Basic; import scala.Tuple2;//scala里的元组
import com.google.common.collect.Iterables;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import java.util.ArrayList;
import java.util.List;
import java.util.Iterator;
import java.util.regex.Pattern; /**
* Computes the PageRank of URLs from an input file. Input file should
* be in format of:
* URL neighbor URL
* URL neighbor URL
* URL neighbor URL
* ...
* where URL and their neighbors are separated by space(s).
*
* This is an example implementation for learning how to use Spark. For more conventional use,
* please refer to org.apache.spark.graphx.lib.PageRank
*/
public final class JavaPageRank {
private static final Pattern SPACES = Pattern.compile("\\s+"); /*
* 显示警告函数
*/
static void showWarning() {
String warning = "WARN: This is a naive implementation of PageRank " +
"and is given as an example! \n" +
"Please use the PageRank implementation found in " +
"org.apache.spark.graphx.lib.PageRank for more conventional use.";
System.err.println(warning);
} private static class Sum implements Function2<Double, Double, Double> {
@Override
public Double call(Double a, Double b) {
return a + b;
}
} /*
* 主函数
*/
public static void main(String[] args) throws Exception {
if (args.length < ) {
System.err.println("Usage: JavaPageRank <file> <number_of_iterations>");
System.exit();
} showWarning(); SparkConf sparkConf = new SparkConf().setAppName("JavaPageRank").setMaster("local");
JavaSparkContext ctx = new JavaSparkContext(sparkConf); // Loads in input file. It should be in format of:
// URL neighbor URL
// URL neighbor URL
// URL neighbor URL
// ...
// JavaRDD<String> lines = ctx.textFile(args[0], 1);//这是官网发行包里写的
JavaRDD<String> lines = ctx.textFile("data/input/mllib/pagerank_data.txt", ); // Loads all URLs from input file and initialize their neighbors.
//根据边关系数据生成 邻接表 如:(1,(2,3,4,5)) (2,(1,5))...
JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(new PairFunction<String, String, String>() {
@Override
public Tuple2<String, String> call(String s) {
String[] parts = SPACES.split(s);
return new Tuple2<String, String>(parts[], parts[]);
}
}).distinct().groupByKey().cache(); //初始化 ranks, 每一个url初始分值为1
// Loads all URLs with other URL(s) link to from input file and initialize ranks of them to one.
JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {
@Override
public Double call(Iterable<String> rs) {
return 1.0;
}
}); /*
* 迭代iters次; 每次迭代中做如下处理, links(urlKey, neighborUrls) join (urlKey, rank(分值));
* 对neighborUrls以及初始 rank,每一个neighborUrl , neighborUrlKey, 初始rank/size(新的rank贡献值);
* 然后再进行reduceByKey相加 并对分值 做调整 0.15 + 0.85 * _
*/
// Calculates and updates URL ranks continuously using PageRank algorithm.
for (int current = ; current < Integer.parseInt(args[]); current++) {
// Calculates URL contributions to the rank of other URLs.
JavaPairRDD<String, Double> contribs = links.join(ranks).values()
.flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
@Override
public Iterable<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
int urlCount = Iterables.size(s._1);
List<Tuple2<String, Double>> results = new ArrayList<Tuple2<String, Double>>();
for (String n : s._1) {
results.add(new Tuple2<String, Double>(n, s._2() / urlCount));
}
return results;
}
}); // Re-calculates URL ranks based on neighbor contributions.
ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
@Override
public Double call(Double sum) {
return 0.15 + sum * 0.85;
}
});
} //输出排名
// Collects all URL ranks and dump them to console.
List<Tuple2<String, Double>> output = ranks.collect();
for (Tuple2<?,?> tuple : output) {
System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
} ctx.stop();
}
}

  暂时还没有运行结果。

spark-2.2.0-bin-hadoop2.6里Basic包下的JavaPageRank.java

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/ //package org.apache.spark.examples;
package zhouls.bigdata.Basic; import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import scala.Tuple2;
import com.google.common.collect.Iterables;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.sql.SparkSession; /**
* Computes the PageRank of URLs from an input file. Input file should
* be in format of:
* URL neighbor URL
* URL neighbor URL
* URL neighbor URL
* ...
* where URL and their neighbors are separated by space(s).
*
* This is an example implementation for learning how to use Spark. For more conventional use,
* please refer to org.apache.spark.graphx.lib.PageRank
*
* Example Usage:
* <pre>
* bin/run-example JavaPageRank data/mllib/pagerank_data.txt 10
* </pre>
*/
public final class JavaPageRank {
private static final Pattern SPACES = Pattern.compile("\\s+"); /*
* 显示警告函数
*/
static void showWarning() {
String warning = "WARN: This is a naive implementation of PageRank " +
"and is given as an example! \n" +
"Please use the PageRank implementation found in " +
"org.apache.spark.graphx.lib.PageRank for more conventional use.";
System.err.println(warning);
} private static class Sum implements Function2<Double, Double, Double> {
@Override
public Double call(Double a, Double b) {
return a + b;
}
} /*
* 主函数
*/
public static void main(String[] args) throws Exception {
if (args.length < ) {
System.err.println("Usage: JavaPageRank <file> <number_of_iterations>");
System.exit();
} showWarning(); SparkSession spark = SparkSession
.builder()
.master("local")
.appName("JavaPageRank")
.getOrCreate(); // Loads in input file. It should be in format of:
// URL neighbor URL
// URL neighbor URL
// URL neighbor URL
// ...
// JavaRDD<String> lines = spark.read().textFile(args[0]).javaRDD();
JavaRDD<String> lines = spark.read().textFile("data/input/mllib/pagerank_data.txt").javaRDD(); // Loads all URLs from input file and initialize their neighbors.
//根据边关系数据生成 邻接表 如:(1,(2,3,4,5)) (2,(1,5))...
JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(s -> {
String[] parts = SPACES.split(s);
return new Tuple2<>(parts[], parts[]);
}).distinct().groupByKey().cache(); // Loads all URLs with other URL(s) link to from input file and initialize ranks of them to one.
//初始化 ranks, 每一个url初始分值为1
JavaPairRDD<String, Double> ranks = links.mapValues(rs -> 1.0); /*
* 迭代iters次; 每次迭代中做如下处理, links(urlKey, neighborUrls) join (urlKey, rank(分值));
* 对neighborUrls以及初始 rank,每一个neighborUrl , neighborUrlKey, 初始rank/size(新的rank贡献值);
* 然后再进行reduceByKey相加 并对分值 做调整 0.15 + 0.85 * _
*/
// Calculates and updates URL ranks continuously using PageRank algorithm.
for (int current = ; current < Integer.parseInt(args[]); current++) {
// Calculates URL contributions to the rank of other URLs.
JavaPairRDD<String, Double> contribs = links.join(ranks).values()
.flatMapToPair(s -> {
int urlCount = Iterables.size(s._1());
List<Tuple2<String, Double>> results = new ArrayList<>();
for (String n : s._1) {
results.add(new Tuple2<>(n, s._2() / urlCount));
}
return results.iterator();
}); // Re-calculates URL ranks based on neighbor contributions.
ranks = contribs.reduceByKey(new Sum()).mapValues(sum -> 0.15 + sum * 0.85);
} //输出排名
// Collects all URL ranks and dump them to console.
List<Tuple2<String, Double>> output = ranks.collect();
for (Tuple2<?,?> tuple : output) {
System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
} spark.stop();
}
}

  暂时还没有运行结果。

spark-2.2.0-bin-hadoop2.6和spark-1.6.1-bin-hadoop2.6发行包自带案例全面详解(java、python、r和scala)之Basic包下的JavaPageRank.java(图文详解)的更多相关文章

  1. (转)CentOS 6下配置软RAID图文详解

    CentOS 6下配置软RAID图文详解 原文:http://blog.51cto.com/hujiangtao/1929620 一.RAID 简介 RAID 是英文Redundant Array o ...

  2. 反射实现Model修改前后的内容对比 【API调用】腾讯云短信 Windows操作系统下Redis服务安装图文详解 Redis入门学习

    反射实现Model修改前后的内容对比   在开发过程中,我们会遇到这样一个问题,编辑了一个对象之后,我们想要把这个对象修改了哪些内容保存下来,以便将来查看和追责. 首先我们要创建一个User类 1 p ...

  3. java扫描某个包下的所有java类并加载

    最近在学习java的反射和注解,实际情景中需要扫描某个包下的所有java类,然后使用类加载器加载类. 基本思路,获得程序的路径扫描src下某个包内的子包和java类,实现也比较简单. 运行环境:win ...

  4. spark最新源码下载并导入到开发环境下助推高质量代码(Scala IDEA for Eclipse和IntelliJ IDEA皆适用)(以spark2.2.0源码包为例)(图文详解)

    不多说,直接上干货! 前言   其实啊,无论你是初学者还是具备了有一定spark编程经验,都需要对spark源码足够重视起来. 本人,肺腑之己见,想要成为大数据的大牛和顶尖专家,多结合源码和操练编程. ...

  5. 如何用R来处理数据表的长宽转换(图文详解)

    不多说,直接上干货! 很多地方都需用到这个知识点,比如Tableau里.   通常可以采取如python 和 r来作为数据处理的前期. Tableau学习系列之Tableau如何通过数据透视表方式读取 ...

  6. iOS使用Charles(青花瓷)抓包并篡改返回数据图文详解

    写本文的契机主要是前段时间有次用青花瓷抓包有一步忘了,在网上查了半天也没找到写的完整的教程,于是待问题解决后抽时间截了图,自己写一遍封存在博客园中以便以后随时查阅. charles又名青花瓷,在iOS ...

  7. java.util.regex包下的Pattern和Matcher详解(正则匹配)

    java正则表达式通过java.util.regex包下的Pattern类与Matcher类实现(建议在阅读本文时,打开java API文档,当介绍到哪个方法时,查看java API中的方法说明,效果 ...

  8. Java并发机制(8)--concurrent包下辅助类的使用

    Java并发编程:concurrent包下辅助类的使用 整理自:博客园-海子-http://www.cnblogs.com/dolphin0520/p/3920397.html 1.CountDown ...

  9. 执行Hive时出现org.apache.hadoop.util.RunJar.main(RunJar.java:136) Caused by: java.lang.NumberFormatException: For input string: "1s"错误的解决办法(图文详解)

    不多说,直接上干货 问题详情 [kfk@bigdata-pro01 apache-hive--bin]$ bin/hive Logging initialized -bin/conf/hive-log ...

随机推荐

  1. 递归/非递归----python深度遍历二叉树(前序遍历,中序遍历,后序遍历)

    递归代码:递归实现很简单 '二叉树结点类' class TreeNode: def __init__(self, x): self.val = x self.left = None self.righ ...

  2. MMU的理解

    MMU内存管理单元相关知识点总结 1.MMU是Memory Management Unit的缩写,中文名是内存管理单元,它是中央处理器(CPU)中用来管理虚拟存储器.物理存储器的控制线路,同时也负责虚 ...

  3. java性能调优的11个建议

    1.在必要之前,先不要优化 2.使用分析器来找到真正的瓶颈 3 .为整个应用程序创建性能测试套件 4.首先解决最大的瓶颈问题 5.使用StringBuilder以编程方式连接字符串       Str ...

  4. 在python 3.6的eclipse中,导入from lxml import etree老是提示,Unresolved import:etree的错误

    支持代码运行没问题,暂时没有找到真正解决办法,只能通过一下办法暂时解决.如下图:

  5. 【248】◀▶IEW-Unit13

    Unit 13 Technology 流程图讲解 1.model1对应图片讲解 2.Model1范文分析 Model 1 The ice cream making process has five k ...

  6. JAVA之BigInteger(转)【转】【很好用啊】

    用Java来处理高精度问题,相信对很多ACMer来说都是一件很happy的事,简单易懂.用Java刷了一些题,感觉Java还不错,在处理高精度和进制转换中,调用库函数的来处理.下面是写的一些Java中 ...

  7. 【机器学习】决策树C4.5、ID3

    一.算法流程 step1:计算信息熵 step2: 划分数据集 step3: 创建决策树 step4: 利用决策树分类 二.信息熵Entropy.信息增益Gain 重点:选择一个属性进行分支.注意信息 ...

  8. 使用maven导入任意jar包

    http://mvnrepository.com/ 我这里,因为是spark1.5.2版本. 保存,maven会自动下载jar包到本地仓库.

  9. Eclipse下对maven进行配置

    前提:安装好maven插件http://www.cnblogs.com/lchzls/p/6281697.html 1.需要修改配置:首先选择Window->Preferences,弹出如下对话 ...

  10. Linux+ant+jmeter+Jenkins接口持续集成自动化框架搭建

    Linux下安装ant并配置环境变量 1.从http://ant.apache.org 上下载tar.gz版ant 2.复制到/usr下 3.tar -vxzf apache-ant-1.10.1-b ...