Hadoop 系列（三）Java API

<dependency>

    <groupId>org.apache.hadoop</groupId>

    <artifactId>hadoop-hdfs</artifactId>

    <version>2.9.2</version>

</dependency>

<dependency>

    <groupId>org.apache.hadoop</groupId>

    <artifactId>hadoop-client</artifactId>

    <version>2.9.2</version>

</dependency>

<dependency>

    <groupId>org.apache.hadoop</groupId>

    <artifactId>hadoop-common</artifactId>

    <version>2.9.2</version>

</dependency>

一、HDFS 操作

/**
 * Uploads the local file {@code c:/MPSetup.log} to
 * {@code hdfs://master:9000/upload/MPSetup4.log} by copying between raw streams.
 *
 * <p>The file system client and both streams are opened in a try-with-resources
 * statement so they are always closed, even when the copy fails. Closing the HDFS
 * output stream is what completes the write, so it must not be left to chance.
 *
 * @throws Exception if the file system cannot be reached or the copy fails
 */
@Test
public void upload() throws Exception {
    Configuration conf = new Configuration();  // (1)
    // Alternative to passing the URI below:
    //conf.set("fs.defaultFS", "hdfs://master:9000/");
    Path dst = new Path("hdfs://master:9000/upload/MPSetup4.log");

    // The local file is opened before the remote file is created, so a missing
    // local file does not leave an empty file behind on HDFS.
    try (FileSystem fs = FileSystem.get(new URI("hdfs://master:9000/"), conf, "hadoop"); // (2)
         FileInputStream is = new FileInputStream("c:/MPSetup.log");
         FSDataOutputStream os = fs.create(dst)) {
        IOUtils.copy(is, os);
    }
}

(1) Configuration 默认读取 resources 目录下的 core-site.xml、hdfs-site.xml、mapred-site.xml、yarn-site.xml 配置文件。可以将 Hadoop 安装目录下的这些配置文件直接拷贝过来，也可以直接调用 conf.set() 设置参数。
(2) FileSystem.get() 必须要以 hadoop 的身份运行，否则会出现权限不足的问题。也可以配置 -DHADOOP_USER_NAME=hadoop 参数。

下面提供一个 HdfsUtil 工具类：

/**
 * JUnit-style collection of HDFS client examples: upload, download, listing,
 * directory creation and deletion against {@code hdfs://master:9000/}.
 *
 * <p>NOTE(review): the {@code FileSystem} obtained in {@link #init()} is never
 * closed anywhere in this class.
 */
public class HdfsUtil {

    /** HDFS client handle, assigned by {@link #init()}. */
    FileSystem fs = null;

    /**
     * Builds the HDFS client used by every test method.
     *
     * @throws Exception if the client cannot be created
     */
    @Before
    public void init() throws Exception{
        System.setProperty("hadoop.home.dir", "D:/Program_Files/apache/hadoop-common-bin/");

        // 1. Loads the xxx-site.xml files found on the classpath into the configuration.
        Configuration configuration = new Configuration();

        // 2. Values set in code override whatever was read from those files.
        configuration.set("fs.defaultFS", "hdfs://master:9000/");

        // 3. Obtain a concrete file system client for that configuration,
        //    acting as the user "hadoop".
        URI nameNode = new URI("hdfs://master:9000/");
        fs = FileSystem.get(nameNode, configuration, "hadoop");
    }

    /** Uploads a local file using the ready-made helper on the client. */
    @Test
    public void upload2() throws Exception, IOException{
        Path localSource = new Path("c:/MPSetup.log");
        Path remoteTarget = new Path("hdfs://master:9000/aaa/bbb/ccc/MPSetup.log");
        fs.copyFromLocalFile(localSource, remoteTarget);
    }

    /** Downloads a file from HDFS to the local disk. */
    @Test
    public void download() throws Exception {
        Path remoteSource = new Path("hdfs://master:9000/aaa/bbb/ccc/MPSetup.log");
        Path localTarget = new Path("d:/MPSetup2.txt");
        fs.copyToLocalFile(remoteSource, localTarget);
    }

    /** Prints the names found under the root directory in two different ways. */
    @Test
    public void listFiles() throws FileNotFoundException, IllegalArgumentException, IOException {
        // listFiles reports files only and can walk the tree recursively.
        for (RemoteIterator<LocatedFileStatus> cursor = fs.listFiles(new Path("/"), true);
                cursor.hasNext(); ) {
            System.out.println(cursor.next().getPath().getName());
        }

        System.out.println("---------------------------------");

        // listStatus reports both files and directories, but has no built-in recursion.
        for (FileStatus entry : fs.listStatus(new Path("/"))) {
            String label = entry.getPath().getName();
            String suffix;
            if (entry.isDirectory()) {
                suffix = " is dir";
            } else {
                suffix = " is file";
            }
            System.out.println(label + suffix);
        }
    }

    /** Creates a directory, including its parents. */
    @Test
    public void mkdir() throws IllegalArgumentException, Exception {
        Path directory = new Path("/aaa/bbb/ccc");
        fs.mkdirs(directory);
    }

    /** Deletes a file or directory; the flag requests recursive deletion. */
    @Test
    public void rm() throws IllegalArgumentException, IOException {
        Path target = new Path("/aa");
        boolean recursive = true;
        fs.delete(target, recursive);
    }
}

二、RPC 调用

(1) LoginServiceInterface 接口

package com.github.binarylei.hadoop.rpc;

/**
 * RPC protocol contract shared by the server and the client examples.
 */
public interface LoginServiceInterface {

    /**
     * Protocol version. The client supplies the same value ({@code 1L}) when it
     * asks for a proxy of this interface.
     */
    long versionID = 1L;

    /**
     * Performs a login.
     *
     * @param username name of the user logging in
     * @param password password of that user
     * @return a message describing the outcome
     */
    String login(String username, String password);
}

/**
 * Demo implementation of {@link LoginServiceInterface}: every login succeeds and
 * the password is not inspected.
 */
public class LoginServiceImpl implements LoginServiceInterface {

    /** Text appended to the user name to form the reply. */
    private static final String SUCCESS_SUFFIX = " login in successfully!";

    /**
     * Builds the success message for the given user.
     *
     * @param username name echoed back in the reply
     * @param password ignored by this implementation
     * @return {@code username} followed by the fixed success text
     */
    @Override
    public String login(String username, String password) {
        return username + SUCCESS_SUFFIX;
    }
}

(2) RPCServer

/**
 * Starts a Hadoop RPC server that serves {@link LoginServiceImpl} through the
 * {@link LoginServiceInterface} protocol.
 *
 * <p>Author's note: so far this has only been run after uploading it to Linux.
 */
public class RPCServer {

    /** Address the server binds to. */
    private static final String HOST = "master";

    /** Port the server listens on. */
    private static final int PORT = 10001;

    /**
     * Builds the server from the constants above and starts it.
     *
     * @param args unused
     * @throws HadoopIllegalArgumentException if the builder rejects the settings
     * @throws IOException if the server cannot be created
     */
    public static void main(String[] args) throws HadoopIllegalArgumentException, IOException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://master:9000/");

        // The bind address comes from HOST rather than a repeated literal, so the
        // address is defined in exactly one place.
        Builder builder = new Builder(conf);
        builder.setBindAddress(HOST)
                .setPort(PORT)
                .setProtocol(LoginServiceInterface.class)
                .setInstance(new LoginServiceImpl());

        Server server = builder.build();
        server.start();
    }
}

将打包后的 hadoop-api-1.0.0.jar 上传到 Linux，启动 RPC 服务，执行

hadoop jar hadoop-api-1.0.0.jar com.github.binarylei.hadoop.rpc.RPCServer

2018-05-13 18:20:16,606 INFO ipc.CallQueueManager: Using callQueue: class java.util.concurrent.LinkedBlockingQueue queueCapacity: 100 scheduler: class org.apache.hadoop.ipc.DefaultRpcScheduler

2018-05-13 18:20:17,631 INFO ipc.Server: Starting Socket Reader #1 for port 10001

2018-05-13 18:20:19,613 INFO ipc.Server: IPC Server Responder: starting

2018-05-13 18:20:19,618 INFO ipc.Server: IPC Server listener on 10001: starting

(3) RPCClient

/**
 * Calls the remote {@link LoginServiceInterface#login(String, String)} served by
 * {@code RPCServer} and prints the reply.
 */
public class RPCClient {

    /** Host name of the RPC server. */
    private static final String HOST = "master";

    /** Port of the RPC server. */
    private static final int PORT = 10001;

    /**
     * Obtains a proxy, performs one login call, prints the result and releases
     * the proxy.
     *
     * @param args unused
     * @throws Exception if the proxy cannot be created or the call fails
     */
    public static void main(String[] args) throws Exception {
        System.setProperty("hadoop.home.dir", "D:/Program_Files/apache/hadoop-common-bin/");

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://master:9000/");

        // The version comes from the protocol interface instead of a magic 1L, so
        // client and protocol cannot drift apart.
        LoginServiceInterface proxy = RPC.getProxy(
                LoginServiceInterface.class,
                LoginServiceInterface.versionID,
                new InetSocketAddress(HOST, PORT),
                conf);
        try {
            String result = proxy.login("hadoop-test", "test");
            System.out.println(result);
        } finally {
            // Release the connection held by the proxy even when the call fails.
            RPC.stopProxy(proxy);
        }
    }
}

直接在 Windows 上运行，结果如下：

hadoop-test login in successfully!

三、MapReduce

下面模仿 wordcount，写一个 MapReduce

(1) WCMapper

/**
 * Word-count mapper: emits {@code (word, 1)} for every token of each input line.
 *
 * <p>Of the four type parameters, the first two describe the mapper input
 * (KEYIN, VALUEIN) and the last two its output. Map and reduce both exchange data
 * as key-value pairs. By default the framework passes the starting offset of a
 * line as the key and the content of that line as the value.
 */
public class WCMapper extends Mapper<LongWritable, Text, Text, LongWritable>{

    /** Reused output key; avoids allocating a new {@code Text} per token. */
    private final Text outputWord = new Text();

    /** Reused output value, always 1. */
    private final LongWritable one = new LongWritable(1);

    /**
     * Called by the framework once for every line it reads.
     *
     * @param key starting offset of the line
     * @param value text content of the line
     * @param context sink for the emitted key-value pairs
     * @throws IOException if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value,Context context)
            throws IOException, InterruptedException {
        // Convert the line to a String and cut it on the separator.
        String line = value.toString();
        String[] words = StringUtils.split(line, " ");

        // Emit k: word, v: 1 for every token, refilling the shared key object.
        for (String word : words) {
            outputWord.set(word);
            context.write(outputWord, one);
        }
    }
}

(2) WCReducer

/**
 * Word-count reducer: adds up the counts collected for one word.
 *
 * <p>After the map phase the framework groups the key-value pairs and calls
 * {@code reduce} once per group, e.g. {@code <hello, {1,1,1,1,1,1.....}>}.
 */
public class WCReducer extends Reducer<Text, LongWritable, Text, LongWritable>{

    /**
     * Writes the total for a single word.
     *
     * @param key the word
     * @param values the counts emitted for that word
     * @param context sink for the result
     * @throws IOException if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values,Context context)
            throws IOException, InterruptedException {
        LongWritable total = new LongWritable(sum(values));
        context.write(key, total);
    }

    /** Returns the sum of all values in {@code counts}. */
    private static long sum(Iterable<LongWritable> counts) {
        long running = 0;
        for (LongWritable each : counts) {
            running = running + each.get();
        }
        return running;
    }
}

(3) WCRunner

/**
 * Describes and submits one specific job: which classes act as mapper and
 * reducer, where the input data lives and where the result is written.
 *
 * <p>The process exit status reflects the job outcome: 0 when the job
 * succeeded, 1 when it failed.
 *
 * @author duanhaitao@itcast.cn
 */
public class WCRunner {

    /**
     * Configures the word-count job, runs it and exits with its status.
     *
     * @param args unused
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        // When launching from Windows, hadoop.home.dir can be set first:
        //System.setProperty("hadoop.home.dir", "D:/Program_Files/apache/hadoop-common-bin/");

        Configuration conf = new Configuration();
        Job wcjob = Job.getInstance(conf);

        // Tells the job which jar contains its classes.
        wcjob.setJarByClass(WCRunner.class);

        // Mapper and reducer used by this job.
        wcjob.setMapperClass(WCMapper.class);
        wcjob.setReducerClass(WCReducer.class);

        // Key/value types of the reducer output.
        wcjob.setOutputKeyClass(Text.class);
        wcjob.setOutputValueClass(LongWritable.class);

        // Key/value types of the mapper output.
        wcjob.setMapOutputKeyClass(Text.class);
        wcjob.setMapOutputValueClass(LongWritable.class);

        // Location of the input data.
        FileInputFormat.setInputPaths(wcjob, new Path("hdfs://master:9000/wc/input/"));

        // Location the result is written to.
        FileOutputFormat.setOutputPath(wcjob, new Path("hdfs://master:9000/wc/output5/"));

        // Submit the job and wait; the result is no longer discarded, so a failed
        // job produces a non-zero exit status.
        boolean succeeded = wcjob.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}

四、Hadoop 运行(Windows)

问题 1：缺少 winutils.exe 和 hadoop.dll

# 缺少 winutils.exe

Could not locate executable null\bin\winutils.exe in the Hadoop binaries.

# 缺少 hadoop.dll

Unable to load native-hadoop library for your platform... using builtin-java classes where applicable

解决办法：

下载地址：https://github.com/srccodes/hadoop-common-2.2.0-bin
解压后将 hadoop-common-2.2.0-bin/bin 目录下的文件全部拷贝到 HADOOP_HOME/bin 目录下，并配置 HADOOP_HOME 环境变量。
将 hadoop-common-2.2.0-bin/bin/hadoop.dll 拷贝到 C:\Windows\System32 目录下。

问题 2：Exception in thread "main" java.lang.UnsatisfiedLinkError: org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Ljava/lang/String;I)Z

解决办法：

首先确保 C:\Windows\System32 目录下已经有 hadoop.dll 文件

在自己的工程中拷贝一份 org.apache.hadoop.io.nativeio.NativeIO 类，修改如下：

// Patched copy of the access check: it always reports the path as accessible
// and no longer delegates to the native access0() call, which is the method
// named in the UnsatisfiedLinkError this workaround targets.
// NOTE(review): this disables the check entirely; presumably meant for a local
// development setup only. Confirm before using it anywhere else.
public static boolean access(String path, AccessRight desiredAccess)

                throws IOException {

    return true;

    // Original implementation, disabled by this workaround:
    //return access0(path, desiredAccess.accessRight());

}

参考：

《Hadoop 运行问题》：https://blog.csdn.net/congcong68/article/details/42043093
《winutils.exe 下载地址》：https://github.com/srccodes/hadoop-common-2.2.0-bin

每天用心记录一点点。内容也许不重要，但习惯很重要！

Hadoop 系列（三）Java API的更多相关文章

Kafka系列三 java API操作
使用java API操作kafka 1.pom.xml <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xs ...
RabbitMQ系列(三)--Java API
基于java使用RabbitMQ 框架:SpringBoot1.5.14.RELEASE maven依赖: <dependency> <groupId>com.rabbitmq ...
Apache Kafka系列(三) Java API使用
Apache Kafka系列(一) 起步 Apache Kafka系列(二) 命令行工具(CLI) Apache Kafka系列(三) Java API使用摘要: Apache Kafka Java ...
hadoop系列三:mapreduce的使用(一)
转载请在页首明显处注明作者与出处 http://www.cnblogs.com/zhuxiaojie/p/7224772.html 一:说明此为大数据系列的一些博文,有空的话会陆续更新,包含大数据的 ...
Hadoop HDFS Basic JAVA API
org.apache.hadoop.fs.FileSystem 是HDFS的文件系统抽象,在分布式系统中管理HDFS文件和目录.文件内容存储在由多个相同大小的块(如64M)构成的datanode节 ...
jvm系列(三):java GC算法垃圾收集器
GC算法垃圾收集器概述垃圾收集 Garbage Collection 通常被称为“GC”,它诞生于1960年 MIT 的 Lisp 语言,经过半个多世纪,目前已经十分成熟了. jvm 中,程序计 ...
Hadoop HDFS 用java API 进行读写
public class HdfsApp { public static FileSystem getFileSystem() throws Exception { Configuration con ...
消息中间件系列之Java API操作ActiveMQ
一.依赖 <dependency> <groupId>org.apache.activemq</groupId> <artifactId>activem ...
hadoop系列二：HDFS文件系统的命令及JAVA客户端API
转载请在页首明显处注明作者与出处一:说明此为大数据系列的一些博文,有空的话会陆续更新,包含大数据的一些内容,如hadoop,spark,storm,机器学习等. 当前使用的hadoop版本为2.6 ...
hadoop系列四:mapreduce的使用(二)
转载请在页首明显处注明作者与出处一:说明此为大数据系列的一些博文,有空的话会陆续更新,包含大数据的一些内容,如hadoop,spark,storm,机器学习等. 当前使用的hadoop版本为2.6 ...

随机推荐

vagrant up报错 Warning: Authentication failure. Retrying...解决方案
参照链接 https://www.cnblogs.com/zqifa/p/vagrant-1.html 可以解决问题.
【算法和数据结构】_15_小算法_打印EOF的值
/* 本程序打印EOF的值 */ #include <stdio.h> int main(int argc,char* argv[],char* env) { printf("E ...
换上 SansForgetica-Regular 字体,增加记忆能力
最近澳大利亚的RMIT(皇家墨尔本理工大学) 搞出来这么个字体,号称能增强记忆,原理是通过难以识别的字体,让人提起精神去识别,从而记忆更深刻. 果断弄了个试试. 安装过程: 下载字体文件点这里去下载 ...
Using a ScrollView - RN4
使用滚动条. 1. import import {ScrollView} from "react-native"; 2. Using <ScrollView> ... ...
在Tomcat7.0中设置默认服务器和不加端口名访问
前言昨天买了域名,服务器,然后搭建了环境,然后想他通过默认的端口,不用端口就访问. 设置WEB项目的欢迎页在WEB-INF文件夹下有个web.xml文件(最近新建的项目不包含此文件,可以手动新建) ...
java编程思想（1）--对象导论
对象导论: 1.1 抽象过程所有的语言都有抽象机制,抽象是解决复杂问题的根本方法.例如:汇编语言是对底层机器的轻微抽象.命令式语言(如:FORTRAN.BASIC.C)又是对汇编语言的抽象. jav ...
python3中一句话定义函数
import math as marea=lambda r:r**2*m.pi #定义一个计算圆的面积的函数area(8) 显示结果 201.06192982974676
leetcode98
class Solution { public: vector<int> V; void postTree(TreeNode* node) { if (node != NULL) { if ...
echart 单选legend 并排序
java代码 List<Map<String, Object>> AllList = null; JSONArray jsonArray = JSONArray.fromObj ...
Oracle 事务和异常处理
Oracle 的异常和回滚 DECLARE dept_no ) :; BEGIN --开始事务 INSERT INTO dept VALUES (dept_no, '市场部', '北京'); --插入 ...

Hadoop 系列（三）Java API

Hadoop 系列（三）Java API

一、HDFS 操作

二、RPC 调用

三、MapReduce

四、Hadoop 运行(Windows)

问题 1：缺少 winutils.exe 和 hadoop.dll

问题 2：Exception in thread "main" java.lang.UnsatisfiedLinkError: org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Ljava/lang/String;I)Z

Hadoop 系列（三）Java API的更多相关文章

随机推荐

热门专题