Two classes: one for HDFS file operations and one for word counting (wordcount), both adapted from examples found online. The code:

package mapreduce;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.io.IOUtils;
/**
* file operation on HDFS
* @author liuxingjiaofu
*
*/
public class HDFS_File {
    //read the file from HDFS
    public void ReadFile(Configuration conf, String FileName){
        try {
            FileSystem hdfs = FileSystem.get(conf);
            FSDataInputStream dis = hdfs.open(new Path(FileName));
            IOUtils.copyBytes(dis, System.out, 4096, false);
            dis.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    //copy the file from HDFS to local
    public void GetFile(Configuration conf, String srcFile, String dstFile){
        try {
            FileSystem hdfs = FileSystem.get(conf);
            Path srcPath = new Path(srcFile);
            Path dstPath = new Path(dstFile);
            // the first argument (true) deletes the source on HDFS after the copy
            hdfs.copyToLocalFile(true, srcPath, dstPath);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    //copy the local file to HDFS
    public void PutFile(Configuration conf, String srcFile, String dstFile){
        try {
            FileSystem hdfs = FileSystem.get(conf);
            Path srcPath = new Path(srcFile);
            Path dstPath = new Path(dstFile);
            hdfs.copyFromLocalFile(srcPath, dstPath);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    //create the new file
    public FSDataOutputStream CreateFile(Configuration conf, String FileName){
        try {
            FileSystem hdfs = FileSystem.get(conf);
            Path path = new Path(FileName);
            FSDataOutputStream outputStream = hdfs.create(path);
            return outputStream;
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }
    //rename the file
    public boolean ReNameFile(Configuration conf, String srcName, String dstName){
        try {
            FileSystem hdfs = FileSystem.get(conf);
            Path fromPath = new Path(srcName);
            Path toPath = new Path(dstName);
            boolean isRenamed = hdfs.rename(fromPath, toPath);
            return isRenamed;
        } catch (IOException e) {
            e.printStackTrace();
        }
        return false;
    }
    //delete the file
    // type = true : delete recursively (needed for directories)
    // type = false: delete a single file
    public boolean DelFile(Configuration conf, String FileName, boolean type){
        try {
            FileSystem hdfs = FileSystem.get(conf);
            Path path = new Path(FileName);
            boolean isDeleted = hdfs.delete(path, type);
            return isDeleted;
        } catch (IOException e) {
            e.printStackTrace();
        }
        return false;
    }
    //Get HDFS file last modification time
    public long GetFileModTime(Configuration conf, String FileName){
        try {
            FileSystem hdfs = FileSystem.get(conf);
            Path path = new Path(FileName);
            FileStatus fileStatus = hdfs.getFileStatus(path);
            long modificationTime = fileStatus.getModificationTime();
            return modificationTime;
        } catch (IOException e) {
            e.printStackTrace();
        }
        return 0;
    }
    //check if a file exists in HDFS
    public boolean CheckFileExist(Configuration conf, String FileName){
        try {
            FileSystem hdfs = FileSystem.get(conf);
            Path path = new Path(FileName);
            boolean isExists = hdfs.exists(path);
            return isExists;
        } catch (IOException e) {
            e.printStackTrace();
        }
        return false;
    }
    //Get the block locations of a file in the HDFS cluster
    public List<String[]> GetFileBolckHost(Configuration conf, String FileName){
        try {
            List<String[]> list = new ArrayList<String[]>();
            FileSystem hdfs = FileSystem.get(conf);
            Path path = new Path(FileName);
            FileStatus fileStatus = hdfs.getFileStatus(path);
            BlockLocation[] blkLocations = hdfs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
            int blkCount = blkLocations.length;
            for (int i = 0; i < blkCount; i++) {
                String[] hosts = blkLocations[i].getHosts();
                list.add(hosts);
            }
            return list;
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }
    //Get the host names of all the datanodes in the HDFS cluster
    // (note: I had no authorization to perform this operation on the cluster)
    public String[] GetAllNodeName(Configuration conf){
        try {
            FileSystem fs = FileSystem.get(conf);
            DistributedFileSystem hdfs = (DistributedFileSystem) fs;
            DatanodeInfo[] dataNodeStats = hdfs.getDataNodeStats();
            String[] names = new String[dataNodeStats.length];
            for (int i = 0; i < dataNodeStats.length; i++) {
                names[i] = dataNodeStats[i].getHostName();
            }
            return names;
        } catch (IOException e) {
            System.out.println("error!!!!");
            e.printStackTrace();
        }
        return null;
    }
}
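For reference, here is a minimal sketch of how this class can be driven from a small main. The File_Operation class that appears in the stack traces below presumably looked roughly like this; the HDFS path used here is just a placeholder:

package mapreduce;

import org.apache.hadoop.conf.Configuration;

public class File_Operation {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // must match fs.default.name in the cluster's conf/core-site.xml
        conf.set("fs.default.name", "hdfs://node1");

        HDFS_File file = new HDFS_File();
        // upload a local file, check that it arrived, print it, then delete it
        file.PutFile(conf, "/home/hadooper/testmp/testtext", "/user/hadooper/testtext");
        System.out.println("exists: " + file.CheckFileExist(conf, "/user/hadooper/testtext"));
        file.ReadFile(conf, "/user/hadooper/testtext");
        file.DelFile(conf, "/user/hadooper/testtext", false);
    }
}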

mywordcount.java:

package mapreduce;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class mywordcount {
    public static class wordcountMapper extends
            Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            StringTokenizer itr = new StringTokenizer(line);
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }
    public static class wordcountReducer extends
            Reducer<Text, IntWritable, Text, IntWritable> {
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }
    /**
     * Takes 2 args: the local file to count words in, and the local directory where the result should be saved.
     * @param args e.g. /home/hadooper/testmp/testtext /home/hadooper/testmp/testresult
     * @throws Exception
     */
    public static void main(String args[]) throws Exception {
        // First define two temporary HDFS names. Appending a random suffix to the names would make
        // collisions very unlikely (see the sketch after this class).
        String dstFile = "temp_src";
        String srcFile = "temp_dst";
        // Create the file-operation object.
        HDFS_File file = new HDFS_File();
        Configuration conf = new Configuration();
        // must!!! fs.default.name has to match the value in the cluster's core-site.xml
        conf.set("fs.default.name", "hdfs://node1");
        conf.set("mapred.job.tracker", "node1:54311");
        // upload the local file (or directory) to HDFS
        file.PutFile(conf, args[0], dstFile);
        System.out.println("up ok");

        Job job = new Job(conf, "mywordcount");
        job.setJarByClass(mywordcount.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setMapperClass(wordcountMapper.class);
        job.setReducerClass(wordcountReducer.class);
        job.setCombinerClass(wordcountReducer.class);
        // note: both the input and the output must be files or directories on HDFS
        FileInputFormat.setInputPaths(job, new Path(dstFile));
        FileOutputFormat.setOutputPath(job, new Path(srcFile));
        // run the job
        job.waitForCompletion(true);
        // copy the result from HDFS back to the local file system
        file.GetFile(conf, srcFile, args[1]);
        System.out.println("down the result ok!");
        // delete the temporary files/directories on HDFS
        file.DelFile(conf, dstFile, true);
        file.DelFile(conf, srcFile, true);
        System.out.println("delete file on hdfs ok!");
    }
}
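As mentioned in the comment above, the temporary names could carry a random suffix so that collisions become unlikely. A minimal sketch (the tempName helper is made up for illustration):

import java.util.UUID;

public class TempNames {
    // hypothetical helper: build a collision-resistant temporary path name
    static String tempName(String prefix) {
        return prefix + "_" + UUID.randomUUID().toString();
    }

    public static void main(String[] args) {
        String dstFile = tempName("temp_src"); // e.g. temp_src_9b0f3c1e-...
        String srcFile = tempName("temp_dst");
        System.out.println(dstFile + "  " + srcFile);
    }
}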

Along the way I ran into a few errors:

1. HDFS version problem -- Call to node1/172.*.*.*:8020 failed on local exception: java.io.EOFException

main() { ……
  Configuration conf = new Configuration();
  conf.set("fs.default.name","hdfs://node1"); // must match the value in conf/core-site.xml
  HDFS_File file = new HDFS_File();
  //print all the node names
  String[] host_name = file.GetAllNodeName(conf);
…… }
public String[] GetAllNodeName(Configuration conf){
  try{
    // Configuration config = new Configuration();
     FileSystem fs = FileSystem.get(conf);
     DistributedFileSystem hdfs = (DistributedFileSystem) fs;
     DatanodeInfo[] dataNodeStats = hdfs.getDataNodeStats();
     String[] names = new String[dataNodeStats.length];
     for (int i = 0; i < dataNodeStats.length; i++) {
         names[i] = dataNodeStats[i].getHostName();
     }
     return names;
  }catch(IOException e){
   System.out.println("eeeeeeeeeeeeeeeeeeeerror!!!!");
   e.printStackTrace();
  }
  return null;
 }
The exception:
eeeeeeeeeeeeeeeeeeeerror!!!!
java.io.IOException: Call to node1/172.10.39.250:8020 failed on local exception: java.io.EOFException
 at org.apache.hadoop.ipc.Client.wrapException(Client.java:775)
 at org.apache.hadoop.ipc.Client.call(Client.java:743)
 at org.apache.hadoop.ipc.RPC$Invoker.invoke(RPC.java:220)
 at $Proxy0.getProtocolVersion(Unknown Source)
 at org.apache.hadoop.ipc.RPC.getProxy(RPC.java:359)
 at org.apache.hadoop.hdfs.DFSClient.createRPCNamenode(DFSClient.java:112)
 at org.apache.hadoop.hdfs.DFSClient.<init>(DFSClient.java:213)
 at org.apache.hadoop.hdfs.DFSClient.<init>(DFSClient.java:176)
 at org.apache.hadoop.hdfs.DistributedFileSystem.initialize(DistributedFileSystem.java:82)
 at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:1378)
 at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:66)
 at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:1390)
 at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:196)
 at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:95)
 at mapreduce.HDFS_File.GetAllNodeName(HDFS_File.java:151)
 at mapreduce.File_Operation.main(File_Operation.java:15)
Caused by: java.io.EOFException
 at java.io.DataInputStream.readInt(DataInputStream.java:392)
 at org.apache.hadoop.ipc.Client$Connection.receiveResponse(Client.java:501)
 at org.apache.hadoop.ipc.Client$Connection.run(Client.java:446)
Exception in thread "main" java.lang.NullPointerException
 at mapreduce.File_Operation.main(File_Operation.java:16)
Cause: a version mismatch. Make sure the Hadoop jars used by the Java client are the same version as the jars on the Hadoop cluster.
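A quick way to see which version the client-side jars are is a small sketch like the one below; org.apache.hadoop.util.VersionInfo reports the version compiled into the jars on the classpath, which can then be compared with the output of hadoop version on the cluster:

import org.apache.hadoop.util.VersionInfo;

public class PrintHadoopVersion {
    public static void main(String[] args) {
        // version of the hadoop-core jar on the client classpath
        System.out.println("client Hadoop version: " + VersionInfo.getVersion());
    }
}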
2. HDFS permissions problem

org.apache.hadoop.security.AccessControlException: org.apache.hadoop.security.AccessControlException: Permission denied: user=hadooper, access=WRITE, inode="/user":root:supergroup:drwxr-xr-x

Solutions:
(1) add this entry to conf/hdfs-site.xml:
<property>
<name>dfs.permissions</name>
<value>false</value>
</property> 
(2) open up the permissions on the HDFS directory you need to write to, with a command like: $ hadoop fs -chmod 777 /user/
I went with the second approach.
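The second approach can also be done programmatically through the FileSystem API. A sketch, assuming the connecting user is allowed to change the permission (e.g. the HDFS superuser); otherwise the call fails with the same AccessControlException:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;

public class OpenPermissions {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://node1");
        FileSystem hdfs = FileSystem.get(conf);
        // equivalent to: hadoop fs -chmod 777 /user/
        hdfs.setPermission(new Path("/user/"), new FsPermission((short) 0777));
        hdfs.close();
    }
}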

3. HDFS native library warning:

2011-12-20 17:00:32 org.apache.hadoop.util.NativeCodeLoader <clinit>
WARNING: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable

Whether the native library is used can be configured in Hadoop's core-site.xml:
<property>
  <name>hadoop.native.lib</name>
  <value>true</value>
  <description>Should native hadoop libraries, if present, be used.</description>
</property>

By default Hadoop is configured to use the native library.
In addition, the location of the native library can be set through an environment variable:
export JAVA_LIBRARY_PATH=/path/to/hadoop-native-libs
Sometimes the native library that ships with Hadoop cannot be used on your platform at all; in that case you have to build it yourself. In the $HADOOP_HOME directory, run:
ant compile-native
When the build finishes, the resulting files are under $HADOOP_HOME/build/native; either point the library path at them or move them to the default location.
I tried this, but the bundled library is 64-bit while my machine is 32-bit, and without the source code I could not rebuild it, so all I could do was test the program piece by piece to find which code triggers the warning. In my case it was:
  try {
     FileSystem hdfs = FileSystem.get(conf);
     Path srcPath = new Path(srcFile);
     Path dstPath = new Path(dstFile);
     hdfs.copyToLocalFile(true, srcPath, dstPath); // the warning is triggered by this call
  } catch (IOException e) {
At that point there was not much more to do. Why does this happen at all -- isn't Java cross-platform? The native-hadoop library is a JNI (C) library used for things such as compression codecs, so it is inherently platform-specific; when it cannot be loaded, Hadoop falls back to the pure-Java implementations, which is why this is only a warning.

4. MR -- missing jar packages

ClassNotFoundException: org.codehaus.jackson.map.JsonMappingException
NoClassDefFoundError: org/apache/commons/httpclient/HttpMethod

Add these jars to the Java project:

jackson-core-asl-1.5.2.jar
jackson-mapper-asl-1.5.2.jar

commons-httpclient-3.0.1.jar

I am not in the habit of adding every jar to a project; that way it is too easy to add more than you need and waste space and time.
First MapReduce job completed -- not bad!

5. The remote JobTracker was down and the job still "ran successfully". It turned out the mapred.job.tracker property was not set, so the job ran in local mode by default; its value can be found in mapred-site.xml on the namenode.

conf.set("mapred.job.tracker","node1:54311");

With that configured, the job initializes, but then the mapper class cannot be found:

INFO: Task Id : attempt_201112221123_0010_m_000000_0, Status : FAILED
java.lang.RuntimeException: java.lang.ClassNotFoundException: mapreduce.mywordcount$wordcountMapper
 at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:996)
 at org.apache.hadoop.mapreduce.JobContext.getMapperClass(JobContext.java:212)
 at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:611)
 at org.apache.hadoop.mapred.MapTask.run(MapTask.java:325)
 at org.apache.hadoop.mapred.Child$4.run(Child.java:270)
 at java.security.AccessController.doPrivileged(Native Method)
 at javax.security.auth.Subject.doAs(Subject.java:396)
 at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1127)
 at org.apache.hadoop.mapred.Child.main(Child.java:264)

Packaging the program into a jar and running it on the Hadoop cluster's jobtracker node works fine and the result is correct, but running it from the remote client raises the error above; I have not solved this yet.
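A workaround that is often suggested for this kind of ClassNotFoundException (not verified in this setup): build the client-side classes into a jar and point the job submission at that jar explicitly, so the TaskTrackers can ship and load the mapper/reducer classes; setJarByClass only helps when the class is already being loaded from a jar. A sketch, with a placeholder jar path:

// build the jar from the compiled classes first, e.g.
//   jar cf mywordcount.jar -C bin/ mapreduce
// then, before creating the Job, tell the submission which jar contains the job classes:
conf.set("mapred.jar", "/home/hadooper/mywordcount.jar");
Job job = new Job(conf, "mywordcount");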

Summary

1. For operating on HDFS files remotely and submitting MR jobs remotely, two settings are mandatory (I have not found any others so far):

conf.set("fs.default.name","hdfs://node1"); // must match the value in conf/core-site.xml
conf.set("mapred.job.tracker","node1:54311"); // from mapred-site.xml

2. Analyze problems patiently and work through them.
