本文转载至:

  http://www.aboutyun.com/thread-7358-1-1.html

Hadoop输出文本时,默认统一采用不带BOM的UTF-8编码,而Windows系统对中文默认使用的是GBK。因此,有些格式的文件(例如CSV)以不带BOM的UTF-8编码输出后,用Excel打开时显示为乱码,只有用UE或者记事本打开才能正常显示。于是,将Hadoop默认输出编码更改为GBK成为非常常见的需求。 
      默认情况下,MR主程序中设定输出格式的语句为:

  1. job.setOutputFormatClass(TextOutputFormat.class);

复制代码

  1. TextOutputFormat.class

复制代码

的代码如下:

  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one
  3. * or more contributor license agreements.  See the NOTICE file
  4. * distributed with this work for additional information
  5. * regarding copyright ownership.  The ASF licenses this file
  6. * to you under the Apache License, Version 2.0 (the
  7. * "License"); you may not use this file except in compliance
  8. * with the License.  You may obtain a copy of the License at
  9. *
  10. *     http://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. package org.apache.hadoop.mapreduce.lib.output;
  19. import java.io.DataOutputStream;
  20. import java.io.IOException;
  21. import java.io.UnsupportedEncodingException;
  22. import org.apache.hadoop.classification.InterfaceAudience;
  23. import org.apache.hadoop.classification.InterfaceStability;
  24. import org.apache.hadoop.conf.Configuration;
  25. import org.apache.hadoop.fs.FileSystem;
  26. import org.apache.hadoop.fs.Path;
  27. import org.apache.hadoop.fs.FSDataOutputStream;
  28. import org.apache.hadoop.io.NullWritable;
  29. import org.apache.hadoop.io.Text;
  30. import org.apache.hadoop.io.compress.CompressionCodec;
  31. import org.apache.hadoop.io.compress.GzipCodec;
  32. import org.apache.hadoop.mapreduce.OutputFormat;
  33. import org.apache.hadoop.mapreduce.RecordWriter;
  34. import org.apache.hadoop.mapreduce.TaskAttemptContext;
  35. import org.apache.hadoop.util.*;
  36. /** An {@link OutputFormat} that writes plain text files. */
  37. @InterfaceAudience.Public
  38. @InterfaceStability.Stable
  39. public class TextOutputFormat<K, V> extends FileOutputFormat<K, V> {
  40. public static String SEPERATOR = "mapreduce.output.textoutputformat.separator";
  41. protected static class LineRecordWriter<K, V>
  42. extends RecordWriter<K, V> {
  43. private static final String utf8 = "UTF-8";  // to emit GBK, change "UTF-8" to "GBK" here
  44. private static final byte[] newline;
  45. static {
  46. try {
  47. newline = "\n".getBytes(utf8);
  48. } catch (UnsupportedEncodingException uee) {
  49. throw new IllegalArgumentException("can't find " + utf8 + " encoding");
  50. }
  51. }
  52. protected DataOutputStream out;
  53. private final byte[] keyValueSeparator;
  54. public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {
  55. this.out = out;
  56. try {
  57. this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
  58. } catch (UnsupportedEncodingException uee) {
  59. throw new IllegalArgumentException("can't find " + utf8 + " encoding");
  60. }
  61. }
  62. public LineRecordWriter(DataOutputStream out) {
  63. this(out, "\t");
  64. }
  65. /**
  66. * Write the object to the byte stream, handling Text as a special
  67. * case.
  68. * @param o the object to print
  69. * @throws IOException if the write throws, we pass it on
  70. */
  71. private void writeObject(Object o) throws IOException {
  72. if (o instanceof Text) { // comment out this line as well, or non-Text objects are never written
  73. Text to = (Text) o;   // comment out this line
  74. out.write(to.getBytes(), 0, to.getLength());  // comment out this line
  75. } else { // comment out this line
  76. out.write(o.toString().getBytes(utf8));
  77. } // comment out this brace together with the "if" above
  78. }
  79. public synchronized void write(K key, V value)
  80. throws IOException {
  81. boolean nullKey = key == null || key instanceof NullWritable;
  82. boolean nullValue = value == null || value instanceof NullWritable;
  83. if (nullKey && nullValue) {
  84. return;
  85. }
  86. if (!nullKey) {
  87. writeObject(key);
  88. }
  89. if (!(nullKey || nullValue)) {
  90. out.write(keyValueSeparator);
  91. }
  92. if (!nullValue) {
  93. writeObject(value);
  94. }
  95. out.write(newline);
  96. }
  97. public synchronized
  98. void close(TaskAttemptContext context) throws IOException {
  99. out.close();
  100. }
  101. }
  102. public RecordWriter<K, V>
  103. getRecordWriter(TaskAttemptContext job
  104. ) throws IOException, InterruptedException {
  105. Configuration conf = job.getConfiguration();
  106. boolean isCompressed = getCompressOutput(job);
  107. String keyValueSeparator= conf.get(SEPERATOR, "\t");
  108. CompressionCodec codec = null;
  109. String extension = "";
  110. if (isCompressed) {
  111. Class<? extends CompressionCodec> codecClass =
  112. getOutputCompressorClass(job, GzipCodec.class);
  113. codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
  114. extension = codec.getDefaultExtension();
  115. }
  116. Path file = getDefaultWorkFile(job, extension);
  117. FileSystem fs = file.getFileSystem(conf);
  118. if (!isCompressed) {
  119. FSDataOutputStream fileOut = fs.create(file, false);
  120. return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
  121. } else {
  122. FSDataOutputStream fileOut = fs.create(file, false);
  123. return new LineRecordWriter<K, V>(new DataOutputStream
  124. (codec.createOutputStream(fileOut)),
  125. keyValueSeparator);
  126. }
  127. }
  128. }

复制代码

从上述代码的第43行可以看出,hadoop已经将此输出格式的编码限定为UTF-8。因此,要改变hadoop输出文本的编码,只需定义一个与TextOutputFormat内容相同的类GbkOutputFormat,同样继承FileOutputFormat(注意是org.apache.hadoop.mapreduce.lib.output.FileOutputFormat),并将其中的编码改为GBK即可,如下代码:

  1. import java.io.DataOutputStream;
  2. import java.io.IOException;
  3. import java.io.UnsupportedEncodingException;
  4. import org.apache.hadoop.classification.InterfaceAudience;
  5. import org.apache.hadoop.classification.InterfaceStability;
  6. import org.apache.hadoop.conf.Configuration;
  7. import org.apache.hadoop.fs.FileSystem;
  8. import org.apache.hadoop.fs.Path;
  9. import org.apache.hadoop.fs.FSDataOutputStream;
  10. import org.apache.hadoop.io.NullWritable;
  11. import org.apache.hadoop.io.Text;
  12. import org.apache.hadoop.io.compress.CompressionCodec;
  13. import org.apache.hadoop.io.compress.GzipCodec;
  14. import org.apache.hadoop.mapreduce.OutputFormat;
  15. import org.apache.hadoop.mapreduce.RecordWriter;
  16. import org.apache.hadoop.mapreduce.TaskAttemptContext;
  17. import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
  18. import org.apache.hadoop.util.*;
  19. /**
  20.  * An {@link OutputFormat} that writes plain text files encoded as GBK rather than UTF-8.
  21.  * Each record is laid out as key, separator, value, newline; the separator is read from
  22.  * the {@link #SEPERATOR} configuration property and defaults to a tab.
  23.  */
  24. @InterfaceAudience.Public
  25. @InterfaceStability.Stable
  26. public class GbkOutputFormat<K, V> extends FileOutputFormat<K, V> {
  27.   /** Configuration key that holds the key/value separator. */
  28.   public static String SEPERATOR = "mapreduce.output.textoutputformat.separator";
  29.   /** Record writer that emits one GBK-encoded line per record. */
  30.   protected static class LineRecordWriter<K, V> extends RecordWriter<K, V> {
  31.     /** Name of the charset applied to every piece of text this writer emits. */
  32.     private static final String ENCODING = "GBK";
  33.     private static final byte[] newline;
  34.     static {
  35.       try {
  36.         newline = "\n".getBytes(ENCODING);
  37.       } catch (UnsupportedEncodingException uee) {
  38.         // Keep the original exception as the cause so the failure stays diagnosable.
  39.         throw new IllegalArgumentException("can't find " + ENCODING + " encoding", uee);
  40.       }
  41.     }
  42.     protected DataOutputStream out;
  43.     private final byte[] keyValueSeparator;
  44.     /**
  45.      * @param out stream that receives the encoded records
  46.      * @param keyValueSeparator text written between a key and its value
  47.      * @throws IllegalArgumentException if the GBK charset is not available
  48.      */
  49.     public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {
  50.       this.out = out;
  51.       try {
  52.         this.keyValueSeparator = keyValueSeparator.getBytes(ENCODING);
  53.       } catch (UnsupportedEncodingException uee) {
  54.         throw new IllegalArgumentException("can't find " + ENCODING + " encoding", uee);
  55.       }
  56.     }
  57.     /**
  58.      * Creates a writer that separates key and value with a tab.
  59.      * @param out stream that receives the encoded records
  60.      */
  61.     public LineRecordWriter(DataOutputStream out) {
  62.       this(out, "\t");
  63.     }
  64.     /**
  65.      * Write the object to the byte stream as GBK text.
  66.      * Every object, including {@link Text}, goes through {@code toString()} so that
  67.      * it is re-encoded; the previous version kept an {@code instanceof Text} guard
  68.      * with no else branch, which silently dropped every non-Text key or value.
  69.      * @param o the object to print
  70.      * @throws IOException if the write throws, we pass it on
  71.      */
  72.     private void writeObject(Object o) throws IOException {
  73.       out.write(o.toString().getBytes(ENCODING));
  74.     }
  75.     /**
  76.      * Writes one record; a null or {@link NullWritable} key/value is omitted, and a
  77.      * record whose key and value are both absent produces no output at all.
  78.      * @param key the record key, may be null
  79.      * @param value the record value, may be null
  80.      * @throws IOException if the underlying stream fails
  81.      */
  82.     @Override
  83.     public synchronized void write(K key, V value) throws IOException {
  84.       boolean nullKey = key == null || key instanceof NullWritable;
  85.       boolean nullValue = value == null || value instanceof NullWritable;
  86.       if (nullKey && nullValue) {
  87.         return;
  88.       }
  89.       if (!nullKey) {
  90.         writeObject(key);
  91.       }
  92.       // The separator is only needed when both sides are present.
  93.       if (!(nullKey || nullValue)) {
  94.         out.write(keyValueSeparator);
  95.       }
  96.       if (!nullValue) {
  97.         writeObject(value);
  98.       }
  99.       out.write(newline);
  100.     }
  101.     /**
  102.      * Closes the underlying stream.
  103.      * @param context the task attempt context (not used here)
  104.      * @throws IOException if closing the stream fails
  105.      */
  106.     @Override
  107.     public synchronized void close(TaskAttemptContext context) throws IOException {
  108.       out.close();
  109.     }
  110.   }
  111.   /**
  112.    * Creates the record writer for the task's default work file, wrapping the file
  113.    * stream in the configured compression codec when output compression is enabled.
  114.    * @param job the task attempt context supplying the configuration
  115.    * @return a writer that emits GBK-encoded lines
  116.    * @throws IOException if the output file cannot be created
  117.    * @throws InterruptedException declared by the overridden method
  118.    */
  119.   @Override
  120.   public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job)
  121.       throws IOException, InterruptedException {
  122.     Configuration conf = job.getConfiguration();
  123.     boolean isCompressed = getCompressOutput(job);
  124.     String keyValueSeparator = conf.get(SEPERATOR, "\t");
  125.     CompressionCodec codec = null;
  126.     String extension = "";
  127.     if (isCompressed) {
  128.       Class<? extends CompressionCodec> codecClass =
  129.           getOutputCompressorClass(job, GzipCodec.class);
  130.       codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
  131.       extension = codec.getDefaultExtension();
  132.     }
  133.     Path file = getDefaultWorkFile(job, extension);
  134.     FileSystem fs = file.getFileSystem(conf);
  135.     // overwrite=false: creation fails rather than replacing an existing file.
  136.     FSDataOutputStream fileOut = fs.create(file, false);
  137.     if (!isCompressed) {
  138.       return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
  139.     }
  140.     return new LineRecordWriter<K, V>(
  141.         new DataOutputStream(codec.createOutputStream(fileOut)), keyValueSeparator);
  142.   }
  143. }

复制代码

最后将作业的输出格式类设置成GbkOutputFormat.class,如:

  1. job.setOutputFormatClass(GbkOutputFormat.class);

复制代码

参考:

  1. http://semantic.iteye.com/blog/1846238

复制代码

hadoop 输出中文乱码问题的更多相关文章

  1. .Net Core 控制台输出中文乱码

    Net Core 控制台输出中文乱码的解决方法: public static void Main(string[] args)         {             Console.Output ...

  2. 在Servlet中出现一个输出中文乱码的问题(已经解)。

    在Servlet中出现一个输出中文乱码的问题,已经解. @Override public void doPost(HttpServletRequest reqeust, HttpServletResp ...

  3. idea 控制台输出 中文乱码 解决方法

    使用intellij idea 14.1时,console 会输出中文乱码.下面分两种情况解决这种问题:一种是maven构建项目.一种是tomcat(不以maven构建)构建项目. 1.tomcat输 ...

  4. 编码(ACSII unicod UTF-8)、QT输出中文乱码深入分析

    总结: 1. qt输出中文乱码原因分析 qt的编程环境默认是utf-8编码格式(关于编码见下文知识要点一): cout << "中文" << endl; 程 ...

  5. 使用WebLogic时控制台输出中文乱码解决方法

    使用WebLogic时控制台输出中文乱码解决方法 1.找到weblogic安装目录,当前项目配置的domain 2.找到bin下的setDomainEnv.cmd文件 3.打开文件,从文件最后搜索第一 ...

  6. 二十一、IntelliJ IDEA 控制台输出中文乱码问题的解决方法

    首先,找到 IntelliJ IDEA 的安装目录,进入bin目录下,定位到idea.vmoptions文件,如下图所示: 双击打开idea.vmoptions文件,如下图所示: 然后,在其中追加-D ...

  7. 解决phantomjs输出中文乱码

    解决phantomjs输出中文乱码,可以在js文件里添加如下语句: phantom.outputEncoding="gb2312"; // 解决输出乱码

  8. resin后台输出中文乱码的解决办法!

    resin后台输出中文乱码的解决办法! 学习了:https://blog.csdn.net/kobeguang/article/details/34116429 编辑conf/resin.con文件: ...

  9. resin后台输出中文乱码的解决的方法!

    近期从tomcat移植到resin,发现这东西不错啊! 仅仅是后台输出时有时候中文会乱码. 如今找到resin后台输出中文乱码的解决的方法: 编辑conf/resin.con文件: <!--ja ...

随机推荐

  1. hdu 1853 (费用流 拆点)

    // 给定一个有向图,必须用若干个环来覆盖整个图,要求这些覆盖的环的权值最小. 思路:原图每个点 u 拆为 u 和 u' ,从源点引容量为 1 费用为 0 的边到 u ,从 u' 引相同性质的边到汇点 ...

  2. uva 12730(期望经典)

    选自: http://blog.csdn.net/myhelperisme/article/details/39724515 用dp(n)表示有n个位置时的期望值,那么,对于一个刚进来的人来说,他有 ...

  3. 一个jsp页面引入另一个jsp页面的三种方式 及静态引入和动态引入的区别

    转载下, 转载自:http://blog.csdn.net/fn_2015/article/details/70311495 1.第一种:jstl  import <c:import url=& ...

  4. JS+PHP瀑布流效果(二)

    <!-- 加载商品 --><script>    //用户拖动滚动条,达到底部时ajax加载一次数据    var loading = $("#loading&quo ...

  5. Js用户引导插件bootstrap-tour

    1.demo直接贴上来了,有什么不懂的,直接去官网上看,地址:http://bootstraptour.com/. 2.这个bootstrap-tour插件的版本是v0.12.0,复制下来代码,引入库 ...

  6. spring中配置缓存—ehcache

    常用的缓存工具有ehcache.memcache和redis,这里介绍spring中ehcache的配置. 1.在pom添加依赖: <!-- ehcache 相关依赖 --> <de ...

  7. ACM解题之(ZOJ 1094) Matrix Chain Multiplication

    题目来源: 点击打开链接 题目翻译: 矩阵乘法问题是动态规划的典型例子. 假设你必须评估一个表达式,如A * B * C * D * E,其中A,B,C,D和E是矩阵.由于矩阵乘法是关联的,乘法运算的 ...

  8. jmeter 测试restful接口

    jmeter 测试restful接口,JSON数据格式 1.添加线程组 2.添加HTTP信息头管理器 请求发送JSON数据格式参数,需要设置Content-Type为application/json ...

  9. MySQL 8.0的十大新特性

    今天,让我们看一下MySQL8.0提升数据库管理员工作效率的十大改进. 从一大堆特性你们找出十点并不太容易,以下是这十大特性: 1.临时表的改进 2.持续的全局变量 3.取消默认MyISAM系统表 4 ...

  10. Ubuntu输入su命令提示认证失败的解决办法

    Ubuntu安装后,root用户默认是被锁定了的,不允许登录,也不允许执行"su命令到root".对于桌面用户而言,这样可以提高安全性.但对于服务器可以设置成允许"su命 ...