android开发 WriteUTF与readUTF 原理

今晚上写代码玩，用到java.io.RandomAccessFile.writeUTF(String)函数，而文件默认保存为gbk，显然是乱码。突然想起来去看看存储编码规则，就去找了些文章了解writeUTF(String)的原理,在此记录。
首先需要弄明白unicode与utf8的表示规则，搜到@Feng哥的一篇文章《字符编码笔记：ASCII，Unicode和UTF-8》,写的很明白，在此招录一段：

| Unicode符号范围 | UTF-8编码方式

| 0000 0000-0000 007F | 0xxxxxxx
| 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
| 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
| 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
下面，还是以汉字"严"为例，演示如何实现UTF-8编码。
已知"严"的unicode是4E25（100111000100101），根据上表，可以发现4E25处在第三行的范围内（0000 0800-0000 FFFF），因此"严"的UTF-8编码需要三个字节，即格式是"1110xxxx 10xxxxxx 10xxxxxx”。然后，从"严"的最后一个二进制位开始，依次从后向前填入格式中的x，多出的位补0。这样就得到了，“严"的UTF-8编码是"11100100 10111000 10100101”，转换成十六进制就是E4B8A5。

也就是将4E25(100 111000 100101)依次填充到(1110xxxx 10xxxxxx 10xxxxxx)的x位置里面！

文章中还强调的一点思想就是:Unicode是为统一世界多种编码问题而制定的统一编码，就是UTF-8只是Unicode的一种实现方式。

打印System.out.println(Integer.toHexString('严'));,打印结果为4e25，为Unicode编码.当使用RandomAccessFile.writeUTF(String), “严"以utf8个是写入文件。

下面是源码，以及我做的一些备注，以后回忆时候好用：

java.io.DataOutputStream.writeUTF(String, DataOutput)

/**
 * Writes {@code str} to {@code out} as a two-byte big-endian length prefix
 * followed by the string encoded one UTF-16 code unit at a time:
 * <ul>
 *   <li>{@code 0x0001..0x007F} -> one byte {@code 0xxxxxxx}</li>
 *   <li>{@code 0x0000} and {@code 0x0080..0x07FF} -> two bytes
 *       {@code 110xxxxx 10xxxxxx}</li>
 *   <li>{@code 0x0800..0xFFFF} -> three bytes
 *       {@code 1110xxxx 10xxxxxx 10xxxxxx}</li>
 * </ul>
 *
 * @param str the string to encode; a {@code null} value fails on
 *            {@code str.length()}
 * @param out the destination; when it is a {@link DataOutputStream} its
 *            cached {@code bytearr} scratch buffer is reused
 * @return the number of bytes handed to {@code out}, i.e. the encoded
 *         length plus the two length-prefix bytes
 * @throws UTFDataFormatException if the encoded form is longer than 65535
 *         bytes (the largest value the two-byte prefix can hold)
 * @throws IOException if {@code out.write} fails
 */
static int writeUTF(String str, DataOutput out) throws IOException {
    int strlen = str.length();

    // First pass: compute the encoded size. The sum is kept in a long because
    // every char may need 3 bytes, so an int accumulator can wrap for very
    // long strings and slip past the 65535 guard below.
    long encodedLength = 0;
    for (int i = 0; i < strlen; i++) {
        int c = str.charAt(i);
        if ((c >= 0x0001) && (c <= 0x007F)) {
            encodedLength++;
        } else if (c > 0x07FF) {
            encodedLength += 3;
        } else {
            encodedLength += 2;
        }
    }

    if (encodedLength > 65535) {
        throw new UTFDataFormatException(
                "encoded string too long: " + encodedLength + " bytes");
    }
    // Safe narrowing: the guard above bounds the value to 0..65535.
    int utflen = (int) encodedLength;

    // Obtain a buffer with room for the payload plus the 2-byte length
    // prefix. A DataOutputStream keeps one around between calls; it is
    // over-allocated when it has to grow.
    byte[] bytearr;
    if (out instanceof DataOutputStream) {
        DataOutputStream dos = (DataOutputStream) out;
        if (dos.bytearr == null || (dos.bytearr.length < (utflen + 2))) {
            dos.bytearr = new byte[(utflen * 2) + 2];
        }
        bytearr = dos.bytearr;
    } else {
        bytearr = new byte[utflen + 2];
    }

    // Length prefix, high byte first.
    int count = 0;
    bytearr[count++] = (byte) ((utflen >>> 8) & 0xFF);
    bytearr[count++] = (byte) ((utflen >>> 0) & 0xFF);

    // Fast path: copy the leading run of single-byte characters directly,
    // stopping at the first char that needs more than one byte.
    int i = 0;
    for (; i < strlen; i++) {
        int c = str.charAt(i);
        if (!((c >= 0x0001) && (c <= 0x007F))) {
            break;
        }
        bytearr[count++] = (byte) c;
    }

    // General path for whatever remains after the fast path stopped.
    for (; i < strlen; i++) {
        int c = str.charAt(i);
        if ((c >= 0x0001) && (c <= 0x007F)) {
            // One byte: 0xxxxxxx
            bytearr[count++] = (byte) c;
        } else if (c > 0x07FF) {
            // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
            // e.g. 0x4E25 (100 111000 100101) -> 11100100 10111000 10100101
            bytearr[count++] = (byte) (0xE0 | ((c >> 12) & 0x0F));
            bytearr[count++] = (byte) (0x80 | ((c >>  6) & 0x3F));
            bytearr[count++] = (byte) (0x80 | ((c >>  0) & 0x3F));
        } else {
            // Two bytes: 110xxxxx 10xxxxxx
            bytearr[count++] = (byte) (0xC0 | ((c >>  6) & 0x1F));
            bytearr[count++] = (byte) (0x80 | ((c >>  0) & 0x3F));
        }
    }

    out.write(bytearr, 0, utflen + 2);
    return utflen + 2;
}

明白了写规则，然后就是读的规则了,反向理解就好。

java.io.DataInputStream.readUTF(DataInput)

/**
 * Reads one length-prefixed string from {@code in} and decodes it.
 *
 * <p>The input starts with an unsigned 16-bit byte count, followed by that
 * many bytes in which each character occupies one byte ({@code 0xxxxxxx}),
 * two bytes ({@code 110xxxxx 10xxxxxx}) or three bytes
 * ({@code 1110xxxx 10xxxxxx 10xxxxxx}).
 *
 * @param in the source; when it is a {@link DataInputStream} its cached
 *           {@code bytearr}/{@code chararr} scratch buffers are reused and
 *           grown as needed
 * @return the decoded string
 * @throws UTFDataFormatException if a multi-byte sequence is cut off by the
 *         end of the data, a continuation byte does not match
 *         {@code 10xxxxxx}, or a lead byte matches {@code 10xxxxxx} or
 *         {@code 1111xxxx}
 * @throws IOException if reading from {@code in} fails
 */
public final static String readUTF(DataInput in) throws IOException {
    final int utflen = in.readUnsignedShort();

    // Pick the working buffers: cached ones for a DataInputStream, fresh
    // exactly-sized ones for any other DataInput.
    final byte[] encoded;
    final char[] decoded;
    if (in instanceof DataInputStream) {
        DataInputStream dis = (DataInputStream) in;
        if (dis.bytearr.length < utflen) {
            dis.bytearr = new byte[utflen * 2];
            dis.chararr = new char[utflen * 2];
        }
        encoded = dis.bytearr;
        decoded = dis.chararr;
    } else {
        encoded = new byte[utflen];
        decoded = new char[utflen];
    }

    in.readFully(encoded, 0, utflen);

    int pos = 0;       // index of the next unread byte
    int produced = 0;  // number of chars decoded so far

    // Fast path: a leading run of single-byte characters maps 1:1 to chars.
    for (; pos < utflen; pos++) {
        int ascii = encoded[pos] & 0xff;
        if (ascii > 127) {
            break;
        }
        decoded[produced++] = (char) ascii;
    }

    // General path: the top four bits of the lead byte select the form.
    while (pos < utflen) {
        final int lead = encoded[pos] & 0xff;
        final int highNibble = lead >> 4;

        if (highNibble <= 7) {
            // 0xxxxxxx
            decoded[produced++] = (char) lead;
            pos++;
        } else if (highNibble == 12 || highNibble == 13) {
            // 110x xxxx   10xx xxxx
            pos += 2;
            if (pos > utflen) {
                throw new UTFDataFormatException(
                    "malformed input: partial character at end");
            }
            final int second = encoded[pos - 1];
            if ((second & 0xC0) != 0x80) {
                throw new UTFDataFormatException(
                    "malformed input around byte " + pos);
            }
            decoded[produced++] = (char) (((lead & 0x1F) << 6) | (second & 0x3F));
        } else if (highNibble == 14) {
            // 1110 xxxx  10xx xxxx  10xx xxxx
            pos += 3;
            if (pos > utflen) {
                throw new UTFDataFormatException(
                    "malformed input: partial character at end");
            }
            final int second = encoded[pos - 2];
            final int third = encoded[pos - 1];
            if (((second & 0xC0) != 0x80) || ((third & 0xC0) != 0x80)) {
                throw new UTFDataFormatException(
                    "malformed input around byte " + (pos - 1));
            }
            decoded[produced++] = (char) (((lead   & 0x0F) << 12)
                                        | ((second & 0x3F) << 6)
                                        | ((third  & 0x3F) << 0));
        } else {
            // 10xx xxxx,  1111 xxxx
            throw new UTFDataFormatException(
                "malformed input around byte " + pos);
        }
    }

    // The number of chars produced may be less than utflen.
    return new String(decoded, 0, produced);
}

一切就显得很明了了。

原文：http://my.oschina.net/diligentSt/blog/147933