utf8_to_utf16

17down voteaccepted

Here's some code. It is only lightly tested, and there are probably a few improvements that could be made. Call this function to convert a UTF-8 string to a UTF-16 wstring. If the input string is not valid UTF-8, the function throws an exception (std::logic_error); otherwise it returns the equivalent UTF-16 wstring.

// Converts a UTF-8 encoded byte string into UTF-16 code units stored in a
// std::wstring.
//
// Code points up to U+FFFF are emitted as a single wchar_t; code points above
// that are emitted as a surrogate pair (high unit 0xD800.., low unit 0xDC00..).
//
// @param utf8  The UTF-8 encoded input.
// @return      The UTF-16 encoding of the input.
// @throws std::logic_error("not a UTF-8 string") when the input contains
//         a stray continuation byte, an invalid lead byte (0xF8..0xFF),
//         a truncated sequence, a non-continuation byte inside a sequence,
//         an overlong encoding, an encoded surrogate (U+D800..U+DFFF), or a
//         value above U+10FFFF.
std::wstring utf8_to_utf16(const std::string& utf8)
{
    std::wstring utf16;
    // A UTF-8 sequence of 1-3 bytes yields one UTF-16 unit and a 4-byte
    // sequence yields two, so the output never exceeds the input length.
    utf16.reserve(utf8.size());

    size_t i = 0;
    while (i < utf8.size())
    {
        unsigned long uni;
        unsigned long smallest;  // lowest code point allowed for this sequence length
        size_t todo;             // number of continuation bytes still expected

        const unsigned char lead = static_cast<unsigned char>(utf8[i++]);
        if (lead <= 0x7F)
        {
            uni = lead;
            todo = 0;
            smallest = 0;
        }
        else if (lead <= 0xBF)
        {
            // 10xxxxxx: a continuation byte cannot start a sequence.
            throw std::logic_error("not a UTF-8 string");
        }
        else if (lead <= 0xDF)
        {
            uni = lead & 0x1F;
            todo = 1;
            smallest = 0x80;
        }
        else if (lead <= 0xEF)
        {
            uni = lead & 0x0F;
            todo = 2;
            smallest = 0x800;
        }
        else if (lead <= 0xF7)
        {
            uni = lead & 0x07;
            todo = 3;
            smallest = 0x10000;
        }
        else
        {
            throw std::logic_error("not a UTF-8 string");
        }

        for (size_t j = 0; j < todo; ++j)
        {
            if (i == utf8.size())
                throw std::logic_error("not a UTF-8 string");
            const unsigned char next = static_cast<unsigned char>(utf8[i++]);
            if (next < 0x80 || next > 0xBF)
                throw std::logic_error("not a UTF-8 string");
            uni <<= 6;
            uni += next & 0x3F;
        }

        // Overlong form: the value would have fit in a shorter sequence.
        // Accepting these lets callers smuggle characters such as '/' or NUL
        // past byte-level checks, so they are rejected as malformed.
        if (uni < smallest)
            throw std::logic_error("not a UTF-8 string");
        if (uni >= 0xD800 && uni <= 0xDFFF)
            throw std::logic_error("not a UTF-8 string");
        if (uni > 0x10FFFF)
            throw std::logic_error("not a UTF-8 string");

        if (uni <= 0xFFFF)
        {
            utf16 += static_cast<wchar_t>(uni);
        }
        else
        {
            // Split the 20 bits above 0x10000 into a high/low surrogate pair.
            uni -= 0x10000;
            utf16 += static_cast<wchar_t>((uni >> 10) + 0xD800);
            utf16 += static_cast<wchar_t>((uni & 0x3FF) + 0xDC00);
        }
    }
    return utf16;
}

share improve this answer

http://stackoverflow.com/questions/7153935/how-to-convert-utf-8-stdstring-to-utf-16-stdwstring

#pragma once

#include <string>

// Select the string type behind the "tstring" alias used as std::tstring:
// std::wstring when _UNICODE is defined, std::string otherwise.
// A pre-existing "tstring" macro is treated as a hard error rather than
// silently redefined.
#if defined(tstring)
#error "\"tstring\" Macro has been defined."
#elif defined(_UNICODE)
#define tstring wstring
#else
#define tstring string
#endif

class EncodingConverter

{

public:

    static int AnsiStrToWideStr(std::string& strSrc, std::wstring& strDest)

    {

        int nLen = strSrc.length() + ;

        int nRet = ;

        nLen *=  sizeof(wchar_t);

        wchar_t* pszW = new wchar_t[nLen];

        memset(pszW, , nLen);

        nRet = MultiByteToWideChar(CP_ACP, , strSrc.c_str(), -, pszW, nLen); 

        strDest = pszW;

        delete[] pszW;

        return nRet;

    };

    static int WideStrToAnsiStr(std::wstring& strSrc, std::string& strDest)

    {

        int nLen = strSrc.length() + ;

        int nRet = ;

        nLen *= sizeof(wchar_t);

        char* pszA = new char[nLen];

        memset(pszA, , nLen);

        nRet = WideCharToMultiByte(CP_ACP, , strSrc.c_str(), -, pszA, nLen, NULL, NULL); 

        strDest = pszA;

        delete[] pszA;

        return nRet;

    };

    static int AnsiStrToTStr(std::string& strSrc, std::tstring& strDest)

    {

        int nRet = ;

#ifdef _UNICODE

        nRet = AnsiStrToWideStr(strSrc, strDest);

#else

        strDest = strSrc;

        nRet = strDest.length();

#endif

        return nRet;

    };

    static int TStrToAnsiStr(std::tstring& strSrc, std::string& strDest)

    {

        int nRet = ;

#ifdef _UNICODE

        nRet = WideStrToAnsiStr(strSrc, strDest);

#else

        strDest = strSrc;

        nRet = strDest.length();

#endif

        return nRet;

    };

    static int WideStrToTStr(std::wstring& strSrc, std::tstring& strDest)

    {

        int nRet = ;

#ifdef _UNICODE

        strDest = strSrc;

        nRet = strDest.length();

#else

        nRet = WideStrToAnsiStr(strSrc, strDest);

#endif

        return nRet;

    };

    static int TStrToWideStr(std::tstring& strSrc, std::wstring& strDest)

    {

        int nRet = ;

#ifdef _UNICODE

        strDest = strSrc;

        nRet = strDest.length();

#else

        nRet = AnsiStrToWideStr(strSrc, strDest);

#endif

        return nRet;

    };

    static std::string ToAnsiString(const wchar_t* lpStr)

    {

        std::wstring wide_string = lpStr;

        std::string ansi_string;

        WideStrToAnsiStr(wide_string, ansi_string);

        return ansi_string;

    };

    static std::string ToAnsiString(const char* lpStr)

    {

        return std::string(lpStr);

    };

    static std::wstring ToWideString(const wchar_t* lpStr)

    {

        return std::wstring(lpStr);

    };

    static std::wstring ToWideString(const char* lpStr)

    {

        std::string ansi_string = lpStr;

        std::wstring wide_string;

        AnsiStrToWideStr(ansi_string, wide_string);

        return wide_string;

    };

    static std::tstring ToTString(const char* lpStr)

    {

#ifdef _UNICODE

        return ToWideString(lpStr);

#else

        return ToAnsiString(lpStr);

#endif

    };

    static std::tstring ToTString(const wchar_t* lpStr)

    {

#ifdef _UNICODE

        return ToWideString(lpStr);

#else

        return ToAnsiString(lpStr);

#endif

    };

    static int WideStrToUtf8Str(std::wstring& strSrc, std::string& strDest)

    {

        int nRet = ;

        int nLen = ;

        nLen = WideCharToMultiByte(CP_UTF8, , strSrc.c_str(), -, NULL, , NULL, NULL);

        char * lpUtf8Str = new char[nLen+];

        memset(lpUtf8Str, , nLen);

        nRet = WideCharToMultiByte(CP_UTF8, , strSrc.c_str(), -, lpUtf8Str, nLen, NULL, NULL);

        strDest = lpUtf8Str;

        delete[] lpUtf8Str;

        return nRet;

    };

    static int AnsiStrToUtf8Str(std::string& strSrc, std::string& strDest)

    {

        int nRet = ;

        std::wstring wide_string;

        nRet = AnsiStrToWideStr(strSrc, wide_string);

        nRet = WideStrToUtf8Str(wide_string, strDest);

        return nRet;

    };

    static int Utf8StrToWideStr(const std::string& strSrc, std::wstring& strDest)

    {

        int nRet = ;

        int nLen = ;

        nLen = MultiByteToWideChar(CP_UTF8, , strSrc.c_str(), -, NULL, );

        wchar_t* lpWideStr = new wchar_t[nLen];

        memset(lpWideStr, , nLen*sizeof(lpWideStr[]));

        nRet = MultiByteToWideChar(CP_UTF8, , strSrc.c_str(), -, lpWideStr, nLen);

        strDest = lpWideStr;

        delete[] lpWideStr;

        return nRet;

    };

    static int Utf8StrToAnsiStr(const std::string& strSrc, std::string& strDest)

    {

        int nRet = ;

        std::wstring wide_string;

        nRet = Utf8StrToWideStr(strSrc, wide_string);

        nRet = WideStrToAnsiStr(wide_string, strDest);

        return nRet;

    };    

    static int Utf8StrToTStr(const std::string& strSrc, std::tstring& strDest)

    {

#ifdef UNICODE

        return Utf8StrToWideStr(strSrc, strDest);

#else

        return Utf8StrToAnsiStr(strSrc, strDest);

#endif

    };    

    static std::string ToUtf8String(const std::string& str)

    {

        std::string ansi_string = str;

        std::string utf8_string;

        AnsiStrToUtf8Str(ansi_string, utf8_string);

        return utf8_string;

    };

    static std::string ToUtf8String(const std::wstring& str)

    {

        std::wstring wide_string = str;

        std::string utf8_string;

        WideStrToUtf8Str(wide_string, utf8_string);

        return utf8_string;

    };

};

https://github.com/yaocoder/utility/blob/master/src/common/EncodingConverter.h

utf8_to_utf16的更多相关文章

boost::xml——基本操作以及中文乱码解决方案（续）
本博文主要想说明以下两点: 1.对于上一篇的<boost::xml——基本操作以及中文乱码解决方案>解释,这篇博文基本解决了正确输入输出中英文问题,但是好像还没有解决修改中文出现乱码的问题 ...
C++ MFC std::string转为 std::wstring
std::string转为 std::wstring std::wstring UTF8_To_UTF16(const std::string& source) { unsigned long ...
谷歌拼音自带lua
function fast_string_banji(argument) return {"快捷1", "快捷2", "快捷3", &quo ...
谷歌拼音输入法扩展API开发指南
为了帮助开发者在谷歌拼音输入法的基本输入功能基础上,开发和定义更丰富的扩展输入功能,谷歌拼音输入法提供了以Lua脚本编程语言为基础的输入法扩展API.利用输入法扩展API,开发者可以编写自定义的输入功 ...

随机推荐

apache kafka源码分析-Producer分析---转载
原文地址:http://www.aboutyun.com/thread-9938-1-1.html 问题导读1.Kafka提供了Producer类作为java producer的api,此类有几种发送 ...
获取IMEI码
核心代码: Imei = ((TelephonyManager) getSystemService(TELEPHONY_SERVICE)).getDeviceId(); 1.加入权限在manifes ...
gson使用详解
昨天读一篇文章,看到gson这个词,一开始还以为作者写错了,问了度娘之后才发现是我才疏学浅,于是大概了解了一下gson用法,总体来说还是很简单的. Gson.jar下载 JavaBean转json / ...
View事件分发机制
所谓的事件分发,其实就是对MotionEvent事件的分发过程,即当一个MotionEvent产生后,系统需要把这个事件传递给一个具体的View,而这个传递的过程就是分发过程. 点击事件的分发由3个方 ...
String类比较，String类运算比较，String运算
String类比较,String类运算比较 >>>>>>>>>>>>>>>>>>>&g ...
jquery.qrcode和jqprint的联合使用，实现html生成二维码并打印（中文也ok）
在公司的生产现场中,常常会在一些部品或设备上贴上二维码,用于扫描录入数据,免去手动输入的麻烦. 以前曾经做过winform的程序,生成二维码,并打印出来,使用的是zxing的类库, 但是如果二维码是附 ...
c#获取远程图片的方法
public static int SaveImageFromWeb(string imgUrl, string path) { var aaa = Environment.CurrentDirect ...
webstorm 如何配置git
2016-09-22 15:00:25 补充js 操作后发现工具自动生成了两个文件,如果提交到git上会造成不必要的麻烦,看看怎么去掉产生这个的原因吧! 去掉对应的监听:http://fronte ...
文件夹IsShow字段为空
IsShow为YesNo字段,默认值为Yes:在Library中新建一个文件的时候会给出默认值yes,但是新建一个文件夹的时候,默认为空,所以f.Item["IsShow"]为空, ...
[C# 基础知识系列]专题一：深入解析委托——C#中为什么要引入委托
转自http://www.cnblogs.com/zhili/archive/2012/10/22/Delegate.html 引言: 对于一些刚接触C# 不久的朋友可能会对C#中一些基本特性理解的不 ...

utf8_to_utf16

utf8_to_utf16的更多相关文章

随机推荐

热门专题