求两个函数：UnicodeToUTF8 与 UTF8ToUnicode

a8323418 2009-10-12 04:42:26

...全文

1603 26 打赏收藏转发到动态举报

写回复

用AI写文章

26 条回复

切换为时间正序

请发表友善的回复…

发表回复

生活简单到无聊 2009-10-13

打赏
举报

/*
 * Re-encode a BOM-prefixed text buffer as an AECHAR (wide) string, in place.
 *
 * m_pbuff  - in/out: address of the heap buffer holding the raw text. On
 *            success the old buffer is released with FREEIF and replaced by a
 *            newly MALLOC'ed AECHAR buffer (BOM stripped).
 * char_len - in/out: on entry the size of *m_pbuff (assumed to be a byte
 *            count - TODO confirm against callers); on success it is set to
 *            2 * WSTRLEN(result).
 *
 * Only two encodings are recognised, by their byte-order mark:
 *   FF FE     -> already wide text, copied after the 2-byte BOM
 *   EF BB BF  -> UTF-8, decoded after the 3-byte BOM
 * Anything else (including buffers too short to hold a BOM, or NULL
 * arguments) leaves *m_pbuff and *char_len untouched.
 */
void Decode_Utf8_Unicode_To_Unicode(char ** m_pbuff,uint32 *char_len)
{
    const unsigned char *src = NULL;
    char *pBuffer = NULL;
    int is_wide = 0;
    int is_utf8 = 0;

    if (m_pbuff == NULL || char_len == NULL || *m_pbuff == NULL)
        return;

    /* Inspect only the BOM bytes that *char_len says are really present;
       this also keeps (*char_len) - 3 below from wrapping around. */
    src = (const unsigned char *)*m_pbuff;
    is_wide = (*char_len >= 2 && src[0] == 0xff && src[1] == 0xfe);
    is_utf8 = (*char_len >= 3 && src[0] == 0xef && src[1] == 0xbb && src[2] == 0xbf);

    /* Unknown encoding: leave the content as it is. */
    if (!is_wide && !is_utf8)
        return;

    /* Extra room so that a version string of up to 32 characters can be
       stored in the buffer later. */
    pBuffer = MALLOC(sizeof(AECHAR)*((*char_len)+32));
    if (pBuffer == NULL)
        return;

    /* Start from an empty string so WSTRLEN below is well defined even if
       the conversion writes nothing (e.g. the buffer holds only a BOM). */
    ((AECHAR *)pBuffer)[0] = 0;

    if (is_wide)
    {
        /* NOTE(review): WSTRCPY presumably needs the source to be
           0-terminated - confirm that callers guarantee this. */
        WSTRCPY((AECHAR *)pBuffer,(AECHAR *)(*m_pbuff+2));
    }
    else
    {
        /* UTF-8 payload starts 3 bytes in, right after the BOM. */
        UTF8TOWSTR((byte *)(*m_pbuff+3),(*char_len)-3,(AECHAR *)pBuffer,sizeof(AECHAR)*(*char_len));
    }

    *char_len=2*WSTRLEN((AECHAR *)pBuffer);
    FREEIF(*m_pbuff);
    *m_pbuff=pBuffer;
}

jiayucunyan 2009-10-13

打赏
举报

http://www.gnu.org/software/libiconv/
用iconv吧
使用起来很方便

以前用这个做过unicode转gb

猫已经找不回了 2009-10-13

打赏
举报

如果可以使用 MultiByteToWideChar 的话，直接用它就可以了。

nadoo 2009-10-13

打赏
举报

实际项目中使用的话，还是建议用iconv

nadoo 2009-10-13

打赏
举报

C语言版（自己优化）：



/************************************************************************/

/*  Author: NadOo (nadoo@21cn.com)

    Reference: 

            http://blog.csdn.net/lovekatherine/archive/2007/08/30/1765903.aspx

    Testing Tool: 

            http://www.hanzify.org/?Go=Show::List&ID=9627

*/

/************************************************************************/

#include <stdio.h>

#include <string.h>

#include <malloc.h>

#include <memory.h>



#ifdef WIN32

#define uint8_t  unsigned __int8

#define uint16_t unsigned __int16

#define uint32_t unsigned __int32

#define uint64_t unsigned __int64



#define int8_t  __int8

#define int16_t __int16

#define int32_t __int32

#endif



/*
 * Convert a buffer of UTF-16 code units (native byte order) to UTF-8.
 *
 * in     - UTF-16 code units to convert.
 * insize - size of `in` in BYTES (not code units). Every code unit in that
 *          range is converted, including any embedded or trailing 0.
 * out    - receives a malloc'ed buffer holding the UTF-8 bytes followed by
 *          a '\0'. The caller releases it with free().
 *
 * Returns 0 on success, -1 on invalid arguments or allocation failure
 * (in which case *out is set to NULL when `out` itself is valid).
 *
 * Valid surrogate pairs are combined into a single 4-byte sequence; an
 * unpaired surrogate is encoded as a 3-byte sequence, as before.
 */
int unicode_to_utf8(uint16_t *in, int insize, uint8_t **out)
{
    int i = 0;
    int charscount = 0;
    size_t capacity = 0;
    uint8_t *result = NULL;
    uint8_t *tmp = NULL;

    if (out == NULL)
        return -1;
    *out = NULL;
    if (in == NULL || insize < 0)
        return -1;

    /* Worst case is 3 output bytes per code unit (a surrogate pair turns
       2 units into 4 bytes), plus the terminator. */
    charscount = insize / (int)sizeof(uint16_t);
    capacity = (size_t)charscount * 3 + 1;
    result = (uint8_t *)malloc(capacity);
    if (result == NULL)
        return -1;
    memset(result, 0, capacity);
    tmp = result;

    for (i = 0; i < charscount; i++)
    {
        uint32_t cp = in[i];

        /* High surrogate followed by a low surrogate: one code point. */
        if (cp >= 0xd800 && cp <= 0xdbff && i + 1 < charscount
            && in[i + 1] >= 0xdc00 && in[i + 1] <= 0xdfff)
        {
            cp = 0x10000 + ((cp - 0xd800) << 10) + ((uint32_t)in[i + 1] - 0xdc00);
            i++;
        }

        if (cp <= 0x7f)
        {
            *tmp++ = (uint8_t)cp;
        }
        else if (cp <= 0x7ff)
        {
            *tmp++ = (uint8_t)(0xc0 | (cp >> 6));
            *tmp++ = (uint8_t)(0x80 | (cp & 0x3f));
        }
        else if (cp <= 0xffff)
        {
            *tmp++ = (uint8_t)(0xe0 | (cp >> 12));
            /* Continuation bytes carry 6 payload bits; masking with 0xff
               here would corrupt the 10xxxxxx marker (e.g. for U+3000). */
            *tmp++ = (uint8_t)(0x80 | ((cp >> 6) & 0x3f));
            *tmp++ = (uint8_t)(0x80 | (cp & 0x3f));
        }
        else
        {
            *tmp++ = (uint8_t)(0xf0 | (cp >> 18));
            *tmp++ = (uint8_t)(0x80 | ((cp >> 12) & 0x3f));
            *tmp++ = (uint8_t)(0x80 | ((cp >> 6) & 0x3f));
            *tmp++ = (uint8_t)(0x80 | (cp & 0x3f));
        }
    }

    *tmp = '\0';
    *out = result;
    return 0;
}



/*
 * Convert a NUL-terminated UTF-8 string to UTF-16 code units.
 *
 * in      - NUL-terminated UTF-8 input.
 * out     - receives a malloc'ed, 0-terminated array of UTF-16 code units in
 *           the host's native byte order. The caller releases it with free().
 * outsize - receives the size of the result in BYTES, including the 2-byte
 *           terminator.
 *
 * Returns 0 on success, -1 on invalid arguments or allocation failure.
 *
 * 1- to 4-byte sequences are decoded; code points above U+FFFF become a
 * surrogate pair. Bytes that cannot start a sequence, and lead bytes whose
 * continuation bytes are missing or malformed, are skipped one byte at a
 * time. Overlong encodings are not rejected.
 */
int utf8_to_unicode(uint8_t *in, uint16_t **out, int *outsize)
{
    const uint8_t *p = in;
    uint16_t *result = NULL;
    uint16_t *w = NULL;
    size_t inlen = 0;

    if (in == NULL || out == NULL || outsize == NULL)
        return -1;

    /* One input byte never yields more than one code unit, so
       inlen units plus the terminator is always enough. */
    inlen = strlen((const char *)in);
    result = (uint16_t *)malloc(inlen * 2 + 2);
    if (result == NULL)
    {
        *out = NULL;
        *outsize = 0;
        return -1;
    }
    memset(result, 0, inlen * 2 + 2);
    w = result;

    while (*p)
    {
        uint32_t cp = 0;
        int extra = 0;   /* number of continuation bytes expected */
        int k = 0;

        if (*p < 0x80)
        {
            cp = *p;
        }
        else if ((*p & 0xe0) == 0xc0)
        {
            cp = *p & 0x1f;
            extra = 1;
        }
        else if ((*p & 0xf0) == 0xe0)
        {
            cp = *p & 0x0f;
            extra = 2;
        }
        else if ((*p & 0xf8) == 0xf0)
        {
            cp = *p & 0x07;
            extra = 3;
        }
        else
        {
            /* Stray continuation byte or invalid lead byte. */
            p++;
            continue;
        }

        /* The terminating 0 is not a continuation byte, so this loop can
           never walk past the end of a truncated sequence. */
        for (k = 1; k <= extra; k++)
        {
            if ((p[k] & 0xc0) != 0x80)
                break;
            cp = (cp << 6) | (uint32_t)(p[k] & 0x3f);
        }
        if (k <= extra)
        {
            /* Malformed sequence: drop the lead byte and resynchronise. */
            p++;
            continue;
        }
        p += extra + 1;

        if (cp < 0x10000)
        {
            *w++ = (uint16_t)cp;
        }
        else if (cp <= 0x10ffff)
        {
            cp -= 0x10000;
            *w++ = (uint16_t)(0xd800 | (cp >> 10));
            *w++ = (uint16_t)(0xdc00 | (cp & 0x3ff));
        }
        /* Values above U+10FFFF are not valid Unicode and are dropped. */
    }

    *w = 0;

    *out = result;
    *outsize = (int)((size_t)(w - result + 1) * sizeof(uint16_t));
    return 0;
}



/*
 * Print every byte of a NUL-terminated UTF-8 string as two upper-case hex
 * digits (no separators), followed by a newline.
 */
void dump_utf8(uint8_t *utf8)
{
    uint8_t *cursor;

    for (cursor = utf8; *cursor != 0; ++cursor)
        printf("%02X", *cursor);

    putchar('\n');
}



/*
 * Print the first `size` BYTES of a UTF-16 buffer, in memory order, as
 * two upper-case hex digits each (no separators), followed by a newline.
 */
void dump_unicode(uint16_t *utf16, int size)
{
    uint8_t *bytes = (uint8_t *)utf16;
    int offset = 0;

    while (offset < size)
    {
        printf("%02X", bytes[offset]);
        ++offset;
    }

    putchar('\n');
}



/*
 * Round-trip demo: dump a wide string, convert it to UTF-8, convert it back
 * and dump each stage as hex. Returns 0 on success, 1 if a conversion fails.
 */
int main()
{
    uint16_t unicode[] = L"中文";
    uint8_t *utf8 = NULL;
    uint16_t *uni = NULL;
    int unisize = 0;
    int status = 0;

    printf("original unicode: \n");
    dump_unicode(unicode, sizeof(unicode));

    printf("converted to utf8: \n");
    if (unicode_to_utf8(unicode, sizeof(unicode), &utf8) != 0 || utf8 == NULL)
    {
        return 1;
    }
    dump_utf8(utf8);

    printf("converted to unicode: \n");
    if (utf8_to_unicode(utf8, &uni, &unisize) != 0 || uni == NULL)
    {
        status = 1;
    }
    else
    {
        dump_unicode(uni, unisize);
    }

    /* Both converters hand back malloc'ed buffers owned by the caller. */
    free(uni);
    free(utf8);

    return status;
}

输出结果：
original unicode:
4E2D6587
converted to utf8:
E2B58EE89DA5
converted to unicode:
4E2D65870000

测试写的，应该有不少Bug。

a8323418 2009-10-12

打赏
举报

来一个c语言写的

鼠 2009-10-12

打赏
举报

曾经写过一个 C++ 的，未经过优化，未经过充分测试，慎用……仅供参考……
http://blog.csdn.net/hpsmouse/archive/2009/09/23/4586044.aspx

个人觉得 12 楼写得已经很好了。

a8323418 2009-10-12

打赏
举报

lcc编译器

whg01 2009-10-12

打赏
举报

// Converts a NUL-terminated wide string to UTF-8 via WideCharToMultiByte.
// Returns a buffer allocated with new[]; the caller must release it with
// delete[]. The buffer is zero-filled before conversion, so it is always
// NUL-terminated even if the conversion fails.
char* UnicodeToUtf8( WCHAR* wszUtf8)
{
    // A sizing call (no output buffer) reports how many bytes are required.
    const int bytesNeeded = WideCharToMultiByte(CP_UTF8, 0, wszUtf8, -1, NULL, 0, NULL, NULL);

    // Value-initialisation zero-fills all bytesNeeded + 1 bytes.
    char* const utf8 = new char[bytesNeeded + 1]();

    WideCharToMultiByte(CP_UTF8, 0, wszUtf8, -1, utf8, bytesNeeded, NULL, NULL);
    return utf8;
}
// Converts a NUL-terminated UTF-8 string to a wide string via
// MultiByteToWideChar(CP_UTF8). Despite the name, no GBK conversion happens
// here: the result is wide-character text.
// Returns a buffer allocated with new[]; the caller must release it with
// delete[]. The buffer is zero-filled before conversion.
WCHAR* ConvertUtf8ToGBK( char* szUtf8 )
{
    // A sizing call (no output buffer) reports how many wide chars are required.
    const int wideCount = MultiByteToWideChar(CP_UTF8, 0, szUtf8, -1, NULL, 0);

    WCHAR* const wide = new WCHAR[wideCount + 1];
    memset(wide, 0, wideCount * 2 + 2);

    MultiByteToWideChar(CP_UTF8, 0, szUtf8, -1, wide, wideCount);
    return wide;
}
在VS2008里编译通过。要#include <Shlwapi.h>

whg01 2009-10-12

打赏
举报

你是什么平台？

a8323418 2009-10-12

打赏
举报

大家都不会？

a8323418 2009-10-12

打赏
举报

[Quote=引用 14 楼 whg01 的回复:]
前两天刚解答过类似的问题。
直接调用MultiByteToWideChar和WideCharToMultiByte 即可。

char* UnicodeToUtf8( WCHAR* wszUtf8)
{
int len= WideCharToMultiByte(CP_UTF8,0, wszUtf8,-1, NULL,0, NULL, NULL);
char* szUtf8=new char[len+1];
memset(szUtf8,0, len+1);
WideCharToMultiByte (CP_UTF8,0, wszUtf8,-1, szUtf8, len, NULL,NULL);
return szUtf8;
}
WCHAR* ConvertUtf8ToGBK( char* szUtf8 )
{
int len=MultiByteToWideChar(CP_UTF8,0, (LPCTSTR)szUtf8 ,-1, NULL,0);
WCHAR* wszUCD=new WCHAR[len+1];
memset(wszUCD,0, len*2+2);
MultiByteToWideChar(CP_UTF8,0, (LPCTSTR)strUtf8,-1, wszUCD, len);
return wszUCD;
}

[/Quote]

MultiByteToWideChar
编译器不过

whg01 2009-10-12

打赏
举报

前两天刚解答过类似的问题。
直接调用MultiByteToWideChar和WideCharToMultiByte 即可。

// Converts a NUL-terminated wide string to UTF-8 via WideCharToMultiByte.
// Returns a zero-initialised buffer allocated with new[]; the caller must
// release it with delete[].
char* UnicodeToUtf8( WCHAR* wszUtf8)
{
    // Sizing call: no output buffer, returns the number of bytes required.
    int len= WideCharToMultiByte(CP_UTF8,0, wszUtf8,-1, NULL,0, NULL, NULL);
    char* szUtf8=new char[len+1];
    memset(szUtf8,0, len+1);
    WideCharToMultiByte (CP_UTF8,0, wszUtf8,-1, szUtf8, len, NULL,NULL);
    return szUtf8;
}
// Converts a NUL-terminated UTF-8 string to a wide string via
// MultiByteToWideChar(CP_UTF8). Despite the name, no GBK conversion happens
// here. Returns a zero-initialised buffer allocated with new[]; the caller
// must release it with delete[].
WCHAR* ConvertUtf8ToGBK( char* szUtf8 )
{
    // The source argument is a narrow (char) string, so it is passed as-is:
    // an LPCTSTR cast would turn it into a wide pointer in UNICODE builds.
    int len=MultiByteToWideChar(CP_UTF8,0, szUtf8 ,-1, NULL,0);
    WCHAR* wszUCD=new WCHAR[len+1];
    memset(wszUCD,0, len*2+2);
    MultiByteToWideChar(CP_UTF8,0, szUtf8,-1, wszUCD, len);
    return wszUCD;
}

zpf82118 2009-10-12

打赏
举报

人家已经给你函数了
你改一下不就行了吗？

thy38 2009-10-12

打赏
举报

#include <string>

using namespace std;



// Collection of static helpers for converting Chinese text between GB2312,
// UTF-8 and wide-character (wchar_t) form. The class holds no state.
class CChineseCode

{

   public:

       static void UTF_8ToUnicode(wchar_t* pOut,char *pText);  // Convert UTF-8 to Unicode

       static void UnicodeToUTF_8(char* pOut,wchar_t* pText);  // Convert Unicode to UTF-8

       static void UnicodeToGB2312(char* pOut,wchar_t uData);  // Convert Unicode to GB2312

       static void Gb2312ToUnicode(wchar_t* pOut,char *gbBuffer);// Convert GB2312 to Unicode

       static void GB2312ToUTF_8(string& pOut,char *pText, int pLen);// Convert GB2312 to UTF-8

       static void UTF_8ToGB2312(string &pOut, char *pText, int pLen);// Convert UTF-8 to GB2312

};



// Decodes ONE three-byte UTF-8 sequence (1110xxxx 10xxxxxx 10xxxxxx) starting
// at pText into a single wide character written to *pOut.
//
// pText must point at at least three readable bytes; the bytes are not
// validated, and one-, two- and four-byte sequences are not handled here.
// The value is assigned as a whole wchar_t, so the result does not depend on
// the host's byte order or on sizeof(wchar_t).
void CChineseCode::UTF_8ToUnicode(wchar_t* pOut,char *pText)
{
    const unsigned int lead  = static_cast<unsigned char>(pText[0]);
    const unsigned int mid   = static_cast<unsigned char>(pText[1]);
    const unsigned int trail = static_cast<unsigned char>(pText[2]);

    // 4 payload bits from the lead byte, 6 from each continuation byte.
    *pOut = static_cast<wchar_t>(((lead & 0x0Fu) << 12)
                               | ((mid & 0x3Fu) << 6)
                               | (trail & 0x3Fu));
}



// Encodes the low 16 bits of the wide character *pText as a three-byte UTF-8
// sequence written to pOut[0..2]. No terminator is written.
//
// The output is always three bytes long, so values below U+0800 come out as
// an overlong encoding; callers that need strict UTF-8 must handle that range
// themselves. The value is read as a whole wchar_t, so the result does not
// depend on the host's byte order.
void CChineseCode::UnicodeToUTF_8(char* pOut,wchar_t* pText)
{
    const unsigned int cp = static_cast<unsigned int>(*pText) & 0xFFFFu;

    pOut[0] = static_cast<char>(0xE0u | (cp >> 12));            // top 4 bits
    pOut[1] = static_cast<char>(0x80u | ((cp >> 6) & 0x3Fu));   // middle 6 bits
    pOut[2] = static_cast<char>(0x80u | (cp & 0x3Fu));          // low 6 bits
}



// Converts one wide character to the system ANSI code page (CP_ACP) with
// WideCharToMultiByte, writing at most sizeof(wchar_t) bytes to pOut.
// NOTE(review): this yields GB2312/GBK only when the ANSI code page is a
// Chinese one - confirm for the target systems.
void CChineseCode::UnicodeToGB2312(char* pOut,wchar_t uData)
{
    const wchar_t* const source = &uData;
    const int sourceChars = 1;
    const int outputCapacity = sizeof(wchar_t);

    WideCharToMultiByte(CP_ACP, 0, source, sourceChars, pOut, outputCapacity, NULL, NULL);
}



// Converts the two bytes at gbBuffer from the system ANSI code page (CP_ACP)
// to a single wide character stored in *pOut, using MultiByteToWideChar.
// NOTE(review): the input is treated as GB2312/GBK only when the ANSI code
// page is a Chinese one - confirm for the target systems.
void CChineseCode::Gb2312ToUnicode(wchar_t* pOut,char *gbBuffer)
{
    const int inputBytes = 2;
    const int outputChars = 1;

    ::MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, gbBuffer, inputBytes, pOut, outputChars);
}



// Converts pLen bytes of GB2312 text at pText to UTF-8 and stores the result
// in pOut.
//
// Bytes below 0x80 are copied unchanged; any other byte is taken as the first
// half of a two-byte character and converted through Gb2312ToUnicode. A lone
// lead byte at the very end of the input is dropped instead of reading past
// the buffer. A NULL pText or pLen <= 0 produces an empty string.
//
// As before, the output ends at the first NUL byte in the converted text.
void CChineseCode::GB2312ToUTF_8(string& pOut,char *pText, int pLen)
{
    string utf8;

    if (pText != NULL && pLen > 0)
    {
        // A two-byte GB2312 character becomes at most three UTF-8 bytes.
        utf8.reserve(static_cast<size_t>(pLen) * 3 / 2 + 1);

        int i = 0;
        while (i < pLen)
        {
            const unsigned char lead = static_cast<unsigned char>(pText[i]);

            // ASCII can be copied straight through.
            if (lead < 0x80)
            {
                utf8 += pText[i];
                ++i;
                continue;
            }

            // Incomplete double-byte character at the end of the input.
            if (i + 1 >= pLen)
            {
                break;
            }

            // Pre-set to U+FFFD so a failed conversion (which leaves the
            // output untouched) does not feed uninitialised data onward.
            wchar_t wide = static_cast<wchar_t>(0xFFFD);
            Gb2312ToUnicode(&wide, pText + i);
            i += 2;

            const unsigned int cp = static_cast<unsigned int>(wide) & 0xFFFFu;
            if (cp < 0x80u)
            {
                utf8 += static_cast<char>(cp);
            }
            else if (cp < 0x800u)
            {
                // Two-byte form; the fixed three-byte encoder would produce
                // an overlong (invalid) sequence for this range.
                utf8 += static_cast<char>(0xC0u | (cp >> 6));
                utf8 += static_cast<char>(0x80u | (cp & 0x3Fu));
            }
            else
            {
                char buf[3] = {0, 0, 0};
                UnicodeToUTF_8(buf, &wide);
                utf8.append(buf, 3);
            }
        }
    }

    // Assigning through c_str() keeps the original contract: the result
    // stops at the first NUL byte.
    pOut = utf8.c_str();
}



// Converts pLen bytes of UTF-8 text at pText to GB2312 (via UnicodeToGB2312)
// and stores the result in pOut.
//
// Bytes below 0x80 are copied unchanged. Two- and three-byte sequences are
// decoded to a wide character and converted; their continuation bytes are
// not validated. Anything else - stray continuation bytes, four-byte
// sequences, or a sequence cut off by the end of the input - is skipped one
// byte at a time. A NULL pText or pLen <= 0 produces an empty string.
//
// As before, the output ends at the first NUL byte in the converted text.
void CChineseCode::UTF_8ToGB2312(string &pOut, char *pText, int pLen)
{
    string gb;

    if (pText != NULL && pLen > 0)
    {
        // GB2312 output is never longer than its UTF-8 input.
        gb.reserve(static_cast<size_t>(pLen));

        int i = 0;
        while (i < pLen)
        {
            const unsigned char lead = static_cast<unsigned char>(pText[i]);

            if (lead < 0x80)
            {
                gb += pText[i];
                ++i;
                continue;
            }

            wchar_t wide = 0;
            int consumed = 0;

            if ((lead & 0xF0) == 0xE0 && i + 3 <= pLen)
            {
                UTF_8ToUnicode(&wide, pText + i);
                consumed = 3;
            }
            else if ((lead & 0xE0) == 0xC0 && i + 2 <= pLen)
            {
                const unsigned int trail = static_cast<unsigned char>(pText[i + 1]);
                wide = static_cast<wchar_t>(((lead & 0x1Fu) << 6) | (trail & 0x3Fu));
                consumed = 2;
            }
            else
            {
                // Not decodable here: skip this byte and resynchronise.
                ++i;
                continue;
            }

            // Zeroed for every character so a one-byte (or failed) conversion
            // cannot pick up a stale second byte from the previous one.
            char mb[4] = {0, 0, 0, 0};
            UnicodeToGB2312(mb, wide);
            gb.append(mb);   // appends the 0, 1 or 2 bytes actually produced

            i += consumed;
        }
    }

    // Assigning through c_str() keeps the original contract: the result
    // stops at the first NUL byte.
    pOut = gb.c_str();
}

a8323418 2009-10-12

打赏
举报

[Quote=引用 10 楼 akirya 的回复:]
引用 9 楼 a8323418 的回复:
引用 5 楼 lihan6415151528 的回复:
void CChineseCode::UTF_8ToUnicode(wchar_t* pOut,char *pText)
{
char* uchar = (char *)pOut;

uchar[1] = ((pText[0] & 0x0F) << 4) + ((pText[1] >> 2) & 0x0F);
uchar[0] = ((pText[1] & 0x03) << 6) + (pText[2] & 0x3F);

return;
}

void CChineseCode::UnicodeToUTF_8(char* pOut,wchar_t* pText)
{
// 注意 WCHAR高低字的顺序,低字节在前，高字节在后
char* pchar = (char *)pText;

pOut[0] = (0xE0 | ((pchar[1] & 0xF0) >> 4));
pOut[1] = (0x80 | ((pchar[1] & 0x0F) << 2)) + ((pchar[0] & 0xC0) >> 6);
pOut[2] = (0x80 | (pchar[0] & 0x3F));

return;
}

#include "stdio.h"

wchar_t* UTF_8ToUnicode(char* UTF_8)
{

}
char* UnicodeToUTF_8(wchar_t* Unicode)
{

}
int main(int argc,char* argv[])
{
return 0;
}
能不能把UTF_8ToUnicode和UnicodeToUTF_8 不全

这个函数声明有问题,
[/Quote]

请指点

珍惜生命远离CPP 2009-10-12

打赏
举报

[Quote=引用 9 楼 a8323418 的回复:]
引用 5 楼 lihan6415151528 的回复:
void CChineseCode::UTF_8ToUnicode(wchar_t* pOut,char *pText)
{
char* uchar = (char *)pOut;

uchar[1] = ((pText[0] & 0x0F) << 4) + ((pText[1] >> 2) & 0x0F);
uchar[0] = ((pText[1] & 0x03) << 6) + (pText[2] & 0x3F);

return;
}

void CChineseCode::UnicodeToUTF_8(char* pOut,wchar_t* pText)
{
// 注意 WCHAR高低字的顺序,低字节在前，高字节在后
char* pchar = (char *)pText;

pOut[0] = (0xE0 | ((pchar[1] & 0xF0) >> 4));
pOut[1] = (0x80 | ((pchar[1] & 0x0F) << 2)) + ((pchar[0] & 0xC0) >> 6);
pOut[2] = (0x80 | (pchar[0] & 0x3F));

return;
}

#include "stdio.h"

wchar_t* UTF_8ToUnicode(char* UTF_8)
{

}
char* UnicodeToUTF_8(wchar_t* Unicode)
{

}
int main(int argc,char* argv[])
{
return 0;
}
能不能把UTF_8ToUnicode和UnicodeToUTF_8 不全

[/Quote]
这个函数声明有问题,

a8323418 2009-10-12

打赏
举报

[Quote=引用 5 楼 lihan6415151528 的回复:]
void CChineseCode::UTF_8ToUnicode(wchar_t* pOut,char *pText)
{
char* uchar = (char *)pOut;

uchar[1] = ((pText[0] & 0x0F) << 4) + ((pText[1] >> 2) & 0x0F);
uchar[0] = ((pText[1] & 0x03) << 6) + (pText[2] & 0x3F);

return;
}

void CChineseCode::UnicodeToUTF_8(char* pOut,wchar_t* pText)
{
// 注意 WCHAR高低字的顺序,低字节在前，高字节在后
char* pchar = (char *)pText;

pOut[0] = (0xE0 | ((pchar[1] & 0xF0) >> 4));
pOut[1] = (0x80 | ((pchar[1] & 0x0F) << 2)) + ((pchar[0] & 0xC0) >> 6);
pOut[2] = (0x80 | (pchar[0] & 0x3F));

return;
}

[/Quote]

#include "stdio.h"

/*
 * Placeholder for the UTF-8 -> wide string conversion; not implemented yet.
 * Always returns NULL so that callers get a well-defined value instead of
 * the function running off its end without returning anything.
 */
wchar_t* UTF_8ToUnicode(char* UTF_8)
{
    (void)UTF_8;   /* unused until the conversion is implemented */
    return NULL;
}
/*
 * Placeholder for the wide string -> UTF-8 conversion; not implemented yet.
 * Always returns NULL so that callers get a well-defined value instead of
 * the function running off its end without returning anything.
 */
char* UnicodeToUTF_8(wchar_t* Unicode)
{
    (void)Unicode;   /* unused until the conversion is implemented */
    return NULL;
}
/* Entry point of the skeleton program: does no work and reports success. */
int main(int argc,char* argv[])
{
return 0;
}
能不能把 UTF_8ToUnicode 和 UnicodeToUTF_8 补全？