linux c/c++ 编码转换

Non_Recursive 2011-04-26 10:06:06

大家好，最近遇到一个问题，要将部分编码转换成UTF8编码，借助于ICU,ICONV，可以完成这样的功能，但我不敢确认是否正确，首先：
1.用ICU可以完成猜测某一个字符串编码的功能，返回一个字符串的编码方式，用 ucsdet_getName（）获取。
2.根据 1 获取的编码方式，如果不是utf8，则用 iconv 的接口转成utf8。

主要疑问有：
1.iconv能根据ICU获取的编码方式完成两种已知编码的转换不？
2.ICU有相应的编码转换功能吗? <unicode/ucnv.h> 这个文件有，但好像只能是宽字符的？没用过，请大家介绍一下。

可能提问的有些不清楚，有相关经验的请解释一下，感激不尽。

...全文

588 11 打赏收藏转发到动态举报

写回复

用AI写文章

11 条回复

切换为时间正序

请发表友善的回复…

发表回复

Non_Recursive 2011-04-28

打赏
举报

谢谢楼上，但2楼的代码也可以实现这些功能了。
现在主要问题是如何更准确的猜测源字符串的编码，
以便用iconv转换时减少出错的概率。。
如何做到更准确，估计是将ICU库本土化，使其尽量猜测中文、英文相关的编码，不要全世界的编码都去猜。
这样就能做到更稳定，准确，还能提高效率呢，呵呵。。。
谢谢。

「已注销」 2011-04-27

打赏
举报

/****************************************************************************

 *  Convert - demonstrate conversion functions using iconv

 *  File:

 *    Convert.c

 *  Description:

 *    demonstrate conversion functions using iconv

 *    you can use code_convert() to make many new conversion functions like Utf8ToGb2312()...

 *  Author:

 *    XCyber   email:XCyber@sohu.com

 *  Date:

 *    Sept 11, 2008

 *  Other:

 *    visit http://www.gnu.org/software/libiconv/ for more help of iconv

 ***************************************************************************/





#include <stdio.h>

#include <tchar.h>

#include <windows.h>

#include "../iconv-1.9.2.win32/include/iconv.h"

#include "Convert.h"



//#pragma comment(lib, "../iconv-1.9.2.win32/lib/iconv.lib")  // using iconv dynamic-link lib, iconv.dll

#pragma comment(lib, "../iconv-1.9.2.win32/lib/iconv_a.lib")  // using iconv static lib 



/*
 *  Description:
 *    convert a byte string from one encoding to another using iconv
 *  Parameters:
 *    from_encoding:   iconv name of the source encoding
 *    to_encoding:     iconv name of the target encoding
 *    from_str:        input bytes in the source encoding
 *    from_str_len:    number of input bytes (must be non-zero)
 *    to_str[out]:     receives a newly malloc'ed buffer holding the converted
 *                     bytes followed by two zero bytes "\00\00";
 *                     set to NULL when the conversion fails
 *    to_str_len[out]: receives the number of converted bytes (the two
 *                     terminating zero bytes are not counted);
 *                     set to 0 when the conversion fails
 *  Return:
 *    the value returned by iconv(): the number of characters converted in a
 *    non-reversible way during this call; reversible conversions are not counted.
 *    In case of error, return (size_t)-1
 *  WARNING:
 *    don't forget to call free() to release memory allocated for "to_str" pointer
 */
size_t code_convert(const char *from_encoding, const char *to_encoding,const char *from_str, const size_t from_str_len,char **to_str, size_t *to_str_len)
{
	iconv_t     icv;
	int         discard  = 1;
	int         err      = 0;
	size_t      ret      = (size_t)-1;
	size_t      n        = 1;   /* output capacity is n * from_str_len bytes */
	size_t      capacity = 0;

	const char *from     = NULL;
	char       *buf      = NULL;   /* start of the output allocation */
	char       *to       = NULL;   /* write cursor, advanced by iconv */

	size_t      from_len = 0;
	size_t      to_len   = 0;

	if((!from_encoding) || (!to_encoding) || (!from_str) || (from_str_len == 0) || (!to_str) || (!to_str_len))
		return (size_t)-1;

	/* never leave stale or dangling values in the out parameters on failure */
	*to_str     = NULL;
	*to_str_len = 0;

	/* iconv_open reports failure with (iconv_t)-1, not with a null descriptor */
	icv = iconv_open(to_encoding,from_encoding);
	if(icv == (iconv_t)-1)
		return (size_t)-1;

	//enable "illegal sequence discard and continue" feature, so that an illegal
	//sequence is skipped instead of terminating the conversion
	if(iconvctl(icv,ICONV_SET_DISCARD_ILSEQ,&discard) != 0)
	{
		iconv_close(icv);
		return (size_t)-1;
	}

	for(;;)
	{
		/* refuse sizes whose product (plus the 2 terminator bytes) would wrap */
		if(n > ((size_t)-1 - 2) / from_str_len)
		{
			iconv_close(icv);
			return (size_t)-1;
		}
		capacity = from_str_len * n;

		/* the two extra bytes hold the terminating double zero */
		buf = (char*)malloc(capacity + 2);
		if(!buf)
		{
			iconv_close(icv);
			return (size_t)-1;
		}

		/* every attempt restarts from the beginning of the input,
		   so bring the descriptor back to its initial shift state first */
		iconv(icv,NULL,NULL,NULL,NULL);

		from     = from_str;
		from_len = from_str_len;
		to       = buf;
		to_len   = capacity;

		ret = iconv(icv,&from,&from_len,&to,&to_len);

		/* on success also flush any pending shift sequence of a stateful target encoding */
		if(ret != (size_t)-1 && iconv(icv,NULL,NULL,&to,&to_len) != (size_t)-1)
			break;

		/* save errno before free() gets a chance to change it;
		   release the allocation through its start, "to" has been advanced */
		err = errno;
		free(buf);
		buf = NULL;

		if(err != E2BIG)
		{
			//something wrong, we should terminate conversion
			iconv_close(icv);
			return (size_t)-1;
		}

		// not enough room in the output buffer: retry with x2 (x3, x4 and so on)
		// the size of the input buffer
		n++;
	}

	iconv_close(icv);

	//size of output string = capacity minus the room iconv left unused
	*to_str_len = capacity - to_len;

	//set the ending characters of output string to double zero "\00\00"
	buf[*to_str_len]     = 0;
	buf[*to_str_len + 1] = 0;
	*to_str = buf;

	return ret;
}





/*
 *  Ucs2leToGb2312 - UCS-2LE -> GB2312
 *
 *  Thin wrapper that forwards to code_convert() with the encoding names
 *  "UCS-2LE" (source) and "GB2312" (target).
 *
 *  from_str / from_str_len:  input bytes in UCS-2LE and their count
 *  to_str[out]:              receives the converted GB2312 buffer
 *  to_str_len[out]:          receives the length of the converted buffer
 *
 *  Returns whatever code_convert() returns: the count of non-reversible
 *  conversions, or -1 on error.
 *
 *  The caller owns the buffer stored in *to_str and must free() it.
 */
size_t Ucs2leToGb2312(const char *from_str, size_t from_str_len, char **to_str, size_t *to_str_len)
{
	static const char source_charset[] = "UCS-2LE";
	static const char target_charset[] = "GB2312";

	return code_convert(source_charset, target_charset,
	                    from_str, from_str_len,
	                    to_str, to_str_len);
}





/*
 *  Gb2312ToUcs2le - GB2312 -> UCS-2LE
 *
 *  Thin wrapper that forwards to code_convert() with the encoding names
 *  "GB2312" (source) and "UCS-2LE" (target).
 *
 *  from_str / from_str_len:  input bytes in GB2312 and their count
 *  to_str[out]:              receives the converted UCS-2LE buffer
 *  to_str_len[out]:          receives the length of the converted buffer
 *
 *  Returns whatever code_convert() returns: the count of non-reversible
 *  conversions, or -1 on error.
 *
 *  The caller owns the buffer stored in *to_str and must free() it.
 */
size_t Gb2312ToUcs2le(const char *from_str, size_t from_str_len, char **to_str, size_t *to_str_len)
{
	static const char source_charset[] = "GB2312";
	static const char target_charset[] = "UCS-2LE";

	return code_convert(source_charset, target_charset,
	                    from_str, from_str_len,
	                    to_str, to_str_len);
}



/*
 *  Utf8ToGb2312 - UTF-8 -> GB2312
 *
 *  Thin wrapper that forwards to code_convert() with the encoding names
 *  "UTF-8" (source) and "GB2312" (target).
 *
 *  from_str / from_str_len:  input bytes in UTF-8 and their count
 *  to_str[out]:              receives the converted GB2312 buffer
 *  to_str_len[out]:          receives the length of the converted buffer
 *
 *  Returns whatever code_convert() returns: the count of non-reversible
 *  conversions, or -1 on error.
 *
 *  The caller owns the buffer stored in *to_str and must free() it.
 */
size_t Utf8ToGb2312(const char *from_str, size_t from_str_len, char **to_str, size_t *to_str_len)
{
	static const char source_charset[] = "UTF-8";
	static const char target_charset[] = "GB2312";

	return code_convert(source_charset, target_charset,
	                    from_str, from_str_len,
	                    to_str, to_str_len);
}





/*
 *  Gb2312ToUtf8 - GB2312 -> UTF-8
 *
 *  Thin wrapper that forwards to code_convert() with the encoding names
 *  "GB2312" (source) and "UTF-8" (target).
 *
 *  from_str / from_str_len:  input bytes in GB2312 and their count
 *  to_str[out]:              receives the converted UTF-8 buffer
 *  to_str_len[out]:          receives the length of the converted buffer
 *
 *  Returns whatever code_convert() returns: the count of non-reversible
 *  conversions, or -1 on error.
 *
 *  The caller owns the buffer stored in *to_str and must free() it.
 */
size_t Gb2312ToUtf8(const char *from_str, size_t from_str_len, char **to_str, size_t *to_str_len)
{
	static const char source_charset[] = "GB2312";
	static const char target_charset[] = "UTF-8";

	return code_convert(source_charset, target_charset,
	                    from_str, from_str_len,
	                    to_str, to_str_len);
}







/*
 *  Description:
 *    interactive demo: asks for a source file name and a conversion type,
 *    converts the whole file (at most 1M bytes) in memory and writes the
 *    result to "<source file>.converted"
 *  Return:
 *    0 when the converted file was written, and also when the program stops
 *    because the file is too large or the conversion type is invalid;
 *    -1 on any other failure (bad input, I/O error, out of memory,
 *    empty file, failed conversion)
 */
int _tmain(int argc, _TCHAR* argv[])
{
	FILE *pFromFile = NULL;
	FILE *pToFile = NULL;

	char szFromFileName[MAX_PATH];
	char szToFileName[MAX_PATH + 16];  /* source name plus ".converted" suffix */
	char szNameFmt[16];                /* "%<MAX_PATH-1>s", built at run time */

	char szBuf[1024];
	char *pFromBuf = NULL;

	int nType = -1;
	int nExit = -1;
	size_t nSize = 0;
	size_t nRet = 0;
	size_t nNonRevrt = 0;  //non-reversible characters

	char* to_str = NULL;
	size_t to_str_len = 0;

	(void)argc;
	(void)argv;

	memset(szFromFileName,0,sizeof(szFromFileName));
	memset(szToFileName,0,sizeof(szToFileName));

	//prompt to input source file; the field width keeps scanf inside the buffer
	printf("Please input source file to convert:");
	sprintf(szNameFmt,"%%%ds",(int)(MAX_PATH - 1));
	if(scanf(szNameFmt,szFromFileName) != 1)
	{
		return -1;
	}

	//prompt to input convertion type
	printf("Following convertion type:\n");
	printf("1.GB2312->UCS-2LE\n");
	printf("2.UCS-2LE->GB2312\n");
	printf("3.GB2312->UTF-8\n");
	printf("4.UTF-8->GB2312\n");
	printf("Please input convertion type:");
	if(scanf("%d",&nType) != 1)
	{
		nType = -1;
	}

	//reject an unknown type before any file is touched
	if(nType < 1 || nType > 4)
	{
		printf("Invalid convertion type, program exit.\n");
		return 0;
	}

	//destination file name; szToFileName has room for the name plus the suffix
	sprintf(szToFileName,"%s.converted",szFromFileName);

	pFromFile = fopen(szFromFileName,"rb");
	if(!pFromFile)
	{
		return -1;
	}

	//get source file size; stop counting as soon as the 1M limit is exceeded
	for(;;)
	{
		nRet = fread(szBuf,1,sizeof(szBuf),pFromFile);
		if(nRet == 0)
		{
			break;
		}
		nSize += nRet;
		if(nSize > 1024*1024)
		{
			break;
		}
	}
	if(ferror(pFromFile))
	{
		goto cleanup;
	}

	//if file larger than 1M, exit
	if(nSize > 1024*1024)
	{
		printf("Input file too large, program exit.\n");
		nExit = 0;
		goto cleanup;
	}

	//an empty file cannot be converted (and malloc(0) may return NULL)
	if(nSize == 0)
	{
		printf("Input file is empty, program exit.\n");
		goto cleanup;
	}

	//allocate memory for read buffer
	pFromBuf = (char*)malloc(nSize);
	if(!pFromBuf)
	{
		goto cleanup;
	}
	memset(pFromBuf,0,nSize);

	//reset the read pointer to start of file
	rewind(pFromFile);

	//read whole file into buffer
	nRet = fread(pFromBuf,sizeof(char),nSize,pFromFile);
	if(nRet != nSize)
	{
		goto cleanup;
	}

	//do convertion
	switch(nType)
	{
	case 1:
		nNonRevrt = Gb2312ToUcs2le(pFromBuf,nSize,&to_str,&to_str_len);
		break;
	case 2:
		nNonRevrt = Ucs2leToGb2312(pFromBuf,nSize,&to_str,&to_str_len);
		break;
	case 3:
		nNonRevrt = Gb2312ToUtf8(pFromBuf,nSize,&to_str,&to_str_len);
		break;
	case 4:
		nNonRevrt = Utf8ToGb2312(pFromBuf,nSize,&to_str,&to_str_len);
		break;
	default:
		//not reachable, nType was validated above
		nNonRevrt = (size_t)-1;
		break;
	}

	if(nNonRevrt == (size_t)-1)
	{
		//the output pointer is not trustworthy after a failed conversion:
		//do not write it and do not free it
		to_str = NULL;
		printf("Conversion failed, program exit.\n");
		goto cleanup;
	}

	//size_t is printed through unsigned long so that the format matches the argument
	printf("the number of output characters is:%lu\n",(unsigned long)to_str_len);
	printf("the number of non-reversible characters (i.e. can't converted characters) is:%lu\n",(unsigned long)nNonRevrt);

	//write to destination file; it is only created once there is something to write
	pToFile = fopen(szToFileName,"wb");
	if(!pToFile)
	{
		goto cleanup;
	}
	if(fwrite(to_str,sizeof(char),to_str_len,pToFile) == to_str_len)
	{
		nExit = 0;
	}
	//fclose flushes buffered data, so its failure means the file is incomplete
	if(fclose(pToFile) != 0)
	{
		nExit = -1;
	}
	pToFile = NULL;

cleanup:
	free(to_str);
	free(pFromBuf);
	if(pFromFile)
	{
		fclose(pFromFile);
	}

	return nExit;
}

http://hi.baidu.com/guanxiansun/blog/item/1faeda47727a24026b63e5bc.html
这个是原理介绍的，可以看看。

这是一个开源工程，跨平台编码转换的：http://xcyber.googlecode.com/svn/trunk/Convert/

Non_Recursive 2011-04-27

打赏
举报

谢谢大家，关于ICU猜测编码的，大家有什么经验不？
好像猜测的不是很准，能不能只猜测本地(locale)的编码，如在中国，则只有
ascii,utf16,utf8,gbk,gb2312,gb18030....xxxxxx等十来种就够了。。
不然经常猜出其他国家的字符集，这样转成UTF-8也是错的。。
谢谢。

justkk 2011-04-27

打赏
举报

iconv -l 命令可以列出系统支持的字符集编码
通常两种编码之间可以直接转换
不能直接转换时，可以通过utf8中转一下

luciferisnotsatan 2011-04-27

打赏
举报

如果你要自己写字符集转换的话，那就要找字符集相关的文档看下了
http://topic.csdn.net/u/20110407/14/b9c6143c-160a-4dd3-a2d1-3b992fe557a2.html

luciferisnotsatan 2011-04-27

打赏
举报

ICU没用过，ICONV支持很多字符集的互相转换。应该没什么问题

pathuang68 2011-04-26

打赏
举报

补充：ICONV几乎支持常用到的所有字符集的编码。

pathuang68 2011-04-26

打赏
举报

俺曾经在linux上用iconv做过字符集转换，下面是经过多次验证的代码(Linux和Windows平台)，供参考：



#pragma comment(lib,"iconv.lib")    // 记得引入库



// - - - - - - - 以下是在Linux上实现的字符集转换函数

/*
 * Convert inlen bytes at inbuf from from_charset to to_charset with iconv.
 *
 * from_charset, to_charset: iconv encoding names
 * inbuf, inlen:             input bytes and their count
 * outbuf, outlen:           caller-provided output buffer and its size; the
 *                           whole buffer is zero-filled before converting, so
 *                           the result is NUL-terminated only when it is
 *                           shorter than outlen
 *
 * Returns 0 on success. Returns -1 when an argument is NULL, the charset pair
 * is not supported, the input cannot be converted, or the output buffer is
 * too small.
 */
int code_convert(char *from_charset,char *to_charset,const char *inbuf, size_t inlen,char *outbuf, size_t outlen)
{
         iconv_t cd;
         int rc = -1;

         if (!from_charset || !to_charset || !outbuf || (!inbuf && inlen != 0))
                  return -1;

         /* iconv_open reports failure with (iconv_t)-1, not with 0 */
         cd = iconv_open(to_charset,from_charset);
         if (cd == (iconv_t)-1)
                  return -1;

         memset(outbuf,0,outlen);

         /* iconv prototypes disagree on the constness of the input pointer
            (POSIX: char **, some libiconv builds: const char **); going
            through void * is accepted by a C compiler for both.
            The second call flushes any pending shift sequence of a stateful
            target encoding. */
         if (iconv(cd, (void *)&inbuf, &inlen, &outbuf, &outlen) != (size_t)-1 &&
             iconv(cd, NULL, NULL, &outbuf, &outlen) != (size_t)-1)
                  rc = 0;

         /* close the descriptor on the failure path as well */
         iconv_close(cd);

         return rc;
}





/* Example: UTF-8 to GB2312.
 * Forwards to code_convert() and returns its result unchanged. */
int u2g(const char *inbuf, size_t inlen, char *outbuf, size_t outlen)
{
         int status;

         status = code_convert("UTF-8", "GB2312", inbuf, inlen, outbuf, outlen);
         return status;
}

 

/* Example: GB2312 to UTF-8.
 * Forwards to code_convert() and returns its result unchanged. */
int g2u(const char *inbuf, size_t inlen, char *outbuf, size_t outlen)
{
         int status;

         status = code_convert("GB2312", "UTF-8", inbuf, inlen, outbuf, outlen);
         return status;
}

// - - - - - - - 以上是在Linux上实现的字符集转换函数