69,373
社区成员
发帖
与我相关
我的任务
分享
/****************************************************************************
* Convert - demonstrate conversion functions using iconv
* File:
* Convert.c
* Description:
* Demonstrates character-set conversion functions built on iconv.
* You can use code_convert() to build further conversion functions such as Utf8ToGb2312()...
* Author:
* XCyber email:XCyber@sohu.com
* Date:
* Sept 11, 2008
* Other:
* visit http://www.gnu.org/software/libiconv/ for more help with iconv
***************************************************************************/
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <tchar.h>
#include <windows.h>
#include "../iconv-1.9.2.win32/include/iconv.h"
#include "Convert.h"
//#pragma comment(lib, "../iconv-1.9.2.win32/lib/iconv.lib") // using iconv dynamic-link lib, iconv.dll
#pragma comment(lib, "../iconv-1.9.2.win32/lib/iconv_a.lib") // using iconv static lib
/*
* Description:
* convert one type of encoding string to another type of encoding string using iconv
* Parameters:
* form_encoding: type of source encoding
* to_encoding: type of target encoding
* from_str: encoding string of source type
* from_str_len: length of encoding string of source type
* to_str[out]: encoding string of target type, ending of string will be set to double zero "\00\00"
* to_str_len[out]: length of encoding string of target type
* Return:
* returns the number of characters converted in a non-reversible way during this call; reversible conversions are not counted.
* In case of error, return -1
* WARNING:
* don't forget to call free() to release memory allocated for "to_str" pointer
*/
size_t code_convert(const char *from_encoding, const char *to_encoding,const char *from_str, const size_t from_str_len,char **to_str, size_t *to_str_len)
{
iconv_t icv;
int argument = 1;
size_t ret = 0;
size_t n = 1;
const char *from = NULL;
char *to = NULL;
char *p = NULL;
size_t from_len = 0;
size_t to_len = 0;
if((!from_encoding) || (!to_encoding) || (!from_str) || (from_str_len == 0)||(!to_str) || (!to_str_len))
return -1;
icv = iconv_open(to_encoding,from_encoding);
if(icv == 0)
return -1;
//enable "illegal sequence discard and continue" feature, so that if met illeagal sequence,
//conversion will continue instead of being terminated
if(iconvctl (icv ,ICONV_SET_DISCARD_ILSEQ,&argument) != 0)
return -1;
do
{
from = from_str;
from_len = from_str_len;
to_len = from_len * n;
*to_str_len = to_len;
to = (char*)malloc(to_len);
if(!to)
{
iconv_close(icv);
return -1;
}
*to_str = to;
ret = iconv(icv,&from,&from_len,&to,&to_len);
if(ret == -1)
{
// not enougt room of output buffer, we should reallocate more room for output buffer
// just simply enlarge size of output buffer to x2 (x3 x4 and so on) size of input buffer
if(errno == E2BIG)
{
n++;
free(*to_str);
*to_str = NULL;
}
else
{
//something wrong, we should terminate conversation
if(to)
free(to);
iconv_close(icv);
return -1;
}
}
}while(ret == -1);
iconv_close(icv);
//size of output string
*to_str_len -= to_len;
//set the ending characters of output string to double zero "\00\00"
p = *to_str;
*to_str = (char*)malloc(*to_str_len + 2);
memcpy(*to_str,p,*to_str_len);
*(*to_str + *to_str_len) = 0;
*(*to_str + *to_str_len + 1) = 0;
free(p);
return ret;
}
/*
* Description:
*   UCS-2LE -> GB2312 convenience wrapper around code_convert()
* Parameters:
*   from_str: input bytes encoded as UCS-2LE
*   from_str_len: size of the input in bytes
*   to_str[out]: receives a newly allocated buffer holding the GB2312 result
*   to_str_len[out]: receives the size of the GB2312 result in bytes
* Return:
*   whatever code_convert() returns: the count of non-reversible conversions,
*   or -1 when the conversion fails
* WARNING:
*   after a successful call the caller must free() the buffer stored in "to_str"
*/
size_t Ucs2leToGb2312(const char *from_str, size_t from_str_len, char **to_str, size_t *to_str_len)
{
    static const char source_encoding[] = "UCS-2LE";
    static const char target_encoding[] = "GB2312";
    size_t irreversible;

    irreversible = code_convert(source_encoding, target_encoding,
                                from_str, from_str_len,
                                to_str, to_str_len);
    return irreversible;
}
/*
* Description:
*   GB2312 -> UCS-2LE convenience wrapper around code_convert()
* Parameters:
*   from_str: input bytes encoded as GB2312
*   from_str_len: size of the input in bytes
*   to_str[out]: receives a newly allocated buffer holding the UCS-2LE result
*   to_str_len[out]: receives the size of the UCS-2LE result in bytes
* Return:
*   whatever code_convert() returns: the count of non-reversible conversions,
*   or -1 when the conversion fails
* WARNING:
*   after a successful call the caller must free() the buffer stored in "to_str"
*/
size_t Gb2312ToUcs2le(const char *from_str, size_t from_str_len, char **to_str, size_t *to_str_len)
{
    static const char source_encoding[] = "GB2312";
    static const char target_encoding[] = "UCS-2LE";
    size_t irreversible;

    irreversible = code_convert(source_encoding, target_encoding,
                                from_str, from_str_len,
                                to_str, to_str_len);
    return irreversible;
}
/*
* Description:
*   UTF-8 -> GB2312 convenience wrapper around code_convert()
* Parameters:
*   from_str: input bytes encoded as UTF-8
*   from_str_len: size of the input in bytes
*   to_str[out]: receives a newly allocated buffer holding the GB2312 result
*   to_str_len[out]: receives the size of the GB2312 result in bytes
* Return:
*   whatever code_convert() returns: the count of non-reversible conversions,
*   or -1 when the conversion fails
* WARNING:
*   after a successful call the caller must free() the buffer stored in "to_str"
*/
size_t Utf8ToGb2312(const char *from_str, size_t from_str_len, char **to_str, size_t *to_str_len)
{
    static const char source_encoding[] = "UTF-8";
    static const char target_encoding[] = "GB2312";
    size_t irreversible;

    irreversible = code_convert(source_encoding, target_encoding,
                                from_str, from_str_len,
                                to_str, to_str_len);
    return irreversible;
}
/*
* Description:
*   GB2312 -> UTF-8 convenience wrapper around code_convert()
* Parameters:
*   from_str: input bytes encoded as GB2312
*   from_str_len: size of the input in bytes
*   to_str[out]: receives a newly allocated buffer holding the UTF-8 result
*   to_str_len[out]: receives the size of the UTF-8 result in bytes
* Return:
*   whatever code_convert() returns: the count of non-reversible conversions,
*   or -1 when the conversion fails
* WARNING:
*   after a successful call the caller must free() the buffer stored in "to_str"
*/
size_t Gb2312ToUtf8(const char *from_str, size_t from_str_len, char **to_str, size_t *to_str_len)
{
    static const char source_encoding[] = "GB2312";
    static const char target_encoding[] = "UTF-8";
    size_t irreversible;

    irreversible = code_convert(source_encoding, target_encoding,
                                from_str, from_str_len,
                                to_str, to_str_len);
    return irreversible;
}
/*
* Description:
*   Interactive driver: asks for a source file and a conversion type, converts
*   the whole file in memory and writes the result to "<source>.converted".
*   Files larger than 1 MB are rejected.
* Return:
*   0 on success (and when the input file is rejected as too large),
*   -1 on invalid input, I/O failure, allocation failure or conversion failure.
*/
int _tmain(int argc, _TCHAR* argv[])
{
    FILE *pFromFile = NULL;
    FILE *pToFile = NULL;
    char szFromFileName[MAX_PATH];
    /* large enough for any accepted source name plus the ".converted" suffix */
    char szToFileName[MAX_PATH + sizeof(".converted")];
    char szNameFmt[32];
    char szBuf[1024];
    char *pFromBuf = NULL;
    int nType = -1;
    int nExit = -1;
    size_t nSize = 0;
    size_t nRet = 0;
    size_t nNonRevrt = 0; //non-reversible characters
    char *to_str = NULL;
    size_t to_str_len = 0;

    (void)argc;
    (void)argv;

    //prompt to input source file
    printf("Please input source file to convert:");
    //bound the read to the buffer size: builds "%<MAX_PATH-1>s"
    sprintf(szNameFmt, "%%%ds", MAX_PATH - 1);
    if (scanf(szNameFmt, szFromFileName) != 1)
        return -1;

    //prompt to input convertion type
    printf("Following convertion type:\n");
    printf("1.GB2312->UCS-2LE\n");
    printf("2.UCS-2LE->GB2312\n");
    printf("3.GB2312->UTF-8\n");
    printf("4.UTF-8->GB2312\n");
    printf("Please input convertion type:");
    if (scanf("%d", &nType) != 1 || nType < 1 || nType > 4)
    {
        printf("Invalid convertion type, program exit.\n");
        return -1;
    }

    //destination file name (cannot overflow, see szToFileName declaration)
    sprintf(szToFileName, "%s.converted", szFromFileName);

    pFromFile = fopen(szFromFileName, "rb");
    if (!pFromFile)
    {
        fprintf(stderr, "Cannot open source file.\n");
        goto cleanup;
    }

    //get source file size; stop on EOF *or* read error so the loop always ends
    while ((nRet = fread(szBuf, 1, sizeof(szBuf), pFromFile)) > 0)
        nSize += nRet;
    if (ferror(pFromFile))
        goto cleanup;

    //if file larger than 1M, exit
    if (nSize > 1024 * 1024)
    {
        printf("Input file too large, program exit.\n");
        nExit = 0;
        goto cleanup;
    }

    //an empty file converts to an empty file; nothing to read or convert
    if (nSize > 0)
    {
        pFromBuf = (char*)malloc(nSize);
        if (!pFromBuf)
            goto cleanup;

        //read whole file into buffer
        rewind(pFromFile);
        nRet = fread(pFromBuf, sizeof(char), nSize, pFromFile);
        if (nRet != nSize)
            goto cleanup;

        //do convertion
        switch (nType)
        {
        case 1:
            nNonRevrt = Gb2312ToUcs2le(pFromBuf, nSize, &to_str, &to_str_len);
            break;
        case 2:
            nNonRevrt = Ucs2leToGb2312(pFromBuf, nSize, &to_str, &to_str_len);
            break;
        case 3:
            nNonRevrt = Gb2312ToUtf8(pFromBuf, nSize, &to_str, &to_str_len);
            break;
        case 4:
            nNonRevrt = Utf8ToGb2312(pFromBuf, nSize, &to_str, &to_str_len);
            break;
        default:
            //unreachable: nType was validated above
            nNonRevrt = (size_t)-1;
            break;
        }

        if (nNonRevrt == (size_t)-1)
        {
            //the output pointer is not meaningful after a failed conversion
            to_str = NULL;
            to_str_len = 0;
            fprintf(stderr, "Conversion failed.\n");
            goto cleanup;
        }
    }

    //size_t is printed through unsigned long (values here are at most a few MB)
    printf("the number of output characters is:%lu\n", (unsigned long)to_str_len);
    printf("the number of non-reversible characters (i.e. can't converted characters) is:%lu\n", (unsigned long)nNonRevrt);

    //write to destination file; it is only created once there is something valid to store
    pToFile = fopen(szToFileName, "wb");
    if (!pToFile)
    {
        fprintf(stderr, "Cannot open destination file.\n");
        goto cleanup;
    }
    if (to_str_len > 0 && fwrite(to_str, sizeof(char), to_str_len, pToFile) != to_str_len)
        goto cleanup;

    //fclose() flushes, so its result decides whether the write really succeeded
    nRet = (size_t)fclose(pToFile);
    pToFile = NULL;
    if (nRet != 0)
        goto cleanup;

    nExit = 0;

cleanup:
    if (pToFile)
        fclose(pToFile);
    if (pFromFile)
        fclose(pFromFile);
    free(to_str);
    free(pFromBuf);
    return nExit;
}
#pragma comment(lib,"iconv.lib") // 记得引入库
// - - - - - - - The character-set conversion functions below are the Linux implementation
/*
 * Convert inlen bytes at inbuf from from_charset to to_charset, writing the
 * result into the caller-supplied buffer outbuf of outlen bytes.
 *
 * outbuf is zero-filled before converting, so the result is NUL-terminated
 * whenever it is shorter than outlen; a result that fills the buffer exactly
 * is NOT terminated.
 *
 * Returns 0 on success. Returns -1 if an argument is NULL, if the converter
 * cannot be created, or if iconv() fails (including an output buffer that is
 * too small).
 */
int code_convert(char *from_charset,char *to_charset,const char *inbuf, size_t inlen,char *outbuf, size_t outlen)
{
    iconv_t cd;
    /* POSIX iconv() takes char ** for the input although it never writes
       through it; the cast only drops the qualifier for the call. */
    char *pin = (char *)inbuf;
    char *pout = outbuf;
    int rc = 0;

    if (!from_charset || !to_charset || !inbuf || !outbuf)
        return -1;

    /* iconv_open() signals failure with (iconv_t)-1, not with 0. */
    cd = iconv_open(to_charset, from_charset);
    if (cd == (iconv_t)-1)
        return -1;

    memset(outbuf, 0, outlen);
    if (iconv(cd, &pin, &inlen, &pout, &outlen) == (size_t)-1)
        rc = -1;

    /* Close the descriptor on both the success and the failure path. */
    iconv_close(cd);
    return rc;
}
/*
 * Example: UTF-8 to GB2312.
 * Forwards to code_convert() with fixed charset names; the converted bytes
 * are written into the caller's outbuf (outlen bytes). Returns the status
 * from code_convert(): 0 on success, -1 on failure.
 */
int u2g(const char *inbuf, size_t inlen, char *outbuf, size_t outlen)
{
    int status;

    status = code_convert("UTF-8", "GB2312", inbuf, inlen, outbuf, outlen);
    return status;
}
/*
 * Example: GB2312 to UTF-8.
 * Forwards to code_convert() with fixed charset names; the converted bytes
 * are written into the caller's outbuf (outlen bytes). Returns the status
 * from code_convert(): 0 on success, -1 on failure.
 */
int g2u(const char *inbuf, size_t inlen, char *outbuf, size_t outlen)
{
    int status;

    status = code_convert("GB2312", "UTF-8", inbuf, inlen, outbuf, outlen);
    return status;
}
// - - - - - - - The character-set conversion functions above are the Linux implementation