16,472
社区成员
发帖
与我相关
我的任务
分享
// Debug command handler: runs CheckNoBOM_utf8 on a pure-ASCII buffer and on a
// UTF-8 buffer converted from an ANSI string, writing each result to afxDump.
void CMainFrame::OnButton32776()
{
	// Case 1: 7-bit ASCII only; the checker reports such input as "not UTF-8".
	char ascii[] = "abcdefg";
	BOOL ret = CheckNoBOM_utf8((BYTE *)ascii, (long)(sizeof(ascii) - 1));
	afxDump << ret << "\n";// all ascii not utf8

	// Case 2: ANSI -> UTF-16 -> UTF-8, then check the resulting UTF-8 bytes.
	char str[] = "A丁一万";// ansi
	WCHAR wBuf[20] = {0};
	char str8[20] = {0};

	// A source length of -1 makes each API convert exactly the NUL-terminated
	// string (terminator included), and each destination is described by its
	// real capacity. One fixed count for every argument would over-read str
	// and undersize str8: the UTF-8 form needs 10 bytes plus the terminator.
	if (MultiByteToWideChar(CP_ACP, 0, str, -1,
	                        wBuf, (int)(sizeof(wBuf) / sizeof(wBuf[0]))) == 0)
	{// ansi 2 unicode failed
		afxDump << "MultiByteToWideChar failed\n";
		return;
	}
	if (WideCharToMultiByte(CP_UTF8, 0, wBuf, -1,
	                        str8, (int)sizeof(str8), NULL, NULL) == 0)
	{// unicode to utf8 failed
		afxDump << "WideCharToMultiByte failed\n";
		return;
	}

	// Expected content: str8 == "A\xE4\xB8\x81\xE4\xB8\x80\xE4\xB8\x87"
	ret = CheckNoBOM_utf8((BYTE *)str8, (long)strlen(str8));
	afxDump << ret << "\n";
}
/* Detecting UTF-8 without a BOM
UCS-4 range (hex.)    UTF-8 octet sequence (binary)
0000 0000-0000 007F   0xxxxxxx
0000 0080-0000 07FF   110xxxxx 10xxxxxx // lead 110: one character in 2 bytes
0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx // lead 1110: one character in 3 bytes
0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx // lead 11110: one character in 4 bytes
0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx // lead 111110: one character in 5 bytes
0400 0000-7FFF FFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx // lead 1111110: one character in 6 bytes
*/
// Heuristically decides whether a buffer that carries no BOM holds UTF-8 text.
//
// pText  - bytes to inspect.
// length - number of bytes available at pText.
//
// Returns TRUE when every byte is either 7-bit ASCII or part of a complete
// lead + continuation sequence (2- to 6-byte forms) and at least one
// non-ASCII byte is present.
// Returns FALSE for a NULL or empty buffer, for pure ASCII input (nothing
// marks it as UTF-8), for a continuation byte without a lead byte, for the
// bytes 0xFE/0xFF, for a wrong byte inside a sequence, and for a sequence cut
// off by the end of the buffer.
// Overlong encodings and surrogate code points are not rejected.
BOOL CheckNoBOM_utf8(BYTE *pText, long length)
{
	if (!pText || length <= 0) return FALSE;

	long i;
	long nPending = 0;          // continuation bytes still owed to the current lead
	BOOL bAllAscii = TRUE;

	for (i = 0; i < length; i++)
	{
		const unsigned char chr = pText[i];

		if (chr < 0x80)
		{// 0xxxxxxx
			// Plain ASCII is only legal between sequences, not inside one.
			if (nPending != 0) return FALSE;
			continue;
		}
		bAllAscii = FALSE;

		if (nPending != 0)
		{// inside a sequence: must be 10xxxxxx
			if ((chr & 0xC0) != 0x80) return FALSE;
			nPending--;
			continue;
		}

		// Lead byte: the number of leading 1 bits gives the sequence length.
		if (chr >= 0xFE) return FALSE;          // 0xFE/0xFF match no lead pattern
		else if (chr >= 0xFC) nPending = 5;     // 1111110x
		else if (chr >= 0xF8) nPending = 4;     // 111110xx
		else if (chr >= 0xF0) nPending = 3;     // 11110xxx
		else if (chr >= 0xE0) nPending = 2;     // 1110xxxx
		else if (chr >= 0xC0) nPending = 1;     // 110xxxxx
		else return FALSE;                      // 10xxxxxx with no lead byte
	}

	// The buffer ended in the middle of a multi-byte sequence.
	if (nPending > 0) return FALSE;
	// Every byte was ASCII: nothing marks the text as UTF-8.
	if (bAllAscii) return FALSE;

	return TRUE;
}