求助: 一段文本的解析,Unicode, UTF8 还是其他???!!!

kingmax54212008 2010-06-10 10:50:11
需要解析的文件我已经放到我的csdn资源空间里了,大家可以取下来看看:
http://download.csdn.net/source/2444008



用ultraedit打开是如下效果:
/**************************** Text starts *************************************/

HDDW C O P I E S P A G E W I D T H o P A G E L E N G T H
C O L O R D U P L E X A C C O U N T j i m m y D O M A I N S P C - 8 A 9 A E A 3 2 4 B 0 G R O U P N A M E D E F A U L T D E V I C E N A M E X e r o x 3 2 0 0 L O C A T I O N $\ \ S P C - 8 A 9 A E A 3 2 4 B 0 P O R T N A M E U S B 0 0 1 J O B D A T E Lb P A G E C O U N T J O B C O S T D O C U M E N T jM i c r o s o f t W o r d - _ -N齎;Sf[褃f[b?Sf[陙㏑噀pS鹼邁頞9e鷁畫蔛X[(W顣槝Gl;`. d o c . d o c J O B S I Z E 6 P O R T T Y P E I P S P C - 8 A 9 A E A 3 2 4 B 0
P O R T U S B 0 0 1 D R I V E R 6X e r o x P h a s e r 3 2 0 0 M F P P C L 6 A U D I T O R _ I D W K S _ I P 1 9 2 . 1 6 8 . 1 . 1 7 9

/******************************* Text ends ***********************************/


能看出其中的一些英文单词,其中也有中文。(看其中乱码部分, 字符“.doc.doc”前是一段中文。)

求助,看看各路高手有什么意见?走过路过,帮个忙!
多谢!!!
...全文
186 13 打赏 收藏 转发到动态 举报
写回复
用AI写文章
13 条回复
切换为时间正序
请发表友善的回复…
发表回复
prayers 2010-06-12
  • 打赏
  • 举报
回复
顶!!!
kingmax54212008 2010-06-12
  • 打赏
  • 举报
回复
To tufaqing,

ParseData的时候, swap_bytes是做什么的?在最后都要进行一次swap,
比如执行: swap_bytes(&pInfo->hddw, sizeof(pInfo->hddw));
tufaqing 2010-06-11
  • 打赏
  • 举报
回复

// Console entry point: decodes C:\a.txt with ParseData and prints every
// decoded field to stdout.
// Returns 0 on success and 1 when the file could not be parsed.
int _tmain(int argc, _TCHAR* argv[])
{
    // NOTE(review): "chs" is presumably the MSVC name of the Simplified
    // Chinese locale, which %S needs to convert the Chinese document name
    // for the console - confirm against the CRT documentation.
    setlocale(LC_ALL, "chs");

    DOCDATAINFO ddi;
    if(!ParseData(L"C:\\a.txt", &ddi))
    {
        // Report the failure instead of exiting silently with a success code.
        fprintf(stderr, "Failed to parse C:\\a.txt\n");
        return 1;
    }

    // job_date holds seconds since 1970/1/1; widen it to time_t for ctime.
    time_t job_date = ddi.job_date;
    // ctime can return NULL for a value it cannot represent; never pass
    // a NULL pointer to %s.
    const char *date = ctime(&job_date);
    if(date == NULL)
    {
        date = "(invalid time)";
    }

    // DWORD is unsigned long, so the matching conversion is %lu (not %u).
    printf("HDDW: %u\n", ddi.hddw);
    printf("COPIES: %lu\n", ddi.copies);
    printf("PAGEWIDTH: %lu\n", ddi.page_width);
    printf("PAGELENGTH: %lu\n", ddi.page_length);
    printf("PAGECOUNT: %lu\n", ddi.page_count);
    printf("COLOR: %lu\n", ddi.color);
    printf("DUPLEX: %lu\n", ddi.duplex);
    printf("JOBDATE: %lu - %s\n", ddi.job_date, date);
    printf("JOBCOST: %lu\n", ddi.job_cost);
    printf("JOBSIZE: %lu\n", ddi.job_size);
    printf("PORTTYPE: %lu\n", ddi.port_type);
    printf("AUDITOR_ID: %lu\n", ddi.auditor_id);

    // %S prints a wide string through the narrow printf (MSVC extension).
    printf("DOCUMENT: %S\n", ddi.document);
    printf("ACCOUNT: %S\n", ddi.account);
    printf("DOMAIN: %S\n", ddi.domain);
    printf("LOCATION: %S\n", ddi.location);
    printf("IP: %S\n", ddi.ip);
    printf("GROUPNAME: %S\n", ddi.group_name);
    printf("DEVICENAME: %S\n", ddi.device_name);
    printf("PORTNAME: %S\n", ddi.port_name);
    printf("PORT: %S\n", ddi.port);
    printf("DRIVER: %S\n", ddi.driver);
    printf("WKS_IP: %S\n", ddi.wks_ip);

    return 0;
}
tufaqing 2010-06-11
  • 打赏
  • 举报
回复
LZ还是要多练习啊!

// test.cpp : Defines the entry point for the console application.
//

#include "stdafx.h"
#include <windows.h>
#include <time.h>
#include <locale.h>

// One name/value record read from the job file, stored as a node of a
// singly linked list. Nodes are built by CreateDocDataList and released
// by DestroyDocDataList.
struct DOCDATALIST
{
// Size of the name buffer in bytes.
WORD name_len;
// Size of the data buffer in bytes.
WORD data_len;
// Raw record name bytes exactly as read from the file (allocated with new[]).
BYTE *name;
// Raw record payload bytes exactly as read from the file (allocated with new[]).
BYTE *data;
// Next record in file order, or NULL for the last node.
DOCDATALIST *next;
};

// Decoded contents of one job file, filled in by ParseData.
// ParseData zeroes the whole structure first, so a record that is missing
// from the file leaves its member as 0 / an empty string.
struct DOCDATAINFO
{
// Integer records. ParseData byte-swaps each of them after copying the
// raw bytes from the file. Note that hddw is 2 bytes, the rest are 4 bytes.
WORD hddw;
DWORD copies;
DWORD page_width;
DWORD page_length;
DWORD page_count;
DWORD color;
DWORD duplex;
// The sample caller converts this to time_t and formats it with ctime.
DWORD job_date;
DWORD job_cost;
DWORD job_size;
DWORD port_type;
DWORD auditor_id;
// Wide-character string records, copied verbatim from the file.
// A record larger than the buffer is skipped by ParseData.
WCHAR document[MAX_PATH];
WCHAR account[MAX_PATH];
WCHAR domain[MAX_PATH];
WCHAR location[MAX_PATH];
WCHAR ip[MAX_PATH];
WCHAR group_name[MAX_PATH];
WCHAR device_name[MAX_PATH];
WCHAR port_name[MAX_PATH];
WCHAR port[MAX_PATH];
WCHAR driver[MAX_PATH];
WCHAR wks_ip[MAX_PATH];
};

// Reverses the order of the len bytes that start at p, in place.
// The parser uses it to turn the integers stored in the file (high byte
// first, e.g. "00 05" for 5) into the byte order of the running machine.
//
// p   - first byte of the value to reverse; NULL is ignored.
// len - number of bytes to reverse; values below 2 leave the buffer untouched.
static void swap_bytes(void *p, int len)
{
    // Nothing to reverse, and bailing out early avoids forming the
    // out-of-range pointer p - 1 when len is 0.
    if(p == NULL || len < 2)
    {
        return;
    }

    unsigned char *front = (unsigned char *)p;
    unsigned char *back = front + len - 1;

    // Walk inwards from both ends, exchanging one pair per step.
    while(front < back)
    {
        unsigned char tmp = *front;
        *front = *back;
        *back = tmp;
        front++;
        back--;
    }
}

DOCDATALIST *CreateDocDataList(WCHAR *pFileName)
{
if(pFileName == NULL)
{
return NULL;
}

HANDLE hFile = CreateFileW(pFileName, GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, 0, NULL);
if(hFile == INVALID_HANDLE_VALUE)
{
return NULL;
}

DOCDATALIST *pList = NULL;
DOCDATALIST *pNode = NULL;

while(1)
{
WORD nNameLen = 0;
WORD nDataLen = 0;
BYTE *pName = NULL;
BYTE *pData = NULL;

DWORD nReadBytes = 0;

ReadFile(hFile, &nNameLen, 2, &nReadBytes, NULL);
if(nNameLen == 0 || nReadBytes < 2)
{
break;
}

swap_bytes(&nNameLen, 2);

pName = new BYTE[nNameLen];

ReadFile(hFile, pName, nNameLen, &nReadBytes, NULL);
if(nReadBytes < nNameLen)
{
delete[] pName;
break;
}

ReadFile(hFile, &nDataLen, 2, &nReadBytes, NULL);
if(nDataLen == 0 || nReadBytes < 2)
{
delete[] pName;
break;
}

swap_bytes(&nDataLen, 2);

pData = new BYTE[nDataLen];

ReadFile(hFile, pData, nDataLen, &nReadBytes, NULL);
if(nReadBytes < nDataLen)
{
delete[] pName;
delete[] pData;
break;
}

if(pList == NULL)
{
pNode = new DOCDATALIST;
pList = pNode;
pNode->name_len = nNameLen;
pNode->data_len = nDataLen;
pNode->name = pName;
pNode->data = pData;
pNode->next = NULL;
}
else
{
pNode->next = new DOCDATALIST;
pNode->next->name_len = nNameLen;
pNode->next->data_len = nDataLen;
pNode->next->name = pName;
pNode->next->data = pData;
pNode->next->next = NULL;
pNode = pNode->next;
}
}

CloseHandle(hFile);

return pList;
}

// Releases every node of a list returned by CreateDocDataList, together
// with the name and data buffers the nodes own. A NULL list is accepted.
void DestroyDocDataList(DOCDATALIST *pList)
{
    for(DOCDATALIST *pCurrent = pList; pCurrent != NULL; )
    {
        // Remember the successor before the node itself is deleted.
        DOCDATALIST *pFollowing = pCurrent->next;

        // A buffer is only released when its recorded length is non-zero.
        if(pCurrent->name_len != 0)
        {
            delete[] pCurrent->name;
        }
        if(pCurrent->data_len != 0)
        {
            delete[] pCurrent->data;
        }

        delete pCurrent;
        pCurrent = pFollowing;
    }
}

BOOL ParseData(WCHAR *pFileName, DOCDATAINFO *pInfo)
{
DOCDATALIST *pList = CreateDocDataList(pFileName);
if(pList == NULL)
{
return FALSE;
}

memset(pInfo, 0, sizeof(DOCDATAINFO));

DOCDATALIST *pNode = pList;
while(pNode != NULL)
{
if((pNode->name_len == sizeof("HDDW")) && (memcmp(pNode->name, "HDDW", pNode->name_len) == 0))
{
if(pNode->data_len == sizeof(pInfo->hddw))
{
memcpy(&pInfo->hddw, pNode->data, pNode->data_len);
}
}
else if((pNode->name_len == sizeof(L"COPIES")) && (memcmp(pNode->name, L"COPIES", pNode->name_len) == 0))
{
if(pNode->data_len == sizeof(pInfo->copies))
{
memcpy(&pInfo->copies, pNode->data, pNode->data_len);
}
}
else if((pNode->name_len == sizeof(L"PAGEWIDTH")) && (memcmp(pNode->name, L"PAGEWIDTH", pNode->name_len) == 0))
{
if(pNode->data_len == sizeof(pInfo->page_width))
{
memcpy(&pInfo->page_width, pNode->data, pNode->data_len);
}
}
else if((pNode->name_len == sizeof(L"PAGELENGTH")) && (memcmp(pNode->name, L"PAGELENGTH", pNode->name_len) == 0))
{
if(pNode->data_len == sizeof(pInfo->page_length))
{
memcpy(&pInfo->page_length, pNode->data, pNode->data_len);
}
}
else if((pNode->name_len == sizeof(L"PAGECOUNT")) && (memcmp(pNode->name, L"PAGECOUNT", pNode->name_len) == 0))
{
if(pNode->data_len == sizeof(pInfo->page_count))
{
memcpy(&pInfo->page_count, pNode->data, pNode->data_len);
}
}
else if((pNode->name_len == sizeof(L"COLOR")) && (memcmp(pNode->name, L"COLOR", pNode->name_len) == 0))
{
if(pNode->data_len == sizeof(pInfo->color))
{
memcpy(&pInfo->color, pNode->data, pNode->data_len);
}
}
else if((pNode->name_len == sizeof(L"DUPLEX")) && (memcmp(pNode->name, L"DUPLEX", pNode->name_len) == 0))
{
if(pNode->data_len == sizeof(pInfo->duplex))
{
memcpy(&pInfo->duplex, pNode->data, pNode->data_len);
}
}
else if((pNode->name_len == sizeof(L"JOBDATE")) && (memcmp(pNode->name, L"JOBDATE", pNode->name_len) == 0))
{
if(pNode->data_len == sizeof(pInfo->job_date))
{
memcpy(&pInfo->job_date, pNode->data, pNode->data_len);
}
}
else if((pNode->name_len == sizeof(L"JOBCOST")) && (memcmp(pNode->name, L"JOBCOST", pNode->name_len) == 0))
{
if(pNode->data_len == sizeof(pInfo->job_cost))
{
memcpy(&pInfo->job_cost, pNode->data, pNode->data_len);
}
}
else if((pNode->name_len == sizeof(L"JOBSIZE")) && (memcmp(pNode->name, L"JOBSIZE", pNode->name_len) == 0))
{
if(pNode->data_len == sizeof(pInfo->job_size))
{
memcpy(&pInfo->job_size, pNode->data, pNode->data_len);
}
}
else if((pNode->name_len == sizeof(L"PORTTYPE")) && (memcmp(pNode->name, L"PORTTYPE", pNode->name_len) == 0))
{
if(pNode->data_len == sizeof(pInfo->port_type))
{
memcpy(&pInfo->port_type, pNode->data, pNode->data_len);
}
}
else if((pNode->name_len == sizeof(L"AUDITOR_ID")) && (memcmp(pNode->name, L"AUDITOR_ID", pNode->name_len) == 0))
{
if(pNode->data_len == sizeof(pInfo->auditor_id))
{
memcpy(&pInfo->auditor_id, pNode->data, pNode->data_len);
}
}
else if((pNode->name_len == sizeof(L"DOCUMENT")) && (memcmp(pNode->name, L"DOCUMENT", pNode->name_len) == 0))
{
if(pNode->data_len <= sizeof(pInfo->document))
{
memcpy(pInfo->document, pNode->data, pNode->data_len);
}
}
else if((pNode->name_len == sizeof(L"ACCOUNT")) && (memcmp(pNode->name, L"ACCOUNT", pNode->name_len) == 0))
{
if(pNode->data_len <= sizeof(pInfo->account))
{
memcpy(pInfo->account, pNode->data, pNode->data_len);
}
}
else if((pNode->name_len == sizeof(L"DOMAIN")) && (memcmp(pNode->name, L"DOMAIN", pNode->name_len) == 0))
{
if(pNode->data_len <= sizeof(pInfo->domain))
{
memcpy(pInfo->domain, pNode->data, pNode->data_len);
}
}
else if((pNode->name_len == sizeof(L"LOCATION")) && (memcmp(pNode->name, L"LOCATION", pNode->name_len) == 0))
{
if(pNode->data_len <= sizeof(pInfo->location))
{
memcpy(pInfo->location, pNode->data, pNode->data_len);
}
}
else if((pNode->name_len == sizeof(L"IP")) && (memcmp(pNode->name, L"IP", pNode->name_len) == 0))
{
if(pNode->data_len <= sizeof(pInfo->ip))
{
memcpy(pInfo->ip, pNode->data, pNode->data_len);
}
}
else if((pNode->name_len == sizeof(L"GROUPNAME")) && (memcmp(pNode->name, L"GROUPNAME", pNode->name_len) == 0))
{
if(pNode->data_len <= sizeof(pInfo->group_name))
{
memcpy(pInfo->group_name, pNode->data, pNode->data_len);
}
}
else if((pNode->name_len == sizeof(L"DEVICENAME")) && (memcmp(pNode->name, L"DEVICENAME", pNode->name_len) == 0))
{
if(pNode->data_len <= sizeof(pInfo->device_name))
{
memcpy(pInfo->device_name, pNode->data, pNode->data_len);
}
}
else if((pNode->name_len == sizeof(L"PORTNAME")) && (memcmp(pNode->name, L"PORTNAME", pNode->name_len) == 0))
{
if(pNode->data_len <= sizeof(pInfo->port_name))
{
memcpy(pInfo->port_name, pNode->data, pNode->data_len);
}
}
else if((pNode->name_len == sizeof(L"PORT")) && (memcmp(pNode->name, L"PORT", pNode->name_len) == 0))
{
if(pNode->data_len <= sizeof(pInfo->port))
{
memcpy(pInfo->port, pNode->data, pNode->data_len);
}
}
else if((pNode->name_len == sizeof(L"DRIVER")) && (memcmp(pNode->name, L"DRIVER", pNode->name_len) == 0))
{
if(pNode->data_len <= sizeof(pInfo->driver))
{
memcpy(pInfo->driver, pNode->data, pNode->data_len);
}
}
else if((pNode->name_len == sizeof(L"WKS_IP")) && (memcmp(pNode->name, L"WKS_IP", pNode->name_len) == 0))
{
if(pNode->data_len <= sizeof(pInfo->wks_ip))
{
memcpy(pInfo->wks_ip, pNode->data, pNode->data_len);
}
}

pNode = pNode->next;
}

DestroyDocDataList(pList);

swap_bytes(&pInfo->hddw, sizeof(pInfo->hddw));
swap_bytes(&pInfo->copies, sizeof(pInfo->copies));
swap_bytes(&pInfo->page_width, sizeof(pInfo->page_width));
swap_bytes(&pInfo->page_length, sizeof(pInfo->page_length));
swap_bytes(&pInfo->page_count, sizeof(pInfo->page_count));
swap_bytes(&pInfo->color, sizeof(pInfo->color));
swap_bytes(&pInfo->duplex, sizeof(pInfo->duplex));
swap_bytes(&pInfo->job_date, sizeof(pInfo->job_date));
swap_bytes(&pInfo->job_cost, sizeof(pInfo->job_cost));
swap_bytes(&pInfo->job_size, sizeof(pInfo->job_size));
swap_bytes(&pInfo->port_type, sizeof(pInfo->port_type));
swap_bytes(&pInfo->auditor_id, sizeof(pInfo->auditor_id));

return TRUE;
}
tufaqing 2010-06-10
  • 打赏
  • 举报
回复
JOBDATE是时间戳,表示自1970/1/1 00:00:00(UTC)起经过的秒数(与time(NULL)返回值的含义相同),可以用gmtime转换成年月日(UTC标准时间),或用ctime输出按系统时区换算的本地时间字符串。
DOCUMENT和其它的一些描述都是unicode编码的,直接new WCHAR[n],memcpy到这个内存就行了。
kingmax54212008 2010-06-10
  • 打赏
  • 举报
回复
to tufaqing,
惊叹!
太厉害了。
对于那段中文与时间,你是怎么解析出来的?

JOBDATE: 0x4c0f62cb(2010/6/9 9:45:47,中国时间17:45:47)
DOCUMENT: Microsoft Word - _中国医学科学院医学自助文印系统修改建议及存在问题汇总.doc.doc
tufaqing 2010-06-10
  • 打赏
  • 举报
回复
我是对照文件人工解析的,没有写代码,自己写一下也比较简单。
00 05 (描述名长度为5)
48 44 44 57 00 (描述名为HDDW)
00 02 (内容长度为2)
00 01 (short 1)

00 0E (描述名长度为14)
43 00 4F 00 50 00 49 00 45 00 53 00 00 00 (描述名为COPIES)
00 04 (内容长度为4)
00 00 00 01 (long 1)

...
kingmax54212008 2010-06-10
  • 打赏
  • 举报
回复
在线等~~~
用户 昵称 2010-06-10
  • 打赏
  • 举报
回复
是utf16 little endian格式的,可是俺没恢复到1楼的程度,俺这边有很多字符是不可显示的。
kingmax54212008 2010-06-10
  • 打赏
  • 举报
回复
楼上的高手啊。
能把你解析出这些数据的例子给我发过来一下么?
kingmax5421#gmail.com (@代替#)
tufaqing 2010-06-10
  • 打赏
  • 举报
回复
这是自定义的一段数据结构:每个字段先用两个字节(高字节在前)表示长度,后面紧跟相应长度的内容。

HDDW: 1
COPIES: 1
PAGEWIDTH: 0x86f
PAGELENGTH: 0xaea
COLOR: 2
DUPLEX: 1
ACCOUNT: jimmy
DOMAIN: SPC-8A9AEA324B0
GROUPNAME: DEFAULT
DEVICENAME: Xerox3200
LOCATION: \\SPC-8A9AEA324B0
PORTNAME: USB001
JOBDATE: 0x4c0f62cb(2010/6/9 9:45:47,中国时间17:45:47)
PAGECOUNT: 1
JOBCOST: 0
DOCUMENT: Microsoft Word - _中国医学科学院医学自助文印系统修改建议及存在问题汇总.doc.doc
JOBSIZE: 0x368f
PORTTYPE: 3
IP: SPC-8A9AEA324B0
PORT: USB001
DRIVER: Xerox Phaser 3200MFP PCL 6
AUDITOR_ID: 0
WKS_IP: 192.168.1.179
kingmax54212008 2010-06-10
  • 打赏
  • 举报
回复
这是不是C++里面所谓的序列化,
我的处理是反序列化,有没有这方面的经典例子或参照 ?
kingmax54212008 2010-06-10
  • 打赏
  • 举报
回复
to tufaqing,

sounds great. I will have a try!

16,473

社区成员

发帖
与我相关
我的任务
社区描述
VC/MFC相关问题讨论
社区管理员
  • 基础类社区
  • Web++
  • encoderlee
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告

        VC/MFC社区版块或许是CSDN最“古老”的版块了,记忆之中,与CSDN的年龄几乎差不多。随着时间的推移,MFC技术渐渐地偏离了开发主流,若干年之后的今天,当我们面对着微软的这个经典之笔,内心充满着敬意,那些曾经的记忆,可以说代表着二十年前曾经的辉煌……
        向经典致敬,或许是老一代程序员内心里面难以释怀的感受。互联网大行其道的今天,我们期待着MFC技术能够恢复其曾经的辉煌,或许这个期待会永远成为一种“梦想”,或许一切皆有可能……
        我们希望这个版块可以很好的适配Web时代,期待更好的互联网技术能够使得MFC技术框架得以重现活力,……

试试用AI创作助手写篇文章吧