求一处理文本的算法

abiao1421 2008-02-22 02:12:23

我现在有个文件a.txt是这样的：
订单入库
20010000,CZ1234567810000,PT1234567810000,1
20010001,CZ1234567810001,PT1234567810001,2
20010002,CZ1234567810002,PT1234567810002,3
20010002,CZ1234567810003,PT1234567810003,4
20010004,CZ1234567810004,PT1234567810004,5
20010005,CZ1234567810005,PT1234567810005,6
20010000,CZ1234567810006,PT1234567810006,7
20010007,CZ1234567810007,PT1234567810007,8
20010000,CZ1234567810008,PT1234567810008,9

现在要生成一个新文件 (订单入库.txt) ：
20010000
CZ1234567810000,PT1234567810000,1
CZ1234567810006,PT1234567810006,7
CZ1234567810008,PT1234567810008,9
20010001
CZ1234567810001,PT1234567810001,2
20010002
CZ1234567810002,PT1234567810002,3
CZ1234567810003,PT1234567810003,4
20010004
CZ1234567810004,PT1234567810004,5
20010005
CZ1234567810005,PT1234567810005,6
20010007
CZ1234567810007,PT1234567810007,8

a.txt里面的记录可能达到3万多条,有什么办法能比较快地处理出来,要求处理速度不要太慢.
我做了一个,但是速度太慢.

...全文

217 10 打赏收藏转发到动态举报

写回复

用AI写文章

10 条回复

切换为时间正序

请发表友善的回复…

发表回复

yesiwumian 2008-02-24

打赏
举报

sscanf + print 或其他脚本语言

Mnky 2008-02-24

打赏
举报

错了，逗号不是0x44，是0x2C

Mnky 2008-02-24

打赏
举报

虽然这是在c/c++板块，但是我觉得最简单的方法并不是程序实现。。。
用word打开，将“,CZ”替换成“^pCZ”，即将CZ前的逗号替换成回车，不就行了？
或者用ultraedit之类的工具，将 0x44 替换为 0x0D0A 即可。

jeakcowu 2008-02-24

打赏
举报

我怀疑以上几位兄台程序的性能：如果出现4G以上的文件，还按一行一行地读，我估计他们的程序都要崩溃。
我建议，首先，在内存充足的情况下，可以使用内排序方法。如果内存不充足的话，那么，完了，外排序吧，时间有的受了。
给你一个内排序的算法，UNIX下面是可以使用的。
/*thread_example.c : c multiple thread programming in linux

xlc++ -q64 -lpthread -g -o sg thread_example.cpp
*/

#ifdef WIN32
#include <windows.h>
#else
#include <sys/time.h>
#endif
#include <pthread.h>
#include <string.h>
#include "stdio.h"
#include <iostream>
#include <algorithm>
#include <functional>
#include <vector>
#include <sstream>
#include <string>
#include <fstream>

using namespace std;
// Handles of the four worker threads: written by thread_create_1, joined by thread_wait_1.
pthread_t thread[4];
// Initialised in ReadThread1; no lock/unlock call on it appears in the visible code.
pthread_mutex_t mut;
// `number` is not referenced in the visible code. `i` is a file-scope counter.
// NOTE(review): most functions declare their own local `i`, so check which one a use refers to.
int number=0, i;
// Incremented once per row emitted by Write().
int write_num=0;
// Number of fixed-size rows that are read, sorted and written.
long int LINENUM=200;
// Size of one row in bytes; row n starts at pData + n*ROWSIZE.
long int ROWSIZE=15;
// Input stream; opened in main() and read in ReadThread1().
ifstream inf;
// Raw row buffer allocated by init() and released in main().
char *pData;
// Key length in bytes handed to cutstr() and strcmp1().
int y = 2;
// 1-based column at which the key starts inside a row (offset x-1).
int x = 1;
// Output streams; only fp1[0] is used in the visible code.
FILE* fp1[2];
// Not referenced in the visible code.
FILE* fp;

// One sortable record: where the row lives in pData plus the key bytes copied out of it.
typedef struct
{
int num;     // row index; the row's bytes start at pData + num*ROWSIZE
char key[2]; // key bytes extracted from the row; exactly 2 bytes, no room for a terminator
}
Line;

// One record vector per worker thread; worker k fills and sorts vect[k], Write() merges them.
vector< Line> vect[4];

////////////////////////////////////////////////////////
//ReadThread1线程起来分2步，
//1，当第一次读取数据的时候ReadThread1(0,200,10,0);
// 0行开始，一行200字节，10行;
//2，当第二次读取数据的时候ReadThread1(2,200,10,0);
// 2行开始，一行200字节，10行;
///////////////////////////////////////////////////////
//
/*
 * Compare the first `length` bytes of two buffers.
 *
 * Unlike strncmp this does not stop at a NUL byte: exactly `length` bytes are
 * inspected unless a difference is found earlier. The previous version looked
 * at one byte more than requested, which read past the end of fixed-size keys.
 *
 * src, dst : buffers holding at least `length` readable bytes each.
 * length   : number of bytes to compare; values <= 0 compare nothing.
 * returns  : -1 if src sorts before dst, 1 if after, 0 if the ranges are equal.
 */
int strcmp1 ( char * src , char * dst, int length )
{
    for (int k = 0; k < length; ++k)
    {
        /* Plain char difference, as before, so the ordering of bytes is unchanged. */
        int diff = src[k] - dst[k];
        if (diff != 0)
            return diff < 0 ? -1 : 1;
    }

    return 0;
}

// Ordering predicate for std::sort and the merge in Write():
// true when a's key sorts strictly before b's key over the first `y` bytes.
bool SortKey(Line a,Line b)
{
    const int order = strcmp1(a.key, b.key, y);
    return order < 0;
}

/*
 * Copy a slice of `str1` into `result`.
 *
 * Exactly `lenth` bytes starting at offset `begin` are copied. The destination
 * is NOT NUL-terminated and nothing beyond `lenth` bytes is written, so a key
 * buffer of exactly `lenth` bytes is sufficient. The previous version cleared
 * a hard-coded 29 bytes, copied one byte too many and incremented the shared
 * global counter `i` from every worker thread.
 *
 * result : destination with room for at least `lenth` bytes.
 * str1   : source buffer; bytes [begin, begin+lenth) must be readable.
 * begin  : zero-based offset of the first byte to copy.
 * lenth  : number of bytes to copy; values <= 0 copy nothing.
 * returns: `result`.
 */
char * cutstr(char* result,char * str1,long int begin,long int lenth)
{
    if (lenth > 0)
    {
        memcpy(result, str1 + begin, (size_t)lenth);
    }
    return result;
}

/*
 * Merge the four sorted runs vect[0..3] and write the rows to fp1[0] in key order.
 *
 * Each step picks the smallest head among the runs that still have elements and
 * writes the ROWSIZE bytes of that row from pData. When keys tie, the run with
 * the higher index wins, which matches the pairwise comparison used before.
 *
 * The previous version selected the *exhausted* run once one run of a pair was
 * used up (e.g. `j[0]==n[0]` chose run 0) and then indexed past the end of it.
 *
 * write_num is incremented once per row written. The function stops early if
 * no run has an element left or if fwrite reports a failure.
 */
void Write()
{
    const long int quarter = LINENUM / 4;
    /* Run lengths as produced by the worker threads: three quarters plus the remainder. */
    const long int limit[4] = { quarter, quarter, quarter, LINENUM - quarter * 3 };
    long int pos[4] = { 0, 0, 0, 0 };

    for (long int row = 0; row < LINENUM; row++, write_num++)
    {
        int best = -1;

        for (int r = 0; r < 4; r++)
        {
            /* Skip runs that are used up (or shorter than expected). */
            if (pos[r] >= limit[r] || pos[r] >= (long int)vect[r].size())
                continue;

            /* Take r unless the current best is strictly smaller: ties go to the later run. */
            if (best == -1 || !SortKey(vect[best][pos[best]], vect[r][pos[r]]))
                best = r;
        }

        if (best == -1)
            break;  /* nothing left to merge */

        if (fwrite(&pData[vect[best][pos[best]].num * ROWSIZE], ROWSIZE, 1, fp1[0]) != 1)
        {
            fprintf(stderr, "write failed after %d rows\n", write_num);
            return;
        }
        pos[best]++;
    }
}

void *thread1_r(void *args)
{
for(int i=0;i<LINENUM/4;i++)
{
Line line;
line.num = i;
cutstr(line.key ,pData,(long int)(ROWSIZE*i+x-1) ,y);
vect[0].push_back(line);
}
sort(vect[0].begin(), vect[0].end(),SortKey);
pthread_exit(NULL);
return ((void *)0);
}

void *thread2_r(void *args)
{
for(int i=LINENUM/4;i<(LINENUM/4)*2;i++)
{
Line line;
line.num = i;
cutstr(line.key ,pData,(long int)(ROWSIZE*i+x-1) ,y);
vect[1].push_back(line);
}
sort(vect[1].begin(), vect[1].end(),SortKey);
pthread_exit(NULL);
return ((void *)0);
}

void *thread3_r(void *args)
{
for(int i=(LINENUM/4)*2;i<(LINENUM/4)*3;i++)
{
Line line;
line.num = i;
cutstr(line.key ,pData,(long int)(ROWSIZE*i+x-1) ,y);
vect[2].push_back(line);
}
sort(vect[2].begin(), vect[2].end(),SortKey);
pthread_exit(NULL);
return ((void *)0);
}

void *thread4_r(void *args)
{
for(int i=(LINENUM/4)*3;i<LINENUM;i++)
{
Line line;
line.num = i;
cutstr(line.key ,pData,(long int)(ROWSIZE*i+x-1) ,y);
vect[3].push_back(line);
}
sort(vect[3].begin(), vect[3].end(),SortKey);
pthread_exit(NULL);
return ((void *)0);
}

// Starts the four sorting workers when called with i == 0; any other value is a no-op.
// The thread handle array is zeroed first, then one status line is printed per
// pthread_create call (failure or success).
void thread_create_1(int i)
{
    if (i != 0)
        return;

    struct Job
    {
        void *(*entry)(void *);   // worker entry point
        const char *failed;       // message printed when pthread_create fails
        const char *created;      // message printed when it succeeds
    };
    const Job jobs[4] = {
        { thread1_r, "READ线程1创建失败!\n", "READ线程1被创建\n" },
        { thread2_r, "READ线程2创建失败\n",  "READ线程2被创建\n" },
        { thread3_r, "READ线程3创建失败\n",  "READ线程3被创建\n" },
        { thread4_r, "READ线程4创建失败\n",  "READ线程4被创建\n" },
    };

    memset(&thread, 0, sizeof(thread));

    for (int k = 0; k < 4; ++k)
    {
        const int rc = pthread_create(&thread[k], NULL, jobs[k].entry, NULL);
        printf("%s", rc != 0 ? jobs[k].failed : jobs[k].created);
    }
}

// Waits for each of the four workers whose handle is non-zero and prints one
// completion line per joined thread, numbered 1..4. The previous version
// reported threads 3 and 4 as "1" and "2" (copy-paste slip).
// NOTE(review): comparing a pthread_t with 0 assumes an integer/pointer handle type.
void thread_wait_1()
{
    for (int k = 0; k < 4; ++k)
    {
        if (thread[k] != 0)
        {
            pthread_join(thread[k], NULL);
            printf("线程%d已经结束\n", k + 1);
        }
    }
}

// Allocates the row buffer pData, sized (ROWSIZE+1)*(LINENUM+1) bytes.
// On allocation failure prints a message and terminates the process with exit(-1).
void init()
{
    pData = static_cast<char *>(malloc((ROWSIZE+1)*(LINENUM+1)));
    if (pData != NULL)
        return;

    // pData is NULL here, so there is nothing to release before exiting.
    printf("not enough memory! \n");
    exit(-1);
}

void ReadThread1()
{

//cout<<"内存初始化...";
//初始化内存块
//memset(pData,0,LineNum*LineSize);
//cout<<"ok"<<endl;
cout<<"读取内容...";
//读取到内存块
inf.read(pData,LINENUM*ROWSIZE);

cout<<"ok"<<endl;
cout<<"记录格式化..."<<endl;

//4个线程往vect[4]中写
pthread_mutex_init(&mut,NULL);
thread_create_1(0);
thread_wait_1();
cout<<"ok"<<endl;

}

int main(){

//计算读取文件时间
time_t tmcurrent = time(NULL);
init();
inf.open("abc.int");
inf >> noskipws;
ReadThread1();
time_t tmcurrent2 = time(NULL);
cout<<"read and sort time:"<<difftime(tmcurrent2,tmcurrent)<<" second"<<endl;
//输出到文件
//定义输出文件
fp1[0]=fopen("sss","wb");
Write();

time_t tmcurrent4 = time(NULL);
cout<<"write time:"<<difftime(tmcurrent4,tmcurrent2)<<" second"<<endl;
cout<<"total time:"<<difftime(tmcurrent4,tmcurrent)<<" second"<<endl;

fclose(fp1[0]);

//释放空间
free(pData);
return 0;
}

ChamPagneZ 2008-02-22

打赏
举报

4楼我要好好向你学习

manio 2008-02-22

打赏
举报

楼上好精力!

vrace 2008-02-22

打赏
举报

贴代码了，贴代码了

编译过后，使用命令行 <程序名> <要转换的文件> <输出文件> 来执行



#include <assert.h>
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>



/*
 * One input record, split at its first comma.
 * Items that share the same key are chained through `next`.
 */

struct orderitem

{

	char col[10];	/* key: the text before the first comma of the line */

	char rest[40];	/* everything after the first comma */

	struct orderitem *next;	/* next item with the same key; NULL at the tail */

};



/*
 * One entry per distinct key. Entries form a singly linked list in the order
 * in which the keys were first seen.
 */

struct orderindex

{

	char col[10];	/* copy of the key taken from the first item */

	struct orderitem *item;	/* head of the item chain for this key */

	struct orderindex *next;	/* next index entry; NULL at the tail */

};



/* Head of the index list; NULL while the list is empty. */
struct orderindex *_pidx;

/* Tail of the index list; addindex() appends after it. */
struct orderindex *_pidxlast;



/* Fill *pitem from one input line; returns 1 on success, 0 on a NULL argument. */
int setitem(struct orderitem *pitem, char *pstr);

/* Parse one line and file it under its key, creating the index entry if needed. */
void additem(char *pstr);

/* Append an index entry to the global index list. */
void addindex(struct orderindex *pindex);

/* Append pitem to the end of the item chain that starts at psearch. */
void addtail(struct orderitem *psearch, struct orderitem *pitem);

/* Write every key followed by its items to fp. */
void savefile(FILE *fp);

/* Free all index entries and items. */
void release(void);



/*
 * Usage: <program> <source file> <dest file>
 *
 * Reads the source file line by line, groups the records by their first
 * column and writes each key followed by its records to the dest file.
 *
 * Returns EXIT_SUCCESS when everything was read and written, EXIT_FAILURE when
 * a file cannot be opened or an I/O error is detected. A wrong argument count
 * prints the usage text and exits with EXIT_SUCCESS, as before.
 *
 * Lines longer than 79 bytes are split by fgets and handled as several records.
 */
int main(int argc, char *argv[])
{
	FILE *in;
	FILE *out;
	char line[80];
	int status = EXIT_SUCCESS;

	/* check for param */
	if(argc != 3)
	{
		printf("Usage: order.exe <source file> <dest file>\n");
		exit(EXIT_SUCCESS);
	}

	_pidx = NULL;
	_pidxlast = NULL;

	/* open source */
	in = fopen(argv[1], "r");
	if(!in)
	{
		printf("Can't open source file %s.\n", argv[1]);
		exit(EXIT_FAILURE);
	}

	/* open dest */
	out = fopen(argv[2], "w");
	if(!out)
	{
		printf("Can't open dest file %s.\n", argv[2]);
		fclose(in);
		exit(EXIT_FAILURE);
	}

	/* process */
	printf("Processing...\n");

	/*
	 * Loop on the fgets result rather than feof(): a final line without a
	 * trailing newline is still returned by fgets and must not be dropped.
	 */
	while(fgets(line, sizeof line, in) != NULL)
	{
		/* Cut at the first CR or LF; a line without either is left intact. */
		line[strcspn(line, "\r\n")] = '\0';

		additem(line);
	}

	if(ferror(in))
	{
		printf("Error while reading %s.\n", argv[1]);
		status = EXIT_FAILURE;
	}

	/* save */
	printf("Saving...\n");
	savefile(out);

	/* close */
	fclose(in);

	/* fclose flushes the buffered output, so its result tells whether the data reached the file. */
	if(fclose(out) != 0)
	{
		printf("Error while writing %s.\n", argv[2]);
		status = EXIT_FAILURE;
	}

	if(status == EXIT_SUCCESS)
	{
		printf("%s -> %s completed.\n", argv[1], argv[2]);
	}

	/* release memory */
	release();

	return status;
}



/*
 * Fill *pitem from one input line.
 *
 * The text before the first comma becomes the key (pitem->col) and the text
 * after it becomes pitem->rest. A line without a comma is stored entirely as
 * the key and leaves rest empty. Both fields are always NUL-terminated; text
 * that does not fit is truncated instead of overflowing the arrays. pitem->next
 * is cleared.
 *
 * pitem : item to initialise.
 * pstr  : NUL-terminated input line without its line terminator.
 * returns 1 on success, 0 when either argument is NULL (asserts in debug builds).
 */
int setitem(struct orderitem *pitem, char *pstr)
{
	const char *comma;
	size_t keylen;
	size_t restlen;

	/* check */
	assert(pitem != NULL);
	if(!pitem) return 0;

	assert(pstr != NULL);
	if(!pstr) return 0;

	/* reset memory; this also provides the terminating NUL of both fields */
	memset(pitem, 0, sizeof(struct orderitem));

	/* split parts */
	comma = strchr(pstr, ',');

	keylen = comma ? (size_t)(comma - pstr) : strlen(pstr);
	if(keylen > sizeof pitem->col - 1)
	{
		keylen = sizeof pitem->col - 1;
	}
	memcpy(pitem->col, pstr, keylen);

	if(comma)
	{
		restlen = strlen(comma + 1);
		if(restlen > sizeof pitem->rest - 1)
		{
			restlen = sizeof pitem->rest - 1;
		}
		memcpy(pitem->rest, comma + 1, restlen);
	}

	return 1;
}



/*
 * Append pitem to the end of the chain that starts at psearch.
 * A NULL argument trips an assert in debug builds and is ignored otherwise.
 * pitem->next is left as the caller set it.
 */
void addtail(struct orderitem *psearch, struct orderitem *pitem)
{
	struct orderitem *tail;

	assert(psearch != NULL);
	if(psearch == NULL) return;

	assert(pitem != NULL);
	if(pitem == NULL) return;

	/* walk to the last node, i.e. the one without a successor */
	for(tail = psearch; tail->next != NULL; tail = tail->next)
	{
		/* nothing to do while walking */
	}

	tail->next = pitem;
}



/*
 * Append pindex to the global index list and make it the new tail.
 * The first entry added also becomes the head. pindex->next is not touched.
 */
void addindex(struct orderindex *pindex)
{
	if(_pidxlast != NULL)
	{
		_pidxlast->next = pindex;
	}
	else
	{
		_pidx = pindex;
	}

	_pidxlast = pindex;
}



/*
 * Parse one input line into a freshly allocated item and file it under its key.
 *
 * If an index entry with the same key exists, the item is appended to that
 * entry's chain; otherwise a new index entry is created and appended to the
 * index list. Terminates the process with EXIT_FAILURE when memory runs out
 * or setitem() rejects the input. A NULL pstr is ignored (asserts in debug builds).
 */
void additem(char *pstr)
{
	struct orderitem *pitem;
	struct orderindex *idx;

	assert(pstr != NULL);
	if(pstr == NULL) return;

	/* allocate memory for new item */
	pitem = (struct orderitem*)malloc(sizeof *pitem);
	assert(pitem != NULL);
	if(pitem == NULL)
	{
		printf("Out of memory.\n");
		exit(EXIT_FAILURE);
	}

	/* set the item data */
	if(setitem(pitem, pstr) == 0)
	{
		exit(EXIT_FAILURE);
	}

	/* key already known: append to its chain and finish */
	for(idx = _pidx; idx != NULL; idx = idx->next)
	{
		if(strcmp(idx->col, pitem->col) == 0)
		{
			addtail(idx->item, pitem);
			return;
		}
	}

	/* key not seen before: create its index entry with this item as chain head */
	idx = (struct orderindex*)malloc(sizeof *idx);
	assert(idx != NULL);
	if(idx == NULL)
	{
		printf("Out of memory.\n");
		exit(EXIT_FAILURE);
	}

	memset(idx, 0, sizeof *idx);
	strcpy(idx->col, pitem->col);
	idx->item = pitem;
	addindex(idx);
}



/* save result to file */

void savefile(FILE *fp)

{

	struct orderindex *idx;

	struct orderitem *pitem;



	idx = _pidx;

	while(idx)

	{

		fprintf(fp, "%s\n", idx->col);



		pitem = idx->item;

		while(pitem)

		{

			fprintf(fp, "%s\n", pitem->rest);

			pitem = pitem->next;

		}



		idx = idx->next;

	}

}



/* release memory */

void release(void)

{

	struct orderindex *idx;

	struct orderitem *pitem;

	void *p;



	idx = _pidx;

	while(idx)

	{

		pitem = idx->item;

		while(pitem)

		{

			p = pitem;

			pitem = pitem->next;

			free(p);

		}



		p = idx;

		idx = idx->next;

		free(p);

	}

}

baihacker 2008-02-22

打赏
举报

狂晕,注意到你文件的格式

struct RECORD
{
char ??[];
char ??[];
int ??;
RECORD* next;
};

可以换为

struct RECORD
{
unsigned short key;
RECORD* next;
};

把alignment设置为2时占用空间为6Byte
记录占用空间为175.78125k
数组占用空间为78.125k
除去为记录分配内存的一个常数时间
排序几乎是可以在瞬间完成...
最后输出到文件

baihacker 2008-02-22

打赏
举报

说白了就是按第一个关键字排序的问题
可以不断读取,然后插入排序到目标文件...反复操作文件IO太慢...

只有看内存够大的情况下
struct RECORD
{
char ??[];
char ??[];
int ??;
RECORD* next;
};
struct PAIR
{
RECORD* head;
RECORD* tail;
};
一个足够大的PAIR数组(最大为9999)数组名为array
读入一个关键字如20010000时
1.映射为数组index = 0
2.可以在常数时间内访问到array[index];
3.head为空时:head,tail指向这个记录,这个记录的next为NULL;
4.head不为空时:new_record->next = 0, tail->next = new_record, tail = tail->next;
5.总之添加一个记录总是在常数时间
6.如果生成new_record在太耗费时间......可以根据文件大小(你的文件貌似比较规则)事先为所有的记录分配内存...