关于“海量日志数据，提取出某日访问百度次数最多的那个IP”的疑惑

brk1985 2014-02-20 03:14:51

　　1、海量日志数据，提取出某日访问百度次数最多的那个IP。

　　此题，在我之前的一篇文章算法里头有所提到，当时给出的方案是：IP的数目还是有限的，最多2^32个，所以可以考虑使用hash将ip直接存入内存，然后进行统计。

　　再详细介绍下此方案：首先把这一天访问百度的日志中的IP取出来，逐个写入到一个大文件中。注意到IP是32位的，最多有2^32个IP。同样可以采用映射的方法，比如模1000，把整个大文件映射为1000个小文件，再找出每个小文件中出现频率最大的IP（可以采用hash_map进行频率统计，然后再找出频率最大的几个）及相应的频率。然后再在这1000个最大的IP中，找出那个频率最大的IP，即为所求。

IP地址最多有2^32=4G种取值可能，所以不能完全加载到内存中。可以考虑分而治之的策略，按照IP地址的hash(IP)%1024值，将海量日志存储到1024个小文件中。每个小文件最多包含4M个IP地址。对于每个小文件，可以构建一个IP作为key，出现次数作为value的hash_map，并记录当前出现次数最多的1个IP地址。有了1024个小文件中的出现次数最多的IP，我们就可以轻松得到总体上出现次数最多的IP。

从网上找到这道面试题的思路如上，上述思路根本凸显不了hash_map的好处吧？hash_map的长处是查找效率高，此题根本没查找，只有遍历，然后比较求最大值，是不是用map更好？

...全文

406 17 打赏收藏转发到动态举报

写回复

用AI写文章

17 条回复

切换为时间正序

请发表友善的回复…

发表回复

brk1985 2014-02-26

打赏
举报

引用 13 楼 zhao4zhong1 的回复:

[quote=引用 10 楼 brk1985 的回复:] [quote=引用 8 楼 zhao4zhong1 的回复:] 仅供参考：

// Sort the contents of file 1, remove duplicate lines, and save the result to file 2
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAXCHARS 128      // maximum line width handled, including the trailing \n and the terminating \0
// MAXLINES: current capacity of buf, in lines; MAXLINES2: capacity requested when buf is grown
int MAXLINES=10000,MAXLINES2;
// buf: line storage, one MAXCHARS-byte slot per kept line; buf2: receives the realloc result while growing
char *buf,*buf2;
// c: character read while skipping the rest of an over-long line; n: number of lines stored in buf;
// hh, i, L: general-purpose scratch integers used by main
int c,n,hh,i,L;
// f: the input file while reading, then the output file while writing
FILE *f;
// ln: buffer for the line currently being read
char ln[MAXCHARS];
// ignore_case: set to 1 by main when a third command-line argument is present
int ignore_case=0;
/*
 * qsort comparator: orders two NUL-terminated lines ignoring ASCII letter case.
 *
 * The folding is done by hand instead of calling stricmp(), which is neither
 * ISO C nor POSIX and therefore does not build outside MSVC.  Bytes are
 * compared as unsigned char so characters >= 0x80 order consistently.
 *
 * Returns <0, 0 or >0 when arg1 sorts before, equal to or after arg2.
 */
int icompare(const void *arg1,const void *arg2) {
   const unsigned char *lhs=(const unsigned char *)arg1;
   const unsigned char *rhs=(const unsigned char *)arg2;
   int a,b;
   do {
      a=*lhs++;
      b=*rhs++;
      if (a>='A' && a<='Z') a+='a'-'A';
      if (b>='A' && b<='Z') b+='a'-'A';
   } while (a==b && 0!=a);
   return a-b;
}
/* qsort comparator: case-sensitive strcmp() ordering of two NUL-terminated lines. */
int compare(const void *arg1,const void *arg2) {
   const char *lhs=(const char *)arg1;
   const char *rhs=(const char *)arg2;
   return strcmp(lhs,rhs);
}
int main(int argc,char **argv) {
    if (argc<3) {
        printf("Unique line. Designed by zhao4zhong1@163.com. 2012-08-20\n");
        printf("Usage: %s src.txt uniqued.txt [-i]\n",argv[0]);
        return 1;
    }
    if (argc>3) ignore_case=1;//若存在命令行参数3，忽略大小写
    f=fopen(argv[1],"r");
    if (NULL==f) {
        printf("Can not find file %s!\n",argv[1]);
        return 1;
    }
    buf=(char *)malloc(MAXLINES*MAXCHARS);
    if (NULL==buf) {
        fclose(f);
        printf("Can not malloc(%d LINES*%d CHARS)!\n",MAXLINES,MAXCHARS);
        return 2;
    }
    n=0;
    hh=0;
    i=0;
    while (1) {
        if (NULL==fgets(ln,MAXCHARS,f)) break;//
        hh++;
        L=strlen(ln)-1;
        if ('\n'!=ln[L]) {//超长行忽略后面内容
            printf("%s Line %d too long(>%d),spilth ignored.\n",argv[1],hh,MAXCHARS);
            while (1) {
                c=fgetc(f);
                if ('\n'==c || EOF==c) break;//
            }
        }
        while (1) {//去掉行尾的'\n'和空格
            if ('\n'==ln[L] || ' '==ln[L]) {
                ln[L]=0;
                L--;
                if (L<0) break;//
            } else break;//
        }
        if (L>=0) {
            strcpy(buf+i,ln);i+=MAXCHARS;
            n++;
            if (n>=MAXLINES) {
                MAXLINES2=MAXLINES*2;
                if (MAXLINES2==1280000) MAXLINES2=2500000;
                buf2=(char *)realloc(buf,MAXLINES2*MAXCHARS);
                if (NULL==buf2) {
                    printf("Can not malloc(%d LINES*%d CHARS)!\n",MAXLINES2,MAXCHARS);
                    printf("WARNING: Lines >%d ignored.\n",MAXLINES);
                    break;//
                }
                buf=buf2;
                MAXLINES=MAXLINES2;
            }
        }
    }
    fclose(f);
    if (n>1) {
        if (ignore_case) qsort(buf,n,MAXCHARS,icompare);
        else qsort(buf,n,MAXCHARS,compare);
    }
    f=fopen(argv[2],"w");
    if (NULL==f) {
        free(buf);
        printf("Can not create file %s!\n",argv[2]);
        return 2;
    }
    fprintf(f,"%s\n",buf);
    if (n>1) {
        if (ignore_case) {
            hh=0;
            L=MAXCHARS;
            for (i=1;i<n;i++) {
                if (stricmp((const char *)buf+hh,(const char *)buf+L)) {
                    fprintf(f,"%s\n",buf+L);
                }
                hh=L;
                L+=MAXCHARS;
            }
        } else {
            hh=0;
            L=MAXCHARS;
            for (i=1;i<n;i++) {
                if ( strcmp((const char *)buf+hh,(const char *)buf+L)) {
                    fprintf(f,"%s\n",buf+L);
                }
                hh=L;
                L+=MAXCHARS;
            }
        }
    }
    fclose(f);
    free(buf);
    return 0;
}

你这个代码和这道题好像没怎么联系？能否讲的清楚些？[/quote] 修改排序函数为提取每行中的IP地址并排序修改去重功能为统计重复行数功能即可解决楼主的问题。如果内存不够用，改用64位机再试。[/quote] 哦，你这个思路和12楼的思路刚好形成互为补充。这样的题目，“输入”数据应该是“已经统计好的ip和访问次数”的对应数据集吧？

brk1985 2014-02-26

打赏
举报

引用 15 楼 q191201771 的回复:

另外提个疑问对ip hash能均匀分布吗

没研究的这么细致，那个hash函数来自网络，CString类型及其他自定义类型没有默认的hash函数，需要自己写，我网上参考了一个。我模拟生成的IP数据，是直接通过遍历，生成一段连续的IP和访问次数。。。就是看到网上清一色的说hash_map实现，感到纳闷，我这里实现的时候根本没有用到find函数（查找函数），而hash_map相比map的优势也在大数据量查找时高效率，没有查找，我在想，map优势更大啊？

brk1985 2014-02-26

打赏
举报

引用 12 楼 baihacker 的回复:

我是指在已经准备分为若干文件的情况下,分为256个文件,然后读入一个文件的数据.由于每个文件中的ip的第一个数字相同,而其它三个数字只有2^16种可能性,直接开数组来计算每个ip出现多少次,然后求出最多的. 题外话:4G内存,是可以随便开的,win64,物理内存16G. 而服务器,几十个G内存很正常.

其它三个数字应该是只有2^24(16M)种可能性，你这种思路，“ip、访问次数”这种数据需要进行排序，然后数组索引和ip一一对应，每次比较最大访问次数值的时候，记录最大值索引和最大访问次数。是这样的吧？这样的思路其实也有map一一对应（映射）的思路。。。

就想叫yoko 2014-02-26

打赏
举报

另外提个疑问对ip hash能均匀分布吗

就想叫yoko 2014-02-25

打赏
举报

学习学习～～

赵4老师 2014-02-21

打赏
举报

引用 10 楼 brk1985 的回复:

[quote=引用 8 楼 zhao4zhong1 的回复:] 仅供参考：

// Sort the contents of file 1, remove duplicate lines, and save the result to file 2
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAXCHARS 128      // maximum line width handled, including the trailing \n and the terminating \0
// MAXLINES: current capacity of buf, in lines; MAXLINES2: capacity requested when buf is grown
int MAXLINES=10000,MAXLINES2;
// buf: line storage, one MAXCHARS-byte slot per kept line; buf2: receives the realloc result while growing
char *buf,*buf2;
// c: character read while skipping the rest of an over-long line; n: number of lines stored in buf;
// hh, i, L: general-purpose scratch integers used by main
int c,n,hh,i,L;
// f: the input file while reading, then the output file while writing
FILE *f;
// ln: buffer for the line currently being read
char ln[MAXCHARS];
// ignore_case: set to 1 by main when a third command-line argument is present
int ignore_case=0;
/*
 * qsort comparator: orders two NUL-terminated lines ignoring ASCII letter case.
 *
 * The folding is done by hand instead of calling stricmp(), which is neither
 * ISO C nor POSIX and therefore does not build outside MSVC.  Bytes are
 * compared as unsigned char so characters >= 0x80 order consistently.
 *
 * Returns <0, 0 or >0 when arg1 sorts before, equal to or after arg2.
 */
int icompare(const void *arg1,const void *arg2) {
   const unsigned char *lhs=(const unsigned char *)arg1;
   const unsigned char *rhs=(const unsigned char *)arg2;
   int a,b;
   do {
      a=*lhs++;
      b=*rhs++;
      if (a>='A' && a<='Z') a+='a'-'A';
      if (b>='A' && b<='Z') b+='a'-'A';
   } while (a==b && 0!=a);
   return a-b;
}
/* qsort comparator: case-sensitive strcmp() ordering of two NUL-terminated lines. */
int compare(const void *arg1,const void *arg2) {
   const char *lhs=(const char *)arg1;
   const char *rhs=(const char *)arg2;
   return strcmp(lhs,rhs);
}
int main(int argc,char **argv) {
    if (argc<3) {
        printf("Unique line. Designed by zhao4zhong1@163.com. 2012-08-20\n");
        printf("Usage: %s src.txt uniqued.txt [-i]\n",argv[0]);
        return 1;
    }
    if (argc>3) ignore_case=1;//若存在命令行参数3，忽略大小写
    f=fopen(argv[1],"r");
    if (NULL==f) {
        printf("Can not find file %s!\n",argv[1]);
        return 1;
    }
    buf=(char *)malloc(MAXLINES*MAXCHARS);
    if (NULL==buf) {
        fclose(f);
        printf("Can not malloc(%d LINES*%d CHARS)!\n",MAXLINES,MAXCHARS);
        return 2;
    }
    n=0;
    hh=0;
    i=0;
    while (1) {
        if (NULL==fgets(ln,MAXCHARS,f)) break;//
        hh++;
        L=strlen(ln)-1;
        if ('\n'!=ln[L]) {//超长行忽略后面内容
            printf("%s Line %d too long(>%d),spilth ignored.\n",argv[1],hh,MAXCHARS);
            while (1) {
                c=fgetc(f);
                if ('\n'==c || EOF==c) break;//
            }
        }
        while (1) {//去掉行尾的'\n'和空格
            if ('\n'==ln[L] || ' '==ln[L]) {
                ln[L]=0;
                L--;
                if (L<0) break;//
            } else break;//
        }
        if (L>=0) {
            strcpy(buf+i,ln);i+=MAXCHARS;
            n++;
            if (n>=MAXLINES) {
                MAXLINES2=MAXLINES*2;
                if (MAXLINES2==1280000) MAXLINES2=2500000;
                buf2=(char *)realloc(buf,MAXLINES2*MAXCHARS);
                if (NULL==buf2) {
                    printf("Can not malloc(%d LINES*%d CHARS)!\n",MAXLINES2,MAXCHARS);
                    printf("WARNING: Lines >%d ignored.\n",MAXLINES);
                    break;//
                }
                buf=buf2;
                MAXLINES=MAXLINES2;
            }
        }
    }
    fclose(f);
    if (n>1) {
        if (ignore_case) qsort(buf,n,MAXCHARS,icompare);
        else qsort(buf,n,MAXCHARS,compare);
    }
    f=fopen(argv[2],"w");
    if (NULL==f) {
        free(buf);
        printf("Can not create file %s!\n",argv[2]);
        return 2;
    }
    fprintf(f,"%s\n",buf);
    if (n>1) {
        if (ignore_case) {
            hh=0;
            L=MAXCHARS;
            for (i=1;i<n;i++) {
                if (stricmp((const char *)buf+hh,(const char *)buf+L)) {
                    fprintf(f,"%s\n",buf+L);
                }
                hh=L;
                L+=MAXCHARS;
            }
        } else {
            hh=0;
            L=MAXCHARS;
            for (i=1;i<n;i++) {
                if ( strcmp((const char *)buf+hh,(const char *)buf+L)) {
                    fprintf(f,"%s\n",buf+L);
                }
                hh=L;
                L+=MAXCHARS;
            }
        }
    }
    fclose(f);
    free(buf);
    return 0;
}

你这个代码和这道题好像没怎么联系？能否讲的清楚些？[/quote] 修改排序函数为提取每行中的IP地址并排序修改去重功能为统计重复行数功能即可解决楼主的问题。如果内存不够用，改用64位机再试。

baihacker 2014-02-21

打赏
举报

我是指在已经准备分为若干文件的情况下,分为256个文件,然后读入一个文件的数据.由于每个文件中的ip的第一个数字相同,而其它三个数字只有2^16种可能性,直接开数组来计算每个ip出现多少次,然后求出最多的. 题外话:4G内存,是可以随便开的,win64,物理内存16G. 而服务器,几十个G内存很正常.

brk1985 2014-02-21

打赏
举报

引用 2 楼 luciferisnotsatan 的回复:

这个不用找吧，每个小文件里记录下目前最大IP和数量，一旦有其它IP超过了，就替换。

你的意思也是没必要用hash_map吗？我在9楼给了我的编码思路：1、生成若干个子文件(模拟)；2、对于每个小文件，可以构建一个IP作为key，出现次数作为value的hash_map；3、获取每个子文件中访问次数最多的键值对；4、获取所有文件中访问次数最多的键值对，即对每个子文件中访问最多的键值对求取最大值。我这样的思路，能求解，但是感觉效率不怎么高。

brk1985 2014-02-21

打赏
举报

引用 8 楼 zhao4zhong1 的回复:

仅供参考：

// Sort the contents of file 1, remove duplicate lines, and save the result to file 2
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAXCHARS 128      // maximum line width handled, including the trailing \n and the terminating \0
// MAXLINES: current capacity of buf, in lines; MAXLINES2: capacity requested when buf is grown
int MAXLINES=10000,MAXLINES2;
// buf: line storage, one MAXCHARS-byte slot per kept line; buf2: receives the realloc result while growing
char *buf,*buf2;
// c: character read while skipping the rest of an over-long line; n: number of lines stored in buf;
// hh, i, L: general-purpose scratch integers used by main
int c,n,hh,i,L;
// f: the input file while reading, then the output file while writing
FILE *f;
// ln: buffer for the line currently being read
char ln[MAXCHARS];
// ignore_case: set to 1 by main when a third command-line argument is present
int ignore_case=0;
/*
 * qsort comparator: orders two NUL-terminated lines ignoring ASCII letter case.
 *
 * The folding is done by hand instead of calling stricmp(), which is neither
 * ISO C nor POSIX and therefore does not build outside MSVC.  Bytes are
 * compared as unsigned char so characters >= 0x80 order consistently.
 *
 * Returns <0, 0 or >0 when arg1 sorts before, equal to or after arg2.
 */
int icompare(const void *arg1,const void *arg2) {
   const unsigned char *lhs=(const unsigned char *)arg1;
   const unsigned char *rhs=(const unsigned char *)arg2;
   int a,b;
   do {
      a=*lhs++;
      b=*rhs++;
      if (a>='A' && a<='Z') a+='a'-'A';
      if (b>='A' && b<='Z') b+='a'-'A';
   } while (a==b && 0!=a);
   return a-b;
}
/* qsort comparator: case-sensitive strcmp() ordering of two NUL-terminated lines. */
int compare(const void *arg1,const void *arg2) {
   const char *lhs=(const char *)arg1;
   const char *rhs=(const char *)arg2;
   return strcmp(lhs,rhs);
}
int main(int argc,char **argv) {
    if (argc<3) {
        printf("Unique line. Designed by zhao4zhong1@163.com. 2012-08-20\n");
        printf("Usage: %s src.txt uniqued.txt [-i]\n",argv[0]);
        return 1;
    }
    if (argc>3) ignore_case=1;//若存在命令行参数3，忽略大小写
    f=fopen(argv[1],"r");
    if (NULL==f) {
        printf("Can not find file %s!\n",argv[1]);
        return 1;
    }
    buf=(char *)malloc(MAXLINES*MAXCHARS);
    if (NULL==buf) {
        fclose(f);
        printf("Can not malloc(%d LINES*%d CHARS)!\n",MAXLINES,MAXCHARS);
        return 2;
    }
    n=0;
    hh=0;
    i=0;
    while (1) {
        if (NULL==fgets(ln,MAXCHARS,f)) break;//
        hh++;
        L=strlen(ln)-1;
        if ('\n'!=ln[L]) {//超长行忽略后面内容
            printf("%s Line %d too long(>%d),spilth ignored.\n",argv[1],hh,MAXCHARS);
            while (1) {
                c=fgetc(f);
                if ('\n'==c || EOF==c) break;//
            }
        }
        while (1) {//去掉行尾的'\n'和空格
            if ('\n'==ln[L] || ' '==ln[L]) {
                ln[L]=0;
                L--;
                if (L<0) break;//
            } else break;//
        }
        if (L>=0) {
            strcpy(buf+i,ln);i+=MAXCHARS;
            n++;
            if (n>=MAXLINES) {
                MAXLINES2=MAXLINES*2;
                if (MAXLINES2==1280000) MAXLINES2=2500000;
                buf2=(char *)realloc(buf,MAXLINES2*MAXCHARS);
                if (NULL==buf2) {
                    printf("Can not malloc(%d LINES*%d CHARS)!\n",MAXLINES2,MAXCHARS);
                    printf("WARNING: Lines >%d ignored.\n",MAXLINES);
                    break;//
                }
                buf=buf2;
                MAXLINES=MAXLINES2;
            }
        }
    }
    fclose(f);
    if (n>1) {
        if (ignore_case) qsort(buf,n,MAXCHARS,icompare);
        else qsort(buf,n,MAXCHARS,compare);
    }
    f=fopen(argv[2],"w");
    if (NULL==f) {
        free(buf);
        printf("Can not create file %s!\n",argv[2]);
        return 2;
    }
    fprintf(f,"%s\n",buf);
    if (n>1) {
        if (ignore_case) {
            hh=0;
            L=MAXCHARS;
            for (i=1;i<n;i++) {
                if (stricmp((const char *)buf+hh,(const char *)buf+L)) {
                    fprintf(f,"%s\n",buf+L);
                }
                hh=L;
                L+=MAXCHARS;
            }
        } else {
            hh=0;
            L=MAXCHARS;
            for (i=1;i<n;i++) {
                if ( strcmp((const char *)buf+hh,(const char *)buf+L)) {
                    fprintf(f,"%s\n",buf+L);
                }
                hh=L;
                L+=MAXCHARS;
            }
        }
    }
    fclose(f);
    free(buf);
    return 0;
}

你这个代码和这道题好像没怎么联系？能否讲的清楚些？

brk1985 2014-02-21

打赏
举报

引用 6 楼 baihacker 的回复:

[quote=引用 5 楼 brk1985 的回复:] [quote=引用 1 楼 baihacker 的回复:] 输入一个ip,还有一个关联的int,所以是在遍历过程中,查找对应的int,然后进行计数. 另外,这个场景下直接开个4M的int就完了.

我以下的做法没有使用find查找函数，边遍历边比较，不知道和你说的哪种效率高？你的想法是先求得int最大的，然后用find函数求ip？


// Returns the (IP, count) entry with the largest count stored in ipMap.
//
// The map is taken by const reference so that scanning it does not copy every
// entry; callers that passed a map by value keep working unchanged.
// When several entries share the largest count, the first one met in
// iteration order is returned.  An empty map yields a default-constructed
// MyPair (empty string, count 0) instead of dereferencing end().
MyPair MaxValue(const hash_map<MyString, int, MyHashCompare<MyString> >& ipMap)// maximum of one hash_map
{
	// Use the iterator type of the exact map specialisation that is passed in.
	typedef hash_map<MyString, int, MyHashCompare<MyString> > IpCountMap;

	IpCountMap::const_iterator iter = ipMap.begin();
	if (iter == ipMap.end())
	{
		return MyPair();
	}

	// Remember only the position of the best entry; the string is copied once at the end.
	IpCountMap::const_iterator best = iter;
	for (++iter; iter != ipMap.end(); ++iter)
	{
		if (best->second < iter->second)
		{
			best = iter;
		}
	}

	MyPair result;
	result.first = best->first;
	result.second = best->second;
	return result;
}

[/quote] 你这里的ipMap已经构建好了,我是指 int best = 0, value = 0; foreach(int ip: 某个源) // 其中所有的ip的开头那个数字一样 { const int mask = (1<<24)-1; if (++cnt[ip&mask] > best) value = ip, best = cnt[ip&mask]; } [/quote] 你的思路也不太明白。。。关于这道题能否给个编程思路（你的理解）或纠正下我的编程思路？大数据文件，我不知道怎么构造，我只做了一个简单的模拟。编码思路：1、生成若干个子文件；2、对于每个小文件，可以构建一个IP作为key，出现次数作为value的hash_map；3、获取每个子文件中访问次数最多的键值对；4、获取所有文件中访问次数最多的键值对，即对每个子文件中访问最多的键值对求取最大值。


#include "StdAfx.h"
#include <atlstr.h>
#include <hash_map>
#include <string>
#include <iostream>
#include <time.h>
using namespace std;
using namespace stdext;

#define FileNum (1024)

//typedef CString ClassA;
// CString subclass that carries the hash function used by MyHashCompare.
class MyString: public CString{
public:
	// Computes a hash of str (the receiver object itself is not consulted).
	//
	// At most about 16 evenly spaced characters are sampled, so the cost stays
	// bounded for long strings.  Each sampled character is mixed in with a
	// multiply-and-add step: a plain sum would give the same hash to strings
	// that only differ in character order (e.g. "192.168.0.112" and
	// "192.168.0.121"), piling such IPs into the same bucket.
	inline size_t hash_value(const MyString& str) const
	{
		size_t value = _HASH_SEED;
		size_t size  = str.GetLength();
		if (size > 0) {
			size_t stride = (size / 16) + 1;
			size -= stride;
			for (size_t idx = 0; idx <= size; idx += stride) {
				value = value * 31 + (size_t)str[(int)idx];
			}
		}
		return(value);
	}
};

// hash_map traits class for key types that expose a
// `size_t hash_value(const KeyT&) const` member (such as MyString).
// Hashing is delegated to that member; ordering is delegated to the comparison
// object inherited from hash_compare.
template<class KeyT>
class MyHashCompare : public hash_compare<KeyT>
{
public:
	// Hash functor: returns the key's own hash.
	size_t operator()(const KeyT& key) const
	{
		return(key.hash_value(key));// adapt this call if the key type exposes its hash differently
	}

	// Ordering functor.  `comp` lives in a dependent base class, so it has to
	// be reached through `this->` to be found by standard two-phase lookup.
	bool operator()(const KeyT& lhs, const KeyT& rhs) const
	{
		return (this->comp(lhs, rhs));
	}
};

typedef pair<MyString, int> MyPair;
// Returns the (IP, count) entry with the largest count stored in ipMap.
//
// The map is taken by const reference so that scanning it does not copy every
// entry; callers that passed a map by value keep working unchanged.
// When several entries share the largest count, the first one met in
// iteration order is returned.  An empty map yields a default-constructed
// MyPair (empty string, count 0) instead of dereferencing end().
MyPair MaxValue(const hash_map<MyString, int, MyHashCompare<MyString> >& ipMap)// maximum of one hash_map
{
	// Use the iterator type of the exact map specialisation that is passed in.
	typedef hash_map<MyString, int, MyHashCompare<MyString> > IpCountMap;

	IpCountMap::const_iterator iter = ipMap.begin();
	if (iter == ipMap.end())
	{
		return MyPair();
	}

	// Remember only the position of the best entry; the string is copied once at the end.
	IpCountMap::const_iterator best = iter;
	for (++iter; iter != ipMap.end(); ++iter)
	{
		if (best->second < iter->second)
		{
			best = iter;
		}
	}

	MyPair result;
	result.first = best->first;
	result.second = best->second;
	return result;
}

// Returns the entry with the largest count among the first num elements of
// tempPairs.  On ties the earliest element wins.  A null array or num <= 0
// yields a default-constructed MyPair instead of reading tempPairs[0].
MyPair MaxKey(MyPair* tempPairs, int num)
{
	if (!tempPairs || num <= 0)
	{
		return MyPair();
	}

	// Track the index of the best entry so no string is copied during the scan.
	int best = 0;
	for (int i=1; i<num; i++)
	{
		if (tempPairs[best].second < tempPairs[i].second)
		{
			best = i;
		}
	}
	return tempPairs[best];
}

// Demo driver for the "most frequent IP" exercise:
//   1. writes 5 sample files data1.txt..data5.txt, each holding 20 records
//      (an IP line followed by a random count line),
//   2. loads every file into its own hash_map (IP -> count),
//   3. prints the best entry of every file and then the overall best entry.
// Returns 0 on success, 1 when a data file cannot be created or opened.
int main()
{
	hash_map<MyString, int, MyHashCompare<MyString> > ipMap[FileNum];
	FILE* fpData[FileNum];
	MyPair tempPairs[FileNum];
	int iFileNum=0;

	// Generate the 5 sample files; a new file is started every 20 addresses.
	MyString strTemp1,strTemp2;
	for (int i=101; i<=200; i++)
	{
		if ((i-1)%20==0)
		{
			MyString strTemp;
			strTemp.Format("data%d.txt",++iFileNum);
			fpData[iFileNum-1]=fopen((LPCTSTR)strTemp,"wb");//fpData[0]==data1.txt
			if (NULL==fpData[iFileNum-1])
			{
				cout << "Can not create file " << (LPCTSTR)strTemp << endl;
				for (int j=0; j<iFileNum-1; j++)
				{
					fclose(fpData[j]);
				}
				return 1;
			}
		}
		strTemp1.Format("192.168.0.%d\r\n",i);
		fwrite(strTemp1,1,strTemp1.GetLength(),fpData[iFileNum-1]);
		strTemp2.Format("%d\r\n",rand());
		if (i%20==0)
		{
			strTemp2.Format("%d",rand());// the last line of a file has no line break
		}
		fwrite(strTemp2,1,strTemp2.GetLength(),fpData[iFileNum-1]);
	}
	for (int j=0; j<iFileNum; j++)
	{
		fclose(fpData[j]);
	}

	// Load the IPs and counts of each file into its hash_map.
	for (int k=0; k<iFileNum; k++)
	{
		MyString strTemp;
		strTemp.Format("data%d.txt",k+1);
		fpData[k]=fopen((LPCTSTR)strTemp,"rb");//fpData[0]==data1.txt
		if (NULL==fpData[k])
		{
			cout << "Can not open file " << (LPCTSTR)strTemp << endl;
			return 1;
		}
		// Both lines of a record are read into plain char buffers: letting
		// fgets write through a pointer into an empty CString overruns its
		// internal storage.  The loop ends on the first failed read instead
		// of testing feof() before reading.
		char tempIP[25];
		char tempNum[15];
		while (NULL!=fgets(tempIP,sizeof(tempIP),fpData[k])
			&& NULL!=fgets(tempNum,sizeof(tempNum),fpData[k]))
		{
			MyString strIP;
			strIP.Format("%s",tempIP);// MyString has no converting constructor, so go through Format
			strIP.TrimRight();// drop the trailing "\r\n" so it does not become part of the key
			cout<< "strIP==" << strIP<< endl;
			int visits=_ttoi(tempNum);
			cout<< "strNum==" << visits<< endl;
			ipMap[k].insert(MyPair(strIP,visits));
		}
		fclose(fpData[k]);
	}

	// Best entry per file (MaxValue is evaluated once per map), then the overall best.
	for (int k=0; k<iFileNum; k++){
		tempPairs[k]=MaxValue(ipMap[k]);
		cout << "Max==" << tempPairs[k].first << "--" << tempPairs[k].second<< endl;
	}
	MyPair m_maxPair=MaxKey(tempPairs,iFileNum);
	cout << "Max==" << m_maxPair.first << "--" << m_maxPair.second<< endl;// the IP with the most visits

	system("pause");
	return 0;
}

赵4老师 2014-02-20

打赏
举报

仅供参考：

// Sort the contents of file 1, remove duplicate lines, and save the result to file 2
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAXCHARS 128      // maximum line width handled, including the trailing \n and the terminating \0
// MAXLINES: current capacity of buf, in lines; MAXLINES2: capacity requested when buf is grown
int MAXLINES=10000,MAXLINES2;
// buf: line storage, one MAXCHARS-byte slot per kept line; buf2: receives the realloc result while growing
char *buf,*buf2;
// c: character read while skipping the rest of an over-long line; n: number of lines stored in buf;
// hh, i, L: general-purpose scratch integers used by main
int c,n,hh,i,L;
// f: the input file while reading, then the output file while writing
FILE *f;
// ln: buffer for the line currently being read
char ln[MAXCHARS];
// ignore_case: set to 1 by main when a third command-line argument is present
int ignore_case=0;
/*
 * qsort comparator: orders two NUL-terminated lines ignoring ASCII letter case.
 *
 * The folding is done by hand instead of calling stricmp(), which is neither
 * ISO C nor POSIX and therefore does not build outside MSVC.  Bytes are
 * compared as unsigned char so characters >= 0x80 order consistently.
 *
 * Returns <0, 0 or >0 when arg1 sorts before, equal to or after arg2.
 */
int icompare(const void *arg1,const void *arg2) {
   const unsigned char *lhs=(const unsigned char *)arg1;
   const unsigned char *rhs=(const unsigned char *)arg2;
   int a,b;
   do {
      a=*lhs++;
      b=*rhs++;
      if (a>='A' && a<='Z') a+='a'-'A';
      if (b>='A' && b<='Z') b+='a'-'A';
   } while (a==b && 0!=a);
   return a-b;
}
/* qsort comparator: case-sensitive strcmp() ordering of two NUL-terminated lines. */
int compare(const void *arg1,const void *arg2) {
   const char *lhs=(const char *)arg1;
   const char *rhs=(const char *)arg2;
   return strcmp(lhs,rhs);
}
int main(int argc,char **argv) {
    if (argc<3) {
        printf("Unique line. Designed by zhao4zhong1@163.com. 2012-08-20\n");
        printf("Usage: %s src.txt uniqued.txt [-i]\n",argv[0]);
        return 1;
    }
    if (argc>3) ignore_case=1;//若存在命令行参数3，忽略大小写
    f=fopen(argv[1],"r");
    if (NULL==f) {
        printf("Can not find file %s!\n",argv[1]);
        return 1;
    }
    buf=(char *)malloc(MAXLINES*MAXCHARS);
    if (NULL==buf) {
        fclose(f);
        printf("Can not malloc(%d LINES*%d CHARS)!\n",MAXLINES,MAXCHARS);
        return 2;
    }
    n=0;
    hh=0;
    i=0;
    while (1) {
        if (NULL==fgets(ln,MAXCHARS,f)) break;//
        hh++;
        L=strlen(ln)-1;
        if ('\n'!=ln[L]) {//超长行忽略后面内容
            printf("%s Line %d too long(>%d),spilth ignored.\n",argv[1],hh,MAXCHARS);
            while (1) {
                c=fgetc(f);
                if ('\n'==c || EOF==c) break;//
            }
        }
        while (1) {//去掉行尾的'\n'和空格
            if ('\n'==ln[L] || ' '==ln[L]) {
                ln[L]=0;
                L--;
                if (L<0) break;//
            } else break;//
        }
        if (L>=0) {
            strcpy(buf+i,ln);i+=MAXCHARS;
            n++;
            if (n>=MAXLINES) {
                MAXLINES2=MAXLINES*2;
                if (MAXLINES2==1280000) MAXLINES2=2500000;
                buf2=(char *)realloc(buf,MAXLINES2*MAXCHARS);
                if (NULL==buf2) {
                    printf("Can not malloc(%d LINES*%d CHARS)!\n",MAXLINES2,MAXCHARS);
                    printf("WARNING: Lines >%d ignored.\n",MAXLINES);
                    break;//
                }
                buf=buf2;
                MAXLINES=MAXLINES2;
            }
        }
    }
    fclose(f);
    if (n>1) {
        if (ignore_case) qsort(buf,n,MAXCHARS,icompare);
        else qsort(buf,n,MAXCHARS,compare);
    }
    f=fopen(argv[2],"w");
    if (NULL==f) {
        free(buf);
        printf("Can not create file %s!\n",argv[2]);
        return 2;
    }
    fprintf(f,"%s\n",buf);
    if (n>1) {
        if (ignore_case) {
            hh=0;
            L=MAXCHARS;
            for (i=1;i<n;i++) {
                if (stricmp((const char *)buf+hh,(const char *)buf+L)) {
                    fprintf(f,"%s\n",buf+L);
                }
                hh=L;
                L+=MAXCHARS;
            }
        } else {
            hh=0;
            L=MAXCHARS;
            for (i=1;i<n;i++) {
                if ( strcmp((const char *)buf+hh,(const char *)buf+L)) {
                    fprintf(f,"%s\n",buf+L);
                }
                hh=L;
                L+=MAXCHARS;
            }
        }
    }
    fclose(f);
    free(buf);
    return 0;
}

brk1985 2014-02-20

打赏
举报

引用 3 楼 adlay 的回复:

不保存每一个 ip 的访问次数，如何知道一个 ip 的访问次数是否超过了当前记录下的最大值？

按你的说法map也行，此题不知道考察的是否是map和hash_map的查找效率吧？

baihacker 2014-02-20

打赏
举报

引用 5 楼 brk1985 的回复:

[quote=引用 1 楼 baihacker 的回复:] 输入一个ip,还有一个关联的int,所以是在遍历过程中,查找对应的int,然后进行计数. 另外,这个场景下直接开个4M的int就完了.

我以下的做法没有使用find查找函数，边遍历边比较，不知道和你说的哪种效率高？你的想法是先求得int最大的，然后用find函数求ip？


// Returns the (IP, count) entry with the largest count stored in ipMap.
//
// The map is taken by const reference so that scanning it does not copy every
// entry; callers that passed a map by value keep working unchanged.
// When several entries share the largest count, the first one met in
// iteration order is returned.  An empty map yields a default-constructed
// MyPair (empty string, count 0) instead of dereferencing end().
MyPair MaxValue(const hash_map<MyString, int, MyHashCompare<MyString> >& ipMap)// maximum of one hash_map
{
	// Use the iterator type of the exact map specialisation that is passed in.
	typedef hash_map<MyString, int, MyHashCompare<MyString> > IpCountMap;

	IpCountMap::const_iterator iter = ipMap.begin();
	if (iter == ipMap.end())
	{
		return MyPair();
	}

	// Remember only the position of the best entry; the string is copied once at the end.
	IpCountMap::const_iterator best = iter;
	for (++iter; iter != ipMap.end(); ++iter)
	{
		if (best->second < iter->second)
		{
			best = iter;
		}
	}

	MyPair result;
	result.first = best->first;
	result.second = best->second;
	return result;
}

brk1985 2014-02-20

打赏
举报

引用 1 楼 baihacker 的回复:

输入一个ip,还有一个关联的int,所以是在遍历过程中,查找对应的int,然后进行计数. 另外,这个场景下直接开个4M的int就完了.

我以下的做法没有使用find查找函数，边遍历边比较，不知道和你说的哪种效率高？你的想法是先求得int最大的，然后用find函数求ip？


// Returns the (IP, count) entry with the largest count stored in ipMap.
//
// The map is taken by const reference so that scanning it does not copy every
// entry; callers that passed a map by value keep working unchanged.
// When several entries share the largest count, the first one met in
// iteration order is returned.  An empty map yields a default-constructed
// MyPair (empty string, count 0) instead of dereferencing end().
MyPair MaxValue(const hash_map<MyString, int, MyHashCompare<MyString> >& ipMap)// maximum of one hash_map
{
	// Use the iterator type of the exact map specialisation that is passed in.
	typedef hash_map<MyString, int, MyHashCompare<MyString> > IpCountMap;

	IpCountMap::const_iterator iter = ipMap.begin();
	if (iter == ipMap.end())
	{
		return MyPair();
	}

	// Remember only the position of the best entry; the string is copied once at the end.
	IpCountMap::const_iterator best = iter;
	for (++iter; iter != ipMap.end(); ++iter)
	{
		if (best->second < iter->second)
		{
			best = iter;
		}
	}

	MyPair result;
	result.first = best->first;
	result.second = best->second;
	return result;
}