PageRank算法的实现源代码和原理(原创)

renhit 2007-02-02 05:18:18
最后面是源代码,前面是简单的原理介绍

转载请注明出处: http://renxijun.blog.sohu.com/32167163.html

曾经在网上找了很多关于PageRank的论文,基本上都是介绍原理的,没有找到计算过程的具体实现的源代码,前一阵公司有需要于是写了一个,比较简单(没有反作弊,Blog链接的处理,黑洞的处理都没有管),就是用极限编程的思想用最快的速度实现了一个个人感觉计算效率还不错,(没分块,其实前期分块后对后续的计算过程也是一样的了P4 3.0,512M),1700万网页迭代一次需要25秒钟的样子.

SortMap.dat是一个链接关系文件,已经按照DocID(每个网页的唯一编号,64位整型)进行排序了.基本上对这样的大文件排序都是进行分割,排序,归并实现的(大家有兴趣的话我下次将排序源代码也贴上来).如果您手头没有这样的链接数据话http://www.sogou.com/labs/dl/t-link.html(互联网语料链接关系库),这个里面可以提取,然后就是排序,然后就是下面的源代码计算,你就能看到传说中的PageRank了.Good Luck!



PageRank计算实现原理 PageRank基本原理 可能有点帮助吧,中文的PageRank学习笔记

后面是计算PageRank的源代码,希望大家喜欢:(这里贴源代码确实太难看了 )



#include <windows.h>
#include <winbase.h>
#include <stdio.h>

#include <iostream>
#include <fstream>
#include <vector>
#include <hash_map>
#include <hash_set>
#include <assert.h>
#include "../FileSort/Sort.h" //一个实现快速排序的文件,后面需要排序的函数都在这个文件中

#define BLOCKSIZE (64*1024*1024)
//#define GROUP 8
#define LENGTHOFSINGLE 8
#define OUTDEGREESIZE 2048
#pragma pack(1)

using namespace std;

//实现二分查找功能
template <typename type_key>
size_t binaryFind(type_key *begin,type_key *end,const type_key &value)
{
if(begin>=end) return end-begin;
//unsigned __int64 size = sizeof(type_key);
//long len = (end-begin);
int total = (end-begin);
int low=0;
int high=total-1;
int mid=0;
unsigned __int64 temp = 0;
while(low<=high)
{
mid = (low+high)/2;
temp = *(begin+mid);
if(temp==value)
{
return mid;
}
if(temp>value)
{
high=mid-1;
continue;
}
else
{
low=mid+1;
continue;
}
}
return total;
}


//完成将DocID转换成Index的过程,以后每次迭代不用再进行查找,算是中间的辅助函数
unsigned int BuildIndex(FILE * mapFile)
{
cout<<"start to build index"<<endl;
FILE *countFile = fopen("count.dat","rb");
if (countFile==NULL)
{
cout<<"count.dat can not open,please check the file"<<endl;
return 0;
}

unsigned int total;
unsigned int MaxOutDegree;
fread(&total,4,1,countFile);
fread(&MaxOutDegree,4,1,countFile);
fclose(countFile);
cout<<"total is :/t"<<total<<"/tand the MaxOutDegree:"<<MaxOutDegree<<endl;
if (total>0)
{
unsigned __int64 *index = new unsigned __int64[total];
char *sons = new char[MaxOutDegree*(LENGTHOFSINGLE+1)];
FILE *indexMap = fopen("indexMap.dat","wb");
FILE *DocIDIndex = fopen("DocID.index","wb");
unsigned __int64 DocID;

int outDegree;
for(size_t i=0;i<total;++i)
{
fread(index+i,LENGTHOFSINGLE,1,mapFile);
fread(&outDegree,4,1,mapFile);
fread(sons,LENGTHOFSINGLE+1,outDegree,mapFile);
}
cout<<"end of read file"<<endl;
fwrite(index,LENGTHOFSINGLE,total,DocIDIndex);

rewind(mapFile);
size_t position;
for(size_t i=0;i<total;++i)
{
fread(&DocID,LENGTHOFSINGLE,1,mapFile);
fread(&outDegree,4,1,mapFile);
fread(sons,LENGTHOFSINGLE+1,outDegree,mapFile);

fwrite(&i,4,1,indexMap);
fwrite(&outDegree,4,1,indexMap);
for (int j=0;j<outDegree;++j)
{
//fread(sons,LENGTHOFSINGLE+1,1,mapFile);
DocID = *(unsigned __int64 *)(sons+j*(LENGTHOFSINGLE+1));
position = binaryFind(index,index+total,DocID);
if (position != total)
{//找到
fwrite(&position,4,1,indexMap);
}
else fwrite(&total,4,1,indexMap);
}
}

fcloseall();
delete []index;
delete []sons;
}
cout<<"over build the index"<<endl;
return total;
}


int main(int argc, char *argv[])
{
ofstream fout("log.txt");

FILE *file = NULL;
if(argc>=2)
file = fopen(argv[1],"rb");
else
file = fopen("F://ren//SortMap.dat","rb");
if(file==NULL)
cout<<"--------can not open the file--------"<<endl;

unsigned int start,end;
start = GetTickCount();
size_t total=BuildIndex(file);
end = GetTickCount();
cout<<"Build index lost:/t"<<end-start<<endl;

if (total<1)
{
cout<<"BuildIndex Error and the program exit"<<endl;
return 0;
}

size_t DocID=0;
int outDegree = 0;
fclose(file);
file = fopen("indexMap.dat","rb");
FILE *DocIDIndex = fopen("DocID.Index","rb");
cout<<"start the total is:"<<total<<endl;
if(total>0)
{ //真正实现的计算过程
unsigned int * sons = new unsigned int[1];
start = GetTickCount();
float *pageRank = new float[total];
float *prTmp = new float[total];
for (size_t i=0;i<total;++i)
{
pageRank[i] = 1.0f;
prTmp[i] = 0.0f;
}
end = GetTickCount();
cout<<"end of index:"<<end-start<<endl;

ofstream fout("pagerank.txt");
float fatherRank = 1;
const float alpha = 0.85f;
size_t position = total;
size_t index=0;
for (size_t iter=0;iter<30;++iter)
{
rewind(file);
start = GetTickCount();
for (size_t i=0;i<total;++i)
{
fread(&DocID,4,1,file);
fread(&outDegree,4,1,file);
if (outDegree==0)
{
continue;
}
fatherRank = pageRank[i]/outDegree;
for (int k=0;k<outDegree;++k)
{
fread(sons,4,1,file);
if (total > *sons && *sons>=0)
{
prTmp[ *sons ] += fatherRank;
}
}
}
for (int i=0;i<total;++i)
{
prTmp[i] = 0.15f + alpha*prTmp[i];
pageRank[i] = prTmp[i];
prTmp[i] = 0.0f;
}
end = GetTickCount();
cout<<">>>>>>>>>>>>>>>"<<(iter+1)<<"/t"<<end-start<<endl;
}
delete []prTmp;
delete []sons;
cout<<"over is writing"<<endl;
FILE * PageRankList = fopen("PageRankList.dat","wb");
fwrite(pageRank,4,total,PageRankList);
delete []pageRank;
}
fout.close();
fclose(file);
fclose(DocIDIndex);
fcloseall();
cout<<"-------over----------"<<endl;
return 1;
}

...全文
531 1 打赏 收藏 转发到动态 举报
写回复
用AI写文章
1 条回复
切换为时间正序
请发表友善的回复…
发表回复
renhit 2007-02-02
  • 打赏
  • 举报
回复
晕,一大堆相关资源的超链接没有显示出来,大家自己去这里看吧:
转载请注明出处: http://renxijun.blog.sohu.com/32167163.html

64,646

社区成员

发帖
与我相关
我的任务
社区描述
C++ 语言相关问题讨论,技术干货分享,前沿动态等
c++ 技术论坛(原bbs)
社区管理员
  • C++ 语言社区
  • encoderlee
  • paschen
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告
  1. 请不要发布与C++技术无关的贴子
  2. 请不要发布与技术无关的招聘、广告的帖子
  3. 请尽可能的描述清楚你的问题,如果涉及到代码请尽可能的格式化一下

试试用AI创作助手写篇文章吧