关于全文检索的算法问题?

yiyi999999999 2003-12-05 03:15:17

我要做一个有全文检索功能的过滤系统，它要对一篇很大的文本文件做全文检索，如果发现有需要过滤的内容就会提示，它对检索的速度要求很高，我该用什么算法来实现才能达到很高的速度，谁有关于这方面的好的算法或源程序，请告诉我，谢谢！！

...全文

118 7 打赏收藏转发到动态举报

写回复

用AI写文章

7 条回复

切换为时间正序

请发表友善的回复…

发表回复

liujianhui 2003-12-10

打赏
举报

值得学习

短歌如风 2003-12-09

打赏
举报

下面是一个实现：
先对关键字组进行预处理，分成256组；
每次读取一行，在行中搜索；
搜索时遍历每个字符，找到相应的关键字组进行比较。

当处理一个长度为22M的2M行没有出现关键字的文件时，大约要用3.5秒，进行完整单词匹配时大约3.2秒，如果不搜索，只读一遍需要2.2秒。这个速度基本上可以满足多数应用了。
编译器是BCC32 5.6.4，STLport 4.5.3，在P3（联想奔月）上运行。

#include <algorithm>
#include <cctype>
#include <cstddef>
#include <ctime>
#include <iostream>
#include <iterator>
#include <list>
#include <string>
#include <utility>
#include <vector>

// Keywords to search for (a set of C keywords used as sample data).
// Every entry must be followed by a comma: adjacent string literals are
// concatenated by the compiler, so a missing comma silently merges two
// keywords into one (e.g. "default" "goto" becomes "defaultgoto").
const std::string keyword_list[] =
{
"int", "unsigned", "long", "short", "char", "double", "float", "void", "const", "volatile", "auto", "register", "static",
"extern", "if", "else", "switch", "case", "default", "goto", "break", "continue", "return", "for", "while", "do",
"typedef", "struct", "union", "enum"
};

// Number of elements in a built-in array. Only valid for true arrays, not for
// pointers. The argument is parenthesized so that expression arguments expand
// correctly.
#define ARY_LEN(X) (sizeof(X)/sizeof((X)[0]))
// Function object that files each keyword into the group selected by its
// first byte. The referenced table must have at least 256 entries, one per
// possible byte value.
struct keywords_grouping
{
    // Table of keyword groups that operator() appends to.
    std::vector < std::list < std::string > > & list;

    keywords_grouping( std::vector < std::list < std::string > > & init_list ) : list( init_list )
    {
    }

    // Appends str to the group indexed by its first byte.
    void operator() ( const std::string & str ) const
    {
        // Convert through unsigned char: where plain char is signed, a byte
        // >= 0x80 (any non-ASCII text) would otherwise become a negative
        // value and then a huge out-of-range index.
        const unsigned char first_byte = static_cast < unsigned char > ( str[0] );
        list[first_byte].push_back( str );
    }
};

// Finds the first keyword occurrence in `line`.
//
// line      - text to scan.
// keywords  - keyword groups indexed by the first byte of the keyword; it is
//             indexed with every byte value, so it must hold at least 256 groups.
// word_only - when true, a match must be preceded by whitespace (or start the
//             line) and followed by whitespace (or end the line).
//
// Returns (position, length) of the first match; positions are scanned left to
// right and, within a position, keywords are tried in group order. Returns
// (std::string::npos, std::string::npos) when nothing matches.
std::pair < std::string::size_type,
std::string::size_type
> search_keywords( const std::string & line, const std::vector < std::list < std::string > > & keywords,
bool word_only = false )
{
    typedef std::string::size_type size_type;
    typedef std::list < std::string > group_t;
    typedef std::pair < size_type, size_type > result_t;

    // pos == line.size() is visited as well, as in the original loop; the const
    // operator[] yields '\0' there, so only group 0 is consulted.
    for ( size_type pos = 0; pos <= line.size(); ++pos )
    {
        // std::isspace requires a value representable as unsigned char; passing
        // a negative plain char (non-ASCII byte) is undefined behaviour.
        if ( word_only && pos != 0 && !std::isspace( static_cast < unsigned char > ( line[pos - 1] ) ) )
            continue; // not at the start of a word

        const group_t & group = keywords[static_cast < unsigned char > ( line[pos] )];
        const size_type remaining = line.size() - pos;
        for ( group_t::const_iterator it = group.begin(); it != group.end(); ++it )
        {
            const size_type len = it->size();
            if ( len > remaining )
                continue; // keyword does not fit in the rest of the line
            if ( word_only && len != remaining
                && !std::isspace( static_cast < unsigned char > ( line[pos + len] ) ) )
                continue; // candidate is not followed by a word boundary
            if ( line.compare( pos, len, * it ) == 0 )
                return result_t( pos, len );
        }
    }
    return result_t( std::string::npos, std::string::npos );
}

// Reads standard input line by line and stops at the first line containing a
// keyword. Prints the elapsed CPU time in seconds, then, if a keyword was
// found, the line number, the line itself and a row of '^' marking the match.
//
// A first argument whose first two characters are "/w" or "-w" (either case)
// selects whole-word matching.
int main( int argc, char * argv[] )
{
    // Group the keywords by their first byte.
    std::vector < std::list < std::string > > keywords( 256 );
    std::for_each( keyword_list, keyword_list + ARY_LEN( keyword_list ), keywords_grouping( keywords ) );

    // The option does not change between lines, so evaluate it once.
    const bool word_only = argc > 1 && ( argv[1] [0] == '/' || argv[1] [0] == '-' )
        && ( argv[1] [1] == 'w' || argv[1] [1] == 'W' );

    std::string line;
    unsigned line_no( 0 );
    std::pair < std::string::size_type, std::string::size_type > result( std::string::npos, std::string::npos );

    const std::clock_t started = std::clock();
    while ( result.first == std::string::npos && std::getline( std::cin, line ) )
    {
        result = search_keywords( line, keywords, word_only );
        ++line_no;
    }
    const std::clock_t elapsed = std::clock() - started;

    // CLOCKS_PER_SEC is the standard <ctime> constant; CLK_TCK is an obsolete
    // vendor macro that many toolchains no longer define.
    std::cout << static_cast < double > ( elapsed ) / CLOCKS_PER_SEC << std::endl;

    if ( result.first != std::string::npos )
    {
        std::cout << "Line Number:" << line_no << "\n";
        std::cout << line << "\n";
        // Underline the match: pad up to the match column, then mark its length.
        for ( std::string::size_type i = 0; i < result.first; i++ )
            std::cout << ' ';
        for ( std::string::size_type i = 0; i < result.second; i++ )
            std::cout << '^';
    }
}

短歌如风 2003-12-09

打赏
举报

没有什么“原始的”算法文档啊，我直接用C++写的。算法就直接写在这里吧：
需要三种数据结构：串（String）、数组（Array）、线性表（List）

for each KeyWord in KeyWords do
KeyWordGroups[KeyWord[0]].Append(KeyWord);
end for

for each Line:String in Text do
for each Ch:Char in Line do
for each KeyWord:String in KeyWordGroups[Ch] do
if KeyWord = Ch位置开始长度与KeyWord相等的子串 then
匹配成功。
end if
end for
end for
end for

其实不对关键字分组也是正确的，分组是为了在关键字很多时提高性能。如果关键字非常多，还可以用前两个字符分成64K组。我把C++代码整理了一下，加上了注释放在下面：

#include <iostream>
#include <algorithm>
#include <iterator>
#include <string>
#include <vector>
#include <list>
#include <utility>
#include <cstddef>

using std::string;
typedef string::size_type str_size_t;                          // position / length within a string
typedef std::pair < str_size_t, str_size_t > search_result_t;  // (match position, match length)
typedef std::list < string > str_list;                         // one group of keywords
typedef std::vector < str_list > str_list_ary;                 // keyword groups indexed by first byte

// Keywords to search for (a set of C keywords used as sample data).
// Every entry must be followed by a comma: adjacent string literals are
// concatenated by the compiler, so a missing comma silently merges two
// keywords into one (e.g. "default" "goto" becomes "defaultgoto").
const string keyword_list[] =
{
"int", "unsigned", "long", "short", "char", "double", "float", "void", "const", "volatile", "auto", "register", "static",
"extern", "if", "else", "switch", "case", "default", "goto", "break", "continue", "return", "for", "while", "do",
"typedef", "struct", "union", "enum"
};

// Number of elements in a built-in array. Only valid for true arrays, not for
// pointers. The argument is parenthesized so that expression arguments expand
// correctly.
#define ARY_LEN(X) (sizeof(X)/sizeof((X)[0]))

// Finds the first matching keyword in a string (line).
// The keywords are already grouped by their first character and stored in
// `keywords`, which is indexed with every byte value and therefore must hold
// at least 256 groups.
// When word_only is true only whole words match: the match must be preceded by
// whitespace (or start the line) and followed by whitespace (or end the line).
// Returns the position and length of the match.
// Returns search_result_t(string::npos, string::npos) when nothing is found.
search_result_t search_keywords( const string & line, const str_list_ary & keywords, bool word_only = false )
{
    // pos == line.size() is visited as well, as in the original loop; the const
    // operator[] yields '\0' there, so only group 0 is consulted.
    for ( str_size_t pos = 0; pos <= line.size(); ++pos )
    {
        // std::isspace requires a value representable as unsigned char; passing
        // a negative plain char (non-ASCII byte) is undefined behaviour.
        if ( word_only && pos != 0 && !std::isspace( static_cast < unsigned char > ( line[pos - 1] ) ) )
            continue; // whole-word mode and this position is not the start of a word

        const str_list & group = keywords[static_cast < unsigned char > ( line[pos] )];
        const str_size_t remaining = line.size() - pos;
        for ( str_list::const_iterator it = group.begin(); it != group.end(); ++it )
        {
            const str_size_t len = it->size();
            if ( len > remaining )
                continue; // fewer characters left than the keyword needs
            if ( word_only && len != remaining
                && !std::isspace( static_cast < unsigned char > ( line[pos + len] ) ) )
                continue; // the word starting here is longer than the keyword
            if ( line.compare( pos, len, * it ) == 0 )
                return search_result_t( pos, len );
        }
    }
    return search_result_t( string::npos, string::npos );
}

// Function object that puts each keyword into the group selected by its first
// byte. The referenced table must have at least 256 entries, one per possible
// byte value.
struct keywords_grouping
{
    // Table of keyword groups that operator() appends to.
    str_list_ary & list;
    keywords_grouping( str_list_ary & init_list ) : list( init_list ){}
    // Appends str to the group indexed by its first byte.
    void operator() ( const string & str ) const
    {
        // Convert through unsigned char: where plain char is signed, a byte
        // >= 0x80 (any non-ASCII text) would otherwise become a negative
        // value and then a huge out-of-range index.
        const unsigned char first_byte = static_cast < unsigned char > ( str[0] );
        list[first_byte].push_back( str );
    }
};

//命令行参数：/w表示完整单词匹配
int main( int argc, char * argv[] )
{
//////////////////////////////////////////////////////////////////////////
//对关键字分组
str_list_ary keywords( 256 );
std::for_each( keyword_list, keyword_list + ARY_LEN( keyword_list ), keywords_grouping( keywords ) );
string line;
//
///////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////
//按行读取文件并搜索关键字
unsigned line_no( 0 );
bool word_only = argc > 1 && ( argv[1] [0] == '/' || argv[1] [0] == '-' ) && ( argv[1] [1] == 'w' || argv[1] [1] == 'W' );
search_result_t result( string::npos, string::npos );
while ( result.first == string::npos && std::getline( std::cin, line ) )
{
result = search_keywords( line, keywords, word_only );
++line_no;
}
//
/////////////////////////////////////////////////////////////////////////

/////////////////////////////////////////////////////////////////////////
//输出结果
if ( result.first != string::npos )
{
std::cout << "Line Number:" << line_no << "\n";
std::cout << line << "\n";
for ( str_size_t i = 0; i < result.first; i++ )
std::cout << ' ';
for ( str_size_t i = 0; i < result.second; i++ )
std::cout << '^';
return 1;
}
else
return 0;
//
////////////////////////////////////////////////////////////////////////
}

yiyi999999999 2003-12-09