比较文章相似度时,文章数量达到 1000 篇就变得很慢,是我的程序有问题吗?
// Check for duplicated article content.
//
// Every regular file directly inside $dir is reduced to a whitespace-free,
// tag-free string and compared against the articles already accepted as
// unique. A file whose similarity to an accepted article exceeds 90% is
// recorded in $repeat under that article's file name; otherwise it becomes a
// new entry in $content.
//
// Why this gets slow: each new file is compared against every accepted
// article (quadratic in the number of files) and similar_text() itself is
// documented as O(N^3) in the string length. The helpers below skip the
// similar_text() call whenever the outcome is already decided.

/**
 * Reduce the lines of one article file to the string used for comparison.
 *
 * The first three lines are dropped (as the original script did), HTML tags
 * are stripped, and spaces, newlines and tabs are removed.
 *
 * @param array $lines Lines of the file as returned by file().
 * @return string Normalized text; empty when nothing is left.
 */
function normalize_article_text($lines)
{
    $body = join("", array_slice($lines, 3));
    // NOTE(review): the original search list contained " " twice. If the
    // second entry was meant to be another character (e.g. a full-width
    // space or "\r"), add it here.
    return str_replace(array(" ", "\n", "\t"), "", trim(strip_tags($body)));
}

/**
 * Decide whether $text is a near-duplicate of an already accepted article.
 *
 * The score is similar_text($known, $text) / strlen($text) and must be
 * strictly greater than $threshold.
 *
 * @param string $known     Normalized text of an accepted article.
 * @param string $text      Normalized text of the candidate file.
 * @param float  $threshold Minimum score (exclusive) to count as duplicate.
 * @return bool
 */
function is_near_duplicate($known, $text, $threshold = 0.9)
{
    $lenText = strlen($text);
    if ($lenText === 0) {
        // Nothing to compare, and the division below would be by zero.
        return false;
    }

    // similar_text() can never exceed the length of the shorter string, so
    // when even strlen($known) / $lenText misses the threshold the expensive
    // comparison cannot succeed.
    if (!(strlen($known) / $lenText > $threshold)) {
        return false;
    }

    // Identical strings match on every byte; no need to run similar_text().
    $similar = ($known === $text) ? $lenText : similar_text($known, $text);

    return $similar / $lenText > $threshold;
}

$content = $repeat = array();
$num = 0;
$dir = "cs/";
if (is_dir($dir) && ($dh = opendir($dir))) {
    while (($file = readdir($dh)) !== false) {
        $path = $dir.$file;
        // Skips sub-directories as well as the "." and ".." entries.
        if (is_dir($path)) {
            continue;
        }

        $f = file($path);
        if ($f === false) {
            // Unreadable file: skip it instead of treating it as empty text.
            continue;
        }

        $text = normalize_article_text($f);

        foreach ($content as $key => $val) {
            if (is_near_duplicate($val, $text)) {
                // Attribute the file to the first accepted article it matches
                // and move on to the next directory entry.
                $repeat[$key][] = $file;
                $num++;
                continue 2;
            }
        }

        $content[$file] = $text;
    }
    closedir($dh);
}
echo "Repeat:".$num." ";
echo "content:".count($content);
print_r($repeat);