还是一段C代码的运作
/**
 * Tidy an HTML summary fragment that may have been cut off mid-markup.
 *
 * The steps run in this order:
 *  1. Drop a trailing unterminated tag (a "<" with no ">" after it) or
 *     unterminated entity (a "&" with no ";" after it).
 *  2. Normalise "<br>" / "<br/>" to "<br />" and delete "</font>" / "</span>".
 *  3. Remove <script> and <iframe> elements, self-close <img> tags, quote
 *     numeric width/height values (dropping a "px" suffix), remove border
 *     attributes, remove <font ...> / <span ...> opening tags, and remove
 *     empty <div></div> / <p></p> pairs.
 *  4. Escape every "&" that does not begin one of the recognised entities:
 *     &nbsp; &amp; &lt; &gt; &quot; or a decimal reference such as &#169;.
 *
 * @param string $strSummary HTML fragment to clean.
 * @return string The cleaned fragment.
 */
function clean_summary($strSummary) {
    $strSummary = (string) $strSummary;

    // Step 1: strip a tag or entity that was truncated at the end of the text.
    $arrMatchings = array(
        array("<", ">"),
        array("&", ";")
    );
    foreach ($arrMatchings as $arrMatching) {
        $intL = strrpos($strSummary, $arrMatching[0]);
        if ($intL === false) {
            continue;
        }
        $intR = strrpos($strSummary, $arrMatching[1]);
        // The closer must be tested against false explicitly: comparing an
        // offset of 0 with false via ">" yields false, which would leave an
        // unterminated opener at the very start of the string in place.
        if ($intR === false || $intL > $intR) {
            $strSummary = substr($strSummary, 0, $intL);
        }
    }

    // Step 2: plain substitutions are done with str_replace (cheaper than regex).
    $strSummary = str_replace(
        array("<br>", "<br/>", "</font>", "</span>"),
        array("<br />", "<br />", "", ""),
        $strSummary
    );

    // Step 3: substitutions that need regular expressions. Patterns are applied
    // in array order, each one to the result of the previous one. All patterns
    // and replacements are single-quoted so that backslash sequences reach PCRE
    // untouched (in a double-quoted string "\1" is an octal escape, not a
    // backreference).
    $arrRules = array(
        // Self-closed forms go first so that the paired form below cannot
        // swallow the text between a self-closed tag and a later closing tag.
        '#<script\b[^>]*/>#is'           => '',
        '#<script\b[^>]*>.*?</script>#is' => '',
        '#<iframe\b[^>]*/>#is'           => '',
        '#<iframe\b[^>]*>.*?</iframe>#is' => '',
        // <img ...> that is not already self-closed becomes <img ... />.
        '#<img([^>]*)([^/])>#is'         => '<img$1$2 />',
        // width=100px / height="100" and similar become width="100" etc.
        '#(height|width)="?(\d+)(px)?"?#is' => '$1="$2"',
        // Remove a border attribute together with the space after it; the
        // value may not run past the end of the tag.
        '#border=[^\s>]* #is'            => '',
        '#<font[^>]*>#is'                => '',
        '#<span[^>]*>#is'                => '',
        '#<div[^>]*></div>#is'           => '',
        '#<p[^>]*></p>#is'               => ''
    );
    $strSummary = preg_replace(
        array_keys($arrRules),
        array_values($arrRules),
        $strSummary
    );

    // Step 4: escape every "&" that is not the start of a recognised entity.
    // The negative lookahead leaves the whitelisted named entities and decimal
    // character references alone; everything else becomes "&amp;".
    $strSummary = preg_replace(
        '/&(?!(?:nbsp|amp|lt|gt|quot|#\d+);)/',
        '&amp;',
        $strSummary
    );

    return $strSummary;
}