110,538
社区成员
发帖
与我相关
我的任务
分享
/// <summary>
/// Scans <paramref name="html"/> for h3 elements and, for each one that contains
/// a hyperlink, records the first link's href and inner markup as a keyword.
/// </summary>
/// <param name="html">Raw HTML text to scan. Null or empty yields an empty list.</param>
/// <param name="word">Not used by this method; kept so existing callers keep compiling.</param>
/// <returns>
/// The keywords in document order, with IDs assigned sequentially starting at 1.
/// Never null.
/// </returns>
public List<Keyword> GetKeywords(string html, string word)
{
    List<Keyword> keywords = new List<Keyword>();
    if (string.IsNullOrEmpty(html))
    {
        return keywords;
    }

    // Matches a complete <h3>...</h3> element. The balancing group "o" tracks
    // nested <h3> tags so the match ends at the matching close tag.
    // The escapes must be backslashes (\b): with "/b" the negative lookahead
    // never fails, the atomic catch-all swallows the rest of the document, and
    // Matches() comes back empty.
    Regex regHeading = new Regex(@"(?is)<h3[^>]*?>(?><h3[^>]*>(?<o>)|</h3>(?<-o>)|(?:(?!</?h3\b).)*)*(?(o)(?!))</h3>", RegexOptions.IgnoreCase);

    // Alternative kept from the original: match tables whose id is two digits.
    //Regex regTable = new Regex(@"(?is)<table[^>]*?id=(['""]?)(\d{2})\1[^>]*>(?><table[^>]*>(?<o>)|</table>(?<-o>)|(?:(?!</?table\b).)*)*(?(o)(?!))</table>", RegexOptions.IgnoreCase);

    // Matches an anchor tag. Group 1 captures the optional quote character and
    // \1 requires the same character to close the href value.
    Regex regA = new Regex(@"(?is)<a\b[^>]*?href=(['""]?)(?<link>[^'""\s>]+)\1[^>]*>(?<title>.*?)</a>", RegexOptions.IgnoreCase);

    int nextId = 1;
    foreach (Match mHeading in regHeading.Matches(html))
    {
        // Only the first anchor inside each heading is considered.
        Match mA = regA.Match(mHeading.Value);
        if (!mA.Success)
        {
            continue;
        }

        Keyword keyword = new Keyword();
        keyword.ID = nextId++;
        keyword.Link = mA.Groups["link"].Value;
        // Title is the anchor's inner markup as-is; nested tags are not stripped.
        keyword.Title = mA.Groups["title"].Value;
        keywords.Add(keyword);
    }

    return keywords;
}
返回的 HTML 内容如下(节选):
<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="zh-CN"><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/images/google_favicon_128.png" itemprop="image"><title>123 - Google 搜索</title><style>#gb{font:13px/27px Arial,sans-serif;height:30px}#gbz,#gbg{position:absolute;white-space:nowrap;top:0;height:30px;z-index:1000}#gbz{left:0;padding-left:4px}#gbg{right:0;padding-right:5px}#gbs{background:transparent;position:absolute;top:-999px;visibility:hidden;z-index:998;right:0}.gbto #gbs{background:#fff}#gbx3,#gbx4{background-color:#2d2d2d;background-image:none;_background-image:none;background-position:0 -138px;background-repeat:repeat-x;border-bottom:1px solid #000;font-size:24px;height:29px;_height:30px;opacity:1;filter:alpha(opacity=100);position:absolute;top:0;width:100%;z-index:990}#gbx3{left:0}#gbx4{right:0}#gbb{position:relative}#gbbw{left:0;position:absolute;top:30px;width:100%}.gbtcb{position:absolute;visibility:hidden}#gbz .gbtcb{right:0}#gbg .gbtcb{left:0}.gbxx{display:none !important}.gbxo{opacity:0 !important;filter:alpha(opacity=0) !important}.gbm{position:absolute;z-index:999;top:-999px;visibility:hidden;text-align:left;border:1px solid #bebebe;background:#fff;-moz-box-shadow:-1px 1px 1px rgba(0,0,0,.2);-webkit-box-shadow:0 2px 4px rgba(0,0,0,.2);box-shadow:0 2px 4px rgba(0,0,0,.2)}.gbrtl .gbm{-moz-box-shadow:1px 1px 1px rgba(0,0,0,.2)}.gbto .gbm,.gbto #gbs{top:29px;visibility:visible}#gbz .gbm{left:0}#gbg .gbm{right:0}.gbxms{background-color:#ccc;display:block;position:absolute;z-index:1;top:-1px;left:-2px;right:-2px;bottom:-2px;opacity:.4;-moz-border-radius:3px;filter:progid:DXImageTransform.Microsoft.Blur(pixelradius=5);*opacity:1;*top:-2px;*left:-5px;*right:5px;*bottom:4px;-ms-filter:"progid:DXImageTransform.Microsoft.Blur(pixelradius=5)";opacity:1\0/;top:-4px\0/;left:-6px\0/;right:5px\0/;bottom:4px\0/}.gbma{position:relative;top:-1px;border-style:solid dashed 
dashed;border-color:transparent;border-top-color:#c0c0c0;display:-moz-inline-box;display:inline-block;font-size:0;height:0;line-height:0;width:0;border-width:3px 3px 0;padding-top:1px;left:4px}#gbztms1,#gbi4m1,#gbi4s,#gbi4t{zoom:1}.gbtc,.gbmc,.gbmcc{display:block;list-style:none;margin:0;padding:0}.gbmc{background:#fff;padding:10px 0;position:relative;z-index:2;zoom:1}。。。。。。(后面的太多了,几万字符返回,没意思。。。)