13,347
社区成员
发帖
与我相关
我的任务
分享
//以下是部分代码
// Working state for one pass over the link-source row sjurlDR.
bool IsGenxin = false;                           // flag raised later when a link absent from both lists is found
StringBuilder weburlSB = new StringBuilder();    // accumulates links, each wrapped as $-$link$_$
List<string> Weburllistzx = new List<string>();  // starts empty; filled further down with newly seen links
List<string> Weburllist = new List<string>();    // links already recorded in LinkContent

// LinkContent stores entries delimited as $-$ ... $_$; the look-behind/look-ahead
// pattern captures only the text between each delimiter pair.
MatchCollection storedEntries = Regex.Matches(
    sjurlDR["LinkContent"].ToString(),
    @"(?<=\$-\$).*?(?=\$_\$)",
    RegexOptions.Singleline);
for (int entryIndex = 0; entryIndex < storedEntries.Count; entryIndex++)
{
    Weburllist.Add(storedEntries[entryIndex].Value);
}
// Download the page addressed by LinkUrl and read its body into ProductionContent.
string newsLinkUrl = sjurlDR["LinkUrl"].ToString();
Uri uri = new Uri(newsLinkUrl);
System.Net.WebRequest newswebrequest = System.Net.WebRequest.Create(uri);

// Browser-like headers. User-Agent, Accept and Connection are restricted headers that
// must be set through the typed HttpWebRequest properties rather than by header name;
// the Host header is derived from the request URI by the framework.
System.Net.HttpWebRequest newshttprequest = newswebrequest as System.Net.HttpWebRequest;
if (newshttprequest != null)
{
    newshttprequest.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0";
    newshttprequest.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    newshttprequest.KeepAlive = true;

    // Advertising gzip/deflate by hand leaves the body compressed, and the reader below
    // would decode the compressed bytes as text. AutomaticDecompression sends the
    // Accept-Encoding header and transparently inflates the response stream.
    newshttprequest.AutomaticDecompression =
        System.Net.DecompressionMethods.GZip | System.Net.DecompressionMethods.Deflate;

    newshttprequest.Headers[System.Net.HttpRequestHeader.AcceptLanguage] = "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3";
    newshttprequest.Headers[System.Net.HttpRequestHeader.CacheControl] = "max-age=0";
    // The header name is "Cookie" (the original passed "Cookie:" with a trailing colon).
    newshttprequest.Headers[System.Net.HttpRequestHeader.Cookie] = "gscu_792856215=62888640q5c56420; _gscbrs_792856215=1";
}

string ProductionContent = string.Empty;
// Dispose response, stream and reader even when reading throws, so the underlying
// connection is always released.
// NOTE(review): the body is always decoded as UTF-8; confirm the target sites do not
// serve another charset (e.g. GB2312).
using (System.Net.WebResponse newswebresponse = newswebrequest.GetResponse())
using (System.IO.Stream newsstream = newswebresponse.GetResponseStream())
using (System.IO.StreamReader sr = new System.IO.StreamReader(newsstream, System.Text.Encoding.UTF8))
{
    ProductionContent = sr.ReadToEnd();
}
// Extract every link from the downloaded page. Each link is appended to weburlSB as
// $-$link$_$; links found in neither Weburllist nor Weburllistzx are added to
// Weburllistzx and linkSb, and IsGenxin is raised.

// Site root (scheme + host) of the source URL, used to absolutize root-relative hrefs.
Regex reg = new Regex(@"http(s)?://([\w-]+\.)+[\w-]+/?");
string wangzhanyuming = reg.Match(sjurlDR["LinkUrl"].ToString()).Value;
// The trailing slash is optional in the pattern above; without it href="/a" would be
// rewritten to "http://hosta", so make sure the root always ends with a slash.
if (wangzhanyuming.Length > 0 && !wangzhanyuming.EndsWith("/", StringComparison.Ordinal))
{
    wangzhanyuming += "/";
}

// Directory of the source URL (everything up to and including the last '/'), used as
// the base for hrefs that carry no scheme. It does not change per anchor, so it is
// computed once here.
string newsPageUrl = sjurlDR["LinkUrl"].ToString();
string wangzhanxiangduilujin = newsPageUrl.Substring(0, newsPageUrl.LastIndexOf('/') + 1);

// Rewrite root-relative (and "./") hrefs against the site root, then collect every
// <a ... href=...> tag.
string absolutizedHtml = ProductionContent
    .Replace("href=\"/", "href=\"" + wangzhanyuming)
    .Replace("href='/", "href='" + wangzhanyuming)
    .Replace("href=/", "href=" + wangzhanyuming)
    .Replace("href=\"./", "href=\"" + wangzhanyuming);
MatchCollection mc = Regex.Matches(absolutizedHtml, @"<[aA][^>]* href=[^>]*>", RegexOptions.Singleline);

// Set of every link already known, so the "is this new?" test is a hash lookup
// instead of two linear list scans per link.
HashSet<string> knownLinks = new HashSet<string>(Weburllist);
knownLinks.UnionWith(Weburllistzx);

int Index = 1;
foreach (Match m in mc)
{
    // First attempt: the tag already contains an absolute URL (scheme://...).
    // The character class is [a-zA-Z]; the original [a-zA-z] also matched [ \ ] ^ _ and `.
    MatchCollection urlMatches = Regex.Matches(m.Value.Replace("\"", "'"), @"[a-zA-Z]+://[^']*", RegexOptions.Singleline);

    if (urlMatches.Count == 0)
    {
        if (m.Value.IndexOf("javascript", StringComparison.OrdinalIgnoreCase) == -1)
        {
            // Relative href: prefix it with the source page's directory and retry.
            string amstr = m.Value
                .Replace("href=\"", "href=\"" + wangzhanxiangduilujin)
                .Replace("href='", "href='" + wangzhanxiangduilujin);
            urlMatches = Regex.Matches(amstr, @"[a-zA-Z]+://[^\s]*", RegexOptions.Singleline);
        }
        else
        {
            // javascript: pseudo-links (any letter case) are not crawlable; skip the tag.
            urlMatches = null;
        }
    }

    if (urlMatches != null)
    {
        foreach (Match m1 in urlMatches)
        {
            // Strip quote, '>' and ';' characters captured along with the URL.
            string linkurlstr = m1.Value.Replace("\"", "").Replace("'", "").Replace(">", "").Replace(";", "");

            weburlSB.Append("$-$");
            weburlSB.Append(linkurlstr);
            weburlSB.Append("$_$");

            // HashSet.Add returns true only when the link was not known before.
            if (knownLinks.Add(linkurlstr))
            {
                IsGenxin = true;
                Weburllistzx.Add(linkurlstr);
                linkSb.AppendFormat("{0}<br/>", linkurlstr);
            }
        }
    }
    Index++;
}
// Pause after processing the page (presumably to throttle the crawl; confirm with the caller).
System.Threading.Thread.Sleep(1000);
if (IsGenxin)
{
    // Load the stored record directly; the original first allocated a new
    // originlinksInfo that was discarded by this assignment.
    originlinksInfo oinfo = originlinksLogic.Get(int.Parse(sjurlDR["ID"].ToString()));

    // Keep the previous snapshot in LinkContentnext, then store the links just collected.
    oinfo.LinkContentnext = oinfo.LinkContent;
    oinfo.LinkContent = weburlSB.ToString();
    originlinksLogic.Update(oinfo);
    System.Threading.Thread.Sleep(2000);
}
//例如 http://www.zjks.com/ 这个网站总是采集失败,问题出在下面这句代码:
System.Net.WebResponse newswebresponse = newswebrequest.GetResponse();//这里在采集时总是跳出