跪求新闻抓取代码或类

yujiayou 2011-05-09 07:22:04
如题,有现成的类更好。麻烦发至邮箱:yujiajunbmj@163.com
万分感谢!
...全文
118 2 打赏 收藏 转发到动态 举报
写回复
用AI写文章
2 条回复
切换为时间正序
请发表友善的回复…
发表回复
白桑 2011-09-02
  • 打赏
  • 举报
回复
package news.robot.fetch;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import com.gargoylesoftware.htmlunit.html.HtmlDivision;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlUnorderedList;
import com.gargoylesoftware.htmlunit.javascript.host.html.HTMLOListElement;
import org.junit.*;

import java.util.List;

/**
* Created by IntelliJ IDEA.
* User: Rain
* Date: 11-9-2
* Time: 下午3:07
* // To change this template use File | Settings | File Templates.
*/

/**
 * Smoke tests that drive HtmlUnit's headless {@link WebClient} against live web pages.
 *
 * <p>Every test performs real network requests, so the results depend on the remote
 * sites being reachable and on their markup not having changed. Each test closes the
 * client's windows in a {@code finally} block so that a failed assertion or a failed
 * fetch does not leave the client open.
 *
 * <p>The class can also be run without a JUnit runner through {@link #main(String[])}.
 */
public class TestFetch {

    /** Home page of the HtmlUnit project, fetched by the two home-page tests. */
    private static final String HTMLUNIT_HOME_URL = "http://htmlunit.sourceforge.net";

    /** Title the HtmlUnit home page is expected to have. */
    private static final String HTMLUNIT_HOME_TITLE = "HtmlUnit - Welcome to HtmlUnit";

    /** News listing page inspected by {@link #xpath()}. */
    private static final String NEWS_URL = "http://news.163.com/domestic/";

    /**
     * Absolute XPath of the div inspected by {@link #xpath()}.
     * An alternative path left by the original author:
     * /html/body/div[7]/div[2]/div/div[2]/table/tbody
     */
    private static final String NEWS_DIV_XPATH = "/html/body/div[5]/div[2]/div";

    /**
     * Fetches the HtmlUnit home page with the default browser profile and checks its
     * title, a fragment of its XML serialization and a fragment of its text rendering.
     *
     * @throws Exception if the page cannot be fetched
     */
    @Test
    public void homePage() throws Exception {
        final WebClient webClient = new WebClient();
        try {
            final HtmlPage page = webClient.getPage(HTMLUNIT_HOME_URL);
            // JUnit assertions are used instead of the `assert` keyword, which is
            // skipped entirely unless the JVM is started with -ea.
            Assert.assertEquals(HTMLUNIT_HOME_TITLE, page.getTitleText());

            final String pageAsXml = page.asXml();
            Assert.assertTrue(pageAsXml.contains("<body class=\"composite\">"));

            final String pageAsText = page.asText();
            Assert.assertTrue(pageAsText.contains("Support for the HTTP and HTTPS protocols"));
        } finally {
            webClient.closeAllWindows();
        }
    }

    /**
     * Fetches the HtmlUnit home page while emulating Firefox 3.6 and checks its title.
     *
     * @throws Exception if the page cannot be fetched
     */
    @Test
    public void homePage_Firefox() throws Exception {
        final WebClient webClient = new WebClient(BrowserVersion.FIREFOX_3_6);
        try {
            final HtmlPage page = webClient.getPage(HTMLUNIT_HOME_URL);
            Assert.assertEquals(HTMLUNIT_HOME_TITLE, page.getTitleText());
        } finally {
            webClient.closeAllWindows();
        }
    }

    /*
     * Disabled example of looking elements up by id and by anchor name:
     *
     * @Test
     * public void getElements() throws Exception {
     *     final WebClient webClient = new WebClient();
     *     final HtmlPage page = webClient.getPage("http://htmlunit.sourceforge.net");
     *     final HtmlDivision div = page.getHtmlElementById("some_div_id");
     *     final HtmlAnchor anchor = page.getAnchorByName("anchor_name");
     * }
     */

    /**
     * Fetches the news listing page, selects one div by absolute XPath and prints its
     * text content followed by the results of two tag-name lookups inside it.
     *
     * @throws Exception if the page cannot be fetched
     */
    @Test
    public void xpath() throws Exception {
        final WebClient webClient = new WebClient();
        try {
            final HtmlPage page = webClient.getPage(NEWS_URL);

            // The XPath is positional, so a layout change on the site yields no match;
            // fail with a readable message instead of an IndexOutOfBoundsException.
            final List<?> matches = page.getByXPath(NEWS_DIV_XPATH);
            Assert.assertFalse("No element matched XPath " + NEWS_DIV_XPATH, matches.isEmpty());

            final HtmlDivision div = (HtmlDivision) matches.get(0);
            System.out.println(div.getTextContent());

            // NOTE(review): "cList1" and "s1" look like CSS class names rather than tag
            // names, in which case these lookups return empty lists - confirm the intent.
            System.out.println(div.getElementsByTagName("cList1"));
            System.out.println(div.getElementsByTagName("s1"));
        } finally {
            webClient.closeAllWindows();
        }
    }

    /**
     * Runs the tests in sequence without a JUnit runner. A checked or runtime exception
     * stops the sequence and has its stack trace printed; assertion failures are
     * {@link AssertionError}s and are not caught here.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        TestFetch tf = new TestFetch();
        try {
            tf.homePage();
            tf.homePage_Firefox();
            tf.xpath();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
子夜__ 2011-05-09
  • 打赏
  • 举报
回复
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.IO;
using System.IO.Compression;
using System.Text.RegularExpressions;

namespace WikiPageCreater.Common
{
/// <summary>
/// Best-effort helpers for downloading a web page and detecting its character set.
/// Both public methods swallow failures and return a fallback value instead of throwing.
/// </summary>
public class PageHelper
{
    /// <summary>Request timeout in milliseconds.</summary>
    private const int RequestTimeoutMilliseconds = 20000;

    /// <summary>
    /// Responses whose declared Content-Length is at or above this size are skipped.
    /// NOTE(review): a response with no declared length is presumably reported as -1
    /// and therefore passes this check - confirm whether a hard cap is wanted.
    /// </summary>
    private const long MaxContentLength = 1024 * 1024;

    /// <summary>
    /// Matches a charset declaration such as <c>charset=gb2312</c>,
    /// <c>charset="utf-8"</c> or <c>charset='utf-8'</c>. The optional opening quote is
    /// skipped so that the captured group holds only the charset token; the previous
    /// pattern captured an empty string for the quoted form.
    /// </summary>
    private static readonly Regex CharsetPattern = new Regex(
        @"charset\s*=\s*[""']?(?<charset>[A-Za-z0-9_.:\-]+)",
        RegexOptions.IgnoreCase);

    /// <summary>
    /// Detects the character set of the page at <paramref name="url"/>.
    /// The page body is searched for a charset declaration first; if none is found the
    /// character set reported by the response is used.
    /// </summary>
    /// <param name="url">Address of the page to inspect.</param>
    /// <returns>
    /// The detected charset name, or <c>Encoding.Default.BodyName</c> when the request
    /// fails, the response is not 200 OK, the response is too large, or no charset
    /// could be determined.
    /// </returns>
    public static string GetEncoding(string url)
    {
        try
        {
            using (HttpWebResponse response = OpenResponse(url))
            {
                if (IsAcceptable(response))
                {
                    // Read the header value up front, while the response is untouched.
                    string headerCharset = response.CharacterSet;

                    // Only the ASCII-range charset token is of interest, so the body is
                    // decoded as ASCII regardless of whether it was gzip-compressed.
                    string html;
                    using (StreamReader reader = OpenBodyReader(response, Encoding.ASCII))
                    {
                        html = reader.ReadToEnd();
                    }

                    Match match = CharsetPattern.Match(html);
                    if (match.Success)
                    {
                        return match.Groups["charset"].Value;
                    }

                    if (!string.IsNullOrEmpty(headerCharset))
                    {
                        return headerCharset;
                    }
                }
            }
        }
        catch (Exception ex)
        {
            // Deliberately best-effort: report the failure and fall back to the default.
            System.Diagnostics.Trace.TraceWarning(
                "PageHelper.GetEncoding failed for '{0}': {1}", url, ex.Message);
        }

        return Encoding.Default.BodyName;
    }

    /// <summary>
    /// Downloads the HTML source of the page at <paramref name="url"/>, decoding the
    /// body with <paramref name="encoding"/>.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <param name="encoding">Encoding used to decode the response body.</param>
    /// <returns>
    /// The page source, or <c>string.Empty</c> when the request fails, the response is
    /// not 200 OK, or the response is too large.
    /// </returns>
    public static string GetHtml(string url, Encoding encoding)
    {
        try
        {
            using (HttpWebResponse response = OpenResponse(url))
            {
                if (IsAcceptable(response))
                {
                    using (StreamReader reader = OpenBodyReader(response, encoding))
                    {
                        return reader.ReadToEnd();
                    }
                }
            }
        }
        catch (Exception ex)
        {
            // Deliberately best-effort: report the failure and return an empty page.
            System.Diagnostics.Trace.TraceWarning(
                "PageHelper.GetHtml failed for '{0}': {1}", url, ex.Message);
        }

        return string.Empty;
    }

    /// <summary>Issues the request with the shared timeout and with redirects disabled.</summary>
    private static HttpWebResponse OpenResponse(string url)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Timeout = RequestTimeoutMilliseconds;
        request.AllowAutoRedirect = false;
        return (HttpWebResponse)request.GetResponse();
    }

    /// <summary>Returns true for a 200 OK response whose declared length is under the size limit.</summary>
    private static bool IsAcceptable(HttpWebResponse response)
    {
        return response.StatusCode == HttpStatusCode.OK
            && response.ContentLength < MaxContentLength;
    }

    /// <summary>Opens a text reader over the response body, unwrapping gzip content encoding when present.</summary>
    private static StreamReader OpenBodyReader(HttpWebResponse response, Encoding encoding)
    {
        Stream body = response.GetResponseStream();
        if (string.Equals(response.ContentEncoding, "gzip", StringComparison.InvariantCultureIgnoreCase))
        {
            body = new GZipStream(body, CompressionMode.Decompress);
        }

        return new StreamReader(body, encoding);
    }
}
}

62,041

社区成员

发帖
与我相关
我的任务
社区描述
.NET技术交流专区
javascript云原生 企业社区
社区管理员
  • ASP.NET
  • .Net开发者社区
  • R小R
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告

.NET 社区是一个围绕开源 .NET 的开放、热情、创新、包容的技术社区。社区致力于为广大 .NET 爱好者提供一个良好的知识共享、协同互助的 .NET 技术交流环境。我们尊重不同意见,支持健康理性的辩论和互动,反对歧视和攻击。

希望和大家一起共同营造一个活跃、友好的社区氛围。

试试用AI创作助手写篇文章吧