用 HttpUnit 做页面抓取,本地页面都可以正常抓取,但是一抓取外部网站的页面就报错
tinyn 2014-02-28 11:30:44 代码如下
import java.io.IOException;
import java.net.MalformedURLException;
import org.xml.sax.SAXException;
import com.meterware.httpunit.GetMethodWebRequest;
import com.meterware.httpunit.PostMethodWebRequest;
import com.meterware.httpunit.WebConversation;
import com.meterware.httpunit.WebForm;
import com.meterware.httpunit.WebLink;
import com.meterware.httpunit.WebRequest;
import com.meterware.httpunit.WebResponse;
import com.meterware.httpunit.WebTable;
/**
 * Collection of small HttpUnit demos: fetching a page, sending GET/POST
 * parameters, following a link, and dumping the first table or form of a page.
 *
 * <p>Every demo opens its own {@link WebConversation}, performs the request
 * and prints the result to standard output.
 */
public class Test {

    /**
     * Fetches {@code http://www.baidu.com/} and prints the response text.
     *
     * <p>NOTE(review): the failure reported for non-local pages is not shown
     * in this file. If it turns out to be a script error raised while HttpUnit
     * evaluates the page's JavaScript, calling
     * {@code HttpUnitOptions.setScriptingEnabled(false)} or
     * {@code HttpUnitOptions.setExceptionsThrownOnScriptError(false)} before
     * the request is the usual workaround -- confirm against the stack trace.
     *
     * @throws MalformedURLException if the URL is rejected by HttpUnit
     * @throws IOException           if the request fails
     * @throws SAXException          if the response cannot be parsed
     */
    public static void testGetHtmlContent() throws MalformedURLException,
            IOException, SAXException {
        System.out.println("直接获取网页内容:");
        WebConversation wc = new WebConversation();
        // Optional user-agent override, left disabled:
        // ClientProperties client = wc.getClientProperties();
        // client.setUserAgent("Mozilla;");
        WebResponse wr = wc.getResponse("http://www.baidu.com/");
        System.out.println(wr.getText());
    }

    /**
     * Requests the local test page with the GET method, sending one
     * parameter, and prints the response text.
     *
     * @throws MalformedURLException if the URL is rejected by HttpUnit
     * @throws IOException           if the request fails
     * @throws SAXException          if the response cannot be parsed
     */
    public static void testGetMethod() throws MalformedURLException,
            IOException, SAXException {
        System.out.println("向服务器发送数据,然后获取网页内容:");
        WebConversation wc = new WebConversation();
        WebRequest req = new GetMethodWebRequest("http://localhost:8080/test.html");
        req.setParameter("123", "aaa");
        WebResponse resp = wc.getResponse(req);
        System.out.println(resp.getText());
    }

    /**
     * Requests the local test page with the POST method, sending one
     * parameter, and prints the response text.
     *
     * @throws MalformedURLException if the URL is rejected by HttpUnit
     * @throws IOException           if the request fails
     * @throws SAXException          if the response cannot be parsed
     */
    public static void testPostMethod() throws MalformedURLException,
            IOException, SAXException {
        System.out.println("使用Post方式向服务器发送数据,然后获取网页内容:");
        WebConversation wc = new WebConversation();
        WebRequest req = new PostMethodWebRequest(
                "http://localhost:8080/test.html");
        req.setParameter("hsyj", "test");
        // req.setParameter("password", "111111");
        WebResponse resp = wc.getResponse(req);
        System.out.println(resp.getText());
    }

    /**
     * Simulates a click on the first link of the local test page whose text
     * contains "阅读" and prints the page the conversation ends up on.
     *
     * <p>If the page has no such link, a message is printed and nothing is
     * clicked (previously this case ended in a NullPointerException).
     *
     * @throws MalformedURLException if the URL is rejected by HttpUnit
     * @throws IOException           if a request fails
     * @throws SAXException          if a response cannot be parsed
     */
    public static void testClickLink() throws MalformedURLException,
            IOException, SAXException {
        System.out.println("获取页面中链接指向页面的内容:");
        WebConversation wc = new WebConversation();
        WebResponse resp = wc.getResponse("http://localhost:8080/test.html");
        WebLink link = resp.getLinkWith("阅读");
        if (link == null) {
            System.out.println("页面中没有找到包含\"阅读\"的链接");
            return;
        }
        link.click();
        WebResponse nextLink = wc.getCurrentPage();
        System.out.println(nextLink.getText());
    }

    /**
     * Prints the local table page, then every cell of its first table.
     *
     * <p>A page without any table, or a table without rows, no longer causes
     * an ArrayIndexOutOfBoundsException: the former prints a message, the
     * latter simply prints no cells.
     *
     * @throws MalformedURLException if the URL is rejected by HttpUnit
     * @throws IOException           if the request fails
     * @throws SAXException          if the response cannot be parsed
     */
    public static void testTableContent() throws MalformedURLException,
            IOException, SAXException {
        System.out.println("获取页面中表格的内容:");
        WebConversation wc = new WebConversation();
        WebResponse resp = wc.getResponse("http://localhost:8080/table.html");
        System.out.println(resp.getText());
        WebTable[] tables = resp.getTables();
        if (tables.length == 0) {
            System.out.println("页面中没有找到表格");
            return;
        }
        // Convert the table into a row-major array of cell texts.
        String[][] datas = tables[0].asText();
        // Use each row's own length rather than the first row's, so an empty
        // table or rows of differing width cannot index out of bounds.
        for (int i = 0; i < datas.length; i++) {
            for (int j = 0; j < datas[i].length; j++) {
                System.out.println("表格中第" + (i + 1) + "行第" + (j + 1) + "列的内容是:"
                        + datas[i][j]);
            }
        }
    }

    /**
     * Prints the local test page, then the name and current values of every
     * control in its first form.
     *
     * <p>The values of a control are a {@code String[]}; they are rendered
     * with {@code Arrays.toString} so the actual values are shown instead of
     * the array's identity string.
     *
     * @throws MalformedURLException if the URL is rejected by HttpUnit
     * @throws IOException           if the request fails
     * @throws SAXException          if the response cannot be parsed
     */
    public static void testHtmlContentForm() throws MalformedURLException,
            IOException, SAXException {
        System.out.println("获取页面中表单的内容:");
        WebConversation wc = new WebConversation();
        WebResponse resp = wc.getResponse("http://localhost:8080/test.html");
        System.out.println(resp.getText());
        WebForm[] forms = resp.getForms();
        if (forms.length == 0) {
            System.out.println("页面中没有找到表单");
            return;
        }
        // Take the first form on the page.
        WebForm webForm = forms[0];
        // Names of all controls in that form.
        String[] pNames = webForm.getParameterNames();
        for (int i = 0; i < pNames.length; i++) {
            System.out.println("第" + (i + 1) + "个控件的名字是" + pNames[i] + ",里面的内容是"
                    + java.util.Arrays.toString(webForm.getParameterValues(pNames[i])));
        }
    }

    /**
     * Runs the selected demo; uncomment another call to try a different one.
     *
     * @param args unused
     * @throws MalformedURLException if a URL is rejected by HttpUnit
     * @throws IOException           if a request fails
     * @throws SAXException          if a response cannot be parsed
     */
    public static void main(String[] args) throws MalformedURLException,
            IOException, SAXException {
        testGetHtmlContent();
        // testGetMethod();
        // testPostMethod();
        // testClickLink();
        // testTableContent();
        // testHtmlContentForm();
    }
}
刚刚接触 Java,还不是很懂。之前用 WebClient 类写的时候,没有出现过类似的问题。