81,094
社区成员
发帖
与我相关
我的任务
分享
// Download the page at htmlURL through HTMLParser and store its markup in html.
Parser parser = new Parser();
// Open the connection ourselves (instead of letting the parser do it) so the
// request can carry a browser-like User-Agent header.
URL url = new URL(htmlURL);
HttpURLConnection httpUrlConnection = (HttpURLConnection) url.openConnection();
httpUrlConnection.setDoInput(true);
httpUrlConnection.setRequestMethod("GET");
httpUrlConnection.setRequestProperty("User-Agent",
        "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
// The JDK default for both timeouts is 0 (wait forever); bound them so an
// unresponsive server cannot block this thread indefinitely. On expiry a
// java.net.SocketTimeoutException (an IOException) is thrown.
httpUrlConnection.setConnectTimeout(10000); // milliseconds
httpUrlConnection.setReadTimeout(30000);    // milliseconds
parser.setConnection(httpUrlConnection);
// NOTE(review): this hands the parser's current encoding straight back to it,
// so it does not appear to change the charset. If a specific charset is
// required, pass it explicitly here -- confirm the intent.
parser.setEncoding(parser.getEncoding());
// Match nodes of class Html (not link tags, despite what the old comment said).
NodeFilter filter = new NodeClassFilter(Html.class);
// Collect every node in the page that satisfies the filter.
NodeList list = parser.extractAllNodesThatMatch(filter);
// Serialize the matched nodes back to HTML text.
html = list.toHtml();