请帮我看看这个程序,为什么数据抓不出来呢

xiaopingshen 2008-12-05 08:45:58
我这个程序是要将网页上的数据抓出来,可是我怎么弄也只能抓到网址,里面的内容却读不出来,请各位高手帮我看看哈
以下是程序代码:
import java.io.*;
import java.net.*;
import java.util.*;
import Catch.Catchwork;

/**
 * Crawls the ccen.net company search result pages, follows every company
 * sub-domain found there to its "contactus" page, extracts the contact fields
 * line by line and appends one record per company to {@code trade4.txt}.
 *
 * <p>Extraction is purely line based: a field is only found when its label and
 * its end marker sit on the same line of the downloaded HTML.
 *
 * <p>NOTE(review): the pages are decoded with the platform default charset
 * because no charset is passed to {@link InputStreamReader}. If the site's
 * encoding differs from the platform default, none of the Chinese labels below
 * will ever match - confirm the page encoding and pass it explicitly.
 */
public class trade {

    /** Marker that precedes a company sub-domain on a search result line. */
    private static final String COMPANY_LINK_PREFIX = "a target=\"_blank\" href='http://";

    /**
     * Walks result pages 2..551 and writes one line per company to trade4.txt.
     *
     * @param args unused
     * @throws Exception if the output file cannot be created or a result page
     *                   cannot be opened or read
     */
    public static void main(String args[]) throws Exception {
        PrintWriter fout = new PrintWriter(new FileWriter("trade4.txt"));
        try {
            for (int i = 2; i < 552; i++) {
                URL url = new URL("http://www.ccen.net/company/search.html?q=%u516C%u53F8&page=" + i);
                System.out.println("现在抓到这一页:" + url);
                ArrayList list = readCompanyHosts(url);
                for (int j = 0; j < list.size(); j++) {
                    String host = (String) list.get(j);
                    try {
                        scrapeCompany(host, fout);
                    } catch (Exception e) {
                        // Best effort: one broken company page must not stop the crawl,
                        // but the cause is reported instead of being silently dropped.
                        System.out.println("该页面无法打开!");
                        System.err.println(host + ": " + e);
                    }
                }
            }
        } finally {
            // Always release the output file, even when a result page fails.
            fout.close();
        }
    }

    /** Reads one search result page and returns the company sub-domain names found on it. */
    private static ArrayList readCompanyHosts(URL url) throws IOException {
        ArrayList hosts = new ArrayList();
        BufferedReader br = new BufferedReader(new InputStreamReader(url.openStream()));
        try {
            String s;
            while ((s = br.readLine()) != null) {
                if (s.indexOf(COMPANY_LINK_PREFIX) != -1) {
                    hosts.add(Catchwork.CatchFilework(s, COMPANY_LINK_PREFIX, ".ccen.net"));
                }
            }
        } finally {
            br.close();
        }
        return hosts;
    }

    /** Downloads one company's contact page, extracts its fields and appends a record to fout. */
    private static void scrapeCompany(String host, PrintWriter fout) throws IOException {
        URL url2 = new URL("http://" + host + ".ccen.net/?type=contactus");
        System.out.println("正在抓取这家公司:" + url2);

        String phone = "", company = "", adress = "", fax = "", email = "", web = "", zip = "", person = "", tel = "";
        BufferedReader br2 = new BufferedReader(new InputStreamReader(url2.openStream()));
        try {
            String ss;
            while ((ss = br2.readLine()) != null) {
                if (ss.indexOf("class=\"STYLE12\">联系人:") != -1) {
                    // The contact name sits on the line after its label.
                    ss = br2.readLine();
                    if (ss == null) {
                        break; // page ended right after the label
                    }
                    person = orEmpty(Catchwork.CatchFilework(ss, " <span class=\"STYLE12\">", " </span> </td>"));
                    System.out.println("联系人:" + person);
                }
                if (ss.indexOf("企业名称: ") != -1) {
                    company = orEmpty(Catchwork.CatchFilework(ss, "企业名称: ", " <br>"));
                    System.out.println("公司名称:" + company);
                }
                if (ss.indexOf("电话: ") != -1) {
                    tel = orEmpty(Catchwork.CatchFilework(ss, "电话: ", " <br>"));
                    System.out.println("电话:" + tel);
                }
                if (ss.indexOf("传真: ") != -1) {
                    fax = orEmpty(Catchwork.CatchFilework(ss, "传真: ", " <br>"));
                    System.out.println("传真:" + fax);
                }
                if (ss.indexOf("公司网址:") != -1) {
                    web = orEmpty(Catchwork.CatchFilework(ss, "target=\"_blank\">", " </a> <br>"));
                    System.out.println("网址:" + web);
                }
                if (ss.indexOf("详细地址: ") != -1) {
                    adress = orEmpty(Catchwork.CatchFilework(ss, "详细地址: ", " <br>"));
                    System.out.println("公司地址:" + adress);
                }
                if (ss.indexOf("邮政编码:") != -1) {
                    zip = orEmpty(Catchwork.CatchFilework(ss, "邮政编码:", " <br>"));
                    System.out.println("邮编:" + zip);
                }
                if (ss.indexOf("联系手机: ") != -1) {
                    // Fix: this value used to be stored in 'email', so the mobile
                    // number was never printed or written and could clobber the e-mail.
                    phone = orEmpty(Catchwork.CatchFilework(ss, "联系手机: ", " <BR>"));
                    System.out.println("手机:" + phone);
                }
                if (ss.indexOf("e-mail: ") != -1) {
                    // NOTE(review): the start marker is one specific address, so this
                    // presumably only matches a single company - confirm against the
                    // real page markup and replace it with a generic marker.
                    email = orEmpty(Catchwork.CatchFilework(ss, "iqiu@yiqiu.com'>", " </a> </span>"));
                    System.out.println("电子邮箱:" + email);
                }
            }
        } finally {
            br2.close();
        }

        // Pages without a company name are treated as "nothing found" and skipped.
        if (company.equals("")) {
            return;
        }
        fout.print(company + " ");
        fout.print(person + " ");
        fout.print(adress + " ");
        fout.print(tel + " ");
        fout.print(fax + " ");
        fout.print(zip + " ");
        fout.print(email + " ");
        fout.print(phone + " ");
        fout.println(web + " ");
        fout.flush();
    }

    /** Maps a failed extraction (null) to the empty string so "null" never reaches the output. */
    private static String orEmpty(String value) {
        return value == null ? "" : value;
    }
}
这个是那个catch包的程序
package Catch;

import java.io.*;
import java.net.*;
import java.util.*;

/**
 * Small static helpers for downloading a page and cutting substrings out of
 * single lines of text.
 */
public class Catchwork {

    /**
     * Downloads {@code urlStr} with an HTTP GET and writes the response body to
     * the file {@code outPath}. Malformed URLs and I/O failures are printed to
     * standard error and otherwise ignored.
     *
     * @param urlStr  address to fetch; the connection is cast to
     *                {@link HttpURLConnection}, so it must be an HTTP(S) URL
     * @param outPath path of the file to (over)write
     */
    public static void DownLoadPages(String urlStr, String outPath) {
        HttpURLConnection httpConn = null;
        InputStream in = null;
        OutputStream out = null;
        try {
            URL url = new URL(urlStr);
            httpConn = (HttpURLConnection) url.openConnection();
            // Kept from the original: this is the JVM-wide static setting.
            HttpURLConnection.setFollowRedirects(true);
            httpConn.setRequestMethod("GET");
            httpConn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows 2000)");

            in = httpConn.getInputStream();
            out = new FileOutputStream(new File(outPath));

            // Copy in chunks instead of one unbuffered byte per call.
            byte[] buffer = new byte[8192];
            int count;
            while ((count = in.read(buffer)) != -1) {
                out.write(buffer, 0, count);
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Each resource is released on its own: previously a null 'out'
            // threw first and skipped closing 'in' and the disconnect.
            closeQuietly(out);
            closeQuietly(in);
            if (httpConn != null) {
                httpConn.disconnect();
            }
        }
    }

    /** Closes a resource if it was opened, reporting (not propagating) a failure. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Cuts a piece out of {@code s} using a start marker {@code a} and an end
     * marker {@code b}. A null or empty marker means "not given".
     *
     * <ul>
     *   <li>both given: the text after the first {@code a}, up to the next
     *       {@code b}, trimmed; if {@code b} does not follow, the whole
     *       remainder after {@code a} is returned as is (not trimmed);</li>
     *   <li>only {@code b}: the text before the first {@code b}, trimmed;</li>
     *   <li>only {@code a}: the text after the first {@code a}, trimmed.</li>
     * </ul>
     *
     * @param s line to search; null yields null
     * @param a start marker, or empty/null for "from the beginning"
     * @param b end marker, or empty/null for "to the end"
     * @return the extracted text, or null when {@code s} is null, neither
     *         marker is given, or the required marker is not found
     */
    public static String CatchFilework(String s, String a, String b) {
        if (s == null) {
            return null;
        }
        // Compare by content; the old reference comparison against ""
        // misclassified empty strings that were not compile-time literals.
        boolean hasStart = a != null && a.length() > 0;
        boolean hasEnd = b != null && b.length() > 0;

        if (hasStart && hasEnd) {
            int start = s.indexOf(a);
            if (start == -1) {
                return null;
            }
            String rest = s.substring(start + a.length());
            int end = rest.indexOf(b);
            // Without an end marker the remainder is handed back untrimmed.
            return end == -1 ? rest : rest.substring(0, end).trim();
        }
        if (hasEnd) {
            int end = s.indexOf(b);
            return end == -1 ? null : s.substring(0, end).trim();
        }
        if (hasStart) {
            int start = s.indexOf(a);
            return start == -1 ? null : s.substring(start + a.length()).trim();
        }
        return null;
    }

    /**
     * Splits a dash-separated phone number into "prefix number", joined by a
     * single space. The prefix is empty when none is recognised.
     *
     * <p>A first segment shorter than 6 characters that contains "86" is
     * dropped and the next segment is then considered as the prefix (when it is
     * shorter than 6 characters). A first segment of 6 or more characters is
     * returned alone; whatever follows its dash is discarded.
     *
     * @param temp raw phone text; must not be null
     * @return prefix (possibly empty), one space, then the number
     */
    public static String CatchTelwork(String temp) {
        String prefix = "";
        String number;

        int firstDash = temp.indexOf("-");
        boolean splittable = firstDash != -1 && firstDash + 1 < temp.length();
        if (!splittable) {
            return prefix + " " + temp;
        }

        String head = temp.substring(0, firstDash);
        String tail = temp.substring(firstDash + 1);
        boolean shortHead = head.length() < 6;

        if (shortHead && head.indexOf("86") != -1) {
            int secondDash = tail.indexOf("-");
            if (secondDash != -1 && secondDash + 1 < tail.length()) {
                String middle = tail.substring(0, secondDash);
                String last = tail.substring(secondDash + 1);
                if (middle.length() < 6) {
                    prefix = middle;
                    number = last;
                } else {
                    number = middle + "-" + last;
                }
            } else {
                number = tail;
            }
        } else if (shortHead) {
            prefix = head;
            number = tail;
        } else {
            number = head;
        }
        return prefix + " " + number;
    }

    public static void main(String args[]) throws Exception {

    }
}
...全文
132 10 打赏 收藏 转发到动态 举报
写回复
用AI写文章
10 条回复
切换为时间正序
请发表友善的回复…
发表回复
SylvanLiu 2008-12-05
  • 打赏
  • 举报
回复
有点乱
pepsighost 2008-12-05
  • 打赏
  • 举报
回复
又看了一下。代码不好理解了
tianice 2008-12-05
  • 打赏
  • 举报
回复
把代码放到[code]代码[/code]中,看起来太费事了
pepsighost 2008-12-05
  • 打赏
  • 举报
回复
不好理解。帮顶
lshy168 2008-12-05
  • 打赏
  • 举报
回复
[Quote=引用 2 楼 JackyNone 的回复:]
而且代码的可读性比较糟糕。
[/Quote]
正解,个人觉得一看就不想看了。因为代码可读性真的很差。
JackyNone 2008-12-05
  • 打赏
  • 举报
回复
而且代码的可读性比较糟糕。
yjfjebj789 2008-12-05
  • 打赏
  • 举报
回复
你不应该读一行就用 indexOf 判断:如果要查的那个字符串一半在上一行、一半在下一行,你就找不到了。你应该把内容全部取出来之后,再用 indexOf、substring 或者 replace 进行处理。还有,你的字符编码也没有设置。
lkw5657 2008-12-05
  • 打赏
  • 举报
回复
94
[Quote=引用 3 楼 lshy168 的回复:]
引用 2 楼 JackyNone 的回复:
而且代码的可读性比较糟糕。

正解,个人觉得一看就不想看了。因为代码可读性真的很差。
[/Quote]
myjava_024 2008-12-05
  • 打赏
  • 举报
回复
弄个搜索引擎的东东,很简单的,open上面有很多的
kbyst 2008-12-05
  • 打赏
  • 举报
回复
抓网页内容一般都用正则吧
用string自己写确实累了点 有重新发明轮子之嫌

62,614

社区成员

发帖
与我相关
我的任务
社区描述
Java 2 Standard Edition
社区管理员
  • Java SE
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告
暂无公告

试试用AI创作助手写篇文章吧