java新闻抓取程序图片下载不全的问题

lyboyc 2008-11-21 06:08:27

我做了个程序把新浪上的天气新闻抓过来存到本地，考虑访问速度问题，新闻中的图片也要保存到本地。
程序如下



package vnet.com.weather1;



import java.io.BufferedReader;

import java.io.ByteArrayOutputStream;

import java.io.File;

import java.io.FileWriter;

import java.io.IOException;

import java.io.InputStream;

import java.io.InputStreamReader;

import java.io.PrintWriter;

import java.net.URL;

import java.net.URLConnection;

import java.util.regex.Matcher;

import java.util.regex.Pattern;



import org.apache.commons.logging.Log;

import org.apache.commons.logging.LogFactory;



import vnet.com.update.Getdata;

/**

 * 正则方式抓取新浪天气新闻上的新闻

 * 地址http://weather.news.sina.com.cn/weather/news/index.html

 * @param args

 */

public class Newlist {

	private static final Log log = LogFactory.getLog(Newlist.class);

	/**

	 * 测试

	 * @param args

	 */

	public  static void main(String args[]){

		Newlist n=new Newlist();

		String[] k=n.getNewList();

		for (int i=0;i<k.length;i++){

		System.out.println(k[i].replace("href=\"", "href=\"newinfo2.jsp?url="));

		}

		String[] m=n.getNewinfo("news/2008/1119/35261.html");

		for (int l=0;l<m.length;l++){		

			System.out.println(m[l]);	

		}

		

	}

	/**

	 * 由url地址获得新闻内容string[]

	 * 新闻中的图片下载到本地，文中新闻地址改成本地地址

	 * @param url

	 * @return

	 */

	public String[] getNewinfo(String url){

		String URL="http://weather.news.sina.com.cn/"+url;

		//30是指取30段满足给出的正则条件的字符串，如果只找出10个，那数组后面的全为null

		String[] s = analysis("<p>(.*?)</p>" , getContent(URL) , 30);

		for (int i=0;i<s.length;i++){

			Pattern sp = Pattern.compile("src=\"(.*?)\"");

		    Matcher matcher = sp.matcher(s[i]);

		    if (matcher.find()){

		    	

		    	 String imageurl=analysis("src=\"(.*?)\"" , s[i] , 1)[0];

		    	 if(!imageurl.startsWith("http://")){

		    		 imageurl="http://weather.news.sina.com.cn/"+imageurl;

		    	  }

		    	System.out.println("新闻有图片:"+imageurl);

		    	String content=getContent(imageurl);

				  String[] images=imageurl.split("/");

				  String imagename=images[images.length-1];

				  System.out.println("图片名:"+imagename);

				  

				 

		try {

			File fwl = new File(imagename); 

			PrintWriter outl = new PrintWriter(fwl);

			outl.println(content);

			outl.close();

			} catch (IOException e) {

				// TODO Auto-generated catch block

				e.printStackTrace();

			}

			System.out.println("s[i]:"+s[i]);

			//修改文件图片地址

			s[i]=s[i].replace(analysis("src=\"(.*?)\"" , s[i] , 1)[0], imagename);

		    }

		}

		

		return s;

	}

	public  String[] getNewList(){

		String url="http://weather.news.sina.com.cn/weather/news/index.html";

		return getNewList(getContent(url));   	

    }



	private  String[] getNewList(String content ){

		//String[] s = analysis("align=\"center\" valign=\"top\"><img src=\"../images/a(.*?).gif\" width=\"70\" height=\"65\"></td>" , content , 50);	

		String[] s = analysis("<li>(.*?)</li>" , content , 50);

		

		return s;

	}

	private String[] analysis(String pattern, String match , int i){

		Pattern sp = Pattern.compile(pattern);

	    Matcher matcher = sp.matcher(match);

	    String[] content = new String[i];

	    for (int i1 = 0; matcher.find(); i1++){    	

	    	content[i1] = matcher.group(1);   	

	    }

	    //下面一段是为了剔除为空的串

	    int l=0;

	    for (int k=0;k<content.length;k++){

	    	if (content[k]==null){

	    		l=k;

	    		break;

	    	}

	    }

	    String[] content2;

	    if (l!=0){

	    	content2=new String[l];

	    	for (int n=0;n<l;n++){

	    		content2[n]=content[n];

	    	}

	    	 return content2;

	    }else{

	    	return content;	

	    }

	   

	}

	/**

	 * 由地址获取网页内容

	 * @param strUrl

	 * @return

	private String getContent(String strUrl){

		try{

			//URL url = new URL(strUrl);	

			//BufferedReader br = new BufferedReader(new InputStreamReader(url.openStream()));

			URLConnection uc = new URL(strUrl).openConnection(); 

			   //通过修改http头的User-Agent来伪装成是通过浏览器提交的请求

		      uc.setRequestProperty("User-Agent",  

				                    "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");		   	   

			  System.out.println("-----------------------------------------");  

			  System.out.println("Content-Length:     "+uc.getContentLength());  

			  System.out.println("Set-Cookie:     "+uc.getHeaderField("Set-Cookie"));  

			  System.out.println("-----------------------------------------"); 

			  //获取文件头信息

			  System.out.println("Header"+uc.getHeaderFields().toString());

			  System.out.println("-----------------------------------------");  

			BufferedReader br=new BufferedReader(new InputStreamReader(uc.getInputStream(), "gb2312")); 

			String s = "";

			StringBuffer sb=new StringBuffer();

			while((s = br.readLine())!=null){

				sb.append(s+"\r\n");

			}

			System.out.println("长度+"+sb.toString().length());

			

			return sb.toString();

		}catch(Exception e){

			return "error open url" + strUrl;

		}

	}

	*/



	public static  String getContent (String strUrl){

	    URLConnection uc = null;

	    String all_content=null;



	   

	try {

	           all_content =new  String();

	           URL url = new URL(strUrl);



	           uc = url.openConnection();

	           uc.setRequestProperty("User-Agent",  

				                    "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");		   	   

			  System.out.println("-----------------------------------------");  

			  System.out.println("Content-Length:     "+uc.getContentLength());  

			  System.out.println("Set-Cookie:     "+uc.getHeaderField("Set-Cookie"));  

			  System.out.println("-----------------------------------------"); 

			  //获取文件头信息

			  System.out.println("Header"+uc.getHeaderFields().toString());

			  System.out.println("-----------------------------------------");  

	           if (uc == null)

	               return null;



	           InputStream ins = uc.getInputStream();

	            ByteArrayOutputStream outputstream = new ByteArrayOutputStream();

	           byte[] str_b = new byte[1024];

	               int i = -1;

	               while ((i=ins.read(str_b)) > 0) {

	                outputstream.write(str_b,0,i);

	               }

	               all_content = outputstream.toString();

	              // System.out.println(all_content);



	       } catch (Exception e) {

	           e.printStackTrace();

	           log.error("获取网页内容出错");

	       }finally{

	    	   uc = null;

	       }

	      

	      // return new String(all_content.getBytes("ISO8859-1"));

	       System.out.println(all_content.length());

	       return all_content;

	   }

	  

}

测试通过，现在的问题是:图片下载不全，我用后面两种getContent方法下图片，下来的图片大小都和文件头里获得的Content-Length，也就是图片的实际大小不符，预览不了。
而且反复测试，两种方法每次下来的东西大小是固定的，所以重复下载没有用？
望朋友们指点下怎么解决。

...全文

1290 8 打赏收藏转发到动态举报

写回复

用AI写文章

8 条回复

切换为时间正序

请发表友善的回复…

发表回复

lyboyc 2008-11-27

打赏
举报



public  int saveImage (String strUrl){

	    URLConnection uc = null;

	try {

	           URL url = new URL(strUrl);

	           uc = url.openConnection();

	           uc.setRequestProperty("User-Agent",  

				                    "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");	

	           //uc.setReadTimeout(30000);

		 //获取图片长度  

		  //System.out.println("Content-Length:     "+uc.getContentLength()); 

		  //获取文件头信息

		   //System.out.println("Header"+uc.getHeaderFields().toString());		 

	           if (uc == null)

	               return 0;

	           InputStream ins = uc.getInputStream(); 

                 byte[] str_b = new byte[1024];         

	             int byteRead=0;                           

            	String[] images=strUrl.split("/");

		String imagename=images[images.length-1];

        	  File fwl = new File(imagename);

        	  FileOutputStream fos= new FileOutputStream(fwl); 

        		  while ((byteRead=ins.read(str_b)) > 0) {

        			  fos.write(str_b,0,byteRead);

  	               };

  	              fos.flush();  

	              fos.close();

	       } catch (Exception e) {

	           e.printStackTrace();

	           log.error("获取网页内容出错");

	       }finally{

	    	   uc = null;

	       }

	       return 1;

	   }