java爬虫抓取天猫商品信息重定向问题求救！

sinat_34526074 2016-08-17 04:00:58

我想访问天猫某店铺的商品页面，然后把各商品名字保存下来，但是没成功。。代码如下：

import java.io.*;

import java.net.*;

import java.util.regex.*;

public class test {

 static String SendGet(String url) {

  // 定义一个字符串用来存储网页内容

  String result = "";

  // 定义一个缓冲字符输入流

  BufferedReader in = null;

  try {

   // 将string转成url对象

   URL realUrl = new URL(url);

   // 初始化一个链接到那个url的连接

   URLConnection connection = realUrl.openConnection();

   // 开始实际的连接

   connection.connect();

   // 初始化 BufferedReader输入流来读取URL的响应

   in = new BufferedReader(new InputStreamReader(

     connection.getInputStream(),"UTF-8"));

   // 用来临时存储抓取到的每一行的数据

   String line;

   while ((line = in.readLine()) != null) {

    // 遍历抓取到的每一行并将其存储到result里面

    result += line;

   }

  } catch (Exception e) {

   System.out.println("发送GET请求出现异常！" + e);

   e.printStackTrace();

  }

  // 使用finally来关闭输入流

  finally {

   try {

    if (in != null) {

     in.close();

    }

   } catch (Exception e2) {

    e2.printStackTrace();

   }

  }

  return result;

 }

 static String RegexString(String targetStr, String patternStr) {

  // 定义一个样式模板，此中使用正则表达式，括号中是要抓的内容

  // 相当于埋好了陷阱匹配的地方就会掉下去

  Pattern pattern = Pattern.compile(patternStr);

  // 定义一个matcher用来做匹配

  Matcher matcher = pattern.matcher(targetStr);

  // 如果找到了

  if (matcher.find()) {

   // 打印出结果

   return matcher.group(3);

  }

  return "Nothing";

 }

 public static void main(String[] args) {

	  // 定义即将访问的链接

	  String url = "https://zj10086.tmall.com/category-511237447.htm?spm=a1z10.1-b.w11522107-14547707429.18.gdMmiH&search=y&catName=%C2%F2%CA%D6%BB%FA&scene=taobao_shop";

	  // 访问链接并获取页面内容

	  String result = SendGet(url);

	  // 使用正则匹配图片的src内容

	  String imgSrc = RegexString(result, "(.*)(item-name)(.*?)(</a>)(.*) ");

	  // 打印结果

	  System.out.println(imgSrc);

	 }

}

执行后得到的是：
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN"><html><head><title>302 Found</title></head><body bgcolor="white"><h1>302 Found</h1><p>The requested resource resides temporarily under a different URI.</p><hr/>Powered by Tengine</body></html>

求各位帮我看看呀，是不是要加reference？怎么加呀？谢谢

...全文

355 2 打赏收藏转发到动态举报

写回复

用AI写文章

2 条回复

切换为时间正序

请发表友善的回复…

发表回复

_哼哼哈嘿 2019-03-26

打赏
举报

没看内容，重定向加个循环判断就行了

a_b_a_b_a_b_a_b 2016-08-17

打赏
举报

public static String readUrl(String urlString, String encoding) throws IOException {
        URL url = new URL(urlString);
        InputStream is = null;
        ByteArrayOutputStream os = null;
        try {
//            is = url.openStream();
            HttpURLConnection uc = (HttpURLConnection) url.openConnection();
            uc.setConnectTimeout(30000);
            uc.setReadTimeout(30000);
            uc.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0");
            uc = (HttpURLConnection) reload(uc);
            is = uc.getInputStream();
            os = new ByteArrayOutputStream(is.available());
            int len;
            byte[] bytes = new byte[1024 * 8];
            while((len = is.read(bytes)) != -1) {
                os.write(bytes, 0, len);
            }
            if (encoding.equals("auto")){
                Document document = Jsoup.parseBodyFragment(os.toString());//Jsoup.parse(url, 1000 * 60 * 30);
                Elements elements = document.select("meta");
                Iterator<Element> i = elements.iterator();
                while(i.hasNext()) {
                    Element element = i.next();
                    if(element.attr("http-equiv").equals("Content-Type")) {
                        String content = element.attr("content").trim();
                        int index = content.indexOf("=");
                        encoding = content.substring(index + 1);
                        break;
                    }
                }
                if(encoding.equals("auto")){
                    encoding = "gb2312";
                }
                return new String(os.toByteArray(),encoding);
            }else {
                return new String(os.toByteArray(), encoding);
            }
        } catch (Exception e) {
            return "";
        } finally {
            if(os != null) os.close();
            if(is != null) is.close();
        }
    }

    private static URLConnection reload(URLConnection uc) throws Exception {

        HttpURLConnection huc = (HttpURLConnection) uc;

        if (huc.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP
                || huc.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM)// 302, 301
            return reload(new URL(huc.getHeaderField("location")).openConnection());

        return uc;
    }

你试试这样子呢？