81,095
社区成员
发帖
与我相关
我的任务
分享
import java.io.*;
import java.net.*;
import java.util.regex.*;
/**
 * Small scraping demo: downloads a page with an HTTP GET and extracts one fragment of it
 * with a regular expression.
 */
public class test {

    /** Value returned by {@link #RegexString} when nothing could be extracted. */
    private static final String NO_MATCH = "Nothing";

    /**
     * Fetches the resource at {@code url} and returns its body decoded as UTF-8.
     *
     * <p>Lines are concatenated without their line terminators, so the result is a single
     * line of text. Any failure is reported on standard output / standard error and the
     * text read so far (possibly empty) is returned instead of propagating the exception.
     *
     * @param url absolute URL to fetch
     * @return the page content with line breaks removed; empty if nothing could be read
     */
    static String SendGet(String url) {
        // A StringBuilder avoids re-copying the whole page for every line appended.
        StringBuilder page = new StringBuilder();
        BufferedReader in = null;
        try {
            URL realUrl = new URL(url);
            URLConnection connection = realUrl.openConnection();
            connection.connect();
            in = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(), "UTF-8"));
            String line;
            while ((line = in.readLine()) != null) {
                page.append(line);
            }
        } catch (Exception e) {
            // Best-effort fetch: report the problem and fall through to return what we have.
            System.out.println("发送GET请求出现异常!" + e);
            e.printStackTrace();
        } finally {
            try {
                if (in != null) {
                    in.close();
                }
            } catch (Exception e2) {
                e2.printStackTrace();
            }
        }
        return page.toString();
    }

    /**
     * Returns the text captured by the third group of the first match of {@code patternStr}
     * in {@code targetStr}.
     *
     * @param targetStr  text to search
     * @param patternStr regular expression; its third capturing group is the wanted text
     * @return the captured text, or {@code "Nothing"} when the pattern does not match,
     *         defines fewer than three groups, or its third group did not take part in
     *         the match
     */
    static String RegexString(String targetStr, String patternStr) {
        Matcher matcher = Pattern.compile(patternStr).matcher(targetStr);
        // Guard the group index: group(3) throws if the pattern has fewer than 3 groups.
        if (!matcher.find() || matcher.groupCount() < 3) {
            return NO_MATCH;
        }
        String captured = matcher.group(3);
        // group() yields null for a group that did not participate (e.g. inside an alternation).
        if (captured == null) {
            return NO_MATCH;
        }
        return captured;
    }

    /**
     * Fetches a fixed shop category page and prints the text found between the first
     * matched {@code item-name} marker and the following {@code </a>}.
     */
    public static void main(String[] args) {
        // Page to download.
        String url = "https://zj10086.tmall.com/category-511237447.htm?spm=a1z10.1-b.w11522107-14547707429.18.gdMmiH&search=y&catName=%C2%F2%CA%D6%BB%FA&scene=taobao_shop";
        // Download the page content.
        String result = SendGet(url);
        // Group 3 of this pattern is the text between "item-name" and "</a>".
        String imgSrc = RegexString(result, "(.*)(item-name)(.*?)(</a>)(.*) ");
        // Print what was extracted.
        System.out.println(imgSrc);
    }
}
/**
 * Downloads the resource at {@code urlString} and decodes its bytes into a string.
 *
 * <p>When {@code encoding} is {@code "auto"} the charset is taken from the page's own
 * {@code <meta>} tags: either {@code <meta charset="...">} or the {@code charset=} parameter
 * of {@code <meta http-equiv="Content-Type" content="...">}. If neither declares a charset,
 * {@code gb2312} is used.
 *
 * @param urlString absolute URL to download
 * @param encoding  charset name used to decode the body, or {@code "auto"} to detect it
 * @return the decoded body, or an empty string if the download or decoding fails
 * @throws IOException if {@code urlString} is not a valid URL or a stream cannot be closed
 */
public static String readUrl(String urlString, String encoding) throws IOException {
    URL url = new URL(urlString);
    InputStream is = null;
    ByteArrayOutputStream os = null;
    try {
        HttpURLConnection uc = (HttpURLConnection) url.openConnection();
        uc.setConnectTimeout(30000);
        uc.setReadTimeout(30000);
        uc.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0");
        uc = (HttpURLConnection) reload(uc);
        is = uc.getInputStream();
        os = new ByteArrayOutputStream(is.available());
        int len;
        byte[] bytes = new byte[1024 * 8];
        while ((len = is.read(bytes)) != -1) {
            os.write(bytes, 0, len);
        }
        if (encoding.equals("auto")) {
            // ISO-8859-1 maps every byte to a char, so the ASCII markup of the <meta> tags
            // survives whatever the real page encoding turns out to be.
            Document document = Jsoup.parseBodyFragment(os.toString("ISO-8859-1"));
            Elements elements = document.select("meta");
            Iterator<Element> i = elements.iterator();
            while (i.hasNext()) {
                Element element = i.next();
                // HTML5 form: <meta charset="utf-8">
                String declared = element.attr("charset").trim();
                if (declared.isEmpty()
                        && element.attr("http-equiv").equalsIgnoreCase("Content-Type")) {
                    // Legacy form: content="text/html; charset=utf-8". Charset names are
                    // case-insensitive, so working on the lower-cased value is safe.
                    String content = element.attr("content").toLowerCase(java.util.Locale.ROOT);
                    int index = content.indexOf("charset=");
                    if (index >= 0) {
                        declared = content.substring(index + "charset=".length());
                        // Drop any further ";param=value" that follows the charset.
                        int end = declared.indexOf(';');
                        if (end >= 0) {
                            declared = declared.substring(0, end);
                        }
                        declared = declared.trim();
                    }
                }
                if (!declared.isEmpty()) {
                    encoding = declared;
                    break;
                }
            }
            if (encoding.equals("auto")) {
                // Nothing declared in the page: fall back to the default.
                encoding = "gb2312";
            }
        }
        return new String(os.toByteArray(), encoding);
    } catch (Exception e) {
        // Best-effort download: any failure (network, redirect, unknown charset) yields "".
        return "";
    } finally {
        if (os != null) {
            os.close();
        }
        if (is != null) {
            is.close();
        }
    }
}
/**
 * Follows HTTP redirects starting from {@code uc} and returns the connection that finally
 * answered with a non-redirect status.
 *
 * <p>Handles 301, 302, 303, 307 and 308. A relative {@code Location} header is resolved
 * against the URL of the redirecting connection, and the connect/read timeouts of that
 * connection are carried over to the next one.
 * NOTE(review): request headers set by the caller (e.g. User-Agent) are not carried over
 * to the follow-up connection — confirm whether the target sites need them.
 *
 * @param uc connection to start from; returned unchanged if it is not an HTTP connection,
 *           does not redirect, or redirects without a {@code Location} header
 * @return the connection to read the final response from
 * @throws Exception if a connection cannot be opened or queried, or if more than 10
 *                   redirects are chained (reported as an {@link IOException})
 */
private static URLConnection reload(URLConnection uc) throws Exception {
    // Upper bound so a redirect loop cannot recurse/iterate forever.
    final int maxRedirects = 10;
    URLConnection current = uc;
    for (int hop = 0; hop <= maxRedirects; hop++) {
        if (!(current instanceof HttpURLConnection)) {
            return current;
        }
        HttpURLConnection huc = (HttpURLConnection) current;
        int status = huc.getResponseCode();
        boolean redirect = status == HttpURLConnection.HTTP_MOVED_TEMP // 302
                || status == HttpURLConnection.HTTP_MOVED_PERM        // 301
                || status == HttpURLConnection.HTTP_SEE_OTHER         // 303
                || status == 307
                || status == 308;
        if (!redirect) {
            return current;
        }
        String location = huc.getHeaderField("location");
        if (location == null || location.trim().isEmpty()) {
            // Nowhere to go: hand back the redirect response itself.
            return current;
        }
        // Resolving against the current URL makes relative Location values work.
        URL target = new URL(huc.getURL(), location.trim());
        URLConnection next = target.openConnection();
        next.setConnectTimeout(huc.getConnectTimeout());
        next.setReadTimeout(huc.getReadTimeout());
        // The redirect response is no longer needed; release its connection.
        huc.disconnect();
        current = next;
    }
    throw new IOException("Too many redirects (more than " + maxRedirects
            + ") starting from " + uc.getURL());
}
你试试这样子呢?