求助、提示异常内存溢出

zxczxcvvvvv 2013-05-04 02:23:54
我想利用URL类和网页进行连接并且从网页中抓取部分文本存在本地。代码如下
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
public class wangyi{
public static ArrayList<String> NewUrls = new ArrayList<String>();
public static String Nowurl;
public static int WebIndex = 2501;
public static String fPath = "F:/JAVA代码存储/163-travel/data";
public static int num = 2501;
public static int num2 = 5000;
public static StringBuffer Webcode = new StringBuffer();
static String GetWebcode(String Nowurl) // 通过url得到网页源码
{
Webcode.delete(0, Webcode.length());
BufferedReader reader = null;
try {
URL nowurl;
nowurl = new URL(Nowurl);
URLConnection Conn;
Conn = nowurl.openConnection();
Conn.setDoInput(true);
InputStreamReader read = new InputStreamReader(
Conn.getInputStream());
reader = new BufferedReader(read);
String str;
while ((str = reader.readLine()) != null) {
str = str + "\n";
Webcode.append(str);
}
read.close();
reader.close();
} catch (IOException e) {
e.printStackTrace();
System.out.println(Nowurl);
} finally { }
return Webcode.toString();
}
static void DrawUrl(String Precode)// PreCode为网页源码
{
String link = "";
String temple = "";
String reg1 = "http://travel.163";
int index0 = Precode.indexOf(reg1);
int index1 = -1;//链接之前的双引号
int index2 = -1;//链接之后的双引号
int rebegin = 0;//用于表示下一次开始的起点
//新的提取url的部分
rebegin = 0;
while(index0 >0){
temple = Precode.substring(rebegin+1, index0 );
index1 = temple.lastIndexOf('"');
index2 = Precode.indexOf('"', index0);
link = Precode.substring(index0,index2);
if(index2>index1&&index1>0&&link.contains("http")&&link.length()<255)
if (!NewUrls.contains(link))
NewUrls.add(link);
rebegin = index2;
index0 = Precode.indexOf(reg1, rebegin);
}
}
static String GetNewUrl(ArrayList<String> al) {
String tmpAUrl = al.get(0);
return tmpAUrl;
}
static void removeurl(ArrayList<String> al) {
al.remove(0);
}
static String Drawtext1(String reg1, String reg2, String PreCode) // 只抓取两个标签之间内容并且去除空格
{
String temple = "";
int index1 = -1;
int index2 = -1;
int length1 = reg1.length();
index1 = PreCode.indexOf(reg1);
index2 = PreCode.indexOf(reg2);
if (index1 < 0 || index2 < 0)
return "";
temple = PreCode.substring(index1 + length1, index2);
String kongge = "\\s*";
temple = temple.replaceAll(kongge, "");
return temple;
}
static String Drawtext2(String reg1, String reg2, String PreCode)// 在Drawtext1的基础上抓起<p><\p>之间内容,然后去除其他标签
{
String result = "";
result = Drawtext1(reg1, reg2, PreCode);
String lastreg = "<[^>]+>";
result = result.replaceAll(lastreg, "");
lastreg = "\\pP|\\pS|\\pC|\\d";
result = result.replaceAll(lastreg,",");
lastreg = ",{2,}";
result = result.replaceAll(lastreg,"");
String ereg = "([A-Za-z]|&)*";
result = result.replaceAll(ereg, "");
return result;
}
static void DrawTCode(String PreCode) // 网页源码为参数,抓取旅游内容
{
String result = "";
result = result + Drawtext1("<title>", "</title>", PreCode);
System.out.println(result);
result = result + '\n';
result = result + Nowurl + '\n';
result = result + Drawtext2("ep-summary","<!-- 分页 -->",PreCode);
save(result);
}
static void save(String text) // 向文件夹中存入文本
{
try {
File file = new File(fPath, WebIndex + ".txt");
if (!file.exists()) {
file.getParentFile().mkdirs();
}
file.createNewFile();
BufferedWriter bw = new BufferedWriter(new FileWriter(file));
bw.write(text);
bw.close();
} catch (IOException e) {
e.printStackTrace();
System.out.println(Nowurl);
}
WebIndex++;
}
static boolean isshtml(String url) {
int lastd = url.lastIndexOf('.');
int size = url.length();
String end = url.substring(lastd + 1, size);
if (end.compareTo("html") == 0)
return true;
else
return false;
}
public static void main(String[] args) {
Nowurl = "http://travel.163.com/";
DrawUrl(GetWebcode(Nowurl));
while(WebIndex<=num2){
while (NewUrls.size() > 0) {
Nowurl = GetNewUrl(NewUrls);
DrawUrl(GetWebcode(Nowurl));//有可能输出异常信息,在GetWebcode中
System.out.println(num);
System.out.println(Nowurl);
if (isshtml(Nowurl) == true&&
Nowurl.matches(".*\\d{2}/\\d{4}/\\d{2}.*")) {
num++;
DrawTCode(GetWebcode(Nowurl));
removeurl(NewUrls);
} else {
removeurl(NewUrls);
}
}
}
}
}

设计文件和网络的函数我用颜色标出来了。报出的异常信息如下:
Exception in thread "main" java.lang.OutOfMemoryError: Java heap space
at java.util.Arrays.copyOfRange(Arrays.java:3209)
at java.lang.String.<init>(String.java:216)
at java.lang.StringBuffer.toString(StringBuffer.java:585)
at wangyi.GetWebcode(wangyi.java:42)
at wangyi.main(wangyi.java:151)
异常信息里标明的代码我用绿色标出了。
求大神解救,为何出现内存异常?
...全文
196 1 打赏 收藏 转发到动态 举报
写回复
用AI写文章
1 条回复
切换为时间正序
请发表友善的回复…
发表回复
zxczxcvvvvv 2013-05-04
  • 打赏
  • 举报
回复
我去,在代码里标颜色不好使啊

67,513

社区成员

发帖
与我相关
我的任务
社区描述
J2EE只是Java企业应用。我们需要一个跨J2SE/WEB/EJB的微容器,保护我们的业务核心组件(中间件),以延续它的生命力,而不是依赖J2SE/J2EE版本。
社区管理员
  • Java EE
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告
暂无公告

试试用AI创作助手写篇文章吧