StringTokenizer
我最近在做一个抓取网页数据的程序,抓取数据时我是用 DOM 来实现的。数据抓取下来后存放在一个字符串里,我写了个方法把它们写入文本文件,但是里面有很多空格之类的空白字符,数据显得很乱,没有办法直接解析后写入数据库。于是我用 StringTokenizer 对这个字符串进行分割后存入一个数组,以方便提取我想要的数据写入数据库。可是这样做了之后,发现数据还是没有达到我想要的要求,不知道该怎么做才更好。哪位大侠知道,请给我指点一下。我的代码如下:
/**
 * Fetches the pages listed in the configuration bundle, walks each parsed DOM
 * tree and prints the whitespace-separated tokens of every text node.
 *
 * <p>All members are static; the class is used as a small command-line tool
 * via {@link #main(String[])}.
 */
public class Ahold {
    /** Configuration bundle; its "URL" entry is read as a ';'-separated list of addresses. */
    private static final ResourceBundle config = ResourceBundle.getBundle("com.finet.hkfuture.configure");

    /**
     * Characters treated as token separators when splitting text nodes: space,
     * tab, line feed, carriage return, form feed, non-breaking space (U+00A0)
     * and ideographic/full-width space (U+3000). HTML text nodes routinely
     * contain all of these, so splitting on a plain space alone leaves
     * line breaks and padding inside the tokens.
     */
    private static final String WHITESPACE_DELIMITERS = " \t\n\r\f\u00a0\u3000";

    /**
     * Parses every address configured under the "URL" key and prints the text
     * content of each resulting document.
     *
     * <p>Entries are trimmed and blank entries (for example the result of a
     * trailing ';' followed by spaces) are skipped instead of being handed to
     * the parser.
     *
     * @throws Exception if the configuration entry is missing or the parser
     *                   fails on one of the addresses
     */
    public static void parse() throws Exception {
        // NOTE(review): DOMParser is not declared in this file; presumably the
        // Xerces/NekoHTML parser - confirm against the file's imports.
        DOMParser parser = new DOMParser();
        StringTokenizer addresses = new StringTokenizer(config.getString("URL"), ";");
        while (addresses.hasMoreTokens()) {
            String url = addresses.nextToken().trim();
            if (url.length() == 0) {
                continue;
            }
            parser.parse(url);
            print(parser.getDocument(), "");
        }
    }

    /**
     * Command-line entry point; simply runs {@link #parse()}.
     *
     * @param args ignored
     * @throws Exception propagated from {@link #parse()}
     */
    public static void main(String[] args) throws Exception {
        Ahold.parse();
    }

    /**
     * Appends {@code s} followed by a CR/LF line terminator to the file
     * {@code f:/jackie.txt}.
     *
     * <p>Failures while opening or writing are reported on standard error and
     * otherwise ignored (best effort, as before); only a failure while closing
     * the writer is propagated to the caller.
     *
     * @param s the text to append
     * @throws IOException if closing the writer fails
     */
    public static void Filewrite(String s) throws IOException {
        FileWriter fw = null;
        try {
            // Second argument 'true' opens the file in append mode.
            fw = new FileWriter("f:/jackie.txt", true);
            fw.write(s);
            fw.write("\r\n");
            fw.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // The constructor may have thrown, in which case there is nothing
            // to close; closing unconditionally would raise a
            // NullPointerException that masks the original I/O error.
            if (fw != null) {
                fw.close();
            }
        }
    }

    /**
     * Splits {@code Record} into tokens using {@link StringTokenizer}
     * semantics: every character of {@code delimiter} is a separator, and
     * consecutive separators never produce empty tokens.
     *
     * @param Record    the text to split; {@code null} yields an empty array
     * @param delimiter the set of separator characters
     * @return the tokens in order of appearance, never {@code null}
     */
    public static String[] StrSplit(String Record, String delimiter) {
        if (Record == null) {
            return new String[0];
        }
        StringTokenizer st = new StringTokenizer(Record, delimiter);
        // countTokens() reports exactly how many nextToken() calls will
        // succeed, so the loop below cannot run past the end of the tokenizer.
        String[] tokens = new String[st.countTokens()];
        for (int i = 0; i < tokens.length; i++) {
            tokens[i] = st.nextToken();
        }
        return tokens;
    }

    /**
     * Recursively walks the subtree rooted at {@code node} and prints, one per
     * line, the whitespace-separated tokens of every text node.
     *
     * <p>Subtrees rooted at {@code HEAD}, {@code SCRIPT}, {@code STYLE} and
     * comment nodes are skipped entirely. Text is passed through
     * {@code EncodingConvertorUtils.big52gb} before being split.
     *
     * @param node   the root of the subtree to print
     * @param indent grows by one space per nesting level; it is only passed
     *               down the recursion and not used in the output
     */
    public static void print(Node node, String indent) {
        String name = node.getNodeName();
        if ("HEAD".equals(name) || "#comment".equals(name)
                || "SCRIPT".equals(name) || "STYLE".equals(name)) {
            return;
        }

        if ("#text".equals(name)) {
            EncodingConvertorUtils encodingUtils = EncodingConvertorUtils.getInstance();
            try {
                // NOTE(review): big52gb presumably converts Big5 (traditional)
                // text to GB (simplified) - confirm in EncodingConvertorUtils.
                String text = encodingUtils.big52gb(node.getTextContent());
                // Split on every kind of whitespace so that tabs, line breaks
                // and full-width spaces do not end up inside the tokens.
                String[] tokens = StrSplit(text, WHITESPACE_DELIMITERS);
                for (int j = 0; j < tokens.length; j++) {
                    System.out.println(tokens[j]);
                }
            } catch (Exception e) {
                // Best effort: a node that cannot be read or converted is
                // reported and skipped so the rest of the document is still
                // processed. DOMException is a RuntimeException and is
                // covered by this handler as well.
                e.printStackTrace();
            }
        }

        for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
            print(child, indent + " ");
        }
    }
}