向各位大神求助。。解析pdf文档转换成txt,为什么转换到一半就不行了,不明白什么原因
package ir2.pdfbox;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.StringWriter;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.cos.COSDocument;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.util.PDFTextStripper;
public class readfile {
public static boolean readfile(String filepath) throws FileNotFoundException {
try {
File file = new File(filepath);
File txtFile = new File("E:\\ir\\TXT");
String[] filelist = file.list();
for (int i = 0; i < filelist.length; i++) {
File readfile = new File(filepath + "\\" + filelist[i]);
System.out.println("absolutepath=" + readfile.getAbsolutePath());
//System.out.println("name=" + readfile.getName());
COSDocument cosDoc = null;
FileInputStream fis = new FileInputStream(readfile.getAbsolutePath());
PDFParser p = new PDFParser(fis);
p.parse();
cosDoc = p.getDocument();
PDFTextStripper stripper = new PDFTextStripper();
String docText = stripper.getText(new PDDocument(cosDoc));
BufferedWriter writer1 = new BufferedWriter(new FileWriter(txtFile+ "\\" +
readfile.getAbsolutePath().substring(12, readfile.getAbsolutePath().length() - 4) + ".txt"));
cosDoc.close();
writer1.write(docText);
fis.close();
}
} catch (IOException e) {
e.printStackTrace();
}
return true;
}
public static void main(String[] args) throws Exception{
String pdfFile_path = "E:\\ir\\work";
readfile pdfFile = new readfile();
pdfFile.readfile(pdfFile_path);
}
}
结果显示是:
absolutepath=E:\ir\work\_A88-1018.pdf
absolutepath=E:\ir\work\_A97-1019.pdf
absolutepath=E:\ir\work\_BongardWGW02.pdf
absolutepath=E:\ir\work\_feweco20011214153931.pdf
absolutepath=E:\ir\work\_finance2.pdf
absolutepath=E:\ir\work\_Fleten_fwdConstruction(publ).pdf
java.io.IOException: expected='<' actual='?'
at org.pdfbox.pdfparser.BaseParser.parseCOSDictionary(BaseParser.java:185)
at org.pdfbox.pdfparser.PDFParser.parseTrailer(PDFParser.java:565)
at org.pdfbox.pdfparser.PDFParser.parseXrefSection(PDFParser.java:528)
at org.pdfbox.pdfparser.PDFParser.parseObject(PDFParser.java:341)
at org.pdfbox.pdfparser.PDFParser.parse(PDFParser.java:176)
at ir2.pdfbox.readfile.readfile(readfile.java:34)
at ir2.pdfbox.readfile.main(readfile.java:56)