我现在正在做全文检索，目前对TXT、HTML、WORD都能生成索引。对PDF的处理我采用了PDFBox 0.7.3.1来做，但它对中文不支持，现在我只能对完全是英文的PDF进行处理并生成索引。不知道谁能说说怎么对中文的PDF文档进行处理，获取到它的中文内容。-CSDN社区

Sou2012 2009-01-20

打赏
举报

我也正打算研究这方面

dsn21 2009-01-20

打赏
举报

你可以参考下面的代码，这是我以前用的，贡献出来，呵呵呵
/**
*
*/

import com.bit.dlde.resourcetrans.text.extraction.Extractor;
import com.bit.dlde.resourcetrans.text.extraction.exception.FileNameException;
import com.bit.dlde.resourcetrans.text.extraction.word.WordExtractor;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import javax.swing.JOptionPane;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.encryption.AccessPermission;
import org.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.pdfbox.util.PDFText2HTML;
import org.pdfbox.util.PDFTextStripper;

/**
 * Extracts plain text from a PDF document using PDFBox's {@link PDFTextStripper}.
 *
 * <p>The constructor runs one extraction and stores the resulting text and title on the
 * instance so that {@link #getTotaltext()} can return them together. Encrypted documents
 * are not decrypted; extraction yields {@code null} for them.
 */
public class PdfExtractor extends Extractor {

    public static final String DEFAULT_ENCODING = "gbk";

    public static final String DEFAULT_PDFFILE = "F:\\Archives\\Java Tools\\pdf\\PDFBox-0.7.3\\PDFBox-0.7.3\\bin\\基于OLAP的网络广告投放分析系统的设计与实现.pdf";

    // Command-line style option names. They are not referenced anywhere in this class.
    private static final String PASSWORD = "-password";

    private static final String ENCODING = "-encoding";

    private static final String CONSOLE = "-console";

    private static final String START_PAGE = "-startPage";

    private static final String END_PAGE = "-endPage";

    private static final String SORT = "-sort";

    private static final String HTML = "-html"; // jjb - added simple HTML

    // Not used by any method of this class.
    private List pdfFlieList = new ArrayList();

    private List txtFileList = new ArrayList();

    /**
     * Location of the PDF to extract. Deliberately has no initializer so that a value
     * assigned through {@link #setFilename(String)} during superclass construction
     * (if the superclass does that) is not reset afterwards.
     */
    private String filename;

    /** Title obtained in the constructor; per instance so extractors do not share results. */
    private String title;

    /** Text obtained in the constructor; per instance so extractors do not share results. */
    private String text;

    // Not read or written by any method of this class.
    private static String author = null;

    private static String subject = null;

    private static int pageCount = 0;

    /**
     * Creates an extractor for the given PDF and immediately extracts its text and title.
     *
     * <p>A failed extraction does not propagate: the failure is reported on standard error
     * and the stored text/title keep whatever value they had (normally {@code null}).
     *
     * @param filename URL or file-system path of the PDF
     */
    public PdfExtractor(String filename) {
        super(filename);
        this.filename = filename;
        try {
            this.text = this.getText();
            this.title = this.getTitle();
        } catch (Exception e) {
            System.err.println("PDF文件转换失败！");
            // Report the cause as well, so the failure can actually be diagnosed.
            System.err.println(e);
        }
    }

    /**
     * @return the author, as reported by the superclass
     * @uml.property name="author"
     */
    @Override
    public String getAuthor() throws Exception {
        return super.getAuthor();
    }

    /**
     * @return the page count, as reported by the superclass
     * @uml.property name="pageCount"
     */
    @Override
    public int getPageCount() throws Exception {
        return super.getPageCount();
    }

    /**
     * Extracts the text of the current file. Every call re-reads the document.
     *
     * @return the extracted text, or {@code null} if the document is encrypted
     * @uml.property name="text"
     */
    @Override
    public String getText() throws Exception {
        return this.textExtract("", DEFAULT_ENCODING, filename);
    }

    /**
     * @return the title, as reported by the superclass
     * @uml.property name="title"
     */
    @Override
    public String getTitle() throws Exception {
        return super.getTitle();
    }

    /**
     * Returns everything gathered by the constructor in a single map.
     *
     * <p>Key 1 holds the title, key 2 the extracted text, and key 3 the literal string
     * {@code "null"}. The raw return type is kept so existing callers continue to compile.
     *
     * @author Zhang Kun
     * @return a new map with the entries described above
     */
    public HashMap getTotaltext() {
        HashMap<Integer, String> hm = new HashMap<Integer, String>();
        hm.put(1, this.title);
        hm.put(2, this.text);
        hm.put(3, "null");
        return hm;
    }

    /**
     * Sets the file to extract from.
     *
     * <p>Also updates this class's own copy of the name; otherwise {@link #getText()}
     * would keep reading the file that was passed to the constructor.
     *
     * @param filename the filename to set
     * @uml.property name="filename"
     */
    @Override
    public void setFilename(String filename) {
        super.setFilename(filename);
        this.filename = filename;
    }

    /**
     * Loads the document and strips the text of all pages, in stream order.
     *
     * @param password currently unused; decryption is not implemented
     * @param encoding currently unused
     * @param pdfFile URL or file-system path of the PDF
     * @return the extracted text, or {@code null} if the document is encrypted
     * @throws Exception if the document cannot be loaded or parsed
     */
    private String textExtract(String password, String encoding, String pdfFile)
            throws Exception {
        final boolean sort = false;
        final int startPage = 1;
        final int endPage = Integer.MAX_VALUE;

        PDDocument document = loadDocument(pdfFile);
        try {
            if (document.isEncrypted()) {
                // Decryption is not implemented: report and give up.
                System.err.println("PDF文件加密，请输入密码");
                return null;
            }
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setSortByPosition(sort);
            stripper.setStartPage(startPage);
            stripper.setEndPage(endPage);
            return stripper.getText(document);
        } finally {
            // Close on every path, including when text stripping throws.
            document.close();
        }
    }

    /**
     * Loads the document as a URL first; if the string is not a well-formed URL,
     * loads it from the file system instead.
     */
    private static PDDocument loadDocument(String pdfFile) throws IOException {
        try {
            return PDDocument.load(new URL(pdfFile));
        } catch (MalformedURLException e) {
            return PDDocument.load(pdfFile);
        }
    }

}

dsn21 2009-01-20

打赏
举报

你可以参考下面的代码，这是我以前用的，贡献出来，呵呵呵
/**
*
*/

import com.bit.dlde.resourcetrans.text.extraction.Extractor;
import com.bit.dlde.resourcetrans.text.extraction.exception.FileNameException;
import com.bit.dlde.resourcetrans.text.extraction.word.WordExtractor;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import javax.swing.JOptionPane;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.encryption.AccessPermission;
import org.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.pdfbox.util.PDFText2HTML;
import org.pdfbox.util.PDFTextStripper;

/**
 * Extracts plain text from a PDF document using PDFBox's {@link PDFTextStripper}.
 *
 * <p>The constructor runs one extraction and stores the resulting text and title on the
 * instance so that {@link #getTotaltext()} can return them together. Encrypted documents
 * are not decrypted; extraction yields {@code null} for them.
 */
public class PdfExtractor extends Extractor {

    public static final String DEFAULT_ENCODING = "gbk";

    public static final String DEFAULT_PDFFILE = "F:\\Archives\\Java Tools\\pdf\\PDFBox-0.7.3\\PDFBox-0.7.3\\bin\\基于OLAP的网络广告投放分析系统的设计与实现.pdf";

    // Command-line style option names. They are not referenced anywhere in this class.
    private static final String PASSWORD = "-password";

    private static final String ENCODING = "-encoding";

    private static final String CONSOLE = "-console";

    private static final String START_PAGE = "-startPage";

    private static final String END_PAGE = "-endPage";

    private static final String SORT = "-sort";

    private static final String HTML = "-html"; // jjb - added simple HTML

    // Not used by any method of this class.
    private List pdfFlieList = new ArrayList();

    private List txtFileList = new ArrayList();

    /**
     * Location of the PDF to extract. Deliberately has no initializer so that a value
     * assigned through {@link #setFilename(String)} during superclass construction
     * (if the superclass does that) is not reset afterwards.
     */
    private String filename;

    /** Title obtained in the constructor; per instance so extractors do not share results. */
    private String title;

    /** Text obtained in the constructor; per instance so extractors do not share results. */
    private String text;

    // Not read or written by any method of this class.
    private static String author = null;

    private static String subject = null;

    private static int pageCount = 0;

    /**
     * Creates an extractor for the given PDF and immediately extracts its text and title.
     *
     * <p>A failed extraction does not propagate: the failure is reported on standard error
     * and the stored text/title keep whatever value they had (normally {@code null}).
     *
     * @param filename URL or file-system path of the PDF
     */
    public PdfExtractor(String filename) {
        super(filename);
        this.filename = filename;
        try {
            this.text = this.getText();
            this.title = this.getTitle();
        } catch (Exception e) {
            System.err.println("PDF文件转换失败！");
            // Report the cause as well, so the failure can actually be diagnosed.
            System.err.println(e);
        }
    }

    /**
     * @return the author, as reported by the superclass
     * @uml.property name="author"
     */
    @Override
    public String getAuthor() throws Exception {
        return super.getAuthor();
    }

    /**
     * @return the page count, as reported by the superclass
     * @uml.property name="pageCount"
     */
    @Override
    public int getPageCount() throws Exception {
        return super.getPageCount();
    }

    /**
     * Extracts the text of the current file. Every call re-reads the document.
     *
     * @return the extracted text, or {@code null} if the document is encrypted
     * @uml.property name="text"
     */
    @Override
    public String getText() throws Exception {
        return this.textExtract("", DEFAULT_ENCODING, filename);
    }

    /**
     * @return the title, as reported by the superclass
     * @uml.property name="title"
     */
    @Override
    public String getTitle() throws Exception {
        return super.getTitle();
    }

    /**
     * Returns everything gathered by the constructor in a single map.
     *
     * <p>Key 1 holds the title, key 2 the extracted text, and key 3 the literal string
     * {@code "null"}. The raw return type is kept so existing callers continue to compile.
     *
     * @author Zhang Kun
     * @return a new map with the entries described above
     */
    public HashMap getTotaltext() {
        HashMap<Integer, String> hm = new HashMap<Integer, String>();
        hm.put(1, this.title);
        hm.put(2, this.text);
        hm.put(3, "null");
        return hm;
    }

    /**
     * Sets the file to extract from.
     *
     * <p>Also updates this class's own copy of the name; otherwise {@link #getText()}
     * would keep reading the file that was passed to the constructor.
     *
     * @param filename the filename to set
     * @uml.property name="filename"
     */
    @Override
    public void setFilename(String filename) {
        super.setFilename(filename);
        this.filename = filename;
    }

    /**
     * Loads the document and strips the text of all pages, in stream order.
     *
     * @param password currently unused; decryption is not implemented
     * @param encoding currently unused
     * @param pdfFile URL or file-system path of the PDF
     * @return the extracted text, or {@code null} if the document is encrypted
     * @throws Exception if the document cannot be loaded or parsed
     */
    private String textExtract(String password, String encoding, String pdfFile)
            throws Exception {
        final boolean sort = false;
        final int startPage = 1;
        final int endPage = Integer.MAX_VALUE;

        PDDocument document = loadDocument(pdfFile);
        try {
            if (document.isEncrypted()) {
                // Decryption is not implemented: report and give up.
                System.err.println("PDF文件加密，请输入密码");
                return null;
            }
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setSortByPosition(sort);
            stripper.setStartPage(startPage);
            stripper.setEndPage(endPage);
            return stripper.getText(document);
        } finally {
            // Close on every path, including when text stripping throws.
            document.close();
        }
    }

    /**
     * Loads the document as a URL first; if the string is not a well-formed URL,
     * loads it from the file system instead.
     */
    private static PDDocument loadDocument(String pdfFile) throws IOException {
        try {
            return PDDocument.load(new URL(pdfFile));
        } catch (MalformedURLException e) {
            return PDDocument.load(pdfFile);
        }
    }

}