pdfMiner3k
寒潭烟光 2018-03-28 10:17:27
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBoxHorizontal
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, PDFTextExtractionNotAllowed
from pdfminer.pdfdevice import PDFDevice
def read_pdf(pdf_filename, txt_filename):
    """Extract the horizontal text boxes of a PDF and append them to a text file.

    Every ``LTTextBoxHorizontal`` found on every page is printed to stdout
    and written to ``txt_filename`` followed by a newline.

    Args:
        pdf_filename: Path of the PDF file to read.
        txt_filename: Path of the text file to append to. It is opened in
            append mode, so existing content is kept.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    """
    # The parser reads from the file object on demand, so all processing has
    # to happen while the PDF is still open; ``with`` guarantees it is closed
    # afterwards, even if an error is raised.
    with open(pdf_filename, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        # Parser and document must be linked in both directions before the
        # document can be initialized.
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')  # empty string: no password supplied
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Open the output once instead of once per text box. The explicit
        # UTF-8 encoding keeps non-ASCII text from failing on platforms whose
        # default codec cannot represent it.
        with open(txt_filename, 'a', encoding='utf-8') as f:
            for page in doc.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()
                for x in layout:
                    if not isinstance(x, LTTextBoxHorizontal):
                        continue
                    results = x.get_text()
                    print(results)
                    f.write(results + '\n')
if __name__ == '__main__':
    # Script entry point: convert the hard-coded PDF into a text file with
    # the same base name in the same directory.
    read_pdf(
        pdf_filename='E:\\知识图谱项目\\《中国药典》2015年版 第一部.pdf',
        txt_filename='E:\\知识图谱项目\\《中国药典》2015年版 第一部.txt',
    )
D:\python3.6.2\python.exe E:/知识图谱项目源码/extractEntity/extract_entity_codex.py
WARNING:root:Cannot locate objid=21077
WARNING:root:Cannot locate objid=21077
WARNING:root:Wrong type: None required: <class 'dict'>
WARNING:root:Cannot locate objid=21074
Process finished with exit code 0
为什么会出现这些警告，而且没有读取到PDF文件的任何内容？