Python 的 PDFMiner:将 PDF 转换为 CSV
问题描述:
我希望能够将 PDF 文件转换成 CSV 文件,并找到了一些有用的脚本。但是,作为 Python 新手,我有一个问题(关于“Python 的 PDFMiner:将 PDF 转换为 CSV”):
应该在哪里指定 PDF 文件的路径,以及要输出的 CSV 文件的路径?
我使用Python 2.7.11和PDFMiner 20140328.
import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO
def pdfparser(data):
    """Extract the text of a PDF file and print it to standard output.

    Every page of the file is run through a ``TextConverter`` that writes
    UTF-8 text into an in-memory buffer; the buffer's content is printed once
    all pages have been processed.

    Args:
        data: Path of the PDF file to read (opened in binary mode).
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # ``with open(...)`` replaces the bare ``file(...)`` call so the handle is
    # closed even when a page fails to parse.
    with open(data, 'rb') as fp:
        try:
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
        finally:
            device.close()
    # The extracted text is kept separate from the ``data`` parameter, so a
    # document that yields no pages prints an empty buffer, not the path.
    print(retstr.getvalue())
if __name__ == '__main__':
    # The input PDF path is the first command-line argument. The extracted
    # text goes to standard output, so redirect it to keep it in a file, e.g.
    #     python yourScriptName.py input.pdf > output.txt
    if len(sys.argv) < 2:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit('usage: %s input.pdf' % sys.argv[0])
    pdfparser(sys.argv[1])
答
def pdf_to_csv(filename, separator, threshold):
    """Convert the text of a PDF into separator-delimited lines.

    Characters are grouped into rows by their vertical position, and
    ``separator`` is inserted wherever the horizontal gap between two
    neighbouring characters exceeds the row's average gap times ``threshold``.

    Args:
        filename: Path of the PDF file to read (opened in binary mode).
        separator: String inserted between detected columns.
        threshold: Multiplier applied to a row's average character gap; a gap
            larger than ``average * threshold`` starts a new column.

    Returns:
        The converted document as one string. Each page is wrapped in
        ``START PAGE n`` / ``END PAGE n`` marker lines (``n`` counts from 0),
        with one output line per detected row in between.
    """
    from collections import defaultdict
    from cStringIO import StringIO
    from pdfminer.converter import TextConverter
    # LTChar is taken from pdfminer.layout rather than relying on
    # pdfminer.converter re-exporting it.
    from pdfminer.layout import LAParams, LTChar
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage

    class CsvConverter(TextConverter):
        """TextConverter that writes each page as separator-delimited rows."""

        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)
            # Both values come from the enclosing pdf_to_csv() call.
            self.separator = separator
            self.threshold = threshold

        def end_page(self, i):
            """Write the characters collected for the current page, row by row.

            The argument is accepted only to match the overridden hook's
            signature; it is not used.
            """
            # Row key -> {x position -> encoded character}.
            rows = defaultdict(dict)
            for child in self.cur_item._objs:
                if isinstance(child, LTChar):
                    # Only the third and fourth bbox values are used.
                    (_, _, x, y) = child.bbox
                    # y is negated and truncated to an int so characters at
                    # nearly the same height share a row and an ascending
                    # sort emits rows with larger y first. Two characters
                    # with the same x in one row overwrite each other.
                    rows[int(-y)][x] = child._text.encode(self.codec)
            for y in sorted(rows):
                # Build each row exactly once (the row used to be built
                # twice, with the first result thrown away).
                self.outfp.write(self.line_creator(rows[y]))
                self.outfp.write("\n")

        def line_creator(self, line):
            """Join one row's characters, inserting separators at wide gaps.

            Args:
                line: Mapping of x position to encoded character.

            Returns:
                The row as a single string; empty when ``line`` is empty.
            """
            keys = sorted(line)
            if not keys:
                return ''
            # Horizontal distance between each pair of neighbouring characters.
            gaps = [right - left for left, right in zip(keys, keys[1:])]
            # NOTE: the gap total is divided by the number of characters, not
            # by the number of gaps (one fewer). Left unchanged so existing
            # threshold values keep producing the same output.
            average_distance = sum(gaps) / len(keys)
            result = [line[keys[0]]]
            for gap, key in zip(gaps, keys[1:]):
                # A gap wider than average * threshold marks a new column.
                if gap > average_distance * self.threshold:
                    result.append(self.separator)
                result.append(line[key])
            return ''.join(result)

    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrc, device)
    try:
        # The with-block closes the input file even if a page fails to parse.
        with open(filename, 'rb') as fp:
            for i, page in enumerate(PDFPage.get_pages(fp)):
                outfp.write("START PAGE %d\n" % i)
                # The stray debug print that used to sit here wrote 'none' to
                # stdout for every real page; it has been removed.
                if page is not None:
                    interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
    finally:
        device.close()
    return outfp.getvalue()
if __name__ == '__main__':
    # String placed between detected columns in the output.
    separator = ';'
    # A gap wider than (average character gap * threshold) starts a new
    # word/column/block. Usually 1.5 works quite well.
    threshold = 1.5
    # The result goes to standard output; redirect it to save it to a file.
    print(pdf_to_csv('myLovelyFile.pdf', separator, threshold))
所链接的答案与本答案之间的主要区别在于 line_creator 方法,它尝试从 PDF 中提取一些结构。
应该可以与 PDFMiner 20140328 配合使用。
1. 输入是在命令行上指定的:`pdfparser(sys.argv[1])`。 2. 此代码不会写入文件,它只是*打印*:`print data`。我想你需要的是类似 `python yourScriptName.py input.pdf > output.csv` 这样的命令。但*首先*你需要修正一些缩进错误,或者确保你正确地复制了源代码。 – usr2564301