Python 的 PDFMiner - 将 PDF 转换为 CSV

问题描述:

我希望能够把PDF文件转换成CSV文件,并找到了一些有用的脚本,但是,作为Python新手,我有一个问题(参见:Python 的 PDFMiner - 将 PDF 转换为 CSV):

应该在哪里指定PDF文件的路径,以及要输出的CSV文件的路径?

我使用Python 2.7.11和PDFMiner 20140328.

import sys 
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter 
from pdfminer.pdfpage import PDFPage 
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter 
from pdfminer.layout import LAParams 
from cStringIO import StringIO 

def pdfparser(data):
    """Extract the text of a PDF file and print it to standard output.

    Args:
        data: Path of the PDF file to read.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The context manager closes the PDF even if a page fails to parse.
    with open(data, 'rb') as fp:
        try:
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
        finally:
            device.close()

    # Keep the extracted text in its own name instead of overwriting the
    # ``data`` parameter (which holds the input path).
    text = retstr.getvalue()

    # A parenthesised single-argument print behaves the same on Python 2 and 3.
    print(text)

if __name__ == '__main__':
    # The PDF path is the first command-line argument, e.g.
    #   python yourScriptName.py input.pdf > output.csv
    if len(sys.argv) < 2:
        # sys.exit with a string writes it to stderr and exits with status 1.
        sys.exit('usage: %s input.pdf' % sys.argv[0])
    pdfparser(sys.argv[1])
+0

1.输入是在命令行上指定的:'pdfparser(sys.argv[1])'。 2.此代码不会输出到文件,它只是*打印*:'print data'。我想你需要的是类似 'python yourScriptName.py input.pdf > output.csv' 这样的命令。但*首先*你需要更正一些缩进错误,或者确保你正确地复制了源代码。 – usr2564301

下面是根据 tgray 写的这个 SO 回答修改后的代码:

def pdf_to_csv(filename, separator, threshold):
    """Convert the text of a PDF into separator-delimited lines.

    Characters are grouped into rows by the integer-truncated negated fourth
    component of their bounding box and ordered within a row by the third
    component. A separator is inserted wherever the gap between two
    neighbouring characters exceeds ``threshold`` times the row's average gap.

    Args:
        filename: Path of the PDF file to read.
        separator: String inserted between detected columns.
        threshold: Multiplier applied to a row's average character distance;
            a larger gap starts a new column.

    Returns:
        The converted text. Each page is wrapped in ``START PAGE n`` and
        ``END PAGE n`` marker lines, where ``n`` counts from 0.
    """
    from collections import defaultdict
    from cStringIO import StringIO
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams, LTChar
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage

    class CsvConverter(TextConverter):
        """TextConverter that writes each page as separator-delimited rows."""

        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)
            self.separator = separator
            self.threshold = threshold

        def end_page(self, i):
            """Write the characters collected for the current page row by row."""
            # Maps a row key (int(-y)) to {x: encoded character}.
            lines = defaultdict(dict)
            for child in self.cur_item._objs:
                if isinstance(child, LTChar):
                    # NOTE(review): bbox is presumably (x0, y0, x1, y1), so x
                    # and y are the right and top edges - confirm against
                    # the pdfminer layout documentation.
                    (_, _, x, y) = child.bbox
                    lines[int(-y)][x] = child._text.encode(self.codec)
            # Rows are emitted in ascending order of int(-y).
            for y in sorted(lines):
                self.outfp.write(self.line_creator(lines[y]))
                self.outfp.write("\n")

        def line_creator(self, line):
            """Join the characters of one row, inserting column separators.

            Args:
                line: Mapping of x position to the encoded character there.

            Returns:
                The row as a single string; empty if the row has no characters.
            """
            keys = sorted(line)
            if not keys:
                return ''
            # Sum of the gaps between neighbours divided by the number of
            # characters (not the number of gaps), as in the original
            # heuristic that the threshold was tuned against.
            average_distance = sum(
                keys[i] - keys[i - 1] for i in range(1, len(keys))
            ) / float(len(keys))
            # Start with the first character of the row.
            result = [line[keys[0]]]
            for previous, current in zip(keys, keys[1:]):
                # A gap wider than average * threshold marks a new column.
                if (current - previous) > average_distance * self.threshold:
                    result.append(self.separator)
                result.append(line[current])
            return ''.join(result)

    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    # utf-8 is passed explicitly as the output codec
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrc, device)

    # The context manager closes the PDF even if a page fails to parse.
    with open(filename, 'rb') as fp:
        try:
            for i, page in enumerate(PDFPage.get_pages(fp)):
                outfp.write("START PAGE %d\n" % i)
                if page is not None:
                    interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
        finally:
            device.close()

    return outfp.getvalue()


if __name__ == '__main__':
    import sys

    # the separator to use with the CSV
    separator = ';'
    # the distance multiplier after which a character is considered part of a
    # new word/column/block. Usually 1.5 works quite well
    threshold = 1.5
    # The PDF path is the first command-line argument; without one the sample
    # file name is used. Redirect stdout to save the result, e.g.
    #   python yourScriptName.py input.pdf > output.csv
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else 'myLovelyFile.pdf'
    # A parenthesised single-argument print behaves the same on Python 2 and 3.
    print(pdf_to_csv(pdf_path, separator, threshold))

链接中的答案与这个答案之间的主要区别是 line_creator 方法,它试图从PDF中提取一些结构。

应该可以与 PDFMiner 20140328 配合使用。