我如何使用pdfminer作为一个库
我正在尝试使用 pdfminer 从 PDF 中获取文本数据。我可以使用 pdfminer 的命令行工具 pdf2txt.py 成功地将这些数据提取到 .txt 文件中。我目前就是这样做的,然后再用一个 Python 脚本来清理 .txt 文件。我想把 PDF 提取过程合并到脚本中,从而省去一个步骤。
当我发现这个链接的时候,我以为自己找到了头绪,但那里的解决方案我一个也没有试成功。也许那里列出的函数需要再次更新,因为我使用的是更新版本的 pdfminer。
我也尝试了这里给出的函数,但它同样不起作用。
我尝试的另一种方法是使用 os.system 在脚本中调用该脚本,这也没有成功。
我正在使用Python版本2.7.1和pdfminer版本20110227。
这是我最终整理出的、对我有效的精简版本。下面的函数只是简单地以字符串形式返回 PDF 文件中的文本。希望这能为大家节省一些时间。
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO


def convert_pdf(path):
    """Extract the text of the PDF at *path* and return it as a string.

    This targets the legacy pdfminer API that still ships ``process_pdf``.

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        Everything the ``TextConverter`` (configured with the ``utf-8``
        codec and default ``LAParams``) wrote while processing the file.

    Raises:
        IOError: If *path* cannot be opened.
    """
    resource_manager = PDFResourceManager()
    buffer = StringIO()
    try:
        device = TextConverter(resource_manager, buffer,
                               codec='utf-8', laparams=LAParams())
        try:
            # The ``with`` block guarantees the PDF handle is released even
            # when pdfminer raises part-way through the document.
            with open(path, 'rb') as pdf_file:
                process_pdf(resource_manager, device, pdf_file)
        finally:
            device.close()
        # Read the buffer only after the device has been closed, matching
        # the order the converter expects for any final output.
        return buffer.getvalue()
    finally:
        buffer.close()
该解决方案在 2013 年 11 月 API 变更之前一直有效。
下面是一个新的解决方案,适用于最新版本:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO


def convert_pdf_to_txt(path):
    """Extract the text of the PDF at *path* and return it as a string.

    Pages are fetched with ``PDFPage.get_pages`` and fed one by one to a
    ``PDFPageInterpreter`` that renders into an in-memory ``TextConverter``.

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        The text written by the converter (``utf-8`` codec, default
        ``LAParams``) for all pages that were processed.

    Raises:
        IOError: If *path* cannot be opened.
    """
    resource_manager = PDFResourceManager()
    buffer = StringIO()
    try:
        device = TextConverter(resource_manager, buffer,
                               codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(resource_manager, device)
            # Context manager: the file is closed even if a page fails.
            with open(path, 'rb') as pdf_file:
                # Empty page set, maxpages=0 and an empty password are the
                # same "no restriction" values the original snippet passed.
                pages = PDFPage.get_pages(pdf_file, set(), maxpages=0,
                                          password="", caching=True,
                                          check_extractable=True)
                for page in pages:
                    interpreter.process_page(page)
        finally:
            device.close()
        return buffer.getvalue()
    finally:
        buffer.close()
我知道回答自己的问题不太合适,但我想我可能已经找到了答案,不希望其他人再浪费时间去寻找这个问题的解决办法。
我按照问题中所贴链接里的建议,改写了 pdfminer 自带的当前版本 pdf2txt.py 脚本。下面给出这个函数,以备对其他人有用。感谢用户 skyl 发布了那个答案,我只需做少量修改就能让它适用于当前版本的 pdfminer。
这个函数接收 PDF 文件的路径,并在同一目录下创建一个同名的 .txt 文件。
def convert_pdf(path, outtype='txt', opts=None):
    """Convert the PDF at *path* to a text file next to it.

    The output file name is *path* with its extension replaced by
    *outtype* (``report.pdf`` -> ``report.txt``) unless the ``-o`` option
    overrides it.

    Args:
        path: Filesystem path of the PDF file to read.
        outtype: Extension (without the dot) used for the derived output
            file name. The converter itself is always ``TextConverter``.
        opts: Optional pdf2txt-style options, either as an iterable of
            ``(flag, value)`` pairs or as a mapping ``{flag: value}``.
            Recognised flags: ``-d`` (debug), ``-p`` (comma separated
            1-based page numbers), ``-m`` (max pages), ``-P`` (password),
            ``-o`` (output file), ``-c`` (codec), ``-n`` (disable layout
            analysis), ``-A``, ``-V``, ``-M``, ``-L``, ``-W``, ``-F``
            (layout parameters). Any other flag is ignored.

    Returns:
        None. The text is written to the output file, or to ``sys.stdout``
        when ``-o`` is given an empty value.
    """
    import os
    import sys
    from pdfminer.pdfinterp import (PDFResourceManager, PDFPageInterpreter,
                                    process_pdf)
    from pdfminer.pdfdevice import PDFDevice
    from pdfminer.converter import TextConverter
    from pdfminer.cmapdb import CMapDB
    from pdfminer.layout import LAParams

    # Normalise the options: a mapping must be iterated as (flag, value)
    # pairs, otherwise its two-character keys would be unpacked instead.
    if opts is None:
        options = []
    elif hasattr(opts, 'items'):
        options = list(opts.items())
    else:
        options = list(opts)

    # splitext copes with extensions that are not exactly three characters.
    outfile = os.path.splitext(path)[0] + '.' + outtype

    debug = 0
    password = ''
    pagenos = set()
    maxpages = 0
    codec = 'utf-8'
    laparams = LAParams()
    float_layout_attrs = {
        '-M': 'char_margin',
        '-L': 'line_margin',
        '-W': 'word_margin',
        '-F': 'boxes_flow',
    }

    for flag, value in options:
        if flag == '-d':
            debug += 1
        elif flag == '-p':
            pagenos.update(int(x) - 1 for x in value.split(','))
        elif flag == '-m':
            maxpages = int(value)
        elif flag == '-P':
            password = value
        elif flag == '-o':
            outfile = value
        elif flag == '-c':
            codec = value
        elif flag == '-n':
            laparams = None
        elif laparams is not None:
            # Layout tweaks are meaningless (and used to crash) once
            # layout analysis has been switched off with -n.
            if flag == '-A':
                laparams.all_texts = True
            elif flag == '-V':
                laparams.detect_vertical = True
            elif flag in float_layout_attrs:
                setattr(laparams, float_layout_attrs[flag], float(value))

    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager()
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        try:
            with open(path, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            password=password, check_extractable=True)
        finally:
            device.close()
    finally:
        # Only close what this function opened; never close sys.stdout.
        if outfp is not sys.stdout:
            outfp.close()
以下代码在最新版本的 pdfminer(截至 2014 年 9 月)下对我有效:
import codecs
import sys
import unicodedata

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO


def getPDFText(pdfFilenamePath):
    """Return the text of a PDF file, or ``''`` when it cannot be extracted.

    Args:
        pdfFilenamePath: Filesystem path of the PDF file to read.

    Returns:
        The text produced by a ``TextConverter`` using the ``ascii`` codec,
        or an empty string when the document cannot be parsed or is not
        marked as extractable (a message is printed in both cases).

    Raises:
        IOError: If the file itself cannot be opened.
    """
    # PDFs are binary: open with 'rb' and let the context manager close
    # the handle on every exit path.
    with open(pdfFilenamePath, 'rb') as pdf_file:
        try:
            document = PDFDocument(PDFParser(pdf_file))
        except Exception:
            # Deliberate best-effort: any parse failure means "not a PDF".
            print('%s is not a readable pdf' % pdfFilenamePath)
            return ''

        if not document.is_extractable:
            print('%s Warning: could not extract text from pdf file.'
                  % pdfFilenamePath)
            return ''

        retstr = StringIO()
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, retstr, codec='ascii',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
        finally:
            device.close()
        return retstr.getvalue()


if __name__ == '__main__':
    # The PDF path is taken from the first command-line argument.
    words = getPDFText(sys.argv[1])
如果你处理的是通过 urllib2 抓取的数据,可以试试这个(其开发过程和说明见此处):
def pdf_to_text(scraped_pdf_data):
    """Convert raw, already-downloaded PDF data to text.

    Args:
        scraped_pdf_data: The PDF content as returned by the scraper
            (e.g. the body read from a ``urllib2`` response).

    Returns:
        The text the ``TextConverter`` produced for the document.
    """
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf
    from pdfminer.pdfdevice import PDFDevice
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    import StringIO

    # Wrap the downloaded data in a file-like object rewound to the start,
    # because process_pdf reads from a stream rather than from a string.
    pdf_stream = StringIO.StringIO()
    pdf_stream.write(scraped_pdf_data)
    pdf_stream.seek(0)

    text_stream = StringIO.StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, text_stream, laparams=LAParams())

    process_pdf(manager, converter, pdf_stream)
    converter.close()

    extracted = text_stream.getvalue()
    for stream in (text_stream, pdf_stream):
        stream.close()
    return extracted
与其他答案一样,此处的代码改编自 PDFMiner 自带的 pdf2txt 实用程序。因此,您也可以转换为 HTML 或 XML——只需把上面所有出现 TextConverter 的地方替换为 HTMLConverter 或 XMLConverter 即可。
下面这个经过修改、不使用 process_pdf 的答案可直接根据 URL 字符串获取文本,适用于 pdfminer 20140328 版本和 Python 2.7:
from contextlib import closing
from urllib2 import urlopen

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO


def convert_pdf_to_txt(url):
    """Download the PDF at *url* and return its text as a string.

    Args:
        url: Address of the PDF document; it is passed to
            ``urllib2.urlopen`` unchanged.

    Returns:
        The text written by a ``TextConverter`` (``utf-8`` codec, default
        ``LAParams``) for the pages that were processed.
    """
    # closing() releases the HTTP connection as soon as the body is read;
    # the original snippet never closed the response.
    with closing(urlopen(url)) as response:
        pdf_bytes = response.read()

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    fp = StringIO(pdf_bytes)
    try:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Empty page set, maxpages=0 and an empty password are the
            # same "no restriction" values the original snippet passed.
            pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
        finally:
            device.close()
        return retstr.getvalue()
    finally:
        fp.close()
        retstr.close()
以下代码适用于最新版本的 PDFMiner,它接收 PDF 文件路径,并将提取出的文本写入同名的 .txt 文件。
PS:这是对上述答案的修改。
import os

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO


def convert_pdf_to_txt(path, outtype='txt'):
    """Write the text of the PDF at *path* to a sibling file.

    The output file is *path* with its extension replaced by *outtype*
    (``report.pdf`` -> ``report.txt``).

    Args:
        path: Filesystem path of the PDF file to read.
        outtype: Extension (without the dot) of the output file.

    Returns:
        None. The extracted text is written to the output file.

    Raises:
        IOError: If the input cannot be opened or the output cannot be
            created.
    """
    # splitext copes with extensions that are not exactly three characters.
    outfile = os.path.splitext(path)[0] + '.' + outtype
    rsrcmgr = PDFResourceManager()

    # Open the input first so a missing PDF does not leave an empty output
    # file behind; both handles are closed on every exit path.
    with open(path, 'rb') as fp, open(outfile, 'w') as outfp:
        device = TextConverter(rsrcmgr, outfp, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
        finally:
            device.close()
这是我最终整理出的、对我有效的精简版本。下面的函数只是简单地以字符串形式返回 PDF 文件中的文本。希望这能为大家节省一些时间。
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO


def convert_pdf(path):
    """Extract the text of the PDF at *path* and return it as a string.

    This targets the legacy pdfminer API that still ships ``process_pdf``.

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        Everything the ``TextConverter`` (configured with the ``utf-8``
        codec and default ``LAParams``) wrote while processing the file.

    Raises:
        IOError: If *path* cannot be opened.
    """
    resource_manager = PDFResourceManager()
    buffer = StringIO()
    try:
        device = TextConverter(resource_manager, buffer,
                               codec='utf-8', laparams=LAParams())
        try:
            # The ``with`` block guarantees the PDF handle is released even
            # when pdfminer raises part-way through the document.
            with open(path, 'rb') as pdf_file:
                process_pdf(resource_manager, device, pdf_file)
        finally:
            device.close()
        # Read the buffer only after the device has been closed, matching
        # the order the converter expects for any final output.
        return buffer.getvalue()
    finally:
        buffer.close()
有人能告诉我:PDF 文件是否需要放在某个特定的位置?
以防还有人需要:我已经让它在 requests 库和 Python 3.4 下正常工作。感谢 @bahmait 的回答 :)
import requests
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams


def pdf_to_text(url=None):
    """Fetch a PDF over HTTP and return its text.

    Args:
        url: Address of the PDF; handed to ``requests.get`` as-is.

    Returns:
        The extracted text, or ``None`` when the response status is not OK.
    """
    response = requests.get(url)
    if not response.ok:
        return None

    # NOTE(review): the body is decoded as UTF-8 before parsing; this
    # presumably fails for PDFs with non-UTF-8 binary streams -- confirm
    # whether the parser accepts a bytes stream instead.
    source = StringIO(str(response.content, 'utf-8'))
    sink = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, sink, laparams=LAParams())

    process_pdf(manager, converter, source)
    converter.close()

    extracted = sink.getvalue()
    sink.close()
    source.close()
    return extracted


if __name__ == "__main__":
    hello_world_text = pdf_to_text("https://bytebucket.org/hsoft/pdfminer3k/raw/28edfc91caed830674ca0b928f42571f7dee6091/samples/simple1.pdf")
    no_pdf = pdf_to_text('http://www.google.com/404')
    print(hello_world_text)
    print(no_pdf)
感谢user3577380
我使用的是 pdfminer.six:https://github.com/goulu/pdfminer
我为 Python 3.5 做了一点改动,成功解析了本地的中文 PDF,但是我该怎样解析通过网络请求获取的在线 PDF 呢?例如:http://pythonscraping.com/pages/warandpeace/chapter1.pdf
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from io import StringIO
import io
import sys

# Re-wrap stdout so printed text is encoded as GB18030 (the original
# snippet does this to print Chinese text to the console).
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')


def convert_pdf_to_txt(path):
    """Extract, print and return the text of the PDF at *path*.

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        The text written by a ``TextConverter`` (``utf-8`` codec, default
        ``LAParams``). The same text is also printed after dropping any
        characters that GB18030 cannot represent.

    Raises:
        OSError: If *path* cannot be opened.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Context manager: the file is closed even if a page fails.
            with open(path, 'rb') as fp:
                pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True,
                                          check_extractable=True)
                for page in pages:
                    interpreter.process_page(page)
            text = retstr.getvalue()
        finally:
            device.close()
    finally:
        retstr.close()

    # Round-trip through GB18030 with 'ignore' so unencodable characters
    # cannot make print() fail on the re-wrapped stdout.
    print(text.encode('gb18030', 'ignore').decode('gb18030', 'ignore'))
    return text


path_of_the_pdf_file = r'E:\迅雷下载\过expressionSmad7基因对瘢痕疙瘩成纤维细胞的影响.pdf'
convert_pdf_to_txt(path_of_the_pdf_file)
这是我的解决方案
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import os


def convert_pdf_to_txt(path, pages=None):
    """Return the text of the PDF at *path*, optionally for selected pages.

    Args:
        path: Filesystem path of the PDF file to read.
        pages: Optional iterable of page numbers to extract, as accepted
            by ``PDFPage.get_pages`` (the accompanying example passes
            ``[0, 1, 2]`` for the first three pages). ``None`` or an empty
            iterable results in an empty page set being passed.

    Returns:
        The text written by a ``TextConverter`` with default ``LAParams``.

    Raises:
        OSError: If *path* cannot be opened.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    try:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # Context manager: the file is closed even if a page fails.
            with open(path, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        return output.getvalue()
    finally:
        output.close()
例如,您只想阅读pdf文件的前三页:
# Page numbers are zero-based here: 0, 1, 2 selects the first three pages.
text = convert_pdf_to_txt('../Data/EN-FINAL Table 9.pdf', pages=[0, 1, 2])
pdfminer.six == 20160614
python:3.x
以防还有人需要:下面演示如何使用 PDFMiner 把 PDF 输出为 HTML:
import sys
import getopt
from Core.Interfaces.IReader import IReader
from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
from cStringIO import StringIO


class PdfReader(object):
    """Render a PDF file as HTML with pdfminer and print the result."""

    def __init__(self):
        pass

    def readText(self, path, outtype='text', opts=None):
        """Convert the PDF at *path* to HTML and print it.

        Args:
            path: Filesystem path of the PDF file to read.
            outtype: Accepted for compatibility with existing callers; the
                output is always produced by ``HTMLConverter``.
            opts: Optional pdf2txt-style options, either as an iterable of
                ``(flag, value)`` pairs or as a mapping ``{flag: value}``.
                Flags that influence the result: ``-d``, ``-p``, ``-m``,
                ``-P``, ``-c``, ``-n``, ``-A``, ``-V``, ``-M``, ``-L``,
                ``-W``, ``-F``. Any other flag is ignored.

        Returns:
            None. The layout parameters and the generated HTML are printed.
        """
        # Normalise the options: a mapping must be iterated as
        # (flag, value) pairs, and an iterator must be materialised once.
        if opts is None:
            options = []
        elif hasattr(opts, 'items'):
            options = list(opts.items())
        else:
            options = list(opts)

        debug = 0
        password = ''
        pagenos = set()
        maxpages = 0
        codec = 'utf-8'
        laparams = LAParams()
        float_layout_attrs = {
            '-M': 'char_margin',
            '-L': 'line_margin',
            '-W': 'word_margin',
            '-F': 'boxes_flow',
        }

        for flag, value in options:
            if flag == '-d':
                debug += 1
            elif flag == '-p':
                pagenos.update(int(x) - 1 for x in value.split(','))
            elif flag == '-m':
                maxpages = int(value)
            elif flag == '-P':
                password = value
            elif flag == '-c':
                codec = value
            elif flag == '-n':
                laparams = None
            elif laparams is not None:
                # Layout tweaks are meaningless (and used to crash) once
                # layout analysis has been switched off with -n.
                if flag == '-A':
                    laparams.all_texts = True
                elif flag == '-V':
                    laparams.detect_vertical = True
                elif flag in float_layout_attrs:
                    setattr(laparams, float_layout_attrs[flag], float(value))

        print(laparams)

        CMapDB.debug = debug
        PDFResourceManager.debug = debug
        PDFPageInterpreter.debug = debug
        PDFDevice.debug = debug

        rsrcmgr = PDFResourceManager()
        outfp = StringIO()
        try:
            device = HTMLConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams)
            try:
                with open(path, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos,
                                maxpages=maxpages, password=password,
                                check_extractable=True)
            finally:
                # Close the device before reading the buffer, as the
                # original did, so any closing output it emits is included.
                device.close()
            print(outfp.getvalue())
        finally:
            outfp.close()


reader = PdfReader()
opt = list(zip(['-W', '-L', '-t'], [0.5, 0.4, 'html']))
reader.readText("/test_data/test.pdf", "html", opt)
以下代码片段能够使用最新版本的 pdfminer(截至 2016 年 3 月 23 日)从 PDF 文档中提取纯文本。希望这能有所帮助。
import sys

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO


def convert_pdf_to_txt(path):
    """Extract, print and return the plain text of the PDF at *path*.

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        The text written by a ``TextConverter`` (``utf-8`` codec, default
        ``LAParams``); the same text is also printed.

    Raises:
        IOError: If *path* cannot be opened.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as fp:
                # The original also built a PDFParser/PDFDocument here
                # without importing them (NameError) and never used the
                # result; get_pages is given the file object directly, so
                # that dead code is dropped.
                pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True,
                                          check_extractable=True)
                for page in pages:
                    interpreter.process_page(page)
            text = retstr.getvalue()
        finally:
            device.close()
    finally:
        retstr.close()

    print(text)
    return text


if __name__ == '__main__':
    # Usage: python script.py <path_of_the_pdf_file>
    convert_pdf_to_txt(sys.argv[1])
这是一个适用于 Python 3.6 下 pdfminer.six 的答案。它使用 pdfminer.high_level 模块;如果您只是想从简单的 PDF 文件中获取原始文本,该模块会帮您屏蔽掉很多底层细节。
import io

import pdfminer
# A bare ``import pdfminer`` does not load these submodules, so they are
# imported explicitly before being accessed as attributes below.
import pdfminer.high_level
import pdfminer.layout


def extract_raw_text(pdf_filename):
    """Return the raw text of a PDF using pdfminer's high-level API.

    Args:
        pdf_filename: Filesystem path of the PDF file to read.

    Returns:
        The text that ``extract_text_to_fp`` wrote into an in-memory
        buffer, using default ``LAParams``.

    Raises:
        OSError: If *pdf_filename* cannot be opened.
    """
    output = io.StringIO()
    laparams = pdfminer.layout.LAParams()  # the defaults seem to work fine
    with open(pdf_filename, "rb") as pdffile:
        pdfminer.high_level.extract_text_to_fp(pdffile, output,
                                               laparams=laparams)
    return output.getvalue()