发布于2020-03-12 11:13 阅读(1278) 评论(0) 点赞(9) 收藏(1)
安装:
- apt-get install python-poppler
- apt install poppler-utils
- pip3 install pdfminer.six
- pip3 install pdf2image
pdf_decompose.py
- #!/usr/bin/python3
- # -*- coding: utf-8 -*-
-
- import io
- import os
- import sys
- import time
-
- from pdfminer.pdfparser import PDFParser
- from pdfminer.pdfdocument import PDFDocument
- from pdfminer.pdfpage import PDFPage
- from pdfminer.pdfinterp import PDFResourceManager
- from pdfminer.pdfinterp import PDFPageInterpreter
- from pdfminer.layout import LAParams
- from pdfminer.layout import LTText
- from pdfminer.converter import PDFPageAggregator
- from pdf2image import convert_from_path, convert_from_bytes
-
class PDFDecompose(object):
    """Convert a PDF document into one image per page.

    Two entry points are offered:

    * ``decompose_from_bytes`` returns whatever ``pdf2image.convert_from_bytes``
      produces (per the original documentation: a list of PIL images, RGB).
    * ``decompose_from_file`` returns a list of JPEG-encoded ``bytes``, one
      element per page.

    Note that the two entry points therefore return different element types.
    """

    # Upper bound on how many pages (per document) and layout objects (per
    # page) are inspected when classifying content.
    _SAMPLE_LIMIT = 10
    # A content kind "dominates" when its share is strictly above this ratio.
    _MAJORITY_RATIO = 0.8

    def __init__(self):
        pass

    def decompose_from_bytes(self, file_bytes, dpi=96):
        """Render an in-memory PDF to images.

        :param file_bytes: raw bytes of the PDF file
        :param dpi: rendering resolution passed to ``convert_from_bytes``
        :return: the list returned by ``convert_from_bytes``, or ``None`` if
            rendering raised any exception (the error is logged)
        """
        time_start = time.time()
        try:
            return convert_from_bytes(file_bytes, dpi=dpi)
        except Exception as e:
            # Deliberately best-effort: callers get None instead of an
            # exception, but the cause is logged rather than dropped.
            self._log_helper('from bytes fail, error: {0}'.format(e), time_start)
            return None

    def decompose_from_file(self, file_name, check_content=False, dpi=96):
        """Render a PDF file on disk to a list of JPEG byte strings.

        :param file_name: path of the PDF file
        :param check_content: if True, first sample the document and only
            render it when image pages dominate; text PDFs yield ``None``
        :param dpi: rendering resolution (default 96, the previous fixed value)
        :return: list of JPEG-encoded ``bytes`` (one per page). With
            ``check_content=True`` returns ``None`` when the document is not
            extractable, is not image-dominated, or any error occurs.
        :raises Exception: with ``check_content=False`` rendering errors are
            not caught and propagate to the caller (unchanged behaviour).
        """
        time_start = time.time()

        if not check_content:
            images = self._to_images(file_name, dpi=dpi)
            self._log_helper('success, pages: {0}'.format(len(images)), time_start)
            return images

        try:
            with open(file_name, 'rb') as fp:
                document = PDFDocument(PDFParser(fp))
                if not document.is_extractable:
                    self._log_helper('fail, can not extract', time_start)
                    return None

                rsrcmgr = PDFResourceManager()
                # The aggregator collects each processed page's layout so it
                # can be fetched with device.get_result().
                device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                is_image = self._is_image_file(device, document, interpreter)

            if not is_image:
                self._log_helper('fail, no image content', time_start)
                return None

            images = self._to_images(file_name, dpi=dpi)
        except OSError:
            # Missing or unreadable file.
            self._log_helper('fail, file io error, {0}'.format(file_name), time_start)
            return None
        except Exception as e:
            # Parsing or rendering failure: report the real cause instead of
            # mislabelling it as an I/O error.
            self._log_helper(
                'fail, {0}, error: {1}'.format(file_name, e), time_start)
            return None

        self._log_helper('success, pages: {0}'.format(len(images)), time_start)
        return images

    @staticmethod
    def _is_majority(count, total):
        """Return True when ``count / total`` is strictly above the threshold.

        Uses true division; an empty sample (``total <= 0``) is never a
        majority.
        """
        return total > 0 and count / total > PDFDecompose._MAJORITY_RATIO

    def _is_image_file(self, device, document, interpreter):
        """Decide whether the PDF is image-based by sampling its first pages.

        Up to ``_SAMPLE_LIMIT`` pages are processed; the document counts as an
        image PDF when more than ``_MAJORITY_RATIO`` of them are not text
        pages.

        :param device: page aggregator that receives each page's layout
        :param document: parsed ``PDFDocument``
        :param interpreter: ``PDFPageInterpreter`` bound to ``device``
        :return: True for an image PDF; also True when the document has no
            pages (behaviour kept from the original implementation)
        """
        page_count = 0
        image_page_count = 0
        for i, page in enumerate(PDFPage.create_pages(document)):
            if i >= self._SAMPLE_LIMIT:
                break
            page_count += 1

            interpreter.process_page(page)
            # Fetch the layout built for the page just processed.
            layout = device.get_result()
            if not self._is_text_page(layout):
                image_page_count += 1

        if page_count <= 0:
            return True
        return self._is_majority(image_page_count, page_count)

    def _is_text_page(self, page):
        """Decide whether a page layout is dominated by text objects.

        Only the first ``_SAMPLE_LIMIT`` layout objects are examined; the page
        is a text page when more than ``_MAJORITY_RATIO`` of them are
        ``LTText`` instances.

        :param page: layout object exposing its children as ``_objs``
        :return: True for a text page, False otherwise (including empty pages)
        """
        sample = page._objs[:self._SAMPLE_LIMIT]
        if not sample:
            return False

        text_count = sum(1 for obj in sample if isinstance(obj, LTText))
        return self._is_majority(text_count, len(sample))

    def _to_images(self, file_name, dpi=96):
        """Render every page of ``file_name`` and encode each as JPEG.

        :param file_name: path of the PDF file
        :param dpi: rendering resolution passed to ``convert_from_path``
        :return: list of JPEG-encoded ``bytes``, one per page
        """
        result = []
        for image in convert_from_path(file_name, dpi=dpi):
            buffer = io.BytesIO()
            image.save(buffer, format='JPEG')
            result.append(buffer.getvalue())
        return result

    def _log_helper(self, log_content, start_time_point):
        """Log ``log_content`` together with the time elapsed since start.

        :param log_content: message describing the outcome
        :param start_time_point: ``time.time()`` value taken at operation start
        """
        # Local import keeps this block self-contained; the file's import
        # section does not bring in ``logging``.
        import logging

        consume = time.time() - start_time_point
        logging.getLogger(__name__).info(
            'PDF decompose %s, consume: %.3fs', log_content, consume)
test_pdf_to_images.py
- #!/usr/bin/python3
- # -*- coding: utf-8 -*-
-
- import os
- import sys
- import unittest
-
- from pdf_decompose import PDFDecompose
-
-
class PDFDecomposeTestCase(unittest.TestCase):
    """Smoke test: split ``decompose.pdf`` into per-page JPEG files."""

    def setUp(self):
        # Each test method gets its own decomposer instance.
        self.decomposer = PDFDecompose()

    def test_pdf_decompose_image(self):
        """Decompose the sample PDF, dump every page to disk, expect 2 pages."""
        source_pdf = './decompose.pdf'
        pages = self.decomposer.decompose_from_file(
            source_pdf, check_content=False)

        # Output names are derived from the page index: decompose_0.jpg, ...
        targets = [
            os.path.join("./", 'decompose_{0}.jpg'.format(index))
            for index in range(len(pages))
        ]
        for target, page_bytes in zip(targets, pages):
            with open(target, 'wb') as out:
                out.write(page_bytes)

        self.assertEqual(len(pages), 2)


if __name__ == '__main__':
    unittest.main()
原文链接:https://blog.csdn.net/qq_14845119/article/details/104798552
作者:我Lovepython
链接:https://www.pythonheidong.com/blog/article/253986/9815cdd74b472c0df285/
来源:python黑洞网
任何形式的转载都请注明出处。如有侵权,一经发现,必将追究其法律责任。
昵称:
评论内容:(最多支持255个字符)
---无人问津也好,技不如人也罢,你都要试着安静下来,去做自己该做的事,而不是让内心的烦躁、焦虑,坏掉你本来就不多的热情和定力
Copyright © 2018-2021 python黑洞网 All Rights Reserved 版权所有,并保留所有权利。 京ICP备18063182号-1
投诉与举报,广告合作请联系vgs_info@163.com或QQ3083709327
免责声明:网站文章均由用户上传,仅供读者学习交流使用,禁止用做商业用途。若文章涉及色情,反动,侵权等违法信息,请向我们举报,一经核实我们会立即删除!