基于python3 pdf转化为图片-python黑洞网

本站消息

站长简介/公众号

出租广告位,需要合作请联系站长

我Lovepython

746

文章

627218

访问

+关注

分类

暂无分类

日期归档

2024-11(1)

基于 Python3 将 PDF 转换为图片

发布于2020-03-12 11:13 阅读(1364) 评论(0) 点赞(9) 收藏(1)

安装：


apt-get install python-poppler
apt install poppler-utils
pip3 install pdfminer.six
pip3 install pdf2image

pdf_decompose.py


#!/usr/bin/python3
# -*- coding: utf-8 -*-
 
import io
import logging
import os
import sys
import time

from pdf2image import convert_from_path, convert_from_bytes
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams
from pdfminer.layout import LTText
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
 
class PDFDecompose(object):
    """Convert a PDF into one image per page.

    Two entry points are offered: ``decompose_from_bytes`` for an in-memory
    PDF and ``decompose_from_file`` for a PDF on disk. The latter can
    optionally inspect the leading pages first and refuse PDFs that look
    like text documents rather than scanned images.
    """

    # How many leading pages (per document) and layout objects (per page)
    # are sampled when classifying content.
    _SAMPLE_SIZE = 10
    # A sample is "mostly X" when the share of X is strictly above this ratio.
    _MAJORITY_RATIO = 0.8

    _logger = logging.getLogger(__name__)

    def __init__(self):
        pass

    def decompose_from_bytes(self, file_bytes, dpi=96):
        """Render an in-memory PDF to images.

        :param file_bytes: the PDF file content as bytes
        :param dpi: rendering resolution passed to ``convert_from_bytes``
        :return: the list returned by ``convert_from_bytes`` (one image per
            page), or None when the conversion raises
        """
        try:
            return convert_from_bytes(file_bytes, dpi=dpi)
        except Exception:
            # Best-effort API: callers get None on failure, but the cause is
            # recorded instead of being silently dropped.
            self._logger.exception('PDF Decompose from byte fail')
            return None

    def decompose_from_file(self, file_name, check_content=False, dpi=96):
        """Render a PDF file on disk to JPEG-encoded page images.

        :param file_name: path of the PDF file on disk
        :param check_content: if True, sample the document first and only
            convert it when it looks like an image (scanned) PDF
        :param dpi: rendering resolution used for the conversion
        :return: list of JPEG bytes, one element per page. With
            ``check_content=True`` None is returned when the document is not
            extractable, is not an image PDF, or any error occurs.
        """
        time_start = time.time()
        if not check_content:
            # No content check requested: convert directly; conversion errors
            # propagate to the caller on this path.
            images = self._to_images(file_name, dpi=dpi)
            self._log_helper('success, pages: {0}'.format(len(images)), time_start)
            return images

        try:
            with open(file_name, 'rb') as fp:
                document = PDFDocument(PDFParser(fp))
                if not document.is_extractable:
                    self._log_helper('fail, can not extract', time_start)
                    return None

                rsrcmgr = PDFResourceManager()
                # The aggregator collects each processed page's layout.
                device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
                interpreter = PDFPageInterpreter(rsrcmgr, device)

                if self._is_image_file(device, document, interpreter):
                    images = self._to_images(file_name, dpi=dpi)
                    self._log_helper('success, pages: {0}'.format(len(images)), time_start)
                    return images

                self._log_helper('fail, no image content', time_start)
                return None
        except OSError as e:
            # Missing or unreadable file.
            self._log_helper('fail, file io error, {0}: {1}'.format(file_name, e), time_start)
            return None
        except Exception as e:
            # Parsing or rendering failed; keep the "None on failure" contract
            # but do not mislabel the problem as an I/O error.
            self._log_helper('fail, pdf processing error, {0}: {1}'.format(file_name, e), time_start)
            return None

    def _is_image_file(self, device, document, interpreter):
        """Decide whether the PDF is an image PDF.

        Inspects at most the first ``_SAMPLE_SIZE`` pages; the document counts
        as an image PDF when more than ``_MAJORITY_RATIO`` of the sampled
        pages are not text pages. A document with no pages is treated as an
        image PDF.

        :param device: aggregator that yields the layout of a processed page
        :param document: the parsed PDF document
        :param interpreter: page interpreter bound to ``device``
        :return: True if the sampled pages are mostly image pages
        """
        page_count = 0
        image_page_count = 0
        for i, page in enumerate(PDFPage.create_pages(document)):
            if i >= self._SAMPLE_SIZE:
                break
            page_count += 1

            interpreter.process_page(page)
            # Receive the layout object for the page just processed.
            layout = device.get_result()

            if not self._is_text_page(layout):
                image_page_count += 1

        if page_count <= 0:
            return True
        return self._exceeds_majority(image_page_count, page_count)

    def _is_text_page(self, page):
        """Decide whether a page layout is a text page.

        Inspects at most the first ``_SAMPLE_SIZE`` layout objects; the page
        counts as a text page when more than ``_MAJORITY_RATIO`` of them are
        ``LTText`` instances. A page without objects is not a text page.

        :param page: page layout exposing its objects as ``_objs``
        :return: True if the sampled objects are mostly text
        """
        sample = page._objs[:self._SAMPLE_SIZE]
        if not sample:
            return False

        text_count = sum(1 for obj in sample if isinstance(obj, LTText))
        return self._exceeds_majority(text_count, len(sample))

    @classmethod
    def _exceeds_majority(cls, part, total):
        """Return True when ``part / total`` is strictly above the majority ratio.

        True division is required here: floor division only yields 0 or 1,
        which would make the threshold pass at 100% only.

        :param part: number of matching items in the sample
        :param total: sample size; a non-positive value yields False
        """
        if total <= 0:
            return False
        return part / total > cls._MAJORITY_RATIO

    def _to_images(self, file_name, dpi=96):
        """Convert every page of the PDF file to JPEG bytes.

        :param file_name: path of the PDF file on disk
        :param dpi: rendering resolution passed to ``convert_from_path``
        :return: list of JPEG-encoded bytes, one element per page
        """
        result = []
        for image in convert_from_path(file_name, dpi=dpi):
            buffer = io.BytesIO()
            image.save(buffer, format='JPEG')
            result.append(buffer.getvalue())
        return result

    def _log_helper(self, log_content, start_time_point):
        """Log ``log_content`` together with the time elapsed since the start.

        :param log_content: outcome message to record
        :param start_time_point: ``time.time()`` value taken when work began
        """
        consume = time.time() - start_time_point
        self._logger.info('PDF decompose %s, consume: %.3fs', log_content, consume)

test_pdf_to_images.py


#!/usr/bin/python3
# -*- coding: utf-8 -*-
 
import os
import sys
import unittest
 
from pdf_decompose import PDFDecompose
 
 
class PDFDecomposeTestCase(unittest.TestCase):
    """Exercise PDFDecompose against a sample PDF in the working directory."""

    def setUp(self):
        # Each test gets its own converter instance.
        self.decomposer = PDFDecompose()

    def test_pdf_decompose_image(self):
        """Convert the sample PDF, dump each page to disk, expect two pages."""
        source_pdf = './decompose.pdf'
        pages = self.decomposer.decompose_from_file(source_pdf, check_content=False)

        # Write every page's bytes next to the test as decompose_<index>.jpg.
        page_index = 0
        for page_bytes in pages:
            target = os.path.join("./", 'decompose_{0}.jpg'.format(page_index))
            with open(target, 'wb') as out:
                out.write(page_bytes)
            page_index += 1

        self.assertEqual(len(pages), 2)
 
 
# Run the unittest test runner when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()