sbt-idp/test_pdf_reader.py
2023-12-13 16:01:31 +07:00

35 lines
1.2 KiB
Python

import io

from pdf2image import convert_from_bytes
from PIL import Image
from PyPDF2 import PdfReader, PdfWriter
def get_first_page_pdf(filename, max_size=2048):
    """Render the first page of a PDF as an image.

    The first page is rescaled so that its longest side measures ``max_size``
    millimetres, written to an in-memory single-page PDF, and rasterised with
    ``pdf2image.convert_from_bytes``.

    Args:
        filename: Passed straight to ``PdfReader``.
        max_size: Target length of the page's longest side, in millimetres
            (not pixels). The pixel dimensions of the result also depend on
            the DPI used by ``convert_from_bytes``, which is left at its
            default here.

    Returns:
        The first element of the list returned by ``convert_from_bytes``, or
        the returned object itself if it is not a list.
    """
    # PDF user-space units default to 1/72 inch, and 1 inch is 25.4 mm.
    # NOTE(review): a page-level /UserUnit override is not taken into account.
    mm_per_unit = 25.4 / 72

    def pdf_scale_page(page, size=297):
        """Scale ``page`` so its longest side is ``size`` mm and return it."""
        box = page.mediabox
        # Measure the box extents instead of reading its upper-right corner,
        # so pages whose lower-left corner is not at the origin are scaled
        # by their real width/height.
        longest_side = max(float(box.width), float(box.height))
        page.scale_by(size / (longest_side * mm_per_unit))
        return page

    reader = PdfReader(filename)
    scaled_page = pdf_scale_page(reader.pages[0], max_size)

    # Serialise the single scaled page into memory; pdf2image consumes bytes.
    writer = PdfWriter()
    writer.add_page(scaled_page)
    pdf_bytes = io.BytesIO()
    writer.write(pdf_bytes)

    images = convert_from_bytes(pdf_bytes.getvalue())
    if isinstance(images, list):
        return images[0]
    return images
# Smoke run: rasterise the first page of the sample order confirmation
# (longest side scaled to 300 mm) and write it out as a JPEG.
img = get_first_page_pdf(
    filename="test_samples/20220303025923NHNE_20220222_Starhub_Order_Confirmation_by_Email.pdf",
    max_size=300,
)
img.save("invoice.jpg", format="JPEG")