2023-12-13 09:01:31 +00:00
import io
2023-12-12 05:54:34 +00:00
from PIL import Image
2023-12-13 09:01:31 +00:00
from PyPDF2 import PdfReader , PdfWriter
from pdf2image import convert_from_bytes
2023-12-12 05:54:34 +00:00
def resize(image, max_w=2048, max_h=2048):
    """Shrink *image* so it fits inside a ``max_w`` x ``max_h`` box.

    The aspect ratio is preserved. Images that already fit are returned
    unchanged (the very same object); this function never upscales.

    Args:
        image: Object exposing ``width``, ``height`` and
            ``resize((w, h))`` -- presumably a ``PIL.Image.Image``.
        max_w: Maximum allowed width in pixels.
        max_h: Maximum allowed height in pixels.

    Returns:
        The original image if it fits, otherwise the result of
        ``image.resize`` with the scaled-down dimensions.
    """
    cur_w = image.width
    cur_h = image.height

    # Guard clause: nothing to do when both sides are within bounds.
    # (The width must be checked against max_w; comparing the height to
    # max_w let over-wide images slip through unresized.)
    if cur_w <= max_w and cur_h <= max_h:
        return image

    # The tighter of the two ratios guarantees both sides end up in bounds.
    ratio = min(max_w / cur_w, max_h / cur_h)

    # Clamp to 1 px so an extremely elongated image never yields a
    # zero-sized target dimension.
    new_w = max(1, int(ratio * cur_w))
    new_h = max(1, int(ratio * cur_h))
    return image.resize((new_w, new_h))
def fitz_pixmap_to_pillow_with_resize(image, max_w=2048, max_h=2048):
    """Convert a pixmap into a Pillow image and shrink it to fit a box.

    Args:
        image: Object exposing ``width``, ``height`` and ``samples`` (raw
            pixel bytes) -- presumably a PyMuPDF (``fitz``) ``Pixmap``.
            NOTE(review): the samples are interpreted as packed 8-bit RGB
            with no alpha channel; confirm callers never pass a pixmap
            with alpha or a non-RGB colorspace.
        max_w: Maximum allowed width in pixels, forwarded to ``resize``.
        max_h: Maximum allowed height in pixels, forwarded to ``resize``.

    Returns:
        The image built by ``Image.frombytes`` after passing it through
        ``resize``.
    """
    size = (image.width, image.height)
    # Pillow mode names are exact strings; "RGB" must carry no padding.
    pil_image = Image.frombytes("RGB", size, image.samples)
    return resize(pil_image, max_w, max_h)
def get_first_page_pdf(filename, max_size=300):
    """Render the first page of a PDF as an image.

    The first page is scaled so that its longer side measures ``max_size``
    millimetres, written to an in-memory single-page PDF, and then
    rasterised with ``pdf2image.convert_from_bytes``.

    Args:
        filename: Anything ``PdfReader`` accepts as its source
            (the name suggests a file path).
        max_size: Target length of the page's longer side, in millimetres.

    Returns:
        The first element of the result of ``convert_from_bytes`` when
        that result is a list, otherwise the result itself.

    Raises:
        IndexError: If the document has no pages.
    """

    def pdf_scale_page(page, size=297):
        """Scale *page* in place so its longer side is *size* millimetres."""
        box = page.mediabox
        # Use the box's real extent: the upper-right corner alone equals
        # the page size only when the lower-left corner sits at (0, 0).
        longest_side = float(max(box.width, box.height))
        # PDF user-space units default to 1/72 inch; 1 inch = 25.4 mm.
        mm_per_unit = 1 / 72 * 25.4
        page.scale_by(size / (longest_side * mm_per_unit))
        return page

    reader = PdfReader(filename)
    scaled_page = pdf_scale_page(reader.pages[0], max_size)

    # Serialise the single scaled page into memory for the rasteriser.
    buffer = io.BytesIO()
    writer = PdfWriter()
    writer.add_page(scaled_page)
    writer.write(buffer)

    images = convert_from_bytes(buffer.getvalue())
    if isinstance(images, list):
        return images[0]
    return images