sbt-idp/cope2n-ai-fi/modules/TemplateMatching/templatebasedextraction/src/utils/pdf2image.py
2023-11-30 18:22:16 +07:00

46 lines
1.4 KiB
Python
Executable File

import fitz # PyMuPDF, imported as fitz for backward compatibility reasons
import os
import glob
from tqdm import tqdm
import argparse
import cv2
from PIL import Image
def convert_pdf2image(file_path, outdir, img_max_size=None):
    """Render every page of a PDF to a grayscale PNG file in ``outdir``.

    Each page is written as ``<pdf basename>_<page index>.png``, where the
    page index starts at 0. The page is first rendered by PyMuPDF at 2x zoom,
    saved, then reopened with Pillow, converted to 8-bit grayscale (mode
    ``"L"``) and saved over the same path.

    Args:
        file_path: Path of the PDF document to convert.
        outdir: Directory receiving the PNG files; created if missing.
        img_max_size: Accepted for interface compatibility; it is not used
            by the current implementation.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(outdir, exist_ok=True)

    stem = os.path.splitext(os.path.basename(file_path))[0]
    zoom = 2  # zoom factor applied on top of the 72 dpi base resolution
    magnify = fitz.Matrix(zoom, zoom)

    doc = fitz.open(file_path)
    try:
        for idx, page in enumerate(doc):
            outpath = os.path.join(outdir, stem + "_" + str(idx) + ".png")

            # Render the page to an image and write it to disk.
            pix = page.get_pixmap(matrix=magnify)
            pix.save(outpath)

            # Convert inside the context manager so the pixel data is fully
            # loaded and the source handle is closed before overwriting.
            with Image.open(outpath) as img:
                gray = img.convert("L")
            gray.save(outpath)
    finally:
        # Always release the document, even if rendering a page fails.
        doc.close()
if __name__ == "__main__":
    # Command-line entry point: convert every "*.pdf" directly inside
    # --pdf_dir into per-page PNG files written to --out_dir.
    parser = argparse.ArgumentParser()
    # Both options are required: without them the script would fail later
    # with an unhelpful TypeError instead of a usage message.
    parser.add_argument(
        "--pdf_dir",
        type=str,
        required=True,
        help="directory containing the PDF files to convert",
    )
    parser.add_argument(
        "--out_dir",
        type=str,
        required=True,
        help="directory where the PNG files are written",
    )
    args = parser.parse_args()

    pdf_paths = glob.glob(os.path.join(args.pdf_dir, "*.pdf"))
    # Show a small sample of what was discovered before starting the batch.
    print(pdf_paths[:5])
    for pdf_path in tqdm(pdf_paths):
        convert_pdf2image(pdf_path, args.out_dir)