import os
import json
import glob

from tqdm import tqdm
from pdf2image import convert_from_path
from dicttoxml import dicttoxml


def create_dir(save_dir=''):
    """Create ``save_dir`` (and any missing parents) if it does not exist.

    An empty path is treated as "nothing to create" instead of being passed
    to ``os.makedirs``, which would raise ``FileNotFoundError``.
    """
    if save_dir and not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok=True)


def convert_pdf2img(pdf_dir, save_dir):
    """Convert every ``*.pdf`` file directly inside ``pdf_dir`` to JPEG pages.

    Args:
        pdf_dir: Directory that is searched (non-recursively) for PDF files.
        save_dir: Directory the page images are written to.
    """
    pdf_files = glob.glob(f'{pdf_dir}/*.pdf')
    print('No. pdf files:', len(pdf_files))
    print(pdf_files)

    for file in tqdm(pdf_files):
        pdf2img(file, save_dir, n_pages=-1, return_fname=False)
    print('Done!!!')


def pdf2img(pdf_path, save_dir, n_pages=-1, return_fname=False):
    """Render the pages of one PDF and save each as ``<stem>_<index>.jpg``.

    Args:
        pdf_path: Path of the PDF file to convert.
        save_dir: Directory the JPEG files are written to.
        n_pages: Number of leading pages to keep; ``-1`` keeps all pages.
        return_fname: When true, return the list of written file paths.

    Returns:
        The list of saved image paths if ``return_fname`` is true,
        otherwise ``None``.
    """
    pages = convert_from_path(pdf_path)
    if n_pages != -1:
        pages = pages[:n_pages]

    # Strip the extension with splitext instead of str.replace('.pdf', ...):
    # replace() rewrites every '.pdf' substring in the name and leaves the
    # name untouched for other spellings (e.g. '.PDF'), in which case all
    # pages would be written to the same path and overwrite each other.
    stem = os.path.splitext(os.path.basename(pdf_path))[0]

    file_names = []
    for i, page in enumerate(pages):
        _save_path = os.path.join(save_dir, f'{stem}_{i}.jpg')
        page.save(_save_path, 'JPEG')
        file_names.append(_save_path)

    if return_fname:
        return file_names


def xyxy2xywh(bbox):
    """Convert an ``[x1, y1, x2, y2]`` box to ``[x, y, width, height]`` floats."""
    x1, y1 = float(bbox[0]), float(bbox[1])
    x2, y2 = float(bbox[2]), float(bbox[3])
    return [x1, y1, x2 - x1, y2 - y1]


def write_to_json(file_path, content):
    """Serialize ``content`` as JSON to ``file_path`` (UTF-8, non-ASCII kept)."""
    with open(file_path, mode='w', encoding='utf8') as f:
        json.dump(content, f, ensure_ascii=False)


def read_json(file_path):
    """Load and return the JSON document stored at ``file_path``."""
    # Read as UTF-8 to match write_to_json, which emits raw non-ASCII text.
    with open(file_path, 'r', encoding='utf8') as f:
        return json.load(f)


def read_xml(file_path):
    """Return the whole content of ``file_path`` as a string."""
    with open(file_path, 'r', encoding='utf8') as xml_file:
        return xml_file.read()


def write_to_xml(file_path, content):
    """Write the string ``content`` to ``file_path`` using UTF-8."""
    with open(file_path, mode="w", encoding='utf8') as f:
        f.write(content)


def write_to_xml_from_dict(file_path, content):
    """Convert ``content`` to XML with ``dicttoxml`` and write it to a file.

    Args:
        file_path: Destination path of the XML file.
        content: Object accepted by ``dicttoxml`` (typically a dict).
    """
    # The converter output is what must be decoded; decoding ``content``
    # itself fails because a dict has no ``decode`` method.
    xml = dicttoxml(content)
    xml_decode = xml.decode()
    with open(file_path, mode="w", encoding='utf8') as f:
        f.write(xml_decode)


def read_txt(ocr_path):
    """Return the lines of ``ocr_path`` without their line terminators."""
    with open(ocr_path, "r", encoding='utf8') as f:
        lines = f.read().splitlines()
    return lines


def load_ocr_result(ocr_path):
    """Read a tab-separated OCR file and return one list of fields per line."""
    return [line.split('\t') for line in read_txt(ocr_path)]


def post_process_basic_ocr(lwords: list) -> list:
    """Return a copy of ``lwords`` with every "✪" replaced by a space."""
    return [word.replace("✪", " ") for word in lwords]


def read_ocr_result_from_txt(file_path: str):
    """Parse a tab-separated OCR result file into boxes and words.

    Each non-empty line is expected to hold ``x1, y1, x2, y2, text`` and
    optionally one extra trailing field, which is ignored. Lines with any
    other number of fields are skipped. Entries whose text is empty or a
    single space are dropped.

    Args:
        file_path: Path of the OCR result file.

    Returns:
        A tuple ``(boxes, words)`` where ``boxes`` is a list of
        ``(x1, y1, x2, y2)`` integer tuples and ``words`` is the list of
        matching text strings.

    Raises:
        ValueError: If a coordinate field cannot be parsed as a number.
    """
    boxes, words = [], []
    for line in read_txt(file_path):
        if line == "":
            continue

        word_info = line.split("\t")
        # Skip malformed rows: unpacking them would either fail outright or
        # silently reuse the coordinates and text of the previous row.
        if len(word_info) not in (5, 6):
            continue

        x1, y1, x2, y2, text = word_info[:5]
        # Coordinates may be written as floats; truncate them to ints.
        x1, y1, x2, y2 = int(float(x1)), int(float(y1)), int(float(x2)), int(float(y2))

        if text and text != " ":
            words.append(text)
            boxes.append((x1, y1, x2, y2))
    return boxes, words