130 lines
3.3 KiB
Python
130 lines
3.3 KiB
Python
|
import os
|
||
|
import json
|
||
|
import glob
|
||
|
from tqdm import tqdm
|
||
|
from pdf2image import convert_from_path
|
||
|
from dicttoxml import dicttoxml
|
||
|
|
||
|
|
||
|
def create_dir(save_dir=''):
    """Create *save_dir* (including parents) if it does not already exist.

    Args:
        save_dir: Directory path to create. An empty string (the default)
            raises FileNotFoundError from ``os.makedirs`` — same as before.
    """
    # exist_ok=True already makes this a no-op for an existing directory,
    # so the previous `os.path.exists` pre-check was a redundant
    # check-then-act pattern and has been removed.
    os.makedirs(save_dir, exist_ok=True)
|
||
|
|
||
|
def convert_pdf2img(pdf_dir, save_dir):
    """Convert every PDF found directly under *pdf_dir* into per-page JPEGs.

    Delegates the actual rendering of each file to :func:`pdf2img`,
    saving all pages into *save_dir*.
    """
    pdf_files = glob.glob(f'{pdf_dir}/*.pdf')
    print('No. pdf files:', len(pdf_files))
    print(pdf_files)

    for pdf_path in tqdm(pdf_files):
        pdf2img(pdf_path, save_dir, n_pages=-1, return_fname=False)
    print('Done!!!')
|
||
|
|
||
|
def pdf2img(pdf_path, save_dir, n_pages=-1, return_fname=False):
    """Render *pdf_path* to JPEG images in *save_dir*, one file per page.

    Args:
        pdf_path: Path to the input PDF.
        save_dir: Directory that receives ``<stem>_<i>.jpg`` files.
        n_pages: Keep only the first ``n_pages`` pages; -1 means all pages.
        return_fname: When True, return the list of saved image paths.

    Returns:
        List of saved file paths when ``return_fname`` is True, else None.
    """
    saved_paths = []
    pages = convert_from_path(pdf_path)
    if n_pages != -1:
        pages = pages[:n_pages]

    base_name = os.path.basename(pdf_path)
    for page_idx, page in enumerate(pages):
        out_path = os.path.join(save_dir, base_name.replace('.pdf', f'_{page_idx}.jpg'))
        page.save(out_path, 'JPEG')
        saved_paths.append(out_path)

    if return_fname:
        return saved_paths
|
||
|
|
||
|
def xyxy2xywh(bbox):
    """Convert an ``(x1, y1, x2, y2)`` box into ``[x, y, width, height]`` floats."""
    x1 = float(bbox[0])
    y1 = float(bbox[1])
    width = float(bbox[2]) - x1
    height = float(bbox[3]) - y1
    return [x1, y1, width, height]
|
||
|
|
||
|
def write_to_json(file_path, content):
    """Serialize *content* as JSON to *file_path* (UTF-8, non-ASCII preserved)."""
    with open(file_path, mode='w', encoding='utf8') as out_file:
        out_file.write(json.dumps(content, ensure_ascii=False))
|
||
|
|
||
|
|
||
|
def read_json(file_path):
    """Load and return the JSON document stored at *file_path*."""
    with open(file_path, 'r') as json_file:
        data = json.load(json_file)
    return data
|
||
|
|
||
|
def read_xml(file_path):
    """Return the raw text content of the XML file at *file_path*."""
    with open(file_path, 'r') as src:
        content = src.read()
    return content
|
||
|
|
||
|
def write_to_xml(file_path, content):
    """Write the XML string *content* to *file_path* using UTF-8."""
    with open(file_path, "w", encoding='utf8') as dst:
        dst.write(content)
|
||
|
|
||
|
def write_to_xml_from_dict(file_path, content):
    """Serialize the dict *content* to XML via ``dicttoxml`` and write it
    to *file_path*.

    Bug fix: the previous version re-assigned ``xml = content`` right after
    the conversion, discarding the ``dicttoxml`` result entirely — for dict
    input the subsequent ``.decode()`` then raised AttributeError.
    """
    xml = dicttoxml(content)  # returns bytes
    xml_decode = xml.decode()

    with open(file_path, mode="w") as f:
        f.write(xml_decode)
|
||
|
|
||
|
def read_txt(ocr_path):
    """Read the text file at *ocr_path* and return its lines without newlines."""
    with open(ocr_path, "r") as src:
        return src.read().splitlines()
|
||
|
|
||
|
def load_ocr_result(ocr_path):
    """Parse an OCR result file: one prediction per line, tab-separated fields.

    Returns:
        List of lists of strings, one inner list per line.
    """
    with open(ocr_path, 'r') as src:
        raw_lines = src.read().splitlines()

    return [raw.split('\t') for raw in raw_lines]
|
||
|
|
||
|
def post_process_basic_ocr(lwords: list) -> list:
    """Replace the OCR placeholder character '✪' with a space in every word."""
    return [word.replace("✪", " ") for word in lwords]
|
||
|
|
||
|
def read_ocr_result_from_txt(file_path: str):
    '''
    Parse an OCR result file into bounding boxes and words.

    Each non-empty line is tab-separated: x1, y1, x2, y2, text[, extra].
    Lines with any other field count are now skipped — previously such a
    line raised NameError (if it was the first line) or silently reused the
    coordinates left over from the preceding line.

    Args:
        file_path: Path to the tab-separated OCR result file.

    Returns:
        (boxes, words): list of (x1, y1, x2, y2) int tuples and the
        matching list of text strings; blank/whitespace-only texts are
        dropped from both lists.
    '''
    with open(file_path, 'r') as f:
        lines = f.read().splitlines()

    boxes, words = [], []
    for line in lines:
        if line == "":
            continue
        word_info = line.split("\t")
        if len(word_info) == 6:
            x1, y1, x2, y2, text, _ = word_info
        elif len(word_info) == 5:
            x1, y1, x2, y2, text = word_info
        else:
            # Malformed line: skip instead of crashing or reusing stale coords.
            continue

        x1, y1, x2, y2 = int(float(x1)), int(float(y1)), int(float(x2)), int(float(y2))
        if text and text != " ":
            words.append(text)
            boxes.append((x1, y1, x2, y2))
    return boxes, words
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|