sbt-idp/cope2n-ai-fi/modules/_sdsvkvu/sdsvkvu/utils/utils.py

import os
import json
import glob
from tqdm import tqdm
from pdf2image import convert_from_path
from dicttoxml import dicttoxml


def create_dir(save_dir=''):
    if not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok=True)
    # else:
    #     print("DIR already existed.")
    # print('Save dir : {}'.format(save_dir))


def convert_pdf2img(pdf_dir, save_dir):
    pdf_files = glob.glob(f'{pdf_dir}/*.pdf')
    print('No. pdf files:', len(pdf_files))
    print(pdf_files)
    for file in tqdm(pdf_files):
        pdf2img(file, save_dir, n_pages=-1, return_fname=False)
        # pages = convert_from_path(file, 500)
        # for i, page in enumerate(pages):
        #     page.save(os.path.join(save_dir, os.path.basename(file).replace('.pdf', f'_{i}.jpg')), 'JPEG')
    print('Done!!!')


def pdf2img(pdf_path, save_dir, n_pages=-1, return_fname=False):
    file_names = []
    pages = convert_from_path(pdf_path)
    if n_pages != -1:
        pages = pages[:n_pages]
    for i, page in enumerate(pages):
        _save_path = os.path.join(save_dir, os.path.basename(pdf_path).replace('.pdf', f'_{i}.jpg'))
        page.save(_save_path, 'JPEG')
        file_names.append(_save_path)
    if return_fname:
        return file_names
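
# Illustrative usage sketch (not part of the original module): the paths below are
# hypothetical examples, and pdf2image assumes a working poppler backend is installed.
#   create_dir('outputs/images')
#   convert_pdf2img('inputs/pdfs', 'outputs/images')           # convert every PDF in a folder
#   fnames = pdf2img('inputs/doc.pdf', 'outputs/images',
#                    n_pages=1, return_fname=True)             # first page only, get saved paths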


def xyxy2xywh(bbox):
    return [
        float(bbox[0]),
        float(bbox[1]),
        float(bbox[2]) - float(bbox[0]),
        float(bbox[3]) - float(bbox[1]),
    ]
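
# e.g. xyxy2xywh([10, 20, 110, 220]) -> [10.0, 20.0, 100.0, 200.0]
# (corner format [x1, y1, x2, y2] converted to [x, y, width, height])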


def write_to_json(file_path, content):
    with open(file_path, mode='w', encoding='utf8') as f:
        json.dump(content, f, ensure_ascii=False)


def read_json(file_path):
    with open(file_path, 'r') as f:
        return json.load(f)


def read_xml(file_path):
    with open(file_path, 'r') as xml_file:
        return xml_file.read()


def write_to_xml(file_path, content):
    with open(file_path, mode="w", encoding='utf8') as f:
        f.write(content)


def write_to_xml_from_dict(file_path, content):
    # Convert the dict to XML bytes, then decode to a string before writing.
    # (The stray reassignment that overwrote the converted XML with the raw dict
    # has been removed; decoding the dict directly would raise AttributeError.)
    xml = dicttoxml(content)
    xml_decode = xml.decode()
    with open(file_path, mode="w") as f:
        f.write(xml_decode)


def read_txt(ocr_path):
    with open(ocr_path, "r") as f:
        lines = f.read().splitlines()
    return lines


def load_ocr_result(ocr_path):
    with open(ocr_path, 'r') as f:
        lines = f.read().splitlines()

    preds = []
    for line in lines:
        preds.append(line.split('\t'))
    return preds


def post_process_basic_ocr(lwords: list) -> list:
    # NOTE: the first argument of replace() was presumably a special placeholder
    # character in the original source that did not survive transcription here;
    # as written, replace("", " ") would insert a space between every character.
    pp_lwords = []
    for word in lwords:
        pp_lwords.append(word.replace("", " "))
    return pp_lwords


def read_ocr_result_from_txt(file_path: str):
    '''
    return list of bounding boxes, list of words
    '''
    with open(file_path, 'r') as f:
        lines = f.read().splitlines()

    boxes, words = [], []
    for line in lines:
        if line == "":
            continue
        word_info = line.split("\t")
        if len(word_info) == 6:
            x1, y1, x2, y2, text, _ = word_info
        elif len(word_info) == 5:
            x1, y1, x2, y2, text = word_info
        else:
            # Skip malformed lines instead of reusing stale values or raising NameError.
            continue
        x1, y1, x2, y2 = int(float(x1)), int(float(y1)), int(float(x2)), int(float(y2))
        if text and text != " ":
            words.append(text)
            boxes.append((x1, y1, x2, y2))
    return boxes, words
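

# Illustrative usage sketch (not part of the original module). The file names below
# are hypothetical; it assumes an OCR .txt file with tab-separated
# "x1 y1 x2 y2 text [extra]" lines, as parsed by read_ocr_result_from_txt above.
if __name__ == '__main__':
    example_ocr_path = 'sample_ocr.txt'            # hypothetical input file
    if os.path.exists(example_ocr_path):
        boxes, words = read_ocr_result_from_txt(example_ocr_path)
        # Convert corner boxes to [x, y, w, h] and dump word/box pairs to JSON.
        result = [{'word': w, 'bbox': xyxy2xywh(b)} for w, b in zip(words, boxes)]
        write_to_json('sample_ocr.json', result)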