sbt-idp/cope2n-ai-fi/modules/_sdsvkvu/sdsvkvu/utils/utils.py

import os
import json
import glob
from tqdm import tqdm
from pdf2image import convert_from_path
from dicttoxml import dicttoxml


def create_dir(save_dir=''):
    if not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok=True)
    # else:
    #     print("DIR already existed.")
    # print('Save dir : {}'.format(save_dir))


def convert_pdf2img(pdf_dir, save_dir):
    pdf_files = glob.glob(f'{pdf_dir}/*.pdf')
    print('No. pdf files:', len(pdf_files))
    print(pdf_files)
    for file in tqdm(pdf_files):
        pdf2img(file, save_dir, n_pages=-1, return_fname=False)
        # pages = convert_from_path(file, 500)
        # for i, page in enumerate(pages):
        #     page.save(os.path.join(save_dir, os.path.basename(file).replace('.pdf', f'_{i}.jpg')), 'JPEG')
    print('Done!!!')


def pdf2img(pdf_path, save_dir, n_pages=-1, return_fname=False):
    file_names = []
    pages = convert_from_path(pdf_path)
    if n_pages != -1:
        pages = pages[:n_pages]
    for i, page in enumerate(pages):
        _save_path = os.path.join(save_dir, os.path.basename(pdf_path).replace('.pdf', f'_{i}.jpg'))
        page.save(_save_path, 'JPEG')
        file_names.append(_save_path)
    if return_fname:
        return file_names
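
# Illustrative usage sketch (not part of the original module): the paths below are
# hypothetical examples, and pdf2image assumes a working poppler backend is installed.
#   create_dir('outputs/images')
#   convert_pdf2img('inputs/pdfs', 'outputs/images')           # convert every PDF in a folder
#   fnames = pdf2img('inputs/doc.pdf', 'outputs/images',
#                    n_pages=1, return_fname=True)             # first page only, get saved paths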


def xyxy2xywh(bbox):
    return [
        float(bbox[0]),
        float(bbox[1]),
        float(bbox[2]) - float(bbox[0]),
        float(bbox[3]) - float(bbox[1]),
    ]
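
# e.g. xyxy2xywh([10, 20, 110, 220]) -> [10.0, 20.0, 100.0, 200.0]
# (corner format [x1, y1, x2, y2] converted to [x, y, width, height])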


def write_to_json(file_path, content):
    with open(file_path, mode='w', encoding='utf8') as f:
        json.dump(content, f, ensure_ascii=False)


def read_json(file_path):
    with open(file_path, 'r') as f:
        return json.load(f)


def read_xml(file_path):
    with open(file_path, 'r') as xml_file:
        return xml_file.read()


def write_to_xml(file_path, content):
    with open(file_path, mode="w", encoding='utf8') as f:
        f.write(content)


def write_to_xml_from_dict(file_path, content):
    # Convert the dict to XML bytes, then decode to a string before writing.
    # (The stray reassignment that overwrote the converted XML with the raw dict
    # has been removed; decoding the dict directly would raise AttributeError.)
    xml = dicttoxml(content)
    xml_decode = xml.decode()
    with open(file_path, mode="w") as f:
        f.write(xml_decode)


def read_txt(ocr_path):
    with open(ocr_path, "r") as f:
        lines = f.read().splitlines()
    return lines


def load_ocr_result(ocr_path):
    with open(ocr_path, 'r') as f:
        lines = f.read().splitlines()

    preds = []
    for line in lines:
        preds.append(line.split('\t'))
    return preds


def post_process_basic_ocr(lwords: list) -> list:
    # NOTE: the first argument of replace() was presumably a special placeholder
    # character in the original source that did not survive transcription here;
    # as written, replace("", " ") would insert a space between every character.
    pp_lwords = []
    for word in lwords:
        pp_lwords.append(word.replace("", " "))
    return pp_lwords


def read_ocr_result_from_txt(file_path: str):
    '''
    return list of bounding boxes, list of words
    '''
    with open(file_path, 'r') as f:
        lines = f.read().splitlines()

    boxes, words = [], []
    for line in lines:
        if line == "":
            continue
        word_info = line.split("\t")
        if len(word_info) == 6:
            x1, y1, x2, y2, text, _ = word_info
        elif len(word_info) == 5:
            x1, y1, x2, y2, text = word_info
        else:
            # Skip malformed lines instead of reusing stale values or raising NameError.
            continue
        x1, y1, x2, y2 = int(float(x1)), int(float(y1)), int(float(x2)), int(float(y2))
        if text and text != " ":
            words.append(text)
            boxes.append((x1, y1, x2, y2))
    return boxes, words
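

# Illustrative usage sketch (not part of the original module). The file names below
# are hypothetical; it assumes an OCR .txt file with tab-separated
# "x1 y1 x2 y2 text [extra]" lines, as parsed by read_ocr_result_from_txt above.
if __name__ == '__main__':
    example_ocr_path = 'sample_ocr.txt'            # hypothetical input file
    if os.path.exists(example_ocr_path):
        boxes, words = read_ocr_result_from_txt(example_ocr_path)
        # Convert corner boxes to [x, y, w, h] and dump word/box pairs to JSON.
        result = [{'word': w, 'bbox': xyxy2xywh(b)} for w, b in zip(words, boxes)]
        write_to_json('sample_ocr.json', result)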