130 lines
3.3 KiB
Python
130 lines
3.3 KiB
Python
|
import os
|
||
|
import json
|
||
|
import glob
|
||
|
from tqdm import tqdm
|
||
|
from pdf2image import convert_from_path
|
||
|
from dicttoxml import dicttoxml
|
||
|
|
||
|
|
||
|
def create_dir(save_dir=''):
    """Create *save_dir* (including parents) if it does not already exist.

    Args:
        save_dir: Directory path to create. An empty string (the default)
            raises FileNotFoundError from ``os.makedirs`` — same as before.
    """
    # exist_ok=True already makes this a no-op for an existing directory,
    # so the previous `os.path.exists` pre-check was a redundant
    # check-then-act pattern and has been removed.
    os.makedirs(save_dir, exist_ok=True)
|
||
|
|
||
|
def convert_pdf2img(pdf_dir, save_dir):
    """Convert every PDF found directly under *pdf_dir* into per-page JPEGs.

    Delegates the actual rendering of each file to :func:`pdf2img`,
    saving all pages into *save_dir*.
    """
    pdf_files = glob.glob(f'{pdf_dir}/*.pdf')
    print('No. pdf files:', len(pdf_files))
    print(pdf_files)

    for pdf_path in tqdm(pdf_files):
        pdf2img(pdf_path, save_dir, n_pages=-1, return_fname=False)
    print('Done!!!')
|
||
|
|
||
|
def pdf2img(pdf_path, save_dir, n_pages=-1, return_fname=False):
    """Render *pdf_path* to JPEG images in *save_dir*, one file per page.

    Args:
        pdf_path: Path to the input PDF.
        save_dir: Directory that receives ``<stem>_<i>.jpg`` files.
        n_pages: Keep only the first ``n_pages`` pages; -1 means all pages.
        return_fname: When True, return the list of saved image paths.

    Returns:
        List of saved file paths when ``return_fname`` is True, else None.
    """
    saved_paths = []
    pages = convert_from_path(pdf_path)
    if n_pages != -1:
        pages = pages[:n_pages]

    base_name = os.path.basename(pdf_path)
    for page_idx, page in enumerate(pages):
        out_path = os.path.join(save_dir, base_name.replace('.pdf', f'_{page_idx}.jpg'))
        page.save(out_path, 'JPEG')
        saved_paths.append(out_path)

    if return_fname:
        return saved_paths
|
||
|
|
||
|
def xyxy2xywh(bbox):
    """Convert an ``(x1, y1, x2, y2)`` box into ``[x, y, width, height]`` floats."""
    x1 = float(bbox[0])
    y1 = float(bbox[1])
    width = float(bbox[2]) - x1
    height = float(bbox[3]) - y1
    return [x1, y1, width, height]
|
||
|
|
||
|
def write_to_json(file_path, content):
    """Serialize *content* as JSON to *file_path* (UTF-8, non-ASCII preserved)."""
    with open(file_path, mode='w', encoding='utf8') as out_file:
        out_file.write(json.dumps(content, ensure_ascii=False))
|
||
|
|
||
|
|
||
|
def read_json(file_path):
    """Load and return the JSON document stored at *file_path*."""
    with open(file_path, 'r') as json_file:
        data = json.load(json_file)
    return data
|
||
|
|
||
|
def read_xml(file_path):
    """Return the raw text content of the XML file at *file_path*."""
    with open(file_path, 'r') as src:
        content = src.read()
    return content
|
||
|
|
||
|
def write_to_xml(file_path, content):
    """Write the XML string *content* to *file_path* using UTF-8."""
    with open(file_path, "w", encoding='utf8') as dst:
        dst.write(content)
|
||
|
|
||
|
def write_to_xml_from_dict(file_path, content):
    """Serialize the dict *content* to XML via ``dicttoxml`` and write it
    to *file_path*.

    Bug fix: the previous version re-assigned ``xml = content`` right after
    the conversion, discarding the ``dicttoxml`` result entirely — for dict
    input the subsequent ``.decode()`` then raised AttributeError.
    """
    xml = dicttoxml(content)  # returns bytes
    xml_decode = xml.decode()

    with open(file_path, mode="w") as f:
        f.write(xml_decode)
|
||
|
|
||
|
def read_txt(ocr_path):
    """Read the text file at *ocr_path* and return its lines without newlines."""
    with open(ocr_path, "r") as src:
        return src.read().splitlines()
|
||
|
|
||
|
def load_ocr_result(ocr_path):
    """Parse an OCR result file: one prediction per line, tab-separated fields.

    Returns:
        List of lists of strings, one inner list per line.
    """
    with open(ocr_path, 'r') as src:
        raw_lines = src.read().splitlines()

    return [raw.split('\t') for raw in raw_lines]
|
||
|
|
||
|
def post_process_basic_ocr(lwords: list) -> list:
    """Replace the OCR placeholder character '✪' with a space in every word."""
    return [word.replace("✪", " ") for word in lwords]
|
||
|
|
||
|
def read_ocr_result_from_txt(file_path: str):
    '''
    Parse an OCR result file into bounding boxes and words.

    Each non-empty line is tab-separated: x1, y1, x2, y2, text[, extra].
    Lines with any other field count are now skipped — previously such a
    line raised NameError (if it was the first line) or silently reused the
    coordinates left over from the preceding line.

    Args:
        file_path: Path to the tab-separated OCR result file.

    Returns:
        (boxes, words): list of (x1, y1, x2, y2) int tuples and the
        matching list of text strings; blank/whitespace-only texts are
        dropped from both lists.
    '''
    with open(file_path, 'r') as f:
        lines = f.read().splitlines()

    boxes, words = [], []
    for line in lines:
        if line == "":
            continue
        word_info = line.split("\t")
        if len(word_info) == 6:
            x1, y1, x2, y2, text, _ = word_info
        elif len(word_info) == 5:
            x1, y1, x2, y2, text = word_info
        else:
            # Malformed line: skip instead of crashing or reusing stale coords.
            continue

        x1, y1, x2, y2 = int(float(x1)), int(float(y1)), int(float(x2)), int(float(y2))
        if text and text != " ":
            words.append(text)
            boxes.append((x1, y1, x2, y2))
    return boxes, words
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|