sbt-idp/cope2n-ai-fi/common/utils_ocr/create_kie_labels.py

68 lines
1.8 KiB
Python
Raw Normal View History

2023-11-30 11:22:16 +00:00
# %%
# Module setup for the KIE label conversion script: path tweaks, imports and
# the input/output locations used by main().
# from pathlib import Path # add Fiintrade path to import config, required to run main()
import sys
# TODO: Document why the current working directory must be appended to
# sys.path (presumably so the project-local imports below resolve when the
# script is launched from the repository root -- confirm).
sys.path.append(".") # add Fiintrade/ to path
from srcc.tools.utils import (
load_kie_labels_yolo,
create_empty_kie_dict,
write_to_json_,
load_train_val_id_cards,
)
import glob
# NOTE(review): `cfg` and `pd` are not referenced in the visible part of this
# file -- confirm whether these imports are still needed.
from OCRBase.config import config as cfg
import os
import pandas as pd
sys.path.append("/home/sds/hoangmd/TokenClassification") # TODO: Replace this machine-specific absolute path with a configurable or repository-relative location
# NOTE(review): Word, words_to_lines and throw_overlapping_words are used
# below without an explicit import; presumably they come from one of these
# star imports -- confirm and import them by name.
from src.experiments.word_formation import *
from process_label import *
# Directory that main() scans for input "*.txt" label files (relative to the
# current working directory).
KIE_LABEL_DIR = "data/label/207/kie"
# Directory that main() writes the generated ".json" files into.
KIE_LABEL_LINE_PATH = "/home/sds/hungbnt/KIE_pretrained/data/label/207/json" # TODO: Replace this machine-specific absolute path with a configurable or repository-relative location
# %%
def create_kie_dict(list_words):
    """Group words by their KIE label and collapse each group into text.

    Args:
        list_words: Word-like objects exposing ``kie_label`` and a mutable
            ``text`` attribute.

    Returns:
        The mapping produced by ``create_empty_kie_dict()`` in which every
        value has been replaced by a single string: the stripped text of
        each line returned by ``words_to_lines`` for that label, joined
        with ``"\\n "``.

    Side effects:
        ``text`` of every word returned by ``throw_overlapping_words`` is
        stripped in place.
    """
    grouped = create_empty_kie_dict()
    surviving_words = throw_overlapping_words(list_words)

    for item in surviving_words:
        # Words whose label is not a key of the mapping are not collected.
        if item.kie_label in grouped:
            grouped[item.kie_label].append(item)
        item.text = item.text.strip()

    # Only values of existing keys are reassigned, so the key set is stable
    # while iterating.
    for label in grouped:
        lines, _ = words_to_lines(grouped[label])
        line_texts = [entry.text.strip() for entry in lines]
        grouped[label] = "\n ".join(line_texts)

    return grouped
# %%
def main():
    """Convert every KIE label text file into a grouped JSON file.

    For each ``*.txt`` file directly under ``KIE_LABEL_DIR`` this loads the
    words, bounding boxes and labels with ``load_kie_labels_yolo``, wraps
    them in ``Word`` objects, groups them with ``create_kie_dict`` and
    writes the result with ``write_to_json_`` to ``KIE_LABEL_LINE_PATH``
    under the same base name with a ``.json`` extension.
    """
    for label_path in glob.glob(f"{KIE_LABEL_DIR}/*.txt"):
        words, bboxes, kie_labels = load_kie_labels_yolo(label_path)
        # Positional indexing is kept on purpose: a label without a matching
        # word or box still raises IndexError instead of being dropped.
        list_words = [
            Word(text=words[i], bndbox=bboxes[i], kie_label=kie_label)
            for i, kie_label in enumerate(kie_labels)
        ]
        kie_dict = create_kie_dict(list_words)
        # Swap only the trailing extension; str.replace(".txt", ".json")
        # would also rewrite a ".txt" embedded earlier in the file name.
        stem, _ = os.path.splitext(os.path.basename(label_path))
        kie_path = os.path.join(KIE_LABEL_LINE_PATH, f"{stem}.json")
        write_to_json_(kie_path, kie_dict)
# %%
if __name__ == "__main__":
    # Run the conversion only when executed as a script, not on import.
    main()