sbt-idp/cope2n-ai-fi/modules/sdsvkie/notebooks/data.py

84 lines
2.3 KiB
Python
Raw Normal View History

2023-12-12 08:14:54 +00:00
# from sdsvkie.utils.io_file import read_json
import json
import Levenshtein
from pathlib import Path
import shutil
import re
from unidecode import unidecode
# from sdsvkie.utils.io_file import read_json
def normalize(text):
    """Return a simplified form of ``text`` for fuzzy comparison.

    The input is lower-cased, passed through ``unidecode`` and then every
    run of characters that is not an ASCII letter, a digit or whitespace
    is removed.
    """
    ascii_text = unidecode(text.lower())
    return re.sub(r'[^a-zA-Z0-9\s]+', '', ascii_text)
def is_match(src, str_new, thr=0.7):
    """Tell whether two strings are similar once normalized.

    Both arguments go through ``normalize`` and are then compared with
    ``Levenshtein.ratio``. The result is True only when that ratio is
    strictly greater than ``thr``.
    """
    similarity = Levenshtein.ratio(normalize(src), normalize(str_new))
    return similarity > thr
def get_store_name(gt_store, store_list):
    """Map ``gt_store`` onto a category label.

    Returns the lower-cased first entry of ``store_list`` that ``is_match``
    accepts against ``gt_store`` with a threshold of 0.6. When no entry
    matches, returns ``"other_non_title"`` for an empty ``gt_store`` and
    ``"other_have_title_<gt_store>"`` otherwise.
    """
    # The generator is lazy, so scanning stops at the first matching entry.
    matched = next(
        (
            candidate.lower()
            for candidate in store_list
            if is_match(candidate, gt_store, thr=0.6)
        ),
        None,
    )
    if matched is not None:
        return matched
    if len(gt_store) == 0:
        return "other_non_title"
    return "other_have_title_{}".format(gt_store)
def read_json(json_path):
    """Load and return the JSON document stored at ``json_path``.

    The file is opened as UTF-8 text and parsed with ``json.load``.
    """
    with open(json_path, encoding="utf8") as handle:
        return json.load(handle)
from collections import Counter

# Script: group receipt images (and their .txt sidecar files) into one
# sub-directory per store category, based on the store names found in a
# prediction JSON file.

# Prediction file: maps an image name to a dict that contains at least
# the key 'Store_name_value'.
json_path = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/test_ss_receipt.json"
pred_data = read_json(json_path)

# Rank the normalized store names from most to least frequent.
store_names = [normalize(item['Store_name_value']) for item in pred_data.values()]
my_counter = Counter(store_names)
list_tuples = my_counter.most_common()
print(list_tuples)
stores = [x[0] for x in list_tuples]
print(stores)

# Drop the single most frequent name before using the list for matching.
# NOTE(review): presumably that entry is the empty string (receipts without
# a store name) - confirm against the data.
store_names = stores[1:]

img_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/SS_Receipt/Images_splitted/All"
out_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/SS_Receipt/Done"
out_dir = Path(out_dir)
img_root = Path(img_dir)  # built once instead of on every iteration

for img_name, item in pred_data.items():
    store_name = item['Store_name_value']
    store_category = get_store_name(store_name, store_list=store_names)
    store_category = store_category.replace(" ", "_")
    print(store_category)

    # exist_ok=True already tolerates an existing directory, so no
    # separate exists() check is needed.
    out_dir_by_store = out_dir / store_category
    out_dir_by_store.mkdir(parents=True, exist_ok=True)

    img_full_path = img_root / Path(img_name).with_suffix(".jpg")
    txt_full_path = img_full_path.with_suffix(".txt")

    if not img_full_path.exists():
        # Report the missing image and move on to the next entry.
        print(str(img_full_path))
        continue

    shutil.copy(str(img_full_path), out_dir_by_store)
    if txt_full_path.exists():
        shutil.copy(str(txt_full_path), out_dir_by_store)
    else:
        # A missing sidecar file is reported the same way as a missing
        # image instead of aborting the whole run with FileNotFoundError.
        print(str(txt_full_path))