"""Group receipt images into per-store folders using predicted store names.

The script loads a JSON file of predictions keyed by image name, counts the
normalized ``Store_name_value`` of every entry, and then copies each image
(``.jpg``) together with its ``.txt`` companion into
``out_dir / <store category>``.  The category is chosen by fuzzy-matching the
predicted store name against the list of known store names.
"""
import json
import re
import shutil
from collections import Counter
from pathlib import Path

import Levenshtein
from unidecode import unidecode

# from sdsvkie.utils.io_file import read_json
# (kept for reference; an equivalent local ``read_json`` is defined below)


def normalize(text):
    """Return *text* lower-cased, transliterated and stripped of punctuation.

    Args:
        text: The string to normalize.

    Returns:
        The lower-cased string after ``unidecode`` transliteration, with every
        character other than ASCII letters, digits and whitespace removed.
    """
    text = unidecode(text.lower())
    return re.sub(r'[^a-zA-Z0-9\s]+', '', text)


def is_match(src, str_new, thr=0.7):
    """Tell whether two strings are similar enough after normalization.

    Args:
        src: First string.
        str_new: Second string.
        thr: Threshold that ``Levenshtein.ratio`` must strictly exceed.

    Returns:
        True when the ratio of the two normalized strings is greater than
        ``thr``, otherwise False.
    """
    # ``Levenshtein.ratio`` is a similarity score (higher means closer),
    # not a distance.
    similarity = Levenshtein.ratio(normalize(src), normalize(str_new))
    return similarity > thr


def get_store_name(gt_store, store_list, thr=0.6):
    """Map a store name onto a category label.

    Args:
        gt_store: The store name to classify.
        store_list: Candidate store names, tried in order; the first one that
            matches wins.
        thr: Similarity threshold forwarded to ``is_match``.

    Returns:
        The lower-cased matching entry of ``store_list`` when one matches;
        otherwise ``"other_non_title"`` for an empty ``gt_store`` or
        ``"other_have_title_<gt_store>"`` for a non-empty one.
    """
    for store in store_list:
        if is_match(store, gt_store, thr=thr):
            return store.lower()

    if not gt_store:
        return "other_non_title"
    return "other_have_title_{}".format(gt_store)


def read_json(json_path):
    """Load and return the content of the UTF-8 JSON file at *json_path*."""
    with open(json_path, "r", encoding="utf8") as f:
        return json.load(f)


json_path = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/test_ss_receipt.json"
pred_data = read_json(json_path)

# Rank the normalized store names by how often they occur in the predictions.
store_names = [normalize(item['Store_name_value']) for item in pred_data.values()]
# store_names = list(set(store_names))
my_counter = Counter(store_names)
list_tuples = my_counter.most_common()
print(list_tuples)
stores = [x[0] for x in list_tuples]
print(stores)
# The most frequent name is excluded from the candidate list.
# NOTE(review): presumably that entry is the empty name (receipts without a
# store title) -- confirm against the data.
store_names = stores[1:]

img_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/SS_Receipt/Images_splitted/All"
out_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/SS_Receipt/Done"
out_dir = Path(out_dir)

for img_name, item in pred_data.items():
    store_name = item['Store_name_value']
    store_category = get_store_name(store_name, store_list=store_names)
    store_category = store_category.replace(" ", "_")
    print(store_category)

    # ``exist_ok=True`` already makes the call safe when the folder exists.
    out_dir_by_store = out_dir / store_category
    out_dir_by_store.mkdir(parents=True, exist_ok=True)

    img_full_name = Path(img_name).with_suffix(".jpg")
    img_full_path = Path(img_dir) / img_full_name
    txt_full_path = img_full_path.with_suffix(".txt")

    if not img_full_path.exists():
        # Report the missing image and move on to the next entry.
        print(str(img_full_path))
        continue

    shutil.copy(str(img_full_path), out_dir_by_store)
    # A missing .txt companion is reported the same way as a missing image
    # instead of aborting the whole run with FileNotFoundError.
    if txt_full_path.exists():
        shutil.copy(str(txt_full_path), out_dir_by_store)
    else:
        print(str(txt_full_path))