84 lines
2.3 KiB
Python
84 lines
2.3 KiB
Python
|
# from sdsvkie.utils.io_file import read_json
|
||
|
import json
|
||
|
import Levenshtein
|
||
|
from pathlib import Path
|
||
|
import shutil
|
||
|
import re
|
||
|
from unidecode import unidecode
|
||
|
|
||
|
# from sdsvkie.utils.io_file import read_json
|
||
|
|
||
|
|
||
|
def normalize(text):
    """Return a lowercase, ASCII-only, punctuation-free version of ``text``.

    The text is lowercased, transliterated to ASCII with ``unidecode``,
    lowercased again, and finally stripped of every character that is not
    an ASCII letter, a digit or whitespace.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string. Whitespace is kept as-is; it is neither
        collapsed nor trimmed.
    """
    # Lowercase a second time after transliteration: unidecode can emit
    # capital letters for characters that have no case of their own (e.g.
    # CJK ideographs), which the first lower() cannot affect.
    text = unidecode(text.lower()).lower()
    return re.sub(r'[^a-zA-Z0-9\s]+', '', text)
|
||
|
|
||
|
|
||
|
|
||
|
def is_match(src, str_new, thr=0.7):
    """Tell whether two strings are similar enough after normalization.

    Both strings are passed through ``normalize`` and then compared with
    ``Levenshtein.ratio``.

    Args:
        src: First string to compare.
        str_new: Second string to compare.
        thr: Similarity threshold. The comparison is strict, so a ratio
            exactly equal to ``thr`` is not a match.

    Returns:
        ``True`` when the similarity ratio is strictly greater than
        ``thr``, ``False`` otherwise.
    """
    similarity = Levenshtein.ratio(normalize(src), normalize(str_new))
    return similarity > thr
|
||
|
|
||
|
def get_store_name(gt_store, store_list):
    """Map a store title onto a category name.

    Args:
        gt_store: Store title to classify. An empty (or ``None``) title
            means no title is available.
        store_list: Candidate store names, tried in order.

    Returns:
        * ``"other_non_title"`` when ``gt_store`` is empty;
        * the lowercased first entry of ``store_list`` that ``is_match``
          accepts with ``thr=0.6``;
        * ``"other_have_title_<gt_store>"`` when no candidate matches.
    """
    # Handle the "no title" case before any fuzzy matching: an empty title
    # would otherwise match an empty candidate and yield "" as the
    # category name.
    if not gt_store:
        return "other_non_title"

    # First match wins, so the order of store_list decides between
    # several candidates that are all above the threshold.
    for store in store_list:
        if is_match(store, gt_store, thr=0.6):
            return store.lower()

    return "other_have_title_{}".format(gt_store)
|
||
|
|
||
|
|
||
|
def read_json(json_path):
    """Load a UTF-8 encoded JSON file and return the decoded content.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for that file.
    """
    with open(json_path, "r", encoding="utf8") as handle:
        return json.load(handle)
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
from collections import Counter

# Script: sort receipt images (and their .txt sidecar files) into one
# sub-directory per store, based on the 'Store_name_value' field of each
# entry in the predictions JSON.
json_path = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/test_ss_receipt.json"
pred_data = read_json(json_path)

# Rank the normalized store names by how often they occur.
store_names = [normalize(item['Store_name_value']) for item in pred_data.values()]
my_counter = Counter(store_names)
list_tuples = my_counter.most_common()
print(list_tuples)
stores = [x[0] for x in list_tuples]
print(stores)

# NOTE(review): the most frequent entry is skipped here - presumably it is
# the empty name coming from receipts without a store title; confirm.
# Blank names are dropped as well: used as a reference they would produce
# an empty category, sending files straight into out_dir itself.
store_names = [name for name in stores[1:] if name.strip()]

img_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/SS_Receipt/Images_splitted/All"
out_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/SS_Receipt/Done"
out_dir = Path(out_dir)
for img_name, item in pred_data.items():
    store_name = item['Store_name_value']
    store_category = get_store_name(store_name, store_list=store_names)
    store_category = store_category.replace(" ", "_")
    print(store_category)

    # exist_ok makes a separate existence check unnecessary.
    out_dir_by_store = out_dir / store_category
    out_dir_by_store.mkdir(parents=True, exist_ok=True)

    img_full_name = Path(img_name).with_suffix(".jpg")
    img_full_path = Path(img_dir) / img_full_name
    txt_full_path = img_full_path.with_suffix(".txt")

    if not img_full_path.exists():
        # Report the missing image and move on to the next entry.
        print(str(img_full_path))
        continue

    shutil.copy(str(img_full_path), out_dir_by_store)
    # A missing sidecar file must not abort the whole run; report it the
    # same way a missing image is reported.
    if txt_full_path.exists():
        shutil.copy(str(txt_full_path), out_dir_by_store)
    else:
        print(str(txt_full_path))
|