118 lines
3.3 KiB
Python
Executable File
118 lines
3.3 KiB
Python
Executable File
|
|
from pathlib import Path
|
|
import shutil
|
|
import json
|
|
|
|
def write_txt(txt, data, mode="w"):
    """Write every string of ``data`` to the file ``txt``, one per line.

    Args:
        txt: Path of the text file to open.
        data: Iterable of strings; a newline is appended to each entry.
        mode: File mode handed to ``open`` (``"w"`` overwrites, ``"a"`` appends).
    """
    with open(txt, mode, encoding="utf8") as handle:
        handle.writelines(entry + "\n" for entry in data)
|
|
|
|
|
|
def read_txt(txt):
    """Read the UTF-8 text file ``txt`` and return its lines as a list.

    Leading and trailing whitespace (including the newline) is stripped from
    every line; blank lines are kept as empty strings.
    """
    with open(txt, "r", encoding="utf8") as handle:
        return list(map(str.strip, handle))
|
|
|
|
def get_no(items):
    """Extract the value attached to the last ``"No_value"`` entry.

    Every entry of ``items`` containing the substring ``"No_value"`` is split
    on tabs and its second-to-last field is taken; the field of the last such
    entry is returned.

    Returns:
        The extracted field, or the placeholder ``"xxxx"`` when no entry
        contains ``"No_value"``.

    Raises:
        IndexError: If a matching entry has fewer than two tab-separated
            fields.
    """
    candidates = [
        entry.split("\t")[-2] for entry in items if "No_value" in entry
    ]
    if not candidates:
        return "xxxx"
    return candidates[-1]
|
|
|
|
def write_json(json_path, data):
    """Serialize ``data`` as JSON into the UTF-8 file ``json_path``.

    Keys are written in sorted order and non-ASCII characters are emitted
    verbatim rather than as ``\\uXXXX`` escapes.
    """
    dump_options = {"ensure_ascii": False, "sort_keys": True}
    with open(json_path, "w", encoding="utf8") as sink:
        json.dump(data, sink, **dump_options)
|
|
|
|
|
|
def read_json(json_path):
    """Load and return the JSON document stored in the UTF-8 file ``json_path``."""
    with open(json_path, "r", encoding="utf8") as source:
        return json.loads(source.read())
|
|
|
|
|
|
def check(txt_dir):
    """Find label files under ``txt_dir`` that share the same ``get_no`` value.

    Every ``*.txt`` file found recursively is read with ``read_txt`` and keyed
    by the value ``get_no`` extracts from it. The path recorded for each file
    is the label path with its suffix swapped to ``.jpg``.

    Returns:
        A dict mapping each extracted value to its list of ``.jpg`` paths,
        with the groups that contain exactly one path removed. The keys of
        this dict are also printed.
    """
    grouped = {}
    for label_file in Path(txt_dir).rglob("*.txt"):
        doc_no = get_no(read_txt(str(label_file)))
        image_name = str(label_file.with_suffix(".jpg"))
        grouped.setdefault(doc_no, []).append(image_name)

    # Note: the "xxxx" placeholder group (files for which get_no found no
    # match) is not dropped here; it is reported like any other group.
    log_dict = {
        doc_no: images
        for doc_no, images in grouped.items()
        if len(images) != 1
    }

    print(log_dict.keys())
    return log_dict
|
|
|
|
# print(check("/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/train/SS_Invoice"))
|
|
|
|
|
|
def get_leak_test(data_dir):
    """Collect the keys of all ``test_*`` entries found under ``data_dir``.

    ``data_dir`` is searched recursively for entries (files or directories)
    whose name starts with ``test_``. For each one the key is its stem (name
    without the final suffix) with the leading ``test_`` prefix removed.

    Only the leading prefix is stripped: a stem such as ``test_my_test_01``
    yields ``my_test_01``. Removing every occurrence of ``test_`` would
    mangle such keys so they no longer match the real image names.

    Args:
        data_dir: Directory to search (``str`` or ``Path``).

    Returns:
        A list of key strings, in the order the entries were discovered.
    """
    prefix = "test_"
    test_keys = []
    for path in Path(data_dir).rglob(prefix + "*"):
        img_name = path.stem
        # Guarded in case the glob matched case-insensitively on this platform.
        if img_name.startswith(prefix):
            img_name = img_name[len(prefix):]
        test_keys.append(img_name)
    return test_keys
|
|
|
|
|
|
def create_new_test(ori_dir, out_dir, test_keys):
    """Copy image/label pairs from ``ori_dir`` to ``out_dir``, skipping leaked keys.

    Every ``*.jpg`` found recursively under ``ori_dir`` whose stem is not in
    ``test_keys`` is copied into ``out_dir`` together with the ``.txt`` file
    that sits next to it. Files are copied flat: sub-directory structure is
    not reproduced in ``out_dir``.

    Args:
        ori_dir: Source directory (``str`` or ``Path``).
        out_dir: Destination directory; created with parents if missing.
        test_keys: Iterable of image stems to exclude.

    Raises:
        FileExistsError: If ``out_dir`` exists but is not a directory.
        FileNotFoundError: If a kept image has no sibling ``.txt`` file (the
            image itself has already been copied at that point).
    """
    ori_dir = Path(ori_dir)
    out_dir = Path(out_dir)
    # exist_ok=True makes a separate exists() check unnecessary, and lets
    # mkdir reject a non-directory out_dir instead of copying over it.
    out_dir.mkdir(parents=True, exist_ok=True)

    # Materialize once: constant-time lookups, and safe for one-shot iterators.
    excluded = set(test_keys)

    for img_path in ori_dir.rglob("*.jpg"):
        if img_path.stem in excluded:
            continue
        txt_path = img_path.with_suffix(".txt")
        shutil.copy(str(img_path), str(out_dir))
        shutil.copy(str(txt_path), str(out_dir))
|
|
|
|
def create_new_e2e_test(ori_json, out_json, test_keys):
    """Write a copy of the JSON mapping ``ori_json`` without the leaked keys.

    The document at ``ori_json`` is loaded with ``read_json`` (it must be a
    JSON object), every top-level entry whose key appears in ``test_keys`` is
    dropped, and the remainder is written to ``out_json`` with ``write_json``.

    Args:
        ori_json: Path of the source JSON file.
        out_json: Path of the filtered JSON file to write.
        test_keys: Iterable of keys to exclude.
    """
    # Materialize once: constant-time lookups, and safe for one-shot iterators.
    excluded = set(test_keys)
    ori_data = read_json(ori_json)
    out_dict = {k: v for k, v in ori_data.items() if k not in excluded}
    write_json(out_json, out_dict)
|
|
|
|
|
|
# NOTE(review): the calls below run at import time with machine-specific
# absolute paths; consider moving them under ``if __name__ == "__main__":``.

# Step 1: gather the keys of every ``test_*`` entry under the EXCESS folder.
test_keys = get_leak_test(
    data_dir="/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/EXCESS"
)

# Step 2 (currently disabled): rebuild the image/label test folder without
# the collected keys.
# create_new_test(
#     ori_dir="/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/test_ss",
#     out_dir="/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/test_ss_rm_leak",
#     test_keys=test_keys
# )

# Step 3: write the end-to-end test JSON with the collected keys removed.
create_new_e2e_test(
    test_keys=test_keys,
    ori_json="/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/test_ss_e2e.json",
    out_json="/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/test_ss_e2e_rm_leak.json",
)
|
|
|