sbt-idp/cope2n-ai-fi/modules/sdsvkie/scripts/common/check_duplicate_vat.py

118 lines
3.3 KiB
Python
Raw Normal View History

2023-12-12 08:14:54 +00:00
from pathlib import Path
import shutil
import json
def write_txt(txt, data, mode="w"):
    """Write every string in ``data`` to ``txt``, one string per line.

    Args:
        txt: Path of the text file to open.
        data: Iterable of strings; a newline is appended to each one.
        mode: File mode handed to ``open`` (defaults to ``"w"``).
    """
    with open(txt, mode, encoding="utf8") as handle:
        handle.writelines(entry + "\n" for entry in data)
def read_txt(txt):
    """Read ``txt`` and return its lines with surrounding whitespace removed.

    Args:
        txt: Path of the UTF-8 text file to read.

    Returns:
        A list with one stripped string per line of the file (blank lines
        become empty strings).
    """
    with open(txt, "r", encoding="utf8") as handle:
        return [raw.strip() for raw in handle]
def get_no(items):
    """Extract the value attached to the ``No_value`` marker in ``items``.

    Every entry containing the substring ``"No_value"`` is split on tabs and
    its second-to-last field is taken; when several entries match, the last
    one wins.

    Args:
        items: Iterable of tab-separated strings.

    Returns:
        The second-to-last field of the last matching entry, or the
        placeholder ``"xxxx"`` when no entry matches.

    Raises:
        IndexError: If a matching entry has fewer than two tab-separated
            fields.
    """
    candidates = [
        entry.split("\t")[-2] for entry in items if "No_value" in entry
    ]
    return candidates[-1] if candidates else "xxxx"
def write_json(json_path, data):
    """Serialize ``data`` as JSON into the file ``json_path``.

    Non-ASCII characters are written as-is (UTF-8) and object keys are
    emitted in sorted order.

    Args:
        json_path: Path of the file to (over)write.
        data: JSON-serializable object.
    """
    dump_options = {"ensure_ascii": False, "sort_keys": True}
    with open(json_path, "w", encoding="utf8") as sink:
        json.dump(data, sink, **dump_options)
def read_json(json_path):
    """Load and return the JSON document stored at ``json_path``.

    Args:
        json_path: Path of the UTF-8 encoded JSON file.

    Returns:
        The decoded Python object.
    """
    with open(json_path, "r", encoding="utf8") as source:
        return json.load(source)
def check(txt_dir):
    """Find ``.txt`` files under ``txt_dir`` that share the same number.

    Each ``.txt`` file found recursively is read with ``read_txt`` and keyed
    by the value ``get_no`` returns for it. Files for which ``get_no`` finds
    nothing all share the placeholder key ``"xxxx"`` and are therefore
    reported together when there are two or more of them.

    Args:
        txt_dir: Root directory that is searched recursively.

    Returns:
        A dict mapping each number that occurs in more than one file to the
        list of matching paths, with the ``.txt`` suffix swapped for
        ``.jpg``. The dict's keys are also printed.
    """
    grouped = {}
    for txt_path in Path(txt_dir).rglob("*.txt"):
        no_doc = get_no(read_txt(str(txt_path)))
        grouped.setdefault(no_doc, []).append(
            str(txt_path.with_suffix(".jpg"))
        )

    # Drop every number that belongs to exactly one file; what is left are
    # the duplicates.
    log_dict = {
        no: paths for no, paths in grouped.items() if len(paths) != 1
    }
    print(log_dict.keys())
    return log_dict
# print(check("/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/train/SS_Invoice"))
def get_leak_test(data_dir):
    """Collect the keys of all ``test_*`` entries found under ``data_dir``.

    The directory is searched recursively for paths whose name starts with
    ``test_`` (files and directories alike). For each match the key is the
    path's stem with that leading marker removed.

    Args:
        data_dir: Root directory that is searched recursively.

    Returns:
        A list of keys, in the order the file system yields the paths.
    """
    prefix = "test_"
    test_keys = []
    for path in Path(data_dir).rglob(prefix + "*"):
        img_name = path.stem
        # Strip only the leading marker. ``str.replace`` would also delete
        # any later "test_" inside the name and corrupt the key.
        if img_name.startswith(prefix):
            img_name = img_name[len(prefix):]
        test_keys.append(img_name)
    return test_keys
def create_new_test(ori_dir, out_dir, test_keys):
    """Copy image/label pairs from ``ori_dir`` to ``out_dir``, skipping keys.

    Every ``.jpg`` found recursively under ``ori_dir`` whose stem is not in
    ``test_keys`` is copied, together with the ``.txt`` file of the same
    name next to it, directly into ``out_dir`` (the source sub-directory
    layout is not reproduced).

    Args:
        ori_dir: Root directory of the original image/label files.
        out_dir: Destination directory; created with parents if missing.
        test_keys: Iterable of image stems to leave out.

    Raises:
        FileNotFoundError: If a kept image has no sibling ``.txt`` file
            (the image itself has already been copied at that point).
    """
    ori_dir = Path(ori_dir)
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Build the lookup once: O(1) membership per image, and a one-shot
    # iterator passed as ``test_keys`` is not exhausted by the first test.
    excluded = frozenset(test_keys)

    for img_path in ori_dir.rglob("*.jpg"):
        if img_path.stem in excluded:
            continue
        txt_path = img_path.with_suffix(".txt")
        shutil.copy(str(img_path), str(out_dir))
        shutil.copy(str(txt_path), str(out_dir))
def create_new_e2e_test(ori_json, out_json, test_keys):
    """Write a copy of the JSON mapping in ``ori_json`` without given keys.

    The document at ``ori_json`` is loaded with ``read_json``; every
    top-level entry whose key is in ``test_keys`` is dropped and the rest is
    written to ``out_json`` with ``write_json``.

    Args:
        ori_json: Path of the source JSON file (top level must be an object).
        out_json: Path of the filtered JSON file to write.
        test_keys: Iterable of top-level keys to leave out.
    """
    # Build the lookup once: O(1) membership per entry, and a one-shot
    # iterator passed as ``test_keys`` is not exhausted by the first test.
    excluded = frozenset(test_keys)

    ori_data = read_json(ori_json)
    out_dict = {k: v for k, v in ori_data.items() if k not in excluded}
    write_json(out_json, out_dict)
# Script section: the statements below run whenever this module is executed
# or imported, using hard-coded local paths.
# NOTE(review): consider an `if __name__ == "__main__":` guard if this module
# is ever imported for its helpers — confirm no caller relies on the current
# import-time behavior first.

# Collect the keys of every "test_*" entry found under the EXCESS directory.
test_keys = get_leak_test("/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/EXCESS")
# Disabled step: copy the image/label test set into a new directory while
# skipping the collected keys.
# create_new_test(
# ori_dir="/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/test_ss",
# out_dir="/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/test_ss_rm_leak",
# test_keys=test_keys
# )
# Write a copy of the end-to-end test JSON without the entries whose key is
# in test_keys.
create_new_e2e_test(
ori_json="/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/test_ss_e2e.json",
out_json="/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/test_ss_e2e_rm_leak.json",
test_keys=test_keys
)