"""Helpers for spotting duplicated documents and removing leaked test samples.

The module groups annotation files by a document number read from their
contents, collects the keys of files whose name starts with ``test_``, and
writes filtered copies of a test set (image/label pairs or an end-to-end JSON
file) that exclude those keys.
"""
import json
import shutil
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, Iterable, List, Union

StrPath = Union[str, Path]

# Value returned by ``get_no`` when no usable "No_value" line is found.
_MISSING_NO = "xxxx"
# File-name prefix that marks a leaked test sample.
_TEST_PREFIX = "test_"


def write_txt(txt: StrPath, data: Iterable[str], mode: str = "w") -> None:
    """Write every string of *data* to *txt*, one per line, as UTF-8.

    Args:
        txt: Destination file path.
        data: Lines to write; a newline is appended to each one.
        mode: File mode passed to ``open`` (``"w"`` overwrites, ``"a"`` appends).
    """
    with open(txt, mode, encoding="utf8") as f:
        f.writelines(line + "\n" for line in data)


def read_txt(txt: StrPath) -> List[str]:
    """Return the lines of the UTF-8 file *txt* with surrounding whitespace stripped."""
    with open(txt, "r", encoding="utf8") as f:
        return [line.strip() for line in f]


def get_no(items: Iterable[str]) -> str:
    """Extract the document number from annotation lines.

    Only lines containing the substring ``"No_value"`` are considered; for
    those, the second-to-last tab-separated field is taken. When several lines
    match, the last one wins.

    Args:
        items: Annotation lines (tab-separated fields).

    Returns:
        The extracted field, or ``"xxxx"`` when no matching line with at least
        two tab-separated fields exists.
    """
    no = _MISSING_NO
    for item in items:
        if "No_value" not in item:
            continue
        fields = item.split("\t")
        # A matching line without a tab has no second-to-last field; skip it
        # instead of raising IndexError.
        if len(fields) < 2:
            continue
        no = fields[-2]
    return no


def write_json(json_path: StrPath, data: Any) -> None:
    """Serialize *data* to *json_path* as UTF-8 JSON with sorted keys.

    Non-ASCII characters are written as-is (``ensure_ascii=False``).
    """
    with open(json_path, "w", encoding="utf8") as f:
        json.dump(data, f, ensure_ascii=False, sort_keys=True)


def read_json(json_path: StrPath) -> Any:
    """Load and return the JSON document stored in the UTF-8 file *json_path*."""
    with open(json_path, "r", encoding="utf8") as f:
        return json.load(f)


def check(txt_dir: StrPath) -> Dict[str, List[str]]:
    """Find document numbers shared by more than one annotation file.

    Every ``*.txt`` file under *txt_dir* (searched recursively) is read, its
    document number is extracted with ``get_no``, and files are grouped by that
    number. Files without a number are grouped under ``"xxxx"``.

    Args:
        txt_dir: Root directory containing the ``.txt`` annotation files.

    Returns:
        A dict mapping each number that occurs in two or more files to the
        list of corresponding paths with the suffix replaced by ``.jpg``.
        The keys of the result are also printed.
    """
    grouped = defaultdict(list)
    for txt_path in Path(txt_dir).rglob("*.txt"):
        no_doc = get_no(read_txt(str(txt_path)))
        grouped[no_doc].append(str(txt_path.with_suffix(".jpg")))

    # Keep only the numbers seen more than once (the "xxxx" bucket is kept too
    # when it has several members).
    log_dict = {no: paths for no, paths in grouped.items() if len(paths) != 1}
    print(log_dict.keys())
    return log_dict


# Example:
# print(check("/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/train/SS_Invoice"))


def get_leak_test(data_dir: StrPath) -> List[str]:
    """Collect the keys of all entries under *data_dir* named ``test_*``.

    The key is the path's stem with the leading ``test_`` removed. Only the
    prefix is stripped, so a ``test_`` occurring later in the name is kept.

    Args:
        data_dir: Directory searched recursively.

    Returns:
        The list of keys, in the order the paths were found.
    """
    test_keys = []
    for path in Path(data_dir).rglob(_TEST_PREFIX + "*"):
        stem = path.stem
        if stem.startswith(_TEST_PREFIX):
            stem = stem[len(_TEST_PREFIX):]
        test_keys.append(stem)
    return test_keys


def create_new_test(ori_dir: StrPath, out_dir: StrPath, test_keys: Iterable[str]) -> None:
    """Copy the image/label pairs of *ori_dir* that are not in *test_keys*.

    Every ``*.jpg`` under *ori_dir* (searched recursively) whose stem is not a
    leaked key is copied into *out_dir* together with the ``.txt`` file that
    sits next to it. *out_dir* is created when missing; copies are flat, i.e.
    sub-directory structure is not reproduced.

    Args:
        ori_dir: Source directory with ``.jpg`` images and ``.txt`` labels.
        out_dir: Destination directory.
        test_keys: Stems to exclude.

    Raises:
        FileNotFoundError: If an image that must be copied has no ``.txt``
            companion (raised by ``shutil.copy``).
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    # Build the lookup once so each membership test is O(1).
    leaked = set(test_keys)

    for img_path in Path(ori_dir).rglob("*.jpg"):
        if img_path.stem in leaked:
            continue
        shutil.copy(str(img_path), str(out_dir))
        shutil.copy(str(img_path.with_suffix(".txt")), str(out_dir))


def create_new_e2e_test(ori_json: StrPath, out_json: StrPath, test_keys: Iterable[str]) -> None:
    """Write a copy of the JSON mapping *ori_json* without the leaked keys.

    Args:
        ori_json: Path of the source JSON file; its top level must be an object.
        out_json: Path of the filtered JSON file to write.
        test_keys: Top-level keys to drop.
    """
    leaked = set(test_keys)
    ori_data = read_json(ori_json)
    out_dict = {k: v for k, v in ori_data.items() if k not in leaked}
    write_json(out_json, out_dict)


def main() -> None:
    """Remove the leaked samples from the end-to-end test JSON.

    Paths are hard-coded to the dataset location used by the original script.
    """
    test_keys = get_leak_test(
        "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/EXCESS"
    )
    # Image/label variant of the same clean-up (disabled):
    # create_new_test(
    #     ori_dir="/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/test_ss",
    #     out_dir="/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/test_ss_rm_leak",
    #     test_keys=test_keys
    # )
    create_new_e2e_test(
        ori_json="/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/test_ss_e2e.json",
        out_json="/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/test_ss_e2e_rm_leak.json",
        test_keys=test_keys,
    )


# Guarded so that importing the helpers does not touch the filesystem.
if __name__ == "__main__":
    main()