"""Filter a JSON annotation file down to the entries that have a matching file.

The script lists the entries of each directory ``ROOT_DIR/<dirname>`` and keeps
only the top-level keys of the input JSON whose name equals the stem (file name
without its last suffix) of one of those entries.
"""
import json
import os
from pathlib import Path
from typing import Any, Dict, Iterable, Union

PathLike = Union[str, Path]


def read_json(json_path: PathLike) -> Any:
    """Load and return the JSON document stored at *json_path*.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the locale of the machine running the script.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def write_json(json_path: PathLike, data: Any) -> None:
    """Serialize *data* as JSON into *json_path*, overwriting any existing file.

    ``ensure_ascii=False`` emits non-ASCII characters verbatim, so the file is
    opened as UTF-8 explicitly; relying on the locale default could raise
    ``UnicodeEncodeError`` or produce a file that ``read_json`` cannot decode.
    """
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False)


def clean_json(json_in: PathLike, json_out: PathLike,
               valid_names: Iterable[str]) -> None:
    """Copy *json_in* to *json_out*, keeping only keys listed in *valid_names*.

    Args:
        json_in: Path of a JSON file whose top level is an object.
        json_out: Path the filtered object is written to.
        valid_names: Names of the top-level keys to keep. Names that do not
            occur in the input are ignored.
    """
    # Build the lookup set once: membership tests on a list are O(n) per key.
    allowed = set(valid_names)
    data: Dict[str, Any] = read_json(json_in)
    out_data = {
        name_key: items
        for name_key, items in data.items()
        if name_key in allowed
    }
    write_json(json_out, out_data)


# DIRNAMES = ['SL_HCM', 'SL_HN_batch_1', 'SL_HN_batch_2', 'Invoices_SAVINA']
# ROOT_DIR = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/raw/PDF/multi_page"
DIRNAMES = ['test_sbt_v2']
ROOT_DIR = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed"


def main() -> None:
    """Filter the validation JSON against the files found in each directory."""
    json_path = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed/test_end2end/sbt_validation_e2e.json"
    json_out_path = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed/test_end2end/sbt_validation_e2e_v2.json"

    # NOTE(review): every iteration writes to the same json_out_path, so with
    # more than one entry in DIRNAMES only the last directory's filter
    # survives -- confirm this is intended before adding directories.
    for dirname in DIRNAMES:
        # The stem of every directory entry is treated as a valid key name.
        valid_names = [p.stem for p in (Path(ROOT_DIR) / dirname).glob("*")]
        print(valid_names)
        clean_json(json_path, json_out_path, valid_names)


if __name__ == "__main__":
    main()


# def combine_json(json_paths, json_out):
#     datas = [read_json(json_path) for json_path in json_paths]
#     out_data = {}
#     for data in datas:
#         out_data.update(data)
#     write_json(json_out, out_data)

# json_paths = [Path(ROOT_DIR) / (dirname + "_out.json") for dirname in DIRNAMES]
# json_out = ROOT_DIR + "/test_e2e_multi_pages.json"
# combine_json(json_paths, json_out)