sbt-idp/cope2n-ai-fi/modules/sdsvkie/scripts/common/clean_multipage_data.py

50 lines
1.6 KiB
Python
Raw Normal View History

2023-12-12 08:14:54 +00:00
import os
import json
from pathlib import Path
def read_json(json_path):
    """Load a JSON file and return its deserialized content.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file's content.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, so non-ASCII text is read the same way on every machine.
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)
def write_json(json_path, data):
    """Serialize ``data`` as JSON into ``json_path``, overwriting the file.

    Args:
        json_path: Destination path of the JSON file.
        data: JSON-serializable object to write.
    """
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file must
    # be opened with an explicit UTF-8 encoding; the locale default may be
    # unable to encode them.
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False)
def clean_json(json_in, json_out, valid_names):
    """Copy only the whitelisted top-level entries of a JSON file.

    Reads the JSON object stored at ``json_in``, keeps the key/value pairs
    whose key appears in ``valid_names`` and writes the result to
    ``json_out``.

    Args:
        json_in: Path of the source JSON file (a top-level JSON object).
        json_out: Path of the filtered JSON file to write.
        valid_names: Iterable of keys that should be kept.
    """
    # Build the lookup set once so each membership test is O(1) instead of
    # scanning the whole list of names for every key in the input.
    allowed = set(valid_names)
    data = read_json(json_in)
    out_data = {
        name_key: items
        for name_key, items in data.items()
        if name_key in allowed
    }
    write_json(json_out, out_data)
# Earlier dataset configuration, kept for reference:
# DIRNAMES = ['SL_HCM', 'SL_HN_batch_1', 'SL_HN_batch_2', 'Invoices_SAVINA']
# ROOT_DIR = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/raw/PDF/multi_page"
DIRNAMES = ['test_sbt_v2']
ROOT_DIR = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed"

# The input and output files do not depend on the directory being scanned,
# so they are defined once instead of on every loop iteration.
json_path = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed/test_end2end/sbt_validation_e2e.json"
json_out_path = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed/test_end2end/sbt_validation_e2e_v2.json"

# Gather the entry names (file stems) from every directory first. Filtering
# once per directory would overwrite the same output file each time, leaving
# only the names of the last directory in the result.
valid_names = []
for dirname in DIRNAMES:
    dir_names = [p.stem for p in (Path(ROOT_DIR) / dirname).glob("*")]
    print(dir_names)
    valid_names.extend(dir_names)

# Nothing is read or written when no directory is configured.
if DIRNAMES:
    clean_json(json_path, json_out_path, valid_names)
# def combine_json(json_paths, json_out):
# datas = [read_json(json_path) for json_path in json_paths]
# out_data = {}
# for data in datas:
# out_data.update(data)
# write_json(json_out, out_data)
# json_paths = [Path(ROOT_DIR) / (dirname + "_out.json") for dirname in DIRNAMES]
# json_out = ROOT_DIR + "/test_e2e_multi_pages.json"
# combine_json(json_paths, json_out)