"""Filter a label JSON so it only keeps entries that have a matching PDF file."""
import argparse
from pathlib import Path

from tqdm import tqdm

from sdsvkie.utils.io_file import read_json, write_json


def _default_out_path(in_json):
    """Return ``<stem>_clean<suffix>`` located next to ``in_json``."""
    src = Path(in_json)
    return str(src.with_name(src.stem + "_clean" + src.suffix))


def clean_json(in_json, out_json, pdf_dir):
    """Write a copy of ``in_json`` restricted to keys that have a PDF in ``pdf_dir``.

    A top-level key of the input JSON is kept only when ``pdf_dir`` directly
    contains a file whose stem (file name without the ``.pdf`` extension)
    equals that key. Kept entries retain their original order and values.

    Args:
        in_json: Path of the JSON file to filter; loaded with ``read_json``
            and expected to hold a mapping at the top level.
        out_json: Destination path handed to ``write_json``.
        pdf_dir: Directory searched (non-recursively) for ``*.pdf`` files.

    Returns:
        The filtered mapping that was written to ``out_json``.
    """
    data_src = read_json(in_json)

    # Build the lookup set once so each membership test below is O(1).
    pdf_keys = {pdf_path.stem for pdf_path in Path(pdf_dir).glob("*.pdf")}

    data_tgt = {key: value for key, value in data_src.items() if key in pdf_keys}

    # sort_keys=False keeps the entries in the same order as the input file.
    write_json(out_json, data_tgt, sort_keys=False)
    return data_tgt


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="Clean labels",
        description="Drop label entries that have no matching PDF file.",
    )
    parser.add_argument(
        "--input", type=str, required=True, help="path to the input label JSON file"
    )
    parser.add_argument(
        "--out",
        type=str,
        required=False,
        help="path to the output JSON file (default: <input>_clean.json next to --input)",
    )
    parser.add_argument(
        "--dir", type=str, required=True, help="directory containing the PDF files"
    )
    args = parser.parse_args()

    # --out is optional; without a fallback, None would be passed to
    # write_json as the destination path.
    out_path = args.out if args.out is not None else _default_out_path(args.input)

    clean_json(in_json=args.input, out_json=out_path, pdf_dir=args.dir)