sbt-idp/cope2n-ai-fi/modules/sdsvkie/scripts/common/clean_json.py
2023-12-12 15:14:54 +07:00

40 lines
1.0 KiB
Python
Executable File

import argparse
from pathlib import Path
from tqdm import tqdm
from sdsvkie.utils.io_file import read_json, write_json
def clean_json(in_json, out_json, pdf_dir):
    """Keep only the JSON entries that have a matching PDF file.

    Reads the data stored at ``in_json`` (expected to be dict-like, since it
    is filtered by key), retains every entry whose key equals the stem of a
    ``*.pdf`` file found directly inside ``pdf_dir``, and writes the retained
    entries to ``out_json`` with ``sort_keys=False``.

    Args:
        in_json: Path of the JSON file to read.
        out_json: Path the filtered JSON is written to.
        pdf_dir: Directory searched (non-recursively) for ``*.pdf`` files.
    """
    records = read_json(in_json)

    # File names without the ".pdf" suffix are the keys allowed to survive.
    available_stems = {pdf_file.stem for pdf_file in Path(pdf_dir).glob("*.pdf")}

    kept = {
        key: value
        for key, value in records.items()
        if key in available_stems
    }
    write_json(out_json, kept, sort_keys=False)
def build_parser():
    """Build the command-line parser for the JSON clean-up script.

    Returns:
        An ``argparse.ArgumentParser`` exposing the ``--input``, ``--out``
        and ``--dir`` options, all of which are mandatory.
    """
    parser = argparse.ArgumentParser(
        prog="clean_json",
        description="Drop JSON entries that have no matching PDF file.",
    )
    parser.add_argument(
        "--input", type=str, required=True, help="path of the JSON file to clean"
    )
    # The output path is forwarded unconditionally to clean_json, which has no
    # fallback for a missing value, so reject the omission at parse time
    # instead of failing later with ``None`` as the output path.
    parser.add_argument(
        "--out", type=str, required=True, help="path of the cleaned JSON file to write"
    )
    parser.add_argument(
        "--dir", type=str, required=True, help="directory containing the PDF files"
    )
    return parser


def main(argv=None):
    """Parse the command line and run the clean-up.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)
    clean_json(
        in_json=args.input,
        out_json=args.out,
        pdf_dir=args.dir,
    )


if __name__ == "__main__":
    main()