40 lines
1.0 KiB
Python
Executable File
40 lines
1.0 KiB
Python
Executable File
import argparse
|
|
from pathlib import Path
|
|
from tqdm import tqdm
|
|
from sdsvkie.utils.io_file import read_json, write_json
|
|
|
|
|
|
def clean_json(in_json, out_json, pdf_dir):
    """Write a copy of a JSON mapping restricted to keys that have a PDF.

    Loads the data stored at ``in_json``, keeps only the entries whose key
    equals the stem (file name without its extension) of a ``*.pdf`` file
    located directly inside ``pdf_dir``, and saves the result to
    ``out_json``. Surviving entries keep the order they had in the input.

    Args:
        in_json: Location of the source JSON, handed to ``read_json``.
            Presumably this loads a dict keyed by document name; verify
            against ``read_json``.
        out_json: Destination handed to ``write_json``.
        pdf_dir: Directory searched (non-recursively) for ``*.pdf`` files.
    """
    records = read_json(in_json)

    # Stems of the PDFs present on disk, e.g. "docs/invoice_01.pdf" -> "invoice_01".
    available = {pdf_file.stem for pdf_file in Path(pdf_dir).glob("*.pdf")}

    # Drop every entry that has no matching PDF.
    kept = {key: value for key, value in records.items() if key in available}

    write_json(out_json, kept, sort_keys=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: collect the three paths and run clean_json.
    # The program name is left to argparse's default (the script name); the
    # previous hard-coded "Rename labels" described a different tool.
    parser = argparse.ArgumentParser(
        description=(
            "Keep only the JSON entries whose key matches the stem of a "
            "PDF file found in --dir."
        ),
    )
    parser.add_argument(
        "--input",
        type=str,
        required=True,
        help="path to the input JSON file",
    )
    # NOTE(review): --out stays optional for backward compatibility, but when
    # it is omitted None is forwarded to write_json -- confirm that helper
    # accepts None, otherwise make this argument required.
    parser.add_argument(
        "--out",
        type=str,
        required=False,
        help="path of the filtered JSON file to write",
    )
    parser.add_argument(
        "--dir",
        type=str,
        required=True,
        help="directory containing the *.pdf files used to filter the JSON keys",
    )

    args = parser.parse_args()
    clean_json(
        in_json=args.input,
        out_json=args.out,
        pdf_dir=args.dir,
    )
|