"""Rename annotation labels in tab-separated ``.txt`` files or in JSON files.

Command-line usage walks ``--in_dir`` recursively, rewrites the label (last
tab-separated column) of every ``*.txt`` file according to the mapping chosen
by ``--doc_type`` and writes the result under ``--out_dir`` with the same
relative directory layout.

``rename_label_in_json`` offers the same renaming for a single JSON file and
is meant to be called from Python code.
"""
import argparse
import json
from pathlib import Path

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is cosmetic only; fall back to plain iteration so the
    # script still works when tqdm is not installed.
    def tqdm(iterable, *args, **kwargs):
        return iterable


# Label mapping for invoices (old label -> new label).
# NOTE: both rename functions lower-case the final label, so the
# capitalisation of the values below does not survive in the output.
INVOICE_MAPPING = {
    'no_key': 'No_key',  # invoice number
    'no_value': 'No_value',
    'form_key': 'Form_key',  # invoice form (template) number
    'form_value': 'Form_value',
    'serial_key': 'Serial_key',  # invoice serial (symbol) number
    'serial_value': 'Serial_value',
    'date': 'Date_value',

    # seller info
    'seller_company_name_key': 'Seller_company_name_key',
    'seller_company_name_value': 'Seller_company_name_value',
    'seller_tax_code_key': 'Seller_tax_code_key',
    'seller_tax_code_value': 'Seller_tax_code_value',
    'seller_address_value': 'Seller_address_value',
    'seller_address_key': 'Seller_address_key',
    'seller_mobile_key': 'Seller_tel_key',
    'seller_mobile_value': 'Seller_tel_value',

    # buyer info
    'buyer_name_key': 'Buyer_personal_name_key',
    'buyer_name_value': 'Buyer_personal_name_value',
    'buyer_company_name_value': 'Buyer_company_name_value',
    'buyer_company_name_key': 'Buyer_company_name_key',
    'buyer_tax_code_key': 'Buyer_tax_code_key',
    'buyer_tax_code_value': 'Buyer_tax_code_value',
    'buyer_address_key': 'Buyer_address_key',
    'buyer_address_value': 'Buyer_address_value',
    'buyer_mobile_key': 'Buyer_tel_key',
    'buyer_mobile_value': 'Buyer_tel_value',

    # money info
    'VAT_amount_key': 'Tax_amount_key',
    'VAT_amount_value': 'Tax_amount_value',
    'total_key': 'Total_key',
    'total_value': 'Total_value',
    'total_in_words_key': 'Total_in_words_key',
    'total_in_words_value': 'Total_in_words_value',

    'other': 'Other',
}

# Label mapping for receipts (old label -> new label).
RECEIPT_MAPPING = {
    "Store_name_value": "seller_company_name_value",
    "Seller_company_name_value": "seller_company_name_value",
    "id": "no_value",
    "No_value": "no_value",
    "Date_value": "date_value",
    "Total_key": "total_key",
    "Total_value": "total_value",
    "Others": "other",
    "others": "other",
    "Other": "other",
}


def write_txt(txt, data, mode="w"):
    """Write every item of ``data`` to ``txt`` as one UTF-8 line.

    Args:
        txt: Path of the file to write.
        data: Iterable of strings; a newline is appended to each one.
        mode: ``open`` mode, ``"w"`` (overwrite) by default.
    """
    with open(txt, mode, encoding="utf8") as f:
        f.writelines(f"{line}\n" for line in data)


def read_txt(txt):
    """Return the lines of the UTF-8 file ``txt``.

    Leading and trailing whitespace is stripped from every line.
    """
    with open(txt, "r", encoding="utf8") as f:
        return [line.strip() for line in f]


def edit_file(in_txt, out_txt, mapping):
    """Rewrite the label column of ``in_txt`` and save the result to ``out_txt``.

    Each line is split on tabs and its last field is treated as the label.
    A label found in ``mapping`` is replaced by the mapped value; every label,
    mapped or not, is lower-cased before being written.

    Two diagnostics are printed to stdout:
      * ``Not exists label`` when at least one label is missing from ``mapping``.
      * ``Not edit`` when no label of the file was remapped at all.

    Args:
        in_txt: Path of the tab-separated input file.
        out_txt: Path of the output file (overwritten).
        mapping: Dict mapping old label names to new label names.
    """
    new_items = []
    has_unknown_label = False
    edited = False

    for item in read_txt(in_txt):
        fields = item.split("\t")
        label = fields[-1]
        if label in mapping:
            label = mapping[label]
            edited = True
        else:
            has_unknown_label = True
        # The output label is always lower-case, whatever the mapping says.
        fields[-1] = label.lower()
        new_items.append("\t".join(fields))

    if has_unknown_label:
        print("Not exists label: ", in_txt)
    # Report the file only when nothing was remapped (the original condition
    # was inverted and reported every file that *had* been edited).
    if not edited:
        print("Not edit: ", in_txt)

    write_txt(out_txt, new_items)


def rename_labels(data_dir, out_dir, doc_type):
    """Rename the labels of every ``*.txt`` file found under ``data_dir``.

    The directory layout below ``data_dir`` is reproduced below ``out_dir``.

    Args:
        data_dir: Root directory that is searched recursively.
        out_dir: Root directory for the rewritten files; created if missing.
        doc_type: ``"receipt"`` or ``"invoice"``; selects the label mapping.

    Raises:
        NotImplementedError: If ``doc_type`` is neither of the two values.
    """
    data_dir = Path(data_dir)
    out_dir = Path(out_dir)

    if doc_type == "receipt":
        mapping = RECEIPT_MAPPING
    elif doc_type == 'invoice':
        mapping = INVOICE_MAPPING
    else:
        raise NotImplementedError()

    out_dir.mkdir(parents=True, exist_ok=True)

    # Snapshot the file list before writing anything: a lazy rglob would also
    # yield files created during this run when out_dir lies inside data_dir.
    txt_paths = sorted(data_dir.rglob("*.txt"))
    for txt_path in tqdm(txt_paths):
        # data_dir/b/c/x.txt -> b/c ; relative_to() strips only the leading
        # data_dir prefix and is independent of the path separator.
        out_sub_dir = out_dir / txt_path.relative_to(data_dir).parent
        out_sub_dir.mkdir(parents=True, exist_ok=True)
        out_txt = out_sub_dir / txt_path.name
        edit_file(str(txt_path), out_txt=out_txt, mapping=mapping)


def write_json(json_path, data):
    """Dump ``data`` to ``json_path`` as UTF-8 JSON with sorted keys.

    Non-ASCII characters are written verbatim (``ensure_ascii=False``).
    """
    with open(json_path, "w", encoding="utf8") as f:
        json.dump(data, f, ensure_ascii=False, sort_keys=True)


def read_json(json_path):
    """Load and return the content of the UTF-8 JSON file ``json_path``."""
    with open(json_path, "r", encoding="utf8") as f:
        return json.load(f)


def rename_label_in_json(json_in, json_out, doc_type):
    """Rename the field keys of a two-level JSON mapping.

    ``json_in`` must contain a dict of dicts. The keys of every inner dict are
    replaced through the selected mapping (unknown keys are kept) and then
    lower-cased; values are copied unchanged. If two keys end up with the same
    new name, the one iterated last wins.

    Args:
        json_in: Path of the input JSON file.
        json_out: Path of the output JSON file (overwritten).
        doc_type: ``"invoice"`` selects ``INVOICE_MAPPING``; any other value
            selects ``RECEIPT_MAPPING``.
    """
    mapping = INVOICE_MAPPING if doc_type == "invoice" else RECEIPT_MAPPING

    new_data = {}
    for img_key, field_item in read_json(json_in).items():
        new_data[img_key] = {
            mapping.get(field_key, field_key).lower(): field_value
            for field_key, field_value in field_item.items()
        }

    write_json(json_out, new_data)


def main():
    """Parse the command line and rename the labels of a dataset directory."""
    parser = argparse.ArgumentParser(prog="Rename labels")
    parser.add_argument("--in_dir", type=str, required=True, help="dataset directory")
    # out_dir is mandatory: rename_labels cannot build a Path from None.
    parser.add_argument("--out_dir", type=str, required=True, help="output")
    parser.add_argument("--doc_type", type=str, required=True, help="document type: receipt / invoice")
    args = parser.parse_args()

    rename_labels(
        data_dir=args.in_dir,
        out_dir=args.out_dir,
        doc_type=args.doc_type
    )


if __name__ == "__main__":
    main()