sbt-idp/cope2n-ai-fi/modules/sdsvkie/scripts/common/rename_labels.py
2023-12-12 15:14:54 +07:00

191 lines
5.7 KiB
Python
Executable File

import argparse
from pathlib import Path
from tqdm import tqdm
import json
# Label renaming table selected when doc_type is "invoice".
# Keys are the labels looked up in the input; values are their replacements.
INVOICE_MAPPING = {
    # invoice header
    "no_key": "No_key",  # invoice number
    "no_value": "No_value",
    "form_key": "Form_key",  # invoice form (template) number
    "form_value": "Form_value",
    "serial_key": "Serial_key",  # invoice serial symbol
    "serial_value": "Serial_value",
    "date": "Date_value",

    # seller info
    "seller_company_name_key": "Seller_company_name_key",
    "seller_company_name_value": "Seller_company_name_value",
    "seller_tax_code_key": "Seller_tax_code_key",
    "seller_tax_code_value": "Seller_tax_code_value",
    "seller_address_value": "Seller_address_value",
    "seller_address_key": "Seller_address_key",
    "seller_mobile_key": "Seller_tel_key",
    "seller_mobile_value": "Seller_tel_value",

    # buyer info
    "buyer_name_key": "Buyer_personal_name_key",
    "buyer_name_value": "Buyer_personal_name_value",
    "buyer_company_name_value": "Buyer_company_name_value",
    "buyer_company_name_key": "Buyer_company_name_key",
    "buyer_tax_code_key": "Buyer_tax_code_key",
    "buyer_tax_code_value": "Buyer_tax_code_value",
    "buyer_address_key": "Buyer_address_key",
    "buyer_address_value": "Buyer_address_value",
    "buyer_mobile_key": "Buyer_tel_key",
    "buyer_mobile_value": "Buyer_tel_value",

    # money info
    "VAT_amount_key": "Tax_amount_key",
    "VAT_amount_value": "Tax_amount_value",
    "total_key": "Total_key",
    "total_value": "Total_value",
    "total_in_words_key": "Total_in_words_key",
    "total_in_words_value": "Total_in_words_value",

    # everything else
    "other": "Other",
}
# Label renaming table selected when doc_type is "receipt".
# Several input spellings are folded onto one lowercase replacement label.
RECEIPT_MAPPING = {
    # store / seller name
    "Store_name_value": "seller_company_name_value",
    "Seller_company_name_value": "seller_company_name_value",

    # receipt number
    "id": "no_value",
    "No_value": "no_value",

    # date and total
    "Date_value": "date_value",
    "Total_key": "total_key",
    "Total_value": "total_value",

    # spelling variants of the catch-all label
    "Others": "other",
    "others": "other",
    "Other": "other",
}
def write_txt(txt, data, mode="w"):
    """Write every string of ``data`` to ``txt``, one per line.

    Args:
        txt: Path of the file to open.
        data: Iterable of strings; a newline is appended to each one.
        mode: Mode handed to ``open`` (``"w"`` by default).
    """
    with open(txt, mode, encoding="utf8") as handle:
        handle.writelines(row + "\n" for row in data)
def read_txt(txt):
    """Read a UTF-8 text file into a list of whitespace-stripped lines.

    Args:
        txt: Path of the file to read.

    Returns:
        One string per line of the file, each with leading and trailing
        whitespace removed (empty lines become empty strings).
    """
    rows = []
    with open(txt, "r", encoding="utf8") as handle:
        for raw_line in handle:
            rows.append(raw_line.strip())
    return rows
def edit_file(in_txt, out_txt, mapping):
    """Rename the label of every line of ``in_txt`` and write to ``out_txt``.

    Each line is split on tabs and its last field is treated as the label.
    A label found in ``mapping`` is replaced by its mapped value; any other
    label is kept. In both cases the resulting label is lowercased. Blank
    lines are copied through unchanged.

    Two diagnostics are printed per file:
      * ``Not exists label`` when at least one label is missing from
        ``mapping``;
      * ``Not edit`` when no label at all was found in ``mapping``.

    Args:
        in_txt: Path of the tab-separated input file.
        out_txt: Path of the file to write.
        mapping: Dict from input label to replacement label.
    """
    new_items = []
    has_unknown_label = False
    edited = False

    for item in read_txt(in_txt):
        if not item:
            # A blank line has no label; keep it without reporting it as an
            # unknown label.
            new_items.append(item)
            continue

        fields = item.split("\t")
        label = fields[-1]
        if label in mapping:
            label = mapping[label]
            edited = True
        else:
            has_unknown_label = True
        fields[-1] = label.lower()
        new_items.append("\t".join(fields))

    if has_unknown_label:
        print("Not exists label: ", in_txt)
    # The original condition was inverted: it reported "Not edit" exactly
    # when a label HAD been replaced.
    if not edited:
        print("Not edit: ", in_txt)

    write_txt(out_txt, new_items)
def rename_labels(data_dir, out_dir, doc_type):
    """Rename the labels of every ``*.txt`` file found under ``data_dir``.

    The directory layout below ``data_dir`` is reproduced below ``out_dir``
    and each file is converted with ``edit_file``.

    Args:
        data_dir: Root directory searched recursively for ``*.txt`` files.
        out_dir: Root directory of the converted files; created if missing.
        doc_type: ``"receipt"`` (uses ``RECEIPT_MAPPING``) or ``"invoice"``
            (uses ``INVOICE_MAPPING``).

    Raises:
        NotImplementedError: If ``doc_type`` is neither of the two values.
    """
    data_dir = Path(data_dir)
    out_dir = Path(out_dir)

    if doc_type == "receipt":
        mapping = RECEIPT_MAPPING
    elif doc_type == "invoice":
        mapping = INVOICE_MAPPING
    else:
        raise NotImplementedError(f"Unsupported doc_type: {doc_type!r}")

    out_dir.mkdir(parents=True, exist_ok=True)

    # Snapshot the matches before writing anything: rglob() is lazy, so when
    # out_dir lives inside data_dir freshly written files could otherwise be
    # picked up again. Directories that merely end in ".txt" are skipped.
    txt_paths = sorted(p for p in data_dir.rglob("*.txt") if p.is_file())

    for txt_path in tqdm(txt_paths):
        # relative_to() removes only the leading data_dir prefix. The former
        # str.replace() removed every occurrence of that string in the path
        # and assumed "/" as separator.
        out_sub_dir = out_dir / txt_path.relative_to(data_dir).parent
        out_sub_dir.mkdir(parents=True, exist_ok=True)
        out_txt = out_sub_dir / txt_path.name
        edit_file(str(txt_path), out_txt=out_txt, mapping=mapping)
def write_json(json_path, data):
    """Serialize ``data`` as JSON into ``json_path``.

    Keys are written in sorted order and non-ASCII characters are kept as
    they are instead of being escaped.

    Args:
        json_path: Path of the file to write (UTF-8).
        data: JSON-serializable object.
    """
    dump_options = {"ensure_ascii": False, "sort_keys": True}
    with open(json_path, "w", encoding="utf8") as handle:
        json.dump(data, handle, **dump_options)
def read_json(json_path):
    """Load and return the JSON document stored in ``json_path``.

    Args:
        json_path: Path of a UTF-8 encoded JSON file.

    Returns:
        The decoded Python object.
    """
    with open(json_path, "r", encoding="utf8") as handle:
        return json.load(handle)
def rename_label_in_json(json_in, json_out, doc_type):
    """Rename the field keys of a JSON label file.

    The input is read as a dict of dicts: for each top-level entry, every
    inner key found in the selected mapping is replaced by its mapped value,
    other keys are kept, and the resulting key is lowercased. Values are
    copied as they are.

    Args:
        json_in: Path of the JSON file to read.
        json_out: Path of the JSON file to write.
        doc_type: ``"invoice"`` selects ``INVOICE_MAPPING``; any other value
            selects ``RECEIPT_MAPPING``.
    """
    mapping = INVOICE_MAPPING if doc_type == "invoice" else RECEIPT_MAPPING

    renamed = {
        img_key: {
            mapping.get(field_key, field_key).lower(): field_value
            for field_key, field_value in field_item.items()
        }
        for img_key, field_item in read_json(json_in).items()
    }

    write_json(json_out, renamed)
if __name__ == "__main__":
    # Command-line entry point: rename the labels of every *.txt file under
    # --in_dir and write the results under --out_dir.
    parser = argparse.ArgumentParser(prog="Rename labels")
    parser.add_argument("--in_dir", type=str, required=True, help="dataset directory")
    # Required: rename_labels() builds Path(out_dir) unconditionally, so a
    # missing value used to crash with a TypeError instead of a usage error.
    parser.add_argument("--out_dir", type=str, required=True, help="output")
    # Restricting the choices reports an unsupported type at parse time
    # rather than as a NotImplementedError raised by rename_labels().
    parser.add_argument(
        "--doc_type",
        type=str,
        required=True,
        choices=["receipt", "invoice"],
        help="document type: receipt / invoice",
    )
    args = parser.parse_args()

    rename_labels(
        data_dir=args.in_dir,
        out_dir=args.out_dir,
        doc_type=args.doc_type,
    )
    # NOTE: to convert a JSON label file instead, call
    # rename_label_in_json(json_in=args.in_dir, json_out=args.out_dir,
    #                      doc_type=args.doc_type)