191 lines
5.7 KiB
Python
Executable File
191 lines
5.7 KiB
Python
Executable File
import argparse
|
|
from pathlib import Path
|
|
from tqdm import tqdm
|
|
import json
|
|
|
|
# Label renaming table used for invoice documents.  Keys are the labels looked
# up in the input; values are the names they are replaced with (callers in this
# file lower-case the result afterwards).
INVOICE_MAPPING = {
    # invoice header
    "no_key": "No_key",  # invoice number
    "no_value": "No_value",
    "form_key": "Form_key",  # invoice form number
    "form_value": "Form_value",
    "serial_key": "Serial_key",  # invoice serial number
    "serial_value": "Serial_value",
    "date": "Date_value",

    # seller info
    "seller_company_name_key": "Seller_company_name_key",
    "seller_company_name_value": "Seller_company_name_value",
    "seller_tax_code_key": "Seller_tax_code_key",
    "seller_tax_code_value": "Seller_tax_code_value",
    "seller_address_value": "Seller_address_value",
    "seller_address_key": "Seller_address_key",
    "seller_mobile_key": "Seller_tel_key",
    "seller_mobile_value": "Seller_tel_value",

    # buyer info
    "buyer_name_key": "Buyer_personal_name_key",
    "buyer_name_value": "Buyer_personal_name_value",
    "buyer_company_name_value": "Buyer_company_name_value",
    "buyer_company_name_key": "Buyer_company_name_key",
    "buyer_tax_code_key": "Buyer_tax_code_key",
    "buyer_tax_code_value": "Buyer_tax_code_value",
    "buyer_address_key": "Buyer_address_key",
    "buyer_address_value": "Buyer_address_value",
    "buyer_mobile_key": "Buyer_tel_key",
    "buyer_mobile_value": "Buyer_tel_value",

    # money info
    "VAT_amount_key": "Tax_amount_key",
    "VAT_amount_value": "Tax_amount_value",
    "total_key": "Total_key",
    "total_value": "Total_value",
    "total_in_words_key": "Total_in_words_key",
    "total_in_words_value": "Total_in_words_value",

    # catch-all
    "other": "Other",
}
|
|
|
|
# Label renaming table used for receipt documents.  Several source spellings
# collapse onto the same target label.
RECEIPT_MAPPING = {
    # seller / store name
    "Store_name_value": "seller_company_name_value",
    "Seller_company_name_value": "seller_company_name_value",

    # receipt number
    "id": "no_value",
    "No_value": "no_value",

    # date and totals
    "Date_value": "date_value",
    "Total_key": "total_key",
    "Total_value": "total_value",

    # catch-all spellings
    "Others": "other",
    "others": "other",
    "Other": "other",
}
|
|
|
|
def write_txt(txt, data, mode="w"):
    """Write each string in ``data`` to ``txt`` as its own line.

    Args:
        txt: Path of the file to write.
        data: Iterable of strings; a newline is appended to every entry.
        mode: File mode passed to ``open`` (``"w"`` overwrites, ``"a"`` appends).
    """
    with open(txt, mode, encoding="utf8") as handle:
        handle.writelines(entry + "\n" for entry in data)
|
|
|
|
|
|
def read_txt(txt):
    """Return the lines of ``txt`` as a list of stripped strings.

    ``str.strip`` is applied to every line, so leading and trailing whitespace
    (including tabs, not just the newline) is removed.  Blank lines are kept
    as empty strings.
    """
    with open(txt, "r", encoding="utf8") as handle:
        return list(map(str.strip, handle))
|
|
|
|
|
|
|
|
def edit_file(in_txt, out_txt, mapping):
    """Rewrite the label column of a tab-separated annotation file.

    Every line of ``in_txt`` is split on tabs and its last field is treated as
    the label.  A label present in ``mapping`` is replaced by the mapped name;
    every label, mapped or not, is lower-cased before the line is written to
    ``out_txt``.

    Two diagnostics are printed for the file:

    * ``Not exists label`` when at least one label was not found in ``mapping``.
    * ``Not edit`` when no label at all was found in ``mapping``.

    Args:
        in_txt: Path of the annotation file to read.
        out_txt: Path of the file to write the relabelled lines to.
        mapping: Dict mapping source labels to their replacement.
    """
    new_items = []
    has_unknown_label = False
    edited = False

    for item in read_txt(in_txt):
        fields = item.split("\t")
        label = fields[-1]
        if label in mapping:
            label = mapping[label]
            edited = True
        else:
            has_unknown_label = True
        # Labels are normalised to lower case whether or not they were mapped.
        fields[-1] = label.lower()
        new_items.append("\t".join(fields))

    if has_unknown_label:
        print("Not exists label: ", in_txt)

    # Report only files in which nothing was remapped.
    if not edited:
        print("Not edit: ", in_txt)

    write_txt(out_txt, new_items)
|
|
|
|
def rename_labels(data_dir, out_dir, doc_type):
    """Relabel every ``*.txt`` file under ``data_dir`` into ``out_dir``.

    The directory layout below ``data_dir`` is reproduced below ``out_dir``;
    each file is processed with ``edit_file`` using the mapping selected by
    ``doc_type``.

    Args:
        data_dir: Root directory searched recursively for ``*.txt`` files.
        out_dir: Root directory for the rewritten files (created if missing).
        doc_type: ``"receipt"`` or ``"invoice"``.

    Raises:
        NotImplementedError: If ``doc_type`` is neither of the supported values.
    """
    data_dir = Path(data_dir)
    out_dir = Path(out_dir)

    # Validate before touching the filesystem so a bad doc_type has no side effects.
    if doc_type == "receipt":
        mapping = RECEIPT_MAPPING
    elif doc_type == "invoice":
        mapping = INVOICE_MAPPING
    else:
        raise NotImplementedError(f"Unsupported doc_type: {doc_type!r}")

    out_dir.mkdir(parents=True, exist_ok=True)

    # Collect the inputs up front: rglob() is lazy, and if out_dir lies inside
    # data_dir the files written below could otherwise be picked up as inputs.
    txt_paths = sorted(p for p in data_dir.rglob("*.txt") if p.is_file())

    for txt_path in tqdm(txt_paths):
        # a/b/c/x.txt relative to a/b -> c/x.txt -> c.  relative_to() only
        # removes the leading data_dir prefix, unlike a string replace which
        # would also hit the same text occurring deeper in the path.
        out_sub_dir = out_dir / txt_path.relative_to(data_dir).parent
        out_sub_dir.mkdir(parents=True, exist_ok=True)

        out_txt = out_sub_dir / txt_path.name
        edit_file(str(txt_path), out_txt=out_txt, mapping=mapping)
|
|
|
|
def write_json(json_path, data):
    """Serialise ``data`` as JSON into ``json_path``.

    Keys are sorted and non-ASCII characters are written as-is (UTF-8) rather
    than escaped.
    """
    dump_options = {"ensure_ascii": False, "sort_keys": True}
    with open(json_path, "w", encoding="utf8") as stream:
        json.dump(data, stream, **dump_options)
|
|
|
|
|
|
def read_json(json_path):
    """Load and return the JSON document stored at ``json_path`` (UTF-8)."""
    with open(json_path, "r", encoding="utf8") as stream:
        return json.load(stream)
|
|
def rename_label_in_json(json_in, json_out, doc_type):
    """Rename the field keys of a JSON annotation file.

    The input is read as a dict of ``image key -> {field key: value}``.  Each
    field key found in the selected mapping is replaced by its mapped name, and
    every field key is lower-cased.  The result is written to ``json_out``.

    Args:
        json_in: Path of the JSON file to read.
        json_out: Path of the JSON file to write.
        doc_type: ``"invoice"`` selects ``INVOICE_MAPPING``; any other value
            selects ``RECEIPT_MAPPING``.
    """
    mapping = INVOICE_MAPPING if doc_type == "invoice" else RECEIPT_MAPPING

    renamed = {
        image_id: {
            mapping.get(name, name).lower(): value
            for name, value in fields.items()
        }
        for image_id, fields in read_json(json_in).items()
    }

    write_json(json_out, renamed)
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: relabel every *.txt file under --in_dir.
    parser = argparse.ArgumentParser(prog="Rename labels")
    parser.add_argument("--in_dir", type=str, required=True, help="dataset directory")
    # rename_labels() passes out_dir straight to Path(), which rejects None,
    # so the option has to be supplied.
    parser.add_argument("--out_dir", type=str, required=True, help="output")
    # Restrict to the types rename_labels() supports so a typo fails at
    # argument parsing instead of with NotImplementedError.
    parser.add_argument(
        "--doc_type",
        type=str,
        required=True,
        choices=["receipt", "invoice"],
        help="document type: receipt / invoice",
    )

    args = parser.parse_args()
    rename_labels(
        data_dir=args.in_dir,
        out_dir=args.out_dir,
        doc_type=args.doc_type,
    )

    # NOTE: rename_label_in_json(json_in, json_out, doc_type) applies the same
    # renaming to a single JSON file; it is not wired into this CLI.
|
|
|
|
|
|
"""
|
|
|
|
""" |