import ast
import csv
import json
import logging
import shutil
import xml.etree.ElementTree as ET
from datetime import datetime
from pathlib import Path

import tqdm

# from sdsvkie.utils.io_file import read_json, write_json

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def get_xml_from_csv(csv_file):
    data = {}
    with open(csv_file, 'r') as file:
        csv_reader = csv.DictReader(file)
        for row in csv_reader:
            # print(row)
            pdf_path = row['file_path']
            pdf_key = Path(pdf_path).stem
            xml_paths = ast.literal_eval(row['xml_path'])
            data[pdf_key] = xml_paths

    return data

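# Illustrative sketch (values made up, not from the original source) of the CSV
# layout get_xml_from_csv assumes: a header with 'file_path' and 'xml_path'
# columns, where 'xml_path' holds a Python-literal list parsed by ast.literal_eval:
#
#   file_path,xml_path
#   /data/invoices/inv_001.pdf,"['/data/xml/inv_001_a.xml', '/data/xml/inv_001_b.xml']"
#
# which would yield:
#   {'inv_001': ['/data/xml/inv_001_a.xml', '/data/xml/inv_001_b.xml']}
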
def get_xml_from_dirs(dir_path, pdf_keys):
    dir_path = Path(dir_path)
    xml_paths = dir_path.rglob("*.xml")
    xml_paths = [str(path) for path in xml_paths]
    xml_infos = {}

    # Every pdf_key gets the full candidate list; extract_v2 later picks the
    # best-matching XML for each PDF.
    for pdf_key in pdf_keys:
        xml_infos[pdf_key] = xml_paths
    return xml_infos

def write_json(json_path, data, sort_keys=True):
    with open(json_path, "w", encoding="utf8") as f:
        json.dump(data, f, ensure_ascii=False, sort_keys=sort_keys)

def read_json(json_path):
    with open(json_path, "r", encoding="utf8") as f:
        data = json.load(f)
    return data

def convert_date(date_str: str, ori_pattern: str = '%Y-%m-%d', tgt_pattern: str = '%d/%m/%Y'):
    date_obj = datetime.strptime(date_str, ori_pattern)

    # convert back to a string, DD/MM/YYYY by default
    new_date_str = date_obj.strftime(tgt_pattern)
    return new_date_str

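# Usage sketch for convert_date (example values are illustrative):
#   convert_date("2023-06-05")                          # -> "05/06/2023"
#   convert_date("05/06/2023", '%d/%m/%Y', '%Y-%m-%d')  # -> "2023-06-05"
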
def extract(xml_in, field_mapping):
    with open(xml_in, "r") as f:
        xml_string = f.read()
        # parse the XML string
        root = ET.fromstring(xml_string)

    # extract every mapped element (SHDon, NLap, ...) from the XML tree
    output = {}
    for key in field_mapping:
        pattern = f".//{field_mapping[key]}"
        value = root.find(pattern)
        value = "" if value is None else value.text
        if key == "date_value" and value != "":
            value = convert_date(value)

        if key in ["tax_amount_value", "total_value"] and value != "":
            value = str(int(float(value)))

        output[key] = value
    return output

field_mapping = {
    "no_value": "SHDon",
    "form_value": "KHMSHDon",
    "serial_value": "XXXXXXX",
    "date_value": "NLap",  # e.g. 2023-06-05 (YYYY-MM-DD) -> 05/06/2023
    "seller_company_name_value": "NBan/Ten",
    "seller_address_value": "NBan/DChi",
    "seller_tel_value": "XXXXXXXXX",
    "seller_tax_code_value": "NBan/MST",
    "buyer_personal_name_value": "NMua/HVTNMHang",
    "buyer_company_name_value": "NMua/Ten",
    "buyer_address_value": "NMua/DChi",
    "buyer_tax_code_value": "NMua/MST",
    "buyer_tel_value": "NMua/SDT",
    "tax_amount_value": "TThue",
    "total_value": "TgTTTBSo",
    "total_in_words_value": "TgTTTBChu"
}

## fields that need care: serial_value, seller_tel_value, buyer_tel_value (mapped to placeholder tags above)

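# Illustrative sketch (not from the original source) of the invoice XML shape
# that field_mapping assumes; element names come from the mapping above, the
# root tag and values are made up:
#
#   <HDon>
#     <SHDon>123</SHDon>
#     <KHMSHDon>1</KHMSHDon>
#     <NLap>2023-06-05</NLap>
#     <NBan><Ten>Seller Co.</Ten><DChi>...</DChi><MST>0101234567</MST></NBan>
#     <NMua><Ten>Buyer Co.</Ten><DChi>...</DChi><MST>0107654321</MST></NMua>
#     <TThue>100000.0</TThue>
#     <TgTTTBSo>1100000.0</TgTTTBSo>
#     <TgTTTBChu>one million one hundred thousand dong</TgTTTBChu>
#   </HDon>
#
# With that input, extract(...) would return e.g.
#   {"no_value": "123", "date_value": "05/06/2023", "tax_amount_value": "100000", ...}
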
def get_xml_list_info(xml_dir):
    xml_dir = Path(xml_dir)
    xml_files = xml_dir.glob("*/*.xml")
    xml_info = {}
    for xml_file in xml_files:
        pdf_key = xml_file.stem
        xml_info[pdf_key] = str(xml_file)
    return xml_info

def process(json_in, json_out, xml_dir):
    assert Path(json_in).exists()
    assert Path(xml_dir).exists()
    data_in = read_json(json_in)
    data_out = {}
    if data_in is None or not data_in:
        logger.error("empty file")
        return

    xml_info = get_xml_list_info(xml_dir)
    for pdf_key in tqdm.tqdm(data_in.keys()):
        xml_path = xml_info.get(pdf_key)
        if xml_path is None:
            continue
        output = extract(xml_path, field_mapping)
        data_out[pdf_key] = output

    write_json(json_out, data_out, sort_keys=False)

def get_xml_list_info_v2(xml_dir):
    xml_dir = Path(xml_dir)
    xml_files = xml_dir.glob("*/*.xml")

    xml_info = {}
    for xml_file in xml_files:
        pdf_key = xml_file.stem

        if pdf_key in xml_info:
            xml_info[pdf_key].append(str(xml_file))
        else:
            xml_info[pdf_key] = [str(xml_file)]

    return xml_info

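# Unlike get_xml_list_info, the v2 variant keeps every XML sharing a stem,
# e.g. {'inv_001': ['a/inv_001.xml', 'b/inv_001.xml']} (illustrative paths).
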
def extract_v2(xml_paths, preds, field_mapping, pdf_key=None):
    # Pick the XML file to use: if several candidates share this pdf_key,
    # choose the one whose key fields match the model predictions in `preds`.
    xml_path = None
    if len(xml_paths) == 1:
        xml_path = xml_paths[0]
    else:
        # find the best xml
        for xml_in in xml_paths:
            try:
                with open(xml_in, "r", encoding='utf8') as f:
                    xml_string = f.read()
                root = ET.fromstring(xml_string, parser=ET.XMLParser(encoding='iso-8859-5'))
            except Exception as err:
                print("Error exception (check) ", err, xml_in)
                continue

            key_checks = ["no_value"]
            is_exists_xml = False
            for key_check in key_checks:
                pattern = f".//{field_mapping[key_check]}"
                value = root.find(pattern)
                value = "" if value is None else value.text

                if value == preds[key_check]:
                    is_exists_xml = True
            if is_exists_xml:
                xml_path = xml_in
    if xml_path is None:
        print("No matching xml found for", pdf_key, xml_paths)
        return None, None

    try:
        with open(xml_path, "r") as f:
            xml_string = f.read()
            # parse the XML string
            root = ET.fromstring(xml_string)
    except Exception as err:
        print("Error exception: ", err, xml_path)
        return None, None
    # extract every mapped element (SHDon, NLap, ...) from the XML tree
    output = {}
    for key in field_mapping:
        pattern = f".//{field_mapping[key]}"
        value = root.find(pattern)
        value = "" if value is None else value.text
        if key == "date_value" and value != "":
            value = convert_date(value)
        if key in ["tax_amount_value", "total_value"] and value != "":
            value = str(int(float(value)))

        output[key] = value

    return output, xml_path

def process_v2(json_in, json_out, csv_file, xml_dir, xml_out_dir, pdf_xml_json):
    assert Path(json_in).exists()
    assert Path(xml_dir).exists()
    # make the output dir
    if not Path(xml_out_dir).exists():
        Path(xml_out_dir).mkdir(parents=True, exist_ok=True)

    data_in = read_json(json_in)
    data_out = {}
    if data_in is None or not data_in:
        logger.error("empty file")
        return

    # xml_info = get_xml_list_info_v2(xml_dir)
    # xml_info = get_xml_from_csv(csv_file=csv_file)
    # NOTE: with get_xml_from_dirs, `csv_file` is actually a directory of XML files
    xml_info = get_xml_from_dirs(dir_path=csv_file, pdf_keys=list(data_in.keys()))
    print("Num xml: ", len(xml_info))
    success = 0
    pdf_xml_info = {}
    set_xml_paths = set()
    for pdf_key in tqdm.tqdm(data_in.keys()):
        xml_paths = xml_info.get(pdf_key)
        # print(xml_paths)
        preds = data_in[pdf_key]
        if xml_paths is None or len(xml_paths) == 0:
            print("No xml found (xml_paths is None or empty) for", pdf_key)
            continue

        output, xml_path = extract_v2(xml_paths, preds, field_mapping, pdf_key=pdf_key)

        if output is not None:
            pdf_xml_info[pdf_key] = xml_path
            shutil.copy(xml_path, xml_out_dir)
            # if Path(xml_path).stem in set_xml_paths:
            #     print(pdf_key, xml_path)
            set_xml_paths.add(Path(xml_path).stem)
            success += 1
            data_out[pdf_key] = output
    print("Success: ", success)
    print(len(set_xml_paths))
    write_json(pdf_xml_json, pdf_xml_info, sort_keys=False)
    write_json(json_out, data_out, sort_keys=False)

def combine_xml(json_src, json_refer):
    # Fill empty fields in json_src with the corresponding values from json_refer.
    data_src = read_json(json_src)
    data_refer = read_json(json_refer)

    for pdf_key in data_src.keys():
        for field_key in data_src[pdf_key]:
            if data_src[pdf_key][field_key] == "":
                data_src[pdf_key][field_key] = data_refer[pdf_key][field_key]

    write_json(json_src, data=data_src, sort_keys=False)

def create_data_from_json(in_dir, out_dir, json_path):
    in_dir = Path(in_dir)
    out_dir = Path(out_dir)

    if not out_dir.exists():
        out_dir.mkdir(parents=True, exist_ok=True)

    data = read_json(json_path)

    for pdf_key in data.keys():
        pdf_path = in_dir / (pdf_key + ".pdf")
        shutil.copy(str(pdf_path), str(out_dir))

if __name__ == "__main__":

    # json_in = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_multi_page.json"
    # json_out = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_multi_page_from_xml.json"
    # xml_dir = "/mnt/hdd2T/AICR/Projects/2023/FI_Invoices/Test"
    # process(json_in=json_in, json_out=json_out, xml_dir=xml_dir)

    # json_in = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_multi_page_from_xml.json"
    # json_refer = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_multi_page.json"
    # json_in = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_one_page_e2e_from_xml.json"
    # json_refer = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_one_page_e2e.json"

    # combine_xml(json_src=json_in, json_refer=json_refer)

    ## One page
    # json_in = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_one_page_e2e.json"
    # json_out = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_one_page_e2e_from_xml.json"
    ## Multi page
    json_in = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/PV2/Invoice_v1_multi_page.json"
    json_out = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/PV2/Invoice_v1_multi_page_from_xml.json"

    # csv_file = "/mnt/ssd1T/tuanlv/02.KeyValueUnderstanding/inferences/e2e_outputs/FI_June_data.csv"
    csv_file = "/mnt/hdd2T/AICR/Projects/2023/FI_Invoices/Data"
    pdf_xml_json = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v1_multi_page_metadata.json"

    xml_dir = "/mnt/hdd2T/AICR/Projects/2023/FI_Invoices/Test"
    xml_out_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/Invoice_v1_multi_page_xml"

    process_v2(json_in=json_in, json_out=json_out, csv_file=csv_file, xml_dir=xml_dir, xml_out_dir=xml_out_dir, pdf_xml_json=pdf_xml_json)

    # in_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/Invoice_v2_multi_page"
    # out_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/Invoice_v2_multi_page_clean"
    # json_path = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_multi_page_from_xml.json"
    # in_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/Invoice_v2_one_page"
    # out_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/Invoice_v2_one_page_clean"
    # json_path = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_one_page_e2e_from_xml.json"

    # create_data_from_json(in_dir, out_dir, json_path)