"""Extract ground-truth invoice fields from e-invoice XML files.

Reads model predictions (JSON keyed by PDF file stem), locates the matching
XML file(s) for each PDF, pulls the field values out of the XML, and writes
the results back out as JSON.  Also contains small utilities for combining
JSON outputs and copying the referenced PDFs.
"""

import ast
import csv
import json
import logging
import shutil
import xml.etree.ElementTree as ET
from datetime import datetime
from pathlib import Path

try:
    import tqdm
except ImportError:  # tqdm is only a progress bar; degrade gracefully without it
    class _TqdmShim:
        """Minimal stand-in exposing ``tqdm.tqdm`` as an identity wrapper."""

        @staticmethod
        def tqdm(iterable, **kwargs):
            return iterable

    tqdm = _TqdmShim()

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# Maps output field name -> tag path inside the invoice XML.
# "XXXXXXX" placeholders mark fields that have no XML source tag
# (they always resolve to "" because no such tag exists).
field_mapping = {
    "no_value": "SHDon",
    "form_value": "KHMSHDon",
    "serial_value": "XXXXXXX",
    "date_value": "NLap",  # stored as 2023-06-05, converted to DD/MM/YYYY
    "seller_company_name_value": "NBan/Ten",
    "seller_address_value": "NBan/DChi",
    "seller_tel_value": "XXXXXXXXX",
    "seller_tax_code_value": "NBan/MST",
    "buyer_personal_name_value": "NMua/HVTNMHang",
    "buyer_company_name_value": "NMua/Ten",
    "buyer_address_value": "NMua/DChi",
    "buyer_tax_code_value": "NMua/MST",
    "buyer_tel_value": "NMua/SDT",
    "tax_amount_value": "TThue",
    "total_value": "TgTTTBSo",
    "total_in_words_value": "TgTTTBChu",
}
## fields need care: serial_value, seller_tel_value, buyer_tel_value


def get_xml_from_csv(csv_file):
    """Read a CSV with ``file_path`` and ``xml_path`` columns.

    Returns a dict mapping PDF stem -> list of candidate XML paths.
    The ``xml_path`` column stores a Python-literal list (e.g. "['a.xml']"),
    hence the ``ast.literal_eval``.
    """
    data = {}
    with open(csv_file, "r") as file:
        for row in csv.DictReader(file):
            pdf_key = Path(row["file_path"]).stem
            data[pdf_key] = ast.literal_eval(row["xml_path"])
    return data


def get_xml_from_dirs(dir_path, pdf_keys):
    """Map every PDF key to the full list of ``*.xml`` files under ``dir_path``.

    NOTE(review): every key deliberately gets the SAME candidate list; the
    best match per PDF is selected later in :func:`extract_v2`.
    """
    xml_paths = [str(path) for path in Path(dir_path).rglob("*.xml")]
    return {pdf_key: xml_paths for pdf_key in pdf_keys}


def write_json(json_path, data, sort_keys=True):
    """Dump ``data`` to ``json_path`` as UTF-8 JSON (non-ASCII preserved)."""
    with open(json_path, "w", encoding="utf8") as f:
        json.dump(data, f, ensure_ascii=False, sort_keys=sort_keys)


def read_json(json_path):
    """Load and return the JSON content of ``json_path``."""
    with open(json_path, "r", encoding="utf8") as f:
        return json.load(f)


def convert_date(date_str: str, ori_pattern: str = "%Y-%m-%d", tgt_pattern: str = "%d/%m/%Y"):
    """Re-format a date string from ``ori_pattern`` to ``tgt_pattern``.

    Raises ValueError if ``date_str`` does not match ``ori_pattern``.
    """
    date_obj = datetime.strptime(date_str, ori_pattern)
    return date_obj.strftime(tgt_pattern)


def _extract_fields(root, field_mapping):
    """Pull each mapped field out of a parsed XML tree and normalize it.

    Shared by :func:`extract` and :func:`extract_v2` (previously duplicated).
    Missing tags resolve to ""; dates are re-formatted; monetary amounts are
    truncated to an integer string (e.g. "150.0" -> "150").
    """
    output = {}
    for key, tag in field_mapping.items():
        node = root.find(f".//{tag}")
        value = "" if node is None else node.text
        if key == "date_value" and value != "":
            value = convert_date(value)
        if key in ("tax_amount_value", "total_value") and value != "":
            value = str(int(float(value)))
        output[key] = value
    return output


def extract(xml_in, field_mapping):
    """Parse one XML invoice file and return the mapped field values."""
    # Explicit utf8: invoice XML contains Vietnamese text; do not depend on
    # the platform's default encoding.
    with open(xml_in, "r", encoding="utf8") as f:
        root = ET.fromstring(f.read())
    return _extract_fields(root, field_mapping)


def get_xml_list_info(xml_dir):
    """Map PDF stem -> single XML path, scanning one directory level deep."""
    xml_info = {}
    for xml_file in Path(xml_dir).glob("*/*.xml"):
        xml_info[xml_file.stem] = str(xml_file)
    return xml_info


def process(json_in, json_out, xml_dir):
    """Extract fields for every prediction in ``json_in`` (one XML per PDF).

    PDFs without a matching XML are silently skipped.
    """
    assert Path(json_in).exists()
    assert Path(xml_dir).exists()
    data_in = read_json(json_in)
    if not data_in:  # covers both None and empty dict
        logger.error("empty file")
        return
    xml_info = get_xml_list_info(xml_dir)
    data_out = {}
    for pdf_key in tqdm.tqdm(data_in.keys()):
        xml_path = xml_info.get(pdf_key)
        if xml_path is None:
            continue
        data_out[pdf_key] = extract(xml_path, field_mapping)
    write_json(json_out, data_out, sort_keys=False)


def get_xml_list_info_v2(xml_dir):
    """Map PDF stem -> list of XML paths (stems may repeat across subdirs)."""
    xml_info = {}
    for xml_file in Path(xml_dir).glob("*/*.xml"):
        xml_info.setdefault(xml_file.stem, []).append(str(xml_file))
    return xml_info


def extract_v2(xml_paths, preds, field_mapping, pdf_key=None):
    """Select the best-matching XML among candidates and extract its fields.

    With a single candidate it is used directly.  With several, the candidate
    whose invoice number ("no_value") equals the predicted value is chosen;
    the scan does not break early, so the LAST matching candidate wins.

    Returns ``(output_dict, chosen_xml_path)``, or ``(None, None)`` when no
    candidate matches or the chosen file cannot be parsed.
    """
    xml_path = None
    if len(xml_paths) == 1:
        xml_path = xml_paths[0]
    else:
        # find best xml
        for xml_in in xml_paths:
            try:
                with open(xml_in, "r", encoding="utf8") as f:
                    xml_string = f.read()
                # NOTE(review): feeding a str to an iso-8859-5 parser is
                # unusual — kept as-is to preserve the existing matching
                # behavior; confirm the intended encoding with the data owner.
                root = ET.fromstring(xml_string, parser=ET.XMLParser(encoding="iso-8859-5"))
            except Exception as err:
                print("Error exception (check) ", err, xml_in)
                continue
            key_checks = ["no_value"]
            is_exists_xml = False
            for key_check in key_checks:
                node = root.find(f".//{field_mapping[key_check]}")
                value = "" if node is None else node.text
                if value == preds[key_check]:
                    is_exists_xml = True
            if is_exists_xml:
                xml_path = xml_in  # no break: last matching candidate wins

    if xml_path is None:
        print("Not found best xml for ", pdf_key, xml_paths)
        return None, None

    try:
        with open(xml_path, "r", encoding="utf8") as f:
            xml_string = f.read()
        root = ET.fromstring(xml_string)
    except Exception as err:
        print("Error exception: ", err, xml_path)
        return None, None

    return _extract_fields(root, field_mapping), xml_path


def process_v2(json_in, json_out, csv_file, xml_dir, xml_out_dir, pdf_xml_json):
    """End-to-end v2: pick best XML per PDF, extract fields, copy the XML.

    Writes two JSON files: ``json_out`` (extracted fields per PDF) and
    ``pdf_xml_json`` (which XML was chosen for each PDF).

    NOTE(review): ``csv_file`` is currently treated as a *directory* and
    passed to :func:`get_xml_from_dirs` (see the alternative sources below);
    confirm the intended source with the caller.
    """
    assert Path(json_in).exists()
    assert Path(xml_dir).exists()
    # make dir
    Path(xml_out_dir).mkdir(parents=True, exist_ok=True)
    data_in = read_json(json_in)
    if not data_in:
        logger.error("empty file")
        return

    # Alternative sources tried previously:
    #   get_xml_list_info_v2(xml_dir)  /  get_xml_from_csv(csv_file=csv_file)
    xml_info = get_xml_from_dirs(dir_path=csv_file, pdf_keys=list(data_in.keys()))
    print("Num xml: ", len(xml_info))

    success_count = 0
    pdf_xml_info = {}
    set_xml_paths = set()  # distinct XML stems actually used (duplicate check)
    data_out = {}
    for pdf_key in tqdm.tqdm(data_in.keys()):
        xml_paths = xml_info.get(pdf_key)
        preds = data_in[pdf_key]
        if not xml_paths:
            print("Not exist xml because xml_paths is None or len xml_paths = 0", pdf_key)
            continue
        output, xml_path = extract_v2(xml_paths, preds, field_mapping, pdf_key=pdf_key)
        if output is not None:
            pdf_xml_info[pdf_key] = xml_path
            shutil.copy(xml_path, xml_out_dir)
            set_xml_paths.add(Path(xml_path).stem)
            success_count += 1
            data_out[pdf_key] = output

    print("Succes: ", success_count)
    print(len(set_xml_paths))
    write_json(pdf_xml_json, pdf_xml_info, sort_keys=False)
    write_json(json_out, data_out, sort_keys=False)


def combine_xml(json_src, json_refer):
    """Fill empty field values in ``json_src`` from ``json_refer`` (in place on disk).

    Raises KeyError if a key present in ``json_src`` is missing from
    ``json_refer``.
    """
    data_src = read_json(json_src)
    data_refer = read_json(json_refer)
    for pdf_key, fields in data_src.items():
        for field_key, value in fields.items():
            if value == "":
                fields[field_key] = data_refer[pdf_key][field_key]
    write_json(json_src, data=data_src, sort_keys=False)


def create_data_from_json(in_dir, out_dir, json_path):
    """Copy the PDFs whose stems appear as keys of ``json_path`` into ``out_dir``."""
    in_dir = Path(in_dir)
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    for pdf_key in read_json(json_path):
        shutil.copy(str(in_dir / (pdf_key + ".pdf")), str(out_dir))


if __name__ == "__main__":
    ## Multi page
    json_in = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/PV2/Invoice_v1_multi_page.json"
    json_out = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/PV2/Invoice_v1_multi_page_from_xml.json"
    # NOTE(review): despite the name, this path is a directory of XML files
    # consumed by get_xml_from_dirs inside process_v2.
    csv_file = "/mnt/hdd2T/AICR/Projects/2023/FI_Invoices/Data"
    pdf_xml_json = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v1_multi_page_metadata.json"
    xml_dir = "/mnt/hdd2T/AICR/Projects/2023/FI_Invoices/Test"
    xml_out_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/Invoice_v1_multi_page_xml"
    process_v2(
        json_in=json_in,
        json_out=json_out,
        csv_file=csv_file,
        xml_dir=xml_dir,
        xml_out_dir=xml_out_dir,
        pdf_xml_json=pdf_xml_json,
    )
    # Earlier one-page / combine / copy workflows used the same entry points
    # (process, combine_xml, create_data_from_json) with different paths.