sbt-idp/cope2n-ai-fi/modules/sdsvkie/scripts/common/extract_xml.py
2023-12-12 15:14:54 +07:00

314 lines
11 KiB
Python
Executable File

import shutil
import xml.etree.ElementTree as ET
from datetime import datetime
# from sdsvkie.utils.io_file import read_json, write_json
import json
import csv
import ast
def get_xml_from_csv(csv_file):
    """Map each pdf stem to its list of xml paths, read from a CSV file.

    The CSV must have a 'file_path' column (pdf path) and an 'xml_path'
    column holding a Python-literal list serialized as a string.
    """
    xml_by_pdf = {}
    with open(csv_file, 'r') as fp:
        for record in csv.DictReader(fp):
            stem = Path(record['file_path']).stem
            xml_by_pdf[stem] = ast.literal_eval(record['xml_path'])
    return xml_by_pdf
def get_xml_from_dirs(dir_path, pdf_keys):
    """Assign the full recursive list of .xml files under dir_path to every key.

    Note: all keys deliberately share the same path list object -- no
    per-key filtering happens here.
    """
    all_xmls = [str(p) for p in Path(dir_path).rglob("*.xml")]
    return {key: all_xmls for key in pdf_keys}
def write_json(json_path, data, sort_keys=True):
    """Serialize *data* to *json_path* as UTF-8 JSON, keeping non-ASCII verbatim."""
    with open(json_path, "w", encoding="utf8") as out:
        out.write(json.dumps(data, ensure_ascii=False, sort_keys=sort_keys))
def read_json(json_path):
    """Load and return the JSON document stored at *json_path* (UTF-8)."""
    with open(json_path, "r", encoding="utf8") as src:
        return json.load(src)
from pathlib import Path
import tqdm
import logging
# Configure root logging once at import time; module-level logger for this script.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def convert_date(date_str: str, ori_pattern: str = '%Y-%m-%d', tgt_pattern: str = '%d/%m/%Y'):
    """Re-format *date_str* from *ori_pattern* into *tgt_pattern*.

    Defaults convert an ISO date (YYYY-MM-DD) to DD/MM/YYYY.
    Raises ValueError if *date_str* does not match *ori_pattern*.
    """
    return datetime.strptime(date_str, ori_pattern).strftime(tgt_pattern)
def extract(xml_in, field_mapping):
    """Extract invoice fields from one XML file.

    Args:
        xml_in: path to the invoice XML file.
        field_mapping: dict mapping output field name -> XML tag path,
            searched anywhere in the tree via ``.//``.

    Returns:
        dict with one entry per key in *field_mapping*; missing tags yield "".
        "date_value" is re-formatted through convert_date(); money fields
        ("tax_amount_value", "total_value") are normalized to an integer
        string (e.g. "100.0" -> "100").
    """
    # Read as UTF-8 explicitly: the invoice XML contains Vietnamese text and
    # the platform default encoding is not guaranteed to be UTF-8. This also
    # makes the function consistent with read_json/extract_v2 in this file.
    with open(xml_in, "r", encoding="utf8") as f:
        xml_string = f.read()
    # parse the XML string
    root = ET.fromstring(xml_string)
    output = {}
    for key in field_mapping:
        pattern = f".//{field_mapping[key]}"
        node = root.find(pattern)
        value = "" if node is None else node.text
        if key == "date_value" and value != "":
            value = convert_date(value)
        if key in ["tax_amount_value", "total_value"] and value != "":
            # drop the decimal part: "1234.0" -> "1234"
            value = str(int(float(value)))
        output[key] = value
    return output
# Output field name -> XML tag path (searched anywhere in the tree via ".//").
# "XXXX..." placeholders mark fields with no known XML tag; they always resolve
# to "" at extraction time.
field_mapping = {
    "no_value": "SHDon",
    "form_value": "KHMSHDon",
    "serial_value": "XXXXXXX",
    "date_value": "NLap",  # 2023-06-05 -> YY-MM-DD
    "seller_company_name_value": "NBan/Ten",
    "seller_address_value": "NBan/DChi",
    "seller_tel_value": "XXXXXXXXX",
    "seller_tax_code_value": "NBan/MST",
    "buyer_personal_name_value": "NMua/HVTNMHang",
    "buyer_company_name_value": "NMua/Ten",
    "buyer_address_value": "NMua/DChi",
    "buyer_tax_code_value": "NMua/MST",
    "buyer_tel_value": "NMua/SDT",
    "tax_amount_value": "TThue",
    "total_value": "TgTTTBSo",
    "total_in_words_value": "TgTTTBChu"
}
## fields need care: serial_value, seller_tel_value, buyer_tel_value
def get_xml_list_info(xml_dir):
    """Map pdf stem -> xml path for files matching <xml_dir>/*/*.xml.

    If several xml files share a stem, the last one encountered wins.
    """
    return {path.stem: str(path) for path in Path(xml_dir).glob("*/*.xml")}
def process(json_in, json_out, xml_dir):
    """Write json_out with fields extracted from the xml matched to each pdf key.

    Keys from json_in that have no xml under xml_dir are silently skipped.
    """
    assert Path(json_in).exists()
    assert Path(xml_dir).exists()
    data_in = read_json(json_in)
    if not data_in:
        logger.error("empty file")
        return
    xml_info = get_xml_list_info(xml_dir)
    data_out = {}
    for pdf_key in tqdm.tqdm(data_in.keys()):
        xml_path = xml_info.get(pdf_key)
        if xml_path is None:
            continue
        data_out[pdf_key] = extract(xml_path, field_mapping)
    write_json(json_out, data_out, sort_keys=False)
def get_xml_list_info_v2(xml_dir):
    """Group xml paths by pdf stem: stem -> list of all matching xml paths.

    Scans <xml_dir>/*/*.xml, one directory level deep.
    """
    xml_info = {}
    for path in Path(xml_dir).glob("*/*.xml"):
        xml_info.setdefault(path.stem, []).append(str(path))
    return xml_info
def extract_v2(xml_paths, preds, field_mapping, pdf_key=None):
    """Pick the xml among *xml_paths* that matches *preds* and extract fields.

    Args:
        xml_paths: candidate xml file paths for one pdf.
        preds: predicted field values for the pdf; used to select the
            matching xml (only "no_value" is compared).
        field_mapping: output field name -> XML tag path.
        pdf_key: pdf stem, used only in log messages.

    Returns:
        (output dict, chosen xml path), or (None, None) when no candidate
        matches or the chosen file cannot be parsed.
    """
    xml_path = None
    if len(xml_paths) == 1:
        xml_path = xml_paths[0]
    else:
        # find best xml: the candidate whose key field agrees with preds
        for xml_in in xml_paths:
            try:
                with open(xml_in, "r", encoding='utf8') as f:
                    xml_string = f.read()
                # NOTE(review): passing an iso-8859-5 parser encoding for an
                # already-decoded utf8 string looks odd -- presumably a
                # workaround for malformed declarations; confirm before changing.
                root = ET.fromstring(xml_string, parser = ET.XMLParser(encoding = 'iso-8859-5'))
            except Exception as err:
                print("Error exception (check) ", err, xml_in)
                continue
            key_checks = ["no_value"]
            is_exists_xml = False
            for key_check in key_checks:
                pattern = f".//{field_mapping[key_check]}"
                value = root.find(pattern)
                value = "" if value is None else value.text
                if value == preds[key_check]:
                    is_exists_xml = True
            if is_exists_xml:
                # no break here: if several candidates match, the LAST one wins
                xml_path = xml_in
    if xml_path is None:
        print("Not found best xml for ",pdf_key, xml_paths)
        return None, None
    try:
        # re-read the chosen file (note: platform default encoding here,
        # unlike the utf8 read in the selection loop above)
        with open(xml_path, "r") as f:
            xml_string = f.read()
        # parse the XML string
        root = ET.fromstring(xml_string)
    except Exception as err:
        print("Error exception: ", err, xml_path)
        return None, None
    # extract every mapped field; tags missing from the tree yield ""
    output = {}
    for key in field_mapping:
        pattern = f".//{field_mapping[key]}"
        value = root.find(pattern)
        value = "" if value is None else value.text
        if key == "date_value" and value != "":
            value = convert_date(value)
        if key in ["tax_amount_value", "total_value"] and value != "":
            # normalize money strings: "100.0" -> "100"
            value = str(int(float(value)))
        output[key] = value
    return output, xml_path
def process_v2(json_in, json_out, csv_file, xml_dir, xml_out_dir, pdf_xml_json):
    """Build ground-truth json by matching xml files to model predictions.

    Args:
        json_in: predictions json (pdf key -> predicted field values).
        json_out: destination json for the extracted field values.
        csv_file: despite the name, the active code path uses this as a
            DIRECTORY scanned recursively for xml files (see the
            get_xml_from_dirs call below).
        xml_dir: only existence-checked; not otherwise used in the active path.
        xml_out_dir: directory that receives a copy of each matched xml.
        pdf_xml_json: destination json for the pdf key -> chosen xml path map.
    """
    assert Path(json_in).exists() == True
    assert Path(xml_dir).exists() == True
    # make dir
    if not Path(xml_out_dir).exists():
        Path(xml_out_dir).mkdir(parents=True, exist_ok=True)
    data_in = read_json(json_in)
    data_out = {}
    if data_in is None or not data_in:
        logger.error("empty file")
        return
    # earlier matching strategies, kept for reference:
    # xml_info = get_xml_list_info_v2(xml_dir)
    # xml_info = get_xml_from_csv(csv_file=csv_file)
    xml_info = get_xml_from_dirs(dir_path=csv_file, pdf_keys=list(data_in.keys()))
    print("Num xml: ", len(xml_info))
    succes = 0  # success counter (sic)
    pdf_xml_info = {}
    set_xml_paths = set()  # distinct xml stems actually used (duplicate check)
    for pdf_key in tqdm.tqdm(data_in.keys()):
        xml_paths = xml_info[pdf_key] if pdf_key in xml_info else None
        # print(xml_paths)
        preds = data_in[pdf_key]
        if xml_paths is None or len(xml_paths) == 0:
            print("Not exist xml because xml_paths is None or len xml_paths = 0", pdf_key)
            continue
        else:
            output, xml_path = extract_v2(xml_paths, preds, field_mapping, pdf_key=pdf_key)
            if output is not None:
                pdf_xml_info[pdf_key] = xml_path
                shutil.copy(xml_path, xml_out_dir)
                # if Path(xml_path).stem in set_xml_paths:
                #     print(pdf_key, xml_path)
                set_xml_paths.add(Path(xml_path).stem)
                succes += 1
                data_out[pdf_key] = output
    print("Succes: ", succes)
    print(len(set_xml_paths))
    write_json(pdf_xml_json, pdf_xml_info, sort_keys=False)
    write_json(json_out, data_out, sort_keys=False)
def combine_xml(json_src, json_refer):
    """Fill empty fields of json_src from json_refer, rewriting json_src in place.

    Raises KeyError if json_refer is missing a key present in json_src.
    """
    data_src = read_json(json_src)
    data_refer = read_json(json_refer)
    for pdf_key, fields in data_src.items():
        for field_key, field_val in fields.items():
            if field_val == "":
                fields[field_key] = data_refer[pdf_key][field_key]
    write_json(json_src, data=data_src, sort_keys=False)
def create_data_from_json(in_dir, out_dir, json_path):
    """Copy <key>.pdf from in_dir to out_dir for every key in the json file."""
    in_dir = Path(in_dir)
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    for pdf_key in read_json(json_path):
        shutil.copy(str(in_dir / (pdf_key + ".pdf")), str(out_dir))
if __name__ == "__main__":
    # Ad-hoc driver with hard-coded machine-specific paths; earlier runs are
    # kept as commented history. Edit the active paths before running.
    # json_in = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_multi_page.json"
    # json_out = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_multi_page_from_xml.json"
    # xml_dir = "/mnt/hdd2T/AICR/Projects/2023/FI_Invoices/Test"
    # process(json_in=json_in, json_out=json_out, xml_dir=xml_dir)
    # json_in = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_multi_page_from_xml.json"
    # json_refer = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_multi_page.json"
    # json_in = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_one_page_e2e_from_xml.json"
    # json_refer = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_one_page_e2e.json"
    # combine_xml(json_src=json_in, json_refer=json_refer)
    ## One page
    # json_in = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_one_page_e2e.json"
    # json_out = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_one_page_e2e_from_xml.json"
    ## Multi page
    json_in = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/PV2/Invoice_v1_multi_page.json"
    json_out = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/PV2/Invoice_v1_multi_page_from_xml.json"
    # csv_file = "/mnt/ssd1T/tuanlv/02.KeyValueUnderstanding/inferences/e2e_outputs/FI_June_data.csv"
    # NOTE(review): csv_file currently points at a DIRECTORY, consumed by
    # get_xml_from_dirs inside process_v2 -- the parameter name is misleading.
    csv_file = "/mnt/hdd2T/AICR/Projects/2023/FI_Invoices/Data"
    pdf_xml_json = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v1_multi_page_metadata.json"
    xml_dir = "/mnt/hdd2T/AICR/Projects/2023/FI_Invoices/Test"
    xml_out_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/Invoice_v1_multi_page_xml"
    process_v2(json_in=json_in, json_out=json_out, csv_file=csv_file, xml_dir=xml_dir, xml_out_dir=xml_out_dir, pdf_xml_json=pdf_xml_json)
    # in_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/Invoice_v2_multi_page"
    # out_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/Invoice_v2_multi_page_clean"
    # json_path = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_multi_page_from_xml.json"
    # in_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/Invoice_v2_one_page"
    # out_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/Invoice_v2_one_page_clean"
    # json_path = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_one_page_e2e_from_xml.json"
    # create_data_from_json(in_dir, out_dir, json_path)