sbt-idp/cope2n-ai-fi/modules/_sdsvkvu/sdsvkvu/utils/query/manulife.py

133 lines
4.9 KiB
Python
Raw Normal View History

2023-11-30 11:22:16 +00:00
from sdsvkvu.utils.dictionary.manulife import get_dict
from sdsvkvu.utils.post_processing import (
split_key_value_by_colon,
remove_bullet_points_and_punctuation,
longestCommonSubsequence,
preprocessing
)
def manulife_key_matching(text: str, threshold: float, dict_type: str):
    """Match *text* against the candidate phrases of the ``dict_type`` dictionary.

    Returns a 4-tuple ``(matched, key_or_text, score, processed_text)``:
    an exact candidate hit scores ``5 * (1 + len(processed_text))``, a
    substring hit scores a flat ``5``; otherwise the best LCS score
    (normalized by candidate length) decides against *threshold*.
    """
    dictionary = get_dict(type=dict_type)
    processed_text = preprocessing(text)

    for label, candidates in dictionary.items():
        # Exact match wins outright with a length-weighted score.
        if any(cand == processed_text for cand in candidates):
            return True, label, 5 * (1 + len(processed_text)), processed_text
        # A candidate appearing as a substring is a weaker, fixed-score hit.
        if any(cand in processed_text for cand in candidates):
            return True, label, 5, processed_text

    # Fuzzy fallback: best normalized longest-common-subsequence per key.
    scores = {}
    for label, candidates in dictionary.items():
        if not candidates:
            scores[label] = 0.0
            continue
        scores[label] = max(
            longestCommonSubsequence(processed_text, cand) / len(cand)
            for cand in candidates
        )
    best_key, best_score = max(scores.items(), key=lambda kv: kv[1])
    matched = best_score > threshold
    # Below threshold the *original* text is handed back, not the best key.
    return matched, best_key if matched else text, best_score, processed_text
def normalize_kvu_output_for_manulife(raw_outputs: dict) -> dict:
    """Strip bullet points/punctuation from keys, values and table cells.

    Table headers are upper-cased and scalar keys capitalized; ``None`` or
    empty cell values pass through untouched.
    """
    outputs = {}
    for key, values in raw_outputs.items():
        if key == "tables" and len(values) > 0:
            cleaned_tables = []
            for table in values:
                headers = [
                    remove_bullet_points_and_punctuation(h).upper()
                    for h in table["headers"]
                ]
                rows = []
                for row in table["data"]:
                    # Only the cell values matter; keep empty/None cells as-is.
                    rows.append([
                        remove_bullet_points_and_punctuation(cell)
                        if cell is not None and len(cell) > 0
                        else cell
                        for cell in row.values()
                    ])
                cleaned_tables.append({"headers": headers, "data": rows})
            outputs[key] = cleaned_tables
        else:
            cleaned_key = remove_bullet_points_and_punctuation(key).capitalize()
            if isinstance(values, list):
                values = [remove_bullet_points_and_punctuation(v) for v in values]
            elif values is not None and len(values) > 0:
                values = remove_bullet_points_and_punctuation(values)
            outputs[cleaned_key] = values
    return outputs
def export_kvu_for_manulife(raw_outputs: dict) -> dict:
    """Assemble the final Manulife KVU payload from raw extraction output.

    Combines title matching, key/value pairs, lone keys, triplets and table
    cells into one dict, then normalizes the result.
    """
    outputs = {}

    # --- Title: score every candidate against the title dictionary and keep
    # the one with the highest LCS score.
    # NOTE(review): 'documment_type' is a misspelled key preserved on purpose —
    # it is consumed verbatim a few lines below.
    candidates = []
    for title in raw_outputs["title"]:
        is_match, title_name, score, processed = manulife_key_matching(
            title["text"], threshold=0.6, dict_type="title"
        )
        candidates.append({
            'documment_type': title_name if is_match else "",
            'content': title['text'],
            'processed_key_name': processed,
            'lcs_score': score,
            'token_id': title['id'],
        })
    if candidates:
        best = max(candidates, key=lambda c: c['lcs_score'])
        outputs["title"] = f"({best['documment_type']}) {best['content']}"
    else:
        outputs["title"] = None

    # --- Key-value pairs (value may still be glued to the key by a colon).
    for pair in raw_outputs["single"]:
        for key, values in pair.items():
            elements = split_key_value_by_colon(key, values["text"])
            outputs[elements[0]] = elements[1]

    # --- Keys detected without a value.
    for key in raw_outputs["key"]:
        elements = split_key_value_by_colon(key["text"], None)
        outputs[elements[0]] = elements[1]

    # --- Triplets: one key mapped to a list of values.
    for triplet in raw_outputs["triplet"]:
        for key, list_value in triplet.items():
            outputs[key] = [value["text"] for value in list_value]

    # --- Table: collect the union of headers, ordered left-to-right by the
    # x-coordinate of each header's bbox, then fill one record per row.
    header_bboxes = {
        cell['header']: cell['header_bbox']
        for row in raw_outputs['table']
        for cell in row
    }
    if header_bboxes:
        header_bboxes = dict(
            sorted(header_bboxes.items(), key=lambda item: int(item[1][0]))
        )
        rows = []
        for row in raw_outputs["table"]:
            record = dict.fromkeys(header_bboxes)  # missing cells stay None
            for cell in row:
                record[cell["header"]] = cell["text"]
            rows.append(record)
        outputs["tables"] = [{"headers": list(header_bboxes.keys()), "data": rows}]
    else:
        outputs["tables"] = []

    outputs = normalize_kvu_output_for_manulife(outputs)
    return outputs
def merged_kvu_for_manulife_for_multi_pages(loutputs: list) -> dict:
    """Merge per-page KVU outputs into a single document-level dict.

    Scalar fields from later pages overwrite earlier ones (last page wins);
    the per-page ``"tables"`` lists are concatenated into one flat list so
    the merged structure keeps the single-page schema
    ``[{"headers": [...], "data": [...]}, ...]``.

    Args:
        loutputs: per-page dicts as produced by ``export_kvu_for_manulife``.

    Returns:
        One dict with all scalar keys plus a flat ``"tables"`` list.
    """
    merged_outputs = {}
    merged_tables = []
    for outputs in loutputs:
        for key, value in outputs.items():
            if key == "tables":
                # Bug fix: the original appended each page's table *list*,
                # producing a nested list-of-lists (with empty lists for
                # table-free pages). Extending keeps the per-page schema.
                merged_tables.extend(value)
            else:
                # NOTE(review): duplicate scalar keys are silently
                # overwritten by later pages — preserved original behavior.
                merged_outputs[key] = value
    merged_outputs['tables'] = merged_tables
    return merged_outputs