sbt-idp/cope2n-ai-fi/modules/_sdsvkvu/sdsvkvu/utils/query/vtb.py
2023-11-30 18:22:16 +07:00

153 lines
7.3 KiB
Python

import re
from sdsvkvu.utils.post_processing import preprocessing, date_regexing, remove_bullet_points_and_punctuation
from sdsvkvu.utils.dictionary.vtb import get_dict
# For Vietin Bank project
def vietin_key_matching(text: str, threshold: float, dict_type: str):
    """Map a raw key string onto one of the known VietinBank field names.

    ``text`` is normalised with ``preprocessing`` and then compared, by
    substring search, against candidate phrases returned by ``get_dict``.
    The checks run in this order and the first hit wins:

    1. A phrase of the "date" dictionary occurs in the processed text:
       returns ``("date", 5, time_key)`` where ``time_key`` is the matching
       entry of the "date" dictionary.
    2. The processed text ends with "dien": returns
       ``("sender", 15, processed_text)``.
    3. A candidate phrase of some key of the ``dict_type`` dictionary
       (extended with the "extra" dictionary's phrases for the same key, if
       any) occurs in the processed text: returns
       ``(key, 10, processed_text)``.
    4. Fallback scoring. LCS scoring is currently disabled, so every key
       scores 0.0 and, for any non-negative ``threshold``, the original
       ``text`` is returned as the key name.

    Args:
        text: Raw key text to classify.
        threshold: Minimum fallback score required to accept a dictionary
            key instead of echoing ``text`` back.
        dict_type: Name of the dictionary to request from ``get_dict``.

    Returns:
        A 3-tuple ``(key_name, score, processed_text)``; for date matches the
        third element is the matching "date" dictionary key instead.
    """
    dictionary = get_dict(type=dict_type)
    processed_text = preprocessing(text)

    # Step 1: exact (substring) matching.
    for time_key, candidates in get_dict("date").items():
        if any(txt in processed_text for txt in candidates):
            return "date", 5, time_key

    extra_dict = get_dict("extra")

    # This test does not depend on the dictionary entry, so it is evaluated
    # once here rather than on every loop iteration.
    # Example from the original author: "Bộ trưởng Bộ GTVT điện: A, B, C".
    if processed_text[-4:] == "dien":
        return "sender", 15, processed_text

    for key, candidates in dictionary.items():
        if key in extra_dict:
            candidates = candidates + extra_dict[key]
        if any(txt in processed_text for txt in candidates):
            return key, 10, processed_text

    # Step 2: LCS score.
    # Disabled temporarily: the intended scoring was, per key, the best
    # longestCommonSubsequence(processed_text, candidate) / len(candidate).
    scores = {k: 0.0 for k in dictionary}
    # `default` keeps an empty dictionary from raising ValueError; it falls
    # through to "no match" and echoes the input text back.
    key, score = max(scores.items(), key=lambda x: x[1], default=(text, 0.0))
    return key if score > threshold else text, score, processed_text
def get_vietin_info(outputs):
    """Group KVU predictions by the VietinBank field their key text maps to.

    Every key text is classified with ``vietin_key_matching`` (threshold 0.8,
    "key" dictionary). Predictions whose resolved name is one of the keys of
    ``get_dict(type="key")`` are collected; all others are dropped.

    Args:
        outputs: Mapping read through these entries:
            * ``'single'``: iterable of dicts ``{raw_key: {'text', 'id'}}``.
            * ``'key'`` and ``'value'``: sequences (concatenated with ``+``)
              of dicts with ``'text'`` and ``'id'``; each item's own text is
              used both as key text and as content.
            * ``'triplet'``: iterable of dicts
              ``{raw_key: [{'text', 'id'}, ...]}``.

    Returns:
        Dict mapping each field name to a list of candidate dicts with
        ``content``, ``processed_key_name``, ``raw_key_name``, ``lcs_score``,
        ``token_id`` and ``single_entity`` (True only for standalone
        ``'key'``/``'value'`` items).
    """
    grouped = {field: [] for field in get_dict(type="key").keys()}

    # Regular key/value pairs.
    for kv_pair in outputs['single']:
        for raw_key, value in kv_pair.items():
            field, score, processed_key = vietin_key_matching(
                raw_key, threshold=0.8, dict_type="key"
            )
            if field not in grouped:
                continue
            grouped[field].append({
                'content': value['text'],
                'processed_key_name': processed_key,
                'raw_key_name': raw_key,
                'lcs_score': score,
                'token_id': value['id'],
                'single_entity': False
            })

    # Standalone keys and values: the item text serves as key and content.
    standalone_items = outputs['key'] + outputs['value']
    for item in standalone_items:
        field, score, processed_key = vietin_key_matching(
            item['text'], threshold=0.8, dict_type="key"
        )
        if field not in grouped:
            continue
        grouped[field].append({
            'content': item['text'],
            'processed_key_name': processed_key,
            'raw_key_name': item['text'],
            'lcs_score': score,
            'token_id': item['id'],
            'single_entity': True
        })

    # Sender and receiver are usually in triplet (one key, several values).
    for triplet in outputs['triplet']:
        for raw_key, values in triplet.items():
            field, score, processed_key = vietin_key_matching(
                raw_key, threshold=0.8, dict_type="key"
            )
            if field not in grouped:
                continue
            for value in values:
                grouped[field].append({
                    'content': value['text'],
                    'raw_key_name': raw_key,
                    'processed_key_name': processed_key,
                    'lcs_score': score,
                    'token_id': value['id'],
                    'single_entity': False
                })

    return grouped
def post_process_vietin_info(single_pairs):
    """Reduce the grouped candidates to one output value per field.

    Handling per field name:

    * ``date``: with exactly one candidate, its content (spaces removed) is
      used if it consists only of digits and ``/``; otherwise the field stays
      ``None``. With any other number of candidates, they are ordered by
      ``token_id``, each candidate's raw key text and content are
      concatenated, the pieces are joined with spaces and passed to
      ``date_regexing``; the three returned parts are formatted as
      ``"{d}/{m}/{y}"``.
    * ``receiver`` / ``sender``: list of candidate contents ordered by
      ``token_id``, each cleaned with
      ``remove_bullet_points_and_punctuation``.
    * ``signee``: list of ``"<content> - <raw key>"`` strings ordered by
      ``token_id``, skipping candidates flagged ``single_entity``.
    * any other field: content of the candidate with the highest
      ``lcs_score`` (field stays ``None`` when there are no candidates).
      For ``number`` the digits found in the raw key text are located in
      the content (spaces removed): if present, the content is cut to start
      there; if absent, the digits are prepended.

    Args:
        single_pairs: Mapping of field name to a list of candidate dicts
            with ``content``, ``raw_key_name``, ``lcs_score``, ``token_id``
            and ``single_entity`` entries.

    Returns:
        Dict with one entry per key of ``get_dict(type="key")`` (``None``
        where nothing was extracted) plus any processed field.
    """
    vietin_outputs = {k: None for k in get_dict(type="key").keys()}

    def by_token_id(candidate):
        return candidate['token_id']

    for key_name, list_potential_value in single_pairs.items():
        # Plain equality: `key_name in ("date")` is a substring test against
        # the string "date" (the parentheses do not make a tuple), which
        # would also accept names such as "at" or "te".
        if key_name == "date":
            if len(list_potential_value) == 1:
                check_string = list_potential_value[0]['content'].replace(" ", "")
                if check_string.replace('/', '').isdigit():
                    vietin_outputs[key_name] = check_string
            else:
                ordered = sorted(list_potential_value, key=by_token_id)
                full_string = ' '.join(
                    v['raw_key_name'] + v['content'] for v in ordered
                )
                d, m, y = date_regexing(full_string)
                vietin_outputs[key_name] = f"{d}/{m}/{y}"
        elif key_name in ("receiver", "sender"):
            ordered = sorted(list_potential_value, key=by_token_id)
            vietin_outputs[key_name] = [
                remove_bullet_points_and_punctuation(value['content'])
                for value in ordered
            ]
        elif key_name == "signee":
            ordered = sorted(list_potential_value, key=by_token_id)
            vietin_outputs[key_name] = [
                f"{value['content']} - {value['raw_key_name']}"
                for value in ordered
                if not value['single_entity']
            ]
        else:
            if not list_potential_value:
                continue
            # Keep the candidate with the highest LCS score.
            selected_value = max(list_potential_value, key=lambda x: x['lcs_score'])
            vietin_outputs[key_name] = selected_value['content']

            if key_name == "number":
                number = re.sub("[^0-9]", "", selected_value['raw_key_name'])
                # Search and slice the same space-free string; an index
                # taken from the spaced content would cut the compacted
                # content at the wrong position.
                compact_content = selected_value['content'].replace(" ", "")
                start_idx = compact_content.find(number)
                if start_idx != -1:
                    vietin_outputs[key_name] = compact_content[start_idx:]
                else:
                    vietin_outputs[key_name] = number + compact_content

    return vietin_outputs
def export_kvu_for_vietin(outputs):
    """Build the VietinBank result dict for one page of KVU outputs.

    Runs ``get_vietin_info`` followed by ``post_process_vietin_info`` and
    then overwrites the ``'title'`` entry with the ``'text'`` of every item
    in ``outputs["title"]``.
    """
    result = post_process_vietin_info(get_vietin_info(outputs))

    titles = []
    for title_item in outputs["title"]:
        titles.append(title_item['text'])
    result['title'] = titles

    return result
def merged_kvu_for_vietin_for_multi_pages(lvietin_outputs: list):
    """Merge per-page VietinBank results into one document-level dict.

    Values equal to ``None`` or to the ``"dd/mm/yyyy"`` placeholder are
    ignored; the remaining values are collected per field in page order.
    Each field is then collapsed:

    * no collected value -> ``None``
    * ``receiver`` -> value from the last page that provided one
    * ``number``, ``title``, ``date``, ``signee``, ``sender`` -> value from
      the first page that provided one
    * any other field -> the list of collected values, unchanged

    Args:
        lvietin_outputs: Per-page result dicts. Every key must be a key of
            ``get_dict("key")``, otherwise a ``KeyError`` is raised.

    Returns:
        Dict with one entry per key of ``get_dict("key")``.
    """
    ignored_values = (None, "dd/mm/yyyy")
    first_page_fields = ("number", "title", "date", "signee", "sender")

    merged = {field: [] for field in get_dict("key").keys()}

    for page_outputs in lvietin_outputs:
        for field, page_value in page_outputs.items():
            if page_value in ignored_values:
                continue
            merged[field].append(page_value)

    # Only existing keys are reassigned, so the dict size never changes
    # while it is being iterated.
    for field in merged:
        collected = merged[field]
        if not collected:
            merged[field] = None
        elif field == "receiver":
            merged[field] = collected[-1]
        elif field in first_page_fields:
            merged[field] = collected[0]

    return merged