153 lines
7.3 KiB
Python
153 lines
7.3 KiB
Python
import re
|
|
from sdsvkvu.utils.post_processing import preprocessing, date_regexing, remove_bullet_points_and_punctuation
|
|
from sdsvkvu.utils.dictionary.vtb import get_dict
|
|
|
|
# For Vietin Bank project
|
|
def vietin_key_matching(text: str, threshold: float, dict_type: str):
    """Map a raw key text onto a canonical Vietin field name.

    The text is normalised with ``preprocessing`` and then checked, in this
    order, using plain substring tests:

    1. every candidate list of the ``"date"`` dictionary,
    2. the ``"dien"`` suffix, which is reported as a sender line,
    3. every candidate list of the ``dict_type`` dictionary, extended with
       the entry of the same key in the ``"extra"`` dictionary when present.

    Args:
        text: Raw key text to classify.
        threshold: Minimum fallback score required to accept a fuzzy match.
        dict_type: Dictionary name forwarded to ``get_dict``.

    Returns:
        A 3-tuple, depending on which rule fired:

        * ``("date", 5, time_key)`` for a date candidate hit,
        * ``("sender", 15, processed_text)`` for the ``"dien"`` suffix,
        * ``(key, 10, processed_text)`` for a dictionary candidate hit,
        * ``(key_or_text, score, processed_text)`` from the fallback step,
          where ``text`` itself is returned when ``score <= threshold``.
    """
    dictionary = get_dict(type=dict_type)
    processed_text = preprocessing(text)

    # Step 1: exact (substring) matching.
    for time_key, candidates in get_dict("date").items():
        if any(candidate in processed_text for candidate in candidates):
            return "date", 5, time_key

    extra_dict = get_dict("extra")

    # The suffix test does not depend on the dictionary entry, so it is done
    # once up front instead of on every loop iteration.
    # Example input: "Bộ trưởng Bộ GTVT điện: A, B, C"
    if processed_text.endswith("dien"):
        return "sender", 15, processed_text

    for key, candidates in dictionary.items():
        if key in extra_dict:
            candidates = candidates + extra_dict[key]
        if any(candidate in processed_text for candidate in candidates):
            return key, 10, processed_text

    # Step 2: LCS score.
    # NOTE: LCS scoring is temporarily disabled, so every score stays 0.0.
    # The disabled rule skipped the keys "date", "title", "number", "signee",
    # "sender" and "receiver" and otherwise used
    #   max(longestCommonSubsequence(processed_text, c) / len(c) for c in candidates)
    scores = {k: 0.0 for k in dictionary}

    # ``default`` keeps an empty dictionary from raising ValueError; the raw
    # text is then returned with a zero score.
    key, score = max(scores.items(), key=lambda item: item[1], default=(text, 0.0))
    return key if score > threshold else text, score, processed_text
|
|
|
|
|
|
def get_vietin_info(outputs):
    """Group the raw KVU entities by the Vietin field their key text maps to.

    Args:
        outputs: Mapping that is read under the keys ``'single'`` (list of
            ``{raw_key: token}`` dicts), ``'key'`` and ``'value'`` (lists of
            tokens) and ``'triplet'`` (list of ``{raw_key: [token, ...]}``
            dicts). Each token is indexed with ``'text'`` and ``'id'``.

    Returns:
        A dict with one list per name in ``get_dict(type="key")``. Each list
        holds one record per matched token with the keys ``content``,
        ``processed_key_name``, ``raw_key_name``, ``lcs_score``, ``token_id``
        and ``single_entity``.
    """
    collected = {field: [] for field in get_dict(type="key").keys()}

    def _match(raw_text):
        # Every lookup in this function uses the same threshold/dictionary.
        return vietin_key_matching(raw_text, threshold=0.8, dict_type="key")

    # Key/value pairs.
    for kv_pair in outputs['single']:
        for raw_key, token in kv_pair.items():
            field, lcs, normalised = _match(raw_key)
            if field not in collected:
                continue
            collected[field].append({
                'content': token['text'],
                'processed_key_name': normalised,
                'raw_key_name': raw_key,
                'lcs_score': lcs,
                'token_id': token['id'],
                'single_entity': False
            })

    # Stand-alone keys and values: the token text serves as key and content.
    for token in outputs['key'] + outputs['value']:
        token_text = token['text']
        field, lcs, normalised = _match(token_text)
        if field not in collected:
            continue
        collected[field].append({
            'content': token_text,
            'processed_key_name': normalised,
            'raw_key_name': token_text,
            'lcs_score': lcs,
            'token_id': token['id'],
            'single_entity': True
        })

    # Sender and receiver are usually in triplet
    for group in outputs['triplet']:
        for raw_key, tokens in group.items():
            field, lcs, normalised = _match(raw_key)
            if field not in collected:
                continue
            for token in tokens:
                collected[field].append({
                    'content': token['text'],
                    'raw_key_name': raw_key,
                    'processed_key_name': normalised,
                    'lcs_score': lcs,
                    'token_id': token['id'],
                    'single_entity': False
                })

    return collected
|
|
|
|
def _extract_document_number(raw_key_name, content):
    """Build the document number from a matched key text and its content.

    The digits of ``raw_key_name`` are looked up in the space-free content.
    When found, the content is returned from that position on; otherwise the
    digits are prepended to the space-free content.
    """
    digits = re.sub(r"[^0-9]", "", raw_key_name)
    compact = content.replace(" ", "")
    # Search the same string that gets sliced: an index taken from the
    # unstripped content would be shifted by every space that precedes it.
    start = compact.find(digits)
    if start != -1:
        return compact[start:]
    return digits + compact


def post_process_vietin_info(single_pairs):
    """Reduce the candidate records of each field to its final output value.

    Args:
        single_pairs: Mapping ``field name -> list of candidate records``.
            Records are indexed with ``content``, ``raw_key_name``,
            ``lcs_score``, ``token_id`` and ``single_entity``.

    Returns:
        A dict initialised with ``None`` for every name in
        ``get_dict(type="key")`` and filled per field:

        * ``date``: the space-free content when there is exactly one
          candidate made of digits and ``/`` only; for zero or several
          candidates, ``"d/m/y"`` built from ``date_regexing``.
        * ``receiver`` / ``sender``: list of cleaned contents ordered by
          ``token_id``.
        * ``signee``: list of ``"content - raw_key_name"`` strings, ordered
          by ``token_id``, for candidates that are not single entities.
        * any other field: content of the candidate with the highest
          ``lcs_score``; ``number`` is additionally normalised.
    """
    vietin_outputs = {k: None for k in get_dict(type="key").keys()}

    def _by_token_id(candidate):
        return candidate['token_id']

    for key_name, candidates in single_pairs.items():
        # Compare with ``==``: ``key_name in ("date")`` is a substring test
        # against the string "date", not a tuple membership test.
        if key_name == "date":
            if len(candidates) == 1:
                compact = candidates[0]['content'].replace(" ", "")
                if compact.replace('/', '').isdigit():
                    vietin_outputs[key_name] = compact
            else:
                # Zero or several fragments: join them in reading order and
                # let the regex helper pull out day, month and year.
                ordered = sorted(candidates, key=_by_token_id)
                full_string = ' '.join(
                    c['raw_key_name'] + c['content'] for c in ordered
                )
                d, m, y = date_regexing(full_string)
                vietin_outputs[key_name] = f"{d}/{m}/{y}"
        elif key_name in ("receiver", "sender"):
            ordered = sorted(candidates, key=_by_token_id)
            vietin_outputs[key_name] = [
                remove_bullet_points_and_punctuation(c['content']) for c in ordered
            ]
        elif key_name == "signee":
            ordered = sorted(candidates, key=_by_token_id)
            vietin_outputs[key_name] = [
                f"{c['content']} - {c['raw_key_name']}"
                for c in ordered
                if not c['single_entity']
            ]
        else:
            if not candidates:
                continue
            # Keep the candidate with the highest LCS score.
            selected = max(candidates, key=lambda c: c['lcs_score'])
            if key_name == "number":
                vietin_outputs[key_name] = _extract_document_number(
                    selected['raw_key_name'], selected['content']
                )
            else:
                vietin_outputs[key_name] = selected['content']

    return vietin_outputs
|
|
|
|
def export_kvu_for_vietin(outputs):
    """Produce the Vietin field dict for one page of KVU outputs.

    Runs ``get_vietin_info`` followed by ``post_process_vietin_info`` and
    then sets ``'title'`` to the ``'text'`` of every entry in
    ``outputs["title"]``.
    """
    result = post_process_vietin_info(get_vietin_info(outputs))
    title_texts = []
    for entry in outputs["title"]:
        title_texts.append(entry['text'])
    result['title'] = title_texts
    return result
|
|
|
|
def merged_kvu_for_vietin_for_multi_pages(lvietin_outputs: list):
    """Merge per-page Vietin outputs into a single result.

    Args:
        lvietin_outputs: Per-page output dicts, in page order.

    Returns:
        A dict keyed by the names in ``get_dict("key")``. Values equal to
        ``None`` or to the placeholder ``"dd/mm/yyyy"`` are ignored. A field
        with no remaining value becomes ``None``; ``receiver`` takes the last
        collected value; ``number``, ``title``, ``date``, ``signee`` and
        ``sender`` take the first one. Any other field keeps the full list of
        collected values.
    """
    first_value_fields = ("number", "title", "date", "signee", "sender")
    merged = {field: [] for field in get_dict("key").keys()}

    # Gather every usable value per field across the pages.
    for page in lvietin_outputs:
        for field, page_value in page.items():
            if page_value is not None and page_value != "dd/mm/yyyy":
                merged[field].append(page_value)

    # Collapse each list according to the per-field rule.
    for field, values in merged.items():
        if not values:
            merged[field] = None
        elif field == "receiver":
            merged[field] = values[-1]
        elif field in first_value_fields:
            merged[field] = values[0]

    return merged