import re

from sdsvkvu.utils.post_processing import (
    preprocessing,
    date_regexing,
    remove_bullet_points_and_punctuation,
)
from sdsvkvu.utils.dictionary.vtb import get_dict


# For Vietin Bank project


def vietin_key_matching(text: str, threshold: float, dict_type: str) -> tuple:
    """Map a raw key string onto a canonical key name.

    The text is normalised with ``preprocessing`` and then matched, in order:

    1. against the candidates of the "date" dictionary,
    2. against the "dien" suffix that marks a sender line,
    3. against the candidates of the ``dict_type`` dictionary (extended with
       the "extra" dictionary entries for the same key).

    A candidate matches when it is a substring of the normalised text.

    Args:
        text: Raw key text.
        threshold: Minimum score for the (currently disabled) LCS fallback.
        dict_type: Dictionary name passed to ``get_dict``.

    Returns:
        A ``(key_name, score, processed_text)`` tuple. For a date match the
        third element is the matched date sub-key rather than the processed
        text. When nothing matches, ``key_name`` is the original ``text``.
    """
    dictionary = get_dict(type=dict_type)
    processed_text = preprocessing(text)

    # Step 1: exact (substring) matching.
    for time_key, candidates in get_dict("date").items():
        if any(txt in processed_text for txt in candidates):
            return "date", 5, time_key

    # EX: "Bộ trưởng Bộ GTVT điện: A, B, C" -> the line names a sender.
    # This test does not depend on the dictionary entry, so it is done once
    # up front instead of on every loop iteration.
    if processed_text[-4:] == "dien":
        return "sender", 15, processed_text

    extra_dict = get_dict("extra")
    for key, candidates in dictionary.items():
        if key in extra_dict:
            candidates = candidates + extra_dict[key]
        if any(txt in processed_text for txt in candidates):
            return key, 10, processed_text

    # Step 2: LCS score.
    # Scoring is disabled temporarily, so every score stays at 0.0.
    scores = {k: 0.0 for k in dictionary}
    # for k, v in dictionary.items():
    #     if k in ("date", "title", "number", 'signee', 'sender', 'receiver'): continue
    #     scores[k] = max([longestCommonSubsequence(processed_text, key)/len(key) for key in dictionary[k]])

    # ``default`` keeps an empty dictionary from raising ValueError.
    key, score = max(scores.items(), key=lambda item: item[1], default=(text, 0.0))
    return (key if score > threshold else text), score, processed_text


def get_vietin_info(outputs) -> dict:
    """Group extracted entities by canonical Vietin key name.

    Args:
        outputs: Mapping that provides
            - ``'single'``: iterable of ``{raw_key: {'text', 'id'}}`` dicts,
            - ``'key'`` and ``'value'``: lists of ``{'text', 'id'}`` items,
            - ``'triplet'``: iterable of ``{raw_key: [{'text', 'id'}, ...]}``.

    Returns:
        ``{key_name: [entry, ...]}`` for every key of ``get_dict(type="key")``.
        Each entry holds ``content``, ``processed_key_name``, ``raw_key_name``,
        ``lcs_score``, ``token_id`` and ``single_entity``.
    """
    single_pairs = {k: [] for k in get_dict(type="key")}

    # Key/value pairs.
    for pair in outputs['single']:
        for raw_key_name, value in pair.items():
            key_name, score, processed_text = vietin_key_matching(
                raw_key_name, threshold=0.8, dict_type="key"
            )
            if key_name in single_pairs:
                single_pairs[key_name].append({
                    'content': value['text'],
                    'processed_key_name': processed_text,
                    'raw_key_name': raw_key_name,
                    'lcs_score': score,
                    'token_id': value['id'],
                    'single_entity': False,
                })

    # Stand-alone keys and values: the text serves as both key and content.
    for single_item in outputs['key'] + outputs['value']:
        key_name, score, processed_text = vietin_key_matching(
            single_item['text'], threshold=0.8, dict_type="key"
        )
        if key_name in single_pairs:
            single_pairs[key_name].append({
                'content': single_item['text'],
                'processed_key_name': processed_text,
                'raw_key_name': single_item['text'],
                'lcs_score': score,
                'token_id': single_item['id'],
                'single_entity': True,
            })

    # Sender and receiver are usually in triplet.
    for triplet in outputs['triplet']:
        for raw_key_name, value_list in triplet.items():
            key_name, score, processed_text = vietin_key_matching(
                raw_key_name, threshold=0.8, dict_type="key"
            )
            if key_name not in single_pairs:
                continue
            for item in value_list:
                single_pairs[key_name].append({
                    'content': item['text'],
                    'raw_key_name': raw_key_name,
                    'processed_key_name': processed_text,
                    'lcs_score': score,
                    'token_id': item['id'],
                    'single_entity': False,
                })

    return single_pairs


def post_process_vietin_info(single_pairs) -> dict:
    """Reduce the candidate entries of each key to its final value.

    Args:
        single_pairs: Mapping produced by ``get_vietin_info``.

    Returns:
        ``{key_name: value}`` for every key of ``get_dict(type="key")``;
        keys without a usable candidate stay ``None``.
    """
    vietin_outputs = {k: None for k in get_dict(type="key")}

    for key_name, candidates in single_pairs.items():
        # Compare with ``==``: the original ``key_name in ("date")`` was a
        # substring test against a plain string, not tuple membership.
        if key_name == "date":
            if len(candidates) == 1:
                check_string = candidates[0]['content'].replace(" ", "")
                # NOTE(review): a single non-numeric candidate leaves the
                # date unset - confirm this is intended.
                if check_string.replace('/', '').isdigit():
                    vietin_outputs[key_name] = check_string
            else:
                ordered = sorted(candidates, key=lambda x: x['token_id'])
                full_string = ' '.join(
                    v['raw_key_name'] + v['content'] for v in ordered
                )
                d, m, y = date_regexing(full_string)
                vietin_outputs[key_name] = f"{d}/{m}/{y}"

        elif key_name in ("receiver", "sender"):
            ordered = sorted(candidates, key=lambda x: x['token_id'])
            vietin_outputs[key_name] = [
                remove_bullet_points_and_punctuation(value['content'])
                for value in ordered
            ]

        elif key_name == "signee":
            ordered = sorted(candidates, key=lambda x: x['token_id'])
            vietin_outputs[key_name] = [
                f"{value['content']} - {value['raw_key_name']}"
                for value in ordered
                if not value['single_entity']
            ]

        else:
            if not candidates:
                continue
            # Keep the candidate with the highest matching score.
            selected_value = max(candidates, key=lambda x: x['lcs_score'])
            vietin_outputs[key_name] = selected_value['content']

            if key_name == "number":
                number = re.sub(r"[^0-9]", "", selected_value['raw_key_name'])
                # Search in the space-stripped content so the index applies
                # to the same string that gets sliced; searching the raw
                # content shifted the slice by the number of removed spaces.
                compact = selected_value['content'].replace(" ", "")
                start_idx = compact.find(number)
                if start_idx != -1:
                    vietin_outputs[key_name] = compact[start_idx:]
                else:
                    vietin_outputs[key_name] = number + compact

    return vietin_outputs


def export_kvu_for_vietin(outputs) -> dict:
    """Build the Vietin result for one page.

    Runs ``get_vietin_info`` and ``post_process_vietin_info`` on ``outputs``
    and sets ``'title'`` to the list of texts found in ``outputs["title"]``.
    """
    single_pairs = get_vietin_info(outputs)
    vietin_outputs = post_process_vietin_info(single_pairs)
    vietin_outputs['title'] = [title['text'] for title in outputs["title"]]
    return vietin_outputs


def merged_kvu_for_vietin_for_multi_pages(lvietin_outputs: list) -> dict:
    """Merge per-page Vietin results into one document-level result.

    Args:
        lvietin_outputs: Per-page dicts, in page order.

    Returns:
        ``{key_name: value}``. ``None`` and the ``"dd/mm/yyyy"`` placeholder
        are ignored; ``receiver`` takes the last collected value, ``number``,
        ``title``, ``date``, ``signee`` and ``sender`` take the first, any
        other key keeps the list of collected values, and keys with nothing
        collected become ``None``.
    """
    merged_outputs = {k: [] for k in get_dict("key")}

    for outputs in lvietin_outputs:
        for key_name, value in outputs.items():
            if value is None or value == "dd/mm/yyyy":
                continue
            # NOTE(review): empty lists are still collected here, so an
            # empty first page can win over a later non-empty one - confirm.
            merged_outputs[key_name].append(value)

    for key, value in merged_outputs.items():
        if not value:
            merged_outputs[key] = None
        elif key == "receiver":
            merged_outputs[key] = value[-1]
        elif key in ("number", "title", "date", "signee", "sender"):
            merged_outputs[key] = value[0]

    return merged_outputs