from sdsvkvu.utils.dictionary.manulife import get_dict
from sdsvkvu.utils.post_processing import (
    split_key_value_by_colon,
    remove_bullet_points_and_punctuation,
    longestCommonSubsequence,
    preprocessing,
)


def manulife_key_matching(text: str, threshold: float, dict_type: str):
    """Match *text* against the Manulife dictionary selected by *dict_type*.

    The text is first run through ``preprocessing``. Dictionary entries are
    then visited in order and, for each entry, two cheap checks are tried:

    * a candidate equal to the processed text -> score ``5 * (1 + len(text))``
    * a non-empty candidate contained in the processed text -> score ``5``

    If no entry matches that way, every entry is scored with the best
    ``longestCommonSubsequence(processed_text, candidate) / len(candidate)``
    ratio over its candidates, and the highest-scoring entry wins when its
    score is strictly greater than *threshold*.

    Args:
        text: Raw text to classify.
        threshold: Minimum (exclusive) LCS ratio for a fuzzy match.
        dict_type: Forwarded to ``get_dict(type=...)``.

    Returns:
        A 4-tuple ``(is_match, key, score, processed_text)``. ``key`` is the
        matched dictionary key, or the original *text* when nothing matched.
    """
    dictionary = get_dict(type=dict_type)
    processed_text = preprocessing(text)

    # NOTE(review): the exact and substring checks are interleaved per entry,
    # so a substring hit on an earlier entry wins over an exact hit on a later
    # one. Kept as-is because dictionary order may be deliberate - confirm.
    for key, candidates in dictionary.items():
        if any(candidate == processed_text for candidate in candidates):
            return True, key, 5 * (1 + len(processed_text)), processed_text
        # Empty candidates are skipped: "" is a substring of every string and
        # would otherwise turn the entry into a catch-all.
        if any(
            candidate and candidate in processed_text for candidate in candidates
        ):
            return True, key, 5, processed_text

    scores = {}
    for key, candidates in dictionary.items():
        ratios = [
            longestCommonSubsequence(processed_text, candidate) / len(candidate)
            for candidate in candidates
            if candidate  # avoid dividing by the length of an empty candidate
        ]
        scores[key] = max(ratios, default=0.0)

    if not scores:
        # Empty dictionary: nothing to compare against.
        return False, text, 0.0, processed_text

    best_key, best_score = max(scores.items(), key=lambda item: item[1])
    is_match = best_score > threshold
    return is_match, best_key if is_match else text, best_score, processed_text


def _clean_text(value):
    """Clean *value* with ``remove_bullet_points_and_punctuation``.

    ``None`` and empty values are returned untouched.
    """
    if value is None or len(value) == 0:
        return value
    return remove_bullet_points_and_punctuation(value)


def _normalize_table(table: dict) -> dict:
    """Return *table* with upper-cased, cleaned headers and rows as value lists."""
    headers = [
        remove_bullet_points_and_punctuation(header).upper()
        for header in table["headers"]
    ]
    # Each row dict is flattened to the list of its values, in dict order.
    data = [[_clean_text(cell) for cell in row.values()] for row in table["data"]]
    return {"headers": headers, "data": data}


def normalize_kvu_output_for_manulife(raw_outputs: dict) -> dict:
    """Clean the keys and values of an exported KVU result.

    * ``"tables"`` is always emitted under the same ``"tables"`` key (also
      when it is empty, so that consumers looking for that exact key find
      it); every table gets cleaned, upper-cased headers and its row dicts
      converted to lists of cleaned cell values.
    * Every other key is cleaned with ``remove_bullet_points_and_punctuation``
      and capitalized; its value (a list of texts or a single text) is
      cleaned as well, with ``None``/empty values passed through.

    Args:
        raw_outputs: Mapping produced by the export step.

    Returns:
        A new dict with normalized keys and values. Keys that normalize to
        the same string overwrite each other (last one wins).
    """
    outputs = {}
    for key, values in raw_outputs.items():
        if key == "tables":
            outputs[key] = [_normalize_table(table) for table in values or []]
            continue

        normalized_key = remove_bullet_points_and_punctuation(key).capitalize()
        if isinstance(values, list):
            values = [_clean_text(value) for value in values]
        else:
            values = _clean_text(values)
        outputs[normalized_key] = values
    return outputs


def export_kvu_for_manulife(raw_outputs: dict) -> dict:
    """Convert raw KVU predictions for one page into the Manulife export format.

    Reads the ``"title"``, ``"single"``, ``"key"``, ``"triplet"`` and
    ``"table"`` entries of *raw_outputs* (all are required) and builds a flat
    dict that is finally passed through ``normalize_kvu_output_for_manulife``.

    Args:
        raw_outputs: Raw KVU result for a single page.

    Returns:
        The normalized export dict: a title entry, one entry per extracted
        field, and a ``"tables"`` list.
    """
    outputs = {}

    # Title: score every title candidate and keep the best-scoring one.
    title_candidates = []
    for title in raw_outputs["title"]:
        is_match, title_name, score, processed_text = manulife_key_matching(
            title["text"], threshold=0.6, dict_type="title"
        )
        title_candidates.append(
            {
                "document_type": title_name if is_match else "",
                "content": title["text"],
                "processed_key_name": processed_text,
                "lcs_score": score,
                "token_id": title["id"],
            }
        )

    if title_candidates:
        best = max(title_candidates, key=lambda candidate: candidate["lcs_score"])
        outputs["title"] = f"({best['document_type']}) {best['content']}"
    else:
        outputs["title"] = None

    # Pairs of key-value
    for pair in raw_outputs["single"]:
        for key, values in pair.items():
            elements = split_key_value_by_colon(key, values["text"])
            outputs[elements[0]] = elements[1]

    # Only key fields (no value was detected for them)
    for key in raw_outputs["key"]:
        elements = split_key_value_by_colon(key["text"], None)
        outputs[elements[0]] = elements[1]

    # Triplet data: one key mapped to the texts of several values
    for triplet in raw_outputs["triplet"]:
        for key, list_value in triplet.items():
            outputs[key] = [value["text"] for value in list_value]

    # Table data
    header_boxes = {
        cell["header"]: cell["header_bbox"]
        for row in raw_outputs["table"]
        for cell in row
    }
    if header_boxes:
        # Order columns by the first bbox component
        # (presumably the left x coordinate - confirm).
        headers = [
            header
            for header, _ in sorted(
                header_boxes.items(), key=lambda entry: int(entry[1][0])
            )
        ]
        rows = []
        for row in raw_outputs["table"]:
            # Start from an all-None row so missing cells stay explicit.
            item = dict.fromkeys(headers)
            for cell in row:
                item[cell["header"]] = cell["text"]
            rows.append(item)
        outputs["tables"] = [{"headers": headers, "data": rows}]
    else:
        outputs["tables"] = []

    return normalize_kvu_output_for_manulife(outputs)


def merged_kvu_for_manulife_for_multi_pages(loutputs: list) -> dict:
    """Merge per-page export dicts into a single document-level dict.

    Args:
        loutputs: Export dicts, one per page, in page order.

    Returns:
        A dict where every non-table key holds the value from the last page
        that contained it, and ``"tables"`` is a list with one element per
        page that had a ``"tables"`` key (each element is that page's own
        ``"tables"`` value, appended as-is rather than flattened).
    """
    merged_outputs = {}
    tables = []
    for outputs in loutputs:
        for key, value in outputs.items():
            if key == "tables":
                tables.append(value)
            else:
                # Later pages overwrite earlier ones for the same key.
                merged_outputs[key] = value
    merged_outputs["tables"] = tables
    return merged_outputs