from sdsvkvu.utils.post_processing import split_key_value_by_colon, remove_bullet_points_and_punctuation


def _clean_text(value):
    """Clean *value* with ``remove_bullet_points_and_punctuation``.

    ``None`` and empty values are returned unchanged so that missing fields
    pass through normalization instead of being handed to the cleaner.
    """
    if value is not None and len(value) > 0:
        return remove_bullet_points_and_punctuation(value)
    return value


def normalize_kvu_output(raw_outputs: dict) -> dict:
    """Return a copy of *raw_outputs* with keys and text values cleaned.

    Args:
        raw_outputs: Mapping of field name to value. The ``"table"`` entry is
            iterated as rows, each row being a mapping of header to cell
            value. Any other entry is either a list of values or a single
            value.

    Returns:
        A new dict. Table headers and all other keys are passed through
        ``remove_bullet_points_and_punctuation``; the ``"table"`` key itself
        is kept verbatim. Values are cleaned the same way, except ``None``
        and empty values, which are kept as they are.

    Note:
        Two keys that clean to the same string collapse into one entry; the
        one seen last wins.
    """
    outputs = {}
    for key, values in raw_outputs.items():
        if key == "table":
            outputs[key] = [
                {
                    remove_bullet_points_and_punctuation(header): _clean_text(cell)
                    for header, cell in row.items()
                }
                for row in values
            ]
            continue

        key = remove_bullet_points_and_punctuation(key)
        if isinstance(values, list):
            # Guard each element the same way scalar values and table cells
            # are guarded, so a None/empty entry does not reach the cleaner.
            values = [_clean_text(value) for value in values]
        else:
            values = _clean_text(values)
        outputs[key] = values
    return outputs


def export_kvu_for_all(raw_outputs: dict) -> dict:
    """Flatten raw KVU output into a single normalized dict.

    Args:
        raw_outputs: Mapping that must provide these keys:

            * ``"title"``: sequence of dicts with a ``"text"`` entry; only
              the first one is used.
            * ``"single"``: iterable of dicts mapping a key to a dict with a
              ``"text"`` entry.
            * ``"key"``: iterable of dicts with a ``"text"`` entry (keys
              that have no value).
            * ``"triplet"``: iterable of dicts mapping a key to a list of
              dicts with a ``"text"`` entry.
            * ``"table"``: iterable of rows, each row an iterable of cells
              with ``"header"`` and ``"text"`` entries.

    Returns:
        The flattened dict after ``normalize_kvu_output``. Fields are written
        in the order title, single, key, triplet, table, so a later section
        overwrites an earlier one on a key collision.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    outputs = {}

    # Title
    titles = raw_outputs["title"]
    outputs["title"] = titles[0]["text"] if len(titles) > 0 else None

    # Pairs of key-value
    for pair in raw_outputs["single"]:
        for key, values in pair.items():
            # The first two items returned by split_key_value_by_colon are
            # used as the field name and its value.
            elements = split_key_value_by_colon(key, values["text"])
            outputs[elements[0]] = elements[1]

    # Only key fields
    for key in raw_outputs["key"]:
        elements = split_key_value_by_colon(key["text"], None)
        outputs[elements[0]] = elements[1]

    # Triplet data
    for triplet in raw_outputs["triplet"]:
        for key, list_value in triplet.items():
            outputs[key] = [value["text"] for value in list_value]

    # Table data
    outputs["table"] = [
        {cell["header"]: cell["text"] for cell in row}
        for row in raw_outputs["table"]
    ]

    return normalize_kvu_output(outputs)


def merged_kvu_for_all_for_multi_pages(loutputs: list) -> dict:
    """Merge per-page output dicts into one dict.

    Args:
        loutputs: List of per-page dicts, iterated in order.

    Returns:
        A dict holding every non-table key found across the pages; when a key
        occurs on several pages the value from the last such page wins. The
        ``"table"`` entry is always present and is a list with one element
        per page that had a ``"table"`` key, in page order.
    """
    merged_outputs = {}
    table = []
    for outputs in loutputs:
        for key, value in outputs.items():
            if key == "table":
                table.append(value)
            else:
                merged_outputs[key] = value
    merged_outputs['table'] = table
    return merged_outputs