sbt-idp/cope2n-ai-fi/modules/_sdsvkvu/sdsvkvu/utils/query/all.py
2023-11-30 18:22:16 +07:00

75 lines
2.5 KiB
Python

from sdsvkvu.utils.post_processing import split_key_value_by_colon, remove_bullet_points_and_punctuation
def normalize_kvu_output(raw_outputs: dict) -> dict:
    """Return a copy of ``raw_outputs`` with keys and text values cleaned.

    Cleaning is delegated to ``remove_bullet_points_and_punctuation``.

    * The ``"table"`` entry is treated as an iterable of row mappings: every
      header is cleaned, and every cell is cleaned unless it is ``None`` or
      empty. The ``"table"`` key itself is stored unchanged.
    * For every other entry the key is cleaned. A list value has each of its
      elements cleaned; any other value is cleaned unless it is ``None`` or
      empty.

    Args:
        raw_outputs: Mapping of field name to extracted value(s).

    Returns:
        A new dictionary holding the cleaned keys and values.
    """
    clean = remove_bullet_points_and_punctuation

    def clean_if_present(text):
        # None and zero-length values are passed through untouched.
        if text is None or len(text) == 0:
            return text
        return clean(text)

    normalized = {}
    for field, content in raw_outputs.items():
        if field == "table":
            normalized[field] = [
                {clean(header): clean_if_present(cell) for header, cell in row.items()}
                for row in content
            ]
            continue

        cleaned_field = clean(field)
        if isinstance(content, list):
            # List elements are always cleaned, without a None/empty guard.
            normalized[cleaned_field] = [clean(entry) for entry in content]
        else:
            normalized[cleaned_field] = clean_if_present(content)
    return normalized
def export_kvu_for_all(raw_outputs: dict) -> dict:
    """Flatten raw KVU predictions into one normalized dictionary.

    The following entries of ``raw_outputs`` are read, in this order:

    * ``"title"``: the ``"text"`` of its first item, or ``None`` when empty.
    * ``"single"``: mappings of key -> item with a ``"text"`` field. Each pair
      goes through ``split_key_value_by_colon``; the first two items of its
      result are used as the output key and value.
    * ``"key"``: items with a ``"text"`` field and no value, passed through
      the same helper with ``None`` as the value.
    * ``"triplet"``: mappings of key -> list of items; the output value is the
      list of their ``"text"`` fields.
    * ``"table"``: rows of cells; each row becomes a ``header -> text`` dict.

    Later entries overwrite earlier ones that share a key. The assembled
    dictionary is passed through ``normalize_kvu_output`` before returning.

    Args:
        raw_outputs: Raw prediction dictionary with the entries listed above.

    Returns:
        The flattened, normalized dictionary, including a ``"table"`` list.
    """
    exported = {}

    # Title
    titles = raw_outputs["title"]
    if len(titles) > 0:
        exported["title"] = titles[0]["text"]
    else:
        exported["title"] = None

    # Pairs of key-value
    for pair in raw_outputs["single"]:
        for raw_key, item in pair.items():
            parts = split_key_value_by_colon(raw_key, item["text"])
            exported[parts[0]] = parts[1]

    # Only key fields
    for lone_key in raw_outputs["key"]:
        parts = split_key_value_by_colon(lone_key["text"], None)
        exported[parts[0]] = parts[1]

    # Triplet data
    for triplet in raw_outputs["triplet"]:
        exported.update(
            {
                name: [entry["text"] for entry in entries]
                for name, entries in triplet.items()
            }
        )

    # Table data
    exported["table"] = [
        {cell["header"]: cell["text"] for cell in row}
        for row in raw_outputs["table"]
    ]

    return normalize_kvu_output(exported)
def merged_kvu_for_all_for_multi_pages(loutputs: list) -> dict:
    """Merge per-page output dictionaries into a single dictionary.

    For every key other than ``"table"``, the value from the last page that
    contains the key wins (including ``None`` values). The ``"table"`` values
    are not flattened: they are collected in page order into a list with one
    element per page that has a ``"table"`` key.

    Args:
        loutputs: List of per-page output dictionaries.

    Returns:
        The merged dictionary; it always contains a ``"table"`` list, which is
        empty when no page provides one.
    """
    combined = {}
    for page in loutputs:
        # Later pages overwrite earlier ones for non-table fields.
        combined.update(
            (field, content) for field, content in page.items() if field != "table"
        )

    # One entry per page that carries a table, kept in page order.
    combined['table'] = [page["table"] for page in loutputs if "table" in page]
    return combined