sbt-idp/cope2n-ai-fi/common/process_pdf.py

86 lines
2.4 KiB
Python
Raw Permalink Normal View History

2024-10-29 04:07:30 +00:00
import logging
import logging.config
2023-11-30 11:22:16 +00:00
import os
import time
2023-11-30 11:22:16 +00:00
2024-10-29 04:07:30 +00:00
from opentelemetry import trace
2023-11-30 11:22:16 +00:00
from api.sdsap_sbt.prediction_sbt import predict as predict_sbt
2024-10-29 04:07:30 +00:00
from common.utils_kvu.split_docs import merge_sbt_output
2024-07-05 13:14:47 +00:00
from utils.logging.logging import LOGGER_CONFIG
2024-10-29 04:07:30 +00:00
2024-07-05 13:14:47 +00:00
# Apply the shared LOGGER_CONFIG dict to the logging system, then create
# the module-level logger named after this module.
logging.config.dictConfig(LOGGER_CONFIG)
logger = logging.getLogger(__name__)
2024-10-29 08:03:31 +00:00
# OpenTelemetry tracer; its start_as_current_span decorator wraps
# compile_output_sbt further down in this module.
tracer = trace.get_tracer("sbt_celery_ai")
2023-11-30 11:22:16 +00:00
# NOTE(review): hard-coded, developer-specific absolute path. Assigning
# PYTHONPATH here only updates this process's environment mapping (and is
# inherited by child processes); the running interpreter's sys.path was
# already built at startup. Confirm whether this is still needed.
os.environ['PYTHONPATH'] = '/home/thucpd/thucpd/cope2n-ai/cope2n-ai/'
def check_label_exists(array, target_label):
    """Return True if any item of ``array`` has ``"label"`` equal to ``target_label``.

    Args:
        array: Iterable of mappings; each examined item is indexed with
            the ``"label"`` key.
        target_label: Value compared against each item's ``"label"``.

    Returns:
        bool: True as soon as a matching item is found, False when no item
        matches (including when ``array`` is empty).

    Raises:
        KeyError: If an item examined before a match has no ``"label"`` key.
    """
    # any() short-circuits on the first match, like the explicit loop did.
    return any(obj["label"] == target_label for obj in array)
def update_null_values(kvu_result, next_kvu_result):
    """Fill ``None`` entries of ``kvu_result`` in place from ``next_kvu_result``.

    A key is updated only when its current value is ``None`` and
    ``next_kvu_result`` holds a non-``None`` value for the same key.
    Keys present only in ``next_kvu_result`` are not added.

    Args:
        kvu_result (dict): Mapping to update; mutated in place.
        next_kvu_result (dict): Mapping supplying replacement values.

    Returns:
        None
    """
    # Only existing keys are reassigned, so the dict size never changes
    # while it is being iterated.
    for field in kvu_result:
        if kvu_result[field] is not None:
            continue
        if next_kvu_result.get(field) is None:
            continue
        kvu_result[field] = next_kvu_result[field]
def replace_empty_null_values(my_dict):
    """Replace every value equal to ``''`` in ``my_dict`` with ``None``.

    The dictionary is modified in place and the same object is returned.

    Args:
        my_dict (dict): Mapping to normalise.

    Returns:
        dict: ``my_dict`` itself, after the replacement.
    """
    blank_keys = [name for name, content in my_dict.items() if content == '']
    for name in blank_keys:
        my_dict[name] = None
    return my_dict
2024-10-29 04:07:30 +00:00
@tracer.start_as_current_span("compile_output_sbt")
def compile_output_sbt(list_url, metadata):
    """Run SBT prediction on every page and merge the per-page outputs.

    Args:
        list_url (list): One dict per page. Each dict must provide
            ``"page_number"`` and ``"file_url"``; an optional ``"doc_type"``
            is copied onto that page's prediction output.
        metadata: Passed through unchanged to ``predict_sbt`` for each page.

    Returns:
        dict: Compiled output with the keys

        * ``"total_pages"`` and ``"ocr_num_pages"``: ``len(list_url)``.
        * ``"document"``: result of ``merge_sbt_output`` over all page outputs.
        * ``"inference_profile"``: ``time.time()`` timestamps, where
          ``"postprocess"`` is ``[start, end]`` of the merge step and
          ``"inference"`` is ``[loop start, [finish time of each page]]``.
    """
    outputs = []
    pages_predict_time = []
    start = time.time()

    for page in list_url:
        output_model = predict_sbt(page['page_number'], page['file_url'], metadata)
        # Record when this page's prediction finished.
        pages_predict_time.append(time.time())
        if "doc_type" in page:
            output_model['doc_type'] = page['doc_type']
        outputs.append(output_model)

    start_postprocess = time.time()
    documents = merge_sbt_output(outputs)

    inference_profile = {
        "postprocess": [start_postprocess, time.time()],
        "inference": [start, pages_predict_time],
    }

    return {
        "total_pages": len(list_url),
        "ocr_num_pages": len(list_url),
        "document": documents,
        "inference_profile": inference_profile,
    }