2024-10-29 04:07:30 +00:00
|
|
|
import logging
|
|
|
|
import logging.config
|
2023-11-30 11:22:16 +00:00
|
|
|
import os
|
2023-12-27 11:19:17 +00:00
|
|
|
import time
|
2023-11-30 11:22:16 +00:00
|
|
|
|
2024-10-29 04:07:30 +00:00
|
|
|
from opentelemetry import trace
|
2023-11-30 11:22:16 +00:00
|
|
|
|
|
|
|
from api.sdsap_sbt.prediction_sbt import predict as predict_sbt
|
2024-10-29 04:07:30 +00:00
|
|
|
from common.utils_kvu.split_docs import merge_sbt_output
|
2024-07-05 13:14:47 +00:00
|
|
|
from utils.logging.logging import LOGGER_CONFIG
|
2024-10-29 04:07:30 +00:00
|
|
|
|
2024-07-05 13:14:47 +00:00
|
|
|
# Module-level setup: configure logging, tracing, and environment before any
# task code in this module runs.
logging.config.dictConfig(LOGGER_CONFIG)
# Module logger; configuration comes from LOGGER_CONFIG above.
logger = logging.getLogger(__name__)
# OpenTelemetry tracer used to instrument the prediction pipeline spans.
tracer = trace.get_tracer("sbt_celery_ai")

# NOTE(review): hardcoded, user-specific absolute path — presumably needed so
# subprocesses can import the project package; confirm this is still required
# and consider moving it to deployment configuration.
os.environ['PYTHONPATH'] = '/home/thucpd/thucpd/cope2n-ai/cope2n-ai/'
|
def check_label_exists(array, target_label):
    """Return True if any dict in *array* has ``obj["label"] == target_label``.

    Args:
        array (list[dict]): objects that each carry a "label" key.
        target_label: label value to look for.

    Returns:
        bool: True when the label exists in the array, False otherwise
        (including for an empty array).
    """
    # any() short-circuits on the first match, same as the original loop.
    return any(obj["label"] == target_label for obj in array)
|
|
|
|
|
|
|
|
def update_null_values(kvu_result, next_kvu_result):
    """Fill None-valued fields of *kvu_result* in place from *next_kvu_result*.

    A field is only overwritten when it is currently None and the
    replacement value from *next_kvu_result* is not None. Keys missing
    from *next_kvu_result* are left untouched.
    """
    for field in kvu_result:
        # Guard: only fields that are currently unset are candidates.
        if kvu_result[field] is not None:
            continue
        replacement = next_kvu_result.get(field)
        if replacement is not None:
            kvu_result[field] = replacement
|
|
|
|
|
|
|
|
def replace_empty_null_values(my_dict):
    """Replace empty-string values in *my_dict* with None, in place.

    Returns the same (mutated) dict for call-chaining convenience.
    """
    # Collect the affected keys first, then rewrite them in a second pass.
    empty_keys = [k for k, v in my_dict.items() if v == '']
    for k in empty_keys:
        my_dict[k] = None
    return my_dict
|
|
|
|
|
2024-10-29 04:07:30 +00:00
|
|
|
@tracer.start_as_current_span("compile_output_sbt")
def compile_output_sbt(list_url, metadata):
    """Run SBT prediction over every page and merge the results into one document.

    Args:
        list_url (list[dict]): one entry per page; each entry carries
            "page_number" and "file_url" keys and an optional "doc_type" key
            that, when present, is copied onto that page's prediction output.
        metadata: opaque request metadata forwarded to ``predict_sbt``.

    Returns:
        dict: compiled output with keys "total_pages", "ocr_num_pages",
        "document" (the merged SBT output), and "inference_profile"
        (wall-clock timestamps around inference and post-processing).
    """
    inference_profile = {}

    # NOTE: the original also built a preliminary `results` dict with static
    # "model" metadata here, but it was unconditionally overwritten below and
    # never returned — removed as dead code.

    outputs = []
    start = time.time()
    # One timestamp appended per finished page; paired with `start` so
    # callers can derive per-page inference durations.
    pages_predict_time = []
    for page in list_url:
        output_model = predict_sbt(page['page_number'], page['file_url'], metadata)
        pages_predict_time.append(time.time())
        if "doc_type" in page:
            output_model['doc_type'] = page['doc_type']
        outputs.append(output_model)

    start_postprocess = time.time()
    documents = merge_sbt_output(outputs)

    # [start, end] timestamp pairs; "inference" pairs the overall start with
    # the per-page completion timestamps collected above.
    inference_profile["postprocess"] = [start_postprocess, time.time()]
    inference_profile["inference"] = [start, pages_predict_time]

    results = {
        "total_pages": len(list_url),
        "ocr_num_pages": len(list_url),
        "document": documents,
        "inference_profile": inference_profile,
    }
    return results