sbt-idp/cope2n-ai-fi/common/process_pdf.py
PhanThanhTrung 5bcbc257de update
2024-10-29 15:03:31 +07:00

86 lines
2.4 KiB
Python
Executable File

import logging
import logging.config
import os
import time
from opentelemetry import trace
from api.sdsap_sbt.prediction_sbt import predict as predict_sbt
from common.utils_kvu.split_docs import merge_sbt_output
from utils.logging.logging import LOGGER_CONFIG
# Configure logging from LOGGER_CONFIG before creating this module's logger.
logging.config.dictConfig(LOGGER_CONFIG)
logger = logging.getLogger(__name__)
# Tracer used below to wrap compile_output_sbt in an OpenTelemetry span.
tracer = trace.get_tracer("sbt_celery_ai")
# NOTE(review): hard-coded, developer-specific absolute path. Assigning
# PYTHONPATH at runtime is only seen by child processes that inherit the
# environment; it presumably does not alter this interpreter's import path.
# TODO confirm whether this assignment is still needed.
os.environ['PYTHONPATH'] = '/home/thucpd/thucpd/cope2n-ai/cope2n-ai/'
def check_label_exists(array, target_label):
    """Return True if any item in ``array`` has ``"label"`` equal to ``target_label``.

    Args:
        array: Iterable of mappings; each one is indexed with ``"label"``.
        target_label: Value compared against each item's ``"label"``.

    Returns:
        bool: True on the first match, False when no item matches
        (including when ``array`` is empty).
    """
    # any() short-circuits on the first matching item.
    return any(item["label"] == target_label for item in array)
def update_null_values(kvu_result, next_kvu_result):
    """Fill ``None`` entries of ``kvu_result`` in place from ``next_kvu_result``.

    For every key of ``kvu_result`` whose value is ``None``, the value is
    replaced by ``next_kvu_result``'s value for the same key when that value
    exists and is not ``None``. Keys present only in ``next_kvu_result`` are
    not added.

    Args:
        kvu_result (dict): Mapping that is updated in place.
        next_kvu_result (dict): Mapping consulted for replacement values.

    Returns:
        None
    """
    # Only existing keys are reassigned, so the dict size never changes
    # while it is being iterated.
    for field in kvu_result:
        if kvu_result[field] is not None:
            continue
        if next_kvu_result.get(field) is None:
            continue
        kvu_result[field] = next_kvu_result[field]
def replace_empty_null_values(my_dict):
    """Replace every value equal to ``''`` with ``None``, in place.

    Args:
        my_dict (dict): Mapping to normalise; it is mutated.

    Returns:
        dict: The same ``my_dict`` object that was passed in.
    """
    # Collect the affected keys first, then overwrite them.
    blank_keys = [name for name, content in my_dict.items() if content == '']
    for name in blank_keys:
        my_dict[name] = None
    return my_dict
@tracer.start_as_current_span("compile_output_sbt")
def compile_output_sbt(list_url, metadata):
    """Run SBT prediction on every page and merge the per-page outputs.

    Args:
        list_url (list): One dict per page. Each dict must provide
            ``page_number`` and ``file_url``; when a ``doc_type`` key is
            present it is copied onto that page's prediction output.
        metadata: Passed unchanged to ``predict_sbt`` for every page.

    Returns:
        dict: A dict with the keys

        * ``total_pages`` and ``ocr_num_pages``: both ``len(list_url)``.
        * ``document``: the value returned by ``merge_sbt_output`` for the
          list of per-page outputs.
        * ``inference_profile``: ``time.time()`` timestamps, where
          ``inference`` is ``[start, [timestamp after each page]]`` and
          ``postprocess`` is ``[start, end]`` of the merge step.
    """
    outputs = []
    pages_predict_time = []
    start = time.time()
    for page in list_url:
        output_model = predict_sbt(page['page_number'], page['file_url'], metadata)
        # Timestamp taken right after this page's prediction returned.
        pages_predict_time.append(time.time())
        if "doc_type" in page:
            output_model['doc_type'] = page['doc_type']
        outputs.append(output_model)

    start_postprocess = time.time()
    documents = merge_sbt_output(outputs)
    # The end-of-postprocess timestamp is evaluated here, after the merge.
    inference_profile = {
        "postprocess": [start_postprocess, time.time()],
        "inference": [start, pages_predict_time],
    }
    return {
        "total_pages": len(list_url),
        "ocr_num_pages": len(list_url),
        "document": documents,
        "inference_profile": inference_profile,
    }