"""Celery tasks for pre-processing uploaded request files and pushing data to S3.

Defines the ``do_pdf``, ``upload_file_to_s3`` and ``upload_obj_to_s3`` tasks
plus the two helpers that register an uploaded file against its request.
"""
from celery import shared_task
import time
import fitz
import uuid
import os
import base64
import boto3

from fwd_api.celery_worker.worker import app
from ..constant.common import (
    allowed_p_type, ProcessType, REQUEST_ID, FOLDER_TYPE,
    FolderFileType, TEMPLATE_ID, EntityStatus, standard_ocr_list, pdf_extensions,
)
from ..utils import FileUtils, ProcessUtil, S3_process
from celery.utils.log import get_task_logger
from fwd import settings

logger = get_task_logger(__name__)

# Shared client used by the upload tasks below; built once at import time
# from the project settings.
s3_client = S3_process.MinioS3Client(
    endpoint=settings.S3_ENDPOINT,
    access_key=settings.S3_ACCESS_KEY,
    secret_key=settings.S3_SECRET_KEY,
    bucket_name=settings.S3_BUCKET_NAME
)


def process_pdf_file(file_name: str, file_path: str, request, user) -> list:
    """Register an uploaded PDF against ``request`` and convert it to page images.

    Args:
        file_name: Name of the uploaded file, stored on the new file record.
        file_path: Location handed to ``FileUtils.get_file`` to read the PDF.
        request: The ``SubscriptionRequest`` the file belongs to.
        user: The user forwarded to ``ProcessUtil.pdf_to_images_urls``.

    Returns:
        Whatever ``ProcessUtil.pdf_to_images_urls`` returns for the document
        (presumably one entry per page, shaped like the dicts produced by
        ``process_image_file`` -- TODO confirm).
    """
    # Imported lazily, as in the rest of this module (presumably to avoid
    # import cycles with the Django models -- confirm before hoisting).
    from fwd_api.models import SubscriptionRequestFile

    # NOTE(review): neither the object returned by FileUtils.get_file nor the
    # fitz document is explicitly closed here; confirm whether
    # pdf_to_images_urls takes ownership of ``doc`` before adding a close().
    doc: fitz.Document = fitz.open(stream=FileUtils.get_file(file_path).read(), filetype="pdf")

    # Record for the original (un-split) file.
    new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(
        file_path=file_path,
        request=request,
        file_name=file_name,
        code=f'FIL{uuid.uuid4().hex}',
    )
    new_request_file.save()

    # Sub-files (per-page images).
    return ProcessUtil.pdf_to_images_urls(doc, request, user)


def process_image_file(file_name: str, file_path, request, user) -> list:
    """Register an uploaded image against ``request`` and describe it as one page.

    Args:
        file_name: Name of the uploaded file; also used to build its URL.
        file_path: Stored on the new file record.
        request: The ``SubscriptionRequest`` the file belongs to.
        user: Owner of the file; ``user.id`` is used to build the URL.

    Returns:
        A single-element list with the file URL, page number ``0`` and the
        code of the newly saved ``SubscriptionRequestFile``.
    """
    from fwd_api.models import SubscriptionRequestFile

    new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(
        file_path=file_path,
        request=request,
        file_name=file_name,
        code=f'FIL{uuid.uuid4().hex}',
    )
    new_request_file.save()

    return [{
        'file_url': FileUtils.build_url(FolderFileType.REQUESTS.value, request.request_id, user.id, file_name),
        'page_number': 0,
        'request_file_id': new_request_file.code
    }]


@app.task(name='do_pdf')
def process_pdf(rq_id, sub_id, p_type, user_id, file_name, file_path):
    """Pre-process an uploaded file and dispatch it to the matching queue.

    Args:
        rq_id: ``request_id`` of the ``SubscriptionRequest`` to process.
        sub_id: Forwarded unchanged to ``ProcessUtil.send_to_queue2``.
        p_type: Process type; selects which queue(s) receive the pages.
        user_id: Primary key of the ``UserProfile`` that owns the request.
        file_name: Name of the uploaded file; its extension picks the handler.
        file_path: Location of the uploaded file.

    Raises:
        IndexError: If no ``SubscriptionRequest`` matches ``rq_id``.
    """
    from fwd_api.models import SubscriptionRequest, UserProfile

    new_request = SubscriptionRequest.objects.filter(request_id=rq_id)[0]
    # NOTE(review): ``first()`` yields None when no profile matches, which
    # would only surface later as an AttributeError -- confirm callers
    # always pass a valid user_id.
    user = UserProfile.objects.filter(id=user_id).first()

    # Text after the last dot; the whole name when there is no dot.
    file_extension = file_name.split(".")[-1]
    if file_extension in pdf_extensions:
        b_url = process_pdf_file(file_name, file_path, new_request, user)
    else:
        b_url = process_image_file(file_name, file_path, new_request, user)

    # Two independent checks (not if/elif): both queues are used if p_type
    # satisfies both conditions.
    if p_type in standard_ocr_list:
        ProcessUtil.send_to_queue2(rq_id, sub_id, b_url, user_id, p_type)
    if p_type == ProcessType.TEMPLATE_MATCHING.value:
        ProcessUtil.send_template_queue(rq_id, b_url, '', user_id)


@app.task(name='upload_file_to_s3')
def upload_file_to_s3(local_file_path, s3_key):
    """Upload a local file to S3 and delete the local copy on success.

    Args:
        local_file_path: Path of the file to upload (removed after a
            successful upload).
        s3_key: Destination key passed to the S3 client.
    """
    if s3_client.s3_client is None:
        logger.info("[INFO] S3 is not available, skipping,...")
        return

    res = s3_client.upload_file(local_file_path, s3_key)
    # Only drop the local copy once the upload is confirmed with HTTP 200;
    # otherwise the file is left in place.
    if res is not None and res["ResponseMetadata"]["HTTPStatusCode"] == 200:
        os.remove(local_file_path)


@app.task(name='upload_obj_to_s3')
def upload_obj_to_s3(byte_obj, s3_key):
    """Decode a base64 payload and store it in S3 under ``s3_key``.

    Args:
        byte_obj: Base64-encoded content to upload.
        s3_key: Destination key passed to the S3 client.
    """
    if s3_client.s3_client is None:
        logger.info("[INFO] S3 is not available, skipping,...")
        return

    obj = base64.b64decode(byte_obj)
    s3_client.update_object(s3_key, obj)