92 lines
3.8 KiB
Python
Executable File
92 lines
3.8 KiB
Python
Executable File
from celery import shared_task
|
|
import time
|
|
import fitz
|
|
import uuid
|
|
import os
|
|
import base64
|
|
import boto3
|
|
|
|
from fwd_api.celery_worker.worker import app
|
|
from ..constant.common import allowed_p_type, ProcessType, REQUEST_ID, FOLDER_TYPE, \
|
|
FolderFileType, TEMPLATE_ID, EntityStatus, standard_ocr_list, pdf_extensions
|
|
from ..utils import FileUtils, ProcessUtil, S3_process
|
|
from celery.utils.log import get_task_logger
|
|
from fwd import settings
|
|
|
|
|
|
# Celery task logger named after this module.
logger = get_task_logger(__name__)

# Module-level storage client used by the upload tasks in this file.
# All connection parameters are taken from the project settings.
s3_client = S3_process.MinioS3Client(
    endpoint=settings.S3_ENDPOINT,
    access_key=settings.S3_ACCESS_KEY,
    secret_key=settings.S3_SECRET_KEY,
    bucket_name=settings.S3_BUCKET_NAME,
)
|
|
|
|
def process_pdf_file(file_name: str, file_path: str, request, user) -> list:
    """Register an uploaded PDF for a request and build its page entries.

    The file is read through ``FileUtils.get_file`` and opened in memory as a
    PDF, a ``SubscriptionRequestFile`` row is saved for the original upload,
    and the opened document is handed to ``ProcessUtil.pdf_to_images_urls``.

    Args:
        file_name: Name of the uploaded file; stored on the new row.
        file_path: Passed to ``FileUtils.get_file`` and stored on the new row.
        request: Request object the new file row is attached to.
        user: Forwarded unchanged to ``ProcessUtil.pdf_to_images_urls``.

    Returns:
        The value returned by ``ProcessUtil.pdf_to_images_urls``.
    """
    # Function-scope imports, kept exactly as in the original
    # (presumably to avoid import-time cycles with the models - TODO confirm).
    from fwd_api.models import SubscriptionRequest, SubscriptionRequestFile
    from fwd_api.constant.common import ProcessType

    # The document is opened before the row is saved, so a failure to open
    # the PDF leaves no SubscriptionRequestFile behind.
    pdf_bytes = FileUtils.get_file(file_path).read()
    # NOTE(review): the document is never closed here; whether it can be
    # closed after the helper returns depends on pdf_to_images_urls - confirm.
    doc: fitz.Document = fitz.open(stream=pdf_bytes, filetype="pdf")

    # Origin file: one row for the upload as received.
    origin_file = SubscriptionRequestFile(
        file_path=file_path,
        request=request,
        file_name=file_name,
        code=f'FIL{uuid.uuid4().hex}',
    )
    origin_file.save()

    # Sub-files: delegated to the helper.
    return ProcessUtil.pdf_to_images_urls(doc, request, user)
|
|
|
|
|
|
def process_image_file(file_name: str, file_path, request, user) -> list:
    """Register an uploaded image for a request and describe it as one page.

    Saves a ``SubscriptionRequestFile`` row for the upload and returns a
    single-element list describing it.

    Args:
        file_name: Name of the uploaded file; stored on the row and used in
            the URL.
        file_path: Stored on the new row.
        request: Request object the row is attached to; its ``request_id``
            is used in the URL.
        user: Object whose ``id`` is used in the URL.

    Returns:
        A list with one dict holding ``file_url``, ``page_number`` (always
        ``0``) and ``request_file_id`` (the new row's ``code``).
    """
    from fwd_api.models import SubscriptionRequest, SubscriptionRequestFile

    file_record = SubscriptionRequestFile(
        file_path=file_path,
        request=request,
        file_name=file_name,
        code=f'FIL{uuid.uuid4().hex}',
    )
    file_record.save()

    # The URL is built only after the row has been saved, as before.
    file_url = FileUtils.build_url(
        FolderFileType.REQUESTS.value,
        request.request_id,
        user.id,
        file_name,
    )
    page_entry = {
        'file_url': file_url,
        'page_number': 0,
        'request_file_id': file_record.code,
    }
    return [page_entry]
|
|
|
|
|
|
@app.task(name='do_pdf')
def process_pdf(rq_id, sub_id, p_type, user_id, file_name, file_path):
    """Pre-process an uploaded file and dispatch it to the matching queue(s).

    Looks up the request and user, registers the upload through
    ``process_pdf_file`` or ``process_image_file`` depending on the file
    extension, then forwards the resulting entries to the queue(s) selected
    by ``p_type``.

    Args:
        rq_id: ``request_id`` of the ``SubscriptionRequest`` to process.
        sub_id: Forwarded unchanged to ``ProcessUtil.send_to_queue2``.
        p_type: Process type; compared against ``standard_ocr_list`` and
            ``ProcessType.TEMPLATE_MATCHING.value``.
        user_id: Primary key used to look up the ``UserProfile``.
        file_name: Name of the uploaded file; its extension picks the path.
        file_path: Location of the uploaded file.

    Raises:
        IndexError: If no ``SubscriptionRequest`` matches ``rq_id``.
    """
    from fwd_api.models import SubscriptionRequest, UserProfile

    new_request = SubscriptionRequest.objects.filter(request_id=rq_id)[0]
    # .first() yields None when no profile matches; it is passed on as-is.
    user = UserProfile.objects.filter(id=user_id).first()

    # Text after the last dot (the whole name when there is no dot).
    file_extension = file_name.split(".")[-1]
    # The exact match is kept so everything that matched before still does;
    # the lower-cased fallback stops "X.PDF" being treated as an image.
    is_pdf = (
        file_extension in pdf_extensions
        or file_extension.lower() in pdf_extensions
    )
    if is_pdf:
        b_url = process_pdf_file(file_name, file_path, new_request, user)
    else:
        b_url = process_image_file(file_name, file_path, new_request, user)

    # Two independent checks: a process type may satisfy either, both or
    # neither of them.
    if p_type in standard_ocr_list:
        ProcessUtil.send_to_queue2(rq_id, sub_id, b_url, user_id, p_type)
    if p_type == ProcessType.TEMPLATE_MATCHING.value:
        ProcessUtil.send_template_queue(rq_id, b_url, '', user_id)
|
|
|
|
@app.task(name='upload_file_to_s3')
def upload_file_to_s3(local_file_path, s3_key):
    """Upload a local file to S3 and delete it once the upload is confirmed.

    Does nothing except log a message when the module-level client has no
    underlying S3 client. The local file is removed only when the upload
    response reports HTTP 200.

    Args:
        local_file_path: Path of the file to upload, and to delete afterwards.
        s3_key: Destination key passed to ``s3_client.upload_file``.
    """
    if s3_client.s3_client is None:
        # Celery redirects stdout to the log at WARNING level by default, so
        # logging at WARNING keeps the visibility the old print() had.
        logger.warning("[INFO] S3 is not available, skipping,...")
        return

    res = s3_client.upload_file(local_file_path, s3_key)
    if res is not None and res["ResponseMetadata"]["HTTPStatusCode"] == 200:
        # The local copy is dropped only after the store acknowledged it.
        os.remove(local_file_path)
    else:
        # Previously a failed upload was silent; the file is still kept.
        logger.warning(
            "Upload of %s to S3 key %s was not confirmed; keeping local file",
            local_file_path,
            s3_key,
        )
|
|
|
|
@app.task(name='upload_obj_to_s3')
def upload_obj_to_s3(byte_obj, s3_key):
    """Decode a base64 payload and store it under an S3 key.

    Does nothing except log a message when the module-level client has no
    underlying S3 client.

    Args:
        byte_obj: Base64-encoded payload; decoded with ``base64.b64decode``
            before being stored.
        s3_key: Key passed to ``s3_client.update_object``.
    """
    if s3_client.s3_client is None:
        # Celery redirects stdout to the log at WARNING level by default, so
        # logging at WARNING keeps the visibility the old print() had.
        logger.warning("[INFO] S3 is not available, skipping,...")
        return

    decoded = base64.b64decode(byte_obj)
    # The helper's return value was never used, so it is no longer bound.
    s3_client.update_object(s3_key, decoded)