Add: support for PDF files

dx-tan committed 2023-12-05 12:59:06 +07:00
parent 7e9a8e2d4b
commit a84e3dce05
9 changed files with 134 additions and 124 deletions

View File

@@ -3,6 +3,7 @@ import urllib
 import random
 import numpy as np
 from pathlib import Path
+import uuid
 import sys, os
 cur_dir = str(Path(__file__).parents[2])
 sys.path.append(cur_dir)
@@ -35,14 +36,15 @@ def sbt_predict(image_url, engine) -> None:
     save_dir = "./tmp_results"
     # image_path = os.path.join(save_dir, f"{image_url}.jpg")
-    image_path = os.path.join(save_dir, "abc.jpg")
-    cv2.imwrite(image_path, img)
-    outputs = process_img(img_path=image_path,
+    tmp_image_path = os.path.join(save_dir, f"{uuid.uuid4()}.jpg")
+    cv2.imwrite(tmp_image_path, img)
+    outputs = process_img(img_path=tmp_image_path,
                           save_dir=save_dir,
                           engine=engine,
                           export_all=False,
                           option=option)
+    os.remove(tmp_image_path)
     return outputs

 def predict(page_numb, image_url):
@@ -70,6 +72,7 @@ def predict(page_numb, image_url):
     """
     sbt_result = sbt_predict(image_url, engine=sbt_engine)
+    print(sbt_result)
     output_dict = {
         "document_type": "invoice",
         "document_class": " ",

View File

@@ -102,6 +102,8 @@ def merge_sbt_output(loutputs):
             })
         return output
+
+    print("concat outputs: \n", loutputs)
     merged_output = []
     combined_output = {"retailername": None,
                        "sold_to_party": None,

View File

@@ -1,6 +1,7 @@
 import time
 import uuid
 from wsgiref.util import FileWrapper
+import base64

 from django.core.files.uploadedfile import TemporaryUploadedFile
 from django.db import transaction
@@ -10,15 +11,15 @@ from drf_spectacular.utils import extend_schema
 from rest_framework import status, viewsets
 from rest_framework.decorators import action
 from rest_framework.response import Response
-import io
 from typing import List

 from fwd import settings
+from ..celery_worker.client_connector import c_connector
 from ..annotation.api import throw_on_failure
 from ..constant.common import allowed_p_type, ProcessType, REQUEST_ID, FOLDER_TYPE, \
-    FolderFileType, TEMPLATE_ID, EntityStatus, standard_ocr_list, pdf_extensions, image_extensions
+    FolderFileType, TEMPLATE_ID, EntityStatus, standard_ocr_list, pdf_extensions, image_extensions, allowed_file_extensions
 from ..exception.exceptions import RequiredFieldException, InvalidException, NotFoundException, \
-    PermissionDeniedException, LimitReachedException, LockedEntityException
+    PermissionDeniedException, LimitReachedException, LockedEntityException, FileContentInvalidException
 from ..models import SubscriptionRequest, UserProfile, SubscriptionRequestFile, OcrTemplate, Subscription
 from ..response.ReportSerializer import ReportSerializer
 from ..utils import FileUtils, ProcessUtil
@@ -43,7 +44,7 @@ class CtelViewSet(viewsets.ViewSet):
             }
         }, responses=None, tags=['ocr'])
     @action(detail=False, url_path="image/process", methods=["POST"])
-    @transaction.atomic
+    # @transaction.atomic
     def process(self, request):
         s_time = time.time()
         # print(30*"=")
@@ -59,7 +60,7 @@ class CtelViewSet(viewsets.ViewSet):
         rq_id = provider_code + uuid.uuid4().hex

         file_obj: TemporaryUploadedFile = validated_data['file']
-        file_extension = file_obj.name.split(".")[-1]
+        file_extension = file_obj.name.split(".")[-1].lower()
         p_type = validated_data['type']
         file_name = f"temp_{rq_id}.{file_extension}"
@@ -73,12 +74,16 @@ class CtelViewSet(viewsets.ViewSet):
         from ..celery_worker.client_connector import c_connector
         file_obj.seek(0)
         file_path = FileUtils.resize_and_save_file(file_name, new_request, file_obj, 100)
-        if settings.S3_ENDPOINT!="":
-            FileUtils.save_to_S3(file_name, new_request, file_obj.read())
-        # print(f"[DEBUG]: file_path: {file_path}")
+        S3_path = FileUtils.save_to_S3(file_name, new_request, file_path)
+
+        files = [{
+            "file_name": file_name,
+            "file_path": file_path, # local path to file
+            "file_type": ""
+        },]
+
         if file_extension in pdf_extensions:
-            c_connector.do_pdf((rq_id, sub.id, p_type, user.id, file_name, file_path))
+            c_connector.do_pdf((rq_id, sub.id, p_type, user.id, files))
             # b_url = ProcessUtil.process_pdf_file(file_name, file_obj, new_request, user)
         elif file_extension in image_extensions:
             b_url = ProcessUtil.process_image_file(file_name, file_obj, new_request, user)
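With this change the view no longer streams bytes to S3 itself: it saves the upload locally, mirrors it to S3 by path, and hands the Celery worker a list of file descriptors instead of a single (file_name, file_path) pair. The payload contract implied by the new code is roughly the following (values illustrative):

    files = [{
        "file_name": file_name,   # e.g. f"temp_{rq_id}.pdf"
        "file_path": file_path,   # local path returned by resize_and_save_file
        "file_type": "",          # doc_type; left empty for the single-file endpoint
    }]
    c_connector.do_pdf((rq_id, sub.id, p_type, user.id, files))

The multi-file endpoint below builds the same shape (compact_files), with file_type set to the submitted doc_type.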
@@ -117,7 +122,7 @@ class CtelViewSet(viewsets.ViewSet):
             }
         }, responses=None, tags=['ocr'])
     @action(detail=False, url_path="images/process", methods=["POST"])
-    @transaction.atomic
+    # @transaction.atomic
    def processes(self, request):
         s_time = time.time()
         # print(30*"=")
@@ -148,34 +153,27 @@ class CtelViewSet(viewsets.ViewSet):
                                            provider_code=provider_code,
                                            subscription=sub)
         new_request.save()

         count = 0
+        compact_files = []
         for doc_type, doc_files in files.items():
             for i, doc_file in enumerate(doc_files):
                 _ext = doc_file.name.split(".")[-1]
-                if _ext not in image_extensions:
+                if _ext not in allowed_file_extensions:
                     return JsonResponse(status=status.HTTP_406_NOT_ACCEPTABLE, data={"request_id": rq_id, "message": f"File {_ext} is now allowed"})
                 _name = f"temp_{doc_type}_{rq_id}_{i}.{_ext}"
                 doc_file.seek(0)
-                # file_path = FileUtils.resize_and_save_file(_name, new_request, doc_file, 100)
-                # input_file = io.BytesIO(open(doc_file, 'rb').read())
-                input_file = doc_file.read()
-                if settings.S3_ENDPOINT!="":
-                    FileUtils.save_to_S3(_name, new_request, input_file)
-                else:
-                    file_path = FileUtils.resize_and_save_file(_name, new_request, doc_file, 100)
-                list_urls.append(ProcessUtil.process_image_file(_name, doc_file, new_request, user)[0])
-                list_urls[count]["page_number"] = count
-                list_urls[count]["doc_type"] = doc_type
+                file_path = FileUtils.resize_and_save_file(_name, new_request, doc_file, 100)
+                S3_path = FileUtils.save_to_S3(_name, new_request, file_path)
                 count += 1
+                this_file = {
+                    "file_name": _name,
+                    "file_path": file_path,
+                    "file_type": doc_type
+                }
+                compact_files.append(this_file)
+        c_connector.do_pdf((rq_id, sub.id, p_type, user.id, compact_files))

-        if p_type in standard_ocr_list:
-            ProcessUtil.send_to_queue2(rq_id, sub.id, list_urls, user.id, p_type)
-        elif p_type == ProcessType.TEMPLATE_MATCHING.value:
-            ProcessUtil.send_template_queue(rq_id, list_urls, validated_data['template'], user.id)
         j_time = time.time()
-        print(f"[INFO]: Duration of Pre-processing: {j_time - s_time}s")
-        print(f"[INFO]: list_urls: {list_urls}")
         return JsonResponse(status=status.HTTP_200_OK, data={"request_id": rq_id})

     @extend_schema(request=None, responses=None, tags=['data'])
@@ -289,6 +287,8 @@ class CtelViewSet(viewsets.ViewSet):
         serializer: ReportSerializer = ReportSerializer(data=report_filter, many=True)
         serializer.is_valid()
         # print(f"[DEBUG]: result: {serializer.data[0]}")
+        if report_filter[0].status == 400:
+            raise FileContentInvalidException()
         return Response(status=status.HTTP_200_OK, data=serializer.data[0])
@@ -317,14 +317,13 @@ class CtelViewSet(viewsets.ViewSet):
             # return Response(status=status.HTTP_200_OK, data=xml_as_string, content_type="application/xml; charset=utf-8")
             return HttpResponse(xml_as_string,content_type="text/xml")

         serializer: ReportSerializer = ReportSerializer(data=report_filter, many=True)
         serializer.is_valid()
         return Response(status=status.HTTP_200_OK, data=serializer.data[0])

     @action(detail=False, url_path="image/process/app", methods=["POST"])
-    @transaction.atomic
+    # @transaction.atomic
     def process_app(self, request):
         app_id = "THIS_IS_OUR_APP_TEST_ACCOUNT_9123"
         users = UserProfile.objects.filter(sync_id=app_id)

View File

@@ -4,11 +4,12 @@ import fitz
 import uuid
 import os
 import base64
-import boto3
 from fwd_api.celery_worker.worker import app
-from ..constant.common import allowed_p_type, ProcessType, REQUEST_ID, FOLDER_TYPE, \
-    FolderFileType, TEMPLATE_ID, EntityStatus, standard_ocr_list, pdf_extensions
+from ..constant.common import ProcessType, \
+    FolderFileType, standard_ocr_list, image_extensions
+from django.core.files.uploadedfile import TemporaryUploadedFile
+from ..exception.exceptions import FileContentInvalidException
 from ..utils import FileUtils, ProcessUtil, S3_process
 from celery.utils.log import get_task_logger
 from fwd import settings
@@ -24,9 +25,27 @@ s3_client = S3_process.MinioS3Client(
 )

 def process_pdf_file(file_name: str, file_path: str, request, user) -> list:
-    from fwd_api.models import SubscriptionRequest, SubscriptionRequestFile
-    from fwd_api.constant.common import ProcessType
-    doc: fitz.Document = fitz.open(stream=FileUtils.get_file(file_path).read(), filetype="pdf")
+    from fwd_api.models import SubscriptionRequestFile
+    try:
+        doc: fitz.Document = fitz.open(stream=FileUtils.get_file(file_path).read(), filetype="pdf")
+        # Origin file
+        new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=file_path,
+                                                                            request=request,
+                                                                            file_name=file_name,
+                                                                            code=f'FIL{uuid.uuid4().hex}')
+        new_request_file.save()
+        # Sub-file
+        return ProcessUtil.pdf_to_images_urls(doc, request, user)
+    except Exception as e:
+        request.status = 400
+        request.predict_result = {"status": 400, "content": "", "message": f"Unable to extract pdf files {e}"}
+        request.save()
+        return None
+
+def process_pdf_byte(file_name: str, file_path: str, request, user, file_obj) -> list:
+    from fwd_api.models import SubscriptionRequestFile
+    doc: fitz.Document = fitz.open(stream=file_obj, filetype="pdf")

     # Origin file
     new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=file_path,
@@ -34,8 +53,14 @@ def process_pdf_file(file_name: str, file_path: str, request, user) -> list:
                                                                         file_name=file_name,
                                                                         code=f'FIL{uuid.uuid4().hex}')
     new_request_file.save()
-    # Sub-file
-    return ProcessUtil.pdf_to_images_urls(doc, request, user)
+    try:
+        # Sub-file
+        return ProcessUtil.pdf_to_images_urls(doc, request, user)
+    except Exception as e:
+        request.status = 400
+        request.predict_result = {"status": 400, "content": "", "message": f"Unable to extract pdf files {e}"}
+        request.save()
+        return None

 def process_image_file(file_name: str, file_path, request, user) -> list:
@@ -54,25 +79,45 @@ def process_image_file(file_name: str, file_path, request, user) -> list:

 @app.task(name='do_pdf')
-def process_pdf(rq_id, sub_id, p_type, user_id, file_name, file_path):
+def process_pdf(rq_id, sub_id, p_type, user_id, files):
+    """
+    pdf_files: [{
+        "file_name": "",
+        "file_path": "", # local path to file
+        "file_type": ""
+    },]
+    """
     from fwd_api.models import SubscriptionRequest, SubscriptionRequestFile,UserProfile
+    start = time.time()
+    from django.conf import settings
     new_request = SubscriptionRequest.objects.filter(request_id=rq_id)[0]
     user = UserProfile.objects.filter(id=user_id).first()
-    file_extension = file_name.split(".")[-1]
-    # logger.info(f"[DEBUG]: file_path: {file_path}")
-    if file_extension in pdf_extensions:
-        b_url = process_pdf_file(file_name, file_path, new_request, user)
-    else:
-        b_url = process_image_file(file_name, file_path, new_request, user)
+    b_urls = []
+    for i, file in enumerate(files):
+        extension = file["file_name"].split(".")[-1].lower()
+        if extension == "pdf":
+            _b_urls = process_pdf_file(file["file_name"], file["file_path"], new_request, user)
+            if _b_urls is None:
+                raise FileContentInvalidException
+            for i in range(len(_b_urls)):
+                _b_urls[i]["doc_type"] = file["file_type"]
+            # b_urls += _b_urls # TODO: Client may request all images in a file, for now, extract the first page only
+            for j in range(len(b_urls)):
+                _b_urls[j]["page_number"] = j + len(b_urls)
+            b_urls.append(_b_urls[0])
+        elif extension in image_extensions:
+            this_url = ProcessUtil.process_image_local_file(file["file_name"], file["file_path"], new_request, user)[0]
+            this_url["page_number"] = len(b_urls)
+            if file["file_type"]:
+                this_url["doc_type"] = file["file_type"]
+            b_urls.append(this_url)

-    j_time = time.time()
-    # logger.info(f"[INFO]: Duration of Pre-processing: {j_time - 0}s")
-    # logger.info(f"[INFO]: b_url: {b_url}")
+    start_process = time.time()
+    logger.info(f"BE proccessing time: {start_process - start}")
     if p_type in standard_ocr_list:
-        ProcessUtil.send_to_queue2(rq_id, sub_id, b_url, user_id, p_type)
+        ProcessUtil.send_to_queue2(rq_id, sub_id, b_urls, user_id, p_type)
     if p_type == ProcessType.TEMPLATE_MATCHING.value:
-        ProcessUtil.send_template_queue(rq_id, b_url, '', user_id)
+        ProcessUtil.send_template_queue(rq_id, b_urls, '', user_id)

 @app.task(name='upload_file_to_s3')
 def upload_file_to_s3(local_file_path, s3_key):
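do_pdf now normalizes every input, PDF page or plain image, into one b_urls list before queueing. Going by the dicts built in process_image_local_file / pdf_to_images_urls (last file of this commit), each entry looks roughly like the sketch below; note that the commented-out `b_urls += _b_urls` TODO means only the first page of a PDF is forwarded for now (values illustrative):

    b_urls = [{
        "file_url": "<public URL built by FileUtils.build_url>",
        "page_number": 0,
        "request_file_id": "FIL<uuid hex>",
        "doc_type": "invoice",   # copied from the payload's "file_type"
    }]
    ProcessUtil.send_to_queue2(rq_id, sub_id, b_urls, user_id, p_type)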
@@ -81,7 +126,7 @@ def upload_file_to_s3(local_file_path, s3_key):
     if res != None and res["ResponseMetadata"]["HTTPStatusCode"] == 200:
         os.remove(local_file_path)
     else:
-        print(f"[INFO] S3 is not available, skipping,...")
+        logger.info(f"S3 is not available, skipping,...")

 @app.task(name='upload_obj_to_s3')
 def upload_obj_to_s3(byte_obj, s3_key):
@@ -89,4 +134,4 @@ def upload_obj_to_s3(byte_obj, s3_key):
         obj = base64.b64decode(byte_obj)
         res = s3_client.update_object(s3_key, obj)
     else:
-        print(f"[INFO] S3 is not available, skipping,...")
+        logger.info(f"S3 is not available, skipping,...")

View File

@@ -4,67 +4,7 @@ import re
 image_extensions = ('jpg', 'jpeg', 'png', 'JPG', 'JPEG', 'PNG')
 pdf_extensions = ('pdf', 'PDF')
 allowed_file_extensions = image_extensions + pdf_extensions
-allowed_p_type = [2, 3, 4, 5, 6]
-
-LIST_BOX_MESSAGE = 'list_box'
-NAME_MESSAGE = 'name'
-VN_AND_SPACE_REGEX = r"[AĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴA-Z0-9 ]+"
-IMAGE_NAME = "image_croped.jpg"
-TEMPLATE_ID = 'template_id'
-pattern = re.compile(VN_AND_SPACE_REGEX)
-REQUEST_ID = 'requestId'
-FOLDER_TYPE = 'folderType'
-MAX_NUMBER_OF_TEMPLATE_DATA_BOX = 20
-MAX_NUMBER_OF_TEMPLATE_ANCHOR_BOX = 3
-NUMBER_OF_ITEM_IN_A_BOX = 4 # 4 coordinates
-ESCAPE_VALUE = 'W5@X8#'
-USER_MESSAGE = 'user'
-PLAN_MESSAGE = 'plan'
-
-class FolderFileType(Enum):
-    TEMPLATES = 'templates'
-    REQUESTS = 'requests'
-
-class FileCategory(Enum):
-    CROP = 'Crop'
-    Origin = 'Origin'
-    BREAK = 'Break'
-
-class EntityStatus(Enum):
-    ACTIVE = 1
-    INACTIVE = 0
-
-class TEMPLATE_BOX_TYPE(Enum):
-    ANCHOR = 1
-    DATA = 2
-
-class ProcessType(Enum):
-    TEMPLATE_MATCHING = 2
-    ID_CARD = 3
-    DRIVER_LICENSE = 4
-    INVOICE = 5
-    OCR_WITH_BOX = 6
-    AP_INVOICE = 7
-    FI_INVOICE = 10
-
-class PlanCode(Enum):
-    TRIAL = 'TRIAL'
-    BASIC = 'BASIC'
-    ADVANCED = 'ADVANCED'
-
-standard_ocr_list = (ProcessType.INVOICE.value, ProcessType.ID_CARD.value, ProcessType.DRIVER_LICENSE.value, ProcessType.OCR_WITH_BOX.value)
-
-from enum import Enum
-import re
-
-image_extensions = ('jpg', 'jpeg', 'png', 'JPG', 'JPEG', 'PNG')
-pdf_extensions = ('pdf', 'PDF')
-# allowed_file_extensions = image_extensions + pdf_extensions
-allowed_file_extensions = image_extensions
+# allowed_file_extensions = image_extensions
 allowed_p_type = [12]
 LIST_BOX_MESSAGE = 'list_box'
 NAME_MESSAGE = 'name'
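Since the view and validator now lower-case the extension before the membership check, the mixed-case tuples are redundant; a lowercase frozenset would state the same rule more directly. A possible simplification, not something this commit does:

    image_extensions = frozenset({"jpg", "jpeg", "png"})
    pdf_extensions = frozenset({"pdf"})
    allowed_file_extensions = image_extensions | pdf_extensions

    def is_allowed(filename: str) -> bool:
        return "." in filename and filename.rsplit(".", 1)[-1].lower() in allowed_file_extensions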

View File

@@ -97,6 +97,12 @@ class FileFormatInvalidException(InvalidException):
     default_detail = 'File invalid type'
     detail_with_arg = 'File must have type {}'

+class FileContentInvalidException(InvalidException):
+    status_code = status.HTTP_400_BAD_REQUEST
+    default_code = 4007
+    default_detail = 'Invalid content file'
+    detail_with_arg = 'One of the files is broken, please select other file and try again'
+
 class TokenExpiredException(GeneralException):
     status_code = status.HTTP_401_UNAUTHORIZED
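FileContentInvalidException follows the same pattern as the surrounding classes: an HTTP status plus a project-specific default_code, so raising it surfaces to the client as a 400 carrying code 4007. A hedged usage sketch, assuming InvalidException behaves like the DRF APIException subclasses elsewhere in this module:

    # e.g. in the result endpoint, once the worker has marked the request as failed:
    if report_filter[0].status == 400:
        raise FileContentInvalidException()   # -> HTTP 400, default_code 4007, "Invalid content file"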

View File

@@ -7,8 +7,10 @@ from fwd_api.constant.common import EntityStatus

 class UserProfile(models.Model):
     id = models.AutoField(primary_key=True)
-    full_name: str = models.CharField(max_length=200)
-    sync_id: str = models.CharField(max_length=100)
+    user_name: str = models.CharField(max_length=200, null=True)
+    password: str = models.CharField(max_length=200, null=True)
+    full_name: str = models.CharField(max_length=200, null=True)
+    sync_id: str = models.CharField(max_length=100, null=True)
     provider_id: str = models.CharField(max_length=100, default='Ctel') # CTel/GCP/Azure :v
     current_total_pages: int = models.IntegerField(default=0)
     limit_total_pages: int = models.IntegerField(default=0)
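The UserProfile change adds user_name and password columns and relaxes full_name / sync_id to nullable, so a schema migration has to ship with it. A minimal sketch using Django's management API, with the app label fwd_api assumed from the import paths in this commit:

    from django.core.management import call_command

    call_command("makemigrations", "fwd_api")   # generates the AddField / AlterField migration
    call_command("migrate", "fwd_api")          # applies it to the database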

View File

@@ -27,7 +27,7 @@ def validate_list_file(files, max_file_num=settings.MAX_UPLOAD_FILES_IN_A_REQUES
         if not isinstance(f, TemporaryUploadedFile):
             # print(f'[DEBUG]: {f.name}')
             raise InvalidException(excArgs="files")
-        extension = f.name.split(".")[-1] in allowed_file_extensions
+        extension = f.name.split(".")[-1].lower() in allowed_file_extensions
         if not extension or "." not in f.name:
             raise FileFormatInvalidException(excArgs=allowed_file_extensions)
         if f.size > settings.MAX_UPLOAD_SIZE_OF_A_FILE:
@@ -129,14 +129,15 @@ def resize_and_save_file(file_name: str, rq: SubscriptionRequest, file: Temporar
         print(f"[ERROR]: {e}")
         raise ServiceUnavailableException()

-def save_to_S3(file_name, rq, obj):
+def save_to_S3(file_name, rq, local_file_path):
     try:
-        base64_obj = base64.b64encode(obj).decode('utf-8')
+        # base64_obj = base64.b64encode(obj).decode('utf-8')
         file_path = get_folder_path(rq)
         assert len(file_path.split("/")) >= 2, "file_path must have at least process type and request id"
         s3_key = os.path.join(file_path.split("/")[-2], file_path.split("/")[-1], file_name)
         # c_connector.upload_file_to_s3((file_path, s3_key))
-        c_connector.upload_obj_to_s3((base64_obj, s3_key))
+        c_connector.upload_file_to_s3((local_file_path, s3_key))
+        return s3_key
     except Exception as e:
         print(f"[ERROR]: {e}")
         raise ServiceUnavailableException()
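save_to_S3 now takes the local file path instead of raw bytes and returns the computed s3_key, which keeps large base64 blobs out of the Celery broker and lets the upload task delete the file after a successful transfer. Caller-side the contract reduces to:

    file_path = FileUtils.resize_and_save_file(file_name, new_request, file_obj, 100)
    s3_key = FileUtils.save_to_S3(file_name, new_request, file_path)   # enqueues upload_file_to_s3(file_path, s3_key)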

View File

@@ -376,6 +376,18 @@ def process_image_file(file_name: str, file_obj: TemporaryUploadedFile, request:
         'request_file_id': new_request_file.code
     }]

+def process_image_local_file(file_name: str, file_path: str, request: SubscriptionRequest, user) -> list:
+    new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=file_path,
+                                                                        request=request,
+                                                                        file_name=file_name,
+                                                                        code=f'FIL{uuid.uuid4().hex}')
+    new_request_file.save()
+    return [{
+        'file_url': FileUtils.build_url(FolderFileType.REQUESTS.value, request.request_id, user.id, file_name),
+        'page_number': 0,
+        'request_file_id': new_request_file.code
+    }]
+
 def pdf_to_images_urls(doc: fitz.Document, request: SubscriptionRequest, user, dpi: int = 300) -> list:
     def resize(image, max_w=1920, max_h=1080):
         logger.info(f"[DEBUG]: image.size: {image.size}, type(image): {type(image)}")