Add: support for PDF files
This commit is contained in:
parent
7e9a8e2d4b
commit
a84e3dce05
@ -3,6 +3,7 @@ import urllib
|
|||||||
import random
|
import random
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
import uuid
|
||||||
import sys, os
|
import sys, os
|
||||||
cur_dir = str(Path(__file__).parents[2])
|
cur_dir = str(Path(__file__).parents[2])
|
||||||
sys.path.append(cur_dir)
|
sys.path.append(cur_dir)
|
||||||
@ -35,14 +36,15 @@ def sbt_predict(image_url, engine) -> None:
|
|||||||
|
|
||||||
save_dir = "./tmp_results"
|
save_dir = "./tmp_results"
|
||||||
# image_path = os.path.join(save_dir, f"{image_url}.jpg")
|
# image_path = os.path.join(save_dir, f"{image_url}.jpg")
|
||||||
image_path = os.path.join(save_dir, "abc.jpg")
|
tmp_image_path = os.path.join(save_dir, f"{uuid.uuid4()}.jpg")
|
||||||
cv2.imwrite(image_path, img)
|
cv2.imwrite(tmp_image_path, img)
|
||||||
|
|
||||||
outputs = process_img(img_path=image_path,
|
outputs = process_img(img_path=tmp_image_path,
|
||||||
save_dir=save_dir,
|
save_dir=save_dir,
|
||||||
engine=engine,
|
engine=engine,
|
||||||
export_all=False,
|
export_all=False,
|
||||||
option=option)
|
option=option)
|
||||||
|
os.remove(tmp_image_path)
|
||||||
return outputs
|
return outputs
|
||||||
|
|
||||||
def predict(page_numb, image_url):
|
def predict(page_numb, image_url):
|
||||||
@ -70,6 +72,7 @@ def predict(page_numb, image_url):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
sbt_result = sbt_predict(image_url, engine=sbt_engine)
|
sbt_result = sbt_predict(image_url, engine=sbt_engine)
|
||||||
|
print(sbt_result)
|
||||||
output_dict = {
|
output_dict = {
|
||||||
"document_type": "invoice",
|
"document_type": "invoice",
|
||||||
"document_class": " ",
|
"document_class": " ",
|
||||||
|
@ -102,6 +102,8 @@ def merge_sbt_output(loutputs):
|
|||||||
})
|
})
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
print("concat outputs: \n", loutputs)
|
||||||
|
|
||||||
merged_output = []
|
merged_output = []
|
||||||
combined_output = {"retailername": None,
|
combined_output = {"retailername": None,
|
||||||
"sold_to_party": None,
|
"sold_to_party": None,
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
import time
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
from wsgiref.util import FileWrapper
|
from wsgiref.util import FileWrapper
|
||||||
|
import base64
|
||||||
|
|
||||||
from django.core.files.uploadedfile import TemporaryUploadedFile
|
from django.core.files.uploadedfile import TemporaryUploadedFile
|
||||||
from django.db import transaction
|
from django.db import transaction
|
||||||
@ -10,15 +11,15 @@ from drf_spectacular.utils import extend_schema
|
|||||||
from rest_framework import status, viewsets
|
from rest_framework import status, viewsets
|
||||||
from rest_framework.decorators import action
|
from rest_framework.decorators import action
|
||||||
from rest_framework.response import Response
|
from rest_framework.response import Response
|
||||||
import io
|
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from fwd import settings
|
from fwd import settings
|
||||||
|
from ..celery_worker.client_connector import c_connector
|
||||||
from ..annotation.api import throw_on_failure
|
from ..annotation.api import throw_on_failure
|
||||||
from ..constant.common import allowed_p_type, ProcessType, REQUEST_ID, FOLDER_TYPE, \
|
from ..constant.common import allowed_p_type, ProcessType, REQUEST_ID, FOLDER_TYPE, \
|
||||||
FolderFileType, TEMPLATE_ID, EntityStatus, standard_ocr_list, pdf_extensions, image_extensions
|
FolderFileType, TEMPLATE_ID, EntityStatus, standard_ocr_list, pdf_extensions, image_extensions, allowed_file_extensions
|
||||||
from ..exception.exceptions import RequiredFieldException, InvalidException, NotFoundException, \
|
from ..exception.exceptions import RequiredFieldException, InvalidException, NotFoundException, \
|
||||||
PermissionDeniedException, LimitReachedException, LockedEntityException
|
PermissionDeniedException, LimitReachedException, LockedEntityException, FileContentInvalidException
|
||||||
from ..models import SubscriptionRequest, UserProfile, SubscriptionRequestFile, OcrTemplate, Subscription
|
from ..models import SubscriptionRequest, UserProfile, SubscriptionRequestFile, OcrTemplate, Subscription
|
||||||
from ..response.ReportSerializer import ReportSerializer
|
from ..response.ReportSerializer import ReportSerializer
|
||||||
from ..utils import FileUtils, ProcessUtil
|
from ..utils import FileUtils, ProcessUtil
|
||||||
@ -43,7 +44,7 @@ class CtelViewSet(viewsets.ViewSet):
|
|||||||
}
|
}
|
||||||
}, responses=None, tags=['ocr'])
|
}, responses=None, tags=['ocr'])
|
||||||
@action(detail=False, url_path="image/process", methods=["POST"])
|
@action(detail=False, url_path="image/process", methods=["POST"])
|
||||||
@transaction.atomic
|
# @transaction.atomic
|
||||||
def process(self, request):
|
def process(self, request):
|
||||||
s_time = time.time()
|
s_time = time.time()
|
||||||
# print(30*"=")
|
# print(30*"=")
|
||||||
@ -59,7 +60,7 @@ class CtelViewSet(viewsets.ViewSet):
|
|||||||
rq_id = provider_code + uuid.uuid4().hex
|
rq_id = provider_code + uuid.uuid4().hex
|
||||||
|
|
||||||
file_obj: TemporaryUploadedFile = validated_data['file']
|
file_obj: TemporaryUploadedFile = validated_data['file']
|
||||||
file_extension = file_obj.name.split(".")[-1]
|
file_extension = file_obj.name.split(".")[-1].lower()
|
||||||
p_type = validated_data['type']
|
p_type = validated_data['type']
|
||||||
file_name = f"temp_{rq_id}.{file_extension}"
|
file_name = f"temp_{rq_id}.{file_extension}"
|
||||||
|
|
||||||
@ -73,12 +74,16 @@ class CtelViewSet(viewsets.ViewSet):
|
|||||||
from ..celery_worker.client_connector import c_connector
|
from ..celery_worker.client_connector import c_connector
|
||||||
file_obj.seek(0)
|
file_obj.seek(0)
|
||||||
file_path = FileUtils.resize_and_save_file(file_name, new_request, file_obj, 100)
|
file_path = FileUtils.resize_and_save_file(file_name, new_request, file_obj, 100)
|
||||||
if settings.S3_ENDPOINT!="":
|
S3_path = FileUtils.save_to_S3(file_name, new_request, file_path)
|
||||||
FileUtils.save_to_S3(file_name, new_request, file_obj.read())
|
|
||||||
# print(f"[DEBUG]: file_path: {file_path}")
|
|
||||||
|
|
||||||
|
# Payload for the `do_pdf` worker task: one descriptor per uploaded file.
# This has to be an assignment; a bare annotation (`files: [...]`) would not
# bind the name, and the later `c_connector.do_pdf(...)` call would fail.
files = [{
    "file_name": file_name,
    "file_path": file_path,  # local path to file
    "file_type": "",  # single-file endpoint: no document type supplied
}]
|
||||||
|
|
||||||
if file_extension in pdf_extensions:
|
if file_extension in pdf_extensions:
|
||||||
c_connector.do_pdf((rq_id, sub.id, p_type, user.id, file_name, file_path))
|
c_connector.do_pdf((rq_id, sub.id, p_type, user.id, files))
|
||||||
# b_url = ProcessUtil.process_pdf_file(file_name, file_obj, new_request, user)
|
# b_url = ProcessUtil.process_pdf_file(file_name, file_obj, new_request, user)
|
||||||
elif file_extension in image_extensions:
|
elif file_extension in image_extensions:
|
||||||
b_url = ProcessUtil.process_image_file(file_name, file_obj, new_request, user)
|
b_url = ProcessUtil.process_image_file(file_name, file_obj, new_request, user)
|
||||||
@ -117,7 +122,7 @@ class CtelViewSet(viewsets.ViewSet):
|
|||||||
}
|
}
|
||||||
}, responses=None, tags=['ocr'])
|
}, responses=None, tags=['ocr'])
|
||||||
@action(detail=False, url_path="images/process", methods=["POST"])
|
@action(detail=False, url_path="images/process", methods=["POST"])
|
||||||
@transaction.atomic
|
# @transaction.atomic
|
||||||
def processes(self, request):
|
def processes(self, request):
|
||||||
s_time = time.time()
|
s_time = time.time()
|
||||||
# print(30*"=")
|
# print(30*"=")
|
||||||
@ -148,34 +153,27 @@ class CtelViewSet(viewsets.ViewSet):
|
|||||||
provider_code=provider_code,
|
provider_code=provider_code,
|
||||||
subscription=sub)
|
subscription=sub)
|
||||||
new_request.save()
|
new_request.save()
|
||||||
|
count = 0
|
||||||
count = 0
|
compact_files = []
|
||||||
for doc_type, doc_files in files.items():
|
for doc_type, doc_files in files.items():
|
||||||
for i, doc_file in enumerate(doc_files):
|
for i, doc_file in enumerate(doc_files):
|
||||||
_ext = doc_file.name.split(".")[-1]
|
_ext = doc_file.name.split(".")[-1]
|
||||||
if _ext not in image_extensions:
|
if _ext not in allowed_file_extensions:
|
||||||
return JsonResponse(status=status.HTTP_406_NOT_ACCEPTABLE, data={"request_id": rq_id, "message": f"File {_ext} is now allowed"})
|
return JsonResponse(status=status.HTTP_406_NOT_ACCEPTABLE, data={"request_id": rq_id, "message": f"File {_ext} is now allowed"})
|
||||||
_name = f"temp_{doc_type}_{rq_id}_{i}.{_ext}"
|
_name = f"temp_{doc_type}_{rq_id}_{i}.{_ext}"
|
||||||
doc_file.seek(0)
|
doc_file.seek(0)
|
||||||
# file_path = FileUtils.resize_and_save_file(_name, new_request, doc_file, 100)
|
file_path = FileUtils.resize_and_save_file(_name, new_request, doc_file, 100)
|
||||||
# input_file = io.BytesIO(open(doc_file, 'rb').read())
|
S3_path = FileUtils.save_to_S3(_name, new_request, file_path)
|
||||||
input_file = doc_file.read()
|
|
||||||
if settings.S3_ENDPOINT!="":
|
|
||||||
FileUtils.save_to_S3(_name, new_request, input_file)
|
|
||||||
else:
|
|
||||||
file_path = FileUtils.resize_and_save_file(_name, new_request, doc_file, 100)
|
|
||||||
list_urls.append(ProcessUtil.process_image_file(_name, doc_file, new_request, user)[0])
|
|
||||||
list_urls[count]["page_number"] = count
|
|
||||||
list_urls[count]["doc_type"] = doc_type
|
|
||||||
count += 1
|
count += 1
|
||||||
|
this_file = {
|
||||||
|
"file_name": _name,
|
||||||
|
"file_path": file_path,
|
||||||
|
"file_type": doc_type
|
||||||
|
}
|
||||||
|
compact_files.append(this_file)
|
||||||
|
c_connector.do_pdf((rq_id, sub.id, p_type, user.id, compact_files))
|
||||||
|
|
||||||
if p_type in standard_ocr_list:
|
|
||||||
ProcessUtil.send_to_queue2(rq_id, sub.id, list_urls, user.id, p_type)
|
|
||||||
elif p_type == ProcessType.TEMPLATE_MATCHING.value:
|
|
||||||
ProcessUtil.send_template_queue(rq_id, list_urls, validated_data['template'], user.id)
|
|
||||||
j_time = time.time()
|
j_time = time.time()
|
||||||
print(f"[INFO]: Duration of Pre-processing: {j_time - s_time}s")
|
|
||||||
print(f"[INFO]: list_urls: {list_urls}")
|
|
||||||
return JsonResponse(status=status.HTTP_200_OK, data={"request_id": rq_id})
|
return JsonResponse(status=status.HTTP_200_OK, data={"request_id": rq_id})
|
||||||
|
|
||||||
@extend_schema(request=None, responses=None, tags=['data'])
|
@extend_schema(request=None, responses=None, tags=['data'])
|
||||||
@ -289,6 +287,8 @@ class CtelViewSet(viewsets.ViewSet):
|
|||||||
serializer: ReportSerializer = ReportSerializer(data=report_filter, many=True)
|
serializer: ReportSerializer = ReportSerializer(data=report_filter, many=True)
|
||||||
serializer.is_valid()
|
serializer.is_valid()
|
||||||
# print(f"[DEBUG]: result: {serializer.data[0]}")
|
# print(f"[DEBUG]: result: {serializer.data[0]}")
|
||||||
|
if report_filter[0].status == 400:
|
||||||
|
raise FileContentInvalidException()
|
||||||
|
|
||||||
return Response(status=status.HTTP_200_OK, data=serializer.data[0])
|
return Response(status=status.HTTP_200_OK, data=serializer.data[0])
|
||||||
|
|
||||||
@ -317,14 +317,13 @@ class CtelViewSet(viewsets.ViewSet):
|
|||||||
# return Response(status=status.HTTP_200_OK, data=xml_as_string, content_type="application/xml; charset=utf-8")
|
# return Response(status=status.HTTP_200_OK, data=xml_as_string, content_type="application/xml; charset=utf-8")
|
||||||
return HttpResponse(xml_as_string,content_type="text/xml")
|
return HttpResponse(xml_as_string,content_type="text/xml")
|
||||||
|
|
||||||
|
|
||||||
serializer: ReportSerializer = ReportSerializer(data=report_filter, many=True)
|
serializer: ReportSerializer = ReportSerializer(data=report_filter, many=True)
|
||||||
serializer.is_valid()
|
serializer.is_valid()
|
||||||
|
|
||||||
return Response(status=status.HTTP_200_OK, data=serializer.data[0])
|
return Response(status=status.HTTP_200_OK, data=serializer.data[0])
|
||||||
|
|
||||||
@action(detail=False, url_path="image/process/app", methods=["POST"])
|
@action(detail=False, url_path="image/process/app", methods=["POST"])
|
||||||
@transaction.atomic
|
# @transaction.atomic
|
||||||
def process_app(self, request):
|
def process_app(self, request):
|
||||||
app_id = "THIS_IS_OUR_APP_TEST_ACCOUNT_9123"
|
app_id = "THIS_IS_OUR_APP_TEST_ACCOUNT_9123"
|
||||||
users = UserProfile.objects.filter(sync_id=app_id)
|
users = UserProfile.objects.filter(sync_id=app_id)
|
||||||
|
@ -4,11 +4,12 @@ import fitz
|
|||||||
import uuid
|
import uuid
|
||||||
import os
|
import os
|
||||||
import base64
|
import base64
|
||||||
import boto3
|
|
||||||
|
|
||||||
from fwd_api.celery_worker.worker import app
|
from fwd_api.celery_worker.worker import app
|
||||||
from ..constant.common import allowed_p_type, ProcessType, REQUEST_ID, FOLDER_TYPE, \
|
from ..constant.common import ProcessType, \
|
||||||
FolderFileType, TEMPLATE_ID, EntityStatus, standard_ocr_list, pdf_extensions
|
FolderFileType, standard_ocr_list, image_extensions
|
||||||
|
from django.core.files.uploadedfile import TemporaryUploadedFile
|
||||||
|
from ..exception.exceptions import FileContentInvalidException
|
||||||
from ..utils import FileUtils, ProcessUtil, S3_process
|
from ..utils import FileUtils, ProcessUtil, S3_process
|
||||||
from celery.utils.log import get_task_logger
|
from celery.utils.log import get_task_logger
|
||||||
from fwd import settings
|
from fwd import settings
|
||||||
@ -24,9 +25,27 @@ s3_client = S3_process.MinioS3Client(
|
|||||||
)
|
)
|
||||||
|
|
||||||
def process_pdf_file(file_name: str, file_path: str, request, user) -> list:
    """
    Open a stored PDF, record it against the request and render its pages.

    Args:
        file_name: name stored on the ``SubscriptionRequestFile`` record.
        file_path: path passed to ``FileUtils.get_file`` and stored on the record.
        request: request object the file belongs to; on failure its ``status``
            and ``predict_result`` are overwritten and it is saved.
        user: forwarded to ``ProcessUtil.pdf_to_images_urls``.

    Returns:
        Whatever ``ProcessUtil.pdf_to_images_urls`` returns on success, or
        ``None`` when any step raises (the request is then marked with
        status 400).
    """
    from fwd_api.models import SubscriptionRequestFile

    try:
        pdf_stream = FileUtils.get_file(file_path).read()
        doc: fitz.Document = fitz.open(stream=pdf_stream, filetype="pdf")

        # Origin file: keep a record of the uploaded document itself.
        origin_record: SubscriptionRequestFile = SubscriptionRequestFile(
            file_path=file_path,
            request=request,
            file_name=file_name,
            code=f'FIL{uuid.uuid4().hex}',
        )
        origin_record.save()

        # Sub-file: one entry per rendered page.
        page_urls = ProcessUtil.pdf_to_images_urls(doc, request, user)
    except Exception as e:
        # Best effort: record the failure on the request instead of raising,
        # and let the caller decide how to react to the ``None`` result.
        request.status = 400
        request.predict_result = {"status": 400, "content": "", "message": f"Unable to extract pdf files {e}"}
        request.save()
        return None
    return page_urls
|
||||||
|
|
||||||
|
def process_pdf_byte(file_name: str, file_path: str, request, user, file_obj) -> list:
|
||||||
|
from fwd_api.models import SubscriptionRequestFile
|
||||||
|
doc: fitz.Document = fitz.open(stream=file_obj, filetype="pdf")
|
||||||
|
|
||||||
# Origin file
|
# Origin file
|
||||||
new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=file_path,
|
new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=file_path,
|
||||||
@ -34,8 +53,14 @@ def process_pdf_file(file_name: str, file_path: str, request, user) -> list:
|
|||||||
file_name=file_name,
|
file_name=file_name,
|
||||||
code=f'FIL{uuid.uuid4().hex}')
|
code=f'FIL{uuid.uuid4().hex}')
|
||||||
new_request_file.save()
|
new_request_file.save()
|
||||||
# Sub-file
|
try:
|
||||||
return ProcessUtil.pdf_to_images_urls(doc, request, user)
|
# Sub-file
|
||||||
|
return ProcessUtil.pdf_to_images_urls(doc, request, user)
|
||||||
|
except Exception as e:
|
||||||
|
request.status = 400
|
||||||
|
request.predict_result = {"status": 400, "content": "", "message": f"Unable to extract pdf files {e}"}
|
||||||
|
request.save()
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def process_image_file(file_name: str, file_path, request, user) -> list:
|
def process_image_file(file_name: str, file_path, request, user) -> list:
|
||||||
@ -54,25 +79,45 @@ def process_image_file(file_name: str, file_path, request, user) -> list:
|
|||||||
|
|
||||||
|
|
||||||
@app.task(name='do_pdf')
def process_pdf(rq_id, sub_id, p_type, user_id, files):
    """
    Turn the uploaded files of one request into page entries and queue them.

    Each PDF is converted with ``process_pdf_file`` and, for now, only its
    first page is queued; each image contributes a single entry.  Files with
    any other extension are ignored.  The collected entries are sent to
    ``ProcessUtil.send_to_queue2`` when ``p_type`` is in ``standard_ocr_list``
    and to ``ProcessUtil.send_template_queue`` when it equals
    ``ProcessType.TEMPLATE_MATCHING.value``.

    Args:
        rq_id: ``request_id`` used to look up the ``SubscriptionRequest``.
        sub_id: subscription id, forwarded to ``send_to_queue2``.
        p_type: process type deciding which queue receives the entries.
        user_id: id used to look up the ``UserProfile``.
        files: list of dicts shaped like::

            [{
                "file_name": "",
                "file_path": "",  # local path to file
                "file_type": ""
            },]

    Raises:
        FileContentInvalidException: when ``process_pdf_file`` yields no pages
            for a PDF (``None`` or an empty list).
    """
    from fwd_api.models import SubscriptionRequest, UserProfile

    start = time.time()
    new_request = SubscriptionRequest.objects.filter(request_id=rq_id)[0]
    user = UserProfile.objects.filter(id=user_id).first()

    b_urls = []
    for file in files:
        extension = file["file_name"].split(".")[-1].lower()
        if extension == "pdf":
            _b_urls = process_pdf_file(file["file_name"], file["file_path"], new_request, user)
            # ``None`` signals an extraction failure; an empty list would
            # otherwise crash on the ``_b_urls[0]`` access below.
            if not _b_urls:
                raise FileContentInvalidException
            # Number this document's pages after everything queued so far.
            # Iterate over the pages of *this* PDF, not over ``b_urls``.
            offset = len(b_urls)
            for page_index, page_url in enumerate(_b_urls):
                page_url["doc_type"] = file["file_type"]
                page_url["page_number"] = page_index + offset
            # b_urls += _b_urls # TODO: Client may request all images in a file, for now, extract the first page only
            b_urls.append(_b_urls[0])
        elif extension in image_extensions:
            this_url = ProcessUtil.process_image_local_file(file["file_name"], file["file_path"], new_request, user)[0]
            this_url["page_number"] = len(b_urls)
            if file["file_type"]:
                this_url["doc_type"] = file["file_type"]
            b_urls.append(this_url)

    start_process = time.time()
    logger.info(f"BE proccessing time: {start_process - start}")
    if p_type in standard_ocr_list:
        ProcessUtil.send_to_queue2(rq_id, sub_id, b_urls, user_id, p_type)
    if p_type == ProcessType.TEMPLATE_MATCHING.value:
        ProcessUtil.send_template_queue(rq_id, b_urls, '', user_id)
|
||||||
|
|
||||||
@app.task(name='upload_file_to_s3')
|
@app.task(name='upload_file_to_s3')
|
||||||
def upload_file_to_s3(local_file_path, s3_key):
|
def upload_file_to_s3(local_file_path, s3_key):
|
||||||
@ -81,7 +126,7 @@ def upload_file_to_s3(local_file_path, s3_key):
|
|||||||
if res != None and res["ResponseMetadata"]["HTTPStatusCode"] == 200:
|
if res != None and res["ResponseMetadata"]["HTTPStatusCode"] == 200:
|
||||||
os.remove(local_file_path)
|
os.remove(local_file_path)
|
||||||
else:
|
else:
|
||||||
print(f"[INFO] S3 is not available, skipping,...")
|
logger.info(f"S3 is not available, skipping,...")
|
||||||
|
|
||||||
@app.task(name='upload_obj_to_s3')
|
@app.task(name='upload_obj_to_s3')
|
||||||
def upload_obj_to_s3(byte_obj, s3_key):
|
def upload_obj_to_s3(byte_obj, s3_key):
|
||||||
@ -89,4 +134,4 @@ def upload_obj_to_s3(byte_obj, s3_key):
|
|||||||
obj = base64.b64decode(byte_obj)
|
obj = base64.b64decode(byte_obj)
|
||||||
res = s3_client.update_object(s3_key, obj)
|
res = s3_client.update_object(s3_key, obj)
|
||||||
else:
|
else:
|
||||||
print(f"[INFO] S3 is not available, skipping,...")
|
logger.info(f"S3 is not available, skipping,...")
|
@ -4,67 +4,7 @@ import re
|
|||||||
image_extensions = ('jpg', 'jpeg', 'png', 'JPG', 'JPEG', 'PNG')
|
image_extensions = ('jpg', 'jpeg', 'png', 'JPG', 'JPEG', 'PNG')
|
||||||
pdf_extensions = ('pdf', 'PDF')
|
pdf_extensions = ('pdf', 'PDF')
|
||||||
allowed_file_extensions = image_extensions + pdf_extensions
|
allowed_file_extensions = image_extensions + pdf_extensions
|
||||||
allowed_p_type = [2, 3, 4, 5, 6]
|
# allowed_file_extensions = image_extensions
|
||||||
LIST_BOX_MESSAGE = 'list_box'
|
|
||||||
NAME_MESSAGE = 'name'
|
|
||||||
VN_AND_SPACE_REGEX = r"[AĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴA-Z0-9 ]+"
|
|
||||||
IMAGE_NAME = "image_croped.jpg"
|
|
||||||
TEMPLATE_ID = 'template_id'
|
|
||||||
pattern = re.compile(VN_AND_SPACE_REGEX)
|
|
||||||
REQUEST_ID = 'requestId'
|
|
||||||
FOLDER_TYPE = 'folderType'
|
|
||||||
MAX_NUMBER_OF_TEMPLATE_DATA_BOX = 20
|
|
||||||
MAX_NUMBER_OF_TEMPLATE_ANCHOR_BOX = 3
|
|
||||||
NUMBER_OF_ITEM_IN_A_BOX = 4 # 4 coordinates
|
|
||||||
ESCAPE_VALUE = 'W5@X8#'
|
|
||||||
USER_MESSAGE = 'user'
|
|
||||||
PLAN_MESSAGE = 'plan'
|
|
||||||
|
|
||||||
|
|
||||||
class FolderFileType(Enum):
|
|
||||||
TEMPLATES = 'templates'
|
|
||||||
REQUESTS = 'requests'
|
|
||||||
|
|
||||||
|
|
||||||
class FileCategory(Enum):
|
|
||||||
CROP = 'Crop'
|
|
||||||
Origin = 'Origin'
|
|
||||||
BREAK = 'Break'
|
|
||||||
|
|
||||||
|
|
||||||
class EntityStatus(Enum):
|
|
||||||
ACTIVE = 1
|
|
||||||
INACTIVE = 0
|
|
||||||
|
|
||||||
|
|
||||||
class TEMPLATE_BOX_TYPE(Enum):
|
|
||||||
ANCHOR = 1
|
|
||||||
DATA = 2
|
|
||||||
|
|
||||||
|
|
||||||
class ProcessType(Enum):
|
|
||||||
TEMPLATE_MATCHING = 2
|
|
||||||
ID_CARD = 3
|
|
||||||
DRIVER_LICENSE = 4
|
|
||||||
INVOICE = 5
|
|
||||||
OCR_WITH_BOX = 6
|
|
||||||
AP_INVOICE = 7
|
|
||||||
FI_INVOICE = 10
|
|
||||||
|
|
||||||
class PlanCode(Enum):
|
|
||||||
TRIAL = 'TRIAL'
|
|
||||||
BASIC = 'BASIC'
|
|
||||||
ADVANCED = 'ADVANCED'
|
|
||||||
|
|
||||||
|
|
||||||
standard_ocr_list = (ProcessType.INVOICE.value, ProcessType.ID_CARD.value, ProcessType.DRIVER_LICENSE.value, ProcessType.OCR_WITH_BOX.value)
|
|
||||||
from enum import Enum
|
|
||||||
import re
|
|
||||||
|
|
||||||
image_extensions = ('jpg', 'jpeg', 'png', 'JPG', 'JPEG', 'PNG')
|
|
||||||
pdf_extensions = ('pdf', 'PDF')
|
|
||||||
# allowed_file_extensions = image_extensions + pdf_extensions
|
|
||||||
allowed_file_extensions = image_extensions
|
|
||||||
allowed_p_type = [12]
|
allowed_p_type = [12]
|
||||||
LIST_BOX_MESSAGE = 'list_box'
|
LIST_BOX_MESSAGE = 'list_box'
|
||||||
NAME_MESSAGE = 'name'
|
NAME_MESSAGE = 'name'
|
||||||
|
@ -97,6 +97,12 @@ class FileFormatInvalidException(InvalidException):
|
|||||||
default_detail = 'File invalid type'
|
default_detail = 'File invalid type'
|
||||||
detail_with_arg = 'File must have type {}'
|
detail_with_arg = 'File must have type {}'
|
||||||
|
|
||||||
|
class FileContentInvalidException(InvalidException):
    """Error for an uploaded file whose content is invalid; maps to HTTP 400."""

    # HTTP status attached to the error.
    status_code = status.HTTP_400_BAD_REQUEST
    # Application-level error code.
    default_code = 4007
    # Message used when no argument is supplied.
    default_detail = 'Invalid content file'
    # Message used when an argument is supplied (contains no placeholder).
    detail_with_arg = 'One of the files is broken, please select other file and try again'
|
||||||
|
|
||||||
|
|
||||||
class TokenExpiredException(GeneralException):
|
class TokenExpiredException(GeneralException):
|
||||||
status_code = status.HTTP_401_UNAUTHORIZED
|
status_code = status.HTTP_401_UNAUTHORIZED
|
||||||
|
@ -7,8 +7,10 @@ from fwd_api.constant.common import EntityStatus
|
|||||||
|
|
||||||
class UserProfile(models.Model):
|
class UserProfile(models.Model):
|
||||||
id = models.AutoField(primary_key=True)
|
id = models.AutoField(primary_key=True)
|
||||||
full_name: str = models.CharField(max_length=200)
|
user_name: str = models.CharField(max_length=200, null=True)
|
||||||
sync_id: str = models.CharField(max_length=100)
|
password: str = models.CharField(max_length=200, null=True)
|
||||||
|
full_name: str = models.CharField(max_length=200, null=True)
|
||||||
|
sync_id: str = models.CharField(max_length=100, null=True)
|
||||||
provider_id: str = models.CharField(max_length=100, default='Ctel') # CTel/GCP/Azure :v
|
provider_id: str = models.CharField(max_length=100, default='Ctel') # CTel/GCP/Azure :v
|
||||||
current_total_pages: int = models.IntegerField(default=0)
|
current_total_pages: int = models.IntegerField(default=0)
|
||||||
limit_total_pages: int = models.IntegerField(default=0)
|
limit_total_pages: int = models.IntegerField(default=0)
|
||||||
|
@ -27,7 +27,7 @@ def validate_list_file(files, max_file_num=settings.MAX_UPLOAD_FILES_IN_A_REQUES
|
|||||||
if not isinstance(f, TemporaryUploadedFile):
|
if not isinstance(f, TemporaryUploadedFile):
|
||||||
# print(f'[DEBUG]: {f.name}')
|
# print(f'[DEBUG]: {f.name}')
|
||||||
raise InvalidException(excArgs="files")
|
raise InvalidException(excArgs="files")
|
||||||
extension = f.name.split(".")[-1] in allowed_file_extensions
|
extension = f.name.split(".")[-1].lower() in allowed_file_extensions
|
||||||
if not extension or "." not in f.name:
|
if not extension or "." not in f.name:
|
||||||
raise FileFormatInvalidException(excArgs=allowed_file_extensions)
|
raise FileFormatInvalidException(excArgs=allowed_file_extensions)
|
||||||
if f.size > settings.MAX_UPLOAD_SIZE_OF_A_FILE:
|
if f.size > settings.MAX_UPLOAD_SIZE_OF_A_FILE:
|
||||||
@ -129,14 +129,15 @@ def resize_and_save_file(file_name: str, rq: SubscriptionRequest, file: Temporar
|
|||||||
print(f"[ERROR]: {e}")
|
print(f"[ERROR]: {e}")
|
||||||
raise ServiceUnavailableException()
|
raise ServiceUnavailableException()
|
||||||
|
|
||||||
def save_to_S3(file_name, rq, obj):
|
def save_to_S3(file_name, rq, local_file_path):
|
||||||
try:
|
try:
|
||||||
base64_obj = base64.b64encode(obj).decode('utf-8')
|
# base64_obj = base64.b64encode(obj).decode('utf-8')
|
||||||
file_path = get_folder_path(rq)
|
file_path = get_folder_path(rq)
|
||||||
assert len(file_path.split("/")) >= 2, "file_path must have at least process type and request id"
|
assert len(file_path.split("/")) >= 2, "file_path must have at least process type and request id"
|
||||||
s3_key = os.path.join(file_path.split("/")[-2], file_path.split("/")[-1], file_name)
|
s3_key = os.path.join(file_path.split("/")[-2], file_path.split("/")[-1], file_name)
|
||||||
# c_connector.upload_file_to_s3((file_path, s3_key))
|
# c_connector.upload_file_to_s3((file_path, s3_key))
|
||||||
c_connector.upload_obj_to_s3((base64_obj, s3_key))
|
c_connector.upload_file_to_s3((local_file_path, s3_key))
|
||||||
|
return s3_key
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[ERROR]: {e}")
|
print(f"[ERROR]: {e}")
|
||||||
raise ServiceUnavailableException()
|
raise ServiceUnavailableException()
|
||||||
|
@ -376,6 +376,18 @@ def process_image_file(file_name: str, file_obj: TemporaryUploadedFile, request:
|
|||||||
'request_file_id': new_request_file.code
|
'request_file_id': new_request_file.code
|
||||||
}]
|
}]
|
||||||
|
|
||||||
|
def process_image_local_file(file_name: str, file_path: str, request: SubscriptionRequest, user) -> list:
    """
    Record an already-saved image against a request and describe it as one page.

    Args:
        file_name: name stored on the record and used to build the URL.
        file_path: path stored on the ``SubscriptionRequestFile`` record.
        request: request the image belongs to.
        user: owner; only ``user.id`` is used, for the URL.

    Returns:
        A one-element list holding a dict with ``file_url``, ``page_number``
        (always 0) and ``request_file_id`` (the new record's code).
    """
    record_code = f'FIL{uuid.uuid4().hex}'
    image_record: SubscriptionRequestFile = SubscriptionRequestFile(
        file_path=file_path,
        request=request,
        file_name=file_name,
        code=record_code,
    )
    image_record.save()

    image_url = FileUtils.build_url(FolderFileType.REQUESTS.value, request.request_id, user.id, file_name)
    page_entry = {
        'file_url': image_url,
        'page_number': 0,
        'request_file_id': image_record.code,
    }
    return [page_entry]
|
||||||
|
|
||||||
def pdf_to_images_urls(doc: fitz.Document, request: SubscriptionRequest, user, dpi: int = 300) -> list:
|
def pdf_to_images_urls(doc: fitz.Document, request: SubscriptionRequest, user, dpi: int = 300) -> list:
|
||||||
def resize(image, max_w=1920, max_h=1080):
|
def resize(image, max_w=1920, max_h=1080):
|
||||||
logger.info(f"[DEBUG]: image.size: {image.size}, type(image): {type(image)}")
|
logger.info(f"[DEBUG]: image.size: {image.size}, type(image): {type(image)}")
|
||||||
|
Loading…
Reference in New Issue
Block a user