Resize images for pdfs

Viet Anh Nguyen 2023-12-13 16:01:31 +07:00
parent e0eaac3611
commit bace56baf7
29 changed files with 45612 additions and 349 deletions

.gitignore

@ -20,3 +20,4 @@ media/
postgres_data/
curl.md
cope2n-api/fwd_api/commands/init_database.py
/data


@ -1,4 +1,3 @@
# FROM thucpd2408/env-cope2n:v1
FROM thucpd2408/env-deskew
COPY ./packages/cudnn-linux*.tar.xz /tmp/cudnn-linux*.tar.xz
@ -9,8 +8,7 @@ RUN tar -xvf /tmp/cudnn-linux*.tar.xz -C /tmp/ \
&& chmod a+r /usr/local/cuda/include/cudnn*.h /usr/local/cuda/lib64/libcudnn* \
&& rm -rf /tmp/cudnn-*-archive
RUN apt-get update && apt-get install -y gcc g++ ffmpeg libsm6 libxext6
# RUN pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
RUN apt-get update && apt-get install -y gcc g++ ffmpeg libsm6 libxext6 poppler-utils
WORKDIR /workspace
@ -25,7 +23,6 @@ RUN cd /workspace/cope2n-ai-fi/modules/ocr_engine/externals/sdsv_dewarp && pip3
RUN cd /workspace/cope2n-ai-fi/modules/ocr_engine/externals/sdsvtd && pip3 install -v -e .
RUN cd /workspace/cope2n-ai-fi/modules/ocr_engine/externals/sdsvtr && pip3 install -v -e .
# RUN cd /workspace/cope2n-ai-fi/modules/ocr_engine/ && pip3 install -r requirements.txt
RUN cd /workspace/cope2n-ai-fi/modules/sdsvkie && pip3 install -v -e .
RUN cd /workspace/cope2n-ai-fi/modules/sdsvkvu && pip3 install -v -e .
RUN cd /workspace/cope2n-ai-fi && pip3 install -r requirements.txt
@ -38,6 +35,6 @@ RUN rm -f /usr/local/lib/python3.10/dist-packages/nvidia/cublas/lib/libcublasLt.
ln -s /usr/local/cuda-11.8/targets/x86_64-linux/lib/libnvblas.so.11 /usr/local/lib/python3.10/dist-packages/nvidia/cublas/lib/libnvblas.so.11
ENV PYTHONPATH="."
ENV TZ="Asia/Ho_Chi_Minh"
CMD [ "sh", "run.sh"]
# CMD ["tail -f > /dev/null"]


@ -23,4 +23,3 @@
- [ ] `Kie_Invoice_AP/prediction.py` seems to be the base function; it should act as a proxy that imports all other `predict_{anything else}` functions
- [ ] There should be a unique folder to keep all models with different versions, then mount it as /models in the container. Currently, `fi` is loading from `/models/Kie_invoice_fi` while `sap` is loading from `Kie_Invoice_AP/AnyKey_Value/experiments/key_value_understanding-20231003-171748`. Another model weight is at `sdsvtd/hub` for an unknown reason
- [ ] Env variables should have their descriptions in the README
- [ ]

@ -1 +1 @@
Subproject commit e0edcd3266f59801a22eea673bce15aeeaf01f01
Subproject commit d351bb79dab7d3e449bf8ccd945a3f24f62dd33d


@ -8,3 +8,6 @@ sdsvkvu
pymupdf
easydict
imagesize==1.4.1
pdf2image==1.16.3


@ -5,12 +5,11 @@ ARG USERNAME=container-user
RUN groupadd --gid ${GID} ${USERNAME} \
&& useradd --uid ${UID} --gid ${GID} -m ${USERNAME} \
&& apt-get update \
&& apt-get install -y sudo \
&& apt-get install -y sudo bash gettext poppler-utils \
&& echo ${USERNAME} ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/${USERNAME} \
&& chmod 0440 /etc/sudoers.d/${USERNAME}
RUN yes | apt install postgresql gcc musl-dev
RUN pip install --upgrade pip
RUN apt install bash gettext
RUN pip install uvicorn gunicorn Celery
USER ${UID}
@ -21,3 +20,5 @@ WORKDIR /app
RUN pip install -r requirements.txt --no-cache-dir
COPY --chown=${UID}:${GID} . /app
ENV TZ="Asia/Ho_Chi_Minh"


@ -134,11 +134,11 @@ AUTH_PASSWORD_VALIDATORS = [
# https://docs.djangoproject.com/en/4.1/topics/i18n/
LANGUAGE_CODE = "en-us"
TIME_ZONE = "Asia/Ho_Chi_Minh"
USE_I18N = True
CELERY_ENABLE_UTC = False
CELERY_TIMEZONE = "Asia/Ho_Chi_Minh"
TIME_ZONE = "Asia/Ho_Chi_Minh"
USE_TZ = True
# Static files (CSS, JavaScript, Images)
@ -195,7 +195,6 @@ CORS_ORIGIN_ALLOW_ALL = True
MEDIA_ROOT = env.str("MEDIA_ROOT", default=r"/var/www/example.com/media/")
BROKER_URL = env.str("BROKER_URL", default="amqp://test:test@107.120.70.226:5672//")
CELERY_TIMEZONE = "Australia/Tasmania"
CELERY_TASK_TRACK_STARTED = True
CELERY_TASK_TIME_LIMIT = 30 * 60


@ -68,7 +68,6 @@ class CtelTemplateViewSet(viewsets.ViewSet):
@extend_schema(request=None, responses=None, tags=['templates'])
@action(detail=False, methods=["DELETE"], url_path=r"templates/(?P<template_id>\d+)")
@throw_on_failure(InvalidException(excArgs='data'))
@transaction.atomic
def delete_template(self, request, template_id=None):
user_data: UserData = ProcessUtil.get_user(request)
@ -112,7 +111,6 @@ class CtelTemplateViewSet(viewsets.ViewSet):
else:
return self.insert_template(request, user_data)
@transaction.atomic
def insert_template(self, request, user_data: UserData):
file_list = request.data.getlist('file')
FileUtils.validate_list_file(file_list)
@ -148,7 +146,6 @@ class CtelTemplateViewSet(viewsets.ViewSet):
"id": template.id,
})
@transaction.atomic
def update_template(self, request, user_data: UserData):
# Validate
data = request.data


@ -8,12 +8,10 @@ from drf_spectacular.utils import extend_schema, OpenApiParameter, OpenApiExampl
from rest_framework import status, viewsets
from rest_framework.decorators import action
from rest_framework.response import Response
from rest_framework.decorators import authentication_classes, permission_classes
from ..annotation.api import throw_on_failure
from ..constant.common import USER_MESSAGE, EntityStatus, PLAN_MESSAGE, PlanCode
from ..exception.exceptions import InvalidException, NotFoundException, LockedEntityException, TrialOneException, \
LimitReachedException, NotAuthenticatedException
from ..exception.exceptions import InvalidException, NotFoundException, LockedEntityException, TrialOneException, NotAuthenticatedException
from ..models import UserProfile, PricingPlan, Subscription
from ..request.UpsertUserRequest import UpsertUserRequest
from ..response.SubscriptionResponse import SubscriptionResponse
@ -135,7 +133,6 @@ class CtelUserViewSet(viewsets.ViewSet):
else:
return self.get_user(request)
@transaction.atomic
def upsert_user(self, request):
if not hasattr(request, 'user_data'):
raise NotFoundException(excArgs=USER_MESSAGE)


@ -4,23 +4,21 @@ from wsgiref.util import FileWrapper
from django.core.files.uploadedfile import TemporaryUploadedFile
from django.http import HttpResponse, JsonResponse
from django.utils.crypto import get_random_string
from drf_spectacular.utils import extend_schema
from rest_framework import status, viewsets
from rest_framework.decorators import action
from rest_framework.response import Response
from typing import List
from rest_framework.renderers import JSONRenderer
from rest_framework_xml.renderers import XMLRenderer
from rest_framework_xml.renderers import XMLRenderer
from fwd import settings
from ..celery_worker.client_connector import c_connector
from ..annotation.api import throw_on_failure
from ..constant.common import allowed_p_type, ProcessType, REQUEST_ID, FOLDER_TYPE, \
FolderFileType, TEMPLATE_ID, EntityStatus, pdf_extensions, allowed_file_extensions
from ..constant.common import ProcessType, REQUEST_ID, FOLDER_TYPE, EntityStatus, pdf_extensions, allowed_file_extensions
from ..exception.exceptions import RequiredFieldException, InvalidException, NotFoundException, \
PermissionDeniedException, LimitReachedException, LockedEntityException, FileContentInvalidException, ServiceTimeoutException
from ..models import SubscriptionRequest, UserProfile, SubscriptionRequestFile, OcrTemplate, Subscription
PermissionDeniedException, LockedEntityException, FileContentInvalidException, ServiceTimeoutException
from ..models import SubscriptionRequest, SubscriptionRequestFile, OcrTemplate
from ..response.ReportSerializer import ReportSerializer
from ..utils import FileUtils, ProcessUtil
@ -87,7 +85,7 @@ class CtelViewSet(viewsets.ViewSet):
_name = f"temp_{doc_type}_{rq_id}_{i}.{_ext}"
doc_file.seek(0)
file_path = FileUtils.resize_and_save_file(_name, new_request, doc_file, 100)
S3_path = FileUtils.save_to_S3(_name, new_request, file_path)
FileUtils.save_to_S3(_name, new_request, file_path)
count += 1
this_file = {
"file_name": _name,
@ -157,7 +155,7 @@ class CtelViewSet(viewsets.ViewSet):
_name = f"temp_{doc_type}_{rq_id}_{i}.{_ext}"
doc_file.seek(0)
file_path = FileUtils.resize_and_save_file(_name, new_request, doc_file, 100)
_ = FileUtils.save_to_S3(_name, new_request, file_path)
FileUtils.save_to_S3(_name, new_request, file_path)
count += 1
this_file = {
"file_name": _name,
@ -167,12 +165,19 @@ class CtelViewSet(viewsets.ViewSet):
compact_files.append(this_file)
c_connector.do_pdf((rq_id, sub.id, p_type, user.id, compact_files))
time_out = 120
start = time.time()
while time.time() - start < time_out:
time.sleep(0.1)
time_limit = 120
start_time = time.time()
while True:
current_time = time.time()
waiting_time = current_time - start_time
print("Waiting for: ", waiting_time)
if waiting_time > time_limit:
print("Timeout!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
break
time.sleep(0.2)
report_filter = SubscriptionRequest.objects.filter(request_id=rq_id)
if len(report_filter) != 1:
if report_filter.count() != 1:
raise InvalidException(excArgs='requestId')
if user_info.current_sub.id != report_filter[0].subscription.id:
@ -191,13 +196,19 @@ class CtelViewSet(viewsets.ViewSet):
if report_filter[0].status == 400:
raise FileContentInvalidException()
if report_filter[0].status == 100: # continue, only return when result is fullfilled
print(serializer.data)
print("Status Code: 100")
continue
if len(serializer.data) == 0:
print("No data found")
continue
if serializer.data[0].get("data", None) is None:
print(serializer.data[0])
print("No data[0] found")
continue
if serializer.data[0]["data"].get("status", 200) != 200:
print("No data status found")
continue
return Response(status=status.HTTP_200_OK, data=serializer.data[0])


@ -88,7 +88,7 @@ class CeleryConnector:
def send_task(self, name=None, args=None):
if name not in self.task_routes or 'queue' not in self.task_routes[name]:
raise GeneralException("System")
return self.app.send_task(name, args, queue=self.task_routes[name]['queue'])
return self.app.send_task(name, args, queue=self.task_routes[name]['queue'], expires=300)
c_connector = CeleryConnector()


@ -1,13 +1,14 @@
import time
import fitz
import uuid
import os
import base64
import traceback
from fwd_api.models import SubscriptionRequest, UserProfile
from fwd_api.celery_worker.worker import app
from ..constant.common import FolderFileType, image_extensions
from ..exception.exceptions import FileContentInvalidException
from fwd_api.models import SubscriptionRequestFile
from ..utils import FileUtils, ProcessUtil, S3_process
from celery.utils.log import get_task_logger
from fwd import settings
@ -20,13 +21,10 @@ s3_client = S3_process.MinioS3Client(
access_key=settings.S3_ACCESS_KEY,
secret_key=settings.S3_SECRET_KEY,
bucket_name=settings.S3_BUCKET_NAME
)
)
def process_pdf_file(file_name: str, file_path: str, request, user) -> list:
from fwd_api.models import SubscriptionRequestFile
try:
doc: fitz.Document = fitz.open(stream=FileUtils.get_file(file_path).read(), filetype="pdf")
# Origin file
new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=file_path,
request=request,
@ -34,27 +32,9 @@ def process_pdf_file(file_name: str, file_path: str, request, user) -> list:
code=f'FIL{uuid.uuid4().hex}')
new_request_file.save()
# Sub-file
return ProcessUtil.pdf_to_images_urls(doc, request, user)
except Exception as e:
request.status = 400
request.predict_result = {"status": 400, "content": "", "message": f"Unable to extract pdf files {e}"}
request.save()
return None
def process_pdf_byte(file_name: str, file_path: str, request, user, file_obj) -> list:
from fwd_api.models import SubscriptionRequestFile
doc: fitz.Document = fitz.open(stream=file_obj, filetype="pdf")
# Origin file
new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=file_path,
request=request,
file_name=file_name,
code=f'FIL{uuid.uuid4().hex}')
new_request_file.save()
try:
# Sub-file
return ProcessUtil.pdf_to_images_urls(doc, request, user)
return ProcessUtil.pdf_to_images_urls(FileUtils.get_file(file_path), request, user)
except Exception as e:
traceback.print_exc()
request.status = 400
request.predict_result = {"status": 400, "content": "", "message": f"Unable to extract pdf files {e}"}
request.save()
@ -62,8 +42,6 @@ def process_pdf_byte(file_name: str, file_path: str, request, user, file_obj) ->
def process_image_file(file_name: str, file_path, request, user) -> list:
from fwd_api.models import SubscriptionRequestFile
new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=file_path,
request=request,
file_name=file_name,


@ -1,6 +1,10 @@
import traceback
from fwd_api.celery_worker.worker import app
from fwd_api.models import SubscriptionRequest
from fwd_api.exception.exceptions import InvalidException
from fwd_api.models import SubscriptionRequest
from fwd_api.constant.common import ProcessType
def aggregate_result(src_result, des_result, doc_type):
@ -45,11 +49,6 @@ def update_user(rq: SubscriptionRequest):
@app.task(name='process_sap_invoice_result')
def process_invoice_sap_result(rq_id, result):
from fwd_api.models import SubscriptionRequest
from fwd_api.constant.common import ProcessType
print_id(rq_id)
try:
rq: SubscriptionRequest = \
SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.INVOICE.value)[0]
@ -66,16 +65,12 @@ def process_invoice_sap_result(rq_id, result):
except Exception as e:
print(e)
print("Fail Invoice %d", rq_id)
traceback.print_exc()
return "FailInvoice"
@app.task(name='process_fi_invoice_result')
def process_invoice_fi_result(rq_id, result):
from fwd_api.models import SubscriptionRequest
from fwd_api.constant.common import ProcessType
print_id(rq_id)
print(result)
try:
rq: SubscriptionRequest = \
SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.FI_INVOICE.value)[0]
@ -92,14 +87,11 @@ def process_invoice_fi_result(rq_id, result):
except Exception as e:
print(e)
print("Fail Invoice %d", rq_id)
traceback.print_exc()
return "FailInvoice"
@app.task(name='process_manulife_invoice_result')
def process_invoice_manulife_result(rq_id, result):
from fwd_api.models import SubscriptionRequest
from fwd_api.constant.common import ProcessType
print_id(f"[DEBUG]: Received manulife request with id {rq_id}")
try:
rq: SubscriptionRequest = \
SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.MANULIFE_INVOICE.value)[0]
@ -116,13 +108,11 @@ def process_invoice_manulife_result(rq_id, result):
except Exception as e:
print(e)
print("Fail Invoice %d", rq_id)
traceback.print_exc()
return "FailInvoice"
@app.task(name='process_sbt_invoice_result')
def process_invoice_sbt_result(rq_id, result):
from fwd_api.models import SubscriptionRequest
from fwd_api.constant.common import ProcessType
print_id(f"[DEBUG]: Received SBT request with id {rq_id}")
print_id(f"[DEBUG]: result: {result}")
try:
@ -156,205 +146,6 @@ def process_invoice_sbt_result(rq_id, result):
except Exception as e:
print(e)
print("Fail Invoice %d", rq_id)
traceback.print_exc()
return "FailInvoice"
# @app.task(name='process_id_result', queue='id_card_rs')
# def process_id_result(rq_id, result):
# from fwd_api.models import SubscriptionRequest
# from fwd_api.constant.common import ProcessType
# from fwd_api.models import SubscriptionRequestFile
# from fwd_api.constant.common import FileCategory
# print_id(rq_id)
# try:
# s_time = time.time()
# print("Start")
# j_time = time.time()
# print("Json {}".format(j_time - s_time))
# rq: SubscriptionRequest = \
# SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.ID_CARD.value)[0]
# if 'content' in result and 'pages' in result['content']:
# pages = result['content']['pages']
# if isinstance(pages, list):
# new_pages = []
# for idx, page in enumerate(pages):
# if 'path_image_croped' in page:
# img_name = f'crop_{idx}_{get_random_string(3)}.jpg'
# path = page['path_image_croped']
# rq_file: SubscriptionRequestFile = SubscriptionRequestFile(file_name=img_name, request=rq,
# file_category=FileCategory.CROP.value,
# file_path=path,
# code=f'IDC{uuid.uuid4().hex}')
# rq_file.save()
# page['path_image_croped'] = rq_file.code
# l_time = time.time()
# print("Save {}".format(l_time - j_time))
# status = to_status(result)
# rq.predict_result = result
# rq.status = status
# rq.save()
# update_user(rq)
# e_time = time.time()
# print("End {}".format(e_time - l_time))
# except IndexError as e:
# traceback.format_exc()
# print(e)
# except Exception as e:
# traceback.format_exc()
# print(e)
# print("Fail ID %d", rq_id)
# return "Fail"
# return "Success"
# @app.task(name='process_driver_license_result')
# def process_driver_license_result(rq_id, result):
# from fwd_api.models import SubscriptionRequest
# from fwd_api.models import SubscriptionRequestFile
# from fwd_api.constant.common import FileCategory
# from fwd_api.constant.common import ProcessType
# print_id(rq_id)
# try:
# rq: SubscriptionRequest = \
# SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.DRIVER_LICENSE.value)[0]
# if 'content' in result and 'pages' in result['content']:
# pages = result['content']['pages']
# if isinstance(pages, list):
# new_pages = []
# for idx, page in enumerate(pages):
# if 'path_image_croped' in page:
# img_name = f'crop_{idx}_{get_random_string(3)}.jpg'
# path = page['path_image_croped']
# rq_file: SubscriptionRequestFile = SubscriptionRequestFile(file_name=img_name, request=rq,
# file_category=FileCategory.CROP.value,
# file_path=path,
# code=f'DLC{uuid.uuid4().hex}')
# rq_file.save()
# page['path_image_croped'] = rq_file.code
# status = to_status(result)
# rq.predict_result = result
# rq.status = status
# rq.save()
# update_user(rq)
# except IndexError as e:
# print(e)
# except Exception as e:
# print(e)
# print("Fail DL %d", rq_id)
# return "Fail"
# return "Success"
# @app.task(name='process_invoice_result')
# def process_invoice_result(rq_id, result):
# from fwd_api.models import SubscriptionRequest
# from fwd_api.constant.common import ProcessType
# print_id(rq_id)
# try:
# rq: SubscriptionRequest = \
# SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.INVOICE.value)
# print(rq)
# rq: SubscriptionRequest = \
# SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.INVOICE.value)[0]
# status = to_status(result)
# rq.predict_result = result
# rq.status = status
# rq.save()
# update_user(rq)
# except IndexError as e:
# print(e)
# print("NotFound request by requestId, %d", rq_id)
# except Exception as e:
# print(e)
# traceback.format_exc()
# print("Fail Invoice %d", rq_id)
# return "FailInvoice"
# return "Success"
# @app.task(name='process_ocr_with_box_result')
# def process_ocr_with_box_result(rq_id, result):
# from fwd_api.models import SubscriptionRequest
# from fwd_api.constant.common import ProcessType
# print_id(rq_id)
# try:
# rq: SubscriptionRequest = \
# SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.OCR_WITH_BOX.value)[0]
# status = to_status(result)
# rq.predict_result = result
# rq.status = status
# rq.save()
# update_user(rq)
# except IndexError as e:
# traceback.format_exc()
# print(e)
# except Exception as e:
# traceback.format_exc()
# print(e)
# print("Fail OCR %d", rq_id)
# return "FailOCR"
# return "Success"
# @app.task(name='process_template_matching_result')
# def template_matching_result(rq_id, result, align_img):
# from fwd_api.models import SubscriptionRequest
# from fwd_api.constant.common import ProcessType
# from fwd_api.constant.common import FileCategory
# from fwd_api.models import SubscriptionRequestFile
# print_id(rq_id)
# try:
# rq: SubscriptionRequest = \
# SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.TEMPLATE_MATCHING.value)[0]
# if align_img:
# from fwd_api.constant.common import IMAGE_NAME
# rq_file: SubscriptionRequestFile = SubscriptionRequestFile(file_name=IMAGE_NAME, request=rq,
# file_category=FileCategory.CROP.value,
# file_path=align_img)
# rq_file.save()
# status = to_status(result)
# rq.predict_result = result
# rq.status = status
# rq.save()
# update_user(rq)
# except IndexError as e:
# traceback.format_exc()
# print(e)
# except Exception as e:
# traceback.format_exc()
# print(e)
# print("Fail Template %d", rq_id)
# return "FailTemplate"
# return "Success"


@ -18,11 +18,6 @@ app: Celery = Celery(
app.conf.update({
'task_queues':
[
# Queue('id_card_rs'),
# Queue('driver_license_rs'),
# Queue('invoice_rs'),
# Queue('ocr_with_box_rs'),
# Queue('template_matching_rs'),
Queue('invoice_sap_rs'),
Queue('invoice_fi_rs'),
Queue('invoice_manulife_rs'),
@ -33,11 +28,6 @@ app.conf.update({
],
'task_routes': {
# 'process_id_result': {'queue': 'id_card_rs'},
# 'process_driver_license_result': {'queue': "driver_license_rs"},
# 'process_invoice_result': {'queue': "invoice_rs"},
# 'process_ocr_with_box_result': {'queue': "ocr_with_box_rs"},
# 'process_template_matching_result': {'queue': 'template_matching_rs'},
'process_sap_invoice_result': {'queue': 'invoice_sap_rs'},
'process_sap_invoice': {'queue': "invoice_sap"},
'process_fi_invoice_result': {'queue': 'invoice_fi_rs'},


@ -72,10 +72,3 @@ class ReportSerializer(serializers.Serializer):
new_data.append(new_page_object)
data['pages'] = new_data
return data
# def get_predict_result(self, obj: SubscriptionRequest):
# from fwd_api.constant.common import ProcessType
# typez = int(obj.process_type)
# if typez == ProcessType.OCR_WITH_BOX.value or typez == ProcessType.TEMPLATE_MATCHING.value:
# return obj.predict_result
# return None


@ -156,11 +156,9 @@ def resize_and_save_file(file_name: str, rq: SubscriptionRequest, file: Temporar
def save_to_S3(file_name, rq, local_file_path):
try:
# base64_obj = base64.b64encode(obj).decode('utf-8')
file_path = get_folder_path(rq)
assert len(file_path.split("/")) >= 2, "file_path must have at least process type and request id"
s3_key = os.path.join(file_path.split("/")[-2], file_path.split("/")[-1], file_name)
# c_connector.upload_file_to_s3((file_path, s3_key))
c_connector.upload_file_to_s3((local_file_path, s3_key))
return s3_key
except Exception as e:
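
As a worked illustration of the key layout enforced by the assert above (the folder path and file name below are hypothetical, chosen only to show the "last two path components plus file name" rule):

import os

file_path = "media/users/42/12/REQ123"   # hypothetical result of get_folder_path(rq): ".../<process_type>/<request_id>"
file_name = "temp_invoice_REQ123_0.jpg"  # hypothetical
s3_key = os.path.join(file_path.split("/")[-2], file_path.split("/")[-1], file_name)
# s3_key == "12/REQ123/temp_invoice_REQ123_0.jpg"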


@ -8,7 +8,9 @@ from django.core.files.uploadedfile import TemporaryUploadedFile
from django.db import transaction
from rest_framework import status
from fwd_api.utils.image import get_first_page_pdf
from fwd import settings
from fwd_api.utils.image import resize
from fwd_api.constant.common import LIST_BOX_MESSAGE, pattern, NAME_MESSAGE, allowed_p_type, TEMPLATE_ID, \
FolderFileType, FileCategory
from fwd_api.exception.exceptions import NumberOfBoxLimitReachedException, \
@ -21,7 +23,6 @@ from ..models import UserProfile, OcrTemplate, OcrTemplateBox, \
Subscription, SubscriptionRequestFile, SubscriptionRequest
from ..celery_worker.client_connector import c_connector
import uuid
from PIL import Image
from celery.utils.log import get_task_logger
@ -286,7 +287,6 @@ def validate_vn_and_space(txt: str):
raise InvalidException(excArgs=NAME_MESSAGE)
@transaction.atomic
def save_template_boxs(data, template):
saving_list = []
for d_box in data['data_boxs']:
@ -410,18 +410,15 @@ def process_image_local_file(file_name: str, file_path: str, request: Subscripti
'request_file_id': new_request_file.code
}]
def pdf_to_images_urls(doc: fitz.Document, request: SubscriptionRequest, user, dpi: int = 300) -> list:
def pdf_to_images_urls(doc_path, request: SubscriptionRequest, user, dpi: int = 300) -> list:
pdf_extracted = []
for idx, page in enumerate(doc):
saving_path = FileUtils.get_folder_path(request)
break_file_name = f'break_{idx}.jpg'
break_file_name = f'break_0.jpg'
saving_path = os.path.join(saving_path, break_file_name)
page = doc.load_page(idx)
pix = page.get_pixmap(dpi=250) # render page to an image
if pix.size > 8*3*settings.MAX_PIXEL_IN_A_FILE*settings.MAX_PIXEL_IN_A_FILE:
raise InvalidDecompressedSizeException(excArgs=(str(pix.width), str(pix.height), str(settings.MAX_PIXEL_IN_A_FILE)))
pix.save(saving_path)
image = get_first_page_pdf(doc_path, 300)
image = resize(image, max_w=settings.TARGET_MAX_IMAGE_SIZE[0], max_h=settings.TARGET_MAX_IMAGE_SIZE[1])
image.save(saving_path)
print(f"Saving {saving_path}")
new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=saving_path,
request=request,
@ -434,7 +431,7 @@ def pdf_to_images_urls(doc: fitz.Document, request: SubscriptionRequest, user, d
pdf_extracted.append(
{
'file_url': file_url,
'page_number': idx,
'page_number': 0,
'request_file_id': new_request_file.code
}
)


@ -1,9 +1,11 @@
import io
from PIL import Image
from PyPDF2 import PdfReader, PdfWriter
from pdf2image import convert_from_bytes
def resize(image, max_w=2048, max_h=2048):
cur_w, cur_h = image.width, image.height
image_bytes = image.samples
image = Image.frombytes("RGB", [cur_w, cur_h], image_bytes)
cur_w = image.width
cur_h = image.height
if cur_h > max_w or cur_h > max_h:
ratio_w = max_w/cur_w
ratio_h = max_h/cur_h
@ -11,5 +13,40 @@ def resize(image, max_w=2048, max_h=2048):
new_w = int(ratio*cur_w)
new_h = int(ratio*cur_h)
image = image.resize((new_w, new_h))
return image
def fitz_pixmap_to_pillow_with_resize(image, max_w=2048, max_h=2048):
cur_w, cur_h = image.width, image.height
image_bytes = image.samples
image = Image.frombytes("RGB", [cur_w, cur_h], image_bytes)
image = resize(image, max_w, max_h)
return image
def get_first_page_pdf(filename, max_size=300):
def pdf_scale_page(page, size=297):
"""Scale page to specified size mm"""
(w, h) = page.mediabox[2:]
# Units of measurement are not "points". The units of measurement are user defined and default to 1/72 inch. See section 4.2.1 Coordinate spaces of the PDF specification.
# If we have a page width of 297 mm and at points 1 inch = 25.4 mm
pmm = (1/72*25.4)
ks = size / (float(max((w, h))) * pmm)
page.scale_by(ks)
return page
reader = PdfReader(filename)
page = reader.pages[0]
scaled_page = pdf_scale_page(page, max_size)
# Create BytesIO
pdf_bytes = io.BytesIO()
dst_pdf = PdfWriter()
dst_pdf.add_page(scaled_page)
dst_pdf.write(pdf_bytes)
pdf_bytes.seek(0)
image = convert_from_bytes(pdf_bytes.read())
if isinstance(image, list):
return image[0]
return image
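
For reference, a minimal sketch of how these two helpers are combined by pdf_to_images_urls above. The import path appears in the Process Util diff; the input PDF path is hypothetical, and the fwd_api package is assumed to be importable:

from fwd_api.utils.image import get_first_page_pdf, resize

# Render only page 1 of the PDF (scaled to roughly 300 mm) via PyPDF2 + pdf2image,
# then bound it with resize; pdf_to_images_urls passes settings.TARGET_MAX_IMAGE_SIZE
# instead of the 2048x2048 defaults used here.
page_image = get_first_page_pdf("sample_invoice.pdf", max_size=300)  # hypothetical input path
page_image = resize(page_image)
page_image.save("break_0.jpg")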


@ -23,7 +23,6 @@ Jinja2==3.1.2
jsonschema==4.17.1
MarkupSafe==2.1.1
packaging==21.3
pdf2image==1.16.0
Pillow==9.3.0
psycopg2==2.9.5
psycopg2-binary==2.9.5
@ -41,7 +40,6 @@ tzdata==2022.6
uritemplate==4.1.1
urllib3==1.26.13
uvicorn==0.20.0
celery~=5.2.7
kombu~=5.2.4
PyJWT~=2.6.0
@ -50,3 +48,4 @@ PyMuPDF==1.21.1
djangorestframework-xml==2.0.0
boto3==1.29.7
imagesize==1.4.1
pdf2image==1.16.3


@ -8,6 +8,9 @@ server {
location ~ ^/api {
proxy_pass {{proxy_server}};
proxy_read_timeout 300;
proxy_connect_timeout 300;
proxy_send_timeout 300;
}
location /static/drf_spectacular_sidecar/ {


@ -26,7 +26,7 @@ export async function createKieJob({
const response = await API.post<{
estimated_wating_time: number;
request_id: string;
}>('/ctel/image/process/', formData);
}>('/ctel/images/process_sync', formData);
return response.data.request_id;
}


@ -11,8 +11,7 @@ services:
shm_size: 10gb
dockerfile: Dockerfile
shm_size: 10gb
image: sidp/cope2n-ai-fi-sbt
mem_limit: 8g
# mem_limit: 8g
restart: always
# container_name: "sidp-cope2n-ai-fi-sbt"
networks:
@ -43,7 +42,6 @@ services:
dockerfile: Dockerfile
# ports:
# - 9880:9000
image: sidp/cope2n-be-fi-sbt
# container_name: "sidp-cope2n-be-ctel-sbt"
environment:
- MEDIA_ROOT=${MEDIA_ROOT}
@ -71,20 +69,20 @@ services:
- S3_ACCESS_KEY=${S3_ACCESS_KEY}
- S3_SECRET_KEY=${S3_SECRET_KEY}
- S3_BUCKET_NAME=${S3_BUCKET_NAME}
# restart: always
restart: always
networks:
- ctel-sbt
volumes:
- ${HOST_MEDIA_FOLDER}:${MEDIA_ROOT}
- BE_static:/app/static
# - ./cope2n-api:/app
- ./cope2n-api:/app
working_dir: /app
depends_on:
db-sbt:
condition: service_started
# rabbitmq:
# condition: service_started
command: sh -c "python manage.py collectstatic --no-input &&
command: sh -c "sleep 5; python manage.py collectstatic --no-input &&
python manage.py migrate &&
python manage.py compilemessages &&
gunicorn fwd.asgi:application -k uvicorn.workers.UvicornWorker -b 0.0.0.0:9000" # pre-makemigrations on prod
@ -111,8 +109,9 @@ services:
# args:
# - "UID=${UID:-1000}"
# - "GID=${GID:-1000}"
image: sidp/cope2n-be-fi-sbt
# container_name: "sidp-cope2n-be-celery-sbt"
build:
context: cope2n-api
dockerfile: Dockerfile
environment:
- MEDIA_ROOT=${MEDIA_ROOT}
- PYTHONPATH=${PYTHONPATH}:/app # For import module
@ -146,19 +145,19 @@ services:
condition: service_started
volumes:
- ${HOST_MEDIA_FOLDER}:${MEDIA_ROOT}
- ./cope2n-api:/app
working_dir: /app
command: sh -c "celery -A fwd_api.celery_worker.worker worker -l INFO"
command: sh -c "celery -A fwd_api.celery_worker.worker worker -l INFO --pool=solo"
# Back-end persistent
db-sbt:
mem_reservation: 500m
mem_limit: 1g
# mem_limit: 1g
# container_name: sidp-cope2n-be-sbt-db
image: postgres:14.7-alpine
volumes:
- ./data/postgres_data:/var/lib/postgresql/data
working_dir: /workspace/cope2n-api
networks:
- ctel-sbt
environment:
@ -168,7 +167,7 @@ services:
rabbitmq-sbt:
mem_reservation: 600m
mem_limit: 4g
# mem_limit: 4g
# container_name: sidp-cope2n-be-rabbitmq-sbt
restart: always
image: rabbitmq:3.10-alpine
@ -182,6 +181,7 @@ services:
environment:
- RABBITMQ_DEFAULT_USER=${RABBITMQ_DEFAULT_USER}
- RABBITMQ_DEFAULT_PASS=${RABBITMQ_DEFAULT_PASS}
# Front-end services
fe-sbt:
build:
@ -189,7 +189,6 @@ services:
shm_size: 10gb
dockerfile: Dockerfile
shm_size: 10gb
image: sidp/cope2n-fe-fi-sbt
# container_name: "sidp-cope2n-fe-ctel-sbt"
privileged: true
ports:

invoice.jpg (new binary file, 270 KiB)


@ -146,10 +146,10 @@ def process_file(data):
}
invoice_files = [
('invoice_file', ('invoice.jpg', open("test_samples/sbt/big_image.jpg", "rb").read())),
('invoice_file', ('invoice.jpg', open("test_samples/sbt/invoice.jpg", "rb").read())),
]
imei_files = [
('imei_files', ("test_samples/sbt/imei1.jpg", open("test_samples/sbt/big_image.jpg", "rb").read())),
('imei_files', ("test_samples/sbt/imei1.jpg", open("test_samples/sbt/invoice.jpg", "rb").read())),
('imei_files', ("test_samples/sbt/imei2.jpg", open("test_samples/sbt/imei2.jpg", "rb").read())),
('imei_files', ("test_samples/sbt/imei3.jpg", open("test_samples/sbt/imei3.jpg", "rb").read())),
('imei_files', ("test_samples/sbt/imei4.jpeg", open("test_samples/sbt/imei4.jpeg", "rb").read())),

speedtest_sync.py (new file)

@ -0,0 +1,157 @@
import requests
import time
import argparse
import multiprocessing
import tqdm
import random
import traceback
parser = argparse.ArgumentParser()
parser.add_argument("--host", dest="host", default="https://sbt.idp.sdsrv.ai", required=False)
parser.add_argument("-u", "--username", help="Username to connect to server", required=True)
parser.add_argument("-p", "--password", help="Password to connect to server", required=True)
parser.add_argument("--num_requests", type=int, help="Number of requests", required=False, default=100)
parser.add_argument("--num_workers", type=int, help="Number of workers", required=False, default=3)
parser.add_argument("--checking_interval", type=float, help="Interval result checking time", required=False, default=0.5)
args = parser.parse_args()
PROCESSING_TIMEOUT = 60
# =================================================================
# GET THE TOKEN
response = requests.post(f'{args.host}/api/ctel/login/', json={
'username': args.username,
'password': args.password
})
try:
token = response.json()['token']
except:
print("Failed to login")
print(response.content)
# After the login, store the token in the memory (RAM) or DB
# Re-login to issue a new token after 6 days.
# =================================================================
def process_file(data):
files, token = data
num_files = len(files)
files.append(
('processType', (None, 12)),
)
# =================================================================
# UPLOAD THE FILE
start_time = time.time()
try:
response = requests.post(f'{args.host}/api/ctel/images/process_sync/', headers={
'Authorization': token,
}, files=files, timeout=300)
except requests.exceptions.Timeout:
print("Timeout occurred while uploading")
return {
"success": False,
"status": "timeout",
"upload_time": 0,
"process_time": 0,
"num_files": 0,
}
except Exception as e:
print(e)
traceback.print_exc()
print("Unknown exception occurred while uploading")
return {
"success": False,
"status": "unknown error",
"upload_time": 0,
"process_time": 0,
"num_files": 0,
}
end_time = time.time()
upload_time = end_time - start_time
# =================================================================
try:
data = response.json()
data.pop("files", None)
print(data)
except:
print(response.content)
return {
"success": False,
"status": "timeout",
"upload_time": 0,
"process_time": 0,
"num_files": 0,
}
return {
"success": True,
"status": 200,
"upload_time": upload_time,
"process_time": upload_time,
"num_files": num_files,
}
invoice_files = [
('invoice_file', ('invoice.pdf', open("test_samples/20220303025923NHNE_20220222_Starhub_Order_Confirmation_by_Email.pdf", "rb").read())),
]
# invoice_files = [
# ('invoice_file', ('invoice.jpg', open("test_samples/sbt/invoice.jpg", "rb").read())),
# ]
imei_files = [
('imei_files', ("test_samples/sbt/imei1.jpg", open("test_samples/sbt/invoice.jpg", "rb").read())),
('imei_files', ("test_samples/sbt/imei2.jpg", open("test_samples/sbt/imei2.jpg", "rb").read())),
('imei_files', ("test_samples/sbt/imei3.jpg", open("test_samples/sbt/imei3.jpg", "rb").read())),
('imei_files', ("test_samples/sbt/imei4.jpeg", open("test_samples/sbt/imei4.jpeg", "rb").read())),
('imei_files', ("test_samples/sbt/imei5.jpg", open("test_samples/sbt/imei5.jpg", "rb").read())),
]
def get_imei_files():
# num_files = random.randint(1, len(imei_files) + 1)
num_files = 1
print("Num imeis", num_files)
files = imei_files[:num_files]
# print("Num of imei files:", len(files))
return files
def get_files():
return invoice_files + get_imei_files()
def gen_input(num_input):
for _ in range(num_input):
yield (get_files(), token)
pool = multiprocessing.Pool(processes=args.num_workers)
results = []
for result in tqdm.tqdm(pool.imap_unordered(process_file, gen_input(num_input=args.num_requests)), total=args.num_requests):
results.append(result)
print("## TEST REPORT #################################")
print("Number of requests: {}".format(args.num_requests))
print("Number of concurrent requests: {}".format(args.num_workers))
print("Number of files: 1 invoice, 1-5 imei files (random)")
print("Query time interval for result: {:.3f}s ".format(args.checking_interval))
print("--------------------------------------")
print("SUCCESS RATE")
counter = {}
for result in results:
counter[result["status"]] = counter.get(result["status"], 0) + 1
total_requests = sum(counter.values())
print("Success rate: {}".format(counter.get(200, 0) / total_requests if total_requests > 0 else -1))
print("Statuses:", counter)
print("--------------------------------------")
print("TIME BY REQUEST")
uploading_time = [x["upload_time"] for x in results if x["success"]]
if len(uploading_time) == 0:
print("No valid uploading time")
print("Check the results!")
processing_time = [x["process_time"] for x in results if x["success"]]
print("Uploading time (Avg / Min / Max): {:.3f}s {:.3f}s {:.3f}s".format(sum(uploading_time) / len(uploading_time), min(uploading_time), max(uploading_time)))
print("Processing time (Avg / Min / Max): {:.3f}s {:.3f}s {:.3f}s".format(sum(processing_time) / len(processing_time), min(processing_time), max(processing_time)))
print("--------------------------------------")
print("TIME BY IMAGE")
uploading_time = [x["upload_time"] for x in results if x["success"]]
processing_time = [x["process_time"] for x in results if x["success"]]
num_images = sum(x["num_files"] for x in results if x["success"])
print("Total images:", num_images)
print("Uploading time: {:.3f}s".format(sum(uploading_time) / num_images))
print("Processing time: {:.3f}s".format(sum(processing_time) / num_images))
print("--------------------------------------")

test_pdf_reader.py (new file)

@ -0,0 +1,34 @@
import io
from PyPDF2 import PdfReader, PdfWriter
from PIL import Image
from pdf2image import convert_from_bytes
def get_first_page_pdf(filename, max_size=2048):
def pdf_scale_page(page, size=297):
"""Scale page to specified size mm"""
(w, h) = page.mediabox[2:]
# Units of measurement are not "points". The units of measurement are user defined and default to 1/72 inch. See section 4.2.1 Coordinate spaces of the PDF specification.
# If we have a page width of 297 mm and at points 1 inch = 25.4 mm
pmm = (1/72*25.4)
ks = size / (float(max((w, h))) * pmm)
page.scale_by(ks)
return page
reader = PdfReader(filename)
page = reader.pages[0]
scaled_page = pdf_scale_page(page, max_size)
# Create BytesIO
pdf_bytes = io.BytesIO()
dst_pdf = PdfWriter()
dst_pdf.add_page(scaled_page)
dst_pdf.write(pdf_bytes)
pdf_bytes.seek(0)
image = convert_from_bytes(pdf_bytes.read())
if isinstance(image, list):
return image[0]
return image
img = get_first_page_pdf("test_samples/20220303025923NHNE_20220222_Starhub_Order_Confirmation_by_Email.pdf", max_size=300)
img.save("invoice.jpg", "JPEG")

(binary file added, 2.5 MiB; name not shown in this view)

File diff suppressed because one or more lines are too long