Resize images for pdfs

This commit is contained in:
Viet Anh Nguyen 2023-12-13 16:01:31 +07:00
parent e0eaac3611
commit bace56baf7
29 changed files with 45612 additions and 349 deletions

1
.gitignore vendored
View File

@ -20,3 +20,4 @@ media/
postgres_data/ postgres_data/
curl.md curl.md
cope2n-api/fwd_api/commands/init_database.py cope2n-api/fwd_api/commands/init_database.py
/data

View File

@ -1,4 +1,3 @@
# FROM thucpd2408/env-cope2n:v1
FROM thucpd2408/env-deskew FROM thucpd2408/env-deskew
COPY ./packages/cudnn-linux*.tar.xz /tmp/cudnn-linux*.tar.xz COPY ./packages/cudnn-linux*.tar.xz /tmp/cudnn-linux*.tar.xz
@ -9,8 +8,7 @@ RUN tar -xvf /tmp/cudnn-linux*.tar.xz -C /tmp/ \
&& chmod a+r /usr/local/cuda/include/cudnn*.h /usr/local/cuda/lib64/libcudnn* \ && chmod a+r /usr/local/cuda/include/cudnn*.h /usr/local/cuda/lib64/libcudnn* \
&& rm -rf /tmp/cudnn-*-archive && rm -rf /tmp/cudnn-*-archive
RUN apt-get update && apt-get install -y gcc g++ ffmpeg libsm6 libxext6 RUN apt-get update && apt-get install -y gcc g++ ffmpeg libsm6 libxext6 poppler-utils
# RUN pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
WORKDIR /workspace WORKDIR /workspace
@ -25,7 +23,6 @@ RUN cd /workspace/cope2n-ai-fi/modules/ocr_engine/externals/sdsv_dewarp && pip3
RUN cd /workspace/cope2n-ai-fi/modules/ocr_engine/externals/sdsvtd && pip3 install -v -e . RUN cd /workspace/cope2n-ai-fi/modules/ocr_engine/externals/sdsvtd && pip3 install -v -e .
RUN cd /workspace/cope2n-ai-fi/modules/ocr_engine/externals/sdsvtr && pip3 install -v -e . RUN cd /workspace/cope2n-ai-fi/modules/ocr_engine/externals/sdsvtr && pip3 install -v -e .
# RUN cd /workspace/cope2n-ai-fi/modules/ocr_engine/ && pip3 install -r requirements.txt
RUN cd /workspace/cope2n-ai-fi/modules/sdsvkie && pip3 install -v -e . RUN cd /workspace/cope2n-ai-fi/modules/sdsvkie && pip3 install -v -e .
RUN cd /workspace/cope2n-ai-fi/modules/sdsvkvu && pip3 install -v -e . RUN cd /workspace/cope2n-ai-fi/modules/sdsvkvu && pip3 install -v -e .
RUN cd /workspace/cope2n-ai-fi && pip3 install -r requirements.txt RUN cd /workspace/cope2n-ai-fi && pip3 install -r requirements.txt
@ -38,6 +35,6 @@ RUN rm -f /usr/local/lib/python3.10/dist-packages/nvidia/cublas/lib/libcublasLt.
ln -s /usr/local/cuda-11.8/targets/x86_64-linux/lib/libnvblas.so.11 /usr/local/lib/python3.10/dist-packages/nvidia/cublas/lib/libnvblas.so.11 ln -s /usr/local/cuda-11.8/targets/x86_64-linux/lib/libnvblas.so.11 /usr/local/lib/python3.10/dist-packages/nvidia/cublas/lib/libnvblas.so.11
ENV PYTHONPATH="." ENV PYTHONPATH="."
ENV TZ="Asia/Ho_Chi_Minh"
CMD [ "sh", "run.sh"] CMD [ "sh", "run.sh"]
# CMD ["tail -f > /dev/null"]

View File

@ -23,4 +23,3 @@
- [ ] `Kie_Invoice_AP/prediction.py` seems to be the base function, this should act as a proxy which import all other `predict_{anything else}` functions - [ ] `Kie_Invoice_AP/prediction.py` seems to be the base function, this should act as a proxy which import all other `predict_{anything else}` functions
- [ ] There should be a unique folder to keep all models with different versions then mount as /models in container. Currently, `fi` is loading from `/models/Kie_invoice_fi` while `sap` is loading from `Kie_Invoice_AP/AnyKey_Value/experiments/key_value_understanding-20231003-171748`. Another model weight is at `sdsvtd/hub` for unknown reason - [ ] There should be a unique folder to keep all models with different versions then mount as /models in container. Currently, `fi` is loading from `/models/Kie_invoice_fi` while `sap` is loading from `Kie_Invoice_AP/AnyKey_Value/experiments/key_value_understanding-20231003-171748`. Another model weight is at `sdsvtd/hub` for unknown reason
- [ ] Env variables should have its description in README - [ ] Env variables should have its description in README
- [ ]

@ -1 +1 @@
Subproject commit e0edcd3266f59801a22eea673bce15aeeaf01f01 Subproject commit d351bb79dab7d3e449bf8ccd945a3f24f62dd33d

View File

@ -8,3 +8,6 @@ sdsvkvu
pymupdf pymupdf
easydict easydict
imagesize==1.4.1
pdf2image==1.16.3

View File

@ -5,12 +5,11 @@ ARG USERNAME=container-user
RUN groupadd --gid ${GID} ${USERNAME} \ RUN groupadd --gid ${GID} ${USERNAME} \
&& useradd --uid ${UID} --gid ${GID} -m ${USERNAME} \ && useradd --uid ${UID} --gid ${GID} -m ${USERNAME} \
&& apt-get update \ && apt-get update \
&& apt-get install -y sudo \ && apt-get install -y sudo bash gettext poppler-utils \
&& echo ${USERNAME} ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/${USERNAME} \ && echo ${USERNAME} ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/${USERNAME} \
&& chmod 0440 /etc/sudoers.d/${USERNAME} && chmod 0440 /etc/sudoers.d/${USERNAME}
RUN yes | apt install postgresql gcc musl-dev RUN yes | apt install postgresql gcc musl-dev
RUN pip install --upgrade pip RUN pip install --upgrade pip
RUN apt install bash gettext
RUN pip install uvicorn gunicorn Celery RUN pip install uvicorn gunicorn Celery
USER ${UID} USER ${UID}
@ -21,3 +20,5 @@ WORKDIR /app
RUN pip install -r requirements.txt --no-cache-dir RUN pip install -r requirements.txt --no-cache-dir
COPY --chown=${UID}:${GID} . /app COPY --chown=${UID}:${GID} . /app
ENV TZ="Asia/Ho_Chi_Minh"

View File

@ -134,11 +134,11 @@ AUTH_PASSWORD_VALIDATORS = [
# https://docs.djangoproject.com/en/4.1/topics/i18n/ # https://docs.djangoproject.com/en/4.1/topics/i18n/
LANGUAGE_CODE = "en-us" LANGUAGE_CODE = "en-us"
TIME_ZONE = "Asia/Ho_Chi_Minh"
USE_I18N = True USE_I18N = True
CELERY_ENABLE_UTC = False
CELERY_TIMEZONE = "Asia/Ho_Chi_Minh"
TIME_ZONE = "Asia/Ho_Chi_Minh"
USE_TZ = True USE_TZ = True
# Static files (CSS, JavaScript, Images) # Static files (CSS, JavaScript, Images)
@ -195,7 +195,6 @@ CORS_ORIGIN_ALLOW_ALL = True
MEDIA_ROOT = env.str("MEDIA_ROOT", default=r"/var/www/example.com/media/") MEDIA_ROOT = env.str("MEDIA_ROOT", default=r"/var/www/example.com/media/")
BROKER_URL = env.str("BROKER_URL", default="amqp://test:test@107.120.70.226:5672//") BROKER_URL = env.str("BROKER_URL", default="amqp://test:test@107.120.70.226:5672//")
CELERY_TIMEZONE = "Australia/Tasmania"
CELERY_TASK_TRACK_STARTED = True CELERY_TASK_TRACK_STARTED = True
CELERY_TASK_TIME_LIMIT = 30 * 60 CELERY_TASK_TIME_LIMIT = 30 * 60

View File

@ -68,7 +68,6 @@ class CtelTemplateViewSet(viewsets.ViewSet):
@extend_schema(request=None, responses=None, tags=['templates']) @extend_schema(request=None, responses=None, tags=['templates'])
@action(detail=False, methods=["DELETE"], url_path=r"templates/(?P<template_id>\d+)") @action(detail=False, methods=["DELETE"], url_path=r"templates/(?P<template_id>\d+)")
@throw_on_failure(InvalidException(excArgs='data')) @throw_on_failure(InvalidException(excArgs='data'))
@transaction.atomic
def delete_template(self, request, template_id=None): def delete_template(self, request, template_id=None):
user_data: UserData = ProcessUtil.get_user(request) user_data: UserData = ProcessUtil.get_user(request)
@ -112,7 +111,6 @@ class CtelTemplateViewSet(viewsets.ViewSet):
else: else:
return self.insert_template(request, user_data) return self.insert_template(request, user_data)
@transaction.atomic
def insert_template(self, request, user_data: UserData): def insert_template(self, request, user_data: UserData):
file_list = request.data.getlist('file') file_list = request.data.getlist('file')
FileUtils.validate_list_file(file_list) FileUtils.validate_list_file(file_list)
@ -148,7 +146,6 @@ class CtelTemplateViewSet(viewsets.ViewSet):
"id": template.id, "id": template.id,
}) })
@transaction.atomic
def update_template(self, request, user_data: UserData): def update_template(self, request, user_data: UserData):
# Validate # Validate
data = request.data data = request.data

View File

@ -8,12 +8,10 @@ from drf_spectacular.utils import extend_schema, OpenApiParameter, OpenApiExampl
from rest_framework import status, viewsets from rest_framework import status, viewsets
from rest_framework.decorators import action from rest_framework.decorators import action
from rest_framework.response import Response from rest_framework.response import Response
from rest_framework.decorators import authentication_classes, permission_classes
from ..annotation.api import throw_on_failure from ..annotation.api import throw_on_failure
from ..constant.common import USER_MESSAGE, EntityStatus, PLAN_MESSAGE, PlanCode from ..constant.common import USER_MESSAGE, EntityStatus, PLAN_MESSAGE, PlanCode
from ..exception.exceptions import InvalidException, NotFoundException, LockedEntityException, TrialOneException, \ from ..exception.exceptions import InvalidException, NotFoundException, LockedEntityException, TrialOneException, NotAuthenticatedException
LimitReachedException, NotAuthenticatedException
from ..models import UserProfile, PricingPlan, Subscription from ..models import UserProfile, PricingPlan, Subscription
from ..request.UpsertUserRequest import UpsertUserRequest from ..request.UpsertUserRequest import UpsertUserRequest
from ..response.SubscriptionResponse import SubscriptionResponse from ..response.SubscriptionResponse import SubscriptionResponse
@ -135,7 +133,6 @@ class CtelUserViewSet(viewsets.ViewSet):
else: else:
return self.get_user(request) return self.get_user(request)
@transaction.atomic
def upsert_user(self, request): def upsert_user(self, request):
if not hasattr(request, 'user_data'): if not hasattr(request, 'user_data'):
raise NotFoundException(excArgs=USER_MESSAGE) raise NotFoundException(excArgs=USER_MESSAGE)

View File

@ -4,7 +4,6 @@ from wsgiref.util import FileWrapper
from django.core.files.uploadedfile import TemporaryUploadedFile from django.core.files.uploadedfile import TemporaryUploadedFile
from django.http import HttpResponse, JsonResponse from django.http import HttpResponse, JsonResponse
from django.utils.crypto import get_random_string
from drf_spectacular.utils import extend_schema from drf_spectacular.utils import extend_schema
from rest_framework import status, viewsets from rest_framework import status, viewsets
from rest_framework.decorators import action from rest_framework.decorators import action
@ -16,11 +15,10 @@ from rest_framework.renderers import JSONRenderer
from fwd import settings from fwd import settings
from ..celery_worker.client_connector import c_connector from ..celery_worker.client_connector import c_connector
from ..annotation.api import throw_on_failure from ..annotation.api import throw_on_failure
from ..constant.common import allowed_p_type, ProcessType, REQUEST_ID, FOLDER_TYPE, \ from ..constant.common import ProcessType, REQUEST_ID, FOLDER_TYPE, EntityStatus, pdf_extensions, allowed_file_extensions
FolderFileType, TEMPLATE_ID, EntityStatus, pdf_extensions, allowed_file_extensions
from ..exception.exceptions import RequiredFieldException, InvalidException, NotFoundException, \ from ..exception.exceptions import RequiredFieldException, InvalidException, NotFoundException, \
PermissionDeniedException, LimitReachedException, LockedEntityException, FileContentInvalidException, ServiceTimeoutException PermissionDeniedException, LockedEntityException, FileContentInvalidException, ServiceTimeoutException
from ..models import SubscriptionRequest, UserProfile, SubscriptionRequestFile, OcrTemplate, Subscription from ..models import SubscriptionRequest, SubscriptionRequestFile, OcrTemplate
from ..response.ReportSerializer import ReportSerializer from ..response.ReportSerializer import ReportSerializer
from ..utils import FileUtils, ProcessUtil from ..utils import FileUtils, ProcessUtil
@ -87,7 +85,7 @@ class CtelViewSet(viewsets.ViewSet):
_name = f"temp_{doc_type}_{rq_id}_{i}.{_ext}" _name = f"temp_{doc_type}_{rq_id}_{i}.{_ext}"
doc_file.seek(0) doc_file.seek(0)
file_path = FileUtils.resize_and_save_file(_name, new_request, doc_file, 100) file_path = FileUtils.resize_and_save_file(_name, new_request, doc_file, 100)
S3_path = FileUtils.save_to_S3(_name, new_request, file_path) FileUtils.save_to_S3(_name, new_request, file_path)
count += 1 count += 1
this_file = { this_file = {
"file_name": _name, "file_name": _name,
@ -157,7 +155,7 @@ class CtelViewSet(viewsets.ViewSet):
_name = f"temp_{doc_type}_{rq_id}_{i}.{_ext}" _name = f"temp_{doc_type}_{rq_id}_{i}.{_ext}"
doc_file.seek(0) doc_file.seek(0)
file_path = FileUtils.resize_and_save_file(_name, new_request, doc_file, 100) file_path = FileUtils.resize_and_save_file(_name, new_request, doc_file, 100)
_ = FileUtils.save_to_S3(_name, new_request, file_path) FileUtils.save_to_S3(_name, new_request, file_path)
count += 1 count += 1
this_file = { this_file = {
"file_name": _name, "file_name": _name,
@ -167,12 +165,19 @@ class CtelViewSet(viewsets.ViewSet):
compact_files.append(this_file) compact_files.append(this_file)
c_connector.do_pdf((rq_id, sub.id, p_type, user.id, compact_files)) c_connector.do_pdf((rq_id, sub.id, p_type, user.id, compact_files))
time_out = 120
start = time.time() time_limit = 120
while time.time() - start < time_out: start_time = time.time()
time.sleep(0.1) while True:
current_time = time.time()
waiting_time = current_time - start_time
print("Waiting for: ", waiting_time)
if waiting_time > time_limit:
print("Timeout!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
break
time.sleep(0.2)
report_filter = SubscriptionRequest.objects.filter(request_id=rq_id) report_filter = SubscriptionRequest.objects.filter(request_id=rq_id)
if len(report_filter) != 1: if report_filter.count() != 1:
raise InvalidException(excArgs='requestId') raise InvalidException(excArgs='requestId')
if user_info.current_sub.id != report_filter[0].subscription.id: if user_info.current_sub.id != report_filter[0].subscription.id:
@ -191,13 +196,19 @@ class CtelViewSet(viewsets.ViewSet):
if report_filter[0].status == 400: if report_filter[0].status == 400:
raise FileContentInvalidException() raise FileContentInvalidException()
if report_filter[0].status == 100: # continue, only return when result is fullfilled if report_filter[0].status == 100: # continue, only return when result is fullfilled
print(serializer.data)
print("Status Code: 100")
continue continue
if len(serializer.data) == 0: if len(serializer.data) == 0:
print("No data found")
continue continue
if serializer.data[0].get("data", None) is None: if serializer.data[0].get("data", None) is None:
print(serializer.data[0])
print("No data[0] found")
continue continue
if serializer.data[0]["data"].get("status", 200) != 200: if serializer.data[0]["data"].get("status", 200) != 200:
print("No data status found")
continue continue
return Response(status=status.HTTP_200_OK, data=serializer.data[0]) return Response(status=status.HTTP_200_OK, data=serializer.data[0])

View File

@ -88,7 +88,7 @@ class CeleryConnector:
def send_task(self, name=None, args=None): def send_task(self, name=None, args=None):
if name not in self.task_routes or 'queue' not in self.task_routes[name]: if name not in self.task_routes or 'queue' not in self.task_routes[name]:
raise GeneralException("System") raise GeneralException("System")
return self.app.send_task(name, args, queue=self.task_routes[name]['queue']) return self.app.send_task(name, args, queue=self.task_routes[name]['queue'], expires=300)
c_connector = CeleryConnector() c_connector = CeleryConnector()

View File

@ -1,13 +1,14 @@
import time import time
import fitz
import uuid import uuid
import os import os
import base64 import base64
import traceback
from fwd_api.models import SubscriptionRequest, UserProfile from fwd_api.models import SubscriptionRequest, UserProfile
from fwd_api.celery_worker.worker import app from fwd_api.celery_worker.worker import app
from ..constant.common import FolderFileType, image_extensions from ..constant.common import FolderFileType, image_extensions
from ..exception.exceptions import FileContentInvalidException from ..exception.exceptions import FileContentInvalidException
from fwd_api.models import SubscriptionRequestFile
from ..utils import FileUtils, ProcessUtil, S3_process from ..utils import FileUtils, ProcessUtil, S3_process
from celery.utils.log import get_task_logger from celery.utils.log import get_task_logger
from fwd import settings from fwd import settings
@ -23,10 +24,7 @@ s3_client = S3_process.MinioS3Client(
) )
def process_pdf_file(file_name: str, file_path: str, request, user) -> list: def process_pdf_file(file_name: str, file_path: str, request, user) -> list:
from fwd_api.models import SubscriptionRequestFile
try: try:
doc: fitz.Document = fitz.open(stream=FileUtils.get_file(file_path).read(), filetype="pdf")
# Origin file # Origin file
new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=file_path, new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=file_path,
request=request, request=request,
@ -34,27 +32,9 @@ def process_pdf_file(file_name: str, file_path: str, request, user) -> list:
code=f'FIL{uuid.uuid4().hex}') code=f'FIL{uuid.uuid4().hex}')
new_request_file.save() new_request_file.save()
# Sub-file # Sub-file
return ProcessUtil.pdf_to_images_urls(doc, request, user) return ProcessUtil.pdf_to_images_urls(FileUtils.get_file(file_path), request, user)
except Exception as e:
request.status = 400
request.predict_result = {"status": 400, "content": "", "message": f"Unable to extract pdf files {e}"}
request.save()
return None
def process_pdf_byte(file_name: str, file_path: str, request, user, file_obj) -> list:
from fwd_api.models import SubscriptionRequestFile
doc: fitz.Document = fitz.open(stream=file_obj, filetype="pdf")
# Origin file
new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=file_path,
request=request,
file_name=file_name,
code=f'FIL{uuid.uuid4().hex}')
new_request_file.save()
try:
# Sub-file
return ProcessUtil.pdf_to_images_urls(doc, request, user)
except Exception as e: except Exception as e:
traceback.print_exc()
request.status = 400 request.status = 400
request.predict_result = {"status": 400, "content": "", "message": f"Unable to extract pdf files {e}"} request.predict_result = {"status": 400, "content": "", "message": f"Unable to extract pdf files {e}"}
request.save() request.save()
@ -62,8 +42,6 @@ def process_pdf_byte(file_name: str, file_path: str, request, user, file_obj) ->
def process_image_file(file_name: str, file_path, request, user) -> list: def process_image_file(file_name: str, file_path, request, user) -> list:
from fwd_api.models import SubscriptionRequestFile
new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=file_path, new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=file_path,
request=request, request=request,
file_name=file_name, file_name=file_name,

View File

@ -1,6 +1,10 @@
import traceback
from fwd_api.celery_worker.worker import app from fwd_api.celery_worker.worker import app
from fwd_api.models import SubscriptionRequest from fwd_api.models import SubscriptionRequest
from fwd_api.exception.exceptions import InvalidException from fwd_api.exception.exceptions import InvalidException
from fwd_api.models import SubscriptionRequest
from fwd_api.constant.common import ProcessType
def aggregate_result(src_result, des_result, doc_type): def aggregate_result(src_result, des_result, doc_type):
@ -45,11 +49,6 @@ def update_user(rq: SubscriptionRequest):
@app.task(name='process_sap_invoice_result') @app.task(name='process_sap_invoice_result')
def process_invoice_sap_result(rq_id, result): def process_invoice_sap_result(rq_id, result):
from fwd_api.models import SubscriptionRequest
from fwd_api.constant.common import ProcessType
print_id(rq_id)
try: try:
rq: SubscriptionRequest = \ rq: SubscriptionRequest = \
SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.INVOICE.value)[0] SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.INVOICE.value)[0]
@ -66,16 +65,12 @@ def process_invoice_sap_result(rq_id, result):
except Exception as e: except Exception as e:
print(e) print(e)
print("Fail Invoice %d", rq_id) print("Fail Invoice %d", rq_id)
traceback.print_exc()
return "FailInvoice" return "FailInvoice"
@app.task(name='process_fi_invoice_result') @app.task(name='process_fi_invoice_result')
def process_invoice_fi_result(rq_id, result): def process_invoice_fi_result(rq_id, result):
from fwd_api.models import SubscriptionRequest
from fwd_api.constant.common import ProcessType
print_id(rq_id)
print(result)
try: try:
rq: SubscriptionRequest = \ rq: SubscriptionRequest = \
SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.FI_INVOICE.value)[0] SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.FI_INVOICE.value)[0]
@ -92,14 +87,11 @@ def process_invoice_fi_result(rq_id, result):
except Exception as e: except Exception as e:
print(e) print(e)
print("Fail Invoice %d", rq_id) print("Fail Invoice %d", rq_id)
traceback.print_exc()
return "FailInvoice" return "FailInvoice"
@app.task(name='process_manulife_invoice_result') @app.task(name='process_manulife_invoice_result')
def process_invoice_manulife_result(rq_id, result): def process_invoice_manulife_result(rq_id, result):
from fwd_api.models import SubscriptionRequest
from fwd_api.constant.common import ProcessType
print_id(f"[DEBUG]: Received manulife request with id {rq_id}")
try: try:
rq: SubscriptionRequest = \ rq: SubscriptionRequest = \
SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.MANULIFE_INVOICE.value)[0] SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.MANULIFE_INVOICE.value)[0]
@ -116,13 +108,11 @@ def process_invoice_manulife_result(rq_id, result):
except Exception as e: except Exception as e:
print(e) print(e)
print("Fail Invoice %d", rq_id) print("Fail Invoice %d", rq_id)
traceback.print_exc()
return "FailInvoice" return "FailInvoice"
@app.task(name='process_sbt_invoice_result') @app.task(name='process_sbt_invoice_result')
def process_invoice_sbt_result(rq_id, result): def process_invoice_sbt_result(rq_id, result):
from fwd_api.models import SubscriptionRequest
from fwd_api.constant.common import ProcessType
print_id(f"[DEBUG]: Received SBT request with id {rq_id}") print_id(f"[DEBUG]: Received SBT request with id {rq_id}")
print_id(f"[DEBUG]: result: {result}") print_id(f"[DEBUG]: result: {result}")
try: try:
@ -156,205 +146,6 @@ def process_invoice_sbt_result(rq_id, result):
except Exception as e: except Exception as e:
print(e) print(e)
print("Fail Invoice %d", rq_id) print("Fail Invoice %d", rq_id)
traceback.print_exc()
return "FailInvoice" return "FailInvoice"
# @app.task(name='process_id_result', queue='id_card_rs')
# def process_id_result(rq_id, result):
# from fwd_api.models import SubscriptionRequest
# from fwd_api.constant.common import ProcessType
# from fwd_api.models import SubscriptionRequestFile
# from fwd_api.constant.common import FileCategory
# print_id(rq_id)
# try:
# s_time = time.time()
# print("Start")
# j_time = time.time()
# print("Json {}".format(j_time - s_time))
# rq: SubscriptionRequest = \
# SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.ID_CARD.value)[0]
# if 'content' in result and 'pages' in result['content']:
# pages = result['content']['pages']
# if isinstance(pages, list):
# new_pages = []
# for idx, page in enumerate(pages):
# if 'path_image_croped' in page:
# img_name = f'crop_{idx}_{get_random_string(3)}.jpg'
# path = page['path_image_croped']
# rq_file: SubscriptionRequestFile = SubscriptionRequestFile(file_name=img_name, request=rq,
# file_category=FileCategory.CROP.value,
# file_path=path,
# code=f'IDC{uuid.uuid4().hex}')
# rq_file.save()
# page['path_image_croped'] = rq_file.code
# l_time = time.time()
# print("Save {}".format(l_time - j_time))
# status = to_status(result)
# rq.predict_result = result
# rq.status = status
# rq.save()
# update_user(rq)
# e_time = time.time()
# print("End {}".format(e_time - l_time))
# except IndexError as e:
# traceback.format_exc()
# print(e)
# except Exception as e:
# traceback.format_exc()
# print(e)
# print("Fail ID %d", rq_id)
# return "Fail"
# return "Success"
# @app.task(name='process_driver_license_result')
# def process_driver_license_result(rq_id, result):
# from fwd_api.models import SubscriptionRequest
# from fwd_api.models import SubscriptionRequestFile
# from fwd_api.constant.common import FileCategory
# from fwd_api.constant.common import ProcessType
# print_id(rq_id)
# try:
# rq: SubscriptionRequest = \
# SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.DRIVER_LICENSE.value)[0]
# if 'content' in result and 'pages' in result['content']:
# pages = result['content']['pages']
# if isinstance(pages, list):
# new_pages = []
# for idx, page in enumerate(pages):
# if 'path_image_croped' in page:
# img_name = f'crop_{idx}_{get_random_string(3)}.jpg'
# path = page['path_image_croped']
# rq_file: SubscriptionRequestFile = SubscriptionRequestFile(file_name=img_name, request=rq,
# file_category=FileCategory.CROP.value,
# file_path=path,
# code=f'DLC{uuid.uuid4().hex}')
# rq_file.save()
# page['path_image_croped'] = rq_file.code
# status = to_status(result)
# rq.predict_result = result
# rq.status = status
# rq.save()
# update_user(rq)
# except IndexError as e:
# print(e)
# except Exception as e:
# print(e)
# print("Fail DL %d", rq_id)
# return "Fail"
# return "Success"
# @app.task(name='process_invoice_result')
# def process_invoice_result(rq_id, result):
# from fwd_api.models import SubscriptionRequest
# from fwd_api.constant.common import ProcessType
# print_id(rq_id)
# try:
# rq: SubscriptionRequest = \
# SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.INVOICE.value)
# print(rq)
# rq: SubscriptionRequest = \
# SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.INVOICE.value)[0]
# status = to_status(result)
# rq.predict_result = result
# rq.status = status
# rq.save()
# update_user(rq)
# except IndexError as e:
# print(e)
# print("NotFound request by requestId, %d", rq_id)
# except Exception as e:
# print(e)
# traceback.format_exc()
# print("Fail Invoice %d", rq_id)
# return "FailInvoice"
# return "Success"
# @app.task(name='process_ocr_with_box_result')
# def process_ocr_with_box_result(rq_id, result):
# from fwd_api.models import SubscriptionRequest
# from fwd_api.constant.common import ProcessType
# print_id(rq_id)
# try:
# rq: SubscriptionRequest = \
# SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.OCR_WITH_BOX.value)[0]
# status = to_status(result)
# rq.predict_result = result
# rq.status = status
# rq.save()
# update_user(rq)
# except IndexError as e:
# traceback.format_exc()
# print(e)
# except Exception as e:
# traceback.format_exc()
# print(e)
# print("Fail OCR %d", rq_id)
# return "FailOCR"
# return "Success"
# @app.task(name='process_template_matching_result')
# def template_matching_result(rq_id, result, align_img):
# from fwd_api.models import SubscriptionRequest
# from fwd_api.constant.common import ProcessType
# from fwd_api.constant.common import FileCategory
# from fwd_api.models import SubscriptionRequestFile
# print_id(rq_id)
# try:
# rq: SubscriptionRequest = \
# SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.TEMPLATE_MATCHING.value)[0]
# if align_img:
# from fwd_api.constant.common import IMAGE_NAME
# rq_file: SubscriptionRequestFile = SubscriptionRequestFile(file_name=IMAGE_NAME, request=rq,
# file_category=FileCategory.CROP.value,
# file_path=align_img)
# rq_file.save()
# status = to_status(result)
# rq.predict_result = result
# rq.status = status
# rq.save()
# update_user(rq)
# except IndexError as e:
# traceback.format_exc()
# print(e)
# except Exception as e:
# traceback.format_exc()
# print(e)
# print("Fail Template %d", rq_id)
# return "FailTemplate"
# return "Success"

View File

@ -18,11 +18,6 @@ app: Celery = Celery(
app.conf.update({ app.conf.update({
'task_queues': 'task_queues':
[ [
# Queue('id_card_rs'),
# Queue('driver_license_rs'),
# Queue('invoice_rs'),
# Queue('ocr_with_box_rs'),
# Queue('template_matching_rs'),
Queue('invoice_sap_rs'), Queue('invoice_sap_rs'),
Queue('invoice_fi_rs'), Queue('invoice_fi_rs'),
Queue('invoice_manulife_rs'), Queue('invoice_manulife_rs'),
@ -33,11 +28,6 @@ app.conf.update({
], ],
'task_routes': { 'task_routes': {
# 'process_id_result': {'queue': 'id_card_rs'},
# 'process_driver_license_result': {'queue': "driver_license_rs"},
# 'process_invoice_result': {'queue': "invoice_rs"},
# 'process_ocr_with_box_result': {'queue': "ocr_with_box_rs"},
# 'process_template_matching_result': {'queue': 'template_matching_rs'},
'process_sap_invoice_result': {'queue': 'invoice_sap_rs'}, 'process_sap_invoice_result': {'queue': 'invoice_sap_rs'},
'process_sap_invoice': {'queue': "invoice_sap"}, 'process_sap_invoice': {'queue': "invoice_sap"},
'process_fi_invoice_result': {'queue': 'invoice_fi_rs'}, 'process_fi_invoice_result': {'queue': 'invoice_fi_rs'},

View File

@ -72,10 +72,3 @@ class ReportSerializer(serializers.Serializer):
new_data.append(new_page_object) new_data.append(new_page_object)
data['pages'] = new_data data['pages'] = new_data
return data return data
# def get_predict_result(self, obj: SubscriptionRequest):
# from fwd_api.constant.common import ProcessType
# typez = int(obj.process_type)
# if typez == ProcessType.OCR_WITH_BOX.value or typez == ProcessType.TEMPLATE_MATCHING.value:
# return obj.predict_result
# return None

View File

@ -156,11 +156,9 @@ def resize_and_save_file(file_name: str, rq: SubscriptionRequest, file: Temporar
def save_to_S3(file_name, rq, local_file_path): def save_to_S3(file_name, rq, local_file_path):
try: try:
# base64_obj = base64.b64encode(obj).decode('utf-8')
file_path = get_folder_path(rq) file_path = get_folder_path(rq)
assert len(file_path.split("/")) >= 2, "file_path must have at least process type and request id" assert len(file_path.split("/")) >= 2, "file_path must have at least process type and request id"
s3_key = os.path.join(file_path.split("/")[-2], file_path.split("/")[-1], file_name) s3_key = os.path.join(file_path.split("/")[-2], file_path.split("/")[-1], file_name)
# c_connector.upload_file_to_s3((file_path, s3_key))
c_connector.upload_file_to_s3((local_file_path, s3_key)) c_connector.upload_file_to_s3((local_file_path, s3_key))
return s3_key return s3_key
except Exception as e: except Exception as e:

View File

@ -8,7 +8,9 @@ from django.core.files.uploadedfile import TemporaryUploadedFile
from django.db import transaction from django.db import transaction
from rest_framework import status from rest_framework import status
from fwd_api.utils.image import get_first_page_pdf
from fwd import settings from fwd import settings
from fwd_api.utils.image import resize
from fwd_api.constant.common import LIST_BOX_MESSAGE, pattern, NAME_MESSAGE, allowed_p_type, TEMPLATE_ID, \ from fwd_api.constant.common import LIST_BOX_MESSAGE, pattern, NAME_MESSAGE, allowed_p_type, TEMPLATE_ID, \
FolderFileType, FileCategory FolderFileType, FileCategory
from fwd_api.exception.exceptions import NumberOfBoxLimitReachedException, \ from fwd_api.exception.exceptions import NumberOfBoxLimitReachedException, \
@ -21,7 +23,6 @@ from ..models import UserProfile, OcrTemplate, OcrTemplateBox, \
Subscription, SubscriptionRequestFile, SubscriptionRequest Subscription, SubscriptionRequestFile, SubscriptionRequest
from ..celery_worker.client_connector import c_connector from ..celery_worker.client_connector import c_connector
import uuid import uuid
from PIL import Image
from celery.utils.log import get_task_logger from celery.utils.log import get_task_logger
@ -286,7 +287,6 @@ def validate_vn_and_space(txt: str):
raise InvalidException(excArgs=NAME_MESSAGE) raise InvalidException(excArgs=NAME_MESSAGE)
@transaction.atomic
def save_template_boxs(data, template): def save_template_boxs(data, template):
saving_list = [] saving_list = []
for d_box in data['data_boxs']: for d_box in data['data_boxs']:
@ -410,18 +410,15 @@ def process_image_local_file(file_name: str, file_path: str, request: Subscripti
'request_file_id': new_request_file.code 'request_file_id': new_request_file.code
}] }]
def pdf_to_images_urls(doc: fitz.Document, request: SubscriptionRequest, user, dpi: int = 300) -> list: def pdf_to_images_urls(doc_path, request: SubscriptionRequest, user, dpi: int = 300) -> list:
pdf_extracted = [] pdf_extracted = []
for idx, page in enumerate(doc):
saving_path = FileUtils.get_folder_path(request) saving_path = FileUtils.get_folder_path(request)
break_file_name = f'break_{idx}.jpg' break_file_name = f'break_0.jpg'
saving_path = os.path.join(saving_path, break_file_name) saving_path = os.path.join(saving_path, break_file_name)
page = doc.load_page(idx) image = get_first_page_pdf(doc_path, 300)
pix = page.get_pixmap(dpi=250) # render page to an image image = resize(image, max_w=settings.TARGET_MAX_IMAGE_SIZE[0], max_h=settings.TARGET_MAX_IMAGE_SIZE[1])
if pix.size > 8*3*settings.MAX_PIXEL_IN_A_FILE*settings.MAX_PIXEL_IN_A_FILE: image.save(saving_path)
raise InvalidDecompressedSizeException(excArgs=(str(pix.width), str(pix.height), str(settings.MAX_PIXEL_IN_A_FILE)))
pix.save(saving_path)
print(f"Saving {saving_path}") print(f"Saving {saving_path}")
new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=saving_path, new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=saving_path,
request=request, request=request,
@ -434,7 +431,7 @@ def pdf_to_images_urls(doc: fitz.Document, request: SubscriptionRequest, user, d
pdf_extracted.append( pdf_extracted.append(
{ {
'file_url': file_url, 'file_url': file_url,
'page_number': idx, 'page_number': 0,
'request_file_id': new_request_file.code 'request_file_id': new_request_file.code
} }
) )

View File

@ -1,9 +1,11 @@
import io
from PIL import Image from PIL import Image
from PyPDF2 import PdfReader, PdfWriter
from pdf2image import convert_from_bytes
def resize(image, max_w=2048, max_h=2048): def resize(image, max_w=2048, max_h=2048):
cur_w, cur_h = image.width, image.height cur_w = image.width
image_bytes = image.samples cur_h = image.height
image = Image.frombytes("RGB", [cur_w, cur_h], image_bytes)
if cur_h > max_w or cur_h > max_h: if cur_h > max_w or cur_h > max_h:
ratio_w = max_w/cur_w ratio_w = max_w/cur_w
ratio_h = max_h/cur_h ratio_h = max_h/cur_h
@ -11,5 +13,40 @@ def resize(image, max_w=2048, max_h=2048):
new_w = int(ratio*cur_w) new_w = int(ratio*cur_w)
new_h = int(ratio*cur_h) new_h = int(ratio*cur_h)
image = image.resize((new_w, new_h)) image = image.resize((new_w, new_h))
return image
def fitz_pixmap_to_pillow_with_resize(image, max_w=2048, max_h=2048):
    """Convert a fitz-style pixmap (has .width/.height/.samples) to a
    Pillow image, then shrink it so it fits within max_w x max_h."""
    width, height = image.width, image.height
    # `.samples` holds the raw RGB byte buffer of the rendered page.
    pil_image = Image.frombytes("RGB", (width, height), image.samples)
    return resize(pil_image, max_w, max_h)
def get_first_page_pdf(filename, max_size=300):
    """Render the first page of a PDF file as a Pillow image.

    The page is scaled so that its longest side measures ``max_size`` mm
    before rasterization, which keeps the rendered bitmap small.
    """
    def _scale_to_mm(pdf_page, size=297):
        """Scale pdf_page in place so its longest side equals ``size`` mm."""
        width, height = pdf_page.mediabox[2:]
        # PDF user-space units default to 1/72 inch; 1 inch = 25.4 mm.
        points_to_mm = (1/72*25.4)
        factor = size / (float(max((width, height))) * points_to_mm)
        pdf_page.scale_by(factor)
        return pdf_page

    first_page = _scale_to_mm(PdfReader(filename).pages[0], max_size)

    # Re-serialize the single scaled page into an in-memory PDF so
    # pdf2image can rasterize it without touching the filesystem.
    buffer = io.BytesIO()
    writer = PdfWriter()
    writer.add_page(first_page)
    writer.write(buffer)
    buffer.seek(0)

    rendered = convert_from_bytes(buffer.read())
    # pdf2image returns a list of page images; keep only the first.
    return rendered[0] if isinstance(rendered, list) else rendered

View File

@ -23,7 +23,6 @@ Jinja2==3.1.2
jsonschema==4.17.1 jsonschema==4.17.1
MarkupSafe==2.1.1 MarkupSafe==2.1.1
packaging==21.3 packaging==21.3
pdf2image==1.16.0
Pillow==9.3.0 Pillow==9.3.0
psycopg2==2.9.5 psycopg2==2.9.5
psycopg2-binary==2.9.5 psycopg2-binary==2.9.5
@ -41,7 +40,6 @@ tzdata==2022.6
uritemplate==4.1.1 uritemplate==4.1.1
urllib3==1.26.13 urllib3==1.26.13
uvicorn==0.20.0 uvicorn==0.20.0
celery~=5.2.7 celery~=5.2.7
kombu~=5.2.4 kombu~=5.2.4
PyJWT~=2.6.0 PyJWT~=2.6.0
@ -50,3 +48,4 @@ PyMuPDF==1.21.1
djangorestframework-xml==2.0.0 djangorestframework-xml==2.0.0
boto3==1.29.7 boto3==1.29.7
imagesize==1.4.1 imagesize==1.4.1
pdf2image==1.16.3

View File

@ -8,6 +8,9 @@ server {
location ~ ^/api { location ~ ^/api {
proxy_pass {{proxy_server}}; proxy_pass {{proxy_server}};
proxy_read_timeout 300;
proxy_connect_timeout 300;
proxy_send_timeout 300;
} }
location /static/drf_spectacular_sidecar/ { location /static/drf_spectacular_sidecar/ {

View File

@ -26,7 +26,7 @@ export async function createKieJob({
const response = await API.post<{ const response = await API.post<{
estimated_wating_time: number; estimated_wating_time: number;
request_id: string; request_id: string;
}>('/ctel/image/process/', formData); }>('/ctel/images/process_sync', formData);
return response.data.request_id; return response.data.request_id;
} }

View File

@ -11,8 +11,7 @@ services:
shm_size: 10gb shm_size: 10gb
dockerfile: Dockerfile dockerfile: Dockerfile
shm_size: 10gb shm_size: 10gb
image: sidp/cope2n-ai-fi-sbt # mem_limit: 8g
mem_limit: 8g
restart: always restart: always
# container_name: "sidp-cope2n-ai-fi-sbt" # container_name: "sidp-cope2n-ai-fi-sbt"
networks: networks:
@ -43,7 +42,6 @@ services:
dockerfile: Dockerfile dockerfile: Dockerfile
# ports: # ports:
# - 9880:9000 # - 9880:9000
image: sidp/cope2n-be-fi-sbt
# container_name: "sidp-cope2n-be-ctel-sbt" # container_name: "sidp-cope2n-be-ctel-sbt"
environment: environment:
- MEDIA_ROOT=${MEDIA_ROOT} - MEDIA_ROOT=${MEDIA_ROOT}
@ -71,20 +69,20 @@ services:
- S3_ACCESS_KEY=${S3_ACCESS_KEY} - S3_ACCESS_KEY=${S3_ACCESS_KEY}
- S3_SECRET_KEY=${S3_SECRET_KEY} - S3_SECRET_KEY=${S3_SECRET_KEY}
- S3_BUCKET_NAME=${S3_BUCKET_NAME} - S3_BUCKET_NAME=${S3_BUCKET_NAME}
# restart: always restart: always
networks: networks:
- ctel-sbt - ctel-sbt
volumes: volumes:
- ${HOST_MEDIA_FOLDER}:${MEDIA_ROOT} - ${HOST_MEDIA_FOLDER}:${MEDIA_ROOT}
- BE_static:/app/static - BE_static:/app/static
# - ./cope2n-api:/app - ./cope2n-api:/app
working_dir: /app working_dir: /app
depends_on: depends_on:
db-sbt: db-sbt:
condition: service_started condition: service_started
# rabbitmq: # rabbitmq:
# condition: service_started # condition: service_started
command: sh -c "python manage.py collectstatic --no-input && command: sh -c "sleep 5; python manage.py collectstatic --no-input &&
python manage.py migrate && python manage.py migrate &&
python manage.py compilemessages && python manage.py compilemessages &&
gunicorn fwd.asgi:application -k uvicorn.workers.UvicornWorker -b 0.0.0.0:9000" # pre-makemigrations on prod gunicorn fwd.asgi:application -k uvicorn.workers.UvicornWorker -b 0.0.0.0:9000" # pre-makemigrations on prod
@ -111,8 +109,9 @@ services:
# args: # args:
# - "UID=${UID:-1000}" # - "UID=${UID:-1000}"
# - "GID=${GID:-1000}" # - "GID=${GID:-1000}"
image: sidp/cope2n-be-fi-sbt build:
# container_name: "sidp-cope2n-be-celery-sbt" context: cope2n-api
dockerfile: Dockerfile
environment: environment:
- MEDIA_ROOT=${MEDIA_ROOT} - MEDIA_ROOT=${MEDIA_ROOT}
- PYTHONPATH=${PYTHONPATH}:/app # For import module - PYTHONPATH=${PYTHONPATH}:/app # For import module
@ -146,19 +145,19 @@ services:
condition: service_started condition: service_started
volumes: volumes:
- ${HOST_MEDIA_FOLDER}:${MEDIA_ROOT} - ${HOST_MEDIA_FOLDER}:${MEDIA_ROOT}
- ./cope2n-api:/app
working_dir: /app working_dir: /app
command: sh -c "celery -A fwd_api.celery_worker.worker worker -l INFO" command: sh -c "celery -A fwd_api.celery_worker.worker worker -l INFO --pool=solo"
# Back-end persistent # Back-end persistent
db-sbt: db-sbt:
mem_reservation: 500m mem_reservation: 500m
mem_limit: 1g # mem_limit: 1g
# container_name: sidp-cope2n-be-sbt-db # container_name: sidp-cope2n-be-sbt-db
image: postgres:14.7-alpine image: postgres:14.7-alpine
volumes: volumes:
- ./data/postgres_data:/var/lib/postgresql/data - ./data/postgres_data:/var/lib/postgresql/data
working_dir: /workspace/cope2n-api
networks: networks:
- ctel-sbt - ctel-sbt
environment: environment:
@ -168,7 +167,7 @@ services:
rabbitmq-sbt: rabbitmq-sbt:
mem_reservation: 600m mem_reservation: 600m
mem_limit: 4g # mem_limit: 4g
# container_name: sidp-cope2n-be-rabbitmq-sbt # container_name: sidp-cope2n-be-rabbitmq-sbt
restart: always restart: always
image: rabbitmq:3.10-alpine image: rabbitmq:3.10-alpine
@ -182,6 +181,7 @@ services:
environment: environment:
- RABBITMQ_DEFAULT_USER=${RABBITMQ_DEFAULT_USER} - RABBITMQ_DEFAULT_USER=${RABBITMQ_DEFAULT_USER}
- RABBITMQ_DEFAULT_PASS=${RABBITMQ_DEFAULT_PASS} - RABBITMQ_DEFAULT_PASS=${RABBITMQ_DEFAULT_PASS}
# Front-end services # Front-end services
fe-sbt: fe-sbt:
build: build:
@ -189,7 +189,6 @@ services:
shm_size: 10gb shm_size: 10gb
dockerfile: Dockerfile dockerfile: Dockerfile
shm_size: 10gb shm_size: 10gb
image: sidp/cope2n-fe-fi-sbt
# container_name: "sidp-cope2n-fe-ctel-sbt" # container_name: "sidp-cope2n-fe-ctel-sbt"
privileged: true privileged: true
ports: ports:

BIN
invoice.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 270 KiB

View File

@ -146,10 +146,10 @@ def process_file(data):
} }
invoice_files = [ invoice_files = [
('invoice_file', ('invoice.jpg', open("test_samples/sbt/big_image.jpg", "rb").read())), ('invoice_file', ('invoice.jpg', open("test_samples/sbt/invoice.jpg", "rb").read())),
] ]
imei_files = [ imei_files = [
('imei_files', ("test_samples/sbt/imei1.jpg", open("test_samples/sbt/big_image.jpg", "rb").read())), ('imei_files', ("test_samples/sbt/imei1.jpg", open("test_samples/sbt/invoice.jpg", "rb").read())),
('imei_files', ("test_samples/sbt/imei2.jpg", open("test_samples/sbt/imei2.jpg", "rb").read())), ('imei_files', ("test_samples/sbt/imei2.jpg", open("test_samples/sbt/imei2.jpg", "rb").read())),
('imei_files', ("test_samples/sbt/imei3.jpg", open("test_samples/sbt/imei3.jpg", "rb").read())), ('imei_files', ("test_samples/sbt/imei3.jpg", open("test_samples/sbt/imei3.jpg", "rb").read())),
('imei_files', ("test_samples/sbt/imei4.jpeg", open("test_samples/sbt/imei4.jpeg", "rb").read())), ('imei_files', ("test_samples/sbt/imei4.jpeg", open("test_samples/sbt/imei4.jpeg", "rb").read())),

157
speedtest_sync.py Normal file
View File

@ -0,0 +1,157 @@
import requests
import time
import argparse
import multiprocessing
import tqdm
import random
import traceback
# CLI options for the synchronous speed test against the SBT API.
parser = argparse.ArgumentParser()
parser.add_argument("--host", dest="host", default="https://sbt.idp.sdsrv.ai", required=False)
parser.add_argument("-u", "--username", help="Username to connect to server", required=True)
parser.add_argument("-p", "--password", help="Password to connect to server", required=True)
parser.add_argument("--num_requests", type=int, help="Number of requests", required=False, default=100)
parser.add_argument("--num_workers", type=int, help="Number of workers", required=False, default=3)
parser.add_argument("--checking_interval", type=float, help="Interval result checking time", required=False, default=0.5)
args = parser.parse_args()
# Presumably a per-request processing ceiling in seconds.
# NOTE(review): appears unused in this script — confirm before relying on it.
PROCESSING_TIMEOUT = 60
# =================================================================
# GET THE TOKEN
# Authenticate once up front; every subsequent request reuses this token.
response = requests.post(f'{args.host}/api/ctel/login/', json={
    'username': args.username,
    'password': args.password
})
try:
    token = response.json()['token']
except (ValueError, KeyError):
    # Without a token every later request would fail (and `token` would be
    # undefined, raising NameError); stop immediately instead of limping on.
    print("Failed to login")
    print(response.content)
    raise SystemExit(1)
# After the login, store the token in the memory (RAM) or DB
# Re-login to issue a new token after 6 days.
# =================================================================
def process_file(data):
    """Upload one batch of files to the synchronous processing endpoint.

    ``data`` is a ``(files, token)`` tuple as produced by ``gen_input``.
    Returns a result dict with a success flag, a status label, upload and
    processing timings, and the number of files sent.
    """
    def _failure(status):
        # Uniform shape for every failure result so the aggregation code
        # at the bottom of the script can treat all outcomes identically.
        return {
            "success": False,
            "status": status,
            "upload_time": 0,
            "process_time": 0,
            "num_files": 0,
        }

    files, token = data
    num_files = len(files)
    # Build a new list instead of appending, so the caller's list of file
    # tuples is not mutated as a side effect.
    files = files + [('processType', (None, 12))]

    # =================================================================
    # UPLOAD THE FILE
    start_time = time.time()
    try:
        response = requests.post(
            f'{args.host}/api/ctel/images/process_sync/',
            headers={'Authorization': token},
            files=files,
            timeout=300,
        )
    except requests.exceptions.Timeout:
        print("Timeout occurred while uploading")
        return _failure("timeout")
    except Exception as e:
        print(e)
        traceback.print_exc()
        print("Unknown exception occurred while uploading")
        return _failure("unknown error")
    upload_time = time.time() - start_time
    # =================================================================

    try:
        body = response.json()
        body.pop("files", None)
        print(body)
    except ValueError:
        # A non-JSON body means the server returned an error page; label it
        # accurately instead of the previous misleading "timeout" status.
        print(response.content)
        return _failure("invalid response")

    return {
        "success": True,
        "status": 200,
        "upload_time": upload_time,
        # The sync endpoint processes during the upload, so the two timings
        # are the same measurement.
        "process_time": upload_time,
        "num_files": num_files,
    }
invoice_files = [
('invoice_file', ('invoice.pdf', open("test_samples/20220303025923NHNE_20220222_Starhub_Order_Confirmation_by_Email.pdf", "rb").read())),
]
# invoice_files = [
# ('invoice_file', ('invoice.jpg', open("test_samples/sbt/invoice.jpg", "rb").read())),
# ]
imei_files = [
('imei_files', ("test_samples/sbt/imei1.jpg", open("test_samples/sbt/invoice.jpg", "rb").read())),
('imei_files', ("test_samples/sbt/imei2.jpg", open("test_samples/sbt/imei2.jpg", "rb").read())),
('imei_files', ("test_samples/sbt/imei3.jpg", open("test_samples/sbt/imei3.jpg", "rb").read())),
('imei_files', ("test_samples/sbt/imei4.jpeg", open("test_samples/sbt/imei4.jpeg", "rb").read())),
('imei_files', ("test_samples/sbt/imei5.jpg", open("test_samples/sbt/imei5.jpg", "rb").read())),
]
def get_imei_files():
    """Return the subset of IMEI sample files to attach to one request."""
    # Randomized count is disabled for now; always send a single IMEI image.
    # num_files = random.randint(1, len(imei_files) + 1)
    count = 1
    print("Num imeis", count)
    return imei_files[:count]
def get_files():
    """Combine the invoice sample with the selected IMEI samples."""
    return [*invoice_files, *get_imei_files()]
def gen_input(num_input):
    """Lazily produce ``num_input`` upload payloads for the worker pool."""
    produced = 0
    while produced < num_input:
        yield (get_files(), token)
        produced += 1
# Fan the requests out over a worker pool. The context manager guarantees
# the pool is terminated even if a worker raises (previously it was never
# closed at all).
with multiprocessing.Pool(processes=args.num_workers) as pool:
    results = list(tqdm.tqdm(
        pool.imap_unordered(process_file, gen_input(num_input=args.num_requests)),
        total=args.num_requests,
    ))

print("## TEST REPORT #################################")
print("Number of requests: {}".format(args.num_requests))
print("Number of concurrent requests: {}".format(args.num_workers))
print("Number of files: 1 invoice, 1-5 imei files (random)")
print("Query time interval for result: {:.3f}s ".format(args.checking_interval))
print("--------------------------------------")
print("SUCCESS RATE")
counter = {}
for result in results:
    counter[result["status"]] = counter.get(result["status"], 0) + 1
total_requests = sum(counter.values())
print("Success rate: {}".format(counter.get(200, 0) / total_requests if total_requests > 0 else -1))
print("Statuses:", counter)
print("--------------------------------------")
print("TIME BY REQUEST")
uploading_time = [x["upload_time"] for x in results if x["success"]]
processing_time = [x["process_time"] for x in results if x["success"]]
if not uploading_time:
    # Guard: with zero successful requests every average below would divide
    # by zero. Previously this branch only printed a warning and then
    # crashed on the next line.
    print("No valid uploading time")
    print("Check the results!")
else:
    print("Uploading time (Avg / Min / Max): {:.3f}s {:.3f}s {:.3f}s".format(
        sum(uploading_time) / len(uploading_time), min(uploading_time), max(uploading_time)))
    print("Processing time (Avg / Min / Max): {:.3f}s {:.3f}s {:.3f}s".format(
        sum(processing_time) / len(processing_time), min(processing_time), max(processing_time)))
    print("--------------------------------------")
    print("TIME BY IMAGE")
    num_images = sum(x["num_files"] for x in results if x["success"])
    print("Total images:", num_images)
    if num_images > 0:
        print("Uploading time: {:.3f}s".format(sum(uploading_time) / num_images))
        print("Processing time: {:.3f}s".format(sum(processing_time) / num_images))
print("--------------------------------------")

34
test_pdf_reader.py Normal file
View File

@ -0,0 +1,34 @@
from PyPDF2 import PdfReader, PdfWriter
from PIL import Image
from pdf2image import convert_from_bytes
def get_first_page_pdf(filename, max_size=2048):
    """Render the first page of ``filename`` (a PDF) as a Pillow image.

    The page is scaled so its longest side measures ``max_size`` mm before
    being rasterized, keeping the output bitmap small.
    """
    # Fix: this module never imports `io`, so `io.BytesIO` below raised
    # NameError at runtime; import it locally to keep the fix self-contained.
    import io

    def pdf_scale_page(page, size=297):
        """Scale page to specified size mm"""
        (w, h) = page.mediabox[2:]
        # PDF user-space units default to 1/72 inch; 1 inch = 25.4 mm.
        pmm = (1/72*25.4)
        ks = size / (float(max((w, h))) * pmm)
        page.scale_by(ks)
        return page

    reader = PdfReader(filename)
    page = reader.pages[0]
    scaled_page = pdf_scale_page(page, max_size)

    # Write the single scaled page into an in-memory PDF for pdf2image.
    pdf_bytes = io.BytesIO()
    dst_pdf = PdfWriter()
    dst_pdf.add_page(scaled_page)
    dst_pdf.write(pdf_bytes)
    pdf_bytes.seek(0)

    image = convert_from_bytes(pdf_bytes.read())
    # pdf2image returns a list of page images; keep only the first.
    if isinstance(image, list):
        return image[0]
    return image
img = get_first_page_pdf("test_samples/20220303025923NHNE_20220222_Starhub_Order_Confirmation_by_Email.pdf", max_size=300)
img.save("invoice.jpg", "JPEG")

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.5 MiB

File diff suppressed because one or more lines are too long