Resize images for pdfs
This commit is contained in:
parent
e0eaac3611
commit
bace56baf7
1
.gitignore
vendored
1
.gitignore
vendored
@ -20,3 +20,4 @@ media/
|
|||||||
postgres_data/
|
postgres_data/
|
||||||
curl.md
|
curl.md
|
||||||
cope2n-api/fwd_api/commands/init_database.py
|
cope2n-api/fwd_api/commands/init_database.py
|
||||||
|
/data
|
@ -1,4 +1,3 @@
|
|||||||
# FROM thucpd2408/env-cope2n:v1
|
|
||||||
FROM thucpd2408/env-deskew
|
FROM thucpd2408/env-deskew
|
||||||
|
|
||||||
COPY ./packages/cudnn-linux*.tar.xz /tmp/cudnn-linux*.tar.xz
|
COPY ./packages/cudnn-linux*.tar.xz /tmp/cudnn-linux*.tar.xz
|
||||||
@ -9,8 +8,7 @@ RUN tar -xvf /tmp/cudnn-linux*.tar.xz -C /tmp/ \
|
|||||||
&& chmod a+r /usr/local/cuda/include/cudnn*.h /usr/local/cuda/lib64/libcudnn* \
|
&& chmod a+r /usr/local/cuda/include/cudnn*.h /usr/local/cuda/lib64/libcudnn* \
|
||||||
&& rm -rf /tmp/cudnn-*-archive
|
&& rm -rf /tmp/cudnn-*-archive
|
||||||
|
|
||||||
RUN apt-get update && apt-get install -y gcc g++ ffmpeg libsm6 libxext6
|
RUN apt-get update && apt-get install -y gcc g++ ffmpeg libsm6 libxext6 poppler-utils
|
||||||
# RUN pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
|
|
||||||
|
|
||||||
WORKDIR /workspace
|
WORKDIR /workspace
|
||||||
|
|
||||||
@ -25,7 +23,6 @@ RUN cd /workspace/cope2n-ai-fi/modules/ocr_engine/externals/sdsv_dewarp && pip3
|
|||||||
RUN cd /workspace/cope2n-ai-fi/modules/ocr_engine/externals/sdsvtd && pip3 install -v -e .
|
RUN cd /workspace/cope2n-ai-fi/modules/ocr_engine/externals/sdsvtd && pip3 install -v -e .
|
||||||
RUN cd /workspace/cope2n-ai-fi/modules/ocr_engine/externals/sdsvtr && pip3 install -v -e .
|
RUN cd /workspace/cope2n-ai-fi/modules/ocr_engine/externals/sdsvtr && pip3 install -v -e .
|
||||||
|
|
||||||
# RUN cd /workspace/cope2n-ai-fi/modules/ocr_engine/ && pip3 install -r requirements.txt
|
|
||||||
RUN cd /workspace/cope2n-ai-fi/modules/sdsvkie && pip3 install -v -e .
|
RUN cd /workspace/cope2n-ai-fi/modules/sdsvkie && pip3 install -v -e .
|
||||||
RUN cd /workspace/cope2n-ai-fi/modules/sdsvkvu && pip3 install -v -e .
|
RUN cd /workspace/cope2n-ai-fi/modules/sdsvkvu && pip3 install -v -e .
|
||||||
RUN cd /workspace/cope2n-ai-fi && pip3 install -r requirements.txt
|
RUN cd /workspace/cope2n-ai-fi && pip3 install -r requirements.txt
|
||||||
@ -38,6 +35,6 @@ RUN rm -f /usr/local/lib/python3.10/dist-packages/nvidia/cublas/lib/libcublasLt.
|
|||||||
ln -s /usr/local/cuda-11.8/targets/x86_64-linux/lib/libnvblas.so.11 /usr/local/lib/python3.10/dist-packages/nvidia/cublas/lib/libnvblas.so.11
|
ln -s /usr/local/cuda-11.8/targets/x86_64-linux/lib/libnvblas.so.11 /usr/local/lib/python3.10/dist-packages/nvidia/cublas/lib/libnvblas.so.11
|
||||||
|
|
||||||
ENV PYTHONPATH="."
|
ENV PYTHONPATH="."
|
||||||
|
ENV TZ="Asia/Ho_Chi_Minh"
|
||||||
|
|
||||||
CMD [ "sh", "run.sh"]
|
CMD [ "sh", "run.sh"]
|
||||||
# CMD ["tail -f > /dev/null"]
|
|
@ -23,4 +23,3 @@
|
|||||||
- [ ] `Kie_Invoice_AP/prediction.py` seems to be the base function; it should act as a proxy that imports all the other `predict_{anything else}` functions
|
- [ ] `Kie_Invoice_AP/prediction.py` seems to be the base function; it should act as a proxy that imports all the other `predict_{anything else}` functions
|
||||||
- [ ] There should be a single folder that keeps all models and their versions, mounted as `/models` in the container. Currently, `fi` loads from `/models/Kie_invoice_fi` while `sap` loads from `Kie_Invoice_AP/AnyKey_Value/experiments/key_value_understanding-20231003-171748`. Another model weight is at `sdsvtd/hub` for an unknown reason
|
- [ ] There should be a single folder that keeps all models and their versions, mounted as `/models` in the container. Currently, `fi` loads from `/models/Kie_invoice_fi` while `sap` loads from `Kie_Invoice_AP/AnyKey_Value/experiments/key_value_understanding-20231003-171748`. Another model weight is at `sdsvtd/hub` for an unknown reason
|
||||||
- [ ] Env variables should have their descriptions in the README
|
- [ ] Env variables should have their descriptions in the README
|
||||||
- [ ]
|
|
||||||
|
@ -1 +1 @@
|
|||||||
Subproject commit e0edcd3266f59801a22eea673bce15aeeaf01f01
|
Subproject commit d351bb79dab7d3e449bf8ccd945a3f24f62dd33d
|
@ -8,3 +8,6 @@ sdsvkvu
|
|||||||
|
|
||||||
pymupdf
|
pymupdf
|
||||||
easydict
|
easydict
|
||||||
|
|
||||||
|
imagesize==1.4.1
|
||||||
|
pdf2image==1.16.3
|
@ -5,12 +5,11 @@ ARG USERNAME=container-user
|
|||||||
RUN groupadd --gid ${GID} ${USERNAME} \
|
RUN groupadd --gid ${GID} ${USERNAME} \
|
||||||
&& useradd --uid ${UID} --gid ${GID} -m ${USERNAME} \
|
&& useradd --uid ${UID} --gid ${GID} -m ${USERNAME} \
|
||||||
&& apt-get update \
|
&& apt-get update \
|
||||||
&& apt-get install -y sudo \
|
&& apt-get install -y sudo bash gettext poppler-utils \
|
||||||
&& echo ${USERNAME} ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/${USERNAME} \
|
&& echo ${USERNAME} ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/${USERNAME} \
|
||||||
&& chmod 0440 /etc/sudoers.d/${USERNAME}
|
&& chmod 0440 /etc/sudoers.d/${USERNAME}
|
||||||
RUN yes | apt install postgresql gcc musl-dev
|
RUN yes | apt install postgresql gcc musl-dev
|
||||||
RUN pip install --upgrade pip
|
RUN pip install --upgrade pip
|
||||||
RUN apt install bash gettext
|
|
||||||
RUN pip install uvicorn gunicorn Celery
|
RUN pip install uvicorn gunicorn Celery
|
||||||
|
|
||||||
USER ${UID}
|
USER ${UID}
|
||||||
@ -21,3 +20,5 @@ WORKDIR /app
|
|||||||
RUN pip install -r requirements.txt --no-cache-dir
|
RUN pip install -r requirements.txt --no-cache-dir
|
||||||
|
|
||||||
COPY --chown=${UID}:${GID} . /app
|
COPY --chown=${UID}:${GID} . /app
|
||||||
|
|
||||||
|
ENV TZ="Asia/Ho_Chi_Minh"
|
||||||
|
@ -134,11 +134,11 @@ AUTH_PASSWORD_VALIDATORS = [
|
|||||||
# https://docs.djangoproject.com/en/4.1/topics/i18n/
|
# https://docs.djangoproject.com/en/4.1/topics/i18n/
|
||||||
|
|
||||||
LANGUAGE_CODE = "en-us"
|
LANGUAGE_CODE = "en-us"
|
||||||
|
|
||||||
TIME_ZONE = "Asia/Ho_Chi_Minh"
|
|
||||||
|
|
||||||
USE_I18N = True
|
USE_I18N = True
|
||||||
|
|
||||||
|
CELERY_ENABLE_UTC = False
|
||||||
|
CELERY_TIMEZONE = "Asia/Ho_Chi_Minh"
|
||||||
|
TIME_ZONE = "Asia/Ho_Chi_Minh"
|
||||||
USE_TZ = True
|
USE_TZ = True
|
||||||
|
|
||||||
# Static files (CSS, JavaScript, Images)
|
# Static files (CSS, JavaScript, Images)
|
||||||
@ -195,7 +195,6 @@ CORS_ORIGIN_ALLOW_ALL = True
|
|||||||
|
|
||||||
MEDIA_ROOT = env.str("MEDIA_ROOT", default=r"/var/www/example.com/media/")
|
MEDIA_ROOT = env.str("MEDIA_ROOT", default=r"/var/www/example.com/media/")
|
||||||
BROKER_URL = env.str("BROKER_URL", default="amqp://test:test@107.120.70.226:5672//")
|
BROKER_URL = env.str("BROKER_URL", default="amqp://test:test@107.120.70.226:5672//")
|
||||||
CELERY_TIMEZONE = "Australia/Tasmania"
|
|
||||||
CELERY_TASK_TRACK_STARTED = True
|
CELERY_TASK_TRACK_STARTED = True
|
||||||
CELERY_TASK_TIME_LIMIT = 30 * 60
|
CELERY_TASK_TIME_LIMIT = 30 * 60
|
||||||
|
|
||||||
|
@ -68,7 +68,6 @@ class CtelTemplateViewSet(viewsets.ViewSet):
|
|||||||
@extend_schema(request=None, responses=None, tags=['templates'])
|
@extend_schema(request=None, responses=None, tags=['templates'])
|
||||||
@action(detail=False, methods=["DELETE"], url_path=r"templates/(?P<template_id>\d+)")
|
@action(detail=False, methods=["DELETE"], url_path=r"templates/(?P<template_id>\d+)")
|
||||||
@throw_on_failure(InvalidException(excArgs='data'))
|
@throw_on_failure(InvalidException(excArgs='data'))
|
||||||
@transaction.atomic
|
|
||||||
def delete_template(self, request, template_id=None):
|
def delete_template(self, request, template_id=None):
|
||||||
user_data: UserData = ProcessUtil.get_user(request)
|
user_data: UserData = ProcessUtil.get_user(request)
|
||||||
|
|
||||||
@ -112,7 +111,6 @@ class CtelTemplateViewSet(viewsets.ViewSet):
|
|||||||
else:
|
else:
|
||||||
return self.insert_template(request, user_data)
|
return self.insert_template(request, user_data)
|
||||||
|
|
||||||
@transaction.atomic
|
|
||||||
def insert_template(self, request, user_data: UserData):
|
def insert_template(self, request, user_data: UserData):
|
||||||
file_list = request.data.getlist('file')
|
file_list = request.data.getlist('file')
|
||||||
FileUtils.validate_list_file(file_list)
|
FileUtils.validate_list_file(file_list)
|
||||||
@ -148,7 +146,6 @@ class CtelTemplateViewSet(viewsets.ViewSet):
|
|||||||
"id": template.id,
|
"id": template.id,
|
||||||
})
|
})
|
||||||
|
|
||||||
@transaction.atomic
|
|
||||||
def update_template(self, request, user_data: UserData):
|
def update_template(self, request, user_data: UserData):
|
||||||
# Validate
|
# Validate
|
||||||
data = request.data
|
data = request.data
|
||||||
|
@ -8,12 +8,10 @@ from drf_spectacular.utils import extend_schema, OpenApiParameter, OpenApiExampl
|
|||||||
from rest_framework import status, viewsets
|
from rest_framework import status, viewsets
|
||||||
from rest_framework.decorators import action
|
from rest_framework.decorators import action
|
||||||
from rest_framework.response import Response
|
from rest_framework.response import Response
|
||||||
from rest_framework.decorators import authentication_classes, permission_classes
|
|
||||||
|
|
||||||
from ..annotation.api import throw_on_failure
|
from ..annotation.api import throw_on_failure
|
||||||
from ..constant.common import USER_MESSAGE, EntityStatus, PLAN_MESSAGE, PlanCode
|
from ..constant.common import USER_MESSAGE, EntityStatus, PLAN_MESSAGE, PlanCode
|
||||||
from ..exception.exceptions import InvalidException, NotFoundException, LockedEntityException, TrialOneException, \
|
from ..exception.exceptions import InvalidException, NotFoundException, LockedEntityException, TrialOneException, NotAuthenticatedException
|
||||||
LimitReachedException, NotAuthenticatedException
|
|
||||||
from ..models import UserProfile, PricingPlan, Subscription
|
from ..models import UserProfile, PricingPlan, Subscription
|
||||||
from ..request.UpsertUserRequest import UpsertUserRequest
|
from ..request.UpsertUserRequest import UpsertUserRequest
|
||||||
from ..response.SubscriptionResponse import SubscriptionResponse
|
from ..response.SubscriptionResponse import SubscriptionResponse
|
||||||
@ -135,7 +133,6 @@ class CtelUserViewSet(viewsets.ViewSet):
|
|||||||
else:
|
else:
|
||||||
return self.get_user(request)
|
return self.get_user(request)
|
||||||
|
|
||||||
@transaction.atomic
|
|
||||||
def upsert_user(self, request):
|
def upsert_user(self, request):
|
||||||
if not hasattr(request, 'user_data'):
|
if not hasattr(request, 'user_data'):
|
||||||
raise NotFoundException(excArgs=USER_MESSAGE)
|
raise NotFoundException(excArgs=USER_MESSAGE)
|
||||||
|
@ -4,23 +4,21 @@ from wsgiref.util import FileWrapper
|
|||||||
|
|
||||||
from django.core.files.uploadedfile import TemporaryUploadedFile
|
from django.core.files.uploadedfile import TemporaryUploadedFile
|
||||||
from django.http import HttpResponse, JsonResponse
|
from django.http import HttpResponse, JsonResponse
|
||||||
from django.utils.crypto import get_random_string
|
|
||||||
from drf_spectacular.utils import extend_schema
|
from drf_spectacular.utils import extend_schema
|
||||||
from rest_framework import status, viewsets
|
from rest_framework import status, viewsets
|
||||||
from rest_framework.decorators import action
|
from rest_framework.decorators import action
|
||||||
from rest_framework.response import Response
|
from rest_framework.response import Response
|
||||||
from typing import List
|
from typing import List
|
||||||
from rest_framework.renderers import JSONRenderer
|
from rest_framework.renderers import JSONRenderer
|
||||||
from rest_framework_xml.renderers import XMLRenderer
|
from rest_framework_xml.renderers import XMLRenderer
|
||||||
|
|
||||||
from fwd import settings
|
from fwd import settings
|
||||||
from ..celery_worker.client_connector import c_connector
|
from ..celery_worker.client_connector import c_connector
|
||||||
from ..annotation.api import throw_on_failure
|
from ..annotation.api import throw_on_failure
|
||||||
from ..constant.common import allowed_p_type, ProcessType, REQUEST_ID, FOLDER_TYPE, \
|
from ..constant.common import ProcessType, REQUEST_ID, FOLDER_TYPE, EntityStatus, pdf_extensions, allowed_file_extensions
|
||||||
FolderFileType, TEMPLATE_ID, EntityStatus, pdf_extensions, allowed_file_extensions
|
|
||||||
from ..exception.exceptions import RequiredFieldException, InvalidException, NotFoundException, \
|
from ..exception.exceptions import RequiredFieldException, InvalidException, NotFoundException, \
|
||||||
PermissionDeniedException, LimitReachedException, LockedEntityException, FileContentInvalidException, ServiceTimeoutException
|
PermissionDeniedException, LockedEntityException, FileContentInvalidException, ServiceTimeoutException
|
||||||
from ..models import SubscriptionRequest, UserProfile, SubscriptionRequestFile, OcrTemplate, Subscription
|
from ..models import SubscriptionRequest, SubscriptionRequestFile, OcrTemplate
|
||||||
from ..response.ReportSerializer import ReportSerializer
|
from ..response.ReportSerializer import ReportSerializer
|
||||||
from ..utils import FileUtils, ProcessUtil
|
from ..utils import FileUtils, ProcessUtil
|
||||||
|
|
||||||
@ -87,7 +85,7 @@ class CtelViewSet(viewsets.ViewSet):
|
|||||||
_name = f"temp_{doc_type}_{rq_id}_{i}.{_ext}"
|
_name = f"temp_{doc_type}_{rq_id}_{i}.{_ext}"
|
||||||
doc_file.seek(0)
|
doc_file.seek(0)
|
||||||
file_path = FileUtils.resize_and_save_file(_name, new_request, doc_file, 100)
|
file_path = FileUtils.resize_and_save_file(_name, new_request, doc_file, 100)
|
||||||
S3_path = FileUtils.save_to_S3(_name, new_request, file_path)
|
FileUtils.save_to_S3(_name, new_request, file_path)
|
||||||
count += 1
|
count += 1
|
||||||
this_file = {
|
this_file = {
|
||||||
"file_name": _name,
|
"file_name": _name,
|
||||||
@ -157,7 +155,7 @@ class CtelViewSet(viewsets.ViewSet):
|
|||||||
_name = f"temp_{doc_type}_{rq_id}_{i}.{_ext}"
|
_name = f"temp_{doc_type}_{rq_id}_{i}.{_ext}"
|
||||||
doc_file.seek(0)
|
doc_file.seek(0)
|
||||||
file_path = FileUtils.resize_and_save_file(_name, new_request, doc_file, 100)
|
file_path = FileUtils.resize_and_save_file(_name, new_request, doc_file, 100)
|
||||||
_ = FileUtils.save_to_S3(_name, new_request, file_path)
|
FileUtils.save_to_S3(_name, new_request, file_path)
|
||||||
count += 1
|
count += 1
|
||||||
this_file = {
|
this_file = {
|
||||||
"file_name": _name,
|
"file_name": _name,
|
||||||
@ -167,12 +165,19 @@ class CtelViewSet(viewsets.ViewSet):
|
|||||||
compact_files.append(this_file)
|
compact_files.append(this_file)
|
||||||
c_connector.do_pdf((rq_id, sub.id, p_type, user.id, compact_files))
|
c_connector.do_pdf((rq_id, sub.id, p_type, user.id, compact_files))
|
||||||
|
|
||||||
time_out = 120
|
|
||||||
start = time.time()
|
time_limit = 120
|
||||||
while time.time() - start < time_out:
|
start_time = time.time()
|
||||||
time.sleep(0.1)
|
while True:
|
||||||
|
current_time = time.time()
|
||||||
|
waiting_time = current_time - start_time
|
||||||
|
print("Waiting for: ", waiting_time)
|
||||||
|
if waiting_time > time_limit:
|
||||||
|
print("Timeout!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
|
||||||
|
break
|
||||||
|
time.sleep(0.2)
|
||||||
report_filter = SubscriptionRequest.objects.filter(request_id=rq_id)
|
report_filter = SubscriptionRequest.objects.filter(request_id=rq_id)
|
||||||
if len(report_filter) != 1:
|
if report_filter.count() != 1:
|
||||||
raise InvalidException(excArgs='requestId')
|
raise InvalidException(excArgs='requestId')
|
||||||
|
|
||||||
if user_info.current_sub.id != report_filter[0].subscription.id:
|
if user_info.current_sub.id != report_filter[0].subscription.id:
|
||||||
@ -191,13 +196,19 @@ class CtelViewSet(viewsets.ViewSet):
|
|||||||
if report_filter[0].status == 400:
|
if report_filter[0].status == 400:
|
||||||
raise FileContentInvalidException()
|
raise FileContentInvalidException()
|
||||||
if report_filter[0].status == 100: # continue, only return when result is fullfilled
|
if report_filter[0].status == 100: # continue, only return when result is fullfilled
|
||||||
|
print(serializer.data)
|
||||||
|
print("Status Code: 100")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if len(serializer.data) == 0:
|
if len(serializer.data) == 0:
|
||||||
|
print("No data found")
|
||||||
continue
|
continue
|
||||||
if serializer.data[0].get("data", None) is None:
|
if serializer.data[0].get("data", None) is None:
|
||||||
|
print(serializer.data[0])
|
||||||
|
print("No data[0] found")
|
||||||
continue
|
continue
|
||||||
if serializer.data[0]["data"].get("status", 200) != 200:
|
if serializer.data[0]["data"].get("status", 200) != 200:
|
||||||
|
print("No data status found")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
return Response(status=status.HTTP_200_OK, data=serializer.data[0])
|
return Response(status=status.HTTP_200_OK, data=serializer.data[0])
|
||||||
|
@ -88,7 +88,7 @@ class CeleryConnector:
|
|||||||
def send_task(self, name=None, args=None):
|
def send_task(self, name=None, args=None):
|
||||||
if name not in self.task_routes or 'queue' not in self.task_routes[name]:
|
if name not in self.task_routes or 'queue' not in self.task_routes[name]:
|
||||||
raise GeneralException("System")
|
raise GeneralException("System")
|
||||||
return self.app.send_task(name, args, queue=self.task_routes[name]['queue'])
|
return self.app.send_task(name, args, queue=self.task_routes[name]['queue'], expires=300)
|
||||||
|
|
||||||
|
|
||||||
c_connector = CeleryConnector()
|
c_connector = CeleryConnector()
|
||||||
|
@ -1,13 +1,14 @@
|
|||||||
import time
|
import time
|
||||||
import fitz
|
|
||||||
import uuid
|
import uuid
|
||||||
import os
|
import os
|
||||||
import base64
|
import base64
|
||||||
|
import traceback
|
||||||
|
|
||||||
from fwd_api.models import SubscriptionRequest, UserProfile
|
from fwd_api.models import SubscriptionRequest, UserProfile
|
||||||
from fwd_api.celery_worker.worker import app
|
from fwd_api.celery_worker.worker import app
|
||||||
from ..constant.common import FolderFileType, image_extensions
|
from ..constant.common import FolderFileType, image_extensions
|
||||||
from ..exception.exceptions import FileContentInvalidException
|
from ..exception.exceptions import FileContentInvalidException
|
||||||
|
from fwd_api.models import SubscriptionRequestFile
|
||||||
from ..utils import FileUtils, ProcessUtil, S3_process
|
from ..utils import FileUtils, ProcessUtil, S3_process
|
||||||
from celery.utils.log import get_task_logger
|
from celery.utils.log import get_task_logger
|
||||||
from fwd import settings
|
from fwd import settings
|
||||||
@ -20,13 +21,10 @@ s3_client = S3_process.MinioS3Client(
|
|||||||
access_key=settings.S3_ACCESS_KEY,
|
access_key=settings.S3_ACCESS_KEY,
|
||||||
secret_key=settings.S3_SECRET_KEY,
|
secret_key=settings.S3_SECRET_KEY,
|
||||||
bucket_name=settings.S3_BUCKET_NAME
|
bucket_name=settings.S3_BUCKET_NAME
|
||||||
)
|
)
|
||||||
|
|
||||||
def process_pdf_file(file_name: str, file_path: str, request, user) -> list:
|
def process_pdf_file(file_name: str, file_path: str, request, user) -> list:
|
||||||
from fwd_api.models import SubscriptionRequestFile
|
|
||||||
try:
|
try:
|
||||||
doc: fitz.Document = fitz.open(stream=FileUtils.get_file(file_path).read(), filetype="pdf")
|
|
||||||
|
|
||||||
# Origin file
|
# Origin file
|
||||||
new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=file_path,
|
new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=file_path,
|
||||||
request=request,
|
request=request,
|
||||||
@ -34,27 +32,9 @@ def process_pdf_file(file_name: str, file_path: str, request, user) -> list:
|
|||||||
code=f'FIL{uuid.uuid4().hex}')
|
code=f'FIL{uuid.uuid4().hex}')
|
||||||
new_request_file.save()
|
new_request_file.save()
|
||||||
# Sub-file
|
# Sub-file
|
||||||
return ProcessUtil.pdf_to_images_urls(doc, request, user)
|
return ProcessUtil.pdf_to_images_urls(FileUtils.get_file(file_path), request, user)
|
||||||
except Exception as e:
|
|
||||||
request.status = 400
|
|
||||||
request.predict_result = {"status": 400, "content": "", "message": f"Unable to extract pdf files {e}"}
|
|
||||||
request.save()
|
|
||||||
return None
|
|
||||||
|
|
||||||
def process_pdf_byte(file_name: str, file_path: str, request, user, file_obj) -> list:
|
|
||||||
from fwd_api.models import SubscriptionRequestFile
|
|
||||||
doc: fitz.Document = fitz.open(stream=file_obj, filetype="pdf")
|
|
||||||
|
|
||||||
# Origin file
|
|
||||||
new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=file_path,
|
|
||||||
request=request,
|
|
||||||
file_name=file_name,
|
|
||||||
code=f'FIL{uuid.uuid4().hex}')
|
|
||||||
new_request_file.save()
|
|
||||||
try:
|
|
||||||
# Sub-file
|
|
||||||
return ProcessUtil.pdf_to_images_urls(doc, request, user)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
traceback.print_exc()
|
||||||
request.status = 400
|
request.status = 400
|
||||||
request.predict_result = {"status": 400, "content": "", "message": f"Unable to extract pdf files {e}"}
|
request.predict_result = {"status": 400, "content": "", "message": f"Unable to extract pdf files {e}"}
|
||||||
request.save()
|
request.save()
|
||||||
@ -62,8 +42,6 @@ def process_pdf_byte(file_name: str, file_path: str, request, user, file_obj) ->
|
|||||||
|
|
||||||
|
|
||||||
def process_image_file(file_name: str, file_path, request, user) -> list:
|
def process_image_file(file_name: str, file_path, request, user) -> list:
|
||||||
from fwd_api.models import SubscriptionRequestFile
|
|
||||||
|
|
||||||
new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=file_path,
|
new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=file_path,
|
||||||
request=request,
|
request=request,
|
||||||
file_name=file_name,
|
file_name=file_name,
|
||||||
|
@ -1,6 +1,10 @@
|
|||||||
|
import traceback
|
||||||
|
|
||||||
from fwd_api.celery_worker.worker import app
|
from fwd_api.celery_worker.worker import app
|
||||||
from fwd_api.models import SubscriptionRequest
|
from fwd_api.models import SubscriptionRequest
|
||||||
from fwd_api.exception.exceptions import InvalidException
|
from fwd_api.exception.exceptions import InvalidException
|
||||||
|
from fwd_api.models import SubscriptionRequest
|
||||||
|
from fwd_api.constant.common import ProcessType
|
||||||
|
|
||||||
|
|
||||||
def aggregate_result(src_result, des_result, doc_type):
|
def aggregate_result(src_result, des_result, doc_type):
|
||||||
@ -45,11 +49,6 @@ def update_user(rq: SubscriptionRequest):
|
|||||||
|
|
||||||
@app.task(name='process_sap_invoice_result')
|
@app.task(name='process_sap_invoice_result')
|
||||||
def process_invoice_sap_result(rq_id, result):
|
def process_invoice_sap_result(rq_id, result):
|
||||||
from fwd_api.models import SubscriptionRequest
|
|
||||||
from fwd_api.constant.common import ProcessType
|
|
||||||
|
|
||||||
print_id(rq_id)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
rq: SubscriptionRequest = \
|
rq: SubscriptionRequest = \
|
||||||
SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.INVOICE.value)[0]
|
SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.INVOICE.value)[0]
|
||||||
@ -66,16 +65,12 @@ def process_invoice_sap_result(rq_id, result):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(e)
|
print(e)
|
||||||
print("Fail Invoice %d", rq_id)
|
print("Fail Invoice %d", rq_id)
|
||||||
|
traceback.print_exc()
|
||||||
return "FailInvoice"
|
return "FailInvoice"
|
||||||
|
|
||||||
|
|
||||||
@app.task(name='process_fi_invoice_result')
|
@app.task(name='process_fi_invoice_result')
|
||||||
def process_invoice_fi_result(rq_id, result):
|
def process_invoice_fi_result(rq_id, result):
|
||||||
from fwd_api.models import SubscriptionRequest
|
|
||||||
from fwd_api.constant.common import ProcessType
|
|
||||||
|
|
||||||
print_id(rq_id)
|
|
||||||
print(result)
|
|
||||||
try:
|
try:
|
||||||
rq: SubscriptionRequest = \
|
rq: SubscriptionRequest = \
|
||||||
SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.FI_INVOICE.value)[0]
|
SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.FI_INVOICE.value)[0]
|
||||||
@ -92,14 +87,11 @@ def process_invoice_fi_result(rq_id, result):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(e)
|
print(e)
|
||||||
print("Fail Invoice %d", rq_id)
|
print("Fail Invoice %d", rq_id)
|
||||||
|
traceback.print_exc()
|
||||||
return "FailInvoice"
|
return "FailInvoice"
|
||||||
|
|
||||||
@app.task(name='process_manulife_invoice_result')
|
@app.task(name='process_manulife_invoice_result')
|
||||||
def process_invoice_manulife_result(rq_id, result):
|
def process_invoice_manulife_result(rq_id, result):
|
||||||
from fwd_api.models import SubscriptionRequest
|
|
||||||
from fwd_api.constant.common import ProcessType
|
|
||||||
|
|
||||||
print_id(f"[DEBUG]: Received manulife request with id {rq_id}")
|
|
||||||
try:
|
try:
|
||||||
rq: SubscriptionRequest = \
|
rq: SubscriptionRequest = \
|
||||||
SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.MANULIFE_INVOICE.value)[0]
|
SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.MANULIFE_INVOICE.value)[0]
|
||||||
@ -116,13 +108,11 @@ def process_invoice_manulife_result(rq_id, result):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(e)
|
print(e)
|
||||||
print("Fail Invoice %d", rq_id)
|
print("Fail Invoice %d", rq_id)
|
||||||
|
traceback.print_exc()
|
||||||
return "FailInvoice"
|
return "FailInvoice"
|
||||||
|
|
||||||
@app.task(name='process_sbt_invoice_result')
|
@app.task(name='process_sbt_invoice_result')
|
||||||
def process_invoice_sbt_result(rq_id, result):
|
def process_invoice_sbt_result(rq_id, result):
|
||||||
from fwd_api.models import SubscriptionRequest
|
|
||||||
from fwd_api.constant.common import ProcessType
|
|
||||||
|
|
||||||
print_id(f"[DEBUG]: Received SBT request with id {rq_id}")
|
print_id(f"[DEBUG]: Received SBT request with id {rq_id}")
|
||||||
print_id(f"[DEBUG]: result: {result}")
|
print_id(f"[DEBUG]: result: {result}")
|
||||||
try:
|
try:
|
||||||
@ -156,205 +146,6 @@ def process_invoice_sbt_result(rq_id, result):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(e)
|
print(e)
|
||||||
print("Fail Invoice %d", rq_id)
|
print("Fail Invoice %d", rq_id)
|
||||||
|
traceback.print_exc()
|
||||||
return "FailInvoice"
|
return "FailInvoice"
|
||||||
|
|
||||||
|
|
||||||
# @app.task(name='process_id_result', queue='id_card_rs')
|
|
||||||
# def process_id_result(rq_id, result):
|
|
||||||
# from fwd_api.models import SubscriptionRequest
|
|
||||||
# from fwd_api.constant.common import ProcessType
|
|
||||||
# from fwd_api.models import SubscriptionRequestFile
|
|
||||||
# from fwd_api.constant.common import FileCategory
|
|
||||||
|
|
||||||
# print_id(rq_id)
|
|
||||||
|
|
||||||
# try:
|
|
||||||
# s_time = time.time()
|
|
||||||
# print("Start")
|
|
||||||
# j_time = time.time()
|
|
||||||
# print("Json {}".format(j_time - s_time))
|
|
||||||
|
|
||||||
# rq: SubscriptionRequest = \
|
|
||||||
# SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.ID_CARD.value)[0]
|
|
||||||
|
|
||||||
# if 'content' in result and 'pages' in result['content']:
|
|
||||||
# pages = result['content']['pages']
|
|
||||||
# if isinstance(pages, list):
|
|
||||||
# new_pages = []
|
|
||||||
# for idx, page in enumerate(pages):
|
|
||||||
# if 'path_image_croped' in page:
|
|
||||||
# img_name = f'crop_{idx}_{get_random_string(3)}.jpg'
|
|
||||||
# path = page['path_image_croped']
|
|
||||||
# rq_file: SubscriptionRequestFile = SubscriptionRequestFile(file_name=img_name, request=rq,
|
|
||||||
# file_category=FileCategory.CROP.value,
|
|
||||||
# file_path=path,
|
|
||||||
# code=f'IDC{uuid.uuid4().hex}')
|
|
||||||
# rq_file.save()
|
|
||||||
# page['path_image_croped'] = rq_file.code
|
|
||||||
|
|
||||||
# l_time = time.time()
|
|
||||||
# print("Save {}".format(l_time - j_time))
|
|
||||||
|
|
||||||
# status = to_status(result)
|
|
||||||
|
|
||||||
# rq.predict_result = result
|
|
||||||
|
|
||||||
# rq.status = status
|
|
||||||
# rq.save()
|
|
||||||
|
|
||||||
# update_user(rq)
|
|
||||||
# e_time = time.time()
|
|
||||||
# print("End {}".format(e_time - l_time))
|
|
||||||
|
|
||||||
# except IndexError as e:
|
|
||||||
# traceback.format_exc()
|
|
||||||
# print(e)
|
|
||||||
# except Exception as e:
|
|
||||||
# traceback.format_exc()
|
|
||||||
# print(e)
|
|
||||||
# print("Fail ID %d", rq_id)
|
|
||||||
# return "Fail"
|
|
||||||
|
|
||||||
# return "Success"
|
|
||||||
|
|
||||||
|
|
||||||
# @app.task(name='process_driver_license_result')
|
|
||||||
# def process_driver_license_result(rq_id, result):
|
|
||||||
# from fwd_api.models import SubscriptionRequest
|
|
||||||
# from fwd_api.models import SubscriptionRequestFile
|
|
||||||
# from fwd_api.constant.common import FileCategory
|
|
||||||
# from fwd_api.constant.common import ProcessType
|
|
||||||
|
|
||||||
# print_id(rq_id)
|
|
||||||
# try:
|
|
||||||
# rq: SubscriptionRequest = \
|
|
||||||
# SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.DRIVER_LICENSE.value)[0]
|
|
||||||
|
|
||||||
# if 'content' in result and 'pages' in result['content']:
|
|
||||||
# pages = result['content']['pages']
|
|
||||||
# if isinstance(pages, list):
|
|
||||||
# new_pages = []
|
|
||||||
# for idx, page in enumerate(pages):
|
|
||||||
# if 'path_image_croped' in page:
|
|
||||||
# img_name = f'crop_{idx}_{get_random_string(3)}.jpg'
|
|
||||||
# path = page['path_image_croped']
|
|
||||||
# rq_file: SubscriptionRequestFile = SubscriptionRequestFile(file_name=img_name, request=rq,
|
|
||||||
# file_category=FileCategory.CROP.value,
|
|
||||||
# file_path=path,
|
|
||||||
# code=f'DLC{uuid.uuid4().hex}')
|
|
||||||
# rq_file.save()
|
|
||||||
# page['path_image_croped'] = rq_file.code
|
|
||||||
# status = to_status(result)
|
|
||||||
|
|
||||||
# rq.predict_result = result
|
|
||||||
# rq.status = status
|
|
||||||
# rq.save()
|
|
||||||
|
|
||||||
# update_user(rq)
|
|
||||||
|
|
||||||
# except IndexError as e:
|
|
||||||
# print(e)
|
|
||||||
# except Exception as e:
|
|
||||||
# print(e)
|
|
||||||
# print("Fail DL %d", rq_id)
|
|
||||||
# return "Fail"
|
|
||||||
|
|
||||||
# return "Success"
|
|
||||||
|
|
||||||
|
|
||||||
# @app.task(name='process_invoice_result')
|
|
||||||
# def process_invoice_result(rq_id, result):
|
|
||||||
# from fwd_api.models import SubscriptionRequest
|
|
||||||
# from fwd_api.constant.common import ProcessType
|
|
||||||
|
|
||||||
# print_id(rq_id)
|
|
||||||
# try:
|
|
||||||
# rq: SubscriptionRequest = \
|
|
||||||
# SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.INVOICE.value)
|
|
||||||
# print(rq)
|
|
||||||
# rq: SubscriptionRequest = \
|
|
||||||
# SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.INVOICE.value)[0]
|
|
||||||
# status = to_status(result)
|
|
||||||
|
|
||||||
# rq.predict_result = result
|
|
||||||
# rq.status = status
|
|
||||||
# rq.save()
|
|
||||||
|
|
||||||
# update_user(rq)
|
|
||||||
# except IndexError as e:
|
|
||||||
# print(e)
|
|
||||||
# print("NotFound request by requestId, %d", rq_id)
|
|
||||||
# except Exception as e:
|
|
||||||
# print(e)
|
|
||||||
# traceback.format_exc()
|
|
||||||
# print("Fail Invoice %d", rq_id)
|
|
||||||
# return "FailInvoice"
|
|
||||||
|
|
||||||
# return "Success"
|
|
||||||
|
|
||||||
|
|
||||||
# @app.task(name='process_ocr_with_box_result')
|
|
||||||
# def process_ocr_with_box_result(rq_id, result):
|
|
||||||
# from fwd_api.models import SubscriptionRequest
|
|
||||||
# from fwd_api.constant.common import ProcessType
|
|
||||||
|
|
||||||
# print_id(rq_id)
|
|
||||||
|
|
||||||
# try:
|
|
||||||
# rq: SubscriptionRequest = \
|
|
||||||
# SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.OCR_WITH_BOX.value)[0]
|
|
||||||
# status = to_status(result)
|
|
||||||
|
|
||||||
# rq.predict_result = result
|
|
||||||
# rq.status = status
|
|
||||||
# rq.save()
|
|
||||||
|
|
||||||
# update_user(rq)
|
|
||||||
# except IndexError as e:
|
|
||||||
# traceback.format_exc()
|
|
||||||
# print(e)
|
|
||||||
# except Exception as e:
|
|
||||||
# traceback.format_exc()
|
|
||||||
# print(e)
|
|
||||||
# print("Fail OCR %d", rq_id)
|
|
||||||
# return "FailOCR"
|
|
||||||
|
|
||||||
# return "Success"
|
|
||||||
|
|
||||||
|
|
||||||
# @app.task(name='process_template_matching_result')
|
|
||||||
# def template_matching_result(rq_id, result, align_img):
|
|
||||||
# from fwd_api.models import SubscriptionRequest
|
|
||||||
# from fwd_api.constant.common import ProcessType
|
|
||||||
# from fwd_api.constant.common import FileCategory
|
|
||||||
# from fwd_api.models import SubscriptionRequestFile
|
|
||||||
|
|
||||||
# print_id(rq_id)
|
|
||||||
# try:
|
|
||||||
# rq: SubscriptionRequest = \
|
|
||||||
# SubscriptionRequest.objects.filter(request_id=rq_id, process_type=ProcessType.TEMPLATE_MATCHING.value)[0]
|
|
||||||
|
|
||||||
# if align_img:
|
|
||||||
# from fwd_api.constant.common import IMAGE_NAME
|
|
||||||
# rq_file: SubscriptionRequestFile = SubscriptionRequestFile(file_name=IMAGE_NAME, request=rq,
|
|
||||||
# file_category=FileCategory.CROP.value,
|
|
||||||
# file_path=align_img)
|
|
||||||
# rq_file.save()
|
|
||||||
# status = to_status(result)
|
|
||||||
|
|
||||||
# rq.predict_result = result
|
|
||||||
# rq.status = status
|
|
||||||
# rq.save()
|
|
||||||
|
|
||||||
# update_user(rq)
|
|
||||||
|
|
||||||
# except IndexError as e:
|
|
||||||
# traceback.format_exc()
|
|
||||||
# print(e)
|
|
||||||
# except Exception as e:
|
|
||||||
# traceback.format_exc()
|
|
||||||
# print(e)
|
|
||||||
# print("Fail Template %d", rq_id)
|
|
||||||
# return "FailTemplate"
|
|
||||||
|
|
||||||
# return "Success"
|
|
||||||
|
@ -18,11 +18,6 @@ app: Celery = Celery(
|
|||||||
app.conf.update({
|
app.conf.update({
|
||||||
'task_queues':
|
'task_queues':
|
||||||
[
|
[
|
||||||
# Queue('id_card_rs'),
|
|
||||||
# Queue('driver_license_rs'),
|
|
||||||
# Queue('invoice_rs'),
|
|
||||||
# Queue('ocr_with_box_rs'),
|
|
||||||
# Queue('template_matching_rs'),
|
|
||||||
Queue('invoice_sap_rs'),
|
Queue('invoice_sap_rs'),
|
||||||
Queue('invoice_fi_rs'),
|
Queue('invoice_fi_rs'),
|
||||||
Queue('invoice_manulife_rs'),
|
Queue('invoice_manulife_rs'),
|
||||||
@ -33,11 +28,6 @@ app.conf.update({
|
|||||||
|
|
||||||
],
|
],
|
||||||
'task_routes': {
|
'task_routes': {
|
||||||
# 'process_id_result': {'queue': 'id_card_rs'},
|
|
||||||
# 'process_driver_license_result': {'queue': "driver_license_rs"},
|
|
||||||
# 'process_invoice_result': {'queue': "invoice_rs"},
|
|
||||||
# 'process_ocr_with_box_result': {'queue': "ocr_with_box_rs"},
|
|
||||||
# 'process_template_matching_result': {'queue': 'template_matching_rs'},
|
|
||||||
'process_sap_invoice_result': {'queue': 'invoice_sap_rs'},
|
'process_sap_invoice_result': {'queue': 'invoice_sap_rs'},
|
||||||
'process_sap_invoice': {'queue': "invoice_sap"},
|
'process_sap_invoice': {'queue': "invoice_sap"},
|
||||||
'process_fi_invoice_result': {'queue': 'invoice_fi_rs'},
|
'process_fi_invoice_result': {'queue': 'invoice_fi_rs'},
|
||||||
|
@ -72,10 +72,3 @@ class ReportSerializer(serializers.Serializer):
|
|||||||
new_data.append(new_page_object)
|
new_data.append(new_page_object)
|
||||||
data['pages'] = new_data
|
data['pages'] = new_data
|
||||||
return data
|
return data
|
||||||
|
|
||||||
# def get_predict_result(self, obj: SubscriptionRequest):
|
|
||||||
# from fwd_api.constant.common import ProcessType
|
|
||||||
# typez = int(obj.process_type)
|
|
||||||
# if typez == ProcessType.OCR_WITH_BOX.value or typez == ProcessType.TEMPLATE_MATCHING.value:
|
|
||||||
# return obj.predict_result
|
|
||||||
# return None
|
|
@ -156,11 +156,9 @@ def resize_and_save_file(file_name: str, rq: SubscriptionRequest, file: Temporar
|
|||||||
|
|
||||||
def save_to_S3(file_name, rq, local_file_path):
|
def save_to_S3(file_name, rq, local_file_path):
|
||||||
try:
|
try:
|
||||||
# base64_obj = base64.b64encode(obj).decode('utf-8')
|
|
||||||
file_path = get_folder_path(rq)
|
file_path = get_folder_path(rq)
|
||||||
assert len(file_path.split("/")) >= 2, "file_path must have at least process type and request id"
|
assert len(file_path.split("/")) >= 2, "file_path must have at least process type and request id"
|
||||||
s3_key = os.path.join(file_path.split("/")[-2], file_path.split("/")[-1], file_name)
|
s3_key = os.path.join(file_path.split("/")[-2], file_path.split("/")[-1], file_name)
|
||||||
# c_connector.upload_file_to_s3((file_path, s3_key))
|
|
||||||
c_connector.upload_file_to_s3((local_file_path, s3_key))
|
c_connector.upload_file_to_s3((local_file_path, s3_key))
|
||||||
return s3_key
|
return s3_key
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
@ -8,7 +8,9 @@ from django.core.files.uploadedfile import TemporaryUploadedFile
|
|||||||
from django.db import transaction
|
from django.db import transaction
|
||||||
from rest_framework import status
|
from rest_framework import status
|
||||||
|
|
||||||
|
from fwd_api.utils.image import get_first_page_pdf
|
||||||
from fwd import settings
|
from fwd import settings
|
||||||
|
from fwd_api.utils.image import resize
|
||||||
from fwd_api.constant.common import LIST_BOX_MESSAGE, pattern, NAME_MESSAGE, allowed_p_type, TEMPLATE_ID, \
|
from fwd_api.constant.common import LIST_BOX_MESSAGE, pattern, NAME_MESSAGE, allowed_p_type, TEMPLATE_ID, \
|
||||||
FolderFileType, FileCategory
|
FolderFileType, FileCategory
|
||||||
from fwd_api.exception.exceptions import NumberOfBoxLimitReachedException, \
|
from fwd_api.exception.exceptions import NumberOfBoxLimitReachedException, \
|
||||||
@ -21,7 +23,6 @@ from ..models import UserProfile, OcrTemplate, OcrTemplateBox, \
|
|||||||
Subscription, SubscriptionRequestFile, SubscriptionRequest
|
Subscription, SubscriptionRequestFile, SubscriptionRequest
|
||||||
from ..celery_worker.client_connector import c_connector
|
from ..celery_worker.client_connector import c_connector
|
||||||
import uuid
|
import uuid
|
||||||
from PIL import Image
|
|
||||||
|
|
||||||
from celery.utils.log import get_task_logger
|
from celery.utils.log import get_task_logger
|
||||||
|
|
||||||
@ -286,7 +287,6 @@ def validate_vn_and_space(txt: str):
|
|||||||
raise InvalidException(excArgs=NAME_MESSAGE)
|
raise InvalidException(excArgs=NAME_MESSAGE)
|
||||||
|
|
||||||
|
|
||||||
@transaction.atomic
|
|
||||||
def save_template_boxs(data, template):
|
def save_template_boxs(data, template):
|
||||||
saving_list = []
|
saving_list = []
|
||||||
for d_box in data['data_boxs']:
|
for d_box in data['data_boxs']:
|
||||||
@ -410,32 +410,29 @@ def process_image_local_file(file_name: str, file_path: str, request: Subscripti
|
|||||||
'request_file_id': new_request_file.code
|
'request_file_id': new_request_file.code
|
||||||
}]
|
}]
|
||||||
|
|
||||||
def pdf_to_images_urls(doc: fitz.Document, request: SubscriptionRequest, user, dpi: int = 300) -> list:
|
def pdf_to_images_urls(doc_path, request: SubscriptionRequest, user, dpi: int = 300) -> list:
|
||||||
pdf_extracted = []
|
pdf_extracted = []
|
||||||
for idx, page in enumerate(doc):
|
saving_path = FileUtils.get_folder_path(request)
|
||||||
saving_path = FileUtils.get_folder_path(request)
|
break_file_name = f'break_0.jpg'
|
||||||
break_file_name = f'break_{idx}.jpg'
|
saving_path = os.path.join(saving_path, break_file_name)
|
||||||
saving_path = os.path.join(saving_path, break_file_name)
|
|
||||||
|
|
||||||
page = doc.load_page(idx)
|
image = get_first_page_pdf(doc_path, 300)
|
||||||
pix = page.get_pixmap(dpi=250) # render page to an image
|
image = resize(image, max_w=settings.TARGET_MAX_IMAGE_SIZE[0], max_h=settings.TARGET_MAX_IMAGE_SIZE[1])
|
||||||
if pix.size > 8*3*settings.MAX_PIXEL_IN_A_FILE*settings.MAX_PIXEL_IN_A_FILE:
|
image.save(saving_path)
|
||||||
raise InvalidDecompressedSizeException(excArgs=(str(pix.width), str(pix.height), str(settings.MAX_PIXEL_IN_A_FILE)))
|
print(f"Saving {saving_path}")
|
||||||
pix.save(saving_path)
|
new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=saving_path,
|
||||||
print(f"Saving {saving_path}")
|
request=request,
|
||||||
new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=saving_path,
|
file_name=break_file_name,
|
||||||
request=request,
|
file_category=FileCategory.BREAK.value,
|
||||||
file_name=break_file_name,
|
code=f'FIL{uuid.uuid4().hex}')
|
||||||
file_category=FileCategory.BREAK.value,
|
new_request_file.save()
|
||||||
code=f'FIL{uuid.uuid4().hex}')
|
|
||||||
new_request_file.save()
|
|
||||||
|
|
||||||
file_url = FileUtils.build_url(FolderFileType.REQUESTS.value, request.request_id, user.id, break_file_name)
|
file_url = FileUtils.build_url(FolderFileType.REQUESTS.value, request.request_id, user.id, break_file_name)
|
||||||
pdf_extracted.append(
|
pdf_extracted.append(
|
||||||
{
|
{
|
||||||
'file_url': file_url,
|
'file_url': file_url,
|
||||||
'page_number': idx,
|
'page_number': 0,
|
||||||
'request_file_id': new_request_file.code
|
'request_file_id': new_request_file.code
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
return pdf_extracted
|
return pdf_extracted
|
||||||
|
@ -1,9 +1,11 @@
|
|||||||
|
import io
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
from PyPDF2 import PdfReader, PdfWriter
|
||||||
|
from pdf2image import convert_from_bytes
|
||||||
|
|
||||||
def resize(image, max_w=2048, max_h=2048):
|
def resize(image, max_w=2048, max_h=2048):
|
||||||
cur_w, cur_h = image.width, image.height
|
cur_w = image.width
|
||||||
image_bytes = image.samples
|
cur_h = image.height
|
||||||
image = Image.frombytes("RGB", [cur_w, cur_h], image_bytes)
|
|
||||||
if cur_h > max_w or cur_h > max_h:
|
if cur_h > max_w or cur_h > max_h:
|
||||||
ratio_w = max_w/cur_w
|
ratio_w = max_w/cur_w
|
||||||
ratio_h = max_h/cur_h
|
ratio_h = max_h/cur_h
|
||||||
@ -11,5 +13,40 @@ def resize(image, max_w=2048, max_h=2048):
|
|||||||
new_w = int(ratio*cur_w)
|
new_w = int(ratio*cur_w)
|
||||||
new_h = int(ratio*cur_h)
|
new_h = int(ratio*cur_h)
|
||||||
image = image.resize((new_w, new_h))
|
image = image.resize((new_w, new_h))
|
||||||
|
return image
|
||||||
|
|
||||||
|
|
||||||
|
def fitz_pixmap_to_pillow_with_resize(image, max_w=2048, max_h=2048):
    """Build a PIL image from a pixmap-like object and pass it through ``resize``.

    ``image`` must expose ``width``, ``height`` and ``samples``; the samples
    are decoded by ``Image.frombytes`` in "RGB" mode.
    NOTE(review): going by the name this is presumably a PyMuPDF (fitz)
    pixmap without an alpha channel -- confirm against the callers.

    Returns whatever ``resize(image, max_w, max_h)`` returns.
    """
    size = [image.width, image.height]
    pil_image = Image.frombytes("RGB", size, image.samples)
    return resize(pil_image, max_w, max_h)
|
||||||
|
|
||||||
|
|
||||||
|
def get_first_page_pdf(filename, max_size=300):
    """Render the first page of a PDF as an image after rescaling the page.

    Args:
        filename: Source accepted by ``PdfReader`` (e.g. a path).
        max_size: Target length, in millimetres, of the page's longest side.

    Returns:
        The first image produced by ``convert_from_bytes`` for the rescaled
        page.
    """
    def pdf_scale_page(page, size=297):
        """Scale page so that its longest side measures ``size`` mm."""
        # The media box is [left, bottom, right, top]. Its upper-right corner
        # only equals (width, height) when the lower-left corner is at the
        # origin, so derive the real extent from all four coordinates.
        left, bottom, right, top = (float(v) for v in page.mediabox)
        longest_side = max(abs(right - left), abs(top - bottom))
        # Units of measurement are not "points". The units of measurement are
        # user defined and default to 1/72 inch (section 4.2.1 "Coordinate
        # spaces" of the PDF specification); 1 inch = 25.4 mm.
        pmm = (1 / 72 * 25.4)
        ks = size / (longest_side * pmm)
        page.scale_by(ks)
        return page

    reader = PdfReader(filename)
    scaled_page = pdf_scale_page(reader.pages[0], max_size)

    # Write the single rescaled page to an in-memory PDF so it can be rasterized.
    pdf_bytes = io.BytesIO()
    dst_pdf = PdfWriter()
    dst_pdf.add_page(scaled_page)
    dst_pdf.write(pdf_bytes)

    image = convert_from_bytes(pdf_bytes.getvalue())
    if isinstance(image, list):
        return image[0]
    return image
|
@ -23,7 +23,6 @@ Jinja2==3.1.2
|
|||||||
jsonschema==4.17.1
|
jsonschema==4.17.1
|
||||||
MarkupSafe==2.1.1
|
MarkupSafe==2.1.1
|
||||||
packaging==21.3
|
packaging==21.3
|
||||||
pdf2image==1.16.0
|
|
||||||
Pillow==9.3.0
|
Pillow==9.3.0
|
||||||
psycopg2==2.9.5
|
psycopg2==2.9.5
|
||||||
psycopg2-binary==2.9.5
|
psycopg2-binary==2.9.5
|
||||||
@ -41,7 +40,6 @@ tzdata==2022.6
|
|||||||
uritemplate==4.1.1
|
uritemplate==4.1.1
|
||||||
urllib3==1.26.13
|
urllib3==1.26.13
|
||||||
uvicorn==0.20.0
|
uvicorn==0.20.0
|
||||||
|
|
||||||
celery~=5.2.7
|
celery~=5.2.7
|
||||||
kombu~=5.2.4
|
kombu~=5.2.4
|
||||||
PyJWT~=2.6.0
|
PyJWT~=2.6.0
|
||||||
@ -50,3 +48,4 @@ PyMuPDF==1.21.1
|
|||||||
djangorestframework-xml==2.0.0
|
djangorestframework-xml==2.0.0
|
||||||
boto3==1.29.7
|
boto3==1.29.7
|
||||||
imagesize==1.4.1
|
imagesize==1.4.1
|
||||||
|
pdf2image==1.16.3
|
@ -8,6 +8,9 @@ server {
|
|||||||
|
|
||||||
location ~ ^/api {
|
location ~ ^/api {
|
||||||
proxy_pass {{proxy_server}};
|
proxy_pass {{proxy_server}};
|
||||||
|
proxy_read_timeout 300;
|
||||||
|
proxy_connect_timeout 300;
|
||||||
|
proxy_send_timeout 300;
|
||||||
}
|
}
|
||||||
|
|
||||||
location /static/drf_spectacular_sidecar/ {
|
location /static/drf_spectacular_sidecar/ {
|
||||||
|
@ -26,7 +26,7 @@ export async function createKieJob({
|
|||||||
const response = await API.post<{
|
const response = await API.post<{
|
||||||
estimated_wating_time: number;
|
estimated_wating_time: number;
|
||||||
request_id: string;
|
request_id: string;
|
||||||
}>('/ctel/image/process/', formData);
|
}>('/ctel/images/process_sync', formData);
|
||||||
return response.data.request_id;
|
return response.data.request_id;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -11,8 +11,7 @@ services:
|
|||||||
shm_size: 10gb
|
shm_size: 10gb
|
||||||
dockerfile: Dockerfile
|
dockerfile: Dockerfile
|
||||||
shm_size: 10gb
|
shm_size: 10gb
|
||||||
image: sidp/cope2n-ai-fi-sbt
|
# mem_limit: 8g
|
||||||
mem_limit: 8g
|
|
||||||
restart: always
|
restart: always
|
||||||
# container_name: "sidp-cope2n-ai-fi-sbt"
|
# container_name: "sidp-cope2n-ai-fi-sbt"
|
||||||
networks:
|
networks:
|
||||||
@ -43,7 +42,6 @@ services:
|
|||||||
dockerfile: Dockerfile
|
dockerfile: Dockerfile
|
||||||
# ports:
|
# ports:
|
||||||
# - 9880:9000
|
# - 9880:9000
|
||||||
image: sidp/cope2n-be-fi-sbt
|
|
||||||
# container_name: "sidp-cope2n-be-ctel-sbt"
|
# container_name: "sidp-cope2n-be-ctel-sbt"
|
||||||
environment:
|
environment:
|
||||||
- MEDIA_ROOT=${MEDIA_ROOT}
|
- MEDIA_ROOT=${MEDIA_ROOT}
|
||||||
@ -71,20 +69,20 @@ services:
|
|||||||
- S3_ACCESS_KEY=${S3_ACCESS_KEY}
|
- S3_ACCESS_KEY=${S3_ACCESS_KEY}
|
||||||
- S3_SECRET_KEY=${S3_SECRET_KEY}
|
- S3_SECRET_KEY=${S3_SECRET_KEY}
|
||||||
- S3_BUCKET_NAME=${S3_BUCKET_NAME}
|
- S3_BUCKET_NAME=${S3_BUCKET_NAME}
|
||||||
# restart: always
|
restart: always
|
||||||
networks:
|
networks:
|
||||||
- ctel-sbt
|
- ctel-sbt
|
||||||
volumes:
|
volumes:
|
||||||
- ${HOST_MEDIA_FOLDER}:${MEDIA_ROOT}
|
- ${HOST_MEDIA_FOLDER}:${MEDIA_ROOT}
|
||||||
- BE_static:/app/static
|
- BE_static:/app/static
|
||||||
# - ./cope2n-api:/app
|
- ./cope2n-api:/app
|
||||||
working_dir: /app
|
working_dir: /app
|
||||||
depends_on:
|
depends_on:
|
||||||
db-sbt:
|
db-sbt:
|
||||||
condition: service_started
|
condition: service_started
|
||||||
# rabbitmq:
|
# rabbitmq:
|
||||||
# condition: service_started
|
# condition: service_started
|
||||||
command: sh -c "python manage.py collectstatic --no-input &&
|
command: sh -c "sleep 5; python manage.py collectstatic --no-input &&
|
||||||
python manage.py migrate &&
|
python manage.py migrate &&
|
||||||
python manage.py compilemessages &&
|
python manage.py compilemessages &&
|
||||||
gunicorn fwd.asgi:application -k uvicorn.workers.UvicornWorker -b 0.0.0.0:9000" # pre-makemigrations on prod
|
gunicorn fwd.asgi:application -k uvicorn.workers.UvicornWorker -b 0.0.0.0:9000" # pre-makemigrations on prod
|
||||||
@ -111,8 +109,9 @@ services:
|
|||||||
# args:
|
# args:
|
||||||
# - "UID=${UID:-1000}"
|
# - "UID=${UID:-1000}"
|
||||||
# - "GID=${GID:-1000}"
|
# - "GID=${GID:-1000}"
|
||||||
image: sidp/cope2n-be-fi-sbt
|
build:
|
||||||
# container_name: "sidp-cope2n-be-celery-sbt"
|
context: cope2n-api
|
||||||
|
dockerfile: Dockerfile
|
||||||
environment:
|
environment:
|
||||||
- MEDIA_ROOT=${MEDIA_ROOT}
|
- MEDIA_ROOT=${MEDIA_ROOT}
|
||||||
- PYTHONPATH=${PYTHONPATH}:/app # For import module
|
- PYTHONPATH=${PYTHONPATH}:/app # For import module
|
||||||
@ -146,19 +145,19 @@ services:
|
|||||||
condition: service_started
|
condition: service_started
|
||||||
volumes:
|
volumes:
|
||||||
- ${HOST_MEDIA_FOLDER}:${MEDIA_ROOT}
|
- ${HOST_MEDIA_FOLDER}:${MEDIA_ROOT}
|
||||||
|
- ./cope2n-api:/app
|
||||||
|
|
||||||
working_dir: /app
|
working_dir: /app
|
||||||
command: sh -c "celery -A fwd_api.celery_worker.worker worker -l INFO"
|
command: sh -c "celery -A fwd_api.celery_worker.worker worker -l INFO --pool=solo"
|
||||||
|
|
||||||
# Back-end persistent
|
# Back-end persistent
|
||||||
db-sbt:
|
db-sbt:
|
||||||
mem_reservation: 500m
|
mem_reservation: 500m
|
||||||
mem_limit: 1g
|
# mem_limit: 1g
|
||||||
# container_name: sidp-cope2n-be-sbt-db
|
# container_name: sidp-cope2n-be-sbt-db
|
||||||
image: postgres:14.7-alpine
|
image: postgres:14.7-alpine
|
||||||
volumes:
|
volumes:
|
||||||
- ./data/postgres_data:/var/lib/postgresql/data
|
- ./data/postgres_data:/var/lib/postgresql/data
|
||||||
working_dir: /workspace/cope2n-api
|
|
||||||
networks:
|
networks:
|
||||||
- ctel-sbt
|
- ctel-sbt
|
||||||
environment:
|
environment:
|
||||||
@ -168,7 +167,7 @@ services:
|
|||||||
|
|
||||||
rabbitmq-sbt:
|
rabbitmq-sbt:
|
||||||
mem_reservation: 600m
|
mem_reservation: 600m
|
||||||
mem_limit: 4g
|
# mem_limit: 4g
|
||||||
# container_name: sidp-cope2n-be-rabbitmq-sbt
|
# container_name: sidp-cope2n-be-rabbitmq-sbt
|
||||||
restart: always
|
restart: always
|
||||||
image: rabbitmq:3.10-alpine
|
image: rabbitmq:3.10-alpine
|
||||||
@ -182,6 +181,7 @@ services:
|
|||||||
environment:
|
environment:
|
||||||
- RABBITMQ_DEFAULT_USER=${RABBITMQ_DEFAULT_USER}
|
- RABBITMQ_DEFAULT_USER=${RABBITMQ_DEFAULT_USER}
|
||||||
- RABBITMQ_DEFAULT_PASS=${RABBITMQ_DEFAULT_PASS}
|
- RABBITMQ_DEFAULT_PASS=${RABBITMQ_DEFAULT_PASS}
|
||||||
|
|
||||||
# Front-end services
|
# Front-end services
|
||||||
fe-sbt:
|
fe-sbt:
|
||||||
build:
|
build:
|
||||||
@ -189,7 +189,6 @@ services:
|
|||||||
shm_size: 10gb
|
shm_size: 10gb
|
||||||
dockerfile: Dockerfile
|
dockerfile: Dockerfile
|
||||||
shm_size: 10gb
|
shm_size: 10gb
|
||||||
image: sidp/cope2n-fe-fi-sbt
|
|
||||||
# container_name: "sidp-cope2n-fe-ctel-sbt"
|
# container_name: "sidp-cope2n-fe-ctel-sbt"
|
||||||
privileged: true
|
privileged: true
|
||||||
ports:
|
ports:
|
||||||
|
BIN
invoice.jpg
Normal file
BIN
invoice.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 270 KiB |
@ -146,10 +146,10 @@ def process_file(data):
|
|||||||
}
|
}
|
||||||
|
|
||||||
invoice_files = [
|
invoice_files = [
|
||||||
('invoice_file', ('invoice.jpg', open("test_samples/sbt/big_image.jpg", "rb").read())),
|
('invoice_file', ('invoice.jpg', open("test_samples/sbt/invoice.jpg", "rb").read())),
|
||||||
]
|
]
|
||||||
imei_files = [
|
imei_files = [
|
||||||
('imei_files', ("test_samples/sbt/imei1.jpg", open("test_samples/sbt/big_image.jpg", "rb").read())),
|
('imei_files', ("test_samples/sbt/imei1.jpg", open("test_samples/sbt/invoice.jpg", "rb").read())),
|
||||||
('imei_files', ("test_samples/sbt/imei2.jpg", open("test_samples/sbt/imei2.jpg", "rb").read())),
|
('imei_files', ("test_samples/sbt/imei2.jpg", open("test_samples/sbt/imei2.jpg", "rb").read())),
|
||||||
('imei_files', ("test_samples/sbt/imei3.jpg", open("test_samples/sbt/imei3.jpg", "rb").read())),
|
('imei_files', ("test_samples/sbt/imei3.jpg", open("test_samples/sbt/imei3.jpg", "rb").read())),
|
||||||
('imei_files', ("test_samples/sbt/imei4.jpeg", open("test_samples/sbt/imei4.jpeg", "rb").read())),
|
('imei_files', ("test_samples/sbt/imei4.jpeg", open("test_samples/sbt/imei4.jpeg", "rb").read())),
|
||||||
|
157
speedtest_sync.py
Normal file
157
speedtest_sync.py
Normal file
@ -0,0 +1,157 @@
|
|||||||
|
import requests
|
||||||
|
import time
|
||||||
|
import argparse
|
||||||
|
import multiprocessing
|
||||||
|
import tqdm
|
||||||
|
import random
|
||||||
|
import traceback
|
||||||
|
|
||||||
|
|
||||||
|
# Command-line options for the synchronous speed test.
parser = argparse.ArgumentParser()
parser.add_argument("--host", dest="host", default="https://sbt.idp.sdsrv.ai", required=False)
parser.add_argument("-u", "--username", help="Username to connect to server", required=True)
parser.add_argument("-p", "--password", help="Password to connect to server", required=True)
parser.add_argument("--num_requests", type=int, help="Number of requests", required=False, default=100)
parser.add_argument("--num_workers", type=int, help="Number of workers", required=False, default=3)
parser.add_argument("--checking_interval", type=float, help="Interval result checking time", required=False, default=0.5)
args = parser.parse_args()

PROCESSING_TIMEOUT = 60


# =================================================================
# GET THE TOKEN
response = requests.post(f'{args.host}/api/ctel/login/', json={
    'username': args.username,
    'password': args.password
})
try:
    token = response.json()['token']
except (ValueError, KeyError, TypeError):
    # Non-JSON body, or JSON without a "token" entry. Every later request
    # needs the token (and the name would otherwise stay undefined), so stop
    # here instead of failing later with a NameError.
    print("Failed to login")
    print(response.content)
    raise SystemExit(1)
# After the login, store the token in the memory (RAM) or DB
# Re-login to issue a new token after 6 days.
# =================================================================
|
||||||
|
|
||||||
|
def process_file(data):
    """Send one request to the synchronous endpoint and time the round trip.

    Args:
        data: ``(files, token)`` pair. ``files`` is a list of multipart
            entries passed to ``requests.post(files=...)``; ``token`` is
            sent as the ``Authorization`` header.

    Returns:
        A dict with the keys ``success``, ``status``, ``upload_time``,
        ``process_time`` and ``num_files``. On success ``status`` is 200 and
        both timings hold the duration of the call; on failure ``status``
        describes the problem ("timeout", "unknown error",
        "invalid response" or the HTTP status code) and the numeric fields
        are 0.
    """
    def failure(status):
        # Common shape of every unsuccessful result.
        return {
            "success": False,
            "status": status,
            "upload_time": 0,
            "process_time": 0,
            "num_files": 0,
        }

    files, token = data
    num_files = len(files)
    # Build a new list instead of appending, so the caller's list is left untouched.
    payload = list(files) + [('processType', (None, 12))]
    # =================================================================
    # UPLOAD THE FILE
    start_time = time.time()
    try:
        response = requests.post(f'{args.host}/api/ctel/images/process_sync/', headers={
            'Authorization': token,
        }, files=payload, timeout=300)
    except requests.exceptions.Timeout:
        print("Timeout occurred while uploading")
        return failure("timeout")
    except Exception as e:
        print(e)
        traceback.print_exc()
        print("Unknown exception occurred while uploading")
        return failure("unknown error")
    upload_time = time.time() - start_time
    # =================================================================

    try:
        body = response.json()
        body.pop("files", None)
        print(body)
    except (ValueError, TypeError, AttributeError):
        # The body is not a JSON object; this is not a timeout, so report it as such.
        print(response.content)
        return failure("invalid response")

    if not response.ok:
        # An HTTP error with a JSON body must not be counted as a success.
        return failure(response.status_code)

    return {
        "success": True,
        "status": 200,
        "upload_time": upload_time,
        # The call is synchronous, so upload and processing share one measurement.
        "process_time": upload_time,
        "num_files": num_files,
    }
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def _read_bytes(path):
    """Return the full contents of ``path``, closing the file afterwards."""
    with open(path, "rb") as f:
        return f.read()


# Multipart entries for the requests: one invoice document plus a pool of IMEI images.
invoice_files = [
    ('invoice_file', ('invoice.pdf', _read_bytes("test_samples/20220303025923NHNE_20220222_Starhub_Order_Confirmation_by_Email.pdf"))),
]
# invoice_files = [
#     ('invoice_file', ('invoice.jpg', open("test_samples/sbt/invoice.jpg", "rb").read())),
# ]
imei_files = [
    # NOTE(review): this entry is named imei1.jpg but carries the bytes of invoice.jpg -- confirm that is intended.
    ('imei_files', ("test_samples/sbt/imei1.jpg", _read_bytes("test_samples/sbt/invoice.jpg"))),
    ('imei_files', ("test_samples/sbt/imei2.jpg", _read_bytes("test_samples/sbt/imei2.jpg"))),
    ('imei_files', ("test_samples/sbt/imei3.jpg", _read_bytes("test_samples/sbt/imei3.jpg"))),
    ('imei_files', ("test_samples/sbt/imei4.jpeg", _read_bytes("test_samples/sbt/imei4.jpeg"))),
    ('imei_files', ("test_samples/sbt/imei5.jpg", _read_bytes("test_samples/sbt/imei5.jpg"))),
]
|
||||||
|
def get_imei_files():
    """Return the leading IMEI entries to attach to a request.

    The count is fixed at 1 (the random choice is disabled) and is printed
    on every call.
    """
    # num_files = random.randint(1, len(imei_files) + 1)
    count = 1
    print("Num imeis", count)
    # print("Num of imei files:", len(files))
    return imei_files[:count]
|
||||||
|
def get_files():
    """Return a new list: the invoice entries followed by the selected IMEI entries."""
    imei_part = get_imei_files()
    combined = invoice_files + imei_part
    return combined
|
||||||
|
def gen_input(num_input):
    """Lazily yield ``num_input`` work items of the form ``(files, token)``."""
    yield from ((get_files(), token) for _ in range(num_input))
|
||||||
|
# Fan the requests out over the worker pool. Every result is consumed inside
# the ``with`` block, so the pool can be shut down as soon as the loop ends.
results = []
with multiprocessing.Pool(processes=args.num_workers) as pool:
    for result in tqdm.tqdm(pool.imap_unordered(process_file, gen_input(num_input=args.num_requests)), total=args.num_requests):
        results.append(result)

print("## TEST REPORT #################################")
print("Number of requests: {}".format(args.num_requests))
print("Number of concurrent requests: {}".format(args.num_workers))
print("Number of files: 1 invoice, 1-5 imei files (random)")
print("Query time interval for result: {:.3f}s ".format(args.checking_interval))
print("--------------------------------------")
print("SUCCESS RATE")
counter = {}
for result in results:
    counter[result["status"]] = counter.get(result["status"], 0) + 1
total_requests = sum(counter.values())
print("Success rate: {}".format(counter.get(200, 0) / total_requests if total_requests > 0 else -1))
print("Statuses:", counter)
print("--------------------------------------")
print("TIME BY REQUEST")
uploading_time = [x["upload_time"] for x in results if x["success"]]
processing_time = [x["process_time"] for x in results if x["success"]]
if not uploading_time:
    # No request succeeded: the averages below are undefined, so skip them
    # rather than dividing by zero or calling min()/max() on an empty list.
    print("No valid uploading time")
    print("Check the results!")
else:
    print("Uploading time (Avg / Min / Max): {:.3f}s {:.3f}s {:.3f}s".format(sum(uploading_time) / len(uploading_time), min(uploading_time), max(uploading_time)))
    print("Processing time (Avg / Min / Max): {:.3f}s {:.3f}s {:.3f}s".format(sum(processing_time) / len(processing_time), min(processing_time), max(processing_time)))
print("--------------------------------------")
print("TIME BY IMAGE")
num_images = sum(x["num_files"] for x in results if x["success"])
print("Total images:", num_images)
if num_images > 0:
    print("Uploading time: {:.3f}s".format(sum(uploading_time) / num_images))
    print("Processing time: {:.3f}s".format(sum(processing_time) / num_images))
print("--------------------------------------")
|
34
test_pdf_reader.py
Normal file
34
test_pdf_reader.py
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
from PyPDF2 import PdfReader, PdfWriter
|
||||||
|
from PIL import Image
|
||||||
|
from pdf2image import convert_from_bytes
|
||||||
|
|
||||||
|
|
||||||
|
def get_first_page_pdf(filename, max_size=2048):
    """Render the first page of a PDF as an image after rescaling the page.

    Args:
        filename: Source accepted by ``PdfReader`` (e.g. a path).
        max_size: Target length, in millimetres, of the page's longest side.

    Returns:
        The first image produced by ``convert_from_bytes`` for the rescaled
        page.
    """
    # This script does not import ``io`` at module level; import it here so
    # that ``io.BytesIO`` below resolves instead of raising NameError.
    import io

    def pdf_scale_page(page, size=297):
        """Scale page so that its longest side measures ``size`` mm."""
        # The media box is [left, bottom, right, top]. Its upper-right corner
        # only equals (width, height) when the lower-left corner is at the
        # origin, so derive the real extent from all four coordinates.
        left, bottom, right, top = (float(v) for v in page.mediabox)
        longest_side = max(abs(right - left), abs(top - bottom))
        # Units of measurement are not "points". The units of measurement are
        # user defined and default to 1/72 inch (section 4.2.1 "Coordinate
        # spaces" of the PDF specification); 1 inch = 25.4 mm.
        pmm = (1 / 72 * 25.4)
        ks = size / (longest_side * pmm)
        page.scale_by(ks)
        return page

    reader = PdfReader(filename)
    scaled_page = pdf_scale_page(reader.pages[0], max_size)

    # Write the single rescaled page to an in-memory PDF so it can be rasterized.
    pdf_bytes = io.BytesIO()
    dst_pdf = PdfWriter()
    dst_pdf.add_page(scaled_page)
    dst_pdf.write(pdf_bytes)

    image = convert_from_bytes(pdf_bytes.getvalue())
    if isinstance(image, list):
        return image[0]
    return image
|
||||||
|
|
||||||
|
# Manual check: rasterize the first page of the sample PDF and write it out as a JPEG.
img = get_first_page_pdf(
    "test_samples/20220303025923NHNE_20220222_Starhub_Order_Confirmation_by_Email.pdf",
    max_size=300,
)
img.save("invoice.jpg", format="JPEG")
|
Binary file not shown.
BIN
test_samples/sbt/invoice_10k.jpg
Normal file
BIN
test_samples/sbt/invoice_10k.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 2.5 MiB |
45282
test_samples/temp_SAPea0fc9acc1254372a5b8490c5de95721.pdf
Normal file
45282
test_samples/temp_SAPea0fc9acc1254372a5b8490c5de95721.pdf
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user