Add: support for PDF files

dx-tan committed 2023-12-05 12:59:06 +07:00
parent 7e9a8e2d4b
commit a84e3dce05
9 changed files with 134 additions and 124 deletions

View File

@@ -3,6 +3,7 @@ import urllib
 import random
 import numpy as np
 from pathlib import Path
+import uuid
 import sys, os
 cur_dir = str(Path(__file__).parents[2])
 sys.path.append(cur_dir)
@@ -35,14 +36,15 @@ def sbt_predict(image_url, engine) -> None:
     save_dir = "./tmp_results"
     # image_path = os.path.join(save_dir, f"{image_url}.jpg")
-    image_path = os.path.join(save_dir, "abc.jpg")
-    cv2.imwrite(image_path, img)
-    outputs = process_img(img_path=image_path,
+    tmp_image_path = os.path.join(save_dir, f"{uuid.uuid4()}.jpg")
+    cv2.imwrite(tmp_image_path, img)
+    outputs = process_img(img_path=tmp_image_path,
                           save_dir=save_dir,
                           engine=engine,
                           export_all=False,
                           option=option)
+    os.remove(tmp_image_path)
     return outputs

 def predict(page_numb, image_url):
@@ -70,6 +72,7 @@ def predict(page_numb, image_url):
     """
     sbt_result = sbt_predict(image_url, engine=sbt_engine)
+    print(sbt_result)
     output_dict = {
         "document_type": "invoice",
         "document_class": " ",

View File

@@ -102,6 +102,8 @@ def merge_sbt_output(loutputs):
             })
         return output
+
+    print("concat outputs: \n", loutputs)
     merged_output = []
     combined_output = {"retailername": None,
                        "sold_to_party": None,

View File

@@ -1,6 +1,7 @@
 import time
 import uuid
 from wsgiref.util import FileWrapper
+import base64

 from django.core.files.uploadedfile import TemporaryUploadedFile
 from django.db import transaction
@@ -10,15 +11,15 @@ from drf_spectacular.utils import extend_schema
 from rest_framework import status, viewsets
 from rest_framework.decorators import action
 from rest_framework.response import Response
-import io
 from typing import List

 from fwd import settings
+from ..celery_worker.client_connector import c_connector
 from ..annotation.api import throw_on_failure
 from ..constant.common import allowed_p_type, ProcessType, REQUEST_ID, FOLDER_TYPE, \
-    FolderFileType, TEMPLATE_ID, EntityStatus, standard_ocr_list, pdf_extensions, image_extensions
+    FolderFileType, TEMPLATE_ID, EntityStatus, standard_ocr_list, pdf_extensions, image_extensions, allowed_file_extensions
 from ..exception.exceptions import RequiredFieldException, InvalidException, NotFoundException, \
-    PermissionDeniedException, LimitReachedException, LockedEntityException
+    PermissionDeniedException, LimitReachedException, LockedEntityException, FileContentInvalidException
 from ..models import SubscriptionRequest, UserProfile, SubscriptionRequestFile, OcrTemplate, Subscription
 from ..response.ReportSerializer import ReportSerializer
 from ..utils import FileUtils, ProcessUtil
@@ -43,7 +44,7 @@ class CtelViewSet(viewsets.ViewSet):
             }
         }, responses=None, tags=['ocr'])
     @action(detail=False, url_path="image/process", methods=["POST"])
-    @transaction.atomic
+    # @transaction.atomic
     def process(self, request):
         s_time = time.time()
         # print(30*"=")
@@ -59,7 +60,7 @@ class CtelViewSet(viewsets.ViewSet):
         rq_id = provider_code + uuid.uuid4().hex

         file_obj: TemporaryUploadedFile = validated_data['file']
-        file_extension = file_obj.name.split(".")[-1]
+        file_extension = file_obj.name.split(".")[-1].lower()
         p_type = validated_data['type']
         file_name = f"temp_{rq_id}.{file_extension}"
@@ -73,12 +74,16 @@ class CtelViewSet(viewsets.ViewSet):
         from ..celery_worker.client_connector import c_connector
         file_obj.seek(0)
         file_path = FileUtils.resize_and_save_file(file_name, new_request, file_obj, 100)
-        if settings.S3_ENDPOINT!="":
-            FileUtils.save_to_S3(file_name, new_request, file_obj.read())
-        # print(f"[DEBUG]: file_path: {file_path}")
+        S3_path = FileUtils.save_to_S3(file_name, new_request, file_path)
+
+        files = [{
+            "file_name": file_name,
+            "file_path": file_path, # local path to file
+            "file_type": ""
+        },]
+
         if file_extension in pdf_extensions:
-            c_connector.do_pdf((rq_id, sub.id, p_type, user.id, file_name, file_path))
+            c_connector.do_pdf((rq_id, sub.id, p_type, user.id, files))
             # b_url = ProcessUtil.process_pdf_file(file_name, file_obj, new_request, user)
         elif file_extension in image_extensions:
             b_url = ProcessUtil.process_image_file(file_name, file_obj, new_request, user)
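With this change the view no longer streams bytes to S3 itself: it saves the upload locally, mirrors it to S3 by path, and hands the Celery worker a list of file descriptors instead of a single (file_name, file_path) pair. The payload contract implied by the new code is roughly the following (values illustrative):

    files = [{
        "file_name": file_name,   # e.g. f"temp_{rq_id}.pdf"
        "file_path": file_path,   # local path returned by resize_and_save_file
        "file_type": "",          # doc_type; left empty for the single-file endpoint
    }]
    c_connector.do_pdf((rq_id, sub.id, p_type, user.id, files))

The multi-file endpoint below builds the same shape (compact_files), with file_type set to the submitted doc_type.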
@@ -117,7 +122,7 @@ class CtelViewSet(viewsets.ViewSet):
             }
         }, responses=None, tags=['ocr'])
     @action(detail=False, url_path="images/process", methods=["POST"])
-    @transaction.atomic
+    # @transaction.atomic
    def processes(self, request):
         s_time = time.time()
         # print(30*"=")
@@ -148,34 +153,27 @@ class CtelViewSet(viewsets.ViewSet):
                                            provider_code=provider_code,
                                            subscription=sub)
         new_request.save()

         count = 0
+        compact_files = []
         for doc_type, doc_files in files.items():
             for i, doc_file in enumerate(doc_files):
                 _ext = doc_file.name.split(".")[-1]
-                if _ext not in image_extensions:
+                if _ext not in allowed_file_extensions:
                     return JsonResponse(status=status.HTTP_406_NOT_ACCEPTABLE, data={"request_id": rq_id, "message": f"File {_ext} is now allowed"})
                 _name = f"temp_{doc_type}_{rq_id}_{i}.{_ext}"
                 doc_file.seek(0)
-                # file_path = FileUtils.resize_and_save_file(_name, new_request, doc_file, 100)
-                # input_file = io.BytesIO(open(doc_file, 'rb').read())
-                input_file = doc_file.read()
-                if settings.S3_ENDPOINT!="":
-                    FileUtils.save_to_S3(_name, new_request, input_file)
-                else:
-                    file_path = FileUtils.resize_and_save_file(_name, new_request, doc_file, 100)
-                list_urls.append(ProcessUtil.process_image_file(_name, doc_file, new_request, user)[0])
-                list_urls[count]["page_number"] = count
-                list_urls[count]["doc_type"] = doc_type
+                file_path = FileUtils.resize_and_save_file(_name, new_request, doc_file, 100)
+                S3_path = FileUtils.save_to_S3(_name, new_request, file_path)
                 count += 1
+                this_file = {
+                    "file_name": _name,
+                    "file_path": file_path,
+                    "file_type": doc_type
+                }
+                compact_files.append(this_file)
+        c_connector.do_pdf((rq_id, sub.id, p_type, user.id, compact_files))

-        if p_type in standard_ocr_list:
-            ProcessUtil.send_to_queue2(rq_id, sub.id, list_urls, user.id, p_type)
-        elif p_type == ProcessType.TEMPLATE_MATCHING.value:
-            ProcessUtil.send_template_queue(rq_id, list_urls, validated_data['template'], user.id)
         j_time = time.time()
-        print(f"[INFO]: Duration of Pre-processing: {j_time - s_time}s")
-        print(f"[INFO]: list_urls: {list_urls}")
         return JsonResponse(status=status.HTTP_200_OK, data={"request_id": rq_id})

     @extend_schema(request=None, responses=None, tags=['data'])
@@ -289,6 +287,8 @@ class CtelViewSet(viewsets.ViewSet):
         serializer: ReportSerializer = ReportSerializer(data=report_filter, many=True)
         serializer.is_valid()
         # print(f"[DEBUG]: result: {serializer.data[0]}")
+        if report_filter[0].status == 400:
+            raise FileContentInvalidException()
         return Response(status=status.HTTP_200_OK, data=serializer.data[0])
@@ -317,14 +317,13 @@ class CtelViewSet(viewsets.ViewSet):
             # return Response(status=status.HTTP_200_OK, data=xml_as_string, content_type="application/xml; charset=utf-8")
             return HttpResponse(xml_as_string,content_type="text/xml")

         serializer: ReportSerializer = ReportSerializer(data=report_filter, many=True)
         serializer.is_valid()
         return Response(status=status.HTTP_200_OK, data=serializer.data[0])

     @action(detail=False, url_path="image/process/app", methods=["POST"])
-    @transaction.atomic
+    # @transaction.atomic
     def process_app(self, request):
         app_id = "THIS_IS_OUR_APP_TEST_ACCOUNT_9123"
         users = UserProfile.objects.filter(sync_id=app_id)

View File

@@ -4,11 +4,12 @@ import fitz
 import uuid
 import os
 import base64
-import boto3
 from fwd_api.celery_worker.worker import app
-from ..constant.common import allowed_p_type, ProcessType, REQUEST_ID, FOLDER_TYPE, \
-    FolderFileType, TEMPLATE_ID, EntityStatus, standard_ocr_list, pdf_extensions
+from ..constant.common import ProcessType, \
+    FolderFileType, standard_ocr_list, image_extensions
+from django.core.files.uploadedfile import TemporaryUploadedFile
+from ..exception.exceptions import FileContentInvalidException
 from ..utils import FileUtils, ProcessUtil, S3_process
 from celery.utils.log import get_task_logger
 from fwd import settings
@@ -24,9 +25,27 @@ s3_client = S3_process.MinioS3Client(
 )

 def process_pdf_file(file_name: str, file_path: str, request, user) -> list:
-    from fwd_api.models import SubscriptionRequest, SubscriptionRequestFile
-    from fwd_api.constant.common import ProcessType
-    doc: fitz.Document = fitz.open(stream=FileUtils.get_file(file_path).read(), filetype="pdf")
+    from fwd_api.models import SubscriptionRequestFile
+    try:
+        doc: fitz.Document = fitz.open(stream=FileUtils.get_file(file_path).read(), filetype="pdf")
+        # Origin file
+        new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=file_path,
+                                                                            request=request,
+                                                                            file_name=file_name,
+                                                                            code=f'FIL{uuid.uuid4().hex}')
+        new_request_file.save()
+        # Sub-file
+        return ProcessUtil.pdf_to_images_urls(doc, request, user)
+    except Exception as e:
+        request.status = 400
+        request.predict_result = {"status": 400, "content": "", "message": f"Unable to extract pdf files {e}"}
+        request.save()
+        return None
+
+def process_pdf_byte(file_name: str, file_path: str, request, user, file_obj) -> list:
+    from fwd_api.models import SubscriptionRequestFile
+    doc: fitz.Document = fitz.open(stream=file_obj, filetype="pdf")

     # Origin file
     new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=file_path,
@@ -34,8 +53,14 @@ def process_pdf_file(file_name: str, file_path: str, request, user) -> list:
                                                                         file_name=file_name,
                                                                         code=f'FIL{uuid.uuid4().hex}')
     new_request_file.save()
-    # Sub-file
-    return ProcessUtil.pdf_to_images_urls(doc, request, user)
+    try:
+        # Sub-file
+        return ProcessUtil.pdf_to_images_urls(doc, request, user)
+    except Exception as e:
+        request.status = 400
+        request.predict_result = {"status": 400, "content": "", "message": f"Unable to extract pdf files {e}"}
+        request.save()
+        return None

 def process_image_file(file_name: str, file_path, request, user) -> list:
@@ -54,25 +79,45 @@ def process_image_file(file_name: str, file_path, request, user) -> list:

 @app.task(name='do_pdf')
-def process_pdf(rq_id, sub_id, p_type, user_id, file_name, file_path):
+def process_pdf(rq_id, sub_id, p_type, user_id, files):
+    """
+    pdf_files: [{
+        "file_name": "",
+        "file_path": "", # local path to file
+        "file_type": ""
+    },]
+    """
     from fwd_api.models import SubscriptionRequest, SubscriptionRequestFile,UserProfile
+    start = time.time()
+    from django.conf import settings
     new_request = SubscriptionRequest.objects.filter(request_id=rq_id)[0]
     user = UserProfile.objects.filter(id=user_id).first()
-    file_extension = file_name.split(".")[-1]
-    # logger.info(f"[DEBUG]: file_path: {file_path}")
-    if file_extension in pdf_extensions:
-        b_url = process_pdf_file(file_name, file_path, new_request, user)
-    else:
-        b_url = process_image_file(file_name, file_path, new_request, user)
+    b_urls = []
+    for i, file in enumerate(files):
+        extension = file["file_name"].split(".")[-1].lower()
+        if extension == "pdf":
+            _b_urls = process_pdf_file(file["file_name"], file["file_path"], new_request, user)
+            if _b_urls is None:
+                raise FileContentInvalidException
+            for i in range(len(_b_urls)):
+                _b_urls[i]["doc_type"] = file["file_type"]
+            # b_urls += _b_urls # TODO: Client may request all images in a file, for now, extract the first page only
+            for j in range(len(b_urls)):
+                _b_urls[j]["page_number"] = j + len(b_urls)
+            b_urls.append(_b_urls[0])
+        elif extension in image_extensions:
+            this_url = ProcessUtil.process_image_local_file(file["file_name"], file["file_path"], new_request, user)[0]
+            this_url["page_number"] = len(b_urls)
+            if file["file_type"]:
+                this_url["doc_type"] = file["file_type"]
+            b_urls.append(this_url)

-    j_time = time.time()
-    # logger.info(f"[INFO]: Duration of Pre-processing: {j_time - 0}s")
-    # logger.info(f"[INFO]: b_url: {b_url}")
+    start_process = time.time()
+    logger.info(f"BE proccessing time: {start_process - start}")
     if p_type in standard_ocr_list:
-        ProcessUtil.send_to_queue2(rq_id, sub_id, b_url, user_id, p_type)
+        ProcessUtil.send_to_queue2(rq_id, sub_id, b_urls, user_id, p_type)
     if p_type == ProcessType.TEMPLATE_MATCHING.value:
-        ProcessUtil.send_template_queue(rq_id, b_url, '', user_id)
+        ProcessUtil.send_template_queue(rq_id, b_urls, '', user_id)

 @app.task(name='upload_file_to_s3')
 def upload_file_to_s3(local_file_path, s3_key):
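do_pdf now normalizes every input, PDF page or plain image, into one b_urls list before queueing. Going by the dicts built in process_image_local_file / pdf_to_images_urls (last file of this commit), each entry looks roughly like the sketch below; note that the commented-out `b_urls += _b_urls` TODO means only the first page of a PDF is forwarded for now (values illustrative):

    b_urls = [{
        "file_url": "<public URL built by FileUtils.build_url>",
        "page_number": 0,
        "request_file_id": "FIL<uuid hex>",
        "doc_type": "invoice",   # copied from the payload's "file_type"
    }]
    ProcessUtil.send_to_queue2(rq_id, sub_id, b_urls, user_id, p_type)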
@@ -81,7 +126,7 @@ def upload_file_to_s3(local_file_path, s3_key):
     if res != None and res["ResponseMetadata"]["HTTPStatusCode"] == 200:
         os.remove(local_file_path)
     else:
-        print(f"[INFO] S3 is not available, skipping,...")
+        logger.info(f"S3 is not available, skipping,...")

 @app.task(name='upload_obj_to_s3')
 def upload_obj_to_s3(byte_obj, s3_key):
@@ -89,4 +134,4 @@ def upload_obj_to_s3(byte_obj, s3_key):
         obj = base64.b64decode(byte_obj)
         res = s3_client.update_object(s3_key, obj)
     else:
-        print(f"[INFO] S3 is not available, skipping,...")
+        logger.info(f"S3 is not available, skipping,...")

View File

@@ -4,67 +4,7 @@ import re
 image_extensions = ('jpg', 'jpeg', 'png', 'JPG', 'JPEG', 'PNG')
 pdf_extensions = ('pdf', 'PDF')
 allowed_file_extensions = image_extensions + pdf_extensions
-allowed_p_type = [2, 3, 4, 5, 6]
-
-LIST_BOX_MESSAGE = 'list_box'
-NAME_MESSAGE = 'name'
-VN_AND_SPACE_REGEX = r"[AĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴA-Z0-9 ]+"
-IMAGE_NAME = "image_croped.jpg"
-TEMPLATE_ID = 'template_id'
-pattern = re.compile(VN_AND_SPACE_REGEX)
-REQUEST_ID = 'requestId'
-FOLDER_TYPE = 'folderType'
-MAX_NUMBER_OF_TEMPLATE_DATA_BOX = 20
-MAX_NUMBER_OF_TEMPLATE_ANCHOR_BOX = 3
-NUMBER_OF_ITEM_IN_A_BOX = 4 # 4 coordinates
-ESCAPE_VALUE = 'W5@X8#'
-USER_MESSAGE = 'user'
-PLAN_MESSAGE = 'plan'
-
-class FolderFileType(Enum):
-    TEMPLATES = 'templates'
-    REQUESTS = 'requests'
-
-class FileCategory(Enum):
-    CROP = 'Crop'
-    Origin = 'Origin'
-    BREAK = 'Break'
-
-class EntityStatus(Enum):
-    ACTIVE = 1
-    INACTIVE = 0
-
-class TEMPLATE_BOX_TYPE(Enum):
-    ANCHOR = 1
-    DATA = 2
-
-class ProcessType(Enum):
-    TEMPLATE_MATCHING = 2
-    ID_CARD = 3
-    DRIVER_LICENSE = 4
-    INVOICE = 5
-    OCR_WITH_BOX = 6
-    AP_INVOICE = 7
-    FI_INVOICE = 10
-
-class PlanCode(Enum):
-    TRIAL = 'TRIAL'
-    BASIC = 'BASIC'
-    ADVANCED = 'ADVANCED'
-
-standard_ocr_list = (ProcessType.INVOICE.value, ProcessType.ID_CARD.value, ProcessType.DRIVER_LICENSE.value, ProcessType.OCR_WITH_BOX.value)
-
-from enum import Enum
-import re
-
-image_extensions = ('jpg', 'jpeg', 'png', 'JPG', 'JPEG', 'PNG')
-pdf_extensions = ('pdf', 'PDF')
-# allowed_file_extensions = image_extensions + pdf_extensions
-allowed_file_extensions = image_extensions
+# allowed_file_extensions = image_extensions
 allowed_p_type = [12]
 LIST_BOX_MESSAGE = 'list_box'
 NAME_MESSAGE = 'name'
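Since the view and validator now lower-case the extension before the membership check, the mixed-case tuples are redundant; a lowercase frozenset would state the same rule more directly. A possible simplification, not something this commit does:

    image_extensions = frozenset({"jpg", "jpeg", "png"})
    pdf_extensions = frozenset({"pdf"})
    allowed_file_extensions = image_extensions | pdf_extensions

    def is_allowed(filename: str) -> bool:
        return "." in filename and filename.rsplit(".", 1)[-1].lower() in allowed_file_extensions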

View File

@@ -97,6 +97,12 @@ class FileFormatInvalidException(InvalidException):
     default_detail = 'File invalid type'
     detail_with_arg = 'File must have type {}'

+class FileContentInvalidException(InvalidException):
+    status_code = status.HTTP_400_BAD_REQUEST
+    default_code = 4007
+    default_detail = 'Invalid content file'
+    detail_with_arg = 'One of the files is broken, please select other file and try again'
+
 class TokenExpiredException(GeneralException):
     status_code = status.HTTP_401_UNAUTHORIZED
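FileContentInvalidException follows the same pattern as the surrounding classes: an HTTP status plus a project-specific default_code, so raising it surfaces to the client as a 400 carrying code 4007. A hedged usage sketch, assuming InvalidException behaves like the DRF APIException subclasses elsewhere in this module:

    # e.g. in the result endpoint, once the worker has marked the request as failed:
    if report_filter[0].status == 400:
        raise FileContentInvalidException()   # -> HTTP 400, default_code 4007, "Invalid content file"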

View File

@@ -7,8 +7,10 @@ from fwd_api.constant.common import EntityStatus

 class UserProfile(models.Model):
     id = models.AutoField(primary_key=True)
-    full_name: str = models.CharField(max_length=200)
-    sync_id: str = models.CharField(max_length=100)
+    user_name: str = models.CharField(max_length=200, null=True)
+    password: str = models.CharField(max_length=200, null=True)
+    full_name: str = models.CharField(max_length=200, null=True)
+    sync_id: str = models.CharField(max_length=100, null=True)
     provider_id: str = models.CharField(max_length=100, default='Ctel') # CTel/GCP/Azure :v
     current_total_pages: int = models.IntegerField(default=0)
     limit_total_pages: int = models.IntegerField(default=0)
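The UserProfile change adds user_name and password columns and relaxes full_name / sync_id to nullable, so a schema migration has to ship with it. A minimal sketch using Django's management API, with the app label fwd_api assumed from the import paths in this commit:

    from django.core.management import call_command

    call_command("makemigrations", "fwd_api")   # generates the AddField / AlterField migration
    call_command("migrate", "fwd_api")          # applies it to the database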

View File

@@ -27,7 +27,7 @@ def validate_list_file(files, max_file_num=settings.MAX_UPLOAD_FILES_IN_A_REQUES
         if not isinstance(f, TemporaryUploadedFile):
             # print(f'[DEBUG]: {f.name}')
             raise InvalidException(excArgs="files")
-        extension = f.name.split(".")[-1] in allowed_file_extensions
+        extension = f.name.split(".")[-1].lower() in allowed_file_extensions
         if not extension or "." not in f.name:
             raise FileFormatInvalidException(excArgs=allowed_file_extensions)
         if f.size > settings.MAX_UPLOAD_SIZE_OF_A_FILE:
@@ -129,14 +129,15 @@ def resize_and_save_file(file_name: str, rq: SubscriptionRequest, file: Temporar
         print(f"[ERROR]: {e}")
         raise ServiceUnavailableException()

-def save_to_S3(file_name, rq, obj):
+def save_to_S3(file_name, rq, local_file_path):
     try:
-        base64_obj = base64.b64encode(obj).decode('utf-8')
+        # base64_obj = base64.b64encode(obj).decode('utf-8')
         file_path = get_folder_path(rq)
         assert len(file_path.split("/")) >= 2, "file_path must have at least process type and request id"
         s3_key = os.path.join(file_path.split("/")[-2], file_path.split("/")[-1], file_name)
         # c_connector.upload_file_to_s3((file_path, s3_key))
-        c_connector.upload_obj_to_s3((base64_obj, s3_key))
+        c_connector.upload_file_to_s3((local_file_path, s3_key))
+        return s3_key
     except Exception as e:
         print(f"[ERROR]: {e}")
         raise ServiceUnavailableException()
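save_to_S3 now takes the local file path instead of raw bytes and returns the computed s3_key, which keeps large base64 blobs out of the Celery broker and lets the upload task delete the file after a successful transfer. Caller-side the contract reduces to:

    file_path = FileUtils.resize_and_save_file(file_name, new_request, file_obj, 100)
    s3_key = FileUtils.save_to_S3(file_name, new_request, file_path)   # enqueues upload_file_to_s3(file_path, s3_key)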

View File

@@ -376,6 +376,18 @@ def process_image_file(file_name: str, file_obj: TemporaryUploadedFile, request:
         'request_file_id': new_request_file.code
     }]

+def process_image_local_file(file_name: str, file_path: str, request: SubscriptionRequest, user) -> list:
+    new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=file_path,
+                                                                        request=request,
+                                                                        file_name=file_name,
+                                                                        code=f'FIL{uuid.uuid4().hex}')
+    new_request_file.save()
+    return [{
+        'file_url': FileUtils.build_url(FolderFileType.REQUESTS.value, request.request_id, user.id, file_name),
+        'page_number': 0,
+        'request_file_id': new_request_file.code
+    }]
+
 def pdf_to_images_urls(doc: fitz.Document, request: SubscriptionRequest, user, dpi: int = 300) -> list:
     def resize(image, max_w=1920, max_h=1080):
         logger.info(f"[DEBUG]: image.size: {image.size}, type(image): {type(image)}")