Add: support for pdf file

This commit is contained in:
dx-tan 2023-12-05 12:59:06 +07:00
parent 7e9a8e2d4b
commit a84e3dce05
9 changed files with 134 additions and 124 deletions

View File

@ -3,6 +3,7 @@ import urllib
import random
import numpy as np
from pathlib import Path
import uuid
import sys, os
cur_dir = str(Path(__file__).parents[2])
sys.path.append(cur_dir)
@ -35,14 +36,15 @@ def sbt_predict(image_url, engine) -> None:
save_dir = "./tmp_results"
# image_path = os.path.join(save_dir, f"{image_url}.jpg")
image_path = os.path.join(save_dir, "abc.jpg")
cv2.imwrite(image_path, img)
tmp_image_path = os.path.join(save_dir, f"{uuid.uuid4()}.jpg")
cv2.imwrite(tmp_image_path, img)
outputs = process_img(img_path=image_path,
outputs = process_img(img_path=tmp_image_path,
save_dir=save_dir,
engine=engine,
export_all=False,
option=option)
os.remove(tmp_image_path)
return outputs
def predict(page_numb, image_url):
@ -70,6 +72,7 @@ def predict(page_numb, image_url):
"""
sbt_result = sbt_predict(image_url, engine=sbt_engine)
print(sbt_result)
output_dict = {
"document_type": "invoice",
"document_class": " ",

View File

@ -102,6 +102,8 @@ def merge_sbt_output(loutputs):
})
return output
print("concat outputs: \n", loutputs)
merged_output = []
combined_output = {"retailername": None,
"sold_to_party": None,

View File

@ -1,6 +1,7 @@
import time
import uuid
from wsgiref.util import FileWrapper
import base64
from django.core.files.uploadedfile import TemporaryUploadedFile
from django.db import transaction
@ -10,15 +11,15 @@ from drf_spectacular.utils import extend_schema
from rest_framework import status, viewsets
from rest_framework.decorators import action
from rest_framework.response import Response
import io
from typing import List
from fwd import settings
from ..celery_worker.client_connector import c_connector
from ..annotation.api import throw_on_failure
from ..constant.common import allowed_p_type, ProcessType, REQUEST_ID, FOLDER_TYPE, \
FolderFileType, TEMPLATE_ID, EntityStatus, standard_ocr_list, pdf_extensions, image_extensions
FolderFileType, TEMPLATE_ID, EntityStatus, standard_ocr_list, pdf_extensions, image_extensions, allowed_file_extensions
from ..exception.exceptions import RequiredFieldException, InvalidException, NotFoundException, \
PermissionDeniedException, LimitReachedException, LockedEntityException
PermissionDeniedException, LimitReachedException, LockedEntityException, FileContentInvalidException
from ..models import SubscriptionRequest, UserProfile, SubscriptionRequestFile, OcrTemplate, Subscription
from ..response.ReportSerializer import ReportSerializer
from ..utils import FileUtils, ProcessUtil
@ -43,7 +44,7 @@ class CtelViewSet(viewsets.ViewSet):
}
}, responses=None, tags=['ocr'])
@action(detail=False, url_path="image/process", methods=["POST"])
@transaction.atomic
# @transaction.atomic
def process(self, request):
s_time = time.time()
# print(30*"=")
@ -59,7 +60,7 @@ class CtelViewSet(viewsets.ViewSet):
rq_id = provider_code + uuid.uuid4().hex
file_obj: TemporaryUploadedFile = validated_data['file']
file_extension = file_obj.name.split(".")[-1]
file_extension = file_obj.name.split(".")[-1].lower()
p_type = validated_data['type']
file_name = f"temp_{rq_id}.{file_extension}"
@ -73,12 +74,16 @@ class CtelViewSet(viewsets.ViewSet):
from ..celery_worker.client_connector import c_connector
file_obj.seek(0)
file_path = FileUtils.resize_and_save_file(file_name, new_request, file_obj, 100)
if settings.S3_ENDPOINT!="":
FileUtils.save_to_S3(file_name, new_request, file_obj.read())
# print(f"[DEBUG]: file_path: {file_path}")
S3_path = FileUtils.save_to_S3(file_name, new_request, file_path)
files: [{
"file_name": file_name,
"file_path": file_path, # local path to file
"file_type": ""
},]
if file_extension in pdf_extensions:
c_connector.do_pdf((rq_id, sub.id, p_type, user.id, file_name, file_path))
c_connector.do_pdf((rq_id, sub.id, p_type, user.id, files))
# b_url = ProcessUtil.process_pdf_file(file_name, file_obj, new_request, user)
elif file_extension in image_extensions:
b_url = ProcessUtil.process_image_file(file_name, file_obj, new_request, user)
@ -117,7 +122,7 @@ class CtelViewSet(viewsets.ViewSet):
}
}, responses=None, tags=['ocr'])
@action(detail=False, url_path="images/process", methods=["POST"])
@transaction.atomic
# @transaction.atomic
def processes(self, request):
s_time = time.time()
# print(30*"=")
@ -148,34 +153,27 @@ class CtelViewSet(viewsets.ViewSet):
provider_code=provider_code,
subscription=sub)
new_request.save()
count = 0
compact_files = []
for doc_type, doc_files in files.items():
for i, doc_file in enumerate(doc_files):
_ext = doc_file.name.split(".")[-1]
if _ext not in image_extensions:
if _ext not in allowed_file_extensions:
return JsonResponse(status=status.HTTP_406_NOT_ACCEPTABLE, data={"request_id": rq_id, "message": f"File {_ext} is now allowed"})
_name = f"temp_{doc_type}_{rq_id}_{i}.{_ext}"
doc_file.seek(0)
# file_path = FileUtils.resize_and_save_file(_name, new_request, doc_file, 100)
# input_file = io.BytesIO(open(doc_file, 'rb').read())
input_file = doc_file.read()
if settings.S3_ENDPOINT!="":
FileUtils.save_to_S3(_name, new_request, input_file)
else:
file_path = FileUtils.resize_and_save_file(_name, new_request, doc_file, 100)
list_urls.append(ProcessUtil.process_image_file(_name, doc_file, new_request, user)[0])
list_urls[count]["page_number"] = count
list_urls[count]["doc_type"] = doc_type
S3_path = FileUtils.save_to_S3(_name, new_request, file_path)
count += 1
this_file = {
"file_name": _name,
"file_path": file_path,
"file_type": doc_type
}
compact_files.append(this_file)
c_connector.do_pdf((rq_id, sub.id, p_type, user.id, compact_files))
if p_type in standard_ocr_list:
ProcessUtil.send_to_queue2(rq_id, sub.id, list_urls, user.id, p_type)
elif p_type == ProcessType.TEMPLATE_MATCHING.value:
ProcessUtil.send_template_queue(rq_id, list_urls, validated_data['template'], user.id)
j_time = time.time()
print(f"[INFO]: Duration of Pre-processing: {j_time - s_time}s")
print(f"[INFO]: list_urls: {list_urls}")
return JsonResponse(status=status.HTTP_200_OK, data={"request_id": rq_id})
@extend_schema(request=None, responses=None, tags=['data'])
@ -289,6 +287,8 @@ class CtelViewSet(viewsets.ViewSet):
serializer: ReportSerializer = ReportSerializer(data=report_filter, many=True)
serializer.is_valid()
# print(f"[DEBUG]: result: {serializer.data[0]}")
if report_filter[0].status == 400:
raise FileContentInvalidException()
return Response(status=status.HTTP_200_OK, data=serializer.data[0])
@ -317,14 +317,13 @@ class CtelViewSet(viewsets.ViewSet):
# return Response(status=status.HTTP_200_OK, data=xml_as_string, content_type="application/xml; charset=utf-8")
return HttpResponse(xml_as_string,content_type="text/xml")
serializer: ReportSerializer = ReportSerializer(data=report_filter, many=True)
serializer.is_valid()
return Response(status=status.HTTP_200_OK, data=serializer.data[0])
@action(detail=False, url_path="image/process/app", methods=["POST"])
@transaction.atomic
# @transaction.atomic
def process_app(self, request):
app_id = "THIS_IS_OUR_APP_TEST_ACCOUNT_9123"
users = UserProfile.objects.filter(sync_id=app_id)

View File

@ -4,11 +4,12 @@ import fitz
import uuid
import os
import base64
import boto3
from fwd_api.celery_worker.worker import app
from ..constant.common import allowed_p_type, ProcessType, REQUEST_ID, FOLDER_TYPE, \
FolderFileType, TEMPLATE_ID, EntityStatus, standard_ocr_list, pdf_extensions
from ..constant.common import ProcessType, \
FolderFileType, standard_ocr_list, image_extensions
from django.core.files.uploadedfile import TemporaryUploadedFile
from ..exception.exceptions import FileContentInvalidException
from ..utils import FileUtils, ProcessUtil, S3_process
from celery.utils.log import get_task_logger
from fwd import settings
@ -24,8 +25,8 @@ s3_client = S3_process.MinioS3Client(
)
def process_pdf_file(file_name: str, file_path: str, request, user) -> list:
from fwd_api.models import SubscriptionRequest, SubscriptionRequestFile
from fwd_api.constant.common import ProcessType
from fwd_api.models import SubscriptionRequestFile
try:
doc: fitz.Document = fitz.open(stream=FileUtils.get_file(file_path).read(), filetype="pdf")
# Origin file
@ -36,6 +37,30 @@ def process_pdf_file(file_name: str, file_path: str, request, user) -> list:
new_request_file.save()
# Sub-file
return ProcessUtil.pdf_to_images_urls(doc, request, user)
except Exception as e:
request.status = 400
request.predict_result = {"status": 400, "content": "", "message": f"Unable to extract pdf files {e}"}
request.save()
return None
def process_pdf_byte(file_name: str, file_path: str, request, user, file_obj) -> list:
    """
    Open an in-memory PDF, record it as the request's origin file and split it into page images.

    Args:
        file_name: name stored on the SubscriptionRequestFile record.
        file_path: path stored on the SubscriptionRequestFile record.
        request: request the file belongs to; on failure its ``status`` and
            ``predict_result`` are overwritten and it is saved.
        user: forwarded unchanged to ``ProcessUtil.pdf_to_images_urls``.
        file_obj: handed to ``fitz.open`` as ``stream`` -- presumably the raw
            PDF bytes; confirm against callers.

    Returns:
        Whatever ``ProcessUtil.pdf_to_images_urls`` returns, or ``None`` when
        the PDF cannot be opened or converted.
    """
    from fwd_api.models import SubscriptionRequestFile
    try:
        # Opening is the step most likely to fail on a broken upload, so it must be
        # inside the guard (same layout as process_pdf_file).
        doc: fitz.Document = fitz.open(stream=file_obj, filetype="pdf")
        # Origin file
        new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=file_path,
                                                                            request=request,
                                                                            file_name=file_name,
                                                                            code=f'FIL{uuid.uuid4().hex}')
        new_request_file.save()
        # Sub-file
        return ProcessUtil.pdf_to_images_urls(doc, request, user)
    except Exception as e:
        # Best-effort: flag the request as failed instead of propagating; callers test for None.
        request.status = 400
        request.predict_result = {"status": 400, "content": "", "message": f"Unable to extract pdf files {e}"}
        request.save()
        return None
def process_image_file(file_name: str, file_path, request, user) -> list:
@ -54,25 +79,45 @@ def process_image_file(file_name: str, file_path, request, user) -> list:
@app.task(name='do_pdf')
def process_pdf(rq_id, sub_id, p_type, user_id, files):
    """
    Prepare every uploaded file of a request and forward the resulting page URLs to the OCR queue.

    Args:
        rq_id: request_id used to look up the SubscriptionRequest.
        sub_id: forwarded unchanged to ``ProcessUtil.send_to_queue2``.
        p_type: process type; decides which queue(s) receive the pages.
        user_id: id used to look up the UserProfile.
        files: list of dicts shaped like
            [{
                "file_name": "",
                "file_path": "", # local path to file
                "file_type": ""
            },]

    Raises:
        FileContentInvalidException: a PDF produced no usable pages.

    Entries whose extension is neither ``pdf`` nor one of ``image_extensions``
    are skipped without an error.
    """
    from fwd_api.models import SubscriptionRequest, SubscriptionRequestFile,UserProfile
    start = time.time()
    from django.conf import settings
    new_request = SubscriptionRequest.objects.filter(request_id=rq_id)[0]
    user = UserProfile.objects.filter(id=user_id).first()

    b_urls = []
    for file in files:
        extension = file["file_name"].split(".")[-1].lower()
        if extension == "pdf":
            _b_urls = process_pdf_file(file["file_name"], file["file_path"], new_request, user)
            # None signals a failed extraction; an empty list would break the [0] lookup below.
            if not _b_urls:
                raise FileContentInvalidException
            # Offset page numbers by the pages already collected. Iterate over this PDF's own
            # pages: looping over len(b_urls) while indexing _b_urls overruns short PDFs.
            offset = len(b_urls)
            for j, page in enumerate(_b_urls):
                page["doc_type"] = file["file_type"]
                page["page_number"] = j + offset
            # TODO: Client may request all images in a file, for now, extract the first page only
            b_urls.append(_b_urls[0])
        elif extension in image_extensions:
            this_url = ProcessUtil.process_image_local_file(file["file_name"], file["file_path"], new_request, user)[0]
            this_url["page_number"] = len(b_urls)
            if file["file_type"]:
                this_url["doc_type"] = file["file_type"]
            b_urls.append(this_url)

    start_process = time.time()
    logger.info(f"BE proccessing time: {start_process - start}")
    if p_type in standard_ocr_list:
        ProcessUtil.send_to_queue2(rq_id, sub_id, b_urls, user_id, p_type)
    if p_type == ProcessType.TEMPLATE_MATCHING.value:
        ProcessUtil.send_template_queue(rq_id, b_urls, '', user_id)
@app.task(name='upload_file_to_s3')
def upload_file_to_s3(local_file_path, s3_key):
@ -81,7 +126,7 @@ def upload_file_to_s3(local_file_path, s3_key):
if res != None and res["ResponseMetadata"]["HTTPStatusCode"] == 200:
os.remove(local_file_path)
else:
print(f"[INFO] S3 is not available, skipping,...")
logger.info(f"S3 is not available, skipping,...")
@app.task(name='upload_obj_to_s3')
def upload_obj_to_s3(byte_obj, s3_key):
@ -89,4 +134,4 @@ def upload_obj_to_s3(byte_obj, s3_key):
obj = base64.b64decode(byte_obj)
res = s3_client.update_object(s3_key, obj)
else:
print(f"[INFO] S3 is not available, skipping,...")
logger.info(f"S3 is not available, skipping,...")

View File

@ -4,67 +4,7 @@ import re
image_extensions = ('jpg', 'jpeg', 'png', 'JPG', 'JPEG', 'PNG')
pdf_extensions = ('pdf', 'PDF')
allowed_file_extensions = image_extensions + pdf_extensions
allowed_p_type = [2, 3, 4, 5, 6]
LIST_BOX_MESSAGE = 'list_box'
NAME_MESSAGE = 'name'
VN_AND_SPACE_REGEX = r"[AĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴA-Z0-9 ]+"
IMAGE_NAME = "image_croped.jpg"
TEMPLATE_ID = 'template_id'
pattern = re.compile(VN_AND_SPACE_REGEX)
REQUEST_ID = 'requestId'
FOLDER_TYPE = 'folderType'
MAX_NUMBER_OF_TEMPLATE_DATA_BOX = 20
MAX_NUMBER_OF_TEMPLATE_ANCHOR_BOX = 3
NUMBER_OF_ITEM_IN_A_BOX = 4 # 4 coordinates
ESCAPE_VALUE = 'W5@X8#'
USER_MESSAGE = 'user'
PLAN_MESSAGE = 'plan'
class FolderFileType(Enum):
TEMPLATES = 'templates'
REQUESTS = 'requests'
class FileCategory(Enum):
CROP = 'Crop'
Origin = 'Origin'
BREAK = 'Break'
class EntityStatus(Enum):
ACTIVE = 1
INACTIVE = 0
class TEMPLATE_BOX_TYPE(Enum):
ANCHOR = 1
DATA = 2
class ProcessType(Enum):
TEMPLATE_MATCHING = 2
ID_CARD = 3
DRIVER_LICENSE = 4
INVOICE = 5
OCR_WITH_BOX = 6
AP_INVOICE = 7
FI_INVOICE = 10
class PlanCode(Enum):
TRIAL = 'TRIAL'
BASIC = 'BASIC'
ADVANCED = 'ADVANCED'
standard_ocr_list = (ProcessType.INVOICE.value, ProcessType.ID_CARD.value, ProcessType.DRIVER_LICENSE.value, ProcessType.OCR_WITH_BOX.value)
from enum import Enum
import re
image_extensions = ('jpg', 'jpeg', 'png', 'JPG', 'JPEG', 'PNG')
pdf_extensions = ('pdf', 'PDF')
# allowed_file_extensions = image_extensions + pdf_extensions
allowed_file_extensions = image_extensions
# allowed_file_extensions = image_extensions
allowed_p_type = [12]
LIST_BOX_MESSAGE = 'list_box'
NAME_MESSAGE = 'name'

View File

@ -97,6 +97,12 @@ class FileFormatInvalidException(InvalidException):
default_detail = 'File invalid type'
detail_with_arg = 'File must have type {}'
class FileContentInvalidException(InvalidException):
    """Raised when an uploaded file's content cannot be processed (e.g. a broken PDF)."""

    # HTTP status attached to the error response.
    status_code = status.HTTP_400_BAD_REQUEST
    # Application-level error code for this failure.
    default_code = 4007
    default_detail = 'Invalid content file'
    # NOTE(review): unlike the sibling 'File must have type {}' this text has no '{}'
    # placeholder -- confirm how the base class formats detail_with_arg.
    detail_with_arg = 'One of the files is broken, please select other file and try again'
class TokenExpiredException(GeneralException):
status_code = status.HTTP_401_UNAUTHORIZED

View File

@ -7,8 +7,10 @@ from fwd_api.constant.common import EntityStatus
class UserProfile(models.Model):
id = models.AutoField(primary_key=True)
full_name: str = models.CharField(max_length=200)
sync_id: str = models.CharField(max_length=100)
user_name: str = models.CharField(max_length=200, null=True)
password: str = models.CharField(max_length=200, null=True)
full_name: str = models.CharField(max_length=200, null=True)
sync_id: str = models.CharField(max_length=100, null=True)
provider_id: str = models.CharField(max_length=100, default='Ctel') # CTel/GCP/Azure :v
current_total_pages: int = models.IntegerField(default=0)
limit_total_pages: int = models.IntegerField(default=0)

View File

@ -27,7 +27,7 @@ def validate_list_file(files, max_file_num=settings.MAX_UPLOAD_FILES_IN_A_REQUES
if not isinstance(f, TemporaryUploadedFile):
# print(f'[DEBUG]: {f.name}')
raise InvalidException(excArgs="files")
extension = f.name.split(".")[-1] in allowed_file_extensions
extension = f.name.split(".")[-1].lower() in allowed_file_extensions
if not extension or "." not in f.name:
raise FileFormatInvalidException(excArgs=allowed_file_extensions)
if f.size > settings.MAX_UPLOAD_SIZE_OF_A_FILE:
@ -129,14 +129,15 @@ def resize_and_save_file(file_name: str, rq: SubscriptionRequest, file: Temporar
print(f"[ERROR]: {e}")
raise ServiceUnavailableException()
def save_to_S3(file_name, rq, local_file_path):
    """
    Queue an upload of a locally saved file to S3 and return the object key.

    Args:
        file_name: final component of the S3 key.
        rq: request whose folder path (via ``get_folder_path``) supplies the
            two leading key components.
        local_file_path: path of the file handed to the ``upload_file_to_s3`` task.

    Returns:
        The S3 key ``<second-to-last folder>/<last folder>/<file_name>``.

    Raises:
        ServiceUnavailableException: on any failure while building the key or
            dispatching the upload.
    """
    try:
        path_parts = get_folder_path(rq).split("/")
        # Explicit check instead of `assert`, which is stripped when Python runs with -O.
        if len(path_parts) < 2:
            raise ValueError("file_path must have at least process type and request id")
        s3_key = os.path.join(path_parts[-2], path_parts[-1], file_name)
        c_connector.upload_file_to_s3((local_file_path, s3_key))
        return s3_key
    except Exception as e:
        print(f"[ERROR]: {e}")
        # Keep the root cause attached for whoever inspects the traceback.
        raise ServiceUnavailableException() from e

View File

@ -376,6 +376,18 @@ def process_image_file(file_name: str, file_obj: TemporaryUploadedFile, request:
'request_file_id': new_request_file.code
}]
def process_image_local_file(file_name: str, file_path: str, request: SubscriptionRequest, user) -> list:
    """
    Register an already-saved image as a file of ``request`` and describe it as a single page.

    A SubscriptionRequestFile row with a fresh ``FIL<uuid hex>`` code is saved,
    then a one-element list is returned holding the file URL, page number 0
    and the code of the new row.
    """
    request_file: SubscriptionRequestFile = SubscriptionRequestFile(
        file_path=file_path,
        request=request,
        file_name=file_name,
        code=f'FIL{uuid.uuid4().hex}',
    )
    request_file.save()

    url = FileUtils.build_url(FolderFileType.REQUESTS.value, request.request_id, user.id, file_name)
    page_entry = {
        'file_url': url,
        'page_number': 0,
        'request_file_id': request_file.code,
    }
    return [page_entry]
def pdf_to_images_urls(doc: fitz.Document, request: SubscriptionRequest, user, dpi: int = 300) -> list:
def resize(image, max_w=1920, max_h=1080):
logger.info(f"[DEBUG]: image.size: {image.size}, type(image): {type(image)}")