Merged from vietanh99, Add APIs

This commit is contained in:
dx-tan 2024-01-31 10:00:18 +07:00
parent 3f524e677d
commit dd206c4a3c
33 changed files with 2899 additions and 140 deletions

View File

@ -8,10 +8,17 @@ RUN groupadd --gid ${GID} ${USERNAME} \
&& apt-get install -y sudo bash gettext poppler-utils \
&& echo ${USERNAME} ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/${USERNAME} \
&& chmod 0440 /etc/sudoers.d/${USERNAME}
RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y
RUN yes | apt install postgresql gcc musl-dev
RUN pip install --upgrade pip
RUN pip install uvicorn gunicorn Celery
# For integration with sdsvkvu
RUN pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
RUN pip install -U openmim==0.3.7 --no-cache-dir
RUN mim install mmcv-full==1.7.2
# End integration with sdsvkvu
USER ${UID}
ADD --chown=${UID}:${GID} fwd /app
COPY --chown=${UID}:${GID} requirements.txt /app
@ -21,4 +28,27 @@ RUN pip install -r requirements.txt --no-cache-dir
COPY --chown=${UID}:${GID} . /app
RUN cd /app/fwd_api/utils/sdsvkvu/sdsvkvu/externals/sdsvocr/externals/sdsv_dewarp && pip3 install -v -e . --no-cache-dir
RUN cd /app/fwd_api/utils/sdsvkvu/sdsvkvu/externals/sdsvocr/externals/sdsvtd && pip3 install -v -e . --no-cache-dir
RUN cd /app/fwd_api/utils/sdsvkvu/sdsvkvu/externals/sdsvocr/externals/sdsvtr && pip3 install -v -e . --no-cache-dir
RUN cd /app/fwd_api/utils/sdsvkvu && pip3 install -v -e . --no-cache-dir
# For integration with sdsvkvu
RUN python -m pip install paddlepaddle-gpu==2.4.2.post116 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html --no-cache-dir
ENV TZ="Asia/Ho_Chi_Minh" ENV TZ="Asia/Ho_Chi_Minh"
# FROM cope2n-api-base AS builder
# ARG UID=1000
# ARG GID=1000
# ARG USERNAME=container-user
# # Create a new user
# RUN groupadd --gid ${GID} ${USERNAME} \
# && useradd --uid ${UID} --gid ${GID} -m ${USERNAME} \
# && echo ${USERNAME} ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/${USERNAME} \
# && chmod 0440 /etc/sudoers.d/${USERNAME}
# WORKDIR /app
# COPY --chown=${UID}:${GID} . /app

View File

@ -0,0 +1,17 @@
FROM python:3.9.17-buster
RUN apt-get update \
&& apt-get install -y sudo bash gettext poppler-utils postgresql gcc musl-dev
COPY requirements.txt /tmp
COPY ./fwd_api/utils/sdsvkvu /app/fwd_api/utils/sdsvkvu
RUN cd /app/fwd_api/utils/sdsvkvu/sdsvkvu/externals/sdsvocr/externals/sdsv_dewarp && pip3 install -v -e . --no-cache-dir
RUN cd /app/fwd_api/utils/sdsvkvu/sdsvkvu/externals/sdsvocr/externals/sdsvtd && pip3 install -v -e . --no-cache-dir
RUN cd /app/fwd_api/utils/sdsvkvu/sdsvkvu/externals/sdsvocr/externals/sdsvtr && pip3 install -v -e . --no-cache-dir
RUN cd /app/fwd_api/utils/sdsvkvu && pip3 install -v -e . --no-cache-dir
RUN pip install --upgrade pip && pip install uvicorn gunicorn Celery
RUN pip install -r /tmp/requirements.txt --no-cache-dir
ENV TZ="Asia/Ho_Chi_Minh"

View File

@ -3,88 +3,87 @@ from rest_framework.decorators import action
from rest_framework.response import Response
from django.core.paginator import Paginator
from django.http import JsonResponse
from datetime import datetime
from django.utils import timezone
from django.db.models import Q
import uuid
from drf_spectacular.utils import extend_schema, OpenApiParameter, OpenApiTypes
# from drf_spectacular.types import OpenApiString
from ..models import SubscriptionRequest
from ..exception.exceptions import RequiredFieldException
import json
from ..exception.exceptions import InvalidException, RequiredFieldException
from ..models import SubscriptionRequest, Report, ReportFile
from ..utils.accuracy import shadow_report, MonthReportAccumulate
from ..utils.file import validate_report_list
from ..utils.process import string_to_boolean
def first_of_list(the_list):
if not the_list:
return None
return the_list[0]
class AccuracyViewSet(viewsets.ViewSet):
lookup_field = "username"
@extend_schema(
parameters=[
OpenApiParameter(
name='start_date',
location=OpenApiParameter.QUERY,
description='Start date (YYYY-mm-DDTHH:MM:SS)', description='Start date (YYYY-mm-DDTHH:MM:SSZ)',
type=OpenApiTypes.DATE,
default='2023-01-02T00:00:00', default='2023-01-02T00:00:00+0700',
),
OpenApiParameter(
name='end_date',
location=OpenApiParameter.QUERY,
description='End date (YYYY-mm-DDTHH:MM:SS)', description='End date (YYYY-mm-DDTHH:MM:SSZ)',
type=OpenApiTypes.DATE,
default='2024-01-10T00:00:00', default='2024-01-10T00:00:00+0700',
),
OpenApiParameter(
name='include_test',
location=OpenApiParameter.QUERY,
description='Whether to include test record or not',
type=OpenApiTypes.BOOL,
),
OpenApiParameter(
name='is_reviewed',
location=OpenApiParameter.QUERY,
description='Which records to be query',
type=OpenApiTypes.STR,
enum=['reviewed', 'not reviewed', 'all'],
),
OpenApiParameter(
name='request_id',
location=OpenApiParameter.QUERY,
description='Specific request id',
type=OpenApiTypes.STR,
),
OpenApiParameter(
name='redemption_id',
location=OpenApiParameter.QUERY,
description='Specific redemption id',
type=OpenApiTypes.STR,
),
OpenApiParameter(
name='quality', name='page',
location=OpenApiParameter.QUERY, location=OpenApiParameter.QUERY,
description='One or more of [bad, good, all]', description='Page number',
type=OpenApiTypes.STR, type=OpenApiTypes.INT,
enum=['bad', 'good', 'all'], required=False
), ),
OpenApiParameter( OpenApiParameter(
name='page', name='page_size',
location=OpenApiParameter.QUERY, location=OpenApiParameter.QUERY,
description='Page number', description='Number of items per page',
type=OpenApiTypes.INT, type=OpenApiTypes.INT,
required=False required=False
), ),
OpenApiParameter( ],
name='page_size', responses=None, tags=['Accuracy']
location=OpenApiParameter.QUERY,
description='Number of items per page',
type=OpenApiTypes.INT,
required=False
),
],
responses=None, tags=['Accuracy']
)
@action(detail=False, url_path="request_list", methods=["GET"]) @action(detail=False, url_path="request_list", methods=["GET"])
def get_subscription_requests(self, request): def get_request_list(self, request):
if request.method == 'GET':
start_date_str = request.GET.get('start_date')
end_date_str = request.GET.get('end_date')
@ -94,14 +93,13 @@ class AccuracyViewSet(viewsets.ViewSet):
redemption_id = request.GET.get('redemption_id', None)
is_reviewed = request.GET.get('is_reviewed', None)
include_test = request.GET.get('include_test', False)
quality = request.GET.get('quality', None)
try:
start_date = datetime.strptime(start_date_str, '%Y-%m-%dT%H:%M:%S') start_date = timezone.datetime.strptime(start_date_str, '%Y-%m-%dT%H:%M:%S%z')
end_date = datetime.strptime(end_date_str, '%Y-%m-%dT%H:%M:%S') end_date = timezone.datetime.strptime(end_date_str, '%Y-%m-%dT%H:%M:%S%z')
except ValueError:
return JsonResponse({'error': 'Invalid date format. Please use YYYY-MM-DD.'}, status=400) raise InvalidException(excArgs="Date format")
base_query = Q(created_at__range=(start_date, end_date))
if request_id:
base_query &= Q(request_id=request_id)
@ -124,19 +122,12 @@ class AccuracyViewSet(viewsets.ViewSet):
base_query &= Q(is_reviewed=False)
elif is_reviewed == "all":
pass
if isinstance(quality, str):
if quality == "good":
base_query &= Q(is_bad_image_quality=False)
elif quality == "bad":
base_query &= Q(is_bad_image_quality=True)
elif quality == "all":
pass
subscription_requests = SubscriptionRequest.objects.filter(base_query).order_by('created_at')
paginator = Paginator(subscription_requests, page_size)
page = paginator.get_page(page_number)
data = []
for request in page:
imeis = []
@ -184,7 +175,369 @@ class AccuracyViewSet(viewsets.ViewSet):
return JsonResponse(response)
return JsonResponse({'error': 'Invalid request method.'}, status=405)
@extend_schema(
parameters=[
OpenApiParameter(
name='is_daily_report',
location=OpenApiParameter.QUERY,
description='Whether this report is a daily report or not',
type=OpenApiTypes.BOOL,
),
OpenApiParameter(
name='start_date',
location=OpenApiParameter.QUERY,
description='Start date (YYYY-mm-DDTHH:MM:SSZ)',
type=OpenApiTypes.DATE,
default='2023-01-02T00:00:00+0700',
),
OpenApiParameter(
name='end_date',
location=OpenApiParameter.QUERY,
description='End date (YYYY-mm-DDTHH:MM:SSZ)',
type=OpenApiTypes.DATE,
default='2024-01-10T00:00:00+0700',
),
OpenApiParameter(
name='include_test',
location=OpenApiParameter.QUERY,
description='Whether to include test record or not',
type=OpenApiTypes.BOOL,
),
OpenApiParameter(
name='is_reviewed',
location=OpenApiParameter.QUERY,
description='Which records to be query',
type=OpenApiTypes.STR,
enum=['reviewed', 'not reviewed', 'all'],
),
OpenApiParameter(
name='request_id',
location=OpenApiParameter.QUERY,
description='Specific request id',
type=OpenApiTypes.STR,
),
OpenApiParameter(
name='redemption_id',
location=OpenApiParameter.QUERY,
description='Specific redemption id',
type=OpenApiTypes.STR,
),
OpenApiParameter(
name='subsidiary',
location=OpenApiParameter.QUERY,
description='Subsidiary',
type=OpenApiTypes.STR,
),
],
responses=None, tags=['Accuracy']
)
@action(detail=False, url_path="make_report", methods=["GET"])
def make_report(self, request):
if request.method == 'GET':
start_date_str = request.GET.get('start_date')
end_date_str = request.GET.get('end_date')
request_id = request.GET.get('request_id', None)
redemption_id = request.GET.get('redemption_id', None)
is_reviewed = string_to_boolean(request.GET.get('is_reviewed', "false"))
include_test = string_to_boolean(request.GET.get('include_test', "false"))
subsidiary = request.GET.get("subsidiary", "all")
is_daily_report = string_to_boolean(request.GET.get('is_daily_report', "false"))
try:
start_date = timezone.datetime.strptime(start_date_str, '%Y-%m-%dT%H:%M:%S%z')
end_date = timezone.datetime.strptime(end_date_str, '%Y-%m-%dT%H:%M:%S%z')
except ValueError:
raise InvalidException(excArgs="Date format")
query_set = {"start_date_str": start_date_str,
"end_date_str": end_date_str,
"request_id": request_id,
"redemption_id": redemption_id,
"is_reviewed": is_reviewed,
"include_test": include_test,
"subsidiary": subsidiary,
"is_daily_report": is_daily_report,
}
report_id = "report" + "_" + timezone.datetime.now().strftime("%Y%m%d%H%M%S%z") + "_" + uuid.uuid4().hex
new_report: Report = Report(
report_id=report_id,
is_daily_report=is_daily_report,
subsidiary=subsidiary.lower().replace(" ", ""),
include_test=include_test,
include_reviewed=is_reviewed,
start_at=start_date,
end_at=end_date,
)
new_report.save()
# Background job to calculate accuracy
shadow_report(report_id, query_set)
return JsonResponse(status=status.HTTP_200_OK, data={"report_id": report_id})
@extend_schema(
parameters=[
OpenApiParameter(
name='report_id',
location=OpenApiParameter.QUERY,
description='Specific report id',
type=OpenApiTypes.STR,
),
OpenApiParameter(
name='page',
location=OpenApiParameter.QUERY,
description='Page number',
type=OpenApiTypes.INT,
required=False
),
OpenApiParameter(
name='page_size',
location=OpenApiParameter.QUERY,
description='Number of items per page',
type=OpenApiTypes.INT,
required=False
),
],
responses=None, tags=['Accuracy']
)
@action(detail=False, url_path="report_detail_list", methods=["GET"])
def get_report_detail_list(self, request):
if request.method == 'GET':
report_id = request.GET.get('report_id', None)
page_number = int(request.GET.get('page', 1))
page_size = int(request.GET.get('page_size', 10))
report = Report.objects.filter(report_id=report_id).first()
report_files = ReportFile.objects.filter(report=report)
paginator = Paginator(report_files, page_size)
page = paginator.get_page(page_number)
data = []
for report_file in page:
data.append({
"Request ID": report_file.correspond_request_id,
"Redemption Number": report_file.correspond_redemption_id,
"Image type": report_file.doc_type,
"IMEI_user submitted": first_of_list(report_file.feedback_result.get("imei_number", [None])),
"IMEI_OCR retrieved": first_of_list(report_file.predict_result.get("imei_number", [None])),
"IMEI1 Accuracy": first_of_list(report_file.feedback_accuracy.get("imei_number", [None])),
"Invoice_Purchase Date_Consumer": report_file.feedback_result.get("purchase_date", None),
"Invoice_Purchase Date_OCR": report_file.predict_result.get("purchase_date", []),
"Invoice_Purchase Date Accuracy": first_of_list(report_file.feedback_accuracy.get("purchase_date", [None])),
"Invoice_Retailer_Consumer": report_file.feedback_result.get("retailername", None),
"Invoice_Retailer_OCR": report_file.predict_result.get("retailername", None),
"Invoice_Retailer Accuracy": first_of_list(report_file.feedback_accuracy.get("retailername", [None])),
"OCR Image Accuracy": report_file.acc,
"OCR Image Speed (seconds)": report_file.time_cost,
"Reviewed?": "No",
"Bad Image Reasons": report_file.bad_image_reason,
"Countermeasures": report_file.counter_measures,
"IMEI_Revised Accuracy": first_of_list(report_file.reviewed_accuracy.get("imei_number", [None])),
"Purchase Date_Revised Accuracy": first_of_list(report_file.reviewed_accuracy.get("purchase_date", [None])),
"Retailer_Revised Accuracy": first_of_list(report_file.reviewed_accuracy.get("retailername", [None])),
})
response = {
'report_detail': data,
'page': {
'number': page.number,
'total_pages': page.paginator.num_pages,
'count': page.paginator.count,
}
}
return JsonResponse(response, status=200)
return JsonResponse({'error': 'Invalid request method.'}, status=405)
@extend_schema(
parameters=[
OpenApiParameter(
name='start_date',
location=OpenApiParameter.QUERY,
description='Start date (YYYY-mm-DDTHH:MM:SSZ)',
type=OpenApiTypes.DATE,
default='2023-01-02T00:00:00+0700',
),
OpenApiParameter(
name='end_date',
location=OpenApiParameter.QUERY,
description='End date (YYYY-mm-DDTHH:MM:SSZ)',
type=OpenApiTypes.DATE,
default='2024-01-10T00:00:00+0700',
),
OpenApiParameter(
name='daily_report_only',
location=OpenApiParameter.QUERY,
description='Only include daily reports',
type=OpenApiTypes.BOOL,
),
OpenApiParameter(
name='page',
location=OpenApiParameter.QUERY,
description='Page number',
type=OpenApiTypes.INT,
required=False
),
OpenApiParameter(
name='page_size',
location=OpenApiParameter.QUERY,
description='Number of items per page',
type=OpenApiTypes.INT,
required=False
),
],
responses=None, tags=['Accuracy']
)
@action(detail=False, url_path="report_list", methods=["GET"])
def get_report_list(self, request):
if request.method == 'GET':
daily_report_only = string_to_boolean(request.GET.get('daily_report_only', "false"))
start_date_str = request.GET.get('start_date', "")
end_date_str = request.GET.get('end_date', "")
page_number = int(request.GET.get('page', 1))
page_size = int(request.GET.get('page_size', 10))
if not start_date_str or not end_date_str:
reports = Report.objects.all()
else:
try:
start_date = timezone.datetime.strptime(start_date_str, '%Y-%m-%dT%H:%M:%S%z')
end_date = timezone.datetime.strptime(end_date_str, '%Y-%m-%dT%H:%M:%S%z')
except ValueError:
raise InvalidException(excArgs="Date format")
base_query = Q(created_at__range=(start_date, end_date))
if daily_report_only:
base_query &= Q(is_daily_report=True)
reports = Report.objects.filter(base_query).order_by('created_at')
paginator = Paginator(reports, page_size)
page = paginator.get_page(page_number)
data = []
for report in page:
data.append({
"ID": report.id,
"Created Date": report.created_at,
"No. Requests": report.number_request,
"Status": report.status,
"Purchase Date Acc": report.reviewed_accuracy.get("purchase_date", None) if report.reviewed_accuracy else None,
"Retailer Acc": report.feedback_accuracy.get("retailername", None) if report.reviewed_accuracy else None,
"IMEI Acc": report.feedback_accuracy.get("imei_number", None) if report.reviewed_accuracy else None,
"Avg. Accuracy": report.feedback_accuracy.get("avg", None) if report.reviewed_accuracy else None,
"Avg. Client Request Time": report.average_client_time.get("avg", 0) if report.average_client_time else 0,
"Avg. OCR Processing Time": report.average_OCR_time.get("avg", 0) if report.average_client_time else 0,
"report_id": report.report_id,
})
response = {
'report_detail': data,
'page': {
'number': page.number,
'total_pages': page.paginator.num_pages,
'count': page.paginator.count,
}
}
return JsonResponse(response, status=200)
return JsonResponse({'error': 'Invalid request method.'}, status=405)
@extend_schema(
parameters=[
OpenApiParameter(
name='start_date',
location=OpenApiParameter.QUERY,
description='Start date (YYYY-mm-DDTHH:MM:SSZ)',
type=OpenApiTypes.DATE,
default='2023-01-02T00:00:00+0700',
),
OpenApiParameter(
name='end_date',
location=OpenApiParameter.QUERY,
description='End date (YYYY-mm-DDTHH:MM:SSZ)',
type=OpenApiTypes.DATE,
default='2024-01-10T00:00:00+0700',
),
OpenApiParameter(
name='subsidiary',
location=OpenApiParameter.QUERY,
description='Subsidiary',
type=OpenApiTypes.STR,
),
OpenApiParameter(
name='page',
location=OpenApiParameter.QUERY,
description='Page number',
type=OpenApiTypes.INT,
required=False
),
OpenApiParameter(
name='page_size',
location=OpenApiParameter.QUERY,
description='Number of items per page',
type=OpenApiTypes.INT,
required=False
),
],
responses=None, tags=['Accuracy']
)
@action(detail=False, url_path="overview", methods=["GET"])
def overview(self, request):
if request.method == 'GET':
subsidiary = request.GET.get('subsidiary', None)
start_date_str = request.GET.get('start_date', "")
end_date_str = request.GET.get('end_date', "")
page_number = int(request.GET.get('page', 1))
page_size = int(request.GET.get('page_size', 10))
if not start_date_str or not end_date_str:
reports = Report.objects.all()
else:
try:
start_date = timezone.datetime.strptime(start_date_str, '%Y-%m-%dT%H:%M:%S%z')
end_date = timezone.datetime.strptime(end_date_str, '%Y-%m-%dT%H:%M:%S%z')
except ValueError:
raise InvalidException(excArgs="Date format")
base_query = Q(created_at__range=(start_date, end_date))
if subsidiary:
base_query &= Q(subsidiary=subsidiary)
base_query &= Q(is_daily_report=True)
reports = Report.objects.filter(base_query).order_by('created_at')
paginator = Paginator(reports, page_size)
page = paginator.get_page(page_number)
data = []
this_month_report = MonthReportAccumulate()
for report in page:
res = this_month_report.add(report)
if not(res):
_, _data, total = this_month_report()
data += [total]
data += _data
this_month_report = MonthReportAccumulate()
this_month_report.add(report)
else:
continue
_, _data, total = this_month_report()
data += [total]
data += _data
response = {
'overview_data': data,
'page': {
'number': page.number,
'total_pages': page.paginator.num_pages,
'count': page.paginator.count,
}
}
return JsonResponse(response, status=200)
return JsonResponse({'error': 'Invalid request method.'}, status=405)
class RequestViewSet(viewsets.ViewSet):
lookup_field = "username"
@ -269,4 +622,4 @@ class RequestViewSet(viewsets.ViewSet):
return JsonResponse({'message': 'success.'}, status=200)
else:
return JsonResponse({'error': 'Invalid request method.'}, status=405)
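To make the new Accuracy endpoints concrete, here is a hedged client-side sketch using the requests library. The base URL prefix and the Authorization header are assumptions; only the url_path values, query parameters, and response keys come from the views above.

import requests

BASE = "http://localhost:8000/api/accuracy"   # assumed host and router prefix
HEADERS = {"Authorization": "Bearer <token>"}  # assumed auth scheme

# Kick off a background accuracy report for a date range
resp = requests.get(
    f"{BASE}/make_report",
    params={
        "start_date": "2023-01-02T00:00:00+0700",
        "end_date": "2024-01-10T00:00:00+0700",
        "subsidiary": "all",
        "is_daily_report": "false",
    },
    headers=HEADERS,
)
report_id = resp.json()["report_id"]

# Page through the per-file details once the Celery task has finished
detail = requests.get(
    f"{BASE}/report_detail_list",
    params={"report_id": report_id, "page": 1, "page_size": 10},
    headers=HEADERS,
).json()
print(detail["page"], len(detail["report_detail"]))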

View File

@ -34,13 +34,16 @@ class CeleryConnector:
'upload_obj_to_s3': {'queue': "upload_obj_to_s3"}, 'upload_obj_to_s3': {'queue': "upload_obj_to_s3"},
'remove_local_file': {'queue': "remove_local_file"}, 'remove_local_file': {'queue': "remove_local_file"},
'csv_feedback': {'queue': "csv_feedback"}, 'csv_feedback': {'queue': "csv_feedback"},
'make_a_report': {'queue': "report"},
}
app = Celery(
'postman',
broker=settings.BROKER_URL,
broker_transport_options={'confirm_publish': False},
)
def make_a_report(self, args):
return self.send_task('make_a_report', args)
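For context, a hedged sketch of how this new connector method would be invoked. The shadow_report helper imported by the views is not shown in this diff, but presumably it enqueues the job roughly like this:

from fwd_api.celery_worker.client_connector import c_connector  # module-level instance, as imported in utils/accuracy.py

def shadow_report(report_id, query_set):
    # Sketch only: push (report_id, query_set) onto the "report" queue,
    # where the make_a_report task in process_report_tasks picks it up.
    return c_connector.make_a_report((report_id, query_set))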
def csv_feedback(self, args):
return self.send_task('csv_feedback', args)
def do_pdf(self, args):

View File

@ -0,0 +1,149 @@
import time
import uuid
import os
import base64
import traceback
from multiprocessing.pool import ThreadPool
from fwd_api.models import SubscriptionRequest, UserProfile
from fwd_api.celery_worker.worker import app
from ..constant.common import FolderFileType, image_extensions
from ..exception.exceptions import FileContentInvalidException
from fwd_api.models import SubscriptionRequestFile, FeedbackRequest, Report
from ..utils import file as FileUtils
from ..utils import process as ProcessUtil
from ..utils import s3 as S3Util
from ..utils.accuracy import update_temp_accuracy, IterAvg, calculate_and_save_subcription_file
from fwd_api.constant.common import ProcessType
from django.utils import timezone
from django.db.models import Q
import csv
import json
from celery.utils.log import get_task_logger
from fwd import settings
logger = get_task_logger(__name__)
s3_client = S3Util.MinioS3Client(
endpoint=settings.S3_ENDPOINT,
access_key=settings.S3_ACCESS_KEY,
secret_key=settings.S3_SECRET_KEY,
bucket_name=settings.S3_BUCKET_NAME
)
def mean_list(l):
l = [x for x in l if x is not None]
if len(l) == 0:
return 0
return sum(l)/len(l)
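For clarity, mean_list averages only the non-None entries and falls back to 0 when nothing remains, e.g.:

assert mean_list([1.0, None, 3.0]) == 2.0
assert mean_list([None, None]) == 0
assert mean_list([]) == 0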
@app.task(name='make_a_report')
def make_a_report(report_id, query_set):
try:
start_date = timezone.datetime.strptime(query_set["start_date_str"], '%Y-%m-%dT%H:%M:%S%z')
end_date = timezone.datetime.strptime(query_set["end_date_str"], '%Y-%m-%dT%H:%M:%S%z')
base_query = Q(created_at__range=(start_date, end_date))
if query_set["request_id"]:
base_query &= Q(request_id=query_set["request_id"])
if query_set["redemption_id"]:
base_query &= Q(redemption_id=query_set["redemption_id"])
base_query &= Q(is_test_request=False)
if isinstance(query_set["include_test"], str):
query_set["include_test"] = True if query_set["include_test"].lower() in ["true", "yes", "1"] else False
if query_set["include_test"]:
# base_query = ~base_query
base_query.children = base_query.children[:-1]
elif isinstance(query_set["include_test"], bool):
if query_set["include_test"]:
base_query = ~base_query
if isinstance(query_set["subsidiary"], str):
if query_set["subsidiary"] and query_set["subsidiary"].lower().replace(" ", "")!="all":
base_query &= Q(redemption_id__startswith=query_set["subsidiary"])
if isinstance(query_set["is_reviewed"], str):
if query_set["is_reviewed"] == "reviewed":
base_query &= Q(is_reviewed=True)
elif query_set["is_reviewed"] == "not reviewed":
base_query &= Q(is_reviewed=False)
# elif query_set["is_reviewed"] == "all":
# pass
errors = []
# Create a placeholder to fill
accuracy = {"feedback" :{"imei_number": IterAvg(),
"purchase_date": IterAvg(),
"retailername": IterAvg(),
"sold_to_party": IterAvg(),},
"reviewed" :{"imei_number": IterAvg(),
"purchase_date": IterAvg(),
"retailername": IterAvg(),
"sold_to_party": IterAvg(),}
} # {"imei": {"acc": 0.1, count: 1}, ...}
time_cost = {"invoice": IterAvg(),
"imei": IterAvg()}
number_images = 0
number_bad_images = 0
# TODO: Multithreading
# Calculate accuracy, processing time, ....Then save.
subscription_requests = SubscriptionRequest.objects.filter(base_query).order_by('created_at')
report: Report = \
Report.objects.filter(report_id=report_id).first()
# TODO: number of transaction by doc type
num_request = 0
for request in subscription_requests:
if request.status != 200 or not (request.reviewed_result or request.feedback_result):
# Failed requests or lack of reviewed_result/feedback_result
continue
request_att = calculate_and_save_subcription_file(report, request)
request.feedback_accuracy = {"imei_number" : mean_list(request_att["acc"]["feedback"].get("imei_number", [None])),
"purchase_date" : mean_list(request_att["acc"]["feedback"].get("purchase_date", [None])),
"retailername" : mean_list(request_att["acc"]["feedback"].get("retailername", [None])),
"sold_to_party" : mean_list(request_att["acc"]["feedback"].get("sold_to_party", [None]))}
request.reviewed_accuracy = {"imei_number" : mean_list(request_att["acc"]["reviewed"].get("imei_number", [None])),
"purchase_date" : mean_list(request_att["acc"]["reviewed"].get("purchase_date", [None])),
"retailername" : mean_list(request_att["acc"]["reviewed"].get("retailername", [None])),
"sold_to_party" : mean_list(request_att["acc"]["reviewed"].get("sold_to_party", [None]))}
request.save()
number_images += request_att["total_images"]
number_bad_images += request_att["bad_images"]
update_temp_accuracy(accuracy["feedback"], request_att["acc"]["feedback"], keys=["imei_number", "purchase_date", "retailername", "sold_to_party"])
update_temp_accuracy(accuracy["reviewed"], request_att["acc"]["reviewed"], keys=["imei_number", "purchase_date", "retailername", "sold_to_party"])
time_cost["imei"].add(request_att["time_cost"].get("imei", []))
time_cost["invoice"].add(request_att["time_cost"].get("invoice", []))
errors += request_att["err"]
num_request += 1
# Do saving process
report.number_request = num_request
report.number_images = number_images
report.number_imei = time_cost["imei"].count
report.number_invoice = time_cost["invoice"].count
report.number_bad_images = number_bad_images
report.average_OCR_time = {"invoice": time_cost["invoice"](), "imei": time_cost["imei"](),
"invoice_count": time_cost["invoice"].count, "imei_count": time_cost["imei"].count}
acumulated_acc = {"feedback": {},
"reviewed": {}}
for acc_type in ["feedback", "reviewed"]:
for key in ["imei_number", "purchase_date", "retailername", "sold_to_party"]:
acumulated_acc[acc_type][key] = accuracy[acc_type][key]()
acumulated_acc[acc_type][key+"_count"] = accuracy[acc_type][key].count
report.feedback_accuracy = acumulated_acc["feedback"]
report.reviewed_accuracy = acumulated_acc["reviewed"]
report.errors = "|".join(errors)
report.save()
except IndexError as e:
print(e)
traceback.print_exc()
print("NotFound request by report id, %d", report_id)
except Exception as e:
print("[ERROR]: an error occured while processing report: ", report_id)
traceback.print_exc()
return 400
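IterAvg comes from ..utils.accuracy and its definition is not part of this diff. As a reference only, a minimal sketch consistent with how it is used here (add() on values or lists, add_avg() to merge a pre-computed mean, a count attribute, and calling the instance to read the average) could look like the following; the real implementation may differ.

class IterAvg:
    """Minimal sketch of an incremental-average accumulator (assumed interface)."""
    def __init__(self):
        self.count = 0
        self.total = 0.0

    def add(self, values):
        # Accept a single value or an iterable of values, ignoring None
        if not isinstance(values, (list, tuple)):
            values = [values]
        for v in values:
            if v is None:
                continue
            self.total += v
            self.count += 1

    def add_avg(self, avg, count):
        # Merge a pre-computed average over `count` samples
        if avg is None or not count:
            return
        self.total += avg * count
        self.count += count

    def __call__(self):
        return self.total / self.count if self.count else 0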

View File

@ -12,7 +12,7 @@ django.setup()
app: Celery = Celery(
'postman',
broker=settings.BROKER_URL,
include=['fwd_api.celery_worker.process_result_tasks', 'fwd_api.celery_worker.internal_task'], include=['fwd_api.celery_worker.process_result_tasks', 'fwd_api.celery_worker.internal_task', 'fwd_api.celery_worker.process_report_tasks'],
broker_transport_options={'confirm_publish': False},
)
@ -40,6 +40,7 @@ app.conf.update({
Queue('upload_obj_to_s3'),
Queue('remove_local_file'),
Queue('csv_feedback'),
Queue('report'),
],
'task_routes': {
@ -57,6 +58,7 @@ app.conf.update({
'upload_obj_to_s3': {'queue': "upload_obj_to_s3"},
'remove_local_file': {'queue': "remove_local_file"},
'csv_feedback': {'queue': "csv_feedback"},
'make_a_report': {'queue': "report"},
}
})

View File

@ -0,0 +1,71 @@
# myapp/management/commands/mycustomcommand.py
from django.core.management.base import BaseCommand
from tqdm import tqdm
from fwd_api.models import SubscriptionRequestFile, SubscriptionRequest
from fwd_api.utils.accuracy import predict_result_to_ready
import traceback
import copy
class Command(BaseCommand):
help = 'Refactor database for image level'
def add_arguments(self, parser):
# Add your command-line arguments here
parser.add_argument('test', type=str, help='Value for the argument')
def process_request(self, request):
if len(request.request_id.split(".")[0].split("_")) < 2:
return
images = SubscriptionRequestFile.objects.filter(request=request)
time_cost = {"imei": [], "invoice": [], "all": []}
if request.ai_inference_profile is None:
time_cost["imei"] = [-1 for _ in range(len(images))]
time_cost["invoice"] = [-1]
time_cost["all"] = [-1]
else:
for k, v in request.ai_inference_profile.items():
time_cost[k.split("_")[0]].append(v["inference"][1][0] - v["inference"][0] + (v["postprocess"][1]-v["postprocess"][0]))
for i, image in enumerate(images):
# temp_imei_SAP_20240127223644_a493434edbf84fc08aeb87ef6cdde102_0.jpg
try:
image.index_in_request = int(image.file_name.split(".")[0].split("_")[-1]) if len(image.file_name.split(".")[0].split("_")) > 4 else 0
image.doc_type = image.file_name.split(".")[0].split("_")[1] if len(image.file_name.split(".")[0].split("_")) > 4 else "all"
image.processing_time = time_cost[image.doc_type][image.index_in_request]
if not request.predict_result:
raise KeyError(f"Key predict_result not found in {request.request_id}")
if request.predict_result.get("status", 200) != 200:
raise AttributeError(f"Failed request: {request.request_id}")
_predict_result = copy.deepcopy(predict_result_to_ready(request.predict_result))
_feedback_result = copy.deepcopy(request.feedback_result)
_reviewed_result = copy.deepcopy(request.reviewed_result)
if image.doc_type == "invoice":
_predict_result["imei_number"] = []
if _feedback_result:
_feedback_result["imei_number"] = []
if _reviewed_result:
_reviewed_result["imei_number"] = []
else:
_predict_result = {"retailername": None, "sold_to_party": None, "purchase_date": [], "imei_number": [_predict_result["imei_number"][image.index_in_request]]}
_feedback_result = {"retailername": None, "sold_to_party": None, "purchase_date": None, "imei_number": [_feedback_result["imei_number"][image.index_in_request]]} if _feedback_result else None
_reviewed_result = {"retailername": None, "sold_to_party": None, "purchase_date": None, "imei_number": [_reviewed_result["imei_number"][image.index_in_request]]} if _reviewed_result else None
image.predict_result = _predict_result
image.feedback_result = _feedback_result
image.reviewed_result = _reviewed_result
image.save()
except Exception as e:
self.stdout.write(self.style.ERROR(f"Request: {request.request_id} failed with {e}"))
print(traceback.format_exc())
continue
def handle(self, *args, **options):
test = options['test']
subscription_iter = SubscriptionRequest.objects.all()
for request in tqdm(subscription_iter.iterator()):
self.process_request(request)
self.stdout.write(self.style.SUCCESS('Sample Django management command executed successfully!'))
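As a standalone illustration of the naming convention the command relies on (doc type and per-request index encoded in the underscore-separated file name), the parsing boils down to:

def parse_file_name(file_name: str):
    # e.g. "temp_imei_SAP_20240127223644_a493434edbf84fc08aeb87ef6cdde102_0.jpg"
    parts = file_name.split(".")[0].split("_")
    if len(parts) > 4:
        doc_type = parts[1]           # "imei" or "invoice"
        index_in_request = int(parts[-1])
    else:
        doc_type = "all"
        index_in_request = 0
    return doc_type, index_in_request

print(parse_file_name("temp_imei_SAP_20240127223644_a493434edbf84fc08aeb87ef6cdde102_0.jpg"))
# ('imei', 0)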

View File

@ -0,0 +1,102 @@
# Generated by Django 4.1.3 on 2024-01-25 06:22
from django.db import migrations, models
import django.utils.timezone
class Migration(migrations.Migration):
dependencies = [
('fwd_api', '0166_remove_subscriptionrequest_is_bad_image_quality_and_more'),
]
operations = [
migrations.CreateModel(
name='Report',
fields=[
('id', models.AutoField(primary_key=True, serialize=False)),
('report_id', models.CharField(max_length=200)),
('local_file_name', models.CharField(max_length=200)),
('error_status', models.JSONField(null=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('updated_at', models.DateTimeField(auto_now=True)),
('start_at', models.DateTimeField(null=True)),
('end_at', models.DateTimeField(null=True)),
('include_for_test_sample', models.BooleanField(default=False)),
('status', models.CharField(max_length=100)),
('is_daily_report', models.BooleanField(default=False)),
('errors', models.TextField(default='')),
('S3_uploaded', models.BooleanField(default=False)),
('number_request', models.IntegerField(default=0)),
('number_images', models.IntegerField(default=0)),
('number_bad_images', models.IntegerField(default=0)),
('average_client_time_profile', models.JSONField(null=True)),
('average_OCR_time_profile', models.JSONField(null=True)),
('average_OCR_time', models.JSONField(null=True)),
('average_client_time', models.JSONField(null=True)),
('imei_accuracy', models.FloatField(default=-1)),
('purchase_date_accuracy', models.FloatField(default=-1)),
('retailer_name_accuracy', models.FloatField(default=-1)),
('sold_to_party_accuracy', models.FloatField(default=-1)),
],
),
migrations.RemoveField(
model_name='subscriptionrequestfile',
name='accuracy',
),
migrations.AddField(
model_name='subscriptionrequest',
name='imei_accuracy',
field=models.FloatField(default=-1),
),
migrations.AddField(
model_name='subscriptionrequest',
name='purchase_date_accuracy',
field=models.FloatField(default=-1),
),
migrations.AddField(
model_name='subscriptionrequest',
name='retailer_name_accuracy',
field=models.FloatField(default=-1),
),
migrations.AddField(
model_name='subscriptionrequest',
name='sold_to_party_accuracy',
field=models.FloatField(default=-1),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='counter_measures',
field=models.TextField(blank=True),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='imei_accuracy',
field=models.FloatField(default=-1),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='processing_time',
field=models.IntegerField(default=-1),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='purchase_date_accuracy',
field=models.FloatField(default=-1),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='reason',
field=models.TextField(blank=True),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='retailer_name_accuracy',
field=models.FloatField(default=-1),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='sold_to_party_accuracy',
field=models.FloatField(default=-1),
),
]

View File

@ -0,0 +1,23 @@
# Generated by Django 4.1.3 on 2024-01-25 09:44
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('fwd_api', '0167_report_remove_subscriptionrequestfile_accuracy_and_more'),
]
operations = [
migrations.AddField(
model_name='report',
name='number_imei_transaction',
field=models.IntegerField(default=0),
),
migrations.AddField(
model_name='report',
name='number_ivoice_transaction',
field=models.IntegerField(default=0),
),
]

View File

@ -0,0 +1,28 @@
# Generated by Django 4.1.3 on 2024-01-25 11:17
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('fwd_api', '0168_report_number_imei_transaction_and_more'),
]
operations = [
migrations.AddField(
model_name='report',
name='include_reviewed',
field=models.TextField(default=''),
),
migrations.AddField(
model_name='report',
name='include_test',
field=models.CharField(default='', max_length=200),
),
migrations.AddField(
model_name='report',
name='subsidiary',
field=models.TextField(default=''),
),
]

View File

@ -0,0 +1,28 @@
# Generated by Django 4.1.3 on 2024-01-25 11:19
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('fwd_api', '0169_report_include_reviewed_report_include_test_and_more'),
]
operations = [
migrations.AlterField(
model_name='report',
name='errors',
field=models.TextField(default='', null=True),
),
migrations.AlterField(
model_name='report',
name='include_reviewed',
field=models.TextField(default='', null=True),
),
migrations.AlterField(
model_name='report',
name='subsidiary',
field=models.TextField(default='', null=True),
),
]

View File

@ -0,0 +1,112 @@
# Generated by Django 4.1.3 on 2024-01-28 08:11
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
class Migration(migrations.Migration):
dependencies = [
('fwd_api', '0170_alter_report_errors_alter_report_include_reviewed_and_more'),
]
operations = [
migrations.RenameField(
model_name='report',
old_name='imei_accuracy',
new_name='imei_accuracy_ocr',
),
migrations.RenameField(
model_name='report',
old_name='purchase_date_accuracy',
new_name='imei_accuracy_revised',
),
migrations.RenameField(
model_name='report',
old_name='retailer_name_accuracy',
new_name='purchase_date_accuracy_ocr',
),
migrations.RenameField(
model_name='report',
old_name='sold_to_party_accuracy',
new_name='purchase_date_accuracy_revised',
),
migrations.AddField(
model_name='report',
name='retailer_name_accuracy_ocr',
field=models.FloatField(default=-1),
),
migrations.AddField(
model_name='report',
name='retailer_name_accuracy_revised',
field=models.FloatField(default=-1),
),
migrations.AddField(
model_name='report',
name='sold_to_party_accuracy_ocr',
field=models.FloatField(default=-1),
),
migrations.AddField(
model_name='report',
name='sold_to_party_accuracy_revised',
field=models.FloatField(default=-1),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='feedback_result',
field=models.JSONField(null=True),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='predict_result',
field=models.JSONField(null=True),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='reviewed_result',
field=models.JSONField(null=True),
),
migrations.AlterField(
model_name='subscriptionrequestfile',
name='doc_type',
field=models.CharField(default='', max_length=10),
),
migrations.CreateModel(
name='ReportFile',
fields=[
('id', models.AutoField(primary_key=True, serialize=False)),
('correspond_request_id', models.CharField(max_length=200)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('updated_at', models.DateTimeField(auto_now=True)),
('S3_uploaded', models.BooleanField(default=False)),
('doc_type', models.CharField(max_length=200)),
('imei_feedback', models.CharField(default=None, max_length=200, null=True)),
('purchase_date_feedback', models.CharField(default=None, max_length=200, null=True)),
('retailer_feedback', models.CharField(default=None, max_length=200, null=True)),
('sold_to_party_feedback', models.CharField(default=None, max_length=200, null=True)),
('imei_ocr', models.CharField(default=None, max_length=200, null=True)),
('purchase_date_ocr', models.CharField(default=None, max_length=200, null=True)),
('retailer_ocr', models.CharField(default=None, max_length=200, null=True)),
('sold_to_party_ocr', models.CharField(default=None, max_length=200, null=True)),
('imei_revised', models.CharField(default=None, max_length=200, null=True)),
('purchase_date_revised', models.CharField(default=None, max_length=200, null=True)),
('retailer_revised', models.CharField(default=None, max_length=200, null=True)),
('sold_to_party_revised', models.CharField(default=None, max_length=200, null=True)),
('imei_acc_feedback', models.FloatField(default=None, null=True)),
('purchase_date_acc_feedback', models.FloatField(default=None, null=True)),
('retailer_acc_feedback', models.FloatField(default=None, null=True)),
('sold_to_party_acc_feedback', models.CharField(default=None, max_length=200, null=True)),
('acc_feedback', models.FloatField(default=None, null=True)),
('imei_acc_revised', models.FloatField(default=None, null=True)),
('purchase_date_acc_revised', models.FloatField(default=None, null=True)),
('retailer_acc_revised', models.FloatField(default=None, null=True)),
('acc_revised', models.FloatField(default=None, null=True)),
('time_cost', models.FloatField(default=0)),
('is_reviewed', models.CharField(default='NA', max_length=5)),
('bad_image_reason', models.TextField(default='')),
('countermeasures', models.TextField(default='')),
('report', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='files', to='fwd_api.report')),
],
),
]

View File

@ -0,0 +1,38 @@
# Generated by Django 4.1.3 on 2024-01-28 09:27
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('fwd_api', '0171_rename_imei_accuracy_report_imei_accuracy_ocr_and_more'),
]
operations = [
migrations.AlterField(
model_name='subscriptionrequestfile',
name='imei_accuracy',
field=models.FloatField(default=None, null=True),
),
migrations.AlterField(
model_name='subscriptionrequestfile',
name='processing_time',
field=models.FloatField(default=-1),
),
migrations.AlterField(
model_name='subscriptionrequestfile',
name='purchase_date_accuracy',
field=models.FloatField(default=None, null=True),
),
migrations.AlterField(
model_name='subscriptionrequestfile',
name='retailer_name_accuracy',
field=models.FloatField(default=None, null=True),
),
migrations.AlterField(
model_name='subscriptionrequestfile',
name='sold_to_party_accuracy',
field=models.FloatField(default=None, null=True),
),
]

View File

@ -0,0 +1,226 @@
# Generated by Django 4.1.3 on 2024-01-28 18:00
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('fwd_api', '0172_alter_subscriptionrequestfile_imei_accuracy_and_more'),
]
operations = [
migrations.RenameField(
model_name='reportfile',
old_name='countermeasures',
new_name='counter_measures',
),
migrations.RemoveField(
model_name='report',
name='imei_accuracy_ocr',
),
migrations.RemoveField(
model_name='report',
name='imei_accuracy_revised',
),
migrations.RemoveField(
model_name='report',
name='purchase_date_accuracy_ocr',
),
migrations.RemoveField(
model_name='report',
name='purchase_date_accuracy_revised',
),
migrations.RemoveField(
model_name='report',
name='retailer_name_accuracy_ocr',
),
migrations.RemoveField(
model_name='report',
name='retailer_name_accuracy_revised',
),
migrations.RemoveField(
model_name='report',
name='sold_to_party_accuracy_ocr',
),
migrations.RemoveField(
model_name='report',
name='sold_to_party_accuracy_revised',
),
migrations.RemoveField(
model_name='reportfile',
name='acc_feedback',
),
migrations.RemoveField(
model_name='reportfile',
name='acc_revised',
),
migrations.RemoveField(
model_name='reportfile',
name='imei_acc_feedback',
),
migrations.RemoveField(
model_name='reportfile',
name='imei_acc_revised',
),
migrations.RemoveField(
model_name='reportfile',
name='imei_feedback',
),
migrations.RemoveField(
model_name='reportfile',
name='imei_ocr',
),
migrations.RemoveField(
model_name='reportfile',
name='imei_revised',
),
migrations.RemoveField(
model_name='reportfile',
name='purchase_date_acc_feedback',
),
migrations.RemoveField(
model_name='reportfile',
name='purchase_date_acc_revised',
),
migrations.RemoveField(
model_name='reportfile',
name='purchase_date_feedback',
),
migrations.RemoveField(
model_name='reportfile',
name='purchase_date_ocr',
),
migrations.RemoveField(
model_name='reportfile',
name='purchase_date_revised',
),
migrations.RemoveField(
model_name='reportfile',
name='retailer_acc_feedback',
),
migrations.RemoveField(
model_name='reportfile',
name='retailer_acc_revised',
),
migrations.RemoveField(
model_name='reportfile',
name='retailer_feedback',
),
migrations.RemoveField(
model_name='reportfile',
name='retailer_ocr',
),
migrations.RemoveField(
model_name='reportfile',
name='retailer_revised',
),
migrations.RemoveField(
model_name='reportfile',
name='sold_to_party_acc_feedback',
),
migrations.RemoveField(
model_name='reportfile',
name='sold_to_party_feedback',
),
migrations.RemoveField(
model_name='reportfile',
name='sold_to_party_ocr',
),
migrations.RemoveField(
model_name='reportfile',
name='sold_to_party_revised',
),
migrations.RemoveField(
model_name='subscriptionrequest',
name='imei_accuracy',
),
migrations.RemoveField(
model_name='subscriptionrequest',
name='purchase_date_accuracy',
),
migrations.RemoveField(
model_name='subscriptionrequest',
name='retailer_name_accuracy',
),
migrations.RemoveField(
model_name='subscriptionrequest',
name='sold_to_party_accuracy',
),
migrations.RemoveField(
model_name='subscriptionrequestfile',
name='imei_accuracy',
),
migrations.RemoveField(
model_name='subscriptionrequestfile',
name='purchase_date_accuracy',
),
migrations.RemoveField(
model_name='subscriptionrequestfile',
name='retailer_name_accuracy',
),
migrations.RemoveField(
model_name='subscriptionrequestfile',
name='sold_to_party_accuracy',
),
migrations.AddField(
model_name='report',
name='feedback_accuracy',
field=models.JSONField(null=True),
),
migrations.AddField(
model_name='report',
name='reviewed_accuracy',
field=models.JSONField(null=True),
),
migrations.AddField(
model_name='reportfile',
name='error',
field=models.TextField(default=''),
),
migrations.AddField(
model_name='reportfile',
name='feedback_accuracy',
field=models.JSONField(null=True),
),
migrations.AddField(
model_name='reportfile',
name='feedback_result',
field=models.JSONField(null=True),
),
migrations.AddField(
model_name='reportfile',
name='predict_result',
field=models.JSONField(null=True),
),
migrations.AddField(
model_name='reportfile',
name='reviewed_accuracy',
field=models.JSONField(null=True),
),
migrations.AddField(
model_name='reportfile',
name='reviewed_result',
field=models.JSONField(null=True),
),
migrations.AddField(
model_name='subscriptionrequest',
name='feedback_accuracy',
field=models.JSONField(null=True),
),
migrations.AddField(
model_name='subscriptionrequest',
name='reviewed_accuracy',
field=models.JSONField(null=True),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='feedback_accuracy',
field=models.JSONField(null=True),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='reviewed_accuracy',
field=models.JSONField(null=True),
),
]

View File

@ -0,0 +1,28 @@
# Generated by Django 4.1.3 on 2024-01-29 05:59
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('fwd_api', '0173_rename_countermeasures_reportfile_counter_measures_and_more'),
]
operations = [
migrations.AddField(
model_name='reportfile',
name='acc',
field=models.FloatField(default=0),
),
migrations.AddField(
model_name='reportfile',
name='correspond_redemption_id',
field=models.CharField(default='', max_length=200),
),
migrations.AlterField(
model_name='reportfile',
name='correspond_request_id',
field=models.CharField(default='', max_length=200),
),
]

View File

@ -0,0 +1,28 @@
# Generated by Django 4.1.3 on 2024-01-30 12:29
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('fwd_api', '0174_reportfile_acc_reportfile_correspond_redemption_id_and_more'),
]
operations = [
migrations.RenameField(
model_name='report',
old_name='number_ivoice_transaction',
new_name='number_imei',
),
migrations.AddField(
model_name='report',
name='number_invoice',
field=models.IntegerField(default=0),
),
migrations.AddField(
model_name='report',
name='number_invoice_transaction',
field=models.IntegerField(default=0),
),
]

View File

@ -13,19 +13,28 @@ class Report(models.Model):
start_at = models.DateTimeField(null=True)
end_at = models.DateTimeField(null=True)
include_for_test_sample = models.BooleanField(default=False)
status = models.CharField(null=True) status = models.CharField(max_length=100)
is_daily_report = models.BooleanField(default=False)
errors = models.TextField(default="", null=True)
subsidiary = models.TextField(default="", null=True)
include_reviewed = models.TextField(default="", null=True)
include_test = models.CharField(max_length=200, default="")
# Data
S3_uploaded = models.BooleanField(default=False)
number_request = models.IntegerField(default=0)
number_images = models.IntegerField(default=0)
number_bad_images = models.IntegerField(default=0)
average_client_time_profile = models.JSONField(default=0) # {"0.1": 100, 0.2: 200, ...} number_imei = models.IntegerField(default=0)
average_OCR_time_profile = models.JSONField(default=0) # {"0.1": 98, 0.2: 202, ...} number_invoice = models.IntegerField(default=0)
average_OCR_time = models.JSONField(null=True) # {"invoice": 0.1, "imei": 0.1}
number_imei_transaction = models.IntegerField(default=0)
number_invoice_transaction = models.IntegerField(default=0)
average_client_time_profile = models.JSONField(null=True) # {"0.1": 100, 0.2: 200, ...} | Future feature
average_OCR_time_profile = models.JSONField(null=True) # {"0.1": 98, 0.2: 202, ...} | Future feature
average_OCR_time = models.JSONField(null=True) # {"invoice": 0.1, "imei": 0.1} | Future feature
average_client_time = models.JSONField(null=True) # {"invoice": 0.1, "imei": 0.1}
imei_accuracy = models.FloatField(default=-1)
purchase_date_accuracy = models.FloatField(default=-1) feedback_accuracy = models.JSONField(null=True)
retailer_name_accuracy = models.FloatField(default=-1) reviewed_accuracy = models.JSONField(null=True)
sold_to_party_accuracy = models.FloatField(default=-1)

View File

@ -0,0 +1,35 @@
from django.db import models
from django.utils import timezone
from fwd_api.models.Subscription import Subscription
from fwd_api.models.SubscriptionRequest import SubscriptionRequest
from fwd_api.models.Report import Report
class ReportFile(models.Model):
# Metadata
id = models.AutoField(primary_key=True)
correspond_request_id = models.CharField(max_length=200, default="")
correspond_redemption_id = models.CharField(max_length=200, default="")
created_at = models.DateTimeField(default=timezone.now, db_index=True)
updated_at = models.DateTimeField(auto_now=True)
report = models.ForeignKey(Report, related_name="files", on_delete=models.CASCADE)
# Data
S3_uploaded = models.BooleanField(default=False)
doc_type = models.CharField(max_length=200)
predict_result = models.JSONField(null=True)
feedback_result = models.JSONField(null=True)
reviewed_result = models.JSONField(null=True)
feedback_accuracy = models.JSONField(null=True)
reviewed_accuracy = models.JSONField(null=True)
acc = models.FloatField(default=0)
time_cost = models.FloatField(default=0)
is_reviewed = models.CharField(default="NA", max_length=5) # NA, No, Yes
bad_image_reason = models.TextField(default="")
counter_measures = models.TextField(default="")
error = models.TextField(default="")

View File

@ -21,10 +21,9 @@ class SubscriptionRequest(models.Model):
updated_at = models.DateTimeField(auto_now=True)
is_test_request = models.BooleanField(default=False)
S3_uploaded = models.BooleanField(default=False)
imei_accuracy = models.FloatField(default=-1)
purchase_date_accuracy = models.FloatField(default=-1) feedback_accuracy = models.JSONField(null=True)
retailer_name_accuracy = models.FloatField(default=-1) reviewed_accuracy = models.JSONField(null=True)
sold_to_party_accuracy = models.FloatField(default=-1)
ai_inference_profile = models.JSONField(null=True)
preprocessing_time = models.FloatField(default=-1)

View File

@ -20,12 +20,15 @@ class SubscriptionRequestFile(models.Model):
created_at = models.DateTimeField(default=timezone.now, db_index=True)
updated_at = models.DateTimeField(auto_now=True)
is_bad_image_quality = models.BooleanField(default=False)
doc_type = models.CharField(max_length=100, default="") doc_type = models.CharField(max_length=10, default="")
index_in_request = models.IntegerField(default=0) index_in_request = models.IntegerField(default=0) # by doc_type
processing_time = models.IntegerField(default=-1) # in milisecond processing_time = models.FloatField(default=-1) # in milisecond
reason = models.TextField(blank=True)
counter_measures = models.TextField(blank=True)
imei_accuracy = models.FloatField(default=-1)
purchase_date_accuracy = models.FloatField(default=-1) predict_result = models.JSONField(null=True)
retailer_name_accuracy = models.FloatField(default=-1) feedback_result = models.JSONField(null=True)
sold_to_party_accuracy = models.FloatField(default=-1) reviewed_result = models.JSONField(null=True)
feedback_accuracy = models.JSONField(null=True)
reviewed_accuracy = models.JSONField(null=True)

View File

@ -6,4 +6,7 @@ from .OcrTemplateBox import OcrTemplateBox
from .PricingPlan import PricingPlan
from .Subscription import Subscription
from .FeedbackRequest import FeedbackRequest
from .Report import Report
from .ReportFile import ReportFile

View File

@ -0,0 +1,417 @@
import re
from datetime import datetime
import copy
from .ocr_utils.ocr_metrics import eval_ocr_metric
from .ocr_utils.sbt_report import post_processing_str
from fwd_api.models import SubscriptionRequest, SubscriptionRequestFile, ReportFile
from ..celery_worker.client_connector import c_connector
BAD_THRESHOLD = 0.75
valid_keys = ["retailername", "sold_to_party", "purchase_date", "imei_number"]
class MonthReportAccumulate:
def __init__(self):
self.month = None
self.total = {
'subs': "+",
'extraction_date': "Subtotal ()",
'total_images': 0,
'images_quality': {
'successful': 0,
'successful_percent': 0,
'bad': 0,
'bad_percent': 0
},
'average_accuracy_rate': {
'imei': IterAvg(),
'purchase_date': IterAvg(),
'retailer_name': IterAvg()
},
'average_processing_time': {
'imei': IterAvg(),
'invoice': IterAvg()
},
'usage': {
'imei':0,
'invoice': 0
}
}
self.data = []
self.data_format = {
'num_imei': 0,
'num_invoice': 0,
'total_images': 0,
'images_quality': {
'successful': 0,
'successful_percent': 0,
'bad': 0,
'bad_percent': 0
},
'average_accuracy_rate': {
'imei': 0,
'purchase_date': 0,
'retailer_name': 0
},
'average_processing_time': {
'imei': 0,
'invoice': 0
},
'usage': {
'imei':0,
'invoice': 0
}
},
def accumulate(self, report):
self.total["total_images"] += report.number_images
self.total["images_quality"]["successful"] += report.number_images - report.number_bad_images
self.total["images_quality"]["bad"] += report.number_bad_images
if sum([report.reviewed_accuracy[x] for x in report.reviewed_accuracy.keys() if "_count" not in x]) > 0 :
self.total["average_accuracy_rate"]["imei"].add_avg(report.reviewed_accuracy.get("imei_number", 0), report.reviewed_accuracy.get("imei_number_count", 0))
self.total["average_accuracy_rate"]["purchase_date"].add_avg(report.reviewed_accuracy.get("purchase_date", 0), report.reviewed_accuracy.get("purchase_date_count", 0))
self.total["average_accuracy_rate"]["retailer_name"].add_avg(report.reviewed_accuracy.get("retailername", 0), report.reviewed_accuracy.get("retailername_count", 0))
elif sum([ report.feedback_accuracy[x] for x in report.feedback_accuracy.keys() if "_count" not in x]) > 0:
self.total["average_accuracy_rate"]["imei"].add_avg(report.feedback_accuracy.get("imei_number", 0), report.feedback_accuracy.get("imei_number_count", 0))
self.total["average_accuracy_rate"]["purchase_date"].add_avg(report.feedback_accuracy.get("purchase_date", 0), report.feedback_accuracy.get("purchase_date_count", 0))
self.total["average_accuracy_rate"]["retailer_name"].add_avg(report.feedback_accuracy.get("retailername", 0), report.feedback_accuracy.get("retailername_count", 0))
self.total["average_processing_time"]["imei"].add_avg(report.average_OCR_time.get("imei", 0), report.average_OCR_time.get("imei_count", 0))
self.total["average_processing_time"]["invoice"].add_avg(report.average_OCR_time.get("invoice", 0), report.average_OCR_time.get("invoice_count", 0))
self.total["usage"]["imei"] += report.number_imei_transaction
self.total["usage"]["invoice"] += report.number_invoice_transaction
def add(self, report):
report_month = report.created_at.month
if self.month is None:
self.month = report_month
self.total["extraction_date"] = f"Subtotal ({self.month})"
elif self.month != report_month:
self.total["images_quality"]["successful_percent"] += self.total["images_quality"]["successful"]/self.total["total_images"]
self.total["images_quality"]["bad_percent"] += self.total["images_quality"]["bad"]/self.total["total_images"]
return False # Reports from a different month, stop accumulating
# accumulate fields
new_data = copy.deepcopy(self.data_format)[0]
new_data["num_imei"] = report.number_imei
new_data["num_invoice"] = report.number_invoice
new_data["total_images"] = report.number_images
new_data["images_quality"]["successful"] = report.number_images - report.number_bad_images
new_data["images_quality"]["bad"] = report.number_bad_images
if sum([ report.reviewed_accuracy[x] for x in report.reviewed_accuracy.keys() if "_count" not in x]):
new_data["average_accuracy_rate"]["imei"] = report.reviewed_accuracy.get("imei_number", None)
new_data["average_accuracy_rate"]["purchase_date"] = report.reviewed_accuracy.get("purchase_date", None)
new_data["average_accuracy_rate"]["retailer_name"] = report.reviewed_accuracy.get("retailername", None)
elif sum([ report.feedback_accuracy[x] for x in report.feedback_accuracy.keys() if "_count" not in x]):
new_data["average_accuracy_rate"]["imei"] = report.feedback_accuracy.get("imei_number", None)
new_data["average_accuracy_rate"]["purchase_date"] = report.feedback_accuracy.get("purchase_date", None)
new_data["average_accuracy_rate"]["retailer_name"] = report.feedback_accuracy.get("retailername", None)
new_data["average_processing_time"]["imei"] = report.average_OCR_time.get("imei", 0)
new_data["average_processing_time"]["invoice"] = report.average_OCR_time.get("invoice", 0)
new_data["usage"]["imei"] = report.number_imei_transaction
new_data["usage"]["invoice"] = report.number_invoice_transaction
new_data["images_quality"]["successful_percent"] += new_data["images_quality"]["successful"]/new_data["total_images"]
new_data["images_quality"]["bad_percent"] += new_data["images_quality"]["bad"]/new_data["total_images"]
self.data.append(new_data)
self.accumulate(report)
return True
def __call__(self):
self.total["images_quality"]["successful_percent"] += self.total["images_quality"]["successful"]/self.total["total_images"]
self.total["images_quality"]["bad_percent"] += self.total["images_quality"]["bad"]/self.total["total_images"]
total = copy.deepcopy(self.total)
total["average_accuracy_rate"]["imei"] = total["average_accuracy_rate"]["imei"]()
total["average_accuracy_rate"]["purchase_date"] = total["average_accuracy_rate"]["purchase_date"]()
total["average_accuracy_rate"]["retailer_name"] = total["average_accuracy_rate"]["retailer_name"]()
total["average_processing_time"]["imei"] = total["average_processing_time"]["imei"]()
total["average_processing_time"]["invoice"] = total["average_processing_time"]["invoice"]()
return self.month, self.data, total
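# Illustrative usage sketch (not part of the original module; the driving loop is assumed,
# with `reports` being Report rows ordered by created_at):
#   acc = MonthReportAccumulate()
#   for report in reports:
#       if not acc.add(report):            # add() returns False once a new month starts
#           month, rows, subtotal = acc()  # flush the finished month
#           acc = MonthReportAccumulate()
#           acc.add(report)                # start accumulating the new month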
class IterAvg:
def __init__(self, name="default"):
self.name = name
self.avg = 0
self.count = 0
def add(self, values):
"""
Args:
values (list[float]):
"""
values = [x for x in values if x is not None]
if len(values) == 0:
return
self.avg = (self.avg*self.count + sum(values))/(self.count+len(values))
self.count += len(values)
def add_avg(self, avg, count):
if avg is None or count is None or count == 0:
return
self.count += count
self.avg = (self.avg*(self.count-count) + avg*count)/(self.count)
def __call__(self):
return self.avg
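# Illustrative worked example of the weighted running mean kept by IterAvg (values assumed):
#   avg = IterAvg()
#   avg.add_avg(0.9, 10)   # mean 0.9 over 10 samples -> avg = 0.9, count = 10
#   avg.add_avg(0.5, 5)    # (0.9*10 + 0.5*5) / 15 ≈ 0.7667, count = 15
#   avg()                  # -> 0.7666...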
def convert_datetime_format(date_string: str, is_gt=False) -> str:
# pattern_date_string = "2023-02-28"
input_format = "%Y-%m-%d"
output_format = "%d/%m/%Y"
# Validate the input date string format
pattern = r"\d{4}-\d{2}-\d{2}"
if re.match(pattern, date_string):
# Convert the date string to a datetime object
date_object = datetime.strptime(date_string, input_format)
# Convert the datetime object to the desired output format
formatted_date = date_object.strftime(output_format)
return formatted_date
return date_string
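# Illustrative behaviour sketch:
#   convert_datetime_format("2023-02-28")  -> "28/02/2023"
#   convert_datetime_format("28/02/2023")  -> "28/02/2023"  (pattern does not match, returned unchanged)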
def predict_result_to_ready(result):
dict_result = {"retailername": "",
"sold_to_party": "",
"purchase_date": [],
"imei_number": [],}
dict_result["retailername"] = result.get("content", {}).get("document", [{}])[0].get("content", [{}])[0].get("value", None)
dict_result["sold_to_party"] = result.get("content", {}).get("document", [{}])[0].get("content", [{}, {}])[1].get("value", None)
dict_result["purchase_date"] = result.get("content", {}).get("document", [{}])[0].get("content", [{}, {}, {}])[2].get("value", [])
dict_result["imei_number"] = result.get("content", {}).get("document", [{}])[0].get("content", [{}, {}, {}, {}])[3].get("value", [])
return dict_result
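# Illustrative sketch of the `result` layout assumed by the fixed indexing above, i.e.
# content.document[0].content holding retailername, sold_to_party, purchase_date and
# imei_number in that order (example values are made up):
#   result = {"content": {"document": [{"content": [
#       {"label": "retailername", "value": "ABC Mart"},
#       {"label": "sold_to_party", "value": "Nguyen Van A"},
#       {"label": "purchase_date", "value": ["28/02/2023"]},
#       {"label": "imei_number", "value": ["123456789012345"]}]}]}}
#   predict_result_to_ready(result)
#   # -> {"retailername": "ABC Mart", "sold_to_party": "Nguyen Van A",
#   #     "purchase_date": ["28/02/2023"], "imei_number": ["123456789012345"]}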
def align_fine_result(ready_predict, fine_result):
# print(f"[DEBUG]: fine_result: {fine_result}")
# print(f"[DEBUG]: ready_predict: {ready_predict}")
if fine_result:
if fine_result["purchase_date"] and len(ready_predict["purchase_date"]) == 0:
ready_predict["purchase_date"] = [None]
if fine_result["retailername"] and not ready_predict["retailername"]:
ready_predict["retailername"] = [None]
fine_result["purchase_date"] = [fine_result["purchase_date"] for _ in range(len(ready_predict["purchase_date"]))]
# else:
# fine_result = {}
# for key in ready_predict.keys():
# fine_result[key] = []
# fine_result["purchase_date"] = [None for _ in range(len(ready_predict["purchase_date"]))]
return ready_predict, fine_result
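# Illustrative sketch (values assumed): how the alignment behaves when the prediction
# found no purchase_date but the feedback/reviewed result has one.
#   ready_predict = {"retailername": "ABC", "purchase_date": [], "imei_number": ["123"], "sold_to_party": ""}
#   fine_result   = {"retailername": "ABC Mart", "purchase_date": "28/02/2023", "imei_number": ["123"], "sold_to_party": ""}
#   align_fine_result(ready_predict, fine_result)
#   # -> ready_predict["purchase_date"] becomes [None] and fine_result["purchase_date"]
#   #    is broadcast to one copy per predicted purchase_date entry.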
def update_temp_accuracy(accuracy, acc, keys):
for key in keys:
accuracy[key].add(acc[key])
return accuracy
def calculate_accuracy(key_name, inference, target):
"""_summary_
Args:
key_name (string): key to calculate accuracy on, ex: retailername
inference (dict): result from ocr, refined to align with the target down below
target (dict): result of type
"""
acc = []
data = []
if not target or not inference:
return acc, data
if not isinstance(inference[key_name], list):
if inference[key_name] is None:
inference[key_name] = []
else:
inference[key_name] = [inference[key_name]]
if not isinstance(target[key_name], list):
if target[key_name] is None:
target[key_name] = []
else:
target[key_name] = [target[key_name]]
for i, v in enumerate(inference[key_name]):
# TODO: target[key_name][i] is None, ""
x = post_processing_str(key_name, inference[key_name][i], is_gt=False)
y = post_processing_str(key_name, target[key_name][i], is_gt=True)
score = eval_ocr_metric(
[x],
[y],
metric=[
"one_minus_ned",
# "line_acc_ignore_case_symbol",
# "line_acc",
# "one_minus_ned_word",
])
acc.append(list(score.values())[0])
data.append([x, y])
return acc, data
def calculate_avg_accuracy(acc, type, keys=[]):
acc_list = []
# print(f"[DEBUG]: type: {type} - acc: {acc}")
for key in keys:
acc_list += acc.get(type, {}).get(key, [])
acc_list = [x for x in acc_list if x is not None]
return sum(acc_list)/len(acc_list) if len(acc_list) > 0 else None
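# Illustrative sketch of the flattening above (numbers assumed):
#   acc = {"feedback": {"imei_number": [1.0, 0.8], "purchase_date": [None]}}
#   calculate_avg_accuracy(acc, "feedback", ["imei_number", "purchase_date"])
#   # -> (1.0 + 0.8) / 2 = 0.9   (None entries are dropped before averaging)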
def calculate_and_save_subcription_file(report, request):
request_att = {"acc": {"feedback": {"imei_number": [],
"purchase_date": [],
"retailername": [],
"sold_to_party": [],
},
"reviewed": {"imei_number": [],
"purchase_date": [],
"retailername": [],
"sold_to_party": [],
}},
"err": [],
"time_cost": {},
"total_images": 0,
"bad_images": 0}
images = SubscriptionRequestFile.objects.filter(request=request)
for image in images:
status, att = calculate_subcription_file(image)
if status != 200:
continue
image.feedback_accuracy = att["acc"]["feedback"]
image.reviewed_accuracy = att["acc"]["reviewed"]
image.is_bad_image_quality = att["is_bad_image"]
image.save()
new_report_file = ReportFile(report=report,
correspond_request_id=request.request_id,
correspond_redemption_id=request.redemption_id,
doc_type=image.doc_type,
predict_result=image.predict_result,
feedback_result=image.feedback_result,
reviewed_result=image.reviewed_result,
feedback_accuracy=att["acc"]["feedback"],
reviewed_accuracy=att["acc"]["reviewed"],
acc=att["avg_acc"],
time_cost=image.processing_time,
bad_image_reason=image.reason,
counter_measures=image.counter_measures,
error="|".join(att["err"])
)
new_report_file.save()
if request_att["time_cost"].get(image.doc_type, None):
request_att["time_cost"][image.doc_type].append(image.processing_time)
else:
request_att["time_cost"][image.doc_type] = [image.processing_time]
try:
request_att["acc"]["feedback"]["imei_number"] += att["acc"]["feedback"]["imei_number"]
request_att["acc"]["feedback"]["purchase_date"] += att["acc"]["feedback"]["purchase_date"]
request_att["acc"]["feedback"]["retailername"] += att["acc"]["feedback"]["retailername"]
request_att["acc"]["feedback"]["sold_to_party"] += att["acc"]["feedback"]["sold_to_party"]
request_att["acc"]["reviewed"]["imei_number"] += att["acc"]["reviewed"]["imei_number"]
request_att["acc"]["reviewed"]["purchase_date"] += att["acc"]["reviewed"]["purchase_date"]
request_att["acc"]["reviewed"]["retailername"] += att["acc"]["reviewed"]["retailername"]
request_att["acc"]["reviewed"]["sold_to_party"] += att["acc"]["reviewed"]["sold_to_party"]
request_att["bad_images"] += int(att["is_bad_image"])
request_att["total_images"] += 1
request_att["err"] += att["err"]
except Exception as e:
print(e)
continue
return request_att
def calculate_subcription_file(subcription_request_file):
att = {"acc": {"feedback": {},
"reviewed": {}},
"err": [],
"is_bad_image": False,
"avg_acc": None}
if not subcription_request_file.predict_result:
return 400, att
inference_result = copy.deepcopy(subcription_request_file.predict_result)
inference_result, feedback_result = align_fine_result(inference_result, copy.deepcopy(subcription_request_file.feedback_result))
inference_result, reviewed_result = align_fine_result(inference_result, copy.deepcopy(subcription_request_file.reviewed_result))
# print(f"[DEBUG]: predict_result: {subcription_request_file.predict_result}")
# print(f"[DEBUG]: inference_result: {inference_result}")
# print(f"[DEBUG]: feedback_result: {feedback_result}")
# print(f"[DEBUG]: reviewed_result: {reviewed_result}")
for key_name in valid_keys:
try:
att["acc"]["feedback"][key_name], _ = calculate_accuracy(key_name, inference_result, feedback_result)
att["acc"]["reviewed"][key_name], _ = calculate_accuracy(key_name, inference_result, reviewed_result)
except Exception as e:
att["err"].append(str(e))
# print(f"[DEBUG]: e: {e} -key_name: {key_name}")
avg_reviewed = calculate_avg_accuracy(att["acc"], "reviewed", ["retailername", "sold_to_party", "purchase_date", "imei_number"])
avg_feedback = calculate_avg_accuracy(att["acc"], "feedback", ["retailername", "sold_to_party", "purchase_date", "imei_number"])
if avg_feedback is not None or avg_reviewed is not None:
avg_acc = max([x for x in [avg_feedback, avg_reviewed] if x is not None])
if avg_acc < BAD_THRESHOLD:
att["is_bad_image"] = True
att["avg_acc"] = avg_acc
return 200, att
def calculate_attributions(request): # for one request, return in order
acc = {"feedback": {},
"reviewed": {}} # {"feedback": {"retailername": [0.1], "sold_to_party":[0.9], "purchase_date":[0.6], "imei_number":[0.8]},
# "reviewed": {"retailername": [0.1], "sold_to_party":[0.9], "purchase_date":[0.6], "imei_number":[0.8]}}
data = {"feedback": {},
"reviewed": {}} # {"feedback": {"retailername": [[ocr, feedback], ...], "sold_to_party":[[ocr, feedback], ...], "purchase_date":[[ocr, feedback], ...], "imei_number":[[ocr, feedback], ...]}}
# {"reviewed": {"retailername": [[ocr, reviewed], ...], "sold_to_party":[[ocr, reviewed], ...], "purchase_date":[[ocr, reviewed], ...], "imei_number":[[ocr, reviewed], ...]}}
time_cost = {} # {"imei": [0.1], "invoice": [0.1]}
image_quality_num = [0, 0] # [good, bad]
image_quality_num[0] = len(request.doc_type.split(","))
error = ""
inference_result = predict_result_to_ready(request.predict_result)
# align_fine_result returns a (ready_predict, fine_result) pair
inference_result, reviewed_result = align_fine_result(inference_result, request.reviewed_result)
inference_result, feedback_result = align_fine_result(inference_result, request.feedback_result)
# accuracy calculation
for key_name in valid_keys:
if isinstance(inference_result[key_name], list):
if len(inference_result[key_name]) != len(reviewed_result.get(key_name, [])):
error = f"Request {request.request_id} failed with different {key_name} in predict and reviewed_result"
break
if len(inference_result[key_name]) != len(feedback_result.get(key_name, [])):
error = f"Request {request.request_id} failed with different {key_name} in predict and feedback_result"
break
# calculate accuracy for feedback result
acc["feedback"][key_name], data["feedback"][key_name] = calculate_accuracy(key_name, inference_result, feedback_result)
acc["reviewed"][key_name], data["reviewed"][key_name] = calculate_accuracy(key_name, inference_result, reviewed_result)
else:
inference_result[key_name] = [inference_result[key_name]]
feedback_result[key_name] = [feedback_result[key_name]]
reviewed_result[key_name] = [reviewed_result[key_name]]
acc["feedback"][key_name], data["feedback"][key_name] = calculate_accuracy(key_name, inference_result, feedback_result)
acc["reviewed"][key_name], data["reviewed"][key_name] = calculate_accuracy(key_name, inference_result, reviewed_result)
acc["feedback"]["purchase_date"] = [max(acc["feedback"]["purchase_date"])] if len(acc["feedback"]["purchase_date"]) > 0 else []
acc["reviewed"]["purchase_date"] = [max(acc["reviewed"]["purchase_date"])] if len(acc["reviewed"]["purchase_date"]) > 0 else []
# Count for bad and total images
avg_invoice_feedback = calculate_avg_accuracy(acc, "feedback", ["retailername", "sold_to_party", "purchase_date"])
avg_invoice_reviewed = calculate_avg_accuracy(acc, "reviewed", ["retailername", "sold_to_party", "purchase_date"])
if avg_invoice_feedback is not None or avg_invoice_reviewed is not None:
if max([x for x in [avg_invoice_feedback, avg_invoice_reviewed] if x is not None]) < BAD_THRESHOLD:
image_quality_num[1] += 1
for i, _ in enumerate(acc["feedback"]["imei_number"]):
if acc["feedback"]["imei_number"][i] is not None and acc["reviewed"]["imei_number"][i] is not None:
if max([x for x in [acc["feedback"]["imei_number"][i], acc["reviewed"]["imei_number"][i]] if x is not None]) < BAD_THRESHOLD:
image_quality_num[1] += 1
# time cost and quality calculation
# TODO: to be deprecated, doc_type would be in file level in the future
try:
for doc_type, doc_profile in request.ai_inference_profile.items():
doc_type = doc_type.split("_")[0]
inference_time = doc_profile["inference"][1][0] - doc_profile["inference"][0]
postprocess_time = doc_profile["postprocess"][1] - doc_profile["postprocess"][0]
time_cost.setdefault(doc_type, []).append(inference_time + postprocess_time)
except Exception as e:
error = f"Request id {request.request_id} failed with error: {e}"
return acc, data, time_cost, image_quality_num, error
def shadow_report(report_id, query):
c_connector.make_a_report(
(report_id, query))

View File

@ -6,6 +6,7 @@ import json
from PIL import Image, ExifTags
from django.core.files.uploadedfile import TemporaryUploadedFile
from django.utils import timezone
from fwd import settings
from fwd_api.constant.common import allowed_file_extensions
@ -18,10 +19,33 @@ from fwd_api.utils.image import resize
from ..celery_worker.client_connector import c_connector
import imagesize
import csv
from openpyxl import load_workbook
from openpyxl.styles import Font, Border, Side, PatternFill, NamedStyle
def validate_report_list(request):
start_date_str = request.GET.get('start_date', '')
end_date_str = request.GET.get('end_date', '')
page_number = int(request.GET.get('page', 0))
page_size = int(request.GET.get('page_size', 10))
report_id = request.GET.get('report_id', None)
validated_data = {}
validated_data["start_date"] = None
validated_data["end_date"] = None
if len(start_date_str) > 0 and len(end_date_str) > 0:
try:
validated_data["start_date"] = timezone.datetime.strptime(start_date_str, '%Y-%m-%dT%H:%M:%S%z')
validated_data["end_date"] = timezone.datetime.strptime(end_date_str, '%Y-%m-%dT%H:%M:%S%z')
except ValueError:
raise InvalidException(excArgs="Date format")
validated_data["report_id"] = report_id
validated_data["page_size"] = page_size
validated_data["page_number"] = page_number
if validated_data["report_id"] is None and validated_data["start_date"] is None:
raise RequiredFieldException(excArgs="report_id, start_date, end_date")
return validated_data
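# Illustrative query strings this validator accepts (parameter values are assumed):
#   ?start_date=2024-01-01T00:00:00+0700&end_date=2024-02-01T00:00:00+0700&page=1&page_size=20
#   ?report_id=<report_id>&start_date=&end_date=
# Either report_id or the start_date/end_date pair must be supplied, otherwise
# RequiredFieldException is raised.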
def validate_feedback_file(csv_file_path):
required_columns = ['redemptionNumber', 'requestId', 'imeiNumber', 'imeiNumber2', 'Purchase Date', 'retailer', 'Sold to party', 'timetakenmilli']
missing_columns = []
@ -57,7 +81,6 @@ def validate_list_file(files, max_file_num=settings.MAX_UPLOAD_FILES_IN_A_REQUES
if total_file_size > settings.MAX_UPLOAD_FILE_SIZE_OF_A_REQUEST:
raise LimitReachedException(excArgs=('Total size of all files', str(settings.MAX_UPLOAD_SIZE_OF_A_FILE / 1024 / 1024), 'MB'))
def validate_csv_feedback(files, max_file_num=1, min_file_num=1, file_field="csv files"):
total_file_size = 0
if len(files) < min_file_num:

View File

@ -0,0 +1,385 @@
import re
from pathlib import Path
from difflib import SequenceMatcher
from terminaltables import AsciiTable
from rapidfuzz.distance import Levenshtein
from .wiki_diff import inline_diff
def is_type_list(x, type):
if not isinstance(x, list):
return False
return all(isinstance(item, type) for item in x)
def cal_true_positive_char(pred, gt):
"""Calculate correct character number in prediction.
Args:
pred (str): Prediction text.
gt (str): Ground truth text.
Returns:
true_positive_char_num (int): The true positive number.
"""
all_opt = SequenceMatcher(None, pred, gt)
true_positive_char_num = 0
for opt, _, _, s2, e2 in all_opt.get_opcodes():
if opt == "equal":
true_positive_char_num += e2 - s2
else:
pass
return true_positive_char_num
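# Illustrative worked example of the opcode counting above:
#   cal_true_positive_char("helo", "hello")
#   # opcodes: equal "hel" (3 chars), insert "l", equal "o" (1 char) -> returns 4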
def post_processing(text):
"""
- Replace special characters with spaces and collapse extra spaces (input is expected to be lower-cased by the caller)
"""
text = re.sub(
r"[^aAàÀảẢãÃáÁạẠăĂằẰẳẲẵẴắẮặẶâÂầẦẩẨẫẪấẤậẬbBcCdDđĐeEèÈẻẺẽẼéÉẹẸêÊềỀểỂễỄếẾệỆfFgGhHiIìÌỉỈĩĨíÍịỊjJkKlLmMnNoOòÒỏỎõÕóÓọỌôÔồỒổỔỗỖốỐộỘơƠờỜởỞỡỠớỚợỢpPqQrRsStTuUùÙủỦũŨúÚụỤưƯừỪửỬữỮứỨựỰvVwWxXyYỳỲỷỶỹỸýÝỵỴzZ0123456789 ]",
" ",
text,
)
text = re.sub(r"\s\s+", " ", text)
text = text.strip()
return text
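# Illustrative behaviour sketch (Vietnamese letters, digits and spaces are kept; everything
# else becomes a space, then runs of spaces are collapsed):
#   post_processing("samsung - store 123!")  -> "samsung store 123"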
def count_matches(pred_texts, gt_texts, use_ignore=True):
"""Count the various match number for metric calculation.
Args:
pred_texts (list[str]): Predicted text string.
gt_texts (list[str]): Ground truth text string.
Returns:
match_res: (dict[str: int]): Match number used for
metric calculation.
"""
match_res = {
"gt_char_num": 0,
"pred_char_num": 0,
"true_positive_char_num": 0,
"gt_word_num": 0,
"match_word_num": 0,
"match_word_ignore_case": 0,
"match_word_ignore_case_symbol": 0,
"match_kie": 0,
"match_kie_ignore_case": 0,
}
# comp = re.compile('[^A-Z^a-z^0-9^\u4e00-\u9fa5]')
# comp = re.compile('[]')
norm_ed_sum = 0.0
gt_texts_for_ned_word = []
pred_texts_for_ned_word = []
for pred_text, gt_text in zip(pred_texts, gt_texts):
if gt_text == pred_text:
match_res["match_word_num"] += 1
match_res["match_kie"] += 1
gt_text_lower = str(gt_text).lower()
pred_text_lower = str(pred_text).lower()
if gt_text_lower == pred_text_lower:
match_res["match_word_ignore_case"] += 1
# gt_text_lower_ignore = comp.sub('', gt_text_lower)
# pred_text_lower_ignore = comp.sub('', pred_text_lower)
if use_ignore:
gt_text_lower_ignore = post_processing(gt_text_lower)
pred_text_lower_ignore = post_processing(pred_text_lower)
else:
gt_text_lower_ignore = gt_text_lower
pred_text_lower_ignore = pred_text_lower
if gt_text_lower_ignore == pred_text_lower_ignore:
match_res["match_kie_ignore_case"] += 1
gt_texts_for_ned_word.append(gt_text_lower_ignore.split(" "))
pred_texts_for_ned_word.append(pred_text_lower_ignore.split(" "))
match_res["gt_word_num"] += 1
norm_ed = Levenshtein.normalized_distance(
pred_text_lower_ignore, gt_text_lower_ignore
)
# if norm_ed > 0.1:
# print(gt_text_lower_ignore, pred_text_lower_ignore, sep='\n')
# print("-"*20)
norm_ed_sum += norm_ed
# number to calculate char level recall & precision
match_res["gt_char_num"] += len(gt_text_lower_ignore)
match_res["pred_char_num"] += len(pred_text_lower_ignore)
true_positive_char_num = cal_true_positive_char(
pred_text_lower_ignore, gt_text_lower_ignore
)
match_res["true_positive_char_num"] += true_positive_char_num
normalized_edit_distance = norm_ed_sum / max(1, len(gt_texts))
match_res["ned"] = normalized_edit_distance
# NED for word-level
norm_ed_word_sum = 0.0
# print(pred_texts_for_ned_word[0])
unique_words = list(
set(
[x for line in pred_texts_for_ned_word for x in line]
+ [x for line in gt_texts_for_ned_word for x in line]
)
)
preds = [
[unique_words.index(w) for w in pred_text_for_ned_word]
for pred_text_for_ned_word in pred_texts_for_ned_word
]
truths = [
[unique_words.index(w) for w in gt_text_for_ned_word]
for gt_text_for_ned_word in gt_texts_for_ned_word
]
for pred_text, gt_text in zip(preds, truths):
norm_ed_word = Levenshtein.normalized_distance(pred_text, gt_text)
# if norm_ed_word < 0.2:
# print(pred_text, gt_text)
norm_ed_word_sum += norm_ed_word
normalized_edit_distance_word = norm_ed_word_sum / max(1, len(gt_texts))
match_res["ned_word"] = normalized_edit_distance_word
return match_res
def eval_ocr_metric(pred_texts, gt_texts, metric="acc"):
"""Evaluate the text recognition performance with metric: word accuracy and
1-N.E.D. See https://rrc.cvc.uab.es/?ch=14&com=tasks for details.
Args:
pred_texts (list[str]): Text strings of prediction.
gt_texts (list[str]): Text strings of ground truth.
metric (str | list[str]): Metric(s) to be evaluated. Options are:
- 'word_acc': Accuracy at word level.
- 'word_acc_ignore_case': Accuracy at word level, ignoring letter
case.
- 'word_acc_ignore_case_symbol': Accuracy at word level, ignoring
letter case and symbol. (Default metric for academic evaluation)
- 'char_recall': Recall at character level, ignoring
letter case and symbol.
- 'char_precision': Precision at character level, ignoring
letter case and symbol.
- 'one_minus_ned': 1 - normalized_edit_distance
In particular, if ``metric == 'acc'``, results on all metrics above
will be reported.
Returns:
dict{str: float}: Result dict for text recognition, keys could be some
of the following: ['word_acc', 'word_acc_ignore_case',
'word_acc_ignore_case_symbol', 'char_recall', 'char_precision',
'1-N.E.D'].
"""
assert isinstance(pred_texts, list)
assert isinstance(gt_texts, list)
assert len(pred_texts) == len(gt_texts)
assert isinstance(metric, str) or is_type_list(metric, str)
if metric == "acc" or metric == ["acc"]:
metric = [
"word_acc",
"word_acc_ignore_case",
"word_acc_ignore_case_symbol",
"char_recall",
"char_precision",
"one_minus_ned",
]
metric = set([metric]) if isinstance(metric, str) else set(metric)
# supported_metrics = set([
# 'word_acc', 'word_acc_ignore_case', 'word_acc_ignore_case_symbol',
# 'char_recall', 'char_precision', 'one_minus_ned', 'one_minust_ned_word'
# ])
# assert metric.issubset(supported_metrics)
match_res = count_matches(pred_texts, gt_texts)
eps = 1e-8
eval_res = {}
if "char_recall" in metric:
char_recall = (
1.0 * match_res["true_positive_char_num"] / (eps + match_res["gt_char_num"])
)
eval_res["char_recall"] = char_recall
if "char_precision" in metric:
char_precision = (
1.0
* match_res["true_positive_char_num"]
/ (eps + match_res["pred_char_num"])
)
eval_res["char_precision"] = char_precision
if "word_acc" in metric:
word_acc = 1.0 * match_res["match_word_num"] / (eps + match_res["gt_word_num"])
eval_res["word_acc"] = word_acc
if "word_acc_ignore_case" in metric:
word_acc_ignore_case = (
1.0 * match_res["match_word_ignore_case"] / (eps + match_res["gt_word_num"])
)
eval_res["word_acc_ignore_case"] = word_acc_ignore_case
if "word_acc_ignore_case_symbol" in metric:
word_acc_ignore_case_symbol = (
1.0
* match_res["match_word_ignore_case_symbol"]
/ (eps + match_res["gt_word_num"])
)
eval_res["word_acc_ignore_case_symbol"] = word_acc_ignore_case_symbol
if "one_minus_ned" in metric:
eval_res["1-N.E.D"] = 1.0 - match_res["ned"]
if "one_minus_ned_word" in metric:
eval_res["1-N.E.D_word"] = 1.0 - match_res["ned_word"]
if "line_acc_ignore_case_symbol" in metric:
line_acc_ignore_case_symbol = (
1.0 * match_res["match_kie_ignore_case"] / (eps + match_res["gt_word_num"])
)
eval_res["line_acc_ignore_case_symbol"] = line_acc_ignore_case_symbol
if "line_acc" in metric:
word_acc_ignore_case_symbol = (
1.0 * match_res["match_kie"] / (eps + match_res["gt_word_num"])
)
eval_res["line_acc"] = word_acc_ignore_case_symbol
for key, value in eval_res.items():
eval_res[key] = float("{:.4f}".format(value))
return eval_res
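# Illustrative call with the default metric set (values follow from the definitions above):
#   eval_ocr_metric(["helo"], ["hello"])
#   # -> approximately {'char_recall': 0.8, 'char_precision': 1.0, 'word_acc': 0.0,
#   #                   ..., '1-N.E.D': 0.8}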
def eval_kie(preds_e2e: dict[str, dict[str, str]], gt_e2e: dict[str, dict[str, str]], labels, skip_labels=[]):
results = {label: 1 for label in labels}
pred_texts_dict = {label: [] for label in labels}
gt_texts_dict = {label: [] for label in labels}
fail_cases = {}
for img_id in gt_e2e.keys():
fail_cases[img_id] = {}
pred_items = preds_e2e.get(img_id, {k: '' for k in gt_e2e[img_id]})
gt_items = gt_e2e[img_id]
for class_name, text_gt in gt_items.items():
if class_name in skip_labels:
continue
# if class_name == 'seller_name_value':
# print(gt_items)
if class_name not in pred_items:
text_pred = ""
else:
text_pred = pred_items[class_name]
if str(text_pred) != str(text_gt):
diff = inline_diff(text_pred, text_gt)
fail_cases[img_id][class_name] = {
'pred': text_pred,
'gt': text_gt,
"diff": diff['res_text'],
"ned": diff["ned"],
"score": eval_ocr_metric([text_pred], [text_gt], metric=[
"one_minus_ned"])["1-N.E.D"],
}
pred_texts_dict[class_name].append(text_pred)
gt_texts_dict[class_name].append(text_gt)
for class_name in labels:
pred_texts = pred_texts_dict[class_name]
gt_texts = gt_texts_dict[class_name]
result = eval_ocr_metric(
pred_texts,
gt_texts,
metric=[
"one_minus_ned",
"line_acc_ignore_case_symbol",
"line_acc",
"one_minus_ned_word",
],
)
results[class_name] = {
"1-ned": result["1-N.E.D"],
"1-ned-word": result["1-N.E.D_word"],
"line_acc": result["line_acc"],
"line_acc_ignore_case_symbol": result["line_acc_ignore_case_symbol"],
"samples": len(pred_texts),
}
# avg results
sum_1_ned = sum(
[
results[class_name]["1-ned"] * results[class_name]["samples"]
for class_name in labels
]
)
sum_1_ned_word = sum(
[
results[class_name]["1-ned-word"] * results[class_name]["samples"]
for class_name in labels
]
)
sum_line_acc = sum(
[
results[class_name]["line_acc"] * results[class_name]["samples"]
for class_name in labels
]
)
sum_line_acc_ignore_case_symbol = sum(
[
results[class_name]["line_acc_ignore_case_symbol"]
* results[class_name]["samples"]
for class_name in labels
]
)
total_samples = sum(
[results[class_name]["samples"] for class_name in labels]
)
results["avg_all"] = {
"1-ned": round(sum_1_ned / total_samples, 4),
"1-ned-word": round(sum_1_ned_word / total_samples, 4),
"line_acc": round(sum_line_acc / total_samples, 4),
"line_acc_ignore_case_symbol": round(
sum_line_acc_ignore_case_symbol / total_samples, 4
),
"samples": total_samples,
}
table_data = [
[
"class_name",
"1-NED",
"1-N.E.D_word",
"line-acc",
"line_acc_ignore_case_symbol",
"#samples",
]
]
for class_name in results.keys():
# if c < p.shape[0]:
table_data.append(
[
class_name,
results[class_name]["1-ned"],
results[class_name]["1-ned-word"],
results[class_name]["line_acc"],
results[class_name]["line_acc_ignore_case_symbol"],
results[class_name]["samples"],
]
)
table = AsciiTable(table_data)
print(table.table)
return results, fail_cases

View File

@ -0,0 +1,432 @@
import os
import re
import ast
import time
import json
import glob
import shutil
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from datetime import datetime
from .ocr_metrics import eval_ocr_metric
import sys
# sys.path.append(os.path.dirname(__file__))
from sdsvkvu.utils.query.sbt_v2 import get_seller, post_process_seller
def read_json(file_path: str):
with open(file_path, 'r') as f:
return json.load(f)
def write_to_json(file_path, content):
with open(file_path, mode='w', encoding='utf8') as f:
json.dump(content, f, ensure_ascii=False)
def convert_datetime_format(date_string: str, is_gt=False) -> str:
# pattern_date_string = "2023-02-28"
output_format = "%Y-%m-%d"
input_format = "%d/%m/%Y"
# Validate the input date string format
pattern = r"\d{2}\/\d{2}\/\d{4}"
if re.match(pattern, date_string):
# Convert the date string to a datetime object
date_object = datetime.strptime(date_string, input_format)
# Convert the datetime object to the desired output format
formatted_date = date_object.strftime(output_format)
return formatted_date
return date_string
def normalise_retailer_name(retailer: str):
input_value = {
"text": retailer,
"id": 0,
"class": "seller",
"bbox": [0, 0, 0, 0],
}
output = get_seller({'seller': [input_value]})
norm_seller_name = post_process_seller(output)
return norm_seller_name
def post_processing_str(class_name: str, s: str, is_gt: bool) -> str:
s = str(s).replace('', ' ').strip()
if s.lower() in ['null', 'nan', "none"]:
return ''
if class_name == "purchase_date" and is_gt == True:
s = convert_datetime_format(s)
if class_name == "retailername":
s = normalise_retailer_name(s)
return s
def convert_groundtruth_from_csv(
csv_path: str,
save_dir: str,
classes: list = ["retailername", "sold_to_party", "purchase_date", "imei_number"]
):
# if isinstance(csv_path_list, str):
# csv_path_list = [csv_path_list]
df = pd.read_csv(csv_path)
total_output = {}
for _, request in df.iterrows():
req_id = request['requestId']
if req_id not in total_output:
total_output[req_id] = {k: None for k in classes}
total_output[req_id]["imei_number"] = []
total_output[req_id]["imei_number"].extend([request["imeiNumber"], request["imeiNumber2"]])
total_output[req_id]["imei_number"] = list(set(total_output[req_id]["imei_number"]))
total_output[req_id]["purchase_date"] = request["Purchase Date"]
total_output[req_id]["retailername"] = request["retailer"]
for req_id, output in total_output.items():
save_path = os.path.join(save_dir, req_id)
os.makedirs(save_path, exist_ok=True)
write_to_json(os.path.join(save_path, f"{req_id}.json"), output)
def convert_predict_from_csv(
csv_path: str,
save_dir: str,
classes: list = ["retailername", "sold_to_party", "purchase_date", "imei_number"]
):
# if isinstance(csv_path_list, str):
# csv_path_list = [csv_path_list]
df = pd.read_csv(csv_path)
for _, request in df.iterrows():
n_pages = request['pages']
req_id = request['request_id']
if not isinstance(request['doc_type'], str) or not isinstance(request['predict_result'], str):
print(f"[WARNING] Skipped request id {req_id}")
continue
doc_type_list = request['doc_type'].split(',')
assert n_pages == len(doc_type_list), \
"No. pages is different no. documents"
json_path = os.path.join(save_dir, req_id)
os.makedirs(json_path, exist_ok=True)
# For user_submitted_results
if "feedback_result" in request:
feedback_data = ast.literal_eval(request['feedback_result'])
fname = f"{req_id}.json"
write_to_json(os.path.join(json_path, fname), feedback_data)
# For predict_results
data = ast.literal_eval(request['predict_result'])['content']['document'][0]['content']
infer_time = float(request['ai_inference_time']) + float(request['preprocessing_time']) + 0.1
n_imei, n_invoice = 0, 0
for doc_type in doc_type_list:
output = {k: None for k in classes}
if not os.path.exists(json_path):
os.makedirs(json_path, exist_ok=True)
if doc_type == "imei":
for info in data:
if info['label'] == "imei_number":
output['imei_number'] = info['value'][n_imei]
output['processing_time'] = infer_time
fname = f"temp_{doc_type}_{req_id}_{n_imei}.json"
write_to_json(os.path.join(json_path, fname), output)
n_imei += 1
break
elif doc_type == "invoice":
for info in data:
if info['label'] == "imei_number":
continue
output[info['label']] = info['value']
output['processing_time'] = infer_time
fname = f"temp_{doc_type}_{req_id}_{n_invoice}.json"
write_to_json(os.path.join(json_path, fname), output)
n_invoice += 1
def gen_req_to_red_dict(csv_path: str):
df = pd.read_csv(csv_path)
df = df.loc[:, ["requestId", "redemptionNumber"]]
req_to_red = {row["requestId"]: row["redemptionNumber"] for _, row in df.iterrows()}
return req_to_red
def gen_req_to_red_dict_2(csv_path: str):
df = pd.read_csv(csv_path)
df = df.loc[:, ["request_id", "redemption_id"]]
req_to_red = {row["request_id"]: row["redemption_id"] for _, row in df.iterrows()}
return req_to_red
def init_csv(
gt_dir: str,
pred_dir: str,
req_to_red: dict,
):
list_request_id = os.listdir(gt_dir)
total = []
for request_id in list_request_id:
gt_path = os.path.join(gt_dir, request_id, request_id+".json")
if not os.path.exists(gt_path):
print(f"[WARNING] Skipped request id {os.path.basename(os.path.dirname(gt_path))}")
continue
gt_data = read_json(gt_path)
json_file_list = glob.glob(os.path.join(pred_dir, request_id, "temp_*.json"))
json_file_list = sorted(json_file_list, key=lambda x: int(x.split(".json")[0].split('_')[-1]))
n_imei, n_invoice = 0, 0
# if len(json_file_list) > 3:
# continue
for json_file in json_file_list:
pred_data = read_json(json_file)
if "imei" in json_file:
pred_value = pred_data['imei_number']
gt_value = gt_data['imei_number'][n_imei]
n_imei += 1
score = eval_ocr_metric(
[post_processing_str("imei_number", pred_value, is_gt=False)],
[post_processing_str("imei_number", gt_value, is_gt=True)],
metric=["one_minus_ned"]
)['1-N.E.D']
total.append({
"requestId": request_id,
"redemptionNumber": req_to_red[request_id],
"userSubmitResults": gt_value,
"OCRResults": pred_value,
"revisedResults_by_SDSRV": "",
"accuracy": score,
"processingTime (by request)": pred_data['processing_time'],
"class_name": "imei_number",
"file_path": json_file
})
elif "invoice" in json_file:
for class_name in ["retailername", "purchase_date"]:
pred_value = pred_data[class_name]
gt_value = gt_data[class_name]
if isinstance(gt_value, list):
gt_value = gt_value[0]
n_invoice += 1
if not isinstance(pred_value, list):
pred_value = [pred_value]
score = 0
for _pred_value in pred_value:
score1 = eval_ocr_metric(
[post_processing_str(class_name, _pred_value, is_gt=False)],
[post_processing_str(class_name, gt_value, is_gt=True)],
metric=["one_minus_ned"]
)['1-N.E.D']
score = max(score, score1)
total.append({
"requestId": request_id,
"redemptionNumber": req_to_red[request_id],
"userSubmitResults": gt_value,
"OCRResults": pred_value[0] if class_name == "retailername" else pred_value,
"revisedResults_by_SDSRV": "",
"accuracy": score,
"processingTime (by request)": pred_data['processing_time'],
"class_name": class_name,
"file_path": json_file
})
return total
def export_report(
init_csv: str,
):
df = pd.read_csv(init_csv)
for index, request in df.iterrows():
file_path = request['file_path']
class_name = request['class_name']
pred_value = request['OCRResults']
revised_value = read_json(file_path)[class_name]
if class_name == "purchase_date":
pred_value = ast.literal_eval(pred_value)
if isinstance(revised_value, list):
if len(revised_value) > 0:
revised_value = revised_value[0]
else:
revised_value = None
if len(pred_value) == 0:
pred_value = [None]
score = 0
for _pred_value in pred_value:
score1 = eval_ocr_metric(
[post_processing_str(class_name, _pred_value, is_gt=False)],
[post_processing_str(class_name, revised_value, is_gt=True)],
metric=["one_minus_ned"]
)['1-N.E.D']
score = max(score, score1)
else:
score = eval_ocr_metric(
[post_processing_str(class_name, pred_value, is_gt=False)],
[post_processing_str(class_name, revised_value, is_gt=True)],
metric=["one_minus_ned"]
)['1-N.E.D']
df.at[index, "revisedResults_by_SDSRV"] = revised_value
df.at[index, "accuracy"] = score
return df
def pick_sample_to_revise(
ocr_accuracy: list,
gt_dir: str,
save_dir: str
):
empty_err_path = os.path.join(save_dir, "empty_results")
other_err_path = os.path.join(save_dir, "diff_results")
os.makedirs(empty_err_path, exist_ok=True)
os.makedirs(other_err_path, exist_ok=True)
for request in ocr_accuracy:
score = request['accuracy']
json_path = request['file_path']
request_id = request['requestId']
img_path_folder = os.path.join(gt_dir, Path(json_path).parts[-2], Path(json_path).parts[-1])
img_path = [ff for ff in glob.glob(img_path_folder.replace(".json", ".*")) if ".json" not in ff]
if len(img_path) == 0:
print(f"[WARNING] Skipped request id {request_id}")
continue
img_path = img_path[0]
# img_path = [ff for ff in glob.glob(json_path.replace(".json", ".*"))][0]
if score == 0:
save_path = os.path.join(empty_err_path, request_id)
elif score < 1:
save_path = os.path.join(other_err_path, request_id)
else:
continue
os.makedirs(save_path, exist_ok=True)
shutil.copy(img_path, save_path)
shutil.copy(json_path, save_path)
def merge_revised_sample(
revised_path_list: list,
save_dir: str
):
if not isinstance(revised_path_list, list):
revised_path_list = [revised_path_list]
for revised_path in revised_path_list:
list_request = [os.path.basename(ff) for ff in os.listdir(revised_path)]
for request in list_request:
file_list = glob.glob(os.path.join(revised_path, request, "*.json*"))
for file_path in file_list:
# shutil.copyfile(file_path, os.path.join(save_path, request))
os.system(f"sudo cp {file_path} {os.path.join(save_dir, request)}")
def calculate_average_by_column(df, column_name):
df = df.groupby(by=["requestId"])
time_list = []
for req, sub_df in df:
if len(sub_df) > 0:
time_list.append(sub_df.iloc[0][column_name])
if len(time_list) > 0:
return sum(time_list)/len(time_list)
return 0
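# Illustrative note: the frame is grouped by requestId and only the first row of each
# request is kept, so e.g. two requests whose first rows carry column values 2.0 and
# 4.0 give an average of 3.0.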
if __name__ == "__main__":
save_path = "/mnt/hdd4T/TannedCung/OCR/Data/SBT_for_acc/15Jan"
save_csv = "logs/eval_20240115"
csv_path = "/mnt/hdd4T/TannedCung/OCR/Data/SBT_for_acc/15Jan.csv"
csv_path_end_user = "logs/eval_20240115/OCR_15Jan2024.csv"
# Step 1: Convert a csv file to get user submitted results for each request
print("[INFO] Starting convert csv from customer to json")
os.system(f"sudo chmod -R 777 {save_path}")
convert_groundtruth_from_csv(csv_path=csv_path_end_user, save_dir=save_path)
print("[INFO] Converted")
# # Step 2: Convert a csv file to get predict OCR results for each image
print("[INFO] Starting convert csv from SDSV to json")
convert_predict_from_csv(csv_path=csv_path, save_dir=save_path)
print("[INFO] Converted")
# # Step 3: Gen initial csv file and calculate OCR result between submitted results and ocr results
print("[INFO] Starting generate csv to get performance")
gt_path = save_path
pred_path = save_path
req_to_red_dict = gen_req_to_red_dict(csv_path_end_user)
init_data = init_csv(gt_dir=gt_path, pred_dir=pred_path, req_to_red=req_to_red_dict)
pd.DataFrame(init_data).to_csv(os.path.join(save_csv, "init1.csv"), index=False)
print("[INFO] Done")
# # Step 4: Split requests whose accuracy is less than 1 to revise
# print("[INFO] Starting split data to review")
# revised_path = os.path.join(save_csv, "revised")
# # shutil.rmtree(revised_path)
# pick_sample_to_revise(ocr_accuracy=init_data, gt_dir=save_path, save_dir=revised_path)
# print("[INFO] Done")
# # Step 5: Merge revised results to gt folder
# print("[INFO] Merging revised data to ground truth folder")
# revised_path = os.path.join(save_csv, "revised")
# revised_path = [f'{revised_path}/empty_results', f'{revised_path}/diff_results']
# merge_revised_sample(revised_path_list=revised_path, save_dir=save_path)
# print("Done")
# # Step 6: Caculate OCR result between ocr results and revised results
# print("[INFO] Exporting OCR report")
# init_csv_path = os.path.join(save_csv, "init1.csv")
# report = export_report(init_csv=init_csv_path)
# error_path = os.path.join(save_csv, "errors")
# pick_sample_to_revise(ocr_accuracy=report[report.accuracy < 0.75].to_dict('records'), gt_dir=save_path, save_dir=error_path)
# n_total_images = len(report)
# n_bad_images = len(report[report.accuracy < 0.75])
# average_acc = report[report.accuracy >= 0.75]['accuracy'].mean()
# print("Total requests:", len(report['requestId'].unique()))
# print("Total images:", n_total_images)
# print("No. imei images:", len(report[report.class_name == "imei_number"]))
# print("No. invoice images:", len(report[report.class_name == "retailername"]))
# print("No. bad quality images:", n_bad_images)
# print("No. valid images:", n_total_images - n_bad_images)
# print("No. per of bad quality images:", 100*n_bad_images/n_total_images)
# print("Average accuracy:", 100*average_acc)
# last_row = n_total_images
# report.at[last_row, "requestId"] = "Total requests:"
# report.at[last_row, "redemptionNumber"] = len(report['requestId'].unique())
# report.at[last_row+1, "requestId"] = "Total images:"
# report.at[last_row+1, "redemptionNumber"] = n_total_images
# report.at[last_row+2, "requestId"] = "No. imei images:"
# report.at[last_row+2, "redemptionNumber"] = len(report[report.class_name == "imei_number"])
# report.at[last_row+3, "requestId"] = "No. invoice images:"
# report.at[last_row+3, "redemptionNumber"] = len(report[report.class_name == "retailername"])
# report.at[last_row+4, "requestId"] = "No. bad quality images:"
# report.at[last_row+4, "redemptionNumber"] = n_bad_images
# report.at[last_row+5, "requestId"] = "No. valid images:"
# report.at[last_row+5, "redemptionNumber"] = n_total_images - n_bad_images
# report.at[last_row+6, "requestId"] = "No. per of bad quality images:"
# report.at[last_row+6, "redemptionNumber"] = 100*n_bad_images/n_total_images
# report.at[last_row+7, "requestId"] = "Average accuracy:"
# report.at[last_row+7, "redemptionNumber"] = 100*average_acc
# report.drop(columns=["file_path", "class_name"]).to_csv(os.path.join(save_csv, f"SBT_report_{time.strftime('%Y%m%d')}.csv"), index=False)
# print("[INFO] Done")

View File

@ -0,0 +1,201 @@
# https://stackoverflow.com/questions/774316/python-difflib-highlighting-differences-inline
import difflib
import unidecode
import os
import glob
import pandas as pd
VOWELS = 'aeouiy' + 'AEOUIY'
CONSONANTS = 'bcdfghjklmnpqrstvxwz' + 'BCDFGHJKLMNPQRSTVXWZ'
# PREDICT_PATH = 'ocr/result'
# GROUNDTRUTH_PATH = '/mnt/hdd2T/AICR/Datasets/wiki/ground_truth'
PREDICT_PATH = 'ocr/result/cinamon'
GROUNDTRUTH_PATH = '/mnt/hdd2T/AICR/Datasets/Backup/1.Hand_writing/Lines/cinnamon_data'
# note that we also use a different preprocessing step for the cinnamon data
# SAVE_PATH = 'wiki_diff'
SAVE_PATH = 'wiki_diff/cinamon'
RES_PATH = f'{SAVE_PATH}/result/'
WRONG_ACCENT_FILE = f'{SAVE_PATH}/wrong_accent.txt'
LOST_ACCENT_FILE = f'{SAVE_PATH}/lost_accent.txt'
TOTAL_WORD = 0
def write_accent_error(path, err):
# path should be wrong_accent_file or lost_accent_file
with open(path, 'a') as f:
f.write(err)
f.write('\n')
def update_ddata_specialchars(ddata_specialchars, correction_key, char_key):
if char_key in ddata_specialchars[correction_key]:
ddata_specialchars[correction_key][char_key] += 1
else:
ddata_specialchars[correction_key][char_key] = 1
def process_replace_tag(matcher, i1, i2, j1, j2, ddata, ddata_specialchars):
a_char = matcher.a[i1:i2]
b_char = matcher.b[j1:j2]
ddata['res_text'] += ' ### {' + a_char + ' -> ' + b_char + '} ### '
ddata['nwrongs'] += 1*len(b_char)
if len(a_char) == 1 and len(b_char) == 1: # single char case
if a_char.lower() == b_char.lower(): # wrong upper/lower case
ddata['UL_single'] += 1
update_ddata_specialchars(ddata_specialchars, 'UL', (a_char, b_char))
else:
ddata['nwrongs_single'] += 1
a_ori = unidecode.unidecode(a_char).lower()
b_ori = unidecode.unidecode(b_char).lower()
if a_ori in VOWELS and b_ori in VOWELS:
if a_ori == b_ori:
err = a_char + ' -> ' + b_char
if b_ori == b_char.lower(): # e.g. Ơ -> O
ddata['nlost_accent'] += 1
# write_accent_error(LOST_ACCENT_FILE, err)
else: # e.g Ơ -> Ớ
ddata['nwrong_accent'] += 1
# write_accent_error(WRONG_ACCENT_FILE, err)
else: # e.g Ă -> Â
ddata['nwrong_vowels'] += 1
else:
if a_ori in CONSONANTS and b_ori in CONSONANTS:
ddata['nwrong_consonants'] += 1
else:
ddata['nwrong_specialchars'] += 1
update_ddata_specialchars(ddata_specialchars, 'wrong', (a_char, b_char))
else:
if a_char.lower() == b_char.lower():
ddata['UL_multiple'] += 1
update_ddata_specialchars(ddata_specialchars, 'UL', (a_char, b_char))
else:
ddata['nwrongs_multiple'] += 1
if len(a_char) > 10 or len(b_char) > 10:
ddata['nlong_sequences'] += 1
# print(a_char)
def process_delete_tag(matcher, i1, i2, ddata, ddata_specialchars):
a_char = matcher.a[i1:i2]
ddata['res_text'] += ' ### {- ' + a_char + '} ### '
ddata['nadds'] += 1*len(a_char)
if len(a_char) == 1:
ddata['nadds_single'] += 1
if a_char.lower() in CONSONANTS + VOWELS:
ddata['nadds_chars'] += 1
else:
if a_char == ' ':
ddata['nadds_space'] += 1
else:
ddata['nadds_specialchars'] += 1
update_ddata_specialchars(ddata_specialchars, 'add', a_char)
else:
ddata['nadds_multiple'] += 1
if len(a_char) > 10:
ddata['nlong_sequences'] += 1
# print(a_char)
def process_insert_tag(matcher, j1, j2, ddata, ddata_specialchars):
b_char = matcher.b[j1:j2]
ddata['nlosts'] += 1*len(b_char)
ddata['res_text'] += ' ### {+ ' + b_char + '} ### '
if len(b_char) == 1:
ddata['nlosts_single'] += 1
if b_char.lower() in CONSONANTS + VOWELS:
ddata['nlosts_chars'] += 1
else:
if b_char == ' ':
ddata['nlosts_space'] += 1
else:
ddata['nlosts_specialchars'] += 1
update_ddata_specialchars(ddata_specialchars, 'lost', b_char)
else:
ddata['nlosts_multiple'] += 1
if len(b_char) > 10:
ddata['nlong_sequences'] += 1
# print(b_char)
def inline_diff(a, b, ddata_specialchars={'lost': {}, 'add': {}, 'wrong': {}, 'UL': {}}):
matcher = difflib.SequenceMatcher(None, a, b)
ddata = {'res_text': ''}
# ddata = ddata | {key: 0 for key in ['nsingle', 'nmultiple']}
ddata = ddata | {key: 0 for key in ['UL_single', 'UL_multiple']}
ddata = ddata | {
key: 0 for key in
['nlosts', 'nlosts_single', 'nlosts_multiple', 'nlosts_chars', 'nlosts_specialchars', 'nlosts_space']}
ddata = ddata | {
key: 0 for key in
['nadds', 'nadds_single', 'nadds_multiple', 'nadds_chars', 'nadds_specialchars', 'nadds_space']}
ddata = ddata | {
key: 0 for key in
['nwrongs', 'nwrongs_single', 'nwrongs_multiple', 'nwrong_accent', 'nlost_accent', 'nwrong_vowels',
'nwrong_consonants', 'nwrong_specialchars']}
ddata['nlong_sequences'] = 0
for tag, i1, i2, j1, j2 in matcher.get_opcodes():
if tag == 'replace': # wrong
process_replace_tag(matcher, i1, i2, j1, j2, ddata, ddata_specialchars)
if tag == 'delete': # OCR add char so the matcher "delete"
process_delete_tag(matcher, i1, i2, ddata, ddata_specialchars)
if tag == 'equal':
ddata['res_text'] += matcher.a[i1:i2]
if tag == 'insert': # OCR lost char so the matcher "insert"
process_insert_tag(matcher, j1, j2, ddata, ddata_specialchars)
ddata["ned"] = ddata['nwrongs'] + ddata['nadds'] + ddata['nlosts']
return ddata
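# Illustrative minimal call showing the res_text markup and the accent counters (values assumed):
#   d = inline_diff("nhà", "nhá")
#   d['res_text']                                # 'nh ### {à -> á} ### '
#   d['nwrongs'], d['nwrong_accent'], d['ned']   # 1, 1, 1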
def process_single_file(file_name, ddata_specialchars):
# read predict file
with open(os.path.join(PREDICT_PATH, file_name), 'r') as f:
predict = f.readlines()[0].strip()
# predict = ''.join(predict)
# predict = predict.replace(' ', '')
# predict = predict.replace('\n', '')
# print(predict)
# read groundtruth file
with open(os.path.join(GROUNDTRUTH_PATH, file_name), 'r') as f:
gt = f.readlines()[0].strip()
# gt = ''.join(gt)
# gt = gt.replace('\n', '')
# get statistical data on the differences between the prediction and the ground truth
ddata = inline_diff(predict, gt, ddata_specialchars)
global TOTAL_WORD
TOTAL_WORD = TOTAL_WORD + len(gt.split())
# write to save_path
res_text = ddata.pop('res_text', None)
save_file = os.path.join(RES_PATH, file_name)
with open(save_file, 'w') as f:
f.write(res_text)
# generate csv file
ddata = {'file_name': save_file} | ddata
return ddata
def main(overwrite=False):
for accent_file in [WRONG_ACCENT_FILE, LOST_ACCENT_FILE]:
if os.path.exists(accent_file):
os.remove(accent_file)
lddata = []
ddata_specialchars = {'lost': {}, 'add': {}, 'wrong': {}, 'UL': {}}
for file_ in glob.glob(f'{PREDICT_PATH}/*.txt'):
file_name = file_.split('/')[-1]
ddata = process_single_file(file_name, ddata_specialchars)
lddata.append(ddata)
if overwrite:
df = pd.DataFrame(lddata)
df.to_csv(f'{SAVE_PATH}/wiki_diff.csv', sep='\t')
df_ = pd.DataFrame(ddata_specialchars)
df_.to_csv(f'{SAVE_PATH}/wiki_diff_specialchars.csv', sep='\t')
print(TOTAL_WORD)
if __name__ == '__main__':
main(overwrite=True)

View File

@ -36,7 +36,7 @@ requests==2.28.1
ruamel.yaml==0.17.21
ruamel.yaml.clib==0.2.7
sqlparse==0.4.3
tzdata==2022.7
uritemplate==4.1.1
urllib3==1.26.13
uvicorn==0.20.0
@ -50,4 +50,13 @@ boto3==1.29.7
imagesize==1.4.1
pdf2image==1.16.3
redis==5.0.1
django-celery-beat==2.5.0
terminaltables==3.1.10
rapidfuzz==3.6.1
Unidecode==1.3.8
pandas==2.2.0
openpyxl==3.1.2
# For sdsvkvu compatibility
# torch==1.13.1+cu116
# torchvision==0.14.1+cu116
# --extra-index-url https://download.pytorch.org/whl/cu116

View File

@ -0,0 +1 @@
pg_dump -U sbt -h sbt.cxetpslawu4p.ap-southeast-1.rds.amazonaws.com sbt2 >> sbt2.sql

cope2n-api/token.txt Normal file
View File

@ -0,0 +1 @@
eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJpZCI6InNidCIsImV4cGlyZWRfYXQiOiIwMS8wMi8yMDI0IDEyOjQ2OjA3IiwiaW50ZXJuYWxfaWQiOjEsInN0YXR1cyI6MSwic3Vic2NyaXB0aW9uX2lkIjoxfQ.VFsoGm5BdeyNptMsdU4f4l70bDIYHTmB8Y-2-PXs7cKhzGB1pUpgqax-V39N_IEXriRl3caDiotzU0psR0WR3g

View File

@ -83,12 +83,12 @@ services:
depends_on:
db-sbt:
condition: service_started
# command: sh -c "chmod -R 777 /app; sleep 5; python manage.py collectstatic --no-input &&
#          python manage.py makemigrations &&
#          python manage.py migrate &&
#          python manage.py compilemessages &&
#          gunicorn fwd.asgi:application -k uvicorn.workers.UvicornWorker --timeout 300 -b 0.0.0.0:9000" # pre-makemigrations on prod
command: bash -c "tail -f > /dev/null"
minio:
image: minio/minio
@ -188,6 +188,8 @@ services:
- POSTGRES_USER=${DB_USER}
- POSTGRES_PASSWORD=${DB_PASSWORD}
- POSTGRES_DB=${DB_SCHEMA}
ports:
- 5432:5432
rabbitmq-sbt:
mem_reservation: 600m

View File

@ -10,9 +10,9 @@ from dotenv import load_dotenv
load_dotenv("../.env_prod")
OUTPUT_NAME = "Jan"
START_DATE = datetime(2024, 1, 1, tzinfo=timezone('Asia/Ho_Chi_Minh'))
END_DATE = datetime(2024, 2, 1, tzinfo=timezone('Asia/Ho_Chi_Minh'))
# Database connection details
db_host = os.environ.get('DB_HOST', "")
@ -62,32 +62,32 @@ with open(csv_file_path, 'w', newline='') as csv_file:
cursor.close()
conn.close()
# # Download folders from S3
# s3_client = boto3.client(
#     's3',
#     aws_access_key_id=access_key,
#     aws_secret_access_key=secret_key
# )
# request_ids = []
# for rq in data:
#     rq_id = rq[3]
#     request_ids.append(rq_id)
# for request_id in tqdm(request_ids):
#     folder_key = f"{s3_folder_prefix}/{request_id}/" # Assuming folder structure like: s3_bucket_name/s3_folder_prefix/request_id/
#     local_folder_path = f"{OUTPUT_NAME}/{request_id}/" # Path to the local folder to save the downloaded files
#     os.makedirs(OUTPUT_NAME, exist_ok=True)
#     os.makedirs(local_folder_path, exist_ok=True)
#     # List objects in the S3 folder
#     response = s3_client.list_objects_v2(Bucket=s3_bucket_name, Prefix=folder_key)
#     objects = response.get('Contents', [])
#     for s3_object in objects:
#         object_key = s3_object['Key']
#         local_file_path = local_folder_path + object_key.split('/')[-1] # Extracting the file name from the object key
#         # Download the S3 object to the local file
#         s3_client.download_file(s3_bucket_name, object_key, local_file_path)

View File

@ -0,0 +1 @@
pg_dump -U sbt -h sbt.cxetpslawu4p.ap-southeast-1.rds.amazonaws.com sbt2 >> sbt2.sql