Merged from vietanh99, Add APIs
parent 3f524e677d
commit dd206c4a3c
@ -8,10 +8,17 @@ RUN groupadd --gid ${GID} ${USERNAME} \
&& apt-get install -y sudo bash gettext poppler-utils \
&& echo ${USERNAME} ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/${USERNAME} \
&& chmod 0440 /etc/sudoers.d/${USERNAME}
RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y
RUN yes | apt install postgresql gcc musl-dev
RUN pip install --upgrade pip
RUN pip install uvicorn gunicorn Celery

# For integration with sdsvkvu
RUN pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
RUN pip install -U openmim==0.3.7 --no-cache-dir
RUN mim install mmcv-full==1.7.2
# End integration with sdsvkvu

USER ${UID}
ADD --chown=${UID}:${GID} fwd /app
COPY --chown=${UID}:${GID} requirements.txt /app
@ -21,4 +28,27 @@ RUN pip install -r requirements.txt --no-cache-dir

COPY --chown=${UID}:${GID} . /app

RUN cd /app/fwd_api/utils/sdsvkvu/sdsvkvu/externals/sdsvocr/externals/sdsv_dewarp && pip3 install -v -e . --no-cache-dir
RUN cd /app/fwd_api/utils/sdsvkvu/sdsvkvu/externals/sdsvocr/externals/sdsvtd && pip3 install -v -e . --no-cache-dir
RUN cd /app/fwd_api/utils/sdsvkvu/sdsvkvu/externals/sdsvocr/externals/sdsvtr && pip3 install -v -e . --no-cache-dir
RUN cd /app/fwd_api/utils/sdsvkvu && pip3 install -v -e . --no-cache-dir

# For integration with sdsvkvu
RUN python -m pip install paddlepaddle-gpu==2.4.2.post116 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html --no-cache-dir

ENV TZ="Asia/Ho_Chi_Minh"


# FROM cope2n-api-base AS builder
# ARG UID=1000
# ARG GID=1000
# ARG USERNAME=container-user

# # Create a new user
# RUN groupadd --gid ${GID} ${USERNAME} \
# && useradd --uid ${UID} --gid ${GID} -m ${USERNAME} \
# && echo ${USERNAME} ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/${USERNAME} \
# && chmod 0440 /etc/sudoers.d/${USERNAME}

# WORKDIR /app
# COPY --chown=${UID}:${GID} . /app
17
cope2n-api/Dockerfile.base
Normal file
@ -0,0 +1,17 @@
FROM python:3.9.17-buster

RUN apt-get update \
&& apt-get install -y sudo bash gettext poppler-utils postgresql gcc musl-dev

COPY requirements.txt /tmp
COPY ./fwd_api/utils/sdsvkvu /app/fwd_api/utils/sdsvkvu

RUN cd /app/fwd_api/utils/sdsvkvu/sdsvkvu/externals/sdsvocr/externals/sdsv_dewarp && pip3 install -v -e . --no-cache-dir
RUN cd /app/fwd_api/utils/sdsvkvu/sdsvkvu/externals/sdsvocr/externals/sdsvtd && pip3 install -v -e . --no-cache-dir
RUN cd /app/fwd_api/utils/sdsvkvu/sdsvkvu/externals/sdsvocr/externals/sdsvtr && pip3 install -v -e . --no-cache-dir
RUN cd /app/fwd_api/utils/sdsvkvu && pip3 install -v -e . --no-cache-dir

RUN pip install --upgrade pip && pip install uvicorn gunicorn Celery
RUN pip install -r /tmp/requirements.txt --no-cache-dir

ENV TZ="Asia/Ho_Chi_Minh"
@ -3,88 +3,87 @@ from rest_framework.decorators import action
|
||||
from rest_framework.response import Response
|
||||
from django.core.paginator import Paginator
|
||||
from django.http import JsonResponse
|
||||
from datetime import datetime
|
||||
from django.utils import timezone
|
||||
from django.db.models import Q
|
||||
import uuid
|
||||
|
||||
from drf_spectacular.utils import extend_schema, OpenApiParameter, OpenApiTypes
|
||||
# from drf_spectacular.types import OpenApiString
|
||||
from ..models import SubscriptionRequest
|
||||
from ..exception.exceptions import RequiredFieldException
|
||||
|
||||
import json
|
||||
from ..exception.exceptions import InvalidException, RequiredFieldException
|
||||
from ..models import SubscriptionRequest, Report, ReportFile
|
||||
from ..utils.accuracy import shadow_report, MonthReportAccumulate
|
||||
from ..utils.file import validate_report_list
|
||||
from ..utils.process import string_to_boolean
|
||||
|
||||
def first_of_list(the_list):
|
||||
if not the_list:
|
||||
return None
|
||||
return the_list[0]
|
||||
|
||||
class AccuracyViewSet(viewsets.ViewSet):
|
||||
lookup_field = "username"
|
||||
|
||||
@extend_schema(
|
||||
parameters=[
|
||||
OpenApiParameter(
|
||||
name='start_date',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Start date (YYYY-mm-DDTHH:MM:SS)',
|
||||
type=OpenApiTypes.DATE,
|
||||
default='2023-01-02T00:00:00',
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='end_date',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='End date (YYYY-mm-DDTHH:MM:SS)',
|
||||
type=OpenApiTypes.DATE,
|
||||
default='2024-01-10T00:00:00',
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='include_test',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Whether to include test record or not',
|
||||
type=OpenApiTypes.BOOL,
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='is_reviewed',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Which records to query',
|
||||
type=OpenApiTypes.STR,
|
||||
enum=['reviewed', 'not reviewed', 'all'],
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='request_id',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Specific request id',
|
||||
type=OpenApiTypes.STR,
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='redemption_id',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Specific redemption id',
|
||||
type=OpenApiTypes.STR,
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='quality',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='One or more of [bad, good, all]',
|
||||
type=OpenApiTypes.STR,
|
||||
enum=['bad', 'good', 'all'],
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='page',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Page number',
|
||||
type=OpenApiTypes.INT,
|
||||
required=False
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='page_size',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Number of items per page',
|
||||
type=OpenApiTypes.INT,
|
||||
required=False
|
||||
),
|
||||
],
|
||||
responses=None, tags=['Accuracy']
|
||||
parameters=[
|
||||
OpenApiParameter(
|
||||
name='start_date',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Start date (YYYY-mm-DDTHH:MM:SSZ)',
|
||||
type=OpenApiTypes.DATE,
|
||||
default='2023-01-02T00:00:00+0700',
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='end_date',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='End date (YYYY-mm-DDTHH:MM:SSZ)',
|
||||
type=OpenApiTypes.DATE,
|
||||
default='2024-01-10T00:00:00+0700',
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='include_test',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Whether to include test record or not',
|
||||
type=OpenApiTypes.BOOL,
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='is_reviewed',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Which records to query',
|
||||
type=OpenApiTypes.STR,
|
||||
enum=['reviewed', 'not reviewed', 'all'],
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='request_id',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Specific request id',
|
||||
type=OpenApiTypes.STR,
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='redemption_id',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Specific redemption id',
|
||||
type=OpenApiTypes.STR,
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='page',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Page number',
|
||||
type=OpenApiTypes.INT,
|
||||
required=False
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='page_size',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Number of items per page',
|
||||
type=OpenApiTypes.INT,
|
||||
required=False
|
||||
),
|
||||
],
|
||||
responses=None, tags=['Accuracy']
|
||||
)
|
||||
@action(detail=False, url_path="request_list", methods=["GET"])
|
||||
def get_subscription_requests(self, request):
|
||||
def get_request_list(self, request):
|
||||
if request.method == 'GET':
|
||||
start_date_str = request.GET.get('start_date')
|
||||
end_date_str = request.GET.get('end_date')
|
||||
@ -94,13 +93,12 @@ class AccuracyViewSet(viewsets.ViewSet):
|
||||
redemption_id = request.GET.get('redemption_id', None)
|
||||
is_reviewed = request.GET.get('is_reviewed', None)
|
||||
include_test = request.GET.get('include_test', False)
|
||||
quality = request.GET.get('quality', None)
|
||||
|
||||
try:
|
||||
start_date = datetime.strptime(start_date_str, '%Y-%m-%dT%H:%M:%S')
|
||||
end_date = datetime.strptime(end_date_str, '%Y-%m-%dT%H:%M:%S')
|
||||
start_date = timezone.datetime.strptime(start_date_str, '%Y-%m-%dT%H:%M:%S%z')
|
||||
end_date = timezone.datetime.strptime(end_date_str, '%Y-%m-%dT%H:%M:%S%z')
|
||||
except ValueError:
|
||||
return JsonResponse({'error': 'Invalid date format. Please use YYYY-MM-DD.'}, status=400)
|
||||
raise InvalidException(excArgs="Date format")
|
||||
|
||||
base_query = Q(created_at__range=(start_date, end_date))
|
||||
if request_id:
|
||||
@ -124,13 +122,6 @@ class AccuracyViewSet(viewsets.ViewSet):
|
||||
base_query &= Q(is_reviewed=False)
|
||||
elif is_reviewed == "all":
|
||||
pass
|
||||
if isinstance(quality, str):
|
||||
if quality == "good":
|
||||
base_query &= Q(is_bad_image_quality=False)
|
||||
elif quality == "bad":
|
||||
base_query &= Q(is_bad_image_quality=True)
|
||||
elif quality == "all":
|
||||
pass
|
||||
|
||||
subscription_requests = SubscriptionRequest.objects.filter(base_query).order_by('created_at')
|
||||
|
||||
@ -185,6 +176,368 @@ class AccuracyViewSet(viewsets.ViewSet):
|
||||
|
||||
return JsonResponse({'error': 'Invalid request method.'}, status=405)
|
||||
|
||||
@extend_schema(
|
||||
parameters=[
|
||||
OpenApiParameter(
|
||||
name='is_daily_report',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Whether this report is a daily report or not',
|
||||
type=OpenApiTypes.BOOL,
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='start_date',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Start date (YYYY-mm-DDTHH:MM:SSZ)',
|
||||
type=OpenApiTypes.DATE,
|
||||
default='2023-01-02T00:00:00+0700',
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='end_date',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='End date (YYYY-mm-DDTHH:MM:SSZ)',
|
||||
type=OpenApiTypes.DATE,
|
||||
default='2024-01-10T00:00:00+0700',
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='include_test',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Whether to include test record or not',
|
||||
type=OpenApiTypes.BOOL,
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='is_reviewed',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Which records to query',
|
||||
type=OpenApiTypes.STR,
|
||||
enum=['reviewed', 'not reviewed', 'all'],
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='request_id',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Specific request id',
|
||||
type=OpenApiTypes.STR,
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='redemption_id',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Specific redemption id',
|
||||
type=OpenApiTypes.STR,
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='subsidiary',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Subsidiary',
|
||||
type=OpenApiTypes.STR,
|
||||
),
|
||||
],
|
||||
responses=None, tags=['Accuracy']
|
||||
)
|
||||
@action(detail=False, url_path="make_report", methods=["GET"])
|
||||
def make_report(self, request):
|
||||
if request.method == 'GET':
|
||||
start_date_str = request.GET.get('start_date')
|
||||
end_date_str = request.GET.get('end_date')
|
||||
request_id = request.GET.get('request_id', None)
|
||||
redemption_id = request.GET.get('redemption_id', None)
|
||||
is_reviewed = string_to_boolean(request.data.get('is_reviewed', "false"))
|
||||
include_test = string_to_boolean(request.data.get('include_test', "false"))
|
||||
subsidiary = request.GET.get("subsidiary", "all")
|
||||
is_daily_report = string_to_boolean(request.data.get('is_daily_report', "false"))
|
||||
|
||||
try:
|
||||
start_date = timezone.datetime.strptime(start_date_str, '%Y-%m-%dT%H:%M:%S%z')
|
||||
end_date = timezone.datetime.strptime(end_date_str, '%Y-%m-%dT%H:%M:%S%z')
|
||||
except ValueError:
|
||||
raise InvalidException(excArgs="Date format")
|
||||
|
||||
query_set = {"start_date_str": start_date_str,
|
||||
"end_date_str": end_date_str,
|
||||
"request_id": request_id,
|
||||
"redemption_id": redemption_id,
|
||||
"is_reviewed": is_reviewed,
|
||||
"include_test": include_test,
|
||||
"subsidiary": subsidiary,
|
||||
"is_daily_report": is_daily_report,
|
||||
}
|
||||
|
||||
report_id = "report" + "_" + timezone.datetime.now().strftime("%Y%m%d%H%M%S%z") + "_" + uuid.uuid4().hex
|
||||
new_report: Report = Report(
|
||||
report_id=report_id,
|
||||
is_daily_report=is_daily_report,
|
||||
subsidiary=subsidiary.lower().replace(" ", ""),
|
||||
include_test=include_test,
|
||||
include_reviewed=is_reviewed,
|
||||
start_at=start_date,
|
||||
end_at=end_date,
|
||||
)
|
||||
new_report.save()
|
||||
# Background job to calculate accuracy
|
||||
shadow_report(report_id, query_set)
|
||||
|
||||
return JsonResponse(status=status.HTTP_200_OK, data={"report_id": report_id})
|
||||
|
||||
@extend_schema(
|
||||
parameters=[
|
||||
OpenApiParameter(
|
||||
name='report_id',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Specific report id',
|
||||
type=OpenApiTypes.STR,
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='page',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Page number',
|
||||
type=OpenApiTypes.INT,
|
||||
required=False
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='page_size',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Number of items per page',
|
||||
type=OpenApiTypes.INT,
|
||||
required=False
|
||||
),
|
||||
],
|
||||
responses=None, tags=['Accuracy']
|
||||
)
|
||||
@action(detail=False, url_path="report_detail_list", methods=["GET"])
|
||||
def get_report_detail_list(self, request):
|
||||
if request.method == 'GET':
|
||||
report_id = request.GET.get('report_id', None)
|
||||
page_number = int(request.GET.get('page', 1))
|
||||
page_size = int(request.GET.get('page_size', 10))
|
||||
|
||||
report = Report.objects.filter(report_id=report_id).first()
|
||||
report_files = ReportFile.objects.filter(report=report)
|
||||
|
||||
paginator = Paginator(report_files, page_size)
|
||||
page = paginator.get_page(page_number)
|
||||
|
||||
data = []
|
||||
for report_file in page:
|
||||
data.append({
|
||||
"Request ID": report_file.correspond_request_id,
|
||||
"Redemption Number": report_file.correspond_redemption_id,
|
||||
"Image type": report_file.doc_type,
|
||||
"IMEI_user submitted": first_of_list(report_file.feedback_result.get("imei_number", [None])),
|
||||
"IMEI_OCR retrieved": first_of_list(report_file.predict_result.get("imei_number", [None])),
|
||||
"IMEI1 Accuracy": first_of_list(report_file.feedback_accuracy.get("imei_number", [None])),
|
||||
"Invoice_Purchase Date_Consumer": report_file.feedback_result.get("purchase_date", None),
|
||||
"Invoice_Purchase Date_OCR": report_file.predict_result.get("purchase_date", []),
|
||||
"Invoice_Purchase Date Accuracy": first_of_list(report_file.feedback_accuracy.get("purchase_date", [None])),
|
||||
"Invoice_Retailer_Consumer": report_file.feedback_result.get("retailername", None),
|
||||
"Invoice_Retailer_OCR": report_file.predict_result.get("retailername", None),
|
||||
"Invoice_Retailer Accuracy": first_of_list(report_file.feedback_accuracy.get("retailername", [None])),
|
||||
"OCR Image Accuracy": report_file.acc,
|
||||
"OCR Image Speed (seconds)": report_file.time_cost,
|
||||
"Reviewed?": "No",
|
||||
"Bad Image Reasons": report_file.bad_image_reason,
|
||||
"Countermeasures": report_file.counter_measures,
|
||||
"IMEI_Revised Accuracy": first_of_list(report_file.reviewed_accuracy.get("imei_number", [None])),
|
||||
"Purchase Date_Revised Accuracy": first_of_list(report_file.reviewed_accuracy.get("purchase_date", [None])),
|
||||
"Retailer_Revised Accuracy": first_of_list(report_file.reviewed_accuracy.get("retailername", [None])),
|
||||
})
|
||||
|
||||
response = {
|
||||
'report_detail': data,
|
||||
'page': {
|
||||
'number': page.number,
|
||||
'total_pages': page.paginator.num_pages,
|
||||
'count': page.paginator.count,
|
||||
}
|
||||
}
|
||||
return JsonResponse(response, status=200)
|
||||
|
||||
return JsonResponse({'error': 'Invalid request method.'}, status=405)
|
||||
|
||||
@extend_schema(
|
||||
parameters=[
|
||||
OpenApiParameter(
|
||||
name='start_date',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Start date (YYYY-mm-DDTHH:MM:SSZ)',
|
||||
type=OpenApiTypes.DATE,
|
||||
default='2023-01-02T00:00:00+0700',
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='end_date',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='End date (YYYY-mm-DDTHH:MM:SSZ)',
|
||||
type=OpenApiTypes.DATE,
|
||||
default='2024-01-10T00:00:00+0700',
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='daily_report_only',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Whether to return only daily reports',
|
||||
type=OpenApiTypes.BOOL,
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='page',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Page number',
|
||||
type=OpenApiTypes.INT,
|
||||
required=False
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='page_size',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Number of items per page',
|
||||
type=OpenApiTypes.INT,
|
||||
required=False
|
||||
),
|
||||
],
|
||||
responses=None, tags=['Accuracy']
|
||||
)
|
||||
@action(detail=False, url_path="report_list", methods=["GET"])
|
||||
def get_report_list(self, request):
|
||||
if request.method == 'GET':
|
||||
daily_report_only = request.GET.get('daily_report_only', False)
|
||||
start_date_str = request.GET.get('start_date', "")
|
||||
end_date_str = request.GET.get('end_date', "")
|
||||
page_number = int(request.GET.get('page', 1))
|
||||
page_size = int(request.GET.get('page_size', 10))
|
||||
|
||||
if not start_date_str or not end_date_str:
|
||||
reports = Report.objects.all()
|
||||
else:
|
||||
try:
|
||||
start_date = timezone.datetime.strptime(start_date_str, '%Y-%m-%dT%H:%M:%S%z')
|
||||
end_date = timezone.datetime.strptime(end_date_str, '%Y-%m-%dT%H:%M:%S%z')
|
||||
except ValueError:
|
||||
raise InvalidException(excArgs="Date format")
|
||||
base_query = Q(created_at__range=(start_date, end_date))
|
||||
if daily_report_only:
|
||||
base_query &= Q(is_daily_report=True)
|
||||
reports = Report.objects.filter(base_query).order_by('created_at')
|
||||
|
||||
|
||||
paginator = Paginator(reports, page_size)
|
||||
page = paginator.get_page(page_number)
|
||||
|
||||
data = []
|
||||
for report in page:
|
||||
data.append({
|
||||
"ID": report.id,
|
||||
"Created Date": report.created_at,
|
||||
"No. Requests": report.number_request,
|
||||
"Status": report.status,
|
||||
"Purchase Date Acc": report.reviewed_accuracy.get("purchase_date", None) if report.reviewed_accuracy else None,
|
||||
"Retailer Acc": report.feedback_accuracy.get("retailername", None) if report.reviewed_accuracy else None,
|
||||
"IMEI Acc": report.feedback_accuracy.get("imei_number", None) if report.reviewed_accuracy else None,
|
||||
"Avg. Accuracy": report.feedback_accuracy.get("avg", None) if report.reviewed_accuracy else None,
|
||||
"Avg. Client Request Time": report.average_client_time.get("avg", 0) if report.average_client_time else 0,
|
||||
"Avg. OCR Processing Time": report.average_OCR_time.get("avg", 0) if report.average_client_time else 0,
|
||||
"report_id": report.report_id,
|
||||
})
|
||||
|
||||
response = {
|
||||
'report_detail': data,
|
||||
'page': {
|
||||
'number': page.number,
|
||||
'total_pages': page.paginator.num_pages,
|
||||
'count': page.paginator.count,
|
||||
}
|
||||
}
|
||||
return JsonResponse(response, status=200)
|
||||
|
||||
return JsonResponse({'error': 'Invalid request method.'}, status=405)
|
||||
|
||||
@extend_schema(
|
||||
parameters=[
|
||||
OpenApiParameter(
|
||||
name='start_date',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Start date (YYYY-mm-DDTHH:MM:SSZ)',
|
||||
type=OpenApiTypes.DATE,
|
||||
default='2023-01-02T00:00:00+0700',
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='end_date',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='End date (YYYY-mm-DDTHH:MM:SSZ)',
|
||||
type=OpenApiTypes.DATE,
|
||||
default='2024-01-10T00:00:00+0700',
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='subsidiary',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Subsidiary',
|
||||
type=OpenApiTypes.STR,
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='page',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Page number',
|
||||
type=OpenApiTypes.INT,
|
||||
required=False
|
||||
),
|
||||
OpenApiParameter(
|
||||
name='page_size',
|
||||
location=OpenApiParameter.QUERY,
|
||||
description='Number of items per page',
|
||||
type=OpenApiTypes.INT,
|
||||
required=False
|
||||
),
|
||||
],
|
||||
responses=None, tags=['Accuracy']
|
||||
)
|
||||
@action(detail=False, url_path="overview", methods=["GET"])
|
||||
def overview(self, request):
|
||||
if request.method == 'GET':
|
||||
subsidiary = request.GET.get('subsidiary', None)
|
||||
start_date_str = request.GET.get('start_date', "")
|
||||
end_date_str = request.GET.get('end_date', "")
|
||||
page_number = int(request.GET.get('page', 1))
|
||||
page_size = int(request.GET.get('page_size', 10))
|
||||
|
||||
|
||||
if not start_date_str or not end_date_str:
|
||||
reports = Report.objects.all()
|
||||
else:
|
||||
try:
|
||||
start_date = timezone.datetime.strptime(start_date_str, '%Y-%m-%dT%H:%M:%S%z')
|
||||
end_date = timezone.datetime.strptime(end_date_str, '%Y-%m-%dT%H:%M:%S%z')
|
||||
except ValueError:
|
||||
raise InvalidException(excArgs="Date format")
|
||||
base_query = Q(created_at__range=(start_date, end_date))
|
||||
if subsidiary:
|
||||
base_query &= Q(subsidiary=subsidiary)
|
||||
base_query &= Q(is_daily_report=True)
|
||||
reports = Report.objects.filter(base_query).order_by('created_at')
|
||||
|
||||
paginator = Paginator(reports, page_size)
|
||||
page = paginator.get_page(page_number)
|
||||
|
||||
data = []
|
||||
this_month_report = MonthReportAccumulate()
|
||||
for report in page:
|
||||
res = this_month_report.add(report)
|
||||
if not(res):
|
||||
_, _data, total = this_month_report()
|
||||
data += [total]
|
||||
data += _data
|
||||
this_month_report = MonthReportAccumulate()
|
||||
this_month_report.add(report)
|
||||
else:
|
||||
continue
|
||||
_, _data, total = this_month_report()
|
||||
data += [total]
|
||||
data += _data
|
||||
|
||||
response = {
|
||||
'overview_data': data,
|
||||
'page': {
|
||||
'number': page.number,
|
||||
'total_pages': page.paginator.num_pages,
|
||||
'count': page.paginator.count,
|
||||
}
|
||||
}
|
||||
return JsonResponse(response, status=200)
|
||||
|
||||
return JsonResponse({'error': 'Invalid request method.'}, status=405)
|
||||
|
||||
|
||||
class RequestViewSet(viewsets.ViewSet):
|
||||
lookup_field = "username"
|
||||
|
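Usage note: the new Accuracy endpoints are plain GET routes. make_report enqueues the accuracy job and immediately returns a report_id; report_detail_list then pages through the per-file rows once the worker has written them. A minimal client sketch, assuming a hypothetical /api/ctel URL prefix and no authentication, neither of which is defined in this commit:

# Hypothetical client flow; BASE_URL and the /api/ctel prefix are assumptions.
import requests

BASE_URL = "http://localhost:9000/api/ctel"  # assumed host and URL prefix

# 1. Trigger report generation (handled asynchronously on the 'report' Celery queue).
resp = requests.get(f"{BASE_URL}/make_report/", params={
    "start_date": "2023-01-02T00:00:00+0700",
    "end_date": "2024-01-10T00:00:00+0700",
    "subsidiary": "all",
})
report_id = resp.json()["report_id"]

# 2. Page through the per-file details once the background job has filled them in.
detail = requests.get(f"{BASE_URL}/report_detail_list/", params={
    "report_id": report_id, "page": 1, "page_size": 10,
}).json()
print(detail["page"], len(detail["report_detail"]))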
@ -34,6 +34,7 @@ class CeleryConnector:
'upload_obj_to_s3': {'queue': "upload_obj_to_s3"},
'remove_local_file': {'queue': "remove_local_file"},
'csv_feedback': {'queue': "csv_feedback"},
'make_a_report': {'queue': "report"},

}
app = Celery(
@ -41,6 +42,8 @@ class CeleryConnector:
broker=settings.BROKER_URL,
broker_transport_options={'confirm_publish': False},
)
def make_a_report(self, args):
return self.send_task('make_a_report', args)
def csv_feedback(self, args):
return self.send_task('csv_feedback', args)
def do_pdf(self, args):
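The view layer hands off through shadow_report(report_id, query_set); its body is not part of this commit, but presumably it pushes the new 'make_a_report' task through this connector so the route above sends it to the 'report' queue. A sketch under that assumption (module path assumed, not confirmed by the diff):

# Assumed wiring between the API and the worker; not taken verbatim from the repo.
from fwd_api.celery_worker.client_connector import CeleryConnector  # assumed module path

def shadow_report(report_id, query_set):
    c_connector = CeleryConnector()
    # send_task('make_a_report', ...) is routed to the 'report' queue by the task_routes above
    return c_connector.make_a_report((report_id, query_set))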
149
cope2n-api/fwd_api/celery_worker/process_report_tasks.py
Normal file
@ -0,0 +1,149 @@
|
||||
import time
|
||||
import uuid
|
||||
import os
|
||||
import base64
|
||||
import traceback
|
||||
from multiprocessing.pool import ThreadPool
|
||||
|
||||
from fwd_api.models import SubscriptionRequest, UserProfile
|
||||
from fwd_api.celery_worker.worker import app
|
||||
from ..constant.common import FolderFileType, image_extensions
|
||||
from ..exception.exceptions import FileContentInvalidException
|
||||
from fwd_api.models import SubscriptionRequestFile, FeedbackRequest, Report
|
||||
from ..utils import file as FileUtils
|
||||
from ..utils import process as ProcessUtil
|
||||
from ..utils import s3 as S3Util
|
||||
from ..utils.accuracy import update_temp_accuracy, IterAvg, calculate_and_save_subcription_file
|
||||
from fwd_api.constant.common import ProcessType
|
||||
from django.utils import timezone
|
||||
from django.db.models import Q
|
||||
import csv
|
||||
import json
|
||||
|
||||
from celery.utils.log import get_task_logger
|
||||
from fwd import settings
|
||||
|
||||
|
||||
logger = get_task_logger(__name__)
|
||||
|
||||
s3_client = S3Util.MinioS3Client(
|
||||
endpoint=settings.S3_ENDPOINT,
|
||||
access_key=settings.S3_ACCESS_KEY,
|
||||
secret_key=settings.S3_SECRET_KEY,
|
||||
bucket_name=settings.S3_BUCKET_NAME
|
||||
)
|
||||
|
||||
def mean_list(l):
|
||||
l = [x for x in l if x is not None]
|
||||
if len(l) == 0:
|
||||
return 0
|
||||
return sum(l)/len(l)
|
||||
|
||||
@app.task(name='make_a_report')
|
||||
def make_a_report(report_id, query_set):
|
||||
try:
|
||||
start_date = timezone.datetime.strptime(query_set["start_date_str"], '%Y-%m-%dT%H:%M:%S%z')
|
||||
end_date = timezone.datetime.strptime(query_set["end_date_str"], '%Y-%m-%dT%H:%M:%S%z')
|
||||
base_query = Q(created_at__range=(start_date, end_date))
|
||||
if query_set["request_id"]:
|
||||
base_query &= Q(request_id=query_set["request_id"])
|
||||
if query_set["redemption_id"]:
|
||||
base_query &= Q(redemption_id=query_set["redemption_id"])
|
||||
base_query &= Q(is_test_request=False)
|
||||
if isinstance(query_set["include_test"], str):
|
||||
query_set["include_test"] = True if query_set["include_test"].lower() in ["true", "yes", "1"] else False
|
||||
if query_set["include_test"]:
|
||||
# base_query = ~base_query
|
||||
base_query.children = base_query.children[:-1]
|
||||
|
||||
elif isinstance(query_set["include_test"], bool):
|
||||
if query_set["include_test"]:
|
||||
base_query = ~base_query
|
||||
if isinstance(query_set["subsidiary"], str):
|
||||
if query_set["subsidiary"] and query_set["subsidiary"].lower().replace(" ", "")!="all":
|
||||
base_query &= Q(redemption_id__startswith=query_set["subsidiary"])
|
||||
if isinstance(query_set["is_reviewed"], str):
|
||||
if query_set["is_reviewed"] == "reviewed":
|
||||
base_query &= Q(is_reviewed=True)
|
||||
elif query_set["is_reviewed"] == "not reviewed":
|
||||
base_query &= Q(is_reviewed=False)
|
||||
# elif query_set["is_reviewed"] == "all":
|
||||
# pass
|
||||
|
||||
errors = []
|
||||
# Create a placeholder to fill
|
||||
accuracy = {"feedback" :{"imei_number": IterAvg(),
|
||||
"purchase_date": IterAvg(),
|
||||
"retailername": IterAvg(),
|
||||
"sold_to_party": IterAvg(),},
|
||||
"reviewed" :{"imei_number": IterAvg(),
|
||||
"purchase_date": IterAvg(),
|
||||
"retailername": IterAvg(),
|
||||
"sold_to_party": IterAvg(),}
|
||||
} # {"imei": {"acc": 0.1, count: 1}, ...}
|
||||
time_cost = {"invoice": IterAvg(),
|
||||
"imei": IterAvg()}
|
||||
number_images = 0
|
||||
number_bad_images = 0
|
||||
# TODO: Multithreading
|
||||
# Calculate accuracy, processing time, ....Then save.
|
||||
subscription_requests = SubscriptionRequest.objects.filter(base_query).order_by('created_at')
|
||||
report: Report = \
|
||||
Report.objects.filter(report_id=report_id).first()
|
||||
# TODO: number of transaction by doc type
|
||||
num_request = 0
|
||||
for request in subscription_requests:
|
||||
if request.status != 200 or not (request.reviewed_result or request.feedback_result):
|
||||
# Failed requests or lack of reviewed_result/feedback_result
|
||||
continue
|
||||
request_att = calculate_and_save_subcription_file(report, request)
|
||||
|
||||
request.feedback_accuracy = {"imei_number" : mean_list(request_att["acc"]["feedback"].get("imei_number", [None])),
|
||||
"purchase_date" : mean_list(request_att["acc"]["feedback"].get("purchase_date", [None])),
|
||||
"retailername" : mean_list(request_att["acc"]["feedback"].get("retailername", [None])),
|
||||
"sold_to_party" : mean_list(request_att["acc"]["feedback"].get("sold_to_party", [None]))}
|
||||
request.reviewed_accuracy = {"imei_number" : mean_list(request_att["acc"]["reviewed"].get("imei_number", [None])),
|
||||
"purchase_date" : mean_list(request_att["acc"]["reviewed"].get("purchase_date", [None])),
|
||||
"retailername" : mean_list(request_att["acc"]["reviewed"].get("retailername", [None])),
|
||||
"sold_to_party" : mean_list(request_att["acc"]["reviewed"].get("sold_to_party", [None]))}
|
||||
request.save()
|
||||
number_images += request_att["total_images"]
|
||||
number_bad_images += request_att["bad_images"]
|
||||
update_temp_accuracy(accuracy["feedback"], request_att["acc"]["feedback"], keys=["imei_number", "purchase_date", "retailername", "sold_to_party"])
|
||||
update_temp_accuracy(accuracy["reviewed"], request_att["acc"]["reviewed"], keys=["imei_number", "purchase_date", "retailername", "sold_to_party"])
|
||||
|
||||
time_cost["imei"].add(request_att["time_cost"].get("imei", []))
|
||||
time_cost["invoice"].add(request_att["time_cost"].get("invoice", []))
|
||||
|
||||
errors += request_att["err"]
|
||||
num_request += 1
|
||||
# Do saving process
|
||||
report.number_request = num_request
|
||||
report.number_images = number_images
|
||||
report.number_imei = time_cost["imei"].count
|
||||
report.number_invoice = time_cost["invoice"].count
|
||||
report.number_bad_images = number_bad_images
|
||||
report.average_OCR_time = {"invoice": time_cost["invoice"](), "imei": time_cost["imei"](),
|
||||
"invoice_count": time_cost["invoice"].count, "imei_count": time_cost["imei"].count}
|
||||
|
||||
acumulated_acc = {"feedback": {},
|
||||
"reviewed": {}}
|
||||
|
||||
for acc_type in ["feedback", "reviewed"]:
|
||||
for key in ["imei_number", "purchase_date", "retailername", "sold_to_party"]:
|
||||
acumulated_acc[acc_type][key] = accuracy[acc_type][key]()
|
||||
acumulated_acc[acc_type][key+"_count"] = accuracy[acc_type][key].count
|
||||
|
||||
report.feedback_accuracy = acumulated_acc["feedback"]
|
||||
report.reviewed_accuracy = acumulated_acc["reviewed"]
|
||||
|
||||
report.errors = "|".join(errors)
|
||||
report.save()
|
||||
except IndexError as e:
|
||||
print(e)
|
||||
traceback.print_exc()
|
||||
print("NotFound request by report id, %d", report_id)
|
||||
except Exception as e:
|
||||
print("[ERROR]: an error occured while processing report: ", report_id)
|
||||
traceback.print_exc()
|
||||
return 400
|
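For reference, the IterAvg accumulators above are called to yield a mean and expose a .count, so the JSON persisted on the Report row is a flat per-field dict. An illustrative shape with invented numbers (not real results):

# Illustrative only: the dict shape make_a_report stores per accuracy type (values invented).
example_feedback_accuracy = {
    "imei_number": 0.97, "imei_number_count": 120,
    "purchase_date": 0.88, "purchase_date_count": 95,
    "retailername": 0.91, "retailername_count": 95,
    "sold_to_party": 0.0, "sold_to_party_count": 0,
}
example_average_OCR_time = {"invoice": 4.2, "imei": 1.8, "invoice_count": 95, "imei_count": 120}
print(example_feedback_accuracy["imei_number"], example_average_OCR_time["imei"])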
@ -12,7 +12,7 @@ django.setup()
app: Celery = Celery(
'postman',
broker=settings.BROKER_URL,
include=['fwd_api.celery_worker.process_result_tasks', 'fwd_api.celery_worker.internal_task'],
include=['fwd_api.celery_worker.process_result_tasks', 'fwd_api.celery_worker.internal_task', 'fwd_api.celery_worker.process_report_tasks'],
broker_transport_options={'confirm_publish': False},
)

@ -40,6 +40,7 @@ app.conf.update({
Queue('upload_obj_to_s3'),
Queue('remove_local_file'),
Queue('csv_feedback'),
Queue('report'),

],
'task_routes': {
@ -57,6 +58,7 @@ app.conf.update({
'upload_obj_to_s3': {'queue': "upload_obj_to_s3"},
'remove_local_file': {'queue': "remove_local_file"},
'csv_feedback': {'queue': "csv_feedback"},
'make_a_report': {'queue': "report"},
}
})
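With the 'report' queue and route added, some worker has to consume that queue for make_a_report to run. Deployments normally start workers with the celery CLI; the programmatic equivalent, shown only as a sketch, is:

# Sketch: run a worker limited to the new 'report' queue (CLI equivalent: celery -A fwd_api.celery_worker.worker worker -Q report).
from fwd_api.celery_worker.worker import app

if __name__ == "__main__":
    app.worker_main(argv=["worker", "--loglevel=INFO", "-Q", "report"])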
@ -0,0 +1,71 @@
|
||||
# myapp/management/commands/mycustomcommand.py
|
||||
from django.core.management.base import BaseCommand
|
||||
from tqdm import tqdm
|
||||
from fwd_api.models import SubscriptionRequestFile, SubscriptionRequest
|
||||
from fwd_api.utils.accuracy import predict_result_to_ready
|
||||
import traceback
|
||||
import copy
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = 'Refactor database for image level'
|
||||
|
||||
def add_arguments(self, parser):
|
||||
# Add your command-line arguments here
|
||||
parser.add_argument('test', type=str, help='Value for the argument')
|
||||
|
||||
|
||||
def process_request(self, request):
|
||||
if len(request.request_id.split(".")[0].split("_")) < 2:
|
||||
return
|
||||
images = SubscriptionRequestFile.objects.filter(request=request)
|
||||
time_cost = {"imei": [], "invoice": [], "all": []}
|
||||
if request.ai_inference_profile is None:
|
||||
time_cost["imei"] = [-1 for _ in range(len(images))]
|
||||
time_cost["invoice"] = [-1]
|
||||
time_cost["all"] = [-1]
|
||||
else:
|
||||
for k, v in request.ai_inference_profile.items():
|
||||
time_cost[k.split("_")[0]].append(v["inference"][1][0] - v["inference"][0] + (v["postprocess"][1]-v["postprocess"][0]))
|
||||
for i, image in enumerate(images):
|
||||
# temp_imei_SAP_20240127223644_a493434edbf84fc08aeb87ef6cdde102_0.jpg
|
||||
try:
|
||||
image.index_in_request = int(image.file_name.split(".")[0].split("_")[-1]) if len(image.file_name.split(".")[0].split("_")) > 4 else 0
|
||||
image.doc_type = image.file_name.split(".")[0].split("_")[1] if len(image.file_name.split(".")[0].split("_")) > 4 else "all"
|
||||
image.processing_time = time_cost[image.doc_type][image.index_in_request]
|
||||
if not request.predict_result:
|
||||
raise KeyError(f"Key predict_result not found in {request.request_id}")
|
||||
if request.predict_result.get("status", 200) != 200:
|
||||
raise AttributeError(f"Failed request: {request.request_id}")
|
||||
_predict_result = copy.deepcopy(predict_result_to_ready(request.predict_result))
|
||||
_feedback_result = copy.deepcopy(request.feedback_result)
|
||||
_reviewed_result = copy.deepcopy(request.reviewed_result)
|
||||
|
||||
if image.doc_type == "invoice":
|
||||
_predict_result["imei_number"] = []
|
||||
if _feedback_result:
|
||||
_feedback_result["imei_number"] = []
|
||||
else:
|
||||
None
|
||||
if _reviewed_result:
|
||||
_reviewed_result["imei_number"] = []
|
||||
else:
|
||||
None
|
||||
else:
|
||||
_predict_result = {"retailername": None, "sold_to_party": None, "purchase_date": [], "imei_number": [_predict_result["imei_number"][image.index_in_request]]}
|
||||
_feedback_result = {"retailername": None, "sold_to_party": None, "purchase_date": None, "imei_number": [_feedback_result["imei_number"][image.index_in_request]]} if _feedback_result else None
|
||||
_reviewed_result = {"retailername": None, "sold_to_party": None, "purchase_date": None, "imei_number": [_reviewed_result["imei_number"][image.index_in_request]]} if _reviewed_result else None
|
||||
image.predict_result = _predict_result
|
||||
image.feedback_result = _feedback_result
|
||||
image.reviewed_result = _reviewed_result
|
||||
image.save()
|
||||
except Exception as e:
|
||||
self.stdout.write(self.style.ERROR(f"Request: {request.request_id} failed with {e}"))
|
||||
print(traceback.format_exc())
|
||||
continue
|
||||
|
||||
def handle(self, *args, **options):
|
||||
test = options['test']
|
||||
subcription_iter = SubscriptionRequest.objects.all()
|
||||
for request in tqdm(subcription_iter.iterator()):
|
||||
self.process_request(request)
|
||||
self.stdout.write(self.style.SUCCESS('Sample Django management command executed successfully!'))
|
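The management command above takes one positional argument (test) and walks every SubscriptionRequest, back-filling per-image results. Its header comment only shows the placeholder file name mycustomcommand.py, so the real command name may differ; a hedged invocation sketch:

# Sketch: invoking the back-fill command programmatically; command and settings names are assumptions.
import os
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "fwd.settings")  # settings module assumed
import django
django.setup()
from django.core.management import call_command

call_command("mycustomcommand", "test")  # the single positional 'test' argument is required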
@ -0,0 +1,102 @@
|
||||
# Generated by Django 4.1.3 on 2024-01-25 06:22
|
||||
|
||||
from django.db import migrations, models
|
||||
import django.utils.timezone
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('fwd_api', '0166_remove_subscriptionrequest_is_bad_image_quality_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.CreateModel(
|
||||
name='Report',
|
||||
fields=[
|
||||
('id', models.AutoField(primary_key=True, serialize=False)),
|
||||
('report_id', models.CharField(max_length=200)),
|
||||
('local_file_name', models.CharField(max_length=200)),
|
||||
('error_status', models.JSONField(null=True)),
|
||||
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
|
||||
('updated_at', models.DateTimeField(auto_now=True)),
|
||||
('start_at', models.DateTimeField(null=True)),
|
||||
('end_at', models.DateTimeField(null=True)),
|
||||
('include_for_test_sample', models.BooleanField(default=False)),
|
||||
('status', models.CharField(max_length=100)),
|
||||
('is_daily_report', models.BooleanField(default=False)),
|
||||
('errors', models.TextField(default='')),
|
||||
('S3_uploaded', models.BooleanField(default=False)),
|
||||
('number_request', models.IntegerField(default=0)),
|
||||
('number_images', models.IntegerField(default=0)),
|
||||
('number_bad_images', models.IntegerField(default=0)),
|
||||
('average_client_time_profile', models.JSONField(null=True)),
|
||||
('average_OCR_time_profile', models.JSONField(null=True)),
|
||||
('average_OCR_time', models.JSONField(null=True)),
|
||||
('average_client_time', models.JSONField(null=True)),
|
||||
('imei_accuracy', models.FloatField(default=-1)),
|
||||
('purchase_date_accuracy', models.FloatField(default=-1)),
|
||||
('retailer_name_accuracy', models.FloatField(default=-1)),
|
||||
('sold_to_party_accuracy', models.FloatField(default=-1)),
|
||||
],
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='subscriptionrequestfile',
|
||||
name='accuracy',
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='subscriptionrequest',
|
||||
name='imei_accuracy',
|
||||
field=models.FloatField(default=-1),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='subscriptionrequest',
|
||||
name='purchase_date_accuracy',
|
||||
field=models.FloatField(default=-1),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='subscriptionrequest',
|
||||
name='retailer_name_accuracy',
|
||||
field=models.FloatField(default=-1),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='subscriptionrequest',
|
||||
name='sold_to_party_accuracy',
|
||||
field=models.FloatField(default=-1),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='subscriptionrequestfile',
|
||||
name='counter_measures',
|
||||
field=models.TextField(blank=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='subscriptionrequestfile',
|
||||
name='imei_accuracy',
|
||||
field=models.FloatField(default=-1),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='subscriptionrequestfile',
|
||||
name='processing_time',
|
||||
field=models.IntegerField(default=-1),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='subscriptionrequestfile',
|
||||
name='purchase_date_accuracy',
|
||||
field=models.FloatField(default=-1),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='subscriptionrequestfile',
|
||||
name='reason',
|
||||
field=models.TextField(blank=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='subscriptionrequestfile',
|
||||
name='retailer_name_accuracy',
|
||||
field=models.FloatField(default=-1),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='subscriptionrequestfile',
|
||||
name='sold_to_party_accuracy',
|
||||
field=models.FloatField(default=-1),
|
||||
),
|
||||
]
|
@ -0,0 +1,23 @@
|
||||
# Generated by Django 4.1.3 on 2024-01-25 09:44
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('fwd_api', '0167_report_remove_subscriptionrequestfile_accuracy_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AddField(
|
||||
model_name='report',
|
||||
name='number_imei_transaction',
|
||||
field=models.IntegerField(default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='report',
|
||||
name='number_ivoice_transaction',
|
||||
field=models.IntegerField(default=0),
|
||||
),
|
||||
]
|
@ -0,0 +1,28 @@
|
||||
# Generated by Django 4.1.3 on 2024-01-25 11:17
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('fwd_api', '0168_report_number_imei_transaction_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AddField(
|
||||
model_name='report',
|
||||
name='include_reviewed',
|
||||
field=models.TextField(default=''),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='report',
|
||||
name='include_test',
|
||||
field=models.CharField(default='', max_length=200),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='report',
|
||||
name='subsidiary',
|
||||
field=models.TextField(default=''),
|
||||
),
|
||||
]
|
@ -0,0 +1,28 @@
|
||||
# Generated by Django 4.1.3 on 2024-01-25 11:19
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('fwd_api', '0169_report_include_reviewed_report_include_test_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='report',
|
||||
name='errors',
|
||||
field=models.TextField(default='', null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='report',
|
||||
name='include_reviewed',
|
||||
field=models.TextField(default='', null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='report',
|
||||
name='subsidiary',
|
||||
field=models.TextField(default='', null=True),
|
||||
),
|
||||
]
|
@ -0,0 +1,112 @@
|
||||
# Generated by Django 4.1.3 on 2024-01-28 08:11
|
||||
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
import django.utils.timezone
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('fwd_api', '0170_alter_report_errors_alter_report_include_reviewed_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RenameField(
|
||||
model_name='report',
|
||||
old_name='imei_accuracy',
|
||||
new_name='imei_accuracy_ocr',
|
||||
),
|
||||
migrations.RenameField(
|
||||
model_name='report',
|
||||
old_name='purchase_date_accuracy',
|
||||
new_name='imei_accuracy_revised',
|
||||
),
|
||||
migrations.RenameField(
|
||||
model_name='report',
|
||||
old_name='retailer_name_accuracy',
|
||||
new_name='purchase_date_accuracy_ocr',
|
||||
),
|
||||
migrations.RenameField(
|
||||
model_name='report',
|
||||
old_name='sold_to_party_accuracy',
|
||||
new_name='purchase_date_accuracy_revised',
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='report',
|
||||
name='retailer_name_accuracy_ocr',
|
||||
field=models.FloatField(default=-1),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='report',
|
||||
name='retailer_name_accuracy_revised',
|
||||
field=models.FloatField(default=-1),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='report',
|
||||
name='sold_to_party_accuracy_ocr',
|
||||
field=models.FloatField(default=-1),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='report',
|
||||
name='sold_to_party_accuracy_revised',
|
||||
field=models.FloatField(default=-1),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='subscriptionrequestfile',
|
||||
name='feedback_result',
|
||||
field=models.JSONField(null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='subscriptionrequestfile',
|
||||
name='predict_result',
|
||||
field=models.JSONField(null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='subscriptionrequestfile',
|
||||
name='reviewed_result',
|
||||
field=models.JSONField(null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='subscriptionrequestfile',
|
||||
name='doc_type',
|
||||
field=models.CharField(default='', max_length=10),
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='ReportFile',
|
||||
fields=[
|
||||
('id', models.AutoField(primary_key=True, serialize=False)),
|
||||
('correspond_request_id', models.CharField(max_length=200)),
|
||||
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
|
||||
('updated_at', models.DateTimeField(auto_now=True)),
|
||||
('S3_uploaded', models.BooleanField(default=False)),
|
||||
('doc_type', models.CharField(max_length=200)),
|
||||
('imei_feedback', models.CharField(default=None, max_length=200, null=True)),
|
||||
('purchase_date_feedback', models.CharField(default=None, max_length=200, null=True)),
|
||||
('retailer_feedback', models.CharField(default=None, max_length=200, null=True)),
|
||||
('sold_to_party_feedback', models.CharField(default=None, max_length=200, null=True)),
|
||||
('imei_ocr', models.CharField(default=None, max_length=200, null=True)),
|
||||
('purchase_date_ocr', models.CharField(default=None, max_length=200, null=True)),
|
||||
('retailer_ocr', models.CharField(default=None, max_length=200, null=True)),
|
||||
('sold_to_party_ocr', models.CharField(default=None, max_length=200, null=True)),
|
||||
('imei_revised', models.CharField(default=None, max_length=200, null=True)),
|
||||
('purchase_date_revised', models.CharField(default=None, max_length=200, null=True)),
|
||||
('retailer_revised', models.CharField(default=None, max_length=200, null=True)),
|
||||
('sold_to_party_revised', models.CharField(default=None, max_length=200, null=True)),
|
||||
('imei_acc_feedback', models.FloatField(default=None, null=True)),
|
||||
('purchase_date_acc_feedback', models.FloatField(default=None, null=True)),
|
||||
('retailer_acc_feedback', models.FloatField(default=None, null=True)),
|
||||
('sold_to_party_acc_feedback', models.CharField(default=None, max_length=200, null=True)),
|
||||
('acc_feedback', models.FloatField(default=None, null=True)),
|
||||
('imei_acc_revised', models.FloatField(default=None, null=True)),
|
||||
('purchase_date_acc_revised', models.FloatField(default=None, null=True)),
|
||||
('retailer_acc_revised', models.FloatField(default=None, null=True)),
|
||||
('acc_revised', models.FloatField(default=None, null=True)),
|
||||
('time_cost', models.FloatField(default=0)),
|
||||
('is_reviewed', models.CharField(default='NA', max_length=5)),
|
||||
('bad_image_reason', models.TextField(default='')),
|
||||
('countermeasures', models.TextField(default='')),
|
||||
('report', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='files', to='fwd_api.report')),
|
||||
],
|
||||
),
|
||||
]
|
@ -0,0 +1,38 @@
|
||||
# Generated by Django 4.1.3 on 2024-01-28 09:27
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('fwd_api', '0171_rename_imei_accuracy_report_imei_accuracy_ocr_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='subscriptionrequestfile',
|
||||
name='imei_accuracy',
|
||||
field=models.FloatField(default=None, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='subscriptionrequestfile',
|
||||
name='processing_time',
|
||||
field=models.FloatField(default=-1),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='subscriptionrequestfile',
|
||||
name='purchase_date_accuracy',
|
||||
field=models.FloatField(default=None, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='subscriptionrequestfile',
|
||||
name='retailer_name_accuracy',
|
||||
field=models.FloatField(default=None, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='subscriptionrequestfile',
|
||||
name='sold_to_party_accuracy',
|
||||
field=models.FloatField(default=None, null=True),
|
||||
),
|
||||
]
|
@ -0,0 +1,226 @@
|
||||
# Generated by Django 4.1.3 on 2024-01-28 18:00
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('fwd_api', '0172_alter_subscriptionrequestfile_imei_accuracy_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RenameField(
|
||||
model_name='reportfile',
|
||||
old_name='countermeasures',
|
||||
new_name='counter_measures',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='report',
|
||||
name='imei_accuracy_ocr',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='report',
|
||||
name='imei_accuracy_revised',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='report',
|
||||
name='purchase_date_accuracy_ocr',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='report',
|
||||
name='purchase_date_accuracy_revised',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='report',
|
||||
name='retailer_name_accuracy_ocr',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='report',
|
||||
name='retailer_name_accuracy_revised',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='report',
|
||||
name='sold_to_party_accuracy_ocr',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='report',
|
||||
name='sold_to_party_accuracy_revised',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='reportfile',
|
||||
name='acc_feedback',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='reportfile',
|
||||
name='acc_revised',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='reportfile',
|
||||
name='imei_acc_feedback',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='reportfile',
|
||||
name='imei_acc_revised',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='reportfile',
|
||||
name='imei_feedback',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='reportfile',
|
||||
name='imei_ocr',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='reportfile',
|
||||
name='imei_revised',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='reportfile',
|
||||
name='purchase_date_acc_feedback',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='reportfile',
|
||||
name='purchase_date_acc_revised',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='reportfile',
|
||||
name='purchase_date_feedback',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='reportfile',
|
||||
name='purchase_date_ocr',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='reportfile',
|
||||
name='purchase_date_revised',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='reportfile',
|
||||
name='retailer_acc_feedback',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='reportfile',
|
||||
name='retailer_acc_revised',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='reportfile',
|
||||
name='retailer_feedback',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='reportfile',
|
||||
name='retailer_ocr',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='reportfile',
|
||||
name='retailer_revised',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='reportfile',
|
||||
name='sold_to_party_acc_feedback',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='reportfile',
|
||||
name='sold_to_party_feedback',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='reportfile',
|
||||
name='sold_to_party_ocr',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='reportfile',
|
||||
name='sold_to_party_revised',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='subscriptionrequest',
|
||||
name='imei_accuracy',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='subscriptionrequest',
|
||||
name='purchase_date_accuracy',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='subscriptionrequest',
|
||||
name='retailer_name_accuracy',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='subscriptionrequest',
|
||||
name='sold_to_party_accuracy',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='subscriptionrequestfile',
|
||||
name='imei_accuracy',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='subscriptionrequestfile',
|
||||
name='purchase_date_accuracy',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='subscriptionrequestfile',
|
||||
name='retailer_name_accuracy',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='subscriptionrequestfile',
|
||||
name='sold_to_party_accuracy',
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='report',
|
||||
name='feedback_accuracy',
|
||||
field=models.JSONField(null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='report',
|
||||
name='reviewed_accuracy',
|
||||
field=models.JSONField(null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='reportfile',
|
||||
name='error',
|
||||
field=models.TextField(default=''),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='reportfile',
|
||||
name='feedback_accuracy',
|
||||
field=models.JSONField(null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='reportfile',
|
||||
name='feedback_result',
|
||||
field=models.JSONField(null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='reportfile',
|
||||
name='predict_result',
|
||||
field=models.JSONField(null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='reportfile',
|
||||
name='reviewed_accuracy',
|
||||
field=models.JSONField(null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='reportfile',
|
||||
name='reviewed_result',
|
||||
field=models.JSONField(null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='subscriptionrequest',
|
||||
name='feedback_accuracy',
|
||||
field=models.JSONField(null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='subscriptionrequest',
|
||||
name='reviewed_accuracy',
|
||||
field=models.JSONField(null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='subscriptionrequestfile',
|
||||
name='feedback_accuracy',
|
||||
field=models.JSONField(null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='subscriptionrequestfile',
|
||||
name='reviewed_accuracy',
|
||||
field=models.JSONField(null=True),
|
||||
),
|
||||
]
|
@ -0,0 +1,28 @@
|
||||
# Generated by Django 4.1.3 on 2024-01-29 05:59
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('fwd_api', '0173_rename_countermeasures_reportfile_counter_measures_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AddField(
|
||||
model_name='reportfile',
|
||||
name='acc',
|
||||
field=models.FloatField(default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='reportfile',
|
||||
name='correspond_redemption_id',
|
||||
field=models.CharField(default='', max_length=200),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='reportfile',
|
||||
name='correspond_request_id',
|
||||
field=models.CharField(default='', max_length=200),
|
||||
),
|
||||
]
|
@ -0,0 +1,28 @@
|
||||
# Generated by Django 4.1.3 on 2024-01-30 12:29
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('fwd_api', '0174_reportfile_acc_reportfile_correspond_redemption_id_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RenameField(
|
||||
model_name='report',
|
||||
old_name='number_ivoice_transaction',
|
||||
new_name='number_imei',
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='report',
|
||||
name='number_invoice',
|
||||
field=models.IntegerField(default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='report',
|
||||
name='number_invoice_transaction',
|
||||
field=models.IntegerField(default=0),
|
||||
),
|
||||
]
|
@ -13,19 +13,28 @@ class Report(models.Model):
    start_at = models.DateTimeField(null=True)
    end_at = models.DateTimeField(null=True)
    include_for_test_sample = models.BooleanField(default=False)
    status = models.CharField(null=True)
    status = models.CharField(max_length=100)
    is_daily_report = models.BooleanField(default=False)
    errors = models.TextField(default="", null=True)
    subsidiary = models.TextField(default="", null=True)
    include_reviewed = models.TextField(default="", null=True)
    include_test = models.CharField(max_length=200, default="")

    # Data
    S3_uploaded = models.BooleanField(default=False)
    number_request = models.IntegerField(default=0)
    number_images = models.IntegerField(default=0)
    number_bad_images = models.IntegerField(default=0)
    average_client_time_profile = models.JSONField(default=0) # {"0.1": 100, 0.2: 200, ...}
    average_OCR_time_profile = models.JSONField(default=0) # {"0.1": 98, 0.2: 202, ...}
    average_OCR_time = models.JSONField(null=True) # {"invoice": 0.1, "imei": 0.1}
    number_imei = models.IntegerField(default=0)
    number_invoice = models.IntegerField(default=0)

    number_imei_transaction = models.IntegerField(default=0)
    number_invoice_transaction = models.IntegerField(default=0)

    average_client_time_profile = models.JSONField(null=True) # {"0.1": 100, 0.2: 200, ...} | Future feature
    average_OCR_time_profile = models.JSONField(null=True) # {"0.1": 98, 0.2: 202, ...} | Future feature
    average_OCR_time = models.JSONField(null=True) # {"invoice": 0.1, "imei": 0.1} | Future feature
    average_client_time = models.JSONField(null=True) # {"invoice": 0.1, "imei": 0.1}
    imei_accuracy = models.FloatField(default=-1)
    purchase_date_accuracy = models.FloatField(default=-1)
    retailer_name_accuracy = models.FloatField(default=-1)
    sold_to_party_accuracy = models.FloatField(default=-1)

    feedback_accuracy = models.JSONField(null=True)
    reviewed_accuracy = models.JSONField(null=True)
35
cope2n-api/fwd_api/models/ReportFile.py
Normal file
@ -0,0 +1,35 @@
from django.db import models
from django.utils import timezone
from fwd_api.models.Subscription import Subscription
from fwd_api.models.SubscriptionRequest import SubscriptionRequest
from fwd_api.models.Report import Report

class ReportFile(models.Model):
    # Metadata
    id = models.AutoField(primary_key=True)
    correspond_request_id = models.CharField(max_length=200, default="")
    correspond_redemption_id = models.CharField(max_length=200, default="")
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    updated_at = models.DateTimeField(auto_now=True)
    report = models.ForeignKey(Report, related_name="files", on_delete=models.CASCADE)

    # Data
    S3_uploaded = models.BooleanField(default=False)
    doc_type = models.CharField(max_length=200)

    predict_result = models.JSONField(null=True)
    feedback_result = models.JSONField(null=True)
    reviewed_result = models.JSONField(null=True)

    feedback_accuracy = models.JSONField(null=True)
    reviewed_accuracy = models.JSONField(null=True)
    acc = models.FloatField(default=0)

    time_cost = models.FloatField(default=0)
    is_reviewed = models.CharField(default="NA", max_length=5) # NA, No, Yes
    bad_image_reason = models.TextField(default="")
    counter_measures = models.TextField(default="")
    error = models.TextField(default="")

@ -21,10 +21,9 @@ class SubscriptionRequest(models.Model):
    updated_at = models.DateTimeField(auto_now=True)
    is_test_request = models.BooleanField(default=False)
    S3_uploaded = models.BooleanField(default=False)
    imei_accuracy = models.FloatField(default=-1)
    purchase_date_accuracy = models.FloatField(default=-1)
    retailer_name_accuracy = models.FloatField(default=-1)
    sold_to_party_accuracy = models.FloatField(default=-1)

    feedback_accuracy = models.JSONField(null=True)
    reviewed_accuracy = models.JSONField(null=True)

    ai_inference_profile = models.JSONField(null=True)
    preprocessing_time = models.FloatField(default=-1)

@ -20,12 +20,15 @@ class SubscriptionRequestFile(models.Model):
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    updated_at = models.DateTimeField(auto_now=True)
    is_bad_image_quality = models.BooleanField(default=False)
    doc_type = models.CharField(max_length=100, default="")
    index_in_request = models.IntegerField(default=0)
    processing_time = models.IntegerField(default=-1) # in milisecond
    doc_type = models.CharField(max_length=10, default="")
    index_in_request = models.IntegerField(default=0) # by doc_type
    processing_time = models.FloatField(default=-1) # in milisecond
    reason = models.TextField(blank=True)
    counter_measures = models.TextField(blank=True)
    imei_accuracy = models.FloatField(default=-1)
    purchase_date_accuracy = models.FloatField(default=-1)
    retailer_name_accuracy = models.FloatField(default=-1)
    sold_to_party_accuracy = models.FloatField(default=-1)

    predict_result = models.JSONField(null=True)
    feedback_result = models.JSONField(null=True)
    reviewed_result = models.JSONField(null=True)

    feedback_accuracy = models.JSONField(null=True)
    reviewed_accuracy = models.JSONField(null=True)
@ -6,4 +6,7 @@ from .OcrTemplateBox import OcrTemplateBox
from .PricingPlan import PricingPlan
from .Subscription import Subscription
from .FeedbackRequest import FeedbackRequest
from .Report import Report
from .ReportFile import ReportFile
417
cope2n-api/fwd_api/utils/accuracy.py
Normal file
@ -0,0 +1,417 @@
import re
from datetime import datetime

import copy
from .ocr_utils.ocr_metrics import eval_ocr_metric
from .ocr_utils.sbt_report import post_processing_str
from fwd_api.models import SubscriptionRequest, SubscriptionRequestFile, ReportFile
from ..celery_worker.client_connector import c_connector

BAD_THRESHOLD = 0.75

valid_keys = ["retailername", "sold_to_party", "purchase_date", "imei_number"]

class MonthReportAccumulate:
    def __init__(self):
        self.month = None
        self.total = {
            'subs': "+",
            'extraction_date': "Subtotal ()",
            'total_images': 0,
            'images_quality': {
                'successful': 0,
                'successful_percent': 0,
                'bad': 0,
                'bad_percent': 0
            },
            'average_accuracy_rate': {
                'imei': IterAvg(),
                'purchase_date': IterAvg(),
                'retailer_name': IterAvg()
            },
            'average_processing_time': {
                'imei': IterAvg(),
                'invoice': IterAvg()
            },
            'usage': {
                'imei':0,
                'invoice': 0
            }
        }
        self.data = []
        self.data_format = {
            'num_imei': 0,
            'num_invoice': 0,
            'total_images': 0,
            'images_quality': {
                'successful': 0,
                'successful_percent': 0,
                'bad': 0,
                'bad_percent': 0
            },
            'average_accuracy_rate': {
                'imei': 0,
                'purchase_date': 0,
                'retailer_name': 0
            },
            'average_processing_time': {
                'imei': 0,
                'invoice': 0
            },
            'usage': {
                'imei':0,
                'invoice': 0
            }
        },

    def accumulate(self, report):
        self.total["total_images"] += report.number_images
        self.total["images_quality"]["successful"] += report.number_images - report.number_bad_images
        self.total["images_quality"]["bad"] += report.number_bad_images

        if sum([report.reviewed_accuracy[x] for x in report.reviewed_accuracy.keys() if "_count" not in x]) > 0 :
            self.total["average_accuracy_rate"]["imei"].add_avg(report.reviewed_accuracy.get("imei_number", 0), report.reviewed_accuracy.get("imei_number_count", 0))
            self.total["average_accuracy_rate"]["purchase_date"].add_avg(report.reviewed_accuracy.get("purchase_date", 0), report.reviewed_accuracy.get("purchase_date_count", 0))
            self.total["average_accuracy_rate"]["retailer_name"].add_avg(report.reviewed_accuracy.get("retailername", 0), report.reviewed_accuracy.get("retailername_count", 0))
        elif sum([ report.feedback_accuracy[x] for x in report.feedback_accuracy.keys() if "_count" not in x]) > 0:
            self.total["average_accuracy_rate"]["imei"].add_avg(report.feedback_accuracy.get("imei_number", 0), report.feedback_accuracy.get("imei_number_count", 0))
            self.total["average_accuracy_rate"]["purchase_date"].add_avg(report.feedback_accuracy.get("purchase_date", 0), report.feedback_accuracy.get("purchase_date_count", 0))
            self.total["average_accuracy_rate"]["retailer_name"].add_avg(report.feedback_accuracy.get("retailername", 0), report.feedback_accuracy.get("retailername_count", 0))

        self.total["average_processing_time"]["imei"].add_avg(report.average_OCR_time.get("imei", 0), report.average_OCR_time.get("imei_count", 0))
        self.total["average_processing_time"]["invoice"].add_avg(report.average_OCR_time.get("invoice", 0), report.average_OCR_time.get("invoice_count", 0))
        self.total["usage"]["imei"] += report.number_imei_transaction
        self.total["usage"]["invoice"] += report.number_invoice_transaction

    def add(self, report):
        report_month = report.created_at.month

        if self.month is None:
            self.month = report_month
            self.total["extraction_date"] = f"Subtotal ({self.month})"
        elif self.month != report_month:
            self.total["images_quality"]["successful_percent"] += self.total["images_quality"]["successful"]/self.total["total_images"]
            self.total["images_quality"]["bad_percent"] += self.total["images_quality"]["bad"]/self.total["total_images"]
            return False # Reports from a different month, stop accumulating
        # accumulate fields
        new_data = copy.deepcopy(self.data_format)[0]
        new_data["num_imei"] = report.number_imei
        new_data["num_invoice"] = report.number_invoice
        new_data["total_images"] = report.number_images
        new_data["images_quality"]["successful"] = report.number_images - report.number_bad_images
        new_data["images_quality"]["bad"] = report.number_bad_images

        if sum([ report.reviewed_accuracy[x] for x in report.reviewed_accuracy.keys() if "_count" not in x]):
            new_data["average_accuracy_rate"]["imei"] = report.reviewed_accuracy.get("imei_number", None)
            new_data["average_accuracy_rate"]["purchase_date"] = report.reviewed_accuracy.get("purchase_date", None)
            new_data["average_accuracy_rate"]["retailer_name"] = report.reviewed_accuracy.get("retailername", None)
        elif sum([ report.feedback_accuracy[x] for x in report.feedback_accuracy.keys() if "_count" not in x]):
            new_data["average_accuracy_rate"]["imei"] = report.feedback_accuracy.get("imei_number", None)
            new_data["average_accuracy_rate"]["purchase_date"] = report.feedback_accuracy.get("purchase_date", None)
            new_data["average_accuracy_rate"]["retailer_name"] = report.feedback_accuracy.get("retailername", None)
        new_data["average_processing_time"]["imei"] = report.average_OCR_time.get("imei", 0)
        new_data["average_processing_time"]["invoice"] = report.average_OCR_time.get("invoice", 0)
        new_data["usage"]["imei"] = report.number_imei_transaction
        new_data["usage"]["invoice"] = report.number_invoice_transaction

        new_data["images_quality"]["successful_percent"] += new_data["images_quality"]["successful"]/new_data["total_images"]
        new_data["images_quality"]["bad_percent"] += new_data["images_quality"]["bad"]/new_data["total_images"]
        self.data.append(new_data)
        self.accumulate(report)
        return True

    def __call__(self):
        self.total["images_quality"]["successful_percent"] += self.total["images_quality"]["successful"]/self.total["total_images"]
        self.total["images_quality"]["bad_percent"] += self.total["images_quality"]["bad"]/self.total["total_images"]
        total = copy.deepcopy(self.total)
        total["average_accuracy_rate"]["imei"] = total["average_accuracy_rate"]["imei"]()
        total["average_accuracy_rate"]["purchase_date"] = total["average_accuracy_rate"]["purchase_date"]()
        total["average_accuracy_rate"]["retailer_name"] = total["average_accuracy_rate"]["retailer_name"]()
        total["average_processing_time"]["imei"] = total["average_processing_time"]["imei"]()
        total["average_processing_time"]["invoice"] = total["average_processing_time"]["invoice"]()
        return self.month, self.data, total

class IterAvg:
    def __init__(self, name="default"):
        self.name = name
        self.avg = 0
        self.count = 0

    def add(self, values):
        """
        Args:
            values (list[float]):
        """
        values = [x for x in values if x is not None]
        if len(values) == 0:
            return
        self.avg = (self.avg*self.count + sum(values))/(self.count+len(values))
        self.count += len(values)

    def add_avg(self, avg, count):
        if avg is None or count is None or count == 0:
            return
        self.count += count
        self.avg = (self.avg*(self.count-count) + avg*count)/(self.count)

    def __call__(self):
        return self.avg
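# Illustrative usage sketch of IterAvg, assuming only the API defined above (values are made up):
#   avg = IterAvg()
#   avg.add([0.8, 1.0])    # two raw samples -> avg() == 0.9, count == 2
#   avg.add_avg(0.6, 2)    # fold in a group of 2 samples whose mean is 0.6
#   avg()                  # -> (0.9*2 + 0.6*2) / 4 == 0.75, a weighted running mean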

def convert_datetime_format(date_string: str, is_gt=False) -> str:
    # pattern_date_string = "2023-02-28"
    input_format = "%Y-%m-%d"
    output_format = "%d/%m/%Y"
    # Validate the input date string format
    pattern = r"\d{4}-\d{2}-\d{2}"
    if re.match(pattern, date_string):
        # Convert the date string to a datetime object
        date_object = datetime.strptime(date_string, input_format)
        # Convert the datetime object to the desired output format
        formatted_date = date_object.strftime(output_format)
        return formatted_date
    return date_string
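# Illustrative examples, assuming the pattern and formats above:
#   convert_datetime_format("2023-02-28") -> "28/02/2023"
#   convert_datetime_format("28/02/2023") -> "28/02/2023"  (no match, returned unchanged)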

def predict_result_to_ready(result):
    dict_result = {"retailername": "",
                   "sold_to_party": "",
                   "purchase_date": [],
                   "imei_number": [],}
    dict_result["retailername"] = result.get("content", {}).get("document", [{}])[0].get("content", [{}])[0].get("value", None)
    dict_result["sold_to_party"] = result.get("content", {}).get("document", [{}])[0].get("content", [{}, {}])[1].get("value", None)
    dict_result["purchase_date"] = result.get("content", {}).get("document", [{}])[0].get("content", [{}, {}, {}])[2].get("value", [])
    dict_result["imei_number"] = result.get("content", {}).get("document", [{}])[0].get("content", [{}, {}, {}, {}])[3].get("value", [])
    return dict_result

def align_fine_result(ready_predict, fine_result):
    # print(f"[DEBUG]: fine_result: {fine_result}")
    # print(f"[DEBUG]: ready_predict: {ready_predict}")
    if fine_result:
        if fine_result["purchase_date"] and len(ready_predict["purchase_date"]) == 0:
            ready_predict["purchase_date"] = [None]
        if fine_result["retailername"] and not ready_predict["retailername"]:
            ready_predict["retailername"] = [None]
        fine_result["purchase_date"] = [fine_result["purchase_date"] for _ in range(len(ready_predict["purchase_date"]))]
    # else:
    #     fine_result = {}
    #     for key in ready_predict.keys():
    #         fine_result[key] = []
    #     fine_result["purchase_date"] = [None for _ in range(len(ready_predict["purchase_date"]))]
    return ready_predict, fine_result

def update_temp_accuracy(accuracy, acc, keys):
    for key in keys:
        accuracy[key].add(acc[key])
    return accuracy

def calculate_accuracy(key_name, inference, target):
    """_summary_

    Args:
        key_name (string): key to calculate accuracy on, ex: retailername
        inference (dict): result from ocr, refined to align with the target down below
        target (dict): result of type
    """
    acc = []
    data = []

    if not target or not inference:
        return acc, data
    if not isinstance(inference[key_name], list):
        if inference[key_name] is None:
            inference[key_name] = []
        else:
            inference[key_name] = [inference[key_name]]
    if not isinstance(target[key_name], list):
        if target[key_name] is None:
            target[key_name] = []
        else:
            target[key_name] = [target[key_name]]
    for i, v in enumerate(inference[key_name]):
        # TODO: target[key_name][i] is None, ""
        x = post_processing_str(key_name, inference[key_name][i], is_gt=False)
        y = post_processing_str(key_name, target[key_name][i], is_gt=True)

        score = eval_ocr_metric(
            [x],
            [y],
            metric=[
                "one_minus_ned",
                # "line_acc_ignore_case_symbol",
                # "line_acc",
                # "one_minus_ned_word",
            ])
        acc.append(list(score.values())[0])
        data.append([x, y])
    return acc, data

def calculate_avg_accuracy(acc, type, keys=[]):
    acc_list = []
    # print(f"[DEBUG]: type: {type} - acc: {acc}")
    for key in keys:
        acc_list += acc.get(type, {}).get(key, [])

    acc_list = [x for x in acc_list if x is not None]
    return sum(acc_list)/len(acc_list) if len(acc_list) > 0 else None


def calculate_and_save_subcription_file(report, request):
    request_att = {"acc": {"feedback": {"imei_number": [],
                                        "purchase_date": [],
                                        "retailername": [],
                                        "sold_to_party": [],
                                        },
                           "reviewed": {"imei_number": [],
                                        "purchase_date": [],
                                        "retailername": [],
                                        "sold_to_party": [],
                                        }},
                   "err": [],
                   "time_cost": {},
                   "total_images": 0,
                   "bad_images": 0}
    images = SubscriptionRequestFile.objects.filter(request=request)
    for image in images:
        status, att = calculate_subcription_file(image)
        if status != 200:
            continue
        image.feedback_accuracy = att["acc"]["feedback"]
        image.reviewed_accuracy = att["acc"]["reviewed"]
        image.is_bad_image_quality = att["is_bad_image"]
        image.save()
        new_report_file = ReportFile(report=report,
                                     correspond_request_id=request.request_id,
                                     correspond_redemption_id=request.redemption_id,
                                     doc_type=image.doc_type,
                                     predict_result=image.predict_result,
                                     feedback_result=image.feedback_result,
                                     reviewed_result=image.reviewed_result,
                                     feedback_accuracy=att["acc"]["feedback"],
                                     reviewed_accuracy=att["acc"]["reviewed"],
                                     acc=att["avg_acc"],
                                     time_cost=image.processing_time,
                                     bad_image_reason=image.reason,
                                     counter_measures=image.counter_measures,
                                     error="|".join(att["err"])
                                     )
        new_report_file.save()
        if request_att["time_cost"].get(image.doc_type, None):
            request_att["time_cost"][image.doc_type].append(image.processing_time)
        else:
            request_att["time_cost"][image.doc_type] = [image.processing_time]
        try:
            request_att["acc"]["feedback"]["imei_number"] += att["acc"]["feedback"]["imei_number"]
            request_att["acc"]["feedback"]["purchase_date"] += att["acc"]["feedback"]["purchase_date"]
            request_att["acc"]["feedback"]["retailername"] += att["acc"]["feedback"]["retailername"]
            request_att["acc"]["feedback"]["sold_to_party"] += att["acc"]["feedback"]["sold_to_party"]

            request_att["acc"]["reviewed"]["imei_number"] += att["acc"]["reviewed"]["imei_number"]
            request_att["acc"]["reviewed"]["purchase_date"] += att["acc"]["reviewed"]["purchase_date"]
            request_att["acc"]["reviewed"]["retailername"] += att["acc"]["reviewed"]["retailername"]
            request_att["acc"]["reviewed"]["sold_to_party"] += att["acc"]["reviewed"]["sold_to_party"]

            request_att["bad_images"] += int(att["is_bad_image"])
            request_att["total_images"] += 1
            request_att["err"] += att["err"]
        except Exception as e:
            print(e)
            continue

    return request_att


def calculate_subcription_file(subcription_request_file):
    att = {"acc": {"feedback": {},
                   "reviewed": {}},
           "err": [],
           "is_bad_image": False,
           "avg_acc": None}
    if not subcription_request_file.predict_result:
        return 400, att

    inference_result = copy.deepcopy(subcription_request_file.predict_result)
    inference_result, feedback_result = align_fine_result(inference_result, copy.deepcopy(subcription_request_file.feedback_result))
    inference_result, reviewed_result = align_fine_result(inference_result, copy.deepcopy(subcription_request_file.reviewed_result))
    # print(f"[DEBUG]: predict_result: {subcription_request_file.predict_result}")
    # print(f"[DEBUG]: inference_result: {inference_result}")
    # print(f"[DEBUG]: feedback_result: {feedback_result}")
    # print(f"[DEBUG]: reviewed_result: {reviewed_result}")

    for key_name in valid_keys:
        try:
            att["acc"]["feedback"][key_name], _ = calculate_accuracy(key_name, inference_result, feedback_result)
            att["acc"]["reviewed"][key_name], _ = calculate_accuracy(key_name, inference_result, reviewed_result)
        except Exception as e:
            att["err"].append(str(e))
        # print(f"[DEBUG]: e: {e} -key_name: {key_name}")
    avg_reviewed = calculate_avg_accuracy(att["acc"], "reviewed", ["retailername", "sold_to_party", "purchase_date", "imei_number"])
    avg_feedback = calculate_avg_accuracy(att["acc"], "feedback", ["retailername", "sold_to_party", "purchase_date", "imei_number"])
    if avg_feedback is not None or avg_reviewed is not None:
        avg_acc = max([x for x in [avg_feedback, avg_reviewed] if x is not None])
        if avg_acc < BAD_THRESHOLD:
            att["is_bad_image"] = True
        att["avg_acc"] = avg_acc
    return 200, att

def calculate_attributions(request): # for one request, return in order
    acc = {"feedback": {},
           "reviewed": {}} # {"feedback": {"retailername": [0.1], "sold_to_party":[0.9], "purchase_date":[0.6], "imei_number":[0.8]},
                           # "reviewed": {"retailername": [0.1], "sold_to_party":[0.9], "purchase_date":[0.6], "imei_number":[0.8]}}
    data = {"feedback": {},
            "reviewed": {}} # {"feedback": {"retailername": [[ocr, feedback], ...], "sold_to_party":[[ocr, feedback], ...], "purchase_date":[[ocr, feedback], ...], "imei_number":[[ocr, feedback], ...]}}
                            # {"reviewed": {"retailername": [[ocr, reviewed], ...], "sold_to_party":[[ocr, reviewed], ...], "purchase_date":[[ocr, reviewed], ...], "imei_number":[[ocr, reviewed], ...]}}
    time_cost = {} # {"imei": [0.1], "invoice": [0.1]}
    image_quality_num = [0, 0] # [good, bad]
    image_quality_num[0] = len(request.doc_type.split(","))
    error = ""

    inference_result = predict_result_to_ready(request.predict_result)
    reviewed_result = align_fine_result(inference_result, request.reviewed_result)
    feedback_result = align_fine_result(inference_result, request.feedback_result)

    # accuracy calculation
    for key_name in valid_keys:
        if isinstance(inference_result[key_name], list):
            if len(inference_result[key_name]) != len(reviewed_result.get(key_name, [])):
                error = f"Request {request.request_id} failed with different {key_name} in predict and reviewed_result"
                break
            if len(inference_result[key_name]) != len(feedback_result.get(key_name, [])):
                error = f"Request {request.request_id} failed with different {key_name} in predict and feedback_result"
                break
            # calculate accuracy for feedback result
            acc["feedback"][key_name], data["feedback"][key_name] = calculate_accuracy(key_name, inference_result, feedback_result)
            acc["reviewed"][key_name], data["reviewed"][key_name] = calculate_accuracy(key_name, inference_result, reviewed_result)
        else:
            inference_result[key_name] = [inference_result[key_name]]
            feedback_result[key_name] = [feedback_result[key_name]]
            reviewed_result[key_name] = [reviewed_result[key_name]]

            acc["feedback"][key_name], data["feedback"][key_name] = calculate_accuracy(key_name, inference_result, feedback_result)
            acc["reviewed"][key_name], data["reviewed"][key_name] = calculate_accuracy(key_name, inference_result, reviewed_result)

    acc["feedback"]["purchase_date"] = [max(acc["feedback"]["purchase_date"])] if len(acc["feedback"]["purchase_date"]) > 0 else []
    acc["reviewed"]["purchase_date"] = [max(acc["reviewed"]["purchase_date"])] if len(acc["reviewed"]["purchase_date"]) > 0 else []
    # Count for bad and total images
    avg_invoice_feedback = calculate_avg_accuracy(acc, "feedback", ["retailername", "sold_to_party", "purchase_date"])
    avg_invoice_reviewed = calculate_avg_accuracy(acc, "reviewed", ["retailername", "sold_to_party", "purchase_date"])
    if avg_invoice_feedback is not None or avg_invoice_reviewed is not None:
        if max([x for x in [avg_invoice_feedback, avg_invoice_reviewed] if x is not None]) < BAD_THRESHOLD:
            image_quality_num[1] += 1
    for i, _ in enumerate(acc["feedback"]["imei_number"]):
        if acc["feedback"]["imei_number"][i] is not None and acc["reviewed"]["imei_number"][i] is not None:
            if max([x for x in [acc["feedback"]["imei_number"][i], acc["reviewed"]["imei_number"][i]] if x is not None]) < BAD_THRESHOLD:
                image_quality_num[1] += 1
    # time cost and quality calculation
    # TODO: to be deprecated, doc_type would be in file level in the future
    try:
        for doc_type, doc_profile in request.ai_inference_profile.items():
            doc_type = doc_type.split("_")[0]
            inference_time = doc_profile["inference"][1][0] - doc_profile["inference"][0]
            postprocess_time = doc_profile["postprocess"][1] - doc_profile["postprocess"][0]
            time_cost[doc_type].append(inference_time + postprocess_time)
    except Exception as e:
        error = f"Request id {request.request_id} failed with error: {e}"

    return acc, data, time_cost, image_quality_num, error

def shadow_report(report_id, query):
    c_connector.make_a_report(
        (report_id, query))
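# Illustrative sketch of calculate_avg_accuracy, assuming only the helpers above (values are made up):
#   acc = {"feedback": {"imei_number": [1.0, 0.5], "purchase_date": [None], "retailername": [0.9]}}
#   calculate_avg_accuracy(acc, "feedback", keys=["imei_number", "purchase_date", "retailername"])
#   -> 0.8   # None scores are dropped before averaging: (1.0 + 0.5 + 0.9) / 3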
@ -6,6 +6,7 @@ import json
from PIL import Image, ExifTags
from django.core.files.uploadedfile import TemporaryUploadedFile
from django.utils import timezone

from fwd import settings
from fwd_api.constant.common import allowed_file_extensions
@ -18,10 +19,33 @@ from fwd_api.utils.image import resize
from ..celery_worker.client_connector import c_connector
import imagesize
import csv

from openpyxl import load_workbook
from openpyxl.styles import Font, Border, Side, PatternFill, NamedStyle

def validate_report_list(request):
    start_date_str = request.GET.get('start_date')
    end_date_str = request.GET.get('end_date')
    page_number = int(request.GET.get('page', 0))
    page_size = int(request.GET.get('page_size', 10))
    report_id = request.GET.get('report_id', None)

    validated_data = {}
    validated_data["start_date"] = None
    validated_data["end_date"] = None

    if len(start_date_str) > 0 and len(end_date_str) > 0:
        try:
            validated_data["start_date"] = timezone.datetime.strptime(start_date_str, '%Y-%m-%dT%H:%M:%S%z')
            validated_data["end_date"] = timezone.datetime.strptime(end_date_str, '%Y-%m-%dT%H:%M:%S%z')
        except ValueError:
            raise InvalidException(excArgs="Date format")
    validated_data["report_id"] = report_id
    validated_data["page_size"] = page_size
    validated_data["page_number"] = page_number
    if validated_data["report_id"] is None and validated_data["start_date"] is None:
        raise RequiredFieldException(excArgs="report_id, start_date, end_date")
    return validated_data

def validate_feedback_file(csv_file_path):
    required_columns = ['redemptionNumber', 'requestId', 'imeiNumber', 'imeiNumber2', 'Purchase Date', 'retailer', 'Sold to party', 'timetakenmilli']
    missing_columns = []
@ -57,7 +81,6 @@ def validate_list_file(files, max_file_num=settings.MAX_UPLOAD_FILES_IN_A_REQUES
    if total_file_size > settings.MAX_UPLOAD_FILE_SIZE_OF_A_REQUEST:
        raise LimitReachedException(excArgs=('Total size of all files', str(settings.MAX_UPLOAD_SIZE_OF_A_FILE / 1024 / 1024), 'MB'))


def validate_csv_feedback(files, max_file_num=1, min_file_num=1, file_field="csv files"):
    total_file_size = 0
    if len(files) < min_file_num:
0
cope2n-api/fwd_api/utils/ocr_utils/__init__.py
Normal file
385
cope2n-api/fwd_api/utils/ocr_utils/ocr_metrics.py
Normal file
@ -0,0 +1,385 @@
|
||||
import re
|
||||
from pathlib import Path
|
||||
from difflib import SequenceMatcher
|
||||
from terminaltables import AsciiTable
|
||||
from rapidfuzz.distance import Levenshtein
|
||||
|
||||
from .wiki_diff import inline_diff
|
||||
|
||||
|
||||
def is_type_list(x, type):
|
||||
|
||||
if not isinstance(x, list):
|
||||
return False
|
||||
|
||||
return all(isinstance(item, type) for item in x)
|
||||
|
||||
|
||||
def cal_true_positive_char(pred, gt):
|
||||
"""Calculate correct character number in prediction.
|
||||
Args:
|
||||
pred (str): Prediction text.
|
||||
gt (str): Ground truth text.
|
||||
Returns:
|
||||
true_positive_char_num (int): The true positive number.
|
||||
"""
|
||||
|
||||
all_opt = SequenceMatcher(None, pred, gt)
|
||||
true_positive_char_num = 0
|
||||
for opt, _, _, s2, e2 in all_opt.get_opcodes():
|
||||
if opt == "equal":
|
||||
true_positive_char_num += e2 - s2
|
||||
else:
|
||||
pass
|
||||
return true_positive_char_num
|
||||
|
||||
|
||||
def post_processing(text):
|
||||
"""
|
||||
- Remove special characters and extra spaces + lower case
|
||||
"""
|
||||
|
||||
text = re.sub(
|
||||
r"[^aAàÀảẢãÃáÁạẠăĂằẰẳẲẵẴắẮặẶâÂầẦẩẨẫẪấẤậẬbBcCdDđĐeEèÈẻẺẽẼéÉẹẸêÊềỀểỂễỄếẾệỆfFgGhHiIìÌỉỈĩĨíÍịỊjJkKlLmMnNoOòÒỏỎõÕóÓọỌôÔồỒổỔỗỖốỐộỘơƠờỜởỞỡỠớỚợỢpPqQrRsStTuUùÙủỦũŨúÚụỤưƯừỪửỬữỮứỨựỰvVwWxXyYỳỲỷỶỹỸýÝỵỴzZ0123456789 ]",
|
||||
" ",
|
||||
text,
|
||||
)
|
||||
text = re.sub(r"\s\s+", " ", text)
|
||||
text = text.strip()
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def count_matches(pred_texts, gt_texts, use_ignore=True):
|
||||
"""Count the various match number for metric calculation.
|
||||
Args:
|
||||
pred_texts (list[str]): Predicted text string.
|
||||
gt_texts (list[str]): Ground truth text string.
|
||||
Returns:
|
||||
match_res: (dict[str: int]): Match number used for
|
||||
metric calculation.
|
||||
"""
|
||||
match_res = {
|
||||
"gt_char_num": 0,
|
||||
"pred_char_num": 0,
|
||||
"true_positive_char_num": 0,
|
||||
"gt_word_num": 0,
|
||||
"match_word_num": 0,
|
||||
"match_word_ignore_case": 0,
|
||||
"match_word_ignore_case_symbol": 0,
|
||||
"match_kie": 0,
|
||||
"match_kie_ignore_case": 0,
|
||||
}
|
||||
# comp = re.compile('[^A-Z^a-z^0-9^\u4e00-\u9fa5]')
|
||||
# comp = re.compile('[]')
|
||||
norm_ed_sum = 0.0
|
||||
|
||||
gt_texts_for_ned_word = []
|
||||
pred_texts_for_ned_word = []
|
||||
for pred_text, gt_text in zip(pred_texts, gt_texts):
|
||||
if gt_text == pred_text:
|
||||
match_res["match_word_num"] += 1
|
||||
match_res["match_kie"] += 1
|
||||
gt_text_lower = str(gt_text).lower()
|
||||
pred_text_lower = str(pred_text).lower()
|
||||
|
||||
if gt_text_lower == pred_text_lower:
|
||||
match_res["match_word_ignore_case"] += 1
|
||||
|
||||
# gt_text_lower_ignore = comp.sub('', gt_text_lower)
|
||||
# pred_text_lower_ignore = comp.sub('', pred_text_lower)
|
||||
if use_ignore:
|
||||
gt_text_lower_ignore = post_processing(gt_text_lower)
|
||||
pred_text_lower_ignore = post_processing(pred_text_lower)
|
||||
else:
|
||||
gt_text_lower_ignore = gt_text_lower
|
||||
pred_text_lower_ignore = pred_text_lower
|
||||
|
||||
if gt_text_lower_ignore == pred_text_lower_ignore:
|
||||
match_res["match_kie_ignore_case"] += 1
|
||||
|
||||
gt_texts_for_ned_word.append(gt_text_lower_ignore.split(" "))
|
||||
pred_texts_for_ned_word.append(pred_text_lower_ignore.split(" "))
|
||||
|
||||
match_res["gt_word_num"] += 1
|
||||
|
||||
norm_ed = Levenshtein.normalized_distance(
|
||||
pred_text_lower_ignore, gt_text_lower_ignore
|
||||
)
|
||||
# if norm_ed > 0.1:
|
||||
# print(gt_text_lower_ignore, pred_text_lower_ignore, sep='\n')
|
||||
# print("-"*20)
|
||||
norm_ed_sum += norm_ed
|
||||
|
||||
# number to calculate char level recall & precision
|
||||
match_res["gt_char_num"] += len(gt_text_lower_ignore)
|
||||
match_res["pred_char_num"] += len(pred_text_lower_ignore)
|
||||
true_positive_char_num = cal_true_positive_char(
|
||||
pred_text_lower_ignore, gt_text_lower_ignore
|
||||
)
|
||||
match_res["true_positive_char_num"] += true_positive_char_num
|
||||
|
||||
normalized_edit_distance = norm_ed_sum / max(1, len(gt_texts))
|
||||
match_res["ned"] = normalized_edit_distance
|
||||
|
||||
# NED for word-level
|
||||
norm_ed_word_sum = 0.0
|
||||
# print(pred_texts_for_ned_word[0])
|
||||
unique_words = list(
|
||||
set(
|
||||
[x for line in pred_texts_for_ned_word for x in line]
|
||||
+ [x for line in gt_texts_for_ned_word for x in line]
|
||||
)
|
||||
)
|
||||
preds = [
|
||||
[unique_words.index(w) for w in pred_text_for_ned_word]
|
||||
for pred_text_for_ned_word in pred_texts_for_ned_word
|
||||
]
|
||||
truths = [
|
||||
[unique_words.index(w) for w in gt_text_for_ned_word]
|
||||
for gt_text_for_ned_word in gt_texts_for_ned_word
|
||||
]
|
||||
for pred_text, gt_text in zip(preds, truths):
|
||||
norm_ed_word = Levenshtein.normalized_distance(pred_text, gt_text)
|
||||
# if norm_ed_word < 0.2:
|
||||
# print(pred_text, gt_text)
|
||||
norm_ed_word_sum += norm_ed_word
|
||||
|
||||
normalized_edit_distance_word = norm_ed_word_sum / max(1, len(gt_texts))
|
||||
match_res["ned_word"] = normalized_edit_distance_word
|
||||
|
||||
return match_res
|
||||
|
||||
|
||||
def eval_ocr_metric(pred_texts, gt_texts, metric="acc"):
|
||||
"""Evaluate the text recognition performance with metric: word accuracy and
|
||||
1-N.E.D. See https://rrc.cvc.uab.es/?ch=14&com=tasks for details.
|
||||
Args:
|
||||
pred_texts (list[str]): Text strings of prediction.
|
||||
gt_texts (list[str]): Text strings of ground truth.
|
||||
metric (str | list[str]): Metric(s) to be evaluated. Options are:
|
||||
- 'word_acc': Accuracy at word level.
|
||||
- 'word_acc_ignore_case': Accuracy at word level, ignoring letter
|
||||
case.
|
||||
- 'word_acc_ignore_case_symbol': Accuracy at word level, ignoring
|
||||
letter case and symbol. (Default metric for academic evaluation)
|
||||
- 'char_recall': Recall at character level, ignoring
|
||||
letter case and symbol.
|
||||
- 'char_precision': Precision at character level, ignoring
|
||||
letter case and symbol.
|
||||
- 'one_minus_ned': 1 - normalized_edit_distance
|
||||
In particular, if ``metric == 'acc'``, results on all metrics above
|
||||
will be reported.
|
||||
Returns:
|
||||
dict{str: float}: Result dict for text recognition, keys could be some
|
||||
of the following: ['word_acc', 'word_acc_ignore_case',
|
||||
'word_acc_ignore_case_symbol', 'char_recall', 'char_precision',
|
||||
'1-N.E.D'].
|
||||
"""
|
||||
assert isinstance(pred_texts, list)
|
||||
assert isinstance(gt_texts, list)
|
||||
assert len(pred_texts) == len(gt_texts)
|
||||
|
||||
assert isinstance(metric, str) or is_type_list(metric, str)
|
||||
if metric == "acc" or metric == ["acc"]:
|
||||
metric = [
|
||||
"word_acc",
|
||||
"word_acc_ignore_case",
|
||||
"word_acc_ignore_case_symbol",
|
||||
"char_recall",
|
||||
"char_precision",
|
||||
"one_minus_ned",
|
||||
]
|
||||
metric = set([metric]) if isinstance(metric, str) else set(metric)
|
||||
|
||||
# supported_metrics = set([
|
||||
# 'word_acc', 'word_acc_ignore_case', 'word_acc_ignore_case_symbol',
|
||||
# 'char_recall', 'char_precision', 'one_minus_ned', 'one_minust_ned_word'
|
||||
# ])
|
||||
# assert metric.issubset(supported_metrics)
|
||||
|
||||
match_res = count_matches(pred_texts, gt_texts)
|
||||
eps = 1e-8
|
||||
eval_res = {}
|
||||
|
||||
if "char_recall" in metric:
|
||||
char_recall = (
|
||||
1.0 * match_res["true_positive_char_num"] / (eps + match_res["gt_char_num"])
|
||||
)
|
||||
eval_res["char_recall"] = char_recall
|
||||
|
||||
if "char_precision" in metric:
|
||||
char_precision = (
|
||||
1.0
|
||||
* match_res["true_positive_char_num"]
|
||||
/ (eps + match_res["pred_char_num"])
|
||||
)
|
||||
eval_res["char_precision"] = char_precision
|
||||
|
||||
if "word_acc" in metric:
|
||||
word_acc = 1.0 * match_res["match_word_num"] / (eps + match_res["gt_word_num"])
|
||||
eval_res["word_acc"] = word_acc
|
||||
|
||||
if "word_acc_ignore_case" in metric:
|
||||
word_acc_ignore_case = (
|
||||
1.0 * match_res["match_word_ignore_case"] / (eps + match_res["gt_word_num"])
|
||||
)
|
||||
eval_res["word_acc_ignore_case"] = word_acc_ignore_case
|
||||
|
||||
if "word_acc_ignore_case_symbol" in metric:
|
||||
word_acc_ignore_case_symbol = (
|
||||
1.0
|
||||
* match_res["match_word_ignore_case_symbol"]
|
||||
/ (eps + match_res["gt_word_num"])
|
||||
)
|
||||
eval_res["word_acc_ignore_case_symbol"] = word_acc_ignore_case_symbol
|
||||
|
||||
if "one_minus_ned" in metric:
|
||||
|
||||
eval_res["1-N.E.D"] = 1.0 - match_res["ned"]
|
||||
|
||||
if "one_minus_ned_word" in metric:
|
||||
|
||||
eval_res["1-N.E.D_word"] = 1.0 - match_res["ned_word"]
|
||||
|
||||
if "line_acc_ignore_case_symbol" in metric:
|
||||
line_acc_ignore_case_symbol = (
|
||||
1.0 * match_res["match_kie_ignore_case"] / (eps + match_res["gt_word_num"])
|
||||
)
|
||||
eval_res["line_acc_ignore_case_symbol"] = line_acc_ignore_case_symbol
|
||||
|
||||
if "line_acc" in metric:
|
||||
word_acc_ignore_case_symbol = (
|
||||
1.0 * match_res["match_kie"] / (eps + match_res["gt_word_num"])
|
||||
)
|
||||
eval_res["line_acc"] = word_acc_ignore_case_symbol
|
||||
|
||||
for key, value in eval_res.items():
|
||||
eval_res[key] = float("{:.4f}".format(value))
|
||||
|
||||
return eval_res
|
||||
|
||||
|
||||
def eval_kie(preds_e2e: dict[str, dict[str, str]], gt_e2e: dict[str, dict[str, str]], labels, skip_labels=[]):
|
||||
|
||||
results = {label: 1 for label in labels}
|
||||
pred_texts_dict = {label: [] for label in labels}
|
||||
gt_texts_dict = {label: [] for label in labels}
|
||||
fail_cases = {}
|
||||
for img_id in gt_e2e.keys():
|
||||
fail_cases[img_id] = {}
|
||||
pred_items = preds_e2e.get(img_id, {k: '' for k in gt_e2e[img_id]})
|
||||
gt_items = gt_e2e[img_id]
|
||||
|
||||
for class_name, text_gt in gt_items.items():
|
||||
if class_name in skip_labels:
|
||||
continue
|
||||
# if class_name == 'seller_name_value':
|
||||
# print(gt_items)
|
||||
if class_name not in pred_items:
|
||||
text_pred = ""
|
||||
else:
|
||||
text_pred = pred_items[class_name]
|
||||
|
||||
if str(text_pred) != str(text_gt):
|
||||
diff = inline_diff(text_pred, text_gt)
|
||||
fail_cases[img_id][class_name] = {
|
||||
'pred': text_pred,
|
||||
'gt': text_gt,
|
||||
"diff": diff['res_text'],
|
||||
"ned": diff["ned"],
|
||||
"score": eval_ocr_metric([text_pred], [text_gt], metric=[
|
||||
"one_minus_ned"])["1-N.E.D"],
|
||||
}
|
||||
|
||||
pred_texts_dict[class_name].append(text_pred)
|
||||
gt_texts_dict[class_name].append(text_gt)
|
||||
|
||||
for class_name in labels:
|
||||
pred_texts = pred_texts_dict[class_name]
|
||||
gt_texts = gt_texts_dict[class_name]
|
||||
result = eval_ocr_metric(
|
||||
pred_texts,
|
||||
gt_texts,
|
||||
metric=[
|
||||
"one_minus_ned",
|
||||
"line_acc_ignore_case_symbol",
|
||||
"line_acc",
|
||||
"one_minus_ned_word",
|
||||
],
|
||||
)
|
||||
results[class_name] = {
|
||||
"1-ned": result["1-N.E.D"],
|
||||
"1-ned-word": result["1-N.E.D_word"],
|
||||
"line_acc": result["line_acc"],
|
||||
"line_acc_ignore_case_symbol": result["line_acc_ignore_case_symbol"],
|
||||
"samples": len(pred_texts),
|
||||
}
|
||||
|
||||
# avg reusults
|
||||
sum_1_ned = sum(
|
||||
[
|
||||
results[class_name]["1-ned"] * results[class_name]["samples"]
|
||||
for class_name in labels
|
||||
]
|
||||
)
|
||||
sum_1_ned_word = sum(
|
||||
[
|
||||
results[class_name]["1-ned-word"] * results[class_name]["samples"]
|
||||
for class_name in labels
|
||||
]
|
||||
)
|
||||
|
||||
sum_line_acc = sum(
|
||||
[
|
||||
results[class_name]["line_acc"] * results[class_name]["samples"]
|
||||
for class_name in labels
|
||||
]
|
||||
)
|
||||
sum_line_acc_ignore_case_symbol = sum(
|
||||
[
|
||||
results[class_name]["line_acc_ignore_case_symbol"]
|
||||
* results[class_name]["samples"]
|
||||
for class_name in labels
|
||||
]
|
||||
)
|
||||
|
||||
total_samples = sum(
|
||||
[results[class_name]["samples"] for class_name in labels]
|
||||
)
|
||||
results["avg_all"] = {
|
||||
"1-ned": round(sum_1_ned / total_samples, 4),
|
||||
"1-ned-word": round(sum_1_ned_word / total_samples, 4),
|
||||
"line_acc": round(sum_line_acc / total_samples, 4),
|
||||
"line_acc_ignore_case_symbol": round(
|
||||
sum_line_acc_ignore_case_symbol / total_samples, 4
|
||||
),
|
||||
"samples": total_samples,
|
||||
}
|
||||
|
||||
table_data = [
|
||||
[
|
||||
"class_name",
|
||||
"1-NED",
|
||||
"1-N.E.D_word",
|
||||
"line-acc",
|
||||
"line_acc_ignore_case_symbol",
|
||||
"#samples",
|
||||
]
|
||||
]
|
||||
for class_name in results.keys():
|
||||
# if c < p.shape[0]:
|
||||
table_data.append(
|
||||
[
|
||||
class_name,
|
||||
results[class_name]["1-ned"],
|
||||
results[class_name]["1-ned-word"],
|
||||
results[class_name]["line_acc"],
|
||||
results[class_name]["line_acc_ignore_case_symbol"],
|
||||
results[class_name]["samples"],
|
||||
]
|
||||
)
|
||||
|
||||
table = AsciiTable(table_data)
|
||||
print(table.table)
|
||||
return results, fail_cases
|
432
cope2n-api/fwd_api/utils/ocr_utils/sbt_report.py
Normal file
@ -0,0 +1,432 @@
|
||||
import os
|
||||
import re
|
||||
import ast
|
||||
import time
|
||||
import json
|
||||
import glob
|
||||
import shutil
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from .ocr_metrics import eval_ocr_metric
|
||||
|
||||
import sys
|
||||
# sys.path.append(os.path.dirname(__file__))
|
||||
from sdsvkvu.utils.query.sbt_v2 import get_seller, post_process_seller
|
||||
|
||||
|
||||
def read_json(file_path: str):
|
||||
with open(file_path, 'r') as f:
|
||||
return json.load(f)
|
||||
|
||||
def write_to_json(file_path, content):
|
||||
with open(file_path, mode='w', encoding='utf8') as f:
|
||||
json.dump(content, f, ensure_ascii=False)
|
||||
|
||||
|
||||
def convert_datetime_format(date_string: str, is_gt=False) -> str:
|
||||
# pattern_date_string = "2023-02-28"
|
||||
output_format = "%Y-%m-%d"
|
||||
input_format = "%d/%m/%Y"
|
||||
# Validate the input date string format
|
||||
pattern = r"\d{2}\/\d{2}\/\d{4}"
|
||||
if re.match(pattern, date_string):
|
||||
# Convert the date string to a datetime object
|
||||
date_object = datetime.strptime(date_string, input_format)
|
||||
# Convert the datetime object to the desired output format
|
||||
formatted_date = date_object.strftime(output_format)
|
||||
return formatted_date
|
||||
return date_string
|
||||
|
||||
|
||||
def normalise_retailer_name(retailer: str):
|
||||
input_value = {
|
||||
"text": retailer,
|
||||
"id": 0,
|
||||
"class": "seller",
|
||||
"bbox": [0, 0, 0, 0],
|
||||
}
|
||||
output = get_seller({'seller': [input_value]})
|
||||
|
||||
norm_seller_name = post_process_seller(output)
|
||||
return norm_seller_name
|
||||
|
||||
|
||||
def post_processing_str(class_name: str, s: str, is_gt: bool) -> str:
|
||||
s = str(s).replace('✪', ' ').strip()
|
||||
if s.lower() in ['null', 'nan', "none"]:
|
||||
return ''
|
||||
if class_name == "purchase_date" and is_gt == True:
|
||||
s = convert_datetime_format(s)
|
||||
if class_name == "retailername":
|
||||
s = normalise_retailer_name(s)
|
||||
return s
|
||||
|
||||
|
||||
def convert_groundtruth_from_csv(
|
||||
csv_path: str,
|
||||
save_dir: str,
|
||||
classes: list = ["retailername", "sold_to_party", "purchase_date", "imei_number"]
|
||||
):
|
||||
# if isinstance(csv_path_list, str):
|
||||
# csv_path_list = [csv_path_list]
|
||||
|
||||
df = pd.read_csv(csv_path)
|
||||
|
||||
total_output = {}
|
||||
for _, request in df.iterrows():
|
||||
req_id = request['requestId']
|
||||
|
||||
if req_id not in total_output:
|
||||
total_output[req_id] = {k: None for k in classes}
|
||||
total_output[req_id]["imei_number"] = []
|
||||
|
||||
total_output[req_id]["imei_number"].extend([request["imeiNumber"], request["imeiNumber2"]])
|
||||
total_output[req_id]["imei_number"] = list(set(total_output[req_id]["imei_number"]))
|
||||
|
||||
total_output[req_id]["purchase_date"] = request["Purchase Date"]
|
||||
total_output[req_id]["retailername"] = request["retailer"]
|
||||
|
||||
for req_id, output in total_output.items():
|
||||
save_path = os.path.join(save_dir, req_id)
|
||||
os.makedirs(save_path, exist_ok=True)
|
||||
write_to_json(os.path.join(save_path, f"{req_id}.json"), output)
|
||||
|
||||
|
||||
def convert_predict_from_csv(
|
||||
csv_path: str,
|
||||
save_dir: str,
|
||||
classes: list = ["retailername", "sold_to_party", "purchase_date", "imei_number"]
|
||||
):
|
||||
# if isinstance(csv_path_list, str):
|
||||
# csv_path_list = [csv_path_list]
|
||||
|
||||
df = pd.read_csv(csv_path)
|
||||
|
||||
for _, request in df.iterrows():
|
||||
n_pages = request['pages']
|
||||
req_id = request['request_id']
|
||||
if not isinstance(request['doc_type'], str) or not isinstance(request['predict_result'], str):
|
||||
print(f"[WARNING] Skipped request id {req_id}")
|
||||
continue
|
||||
|
||||
doc_type_list = request['doc_type'].split(',')
|
||||
assert n_pages == len(doc_type_list), \
|
||||
"No. pages is different no. documents"
|
||||
|
||||
json_path = os.path.join(save_dir, req_id)
|
||||
os.makedirs(json_path, exist_ok=True)
|
||||
|
||||
# For user_submitted_results
|
||||
if "feedback_result" in request:
|
||||
feedback_data = ast.literal_eval(request['feedback_result'])
|
||||
fname = f"{req_id}.json"
|
||||
write_to_json(os.path.join(json_path, fname), feedback_data)
|
||||
|
||||
# For predict_results
|
||||
data = ast.literal_eval(request['predict_result'])['content']['document'][0]['content']
|
||||
infer_time = float(request['ai_inference_time']) + float(request['preprocessing_time']) + 0.1
|
||||
|
||||
n_imei, n_invoice = 0, 0
|
||||
for doc_type in doc_type_list:
|
||||
output = {k: None for k in classes}
|
||||
if not os.path.exists(json_path):
|
||||
os.makedirs(json_path, exist_ok=True)
|
||||
|
||||
if doc_type == "imei":
|
||||
for info in data:
|
||||
if info['label'] == "imei_number":
|
||||
output['imei_number'] = info['value'][n_imei]
|
||||
output['processing_time'] = infer_time
|
||||
fname = f"temp_{doc_type}_{req_id}_{n_imei}.json"
|
||||
write_to_json(os.path.join(json_path, fname), output)
|
||||
n_imei += 1
|
||||
break
|
||||
elif doc_type == "invoice":
|
||||
for info in data:
|
||||
if info['label'] == "imei_number":
|
||||
continue
|
||||
output[info['label']] = info['value']
|
||||
output['processing_time'] = infer_time
|
||||
fname = f"temp_{doc_type}_{req_id}_{n_invoice}.json"
|
||||
write_to_json(os.path.join(json_path, fname), output)
|
||||
n_invoice += 1
|
||||
|
||||
|
||||
def gen_req_to_red_dict(csv_path: str):
|
||||
df = pd.read_csv(csv_path)
|
||||
df = df.loc[:, ["requestId", "redemptionNumber"]]
|
||||
req_to_red = {row["requestId"]: row["redemptionNumber"] for _, row in df.iterrows()}
|
||||
return req_to_red
|
||||
|
||||
|
||||
def gen_req_to_red_dict_2(csv_path: str):
|
||||
df = pd.read_csv(csv_path)
|
||||
df = df.loc[:, ["request_id", "redemption_id"]]
|
||||
req_to_red = {row["request_id"]: row["redemption_id"] for _, row in df.iterrows()}
|
||||
return req_to_red
|
||||
|
||||
|
||||
def init_csv(
|
||||
gt_dir: str,
|
||||
pred_dir: str,
|
||||
req_to_red: dict,
|
||||
):
|
||||
list_request_id = os.listdir(gt_dir)
|
||||
total = []
|
||||
for request_id in list_request_id:
|
||||
gt_path = os.path.join(gt_dir, request_id, request_id+".json")
|
||||
if not os.path.exists(gt_path):
|
||||
print(f"[WARNING] Skipped request id {os.path.basename(os.path.dirname(gt_path))}")
|
||||
continue
|
||||
gt_data = read_json(gt_path)
|
||||
json_file_list = glob.glob(os.path.join(pred_dir, request_id, "temp_*.json"))
|
||||
json_file_list = sorted(json_file_list, key=lambda x: int(x.split(".json")[0].split('_')[-1]))
|
||||
n_imei, n_invoice = 0, 0
|
||||
# if len(json_file_list) > 3:
|
||||
# continue
|
||||
|
||||
for json_file in json_file_list:
|
||||
pred_data = read_json(json_file)
|
||||
if "imei" in json_file:
|
||||
pred_value = pred_data['imei_number']
|
||||
gt_value = gt_data['imei_number'][n_imei]
|
||||
n_imei += 1
|
||||
score = eval_ocr_metric(
|
||||
[post_processing_str("imei_number", pred_value, is_gt=False)],
|
||||
[post_processing_str("imei_number", gt_value, is_gt=True)],
|
||||
metric=["one_minus_ned"]
|
||||
)['1-N.E.D']
|
||||
|
||||
total.append({
|
||||
"requestId": request_id,
|
||||
"redemptionNumber": req_to_red[request_id],
|
||||
"userSubmitResults": gt_value,
|
||||
"OCRResults": pred_value,
|
||||
"revisedResults_by_SDSRV": "",
|
||||
"accuracy": score,
|
||||
"processingTime (by request)": pred_data['processing_time'],
|
||||
"class_name": "imei_number",
|
||||
"file_path": json_file
|
||||
})
|
||||
|
||||
elif "invoice" in json_file:
|
||||
for class_name in ["retailername", "purchase_date"]:
|
||||
pred_value = pred_data[class_name]
|
||||
gt_value = gt_data[class_name]
|
||||
if isinstance(gt_value, list):
|
||||
gt_value = gt_value[0]
|
||||
n_invoice += 1
|
||||
|
||||
if not isinstance(pred_value, list):
|
||||
pred_value = [pred_value]
|
||||
|
||||
score = 0
|
||||
for _pred_value in pred_value:
|
||||
score1 = eval_ocr_metric(
|
||||
[post_processing_str(class_name, _pred_value, is_gt=False)],
|
||||
[post_processing_str(class_name, gt_value, is_gt=True)],
|
||||
metric=["one_minus_ned"]
|
||||
)['1-N.E.D']
|
||||
score = max(score, score1)
|
||||
|
||||
total.append({
|
||||
"requestId": request_id,
|
||||
"redemptionNumber": req_to_red[request_id],
|
||||
"userSubmitResults": gt_value,
|
||||
"OCRResults": pred_value[0] if class_name == "retailername" else pred_value,
|
||||
"revisedResults_by_SDSRV": "",
|
||||
"accuracy": score,
|
||||
"processingTime (by request)": pred_data['processing_time'],
|
||||
"class_name": class_name,
|
||||
"file_path": json_file
|
||||
})
|
||||
|
||||
return total
|
||||
|
||||
|
||||
def export_report(
|
||||
init_csv: str,
|
||||
):
|
||||
df = pd.read_csv(init_csv)
|
||||
for index, request in df.iterrows():
|
||||
file_path = request['file_path']
|
||||
class_name = request['class_name']
|
||||
pred_value = request['OCRResults']
|
||||
revised_value = read_json(file_path)[class_name]
|
||||
if class_name == "purchase_date":
|
||||
pred_value = ast.literal_eval(pred_value)
|
||||
if isinstance(revised_value, list):
|
||||
if len(revised_value) > 0:
|
||||
revised_value = revised_value[0]
|
||||
else:
|
||||
revised_value = None
|
||||
|
||||
if len(pred_value) == 0:
|
||||
pred_value = [None]
|
||||
|
||||
score = 0
|
||||
for _pred_value in pred_value:
|
||||
score1 = eval_ocr_metric(
|
||||
[post_processing_str(class_name, _pred_value, is_gt=False)],
|
||||
[post_processing_str(class_name, revised_value, is_gt=True)],
|
||||
metric=["one_minus_ned"]
|
||||
)['1-N.E.D']
|
||||
score = max(score, score1)
|
||||
else:
|
||||
score = eval_ocr_metric(
|
||||
[post_processing_str(class_name, pred_value, is_gt=False)],
|
||||
[post_processing_str(class_name, revised_value, is_gt=True)],
|
||||
metric=["one_minus_ned"]
|
||||
)['1-N.E.D']
|
||||
|
||||
|
||||
df.at[index, "revisedResults_by_SDSRV"] = revised_value
|
||||
df.at[index, "accuracy"] = score
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def pick_sample_to_revise(
|
||||
ocr_accuracy: list,
|
||||
gt_dir: str,
|
||||
save_dir: str
|
||||
):
|
||||
empty_err_path = os.path.join(save_dir, "empty_results")
|
||||
other_err_path = os.path.join(save_dir, "diff_results")
|
||||
os.makedirs(empty_err_path, exist_ok=True)
|
||||
os.makedirs(other_err_path, exist_ok=True)
|
||||
for request in ocr_accuracy:
|
||||
score = request['accuracy']
|
||||
json_path = request['file_path']
|
||||
request_id = request['requestId']
|
||||
|
||||
img_path_folder = os.path.join(gt_dir, Path(json_path).parts[-2], Path(json_path).parts[-1])
|
||||
img_path = [ff for ff in glob.glob(img_path_folder.replace(".json", ".*")) if ".json" not in ff]
|
||||
|
||||
if len(img_path) == 0:
|
||||
print(f"[WARNING] Skipped request id {request_id}")
|
||||
continue
|
||||
img_path = img_path[0]
|
||||
# img_path = [ff for ff in glob.glob(json_path.replace(".json", ".*"))][0]
|
||||
|
||||
if score == 0:
|
||||
save_path = os.path.join(empty_err_path, request_id)
|
||||
elif score < 1:
|
||||
save_path = os.path.join(other_err_path, request_id)
|
||||
else:
|
||||
continue
|
||||
os.makedirs(save_path, exist_ok=True)
|
||||
shutil.copy(img_path, save_path)
|
||||
shutil.copy(json_path, save_path)
|
||||
|
||||
def merge_revised_sample(
|
||||
revised_path_list: list,
|
||||
save_dir: str
|
||||
):
|
||||
if not isinstance(revised_path_list, list):
|
||||
revised_path_list = [revised_path_list]
|
||||
|
||||
for revised_path in revised_path_list:
|
||||
list_request = [os.path.basename(ff) for ff in os.listdir(revised_path)]
|
||||
for request in list_request:
|
||||
file_list = glob.glob(os.path.join(revised_path, request, "*.json*"))
|
||||
for file_path in file_list:
|
||||
# shutil.copyfile(file_path, os.path.join(save_path, request))
|
||||
os.system(f"sudo cp {file_path} {os.path.join(save_dir, request)}")
|
||||
|
||||
def calculate_average_by_column(df, column_name):
|
||||
df = df.groupby(by=["requestId"])
|
||||
time_list = []
|
||||
for req, sub_df in df:
|
||||
if len(sub_df) > 0:
|
||||
time_list.append(sub_df.iloc[0][column_name])
|
||||
if len(time_list) > 0:
|
||||
return sum(time_list)/len(time_list)
|
||||
return 0


if __name__ == "__main__":
    save_path = "/mnt/hdd4T/TannedCung/OCR/Data/SBT_for_acc/15Jan"
    save_csv = "logs/eval_20240115"
    csv_path = "/mnt/hdd4T/TannedCung/OCR/Data/SBT_for_acc/15Jan.csv"
    csv_path_end_user = "logs/eval_20240115/OCR_15Jan2024.csv"

    # Step 1: Convert a csv file to get user-submitted results for each request
    print("[INFO] Starting convert csv from customer to json")
    os.system(f"sudo chmod -R 777 {save_path}")
    convert_groundtruth_from_csv(csv_path=csv_path_end_user, save_dir=save_path)
    print("[INFO] Converted")

    # Step 2: Convert a csv file to get predicted OCR results for each image
    print("[INFO] Starting convert csv from SDSV to json")
    convert_predict_from_csv(csv_path=csv_path, save_dir=save_path)
    print("[INFO] Converted")

    # Step 3: Generate the initial csv file and calculate OCR accuracy between submitted results and OCR results
    print("[INFO] Starting generate csv to get performance")
    gt_path = save_path
    pred_path = save_path
    req_to_red_dict = gen_req_to_red_dict(csv_path_end_user)
    init_data = init_csv(gt_dir=gt_path, pred_dir=pred_path, req_to_red=req_to_red_dict)
    pd.DataFrame(init_data).to_csv(os.path.join(save_csv, "init1.csv"), index=False)
    print("[INFO] Done")

    # # Step 4: Split requests whose accuracy is less than 1 to revise
    # print("[INFO] Starting split data to review")
    # revised_path = os.path.join(save_csv, "revised")
    # # shutil.rmtree(revised_path)
    # pick_sample_to_revise(ocr_accuracy=init_data, gt_dir=save_path, save_dir=revised_path)
    # print("[INFO] Done")

    # # Step 5: Merge revised results into the ground truth folder
    # print("[INFO] Merging revised data to ground truth folder")
    # revised_path = os.path.join(save_csv, "revised")
    # revised_path = [f'{revised_path}/empty_results', f'{revised_path}/diff_results']
    # merge_revised_sample(revised_path_list=revised_path, save_dir=save_path)
    # print("Done")

    # # Step 6: Calculate OCR accuracy between OCR results and revised results
    # print("[INFO] Exporting OCR report")
    # init_csv_path = os.path.join(save_csv, "init1.csv")
    # report = export_report(init_csv=init_csv_path)
    # error_path = os.path.join(save_csv, "errors")
    # pick_sample_to_revise(ocr_accuracy=report[report.accuracy < 0.75].to_dict('records'), gt_dir=save_path, save_dir=error_path)

    # n_total_images = len(report)
    # n_bad_images = len(report[report.accuracy < 0.75])
    # average_acc = report[report.accuracy >= 0.75]['accuracy'].mean()

    # print("Total requests:", len(report['requestId'].unique()))
    # print("Total images:", n_total_images)
    # print("No. imei images:", len(report[report.class_name == "imei_number"]))
    # print("No. invoice images:", len(report[report.class_name == "retailername"]))
    # print("No. bad quality images:", n_bad_images)
    # print("No. valid images:", n_total_images - n_bad_images)
    # print("No. per of bad quality images:", 100*n_bad_images/n_total_images)
    # print("Average accuracy:", 100*average_acc)

    # last_row = n_total_images
    # report.at[last_row, "requestId"] = "Total requests:"
    # report.at[last_row, "redemptionNumber"] = len(report['requestId'].unique())
    # report.at[last_row+1, "requestId"] = "Total images:"
    # report.at[last_row+1, "redemptionNumber"] = n_total_images
    # report.at[last_row+2, "requestId"] = "No. imei images:"
    # report.at[last_row+2, "redemptionNumber"] = len(report[report.class_name == "imei_number"])
    # report.at[last_row+3, "requestId"] = "No. invoice images:"
    # report.at[last_row+3, "redemptionNumber"] = len(report[report.class_name == "retailername"])
    # report.at[last_row+4, "requestId"] = "No. bad quality images:"
    # report.at[last_row+4, "redemptionNumber"] = n_bad_images
    # report.at[last_row+5, "requestId"] = "No. valid images:"
    # report.at[last_row+5, "redemptionNumber"] = n_total_images - n_bad_images
    # report.at[last_row+6, "requestId"] = "No. per of bad quality images:"
    # report.at[last_row+6, "redemptionNumber"] = 100*n_bad_images/n_total_images
    # report.at[last_row+7, "requestId"] = "Average accuracy:"
    # report.at[last_row+7, "redemptionNumber"] = 100*average_acc


    # report.drop(columns=["file_path", "class_name"]).to_csv(os.path.join(save_csv, f"SBT_report_{time.strftime('%Y%m%d')}.csv"), index=False)
    # print("[INFO] Done")
201
cope2n-api/fwd_api/utils/ocr_utils/wiki_diff.py
Normal file
@ -0,0 +1,201 @@
# https://stackoverflow.com/questions/774316/python-difflib-highlighting-differences-inline
import difflib
import unidecode
import os
import glob
import pandas as pd

VOWELS = 'aeouiy' + 'AEOUIY'
CONSONANTS = 'bcdfghjklmnpqrstvxwz' + 'BCDFGHJKLMNPQRSTVXWZ'
# PREDICT_PATH = 'ocr/result'
# GROUNDTRUTH_PATH = '/mnt/hdd2T/AICR/Datasets/wiki/ground_truth'
PREDICT_PATH = 'ocr/result/cinamon'
GROUNDTRUTH_PATH = '/mnt/hdd2T/AICR/Datasets/Backup/1.Hand_writing/Lines/cinnamon_data'
# note that we also use different preprocess for cinamon data
# SAVE_PATH = 'wiki_diff'
SAVE_PATH = 'wiki_diff/cinamon'
RES_PATH = f'{SAVE_PATH}/result/'
WRONG_ACCENT_FILE = f'{SAVE_PATH}/wrong_accent.txt'
LOST_ACCENT_FILE = f'{SAVE_PATH}/lost_accent.txt'
TOTAL_WORD = 0


def write_accent_error(path, err):
    # path should be wrong_accent_file or lost_accent_file
    with open(path, 'a') as f:
        f.write(err)
        f.write('\n')


def update_ddata_specialchars(ddata_specialchars, correction_key, char_key):
    if char_key in ddata_specialchars[correction_key]:
        ddata_specialchars[correction_key][char_key] += 1
    else:
        ddata_specialchars[correction_key][char_key] = 1
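
# Hypothetical example (not in the original file): update_ddata_specialchars(d, 'wrong', ('a', 'á'))
# increments d['wrong'][('a', 'á')], so each (predicted, ground-truth) pair keeps a running count.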


def process_replace_tag(matcher, i1, i2, j1, j2, ddata, ddata_specialchars):
    a_char = matcher.a[i1:i2]
    b_char = matcher.b[j1:j2]
    ddata['res_text'] += ' ### {' + a_char + ' -> ' + b_char + '} ### '
    ddata['nwrongs'] += 1*len(b_char)
    if len(a_char) == 1 and len(b_char) == 1:  # single char case
        if a_char.lower() == b_char.lower():  # wrong upper/lower case
            ddata['UL_single'] += 1
            update_ddata_specialchars(ddata_specialchars, 'UL', (a_char, b_char))
        else:
            ddata['nwrongs_single'] += 1
            a_ori = unidecode.unidecode(a_char).lower()
            b_ori = unidecode.unidecode(b_char).lower()
            if a_ori in VOWELS and b_ori in VOWELS:
                if a_ori == b_ori:
                    err = a_char + ' -> ' + b_char
                    if b_ori == b_char.lower():  # e.g. Ơ -> O
                        ddata['nlost_accent'] += 1
                        # write_accent_error(LOST_ACCENT_FILE, err)
                    else:  # e.g Ơ -> Ớ
                        ddata['nwrong_accent'] += 1
                        # write_accent_error(WRONG_ACCENT_FILE, err)
                else:  # e.g Ă -> Â
                    ddata['nwrong_vowels'] += 1
            else:
                if a_ori in CONSONANTS and b_ori in CONSONANTS:
                    ddata['nwrong_consonants'] += 1
                else:
                    ddata['nwrong_specialchars'] += 1
                    update_ddata_specialchars(ddata_specialchars, 'wrong', (a_char, b_char))
    else:
        if a_char.lower() == b_char.lower():
            ddata['UL_multiple'] += 1
            update_ddata_specialchars(ddata_specialchars, 'UL', (a_char, b_char))
        else:
            ddata['nwrongs_multiple'] += 1
            if len(a_char) > 10 or len(b_char) > 10:
                ddata['nlong_sequences'] += 1
                # print(a_char)


def process_delete_tag(matcher, i1, i2, ddata, ddata_specialchars):
    a_char = matcher.a[i1:i2]
    ddata['res_text'] += ' ### {- ' + a_char + '} ### '
    ddata['nadds'] += 1*len(a_char)
    if len(a_char) == 1:
        ddata['nadds_single'] += 1
        if a_char.lower() in CONSONANTS + VOWELS:
            ddata['nadds_chars'] += 1
        else:
            if a_char == ' ':
                ddata['nadds_space'] += 1
            else:
                ddata['nadds_specialchars'] += 1
                update_ddata_specialchars(ddata_specialchars, 'add', a_char)
    else:
        ddata['nadds_multiple'] += 1
        if len(a_char) > 10:
            ddata['nlong_sequences'] += 1
            # print(a_char)


def process_insert_tag(matcher, j1, j2, ddata, ddata_specialchars):
    b_char = matcher.b[j1:j2]
    ddata['nlosts'] += 1*len(b_char)
    ddata['res_text'] += ' ### {+ ' + b_char + '} ### '
    if len(b_char) == 1:
        ddata['nlosts_single'] += 1
        if b_char.lower() in CONSONANTS + VOWELS:
            ddata['nlosts_chars'] += 1
        else:
            if b_char == ' ':
                ddata['nlosts_space'] += 1
            else:
                ddata['nlosts_specialchars'] += 1
                update_ddata_specialchars(ddata_specialchars, 'lost', b_char)
    else:
        ddata['nlosts_multiple'] += 1
        if len(b_char) > 10:
            ddata['nlong_sequences'] += 1
            # print(b_char)


def inline_diff(a, b, ddata_specialchars={'lost': {}, 'add': {}, 'wrong': {}, 'UL': {}}):
    matcher = difflib.SequenceMatcher(None, a, b)
    ddata = {'res_text': ''}
    # ddata = ddata | {key: 0 for key in ['nsingle', 'nmultiple']}
    ddata = ddata | {key: 0 for key in ['UL_single', 'UL_multiple']}
    ddata = ddata | {
        key: 0 for key in
        ['nlosts', 'nlosts_single', 'nlosts_multiple', 'nlosts_chars', 'nlosts_specialchars', 'nlosts_space']}
    ddata = ddata | {
        key: 0 for key in
        ['nadds', 'nadds_single', 'nadds_multiple', 'nadds_chars', 'nadds_specialchars', 'nadds_space']}
    ddata = ddata | {
        key: 0 for key in
        ['nwrongs', 'nwrongs_single', 'nwrongs_multiple', 'nwrong_accent', 'nlost_accent', 'nwrong_vowels',
         'nwrong_consonants', 'nwrong_specialchars']}
    ddata['nlong_sequences'] = 0
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'replace':  # wrong
            process_replace_tag(matcher, i1, i2, j1, j2, ddata, ddata_specialchars)
        if tag == 'delete':  # OCR add char so the matcher "delete"
            process_delete_tag(matcher, i1, i2, ddata, ddata_specialchars)
        if tag == 'equal':
            ddata['res_text'] += matcher.a[i1:i2]
        if tag == 'insert':  # OCR lost char so the matcher "insert"
            process_insert_tag(matcher, j1, j2, ddata, ddata_specialchars)
    ddata["ned"] = ddata['nwrongs'] + ddata['nadds'] + ddata['nlosts']
    return ddata
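
# Hedged illustration (not part of the original file): for a one-character accent error,
#   inline_diff('anh', 'ánh')
# yields a single 'replace' opcode ('a' -> 'á'); both characters unidecode to the same vowel,
# so nwrongs == nwrongs_single == nwrong_accent == 1, ned == 1, and
# res_text == ' ### {a -> á} ### nh'.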


def process_single_file(file_name, ddata_specialchars):

    # read predict file
    with open(os.path.join(PREDICT_PATH, file_name), 'r') as f:
        predict = f.readlines()[0].strip()
        # predict = ''.join(predict)
        # predict = predict.replace(' ', '')
        # predict = predict.replace('\n', '')
        # print(predict)

    # read groundtruth file
    with open(os.path.join(GROUNDTRUTH_PATH, file_name), 'r') as f:
        gt = f.readlines()[0].strip()
        # gt = ''.join(gt)
        # gt = gt.replace('\n', '')

    # get statistical data of the differences between predict and ground truth
    ddata = inline_diff(predict, gt, ddata_specialchars)
    global TOTAL_WORD
    TOTAL_WORD = TOTAL_WORD + len(gt.split())
    # write to save_path
    res_text = ddata.pop('res_text', None)
    save_file = os.path.join(RES_PATH, file_name)
    with open(save_file, 'w') as f:
        f.write(res_text)

    # generate csv file
    ddata = {'file_name': save_file} | ddata
    return ddata


def main(overwrite=False):
    for accent_file in [WRONG_ACCENT_FILE, LOST_ACCENT_FILE]:
        if os.path.exists(accent_file):
            os.remove(accent_file)
    lddata = []
    ddata_specialchars = {'lost': {}, 'add': {}, 'wrong': {}, 'UL': {}}
    for file_ in glob.glob(f'{PREDICT_PATH}/*.txt'):
        file_name = file_.split('/')[-1]
        ddata = process_single_file(file_name, ddata_specialchars)
        lddata.append(ddata)
    if overwrite:
        df = pd.DataFrame(lddata)
        df.to_csv(f'{SAVE_PATH}/wiki_diff.csv', sep='\t')
        df_ = pd.DataFrame(ddata_specialchars)
        df_.to_csv(f'{SAVE_PATH}/wiki_diff_specialchars.csv', sep='\t')
    print(TOTAL_WORD)


if __name__ == '__main__':
    main(overwrite=True)
@ -36,7 +36,7 @@ requests==2.28.1
ruamel.yaml==0.17.21
ruamel.yaml.clib==0.2.7
sqlparse==0.4.3
tzdata==2022.6
tzdata==2022.7
uritemplate==4.1.1
urllib3==1.26.13
uvicorn==0.20.0
@ -51,3 +51,12 @@ imagesize==1.4.1
pdf2image==1.16.3
redis==5.0.1
django-celery-beat==2.5.0
terminaltables==3.1.10
rapidfuzz==3.6.1
Unidecode==1.3.8
pandas==2.2.0
openpyxl==3.1.2
# For sdsvkvu compatibility
# torch==1.13.1+cu116
# torchvision==0.14.1+cu116
# --extra-index-url https://download.pytorch.org/whl/cu116
1
cope2n-api/scripts/database_cloning.sh
Normal file
@ -0,0 +1 @@
pg_dump -U sbt -h sbt.cxetpslawu4p.ap-southeast-1.rds.amazonaws.com sbt2 >> sbt2.sql
1
cope2n-api/token.txt
Normal file
@ -0,0 +1 @@
eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJpZCI6InNidCIsImV4cGlyZWRfYXQiOiIwMS8wMi8yMDI0IDEyOjQ2OjA3IiwiaW50ZXJuYWxfaWQiOjEsInN0YXR1cyI6MSwic3Vic2NyaXB0aW9uX2lkIjoxfQ.VFsoGm5BdeyNptMsdU4f4l70bDIYHTmB8Y-2-PXs7cKhzGB1pUpgqax-V39N_IEXriRl3caDiotzU0psR0WR3g
@ -83,12 +83,12 @@ services:
    depends_on:
      db-sbt:
        condition: service_started
    command: sh -c "chmod -R 777 /app; sleep 5; python manage.py collectstatic --no-input &&
      python manage.py makemigrations &&
      python manage.py migrate &&
      python manage.py compilemessages &&
      gunicorn fwd.asgi:application -k uvicorn.workers.UvicornWorker --timeout 300 -b 0.0.0.0:9000" # pre-makemigrations on prod
    # command: bash -c "tail -f > /dev/null"
    # command: sh -c "chmod -R 777 /app; sleep 5; python manage.py collectstatic --no-input &&
    #   python manage.py makemigrations &&
    #   python manage.py migrate &&
    #   python manage.py compilemessages &&
    #   gunicorn fwd.asgi:application -k uvicorn.workers.UvicornWorker --timeout 300 -b 0.0.0.0:9000" # pre-makemigrations on prod
    command: bash -c "tail -f > /dev/null"
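    # NOTE: with the sh -c block above commented out, this service only idles via "tail -f";
    # re-enable that block to run migrations and serve the API with gunicorn.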

  minio:
    image: minio/minio
@ -188,6 +188,8 @@ services:
      - POSTGRES_USER=${DB_USER}
      - POSTGRES_PASSWORD=${DB_PASSWORD}
      - POSTGRES_DB=${DB_SCHEMA}
    ports:
      - 5432:5432

  rabbitmq-sbt:
    mem_reservation: 600m
@ -10,9 +10,9 @@ from dotenv import load_dotenv

load_dotenv("../.env_prod")

OUTPUT_NAME = "missing_records"
START_DATE = datetime(2023, 12, 28, tzinfo=timezone('Asia/Ho_Chi_Minh'))
END_DATE = datetime(2024, 1, 3, tzinfo=timezone('Asia/Ho_Chi_Minh'))
OUTPUT_NAME = "Jan"
START_DATE = datetime(2024, 1, 1, tzinfo=timezone('Asia/Ho_Chi_Minh'))
END_DATE = datetime(2024, 2, 1, tzinfo=timezone('Asia/Ho_Chi_Minh'))

# Database connection details
db_host = os.environ.get('DB_HOST', "")
@ -62,32 +62,32 @@ with open(csv_file_path, 'w', newline='') as csv_file:
cursor.close()
conn.close()

# Download folders from S3
s3_client = boto3.client(
    's3',
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key
)
# # Download folders from S3
# s3_client = boto3.client(
#     's3',
#     aws_access_key_id=access_key,
#     aws_secret_access_key=secret_key
# )

request_ids = []
for rq in data:
    rq_id = rq[3]
    request_ids.append(rq_id)
# request_ids = []
# for rq in data:
#     rq_id = rq[3]
#     request_ids.append(rq_id)

for request_id in tqdm(request_ids):
    folder_key = f"{s3_folder_prefix}/{request_id}/"  # Assuming folder structure like: s3_bucket_name/s3_folder_prefix/request_id/
    local_folder_path = f"{OUTPUT_NAME}/{request_id}/"  # Path to the local folder to save the downloaded files
    os.makedirs(OUTPUT_NAME, exist_ok=True)
    os.makedirs(local_folder_path, exist_ok=True)
# for request_id in tqdm(request_ids):
#     folder_key = f"{s3_folder_prefix}/{request_id}/"  # Assuming folder structure like: s3_bucket_name/s3_folder_prefix/request_id/
#     local_folder_path = f"{OUTPUT_NAME}/{request_id}/"  # Path to the local folder to save the downloaded files
#     os.makedirs(OUTPUT_NAME, exist_ok=True)
#     os.makedirs(local_folder_path, exist_ok=True)


    # List objects in the S3 folder
    response = s3_client.list_objects_v2(Bucket=s3_bucket_name, Prefix=folder_key)
    objects = response.get('Contents', [])
#     # List objects in the S3 folder
#     response = s3_client.list_objects_v2(Bucket=s3_bucket_name, Prefix=folder_key)
#     objects = response.get('Contents', [])

    for s3_object in objects:
        object_key = s3_object['Key']
        local_file_path = local_folder_path + object_key.split('/')[-1]  # Extracting the file name from the object key
#     for s3_object in objects:
#         object_key = s3_object['Key']
#         local_file_path = local_folder_path + object_key.split('/')[-1]  # Extracting the file name from the object key

        # Download the S3 object to the local file
        s3_client.download_file(s3_bucket_name, object_key, local_file_path)
#         # Download the S3 object to the local file
#         s3_client.download_file(s3_bucket_name, object_key, local_file_path)
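# NOTE: with the S3 download block above commented out, the script now stops after writing the
# CSV of requests between START_DATE and END_DATE; re-enable the block to also pull the
# per-request folders from S3.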
1
scripts/database_cloning.sh
Normal file
@ -0,0 +1 @@
pg_dump -U sbt -h sbt.cxetpslawu4p.ap-southeast-1.rds.amazonaws.com sbt2 >> sbt2.sql