diff --git a/cope2n-api/Dockerfile b/cope2n-api/Dockerfile index 18d6e3f..c841ccb 100755 --- a/cope2n-api/Dockerfile +++ b/cope2n-api/Dockerfile @@ -8,10 +8,17 @@ RUN groupadd --gid ${GID} ${USERNAME} \ && apt-get install -y sudo bash gettext poppler-utils \ && echo ${USERNAME} ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/${USERNAME} \ && chmod 0440 /etc/sudoers.d/${USERNAME} +RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y RUN yes | apt install postgresql gcc musl-dev RUN pip install --upgrade pip RUN pip install uvicorn gunicorn Celery +# For integration with sdsvkvu +RUN pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116 +RUN pip install -U openmim==0.3.7 --no-cache-dir +RUN mim install mmcv-full==1.7.2 +# End integration with sdsvkvu + USER ${UID} ADD --chown=${UID}:${GID} fwd /app COPY --chown=${UID}:${GID} requirements.txt /app @@ -21,4 +28,27 @@ RUN pip install -r requirements.txt --no-cache-dir COPY --chown=${UID}:${GID} . /app +RUN cd /app/fwd_api/utils/sdsvkvu/sdsvkvu/externals/sdsvocr/externals/sdsv_dewarp && pip3 install -v -e . --no-cache-dir +RUN cd /app/fwd_api/utils/sdsvkvu/sdsvkvu/externals/sdsvocr/externals/sdsvtd && pip3 install -v -e . --no-cache-dir +RUN cd /app/fwd_api/utils/sdsvkvu/sdsvkvu/externals/sdsvocr/externals/sdsvtr && pip3 install -v -e . --no-cache-dir +RUN cd /app/fwd_api/utils/sdsvkvu && pip3 install -v -e . --no-cache-dir + +# For integration with sdsvkvu +RUN python -m pip install paddlepaddle-gpu==2.4.2.post116 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html --no-cache-dir + ENV TZ="Asia/Ho_Chi_Minh" + + +# FROM cope2n-api-base AS builder +# ARG UID=1000 +# ARG GID=1000 +# ARG USERNAME=container-user + +# # Create a new user +# RUN groupadd --gid ${GID} ${USERNAME} \ +# && useradd --uid ${UID} --gid ${GID} -m ${USERNAME} \ +# && echo ${USERNAME} ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/${USERNAME} \ +# && chmod 0440 /etc/sudoers.d/${USERNAME} + +# WORKDIR /app +# COPY --chown=${UID}:${GID} . /app diff --git a/cope2n-api/Dockerfile.base b/cope2n-api/Dockerfile.base new file mode 100644 index 0000000..c13dc27 --- /dev/null +++ b/cope2n-api/Dockerfile.base @@ -0,0 +1,17 @@ +FROM python:3.9.17-buster + +RUN apt-get update \ + && apt-get install -y sudo bash gettext poppler-utils postgresql gcc musl-dev + +COPY requirements.txt /tmp +COPY ./fwd_api/utils/sdsvkvu /app/fwd_api/utils/sdsvkvu + +RUN cd /app/fwd_api/utils/sdsvkvu/sdsvkvu/externals/sdsvocr/externals/sdsv_dewarp && pip3 install -v -e . --no-cache-dir +RUN cd /app/fwd_api/utils/sdsvkvu/sdsvkvu/externals/sdsvocr/externals/sdsvtd && pip3 install -v -e . --no-cache-dir +RUN cd /app/fwd_api/utils/sdsvkvu/sdsvkvu/externals/sdsvocr/externals/sdsvtr && pip3 install -v -e . --no-cache-dir +RUN cd /app/fwd_api/utils/sdsvkvu && pip3 install -v -e . 
--no-cache-dir + +RUN pip install --upgrade pip && pip install uvicorn gunicorn Celery +RUN pip install -r /tmp/requirements.txt --no-cache-dir + +ENV TZ="Asia/Ho_Chi_Minh" \ No newline at end of file diff --git a/cope2n-api/fwd_api/api/accuracy_view.py b/cope2n-api/fwd_api/api/accuracy_view.py index abeb682..2159ad0 100644 --- a/cope2n-api/fwd_api/api/accuracy_view.py +++ b/cope2n-api/fwd_api/api/accuracy_view.py @@ -3,88 +3,87 @@ from rest_framework.decorators import action from rest_framework.response import Response from django.core.paginator import Paginator from django.http import JsonResponse -from datetime import datetime from django.utils import timezone from django.db.models import Q +import uuid from drf_spectacular.utils import extend_schema, OpenApiParameter, OpenApiTypes # from drf_spectacular.types import OpenApiString -from ..models import SubscriptionRequest -from ..exception.exceptions import RequiredFieldException - import json +from ..exception.exceptions import InvalidException, RequiredFieldException +from ..models import SubscriptionRequest, Report, ReportFile +from ..utils.accuracy import shadow_report, MonthReportAccumulate +from ..utils.file import validate_report_list +from ..utils.process import string_to_boolean +def first_of_list(the_list): + if not the_list: + return None + return the_list[0] class AccuracyViewSet(viewsets.ViewSet): lookup_field = "username" @extend_schema( - parameters=[ - OpenApiParameter( - name='start_date', - location=OpenApiParameter.QUERY, - description='Start date (YYYY-mm-DDTHH:MM:SS)', - type=OpenApiTypes.DATE, - default='2023-01-02T00:00:00', - ), - OpenApiParameter( - name='end_date', - location=OpenApiParameter.QUERY, - description='End date (YYYY-mm-DDTHH:MM:SS)', - type=OpenApiTypes.DATE, - default='2024-01-10T00:00:00', - ), - OpenApiParameter( - name='include_test', - location=OpenApiParameter.QUERY, - description='Whether to include test record or not', - type=OpenApiTypes.BOOL, - ), - OpenApiParameter( - name='is_reviewed', - location=OpenApiParameter.QUERY, - description='Which records to be query', - type=OpenApiTypes.STR, - enum=['reviewed', 'not reviewed', 'all'], - ), - OpenApiParameter( - name='request_id', - location=OpenApiParameter.QUERY, - description='Specific request id', - type=OpenApiTypes.STR, - ), - OpenApiParameter( - name='redemption_id', - location=OpenApiParameter.QUERY, - description='Specific redemption id', - type=OpenApiTypes.STR, - ), - OpenApiParameter( - name='quality', - location=OpenApiParameter.QUERY, - description='One or more of [bad, good, all]', - type=OpenApiTypes.STR, - enum=['bad', 'good', 'all'], - ), - OpenApiParameter( - name='page', - location=OpenApiParameter.QUERY, - description='Page number', - type=OpenApiTypes.INT, - required=False - ), - OpenApiParameter( - name='page_size', - location=OpenApiParameter.QUERY, - description='Number of items per page', - type=OpenApiTypes.INT, - required=False - ), - ], - responses=None, tags=['Accuracy'] + parameters=[ + OpenApiParameter( + name='start_date', + location=OpenApiParameter.QUERY, + description='Start date (YYYY-mm-DDTHH:MM:SSZ)', + type=OpenApiTypes.DATE, + default='2023-01-02T00:00:00+0700', + ), + OpenApiParameter( + name='end_date', + location=OpenApiParameter.QUERY, + description='End date (YYYY-mm-DDTHH:MM:SSZ)', + type=OpenApiTypes.DATE, + default='2024-01-10T00:00:00+0700', + ), + OpenApiParameter( + name='include_test', + location=OpenApiParameter.QUERY, + description='Whether to include test record or not', + 
type=OpenApiTypes.BOOL, + ), + OpenApiParameter( + name='is_reviewed', + location=OpenApiParameter.QUERY, + description='Which records to query', + type=OpenApiTypes.STR, + enum=['reviewed', 'not reviewed', 'all'], + ), + OpenApiParameter( + name='request_id', + location=OpenApiParameter.QUERY, + description='Specific request id', + type=OpenApiTypes.STR, + ), + OpenApiParameter( + name='redemption_id', + location=OpenApiParameter.QUERY, + description='Specific redemption id', + type=OpenApiTypes.STR, + ), + OpenApiParameter( + name='page', + location=OpenApiParameter.QUERY, + description='Page number', + type=OpenApiTypes.INT, + required=False + ), + OpenApiParameter( + name='page_size', + location=OpenApiParameter.QUERY, + description='Number of items per page', + type=OpenApiTypes.INT, + required=False + ), + ], + responses=None, tags=['Accuracy'] ) @action(detail=False, url_path="request_list", methods=["GET"]) - def get_subscription_requests(self, request): + def get_request_list(self, request): if request.method == 'GET': start_date_str = request.GET.get('start_date') end_date_str = request.GET.get('end_date') @@ -94,14 +93,13 @@ class AccuracyViewSet(viewsets.ViewSet): redemption_id = request.GET.get('redemption_id', None) is_reviewed = request.GET.get('is_reviewed', None) include_test = request.GET.get('include_test', False) - quality = request.GET.get('quality', None) try: - start_date = datetime.strptime(start_date_str, '%Y-%m-%dT%H:%M:%S') - end_date = datetime.strptime(end_date_str, '%Y-%m-%dT%H:%M:%S') + start_date = timezone.datetime.strptime(start_date_str, '%Y-%m-%dT%H:%M:%S%z') + end_date = timezone.datetime.strptime(end_date_str, '%Y-%m-%dT%H:%M:%S%z') + except ValueError: - return JsonResponse({'error': 'Invalid date format. Please use YYYY-MM-DD.'}, status=400) - + raise InvalidException(excArgs="Date format") + base_query = Q(created_at__range=(start_date, end_date)) if request_id: base_query &= Q(request_id=request_id) @@ -124,19 +122,12 @@ class AccuracyViewSet(viewsets.ViewSet): base_query &= Q(is_reviewed=False) elif is_reviewed == "all": pass - if isinstance(quality, str): - if quality == "good": - base_query &= Q(is_bad_image_quality=False) - elif quality == "bad": - base_query &= Q(is_bad_image_quality=True) - elif quality == "all": - pass subscription_requests = SubscriptionRequest.objects.filter(base_query).order_by('created_at') paginator = Paginator(subscription_requests, page_size) page = paginator.get_page(page_number) - + data = [] for request in page: imeis = [] @@ -184,7 +175,369 @@ class AccuracyViewSet(viewsets.ViewSet): return JsonResponse(response) return JsonResponse({'error': 'Invalid request method.'}, status=405) - + + @extend_schema( + parameters=[ + OpenApiParameter( + name='is_daily_report', + location=OpenApiParameter.QUERY, + description='Whether this is a daily report or not', + type=OpenApiTypes.BOOL, + ), + OpenApiParameter( + name='start_date', + location=OpenApiParameter.QUERY, + description='Start date (YYYY-mm-DDTHH:MM:SSZ)', + type=OpenApiTypes.DATE, + default='2023-01-02T00:00:00+0700', + ), + OpenApiParameter( + name='end_date', + location=OpenApiParameter.QUERY, + description='End date (YYYY-mm-DDTHH:MM:SSZ)', + type=OpenApiTypes.DATE, + default='2024-01-10T00:00:00+0700', + ), + OpenApiParameter( + name='include_test', + location=OpenApiParameter.QUERY, + description='Whether to include test record or not', + type=OpenApiTypes.BOOL, + ), + OpenApiParameter( + name='is_reviewed', + location=OpenApiParameter.QUERY, + 
description='Which records to query', + type=OpenApiTypes.STR, + enum=['reviewed', 'not reviewed', 'all'], + ), + OpenApiParameter( + name='request_id', + location=OpenApiParameter.QUERY, + description='Specific request id', + type=OpenApiTypes.STR, + ), + OpenApiParameter( + name='redemption_id', + location=OpenApiParameter.QUERY, + description='Specific redemption id', + type=OpenApiTypes.STR, + ), + OpenApiParameter( + name='subsidiary', + location=OpenApiParameter.QUERY, + description='Subsidiary', + type=OpenApiTypes.STR, + ), + ], + responses=None, tags=['Accuracy'] + ) + @action(detail=False, url_path="make_report", methods=["GET"]) + def make_report(self, request): + if request.method == 'GET': + start_date_str = request.GET.get('start_date') + end_date_str = request.GET.get('end_date') + request_id = request.GET.get('request_id', None) + redemption_id = request.GET.get('redemption_id', None) + is_reviewed = string_to_boolean(request.GET.get('is_reviewed', "false")) + include_test = string_to_boolean(request.GET.get('include_test', "false")) + subsidiary = request.GET.get("subsidiary", "all") + is_daily_report = string_to_boolean(request.GET.get('is_daily_report', "false")) + + try: + start_date = timezone.datetime.strptime(start_date_str, '%Y-%m-%dT%H:%M:%S%z') + end_date = timezone.datetime.strptime(end_date_str, '%Y-%m-%dT%H:%M:%S%z') + except ValueError: + raise InvalidException(excArgs="Date format") + + query_set = {"start_date_str": start_date_str, + "end_date_str": end_date_str, + "request_id": request_id, + "redemption_id": redemption_id, + "is_reviewed": is_reviewed, + "include_test": include_test, + "subsidiary": subsidiary, + "is_daily_report": is_daily_report, + } + + report_id = "report" + "_" + timezone.datetime.now().strftime("%Y%m%d%H%M%S%z") + "_" + uuid.uuid4().hex + new_report: Report = Report( + report_id=report_id, + is_daily_report=is_daily_report, + subsidiary=subsidiary.lower().replace(" ", ""), + include_test=include_test, + include_reviewed=is_reviewed, + start_at=start_date, + end_at=end_date, + ) + new_report.save() + # Background job to calculate accuracy + shadow_report(report_id, query_set) + + return JsonResponse(status=status.HTTP_200_OK, data={"report_id": report_id}) + + @extend_schema( + parameters=[ + OpenApiParameter( + name='report_id', + location=OpenApiParameter.QUERY, + description='Specific report id', + type=OpenApiTypes.STR, + ), + OpenApiParameter( + name='page', + location=OpenApiParameter.QUERY, + description='Page number', + type=OpenApiTypes.INT, + required=False + ), + OpenApiParameter( + name='page_size', + location=OpenApiParameter.QUERY, + description='Number of items per page', + type=OpenApiTypes.INT, + required=False + ), + ], + responses=None, tags=['Accuracy'] + ) + @action(detail=False, url_path="report_detail_list", methods=["GET"]) + def get_report_detail_list(self, request): + if request.method == 'GET': + report_id = request.GET.get('report_id', None) + page_number = int(request.GET.get('page', 1)) + page_size = int(request.GET.get('page_size', 10)) + + report = Report.objects.filter(report_id=report_id).first() + report_files = ReportFile.objects.filter(report=report) + + paginator = Paginator(report_files, page_size) + page = paginator.get_page(page_number) + + data = [] + for report_file in page: + data.append({ + "Request ID": report_file.correspond_request_id, + "Redemption Number": report_file.correspond_redemption_id, + "Image type": report_file.doc_type, + "IMEI_user submitted": 
first_of_list(report_file.feedback_result.get("imei_number", [None])), + "IMEI_OCR retrieved": first_of_list(report_file.predict_result.get("imei_number", [None])), + "IMEI1 Accuracy": first_of_list(report_file.feedback_accuracy.get("imei_number", [None])), + "Invoice_Purchase Date_Consumer": report_file.feedback_result.get("purchase_date", None), + "Invoice_Purchase Date_OCR": report_file.predict_result.get("purchase_date", []), + "Invoice_Purchase Date Accuracy": first_of_list(report_file.feedback_accuracy.get("purchase_date", [None])), + "Invoice_Retailer_Consumer": report_file.feedback_result.get("retailername", None), + "Invoice_Retailer_OCR": report_file.predict_result.get("retailername", None), + "Invoice_Retailer Accuracy": first_of_list(report_file.feedback_accuracy.get("retailername", [None])), + "OCR Image Accuracy": report_file.acc, + "OCR Image Speed (seconds)": report_file.time_cost, + "Reviewed?": "No", + "Bad Image Reasons": report_file.bad_image_reason, + "Countermeasures": report_file.counter_measures, + "IMEI_Revised Accuracy": first_of_list(report_file.reviewed_accuracy.get("imei_number", [None])), + "Purchase Date_Revised Accuracy": first_of_list(report_file.reviewed_accuracy.get("purchase_date", [None])), + "Retailer_Revised Accuracy": first_of_list(report_file.reviewed_accuracy.get("retailername", [None])), + }) + + response = { + 'report_detail': data, + 'page': { + 'number': page.number, + 'total_pages': page.paginator.num_pages, + 'count': page.paginator.count, + } + } + return JsonResponse(response, status=200) + + return JsonResponse({'error': 'Invalid request method.'}, status=405) + + @extend_schema( + parameters=[ + OpenApiParameter( + name='start_date', + location=OpenApiParameter.QUERY, + description='Start date (YYYY-mm-DDTHH:MM:SSZ)', + type=OpenApiTypes.DATE, + default='2023-01-02T00:00:00+0700', + ), + OpenApiParameter( + name='end_date', + location=OpenApiParameter.QUERY, + description='End date (YYYY-mm-DDTHH:MM:SSZ)', + type=OpenApiTypes.DATE, + default='2024-01-10T00:00:00+0700', + ), + OpenApiParameter( + name='daily_report_only', + location=OpenApiParameter.QUERY, + description='Whether to return daily reports only', + type=OpenApiTypes.BOOL, + ), + OpenApiParameter( + name='page', + location=OpenApiParameter.QUERY, + description='Page number', + type=OpenApiTypes.INT, + required=False + ), + OpenApiParameter( + name='page_size', + location=OpenApiParameter.QUERY, + description='Number of items per page', + type=OpenApiTypes.INT, + required=False + ), + ], + responses=None, tags=['Accuracy'] + ) + @action(detail=False, url_path="report_list", methods=["GET"]) + def get_report_list(self, request): + if request.method == 'GET': + daily_report_only = request.GET.get('daily_report_only', False) + start_date_str = request.GET.get('start_date', "") + end_date_str = request.GET.get('end_date', "") + page_number = int(request.GET.get('page', 1)) + page_size = int(request.GET.get('page_size', 10)) + + if not start_date_str or not end_date_str: + reports = Report.objects.all() + else: + try: + start_date = timezone.datetime.strptime(start_date_str, '%Y-%m-%dT%H:%M:%S%z') + end_date = timezone.datetime.strptime(end_date_str, '%Y-%m-%dT%H:%M:%S%z') + except ValueError: + raise InvalidException(excArgs="Date format") + base_query = Q(created_at__range=(start_date, end_date)) + if daily_report_only: + base_query &= Q(is_daily_report=True) + reports = Report.objects.filter(base_query).order_by('created_at') + + + paginator = Paginator(reports, page_size) + page = 
paginator.get_page(page_number) + + data = [] + for report in page: + data.append({ + "ID": report.id, + "Created Date": report.created_at, + "No. Requests": report.number_request, + "Status": report.status, + "Purchase Date Acc": report.reviewed_accuracy.get("purchase_date", None) if report.reviewed_accuracy else None, + "Retailer Acc": report.feedback_accuracy.get("retailername", None) if report.feedback_accuracy else None, + "IMEI Acc": report.feedback_accuracy.get("imei_number", None) if report.feedback_accuracy else None, + "Avg. Accuracy": report.feedback_accuracy.get("avg", None) if report.feedback_accuracy else None, + "Avg. Client Request Time": report.average_client_time.get("avg", 0) if report.average_client_time else 0, + "Avg. OCR Processing Time": report.average_OCR_time.get("avg", 0) if report.average_OCR_time else 0, + "report_id": report.report_id, + }) + + response = { + 'report_detail': data, + 'page': { + 'number': page.number, + 'total_pages': page.paginator.num_pages, + 'count': page.paginator.count, + } + } + return JsonResponse(response, status=200) + + return JsonResponse({'error': 'Invalid request method.'}, status=405) + + @extend_schema( + parameters=[ + OpenApiParameter( + name='start_date', + location=OpenApiParameter.QUERY, + description='Start date (YYYY-mm-DDTHH:MM:SSZ)', + type=OpenApiTypes.DATE, + default='2023-01-02T00:00:00+0700', + ), + OpenApiParameter( + name='end_date', + location=OpenApiParameter.QUERY, + description='End date (YYYY-mm-DDTHH:MM:SSZ)', + type=OpenApiTypes.DATE, + default='2024-01-10T00:00:00+0700', + ), + OpenApiParameter( + name='subsidiary', + location=OpenApiParameter.QUERY, + description='Subsidiary', + type=OpenApiTypes.STR, + ), + OpenApiParameter( + name='page', + location=OpenApiParameter.QUERY, + description='Page number', + type=OpenApiTypes.INT, + required=False + ), + OpenApiParameter( + name='page_size', + location=OpenApiParameter.QUERY, + description='Number of items per page', + type=OpenApiTypes.INT, + required=False + ), + ], + responses=None, tags=['Accuracy'] + ) + @action(detail=False, url_path="overview", methods=["GET"]) + def overview(self, request): + if request.method == 'GET': + subsidiary = request.GET.get('subsidiary', None) + start_date_str = request.GET.get('start_date', "") + end_date_str = request.GET.get('end_date', "") + page_number = int(request.GET.get('page', 1)) + page_size = int(request.GET.get('page_size', 10)) + + + if not start_date_str or not end_date_str: + reports = Report.objects.all() + else: + try: + start_date = timezone.datetime.strptime(start_date_str, '%Y-%m-%dT%H:%M:%S%z') + end_date = timezone.datetime.strptime(end_date_str, '%Y-%m-%dT%H:%M:%S%z') + except ValueError: + raise InvalidException(excArgs="Date format") + base_query = Q(created_at__range=(start_date, end_date)) + if subsidiary: + base_query &= Q(subsidiary=subsidiary) + base_query &= Q(is_daily_report=True) + reports = Report.objects.filter(base_query).order_by('created_at') + + paginator = Paginator(reports, page_size) + page = paginator.get_page(page_number) + + data = [] + this_month_report = MonthReportAccumulate() + for report in page: + res = this_month_report.add(report) + if not(res): + _, _data, total = this_month_report() + data += [total] + data += _data + this_month_report = MonthReportAccumulate() + this_month_report.add(report) + else: + continue + _, _data, total = this_month_report() + data += [total] + data += _data + + response = { + 'overview_data': data, + 'page': { + 'number': page.number, + 
'total_pages': page.paginator.num_pages, + 'count': page.paginator.count, + } + } + return JsonResponse(response, status=200) + + return JsonResponse({'error': 'Invalid request method.'}, status=405) + class RequestViewSet(viewsets.ViewSet): lookup_field = "username" @@ -269,4 +622,4 @@ class RequestViewSet(viewsets.ViewSet): return JsonResponse({'message': 'success.'}, status=200) else: - return JsonResponse({'error': 'Invalid request method.'}, status=405) \ No newline at end of file + return JsonResponse({'error': 'Invalid request method.'}, status=405) diff --git a/cope2n-api/fwd_api/celery_worker/client_connector.py b/cope2n-api/fwd_api/celery_worker/client_connector.py index 16c7dd5..5e0d59c 100755 --- a/cope2n-api/fwd_api/celery_worker/client_connector.py +++ b/cope2n-api/fwd_api/celery_worker/client_connector.py @@ -34,13 +34,16 @@ class CeleryConnector: 'upload_obj_to_s3': {'queue': "upload_obj_to_s3"}, 'remove_local_file': {'queue': "remove_local_file"}, 'csv_feedback': {'queue': "csv_feedback"}, + 'make_a_report': {'queue': "report"}, } app = Celery( 'postman', broker=settings.BROKER_URL, broker_transport_options={'confirm_publish': False}, - ) + ) + def make_a_report(self, args): + return self.send_task('make_a_report', args) def csv_feedback(self, args): return self.send_task('csv_feedback', args) def do_pdf(self, args): diff --git a/cope2n-api/fwd_api/celery_worker/process_report_tasks.py b/cope2n-api/fwd_api/celery_worker/process_report_tasks.py new file mode 100644 index 0000000..c5b2a86 --- /dev/null +++ b/cope2n-api/fwd_api/celery_worker/process_report_tasks.py @@ -0,0 +1,149 @@ +import time +import uuid +import os +import base64 +import traceback +from multiprocessing.pool import ThreadPool + +from fwd_api.models import SubscriptionRequest, UserProfile +from fwd_api.celery_worker.worker import app +from ..constant.common import FolderFileType, image_extensions +from ..exception.exceptions import FileContentInvalidException +from fwd_api.models import SubscriptionRequestFile, FeedbackRequest, Report +from ..utils import file as FileUtils +from ..utils import process as ProcessUtil +from ..utils import s3 as S3Util +from ..utils.accuracy import update_temp_accuracy, IterAvg, calculate_and_save_subcription_file +from fwd_api.constant.common import ProcessType +from django.utils import timezone +from django.db.models import Q +import csv +import json + +from celery.utils.log import get_task_logger +from fwd import settings + + +logger = get_task_logger(__name__) + +s3_client = S3Util.MinioS3Client( + endpoint=settings.S3_ENDPOINT, + access_key=settings.S3_ACCESS_KEY, + secret_key=settings.S3_SECRET_KEY, + bucket_name=settings.S3_BUCKET_NAME +) + +def mean_list(l): + l = [x for x in l if x is not None] + if len(l) == 0: + return 0 + return sum(l)/len(l) + +@app.task(name='make_a_report') +def make_a_report(report_id, query_set): + try: + start_date = timezone.datetime.strptime(query_set["start_date_str"], '%Y-%m-%dT%H:%M:%S%z') + end_date = timezone.datetime.strptime(query_set["end_date_str"], '%Y-%m-%dT%H:%M:%S%z') + base_query = Q(created_at__range=(start_date, end_date)) + if query_set["request_id"]: + base_query &= Q(request_id=query_set["request_id"]) + if query_set["redemption_id"]: + base_query &= Q(redemption_id=query_set["redemption_id"]) + base_query &= Q(is_test_request=False) + if isinstance(query_set["include_test"], str): + query_set["include_test"] = True if query_set["include_test"].lower() in ["true", "yes", "1"] else False + if query_set["include_test"]: + # 
base_query = ~base_query + base_query.children = base_query.children[:-1] + + elif isinstance(query_set["include_test"], bool): + if query_set["include_test"]: + base_query = ~base_query + if isinstance(query_set["subsidiary"], str): + if query_set["subsidiary"] and query_set["subsidiary"].lower().replace(" ", "")!="all": + base_query &= Q(redemption_id__startswith=query_set["subsidiary"]) + if isinstance(query_set["is_reviewed"], str): + if query_set["is_reviewed"] == "reviewed": + base_query &= Q(is_reviewed=True) + elif query_set["is_reviewed"] == "not reviewed": + base_query &= Q(is_reviewed=False) + # elif query_set["is_reviewed"] == "all": + # pass + + errors = [] + # Create a placeholder to fill + accuracy = {"feedback" :{"imei_number": IterAvg(), + "purchase_date": IterAvg(), + "retailername": IterAvg(), + "sold_to_party": IterAvg(),}, + "reviewed" :{"imei_number": IterAvg(), + "purchase_date": IterAvg(), + "retailername": IterAvg(), + "sold_to_party": IterAvg(),} + } # {"imei": {"acc": 0.1, count: 1}, ...} + time_cost = {"invoice": IterAvg(), + "imei": IterAvg()} + number_images = 0 + number_bad_images = 0 + # TODO: Multithreading + # Calculate accuracy, processing time, ....Then save. + subscription_requests = SubscriptionRequest.objects.filter(base_query).order_by('created_at') + report: Report = \ + Report.objects.filter(report_id=report_id).first() + # TODO: number of transaction by doc type + num_request = 0 + for request in subscription_requests: + if request.status != 200 or not (request.reviewed_result or request.feedback_result): + # Failed requests or lack of reviewed_result/feedback_result + continue + request_att = calculate_and_save_subcription_file(report, request) + + request.feedback_accuracy = {"imei_number" : mean_list(request_att["acc"]["feedback"].get("imei_number", [None])), + "purchase_date" : mean_list(request_att["acc"]["feedback"].get("purchase_date", [None])), + "retailername" : mean_list(request_att["acc"]["feedback"].get("retailername", [None])), + "sold_to_party" : mean_list(request_att["acc"]["feedback"].get("sold_to_party", [None]))} + request.reviewed_accuracy = {"imei_number" : mean_list(request_att["acc"]["reviewed"].get("imei_number", [None])), + "purchase_date" : mean_list(request_att["acc"]["reviewed"].get("purchase_date", [None])), + "retailername" : mean_list(request_att["acc"]["reviewed"].get("retailername", [None])), + "sold_to_party" : mean_list(request_att["acc"]["reviewed"].get("sold_to_party", [None]))} + request.save() + number_images += request_att["total_images"] + number_bad_images += request_att["bad_images"] + update_temp_accuracy(accuracy["feedback"], request_att["acc"]["feedback"], keys=["imei_number", "purchase_date", "retailername", "sold_to_party"]) + update_temp_accuracy(accuracy["reviewed"], request_att["acc"]["reviewed"], keys=["imei_number", "purchase_date", "retailername", "sold_to_party"]) + + time_cost["imei"].add(request_att["time_cost"].get("imei", [])) + time_cost["invoice"].add(request_att["time_cost"].get("invoice", [])) + + errors += request_att["err"] + num_request += 1 + # Do saving process + report.number_request = num_request + report.number_images = number_images + report.number_imei = time_cost["imei"].count + report.number_invoice = time_cost["invoice"].count + report.number_bad_images = number_bad_images + report.average_OCR_time = {"invoice": time_cost["invoice"](), "imei": time_cost["imei"](), + "invoice_count": time_cost["invoice"].count, "imei_count": time_cost["imei"].count} + + acumulated_acc = 
{"feedback": {}, + "reviewed": {}} + + for acc_type in ["feedback", "reviewed"]: + for key in ["imei_number", "purchase_date", "retailername", "sold_to_party"]: + acumulated_acc[acc_type][key] = accuracy[acc_type][key]() + acumulated_acc[acc_type][key+"_count"] = accuracy[acc_type][key].count + + report.feedback_accuracy = acumulated_acc["feedback"] + report.reviewed_accuracy = acumulated_acc["reviewed"] + + report.errors = "|".join(errors) + report.save() + except IndexError as e: + print(e) + traceback.print_exc() + print("NotFound request by report id, %d", report_id) + except Exception as e: + print("[ERROR]: an error occured while processing report: ", report_id) + traceback.print_exc() + return 400 \ No newline at end of file diff --git a/cope2n-api/fwd_api/celery_worker/worker.py b/cope2n-api/fwd_api/celery_worker/worker.py index a056266..31a3262 100755 --- a/cope2n-api/fwd_api/celery_worker/worker.py +++ b/cope2n-api/fwd_api/celery_worker/worker.py @@ -12,7 +12,7 @@ django.setup() app: Celery = Celery( 'postman', broker=settings.BROKER_URL, - include=['fwd_api.celery_worker.process_result_tasks', 'fwd_api.celery_worker.internal_task'], + include=['fwd_api.celery_worker.process_result_tasks', 'fwd_api.celery_worker.internal_task', 'fwd_api.celery_worker.process_report_tasks'], broker_transport_options={'confirm_publish': False}, ) @@ -40,6 +40,7 @@ app.conf.update({ Queue('upload_obj_to_s3'), Queue('remove_local_file'), Queue('csv_feedback'), + Queue('report'), ], 'task_routes': { @@ -57,6 +58,7 @@ app.conf.update({ 'upload_obj_to_s3': {'queue': "upload_obj_to_s3"}, 'remove_local_file': {'queue': "remove_local_file"}, 'csv_feedback': {'queue': "csv_feedback"}, + 'make_a_report': {'queue': "report"}, } }) diff --git a/cope2n-api/fwd_api/management/commands/migrate-database-010224.py b/cope2n-api/fwd_api/management/commands/migrate-database-010224.py new file mode 100644 index 0000000..bc81388 --- /dev/null +++ b/cope2n-api/fwd_api/management/commands/migrate-database-010224.py @@ -0,0 +1,71 @@ +# myapp/management/commands/mycustomcommand.py +from django.core.management.base import BaseCommand +from tqdm import tqdm +from fwd_api.models import SubscriptionRequestFile, SubscriptionRequest +from fwd_api.utils.accuracy import predict_result_to_ready +import traceback +import copy + +class Command(BaseCommand): + help = 'Refactor database for image level' + + def add_arguments(self, parser): + # Add your command-line arguments here + parser.add_argument('test', type=str, help='Value for the argument') + + + def process_request(self, request): + if len(request.request_id.split(".")[0].split("_")) < 2: + return + images = SubscriptionRequestFile.objects.filter(request=request) + time_cost = {"imei": [], "invoice": [], "all": []} + if request.ai_inference_profile is None: + time_cost["imei"] = [-1 for _ in range(len(images))] + time_cost["invoice"] = [-1] + time_cost["all"] = [-1] + else: + for k, v in request.ai_inference_profile.items(): + time_cost[k.split("_")[0]].append(v["inference"][1][0] - v["inference"][0] + (v["postprocess"][1]-v["postprocess"][0])) + for i, image in enumerate(images): + # temp_imei_SAP_20240127223644_a493434edbf84fc08aeb87ef6cdde102_0.jpg + try: + image.index_in_request = int(image.file_name.split(".")[0].split("_")[-1]) if len(image.file_name.split(".")[0].split("_")) > 4 else 0 + image.doc_type = image.file_name.split(".")[0].split("_")[1] if len(image.file_name.split(".")[0].split("_")) > 4 else "all" + image.processing_time = 
time_cost[image.doc_type][image.index_in_request] + if not request.predict_result: + raise KeyError(f"Key predict_result not found in {request.request_id}") + if request.predict_result.get("status", 200) != 200: + raise AttributeError(f"Failed request: {request.request_id}") + _predict_result = copy.deepcopy(predict_result_to_ready(request.predict_result)) + _feedback_result = copy.deepcopy(request.feedback_result) + _reviewed_result = copy.deepcopy(request.reviewed_result) + + if image.doc_type == "invoice": + _predict_result["imei_number"] = [] + if _feedback_result: + _feedback_result["imei_number"] = [] + else: + None + if _reviewed_result: + _reviewed_result["imei_number"] = [] + else: + None + else: + _predict_result = {"retailername": None, "sold_to_party": None, "purchase_date": [], "imei_number": [_predict_result["imei_number"][image.index_in_request]]} + _feedback_result = {"retailername": None, "sold_to_party": None, "purchase_date": None, "imei_number": [_feedback_result["imei_number"][image.index_in_request]]} if _feedback_result else None + _reviewed_result = {"retailername": None, "sold_to_party": None, "purchase_date": None, "imei_number": [_reviewed_result["imei_number"][image.index_in_request]]} if _reviewed_result else None + image.predict_result = _predict_result + image.feedback_result = _feedback_result + image.reviewed_result = _reviewed_result + image.save() + except Exception as e: + self.stdout.write(self.style.ERROR(f"Request: {request.request_id} failed with {e}")) + print(traceback.format_exc()) + continue + + def handle(self, *args, **options): + test = options['test'] + subcription_iter = SubscriptionRequest.objects.all() + for request in tqdm(subcription_iter.iterator()): + self.process_request(request) + self.stdout.write(self.style.SUCCESS('Sample Django management command executed successfully!')) diff --git a/cope2n-api/fwd_api/migrations/0167_report_remove_subscriptionrequestfile_accuracy_and_more.py b/cope2n-api/fwd_api/migrations/0167_report_remove_subscriptionrequestfile_accuracy_and_more.py new file mode 100644 index 0000000..bb1a36f --- /dev/null +++ b/cope2n-api/fwd_api/migrations/0167_report_remove_subscriptionrequestfile_accuracy_and_more.py @@ -0,0 +1,102 @@ +# Generated by Django 4.1.3 on 2024-01-25 06:22 + +from django.db import migrations, models +import django.utils.timezone + + +class Migration(migrations.Migration): + + dependencies = [ + ('fwd_api', '0166_remove_subscriptionrequest_is_bad_image_quality_and_more'), + ] + + operations = [ + migrations.CreateModel( + name='Report', + fields=[ + ('id', models.AutoField(primary_key=True, serialize=False)), + ('report_id', models.CharField(max_length=200)), + ('local_file_name', models.CharField(max_length=200)), + ('error_status', models.JSONField(null=True)), + ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ('updated_at', models.DateTimeField(auto_now=True)), + ('start_at', models.DateTimeField(null=True)), + ('end_at', models.DateTimeField(null=True)), + ('include_for_test_sample', models.BooleanField(default=False)), + ('status', models.CharField(max_length=100)), + ('is_daily_report', models.BooleanField(default=False)), + ('errors', models.TextField(default='')), + ('S3_uploaded', models.BooleanField(default=False)), + ('number_request', models.IntegerField(default=0)), + ('number_images', models.IntegerField(default=0)), + ('number_bad_images', models.IntegerField(default=0)), + ('average_client_time_profile', models.JSONField(null=True)), + 
('average_OCR_time_profile', models.JSONField(null=True)), + ('average_OCR_time', models.JSONField(null=True)), + ('average_client_time', models.JSONField(null=True)), + ('imei_accuracy', models.FloatField(default=-1)), + ('purchase_date_accuracy', models.FloatField(default=-1)), + ('retailer_name_accuracy', models.FloatField(default=-1)), + ('sold_to_party_accuracy', models.FloatField(default=-1)), + ], + ), + migrations.RemoveField( + model_name='subscriptionrequestfile', + name='accuracy', + ), + migrations.AddField( + model_name='subscriptionrequest', + name='imei_accuracy', + field=models.FloatField(default=-1), + ), + migrations.AddField( + model_name='subscriptionrequest', + name='purchase_date_accuracy', + field=models.FloatField(default=-1), + ), + migrations.AddField( + model_name='subscriptionrequest', + name='retailer_name_accuracy', + field=models.FloatField(default=-1), + ), + migrations.AddField( + model_name='subscriptionrequest', + name='sold_to_party_accuracy', + field=models.FloatField(default=-1), + ), + migrations.AddField( + model_name='subscriptionrequestfile', + name='counter_measures', + field=models.TextField(blank=True), + ), + migrations.AddField( + model_name='subscriptionrequestfile', + name='imei_accuracy', + field=models.FloatField(default=-1), + ), + migrations.AddField( + model_name='subscriptionrequestfile', + name='processing_time', + field=models.IntegerField(default=-1), + ), + migrations.AddField( + model_name='subscriptionrequestfile', + name='purchase_date_accuracy', + field=models.FloatField(default=-1), + ), + migrations.AddField( + model_name='subscriptionrequestfile', + name='reason', + field=models.TextField(blank=True), + ), + migrations.AddField( + model_name='subscriptionrequestfile', + name='retailer_name_accuracy', + field=models.FloatField(default=-1), + ), + migrations.AddField( + model_name='subscriptionrequestfile', + name='sold_to_party_accuracy', + field=models.FloatField(default=-1), + ), + ] diff --git a/cope2n-api/fwd_api/migrations/0168_report_number_imei_transaction_and_more.py b/cope2n-api/fwd_api/migrations/0168_report_number_imei_transaction_and_more.py new file mode 100644 index 0000000..5b38f2c --- /dev/null +++ b/cope2n-api/fwd_api/migrations/0168_report_number_imei_transaction_and_more.py @@ -0,0 +1,23 @@ +# Generated by Django 4.1.3 on 2024-01-25 09:44 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('fwd_api', '0167_report_remove_subscriptionrequestfile_accuracy_and_more'), + ] + + operations = [ + migrations.AddField( + model_name='report', + name='number_imei_transaction', + field=models.IntegerField(default=0), + ), + migrations.AddField( + model_name='report', + name='number_ivoice_transaction', + field=models.IntegerField(default=0), + ), + ] diff --git a/cope2n-api/fwd_api/migrations/0169_report_include_reviewed_report_include_test_and_more.py b/cope2n-api/fwd_api/migrations/0169_report_include_reviewed_report_include_test_and_more.py new file mode 100644 index 0000000..8586cd6 --- /dev/null +++ b/cope2n-api/fwd_api/migrations/0169_report_include_reviewed_report_include_test_and_more.py @@ -0,0 +1,28 @@ +# Generated by Django 4.1.3 on 2024-01-25 11:17 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('fwd_api', '0168_report_number_imei_transaction_and_more'), + ] + + operations = [ + migrations.AddField( + model_name='report', + name='include_reviewed', + field=models.TextField(default=''), 
+ ), + migrations.AddField( + model_name='report', + name='include_test', + field=models.CharField(default='', max_length=200), + ), + migrations.AddField( + model_name='report', + name='subsidiary', + field=models.TextField(default=''), + ), + ] diff --git a/cope2n-api/fwd_api/migrations/0170_alter_report_errors_alter_report_include_reviewed_and_more.py b/cope2n-api/fwd_api/migrations/0170_alter_report_errors_alter_report_include_reviewed_and_more.py new file mode 100644 index 0000000..1bb2793 --- /dev/null +++ b/cope2n-api/fwd_api/migrations/0170_alter_report_errors_alter_report_include_reviewed_and_more.py @@ -0,0 +1,28 @@ +# Generated by Django 4.1.3 on 2024-01-25 11:19 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('fwd_api', '0169_report_include_reviewed_report_include_test_and_more'), + ] + + operations = [ + migrations.AlterField( + model_name='report', + name='errors', + field=models.TextField(default='', null=True), + ), + migrations.AlterField( + model_name='report', + name='include_reviewed', + field=models.TextField(default='', null=True), + ), + migrations.AlterField( + model_name='report', + name='subsidiary', + field=models.TextField(default='', null=True), + ), + ] diff --git a/cope2n-api/fwd_api/migrations/0171_rename_imei_accuracy_report_imei_accuracy_ocr_and_more.py b/cope2n-api/fwd_api/migrations/0171_rename_imei_accuracy_report_imei_accuracy_ocr_and_more.py new file mode 100644 index 0000000..fb95803 --- /dev/null +++ b/cope2n-api/fwd_api/migrations/0171_rename_imei_accuracy_report_imei_accuracy_ocr_and_more.py @@ -0,0 +1,112 @@ +# Generated by Django 4.1.3 on 2024-01-28 08:11 + +from django.db import migrations, models +import django.db.models.deletion +import django.utils.timezone + + +class Migration(migrations.Migration): + + dependencies = [ + ('fwd_api', '0170_alter_report_errors_alter_report_include_reviewed_and_more'), + ] + + operations = [ + migrations.RenameField( + model_name='report', + old_name='imei_accuracy', + new_name='imei_accuracy_ocr', + ), + migrations.RenameField( + model_name='report', + old_name='purchase_date_accuracy', + new_name='imei_accuracy_revised', + ), + migrations.RenameField( + model_name='report', + old_name='retailer_name_accuracy', + new_name='purchase_date_accuracy_ocr', + ), + migrations.RenameField( + model_name='report', + old_name='sold_to_party_accuracy', + new_name='purchase_date_accuracy_revised', + ), + migrations.AddField( + model_name='report', + name='retailer_name_accuracy_ocr', + field=models.FloatField(default=-1), + ), + migrations.AddField( + model_name='report', + name='retailer_name_accuracy_revised', + field=models.FloatField(default=-1), + ), + migrations.AddField( + model_name='report', + name='sold_to_party_accuracy_ocr', + field=models.FloatField(default=-1), + ), + migrations.AddField( + model_name='report', + name='sold_to_party_accuracy_revised', + field=models.FloatField(default=-1), + ), + migrations.AddField( + model_name='subscriptionrequestfile', + name='feedback_result', + field=models.JSONField(null=True), + ), + migrations.AddField( + model_name='subscriptionrequestfile', + name='predict_result', + field=models.JSONField(null=True), + ), + migrations.AddField( + model_name='subscriptionrequestfile', + name='reviewed_result', + field=models.JSONField(null=True), + ), + migrations.AlterField( + model_name='subscriptionrequestfile', + name='doc_type', + field=models.CharField(default='', max_length=10), + ), + migrations.CreateModel( 
+ name='ReportFile', + fields=[ + ('id', models.AutoField(primary_key=True, serialize=False)), + ('correspond_request_id', models.CharField(max_length=200)), + ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ('updated_at', models.DateTimeField(auto_now=True)), + ('S3_uploaded', models.BooleanField(default=False)), + ('doc_type', models.CharField(max_length=200)), + ('imei_feedback', models.CharField(default=None, max_length=200, null=True)), + ('purchase_date_feedback', models.CharField(default=None, max_length=200, null=True)), + ('retailer_feedback', models.CharField(default=None, max_length=200, null=True)), + ('sold_to_party_feedback', models.CharField(default=None, max_length=200, null=True)), + ('imei_ocr', models.CharField(default=None, max_length=200, null=True)), + ('purchase_date_ocr', models.CharField(default=None, max_length=200, null=True)), + ('retailer_ocr', models.CharField(default=None, max_length=200, null=True)), + ('sold_to_party_ocr', models.CharField(default=None, max_length=200, null=True)), + ('imei_revised', models.CharField(default=None, max_length=200, null=True)), + ('purchase_date_revised', models.CharField(default=None, max_length=200, null=True)), + ('retailer_revised', models.CharField(default=None, max_length=200, null=True)), + ('sold_to_party_revised', models.CharField(default=None, max_length=200, null=True)), + ('imei_acc_feedback', models.FloatField(default=None, null=True)), + ('purchase_date_acc_feedback', models.FloatField(default=None, null=True)), + ('retailer_acc_feedback', models.FloatField(default=None, null=True)), + ('sold_to_party_acc_feedback', models.CharField(default=None, max_length=200, null=True)), + ('acc_feedback', models.FloatField(default=None, null=True)), + ('imei_acc_revised', models.FloatField(default=None, null=True)), + ('purchase_date_acc_revised', models.FloatField(default=None, null=True)), + ('retailer_acc_revised', models.FloatField(default=None, null=True)), + ('acc_revised', models.FloatField(default=None, null=True)), + ('time_cost', models.FloatField(default=0)), + ('is_reviewed', models.CharField(default='NA', max_length=5)), + ('bad_image_reason', models.TextField(default='')), + ('countermeasures', models.TextField(default='')), + ('report', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='files', to='fwd_api.report')), + ], + ), + ] diff --git a/cope2n-api/fwd_api/migrations/0172_alter_subscriptionrequestfile_imei_accuracy_and_more.py b/cope2n-api/fwd_api/migrations/0172_alter_subscriptionrequestfile_imei_accuracy_and_more.py new file mode 100644 index 0000000..504bb65 --- /dev/null +++ b/cope2n-api/fwd_api/migrations/0172_alter_subscriptionrequestfile_imei_accuracy_and_more.py @@ -0,0 +1,38 @@ +# Generated by Django 4.1.3 on 2024-01-28 09:27 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('fwd_api', '0171_rename_imei_accuracy_report_imei_accuracy_ocr_and_more'), + ] + + operations = [ + migrations.AlterField( + model_name='subscriptionrequestfile', + name='imei_accuracy', + field=models.FloatField(default=None, null=True), + ), + migrations.AlterField( + model_name='subscriptionrequestfile', + name='processing_time', + field=models.FloatField(default=-1), + ), + migrations.AlterField( + model_name='subscriptionrequestfile', + name='purchase_date_accuracy', + field=models.FloatField(default=None, null=True), + ), + migrations.AlterField( + model_name='subscriptionrequestfile', + 
name='retailer_name_accuracy', + field=models.FloatField(default=None, null=True), + ), + migrations.AlterField( + model_name='subscriptionrequestfile', + name='sold_to_party_accuracy', + field=models.FloatField(default=None, null=True), + ), + ] diff --git a/cope2n-api/fwd_api/migrations/0173_rename_countermeasures_reportfile_counter_measures_and_more.py b/cope2n-api/fwd_api/migrations/0173_rename_countermeasures_reportfile_counter_measures_and_more.py new file mode 100644 index 0000000..e40c9d4 --- /dev/null +++ b/cope2n-api/fwd_api/migrations/0173_rename_countermeasures_reportfile_counter_measures_and_more.py @@ -0,0 +1,226 @@ +# Generated by Django 4.1.3 on 2024-01-28 18:00 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('fwd_api', '0172_alter_subscriptionrequestfile_imei_accuracy_and_more'), + ] + + operations = [ + migrations.RenameField( + model_name='reportfile', + old_name='countermeasures', + new_name='counter_measures', + ), + migrations.RemoveField( + model_name='report', + name='imei_accuracy_ocr', + ), + migrations.RemoveField( + model_name='report', + name='imei_accuracy_revised', + ), + migrations.RemoveField( + model_name='report', + name='purchase_date_accuracy_ocr', + ), + migrations.RemoveField( + model_name='report', + name='purchase_date_accuracy_revised', + ), + migrations.RemoveField( + model_name='report', + name='retailer_name_accuracy_ocr', + ), + migrations.RemoveField( + model_name='report', + name='retailer_name_accuracy_revised', + ), + migrations.RemoveField( + model_name='report', + name='sold_to_party_accuracy_ocr', + ), + migrations.RemoveField( + model_name='report', + name='sold_to_party_accuracy_revised', + ), + migrations.RemoveField( + model_name='reportfile', + name='acc_feedback', + ), + migrations.RemoveField( + model_name='reportfile', + name='acc_revised', + ), + migrations.RemoveField( + model_name='reportfile', + name='imei_acc_feedback', + ), + migrations.RemoveField( + model_name='reportfile', + name='imei_acc_revised', + ), + migrations.RemoveField( + model_name='reportfile', + name='imei_feedback', + ), + migrations.RemoveField( + model_name='reportfile', + name='imei_ocr', + ), + migrations.RemoveField( + model_name='reportfile', + name='imei_revised', + ), + migrations.RemoveField( + model_name='reportfile', + name='purchase_date_acc_feedback', + ), + migrations.RemoveField( + model_name='reportfile', + name='purchase_date_acc_revised', + ), + migrations.RemoveField( + model_name='reportfile', + name='purchase_date_feedback', + ), + migrations.RemoveField( + model_name='reportfile', + name='purchase_date_ocr', + ), + migrations.RemoveField( + model_name='reportfile', + name='purchase_date_revised', + ), + migrations.RemoveField( + model_name='reportfile', + name='retailer_acc_feedback', + ), + migrations.RemoveField( + model_name='reportfile', + name='retailer_acc_revised', + ), + migrations.RemoveField( + model_name='reportfile', + name='retailer_feedback', + ), + migrations.RemoveField( + model_name='reportfile', + name='retailer_ocr', + ), + migrations.RemoveField( + model_name='reportfile', + name='retailer_revised', + ), + migrations.RemoveField( + model_name='reportfile', + name='sold_to_party_acc_feedback', + ), + migrations.RemoveField( + model_name='reportfile', + name='sold_to_party_feedback', + ), + migrations.RemoveField( + model_name='reportfile', + name='sold_to_party_ocr', + ), + migrations.RemoveField( + model_name='reportfile', + 
name='sold_to_party_revised', + ), + migrations.RemoveField( + model_name='subscriptionrequest', + name='imei_accuracy', + ), + migrations.RemoveField( + model_name='subscriptionrequest', + name='purchase_date_accuracy', + ), + migrations.RemoveField( + model_name='subscriptionrequest', + name='retailer_name_accuracy', + ), + migrations.RemoveField( + model_name='subscriptionrequest', + name='sold_to_party_accuracy', + ), + migrations.RemoveField( + model_name='subscriptionrequestfile', + name='imei_accuracy', + ), + migrations.RemoveField( + model_name='subscriptionrequestfile', + name='purchase_date_accuracy', + ), + migrations.RemoveField( + model_name='subscriptionrequestfile', + name='retailer_name_accuracy', + ), + migrations.RemoveField( + model_name='subscriptionrequestfile', + name='sold_to_party_accuracy', + ), + migrations.AddField( + model_name='report', + name='feedback_accuracy', + field=models.JSONField(null=True), + ), + migrations.AddField( + model_name='report', + name='reviewed_accuracy', + field=models.JSONField(null=True), + ), + migrations.AddField( + model_name='reportfile', + name='error', + field=models.TextField(default=''), + ), + migrations.AddField( + model_name='reportfile', + name='feedback_accuracy', + field=models.JSONField(null=True), + ), + migrations.AddField( + model_name='reportfile', + name='feedback_result', + field=models.JSONField(null=True), + ), + migrations.AddField( + model_name='reportfile', + name='predict_result', + field=models.JSONField(null=True), + ), + migrations.AddField( + model_name='reportfile', + name='reviewed_accuracy', + field=models.JSONField(null=True), + ), + migrations.AddField( + model_name='reportfile', + name='reviewed_result', + field=models.JSONField(null=True), + ), + migrations.AddField( + model_name='subscriptionrequest', + name='feedback_accuracy', + field=models.JSONField(null=True), + ), + migrations.AddField( + model_name='subscriptionrequest', + name='reviewed_accuracy', + field=models.JSONField(null=True), + ), + migrations.AddField( + model_name='subscriptionrequestfile', + name='feedback_accuracy', + field=models.JSONField(null=True), + ), + migrations.AddField( + model_name='subscriptionrequestfile', + name='reviewed_accuracy', + field=models.JSONField(null=True), + ), + ] diff --git a/cope2n-api/fwd_api/migrations/0174_reportfile_acc_reportfile_correspond_redemption_id_and_more.py b/cope2n-api/fwd_api/migrations/0174_reportfile_acc_reportfile_correspond_redemption_id_and_more.py new file mode 100644 index 0000000..d7c8142 --- /dev/null +++ b/cope2n-api/fwd_api/migrations/0174_reportfile_acc_reportfile_correspond_redemption_id_and_more.py @@ -0,0 +1,28 @@ +# Generated by Django 4.1.3 on 2024-01-29 05:59 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('fwd_api', '0173_rename_countermeasures_reportfile_counter_measures_and_more'), + ] + + operations = [ + migrations.AddField( + model_name='reportfile', + name='acc', + field=models.FloatField(default=0), + ), + migrations.AddField( + model_name='reportfile', + name='correspond_redemption_id', + field=models.CharField(default='', max_length=200), + ), + migrations.AlterField( + model_name='reportfile', + name='correspond_request_id', + field=models.CharField(default='', max_length=200), + ), + ] diff --git a/cope2n-api/fwd_api/migrations/0175_rename_number_ivoice_transaction_report_number_imei_and_more.py 
b/cope2n-api/fwd_api/migrations/0175_rename_number_ivoice_transaction_report_number_imei_and_more.py new file mode 100644 index 0000000..ae622bb --- /dev/null +++ b/cope2n-api/fwd_api/migrations/0175_rename_number_ivoice_transaction_report_number_imei_and_more.py @@ -0,0 +1,28 @@ +# Generated by Django 4.1.3 on 2024-01-30 12:29 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('fwd_api', '0174_reportfile_acc_reportfile_correspond_redemption_id_and_more'), + ] + + operations = [ + migrations.RenameField( + model_name='report', + old_name='number_ivoice_transaction', + new_name='number_imei', + ), + migrations.AddField( + model_name='report', + name='number_invoice', + field=models.IntegerField(default=0), + ), + migrations.AddField( + model_name='report', + name='number_invoice_transaction', + field=models.IntegerField(default=0), + ), + ] diff --git a/cope2n-api/fwd_api/models/Report.py b/cope2n-api/fwd_api/models/Report.py index ef03c59..340b305 100644 --- a/cope2n-api/fwd_api/models/Report.py +++ b/cope2n-api/fwd_api/models/Report.py @@ -13,19 +13,28 @@ class Report(models.Model): start_at = models.DateTimeField(null=True) end_at = models.DateTimeField(null=True) include_for_test_sample = models.BooleanField(default=False) - status = models.CharField(null=True) + status = models.CharField(max_length=100) is_daily_report = models.BooleanField(default=False) + errors = models.TextField(default="", null=True) + subsidiary = models.TextField(default="", null=True) + include_reviewed = models.TextField(default="", null=True) + include_test = models.CharField(max_length=200, default="") # Data S3_uploaded = models.BooleanField(default=False) number_request = models.IntegerField(default=0) number_images = models.IntegerField(default=0) number_bad_images = models.IntegerField(default=0) - average_client_time_profile = models.JSONField(default=0) # {"0.1": 100, 0.2: 200, ...} - average_OCR_time_profile = models.JSONField(default=0) # {"0.1": 98, 0.2: 202, ...} - average_OCR_time = models.JSONField(null=True) # {"invoice": 0.1, "imei": 0.1} + number_imei = models.IntegerField(default=0) + number_invoice = models.IntegerField(default=0) + + number_imei_transaction = models.IntegerField(default=0) + number_invoice_transaction = models.IntegerField(default=0) + + average_client_time_profile = models.JSONField(null=True) # {"0.1": 100, 0.2: 200, ...} | Future feature + average_OCR_time_profile = models.JSONField(null=True) # {"0.1": 98, 0.2: 202, ...} | Future feature + average_OCR_time = models.JSONField(null=True) # {"invoice": 0.1, "imei": 0.1} | Future feature average_client_time = models.JSONField(null=True) # {"invoice": 0.1, "imei": 0.1} - imei_accuracy = models.FloatField(default=-1) - purchase_date_accuracy = models.FloatField(default=-1) - retailer_name_accuracy = models.FloatField(default=-1) - sold_to_party_accuracy = models.FloatField(default=-1) \ No newline at end of file + + feedback_accuracy = models.JSONField(null=True) + reviewed_accuracy = models.JSONField(null=True) \ No newline at end of file diff --git a/cope2n-api/fwd_api/models/ReportFile.py b/cope2n-api/fwd_api/models/ReportFile.py new file mode 100644 index 0000000..f5ccaab --- /dev/null +++ b/cope2n-api/fwd_api/models/ReportFile.py @@ -0,0 +1,35 @@ +from django.db import models +from django.utils import timezone +from fwd_api.models.Subscription import Subscription +from fwd_api.models.SubscriptionRequest import SubscriptionRequest +from fwd_api.models.Report 
import Report + +class ReportFile(models.Model): + # Metadata + id = models.AutoField(primary_key=True) + correspond_request_id = models.CharField(max_length=200, default="") + correspond_redemption_id = models.CharField(max_length=200, default="") + created_at = models.DateTimeField(default=timezone.now, db_index=True) + updated_at = models.DateTimeField(auto_now=True) + report = models.ForeignKey(Report, related_name="files", on_delete=models.CASCADE) + + # Data + S3_uploaded = models.BooleanField(default=False) + doc_type = models.CharField(max_length=200) + + predict_result = models.JSONField(null=True) + feedback_result = models.JSONField(null=True) + reviewed_result = models.JSONField(null=True) + + feedback_accuracy = models.JSONField(null=True) + reviewed_accuracy = models.JSONField(null=True) + acc = models.FloatField(default=0) + + time_cost = models.FloatField(default=0) + is_reviewed = models.CharField(default="NA", max_length=5) # NA, No, Yes + bad_image_reason = models.TextField(default="") + counter_measures = models.TextField(default="") + error = models.TextField(default="") + + + diff --git a/cope2n-api/fwd_api/models/SubscriptionRequest.py b/cope2n-api/fwd_api/models/SubscriptionRequest.py index 3839c3a..9ca9ac2 100755 --- a/cope2n-api/fwd_api/models/SubscriptionRequest.py +++ b/cope2n-api/fwd_api/models/SubscriptionRequest.py @@ -21,10 +21,9 @@ class SubscriptionRequest(models.Model): updated_at = models.DateTimeField(auto_now=True) is_test_request = models.BooleanField(default=False) S3_uploaded = models.BooleanField(default=False) - imei_accuracy = models.FloatField(default=-1) - purchase_date_accuracy = models.FloatField(default=-1) - retailer_name_accuracy = models.FloatField(default=-1) - sold_to_party_accuracy = models.FloatField(default=-1) + + feedback_accuracy = models.JSONField(null=True) + reviewed_accuracy = models.JSONField(null=True) ai_inference_profile = models.JSONField(null=True) preprocessing_time = models.FloatField(default=-1) diff --git a/cope2n-api/fwd_api/models/SubscriptionRequestFile.py b/cope2n-api/fwd_api/models/SubscriptionRequestFile.py index 6293421..93d62f5 100755 --- a/cope2n-api/fwd_api/models/SubscriptionRequestFile.py +++ b/cope2n-api/fwd_api/models/SubscriptionRequestFile.py @@ -20,12 +20,15 @@ class SubscriptionRequestFile(models.Model): created_at = models.DateTimeField(default=timezone.now, db_index=True) updated_at = models.DateTimeField(auto_now=True) is_bad_image_quality = models.BooleanField(default=False) - doc_type = models.CharField(max_length=100, default="") - index_in_request = models.IntegerField(default=0) - processing_time = models.IntegerField(default=-1) # in milisecond + doc_type = models.CharField(max_length=10, default="") + index_in_request = models.IntegerField(default=0) # by doc_type + processing_time = models.FloatField(default=-1) # in milisecond reason = models.TextField(blank=True) counter_measures = models.TextField(blank=True) - imei_accuracy = models.FloatField(default=-1) - purchase_date_accuracy = models.FloatField(default=-1) - retailer_name_accuracy = models.FloatField(default=-1) - sold_to_party_accuracy = models.FloatField(default=-1) \ No newline at end of file + + predict_result = models.JSONField(null=True) + feedback_result = models.JSONField(null=True) + reviewed_result = models.JSONField(null=True) + + feedback_accuracy = models.JSONField(null=True) + reviewed_accuracy = models.JSONField(null=True) \ No newline at end of file diff --git a/cope2n-api/fwd_api/models/__init__.py 
b/cope2n-api/fwd_api/models/__init__.py index 3cfcd22..47f23f0 100755 --- a/cope2n-api/fwd_api/models/__init__.py +++ b/cope2n-api/fwd_api/models/__init__.py @@ -6,4 +6,7 @@ from .OcrTemplateBox import OcrTemplateBox from .PricingPlan import PricingPlan from .Subscription import Subscription from .FeedbackRequest import FeedbackRequest +from .Report import Report +from .ReportFile import ReportFile + diff --git a/cope2n-api/fwd_api/utils/accuracy.py b/cope2n-api/fwd_api/utils/accuracy.py new file mode 100644 index 0000000..56152f6 --- /dev/null +++ b/cope2n-api/fwd_api/utils/accuracy.py @@ -0,0 +1,417 @@ +import re +from datetime import datetime + +import copy +from .ocr_utils.ocr_metrics import eval_ocr_metric +from .ocr_utils.sbt_report import post_processing_str +from fwd_api.models import SubscriptionRequest, SubscriptionRequestFile, ReportFile +from ..celery_worker.client_connector import c_connector + +BAD_THRESHOLD = 0.75 + +valid_keys = ["retailername", "sold_to_party", "purchase_date", "imei_number"] + +class MonthReportAccumulate: + def __init__(self): + self.month = None + self.total = { + 'subs': "+", + 'extraction_date': "Subtotal ()", + 'total_images': 0, + 'images_quality': { + 'successful': 0, + 'successful_percent': 0, + 'bad': 0, + 'bad_percent': 0 + }, + 'average_accuracy_rate': { + 'imei': IterAvg(), + 'purchase_date': IterAvg(), + 'retailer_name': IterAvg() + }, + 'average_processing_time': { + 'imei': IterAvg(), + 'invoice': IterAvg() + }, + 'usage': { + 'imei':0, + 'invoice': 0 + } + } + self.data = [] + self.data_format = { + 'num_imei': 0, + 'num_invoice': 0, + 'total_images': 0, + 'images_quality': { + 'successful': 0, + 'successful_percent': 0, + 'bad': 0, + 'bad_percent': 0 + }, + 'average_accuracy_rate': { + 'imei': 0, + 'purchase_date': 0, + 'retailer_name': 0 + }, + 'average_processing_time': { + 'imei': 0, + 'invoice': 0 + }, + 'usage': { + 'imei':0, + 'invoice': 0 + } + }, + + def accumulate(self, report): + self.total["total_images"] += report.number_images + self.total["images_quality"]["successful"] += report.number_images - report.number_bad_images + self.total["images_quality"]["bad"] += report.number_bad_images + + if sum([report.reviewed_accuracy[x] for x in report.reviewed_accuracy.keys() if "_count" not in x]) > 0 : + self.total["average_accuracy_rate"]["imei"].add_avg(report.reviewed_accuracy.get("imei_number", 0), report.reviewed_accuracy.get("imei_number_count", 0)) + self.total["average_accuracy_rate"]["purchase_date"].add_avg(report.reviewed_accuracy.get("purchase_date", 0), report.reviewed_accuracy.get("purchase_date_count", 0)) + self.total["average_accuracy_rate"]["retailer_name"].add_avg(report.reviewed_accuracy.get("retailername", 0), report.reviewed_accuracy.get("retailername_count", 0)) + elif sum([ report.feedback_accuracy[x] for x in report.feedback_accuracy.keys() if "_count" not in x]) > 0: + self.total["average_accuracy_rate"]["imei"].add_avg(report.feedback_accuracy.get("imei_number", 0), report.feedback_accuracy.get("imei_number_count", 0)) + self.total["average_accuracy_rate"]["purchase_date"].add_avg(report.feedback_accuracy.get("purchase_date", 0), report.feedback_accuracy.get("purchase_date_count", 0)) + self.total["average_accuracy_rate"]["retailer_name"].add_avg(report.feedback_accuracy.get("retailername", 0), report.feedback_accuracy.get("retailername_count", 0)) + + self.total["average_processing_time"]["imei"].add_avg(report.average_OCR_time.get("imei", 0), report.average_OCR_time.get("imei_count", 0)) + 
self.total["average_processing_time"]["invoice"].add_avg(report.average_OCR_time.get("invoice", 0), report.average_OCR_time.get("invoice_count", 0)) + self.total["usage"]["imei"] += report.number_imei_transaction + self.total["usage"]["invoice"] += report.number_invoice_transaction + + def add(self, report): + report_month = report.created_at.month + + if self.month is None: + self.month = report_month + self.total["extraction_date"] = f"Subtotal ({self.month})" + elif self.month != report_month: + self.total["images_quality"]["successful_percent"] += self.total["images_quality"]["successful"]/self.total["total_images"] + self.total["images_quality"]["bad_percent"] += self.total["images_quality"]["bad"]/self.total["total_images"] + return False # Reports from a different month, stop accumulating + # accumulate fields + new_data = copy.deepcopy(self.data_format)[0] + new_data["num_imei"] = report.number_imei + new_data["num_invoice"] = report.number_invoice + new_data["total_images"] = report.number_images + new_data["images_quality"]["successful"] = report.number_images - report.number_bad_images + new_data["images_quality"]["bad"] = report.number_bad_images + + if sum([ report.reviewed_accuracy[x] for x in report.reviewed_accuracy.keys() if "_count" not in x]): + new_data["average_accuracy_rate"]["imei"] = report.reviewed_accuracy.get("imei_number", None) + new_data["average_accuracy_rate"]["purchase_date"] = report.reviewed_accuracy.get("purchase_date", None) + new_data["average_accuracy_rate"]["retailer_name"] = report.reviewed_accuracy.get("retailername", None) + elif sum([ report.feedback_accuracy[x] for x in report.feedback_accuracy.keys() if "_count" not in x]): + new_data["average_accuracy_rate"]["imei"] = report.feedback_accuracy.get("imei_number", None) + new_data["average_accuracy_rate"]["purchase_date"] = report.feedback_accuracy.get("purchase_date", None) + new_data["average_accuracy_rate"]["retailer_name"] = report.feedback_accuracy.get("retailername", None) + new_data["average_processing_time"]["imei"] = report.average_OCR_time.get("imei", 0) + new_data["average_processing_time"]["invoice"] = report.average_OCR_time.get("invoice", 0) + new_data["usage"]["imei"] = report.number_imei_transaction + new_data["usage"]["invoice"] = report.number_invoice_transaction + + new_data["images_quality"]["successful_percent"] += new_data["images_quality"]["successful"]/new_data["total_images"] + new_data["images_quality"]["bad_percent"] += new_data["images_quality"]["bad"]/new_data["total_images"] + self.data.append(new_data) + self.accumulate(report) + return True + + def __call__(self): + self.total["images_quality"]["successful_percent"] += self.total["images_quality"]["successful"]/self.total["total_images"] + self.total["images_quality"]["bad_percent"] += self.total["images_quality"]["bad"]/self.total["total_images"] + total = copy.deepcopy(self.total) + total["average_accuracy_rate"]["imei"] = total["average_accuracy_rate"]["imei"]() + total["average_accuracy_rate"]["purchase_date"] = total["average_accuracy_rate"]["purchase_date"]() + total["average_accuracy_rate"]["retailer_name"] = total["average_accuracy_rate"]["retailer_name"]() + total["average_processing_time"]["imei"] = total["average_processing_time"]["imei"]() + total["average_processing_time"]["invoice"] = total["average_processing_time"]["invoice"]() + return self.month, self.data, total + +class IterAvg: + def __init__(self, name="default"): + self.name = name + self.avg = 0 + self.count = 0 + + def add(self, values): + 
""" + Args: + values (list[float]): + """ + values = [x for x in values if x is not None] + if len(values) == 0: + return + self.avg = (self.avg*self.count + sum(values))/(self.count+len(values)) + self.count += len(values) + + def add_avg(self, avg, count): + if avg is None or count is None or count == 0: + return + self.count += count + self.avg = (self.avg*(self.count-count) + avg*count)/(self.count) + + def __call__(self): + return self.avg + +def convert_datetime_format(date_string: str, is_gt=False) -> str: + # pattern_date_string = "2023-02-28" + input_format = "%Y-%m-%d" + output_format = "%d/%m/%Y" + # Validate the input date string format + pattern = r"\d{4}-\d{2}-\d{2}" + if re.match(pattern, date_string): + # Convert the date string to a datetime object + date_object = datetime.strptime(date_string, input_format) + # Convert the datetime object to the desired output format + formatted_date = date_object.strftime(output_format) + return formatted_date + return date_string + +def predict_result_to_ready(result): + dict_result = {"retailername": "", + "sold_to_party": "", + "purchase_date": [], + "imei_number": [],} + dict_result["retailername"] = result.get("content", {}).get("document", [{}])[0].get("content", [{}])[0].get("value", None) + dict_result["sold_to_party"] = result.get("content", {}).get("document", [{}])[0].get("content", [{}, {}])[1].get("value", None) + dict_result["purchase_date"] = result.get("content", {}).get("document", [{}])[0].get("content", [{}, {}, {}])[2].get("value", []) + dict_result["imei_number"] = result.get("content", {}).get("document", [{}])[0].get("content", [{}, {}, {}, {}])[3].get("value", []) + return dict_result + +def align_fine_result(ready_predict, fine_result): + # print(f"[DEBUG]: fine_result: {fine_result}") + # print(f"[DEBUG]: ready_predict: {ready_predict}") + if fine_result: + if fine_result["purchase_date"] and len(ready_predict["purchase_date"]) == 0: + ready_predict["purchase_date"] = [None] + if fine_result["retailername"] and not ready_predict["retailername"]: + ready_predict["retailername"] = [None] + fine_result["purchase_date"] = [fine_result["purchase_date"] for _ in range(len(ready_predict["purchase_date"]))] + # else: + # fine_result = {} + # for key in ready_predict.keys(): + # fine_result[key] = [] + # fine_result["purchase_date"] = [None for _ in range(len(ready_predict["purchase_date"]))] + return ready_predict, fine_result + +def update_temp_accuracy(accuracy, acc, keys): + for key in keys: + accuracy[key].add(acc[key]) + return accuracy +def calculate_accuracy(key_name, inference, target): + """_summary_ + + Args: + key_name (string): key to calculate accuracy on, ex: retailername + inference (dict): result from ocr, refined to align with the target down below + target (dict): result of type + """ + acc = [] + data = [] + + if not target or not inference: + return acc, data + if not isinstance(inference[key_name], list): + if inference[key_name] is None: + inference[key_name] = [] + else: + inference[key_name] = [inference[key_name]] + if not isinstance(target[key_name], list): + if target[key_name] is None: + target[key_name] = [] + else: + target[key_name] = [target[key_name]] + for i, v in enumerate(inference[key_name]): + # TODO: target[key_name][i] is None, "" + x = post_processing_str(key_name, inference[key_name][i], is_gt=False) + y = post_processing_str(key_name, target[key_name][i], is_gt=True) + + score = eval_ocr_metric( + [x], + [y], + metric=[ + "one_minus_ned", + # "line_acc_ignore_case_symbol", + # 
"line_acc", + # "one_minus_ned_word", + ]) + acc.append(list(score.values())[0]) + data.append([x, y]) + return acc, data + +def calculate_avg_accuracy(acc, type, keys=[]): + acc_list = [] + # print(f"[DEBUG]: type: {type} - acc: {acc}") + for key in keys: + acc_list += acc.get(type, {}).get(key, []) + + acc_list = [x for x in acc_list if x is not None] + return sum(acc_list)/len(acc_list) if len(acc_list) > 0 else None + + +def calculate_and_save_subcription_file(report, request): + request_att = {"acc": {"feedback": {"imei_number": [], + "purchase_date": [], + "retailername": [], + "sold_to_party": [], + }, + "reviewed": {"imei_number": [], + "purchase_date": [], + "retailername": [], + "sold_to_party": [], + }}, + "err": [], + "time_cost": {}, + "total_images": 0, + "bad_images": 0} + images = SubscriptionRequestFile.objects.filter(request=request) + for image in images: + status, att = calculate_subcription_file(image) + if status != 200: + continue + image.feedback_accuracy = att["acc"]["feedback"] + image.reviewed_accuracy = att["acc"]["reviewed"] + image.is_bad_image_quality = att["is_bad_image"] + image.save() + new_report_file = ReportFile(report=report, + correspond_request_id=request.request_id, + correspond_redemption_id=request.redemption_id, + doc_type=image.doc_type, + predict_result=image.predict_result, + feedback_result=image.feedback_result, + reviewed_result=image.reviewed_result, + feedback_accuracy=att["acc"]["feedback"], + reviewed_accuracy=att["acc"]["reviewed"], + acc=att["avg_acc"], + time_cost=image.processing_time, + bad_image_reason=image.reason, + counter_measures=image.counter_measures, + error="|".join(att["err"]) + ) + new_report_file.save() + if request_att["time_cost"].get(image.doc_type, None): + request_att["time_cost"][image.doc_type].append(image.processing_time) + else: + request_att["time_cost"][image.doc_type] = [image.processing_time] + try: + request_att["acc"]["feedback"]["imei_number"] += att["acc"]["feedback"]["imei_number"] + request_att["acc"]["feedback"]["purchase_date"] += att["acc"]["feedback"]["purchase_date"] + request_att["acc"]["feedback"]["retailername"] += att["acc"]["feedback"]["retailername"] + request_att["acc"]["feedback"]["sold_to_party"] += att["acc"]["feedback"]["sold_to_party"] + + request_att["acc"]["reviewed"]["imei_number"] += att["acc"]["reviewed"]["imei_number"] + request_att["acc"]["reviewed"]["purchase_date"] += att["acc"]["reviewed"]["purchase_date"] + request_att["acc"]["reviewed"]["retailername"] += att["acc"]["reviewed"]["retailername"] + request_att["acc"]["reviewed"]["sold_to_party"] += att["acc"]["reviewed"]["sold_to_party"] + + request_att["bad_images"] += int(att["is_bad_image"]) + request_att["total_images"] += 1 + request_att["err"] += att["err"] + except Exception as e: + print(e) + continue + + return request_att + + +def calculate_subcription_file(subcription_request_file): + att = {"acc": {"feedback": {}, + "reviewed": {}}, + "err": [], + "is_bad_image": False, + "avg_acc": None} + if not subcription_request_file.predict_result: + return 400, att + + inference_result = copy.deepcopy(subcription_request_file.predict_result) + inference_result, feedback_result = align_fine_result(inference_result, copy.deepcopy(subcription_request_file.feedback_result)) + inference_result, reviewed_result = align_fine_result(inference_result, copy.deepcopy(subcription_request_file.reviewed_result)) + # print(f"[DEBUG]: predict_result: {subcription_request_file.predict_result}") + # print(f"[DEBUG]: inference_result: 
{inference_result}") + # print(f"[DEBUG]: feedback_result: {feedback_result}") + # print(f"[DEBUG]: reviewed_result: {reviewed_result}") + + for key_name in valid_keys: + try: + att["acc"]["feedback"][key_name], _ = calculate_accuracy(key_name, inference_result, feedback_result) + att["acc"]["reviewed"][key_name], _ = calculate_accuracy(key_name, inference_result, reviewed_result) + except Exception as e: + att["err"].append(str(e)) + # print(f"[DEBUG]: e: {e} -key_name: {key_name}") + avg_reviewed = calculate_avg_accuracy(att["acc"], "reviewed", ["retailername", "sold_to_party", "purchase_date", "imei_number"]) + avg_feedback = calculate_avg_accuracy(att["acc"], "feedback", ["retailername", "sold_to_party", "purchase_date", "imei_number"]) + if avg_feedback is not None or avg_reviewed is not None: + avg_acc = max([x for x in [avg_feedback, avg_reviewed] if x is not None]) + if avg_acc < BAD_THRESHOLD: + att["is_bad_image"] = True + att["avg_acc"] = avg_acc + return 200, att + +def calculate_attributions(request): # for one request, return in order + acc = {"feedback": {}, + "reviewed": {}} # {"feedback": {"retailername": [0.1], "sold_to_party":[0.9], "purchase_date":[0.6], "imei_number":[0.8]}, + # "reviewed": {"retailername": [0.1], "sold_to_party":[0.9], "purchase_date":[0.6], "imei_number":[0.8]}} + data = {"feedback": {}, + "reviewed": {}} # {"feedback": {"retailername": [[ocr, feedback], ...], "sold_to_party":[[ocr, feedback], ...], "purchase_date":[[ocr, feedback], ...], "imei_number":[[ocr, feedback], ...]}} + # {"reviewed": {"retailername": [[ocr, reviewed], ...], "sold_to_party":[[ocr, reviewed], ...], "purchase_date":[[ocr, reviewed], ...], "imei_number":[[ocr, reviewed], ...]}} + time_cost = {} # {"imei": [0.1], "invoice": [0.1]} + image_quality_num = [0, 0] # [good, bad] + image_quality_num[0] = len(request.doc_type.split(",")) + error = "" + + inference_result = predict_result_to_ready(request.predict_result) + reviewed_result = align_fine_result(inference_result, request.reviewed_result) + feedback_result = align_fine_result(inference_result, request.feedback_result) + + # accuracy calculation + for key_name in valid_keys: + if isinstance(inference_result[key_name], list): + if len(inference_result[key_name]) != len(reviewed_result.get(key_name, [])): + error = f"Request {request.request_id} failed with different {key_name} in predict and reviewed_result" + break + if len(inference_result[key_name]) != len(feedback_result.get(key_name, [])): + error = f"Request {request.request_id} failed with different {key_name} in predict and feedback_result" + break + # calculate accuracy for feedback result + acc["feedback"][key_name], data["feedback"][key_name] = calculate_accuracy(key_name, inference_result, feedback_result) + acc["reviewed"][key_name], data["reviewed"][key_name] = calculate_accuracy(key_name, inference_result, reviewed_result) + else: + inference_result[key_name] = [inference_result[key_name]] + feedback_result[key_name] = [feedback_result[key_name]] + reviewed_result[key_name] = [reviewed_result[key_name]] + + acc["feedback"][key_name], data["feedback"][key_name] = calculate_accuracy(key_name, inference_result, feedback_result) + acc["reviewed"][key_name], data["reviewed"][key_name] = calculate_accuracy(key_name, inference_result, reviewed_result) + + acc["feedback"]["purchase_date"] = [max(acc["feedback"]["purchase_date"])] if len(acc["feedback"]["purchase_date"]) > 0 else [] + acc["reviewed"]["purchase_date"] = [max(acc["reviewed"]["purchase_date"])] if 
len(acc["reviewed"]["purchase_date"]) > 0 else [] + # Count for bad and total images + avg_invoice_feedback = calculate_avg_accuracy(acc, "feedback", ["retailername", "sold_to_party", "purchase_date"]) + avg_invoice_reviewed = calculate_avg_accuracy(acc, "reviewed", ["retailername", "sold_to_party", "purchase_date"]) + if avg_invoice_feedback is not None or avg_invoice_reviewed is not None: + if max([x for x in [avg_invoice_feedback, avg_invoice_reviewed] if x is not None]) < BAD_THRESHOLD: + image_quality_num[1] += 1 + for i, _ in enumerate(acc["feedback"]["imei_number"]): + if acc["feedback"]["imei_number"][i] is not None and acc["reviewed"]["imei_number"][i] is not None: + if max([x for x in [acc["feedback"]["imei_number"][i], acc["reviewed"]["imei_number"][i]] if x is not None]) < BAD_THRESHOLD: + image_quality_num[1] += 1 + # time cost and quality calculation + # TODO: to be deprecated; doc_type will move to the file level in the future + try: + for doc_type, doc_profile in request.ai_inference_profile.items(): + doc_type = doc_type.split("_")[0] + inference_time = doc_profile["inference"][1][0] - doc_profile["inference"][0] + postprocess_time = doc_profile["postprocess"][1] - doc_profile["postprocess"][0] + time_cost.setdefault(doc_type, []).append(inference_time + postprocess_time) + except Exception as e: + error = f"Request id {request.request_id} failed with error: {e}" + + return acc, data, time_cost, image_quality_num, error + +def shadow_report(report_id, query): + c_connector.make_a_report( + (report_id, query)) \ No newline at end of file diff --git a/cope2n-api/fwd_api/utils/file.py b/cope2n-api/fwd_api/utils/file.py index bd0b4c8..a4d364c 100644 --- a/cope2n-api/fwd_api/utils/file.py +++ b/cope2n-api/fwd_api/utils/file.py @@ -6,6 +6,7 @@ import json from PIL import Image, ExifTags from django.core.files.uploadedfile import TemporaryUploadedFile +from django.utils import timezone from fwd import settings from fwd_api.constant.common import allowed_file_extensions @@ -18,10 +19,33 @@ from fwd_api.utils.image import resize from ..celery_worker.client_connector import c_connector import imagesize import csv - from openpyxl import load_workbook from openpyxl.styles import Font, Border, Side, PatternFill, NamedStyle +def validate_report_list(request): + start_date_str = request.GET.get('start_date') + end_date_str = request.GET.get('end_date') + page_number = int(request.GET.get('page', 0)) + page_size = int(request.GET.get('page_size', 10)) + report_id = request.GET.get('report_id', None) + + validated_data = {} + validated_data["start_date"] = None + validated_data["end_date"] = None + + if start_date_str and end_date_str: + try: + validated_data["start_date"] = timezone.datetime.strptime(start_date_str, '%Y-%m-%dT%H:%M:%S%z') + validated_data["end_date"] = timezone.datetime.strptime(end_date_str, '%Y-%m-%dT%H:%M:%S%z') + except ValueError: + raise InvalidException(excArgs="Date format") + validated_data["report_id"] = report_id + validated_data["page_size"] = page_size + validated_data["page_number"] = page_number + if validated_data["report_id"] is None and validated_data["start_date"] is None: + raise RequiredFieldException(excArgs="report_id, start_date, end_date") + return validated_data + def validate_feedback_file(csv_file_path): required_columns = ['redemptionNumber', 'requestId', 'imeiNumber', 'imeiNumber2', 'Purchase Date', 'retailer', 'Sold to party', 'timetakenmilli'] missing_columns = [] @@ -57,7 +81,6 @@ def validate_list_file(files, 
max_file_num=settings.MAX_UPLOAD_FILES_IN_A_REQUES if total_file_size > settings.MAX_UPLOAD_FILE_SIZE_OF_A_REQUEST: raise LimitReachedException(excArgs=('Total size of all files', str(settings.MAX_UPLOAD_SIZE_OF_A_FILE / 1024 / 1024), 'MB')) - def validate_csv_feedback(files, max_file_num=1, min_file_num=1, file_field="csv files"): total_file_size = 0 if len(files) < min_file_num: diff --git a/cope2n-api/fwd_api/utils/ocr_utils/__init__.py b/cope2n-api/fwd_api/utils/ocr_utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cope2n-api/fwd_api/utils/ocr_utils/ocr_metrics.py b/cope2n-api/fwd_api/utils/ocr_utils/ocr_metrics.py new file mode 100644 index 0000000..71cd84d --- /dev/null +++ b/cope2n-api/fwd_api/utils/ocr_utils/ocr_metrics.py @@ -0,0 +1,385 @@ +import re +from pathlib import Path +from difflib import SequenceMatcher +from terminaltables import AsciiTable +from rapidfuzz.distance import Levenshtein + +from .wiki_diff import inline_diff + + +def is_type_list(x, type): + + if not isinstance(x, list): + return False + + return all(isinstance(item, type) for item in x) + + +def cal_true_positive_char(pred, gt): + """Calculate correct character number in prediction. + Args: + pred (str): Prediction text. + gt (str): Ground truth text. + Returns: + true_positive_char_num (int): The true positive number. + """ + + all_opt = SequenceMatcher(None, pred, gt) + true_positive_char_num = 0 + for opt, _, _, s2, e2 in all_opt.get_opcodes(): + if opt == "equal": + true_positive_char_num += e2 - s2 + else: + pass + return true_positive_char_num + + +def post_processing(text): + """ + - Remove special characters and extra spaces + lower case + """ + + text = re.sub( + r"[^aAàÀảẢãÃáÁạẠăĂằẰẳẲẵẴắẮặẶâÂầẦẩẨẫẪấẤậẬbBcCdDđĐeEèÈẻẺẽẼéÉẹẸêÊềỀểỂễỄếẾệỆfFgGhHiIìÌỉỈĩĨíÍịỊjJkKlLmMnNoOòÒỏỎõÕóÓọỌôÔồỒổỔỗỖốỐộỘơƠờỜởỞỡỠớỚợỢpPqQrRsStTuUùÙủỦũŨúÚụỤưƯừỪửỬữỮứỨựỰvVwWxXyYỳỲỷỶỹỸýÝỵỴzZ0123456789 ]", + " ", + text, + ) + text = re.sub(r"\s\s+", " ", text) + text = text.strip() + + return text + + +def count_matches(pred_texts, gt_texts, use_ignore=True): + """Count the various match number for metric calculation. + Args: + pred_texts (list[str]): Predicted text string. + gt_texts (list[str]): Ground truth text string. + Returns: + match_res: (dict[str: int]): Match number used for + metric calculation. 
+ """ + match_res = { + "gt_char_num": 0, + "pred_char_num": 0, + "true_positive_char_num": 0, + "gt_word_num": 0, + "match_word_num": 0, + "match_word_ignore_case": 0, + "match_word_ignore_case_symbol": 0, + "match_kie": 0, + "match_kie_ignore_case": 0, + } + # comp = re.compile('[^A-Z^a-z^0-9^\u4e00-\u9fa5]') + # comp = re.compile('[]') + norm_ed_sum = 0.0 + + gt_texts_for_ned_word = [] + pred_texts_for_ned_word = [] + for pred_text, gt_text in zip(pred_texts, gt_texts): + if gt_text == pred_text: + match_res["match_word_num"] += 1 + match_res["match_kie"] += 1 + gt_text_lower = str(gt_text).lower() + pred_text_lower = str(pred_text).lower() + + if gt_text_lower == pred_text_lower: + match_res["match_word_ignore_case"] += 1 + + # gt_text_lower_ignore = comp.sub('', gt_text_lower) + # pred_text_lower_ignore = comp.sub('', pred_text_lower) + if use_ignore: + gt_text_lower_ignore = post_processing(gt_text_lower) + pred_text_lower_ignore = post_processing(pred_text_lower) + else: + gt_text_lower_ignore = gt_text_lower + pred_text_lower_ignore = pred_text_lower + + if gt_text_lower_ignore == pred_text_lower_ignore: + match_res["match_kie_ignore_case"] += 1 + + gt_texts_for_ned_word.append(gt_text_lower_ignore.split(" ")) + pred_texts_for_ned_word.append(pred_text_lower_ignore.split(" ")) + + match_res["gt_word_num"] += 1 + + norm_ed = Levenshtein.normalized_distance( + pred_text_lower_ignore, gt_text_lower_ignore + ) + # if norm_ed > 0.1: + # print(gt_text_lower_ignore, pred_text_lower_ignore, sep='\n') + # print("-"*20) + norm_ed_sum += norm_ed + + # number to calculate char level recall & precision + match_res["gt_char_num"] += len(gt_text_lower_ignore) + match_res["pred_char_num"] += len(pred_text_lower_ignore) + true_positive_char_num = cal_true_positive_char( + pred_text_lower_ignore, gt_text_lower_ignore + ) + match_res["true_positive_char_num"] += true_positive_char_num + + normalized_edit_distance = norm_ed_sum / max(1, len(gt_texts)) + match_res["ned"] = normalized_edit_distance + + # NED for word-level + norm_ed_word_sum = 0.0 + # print(pred_texts_for_ned_word[0]) + unique_words = list( + set( + [x for line in pred_texts_for_ned_word for x in line] + + [x for line in gt_texts_for_ned_word for x in line] + ) + ) + preds = [ + [unique_words.index(w) for w in pred_text_for_ned_word] + for pred_text_for_ned_word in pred_texts_for_ned_word + ] + truths = [ + [unique_words.index(w) for w in gt_text_for_ned_word] + for gt_text_for_ned_word in gt_texts_for_ned_word + ] + for pred_text, gt_text in zip(preds, truths): + norm_ed_word = Levenshtein.normalized_distance(pred_text, gt_text) + # if norm_ed_word < 0.2: + # print(pred_text, gt_text) + norm_ed_word_sum += norm_ed_word + + normalized_edit_distance_word = norm_ed_word_sum / max(1, len(gt_texts)) + match_res["ned_word"] = normalized_edit_distance_word + + return match_res + + +def eval_ocr_metric(pred_texts, gt_texts, metric="acc"): + """Evaluate the text recognition performance with metric: word accuracy and + 1-N.E.D. See https://rrc.cvc.uab.es/?ch=14&com=tasks for details. + Args: + pred_texts (list[str]): Text strings of prediction. + gt_texts (list[str]): Text strings of ground truth. + metric (str | list[str]): Metric(s) to be evaluated. Options are: + - 'word_acc': Accuracy at word level. + - 'word_acc_ignore_case': Accuracy at word level, ignoring letter + case. + - 'word_acc_ignore_case_symbol': Accuracy at word level, ignoring + letter case and symbol. 
(Default metric for academic evaluation) + - 'char_recall': Recall at character level, ignoring + letter case and symbol. + - 'char_precision': Precision at character level, ignoring + letter case and symbol. + - 'one_minus_ned': 1 - normalized_edit_distance + In particular, if ``metric == 'acc'``, results on all metrics above + will be reported. + Returns: + dict{str: float}: Result dict for text recognition, keys could be some + of the following: ['word_acc', 'word_acc_ignore_case', + 'word_acc_ignore_case_symbol', 'char_recall', 'char_precision', + '1-N.E.D']. + """ + assert isinstance(pred_texts, list) + assert isinstance(gt_texts, list) + assert len(pred_texts) == len(gt_texts) + + assert isinstance(metric, str) or is_type_list(metric, str) + if metric == "acc" or metric == ["acc"]: + metric = [ + "word_acc", + "word_acc_ignore_case", + "word_acc_ignore_case_symbol", + "char_recall", + "char_precision", + "one_minus_ned", + ] + metric = set([metric]) if isinstance(metric, str) else set(metric) + + # supported_metrics = set([ + # 'word_acc', 'word_acc_ignore_case', 'word_acc_ignore_case_symbol', + # 'char_recall', 'char_precision', 'one_minus_ned', 'one_minust_ned_word' + # ]) + # assert metric.issubset(supported_metrics) + + match_res = count_matches(pred_texts, gt_texts) + eps = 1e-8 + eval_res = {} + + if "char_recall" in metric: + char_recall = ( + 1.0 * match_res["true_positive_char_num"] / (eps + match_res["gt_char_num"]) + ) + eval_res["char_recall"] = char_recall + + if "char_precision" in metric: + char_precision = ( + 1.0 + * match_res["true_positive_char_num"] + / (eps + match_res["pred_char_num"]) + ) + eval_res["char_precision"] = char_precision + + if "word_acc" in metric: + word_acc = 1.0 * match_res["match_word_num"] / (eps + match_res["gt_word_num"]) + eval_res["word_acc"] = word_acc + + if "word_acc_ignore_case" in metric: + word_acc_ignore_case = ( + 1.0 * match_res["match_word_ignore_case"] / (eps + match_res["gt_word_num"]) + ) + eval_res["word_acc_ignore_case"] = word_acc_ignore_case + + if "word_acc_ignore_case_symbol" in metric: + word_acc_ignore_case_symbol = ( + 1.0 + * match_res["match_word_ignore_case_symbol"] + / (eps + match_res["gt_word_num"]) + ) + eval_res["word_acc_ignore_case_symbol"] = word_acc_ignore_case_symbol + + if "one_minus_ned" in metric: + + eval_res["1-N.E.D"] = 1.0 - match_res["ned"] + + if "one_minus_ned_word" in metric: + + eval_res["1-N.E.D_word"] = 1.0 - match_res["ned_word"] + + if "line_acc_ignore_case_symbol" in metric: + line_acc_ignore_case_symbol = ( + 1.0 * match_res["match_kie_ignore_case"] / (eps + match_res["gt_word_num"]) + ) + eval_res["line_acc_ignore_case_symbol"] = line_acc_ignore_case_symbol + + if "line_acc" in metric: + word_acc_ignore_case_symbol = ( + 1.0 * match_res["match_kie"] / (eps + match_res["gt_word_num"]) + ) + eval_res["line_acc"] = word_acc_ignore_case_symbol + + for key, value in eval_res.items(): + eval_res[key] = float("{:.4f}".format(value)) + + return eval_res + + +def eval_kie(preds_e2e: dict[str, dict[str, str]], gt_e2e: dict[str, dict[str, str]], labels, skip_labels=[]): + + results = {label: 1 for label in labels} + pred_texts_dict = {label: [] for label in labels} + gt_texts_dict = {label: [] for label in labels} + fail_cases = {} + for img_id in gt_e2e.keys(): + fail_cases[img_id] = {} + pred_items = preds_e2e.get(img_id, {k: '' for k in gt_e2e[img_id]}) + gt_items = gt_e2e[img_id] + + for class_name, text_gt in gt_items.items(): + if class_name in skip_labels: + continue + # if class_name == 
'seller_name_value': + # print(gt_items) + if class_name not in pred_items: + text_pred = "" + else: + text_pred = pred_items[class_name] + + if str(text_pred) != str(text_gt): + diff = inline_diff(text_pred, text_gt) + fail_cases[img_id][class_name] = { + 'pred': text_pred, + 'gt': text_gt, + "diff": diff['res_text'], + "ned": diff["ned"], + "score": eval_ocr_metric([text_pred], [text_gt], metric=[ + "one_minus_ned"])["1-N.E.D"], + } + + pred_texts_dict[class_name].append(text_pred) + gt_texts_dict[class_name].append(text_gt) + + for class_name in labels: + pred_texts = pred_texts_dict[class_name] + gt_texts = gt_texts_dict[class_name] + result = eval_ocr_metric( + pred_texts, + gt_texts, + metric=[ + "one_minus_ned", + "line_acc_ignore_case_symbol", + "line_acc", + "one_minus_ned_word", + ], + ) + results[class_name] = { + "1-ned": result["1-N.E.D"], + "1-ned-word": result["1-N.E.D_word"], + "line_acc": result["line_acc"], + "line_acc_ignore_case_symbol": result["line_acc_ignore_case_symbol"], + "samples": len(pred_texts), + } + + # avg reusults + sum_1_ned = sum( + [ + results[class_name]["1-ned"] * results[class_name]["samples"] + for class_name in labels + ] + ) + sum_1_ned_word = sum( + [ + results[class_name]["1-ned-word"] * results[class_name]["samples"] + for class_name in labels + ] + ) + + sum_line_acc = sum( + [ + results[class_name]["line_acc"] * results[class_name]["samples"] + for class_name in labels + ] + ) + sum_line_acc_ignore_case_symbol = sum( + [ + results[class_name]["line_acc_ignore_case_symbol"] + * results[class_name]["samples"] + for class_name in labels + ] + ) + + total_samples = sum( + [results[class_name]["samples"] for class_name in labels] + ) + results["avg_all"] = { + "1-ned": round(sum_1_ned / total_samples, 4), + "1-ned-word": round(sum_1_ned_word / total_samples, 4), + "line_acc": round(sum_line_acc / total_samples, 4), + "line_acc_ignore_case_symbol": round( + sum_line_acc_ignore_case_symbol / total_samples, 4 + ), + "samples": total_samples, + } + + table_data = [ + [ + "class_name", + "1-NED", + "1-N.E.D_word", + "line-acc", + "line_acc_ignore_case_symbol", + "#samples", + ] + ] + for class_name in results.keys(): + # if c < p.shape[0]: + table_data.append( + [ + class_name, + results[class_name]["1-ned"], + results[class_name]["1-ned-word"], + results[class_name]["line_acc"], + results[class_name]["line_acc_ignore_case_symbol"], + results[class_name]["samples"], + ] + ) + + table = AsciiTable(table_data) + print(table.table) + return results, fail_cases diff --git a/cope2n-api/fwd_api/utils/ocr_utils/sbt_report.py b/cope2n-api/fwd_api/utils/ocr_utils/sbt_report.py new file mode 100644 index 0000000..04e02be --- /dev/null +++ b/cope2n-api/fwd_api/utils/ocr_utils/sbt_report.py @@ -0,0 +1,432 @@ +import os +import re +import ast +import time +import json +import glob +import shutil +import pandas as pd +from tqdm import tqdm +from pathlib import Path +from datetime import datetime +from .ocr_metrics import eval_ocr_metric + +import sys +# sys.path.append(os.path.dirname(__file__)) +from sdsvkvu.utils.query.sbt_v2 import get_seller, post_process_seller + + +def read_json(file_path: str): + with open(file_path, 'r') as f: + return json.load(f) + +def write_to_json(file_path, content): + with open(file_path, mode='w', encoding='utf8') as f: + json.dump(content, f, ensure_ascii=False) + + +def convert_datetime_format(date_string: str, is_gt=False) -> str: + # pattern_date_string = "2023-02-28" + output_format = "%Y-%m-%d" + input_format = "%d/%m/%Y" + # 
Validate the input date string format + pattern = r"\d{2}\/\d{2}\/\d{4}" + if re.match(pattern, date_string): + # Convert the date string to a datetime object + date_object = datetime.strptime(date_string, input_format) + # Convert the datetime object to the desired output format + formatted_date = date_object.strftime(output_format) + return formatted_date + return date_string + + +def normalise_retailer_name(retailer: str): + input_value = { + "text": retailer, + "id": 0, + "class": "seller", + "bbox": [0, 0, 0, 0], + } + output = get_seller({'seller': [input_value]}) + + norm_seller_name = post_process_seller(output) + return norm_seller_name + + +def post_processing_str(class_name: str, s: str, is_gt: bool) -> str: + s = str(s).replace('✪', ' ').strip() + if s.lower() in ['null', 'nan', "none"]: + return '' + if class_name == "purchase_date" and is_gt == True: + s = convert_datetime_format(s) + if class_name == "retailername": + s = normalise_retailer_name(s) + return s + + +def convert_groundtruth_from_csv( + csv_path: str, + save_dir: str, + classes: list = ["retailername", "sold_to_party", "purchase_date", "imei_number"] +): + # if isinstance(csv_path_list, str): + # csv_path_list = [csv_path_list] + + df = pd.read_csv(csv_path) + + total_output = {} + for _, request in df.iterrows(): + req_id = request['requestId'] + + if req_id not in total_output: + total_output[req_id] = {k: None for k in classes} + total_output[req_id]["imei_number"] = [] + + total_output[req_id]["imei_number"].extend([request["imeiNumber"], request["imeiNumber2"]]) + total_output[req_id]["imei_number"] = list(set(total_output[req_id]["imei_number"])) + + total_output[req_id]["purchase_date"] = request["Purchase Date"] + total_output[req_id]["retailername"] = request["retailer"] + + for req_id, output in total_output.items(): + save_path = os.path.join(save_dir, req_id) + os.makedirs(save_path, exist_ok=True) + write_to_json(os.path.join(save_path, f"{req_id}.json"), output) + + +def convert_predict_from_csv( + csv_path: str, + save_dir: str, + classes: list = ["retailername", "sold_to_party", "purchase_date", "imei_number"] +): + # if isinstance(csv_path_list, str): + # csv_path_list = [csv_path_list] + + df = pd.read_csv(csv_path) + + for _, request in df.iterrows(): + n_pages = request['pages'] + req_id = request['request_id'] + if not isinstance(request['doc_type'], str) or not isinstance(request['predict_result'], str): + print(f"[WARNING] Skipped request id {req_id}") + continue + + doc_type_list = request['doc_type'].split(',') + assert n_pages == len(doc_type_list), \ + "No. pages is different no. 
documents" + + json_path = os.path.join(save_dir, req_id) + os.makedirs(json_path, exist_ok=True) + + # For user_submitted_results + if "feedback_result" in request: + feedback_data = ast.literal_eval(request['feedback_result']) + fname = f"{req_id}.json" + write_to_json(os.path.join(json_path, fname), feedback_data) + + # For predict_results + data = ast.literal_eval(request['predict_result'])['content']['document'][0]['content'] + infer_time = float(request['ai_inference_time']) + float(request['preprocessing_time']) + 0.1 + + n_imei, n_invoice = 0, 0 + for doc_type in doc_type_list: + output = {k: None for k in classes} + if not os.path.exists(json_path): + os.makedirs(json_path, exist_ok=True) + + if doc_type == "imei": + for info in data: + if info['label'] == "imei_number": + output['imei_number'] = info['value'][n_imei] + output['processing_time'] = infer_time + fname = f"temp_{doc_type}_{req_id}_{n_imei}.json" + write_to_json(os.path.join(json_path, fname), output) + n_imei += 1 + break + elif doc_type == "invoice": + for info in data: + if info['label'] == "imei_number": + continue + output[info['label']] = info['value'] + output['processing_time'] = infer_time + fname = f"temp_{doc_type}_{req_id}_{n_invoice}.json" + write_to_json(os.path.join(json_path, fname), output) + n_invoice += 1 + + +def gen_req_to_red_dict(csv_path: str): + df = pd.read_csv(csv_path) + df = df.loc[:, ["requestId", "redemptionNumber"]] + req_to_red = {row["requestId"]: row["redemptionNumber"] for _, row in df.iterrows()} + return req_to_red + + +def gen_req_to_red_dict_2(csv_path: str): + df = pd.read_csv(csv_path) + df = df.loc[:, ["request_id", "redemption_id"]] + req_to_red = {row["request_id"]: row["redemption_id"] for _, row in df.iterrows()} + return req_to_red + + +def init_csv( + gt_dir: str, + pred_dir: str, + req_to_red: dict, +): + list_request_id = os.listdir(gt_dir) + total = [] + for request_id in list_request_id: + gt_path = os.path.join(gt_dir, request_id, request_id+".json") + if not os.path.exists(gt_path): + print(f"[WARNING] Skipped request id {os.path.basename(os.path.dirname(gt_path))}") + continue + gt_data = read_json(gt_path) + json_file_list = glob.glob(os.path.join(pred_dir, request_id, "temp_*.json")) + json_file_list = sorted(json_file_list, key=lambda x: int(x.split(".json")[0].split('_')[-1])) + n_imei, n_invoice = 0, 0 + # if len(json_file_list) > 3: + # continue + + for json_file in json_file_list: + pred_data = read_json(json_file) + if "imei" in json_file: + pred_value = pred_data['imei_number'] + gt_value = gt_data['imei_number'][n_imei] + n_imei += 1 + score = eval_ocr_metric( + [post_processing_str("imei_number", pred_value, is_gt=False)], + [post_processing_str("imei_number", gt_value, is_gt=True)], + metric=["one_minus_ned"] + )['1-N.E.D'] + + total.append({ + "requestId": request_id, + "redemptionNumber": req_to_red[request_id], + "userSubmitResults": gt_value, + "OCRResults": pred_value, + "revisedResults_by_SDSRV": "", + "accuracy": score, + "processingTime (by request)": pred_data['processing_time'], + "class_name": "imei_number", + "file_path": json_file + }) + + elif "invoice" in json_file: + for class_name in ["retailername", "purchase_date"]: + pred_value = pred_data[class_name] + gt_value = gt_data[class_name] + if isinstance(gt_value, list): + gt_value = gt_value[0] + n_invoice += 1 + + if not isinstance(pred_value, list): + pred_value = [pred_value] + + score = 0 + for _pred_value in pred_value: + score1 = eval_ocr_metric( + 
[post_processing_str(class_name, _pred_value, is_gt=False)], + [post_processing_str(class_name, gt_value, is_gt=True)], + metric=["one_minus_ned"] + )['1-N.E.D'] + score = max(score, score1) + + total.append({ + "requestId": request_id, + "redemptionNumber": req_to_red[request_id], + "userSubmitResults": gt_value, + "OCRResults": pred_value[0] if class_name == "retailername" else pred_value, + "revisedResults_by_SDSRV": "", + "accuracy": score, + "processingTime (by request)": pred_data['processing_time'], + "class_name": class_name, + "file_path": json_file + }) + + return total + + +def export_report( + init_csv: str, +): + df = pd.read_csv(init_csv) + for index, request in df.iterrows(): + file_path = request['file_path'] + class_name = request['class_name'] + pred_value = request['OCRResults'] + revised_value = read_json(file_path)[class_name] + if class_name == "purchase_date": + pred_value = ast.literal_eval(pred_value) + if isinstance(revised_value, list): + if len(revised_value) > 0: + revised_value = revised_value[0] + else: + revised_value = None + + if len(pred_value) == 0: + pred_value = [None] + + score = 0 + for _pred_value in pred_value: + score1 = eval_ocr_metric( + [post_processing_str(class_name, _pred_value, is_gt=False)], + [post_processing_str(class_name, revised_value, is_gt=True)], + metric=["one_minus_ned"] + )['1-N.E.D'] + score = max(score, score1) + else: + score = eval_ocr_metric( + [post_processing_str(class_name, pred_value, is_gt=False)], + [post_processing_str(class_name, revised_value, is_gt=True)], + metric=["one_minus_ned"] + )['1-N.E.D'] + + + df.at[index, "revisedResults_by_SDSRV"] = revised_value + df.at[index, "accuracy"] = score + + return df + + +def pick_sample_to_revise( + ocr_accuracy: list, + gt_dir: str, + save_dir: str +): + empty_err_path = os.path.join(save_dir, "empty_results") + other_err_path = os.path.join(save_dir, "diff_results") + os.makedirs(empty_err_path, exist_ok=True) + os.makedirs(other_err_path, exist_ok=True) + for request in ocr_accuracy: + score = request['accuracy'] + json_path = request['file_path'] + request_id = request['requestId'] + + img_path_folder = os.path.join(gt_dir, Path(json_path).parts[-2], Path(json_path).parts[-1]) + img_path = [ff for ff in glob.glob(img_path_folder.replace(".json", ".*")) if ".json" not in ff] + + if len(img_path) == 0: + print(f"[WARNING] Skipped request id {request_id}") + continue + img_path = img_path[0] + # img_path = [ff for ff in glob.glob(json_path.replace(".json", ".*"))][0] + + if score == 0: + save_path = os.path.join(empty_err_path, request_id) + elif score < 1: + save_path = os.path.join(other_err_path, request_id) + else: + continue + os.makedirs(save_path, exist_ok=True) + shutil.copy(img_path, save_path) + shutil.copy(json_path, save_path) + +def merge_revised_sample( + revised_path_list: list, + save_dir: str +): + if not isinstance(revised_path_list, list): + revised_path_list = [revised_path_list] + + for revised_path in revised_path_list: + list_request = [os.path.basename(ff) for ff in os.listdir(revised_path)] + for request in list_request: + file_list = glob.glob(os.path.join(revised_path, request, "*.json*")) + for file_path in file_list: + # shutil.copyfile(file_path, os.path.join(save_path, request)) + os.system(f"sudo cp {file_path} {os.path.join(save_dir, request)}") + +def calculate_average_by_column(df, column_name): + df = df.groupby(by=["requestId"]) + time_list = [] + for req, sub_df in df: + if len(sub_df) > 0: + 
time_list.append(sub_df.iloc[0][column_name]) + if len(time_list) > 0: + return sum(time_list)/len(time_list) + return 0 + + +if __name__ == "__main__": + save_path = "/mnt/hdd4T/TannedCung/OCR/Data/SBT_for_acc/15Jan" + save_csv = "logs/eval_20240115" + csv_path = "/mnt/hdd4T/TannedCung/OCR/Data/SBT_for_acc/15Jan.csv" + csv_path_end_user = "logs/eval_20240115/OCR_15Jan2024.csv" + + # Step 1: Convert a csv file to get user submitted results for each request + print("[INFO] Starting convert csv from customer to json") + os.system(f"sudo chmod -R 777 {save_path}") + convert_groundtruth_from_csv(csv_path=csv_path_end_user, save_dir=save_path) + print("[INFO] Converted") + + # # Step 2: Convert a csv file to get predict OCR results for each image + print("[INFO] Starting convert csv from SDSV to json") + convert_predict_from_csv(csv_path=csv_path, save_dir=save_path) + print("[INFO] Converted") + + # # Step 3: Gen initial csv file and calculate OCR result between submitted results and ocr results + print("[INFO] Starting generate csv to get performance") + gt_path = save_path + pred_path = save_path + req_to_red_dict = gen_req_to_red_dict(csv_path_end_user) + init_data = init_csv(gt_dir=gt_path, pred_dir=pred_path, req_to_red=req_to_red_dict) + pd.DataFrame(init_data).to_csv(os.path.join(save_csv, "init1.csv"), index=False) + print("[INFO] Done") + + # # Step 4: Split requests whose accuracy is less than 1 to revise + # print("[INFO] Starting split data to review") + # revised_path = os.path.join(save_csv, "revised") + # # shutil.rmtree(revised_path) + # pick_sample_to_revise(ocr_accuracy=init_data, gt_dir=save_path, save_dir=revised_path) + # print("[INFO] Done") + + # # Step 5: Merge revised results to gt folder + # print("[INFO] Merging revised data to ground truth folder") + # revised_path = os.path.join(save_csv, "revised") + # revised_path = [f'{revised_path}/empty_results', f'{revised_path}/diff_results'] + # merge_revised_sample(revised_path_list=revised_path, save_dir=save_path) + # print("Done") + + # # Step 6: Caculate OCR result between ocr results and revised results + # print("[INFO] Exporting OCR report") + # init_csv_path = os.path.join(save_csv, "init1.csv") + # report = export_report(init_csv=init_csv_path) + # error_path = os.path.join(save_csv, "errors") + # pick_sample_to_revise(ocr_accuracy=report[report.accuracy < 0.75].to_dict('records'), gt_dir=save_path, save_dir=error_path) + + # n_total_images = len(report) + # n_bad_images = len(report[report.accuracy < 0.75]) + # average_acc = report[report.accuracy >= 0.75]['accuracy'].mean() + + # print("Total requests:", len(report['requestId'].unique())) + # print("Total images:", n_total_images) + # print("No. imei images:", len(report[report.class_name == "imei_number"])) + # print("No. invoice images:", len(report[report.class_name == "retailername"])) + # print("No. bad quality images:", n_bad_images) + # print("No. valid images:", n_total_images - n_bad_images) + # print("No. per of bad quality images:", 100*n_bad_images/n_total_images) + # print("Average accuracy:", 100*average_acc) + + # last_row = n_total_images + # report.at[last_row, "requestId"] = "Total requests:" + # report.at[last_row, "redemptionNumber"] = len(report['requestId'].unique()) + # report.at[last_row+1, "requestId"] = "Total images:" + # report.at[last_row+1, "redemptionNumber"] = n_total_images + # report.at[last_row+2, "requestId"] = "No. 
imei images:" + # report.at[last_row+2, "redemptionNumber"] = len(report[report.class_name == "imei_number"]) + # report.at[last_row+3, "requestId"] = "No. invoice images:" + # report.at[last_row+3, "redemptionNumber"] = len(report[report.class_name == "retailername"]) + # report.at[last_row+4, "requestId"] = "No. bad quality images:" + # report.at[last_row+4, "redemptionNumber"] = n_bad_images + # report.at[last_row+5, "requestId"] = "No. valid images:" + # report.at[last_row+5, "redemptionNumber"] = n_total_images - n_bad_images + # report.at[last_row+6, "requestId"] = "No. per of bad quality images:" + # report.at[last_row+6, "redemptionNumber"] = 100*n_bad_images/n_total_images + # report.at[last_row+7, "requestId"] = "Average accuracy:" + # report.at[last_row+7, "redemptionNumber"] = 100*average_acc + + + # report.drop(columns=["file_path", "class_name"]).to_csv(os.path.join(save_csv, f"SBT_report_{time.strftime('%Y%m%d')}.csv"), index=False) + # print("[INFO] Done") + + \ No newline at end of file diff --git a/cope2n-api/fwd_api/utils/ocr_utils/wiki_diff.py b/cope2n-api/fwd_api/utils/ocr_utils/wiki_diff.py new file mode 100644 index 0000000..dfbbb54 --- /dev/null +++ b/cope2n-api/fwd_api/utils/ocr_utils/wiki_diff.py @@ -0,0 +1,201 @@ +# https://stackoverflow.com/questions/774316/python-difflib-highlighting-differences-inline +import difflib +import unidecode +import os +import glob +import pandas as pd + +VOWELS = 'aeouiy' + 'AEOUIY' +CONSONANTS = 'bcdfghjklmnpqrstvxwz' + 'BCDFGHJKLMNPQRSTVXWZ' +# PREDICT_PATH = 'ocr/result' +# GROUNDTRUTH_PATH = '/mnt/hdd2T/AICR/Datasets/wiki/ground_truth' +PREDICT_PATH = 'ocr/result/cinamon' +GROUNDTRUTH_PATH = '/mnt/hdd2T/AICR/Datasets/Backup/1.Hand_writing/Lines/cinnamon_data' +# note that we also use different preprocess for cinamon data +# SAVE_PATH = 'wiki_diff' +SAVE_PATH = 'wiki_diff/cinamon' +RES_PATH = f'{SAVE_PATH}/result/' +WRONG_ACCENT_FILE = f'{SAVE_PATH}/wrong_accent.txt' +LOST_ACCENT_FILE = f'{SAVE_PATH}/lost_accent.txt' +TOTAL_WORD = 0 + + +def write_accent_error(path, err): + # path should be wrong_accent_file or lost_accent_file + with open(path, 'a') as f: + f.write(err) + f.write('\n') + + +def update_ddata_specialchars(ddata_specialchars, correction_key, char_key): + if char_key in ddata_specialchars[correction_key]: + ddata_specialchars[correction_key][char_key] += 1 + else: + ddata_specialchars[correction_key][char_key] = 1 + + +def process_replace_tag(matcher, i1, i2, j1, j2, ddata, ddata_specialchars): + a_char = matcher.a[i1:i2] + b_char = matcher.b[j1:j2] + ddata['res_text'] += ' ### {' + a_char + ' -> ' + b_char + '} ### ' + ddata['nwrongs'] += 1*len(b_char) + if len(a_char) == 1 and len(b_char) == 1: # single char case + if a_char.lower() == b_char.lower(): # wrong upper/lower case + ddata['UL_single'] += 1 + update_ddata_specialchars(ddata_specialchars, 'UL', (a_char, b_char)) + else: + ddata['nwrongs_single'] += 1 + a_ori = unidecode.unidecode(a_char).lower() + b_ori = unidecode.unidecode(b_char).lower() + if a_ori in VOWELS and b_ori in VOWELS: + if a_ori == b_ori: + err = a_char + ' -> ' + b_char + if b_ori == b_char.lower(): # e.g. 
Ơ -> O
+                    ddata['nlost_accent'] += 1
+                    # write_accent_error(LOST_ACCENT_FILE, err)
+                else:  # e.g Ơ -> Ớ
+                    ddata['nwrong_accent'] += 1
+                    # write_accent_error(WRONG_ACCENT_FILE, err)
+            else:  # e.g Ă ->
+                ddata['nwrong_vowels'] += 1
+        else:
+            if a_ori in CONSONANTS and b_ori in CONSONANTS:
+                ddata['nwrong_consonants'] += 1
+            else:
+                ddata['nwrong_specialchars'] += 1
+                update_ddata_specialchars(ddata_specialchars, 'wrong', (a_char, b_char))
+    else:
+        if a_char.lower() == b_char.lower():
+            ddata['UL_multiple'] += 1
+            update_ddata_specialchars(ddata_specialchars, 'UL', (a_char, b_char))
+        else:
+            ddata['nwrongs_multiple'] += 1
+        if len(a_char) > 10 or len(b_char) > 10:
+            ddata['nlong_sequences'] += 1
+            # print(a_char)
+
+
+def process_delete_tag(matcher, i1, i2, ddata, ddata_specialchars):
+    a_char = matcher.a[i1:i2]
+    ddata['res_text'] += ' ### {- ' + a_char + '} ### '
+    ddata['nadds'] += 1*len(a_char)
+    if len(a_char) == 1:
+        ddata['nadds_single'] += 1
+        if a_char.lower() in CONSONANTS + VOWELS:
+            ddata['nadds_chars'] += 1
+        else:
+            if a_char == ' ':
+                ddata['nadds_space'] += 1
+            else:
+                ddata['nadds_specialchars'] += 1
+                update_ddata_specialchars(ddata_specialchars, 'add', a_char)
+
+    else:
+        ddata['nadds_multiple'] += 1
+        if len(a_char) > 10:
+            ddata['nlong_sequences'] += 1
+            # print(a_char)
+
+
+def process_insert_tag(matcher, j1, j2, ddata, ddata_specialchars):
+    b_char = matcher.b[j1:j2]
+    ddata['nlosts'] += 1*len(b_char)
+    ddata['res_text'] += ' ### {+ ' + b_char + '} ### '
+    if len(b_char) == 1:
+        ddata['nlosts_single'] += 1
+        if b_char.lower() in CONSONANTS + VOWELS:
+            ddata['nlosts_chars'] += 1
+        else:
+            if b_char == ' ':
+                ddata['nlosts_space'] += 1
+            else:
+                ddata['nlosts_specialchars'] += 1
+                update_ddata_specialchars(ddata_specialchars, 'lost', b_char)
+
+    else:
+        ddata['nlosts_multiple'] += 1
+        if len(b_char) > 10:
+            ddata['nlong_sequences'] += 1
+            # print(b_char)
+
+
+def inline_diff(a, b, ddata_specialchars={'lost': {}, 'add': {}, 'wrong': {}, 'UL': {}}):
+    matcher = difflib.SequenceMatcher(None, a, b)
+    ddata = {'res_text': ''}
+    # ddata = ddata | {key: 0 for key in ['nsingle', 'nmultiple']}
+    ddata = ddata | {key: 0 for key in ['UL_single', 'UL_multiple']}
+    ddata = ddata | {
+        key: 0 for key in
+        ['nlosts', 'nlosts_single', 'nlosts_multiple', 'nlosts_chars', 'nlosts_specialchars', 'nlosts_space']}
+    ddata = ddata | {
+        key: 0 for key in
+        ['nadds', 'nadds_single', 'nadds_multiple', 'nadds_chars', 'nadds_specialchars', 'nadds_space']}
+    ddata = ddata | {
+        key: 0 for key in
+        ['nwrongs', 'nwrongs_single', 'nwrongs_multiple', 'nwrong_accent', 'nlost_accent', 'nwrong_vowels',
+         'nwrong_consonants', 'nwrong_specialchars']}
+    ddata['nlong_sequences'] = 0
+    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
+        if tag == 'replace':  # wrong
+            process_replace_tag(matcher, i1, i2, j1, j2, ddata, ddata_specialchars)
+        if tag == 'delete':  # OCR add char so the matcher "delete"
+            process_delete_tag(matcher, i1, i2, ddata, ddata_specialchars)
+        if tag == 'equal':
+            ddata['res_text'] += matcher.a[i1:i2]
+        if tag == 'insert':  # OCR lost char so the matcher "insert"
+            process_insert_tag(matcher, j1, j2, ddata, ddata_specialchars)
+    ddata["ned"] = ddata['nwrongs'] + ddata['nadds'] + ddata['nlosts']
+    return ddata
+
+
+def process_single_file(file_name, ddata_specialchars):
+
+    # read predict file
+    with open(os.path.join(PREDICT_PATH, file_name), 'r') as f:
+        predict = f.readlines()[0].strip()
+        # predict = ''.join(predict)
+        # predict = predict.replace(' ', '')
+        # predict = predict.replace('\n', '')
+    # print(predict)
+
+    # read groundtruth file
+    with open(os.path.join(GROUNDTRUTH_PATH, file_name), 'r') as f:
+        gt = f.readlines()[0].strip()
+        # gt = ''.join(gt)
+        # gt = gt.replace('\n', '')
+
+    # get statistical data of difference between predict and ground truth
+    ddata = inline_diff(predict, gt, ddata_specialchars)
+    global TOTAL_WORD
+    TOTAL_WORD = TOTAL_WORD + len(gt.split())
+    # write to save_path
+    res_text = ddata.pop('res_text', None)
+    save_file = os.path.join(RES_PATH, file_name)
+    with open(save_file, 'w') as f:
+        f.write(res_text)
+
+    # generate csv file
+    ddata = {'file_name': save_file} | ddata
+    return ddata
+
+
+def main(overwrite=False):
+    for accent_file in [WRONG_ACCENT_FILE, LOST_ACCENT_FILE]:
+        if os.path.exists(accent_file):
+            os.remove(accent_file)
+    lddata = []
+    ddata_specialchars = {'lost': {}, 'add': {}, 'wrong': {}, 'UL': {}}
+    for file_ in glob.glob(f'{PREDICT_PATH}/*.txt'):
+        file_name = file_.split('/')[-1]
+        ddata = process_single_file(file_name, ddata_specialchars)
+        lddata.append(ddata)
+    if overwrite:
+        df = pd.DataFrame(lddata)
+        df.to_csv(f'{SAVE_PATH}/wiki_diff.csv', sep='\t')
+        df_ = pd.DataFrame(ddata_specialchars)
+        df_.to_csv(f'{SAVE_PATH}/wiki_diff_specialchars.csv', sep='\t')
+    print(TOTAL_WORD)
+
+
+if __name__ == '__main__':
+    main(overwrite=True)
diff --git a/cope2n-api/requirements.txt b/cope2n-api/requirements.txt
index c204228..698f2db 100755
--- a/cope2n-api/requirements.txt
+++ b/cope2n-api/requirements.txt
@@ -36,7 +36,7 @@ requests==2.28.1
 ruamel.yaml==0.17.21
 ruamel.yaml.clib==0.2.7
 sqlparse==0.4.3
-tzdata==2022.6
+tzdata==2022.7
 uritemplate==4.1.1
 urllib3==1.26.13
 uvicorn==0.20.0
@@ -50,4 +50,13 @@ boto3==1.29.7
 imagesize==1.4.1
 pdf2image==1.16.3
 redis==5.0.1
-django-celery-beat==2.5.0
\ No newline at end of file
+django-celery-beat==2.5.0
+terminaltables==3.1.10
+rapidfuzz==3.6.1
+Unidecode==1.3.8
+pandas==2.2.0
+openpyxl==3.1.2
+# For sdsvkvu compatibility
+# torch==1.13.1+cu116
+# torchvision==0.14.1+cu116
+# --extra-index-url https://download.pytorch.org/whl/cu116
\ No newline at end of file
diff --git a/cope2n-api/scripts/database_cloning.sh b/cope2n-api/scripts/database_cloning.sh
new file mode 100644
index 0000000..eaf54f6
--- /dev/null
+++ b/cope2n-api/scripts/database_cloning.sh
@@ -0,0 +1 @@
+pg_dump -U sbt -h sbt.cxetpslawu4p.ap-southeast-1.rds.amazonaws.com sbt2 >> sbt2.sql
\ No newline at end of file
diff --git a/cope2n-api/token.txt b/cope2n-api/token.txt
new file mode 100644
index 0000000..5f06852
--- /dev/null
+++ b/cope2n-api/token.txt
@@ -0,0 +1 @@
+eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJpZCI6InNidCIsImV4cGlyZWRfYXQiOiIwMS8wMi8yMDI0IDEyOjQ2OjA3IiwiaW50ZXJuYWxfaWQiOjEsInN0YXR1cyI6MSwic3Vic2NyaXB0aW9uX2lkIjoxfQ.VFsoGm5BdeyNptMsdU4f4l70bDIYHTmB8Y-2-PXs7cKhzGB1pUpgqax-V39N_IEXriRl3caDiotzU0psR0WR3g
\ No newline at end of file
diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml
index f50412c..e9bf8c7 100755
--- a/docker-compose-dev.yml
+++ b/docker-compose-dev.yml
@@ -83,12 +83,12 @@ services:
     depends_on:
       db-sbt:
         condition: service_started
-    command: sh -c "chmod -R 777 /app; sleep 5; python manage.py collectstatic --no-input &&
-                    python manage.py makemigrations &&
-                    python manage.py migrate &&
-                    python manage.py compilemessages &&
-                    gunicorn fwd.asgi:application -k uvicorn.workers.UvicornWorker --timeout 300 -b 0.0.0.0:9000" # pre-makemigrations on prod
-    # command: bash -c "tail -f > /dev/null"
+    # command: sh -c "chmod -R 777 /app; sleep 5; python manage.py collectstatic --no-input &&
+    #                 python manage.py makemigrations &&
+    #                 python manage.py migrate &&
+    #                 python manage.py compilemessages &&
+    #                 gunicorn fwd.asgi:application -k uvicorn.workers.UvicornWorker --timeout 300 -b 0.0.0.0:9000" # pre-makemigrations on prod
+    command: bash -c "tail -f > /dev/null"
 
   minio:
     image: minio/minio
@@ -188,6 +188,8 @@ services:
      - POSTGRES_USER=${DB_USER}
      - POSTGRES_PASSWORD=${DB_PASSWORD}
      - POSTGRES_DB=${DB_SCHEMA}
+    ports:
+      - 5432:5432
 
  rabbitmq-sbt:
    mem_reservation: 600m
diff --git a/scripts/crawl_database_by_time.py b/scripts/crawl_database_by_time.py
index 17f6570..4befe9a 100644
--- a/scripts/crawl_database_by_time.py
+++ b/scripts/crawl_database_by_time.py
@@ -10,9 +10,9 @@ from dotenv import load_dotenv
 
 load_dotenv("../.env_prod")
 
-OUTPUT_NAME = "missing_records"
-START_DATE = datetime(2023, 12, 28, tzinfo=timezone('Asia/Ho_Chi_Minh'))
-END_DATE = datetime(2024, 1, 3, tzinfo=timezone('Asia/Ho_Chi_Minh'))
+OUTPUT_NAME = "Jan"
+START_DATE = datetime(2024, 1, 1, tzinfo=timezone('Asia/Ho_Chi_Minh'))
+END_DATE = datetime(2024, 2, 1, tzinfo=timezone('Asia/Ho_Chi_Minh'))
 
 # Database connection details
 db_host = os.environ.get('DB_HOST', "")
@@ -62,32 +62,32 @@ with open(csv_file_path, 'w', newline='') as csv_file:
 cursor.close()
 conn.close()
 
-# Download folders from S3
-s3_client = boto3.client(
-    's3',
-    aws_access_key_id=access_key,
-    aws_secret_access_key=secret_key
-)
+# # Download folders from S3
+# s3_client = boto3.client(
+#     's3',
+#     aws_access_key_id=access_key,
+#     aws_secret_access_key=secret_key
+# )
 
-request_ids = []
-for rq in data:
-    rq_id = rq[3]
-    request_ids.append(rq_id)
+# request_ids = []
+# for rq in data:
+#     rq_id = rq[3]
+#     request_ids.append(rq_id)
 
-for request_id in tqdm(request_ids):
-    folder_key = f"{s3_folder_prefix}/{request_id}/" # Assuming folder structure like: s3_bucket_name/s3_folder_prefix/request_id/
-    local_folder_path = f"{OUTPUT_NAME}/{request_id}/" # Path to the local folder to save the downloaded files
-    os.makedirs(OUTPUT_NAME, exist_ok=True)
-    os.makedirs(local_folder_path, exist_ok=True)
+# for request_id in tqdm(request_ids):
+#     folder_key = f"{s3_folder_prefix}/{request_id}/" # Assuming folder structure like: s3_bucket_name/s3_folder_prefix/request_id/
+#     local_folder_path = f"{OUTPUT_NAME}/{request_id}/" # Path to the local folder to save the downloaded files
+#     os.makedirs(OUTPUT_NAME, exist_ok=True)
+#     os.makedirs(local_folder_path, exist_ok=True)
 
-    # List objects in the S3 folder
-    response = s3_client.list_objects_v2(Bucket=s3_bucket_name, Prefix=folder_key)
-    objects = response.get('Contents', [])
+#     # List objects in the S3 folder
+#     response = s3_client.list_objects_v2(Bucket=s3_bucket_name, Prefix=folder_key)
+#     objects = response.get('Contents', [])
 
-    for s3_object in objects:
-        object_key = s3_object['Key']
-        local_file_path = local_folder_path + object_key.split('/')[-1] # Extracting the file name from the object key
+#     for s3_object in objects:
+#         object_key = s3_object['Key']
+#         local_file_path = local_folder_path + object_key.split('/')[-1] # Extracting the file name from the object key
 
-        # Download the S3 object to the local file
-        s3_client.download_file(s3_bucket_name, object_key, local_file_path)
\ No newline at end of file
+#         # Download the S3 object to the local file
+#         s3_client.download_file(s3_bucket_name, object_key, local_file_path)
\ No newline at end of file
diff --git a/scripts/database_cloning.sh b/scripts/database_cloning.sh
new file mode 100644
index 0000000..eaf54f6
--- /dev/null
+++ b/scripts/database_cloning.sh
@@ -0,0 +1 @@
+pg_dump -U sbt -h sbt.cxetpslawu4p.ap-southeast-1.rds.amazonaws.com sbt2 >> sbt2.sql
\ No newline at end of file