Merge pull request #1 from SDSRV-IDP/feature/accuracy_calculation

Feature/accuracy calculation
Nguyen Viet Anh 2024-02-01 14:47:34 +07:00 committed by GitHub Enterprise
commit 2fd0babd7f
60 changed files with 4157 additions and 77 deletions

.env_sample

@ -0,0 +1,41 @@
MEDIA_ROOT=/app/media
# DATABASE django setup
DB_ENGINE=django.db.backends.postgresql_psycopg2
DB_SCHEMA=sbt_dev
DB_USER=postgres
DB_PASSWORD=extraordinary
DB_HOST=db-sbt
DB_PUBLIC_PORT=5432
DB_INTERNAL_PORT=5432
DEBUG=TRUE
CORS_ALLOWED_ORIGINS=*
CTEL_KEY=secret
DB_INTERNAL_KEY=secret
ALLOWED_HOSTS='*'
BROKER_URL=amqp://test:test@rabbitmq-manulife-sbt:5672
BASE_URL=http://be-ctel-sbt:9000
BASE_UI_URL=http://fe-sbt:9801
HOST_MEDIA_FOLDER=./media
GID=1000
UID=198
SECRET_KEY=secret
RABBITMQ_DEFAULT_USER=test
RABBITMQ_DEFAULT_PASS=test
BASE_PORT=9000
S3_ENDPOINT=minio
S3_ACCESS_KEY=sample-key
S3_SECRET_KEY=sample-key
S3_BUCKET_NAME=sample-key
AUTH_TOKEN_LIFE_TIME=168
IMAGE_TOKEN_LIFE_TIME=168
INTERNAL_SDS_KEY=sample-key
FI_USER_NAME=sbt
FI_PASSWORD=abc
# Front end env variables
# VITE_PORT=80
# VITE_PROXY=http://0.0.0.0
# VITE_API_BASE_URL=http://0.0.0.0:8000
# PORT=8002
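
The Django settings changed later in this diff read these variables through an env helper (env.str, env.int, ...). A minimal sketch of loading a few of them, assuming that helper is django-environ (the import itself is not shown in this diff) and that .env_sample has been copied to .env:

# settings_env_sketch.py -- illustrative only, not part of this commit
import environ

env = environ.Env()
environ.Env.read_env(".env")  # values come from a copy of .env_sample

DB_HOST = env.str("DB_HOST", "db-sbt")                      # database host for Django
DB_PORT = env.int("DB_INTERNAL_PORT", 5432)                 # internal Postgres port
AUTH_TOKEN_LIFE_TIME = env.int("AUTH_TOKEN_LIFE_TIME", 0)   # 168 in the sample, likely hours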

.gitmodules

@ -1,3 +1,6 @@
[submodule "cope2n-ai-fi/modules/sdsvkvu"]
path = cope2n-ai-fi/modules/sdsvkvu
url = https://code.sdsdev.co.kr/SDSRV-IDP/sdsvkvu.git
[submodule "cope2n-api/fwd_api/utils/sdsvkvu"]
path = cope2n-api/fwd_api/utils/sdsvkvu
url = https://code.sdsdev.co.kr/SDSRV-IDP/sdsvkvu


@ -5,4 +5,5 @@ packages/
__pycache__
DataBase/image_temp/
DataBase/json_temp/
DataBase/template.db
DataBase/template.db
key_value_understanding-20231024-125646_manulife2/


@ -33,6 +33,7 @@ def sbt_predict(image_url, engine) -> None:
img = cv2.imdecode(arr, -1)
save_dir = "./tmp_results"
os.makedirs(save_dir, exist_ok=True)
# image_path = os.path.join(save_dir, f"{image_url}.jpg")
os.makedirs(save_dir, exist_ok = True)
tmp_image_path = os.path.join(save_dir, f"{uuid.uuid4()}.jpg")


@ -69,6 +69,7 @@ def process_sbt_invoice(rq_id, list_url, metadata):
c_connector.process_sbt_invoice_result((rq_id, hoadon, metadata))
return {"rq_id": rq_id}
except Exception as e:
print(f"[ERROR]: Failed to extract invoice: {e}")
print(e)
hoadon = {"status": 404, "content": {}}
c_connector.process_sbt_invoice_result((rq_id, hoadon, metadata))

@ -1 +1 @@
Subproject commit 11fb9588df7e6cb03e7a761e3f728f11045bee09
Subproject commit 6907ea0183b141e3b4f3c21758c9123f1e9b2a27


@ -8,10 +8,17 @@ RUN groupadd --gid ${GID} ${USERNAME} \
&& apt-get install -y sudo bash gettext poppler-utils \
&& echo ${USERNAME} ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/${USERNAME} \
&& chmod 0440 /etc/sudoers.d/${USERNAME}
RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y
RUN yes | apt install postgresql gcc musl-dev
RUN pip install --upgrade pip
RUN pip install uvicorn gunicorn Celery
# For integration with sdsvkvu
RUN pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
RUN pip install -U openmim==0.3.7 --no-cache-dir
RUN mim install mmcv-full==1.7.2
# End integration with sdsvkvu
USER ${UID}
ADD --chown=${UID}:${GID} fwd /app
COPY --chown=${UID}:${GID} requirements.txt /app
@ -21,4 +28,27 @@ RUN pip install -r requirements.txt --no-cache-dir
COPY --chown=${UID}:${GID} . /app
RUN cd /app/fwd_api/utils/sdsvkvu/sdsvkvu/externals/sdsvocr/externals/sdsv_dewarp && pip3 install -v -e . --no-cache-dir
RUN cd /app/fwd_api/utils/sdsvkvu/sdsvkvu/externals/sdsvocr/externals/sdsvtd && pip3 install -v -e . --no-cache-dir
RUN cd /app/fwd_api/utils/sdsvkvu/sdsvkvu/externals/sdsvocr/externals/sdsvtr && pip3 install -v -e . --no-cache-dir
RUN cd /app/fwd_api/utils/sdsvkvu && pip3 install -v -e . --no-cache-dir
# For integration with sdsvkvu
RUN python -m pip install paddlepaddle-gpu==2.4.2.post116 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html --no-cache-dir
ENV TZ="Asia/Ho_Chi_Minh"
# FROM cope2n-api-base AS builder
# ARG UID=1000
# ARG GID=1000
# ARG USERNAME=container-user
# # Create a new user
# RUN groupadd --gid ${GID} ${USERNAME} \
# && useradd --uid ${UID} --gid ${GID} -m ${USERNAME} \
# && echo ${USERNAME} ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/${USERNAME} \
# && chmod 0440 /etc/sudoers.d/${USERNAME}
# WORKDIR /app
# COPY --chown=${UID}:${GID} . /app
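
The hunk above pins torch 1.13.1+cu116, mmcv-full 1.7.2 and paddlepaddle-gpu 2.4.2.post116 for the sdsvkvu integration. A small sanity-check script, not part of this commit, that could be run inside the built image to confirm the pinned versions and GPU visibility:

# check_env_sketch.py -- illustrative only, not part of this commit
import torch
import mmcv
import paddle

print("torch:", torch.__version__, "CUDA available:", torch.cuda.is_available())  # expect 1.13.1+cu116
print("mmcv:", mmcv.__version__)      # expect 1.7.2 per the Dockerfile
print("paddle:", paddle.__version__)  # expect 2.4.2 per the Dockerfile
paddle.utils.run_check()              # PaddlePaddle's built-in installation check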


@ -0,0 +1,17 @@
FROM python:3.9.17-buster
RUN apt-get update \
&& apt-get install -y sudo bash gettext poppler-utils postgresql gcc musl-dev
COPY requirements.txt /tmp
COPY ./fwd_api/utils/sdsvkvu /app/fwd_api/utils/sdsvkvu
RUN cd /app/fwd_api/utils/sdsvkvu/sdsvkvu/externals/sdsvocr/externals/sdsv_dewarp && pip3 install -v -e . --no-cache-dir
RUN cd /app/fwd_api/utils/sdsvkvu/sdsvkvu/externals/sdsvocr/externals/sdsvtd && pip3 install -v -e . --no-cache-dir
RUN cd /app/fwd_api/utils/sdsvkvu/sdsvkvu/externals/sdsvocr/externals/sdsvtr && pip3 install -v -e . --no-cache-dir
RUN cd /app/fwd_api/utils/sdsvkvu && pip3 install -v -e . --no-cache-dir
RUN pip install --upgrade pip && pip install uvicorn gunicorn Celery
RUN pip install -r /tmp/requirements.txt --no-cache-dir
ENV TZ="Asia/Ho_Chi_Minh"


@ -36,8 +36,11 @@ BASE_URL = env.str("BASE_URL", "")
BASE_UI_URL = env.str("BASE_UI_URL", "")
AUTH_TOKEN_LIFE_TIME = env.int("AUTH_TOKEN_LIFE_TIME", 0)
IMAGE_TOKEN_LIFE_TIME = env.int("IMAGE_TOKEN_LIFE_TIME", 0)
FI_USER_NAME = env.str("FI_USER_NAME", "secret_username")
FI_PASSWORD = env.str("FI_PASSWORD", 'admin')# SECURITY WARNING: don't run with debug turned on in production!
ADMIN_USER_NAME = env.str("ADMIN_USER_NAME", "")
ADMIN_PASSWORD = env.str("ADMIN_PASSWORD", '')# SECURITY WARNING: don't run with debug turned on in production!
STANDARD_USER_NAME = env.str("STANDARD_USER_NAME", "")
STANDARD_PASSWORD = env.str("STANDARD_PASSWORD", '')# SECURITY WARNING: don't run with debug turned on in production!
# Application definition
S3_ENDPOINT = env.str("S3_ENDPOINT", "")
S3_ACCESS_KEY = env.str("S3_ACCESS_KEY", "")


@ -0,0 +1,643 @@
from rest_framework import status, viewsets
from rest_framework.decorators import action
from rest_framework.response import Response
from django.core.paginator import Paginator
from django.http import JsonResponse, FileResponse, HttpResponse
from django.utils import timezone
from django.db.models import Q
import uuid
import os
from fwd import settings
from drf_spectacular.utils import extend_schema, OpenApiParameter, OpenApiTypes
# from drf_spectacular.types import OpenApiString
import json
from ..exception.exceptions import InvalidException, RequiredFieldException, NotFoundException
from ..models import SubscriptionRequest, Report, ReportFile
from ..utils.accuracy import shadow_report, MonthReportAccumulate, first_of_list, extract_report_detail_list, IterAvg
from ..utils.file import download_from_S3
from ..utils.process import string_to_boolean
from ..celery_worker.client_connector import c_connector
class AccuracyViewSet(viewsets.ViewSet):
lookup_field = "username"
@extend_schema(
parameters=[
OpenApiParameter(
name='start_date',
location=OpenApiParameter.QUERY,
description='Start date (YYYY-mm-DDTHH:MM:SSZ)',
type=OpenApiTypes.DATE,
default='2023-01-02T00:00:00+0700',
),
OpenApiParameter(
name='end_date',
location=OpenApiParameter.QUERY,
description='End date (YYYY-mm-DDTHH:MM:SSZ)',
type=OpenApiTypes.DATE,
default='2024-01-10T00:00:00+0700',
),
OpenApiParameter(
name='include_test',
location=OpenApiParameter.QUERY,
description='Whether to include test record or not',
type=OpenApiTypes.BOOL,
),
OpenApiParameter(
name='is_reviewed',
location=OpenApiParameter.QUERY,
description='Which records to query',
type=OpenApiTypes.STR,
enum=['reviewed', 'not reviewed', 'all'],
),
OpenApiParameter(
name='request_id',
location=OpenApiParameter.QUERY,
description='Specific request id',
type=OpenApiTypes.STR,
),
OpenApiParameter(
name='redemption_id',
location=OpenApiParameter.QUERY,
description='Specific redemption id',
type=OpenApiTypes.STR,
),
OpenApiParameter(
name='page',
location=OpenApiParameter.QUERY,
description='Page number',
type=OpenApiTypes.INT,
required=False
),
OpenApiParameter(
name='page_size',
location=OpenApiParameter.QUERY,
description='Number of items per page',
type=OpenApiTypes.INT,
required=False
),
],
responses=None, tags=['Accuracy']
)
@action(detail=False, url_path="request_list", methods=["GET"])
def get_request_list(self, request):
if request.method == 'GET':
start_date_str = request.GET.get('start_date')
end_date_str = request.GET.get('end_date')
page_number = int(request.GET.get('page', 1))
page_size = int(request.GET.get('page_size', 10))
request_id = request.GET.get('request_id', None)
redemption_id = request.GET.get('redemption_id', None)
is_reviewed = request.GET.get('is_reviewed', None)
include_test = request.GET.get('include_test', False)
try:
start_date = timezone.datetime.strptime(start_date_str, '%Y-%m-%dT%H:%M:%S%z')
end_date = timezone.datetime.strptime(end_date_str, '%Y-%m-%dT%H:%M:%S%z')
except ValueError:
raise InvalidException(excArgs="Date format")
base_query = Q(created_at__range=(start_date, end_date))
if request_id:
base_query &= Q(request_id=request_id)
if redemption_id:
base_query &= Q(redemption_id=redemption_id)
base_query &= Q(is_test_request=False)
if isinstance(include_test, str):
include_test = True if include_test=="true" else False
if include_test:
# base_query = ~base_query
base_query.children = base_query.children[:-1]
elif isinstance(include_test, bool):
if include_test:
base_query = ~base_query
if isinstance(is_reviewed, str):
if is_reviewed == "reviewed":
base_query &= Q(is_reviewed=True)
elif is_reviewed == "not reviewed":
base_query &= Q(is_reviewed=False)
elif is_reviewed == "all":
pass
subscription_requests = SubscriptionRequest.objects.filter(base_query).order_by('created_at')
paginator = Paginator(subscription_requests, page_size)
page = paginator.get_page(page_number)
data = []
for request in page:
imeis = []
purchase_date = []
retailer = ""
try:
if request.reviewed_result is not None:
imeis = request.reviewed_result.get("imei_number", [])
purchase_date = request.reviewed_result.get("purchase_date", [])
retailer = request.reviewed_result.get("retailername", "")
elif request.feedback_result is not None :
imeis = request.feedback_result.get("imei_number", [])
purchase_date = request.feedback_result.get("purchase_date", [])
retailer = request.feedback_result.get("retailername", "")
elif request.predict_result is not None:
if request.predict_result.get("status", 404) == 200:
imeis = request.predict_result.get("content", {}).get("document", [])[0].get("content", [])[3].get("value", [])
purchase_date = request.predict_result.get("content", {}).get("document", [])[0].get("content", [])[2].get("value", [])
retailer = request.predict_result.get("content", {}).get("document", [])[0].get("content", [])[0].get("value", [])
except Exception as e:
print(f"[ERROR]: {e}")
print(f"[ERROR]: {request}")
data.append({
'RequestID': request.request_id,
'RedemptionID': request.redemption_id,
'IMEIs': imeis,
'Purchase Date': purchase_date,
'Retailer': retailer,
'Client Request Time (ms)': request.client_request_time,
'Server Processing Time (ms)': request.preprocessing_time + request.ai_inference_time,
'Is Reviewed': request.is_reviewed,
# 'Is Bad Quality': request.is_bad_image_quality,
'created_at': request.created_at.isoformat()
})
response = {
'subscription_requests': data,
'page': {
'number': page.number,
'total_pages': page.paginator.num_pages,
'count': page.paginator.count,
}
}
return JsonResponse(response)
return JsonResponse({'error': 'Invalid request method.'}, status=405)
@extend_schema(
parameters=[
OpenApiParameter(
name='is_daily_report',
location=OpenApiParameter.QUERY,
description='Whether to create the report as a daily report',
type=OpenApiTypes.BOOL,
),
OpenApiParameter(
name='start_date',
location=OpenApiParameter.QUERY,
description='Start date (YYYY-mm-DDTHH:MM:SSZ)',
type=OpenApiTypes.DATE,
default='2023-01-02T00:00:00+0700',
),
OpenApiParameter(
name='end_date',
location=OpenApiParameter.QUERY,
description='End date (YYYY-mm-DDTHH:MM:SSZ)',
type=OpenApiTypes.DATE,
default='2024-01-10T00:00:00+0700',
),
OpenApiParameter(
name='include_test',
location=OpenApiParameter.QUERY,
description='Whether to include test record or not',
type=OpenApiTypes.BOOL,
),
OpenApiParameter(
name='is_reviewed',
location=OpenApiParameter.QUERY,
description='Which records to query',
type=OpenApiTypes.STR,
enum=['reviewed', 'not reviewed', 'all'],
),
OpenApiParameter(
name='request_id',
location=OpenApiParameter.QUERY,
description='Specific request id',
type=OpenApiTypes.STR,
),
OpenApiParameter(
name='redemption_id',
location=OpenApiParameter.QUERY,
description='Specific redemption id',
type=OpenApiTypes.STR,
),
OpenApiParameter(
name='subsidiary',
location=OpenApiParameter.QUERY,
description='Subsidiary',
type=OpenApiTypes.STR,
),
],
responses=None, tags=['Accuracy']
)
@action(detail=False, url_path="make_report", methods=["GET"])
def make_report(self, request):
if request.method == 'GET':
start_date_str = request.GET.get('start_date')
end_date_str = request.GET.get('end_date')
request_id = request.GET.get('request_id', None)
redemption_id = request.GET.get('redemption_id', None)
is_reviewed = string_to_boolean(request.GET.get('is_reviewed', "false"))
include_test = string_to_boolean(request.GET.get('include_test', "false"))
subsidiary = request.GET.get("subsidiary", "all")
is_daily_report = string_to_boolean(request.GET.get('is_daily_report', "false"))
try:
start_date = timezone.datetime.strptime(start_date_str, '%Y-%m-%dT%H:%M:%S%z')
end_date = timezone.datetime.strptime(end_date_str, '%Y-%m-%dT%H:%M:%S%z')
except ValueError:
raise InvalidException(excArgs="Date format")
query_set = {"start_date_str": start_date_str,
"end_date_str": end_date_str,
"request_id": request_id,
"redemption_id": redemption_id,
"is_reviewed": is_reviewed,
"include_test": include_test,
"subsidiary": subsidiary,
"is_daily_report": is_daily_report,
}
report_id = "report" + "_" + timezone.datetime.now().strftime("%Y%m%d%H%M%S%z") + "_" + uuid.uuid4().hex
new_report: Report = Report(
report_id=report_id,
is_daily_report=is_daily_report,
subsidiary=subsidiary.lower().replace(" ", ""),
include_test=include_test,
include_reviewed=is_reviewed,
start_at=start_date,
end_at=end_date,
status="Processing",
)
if is_daily_report:
new_report.created_at = end_date
new_report.save()
# Background job to calculate accuracy
shadow_report(report_id, query_set)
return JsonResponse(status=status.HTTP_200_OK, data={"report_id": report_id})
@extend_schema(
parameters=[
OpenApiParameter(
name='report_id',
location=OpenApiParameter.QUERY,
description='Specific report id',
type=OpenApiTypes.STR,
),
OpenApiParameter(
name='page',
location=OpenApiParameter.QUERY,
description='Page number',
type=OpenApiTypes.INT,
required=False
),
OpenApiParameter(
name='page_size',
location=OpenApiParameter.QUERY,
description='Number of items per page',
type=OpenApiTypes.INT,
required=False
),
],
responses=None, tags=['Accuracy']
)
@action(detail=False, url_path="report_detail_list", methods=["GET"])
def get_report_detail_list(self, request):
if request.method == 'GET':
report_id = request.GET.get('report_id', None)
page_number = int(request.GET.get('page', 1))
page_size = int(request.GET.get('page_size', 10))
report = Report.objects.filter(report_id=report_id).first()
report_files = ReportFile.objects.filter(report=report)
paginator = Paginator(report_files, page_size)
page = paginator.get_page(page_number)
data = extract_report_detail_list(page, in_percent=False)
response = {
'report_detail': data,
'metadata': {"subsidiary": report.subsidiary,
"start_at": report.start_at,
"end_at": report.end_at},
'page': {
'number': page.number,
'total_pages': page.paginator.num_pages,
'count': page.paginator.count,
}
}
return JsonResponse(response, status=200)
return JsonResponse({'error': 'Invalid request method.'}, status=405)
@extend_schema(
parameters=[
OpenApiParameter(
name='start_date',
location=OpenApiParameter.QUERY,
description='Start date (YYYY-mm-DDTHH:MM:SSZ)',
type=OpenApiTypes.DATE,
default='2023-01-02T00:00:00+0700',
),
OpenApiParameter(
name='end_date',
location=OpenApiParameter.QUERY,
description='End date (YYYY-mm-DDTHH:MM:SSZ)',
type=OpenApiTypes.DATE,
default='2024-01-10T00:00:00+0700',
),
OpenApiParameter(
name='daily_report_only',
location=OpenApiParameter.QUERY,
description='Whether to list daily reports only',
type=OpenApiTypes.BOOL,
),
OpenApiParameter(
name='page',
location=OpenApiParameter.QUERY,
description='Page number',
type=OpenApiTypes.INT,
required=False
),
OpenApiParameter(
name='page_size',
location=OpenApiParameter.QUERY,
description='Number of items per page',
type=OpenApiTypes.INT,
required=False
),
],
responses=None, tags=['Accuracy']
)
@action(detail=False, url_path="report_list", methods=["GET"])
def get_report_list(self, request):
if request.method == 'GET':
daily_report_only = request.GET.get('daily_report_only', False)
start_date_str = request.GET.get('start_date', "")
end_date_str = request.GET.get('end_date', "")
page_number = int(request.GET.get('page', 1))
page_size = int(request.GET.get('page_size', 10))
if not start_date_str or not end_date_str:
reports = Report.objects.all()
else:
try:
start_date = timezone.datetime.strptime(start_date_str, '%Y-%m-%dT%H:%M:%S%z')
end_date = timezone.datetime.strptime(end_date_str, '%Y-%m-%dT%H:%M:%S%z')
except ValueError:
raise InvalidException(excArgs="Date format")
base_query = Q(created_at__range=(start_date, end_date))
if daily_report_only:
base_query &= Q(is_daily_report=True)
reports = Report.objects.filter(base_query).order_by('created_at')
paginator = Paginator(reports, page_size)
page = paginator.get_page(page_number)
data = []
for report in page:
data.append({
"ID": report.id,
"Created Date": report.created_at,
"No. Requests": report.number_request,
"Status": report.status,
"Purchase Date Acc": report.reviewed_accuracy.get("purchase_date", None) if report.reviewed_accuracy else None,
"Retailer Acc": report.feedback_accuracy.get("retailername", None) if report.reviewed_accuracy else None,
"IMEI Acc": report.feedback_accuracy.get("imei_number", None) if report.reviewed_accuracy else None,
"Avg. Accuracy": report.feedback_accuracy.get("avg", None) if report.reviewed_accuracy else None,
"Avg. Client Request Time": report.average_client_time.get("avg", 0) if report.average_client_time else 0,
"Avg. OCR Processing Time": report.average_OCR_time.get("avg", 0) if report.average_OCR_time else 0,
"report_id": report.report_id,
})
response = {
'report_detail': data,
'page': {
'number': page.number,
'total_pages': page.paginator.num_pages,
'count': page.paginator.count,
}
}
return JsonResponse(response, status=200)
return JsonResponse({'error': 'Invalid request method.'}, status=405)
@extend_schema(
parameters=[
OpenApiParameter(
name='start_date',
location=OpenApiParameter.QUERY,
description='Start date (YYYY-mm-DDTHH:MM:SSZ)',
type=OpenApiTypes.DATE,
default='2023-01-02T00:00:00+0700',
),
OpenApiParameter(
name='end_date',
location=OpenApiParameter.QUERY,
description='End date (YYYY-mm-DDTHH:MM:SSZ)',
type=OpenApiTypes.DATE,
default='2024-01-10T00:00:00+0700',
),
OpenApiParameter(
name='subsidiary',
location=OpenApiParameter.QUERY,
description='Subsidiary',
type=OpenApiTypes.STR,
),
OpenApiParameter(
name='page',
location=OpenApiParameter.QUERY,
description='Page number',
type=OpenApiTypes.INT,
required=False
),
OpenApiParameter(
name='page_size',
location=OpenApiParameter.QUERY,
description='Number of items per page',
type=OpenApiTypes.INT,
required=False
),
],
responses=None, tags=['Accuracy']
)
@action(detail=False, url_path="overview", methods=["GET"])
def overview(self, request):
if request.method == 'GET':
subsidiary = request.GET.get('subsidiary', None)
start_date_str = request.GET.get('start_date', "")
end_date_str = request.GET.get('end_date', "")
page_number = int(request.GET.get('page', 1))
page_size = int(request.GET.get('page_size', 10))
base_query = Q()
if start_date_str and end_date_str:
try:
start_date = timezone.datetime.strptime(start_date_str, '%Y-%m-%dT%H:%M:%S%z')
end_date = timezone.datetime.strptime(end_date_str, '%Y-%m-%dT%H:%M:%S%z')
except ValueError:
raise InvalidException(excArgs="Date format")
base_query &= Q(created_at__range=(start_date, end_date))
if subsidiary:
base_query &= Q(subsidiary=subsidiary)
base_query &= Q(is_daily_report=True)
reports = Report.objects.filter(base_query).order_by('created_at')
paginator = Paginator(reports, page_size)
page = paginator.get_page(page_number)
data = []
this_month_report = MonthReportAccumulate()
for report in page:
res = this_month_report.add(report)
if not(res):
_, _data, total = this_month_report()
data += [total]
data += _data
this_month_report = MonthReportAccumulate()
this_month_report.add(report)
else:
continue
_, _data, total = this_month_report()
data += [total]
data += _data
# Generate xlsx file
# workbook = dict2xlsx(data, _type="report")
# tmp_file = f"/tmp/{str(uuid.uuid4())}.xlsx"
# os.makedirs(os.path.dirname(tmp_file), exist_ok=True)
# workbook.save(tmp_file)
# c_connector.remove_local_file((tmp_file, "fake_request_id"))
response = {
# 'file': load_xlsx_file(),
'overview_data': data,
'page': {
'number': page.number,
'total_pages': page.paginator.num_pages,
'count': page.paginator.count,
}
}
return JsonResponse(response, status=200)
return JsonResponse({'error': 'Invalid request method.'}, status=405)
@extend_schema(
parameters=[],
responses=None, tags=['Accuracy']
)
@action(detail=False, url_path=r"get_report_file/(?P<report_id>[\w\-]+)", methods=["GET"])
def get_report_file(self, request, report_id):
if request.method == 'GET':
# report_id = request.GET.get('report_id', None)
if not report_id:
raise RequiredFieldException(excArgs="report_id")
report_num = Report.objects.filter(report_id=report_id).count()
if report_num == 0:
raise NotFoundException(excArgs=f"report: {report_id}")
report = Report.objects.filter(report_id=report_id).first()
# download from s3 to local
tmp_file = "/tmp/" + "report_" + uuid.uuid4().hex + ".xlsx"
os.makedirs("/tmp", exist_ok=True)
if not report.S3_file_name:
raise NotFoundException(excArgs="S3 file name")
download_from_S3(report.S3_file_name, tmp_file)
file = open(tmp_file, 'rb')
response = FileResponse(file, status=200)
# Set the content type and content disposition headers
response['Content-Type'] = 'application/octet-stream'
response['Content-Disposition'] = 'attachment; filename="{0}"'.format(os.path.basename(tmp_file))
return response
return JsonResponse({'error': 'Invalid request method.'}, status=405)
class RequestViewSet(viewsets.ViewSet):
lookup_field = "username"
@extend_schema(
request={
'multipart/form-data': {
'type': 'object',
'properties': {
'reviewed_result': {
'type': 'string',
'default': '''{"request_id": "Sample request_id", "imei_number": ["sample_imei1", "sample_imei2"], "retailername": "Sample Retailer", "purchase_date": "01/01/1970", "sold_to_party": "Sample party"}''',
},
},
},
},
responses=None,
tags=['Request']
)
@action(detail=False, url_path=r"request/(?P<request_id>[\w\-]+)", methods=["GET", "POST"])
def get_subscription_request(self, request, request_id=None):
if request.method == 'GET':
base_query = Q(request_id=request_id)
subscription_request = SubscriptionRequest.objects.filter(base_query).first()
data = []
imeis = []
purchase_date = []
retailer = ""
try:
if subscription_request.reviewed_result is not None:
imeis = subscription_request.reviewed_result.get("imei_number", [])
purchase_date = subscription_request.reviewed_result.get("purchase_date", [])
retailer = subscription_request.reviewed_result.get("retailername", "")
elif subscription_request.feedback_result is not None :
imeis = subscription_request.feedback_result.get("imei_number", [])
purchase_date = subscription_request.feedback_result.get("purchase_date", [])
retailer = subscription_request.feedback_result.get("retailername", "")
elif subscription_request.predict_result is not None:
if subscription_request.predict_result.get("status", 404) == 200:
imeis = subscription_request.predict_result.get("content", {}).get("document", [])[0].get("content", [])[3].get("value", [])
purchase_date = subscription_request.predict_result.get("content", {}).get("document", [])[0].get("content", [])[2].get("value", [])
retailer = subscription_request.predict_result.get("content", {}).get("document", [])[0].get("content", [])[0].get("value", [])
except Exception as e:
print(f"[ERROR]: {e}")
print(f"[ERROR]: {subscription_request}")
data.append({
'RequestID': subscription_request.request_id,
'RedemptionID': subscription_request.redemption_id,
'IMEIs': imeis,
'Purchase Date': purchase_date,
'Retailer': retailer,
'Reviewed result': subscription_request.reviewed_result,
'Feedback result': subscription_request.feedback_result,
'Client Request Time (ms)': subscription_request.client_request_time,
'Server Processing Time (ms)': subscription_request.preprocessing_time + subscription_request.ai_inference_time,
'Is Reviewed': subscription_request.is_reviewed,
# 'Is Bad Quality': subscription_request.is_bad_image_quality,
'created_at': subscription_request.created_at.isoformat(),
'updated_at': subscription_request.updated_at.isoformat()
})
response = {
'subscription_requests': data
}
return JsonResponse(response)
elif request.method == 'POST':
data = request.data
base_query = Q(request_id=request_id)
subscription_request = SubscriptionRequest.objects.filter(base_query).first()
reviewed_result = json.loads(data["reviewed_result"])
for field in ['retailername', 'sold_to_party', 'purchase_date', 'imei_number']:
if not field in reviewed_result.keys():
raise RequiredFieldException(excArgs=f'reviewed_result.{field}')
subscription_request.reviewed_result = reviewed_result
subscription_request.reviewed_result['request_id'] = request_id
subscription_request.is_reviewed = True
subscription_request.save()
return JsonResponse({'message': 'success.'}, status=200)
else:
return JsonResponse({'error': 'Invalid request method.'}, status=405)
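
A sketch of how a client might call the new accuracy endpoints, not part of this commit. The URL prefix is an assumption (the router registers these viewsets under "ctel"; how api.urls is mounted is not shown here), and any required authentication headers are omitted:

# accuracy_client_sketch.py -- illustrative only
import requests

BASE = "http://be-ctel-sbt:9000/api/ctel"  # assumed mount point; host/port taken from .env_sample

# List requests in a date range (parameters mirror the OpenAPI schema above)
resp = requests.get(f"{BASE}/request_list/", params={
    "start_date": "2023-01-02T00:00:00+0700",
    "end_date": "2024-01-10T00:00:00+0700",
    "is_reviewed": "all",
    "page": 1,
    "page_size": 10,
})
print(resp.json()["page"])

# Trigger report generation; accuracy numbers are filled in by the background task
resp = requests.get(f"{BASE}/make_report/", params={
    "start_date": "2023-01-02T00:00:00+0700",
    "end_date": "2024-01-10T00:00:00+0700",
    "subsidiary": "all",
    "is_daily_report": "false",
})
report_id = resp.json()["report_id"]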


@ -48,15 +48,23 @@ class CtelUserViewSet(viewsets.ViewSet):
print(serializer.is_valid(raise_exception=True))
data = serializer.validated_data
if data['username'] != settings.FI_USER_NAME or data['password'] != settings.FI_PASSWORD:
token_limit = 999999
if data['username'] == settings.ADMIN_USER_NAME:
if data['password'] != settings.ADMIN_PASSWORD:
raise NotAuthenticatedException()
elif data['username'] == settings.STANDARD_USER_NAME:
if data['password'] != settings.STANDARD_PASSWORD:
raise NotAuthenticatedException()
token_limit = 1000
else:
raise NotAuthenticatedException()
users = UserProfile.objects.filter(sync_id=settings.FI_USER_NAME)
users = UserProfile.objects.filter(sync_id=data['username'])
if len(users) > 1:
raise InvalidException(excArgs=USER_MESSAGE)
if len(users) == 0:
user = UserProfile(sync_id=settings.FI_USER_NAME, status=EntityStatus.ACTIVE.value)
user = UserProfile(sync_id=data['username'], status=EntityStatus.ACTIVE.value)
user.save()
else:
user = users[0]
@ -69,7 +77,7 @@ class CtelUserViewSet(viewsets.ViewSet):
if len(plans) > 1:
raise TrialOneException(excArgs=PLAN_MESSAGE)
if len(plans) == 0:
plan = PricingPlan(code=p_code, duration=365, token_limitations=999999)
plan = PricingPlan(code=p_code, duration=365, token_limitations=token_limit)
plan.save()
else:
plan: PricingPlan = plans[0]
@ -84,9 +92,9 @@ class CtelUserViewSet(viewsets.ViewSet):
else:
sub = subs[0]
return Response(status=status.HTTP_200_OK, data={
'user_id': 'SBT',
'user_name': settings.FI_USER_NAME,
'token': sds_authenticator.generate_token(user_id=settings.FI_USER_NAME, internal_id=user.id, status=EntityStatus.ACTIVE.value, sub_id=sub.id)
'user_id': user.id,
'user_name': data['username'],
'token': sds_authenticator.generate_token(user_id=data['username'], internal_id=user.id, status=EntityStatus.ACTIVE.value, sub_id=sub.id)
})


@ -20,7 +20,7 @@ from ..annotation.api import throw_on_failure
from ..constant.common import ProcessType, REQUEST_ID, FOLDER_TYPE, EntityStatus, pdf_extensions, allowed_file_extensions, image_extensions, standard_ocr_list
from ..exception.exceptions import RequiredFieldException, InvalidException, NotFoundException, \
PermissionDeniedException, LockedEntityException, FileContentInvalidException, ServiceTimeoutException
from ..models import SubscriptionRequest, SubscriptionRequestFile, OcrTemplate
from ..models import SubscriptionRequest, SubscriptionRequestFile, OcrTemplate, FeedbackRequest
from ..response.ReportSerializer import ReportSerializer
from ..utils import file as FileUtils
from ..utils import process as ProcessUtil
@ -70,7 +70,6 @@ class CtelViewSet(viewsets.ViewSet):
new_request: SubscriptionRequest = SubscriptionRequest(
pages=total_page,
pages_left=total_page,
doc_type="all",
process_type=p_type, status=1, request_id=rq_id,
provider_code=provider_code,
subscription=sub,
@ -91,7 +90,7 @@ class CtelViewSet(viewsets.ViewSet):
if file_extension in pdf_extensions:
c_connector.do_pdf((rq_id, sub.id, p_type, user.id, files))
elif file_extension in image_extensions:
b_url = ProcessUtil.process_image_file(file_name, file_obj, new_request, user)
b_url = ProcessUtil.process_image_file(file_name, file_obj, new_request, user, "all", 0)
j_time = time.time()
print(f"[INFO]: Duration of Pre-processing: {j_time - s_time}s")
print(f"[INFO]: b_url: {b_url}")
@ -122,6 +121,9 @@ class CtelViewSet(viewsets.ViewSet):
'redemption_ID': {
'type': 'string'
},
'is_test_request': {
'type': 'boolean',
},
},
'required': {'imei_files'}
}
@ -144,14 +146,16 @@ class CtelViewSet(viewsets.ViewSet):
"invoice": invoice_file_objs
}
total_page = len(files.keys())
is_test_request = validated_data.get("is_test_request", False)
rq_id = provider_code + "_" + datetime.now().strftime("%Y%m%d%H%M%S") + "_" + uuid.uuid4().hex
p_type = validated_data['type']
new_request: SubscriptionRequest = SubscriptionRequest(pages=total_page,
pages_left=total_page,
process_type=p_type, status=1, request_id=rq_id,
provider_code=provider_code,
subscription=sub)
subscription=sub,
redemption_id=validated_data["redemption_ID"],
is_test_request=is_test_request)
new_request.save()
count = 0
compact_files = []
@ -166,9 +170,10 @@ class CtelViewSet(viewsets.ViewSet):
FileUtils.save_to_S3(_name, new_request, file_path)
count += 1
this_file = {
"file_name": _name,
"file_path": file_path,
"file_type": doc_type
"index_in_request": i,
"file_name": _name,
"file_path": file_path,
"file_type": doc_type
}
compact_files.append(this_file)
c_connector.do_pdf((rq_id, sub.id, p_type, user.id, compact_files))
@ -193,6 +198,9 @@ class CtelViewSet(viewsets.ViewSet):
'redemption_ID': {
'type': 'string'
},
'is_test_request': {
'type': 'boolean',
},
},
'required': {'imei_files'}
}
@ -215,7 +223,7 @@ class CtelViewSet(viewsets.ViewSet):
"invoice": invoice_file_objs
}
rq_id = provider_code + "_" + datetime.now().strftime("%Y%m%d%H%M%S") + "_" + uuid.uuid4().hex
is_test_request = validated_data.get("is_test_request", False)
count = 0
doc_files_with_type = []
for doc_type, doc_files in files.items():
@ -235,7 +243,9 @@ class CtelViewSet(viewsets.ViewSet):
pages_left=total_page,
process_type=p_type, status=1, request_id=rq_id,
provider_code=provider_code,
subscription=sub)
subscription=sub,
redemption_id=validated_data["redemption_ID"],
is_test_request=is_test_request)
new_request.save()
# Run file processing in a pool of 2 threads. TODO: Convert to Celery worker when possible
@ -244,9 +254,11 @@ class CtelViewSet(viewsets.ViewSet):
def process_file(data):
idx, doc_type, doc_file, tmp_file_name = data
doc_file.seek(0)
index_in_request = int(tmp_file_name.split(".")[0].split("_")[-1])
file_path = FileUtils.resize_and_save_file(tmp_file_name, new_request, doc_file, 100)
FileUtils.save_to_S3(tmp_file_name, new_request, file_path)
return {
"index_in_request": index_in_request,
"idx": idx,
"file_name": tmp_file_name,
"file_path": file_path,
@ -265,7 +277,7 @@ class CtelViewSet(viewsets.ViewSet):
waiting_time = current_time - start_time
if waiting_time > time_limit:
break
time.sleep(0.2)
time.sleep(0.1)
report_filter = SubscriptionRequest.objects.filter(request_id=rq_id)
if report_filter.count() != 1:
raise InvalidException(excArgs='requestId')
@ -347,8 +359,59 @@ class CtelViewSet(viewsets.ViewSet):
S3_path = FileUtils.save_to_S3(file_name, subcription_request, file_path)
return JsonResponse(status=status.HTTP_200_OK, data={"request_id": rq_id})
@extend_schema(request={
'multipart/form-data': {
'type': 'object',
'properties': {
'files': {
'type': 'array',
'items': {
'type': 'string',
'format': 'binary'
}
},
},
'required': ['files']
}
}, responses=None, tags=['OCR'])
@action(detail=False, url_path="images/feedback_file", methods=["POST"])
def feedback_file(self, request):
files = request.data.getlist('files')
FileUtils.validate_csv_feedback(files)
user_info = ProcessUtil.get_user(request)
user = user_info.user
sub = user_info.current_sub
feedback_id = "FB_" + datetime.now().strftime("%Y%m%d%H%M%S") + "_" + uuid.uuid4().hex
origin_name = ""
file_names = ""
for i, file in enumerate(files):
origin_name += file.name + ","
file_names += f"{feedback_id}_{i}.csv"
origin_name = origin_name[:-1]
new_request: FeedbackRequest = FeedbackRequest(feedback_id=feedback_id,
origin_name=origin_name,
file_name=file_names,
subscription=sub)
new_request.save()
for i, file in enumerate(files):
file_name = f"{feedback_id}_{i}.csv"
# Save to local
file_path = FileUtils.save_feedback_file(file_name, new_request, file)
# Upload to S3
S3_path = FileUtils.save_feedback_to_S3(file_name, feedback_id, file_path)
# validate
FileUtils.validate_feedback_file(file_path)
# Process csv file in the background
ProcessUtil.process_feedback(feedback_id, file_path)
return JsonResponse(status=status.HTTP_200_OK, data={"feedback_id": feedback_id})
@extend_schema(request=None, responses=None, tags=['Data'])
@extend_schema(request=None, responses=None, tags=['templates'], methods=['GET'])
@action(detail=False, url_path=r"media/(?P<folder_type>\w+)/(?P<uq_id>\w+)", methods=["GET"])
@ -389,6 +452,7 @@ class CtelViewSet(viewsets.ViewSet):
if user.id != user_data['internal_id'] or user.status != EntityStatus.ACTIVE.value:
raise PermissionDeniedException()
# print(f"[DEBUG]: rq: {rq}, file_name: {file_name}")
file_data = SubscriptionRequestFile.objects.filter(request=rq, file_name=file_name)[0]
except IndexError:
raise NotFoundException(excArgs='file')
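
The new images/feedback_file action above accepts one or more CSV files under the multipart field "files" and returns a feedback_id. A sketch of an upload call, not part of this commit (same assumed base URL as the earlier sketch, authentication omitted):

# feedback_upload_sketch.py -- illustrative only
import requests

BASE = "http://be-ctel-sbt:9000/api/ctel"  # assumed mount point
with open("feedback.csv", "rb") as f:
    resp = requests.post(f"{BASE}/images/feedback_file/",
                         files=[("files", ("feedback.csv", f, "text/csv"))])
print(resp.json())  # {"feedback_id": "FB_..."}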


@ -2,6 +2,8 @@ from django.conf import settings
from rest_framework.routers import DefaultRouter, SimpleRouter
from fwd_api.api.ctel_view import CtelViewSet
from fwd_api.api.accuracy_view import AccuracyViewSet, RequestViewSet
from fwd_api.api.ctel_user_view import CtelUserViewSet
from fwd_api.api.ctel_template_view import CtelTemplateViewSet
@ -13,6 +15,8 @@ else:
router.register("ctel", CtelViewSet, basename="CtelAPI")
router.register("ctel", CtelUserViewSet, basename="CtelUserAPI")
router.register("ctel", AccuracyViewSet, basename="AccuracyAPI")
router.register("ctel", RequestViewSet, basename="RequestAPI")
app_name = "api"
urlpatterns = router.urls


@ -30,19 +30,31 @@ class CeleryConnector:
'process_sbt_invoice': {'queue': "invoice_sbt"},
'do_pdf': {'queue': "do_pdf"},
'upload_file_to_s3': {'queue': "upload_file_to_s3"},
'upload_feedback_to_s3': {'queue': "upload_feedback_to_s3"},
'upload_obj_to_s3': {'queue': "upload_obj_to_s3"},
'upload_report_to_s3': {'queue': "upload_report_to_s3"},
'remove_local_file': {'queue': "remove_local_file"},
'csv_feedback': {'queue': "csv_feedback"},
'make_a_report': {'queue': "report"},
}
app = Celery(
'postman',
broker=settings.BROKER_URL,
broker_transport_options={'confirm_publish': False},
)
)
def make_a_report(self, args):
return self.send_task('make_a_report', args)
def csv_feedback(self, args):
return self.send_task('csv_feedback', args)
def do_pdf(self, args):
return self.send_task('do_pdf', args)
def upload_file_to_s3(self, args):
return self.send_task('upload_file_to_s3', args)
def upload_file_to_s3(self, args):
return self.send_task('upload_file_to_s3', args)
def upload_report_to_s3(self, args):
return self.send_task('upload_report_to_s3', args)
def upload_obj_to_s3(self, args):
return self.send_task('upload_obj_to_s3', args)
def remove_local_file(self, args):
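
A sketch of how the new make_a_report route is meant to be used from Django code, not part of this commit. accuracy_view.py calls shadow_report from ..utils.accuracy, which presumably ends up dispatching through this connector; the tuple-argument style mirrors the other c_connector calls in this diff, and the ids shown are hypothetical:

# dispatch_sketch.py -- illustrative only
from fwd_api.celery_worker.client_connector import c_connector

report_id = "report_20240101000000_deadbeef"  # hypothetical, format per accuracy_view.py
query_set = {"start_date_str": "2023-01-02T00:00:00+0700", "end_date_str": "2024-01-10T00:00:00+0700",
             "request_id": None, "redemption_id": None, "is_reviewed": False,
             "include_test": False, "subsidiary": "all", "is_daily_report": False}
# goes to the "report" queue per task_options above and is consumed by the make_a_report task
c_connector.make_a_report((report_id, query_set))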


@ -9,12 +9,13 @@ from fwd_api.models import SubscriptionRequest, UserProfile
from fwd_api.celery_worker.worker import app
from ..constant.common import FolderFileType, image_extensions
from ..exception.exceptions import FileContentInvalidException
from fwd_api.models import SubscriptionRequestFile
from fwd_api.models import SubscriptionRequestFile, FeedbackRequest, Report
from ..utils import file as FileUtils
from ..utils import process as ProcessUtil
from ..utils import s3 as S3Util
from fwd_api.constant.common import ProcessType
import csv
import json
from celery.utils.log import get_task_logger
from fwd import settings
@ -29,13 +30,16 @@ s3_client = S3Util.MinioS3Client(
bucket_name=settings.S3_BUCKET_NAME
)
def process_pdf_file(file_name: str, file_path: str, request, user) -> list:
def process_pdf_file(file_name: str, file_path: str, request, user, doc_type: str, index_in_request: int) -> list:
try:
# Origin file
code = f'FIL{uuid.uuid4().hex}'
new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=file_path,
request=request,
file_name=file_name,
code=f'FIL{uuid.uuid4().hex}')
code=code,
doc_type=doc_type,
index_in_request=index_in_request)
new_request_file.save()
# Sub-file
return ProcessUtil.pdf_to_images_urls(FileUtils.get_file(file_path), request, user)
@ -59,12 +63,68 @@ def process_image_file(file_name: str, file_path, request, user) -> list:
'request_file_id': new_request_file.code
}]
@app.task(name="csv_feedback")
def process_csv_feedback(csv_file_path, feedback_id):
# load file to RAM
status = {}
with open(csv_file_path, 'r') as file:
reader = csv.DictReader(file)
# for rq in rqs
for row in reader:
# get request_subcription
request_id = row.get('requestId')
sub_rqs = SubscriptionRequest.objects.filter(request_id=request_id)
if len(sub_rqs) != 1:
status[request_id] = f"Found {len(sub_rqs)} records of request id {request_id}"
continue
else:
sub_rq = sub_rqs[0]
fb = {}
# update user result (with validate)
redemption_id = row.get('redemptionNumber')
imei1 = row.get('imeiNumber')
imei2 = row.get('imeiNumber2')
purchase_date = row.get('Purchase Date')
retailer = row.get('retailer')
sold_to_party = row.get('Sold to party')
server_time = float(row.get('timetakenmilli'))
fb['request_id'] = request_id
fb['retailername'] = retailer
fb['sold_to_party'] = sold_to_party
fb['purchase_date'] = purchase_date
fb['imei_number'] = [imei1, imei2]
sub_rq.feedback_result = fb
sub_rq.client_request_time = server_time
# update redemption_id if exist
if len(redemption_id) > 0:
sub_rq.redemption_id = redemption_id
sub_rq.save()
# update log into database
feedback_rq = FeedbackRequest.objects.filter(feedback_id=feedback_id).first()
feedback_rq.error_status = status
# save log to local
directory_name = os.path.dirname(csv_file_path)
file_path = csv_file_path.replace(".csv", "_error.json")
with open(file_path, "w") as outfile:
json.dump(status, outfile)
# save to s3
s3_key = os.path.join("feedback", directory_name.split("/")[-1], file_path.split("/")[-1])
if s3_client.s3_client is not None:
try:
# check if saved then delete local
s3_client.upload_file(file_path, s3_key)
os.remove(file_path)
except Exception as e:
logger.error(f"Unable to set S3: {e}")
print(f"Unable to set S3: {e}")
feedback_rq.save()
@app.task(name='do_pdf')
def process_pdf(rq_id, sub_id, p_type, user_id, files):
"""
files: [{
"idx": int
"index_in_request": int,
"idx": int,
"file_name": "",
"file_path": "", # local path to file
"file_type": ""
@ -79,7 +139,7 @@ def process_pdf(rq_id, sub_id, p_type, user_id, files):
idx, file = data
extension = file["file_name"].split(".")[-1].lower()
if extension == "pdf":
_b_urls = process_pdf_file(file["file_name"], file["file_path"], new_request, user)
_b_urls = process_pdf_file(file["file_name"], file["file_path"], new_request, user, file["file_type"], file["index_in_request"])
if _b_urls is None:
new_request.status = 400
new_request.save()
@ -89,7 +149,7 @@ def process_pdf(rq_id, sub_id, p_type, user_id, files):
_b_urls[j]["page_number"] = idx
return idx, _b_urls[0]
elif extension in image_extensions:
this_url = ProcessUtil.process_image_local_file(file["file_name"], file["file_path"], new_request, user)[0]
this_url = ProcessUtil.process_image_local_file(file["file_name"], file["file_path"], new_request, user, file["file_type"], file["index_in_request"])[0]
this_url["page_number"] = idx
if file["file_type"]:
this_url["doc_type"] = file["file_type"]
@ -136,6 +196,37 @@ def upload_file_to_s3(local_file_path, s3_key, request_id):
else:
logger.info(f"S3 is not available, skipping,...")
@app.task(name='upload_feedback_to_s3')
def upload_feedback_to_s3(local_file_path, s3_key, feedback_id):
if s3_client.s3_client is not None:
try:
s3_client.upload_file(local_file_path, s3_key)
feed_request = FeedbackRequest.objects.filter(feedback_id=feedback_id)[0]
feed_request.S3_uploaded = True
feed_request.save()
except Exception as e:
logger.error(f"Unable to set S3: {e}")
print(f"Unable to set S3: {e}")
return
else:
logger.info(f"S3 is not available, skipping,...")
@app.task(name='upload_report_to_s3')
def upload_report_to_s3(local_file_path, s3_key, report_id):
if s3_client.s3_client is not None:
try:
s3_client.upload_file(local_file_path, s3_key)
report = Report.objects.filter(report_id=report_id)[0]
report.S3_uploaded = True
report.S3_file_name = s3_key
report.save()
except Exception as e:
logger.error(f"Unable to set S3: {e}")
print(f"Unable to set S3: {e}")
return
else:
logger.info(f"S3 is not available, skipping,...")
@app.task(name='remove_local_file')
def remove_local_file(local_file_path, request_id):
print(f"[INFO] Removing local file: {local_file_path}, ...")


@ -0,0 +1,154 @@
import traceback
from fwd_api.models import SubscriptionRequest, Report, ReportFile
from fwd_api.celery_worker.worker import app
from ..utils import s3 as S3Util
from ..utils.accuracy import update_temp_accuracy, IterAvg, calculate_and_save_subcription_file, count_transactions, extract_report_detail_list
from ..utils.file import dict2xlsx, save_workbook_file, save_report_to_S3
from django.utils import timezone
from django.db.models import Q
from celery.utils.log import get_task_logger
from fwd import settings
logger = get_task_logger(__name__)
s3_client = S3Util.MinioS3Client(
endpoint=settings.S3_ENDPOINT,
access_key=settings.S3_ACCESS_KEY,
secret_key=settings.S3_SECRET_KEY,
bucket_name=settings.S3_BUCKET_NAME
)
def mean_list(l):
l = [x for x in l if x is not None]
if len(l) == 0:
return 0
return sum(l)/len(l)
@app.task(name='make_a_report')
def make_a_report(report_id, query_set):
try:
start_date = timezone.datetime.strptime(query_set["start_date_str"], '%Y-%m-%dT%H:%M:%S%z')
end_date = timezone.datetime.strptime(query_set["end_date_str"], '%Y-%m-%dT%H:%M:%S%z')
base_query = Q(created_at__range=(start_date, end_date))
if query_set["request_id"]:
base_query &= Q(request_id=query_set["request_id"])
if query_set["redemption_id"]:
base_query &= Q(redemption_id=query_set["redemption_id"])
base_query &= Q(is_test_request=False)
if isinstance(query_set["include_test"], str):
query_set["include_test"] = True if query_set["include_test"].lower() in ["true", "yes", "1"] else False
if query_set["include_test"]:
# base_query = ~base_query
base_query.children = base_query.children[:-1]
elif isinstance(query_set["include_test"], bool):
if query_set["include_test"]:
base_query = ~base_query
if isinstance(query_set["subsidiary"], str):
if query_set["subsidiary"] and query_set["subsidiary"].lower().replace(" ", "")!="all":
base_query &= Q(redemption_id__startswith=query_set["subsidiary"])
if isinstance(query_set["is_reviewed"], str):
if query_set["is_reviewed"] == "reviewed":
base_query &= Q(is_reviewed=True)
elif query_set["is_reviewed"] == "not reviewed":
base_query &= Q(is_reviewed=False)
# elif query_set["is_reviewed"] == "all":
# pass
errors = []
# Create a placeholder to fill
accuracy = {"feedback" :{"imei_number": IterAvg(),
"purchase_date": IterAvg(),
"retailername": IterAvg(),
"sold_to_party": IterAvg(),},
"reviewed" :{"imei_number": IterAvg(),
"purchase_date": IterAvg(),
"retailername": IterAvg(),
"sold_to_party": IterAvg(),}
} # {"imei": {"acc": 0.1, count: 1}, ...}
time_cost = {"invoice": IterAvg(),
"imei": IterAvg()}
number_images = 0
number_bad_images = 0
# TODO: Multithreading
# Calculate accuracy, processing time, ....Then save.
subscription_requests = SubscriptionRequest.objects.filter(base_query).order_by('created_at')
report: Report = \
Report.objects.filter(report_id=report_id).first()
# TODO: number of transaction by doc type
num_request = 0
for request in subscription_requests:
if request.status != 200 or not (request.reviewed_result or request.feedback_result):
# Failed requests or lack of reviewed_result/feedback_result
continue
request_att = calculate_and_save_subcription_file(report, request)
request.feedback_accuracy = {"imei_number" : mean_list(request_att["acc"]["feedback"].get("imei_number", [None])),
"purchase_date" : mean_list(request_att["acc"]["feedback"].get("purchase_date", [None])),
"retailername" : mean_list(request_att["acc"]["feedback"].get("retailername", [None])),
"sold_to_party" : mean_list(request_att["acc"]["feedback"].get("sold_to_party", [None]))}
request.reviewed_accuracy = {"imei_number" : mean_list(request_att["acc"]["reviewed"].get("imei_number", [None])),
"purchase_date" : mean_list(request_att["acc"]["reviewed"].get("purchase_date", [None])),
"retailername" : mean_list(request_att["acc"]["reviewed"].get("retailername", [None])),
"sold_to_party" : mean_list(request_att["acc"]["reviewed"].get("sold_to_party", [None]))}
request.save()
number_images += request_att["total_images"]
number_bad_images += request_att["bad_images"]
update_temp_accuracy(accuracy["feedback"], request_att["acc"]["feedback"], keys=["imei_number", "purchase_date", "retailername", "sold_to_party"])
update_temp_accuracy(accuracy["reviewed"], request_att["acc"]["reviewed"], keys=["imei_number", "purchase_date", "retailername", "sold_to_party"])
time_cost["imei"].add(request_att["time_cost"].get("imei", []))
time_cost["invoice"].add(request_att["time_cost"].get("invoice", []))
errors += request_att["err"]
num_request += 1
transaction_att = count_transactions(start_date, end_date)
# Do saving process
report.number_request = num_request
report.number_images = number_images
report.number_imei = time_cost["imei"].count
report.number_invoice = time_cost["invoice"].count
report.number_bad_images = number_bad_images
# FIXME: refactor this data stream for endurability
report.average_OCR_time = {"invoice": time_cost["invoice"](), "imei": time_cost["imei"](),
"invoice_count": time_cost["invoice"].count, "imei_count": time_cost["imei"].count}
report.average_OCR_time["avg"] = (report.average_OCR_time["invoice"]*report.average_OCR_time["invoice_count"] + report.average_OCR_time["imei"]*report.average_OCR_time["imei_count"])/(report.average_OCR_time["imei_count"] + report.average_OCR_time["invoice_count"])
report.number_imei_transaction = transaction_att.get("imei", 0)
report.number_invoice_transaction = transaction_att.get("invoice", 0)
acumulated_acc = {"feedback": {},
"reviewed": {}}
for acc_type in ["feedback", "reviewed"]:
avg_acc = IterAvg()
for key in ["imei_number", "purchase_date", "retailername", "sold_to_party"]:
acumulated_acc[acc_type][key] = accuracy[acc_type][key]()
acumulated_acc[acc_type][key+"_count"] = accuracy[acc_type][key].count
avg_acc.add_avg(acumulated_acc[acc_type][key], acumulated_acc[acc_type][key+"_count"])
acumulated_acc[acc_type]["avg"] = avg_acc()
report.feedback_accuracy = acumulated_acc["feedback"]
report.reviewed_accuracy = acumulated_acc["reviewed"]
report.errors = "|".join(errors)
report.status = "Ready"
report.save()
# Saving a xlsx file
report_files = ReportFile.objects.filter(report=report)
data = extract_report_detail_list(report_files, lower=True)
data_workbook = dict2xlsx(data, _type='report_detail')
local_workbook = save_workbook_file(report.report_id + ".xlsx", report, data_workbook)
s3_key=save_report_to_S3(report.report_id, local_workbook)
except IndexError as e:
print(e)
traceback.print_exc()
print("NotFound request by report id, %d", report_id)
except Exception as e:
print("[ERROR]: an error occured while processing report: ", report_id)
traceback.print_exc()
return 400
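
make_a_report above leans on IterAvg accumulators: .add() takes a list of raw values, .add_avg() merges a pre-computed average with its sample count, .count exposes how many samples were seen, and calling the instance returns the running mean. The real class lives in fwd_api.utils.accuracy and is not shown in this diff; the following is only a minimal compatible sketch of that pattern:

# iter_avg_sketch.py -- illustrative only; the real IterAvg may differ
class IterAvgSketch:
    def __init__(self):
        self.total = 0.0
        self.count = 0

    def add(self, values):
        # extend the running mean with a list of raw values, skipping Nones (an assumption)
        values = [v for v in values if v is not None]
        self.total += sum(values)
        self.count += len(values)

    def add_avg(self, avg, count):
        # merge a pre-computed average of `count` samples into the running mean
        self.total += avg * count
        self.count += count

    def __call__(self):
        return self.total / self.count if self.count else 0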


@ -12,7 +12,7 @@ django.setup()
app: Celery = Celery(
'postman',
broker=settings.BROKER_URL,
include=['fwd_api.celery_worker.process_result_tasks', 'fwd_api.celery_worker.internal_task'],
include=['fwd_api.celery_worker.process_result_tasks', 'fwd_api.celery_worker.internal_task', 'fwd_api.celery_worker.process_report_tasks'],
broker_transport_options={'confirm_publish': False},
)
@ -36,9 +36,12 @@ app.conf.update({
Queue('invoice_sbt_rs'),
Queue('do_pdf'),
Queue('upload_file_to_s3'),
Queue('upload_feedback_to_s3'),
Queue('upload_obj_to_s3'),
Queue('upload_report_to_s3'),
Queue('remove_local_file'),
Queue('csv_feedback'),
Queue('report'),
],
'task_routes': {
@ -52,9 +55,12 @@ app.conf.update({
'process_sbt_invoice': {'queue': "invoice_sbt"},
'do_pdf': {'queue': "do_pdf"},
'upload_file_to_s3': {'queue': "upload_file_to_s3"},
'upload_obj_to_s3': {'queue': "upload_obj_to_s3"},
'upload_file_to_s3': {'queue': "upload_file_to_s3"},
'upload_feedback_to_s3': {'queue': "upload_feedback_to_s3"},
'upload_obj_to_s3': {'queue': "upload_obj_to_s3"},
'upload_report_to_s3': {'queue': "upload_report_to_s3"},
'remove_local_file': {'queue': "remove_local_file"},
'csv_feedback': {'queue': "csv_feedback"},
'make_a_report': {'queue': "report"},
}
})


@ -67,6 +67,11 @@ class RequiredFieldException(GeneralException):
default_detail = 'Field required'
detail_with_arg = '{} param is required'
class RequiredColumnException(GeneralException):
status_code = status.HTTP_400_BAD_REQUEST
default_code = 4003
default_detail = 'Columns required'
detail_with_arg = '{} columns are required'
class DuplicateEntityException(GeneralException):
status_code = status.HTTP_400_BAD_REQUEST


@ -0,0 +1,71 @@
# myapp/management/commands/mycustomcommand.py
from django.core.management.base import BaseCommand
from tqdm import tqdm
from fwd_api.models import SubscriptionRequestFile, SubscriptionRequest
from fwd_api.utils.accuracy import predict_result_to_ready
import traceback
import copy
class Command(BaseCommand):
help = 'Refactor database for image level'
def add_arguments(self, parser):
# Add your command-line arguments here
parser.add_argument('test', type=str, help='Value for the argument')
def process_request(self, request):
if len(request.request_id.split(".")[0].split("_")) < 2:
return
images = SubscriptionRequestFile.objects.filter(request=request)
time_cost = {"imei": [], "invoice": [], "all": []}
if request.ai_inference_profile is None:
time_cost["imei"] = [-1 for _ in range(len(images))]
time_cost["invoice"] = [-1]
time_cost["all"] = [-1]
else:
for k, v in request.ai_inference_profile.items():
time_cost[k.split("_")[0]].append(v["inference"][1][0] - v["inference"][0] + (v["postprocess"][1]-v["postprocess"][0]))
for i, image in enumerate(images):
# temp_imei_SAP_20240127223644_a493434edbf84fc08aeb87ef6cdde102_0.jpg
try:
image.index_in_request = int(image.file_name.split(".")[0].split("_")[-1]) if len(image.file_name.split(".")[0].split("_")) > 4 else 0
image.doc_type = image.file_name.split(".")[0].split("_")[1] if len(image.file_name.split(".")[0].split("_")) > 4 else "all"
image.processing_time = time_cost[image.doc_type][image.index_in_request]
if not request.predict_result:
raise KeyError(f"Key predict_result not found in {request.request_id}")
if request.predict_result.get("status", 200) != 200:
raise AttributeError(f"Failed request: {request.request_id}")
_predict_result = copy.deepcopy(predict_result_to_ready(request.predict_result))
_feedback_result = copy.deepcopy(request.feedback_result)
_reviewed_result = copy.deepcopy(request.reviewed_result)
if image.doc_type == "invoice":
_predict_result["imei_number"] = []
if _feedback_result:
_feedback_result["imei_number"] = []
else:
None
if _reviewed_result:
_reviewed_result["imei_number"] = []
else:
None
else:
_predict_result = {"retailername": None, "sold_to_party": None, "purchase_date": [], "imei_number": [_predict_result["imei_number"][image.index_in_request]]}
_feedback_result = {"retailername": None, "sold_to_party": None, "purchase_date": None, "imei_number": [_feedback_result["imei_number"][image.index_in_request]]} if _feedback_result else None
_reviewed_result = {"retailername": None, "sold_to_party": None, "purchase_date": None, "imei_number": [_reviewed_result["imei_number"][image.index_in_request]]} if _reviewed_result else None
image.predict_result = _predict_result
image.feedback_result = _feedback_result
image.reviewed_result = _reviewed_result
image.save()
except Exception as e:
self.stdout.write(self.style.ERROR(f"Request: {request.request_id} failed with {e}"))
print(traceback.format_exc())
continue
def handle(self, *args, **options):
test = options['test']
subcription_iter = SubscriptionRequest.objects.all()
for request in tqdm(subcription_iter.iterator()):
self.process_request(request)
self.stdout.write(self.style.SUCCESS('Sample Django management command executed successfully!'))
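
The command above recovers doc_type and index_in_request from file names such as temp_imei_SAP_20240127223644_a493434edbf84fc08aeb87ef6cdde102_0.jpg (the example in its own comment). A standalone sketch of that parsing, not part of this commit:

# filename_parse_sketch.py -- mirrors the parsing in process_request above
file_name = "temp_imei_SAP_20240127223644_a493434edbf84fc08aeb87ef6cdde102_0.jpg"
stem_parts = file_name.split(".")[0].split("_")
index_in_request = int(stem_parts[-1]) if len(stem_parts) > 4 else 0  # -> 0
doc_type = stem_parts[1] if len(stem_parts) > 4 else "all"            # -> "imei"
print(doc_type, index_in_request)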


@ -0,0 +1,28 @@
# Generated by Django 4.1.3 on 2024-01-04 08:24
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('fwd_api', '0163_subscriptionrequest_ai_inference_profile'),
]
operations = [
migrations.AddField(
model_name='subscriptionrequest',
name='client_request_time',
field=models.FloatField(default=-1),
),
migrations.AddField(
model_name='subscriptionrequest',
name='redemption_id',
field=models.CharField(max_length=200, null=True),
),
migrations.AddField(
model_name='subscriptionrequest',
name='reviewed_result',
field=models.JSONField(null=True),
),
]


@ -0,0 +1,29 @@
# Generated by Django 4.1.3 on 2024-01-09 10:08
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
class Migration(migrations.Migration):
dependencies = [
('fwd_api', '0164_subscriptionrequest_client_request_time_and_more'),
]
operations = [
migrations.CreateModel(
name='FeedbackRequest',
fields=[
('id', models.AutoField(primary_key=True, serialize=False)),
('feedback_id', models.CharField(max_length=200)),
('file_name', models.CharField(max_length=200)),
('origin_name', models.CharField(max_length=200)),
('error_status', models.JSONField(null=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('updated_at', models.DateTimeField(auto_now=True)),
('S3_uploaded', models.BooleanField(default=False)),
('subscription', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='fwd_api.subscription')),
],
),
]


@ -0,0 +1,48 @@
# Generated by Django 4.1.3 on 2024-01-17 03:47
from django.db import migrations, models
import django.utils.timezone
class Migration(migrations.Migration):
dependencies = [
('fwd_api', '0165_feedbackrequest'),
]
operations = [
migrations.RemoveField(
model_name='subscriptionrequest',
name='is_bad_image_quality',
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='accuracy',
field=models.JSONField(null=True),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='doc_type',
field=models.CharField(default='', max_length=100),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='index_in_request',
field=models.IntegerField(default=0),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='is_bad_image_quality',
field=models.BooleanField(default=False),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='origin_name',
field=models.CharField(default='', max_length=300),
),
migrations.AlterField(
model_name='subscriptionrequestfile',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
]


@ -0,0 +1,102 @@
# Generated by Django 4.1.3 on 2024-01-25 06:22
from django.db import migrations, models
import django.utils.timezone
class Migration(migrations.Migration):
dependencies = [
('fwd_api', '0166_remove_subscriptionrequest_is_bad_image_quality_and_more'),
]
operations = [
migrations.CreateModel(
name='Report',
fields=[
('id', models.AutoField(primary_key=True, serialize=False)),
('report_id', models.CharField(max_length=200)),
('local_file_name', models.CharField(max_length=200)),
('error_status', models.JSONField(null=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('updated_at', models.DateTimeField(auto_now=True)),
('start_at', models.DateTimeField(null=True)),
('end_at', models.DateTimeField(null=True)),
('include_for_test_sample', models.BooleanField(default=False)),
('status', models.CharField(max_length=100)),
('is_daily_report', models.BooleanField(default=False)),
('errors', models.TextField(default='')),
('S3_uploaded', models.BooleanField(default=False)),
('number_request', models.IntegerField(default=0)),
('number_images', models.IntegerField(default=0)),
('number_bad_images', models.IntegerField(default=0)),
('average_client_time_profile', models.JSONField(null=True)),
('average_OCR_time_profile', models.JSONField(null=True)),
('average_OCR_time', models.JSONField(null=True)),
('average_client_time', models.JSONField(null=True)),
('imei_accuracy', models.FloatField(default=-1)),
('purchase_date_accuracy', models.FloatField(default=-1)),
('retailer_name_accuracy', models.FloatField(default=-1)),
('sold_to_party_accuracy', models.FloatField(default=-1)),
],
),
migrations.RemoveField(
model_name='subscriptionrequestfile',
name='accuracy',
),
migrations.AddField(
model_name='subscriptionrequest',
name='imei_accuracy',
field=models.FloatField(default=-1),
),
migrations.AddField(
model_name='subscriptionrequest',
name='purchase_date_accuracy',
field=models.FloatField(default=-1),
),
migrations.AddField(
model_name='subscriptionrequest',
name='retailer_name_accuracy',
field=models.FloatField(default=-1),
),
migrations.AddField(
model_name='subscriptionrequest',
name='sold_to_party_accuracy',
field=models.FloatField(default=-1),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='counter_measures',
field=models.TextField(blank=True),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='imei_accuracy',
field=models.FloatField(default=-1),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='processing_time',
field=models.IntegerField(default=-1),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='purchase_date_accuracy',
field=models.FloatField(default=-1),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='reason',
field=models.TextField(blank=True),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='retailer_name_accuracy',
field=models.FloatField(default=-1),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='sold_to_party_accuracy',
field=models.FloatField(default=-1),
),
]

View File

@ -0,0 +1,23 @@
# Generated by Django 4.1.3 on 2024-01-25 09:44
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('fwd_api', '0167_report_remove_subscriptionrequestfile_accuracy_and_more'),
]
operations = [
migrations.AddField(
model_name='report',
name='number_imei_transaction',
field=models.IntegerField(default=0),
),
migrations.AddField(
model_name='report',
name='number_ivoice_transaction',
field=models.IntegerField(default=0),
),
]

View File

@ -0,0 +1,28 @@
# Generated by Django 4.1.3 on 2024-01-25 11:17
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('fwd_api', '0168_report_number_imei_transaction_and_more'),
]
operations = [
migrations.AddField(
model_name='report',
name='include_reviewed',
field=models.TextField(default=''),
),
migrations.AddField(
model_name='report',
name='include_test',
field=models.CharField(default='', max_length=200),
),
migrations.AddField(
model_name='report',
name='subsidiary',
field=models.TextField(default=''),
),
]

View File

@ -0,0 +1,28 @@
# Generated by Django 4.1.3 on 2024-01-25 11:19
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('fwd_api', '0169_report_include_reviewed_report_include_test_and_more'),
]
operations = [
migrations.AlterField(
model_name='report',
name='errors',
field=models.TextField(default='', null=True),
),
migrations.AlterField(
model_name='report',
name='include_reviewed',
field=models.TextField(default='', null=True),
),
migrations.AlterField(
model_name='report',
name='subsidiary',
field=models.TextField(default='', null=True),
),
]

View File

@ -0,0 +1,112 @@
# Generated by Django 4.1.3 on 2024-01-28 08:11
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
class Migration(migrations.Migration):
dependencies = [
('fwd_api', '0170_alter_report_errors_alter_report_include_reviewed_and_more'),
]
operations = [
migrations.RenameField(
model_name='report',
old_name='imei_accuracy',
new_name='imei_accuracy_ocr',
),
migrations.RenameField(
model_name='report',
old_name='purchase_date_accuracy',
new_name='imei_accuracy_revised',
),
migrations.RenameField(
model_name='report',
old_name='retailer_name_accuracy',
new_name='purchase_date_accuracy_ocr',
),
migrations.RenameField(
model_name='report',
old_name='sold_to_party_accuracy',
new_name='purchase_date_accuracy_revised',
),
migrations.AddField(
model_name='report',
name='retailer_name_accuracy_ocr',
field=models.FloatField(default=-1),
),
migrations.AddField(
model_name='report',
name='retailer_name_accuracy_revised',
field=models.FloatField(default=-1),
),
migrations.AddField(
model_name='report',
name='sold_to_party_accuracy_ocr',
field=models.FloatField(default=-1),
),
migrations.AddField(
model_name='report',
name='sold_to_party_accuracy_revised',
field=models.FloatField(default=-1),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='feedback_result',
field=models.JSONField(null=True),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='predict_result',
field=models.JSONField(null=True),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='reviewed_result',
field=models.JSONField(null=True),
),
migrations.AlterField(
model_name='subscriptionrequestfile',
name='doc_type',
field=models.CharField(default='', max_length=10),
),
migrations.CreateModel(
name='ReportFile',
fields=[
('id', models.AutoField(primary_key=True, serialize=False)),
('correspond_request_id', models.CharField(max_length=200)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('updated_at', models.DateTimeField(auto_now=True)),
('S3_uploaded', models.BooleanField(default=False)),
('doc_type', models.CharField(max_length=200)),
('imei_feedback', models.CharField(default=None, max_length=200, null=True)),
('purchase_date_feedback', models.CharField(default=None, max_length=200, null=True)),
('retailer_feedback', models.CharField(default=None, max_length=200, null=True)),
('sold_to_party_feedback', models.CharField(default=None, max_length=200, null=True)),
('imei_ocr', models.CharField(default=None, max_length=200, null=True)),
('purchase_date_ocr', models.CharField(default=None, max_length=200, null=True)),
('retailer_ocr', models.CharField(default=None, max_length=200, null=True)),
('sold_to_party_ocr', models.CharField(default=None, max_length=200, null=True)),
('imei_revised', models.CharField(default=None, max_length=200, null=True)),
('purchase_date_revised', models.CharField(default=None, max_length=200, null=True)),
('retailer_revised', models.CharField(default=None, max_length=200, null=True)),
('sold_to_party_revised', models.CharField(default=None, max_length=200, null=True)),
('imei_acc_feedback', models.FloatField(default=None, null=True)),
('purchase_date_acc_feedback', models.FloatField(default=None, null=True)),
('retailer_acc_feedback', models.FloatField(default=None, null=True)),
('sold_to_party_acc_feedback', models.CharField(default=None, max_length=200, null=True)),
('acc_feedback', models.FloatField(default=None, null=True)),
('imei_acc_revised', models.FloatField(default=None, null=True)),
('purchase_date_acc_revised', models.FloatField(default=None, null=True)),
('retailer_acc_revised', models.FloatField(default=None, null=True)),
('acc_revised', models.FloatField(default=None, null=True)),
('time_cost', models.FloatField(default=0)),
('is_reviewed', models.CharField(default='NA', max_length=5)),
('bad_image_reason', models.TextField(default='')),
('countermeasures', models.TextField(default='')),
('report', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='files', to='fwd_api.report')),
],
),
]

View File

@ -0,0 +1,38 @@
# Generated by Django 4.1.3 on 2024-01-28 09:27
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('fwd_api', '0171_rename_imei_accuracy_report_imei_accuracy_ocr_and_more'),
]
operations = [
migrations.AlterField(
model_name='subscriptionrequestfile',
name='imei_accuracy',
field=models.FloatField(default=None, null=True),
),
migrations.AlterField(
model_name='subscriptionrequestfile',
name='processing_time',
field=models.FloatField(default=-1),
),
migrations.AlterField(
model_name='subscriptionrequestfile',
name='purchase_date_accuracy',
field=models.FloatField(default=None, null=True),
),
migrations.AlterField(
model_name='subscriptionrequestfile',
name='retailer_name_accuracy',
field=models.FloatField(default=None, null=True),
),
migrations.AlterField(
model_name='subscriptionrequestfile',
name='sold_to_party_accuracy',
field=models.FloatField(default=None, null=True),
),
]

View File

@ -0,0 +1,226 @@
# Generated by Django 4.1.3 on 2024-01-28 18:00
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('fwd_api', '0172_alter_subscriptionrequestfile_imei_accuracy_and_more'),
]
operations = [
migrations.RenameField(
model_name='reportfile',
old_name='countermeasures',
new_name='counter_measures',
),
migrations.RemoveField(
model_name='report',
name='imei_accuracy_ocr',
),
migrations.RemoveField(
model_name='report',
name='imei_accuracy_revised',
),
migrations.RemoveField(
model_name='report',
name='purchase_date_accuracy_ocr',
),
migrations.RemoveField(
model_name='report',
name='purchase_date_accuracy_revised',
),
migrations.RemoveField(
model_name='report',
name='retailer_name_accuracy_ocr',
),
migrations.RemoveField(
model_name='report',
name='retailer_name_accuracy_revised',
),
migrations.RemoveField(
model_name='report',
name='sold_to_party_accuracy_ocr',
),
migrations.RemoveField(
model_name='report',
name='sold_to_party_accuracy_revised',
),
migrations.RemoveField(
model_name='reportfile',
name='acc_feedback',
),
migrations.RemoveField(
model_name='reportfile',
name='acc_revised',
),
migrations.RemoveField(
model_name='reportfile',
name='imei_acc_feedback',
),
migrations.RemoveField(
model_name='reportfile',
name='imei_acc_revised',
),
migrations.RemoveField(
model_name='reportfile',
name='imei_feedback',
),
migrations.RemoveField(
model_name='reportfile',
name='imei_ocr',
),
migrations.RemoveField(
model_name='reportfile',
name='imei_revised',
),
migrations.RemoveField(
model_name='reportfile',
name='purchase_date_acc_feedback',
),
migrations.RemoveField(
model_name='reportfile',
name='purchase_date_acc_revised',
),
migrations.RemoveField(
model_name='reportfile',
name='purchase_date_feedback',
),
migrations.RemoveField(
model_name='reportfile',
name='purchase_date_ocr',
),
migrations.RemoveField(
model_name='reportfile',
name='purchase_date_revised',
),
migrations.RemoveField(
model_name='reportfile',
name='retailer_acc_feedback',
),
migrations.RemoveField(
model_name='reportfile',
name='retailer_acc_revised',
),
migrations.RemoveField(
model_name='reportfile',
name='retailer_feedback',
),
migrations.RemoveField(
model_name='reportfile',
name='retailer_ocr',
),
migrations.RemoveField(
model_name='reportfile',
name='retailer_revised',
),
migrations.RemoveField(
model_name='reportfile',
name='sold_to_party_acc_feedback',
),
migrations.RemoveField(
model_name='reportfile',
name='sold_to_party_feedback',
),
migrations.RemoveField(
model_name='reportfile',
name='sold_to_party_ocr',
),
migrations.RemoveField(
model_name='reportfile',
name='sold_to_party_revised',
),
migrations.RemoveField(
model_name='subscriptionrequest',
name='imei_accuracy',
),
migrations.RemoveField(
model_name='subscriptionrequest',
name='purchase_date_accuracy',
),
migrations.RemoveField(
model_name='subscriptionrequest',
name='retailer_name_accuracy',
),
migrations.RemoveField(
model_name='subscriptionrequest',
name='sold_to_party_accuracy',
),
migrations.RemoveField(
model_name='subscriptionrequestfile',
name='imei_accuracy',
),
migrations.RemoveField(
model_name='subscriptionrequestfile',
name='purchase_date_accuracy',
),
migrations.RemoveField(
model_name='subscriptionrequestfile',
name='retailer_name_accuracy',
),
migrations.RemoveField(
model_name='subscriptionrequestfile',
name='sold_to_party_accuracy',
),
migrations.AddField(
model_name='report',
name='feedback_accuracy',
field=models.JSONField(null=True),
),
migrations.AddField(
model_name='report',
name='reviewed_accuracy',
field=models.JSONField(null=True),
),
migrations.AddField(
model_name='reportfile',
name='error',
field=models.TextField(default=''),
),
migrations.AddField(
model_name='reportfile',
name='feedback_accuracy',
field=models.JSONField(null=True),
),
migrations.AddField(
model_name='reportfile',
name='feedback_result',
field=models.JSONField(null=True),
),
migrations.AddField(
model_name='reportfile',
name='predict_result',
field=models.JSONField(null=True),
),
migrations.AddField(
model_name='reportfile',
name='reviewed_accuracy',
field=models.JSONField(null=True),
),
migrations.AddField(
model_name='reportfile',
name='reviewed_result',
field=models.JSONField(null=True),
),
migrations.AddField(
model_name='subscriptionrequest',
name='feedback_accuracy',
field=models.JSONField(null=True),
),
migrations.AddField(
model_name='subscriptionrequest',
name='reviewed_accuracy',
field=models.JSONField(null=True),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='feedback_accuracy',
field=models.JSONField(null=True),
),
migrations.AddField(
model_name='subscriptionrequestfile',
name='reviewed_accuracy',
field=models.JSONField(null=True),
),
]

View File

@ -0,0 +1,28 @@
# Generated by Django 4.1.3 on 2024-01-29 05:59
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('fwd_api', '0173_rename_countermeasures_reportfile_counter_measures_and_more'),
]
operations = [
migrations.AddField(
model_name='reportfile',
name='acc',
field=models.FloatField(default=0),
),
migrations.AddField(
model_name='reportfile',
name='correspond_redemption_id',
field=models.CharField(default='', max_length=200),
),
migrations.AlterField(
model_name='reportfile',
name='correspond_request_id',
field=models.CharField(default='', max_length=200),
),
]

View File

@ -0,0 +1,28 @@
# Generated by Django 4.1.3 on 2024-01-30 12:29
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('fwd_api', '0174_reportfile_acc_reportfile_correspond_redemption_id_and_more'),
]
operations = [
migrations.RenameField(
model_name='report',
old_name='number_ivoice_transaction',
new_name='number_imei',
),
migrations.AddField(
model_name='report',
name='number_invoice',
field=models.IntegerField(default=0),
),
migrations.AddField(
model_name='report',
name='number_invoice_transaction',
field=models.IntegerField(default=0),
),
]

View File

@ -0,0 +1,18 @@
# Generated by Django 4.1.3 on 2024-01-31 09:31
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('fwd_api', '0175_rename_number_ivoice_transaction_report_number_imei_and_more'),
]
operations = [
migrations.AddField(
model_name='report',
name='S3_file_name',
field=models.TextField(default=None, null=True),
),
]

View File

@ -0,0 +1,18 @@
# Generated by Django 4.1.3 on 2024-02-01 03:27
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('fwd_api', '0176_report_s3_file_name'),
]
operations = [
migrations.AlterField(
model_name='report',
name='subsidiary',
field=models.CharField(default='', max_length=200, null=True),
),
]
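The new migrations above (0164 through 0177) are applied with Django's standard migration command; a typical invocation for this app label, assuming the usual manage.py entry point, is:

python manage.py migrate fwd_api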

View File

@ -0,0 +1,14 @@
from django.db import models
from django.utils import timezone
from fwd_api.models.Subscription import Subscription
class FeedbackRequest(models.Model):
id = models.AutoField(primary_key=True)
feedback_id = models.CharField(max_length=200) # Change to request_id
file_name = models.CharField(max_length=200) # Change to request_id
origin_name = models.CharField(max_length=200) # Change to request_id
error_status = models.JSONField(null=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
updated_at = models.DateTimeField(auto_now=True)
subscription = models.ForeignKey(Subscription, on_delete=models.CASCADE)
S3_uploaded = models.BooleanField(default=False)

View File

@ -0,0 +1,41 @@
from django.db import models
from django.utils import timezone
from fwd_api.models.Subscription import Subscription
class Report(models.Model):
# Metadata
id = models.AutoField(primary_key=True)
report_id = models.CharField(max_length=200) # Change to request_id
local_file_name = models.CharField(max_length=200) # Change to request_id
error_status = models.JSONField(null=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
updated_at = models.DateTimeField(auto_now=True)
start_at = models.DateTimeField(null=True)
end_at = models.DateTimeField(null=True)
include_for_test_sample = models.BooleanField(default=False)
status = models.CharField(max_length=100)
is_daily_report = models.BooleanField(default=False)
errors = models.TextField(default="", null=True)
subsidiary = models.CharField(default="", null=True, max_length=200)
include_reviewed = models.TextField(default="", null=True, )
include_test = models.CharField(max_length=200, default="")
# Data
S3_uploaded = models.BooleanField(default=False)
S3_file_name = models.TextField(default=None, null=True)
number_request = models.IntegerField(default=0)
number_images = models.IntegerField(default=0)
number_bad_images = models.IntegerField(default=0)
number_imei = models.IntegerField(default=0)
number_invoice = models.IntegerField(default=0)
number_imei_transaction = models.IntegerField(default=0)
number_invoice_transaction = models.IntegerField(default=0)
average_client_time_profile = models.JSONField(null=True) # {"0.1": 100, 0.2: 200, ...} | Future feature
average_OCR_time_profile = models.JSONField(null=True) # {"0.1": 98, 0.2: 202, ...} | Future feature
average_OCR_time = models.JSONField(null=True) # {"invoice": 0.1, "imei": 0.1} | Future feature
average_client_time = models.JSONField(null=True) # {"invoice": 0.1, "imei": 0.1}
feedback_accuracy = models.JSONField(null=True)
reviewed_accuracy = models.JSONField(null=True)
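# Note: feedback_accuracy / reviewed_accuracy are later read by
# MonthReportAccumulate.accumulate() in the accuracy helpers, which expects a
# per-field average plus a matching "<field>_count" entry. A minimal sketch of
# that shape (field names taken from the reader; values illustrative):
#   {"imei_number": 0.98, "imei_number_count": 120,
#    "purchase_date": 0.95, "purchase_date_count": 60,
#    "retailername": 0.90, "retailername_count": 60}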

View File

@ -0,0 +1,35 @@
from django.db import models
from django.utils import timezone
from fwd_api.models.Subscription import Subscription
from fwd_api.models.SubscriptionRequest import SubscriptionRequest
from fwd_api.models.Report import Report
class ReportFile(models.Model):
# Metadata
id = models.AutoField(primary_key=True)
correspond_request_id = models.CharField(max_length=200, default="")
correspond_redemption_id = models.CharField(max_length=200, default="")
created_at = models.DateTimeField(default=timezone.now, db_index=True)
updated_at = models.DateTimeField(auto_now=True)
report = models.ForeignKey(Report, related_name="files", on_delete=models.CASCADE)
# Data
S3_uploaded = models.BooleanField(default=False)
doc_type = models.CharField(max_length=200)
predict_result = models.JSONField(null=True)
feedback_result = models.JSONField(null=True)
reviewed_result = models.JSONField(null=True)
feedback_accuracy = models.JSONField(null=True)
reviewed_accuracy = models.JSONField(null=True)
acc = models.FloatField(default=0)
time_cost = models.FloatField(default=0)
is_reviewed = models.CharField(default="NA", max_length=5) # NA, No, Yes
bad_image_reason = models.TextField(default="")
counter_measures = models.TextField(default="")
error = models.TextField(default="")

View File

@ -3,17 +3,18 @@ from django.utils import timezone
from fwd_api.models.Subscription import Subscription
class SubscriptionRequest(models.Model):
id = models.AutoField(primary_key=True)
pages: int = models.IntegerField()
pages_left: int = models.IntegerField(default=1)
doc_type: str = models.CharField(max_length=100)
request_id = models.CharField(max_length=200) # Change to request_id
redemption_id = models.CharField(max_length=200, null=True)
process_type = models.CharField(max_length=200) # driver/id/invoice
provider_code = models.CharField(max_length=200, default="Guest") # Request source FWD/CTel
predict_result = models.JSONField(null=True)
feedback_result = models.JSONField(null=True)
reviewed_result = models.JSONField(null=True)
status = models.IntegerField() # 1: Processing(Pending) 2: PredictCompleted 3: ReturnCompleted
subscription = models.ForeignKey(Subscription, on_delete=models.CASCADE)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
@ -21,8 +22,12 @@ class SubscriptionRequest(models.Model):
is_test_request = models.BooleanField(default=False)
S3_uploaded = models.BooleanField(default=False)
feedback_accuracy = models.JSONField(null=True)
reviewed_accuracy = models.JSONField(null=True)
ai_inference_profile = models.JSONField(null=True)
preprocessing_time = models.FloatField(default=-1)
client_request_time = models.FloatField(default=-1)
ai_inference_start_time = models.FloatField(default=0)
ai_inference_time = models.FloatField(default=0)
cpu_percent = models.FloatField(default=-1)
@ -31,4 +36,3 @@ class SubscriptionRequest(models.Model):
total_memory = models.FloatField(default=-1)
gpu_stats = models.CharField(max_length=100, null=True)
is_reviewed = models.BooleanField(default=False)
is_bad_image_quality = models.BooleanField(default=False)

View File

@ -12,9 +12,23 @@ class SubscriptionRequestFile(models.Model):
return f"FIL{uuid.uuid4().hex}"
code = models.CharField(max_length=300, default=gen_random_code)
origin_name = models.CharField(max_length=300, default="")
file_name = models.CharField(max_length=300, default=None)
file_path = EncryptedCharField(max_length=500, default=None)
file_category = models.CharField(max_length=200, default=FileCategory.Origin.value)
request = models.ForeignKey(SubscriptionRequest, related_name="files", on_delete=models.CASCADE)
created_at = models.DateTimeField(default=timezone.now)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
updated_at = models.DateTimeField(auto_now=True)
is_bad_image_quality = models.BooleanField(default=False)
doc_type = models.CharField(max_length=10, default="")
index_in_request = models.IntegerField(default=0) # by doc_type
processing_time = models.FloatField(default=-1) # in millisecond
reason = models.TextField(blank=True)
counter_measures = models.TextField(blank=True)
predict_result = models.JSONField(null=True)
feedback_result = models.JSONField(null=True)
reviewed_result = models.JSONField(null=True)
feedback_accuracy = models.JSONField(null=True)
reviewed_accuracy = models.JSONField(null=True)

View File

@ -5,3 +5,8 @@ from .OcrTemplate import OcrTemplate
from .OcrTemplateBox import OcrTemplateBox
from .PricingPlan import PricingPlan
from .Subscription import Subscription
from .FeedbackRequest import FeedbackRequest
from .Report import Report
from .ReportFile import ReportFile

View File

@ -0,0 +1,488 @@
import re
from datetime import datetime
import copy
from typing import Any
from .ocr_utils.ocr_metrics import eval_ocr_metric
from .ocr_utils.sbt_report import post_processing_str
from fwd_api.models import SubscriptionRequest, SubscriptionRequestFile, ReportFile
from ..celery_worker.client_connector import c_connector
from django.db.models import Q
BAD_THRESHOLD = 0.75
valid_keys = ["retailername", "sold_to_party", "purchase_date", "imei_number"]
class MonthReportAccumulate:
def __init__(self):
self.month = None
self.total = {
'subs': "+",
'extraction_date': "Subtotal ()",
'total_images': 0,
'images_quality': {
'successful': 0,
'successful_percent': 0,
'bad': 0,
'bad_percent': 0
},
'average_accuracy_rate': {
'imei': IterAvg(),
'purchase_date': IterAvg(),
'retailer_name': IterAvg()
},
'average_processing_time': {
'imei': IterAvg(),
'invoice': IterAvg()
},
'usage': {
'imei':0,
'invoice': 0
}
}
self.data = []
self.data_format = {
'subs': "",
'extraction_date': "",
'num_imei': 0,
'num_invoice': 0,
'total_images': 0,
'images_quality': {
'successful': 0,
'successful_percent': 0,
'bad': 0,
'bad_percent': 0
},
'average_accuracy_rate': {
'imei': 0,
'purchase_date': 0,
'retailer_name': 0
},
'average_processing_time': {
'imei': 0,
'invoice': 0
},
'usage': {
'imei':0,
'invoice': 0
}
}
def accumulate(self, report):
self.total["total_images"] += report.number_images
self.total["images_quality"]["successful"] += report.number_images - report.number_bad_images
self.total["images_quality"]["bad"] += report.number_bad_images
if sum([report.reviewed_accuracy[x] for x in report.reviewed_accuracy.keys() if "_count" not in x]) > 0 :
self.total["average_accuracy_rate"]["imei"].add_avg(report.reviewed_accuracy.get("imei_number", 0), report.reviewed_accuracy.get("imei_number_count", 0))
self.total["average_accuracy_rate"]["purchase_date"].add_avg(report.reviewed_accuracy.get("purchase_date", 0), report.reviewed_accuracy.get("purchase_date_count", 0))
self.total["average_accuracy_rate"]["retailer_name"].add_avg(report.reviewed_accuracy.get("retailername", 0), report.reviewed_accuracy.get("retailername_count", 0))
elif sum([ report.feedback_accuracy[x] for x in report.feedback_accuracy.keys() if "_count" not in x]) > 0:
self.total["average_accuracy_rate"]["imei"].add_avg(report.feedback_accuracy.get("imei_number", 0), report.feedback_accuracy.get("imei_number_count", 0))
self.total["average_accuracy_rate"]["purchase_date"].add_avg(report.feedback_accuracy.get("purchase_date", 0), report.feedback_accuracy.get("purchase_date_count", 0))
self.total["average_accuracy_rate"]["retailer_name"].add_avg(report.feedback_accuracy.get("retailername", 0), report.feedback_accuracy.get("retailername_count", 0))
self.total["average_processing_time"]["imei"].add_avg(report.average_OCR_time.get("imei", 0), report.average_OCR_time.get("imei_count", 0)) if report.average_OCR_time else 0
self.total["average_processing_time"]["invoice"].add_avg(report.average_OCR_time.get("invoice", 0), report.average_OCR_time.get("invoice_count", 0)) if report.average_OCR_time else 0
self.total["usage"]["imei"] += report.number_imei_transaction
self.total["usage"]["invoice"] += report.number_invoice_transaction
def add(self, report):
report_month = report.created_at.month
if self.month is None:
self.month = report_month
self.total["extraction_date"] = f"Subtotal ({self.month})"
elif self.month != report_month:
self.total["images_quality"]["successful_percent"] += self.total["images_quality"]["successful"]/self.total["total_images"]
self.total["images_quality"]["bad_percent"] += self.total["images_quality"]["bad"]/self.total["total_images"]
return False # Reports from a different month, stop accumulating
# accumulate fields
new_data = copy.deepcopy(self.data_format)
new_data["num_imei"] = report.number_imei
new_data["subs"] = report.subsidiary
new_data["extraction_date"] = report.created_at
new_data["num_invoice"] = report.number_invoice
new_data["total_images"] = report.number_images
new_data["images_quality"]["successful"] = report.number_images - report.number_bad_images
new_data["images_quality"]["bad"] = report.number_bad_images
report.reviewed_accuracy = {} if report.reviewed_accuracy is None else report.reviewed_accuracy
report.feedback_accuracy = {} if report.feedback_accuracy is None else report.feedback_accuracy
if sum([ report.reviewed_accuracy[x] for x in report.reviewed_accuracy.keys() if "_count" not in x]):
new_data["average_accuracy_rate"]["imei"] = report.reviewed_accuracy.get("imei_number", None)
new_data["average_accuracy_rate"]["purchase_date"] = report.reviewed_accuracy.get("purchase_date", None)
new_data["average_accuracy_rate"]["retailer_name"] = report.reviewed_accuracy.get("retailername", None)
elif sum([ report.feedback_accuracy[x] for x in report.feedback_accuracy.keys() if "_count" not in x]):
new_data["average_accuracy_rate"]["imei"] = report.feedback_accuracy.get("imei_number", None)
new_data["average_accuracy_rate"]["purchase_date"] = report.feedback_accuracy.get("purchase_date", None)
new_data["average_accuracy_rate"]["retailer_name"] = report.feedback_accuracy.get("retailername", None)
new_data["average_processing_time"]["imei"] = report.average_OCR_time.get("imei", 0) if report.average_OCR_time else 0
new_data["average_processing_time"]["invoice"] = report.average_OCR_time.get("invoice", 0) if report.average_OCR_time else 0
new_data["usage"]["imei"] = report.number_imei_transaction
new_data["usage"]["invoice"] = report.number_invoice_transaction
new_data["images_quality"]["successful_percent"] += new_data["images_quality"]["successful"]/new_data["total_images"] if new_data["total_images"] else 0
new_data["images_quality"]["bad_percent"] += new_data["images_quality"]["bad"]/new_data["total_images"] if new_data["total_images"] else 0
self.data.append(new_data)
self.accumulate(report)
return True
def __call__(self):
self.total["images_quality"]["successful_percent"] += self.total["images_quality"]["successful"]/self.total["total_images"] if self.total["total_images"] else 0
self.total["images_quality"]["bad_percent"] += self.total["images_quality"]["bad"]/self.total["total_images"] if self.total["total_images"] else 0
total = copy.deepcopy(self.total)
total["average_accuracy_rate"]["imei"] = total["average_accuracy_rate"]["imei"]()
total["average_accuracy_rate"]["purchase_date"] = total["average_accuracy_rate"]["purchase_date"]()
total["average_accuracy_rate"]["retailer_name"] = total["average_accuracy_rate"]["retailer_name"]()
total["average_processing_time"]["imei"] = total["average_processing_time"]["imei"]()
total["average_processing_time"]["invoice"] = total["average_processing_time"]["invoice"]()
return self.month, self.data, total
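# Rough usage sketch (illustrative): reports are fed in chronological order;
# add() returns False as soon as a report from a different month arrives, at
# which point the caller drains the accumulator via __call__() and starts over.
#   month_acc = MonthReportAccumulate()
#   for report in reports:  # ordered by created_at
#       if not month_acc.add(report):
#           month, rows, subtotal = month_acc()
#           month_acc = MonthReportAccumulate()
#           month_acc.add(report)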
class IterAvg:
def __init__(self, name="default"):
self.name = name
self.avg = 0
self.count = 0
def add(self, values):
"""
Args:
values (list[float]):
"""
values = [x for x in values if x is not None]
if len(values) == 0:
return
self.avg = (self.avg*self.count + sum(values))/(self.count+len(values))
self.count += len(values)
def add_avg(self, avg, count):
if avg is None or count is None or count == 0:
return
self.count += count
self.avg = (self.avg*(self.count-count) + avg*count)/(self.count)
def __call__(self):
return self.avg
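# Example (illustrative numbers): IterAvg keeps a running mean without storing
# every sample, so per-report averages can be merged by their counts.
#   avg = IterAvg()
#   avg.add([0.9, 0.8])   # two individual accuracies -> running mean 0.85
#   avg.add_avg(0.7, 3)   # merge a pre-aggregated mean of 3 samples
#   avg()                 # -> (0.9 + 0.8 + 0.7 * 3) / 5 = 0.76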
def first_of_list(the_list):
if not the_list:
return None
return the_list[0]
def extract_report_detail_list(report_detail_list, lower=False, in_percent=True):
data = []
for report_file in report_detail_list:
data.append({
"Request ID": report_file.correspond_request_id,
"Redemption Number": report_file.correspond_redemption_id,
"Image type": report_file.doc_type,
"IMEI_user submitted": first_of_list(report_file.feedback_result.get("imei_number", [None])),
"IMEI_OCR retrieved": first_of_list(report_file.predict_result.get("imei_number", [None])),
"IMEI1 Accuracy": first_of_list(report_file.feedback_accuracy.get("imei_number", [None])),
"Invoice_Purchase Date_Consumer": report_file.feedback_result.get("purchase_date", None),
"Invoice_Purchase Date_OCR": report_file.predict_result.get("purchase_date", []),
"Invoice_Purchase Date Accuracy": first_of_list(report_file.feedback_accuracy.get("purchase_date", [None])),
"Invoice_Retailer_Consumer": report_file.feedback_result.get("retailername", None),
"Invoice_Retailer_OCR": report_file.predict_result.get("retailername", None),
"Invoice_Retailer Accuracy": first_of_list(report_file.feedback_accuracy.get("retailername", [None])),
"OCR Image Accuracy": report_file.acc,
"OCR Image Speed (seconds)": report_file.time_cost,
"Reviewed?": "No",
"Bad Image Reasons": report_file.bad_image_reason,
"Countermeasures": report_file.counter_measures,
"IMEI_Revised Accuracy": first_of_list(report_file.reviewed_accuracy.get("imei_number", [None])),
"Purchase Date_Revised Accuracy": first_of_list(report_file.reviewed_accuracy.get("purchase_date", [None])),
"Retailer_Revised Accuracy": first_of_list(report_file.reviewed_accuracy.get("retailername", [None])),
})
if lower:
for i, dat in enumerate(data):
keys = list(dat.keys())
for old_key in keys:
data[i][old_key.lower().replace(" ", "_")] = data[i].pop(old_key)
if in_percent:
for i, dat in enumerate(data):
keys = [x for x in list(dat.keys()) if "accuracy" in x.lower()]
for key in keys:
if data[i][key]:
data[i][key] = data[i][key]*100
return data
def count_transactions(start_date, end_date):
base_query = Q(created_at__range=(start_date, end_date))
base_query &= Q(is_test_request=False)
transaction_att = {}
print(f"[DEBUG]: atracting transactions attribute...")
total_transaction_requests = SubscriptionRequest.objects.filter(base_query).order_by('created_at')
for request in total_transaction_requests:
if not request.doc_type:
continue
doc_types = request.doc_type.split(",")
for doc_type in doc_types:
if transaction_att.get(doc_type, None) == None:
transaction_att[doc_type] = 1
else:
transaction_att[doc_type] += 1
return transaction_att
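# Example: with two non-test requests in the date range whose doc_type values
# are "imei,imei,invoice" and "invoice", the returned dict is {"imei": 2, "invoice": 2}.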
def convert_datetime_format(date_string: str, is_gt=False) -> str:
# pattern_date_string = "2023-02-28"
input_format = "%Y-%m-%d"
output_format = "%d/%m/%Y"
# Validate the input date string format
pattern = r"\d{4}-\d{2}-\d{2}"
if re.match(pattern, date_string):
# Convert the date string to a datetime object
date_object = datetime.strptime(date_string, input_format)
# Convert the datetime object to the desired output format
formatted_date = date_object.strftime(output_format)
return formatted_date
return date_string
def predict_result_to_ready(result):
dict_result = {"retailername": "",
"sold_to_party": "",
"purchase_date": [],
"imei_number": [],}
dict_result["retailername"] = result.get("content", {}).get("document", [{}])[0].get("content", [{}])[0].get("value", None)
dict_result["sold_to_party"] = result.get("content", {}).get("document", [{}])[0].get("content", [{}, {}])[1].get("value", None)
dict_result["purchase_date"] = result.get("content", {}).get("document", [{}])[0].get("content", [{}, {}, {}])[2].get("value", [])
dict_result["imei_number"] = result.get("content", {}).get("document", [{}])[0].get("content", [{}, {}, {}, {}])[3].get("value", [])
return dict_result
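# The predict_result payload is unpacked positionally above; a minimal sketch of
# the shape it expects (values illustrative):
#   {"content": {"document": [{"content": [
#       {"value": "Samsung Store"},       # 0: retailername
#       {"value": "SBT"},                 # 1: sold_to_party
#       {"value": ["28/02/2023"]},        # 2: purchase_date
#       {"value": ["123456789012345"]}    # 3: imei_number
#   ]}]}}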
def align_fine_result(ready_predict, fine_result):
# print(f"[DEBUG]: fine_result: {fine_result}")
# print(f"[DEBUG]: ready_predict: {ready_predict}")
if fine_result:
if fine_result["purchase_date"] and len(ready_predict["purchase_date"]) == 0:
ready_predict["purchase_date"] = [None]
if fine_result["retailername"] and not ready_predict["retailername"]:
ready_predict["retailername"] = [None]
fine_result["purchase_date"] = [fine_result["purchase_date"] for _ in range(len(ready_predict["purchase_date"]))]
# else:
# fine_result = {}
# for key in ready_predict.keys():
# fine_result[key] = []
# fine_result["purchase_date"] = [None for _ in range(len(ready_predict["purchase_date"]))]
return ready_predict, fine_result
def update_temp_accuracy(accuracy, acc, keys):
for key in keys:
accuracy[key].add(acc[key])
return accuracy
def calculate_accuracy(key_name, inference, target):
"""_summary_
Args:
key_name (string): key to calculate accuracy on, ex: retailername
inference (dict): result from ocr, refined to align with the target down below
target (dict): result of type
"""
acc = []
data = []
if not target or not inference:
return acc, data
if not isinstance(inference[key_name], list):
if inference[key_name] is None:
inference[key_name] = []
else:
inference[key_name] = [inference[key_name]]
if not isinstance(target[key_name], list):
if target[key_name] is None:
target[key_name] = []
else:
target[key_name] = [target[key_name]]
for i, v in enumerate(inference[key_name]):
# TODO: target[key_name][i] is None, ""
x = post_processing_str(key_name, inference[key_name][i], is_gt=False)
y = post_processing_str(key_name, target[key_name][i], is_gt=True)
score = eval_ocr_metric(
[x],
[y],
metric=[
"one_minus_ned",
# "line_acc_ignore_case_symbol",
# "line_acc",
# "one_minus_ned_word",
])
acc.append(list(score.values())[0])
data.append([x, y])
return acc, data
def calculate_avg_accuracy(acc, type, keys=[]):
acc_list = []
# print(f"[DEBUG]: type: {type} - acc: {acc}")
for key in keys:
acc_list += acc.get(type, {}).get(key, [])
acc_list = [x for x in acc_list if x is not None]
return sum(acc_list)/len(acc_list) if len(acc_list) > 0 else None
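# Worked example: calculate_avg_accuracy(
#     {"feedback": {"imei_number": [1.0, 0.5], "purchase_date": [0.9]}},
#     "feedback", ["imei_number", "purchase_date"])
# flattens the per-key lists, drops None entries, and returns (1.0 + 0.5 + 0.9) / 3 = 0.8.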
def calculate_and_save_subcription_file(report, request):
request_att = {"acc": {"feedback": {"imei_number": [],
"purchase_date": [],
"retailername": [],
"sold_to_party": [],
},
"reviewed": {"imei_number": [],
"purchase_date": [],
"retailername": [],
"sold_to_party": [],
}},
"err": [],
"time_cost": {},
"total_images": 0,
"bad_images": 0}
images = SubscriptionRequestFile.objects.filter(request=request)
for image in images:
status, att = calculate_subcription_file(image)
if status != 200:
continue
image.feedback_accuracy = att["acc"]["feedback"]
image.reviewed_accuracy = att["acc"]["reviewed"]
image.is_bad_image_quality = att["is_bad_image"]
image.save()
new_report_file = ReportFile(report=report,
correspond_request_id=request.request_id,
correspond_redemption_id=request.redemption_id,
doc_type=image.doc_type,
predict_result=image.predict_result,
feedback_result=image.feedback_result,
reviewed_result=image.reviewed_result,
feedback_accuracy=att["acc"]["feedback"],
reviewed_accuracy=att["acc"]["reviewed"],
acc=att["avg_acc"],
time_cost=image.processing_time,
bad_image_reason=image.reason,
counter_measures=image.counter_measures,
error="|".join(att["err"])
)
new_report_file.save()
if request_att["time_cost"].get(image.doc_type, None):
request_att["time_cost"][image.doc_type].append(image.processing_time)
else:
request_att["time_cost"][image.doc_type] = [image.processing_time]
try:
request_att["acc"]["feedback"]["imei_number"] += att["acc"]["feedback"]["imei_number"]
request_att["acc"]["feedback"]["purchase_date"] += att["acc"]["feedback"]["purchase_date"]
request_att["acc"]["feedback"]["retailername"] += att["acc"]["feedback"]["retailername"]
request_att["acc"]["feedback"]["sold_to_party"] += att["acc"]["feedback"]["sold_to_party"]
request_att["acc"]["reviewed"]["imei_number"] += att["acc"]["reviewed"]["imei_number"]
request_att["acc"]["reviewed"]["purchase_date"] += att["acc"]["reviewed"]["purchase_date"]
request_att["acc"]["reviewed"]["retailername"] += att["acc"]["reviewed"]["retailername"]
request_att["acc"]["reviewed"]["sold_to_party"] += att["acc"]["reviewed"]["sold_to_party"]
request_att["bad_images"] += int(att["is_bad_image"])
request_att["total_images"] += 1
request_att["err"] += att["err"]
except Exception as e:
print(e)
continue
return request_att
def calculate_subcription_file(subcription_request_file):
att = {"acc": {"feedback": {},
"reviewed": {}},
"err": [],
"is_bad_image": False,
"avg_acc": None}
if not subcription_request_file.predict_result:
return 400, att
inference_result = copy.deepcopy(subcription_request_file.predict_result)
inference_result, feedback_result = align_fine_result(inference_result, copy.deepcopy(subcription_request_file.feedback_result))
inference_result, reviewed_result = align_fine_result(inference_result, copy.deepcopy(subcription_request_file.reviewed_result))
# print(f"[DEBUG]: predict_result: {subcription_request_file.predict_result}")
# print(f"[DEBUG]: inference_result: {inference_result}")
# print(f"[DEBUG]: feedback_result: {feedback_result}")
# print(f"[DEBUG]: reviewed_result: {reviewed_result}")
for key_name in valid_keys:
try:
att["acc"]["feedback"][key_name], _ = calculate_accuracy(key_name, inference_result, feedback_result)
att["acc"]["reviewed"][key_name], _ = calculate_accuracy(key_name, inference_result, reviewed_result)
except Exception as e:
att["err"].append(str(e))
# print(f"[DEBUG]: e: {e} -key_name: {key_name}")
avg_reviewed = calculate_avg_accuracy(att["acc"], "reviewed", ["retailername", "sold_to_party", "purchase_date", "imei_number"])
avg_feedback = calculate_avg_accuracy(att["acc"], "feedback", ["retailername", "sold_to_party", "purchase_date", "imei_number"])
if avg_feedback is not None or avg_reviewed is not None:
avg_acc = max([x for x in [avg_feedback, avg_reviewed] if x is not None])
if avg_acc < BAD_THRESHOLD:
att["is_bad_image"] = True
att["avg_acc"] = avg_acc
return 200, att
def calculate_attributions(request): # for one request, return in order
acc = {"feedback": {},
"reviewed": {}} # {"feedback": {"retailername": [0.1], "sold_to_party":[0.9], "purchase_date":[0.6], "imei_number":[0.8]},
# "reviewed": {"retailername": [0.1], "sold_to_party":[0.9], "purchase_date":[0.6], "imei_number":[0.8]}}
data = {"feedback": {},
"reviewed": {}} # {"feedback": {"retailername": [[ocr, feedback], ...], "sold_to_party":[[ocr, feedback], ...], "purchase_date":[[ocr, feedback], ...], "imei_number":[[ocr, feedback], ...]}}
# {"reviewed": {"retailername": [[ocr, reviewed], ...], "sold_to_party":[[ocr, reviewed], ...], "purchase_date":[[ocr, reviewed], ...], "imei_number":[[ocr, reviewed], ...]}}
time_cost = {} # {"imei": [0.1], "invoice": [0.1]}
image_quality_num = [0, 0] # [good, bad]
image_quality_num[0] = len(request.doc_type.split(","))
error = ""
inference_result = predict_result_to_ready(request.predict_result)
inference_result, reviewed_result = align_fine_result(inference_result, request.reviewed_result)
inference_result, feedback_result = align_fine_result(inference_result, request.feedback_result)
# accuracy calculation
for key_name in valid_keys:
if isinstance(inference_result[key_name], list):
if len(inference_result[key_name]) != len(reviewed_result.get(key_name, [])):
error = f"Request {request.request_id} failed with different {key_name} in predict and reviewed_result"
break
if len(inference_result[key_name]) != len(feedback_result.get(key_name, [])):
error = f"Request {request.request_id} failed with different {key_name} in predict and feedback_result"
break
# calculate accuracy for feedback result
acc["feedback"][key_name], data["feedback"][key_name] = calculate_accuracy(key_name, inference_result, feedback_result)
acc["reviewed"][key_name], data["reviewed"][key_name] = calculate_accuracy(key_name, inference_result, reviewed_result)
else:
inference_result[key_name] = [inference_result[key_name]]
feedback_result[key_name] = [feedback_result[key_name]]
reviewed_result[key_name] = [reviewed_result[key_name]]
acc["feedback"][key_name], data["feedback"][key_name] = calculate_accuracy(key_name, inference_result, feedback_result)
acc["reviewed"][key_name], data["reviewed"][key_name] = calculate_accuracy(key_name, inference_result, reviewed_result)
acc["feedback"]["purchase_date"] = [max(acc["feedback"]["purchase_date"])] if len(acc["feedback"]["purchase_date"]) > 0 else []
acc["reviewed"]["purchase_date"] = [max(acc["reviewed"]["purchase_date"])] if len(acc["reviewed"]["purchase_date"]) > 0 else []
# Count for bad and total images
avg_invoice_feedback = calculate_avg_accuracy(acc, "feedback", ["retailername", "sold_to_party", "purchase_date"])
avg_invoice_reviewed = calculate_avg_accuracy(acc, "reviewed", ["retailername", "sold_to_party", "purchase_date"])
if avg_invoice_feedback is not None or avg_invoice_reviewed is not None:
if max([x for x in [avg_invoice_feedback, avg_invoice_reviewed] if x is not None]) < BAD_THRESHOLD:
image_quality_num[1] += 1
for i, _ in enumerate(acc["feedback"]["imei_number"]):
if acc["feedback"]["imei_number"][i] is not None and acc["reviewed"]["imei_number"][i] is not None:
if max([x for x in [acc["feedback"]["imei_number"][i], acc["reviewed"]["imei_number"][i]] if x is not None]) < BAD_THRESHOLD:
image_quality_num[1] += 1
# time cost and quality calculation
# TODO: to be deprecated, doc_type would be in file level in the future
try:
for doc_type, doc_profile in request.ai_inference_profile.items():
doc_type = doc_type.split("_")[0]
inference_time = doc_profile["inference"][1][0] - doc_profile["inference"][0]
postprocess_time = doc_profile["postprocess"][1] - doc_profile["postprocess"][0]
time_cost.setdefault(doc_type, []).append(inference_time + postprocess_time)
except Exception as e:
error = f"Request id {request.request_id} failed with error: {e}"
return acc, data, time_cost, image_quality_num, error
def shadow_report(report_id, query):
c_connector.make_a_report(
(report_id, query))
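For reference, the "one_minus_ned" score requested from eval_ocr_metric above appears to be one minus the normalized edit distance between the OCR string and the ground truth (the OCR metrics module below computes it with rapidfuzz); a minimal standalone sketch, assuming rapidfuzz is installed:

from rapidfuzz.distance import Levenshtein

def one_minus_ned(pred: str, gt: str) -> float:
    # 1.0 for an exact match, decreasing towards 0.0 as edits accumulate
    return 1.0 - Levenshtein.normalized_distance(pred, gt)

print(one_minus_ned("123456789012345", "123456789012345"))  # 1.0
print(one_minus_ned("samsung str0re", "samsung store"))      # partial credit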

View File

@ -6,17 +6,68 @@ import json
from PIL import Image, ExifTags
from django.core.files.uploadedfile import TemporaryUploadedFile
from django.utils import timezone
from fwd import settings
from ..utils import s3 as S3Util
from fwd_api.constant.common import allowed_file_extensions
from fwd_api.exception.exceptions import GeneralException, RequiredFieldException, InvalidException, \
ServiceUnavailableException, FileFormatInvalidException, LimitReachedException, InvalidDecompressedSizeException
from fwd_api.models import SubscriptionRequest, OcrTemplate
ServiceUnavailableException, FileFormatInvalidException, LimitReachedException, InvalidDecompressedSizeException, RequiredColumnException
from fwd_api.models import SubscriptionRequest, OcrTemplate, FeedbackRequest, SubscriptionRequestFile, Report, ReportFile
from fwd_api.utils import process as ProcessUtil
from fwd_api.utils.crypto import image_authenticator
from fwd_api.utils.image import resize
from ..celery_worker.client_connector import c_connector
import imagesize
import csv
from openpyxl import load_workbook
from openpyxl.styles import Font, Border, Side, PatternFill, NamedStyle
s3_client = S3Util.MinioS3Client(
endpoint=settings.S3_ENDPOINT,
access_key=settings.S3_ACCESS_KEY,
secret_key=settings.S3_SECRET_KEY,
bucket_name=settings.S3_BUCKET_NAME
)
def validate_report_list(request):
start_date_str = request.GET.get('start_date')
end_date_str = request.GET.get('end_date')
page_number = int(request.GET.get('page', 0))
page_size = int(request.GET.get('page_size', 10))
report_id = request.GET.get('report_id', None)
validated_data = {}
validated_data["start_date"] = None
validated_data["end_date"] = None
if len(start_date_str) > 0 and len(end_date_str) > 0:
try:
validated_data["start_date"] = timezone.datetime.strptime(start_date_str, '%Y-%m-%dT%H:%M:%S%z')
validated_data["end_date"] = timezone.datetime.strptime(end_date_str, '%Y-%m-%dT%H:%M:%S%z')
except ValueError:
raise InvalidException(excArgs="Date format")
validated_data["report_id"] = report_id
validated_data["page_size"] = page_size
validated_data["page_number"] = page_number
if validated_data["report_id"] is None and validated_data["start_date"] is None:
raise RequiredFieldException(excArgs="report_id, start_date, end_date")
return validated_data
def validate_feedback_file(csv_file_path):
required_columns = ['redemptionNumber', 'requestId', 'imeiNumber', 'imeiNumber2', 'Purchase Date', 'retailer', 'Sold to party', 'timetakenmilli']
missing_columns = []
with open(csv_file_path, 'r') as file:
reader = csv.DictReader(file)
# Check if all required columns are present
for column in required_columns:
if column not in reader.fieldnames:
missing_columns.append(column)
if missing_columns:
raise RequiredColumnException(excArgs=str(missing_columns))
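# The feedback CSV is therefore expected to carry a header row like:
#   redemptionNumber,requestId,imeiNumber,imeiNumber2,Purchase Date,retailer,Sold to party,timetakenmilli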
def validate_list_file(files, max_file_num=settings.MAX_UPLOAD_FILES_IN_A_REQUEST, min_file_num=1, file_field="files"):
total_file_size = 0
@ -38,6 +89,25 @@ def validate_list_file(files, max_file_num=settings.MAX_UPLOAD_FILES_IN_A_REQUES
if total_file_size > settings.MAX_UPLOAD_FILE_SIZE_OF_A_REQUEST:
raise LimitReachedException(excArgs=('Total size of all files', str(settings.MAX_UPLOAD_SIZE_OF_A_FILE / 1024 / 1024), 'MB'))
def validate_csv_feedback(files, max_file_num=1, min_file_num=1, file_field="csv files"):
total_file_size = 0
if len(files) < min_file_num:
raise RequiredFieldException(excArgs=file_field)
if len(files) > max_file_num:
raise LimitReachedException(excArgs=(f'Number of {file_field}', str(max_file_num), ''))
for f in files:
if not isinstance(f, TemporaryUploadedFile):
# print(f'[DEBUG]: {f.name}')
raise InvalidException(excArgs="files")
extension = f.name.split(".")[-1].lower() in ["csv"]
if not extension or "." not in f.name:
raise FileFormatInvalidException(excArgs=[".csv"])
if f.size > settings.MAX_UPLOAD_SIZE_OF_A_FILE:
raise LimitReachedException(excArgs=('A file', str(settings.MAX_UPLOAD_SIZE_OF_A_FILE / 1024 / 1024), 'MB'))
total_file_size += f.size
if total_file_size > settings.MAX_UPLOAD_FILE_SIZE_OF_A_REQUEST:
raise LimitReachedException(excArgs=('Total size of all files', str(settings.MAX_UPLOAD_SIZE_OF_A_FILE / 1024 / 1024), 'MB'))
def get_file(file_path: str):
try:
@ -105,6 +175,31 @@ def save_json_file(file_name: str, rq: SubscriptionRequest, data: dict):
json.dump(data, json_file)
return file_path
def save_feedback_file(file_name: str, rq: FeedbackRequest, uploaded_file: dict):
user_id = str(rq.subscription.user.id)
feedback_id = str(rq.id)
folder_path = os.path.join(settings.MEDIA_ROOT, 'users', user_id, "feedbacks", feedback_id)
os.makedirs(folder_path, exist_ok = True)
file_path = os.path.join(folder_path, file_name)
with uploaded_file.open() as file:
# Read the contents of the file
file_contents = file.read().decode('utf-8')
with open(file_path, 'w', newline='') as csvfile:
csvfile.write(file_contents)
return file_path
def save_workbook_file(file_name: str, rp: Report, workbook):
report_id = str(rp.report_id)
folder_path = os.path.join(settings.MEDIA_ROOT, "report", report_id)
os.makedirs(folder_path, exist_ok = True)
file_path = os.path.join(folder_path, file_name)
workbook.save(file_path)
return file_path
def delete_file_with_path(file_path: str) -> bool:
try:
os.remove(file_path)
@ -126,7 +221,7 @@ def save_template_file(file_name: str, rq: OcrTemplate, file: TemporaryUploadedF
print(e)
raise ServiceUnavailableException()
def save_file_with_path(file_name: str, file: TemporaryUploadedFile, quality, folder_path):
def save_file_with_path(file_name: str, file: TemporaryUploadedFile, quality, folder_path: str):
try:
file_path = os.path.join(folder_path, file_name)
extension = file_name.split(".")[-1]
@ -142,10 +237,19 @@ def save_file_with_path(file_name: str, file: TemporaryUploadedFile, quality, fo
return file_path
def resize_and_save_file(file_name: str, rq: SubscriptionRequest, file: TemporaryUploadedFile, quality):
def resize_and_save_file(file_name: str, rq: SubscriptionRequest, file: TemporaryUploadedFile, quality: int):
try:
folder_path = get_folder_path(rq)
pathlib.Path(folder_path).mkdir(exist_ok=True, parents=True)
# request_file: SubscriptionRequestFile = SubscriptionRequestFile(
# file_name = file_name,
# file_path = os.path.join(folder_path, file_name),
# doc_type = doc_type,
# origin_name = file.name,
# request = rq,
# index_in_request= index_in_request
# )
# request_file.save()
return save_file_with_path(file_name, file, quality, folder_path)
except InvalidDecompressedSizeException as e:
raise e
@ -166,6 +270,33 @@ def save_to_S3(file_name, rq, local_file_path):
print(f"[ERROR]: {e}")
raise ServiceUnavailableException()
def save_feedback_to_S3(file_name, id, local_file_path):
try:
# print(f"[DEBUG]: Uploading feedback to S3 with local path {local_file_path}, id: {id}, file_name: {file_name}")
assert len(local_file_path.split("/")) >= 3, "file_path must have at least feedback_folder and feedback_id"
# s3_key = os.path.join(local_file_path.split("/")[-3], local_file_path.split("/")[-2], file_name)
s3_key = os.path.join("feedback", local_file_path.split("/")[-2], file_name)
# print(f"[DEBUG]: Uploading feedback to S3 with s3_key {s3_key}")
c_connector.upload_feedback_to_s3((local_file_path, s3_key, id))
c_connector.remove_local_file((local_file_path, id))
return s3_key
except Exception as e:
print(f"[ERROR]: {e}")
raise ServiceUnavailableException()
def save_report_to_S3(id, local_file_path):
try:
s3_key = os.path.join("report", local_file_path.split("/")[-2], local_file_path.split("/")[-1])
c_connector.upload_report_to_s3((local_file_path, s3_key, id))
c_connector.remove_local_file((local_file_path, id))
return s3_key
except Exception as e:
print(f"[ERROR]: {e}")
raise ServiceUnavailableException()
def download_from_S3(s3_key, local_file_path):
s3_client.download_file(s3_key, local_file_path)
def save_file_with_path(file_name: str, file: TemporaryUploadedFile, quality, folder_path):
try:
file_path = os.path.join(folder_path, file_name)
@ -252,3 +383,129 @@ def build_url(folder: str, data_id: str, user_id: int, file_name: str = None) ->
def build_media_url_v2(media_id: str, user_id: int, sub_id: int, u_sync_id: str) -> str:
token = image_authenticator.generate_img_token_v2(user_id, sub_id, u_sync_id)
return f'{settings.BASE_URL}/api/ctel/v2/media/request/{media_id}/?token={token}'
def get_value(_dict, keys):
keys = keys.split('.')
value = _dict
for key in keys:
if key not in value:
return "-"
else:
value = value.get(key, {})
if not value:
return "-"
elif isinstance(value, list):
value = str(value)
return value
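# Examples of the dotted-path lookup used by dict2xlsx below:
#   get_value({"images_quality": {"bad_percent": 12.5}}, "images_quality.bad_percent")  # -> 12.5
#   get_value({"usage": {}}, "usage.imei")                                              # -> "-"
#   get_value({"subs": "SBT"}, "extraction_date")                                       # -> "-"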
def dict2xlsx(input: json, _type='report'):
red = "FF0000"
black = "000000"
green = "E2EFDA"
yellow = "FFF2CC"
gray = "D0CECE"
font_black = Font(name="Calibri", size=11, color=black)
font_black_bold = Font(name="Calibri", size=11, color=black, bold=True)
font_red = Font(name="Calibri", size=11, color=red)
thin = Side(border_style="thin", color=black)
border = Border(left=thin, right=thin, top=thin, bottom=thin)
fill_green = PatternFill(start_color=green, end_color=green, fill_type = "solid")
fill_yellow = PatternFill(start_color=yellow, end_color=yellow, fill_type = "solid")
fill_gray = PatternFill(start_color=gray, end_color=gray, fill_type = "solid")
normal_cell = NamedStyle(name="normal_cell", font=font_black, border=border)
normal_cell_red = NamedStyle(name="normal_cell_red", font=font_red, border=border)
if _type == 'report':
wb = load_workbook(filename = 'report.xlsx')
ws = wb['Sheet1']
mapping = {
'A': 'subs',
'B': 'extraction_date',
'C': 'num_imei',
'D': 'num_invoice',
'E': 'total_images',
'F': 'images_quality.successful',
'G': 'images_quality.successful_percent',
'H': 'images_quality.bad',
'I': 'images_quality.bad_percent',
'J': 'average_accuracy_rate.imei',
'K': 'average_accuracy_rate.purchase_date',
'L': 'average_accuracy_rate.retailer_name',
'M': 'average_processing_time.imei',
'N': 'average_processing_time.invoice',
'O': 'usage.imei',
'P': 'usage.invoice',
}
start_index = 5
elif _type == 'report_detail':
wb = load_workbook(filename = 'report_detail.xlsx')
ws = wb['Sheet1']
mapping = {
'A': 'request_id',
'B': 'redemption_number',
'C': 'image_type',
'D': 'imei_user_submitted',
'E': "imei_ocr_retrieved",
'F': "imei1_accuracy",
'G': "invoice_purchase_date_consumer",
'H': "invoice_purchase_date_ocr",
'I': "invoice_purchase_date_accuracy",
'J': "invoice_retailer_consumer",
'K': "invoice_retailer_ocr",
'L': "invoice_retailer_accuracy",
'M': "ocr_image_accuracy",
'N': "ocr_image_speed",
'O': "is_reviewed",
'P': "bad_image_reasons",
'Q': "countermeasures",
'R': 'imei_revised_accuracy',
'S': 'purchase_date_revised_accuracy',
'T': 'retailer_revised_accuracy',
}
start_index = 4
for subtotal in input:
for key_index, key in enumerate(mapping.keys()):
value = get_value(subtotal, mapping[key])
ws[key + str(start_index)] = value
ws[key + str(start_index)].border = border
if _type == 'report':
ws[key + str(start_index)].font = font_black_bold
if key_index == 0 or (key_index >= 9 and key_index <= 15):
ws[key + str(start_index)].fill = fill_gray
elif key_index == 1:
ws[key + str(start_index)].fill = fill_green
elif key_index >= 4 and key_index <= 8:
ws[key + str(start_index)].fill = fill_yellow
elif _type == 'report_detail':
if 'accuracy' in mapping[key] and type(value) in [int, float] and value < 95:
ws[key + str(start_index)].style = normal_cell_red
elif 'speed' in mapping[key] and type(value) in [int, float] and value > 2.0:
ws[key + str(start_index)].style = normal_cell_red
else:
ws[key + str(start_index)].style = normal_cell
start_index += 1
if 'data' in subtotal.keys():
for record in subtotal['data']:
for key in mapping.keys():
value = get_value(record, mapping[key])
ws[key + str(start_index)] = value
if 'average_accuracy_rate' in mapping[key] and type(value) in [int, float] and value < 95:
ws[key + str(start_index)].style = normal_cell_red
elif 'average_processing_time' in mapping[key] and type(value) in [int, float] and value > 2.0:
ws[key + str(start_index)].style = normal_cell_red
elif 'bad_percent' in mapping[key] and type(value) in [int, float] and value > 10:
ws[key + str(start_index)].style = normal_cell_red
else:
ws[key + str(start_index)].style = normal_cell
start_index += 1
return wb
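# Illustrative usage (a sketch, not part of the pipeline): `report_rows` is a
# hypothetical list of dicts keyed like the mapping above; report.xlsx is the
# template loaded by this function.
# wb = dict2xlsx(report_rows, _type='report')
# wb.save('report_output.xlsx')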

View File

@ -23,7 +23,7 @@ def get_latest_requests(limit=50):
"total_memory": request.total_memory,
"gpu_stats": request.gpu_stats,
"is_reviewed": request.is_reviewed,
"is_bad_image_quality": request.is_bad_image_quality,
# "is_bad_image_quality": request.is_bad_image_quality,
})
return requests_dict

View File

@ -0,0 +1,385 @@
import re
from pathlib import Path
from difflib import SequenceMatcher
from terminaltables import AsciiTable
from rapidfuzz.distance import Levenshtein
from .wiki_diff import inline_diff
def is_type_list(x, type):
if not isinstance(x, list):
return False
return all(isinstance(item, type) for item in x)
def cal_true_positive_char(pred, gt):
"""Calculate correct character number in prediction.
Args:
pred (str): Prediction text.
gt (str): Ground truth text.
Returns:
true_positive_char_num (int): The true positive number.
"""
all_opt = SequenceMatcher(None, pred, gt)
true_positive_char_num = 0
for opt, _, _, s2, e2 in all_opt.get_opcodes():
if opt == "equal":
true_positive_char_num += e2 - s2
else:
pass
return true_positive_char_num
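# Example (illustrative): cal_true_positive_char("helo", "hello") returns 4,
# since SequenceMatcher aligns "hel" and the final "o" as equal blocks.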
def post_processing(text):
"""
- Remove special characters and extra spaces (lower-casing is done by the caller)
"""
text = re.sub(
r"[^aAàÀảẢãÃáÁạẠăĂằẰẳẲẵẴắẮặẶâÂầẦẩẨẫẪấẤậẬbBcCdDđĐeEèÈẻẺẽẼéÉẹẸêÊềỀểỂễỄếẾệỆfFgGhHiIìÌỉỈĩĨíÍịỊjJkKlLmMnNoOòÒỏỎõÕóÓọỌôÔồỒổỔỗỖốỐộỘơƠờỜởỞỡỠớỚợỢpPqQrRsStTuUùÙủỦũŨúÚụỤưƯừỪửỬữỮứỨựỰvVwWxXyYỳỲỷỶỹỸýÝỵỴzZ0123456789 ]",
" ",
text,
)
text = re.sub(r"\s\s+", " ", text)
text = text.strip()
return text
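# Example (illustrative): post_processing("Số hóa đơn: 0123/ABC") -> "Số hóa đơn 0123 ABC"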
def count_matches(pred_texts, gt_texts, use_ignore=True):
"""Count the various match number for metric calculation.
Args:
pred_texts (list[str]): Predicted text string.
gt_texts (list[str]): Ground truth text string.
Returns:
match_res: (dict[str: int]): Match number used for
metric calculation.
"""
match_res = {
"gt_char_num": 0,
"pred_char_num": 0,
"true_positive_char_num": 0,
"gt_word_num": 0,
"match_word_num": 0,
"match_word_ignore_case": 0,
"match_word_ignore_case_symbol": 0,
"match_kie": 0,
"match_kie_ignore_case": 0,
}
# comp = re.compile('[^A-Z^a-z^0-9^\u4e00-\u9fa5]')
# comp = re.compile('[]')
norm_ed_sum = 0.0
gt_texts_for_ned_word = []
pred_texts_for_ned_word = []
for pred_text, gt_text in zip(pred_texts, gt_texts):
if gt_text == pred_text:
match_res["match_word_num"] += 1
match_res["match_kie"] += 1
gt_text_lower = str(gt_text).lower()
pred_text_lower = str(pred_text).lower()
if gt_text_lower == pred_text_lower:
match_res["match_word_ignore_case"] += 1
# gt_text_lower_ignore = comp.sub('', gt_text_lower)
# pred_text_lower_ignore = comp.sub('', pred_text_lower)
if use_ignore:
gt_text_lower_ignore = post_processing(gt_text_lower)
pred_text_lower_ignore = post_processing(pred_text_lower)
else:
gt_text_lower_ignore = gt_text_lower
pred_text_lower_ignore = pred_text_lower
if gt_text_lower_ignore == pred_text_lower_ignore:
match_res["match_word_ignore_case_symbol"] += 1
match_res["match_kie_ignore_case"] += 1
gt_texts_for_ned_word.append(gt_text_lower_ignore.split(" "))
pred_texts_for_ned_word.append(pred_text_lower_ignore.split(" "))
match_res["gt_word_num"] += 1
norm_ed = Levenshtein.normalized_distance(
pred_text_lower_ignore, gt_text_lower_ignore
)
# if norm_ed > 0.1:
# print(gt_text_lower_ignore, pred_text_lower_ignore, sep='\n')
# print("-"*20)
norm_ed_sum += norm_ed
# number to calculate char level recall & precision
match_res["gt_char_num"] += len(gt_text_lower_ignore)
match_res["pred_char_num"] += len(pred_text_lower_ignore)
true_positive_char_num = cal_true_positive_char(
pred_text_lower_ignore, gt_text_lower_ignore
)
match_res["true_positive_char_num"] += true_positive_char_num
normalized_edit_distance = norm_ed_sum / max(1, len(gt_texts))
match_res["ned"] = normalized_edit_distance
# NED for word-level
norm_ed_word_sum = 0.0
# print(pred_texts_for_ned_word[0])
unique_words = list(
set(
[x for line in pred_texts_for_ned_word for x in line]
+ [x for line in gt_texts_for_ned_word for x in line]
)
)
preds = [
[unique_words.index(w) for w in pred_text_for_ned_word]
for pred_text_for_ned_word in pred_texts_for_ned_word
]
truths = [
[unique_words.index(w) for w in gt_text_for_ned_word]
for gt_text_for_ned_word in gt_texts_for_ned_word
]
for pred_text, gt_text in zip(preds, truths):
norm_ed_word = Levenshtein.normalized_distance(pred_text, gt_text)
# if norm_ed_word < 0.2:
# print(pred_text, gt_text)
norm_ed_word_sum += norm_ed_word
normalized_edit_distance_word = norm_ed_word_sum / max(1, len(gt_texts))
match_res["ned_word"] = normalized_edit_distance_word
return match_res
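# Illustrative result (a sketch): count_matches(["abc"], ["abd"]) yields
# gt_char_num=3, pred_char_num=3, true_positive_char_num=2 and ned = 1/3.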
def eval_ocr_metric(pred_texts, gt_texts, metric="acc"):
"""Evaluate the text recognition performance with metric: word accuracy and
1-N.E.D. See https://rrc.cvc.uab.es/?ch=14&com=tasks for details.
Args:
pred_texts (list[str]): Text strings of prediction.
gt_texts (list[str]): Text strings of ground truth.
metric (str | list[str]): Metric(s) to be evaluated. Options are:
- 'word_acc': Accuracy at word level.
- 'word_acc_ignore_case': Accuracy at word level, ignoring letter
case.
- 'word_acc_ignore_case_symbol': Accuracy at word level, ignoring
letter case and symbol. (Default metric for academic evaluation)
- 'char_recall': Recall at character level, ignoring
letter case and symbol.
- 'char_precision': Precision at character level, ignoring
letter case and symbol.
- 'one_minus_ned': 1 - normalized_edit_distance
In particular, if ``metric == 'acc'``, results on all metrics above
will be reported.
Returns:
dict{str: float}: Result dict for text recognition, keys could be some
of the following: ['word_acc', 'word_acc_ignore_case',
'word_acc_ignore_case_symbol', 'char_recall', 'char_precision',
'1-N.E.D'].
"""
assert isinstance(pred_texts, list)
assert isinstance(gt_texts, list)
assert len(pred_texts) == len(gt_texts)
assert isinstance(metric, str) or is_type_list(metric, str)
if metric == "acc" or metric == ["acc"]:
metric = [
"word_acc",
"word_acc_ignore_case",
"word_acc_ignore_case_symbol",
"char_recall",
"char_precision",
"one_minus_ned",
]
metric = set([metric]) if isinstance(metric, str) else set(metric)
# supported_metrics = set([
# 'word_acc', 'word_acc_ignore_case', 'word_acc_ignore_case_symbol',
# 'char_recall', 'char_precision', 'one_minus_ned', 'one_minus_ned_word'
# ])
# assert metric.issubset(supported_metrics)
match_res = count_matches(pred_texts, gt_texts)
eps = 1e-8
eval_res = {}
if "char_recall" in metric:
char_recall = (
1.0 * match_res["true_positive_char_num"] / (eps + match_res["gt_char_num"])
)
eval_res["char_recall"] = char_recall
if "char_precision" in metric:
char_precision = (
1.0
* match_res["true_positive_char_num"]
/ (eps + match_res["pred_char_num"])
)
eval_res["char_precision"] = char_precision
if "word_acc" in metric:
word_acc = 1.0 * match_res["match_word_num"] / (eps + match_res["gt_word_num"])
eval_res["word_acc"] = word_acc
if "word_acc_ignore_case" in metric:
word_acc_ignore_case = (
1.0 * match_res["match_word_ignore_case"] / (eps + match_res["gt_word_num"])
)
eval_res["word_acc_ignore_case"] = word_acc_ignore_case
if "word_acc_ignore_case_symbol" in metric:
word_acc_ignore_case_symbol = (
1.0
* match_res["match_word_ignore_case_symbol"]
/ (eps + match_res["gt_word_num"])
)
eval_res["word_acc_ignore_case_symbol"] = word_acc_ignore_case_symbol
if "one_minus_ned" in metric:
eval_res["1-N.E.D"] = 1.0 - match_res["ned"]
if "one_minus_ned_word" in metric:
eval_res["1-N.E.D_word"] = 1.0 - match_res["ned_word"]
if "line_acc_ignore_case_symbol" in metric:
line_acc_ignore_case_symbol = (
1.0 * match_res["match_kie_ignore_case"] / (eps + match_res["gt_word_num"])
)
eval_res["line_acc_ignore_case_symbol"] = line_acc_ignore_case_symbol
if "line_acc" in metric:
word_acc_ignore_case_symbol = (
1.0 * match_res["match_kie"] / (eps + match_res["gt_word_num"])
)
eval_res["line_acc"] = word_acc_ignore_case_symbol
for key, value in eval_res.items():
eval_res[key] = float("{:.4f}".format(value))
return eval_res
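# Illustrative call (a sketch):
# eval_ocr_metric(["ABC Mart"], ["ABC Mart"], metric=["one_minus_ned", "line_acc"])
# -> {'1-N.E.D': 1.0, 'line_acc': 1.0}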
def eval_kie(preds_e2e: dict[str, dict[str, str]], gt_e2e: dict[str, dict[str, str]], labels, skip_labels=[]):
results = {label: 1 for label in labels}
pred_texts_dict = {label: [] for label in labels}
gt_texts_dict = {label: [] for label in labels}
fail_cases = {}
for img_id in gt_e2e.keys():
fail_cases[img_id] = {}
pred_items = preds_e2e.get(img_id, {k: '' for k in gt_e2e[img_id]})
gt_items = gt_e2e[img_id]
for class_name, text_gt in gt_items.items():
if class_name in skip_labels:
continue
# if class_name == 'seller_name_value':
# print(gt_items)
if class_name not in pred_items:
text_pred = ""
else:
text_pred = pred_items[class_name]
if str(text_pred) != str(text_gt):
diff = inline_diff(text_pred, text_gt)
fail_cases[img_id][class_name] = {
'pred': text_pred,
'gt': text_gt,
"diff": diff['res_text'],
"ned": diff["ned"],
"score": eval_ocr_metric([text_pred], [text_gt], metric=[
"one_minus_ned"])["1-N.E.D"],
}
pred_texts_dict[class_name].append(text_pred)
gt_texts_dict[class_name].append(text_gt)
for class_name in labels:
pred_texts = pred_texts_dict[class_name]
gt_texts = gt_texts_dict[class_name]
result = eval_ocr_metric(
pred_texts,
gt_texts,
metric=[
"one_minus_ned",
"line_acc_ignore_case_symbol",
"line_acc",
"one_minus_ned_word",
],
)
results[class_name] = {
"1-ned": result["1-N.E.D"],
"1-ned-word": result["1-N.E.D_word"],
"line_acc": result["line_acc"],
"line_acc_ignore_case_symbol": result["line_acc_ignore_case_symbol"],
"samples": len(pred_texts),
}
# avg results
sum_1_ned = sum(
[
results[class_name]["1-ned"] * results[class_name]["samples"]
for class_name in labels
]
)
sum_1_ned_word = sum(
[
results[class_name]["1-ned-word"] * results[class_name]["samples"]
for class_name in labels
]
)
sum_line_acc = sum(
[
results[class_name]["line_acc"] * results[class_name]["samples"]
for class_name in labels
]
)
sum_line_acc_ignore_case_symbol = sum(
[
results[class_name]["line_acc_ignore_case_symbol"]
* results[class_name]["samples"]
for class_name in labels
]
)
total_samples = sum(
[results[class_name]["samples"] for class_name in labels]
)
results["avg_all"] = {
"1-ned": round(sum_1_ned / total_samples, 4),
"1-ned-word": round(sum_1_ned_word / total_samples, 4),
"line_acc": round(sum_line_acc / total_samples, 4),
"line_acc_ignore_case_symbol": round(
sum_line_acc_ignore_case_symbol / total_samples, 4
),
"samples": total_samples,
}
table_data = [
[
"class_name",
"1-NED",
"1-N.E.D_word",
"line-acc",
"line_acc_ignore_case_symbol",
"#samples",
]
]
for class_name in results.keys():
# if c < p.shape[0]:
table_data.append(
[
class_name,
results[class_name]["1-ned"],
results[class_name]["1-ned-word"],
results[class_name]["line_acc"],
results[class_name]["line_acc_ignore_case_symbol"],
results[class_name]["samples"],
]
)
table = AsciiTable(table_data)
print(table.table)
return results, fail_cases
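# Illustrative input shape (a sketch; request id and values are hypothetical):
# preds = {"req_001": {"retailername": "ABC Mart", "purchase_date": "2024-01-05"}}
# gts = {"req_001": {"retailername": "ABC Mart", "purchase_date": "2024-01-05"}}
# results, fail_cases = eval_kie(preds, gts, labels=["retailername", "purchase_date"])
# prints a per-class AsciiTable and returns per-class metrics plus an "avg_all" row.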

View File

@ -0,0 +1,432 @@
import os
import re
import ast
import time
import json
import glob
import shutil
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from datetime import datetime
from .ocr_metrics import eval_ocr_metric
import sys
# sys.path.append(os.path.dirname(__file__))
from sdsvkvu.utils.query.sbt_v2 import get_seller, post_process_seller
def read_json(file_path: str):
with open(file_path, 'r') as f:
return json.load(f)
def write_to_json(file_path, content):
with open(file_path, mode='w', encoding='utf8') as f:
json.dump(content, f, ensure_ascii=False)
def convert_datetime_format(date_string: str, is_gt=False) -> str:
# pattern_date_string = "2023-02-28"
output_format = "%Y-%m-%d"
input_format = "%d/%m/%Y"
# Validate the input date string format
pattern = r"\d{2}\/\d{2}\/\d{4}"
if re.match(pattern, date_string):
# Convert the date string to a datetime object
date_object = datetime.strptime(date_string, input_format)
# Convert the datetime object to the desired output format
formatted_date = date_object.strftime(output_format)
return formatted_date
return date_string
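# Example (illustrative): convert_datetime_format("28/02/2023") -> "2023-02-28";
# strings not matching dd/mm/yyyy are returned unchanged.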
def normalise_retailer_name(retailer: str):
input_value = {
"text": retailer,
"id": 0,
"class": "seller",
"bbox": [0, 0, 0, 0],
}
output = get_seller({'seller': [input_value]})
norm_seller_name = post_process_seller(output)
return norm_seller_name
def post_processing_str(class_name: str, s: str, is_gt: bool) -> str:
s = str(s).replace('', ' ').strip()
if s.lower() in ['null', 'nan', "none"]:
return ''
if class_name == "purchase_date" and is_gt == True:
s = convert_datetime_format(s)
if class_name == "retailername":
s = normalise_retailer_name(s)
return s
def convert_groundtruth_from_csv(
csv_path: str,
save_dir: str,
classes: list = ["retailername", "sold_to_party", "purchase_date", "imei_number"]
):
# if isinstance(csv_path_list, str):
# csv_path_list = [csv_path_list]
df = pd.read_csv(csv_path)
total_output = {}
for _, request in df.iterrows():
req_id = request['requestId']
if req_id not in total_output:
total_output[req_id] = {k: None for k in classes}
total_output[req_id]["imei_number"] = []
total_output[req_id]["imei_number"].extend([request["imeiNumber"], request["imeiNumber2"]])
total_output[req_id]["imei_number"] = list(set(total_output[req_id]["imei_number"]))
total_output[req_id]["purchase_date"] = request["Purchase Date"]
total_output[req_id]["retailername"] = request["retailer"]
for req_id, output in total_output.items():
save_path = os.path.join(save_dir, req_id)
os.makedirs(save_path, exist_ok=True)
write_to_json(os.path.join(save_path, f"{req_id}.json"), output)
def convert_predict_from_csv(
csv_path: str,
save_dir: str,
classes: list = ["retailername", "sold_to_party", "purchase_date", "imei_number"]
):
# if isinstance(csv_path_list, str):
# csv_path_list = [csv_path_list]
df = pd.read_csv(csv_path)
for _, request in df.iterrows():
n_pages = request['pages']
req_id = request['request_id']
if not isinstance(request['doc_type'], str) or not isinstance(request['predict_result'], str):
print(f"[WARNING] Skipped request id {req_id}")
continue
doc_type_list = request['doc_type'].split(',')
assert n_pages == len(doc_type_list), \
"No. pages is different no. documents"
json_path = os.path.join(save_dir, req_id)
os.makedirs(json_path, exist_ok=True)
# For user_submitted_results
if "feedback_result" in request:
feedback_data = ast.literal_eval(request['feedback_result'])
fname = f"{req_id}.json"
write_to_json(os.path.join(json_path, fname), feedback_data)
# For predict_results
data = ast.literal_eval(request['predict_result'])['content']['document'][0]['content']
infer_time = float(request['ai_inference_time']) + float(request['preprocessing_time']) + 0.1
n_imei, n_invoice = 0, 0
for doc_type in doc_type_list:
output = {k: None for k in classes}
if not os.path.exists(json_path):
os.makedirs(json_path, exist_ok=True)
if doc_type == "imei":
for info in data:
if info['label'] == "imei_number":
output['imei_number'] = info['value'][n_imei]
output['processing_time'] = infer_time
fname = f"temp_{doc_type}_{req_id}_{n_imei}.json"
write_to_json(os.path.join(json_path, fname), output)
n_imei += 1
break
elif doc_type == "invoice":
for info in data:
if info['label'] == "imei_number":
continue
output[info['label']] = info['value']
output['processing_time'] = infer_time
fname = f"temp_{doc_type}_{req_id}_{n_invoice}.json"
write_to_json(os.path.join(json_path, fname), output)
n_invoice += 1
def gen_req_to_red_dict(csv_path: str):
df = pd.read_csv(csv_path)
df = df.loc[:, ["requestId", "redemptionNumber"]]
req_to_red = {row["requestId"]: row["redemptionNumber"] for _, row in df.iterrows()}
return req_to_red
def gen_req_to_red_dict_2(csv_path: str):
df = pd.read_csv(csv_path)
df = df.loc[:, ["request_id", "redemption_id"]]
req_to_red = {row["request_id"]: row["redemption_id"] for _, row in df.iterrows()}
return req_to_red
def init_csv(
gt_dir: str,
pred_dir: str,
req_to_red: dict,
):
list_request_id = os.listdir(gt_dir)
total = []
for request_id in list_request_id:
gt_path = os.path.join(gt_dir, request_id, request_id+".json")
if not os.path.exists(gt_path):
print(f"[WARNING] Skipped request id {os.path.basename(os.path.dirname(gt_path))}")
continue
gt_data = read_json(gt_path)
json_file_list = glob.glob(os.path.join(pred_dir, request_id, "temp_*.json"))
json_file_list = sorted(json_file_list, key=lambda x: int(x.split(".json")[0].split('_')[-1]))
n_imei, n_invoice = 0, 0
# if len(json_file_list) > 3:
# continue
for json_file in json_file_list:
pred_data = read_json(json_file)
if "imei" in json_file:
pred_value = pred_data['imei_number']
gt_value = gt_data['imei_number'][n_imei]
n_imei += 1
score = eval_ocr_metric(
[post_processing_str("imei_number", pred_value, is_gt=False)],
[post_processing_str("imei_number", gt_value, is_gt=True)],
metric=["one_minus_ned"]
)['1-N.E.D']
total.append({
"requestId": request_id,
"redemptionNumber": req_to_red[request_id],
"userSubmitResults": gt_value,
"OCRResults": pred_value,
"revisedResults_by_SDSRV": "",
"accuracy": score,
"processingTime (by request)": pred_data['processing_time'],
"class_name": "imei_number",
"file_path": json_file
})
elif "invoice" in json_file:
for class_name in ["retailername", "purchase_date"]:
pred_value = pred_data[class_name]
gt_value = gt_data[class_name]
if isinstance(gt_value, list):
gt_value = gt_value[0]
n_invoice += 1
if not isinstance(pred_value, list):
pred_value = [pred_value]
score = 0
for _pred_value in pred_value:
score1 = eval_ocr_metric(
[post_processing_str(class_name, _pred_value, is_gt=False)],
[post_processing_str(class_name, gt_value, is_gt=True)],
metric=["one_minus_ned"]
)['1-N.E.D']
score = max(score, score1)
total.append({
"requestId": request_id,
"redemptionNumber": req_to_red[request_id],
"userSubmitResults": gt_value,
"OCRResults": pred_value[0] if class_name == "retailername" else pred_value,
"revisedResults_by_SDSRV": "",
"accuracy": score,
"processingTime (by request)": pred_data['processing_time'],
"class_name": class_name,
"file_path": json_file
})
return total
def export_report(
init_csv: str,
):
df = pd.read_csv(init_csv)
for index, request in df.iterrows():
file_path = request['file_path']
class_name = request['class_name']
pred_value = request['OCRResults']
revised_value = read_json(file_path)[class_name]
if class_name == "purchase_date":
pred_value = ast.literal_eval(pred_value)
if isinstance(revised_value, list):
if len(revised_value) > 0:
revised_value = revised_value[0]
else:
revised_value = None
if len(pred_value) == 0:
pred_value = [None]
score = 0
for _pred_value in pred_value:
score1 = eval_ocr_metric(
[post_processing_str(class_name, _pred_value, is_gt=False)],
[post_processing_str(class_name, revised_value, is_gt=True)],
metric=["one_minus_ned"]
)['1-N.E.D']
score = max(score, score1)
else:
score = eval_ocr_metric(
[post_processing_str(class_name, pred_value, is_gt=False)],
[post_processing_str(class_name, revised_value, is_gt=True)],
metric=["one_minus_ned"]
)['1-N.E.D']
df.at[index, "revisedResults_by_SDSRV"] = revised_value
df.at[index, "accuracy"] = score
return df
def pick_sample_to_revise(
ocr_accuracy: list,
gt_dir: str,
save_dir: str
):
empty_err_path = os.path.join(save_dir, "empty_results")
other_err_path = os.path.join(save_dir, "diff_results")
os.makedirs(empty_err_path, exist_ok=True)
os.makedirs(other_err_path, exist_ok=True)
for request in ocr_accuracy:
score = request['accuracy']
json_path = request['file_path']
request_id = request['requestId']
img_path_folder = os.path.join(gt_dir, Path(json_path).parts[-2], Path(json_path).parts[-1])
img_path = [ff for ff in glob.glob(img_path_folder.replace(".json", ".*")) if ".json" not in ff]
if len(img_path) == 0:
print(f"[WARNING] Skipped request id {request_id}")
continue
img_path = img_path[0]
# img_path = [ff for ff in glob.glob(json_path.replace(".json", ".*"))][0]
if score == 0:
save_path = os.path.join(empty_err_path, request_id)
elif score < 1:
save_path = os.path.join(other_err_path, request_id)
else:
continue
os.makedirs(save_path, exist_ok=True)
shutil.copy(img_path, save_path)
shutil.copy(json_path, save_path)
def merge_revised_sample(
revised_path_list: list,
save_dir: str
):
if not isinstance(revised_path_list, list):
revised_path_list = [revised_path_list]
for revised_path in revised_path_list:
list_request = [os.path.basename(ff) for ff in os.listdir(revised_path)]
for request in list_request:
file_list = glob.glob(os.path.join(revised_path, request, "*.json*"))
for file_path in file_list:
# shutil.copyfile(file_path, os.path.join(save_path, request))
os.system(f"sudo cp {file_path} {os.path.join(save_dir, request)}")
def calculate_average_by_column(df, column_name):
df = df.groupby(by=["requestId"])
time_list = []
for req, sub_df in df:
if len(sub_df) > 0:
time_list.append(sub_df.iloc[0][column_name])
if len(time_list) > 0:
return sum(time_list)/len(time_list)
return 0
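# Illustrative use (a sketch): average per-request processing time from the init csv.
# df = pd.read_csv("init1.csv")
# avg_time = calculate_average_by_column(df, "processingTime (by request)")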
if __name__ == "__main__":
save_path = "/mnt/hdd4T/TannedCung/OCR/Data/SBT_for_acc/15Jan"
save_csv = "logs/eval_20240115"
csv_path = "/mnt/hdd4T/TannedCung/OCR/Data/SBT_for_acc/15Jan.csv"
csv_path_end_user = "logs/eval_20240115/OCR_15Jan2024.csv"
# Step 1: Convert a csv file to get user submitted results for each request
print("[INFO] Starting convert csv from customer to json")
os.system(f"sudo chmod -R 777 {save_path}")
convert_groundtruth_from_csv(csv_path=csv_path_end_user, save_dir=save_path)
print("[INFO] Converted")
# # Step 2: Convert a csv file to get predict OCR results for each image
print("[INFO] Starting convert csv from SDSV to json")
convert_predict_from_csv(csv_path=csv_path, save_dir=save_path)
print("[INFO] Converted")
# # Step 3: Generate the initial csv file and calculate OCR accuracy between submitted results and OCR results
print("[INFO] Starting generate csv to get performance")
gt_path = save_path
pred_path = save_path
req_to_red_dict = gen_req_to_red_dict(csv_path_end_user)
init_data = init_csv(gt_dir=gt_path, pred_dir=pred_path, req_to_red=req_to_red_dict)
pd.DataFrame(init_data).to_csv(os.path.join(save_csv, "init1.csv"), index=False)
print("[INFO] Done")
# # Step 4: Split requests whose accuracy is less than 1 to revise
# print("[INFO] Starting split data to review")
# revised_path = os.path.join(save_csv, "revised")
# # shutil.rmtree(revised_path)
# pick_sample_to_revise(ocr_accuracy=init_data, gt_dir=save_path, save_dir=revised_path)
# print("[INFO] Done")
# # Step 5: Merge revised results to gt folder
# print("[INFO] Merging revised data to ground truth folder")
# revised_path = os.path.join(save_csv, "revised")
# revised_path = [f'{revised_path}/empty_results', f'{revised_path}/diff_results']
# merge_revised_sample(revised_path_list=revised_path, save_dir=save_path)
# print("Done")
# # Step 6: Calculate OCR accuracy between OCR results and revised results
# print("[INFO] Exporting OCR report")
# init_csv_path = os.path.join(save_csv, "init1.csv")
# report = export_report(init_csv=init_csv_path)
# error_path = os.path.join(save_csv, "errors")
# pick_sample_to_revise(ocr_accuracy=report[report.accuracy < 0.75].to_dict('records'), gt_dir=save_path, save_dir=error_path)
# n_total_images = len(report)
# n_bad_images = len(report[report.accuracy < 0.75])
# average_acc = report[report.accuracy >= 0.75]['accuracy'].mean()
# print("Total requests:", len(report['requestId'].unique()))
# print("Total images:", n_total_images)
# print("No. imei images:", len(report[report.class_name == "imei_number"]))
# print("No. invoice images:", len(report[report.class_name == "retailername"]))
# print("No. bad quality images:", n_bad_images)
# print("No. valid images:", n_total_images - n_bad_images)
# print("No. per of bad quality images:", 100*n_bad_images/n_total_images)
# print("Average accuracy:", 100*average_acc)
# last_row = n_total_images
# report.at[last_row, "requestId"] = "Total requests:"
# report.at[last_row, "redemptionNumber"] = len(report['requestId'].unique())
# report.at[last_row+1, "requestId"] = "Total images:"
# report.at[last_row+1, "redemptionNumber"] = n_total_images
# report.at[last_row+2, "requestId"] = "No. imei images:"
# report.at[last_row+2, "redemptionNumber"] = len(report[report.class_name == "imei_number"])
# report.at[last_row+3, "requestId"] = "No. invoice images:"
# report.at[last_row+3, "redemptionNumber"] = len(report[report.class_name == "retailername"])
# report.at[last_row+4, "requestId"] = "No. bad quality images:"
# report.at[last_row+4, "redemptionNumber"] = n_bad_images
# report.at[last_row+5, "requestId"] = "No. valid images:"
# report.at[last_row+5, "redemptionNumber"] = n_total_images - n_bad_images
# report.at[last_row+6, "requestId"] = "No. per of bad quality images:"
# report.at[last_row+6, "redemptionNumber"] = 100*n_bad_images/n_total_images
# report.at[last_row+7, "requestId"] = "Average accuracy:"
# report.at[last_row+7, "redemptionNumber"] = 100*average_acc
# report.drop(columns=["file_path", "class_name"]).to_csv(os.path.join(save_csv, f"SBT_report_{time.strftime('%Y%m%d')}.csv"), index=False)
# print("[INFO] Done")

View File

@ -0,0 +1,201 @@
# https://stackoverflow.com/questions/774316/python-difflib-highlighting-differences-inline
import difflib
import unidecode
import os
import glob
import pandas as pd
VOWELS = 'aeouiy' + 'AEOUIY'
CONSONANTS = 'bcdfghjklmnpqrstvxwz' + 'BCDFGHJKLMNPQRSTVXWZ'
# PREDICT_PATH = 'ocr/result'
# GROUNDTRUTH_PATH = '/mnt/hdd2T/AICR/Datasets/wiki/ground_truth'
PREDICT_PATH = 'ocr/result/cinamon'
GROUNDTRUTH_PATH = '/mnt/hdd2T/AICR/Datasets/Backup/1.Hand_writing/Lines/cinnamon_data'
# note that we also use different preprocessing for the cinnamon data
# SAVE_PATH = 'wiki_diff'
SAVE_PATH = 'wiki_diff/cinamon'
RES_PATH = f'{SAVE_PATH}/result/'
WRONG_ACCENT_FILE = f'{SAVE_PATH}/wrong_accent.txt'
LOST_ACCENT_FILE = f'{SAVE_PATH}/lost_accent.txt'
TOTAL_WORD = 0
def write_accent_error(path, err):
# path should be wrong_accent_file or lost_accent_file
with open(path, 'a') as f:
f.write(err)
f.write('\n')
def update_ddata_specialchars(ddata_specialchars, correction_key, char_key):
if char_key in ddata_specialchars[correction_key]:
ddata_specialchars[correction_key][char_key] += 1
else:
ddata_specialchars[correction_key][char_key] = 1
def process_replace_tag(matcher, i1, i2, j1, j2, ddata, ddata_specialchars):
a_char = matcher.a[i1:i2]
b_char = matcher.b[j1:j2]
ddata['res_text'] += ' ### {' + a_char + ' -> ' + b_char + '} ### '
ddata['nwrongs'] += 1*len(b_char)
if len(a_char) == 1 and len(b_char) == 1: # single char case
if a_char.lower() == b_char.lower(): # wrong upper/lower case
ddata['UL_single'] += 1
update_ddata_specialchars(ddata_specialchars, 'UL', (a_char, b_char))
else:
ddata['nwrongs_single'] += 1
a_ori = unidecode.unidecode(a_char).lower()
b_ori = unidecode.unidecode(b_char).lower()
if a_ori in VOWELS and b_ori in VOWELS:
if a_ori == b_ori:
err = a_char + ' -> ' + b_char
if b_ori == b_char.lower(): # e.g. Ơ -> O
ddata['nlost_accent'] += 1
# write_accent_error(LOST_ACCENT_FILE, err)
else: # e.g Ơ -> Ớ
ddata['nwrong_accent'] += 1
# write_accent_error(WRONG_ACCENT_FILE, err)
else: # e.g Ă -> Â
ddata['nwrong_vowels'] += 1
else:
if a_ori in CONSONANTS and b_ori in CONSONANTS:
ddata['nwrong_consonants'] += 1
else:
ddata['nwrong_specialchars'] += 1
update_ddata_specialchars(ddata_specialchars, 'wrong', (a_char, b_char))
else:
if a_char.lower() == b_char.lower():
ddata['UL_multiple'] += 1
update_ddata_specialchars(ddata_specialchars, 'UL', (a_char, b_char))
else:
ddata['nwrongs_multiple'] += 1
if len(a_char) > 10 or len(b_char) > 10:
ddata['nlong_sequences'] += 1
# print(a_char)
def process_delete_tag(matcher, i1, i2, ddata, ddata_specialchars):
a_char = matcher.a[i1:i2]
ddata['res_text'] += ' ### {- ' + a_char + '} ### '
ddata['nadds'] += 1*len(a_char)
if len(a_char) == 1:
ddata['nadds_single'] += 1
if a_char.lower() in CONSONANTS + VOWELS:
ddata['nadds_chars'] += 1
else:
if a_char == ' ':
ddata['nadds_space'] += 1
else:
ddata['nadds_specialchars'] += 1
update_ddata_specialchars(ddata_specialchars, 'add', a_char)
else:
ddata['nadds_multiple'] += 1
if len(a_char) > 10:
ddata['nlong_sequences'] += 1
# print(a_char)
def process_insert_tag(matcher, j1, j2, ddata, ddata_specialchars):
b_char = matcher.b[j1:j2]
ddata['nlosts'] += 1*len(b_char)
ddata['res_text'] += ' ### {+ ' + b_char + '} ### '
if len(b_char) == 1:
ddata['nlosts_single'] += 1
if b_char.lower() in CONSONANTS + VOWELS:
ddata['nlosts_chars'] += 1
else:
if b_char == ' ':
ddata['nlosts_space'] += 1
else:
ddata['nlosts_specialchars'] += 1
update_ddata_specialchars(ddata_specialchars, 'lost', b_char)
else:
ddata['nlosts_multiple'] += 1
if len(b_char) > 10:
ddata['nlong_sequences'] += 1
# print(b_char)
def inline_diff(a, b, ddata_specialchars=None):
# avoid sharing a mutable default dict across calls
if ddata_specialchars is None:
ddata_specialchars = {'lost': {}, 'add': {}, 'wrong': {}, 'UL': {}}
matcher = difflib.SequenceMatcher(None, a, b)
ddata = {'res_text': ''}
# ddata = ddata | {key: 0 for key in ['nsingle', 'nmultiple']}
ddata = ddata | {key: 0 for key in ['UL_single', 'UL_multiple']}
ddata = ddata | {
key: 0 for key in
['nlosts', 'nlosts_single', 'nlosts_multiple', 'nlosts_chars', 'nlosts_specialchars', 'nlosts_space']}
ddata = ddata | {
key: 0 for key in
['nadds', 'nadds_single', 'nadds_multiple', 'nadds_chars', 'nadds_specialchars', 'nadds_space']}
ddata = ddata | {
key: 0 for key in
['nwrongs', 'nwrongs_single', 'nwrongs_multiple', 'nwrong_accent', 'nlost_accent', 'nwrong_vowels',
'nwrong_consonants', 'nwrong_specialchars']}
ddata['nlong_sequences'] = 0
for tag, i1, i2, j1, j2 in matcher.get_opcodes():
if tag == 'replace': # wrong
process_replace_tag(matcher, i1, i2, j1, j2, ddata, ddata_specialchars)
if tag == 'delete': # OCR add char so the matcher "delete"
process_delete_tag(matcher, i1, i2, ddata, ddata_specialchars)
if tag == 'equal':
ddata['res_text'] += matcher.a[i1:i2]
if tag == 'insert': # OCR lost char so the matcher "insert"
process_insert_tag(matcher, j1, j2, ddata, ddata_specialchars)
ddata["ned"] = ddata['nwrongs'] + ddata['nadds'] + ddata['nlosts']
return ddata
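# Example (illustrative): inline_diff("hellp", "hello")["res_text"]
# -> 'hell ### {p -> o} ### ' (one replace opcode, counted in nwrongs/ned)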
def process_single_file(file_name, ddata_specialchars):
# read predict file
with open(os.path.join(PREDICT_PATH, file_name), 'r') as f:
predict = f.readlines()[0].strip()
# predict = ''.join(predict)
# predict = predict.replace(' ', '')
# predict = predict.replace('\n', '')
# print(predict)
# read groundtruth file
with open(os.path.join(GROUNDTRUTH_PATH, file_name), 'r') as f:
gt = f.readlines()[0].strip()
# gt = ''.join(gt)
# gt = gt.replace('\n', '')
# get statistical data of differences between prediction and ground truth
ddata = inline_diff(predict, gt, ddata_specialchars)
global TOTAL_WORD
TOTAL_WORD = TOTAL_WORD + len(gt.split())
# write to save_path
res_text = ddata.pop('res_text', None)
save_file = os.path.join(RES_PATH, file_name)
with open(save_file, 'w') as f:
f.write(res_text)
# generate csv file
ddata = {'file_name': save_file} | ddata
return ddata
def main(overwrite=False):
for accent_file in [WRONG_ACCENT_FILE, LOST_ACCENT_FILE]:
if os.path.exists(accent_file):
os.remove(accent_file)
lddata = []
ddata_specialchars = {'lost': {}, 'add': {}, 'wrong': {}, 'UL': {}}
for file_ in glob.glob(f'{PREDICT_PATH}/*.txt'):
file_name = file_.split('/')[-1]
ddata = process_single_file(file_name, ddata_specialchars)
lddata.append(ddata)
if overwrite:
df = pd.DataFrame(lddata)
df.to_csv(f'{SAVE_PATH}/wiki_diff.csv', sep='\t')
df_ = pd.DataFrame(ddata_specialchars)
df_.to_csv(f'{SAVE_PATH}/wiki_diff_specialchars.csv', sep='\t')
print(TOTAL_WORD)
if __name__ == '__main__':
main(overwrite=True)

View File

@ -104,7 +104,9 @@ def validate_ocr_request_and_get(request, subscription):
FileUtils.validate_list_file(list_file)
validated_data['file'] = list_file[0]
validated_data['is_test_request'] = request.data.get('is_test_request', False)
# validated_data['is_test_request'] = bool(request.data.get('is_test_request', False))
validated_data['is_test_request'] = string_to_boolean(request.data.get('is_test_request', "false"))
# print(f"[DEBUG]: is_test_request: ", validated_data['is_test_request'])
return validated_data
@ -139,9 +141,22 @@ def sbt_validate_ocr_request_and_get(request, subscription):
validated_data['imei_file'] = imei_files
validated_data['invoice_file'] = invoice_file
validated_data['redemption_ID'] = redemption_ID
validated_data['is_test_request'] = string_to_boolean(request.data.get('is_test_request', "false"))
# print(f"[DEBUG]: is_test_request: ", validated_data['is_test_request'])
return validated_data
def string_to_boolean(value):
true_strings = ['true', 'yes', '1', 'on']
false_strings = ['false', 'no', '0', 'off']
if isinstance(value, str):
lower_value = value.lower()
if lower_value in true_strings:
return True
else:
return False
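# Examples (illustrative): string_to_boolean("TRUE") -> True, string_to_boolean("0") -> False;
# non-string inputs fall through and return None.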
def sbt_validate_feedback(request):
validated_data = {}
@ -306,7 +321,6 @@ def token_value(token_type):
return 5
return 1 # Basic OCR
def send_to_queue2(rq_id, sub_id, file_url, user_id, typez, metadata={}):
try:
if typez == ProcessType.ID_CARD.value:
@ -324,7 +338,6 @@ def send_to_queue2(rq_id, sub_id, file_url, user_id, typez, metadata={}):
print(e)
raise BadGatewayException()
def build_template_matching_data(template):
temp_dict = {
@ -362,8 +375,10 @@ def send_template_queue(rq_id, file_url, template: OcrTemplate, uid):
print(e)
raise BadGatewayException()
def process_feedback(feedback_id, local_file_path):
c_connector.csv_feedback((local_file_path, feedback_id))
def process_pdf_file(file_name: str, file_obj: TemporaryUploadedFile, request: SubscriptionRequest, user) -> list:
def process_pdf_file(file_name: str, file_obj: TemporaryUploadedFile, request: SubscriptionRequest, user, doc_type: str, index_in_request: int) -> list:
doc: fitz.Document = fitz.open(stream=file_obj.file.read())
if doc.page_count > settings.MAX_PAGES_OF_PDF_FILE:
raise LimitReachedException(excArgs=('Number of pages', str(settings.MAX_PAGES_OF_PDF_FILE), 'pages'))
@ -372,16 +387,18 @@ def process_pdf_file(file_name: str, file_obj: TemporaryUploadedFile, request: S
# Origin file
file_obj.seek(0)
file_path = FileUtils.resize_and_save_file(file_name, request, file_obj, 100)
code = f'FIL{uuid.uuid4().hex}'
new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=file_path,
request=request,
file_name=file_name,
code=f'FIL{uuid.uuid4().hex}')
code=code,
doc_type=doc_type,
index_in_request=index_in_request)
new_request_file.save()
# Sub-file
return pdf_to_images_urls(doc, request, user)
def process_image_file(file_name: str, file_obj: TemporaryUploadedFile, request: SubscriptionRequest, user) -> list:
def process_image_file(file_name: str, file_obj: TemporaryUploadedFile, request: SubscriptionRequest, user, doc_type: str, index_in_request: int) -> list:
if file_obj.size > settings.SIZE_TO_COMPRESS:
quality = 95
else:
@ -390,7 +407,9 @@ def process_image_file(file_name: str, file_obj: TemporaryUploadedFile, request:
new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=file_path,
request=request,
file_name=file_name,
code=f'FIL{uuid.uuid4().hex}')
code=f'FIL{uuid.uuid4().hex}',
doc_type=doc_type,
index_in_request=index_in_request)
new_request_file.save()
return [{
'file_url': FileUtils.build_url(FolderFileType.REQUESTS.value, request.request_id, user.id, file_name),
@ -398,11 +417,13 @@ def process_image_file(file_name: str, file_obj: TemporaryUploadedFile, request:
'request_file_id': new_request_file.code
}]
def process_image_local_file(file_name: str, file_path: str, request: SubscriptionRequest, user) -> list:
def process_image_local_file(file_name: str, file_path: str, request: SubscriptionRequest, user, doc_type: str, index_in_request: int) -> list:
new_request_file: SubscriptionRequestFile = SubscriptionRequestFile(file_path=file_path,
request=request,
file_name=file_name,
code=f'FIL{uuid.uuid4().hex}')
code=f'FIL{uuid.uuid4().hex}',
doc_type=doc_type,
index_in_request=index_in_request)
new_request_file.save()
return [{
'file_url': FileUtils.build_url(FolderFileType.REQUESTS.value, request.request_id, user.id, file_name),

@ -0,0 +1 @@
Subproject commit b6d4fab46f7f8689dd6b050cfbff2faa6a6f3fec

BIN
cope2n-api/report.xlsx Normal file

Binary file not shown.

Binary file not shown.

View File

@ -36,7 +36,7 @@ requests==2.28.1
ruamel.yaml==0.17.21
ruamel.yaml.clib==0.2.7
sqlparse==0.4.3
tzdata==2022.6
tzdata==2022.7
uritemplate==4.1.1
urllib3==1.26.13
uvicorn==0.20.0
@ -50,4 +50,13 @@ boto3==1.29.7
imagesize==1.4.1
pdf2image==1.16.3
redis==5.0.1
django-celery-beat==2.5.0
django-celery-beat==2.5.0
terminaltables==3.1.10
rapidfuzz==3.6.1
Unidecode==1.3.8
pandas==2.2.0
openpyxl==3.1.2
# For sdsvkvu compatibility
# torch==1.13.1+cu116
# torchvision==0.14.1+cu116
# --extra-index-url https://download.pytorch.org/whl/cu116

View File

@ -0,0 +1 @@
pg_dump -U sbt -h sbt.cxetpslawu4p.ap-southeast-1.rds.amazonaws.com sbt2 >> sbt2.sql

View File

@ -1,4 +1,5 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

View File

@ -2,26 +2,33 @@
set -e
tag=$1
# is_prod=${$2:-False}
echo "[INFO] Tag received from Python: $tag"
echo "[INFO] Updating everything the remote..."
git submodule update --recursive --remote
# echo "[INFO] Updating everything the remote..."
# git submodule update --recursive --remote
echo "[INFO] Pushing AI image with tag: $tag..."
docker compose -f docker-compose-dev.yml build cope2n-fi-sbt
docker tag sidp/cope2n-ai-fi-sbt:latest public.ecr.aws/v4n9y6r8/sidp/cope2n-ai-fi-sbt:${tag}
docker push public.ecr.aws/v4n9y6r8/sidp/cope2n-ai-fi-sbt:${tag}
docker tag sidp/cope2n-ai-fi-sbt:latest public.ecr.aws/sdsrv/sidp/cope2n-ai-fi-sbt:${tag}
docker push public.ecr.aws/sdsrv/sidp/cope2n-ai-fi-sbt:${tag}
# docker tag sidp/cope2n-ai-fi-sbt:latest public.ecr.aws/sdsrv/sidp/cope2n-ai-fi-sbt:production
# docker push public.ecr.aws/sdsrv/sidp/cope2n-ai-fi-sbt:production
echo "[INFO] Pushing BE image with tag: $tag..."
docker compose -f docker-compose-dev.yml build be-ctel-sbt
docker tag sidp/cope2n-be-fi-sbt:latest public.ecr.aws/v4n9y6r8/sidp/cope2n-be-fi-sbt:${tag}
docker push public.ecr.aws/v4n9y6r8/sidp/cope2n-be-fi-sbt:${tag}
docker tag sidp/cope2n-be-fi-sbt:latest public.ecr.aws/sdsrv/sidp/cope2n-be-fi-sbt:${tag}
# docker tag sidp/cope2n-be-fi-sbt:latest public.ecr.aws/sdsrv/sidp/cope2n-be-fi-sbt:production
docker push public.ecr.aws/sdsrv/sidp/cope2n-be-fi-sbt:${tag}
# docker push public.ecr.aws/sdsrv/sidp/cope2n-be-fi-sbt:production
echo "[INFO] Pushing FE image with tag: $tag..."
docker compose -f docker-compose-dev.yml build fe-sbt
docker tag sidp/cope2n-fe-fi-sbt:latest public.ecr.aws/v4n9y6r8/sidp/cope2n-fe-fi-sbt:${tag}
docker push public.ecr.aws/v4n9y6r8/sidp/cope2n-fe-fi-sbt:${tag}
docker tag sidp/cope2n-fe-fi-sbt:latest public.ecr.aws/sdsrv/sidp/cope2n-fe-fi-sbt:${tag}
# docker tag sidp/cope2n-fe-fi-sbt:latest public.ecr.aws/sdsrv/sidp/cope2n-fe-fi-sbt:production
docker push public.ecr.aws/sdsrv/sidp/cope2n-fe-fi-sbt:${tag}
# docker push public.ecr.aws/sdsrv/sidp/cope2n-fe-fi-sbt:production
cp ./docker-compose-prod.yml ./docker-compose_${tag}.yml
sed -i "s/{{tag}}/$tag/g" ./docker-compose_${tag}.yml

View File

@ -63,8 +63,10 @@ services:
- AUTH_TOKEN_LIFE_TIME=${AUTH_TOKEN_LIFE_TIME}
- IMAGE_TOKEN_LIFE_TIME=${IMAGE_TOKEN_LIFE_TIME}
- INTERNAL_SDS_KEY=${INTERNAL_SDS_KEY}
- FI_USER_NAME=${FI_USER_NAME}
- FI_PASSWORD=${FI_PASSWORD}
- ADMIN_USER_NAME=${ADMIN_USER_NAME}
- ADMIN_PASSWORD=${ADMIN_PASSWORD}
- STANDARD_USER_NAME=${STANDARD_USER_NAME}
- STANDARD_PASSWORD=${STANDARD_PASSWORD}
- S3_ENDPOINT=${S3_ENDPOINT}
- S3_ACCESS_KEY=${S3_ACCESS_KEY}
- S3_SECRET_KEY=${S3_SECRET_KEY}
@ -75,19 +77,19 @@ services:
networks:
- ctel-sbt
volumes:
# - ${HOST_MEDIA_FOLDER}:${MEDIA_ROOT}
# - BE_media:${MEDIA_ROOT}
- BE_static:/app/static
- ./cope2n-api:/app
working_dir: /app
depends_on:
db-sbt:
condition: service_started
command: sh -c "chmod -R 777 /app/static; sleep 5; python manage.py collectstatic --no-input &&
python manage.py makemigrations &&
python manage.py migrate &&
python manage.py compilemessages &&
gunicorn fwd.asgi:application -k uvicorn.workers.UvicornWorker --timeout 300 -b 0.0.0.0:9000" # pre-makemigrations on prod
# command: bash -c "tail -f > /dev/null"
# command: sh -c "chmod -R 777 /app; sleep 5; python manage.py collectstatic --no-input &&
# python manage.py makemigrations &&
# python manage.py migrate &&
# python manage.py compilemessages &&
# gunicorn fwd.asgi:application -k uvicorn.workers.UvicornWorker --timeout 300 -b 0.0.0.0:9000" # pre-makemigrations on prod
command: bash -c "tail -f > /dev/null"
minio:
image: minio/minio
@ -98,6 +100,9 @@ services:
- MINIO_SECRET_KEY=${S3_SECRET_KEY}
volumes:
- ./data/minio_data:/data
ports:
- 9884:9884
- 9885:9885
networks:
- ctel-sbt
restart: always
@ -165,7 +170,7 @@ services:
rabbitmq-sbt:
condition: service_started
volumes:
# - ${HOST_MEDIA_FOLDER}:${MEDIA_ROOT}
# - BE_media:${MEDIA_ROOT}
- ./cope2n-api:/app
working_dir: /app
@ -184,6 +189,8 @@ services:
- POSTGRES_USER=${DB_USER}
- POSTGRES_PASSWORD=${DB_PASSWORD}
- POSTGRES_DB=${DB_SCHEMA}
ports:
- 5432:5432
rabbitmq-sbt:
mem_reservation: 600m
@ -223,4 +230,5 @@ services:
volumes:
db_data:
BE_static:
BE_static:
BE_media:

View File

@ -44,8 +44,10 @@ services:
- AUTH_TOKEN_LIFE_TIME=${AUTH_TOKEN_LIFE_TIME}
- IMAGE_TOKEN_LIFE_TIME=${IMAGE_TOKEN_LIFE_TIME}
- INTERNAL_SDS_KEY=${INTERNAL_SDS_KEY}
- FI_USER_NAME=${FI_USER_NAME}
- FI_PASSWORD=${FI_PASSWORD}
- ADMIN_USER_NAME=${ADMIN_USER_NAME}
- ADMIN_PASSWORD=${ADMIN_PASSWORD}
- STANDARD_USER_NAME=${STANDARD_USER_NAME}
- STANDARD_PASSWORD=${STANDARD_PASSWORD}
- S3_ENDPOINT=${S3_ENDPOINT}
- S3_ACCESS_KEY=${S3_ACCESS_KEY}
- S3_SECRET_KEY=${S3_SECRET_KEY}
@ -137,7 +139,6 @@ services:
condition: service_started
volumes:
- BE_media:${MEDIA_ROOT}
working_dir: /app
command: sh -c "celery -A fwd_api.celery_worker.worker worker -l INFO -c 5"

152
scripts/crawl_database.py Normal file
View File

@ -0,0 +1,152 @@
import csv
import psycopg2
import boto3
import os
from tqdm import tqdm
from dotenv import load_dotenv
load_dotenv("../.env_prod")
OUTPUT_NAME = "5Jan"
# Database connection details
db_host = os.environ.get('DB_HOST', "")
db_name = os.environ.get('DB_SCHEMA', "")
db_user = os.environ.get('DB_USER', "")
db_password = os.environ.get('DB_PASSWORD', "")
# S3 bucket details
s3_bucket_name = os.environ.get('S3_BUCKET_NAME', "")
s3_folder_prefix = 'sbt_invoice'
# S3 access credentials
access_key = os.environ.get('S3_ACCESS_KEY', "")
secret_key = os.environ.get('S3_SECRET_KEY', "")
# Request IDs for filtering
request_ids = [
'SAP_20240104082259_85c7f4dd262946d183dbec826fc6709e',
'SAP_20240104082709_c05319c56fd3422dbf133aee33fc3e10',
'SAP_20240104091512_23ae1a81f1314be0a27ebeae0e8fa0d7',
'SAP_20240104091512_23ae1a81f1314be0a27ebeae0e8fa0d7',
'SAP_20240104091816_025c90b9789246ed811772003622fa0d',
'SAP_20240104092541_5c71e535f07c4cc8803b45336ec70f77',
'SAP_20240104100259_5a667d33cb914e7ba5a4447b9e17d649',
'SAP_20240104101145_a7010bac159f47bc95d5866e6c5f5bdf',
'SAP_20240104105702_95252229252b4e238add117919ce882a',
'SAP_20240104112108_34b2cca84a42473ca77bc316e787fe2e',
'SAP_20240104114038_dd57ecf7982c4a5eaf1409f5ef050fab',
'SAP_20240104115942_1b77f411791940a4a85c838c2e9931ad',
'SAP_20240104120746_d63319f4cde343d894f9b89706756a9d',
'SAP_20240104123607_48d25c04fec6411dbf013c6a19054e77',
'SAP_20240104130957_ece21bad331b4f2cad0887693331aa3a',
'SAP_20240104131228_edebee4000ae4bd382feaea5d6c82031',
'SAP_20240104132641_97909efd013f45e89d83d36a5ea35c52',
'SAP_20240104133527_ad55f6ee667643ba8ae65e9ef1c32418',
'SAP_20240104134014_2d2cdbc1b06a44868ce1b32cdb53864f',
'SAP_20240104134425_9b37555ef8094153838e6048f7c63c9b',
'SAP_20240104134457_55a1cf1e371146d995c8849cc0ba7c7b',
'SAP_20240104134609_3f7d308e467d43dbb59a7bcc02e3a7d2',
'SAP_20240104134709_c708daf83f7e4aa69ab9696afe1a9081',
'SAP_20240104135007_44b7a30c5e9c41a0b8065ac4e7000223',
'SAP_20240104141547_7203ddb915274e99a08ae6e54ec49cbd',
'SAP_20240104141559_62fd19a6179248ecb4ff15b33338b294',
'SAP_20240104142352_68699cbe140f4264b858981a3ac67e40',
'SAP_20240104143937_801931cc1f344a4ca8384dfe13d1accc',
'SAP_20240104144730_3180a8919e604e26a188ce051465c392',
'SAP_20240104144933_3380f64019634769befed49e9a671bc6',
'SAP_20240104151239_76ae2f1d02444f7fabbc104eb77fe45f',
'SAP_20240104151243_61775c88685d434d98bb9fc7a9889b8e',
'SAP_20240104151243_61775c88685d434d98bb9fc7a9889b8e',
'SAP_20240104151243_61775c88685d434d98bb9fc7a9889b8e',
'SAP_20240104151638_a08a61448a58459a8f2209f64e54c213',
'SAP_20240104152030_479259e84c5b449499df2cb1023e91ac',
'SAP_20240104160108_a03634c80583454494b77efcdecbcc71',
'SAP_20240104160108_a03634c80583454494b77efcdecbcc71',
'SAP_20240104160311_e7cb02a11bbd4ea1906b3758e97f33ab',
'SAP_20240104161305_89c5518563224ab89345439dffd504a5',
'SAP_20240104161305_89c5518563224ab89345439dffd504a5',
'SAP_20240104164022_0b94af24db9d4ebe9af2086a4bd3cd7e',
'SAP_20240104170837_58165ec9f88d4e4aa3095ba3dda201d7',
'SAP_20240104171740_10279cfebbf344f184bbb429cb9a15ad',
'SAP_20240104175202_247892a4dc7f40f28eafac9c2ad85971',
'SAP_20240104180517_8ce7a1981dc743e08e09284fd904d536',
'SAP_20240104182034_406bac0ab0684727b9efb1bb9b422026',
'SAP_20240104182426_92a48bb4b85a4c3abb48e0d7cf727777',
'SAP_20240104183506_aa1fa7d6774a4509a142a6f4a7b5af29',
'SAP_20240104185716_f9d464e42c314370910913b37133e6c3',
'SAP_20240104190220_573244d03bb8408dbca422ff60eb527a',
'SAP_20240104191236_deedcc588b7b4928a950f7dc2ce4230c',
'SAP_20240104191236_deedcc588b7b4928a950f7dc2ce4230c',
'SAP_20240104192614_990bf10c38e144a7bf489548d356720e',
'SAP_20240104192614_990bf10c38e144a7bf489548d356720e',
'SAP_20240104212143_f8c1b4a6e6e443fcb5e882c7a5b917f3',
'SAP_20240104212924_ee1998a60d6848af9576292ac383037f',
'SAP_20240104214418_f8e1abf808c8499097ecddf014d401c7',
'SAP_20240104214619_8d27c05a9ce74b738b20195cb816bfbf',
'SAP_20240104215037_477863cdc0aa4d5fa1f05bbb0ae673ed',
'SAP_20240104221543_37605982df624324ad2594e268054361',
'SAP_20240104225026_acacd06ea6de4a738bc47683dc53f378',
'SAP_20240104235743_b48aa3e744ed428795171d84066adefe',
]
# Connect to the PostgreSQL database
conn = psycopg2.connect(
host=db_host,
database=db_name,
user=db_user,
password=db_password
)
# Create a cursor
cursor = conn.cursor()
# Generate the placeholder string for the IN statement
placeholders = ','.join(['%s'] * len(request_ids))
# Execute the SELECT query with the filter
query = f"SELECT * FROM fwd_api_subscriptionrequest WHERE request_id IN ({placeholders})"
cursor.execute(query, request_ids)
# Fetch the filtered data
data = cursor.fetchall()
# Define the CSV file path
csv_file_path = f'{OUTPUT_NAME}.csv'
# Write the data to the CSV file
with open(csv_file_path, 'w', newline='') as csv_file:
writer = csv.writer(csv_file)
writer.writerow([desc[0] for desc in cursor.description]) # Write column headers
writer.writerows(data) # Write the filtered data rows
# Close the cursor and database connection
cursor.close()
conn.close()
# Download folders from S3
s3_client = boto3.client(
's3',
aws_access_key_id=access_key,
aws_secret_access_key=secret_key
)
for request_id in tqdm(request_ids):
folder_key = f"{s3_folder_prefix}/{request_id}/" # Assuming folder structure like: s3_bucket_name/s3_folder_prefix/request_id/
local_folder_path = f"{OUTPUT_NAME}/{request_id}/" # Path to the local folder to save the downloaded files
os.makedirs(OUTPUT_NAME, exist_ok=True)
os.makedirs(local_folder_path, exist_ok=True)
# List objects in the S3 folder
response = s3_client.list_objects_v2(Bucket=s3_bucket_name, Prefix=folder_key)
objects = response.get('Contents', [])
for s3_object in objects:
object_key = s3_object['Key']
local_file_path = local_folder_path + object_key.split('/')[-1] # Extracting the file name from the object key
# Download the S3 object to the local file
s3_client.download_file(s3_bucket_name, object_key, local_file_path)

View File

@ -0,0 +1,93 @@
import csv
import psycopg2
import boto3
import os
from tqdm import tqdm
from datetime import datetime, timedelta
from pytz import timezone
from dotenv import load_dotenv
load_dotenv("../.env_prod")
OUTPUT_NAME = "Jan"
START_DATE = datetime(2024, 1, 1, tzinfo=timezone('Asia/Ho_Chi_Minh'))
END_DATE = datetime(2024, 2, 1, tzinfo=timezone('Asia/Ho_Chi_Minh'))
# Database connection details
db_host = os.environ.get('DB_HOST', "")
db_name = os.environ.get('DB_SCHEMA', "")
db_user = os.environ.get('DB_USER', "")
db_password = os.environ.get('DB_PASSWORD', "")
# S3 bucket details
s3_bucket_name = os.environ.get('S3_BUCKET_NAME', "")
s3_folder_prefix = 'sbt_invoice'
# S3 access credentials
access_key = os.environ.get('S3_ACCESS_KEY', "")
secret_key = os.environ.get('S3_SECRET_KEY', "")
# Request IDs for filtering
# Connect to the PostgreSQL database
conn = psycopg2.connect(
host=db_host,
database=db_name,
user=db_user,
password=db_password
)
# Create a cursor
cursor = conn.cursor()
# Execute the SELECT query with the filter
query = "SELECT * FROM fwd_api_subscriptionrequest WHERE created_at >= %s AND created_at <= %s"
cursor.execute(query, (START_DATE, END_DATE))
# Fetch the filtered data
data = cursor.fetchall()
# Define the CSV file path
csv_file_path = f'{OUTPUT_NAME}.csv'
# Write the data to the CSV file
with open(csv_file_path, 'w', newline='') as csv_file:
writer = csv.writer(csv_file)
writer.writerow([desc[0] for desc in cursor.description]) # Write column headers
writer.writerows(data) # Write the filtered data rows
# Close the cursor and database connection
cursor.close()
conn.close()
# # Download folders from S3
# s3_client = boto3.client(
# 's3',
# aws_access_key_id=access_key,
# aws_secret_access_key=secret_key
# )
# request_ids = []
# for rq in data:
# rq_id = rq[3]
# request_ids.append(rq_id)
# for request_id in tqdm(request_ids):
# folder_key = f"{s3_folder_prefix}/{request_id}/" # Assuming folder structure like: s3_bucket_name/s3_folder_prefix/request_id/
# local_folder_path = f"{OUTPUT_NAME}/{request_id}/" # Path to the local folder to save the downloaded files
# os.makedirs(OUTPUT_NAME, exist_ok=True)
# os.makedirs(local_folder_path, exist_ok=True)
# # List objects in the S3 folder
# response = s3_client.list_objects_v2(Bucket=s3_bucket_name, Prefix=folder_key)
# objects = response.get('Contents', [])
# for s3_object in objects:
# object_key = s3_object['Key']
# local_file_path = local_folder_path + object_key.split('/')[-1] # Extracting the file name from the object key
# # Download the S3 object to the local file
# s3_client.download_file(s3_bucket_name, object_key, local_file_path)

View File

@ -0,0 +1 @@
pg_dump -U sbt -h sbt.cxetpslawu4p.ap-southeast-1.rds.amazonaws.com sbt2 >> sbt2.sql

View File

@ -34,7 +34,7 @@ def make_sbt_request(host, token, invoice_files=None, imei_files=None, ensure_su
files.append(('imei_files', (file, open(file, "rb"), 'application/octet-stream')))
num_files = len(files)
files.append(('processType', '12'))
files.append(('is_test_request', 'true'))
files.append(('is_test_request', 'True'))
start_time = time.time()
end_of_upload_time = 0