Merge branch 'main' of https://code.sdsdev.co.kr/SDSRV-IDP/sbt-idp into vietanh99-update-xlsx

daovietanh99 2024-02-19 09:22:49 +07:00
commit 6cf90f1d19
11 changed files with 163 additions and 98 deletions

.gitignore vendored

@@ -38,3 +38,4 @@ cope2n-ai-fi/Dockerfile_old_work
/feedback/
cope2n-api/public/SBT_report_20240122.csv
Jan.csv
*.csv

@@ -1 +1 @@
Subproject commit 6907ea0183b141e3b4f3c21758c9123f1e9b2a27
Subproject commit b6d4fab46f7f8689dd6b050cfbff2faa6a6f3fec


@@ -433,10 +433,10 @@ class AccuracyViewSet(viewsets.ViewSet):
@action(detail=False, url_path="overview", methods=["GET"])
def overview(self, request):
if request.method == 'GET':
subsidiary = request.GET.get('subsidiary', "ALL")
_subsidiary = request.GET.get('subsidiary', "ALL")
duration = request.GET.get('duration', "")
subsidiary = map_subsidiary_long_to_short(subsidiary)
subsidiary = map_subsidiary_long_to_short(_subsidiary)
# Retrieve data from Redis
key = f"{subsidiary}_{duration}"


@@ -0,0 +1,18 @@
# Generated by Django 4.1.3 on 2024-02-15 09:12
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('fwd_api', '0180_alter_reportfile_time_cost'),
]
operations = [
migrations.AddField(
model_name='reportfile',
name='subsidiary',
field=models.CharField(default='', max_length=200, null=True),
),
]


@@ -9,6 +9,7 @@ class ReportFile(models.Model):
id = models.AutoField(primary_key=True)
correspond_request_id = models.CharField(max_length=200, default="")
correspond_redemption_id = models.CharField(max_length=200, default="")
subsidiary = models.CharField(default="", null=True, max_length=200)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
updated_at = models.DateTimeField(auto_now=True)
report = models.ForeignKey(Report, related_name="files", on_delete=models.CASCADE)
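Because the new column defaults to an empty string, rows created before the migration above carry no subsidiary. A hedged one-off backfill sketch for a Django shell session, reusing the two-letter redemption-prefix convention from calculate_a_request below (the absolute module paths are assumptions):

from fwd_api.models import ReportFile                               # path as imported elsewhere in this commit
from fwd_api.utils.subsidiary import map_subsidiary_short_to_long  # assumed absolute form of ..utils.subsidiary

for rf in ReportFile.objects.filter(subsidiary=""):
    prefix = (rf.correspond_redemption_id or "")[:2]  # two-letter prefix, per calculate_a_request
    rf.subsidiary = map_subsidiary_short_to_long(prefix) if prefix else "NA"
    rf.save(update_fields=["subsidiary"])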


@@ -9,6 +9,7 @@ import uuid
from fwd_api.models import SubscriptionRequest, SubscriptionRequestFile, ReportFile
from ..celery_worker.client_connector import c_connector
from ..utils.file import dict2xlsx, save_workbook_file, save_report_to_S3
from ..utils.subsidiary import map_subsidiary_short_to_long
from django.db.models import Q
from django.utils import timezone
import redis
@@ -29,6 +30,8 @@ class ReportAccumulateByRequest:
self.total_format = {
'subs': "+",
'extraction_date': "Subtotal ()",
'num_imei': 0,
'num_invoice': 0,
'total_images': 0,
'images_quality': {
'successful': 0,
@@ -49,6 +52,7 @@
'usage': {
'imei':0,
'invoice': 0,
'total_images': 0,
'request': 0
},
'feedback_accuracy': {
@@ -90,6 +94,7 @@
'usage': {
'imei': 0,
'invoice': 0,
'total_images': 0,
'request': 0
},
'feedback_accuracy': {
@@ -113,7 +118,13 @@
total["total_images"] += 1
total["images_quality"]["successful"] += 1 if not report_file.is_bad_image else 0
total["images_quality"]["bad"] += 1 if report_file.is_bad_image else 0
# total["report_files"].append(report_file)
doc_type = "imei"
if report_file.doc_type in ["imei", "invoice", "all"]:
doc_type = report_file.doc_type
else:
print(f"[WARM]: Weird doc type {report_file.doc_type} if request id: {report_file.correspond_request_id}")
total["num_imei"] += 1 if doc_type == "imei" else 0
total["num_invoice"] += 1 if doc_type == "invoice" else 0
if sum([len(report_file.reviewed_accuracy[x]) for x in report_file.reviewed_accuracy.keys() if "_count" not in x]) > 0 :
total["average_accuracy_rate"]["imei"].add(report_file.reviewed_accuracy.get("imei_number", []))
@@ -136,8 +147,14 @@
total["average_processing_time"][report_file.doc_type] = IterAvg()
total["average_processing_time"][report_file.doc_type].add_avg(report_file.time_cost, 1) if report_file.time_cost else 0
total["usage"]["imei"] += 1 if report_file.doc_type == "imei" else 0
total["usage"]["invoice"] += 1 if report_file.doc_type == "invoice" else 0
doc_type = "imei"
if report_file.doc_type in ["imei", "invoice", "all"]:
doc_type = report_file.doc_type
else:
print(f"[WARM]: Weird doc type {report_file.doc_type} if request id: {report_file.correspond_request_id}")
total["usage"]["imei"] += 1 if doc_type == "imei" else 0
total["usage"]["invoice"] += 1 if doc_type == "invoice" else 0
total["usage"]["total_images"] += 1
return total
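The same normalize-and-warn block is now duplicated in update_total (twice) and in update_day below; a small helper would keep the copies in sync. A sketch (the helper name is hypothetical, not part of this commit):

VALID_DOC_TYPES = ("imei", "invoice", "all")

def normalize_doc_type(report_file):
    # Mirrors the inline blocks above: unexpected types fall back to "imei" with a warning
    if report_file.doc_type in VALID_DOC_TYPES:
        return report_file.doc_type
    print(f"[WARN]: Weird doc type {report_file.doc_type} in request id: {report_file.correspond_request_id}")
    return "imei"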
@@ -146,8 +163,13 @@
day_data["total_images"] += 1
day_data["images_quality"]["successful"] += 1 if not report_file.is_bad_image else 0
day_data["images_quality"]["bad"] += 1 if report_file.is_bad_image else 0
day_data["num_imei"] += 1 if report_file.doc_type == "imei" else 0
day_data["num_invoice"] += 1 if report_file.doc_type == "invoice" else 0
doc_type = "imei"
if report_file.doc_type in ["imei", "invoice", "all"]:
doc_type = report_file.doc_type
else:
print(f"[WARM]: Weird doc type {report_file.doc_type} if request id: {report_file.correspond_request_id}")
day_data["num_imei"] += 1 if doc_type == "imei" else 0
day_data["num_invoice"] += 1 if doc_type == "invoice" else 0
day_data["report_files"].append(report_file)
if sum([len(report_file.reviewed_accuracy[x]) for x in report_file.reviewed_accuracy.keys() if "_count" not in x]) > 0 :
@@ -186,6 +208,7 @@
self.data[this_month][1][this_day]["usage"]["imei"] = usage.get("imei", 0)
self.data[this_month][1][this_day]["usage"]["invoice"] = usage.get("invoice", 0)
self.data[this_month][1][this_day]["usage"]["request"] = usage.get("request", 0)
self.data[this_month][1][this_day]["usage"]["total_images"] = usage.get("imei", 0) + usage.get("invoice", 0)
self.data[this_month][1][this_day]['num_request'] += 1
self.data[this_month][0]['num_request'] += 1
@@ -213,6 +236,7 @@
day_keys = list(report_data[month][1].keys())
day_keys.sort(reverse = True)
for day in day_keys:
report_data[month][1][day]['subs'] = map_subsidiary_short_to_long(report_data[month][1][day]['subs'])
fine_data.append(report_data[month][1][day])
# save daily reports
report_id = root_report_id + "_" + day
@@ -293,6 +317,7 @@
_data[month][0]["usage"]["imei"] = num_transaction_imei
_data[month][0]["usage"]["invoice"] = num_transaction_invoice
_data[month][0]["usage"]["total_images"] = num_transaction_invoice + num_transaction_imei
_data[month][0]["average_accuracy_rate"]["imei"] = _data[month][0]["average_accuracy_rate"]["imei"]()
_data[month][0]["average_accuracy_rate"]["purchase_date"] = _data[month][0]["average_accuracy_rate"]["purchase_date"]()
_data[month][0]["average_accuracy_rate"]["retailer_name"] = _data[month][0]["average_accuracy_rate"]["retailer_name"]()
@@ -311,7 +336,6 @@
return _data
class MonthReportAccumulate:
def __init__(self):
self.month = None
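The average_accuracy_rate and average_processing_time slots hold IterAvg accumulators that are finalized by calling them (the () calls in the hunk above). A plausible minimal reconstruction from the add/add_avg/call usage in this file; the repository's actual class may differ:

class IterAvg:
    """Streaming average: add() folds in a list of scores, add_avg() a precomputed mean with a weight."""
    def __init__(self):
        self.total = 0.0
        self.count = 0
    def add(self, values):
        values = [v for v in values if v is not None]
        self.total += sum(values)
        self.count += len(values)
    def add_avg(self, avg, count):
        self.total += avg * count
        self.count += count
    def __call__(self):
        return self.total / self.count if self.count else None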
@@ -513,6 +537,7 @@ def extract_report_detail_list(report_detail_list, lower=False, in_percent=True)
data = []
for report_file in report_detail_list:
data.append({
"Subs": report_file.subsidiary,
"Request ID": report_file.correspond_request_id,
"Redemption Number": report_file.correspond_redemption_id,
"Image type": report_file.doc_type,
@@ -600,6 +625,9 @@ def align_fine_result(ready_predict, fine_result):
# print(f"[DEBUG]: fine_result: {fine_result}")
# print(f"[DEBUG]: ready_predict: {ready_predict}")
if fine_result:
if isinstance(ready_predict["purchase_date"], str):
ready_predict["purchase_date"] = [ready_predict["purchase_date"]]
# ready_predict.save()
if fine_result["purchase_date"] and len(ready_predict["purchase_date"]) == 0:
ready_predict["purchase_date"] = [None]
if fine_result["retailername"] and not ready_predict["retailername"]:
@@ -616,6 +644,7 @@ def update_temp_accuracy(accuracy, acc, keys):
for key in keys:
accuracy[key].add(acc[key])
return accuracy
def calculate_accuracy(key_name, inference, target):
"""_summary_
@@ -661,7 +690,10 @@ def calculate_avg_accuracy(acc, type, keys=[]):
acc_list = []
# print(f"[DEBUG]: type: {type} - acc: {acc}")
for key in keys:
acc_list += acc.get(type, {}).get(key, [])
this_acc = acc.get(type, {}).get(key, [])
if len(this_acc) > 0:
this_acc = [max(this_acc)]
acc_list += this_acc
acc_list = [x for x in acc_list if x is not None]
return sum(acc_list)/len(acc_list) if len(acc_list) > 0 else None
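The change above averages only the best score per key instead of every attempt, which raises the reported accuracy when a field was scored more than once. A toy comparison:

acc = {"feedback": {"imei_number": [0.5, 1.0], "purchase_date": [0.8]}}
# old behaviour: mean over every attempt
old = (0.5 + 1.0 + 0.8) / 3                   # ~0.767
# new behaviour: mean over the best attempt per key
new = (max([0.5, 1.0]) + max([0.8])) / 2      # 0.9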
@@ -732,6 +764,12 @@ def calculate_and_save_subcription_file(report, request):
return request_att
def acc_maximize_list_values(acc):
for k in acc.keys():
if isinstance(acc[k], list) and len(acc[k]) > 0:
acc[k] = [max(acc[k])]
return acc
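acc_maximize_list_values mutates its argument in place and is applied to both accuracy dicts before they are stored on ReportFile below; a quick illustration:

acc = {"imei_number": [0.7, 0.95], "purchase_date": [], "imei_number_count": 2}
acc_maximize_list_values(acc)
# acc is now {"imei_number": [0.95], "purchase_date": [], "imei_number_count": 2}
# empty lists and non-list values (the *_count entries) pass through untouched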
def calculate_a_request(report, request):
request_att = {"acc": {"feedback": {"imei_number": [],
"purchase_date": [],
@@ -753,19 +791,30 @@ def calculate_a_request(report, request):
status, att = calculate_subcription_file(image)
if status != 200:
continue
image.feedback_accuracy = att["acc"]["feedback"]
image.reviewed_accuracy = att["acc"]["reviewed"]
image.feedback_accuracy = att["acc"]["feedback"] # dict {key: [values]}
image.reviewed_accuracy = att["acc"]["reviewed"] # dict {key: [values]}
image.is_bad_image_quality = att["is_bad_image"]
if not image.doc_type:
# try to recover the doc type from the filename
_doc_type = image.file_name.split("_")[1]
if _doc_type in ["imei", "invoice"]:
image.doc_type = _doc_type
image.save()
_sub = "NA"
if request.redemption_id:
_sub = map_subsidiary_short_to_long(request.redemption_id[:2])
else:
print(f"[WARM]: empty redemption_id, check request: {request.request_id}")
new_report_file = ReportFile(report=report,
subsidiary=_sub,
correspond_request_id=request.request_id,
correspond_redemption_id=request.redemption_id,
doc_type=image.doc_type,
predict_result=image.predict_result,
feedback_result=image.feedback_result,
reviewed_result=image.reviewed_result,
feedback_accuracy=att["acc"]["feedback"],
reviewed_accuracy=att["acc"]["reviewed"],
feedback_accuracy=acc_maximize_list_values(att["acc"]["feedback"]),
reviewed_accuracy=acc_maximize_list_values(att["acc"]["reviewed"]),
acc=att["avg_acc"],
is_bad_image=att["is_bad_image"],
time_cost=image.processing_time,
@@ -798,7 +847,6 @@ def calculate_a_request(report, request):
return request_att, report_files
def calculate_subcription_file(subcription_request_file):
att = {"acc": {"feedback": {},
"reviewed": {}},


@@ -17,8 +17,8 @@ login_token = None
# Define the login credentials
login_credentials = {
'username': 'sbt',
'password': '7Eg4AbWIXDnufgn'
# 'password': 'abc'
# 'password': '7Eg4AbWIXDnufgn'
'password': 'abc'
}
# Define the command to call the update API


@@ -5,7 +5,7 @@ from datetime import datetime
# Get the proxy URL from the environment variable
interval = 60*60*1 # 1 hour
update_cost = 60*2
update_cost = int(60*1.5)
proxy_url = os.getenv('PROXY', "localhost")
# Define the login API URL
@@ -15,8 +15,8 @@ login_token = None
# Define the login credentials
login_credentials = {
'username': 'sbt',
# 'password': '7Eg4AbWIXDnufgn'
'password': 'abc'
'password': '7Eg4AbWIXDnufgn'
# 'password': 'abc'
}
# Define the command to call the update API


@@ -7,7 +7,7 @@ tag=$1
echo "[INFO] Tag received from Python: $tag"
# echo "[INFO] Updating everything the remote..."
# git submodule update --recursive --remote
git submodule update --recursive --remote
echo "[INFO] Pushing AI image with tag: $tag..."
docker compose -f docker-compose-dev.yml build cope2n-fi-sbt


@@ -84,12 +84,12 @@ services:
depends_on:
db-sbt:
condition: service_started
# command: sh -c "chmod -R 777 /app; sleep 5; python manage.py collectstatic --no-input &&
# python manage.py makemigrations &&
# python manage.py migrate &&
# python manage.py compilemessages &&
# gunicorn fwd.asgi:application -k uvicorn.workers.UvicornWorker --timeout 300 -b 0.0.0.0:9000" # pre-makemigrations on prod
command: bash -c "tail -f > /dev/null"
command: sh -c "chmod -R 777 /app; sleep 5; python manage.py collectstatic --no-input &&
python manage.py makemigrations &&
python manage.py migrate &&
python manage.py compilemessages &&
gunicorn fwd.asgi:application -k uvicorn.workers.UvicornWorker --timeout 300 -b 0.0.0.0:9000" # pre-makemigrations on prod
# command: bash -c "tail -f > /dev/null"
minio:
image: minio/minio
@@ -174,8 +174,8 @@
- ./cope2n-api:/app
working_dir: /app
command: sh -c "celery -A fwd_api.celery_worker.worker worker -l INFO -c 5"
# command: bash -c "tail -f > /dev/null"
# command: sh -c "celery -A fwd_api.celery_worker.worker worker -l INFO -c 5"
command: bash -c "tail -f > /dev/null"
# Back-end persistent
db-sbt:


@@ -10,23 +10,34 @@ from pytz import timezone
from dotenv import load_dotenv
load_dotenv("../.env_prod")
# load_dotenv(".env_prod")
# load_dotenv("../.env")
OUTPUT_NAME = "0131-0206"
START_DATE = datetime(2024, 1, 31, tzinfo=timezone('Asia/Singapore'))
END_DATE = datetime(2024, 2, 6, tzinfo=timezone('Asia/Singapore'))
OUTPUT_NAME = "0116-0216"
START_DATE = datetime(2024, 1, 16, tzinfo=timezone('Asia/Singapore'))
END_DATE = datetime(2024, 2, 16, tzinfo=timezone('Asia/Singapore'))
BAD_THRESHOLD = 0.75
# ("requestId", "redemptionNumber", "fileName", "userSubmitResults", "OCRResults", "revisedResults_by_SDSRV", "accuracy")
REQUEST_ID_COL = 3
REQUEST_NUMBER_COL = 6
REQUEST_REDEMPTION_COL = 27
FILE_NAME_COL = 1
OCR_RESULT_COL = 16
FEEDBACK_RESULT_COL = 15
REVIEWED_RESULT_COL = 17
REVIEW_ACC_COL = 19
FEEDBACK_ACC_COL = 18
REQUEST_ID_COL = 6
# Database connection details
db_host = os.environ.get('DB_HOST', "")
# db_host = "42.96.42.13"
db_name = os.environ.get('DB_SCHEMA', "")
db_user = os.environ.get('DB_USER', "")
db_password = os.environ.get('DB_PASSWORD', "")
# db_host = "sbt.cxetpslawu4p.ap-southeast-1.rds.amazonaws.com"
# db_name = "sbt2"
# db_user = "sbt"
# db_password = "sbtCH240"
# S3 bucket details
s3_bucket_name = os.environ.get('S3_BUCKET_NAME', "")
@@ -36,40 +47,6 @@ s3_folder_prefix = 'sbt_invoice'
access_key = os.environ.get('S3_ACCESS_KEY', "")
secret_key = os.environ.get('S3_SECRET_KEY', "")
class RequestAtt:
def __init__(self) -> None:
self.feedback_accuracy = []
self.reiviewed_accuracy = []
self.acc = 0
self.request_id = None
self.is_bad = False
self.data = []
def add_file(self, file):
self.data.append(file)
if file[REVIEW_ACC_COL]:
for key in file[REVIEW_ACC_COL].keys():
self.feedback_accuracy += file[REVIEW_ACC_COL][key]
if file[FEEDBACK_ACC_COL]:
for key in file[FEEDBACK_ACC_COL].keys():
self.feedback_accuracy += file[FEEDBACK_ACC_COL][key]
def is_bad_image(self):
fb = min(self.feedback_accuracy)/len(self.feedback_accuracy) if len(self.feedback_accuracy) else None
rv = min(self.reiviewed_accuracy)/len(self.reiviewed_accuracy) if len(self.reiviewed_accuracy) else None
if not fb and not rv:
self.is_bad = False
return False
elif fb and rv is None:
self.is_bad = fb < BAD_THRESHOLD
self.acc = fb
return fb < BAD_THRESHOLD
elif fb and rv:
self.is_bad = rv < BAD_THRESHOLD
self.acc = rv
return rv < BAD_THRESHOLD
return False
def get_request(cursor, request_in_id):
query = "SELECT * FROM fwd_api_subscriptionrequest WHERE id = %s"
cursor.execute(query, (request_in_id,))
@@ -99,44 +76,62 @@ def main():
# Define the CSV file path
csv_file_path = f'{OUTPUT_NAME}.csv'
data_dict = {}
bad_image_list = [] # [("requestId", "redemptionNumber", "fileName", "userSubmitResults", "OCRResults", "revisedResults_by_SDSRV", "accuracy"), ...]
request_ids = [] # for crawling images
# Filter out requests whose quality is < 75%
for i, _d in enumerate(data):
if not data_dict.get(_d[REQUEST_ID_COL], None):
data_dict[_d[REQUEST_ID_COL]] = RequestAtt()
data_dict[_d[REQUEST_ID_COL]].request_id = _d[REQUEST_ID_COL]
data_dict[_d[REQUEST_ID_COL]].add_file(_d)
if _d[FEEDBACK_ACC_COL] and _d[FEEDBACK_RESULT_COL]:
acc_len = 0
for key in _d[FEEDBACK_ACC_COL].keys():
if key == "purchase_date":
continue
acc_len += len(_d[FEEDBACK_ACC_COL][key])
if len(_d[FEEDBACK_ACC_COL][key]):
if min(_d[FEEDBACK_ACC_COL][key]) < BAD_THRESHOLD:
parent_request = get_request(cursor, _d[REQUEST_NUMBER_COL])
requestId = parent_request[REQUEST_ID_COL]
redemptionNumber = parent_request[REQUEST_REDEMPTION_COL]
fileName = _d[FILE_NAME_COL]
userSubmitResults = str(_d[FEEDBACK_RESULT_COL][key]) if _d[FEEDBACK_RESULT_COL] else ""
OCRResults = str(_d[OCR_RESULT_COL][key]) if _d[OCR_RESULT_COL] else ""
revisedResults_by_SDSRV = str(_d[REVIEWED_RESULT_COL][key]) if _d[REVIEWED_RESULT_COL] else ""
accuracy = _d[FEEDBACK_ACC_COL][key]
bad_image_list.append((requestId, redemptionNumber, fileName, userSubmitResults, OCRResults, revisedResults_by_SDSRV, accuracy))
request_ids.append(requestId)
if acc_len == 0: # This is the request with acc < 0.75
for key in _d[FEEDBACK_ACC_COL].keys():
if key == "purchase_date":
continue
# if not
if str(_d[FEEDBACK_RESULT_COL][key]) == str(_d[OCR_RESULT_COL][key]):
continue
parent_request = get_request(cursor, _d[REQUEST_NUMBER_COL])
requestId = parent_request[REQUEST_ID_COL]
redemptionNumber = parent_request[REQUEST_REDEMPTION_COL]
fileName = _d[FILE_NAME_COL]
userSubmitResults = str(_d[FEEDBACK_RESULT_COL][key]) if _d[FEEDBACK_RESULT_COL] else ""
OCRResults = str(_d[OCR_RESULT_COL][key]) if _d[OCR_RESULT_COL] else ""
revisedResults_by_SDSRV = str(_d[REVIEWED_RESULT_COL][key]) if _d[REVIEWED_RESULT_COL] else ""
accuracy = "Unknown (avg request acc < 0.75 is excluded from the acc report)"
bad_image_list.append((requestId, redemptionNumber, fileName, userSubmitResults, OCRResults, revisedResults_by_SDSRV, accuracy))
request_ids.append(requestId)
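To summarize the loop above: a file contributes one row per non-purchase_date key whose minimum feedback accuracy falls below BAD_THRESHOLD, and a file with no recorded accuracies at all (acc_len == 0) is exported with an "Unknown" accuracy whenever feedback and OCR disagree on a key. A toy illustration of the first rule:

BAD_THRESHOLD = 0.75
feedback_acc = {"imei_number": [0.9, 0.6], "retailername": [0.95], "purchase_date": [0.2]}
flagged = [k for k, v in feedback_acc.items()
           if k != "purchase_date" and v and min(v) < BAD_THRESHOLD]
# flagged == ["imei_number"]; purchase_date is skipped by design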
bad_images = []
for k in data_dict.keys():
if data_dict[k].is_bad_image():
bad_images.append(data_dict[k])
request_ids = []
# Write the data to the CSV file
for bad_image in bad_images:
request = get_request(cursor, bad_image.request_id)
if request:
request_ids.append(request[3])
# for bad_image in bad_images:
# request = get_request(cursor, bad_image.request_id)
# if request:
# request_ids.append(request[3])
# ###################### Get bad requests ######################
placeholders = ','.join(['%s'] * len(request_ids))
# Execute the SELECT query with the filter
query = f"SELECT * FROM fwd_api_subscriptionrequest WHERE request_id IN ({placeholders})"
cursor.execute(query, request_ids)
# Fetch the filtered data
data = cursor.fetchall()
# Define the CSV file path
csv_file_path = f'{OUTPUT_NAME}.csv'
# Write the data to the CSV file
with open(csv_file_path, 'w', newline='') as csv_file:
writer = csv.writer(csv_file)
writer.writerow([desc[0] for desc in cursor.description]) # Write column headers
writer.writerows(data) # Write the filtered data rows
writer.writerow(["requestId", "redemptionNumber", "fileName", "userSubmitResults", "OCRResults", "revisedResults_by_SDSRV", "accuracy"]) # Write column headers
writer.writerows(bad_image_list) # Write the filtered data rows
# Close the cursor and database connection
cursor.close()
@@ -149,6 +144,8 @@ def main():
aws_secret_access_key=secret_key
)
request_ids = list(set(request_ids))
for request_id in tqdm(request_ids):
folder_key = f"{s3_folder_prefix}/{request_id}/" # Assuming folder structure like: s3_bucket_name/s3_folder_prefix/request_id/
local_folder_path = f"{OUTPUT_NAME}/{request_id}/" # Path to the local folder to save the downloaded files