diff --git a/.gitignore b/.gitignore
index ff69baa..52a2e06 100644
--- a/.gitignore
+++ b/.gitignore
@@ -38,3 +38,4 @@ cope2n-ai-fi/Dockerfile_old_work
 /feedback/
 cope2n-api/public/SBT_report_20240122.csv
 Jan.csv
+*.csv
diff --git a/cope2n-ai-fi/modules/sdsvkvu b/cope2n-ai-fi/modules/sdsvkvu
index 6907ea0..b6d4fab 160000
--- a/cope2n-ai-fi/modules/sdsvkvu
+++ b/cope2n-ai-fi/modules/sdsvkvu
@@ -1 +1 @@
-Subproject commit 6907ea0183b141e3b4f3c21758c9123f1e9b2a27
+Subproject commit b6d4fab46f7f8689dd6b050cfbff2faa6a6f3fec
diff --git a/cope2n-api/fwd_api/api/accuracy_view.py b/cope2n-api/fwd_api/api/accuracy_view.py
index a5660e2..88a0d24 100644
--- a/cope2n-api/fwd_api/api/accuracy_view.py
+++ b/cope2n-api/fwd_api/api/accuracy_view.py
@@ -433,10 +433,10 @@ class AccuracyViewSet(viewsets.ViewSet):
     @action(detail=False, url_path="overview", methods=["GET"])
     def overview(self, request):
         if request.method == 'GET':
-            subsidiary = request.GET.get('subsidiary', "ALL")
+            _subsidiary = request.GET.get('subsidiary', "ALL")
             duration = request.GET.get('duration', "")
-            subsidiary = map_subsidiary_long_to_short(subsidiary)
+            subsidiary = map_subsidiary_long_to_short(_subsidiary)

             # Retrieve data from Redis
             key = f"{subsidiary}_{duration}"
diff --git a/cope2n-api/fwd_api/migrations/0181_reportfile_subsidiary.py b/cope2n-api/fwd_api/migrations/0181_reportfile_subsidiary.py
new file mode 100644
index 0000000..327b0f7
--- /dev/null
+++ b/cope2n-api/fwd_api/migrations/0181_reportfile_subsidiary.py
@@ -0,0 +1,18 @@
+# Generated by Django 4.1.3 on 2024-02-15 09:12
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('fwd_api', '0180_alter_reportfile_time_cost'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='reportfile',
+            name='subsidiary',
+            field=models.CharField(default='', max_length=200, null=True),
+        ),
+    ]
diff --git a/cope2n-api/fwd_api/models/ReportFile.py b/cope2n-api/fwd_api/models/ReportFile.py
index 9599d5d..a4559d3 100644
--- a/cope2n-api/fwd_api/models/ReportFile.py
+++ b/cope2n-api/fwd_api/models/ReportFile.py
@@ -9,6 +9,7 @@ class ReportFile(models.Model):
     id = models.AutoField(primary_key=True)
     correspond_request_id = models.CharField(max_length=200, default="")
     correspond_redemption_id = models.CharField(max_length=200, default="")
+    subsidiary = models.CharField(default="", null=True, max_length=200)
     created_at = models.DateTimeField(default=timezone.now, db_index=True)
     updated_at = models.DateTimeField(auto_now=True)
     report = models.ForeignKey(Report, related_name="files", on_delete=models.CASCADE)
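The overview() change above keeps the raw query parameter in _subsidiary so the mapped value no longer overwrites it, and the hunks below lean on two helpers from fwd_api/utils/subsidiary.py that are imported but never shown in this diff. A minimal sketch of their assumed shape (the mapping entries are illustrative placeholders, not values from the repository):

    # fwd_api/utils/subsidiary.py (sketch; the real mapping table is not in this diff)
    SUBSIDIARY_MAP = {
        "SG": "SEAU",  # hypothetical short -> long pairs
        "VN": "SEVT",
    }

    def map_subsidiary_short_to_long(short: str) -> str:
        # Fall back to the input itself (e.g. "NA" or "ALL") when unmapped.
        return SUBSIDIARY_MAP.get(short, short)

    def map_subsidiary_long_to_short(long: str) -> str:
        reverse = {v: k for k, v in SUBSIDIARY_MAP.items()}
        return reverse.get(long, long)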
diff --git a/cope2n-api/fwd_api/utils/accuracy.py b/cope2n-api/fwd_api/utils/accuracy.py
index 1d78331..2eeece0 100644
--- a/cope2n-api/fwd_api/utils/accuracy.py
+++ b/cope2n-api/fwd_api/utils/accuracy.py
@@ -9,6 +9,7 @@ import uuid
 from fwd_api.models import SubscriptionRequest, SubscriptionRequestFile, ReportFile
 from ..celery_worker.client_connector import c_connector
 from ..utils.file import dict2xlsx, save_workbook_file, save_report_to_S3
+from ..utils.subsidiary import map_subsidiary_short_to_long
 from django.db.models import Q
 from django.utils import timezone
 import redis
@@ -29,6 +30,8 @@ class ReportAccumulateByRequest:
         self.total_format = {
             'subs': "+",
             'extraction_date': "Subtotal ()",
+            'num_imei': 0,
+            'num_invoice': 0,
             'total_images': 0,
             'images_quality': {
                 'successful': 0,
@@ -49,6 +52,7 @@
             'usage': {
                 'imei':0,
                 'invoice': 0,
+                'total_images': 0,
                 'request': 0
             },
             'feedback_accuracy': {
@@ -90,6 +94,7 @@ class ReportAccumulateByRequest:
             'usage': {
                 'imei': 0,
                 'invoice': 0,
+                'total_images': 0,
                 'request': 0
             },
             'feedback_accuracy': {
@@ -113,7 +118,13 @@
         total["total_images"] += 1
         total["images_quality"]["successful"] += 1 if not report_file.is_bad_image else 0
         total["images_quality"]["bad"] += 1 if report_file.is_bad_image else 0
-        # total["report_files"].append(report_file)
+        doc_type = "imei"
+        if report_file.doc_type in ["imei", "invoice", "all"]:
+            doc_type = report_file.doc_type
+        else:
+            print(f"[WARN]: Weird doc type {report_file.doc_type} in request id: {report_file.correspond_request_id}")
+        total["num_imei"] += 1 if doc_type == "imei" else 0
+        total["num_invoice"] += 1 if doc_type == "invoice" else 0
         if sum([len(report_file.reviewed_accuracy[x]) for x in report_file.reviewed_accuracy.keys() if "_count" not in x]) > 0 :
             total["average_accuracy_rate"]["imei"].add(report_file.reviewed_accuracy.get("imei_number", []))
@@ -136,8 +147,14 @@
             total["average_processing_time"][report_file.doc_type] = IterAvg()
         total["average_processing_time"][report_file.doc_type].add_avg(report_file.time_cost, 1) if report_file.time_cost else 0
-        total["usage"]["imei"] += 1 if report_file.doc_type == "imei" else 0
-        total["usage"]["invoice"] += 1 if report_file.doc_type == "invoice" else 0
+        doc_type = "imei"
+        if report_file.doc_type in ["imei", "invoice", "all"]:
+            doc_type = report_file.doc_type
+        else:
+            print(f"[WARN]: Weird doc type {report_file.doc_type} in request id: {report_file.correspond_request_id}")
+        total["usage"]["imei"] += 1 if doc_type == "imei" else 0
+        total["usage"]["invoice"] += 1 if doc_type == "invoice" else 0
+        total["usage"]["total_images"] += 1
         return total
@@ -146,8 +163,13 @@
         day_data["total_images"] += 1
         day_data["images_quality"]["successful"] += 1 if not report_file.is_bad_image else 0
         day_data["images_quality"]["bad"] += 1 if report_file.is_bad_image else 0
-        day_data["num_imei"] += 1 if report_file.doc_type == "imei" else 0
-        day_data["num_invoice"] += 1 if report_file.doc_type == "invoice" else 0
+        doc_type = "imei"
+        if report_file.doc_type in ["imei", "invoice", "all"]:
+            doc_type = report_file.doc_type
+        else:
+            print(f"[WARN]: Weird doc type {report_file.doc_type} in request id: {report_file.correspond_request_id}")
+        day_data["num_imei"] += 1 if doc_type == "imei" else 0
+        day_data["num_invoice"] += 1 if doc_type == "invoice" else 0
         day_data["report_files"].append(report_file)
         if sum([len(report_file.reviewed_accuracy[x]) for x in report_file.reviewed_accuracy.keys() if "_count" not in x]) > 0 :
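The same five-line doc-type normalization now appears three times in this file: in the subtotal counters, the usage counters, and the per-day counters. A hypothetical helper that could replace all three copies (not part of this patch):

    def normalize_doc_type(report_file) -> str:
        """Treat imei/invoice/all as valid; map anything else to 'imei' with a warning."""
        if report_file.doc_type in ("imei", "invoice", "all"):
            return report_file.doc_type
        print(f"[WARN]: Weird doc type {report_file.doc_type} "
              f"in request id: {report_file.correspond_request_id}")
        return "imei"

Each call site would then reduce to doc_type = normalize_doc_type(report_file) followed by the two counter increments.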
"_" + day @@ -254,10 +278,10 @@ class ReportAccumulateByRequest: ) if is_daily_report: new_report.save() - data = extract_report_detail_list(self.data[month][1][day]["report_files"], lower=True) - data_workbook = dict2xlsx(data, _type='report_detail') - local_workbook = save_workbook_file(report_id + ".xlsx", new_report, data_workbook) - s3_key=save_report_to_S3(report_id, local_workbook) + data = extract_report_detail_list(self.data[month][1][day]["report_files"], lower=True) + data_workbook = dict2xlsx(data, _type='report_detail') + local_workbook = save_workbook_file(report_id + ".xlsx", new_report, data_workbook) + s3_key=save_report_to_S3(report_id, local_workbook) return fine_data, save_data def get(self) -> Any: @@ -293,6 +317,7 @@ class ReportAccumulateByRequest: _data[month][0]["usage"]["imei"] = num_transaction_imei _data[month][0]["usage"]["invoice"] = num_transaction_invoice + _data[month][0]["usage"]["total_images"] = num_transaction_invoice + num_transaction_imei _data[month][0]["average_accuracy_rate"]["imei"] = _data[month][0]["average_accuracy_rate"]["imei"]() _data[month][0]["average_accuracy_rate"]["purchase_date"] = _data[month][0]["average_accuracy_rate"]["purchase_date"]() _data[month][0]["average_accuracy_rate"]["retailer_name"] = _data[month][0]["average_accuracy_rate"]["retailer_name"]() @@ -311,7 +336,6 @@ class ReportAccumulateByRequest: return _data - class MonthReportAccumulate: def __init__(self): self.month = None @@ -513,6 +537,7 @@ def extract_report_detail_list(report_detail_list, lower=False, in_percent=True) data = [] for report_file in report_detail_list: data.append({ + "Subs": report_file.subsidiary, "Request ID": report_file.correspond_request_id, "Redemption Number": report_file.correspond_redemption_id, "Image type": report_file.doc_type, @@ -600,6 +625,9 @@ def align_fine_result(ready_predict, fine_result): # print(f"[DEBUG]: fine_result: {fine_result}") # print(f"[DEBUG]: ready_predict: {ready_predict}") if fine_result: + if isinstance(ready_predict["purchase_date"], str): + ready_predict["purchase_date"] = [ready_predict["purchase_date"]] + # ready_predict.save() if fine_result["purchase_date"] and len(ready_predict["purchase_date"]) == 0: ready_predict["purchase_date"] = [None] if fine_result["retailername"] and not ready_predict["retailername"]: @@ -616,6 +644,7 @@ def update_temp_accuracy(accuracy, acc, keys): for key in keys: accuracy[key].add(acc[key]) return accuracy + def calculate_accuracy(key_name, inference, target): """_summary_ @@ -661,7 +690,10 @@ def calculate_avg_accuracy(acc, type, keys=[]): acc_list = [] # print(f"[DEBUG]: type: {type} - acc: {acc}") for key in keys: - acc_list += acc.get(type, {}).get(key, []) + this_acc = acc.get(type, {}).get(key, []) + if len(this_acc) > 0: + this_acc = [max(this_acc)] + acc_list += this_acc acc_list = [x for x in acc_list if x is not None] return sum(acc_list)/len(acc_list) if len(acc_list) > 0 else None @@ -732,6 +764,12 @@ def calculate_and_save_subcription_file(report, request): return request_att +def acc_maximize_list_values(acc): + for k in acc.keys(): + if isinstance(acc[k], list) and len(acc[k]) > 0: + acc[k] = [max(acc[k])] + return acc + def calculate_a_request(report, request): request_att = {"acc": {"feedback": {"imei_number": [], "purchase_date": [], @@ -753,19 +791,30 @@ def calculate_a_request(report, request): status, att = calculate_subcription_file(image) if status != 200: continue - image.feedback_accuracy = att["acc"]["feedback"] - image.reviewed_accuracy = 
att["acc"]["reviewed"] + image.feedback_accuracy = att["acc"]["feedback"] # dict {key: [values]} + image.reviewed_accuracy = att["acc"]["reviewed"] # dict {key: [values]} image.is_bad_image_quality = att["is_bad_image"] + if not image.doc_type: + # try to revert doc type from filename + _doc_type = image.file_name.split("_")[1] + if _doc_type in ["imei", "invoice"]: + image.doc_type = _doc_type image.save() + _sub = "NA" + if request.redemption_id: + _sub = map_subsidiary_short_to_long(request.redemption_id[:2]) + else: + print(f"[WARM]: empty redemption_id, check request: {request.request_id}") new_report_file = ReportFile(report=report, + subsidiary=_sub, correspond_request_id=request.request_id, correspond_redemption_id=request.redemption_id, doc_type=image.doc_type, predict_result=image.predict_result, feedback_result=image.feedback_result, reviewed_result=image.reviewed_result, - feedback_accuracy=att["acc"]["feedback"], - reviewed_accuracy=att["acc"]["reviewed"], + feedback_accuracy=acc_maximize_list_values(att["acc"]["feedback"]), + reviewed_accuracy=acc_maximize_list_values(att["acc"]["reviewed"]), acc=att["avg_acc"], is_bad_image=att["is_bad_image"], time_cost=image.processing_time, @@ -797,7 +846,6 @@ def calculate_a_request(report, request): continue return request_att, report_files - def calculate_subcription_file(subcription_request_file): att = {"acc": {"feedback": {}, diff --git a/cope2n-api/scripts/re_feedback.py b/cope2n-api/scripts/re_feedback.py index 34ce227..34f82a7 100644 --- a/cope2n-api/scripts/re_feedback.py +++ b/cope2n-api/scripts/re_feedback.py @@ -17,8 +17,8 @@ login_token = None # Define the login credentials login_credentials = { 'username': 'sbt', - 'password': '7Eg4AbWIXDnufgn' - # 'password': 'abc' + # 'password': '7Eg4AbWIXDnufgn' + 'password': 'abc' } # Define the command to call the update API diff --git a/cope2n-api/scripts/script.py b/cope2n-api/scripts/script.py index fa554c6..e25ad22 100644 --- a/cope2n-api/scripts/script.py +++ b/cope2n-api/scripts/script.py @@ -5,7 +5,7 @@ from datetime import datetime # Get the proxy URL from the environment variable interval = 60*60*1 # 1 minute -update_cost = 60*2 +update_cost = int(60*1.5) proxy_url = os.getenv('PROXY', "localhost") # Define the login API URL @@ -15,8 +15,8 @@ login_token = None # Define the login credentials login_credentials = { 'username': 'sbt', - # 'password': '7Eg4AbWIXDnufgn' - 'password': 'abc' + 'password': '7Eg4AbWIXDnufgn' + # 'password': 'abc' } # Define the command to call the update API diff --git a/deploy_images.sh b/deploy_images.sh index 3b57e42..df4360f 100755 --- a/deploy_images.sh +++ b/deploy_images.sh @@ -7,7 +7,7 @@ tag=$1 echo "[INFO] Tag received from Python: $tag" # echo "[INFO] Updating everything the remote..." -# git submodule update --recursive --remote +git submodule update --recursive --remote echo "[INFO] Pushing AI image with tag: $tag..." 
diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml
index 613a8e7..f6d8386 100755
--- a/docker-compose-dev.yml
+++ b/docker-compose-dev.yml
@@ -84,12 +84,12 @@ services:
     depends_on:
       db-sbt:
         condition: service_started
-    # command: sh -c "chmod -R 777 /app; sleep 5; python manage.py collectstatic --no-input &&
-    #               python manage.py makemigrations &&
-    #               python manage.py migrate &&
-    #               python manage.py compilemessages &&
-    #               gunicorn fwd.asgi:application -k uvicorn.workers.UvicornWorker --timeout 300 -b 0.0.0.0:9000" # pre-makemigrations on prod
-    command: bash -c "tail -f > /dev/null"
+    command: sh -c "chmod -R 777 /app; sleep 5; python manage.py collectstatic --no-input &&
+                  python manage.py makemigrations &&
+                  python manage.py migrate &&
+                  python manage.py compilemessages &&
+                  gunicorn fwd.asgi:application -k uvicorn.workers.UvicornWorker --timeout 300 -b 0.0.0.0:9000" # pre-makemigrations on prod
+    # command: bash -c "tail -f > /dev/null"

   minio:
     image: minio/minio
@@ -174,8 +174,8 @@ services:
       - ./cope2n-api:/app
     working_dir: /app
-    command: sh -c "celery -A fwd_api.celery_worker.worker worker -l INFO -c 5"
-    # command: bash -c "tail -f > /dev/null"
+    # command: sh -c "celery -A fwd_api.celery_worker.worker worker -l INFO -c 5"
+    command: bash -c "tail -f > /dev/null"

   # Back-end persistent
   db-sbt:
diff --git a/scripts/crawl_database_by_time_with_accuracy_contrain.py b/scripts/crawl_database_by_time_with_accuracy_contrain.py
index afb0b71..0538423 100644
--- a/scripts/crawl_database_by_time_with_accuracy_contrain.py
+++ b/scripts/crawl_database_by_time_with_accuracy_contrain.py
@@ -10,23 +10,34 @@ from pytz import timezone
 from dotenv import load_dotenv
 load_dotenv("../.env_prod")
+# load_dotenv(".env_prod")
 # load_dotenv("../.env")

-OUTPUT_NAME = "0131-0206"
-START_DATE = datetime(2024, 1, 31, tzinfo=timezone('Asia/Singapore'))
-END_DATE = datetime(2024, 2, 6, tzinfo=timezone('Asia/Singapore'))
+OUTPUT_NAME = "0116-0216"
+START_DATE = datetime(2024, 1, 16, tzinfo=timezone('Asia/Singapore'))
+END_DATE = datetime(2024, 2, 16, tzinfo=timezone('Asia/Singapore'))
 BAD_THRESHOLD = 0.75

+# ("requestId", "redemptionNumber", "fileName", "userSubmitResults", "OCRResults", "revisedResults_by_SDSRV", "accuracy")
+REQUEST_ID_COL = 3
+REQUEST_NUMBER_COL = 6
+REQUEST_REDEMPTION_COL = 27
+FILE_NAME_COL = 1
+OCR_RESULT_COL = 16
+FEEDBACK_RESULT_COL = 15
+REVIEWED_RESULT_COL = 17
 REVIEW_ACC_COL = 19
 FEEDBACK_ACC_COL = 18
-REQUEST_ID_COL = 6

 # Database connection details
 db_host = os.environ.get('DB_HOST', "")
-# db_host = "42.96.42.13"
 db_name = os.environ.get('DB_SCHEMA', "")
 db_user = os.environ.get('DB_USER', "")
 db_password = os.environ.get('DB_PASSWORD', "")
+# db_host = "sbt.cxetpslawu4p.ap-southeast-1.rds.amazonaws.com"
+# db_name = "sbt2"
+# db_user = "sbt"
+# db_password = "sbtCH240"

 # S3 bucket details
 s3_bucket_name = os.environ.get('S3_BUCKET_NAME', "")
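The *_COL constants above index into rows returned by SELECT *, so they silently break whenever the table's column order changes. A sketch of a name-based alternative using the DB-API cursor.description (the column names here are assumptions based on the fields used elsewhere in the script):

    def row_as_dict(cursor, row):
        # cursor.description holds one (name, ...) tuple per selected column.
        return {desc[0]: value for desc, value in zip(cursor.description, row)}

    # e.g. d = row_as_dict(cursor, data[0]); d["request_id"], d["redemption_id"]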
@@ -36,40 +47,6 @@
 s3_folder_prefix = 'sbt_invoice'
 access_key = os.environ.get('S3_ACCESS_KEY', "")
 secret_key = os.environ.get('S3_SECRET_KEY', "")

-class RequestAtt:
-    def __init__(self) -> None:
-        self.feedback_accuracy = []
-        self.reiviewed_accuracy = []
-        self.acc = 0
-        self.request_id = None
-        self.is_bad = False
-        self.data = []
-
-    def add_file(self, file):
-        self.data.append(file)
-        if file[REVIEW_ACC_COL]:
-            for key in file[REVIEW_ACC_COL].keys():
-                self.feedback_accuracy += file[REVIEW_ACC_COL][key]
-        if file[FEEDBACK_ACC_COL]:
-            for key in file[FEEDBACK_ACC_COL].keys():
-                self.feedback_accuracy += file[FEEDBACK_ACC_COL][key]
-
-    def is_bad_image(self):
-        fb = min(self.feedback_accuracy)/len(self.feedback_accuracy) if len(self.feedback_accuracy) else None
-        rv = min(self.reiviewed_accuracy)/len(self.reiviewed_accuracy) if len(self.reiviewed_accuracy) else None
-        if not fb and not rv:
-            self.is_bad = False
-            return False
-        elif fb and rv is None:
-            self.is_bad = fb < BAD_THRESHOLD
-            self.acc = fb
-            return fb < BAD_THRESHOLD
-        elif fb and rv:
-            self.is_bad = rv < BAD_THRESHOLD
-            self.acc = rv
-            return rv < BAD_THRESHOLD
-        return False
-
 def get_request(cursor, request_in_id):
     query = "SELECT * FROM fwd_api_subscriptionrequest WHERE id = %s"
     cursor.execute(query, (request_in_id,))
@@ -99,44 +76,62 @@ def main():
     # Define the CSV file path
     csv_file_path = f'{OUTPUT_NAME}.csv'
-    data_dict = {}
+
+    bad_image_list = []  # [("requestId", "redemptionNumber", "fileName", "userSubmitResults", "OCRResults", "revisedResults_by_SDSRV", "accuracy"), ...]
+    request_ids = []  # for crawling images

     # Filter out requests that have quality < 75%
     for i, _d in enumerate(data):
-        if not data_dict.get(_d[REQUEST_ID_COL], None):
-            data_dict[_d[REQUEST_ID_COL]] = RequestAtt()
-            data_dict[_d[REQUEST_ID_COL]].request_id = _d[REQUEST_ID_COL]
-        data_dict[_d[REQUEST_ID_COL]].add_file(_d)
-
-    bad_images = []
-    for k in data_dict.keys():
-        if data_dict[k].is_bad_image():
-            bad_images.append(data_dict[k])
+        if _d[FEEDBACK_ACC_COL] and _d[FEEDBACK_RESULT_COL]:
+            acc_len = 0
+            for key in _d[FEEDBACK_ACC_COL].keys():
+                if key == "purchase_date":
+                    continue
+                acc_len += len(_d[FEEDBACK_ACC_COL][key])
+                if len(_d[FEEDBACK_ACC_COL][key]):
+                    if min(_d[FEEDBACK_ACC_COL][key]) < BAD_THRESHOLD:
+                        parent_request = get_request(cursor, _d[REQUEST_NUMBER_COL])
+                        requestId = parent_request[REQUEST_ID_COL]
+                        redemptionNumber = parent_request[REQUEST_REDEMPTION_COL]
+                        fileName = _d[FILE_NAME_COL]
+                        userSubmitResults = str(_d[FEEDBACK_RESULT_COL][key]) if _d[FEEDBACK_RESULT_COL] else ""
+                        OCRResults = str(_d[OCR_RESULT_COL][key]) if _d[OCR_RESULT_COL] else ""
+                        revisedResults_by_SDSRV = str(_d[REVIEWED_RESULT_COL][key]) if _d[REVIEWED_RESULT_COL] else ""
+                        accuracy = _d[FEEDBACK_ACC_COL][key]
+                        bad_image_list.append((requestId, redemptionNumber, fileName, userSubmitResults, OCRResults, revisedResults_by_SDSRV, accuracy))
+                        request_ids.append(requestId)
+            if acc_len == 0:  # no accuracy entries: this request was excluded from the acc report (avg request acc < 0.75)
+                for key in _d[FEEDBACK_ACC_COL].keys():
+                    if key == "purchase_date":
+                        continue
+                    if str(_d[FEEDBACK_RESULT_COL][key]) == str(_d[OCR_RESULT_COL][key]):
+                        continue
+                    parent_request = get_request(cursor, _d[REQUEST_NUMBER_COL])
+                    requestId = parent_request[REQUEST_ID_COL]
+                    redemptionNumber = parent_request[REQUEST_REDEMPTION_COL]
+                    fileName = _d[FILE_NAME_COL]
+                    userSubmitResults = str(_d[FEEDBACK_RESULT_COL][key]) if _d[FEEDBACK_RESULT_COL] else ""
+                    OCRResults = str(_d[OCR_RESULT_COL][key]) if _d[OCR_RESULT_COL] else ""
+                    revisedResults_by_SDSRV = str(_d[REVIEWED_RESULT_COL][key]) if _d[REVIEWED_RESULT_COL] else ""
+                    accuracy = "Unknown (avg request acc < 0.75 is excluded from the acc report)"
+                    bad_image_list.append((requestId, redemptionNumber, fileName, userSubmitResults, OCRResults, revisedResults_by_SDSRV, accuracy))
+                    request_ids.append(requestId)
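Distilled, the new filter flags a file when any field other than purchase_date has a feedback-accuracy score below BAD_THRESHOLD, and falls back to a feedback-vs-OCR string comparison when the file has no accuracy entries at all (the case excluded from the accuracy report). A hypothetical helper equivalent to the first branch:

    def flagged_fields(feedback_acc: dict, threshold: float = 0.75) -> list:
        flagged = []
        for key, scores in feedback_acc.items():
            if key == "purchase_date":  # date accuracy is intentionally ignored
                continue
            if scores and min(scores) < threshold:
                flagged.append(key)
        return flagged

    # flagged_fields({"imei_number": [0.6], "retailername": [0.9]}) -> ["imei_number"]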
-    request_ids = []
     # Write the data to the CSV file
-    for bad_image in bad_images:
-        request = get_request(cursor, bad_image.request_id)
-        if request:
-            request_ids.append(request[3])
+    # for bad_image in bad_images:
+    #     request = get_request(cursor, bad_image.request_id)
+    #     if request:
+    #         request_ids.append(request[3])

     # ###################### Get bad requests ######################
-    placeholders = ','.join(['%s'] * len(request_ids))
-
-    # Execute the SELECT query with the filter
-    query = f"SELECT * FROM fwd_api_subscriptionrequest WHERE request_id IN ({placeholders})"
-    cursor.execute(query, request_ids)
-
-    # Fetch the filtered data
-    data = cursor.fetchall()
-
     # Define the CSV file path
     csv_file_path = f'{OUTPUT_NAME}.csv'

     # Write the data to the CSV file
     with open(csv_file_path, 'w', newline='') as csv_file:
         writer = csv.writer(csv_file)
-        writer.writerow([desc[0] for desc in cursor.description])  # Write column headers
-        writer.writerows(data)  # Write the filtered data rows
+        writer.writerow(["requestId", "redemptionNumber", "fileName", "userSubmitResults", "OCRResults", "revisedResults_by_SDSRV", "accuracy"])  # Write column headers
+        writer.writerows(bad_image_list)  # Write the filtered data rows

     # Close the cursor and database connection
     cursor.close()
@@ -149,6 +144,8 @@ def main():
         aws_secret_access_key=secret_key
     )

+    request_ids = list(set(request_ids))
+
     for request_id in tqdm(request_ids):
         folder_key = f"{s3_folder_prefix}/{request_id}/"  # Assuming folder structure like: s3_bucket_name/s3_folder_prefix/request_id/
         local_folder_path = f"{OUTPUT_NAME}/{request_id}/"  # Path to the local folder to save the downloaded files
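The download loop body is truncated here. For completeness, a minimal sketch of what fetching one request's folder could look like with the boto3 client created above (the loop body is inferred from folder_key and local_folder_path; it is not shown in this diff):

    import os

    def download_prefix(s3, bucket, folder_key, local_folder_path):
        os.makedirs(local_folder_path, exist_ok=True)
        paginator = s3.get_paginator("list_objects_v2")
        for page in paginator.paginate(Bucket=bucket, Prefix=folder_key):
            for obj in page.get("Contents", []):
                file_name = obj["Key"].split("/")[-1]
                if file_name:  # skip the zero-byte "folder" placeholder key
                    s3.download_file(bucket, obj["Key"],
                                     os.path.join(local_folder_path, file_name))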