import csv
import os
from datetime import datetime

import boto3
import psycopg2
from dotenv import load_dotenv
from pytz import timezone
from tqdm import tqdm

load_dotenv("../.env_prod")
# load_dotenv("../.env")

OUTPUT_NAME = "0131-0206"
# pytz timezones must be attached with localize(); passing one as tzinfo
# directly yields the zone's historical LMT offset instead of +08:00.
SGT = timezone('Asia/Singapore')
START_DATE = SGT.localize(datetime(2024, 1, 31))
END_DATE = SGT.localize(datetime(2024, 2, 6))
BAD_THRESHOLD = 0.75

# Positional column indices into fwd_api_subscriptionrequestfile rows
REVIEW_ACC_COL = 19
FEEDBACK_ACC_COL = 18
REQUEST_ID_COL = 6

# Database connection details
db_host = os.environ.get('DB_HOST', "")
# db_host = "42.96.42.13"
db_name = os.environ.get('DB_SCHEMA', "")
db_user = os.environ.get('DB_USER', "")
db_password = os.environ.get('DB_PASSWORD', "")

# S3 bucket details
s3_bucket_name = os.environ.get('S3_BUCKET_NAME', "")
s3_folder_prefix = 'sbt_invoice'

# S3 access credentials
access_key = os.environ.get('S3_ACCESS_KEY', "")
secret_key = os.environ.get('S3_SECRET_KEY', "")


class RequestAtt:
    """Accumulates the accuracy scores of all files belonging to one request."""

    def __init__(self) -> None:
        self.feedback_accuracy = []
        self.reviewed_accuracy = []
        self.acc = 0
        self.request_id = None
        self.is_bad = False
        self.data = []

    def add_file(self, file):
        # Each accuracy column holds a dict mapping field name -> list of scores.
        self.data.append(file)
        if file[REVIEW_ACC_COL]:
            for scores in file[REVIEW_ACC_COL].values():
                self.reviewed_accuracy += scores
        if file[FEEDBACK_ACC_COL]:
            for scores in file[FEEDBACK_ACC_COL].values():
                self.feedback_accuracy += scores

    def is_bad_image(self):
        # Mean accuracy across all collected scores, or None if there are none.
        fb = sum(self.feedback_accuracy) / len(self.feedback_accuracy) if self.feedback_accuracy else None
        rv = sum(self.reviewed_accuracy) / len(self.reviewed_accuracy) if self.reviewed_accuracy else None
        # Reviewed accuracy takes precedence over feedback accuracy when both exist.
        acc = rv if rv is not None else fb
        if acc is None:
            self.is_bad = False
            return False
        self.acc = acc
        self.is_bad = acc < BAD_THRESHOLD
        return self.is_bad


def get_request(cursor, request_in_id):
    query = "SELECT * FROM fwd_api_subscriptionrequest WHERE id = %s"
    cursor.execute(query, (request_in_id,))
    return cursor.fetchone()  # None if no row matches
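
# A minimal alternative sketch for the fetch step: rows as dicts keyed by
# column name (via psycopg2's RealDictCursor) instead of the positional
# indices above, so a schema change fails loudly rather than silently.
# Key names other than "feedback_accuracy" are assumptions inferred from
# the column constants, not confirmed against the actual table schema.
def fetch_files_as_dicts(conn, start, end):
    from psycopg2.extras import RealDictCursor
    with conn.cursor(cursor_factory=RealDictCursor) as cursor:
        cursor.execute(
            "SELECT * FROM fwd_api_subscriptionrequestfile "
            "WHERE created_at >= %s AND created_at <= %s "
            "AND feedback_accuracy IS NOT NULL",
            (start, end),
        )
        return cursor.fetchall()  # e.g. row["feedback_accuracy"], row["request_id"]
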
def main():
    # Connect to the PostgreSQL database
    conn = psycopg2.connect(
        host=db_host,
        database=db_name,
        user=db_user,
        password=db_password
    )
    cursor = conn.cursor()

    # Fetch all request files created in the date range that have feedback
    query = (
        "SELECT * FROM fwd_api_subscriptionrequestfile "
        "WHERE created_at >= %s AND created_at <= %s "
        "AND feedback_accuracy IS NOT NULL"
    )
    cursor.execute(query, (START_DATE, END_DATE))
    data = cursor.fetchall()

    # Group files by request and flag requests whose mean accuracy
    # falls below BAD_THRESHOLD
    data_dict = {}
    for _d in data:
        request_id = _d[REQUEST_ID_COL]
        if request_id not in data_dict:
            data_dict[request_id] = RequestAtt()
            data_dict[request_id].request_id = request_id
        data_dict[request_id].add_file(_d)

    bad_images = [req for req in data_dict.values() if req.is_bad_image()]

    # Resolve each bad request's internal id to its public request_id
    request_ids = []
    for bad_image in bad_images:
        request = get_request(cursor, bad_image.request_id)
        if request:
            request_ids.append(request[3])

    if not request_ids:
        # An empty IN () clause is a SQL syntax error, so bail out early
        cursor.close()
        conn.close()
        return

    # ###################### Get bad requests ######################
    placeholders = ','.join(['%s'] * len(request_ids))
    query = f"SELECT * FROM fwd_api_subscriptionrequest WHERE request_id IN ({placeholders})"
    cursor.execute(query, request_ids)
    data = cursor.fetchall()

    # Write the bad requests to a CSV file
    csv_file_path = f'{OUTPUT_NAME}.csv'
    with open(csv_file_path, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow([desc[0] for desc in cursor.description])  # Column headers
        writer.writerows(data)

    # Close the cursor and database connection
    cursor.close()
    conn.close()

    # Download each bad request's folder from S3
    s3_client = boto3.client(
        's3',
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_key
    )
    for request_id in tqdm(request_ids):
        # Assuming a folder structure like s3_bucket_name/s3_folder_prefix/request_id/
        folder_key = f"{s3_folder_prefix}/{request_id}/"
        local_folder_path = f"{OUTPUT_NAME}/{request_id}/"
        os.makedirs(local_folder_path, exist_ok=True)

        # Note: list_objects_v2 returns at most 1,000 keys per call; see the
        # paginator sketch below for folders that may exceed that.
        response = s3_client.list_objects_v2(Bucket=s3_bucket_name, Prefix=folder_key)
        for s3_object in response.get('Contents', []):
            object_key = s3_object['Key']
            # The file name is the last path segment of the object key
            local_file_path = local_folder_path + object_key.split('/')[-1]
            s3_client.download_file(s3_bucket_name, object_key, local_file_path)


if __name__ == "__main__":
    main()
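
# A hedged sketch of a more robust download step, not wired into main():
# list_objects_v2 returns at most 1,000 keys per call, so a folder with more
# files than that would be silently truncated above. boto3's paginator
# follows continuation tokens automatically.
def download_prefix(s3_client, bucket, prefix, local_dir):
    os.makedirs(local_dir, exist_ok=True)
    paginator = s3_client.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for s3_object in page.get("Contents", []):
            object_key = s3_object["Key"]
            local_file_path = os.path.join(local_dir, object_key.split("/")[-1])
            s3_client.download_file(bucket, object_key, local_file_path)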