From 7b8cfbb0628dec3283f103dd9ae2ffd0867c26cb Mon Sep 17 00:00:00 2001
From: dx-tan
Date: Wed, 3 Jan 2024 11:07:58 +0700
Subject: [PATCH] Add: crawl database script

---
 scripts/crawl_database.py | 96 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 96 insertions(+)
 create mode 100644 scripts/crawl_database.py

diff --git a/scripts/crawl_database.py b/scripts/crawl_database.py
new file mode 100644
index 0000000..7a06cf0
--- /dev/null
+++ b/scripts/crawl_database.py
@@ -0,0 +1,96 @@
+import csv
+import psycopg2
+import boto3
+import os
+from tqdm import tqdm
+
+OUTPUT_NAME = "issue_7"
+
+# Database connection details
+db_host = os.environ.get('DB_HOST', "")
+db_name = os.environ.get('DB_SCHEMA', "")
+db_user = os.environ.get('DB_USER', "")
+db_password = os.environ.get('DB_PASSWORD', "")
+
+# S3 bucket details
+s3_bucket_name = os.environ.get('S3_BUCKET_NAME', "")
+s3_folder_prefix = 'sbt_invoice'
+
+# S3 access credentials
+access_key = os.environ.get('S3_ACCESS_KEY', "")
+secret_key = os.environ.get('S3_SECRET_KEY', "")
+
+# Request IDs for filtering
+request_ids = [
+    'SAPe39e970592394b27a17d4a64c39f7ed0',
+    'SAP477a02a21faf41ecbd1a0bb21636e644',
+    'SAP459d58a7dba84e7195f5ad8f46fc1530',
+    'SAPa5aaa0e1ce8c4824a7b0ded2e550caec',
+    'SAP492c063db44049c6b1e44f59c531f8d8',
+    'SAP3d0bdd5cb4ce4291b0cb77d7de0a48e9',
+    'SAP7e2c673e49c441a991661d1227342131',
+    'SAPc26974bcac2649b28227981459a427aa',
+    'SAP25b12dde6b854c70b512ac79059ac1d4',
+    'SAP_20240102194138_bf4a3cc4e0304d0385126b6592c2632d',
+    'SAP_20240102214550_8389ec1b84a249738eed9d2152bf0922',
+]
+
+# Connect to the PostgreSQL database
+conn = psycopg2.connect(
+    host=db_host,
+    database=db_name,
+    user=db_user,
+    password=db_password
+)
+
+# Create a cursor
+cursor = conn.cursor()
+
+# Generate the placeholder string for the IN clause
+placeholders = ','.join(['%s'] * len(request_ids))
+
+# Execute the SELECT query with the filter
+query = f"SELECT * FROM fwd_api_subscriptionrequest WHERE request_id IN ({placeholders})"
+cursor.execute(query, request_ids)
+
+# Fetch the filtered rows
+data = cursor.fetchall()
+
+# Define the CSV file path
+csv_file_path = f'{OUTPUT_NAME}.csv'
+
+# Write the data to the CSV file
+with open(csv_file_path, 'w', newline='') as csv_file:
+    writer = csv.writer(csv_file)
+    writer.writerow([desc[0] for desc in cursor.description])  # Write column headers
+    writer.writerows(data)  # Write the filtered data rows
+
+# Close the cursor and database connection
+cursor.close()
+conn.close()
+
+# Download the matching folders from S3
+s3_client = boto3.client(
+    's3',
+    aws_access_key_id=access_key,
+    aws_secret_access_key=secret_key
+)
+
+for request_id in tqdm(request_ids):
+    folder_key = f"{s3_folder_prefix}/{request_id}/"  # Assuming the structure s3_bucket_name/s3_folder_prefix/request_id/
+    local_folder_path = f"{OUTPUT_NAME}/{request_id}/"  # Local folder for the downloaded files
+    os.makedirs(local_folder_path, exist_ok=True)
+
+    # List objects in the S3 folder
+    response = s3_client.list_objects_v2(Bucket=s3_bucket_name, Prefix=folder_key)
+    objects = response.get('Contents', [])
+
+    for s3_object in objects:
+        object_key = s3_object['Key']
+        file_name = object_key.split('/')[-1]  # Extract the file name from the object key
+        if not file_name:
+            continue  # Skip the zero-byte folder placeholder key itself
+        local_file_path = os.path.join(local_folder_path, file_name)
+
+        # Download the S3 object to the local file
+        s3_client.download_file(s3_bucket_name, object_key, local_file_path)
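
Note: cursor.fetchall() buffers the entire result set in memory before the CSV write. That is harmless for eleven request IDs, but for larger exports psycopg2 can stream the same query straight to CSV with copy_expert. A minimal sketch reusing the script's conn, query, and request_ids variables (export_csv is a hypothetical helper, not part of the patch):

    def export_csv(conn, query, params, path):
        # COPY cannot take bind parameters, so interpolate them
        # safely with mogrify, then stream the result to the file.
        with conn.cursor() as cursor, open(path, 'w') as f:
            sql = cursor.mogrify(query, params).decode()
            cursor.copy_expert(f"COPY ({sql}) TO STDOUT WITH CSV HEADER", f)

Called as export_csv(conn, query, request_ids, csv_file_path), this writes the header row and data in one server-side pass instead of fetching rows into Python first.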
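
Note: the explicit S3_ACCESS_KEY/S3_SECRET_KEY pair is only one way to authenticate. If the script runs where an AWS profile, environment credentials, or an instance role is available (an assumption about the deployment, not something the patch requires), boto3 can resolve credentials itself:

    import boto3

    # No explicit keys: boto3 falls back to its default credential
    # chain (environment, shared config files, or an IAM role).
    s3_client = boto3.client('s3')

This keeps long-lived secrets out of environment variables on hosts that already have role-based access.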
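
Note: a single list_objects_v2 call returns at most 1,000 keys per prefix, so the download loop would silently truncate a request folder larger than that. A sketch of a paginated variant, compatible with the script's s3_client and bucket variables (iter_objects is a hypothetical helper, not part of the patch):

    def iter_objects(s3_client, bucket, prefix):
        # Follow pagination so prefixes holding more than
        # 1,000 objects are listed completely.
        paginator = s3_client.get_paginator('list_objects_v2')
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
            for s3_object in page.get('Contents', []):
                yield s3_object

The inner for-loop would then iterate over iter_objects(s3_client, s3_bucket_name, folder_key) instead of response.get('Contents', []).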