Add: crawl database script
This commit is contained in:
parent
cdce8df227
commit
9686791d59
96
scripts/crawl_database.py
Normal file
96
scripts/crawl_database.py
Normal file
@ -0,0 +1,96 @@
|
||||
import csv
import os

import boto3
import psycopg2
from tqdm import tqdm

# Prefix shared by the CSV report ("<OUTPUT_NAME>.csv") and the local
# directory the S3 objects are downloaded into.
OUTPUT_NAME = "issue_7"

# PostgreSQL connection settings, read from the environment (empty
# string when unset so a missing variable fails at connect time).
db_host = os.environ.get('DB_HOST', "")
db_name = os.environ.get('DB_SCHEMA', "")
db_user = os.environ.get('DB_USER', "")
db_password = os.environ.get('DB_PASSWORD', "")

# Target S3 bucket and the key prefix the invoice folders live under.
s3_bucket_name = os.environ.get('S3_BUCKET_NAME', "")
s3_folder_prefix = 'sbt_invoice'

# Credentials used to build the S3 client below.
access_key = os.environ.get('S3_ACCESS_KEY', "")
secret_key = os.environ.get('S3_SECRET_KEY', "")

# The subscription-request ids this crawl is restricted to; used both
# as the SQL filter and as the set of S3 folders to fetch.
request_ids = [
    'SAPe39e970592394b27a17d4a64c39f7ed0',
    'SAP477a02a21faf41ecbd1a0bb21636e644',
    'SAP459d58a7dba84e7195f5ad8f46fc1530',
    'SAPa5aaa0e1ce8c4824a7b0ded2e550caec',
    'SAP492c063db44049c6b1e44f59c531f8d8',
    'SAP3d0bdd5cb4ce4291b0cb77d7de0a48e9',
    'SAP7e2c673e49c441a991661d1227342131',
    'SAPc26974bcac2649b28227981459a427aa',
    'SAP25b12dde6b854c70b512ac79059ac1d4',
    'SAP_20240102194138_bf4a3cc4e0304d0385126b6592c2632d',
    'SAP_20240102214550_8389ec1b84a249738eed9d2152bf0922',
]
# Connect to the PostgreSQL database and export the filtered rows to CSV.
# The connection is closed in a finally block so it is not leaked if the
# query or fetch raises; the cursor is managed by its own context manager.
conn = psycopg2.connect(
    host=db_host,
    database=db_name,
    user=db_user,
    password=db_password
)
try:
    with conn.cursor() as cursor:
        # One %s placeholder per request id so the values are passed as
        # bound parameters (no SQL string interpolation of the ids).
        placeholders = ','.join(['%s'] * len(request_ids))
        query = f"SELECT * FROM fwd_api_subscriptionrequest WHERE request_id IN ({placeholders})"
        cursor.execute(query, request_ids)

        # Fetch the filtered rows and capture the column names while the
        # cursor is still open (description is tied to the cursor).
        data = cursor.fetchall()
        header = [desc[0] for desc in cursor.description]
finally:
    conn.close()

# Write the header row followed by the filtered data rows.
csv_file_path = f'{OUTPUT_NAME}.csv'
with open(csv_file_path, 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(header)
    writer.writerows(data)
# Download each request's folder from S3 into OUTPUT_NAME/<request_id>/.
s3_client = boto3.client(
    's3',
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key
)

# list_objects_v2 returns at most 1000 keys per call; use the paginator
# so folders with more objects are downloaded completely.
paginator = s3_client.get_paginator('list_objects_v2')

for request_id in tqdm(request_ids):
    # Expected layout: s3://<bucket>/<prefix>/<request_id>/<files...>
    folder_key = f"{s3_folder_prefix}/{request_id}/"
    local_folder_path = os.path.join(OUTPUT_NAME, request_id)
    # makedirs creates OUTPUT_NAME as a parent automatically.
    os.makedirs(local_folder_path, exist_ok=True)

    for page in paginator.paginate(Bucket=s3_bucket_name, Prefix=folder_key):
        for s3_object in page.get('Contents', []):
            object_key = s3_object['Key']
            # Last path component is the file name; an empty name means
            # this is the zero-byte "folder" placeholder object — skip it,
            # download_file cannot write to a directory path.
            file_name = object_key.rsplit('/', 1)[-1]
            if not file_name:
                continue
            local_file_path = os.path.join(local_folder_path, file_name)

            # Download the S3 object to the local file.
            s3_client.download_file(s3_bucket_name, object_key, local_file_path)
Loading…
Reference in New Issue
Block a user