Add: crawl database script
This commit is contained in:
parent
cdce8df227
commit
9686791d59
96
scripts/crawl_database.py
Normal file
96
scripts/crawl_database.py
Normal file
@ -0,0 +1,96 @@
|
||||
import csv
import os

import boto3
import psycopg2
from tqdm import tqdm

# Prefix shared by the CSV report ("<OUTPUT_NAME>.csv") and the local
# directory the S3 objects are downloaded into.
OUTPUT_NAME = "issue_7"

# PostgreSQL connection settings, read from the environment (empty
# string when unset so a missing variable fails at connect time).
db_host = os.environ.get('DB_HOST', "")
db_name = os.environ.get('DB_SCHEMA', "")
db_user = os.environ.get('DB_USER', "")
db_password = os.environ.get('DB_PASSWORD', "")

# Target S3 bucket and the key prefix the invoice folders live under.
s3_bucket_name = os.environ.get('S3_BUCKET_NAME', "")
s3_folder_prefix = 'sbt_invoice'

# Credentials used to build the S3 client below.
access_key = os.environ.get('S3_ACCESS_KEY', "")
secret_key = os.environ.get('S3_SECRET_KEY', "")

# The subscription-request ids this crawl is restricted to; used both
# as the SQL filter and as the set of S3 folders to fetch.
request_ids = [
    'SAPe39e970592394b27a17d4a64c39f7ed0',
    'SAP477a02a21faf41ecbd1a0bb21636e644',
    'SAP459d58a7dba84e7195f5ad8f46fc1530',
    'SAPa5aaa0e1ce8c4824a7b0ded2e550caec',
    'SAP492c063db44049c6b1e44f59c531f8d8',
    'SAP3d0bdd5cb4ce4291b0cb77d7de0a48e9',
    'SAP7e2c673e49c441a991661d1227342131',
    'SAPc26974bcac2649b28227981459a427aa',
    'SAP25b12dde6b854c70b512ac79059ac1d4',
    'SAP_20240102194138_bf4a3cc4e0304d0385126b6592c2632d',
    'SAP_20240102214550_8389ec1b84a249738eed9d2152bf0922',
]
# Connect to the PostgreSQL database and export the filtered rows to CSV.
# The connection is closed in a finally block so it is not leaked if the
# query or fetch raises; the cursor is managed by its own context manager.
conn = psycopg2.connect(
    host=db_host,
    database=db_name,
    user=db_user,
    password=db_password
)
try:
    with conn.cursor() as cursor:
        # One %s placeholder per request id so the values are passed as
        # bound parameters (no SQL string interpolation of the ids).
        placeholders = ','.join(['%s'] * len(request_ids))
        query = f"SELECT * FROM fwd_api_subscriptionrequest WHERE request_id IN ({placeholders})"
        cursor.execute(query, request_ids)

        # Fetch the filtered rows and capture the column names while the
        # cursor is still open (description is tied to the cursor).
        data = cursor.fetchall()
        header = [desc[0] for desc in cursor.description]
finally:
    conn.close()

# Write the header row followed by the filtered data rows.
csv_file_path = f'{OUTPUT_NAME}.csv'
with open(csv_file_path, 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(header)
    writer.writerows(data)
# Download each request's folder from S3 into OUTPUT_NAME/<request_id>/.
s3_client = boto3.client(
    's3',
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key
)

# list_objects_v2 returns at most 1000 keys per call; use the paginator
# so folders with more objects are downloaded completely.
paginator = s3_client.get_paginator('list_objects_v2')

for request_id in tqdm(request_ids):
    # Expected layout: s3://<bucket>/<prefix>/<request_id>/<files...>
    folder_key = f"{s3_folder_prefix}/{request_id}/"
    local_folder_path = os.path.join(OUTPUT_NAME, request_id)
    # makedirs creates OUTPUT_NAME as a parent automatically.
    os.makedirs(local_folder_path, exist_ok=True)

    for page in paginator.paginate(Bucket=s3_bucket_name, Prefix=folder_key):
        for s3_object in page.get('Contents', []):
            object_key = s3_object['Key']
            # Last path component is the file name; an empty name means
            # this is the zero-byte "folder" placeholder object — skip it,
            # download_file cannot write to a directory path.
            file_name = object_key.rsplit('/', 1)[-1]
            if not file_name:
                continue
            local_file_path = os.path.join(local_folder_path, file_name)

            # Download the S3 object to the local file.
            s3_client.download_file(s3_bucket_name, object_key, local_file_path)
Loading…
Reference in New Issue
Block a user