# sbt-idp/scripts/crawl_database.py
# 2024-01-11 18:27:33 +07:00
#
# 152 lines
# 5.9 KiB
# Python

import csv
import psycopg2
import boto3
import os
from tqdm import tqdm
from dotenv import load_dotenv
# Load variables from the env file; the relative path is resolved against the
# current working directory.
load_dotenv("../.env_prod")

# Base name shared by the exported CSV file and the local download directory.
OUTPUT_NAME = "5Jan"

# PostgreSQL connection settings (each falls back to "" when the variable is unset).
db_host = os.getenv('DB_HOST', "")
db_name = os.getenv('DB_SCHEMA', "")
db_user = os.getenv('DB_USER', "")
db_password = os.getenv('DB_PASSWORD', "")

# S3 location: bucket name from the environment, fixed key prefix.
s3_bucket_name = os.getenv('S3_BUCKET_NAME', "")
s3_folder_prefix = 'sbt_invoice'

# Credentials handed to the S3 client.
access_key = os.getenv('S3_ACCESS_KEY', "")
secret_key = os.getenv('S3_SECRET_KEY', "")
# Request IDs to process: they filter the database export and name the S3
# folders to download.  The pasted list contains repeated entries, so it is
# de-duplicated (keeping first-seen order) right after the literal to avoid
# handling the same request more than once.
request_ids = [
    'SAP_20240104082259_85c7f4dd262946d183dbec826fc6709e',
    'SAP_20240104082709_c05319c56fd3422dbf133aee33fc3e10',
    'SAP_20240104091512_23ae1a81f1314be0a27ebeae0e8fa0d7',
    'SAP_20240104091512_23ae1a81f1314be0a27ebeae0e8fa0d7',
    'SAP_20240104091816_025c90b9789246ed811772003622fa0d',
    'SAP_20240104092541_5c71e535f07c4cc8803b45336ec70f77',
    'SAP_20240104100259_5a667d33cb914e7ba5a4447b9e17d649',
    'SAP_20240104101145_a7010bac159f47bc95d5866e6c5f5bdf',
    'SAP_20240104105702_95252229252b4e238add117919ce882a',
    'SAP_20240104112108_34b2cca84a42473ca77bc316e787fe2e',
    'SAP_20240104114038_dd57ecf7982c4a5eaf1409f5ef050fab',
    'SAP_20240104115942_1b77f411791940a4a85c838c2e9931ad',
    'SAP_20240104120746_d63319f4cde343d894f9b89706756a9d',
    'SAP_20240104123607_48d25c04fec6411dbf013c6a19054e77',
    'SAP_20240104130957_ece21bad331b4f2cad0887693331aa3a',
    'SAP_20240104131228_edebee4000ae4bd382feaea5d6c82031',
    'SAP_20240104132641_97909efd013f45e89d83d36a5ea35c52',
    'SAP_20240104133527_ad55f6ee667643ba8ae65e9ef1c32418',
    'SAP_20240104134014_2d2cdbc1b06a44868ce1b32cdb53864f',
    'SAP_20240104134425_9b37555ef8094153838e6048f7c63c9b',
    'SAP_20240104134457_55a1cf1e371146d995c8849cc0ba7c7b',
    'SAP_20240104134609_3f7d308e467d43dbb59a7bcc02e3a7d2',
    'SAP_20240104134709_c708daf83f7e4aa69ab9696afe1a9081',
    'SAP_20240104135007_44b7a30c5e9c41a0b8065ac4e7000223',
    'SAP_20240104141547_7203ddb915274e99a08ae6e54ec49cbd',
    'SAP_20240104141559_62fd19a6179248ecb4ff15b33338b294',
    'SAP_20240104142352_68699cbe140f4264b858981a3ac67e40',
    'SAP_20240104143937_801931cc1f344a4ca8384dfe13d1accc',
    'SAP_20240104144730_3180a8919e604e26a188ce051465c392',
    'SAP_20240104144933_3380f64019634769befed49e9a671bc6',
    'SAP_20240104151239_76ae2f1d02444f7fabbc104eb77fe45f',
    'SAP_20240104151243_61775c88685d434d98bb9fc7a9889b8e',
    'SAP_20240104151243_61775c88685d434d98bb9fc7a9889b8e',
    'SAP_20240104151243_61775c88685d434d98bb9fc7a9889b8e',
    'SAP_20240104151638_a08a61448a58459a8f2209f64e54c213',
    'SAP_20240104152030_479259e84c5b449499df2cb1023e91ac',
    'SAP_20240104160108_a03634c80583454494b77efcdecbcc71',
    'SAP_20240104160108_a03634c80583454494b77efcdecbcc71',
    'SAP_20240104160311_e7cb02a11bbd4ea1906b3758e97f33ab',
    'SAP_20240104161305_89c5518563224ab89345439dffd504a5',
    'SAP_20240104161305_89c5518563224ab89345439dffd504a5',
    'SAP_20240104164022_0b94af24db9d4ebe9af2086a4bd3cd7e',
    'SAP_20240104170837_58165ec9f88d4e4aa3095ba3dda201d7',
    'SAP_20240104171740_10279cfebbf344f184bbb429cb9a15ad',
    'SAP_20240104175202_247892a4dc7f40f28eafac9c2ad85971',
    'SAP_20240104180517_8ce7a1981dc743e08e09284fd904d536',
    'SAP_20240104182034_406bac0ab0684727b9efb1bb9b422026',
    'SAP_20240104182426_92a48bb4b85a4c3abb48e0d7cf727777',
    'SAP_20240104183506_aa1fa7d6774a4509a142a6f4a7b5af29',
    'SAP_20240104185716_f9d464e42c314370910913b37133e6c3',
    'SAP_20240104190220_573244d03bb8408dbca422ff60eb527a',
    'SAP_20240104191236_deedcc588b7b4928a950f7dc2ce4230c',
    'SAP_20240104191236_deedcc588b7b4928a950f7dc2ce4230c',
    'SAP_20240104192614_990bf10c38e144a7bf489548d356720e',
    'SAP_20240104192614_990bf10c38e144a7bf489548d356720e',
    'SAP_20240104212143_f8c1b4a6e6e443fcb5e882c7a5b917f3',
    'SAP_20240104212924_ee1998a60d6848af9576292ac383037f',
    'SAP_20240104214418_f8e1abf808c8499097ecddf014d401c7',
    'SAP_20240104214619_8d27c05a9ce74b738b20195cb816bfbf',
    'SAP_20240104215037_477863cdc0aa4d5fa1f05bbb0ae673ed',
    'SAP_20240104221543_37605982df624324ad2594e268054361',
    'SAP_20240104225026_acacd06ea6de4a738bc47683dc53f378',
    'SAP_20240104235743_b48aa3e744ed428795171d84066adefe',
]
# dict.fromkeys drops repeats while preserving the original ordering.
request_ids = list(dict.fromkeys(request_ids))
# Export the rows of fwd_api_subscriptionrequest whose request_id is listed in
# request_ids to "<OUTPUT_NAME>.csv" (header row first, then the data rows).
#
# `= ANY(%s)` with a Python list replaces a hand-built `IN (%s, %s, ...)`
# string: psycopg2 adapts the list to a PostgreSQL array, and the statement
# stays valid when request_ids is empty (`IN ()` is a SQL syntax error).
query = "SELECT * FROM fwd_api_subscriptionrequest WHERE request_id = ANY(%s)"

# Connect to the PostgreSQL database
conn = psycopg2.connect(
    host=db_host,
    database=db_name,
    user=db_user,
    password=db_password,
)
try:
    cursor = conn.cursor()
    try:
        cursor.execute(query, (list(request_ids),))
        # Capture the column names while the cursor is still open.
        column_names = [desc[0] for desc in cursor.description]
        data = cursor.fetchall()
    finally:
        cursor.close()
finally:
    # Release the connection even if the query fails.
    conn.close()

# Define the CSV file path
csv_file_path = f'{OUTPUT_NAME}.csv'

# Explicit UTF-8 keeps the output independent of the platform's locale
# encoding; newline='' is what the csv module expects for its own line endings.
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(column_names)  # Write column headers
    writer.writerows(data)  # Write the filtered data rows
# Download every object stored under "<s3_folder_prefix>/<request_id>/" into
# the local directory "<OUTPUT_NAME>/<request_id>/", one directory per request.
s3_client = boto3.client(
    's3',
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
)

# A single list_objects_v2 call returns at most 1000 keys; the paginator
# follows continuation tokens so larger folders are downloaded completely.
paginator = s3_client.get_paginator('list_objects_v2')

# Iterate each request ID once (first-seen order) so repeated entries in
# request_ids do not trigger repeated downloads of the same folder.
for request_id in tqdm(list(dict.fromkeys(request_ids))):
    # Assuming folder structure like: s3_bucket_name/s3_folder_prefix/request_id/
    folder_key = f"{s3_folder_prefix}/{request_id}/"
    # Path to the local folder to save the downloaded files
    local_folder_path = f"{OUTPUT_NAME}/{request_id}/"
    # makedirs creates the OUTPUT_NAME parent as well, so one call suffices.
    os.makedirs(local_folder_path, exist_ok=True)

    for page in paginator.paginate(Bucket=s3_bucket_name, Prefix=folder_key):
        for s3_object in page.get('Contents', []):
            object_key = s3_object['Key']
            # Extract the file name from the object key
            file_name = object_key.split('/')[-1]
            if not file_name:
                # Keys ending in "/" are folder placeholders with no file
                # name; downloading them would target the directory itself.
                continue
            local_file_path = os.path.join(local_folder_path, file_name)
            # Download the S3 object to the local file
            s3_client.download_file(s3_bucket_name, object_key, local_file_path)