sbt-idp/scripts/crawl_database_by_time.py
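
"""Crawl fwd_api_subscriptionrequest by creation time.

Exports every row created between START_DATE and END_DATE to <OUTPUT_NAME>.csv,
then downloads the matching request folders from the sbt_invoice prefix of the
S3 bucket into a local <OUTPUT_NAME>/ directory.

Database and S3 credentials are read from ../.env_prod (DB_HOST, DB_SCHEMA,
DB_USER, DB_PASSWORD, S3_BUCKET_NAME, S3_ACCESS_KEY, S3_SECRET_KEY). Adjust
OUTPUT_NAME, START_DATE and END_DATE below, then run the script directly.
"""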
import csv
import psycopg2
import boto3
import os
from tqdm import tqdm
from datetime import datetime
import pytz
from django.utils import timezone
from dotenv import load_dotenv
load_dotenv("../.env_prod")
tz = pytz.timezone('Asia/Singapore')
OUTPUT_NAME = "0303-0327"
START_DATE = datetime(2024, 3, 3)
END_DATE = datetime(2024, 3, 27)
START_DATE = timezone.make_aware(START_DATE, tz)
END_DATE = timezone.make_aware(END_DATE, tz)
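# The filter below covers 00:00 on 3 Mar up to 00:00 on 27 Mar (Asia/Singapore),
# so requests created later on 27 Mar are not included.
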
# Database connection details
db_host = os.environ.get('DB_HOST', "")
db_name = os.environ.get('DB_SCHEMA', "")
db_user = os.environ.get('DB_USER', "")
db_password = os.environ.get('DB_PASSWORD', "")
# S3 bucket details
s3_bucket_name = os.environ.get('S3_BUCKET_NAME', "")
s3_folder_prefix = 'sbt_invoice'
# S3 access credentials
access_key = os.environ.get('S3_ACCESS_KEY', "")
secret_key = os.environ.get('S3_SECRET_KEY', "")

# Connect to the PostgreSQL database
conn = psycopg2.connect(
    host=db_host,
    database=db_name,
    user=db_user,
    password=db_password,
)

# Create a cursor
cursor = conn.cursor()
# Execute the SELECT query with the filter
query = "SELECT * FROM fwd_api_subscriptionrequest WHERE created_at >= %s AND created_at <= %s"
cursor.execute(query, (START_DATE, END_DATE))
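# Note: START_DATE/END_DATE are timezone-aware, so psycopg2 sends them with an
# explicit UTC offset; this assumes created_at is a timestamptz column.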
# Fetch the filtered data
data = cursor.fetchall()
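# fetchall() keeps every matching row in memory, which is assumed to be fine
# for a window of a few weeks of requests.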
# Define the CSV file path
csv_file_path = f'{OUTPUT_NAME}.csv'
# Write the data to the CSV file
with open(csv_file_path, 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow([desc[0] for desc in cursor.description])  # Write column headers
    writer.writerows(data)  # Write the filtered data rows

# Close the cursor and database connection
cursor.close()
conn.close()

# Download folders from S3
s3_client = boto3.client(
    's3',
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
)

# Collect request IDs from the fetched rows (rq[3] assumes request_id is the
# fourth column of fwd_api_subscriptionrequest)
request_ids = []
for rq in data:
    rq_id = rq[3]
    request_ids.append(rq_id)
for request_id in tqdm(request_ids):
    folder_key = f"{s3_folder_prefix}/{request_id}/"  # Assuming folder structure like: s3_bucket_name/s3_folder_prefix/request_id/
    local_folder_path = f"{OUTPUT_NAME}/{request_id}/"  # Local folder to save the downloaded files
    os.makedirs(OUTPUT_NAME, exist_ok=True)
    if os.path.exists(local_folder_path):
        continue  # Already downloaded on a previous run
    os.makedirs(local_folder_path, exist_ok=True)
    # List objects in the S3 folder (list_objects_v2 returns at most 1000 keys
    # per call, assumed sufficient for a single request folder)
    response = s3_client.list_objects_v2(Bucket=s3_bucket_name, Prefix=folder_key)
    objects = response.get('Contents', [])
    for s3_object in objects:
        object_key = s3_object['Key']
        if object_key.endswith('/'):
            continue  # Skip folder placeholder keys
        local_file_path = local_folder_path + object_key.split('/')[-1]  # File name taken from the object key
        # Download the S3 object to the local file
        s3_client.download_file(s3_bucket_name, object_key, local_file_path)
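
# Optional end-of-run summary (counts derived from the query results above)
print(f"Wrote {len(data)} rows to {csv_file_path}; downloaded folders for "
      f"{len(request_ids)} requests into {OUTPUT_NAME}/")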