# sbt-idp/scripts/crawl_database_by_time.py

import csv
import psycopg2
import boto3
import os
from tqdm import tqdm
from datetime import datetime, timedelta
from pytz import timezone

from dotenv import load_dotenv

# Load the production settings into the process environment so the
# os.environ lookups below can see them.
# NOTE: this is a relative path, so it depends on the directory the script
# is launched from.
load_dotenv("../.env_prod")

# Name of the export; used as the stem of the output CSV file.
OUTPUT_NAME = "Jan"

# Time window of the export, expressed in Vietnam local time.
# pytz zones must be attached with localize(): passing a pytz zone through
# ``tzinfo=`` silently uses the zone's historical local-mean-time offset
# (roughly +07:07 for Asia/Ho_Chi_Minh) instead of +07:00, which shifts both
# window bounds by several minutes.
_LOCAL_TZ = timezone('Asia/Ho_Chi_Minh')
START_DATE = _LOCAL_TZ.localize(datetime(2024, 1, 1))
END_DATE = _LOCAL_TZ.localize(datetime(2024, 2, 1))

# PostgreSQL connection settings, read from the environment.
# Each value falls back to an empty string when the variable is not set.
db_host = os.getenv("DB_HOST", "")
db_name = os.getenv("DB_SCHEMA", "")
db_user = os.getenv("DB_USER", "")
db_password = os.getenv("DB_PASSWORD", "")

# S3 location: bucket name from the environment, fixed key prefix.
s3_bucket_name = os.getenv("S3_BUCKET_NAME", "")
s3_folder_prefix = "sbt_invoice"

# S3 credentials, read from the environment (empty string when unset).
access_key = os.getenv("S3_ACCESS_KEY", "")
secret_key = os.getenv("S3_SECRET_KEY", "")

# NOTE: rows are selected by creation time (START_DATE / END_DATE above); there is no request-ID filter.

# Export every fwd_api_subscriptionrequest row created inside the
# [START_DATE, END_DATE) window to "<OUTPUT_NAME>.csv", header row first.

# Connect to the PostgreSQL database
conn = psycopg2.connect(
    host=db_host,
    database=db_name,
    user=db_user,
    password=db_password
)

# Define the CSV file path
csv_file_path = f'{OUTPUT_NAME}.csv'

# try/finally guarantees the cursor and the connection are released even when
# the query or the CSV write fails (psycopg2's ``with conn`` only ends the
# transaction, it does not close the connection).
try:
    # Create a cursor
    cursor = conn.cursor()
    try:
        # Half-open interval: the upper bound is exclusive so that a row
        # stamped exactly at END_DATE is not exported twice when consecutive
        # windows (e.g. January, then February) are crawled.
        query = (
            "SELECT * FROM fwd_api_subscriptionrequest "
            "WHERE created_at >= %s AND created_at < %s"
        )
        cursor.execute(query, (START_DATE, END_DATE))

        # Fetch the filtered data
        data = cursor.fetchall()

        # Write the data to the CSV file. The encoding is pinned to UTF-8 so
        # non-ASCII values do not depend on the platform default encoding.
        with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow([desc[0] for desc in cursor.description])  # Write column headers
            writer.writerows(data)  # Write the filtered data rows
    finally:
        cursor.close()
finally:
    conn.close()

# # Download folders from S3
# s3_client = boto3.client(
#     's3',
#     aws_access_key_id=access_key,
#     aws_secret_access_key=secret_key
# )

# request_ids = []
# for rq in data:
#     rq_id = rq[3]
#     request_ids.append(rq_id)

# for request_id in tqdm(request_ids):
#     folder_key = f"{s3_folder_prefix}/{request_id}/"  # Assuming folder structure like: s3_bucket_name/s3_folder_prefix/request_id/
#     local_folder_path = f"{OUTPUT_NAME}/{request_id}/"  # Path to the local folder to save the downloaded files
#     os.makedirs(OUTPUT_NAME, exist_ok=True)
#     os.makedirs(local_folder_path, exist_ok=True)


#     # List objects in the S3 folder
#     response = s3_client.list_objects_v2(Bucket=s3_bucket_name, Prefix=folder_key)
#     objects = response.get('Contents', [])

#     for s3_object in objects:
#         object_key = s3_object['Key']
#         local_file_path = local_folder_path + object_key.split('/')[-1]  # Extracting the file name from the object key

#         # Download the S3 object to the local file
#         s3_client.download_file(s3_bucket_name, object_key, local_file_path)