sbt-idp/scripts/crawl_database_by_time.py

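"""Export fwd_api_subscriptionrequest rows created inside a given
Singapore-time window to a CSV file, then download each request's
files from S3 into a local folder named after the export."""
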
import csv
import psycopg2
import boto3
import os
from tqdm import tqdm
from datetime import datetime
import pytz
from dotenv import load_dotenv
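# ../.env_prod is expected to provide DB_HOST, DB_SCHEMA, DB_USER,
# DB_PASSWORD, S3_BUCKET_NAME, S3_ACCESS_KEY and S3_SECRET_KEY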
load_dotenv("../.env_prod")

tz = pytz.timezone('Asia/Singapore')

OUTPUT_NAME = "Feb29"
# Localize the naive datetimes with pytz directly; this keeps the script
# standalone instead of pulling in Django just for timezone.make_aware()
START_DATE = tz.localize(datetime(2024, 2, 29))
END_DATE = tz.localize(datetime(2024, 3, 1))

# Database connection details
db_host = os.environ.get('DB_HOST', "")
db_name = os.environ.get('DB_SCHEMA', "")
db_user = os.environ.get('DB_USER', "")
db_password = os.environ.get('DB_PASSWORD', "")
# S3 bucket details
s3_bucket_name = os.environ.get('S3_BUCKET_NAME', "")
s3_folder_prefix = 'sbt_invoice'
# S3 access credentials
access_key = os.environ.get('S3_ACCESS_KEY', "")
secret_key = os.environ.get('S3_SECRET_KEY', "")
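# Any variable missing from the environment falls back to an empty string,
# so a misconfigured .env surfaces as a connection/auth error below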

# Connect to the PostgreSQL database
conn = psycopg2.connect(
    host=db_host,
    database=db_name,
    user=db_user,
    password=db_password,
)
# Create a cursor
cursor = conn.cursor()
# Execute the SELECT query with the filter
# The upper bound is exclusive so the export covers exactly one local day
# and a row created at midnight never lands in two daily exports
query = "SELECT * FROM fwd_api_subscriptionrequest WHERE created_at >= %s AND created_at < %s"
cursor.execute(query, (START_DATE, END_DATE))
# Fetch the filtered data
data = cursor.fetchall()
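# fetchall() pulls every matching row into memory; acceptable for a
# single day's export, but worth noting for wider date ranges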
# Define the CSV file path
csv_file_path = f'{OUTPUT_NAME}.csv'
# Write the data to the CSV file
with open(csv_file_path, 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow([desc[0] for desc in cursor.description])  # Write column headers
    writer.writerows(data)  # Write the filtered data rows
# Close the cursor and database connection
cursor.close()
conn.close()

# Download folders from S3
s3_client = boto3.client(
    's3',
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
)
# Column 3 of each row is assumed to be the request id that names the
# request's folder under the S3 prefix
request_ids = [rq[3] for rq in data]
for request_id in tqdm(request_ids):
    # Folder structure assumed: s3_bucket_name/s3_folder_prefix/request_id/
    folder_key = f"{s3_folder_prefix}/{request_id}/"
    # Local folder for the downloaded files; makedirs also creates OUTPUT_NAME
    local_folder_path = f"{OUTPUT_NAME}/{request_id}/"
    os.makedirs(local_folder_path, exist_ok=True)

    # List objects in the S3 folder, paginating in case a request folder
    # holds more than one page (1,000 keys) of objects
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=s3_bucket_name, Prefix=folder_key):
        for s3_object in page.get('Contents', []):
            object_key = s3_object['Key']
            if object_key.endswith('/'):
                continue  # Skip zero-byte "folder" placeholder keys
            # Extract the file name from the object key
            local_file_path = local_folder_path + object_key.split('/')[-1]

            # Download the S3 object to the local file
            s3_client.download_file(s3_bucket_name, object_key, local_file_path)
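
# Usage: adjust OUTPUT_NAME, START_DATE and END_DATE above, then run from
# the scripts/ directory so ../.env_prod resolves:
#   python crawl_database_by_time.py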