sbt-idp/scripts/crawl_database.py

271 lines
12 KiB
Python
Raw Permalink Normal View History

2024-01-03 04:07:58 +00:00
import csv
import psycopg2
import boto3
import os
from tqdm import tqdm
2024-01-05 07:18:16 +00:00
from dotenv import load_dotenv
load_dotenv("../.env_prod")
2024-03-18 06:04:05 +00:00
OUTPUT_NAME = "possible_test"
2024-01-03 04:07:58 +00:00
# Database connection details
db_host = os.environ.get('DB_HOST', "")
db_name = os.environ.get('DB_SCHEMA', "")
db_user = os.environ.get('DB_USER', "")
db_password = os.environ.get('DB_PASSWORD', "")
# S3 bucket details
s3_bucket_name = os.environ.get('S3_BUCKET_NAME', "")
s3_folder_prefix = 'sbt_invoice'
# S3 access credentials
access_key = os.environ.get('S3_ACCESS_KEY', "")
secret_key = os.environ.get('S3_SECRET_KEY', "")
# Request IDs for filtering
request_ids = [
2024-03-18 06:04:05 +00:00
'SAP_20240105150247_cd45453e440c41068c8aa5ce804c6ac5',
'SAP_20240111145530_219e783311404e12bb8b14a26eec804b',
'SAP_20240111143618_a2bfed95f0624f65a668309be72251d4',
'SAP_20240206142539_9445f405c2a144f2adb43cb70fc66030',
'SAP_20240209133540_2e4edf0837e142dca7062d370d359116',
'SAP_20240209170013_0cfbd55689bc4bb18029883570987395',
'SAP_20240130145752_399dfa7936e34471b54571f7a425ced0',
'SAP_20240203101503_fa876d1a323b4199a5e3af7d56d0d0f1',
'SAP_20240206201610_411c4a42c6bc4a0bb3c23cf00a44bbf4',
'SAP_20240130151625_4c06ab2728304c9f873e3f6a8cb1cc6a',
'SAP_20240130153633_c911feda8307419dbe21420c2599bf12',
'SAP_20240203114112_13dcee9f08b244f6ab8440bca2993d8b',
'SAP_20240207213859_80b1fe039eca440696cb222ba7d58244',
'SAP_20240128114117_0a352b8f90034800bd460ec0b676bd71',
'SAP_20240130182943_385335a6c7d046b08b9ba19c9733c45f',
'SAP_20240130185043_4bc097bccaa243929b0e0f3e679607ff',
'SAP_20240130191730_1b0169695d724f4883021cf831e3fb80',
'SAP_20240127095627_ca5483e31c04418f9d8e27c75bf3017b',
'SAP_20240127080133_ba41c7301cb742c097576b7a033f714c',
'SAP_20240127080912_79b186e7075d4adbb2f36729909eafa7',
'SAP_20240127081511_5a1593167da84e6abc7699a3ad5a8917',
'SAP_20240202095744_c22254156de143f9b9288d12301015ae',
'SAP_20240127143437_b5f16f906fb3401f8d56233c6d357291',
'SAP_20240130203642_163aa3f319a340a4a4b118b99e708a0f',
'SAP_20240130221931_bf4245a4efb04f31b9b001c240d73bb5',
'SAP_20240130222513_b6b1a0b81b9c4be0a9f859c2d6758589',
'SAP_20240130223545_8e43ce0bb37b49a3bfc012d76c2861fd',
'SAP_20240130232330_a44faa31a2314e2ead7a0aabcc8f22c6',
'SAP_20240130233355_77f4d20c4b554b39bc1045c19213ec3b',
'SAP_20240130204736_0fbefc13ea8542289c2fa67d4f36f270',
'SAP_20240129194345_ff0578a42fd34ed0969d1b83abccb160',
'SAP_20240131094443_77ce08812b5045c497d9b2a05a23da7b',
'SAP_20240130210829_93c0bf0277cf41ac96ae9eadb31e61d4',
'SAP_20240131114352_5b01f9fe496f48bea8b1511ad29c8958',
'SAP_20240201113203_f3c964e2a44841ee88dab86b8bdd6c59',
'SAP_20240131135943_f5b89290a0e743bfa192a29a4dfba3d0',
'SAP_20240131141657_af0db06da4194bb596018135d0aad6ba',
'SAP_20240131153605_3f6f2431b0e04c979d2b102cba3a92d0',
'SAP_20240131162709_01b2db786d824f3abf3d51efe1f14906',
'SAP_20240131172119_f123f0f43c184fe5ab6ac94e87762d40',
'SAP_20240131175029_7c6b587a59624b62aad72325d64a27f5',
'SAP_20240206203220_6c815ef66d214635bc14e0affa425e22',
'SAP_20240130131509_edaadcc6e82b474ba6888662d64ae5f5',
'SAP_20240130183515_a7dc68c51cd14694a6a0190d5896af3a',
'SAP_20240131225503_9a91f1e995904b5cacf795e91d7de1f3',
'SAP_20240131080747_80c2b0701bc44ad2acce0e278069db9e',
'SAP_20240201085824_0b22f093f45c47abab7298c8567a0187',
'SAP_20240203155516_1f84eac8680841169fb11a202896181f',
'SAP_20240131130549_05ed8fb82d384e7e97daa66af70d11f5',
'SAP_20240201130640_0b9d4ac0c28b45f783110b39f5ec7d3f',
'SAP_20240131225955_7221e7854d1646ed914fdada7a1a88af',
'SAP_20240201000050_20963ca447cd4a0db15f2c29f74c7e39',
'SAP_20240201110455_f5179cbb464e42a0b8e6e119813f6469',
'SAP_20240201113407_6225cb07f56b4bd8bf4ec1e1b62df7db',
'SAP_20240206150912_82c8cfedd5ae4b55ab8bb4c1ec8b6303',
'SAP_20240201153138_115d71aba51340b09a057f7ca2403b0a',
'SAP_20240201173238_8858ef3098c94c619890dfaa2d2f1b0c',
'SAP_20240201214111_670687c1bc984e4caf5f06de8e3aaf36',
'SAP_20240201214838_d62c8bcfbafc410f98629bf28060ae34',
'SAP_20240207142000_0090e42e0bec4643acf2c4bdd311f6d0',
'SAP_20240201012315_b3c3a77e558b4b47b48a0c6756fdd1dc',
'SAP_20240201115010_21f021971f0a4dff861b343d7d1645f2',
'SAP_20240201163135_5495e7cf28fb4a1e838c55f587869706',
'SAP_20240201164329_05c40987df7f4451ab62e58dd1d05d4b',
'SAP_20240130193934_379de93d3501496e94bd02ac994a2d31',
'SAP_20240201215219_7c4c7cbc3636440f888c3d85cd1e17a7',
'SAP_20240202104415_e4389d6576c9405e8b2cc5d8392502be',
'SAP_20240202113205_fbae574e17f24cd2a3dafd51bd9279e2',
'SAP_20240206120308_7d1924951da347c4a4c336a6dc662b7d',
'SAP_20240203201702_677762cdb5bc484bbef3bd945355f2f3',
'SAP_20240201000252_d7d23365a3334f58bfe02d865b261532',
'SAP_20240201000514_97ef0eeeb538463ba1abcf1bfa66110c',
'SAP_20240201005704_21be390b7e6c43129fdb9cf7022f8c85',
'SAP_20240203000154_8ff7effa54264bd5ad551a6917b03cf0',
'SAP_20240203133112_12719d7ba12f42cca731877e3517d19f',
'SAP_20240201114044_3da4f899c4e241b59eea8ecd7ddeea55',
'SAP_20240201114519_cbc54d32f65f47eabe6d6ac4a6cbef3a',
'SAP_20240201121617_d294849ddb1d4a5b8acb26b35d1d7c56',
'SAP_20240201125936_70fcf9ea03ef4ef19b3e822e3e5b4ed0',
'SAP_20240204133921_28de5f72a5294ca88202d01874ddb898',
'SAP_20240204134555_6252a7f3d3584d1b9fef50f9f882bb78',
'SAP_20240204135801_696e5acff18f4fd99067b3d9cc13acc8',
'SAP_20240204184026_0095125a337a45399c6627443fb2809c',
'SAP_20240204202837_8ecf60184d714fa7b74352b301897d8e',
'SAP_20240204203505_edd7ac14d5f54529978ac4e1cac98275',
'SAP_20240204214531_59af28a5eb7a4a219e0327b5b5ac4198',
'SAP_20240202203759_efcbd8e5c4e14e4db1885f1a2b455698',
'SAP_20240207155502_8a8281d4e0e94e42921550248c1c9253',
'SAP_20240205104502_5d4d1435fa964d918db245bac7f29825',
'SAP_20240205122344_051e356d6e47434d81b117b3b716df7e',
'SAP_20240203220754_948aecccf29c4ee4b38bb31d1866f7c8',
'SAP_20240201231307_71dc1557e46248f2921f544c160a50bc',
'SAP_20240205142554_192372a311544fd2b0d8e12330d3e131',
'SAP_20240201231409_293633e97745457e80f7e22984b967ef',
'SAP_20240205152002_06515566f15041988718e1cc2279376f',
'SAP_20240202103100_3f2bf5c5397b4adeafe66369dfd406e1',
'SAP_20240204140541_04c3647483ca45dbbb2d3698aa2a2f9e',
'SAP_20240202114015_dab36ad3f6d94fc09360ced2e7303098',
'SAP_20240203214357_f1809ee448394419b45dc577b599d9f8',
'SAP_20240209131241_7042d1b45b3447999e713e92d08f275e',
'SAP_20240205105355_61830207b0874ba9a2e4bdb8b5d2f23f',
'SAP_20240130150814_bcbd1cb941244fd888e89de91910b0b9',
'SAP_20240205150922_92b3cd37ed414363846ef97eca6b767e',
'SAP_20240205104327_add42791a9e04769814f90a612ebe325',
'SAP_20240206142303_be619241683949da8130171c6ab944c7',
'SAP_20240205200647_2a4afa2cc24b4378a209553746920985',
'SAP_20240206005021_9635b064af604ff6b4f5780b9beb1c3f',
'SAP_20240208190424_97a6ec6e60204ed2a08af99057ed46bb',
'SAP_20240206010454_f70bbe6ec0d345dc9ecfd0a34db9cf0c',
'SAP_20240206142856_9293e55ecd28499580dc6267f909a282',
'SAP_20240207144719_b870eed54f3f42eeb3ef136aa91fec19',
'SAP_20240208104533_f8576e640af04ec8a139e469ca9745a5',
'SAP_20240206154445_b0464949fd7341ab91dc248e49f7ccd9',
'SAP_20240208224819_e5b10a142d4841dcb7f67827ccd2b9f6',
'SAP_20240115152747_d4d73d7dc03e4dbbbc8034dce7b45c1d',
'SAP_20240203133442_3b2b1af9cc2a469cae7fb84190e49e0f',
'SAP_20240207201452_142d0db8532248dcb863e6323375fa2e',
'SAP_20240207203540_50e31f5ba9094d88a707cf8e906c51cf',
'SAP_20240125114600_231b3c239d0f408fbcbc97a975b6a286',
'SAP_20240125134355_b575051032ce42a2bd9a4f7a0825a9ba',
'SAP_20240125145604_de6d55acae44488f94afe7ba49868c35',
'SAP_20240208222636_7a65cfc778fa4ff6be8a75042eac3549',
'SAP_20240208231626_8a39b7c36c6845a0ac264a08da4f596f',
'SAP_20240204135652_eacb604d6da1447ab6e0195419d4cbfb',
'SAP_20240207155340_6b30a07dd3ba41baa3d1902dcc1f37a6',
'SAP_20240205183046_711fb8da646c45bba4ce7789a7cfcd49',
'SAP_20240201112759_b6ec8e1c0b45495a90c22d5a9305ceb5',
'SAP_20240204184514_3e4b9b6e10cb4ce3af14f25e4873f009',
'SAP_20240205213153_e98fa3147643494b9e2de32e4c3382e3',
'SAP_20240205173852_f2b3b98fb28b43cdac2cab9bb35170a1',
'SAP_20240129153233_367de58c8b504d0bbc5b5d71a48442e1',
'SAP_20240206202807_bbfdd9dae29a42d59f5b69ca1d6c091f',
'SAP_20240203194418_ade51ce92c2a463fbec3ae75cbae6b0e',
'SAP_20240203214100_b245fb7d115a4842a77534084f0544c7',
'SAP_20240207142500_71cbd6cfdd984baca36e953904a09726',
'SAP_20240207143331_d583e47af987422ba07b76dcdda7cb57',
'SAP_20240125114657_582106f68a9a4c0a8a89beffa1a4ad2a',
'SAP_20240130073108_c2b438564c534798babe7fc7dc4dabc0',
'SAP_20240205122103_f62bce47060f4077875a2e891a31c3ac',
'SAP_20240131150338_c602f037b83d4862a92dd8f582246a08',
'SAP_20240207135750_c25b4a95ce434ad2ae195817c42bc332',
'SAP_20240128114401_c8ef9be4fe604f19bbed477e84b3bb0d',
'SAP_20240203183625_8a282ed896ea48d6b565c12edbb4149e',
'SAP_20240130093313_a273212ddc8741fea13c121cbfe56b41',
'SAP_20240130094904_13ccfd9311b34bc89e8f6f17e84a9823',
'SAP_20240129113741_a9ff76f80b77491790d8dcbb80e7404b',
'SAP_20240131212309_64f9623dfd0440ba862457a5ce43baa6',
'SAP_20240207141147_a5d526e312a1474caddbe81f2c9653e7',
'SAP_20240130131627_d22774c04918466b8eca12db9df9a4b0',
'SAP_20240130131732_cc8747a1754b4dadacd85d0f221d482c',
'SAP_20240201102500_b10ffe3e4d384eb48727e88b9a04cb69',
'SAP_20240201103923_cabc4189b8ed4e79b1dd5c0af1b719a0',
'SAP_20240130210435_5a330b99a3b3470e81fde0be7a87e21c',
'SAP_20240130220948_f580aad038c6449eace07e381b69d923',
'SAP_20240130082854_02c13ded93ba434c9ca7028805f7e870',
'SAP_20240208083548_ccfd1889cd6e4a6abbc85616ee50fd43',
'SAP_20240208141202_35ba5fb5480e4180b97109395fb82210',
'SAP_20240208165449_250444df8c91401ab7c6487969e10a87',
'SAP_20240204184848_4ed9674a9cff4b7d9798392655e7ad0c',
'SAP_20240208211623_52693b1ca6c34cd4a86096eeae6492c6',
'SAP_20240207140158_389feecc33af4280b1eaa6d2f1f1f3af',
'SAP_20240207140304_49f05af86b42464790ae752231ae74c9',
'SAP_20240129203245_4d7fbc485cbf448783ca7e5931c28544',
'SAP_20240207150318_9779a58bed9847f980d1d18e8bf0d1af',
'SAP_20240207151949_cb25425b6fd04e6f8f24d53d24d1d5e4',
'SAP_20240129234649_79af35670f8c4db8be1c40a1007a7df3',
'SAP_20240207135511_bfd85fcdb55947848cc566dc87f390cf',
'SAP_20240207143245_0e5ceb9737174404b0f48c9a012b485e',
'SAP_20240129194208_443d0c7e86404e33a10bb5a26a8b67a6',
'SAP_20240130083633_4ee6ab512a6f40af8771ec35d5a93293',
'SAP_20240131081559_a8344a059cdc4363ad641fa76185d6a6',
'SAP_20240207141507_01e78eea090649daaf1827053191acf6',
'SAP_20240207142012_3558b44105b44fe6a5e805803a9d5aad',
'SAP_20240130154409_78a36295728a4df28983b1b55432515d',
'SAP_20240207193406_ea2d379e900649aba8d066749537ac2d',
'SAP_20240207212515_c04427ed27af413298ef131956996a9c',
'SAP_20240205125127_cbe5503d09774b12904b0f9e0665a51e',
'SAP_20240209154036_3bac79b974974afc8f77189d7748935d',
'SAP_20240204135604_80a5109a5f5b46618ff1df1bee8dbe11',
'SAP_20240208211217_0db522e3a44145f9aa0ea6e855fbf38a',
'SAP_20240208105747_3117b91ff2f84b7ca6ebd80a800b42ea',
'SAP_20240204140735_371a26579ebc455f930a2ca638a3f7fd',
2024-01-03 04:07:58 +00:00
]
# Connect to the PostgreSQL database
conn = psycopg2.connect(
host=db_host,
database=db_name,
user=db_user,
password=db_password
)
# Create a cursor
cursor = conn.cursor()
# Generate the placeholder string for the IN statement
placeholders = ','.join(['%s'] * len(request_ids))
# Execute the SELECT query with the filter
query = f"SELECT * FROM fwd_api_subscriptionrequest WHERE request_id IN ({placeholders})"
cursor.execute(query, request_ids)
# Fetch the filtered data
data = cursor.fetchall()
# Define the CSV file path
csv_file_path = f'{OUTPUT_NAME}.csv'
# Write the data to the CSV file
with open(csv_file_path, 'w', newline='') as csv_file:
writer = csv.writer(csv_file)
writer.writerow([desc[0] for desc in cursor.description]) # Write column headers
writer.writerows(data) # Write the filtered data rows
# Close the cursor and database connection
cursor.close()
conn.close()
# Download folders from S3
s3_client = boto3.client(
's3',
aws_access_key_id=access_key,
aws_secret_access_key=secret_key
)
for request_id in tqdm(request_ids):
folder_key = f"{s3_folder_prefix}/{request_id}/" # Assuming folder structure like: s3_bucket_name/s3_folder_prefix/request_id/
local_folder_path = f"{OUTPUT_NAME}/{request_id}/" # Path to the local folder to save the downloaded files
os.makedirs(OUTPUT_NAME, exist_ok=True)
os.makedirs(local_folder_path, exist_ok=True)
# List objects in the S3 folder
response = s3_client.list_objects_v2(Bucket=s3_bucket_name, Prefix=folder_key)
objects = response.get('Contents', [])
for s3_object in objects:
object_key = s3_object['Key']
local_file_path = local_folder_path + object_key.split('/')[-1] # Extracting the file name from the object key
# Download the S3 object to the local file
s3_client.download_file(s3_bucket_name, object_key, local_file_path)