{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# !pip install -r /mnt/hdd4T/TannedCung/OCR/sbt-idp/cope2n-api/requirements.txt\n",
"# !pip install matplotlib tqdm python-dotenv"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# import os\n",
"# from dotenv import load_dotenv\n",
"\n",
"# load_dotenv(\"/mnt/hdd4T/TannedCung/OCR/sbt-idp/.env_prod\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# !export DB_ENGINE=django.db.backends.postgresql_psycopg2\n",
"# !export DB_SCHEMA=sbt_prod_20240422_1\n",
"# !export DB_USER=postgres\n",
"# !export DB_PASSWORD=TannedCung\n",
"# !export DB_HOST=db-sbt\n",
"# !export DB_PUBLIC_PORT=5432\n",
"# !export DB_INTERNAL_PORT=5432"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import sys\n",
"import os\n",
"import numpy as np\n",
"from tqdm import tqdm\n",
"import datetime\n",
"\n",
"current_dir = os.getcwd()\n",
"parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))\n",
"sys.path.append(parent_dir)\n",
"\n",
"import django\n",
"from django.db import models\n",
"from django.db.models import Q\n",
"from django.utils import timezone\n",
"from asgiref.sync import sync_to_async\n",
"from fwd import settings\n",
"os.environ.setdefault(\"DJANGO_SETTINGS_MODULE\", \"fwd.settings\")\n",
"os.environ[\"DJANGO_ALLOW_ASYNC_UNSAFE\"] = \"true\"\n",
"django.setup()\n",
"from fwd_api.models.SubscriptionRequest import SubscriptionRequest\n",
"from utils.processing_time import cost_profile, backend_cost\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"start_date_str = \"2024-05-25T00:00:00+0800\"\n",
"end_date_str = \"2024-06-24T00:00:00+0800\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"@sync_to_async\n",
"def query_data(start_date_str, end_date_str):\n",
" base_query = Q(status=200)\n",
" start_date = timezone.datetime.strptime(start_date_str, '%Y-%m-%dT%H:%M:%S%z') # We care only about day precision only\n",
" end_date = timezone.datetime.strptime(end_date_str, '%Y-%m-%dT%H:%M:%S%z')\n",
" # start_date = timezone.make_aware(start_date)\n",
" # end_date = timezone.make_aware(end_date)\n",
" base_query &= Q(redemption_id__startswith=\"AU\")\n",
" base_query &= Q(created_at__range=(start_date, end_date))\n",
" subscription_requests = SubscriptionRequest.objects.filter(base_query).order_by('created_at')\n",
" return subscription_requests\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"subscription_requests = await query_data(start_date_str, end_date_str)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def process(requests):\n",
" result_by_file = {\"queue\": [],\n",
" \"inference\": [],\n",
" \"postprocessing\": [],\n",
" \"created\": []}\n",
" result_by_file_type = {\"imei\": {\"queue\": [],\n",
" \"inference\": [],\n",
" \"postprocessing\": [],\n",
" \"created\": []},\n",
" \"invoice\": {\"queue\": [],\n",
" \"inference\": [],\n",
" \"postprocessing\": [],\n",
" \"created\": []}}\n",
" result_by_request = {\"backend_cost\": [],\n",
" \"number_of_file\": [],\n",
" \"request_cost\": [],\n",
" \"created\": []}\n",
" for request in tqdm(requests):\n",
" if not request.ai_inference_profile:\n",
" continue\n",
" result_by_request[\"created\"].append(request.created_at.timestamp())\n",
" result_by_request[\"number_of_file\"].append(request.pages)\n",
" result_by_request[\"backend_cost\"].append(backend_cost(request.created_at, request.ai_inference_start_time))\n",
" result_by_request[\"request_cost\"].append(request.ai_inference_time)\n",
"\n",
" for key in request.ai_inference_profile.keys():\n",
" profile = cost_profile(request.ai_inference_start_time, request.ai_inference_profile[key])\n",
" result_by_file[\"queue\"].append(profile[\"queue\"])\n",
" result_by_file[\"inference\"].append(profile[\"inference\"])\n",
" result_by_file[\"postprocessing\"].append(profile[\"postprocessing\"])\n",
" result_by_file[\"created\"].append(request.ai_inference_start_time)\n",
" if key.split(\"_\")[0] in [\"imei\", \"invoice\"]:\n",
" result_by_file_type[key.split(\"_\")[0]][\"queue\"].append(profile[\"queue\"])\n",
" result_by_file_type[key.split(\"_\")[0]][\"inference\"].append(profile[\"inference\"])\n",
" result_by_file_type[key.split(\"_\")[0]][\"postprocessing\"].append(profile[\"postprocessing\"])\n",
" result_by_file_type[key.split(\"_\")[0]][\"created\"].append(request.ai_inference_start_time)\n",
"\n",
" \n",
" return result_by_request, result_by_file, result_by_file_type\n",
"\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"result_by_request, result_by_file, result_by_file_type = process(subscription_requests)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"frame_by_file = pd.DataFrame(result_by_file)\n",
"frame_by_request = pd.DataFrame(result_by_request)\n",
"frame_by_imei = pd.DataFrame(result_by_file_type[\"imei\"])\n",
"frame_by_invoice = pd.DataFrame(result_by_file_type[\"invoice\"])\n",
"\n",
"frame_by_request[\"image_avg_cost\"] = (frame_by_request[\"backend_cost\"] + frame_by_request[\"request_cost\"])/frame_by_request[\"number_of_file\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"frame_by_file.set_index('created', inplace=True)\n",
"frame_by_request.set_index('created', inplace=True)\n",
"frame_by_imei.set_index('created', inplace=True)\n",
"frame_by_invoice.set_index('created', inplace=True)"
]
},
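{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check, a minimal sketch: summarize the per-request cost columns built above\n",
"# (assumes the frames were populated without errors and the values are in seconds).\n",
"frame_by_request[[\"backend_cost\", \"request_cost\", \"image_avg_cost\", \"number_of_file\"]].describe()"
]
},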
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def to_datetime(timestamp):\n",
" # Convert the timestamp to a datetime object\n",
" dt = datetime.datetime.fromtimestamp(timestamp)\n",
"\n",
" # Format the datetime object as YYYY-MM-DD\n",
" formatted_date = dt.strftime('%Y-%m-%d')\n",
" return formatted_date"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def plot_trend(x, y, title, outliner_threah = 95, num_bins=200):\n",
" y = y[x>=0]\n",
" x = x[x>=0]\n",
" if outliner_threah:\n",
" # Calculate z-scores\n",
" z_scores = np.abs((y - np.mean(y)) / np.std(y))\n",
"\n",
" # Determine the threshold based on the desired percentage of inliers\n",
" threshold = np.percentile(z_scores, outliner_threah)\n",
"\n",
" # Filter out outliers\n",
" filtered_x = x[z_scores <= threshold]\n",
" filtered_y = y[z_scores <= threshold]\n",
" else:\n",
" filtered_x = x\n",
" filtered_y = y\n",
"\n",
" # Compute the histogram\n",
" if num_bins:\n",
" hist, bin_edges = np.histogram(filtered_x, bins=num_bins)\n",
" # Compute the average value for each bin\n",
" bin_avg = np.zeros(num_bins)\n",
" for i in range(num_bins):\n",
" bin_values = filtered_y[(filtered_x >= bin_edges[i]) & (filtered_x < bin_edges[i + 1])]\n",
" bin_avg[i] = np.mean(bin_values)\n",
" nan_mask = np.isnan(bin_avg)\n",
" edges = bin_edges[:-1][~nan_mask]\n",
" bin_avg = bin_avg[~nan_mask]\n",
" else:\n",
" bin_avg = filtered_y\n",
" edges = filtered_x\n",
"\n",
"\n",
" average = np.mean(y)\n",
" highest = np.max(y)\n",
" lowest = np.min(y)\n",
"\n",
" date_time = []\n",
" for e in edges:\n",
" date_time.append(to_datetime(e))\n",
" plt.plot(edges, bin_avg)\n",
" # plt.plot(filtered_x, filtered_y)\n",
" plt.axhline(average, color='orange', linestyle='--', label='Average')\n",
" plt.text(x[-1], average, f'Avg: {average:.2f}', ha='right', va='center')\n",
"\n",
" plt.axhline(highest, color='red', linestyle='--', label='Highest')\n",
" plt.text(x[-1], highest, f'High: {highest:.2f}', ha='right', va='center')\n",
"\n",
" plt.axhline(lowest, color='green', linestyle='--', label='Lowest')\n",
" plt.text(x[-1], lowest, f'Avg: {lowest:.2f}', ha='right', va='center')\n",
" plt.xlabel('Timestamp')\n",
" plt.ylabel('Processing Time (seconds)')\n",
" plt.title(title)\n",
" plt.xticks(rotation=45)\n",
" plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plot_trend(frame_by_request.index, frame_by_request[\"image_avg_cost\"], \"Image average cost\", outliner_threah=95, num_bins=30)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plot_trend(frame_by_request.index, frame_by_request[\"backend_cost\"], \"Backend cost Trend\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plot_trend(frame_by_request.index, frame_by_request[\"request_cost\"], \"Request_cost Trend\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plot_trend(frame_by_request.index, frame_by_request[\"number_of_file\"], \"Files in a request Trend\", outliner_threah=None, num_bins=30)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plot_trend(frame_by_file.index, frame_by_file[\"postprocessing\"], \"AI postprocessing Trend\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plot_trend(frame_by_imei.index, frame_by_imei[\"inference\"], \"IMEI Inference Trend\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plot_trend(frame_by_invoice.index, frame_by_invoice[\"inference\"], \"Invoice Inference Trend\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plot_trend(frame_by_file.index, frame_by_file[\"inference\"], \"AI inference Trend\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plot_trend(frame_by_file.index, frame_by_file[\"queue\"], \"AI queuing Trend\")\n"
]
},
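{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch: median and 95th-percentile per-file timings for each AI stage, to put\n",
"# numbers on the trends plotted above (assumes the timing columns hold numeric seconds).\n",
"frame_by_file[[\"queue\", \"inference\", \"postprocessing\"]].quantile([0.5, 0.95])"
]
},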
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def split_timestamps_by_day(timestamps):\n",
" \"\"\"\n",
" Splits a list of timestamps into a list of lists, where each inner list contains the timestamps for a single day.\n",
" \n",
" Args:\n",
" timestamps (list): A list of timestamp values.\n",
" \n",
" Returns:\n",
" list: A list of lists, where each inner list contains the timestamps for a single day.\n",
" \"\"\"\n",
" # Convert timestamps to datetime objects\n",
" datetimes = [pd.Timestamp(t, unit='s') for t in timestamps]\n",
" \n",
" # Create a DataFrame with the datetime objects\n",
" df = pd.DataFrame({'timestamp': datetimes})\n",
" \n",
" # Group the DataFrame by day and collect the timestamps for each day\n",
" timestamps_by_day = []\n",
" for _, group in df.groupby(pd.Grouper(key='timestamp', freq='D')):\n",
" day_timestamps = [t.timestamp() for t in group['timestamp']]\n",
" timestamps_by_day.append(day_timestamps)\n",
" \n",
" return timestamps_by_day"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def visualize_requests_by_time(timestamps_by_day):\n",
" \"\"\"\n",
" Generates a histogram to visualize the number of requests received by time.\n",
" \n",
" Args:\n",
" timestamps (list): A list of timestamps (in seconds) representing the time when each request was received.\n",
" \"\"\"\n",
" num_days = len(timestamps_by_day)\n",
" fig, axes = plt.subplots(nrows=int(np.ceil(num_days / 2)), ncols=2, figsize=(90, 450))\n",
" for i, day_timestamps in enumerate(timestamps_by_day):\n",
" row = i // 2\n",
" col = i % 2\n",
" \n",
" day_timestamps = [pd.Timestamp(t, unit='s') for t in day_timestamps]\n",
" # Get the current axis\n",
" ax = axes[row, col] if num_days > 1 else axes\n",
" \n",
" # Plot the histogram for the current day\n",
" ax.hist(pd.to_datetime(day_timestamps), bins=60*24, edgecolor='black')\n",
" # ax.xticks(rotation=45)\n",
" ax.set_title(f\"Day {i+1}\")\n",
" ax.set_xlabel(\"Time\")\n",
" ax.set_ylabel(\"Count\")\n",
"\n",
" ax2 = ax.twinx()\n",
" ax2.set_ylabel(\"Time Process (s)\")\n",
" ax2.set_ylim(min(day_timestamps), max(day_timestamps))\n",
" # Convert timestamps to datetime objects\n",
" # timestamps = [pd.Timestamp(t, unit='s') for t in timestamps]\n",
" \n",
" # # Create a histogram with 24 bins (one for each hour of the day)\n",
" # plt.figure(figsize=(12, 6))\n",
" # # plt.hist(pd.to_datetime([t.hour for t in timestamps], unit='h'), bins=24, edgecolor='black')\n",
" # plt.hist(pd.to_datetime(timestamps), bins=1000, edgecolor='black')\n",
" \n",
" # Set x-axis labels to show the hour of the day\n",
" # x_labels = [f'{i:02d}:00' for i in range(24)]\n",
" # plt.xticks(np.arange(0, 24), x_labels, rotation=45)\n",
" # plt.xticks(rotation=45)\n",
" # plt.xlabel('Hour of the Day')\n",
" # plt.ylabel('Number of Requests')\n",
" # plt.title('Requests Received by Time of Day')\n",
" # plt.show()\n",
" plt.suptitle(\"Distribution of Requests by Time\")\n",
" plt.tight_layout()\n",
" plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# timestamps_by_day=split_timestamps_by_day(result_by_file[\"created\"])\n",
"# visualize_requests_by_time(timestamps_by_day)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}