{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# !pip install -r /mnt/hdd4T/TannedCung/OCR/sbt-idp/cope2n-api/requirements.txt\n", "# !pip install matplotlib tqdm python-dotenv" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# import os\n", "# from dotenv import load_dotenv\n", "\n", "# load_dotenv(\"/mnt/hdd4T/TannedCung/OCR/sbt-idp/.env_prod\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# !export DB_ENGINE=django.db.backends.postgresql_psycopg2\n", "# !export DB_SCHEMA=sbt_prod_20240422_1\n", "# !export DB_USER=postgres\n", "# !export DB_PASSWORD=TannedCung\n", "# !export DB_HOST=db-sbt\n", "# !export DB_PUBLIC_PORT=5432\n", "# !export DB_INTERNAL_PORT=5432" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import sys\n", "import os\n", "import numpy as np\n", "from tqdm import tqdm\n", "import datetime\n", "\n", "current_dir = os.getcwd()\n", "parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))\n", "sys.path.append(parent_dir)\n", "\n", "import django\n", "from django.db import models\n", "from django.db.models import Q\n", "from django.utils import timezone\n", "from asgiref.sync import sync_to_async\n", "from fwd import settings\n", "os.environ.setdefault(\"DJANGO_SETTINGS_MODULE\", \"fwd.settings\")\n", "os.environ[\"DJANGO_ALLOW_ASYNC_UNSAFE\"] = \"true\"\n", "django.setup()\n", "from fwd_api.models.SubscriptionRequest import SubscriptionRequest\n", "from utils.processing_time import cost_profile, backend_cost\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start_date_str = \"2024-05-25T00:00:00+0800\"\n", "end_date_str = \"2024-06-24T00:00:00+0800\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, 
"outputs": [], "source": [ "@sync_to_async\n", "def query_data(start_date_str, end_date_str):\n", " base_query = Q(status=200)\n", " start_date = timezone.datetime.strptime(start_date_str, '%Y-%m-%dT%H:%M:%S%z') # We care only about day precision only\n", " end_date = timezone.datetime.strptime(end_date_str, '%Y-%m-%dT%H:%M:%S%z')\n", " # start_date = timezone.make_aware(start_date)\n", " # end_date = timezone.make_aware(end_date)\n", " base_query &= Q(redemption_id__startswith=\"AU\")\n", " base_query &= Q(created_at__range=(start_date, end_date))\n", " subscription_requests = SubscriptionRequest.objects.filter(base_query).order_by('created_at')\n", " return subscription_requests\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "subscription_requests = await query_data(start_date_str, end_date_str)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def process(requests):\n", " result_by_file = {\"queue\": [],\n", " \"inference\": [],\n", " \"postprocessing\": [],\n", " \"created\": []}\n", " result_by_file_type = {\"imei\": {\"queue\": [],\n", " \"inference\": [],\n", " \"postprocessing\": [],\n", " \"created\": []},\n", " \"invoice\": {\"queue\": [],\n", " \"inference\": [],\n", " \"postprocessing\": [],\n", " \"created\": []}}\n", " result_by_request = {\"backend_cost\": [],\n", " \"number_of_file\": [],\n", " \"request_cost\": [],\n", " \"created\": []}\n", " for request in tqdm(requests):\n", " if not request.ai_inference_profile:\n", " continue\n", " result_by_request[\"created\"].append(request.created_at.timestamp())\n", " result_by_request[\"number_of_file\"].append(request.pages)\n", " result_by_request[\"backend_cost\"].append(backend_cost(request.created_at, request.ai_inference_start_time))\n", " result_by_request[\"request_cost\"].append(request.ai_inference_time)\n", "\n", " for key in request.ai_inference_profile.keys():\n", " profile = 
cost_profile(request.ai_inference_start_time, request.ai_inference_profile[key])\n", " result_by_file[\"queue\"].append(profile[\"queue\"])\n", " result_by_file[\"inference\"].append(profile[\"inference\"])\n", " result_by_file[\"postprocessing\"].append(profile[\"postprocessing\"])\n", " result_by_file[\"created\"].append(request.ai_inference_start_time)\n", " if key.split(\"_\")[0] in [\"imei\", \"invoice\"]:\n", " result_by_file_type[key.split(\"_\")[0]][\"queue\"].append(profile[\"queue\"])\n", " result_by_file_type[key.split(\"_\")[0]][\"inference\"].append(profile[\"inference\"])\n", " result_by_file_type[key.split(\"_\")[0]][\"postprocessing\"].append(profile[\"postprocessing\"])\n", " result_by_file_type[key.split(\"_\")[0]][\"created\"].append(request.ai_inference_start_time)\n", "\n", " \n", " return result_by_request, result_by_file, result_by_file_type\n", "\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "result_by_request, result_by_file, result_by_file_type = process(subscription_requests)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "frame_by_file = pd.DataFrame(result_by_file)\n", "frame_by_request = pd.DataFrame(result_by_request)\n", "frame_by_imei = pd.DataFrame(result_by_file_type[\"imei\"])\n", "frame_by_invoice = pd.DataFrame(result_by_file_type[\"invoice\"])\n", "\n", "frame_by_request[\"image_avg_cost\"] = (frame_by_request[\"backend_cost\"] + frame_by_request[\"request_cost\"])/frame_by_request[\"number_of_file\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "frame_by_file.set_index('created', inplace=True)\n", "frame_by_request.set_index('created', inplace=True)\n", "frame_by_imei.set_index('created', inplace=True)\n", "frame_by_invoice.set_index('created', inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def 
to_datetime(timestamp):\n",
 "    \"\"\"Convert a unix timestamp (seconds) to a 'YYYY-MM-DD' date string.\"\"\"\n",
 "    dt = datetime.datetime.fromtimestamp(timestamp)\n",
 "    return dt.strftime('%Y-%m-%d')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "def plot_trend(x, y, title, outliner_threah = 95, num_bins=200):\n",
 "    \"\"\"Plot the binned-average trend of y over timestamps x with avg/high/low lines.\n",
 "\n",
 "    Args:\n",
 "        x: timestamps (negative entries are dropped as invalid).\n",
 "        y: processing times aligned with x.\n",
 "        title (str): plot title.\n",
 "        outliner_threah: percentile of |z-score| used to trim outliers; falsy disables\n",
 "            trimming. NOTE: misspelled name kept — existing cells pass it by keyword.\n",
 "        num_bins (int): number of time bins to average over; falsy plots raw points.\n",
 "    \"\"\"\n",
 "    y = y[x >= 0]\n",
 "    x = x[x >= 0]\n",
 "    if outliner_threah:\n",
 "        # Trim outliers: keep points whose |z-score| is within the given percentile\n",
 "        z_scores = np.abs((y - np.mean(y)) / np.std(y))\n",
 "        threshold = np.percentile(z_scores, outliner_threah)\n",
 "        keep = z_scores <= threshold\n",
 "        filtered_x = x[keep]\n",
 "        filtered_y = y[keep]\n",
 "    else:\n",
 "        filtered_x = x\n",
 "        filtered_y = y\n",
 "\n",
 "    if num_bins:\n",
 "        # Only the edges are needed; the counts from np.histogram were unused\n",
 "        _, bin_edges = np.histogram(filtered_x, bins=num_bins)\n",
 "        bin_avg = np.zeros(num_bins)\n",
 "        for i in range(num_bins):\n",
 "            in_bin = (filtered_x >= bin_edges[i]) & (filtered_x < bin_edges[i + 1])\n",
 "            if i == num_bins - 1:\n",
 "                # Match np.histogram semantics: the last bin includes its right edge,\n",
 "                # otherwise the maximum x value is silently dropped from every bin\n",
 "                in_bin |= filtered_x == bin_edges[i + 1]\n",
 "            bin_avg[i] = np.mean(filtered_y[in_bin])\n",
 "        nan_mask = np.isnan(bin_avg)  # bins with no samples average to NaN; drop them\n",
 "        edges = bin_edges[:-1][~nan_mask]\n",
 "        bin_avg = bin_avg[~nan_mask]\n",
 "    else:\n",
 "        bin_avg = filtered_y\n",
 "        edges = filtered_x\n",
 "\n",
 "    # Reference statistics are computed on the untrimmed data\n",
 "    average = np.mean(y)\n",
 "    highest = np.max(y)\n",
 "    lowest = np.min(y)\n",
 "\n",
 "    plt.plot(edges, bin_avg)\n",
 "    plt.axhline(average, color='orange', linestyle='--', label='Average')\n",
 "    plt.text(x[-1], average, f'Avg: {average:.2f}', ha='right', va='center')\n",
 "\n",
 "    plt.axhline(highest, color='red', linestyle='--', label='Highest')\n",
 "    plt.text(x[-1], highest, f'High: {highest:.2f}', ha='right', va='center')\n",
 "\n",
 "    plt.axhline(lowest, color='green', linestyle='--', label='Lowest')\n",
 "    # Fixed label: this line marks the lowest value, not the average\n",
 "    plt.text(x[-1], lowest, f'Low: {lowest:.2f}', ha='right', va='center')\n",
 "    plt.xlabel('Timestamp')\n",
 "    plt.ylabel('Processing Time (seconds)')\n",
 "    plt.title(title)\n",
 "    plt.xticks(rotation=45)\n",
 "    plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plot_trend(frame_by_request.index, frame_by_request[\"image_avg_cost\"], \"Image average cost\", outliner_threah=95, num_bins=30)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plot_trend(frame_by_request.index, frame_by_request[\"backend_cost\"], \"Backend cost Trend\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plot_trend(frame_by_request.index, frame_by_request[\"request_cost\"], \"Request_cost Trend\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plot_trend(frame_by_request.index, frame_by_request[\"number_of_file\"], \"Files in a request Trend\", outliner_threah=None, num_bins=30)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plot_trend(frame_by_file.index, frame_by_file[\"postprocessing\"], \"AI postprocessing Trend\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plot_trend(frame_by_imei.index, frame_by_imei[\"inference\"], \"IMEI Inference Trend\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plot_trend(frame_by_invoice.index, frame_by_invoice[\"inference\"], \"Invoice Inference Trend\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plot_trend(frame_by_file.index, frame_by_file[\"inference\"], \"AI inference Trend\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], 
"source": [ "plot_trend(frame_by_file.index, frame_by_file[\"queue\"], \"AI queuing Trend\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def split_timestamps_by_day(timestamps):\n", " \"\"\"\n", " Splits a list of timestamps into a list of lists, where each inner list contains the timestamps for a single day.\n", " \n", " Args:\n", " timestamps (list): A list of timestamp values.\n", " \n", " Returns:\n", " list: A list of lists, where each inner list contains the timestamps for a single day.\n", " \"\"\"\n", " # Convert timestamps to datetime objects\n", " datetimes = [pd.Timestamp(t, unit='s') for t in timestamps]\n", " \n", " # Create a DataFrame with the datetime objects\n", " df = pd.DataFrame({'timestamp': datetimes})\n", " \n", " # Group the DataFrame by day and collect the timestamps for each day\n", " timestamps_by_day = []\n", " for _, group in df.groupby(pd.Grouper(key='timestamp', freq='D')):\n", " day_timestamps = [t.timestamp() for t in group['timestamp']]\n", " timestamps_by_day.append(day_timestamps)\n", " \n", " return timestamps_by_day" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def visualize_requests_by_time(timestamps_by_day):\n", " \"\"\"\n", " Generates a histogram to visualize the number of requests received by time.\n", " \n", " Args:\n", " timestamps (list): A list of timestamps (in seconds) representing the time when each request was received.\n", " \"\"\"\n", " num_days = len(timestamps_by_day)\n", " fig, axes = plt.subplots(nrows=int(np.ceil(num_days / 2)), ncols=2, figsize=(90, 450))\n", " for i, day_timestamps in enumerate(timestamps_by_day):\n", " row = i // 2\n", " col = i % 2\n", " \n", " day_timestamps = [pd.Timestamp(t, unit='s') for t in day_timestamps]\n", " # Get the current axis\n", " ax = axes[row, col] if num_days > 1 else axes\n", " \n", " # Plot the histogram for the current day\n", " 
ax.hist(pd.to_datetime(day_timestamps), bins=60*24, edgecolor='black')\n", " # ax.xticks(rotation=45)\n", " ax.set_title(f\"Day {i+1}\")\n", " ax.set_xlabel(\"Time\")\n", " ax.set_ylabel(\"Count\")\n", "\n", " ax2 = ax.twinx()\n", " ax2.set_ylabel(\"Time Process (s)\")\n", " ax2.set_ylim(min(day_timestamps), max(day_timestamps))\n", " # Convert timestamps to datetime objects\n", " # timestamps = [pd.Timestamp(t, unit='s') for t in timestamps]\n", " \n", " # # Create a histogram with 24 bins (one for each hour of the day)\n", " # plt.figure(figsize=(12, 6))\n", " # # plt.hist(pd.to_datetime([t.hour for t in timestamps], unit='h'), bins=24, edgecolor='black')\n", " # plt.hist(pd.to_datetime(timestamps), bins=1000, edgecolor='black')\n", " \n", " # Set x-axis labels to show the hour of the day\n", " # x_labels = [f'{i:02d}:00' for i in range(24)]\n", " # plt.xticks(np.arange(0, 24), x_labels, rotation=45)\n", " # plt.xticks(rotation=45)\n", " # plt.xlabel('Hour of the Day')\n", " # plt.ylabel('Number of Requests')\n", " # plt.title('Requests Received by Time of Day')\n", " # plt.show()\n", " plt.suptitle(\"Distribution of Requests by Time\")\n", " plt.tight_layout()\n", " plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# timestamps_by_day=split_timestamps_by_day(result_by_file[\"created\"])\n", "# visualize_requests_by_time(timestamps_by_day)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.14" } }, "nbformat": 4, "nbformat_minor": 2 }