{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# !pip install -r /mnt/hdd4T/TannedCung/OCR/sbt-idp/cope2n-api/requirements.txt\n",
"# !pip install matplotlib tqdm python-dotenv"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# import os\n",
"# from dotenv import load_dotenv\n",
"\n",
"# load_dotenv(\"/mnt/hdd4T/TannedCung/OCR/sbt-idp/.env_prod\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# !export DB_ENGINE=django.db.backends.postgresql_psycopg2\n",
"# !export DB_SCHEMA=sbt_prod_20240422_1\n",
"# !export DB_USER=postgres\n",
"# !export DB_PASSWORD=TannedCung\n",
"# !export DB_HOST=db-sbt\n",
"# !export DB_PUBLIC_PORT=5432\n",
"# !export DB_INTERNAL_PORT=5432"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import sys\n",
"import os\n",
"import numpy as np\n",
"from tqdm import tqdm\n",
"import datetime\n",
"\n",
"current_dir = os.getcwd()\n",
"parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))\n",
"sys.path.append(parent_dir)\n",
"\n",
"import django\n",
"from django.db import models\n",
"from django.db.models import Q\n",
"from django.utils import timezone\n",
"from asgiref.sync import sync_to_async\n",
"from fwd import settings\n",
"os.environ.setdefault(\"DJANGO_SETTINGS_MODULE\", \"fwd.settings\")\n",
"os.environ[\"DJANGO_ALLOW_ASYNC_UNSAFE\"] = \"true\"\n",
"django.setup()\n",
"from fwd_api.models.SubscriptionRequest import SubscriptionRequest\n",
"from utils.processing_time import cost_profile, backend_cost\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"start_date_str = \"2024-05-25T00:00:00+0800\"\n",
"end_date_str = \"2024-06-24T00:00:00+0800\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"@sync_to_async\n",
"def query_data(start_date_str, end_date_str):\n",
" base_query = Q(status=200)\n",
" start_date = timezone.datetime.strptime(start_date_str, '%Y-%m-%dT%H:%M:%S%z') # We care only about day precision only\n",
" end_date = timezone.datetime.strptime(end_date_str, '%Y-%m-%dT%H:%M:%S%z')\n",
" # start_date = timezone.make_aware(start_date)\n",
" # end_date = timezone.make_aware(end_date)\n",
" base_query &= Q(redemption_id__startswith=\"AU\")\n",
" base_query &= Q(created_at__range=(start_date, end_date))\n",
" subscription_requests = SubscriptionRequest.objects.filter(base_query).order_by('created_at')\n",
" return subscription_requests\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"subscription_requests = await query_data(start_date_str, end_date_str)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def process(requests):\n",
" result_by_file = {\"queue\": [],\n",
" \"inference\": [],\n",
" \"postprocessing\": [],\n",
" \"created\": []}\n",
" result_by_file_type = {\"imei\": {\"queue\": [],\n",
" \"inference\": [],\n",
" \"postprocessing\": [],\n",
" \"created\": []},\n",
" \"invoice\": {\"queue\": [],\n",
" \"inference\": [],\n",
" \"postprocessing\": [],\n",
" \"created\": []}}\n",
" result_by_request = {\"backend_cost\": [],\n",
" \"number_of_file\": [],\n",
" \"request_cost\": [],\n",
" \"created\": []}\n",
" for request in tqdm(requests):\n",
" if not request.ai_inference_profile:\n",
" continue\n",
" result_by_request[\"created\"].append(request.created_at.timestamp())\n",
" result_by_request[\"number_of_file\"].append(request.pages)\n",
" result_by_request[\"backend_cost\"].append(backend_cost(request.created_at, request.ai_inference_start_time))\n",
" result_by_request[\"request_cost\"].append(request.ai_inference_time)\n",
"\n",
" for key in request.ai_inference_profile.keys():\n",
" profile = cost_profile(request.ai_inference_start_time, request.ai_inference_profile[key])\n",
" result_by_file[\"queue\"].append(profile[\"queue\"])\n",
" result_by_file[\"inference\"].append(profile[\"inference\"])\n",
" result_by_file[\"postprocessing\"].append(profile[\"postprocessing\"])\n",
" result_by_file[\"created\"].append(request.ai_inference_start_time)\n",
" if key.split(\"_\")[0] in [\"imei\", \"invoice\"]:\n",
" result_by_file_type[key.split(\"_\")[0]][\"queue\"].append(profile[\"queue\"])\n",
" result_by_file_type[key.split(\"_\")[0]][\"inference\"].append(profile[\"inference\"])\n",
" result_by_file_type[key.split(\"_\")[0]][\"postprocessing\"].append(profile[\"postprocessing\"])\n",
" result_by_file_type[key.split(\"_\")[0]][\"created\"].append(request.ai_inference_start_time)\n",
"\n",
" \n",
" return result_by_request, result_by_file, result_by_file_type\n",
"\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"result_by_request, result_by_file, result_by_file_type = process(subscription_requests)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"frame_by_file = pd.DataFrame(result_by_file)\n",
"frame_by_request = pd.DataFrame(result_by_request)\n",
"frame_by_imei = pd.DataFrame(result_by_file_type[\"imei\"])\n",
"frame_by_invoice = pd.DataFrame(result_by_file_type[\"invoice\"])\n",
"\n",
"frame_by_request[\"image_avg_cost\"] = (frame_by_request[\"backend_cost\"] + frame_by_request[\"request_cost\"])/frame_by_request[\"number_of_file\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"frame_by_file.set_index('created', inplace=True)\n",
"frame_by_request.set_index('created', inplace=True)\n",
"frame_by_imei.set_index('created', inplace=True)\n",
"frame_by_invoice.set_index('created', inplace=True)"
]
},
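{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check, a minimal sketch: summarize the per-request cost columns built above\n",
"# (assumes the frames were populated without errors and the values are in seconds).\n",
"frame_by_request[[\"backend_cost\", \"request_cost\", \"image_avg_cost\", \"number_of_file\"]].describe()"
]
},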
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def to_datetime(timestamp):\n",
" # Convert the timestamp to a datetime object\n",
" dt = datetime.datetime.fromtimestamp(timestamp)\n",
"\n",
" # Format the datetime object as YYYY-MM-DD\n",
" formatted_date = dt.strftime('%Y-%m-%d')\n",
" return formatted_date"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def plot_trend(x, y, title, outliner_threah = 95, num_bins=200):\n",
" y = y[x>=0]\n",
" x = x[x>=0]\n",
" if outliner_threah:\n",
" # Calculate z-scores\n",
" z_scores = np.abs((y - np.mean(y)) / np.std(y))\n",
"\n",
" # Determine the threshold based on the desired percentage of inliers\n",
" threshold = np.percentile(z_scores, outliner_threah)\n",
"\n",
" # Filter out outliers\n",
" filtered_x = x[z_scores <= threshold]\n",
" filtered_y = y[z_scores <= threshold]\n",
" else:\n",
" filtered_x = x\n",
" filtered_y = y\n",
"\n",
" # Compute the histogram\n",
" if num_bins:\n",
" hist, bin_edges = np.histogram(filtered_x, bins=num_bins)\n",
" # Compute the average value for each bin\n",
" bin_avg = np.zeros(num_bins)\n",
" for i in range(num_bins):\n",
" bin_values = filtered_y[(filtered_x >= bin_edges[i]) & (filtered_x < bin_edges[i + 1])]\n",
" bin_avg[i] = np.mean(bin_values)\n",
" nan_mask = np.isnan(bin_avg)\n",
" edges = bin_edges[:-1][~nan_mask]\n",
" bin_avg = bin_avg[~nan_mask]\n",
" else:\n",
" bin_avg = filtered_y\n",
" edges = filtered_x\n",
"\n",
"\n",
" average = np.mean(y)\n",
" highest = np.max(y)\n",
" lowest = np.min(y)\n",
"\n",
" date_time = []\n",
" for e in edges:\n",
" date_time.append(to_datetime(e))\n",
" plt.plot(edges, bin_avg)\n",
" # plt.plot(filtered_x, filtered_y)\n",
" plt.axhline(average, color='orange', linestyle='--', label='Average')\n",
" plt.text(x[-1], average, f'Avg: {average:.2f}', ha='right', va='center')\n",
"\n",
" plt.axhline(highest, color='red', linestyle='--', label='Highest')\n",
" plt.text(x[-1], highest, f'High: {highest:.2f}', ha='right', va='center')\n",
"\n",
" plt.axhline(lowest, color='green', linestyle='--', label='Lowest')\n",
" plt.text(x[-1], lowest, f'Avg: {lowest:.2f}', ha='right', va='center')\n",
" plt.xlabel('Timestamp')\n",
" plt.ylabel('Processing Time (seconds)')\n",
" plt.title(title)\n",
" plt.xticks(rotation=45)\n",
" plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plot_trend(frame_by_request.index, frame_by_request[\"image_avg_cost\"], \"Image average cost\", outliner_threah=95, num_bins=30)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plot_trend(frame_by_request.index, frame_by_request[\"backend_cost\"], \"Backend cost Trend\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plot_trend(frame_by_request.index, frame_by_request[\"request_cost\"], \"Request_cost Trend\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plot_trend(frame_by_request.index, frame_by_request[\"number_of_file\"], \"Files in a request Trend\", outliner_threah=None, num_bins=30)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plot_trend(frame_by_file.index, frame_by_file[\"postprocessing\"], \"AI postprocessing Trend\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plot_trend(frame_by_imei.index, frame_by_imei[\"inference\"], \"IMEI Inference Trend\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plot_trend(frame_by_invoice.index, frame_by_invoice[\"inference\"], \"Invoice Inference Trend\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plot_trend(frame_by_file.index, frame_by_file[\"inference\"], \"AI inference Trend\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plot_trend(frame_by_file.index, frame_by_file[\"queue\"], \"AI queuing Trend\")\n"
]
},
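{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch: median and 95th-percentile per-file timings for each AI stage, to put\n",
"# numbers on the trends plotted above (assumes the timing columns hold numeric seconds).\n",
"frame_by_file[[\"queue\", \"inference\", \"postprocessing\"]].quantile([0.5, 0.95])"
]
},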
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def split_timestamps_by_day(timestamps):\n",
" \"\"\"\n",
" Splits a list of timestamps into a list of lists, where each inner list contains the timestamps for a single day.\n",
" \n",
" Args:\n",
" timestamps (list): A list of timestamp values.\n",
" \n",
" Returns:\n",
" list: A list of lists, where each inner list contains the timestamps for a single day.\n",
" \"\"\"\n",
" # Convert timestamps to datetime objects\n",
" datetimes = [pd.Timestamp(t, unit='s') for t in timestamps]\n",
" \n",
" # Create a DataFrame with the datetime objects\n",
" df = pd.DataFrame({'timestamp': datetimes})\n",
" \n",
" # Group the DataFrame by day and collect the timestamps for each day\n",
" timestamps_by_day = []\n",
" for _, group in df.groupby(pd.Grouper(key='timestamp', freq='D')):\n",
" day_timestamps = [t.timestamp() for t in group['timestamp']]\n",
" timestamps_by_day.append(day_timestamps)\n",
" \n",
" return timestamps_by_day"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def visualize_requests_by_time(timestamps_by_day):\n",
" \"\"\"\n",
" Generates a histogram to visualize the number of requests received by time.\n",
" \n",
" Args:\n",
" timestamps (list): A list of timestamps (in seconds) representing the time when each request was received.\n",
" \"\"\"\n",
" num_days = len(timestamps_by_day)\n",
" fig, axes = plt.subplots(nrows=int(np.ceil(num_days / 2)), ncols=2, figsize=(90, 450))\n",
" for i, day_timestamps in enumerate(timestamps_by_day):\n",
" row = i // 2\n",
" col = i % 2\n",
" \n",
" day_timestamps = [pd.Timestamp(t, unit='s') for t in day_timestamps]\n",
" # Get the current axis\n",
" ax = axes[row, col] if num_days > 1 else axes\n",
" \n",
" # Plot the histogram for the current day\n",
" ax.hist(pd.to_datetime(day_timestamps), bins=60*24, edgecolor='black')\n",
" # ax.xticks(rotation=45)\n",
" ax.set_title(f\"Day {i+1}\")\n",
" ax.set_xlabel(\"Time\")\n",
" ax.set_ylabel(\"Count\")\n",
"\n",
" ax2 = ax.twinx()\n",
" ax2.set_ylabel(\"Time Process (s)\")\n",
" ax2.set_ylim(min(day_timestamps), max(day_timestamps))\n",
" # Convert timestamps to datetime objects\n",
" # timestamps = [pd.Timestamp(t, unit='s') for t in timestamps]\n",
" \n",
" # # Create a histogram with 24 bins (one for each hour of the day)\n",
" # plt.figure(figsize=(12, 6))\n",
" # # plt.hist(pd.to_datetime([t.hour for t in timestamps], unit='h'), bins=24, edgecolor='black')\n",
" # plt.hist(pd.to_datetime(timestamps), bins=1000, edgecolor='black')\n",
" \n",
" # Set x-axis labels to show the hour of the day\n",
" # x_labels = [f'{i:02d}:00' for i in range(24)]\n",
" # plt.xticks(np.arange(0, 24), x_labels, rotation=45)\n",
" # plt.xticks(rotation=45)\n",
" # plt.xlabel('Hour of the Day')\n",
" # plt.ylabel('Number of Requests')\n",
" # plt.title('Requests Received by Time of Day')\n",
" # plt.show()\n",
" plt.suptitle(\"Distribution of Requests by Time\")\n",
" plt.tight_layout()\n",
" plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# timestamps_by_day=split_timestamps_by_day(result_by_file[\"created\"])\n",
"# visualize_requests_by_time(timestamps_by_day)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}