{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install -r /mnt/hdd4T/TannedCung/OCR/sbt-idp/cope2n-api/requirements.txt\n",
    "# !pip install matplotlib tqdm python-dotenv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import os\n",
    "# from dotenv import load_dotenv\n",
    "\n",
    "# load_dotenv(\"/mnt/hdd4T/TannedCung/OCR/sbt-idp/.env_prod\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !export DB_ENGINE=django.db.backends.postgresql_psycopg2\n",
    "# !export DB_SCHEMA=sbt_prod_20240422_1\n",
    "# !export DB_USER=postgres\n",
    "# !export DB_PASSWORD=TannedCung\n",
    "# !export DB_HOST=db-sbt\n",
    "# !export DB_PUBLIC_PORT=5432\n",
    "# !export DB_INTERNAL_PORT=5432"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import sys\n",
    "import os\n",
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "import datetime\n",
    "\n",
    "current_dir = os.getcwd()\n",
    "parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))\n",
    "sys.path.append(parent_dir)\n",
    "\n",
    "import django\n",
    "from django.db import models\n",
    "from django.db.models import Q\n",
    "from django.utils import timezone\n",
    "from asgiref.sync import sync_to_async\n",
    "from fwd import settings\n",
    "os.environ.setdefault(\"DJANGO_SETTINGS_MODULE\", \"fwd.settings\")\n",
    "os.environ[\"DJANGO_ALLOW_ASYNC_UNSAFE\"] = \"true\"\n",
    "django.setup()\n",
    "from fwd_api.models.SubscriptionRequest import SubscriptionRequest\n",
    "from utils.processing_time import cost_profile, backend_cost\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "start_date_str = \"2024-05-25T00:00:00+0800\"\n",
    "end_date_str = \"2024-06-24T00:00:00+0800\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "@sync_to_async\n",
    "def query_data(start_date_str, end_date_str):\n",
    "    base_query = Q(status=200)\n",
    "    start_date = timezone.datetime.strptime(start_date_str, '%Y-%m-%dT%H:%M:%S%z') # We care only about day precision only\n",
    "    end_date = timezone.datetime.strptime(end_date_str, '%Y-%m-%dT%H:%M:%S%z')\n",
    "    # start_date = timezone.make_aware(start_date)\n",
    "    # end_date = timezone.make_aware(end_date)\n",
    "    base_query &= Q(redemption_id__startswith=\"AU\")\n",
    "    base_query &= Q(created_at__range=(start_date, end_date))\n",
    "    subscription_requests = SubscriptionRequest.objects.filter(base_query).order_by('created_at')\n",
    "    return subscription_requests\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "subscription_requests = await query_data(start_date_str, end_date_str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process(requests):\n",
    "    result_by_file = {\"queue\": [],\n",
    "                        \"inference\": [],\n",
    "                        \"postprocessing\": [],\n",
    "                        \"created\": []}\n",
    "    result_by_file_type = {\"imei\": {\"queue\": [],\n",
    "                                    \"inference\": [],\n",
    "                                    \"postprocessing\": [],\n",
    "                                    \"created\": []},\n",
    "                          \"invoice\": {\"queue\": [],\n",
    "                                    \"inference\": [],\n",
    "                                    \"postprocessing\": [],\n",
    "                                    \"created\": []}}\n",
    "    result_by_request = {\"backend_cost\": [],\n",
    "                        \"number_of_file\": [],\n",
    "                        \"request_cost\": [],\n",
    "                        \"created\": []}\n",
    "    for request in tqdm(requests):\n",
    "        if not request.ai_inference_profile:\n",
    "            continue\n",
    "        result_by_request[\"created\"].append(request.created_at.timestamp())\n",
    "        result_by_request[\"number_of_file\"].append(request.pages)\n",
    "        result_by_request[\"backend_cost\"].append(backend_cost(request.created_at, request.ai_inference_start_time))\n",
    "        result_by_request[\"request_cost\"].append(request.ai_inference_time)\n",
    "\n",
    "        for key in request.ai_inference_profile.keys():\n",
    "            profile = cost_profile(request.ai_inference_start_time, request.ai_inference_profile[key])\n",
    "            result_by_file[\"queue\"].append(profile[\"queue\"])\n",
    "            result_by_file[\"inference\"].append(profile[\"inference\"])\n",
    "            result_by_file[\"postprocessing\"].append(profile[\"postprocessing\"])\n",
    "            result_by_file[\"created\"].append(request.ai_inference_start_time)\n",
    "            if key.split(\"_\")[0] in [\"imei\", \"invoice\"]:\n",
    "                result_by_file_type[key.split(\"_\")[0]][\"queue\"].append(profile[\"queue\"])\n",
    "                result_by_file_type[key.split(\"_\")[0]][\"inference\"].append(profile[\"inference\"])\n",
    "                result_by_file_type[key.split(\"_\")[0]][\"postprocessing\"].append(profile[\"postprocessing\"])\n",
    "                result_by_file_type[key.split(\"_\")[0]][\"created\"].append(request.ai_inference_start_time)\n",
    "\n",
    "    \n",
    "    return result_by_request, result_by_file, result_by_file_type\n",
    "\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "result_by_request, result_by_file, result_by_file_type = process(subscription_requests)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame_by_file = pd.DataFrame(result_by_file)\n",
    "frame_by_request = pd.DataFrame(result_by_request)\n",
    "frame_by_imei = pd.DataFrame(result_by_file_type[\"imei\"])\n",
    "frame_by_invoice = pd.DataFrame(result_by_file_type[\"invoice\"])\n",
    "\n",
    "frame_by_request[\"image_avg_cost\"] = (frame_by_request[\"backend_cost\"] + frame_by_request[\"request_cost\"])/frame_by_request[\"number_of_file\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame_by_file.set_index('created', inplace=True)\n",
    "frame_by_request.set_index('created', inplace=True)\n",
    "frame_by_imei.set_index('created', inplace=True)\n",
    "frame_by_invoice.set_index('created', inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def to_datetime(timestamp):\n",
    "    # Convert the timestamp to a datetime object\n",
    "    dt = datetime.datetime.fromtimestamp(timestamp)\n",
    "\n",
    "    # Format the datetime object as YYYY-MM-DD\n",
    "    formatted_date = dt.strftime('%Y-%m-%d')\n",
    "    return formatted_date"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_trend(x, y, title, outliner_threah = 95, num_bins=200):\n",
    "    y = y[x>=0]\n",
    "    x = x[x>=0]\n",
    "    if outliner_threah:\n",
    "        # Calculate z-scores\n",
    "        z_scores = np.abs((y - np.mean(y)) / np.std(y))\n",
    "\n",
    "        # Determine the threshold based on the desired percentage of inliers\n",
    "        threshold = np.percentile(z_scores, outliner_threah)\n",
    "\n",
    "        # Filter out outliers\n",
    "        filtered_x = x[z_scores <= threshold]\n",
    "        filtered_y = y[z_scores <= threshold]\n",
    "    else:\n",
    "        filtered_x = x\n",
    "        filtered_y = y\n",
    "\n",
    "    # Compute the histogram\n",
    "    if num_bins:\n",
    "        hist, bin_edges = np.histogram(filtered_x, bins=num_bins)\n",
    "        # Compute the average value for each bin\n",
    "        bin_avg = np.zeros(num_bins)\n",
    "        for i in range(num_bins):\n",
    "            bin_values = filtered_y[(filtered_x >= bin_edges[i]) & (filtered_x < bin_edges[i + 1])]\n",
    "            bin_avg[i] = np.mean(bin_values)\n",
    "        nan_mask = np.isnan(bin_avg)\n",
    "        edges = bin_edges[:-1][~nan_mask]\n",
    "        bin_avg = bin_avg[~nan_mask]\n",
    "    else:\n",
    "        bin_avg = filtered_y\n",
    "        edges = filtered_x\n",
    "\n",
    "\n",
    "    average = np.mean(y)\n",
    "    highest = np.max(y)\n",
    "    lowest = np.min(y)\n",
    "\n",
    "    date_time = []\n",
    "    for e in edges:\n",
    "        date_time.append(to_datetime(e))\n",
    "    plt.plot(edges, bin_avg)\n",
    "    # plt.plot(filtered_x, filtered_y)\n",
    "    plt.axhline(average, color='orange', linestyle='--', label='Average')\n",
    "    plt.text(x[-1], average, f'Avg: {average:.2f}', ha='right', va='center')\n",
    "\n",
    "    plt.axhline(highest, color='red', linestyle='--', label='Highest')\n",
    "    plt.text(x[-1], highest, f'High: {highest:.2f}', ha='right', va='center')\n",
    "\n",
    "    plt.axhline(lowest, color='green', linestyle='--', label='Lowest')\n",
    "    plt.text(x[-1], lowest, f'Avg: {lowest:.2f}', ha='right', va='center')\n",
    "    plt.xlabel('Timestamp')\n",
    "    plt.ylabel('Processing Time (seconds)')\n",
    "    plt.title(title)\n",
    "    plt.xticks(rotation=45)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_trend(frame_by_request.index, frame_by_request[\"image_avg_cost\"], \"Image average cost\", outliner_threah=95, num_bins=30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_trend(frame_by_request.index, frame_by_request[\"backend_cost\"], \"Backend cost Trend\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_trend(frame_by_request.index, frame_by_request[\"request_cost\"], \"Request_cost Trend\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_trend(frame_by_request.index, frame_by_request[\"number_of_file\"], \"Files in a request Trend\", outliner_threah=None, num_bins=30)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_trend(frame_by_file.index, frame_by_file[\"postprocessing\"], \"AI postprocessing Trend\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_trend(frame_by_imei.index, frame_by_imei[\"inference\"], \"IMEI Inference Trend\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_trend(frame_by_invoice.index, frame_by_invoice[\"inference\"], \"Invoice Inference Trend\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_trend(frame_by_file.index, frame_by_file[\"inference\"], \"AI inference Trend\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_trend(frame_by_file.index, frame_by_file[\"queue\"], \"AI queuing Trend\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def split_timestamps_by_day(timestamps):\n",
    "    \"\"\"\n",
    "    Splits a list of timestamps into a list of lists, where each inner list contains the timestamps for a single day.\n",
    "    \n",
    "    Args:\n",
    "        timestamps (list): A list of timestamp values.\n",
    "        \n",
    "    Returns:\n",
    "        list: A list of lists, where each inner list contains the timestamps for a single day.\n",
    "    \"\"\"\n",
    "    # Convert timestamps to datetime objects\n",
    "    datetimes = [pd.Timestamp(t, unit='s') for t in timestamps]\n",
    "    \n",
    "    # Create a DataFrame with the datetime objects\n",
    "    df = pd.DataFrame({'timestamp': datetimes})\n",
    "    \n",
    "    # Group the DataFrame by day and collect the timestamps for each day\n",
    "    timestamps_by_day = []\n",
    "    for _, group in df.groupby(pd.Grouper(key='timestamp', freq='D')):\n",
    "        day_timestamps = [t.timestamp() for t in group['timestamp']]\n",
    "        timestamps_by_day.append(day_timestamps)\n",
    "    \n",
    "    return timestamps_by_day"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def visualize_requests_by_time(timestamps_by_day):\n",
    "    \"\"\"\n",
    "    Generates a histogram to visualize the number of requests received by time.\n",
    "    \n",
    "    Args:\n",
    "        timestamps (list): A list of timestamps (in seconds) representing the time when each request was received.\n",
    "    \"\"\"\n",
    "    num_days = len(timestamps_by_day)\n",
    "    fig, axes = plt.subplots(nrows=int(np.ceil(num_days / 2)), ncols=2, figsize=(90, 450))\n",
    "    for i, day_timestamps in enumerate(timestamps_by_day):\n",
    "        row = i // 2\n",
    "        col = i % 2\n",
    "        \n",
    "        day_timestamps = [pd.Timestamp(t, unit='s') for t in day_timestamps]\n",
    "        # Get the current axis\n",
    "        ax = axes[row, col] if num_days > 1 else axes\n",
    "        \n",
    "        # Plot the histogram for the current day\n",
    "        ax.hist(pd.to_datetime(day_timestamps), bins=60*24, edgecolor='black')\n",
    "        # ax.xticks(rotation=45)\n",
    "        ax.set_title(f\"Day {i+1}\")\n",
    "        ax.set_xlabel(\"Time\")\n",
    "        ax.set_ylabel(\"Count\")\n",
    "\n",
    "        ax2 = ax.twinx()\n",
    "        ax2.set_ylabel(\"Time Process (s)\")\n",
    "        ax2.set_ylim(min(day_timestamps), max(day_timestamps))\n",
    "    # Convert timestamps to datetime objects\n",
    "    # timestamps = [pd.Timestamp(t, unit='s') for t in timestamps]\n",
    "    \n",
    "    # # Create a histogram with 24 bins (one for each hour of the day)\n",
    "    # plt.figure(figsize=(12, 6))\n",
    "    # # plt.hist(pd.to_datetime([t.hour for t in timestamps], unit='h'), bins=24, edgecolor='black')\n",
    "    # plt.hist(pd.to_datetime(timestamps), bins=1000, edgecolor='black')\n",
    "    \n",
    "    # Set x-axis labels to show the hour of the day\n",
    "    # x_labels = [f'{i:02d}:00' for i in range(24)]\n",
    "    # plt.xticks(np.arange(0, 24), x_labels, rotation=45)\n",
    "    # plt.xticks(rotation=45)\n",
    "    # plt.xlabel('Hour of the Day')\n",
    "    # plt.ylabel('Number of Requests')\n",
    "    # plt.title('Requests Received by Time of Day')\n",
    "    # plt.show()\n",
    "    plt.suptitle(\"Distribution of Requests by Time\")\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# timestamps_by_day=split_timestamps_by_day(result_by_file[\"created\"])\n",
    "# visualize_requests_by_time(timestamps_by_day)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}