import re from datetime import datetime import copy from typing import Any from .ocr_utils.ocr_metrics import eval_ocr_metric from .ocr_utils.sbt_report import post_processing_str import uuid from fwd_api.models import SubscriptionRequest, SubscriptionRequestFile, ReportFile from ..celery_worker.client_connector import c_connector from ..utils.file import dict2xlsx, save_workbook_file, save_report_to_S3 from django.db.models import Q from django.utils import timezone import redis from fwd import settings from ..models import SubscriptionRequest, Report, ReportFile import json BAD_THRESHOLD = 0.75 valid_keys = ["retailername", "sold_to_party", "purchase_date", "imei_number"] class ReportAccumulateByRequest: def __init__(self, sub): # self.redis_client = redis.Redis(host=settings.REDIS_HOST, port=settings.REDIS_PORT, decode_responses=True) self.sub = sub self.current_time = None self.data = {} # {"month": [total, {"day": day_data}]} self.total_format = { 'subs': "+", 'extraction_date': "Subtotal ()", 'total_images': 0, 'images_quality': { 'successful': 0, 'successful_percent': 0, 'bad': 0, 'bad_percent': 0 }, 'average_accuracy_rate': { 'imei': IterAvg(), 'purchase_date': IterAvg(), 'retailer_name': IterAvg(), 'sold_to_party': IterAvg() }, 'average_processing_time': { 'imei': IterAvg(), 'invoice': IterAvg() }, 'usage': { 'imei':0, 'invoice': 0, 'request': 0 }, 'feedback_accuracy': { 'imei_number': IterAvg(), 'purchase_date': IterAvg(), 'retailername': IterAvg(), 'sold_to_party': IterAvg() }, 'reviewed_accuracy': { 'imei_number': IterAvg(), 'purchase_date': IterAvg(), 'retailername': IterAvg(), 'sold_to_party': IterAvg() }, 'num_request': 0 } self.day_format = { 'subs': sub, 'extraction_date': "", 'num_imei': 0, 'num_invoice': 0, 'total_images': 0, 'images_quality': { 'successful': 0, 'successful_percent': 0, 'bad': 0, 'bad_percent': 0 }, 'average_accuracy_rate': { 'imei': IterAvg(), 'purchase_date': IterAvg(), 'retailer_name': IterAvg(), 'sold_to_party': IterAvg() }, 'average_processing_time': { 'imei': IterAvg(), 'invoice': IterAvg() }, 'usage': { 'imei': 0, 'invoice': 0, 'request': 0 }, 'feedback_accuracy': { 'imei_number': IterAvg(), 'purchase_date': IterAvg(), 'retailername': IterAvg(), 'sold_to_party': IterAvg() }, 'reviewed_accuracy': { 'imei_number': IterAvg(), 'purchase_date': IterAvg(), 'retailername': IterAvg(), 'sold_to_party': IterAvg() }, "report_files": [], 'num_request': 0 }, @staticmethod def update_total(total, report_file): total["total_images"] += 1 total["images_quality"]["successful"] += 1 if not report_file.is_bad_image else 0 total["images_quality"]["bad"] += 1 if report_file.is_bad_image else 0 # total["report_files"].append(report_file) if sum([len(report_file.reviewed_accuracy[x]) for x in report_file.reviewed_accuracy.keys() if "_count" not in x]) > 0 : total["average_accuracy_rate"]["imei"].add(report_file.reviewed_accuracy.get("imei_number", [])) total["average_accuracy_rate"]["purchase_date"].add(report_file.reviewed_accuracy.get("purchase_date", [])) total["average_accuracy_rate"]["retailer_name"].add(report_file.reviewed_accuracy.get("retailername", [])) total["average_accuracy_rate"]["sold_to_party"].add(report_file.reviewed_accuracy.get("sold_to_party", [])) elif sum([len(report_file.feedback_accuracy[x]) for x in report_file.feedback_accuracy.keys() if "_count" not in x]) > 0: total["average_accuracy_rate"]["imei"].add(report_file.feedback_accuracy.get("imei_number", [])) total["average_accuracy_rate"]["purchase_date"].add(report_file.feedback_accuracy.get("purchase_date", [])) total["average_accuracy_rate"]["retailer_name"].add(report_file.feedback_accuracy.get("retailername", [])) total["average_accuracy_rate"]["sold_to_party"].add(report_file.feedback_accuracy.get("sold_to_party", [])) for key in ["imei_number", "purchase_date", "retailername", "sold_to_party"]: total["feedback_accuracy"][key].add(report_file.feedback_accuracy.get(key, [])) for key in ["imei_number", "purchase_date", "retailername", "sold_to_party"]: total["reviewed_accuracy"][key].add(report_file.reviewed_accuracy.get(key, [])) if not total["average_processing_time"].get(report_file.doc_type, None): print(f"[WARM]: Weird doctype: {report_file.doc_type}") total["average_processing_time"][report_file.doc_type] = IterAvg() total["average_processing_time"][report_file.doc_type].add_avg(report_file.time_cost, 1) if report_file.time_cost else 0 total["usage"]["imei"] += 1 if report_file.doc_type == "imei" else 0 total["usage"]["invoice"] += 1 if report_file.doc_type == "invoice" else 0 return total @staticmethod def update_day(day_data, report_file): day_data["total_images"] += 1 day_data["images_quality"]["successful"] += 1 if not report_file.is_bad_image else 0 day_data["images_quality"]["bad"] += 1 if report_file.is_bad_image else 0 day_data["num_imei"] += 1 if report_file.doc_type == "imei" else 0 day_data["num_invoice"] += 1 if report_file.doc_type == "invoice" else 0 day_data["report_files"].append(report_file) if sum([len(report_file.reviewed_accuracy[x]) for x in report_file.reviewed_accuracy.keys() if "_count" not in x]) > 0 : day_data["average_accuracy_rate"]["imei"].add(report_file.reviewed_accuracy.get("imei_number", 0)) day_data["average_accuracy_rate"]["purchase_date"].add(report_file.reviewed_accuracy.get("purchase_date", 0)) day_data["average_accuracy_rate"]["retailer_name"].add(report_file.reviewed_accuracy.get("retailername", 0)) day_data["average_accuracy_rate"]["sold_to_party"].add(report_file.reviewed_accuracy.get("sold_to_party", 0)) elif sum([len(report_file.feedback_accuracy[x]) for x in report_file.feedback_accuracy.keys() if "_count" not in x]) > 0: day_data["average_accuracy_rate"]["imei"].add(report_file.feedback_accuracy.get("imei_number", 0)) day_data["average_accuracy_rate"]["purchase_date"].add(report_file.feedback_accuracy.get("purchase_date", 0)) day_data["average_accuracy_rate"]["retailer_name"].add(report_file.feedback_accuracy.get("retailername", 0)) day_data["average_accuracy_rate"]["sold_to_party"].add(report_file.feedback_accuracy.get("sold_to_party", 0)) for key in ["imei_number", "purchase_date", "retailername", "sold_to_party"]: day_data["feedback_accuracy"][key].add(report_file.feedback_accuracy.get(key, 0)) for key in ["imei_number", "purchase_date", "retailername", "sold_to_party"]: day_data["reviewed_accuracy"][key].add(report_file.reviewed_accuracy.get(key, 0)) if not day_data["average_processing_time"].get(report_file.doc_type, None): print(f"[WARM]: Weird doctype: {report_file.doc_type}") day_data["average_processing_time"][report_file.doc_type] = IterAvg() day_data["average_processing_time"][report_file.doc_type].add_avg(report_file.time_cost, 1) if report_file.time_cost else 0 return day_data def add(self, request, report_files): this_month = request.created_at.strftime("%Y%m") this_day = request.created_at.strftime("%Y%m%d") if not self.data.get(this_month, None): self.data[this_month] = [copy.deepcopy(self.total_format), {}] self.data[this_month][0]["extraction_date"] = "Subtotal (" + request.created_at.strftime("%Y-%m") + ")" if not self.data[this_month][1].get(this_day, None): self.data[this_month][1][this_day] = copy.deepcopy(self.day_format)[0] self.data[this_month][1][this_day]['extraction_date'] = request.created_at.strftime("%Y-%m-%d") usage = self.count_transactions_within_day(this_day) self.data[this_month][1][this_day]["usage"]["imei"] = usage.get("imei", 0) self.data[this_month][1][this_day]["usage"]["invoice"] = usage.get("invoice", 0) self.data[this_month][1][this_day]["usage"]["request"] = usage.get("request", 0) self.data[this_month][1][this_day]['num_request'] += 1 self.data[this_month][0]['num_request'] += 1 for report_file in report_files: self.data[this_month][0] = self.update_total(self.data[this_month][0], report_file) # Update the subtotal within the month self.data[this_month][1][this_day] = self.update_day(self.data[this_month][1][this_day], report_file) # Update the subtotal of the day def count_transactions_within_day(self, date_string): # convert this day into timezone.datetime at UTC start_date = datetime.strptime(date_string, "%Y%m%d") start_date_with_timezone = timezone.make_aware(start_date) end_date_with_timezone = start_date_with_timezone + timezone.timedelta(days=1) return count_transactions(start_date_with_timezone, end_date_with_timezone, self.sub) def save(self, root_report_id, is_daily_report=False, include_test=False): report_data = self.get() fine_data = [] save_data = {"file": {"overview": f"{root_report_id}/{root_report_id}.xlsx"}, "data": fine_data} # {"sub_report_id": "S3 location", "data": fine_data} # extract data month_keys = list(report_data.keys()) month_keys.sort(reverse=True) for month in month_keys: fine_data.append(report_data[month][0]) day_keys = list(report_data[month][1].keys()) day_keys.sort(reverse = True) for day in day_keys: fine_data.append(report_data[month][1][day]) # save daily reports report_id = root_report_id + "_" + day start_date = datetime.strptime(day, "%Y%m%d") start_date_with_timezone = timezone.make_aware(start_date) end_date_with_timezone = start_date_with_timezone + timezone.timedelta(days=1) _average_OCR_time = {"invoice": self.data[month][1][day]["average_processing_time"]["invoice"](), "imei": self.data[month][1][day]["average_processing_time"]["imei"](), "invoice_count": self.data[month][1][day]["average_processing_time"]["invoice"].count, "imei_count": self.data[month][1][day]["average_processing_time"]["imei"].count} _average_OCR_time["avg"] = (_average_OCR_time["invoice"]*_average_OCR_time["invoice_count"] + _average_OCR_time["imei"]*_average_OCR_time["imei_count"])/(_average_OCR_time["imei_count"] + _average_OCR_time["invoice_count"]) if (_average_OCR_time["imei_count"] + _average_OCR_time["invoice_count"]) > 0 else None acumulated_acc = {"feedback_accuracy": {}, "reviewed_accuracy": {}} for acc_type in ["feedback_accuracy", "reviewed_accuracy"]: avg_acc = IterAvg() for key in ["imei_number", "purchase_date", "retailername", "sold_to_party"]: acumulated_acc[acc_type][key] = self.data[month][1][day][acc_type][key]() acumulated_acc[acc_type][key+"_count"] = self.data[month][1][day][acc_type][key].count avg_acc.add_avg(acumulated_acc[acc_type][key], acumulated_acc[acc_type][key+"_count"]) acumulated_acc[acc_type]["avg"] = avg_acc() acumulated_acc[acc_type]["avg_count"] = avg_acc.count new_report: Report = Report( report_id=report_id, is_daily_report=is_daily_report, subsidiary=self.sub.lower().replace(" ", ""), include_test=include_test, start_at=start_date_with_timezone, end_at=end_date_with_timezone, status="Ready", number_request=report_data[month][1][day]["num_request"], number_images=report_data[month][1][day]["total_images"], number_imei=report_data[month][1][day]["num_imei"], number_invoice=report_data[month][1][day]["num_invoice"], number_bad_images=report_data[month][1][day]["images_quality"]["bad"], average_OCR_time=_average_OCR_time, number_imei_transaction=report_data[month][1][day]["usage"]["imei"], number_invoice_transaction=report_data[month][1][day]["usage"]["invoice"], feedback_accuracy=acumulated_acc["feedback_accuracy"], reviewed_accuracy=acumulated_acc["reviewed_accuracy"], ) if is_daily_report: new_report.save() data = extract_report_detail_list(self.data[month][1][day]["report_files"], lower=True) data_workbook = dict2xlsx(data, _type='report_detail') local_workbook = save_workbook_file(report_id + ".xlsx", new_report, data_workbook) s3_key=save_report_to_S3(report_id, local_workbook) return fine_data, save_data def get(self) -> Any: # FIXME: This looks like a junk _data = copy.deepcopy(self.data) for month in _data.keys(): _data[month][0]["images_quality"]["successful_percent"] = _data[month][0]["images_quality"]["successful"]/_data[month][0]["total_images"] if _data[month][0]["total_images"] > 0 else 0 _data[month][0]["images_quality"]["bad_percent"] = _data[month][0]["images_quality"]["bad"]/_data[month][0]["total_images"] if _data[month][0]["total_images"] > 0 else 0 num_transaction_imei = 0 num_transaction_invoice = 0 for day in _data[month][1].keys(): num_transaction_imei += _data[month][1][day]["usage"].get("imei", 0) num_transaction_invoice += _data[month][1][day]["usage"].get("invoice", 0) _data[month][1][day]["average_accuracy_rate"]["imei"] = _data[month][1][day]["average_accuracy_rate"]["imei"]() _data[month][1][day]["average_accuracy_rate"]["purchase_date"] = _data[month][1][day]["average_accuracy_rate"]["purchase_date"]() _data[month][1][day]["average_accuracy_rate"]["retailer_name"] = _data[month][1][day]["average_accuracy_rate"]["retailer_name"]() _data[month][1][day]["average_accuracy_rate"]["sold_to_party"] = _data[month][1][day]["average_accuracy_rate"]["sold_to_party"]() for key in _data[month][1][day]["average_processing_time"].keys(): _data[month][1][day]["average_processing_time"][key] = _data[month][1][day]["average_processing_time"][key]() _data[month][1][day]["feedback_accuracy"]["imei_number"] = _data[month][1][day]["feedback_accuracy"]["imei_number"]() _data[month][1][day]["feedback_accuracy"]["purchase_date"] = _data[month][1][day]["feedback_accuracy"]["purchase_date"]() _data[month][1][day]["feedback_accuracy"]["retailername"] = _data[month][1][day]["feedback_accuracy"]["retailername"]() _data[month][1][day]["feedback_accuracy"]["sold_to_party"] = _data[month][1][day]["feedback_accuracy"]["sold_to_party"]() _data[month][1][day]["reviewed_accuracy"]["imei_number"] = _data[month][1][day]["reviewed_accuracy"]["imei_number"]() _data[month][1][day]["reviewed_accuracy"]["purchase_date"] = _data[month][1][day]["reviewed_accuracy"]["purchase_date"]() _data[month][1][day]["reviewed_accuracy"]["retailername"] = _data[month][1][day]["reviewed_accuracy"]["retailername"]() _data[month][1][day]["reviewed_accuracy"]["sold_to_party"] = _data[month][1][day]["reviewed_accuracy"]["sold_to_party"]() _data[month][1][day].pop("report_files") _data[month][1][day]["images_quality"]["successful_percent"] = _data[month][1][day]["images_quality"]["successful"]/_data[month][1][day]["total_images"] if _data[month][1][day]["total_images"] > 0 else 0 _data[month][1][day]["images_quality"]["bad_percent"] = _data[month][1][day]["images_quality"]["bad"]/_data[month][1][day]["total_images"] if _data[month][1][day]["total_images"] > 0 else 0 _data[month][0]["usage"]["imei"] = num_transaction_imei _data[month][0]["usage"]["invoice"] = num_transaction_invoice _data[month][0]["average_accuracy_rate"]["imei"] = _data[month][0]["average_accuracy_rate"]["imei"]() _data[month][0]["average_accuracy_rate"]["purchase_date"] = _data[month][0]["average_accuracy_rate"]["purchase_date"]() _data[month][0]["average_accuracy_rate"]["retailer_name"] = _data[month][0]["average_accuracy_rate"]["retailer_name"]() _data[month][0]["average_accuracy_rate"]["sold_to_party"] = _data[month][0]["average_accuracy_rate"]["sold_to_party"]() for key in _data[month][0]["average_processing_time"].keys(): _data[month][0]["average_processing_time"][key] = _data[month][0]["average_processing_time"][key]() _data[month][0]["feedback_accuracy"]["imei_number"] = _data[month][0]["feedback_accuracy"]["imei_number"]() _data[month][0]["feedback_accuracy"]["purchase_date"] = _data[month][0]["feedback_accuracy"]["purchase_date"]() _data[month][0]["feedback_accuracy"]["retailername"] = _data[month][0]["feedback_accuracy"]["retailername"]() _data[month][0]["feedback_accuracy"]["sold_to_party"] = _data[month][0]["feedback_accuracy"]["sold_to_party"]() _data[month][0]["reviewed_accuracy"]["imei_number"] = _data[month][0]["reviewed_accuracy"]["imei_number"]() _data[month][0]["reviewed_accuracy"]["purchase_date"] = _data[month][0]["reviewed_accuracy"]["purchase_date"]() _data[month][0]["reviewed_accuracy"]["retailername"] = _data[month][0]["reviewed_accuracy"]["retailername"]() _data[month][0]["reviewed_accuracy"]["sold_to_party"] = _data[month][0]["reviewed_accuracy"]["sold_to_party"]() return _data class MonthReportAccumulate: def __init__(self): self.month = None self.total = { 'subs': "+", 'extraction_date': "Subtotal ()", 'total_images': 0, 'images_quality': { 'successful': 0, 'successful_percent': 0, 'bad': 0, 'bad_percent': 0 }, 'average_accuracy_rate': { 'imei': IterAvg(), 'purchase_date': IterAvg(), 'retailer_name': IterAvg() }, 'average_processing_time': { 'imei': IterAvg(), 'invoice': IterAvg() }, 'usage': { 'imei':0, 'invoice': 0 } } self.data = [] self.data_format = { 'subs': "", 'extraction_date': "", 'num_imei': 0, 'num_invoice': 0, 'total_images': 0, 'images_quality': { 'successful': 0, 'successful_percent': 0, 'bad': 0, 'bad_percent': 0 }, 'average_accuracy_rate': { 'imei': 0, 'purchase_date': 0, 'retailer_name': 0 }, 'average_processing_time': { 'imei': 0, 'invoice': 0 }, 'usage': { 'imei':0, 'invoice': 0 } }, def accumulate(self, report): self.total["total_images"] += report.number_images self.total["images_quality"]["successful"] += report.number_images - report.number_bad_images self.total["images_quality"]["bad"] += report.number_bad_images if sum([report.reviewed_accuracy[x] for x in report.reviewed_accuracy.keys() if "_count" not in x]) > 0 : self.total["average_accuracy_rate"]["imei"].add_avg(report.reviewed_accuracy.get("imei_number", 0), report.reviewed_accuracy.get("imei_number_count", 0)) self.total["average_accuracy_rate"]["purchase_date"].add_avg(report.reviewed_accuracy.get("purchase_date", 0), report.reviewed_accuracy.get("purchase_date_count", 0)) self.total["average_accuracy_rate"]["retailer_name"].add_avg(report.reviewed_accuracy.get("retailername", 0), report.reviewed_accuracy.get("retailername_count", 0)) elif sum([ report.feedback_accuracy[x] for x in report.feedback_accuracy.keys() if "_count" not in x]) > 0: self.total["average_accuracy_rate"]["imei"].add_avg(report.feedback_accuracy.get("imei_number", 0), report.feedback_accuracy.get("imei_number_count", 0)) self.total["average_accuracy_rate"]["purchase_date"].add_avg(report.feedback_accuracy.get("purchase_date", 0), report.feedback_accuracy.get("purchase_date_count", 0)) self.total["average_accuracy_rate"]["retailer_name"].add_avg(report.feedback_accuracy.get("retailername", 0), report.feedback_accuracy.get("retailername_count", 0)) self.total["average_processing_time"]["imei"].add_avg(report.average_OCR_time.get("imei", 0), report.average_OCR_time.get("imei_count", 0)) if report.average_OCR_time else 0 self.total["average_processing_time"]["invoice"].add_avg(report.average_OCR_time.get("invoice", 0), report.average_OCR_time.get("invoice_count", 0)) if report.average_OCR_time else 0 self.total["usage"]["imei"] += report.number_imei_transaction self.total["usage"]["invoice"] += report.number_invoice_transaction def add(self, report): report_month = report.start_at.month if self.month is None: self.month = report_month self.total["extraction_date"] = f"Subtotal ({self.month})" elif self.month != report_month: self.total["images_quality"]["successful_percent"] += self.total["images_quality"]["successful"]/self.total["total_images"] self.total["images_quality"]["bad_percent"] += self.total["images_quality"]["bad"]/self.total["total_images"] return False # Reports from a different month, stop accumulating # accumulate fields new_data = copy.deepcopy(self.data_format)[0] new_data["num_imei"] = report.number_imei new_data["subs"] = report.subsidiary new_data["extraction_date"] = report.start_at new_data["num_invoice"] = report.number_invoice new_data["total_images"] = report.number_images new_data["images_quality"]["successful"] = report.number_images - report.number_bad_images new_data["images_quality"]["bad"] = report.number_bad_images report.reviewed_accuracy = {} if report.reviewed_accuracy is None else report.reviewed_accuracy report.feedback_accuracy = {} if report.feedback_accuracy is None else report.feedback_accuracy if sum([ report.reviewed_accuracy[x] for x in report.reviewed_accuracy.keys() if "_count" not in x]): new_data["average_accuracy_rate"]["imei"] = report.reviewed_accuracy.get("imei_number", None) new_data["average_accuracy_rate"]["purchase_date"] = report.reviewed_accuracy.get("purchase_date", None) new_data["average_accuracy_rate"]["retailer_name"] = report.reviewed_accuracy.get("retailername", None) elif sum([ report.feedback_accuracy[x] for x in report.feedback_accuracy.keys() if "_count" not in x]): new_data["average_accuracy_rate"]["imei"] = report.feedback_accuracy.get("imei_number", None) new_data["average_accuracy_rate"]["purchase_date"] = report.feedback_accuracy.get("purchase_date", None) new_data["average_accuracy_rate"]["retailer_name"] = report.feedback_accuracy.get("retailername", None) new_data["average_processing_time"]["imei"] = report.average_OCR_time.get("imei", 0) if report.average_OCR_time else 0 new_data["average_processing_time"]["invoice"] = report.average_OCR_time.get("invoice", 0) if report.average_OCR_time else 0 new_data["usage"]["imei"] = report.number_imei_transaction new_data["usage"]["invoice"] = report.number_invoice_transaction new_data["images_quality"]["successful_percent"] += new_data["images_quality"]["successful"]/new_data["total_images"] if new_data["total_images"] else 0 new_data["images_quality"]["bad_percent"] += new_data["images_quality"]["bad"]/new_data["total_images"] if new_data["total_images"] else 0 self.data.append(new_data) self.accumulate(report) return True def clear(self): self.month = None self.total = { 'subs': "+", 'extraction_date': "Subtotal ()", 'total_images': 0, 'images_quality': { 'successful': 0, 'successful_percent': 0, 'bad': 0, 'bad_percent': 0 }, 'average_accuracy_rate': { 'imei': IterAvg(), 'purchase_date': IterAvg(), 'retailer_name': IterAvg() }, 'average_processing_time': { 'imei': IterAvg(), 'invoice': IterAvg() }, 'usage': { 'imei':0, 'invoice': 0 } } self.data = [] def __call__(self): total = copy.deepcopy(self.total) total["images_quality"]["successful_percent"] = total["images_quality"]["successful"]/total["total_images"] if total["total_images"] else 0 total["images_quality"]["bad_percent"] = total["images_quality"]["bad"]/total["total_images"] if total["total_images"] else 0 total["average_accuracy_rate"]["imei"] = total["average_accuracy_rate"]["imei"]() total["average_accuracy_rate"]["purchase_date"] = total["average_accuracy_rate"]["purchase_date"]() total["average_accuracy_rate"]["retailer_name"] = total["average_accuracy_rate"]["retailer_name"]() total["average_processing_time"]["imei"] = total["average_processing_time"]["imei"]() total["average_processing_time"]["invoice"] = total["average_processing_time"]["invoice"]() return self.month, self.data, total class IterAvg: def __init__(self, name="default"): self.name = name self.avg = 0 self.count = 0 def add(self, values): """ Args: values (list[float]): """ values = [x for x in values if x is not None] if len(values) == 0: return self.avg = (self.avg*self.count + sum(values))/(self.count+len(values)) self.count += len(values) def add_avg(self, avg, count): if avg is None or count is None or count == 0: return self.count += count self.avg = (self.avg*(self.count-count) + avg*count)/(self.count) def __call__(self): return self.avg def validate_feedback_file(feedback, predict): if feedback: imei_feedback = feedback.get("imei_number", []) imei_feedback = [x for x in imei_feedback if x != ""] num_imei_feedback = len(imei_feedback) num_imei_predict = len(predict.get("imei_number", [])) if num_imei_feedback != num_imei_predict: return False return True def first_of_list(the_list): if not the_list: return None return the_list[0] def extract_report_detail_list(report_detail_list, lower=False, in_percent=True): data = [] for report_file in report_detail_list: data.append({ "Request ID": report_file.correspond_request_id, "Redemption Number": report_file.correspond_redemption_id, "Image type": report_file.doc_type, "IMEI_user submitted": first_of_list(report_file.feedback_result.get("imei_number", [None])), "IMEI_OCR retrieved": first_of_list(report_file.predict_result.get("imei_number", [None])), "IMEI1 Accuracy": first_of_list(report_file.feedback_accuracy.get("imei_number", [None])), "Invoice_Purchase Date_Consumer": report_file.feedback_result.get("purchase_date", None), "Invoice_Purchase Date_OCR": report_file.predict_result.get("purchase_date", []), "Invoice_Purchase Date Accuracy": first_of_list(report_file.feedback_accuracy.get("purchase_date", [None])), "Invoice_Retailer_Consumer": report_file.feedback_result.get("retailername", None), "Invoice_Retailer_OCR": report_file.predict_result.get("retailername", None), "Invoice_Retailer Accuracy": first_of_list(report_file.feedback_accuracy.get("retailername", [None])), "OCR Image Accuracy": report_file.acc, "OCR Image Speed (seconds)": report_file.time_cost, "Reviewed?": "No", "Bad Image Reasons": report_file.bad_image_reason, "Countermeasures": report_file.counter_measures, "IMEI_Revised Accuracy": first_of_list(report_file.reviewed_accuracy.get("imei_number", [None])), "Purchase Date_Revised Accuracy": first_of_list(report_file.reviewed_accuracy.get("purchase_date", [None])), "Retailer_Revised Accuracy": first_of_list(report_file.reviewed_accuracy.get("retailername", [None])), }) if lower: for i, dat in enumerate(data): keys = list(dat.keys()) for old_key in keys: data[i][old_key.lower().replace(" ", "_")] = data[i].pop(old_key) if in_percent: for i, dat in enumerate(data): keys = [x for x in list(dat.keys()) if "accuracy" in x.lower()] for key in keys: if data[i][key]: data[i][key] = data[i][key]*100 return data def count_transactions(start_date, end_date, subsidiary="all"): base_query = Q(created_at__range=(start_date, end_date)) base_query &= Q(is_test_request=False) if subsidiary and subsidiary.lower().replace(" ", "")!="all": base_query &= Q(redemption_id__startswith=subsidiary) transaction_att = {} print(f"[DEBUG]: atracting transactions attribute...") total_transaction_requests = SubscriptionRequest.objects.filter(base_query).order_by('created_at') for request in total_transaction_requests: if not request.doc_type: continue doc_types = request.doc_type.split(",") for doc_type in doc_types: if transaction_att.get(doc_type, None) == None: transaction_att[doc_type] = 1 else: transaction_att[doc_type] += 1 if not transaction_att.get("request", None): transaction_att["request"] = 1 else: transaction_att["request"] += 1 return transaction_att def convert_datetime_format(date_string: str, is_gt=False) -> str: # pattern_date_string = "2023-02-28" input_format = "%Y-%m-%d" output_format = "%d/%m/%Y" # Validate the input date string format pattern = r"\d{4}-\d{2}-\d{2}" if re.match(pattern, date_string): # Convert the date string to a datetime object date_object = datetime.strptime(date_string, input_format) # Convert the datetime object to the desired output format formatted_date = date_object.strftime(output_format) return formatted_date return date_string def predict_result_to_ready(result): dict_result = {"retailername": "", "sold_to_party": "", "purchase_date": [], "imei_number": [],} dict_result["retailername"] = result.get("content", {}).get("document", [{}])[0].get("content", [{}])[0].get("value", None) dict_result["sold_to_party"] = result.get("content", {}).get("document", [{}])[0].get("content", [{}, {}])[1].get("value", None) dict_result["purchase_date"] = result.get("content", {}).get("document", [{}])[0].get("content", [{}, {}, {}])[2].get("value", []) dict_result["imei_number"] = result.get("content", {}).get("document", [{}])[0].get("content", [{}, {}, {}, {}])[3].get("value", []) return dict_result def align_fine_result(ready_predict, fine_result): # print(f"[DEBUG]: fine_result: {fine_result}") # print(f"[DEBUG]: ready_predict: {ready_predict}") if fine_result: if fine_result["purchase_date"] and len(ready_predict["purchase_date"]) == 0: ready_predict["purchase_date"] = [None] if fine_result["retailername"] and not ready_predict["retailername"]: ready_predict["retailername"] = [None] fine_result["purchase_date"] = [fine_result["purchase_date"] for _ in range(len(ready_predict["purchase_date"]))] # else: # fine_result = {} # for key in ready_predict.keys(): # fine_result[key] = [] # fine_result["purchase_date"] = [None for _ in range(len(ready_predict["purchase_date"]))] return ready_predict, fine_result def update_temp_accuracy(accuracy, acc, keys): for key in keys: accuracy[key].add(acc[key]) return accuracy def calculate_accuracy(key_name, inference, target): """_summary_ Args: key_name (string): key to calculate accuracy on, ex: retailername inference (dict): result from ocr, refined to align with the target down below target (dict): result of type """ acc = [] data = [] if not target or not inference: return acc, data if not isinstance(inference[key_name], list): if inference[key_name] is None: inference[key_name] = [] else: inference[key_name] = [inference[key_name]] if not isinstance(target[key_name], list): if target[key_name] is None: target[key_name] = [] else: target[key_name] = [target[key_name]] for i, v in enumerate(inference[key_name]): # TODO: target[key_name][i] is None, "" x = post_processing_str(key_name, inference[key_name][i], is_gt=False) y = post_processing_str(key_name, target[key_name][i], is_gt=True) score = eval_ocr_metric( [x], [y], metric=[ "one_minus_ned", # "line_acc_ignore_case_symbol", # "line_acc", # "one_minus_ned_word", ]) acc.append(list(score.values())[0]) data.append([x, y]) return acc, data def calculate_avg_accuracy(acc, type, keys=[]): acc_list = [] # print(f"[DEBUG]: type: {type} - acc: {acc}") for key in keys: acc_list += acc.get(type, {}).get(key, []) acc_list = [x for x in acc_list if x is not None] return sum(acc_list)/len(acc_list) if len(acc_list) > 0 else None def calculate_and_save_subcription_file(report, request): request_att = {"acc": {"feedback": {"imei_number": [], "purchase_date": [], "retailername": [], "sold_to_party": [], }, "reviewed": {"imei_number": [], "purchase_date": [], "retailername": [], "sold_to_party": [], }}, "err": [], "time_cost": {}, "total_images": 0, "bad_images": 0} images = SubscriptionRequestFile.objects.filter(request=request) for image in images: status, att = calculate_subcription_file(image) if status != 200: continue image.feedback_accuracy = att["acc"]["feedback"] image.reviewed_accuracy = att["acc"]["reviewed"] image.is_bad_image_quality = att["is_bad_image"] image.save() new_report_file = ReportFile(report=report, correspond_request_id=request.request_id, correspond_redemption_id=request.redemption_id, doc_type=image.doc_type, predict_result=image.predict_result, feedback_result=image.feedback_result, reviewed_result=image.reviewed_result, feedback_accuracy=att["acc"]["feedback"], reviewed_accuracy=att["acc"]["reviewed"], acc=att["avg_acc"], time_cost=image.processing_time, is_bad_image=att["is_bad_image"], bad_image_reason=image.reason, counter_measures=image.counter_measures, error="|".join(att["err"]) ) new_report_file.save() if request_att["time_cost"].get(image.doc_type, None): request_att["time_cost"][image.doc_type].append(image.processing_time) else: request_att["time_cost"][image.doc_type] = [image.processing_time] try: request_att["acc"]["feedback"]["imei_number"] += att["acc"]["feedback"]["imei_number"] request_att["acc"]["feedback"]["purchase_date"] += att["acc"]["feedback"]["purchase_date"] request_att["acc"]["feedback"]["retailername"] += att["acc"]["feedback"]["retailername"] request_att["acc"]["feedback"]["sold_to_party"] += att["acc"]["feedback"]["sold_to_party"] request_att["acc"]["reviewed"]["imei_number"] += att["acc"]["reviewed"]["imei_number"] request_att["acc"]["reviewed"]["purchase_date"] += att["acc"]["reviewed"]["purchase_date"] request_att["acc"]["reviewed"]["retailername"] += att["acc"]["reviewed"]["retailername"] request_att["acc"]["reviewed"]["sold_to_party"] += att["acc"]["reviewed"]["sold_to_party"] request_att["bad_images"] += int(att["is_bad_image"]) request_att["total_images"] += 1 request_att["err"] += att["err"] except Exception as e: print(e) continue return request_att def calculate_a_request(report, request): request_att = {"acc": {"feedback": {"imei_number": [], "purchase_date": [], "retailername": [], "sold_to_party": [], }, "reviewed": {"imei_number": [], "purchase_date": [], "retailername": [], "sold_to_party": [], }}, "err": [], "time_cost": {}, "total_images": 0, "bad_images": 0} images = SubscriptionRequestFile.objects.filter(request=request) report_files = [] for image in images: status, att = calculate_subcription_file(image) if status != 200: continue image.feedback_accuracy = att["acc"]["feedback"] image.reviewed_accuracy = att["acc"]["reviewed"] image.is_bad_image_quality = att["is_bad_image"] image.save() new_report_file = ReportFile(report=report, correspond_request_id=request.request_id, correspond_redemption_id=request.redemption_id, doc_type=image.doc_type, predict_result=image.predict_result, feedback_result=image.feedback_result, reviewed_result=image.reviewed_result, feedback_accuracy=att["acc"]["feedback"], reviewed_accuracy=att["acc"]["reviewed"], acc=att["avg_acc"], is_bad_image=att["is_bad_image"], time_cost=image.processing_time, bad_image_reason=image.reason, counter_measures=image.counter_measures, error="|".join(att["err"]) ) report_files.append(new_report_file) if request_att["time_cost"].get(image.doc_type, None): request_att["time_cost"][image.doc_type].append(image.processing_time) else: request_att["time_cost"][image.doc_type] = [image.processing_time] try: request_att["acc"]["feedback"]["imei_number"] += att["acc"]["feedback"]["imei_number"] request_att["acc"]["feedback"]["purchase_date"] += att["acc"]["feedback"]["purchase_date"] request_att["acc"]["feedback"]["retailername"] += att["acc"]["feedback"]["retailername"] request_att["acc"]["feedback"]["sold_to_party"] += att["acc"]["feedback"]["sold_to_party"] request_att["acc"]["reviewed"]["imei_number"] += att["acc"]["reviewed"]["imei_number"] request_att["acc"]["reviewed"]["purchase_date"] += att["acc"]["reviewed"]["purchase_date"] request_att["acc"]["reviewed"]["retailername"] += att["acc"]["reviewed"]["retailername"] request_att["acc"]["reviewed"]["sold_to_party"] += att["acc"]["reviewed"]["sold_to_party"] request_att["bad_images"] += int(att["is_bad_image"]) request_att["total_images"] += 1 request_att["err"] += att["err"] except Exception as e: print(e) continue return request_att, report_files def calculate_subcription_file(subcription_request_file): att = {"acc": {"feedback": {}, "reviewed": {}}, "err": [], "is_bad_image": False, "avg_acc": None} if not subcription_request_file.predict_result: return 400, att inference_result = copy.deepcopy(subcription_request_file.predict_result) inference_result, feedback_result = align_fine_result(inference_result, copy.deepcopy(subcription_request_file.feedback_result)) inference_result, reviewed_result = align_fine_result(inference_result, copy.deepcopy(subcription_request_file.reviewed_result)) # print(f"[DEBUG]: predict_result: {subcription_request_file.predict_result}") # print(f"[DEBUG]: inference_result: {inference_result}") # print(f"[DEBUG]: feedback_result: {feedback_result}") # print(f"[DEBUG]: reviewed_result: {reviewed_result}") for key_name in valid_keys: try: att["acc"]["feedback"][key_name], _ = calculate_accuracy(key_name, inference_result, feedback_result) att["acc"]["reviewed"][key_name], _ = calculate_accuracy(key_name, inference_result, reviewed_result) except Exception as e: att["err"].append(str(e)) # print(f"[DEBUG]: e: {e} -key_name: {key_name}") avg_reviewed = calculate_avg_accuracy(att["acc"], "reviewed", ["retailername", "sold_to_party", "purchase_date", "imei_number"]) avg_feedback = calculate_avg_accuracy(att["acc"], "feedback", ["retailername", "sold_to_party", "purchase_date", "imei_number"]) if avg_feedback is not None or avg_reviewed is not None: avg_acc = max([x for x in [avg_feedback, avg_reviewed] if x is not None]) if avg_acc < BAD_THRESHOLD: att["is_bad_image"] = True # exclude bad images for key_name in valid_keys: att["acc"]["feedback"][key_name] = [] att["acc"]["reviewed"][key_name] = [] att["avg_acc"] = None else: att["avg_acc"] = avg_acc return 200, att def calculate_attributions(request): # for one request, return in order acc = {"feedback": {}, "reviewed": {}} # {"feedback": {"retailername": [0.1], "sold_to_party":[0.9], "purchase_date":[0.6], "imei_number":[0.8]}, # "reviewed": {"retailername": [0.1], "sold_to_party":[0.9], "purchase_date":[0.6], "imei_number":[0.8]}} data = {"feedback": {}, "reviewed": {}} # {"feedback": {"retailername": [[ocr, feedback], ...], "sold_to_party":[[ocr, feedback], ...], "purchase_date":[[ocr, feedback], ...], "imei_number":[[ocr, feedback], ...]}} # {"reviewed": {"retailername": [[ocr, reviewed], ...], "sold_to_party":[[ocr, reviewed], ...], "purchase_date":[[ocr, reviewed], ...], "imei_number":[[ocr, reviewed], ...]}} time_cost = {} # {"imei": [0.1], "invoice": [0.1]} image_quality_num = [0, 0] # [good, bad] image_quality_num[0] = len(request.doc_type.split(",")) error = "" inference_result = predict_result_to_ready(request.predict_result) reviewed_result = align_fine_result(inference_result, request.reviewed_result) feedback_result = align_fine_result(inference_result, request.feedback_result) # accuracy calculation for key_name in valid_keys: if isinstance(inference_result[key_name], list): if len(inference_result[key_name]) != len(reviewed_result.get(key_name, [])): error = f"Request {request.request_id} failed with different {key_name} in predict and reviewed_result" break if len(inference_result[key_name]) != len(feedback_result.get(key_name, [])): error = f"Request {request.request_id} failed with different {key_name} in predict and feedback_result" break # calculate accuracy for feedback result acc["feedback"][key_name], data["feedback"][key_name] = calculate_accuracy(key_name, inference_result, feedback_result) acc["reviewed"][key_name], data["reviewed"][key_name] = calculate_accuracy(key_name, inference_result, reviewed_result) else: inference_result[key_name] = [inference_result[key_name]] feedback_result[key_name] = [feedback_result[key_name]] reviewed_result[key_name] = [reviewed_result[key_name]] acc["feedback"][key_name], data["feedback"][key_name] = calculate_accuracy(key_name, inference_result, feedback_result) acc["reviewed"][key_name], data["reviewed"][key_name] = calculate_accuracy(key_name, inference_result, reviewed_result) acc["feedback"]["purchase_date"] = [max(acc["feedback"]["purchase_date"])] if len(acc["feedback"]["purchase_date"]) > 0 else [] acc["reviewed"]["purchase_date"] = [max(acc["reviewed"]["purchase_date"])] if len(acc["reviewed"]["purchase_date"]) > 0 else [] # Count for bad and total images avg_invoice_feedback = calculate_avg_accuracy(acc, "feedback", ["retailername", "sold_to_party", "purchase_date"]) avg_invoice_reviewed = calculate_avg_accuracy(acc, "reviewed", ["retailername", "sold_to_party", "purchase_date"]) if avg_invoice_feedback is not None or avg_invoice_reviewed is not None: if max([x for x in [avg_invoice_feedback, avg_invoice_reviewed] if x is not None]) < BAD_THRESHOLD: image_quality_num[1] += 1 for i, _ in enumerate(acc["feedback"]["imei_number"]): if acc["feedback"]["imei_number"][i] is not None and acc["reviewed"]["imei_number"][i] is not None: if max([x for x in [acc["feedback"]["imei_number"][i], acc["reviewed"]["imei_number"][i]] if x is not None]) < BAD_THRESHOLD: image_quality_num[1] += 1 # time cost and quality calculation # TODO: to be deprecated, doc_type would be in file level in the future try: for doc_type, doc_profile in request.ai_inference_profile.items(): doc_type = doc_type.split("_")[0] inference_time = doc_profile["inference"][1][0] - doc_profile["inference"][0] postprocess_time = doc_profile["postprocess"][1] - doc_profile["postprocess"][0] time_cost[doc_type].append(inference_time + postprocess_time) except Exception as e: error = f"Request id {request.request_id} failed with error: {e}" return acc, data, time_cost, image_quality_num, error def shadow_report(report_id, query): c_connector.make_a_report_2( (report_id, query))