diff --git a/cope2n-api/fwd_api/management/commands/migrate-datebase-fillup-images.py b/cope2n-api/fwd_api/management/commands/migrate-datebase-fillup-images.py index 6930500..0947a9a 100644 --- a/cope2n-api/fwd_api/management/commands/migrate-datebase-fillup-images.py +++ b/cope2n-api/fwd_api/management/commands/migrate-datebase-fillup-images.py @@ -14,8 +14,12 @@ from django.utils import timezone IMAGE_DIRS = ["/external_data/SGGE", "/external_data/zipsGwp1", "/external_data/zipsGwp2", "/external_data/zipsGwp3", "/external_data/zipsGwp4", "/external_data/zipsEvoucher"] # IMAGE_DIRS = ["/external_data/SGGE"] -image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.gif'] -pdf_extensions = ['*.pdf'] +image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.gif', '*.JPG', '*.JPEG', '*.PNG', '*.GIF'] +pdf_extensions = ['*.pdf', '*.PDF'] + +IGNORE_MULTIPLE_IMAGE = True +BET_ON_FIRST_IMAGE = True # Try to upload the first image to monoimage-request +PROVIDED_MONO_REDEMPTION_MAKE_IT_INVOICE = ["SG"] class Command(BaseCommand): help = 'Refactor database for image level' @@ -30,10 +34,13 @@ class Command(BaseCommand): for redemtion_dir in redemtion_dirs: redemptions = os.listdir(redemtion_dir) for redemption in redemptions: + if "." 
+ redemption.split(".")[-1] in image_extensions + pdf_extensions + [".csv", ".zip"]: + continue # ignore non-folder files_in_dir = [] for ext in image_extensions + pdf_extensions: files_in_dir.extend(glob.glob(os.path.join(redemtion_dir, redemption, ext))) - redemption = redemption.replace("Data", "") + files_in_dir = sorted(files_in_dir) + redemption = redemption.replace("Data", "").replace("(pdf)", "") if prepared_data.get(redemption, None): prepared_data[redemption]["image_paths"] += files_in_dir prepared_data[redemption]["pages"] += len(files_in_dir) @@ -42,29 +49,23 @@ class Command(BaseCommand): return prepared_data - def _add_error(self, result, error, redemption_id): - if not result.get("Error", None): - result["Error"] = {} - if result["Error"].get(error, None): - result["Error"][error].add(redemption_id) + def _add_log(self, result, log, redemption_id, log_level): + if not result.get(log_level, None): + result[log_level] = {} + log = "[{}]".format(redemption_id[:2]) + log + if result[log_level].get(log, None): + result[log_level][log].add(redemption_id) else: - result["Error"][error] = set([redemption_id]) + result[log_level][log] = set([redemption_id]) + + def _add_error(self, result, error, redemption_id): + self._add_log(result, error, redemption_id, "Error") def _add_info(self, result, info, redemption_id): - if not result.get("Info", None): - result["Info"] = {} - if result["Info"].get(info, None): - result["Info"][info].add(redemption_id) - else: - result["Info"][info] = set([redemption_id]) + self._add_log(result, info, redemption_id, "Info") def _add_warning(self, result, warn, redemption_id): - if not result.get("Warning", None): - result["Warning"] = {} - if result["Warning"].get(warn, None): - result["Warning"][warn].add(redemption_id) - else: - result["Warning"][warn] = set([redemption_id]) + self._add_log(result, warn, redemption_id, "Warning") def _try_find_doc_type(self, file_paths): doc_types = {"invoice": [], @@ -88,16 +89,58 @@ class 
Command(BaseCommand): return # Find to coresponding redemption_ID self._add_info(result, "[OCR]: redemptions", request.redemption_id) + self._add_info(result, "[OCR]: total {} images".format(request.pages), request.redemption_id) if request.redemption_id not in list(data.keys()): self._add_error(result, "[OCR]: Not found redemption_ID", request.redemption_id) return + if request.pages != data[request.redemption_id]["pages"]: self._add_error(result, "[SBT]: Mismatch files number in a request", request.redemption_id) - return + if BET_ON_FIRST_IMAGE and request.pages == 1: + self._add_warning(result, "[SBT]: monoimage-request, bet on first one", request.redemption_id) + data[request.redemption_id]["image_paths"] = [data[request.redemption_id]["image_paths"][0]] + else: + return file_paths_by_doc_type = self._try_find_doc_type(data[request.redemption_id]["image_paths"]) + if request.redemption_id in [ + "SGE20240608115040-910", + "SGE20240607160017-644", + "SGE20240609095034-986", + "SGGE20240609145539-429", + "SGE20240607134340-431", + "SGE20240609073431-645", + "SGE20240608124611-070", + "SGE20240610120344-912", + "SGE20240610085917-775", + "SGGE20240609044518-869", + "SGE20240608093242-813", + "SGGE20240608175708-038", + "SGE20240607175952-926", + "SGE20240609060258-864", + "SGGE20240609144052-538", + "SGG20240607135057-187", + "SGE20240608133426-100", + "SGE20240607152408-300", + "SGG20240608162101-167", + "SGG20240608133730-021", + "SGE20240609103647-828" + ]: + print("{} - {} - {}".format(request.redemption_id[:2] in PROVIDED_MONO_REDEMPTION_MAKE_IT_INVOICE, request.redemption_id[:2], data[request.redemption_id]["pages"])) + if request.redemption_id[:2] in PROVIDED_MONO_REDEMPTION_MAKE_IT_INVOICE and data[request.redemption_id]["pages"] == 1: + self._add_warning(result, "[SBT]: mono-redemption, make it invoice", request.redemption_id) + file_paths_by_doc_type["invoice"] = data[request.redemption_id]["image_paths"] + file_paths_by_doc_type["imei"] = [] + 
file_paths_by_doc_type["undefined"] = [] + if len(file_paths_by_doc_type["undefined"]) > 0: self._add_warning(result, "[SBT]: Undefined doc type", request.redemption_id) + + + if request.pages > 1 or data[request.redemption_id]["pages"] > 1: + self._add_error(result, "[SBT]: request with multiple images", request.redemption_id) + if IGNORE_MULTIPLE_IMAGE: + return if len(request.request_id.split(".")[0].split("_")) < 2: return @@ -147,12 +190,15 @@ class Command(BaseCommand): subcription_iter = SubscriptionRequest.objects.filter(redemption_id__isnull=False) print(f"[INFO]: Preparing data for filling up...") prepared_data = self._prepare_data(IMAGE_DIRS) + # Log out prepared information + for k,v in prepared_data.items(): + self._add_info(result, "[Provided]: total {} images found".format(v["pages"]), k) print(f"[INFO]: Prepared data, total: {len(list(prepared_data.keys()))}") prepared_data_copy = copy.deepcopy(prepared_data) s3_client = MinioS3Client( # endpoint='http://107.120.133.27:9884', access_key='secret', - secret_key='secret+HRcfOsbXhx0YSNOLxdW', + secret_key='secret', bucket_name='ocr-sds' ) # file = open("modified.txt", "w") @@ -164,11 +210,12 @@ class Command(BaseCommand): for err in result.get("Error", []): print("[INFO]: Error: {}: {}".format(err, len(result["Error"][err]))) result["Error"][err] = list(result["Error"][err]) - for info in result.get("Info", []): - print("[INFO]: Info: {}: {}".format(info, len(result["Info"][info]))) - result["Info"][info] = list(result["Info"][info]) - for warn in result.get("Warning", []): - print("[INFO]: Warning: {}: {}".format(warn, len(result["Warning"][warn]))) - result["Warning"][warn] = list(result["Warning"][warn]) + + for log_level in ['Info', 'Error', 'Warning']: + errs = result.get(log_level, []) + errs = sorted(errs) + for err in errs: + print("[INFO]: {}: {}: {}".format(log_level, err, len(result[log_level][err]))) + result[log_level][err] = list(result[log_level][err]) with open("result.json", "w") as 
outfile: json.dump(result, outfile) \ No newline at end of file diff --git a/cope2n-api/fwd_api/utils/accuracy.py b/cope2n-api/fwd_api/utils/accuracy.py index 92c967f..01be84f 100755 --- a/cope2n-api/fwd_api/utils/accuracy.py +++ b/cope2n-api/fwd_api/utils/accuracy.py @@ -408,7 +408,7 @@ class ReportAccumulateByRequest: _report[accuracy_type][key] = _report[accuracy_type][key]() _report["average_accuracy_rate"]["avg"] = _report["average_accuracy_rate"]["avg"]() - _report["review_progress"] = _report["review_progress"].count(1)/(_report["review_progress"].count(0)+ _report["review_progress"].count(1)) if (_report["review_progress"].count(0)+ _report["review_progress"].count(1)) >0 else 0 + _report["review_progress"] = _report["review_progress"].count(1)/(_report["review_progress"].count(0)+ _report["review_progress"].count(1)) if (_report["review_progress"].count(0)+ _report["review_progress"].count(1)) >0 else 1.0 _report["images_quality"]["successful_percent"] = _report["images_quality"]["successful"]/_report["total_images"] if _report["total_images"] > 0 else 0 _report["images_quality"]["bad_percent"] = _report["images_quality"]["bad"]/_report["total_images"] if _report["total_images"] > 0 else 0 # export data for dashboard