from django.core.management.base import BaseCommand from tqdm import tqdm from fwd_api.models import SubscriptionRequestFile, SubscriptionRequest from fwd_api.exception.exceptions import InvalidException from fwd_api.utils.s3 import MinioS3Client import copy import os import glob import traceback import copy import json from django.utils import timezone IMAGE_DIRS = ["/external_data/SGGE", "/external_data/zipsGwp1", "/external_data/zipsGwp2", "/external_data/zipsGwp3", "/external_data/zipsGwp4", "/external_data/zipsEvoucher"] # IMAGE_DIRS = ["/external_data/SGGE"] image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.gif', '*.JPG', '*.JPEG', '*.PNG', '*.GÌ'] pdf_extensions = ['*.pdf', '*.PDF'] IGNORE_MULTIPLE_IMAGE = True BET_ON_FIRST_IMAGE = True # Try to upload the first image to monoimage-request PROVIDED_MONO_REDEMPTION_MAKE_IT_INVOICE = ["SG"] class Command(BaseCommand): help = 'Refactor database for image level' def add_arguments(self, parser): # Add your command-line arguments here parser.add_argument('start', type=str, help='start date, sample: 2023-01-02T00:00:00+0700') parser.add_argument('end', type=str, help='end date, sample: 2023-01-03T00:00:00+0700') def _prepare_data(self, redemtion_dirs): prepared_data = {} # {"redemption_id": {"image_paths": []}, "pages": 1} for redemtion_dir in redemtion_dirs: redemptions = os.listdir(redemtion_dir) for redemption in redemptions: if "." + redemption.split(".")[-1] in image_extensions + pdf_extensions + [".csv", ".zip"]: continue # ignore non-folder files_in_dir = [] for ext in image_extensions + pdf_extensions: files_in_dir.extend(glob.glob(os.path.join(redemtion_dir, redemption, ext))) files_in_dir = sorted(files_in_dir) redemption = redemption.replace("Data", "").replace("(pdf)", "") if prepared_data.get(redemption, None): prepared_data[redemption]["image_paths"] += files_in_dir prepared_data[redemption]["pages"] += len(files_in_dir) else: prepared_data[redemption] = {"image_paths": files_in_dir, "pages": len(files_in_dir)} return prepared_data def _add_log(self, result, log, redemption_id, log_level): if not result.get(log_level, None): result[log_level] = {} log = "[{}]".format(redemption_id[:2]) + log if result[log_level].get(log, None): result[log_level][log].add(redemption_id) else: result[log_level][log] = set([redemption_id]) def _add_error(self, result, error, redemption_id): self._add_log(result, error, redemption_id, "Error") def _add_info(self, result, info, redemption_id): self._add_log(result, info, redemption_id, "Info") def _add_warning(self, result, warn, redemption_id): self._add_log(result, warn, redemption_id, "Warning") def _try_find_doc_type(self, file_paths): doc_types = {"invoice": [], "imei": [], "undefined": []} for file_path in file_paths: if "invoice" in os.path.basename(file_path): doc_types["invoice"].append(file_path) elif "imei" in os.path.basename(file_path): doc_types["imei"].append(file_path) else: doc_types["undefined"].append(file_path) return doc_types def process_request(self, request, data, result, s3_client): if not request.predict_result: # self.stdout.write(self.style.WARNING(f"Key predict_result not found in {request.request_id}")) return if request.predict_result.get("status", 200) != 200: # self.stdout.write(self.style.WARNING(f"Not a sucess request {request.request_id}")) return # Find to coresponding redemption_ID self._add_info(result, "[OCR]: redemptions", request.redemption_id) self._add_info(result, "[OCR]: total {} images".format(request.pages), request.redemption_id) if request.redemption_id not in list(data.keys()): self._add_error(result, "[OCR]: Not found redemption_ID", request.redemption_id) return if request.pages != data[request.redemption_id]["pages"]: self._add_error(result, "[SBT]: Mismatch files number in a request", request.redemption_id) if BET_ON_FIRST_IMAGE and request.pages == 1: self._add_warning(result, "[SBT]: monoimage-request, bet on first one", request.redemption_id) data[request.redemption_id]["image_paths"] = [data[request.redemption_id]["image_paths"][0]] else: return file_paths_by_doc_type = self._try_find_doc_type(data[request.redemption_id]["image_paths"]) if request.redemption_id in [ "SGE20240608115040-910", "SGE20240607160017-644", "SGE20240609095034-986", "SGGE20240609145539-429", "SGE20240607134340-431", "SGE20240609073431-645", "SGE20240608124611-070", "SGE20240610120344-912", "SGE20240610085917-775", "SGGE20240609044518-869", "SGE20240608093242-813", "SGGE20240608175708-038", "SGE20240607175952-926", "SGE20240609060258-864", "SGGE20240609144052-538", "SGG20240607135057-187", "SGE20240608133426-100", "SGE20240607152408-300", "SGG20240608162101-167", "SGG20240608133730-021", "SGE20240609103647-828" ]: print("{} - {} - {}".format(request.redemption_id[:2] in PROVIDED_MONO_REDEMPTION_MAKE_IT_INVOICE, request.redemption_id[:2], data[request.redemption_id]["pages"])) if request.redemption_id[:2] in PROVIDED_MONO_REDEMPTION_MAKE_IT_INVOICE and data[request.redemption_id]["pages"] == 1: self._add_warning(result, "[SBT]: mono-redemption, make it invoice", request.redemption_id) file_paths_by_doc_type["invoice"] = data[request.redemption_id]["image_paths"] file_paths_by_doc_type["imei"] = [] file_paths_by_doc_type["undefined"] = [] if len(file_paths_by_doc_type["undefined"]) > 0: self._add_warning(result, "[SBT]: Undefined doc type", request.redemption_id) if request.pages > 1 or data[request.redemption_id]["pages"] > 1: self._add_error(result, "[SBT]: request with multiple images", request.redemption_id) if IGNORE_MULTIPLE_IMAGE: return if len(request.request_id.split(".")[0].split("_")) < 2: return images = SubscriptionRequestFile.objects.filter(request=request, file_category="Origin") for i, image in enumerate(images): if image.doc_type not in ["imei", "invoice"]: self._add_error(result, "[OCR]: Weird doc type", request.redemption_id) continue try: if len(file_paths_by_doc_type[image.doc_type]) > 0: local_file_path = file_paths_by_doc_type[image.doc_type].pop(0) else: local_file_path = file_paths_by_doc_type["undefined"].pop(0) predir = "sbt_invoice" s3_key = os.path.join(predir, request.request_id, image.file_name) # s3_client.upload_file(local_file_path, s3_key) result['total'] += 1 self._add_info(result, "[OCR]: Success", request.redemption_id) except IndexError as e: self._add_error(result, "[OCR]: Mismatch doc type", request.redemption_id) continue except Exception as e: self.stdout.write(self.style.ERROR(f"Request: {request.request_id} failed with {e}")) print(traceback.format_exc()) result['failed'] += 1 self._add_info(result, "[OCR]: Failed", request.redemption_id) continue data.pop(request.redemption_id, None) def handle(self, *args, **options): start = options['start'] end = options['end'] result = {'total':0, 'failed':0} # TODO: redemption ID is not null on filter if start or end: try: start_date = timezone.datetime.strptime(start, '%Y-%m-%dT%H:%M:%S%z') # We care only about day precision only end_date = timezone.datetime.strptime(end, '%Y-%m-%dT%H:%M:%S%z') except Exception as e: print(f"[INFO]: start: {start}") print(f"[INFO]: end: {end}") raise InvalidException(excArgs="Date format") subcription_iter = SubscriptionRequest.objects.filter(created_at__range=(start_date, end_date), redemption_id__isnull=False) else: subcription_iter = SubscriptionRequest.objects.filter(redemption_id__isnull=False) print(f"[INFO]: Preparing data for filling up...") prepared_data = self._prepare_data(IMAGE_DIRS) # Log out prepared infomation for k,v in prepared_data.items(): self._add_info(result, "[Provided]: total {} images found".format(v["pages"]), k) print(f"[INFO]: Prepared data, total: {len(list(prepared_data.keys()))}") prepared_data_copy = copy.deepcopy(prepared_data) s3_client = MinioS3Client( # endpoint='http://107.120.133.27:9884', access_key='secret', secret_key='secret', bucket_name='ocr-sds' ) # file = open("modified.txt", "w") for request in tqdm(subcription_iter.iterator()): self.process_request(request, prepared_data_copy, result, s3_client) # file.close() self.stdout.write(self.style.SUCCESS('Sample Django management command executed successfully! total/failed: {}/{}'.format(result['total'], result['failed']))) # print(f"[INFO]: result: {result}") for err in result.get("Error", []): print("[INFO]: Error: {}: {}".format(err, len(result["Error"][err]))) result["Error"][err] = list(result["Error"][err]) for log_level in ['Info', 'Error', 'Warning']: errs = result.get(log_level, []) errs = sorted(errs) for err in errs: print("[INFO]: {}: {}: {}".format(log_level, err, len(result[log_level][err]))) result[log_level][err] = list(result[log_level][err]) with open("result.json", "w") as outfile: json.dump(result, outfile)