sbt-idp/cope2n-api/fwd_api/management/commands/migrate-datebase-fillup-images.py

from django.core.management.base import BaseCommand
from tqdm import tqdm
from fwd_api.models import SubscriptionRequestFile, SubscriptionRequest
from fwd_api.exception.exceptions import InvalidException
from fwd_api.utils.s3 import MinioS3Client

import copy
import os
import glob
import traceback
import copy
import json 
from django.utils import timezone

# Root folders scanned by the command; each direct sub-folder is treated as one redemption.
IMAGE_DIRS = [
    "/external_data/SGGE",
    "/external_data/zipsGwp1",
    "/external_data/zipsGwp2",
    "/external_data/zipsGwp3",
    "/external_data/zipsGwp4",
    "/external_data/zipsEvoucher",
]
# IMAGE_DIRS = ["/external_data/SGGE"]
# glob patterns (not bare suffixes) used to collect the files of a redemption folder.
# The last entry used to read '*.GÌ' (a keyboard-input typo), which silently skipped upper-case GIF files.
image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.gif', '*.JPG', '*.JPEG', '*.PNG', '*.GIF']
pdf_extensions = ['*.pdf', '*.PDF']

IGNORE_MULTIPLE_IMAGE = True  # Stop processing a request as soon as it involves more than one image
BET_ON_FIRST_IMAGE = True # Try to upload the first image to monoimage-request
# Two-character redemption ID prefixes whose single provided file is always treated as an invoice.
PROVIDED_MONO_REDEMPTION_MAKE_IT_INVOICE = ["SG"]

class Command(BaseCommand):
    help = 'Refactor database for image level'

    def add_arguments(self, parser):
        # Add your command-line arguments here
        parser.add_argument('start', type=str, help='start date, sample: 2023-01-02T00:00:00+0700')
        parser.add_argument('end', type=str, help='end date, sample: 2023-01-03T00:00:00+0700')

    def _prepare_data(self, redemtion_dirs):
        """Index the provided image/PDF files on disk by redemption ID.

        Every direct sub-folder of each directory in ``redemtion_dirs`` is one
        redemption. The redemption ID is the folder name with the substrings
        "Data" and "(pdf)" removed, so folders that reduce to the same ID are
        merged into a single entry.

        Args:
            redemtion_dirs: iterable of directory paths to scan.

        Returns:
            dict mapping redemption ID to
            ``{"image_paths": [sorted file paths], "pages": number of files}``.

        Raises:
            OSError: if one of ``redemtion_dirs`` cannot be listed.
        """
        prepared_data = {}  # {"redemption_id": {"image_paths": [...], "pages": n}}
        for redemtion_dir in redemtion_dirs:
            for entry in os.listdir(redemtion_dir):
                entry_path = os.path.join(redemtion_dir, entry)
                # Ignore non-folders. The previous suffix comparison tested ".jpg" against
                # glob patterns such as "*.jpg" and therefore never matched image/PDF files,
                # which ended up registered as empty redemptions.
                if not os.path.isdir(entry_path):
                    continue
                # Escape the directory part so characters like "[" in a folder name are
                # matched literally instead of being read as glob syntax.
                escaped_dir = glob.escape(entry_path)
                files_in_dir = []
                for ext in image_extensions + pdf_extensions:
                    files_in_dir.extend(glob.glob(os.path.join(escaped_dir, ext)))
                files_in_dir.sort()
                redemption = entry.replace("Data", "").replace("(pdf)", "")
                record = prepared_data.setdefault(redemption, {"image_paths": [], "pages": 0})
                record["image_paths"] += files_in_dir
                record["pages"] += len(files_in_dir)

        return prepared_data

    def _add_log(self, result, log, redemption_id, log_level):
        if not result.get(log_level, None):
            result[log_level] = {}
        log = "[{}]".format(redemption_id[:2]) + log
        if result[log_level].get(log, None):
            result[log_level][log].add(redemption_id)
        else:
            result[log_level][log] = set([redemption_id])

    def _add_error(self, result, error, redemption_id):
        self._add_log(result, error, redemption_id, "Error")
    
    def _add_info(self, result, info, redemption_id):
        self._add_log(result, info, redemption_id, "Info")
    
    def _add_warning(self, result, warn, redemption_id):
        self._add_log(result, warn, redemption_id, "Warning")

    def _try_find_doc_type(self, file_paths):
        doc_types = {"invoice": [],
                     "imei": [],
                     "undefined": []}
        for file_path in file_paths:
            if "invoice" in os.path.basename(file_path):
                doc_types["invoice"].append(file_path)
            elif "imei" in os.path.basename(file_path):
                doc_types["imei"].append(file_path)
            else:
                doc_types["undefined"].append(file_path)
        return doc_types

    def process_request(self, request, data, result, s3_client):
        """Match one OCR request against the provided files and record the outcome.

        Requests without ``predict_result`` or with a non-200 status in it are
        skipped silently. Otherwise the request's redemption is looked up in
        ``data``, page counts are compared, the provided files are assigned to
        the request's "Origin" files by doc type, and every step is logged into
        ``result``. When the per-file loop is reached, the redemption is
        removed from ``data`` afterwards.

        Args:
            request: a ``SubscriptionRequest`` row.
            data: mapping produced by ``_prepare_data``; mutated in place.
            result: accumulator holding the 'total'/'failed' counters and the
                per-level log sets; mutated in place.
            s3_client: S3 client for the upload. Currently unused because the
                upload call below is commented out.
        """
        # Only requests that carry a successful prediction are considered.
        if not request.predict_result:
            return
        if request.predict_result.get("status", 200) != 200:
            return

        redemption_id = request.redemption_id
        # Find the corresponding redemption ID
        self._add_info(result, "[OCR]: redemptions", redemption_id)
        self._add_info(result, "[OCR]: total {} images".format(request.pages), redemption_id)
        if redemption_id not in data:
            self._add_error(result, "[OCR]: Not found redemption_ID", redemption_id)
            return
        provided = data[redemption_id]

        if request.pages != provided["pages"]:
            self._add_error(result, "[SBT]: Mismatch files number in a request", redemption_id)
            # The non-empty check prevents an IndexError when the redemption folder
            # holds no file at all while the request has exactly one page.
            if BET_ON_FIRST_IMAGE and request.pages == 1 and provided["image_paths"]:
                self._add_warning(result, "[SBT]: monoimage-request, bet on first one", redemption_id)
                # NOTE(review): "pages" is left untouched here, so a redemption with several
                # provided files is still rejected by the multiple-image check below - confirm
                # whether that is intended.
                provided["image_paths"] = [provided["image_paths"][0]]
            else:
                return

        file_paths_by_doc_type = self._try_find_doc_type(provided["image_paths"])
        prefix = redemption_id[:2]
        prefix_forces_invoice = prefix in PROVIDED_MONO_REDEMPTION_MAKE_IT_INVOICE

        # Debug trace for a hand-picked set of redemption IDs.
        debug_redemption_ids = {
            "SGE20240608115040-910",
            "SGE20240607160017-644",
            "SGE20240609095034-986",
            "SGGE20240609145539-429",
            "SGE20240607134340-431",
            "SGE20240609073431-645",
            "SGE20240608124611-070",
            "SGE20240610120344-912",
            "SGE20240610085917-775",
            "SGGE20240609044518-869",
            "SGE20240608093242-813",
            "SGGE20240608175708-038",
            "SGE20240607175952-926",
            "SGE20240609060258-864",
            "SGGE20240609144052-538",
            "SGG20240607135057-187",
            "SGE20240608133426-100",
            "SGE20240607152408-300",
            "SGG20240608162101-167",
            "SGG20240608133730-021",
            "SGE20240609103647-828",
        }
        if redemption_id in debug_redemption_ids:
            print("{} - {} - {}".format(prefix_forces_invoice, prefix, provided["pages"]))

        if prefix_forces_invoice and provided["pages"] == 1:
            self._add_warning(result, "[SBT]: mono-redemption, make it invoice", redemption_id)
            file_paths_by_doc_type["invoice"] = provided["image_paths"]
            file_paths_by_doc_type["imei"] = []
            file_paths_by_doc_type["undefined"] = []

        if file_paths_by_doc_type["undefined"]:
            self._add_warning(result, "[SBT]: Undefined doc type", redemption_id)

        if request.pages > 1 or provided["pages"] > 1:
            self._add_error(result, "[SBT]: request with multiple images", redemption_id)
            if IGNORE_MULTIPLE_IMAGE:
                return

        # Skip request IDs whose part before the first "." has no "_" separator.
        if len(request.request_id.split(".")[0].split("_")) < 2:
            return
        images = SubscriptionRequestFile.objects.filter(request=request, file_category="Origin")

        for image in images:
            if image.doc_type not in ["imei", "invoice"]:
                self._add_error(result, "[OCR]: Weird doc type", redemption_id)
                continue
            try:
                # Prefer a file whose name matches the doc type, fall back to an untyped
                # one; an empty fallback list raises IndexError (handled below).
                if file_paths_by_doc_type[image.doc_type]:
                    local_file_path = file_paths_by_doc_type[image.doc_type].pop(0)
                else:
                    local_file_path = file_paths_by_doc_type["undefined"].pop(0)
                predir = "sbt_invoice"
                s3_key = os.path.join(predir, request.request_id, image.file_name)
                # NOTE: the upload is disabled, so the command currently only reports
                # what it would upload.
                # s3_client.upload_file(local_file_path, s3_key)
                result['total'] += 1
                self._add_info(result, "[OCR]: Success", redemption_id)
            except IndexError:
                self._add_error(result, "[OCR]: Mismatch doc type", redemption_id)
                continue
            except Exception as e:
                self.stdout.write(self.style.ERROR(f"Request: {request.request_id} failed with {e}"))
                print(traceback.format_exc())
                result['failed'] += 1
                self._add_info(result, "[OCR]: Failed", redemption_id)
                continue
        # The redemption has been handled; drop it from the remaining provided data.
        data.pop(redemption_id, None)

    def handle(self, *args, **options):
        """Run the command: index provided files, process requests, report.

        Reads the ``start`` and ``end`` options (format
        ``%Y-%m-%dT%H:%M:%S%z``), selects the ``SubscriptionRequest`` rows with
        a non-null redemption ID (restricted to the ``created_at`` range when a
        bound is given), runs ``process_request`` on each, prints a per-level
        summary and writes the full result to ``result.json`` in the current
        working directory.

        Raises:
            InvalidException: if ``start`` or ``end`` cannot be parsed.
        """
        start = options['start']
        end = options['end']
        result = {'total': 0,
                  'failed': 0}
        if start or end:
            try:
                start_date = timezone.datetime.strptime(start, '%Y-%m-%dT%H:%M:%S%z') # We care only about day precision only
                end_date = timezone.datetime.strptime(end, '%Y-%m-%dT%H:%M:%S%z')
            except (TypeError, ValueError) as e:
                print(f"[INFO]: start: {start}")
                print(f"[INFO]: end: {end}")
                # Keep the parsing error attached as the cause for easier debugging.
                raise InvalidException(excArgs="Date format") from e
            subcription_iter = SubscriptionRequest.objects.filter(created_at__range=(start_date, end_date), redemption_id__isnull=False)
        else:
            subcription_iter = SubscriptionRequest.objects.filter(redemption_id__isnull=False)
        print("[INFO]: Preparing data for filling up...")
        prepared_data = self._prepare_data(IMAGE_DIRS)
        # Log the prepared information
        for k, v in prepared_data.items():
            self._add_info(result, "[Provided]: total {} images found".format(v["pages"]), k)
        print(f"[INFO]: Prepared data, total: {len(prepared_data)}")
        # process_request mutates the mapping it receives, so hand it a copy.
        prepared_data_copy = copy.deepcopy(prepared_data)
        # NOTE: the credentials below are placeholders; supply real ones before running.
        s3_client = MinioS3Client(
            # endpoint='http://107.120.133.27:9884',
            access_key='secret',
            secret_key='secret',
            bucket_name='ocr-sds'
            )
        for request in tqdm(subcription_iter.iterator()):
            self.process_request(request, prepared_data_copy, result, s3_client)
        self.stdout.write(self.style.SUCCESS('Sample Django management command executed successfully! total/failed: {}/{}'.format(result['total'], result['failed'])))

        # Print one summary line per message and level (each level exactly once), and
        # turn the ID sets into sorted lists so the result is JSON-serialisable and stable.
        for log_level in ['Info', 'Error', 'Warning']:
            messages = result.get(log_level, {})
            for message in sorted(messages):
                print("[INFO]: {}: {}: {}".format(log_level, message, len(messages[message])))
                messages[message] = sorted(messages[message])
        with open("result.json", "w") as outfile:
            json.dump(result, outfile)
Add: refill S3 image by redemption script 2024-06-21 03:48:02 +00:00			`from django.core.management.base import BaseCommand`
			`from tqdm import tqdm`
			`from fwd_api.models import SubscriptionRequestFile, SubscriptionRequest`
			`from fwd_api.exception.exceptions import InvalidException`
			`from fwd_api.utils.s3 import MinioS3Client`

			`import copy`
			`import os`
			`import glob`
			`import traceback`
			`import copy`
			`import json`
			`from django.utils import timezone`

			`IMAGE_DIRS = ["/external_data/SGGE", "/external_data/zipsGwp1", "/external_data/zipsGwp2", "/external_data/zipsGwp3", "/external_data/zipsGwp4", "/external_data/zipsEvoucher"]`
			`# IMAGE_DIRS = ["/external_data/SGGE"]`
update: script 2024-07-03 05:51:49 +00:00			`image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.JPG', '.JPEG', '.PNG', '.GÌ']`
			`pdf_extensions = ['.pdf', '.PDF']`

			`IGNORE_MULTIPLE_IMAGE = True`
			`BET_ON_FIRST_IMAGE = True # Try to upload the first image to monoimage-request`
			`PROVIDED_MONO_REDEMPTION_MAKE_IT_INVOICE = ["SG"]`
Add: refill S3 image by redemption script 2024-06-21 03:48:02 +00:00
			`class Command(BaseCommand):`
			`help = 'Refactor database for image level'`

			`def add_arguments(self, parser):`
			`# Add your command-line arguments here`
			`parser.add_argument('start', type=str, help='start date, sample: 2023-01-02T00:00:00+0700')`
			`parser.add_argument('end', type=str, help='end date, sample: 2023-01-03T00:00:00+0700')`

			`def _prepare_data(self, redemtion_dirs):`
			`prepared_data = {} # {"redemption_id": {"image_paths": []}, "pages": 1}`
			`for redemtion_dir in redemtion_dirs:`
			`redemptions = os.listdir(redemtion_dir)`
			`for redemption in redemptions:`
update: script 2024-07-03 05:51:49 +00:00			`if "." + redemption.split(".")[-1] in image_extensions + pdf_extensions + [".csv", ".zip"]:`
			`continue # ignore non-folder`
Add: refill S3 image by redemption script 2024-06-21 03:48:02 +00:00			`files_in_dir = []`
			`for ext in image_extensions + pdf_extensions:`
			`files_in_dir.extend(glob.glob(os.path.join(redemtion_dir, redemption, ext)))`
update: script 2024-07-03 05:51:49 +00:00			`files_in_dir = sorted(files_in_dir)`
			`redemption = redemption.replace("Data", "").replace("(pdf)", "")`
Add: refill S3 image by redemption script 2024-06-21 03:48:02 +00:00			`if prepared_data.get(redemption, None):`
			`prepared_data[redemption]["image_paths"] += files_in_dir`
			`prepared_data[redemption]["pages"] += len(files_in_dir)`
			`else:`
			`prepared_data[redemption] = {"image_paths": files_in_dir, "pages": len(files_in_dir)}`

			`return prepared_data`

update: script 2024-07-03 05:51:49 +00:00			`def _add_log(self, result, log, redemption_id, log_level):`
			`if not result.get(log_level, None):`
			`result[log_level] = {}`
			`log = "[{}]".format(redemption_id[:2]) + log`
			`if result[log_level].get(log, None):`
			`result[log_level][log].add(redemption_id)`
Add: refill S3 image by redemption script 2024-06-21 03:48:02 +00:00			`else:`
update: script 2024-07-03 05:51:49 +00:00			`result[log_level][log] = set([redemption_id])`

			`def _add_error(self, result, error, redemption_id):`
			`self._add_log(result, error, redemption_id, "Error")`
Add: refill S3 image by redemption script 2024-06-21 03:48:02 +00:00
			`def _add_info(self, result, info, redemption_id):`
update: script 2024-07-03 05:51:49 +00:00			`self._add_log(result, info, redemption_id, "Info")`
Add: refill S3 image by redemption script 2024-06-21 03:48:02 +00:00
			`def _add_warning(self, result, warn, redemption_id):`
update: script 2024-07-03 05:51:49 +00:00			`self._add_log(result, warn, redemption_id, "Warning")`
Add: refill S3 image by redemption script 2024-06-21 03:48:02 +00:00
			`def _try_find_doc_type(self, file_paths):`
			`doc_types = {"invoice": [],`
			`"imei": [],`
			`"undefined": []}`
			`for file_path in file_paths:`
			`if "invoice" in os.path.basename(file_path):`
			`doc_types["invoice"].append(file_path)`
			`elif "imei" in os.path.basename(file_path):`
			`doc_types["imei"].append(file_path)`
			`else:`
			`doc_types["undefined"].append(file_path)`
			`return doc_types`

			`def process_request(self, request, data, result, s3_client):`
			`if not request.predict_result:`
			`# self.stdout.write(self.style.WARNING(f"Key predict_result not found in {request.request_id}"))`
			`return`
			`if request.predict_result.get("status", 200) != 200:`
			`# self.stdout.write(self.style.WARNING(f"Not a sucess request {request.request_id}"))`
			`return`
			`# Find to coresponding redemption_ID`
			`self._add_info(result, "[OCR]: redemptions", request.redemption_id)`
update: script 2024-07-03 05:51:49 +00:00			`self._add_info(result, "[OCR]: total {} images".format(request.pages), request.redemption_id)`
Add: refill S3 image by redemption script 2024-06-21 03:48:02 +00:00			`if request.redemption_id not in list(data.keys()):`
			`self._add_error(result, "[OCR]: Not found redemption_ID", request.redemption_id)`
			`return`
update: script 2024-07-03 05:51:49 +00:00
Add: refill S3 image by redemption script 2024-06-21 03:48:02 +00:00			`if request.pages != data[request.redemption_id]["pages"]:`
			`self._add_error(result, "[SBT]: Mismatch files number in a request", request.redemption_id)`
update: script 2024-07-03 05:51:49 +00:00			`if BET_ON_FIRST_IMAGE and request.pages == 1:`
			`self._add_warning(result, "[SBT]: monoimage-request, bet on first one", request.redemption_id)`
			`data[request.redemption_id]["image_paths"] = [data[request.redemption_id]["image_paths"][0]]`
			`else:`
			`return`
Add: refill S3 image by redemption script 2024-06-21 03:48:02 +00:00
			`file_paths_by_doc_type = self._try_find_doc_type(data[request.redemption_id]["image_paths"])`
update: script 2024-07-03 05:51:49 +00:00			`if request.redemption_id in [`
			`"SGE20240608115040-910",`
			`"SGE20240607160017-644",`
			`"SGE20240609095034-986",`
			`"SGGE20240609145539-429",`
			`"SGE20240607134340-431",`
			`"SGE20240609073431-645",`
			`"SGE20240608124611-070",`
			`"SGE20240610120344-912",`
			`"SGE20240610085917-775",`
			`"SGGE20240609044518-869",`
			`"SGE20240608093242-813",`
			`"SGGE20240608175708-038",`
			`"SGE20240607175952-926",`
			`"SGE20240609060258-864",`
			`"SGGE20240609144052-538",`
			`"SGG20240607135057-187",`
			`"SGE20240608133426-100",`
			`"SGE20240607152408-300",`
			`"SGG20240608162101-167",`
			`"SGG20240608133730-021",`
			`"SGE20240609103647-828"`
			`]:`
			`print("{} - {} - {}".format(request.redemption_id[:2] in PROVIDED_MONO_REDEMPTION_MAKE_IT_INVOICE, request.redemption_id[:2], data[request.redemption_id]["pages"]))`
			`if request.redemption_id[:2] in PROVIDED_MONO_REDEMPTION_MAKE_IT_INVOICE and data[request.redemption_id]["pages"] == 1:`
			`self._add_warning(result, "[SBT]: mono-redemption, make it invoice", request.redemption_id)`
			`file_paths_by_doc_type["invoice"] = data[request.redemption_id]["image_paths"]`
			`file_paths_by_doc_type["imei"] = []`
			`file_paths_by_doc_type["undefined"] = []`

Add: refill S3 image by redemption script 2024-06-21 03:48:02 +00:00			`if len(file_paths_by_doc_type["undefined"]) > 0:`
			`self._add_warning(result, "[SBT]: Undefined doc type", request.redemption_id)`
update: script 2024-07-03 05:51:49 +00:00

			`if request.pages > 1 or data[request.redemption_id]["pages"] > 1:`
			`self._add_error(result, "[SBT]: request with multiple images", request.redemption_id)`
			`if IGNORE_MULTIPLE_IMAGE:`
			`return`
Add: refill S3 image by redemption script 2024-06-21 03:48:02 +00:00
			`if len(request.request_id.split(".")[0].split("_")) < 2:`
			`return`
			`images = SubscriptionRequestFile.objects.filter(request=request, file_category="Origin")`

			`for i, image in enumerate(images):`
			`if image.doc_type not in ["imei", "invoice"]:`
			`self._add_error(result, "[OCR]: Weird doc type", request.redemption_id)`
			`continue`
			`try:`
			`if len(file_paths_by_doc_type[image.doc_type]) > 0:`
			`local_file_path = file_paths_by_doc_type[image.doc_type].pop(0)`
			`else:`
			`local_file_path = file_paths_by_doc_type["undefined"].pop(0)`
			`predir = "sbt_invoice"`
			`s3_key = os.path.join(predir, request.request_id, image.file_name)`
			`# s3_client.upload_file(local_file_path, s3_key)`
			`result['total'] += 1`
			`self._add_info(result, "[OCR]: Success", request.redemption_id)`
			`except IndexError as e:`
			`self._add_error(result, "[OCR]: Mismatch doc type", request.redemption_id)`
			`continue`
			`except Exception as e:`
			`self.stdout.write(self.style.ERROR(f"Request: {request.request_id} failed with {e}"))`
			`print(traceback.format_exc())`
			`result['failed'] += 1`
			`self._add_info(result, "[OCR]: Failed", request.redemption_id)`
			`continue`
			`data.pop(request.redemption_id, None)`

			`def handle(self, args, *options):`
			`start = options['start']`
			`end = options['end']`
			`result = {'total':0,`
			`'failed':0}`
			`# TODO: redemption ID is not null on filter`
			`if start or end:`
			`try:`
			`start_date = timezone.datetime.strptime(start, '%Y-%m-%dT%H:%M:%S%z') # We care only about day precision only`
			`end_date = timezone.datetime.strptime(end, '%Y-%m-%dT%H:%M:%S%z')`
			`except Exception as e:`
			`print(f"[INFO]: start: {start}")`
			`print(f"[INFO]: end: {end}")`
			`raise InvalidException(excArgs="Date format")`
			`subcription_iter = SubscriptionRequest.objects.filter(created_at__range=(start_date, end_date), redemption_id__isnull=False)`
			`else:`
			`subcription_iter = SubscriptionRequest.objects.filter(redemption_id__isnull=False)`
			`print(f"[INFO]: Preparing data for filling up...")`
			`prepared_data = self._prepare_data(IMAGE_DIRS)`
update: script 2024-07-03 05:51:49 +00:00			`# Log out prepared infomation`
			`for k,v in prepared_data.items():`
			`self._add_info(result, "[Provided]: total {} images found".format(v["pages"]), k)`
Add: refill S3 image by redemption script 2024-06-21 03:48:02 +00:00			`print(f"[INFO]: Prepared data, total: {len(list(prepared_data.keys()))}")`
			`prepared_data_copy = copy.deepcopy(prepared_data)`
			`s3_client = MinioS3Client(`
			`# endpoint='http://107.120.133.27:9884',`
remove keys 2024-06-25 05:54:32 +00:00			`access_key='secret',`
update: script 2024-07-03 05:51:49 +00:00			`secret_key='secret',`
Add: refill S3 image by redemption script 2024-06-21 03:48:02 +00:00			`bucket_name='ocr-sds'`
			`)`
			`# file = open("modified.txt", "w")`
			`for request in tqdm(subcription_iter.iterator()):`
			`self.process_request(request, prepared_data_copy, result, s3_client)`
			`# file.close()`
			`self.stdout.write(self.style.SUCCESS('Sample Django management command executed successfully! total/failed: {}/{}'.format(result['total'], result['failed'])))`
			`# print(f"[INFO]: result: {result}")`
			`for err in result.get("Error", []):`
			`print("[INFO]: Error: {}: {}".format(err, len(result["Error"][err])))`
			`result["Error"][err] = list(result["Error"][err])`
update: script 2024-07-03 05:51:49 +00:00
			`for log_level in ['Info', 'Error', 'Warning']:`
			`errs = result.get(log_level, [])`
			`errs = sorted(errs)`
			`for err in errs:`
			`print("[INFO]: {}: {}: {}".format(log_level, err, len(result[log_level][err])))`
			`result[log_level][err] = list(result[log_level][err])`
Add: refill S3 image by redemption script 2024-06-21 03:48:02 +00:00			`with open("result.json", "w") as outfile:`
			`json.dump(result, outfile)`