Add: refill S3 image by redemption script
This commit is contained in:
parent
74431c3bc9
commit
d27a7ec9d2
@ -0,0 +1,174 @@
|
||||
import copy
import glob
import json
import os
import traceback

from django.core.management.base import BaseCommand, CommandError
from django.utils import timezone
from tqdm import tqdm

from fwd_api.exception.exceptions import InvalidException
from fwd_api.models import SubscriptionRequestFile, SubscriptionRequest
from fwd_api.utils.s3 import MinioS3Client
|
||||
|
||||
# Root folders that are scanned for redemption sub-folders.
# For a quick trial run, narrow this list to a single root such as
# "/external_data/SGGE".
IMAGE_DIRS = [
    "/external_data/SGGE",
    "/external_data/zipsGwp1",
    "/external_data/zipsGwp2",
    "/external_data/zipsGwp3",
    "/external_data/zipsGwp4",
    "/external_data/zipsEvoucher",
]

# Glob patterns used to collect the files that belong to one redemption.
image_extensions = [
    "*.jpg",
    "*.jpeg",
    "*.png",
    "*.gif",
]
pdf_extensions = [
    "*.pdf",
]
|
||||
|
||||
class Command(BaseCommand):
    """Match local redemption folders against OCR requests for an S3 refill.

    The command scans ``IMAGE_DIRS`` for per-redemption folders, walks the
    ``SubscriptionRequest`` rows that carry a redemption ID, pairs each
    "Origin" request file with a local file of the same doc type, and writes
    a summary of matches, warnings and errors to ``result.json``.

    S3 credentials are read from the environment instead of being stored in
    the source file:

    * ``S3_ACCESS_KEY`` (required)
    * ``S3_SECRET_KEY`` (required)
    * ``S3_BUCKET_NAME`` (optional, defaults to ``ocr-sds``)
    """

    # NOTE(review): this help text does not describe what the command does
    # (it never rewrites the database) -- looks inherited from another
    # command; confirm before rewording.
    help = 'Refactor database for image level'

    # Format accepted for the positional ``start`` / ``end`` arguments.
    _DATE_FORMAT = '%Y-%m-%dT%H:%M:%S%z'

    def add_arguments(self, parser):
        """Register the positional ``start`` and ``end`` date arguments."""
        parser.add_argument('start', type=str, help='start date, sample: 2023-01-02T00:00:00+0700')
        parser.add_argument('end', type=str, help='end date, sample: 2023-01-03T00:00:00+0700')

    def _prepare_data(self, redemtion_dirs):
        """Index the local files of every redemption folder.

        Args:
            redemtion_dirs: iterable of root directories; each direct child
                is treated as one redemption.

        Returns:
            dict mapping redemption ID (child name with every "Data"
            substring removed) to ``{"image_paths": [...], "pages": int}``.
            Children of different roots that map to the same ID are merged.

        Raises:
            OSError: if a root directory cannot be listed.
        """
        prepared_data = {}
        patterns = image_extensions + pdf_extensions
        for redemtion_dir in redemtion_dirs:
            for entry in os.listdir(redemtion_dir):
                # Escape the folder part so names containing glob
                # metacharacters ("[", "*", "?") are matched literally.
                folder = glob.escape(os.path.join(redemtion_dir, entry))
                files_in_dir = []
                for pattern in patterns:
                    files_in_dir.extend(glob.glob(os.path.join(folder, pattern)))

                redemption = entry.replace("Data", "")
                bucket = prepared_data.setdefault(redemption, {"image_paths": [], "pages": 0})
                bucket["image_paths"] += files_in_dir
                bucket["pages"] += len(files_in_dir)

        return prepared_data

    def _record(self, result, level, message, redemption_id):
        """Add ``redemption_id`` to the set stored at ``result[level][message]``."""
        result.setdefault(level, {}).setdefault(message, set()).add(redemption_id)

    def _add_error(self, result, error, redemption_id):
        """Record ``redemption_id`` under ``result["Error"][error]``."""
        self._record(result, "Error", error, redemption_id)

    def _add_info(self, result, info, redemption_id):
        """Record ``redemption_id`` under ``result["Info"][info]``."""
        self._record(result, "Info", info, redemption_id)

    def _add_warning(self, result, warn, redemption_id):
        """Record ``redemption_id`` under ``result["Warning"][warn]``."""
        self._record(result, "Warning", warn, redemption_id)

    def _try_find_doc_type(self, file_paths):
        """Group file paths by the doc type hinted at in their base name.

        A path goes to "invoice" when its base name contains "invoice",
        otherwise to "imei" when it contains "imei", otherwise to
        "undefined". The check is case-sensitive.

        Returns:
            dict with the keys "invoice", "imei" and "undefined", each a list
            of paths in their input order.
        """
        doc_types = {"invoice": [], "imei": [], "undefined": []}
        for file_path in file_paths:
            base_name = os.path.basename(file_path)
            if "invoice" in base_name:
                doc_types["invoice"].append(file_path)
            elif "imei" in base_name:
                doc_types["imei"].append(file_path)
            else:
                doc_types["undefined"].append(file_path)
        return doc_types

    def process_request(self, request, data, result, s3_client):
        """Pair the "Origin" files of one request with local redemption files.

        Args:
            request: a ``SubscriptionRequest`` row.
            data: mapping produced by ``_prepare_data``; the entry of a
                matched redemption is removed from it.
            result: summary dict; counters and Error/Info/Warning sets are
                updated in place.
            s3_client: client intended for the upload (currently unused, see
                the NOTE in the loop).

        Requests without a prediction result, or whose prediction status is
        not 200, are skipped silently.
        """
        if not request.predict_result:
            return
        if request.predict_result.get("status", 200) != 200:
            return

        redemption_id = request.redemption_id
        self._add_info(result, "[OCR]: redemptions", redemption_id)

        # Direct dict membership: no need to materialise the key list.
        if redemption_id not in data:
            self._add_error(result, "[OCR]: Not found redemption_ID", redemption_id)
            return
        if request.pages != data[redemption_id]["pages"]:
            self._add_error(result, "[SBT]: Mismatch files number in a request", redemption_id)
            return

        file_paths_by_doc_type = self._try_find_doc_type(data[redemption_id]["image_paths"])
        if file_paths_by_doc_type["undefined"]:
            self._add_warning(result, "[SBT]: Undefined doc type", redemption_id)

        # Request IDs whose part before the first "." has no "_" are skipped.
        if len(request.request_id.split(".")[0].split("_")) < 2:
            return

        images = SubscriptionRequestFile.objects.filter(request=request, file_category="Origin")

        for image in images:
            if image.doc_type not in ("imei", "invoice"):
                self._add_error(result, "[OCR]: Weird doc type", redemption_id)
                continue
            try:
                # Prefer a local file whose name hints at the same doc type;
                # otherwise fall back to the unclassified ones. An empty
                # fallback raises IndexError, reported as a mismatch below.
                if file_paths_by_doc_type[image.doc_type]:
                    local_file_path = file_paths_by_doc_type[image.doc_type].pop(0)
                else:
                    local_file_path = file_paths_by_doc_type["undefined"].pop(0)
                predir = "sbt_invoice"
                s3_key = os.path.join(predir, request.request_id, image.file_name)
                # NOTE: the upload itself is disabled, so this is a dry run.
                # Re-enable to actually push the file:
                # s3_client.upload_file(local_file_path, s3_key)
                result['total'] += 1
                self._add_info(result, "[OCR]: Success", redemption_id)
            except IndexError:
                self._add_error(result, "[OCR]: Mismatch doc type", redemption_id)
                continue
            except Exception as e:
                self.stdout.write(self.style.ERROR(f"Request: {request.request_id} failed with {e}"))
                print(traceback.format_exc())
                result['failed'] += 1
                self._add_info(result, "[OCR]: Failed", redemption_id)
                continue
            # NOTE(review): placed inside the loop (reached only after a
            # successful match) because every handler above ends in
            # `continue`; confirm against the intended indentation.
            data.pop(redemption_id, None)

    def _build_s3_client(self):
        """Create the S3 client from environment-provided credentials.

        Raises:
            CommandError: if ``S3_ACCESS_KEY`` or ``S3_SECRET_KEY`` is unset
                or empty.
        """
        missing = [name for name in ("S3_ACCESS_KEY", "S3_SECRET_KEY") if not os.environ.get(name)]
        if missing:
            raise CommandError(
                "Missing S3 credentials; set the environment variable(s): {}".format(", ".join(missing))
            )
        return MinioS3Client(
            access_key=os.environ["S3_ACCESS_KEY"],
            secret_key=os.environ["S3_SECRET_KEY"],
            bucket_name=os.environ.get("S3_BUCKET_NAME", "ocr-sds"),
        )

    def handle(self, *args, **options):
        """Run the matching over the selected requests and dump a summary.

        Raises:
            InvalidException: if ``start`` or ``end`` does not match
                ``%Y-%m-%dT%H:%M:%S%z``.
            CommandError: if the S3 credentials are not configured.
        """
        start = options['start']
        end = options['end']
        result = {'total': 0, 'failed': 0}

        if start or end:
            try:
                start_date = timezone.datetime.strptime(start, self._DATE_FORMAT)
                end_date = timezone.datetime.strptime(end, self._DATE_FORMAT)
            except (TypeError, ValueError) as e:
                # Only parsing problems are reported as a date-format error;
                # anything else propagates unchanged.
                print(f"[INFO]: start: {start}")
                print(f"[INFO]: end: {end}")
                raise InvalidException(excArgs="Date format") from e
            subcription_iter = SubscriptionRequest.objects.filter(
                created_at__range=(start_date, end_date), redemption_id__isnull=False
            )
        else:
            subcription_iter = SubscriptionRequest.objects.filter(redemption_id__isnull=False)

        # Fail fast on missing credentials before the (slow) directory scan.
        s3_client = self._build_s3_client()

        print("[INFO]: Preparing data for filling up...")
        prepared_data = self._prepare_data(IMAGE_DIRS)
        print(f"[INFO]: Prepared data, total: {len(prepared_data)}")

        # process_request consumes entries of prepared_data as it goes.
        for request in tqdm(subcription_iter.iterator()):
            self.process_request(request, prepared_data, result, s3_client)

        self.stdout.write(self.style.SUCCESS('Sample Django management command executed successfully! total/failed: {}/{}'.format(result['total'], result['failed'])))

        # Print one count per message and turn the ID sets into sorted lists
        # so the summary is JSON-serialisable and stable between runs.
        for level in ("Error", "Info", "Warning"):
            for message, redemption_ids in result.get(level, {}).items():
                print("[INFO]: {}: {}: {}".format(level, message, len(redemption_ids)))
                result[level][message] = sorted(redemption_ids, key=str)

        with open("result.json", "w", encoding="utf-8") as outfile:
            json.dump(result, outfile)
|
Loading…
Reference in New Issue
Block a user