87 lines
3.0 KiB
Python
87 lines
3.0 KiB
Python
|
import requests
|
||
|
import yaml
|
||
|
from time import time
|
||
|
import numpy as np
|
||
|
import re
|
||
|
|
||
|
from TemplateMatching.templatebasedextraction.src.serve_model import Predictor
|
||
|
from TemplateMatching.textdetection.serve_model import Predictor as TextDetector
|
||
|
from TemplateMatching.textrecognition.src.serve_model import Predictor as TextRecognizer
|
||
|
|
||
|
class Extractor:
    """Template-based information extractor for document images.

    Loads a YAML settings file and builds three models from it: a
    template-based extractor (also used to align images), a text detector
    and a text recognizer.
    """

    # Marker character that `_format_output` replaces with a space.
    _SPACE_MARKER = "✪"
    # ID-number patterns, tried in this order by `_extract_id_no`.
    _ID_12_DIGITS = re.compile(r"[0-9]{12}")
    _ID_9_DIGITS = re.compile(r"[0-9]{9}")

    def __init__(self, setting_path="./TemplateMatching/setting.yml"):
        """Load the settings file and instantiate the three models.

        Args:
            setting_path: Path of the YAML settings file. The default is
                relative to the current working directory.
        """
        # Explicit encoding so loading does not depend on the platform locale.
        with open(setting_path, encoding="utf-8") as f:
            self.setting = yaml.safe_load(f)
        self.predictor = Predictor(
            self.setting["template_based_extraction"]["setting"]
        )
        self.text_detector = TextDetector(self.setting["text_detection"]["setting"])
        self.text_recognizer = TextRecognizer(
            self.setting["text_recognition"]["setting"]
        )

    def _format_output(self, document):
        """Flatten extractor output into a ``{field: value}`` mapping.

        Args:
            document: Mapping of field name to a mapping holding a
                ``"value"`` entry.

        Returns:
            dict: Field name mapped to its ``"value"``. In string values
            every "✪" marker is replaced by a space; non-string values
            (e.g. ``None``) are passed through unchanged.
        """
        result = {}
        for field, entry in document.items():
            value = entry["value"]
            if isinstance(value, str):
                # replace() is a no-op when the marker is absent, so no
                # separate membership test is needed.
                value = value.replace(self._SPACE_MARKER, " ")
            result[field] = value
        return result

    def _extract_idcard_info(self, images):
        """Return detector crops, falling back to the input image per index.

        NOTE(review): ``self.idcard_detector`` is not assigned in
        ``__init__``; it has to be set on the instance before this method
        is called, otherwise an ``AttributeError`` is raised.

        Args:
            images: Indexable collection of images; it is converted with
                ``np.array`` before being handed to the detector.

        Returns:
            list: One entry per detector result: the result itself, or
            ``images[i]`` where the detector returned ``None``.
        """
        id_card_crops = self.idcard_detector(np.array(images))
        processed_images = []
        for i, aligned_img in enumerate(id_card_crops):
            processed_images.append(
                aligned_img if aligned_img is not None else images[i]
            )
        return processed_images

    def _extract_id_no(self, doc):
        """Find an ID number in the text of the first page.

        A 12-digit run is preferred; a 9-digit run is the fallback.

        Args:
            doc: Mapping with ``doc["page_data"][0]["contents"]`` holding
                an iterable of strings.

        Returns:
            str | None: The first matching digit run, or ``None`` when
            neither pattern matches. Both patterns yield a string.
        """
        content = " ".join(doc["page_data"][0]["contents"])
        for pattern in (self._ID_12_DIGITS, self._ID_9_DIGITS):
            match = pattern.search(content)
            if match:
                return match.group(0)
        return None

    def image_alige(self, images, tmp_json, template_image_dir="/"):
        """Align the first image against the template described by tmp_json.

        Args:
            images: Sequence of images; only ``images[0]`` is used.
            tmp_json: Template description; must contain ``"template_name"``.
            template_image_dir: Directory argument forwarded to
                ``predictor.align_image``. Defaults to ``"/"``.

        Returns:
            Whatever ``self.predictor.align_image`` returns.
        """
        template_name = tmp_json["template_name"]
        return self.predictor.align_image(
            images[0], tmp_json, template_image_dir, template_name
        )

    def extract_information(self, image_aliged, tmp_json):
        """Detect and recognize text on an aligned image, then extract fields.

        Args:
            image_aliged: Aligned image; must support 2-D slicing
                ``[y1:y2, x1:x2]``.
            tmp_json: Template description forwarded to the extractor.

        Returns:
            dict: Field name mapped to extracted value (see
            ``_format_output``).
        """
        batch_boxes = self.text_detector([image_aliged])

        # Only one image is submitted, so only batch_boxes[0] is cropped.
        # Coordinates are clamped at 0 because a negative slice bound would
        # wrap around to the other side of the image.
        cropped_images = [
            image_aliged[
                max(int(y1), 0) : max(int(y2), 0),
                max(int(x1), 0) : max(int(x2), 0),
            ]
            for x1, y1, x2, y2 in batch_boxes[0]
        ]
        texts = list(self.text_recognizer(cropped_images))

        doc_page = {
            "boxes": batch_boxes,
            "contents": texts,
            # NOTE(review): len(batch_boxes) is the number of images in the
            # batch, not the number of boxes -- confirm this is what
            # template_based_extractor expects.
            "types": ["word"] * len(batch_boxes),
            "image": image_aliged,
        }

        documents_with_info = self.predictor.template_based_extractor(
            batch_boxes, texts, doc_page, tmp_json
        )
        return self._format_output(documents_with_info)
|