233 lines
9.8 KiB
Python
Executable File
233 lines
9.8 KiB
Python
Executable File
import logging
|
|
import logging.config
|
|
from utils.logging.logging import LOGGER_CONFIG
|
|
|
|
# Apply the logging configuration imported from utils.logging.logging.
# This runs once, as a side effect of importing this module.
logging.config.dictConfig(LOGGER_CONFIG)
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
class Word():
    """A single word: its text, bounding box, scores and container ids.

    The ``*_id`` attributes all start at 0; other code in this module
    overwrites them when the word is placed into a word group / line.
    """

    def __init__(self, text="", image=None, conf_detect=0.0, conf_cls=0.0, bndbox=None, kie_label=""):
        """Store the word's attributes.

        Args:
            text: Text of the word.
            image: Object stored as-is (presumably the image crop of the
                word -- confirm against the caller).
            conf_detect: Score stored as-is (presumably detection confidence).
            conf_cls: Score stored as-is (presumably classification confidence).
            bndbox: ``[left, top, right, bottom]`` box. When omitted, a fresh
                ``[-1, -1, -1, -1]`` placeholder list is created for this
                instance.
            kie_label: Label stored on the word.
        """
        self.type = "word"
        self.text = text
        self.image = image
        self.conf_detect = conf_detect
        self.conf_cls = conf_cls
        # [left, top, right, bottom]: top-left and bottom-right corners.
        # A new list is built per instance so that words created without a
        # box never share (and accidentally mutate) one placeholder list.
        self.boundingbox = [-1, -1, -1, -1] if bndbox is None else bndbox
        self.word_id = 0  # id of this word
        self.word_group_id = 0  # id of the word group this word belongs to
        self.line_id = 0  # id of the line this word belongs to
        self.paragraph_id = 0  # id of the paragraph this word belongs to
        self.kie_label = kie_label

    def invalid_size(self):
        """Return True when width * height of the bounding box is positive.

        NOTE: despite the name, a True result means the box is usable;
        ``words_to_lines`` skips words for which this returns a falsy value.
        The product is also positive when both width and height are negative.
        """
        left, top, right, bottom = self.boundingbox
        return (right - left) * (bottom - top) > 0

    def is_special_word(self):
        """Return True for missing text or long, digit-heavy text.

        A word is "special" when its text is ``None``, or when it has at
        least 7 characters of which at least 30% are digits.
        """
        text = self.text
        if text is None:
            return True
        if len(text) >= 7:
            digit_count = sum(char.isdigit() for char in text)
            return digit_count / len(text) >= 0.3
        return False
|
|
|
|
class Word_group():
    """A run of words treated as one unit inside a line.

    The group keeps the words in insertion order, the space-joined text
    (each added word is prefixed with a single space) and the bounding box
    enclosing every added word.
    """

    def __init__(self):
        self.type = "word_group"
        self.list_words = []  # Word instances, in insertion order
        self.word_group_id = 0  # id of this word group
        self.line_id = 0  # id of the line this group belongs to
        self.paragraph_id = 0  # id of the paragraph this group belongs to
        self.text = ""
        self.boundingbox = [-1, -1, -1, -1]  # placeholder until a word is added
        self.kie_label = ""

    def add_word(self, word: "Word"):
        """Append ``word`` to the group and grow the group's box and text.

        The word inherits this group's ``word_group_id``, ``line_id`` and
        ``paragraph_id``.

        Returns:
            True if the word was added; False if its text is the "✪"
            marker or if a word with the same ``word_id`` is already present.
        """
        if word.text == "✪":
            return False

        if any(word.word_id == member.word_id for member in self.list_words):
            logger.info("Word id collision")
            return False

        word.word_group_id = self.word_group_id
        word.line_id = self.line_id
        word.paragraph_id = self.paragraph_id
        self.list_words.append(word)
        self.text += ' ' + word.text

        if self.boundingbox == [-1, -1, -1, -1]:
            # First word: adopt its box (the same list object is referenced).
            self.boundingbox = word.boundingbox
        else:
            self.boundingbox = [min(self.boundingbox[0], word.boundingbox[0]),
                                min(self.boundingbox[1], word.boundingbox[1]),
                                max(self.boundingbox[2], word.boundingbox[2]),
                                max(self.boundingbox[3], word.boundingbox[3])]
        return True

    def update_word_group_id(self, new_word_group_id):
        """Set a new id on the group and on every word it contains."""
        self.word_group_id = new_word_group_id
        for member in self.list_words:
            member.word_group_id = new_word_group_id

    def update_kie_label(self):
        """Set ``kie_label`` to the most frequent label among the words.

        Ties are resolved in favour of the label that appears first in
        ``list_words``. An empty group keeps its current label instead of
        raising.
        """
        counts = {}
        for member in self.list_words:
            counts[member.kie_label] = counts.get(member.kie_label, 0) + 1

        if not counts:
            return

        # max() returns the first maximal key in insertion order.
        self.kie_label = max(counts, key=counts.get)
|
|
|
|
class Line():
    """A text line: its members, joined text and enclosing box.

    ``list_word_groups`` holds whatever was passed to ``merge_word`` or
    ``add_group``; ``words_to_lines`` first fills it with plain words and
    later replaces it with ``Word_group`` instances.
    """

    def __init__(self):
        self.type = "line"
        self.list_word_groups = []  # members of the line
        self.line_id = 0  # id of this line
        self.paragraph_id = 0  # id of the paragraph this line belongs to
        self.text = ""
        self.boundingbox = [-1, -1, -1, -1]  # placeholder until something is merged

    def add_group(self, word_group: "Word_group"):
        """Append a word group and propagate this line's ids to it.

        The line's text is extended with the group's text; the line's
        bounding box is not changed by this method.

        Returns:
            True if the group was added; False if its ``list_words`` is
            ``None`` or a group with the same ``word_group_id`` is present.
        """
        if word_group.list_words is None:
            return False

        if any(word_group.word_group_id == existing.word_group_id
               for existing in self.list_word_groups):
            logger.info("Word_group id collision")
            return False

        self.list_word_groups.append(word_group)
        self.text += word_group.text
        word_group.paragraph_id = self.paragraph_id
        word_group.line_id = self.line_id
        for member in word_group.list_words:
            member.paragraph_id = self.paragraph_id
            member.line_id = self.line_id
        return True

    def update_line_id(self, new_line_id):
        """Set a new line id on the line, its groups and their words."""
        self.line_id = new_line_id
        for group in self.list_word_groups:
            group.line_id = new_line_id
            for member in group.list_words:
                member.line_id = new_line_id

    def merge_word(self, word):
        """Add a word (or word group) and grow the line's box and text.

        Args:
            word: Any object with ``text`` and ``boundingbox`` attributes.

        Returns:
            True if merged; False if the text is the "✪" marker.
        """
        if word.text == "✪":
            return False

        if self.boundingbox == [-1, -1, -1, -1]:
            # First member: adopt its box (the same list object is referenced).
            self.boundingbox = word.boundingbox
        else:
            self.boundingbox = [min(self.boundingbox[0], word.boundingbox[0]),
                                min(self.boundingbox[1], word.boundingbox[1]),
                                max(self.boundingbox[2], word.boundingbox[2]),
                                max(self.boundingbox[3], word.boundingbox[3])]
        self.list_word_groups.append(word)
        self.text += ' ' + word.text
        return True

    def in_same_line(self, input_line, thresh=0.7, max_height_ratio=2):
        """Decide whether ``input_line`` lies on the same text line as this one.

        Only the vertical extents are compared. All three conditions must hold:

        * one box's top lies inside the other's ``[top, bottom)`` range;
        * vertical intersection divided by the smaller height is >= ``thresh``;
        * taller height divided by smaller height is < ``max_height_ratio``.

        Args:
            input_line: Any object with a ``[left, top, right, bottom]``
                ``boundingbox`` (a ``Line`` or a word).
            thresh: Minimum intersection-over-min-height ratio.
            max_height_ratio: Exclusive upper bound on the height ratio.
                Pass ``float("inf")`` to disable the height check.

        Returns:
            True if the two boxes are considered to share a line; False
            otherwise, including when either height is zero.
        """
        _, top1, _, bottom1 = self.boundingbox
        _, top2, _, bottom2 = input_line.boundingbox

        # Heights are bottom - top so they are positive for well-formed boxes;
        # with top - bottom the max/min ratio never exceeds 1 and the height
        # check could not reject anything.
        height1, height2 = bottom1 - top1, bottom2 - top2
        min_height = min(height1, height2)
        if min_height == 0:
            return False

        edges = sorted([top1, bottom1, top2, bottom2])
        intersection = edges[2] - edges[1]
        ratio = intersection / min_height
        ratio_height = max(height1, height2) / min_height

        # Plain comparisons instead of `in range(...)` so that float
        # coordinates work as well as ints.
        tops_overlap = (top2 <= top1 < bottom2) or (top1 <= top2 < bottom1)
        return bool(tops_overlap and ratio >= thresh and ratio_height < max_height_ratio)
|
|
|
|
def check_iomin(word: "Word", word_group: "Word_group", thresh=0.7):
    """Check vertical overlap between a word and a word group.

    The measure is the vertical intersection of the two boxes divided by the
    smaller of the two heights (intersection over minimum).

    Args:
        word: Object with a ``[left, top, right, bottom]`` ``boundingbox``.
        word_group: Object with a ``[left, top, right, bottom]`` ``boundingbox``.
        thresh: The ratio must be strictly greater than this value.

    Returns:
        True if the ratio exceeds ``thresh``; False otherwise, including when
        the smaller height is zero (matching ``Line.in_same_line``, which also
        rejects zero-height boxes instead of dividing by zero).
    """
    word_top, word_bottom = word.boundingbox[1], word.boundingbox[3]
    group_top, group_bottom = word_group.boundingbox[1], word_group.boundingbox[3]

    min_height = min(word_bottom - word_top, group_bottom - group_top)
    if min_height == 0:
        return False

    intersect = min(word_bottom, group_bottom) - max(word_top, group_top)
    return intersect / min_height > thresh
|
|
|
|
def words_to_lines(words, check_special_lines=True):
    """Arrange words into lines and, within each line, into word groups.

    Steps:
        1. Sort ``words`` in place by (top, left).
        2. Put each word with a positive-area box into the first existing
           line it shares a row with (``Line.in_same_line``), or start a new
           line for it.
        3. Sort lines top-to-bottom and each line's words left-to-right.
        4. Inside each line, chain neighbouring words into ``Word_group``
           instances and assign running word / word-group / line ids.

    Args:
        words: List of ``Word`` instances. The list is re-ordered in place.
        check_special_lines: Accepted for interface compatibility; not used
            by this function.

    Returns:
        A ``(lines, number_of_word)`` tuple, where ``lines`` is the list of
        ``Line`` objects sorted by top coordinate and ``number_of_word`` is
        ``len(words)`` (words skipped for their box size are still counted).
    """
    # Sort top-to-bottom, then left-to-right.
    words.sort(key=lambda w: (w.boundingbox[1], w.boundingbox[0]))
    number_of_word = len(words)

    # Pass 1: bucket words into lines (no word groups yet).
    lines = []
    for word in words:
        # invalid_size() is truthy for a usable (positive-area) box.
        if not word.invalid_size():
            continue
        for line in lines:
            if line.in_same_line(word):
                line.merge_word(word)
                # Stop at the first matching line: merging the same word
                # into several lines would duplicate it and let the later
                # line overwrite its ids.
                break
        else:
            fresh_line = Line()
            fresh_line.merge_word(word)
            lines.append(fresh_line)

    # Order lines top-to-bottom by their top coordinate.
    lines.sort(key=lambda ln: ln.boundingbox[1])

    # Pass 2: build word groups inside every non-empty line.
    line_id = 0
    word_group_id = 0
    word_id = 0
    for line in lines:
        members = line.list_word_groups  # still plain words at this point
        if not members:
            continue

        line_width = line.boundingbox[2] - line.boundingbox[0]  # right - left
        members.sort(key=lambda w: w.boundingbox[0])  # left-to-right

        # Rebuild the line text so it follows the sorted order.
        line.text = "".join(" " + w.text for w in members)

        groups = []
        for word in members:
            word.word_id = word_id
            word_id += 1

            previous = groups[-1] if groups else None
            # Extend the previous group when it does not end with ':', the
            # horizontal gap is under 5% of the line width, and the word
            # overlaps the group vertically.
            if (previous is not None
                    and not previous.text.endswith(':')
                    and (word.boundingbox[0] - previous.boundingbox[2]) / line_width < 0.05
                    and check_iomin(word, previous)):
                previous.add_word(word)
                continue

            group = Word_group()
            group.word_group_id = word_group_id
            word_group_id += 1
            group.add_word(word)
            groups.append(group)

        line.list_word_groups = groups
        # Number lines top-to-bottom; empty lines do not consume an id.
        line.update_line_id(line_id)
        line_id += 1

    return lines, number_of_word