class Word():
    """A single word: its text, image, confidence values and bounding box.

    Boxes are ``[left, top, right, bottom]``, i.e. the top-left and
    bottom-right corners. ``[-1, -1, -1, -1]`` means "no box yet".
    """

    def __init__(self, text="", image=None, conf_detect=0.0, conf_cls=0.0,
                 bndbox=None, kie_label=""):
        """Store the given fields and reset every id to 0.

        ``bndbox`` defaults to a fresh ``[-1, -1, -1, -1]`` list for each
        instance (a list literal in the signature would be shared by all
        instances created without an explicit box).
        """
        self.type = "word"
        self.text = text
        self.image = image
        self.conf_detect = conf_detect
        self.conf_cls = conf_cls
        # [left, top, right, bottom]
        self.boundingbox = [-1, -1, -1, -1] if bndbox is None else bndbox
        self.word_id = 0  # id of this word
        self.word_group_id = 0  # id of the word group this word belongs to
        self.line_id = 0  # id of the line this word belongs to
        self.paragraph_id = 0  # id of the paragraph this word belongs to
        self.kie_label = kie_label

    def invalid_size(self):
        """Return True when the box area (width * height) is positive.

        NOTE: despite the name, a truthy result means the box is usable;
        ``words_to_lines`` skips the words for which this is falsy.
        """
        left, top, right, bottom = self.boundingbox[:4]
        return (right - left) * (bottom - top) > 0

    def is_special_word(self):
        """Return True for a ``None`` text or a long, digit-heavy text.

        "Long" means at least 7 characters; "digit-heavy" means at least
        30% of the characters are digits.
        """
        text = self.text
        if text is None:
            return True
        if len(text) >= 7:
            no_digits = sum(c.isdigit() for c in text)
            return no_digits / len(text) >= 0.3
        return False


class Word_group():
    """An ordered list of ``Word`` objects with merged text and box."""

    def __init__(self):
        self.type = "word_group"
        self.list_words = []  # list of Word instances
        self.word_group_id = 0  # id of this word group
        self.line_id = 0  # id of the line this group belongs to
        self.paragraph_id = 0  # id of the paragraph this group belongs to
        self.text = ""
        self.boundingbox = [-1, -1, -1, -1]
        self.kie_label = ""

    def add_word(self, word: Word):
        """Append ``word`` to the group and grow the group's text and box.

        The word's ``word_group_id`` and ``paragraph_id`` are overwritten
        with the group's values. The group text gets ``' ' + word.text``
        appended, so it starts with a space.

        Returns:
            True if the word was added. False if its text is the "✪"
            placeholder or if its ``word_id`` is already in the group.
        """
        if word.text == "✪":
            return False
        if any(word.word_id == w.word_id for w in self.list_words):
            print("Word id collision")
            return False

        word.word_group_id = self.word_group_id
        word.paragraph_id = self.paragraph_id
        self.list_words.append(word)
        self.text += ' ' + word.text
        if self.boundingbox == [-1, -1, -1, -1]:
            # Copy so the group never shares a list object with the word.
            self.boundingbox = list(word.boundingbox)
        else:
            self.boundingbox = [min(self.boundingbox[0], word.boundingbox[0]),
                                min(self.boundingbox[1], word.boundingbox[1]),
                                max(self.boundingbox[2], word.boundingbox[2]),
                                max(self.boundingbox[3], word.boundingbox[3])]
        return True

    def update_word_group_id(self, new_word_group_id):
        """Set the group id on the group and on every word it contains."""
        self.word_group_id = new_word_group_id
        for word in self.list_words:
            word.word_group_id = new_word_group_id

    def update_kie_label(self):
        """Set ``kie_label`` to the most frequent label among the words.

        Ties go to the label that appears first in ``list_words``. An empty
        group keeps its current label instead of raising.
        """
        if not self.list_words:
            return
        counts = {}
        for word in self.list_words:
            counts[word.kie_label] = counts.get(word.kie_label, 0) + 1
        # max() returns the first maximal key in insertion order.
        self.kie_label = max(counts, key=counts.get)


class Line():
    """A text line: a list of words or word groups plus merged text/box."""

    def __init__(self):
        self.type = "line"
        # Holds Word instances while lines are being built by merge_word,
        # and Word_group instances once words_to_lines has grouped them.
        self.list_word_groups = []
        self.line_id = 0  # id of this line
        self.paragraph_id = 0  # id of the paragraph this line belongs to
        self.text = ""
        self.boundingbox = [-1, -1, -1, -1]

    def add_group(self, word_group: Word_group):
        """Append a word group and propagate line/paragraph ids to it.

        The group's text is appended to the line text. The line's bounding
        box is not modified by this method.

        Returns:
            True if the group was added. False if ``list_words`` is None or
            a group with the same ``word_group_id`` is already present.
        """
        if word_group.list_words is None:
            return False
        if any(word_group.word_group_id == wg.word_group_id
               for wg in self.list_word_groups):
            print("Word_group id collision")
            return False

        self.list_word_groups.append(word_group)
        self.text += word_group.text
        word_group.paragraph_id = self.paragraph_id
        word_group.line_id = self.line_id
        for word in word_group.list_words:
            word.paragraph_id = self.paragraph_id
            word.line_id = self.line_id
        return True

    def update_line_id(self, new_line_id):
        """Set the line id on the line, its groups and their words."""
        self.line_id = new_line_id
        for group in self.list_word_groups:
            group.line_id = new_line_id
            for word in group.list_words:
                word.line_id = new_line_id

    def merge_word(self, word):
        """Append a ``Word`` or ``Word_group`` and grow the text and box.

        Returns:
            True if merged, False if the text is the "✪" placeholder.
        """
        if word.text == "✪":
            return False
        if self.boundingbox == [-1, -1, -1, -1]:
            # Copy so the line never shares a list object with the word.
            self.boundingbox = list(word.boundingbox)
        else:
            self.boundingbox = [min(self.boundingbox[0], word.boundingbox[0]),
                                min(self.boundingbox[1], word.boundingbox[1]),
                                max(self.boundingbox[2], word.boundingbox[2]),
                                max(self.boundingbox[3], word.boundingbox[3])]
        self.list_word_groups.append(word)
        self.text += ' ' + word.text
        return True

    def in_same_line(self, input_line, thresh=0.7):
        """Decide whether ``input_line`` lies on the same text line.

        Only the vertical extents of the two boxes are used. All three
        conditions must hold:

        * one box's top edge falls inside the other's ``[top, bottom)``;
        * the gap between the two middle edges, divided by the smaller
          height, is at least ``thresh``;
        * the taller box is less than twice as tall as the shorter one.

        Args:
            input_line: any object with a ``boundingbox`` attribute.
            thresh: minimum overlap ratio relative to the smaller height.

        Returns:
            True if the boxes are considered to be on the same line; False
            otherwise, including when either box has zero height.
        """
        _, top1, _, bottom1 = self.boundingbox
        _, top2, _, bottom2 = input_line.boundingbox

        # Heights are bottom - top. The previous top - bottom form made both
        # values negative, so max/min was always <= 1 and the "< 2" guard
        # below could never reject anything.
        height1, height2 = bottom1 - top1, bottom2 - top2
        min_height = min(height1, height2)
        if min_height == 0:
            return False

        sorted_vals = sorted([top1, bottom1, top2, bottom2])
        intersection = sorted_vals[2] - sorted_vals[1]
        ratio = intersection / min_height
        ratio_height = float(max(height1, height2)) / float(min_height)

        # Chained comparisons match `x in range(a, b)` for integers and also
        # accept float coordinates, which range() rejects.
        starts_inside = (top2 <= top1 < bottom2) or (top1 <= top2 < bottom1)
        return starts_inside and ratio >= thresh and ratio_height < 2


def check_iomin(word: Word, word_group: Word_group):
    """Return True if the vertical overlap exceeds 70% of the smaller height.

    Returns False when either box has zero height, where the ratio is
    undefined.
    """
    word_top, word_bottom = word.boundingbox[1], word.boundingbox[3]
    group_top, group_bottom = word_group.boundingbox[1], word_group.boundingbox[3]

    min_height = min(word_bottom - word_top, group_bottom - group_top)
    if min_height == 0:
        return False
    intersect = min(word_bottom, group_bottom) - max(word_top, group_top)
    return intersect / min_height > 0.7


def _assign_words_to_lines(words):
    """Place each usable word into the first existing line it matches."""
    lines = []
    for word in words:
        if not word.invalid_size():
            continue  # box area is not positive
        for line in lines:
            if line.in_same_line(word):
                line.merge_word(word)
                # Stop at the first match so a word is never merged into
                # two lines.
                break
        else:
            fresh_line = Line()
            fresh_line.merge_word(word)
            lines.append(fresh_line)
    return lines


def _belongs_to_group(word, group, line_width):
    """Tell whether ``word`` should extend ``group`` instead of starting anew."""
    if group.text.endswith(':'):
        return False
    if line_width == 0:
        return False  # the gap ratio below would divide by zero
    gap_ratio = (word.boundingbox[0] - group.boundingbox[2]) / line_width
    return gap_ratio < 0.05 and check_iomin(word, group)


def words_to_lines(words, check_special_lines=True):
    """Arrange words into lines, and each line's words into word groups.

    Steps:

    1. Sort ``words`` in place by (top, left).
    2. Skip words whose ``invalid_size()`` is falsy; put every other word
       into the first line accepted by ``Line.in_same_line``, or into a new
       line.
    3. Sort lines by their top coordinate and the words of each line by
       their left coordinate, then rebuild the line text.
    4. Walk each line left to right. A word joins the previous group when
       that group's text does not end with ':', the horizontal gap is below
       5% of the line width, and ``check_iomin`` accepts the pair. Otherwise
       it starts a new group.

    Word, group and line ids are assigned sequentially from 0 in that
    traversal order. Lines that ended up with no words are returned as they
    are and do not consume a line id.

    Args:
        words: list of ``Word`` instances; the list is sorted in place.
        check_special_lines: accepted for compatibility; not used here.

    Returns:
        ``(lines, number_of_word)`` where ``number_of_word`` is
        ``len(words)``, including the words that were skipped.
    """
    words.sort(key=lambda w: (w.boundingbox[1], w.boundingbox[0]))
    number_of_word = len(words)

    lines = _assign_words_to_lines(words)
    lines.sort(key=lambda ln: ln.boundingbox[1])

    line_id = 0
    word_group_id = 0
    word_id = 0
    for line in lines:
        if not line.list_word_groups:
            continue

        line_width = line.boundingbox[2] - line.boundingbox[0]
        line.list_word_groups.sort(key=lambda w: w.boundingbox[0])
        line.text = "".join(" " + w.text for w in line.list_word_groups)

        groups = []
        for word in line.list_word_groups:
            word.word_id = word_id
            word_id += 1
            if groups and _belongs_to_group(word, groups[-1], line_width):
                groups[-1].add_word(word)
                continue
            group = Word_group()
            group.word_group_id = word_group_id
            word_group_id += 1
            group.add_word(word)
            groups.append(group)

        line.list_word_groups = groups
        line.update_line_id(line_id)
        line_id += 1
    return lines, number_of_word