# sbt-idp/cope2n-ai-fi/modules/_sdsvkvu/sdsvkvu/utils/post_processing.py

import re
import string
from datetime import datetime

import nltk
import tldextract
from dateutil import parser

from sdsvkvu.utils.word2line import Word, words_to_lines

# Make sure the NLTK word corpus is available; download it on first use only.
try:
    nltk.data.find("corpora/words")
except LookupError:
    nltk.download('words')
words = set(nltk.corpus.words.words())

# Vietnamese accented characters (s1) and their unaccented ASCII counterparts
# (s0), aligned index-by-index for remove_accents().
s1 = u'ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚÝàáâãèéêìíòóôõùúýĂăĐđĨĩŨũƠơƯưẠạẢảẤấẦầẨẩẪẫẬậẮắẰằẲẳẴẵẶặẸẹẺẻẼẽẾếỀềỂểỄễỆệỈỉỊịỌọỎỏỐốỒồỔổỖỗỘộỚớỜờỞởỠỡỢợỤụỦủỨứỪừỬửỮữỰựỲỳỴỵỶỷỸỹ'
s0 = u'AAAAEEEIIOOOOUUYaaaaeeeiioooouuyAaDdIiUuOoUuAaAaAaAaAaAaAaAaAaAaAaAaEeEeEeEeEeEeEeEeIiIiOoOoOoOoOoOoOoOoOoOoOoOoUuUuUuUuUuUuUuYyYyYyYy'


def get_string(lwords: list):
    """Join words, dropping duplicate tokens except single-digit ones."""
    unique_list = []
    for item in lwords:
        if item.isdigit() and len(item) == 1:
            unique_list.append(item)
        elif item not in unique_list:
            unique_list.append(item)
    return ' '.join(unique_list)


def get_string_by_deduplicate_bbox(lwords: list, lbboxes: list):
    """Join words, skipping consecutive words that share the same bounding box."""
    unique_list = []
    prev_bbox = [-1, -1, -1, -1]
    for word, bbox in zip(lwords, lbboxes):
        if bbox != prev_bbox:
            unique_list.append(word)
            prev_bbox = bbox
    return ' '.join(unique_list)
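
# Usage sketch with hypothetical inputs: the repeated word is dropped because
# it shares a bounding box with the previous one.
#
#   >>> get_string_by_deduplicate_bbox(["Total", "Total", "100"],
#   ...                                [[0, 0, 1, 1], [0, 0, 1, 1], [2, 0, 3, 1]])
#   'Total 100'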


def get_string_with_word2line(lwords: list, lbboxes: list):
    """Re-order deduplicated words into reading order via word2line, then join."""
    list_words = []
    unique_list = []
    list_sorted_words = []
    prev_bbox = [-1, -1, -1, -1]
    for word, bbox in zip(lwords, lbboxes):
        if bbox != prev_bbox:
            prev_bbox = bbox
            list_words.append(Word(image=None, text=word, conf_cls=-1, bndbox=bbox, conf_detect=-1))
            unique_list.append(word)

    llines = words_to_lines(list_words)[0]
    for line in llines:
        for _word_group in line.list_word_groups:
            for _word in _word_group.list_words:
                list_sorted_words.append(_word.text)

    string_from_model = ' '.join(unique_list)
    string_after_word2line = ' '.join(list_sorted_words)
    # if string_from_model != string_after_word2line:
    #     print("[Warning] Word group from model is different with word2line module")
    #     print("Model: ", string_from_model)
    #     print("Word2line: ", string_after_word2line)
    return string_after_word2line


def date_regexing(inp_str):
    """Extract day/month/year digits from a Vietnamese date string.

    The pattern keys are Vietnamese: "ngày" = day, "tháng" = month, "năm" = year.
    """
    patterns = {
        'ngay': r"ngày\d+",
        'thang': r"tháng\d+",
        'nam': r"năm\d+"
    }
    inp_str = inp_str.replace(" ", "").lower()
    outputs = {k: '' for k in patterns}
    for key, pattern in patterns.items():
        matches = re.findall(pattern, inp_str)
        if len(matches) > 0:
            element = set([match[len(key):] for match in matches])
            outputs[key] = list(element)[0]
    return outputs['ngay'], outputs['thang'], outputs['nam']
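
# Usage sketch with a hypothetical input: after spaces are stripped, the string
# becomes "ngày05tháng11năm2023" and all three patterns match.
#
#   >>> date_regexing("Ngày 05 tháng 11 năm 2023")
#   ('05', '11', '2023')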


def parse_date1(date_str):
    # Normalize separators: drop brackets/punctuation and spaces around "/" and "-".
    date_str = re.sub(r"[\[\]\{\}\(\)\.\,]", " ", date_str)
    date_str = re.sub(r"/\s+", "/", date_str)
    date_str = re.sub(r"-\s+", "-", date_str)
    is_parser_error = False
    try:
        date_obj = parser.parse(date_str, fuzzy=True)
        year_str = str(date_obj.year)
        day_str = str(date_obj.day)
        date_formated = date_obj.strftime("%Y-%m-%d")
    except Exception as err:
        # A regex fallback via date_regexing() was tried here and is disabled.
        print(f"Error parse date: err = {err}, date = {date_str}")
        date_formated = date_str
        is_parser_error = True
        return date_formated, is_parser_error

    # Six-digit dates (e.g. ddmmyy) carry only a two-digit year.
    if len(normalize_number(date_str)) == 6:
        year_str = year_str[-2:]

    # If the year appears after the day in the raw string, re-parse with dayfirst=True.
    try:
        year_index = date_str.index(str(year_str))
        day_index = date_str.index(str(day_str))
        if year_index > day_index:
            date_obj = parser.parse(date_str, fuzzy=True, dayfirst=True)
            date_formated = date_obj.strftime("%Y-%m-%d")
    except Exception as err:
        print(f"Error check dayfirst: err = {err}, date = {date_str}")
    return date_formated, is_parser_error
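
# Usage sketch with a hypothetical input: "25/12/2023" parses cleanly, and
# since the year follows the day, the dayfirst re-parse yields the same date.
#
#   >>> parse_date1("25/12/2023")
#   ('2023-12-25', False)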


def parse_date(date_str):
    """Parse a date string into a list of candidate "%Y-%m-%d" dates.

    Returns both day-first and month-first readings when the input is ambiguous.
    """
    # Normalize separators and a common OCR confusion ("0ct" -> "oct").
    date_str = re.sub(r"[\[\]\{\}\(\)\.\,]", " ", date_str)
    date_str = re.sub(r"/\s+", "/", date_str)
    date_str = re.sub(r"-\s+", "-", date_str)
    date_str = re.sub(r"\-+", "-", date_str)
    date_str = date_str.lower().replace("0ct", "oct")
    is_parser_error = False
    try:
        date_obj = parser.parse(date_str, fuzzy=True)
    except Exception as err:
        print(f"1. Error parse date: err = {err}, date = {date_str}")
        try:
            # Retry on the longest whitespace-separated token.
            date_str = sorted(date_str.split(" "), key=lambda x: len(x), reverse=True)[0]
            date_obj = parser.parse(date_str, fuzzy=True)
        except Exception as err:
            print(f"2. Error parse date: err = {err}, date = {date_str}")
            is_parser_error = True
            return [date_str], is_parser_error

    year_str = int(date_obj.year)
    month_str = int(date_obj.month)
    day_str = int(date_obj.day)
    current_year = int(datetime.now().year)
    if year_str > current_year or year_str < 2010:  # year outside the expected range
        date_obj = date_obj.replace(year=current_year)
    formated_date = date_obj.strftime("%Y-%m-%d")
    revert_formated_date = date_obj.strftime("%Y-%d-%m")
    # A spelled-out month is unambiguous, so return a single candidate.
    if any(txt in date_str for txt in ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                                       'jul', 'aug', 'sep', 'oct', 'nov', 'dec']):
        return [formated_date], is_parser_error
    # Otherwise both day/month orderings are plausible.
    if month_str <= 12 and day_str <= 12:
        return [formated_date, revert_formated_date], is_parser_error
    return [formated_date], is_parser_error
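
# Usage sketch with hypothetical inputs: "05/04/2023" is ambiguous, so both
# orderings come back; "12 0ct 2023" (OCR'd "oct") gives a single candidate.
#
#   >>> parse_date("05/04/2023")
#   (['2023-05-04', '2023-04-05'], False)
#   >>> parse_date("12 0ct 2023")
#   (['2023-10-12'], False)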


def normalize_imei(imei):
    """Strip spaces and keep only the part before any "/"."""
    imei = imei.replace(" ", "")
    imei = imei.split("/")[0]
    return imei


def normalize_seller(seller):
    return seller


def normalize_website(website):
    """Reduce a URL-like string to its bare domain name."""
    if isinstance(website, str):
        website = website.lower()
        website = website.split(".com")[0]
        website = tldextract.extract(website).domain
    return website


def normalize_hotline(hotline):
    if isinstance(hotline, str):
        hotline = hotline.lower().replace("hotline", "")
    return hotline


def normalize_voucher(voucher):
    if isinstance(voucher, str):
        voucher = voucher.lower().replace("voucher", "")
    return voucher
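
# Usage sketch with hypothetical inputs for the small normalizers above
# (tldextract resolves the registered domain from a URL):
#
#   >>> normalize_imei("356938 035643809/01")
#   '356938035643809'
#   >>> normalize_website("https://shopee.vn/deals")
#   'shopee'
#   >>> normalize_hotline("Hotline 1900 1060")
#   ' 1900 1060'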


def normalize_number(
    text_str: str, reserve_dot=False, reserve_plus=False, reserve_minus=False
):
    """Normalize a string of numbers by removing non-numeric characters."""
    assert isinstance(text_str, str), "input must be str"
    reserved_chars = ""
    if reserve_dot:
        reserved_chars += ".,"
    if reserve_plus:
        reserved_chars += "+"
    if reserve_minus:
        reserved_chars += "-"
    regex_formula = "[^0-9{}]".format(reserved_chars)
    normalized_text_str = re.sub(r"{}".format(regex_formula), "", text_str)
    return normalized_text_str
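
# Usage sketch with a hypothetical input; note that reserve_dot keeps
# both "." and ",".
#
#   >>> normalize_number("Total: -1,234.56 VND", reserve_dot=True, reserve_minus=True)
#   '-1,234.56'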


def remove_bullet_points_and_punctuation(text):
    # Remove leading bullet points (e.g., •, -, or *)
    text = re.sub(r'^\s*[•\-\*]\s*', '', text, flags=re.MULTILINE)
    text = text.strip()
    # Remove a single leading/trailing punctuation mark
    if len(text) > 0 and text[0] in (',', '.', ':', ';', '?', '!'):
        text = text[1:]
    if len(text) > 0 and text[-1] in (',', '.', ':', ';', '?', '!'):
        text = text[:-1]
    return text.strip()
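
# Usage sketch with a hypothetical input:
#
#   >>> remove_bullet_points_and_punctuation("• Grand total.")
#   'Grand total'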


def split_key_value_by_colon(key: str, value: str) -> tuple:
    """Re-split a key/value pair on the first ":" when the key holds no digits."""
    key_string = key if key is not None else ""
    value_string = value if value is not None else ""
    text_string = key_string + " " + value_string
    elements = text_string.split(":")
    if len(elements) > 1 and not bool(re.search(r'\d', elements[0])):
        return elements[0], text_string[len(elements[0]) + 1:].strip()
    return key, value
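
# Usage sketch with hypothetical inputs: the colon sits in the value, so the
# pair is re-split around it (the returned key keeps its trailing space).
#
#   >>> split_key_value_by_colon("Seller", ": ABC Store")
#   ('Seller ', 'ABC Store')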


def is_string_in_range(s):
    """Return True if s parses as an integer between 0 and 9."""
    try:
        num = int(s)
        return 0 <= num <= 9
    except ValueError:
        return False


def remove_english_words(text):
    """Drop tokens found in the NLTK English word list."""
    _word = [w.lower() for w in nltk.wordpunct_tokenize(text) if w.lower() not in words]
    return ' '.join(_word)


def remove_punctuation(text):
    return text.translate(str.maketrans(" ", " ", string.punctuation))


def remove_accents(input_str, s0, s1):
    """Replace each accented character from s1 with its counterpart in s0."""
    s = ''
    for c in input_str:
        if c in s1:
            s += s0[s1.index(c)]
        else:
            s += c
    return s


def remove_spaces(text):
    return text.replace(' ', '')


def preprocessing(text: str):
    """Return a lowercase, unaccented, punctuation- and space-free form of text."""
    text = remove_punctuation(text)
    text = remove_accents(text, s0, s1)
    text = remove_spaces(text)
    return text.lower()
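
# Usage sketch with a hypothetical input: accents are folded to ASCII before
# spaces are removed.
#
#   >>> preprocessing("Công ty TNHH ABC")
#   'congtytnhhabc'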


def longestCommonSubsequence(text1: str, text2: str) -> int:
    # https://leetcode.com/problems/longest-common-subsequence/discuss/351689/JavaPython-3-Two-DP-codes-of-O(mn)-and-O(min(m-n))-spaces-w-picture-and-analysis
    dp = [[0] * (len(text2) + 1) for _ in range(len(text1) + 1)]
    for i, c in enumerate(text1):
        for j, d in enumerate(text2):
            dp[i + 1][j + 1] = 1 + dp[i][j] if c == d else max(dp[i][j + 1], dp[i + 1][j])
    return dp[-1][-1]
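
# Usage sketch with a hypothetical input:
#
#   >>> longestCommonSubsequence("abcde", "ace")   # subsequence "ace"
#   3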


def longest_common_subsequence_with_idx(X, Y):
    """Dynamic-programming LCS that also locates the match inside X.

    Returns a tuple of (length of the LCS, index of the first character of
    the LCS in X, index just past the last character of the LCS in X).
    """
    m, n = len(X), len(Y)
    # L[i][j] holds the LCS length of X[0..i-1] and Y[0..j-1].
    L = [[0 for i in range(n + 1)] for j in range(m + 1)]
    right_idx = 0
    max_lcs = 0
    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0 or j == 0:
                L[i][j] = 0
            elif X[i - 1] == Y[j - 1]:
                L[i][j] = L[i - 1][j - 1] + 1
                if L[i][j] > max_lcs:
                    max_lcs = L[i][j]
                    right_idx = i
            else:
                L[i][j] = max(L[i - 1][j], L[i][j - 1])
    # Length of the full LCS.
    lcs = L[m][n]

    # Walk back from the bottom-right corner to find the left end of the LCS in X.
    i = m
    j = n
    while i > 0 and j > 0:
        if X[i - 1] == Y[j - 1]:
            # Matching characters are part of the LCS.
            i -= 1
            j -= 1
        elif L[i - 1][j] > L[i][j - 1]:
            # Follow the larger of the two neighboring cells.
            i -= 1
        else:
            j -= 1
    return lcs, i, max(i + lcs, right_idx)
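
# Usage sketch with hypothetical inputs: the LCS of "hoadon" and "don" is
# "don", which spans X[3:6].
#
#   >>> longest_common_subsequence_with_idx("hoadon", "don")
#   (3, 3, 6)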


def longest_common_substring(X, Y):
    """Return the length of the longest common contiguous substring of X and Y."""
    m = len(X)
    n = len(Y)
    # dp[i][j] holds the length of the common suffix of X[:i] and Y[:j].
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    max_length = 0  # length of the longest common substring seen so far
    end_index = 0   # index in X where that substring ends
    # Build the dp table bottom-up.
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if X[i - 1] == Y[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
                if dp[i][j] > max_length:
                    max_length = dp[i][j]
                    end_index = i - 1
            else:
                dp[i][j] = 0
    # The substring itself is X[end_index - max_length + 1: end_index + 1];
    # only its length is needed here.
    return max_length
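
# Usage sketch with hypothetical inputs: "don gt" occurs contiguously in
# "hoa don gtgt".
#
#   >>> longest_common_substring("hoa don gtgt", "don gt")
#   6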