import re
import nltk
import string
import tldextract
from dateutil import parser
from datetime import datetime

# Download the NLTK word corpus only if it is not already present.
try:
    nltk.data.find("corpora/words")
except LookupError:
    nltk.download('words')
words = set(nltk.corpus.words.words())

from sdsvkvu.utils.word2line import Word, words_to_lines

# Parallel lookup strings for stripping Vietnamese diacritics: each character
# in s1 maps to the unaccented character at the same index in s0.
s1 = u'ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚÝàáâãèéêìíòóôõùúýĂăĐđĨĩŨũƠơƯưẠạẢảẤấẦầẨẩẪẫẬậẮắẰằẲẳẴẵẶặẸẹẺẻẼẽẾếỀềỂểỄễỆệỈỉỊịỌọỎỏỐốỒồỔổỖỗỘộỚớỜờỞởỠỡỢợỤụỦủỨứỪừỬửỮữỰựỲỳỴỵỶỷỸỹ'
s0 = u'AAAAEEEIIOOOOUUYaaaaeeeiioooouuyAaDdIiUuOoUuAaAaAaAaAaAaAaAaAaAaAaAaEeEeEeEeEeEeEeEeIiIiOoOoOoOoOoOoOoOoOoOoOoOoUuUuUuUuUuUuUuYyYyYyYy'


# def clean_text(text):
#     return re.sub(r"[^A-Za-z(),!?\'\`]", " ", text)


def get_string(lwords: list):
    """Join words into a string, deduplicating repeats except single digits."""
    unique_list = []
    for item in lwords:
        if item.isdigit() and len(item) == 1:
            unique_list.append(item)
        elif item not in unique_list:
            unique_list.append(item)
    return ' '.join(unique_list)


def get_string_by_deduplicate_bbox(lwords: list, lbboxes: list):
    """Join words into a string, skipping consecutive words that share a bounding box."""
    unique_list = []
    prev_bbox = [-1, -1, -1, -1]
    for word, bbox in zip(lwords, lbboxes):
        if bbox != prev_bbox:
            unique_list.append(word)
            prev_bbox = bbox
    return ' '.join(unique_list)
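
# Illustrative usage (hypothetical OCR output; repeated boxes collapse to one word):
#   >>> get_string_by_deduplicate_bbox(["Tổng", "Tổng", "cộng"],
#   ...                                [[0, 0, 5, 5], [0, 0, 5, 5], [6, 0, 9, 5]])
#   'Tổng cộng'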


def get_string_with_word2line(lwords: list, lbboxes: list):
    """Deduplicate words by bounding box, then reorder them into reading order
    with the word2line module before joining into a single string."""
    list_words = []
    unique_list = []
    list_sorted_words = []

    prev_bbox = [-1, -1, -1, -1]
    for word, bbox in zip(lwords, lbboxes):
        if bbox != prev_bbox:
            prev_bbox = bbox
            list_words.append(Word(image=None, text=word, conf_cls=-1, bndbox=bbox, conf_detect=-1))
            unique_list.append(word)

    llines = words_to_lines(list_words)[0]

    for line in llines:
        for _word_group in line.list_word_groups:
            for _word in _word_group.list_words:
                list_sorted_words.append(_word.text)

    string_from_model = ' '.join(unique_list)
    string_after_word2line = ' '.join(list_sorted_words)

    # if string_from_model != string_after_word2line:
    #     print("[Warning] Word grouping from the model differs from the word2line module")
    #     print("Model: ", ' '.join(unique_list))
    #     print("Word2line: ", ' '.join(list_sorted_words))

    return string_after_word2line


def date_regexing(inp_str):
    """Extract day/month/year digits from Vietnamese text such as
    'ngày 15 tháng 3 năm 2023'."""
    patterns = {
        'ngay': r"ngày\d+",
        'thang': r"tháng\d+",
        'nam': r"năm\d+"
    }
    # Strip spaces so each keyword is glued to the digits that follow it.
    inp_str = inp_str.replace(" ", "").lower()
    outputs = {k: '' for k in patterns}
    for key, pattern in patterns.items():
        matches = re.findall(pattern, inp_str)
        if len(matches) > 0:
            # The dict keys are unaccented but length-matched to the accented
            # keywords, so slicing by len(key) strips the keyword prefix.
            element = set([match[len(key):] for match in matches])
            outputs[key] = list(element)[0]
    return outputs['ngay'], outputs['thang'], outputs['nam']
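
# Illustrative usage:
#   >>> date_regexing("Ngày 15 tháng 3 năm 2023")
#   ('15', '3', '2023')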


def parse_date1(date_str):
    """Legacy date parser; parse_date below supersedes it."""
    # Replace brackets and punctuation with spaces, then close up the gaps
    # around the date separators.
    date_str = re.sub(r"[\[\]\{\}\(\)\.\,]", " ", date_str)
    date_str = re.sub(r"/\s+", "/", date_str)
    date_str = re.sub(r"-\s+", "-", date_str)

    is_parser_error = False
    try:
        date_obj = parser.parse(date_str, fuzzy=True)
        year_str = str(date_obj.year)
        day_str = str(date_obj.day)
        # date_formated = date_obj.strftime("%d/%m/%Y")
        date_formated = date_obj.strftime("%Y-%m-%d")
    except Exception as err:
        # date_str = sorted(date_str.split(" "), key=lambda x: len(x), reverse=True)[0]
        # date_str, is_match = date_regexing(date_str)
        is_match = False  # the regex fallback above is disabled, so this stays False
        if is_match:
            date_formated = date_str
            is_parser_error = False
            return date_formated, is_parser_error
        else:
            print(f"Error parse date: err = {err}, date = {date_str}")
            date_formated = date_str
            is_parser_error = True
            return date_formated, is_parser_error

    # For six-digit dates (e.g. 01/02/23) dateutil may guess the wrong field
    # order; re-parse with dayfirst=True when the year appears after the day.
    if len(normalize_number(date_str)) == 6:
        year_str = year_str[-2:]
        try:
            year_index = date_str.index(str(year_str))
            day_index = date_str.index(str(day_str))
            if year_index > day_index:
                date_obj = parser.parse(date_str, fuzzy=True, dayfirst=True)
                # date_formated = date_obj.strftime("%d/%m/%Y")
                date_formated = date_obj.strftime("%Y-%m-%d")
        except Exception as err:
            print(f"Error check dayfirst: err = {err}, date = {date_str}")

    return date_formated, is_parser_error


def parse_date(date_str):
    """Parse a noisy date string into ISO candidates.
    Returns ([candidate 'YYYY-MM-DD' strings], is_parser_error)."""
    # Replace brackets and punctuation with spaces, close up the gaps around
    # separators, and fix the common OCR confusion '0ct' -> 'oct'.
    date_str = re.sub(r"[\[\]\{\}\(\)\.\,]", " ", date_str)
    date_str = re.sub(r"/\s+", "/", date_str)
    date_str = re.sub(r"-\s+", "-", date_str)
    date_str = re.sub(r"\-+", "-", date_str)
    date_str = date_str.lower().replace("0ct", "oct")

    is_parser_error = False
    try:
        date_obj = parser.parse(date_str, fuzzy=True)
    except Exception as err:
        print(f"1.Error parse date: err = {err}, date = {date_str}")
        # Fall back to the longest whitespace-separated token.
        try:
            date_str = sorted(date_str.split(" "), key=lambda x: len(x), reverse=True)[0]
            date_obj = parser.parse(date_str, fuzzy=True)
        except Exception as err:
            print(f"2.Error parse date: err = {err}, date = {date_str}")
            is_parser_error = True
            return [date_str], is_parser_error

    year_str = int(date_obj.year)
    month_str = int(date_obj.month)
    day_str = int(date_obj.day)

    current_year = int(datetime.now().year)
    if year_str > current_year or year_str < 2010:  # invalid year
        date_obj = date_obj.replace(year=current_year)

    formated_date = date_obj.strftime("%Y-%m-%d")
    revert_formated_date = date_obj.strftime("%Y-%d-%m")

    # A spelled-out month removes the day/month ambiguity.
    if any(txt in date_str for txt in ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']):
        return [formated_date], is_parser_error
    # If both fields could be a month, return both orderings as candidates.
    if month_str <= 12 and day_str <= 12:
        return [formated_date, revert_formated_date], is_parser_error
    return [formated_date], is_parser_error
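
# Illustrative usage (results depend on the installed dateutil version and on
# the current year, which bounds the validity check above):
#   >>> parse_date("05/03/2023")
#   (['2023-05-03', '2023-03-05'], False)   # ambiguous day/month: two candidates
#   >>> parse_date("15 oct 2023")
#   (['2023-10-15'], False)                 # spelled-out month: unambiguous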


def normalize_imei(imei):
    """Strip spaces and keep only the part before any '/' separator."""
    imei = imei.replace(" ", "")
    imei = imei.split("/")[0]
    return imei


def normalize_seller(seller):
    # if isinstance(seller, str):
    #     seller = seller
    return seller


def normalize_website(website):
    """Reduce a URL-like string to its bare domain name."""
    if isinstance(website, str):
        # website = website.lower().replace("www.", "").replace("ww.", "").replace(".com", "")
        website = website.lower()
        website = website.split(".com")[0]
        website = tldextract.extract(website).domain
    return website
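
# Illustrative usage (assuming tldextract can resolve its public-suffix list):
#   >>> normalize_website("www.Shopee.vn")
#   'shopee'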


def normalize_hotline(hotline):
    if isinstance(hotline, str):
        hotline = hotline.lower().replace("hotline", "")
    return hotline


def normalize_voucher(voucher):
    if isinstance(voucher, str):
        voucher = voucher.lower().replace("voucher", "")
    return voucher


def normalize_number(
    text_str: str, reserve_dot=False, reserve_plus=False, reserve_minus=False
):
    """Normalize a string of numbers by removing non-numeric characters."""
    assert isinstance(text_str, str), "input must be str"
    reserved_chars = ""
    if reserve_dot:
        reserved_chars += ".,"
    if reserve_plus:
        reserved_chars += "+"
    if reserve_minus:
        reserved_chars += "-"  # appended last so '-' sits safely at the end of the character class
    regex_formula = "[^0-9{}]".format(reserved_chars)
    normalized_text_str = re.sub(r"{}".format(regex_formula), "", text_str)
    return normalized_text_str
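
# Illustrative usage:
#   >>> normalize_number("Tel: +84 123 456", reserve_plus=True)
#   '+84123456'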


def remove_bullet_points_and_punctuation(text):
    # Remove leading bullet points (e.g., •, -, *).
    text = re.sub(r'^\s*[\•\-\*]\s*', '', text, flags=re.MULTILINE)
    text = text.strip()
    # # Remove end-of-sentence punctuation (e.g., ., !, ?)
    # text = re.sub(r'[.!?]', '', text)
    # Trim a single leading and trailing punctuation mark.
    if len(text) > 0 and text[0] in (',', '.', ':', ';', '?', '!'):
        text = text[1:]
    if len(text) > 0 and text[-1] in (',', '.', ':', ';', '?', '!'):
        text = text[:-1]
    return text.strip()


def split_key_value_by_colon(key: str, value: str) -> tuple:
    """Re-split a concatenated 'key value' string at its first colon; if there
    is no usable colon, return the inputs unchanged."""
    key_string = key if key is not None else ""
    value_string = value if value is not None else ""
    text_string = key_string + " " + value_string

    elements = text_string.split(":")
    # Only re-split when the candidate key holds no digits (a digit suggests
    # the colon belongs to a time or a ratio rather than a label).
    if len(elements) > 1 and not bool(re.search(r'\d', elements[0])):
        return elements[0], text_string[len(elements[0]) + 1:].strip()
    return key, value
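
# Illustrative usage (hypothetical OCR key/value pair; note the returned key
# keeps its trailing space):
#   >>> split_key_value_by_colon("Ngày", ": 15/03/2023")
#   ('Ngày ', '15/03/2023')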


def is_string_in_range(s):
    """Return True when s parses as a single digit (0-9)."""
    try:
        num = int(s)
        return 0 <= num <= 9
    except ValueError:
        return False


def remove_english_words(text):
    """Keep only the tokens that are not in the NLTK English word corpus."""
    _word = [w.lower() for w in nltk.wordpunct_tokenize(text) if w.lower() not in words]
    return ' '.join(_word)


def remove_punctuation(text):
    return text.translate(str.maketrans(" ", " ", string.punctuation))


def remove_accents(input_str, s0, s1):
    """Map accented characters (found in s1) to the plain characters at the
    same positions in s0; pass all other characters through."""
    s = ''
    for c in input_str:
        if c in s1:
            s += s0[s1.index(c)]
        else:
            s += c
    return s


def remove_spaces(text):
    return text.replace(' ', '')


def preprocessing(text: str):
    """Strip punctuation, diacritics, and spaces, then lowercase."""
    # text = remove_english_words(text) if table else text
    text = remove_punctuation(text)
    text = remove_accents(text, s0, s1)
    text = remove_spaces(text)
    return text.lower()
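
# Illustrative usage:
#   >>> preprocessing("Hóa Đơn!")
#   'hoadon'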


def longestCommonSubsequence(text1: str, text2: str) -> int:
    # https://leetcode.com/problems/longest-common-subsequence/discuss/351689/JavaPython-3-Two-DP-codes-of-O(mn)-and-O(min(m-n))-spaces-w-picture-and-analysis
    dp = [[0] * (len(text2) + 1) for _ in range(len(text1) + 1)]
    for i, c in enumerate(text1):
        for j, d in enumerate(text2):
            dp[i + 1][j + 1] = (
                1 + dp[i][j] if c == d else max(dp[i][j + 1], dp[i + 1][j])
            )
    return dp[-1][-1]
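
# Illustrative usage:
#   >>> longestCommonSubsequence("abcde", "ace")
#   3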


def longest_common_subsequence_with_idx(X, Y):
    """
    Dynamic-programming LCS that also tracks where the subsequence sits in X.
    Returns a tuple of three values:
        the length of the LCS,
        the index of the first character of the LCS in the first string,
        and the index of the last character of the LCS in the first string.
    """
    m, n = len(X), len(Y)
    L = [[0 for i in range(n + 1)] for j in range(m + 1)]

    # Build L[m+1][n+1] bottom-up: L[i][j] holds the LCS length of
    # X[0..i-1] and Y[0..j-1].
    right_idx = 0
    max_lcs = 0
    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0 or j == 0:
                L[i][j] = 0
            elif X[i - 1] == Y[j - 1]:
                L[i][j] = L[i - 1][j - 1] + 1
                if L[i][j] > max_lcs:
                    max_lcs = L[i][j]
                    right_idx = i
            else:
                L[i][j] = max(L[i - 1][j], L[i][j - 1])

    # L[m][n] holds the length of the full LCS (a length, not the LCS string).
    lcs = L[i][j]

    # Walk back from the bottom-right corner; i ends just before the first
    # LCS character in X.
    i = m
    j = n
    # right_idx = 0
    while i > 0 and j > 0:
        # If the current characters of X and Y match, they are part of the LCS.
        if X[i - 1] == Y[j - 1]:
            i -= 1
            j -= 1
        # Otherwise follow the larger of the two neighboring subproblems.
        elif L[i - 1][j] > L[i][j - 1]:
            # right_idx = i if not right_idx else right_idx  # the first change in L should be the right index of the lcs
            i -= 1
        else:
            j -= 1
    return lcs, i, max(i + lcs, right_idx)
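
# Illustrative usage (traced by hand; the third value here lands one past the
# last LCS character in the first argument):
#   >>> longest_common_subsequence_with_idx("abcde", "ace")
#   (3, 0, 5)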


def longest_common_substring(X, Y):
    m = len(X)
    n = len(Y)

    # Create a 2D array to store the lengths of common substrings.
    dp = [[0] * (n + 1) for _ in range(m + 1)]

    # Variables to store the length and end position of the longest common substring.
    max_length = 0
    end_index = 0

    # Build the dp array bottom-up.
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if X[i - 1] == Y[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
                # Update the length and ending index of the common substring.
                if dp[i][j] > max_length:
                    max_length = dp[i][j]
                    end_index = i - 1
            else:
                dp[i][j] = 0

    # The longest common substring is X[end_index - max_length + 1:end_index + 1].
    return len(X[end_index - max_length + 1: end_index + 1])
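
# Illustrative usage:
#   >>> longest_common_substring("hoadon123", "don12")
#   5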