import re
import string
from datetime import datetime

import nltk
import tldextract
from dateutil import parser

# Make sure the NLTK "words" corpus is available, downloading it on first use,
# then build a lowercase-insensitive English vocabulary set for filtering.
try:
    nltk.data.find("corpora/words")
except LookupError:
    nltk.download('words')
words = set(nltk.corpus.words.words())

from sdsvkvu.utils.word2line import Word, words_to_lines

# Parallel translation tables for stripping Vietnamese diacritics:
# s1[i] (accented char) maps to s0[i] (its unaccented ASCII counterpart).
s1 = u'ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚÝàáâãèéêìíòóôõùúýĂăĐđĨĩŨũƠơƯưẠạẢảẤấẦầẨẩẪẫẬậẮắẰằẲẳẴẵẶặẸẹẺẻẼẽẾếỀềỂểỄễỆệỈỉỊịỌọỎỏỐốỒồỔổỖỗỘộỚớỜờỞởỠỡỢợỤụỦủỨứỪừỬửỮữỰựỲỳỴỵỶỷỸỹ'
s0 = u'AAAAEEEIIOOOOUUYaaaaeeeiioooouuyAaDdIiUuOoUuAaAaAaAaAaAaAaAaAaAaAaAaEeEeEeEeEeEeEeEeIiIiOoOoOoOoOoOoOoOoOoOoOoOoUuUuUuUuUuUuUuYyYyYyYy'


def get_string(lwords: list):
    """Join words into a single string, dropping consecutive duplicates.

    Single-digit tokens are always kept (repeated digits are meaningful);
    any other token is appended only if not already present in the output.
    """
    unique_list = []
    for item in lwords:
        if item.isdigit() and len(item) == 1:
            unique_list.append(item)
        elif item not in unique_list:
            unique_list.append(item)
    return ' '.join(unique_list)


def get_string_by_deduplicate_bbox(lwords: list, lbboxes: list):
    """Join words into a string, skipping words that share the exact same
    bounding box as the immediately preceding word (OCR duplicates)."""
    unique_list = []
    prev_bbox = [-1, -1, -1, -1]
    for word, bbox in zip(lwords, lbboxes):
        if bbox != prev_bbox:
            unique_list.append(word)
            prev_bbox = bbox
    return ' '.join(unique_list)


def get_string_with_word2line(lwords: list, lbboxes: list):
    """Deduplicate words by bounding box, then re-order them into reading
    order using the word2line line-grouping module.

    Returns the space-joined text in line/word-group order.
    """
    list_words = []
    prev_bbox = [-1, -1, -1, -1]
    for word, bbox in zip(lwords, lbboxes):
        if bbox != prev_bbox:
            prev_bbox = bbox
            list_words.append(
                Word(image=None, text=word, conf_cls=-1, bndbox=bbox, conf_detect=-1)
            )

    # words_to_lines returns (lines, ...); flatten lines -> word groups -> words.
    llines = words_to_lines(list_words)[0]
    list_sorted_words = []
    for line in llines:
        for _word_group in line.list_word_groups:
            for _word in _word_group.list_words:
                list_sorted_words.append(_word.text)

    return ' '.join(list_sorted_words)


def date_regexing(inp_str):
    """Extract Vietnamese day/month/year tokens from a string.

    Looks for the patterns "ngày<digits>", "tháng<digits>", "năm<digits>"
    after removing all spaces and lowercasing.

    Returns:
        (day, month, year) digit strings; any missing part is ''.
    """
    patterns = {
        'ngay': r"ngày\d+",
        'thang': r"tháng\d+",
        'nam': r"năm\d+"
    }
    inp_str = inp_str.replace(" ", "").lower()
    outputs = {k: '' for k in patterns}
    for key, pattern in patterns.items():
        matches = re.findall(pattern, inp_str)
        if len(matches) > 0:
            # Strip the keyword prefix, keep one unique digit group.
            element = set([match[len(key):] for match in matches])
            outputs[key] = list(element)[0]
    return outputs['ngay'], outputs['thang'], outputs['nam']


def parse_date1(date_str):
    """Parse a noisy date string into "YYYY-MM-DD".

    Brackets/dots/commas are replaced by spaces and separators are tightened
    before fuzzy parsing with dateutil. If the year appears after the day in
    the raw text, the string is re-parsed with dayfirst=True.

    Returns:
        (formatted_or_original_string, is_parser_error)
    """
    # Normalize separators: drop bracket/punctuation noise, tighten "/ " and "- ".
    date_str = re.sub(r"[\[\]\{\}\(\)\.\,]", " ", date_str)
    date_str = re.sub(r"/\s+", "/", date_str)
    date_str = re.sub(r"-\s+", "-", date_str)
    is_parser_error = False
    try:
        date_obj = parser.parse(date_str, fuzzy=True)
        year_str = str(date_obj.year)
        day_str = str(date_obj.day)
        date_formated = date_obj.strftime("%Y-%m-%d")
    except Exception as err:
        # Fuzzy parsing failed entirely; return the raw string and flag the error.
        # (A regex-based fallback existed here but was dead code and was removed.)
        print(f"Error parse date: err = {err}, date = {date_str}")
        return date_str, True

    # A 6-digit date (e.g. ddmmyy) carries only a 2-digit year in the text.
    if len(normalize_number(date_str)) == 6:
        year_str = year_str[-2:]
    try:
        year_index = date_str.index(str(year_str))
        day_index = date_str.index(str(day_str))
        if year_index > day_index:
            # Year appears last in the text -> day-first ordering is likely.
            date_obj = parser.parse(date_str, fuzzy=True, dayfirst=True)
            date_formated = date_obj.strftime("%Y-%m-%d")
    except Exception as err:
        print(f"Error check dayfirst: err = {err}, date = {date_str}")
    return date_formated, is_parser_error


def parse_date(date_str):
    """Parse a noisy date string into one or two "YYYY-MM-DD" candidates.

    When day and month are ambiguous (both <= 12) and no month name is
    present, both the parsed date and its day/month-swapped variant are
    returned. Years outside [2010, current year] are clamped to the current
    year. OCR artifact "0ct" is corrected to "oct".

    Returns:
        (list_of_candidate_date_strings, is_parser_error)
    """
    # Normalize separators and common OCR noise.
    date_str = re.sub(r"[\[\]\{\}\(\)\.\,]", " ", date_str)
    date_str = re.sub(r"/\s+", "/", date_str)
    date_str = re.sub(r"-\s+", "-", date_str)
    date_str = re.sub(r"\-+", "-", date_str)
    date_str = date_str.lower().replace("0ct", "oct")
    is_parser_error = False
    try:
        date_obj = parser.parse(date_str, fuzzy=True)
    except Exception as err:
        print(f"1.Error parse date: err = {err}, date = {date_str}")
        try:
            # Retry with only the longest whitespace-separated token
            # (first longest, matching stable reverse sort behavior).
            date_str = max(date_str.split(" "), key=len)
            date_obj = parser.parse(date_str, fuzzy=True)
        except Exception as err:
            print(f"2.Error parse date: err = {err}, date = {date_str}")
            is_parser_error = True
            return [date_str], is_parser_error

    year_str = int(date_obj.year)
    month_str = int(date_obj.month)
    day_str = int(date_obj.day)
    current_year = int(datetime.now().year)
    if year_str > current_year or year_str < 2010:  # implausible year -> use current
        date_obj = date_obj.replace(year=current_year)

    formated_date = date_obj.strftime("%Y-%m-%d")
    revert_formated_date = date_obj.strftime("%Y-%d-%m")
    # An explicit month name removes the day/month ambiguity.
    if any(txt in date_str for txt in ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                                       'jul', 'aug', 'sep', 'oct', 'nov', 'dec']):
        return [formated_date], is_parser_error
    if month_str <= 12 and day_str <= 12:
        # Ambiguous: return both interpretations.
        return [formated_date, revert_formated_date], is_parser_error
    return [formated_date], is_parser_error


def normalize_imei(imei):
    """Strip spaces from an IMEI and keep only the part before the first '/'."""
    imei = imei.replace(" ", "")
    imei = imei.split("/")[0]
    return imei


def normalize_seller(seller):
    """Pass-through placeholder for seller normalization."""
    return seller


def normalize_website(website):
    """Reduce a website string to its bare domain (e.g. 'shop' from
    'www.shop.com/page'); non-strings are returned unchanged."""
    if isinstance(website, str):
        website = website.lower()
        website = website.split(".com")[0]
        website = tldextract.extract(website).domain
    return website


def normalize_hotline(hotline):
    """Remove the literal 'hotline' prefix/label from a hotline string."""
    if isinstance(hotline, str):
        hotline = hotline.lower().replace("hotline", "")
    return hotline


def normalize_voucher(voucher):
    """Remove the literal 'voucher' label from a voucher string."""
    if isinstance(voucher, str):
        voucher = voucher.lower().replace("voucher", "")
    return voucher


def normalize_number(
    text_str: str,
    reserve_dot=False,
    reserve_plus=False,
    reserve_minus=False
):
    """Normalize a string of numbers by removing non-numeric characters.

    Args:
        text_str: input string (must be str).
        reserve_dot: also keep '.' and ','.
        reserve_plus: also keep '+'.
        reserve_minus: also keep '-'.
    """
    assert isinstance(text_str, str), "input must be str"
    reserver_chars = ""
    if reserve_dot:
        reserver_chars += ".,"
    if reserve_plus:
        reserver_chars += "+"
    if reserve_minus:
        reserver_chars += "-"
    # '-' is appended last so it sits at the end of the character class.
    regex_fomula = "[^0-9{}]".format(reserver_chars)
    normalized_text_str = re.sub(r"{}".format(regex_fomula), "", text_str)
    return normalized_text_str


def remove_bullet_points_and_punctuation(text):
    """Strip leading bullet markers and a single leading/trailing
    punctuation character, then trim whitespace."""
    # Remove bullet points (e.g., •, - or *) at line starts.
    text = re.sub(r'^\s*[\•\-\*]\s*', '', text, flags=re.MULTILINE)
    text = text.strip()
    if len(text) > 0 and text[0] in (',', '.', ':', ';', '?', '!'):
        text = text[1:]
    if len(text) > 0 and text[-1] in (',', '.', ':', ';', '?', '!'):
        text = text[:-1]
    return text.strip()


def split_key_value_by_colon(key: str, value: str) -> tuple:
    """Re-split a (key, value) pair on the first ':' of their concatenation.

    If the text before the first colon contains no digit, it becomes the new
    key and the remainder the new value; otherwise the pair is returned
    unchanged. Either input may be None (treated as '').
    """
    key_string = key if key is not None else ""
    value_string = value if value is not None else ""
    text_string = key_string + " " + value_string
    elements = text_string.split(":")
    if len(elements) > 1 and not bool(re.search(r'\d', elements[0])):
        return elements[0], text_string[len(elements[0]) + 1:].strip()
    return key, value


def is_string_in_range(s):
    """Return True if s parses as an integer between 0 and 9 inclusive."""
    try:
        num = int(s)
        return 0 <= num <= 9
    except ValueError:
        return False


def remove_english_words(text):
    """Drop tokens that are English dictionary words (per the NLTK corpus);
    remaining tokens are lowercased and re-joined with spaces."""
    _word = [w.lower() for w in nltk.wordpunct_tokenize(text)
             if w.lower() not in words]
    return ' '.join(_word)


def remove_punctuation(text):
    """Delete all ASCII punctuation characters from text."""
    return text.translate(str.maketrans(" ", " ", string.punctuation))


def remove_accents(input_str, s0, s1):
    """Replace each accented character (found in s1) with the unaccented
    character at the same index in s0; other characters pass through."""
    s = ''
    for c in input_str:
        if c in s1:
            s += s0[s1.index(c)]
        else:
            s += c
    return s


def remove_spaces(text):
    """Delete all space characters from text."""
    return text.replace(' ', '')


def preprocessing(text: str):
    """Canonicalize text for fuzzy comparison: strip punctuation, strip
    Vietnamese accents, remove spaces, and lowercase."""
    text = remove_punctuation(text)
    text = remove_accents(text, s0, s1)
    text = remove_spaces(text)
    return text.lower()


def longestCommonSubsequence(text1: str, text2: str) -> int:
    """Return the length of the longest common subsequence of two strings.

    Classic O(m*n) dynamic programming; see
    https://leetcode.com/problems/longest-common-subsequence/discuss/351689
    """
    dp = [[0] * (len(text2) + 1) for _ in range(len(text1) + 1)]
    for i, c in enumerate(text1):
        for j, d in enumerate(text2):
            if c == d:
                dp[i + 1][j + 1] = 1 + dp[i][j]
            else:
                dp[i + 1][j + 1] = max(dp[i][j + 1], dp[i + 1][j])
    return dp[-1][-1]


def longest_common_subsequence_with_idx(X, Y):
    """
    This implementation uses dynamic programming to calculate the length of
    the LCS, and uses a path array to keep track of the characters in the LCS.
    Takes two strings as input and returns a tuple with three values: the
    length of the LCS, the index of the first character of the LCS in the
    first string, and the index of the last character of the LCS in the
    first string.
    """
    m, n = len(X), len(Y)
    L = [[0 for i in range(n + 1)] for j in range(m + 1)]

    # Build L[m+1][n+1] bottom-up; L[i][j] holds the LCS length of
    # X[0..i-1] and Y[0..j-1]. Track the right-most X index where the
    # maximum LCS length is first reached.
    right_idx = 0
    max_lcs = 0
    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0 or j == 0:
                L[i][j] = 0
            elif X[i - 1] == Y[j - 1]:
                L[i][j] = L[i - 1][j - 1] + 1
                if L[i][j] > max_lcs:
                    max_lcs = L[i][j]
                    right_idx = i
            else:
                L[i][j] = max(L[i - 1][j], L[i][j - 1])

    # Full-table LCS length (loop variables end at i == m, j == n).
    lcs = L[m][n]

    # Walk back from the bottom-right corner; on exit, i is the index in X
    # just before the first matched character.
    i, j = m, n
    while i > 0 and j > 0:
        if X[i - 1] == Y[j - 1]:
            # Current character is part of the LCS.
            i -= 1
            j -= 1
        elif L[i - 1][j] > L[i][j - 1]:
            i -= 1
        else:
            j -= 1

    return lcs, i, max(i + lcs, right_idx)


def longest_common_substring(X, Y):
    """Return the length of the longest common contiguous substring of X and Y
    using O(m*n) dynamic programming."""
    m = len(X)
    n = len(Y)
    # dp[i][j] = length of the common suffix of X[:i] and Y[:j].
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    max_length = 0
    end_index = 0  # index in X where the best substring ends
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if X[i - 1] == Y[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
                if dp[i][j] > max_length:
                    max_length = dp[i][j]
                    end_index = i - 1
            else:
                dp[i][j] = 0
    # The substring itself is X[end_index - max_length + 1 : end_index + 1];
    # its length is exactly max_length.
    return max_length