sbt-idp/cope2n-ai-fi/common/AnyKey_Value/word_preprocess.py
2023-11-30 18:22:16 +07:00

225 lines
8.9 KiB
Python
Executable File

import nltk
import re
import string
import copy
from utils.kvu_dictionary import vat_dictionary, ap_dictionary, DKVU2XML
# Download the English word corpus at import time (network side effect) and
# cache the vocabulary set consulted by remove_english_words().
nltk.download('words')
words = set(nltk.corpus.words.words())
# Parallel accent-translation strings: s1[i] (Vietnamese accented character)
# is replaced by s0[i] (its unaccented ASCII base) in remove_accents().
s1 = u'ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚÝàáâãèéêìíòóôõùúýĂăĐđĨĩŨũƠơƯưẠạẢảẤấẦầẨẩẪẫẬậẮắẰằẲẳẴẵẶặẸẹẺẻẼẽẾếỀềỂểỄễỆệỈỉỊịỌọỎỏỐốỒồỔổỖỗỘộỚớỜờỞởỠỡỢợỤụỦủỨứỪừỬửỮữỰựỲỳỴỵỶỷỸỹ'
s0 = u'AAAAEEEIIOOOOUUYaaaaeeeiioooouuyAaDdIiUuOoUuAaAaAaAaAaAaAaAaAaAaAaAaEeEeEeEeEeEeEeEeIiIiOoOoOoOoOoOoOoOoOoOoOoOoUuUuUuUuUuUuUuYyYyYyYy'
# def clean_text(text):
# return re.sub(r"[^A-Za-z(),!?\'\`]", " ", text)
def get_string(lwords: list):
    """Join tokens with single spaces, dropping repeated tokens.

    Single-digit tokens are always kept, even when repeated; every other
    token is kept only on its first occurrence (original order preserved).
    """
    kept = []
    for token in lwords:
        single_digit = token.isdigit() and len(token) == 1
        if single_digit or token not in kept:
            kept.append(token)
    return ' '.join(kept)
def remove_english_words(text):
    """Drop tokens recognized as English dictionary words (module-level `words`)."""
    non_english = (tok.lower() for tok in nltk.wordpunct_tokenize(text)
                   if tok.lower() not in words)
    return ' '.join(non_english)
def remove_punctuation(text):
    """Strip every ASCII punctuation character from *text*."""
    # The " " -> " " mapping is a no-op; the third argument deletes punctuation.
    table = str.maketrans(" ", " ", string.punctuation)
    return text.translate(table)
def remove_accents(input_str, s0, s1):
    """Replace each character found in s1 with the character at the same index in s0.

    Characters not present in s1 pass through unchanged.
    """
    out = []
    for ch in input_str:
        pos = s1.find(ch)
        out.append(s0[pos] if pos != -1 else ch)
    return ''.join(out)
def remove_spaces(text):
    """Delete every ASCII space character (other whitespace is kept)."""
    return ''.join(text.split(' '))
def preprocessing(text: str):
    """Normalize a label: strip punctuation, strip accents, drop spaces, lowercase."""
    no_punct = remove_punctuation(text)
    no_accents = remove_accents(no_punct, s0, s1)
    compact = remove_spaces(no_accents)
    return compact.lower()
def vat_standardize_outputs(vat_outputs: dict) -> dict:
    """Rename KVU output keys to their XML names via the DKVU2XML mapping.

    The special "table" entry is a list of row dicts; each row's keys are
    renamed through the same mapping.
    """
    outputs = {}
    for key, value in vat_outputs.items():
        if key == "table":
            outputs['table'] = [
                {DKVU2XML[row_key]: row_value for row_key, row_value in row.items()}
                for row in value
            ]
        else:
            outputs[DKVU2XML[key]] = value
    return outputs
def vat_standardizer(text: str, threshold: float, header: bool):
    """Map a raw OCR label to a canonical VAT-invoice field name.

    Args:
        text: raw label text from the document.
        threshold: minimum LCS-ratio score required to accept a fuzzy match.
        header: passed to vat_dictionary() to select the alias dictionary.

    Returns:
        (key, score, processed_text): key is the canonical field name (or the
        original text when no match clears the threshold); score is 5/8/10 for
        the special/exact matches below, otherwise the best LCS ratio;
        processed_text is the normalized form of *text*.
    """
    dictionary = vat_dictionary(header)
    processed_text = preprocessing(text)

    # Day/month/year fragments all map to the invoice-date field.
    # Guard against empty text: "" is a substring of every candidate.
    for candidates in [('ngayday', 'ngaydate', 'ngay', 'day'),
                       ('thangmonth', 'thang', 'month'),
                       ('namyear', 'nam', 'year')]:
        if processed_text and any(processed_text in txt for txt in candidates):
            processed_text = candidates[-1]
            return "Ngày, tháng, năm lập hóa đơn", 5, processed_text

    # Extra exact-match aliases only apply outside the table header.
    _dictionary = copy.deepcopy(dictionary)
    if not header:
        exact_dictionary = {
            'Số hóa đơn': ['sono', 'so'],
            'Mã số thuế người bán': ['mst'],
            'Tên người bán': ['kyboi'],
            'Ngày, tháng, năm lập hóa đơn': ['kyngay', 'kyngaydate']
        }
        for k in exact_dictionary:
            _dictionary[k] = dictionary[k] + exact_dictionary[k]

    for k in dictionary:
        # Prioritize complete matches.
        # BUGFIX: the original `k in ('Tên người bán')` is membership in a
        # single-element *string* (missing comma), i.e. a substring test, not
        # an equality test. Same fix applied to the date key below.
        if k == 'Tên người bán' and processed_text == "kyboi":
            return k, 8, processed_text
        if processed_text in _dictionary[k]:
            return k, 10, processed_text

    # Fuzzy match: best LCS ratio over each key's aliases.
    scores = {k: 0.0 for k in dictionary}
    for k in dictionary:
        if k == "Ngày, tháng, năm lập hóa đơn":  # already handled above
            continue
        scores[k] = max(longestCommonSubsequence(processed_text, key) / len(key)
                        for key in dictionary[k])
    key, score = max(scores.items(), key=lambda x: x[1])
    return key if score > threshold else text, score, processed_text
def ap_standardizer(text: str, threshold: float, header: bool):
    """Map a raw OCR label to a canonical AP field name.

    Same contract as vat_standardizer: returns (key, score, processed_text),
    where key falls back to the original text when the best fuzzy score is
    below *threshold*.
    """
    dictionary = ap_dictionary(header)
    processed_text = preprocessing(text)

    # Extend the alias lists with a few extra exact-match spellings.
    _dictionary = copy.deepcopy(dictionary)
    if header:
        _dictionary['modelnumber'] = dictionary['modelnumber'] + ['sku', 'sn', 'imei']
        _dictionary['qty'] = dictionary['qty'] + ['qty']
    else:
        _dictionary['serial_number'] = dictionary['serial_number'] + ['sn']
        _dictionary['imei_number'] = dictionary['imei_number'] + ['imel']

    # Exact alias match wins outright.
    for k in dictionary:
        if processed_text in _dictionary[k]:
            return k, 10, processed_text

    # Otherwise fall back to fuzzy LCS-ratio scoring over the base aliases.
    scores = {
        k: max(longestCommonSubsequence(processed_text, key) / len(key)
               for key in dictionary[k])
        for k in dictionary
    }
    key, score = max(scores.items(), key=lambda x: x[1])
    return key if score >= threshold else text, score, processed_text
def convert_format_number(s: str) -> float:
s = s.replace(' ', '').replace('O', '0').replace('o', '0')
if s.endswith(",00") or s.endswith(".00"):
s = s[:-3]
if all([delimiter in s for delimiter in [',', '.']]):
s = s.replace('.', '').split(',')
remain_value = s[1].split('0')[0]
return int(s[0]) + int(remain_value) * 1 / (10**len(remain_value))
else:
s = s.replace(',', '').replace('.', '')
return int(s)
def post_process_for_item(item: dict) -> dict:
    """Reconstruct one missing numeric field of a line item from the other two.

    The fields quantity ('Số lượng'), unit price ('Đơn giá') and pre-tax amount
    ('Doanh số mua chưa có thuế') satisfy amount = qty * price. When exactly
    one of them is missing (None or '0'), it is recomputed and written back
    into *item* as a string; parse failures are logged and ignored.
    """
    qty_key, price_key, amount_key = 'Số lượng', 'Đơn giá', 'Doanh số mua chưa có thuế'
    missing = [k for k in (qty_key, price_key, amount_key) if item[k] in (None, '0')]
    if len(missing) != 1:
        return item
    target = missing[0]
    try:
        if target == qty_key and convert_format_number(item[price_key]) != 0:
            item[target] = str(round(convert_format_number(item[amount_key])
                                     / convert_format_number(item[price_key])))
        elif target == price_key and convert_format_number(item[qty_key]) != 0:
            item[target] = str(convert_format_number(item[amount_key])
                               / convert_format_number(item[qty_key]))
        elif target == amount_key:
            item[target] = str(convert_format_number(item[qty_key])
                               * convert_format_number(item[price_key]))
    except Exception as e:
        print("Cannot post process this item with error:", e)
    return item
def longestCommonSubsequence(text1: str, text2: str) -> int:
    """Return the length of the longest common subsequence of text1 and text2.

    Standard O(m*n) dynamic programming over an (m+1) x (n+1) table, where
    dp[r][c] is the LCS length of text1[:r] and text2[:c].
    """
    rows, cols = len(text1), len(text2)
    dp = [[0] * (cols + 1) for _ in range(rows + 1)]
    for r in range(1, rows + 1):
        for c in range(1, cols + 1):
            if text1[r - 1] == text2[c - 1]:
                dp[r][c] = dp[r - 1][c - 1] + 1
            else:
                dp[r][c] = max(dp[r - 1][c], dp[r][c - 1])
    return dp[rows][cols]
def longest_common_subsequence_with_idx(X, Y):
    """
    This implementation uses dynamic programming to calculate the length of the LCS, and uses a path array to keep track of the characters in the LCS.
    The longest_common_subsequence function takes two strings as input, and returns a tuple with three values:
    the length of the LCS,
    the index of the first character of the LCS in the first string,
    and the index of the last character of the LCS in the first string.
    """
    m, n = len(X), len(Y)
    L = [[0 for i in range(n + 1)] for j in range(m + 1)]
    # Following steps build L[m+1][n+1] in bottom up fashion. Note
    # that L[i][j] contains length of LCS of X[0..i-1] and Y[0..j-1]
    right_idx = 0
    max_lcs = 0
    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0 or j == 0:
                L[i][j] = 0
            elif X[i - 1] == Y[j - 1]:
                L[i][j] = L[i - 1][j - 1] + 1
                if L[i][j] > max_lcs:
                    max_lcs = L[i][j]
                    # Remember the X index (1-based) where the best match so
                    # far was extended — candidate right edge of the LCS in X.
                    right_idx = i
            else:
                L[i][j] = max(L[i - 1][j], L[i][j - 1])
    # Create a string variable to store the lcs string
    # NOTE(review): despite the comment above, this is the LCS *length*
    # L[m][n] — it relies on i == m and j == n leaking out of the loops.
    lcs = L[i][j]
    # Start from the right-most-bottom-most corner and
    # one by one store characters in lcs[]
    i = m
    j = n
    # right_idx = 0
    while i > 0 and j > 0:
        # If current character in X[] and Y are same, then
        # current character is part of LCS
        if X[i - 1] == Y[j - 1]:
            i -= 1
            j -= 1
        # If not same, then find the larger of two and
        # go in the direction of larger value
        elif L[i - 1][j] > L[i][j - 1]:
            # right_idx = i if not right_idx else right_idx #the first change in L should be the right index of the lcs
            i -= 1
        else:
            j -= 1
    # After backtracking, i is (per the docstring) the start index of the LCS
    # in X; the returned right edge is at least i + lcs — TODO confirm that
    # max(i + lcs, right_idx) matches callers' expectations for sparse matches.
    return lcs, i, max(i + lcs, right_idx)