import re
import string
from datetime import datetime

import nltk
import tldextract
from dateutil import parser

# Make sure the NLTK "words" corpus is available, downloading it on first use,
# then build a lowercase-insensitive English vocabulary set for filtering.
try:
    nltk.data.find("corpora/words")
except LookupError:
    nltk.download('words')
words = set(nltk.corpus.words.words())

from sdsvkvu.utils.word2line import Word, words_to_lines

# Parallel translation tables for stripping Vietnamese diacritics:
# s1[i] (accented char) maps to s0[i] (its unaccented ASCII counterpart).
s1 = u'ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚÝàáâãèéêìíòóôõùúýĂăĐđĨĩŨũƠơƯưẠạẢảẤấẦầẨẩẪẫẬậẮắẰằẲẳẴẵẶặẸẹẺẻẼẽẾếỀềỂểỄễỆệỈỉỊịỌọỎỏỐốỒồỔổỖỗỘộỚớỜờỞởỠỡỢợỤụỦủỨứỪừỬửỮữỰựỲỳỴỵỶỷỸỹ'
s0 = u'AAAAEEEIIOOOOUUYaaaaeeeiioooouuyAaDdIiUuOoUuAaAaAaAaAaAaAaAaAaAaAaAaEeEeEeEeEeEeEeEeIiIiOoOoOoOoOoOoOoOoOoOoOoOoUuUuUuUuUuUuUuYyYyYyYy'


def get_string(lwords: list):
    """Join words into a single string, dropping consecutive duplicates.

    Single-digit tokens are always kept (repeated digits are meaningful);
    any other token is appended only if not already present in the output.
    """
    unique_list = []
    for item in lwords:
        if item.isdigit() and len(item) == 1:
            unique_list.append(item)
        elif item not in unique_list:
            unique_list.append(item)
    return ' '.join(unique_list)


def get_string_by_deduplicate_bbox(lwords: list, lbboxes: list):
    """Join words into a string, skipping words that share the exact same
    bounding box as the immediately preceding word (OCR duplicates)."""
    unique_list = []
    prev_bbox = [-1, -1, -1, -1]
    for word, bbox in zip(lwords, lbboxes):
        if bbox != prev_bbox:
            unique_list.append(word)
            prev_bbox = bbox
    return ' '.join(unique_list)


def get_string_with_word2line(lwords: list, lbboxes: list):
    """Deduplicate words by bounding box, then re-order them into reading
    order using the word2line line-grouping module.

    Returns the space-joined text in line/word-group order.
    """
    list_words = []
    prev_bbox = [-1, -1, -1, -1]
    for word, bbox in zip(lwords, lbboxes):
        if bbox != prev_bbox:
            prev_bbox = bbox
            list_words.append(
                Word(image=None, text=word, conf_cls=-1, bndbox=bbox, conf_detect=-1)
            )

    # words_to_lines returns (lines, ...); flatten lines -> word groups -> words.
    llines = words_to_lines(list_words)[0]
    list_sorted_words = []
    for line in llines:
        for _word_group in line.list_word_groups:
            for _word in _word_group.list_words:
                list_sorted_words.append(_word.text)

    return ' '.join(list_sorted_words)


def date_regexing(inp_str):
    """Extract Vietnamese day/month/year tokens from a string.

    Looks for the patterns "ngày<digits>", "tháng<digits>", "năm<digits>"
    after removing all spaces and lowercasing.

    Returns:
        (day, month, year) digit strings; any missing part is ''.
    """
    patterns = {
        'ngay': r"ngày\d+",
        'thang': r"tháng\d+",
        'nam': r"năm\d+"
    }
    inp_str = inp_str.replace(" ", "").lower()
    outputs = {k: '' for k in patterns}
    for key, pattern in patterns.items():
        matches = re.findall(pattern, inp_str)
        if len(matches) > 0:
            # Strip the keyword prefix, keep one unique digit group.
            element = set([match[len(key):] for match in matches])
            outputs[key] = list(element)[0]
    return outputs['ngay'], outputs['thang'], outputs['nam']


def parse_date1(date_str):
    """Parse a noisy date string into "YYYY-MM-DD".

    Brackets/dots/commas are replaced by spaces and separators are tightened
    before fuzzy parsing with dateutil. If the year appears after the day in
    the raw text, the string is re-parsed with dayfirst=True.

    Returns:
        (formatted_or_original_string, is_parser_error)
    """
    # Normalize separators: drop bracket/punctuation noise, tighten "/ " and "- ".
    date_str = re.sub(r"[\[\]\{\}\(\)\.\,]", " ", date_str)
    date_str = re.sub(r"/\s+", "/", date_str)
    date_str = re.sub(r"-\s+", "-", date_str)
    is_parser_error = False
    try:
        date_obj = parser.parse(date_str, fuzzy=True)
        year_str = str(date_obj.year)
        day_str = str(date_obj.day)
        date_formated = date_obj.strftime("%Y-%m-%d")
    except Exception as err:
        # Fuzzy parsing failed entirely; return the raw string and flag the error.
        # (A regex-based fallback existed here but was dead code and was removed.)
        print(f"Error parse date: err = {err}, date = {date_str}")
        return date_str, True

    # A 6-digit date (e.g. ddmmyy) carries only a 2-digit year in the text.
    if len(normalize_number(date_str)) == 6:
        year_str = year_str[-2:]
    try:
        year_index = date_str.index(str(year_str))
        day_index = date_str.index(str(day_str))
        if year_index > day_index:
            # Year appears last in the text -> day-first ordering is likely.
            date_obj = parser.parse(date_str, fuzzy=True, dayfirst=True)
            date_formated = date_obj.strftime("%Y-%m-%d")
    except Exception as err:
        print(f"Error check dayfirst: err = {err}, date = {date_str}")
    return date_formated, is_parser_error


def parse_date(date_str):
    """Parse a noisy date string into one or two "YYYY-MM-DD" candidates.

    When day and month are ambiguous (both <= 12) and no month name is
    present, both the parsed date and its day/month-swapped variant are
    returned. Years outside [2010, current year] are clamped to the current
    year. OCR artifact "0ct" is corrected to "oct".

    Returns:
        (list_of_candidate_date_strings, is_parser_error)
    """
    # Normalize separators and common OCR noise.
    date_str = re.sub(r"[\[\]\{\}\(\)\.\,]", " ", date_str)
    date_str = re.sub(r"/\s+", "/", date_str)
    date_str = re.sub(r"-\s+", "-", date_str)
    date_str = re.sub(r"\-+", "-", date_str)
    date_str = date_str.lower().replace("0ct", "oct")
    is_parser_error = False
    try:
        date_obj = parser.parse(date_str, fuzzy=True)
    except Exception as err:
        print(f"1.Error parse date: err = {err}, date = {date_str}")
        try:
            # Retry with only the longest whitespace-separated token
            # (first longest, matching stable reverse sort behavior).
            date_str = max(date_str.split(" "), key=len)
            date_obj = parser.parse(date_str, fuzzy=True)
        except Exception as err:
            print(f"2.Error parse date: err = {err}, date = {date_str}")
            is_parser_error = True
            return [date_str], is_parser_error

    year_str = int(date_obj.year)
    month_str = int(date_obj.month)
    day_str = int(date_obj.day)
    current_year = int(datetime.now().year)
    if year_str > current_year or year_str < 2010:  # implausible year -> use current
        date_obj = date_obj.replace(year=current_year)

    formated_date = date_obj.strftime("%Y-%m-%d")
    revert_formated_date = date_obj.strftime("%Y-%d-%m")
    # An explicit month name removes the day/month ambiguity.
    if any(txt in date_str for txt in ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                                       'jul', 'aug', 'sep', 'oct', 'nov', 'dec']):
        return [formated_date], is_parser_error
    if month_str <= 12 and day_str <= 12:
        # Ambiguous: return both interpretations.
        return [formated_date, revert_formated_date], is_parser_error
    return [formated_date], is_parser_error


def normalize_imei(imei):
    """Strip spaces from an IMEI and keep only the part before the first '/'."""
    imei = imei.replace(" ", "")
    imei = imei.split("/")[0]
    return imei


def normalize_seller(seller):
    """Pass-through placeholder for seller normalization."""
    return seller


def normalize_website(website):
    """Reduce a website string to its bare domain (e.g. 'shop' from
    'www.shop.com/page'); non-strings are returned unchanged."""
    if isinstance(website, str):
        website = website.lower()
        website = website.split(".com")[0]
        website = tldextract.extract(website).domain
    return website


def normalize_hotline(hotline):
    """Remove the literal 'hotline' prefix/label from a hotline string."""
    if isinstance(hotline, str):
        hotline = hotline.lower().replace("hotline", "")
    return hotline


def normalize_voucher(voucher):
    """Remove the literal 'voucher' label from a voucher string."""
    if isinstance(voucher, str):
        voucher = voucher.lower().replace("voucher", "")
    return voucher


def normalize_number(
    text_str: str,
    reserve_dot=False,
    reserve_plus=False,
    reserve_minus=False
):
    """Normalize a string of numbers by removing non-numeric characters.

    Args:
        text_str: input string (must be str).
        reserve_dot: also keep '.' and ','.
        reserve_plus: also keep '+'.
        reserve_minus: also keep '-'.
    """
    assert isinstance(text_str, str), "input must be str"
    reserver_chars = ""
    if reserve_dot:
        reserver_chars += ".,"
    if reserve_plus:
        reserver_chars += "+"
    if reserve_minus:
        reserver_chars += "-"
    # '-' is appended last so it sits at the end of the character class.
    regex_fomula = "[^0-9{}]".format(reserver_chars)
    normalized_text_str = re.sub(r"{}".format(regex_fomula), "", text_str)
    return normalized_text_str


def remove_bullet_points_and_punctuation(text):
    """Strip leading bullet markers and a single leading/trailing
    punctuation character, then trim whitespace."""
    # Remove bullet points (e.g., •, - or *) at line starts.
    text = re.sub(r'^\s*[\•\-\*]\s*', '', text, flags=re.MULTILINE)
    text = text.strip()
    if len(text) > 0 and text[0] in (',', '.', ':', ';', '?', '!'):
        text = text[1:]
    if len(text) > 0 and text[-1] in (',', '.', ':', ';', '?', '!'):
        text = text[:-1]
    return text.strip()


def split_key_value_by_colon(key: str, value: str) -> tuple:
    """Re-split a (key, value) pair on the first ':' of their concatenation.

    If the text before the first colon contains no digit, it becomes the new
    key and the remainder the new value; otherwise the pair is returned
    unchanged. Either input may be None (treated as '').
    """
    key_string = key if key is not None else ""
    value_string = value if value is not None else ""
    text_string = key_string + " " + value_string
    elements = text_string.split(":")
    if len(elements) > 1 and not bool(re.search(r'\d', elements[0])):
        return elements[0], text_string[len(elements[0]) + 1:].strip()
    return key, value


def is_string_in_range(s):
    """Return True if s parses as an integer between 0 and 9 inclusive."""
    try:
        num = int(s)
        return 0 <= num <= 9
    except ValueError:
        return False


def remove_english_words(text):
    """Drop tokens that are English dictionary words (per the NLTK corpus);
    remaining tokens are lowercased and re-joined with spaces."""
    _word = [w.lower() for w in nltk.wordpunct_tokenize(text)
             if w.lower() not in words]
    return ' '.join(_word)


def remove_punctuation(text):
    """Delete all ASCII punctuation characters from text."""
    return text.translate(str.maketrans(" ", " ", string.punctuation))


def remove_accents(input_str, s0, s1):
    """Replace each accented character (found in s1) with the unaccented
    character at the same index in s0; other characters pass through."""
    s = ''
    for c in input_str:
        if c in s1:
            s += s0[s1.index(c)]
        else:
            s += c
    return s


def remove_spaces(text):
    """Delete all space characters from text."""
    return text.replace(' ', '')


def preprocessing(text: str):
    """Canonicalize text for fuzzy comparison: strip punctuation, strip
    Vietnamese accents, remove spaces, and lowercase."""
    text = remove_punctuation(text)
    text = remove_accents(text, s0, s1)
    text = remove_spaces(text)
    return text.lower()


def longestCommonSubsequence(text1: str, text2: str) -> int:
    """Return the length of the longest common subsequence of two strings.

    Classic O(m*n) dynamic programming; see
    https://leetcode.com/problems/longest-common-subsequence/discuss/351689
    """
    dp = [[0] * (len(text2) + 1) for _ in range(len(text1) + 1)]
    for i, c in enumerate(text1):
        for j, d in enumerate(text2):
            if c == d:
                dp[i + 1][j + 1] = 1 + dp[i][j]
            else:
                dp[i + 1][j + 1] = max(dp[i][j + 1], dp[i + 1][j])
    return dp[-1][-1]


def longest_common_subsequence_with_idx(X, Y):
    """
    This implementation uses dynamic programming to calculate the length of
    the LCS, and uses a path array to keep track of the characters in the LCS.
    Takes two strings as input and returns a tuple with three values: the
    length of the LCS, the index of the first character of the LCS in the
    first string, and the index of the last character of the LCS in the
    first string.
    """
    m, n = len(X), len(Y)
    L = [[0 for i in range(n + 1)] for j in range(m + 1)]

    # Build L[m+1][n+1] bottom-up; L[i][j] holds the LCS length of
    # X[0..i-1] and Y[0..j-1]. Track the right-most X index where the
    # maximum LCS length is first reached.
    right_idx = 0
    max_lcs = 0
    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0 or j == 0:
                L[i][j] = 0
            elif X[i - 1] == Y[j - 1]:
                L[i][j] = L[i - 1][j - 1] + 1
                if L[i][j] > max_lcs:
                    max_lcs = L[i][j]
                    right_idx = i
            else:
                L[i][j] = max(L[i - 1][j], L[i][j - 1])

    # Full-table LCS length (loop variables end at i == m, j == n).
    lcs = L[m][n]

    # Walk back from the bottom-right corner; on exit, i is the index in X
    # just before the first matched character.
    i, j = m, n
    while i > 0 and j > 0:
        if X[i - 1] == Y[j - 1]:
            # Current character is part of the LCS.
            i -= 1
            j -= 1
        elif L[i - 1][j] > L[i][j - 1]:
            i -= 1
        else:
            j -= 1

    return lcs, i, max(i + lcs, right_idx)


def longest_common_substring(X, Y):
    """Return the length of the longest common contiguous substring of X and Y
    using O(m*n) dynamic programming."""
    m = len(X)
    n = len(Y)
    # dp[i][j] = length of the common suffix of X[:i] and Y[:j].
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    max_length = 0
    end_index = 0  # index in X where the best substring ends
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if X[i - 1] == Y[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
                if dp[i][j] > max_length:
                    max_length = dp[i][j]
                    end_index = i - 1
            else:
                dp[i][j] = 0
    # The substring itself is X[end_index - max_length + 1 : end_index + 1];
    # its length is exactly max_length.
    return max_length