sbt-idp/cope2n-ai-fi/common/post_processing_datetime.py

import re
from datetime import datetime
from sklearn.metrics import classification_report
from common.utils.utils import read_json
from underthesea import word_tokenize


class DatetimeCorrector:
    @staticmethod
    def verify_and_convert_date(date_str):
        # Try to parse the date string using the datetime module
        try:
            date = datetime.strptime(date_str, "%d/%m/%Y")  # TODO: fix this
        except ValueError:
            # If the date string is not in a valTid format, return False
            return ""

        # If the date string is in the correct format, check if it is already in the "dd/mm/yyyy" format
        if date_str[:2] == "dd" and date_str[3:5] == "mm" and date_str[6:] == "yyyy":
            # If the date string is already in the correct format, return it as is
            return date_str
        else:
            # If the date string is not in the correct format, use the strftime method to convert it
            return date.strftime("%d/%m/%Y")

    @staticmethod
    def get_date_from_date_string_by_prefix(date_string_, prefix_):
        prefix = prefix_.lower()
        date_string = date_string_.lower()
        if prefix in date_string:
            try:
                if prefix == "năm":
                    match = re.split(
                        r"năm[^\d]*(\d{4}|\d{1}[\s.]*\d{3}|\d{3}[\s.]*\d{1}|\d{2}[\s.]*\d{2}|\d{2}[\s.]*\d{1}[\s.]*\d{1}|\d{1}[\s.]*\d{2}[\s.]*\d{1}|\d{1}[\s.]*\d{1}[\s.]*\d{2}|\d{1}[\s.]*\d{1}[\s.]*\d{1}[\s.]*\d{1})[\s.]*\b",
                        date_string)  # match "năm" following with all combination of 4 numbers and whitespace/dot such as 1111; 111.1; 111 1; 11 2 1, 2 2 2.2; ...
                elif prefix == "ngày":
                    match = re.split(r"ngày[^\d]*(\d{2}|\d{1}[\s.]*\d{1}|\d{1})[\s.]*\b", date_string)
                else:
                    match = re.split(r"tháng[^\d]*(\d{2}|\d{1}[\s.]*\d{1}|\d{1})[\s.]*\b", date_string)
                num = match[1]
                remain_string = match[2] if prefix != "năm" else match[0]
                return num, remain_string
            except:
                return "", date_string_
        else:
            return '', date_string_

    @staticmethod
    def get_date_by_pattern(date_string):
        match = re.findall(r"([^\d\s]+)?\s*(\d{1}\s*\d?\s+|\d{2}\s+|\d+\s*\b)", date_string)
        if not match:
            return ""
        if len(match) > 3:
            day = match[0][-1].replace(" ", "")
            year = match[-1][-1].replace(" ", "")
            # since in the VIETNAMESE DRIVER LICENSE, the tháng/month is behind the stamp and can be recognized as any thing => mistạken number may be in range (1->-3) => choose month to be -2
            month = match[-2][-1].replace(" ", "")
            return "/".join([day, month, year])
        else:
            return "/".join([m[-1].replace(" ", "") for m in match])

    @staticmethod
    def extract_date_from_string(date_string):
        remain_str = date_string
        ldate = []
        for d in ["năm", "ngày", "tháng"]:
            date, remain_str = DatetimeCorrector.get_date_from_date_string_by_prefix(date_string, d)
            if not date:
                return DatetimeCorrector.get_date_by_pattern(date_string)
            ldate.append(date.strip().replace(" ", "").replace(".", ""))
        return "/".join([ldate[1], ldate[2], ldate[0]])

    @staticmethod
    def correct(date_string):
        # Extract the day, month, and year from the string using regular expressions
        date_string = date_string.lower().replace("✪", " ")
        date_string = " ".join(word_tokenize(date_string))
        parsed_date_string_ = DatetimeCorrector.verify_and_convert_date(date_string)  # if already in datetime format
        if parsed_date_string_:
            return parsed_date_string_
        extracted_date = DatetimeCorrector.extract_date_from_string(date_string)
        parsed_date_string_ = DatetimeCorrector.verify_and_convert_date(extracted_date)
        return parsed_date_string_ if parsed_date_string_ else date_string

    @staticmethod
    def eval():
        data = read_json("common/dates_gplx.json")
        type_column = "GPLX"  # Invoice/GPLX
        y_true, y_pred = [], []
        lexcludes = {}
        ddata = {}
        for k, d in data.items():
            if k in lexcludes:
                continue
            if k == "inv_SDV_215":
                print("debugging")
            pred = DatetimeCorrector.correct(d["pred"])
            label = DatetimeCorrector.correct(d["label"])
            ddata[k] = {}
            data[k]["Type"] = type_column
            ddata[k]["Predict"] = d["pred"]
            ddata[k]["Label"] = d["label"]
            ddata[k]["Post-processed"] = pred
            y_pred.append(pred == label)
            y_true.append(1)
            if k == "invoice_1219_000":
                print("\n", k, '-' * 50)
                print(pred, "------", d["pred"])
                print(label, "------", d["label"])
        print(classification_report(y_true, y_pred))
        import pandas as pd
        df = pd.DataFrame.from_dict(ddata, orient="index")
        df.to_csv(f"result/datetime_post_processed_{type_column}.csv")