sbt-idp/cope2n-ai-fi/modules/sdsvkie/notebooks/sdsvap_invoice.ipynb
2023-12-12 15:14:54 +07:00

195 lines
5.0 KiB
Plaintext
Executable File

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os \n",
"import glob \n",
"from tqdm import tqdm \n",
"import cv2 \n",
"import shutil\n",
"from pathlib import Path"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"DATA_DIR = \"/mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/raw/IMGS\"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def copy_only_first_page(data_dir, out_dir, skip_types=['Receipt_taxi','Receipt_food_Cam', 'Receipt_food_Scan']):\n",
" paths = sorted(glob.glob(data_dir + \"/*/*\"))\n",
" print(\"Total paths: \", len(paths))\n",
" out_dir = Path(out_dir)\n",
" for path in paths:\n",
" type_doc = Path(path).parent.name\n",
" out_dir_full = out_dir / type_doc\n",
" if not out_dir_full.exists():\n",
" out_dir_full.mkdir(parents=True)\n",
" if type_doc in skip_types:\n",
" shutil.copy(path, str(out_dir_full))\n",
" else:\n",
" if \"_1.jpg\" in path:\n",
" shutil.copy(path, out_dir_full)\n",
" prefix_name = \"_\".join(path.split(\"_\")[:-1]) + \"_1.jpg\"\n",
" print(prefix_name)\n",
" if Path(prefix_name).exists():\n",
" continue\n",
" else:\n",
" shutil.copy(path, out_dir_full)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"classes = [\n",
" # id invoice\n",
" 'No_key', # số hóa đơn\n",
" 'No_value', \n",
" 'Form_key', # mẫu số hóa đơn\n",
" 'Form_value', \n",
" 'Serial_key', # số kí hiệu hoá đơn\n",
" 'Serial_value', \n",
" 'Date_value', \n",
"\n",
" # seller info\n",
" 'Seller_company_name_key', \n",
" 'Seller_company_name_value', \n",
" 'Seller_tax_code_key', \n",
" 'Seller_tax_code_value', \n",
" 'Seller_address_value',\n",
" 'Seller_address_key', \n",
" 'Seller_tel_key',\n",
" 'Seller_tel_value', \n",
" \n",
" # buyer info\n",
" 'Buyer_personal_name_key',\n",
" 'Buyer_personal_name_value', \n",
" 'Buyer_company_name_key', \n",
" 'Buyer_company_name_value', \n",
" 'Buyer_tax_code_key', \n",
" 'Buyer_tax_code_value', \n",
" 'Buyer_address_key', \n",
" 'Buyer_address_value', \n",
" 'Buyer_address_key',\n",
" 'Buyer_address_value',\n",
"\n",
" # money info\n",
" 'Tax_amount_key', \n",
" 'Tax_amount_value', \n",
" 'Total_key', \n",
" 'Total_value', \n",
" 'Total_in_words_key', \n",
" 'Total_in_words_value',\n",
" \n",
" 'Other', \n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"classes = [x.lower() for x in classes]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['no_key',\n",
" 'no_value',\n",
" 'form_key',\n",
" 'form_value',\n",
" 'serial_key',\n",
" 'serial_value',\n",
" 'date_value',\n",
" 'seller_company_name_key',\n",
" 'seller_company_name_value',\n",
" 'seller_tax_code_key',\n",
" 'seller_tax_code_value',\n",
" 'seller_address_value',\n",
" 'seller_address_key',\n",
" 'seller_tel_key',\n",
" 'seller_tel_value',\n",
" 'buyer_personal_name_key',\n",
" 'buyer_personal_name_value',\n",
" 'buyer_company_name_key',\n",
" 'buyer_company_name_value',\n",
" 'buyer_tax_code_key',\n",
" 'buyer_tax_code_value',\n",
" 'buyer_address_key',\n",
" 'buyer_address_value',\n",
" 'buyer_address_key',\n",
" 'buyer_address_value',\n",
" 'tax_amount_key',\n",
" 'tax_amount_value',\n",
" 'total_key',\n",
" 'total_value',\n",
" 'total_in_words_key',\n",
" 'total_in_words_value',\n",
" 'other']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"classes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "py38_hoanglv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.16"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}