def copy_only_first_page(data_dir, out_dir, skip_types=('Receipt_taxi', 'Receipt_food_Cam', 'Receipt_food_Scan')):
    """Copy a dataset laid out as ``data_dir/<doc_type>/<file>``, keeping only first pages.

    For every document type listed in ``skip_types`` all files are copied
    unchanged. For every other type a file is copied only when it is the first
    page of its document, or when no first-page sibling exists next to it.

    Page files are assumed to be named ``<document>_<page><ext>`` (for example
    ``inv_1.jpg``, ``inv_2.jpg``); the first page is the sibling whose name ends
    in ``_1`` with the same extension as the file being examined.

    Args:
        data_dir: Root folder (str or path-like) whose direct children are the
            per-document-type folders.
        out_dir: Destination root; one sub-folder per document type is created
            on demand.
        skip_types: Names of document-type folders that are copied in full,
            without first-page filtering.

    Returns:
        None. The function prints the number of globbed paths and copies files.
    """
    # os.fspath lets callers pass either a str or a pathlib.Path.
    paths = sorted(glob.glob(os.path.join(os.fspath(data_dir), "*", "*")))
    print("Total paths: ", len(paths))
    out_dir = Path(out_dir)
    for path in paths:
        src = Path(path)
        # The glob also matches nested directories; shutil.copy would raise on them.
        if not src.is_file():
            continue
        type_doc = src.parent.name
        out_dir_full = out_dir / type_doc
        out_dir_full.mkdir(parents=True, exist_ok=True)
        if type_doc in skip_types:
            shutil.copy(src, out_dir_full)
            continue
        # Split only the file name (not the whole path) on its last underscore,
        # so underscores in directory names cannot affect the sibling lookup.
        base, sep, _page = src.stem.rpartition("_")
        if sep:
            # Keep the file's own extension instead of assuming ".jpg".
            first_page = src.with_name(base + "_1" + src.suffix)
            if src != first_page and first_page.exists():
                # A later page whose first page is present: drop it.
                continue
        shutil.copy(src, out_dir_full)
# Label set for invoice key-information extraction. Each "*_key" entry names a
# field caption and the matching "*_value" entry names the field content.
# The order of the entries is kept as originally written.
classes = [
    # invoice identification
    'No_key',  # invoice number
    'No_value',
    'Form_key',  # invoice form (template) number
    'Form_value',
    'Serial_key',  # invoice serial (symbol) number
    'Serial_value',
    'Date_value',

    # seller info
    'Seller_company_name_key',
    'Seller_company_name_value',
    'Seller_tax_code_key',
    'Seller_tax_code_value',
    'Seller_address_value',
    'Seller_address_key',
    'Seller_tel_key',
    'Seller_tel_value',

    # buyer info
    'Buyer_personal_name_key',
    'Buyer_personal_name_value',
    'Buyer_company_name_key',
    'Buyer_company_name_value',
    'Buyer_tax_code_key',
    'Buyer_tax_code_value',
    # NOTE(review): the buyer address pair used to be listed twice; the repeat
    # is dropped here. If it was meant to be another field (e.g. a buyer phone
    # pair mirroring the seller block), add that field explicitly - TODO confirm.
    'Buyer_address_key',
    'Buyer_address_value',

    # money info
    'Tax_amount_key',
    'Tax_amount_value',
    'Total_key',
    'Total_value',
    'Total_in_words_key',
    'Total_in_words_value',

    'Other',
]

# Normalize every label to lower case.
classes = [x.lower() for x in classes]

# Each label must be unique so a label -> index mapping is unambiguous.
assert len(classes) == len(set(classes)), "duplicate class labels"

# Show the normalized label list (cell output).
classes
"ipython3", "version": "3.8.16" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }