sbt-idp/cope2n-ai-fi/modules/sdsvkie/notebooks/pdf2image.ipynb

198 lines
50 KiB
Plaintext
Raw Normal View History

2023-12-12 08:14:54 +00:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import fitz\n",
"from io import BytesIO"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1\n"
]
}
],
"source": [
"# Read the sample receipt fully into memory, then check that PyMuPDF can parse it from raw bytes.\n",
"pdf_path = \"/mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/raw/batch_1/PDF/eReceipt/(Sol tech Grp)_202303_Receipt_Transfort #1.pdf\"\n",
"with open(pdf_path, mode=\"rb\") as pdf_file:\n",
"    pdf_bytes = pdf_file.read()\n",
"\n",
"# Open from the in-memory buffer (not the path) and report how many pages were found.\n",
"doc = fitz.open(stream=pdf_bytes, filetype=\"pdf\")\n",
"print(doc.page_count)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"isinstance(pdf_bytes, bytes)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np \n",
"import os \n",
"\n",
"def pdf_to_image(pdf, dpi=300, is_get_first_page=False, max_page=1000):\n",
"    \"\"\"Render the pages of a PDF into BGR numpy images.\n",
"\n",
"    Args:\n",
"        pdf (str | os.PathLike | bytes | bytearray): Path to a PDF file on disk, or the raw\n",
"            PDF content held in memory.\n",
"        dpi (int, optional): Target rendering resolution. Pages are scaled by dpi / 72,\n",
"            72 dpi being the unscaled rendering resolution. Defaults to 300.\n",
"        is_get_first_page (bool, optional): If True, only the first page is rendered.\n",
"            Defaults to False.\n",
"        max_page (int, optional): Maximum number of pages to render. Defaults to 1000.\n",
"\n",
"    Raises:\n",
"        NotImplementedError: If pdf is neither a path nor a bytes-like buffer.\n",
"        ValueError: If dpi is not positive.\n",
"\n",
"    Returns:\n",
"        list[np.ndarray]: One uint8 array of shape (height, width, 3) per rendered page,\n",
"            channels in BGR order. Empty if the given path does not exist.\n",
"    \"\"\"\n",
"    if dpi <= 0:\n",
"        raise ValueError(f\"dpi must be positive, got {dpi}\")\n",
"\n",
"    if isinstance(pdf, (str, os.PathLike)):\n",
"        pdf = os.fspath(pdf)\n",
"        if not os.path.exists(pdf):\n",
"            # Deliberate best-effort: a missing file is reported and yields no images.\n",
"            print(f\"Not found pdf path at {pdf}\")\n",
"            return []\n",
"        doc = fitz.open(pdf)  # open document\n",
"    elif isinstance(pdf, (bytes, bytearray)):\n",
"        doc = fitz.open(stream=bytes(pdf), filetype='pdf')\n",
"    else:\n",
"        raise NotImplementedError(f\"Not yet implement for {type(pdf)} type !!!\")\n",
"\n",
"    # True division: dpi // 72 would render 300 dpi at 288 dpi and turn any dpi < 72 into a zero scale.\n",
"    zoom = dpi / 72\n",
"    magnify = fitz.Matrix(zoom, zoom)\n",
"    page_limit = 1 if is_get_first_page else max_page\n",
"    imgs = []\n",
"    try:\n",
"        for idx, page in enumerate(doc):\n",
"            # Check the limit before rendering so that at most page_limit pages are returned.\n",
"            if idx >= page_limit:\n",
"                break\n",
"            pix = page.get_pixmap(matrix=magnify, alpha=False)  # render page to an image\n",
"            im = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)\n",
"            # Fancy indexing copies the data, so the array stays valid after the document is closed.\n",
"            im = np.ascontiguousarray(im[..., [2, 1, 0]])  # rgb to bgr\n",
"            imgs.append(im)\n",
"    finally:\n",
"        # Release the document handle even if rendering fails part-way through.\n",
"        doc.close()\n",
"    return imgs"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "no such file: '/mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/raw/batch_1/PDF/eReceipt/(Sol tech Grp)_202303_Rec eipt_Transfort #1.pdf'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[18], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m imgs \u001b[39m=\u001b[39m pdf_to_image(pdf\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m/mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/raw/batch_1/PDF/eReceipt/(Sol tech Grp)_202303_Rec eipt_Transfort #1.pdf\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n\u001b[1;32m 2\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39mlen\u001b[39m(imgs))\n",
"Cell \u001b[0;32mIn[13], line 5\u001b[0m, in \u001b[0;36mpdf_to_image\u001b[0;34m(pdf, dpi, is_get_first_page)\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mpdf_to_image\u001b[39m(pdf, dpi\u001b[39m=\u001b[39m\u001b[39m300\u001b[39m, is_get_first_page\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m):\n\u001b[1;32m 4\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(pdf, \u001b[39mstr\u001b[39m):\n\u001b[0;32m----> 5\u001b[0m doc \u001b[39m=\u001b[39m fitz\u001b[39m.\u001b[39;49mopen(pdf) \u001b[39m# open document\u001b[39;00m\n\u001b[1;32m 6\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(pdf, \u001b[39mbytes\u001b[39m):\n\u001b[1;32m 7\u001b[0m doc \u001b[39m=\u001b[39m fitz\u001b[39m.\u001b[39mopen(stream\u001b[39m=\u001b[39mpdf, filetype\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mpdf\u001b[39m\u001b[39m'\u001b[39m)\n",
"File \u001b[0;32m~/miniconda3/envs/py38_hoanglv/lib/python3.8/site-packages/fitz/fitz.py:3867\u001b[0m, in \u001b[0;36mDocument.__init__\u001b[0;34m(self, filename, stream, filetype, rect, width, height, fontsize)\u001b[0m\n\u001b[1;32m 3865\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m os\u001b[39m.\u001b[39mpath\u001b[39m.\u001b[39mexists(filename):\n\u001b[1;32m 3866\u001b[0m msg \u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mno such file: \u001b[39m\u001b[39m'\u001b[39m\u001b[39m{\u001b[39;00mfilename\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m-> 3867\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mFileNotFoundError\u001b[39;00m(msg)\n\u001b[1;32m 3868\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39mnot\u001b[39;00m os\u001b[39m.\u001b[39mpath\u001b[39m.\u001b[39misfile(filename):\n\u001b[1;32m 3869\u001b[0m msg \u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m'\u001b[39m\u001b[39m{\u001b[39;00mfilename\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m\u001b[39m is no file\u001b[39m\u001b[39m\"\u001b[39m\n",
"\u001b[0;31mFileNotFoundError\u001b[0m: no such file: '/mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/raw/batch_1/PDF/eReceipt/(Sol tech Grp)_202303_Rec eipt_Transfort #1.pdf'"
]
}
],
"source": [
"# Reuse pdf_path from the loading cell: the path literal previously duplicated here contained a stray space and pointed at a non-existent file.\n",
"imgs = pdf_to_image(pdf=pdf_path)\n",
"print(len(imgs))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjAAAAGPCAYAAABChepSAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB/nUlEQVR4nOz9eZQcd33v/z9r6+q9e/Z9pNG+Sza2ZbGYGIxtcAgE5yYQn1xIfOGbHMOXxFkckrDdm3NNSH5ZbsISbhJI7k1IIN9A2AIxNrYx3mVLtnZrtMyMZl9632r5/P4YdaHRLiNpNDPvxzl9pO6qrq6qnu569WfVlFIKIYQQQogFRJ/vHRBCCCGEuFQSYIQQQgix4EiAEUIIIcSCIwFGCCGEEAuOBBghhBBCLDgSYIQQQgix4EiAEUIIIcSCIwFGCCGEEAuOBBghhBBCLDgSYIQQQgix4FzTAeYzn/kMy5cvJxwOs337dp599tn53iUhhBBCXAOu2QDzL//yL9x///18/OMf54UXXmDr1q3ccccdjI+Pz/euCSGEEGKeadfqZI7bt2/nxhtv5K/+6q8A8H2fnp4ePvShD/G7v/u787x3QgghhJhP5nzvwNnUajV27tzJRz7ykeAxXde57bbbeOqpp876nGq1SrVaDe77vs/09DRNTU1omnbF91kIIYQQPzmlFPl8ns7OTnT93BVF12SAmZycxPM82tra5jze1tbGgQMHzvqcBx98kE9+8pNXY/eEEEIIcYUNDg7S3d19zuXXZIB5NT7ykY9w//33B/ez2Sy9vb0MDg6STCbncc+EEEIIcbFyuRw9PT0kEonzrndNBpjm5mYMw2BsbGzO42NjY7S3t5/1ObZtY9v2GY8nk0kJMEIIIcQCc6HmH9dkL6RQKMRrXvMaHn744eAx3/d5+OGH2bFjxzzumRBCCCGuBddkCQzA/fffz3vf+15uuOEGbrrpJv78z/+cYrHIL//yL8/3rgkhhBBinl2zAeYXfuEXmJiY4GMf+xijo6Ns27aN7373u2c07L0WnNoTXdO04L70frp2ZDIZjh8/Tnd3N6lUikqlQiwWo1qtYpompjn7UZiensY0TXzfJ5VKnbEdz/PYv38/8Xic5cuXB4+XSiWi0SiO41CpVKTacpE5fbQJpRSapsln/BqhlOLgwYNomsaqVasoFovE43E0TQs+m5qm4TgOhUKBSqVCW1vbWd+/0dFRJiYmWLFiBbFYDJjtGavrOqZpMjMzQzqdPm/vGHF1XLPjwPykcrkcqVSKbDZ7wYuJUoqjR4/iOA5KKRKJBJ2dndRqNfbt28fq1auJx+NMTU1x/PhxWlpaKBQKrFu3Dk3TGBoaYmBggMbGRjo7O/nBD36AaZrceuutKKXYu3cvmzZtwrIsTpw4gWEYFItF1q5dK1+AV8nTTz/N0NAQ2WyW1tZWLMuiXC5TKpV4/etfT0dHB4Zh8M1vfpOJiQluvvlm+vv7aW9vD/6GTNNkzZo1fOUrX0HXdfr6+tB1nWw2y+joKKtXr6ZUKtHW1saNN94434csTlEqldi/fz8dHR2MjY2xadMmTNNkcHCQarXKqlWrUEqxb98+EokEmUyGZcuWkU6ncV2XF154Ac/z2LJlC0ePHuXFF1/kpptuYs2aNRw6dAjTNFmxYgWjo6OEw2GOHj3KqlWrJMheJa7r8nd/93fYtk08HicWi+H7PoZhMDU1xS/8wi/gui7FYpG//du/5aabbqJarWJZFr7vEw6HqVQqbNmyhaeffhrHcfB9n1gshm3bHDt2jO7uborFIkopfvqnf5pQKDTfh71oXez1WyLkSb7v89JLL1GtVnn66aeDx0ZGRjhy5AgAjuMwODjID37wA44fPx78MY+Pj6NpGqOjo8GHJJfL4fs+g4OD7N69m+PHjzM6OsqTTz7JM888w+OPP47nefN5yEuKruu0tbWxfft2GhoaGBoaoqmpiVqthmma9Pf3k8lk0H
WdpqYmjh49SmNjIy+88AK+73P48GGOHTuGpmnE43Fuu+02stksw8PDRCIRGhoa6O/vZ//+/RJKr0GO45DNZjl27Bj79+9nenoamP2MP/nkk/i+H9z+8z//k8nJSXbt2hU8d3p6mlKpRD6fZ3x8nGg0yuTkJEopnn76aY4dO8aJEyd47rnnOHToEA8//DD9/f3zeMRLTyQS4Y1vfCORSATDMKhUKtRqNRKJBOVymf3796OUor29nenpaRzHYXx8nGKxyNDQEDt37sT3/SCM1scgOXDgAGvXrmVkZIT+/n6y2ex8H6o46ZqtQrraXnjhBVpbW8nn8yilmJ6eRtd1NE0LvrxOnDhBoVCgsbGRarVKJpMhEokQDod5+eWX2bRpE/l8nmg0immalMtlotEoHR0dFAoF+vr6aG1tpVAokEql8DwvqLoQV1a9BEwpheM4rF+/nnA4zMaNG7Ftm66uLgBuvfVWQqEQmUyGeDzOpk2b5hRHR6NR3v72txOLxYhGoyiliEajlEolyuUyoVDorFVPYn7l83kOHjzIjh07CIfDjI6O0tjYyNTUFLZtMzY2RlNTE0899RTr169nenqahoYGCoUCjuNgGAbZbJZKpUI6nebIkSOsXr0az/NYuXIlY2Nj2LbN6tWrmZmZobu7G8dx5vuwlwzDMHjHO95BIpGgqakJmA2nmqYF1cXbtm3DdV1+9md/Fsdx0DQNXdfJ5/OEQiEsyyIUCvH617+eaDSK53mMjIywfft2LMti2bJleJ5HPB7Hsqx5PmIBUoUEzFYhTU1NoWkahmEEbSJCoRBTU1PE43FM06RSqVAqlYjFYriui23bhMPhoLhR0zRCoRDFYhFd14lGo7iuy8zMDIlEAsuycBwHz/NwHIdUKiW/1oW4CiqVShBKy+Uy8Xgc27Ypl8s4joNt24RCISYnJwmHw7iuSywWQ9d1DMMIwohpmhiGweTkJM3NzcDsd039e8MwDGC2zVU6nZYLnRCvwsVevyXACCHEAnO+jgKnf6XLj6SFa6l2CLnY67fUXwghxALj+/5Ze8EopXAro2QGvgpAuvfnMcNtF+wdea5lr+YCenqvTPHq1Xu7ibOTACOEEIuE8isMPPMBipPPA5Ab+T59b/gK6DZHjhzB8zxWr17Ns88+SzQaZfXq1VQqFarVKpFIhEgkQq1W49lnn2XNmjXYtk21WqWrq4v+/n4SiQRTU1N0dHQQiUQwTXM2NLnunEB1alfjetjyPA/btuWCfAmUAjld5yYBRgghFgmvlqE4uZPG1f8vANOv/CVebQYz3E4ikWB4eBilFKOjo0HPy0gkQnNzM5VKhTVr1hCPxxkcHMR1XZRSdHZ20tnZyczMDP39/UxPT1Mul+nv76enp4dSqRSMsWJZFtVqlWQySS6XwzTNYMgCTdOCRvJCXA4SYIQQYpEwQo3E297I8J7/HwDNvbdjhBpRSrFr1y5WrlyJpmn09vbS3d3Nc889R29vL8ViEdM0aWtro1arcd111zE2Nhb00qnVajQ0NOA4Dk1NTWSzWWzbplAo0NXVxfHjx8nn83R0dNDc3IxSimQyGXQ9d12XUChErVaTACMuG2nEew5KKQ4fPkw2m+X6668Pij2PHz9OIpEgnU7T39+PaZp0d3ef0dvA93127dqFbdts3LgRmJ0he3p6mr6+PsrlMs8++yxbt24FYNeuXWzevDnoAlg3ODhIa2srtm2Tz+eJx+MyAqQQS5znecEwD6dSSuHVMmSH/wOAVOdbMUJpYPY7CZjz/VHfztlGFlZK4fs+tVqNcDh8xjLP8+jv72fVqlXA7HdVb2/vGdvfs2dPMMinYRhShXQJPM9H15feiM/SiPcyeOihhwiHwwwODlIul6lWqxw6dIjrrruOd7zjHXz/+9/n+PHjLF++nFQqRU9PDwcPHmTlypU0NTXx3HPP0dbWxo9+9CMaGxuZmJhgaGiIt7zlLfT19fGNb3yDZcuWMTAwwL/8y7+wb98+PM
+jra0N27YZGBggFAphGAa+77N//35+53d+h87Ozvk+NUKIa5CmaRihNI3L3zPnMaUUlUqFUCiE7/s4joOu60HblHp
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt \n",
"\n",
"# pdf_to_image returns BGR arrays while imshow interprets 3-channel input as RGB,\n",
"# so reverse the channel axis to display the page with its true colours.\n",
"plt.imshow(imgs[0][..., ::-1])\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Alternative approach: rasterise the PDF with the pdf2image package instead of PyMuPDF.\n",
"from pdf2image import convert_from_path\n",
"\n",
"# Convert every page of the document at 300 dpi.\n",
"images = convert_from_path('/mnt/hdd2T/AICR/Projects/2023/Invoices_SL_HCM/EAS03581488.pdf', dpi=300)\n",
"\n",
"# Write each page to the working directory as page<index>.jpg.\n",
"for page_no, page_image in enumerate(images):\n",
"    page_image.save('page' + str(page_no) + '.jpg', 'JPEG')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "py38_hoanglv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.16"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}