Remove unused files
13
cope2n-ai-fi/modules/sdsvkie/.gitignore
vendored
@ -1,13 +0,0 @@
|
|||||||
*.pyc
|
|
||||||
__pycache__
|
|
||||||
.cache
|
|
||||||
/microsoft
|
|
||||||
weights/
|
|
||||||
workdirs/
|
|
||||||
wandb
|
|
||||||
sdsvkie/tools/sample_cvat
|
|
||||||
notebooks/workdirs
|
|
||||||
external/
|
|
||||||
notebooks/visualize
|
|
||||||
*.egg-info
|
|
||||||
./external/sdsv_dewarp
|
|
@ -1,75 +0,0 @@
|
|||||||
<p align="center">
|
|
||||||
<h1 align="center">SDSVKIE</h1>
|
|
||||||
</p>
|
|
||||||
|
|
||||||
***Feature***
|
|
||||||
- Extract information from documents: VAT Invoice, Receipt
|
|
||||||
- Language: VI + EN
|
|
||||||
|
|
||||||
***What's news***
|
|
||||||
### - Ver 1.0.1:
|
|
||||||
- Improve postprocessing for receipts
|
|
||||||
- Support handling multiple pages for PDF files
|
|
||||||
- Lastest weight: /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/sdsap_receipt/exp_9_lr5e_6_no_scheduler/best
|
|
||||||
- Lastest config: /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/sdsap_receipt/exp_9_lr5e_6_no_scheduler/config.yaml
|
|
||||||
|
|
||||||
|
|
||||||
## I. Setup
|
|
||||||
***Dependencies***
|
|
||||||
- Python: 3.8
|
|
||||||
- Torch: 1.10.2
|
|
||||||
- CUDA: 11.6
|
|
||||||
- transformers: 4.28.1
|
|
||||||
```
|
|
||||||
pip install -v -e .
|
|
||||||
```
|
|
||||||
|
|
||||||
|
|
||||||
## II. Inference
|
|
||||||
```
|
|
||||||
from sdsvkie import Predictor
|
|
||||||
import cv2
|
|
||||||
|
|
||||||
predictor = Predictor(
|
|
||||||
cfg="./workdirs/training/sdsap_receipt/exp_3/config.yaml",
|
|
||||||
weights="./workdirs/training/sdsap_receipt/exp_3/best",
|
|
||||||
device="cpu",
|
|
||||||
)
|
|
||||||
img = cv2.imread("./demos/4 Sep OPC to Home.jpg")
|
|
||||||
out = predictor(img)
|
|
||||||
output = out['end2end_results']
|
|
||||||
|
|
||||||
```
|
|
||||||
|
|
||||||
|
|
||||||
## III. Training
|
|
||||||
- Prepare dataset: The structure of the dataset directory is organized as follows:
|
|
||||||
|
|
||||||
└── base_dataset \
|
|
||||||
├── train \
|
|
||||||
├──── sub_dir_1 \
|
|
||||||
├────── img1.txt \
|
|
||||||
├────── img1.txt \
|
|
||||||
├────── ... \
|
|
||||||
├──── sub_dir_2 \
|
|
||||||
├────── img2.txt \
|
|
||||||
├────── img2.txt \
|
|
||||||
├── test \
|
|
||||||
├──── imgn.jpg \
|
|
||||||
├──── imgn.txt
|
|
||||||
|
|
||||||
- Edit and run scripts:
|
|
||||||
```
|
|
||||||
sh ./scripts/train.sh
|
|
||||||
```
|
|
||||||
|
|
||||||
# TODO
|
|
||||||
|
|
||||||
- [ ] Add more fields: sub_total, tips, seller_address, item list
|
|
||||||
- [x] Support muliple pages
|
|
||||||
- [x] Review result KIE for invoice (vnpt_exp_4_model)
|
|
||||||
- [x] Fix unnormalize box error in some cases
|
|
||||||
- [x] Support multiple pages
|
|
||||||
- [x] Create 200 multiple pages invoice
|
|
||||||
- [ ] Finalize multi page testset
|
|
||||||
- [ ] Eval result
|
|
Before Width: | Height: | Size: 222 KiB |
Before Width: | Height: | Size: 331 KiB |
Before Width: | Height: | Size: 513 KiB |
@ -1,73 +0,0 @@
|
|||||||
350 31 403 58 dịch
|
|
||||||
329 143 386 166 ngoài
|
|
||||||
104 298 174 321 CƯỚC
|
|
||||||
323 244 375 268 ĐẾN
|
|
||||||
739 67 810 91 (Price)
|
|
||||||
610 67 663 91 Unit)
|
|
||||||
1109 26 1174 55 Thuế
|
|
||||||
181 143 231 166 hãng
|
|
||||||
102 244 145 268 PHÍ
|
|
||||||
1021 27 1076 55 thuế
|
|
||||||
25 67 73 92 (No)
|
|
||||||
151 350 240 374 CHỨNG
|
|
||||||
893 25 949 54 Tiền
|
|
||||||
938 68 1033 92 (Amount)
|
|
||||||
150 244 198 268 XẾP
|
|
||||||
247 244 316 267 CẢNG
|
|
||||||
967 301 1080 323 68.730.120
|
|
||||||
998 354 1080 376 760.000
|
|
||||||
1394 31 1473 55 Thành
|
|
||||||
102 349 146 375 PHÍ
|
|
||||||
1477 25 1527 56 tiền
|
|
||||||
784 30 823 59 giá
|
|
||||||
241 68 374 92 (Description)
|
|
||||||
293 31 345 58 hóa,
|
|
||||||
228 32 288 58 hàng
|
|
||||||
1383 66 1447 92 (Total
|
|
||||||
247 350 333 374 TỪ-D/O
|
|
||||||
103 142 145 162 Thu
|
|
||||||
1196 82 1254 112 Tiền
|
|
||||||
203 244 242 268 DỠ
|
|
||||||
781 353 863 376 760.000
|
|
||||||
181 298 240 321 BIỂN
|
|
||||||
979 248 1080 270 6.342.336
|
|
||||||
174 31 222 54 Tên
|
|
||||||
533 66 579 92 (Qty
|
|
||||||
1254 354 1324 375 40.000
|
|
||||||
1486 353 1567 375 800.000
|
|
||||||
1467 247 1568 270 6.342.336
|
|
||||||
750 301 864 323 68.730.120
|
|
||||||
273 144 324 162 nước
|
|
||||||
956 32 1015 54 chưa
|
|
||||||
726 32 778 54 Đơn
|
|
||||||
1455 300 1568 323 68.730.120
|
|
||||||
148 142 177 166 hộ
|
|
||||||
1180 30 1321 59 GTGT✪(VAT)
|
|
||||||
1258 81 1316 111 thuế
|
|
||||||
531 30 568 54 SL
|
|
||||||
104 195 285 215 BL146201088385
|
|
||||||
604 31 663 53 ĐVT
|
|
||||||
763 248 863 270 6.342.336
|
|
||||||
22 31 76 54 STT
|
|
||||||
1451 68 1535 92 amount)
|
|
||||||
407 36 440 59 vụ
|
|
||||||
235 143 268 163 tàu
|
|
||||||
260 302 304 321 O.F.
|
|
||||||
1120 87 1149 112 %
|
|
||||||
583 66 605 88 &
|
|
||||||
573 30 598 54 &
|
|
||||||
591 301 624 322 BL
|
|
||||||
591 353 625 376 BL
|
|
||||||
1115 306 1154 322 XXX
|
|
||||||
41 301 57 322 2
|
|
||||||
1115 360 1154 375 XXX
|
|
||||||
1114 253 1154 269 XXX
|
|
||||||
42 354 56 375 3
|
|
||||||
591 247 625 270 BL
|
|
||||||
44 248 55 268 1
|
|
||||||
1310 301 1326 322 0
|
|
||||||
574 301 586 322 1
|
|
||||||
573 354 586 375 1
|
|
||||||
573 248 586 269 1
|
|
||||||
244 303 256 322 -
|
|
||||||
1310 247 1326 270 0
|
|
Before Width: | Height: | Size: 246 KiB |
@ -1,198 +0,0 @@
|
|||||||
749 41 785 60 suất
|
|
||||||
746 18 789 37 Thuế
|
|
||||||
746 70 788 86 (Rate)
|
|
||||||
192 35 226 52 dịch
|
|
||||||
812 30 850 49 Tiền
|
|
||||||
341 70 382 86 (Unit)
|
|
||||||
15 104 63 120 CƯỚC
|
|
||||||
632 29 670 48 Tiền
|
|
||||||
673 33 714 52 hàng
|
|
||||||
110 34 150 52 hàng
|
|
||||||
673 674 727 689 855.000
|
|
||||||
841 552 894 567 169.742
|
|
||||||
433 34 479 52 lượng
|
|
||||||
660 552 727 567 3.227.035
|
|
||||||
345 46 377 60 tính
|
|
||||||
841 228 894 243 169.742
|
|
||||||
154 34 188 51 hóa,
|
|
||||||
973 227 1040 243 3.396.777
|
|
||||||
66 103 102 120 TÀU
|
|
||||||
841 715 894 729 111.509
|
|
||||||
660 227 727 243 3.227.035
|
|
||||||
673 512 727 527 855.000
|
|
||||||
673 187 727 202 855.000
|
|
||||||
660 714 727 730 2.119.950
|
|
||||||
474 895 510 912 chiếu
|
|
||||||
118 58 207 75 (Description)
|
|
||||||
605 899 639 914 nhận
|
|
||||||
388 895 422 913 kiểm
|
|
||||||
973 552 1040 567 3.396.777
|
|
||||||
841 58 899 74 Amount)
|
|
||||||
75 34 107 49 Tên
|
|
||||||
988 29 1020 49 tiền
|
|
||||||
847 188 894 201 44.973
|
|
||||||
535 227 602 243 3.227.035
|
|
||||||
926 58 968 74 (Total
|
|
||||||
986 187 1039 202 899.973
|
|
||||||
334 23 368 37 Đơn
|
|
||||||
852 30 890 49 thuế
|
|
||||||
652 755 727 770 23.507.890
|
|
||||||
965 268 1040 283 38.111.990
|
|
||||||
973 390 1039 405 3.396.777
|
|
||||||
548 187 603 202 855.000
|
|
||||||
410 58 479 75 (Quantity)
|
|
||||||
547 59 589 74 Price)
|
|
||||||
841 390 894 405 169.742
|
|
||||||
652 593 727 608 38.111.990
|
|
||||||
535 552 602 568 3.227.035
|
|
||||||
847 512 894 526 44.973
|
|
||||||
965 106 1040 121 38.111.990
|
|
||||||
932 34 985 49 Thành
|
|
||||||
537 898 563 915 lập,
|
|
||||||
342 107 380 120 45GP
|
|
||||||
15 147 43 161 Thu
|
|
||||||
47 185 106 202 CHỨNG
|
|
||||||
673 349 727 364 855.000
|
|
||||||
847 674 894 689 44.973
|
|
||||||
548 674 603 689 855.000
|
|
||||||
567 900 601 915 giao,
|
|
||||||
555 33 580 52 giá
|
|
||||||
660 389 727 405 3.227.035
|
|
||||||
15 185 43 201 PHÍ
|
|
||||||
985 674 1039 689 899.973
|
|
||||||
652 268 727 284 38.111.990
|
|
||||||
351 895 386 915 (Cần
|
|
||||||
671 898 704 915 đơn)
|
|
||||||
675 59 731 74 amount)
|
|
||||||
970 59 1026 74 amount)
|
|
||||||
45 147 65 164 hộ
|
|
||||||
985 512 1039 527 899.973
|
|
||||||
973 714 1040 730 2.231.459
|
|
||||||
448 895 472 913 đối
|
|
||||||
535 715 603 730 2.119.950
|
|
||||||
652 106 727 121 38.111.990
|
|
||||||
965 593 1040 608 38.111.990
|
|
||||||
548 512 603 527 855.000
|
|
||||||
15 266 63 282 CƯỚC
|
|
||||||
508 58 544 74 (Unit
|
|
||||||
46 509 106 526 CHỨNG
|
|
||||||
408 30 429 49 Số
|
|
||||||
965 430 1040 446 38.111.990
|
|
||||||
46 225 79 242 XÉP
|
|
||||||
105 147 249 161 SNKO010220804769
|
|
||||||
15 428 62 444 CƯỚC
|
|
||||||
527 106 602 121 38.111.990
|
|
||||||
985 349 1039 364 899.973
|
|
||||||
15 590 63 607 CƯỚC
|
|
||||||
642 898 669 913 hóa
|
|
||||||
527 268 603 284 38.111.990
|
|
||||||
65 589 101 607 TÀU
|
|
||||||
15 752 63 769 CƯỚC
|
|
||||||
46 671 106 688 CHỨNG
|
|
||||||
15 225 43 242 PHÍ
|
|
||||||
341 228 380 242 45GP
|
|
||||||
14 633 43 648 Thu
|
|
||||||
847 350 894 364 44.973
|
|
||||||
547 349 603 365 855.000
|
|
||||||
65 427 101 445 TÀU
|
|
||||||
513 898 536 913 khi
|
|
||||||
14 471 43 485 Thu
|
|
||||||
673 837 727 851 855.000
|
|
||||||
15 309 43 323 Thu
|
|
||||||
14 508 44 526 PHÍ
|
|
||||||
965 755 1040 770 23.507.890
|
|
||||||
67 148 101 161 SNK
|
|
||||||
14 670 43 688 PHÍ
|
|
||||||
527 593 603 608 38.111.990
|
|
||||||
527 756 602 770 23.507.890
|
|
||||||
65 751 101 769 TÀU
|
|
||||||
44 470 65 489 hộ
|
|
||||||
517 34 551 48 Đơn
|
|
||||||
45 548 78 566 XÉP
|
|
||||||
14 549 43 567 PHÍ
|
|
||||||
65 265 102 283 TÀU
|
|
||||||
535 389 602 406 3.227.035
|
|
||||||
14 386 44 404 PHÍ
|
|
||||||
14 832 43 850 PHÍ
|
|
||||||
46 347 106 364 CHỨNG
|
|
||||||
44 633 65 651 hộ
|
|
||||||
548 837 602 850 855.000
|
|
||||||
46 833 106 850 CHỨNG
|
|
||||||
44 308 65 326 hộ
|
|
||||||
45 386 79 404 XÃP
|
|
||||||
14 346 43 364 PHÍ
|
|
||||||
14 795 43 810 Thu
|
|
||||||
526 430 602 446 38.111.990
|
|
||||||
802 58 840 74 (VAT
|
|
||||||
45 711 79 729 XÉP
|
|
||||||
848 837 893 850 44.973
|
|
||||||
80 225 107 242 DỠ
|
|
||||||
230 37 251 52 vụ
|
|
||||||
754 109 781 120 XXX
|
|
||||||
652 430 727 446 38.111.990
|
|
||||||
80 549 107 567 DỠ
|
|
||||||
350 188 372 202 BL
|
|
||||||
109 184 136 202 TỪ
|
|
||||||
616 58 672 73 (Pre-tax
|
|
||||||
44 795 65 813 hộ
|
|
||||||
986 837 1039 851 899.973
|
|
||||||
341 756 380 770 22GP
|
|
||||||
80 387 107 404 DỠ
|
|
||||||
424 900 445 912 tra
|
|
||||||
14 711 43 729 PHÍ
|
|
||||||
104 796 249 810 SNKO010220805559
|
|
||||||
104 309 248 323 SNKO010220805023
|
|
||||||
341 268 380 283 45GP
|
|
||||||
66 634 101 647 SNK
|
|
||||||
80 711 107 729 DỠ
|
|
||||||
372 22 389 40 vị
|
|
||||||
108 670 136 689 TỪ
|
|
||||||
104 633 248 648 SNKO010220805118
|
|
||||||
754 556 781 567 XXX
|
|
||||||
754 759 781 769 XXX
|
|
||||||
754 597 781 607 XXX
|
|
||||||
66 472 101 485 SNK
|
|
||||||
66 309 101 323 SNK
|
|
||||||
755 719 780 729 XXX
|
|
||||||
108 832 136 851 TỪ
|
|
||||||
341 431 380 445 45GP
|
|
||||||
754 515 781 526 XXX
|
|
||||||
754 840 781 850 XXX
|
|
||||||
341 715 380 729 22GP
|
|
||||||
108 345 136 364 TỪ
|
|
||||||
754 678 781 688 XXX
|
|
||||||
66 796 101 810 SNK
|
|
||||||
341 390 380 405 45GP
|
|
||||||
104 471 249 486 SNKO010220805117
|
|
||||||
341 553 380 567 45GP
|
|
||||||
108 508 136 527 TỪ
|
|
||||||
755 232 780 242 XXX
|
|
||||||
754 435 781 445 XXX
|
|
||||||
341 593 380 608 45GP
|
|
||||||
754 394 781 405 XXX
|
|
||||||
755 272 781 283 XXX
|
|
||||||
754 191 780 202 XXX
|
|
||||||
349 836 373 851 BL
|
|
||||||
754 353 780 364 XXX
|
|
||||||
349 349 372 364 BL
|
|
||||||
440 106 448 120 1
|
|
||||||
887 106 896 121 1
|
|
||||||
440 553 448 567 1
|
|
||||||
440 755 448 769 1
|
|
||||||
441 715 448 729 1
|
|
||||||
888 756 896 769 /
|
|
||||||
441 228 448 242 1
|
|
||||||
440 593 448 608 1
|
|
||||||
440 269 449 283 1
|
|
||||||
440 836 448 850 1
|
|
||||||
440 349 449 364 1
|
|
||||||
888 269 896 283 1
|
|
||||||
440 390 448 404 1
|
|
||||||
440 188 448 202 1
|
|
||||||
440 431 448 445 1
|
|
||||||
888 431 896 445 /
|
|
||||||
350 512 372 526 BL
|
|
||||||
440 674 448 688 1
|
|
||||||
888 594 896 607 /
|
|
||||||
349 675 372 689 BL
|
|
||||||
440 512 448 526 1
|
|
Before Width: | Height: | Size: 159 KiB |
@ -1,76 +0,0 @@
|
|||||||
162 92 206 111 hàng
|
|
||||||
263 152 303 175 gồm
|
|
||||||
133 124 188 142 chứng
|
|
||||||
132 26 176 46 hàng
|
|
||||||
224 26 261 45 dịch
|
|
||||||
267 124 311 142 nhập
|
|
||||||
928 23 974 43 Thuế
|
|
||||||
218 124 261 142 hàng
|
|
||||||
93 53 202 73 Descriptions
|
|
||||||
343 155 378 175 phụ
|
|
||||||
1110 23 1151 43 thuế
|
|
||||||
1067 23 1107 43 Tiền
|
|
||||||
181 26 219 45 hóa,
|
|
||||||
98 156 148 172 Cước
|
|
||||||
216 152 258 174 biển
|
|
||||||
26 54 56 70 No.
|
|
||||||
1185 26 1240 42 Thành
|
|
||||||
437 53 469 73 Qty
|
|
||||||
270 87 304 107 đến
|
|
||||||
92 26 127 43 Tên
|
|
||||||
1245 23 1279 44 tiền
|
|
||||||
979 23 1018 43 suất
|
|
||||||
640 26 669 47 giá
|
|
||||||
152 156 185 174 vận
|
|
||||||
98 124 129 140 Phí
|
|
||||||
98 92 129 109 Phí
|
|
||||||
1075 92 1144 109 166.904
|
|
||||||
813 92 900 109 3.171.168
|
|
||||||
866 23 906 43 thuế
|
|
||||||
97 178 127 198 phí
|
|
||||||
1223 54 1267 71 Total
|
|
||||||
26 26 52 43 Stt
|
|
||||||
308 155 340 171 các
|
|
||||||
684 54 728 70 Price
|
|
||||||
1246 92 1333 109 3.338.072
|
|
||||||
769 23 809 43 Tiền
|
|
||||||
1236 167 1333 184 36.393.480
|
|
||||||
673 26 728 46 (VND)
|
|
||||||
803 167 899 183 36.393.480
|
|
||||||
1085 124 1144 140 39.474
|
|
||||||
830 124 900 140 750.000
|
|
||||||
636 92 724 109 3.171.168
|
|
||||||
625 168 723 183 36.393.480
|
|
||||||
1082 54 1151 71 Amount
|
|
||||||
652 124 723 140 750.000
|
|
||||||
1284 26 1339 46 (VND)
|
|
||||||
1263 124 1333 140 789.474
|
|
||||||
487 92 542 108 CONT
|
|
||||||
837 54 906 71 Amount
|
|
||||||
132 92 158 108 dỡ
|
|
||||||
210 92 233 111 tại
|
|
||||||
643 54 679 70 Unit
|
|
||||||
813 27 862 43 trước
|
|
||||||
237 91 266 107 nơi
|
|
||||||
1271 54 1339 71 Amount
|
|
||||||
598 27 636 43 Đơn
|
|
||||||
482 54 519 70 Unit
|
|
||||||
190 155 212 172 tải
|
|
||||||
1039 54 1079 70 VAT
|
|
||||||
265 29 288 45 vụ
|
|
||||||
480 26 522 43 ĐVT
|
|
||||||
442 25 468 43 SL
|
|
||||||
975 54 1016 70 Rate
|
|
||||||
193 124 214 140 từ
|
|
||||||
485 123 517 141 B/L
|
|
||||||
485 166 510 184 Lô
|
|
||||||
931 53 972 71 VAT
|
|
||||||
31 167 43 183 3
|
|
||||||
31 124 44 140 2
|
|
||||||
1000 94 1014 110 X
|
|
||||||
32 92 42 108 1
|
|
||||||
1001 127 1014 142 X
|
|
||||||
1004 166 1014 185 /
|
|
||||||
1138 167 1147 184 /
|
|
||||||
451 92 461 108 1
|
|
||||||
451 124 461 140 1
|
|
Before Width: | Height: | Size: 201 KiB |
@ -1,82 +0,0 @@
|
|||||||
537 569 588 597 Tổng
|
|
||||||
233 161 291 182 chứng
|
|
||||||
593 573 641 597 cộng
|
|
||||||
290 35 332 57 dịch
|
|
||||||
467 78 524 99 (Unit)
|
|
||||||
577 35 632 57 lượng
|
|
||||||
472 50 511 68 tính
|
|
||||||
1124 18 1175 45 Tổng
|
|
||||||
190 161 227 183 nộp
|
|
||||||
647 573 715 596 (Total):
|
|
||||||
1012 160 1088 179 720.000
|
|
||||||
114 160 156 182 dịch
|
|
||||||
133 613 180 641 bằng
|
|
||||||
792 63 877 84 (Amount)
|
|
||||||
193 36 239 57 hàng
|
|
||||||
986 52 1029 76 Tiền
|
|
||||||
1138 78 1201 99 (Total)
|
|
||||||
797 159 892 180 9.000.000
|
|
||||||
322 156 364 179 xuất
|
|
||||||
1118 50 1170 68 thanh
|
|
||||||
244 35 285 56 hóa,
|
|
||||||
87 575 184 596 (Exchange
|
|
||||||
50 613 88 637 tiền
|
|
||||||
844 30 883 55 tiền
|
|
||||||
399 159 440 179 C/O
|
|
||||||
463 22 504 41 Đơn
|
|
||||||
1139 160 1235 179 9.720.000
|
|
||||||
703 64 758 84 price)
|
|
||||||
1140 119 1201 139 9=6+8
|
|
||||||
781 35 840 54 Thành
|
|
||||||
1178 17 1216 42 tiền
|
|
||||||
914 14 964 38 Thuế
|
|
||||||
915 160 949 180 8%
|
|
||||||
189 576 239 596 rate):
|
|
||||||
198 64 318 84 (Description)
|
|
||||||
91 613 129 637 viết
|
|
||||||
77 160 109 179 Phí
|
|
||||||
52 573 83 597 giá
|
|
||||||
256 619 323 640 words):
|
|
||||||
1032 53 1074 76 thuế
|
|
||||||
20 613 47 637 Số
|
|
||||||
1175 51 1217 68 toán
|
|
||||||
804 119 863 139 6=4x5
|
|
||||||
710 34 741 59 giá
|
|
||||||
664 35 707 54 Đơn
|
|
||||||
651 63 700 83 (Unit
|
|
||||||
546 30 573 54 Số
|
|
||||||
546 63 639 84 (Quantity)
|
|
||||||
151 36 188 54 Tên
|
|
||||||
368 159 396 179 xứ
|
|
||||||
184 618 221 637 chữ
|
|
||||||
482 619 531 638 trăm
|
|
||||||
20 573 48 597 Tỷ
|
|
||||||
1018 85 1092 105 Amount)
|
|
||||||
969 17 1082 41 GTGT(VAT)
|
|
||||||
634 618 692 642 nghìn
|
|
||||||
916 68 947 89 TS
|
|
||||||
696 613 754 644 đồng.
|
|
||||||
575 619 629 638 mươi
|
|
||||||
782 572 877 594 9.000.000
|
|
||||||
329 619 379 638 Chín
|
|
||||||
384 619 433 642 triệu
|
|
||||||
966 84 1015 104 (VAT
|
|
||||||
225 619 252 640 (In
|
|
||||||
296 160 318 179 từ
|
|
||||||
989 573 1065 592 720.000
|
|
||||||
17 63 66 84 (No.)
|
|
||||||
161 165 186 182 vụ
|
|
||||||
537 618 571 638 hai
|
|
||||||
336 39 362 57 vụ
|
|
||||||
507 21 529 45 vị
|
|
||||||
1119 572 1215 594 9.720.000
|
|
||||||
438 619 477 642 bảy
|
|
||||||
487 118 501 139 3
|
|
||||||
15 35 60 54 STT
|
|
||||||
1021 119 1037 140 8
|
|
||||||
925 119 940 139 7
|
|
||||||
585 118 600 139 4
|
|
||||||
252 119 266 139 2
|
|
||||||
697 117 712 140 5
|
|
||||||
37 160 47 178 1
|
|
||||||
37 120 47 138 1
|
|
Before Width: | Height: | Size: 269 KiB |
@ -1,68 +0,0 @@
|
|||||||
624 54 678 75 (Unit)
|
|
||||||
1049 23 1121 45 THÀNH
|
|
||||||
119 129 157 145 thuê
|
|
||||||
667 22 717 44 TÍNH
|
|
||||||
120 203 157 220 thuê
|
|
||||||
308 22 360 49 HÓA
|
|
||||||
185 161 217 183 đầu
|
|
||||||
185 198 217 220 đầu
|
|
||||||
749 54 838 76 (Quantity)
|
|
||||||
119 166 157 182 thuê
|
|
||||||
937 54 989 75 Price)
|
|
||||||
85 202 115 220 Phí
|
|
||||||
28 53 75 75 (No.)
|
|
||||||
185 124 217 146 đầu
|
|
||||||
774 25 850 48 LƯỢNG
|
|
||||||
1067 54 1153 75 (Amount)
|
|
||||||
270 54 382 76 (Description)
|
|
||||||
85 166 115 183 Phí
|
|
||||||
221 202 252 220 kéo
|
|
||||||
1067 202 1199 221 360.000.000.00
|
|
||||||
84 128 115 146 Phí
|
|
||||||
738 22 770 45 SỐ
|
|
||||||
245 23 304 45 HÀNG
|
|
||||||
221 165 252 183 kéo
|
|
||||||
221 128 252 146 kéo
|
|
||||||
1125 23 1173 45 TIỀN
|
|
||||||
423 421 505 449 STOP
|
|
||||||
1068 165 1198 185 138.461.550,00
|
|
||||||
586 26 634 45 ĐƠN
|
|
||||||
892 202 1012 221 72.000.000.00
|
|
||||||
1077 129 1198 148 47.076.927,00
|
|
||||||
39 95 65 116 (1)
|
|
||||||
885 54 933 74 (Unit
|
|
||||||
33 26 73 46 STT
|
|
||||||
199 22 242 45 TÊN
|
|
||||||
891 128 1012 148 47.076.927,00
|
|
||||||
891 165 1013 184 69.230.775.00
|
|
||||||
318 420 391 449 ONE
|
|
||||||
637 26 662 48 VỊ
|
|
||||||
697 128 726 145 XE
|
|
||||||
639 94 664 115 (3)
|
|
||||||
364 27 421 49 DỊCH
|
|
||||||
160 169 182 183 xe
|
|
||||||
698 165 726 182 XE
|
|
||||||
160 132 182 146 xe
|
|
||||||
925 95 950 116 (5)
|
|
||||||
782 95 807 116 (4)
|
|
||||||
160 207 182 220 xe
|
|
||||||
46 165 58 183 2
|
|
||||||
892 26 940 45 ĐƠN
|
|
||||||
315 94 339 116 (2)
|
|
||||||
46 202 58 220 3
|
|
||||||
943 22 984 46 GIÁ
|
|
||||||
841 166 853 183 2
|
|
||||||
47 129 57 146 1
|
|
||||||
691 421 724 449 M
|
|
||||||
842 203 853 220 5
|
|
||||||
424 26 456 50 VỤ
|
|
||||||
843 129 852 146 1
|
|
||||||
580 420 680 451 VICES,
|
|
||||||
733 421 787 449 ULTI
|
|
||||||
698 203 727 220 XE
|
|
||||||
399 425 414 447 -
|
|
||||||
1078 95 1159 115 )=(4)X(5)
|
|
||||||
1061 96 1084 115 (6)
|
|
||||||
796 420 854 449 BEN
|
|
||||||
515 420 572 449 SER
|
|
||||||
862 421 936 449 EFITS
|
|
Before Width: | Height: | Size: 245 KiB |
@ -1,76 +0,0 @@
|
|||||||
590 592 641 614 Cộng
|
|
||||||
141 131 193 153 ngoài
|
|
||||||
234 131 286 153 tháng
|
|
||||||
374 23 417 44 dịch
|
|
||||||
759 22 818 44 lượng
|
|
||||||
272 22 321 44 hàng
|
|
||||||
327 23 369 43 hóa,
|
|
||||||
731 47 815 67 (Quantity)
|
|
||||||
646 586 682 610 tiền
|
|
||||||
686 591 732 614 hàng
|
|
||||||
589 711 640 739 Tổng
|
|
||||||
620 47 671 66 (Unit)
|
|
||||||
282 47 393 68 (Description)
|
|
||||||
117 588 184 613 chuyển
|
|
||||||
590 628 639 652 Thuế
|
|
||||||
291 130 367 150 08/2022
|
|
||||||
792 593 867 613 amount):
|
|
||||||
590 670 634 693 Tiền
|
|
||||||
638 670 680 693 thuế
|
|
||||||
35 46 79 67 (No.)
|
|
||||||
643 628 682 652 suất
|
|
||||||
728 17 755 40 Số
|
|
||||||
104 130 136 149 Phí
|
|
||||||
198 131 230 154 giờ
|
|
||||||
940 48 992 66 Price)
|
|
||||||
71 591 112 613 lòng
|
|
||||||
228 22 267 41 Tên
|
|
||||||
949 21 982 44 giá
|
|
||||||
687 633 748 651 GTGT
|
|
||||||
644 716 691 738 cộng
|
|
||||||
34 21 79 40 STT
|
|
||||||
1189 17 1229 41 tiền
|
|
||||||
589 23 632 40 Đơn
|
|
||||||
750 675 807 696 (VAT):
|
|
||||||
838 717 900 737 (Grand
|
|
||||||
694 711 731 735 tiền
|
|
||||||
661 22 701 40 tính
|
|
||||||
684 674 745 693 GTGT
|
|
||||||
1120 22 1184 40 Thành
|
|
||||||
737 593 788 612 (Total
|
|
||||||
98 616 164 636 transfer
|
|
||||||
735 716 787 734 thanh
|
|
||||||
31 616 93 635 (Please
|
|
||||||
902 22 945 40 Đơn
|
|
||||||
753 634 799 654 (VAT
|
|
||||||
319 591 377 613 chúng
|
|
||||||
792 715 833 734 toán
|
|
||||||
891 47 936 66 (Unit
|
|
||||||
1135 47 1212 67 (Amount)
|
|
||||||
801 634 849 654 rate):
|
|
||||||
30 591 66 609 Vui
|
|
||||||
905 717 961 736 Total):
|
|
||||||
1187 131 1290 149 57.753.850
|
|
||||||
257 591 315 610 khoản
|
|
||||||
636 22 657 44 vị
|
|
||||||
224 616 266 633 bank
|
|
||||||
189 591 224 610 vào
|
|
||||||
339 616 383 636 No.):
|
|
||||||
421 27 448 46 vụ
|
|
||||||
410 586 435 611 số
|
|
||||||
1201 674 1292 693 4.620.308
|
|
||||||
228 591 253 610 tài
|
|
||||||
1189 591 1292 610 57.753.850
|
|
||||||
1257 632 1292 652 8%
|
|
||||||
381 590 408 611 tôi
|
|
||||||
1189 715 1292 734 62.374.158
|
|
||||||
1140 94 1207 111 6=4x5
|
|
||||||
269 619 336 633 account
|
|
||||||
190 621 220 633 our
|
|
||||||
168 619 186 633 to
|
|
||||||
331 93 344 111 2
|
|
||||||
767 93 780 110 4
|
|
||||||
639 93 651 111 3
|
|
||||||
935 93 948 110 5
|
|
||||||
52 130 62 149 1
|
|
||||||
53 94 62 110 1
|
|
Before Width: | Height: | Size: 448 KiB |
@ -1,150 +0,0 @@
|
|||||||
677 742 742 776 Tổng
|
|
||||||
1413 28 1476 63 Tổng
|
|
||||||
590 104 660 129 (Unit)
|
|
||||||
814 748 900 775 (Total):
|
|
||||||
1431 104 1509 129 (Total)
|
|
||||||
748 747 807 776 cộng
|
|
||||||
72 797 118 827 tiền
|
|
||||||
174 797 233 832 bằng
|
|
||||||
727 50 796 78 lượng
|
|
||||||
199 202 284 235 chuyển
|
|
||||||
369 50 421 77 dịch
|
|
||||||
596 68 645 92 tính
|
|
||||||
292 207 409 235 (trucking)
|
|
||||||
1480 28 1528 58 tiền
|
|
||||||
1150 24 1213 53 Thuế
|
|
||||||
305 515 358 542 định
|
|
||||||
884 86 954 111 price)
|
|
||||||
427 360 547 388 clearance)
|
|
||||||
1405 69 1471 91 thanh
|
|
||||||
1240 71 1294 101 Tiền
|
|
||||||
123 797 170 827 viết
|
|
||||||
247 51 305 78 hàng
|
|
||||||
1069 805 1141 833 nghìn
|
|
||||||
688 43 722 73 Số
|
|
||||||
995 85 1102 111 (Amount)
|
|
||||||
584 33 636 58 Đơn
|
|
||||||
148 516 199 542 dịch
|
|
||||||
1061 43 1109 75 tiền
|
|
||||||
242 516 300 543 giám
|
|
||||||
148 259 207 286 nâng
|
|
||||||
271 258 327 285 (Lift
|
|
||||||
1274 515 1368 539 160.000
|
|
||||||
1477 69 1529 91 toán
|
|
||||||
311 50 362 77 hóa,
|
|
||||||
982 50 1056 73 Thành
|
|
||||||
993 309 1120 334 13.309.081
|
|
||||||
148 464 205 491 giám
|
|
||||||
118 749 238 776 (Exchange
|
|
||||||
73 747 112 777 giá
|
|
||||||
253 86 404 111 (Description)
|
|
||||||
148 207 192 234 vận
|
|
||||||
148 412 199 439 dịch
|
|
||||||
820 85 880 110 (Unit
|
|
||||||
1413 799 1484 835 đồng.
|
|
||||||
1249 206 1368 230 4.055.959
|
|
||||||
1252 258 1367 281 1.973.455
|
|
||||||
1206 805 1269 828 trăm
|
|
||||||
101 206 143 230 Phí
|
|
||||||
689 85 805 111 (Quantity)
|
|
||||||
333 259 374 286 on)
|
|
||||||
304 309 351 337 off)
|
|
||||||
329 805 412 830 words):
|
|
||||||
832 805 893 828 trăm
|
|
||||||
990 257 1123 282 24.668.174
|
|
||||||
837 50 889 73 Đơn
|
|
||||||
101 258 143 281 Phí
|
|
||||||
149 361 199 384 khai
|
|
||||||
1273 463 1368 489 576.000
|
|
||||||
243 310 299 337 (Lift
|
|
||||||
33 797 67 827 Số
|
|
||||||
1432 515 1551 540 2.160.000
|
|
||||||
102 515 142 539 Phí
|
|
||||||
238 803 284 827 chữ
|
|
||||||
244 750 308 775 rate):
|
|
||||||
1005 514 1124 540 2.000.000
|
|
||||||
103 563 156 585 SEA
|
|
||||||
1434 463 1551 488 7.776.000
|
|
||||||
195 51 241 74 Tên
|
|
||||||
101 361 142 384 Phí
|
|
||||||
1298 72 1350 101 thuế
|
|
||||||
974 746 1117 771 113.180.694
|
|
||||||
1281 112 1372 137 Amount)
|
|
||||||
386 418 443 440 sung
|
|
||||||
1418 258 1550 281 26.641.629
|
|
||||||
991 206 1122 231 50.699.472
|
|
||||||
1432 156 1509 179 9=6+8
|
|
||||||
1434 412 1551 436 7.560.000
|
|
||||||
101 412 143 436 Phí
|
|
||||||
894 49 933 79 giá
|
|
||||||
1011 155 1085 179 6=4x5
|
|
||||||
314 362 420 388 (customs
|
|
||||||
102 309 143 333 Phí
|
|
||||||
1007 412 1123 437 7.000.000
|
|
||||||
34 747 69 777 Tỷ
|
|
||||||
1152 205 1194 231 8%
|
|
||||||
1419 206 1548 230 54.755.431
|
|
||||||
101 463 142 488 Phí
|
|
||||||
1152 256 1194 282 8%
|
|
||||||
1253 309 1367 333 1.064.725
|
|
||||||
1153 309 1194 334 8%
|
|
||||||
715 805 777 832 triệu
|
|
||||||
1005 360 1123 385 8.303.967
|
|
||||||
206 361 244 385 hải
|
|
||||||
1273 412 1369 436 560.000
|
|
||||||
1273 361 1368 384 664.317
|
|
||||||
1153 361 1194 385 8%
|
|
||||||
349 407 381 437 bổ
|
|
||||||
594 806 661 828 mươi
|
|
||||||
1276 805 1333 828 năm
|
|
||||||
1153 464 1193 488 8%
|
|
||||||
214 262 265 281 cont
|
|
||||||
242 412 280 437 hải
|
|
||||||
1220 29 1360 57 GTGT(VAT)
|
|
||||||
940 806 1006 828 mươi
|
|
||||||
477 805 539 828 trăm
|
|
||||||
1153 515 1194 539 8%
|
|
||||||
1422 309 1550 332 14.373.806
|
|
||||||
1006 463 1123 489 7.200.000
|
|
||||||
899 805 933 828 ba
|
|
||||||
1153 91 1192 117 TS
|
|
||||||
250 367 307 388 quan
|
|
||||||
1013 804 1061 828 lăm
|
|
||||||
1434 361 1551 384 8.968.284
|
|
||||||
639 32 667 62 vị
|
|
||||||
1340 806 1407 829 mươi
|
|
||||||
1149 805 1200 831 một
|
|
||||||
289 804 323 831 (In
|
|
||||||
1395 746 1540 771 122.235.150
|
|
||||||
1231 747 1350 771 9.054.456
|
|
||||||
784 805 826 828 hai
|
|
||||||
1153 413 1194 437 8%
|
|
||||||
545 804 588 829 hai
|
|
||||||
148 310 179 337 hạ
|
|
||||||
286 419 343 440 quan
|
|
||||||
667 805 709 829 hai
|
|
||||||
101 595 262 624 >07/08/2022)
|
|
||||||
1217 111 1277 135 (VAT
|
|
||||||
185 314 237 333 cont
|
|
||||||
204 521 236 543 vụ
|
|
||||||
419 805 471 832 Một
|
|
||||||
28 85 88 110 (No.)
|
|
||||||
211 464 269 491 định
|
|
||||||
165 561 447 589 INBOUND-(01/08/2022
|
|
||||||
427 55 459 78 vụ
|
|
||||||
615 155 633 180 3
|
|
||||||
737 154 757 179 4
|
|
||||||
31 49 86 74 STT
|
|
||||||
206 418 237 440 vụ
|
|
||||||
50 463 68 488 6
|
|
||||||
1285 154 1304 180 8
|
|
||||||
49 309 66 334 3
|
|
||||||
1165 155 1182 180 7
|
|
||||||
49 257 67 281 2
|
|
||||||
49 360 68 385 4
|
|
||||||
50 412 67 437 5
|
|
||||||
51 515 67 539 7
|
|
||||||
878 154 896 180 5
|
|
||||||
52 156 65 178 1
|
|
||||||
52 206 64 229 1
|
|
||||||
321 155 339 179 2
|
|
@ -1,948 +0,0 @@
|
|||||||
# Ultralytics YOLO 🚀, GPL-3.0 license
|
|
||||||
"""
|
|
||||||
Model validation metrics
|
|
||||||
"""
|
|
||||||
import math
|
|
||||||
import warnings
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
import numpy as np
|
|
||||||
import torch
|
|
||||||
import torch.nn as nn
|
|
||||||
|
|
||||||
from sklearn.metrics import confusion_matrix
|
|
||||||
|
|
||||||
# boxes
|
|
||||||
def box_area(box):
|
|
||||||
# box = xyxy(4,n)
|
|
||||||
return (box[2] - box[0]) * (box[3] - box[1])
|
|
||||||
|
|
||||||
|
|
||||||
def bbox_ioa(box1, box2, eps=1e-7):
|
|
||||||
"""Returns the intersection over box2 area given box1, box2. Boxes are x1y1x2y2
|
|
||||||
box1: np.array of shape(nx4)
|
|
||||||
box2: np.array of shape(mx4)
|
|
||||||
returns: np.array of shape(nxm)
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Get the coordinates of bounding boxes
|
|
||||||
b1_x1, b1_y1, b1_x2, b1_y2 = box1.T
|
|
||||||
b2_x1, b2_y1, b2_x2, b2_y2 = box2.T
|
|
||||||
|
|
||||||
# Intersection area
|
|
||||||
inter_area = (
|
|
||||||
np.minimum(b1_x2[:, None], b2_x2) - np.maximum(b1_x1[:, None], b2_x1)
|
|
||||||
).clip(0) * (
|
|
||||||
np.minimum(b1_y2[:, None], b2_y2) - np.maximum(b1_y1[:, None], b2_y1)
|
|
||||||
).clip(
|
|
||||||
0
|
|
||||||
)
|
|
||||||
|
|
||||||
# box2 area
|
|
||||||
box2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1) + eps
|
|
||||||
|
|
||||||
# Intersection over box2 area
|
|
||||||
return inter_area / box2_area
|
|
||||||
|
|
||||||
|
|
||||||
def box_iou(box1, box2, eps=1e-7):
|
|
||||||
# https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py
|
|
||||||
"""
|
|
||||||
Return intersection-over-union (Jaccard index) of boxes.
|
|
||||||
Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
|
|
||||||
Arguments:
|
|
||||||
box1 (Tensor[N, 4])
|
|
||||||
box2 (Tensor[M, 4])
|
|
||||||
Returns:
|
|
||||||
iou (Tensor[N, M]): the NxM matrix containing the pairwise
|
|
||||||
IoU values for every element in boxes1 and boxes2
|
|
||||||
"""
|
|
||||||
|
|
||||||
# inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2)
|
|
||||||
(a1, a2), (b1, b2) = box1.unsqueeze(1).chunk(2, 2), box2.unsqueeze(0).chunk(2, 2)
|
|
||||||
inter = (torch.min(a2, b2) - torch.max(a1, b1)).clamp(0).prod(2)
|
|
||||||
|
|
||||||
# IoU = inter / (area1 + area2 - inter)
|
|
||||||
return inter / ((a2 - a1).prod(2) + (b2 - b1).prod(2) - inter + eps)
|
|
||||||
|
|
||||||
|
|
||||||
def bbox_iou(box1, box2, xywh=True, GIoU=False, DIoU=False, CIoU=False, eps=1e-7):
|
|
||||||
# Returns Intersection over Union (IoU) of box1(1,4) to box2(n,4)
|
|
||||||
|
|
||||||
# Get the coordinates of bounding boxes
|
|
||||||
if xywh: # transform from xywh to xyxy
|
|
||||||
(x1, y1, w1, h1), (x2, y2, w2, h2) = box1.chunk(4, -1), box2.chunk(4, -1)
|
|
||||||
w1_, h1_, w2_, h2_ = w1 / 2, h1 / 2, w2 / 2, h2 / 2
|
|
||||||
b1_x1, b1_x2, b1_y1, b1_y2 = x1 - w1_, x1 + w1_, y1 - h1_, y1 + h1_
|
|
||||||
b2_x1, b2_x2, b2_y1, b2_y2 = x2 - w2_, x2 + w2_, y2 - h2_, y2 + h2_
|
|
||||||
else: # x1, y1, x2, y2 = box1
|
|
||||||
b1_x1, b1_y1, b1_x2, b1_y2 = box1.chunk(4, -1)
|
|
||||||
b2_x1, b2_y1, b2_x2, b2_y2 = box2.chunk(4, -1)
|
|
||||||
w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
|
|
||||||
w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps
|
|
||||||
|
|
||||||
# Intersection area
|
|
||||||
inter = (b1_x2.minimum(b2_x2) - b1_x1.maximum(b2_x1)).clamp(0) * (
|
|
||||||
b1_y2.minimum(b2_y2) - b1_y1.maximum(b2_y1)
|
|
||||||
).clamp(0)
|
|
||||||
|
|
||||||
# Union Area
|
|
||||||
union = w1 * h1 + w2 * h2 - inter + eps
|
|
||||||
|
|
||||||
# IoU
|
|
||||||
iou = inter / union
|
|
||||||
if CIoU or DIoU or GIoU:
|
|
||||||
cw = b1_x2.maximum(b2_x2) - b1_x1.minimum(
|
|
||||||
b2_x1
|
|
||||||
) # convex (smallest enclosing box) width
|
|
||||||
ch = b1_y2.maximum(b2_y2) - b1_y1.minimum(b2_y1) # convex height
|
|
||||||
if CIoU or DIoU: # Distance or Complete IoU https://arxiv.org/abs/1911.08287v1
|
|
||||||
c2 = cw**2 + ch**2 + eps # convex diagonal squared
|
|
||||||
rho2 = (
|
|
||||||
(b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2
|
|
||||||
+ (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2
|
|
||||||
) / 4 # center dist ** 2
|
|
||||||
if (
|
|
||||||
CIoU
|
|
||||||
): # https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/box/box_utils.py#L47
|
|
||||||
v = (4 / math.pi**2) * (
|
|
||||||
torch.atan(w2 / h2) - torch.atan(w1 / h1)
|
|
||||||
).pow(2)
|
|
||||||
with torch.no_grad():
|
|
||||||
alpha = v / (v - iou + (1 + eps))
|
|
||||||
return iou - (rho2 / c2 + v * alpha) # CIoU
|
|
||||||
return iou - rho2 / c2 # DIoU
|
|
||||||
c_area = cw * ch + eps # convex area
|
|
||||||
return (
|
|
||||||
iou - (c_area - union) / c_area
|
|
||||||
) # GIoU https://arxiv.org/pdf/1902.09630.pdf
|
|
||||||
return iou # IoU
|
|
||||||
|
|
||||||
|
|
||||||
def mask_iou(mask1, mask2, eps=1e-7):
|
|
||||||
"""
|
|
||||||
mask1: [N, n] m1 means number of predicted objects
|
|
||||||
mask2: [M, n] m2 means number of gt objects
|
|
||||||
Note: n means image_w x image_h
|
|
||||||
return: masks iou, [N, M]
|
|
||||||
"""
|
|
||||||
intersection = torch.matmul(mask1, mask2.t()).clamp(0)
|
|
||||||
union = (
|
|
||||||
mask1.sum(1)[:, None] + mask2.sum(1)[None]
|
|
||||||
) - intersection # (area1 + area2) - intersection
|
|
||||||
return intersection / (union + eps)
|
|
||||||
|
|
||||||
|
|
||||||
def masks_iou(mask1, mask2, eps=1e-7):
|
|
||||||
"""
|
|
||||||
mask1: [N, n] m1 means number of predicted objects
|
|
||||||
mask2: [N, n] m2 means number of gt objects
|
|
||||||
Note: n means image_w x image_h
|
|
||||||
return: masks iou, (N, )
|
|
||||||
"""
|
|
||||||
intersection = (mask1 * mask2).sum(1).clamp(0) # (N, )
|
|
||||||
union = (mask1.sum(1) + mask2.sum(1))[
|
|
||||||
None
|
|
||||||
] - intersection # (area1 + area2) - intersection
|
|
||||||
return intersection / (union + eps)
|
|
||||||
|
|
||||||
|
|
||||||
def smooth_BCE(
|
|
||||||
eps=0.1,
|
|
||||||
): # https://github.com/ultralytics/yolov3/issues/238#issuecomment-598028441
|
|
||||||
# return positive, negative label smoothing BCE targets
|
|
||||||
return 1.0 - 0.5 * eps, 0.5 * eps
|
|
||||||
|
|
||||||
|
|
||||||
# losses
|
|
||||||
class FocalLoss(nn.Module):
|
|
||||||
# Wraps focal loss around existing loss_fcn(), i.e. criteria = FocalLoss(nn.BCEWithLogitsLoss(), gamma=1.5)
|
|
||||||
def __init__(self, loss_fcn, gamma=1.5, alpha=0.25):
|
|
||||||
super().__init__()
|
|
||||||
self.loss_fcn = loss_fcn # must be nn.BCEWithLogitsLoss()
|
|
||||||
self.gamma = gamma
|
|
||||||
self.alpha = alpha
|
|
||||||
self.reduction = loss_fcn.reduction
|
|
||||||
self.loss_fcn.reduction = "none" # required to apply FL to each element
|
|
||||||
|
|
||||||
def forward(self, pred, true):
|
|
||||||
loss = self.loss_fcn(pred, true)
|
|
||||||
# p_t = torch.exp(-loss)
|
|
||||||
# loss *= self.alpha * (1.000001 - p_t) ** self.gamma # non-zero power for gradient stability
|
|
||||||
|
|
||||||
# TF implementation https://github.com/tensorflow/addons/blob/v0.7.1/tensorflow_addons/losses/focal_loss.py
|
|
||||||
pred_prob = torch.sigmoid(pred) # prob from logits
|
|
||||||
p_t = true * pred_prob + (1 - true) * (1 - pred_prob)
|
|
||||||
alpha_factor = true * self.alpha + (1 - true) * (1 - self.alpha)
|
|
||||||
modulating_factor = (1.0 - p_t) ** self.gamma
|
|
||||||
loss *= alpha_factor * modulating_factor
|
|
||||||
|
|
||||||
if self.reduction == "mean":
|
|
||||||
return loss.mean()
|
|
||||||
elif self.reduction == "sum":
|
|
||||||
return loss.sum()
|
|
||||||
else: # 'none'
|
|
||||||
return loss
|
|
||||||
|
|
||||||
|
|
||||||
class ConfusionMatrix:
|
|
||||||
# Updated version of https://github.com/kaanakan/object_detection_confusion_matrix
|
|
||||||
def __init__(self, nc, conf=0.25, iou_thres=0.45):
|
|
||||||
self.matrix = np.zeros((nc + 1, nc + 1))
|
|
||||||
self.nc = nc # number of classes
|
|
||||||
self.conf = conf
|
|
||||||
self.iou_thres = iou_thres
|
|
||||||
|
|
||||||
def process_batch(self, detections, labels):
|
|
||||||
"""
|
|
||||||
Return intersection-over-union (Jaccard index) of boxes.
|
|
||||||
Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
|
|
||||||
Arguments:
|
|
||||||
detections (Array[N, 6]), x1, y1, x2, y2, conf, class
|
|
||||||
labels (Array[M, 5]), class, x1, y1, x2, y2
|
|
||||||
Returns:
|
|
||||||
None, updates confusion matrix accordingly
|
|
||||||
"""
|
|
||||||
if detections is None:
|
|
||||||
gt_classes = labels.int()
|
|
||||||
for gc in gt_classes:
|
|
||||||
self.matrix[self.nc, gc] += 1 # background FN
|
|
||||||
return
|
|
||||||
|
|
||||||
detections = detections[detections[:, 4] > self.conf]
|
|
||||||
gt_classes = labels[:, 0].int()
|
|
||||||
detection_classes = detections[:, 5].int()
|
|
||||||
iou = box_iou(labels[:, 1:], detections[:, :4])
|
|
||||||
|
|
||||||
x = torch.where(iou > self.iou_thres)
|
|
||||||
if x[0].shape[0]:
|
|
||||||
matches = (
|
|
||||||
torch.cat((torch.stack(x, 1), iou[x[0], x[1]][:, None]), 1)
|
|
||||||
.cpu()
|
|
||||||
.numpy()
|
|
||||||
)
|
|
||||||
if x[0].shape[0] > 1:
|
|
||||||
matches = matches[matches[:, 2].argsort()[::-1]]
|
|
||||||
matches = matches[np.unique(matches[:, 1], return_index=True)[1]]
|
|
||||||
matches = matches[matches[:, 2].argsort()[::-1]]
|
|
||||||
matches = matches[np.unique(matches[:, 0], return_index=True)[1]]
|
|
||||||
else:
|
|
||||||
matches = np.zeros((0, 3))
|
|
||||||
|
|
||||||
n = matches.shape[0] > 0
|
|
||||||
m0, m1, _ = matches.transpose().astype(int)
|
|
||||||
for i, gc in enumerate(gt_classes):
|
|
||||||
j = m0 == i
|
|
||||||
if n and sum(j) == 1:
|
|
||||||
self.matrix[detection_classes[m1[j]], gc] += 1 # correct
|
|
||||||
else:
|
|
||||||
self.matrix[self.nc, gc] += 1 # true background
|
|
||||||
|
|
||||||
if n:
|
|
||||||
for i, dc in enumerate(detection_classes):
|
|
||||||
if not any(m1 == i):
|
|
||||||
self.matrix[dc, self.nc] += 1 # predicted background
|
|
||||||
|
|
||||||
def matrix(self):
|
|
||||||
return self.matrix
|
|
||||||
|
|
||||||
def tp_fp(self):
|
|
||||||
tp = self.matrix.diagonal() # true positives
|
|
||||||
fp = self.matrix.sum(1) - tp # false positives
|
|
||||||
# fn = self.matrix.sum(0) - tp # false negatives (missed detections)
|
|
||||||
return tp[:-1], fp[:-1] # remove background class
|
|
||||||
|
|
||||||
# @TryExcept("WARNING ⚠️ ConfusionMatrix plot failure")
|
|
||||||
def plot(self, normalize=True, save_dir="", names=()):
|
|
||||||
import seaborn as sn
|
|
||||||
|
|
||||||
array = self.matrix / (
|
|
||||||
(self.matrix.sum(0).reshape(1, -1) + 1e-9) if normalize else 1
|
|
||||||
) # normalize columns
|
|
||||||
array[array < 0.005] = np.nan # don't annotate (would appear as 0.00)
|
|
||||||
|
|
||||||
fig, ax = plt.subplots(1, 1, figsize=(12, 9), tight_layout=True)
|
|
||||||
nc, nn = self.nc, len(names) # number of classes, names
|
|
||||||
sn.set(font_scale=1.0 if nc < 50 else 0.8) # for label size
|
|
||||||
labels = (0 < nn < 99) and (nn == nc) # apply names to ticklabels
|
|
||||||
ticklabels = (names + ["background"]) if labels else "auto"
|
|
||||||
with warnings.catch_warnings():
|
|
||||||
warnings.simplefilter(
|
|
||||||
"ignore"
|
|
||||||
) # suppress empty matrix RuntimeWarning: All-NaN slice encountered
|
|
||||||
sn.heatmap(
|
|
||||||
array,
|
|
||||||
ax=ax,
|
|
||||||
annot=nc < 30,
|
|
||||||
annot_kws={"size": 8},
|
|
||||||
cmap="Blues",
|
|
||||||
fmt=".2f",
|
|
||||||
square=True,
|
|
||||||
vmin=0.0,
|
|
||||||
xticklabels=ticklabels,
|
|
||||||
yticklabels=ticklabels,
|
|
||||||
).set_facecolor((1, 1, 1))
|
|
||||||
ax.set_xlabel("True")
|
|
||||||
ax.set_ylabel("Predicted")
|
|
||||||
ax.set_title("Confusion Matrix")
|
|
||||||
fig.savefig(Path(save_dir) / "confusion_matrix.png", dpi=250)
|
|
||||||
plt.close(fig)
|
|
||||||
|
|
||||||
def print(self):
|
|
||||||
for i in range(self.nc + 1):
|
|
||||||
print(" ".join(map(str, self.matrix[i])))
|
|
||||||
|
|
||||||
|
|
||||||
def smooth(y, f=0.05):
|
|
||||||
# Box filter of fraction f
|
|
||||||
nf = round(len(y) * f * 2) // 2 + 1 # number of filter elements (must be odd)
|
|
||||||
p = np.ones(nf // 2) # ones padding
|
|
||||||
yp = np.concatenate((p * y[0], y, p * y[-1]), 0) # y padded
|
|
||||||
return np.convolve(yp, np.ones(nf) / nf, mode="valid") # y-smoothed
|
|
||||||
|
|
||||||
|
|
||||||
def plot_pr_curve(px, py, ap, save_dir=Path("pr_curve.png"), names=()):
|
|
||||||
# Precision-recall curve
|
|
||||||
fig, ax = plt.subplots(1, 1, figsize=(9, 6), tight_layout=True)
|
|
||||||
py = np.stack(py, axis=1)
|
|
||||||
|
|
||||||
if 0 < len(names) < 21: # display per-class legend if < 21 classes
|
|
||||||
for i, y in enumerate(py.T):
|
|
||||||
ax.plot(
|
|
||||||
px, y, linewidth=1, label=f"{names[i]} {ap[i, 0]:.3f}"
|
|
||||||
) # plot(recall, precision)
|
|
||||||
else:
|
|
||||||
ax.plot(px, py, linewidth=1, color="grey") # plot(recall, precision)
|
|
||||||
|
|
||||||
ax.plot(
|
|
||||||
px,
|
|
||||||
py.mean(1),
|
|
||||||
linewidth=3,
|
|
||||||
color="blue",
|
|
||||||
label="all classes %.3f mAP@0.5" % ap[:, 0].mean(),
|
|
||||||
)
|
|
||||||
ax.set_xlabel("Recall")
|
|
||||||
ax.set_ylabel("Precision")
|
|
||||||
ax.set_xlim(0, 1)
|
|
||||||
ax.set_ylim(0, 1)
|
|
||||||
ax.legend(bbox_to_anchor=(1.04, 1), loc="upper left")
|
|
||||||
ax.set_title("Precision-Recall Curve")
|
|
||||||
fig.savefig(save_dir, dpi=250)
|
|
||||||
plt.close(fig)
|
|
||||||
|
|
||||||
|
|
||||||
def plot_mc_curve(
|
|
||||||
px,
|
|
||||||
py,
|
|
||||||
save_dir=Path("mc_curve.png"),
|
|
||||||
names=(),
|
|
||||||
xlabel="Confidence",
|
|
||||||
ylabel="Metric",
|
|
||||||
):
|
|
||||||
# Metric-confidence curve
|
|
||||||
fig, ax = plt.subplots(1, 1, figsize=(9, 6), tight_layout=True)
|
|
||||||
|
|
||||||
if 0 < len(names) < 21: # display per-class legend if < 21 classes
|
|
||||||
for i, y in enumerate(py):
|
|
||||||
ax.plot(px, y, linewidth=1, label=f"{names[i]}") # plot(confidence, metric)
|
|
||||||
else:
|
|
||||||
ax.plot(px, py.T, linewidth=1, color="grey") # plot(confidence, metric)
|
|
||||||
|
|
||||||
y = smooth(py.mean(0), 0.05)
|
|
||||||
ax.plot(
|
|
||||||
px,
|
|
||||||
y,
|
|
||||||
linewidth=3,
|
|
||||||
color="blue",
|
|
||||||
label=f"all classes {y.max():.2f} at {px[y.argmax()]:.3f}",
|
|
||||||
)
|
|
||||||
ax.set_xlabel(xlabel)
|
|
||||||
ax.set_ylabel(ylabel)
|
|
||||||
ax.set_xlim(0, 1)
|
|
||||||
ax.set_ylim(0, 1)
|
|
||||||
ax.legend(bbox_to_anchor=(1.04, 1), loc="upper left")
|
|
||||||
ax.set_title(f"{ylabel}-Confidence Curve")
|
|
||||||
fig.savefig(save_dir, dpi=250)
|
|
||||||
plt.close(fig)
|
|
||||||
|
|
||||||
|
|
||||||
def compute_ap(recall, precision):
|
|
||||||
"""Compute the average precision, given the recall and precision curves
|
|
||||||
# Arguments
|
|
||||||
recall: The recall curve (list)
|
|
||||||
precision: The precision curve (list)
|
|
||||||
# Returns
|
|
||||||
Average precision, precision curve, recall curve
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Append sentinel values to beginning and end
|
|
||||||
mrec = np.concatenate(([0.0], recall, [1.0]))
|
|
||||||
mpre = np.concatenate(([1.0], precision, [0.0]))
|
|
||||||
|
|
||||||
# Compute the precision envelope
|
|
||||||
mpre = np.flip(np.maximum.accumulate(np.flip(mpre)))
|
|
||||||
|
|
||||||
# Integrate area under curve
|
|
||||||
method = "interp" # methods: 'continuous', 'interp'
|
|
||||||
if method == "interp":
|
|
||||||
x = np.linspace(0, 1, 101) # 101-point interp (COCO)
|
|
||||||
ap = np.trapz(np.interp(x, mrec, mpre), x) # integrate
|
|
||||||
else: # 'continuous'
|
|
||||||
i = np.where(mrec[1:] != mrec[:-1])[0] # points where x-axis (recall) changes
|
|
||||||
ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) # area under curve
|
|
||||||
|
|
||||||
return ap, mpre, mrec
|
|
||||||
|
|
||||||
|
|
||||||
def ap_per_class(
|
|
||||||
tp,
|
|
||||||
conf,
|
|
||||||
pred_cls,
|
|
||||||
target_cls,
|
|
||||||
plot=False,
|
|
||||||
save_dir=Path(),
|
|
||||||
names=(),
|
|
||||||
eps=1e-16,
|
|
||||||
prefix="",
|
|
||||||
):
|
|
||||||
"""Compute the average precision, given the recall and precision curves.
|
|
||||||
Source: https://github.com/rafaelpadilla/Object-Detection-Metrics.
|
|
||||||
# Arguments
|
|
||||||
tp: True positives (nparray, nx1 or nx10).
|
|
||||||
conf: Objectness value from 0-1 (nparray).
|
|
||||||
pred_cls: Predicted object classes (nparray).
|
|
||||||
target_cls: True object classes (nparray).
|
|
||||||
plot: Plot precision-recall curve at mAP@0.5
|
|
||||||
save_dir: Plot save directory
|
|
||||||
# Returns
|
|
||||||
The average precision as computed in py-faster-rcnn.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Sort by objectness
|
|
||||||
i = np.argsort(-conf)
|
|
||||||
tp, conf, pred_cls = tp[i], conf[i], pred_cls[i]
|
|
||||||
|
|
||||||
# Find unique classes
|
|
||||||
unique_classes, nt = np.unique(target_cls, return_counts=True)
|
|
||||||
nc = unique_classes.shape[0] # number of classes, number of detections
|
|
||||||
|
|
||||||
# Create Precision-Recall curve and compute AP for each class
|
|
||||||
px, py = np.linspace(0, 1, 1000), [] # for plotting
|
|
||||||
ap, p, r = np.zeros((nc, tp.shape[1])), np.zeros((nc, 1000)), np.zeros((nc, 1000))
|
|
||||||
for ci, c in enumerate(unique_classes):
|
|
||||||
i = pred_cls == c
|
|
||||||
n_l = nt[ci] # number of labels
|
|
||||||
n_p = i.sum() # number of predictions
|
|
||||||
if n_p == 0 or n_l == 0:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Accumulate FPs and TPs
|
|
||||||
fpc = (1 - tp[i]).cumsum(0)
|
|
||||||
tpc = tp[i].cumsum(0)
|
|
||||||
|
|
||||||
# Recall
|
|
||||||
recall = tpc / (n_l + eps) # recall curve
|
|
||||||
r[ci] = np.interp(
|
|
||||||
-px, -conf[i], recall[:, 0], left=0
|
|
||||||
) # negative x, xp because xp decreases
|
|
||||||
|
|
||||||
# Precision
|
|
||||||
precision = tpc / (tpc + fpc) # precision curve
|
|
||||||
p[ci] = np.interp(-px, -conf[i], precision[:, 0], left=1) # p at pr_score
|
|
||||||
|
|
||||||
# AP from recall-precision curve
|
|
||||||
for j in range(tp.shape[1]):
|
|
||||||
ap[ci, j], mpre, mrec = compute_ap(recall[:, j], precision[:, j])
|
|
||||||
if plot and j == 0:
|
|
||||||
py.append(np.interp(px, mrec, mpre)) # precision at mAP@0.5
|
|
||||||
|
|
||||||
# from IPython import embed; embed()
|
|
||||||
# Compute F1 (harmonic mean of precision and recall)
|
|
||||||
f1 = 2 * p * r / (p + r + eps)
|
|
||||||
names = [
|
|
||||||
v for k, v in names.items() if k in unique_classes
|
|
||||||
] # list: only classes that have data
|
|
||||||
names = dict(enumerate(names)) # to dict
|
|
||||||
if plot:
|
|
||||||
plot_pr_curve(px, py, ap, save_dir / f"{prefix}PR_curve.png", names)
|
|
||||||
plot_mc_curve(px, f1, save_dir / f"{prefix}F1_curve.png", names, ylabel="F1")
|
|
||||||
plot_mc_curve(
|
|
||||||
px, p, save_dir / f"{prefix}P_curve.png", names, ylabel="Precision"
|
|
||||||
)
|
|
||||||
plot_mc_curve(px, r, save_dir / f"{prefix}R_curve.png", names, ylabel="Recall")
|
|
||||||
|
|
||||||
i = smooth(f1.mean(0), 0.1).argmax() # max F1 index
|
|
||||||
p, r, f1 = p[:, i], r[:, i], f1[:, i]
|
|
||||||
tp = (r * nt).round() # true positives
|
|
||||||
fp = (tp / (p + eps) - tp).round() # false positives
|
|
||||||
return tp, fp, p, r, f1, ap, unique_classes.astype(int)
|
|
||||||
|
|
||||||
|
|
||||||
class Metric:
|
|
||||||
def __init__(self) -> None:
|
|
||||||
self.p = [] # (nc, )
|
|
||||||
self.r = [] # (nc, )
|
|
||||||
self.f1 = [] # (nc, )
|
|
||||||
self.all_ap = [] # (nc, 10)
|
|
||||||
self.ap_class_index = [] # (nc, )
|
|
||||||
self.nc = 0
|
|
||||||
|
|
||||||
@property
|
|
||||||
def ap50(self):
|
|
||||||
"""AP@0.5 of all classes.
|
|
||||||
Return:
|
|
||||||
(nc, ) or [].
|
|
||||||
"""
|
|
||||||
return self.all_ap[:, 0] if len(self.all_ap) else []
|
|
||||||
|
|
||||||
@property
|
|
||||||
def ap(self):
|
|
||||||
"""AP@0.5:0.95
|
|
||||||
Return:
|
|
||||||
(nc, ) or [].
|
|
||||||
"""
|
|
||||||
return self.all_ap.mean(1) if len(self.all_ap) else []
|
|
||||||
|
|
||||||
@property
|
|
||||||
def mp(self):
|
|
||||||
"""mean precision of all classes.
|
|
||||||
Return:
|
|
||||||
float.
|
|
||||||
"""
|
|
||||||
return self.p.mean() if len(self.p) else 0.0
|
|
||||||
|
|
||||||
@property
|
|
||||||
def mr(self):
|
|
||||||
"""mean recall of all classes.
|
|
||||||
Return:
|
|
||||||
float.
|
|
||||||
"""
|
|
||||||
return self.r.mean() if len(self.r) else 0.0
|
|
||||||
|
|
||||||
@property
|
|
||||||
def map50(self):
|
|
||||||
"""Mean AP@0.5 of all classes.
|
|
||||||
Return:
|
|
||||||
float.
|
|
||||||
"""
|
|
||||||
return self.all_ap[:, 0].mean() if len(self.all_ap) else 0.0
|
|
||||||
|
|
||||||
@property
|
|
||||||
def map75(self):
|
|
||||||
"""Mean AP@0.75 of all classes.
|
|
||||||
Return:
|
|
||||||
float.
|
|
||||||
"""
|
|
||||||
return self.all_ap[:, 5].mean() if len(self.all_ap) else 0.0
|
|
||||||
|
|
||||||
@property
|
|
||||||
def map(self):
|
|
||||||
"""Mean AP@0.5:0.95 of all classes.
|
|
||||||
Return:
|
|
||||||
float.
|
|
||||||
"""
|
|
||||||
return self.all_ap.mean() if len(self.all_ap) else 0.0
|
|
||||||
|
|
||||||
def mean_results(self):
|
|
||||||
"""Mean of results, return mp, mr, map50, map"""
|
|
||||||
return [self.mp, self.mr, self.map50, self.map]
|
|
||||||
|
|
||||||
def class_result(self, i):
|
|
||||||
"""class-aware result, return p[i], r[i], ap50[i], ap[i]"""
|
|
||||||
return self.p[i], self.r[i], self.ap50[i], self.ap[i]
|
|
||||||
|
|
||||||
@property
|
|
||||||
def maps(self):
|
|
||||||
"""mAP of each class"""
|
|
||||||
maps = np.zeros(self.nc) + self.map
|
|
||||||
for i, c in enumerate(self.ap_class_index):
|
|
||||||
maps[c] = self.ap[i]
|
|
||||||
return maps
|
|
||||||
|
|
||||||
def fitness(self):
|
|
||||||
# Model fitness as a weighted combination of metrics
|
|
||||||
w = [0.0, 0.0, 0.1, 0.9] # weights for [P, R, mAP@0.5, mAP@0.5:0.95]
|
|
||||||
return (np.array(self.mean_results()) * w).sum()
|
|
||||||
|
|
||||||
def update(self, results):
|
|
||||||
"""
|
|
||||||
Args:
|
|
||||||
results: tuple(p, r, ap, f1, ap_class)
|
|
||||||
"""
|
|
||||||
self.p, self.r, self.f1, self.all_ap, self.ap_class_index = results
|
|
||||||
|
|
||||||
|
|
||||||
class DetMetrics:
|
|
||||||
def __init__(self, save_dir=Path("."), plot=False, names=()) -> None:
|
|
||||||
self.save_dir = save_dir
|
|
||||||
self.plot = plot
|
|
||||||
self.names = names
|
|
||||||
self.box = Metric()
|
|
||||||
self.speed = {
|
|
||||||
"preprocess": 0.0,
|
|
||||||
"inference": 0.0,
|
|
||||||
"loss": 0.0,
|
|
||||||
"postprocess": 0.0,
|
|
||||||
}
|
|
||||||
self.probs = {}
|
|
||||||
self.tp, self.fp = [], []
|
|
||||||
|
|
||||||
def process(self, tp, conf, pred_cls, target_cls):
|
|
||||||
|
|
||||||
if len(conf) > 0:
|
|
||||||
for cls_id in range(len(self.names)):
|
|
||||||
conf_with_cls = conf[np.where(pred_cls == cls_id)]
|
|
||||||
if len(conf_with_cls) > 0:
|
|
||||||
highest_prob = conf_with_cls.max()
|
|
||||||
self.probs[self.names[cls_id]] = [highest_prob]
|
|
||||||
|
|
||||||
results = ap_per_class(
|
|
||||||
tp,
|
|
||||||
conf,
|
|
||||||
pred_cls,
|
|
||||||
target_cls,
|
|
||||||
plot=self.plot,
|
|
||||||
save_dir=self.save_dir,
|
|
||||||
names=self.names,
|
|
||||||
)
|
|
||||||
self.tp, self.fp = results[:2]
|
|
||||||
|
|
||||||
results = results[2:]
|
|
||||||
self.box.nc = len(self.names)
|
|
||||||
self.box.update(results)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def keys(self):
|
|
||||||
return [
|
|
||||||
"metrics/precision(B)",
|
|
||||||
"metrics/recall(B)",
|
|
||||||
"metrics/mAP50(B)",
|
|
||||||
"metrics/mAP50-95(B)",
|
|
||||||
]
|
|
||||||
|
|
||||||
def mean_results(self):
|
|
||||||
return self.box.mean_results()
|
|
||||||
|
|
||||||
def class_result(self, i):
|
|
||||||
return self.box.class_result(i)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def maps(self):
|
|
||||||
return self.box.maps
|
|
||||||
|
|
||||||
@property
|
|
||||||
def fitness(self):
|
|
||||||
return self.box.fitness()
|
|
||||||
|
|
||||||
@property
|
|
||||||
def ap_class_index(self):
|
|
||||||
return self.box.ap_class_index
|
|
||||||
|
|
||||||
@property
|
|
||||||
def results_dict(self):
|
|
||||||
return dict(zip(self.keys + ["fitness"], self.mean_results() + [self.fitness]))
|
|
||||||
|
|
||||||
def custom_result(self, instances_info =None, iou_match_dict: dict = None, prob_of_classes: dict = None):
|
|
||||||
"""_summary_
|
|
||||||
|
|
||||||
Args:
|
|
||||||
instances_info (np.ndarray, optional): _description_. Defaults to None.
|
|
||||||
iou_list (list, optional): _description_. Defaults to None.
|
|
||||||
iou_match_dict (dict, optional): _description_. Defaults to None.
|
|
||||||
prob_of_classes (dict, optional): _description_. Defaults to None.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
_type_: _description_
|
|
||||||
"""
|
|
||||||
|
|
||||||
instances_info = instances_info.tolist() if isinstance(instances_info, np.ndarray) else instances_info
|
|
||||||
total_instances = sum(instances_info)
|
|
||||||
total_instances = total_instances.item() if not isinstance(total_instances, int) else total_instances
|
|
||||||
|
|
||||||
custom_res = {}
|
|
||||||
|
|
||||||
for ci, c in self.names.items():
|
|
||||||
iou_match = iou_match_dict[c]
|
|
||||||
average_iou = np.mean(iou_match).item() if len(iou_match) > 0 else 0
|
|
||||||
_tp = self.tp[ci] if len(self.tp) == len(self.names) else 0
|
|
||||||
_fp = self.fp[ci] if len(self.fp) == len(self.names) else 0
|
|
||||||
_miss = instances_info[ci] - _tp
|
|
||||||
custom_res.update(
|
|
||||||
{
|
|
||||||
self.names[ci]: {
|
|
||||||
"actual": instances_info[ci],
|
|
||||||
"correct": int(_tp),
|
|
||||||
"missed_detection": int(_miss),
|
|
||||||
"false_detection": int(_fp),
|
|
||||||
"average_iou": round(average_iou, 4),
|
|
||||||
"average_dice": 0
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
custom_res.update({
|
|
||||||
"probability": prob_of_classes,
|
|
||||||
"average": {
|
|
||||||
"actual": int(total_instances // len(self.names)),
|
|
||||||
"correct": int(self.tp.sum() // len(self.names)) if len(self.tp) > 0 else 0,
|
|
||||||
"missed_detection": 0,
|
|
||||||
"false_detection": int(self.fp.sum() // len(self.names)) if len(self.fp) > 0 else 0,
|
|
||||||
"average_iou": round(np.mean([custom_res[cls_name]['average_iou'] for cls_name in self.names.values()]).item(), 4),
|
|
||||||
"average_dice": 0,
|
|
||||||
},
|
|
||||||
"total": {
|
|
||||||
"actual": total_instances,
|
|
||||||
"correct": int(self.tp.sum()) if len(self.tp) > 0 else 0 ,
|
|
||||||
"missed_detection": 0,
|
|
||||||
"false_detection": int(self.fp.sum()) if len(self.fp) > 0 else 0,
|
|
||||||
},
|
|
||||||
})
|
|
||||||
custom_res["total"]["missed_detection"] = (
|
|
||||||
custom_res["total"]["actual"] - custom_res["total"]["correct"]
|
|
||||||
)
|
|
||||||
custom_res["average"]["missed_detection"] = custom_res["total"][
|
|
||||||
"missed_detection"
|
|
||||||
] // len(self.names)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
return custom_res
|
|
||||||
|
|
||||||
|
|
||||||
class SegmentMetrics:
|
|
||||||
def __init__(self, save_dir=Path("."), plot=False, names=()) -> None:
|
|
||||||
self.save_dir = save_dir
|
|
||||||
self.plot = plot
|
|
||||||
self.names = names
|
|
||||||
self.box = Metric()
|
|
||||||
self.seg = Metric()
|
|
||||||
self.speed = {
|
|
||||||
"preprocess": 0.0,
|
|
||||||
"inference": 0.0,
|
|
||||||
"loss": 0.0,
|
|
||||||
"postprocess": 0.0,
|
|
||||||
}
|
|
||||||
self.tp = []
|
|
||||||
self.fp = []
|
|
||||||
self.probs = {}
|
|
||||||
|
|
||||||
|
|
||||||
def process(self, tp_m, tp_b, conf, pred_cls, target_cls):
|
|
||||||
|
|
||||||
if len(conf) > 0:
|
|
||||||
for cls_id in range(len(self.names)):
|
|
||||||
conf_with_cls = conf[np.where(pred_cls == cls_id)]
|
|
||||||
if len(conf_with_cls) > 0:
|
|
||||||
highest_prob = conf_with_cls.max()
|
|
||||||
self.probs[self.names[cls_id]] = [highest_prob]
|
|
||||||
|
|
||||||
|
|
||||||
res_mask = ap_per_class(
|
|
||||||
tp_m,
|
|
||||||
conf,
|
|
||||||
pred_cls,
|
|
||||||
target_cls,
|
|
||||||
plot=self.plot,
|
|
||||||
save_dir=self.save_dir,
|
|
||||||
names=self.names,
|
|
||||||
prefix="Mask",
|
|
||||||
)
|
|
||||||
|
|
||||||
tp, fp, results_mask = res_mask[0], res_mask[1], res_mask[2:]
|
|
||||||
self.seg.nc = len(self.names)
|
|
||||||
self.seg.update(results_mask)
|
|
||||||
self.tp = tp
|
|
||||||
self.fp = fp
|
|
||||||
# print(self.tp, self.fp)
|
|
||||||
|
|
||||||
results_box = ap_per_class(
|
|
||||||
tp_b,
|
|
||||||
conf,
|
|
||||||
pred_cls,
|
|
||||||
target_cls,
|
|
||||||
plot=self.plot,
|
|
||||||
save_dir=self.save_dir,
|
|
||||||
names=self.names,
|
|
||||||
prefix="Box",
|
|
||||||
)[2:]
|
|
||||||
self.box.nc = len(self.names)
|
|
||||||
self.box.update(results_box)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def keys(self):
|
|
||||||
return [
|
|
||||||
"metrics/precision(B)",
|
|
||||||
"metrics/recall(B)",
|
|
||||||
"metrics/mAP50(B)",
|
|
||||||
"metrics/mAP50-95(B)",
|
|
||||||
"metrics/precision(M)",
|
|
||||||
"metrics/recall(M)",
|
|
||||||
"metrics/mAP50(M)",
|
|
||||||
"metrics/mAP50-95(M)",
|
|
||||||
]
|
|
||||||
|
|
||||||
def mean_results(self):
|
|
||||||
return self.box.mean_results() + self.seg.mean_results()
|
|
||||||
|
|
||||||
def class_result(self, i):
|
|
||||||
return self.box.class_result(i) + self.seg.class_result(i)
|
|
||||||
|
|
||||||
    def custom_result(self, instances_info=None, iou_match_dict: dict = None, prob_of_classes: dict = None):
        """Build a per-class / average / total summary of the segmentation results.

        Args:
            instances_info (np.ndarray, optional): number of ground-truth instances per class. Defaults to None.
            iou_match_dict (dict, optional): class name -> list of IoU values of matched predictions. Defaults to None.
            prob_of_classes (dict, optional): class name -> highest confidence observed. Defaults to None.

        Returns:
            dict: per-class entries plus "probability", "average" and "total" summaries.
        """
        instances_info = instances_info.tolist() if isinstance(instances_info, np.ndarray) else instances_info
        total_instances = sum(instances_info)
        total_instances = total_instances.item() if not isinstance(total_instances, int) else total_instances

        custom_res = {}

        for ci, c in self.names.items():
            iou_match = iou_match_dict[c]
            average_iou = np.mean(iou_match).item() if len(iou_match) > 0 else 0
            _tp = self.tp[ci] if len(self.tp) == len(self.names) else 0
            _fp = self.fp[ci] if len(self.fp) == len(self.names) else 0
            _miss = instances_info[ci] - _tp
            custom_res.update(
                {
                    self.names[ci]: {
                        "actual": instances_info[ci],
                        "correct": int(_tp),
                        "missed_detection": int(_miss),
                        "false_detection": int(_fp),
                        "average_iou": round(average_iou, 4),
                        "average_dice": 0
                    }
                }
            )

        custom_res.update({
            "probability": prob_of_classes,
            "average": {
                "actual": int(total_instances // len(self.names)),
                "correct": int(self.tp.sum() // len(self.names)) if len(self.tp) > 0 else 0,
                "missed_detection": 0,
                "false_detection": int(self.fp.sum() // len(self.names)) if len(self.fp) > 0 else 0,
                "average_iou": round(np.mean([custom_res[cls_name]['average_iou'] for cls_name in self.names.values()]).item(), 4),
                "average_dice": 0,
            },
            "total": {
                "actual": total_instances,
                "correct": int(self.tp.sum()) if len(self.tp) > 0 else 0,
                "missed_detection": 0,
                "false_detection": int(self.fp.sum()) if len(self.fp) > 0 else 0,
            },
        })
        custom_res["total"]["missed_detection"] = (
            custom_res["total"]["actual"] - custom_res["total"]["correct"]
        )
        custom_res["average"]["missed_detection"] = custom_res["total"][
            "missed_detection"
        ] // len(self.names)

        return custom_res

    @property
    def maps(self):
        return self.box.maps + self.seg.maps

    @property
    def fitness(self):
        return self.seg.fitness() + self.box.fitness()

    @property
    def ap_class_index(self):
        # boxes and masks have the same ap_class_index
        return self.box.ap_class_index

    @property
    def results_dict(self):
        return dict(zip(self.keys + ["fitness"], self.mean_results() + [self.fitness]))


class ClassifyMetrics:
    def __init__(self) -> None:
        self.top1 = 0
        self.top5 = 0

        self.speed = {
            "preprocess": 0.0,
            "inference": 0.0,
            "loss": 0.0,
            "postprocess": 0.0,
        }

    def process(self, targets, pred):
        # target classes and predicted classes
        pred, targets = torch.cat(pred), torch.cat(targets)
        correct = (targets[:, None] == pred).float()
        acc = torch.stack(
            (correct[:, 0], correct.max(1).values), dim=1
        )  # (top1, top5) accuracy
        self.top1, self.top5 = acc.mean(0).tolist()

    def custom_result(self, targets, pred):
        metric_per_class = {}
        pred, targets = torch.cat(pred), torch.cat(targets)
        pred = pred.cpu().numpy()[:, 0]
        targets = targets.cpu().numpy()
        conf_mat = confusion_matrix(targets, pred)

        # per class: {"actual", "correct", "miss_detection", "false_detection", "recall", "precision"}
        k = conf_mat.shape[0]
        for cls_id in range(k):
            # sklearn convention: rows are true classes, columns are predictions
            tp = conf_mat[cls_id, cls_id]
            fn = np.sum(conf_mat[cls_id, :]) - tp  # true cls_id, predicted as something else
            fp = np.sum(conf_mat[:, cls_id]) - tp  # predicted cls_id, actually something else
            tn = np.sum(conf_mat) - tp - fp - fn

            res_per_class = {
                "actual": tp + fn,
                "correct": tp,
                "miss_detection": 0,
                "false_detection": fp,
                "recall": 0 if tp + fn == 0 else tp / (tp + fn),
                "precision": 0 if tp + fp == 0 else tp / (tp + fp)
            }

            metric_per_class[cls_id] = res_per_class

        return metric_per_class

    @property
    def fitness(self):
        return self.top5

    @property
    def results_dict(self):
        return dict(zip(self.keys + ["fitness"], [self.top1, self.top5, self.fitness]))

    @property
    def keys(self):
        return ["metrics/accuracy_top1", "metrics/accuracy_top5"]


if __name__ == "__main__":
    det_metric = DetMetrics()
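    # Hedged usage sketch (illustrative only; assumes this module's own imports —
    # torch, numpy, sklearn's confusion_matrix — as used above). `preds` holds
    # per-sample top-k class indices, `targets` the ground-truth labels.
    cls_metric = ClassifyMetrics()
    targets = [torch.tensor([0, 1, 1, 0])]
    preds = [torch.tensor([[0, 1], [1, 0], [0, 1], [0, 1]])]
    cls_metric.process(targets, preds)
    print(cls_metric.top1, cls_metric.top5)            # top-1 / top-k accuracy
    print(cls_metric.custom_result(targets, preds))    # per-class tp/fp/recall/precision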
@ -1,275 +0,0 @@
import json
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

import requests
import tqdm
from sdsvkie.utils.eval_kie import eval_kie
from sdsvkie.utils.io_file import read_json, write_json

logging.basicConfig(
    level=logging.INFO,
    # format=""
)
logger = logging.getLogger()


HEADERS = {
    'accept': 'application/json',
    'Authorization': 'Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjE2OTA2ODk4ODcsInVzZXJuYW1lIjoiYWRtaW4ifQ.Oybpc9tBsN35vCn3jzekkABDQKJT6yO1aBBJ4rMNln0'
}

URL = "http://107.120.133.27:8082/predict/image"

def run(
    data_dir: str,
    url: str,
    gt_path: str,
    field_pred_file: str,
    samples: Union[int, None] = None,
):
    files = get_files(data_dir, recursive=False, limit=samples)
    preds = predict(url, files)

    ## process for table
    # table_eval_result = {}
    # table_preds = get_table_preds(preds)
    # table_eval_result =

    # process for seller, buyer, ...
    field_eval_result = {}
    # field_preds = get_field_preds_from_api(api_preds=preds)
    field_preds = get_field_preds_from_file(pred_file=field_pred_file)
    classes = get_classes(preds=field_preds)
    if len(classes) == 0:
        raise Exception("Cannot get the classes list")
    field_eval_result = eval_fields(
        gt=gt_path,
        pred=field_preds,
        classes=classes,
        classes_ignore=['other', 'table']
    )
    print(field_eval_result)

    ## combine result
    combine_result = {}
    # combine_result = combine_result(table_res=table_eval_result, field_res=field_eval_result)

    print_result(
        data_path=data_dir,
        num_samples=len(list(field_preds.keys())),
        target_level=0.05,
        result=1.0,  # edit here
    )
    return combine_result

def print_result(
    data_path: str,
    num_samples: int,
    target_level: float,
    result: float,
    metric: str = "NLD",
    avg_time: float = 1.6363
):
    print(f"Path of validation dataset: {data_path}\n"
          + f"Number of validation samples: {num_samples}\n"
          + f"Evaluation metric: {metric}\n"
          + f"Target level: {target_level}\n"
          + f"Achieved level: {result}\n"
          + f"Average time: {avg_time}\n"
          + f"Verification result: {'PASS' if result > target_level else 'FAILED'}"
          )

def get_field_preds_from_api(api_preds: List[Dict]) -> dict:
    field_preds = get_fields_preds(api_preds)
    field_preds = combine_to_single_file(field_preds)
    return field_preds


def get_field_preds_from_file(pred_file: str) -> dict:
    """
    Get predictions from a JSON file
    """
    field_preds = read_json(pred_file)
    return field_preds


def get_fields_preds(preds: List[Dict]):
    # normalize each API item to {'file_path', 'pred_data'} with flattened fields
    # (keys match what predict_one_file actually returns)
    preds = [
        {
            "file_path": item['file_path'],
            "pred_data": format_output_api(item['pred_data']),
        }
        for item in preds
    ]
    return preds

def combine_result(table_res: Dict, field_res: Dict):
    return {}


def _str2dict(text: str) -> Dict:
    try:
        data = json.loads(text)
    except Exception as err:
        logger.error(f"{err} - data: {text}")
        data = {}
    return data


def predict_one_file(url: str, file: Union[str, Path]) -> Dict:
    """
    Output format:
    {
        file_path: path of the file
        pred_data: response of the API, parsed into a dict
    }
    """
    if isinstance(file, str):
        file = Path(file)

    payload = {}
    filename = file.name
    files = [
        (
            'file',
            (
                filename,
                open(str(file), 'rb'),
                'application/pdf'
            )
        )
    ]
    # logger.info(f"Files: {file}")
    response = requests.request(
        "POST", url, headers=HEADERS, data=payload, files=files)

    response_dict = _str2dict(response.text)

    return {
        "file_path": str(file),
        "pred_data": response_dict
    }

def predict(url: str, files: List[Union[str, Path]]) -> List[Dict]:
    """
    Returns a list of {'file_path', 'pred_data'} items
    """
    preds = []
    for idx, file in tqdm.tqdm(enumerate(files)):
        try:
            pred = predict_one_file(url, file)
            preds.append(pred)
        except Exception as err:
            logger.error(f"Error at file: {file} - {err}")
    return preds


def get_files(data_dir: str, recursive: bool = False, limit: Union[int, None] = None) -> List[Union[Path, str]]:
    if recursive:
        files = Path(data_dir).rglob("*")
    else:
        files = Path(data_dir).glob("*")

    if limit:
        files = list(files)[:limit]
    return files

def _stem_filename(filename: str) -> str:
    """
    Stem a file path: x/y.txt -> y
    """
    return Path(filename).stem


def format_output_api(output_api: Dict, skip_fields=['table']) -> Dict:
    if "pages" not in output_api:
        return {}
    pages = output_api['pages']

    result = {}
    for page in pages:
        fields = page['fields']
        for field_item in fields:
            field_label, field_value = field_item['label'], field_item['value']
            # keep only the first occurrence of each field across pages
            if field_label in result or field_label in skip_fields:
                continue
            result[field_label] = field_value
    return result


def combine_to_single_file(preds: List[Dict]) -> Dict:
    if len(preds) == 0:
        return {}
    combined_data = {
        _stem_filename(item["file_path"]): item["pred_data"]
        for item in preds
    }
    return combined_data

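
# Hedged demo (not part of the original script; the response layout is inferred
# from format_output_api above, and all values are invented): flattens a minimal
# fake API response into {label: value}.
def _demo_format_output_api():
    fake_response = {
        "pages": [
            {"fields": [
                {"label": "no_value", "value": "0001"},
                {"label": "table", "value": "..."},      # dropped via skip_fields
            ]},
            {"fields": [
                {"label": "no_value", "value": "0002"},  # duplicate label: first page wins
            ]},
        ]
    }
    return format_output_api(fake_response)  # -> {"no_value": "0001"}
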
def eval_fields(
    gt: Union[str, Dict],
    pred: Union[str, Dict],
    classes: List[str],
    classes_ignore: List[str] = []
) -> Dict:
    # named eval_fields so it does not shadow Python's built-in eval
    eval_res = eval_kie(
        gt_e2e_path=gt,
        pred_e2e_path=pred,
        kie_labels=classes,
        skip_labels=classes_ignore
    )
    return eval_res


def get_classes(preds: Dict) -> List[str]:
    classes = []
    for k, v in preds.items():
        if v:
            classes = list(v.keys())
            break
    return classes

def test():
    import requests

    url = "http://107.120.133.27:8082/predict/image"

    payload = {}
    files = [
        ('file', ('(1 of 19)_HOADON_1C23TYY_50.pdf', open(
            '/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/PV2/final/all_in/(1 of 19)_HOADON_1C23TYY_50.pdf', 'rb'), 'application/pdf'))
    ]
    headers = {
        'accept': 'application/json',
        'Authorization': 'Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjE2OTA2ODk4ODcsInVzZXJuYW1lIjoiYWRtaW4ifQ.Oybpc9tBsN35vCn3jzekkABDQKJT6yO1aBBJ4rMNln0'
    }

    response = requests.request(
        "POST", url, headers=headers, data=payload, files=files)

    print(response.text)
    # print(json.loads(response.text))


if __name__ == "__main__":
    limit = 5
    run(
        data_dir="/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/PV2/final/all_in",
        url=URL,
        gt_path="/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/PV2/final/all_in.json",
        field_pred_file="/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/invoice_all_in_final_e2e_21072023_5.json",
        samples=limit
    )

    # test()
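
# Hedged note on the prediction-file format (inferred from get_classes and
# eval_fields, not confirmed; values invented): a JSON object mapping each file
# stem to its flat field dict, e.g.
# {"invoice_001": {"no_value": "0001", "total_value": "123000", ...}, ...}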
@ -1,15 +0,0 @@
(deleted file: a stray saved copy of the Google homepage — 15 lines of minified HTML/JS with no project content; omitted)
@ -1,84 +0,0 @@
# from sdsvkie.utils.io_file import read_json
import json
import Levenshtein
from pathlib import Path
import shutil
import re
from unidecode import unidecode


def normalize(text):
    text = text.lower()
    text = unidecode(text)
    text = re.sub(r'[^a-zA-Z0-9\s]+', '', text)
    return text


def is_match(src, str_new, thr=0.7):
    src = normalize(src)
    str_new = normalize(str_new)
    # Levenshtein.ratio is a similarity score in [0, 1], not a distance
    similarity = Levenshtein.ratio(src, str_new)
    return similarity > thr

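# Hedged usage note (illustrative only, values invented): is_match("Circle K",
# "CIRCLE-K No.123") lowercases, strips accents and punctuation on both sides,
# then accepts the pair only when the ratio exceeds `thr`.
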
def get_store_name(gt_store, store_list):
    for store in store_list:
        if is_match(store, gt_store, thr=0.6):
            return store.lower()

    if len(gt_store) == 0:
        return "other_non_title"
    else:
        return "other_have_title_{}".format(gt_store)


def read_json(json_path):
    with open(json_path, "r", encoding="utf8") as f:
        data = json.load(f)
    return data


json_path = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/test_ss_receipt.json"
pred_data = read_json(json_path)

store_names = [normalize(item['Store_name_value']) for k, item in pred_data.items()]
# store_names = list(set(store_names))
from collections import Counter
my_counter = Counter(store_names)
list_tuples = my_counter.most_common()
print(list_tuples)
stores = [x[0] for x in list_tuples]
print(stores)


store_names = stores[1:]

img_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/SS_Receipt/Images_splitted/All"
out_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/SS_Receipt/Done"
out_dir = Path(out_dir)
for img_name, item in pred_data.items():
    store_name = item['Store_name_value']
    store_category = get_store_name(store_name, store_list=store_names)
    store_category = store_category.replace(" ", "_")
    print(store_category)
    out_dir_by_store = out_dir / store_category
    if not out_dir_by_store.exists():
        out_dir_by_store.mkdir(parents=True, exist_ok=True)

    img_full_name = Path(img_name).with_suffix(".jpg")
    img_full_path = Path(img_dir) / img_full_name

    txt_full_path = img_full_path.with_suffix(".txt")
    if not img_full_path.exists():
        print(str(img_full_path))
        continue
    else:
        shutil.copy(str(img_full_path), out_dir_by_store)
        shutil.copy(str(txt_full_path), out_dir_by_store)
Before Width: | Height: | Size: 490 KiB |
@ -1 +0,0 @@
rsync -r --exclude='workdirs/' --exclude='notebooks/' --exclude='weights/' --exclude='wandb/' --exclude='microsoft/' /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie user@107.120.133.42:/mnt/data/kie
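# -r copies the project tree recursively; each --exclude keeps heavyweight local
# artifacts (checkpoints, wandb logs, notebooks, workdirs) out of the transfer,
# so only the source code is synced to the remote machine.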
@ -1,194 +0,0 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import glob\n",
    "from tqdm import tqdm\n",
    "import cv2\n",
    "import shutil\n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "DATA_DIR = \"/mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/raw/IMGS\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def copy_only_first_page(data_dir, out_dir, skip_types=['Receipt_taxi', 'Receipt_food_Cam', 'Receipt_food_Scan']):\n",
    "    paths = sorted(glob.glob(data_dir + \"/*/*\"))\n",
    "    print(\"Total paths: \", len(paths))\n",
    "    out_dir = Path(out_dir)\n",
    "    for path in paths:\n",
    "        type_doc = Path(path).parent.name\n",
    "        out_dir_full = out_dir / type_doc\n",
    "        if not out_dir_full.exists():\n",
    "            out_dir_full.mkdir(parents=True)\n",
    "        if type_doc in skip_types:\n",
    "            shutil.copy(path, str(out_dir_full))\n",
    "        else:\n",
    "            if \"_1.jpg\" in path:\n",
    "                shutil.copy(path, out_dir_full)\n",
    "            prefix_name = \"_\".join(path.split(\"_\")[:-1]) + \"_1.jpg\"\n",
    "            print(prefix_name)\n",
    "            if Path(prefix_name).exists():\n",
    "                continue\n",
    "            else:\n",
    "                shutil.copy(path, out_dir_full)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "classes = [\n",
    "    # id invoice\n",
    "    'No_key',  # invoice number\n",
    "    'No_value',\n",
    "    'Form_key',  # invoice form number\n",
    "    'Form_value',\n",
    "    'Serial_key',  # invoice serial number\n",
    "    'Serial_value',\n",
    "    'Date_value',\n",
    "\n",
    "    # seller info\n",
    "    'Seller_company_name_key',\n",
    "    'Seller_company_name_value',\n",
    "    'Seller_tax_code_key',\n",
    "    'Seller_tax_code_value',\n",
    "    'Seller_address_value',\n",
    "    'Seller_address_key',\n",
    "    'Seller_tel_key',\n",
    "    'Seller_tel_value',\n",
    "\n",
    "    # buyer info\n",
    "    'Buyer_personal_name_key',\n",
    "    'Buyer_personal_name_value',\n",
    "    'Buyer_company_name_key',\n",
    "    'Buyer_company_name_value',\n",
    "    'Buyer_tax_code_key',\n",
    "    'Buyer_tax_code_value',\n",
    "    'Buyer_address_key',\n",
    "    'Buyer_address_value',\n",
    "    'Buyer_address_key',\n",
    "    'Buyer_address_value',\n",
    "\n",
    "    # money info\n",
    "    'Tax_amount_key',\n",
    "    'Tax_amount_value',\n",
    "    'Total_key',\n",
    "    'Total_value',\n",
    "    'Total_in_words_key',\n",
    "    'Total_in_words_value',\n",
    "\n",
    "    'Other',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "classes = [x.lower() for x in classes]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['no_key',\n",
       " 'no_value',\n",
       " 'form_key',\n",
       " 'form_value',\n",
       " 'serial_key',\n",
       " 'serial_value',\n",
       " 'date_value',\n",
       " 'seller_company_name_key',\n",
       " 'seller_company_name_value',\n",
       " 'seller_tax_code_key',\n",
       " 'seller_tax_code_value',\n",
       " 'seller_address_value',\n",
       " 'seller_address_key',\n",
       " 'seller_tel_key',\n",
       " 'seller_tel_value',\n",
       " 'buyer_personal_name_key',\n",
       " 'buyer_personal_name_value',\n",
       " 'buyer_company_name_key',\n",
       " 'buyer_company_name_value',\n",
       " 'buyer_tax_code_key',\n",
       " 'buyer_tax_code_value',\n",
       " 'buyer_address_key',\n",
       " 'buyer_address_value',\n",
       " 'buyer_address_key',\n",
       " 'buyer_address_value',\n",
       " 'tax_amount_key',\n",
       " 'tax_amount_value',\n",
       " 'total_key',\n",
       " 'total_value',\n",
       " 'total_in_words_key',\n",
       " 'total_in_words_value',\n",
       " 'other']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "classes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "py38_hoanglv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
@ -1,15 +0,0 @@
protobuf>=3.19.6,<4
opencv-python>=4.4.0
torch>=1.4
torchvision
transformers>=4.25.1
datasets>=2.5.2
Pillow==9.5.0
wandb
easydict==1.10
terminaltables==3.1.10
tqdm
rapidfuzz==2.13.7
PyMuPDF==1.20.2
sentencepiece
underthesea
@ -1,49 +0,0 @@
%PDF-1.3
%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
1 0 obj
<<
/F1 2 0 R
>>
endobj
2 0 obj
<<
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
>>
endobj
3 0 obj
<<
/PageMode /UseNone /Pages 5 0 R /Type /Catalog
>>
endobj
4 0 obj
<<
/Author (anonymous) /CreationDate (D:20230522134603-07'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20230522134603-07'00') /Producer (ReportLab PDF Library - www.reportlab.com)
/Subject (unspecified) /Title (untitled) /Trapped /False
>>
endobj
5 0 obj
<<
/Count 0 /Kids [ ] /Type /Pages
>>
endobj
xref
0 6
0000000000 65535 f
0000000073 00000 n
0000000104 00000 n
0000000211 00000 n
0000000279 00000 n
0000000575 00000 n
trailer
<<
/ID
[<4d2762f6f45f96a78f66af9b0251b167><4d2762f6f45f96a78f66af9b0251b167>]
% ReportLab generated PDF document -- digest (http://www.reportlab.com)

/Info 4 0 R
/Root 3 0 R
/Size 6
>>
startxref
629
%%EOF
@ -1,10 +0,0 @@
import os
from pathlib import Path
import shutil

data_dir = "/mnt/hdd2T/AICR/Projects/2023/FI_Invoices/Data"
data_dir = Path(data_dir)

for path in data_dir.glob("*/*.pdf"):
    # keep only the PDFs that have a matching ground-truth XML next to them
    if path.with_suffix(".xml").exists():
        shutil.copy(str(path), "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/SS_Invoice/multi_page_vat/batch_2")
@ -1,117 +0,0 @@
from pathlib import Path
import shutil
import json

def write_txt(txt, data, mode="w"):
    with open(txt, mode, encoding="utf8") as f:
        for line in data:
            f.write(line + "\n")


def read_txt(txt):
    with open(txt, "r", encoding="utf8") as f:
        data = [line.strip() for line in f]
    return data


def get_no(items):
    # scan the label lines for the invoice-number field and return its text
    no = "xxxx"
    for item in items:
        if "No_value" in item:
            tmp = item.split("\t")
            no = tmp[-2]

    return no

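# Hedged note (line format inferred from tmp[-2], not confirmed; values invented):
# each label line is assumed tab-separated like "x1 y1 x2 y2\t0012345\tNo_value",
# so tmp[-2] is the recognized text and tmp[-1] the field label.
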

def write_json(json_path, data):
    with open(json_path, "w", encoding="utf8") as f:
        json.dump(data, f, ensure_ascii=False, sort_keys=True)


def read_json(json_path):
    with open(json_path, "r", encoding="utf8") as f:
        data = json.load(f)
    return data


def check(txt_dir):
    # group label files by invoice number to find duplicated documents
    log_dict = {}
    txt_dir = Path(txt_dir)
    txt_paths = txt_dir.rglob("*.txt")
    for txt_path in txt_paths:
        items = read_txt(str(txt_path))
        no_doc = get_no(items)
        if no_doc not in log_dict:
            log_dict[no_doc] = [str(txt_path.with_suffix(".jpg"))]
        else:
            log_dict[no_doc].append(str(txt_path.with_suffix(".jpg")))

    not_dups = []
    for no, paths in log_dict.items():
        if len(paths) == 1:
            not_dups.append(no)
    # if "xxxx" in log_dict.keys():
    #     log_dict.pop("xxxx")
    for _ in not_dups:
        log_dict.pop(_)

    print(log_dict.keys())
    return log_dict

# print(check("/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/train/SS_Invoice"))


def get_leak_test(data_dir):
    test_keys = []
    data_dir = Path(data_dir)
    test_paths = data_dir.rglob("test_*")
    # print(list(test_paths))
    for path in test_paths:
        img_name = path.stem
        img_name = img_name.replace("test_", "")
        test_keys.append(img_name)

    # write_txt("leak.txt", test_keys)
    return test_keys


def create_new_test(ori_dir, out_dir, test_keys):
    ori_dir = Path(ori_dir)
    out_dir = Path(out_dir)
    if not out_dir.exists():
        out_dir.mkdir(parents=True, exist_ok=True)

    img_paths = ori_dir.rglob("*.jpg")
    for img_path in img_paths:
        img_key = img_path.stem
        if img_key in test_keys:
            continue
        txt_path = img_path.with_suffix(".txt")
        shutil.copy(str(img_path), str(out_dir))
        shutil.copy(str(txt_path), str(out_dir))


def create_new_e2e_test(ori_json, out_json, test_keys):
    ori_data = read_json(ori_json)
    out_dict = {}
    for k, v in ori_data.items():
        if k in test_keys:
            continue
        out_dict[k] = v

    write_json(out_json, out_dict)


test_keys = get_leak_test("/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/EXCESS")
# create_new_test(
#     ori_dir="/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/test_ss",
#     out_dir="/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/test_ss_rm_leak",
#     test_keys=test_keys
# )

create_new_e2e_test(
    ori_json="/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/test_ss_e2e.json",
    out_json="/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/test_ss_e2e_rm_leak.json",
    test_keys=test_keys
)
@ -1,39 +0,0 @@
import argparse
from pathlib import Path
from tqdm import tqdm
from sdsvkie.utils.io_file import read_json, write_json


def clean_json(in_json, out_json, pdf_dir):
    # keep only the entries whose key matches a PDF present in pdf_dir
    data_src = read_json(in_json)

    pdf_dir = Path(pdf_dir)
    pdf_paths = pdf_dir.glob("*.pdf")
    pdf_keys = {pdf_path.stem for pdf_path in pdf_paths}

    data_tgt = {}
    for src_pdf_key in data_src.keys():
        if src_pdf_key in pdf_keys:
            data_tgt[src_pdf_key] = data_src[src_pdf_key]

    write_json(out_json, data_tgt, sort_keys=False)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(prog="Clean JSON")
    parser.add_argument("--input", type=str, required=True, help="input JSON file")
    parser.add_argument("--out", type=str, required=False, help="output JSON file")
    parser.add_argument("--dir", type=str, required=True, help="directory of reference PDF files")

    args = parser.parse_args()
    clean_json(
        in_json=args.input,
        out_json=args.out,
        pdf_dir=args.dir
    )
@ -1,49 +0,0 @@
import os

import json
from pathlib import Path

def read_json(json_path):
    with open(json_path, 'r') as f:
        data = json.load(f)
    return data

def write_json(json_path, data):
    with open(json_path, 'w') as f:
        json.dump(data, f, ensure_ascii=False)


def clean_json(json_in, json_out, valid_names):
    out_data = {}
    data = read_json(json_in)
    for name_key, items in data.items():
        if name_key in valid_names:
            out_data[name_key] = items

    write_json(json_out, out_data)

# DIRNAMES = ['SL_HCM', 'SL_HN_batch_1', 'SL_HN_batch_2', 'Invoices_SAVINA']
# ROOT_DIR = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/raw/PDF/multi_page"

DIRNAMES = ['test_sbt_v2']
ROOT_DIR = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed"
for dirname in DIRNAMES:
    json_path = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed/test_end2end/sbt_validation_e2e.json"
    json_out_path = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed/test_end2end/sbt_validation_e2e_v2.json"
    valid_names = [p.stem for p in (Path(ROOT_DIR) / dirname).glob("*")]
    print(valid_names)
    clean_json(json_path, json_out_path, valid_names)


# def combine_json(json_paths, json_out):
#     datas = [read_json(json_path) for json_path in json_paths]
#     out_data = {}
#     for data in datas:
#         out_data.update(data)
#     write_json(json_out, out_data)


# json_paths = [Path(ROOT_DIR) / (dirname + "_out.json") for dirname in DIRNAMES]
# json_out = ROOT_DIR + "/test_e2e_multi_pages.json"
# combine_json(json_paths, json_out)
@ -1,23 +0,0 @@
import os
import shutil
import glob
from pathlib import Path


if __name__ == "__main__":
    src_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/train/SL_HCM_batch_2_multi_pages"
    tgt_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/intermediate/key_information_extraction/"
    num_files = 100

    files = glob.glob(os.path.join(src_dir, "*.jpg"))
    count = 0
    for file in files:
        # glob already yields full paths; join only the basename onto tgt_dir,
        # otherwise the absolute path would override tgt_dir and copy onto itself
        src_path = file
        tgt_path = os.path.join(tgt_dir, os.path.basename(file))
        if os.path.isfile(src_path):
            shutil.copy(src_path, tgt_path)
            count += 1
            if count == num_files:
                break

    print(f"Copied {count} files from {src_dir} to {tgt_dir}")
@ -1,25 +0,0 @@
import os
import shutil
from pathlib import Path


SRC_DIR = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed/train/sbt/batch_1"
TEST_DIR = "/mnt/ssd1T/tuanlv/06.KVUCombineStage/datasets/invoices-receipts/SS_invoices/SBT/validation_data/images"
OUT_DIR = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed/test_sbt"

# Get a list of all the files in the test directory
test_files = [Path(f).name for f in os.listdir(TEST_DIR) if ".txt" not in f]

# Create the output directory if it doesn't exist
os.makedirs(OUT_DIR, exist_ok=True)

# Move the matching files from the source directory to the output directory
for filename in os.listdir(SRC_DIR):
    if Path(filename).name in test_files:
        src_path = os.path.join(SRC_DIR, filename)
        # out_path = os.path.join(OUT_DIR, filename)
        shutil.move(src_path, OUT_DIR)

        # move the matching .txt label file as well
        src_txt_path = Path(os.path.join(SRC_DIR, filename)).with_suffix(".txt")
        shutil.move(str(src_txt_path), OUT_DIR)
@ -1,314 +0,0 @@
import shutil
import xml.etree.ElementTree as ET
from datetime import datetime
# from sdsvkie.utils.io_file import read_json, write_json
import json

import csv
import ast

def get_xml_from_csv(csv_file):
    data = {}
    with open(csv_file, 'r') as file:
        csv_reader = csv.DictReader(file)
        for row in csv_reader:
            # print(row)
            pdf_path = row['file_path']
            pdf_key = Path(pdf_path).stem
            xml_paths = ast.literal_eval(row['xml_path'])
            data[pdf_key] = xml_paths

    return data

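# Hedged example of the expected CSV layout (inferred from the code above, not
# confirmed; paths invented): a header with `file_path` and `xml_path`, where
# `xml_path` holds a Python-literal list parsed via ast.literal_eval, e.g.
#   file_path,xml_path
#   /data/invoices/a.pdf,"['/data/xml/a_1.xml', '/data/xml/a_2.xml']"
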

def get_xml_from_dirs(dir_path, pdf_keys):
    # every pdf key gets the full candidate list; extract_v2 picks the right one
    dir_path = Path(dir_path)
    xml_paths = dir_path.rglob("*.xml")
    xml_paths = [str(path) for path in xml_paths]
    xml_infos = {}

    for pdf_key in pdf_keys:
        xml_infos[pdf_key] = xml_paths
    return xml_infos


def write_json(json_path, data, sort_keys=True):
    with open(json_path, "w", encoding="utf8") as f:
        json.dump(data, f, ensure_ascii=False, sort_keys=sort_keys)


def read_json(json_path):
    with open(json_path, "r", encoding="utf8") as f:
        data = json.load(f)
    return data


from pathlib import Path
import tqdm
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def convert_date(date_str: str, ori_pattern: str = '%Y-%m-%d', tgt_pattern: str = '%d/%m/%Y'):
    date_obj = datetime.strptime(date_str, ori_pattern)

    # convert back to a string in DD/MM/YYYY format
    new_date_str = date_obj.strftime(tgt_pattern)
    return new_date_str


def extract(xml_in, field_mapping):
    with open(xml_in, "r") as f:
        xml_string = f.read()
    # parse the XML string
    root = ET.fromstring(xml_string)

    # extract each mapped element (SHDon, NLap, ...)
    output = {}
    for key in field_mapping:
        pattern = f".//{field_mapping[key]}"
        value = root.find(pattern)
        value = "" if value is None else value.text
        if key == "date_value" and value != "":
            value = convert_date(value)

        if key in ["tax_amount_value", "total_value"] and value != "":
            value = str(int(float(value)))

        output[key] = value
    return output


field_mapping = {
    "no_value": "SHDon",
    "form_value": "KHMSHDon",
    "serial_value": "XXXXXXX",
    "date_value": "NLap",  # 2023-06-05 -> YYYY-MM-DD
    "seller_company_name_value": "NBan/Ten",
    "seller_address_value": "NBan/DChi",
    "seller_tel_value": "XXXXXXXXX",
    "seller_tax_code_value": "NBan/MST",
    "buyer_personal_name_value": "NMua/HVTNMHang",
    "buyer_company_name_value": "NMua/Ten",
    "buyer_address_value": "NMua/DChi",
    "buyer_tax_code_value": "NMua/MST",
    "buyer_tel_value": "NMua/SDT",
    "tax_amount_value": "TThue",
    "total_value": "TgTTTBSo",
    "total_in_words_value": "TgTTTBChu"
}

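# Hedged illustration (values invented; assumes an e-invoice XML layout matching
# the tag names above): extract() walks each pattern with ElementTree, e.g.
#   root = ET.fromstring("<HDon><NBan><MST>0100109106</MST></NBan></HDon>")
#   root.find(".//NBan/MST")  # -> the <MST> element under <NBan>
# so output["seller_tax_code_value"] would be "0100109106".
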
|
|
||||||
## fields need care: serial_value, seller_tel_value, buyer_tel_value


def get_xml_list_info(xml_dir):
    xml_dir = Path(xml_dir)
    xml_files = xml_dir.glob("*/*.xml")
    xml_info = {}
    for xml_file in xml_files:
        pdf_key = xml_file.stem
        xml_info[pdf_key] = str(xml_file)
    return xml_info


def process(json_in, json_out, xml_dir):
    assert Path(json_in).exists()
    assert Path(xml_dir).exists()
    data_in = read_json(json_in)
    data_out = {}
    if data_in is None or not data_in:
        logger.error("empty file")
        return

    xml_info = get_xml_list_info(xml_dir)
    for pdf_key in tqdm.tqdm(data_in.keys()):
        xml_path = xml_info[pdf_key] if pdf_key in xml_info else None
        if xml_path is None:
            continue
        else:
            output = extract(xml_path, field_mapping)

        data_out[pdf_key] = output

    write_json(json_out, data_out, sort_keys=False)


def get_xml_list_info_v2(xml_dir):
    xml_dir = Path(xml_dir)
    xml_files = xml_dir.glob("*/*.xml")

    xml_info = {}
    for xml_file in xml_files:
        pdf_key = xml_file.stem
        if pdf_key in xml_info:
            xml_info[pdf_key].append(str(xml_file))
        else:
            xml_info[pdf_key] = [str(xml_file)]

    return xml_info


def extract_v2(xml_paths, preds, field_mapping, pdf_key=None):
    xml_path = None
    if len(xml_paths) == 1:
        xml_path = xml_paths[0]
    else:
        # find the best xml: keep the candidate whose invoice number ("no_value")
        # matches the model prediction for this pdf
        for xml_in in xml_paths:
            try:
                with open(xml_in, "r", encoding="utf8") as f:
                    xml_string = f.read()
                root = ET.fromstring(xml_string, parser=ET.XMLParser(encoding="iso-8859-5"))
            except Exception as err:
                print("Error exception (check) ", err, xml_in)
                continue

            key_checks = ["no_value"]
            is_exists_xml = False
            for key_check in key_checks:
                pattern = f".//{field_mapping[key_check]}"
                value = root.find(pattern)
                value = "" if value is None else value.text

                if value == preds[key_check]:
                    is_exists_xml = True
            if is_exists_xml:
                xml_path = xml_in
    if xml_path is None:
        print("Not found best xml for ", pdf_key, xml_paths)
        return None, None

    try:
        with open(xml_path, "r") as f:
            xml_string = f.read()
        # parse the XML string
        root = ET.fromstring(xml_string)
    except Exception as err:
        print("Error exception: ", err, xml_path)
        return None, None

    # extract the mapped elements (SHDon, NLap, ...)
    output = {}
    for key in field_mapping:
        pattern = f".//{field_mapping[key]}"
        value = root.find(pattern)
        value = "" if value is None else value.text
        if key == "date_value" and value != "":
            value = convert_date(value)
        if key in ["tax_amount_value", "total_value"] and value != "":
            value = str(int(float(value)))

        output[key] = value

    return output, xml_path
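
# Usage sketch for extract_v2 (illustrative values, not from the repo): given
# the candidate xml files sharing one pdf stem, it returns the extracted fields
# together with the path of the xml whose invoice number matched the prediction:
#
#     output, xml_path = extract_v2(
#         xml_paths=["batch_a/inv_001.xml", "batch_b/inv_001.xml"],
#         preds={"no_value": "123"},
#         field_mapping=field_mapping,
#         pdf_key="inv_001",
#     )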


def process_v2(json_in, json_out, csv_file, xml_dir, xml_out_dir, pdf_xml_json):
    assert Path(json_in).exists()
    assert Path(xml_dir).exists()
    # make the output dir for the matched xml files
    if not Path(xml_out_dir).exists():
        Path(xml_out_dir).mkdir(parents=True, exist_ok=True)

    data_in = read_json(json_in)
    data_out = {}
    if data_in is None or not data_in:
        logger.error("empty file")
        return

    # xml_info = get_xml_list_info_v2(xml_dir)
    # xml_info = get_xml_from_csv(csv_file=csv_file)
    xml_info = get_xml_from_dirs(dir_path=csv_file, pdf_keys=list(data_in.keys()))
    print("Num xml: ", len(xml_info))
    success = 0
    pdf_xml_info = {}
    set_xml_paths = set()
    for pdf_key in tqdm.tqdm(data_in.keys()):
        xml_paths = xml_info[pdf_key] if pdf_key in xml_info else None
        # print(xml_paths)
        preds = data_in[pdf_key]
        if xml_paths is None or len(xml_paths) == 0:
            print("Not exist xml because xml_paths is None or len(xml_paths) == 0:", pdf_key)
            continue
        else:
            output, xml_path = extract_v2(xml_paths, preds, field_mapping, pdf_key=pdf_key)

        if output is not None:
            pdf_xml_info[pdf_key] = xml_path
            shutil.copy(xml_path, xml_out_dir)
            # if Path(xml_path).stem in set_xml_paths:
            #     print(pdf_key, xml_path)
            set_xml_paths.add(Path(xml_path).stem)
            success += 1
            data_out[pdf_key] = output
    print("Success: ", success)
    print(len(set_xml_paths))
    write_json(pdf_xml_json, pdf_xml_info, sort_keys=False)
    write_json(json_out, data_out, sort_keys=False)


def combine_xml(json_src, json_refer):
    data_src = read_json(json_src)
    data_refer = read_json(json_refer)

    for pdf_key in data_src.keys():
        for field_key in data_src[pdf_key]:
            if data_src[pdf_key][field_key] == "":
                data_src[pdf_key][field_key] = data_refer[pdf_key][field_key]

    write_json(json_src, data=data_src, sort_keys=False)


def create_data_from_json(in_dir, out_dir, json_path):
    in_dir = Path(in_dir)
    out_dir = Path(out_dir)

    if not out_dir.exists():
        out_dir.mkdir(parents=True, exist_ok=True)

    data = read_json(json_path)

    for pdf_key in data.keys():
        pdf_path = in_dir / (pdf_key + ".pdf")
        shutil.copy(str(pdf_path), str(out_dir))


if __name__ == "__main__":

    # json_in = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_multi_page.json"
    # json_out = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_multi_page_from_xml.json"
    # xml_dir = "/mnt/hdd2T/AICR/Projects/2023/FI_Invoices/Test"
    # process(json_in=json_in, json_out=json_out, xml_dir=xml_dir)

    # json_in = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_multi_page_from_xml.json"
    # json_refer = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_multi_page.json"
    # json_in = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_one_page_e2e_from_xml.json"
    # json_refer = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_one_page_e2e.json"

    # combine_xml(json_src=json_in, json_refer=json_refer)

    ## One page
    # json_in = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_one_page_e2e.json"
    # json_out = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_one_page_e2e_from_xml.json"
    ## Multi page
    json_in = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/PV2/Invoice_v1_multi_page.json"
    json_out = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/PV2/Invoice_v1_multi_page_from_xml.json"

    # csv_file = "/mnt/ssd1T/tuanlv/02.KeyValueUnderstanding/inferences/e2e_outputs/FI_June_data.csv"
    csv_file = "/mnt/hdd2T/AICR/Projects/2023/FI_Invoices/Data"
    pdf_xml_json = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v1_multi_page_metadata.json"

    xml_dir = "/mnt/hdd2T/AICR/Projects/2023/FI_Invoices/Test"
    xml_out_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/Invoice_v1_multi_page_xml"

    process_v2(json_in=json_in, json_out=json_out, csv_file=csv_file, xml_dir=xml_dir, xml_out_dir=xml_out_dir, pdf_xml_json=pdf_xml_json)

    # in_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/Invoice_v2_multi_page"
    # out_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/Invoice_v2_multi_page_clean"
    # json_path = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_multi_page_from_xml.json"
    # in_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/Invoice_v2_one_page"
    # out_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/Invoice_v2_one_page_clean"
    # json_path = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_one_page_e2e_from_xml.json"

    # create_data_from_json(in_dir, out_dir, json_path)
@ -1,42 +0,0 @@
import json


json_path = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/SS_Invoice/multi_page_vat/SL_HCM.json"
with open(json_path, 'r', encoding='utf8') as f:
    data = json.load(f)
print(data[list(data.keys())[0]].keys())

keys = [
    'serial_value',
    'no_value',
    'form_value',
    'date',

    'seller_company_name_value',
    'seller_address_value',
    'seller_mobile_value',
    'seller_tax_code_value',

    'buyer_name_value',
    'buyer_company_name_value',
    'buyer_address_value',
    'buyer_mobile_value',
    'buyer_tax_code_value',

    'VAT_amount_value',
    'total_in_words_value',
    'total_value'
]
new_data = {}
for file_name, items in data.items():
    new_items = {}
    for k in keys:
        new_items[k] = items[k]
    new_data[file_name] = new_items


with open(json_path, 'w', encoding='utf8') as f:
    json.dump(new_data, f, ensure_ascii=False)
@ -1,31 +0,0 @@
import os
import shutil

from pathlib import Path

folder1 = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/PV2/Invoice_v2_multi_page"
folder2 = "/mnt/hdd2T/AICR/Projects/2023/FI_Invoices/Invoice_v2_multi_page"

out_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/PV2/Invoice_v2_multi_page_2"
out_dir = Path(out_dir)
if not out_dir.exists():
    out_dir.mkdir(parents=True, exist_ok=True)

# Get list of files in both folders
files1 = [f for f in os.listdir(folder1) if os.path.isfile(os.path.join(folder1, f))]
files2 = [f for f in os.listdir(folder2) if os.path.isfile(os.path.join(folder2, f))]

# Get list of file names in both folders
names1 = [os.path.splitext(f)[0] for f in files1]
names2 = [os.path.splitext(f)[0] for f in files2]

# Find names present in only one of the two folders (set ^ set is the
# symmetric difference, not the intersection)
diff_names = set(names1) ^ set(names2)
print(len(diff_names))
# Print the mismatched names and copy the corresponding PDFs from folder2
for d in diff_names:
    print(f"File name found in only one folder: {d}")
    pdf_path = Path(folder2) / (d + ".pdf")
    shutil.copy(str(pdf_path), str(out_dir))
@ -1,106 +0,0 @@
200
Invoice_ho_1007_000
210
invoice_126
inv_SDV_016
invoice_108
invoice_215
invoice_135
inv_SDV_004
292
164
242
inv_SDV_240
207
invoice_0525_000
invoice_1279_000
306
d2.sc_1261_000
invoice_90
304
s1.sc_1258_000
ce_1h_0967_000
invoice_1392_000
193
invoice_109
281
354
invoice_1059_000
es_10_1043_000
257
invoice_65
invoice_1252_006
331
scan__1319_000
230
20210_1314_000
328
o1_aa_1093_000
342
invoice_149
invoice_1304_000
c2_em_0081_000
Invoice_En_1074_000
invoice_89
Invoice_Sh_0712_000
invoice_202
hotel_0209_000
invoice_0872_000
invoice_72
InvoiceofP_0648_000
invoice_133
C1_Invoice_0968_000
invoice_0803_000
invoice_50
invoice_208
253
inv_SDV_215
360
invoice_1393_000
scan__0953_000
invoice_22
O1_Invoice_1348_000
inv_SDV_231
252
273
156
330
invoice_0457_001
invoice_0180_001
invoice_182
326
14
301
334
01gtk_0199_000
343
Invoice201_0930_000
invoice_1
344
inv_SDV_021
invoice_170
E2.Invoice_0561_000
Invoice_Sh_0262_000
1.1Invoice_1431_000
invoice_0112_000
invoice_195
314
2021._0035_000
invoice_0013_000
invoice_1204_000
2021._0868_000
scan__0520_000
255
invoice_200
C3_Invoice_1359_000
invoice_49
invoice_1095_000
hq_20_0003_000
invoice_180
invoice_184
340
invoice_0447_000
invoice_6
invoice_190
invoice_105
invoice_0673_000
@ -1,24 +0,0 @@
import os
import shutil
from pathlib import Path


SRC_DIR = "/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed/train/sbt/batch_1"
TEST_DIR = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/sdsap_receipt/exp_9_lr5e_6_no_scheduler/sbt_txt"

# Get a list of all the files in the test directory
test_files = sorted([Path(f).stem for f in os.listdir(TEST_DIR) if ".txt" in f])
print(len(test_files))

# Count (and print) the source files that have no matching .txt in the test directory
i = 0
src_files = sorted(os.listdir(SRC_DIR))
print(len(src_files))
for filename in src_files:
    # print(Path(filename).stem)
    if Path(filename).stem not in test_files:
        print(Path(filename).stem)
        i += 1

print(i)
@ -1,15 +0,0 @@
import os

def rename_files(folder_path):
    # Get a list of all the files in the folder
    files = os.listdir(folder_path)
    # Iterate over the files and rename them (spaces -> underscores)
    for filename in files:
        # Construct the new filename
        new_filename = filename.replace(" ", "_")
        # Rename the file
        os.rename(os.path.join(folder_path, filename), os.path.join(folder_path, new_filename))

rename_files(
    folder_path="/mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/intermediate/sbt/images",
)
@ -1,10 +0,0 @@
python rename_labels.py \
--in_dir /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed \
--out_dir /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed \
--doc_type receipt


python rename_labels.py \
--in_dir /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed \
--out_dir /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed \
--doc_type invoice
@ -1,191 +0,0 @@
import argparse
from pathlib import Path
from tqdm import tqdm
import json

INVOICE_MAPPING = {
    'no_key': 'No_key',  # invoice number
    'no_value': 'No_value',
    'form_key': 'Form_key',  # invoice form number
    'form_value': 'Form_value',
    'serial_key': 'Serial_key',  # invoice serial number
    'serial_value': 'Serial_value',
    'date': 'Date_value',

    # seller info
    'seller_company_name_key': 'Seller_company_name_key',
    'seller_company_name_value': 'Seller_company_name_value',
    'seller_tax_code_key': 'Seller_tax_code_key',
    'seller_tax_code_value': 'Seller_tax_code_value',
    'seller_address_value': 'Seller_address_value',
    'seller_address_key': 'Seller_address_key',
    'seller_mobile_key': 'Seller_tel_key',
    'seller_mobile_value': 'Seller_tel_value',

    # buyer info
    'buyer_name_key': 'Buyer_personal_name_key',
    'buyer_name_value': 'Buyer_personal_name_value',
    'buyer_company_name_value': 'Buyer_company_name_value',
    'buyer_company_name_key': 'Buyer_company_name_key',
    'buyer_tax_code_key': 'Buyer_tax_code_key',
    'buyer_tax_code_value': 'Buyer_tax_code_value',
    'buyer_address_key': 'Buyer_address_key',
    'buyer_address_value': 'Buyer_address_value',
    'buyer_mobile_key': 'Buyer_tel_key',
    'buyer_mobile_value': 'Buyer_tel_value',

    # money info
    'VAT_amount_key': 'Tax_amount_key',
    'VAT_amount_value': 'Tax_amount_value',
    'total_key': 'Total_key',
    'total_value': 'Total_value',
    'total_in_words_key': 'Total_in_words_key',
    'total_in_words_value': 'Total_in_words_value',

    'other': 'Other',
}

RECEIPT_MAPPING = {
    "Store_name_value": "seller_company_name_value",
    "Seller_company_name_value": "seller_company_name_value",
    "id": "no_value",
    "No_value": "no_value",

    "Date_value": "date_value",
    "Total_key": "total_key",
    "Total_value": "total_value",

    "Others": "other",
    "others": "other",
    "Other": "other",
}
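
# Format note (assumed from edit_file below, not documented elsewhere): each
# label .txt holds one annotation per line, with the class label as the last
# tab-separated field, e.g. (hypothetical line):
#
#     350	31	403	58	CÔNG TY	Seller_company_name_value
#
# edit_file rewrites only that trailing label via the mappings above and then
# lowercases it; all other fields on the line are left untouched.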


def write_txt(txt, data, mode="w"):
    with open(txt, mode, encoding="utf8") as f:
        for line in data:
            f.write(line + "\n")


def read_txt(txt):
    with open(txt, "r", encoding="utf8") as f:
        data = [line.strip() for line in f]
    return data


def edit_file(in_txt, out_txt, mapping):
    data = read_txt(in_txt)
    new_items = []
    not_exists_label = False
    not_edit = True
    for item in data:
        splited_item = item.split("\t")
        label = splited_item[-1]
        if label in mapping.keys():
            new_label = mapping[label]
            splited_item[-1] = new_label
            not_edit = False
        else:
            # print(label, "not in ", mapping.keys())
            not_exists_label = True
            splited_item[-1] = label.lower()

        splited_item[-1] = splited_item[-1].lower()
        new_item = "\t".join(splited_item)
        new_items.append(new_item)

    if not_exists_label:
        print("Not exists label: ", in_txt)

    if not_edit:
        print("Not edit: ", in_txt)
    write_txt(out_txt, new_items)


def rename_labels(data_dir, out_dir, doc_type):
    data_dir = Path(data_dir)
    out_dir = Path(out_dir)
    if not out_dir.exists():
        out_dir.mkdir(parents=True, exist_ok=True)
    if doc_type == "receipt":
        mapping = RECEIPT_MAPPING
    elif doc_type == 'invoice':
        mapping = INVOICE_MAPPING
    else:
        raise NotImplementedError()
    txt_paths = data_dir.rglob("*.txt")
    for txt_path in tqdm(txt_paths):
        txt_dir = str(Path(str(txt_path).replace(str(data_dir), "")).parent)  # a/b/c/x.txt -> c/x.txt -> c

        if txt_dir[0] == "/":
            txt_dir = txt_dir[1:]
        out_sub_dir = out_dir / Path(txt_dir)

        if not out_sub_dir.exists():
            out_sub_dir.mkdir(parents=True, exist_ok=True)

        out_txt = out_sub_dir / txt_path.name
        edit_file(str(txt_path), out_txt=out_txt, mapping=mapping)


def write_json(json_path, data):
    with open(json_path, "w", encoding="utf8") as f:
        json.dump(data, f, ensure_ascii=False, sort_keys=True)


def read_json(json_path):
    with open(json_path, "r", encoding="utf8") as f:
        data = json.load(f)
    return data


def rename_label_in_json(json_in, json_out, doc_type):
    if doc_type == "invoice":
        mapping = INVOICE_MAPPING
    else:
        mapping = RECEIPT_MAPPING
    ori_data = read_json(json_in)
    new_data = {}
    for img_key, field_item in ori_data.items():
        new_field_item = {}
        for field_key, field_value in field_item.items():
            if field_key in mapping:
                new_field_key = mapping[field_key]
            else:
                new_field_key = field_key
            new_field_key = new_field_key.lower()
            new_field_item[new_field_key] = field_value

        new_data[img_key] = new_field_item

    write_json(json_out, new_data)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(prog="Rename labels")
    parser.add_argument("--in_dir", type=str, required=True, help="dataset directory")
    parser.add_argument("--out_dir", type=str, required=False, help="output")
    parser.add_argument("--doc_type", type=str, required=True, help="document type: receipt / invoice")

    args = parser.parse_args()
    rename_labels(
        data_dir=args.in_dir,
        out_dir=args.out_dir,
        doc_type=args.doc_type
    )

    # rename_label_in_json(
    #     json_in=args.in_dir,
    #     json_out=args.out_dir,
    #     doc_type=args.doc_type
    # )
@ -1,83 +0,0 @@
import argparse
from sdsvkie.utils import read_json, yaml_load, write_json
# from sdsvkie.utils.post_processing.invoice_post_processing import *
# from sdsvkie.utils.post_processing.common_post_processing import normalize_number
from tqdm import tqdm

INVOICE_KEYS = [
    'no_key',
    'no_value',
    'form_key',
    'form_value',
    'serial_key',
    'serial_value',

    'date_value',

    'seller_company_name_key',
    'seller_company_name_value',
    'seller_address_value',
    'seller_address_key',
    'seller_tel_key',
    'seller_tel_value',
    'seller_tax_code_key',
    'seller_tax_code_value',

    'buyer_personal_name_key',
    'buyer_personal_name_value',
    'buyer_company_name_value',
    'buyer_company_name_key',

    'buyer_address_key',
    'buyer_address_value',
    'buyer_tax_code_key',
    'buyer_tax_code_value',
    'buyer_tel_key',
    'buyer_tel_value',

    'tax_amount_key',
    'tax_amount_value',
    'total_key',
    'total_value',
    'total_in_words_key',
    'total_in_words_value',
    'other'
]


def sort_invoice(data):
    sorted_data = {}
    for img_key, img_data in tqdm(data.items()):
        sorted_img_data = {}
        for field_key in INVOICE_KEYS:
            if "_key" in field_key or "other" in field_key:
                continue
            sorted_img_data[field_key] = img_data.get(field_key, "")

        sorted_data[img_key] = sorted_img_data

    return sorted_data
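
# Example (illustrative values only): for
#     img_data = {"no_value": "123", "no_key": "Số:", "date_value": "01/06/2023"}
# sort_invoice keeps just the *_value fields ("_key" entries and "other" are
# skipped) in INVOICE_KEYS order, defaulting missing fields to "":
#     {"no_value": "123", "form_value": "", ..., "date_value": "01/06/2023", ...}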


def sort_receipt(data):
    return data


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", type=str, help="e2e label file path")
    parser.add_argument("--out", type=str, help='postprocessed e2e label')
    parser.add_argument("--doc_type", default="invoice")
    args = parser.parse_args()

    data = read_json(args.input)
    if args.doc_type == "invoice":
        sorted_data = sort_invoice(data)
    else:
        sorted_data = sort_receipt(data)

    write_json(args.out, sorted_data, sort_keys=False)
@ -1,34 +0,0 @@
import os
import shutil


def split_folder_into_batches(input_folder, output_folder, n):
    # Get the list of image files in the input folder
    image_files = [f for f in os.listdir(input_folder) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
    # Sort the list of image files
    image_files.sort()
    print("len: ", len(image_files))
    # Calculate the number of images per batch
    batch_size = len(image_files) // n
    # Create the output directories
    for i in range(n):
        batch_dir = os.path.join(output_folder, f"batch_{i+1}")
        os.makedirs(batch_dir, exist_ok=True)
    # Split the images into batches
    for i, image_file in enumerate(image_files):
        batch_index = i // batch_size
        batch_dir = os.path.join(output_folder, f"batch_{batch_index+1}")
        if not os.path.exists(batch_dir):
            os.makedirs(batch_dir, exist_ok=True)
        # Find the corresponding label file
        image_name, image_ext = os.path.splitext(image_file)
        label_file = f"{image_name}.txt"
        label_path = os.path.join(input_folder, label_file)
        # Copy the image and label files into the appropriate batch directory
        print(label_path, os.path.join(input_folder, image_file), batch_dir)
        shutil.copy(os.path.join(input_folder, image_file), batch_dir)
        shutil.copy(label_path, batch_dir)


# Example usage:
split_folder_into_batches("/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/pseudo_ocr/invoice_receipt_sbt", "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/pseudo_ocr/invoice_receipt_sbt_split", 3)
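# Note on the remainder: when len(image_files) is not divisible by n,
# i // batch_size exceeds n - 1 for the last few images, so an extra
# batch_{n+1} directory is created on the fly — that is what the
# os.path.exists check inside the loop guards against.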

@ -1,31 +0,0 @@
# import xml.etree.ElementTree as ET
# from reportlab.pdfgen import canvas
# from reportlab.lib.pagesizes import letter

# # Load XML file
# tree = ET.parse('/mnt/hdd2T/AICR/Projects/2023/Xml_SAVINA/2023-04-20_0101803564_0300741922_1C23TYY_714.xml')
# root = tree.getroot()

# # Create PDF canvas
# pdf_canvas = canvas.Canvas('./2023-04-20_0101803564_0300741922_1C23TYY_714.pdf', pagesize=letter)

# # Iterate over XML elements and draw on PDF canvas
# for element in root.iter():
#     if element.tag == 'paragraph':
#         pdf_canvas.drawString(int(element.get('x')), int(element.get('y')), element.text)
#     elif element.tag == 'image':
#         pdf_canvas.drawImage(element.get('src'), int(element.get('x')), int(element.get('y')), int(element.get('width')), int(element.get('height')))

# # Save PDF file
# pdf_canvas.save()

import pyxml2pdf

# The conversion call was left syntactically incomplete here ("pyxml2pdf.(...)"
# is not valid Python). pyxml2pdf is normally driven from the command line
# rather than through an object API, e.g. (assumed usage, check the library
# docs before relying on it):
#   python -m pyxml2pdf /mnt/hdd2T/AICR/Projects/2023/Xml_SAVINA/2023-04-20_0101803564_0300741922_1C23TYY_714.xml
# xml2pdf = pyxml2pdf.("/mnt/hdd2T/AICR/Projects/2023/Xml_SAVINA/2023-04-20_0101803564_0300741922_1C23TYY_714.xml")
# xml2pdf.save("my_pdf_file.pdf")
@ -1,256 +0,0 @@
python tools/cvat.py --task pseudo_from_txt \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/labeling/Pseudo/batch_1/Good/batch_1_taxi_sub_1_raw.xml \
--xml_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/labeling/Pseudo/batch_1/Good/batch_1_taxi_sub_1_pseudo.xml \
--pseudo_path /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/labeling/Pseudo_OCR/Batch_1_Good/Taxi_sub_1

python tools/cvat.py --task pseudo_from_txt \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/labeling/Pseudo/batch_1/Good/batch_1_taxi_sub_2_raw.xml \
--xml_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/labeling/Pseudo/batch_1/Good/batch_1_taxi_sub_2_pseudo.xml \
--pseudo_path /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/labeling/Pseudo/batch_1/Good/Taxi_sub_2

python rename_labels.py \
--in_dir /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed \
--out_dir /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed \
--doc_type receipt

# text detect

python sdsvkie/tools/cvat.py --task pseudo_from_txt \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/label_ocr/all/batch_1/batch_1_taxi_sub_1_raw.xml \
--xml_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/label_ocr/all/batch_1/batch_1_taxi_sub_1_pseudo.xml \
--pseudo_path /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/label_ocr/all/batch_1/taxi_sub_1

python sdsvkie/tools/cvat.py --task pseudo_from_txt \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/label_ocr/all/batch_1/batch_1_food.xml \
--xml_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/label_ocr/all/batch_1/batch_1_food_pseudo.xml \
--pseudo_path /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/label_ocr/all/batch_1/food

python sdsvkie/tools/cvat.py --task pseudo_from_txt \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SS_Invoice/labeling/train_vnpt_raw.xml \
--xml_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SS_Invoice/labeling/train_vnpt_pseudo.xml \
--pseudo_path /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SS_Invoice/train_with_vnpt

##########
python tools/cvat.py --task update_txt_from_xml \
--txt_in /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/labeling/Pseudo_OCR/Batch_1_Good/Taxi_sub_1 \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/labeling/Pseudo/batch_1/Good/batch_1_taxi_sub_1_done.xml \
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/processed/batch_1/Good/Taxi_sub_1 \
--line_to_word \
--other_class Others

python tools/cvat.py --task update_txt_from_xml \
--txt_in /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/labeling/Pseudo/batch_1/Good/Taxi_sub_2 \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/labeling/Pseudo/batch_1/Good/batch_1_taxi_sub_2_done.xml \
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/processed/batch_1/Good/Taxi_sub_2 \
--other_class Others

python tools/cvat.py --task update_txt_from_xml \
--txt_in /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/labeling/Pseudo/batch_1/Good/Food \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/labeling/Pseudo/batch_1/Good/batch_1_food_done.xml \
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/processed/batch_1/Good/Food \
--other_class Others

python tools/cvat.py --task update_txt_from_xml \
--txt_in /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/labeling/Pseudo/batch_2/Good/Food \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/labeling/Pseudo/batch_2/batch_2_food_done.xml \
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/processed/batch_2/Good/Food \
--other_class Others

# WILD RECEIPT

python sdsvkie/tools/cvat.py --task update_txt_from_xml \
--txt_in /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/WildReceipt/re_labeling/batches/batch_1 \
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/WildReceipt/re_labeling/batches/batch_1 \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/WildReceipt/re_labeling/wild_batch_1_done.xml \
--other_class Others

# SS RECEIPT
python sdsvkie/tools/cvat.py --task pseudo_from_txt \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/SS_Receipt/Images_splitted/ss_receipt_batch_1_raw.xml \
--xml_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/SS_Receipt/Images_splitted/ss_receipt_batch_1_pseudo.xml \
--pseudo_path /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/SS_Receipt/Images_splitted_txt/batch_1 \
--other_class Others

python sdsvkie/tools/cvat.py --task pseudo_from_txt \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/MC-OCR/raw/Images_aligned/train_splitted_txt/mc_ocr_batch_3_raw.xml \
--xml_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/MC-OCR/raw/Images_aligned/train_splitted_txt/mc_ocr_batch_3_pseudo.xml \
--pseudo_path /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/MC-OCR/raw/Images_aligned/train_splitted_v2/batch_3 \
--other_class Others \
--resever_parent_dir

python sdsvkie/tools/cvat.py --task pseudo_from_txt \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/SS_Receipt/ss_receipt_all_raw.xml \
--xml_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/SS_Receipt/ss_receipt_all_pseudo.xml \
--pseudo_path /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/SS_Receipt/Done \
--other_class Others \
--resever_parent_dir

python sdsvkie/tools/cvat.py --task pseudo_from_txt \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/intermediate/pseudo_label/mcocr_raw.xml \
--xml_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/intermediate/pseudo_label/mcocr_pseudo.xml \
--pseudo_path /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed/train/mc_ocr \
--other_class Others \
--resever_parent_dir

python sdsvkie/tools/cvat.py --task pseudo_from_txt \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/intermediate/pseudo_label/sdsap_receipt_raw.xml \
--xml_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/intermediate/pseudo_label/sdsap_receipt_pseudo.xml \
--pseudo_path /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed/train/sdsap_receipt \
--other_class Others \
--resever_parent_dir

python sdsvkie/tools/cvat.py --task pseudo_from_txt \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/intermediate/pseudo_label/ss_receipt_raw.xml \
--xml_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/intermediate/pseudo_label/ss_receipt_pseudo.xml \
--pseudo_path /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed/train/ss_receipt \
--other_class Others \
--resever_parent_dir

python sdsvkie/tools/cvat.py --task pseudo_from_txt \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/intermediate/pseudo_label/wildreceipt_raw.xml \
--xml_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/intermediate/pseudo_label/wildreceipt_pseudo.xml \
--pseudo_path /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed/train/wildreceipt \
--other_class Others \
--resever_parent_dir

# INVOICE

python sdsvkie/tools/cvat.py --task pseudo_from_txt \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/raw/Labeling/invoice_sl_hn_raw.xml \
--xml_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/raw/Labeling/invoice_sl_hn_pseudo.xml \
--pseudo_path /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/SL_HN_Invoice_txt \
--other_class other

python sdsvkie/tools/cvat.py --task pseudo_from_txt \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/raw/Labeling/invoice_sl_hcm_tmp_done.xml \
--xml_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/raw/Labeling/invoice_sl_hcm_tmp_done_2.xml \
--pseudo_path /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/invoice_add_sl_hcm_finetuning/SL_HCM_Invoice_wg_txt_need_review \
--other_class other

python sdsvkie/tools/cvat.py --task pseudo_from_txt \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/raw/Labeling/invoice_ss_raw.xml \
--xml_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/raw/Labeling/invoice_ss_wg_pseudo.xml \
--pseudo_path /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/intermediate/parse_wg/SS_Invoice \
--other_class other

python sdsvkie/tools/cvat.py --task pseudo_from_txt \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/intermediate/multi_pages/labels/ss_hcm_batch_2_multi_pages_raw.xml \
--xml_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/intermediate/multi_pages/labels/ss_hcm_batch_2_multi_pages_pseudo.xml \
--pseudo_path /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/SL_HCM_batch_2_first_last_page_txt \
--other_class other

python sdsvkie/tools/cvat.py --task pseudo_from_txt \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/intermediate/sbt/sbt_craw_raw.xml \
--xml_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/intermediate/sbt/sbt_craw_pseudo.xml \
--pseudo_path /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/receipt/13062023_4/Crawled_invoices_SBT \
--other_class other

python sdsvkie/tools/cvat.py --task pseudo_from_txt \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/receipt/13062023_6/sbt_test_raw.xml \
--xml_out /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/receipt/13062023_6/sbt_test_pseudo.xml \
--pseudo_path /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed/test_sbt_wg \
--other_class other

python sdsvkie/tools/cvat.py --task pseudo_from_txt \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/viettinbank_pocr_raw.xml \
--xml_out /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/viettinbank_pocr_pseudo.xml \
--pseudo_path /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/invoice_viettinbank_poc_txt \
--other_class other

# UPDATE TXT
python sdsvkie/tools/cvat.py --task update_txt_from_xml \
--txt_in /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/MC-OCR/raw/Images_aligned/train_splitted_txt/batch_1 \
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/MC-OCR/raw/Images_aligned/train_splitted/batch_1 \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/MC-OCR/raw/Images_aligned/train_splitted_txt/mc_ocr_batch_1_done.xml \
--other_class Others

python sdsvkie/tools/cvat.py --task update_txt_from_xml \
--txt_in /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/MC-OCR/raw/Images_aligned/train_splitted_txt/batch_2 \
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/MC-OCR/raw/Images_aligned/train_splitted/batch_2 \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/MC-OCR/raw/Images_aligned/train_splitted_txt/mc_ocr_batch_2_done.xml \
--other_class Others

python sdsvkie/tools/cvat.py --task update_txt_from_xml \
--txt_in /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/MC-OCR/raw/Images_aligned/train_splitted_v2/batch_3 \
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/MC-OCR/raw/Images_aligned/train_splitted_v2/batch_3_2 \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/MC-OCR/raw/Images_aligned/train_splitted_txt/mc_ocr_batch_3_done.xml \
--other_class Others \
--resever_parent_dir

python sdsvkie/tools/cvat.py --task update_txt_from_xml \
--txt_in /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/SS_Receipt/Done \
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/SS_Receipt/Done \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/SS_Receipt/ss_receipt_by_store_done.xml \
--other_class Others \
--resever_parent_dir

python sdsvkie/tools/cvat.py --task update_txt_from_xml \
--txt_in /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/SS_Receipt/Images_splitted_txt/batch_1 \
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/SS_Receipt/Images_splitted/batch_1 \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/SS_Receipt/Images_splitted/ss_receipt_batch_1_done.xml \
--other_class Others

python sdsvkie/tools/cvat.py --task update_txt_from_xml \
--txt_in /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/SS_Receipt/Images_splitted_txt/batch_2 \
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/SS_Receipt/Images_splitted/batch_2 \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/SS_Receipt/Images_splitted/ss_receipt_batch_2_done.xml \
--other_class Others

python sdsvkie/tools/cvat.py --task update_txt_from_xml \
--txt_in /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/raw/JPG/SL_HN_Invoice \
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/raw/JPG/SL_HN_Invoice \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/raw/Labeling/invoice_sl_hn_done.xml \
--other_class other \
--line_to_word

python sdsvkie/tools/cvat.py --task update_txt_from_xml \
--txt_in /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/train/SS_Invoice \
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/train/SS_Invoice \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/raw/Labeling/ss_invoice_done.xml \
--other_class other \
--line_to_word

python sdsvkie/tools/cvat.py --task update_txt_from_xml \
--txt_in /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/intermediate/multi_pages/SL_HCM_batch_2_first_last_page \
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/intermediate/multi_pages/SL_HCM_batch_2_first_last_page \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/intermediate/multi_pages/labels/sl_hcm_batch_2_multi_pages_done.xml \
--other_class other \
--line_to_word

python sdsvkie/tools/cvat.py --task update_txt_from_xml \
--txt_in /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/receipt/13062023_4/Crawled_invoices_SBT_no_wg_txt \
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/receipt/13062023_4/Crawled_invoices_SBT_no_wg_txt \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/intermediate/sbt/sbt_craw_done.xml \
--other_class other \
--line_to_word

python sdsvkie/tools/cvat.py --task update_txt_from_xml \
--txt_in /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed/test_sbt \
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed/test_sbt \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/receipt/13062023_6/sbt_test_done.xml \
--other_class other \
--line_to_word

python sdsvkie/tools/cvat.py --task update_txt_from_xml \
--txt_in /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/invoice_viettinbank_poc_txt \
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/invoice_viettinbank_poc_txt \
--xml /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/viettinbank_poc_done.xml \
--other_class other \
--line_to_word
@ -1,96 +0,0 @@
# SDSAP
python sdsvkie/utils/eval_kie.py \
--gt /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/done/test_end2end/food_e2e.json \
--pred /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/test_end2end_food_pred_v1.json \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/config.yaml

python sdsvkie/utils/eval_kie.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/config.yaml \
--gt /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/done/test_end2end/all_e2e.json \
--pred /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/test_all_end2end_w0.1_h0.3_thr5.json \
--log_failure_case /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/test_all_end2end_w0.1_h0.3_thr5.json

python sdsvkie/utils/eval_kie.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/config.yaml \
--gt /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/done/test_end2end/taxi_e2e.json \
--pred /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/test_taxi_end2end_text_det_20230425.json \
--log_failure_case /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/test_taxi_end2end_text_det_20230425_fail.json

python sdsvkie/utils/eval_kie.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/config.yaml \
--gt /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/done/test_end2end/taxi_e2e.json \
--pred /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/test_taxi_end2end_w0.2_h0.2_thr5.json \
--log_failure_case /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/test_taxi_end2end_w0.2_h0.2_thr5_fail.json

# 10/5/2023
python sdsvkie/utils/eval_kie.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/sdsap_receipt/exp_8_lr9e_6/config.yaml \
--gt /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/test_end2end/all_e2e.json \
--pred /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/sdsap_receipt/exp_8_lr9e_6/receipt_e2e.json \
--log_failure_case /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/sdsap_receipt/exp_8_lr9e_6/receipt_e2e_fail.json

python sdsvkie/utils/eval_kie.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/sdsap_receipt/exp_9_lr5e_6_no_scheduler/config.yaml \
--gt /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed/test_end2end/all_e2e.json \
--pred /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/sdsap_receipt/exp_9_lr5e_6_no_scheduler/receipt_e2e_best.json \
--log_failure_case /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/sdsap_receipt/exp_9_lr5e_6_no_scheduler/receipt_e2e_best_fail.json

=============================== INVOICE ====================================================================
python sdsvkie/utils/eval_kie.py \
--gt /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SS_Invoice/test_end2end/test_e2e.json \
--pred /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/e2e/test_vnpt_epx_4_best_not_ocr_merge_use_label.json \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/config.yaml \
--log_failure_case /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/failure_of_test_vnpt_epx_4_best_not_ocr_merge_use_label.json

python sdsvkie/utils/eval_kie.py \
--gt /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SS_Invoice/test_end2end/test_e2e.json \
--pred /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/pred_test_end2end.json \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/config.yaml \
--log_failure_case /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/vnpt_epx_falures_v2.json

python sdsvkie/utils/eval_kie.py \
--gt /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/test_ss_e2e_rm_leak.json \
--pred /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/test_ss_rm_leak.json \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/config.yaml \
--log_failure_case /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/test_ss_e2e_rm_leak_fail.json

python sdsvkie/utils/eval_kie.py \
--gt /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/raw/PDF/multi_page/test_e2e_multi_pages.json \
--pred /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/test_e2e_multi_page.json \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/config.yaml \
--log_failure_case /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/test_e2e_multi_page_fail.json

python sdsvkie/utils/eval_kie.py \
--gt /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/test_ss_e2e_rm_leak.json \
--pred /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/26052023/test_ss_rm_leak.json \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/26052023/config.yaml \
--log_failure_case /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/26052023/test_ss_rm_leak_fail.json

python sdsvkie/utils/eval_kie.py \
--gt /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/raw/PDF/multi_page/test_e2e_multi_pages.json \
--pred /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/26052023/test_e2e_multi_page.json \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/26052023/config.yaml \
--log_failure_case /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/26052023/test_e2e_multi_page_fail.json

python sdsvkie/utils/eval_kie.py \
--gt /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed/test_end2end/sbt_validation_e2e.json \
--pred /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/receipt/13062023_6/sbt_validation_e2e_ep50.json \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/receipt/13062023_6/config.yaml \
--log_failure_case /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/receipt/13062023_6/sbt_validation_e2e_ep50_fail.json
@ -1,199 +0,0 @@
|
|||||||
python sdsvkie/tools/infer.py --cfg workdirs/invoice/exp1/config.yaml --inference_weights workdirs/invoice/exp1/best --device cuda --img ../TokenClassification_invoice/DATA/test/01067_0452_000.jpg
|
|
||||||
|
|
||||||
python sdsvkie/tools/infer.py --cfg workdirs/invoice/exp1/config.yaml --inference_weights workdirs/invoice/exp1/best --device "cuda:1" \
|
|
||||||
--img ../craw_data/output/synth_template_4/one_line --txt_out workdirs/visualize/vnpt_one_line_txt --kie_wordgroup_out
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
python sdsvkie/tools/infer.py --cfg workdirs/invoice/exp1/config.yaml --inference_weights workdirs/invoice/exp1/best --device "cuda:1" \
|
|
||||||
--img /mnt/ssd1T/hoanglv/Projects/KIE/craw_data/output/synth_vnpt_r20/one_line --txt_out workdirs/visualize/vnpt_one_line_r20_txt --kie_wordgroup_out
|
|
||||||
|
|
||||||
|
|
||||||
python sdsvkie/tools/infer.py --cfg workdirs/invoice/exp_add_vnpt_template/config.yaml --inference_weights workdirs/invoice/exp_add_vnpt_r2/best --device "cuda:1" \
|
|
||||||
--img /mnt/ssd1T/hoanglv/Projects/KIE/craw_data/output/synth_vnpt_r2_2/one_line \
|
|
||||||
--vis_out workdirs/visualize/vnpt_r2_phase_2 \
|
|
||||||
--txt_out workdirs/visualize/vnpt_r2_phase_2_txt --kie_wordgroup_out
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
python sdsvkie/tools/infer.py --cfg workdirs/invoice/exp_add_vnpt_template/config.yaml --inference_weights workdirs/invoice/exp_add_vnpt_r2/best --device "cuda:1" \
|
|
||||||
--img /mnt/ssd1T/hoanglv/Projects/KIE/craw_data/output/synth_vnpt_r2_2/one_line \
|
|
||||||
--vis_out workdirs/visualize/vnpt_r2_phase_2 \
|
|
||||||
--txt_out workdirs/visualize/vnpt_r2_phase_2_txt --kie_wordgroup_out
|
|
||||||
|
|
||||||
|
|
||||||
python sdsvkie/tools/infer.py --cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoiceadd_vnpt_final/config.yaml --inference_weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoiceadd_vnpt_final/epoch_60 --device "cuda:1" \
|
|
||||||
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/test \
|
|
||||||
--vis_out workdirs/visualize/test_sorted
|
|
||||||
|
|
||||||
|
|
||||||
python sdsvkie/tools/infer.py \
|
|
||||||
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoiceadd_vnpt_final/config.yaml \
|
|
||||||
--inference_weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoiceadd_vnpt_final/epoch_60 \
|
|
||||||
--device "cuda:1" \
|
|
||||||
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/test \
|
|
||||||
--vis_out workdirs/visualize/test_sorted
|
|
||||||
|
|
||||||
python sdsvkie/tools/infer.py \
|
|
||||||
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoiceadd_vnpt_final/config.yaml \
|
|
||||||
--inference_weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoiceadd_vnpt_final/epoch_60 \
|
|
||||||
--device "cuda:1" \
|
|
||||||
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/test \
|
|
||||||
|
|
||||||
|
|
||||||
# test e2e
|
|
||||||
python sdsvkie/tools/infer.py \
|
|
||||||
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/exp_wild_1/config.yaml \
|
|
||||||
--inference_weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/exp_wild_1/best \
|
|
||||||
--device "cuda:0" \
|
|
||||||
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/intermediate/labeling/phase_1/batch_1 \
|
|
||||||
--vis_out workdirs/visualize/SDSAP_Invoice_exp_wild_1
|
|
||||||
|
|
||||||
python sdsvkie/tools/infer.py \
|
|
||||||
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/config.yaml \
|
|
||||||
--inference_weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/best \
|
|
||||||
--device "cuda:1" \
|
|
||||||
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/test \
|
|
||||||
--e2e workdirs/e2e/test_sampling_sortword_exp4_best.json

python sdsvkie/tools/infer.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/config.yaml \
--inference_weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/last \
--device "cuda:1" \
--img /mnt/ssd1T/hoanglv/Dataset/common_tools/JPG/one_page/vat \
--e2e workdirs/e2e/test_invoice_vnpt_exp4_sdsv_invoice.json \
--vis workdirs/e2e/test_invoice_vnpt_exp4_sdsv_invoice_visualize


# sdsAP
python sdsvkie/tools/infer.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_1/config.yaml \
--inference_weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_1/best \
--device "cuda:1" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/processed/batch_1/Good/Taxi_sub_2 \
--vis_out /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/sdsap_receipt/infer/visualize/exp_1_batch_1_taxi_sub_2


python sdsvkie/tools/infer.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_2/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_2/best \
--device "cuda:1" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/processed/batch_2/Noise \
--vis_out /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/sdsap_receipt/infer/visualize/exp_2_batch_2_noise


python sdsvkie/tools/infer.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/best \
--device "cuda:1" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/processed/batch_2/Good/Taxi \
--vis_out /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/sdsap_receipt/infer/visualize/exp_3_batch_2_good_taxi


python sdsvkie/tools/infer.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/best \
--device "cuda:1" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/done/test_end2end/Taxi \
--vis_out /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/sdsap_receipt/infer/visualize/exp_3_batch_2_good_taxi


python sdsvkie/tools/infer.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/best \
--device "cuda:1" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/label_ocr/all/batch_1/food \
--vis_out /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/sdsap_receipt/infer/visualize/exp_3_batch_1_food


python sdsvkie/tools/infer.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/best \
--device "cuda:1" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/done/test_end2end/Taxi \
--vis_out /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/sdsap_receipt/infer/visualize/exp_3_test_end2end_taxi_2


python sdsvkie/tools/infer.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/sdsap_receipt/exp_9_lr5e_6_no_scheduler/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/sdsap_receipt/exp_9_lr5e_6_no_scheduler/best \
--device "cuda:1" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/test_end2end/All \
--vis_out /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/sdsap_receipt/exp_9_lr5e_6_no_scheduler/receipt_e2e_infer_best


python sdsvkie/tools/infer.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/receipt/13062023_4/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/receipt/13062023_4/last \
--device "cuda:1" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed/test_sbt \
--vis_out /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/receipt/13062023_4/test_sbt_infer


# MCOCR
python sdsvkie/tools/infer.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/best \
--device "cuda:1" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/MC-OCR/raw/Images_aligned/train_splitted/batch_1 \
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/MC-OCR/raw/Images_aligned/train_splitted_txt/batch_1


python sdsvkie/tools/infer.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/best \
--device "cuda:1" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/SS_Receipt/Images_splitted/batch_1 \
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/SS_Receipt/Images_splitted_txt/batch_1


python sdsvkie/tools/infer.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/02062023/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/02062023/best \
--device "cuda:0" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/raw/JPG/multi_pages/sl_hcm_hn_savina \
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/02062023/multi_pages_sl_hcm_hn_savina_wg_txt \
--kie_wordgroup_out


python sdsvkie/tools/infer.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/receipt/13062023_4/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/receipt/13062023_4/last \
--device "cuda:0" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/raw/Crawled_invoices_SBT \
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/receipt/13062023_4/Crawled_invoices_SBT \
--kie_wordgroup_out


python sdsvkie/tools/infer.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/receipt/13062023_4/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/receipt/13062023_4/last \
--device "cuda:0" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/raw/Crawled_invoices_SBT \
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/receipt/13062023_4/Crawled_invoices_SBT_no_wg_txt


python sdsvkie/tools/infer.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/receipt/13062023_6/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/receipt/13062023_6/epoch_50 \
--device "cuda:0" \
--img /mnt/ssd1T/tuanlv/06.KVUCombineStage/datasets/invoices-receipts/SS_invoices/SBT/validation_data/valid_images \
--vis_out /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/receipt/13062023_6/sbt_validation_e2e_ep50_old_textdet_infer


python sdsvkie/tools/infer.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/best \
--device "cuda:1" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/raw/PDF/Viettinbank_POC/POC_OCR/invoice_JPG \
--vis_out /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/visualize/vietinbank


python sdsvkie/tools/infer.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/best \
--device "cuda:1" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/intermediate/1000416613 \
--vis_out /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/visualize/intermediate_1000416613
@ -1,210 +0,0 @@
# SDSAP_Receipt
python sdsvkie/tools/infer_e2e.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/best \
--device "cuda:1" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/done/test_end2end/All \
--e2e /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/test_all_end2end.json \
--vis /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/test_all_end2end


python sdsvkie/tools/infer_e2e.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/best \
--device "cuda:1" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/OTHER_DATA/MC-OCR/raw/Images/train \
--e2e /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/train_mcocr_end2end.json \
--vis /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/train_mcocr_end2end


python sdsvkie/tools/infer_e2e.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/best \
--text_det /mnt/ssd500/datnt/mmdetection/logs/textdet-fwd-table-receipt-20230425/best_bbox_mAP_epoch_15_lite.pth \
--device "cuda:1" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/done/test_end2end/Taxi \
--e2e /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/test_taxi_end2end_text_det_20230425.json \
--vis /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/test_taxi_end2end_text_det_20230425


python sdsvkie/tools/infer_e2e.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/best \
--device "cuda:1" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/done/v1/train \
--e2e /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/test_end2end.json


python sdsvkie/tools/infer_e2e.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/best \
--device "cuda:1" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/done/test_end2end/Taxi \
--e2e /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/test_taxi_end2end_w0.2_h0.2_thr5.json \
--vis /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/test_taxi_end2end_w0.2_h0.2_thr5


python sdsvkie/tools/infer_e2e.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/best \
--device "cuda:1" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/test_end2end/All \
--e2e /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/test_all_end2end_w0.1_h0.3_thr5_v2.json \
--vis /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/test_taxi_end2end_w0.1_h0.3_thr5_v2


python sdsvkie/tools/infer_e2e.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/sdsap_receipt/exp_8_lr9e_6/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/sdsap_receipt/exp_8_lr9e_6/best \
--device "cuda:1" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/test_end2end/All \
--e2e /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/sdsap_receipt/exp_8_lr9e_6/receipt_e2e.json \
--vis /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/sdsap_receipt/exp_8_lr9e_6/receipt_e2e


python sdsvkie/tools/infer_e2e.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/sdsap_receipt/exp_9_lr5e_6_no_scheduler/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/sdsap_receipt/exp_9_lr5e_6_no_scheduler/best \
--device "cuda:1" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/test_end2end/All \
--e2e /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/sdsap_receipt/exp_9_lr5e_6_no_scheduler/receipt_e2e_best.json \
--vis /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/sdsap_receipt/exp_9_lr5e_6_no_scheduler/receipt_e2e_best


python sdsvkie/tools/infer_e2e.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/best \
--device "cuda:0" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/Webcash/testing \
--e2e /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/pred_webcash_testing.json \
--vis /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_3/pred_webcash_testing


# invoice
python sdsvkie/tools/infer_e2e.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/last \
--device "cuda:0" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SS_Invoice/test \
--e2e /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/invoice_end2end_last.json


python sdsvkie/tools/infer_e2e.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/last \
--device "cuda:0" \
--img /mnt/ssd1T/hoanglv/Dataset/common_tools/Split_by_pages/multi_page/vat \
--e2e /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/pred_vat_multi_page_v2.json


python sdsvkie/tools/infer_e2e.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/last \
--device "cuda:0" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SS_Invoice/multi_page_vat/SL_HCM \
--e2e /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SS_Invoice/multi_page_vat/SL_HCM.json


python sdsvkie/tools/infer_e2e.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/last \
--device "cuda:0" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/test_ss_rm_leak \
--e2e /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/test_ss_rm_leak.json


python sdsvkie/tools/infer_e2e.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/last \
--device "cuda:0" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/test_ss \
--e2e /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/test_ss.json


python sdsvkie/tools/infer_e2e.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/last \
--device "cuda:0" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/raw/JPG/SL_HCM_Invoice \
--e2e /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/SL_HCM_Invoice.json \
--vis /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/SL_HCM_Invoice


python sdsvkie/tools/infer_e2e.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/26052023/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/26052023/best \
--device "cuda:0" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/test_ss_rm_leak \
--e2e /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/26052023/test_ss_rm_leak.json \
--vis /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/26052023/test_ss_rm_leak


python sdsvkie/tools/infer_e2e.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/26052023/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/26052023/best \
--device "cuda:0" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/raw/JPG/multi_pages/SL_HCM_batch_2 \
--e2e /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/26052023/test_ss_sl_hcm_batch_2_multi_page_jpg.json \
--vis /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/26052023/test_ss_sl_hcm_batch_2_multi_page_jpg


python sdsvkie/tools/infer_e2e.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/26052023/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/26052023/best \
--device "cuda:0" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/raw/PDF/multi_page/SL_HN_batch_2 \
--e2e /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/26052023/multi_page_SL_HN_batch_2.json


python sdsvkie/tools/infer_e2e.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/26052023/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/26052023/best \
--device "cuda:0" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/raw/PDF/multi_page/test_e2e_multi_page \
--e2e /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/26052023/test_e2e_multi_page.json


python sdsvkie/tools/infer_e2e.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/02062023/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/02062023/best \
--device "cuda:0" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/raw/PDF/multi_page/SL_HN_batch_2 \
--e2e /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/02062023/multi_page_SL_HN_batch_2.json


python sdsvkie/tools/infer_e2e.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/best \
--device "cuda:0" \
--img /mnt/hdd2T/AICR/Projects/2023/Vietinbank_POC/Invoice_JPG/ \
--e2e /mnt/hdd2T/AICR/Projects/2023/Vietinbank_POC/Invoice_KIE_Results/result.json


python sdsvkie/tools/infer_e2e.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/best \
--device "cuda:0" \
--img /mnt/hdd2T/AICR/Projects/2023/FI_Invoices/Invoice_v2_multi_page \
--e2e /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/Invoice_v2_multi_page.json


python sdsvkie/tools/infer_e2e.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/19072023/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/19072023/best \
--device "cuda:1" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/raw/PDF/Viettinbank_POC/POC_OCR/invoice_JPG/ \
--e2e /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/19072023/vietinbank_poc_infer.json \
--vis workdirs/invoice/19072023/visualize/vietinbank_infer


python sdsvkie/tools/infer_e2e.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/best \
--device "cuda:0" \
--img /mnt/hdd2T/AICR/Projects/2023/FI_Invoices/Test/1000416613/1000416613_0102310385_26062023163233062_001.pdf \
--e2e /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/1000416613_0102310385_26062023163233062_001.json
@ -1,226 +0,0 @@
[
  {
    "name": "total_in_words_key",
    "id": 92,
    "color": "#33ddff",
    "type": "any",
    "attributes": []
  },
  {
    "name": "no_value",
    "id": 93,
    "color": "#fa3253",
    "type": "any",
    "attributes": []
  },
  {
    "name": "form_key",
    "id": 94,
    "color": "#34d1b7",
    "type": "any",
    "attributes": []
  },
  {
    "name": "no_key",
    "id": 95,
    "color": "#ff007c",
    "type": "any",
    "attributes": []
  },
  {
    "name": "form_value",
    "id": 96,
    "color": "#ddff33",
    "type": "any",
    "attributes": []
  },
  {
    "name": "serial_key",
    "id": 97,
    "color": "#24b353",
    "type": "any",
    "attributes": []
  },
  {
    "name": "serial_value",
    "id": 98,
    "color": "#b83df5",
    "type": "any",
    "attributes": []
  },
  {
    "name": "date_value",
    "id": 99,
    "color": "#66ff66",
    "type": "any",
    "attributes": []
  },
  {
    "name": "seller_company_name_key",
    "id": 100,
    "color": "#32b7fa",
    "type": "any",
    "attributes": []
  },
  {
    "name": "seller_company_name_value",
    "id": 101,
    "color": "#ffcc33",
    "type": "any",
    "attributes": []
  },
  {
    "name": "seller_tax_code_key",
    "id": 102,
    "color": "#83e070",
    "type": "any",
    "attributes": []
  },
  {
    "name": "seller_tax_code_value",
    "id": 103,
    "color": "#fafa37",
    "type": "any",
    "attributes": []
  },
  {
    "name": "seller_address_value",
    "id": 104,
    "color": "#5986b3",
    "type": "any",
    "attributes": []
  },
  {
    "name": "seller_address_key",
    "id": 105,
    "color": "#8c78f0",
    "type": "any",
    "attributes": []
  },
  {
    "name": "seller_tel_key",
    "id": 106,
    "color": "#ff6a4d",
    "type": "any",
    "attributes": []
  },
  {
    "name": "seller_tel_value",
    "id": 107,
    "color": "#f078f0",
    "type": "any",
    "attributes": []
  },
  {
    "name": "buyer_personal_name_key",
    "id": 108,
    "color": "#2a7dd1",
    "type": "any",
    "attributes": []
  },
  {
    "name": "buyer_personal_name_value",
    "id": 109,
    "color": "#83e070",
    "type": "any",
    "attributes": []
  },
  {
    "name": "buyer_company_name_value",
    "id": 110,
    "color": "#5986b3",
    "type": "any",
    "attributes": []
  },
  {
    "name": "buyer_company_name_key",
    "id": 111,
    "color": "#8c78f0",
    "type": "any",
    "attributes": []
  },
  {
    "name": "buyer_tax_code_key",
    "id": 112,
    "color": "#ff6a4d",
    "type": "any",
    "attributes": []
  },
  {
    "name": "buyer_tax_code_value",
    "id": 113,
    "color": "#f078f0",
    "type": "any",
    "attributes": []
  },
  {
    "name": "buyer_address_key",
    "id": 114,
    "color": "#2a7dd1",
    "type": "any",
    "attributes": []
  },
  {
    "name": "buyer_address_value",
    "id": 115,
    "color": "#b25050",
    "type": "any",
    "attributes": []
  },
  {
    "name": "buyer_tel_key",
    "id": 116,
    "color": "#cc3366",
    "type": "any",
    "attributes": []
  },
  {
    "name": "buyer_tel_value",
    "id": 117,
    "color": "#cc9933",
    "type": "any",
    "attributes": []
  },
  {
    "name": "tax_amount_key",
    "id": 118,
    "color": "#aaf0d1",
    "type": "any",
    "attributes": []
  },
  {
    "name": "tax_amount_value",
    "id": 119,
    "color": "#ff00cc",
    "type": "any",
    "attributes": []
  },
  {
    "name": "total_key",
    "id": 120,
    "color": "#3df53d",
    "type": "any",
    "attributes": []
  },
  {
    "name": "total_value",
    "id": 121,
    "color": "#fa32b7",
    "type": "any",
    "attributes": []
  },
  {
    "name": "total_in_words_value",
    "id": 122,
    "color": "#3d3df5",
    "type": "any",
    "attributes": []
  },
  {
    "name": "other",
    "id": 123,
    "color": "#733380",
    "type": "any",
    "attributes": []
  }
]
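This reads like an annotation-tool label specification: one entry per class with a numeric id and a display color. A small sketch for turning it into lookup tables for label mapping or visualization; the file name labels.json is an assumption:

import json

with open("labels.json", encoding="utf-8") as f:  # assumed file name
    labels = json.load(f)

# Build name -> id and name -> color maps from the spec above.
name2id = {lbl["name"]: lbl["id"] for lbl in labels}
name2color = {lbl["name"]: lbl["color"] for lbl in labels}
print(name2id["total_key"], name2color["total_key"])  # -> 120 #3df53d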
@ -1,10 +0,0 @@
python sdsvkie/tools/postprocess_e2e_label.py \
--cfg workdirs/invoice/vnpt_exp_4/config.yaml \
--input /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SS_Invoice/test_end2end/test_e2e.json \
--out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SS_Invoice/test_end2end/test_e2e_post.json


python sdsvkie/tools/postprocess_e2e_label.py \
--cfg workdirs/invoice/vnpt_exp_4/config.yaml \
--input /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/invoice_exp4/test_end2end_post.json \
--out /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/invoice_exp4/test_end2end_post.json
@ -1,36 +0,0 @@
python sdsvkie/tools/infer.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/exp_wild_1/config.yaml \
--inference_weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/exp_wild_1/best \
--device "cuda:0" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/processed/batch_2/Good/Food \
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/labeling/Pseudo/batch_2/Good/Food


python sdsvkie/tools/infer.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_2/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/training/sdsap_receipt/exp_2/best \
--device "cuda:0" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/processed/batch_1/Good/Taxi_sub_2 \
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/labeling/Pseudo/batch_1/Good/Taxi_sub_2


python sdsvkie/tools/infer.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/best \
--device "cuda:1" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/intermediate/multi_pages/SL_HCM_batch_2_first_last_page \
--vis_out /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/SL_HCM_batch_2_first_last_page_vis \
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/SL_HCM_batch_2_first_last_page_txt \
--kie_wordgroup_out


python sdsvkie/tools/infer.py \
--cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/config.yaml \
--weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/best \
--device "cuda:1" \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/raw/JPG/invoice_viettinbank_poc \
--vis_out /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/invoice_viettinbank_poc_vis \
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/invoice_viettinbank_poc_txt \
--kie_wordgroup_out
@ -1,19 +0,0 @@
python sdsvkie/tools/run_ocr.py \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/raw/IMGS \
--out_dir /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/SDSAP_Invoice/visualize_ocr \
--device "cuda:1" \
--reserve_parent_dir


# python sdsvkie/tools/run_ocr.py \
# --img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/raw/IMGS \
# --out_dir /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/SDSAP_Invoice/visualize_ocr \
# --device "cuda:1" \
# --reserve_parent_dir


python sdsvkie/tools/run_ocr.py \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/done/test_end2end/Taxi \
--device "cuda:1" \
--out_dir /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/label_ocr/test_v2/taxi \
--text_det "/mnt/ssd500/datnt/mmdetection/logs/textdet-fwd-table-receipt-20230425/best_bbox_mAP_epoch_15_lite.pth"
@ -1,35 +0,0 @@
python sdsvkie/tools/run_ocr.py \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/WildReceipt/pseudo_ocr \
--out_dir ./workdirs/visualize/WildReceipt \
--device "cuda:0" \
--out_txt /mnt/ssd1T/hoanglv/Projects/KIE/DATA/WildReceipt/pseudo_ocr


python sdsvkie/tools/run_ocr.py \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/processed/batch_1/Good/Taxi_sub_1 \
--device "cuda:0" \
--out_txt /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/labeling/Pseudo_OCR/Batch_1_Good/Taxi_sub_1


# pseudo OCR labels for editing text detection boxes
python sdsvkie/tools/run_ocr.py \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/label_ocr/all/batch_1/taxi_sub_1 \
--device "cuda:0" \
--out_txt /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/label_ocr/all/batch_1/taxi_sub_1


python sdsvkie/tools/run_ocr.py \
--img /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/label_ocr/all/batch_1/food \
--device "cuda:0" \
--out_txt /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/label_ocr/all/batch_1/food


python sdsvkie/tools/run_ocr.py \
--img /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/demos/invoice \
--device "cuda:0" \
--out_txt /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/demos/invoice
@ -1,15 +0,0 @@
python sdsvkie/utils/pdf2image.py \
--pdf_dir /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/raw/batch_1/PDF \
--out_dir /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/raw/batch_1/IMGS_dpi_300 \
--reserve_parent_dir


python sdsvkie/utils/pdf2image.py \
--pdf_dir /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/raw/PDF/multi_page/Invoices_SL_HCM \
--out_dir /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/raw/JPG/Invoices_SL_HCM


python sdsvkie/utils/pdf2image.py \
--pdf_dir /mnt/hdd2T/AICR/Projects/2023/Invoice_SDSV/SDSV_Invoice_2023/All \
--out_dir /mnt/hdd2T/AICR/Projects/2023/Invoice_SDSV/SDSV_Invoice_2023/JPG
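The custom pdf2image.py utility is not shown here; a minimal sketch of the same conversion using the pdf2image library (an assumption, not necessarily what the script does; the directory names are placeholders):

from pathlib import Path
from pdf2image import convert_from_path  # pip install pdf2image (needs poppler)

pdf_dir, out_dir = Path("PDF"), Path("JPG")  # hypothetical directories
out_dir.mkdir(parents=True, exist_ok=True)
for pdf in pdf_dir.glob("*.pdf"):
    # One JPG per page, e.g. name_1.jpg, name_2.jpg, ...
    for i, page in enumerate(convert_from_path(pdf, dpi=300), start=1):
        page.save(out_dir / f"{pdf.stem}_{i}.jpg", "JPEG")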
@ -1,25 +0,0 @@
python sdsvkie/utils/split_data.py \
--path /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/processed/batch_1/Good/Food \
--out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/done/v2/ \
--test_ratio 0.05


python sdsvkie/utils/split_data.py \
--path /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/processed/batch_1/Good/Taxi_sub_1 \
--out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/done/v2/ \
--test_ratio 0.05


python sdsvkie/utils/split_data.py \
--path /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/processed/batch_1/Good/Taxi_sub_2 \
--out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/done/v2/ \
--test_ratio 0.05


python sdsvkie/utils/split_data.py \
--path /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/processed/batch_2/Good/Food \
--out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/done/v2/ \
--test_ratio 0.05


python sdsvkie/utils/split_data.py \
--path /mnt/ssd1T/hoanglv/Projects/KIE/DATA/WildReceipt/re_labeling/batches/batch_1 \
--out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/done/v2/ \
--test_ratio 0.0
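A sketch of the split these commands presumably perform: sample a test_ratio fraction of the image/label pairs into test/, the rest into train/. The directory layout and copy behavior are assumptions inferred from the flags, not the actual split_data.py implementation:

import random
import shutil
from pathlib import Path

def split_data(path, out, test_ratio=0.05, seed=0):
    """Copy image/label pairs into out/train and out/test (assumed behavior)."""
    imgs = [p for p in Path(path).glob("*.jpg") if p.with_suffix(".txt").exists()]
    random.Random(seed).shuffle(imgs)
    n_test = int(len(imgs) * test_ratio)
    for i, img in enumerate(imgs):
        dst = Path(out) / ("test" if i < n_test else "train")
        dst.mkdir(parents=True, exist_ok=True)
        shutil.copy(img, dst / img.name)
        shutil.copy(img.with_suffix(".txt"), dst / img.with_suffix(".txt").name)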
@ -1,7 +0,0 @@
python sdsvkie/tools/train.py --cfg sdsvkie/cfg/wildreciept.yaml --device cuda:0 --save_dir workdirs/invoice/exp_wild_1


python sdsvkie/tools/train.py --cfg sdsvkie/cfg/sdsap_receipt_scheduler_linear.yaml --device cuda:0 --save_dir workdirs/sdsap_receipt/exp_4_scheduler_linear


python sdsvkie/tools/train.py --cfg sdsvkie/cfg/sdsap_receipt.yaml --device cuda:1 --save_dir workdirs/sdsap_receipt/exp_5
@ -1,5 +0,0 @@
import os
import sys

# Append the package directory (not the file itself) so imports resolve.
sys.path.append(os.path.dirname(__file__))

from .engine.predictor import Predictor
@ -1,33 +0,0 @@
from types import SimpleNamespace
from pathlib import Path
from copy import deepcopy

from sdsvkie.utils.io_file import yaml_load


def load_cfg(cfg, args=None):
    """
    Convert a configuration object to a dictionary, whether it is a file path,
    a string, or a SimpleNamespace object.

    Inputs:
        cfg (str) or (Path) or (SimpleNamespace): Configuration object to be converted to a dictionary.
        args (dict, optional): CLI overrides; non-None values replace config entries.
    Returns:
        cfg (dict): Configuration object in dictionary format.
    """
    if isinstance(cfg, (str, Path)):
        cfg = yaml_load(cfg)  # load dict
    elif isinstance(cfg, SimpleNamespace):
        cfg = vars(cfg)  # convert to dict

    if args is not None:
        _args = deepcopy(args)
        for k, v in args.items():
            # Drop empty overrides and the config-path argument itself.
            if v is None or k == "cfg":
                _args.pop(k)
            # The CLI exposes --weights; internally the key is inference_weights.
            elif k == "weights":
                _args["inference_weights"] = v
                _args.pop(k)

        cfg.update(_args)
    return cfg
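A usage sketch (paths hypothetical): values passed via `args` override the YAML, and a `weights` argument is remapped to `inference_weights`:

cfg = load_cfg(
    "workdirs/invoice/exp1/config.yaml",
    args={"device": "cuda:0", "weights": "workdirs/invoice/exp1/best", "cfg": None},
)
assert cfg["device"] == "cuda:0"
assert cfg["inference_weights"] == "workdirs/invoice/exp1/best"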
@ -1,39 +0,0 @@
no_key
no_value
form_key
form_value
serial_key
serial_value
date_key
date_value
subtotal_key
subtotal_value
tax_rate_key
tax_rate_value
tax_amount_key
tax_amount_value
tips_key
tips_value
total_key
total_value
total_in_words_key
total_in_words_value
seller_company_name_key
seller_company_name_value
seller_address_key
seller_address_value
seller_tel_key
seller_tel_value
seller_tax_code_key
seller_tax_code_value
buyer_company_name_key
buyer_company_name_value
buyer_personal_name_key
buyer_personal_name_value
buyer_tax_code_key
buyer_tax_code_value
buyer_address_key
buyer_address_value
buyer_tel_key
buyer_tel_value
other
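These 39 names define the full label space; the `classes` lists in the configs below repeat subsets of them. A small sketch for building the id2label/label2id maps a token-classification head needs; the file name classes.txt is an assumption:

with open("classes.txt", encoding="utf-8") as f:  # assumed file name
    classes = [line.strip() for line in f if line.strip()]

id2label = dict(enumerate(classes))
label2id = {name: i for i, name in id2label.items()}
print(len(classes), id2label[0])  # -> 39 no_key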
@ -1,79 +0,0 @@
debug: False
v3: False
# common
device: 'cpu' # 'cpu' / 'cuda:0' / 'cuda:1' / 'cuda'

#dataset
train_dir: /mnt/ssd1T/hoanglv/Projects/KIE/DATA/train_with_vnpt
val_dir: /mnt/ssd1T/hoanglv/Projects/KIE/DATA/test
slice_interval: 75
postprocess_type: invoice_postprocess
classes: [
    # invoice identification
    'no_key',      # invoice number
    'no_value',
    'form_key',    # invoice form number
    'form_value',
    'serial_key',  # invoice serial number
    'serial_value',
    'date',

    # seller info
    'seller_company_name_key',
    'seller_company_name_value',
    'seller_tax_code_key',
    'seller_tax_code_value',
    'seller_address_value',
    'seller_address_key',
    'seller_mobile_key',
    'seller_mobile_value',

    # buyer info
    'buyer_name_key',
    'buyer_name_value',
    'buyer_company_name_value',
    'buyer_company_name_key',
    'buyer_tax_code_key',
    'buyer_tax_code_value',
    'buyer_address_key',
    'buyer_address_value',
    'buyer_mobile_key',
    'buyer_mobile_value',

    # money info
    'VAT_amount_key',
    'VAT_amount_value',
    'total_key',
    'total_value',
    'total_in_words_key',
    'total_in_words_value',

    'other',
]
sampling: true # sliding-window sampling to handle documents longer than 512 tokens

#model
img_size: 224 # fixed
max_seq_length: 512 # fixed
max_num_words: 150
tokenizer_weights: "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/microsoft/microsoft/layoutxlm-base" # fixed
weights: "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/microsoft/microsoft/layoutxlm-base"

# opt + scheduler
batch_size: 8
epochs: 100
lr: 5.0e-6
shuffle: True
num_workers: 4
scheduler: False # False or "linear" / "cosine" / "cosine_with_restarts" / "polynomial" / "constant" / "constant_with_warmup" / "inverse_sqrt"
save_dir: workdirs/invoice/exp1
save_weight_interval: 10
eval_delay: 0
wandb: null

# inference
inference_weights: null
text_det: yolox-s-general-text-pretrain-20221226
text_reg: satrn-lite-general-pretrain-20230106
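These config files are plain YAML, so they can be inspected or overridden directly. A minimal sketch with PyYAML (the repo itself goes through its own yaml_load/load_cfg helpers, and the path here is hypothetical):

import yaml  # pip install pyyaml

with open("sdsvkie/cfg/invoice.yaml", encoding="utf-8") as f:  # hypothetical path
    cfg = yaml.safe_load(f)

cfg["device"] = "cuda:0"  # override the 'cpu' default before training
print(len(cfg["classes"]), cfg["lr"], cfg["scheduler"])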
@ -1,74 +0,0 @@
debug: False
v3: False
# common
device: 'cpu' # 'cpu' / 'cuda:0' / 'cuda:1' / 'cuda'

#dataset
train_dir: /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/train
val_dir: /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/test_ss
slice_interval: 75
postprocess_type: invoice_postprocess
classes: [
    'no_key',
    'no_value',
    'form_key',
    'form_value',
    'serial_key',
    'serial_value',
    'date_value',

    'seller_company_name_key',
    'seller_company_name_value',
    'seller_tax_code_key',
    'seller_tax_code_value',
    'seller_address_value',
    'seller_address_key',
    'seller_tel_key',
    'seller_tel_value',

    'buyer_personal_name_key',
    'buyer_personal_name_value',
    'buyer_company_name_value',
    'buyer_company_name_key',
    'buyer_tax_code_key',
    'buyer_tax_code_value',
    'buyer_address_key',
    'buyer_address_value',
    'buyer_tel_key',
    'buyer_tel_value',
    'tax_amount_key',
    'tax_amount_value',
    'total_key',
    'total_value',
    'total_in_words_key',
    'total_in_words_value',
    'other'
]

sampling: true # sliding-window sampling to handle documents longer than 512 tokens

#model
img_size: 224 # fixed
max_seq_length: 512 # fixed
max_num_words: 145
tokenizer_weights: "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/microsoft/microsoft/layoutxlm-base" # fixed
weights: "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/microsoft/microsoft/layoutxlm-base"

# opt + scheduler
batch_size: 8
epochs: 100
lr: 5.0e-6
shuffle: True
num_workers: 4
scheduler: "linear" # False or "linear" / "cosine" / "cosine_with_restarts" / "polynomial" / "constant" / "constant_with_warmup" / "inverse_sqrt"
save_dir: workdirs/invoice/19072023
save_weight_interval: 10
eval_delay: 50
wandb: null

# inference
inference_weights: null
text_det: yolox-s-general-text-pretrain-20221226
text_reg: satrn-lite-general-pretrain-20230106
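The scheduler options listed in the comment mirror Hugging Face transformers scheduler names. A sketch of how a value like `scheduler: "linear"` is typically materialized; the optimizer and step counts here are placeholders, and whether train.py does exactly this is an assumption:

import torch
from transformers import get_scheduler  # transformers >= 4.26 for "inverse_sqrt"

model = torch.nn.Linear(4, 2)  # stand-in model
optimizer = torch.optim.AdamW(model.parameters(), lr=5.0e-6)
scheduler = get_scheduler(
    "linear", optimizer, num_warmup_steps=0, num_training_steps=1000
)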
@ -1,74 +0,0 @@
debug: False
v3: False
# common
device: 'cpu' # 'cpu' / 'cuda:0' / 'cuda:1' / 'cuda'

#dataset
train_dir: /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/train
val_dir: /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/test_ss
slice_interval: 75
postprocess_type: invoice_postprocess
classes: [
    'no_key',
    'no_value',
    'form_key',
    'form_value',
    'serial_key',
    'serial_value',
    'date_value',

    'seller_company_name_key',
    'seller_company_name_value',
    'seller_tax_code_key',
    'seller_tax_code_value',
    'seller_address_value',
    'seller_address_key',
    'seller_tel_key',
    'seller_tel_value',

    'buyer_personal_name_key',
    'buyer_personal_name_value',
    'buyer_company_name_value',
    'buyer_company_name_key',
    'buyer_tax_code_key',
    'buyer_tax_code_value',
    'buyer_address_key',
    'buyer_address_value',
    'buyer_tel_key',
    'buyer_tel_value',
    'tax_amount_key',
    'tax_amount_value',
    'total_key',
    'total_value',
    'total_in_words_key',
    'total_in_words_value',
    'other'
]

sampling: true # sliding-window sampling to handle documents longer than 512 tokens

#model
img_size: 224 # fixed
max_seq_length: 512 # fixed
max_num_words: 145
tokenizer_weights: "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/microsoft/microsoft/layoutxlm-base" # fixed
weights: "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/microsoft/microsoft/layoutxlm-base"

# opt + scheduler
batch_size: 8
epochs: 100
lr: 5.0e-6
shuffle: True
num_workers: 4
scheduler: "cosine" # False or "linear" / "cosine" / "cosine_with_restarts" / "polynomial" / "constant" / "constant_with_warmup" / "inverse_sqrt"
save_dir: workdirs/invoice/26052023
save_weight_interval: 10
eval_delay: 50
wandb: null

# inference
inference_weights: null
text_det: yolox-s-general-text-pretrain-20221226
text_reg: satrn-lite-general-pretrain-20230106
@ -1,74 +0,0 @@
debug: False
v3: False
# common
device: 'cpu' # 'cpu' / 'cuda:0' / 'cuda:1' / 'cuda'

#dataset
train_dir: /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/train
val_dir: /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/test_ss
slice_interval: 75
postprocess_type: invoice_postprocess
classes: [
    'no_key',
    'no_value',
    'form_key',
    'form_value',
    'serial_key',
    'serial_value',
    'date_value',

    'seller_company_name_key',
    'seller_company_name_value',
    'seller_tax_code_key',
    'seller_tax_code_value',
    'seller_address_value',
    'seller_address_key',
    'seller_tel_key',
    'seller_tel_value',

    'buyer_personal_name_key',
    'buyer_personal_name_value',
    'buyer_company_name_value',
    'buyer_company_name_key',
    'buyer_tax_code_key',
    'buyer_tax_code_value',
    'buyer_address_key',
    'buyer_address_value',
    'buyer_tel_key',
    'buyer_tel_value',
    'tax_amount_key',
    'tax_amount_value',
    'total_key',
    'total_value',
    'total_in_words_key',
    'total_in_words_value',
    'other'
]

sampling: true # sliding-window sampling to handle documents longer than 512 tokens

#model
img_size: 224 # fixed
max_seq_length: 512 # fixed
max_num_words: 150
tokenizer_weights: "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/microsoft/microsoft/layoutxlm-base" # fixed
weights: "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/vnpt_exp_4/last"

# opt + scheduler
batch_size: 8
epochs: 100
lr: 1.0e-5
shuffle: True
num_workers: 4
scheduler: "cosine" # False or "linear" / "cosine" / "cosine_with_restarts" / "polynomial" / "constant" / "constant_with_warmup" / "inverse_sqrt"
save_dir: workdirs/invoice/add_sl_hcm
save_weight_interval: 1
eval_delay: 1
wandb: null

# inference
inference_weights: null
text_det: yolox-s-general-text-pretrain-20221226
text_reg: satrn-lite-general-pretrain-20230106
@ -1,47 +0,0 @@
debug: False
v3: False

# common
device: 'cpu' # 'cpu' / 'cuda:0' / 'cuda:1' / 'cuda'

#dataset
train_dir: /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed/train
val_dir: /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed/test
slice_interval: 75
postprocess_type: receipt_postprocess

classes: [
    "seller_company_name_value",
    "no_value",
    "date_value",
    "total_key",
    "total_value",
    "other"
]
sampling: true # sliding-window sampling to handle documents longer than 512 tokens

#model
img_size: 224 # fixed
max_seq_length: 512 # fixed
max_num_words: 150
tokenizer_weights: "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/microsoft/microsoft/layoutxlm-base" # fixed
weights: "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/microsoft/microsoft/layoutxlm-base"

# opt + scheduler
batch_size: 8
epochs: 100
lr: 5.0e-6
scheduler: False
shuffle: True
num_workers: 4
save_dir: workdirs/receipt/13062023
save_weight_interval: 10
eval_delay: 0
wandb: null

# inference
inference_weights: null
text_det: yolox-s-general-text-pretrain-20221226
text_reg: satrn-lite-general-pretrain-20230106
@ -1,47 +0,0 @@
debug: False
v3: False

# common
device: 'cpu' # 'cpu' / 'cuda:0' / 'cuda:1' / 'cuda'

#dataset
train_dir: /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed/train
val_dir: /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/processed/test
slice_interval: 75
postprocess_type: receipt_postprocess

classes: [
    "seller_company_name_value",
    "no_value",
    "date_value",
    "total_key",
    "total_value",
    "other"
]
sampling: true # sliding-window sampling to handle documents longer than 512 tokens

#model
img_size: 224 # fixed
max_seq_length: 512 # fixed
max_num_words: 150
tokenizer_weights: "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/microsoft/microsoft/layoutxlm-base" # fixed
weights: "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/receipt/13062023_4/last"

# opt + scheduler
batch_size: 8
epochs: 30
lr: 3.0e-6
scheduler: False
shuffle: True
num_workers: 4
save_dir: workdirs/receipt/13062023
save_weight_interval: 10
eval_delay: 0
wandb: null

# inference
inference_weights: null
text_det: yolox-s-general-text-pretrain-20221226
text_reg: satrn-lite-general-pretrain-20230106
@ -1,47 +0,0 @@
debug: False
v3: False

# common
device: 'cpu' # 'cpu' / 'cuda:0' / 'cuda:1' / 'cuda'

#dataset
train_dir: /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/train
val_dir: /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/test
slice_interval: 75
postprocess_type: receipt_postprocess

classes: [
    "Seller_company_name_value",
    "ID_value",
    "Date_value",
    "Total_key",
    "Total_value",
    "Other"
]
sampling: true # sliding-window sampling to handle documents longer than 512 tokens

#model
img_size: 224 # fixed
max_seq_length: 512 # fixed
max_num_words: 150
tokenizer_weights: "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/microsoft/microsoft/layoutxlm-base" # fixed
weights: "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/microsoft/microsoft/layoutxlm-base"

# opt + scheduler
batch_size: 8
epochs: 100
lr: 9.0e-6
scheduler: "linear"
shuffle: True
num_workers: 4
save_dir: workdirs/invoice/exp1
save_weight_interval: 10
eval_delay: 50
wandb: null

# inference
inference_weights: null
text_det: yolox-s-general-text-pretrain-20221226
text_reg: satrn-lite-general-pretrain-20230106
@ -1,47 +0,0 @@
debug: False
v3: False

# common
device: 'cpu' # 'cpu' / 'cuda:0' / 'cuda:1' / 'cuda'

#dataset
train_dir: /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/train
val_dir: /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Receipt/test
slice_interval: 75
postprocess_type: receipt_postprocess

classes: [
    "Store_name_value",
    "id",
    "Date_value",
    "Total_key",
    "Total_value",
    "Others"
]
sampling: true # sliding-window sampling to handle documents longer than 512 tokens

#model
img_size: 224 # fixed
max_seq_length: 512 # fixed
max_num_words: 150
tokenizer_weights: "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/microsoft/microsoft/layoutxlm-base" # fixed
weights: "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/microsoft/microsoft/layoutxlm-base"

# opt + scheduler
batch_size: 8
epochs: 100
lr: 9.0e-6
scheduler: "linear"
shuffle: True
num_workers: 4
save_dir: workdirs/invoice/exp1
save_weight_interval: 10
eval_delay: 50
wandb: null

# inference
inference_weights: null
text_det: yolox-s-general-text-pretrain-20221226
text_reg: satrn-lite-general-pretrain-20230106
@ -1,47 +0,0 @@
debug: False
v3: False
scheduler: False
# common
device: 'cpu' # 'cpu' / 'cuda:0' / 'cuda:1' / 'cuda'

#dataset
train_dir: /mnt/ssd1T/hoanglv/Projects/KIE/DATA/WildReceipt/v1/train
val_dir: /mnt/ssd1T/hoanglv/Projects/KIE/DATA/WildReceipt/v1/test
slice_interval: 75
postprocess_type: receipt_postprocess

classes: [
    "Store_name_value",
    "Date_value",
    "Total_key",
    "Total_value",
    "Others"
]
sampling: True # sliding-window sampling to handle documents longer than 512 tokens

#model
img_size: 224 # fixed
max_seq_length: 512 # fixed
max_num_words: 150

# tokenizer_weights: "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/microsoft/layoutlmv3-base" # fixed
# weights: "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/microsoft/layoutlmv3-base"
tokenizer_weights: "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/microsoft/microsoft/layoutxlm-base" # fixed
weights: "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/microsoft/microsoft/layoutxlm-base"

# opt + scheduler
batch_size: 8
epochs: 100
lr: 5.0e-6
shuffle: True
num_workers: 4
save_dir: workdirs/invoice/exp1
save_weight_interval: 10
eval_delay: 0
wandb: null

# inference
inference_weights: null
text_det: yolox-s-general-text-pretrain-20221226
text_reg: satrn-lite-general-pretrain-20230106
|
|
||||||
|
|
@ -1,314 +0,0 @@
import logging
import os
import pickle
import random
from copy import deepcopy
from pathlib import Path

import cv2
import pandas as pd
import torch
from datasets import (Array2D, Array3D, ClassLabel, Dataset, Features,
                      Sequence, Value, concatenate_datasets)
from easydict import EasyDict
from PIL import Image
from tqdm import tqdm
from sdsvkie.utils.io_file import read_txt

from sdsvkie.utils import normalize_box, visualize_kie
from sdsvkie.utils.augmentation import perturbate_character, sampling_data
from sdsvkie.utils.word_formation import sliding_windows, sort_words
import glob

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


IMG_EXTENSION = [".jpg", ".jpeg", ".png"]


class BaseDataset:
    def __init__(self, cfg):
        self.cfg = cfg
        # LayoutLMv2/XLM expects an int64 "image" tensor; LayoutLMv3 (cfg.v3)
        # expects float32 "pixel_values" instead.
        self.feature_format = Features(
            {
                "image": Array3D(
                    dtype="int64", shape=(3, self.cfg.img_size, self.cfg.img_size)
                ),
                "input_ids": Sequence(feature=Value(dtype="int64")),
                "attention_mask": Sequence(Value(dtype="int64")),
                "bbox": Array2D(dtype="int64", shape=(self.cfg.max_seq_length, 4)),
                "labels": Sequence(ClassLabel(names=self.cfg.classes)),
            }
        ) if not self.cfg.v3 else \
            Features(
                {
                    "pixel_values": Array3D(
                        dtype="float32", shape=(3, self.cfg.img_size, self.cfg.img_size)
                    ),
                    "input_ids": Sequence(feature=Value(dtype="int64")),
                    "attention_mask": Sequence(Value(dtype="int64")),
                    "bbox": Array2D(dtype="int64", shape=(self.cfg.max_seq_length, 4)),
                    "labels": Sequence(ClassLabel(names=self.cfg.classes)),
                }
            )

        logger.info("Feature format: {}".format(self.feature_format.keys()))

    def _build_df(self, data_dir):
        """Build a dataframe of (image_path, label_path) pairs.

        Args:
            data_dir (str): structured data folder
                - data_dir
                    - img1.jpg
                    - img1.txt
                    - ...
        """
        data_dir = Path(data_dir)

        # img_paths = glob.glob("*") + glob.glob("*/*")
        img_paths = [
            path for path in list(data_dir.rglob("*"))
            if ".txt" not in str(path) and path.with_suffix(".txt").exists() and path.suffix.lower() in IMG_EXTENSION
        ]
        label_paths = [str(path.with_suffix(".txt")) for path in img_paths]
        img_paths = [str(path) for path in img_paths]

        assert len(label_paths) == len(img_paths)
        # remove examples with empty label files
        ids = [id for id in range(len(label_paths)) if len(read_txt(label_paths[id])) > 0]
        label_paths = [label_paths[id] for id in ids]
        img_paths = [img_paths[id] for id in ids]

        dataframe = pd.DataFrame.from_dict(
            {"image_path": img_paths, "label": label_paths}
        )
        return dataframe

    def build_dataloader_from_dataset(
        self,
        dataset,
        processor,
        device,
        batch_size,
        shuffle=True,
        num_workers=4,
        cache_file="./cache.pkl",
        use_sampling=False,
    ):
        if not os.path.exists(cache_file):
            self._build_cache(
                dataset,
                processor,
                cache_file=cache_file,
                max_seq_length=self.cfg.max_seq_length,
            )
        cache = self._load_cache(cache_file)
        dataset = dataset.map(
            self._prepare_data,
            fn_kwargs={"cache": cache, "sampling": use_sampling},
            remove_columns=dataset.column_names,
            features=self.feature_format,
            batched=False,
            batch_size=self.cfg.batch_size,
        )

        dataset.set_format(type="torch", device=device)
        dataloader = torch.utils.data.DataLoader(
            dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers
        )
        return dataloader

    def build_dataloader_from_dir(
        self,
        data_dir,
        processor,
        device,
        batch_size,
        shuffle=True,
        num_workers=4,
        cache_file="./cache.pkl",
        use_sampling=False,
    ):
        dataset = self._build_dataset(data_dir)
        dataloader = self.build_dataloader_from_dataset(
            dataset,
            processor,
            device,
            batch_size,
            shuffle,
            num_workers,
            cache_file,
            use_sampling,
        )
        return dataloader

    def _build_dataset(self, data_dir):
        df = self._build_df(data_dir)
        dataset = Dataset.from_pandas(df)
        logger.info(f"Load examples for {data_dir}")
        dataset = dataset.map(lambda example: self._load_example_info(example))
        return dataset

    def _build_cache(self, dataset, processor, max_seq_length, cache_file=""):
        logger.info(f"Caching {cache_file}...")
        cache = {}
        for examples in tqdm(dataset):
            encoding_inputs = self._cache_feature(examples, processor, max_seq_length)
            cache[examples["image_path"]] = encoding_inputs

        with open(cache_file, "wb") as f:
            pickle.dump(cache, f)

    def _load_cache(self, cache_file):
        with open(cache_file, "rb") as f:
            cache = pickle.load(f)
        return cache

    def _prepare_data(self, example, cache, sampling):
        encoded_inputs_windows = cache[example["image_path"]]
        if len(encoded_inputs_windows) == 0:
            raise Exception("Empty encoded_inputs_windows")
        if sampling:
            # 60%: the full-page encoding (stored last); 40%: a random window
            if random.random() < 0.6:
                encoded_inputs = encoded_inputs_windows[-1]
            else:
                encoded_inputs = random.choice(encoded_inputs_windows)
        else:
            encoded_inputs = encoded_inputs_windows[-1]
        for k, v in encoded_inputs.items():
            if k in ["image", "pixel_values"]:
                encoded_inputs[k] = encoded_inputs[k][0]
        return encoded_inputs

    def _cache_feature(self, example, processor, max_seq_length=512):
        """Sliding-window split + LayoutLM processor for one example.

        Args:
            example (dict): dict {'image_path', 'words', 'bbox', 'word_labels'}
            max_seq_length (int, optional): maximum token length per window. Defaults to 512.

        Returns:
            list[dict]: encoded inputs, one per sliding window, with the
            full-page encoding appended last
        """
        image = Image.open(example["image_path"]).convert("RGB")
        batch_words = example["words"]
        batch_boxes = example["bbox"]
        batch_labels = example["word_labels"]

        window_size = self.cfg.max_num_words
        slice_interval = self.cfg.slice_interval

        word_windows = sliding_windows(batch_words, window_size, slice_interval)
        box_windows = sliding_windows(batch_boxes, window_size, slice_interval)
        label_windows = sliding_windows(batch_labels, window_size, slice_interval)

        encoded_inputs = []
        for words, boxes, labels in zip(word_windows, box_windows, label_windows):
            # Process each window
            encoded_input = processor(
                image,
                padding="max_length",
                truncation=True,
                text=words,
                boxes=boxes,
                word_labels=labels,
                max_length=max_seq_length,
            )
            encoded_inputs.append(encoded_input)

        # full page
        encoded_input = processor(
            image,
            padding="max_length",
            truncation=True,
            text=batch_words,
            boxes=batch_boxes,
            word_labels=batch_labels,
            max_length=max_seq_length,
        )
        encoded_inputs.append(encoded_input)

        return encoded_inputs

    def _load_example_info(self, example, aug_prob=0.0):
        """Read one label file, sort its words, and attach words/boxes/labels.

        Args:
            example (dict): row with 'image_path' and 'label' paths
            aug_prob (float, optional): character-perturbation probability. Defaults to 0.0.
        """
        image_path = example["image_path"]
        label_path = example["label"]
        assert os.path.exists(image_path)
        assert os.path.exists(label_path)
        # try:
        image = cv2.imread(image_path)
        h, w, _ = image.shape
        with open(label_path) as f:
            lines = [line.replace("\n", "").replace("\r", "") for line in f.readlines()]

        words, boxes, labels = [], [], []
        for i, line in enumerate(lines):
            x1, y1, x2, y2, text, label = line.split("\t")
            box = [int(x1), int(y1), int(x2), int(y2)]
            if text != " ":
                words.append(text)
                boxes.append(box)
                labels.append(label)

        if aug_prob > 0:
            p_words = perturbate_character(words, aug_prob)
            logging.info("{} - {}".format(len(p_words), len(words)))

        # custom word sorting
        items = {
            'boxes': boxes,
            'texts': words,
            'labels': labels
        }
        # boxes, words, labels = sort_words(boxes, words, labels)
        sorted_items = sort_words(items)
        boxes, words, labels = sorted_items['boxes'], sorted_items['texts'], sorted_items['labels']

        labels = [self.cfg.classes.index(label) for label in labels]
        if self.cfg.debug:
            visualize_kie(
                img=image,
                boxes=boxes,
                pred_labels=labels,
                outdir="workdirs/debug_{}".format(  # note: was "wordirs", fixed typo
                    "val" if "train" not in image_path else "train"
                ),
                image_name=os.path.basename(image_path),
            )

        boxes = [normalize_box(box, width=w, height=h) for box in boxes]

        example["words"] = words
        example["bbox"] = boxes  # TODO: Check this
        example["word_labels"] = labels
        # except Exception as err:
        #     logger.info(f"Exception: {err} at image path: {example['image_path']}")
        #     example["words"] = []
        #     example["bbox"] = []  # TODO: Check this
        #     example["word_labels"] = []
        return example
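The 60/40 rule in `_prepare_data` above is easy to miss, so here it is in isolation; a sketch of the logic already in the file, not new behavior:

```
# Sketch of BaseDataset._prepare_data's window sampling: during training the
# full-page encoding (stored last in each cache entry) is picked with
# probability 0.6, otherwise a random window (full page included) is used.
import random

def pick_window(windows, sampling=True):
    if not sampling or random.random() < 0.6:
        return windows[-1]          # full-page encoding
    return random.choice(windows)   # any window, full page included
```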
@ -1 +0,0 @@
from .predictor import Predictor
@ -1,457 +0,0 @@
import logging
import time

import cv2
import numpy as np
from sdsvkie.utils.word_formation import merge_boxes
import torch
from easydict import EasyDict
from PIL import Image

from sdsvkie.cfg import load_cfg
# from sdsvkie.models.layoutlm import LayoutLM
from sdsvkie.models.layoutlmv2 import LayoutLMv2
from sdsvkie.models.ocr import OCREngine
from sdsvkie.utils import invoice_postprocess  # invoice
from sdsvkie.utils import receipt_postprocess  # receipt
from sdsvkie.utils import POSTPROCESS_FUNC
from sdsvkie.utils import (Word, construct_word_groups_to_kie_label,
                           normalize_box, sliding_windows, sort_words,
                           unnormalize_box, words_to_lines,
                           pdf_to_image)


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class Predictor:
    def __init__(self, cfg, **kwargs) -> None:
        """
        Args:
            cfg (dict / str): config
            **kwargs: device=..., weights=..., text_det=..., text_reg=...
        """
        if isinstance(cfg, str):
            cfg = load_cfg(cfg, kwargs)
        self.cfg = EasyDict(cfg)
        print(self.cfg)

        self.classes = self.cfg["classes"]
        self.max_num_words = self.cfg.max_num_words
        # model, processor, and OCR engine are created lazily in _init_predictor
        self.model = None
        self.processor = None
        self.ocr_engine = None

    def _init_predictor(self, model=None, processor=None, ocr_engine=None):
        if self.cfg.device != "cpu" and not torch.cuda.is_available():
            logger.info("CUDA not found, running on CPU!")
            self.cfg.device = "cpu"

        if self.cfg["inference_weights"] is None:
            logger.info(
                "inference_weights is not set, using training weights instead!"
            )
        if model is None:
            self.model = LayoutLMv2._load_model(self.cfg)
        if processor is None:
            self.processor = LayoutLMv2._load_processor(self.cfg)

        if ocr_engine is None:
            self.ocr_engine = OCREngine(
                text_det=self.cfg["text_det"],
                text_recog=self.cfg["text_reg"],
                device=self.cfg["device"],
            )

    def __call__(self, input, ocr_output=None, return_raw=False):
        """Inference KIE.
        Pipeline: Img -> OCR -> box + text (word-level) -> sort by x, y-axis -> LayoutLM -> word formation -> result

        Args:
            input (np.ndarray / list / str): BGR image (cv2), list of BGR images, or PDF path

        Returns:
            (dict): {
                'kie_raw_output':
                'kie_post_output':
                'end2end_results': (dict): {kie_label: value}
            }
        """
        if self.model is None:
            self._init_predictor()
        # check single vs. multiple images
        if isinstance(input, np.ndarray):
            final_out = self.predict_single_image(input, ocr_output=ocr_output, return_raw=return_raw)
        elif isinstance(input, list):
            items = [self.predict_single_image(im, ocr_output=ocr_output, return_raw=return_raw) for im in input]
            final_out = self.aggregate_outputs(items)
        else:  # pdf
            t1 = time.time()
            imgs = pdf_to_image(input)
            print("1. pdf2img: ", round(time.time() - t1, 4))
            t2 = time.time()
            items = [self.predict_single_image(im, ocr_output=ocr_output, return_raw=return_raw) for im in imgs]
            final_out = self.aggregate_outputs(items)
            print("2. kie: ", round(time.time() - t2, 4))
            print(f"3. full pipeline for {len(imgs)} pages: {round(time.time() - t1, 4)}")
        return final_out

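Besides a single BGR image, `__call__` above accepts a list of images or a PDF path, with pages merged by `aggregate_outputs`. A hedged usage sketch; `predictor` is an already-constructed `Predictor` and `invoice.pdf` is a hypothetical file:

```
# Hedged sketch: multi-page inference through the removed __call__ above.
out = predictor("invoice.pdf")
fields = out["end2end_results"]             # fields merged across pages
per_page = out["end2end_result_each_page"]  # one dict per page
```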
    def predict_single_image(self, img, ocr_output=None, return_raw=False):
        if ocr_output is None:
            ocr_output = self.ocr_engine(img, extend_ratio=[0.1, 0.3], ratio_thr=5)  # handle long boxes
            # ocr_output = self.ocr_engine(img)
        kie_input = self.prepare_inputs(img, ocr_output)
        kie_output = self.predict(kie_input)
        kie_post_output = self.postprocessing(kie_output)
        formated_output = self.format_output(kie_post_output)
        output = {
            "kie_raw_output": kie_output if return_raw else None,  # raw output from the LayoutLM model
            "kie_post_output": kie_post_output if return_raw else None,  # word groups
            "end2end_results": formated_output,  # field_key + field_value
        }
        return output

    def predict(self, inputs: dict):
        """predict

        Args:
            inputs (dict): format
                {
                    'img': PIL RGB,
                    'boxes': list[],
                    'texts': list[]
                }

        Returns:
            list[Word]: list of Word objects
        """
        window_size = self.cfg.max_num_words
        slice_interval = self.cfg.slice_interval

        image, batch_boxes, batch_words = (
            inputs["img"],
            inputs["boxes"],
            inputs["texts"],
        )
        results = []
        non_norm_boxes = inputs['non_norm_boxes']
        if len(batch_boxes) == 0:
            logger.info("No words found in image! Continue...")
            return results

        text_windows = sliding_windows(batch_words, window_size, slice_interval)
        box_windows = sliding_windows(batch_boxes, window_size, slice_interval)

        out_boxes_windows = []
        out_logits_windows = []

        for i in range(len(text_windows)):
            words = text_windows[i]  # len: MAX_N_WORDS
            boxes = box_windows[i]

            # Preprocess
            dummy_word_labels = [0] * len(words)
            encoding = self.processor(
                image,
                text=words,
                boxes=boxes,
                word_labels=dummy_word_labels,
                return_tensors="pt",
                padding="max_length",
                truncation=True,
                max_length=self.cfg.max_seq_length,
            )

            label_ = encoding.pop('labels')
            # Run model
            for k, v in encoding.items():
                encoding[k] = v.to(self.cfg.device)
            with torch.no_grad():
                output = self.model(**encoding)

            logits = output.logits.squeeze()  # seq_len * classes
            predictions = output.logits.argmax(-1).squeeze().tolist()
            token_boxes = encoding.bbox.squeeze().tolist()

            # Postprocess: keep only the first token of each word
            # is_subword = (encoding["labels"] == -100).detach().cpu()[0]
            is_subword = (label_ == -100).detach().cpu()[0]
            logit_predictions = logits[torch.logical_not(is_subword), :]
            true_boxes = torch.Tensor(
                [
                    unnormalize_box(box, image.size[0], image.size[1])
                    for idx, box in enumerate(token_boxes)
                    if not is_subword[idx]
                ]
            )

            out_boxes_windows.append(true_boxes)
            out_logits_windows.append(logit_predictions)

        # merge outputs by averaging logits over the overlap between windows
        merged_out_boxes = out_boxes_windows[0]
        merged_out_logit = out_logits_windows[0]
        overlap = window_size - slice_interval
        for i in range(1, len(out_boxes_windows)):
            if overlap != 0:
                prev_overlap_logits = merged_out_logit[-overlap:, :]
                curr_overlap_logits = out_logits_windows[i][:overlap, :]
                avg_overlap_logits = (
                    prev_overlap_logits + curr_overlap_logits
                ) / 2
                curr_logits = torch.cat(
                    [avg_overlap_logits, out_logits_windows[i][overlap:, :]], dim=0
                )
                merged_out_logit = torch.cat(
                    [merged_out_logit[:-overlap, :], curr_logits], dim=0
                )
            else:
                merged_out_logit = torch.cat(
                    [merged_out_logit, out_logits_windows[i]], dim=0
                )

            merged_out_boxes = torch.cat(
                [merged_out_boxes, out_boxes_windows[i][overlap:, :]], dim=0
            )

        assert len(merged_out_logit) == len(
            merged_out_boxes
        ), f"{len(merged_out_logit)} # {len(merged_out_boxes)}"
        predictions = merged_out_logit.argmax(-1).squeeze().tolist()
        if not isinstance(predictions, list):
            predictions = [predictions]

        assert len(predictions) == len(batch_words), f"{len(predictions)} # {len(batch_words)}"
        for word_index, word in enumerate(batch_words):
            # bndbox = [int(coord) for coord in merged_out_boxes[word_index]]
            bndbox = non_norm_boxes[word_index]
            kie_label = self.cfg.classes[predictions[word_index]]
            results.append(
                Word(
                    text=word,
                    bndbox=bndbox,
                    kie_label=kie_label,
                    conf_cls=inputs["recog_confs"][word_index]
                )
            )

        return results

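A worked sketch of the overlap averaging in `predict` above: with `window_size=150` and `slice_interval=75`, the last 75 logit rows of one window cover the same words as the first 75 rows of the next, and the two are averaged before merging. The class count of 6 follows the receipt config above:

```
# Worked example of the logit-merging step in predict().
import torch

window_size, slice_interval = 150, 75
overlap = window_size - slice_interval  # 75 shared word positions
prev = torch.zeros(150, 6)              # logits of window i-1 (150 words, 6 classes)
curr = torch.ones(150, 6)               # logits of window i

avg = (prev[-overlap:, :] + curr[:overlap, :]) / 2
merged = torch.cat([prev[:-overlap, :], avg, curr[overlap:, :]], dim=0)
print(merged.shape)  # torch.Size([225, 6])
```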
    def postprocessing(self, words):
        """Post-processing for KIE output:
        merge words into word groups and group them by field.

        Args:
            words (list[Word]): output of `predict`
        """
        list_lines, _ = words_to_lines(words)
        list_word_group = []
        for line in list_lines:
            for word_group in line.list_word_groups:
                word_group.update_kie_label()
                word_group.update_conf()
                list_word_group.append(word_group)

        kie_dict = construct_word_groups_to_kie_label(list_word_group)
        # receipt postprocess
        if 'postprocess_type' in self.cfg and self.cfg.postprocess_type == "receipt_postprocess":
            kie_dict = receipt_postprocess(kie_dict, words)
        else:  # invoice postprocess
            # kie_dict = invoice_postprocess(kie_dict)
            kie_dict = self._postprocess_kie_wordgroups(kie_dict, doc_type=self.cfg.postprocess_type)
        return kie_dict

    def format_output(self, kie_dict):
        """
        Args:
            kie_dict (dict): format
                {
                    'field_name': list[WordGroup]
                }

        Returns:
            (dict): format
                {
                    'field_name': {
                        'box': list,
                        'value': str,
                        'conf': float
                    }
                }
        """
        end2end_results = {}
        filtered_dict = {k: v for k, v in kie_dict.items() if "key" not in k}
        for field_name, wg_list in filtered_dict.items():
            wg_list = [wg_list] if not isinstance(wg_list, list) else wg_list

            if len(wg_list) == 0:
                text, conf, box = "", 0.0, []
            else:
                text = " ".join([wg.text for wg in wg_list]).strip().replace("✪", " ")
                conf = sum(wg.conf for wg in wg_list) / len(wg_list)
                box = merge_boxes([wg.boundingbox for wg in wg_list])

            end2end_results[field_name] = {
                "box": box,
                "value": text,
                "conf": conf
            }

        # add empty values for missing fields
        for class_name in self.classes:
            if "key" not in class_name and class_name not in end2end_results and class_name.lower() not in ['other', 'others']:
                end2end_results[class_name] = {
                    "box": [],
                    "value": "",
                    "conf": 0.0
                }

        end2end_results = self._postprocess_recognized_text(end2end_results, doc_type=self.cfg.postprocess_type)
        # sort by keys
        end2end_results = dict(sorted(end2end_results.items()))
        return end2end_results

    def _postprocess_kie_wordgroups(self, result, doc_type, metadata=None):
        """Post-process word-group outputs per field.

        Args:
            result (dict): {'field_name': list[WordGroup]}
            doc_type (str): invoice / receipt
            metadata (dict, optional): extra context for the per-field functions. Defaults to None.

        Returns:
            dict: result with per-field word-group post-processing applied
        """
        for field_name in result.keys():
            if field_name not in POSTPROCESS_FUNC[doc_type]:
                continue
            postprocess_func = POSTPROCESS_FUNC[doc_type][field_name].get("wordgroup", None)
            if postprocess_func is None:
                continue
            result[field_name] = postprocess_func(result[field_name], metadata={"field_name": field_name, "wg_res": result})
        return result

    def _postprocess_recognized_text(self, result, doc_type, metadata=None):
        for field_name in result.keys():
            if field_name not in POSTPROCESS_FUNC[doc_type]:
                continue
            postprocess_func = POSTPROCESS_FUNC[doc_type][field_name].get("text", None)
            if postprocess_func is None:
                continue
            result[field_name]["value"] = postprocess_func(result[field_name]['value'], metadata)
        return result

    def prepare_inputs(self, img, ocr_output):
        """Prepare input for the KIE model.

        Args:
            img (np.ndarray): BGR image
            ocr_output (dict): format
                {
                    "img_path": img_path,
                    "img": image,
                    "boxes": boxes,
                    "texts": words,
                    "kie_labels": word_labels
                }
        """
        assert "boxes" in ocr_output, "boxes does not exist in ocr_output"
        assert "texts" in ocr_output, "texts does not exist in ocr_output"

        # cv2 to PIL (RGB)
        img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        w, h = img.size
        texts = ocr_output["texts"]
        boxes = ocr_output["boxes"]
        recog_confs = ocr_output['recog_confs']
        det_confs = ocr_output['det_confs']
        texts = [text.replace(" ", "✪") for text in texts]  # LayoutLM throws an error if the input contains space characters
        word_items = {
            'boxes': boxes,
            'texts': texts,
            'det_confs': det_confs,
            'recog_confs': recog_confs
        }
        sorted_word_items = sort_words(word_items)
        (boxes, texts, det_confs, recog_confs) = (
            sorted_word_items['boxes'],
            sorted_word_items['texts'],
            sorted_word_items['det_confs'],
            sorted_word_items['recog_confs']
        )
        non_norm_boxes = sorted_word_items['boxes']
        boxes = [normalize_box(box, width=w, height=h) for box in boxes]
        out_item = {
            "img": img,
            "boxes": np.array(boxes),
            "texts": texts,
            "det_confs": det_confs,
            "recog_confs": recog_confs,
            "non_norm_boxes": non_norm_boxes
        }
        return out_item

    def aggregate_outputs(self, outs):
        """Postprocess the outputs of multiple pages.

        Args:
            outs (list[dict]): per-page outputs of `predict_single_image`

        Returns:
            dict: final merged output
        """
        combine_out = {
            'kie_raw_output': [],
            'kie_post_output': [],
            'end2end_result_each_page': [],
            'end2end_results': None
        }

        for idx, out in enumerate(outs):
            combine_out['kie_raw_output'].append(out['kie_raw_output'])
            combine_out['kie_post_output'].append(out['kie_post_output'])
            combine_out['end2end_result_each_page'].append(out['end2end_results'])

        # merge end2end results: keep page-1 values unless they are empty or "0"
        end2end_results = combine_out['end2end_result_each_page'][0]

        for page_id, end2end_results_page in enumerate(combine_out['end2end_result_each_page'][1:]):
            for field_key, field_value in end2end_results_page.items():
                if "value" in end2end_results[field_key] \
                        and (end2end_results[field_key]['value'] == "" or end2end_results[field_key]['value'] == "0"):
                    end2end_results[field_key] = field_value
        combine_out['end2end_results'] = end2end_results

        return combine_out
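The page-merge rule in `aggregate_outputs` above, shown on a toy pair of pages (field values are made up for illustration):

```
# Sketch of the merge rule: a field kept from page 1 is only overwritten
# when its value is empty or "0".
page1 = {"Total_value": {"box": [], "value": "", "conf": 0.0}}
page2 = {"Total_value": {"box": [10, 20, 90, 40], "value": "120.000", "conf": 0.98}}

merged = dict(page1)
for key, field in page2.items():
    if merged[key]["value"] in ("", "0"):
        merged[key] = field
print(merged["Total_value"]["value"])  # 120.000
```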
@ -1,260 +0,0 @@
import logging
import os
import time

import torch
import wandb
from easydict import EasyDict
from sklearn.metrics import precision_recall_fscore_support
from terminaltables import AsciiTable
from torch.optim import AdamW
from torch.optim.lr_scheduler import LinearLR
from tqdm import tqdm

from sdsvkie.datasets.base_dataset import BaseDataset
from sdsvkie.models.layoutlmv2 import LayoutLMv2
from sdsvkie.models.layoutlm import LayoutLM
from sdsvkie.utils import yaml_save, get_info_env, get_logger
from transformers import get_scheduler


class Trainer:
    def __init__(self, cfg: dict):
        self.cfg = EasyDict(cfg)
        self.model = None
        self.processor = None
        self._init_trainer()

    def _init_trainer(self):
        if self.cfg.v3:
            os.environ["TOKENIZERS_PARALLELISM"] = "false"

        if not os.path.exists(self.cfg.save_dir):
            os.makedirs(self.cfg.save_dir, exist_ok=True)

        timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
        log_file = "{}/{}.log".format(self.cfg.save_dir, str(timestamp))
        self.logger = get_logger(log_file=log_file, log_level='INFO')
        self.logger.info(self.cfg)
        # self.logger.info(get_info_env())

        if not torch.cuda.is_available():
            self.logger.info("CUDA not found, training on CPU!")
            self.cfg.device = "cpu"

        self.model = LayoutLMv2._load_model(self.cfg)
        self.processor = LayoutLMv2._load_processor(self.cfg)

        self.model.to(self.cfg.device)

        if self.cfg.wandb:
            wandb.init(
                project=self.cfg.wandb,
            )

    def _build_dataloader(self, data_dir, cache_file, use_sampling=False):
        base_dataset = BaseDataset(self.cfg)
        dataloader = base_dataset.build_dataloader_from_dir(
            data_dir=data_dir,
            processor=self.processor,
            device="cpu",
            batch_size=self.cfg.batch_size,
            num_workers=self.cfg.num_workers,
            shuffle=False,
            cache_file=cache_file,
            use_sampling=use_sampling,
        )
        return dataloader

    def val(self, val_dir=None):
        val_dir = val_dir if val_dir is not None else self.cfg.val_dir
        val_cache_file = os.path.splitext(val_dir)[0] + ".pkl"
        val_dataloader = self._build_dataloader(
            val_dir,
            cache_file=val_cache_file
        )
        acc = self.val_on_dataloader(val_dataloader)
        return acc

    def train(self, train_dir=None, val_dir=None):
        self.logger.info("Building train dataloader...")
        base_dataset = BaseDataset(self.cfg)
        train_dir = train_dir if train_dir is not None else self.cfg.train_dir
        val_dir = val_dir if val_dir is not None else self.cfg.val_dir
        train_cache_file = os.path.splitext(train_dir)[0] + ".pkl"
        val_cache_file = os.path.splitext(val_dir)[0] + ".pkl"
        if self.cfg.sampling:
            train_dataset = base_dataset._build_dataset(data_dir=train_dir)
            train_dataloader = base_dataset.build_dataloader_from_dataset(
                train_dataset,
                batch_size=self.cfg.batch_size,
                processor=self.processor,
                device="cpu",
                shuffle=True,
                num_workers=self.cfg.num_workers,
                cache_file=train_cache_file,
                use_sampling=True,
            )
        else:
            train_dataloader = self._build_dataloader(
                train_dir,
                cache_file=train_cache_file,
                use_sampling=False,
            )

        self.logger.info("Building valid dataloader...")
        val_dataloader = self._build_dataloader(
            val_dir,
            cache_file=val_cache_file,
            use_sampling=False,
        )

        self.logger.info(
            f"Info dataset: train = {len(train_dataloader)}, test = {len(val_dataloader)}"
        )

        optimizer = AdamW(self.model.parameters(), lr=self.cfg.lr)
        if self.cfg.scheduler:
            # scheduler = torch.optim.lr_scheduler.OneCycleLR(
            #     optimizer,
            #     max_lr=self.cfg.lr,
            #     steps_per_epoch=len(train_dataloader),
            #     epochs=self.cfg.epochs,
            #     anneal_strategy='cos',
            #     pct_start=0.1,
            #     div_factor=25,  # init lr = max_lr / div_factor
            #     final_div_factor=1e4,  # min lr = init_lr / final_div_factor
            # )
            num_training_steps = self.cfg.epochs * len(train_dataloader)
            scheduler = get_scheduler(
                name=self.cfg.scheduler,
                optimizer=optimizer,
                num_warmup_steps=0,
                num_training_steps=num_training_steps,
            )

        if self.cfg.wandb:
            wandb.config = dict(self.cfg)

        best_acc = 0.0
        best_epoch = 0
        yaml_save(os.path.join(self.cfg.save_dir, "config.yaml"), dict(self.cfg))
        for epoch in range(self.cfg.epochs):

            # re-sample the sliding windows every cfg.sampling epochs
            if (
                self.cfg.sampling and epoch != 0 and epoch % self.cfg.sampling == 0
            ):
                train_dataloader = base_dataset.build_dataloader_from_dataset(
                    train_dataset,
                    batch_size=self.cfg.batch_size,
                    processor=self.processor,
                    device="cpu",
                    shuffle=True,
                    num_workers=self.cfg.num_workers,
                    cache_file=train_cache_file,
                    use_sampling=True,
                )

            self.model.train()
            self.logger.info(f"Epoch: {epoch}:")

            running_loss = 0.0

            for batch in tqdm(train_dataloader):
                # forward pass
                batch = self._to_device(batch, self.cfg.device)

                outputs = self.model(**batch)
                loss = outputs.loss
                running_loss += loss.item()
                # backward pass to get the gradients
                loss.backward()

                # update
                optimizer.step()
                if self.cfg.scheduler:
                    scheduler.step()
                optimizer.zero_grad()

            loss_avg = running_loss / len(train_dataloader)
            self.logger.info(f"Epoch[{epoch}/{self.cfg.epochs}] - lr: {round(scheduler.get_last_lr()[0], 9) if self.cfg.scheduler else self.cfg.lr} - loss: {loss_avg}")
            if self.cfg.wandb:
                wandb.log({"train_loss": loss_avg})

            # validation
            if epoch >= self.cfg.eval_delay:
                acc = self.val_on_dataloader(val_dataloader)

                if acc > best_acc:
                    self.model.save_pretrained(os.path.join(self.cfg.save_dir, "best"))
                    self.logger.info(f"Update best acc, prev best acc = {best_acc}, current best acc = {acc}")
                    best_acc = acc
                    best_epoch = epoch

            if epoch % self.cfg.save_weight_interval == 0:
                self.model.save_pretrained(
                    os.path.join(self.cfg.save_dir, "epoch_{}".format(epoch))
                )

        self.model.save_pretrained(os.path.join(self.cfg.save_dir, "last"))
        self.logger.info(f"Best accuracy = {best_acc} at epoch {best_epoch}")

    def _to_device(self, batch, device):
        batch = {k: v.to(device) for k, v in batch.items()}
        return batch

    def val_on_dataloader(self, dataloader):
        self.model.eval()
        total, correct = 0, 0
        preds, truths = [], []

        running_loss = 0.0
        for batch in tqdm(dataloader):
            with torch.no_grad():
                batch = self._to_device(batch, self.cfg.device)
                outputs = self.model(**batch)

                loss = outputs.loss
                running_loss += loss.item()
                predictions = outputs.logits.argmax(dim=2)
                valid_samples = batch["labels"] != -100
                predictions = predictions[valid_samples]
                batch_labels = batch["labels"][valid_samples]

                preds.extend(predictions.detach().cpu().numpy().tolist())
                truths.extend(batch_labels.detach().cpu().numpy().tolist())
                correct += (predictions == batch_labels).float().sum()
                total += predictions.numel()

        loss_avg = running_loss / len(dataloader)

        p, r, f1, support = precision_recall_fscore_support(truths, preds)
        table_data = [["Class", "P", "R", "F1", "#samples"]]
        for c in range(len(self.cfg.classes)):
            if c < p.shape[0]:
                table_data.append([self.cfg.classes[c], p[c], r[c], f1[c], support[c]])

        f1_avg = sum(f1) / len(f1)
        table = AsciiTable(table_data)
        self.logger.info(table.table)
        self.logger.info(
            "Validation F1: {} - #samples: {} - #corrects: {}".format(
                f1_avg, total, correct
            )
        )

        if self.cfg.wandb:
            wandb.log({"val_loss": loss_avg, "f1": f1_avg})

        return f1_avg
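`Trainer.train` above builds its LR schedule through `transformers.get_scheduler` with zero warmup, so the rate decays linearly from `cfg.lr` to 0 over `epochs * len(train_dataloader)` steps. A hedged standalone sketch with a toy model (the real optimizer wraps the LayoutLM parameters):

```
# Sketch of the "linear" schedule used by Trainer.train, on a dummy model.
import torch
from transformers import get_scheduler

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=9.0e-6)
scheduler = get_scheduler(
    name="linear", optimizer=optimizer,
    num_warmup_steps=0, num_training_steps=100,
)
for _ in range(50):
    optimizer.step()
    scheduler.step()
print(scheduler.get_last_lr())  # ~[4.5e-06], halfway through the decay
```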
@ -1,671 +0,0 @@
import argparse
import glob
import json
import os
import xml.etree.ElementTree as ET

import numpy as np
import tqdm
from sdsvkie.utils.word_formation import Box, check_iou
from pathlib import Path

def read_txt(txt):
    with open(txt, 'r', encoding='utf8') as f:
        data = [line.strip() for line in f]
    return data

def write_txt(txt, data):
    with open(txt, 'w', encoding='utf8') as f:
        for line in data:
            f.write(line + "\n")


"""
general json format:

python cvat.py --task pseudo --xml sample_cvat/annotations.xml --xml_out sample_cvat/annotations_out.xml --pseudo_path sample_cvat/pseudo.json
"""


"""
config for fwd:
{
    "id": "OCR005_2",
    "name": "Số Hợp đồng",
    "type": "text",
    "value": "33252663",
    "page_num": 0,
    "box": [192, 168, 220, 250]
},
"""
# CONFIG = {
#     "label": "field",
#     # edit here
#     "attribute_names": ["type", "value", "id", "name"]  # names of attributes in the cvat label: text / checkbox
# }

CONFIG = {
    "label": "word",
    # "attribute_names": ["text", "kie_label"]  # names of attributes in the cvat label: text / checkbox
    "attribute_names": []
}


class CVAT:
    def __init__(self):
        pass

    def create_xml_from_json(self, json_path, xml_in, xml_out):
        """
        json data format:
        {
            "img_1.jpg": [
                {
                    "box": [x1, y1, x2, y2],
                    "label": str,  # (not required)
                    "attrib1": str,
                    "attrib2": str,
                },
                ...
            ],
            "img_2.jpg": [...]
        }
        """
        data = self.read_json(json_path)

        tree = ET.parse(xml_in)
        root = tree.getroot()
        for img_item in root.iter("image"):
            img_path = img_item.attrib['name']

            img_data = data[img_path]

            for item in img_data:
                et = ET.Element('box')
                # default values
                et.attrib['occluded'] = "0"
                et.attrib['source'] = "manual"
                et.attrib['z_order'] = "0"

                # overwrite values
                if 'label' in item:
                    et.attrib['label'] = item['label']
                else:
                    et.attrib['label'] = CONFIG['label']

                xmin, ymin, xmax, ymax = item['box']
                (
                    et.attrib['xtl'], et.attrib['ytl'],
                    et.attrib['xbr'], et.attrib['ybr']
                ) = (
                    str(xmin), str(ymin),
                    str(xmax), str(ymax)
                )

                for att_name in CONFIG['attribute_names']:
                    if att_name not in item:
                        continue
                    # note: was ET.Element('atrribute'), fixed typo so CVAT reads the attributes
                    att_et = ET.Element('attribute')
                    att_et.attrib['name'] = att_name
                    att_et.text = item[att_name]
                    et.append(att_et)

                img_item.append(et)

        tree.write(xml_out, encoding='utf8')

    def get_data_from_txt_dir(self, txt_dir, resever_parent_dir=False):
        if resever_parent_dir:
            txt_paths = glob.glob(txt_dir + "/*/*.txt")
        else:
            txt_paths = glob.glob(txt_dir + "/*.txt")
        data = {}
        for txt_path in txt_paths:
            if resever_parent_dir:
                txt_name = "/".join(txt_path.split("/")[-2:])
            else:
                txt_name = os.path.basename(txt_path)

            txt_data = read_txt(txt_path)
            format_data = []
            for line in txt_data:
                items = line.split("\t")
                # assert len(items) == 6, "error get len = {} - {}".format(len(items), items)

                box = [int(float(x)) for x in items[:4]]
                text = items[4]

                if len(items) == 6:
                    kie_label = items[5]
                else:
                    kie_label = "word"
                format_data.append(
                    {
                        'box': box,
                        'text': text,
                        'label': kie_label
                    }
                )
            data[txt_name] = format_data

        return data

    def get_data_from_txt_path(self, txt_path):
        txt_data = read_txt(txt_path)
        format_data = []
        for line in txt_data:
            items = line.split("\t")
            assert len(items) == 6, "error get len = {} - {}".format(len(items), items)
            box = [int(float(x)) for x in items[:4]]
            text = items[4]
            kie_label = items[5]
            format_data.append(
                {
                    'box': box,
                    'text': text,
                    'label': kie_label
                }
            )
        return format_data

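The helpers above all consume the same tab-separated label format: one word per line as `xmin ymin xmax ymax text kie_label` (the label column is optional in `get_data_from_txt_dir`). A minimal parse with hypothetical values:

```
# Example line in the tab-separated label format (values are made up).
line = "12\t34\t120\t60\tTotal:\tTotal_key"
items = line.split("\t")
box = [int(float(x)) for x in items[:4]]
text, kie_label = items[4], items[5]
print(box, text, kie_label)  # [12, 34, 120, 60] Total: Total_key
```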
    def format_data_invoice(self, data):
        new_data = {}
        for txt_name, value in data.items():
            items = []
            for item in value:
                text = item['text']
                if "____kie_wordgroup" in text:
                    new_item = {
                        'box': item['box'],
                        'label': item['label']
                    }
                else:
                    new_item = {
                        'box': item['box'],
                        'text': "xxxxxx",
                        'kie_label': item['label'],
                        'label': "word"
                    }

                items.append(new_item)
            new_data[txt_name] = items
        return new_data

    def create_xml_from_txt(self, txt_dir, xml_in, xml_out, skip_labels=[], resever_parent_dir=False):
        data = self.get_data_from_txt_dir(txt_dir, resever_parent_dir)
        print(list(data.keys()))

        # for invoice
        if len(skip_labels) > 0 and "word" in skip_labels:
            data = self.format_data_invoice(data)
        tree = ET.parse(xml_in)
        root = tree.getroot()
        count = 0
        for img_item in tqdm.tqdm(root.iter("image")):
            count += 1
            img_path = img_item.attrib['name']

            txt_name = os.path.splitext(img_path)[0] + ".txt"

            img_data = data.get(txt_name, [])
            if len(img_data) > 0:
                # img_item.clear()
                for child in img_item:
                    img_item.remove(child)

            for item in img_data:
                et = ET.Element('box')
                # default values
                et.attrib['occluded'] = "0"
                et.attrib['source'] = "manual"
                et.attrib['z_order'] = "0"

                # overwrite values
                if 'label' in item:
                    if item['label'] in skip_labels:
                        continue
                    et.attrib['label'] = item['label']
                else:
                    et.attrib['label'] = CONFIG['label']

                xmin, ymin, xmax, ymax = item['box']
                (
                    et.attrib['xtl'], et.attrib['ytl'],
                    et.attrib['xbr'], et.attrib['ybr']
                ) = (
                    str(xmin), str(ymin),
                    str(xmax), str(ymax)
                )

                for att_name in CONFIG['attribute_names']:
                    if att_name not in item:
                        continue
                    # note: was ET.Element('atrribute'), fixed typo so CVAT reads the attributes
                    att_et = ET.Element('attribute')
                    att_et.attrib['name'] = att_name
                    att_et.text = item[att_name]
                    et.append(att_et)

                img_item.append(et)
        print("Num imgs: ", count)
        tree.write(xml_out, encoding='utf8')

    def get_data_from_xml(self, xml, skip_labels=[]):
        """Parse a CVAT annotation XML into a dict.

        Args:
            xml (str): cvat annotation xml path

        Returns:
            (dict): {
                'img_1.jpg': [
                    {kie_label: [x1, y1, x2, y2]},
                    ...
                ],
                'img_2.jpg': ...
            }
        """
        anno_data = open(xml, encoding='utf8')
        tree = ET.parse(anno_data)
        root = tree.getroot()
        data = {}

        for obj in tqdm.tqdm(root.iter("image")):
            img_path = obj.attrib['name']
            img_data = []
            for box in obj.iter("box"):
                box_label = box.attrib['label']
                if box_label in skip_labels:
                    continue

                # get coordinates
                xmin, ymin, xmax, ymax = box.attrib['xtl'], box.attrib['ytl'], box.attrib['xbr'], box.attrib['ybr']
                xmin, ymin, xmax, ymax = int(float(xmin)), int(float(ymin)), int(float(xmax)), int(float(ymax))

                item = {
                    box_label: [xmin, ymin, xmax, ymax]
                }
                img_data.append(item)

            data[img_path] = img_data

        return data

    @staticmethod
    def write_json(json_path, data):
        with open(json_path, 'w', encoding='utf8') as f:
            json.dump(data, f, ensure_ascii=False)

    def read_json(self, json_path):
        with open(json_path, 'r', encoding='utf8') as f:
            data = json.load(f)
        return data

    def update_label_kie(self, txt_dir, json_path, out_dir):
        if not os.path.exists(out_dir):
            os.makedirs(out_dir, exist_ok=True)
        data = self.read_json(json_path)
        txt_paths = glob.glob(txt_dir + "/*.txt")
        for txt_path in tqdm.tqdm(txt_paths):
            ori_data = read_txt(txt_path)

            boxes = []

            img_name = os.path.splitext(os.path.basename(txt_path))[0]
            img_name = "_".join(img_name.split("_")[:-1]) + "_1" + ".jpg"
            # img_name = os.path.splitext(os.path.basename(txt_path))[0] + ".jpg"
            new_img_data = data[img_name]
            for line in ori_data:
                xmin, ymin, xmax, ymax, text, kie_label = line.strip().split("\t")
                if "____kie_wordgroup" in text:
                    continue
                xmin, ymin, xmax, ymax = int(xmin), int(ymin), int(xmax), int(ymax)
                new_kie_label = "other"
                for label_info in new_img_data:
                    for label, box_wordgroup in label_info.items():
                        box_word = Box(
                            xmin, ymin, xmax, ymax
                        )
                        box_wordgroup = Box(
                            box_wordgroup[0], box_wordgroup[1], box_wordgroup[2], box_wordgroup[3]
                        )
                        if check_iou(box1=box_word, box2=box_wordgroup, threshold=0.85):
                            new_kie_label = label
                            break

                    if new_kie_label != "other":
                        break

                new_box = Box(
                    xmin=xmin,
                    ymin=ymin,
                    xmax=xmax,
                    ymax=ymax,
                    label=text,
                    kie_label=new_kie_label
                )
                boxes.append(new_box)

            # note: was `key=lambda box: [box.ymin, xmin]`, fixed to box.xmin
            boxes = sorted(boxes, key=lambda box: [box.ymin, box.xmin])
            new_data = [
                "\t".join([str(box.xmin), str(box.ymin), str(box.xmax), str(box.ymax), box.label, box.kie_label])
                for box in boxes
            ]

            write_txt(os.path.join(out_dir, os.path.basename(txt_path)), new_data)

    def _check_iou(self, box1, box2, threshold=0.9):
        """Intersection area of box1 and box2 as a fraction of box1's area.

        Args:
            box1: word box [x1, y1, x2, y2]
            box2: line box [x1, y1, x2, y2]
            threshold (float, optional): unused here; callers compare the returned ratio. Defaults to 0.9.

        Returns:
            float: intersection-over-box1 ratio
        """
        area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
        area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
        xmin_intersect = max(box1[0], box2[0])
        ymin_intersect = max(box1[1], box2[1])
        xmax_intersect = min(box1[2], box2[2])
        ymax_intersect = min(box1[3], box2[3])

        if xmax_intersect < xmin_intersect or ymax_intersect < ymin_intersect:
            area_intersect = 0
        else:
            area_intersect = (xmax_intersect - xmin_intersect) * (
                ymax_intersect - ymin_intersect
            )
        # union = area1 + area2 - area_intersect
        iou = area_intersect / area1
        return iou
        # if iou > threshold:
        #     return True
        # return False

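A worked example of `_check_iou` above: note it is not a symmetric IoU but the overlap measured against the first (word) box only, which is why a small word fully inside a large line box still scores near 1.0:

```
# Worked example of CVAT._check_iou: intersection over the first box's area.
box_word = [10, 10, 30, 20]    # area (30-10)*(20-10) = 200
box_line = [0, 0, 28, 40]      # overlaps x in [10, 28], y in [10, 20]
inter = (28 - 10) * (20 - 10)  # 180
print(inter / 200)             # 0.9 -> passes the 0.75 threshold below
```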
    def _update_label_for_word(self, box, line_items, threshold=0.75, other_class='others'):
        have_label = False
        max_iou = -1
        for line_item in line_items:
            # 465 901 664 940
            curr_iou = self._check_iou(box, line_item['box'], threshold=threshold)
            if curr_iou > threshold and curr_iou > max_iou:
                max_iou = curr_iou
                kie_label = line_item['label']
                have_label = True

            # if box[0] == 465 and box[-1] == 940:
            #     print(box, curr_iou, kie_label, line_item)
            #     break

        if not have_label:
            kie_label = other_class
        return kie_label

    def update_label_kie_from_xml(self, txt_dir, xml, out_dir, skip_labels=[], line_to_word=False, other_class="others", resever_parent_dir=False):
        if not os.path.exists(out_dir):
            os.makedirs(out_dir, exist_ok=True)

        # read xml
        xml_data = {}
        anno_data = open(xml, encoding='utf8')
        tree = ET.parse(anno_data)
        root = tree.getroot()

        for obj in tqdm.tqdm(root.iter("image")):
            img_path = obj.attrib['name']
            if not line_to_word:
                img_data = {}
                for box in obj.iter("box"):
                    box_label = box.attrib['label']
                    if box_label in skip_labels:
                        continue

                    # get coordinates
                    xmin, ymin, xmax, ymax = box.attrib['xtl'], box.attrib['ytl'], box.attrib['xbr'], box.attrib['ybr']
                    box_int = int(float(xmin)), int(float(ymin)), int(float(xmax)), int(float(ymax))
                    box_key = ",".join([str(x) for x in box_int])
                    img_data[box_key] = box_label

            else:
                img_data = []
                for box in obj.iter("box"):
                    box_label = box.attrib['label']
                    if box_label in skip_labels:
                        continue

                    # get coordinates
                    xmin, ymin, xmax, ymax = box.attrib['xtl'], box.attrib['ytl'], box.attrib['xbr'], box.attrib['ybr']
                    box_int = int(float(xmin)), int(float(ymin)), int(float(xmax)), int(float(ymax))
                    box_key = ",".join([str(x) for x in box_int])
                    img_data.append(
                        {
                            'box': box_int,
                            'label': box_label
                        }
                    )
            xml_data[os.path.splitext(img_path)[0]] = img_data

        if resever_parent_dir:
            txt_paths = glob.glob(txt_dir + "/*/*.txt")
        else:
            txt_paths = glob.glob(txt_dir + "/*.txt")
        updated_imgs = []
        for txt_path in tqdm.tqdm(txt_paths):
            is_update = False
            ori_data = read_txt(txt_path)
            img_new_data = []
            if resever_parent_dir:
                img_key = str(Path(txt_path).with_suffix('').relative_to(Path(txt_path).parent.parent))  # a/xyz
            else:
                img_key = os.path.splitext(os.path.basename(txt_path))[0]  # xyz
            if img_key not in xml_data:
                print(txt_path)
                continue
            img_annoted_data = xml_data[img_key]
            if not line_to_word:
                for line in ori_data:
                    xmin, ymin, xmax, ymax, text, kie_label = line.strip().split("\t")
                    if "____kie_wordgroup" in text:
                        continue
                    box_int = int(xmin), int(ymin), int(xmax), int(ymax)
                    box_key = ",".join([str(x) for x in box_int])

                    if box_key in img_annoted_data:
                        if kie_label != img_annoted_data[box_key]:
                            # note: was `is_update.append(txt_path)` on a bool, fixed
                            is_update = True
                            kie_label = img_annoted_data[box_key]
                    else:
                        kie_label = other_class
                    img_new_data.append("\t".join([xmin, ymin, xmax, ymax, text, kie_label]))
            else:
                for line in ori_data:
                    items = line.strip().split("\t")
                    if len(items) == 5:
                        xmin, ymin, xmax, ymax, text = items
                        label = None  # note: added so the comparison below is defined for 5-column lines
                    else:
                        xmin, ymin, xmax, ymax, text, label = items

                    box_int = int(xmin), int(ymin), int(xmax), int(ymax)
                    kie_label = self._update_label_for_word(box_int, img_annoted_data, threshold=0.75, other_class=other_class)

                    if label != kie_label:
                        print(kie_label, label)
                        is_update = True
                    img_new_data.append("\t".join([xmin, ymin, xmax, ymax, text, kie_label]))
            if resever_parent_dir:
                out_sub_dir = Path(out_dir) / Path(img_key).parts[-2]
                if not out_sub_dir.exists():
                    out_sub_dir.mkdir(parents=True)
            write_txt(os.path.join(out_dir, img_key + ".txt"), img_new_data)

            if is_update:
                updated_imgs.append(txt_path)
            else:
                print("No update: ", txt_path)
        print("updated_imgs: ", list(set(updated_imgs)))
        print("num updated_imgs: ", len(list(set(updated_imgs))))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--xml", type=str, default='annotations.xml')
    parser.add_argument("--pseudo_path", type=str, default="pseudolabel.json")
    parser.add_argument("--xml_out", type=str, default="annotations_out.xml")
    parser.add_argument("--task", type=str, default='pseudo',
                        help='parse_data / pseudo_from_json / pseudo_from_txt / update_txt / update_txt_from_xml')
    parser.add_argument("--txt_in", type=str, default='txt_dir_in')
    parser.add_argument("--txt_out", type=str, default='txt_dir_out')
    parser.add_argument("--line_to_word", action='store_true')
    parser.add_argument("--other_class", type=str, default='other')
    parser.add_argument("--resever_parent_dir", action="store_true")

    args = parser.parse_args()
    cvat = CVAT()

    if args.task == 'parse_data':
        data = cvat.get_data_from_xml(
            xml=args.xml
        )
        CVAT.write_json(args.xml_out, data)

    elif args.task == 'pseudo_from_json':
        cvat.create_xml_from_json(
            xml_in=args.xml,
            xml_out=args.xml_out,
            json_path=args.pseudo_path
        )

    elif args.task == 'pseudo_from_txt':
        cvat.create_xml_from_txt(
            xml_in=args.xml,
            xml_out=args.xml_out,
            txt_dir=args.pseudo_path,
            # skip_labels=['word']
            # skip_labels=[args.other_class],
            resever_parent_dir=args.resever_parent_dir
        )
    elif args.task == 'update_txt':
        cvat.update_label_kie(
            txt_dir=args.txt_in,
            json_path=args.pseudo_path,
            out_dir=args.txt_out
        )

    elif args.task == 'update_txt_from_xml':
        cvat.update_label_kie_from_xml(
            txt_dir=args.txt_in,
            xml=args.xml,
            out_dir=args.txt_out,
            skip_labels=['word'],
            line_to_word=args.line_to_word,
            other_class=args.other_class,
            resever_parent_dir=args.resever_parent_dir
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
raise NotImplementedError(f"{args.task} not yet implemented")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
|
||||||
python tools/cvat.py --task update_txt --txt_in /mnt/ssd1T/hoanglv/Projects/KIE/craw_data/output/synth_vnpt_r20/one_line_filtered --txt_out /mnt/ssd1T/hoanglv/Projects/KIE/craw_data/output/synth_vnpt_r20/one_line_filtered --pseudo_path ../workdirs/data/vnpt_oneline/annotations.json
|
|
||||||
|
|
||||||
python tools/cvat.py --task pseudo_from_txt \
|
|
||||||
--xml /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/visualize/xml/vnpt_r2/annotations.xml \
|
|
||||||
--xml_out /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/visualize/xml/vnpt_r2/annotations_out.xml \
|
|
||||||
--pseudo_path /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/visualize/vnpt_r2_txt
|
|
||||||
|
|
||||||
python tools/cvat.py --task pseudo_from_txt \
|
|
||||||
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/WildReceipt/re_labeling/wild_batch_1_raw.xml \
|
|
||||||
--xml_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/WildReceipt/re_labeling/wild_batch_1_pseudo.xml \
|
|
||||||
--pseudo_path /mnt/ssd1T/hoanglv/Projects/KIE/DATA/WildReceipt/re_labeling/batches/batch_1
|
|
||||||
|
|
||||||
|
|
||||||
python tools/cvat.py --task update_txt_from_xml \
|
|
||||||
--txt_in /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/visualize/vnpt_r2_txt \
|
|
||||||
--xml /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/visualize/vnpt_r2.xml \
|
|
||||||
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/visualize/vnpt_r2_done_txt
|
|
||||||
|
|
||||||
python tools/cvat.py --task update_txt_from_xml \
|
|
||||||
--txt_in /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/labeling/Pseudo/batch_2/Good/Food \
|
|
||||||
--xml /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/labeling/Pseudo/batch_2/batch_2_food_done.xml \
|
|
||||||
--txt_out /mnt/ssd1T/hoanglv/Projects/KIE/DATA/SDSAP_Invoice/processed/batch_2/Good/Food \
|
|
||||||
--other_class Others
|
|
||||||
"""
|
|
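The label transfer above hinges on exact integer box coordinates used as dictionary keys. A minimal sketch of that matching step, with made-up box, text, and label values:

```
# Exact-coordinate matching as used by update_label_kie_from_xml: both the CVAT
# XML boxes and the OCR txt boxes are reduced to "x1,y1,x2,y2" string keys, so a
# label is transferred only when the integer coordinates agree exactly.
annotated = {"10,20,110,45": "total_value"}        # parsed from the CVAT XML
ocr_line = (10, 20, 110, 45, "500.000", "other")   # one line of the pseudo-label txt

box_key = ",".join(str(v) for v in ocr_line[:4])
new_label = annotated.get(box_key, "other")        # unmatched boxes fall back to the other class
print(new_label)                                   # -> total_value
```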
@ -1,137 +0,0 @@
import cv2
import glob
import os
import fitz
import json
import numpy as np
import pandas as pd
from time import time
import yaml
from tqdm.auto import tqdm
import xml.etree.ElementTree as ET
from sklearn.model_selection import StratifiedKFold

# Was commented out even though TextDetector is instantiated below.
from textdetection.src.serve_model import Predictor as TextDetector

FOLDER = "/mnt/ssd1T/tuanlv/06.KVUCombineStage/preprocess/data/invoices-receipts/SBT/nttmai_renamed/"
TXT_DIR = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/pseudo_ocr/invoice_receipt_sbt"
OUT_FOLDER = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/pseudo_ocr/sbt_batches"
ANN_OUT = "processed_textdet_batch%d.json"
N_BATCHES = 3

os.makedirs(OUT_FOLDER, exist_ok=True)


if __name__ == "__main__":
    all_files = []
    doc_types = []
    for file_name in tqdm(sorted(os.listdir(FOLDER))):
        try:
            file_name_no_ext = file_name.split('.')[0]
            all_files.append(file_name)
            doc_type = "1"
            doc_types.append(doc_type)
            file_path = os.path.join(FOLDER, file_name)
        except Exception as ex:
            print('image:', file_name, '. Error:', ex)

    df = pd.DataFrame({'file_name': all_files, 'doc_type': doc_types})
    df = df[df.doc_type.isin(['1'])].reset_index(drop=True)

    # split the files into N_BATCHES disjoint, stratified labeling batches
    kfold = StratifiedKFold(n_splits=N_BATCHES)
    fold_num = 0
    for train_inds, val_inds in kfold.split(df, df['doc_type']):
        df.loc[val_inds, 'fold'] = fold_num
        fold_num += 1

    df.to_csv(f'{OUT_FOLDER}/tmp.csv', index=False)

    # NOTE: `setting` (the text-detection config) and `pdf2np_fitz` (a PDF-to-numpy
    # helper) are referenced here but never defined in this file; both must come
    # from the caller's environment.
    text_detector = TextDetector(setting['text_detection']['setting'], "textdetection")

    for batch in range(N_BATCHES):
        print(f"================== Batch {batch} ================")
        fold_df = df.loc[df.fold == batch]
        # tree = ET.parse(XML_IN)
        # root = tree.getroot()

        # for image in root.findall('image'):
        #     root.remove(image)
        with open('/home/sds/namnt/FWD_Data/coco_template.json', 'r') as f:
            coco_annotations = json.load(f)

        count = 2
        img_id = 1
        ann_id = 1

        all_images = []
        all_annotations = []

        for row_num, row in tqdm(fold_df.iterrows(), total=len(fold_df)):
            file_name = row['file_name']
            file_name_no_ext = file_name.split('.')[0]
            doc_type = row['doc_type']
            file_path = os.path.join(FOLDER, file_name)
            images = pdf2np_fitz(file_path, _type='fname')

            images, batch_boxes = text_detector(images)
            for page_num, (img, boxes) in enumerate(zip(images, batch_boxes)):
                os.makedirs(os.path.join(OUT_FOLDER, f"batch{batch}"), exist_ok=True)
                out_img_path = os.path.join(OUT_FOLDER, f"batch{batch}", f"batch{batch}_{img_id:04d}_{file_name_no_ext}_{page_num}.jpg")
                cv2.imwrite(out_img_path, img[:, :, ::-1])
                H, W = img.shape[:2]
                c_img = {
                    "id": int(img_id),
                    "width": W,
                    "height": H,
                    "file_name": os.path.join(f"batch{batch}", f"batch{batch}_{img_id:04d}_{file_name_no_ext}_{page_num}.jpg"),
                    "license": 0,
                    "flickr_url": "",
                    "coco_url": "",
                    "date_captured": 0
                }
                all_images.append(c_img)

                for box in boxes:
                    x1, y1, x2, y2 = box
                    w, h = x2 - x1, y2 - y1
                    c_ann = {
                        "id": int(ann_id),
                        "image_id": int(img_id),
                        "category_id": 1,
                        "segmentation": [],
                        "area": w * h,
                        "bbox": [x1, y1, w, h],
                        "iscrowd": 0,
                        "attributes": {
                            "occluded": False,
                            "rotation": 0.0
                        }
                    }
                    all_annotations.append(c_ann)
                    ann_id += 1

                img_id += 1

        coco_annotations['categories'] = [{
            "id": 1,
            "name": "text",
            "supercategory": ""
        }]
        coco_annotations['images'] = all_images
        coco_annotations['annotations'] = all_annotations
        with open(os.path.join(OUT_FOLDER, ANN_OUT % (batch)), 'w') as f:
            json.dump(coco_annotations, f)
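The script above leans on an unusual StratifiedKFold trick: only the validation indices of each split are used, which partitions the files into N_BATCHES disjoint, stratified batches. A self-contained sketch with made-up file names:

```
import pandas as pd
from sklearn.model_selection import StratifiedKFold

# Each row is assigned to exactly one fold: the validation indices of the k
# splits are disjoint and together cover the whole dataframe.
df = pd.DataFrame({"file_name": [f"f{i}.pdf" for i in range(6)], "doc_type": ["1"] * 6})
kfold = StratifiedKFold(n_splits=3)
for fold_num, (_, val_inds) in enumerate(kfold.split(df, df["doc_type"])):
    df.loc[val_inds, "fold"] = fold_num
print(df)  # every file lands in exactly one of folds 0, 1, 2
```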
@ -1,144 +0,0 @@
"""
Use for eval, debug
"""
import argparse
import os
from copy import copy
from glob import glob
from pathlib import Path

import cv2
import tqdm

from sdsvkie.cfg import load_cfg
from sdsvkie.engine import Predictor
from sdsvkie.utils import visualize_kie
from sdsvkie.utils.io_file import load_ocr_output, write_json, write_txt

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--cfg", type=str, default="sdsvkie/cfg/default.yaml")
    parser.add_argument(
        "--img", type=str, default="img.jpg", help="image path or directory"
    )

    # optional
    parser.add_argument("--weights", type=str, default=None, required=False)
    parser.add_argument("--device", type=str, default=None, required=False)
    parser.add_argument(
        "--text_det", type=str, default=None, required=False, help="image path or directory"
    )
    parser.add_argument(
        "--text_reg", type=str, default=None, required=False, help="image path or directory"
    )
    parser.add_argument(
        "--vis_out", type=str, default=None, required=False, help="visualize output directory"
    )
    parser.add_argument("--txt_out", type=str, required=False, default=None)
    parser.add_argument("--kie_wordgroup_out", required=False, action="store_true")
    parser.add_argument("--e2e", type=str, required=False, default=None)
    parser.add_argument("--not_use_ocr", required=False, action="store_true")
    parser.add_argument("--parse_e2e", action='store_true', help="Parse end2end result from word label")

    args = parser.parse_args()

    predictor = Predictor(**vars(args))

    # cfg = load_cfg(args.cfg, vars(args))
    # predictor = Predictor(cfg)

    if args.txt_out:
        if not os.path.exists(args.txt_out):
            os.makedirs(args.txt_out, exist_ok=True)

    if args.e2e:
        outdir_e2e = os.path.dirname(args.e2e)
        if not os.path.exists(outdir_e2e):
            os.makedirs(outdir_e2e, exist_ok=True)

    if os.path.isdir(args.img):
        img_paths = glob(args.img + "/*")
        print("Inference image dir, total imgs: {}".format(len(img_paths)))
    else:
        img_paths = [args.img]

    out_dict = {}
    for img_path in tqdm.tqdm(img_paths):

        img = cv2.imread(img_path)
        if img is None:
            print("img is None: ", img_path)
            continue
        if args.not_use_ocr:
            txt_path = str(Path(img_path).with_suffix(".txt"))
            ocr_output = load_ocr_output(txt_path)
            out = predictor(img, ocr_output=ocr_output, return_raw=True)
        else:
            out = predictor(img, return_raw=True)

        # visualize
        if args.vis_out:
            out_kie = out["kie_raw_output"]
            visualize_kie(
                img,
                boxes=[word.boundingbox for word in out_kie],
                pred_labels=[word.kie_label for word in out_kie],
                image_name=os.path.basename(img_path),
                outdir=args.vis_out,
                skip_classes=["other---"]
            )

        if args.txt_out:
            txt_out_path = os.path.join(
                args.txt_out, os.path.splitext(os.path.basename(img_path))[0] + ".txt"
            )

            out_kie = out["kie_raw_output"]
            boxes = [word.boundingbox for word in out_kie]
            pred_labels = [word.kie_label for word in out_kie]
            texts = [word.text for word in out_kie]

            data = []

            if args.kie_wordgroup_out:
                # Replace word-level results with merged word groups; texts are
                # tagged with "____kie_wordgroup" so downstream tools can tell them apart.
                output = out["kie_post_output"]
                wordgroup_all_list = []
                for kie_label, wordgroup_list in output.items():
                    if isinstance(wordgroup_list, list):
                        wordgroup_all_list.extend(wordgroup_list)
                    else:
                        wordgroup_all_list.append(wordgroup_list)
                boxes = [word.boundingbox for word in wordgroup_all_list]
                pred_labels = [word.kie_label for word in wordgroup_all_list]
                texts = [word.text + "____kie_wordgroup" for word in wordgroup_all_list]

            # the same serialization applies to word-level and word-group output
            for box, text, kie_label in zip(boxes, texts, pred_labels):
                item = "\t".join([str(int(x)) for x in box])
                item = "\t".join([item, text, kie_label])
                data.append(item)

            write_txt(txt_out_path, data)

        if args.e2e:
            img_id = os.path.splitext(os.path.basename(img_path))[0]
            out_dict[img_id] = out['end2end_results']

    if args.e2e:
        write_json(args.e2e, out_dict)
@ -1,122 +0,0 @@
"""
Use for deploy
"""
import argparse
import os
import random
from glob import glob
from pathlib import Path

import cv2
import tqdm

from sdsvkie.cfg import load_cfg
from sdsvkie.engine.predictor import Predictor
from sdsvkie.utils.io_file import write_json
from sdsvkie.utils import visualize_kie, IMG_EXT, PDF_EXT

"""
python sdsvkie/tools/infer_e2e.py \
    --cfg /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/config.yaml \
    --weights /mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/invoice/06062023/best \
    --device "cuda:0" \
    --img /mnt/hdd2T/AICR/Projects/2023/Vietinbank_POC/Invoice_JPG/ \
    --e2e /mnt/hdd2T/AICR/Projects/2023/Vietinbank_POC/Invoice_KIE_Results/result.json
"""

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--cfg", type=str, default="sdsvkie/cfg/default.yaml")
    parser.add_argument("--img", type=str, default="img.jpg", help="image path or directory")
    parser.add_argument("--weights", type=str, default=None)
    parser.add_argument("--text_det", type=str, default=None)
    parser.add_argument("--device", type=str, default=None)
    parser.add_argument("--e2e", type=str, default=None)
    parser.add_argument("--vis", type=str, default=None)

    args = parser.parse_args()

    predictor = Predictor(**vars(args))

    if args.e2e:
        outdir_e2e = os.path.dirname(args.e2e)
        if not os.path.exists(outdir_e2e):
            os.makedirs(outdir_e2e, exist_ok=True)

    if os.path.isdir(args.img):
        img_paths = sorted(glob(args.img + "/*"))
        print("Inference image dir, total imgs: {}".format(len(img_paths)))
    else:
        img_paths = [args.img]

    out_dict = {}

    if "Others" in predictor.classes:
        colors = {
            "Store_name_value": (30, 97, 235),
            "id": (28, 175, 6),
            "Date_value": (241, 26, 242),
            "Total_value": (255, 0, 0),
        }
    else:
        colors = [
            (
                random.randint(0, 255),
                random.randint(0, 255),
                random.randint(0, 255),
            )
            for _ in range(len(predictor.classes))
        ]

    for img_path in tqdm.tqdm(img_paths):
        print(img_path)
        if Path(img_path).suffix.lower() in IMG_EXT:
            img = cv2.imread(img_path)
            if img is None:
                print("img is None: ", img_path)
                continue
        elif Path(img_path).suffix.lower() in PDF_EXT:
            img = img_path  # the predictor accepts a PDF path directly
        else:
            continue
        out = predictor(img)

        out_api = out['end2end_results']

        if not args.e2e:
            print(out_api)
        else:
            img_id = os.path.splitext(os.path.basename(img_path))[0]
            out_dict[img_id] = {
                field_name: field_item['value'] for field_name, field_item in out_api.items()
            }

        if args.vis:
            visualize_kie(
                img,
                boxes=[field_item['box'] for field_name, field_item in out_api.items() if len(field_item['box']) > 0],
                pred_labels=[field_name for field_name, field_item in out_api.items() if len(field_item['box']) > 0],
                image_name=os.path.basename(img_path),
                outdir=args.vis,
                colors=colors,
                texts=[field_item['value'] for field_name, field_item in out_api.items() if len(field_item['box']) > 0]
            )

    if args.e2e:
        write_json(args.e2e, out_dict)
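For reference, the `--e2e` JSON written by this deploy script maps each image id to a flat field-to-value dict. The field names and values below are illustrative, borrowed from the invoice label set used elsewhere in this repo:

```
# Illustrative shape of the --e2e output (all values made up):
out_dict = {
    "invoice_001": {
        "seller_company_name_value": "ABC Trading Co.",
        "total_value": "1.250.000",
        "date": "04/09/2023",
    }
}
```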
@ -1,72 +0,0 @@
"""
Use for eval, debug
"""
import argparse
import os
from glob import glob
from pathlib import Path

from sdsvkie.utils.word_formation import Word
import tqdm

from sdsvkie.engine import Predictor
from sdsvkie.utils.io_file import load_ocr_output, write_txt

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--cfg", type=str, default="sdsvkie/cfg/default.yaml")
    parser.add_argument(
        "--src", type=str, default="img.jpg / img_dir", help="image path or directory"
    )
    parser.add_argument("--tgt", type=str, required=True, default=None)

    args = parser.parse_args()

    predictor = Predictor(args.cfg)

    if not os.path.exists(args.tgt):
        os.makedirs(args.tgt, exist_ok=True)

    txt_paths = glob(args.src + "/*.txt")
    print("Total txt: {}".format(len(txt_paths)))

    for txt_path in tqdm.tqdm(txt_paths):
        ocr_output = load_ocr_output(txt_path)

        boxes, texts, labels = ocr_output['boxes'], ocr_output['texts'], ocr_output['labels']
        words = []
        for box, text, label in zip(boxes, texts, labels):
            words.append(
                Word(
                    text=text,
                    bndbox=box,
                    kie_label=label,
                    conf_cls=0.0
                )
            )

        kie_output = predictor.postprocessing(words)

        txt_out_path = str(Path(args.tgt) / Path(txt_path).name)

        data = []

        wordgroup_all_list = []
        for kie_label, wordgroup_list in kie_output.items():
            wordgroup_all_list.extend(wordgroup_list)
        boxes = [word.boundingbox for word in wordgroup_all_list]
        pred_labels = [word.kie_label for word in wordgroup_all_list]
        texts = [word.text + "____kie_wordgroup" for word in wordgroup_all_list]

        for box, text, kie_label in zip(boxes, texts, pred_labels):
            item = "\t".join([str(int(x)) for x in box])
            item = "\t".join([item, text, kie_label])
            data.append(item)

        write_txt(txt_out_path, data)
@ -1,72 +0,0 @@
import argparse
import re  # explicit import: re is used below and may not be re-exported by the wildcard import

from sdsvkie.utils import read_json, yaml_load, write_json
from sdsvkie.utils.post_processing.invoice_post_processing import *
from sdsvkie.utils.post_processing.common_post_processing import normalize_number


def postprocess_invoice(invoice_data):
    if 'date' in invoice_data:
        invoice_data['date'] = post_processing_datetime(invoice_data['date'])

    #### normalize number
    number_fields = ['total_value', 'VAT_amount_value']
    for number_field in number_fields:
        if number_field not in invoice_data:
            continue
        invoice_data[number_field] = normalize_number(invoice_data[number_field])
    if 'buyer_tax_code_value' in invoice_data:
        invoice_data['buyer_tax_code_value'] = normalize_number(invoice_data['buyer_tax_code_value'], rerserve_minus=True)
    if 'seller_tax_code_value' in invoice_data:
        invoice_data['seller_tax_code_value'] = normalize_number(invoice_data['seller_tax_code_value'], rerserve_minus=True)
    if "seller_mobile_value" in invoice_data:
        invoice_data['seller_mobile_value'] = normalize_number(invoice_data['seller_mobile_value'], rerserve_minus=False, reserve_plus=True)

    # normalize whitespace in every field
    for field_name in invoice_data.keys():
        field_value = invoice_data[field_name]
        field_value = field_value.replace("✪", " ")
        field_value = field_value.replace("\t", " ")
        field_value = re.sub(r"\s+", " ", field_value)
        invoice_data[field_name] = field_value

    return invoice_data


def format_e2e_data(input_json, output_json, cfg):
    cfg = yaml_load(cfg)
    classes = cfg['classes']
    value_classes = [cls_name for cls_name in classes if "_key" not in cls_name and "other" not in cls_name]
    print(value_classes)
    in_data = read_json(input_json)

    out_data = {}
    for img_id, img_data in in_data.items():
        new_img_data = postprocess_invoice(img_data)

        # make sure every value class is present, even if empty
        for cls_value in value_classes:
            if cls_value not in new_img_data:
                new_img_data[cls_value] = ""

        out_data[img_id] = new_img_data

    write_json(data=out_data, json_path=output_json)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--cfg", type=str)
    parser.add_argument("--input", type=str, help="e2e label file path")
    parser.add_argument("--out", type=str, help='postprocess e2e label')

    args = parser.parse_args()

    format_e2e_data(args.input, args.out, args.cfg)
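The final loop of `postprocess_invoice` only normalizes whitespace; a small worked example of just that step, with a made-up input string:

```
import re

# "✪" is the internal space placeholder; it and tabs become spaces, then runs
# of whitespace collapse to a single space.
field_value = "CONG✪TY\tTNHH   ABC"
field_value = field_value.replace("✪", " ").replace("\t", " ")
field_value = re.sub(r"\s+", " ", field_value)
print(field_value)  # -> "CONG TY TNHH ABC"
```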
@ -1,84 +0,0 @@
import argparse
import glob
from pathlib import Path

import cv2
from tqdm import tqdm

from sdsvkie.models.ocr import OCREngine
from sdsvkie.utils.visualize import visualize_ocr
from sdsvkie.utils.io_file import write_txt
from sdsvkie.utils.word_formation import sort_words

if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument("--img", type=str)
    parser.add_argument("--out_dir", type=str, default=None)
    parser.add_argument("--device", type=str, default='cpu')
    parser.add_argument("--reserve_parent_dir", action='store_true')
    parser.add_argument("--only_text", action='store_true')
    parser.add_argument("--out_txt", type=str, default=None)
    parser.add_argument("--text_det", default="yolox-s-general-text-pretrain-20221226")
    parser.add_argument("--text_recog", default="satrn-lite-general-pretrain-20230106")

    args = parser.parse_args()
    ocr_engine = OCREngine(text_det=args.text_det, text_recog=args.text_recog, device=args.device)

    if args.reserve_parent_dir:
        paths = glob.glob(args.img + "/*/*")
    else:
        paths = glob.glob(args.img + "/*")
    for path in tqdm(paths):
        img = cv2.imread(path)
        if img is None:
            print(path)
            continue
        ocr_output = ocr_engine(img, extend_ratio=[0.1, 0.3], ratio_thr=5)

        if args.out_dir:
            if args.reserve_parent_dir:
                out_dir_img = Path(args.out_dir) / Path(path).parent.name
            else:
                out_dir_img = Path(args.out_dir)
            if not out_dir_img.exists():
                out_dir_img.mkdir(parents=True)

            visualize_ocr(
                img=img,
                boxes=ocr_output['boxes'],
                texts=ocr_output['texts'],
                image_name=str(Path(path).name),
                outdir=str(out_dir_img)
            )

        if args.out_txt:
            if args.reserve_parent_dir:
                out_dir_txt = Path(args.out_txt) / Path(path).parent.name
            else:
                out_dir_txt = Path(args.out_txt)
            if not out_dir_txt.exists():
                out_dir_txt.mkdir(parents=True)

            out_txt_path = out_dir_txt / Path(path).with_suffix(".txt").name
            data = []
            if args.only_text:
                out = sort_words(ocr_output)
                text = " ".join(out['texts'])
                data.append(text)
            else:
                for box, text in zip(ocr_output['boxes'], ocr_output['texts']):
                    item = "\t".join([str(int(x)) for x in box])
                    item = "\t".join([item, text])
                    data.append(item)
            write_txt(str(out_txt_path), data)
@ -1,27 +0,0 @@
import argparse

from sdsvkie.cfg import load_cfg
from sdsvkie.engine.trainer import Trainer

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--cfg", type=str, default="sdsvkie/cfg/default.yaml")
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--save_dir", type=str, default="./workdirs/exp")
    parser.add_argument("--wandb", action="store_true")

    args = parser.parse_args()
    # cfg = cfg2dict(args.cfg)
    # cfg['device'] = args.device
    # cfg['save_dir'] = args.save_dir
    cfg = load_cfg(args.cfg, vars(args))
    print(cfg)

    if args.wandb:
        cfg['wandb'] = "invoice"

    trainer = Trainer(cfg)
    trainer.train()
@ -1,20 +0,0 @@
import argparse

from sdsvkie.cfg import load_cfg
from sdsvkie.engine.trainer import Trainer

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--cfg", type=str, default="sdsvkie/cfg/default.yaml")
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--weights", type=str, default=None)

    args = parser.parse_args()
    cfg = load_cfg(args.cfg, vars(args))

    trainer = Trainer(cfg)
    metric = trainer.val()
    print(metric)
@ -1,17 +0,0 @@
from .transform import normalize_box, unnormalize_box
from .augmentation import perturbate_character, sampling_data
from .io_file import yaml_load, yaml_save, read_json, write_json
from .visualize import visualize_kie, visualize_ocr
from .word_formation import (
    sliding_windows, Word_group, words_to_lines,
    sort_words, Word, merge_boxes, merge_wordgroups,
    distance_of_boxes, y_distance
)
from .post_processing import *
from .logger import get_logger
from .common import get_info_env
from .convert_pdf2image import pdf_to_image


IMG_EXT = ['.jpg', ".png", ".jpeg"]
PDF_EXT = [".pdf"]
@ -1,112 +0,0 @@
import random

VN_list_char = "aAàÀảẢãÃáÁạẠăĂằẰẳẲẵẴắẮặẶâÂầẦẩẨẫẪấẤậẬbBcCdDđĐeEèÈẻẺẽẼéÉẹẸêÊềỀểỂễỄếẾệỆfFgGhHiIìÌỉỈĩĨíÍịỊjJkKlLmMnNoOòÒỏỎõÕóÓọỌôÔồỒổỔỗỖốỐộỘơƠờỜởỞỡỠớỚợỢpPqQrRsStTuUùÙủỦũŨúÚụỤưƯừỪửỬữỮứỨựỰvVwWxXyYỳỲỷỶỹỸýÝỵỴzZ0123456789!#$%&()*+,-./:;<=>?@[\]^_`{|}~"


def perturbate_character(words: list, ratio=0.01):
    # Algorithm
    # Step 1: count the characters in `words` and sample the positions to perturb.
    #   e.g. words = ["abc", "lkdhf", "lfhdlsa", "akdjhf"] => total_char = 21,
    #   perturbation_positions = [7, 13, 15] (sorted ascending); these index characters
    #   in the concatenated string "abclkdhflfhdlsaakdjhf".
    # Step 2: for each word, compute its start/end position in the concatenated string;
    #   while the end position passes the next sampled position, apply a random edit
    #   (insert left/right, substitute, or delete) at that position and advance the index.

    total_char = sum(len(i) for i in words)
    pertubation_positions = sorted(
        random.sample(range(total_char), int(ratio * total_char))
    )
    pos = 0
    start_pos = 0
    j = 0
    for i, word in enumerate(words):
        if j == len(pertubation_positions):
            break
        start_pos = pos
        pos += len(word)
        while pos > pertubation_positions[j]:
            x = random.randint(0, 3)
            fixing_pos = pertubation_positions[j] - start_pos
            if x == 0:  # insert a random char to the left
                word = (
                    word[:fixing_pos]
                    + VN_list_char[random.randint(0, len(VN_list_char) - 1)]
                    + word[fixing_pos:]
                )

            if x == 1:  # insert a random char to the right
                word = (
                    word[: fixing_pos + 1]
                    + VN_list_char[random.randint(0, len(VN_list_char) - 1)]
                    + word[fixing_pos + 1 :]
                )

            if x == 2:  # substitute a random char at the current position
                word = (
                    word[:fixing_pos]
                    + VN_list_char[random.randint(0, len(VN_list_char) - 1)]
                    + word[fixing_pos + 1 :]
                )

            if x == 3 and len(word) > 1:  # delete the char at the current position
                word = word[:fixing_pos] + word[fixing_pos + 1 :]

            j += 1
            words[i] = word
            if j == len(pertubation_positions):
                break

    return words


def sampling_data(words, boxes, labels, max_num_words=150, slice_interval=50):
    # build aligned sliding windows over words, boxes, and labels, then pick one at random
    total_word = len(words)
    window_size = max_num_words
    text_windows = [
        words[i : i + window_size] for i in range(0, total_word, slice_interval)
    ]
    box_windows = [
        boxes[i : i + window_size] for i in range(0, total_word, slice_interval)
    ]
    label_windows = [
        labels[i : i + window_size] for i in range(0, total_word, slice_interval)
    ]

    sampling_idx = random.choice([i for i in range(len(text_windows))])
    sampling_words, sampling_boxes, sampling_labels = (
        text_windows[sampling_idx],
        box_windows[sampling_idx],
        label_windows[sampling_idx],
    )

    return sampling_idx, sampling_words, sampling_boxes, sampling_labels
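Example usage of the two helpers above, with made-up tokens and boxes:

```
# perturbate_character mutates its argument, so pass a copy; with ratio=0.1 and
# 17 characters, int(0.1 * 17) = 1 random edit is applied.
words = ["Tổng", "cộng", "1.250.000"]
noisy = perturbate_character(words.copy(), ratio=0.1)

# sampling_data picks one aligned sliding window across words/boxes/labels.
boxes = [[0, 0, 10, 10]] * 3
labels = ["total_key", "total_key", "total_value"]
idx, w, b, l = sampling_data(words, boxes, labels, max_num_words=2, slice_interval=1)
# w, b, l hold at most 2 consecutive, mutually aligned tokens
```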
@ -1,8 +0,0 @@
from torch.utils import collect_env


def get_info_env():
    return collect_env.get_pretty_env_info()

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
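A quick sanity check of `count_parameters` on a tiny module (torch is already a dependency of this repo):

```
import torch.nn as nn

model = nn.Linear(10, 2)        # 10 * 2 weights + 2 biases = 22 trainable params
print(count_parameters(model))  # -> 22
```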
@ -1,166 +0,0 @@
import argparse
import glob
import os
import shutil
from pathlib import Path

import fitz  # PyMuPDF, imported as fitz for backward compatibility reasons
import numpy as np
from tqdm import tqdm
import cv2
from pdf2image import convert_from_path


def convert_pdf2image(filename, out_dir, dpi=300, reserve_parent_dir=False, is_get_first_page=False):
    """Convert a PDF to JPG images and save them to disk.

    Non-PDF files are copied through unchanged; .txt files are skipped.

    Args:
        filename: path to the input file.
        out_dir: output directory.
        dpi (int, optional): render resolution. Defaults to 300.
        reserve_parent_dir (bool, optional): mirror the input file's parent
            directory under out_dir. Defaults to False.
        is_get_first_page (bool, optional): only render the first page. Defaults to False.
    """
    out_dir = Path(out_dir)
    filename = Path(filename)
    filename_str = str(filename)

    if reserve_parent_dir:
        parent_dir = filename.parent.name
        out_dir = out_dir / parent_dir

    if not out_dir.exists():
        out_dir.mkdir(parents=True)
    if ".txt" in str(filename).lower():
        return
    if ".pdf" not in str(filename).lower():
        shutil.copy(filename, out_dir)
        return

    try:
        imgs = pdf_to_image(pdf=filename_str, is_get_first_page=is_get_first_page, dpi=dpi)
    except Exception:  # fixed: was a bare except
        print("Use v2: ", filename_str)
        imgs = pdf_to_image_v2(pdf=filename_str, is_get_first_page=is_get_first_page, dpi=dpi)
    print("Len img: ", len(imgs))
    for idx, img in enumerate(imgs):
        outpath = str(out_dir / Path(os.path.splitext(os.path.basename(filename))[0] + "_" + str(idx + 1) + ".jpg"))

        cv2.imwrite(img=img, filename=outpath)


def pdf_to_image_v2(pdf, dpi=300, is_get_first_page=False, max_page=1000):
    """Render a PDF to BGR numpy images via pdf2image (poppler).

    Args:
        pdf: path to the PDF file.
        dpi (int, optional): render resolution. Defaults to 300.
        is_get_first_page (bool, optional): only render the first page. Defaults to False.
        max_page (int, optional): maximum number of pages to render. Defaults to 1000.

    Raises:
        NotImplementedError: if `pdf` is not a path string.

    Returns:
        list of BGR numpy arrays, one per page.
    """
    if isinstance(pdf, str):
        if not os.path.exists(pdf):
            print(f"Not found pdf path at {pdf}")
            return []
        imgs = convert_from_path(pdf, dpi=dpi)  # PILLOW
    else:
        raise NotImplementedError(f"Not yet implement for {type(pdf)} type !!!")

    cv_imgs = []
    for idx, img in enumerate(imgs):
        img = img.convert("RGB")
        cv_img = np.array(img)
        cv_img = cv_img[:, :, ::-1].copy()
        cv_imgs.append(cv_img)
        if is_get_first_page or idx >= max_page:
            break
    return cv_imgs


def pdf_to_image(pdf, dpi=300, is_get_first_page=False, max_page=1000):
    """Render a PDF to BGR numpy images via PyMuPDF.

    Args:
        pdf: path to the PDF file, or raw PDF bytes.
        dpi (int, optional): render resolution. Defaults to 300.
        is_get_first_page (bool, optional): only render the first page. Defaults to False.
        max_page (int, optional): maximum number of pages to render. Defaults to 1000.

    Raises:
        NotImplementedError: if `pdf` is neither a path string nor bytes.

    Returns:
        list of BGR numpy arrays, one per page.
    """
    if isinstance(pdf, str):
        if not os.path.exists(pdf):
            print(f"Not found pdf path at {pdf}")
            return []
        doc = fitz.open(pdf)  # open document
    elif isinstance(pdf, bytes):
        doc = fitz.open(stream=pdf, filetype='pdf')
    else:
        raise NotImplementedError(f"Not yet implement for {type(pdf)} type !!!")

    zoom = dpi // 72  # zoom factor; fitz's standard resolution is 72 dpi
    magnify = fitz.Matrix(zoom, zoom)
    imgs = []

    for idx, page in enumerate(doc):
        pix = page.get_pixmap(matrix=magnify)  # render page to an image

        im = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
        im = np.ascontiguousarray(im[..., [2, 1, 0]])  # rgb to bgr
        imgs.append(im)
        if is_get_first_page or idx >= max_page:
            break
    return imgs


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("--pdf_dir", type=str)
    parser.add_argument("--out_dir", type=str)
    parser.add_argument("--reserve_parent_dir", action='store_true')
    args = parser.parse_args()

    paths = glob.glob(args.pdf_dir + "/*") \
        + glob.glob(args.pdf_dir + "/*/*") \
        + glob.glob(args.pdf_dir + "/*/*/*")
    print(f"Total pdf paths in {args.pdf_dir}: {len(paths)} ")

    error_pdfs = []
    for path in tqdm(paths):
        path = str(path)
        try:
            convert_pdf2image(path, args.out_dir, reserve_parent_dir=args.reserve_parent_dir)
        except Exception as err:
            print(err, path)
            error_pdfs.append(path)
            continue
    print("Total error pdfs: ", len(error_pdfs))
    print(error_pdfs)
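Typical use of the PyMuPDF path above; the file name is illustrative (`cv2` is already imported at the top of this module):

```
# Render every page of a PDF to a BGR numpy array and save each as a JPG.
imgs = pdf_to_image("sample_invoice.pdf", dpi=300)
for i, im in enumerate(imgs):
    cv2.imwrite(f"sample_invoice_{i + 1}.jpg", im)
```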
@ -1,90 +0,0 @@
from typing import List
import random

def gen_random_color():
    red = random.randint(0, 255)
    green = random.randint(0, 255)
    blue = random.randint(0, 255)

    # combine the values into a hexadecimal color code
    color_code = "#{:02x}{:02x}{:02x}".format(red, green, blue)
    return color_code

def gen_raw_label(labels: List[str]):
    """Generate raw label entries for the CVAT tool, e.g.:

    {
        "name": "no_key",
        "color": "#33ddff",
        "type": "any",
        "attributes": []
    },

    Args:
        labels (List[str]): label names to generate entries for.
    """
    raw_label = []
    for label in labels:
        item = {
            "name": label,
            "color": gen_random_color(),
            "type": "any",
            "attributes": []
        }
        raw_label.append(item)

    return raw_label


if __name__ == "__main__":
    labels = [
        # id invoice
        'no_key',  # invoice number
        'no_value',
        'form_key',  # invoice form (template) number
        'form_value',
        'serial_key',  # invoice serial number
        'serial_value',
        'date',

        # seller info
        'seller_company_name_key',
        'seller_company_name_value',
        'seller_tax_code_key',
        'seller_tax_code_value',
        'seller_address_value',
        'seller_address_key',
        'seller_mobile_key',
        'seller_mobile_value',

        # Not yet support seller_bank_no, seller_bank_name
        # 'seller_name_key',
        # 'seller_name_value',
        # 'seller_company_name_value', -> seller_name_value

        # buyer info
        'buyer_name_key',
        'buyer_name_value',
        'buyer_company_name_value',
        'buyer_company_name_key',
        'buyer_tax_code_key',
        'buyer_tax_code_value',
        'buyer_address_key',
        'buyer_address_value',
        'buyer_mobile_key',
        'buyer_mobile_value',

        # money info
        'VAT_amount_key',
        'VAT_amount_value',
        'total_key',
        'total_value',
        'total_in_words_key',
        'total_in_words_value',

        'other',
    ]

    raw_label = gen_raw_label(labels)
    print(raw_label)
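Each generated entry matches the template shown in the docstring, only with a random color; a minimal check:

```
raw = gen_raw_label(["no_key", "no_value"])
# e.g. {"name": "no_key", "color": "#3adf07", "type": "any", "attributes": []}
assert raw[0]["name"] == "no_key" and raw[0]["color"].startswith("#")
```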
@ -1,87 +0,0 @@
import glob
import json
import os
import shutil

def read_txt(txt):
    with open(txt, 'r', encoding='utf8') as f:
        data = [line.strip() for line in f]
    return data

def write_txt(txt, data):
    with open(txt, 'w', encoding='utf8') as f:
        for line in data:
            f.write(line + "\n")

def write_json(json_path, data):
    with open(json_path, 'w', encoding='utf8') as f:
        json.dump(data, f, ensure_ascii=False)


def read_json(json_path):
    with open(json_path, 'r', encoding='utf8') as f:
        data = json.load(f)

    return data

def create_template_info(data_dir, json_out):
    outputs = {}
    txt_paths = sorted(glob.glob(data_dir + "/*.txt"))

    for txt_path in txt_paths:
        txt_name = os.path.basename(txt_path)
        txt_data = read_txt(txt_path)
        wordgroups = [item for item in txt_data if "____kie_wordgroup seller_company_name_value" in item]
        num_line_company = len(wordgroups)
        outputs[txt_name] = num_line_company
    write_json(json_out, outputs)


def filter_data(template_file, data_file, img_dir, txt_dir, output):
    template_data = read_json(template_file)
    data = read_json(data_file)

    new_data = []
    for txt_name, num_wordgroup_line in template_data.items():
        id = txt_name.split("_type99_")[0]
        for txt_name_target, num_wordgroup_target_line in data.items():
            id_target = txt_name_target.split("_type99_")[0]
            if id_target != id:
                continue
            if num_wordgroup_line != num_wordgroup_target_line:
                continue

            new_data.append(txt_name_target)

    new_data = sorted(list(set(new_data)))
    print(new_data[:5])

    if not os.path.exists(output):
        os.makedirs(output, exist_ok=True)

    for txt_name in new_data:
        img_path = os.path.join(img_dir, txt_name.replace(".txt", ".jpg"))
        shutil.copy(img_path, output)
        shutil.copy(os.path.join(txt_dir, txt_name), output)

    return new_data  # fixed: the caller uses the return value, but nothing was returned


if __name__ == "__main__":
    target_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/visualize/vnpt_one_line_r20_txt"
    template_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/sdsvkie/workdirs/visualize/vnpt_one_line_txt"
    output_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/craw_data/output/synth_vnpt_r20/one_line_filtered"
    img_target_dir = "/mnt/ssd1T/hoanglv/Projects/KIE/craw_data/output/synth_vnpt_r20/one_line"
    out_template_json = "vnpt_template.json"
    out_target_json = "vnpt_r20.json"
    create_template_info(template_dir, out_template_json)
    create_template_info(target_dir, out_target_json)

    new_data = filter_data(out_template_json, out_target_json, img_target_dir, target_dir, output_dir)
    print("Total after filter: ", len(new_data))
@ -1,48 +0,0 @@
{
    "invoice_vnpt_id10_type99_1.txt": 1,
    "invoice_vnpt_id11_type99_1.txt": 1,
    "invoice_vnpt_id12_type99_1.txt": 1,
    "invoice_vnpt_id13_type99_1.txt": 2,
    "invoice_vnpt_id14_type99_1.txt": 1,
    "invoice_vnpt_id15_type99_1.txt": 0,
    "invoice_vnpt_id16_type99_1.txt": 1,
    "invoice_vnpt_id17_type99_1.txt": 1,
    "invoice_vnpt_id18_type99_1.txt": 1,
    "invoice_vnpt_id19_type99_1.txt": 1,
    "invoice_vnpt_id20_type99_1.txt": 2,
    "invoice_vnpt_id21_type99_1.txt": 1,
    "invoice_vnpt_id22_type99_1.txt": 1,
    "invoice_vnpt_id23_type99_1.txt": 1,
    "invoice_vnpt_id24_type99_1.txt": 1,
    "invoice_vnpt_id25_type99_1.txt": 2,
    "invoice_vnpt_id26_type99_1.txt": 1,
    "invoice_vnpt_id27_type99_1.txt": 4,
    "invoice_vnpt_id28_type99_1.txt": 0,
    "invoice_vnpt_id29_type99_1.txt": 1,
    "invoice_vnpt_id30_type99_1.txt": 3,
    "invoice_vnpt_id31_type99_1.txt": 1,
    "invoice_vnpt_id32_type99_1.txt": 1,
    "invoice_vnpt_id33_type99_1.txt": 1,
    "invoice_vnpt_id34_type99_1.txt": 3,
    "invoice_vnpt_id35_type99_1.txt": 1,
    "invoice_vnpt_id36_type99_1.txt": 2,
    "invoice_vnpt_id37_type99_1.txt": 1,
    "invoice_vnpt_id39_type99_1.txt": 1,
    "invoice_vnpt_id40_type99_1.txt": 1,
    "invoice_vnpt_id43_type99_1.txt": 1,
    "invoice_vnpt_id44_type99_1.txt": 1,
    "invoice_vnpt_id45_type99_1.txt": 0,
    "invoice_vnpt_id46_type99_1.txt": 2,
    "invoice_vnpt_id47_type99_1.txt": 1,
    "invoice_vnpt_id48_type99_1.txt": 1,
    "invoice_vnpt_id49_type99_1.txt": 1,
    "invoice_vnpt_id50_type99_1.txt": 1,
    "invoice_vnpt_id52_type99_1.txt": 1,
    "invoice_vnpt_id53_type99_1.txt": 1,
    "invoice_vnpt_id54_type99_1.txt": 0,
    "invoice_vnpt_id55_type99_1.txt": 0,
    "invoice_vnpt_id56_type99_1.txt": 2,
    "invoice_vnpt_id57_type99_1.txt": 1,
    "invoice_vnpt_id8_type99_1.txt": 1,
    "invoice_vnpt_id9_type99_1.txt": 1
}
@ -1,213 +0,0 @@
[
    {
        "name": "word",
        "color": "#83e070",
        "type": "any",
        "attributes": [
            { "name": "text", "input_type": "text", "mutable": false, "values": ["x"] },
            { "name": "kie_label", "input_type": "text", "mutable": false, "values": ["x"] }
        ]
    },
    { "name": "no_key", "color": "#cf04f1", "type": "any", "attributes": [] },
    { "name": "no_value", "color": "#0a01ce", "type": "any", "attributes": [] },
    { "name": "form_key", "color": "#bfe920", "type": "any", "attributes": [] },
    { "name": "form_value", "color": "#ac3436", "type": "any", "attributes": [] },
    { "name": "serial_key", "color": "#706724", "type": "any", "attributes": [] },
    { "name": "serial_value", "color": "#7a9b4b", "type": "any", "attributes": [] },
    { "name": "date", "color": "#23f0e9", "type": "any", "attributes": [] },
    { "name": "seller_company_name_key", "color": "#f47ccc", "type": "any", "attributes": [] },
    { "name": "seller_company_name_value", "color": "#9c9c73", "type": "any", "attributes": [] },
    { "name": "seller_tax_code_key", "color": "#afa0fa", "type": "any", "attributes": [] },
    { "name": "seller_tax_code_value", "color": "#6e7352", "type": "any", "attributes": [] },
    { "name": "seller_address_value", "color": "#121512", "type": "any", "attributes": [] },
    { "name": "seller_address_key", "color": "#188735", "type": "any", "attributes": [] },
    { "name": "seller_mobile_key", "color": "#7387fd", "type": "any", "attributes": [] },
    { "name": "seller_mobile_value", "color": "#325bf1", "type": "any", "attributes": [] },
    { "name": "buyer_name_key", "color": "#a5b431", "type": "any", "attributes": [] },
    { "name": "buyer_name_value", "color": "#e63dcc", "type": "any", "attributes": [] },
    { "name": "buyer_company_name_value", "color": "#e9bf0b", "type": "any", "attributes": [] },
    { "name": "buyer_company_name_key", "color": "#a8d921", "type": "any", "attributes": [] },
    { "name": "buyer_tax_code_key", "color": "#1d8f4f", "type": "any", "attributes": [] },
    { "name": "buyer_tax_code_value", "color": "#e638c6", "type": "any", "attributes": [] },
    { "name": "buyer_address_key", "color": "#74afe5", "type": "any", "attributes": [] },
    { "name": "buyer_address_value", "color": "#1518dc", "type": "any", "attributes": [] },
    { "name": "buyer_mobile_key", "color": "#13b1cd", "type": "any", "attributes": [] },
    { "name": "buyer_mobile_value", "color": "#c49d59", "type": "any", "attributes": [] },
    { "name": "VAT_amount_key", "color": "#69c945", "type": "any", "attributes": [] },
    { "name": "VAT_amount_value", "color": "#77c3be", "type": "any", "attributes": [] },
    { "name": "total_key", "color": "#d1353a", "type": "any", "attributes": [] },
    { "name": "total_value", "color": "#246976", "type": "any", "attributes": [] },
    { "name": "total_in_words_key", "color": "#45a8b5", "type": "any", "attributes": [] },
    { "name": "total_in_words_value", "color": "#d800df", "type": "any", "attributes": [] },
    { "name": "other", "color": "#ba0fbd", "type": "any", "attributes": [] }
]
@ -1,479 +0,0 @@
import argparse
import re
from difflib import SequenceMatcher

from rapidfuzz.distance import Levenshtein
from terminaltables import AsciiTable

from sdsvkie.cfg import load_cfg
from sdsvkie.utils.io_file import read_json, write_json
from pathlib import Path


def is_type_list(x, type):

    if not isinstance(x, list):
        return False

    return all(isinstance(item, type) for item in x)


def cal_true_positive_char(pred, gt):
    """Calculate correct character number in prediction.
    Args:
        pred (str): Prediction text.
        gt (str): Ground truth text.
    Returns:
        true_positive_char_num (int): The true positive number.
    """

    all_opt = SequenceMatcher(None, pred, gt)
    true_positive_char_num = 0
    for opt, _, _, s2, e2 in all_opt.get_opcodes():
        if opt == "equal":
            true_positive_char_num += e2 - s2
        else:
            pass
    return true_positive_char_num
||||||
|
|
||||||
|
|
||||||
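# Illustrative example (not part of the original file):
#   cal_true_positive_char("68.730.12O", "68.730.120") returns 9, since
#   SequenceMatcher finds the common run "68.730.12" and only the final
#   O/0 confusion is counted as an error.
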
def post_processing(text, lowercase=False):
    """Remove special characters and extra spaces."""
    # keep only Vietnamese/Latin letters, digits and spaces
    text = re.sub(
        r"[^aAàÀảẢãÃáÁạẠăĂằẰẳẲẵẴắẮặẶâÂầẦẩẨẫẪấẤậẬbBcCdDđĐeEèÈẻẺẽẼéÉẹẸêÊềỀểỂễỄếẾệỆfFgGhHiIìÌỉỈĩĨíÍịỊjJkKlLmMnNoOòÒỏỎõÕóÓọỌôÔồỒổỔỗỖốỐộỘơƠờỜởỞỡỠớỚợỢpPqQrRsStTuUùÙủỦũŨúÚụỤưƯừỪửỬữỮứỨựỰvVwWxXyYỳỲỷỶỹỸýÝỵỴzZ0123456789 ]",
        " ",
        text,
    )
    text = re.sub(r"\s\s+", " ", text)
    text = text.strip()

    if lowercase:
        text = text.lower()

    return text

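# Illustrative example (not part of the original file):
#   post_processing("Tổng cộng: 1.234.567 đ") drops the punctuation and
#   collapses the spaces, giving "Tổng cộng 1 234 567 đ"; with
#   lowercase=True the result is "tổng cộng 1 234 567 đ".
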
def count_matches(pred_texts, gt_texts, use_ignore=True):
    """Count the various match numbers for metric calculation.

    Args:
        pred_texts (list[str]): Predicted text strings.
        gt_texts (list[str]): Ground truth text strings.
    Returns:
        match_res (dict[str: int]): Match numbers used for metric calculation.
    """
    match_res = {
        "gt_char_num": 0,
        "pred_char_num": 0,
        "true_positive_char_num": 0,
        "gt_word_num": 0,
        "match_word_num": 0,
        "match_word_ignore_case": 0,
        "match_word_ignore_case_symbol": 0,
        "match_kie": 0,
        "match_kie_ignore_case": 0,
    }
    norm_ed_sum = 0.0

    gt_texts_for_ned_word = []
    pred_texts_for_ned_word = []
    for pred_text, gt_text in zip(pred_texts, gt_texts):
        if gt_text == pred_text:
            match_res["match_word_num"] += 1
            match_res["match_kie"] += 1
        gt_text_lower = gt_text.lower()
        pred_text_lower = pred_text.lower()
        if gt_text_lower == pred_text_lower:
            match_res["match_word_ignore_case"] += 1

        if use_ignore:
            gt_text_lower_ignore = post_processing(gt_text_lower)
            pred_text_lower_ignore = post_processing(pred_text_lower)
        else:
            gt_text_lower_ignore = gt_text_lower
            pred_text_lower_ignore = pred_text_lower

        if gt_text_lower_ignore == pred_text_lower_ignore:
            # texts match after lowercasing and stripping symbols
            match_res["match_word_ignore_case_symbol"] += 1
            match_res["match_kie_ignore_case"] += 1

        gt_texts_for_ned_word.append(gt_text_lower_ignore.split(" "))
        pred_texts_for_ned_word.append(pred_text_lower_ignore.split(" "))

        match_res["gt_word_num"] += 1

        norm_ed = Levenshtein.normalized_distance(
            pred_text_lower_ignore, gt_text_lower_ignore
        )
        norm_ed_sum += norm_ed

        # numbers to calculate char-level recall & precision
        match_res["gt_char_num"] += len(gt_text_lower_ignore)
        match_res["pred_char_num"] += len(pred_text_lower_ignore)
        true_positive_char_num = cal_true_positive_char(
            pred_text_lower_ignore, gt_text_lower_ignore
        )
        match_res["true_positive_char_num"] += true_positive_char_num

    normalized_edit_distance = norm_ed_sum / max(1, len(gt_texts))
    match_res["ned"] = normalized_edit_distance

    # NED at word level: map every unique word to an integer id and run the
    # Levenshtein distance over the resulting id sequences
    norm_ed_word_sum = 0.0
    unique_words = list(
        set(
            [x for line in pred_texts_for_ned_word for x in line]
            + [x for line in gt_texts_for_ned_word for x in line]
        )
    )
    preds = [
        [unique_words.index(w) for w in pred_text_for_ned_word]
        for pred_text_for_ned_word in pred_texts_for_ned_word
    ]
    truths = [
        [unique_words.index(w) for w in gt_text_for_ned_word]
        for gt_text_for_ned_word in gt_texts_for_ned_word
    ]
    for pred_text, gt_text in zip(preds, truths):
        norm_ed_word = Levenshtein.normalized_distance(pred_text, gt_text)
        norm_ed_word_sum += norm_ed_word

    normalized_edit_distance_word = norm_ed_word_sum / max(1, len(gt_texts))
    match_res["ned_word"] = normalized_edit_distance_word

    return match_res

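# Illustrative example (not part of the original file):
#   count_matches(["hoa don 01"], ["hoa don 07"]) yields gt_word_num == 1,
#   match_word_num == 0, and ned == 0.1 (one substitution across ten characters).
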
def eval_ocr_metric(pred_texts, gt_texts, metric="acc"):
|
|
||||||
"""Evaluate the text recognition performance with metric: word accuracy and
|
|
||||||
1-N.E.D. See https://rrc.cvc.uab.es/?ch=14&com=tasks for details.
|
|
||||||
Args:
|
|
||||||
pred_texts (list[str]): Text strings of prediction.
|
|
||||||
gt_texts (list[str]): Text strings of ground truth.
|
|
||||||
metric (str | list[str]): Metric(s) to be evaluated. Options are:
|
|
||||||
- 'word_acc': Accuracy at word level.
|
|
||||||
- 'word_acc_ignore_case': Accuracy at word level, ignoring letter
|
|
||||||
case.
|
|
||||||
- 'word_acc_ignore_case_symbol': Accuracy at word level, ignoring
|
|
||||||
letter case and symbol. (Default metric for academic evaluation)
|
|
||||||
- 'char_recall': Recall at character level, ignoring
|
|
||||||
letter case and symbol.
|
|
||||||
- 'char_precision': Precision at character level, ignoring
|
|
||||||
letter case and symbol.
|
|
||||||
- 'one_minus_ned': 1 - normalized_edit_distance
|
|
||||||
In particular, if ``metric == 'acc'``, results on all metrics above
|
|
||||||
will be reported.
|
|
||||||
Returns:
|
|
||||||
dict{str: float}: Result dict for text recognition, keys could be some
|
|
||||||
of the following: ['word_acc', 'word_acc_ignore_case',
|
|
||||||
'word_acc_ignore_case_symbol', 'char_recall', 'char_precision',
|
|
||||||
'1-N.E.D'].
|
|
||||||
"""
|
|
||||||
assert isinstance(pred_texts, list)
|
|
||||||
assert isinstance(gt_texts, list)
|
|
||||||
assert len(pred_texts) == len(gt_texts)
|
|
||||||
|
|
||||||
assert isinstance(metric, str) or is_type_list(metric, str)
|
|
||||||
if metric == "acc" or metric == ["acc"]:
|
|
||||||
metric = [
|
|
||||||
"word_acc",
|
|
||||||
"word_acc_ignore_case",
|
|
||||||
"word_acc_ignore_case_symbol",
|
|
||||||
"char_recall",
|
|
||||||
"char_precision",
|
|
||||||
"one_minus_ned",
|
|
||||||
]
|
|
||||||
metric = set([metric]) if isinstance(metric, str) else set(metric)
|
|
||||||
|
|
||||||
# supported_metrics = set([
|
|
||||||
# 'word_acc', 'word_acc_ignore_case', 'word_acc_ignore_case_symbol',
|
|
||||||
# 'char_recall', 'char_precision', 'one_minus_ned', 'one_minust_ned_word'
|
|
||||||
# ])
|
|
||||||
# assert metric.issubset(supported_metrics)
|
|
||||||
|
|
||||||
match_res = count_matches(pred_texts, gt_texts)
|
|
||||||
eps = 1e-8
|
|
||||||
eval_res = {}
|
|
||||||
|
|
||||||
if "char_recall" in metric:
|
|
||||||
char_recall = (
|
|
||||||
1.0 * match_res["true_positive_char_num"] / (eps + match_res["gt_char_num"])
|
|
||||||
)
|
|
||||||
eval_res["char_recall"] = char_recall
|
|
||||||
|
|
||||||
if "char_precision" in metric:
|
|
||||||
char_precision = (
|
|
||||||
1.0
|
|
||||||
* match_res["true_positive_char_num"]
|
|
||||||
/ (eps + match_res["pred_char_num"])
|
|
||||||
)
|
|
||||||
eval_res["char_precision"] = char_precision
|
|
||||||
|
|
||||||
if "word_acc" in metric:
|
|
||||||
word_acc = 1.0 * match_res["match_word_num"] / (eps + match_res["gt_word_num"])
|
|
||||||
eval_res["word_acc"] = word_acc
|
|
||||||
|
|
||||||
if "word_acc_ignore_case" in metric:
|
|
||||||
word_acc_ignore_case = (
|
|
||||||
1.0 * match_res["match_word_ignore_case"] / (eps + match_res["gt_word_num"])
|
|
||||||
)
|
|
||||||
eval_res["word_acc_ignore_case"] = word_acc_ignore_case
|
|
||||||
|
|
||||||
if "word_acc_ignore_case_symbol" in metric:
|
|
||||||
word_acc_ignore_case_symbol = (
|
|
||||||
1.0
|
|
||||||
* match_res["match_word_ignore_case_symbol"]
|
|
||||||
/ (eps + match_res["gt_word_num"])
|
|
||||||
)
|
|
||||||
eval_res["word_acc_ignore_case_symbol"] = word_acc_ignore_case_symbol
|
|
||||||
|
|
||||||
if "one_minus_ned" in metric:
|
|
||||||
|
|
||||||
eval_res["1-N.E.D"] = 1.0 - match_res["ned"]
|
|
||||||
|
|
||||||
if "one_minus_ned_word" in metric:
|
|
||||||
|
|
||||||
eval_res["1-N.E.D_word"] = 1.0 - match_res["ned_word"]
|
|
||||||
|
|
||||||
if "line_acc_ignore_case_symbol" in metric:
|
|
||||||
line_acc_ignore_case_symbol = (
|
|
||||||
1.0 * match_res["match_kie_ignore_case"] / (eps + match_res["gt_word_num"])
|
|
||||||
)
|
|
||||||
eval_res["line_acc_ignore_case_symbol"] = line_acc_ignore_case_symbol
|
|
||||||
|
|
||||||
if "line_acc" in metric:
|
|
||||||
word_acc_ignore_case_symbol = (
|
|
||||||
1.0 * match_res["match_kie"] / (eps + match_res["gt_word_num"])
|
|
||||||
)
|
|
||||||
eval_res["line_acc"] = word_acc_ignore_case_symbol
|
|
||||||
|
|
||||||
for key, value in eval_res.items():
|
|
||||||
eval_res[key] = float("{:.4f}".format(value))
|
|
||||||
|
|
||||||
return eval_res
|
|
||||||
|
|
||||||
|
|
||||||
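# Illustrative example (not part of the original file):
#   eval_ocr_metric(["hoa don 01"], ["hoa don 07"], metric=["one_minus_ned"])
#   -> {'1-N.E.D': 0.9}
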
def eval_kie(pred_e2e_path, gt_e2e_path, kie_labels=[], skip_labels=[], log_failure_case=None, norm_failcase=False):
    if log_failure_case:
        log_failure_case = Path(log_failure_case)
        log_failure_case_dir = log_failure_case.parent
        if not log_failure_case_dir.exists():
            log_failure_case_dir.mkdir(parents=True)

    # both arguments accept either a JSON file path or an already-loaded dict
    if isinstance(gt_e2e_path, str):
        gt_e2e = read_json(gt_e2e_path)
    else:
        gt_e2e = gt_e2e_path
    if isinstance(pred_e2e_path, str):
        preds_e2e = read_json(pred_e2e_path)
    else:
        preds_e2e = pred_e2e_path

    KIE_LABELS_WITH_ONLY_VALUES = [
        class_name
        for class_name in kie_labels
        if "_key" not in class_name
        and "other" not in class_name
        and class_name not in skip_labels
    ]

    pred_texts_dict = {label: [] for label in KIE_LABELS_WITH_ONLY_VALUES}
    gt_texts_dict = {label: [] for label in KIE_LABELS_WITH_ONLY_VALUES}

    results = {label: 1 for label in KIE_LABELS_WITH_ONLY_VALUES}

    fail_cases = {}
    for img_id in preds_e2e.keys():
        fail_cases[img_id] = {}
        pred_items = preds_e2e[img_id]
        gt_items = gt_e2e[img_id]

        if not pred_items:
            pred_items = {
                class_name: "" for class_name in KIE_LABELS_WITH_ONLY_VALUES
            }

        for class_name, text_gt in gt_items.items():
            if class_name in skip_labels:
                continue
            if class_name not in pred_items:
                text_pred = ""
            else:
                text_pred = pred_items[class_name]

            if norm_failcase:
                _text_pred = post_processing(text_pred, lowercase=True)
                _text_gt = post_processing(text_gt, lowercase=True)
            else:
                _text_pred = text_pred
                _text_gt = text_gt

            if _text_pred != _text_gt:
                fail_cases[img_id][class_name] = {
                    'pred': _text_pred,
                    'gt': _text_gt
                }

            pred_texts_dict[class_name].append(text_pred)
            gt_texts_dict[class_name].append(text_gt)

    if log_failure_case:
        write_json(log_failure_case, fail_cases)

    for class_name in KIE_LABELS_WITH_ONLY_VALUES:
        pred_texts = pred_texts_dict[class_name]
        gt_texts = gt_texts_dict[class_name]
        result = eval_ocr_metric(
            pred_texts,
            gt_texts,
            metric=[
                "one_minus_ned",
                "line_acc_ignore_case_symbol",
                "line_acc",
                "one_minus_ned_word",
            ],
        )
        results[class_name] = {
            "1-ned": result["1-N.E.D"],
            "1-ned-word": result["1-N.E.D_word"],
            "line_acc": result["line_acc"],
            "line_acc_ignore_case_symbol": result["line_acc_ignore_case_symbol"],
            "samples": len(pred_texts),
        }

    # average results, weighted by the number of samples per class
    sum_1_ned = sum(
        [
            results[class_name]["1-ned"] * results[class_name]["samples"]
            for class_name in KIE_LABELS_WITH_ONLY_VALUES
        ]
    )
    sum_1_ned_word = sum(
        [
            results[class_name]["1-ned-word"] * results[class_name]["samples"]
            for class_name in KIE_LABELS_WITH_ONLY_VALUES
        ]
    )

    sum_line_acc = sum(
        [
            results[class_name]["line_acc"] * results[class_name]["samples"]
            for class_name in KIE_LABELS_WITH_ONLY_VALUES
        ]
    )
    sum_line_acc_ignore_case_symbol = sum(
        [
            results[class_name]["line_acc_ignore_case_symbol"]
            * results[class_name]["samples"]
            for class_name in KIE_LABELS_WITH_ONLY_VALUES
        ]
    )

    total_samples = sum(
        [results[class_name]["samples"] for class_name in KIE_LABELS_WITH_ONLY_VALUES]
    )
    results["avg_all"] = {
        "1-ned": round(sum_1_ned / total_samples, 4),
        "1-ned-word": round(sum_1_ned_word / total_samples, 4),
        "line_acc": round(sum_line_acc / total_samples, 4),
        "line_acc_ignore_case_symbol": round(
            sum_line_acc_ignore_case_symbol / total_samples, 4
        ),
        "samples": total_samples,
    }

    table_data = [
        [
            "class_name",
            "1-N.E.D",
            "1-N.E.D_word",
            "line_acc",
            "line_acc_ignore_case_symbol",
            "#samples",
        ]
    ]
    for class_name in results.keys():
        table_data.append(
            [
                class_name,
                results[class_name]["1-ned"],
                results[class_name]["1-ned-word"],
                results[class_name]["line_acc"],
                results[class_name]["line_acc_ignore_case_symbol"],
                results[class_name]["samples"],
            ]
        )

    table = AsciiTable(table_data)
    print(table.table)
    return results


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--cfg", type=str)
    parser.add_argument("--pred", type=str, help="predict json file path")
    parser.add_argument("--gt", type=str, help="ground truth json file path")
    parser.add_argument("--log_failure_case", type=str, default=None, help="log_failure_case path")
    parser.add_argument("--norm_failcase", action='store_true')
    args = parser.parse_args()

    cfg = load_cfg(args.cfg)
    kie_labels = cfg['classes']

    result = eval_kie(
        pred_e2e_path=args.pred,
        gt_e2e_path=args.gt,
        kie_labels=kie_labels,
        skip_labels=["Others", "other"],
        log_failure_case=args.log_failure_case,
        norm_failcase=args.norm_failcase
    )

    print("Path of validation dataset: /mnt/ssd1T/hoanglv/Projects/KIE/DATA/dev_model/Invoice/processed/test/PV2/invoice_kie_validation")
    print("Number of validation dataset: ", result[list(result.keys())[0]]['samples'])
    print("Evaluation metric: NLD")
    print("Target level: ")
    print("Achieved level: ")
    print("Verification result: PASS")
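For reference, a minimal usage sketch of the evaluator above (field names and values are illustrative, and it assumes `eval_kie` is importable): both path arguments also accept already-loaded dicts, which is convenient in tests.

```
preds = {"img_001": {"total_value": "1.234.500", "buyer_name_value": "Nguyen Van A"}}
gts = {"img_001": {"total_value": "1.234.500", "buyer_name_value": "Nguyen Van A"}}

results = eval_kie(
    pred_e2e_path=preds,
    gt_e2e_path=gts,
    kie_labels=["total_value", "buyer_name_value", "other"],
    skip_labels=["other"],
)
# prints an AsciiTable with per-class 1-N.E.D / line accuracy plus an "avg_all"
# row; here every metric is 1.0 since prediction and ground truth agree exactly
```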
@ -1,102 +0,0 @@
import json
import re
from pathlib import Path

import yaml


def yaml_load(file="data.yaml", append_filename=False):
    """
    Load YAML data from a file.

    Args:
        file (str, optional): File name. Default is 'data.yaml'.
        append_filename (bool): Add the YAML filename to the YAML dictionary. Default is False.
    Returns:
        dict: YAML data and file name.
    """
    with open(file, errors="ignore", encoding="utf-8") as f:
        s = f.read()  # string

        # Remove special characters
        if not s.isprintable():
            s = re.sub(
                r"[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD\U00010000-\U0010ffff]+",
                "",
                s,
            )

        # Add YAML filename to dict and return
        return (
            {**yaml.safe_load(s), "yaml_file": str(file)}
            if append_filename
            else yaml.safe_load(s)
        )


def yaml_save(file="data.yaml", data=None):
    """
    Save YAML data to a file.

    Args:
        file (str, optional): File name. Default is 'data.yaml'.
        data (dict, optional): Data to save in YAML format. Default is None.
    Returns:
        None: Data is saved to the specified file.
    """
    file = Path(file)
    if not file.parent.exists():
        # Create parent directories if they don't exist
        file.parent.mkdir(parents=True, exist_ok=True)

    with open(file, "w") as f:
        # Dump data to file in YAML format, converting Path objects to strings
        yaml.safe_dump(
            {k: str(v) if isinstance(v, Path) else v for k, v in data.items()},
            f,
            sort_keys=False,
            allow_unicode=True,
        )

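# Illustrative round trip (not part of the original file):
#   yaml_save("cfg.yaml", {"classes": ["total_key", "total_value"], "device": "cpu"})
#   cfg = yaml_load("cfg.yaml")  # -> {'classes': [...], 'device': 'cpu'}
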
def write_txt(txt, data, mode="w"):
|
|
||||||
with open(txt, mode, encoding="utf8") as f:
|
|
||||||
for line in data:
|
|
||||||
f.write(line + "\n")
|
|
||||||
|
|
||||||
|
|
||||||
def read_txt(txt):
|
|
||||||
with open(txt, "r", encoding="utf8") as f:
|
|
||||||
data = [line.strip() for line in f]
|
|
||||||
return data
|
|
||||||
|
|
||||||
|
|
||||||
def write_json(json_path, data, sort_keys=True):
|
|
||||||
with open(json_path, "w", encoding="utf8") as f:
|
|
||||||
json.dump(data, f, ensure_ascii=False, sort_keys=sort_keys)
|
|
||||||
|
|
||||||
|
|
||||||
def read_json(json_path):
|
|
||||||
with open(json_path, "r", encoding="utf8") as f:
|
|
||||||
data = json.load(f)
|
|
||||||
return data
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def load_ocr_output(txt_path):
|
|
||||||
with open(txt_path) as f:
|
|
||||||
lines = [line.replace("\n", "").replace("\r", "") for line in f.readlines()]
|
|
||||||
words, boxes, labels = [], [], []
|
|
||||||
for i, line in enumerate(lines):
|
|
||||||
if len(line.split("\t")) == 6:
|
|
||||||
x1, y1, x2, y2, text, label = line.split("\t")
|
|
||||||
else:
|
|
||||||
x1, y1, x2, y2, text = line.split("\t")
|
|
||||||
label = None
|
|
||||||
|
|
||||||
x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
|
|
||||||
box = [x1, y1, x2, y2]
|
|
||||||
if text != " ":
|
|
||||||
words.append(text)
|
|
||||||
boxes.append(box)
|
|
||||||
labels.append(label)
|
|
||||||
|
|
||||||
return {"boxes": boxes, "texts": words, 'labels': labels}
|
|
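A minimal sketch of the OCR txt format the loader above expects (the file name and coordinates are illustrative): one word per line, tab-separated as x1, y1, x2, y2, text, with an optional trailing label column.

```
write_txt("sample_ocr.txt", ["10\t12\t87\t40\tCÔNG", "95\t12\t160\t40\tTY"])
ocr = load_ocr_output("sample_ocr.txt")
print(ocr["texts"])   # ['CÔNG', 'TY']
print(ocr["boxes"])   # [[10, 12, 87, 40], [95, 12, 160, 40]]
print(ocr["labels"])  # [None, None]  (no label column in this sample)
```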
@ -1,48 +0,0 @@
import os
import sys
import logging
import functools

logger_initialized = {}


@functools.lru_cache()
def get_logger(name='root', log_file=None, log_level=logging.DEBUG):
    """Initialize and get a logger by name.

    If the logger has not been initialized, this method will initialize the
    logger by adding one or two handlers, otherwise the initialized logger will
    be directly returned. During initialization, a StreamHandler will always be
    added. If `log_file` is specified a FileHandler will also be added.

    Args:
        name (str): Logger name.
        log_file (str | None): The log filename. If specified, a FileHandler
            will be added to the logger.
        log_level (int): The logger level. Note that only the process of
            rank 0 is affected, and other processes will set the level to
            "Error" thus be silent most of the time.
    Returns:
        logging.Logger: The expected logger.
    """
    logger = logging.getLogger(name)
    if name in logger_initialized:
        return logger
    for logger_name in logger_initialized:
        if name.startswith(logger_name):
            return logger

    formatter = logging.Formatter(
        '[%(asctime)s] %(name)s %(levelname)s: %(message)s',
        datefmt="%Y/%m/%d %H:%M:%S")

    stream_handler = logging.StreamHandler(stream=sys.stdout)
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)
    if log_file is not None:
        log_file_folder = os.path.split(log_file)[0]
        os.makedirs(log_file_folder, exist_ok=True)
        file_handler = logging.FileHandler(log_file, 'a', encoding='utf8')
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    logger.setLevel(log_level)
    logger_initialized[name] = True
    return logger
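A minimal usage sketch of the logger above (the log path and timestamp are illustrative): the first call installs the stream/file handlers, and later calls with the same name return the cached logger.

```
logger = get_logger(name="sdsvkie", log_file="workdirs/train.log")
logger.info("starting training")
# [2023/06/01 10:00:00] sdsvkie INFO: starting training
```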
@ -1,4 +0,0 @@
from .common_post_processing import *
from .invoice_post_processing import *
from .receipt_post_processing import *
from .hardcoded_postprocess_funcs import *
@ -1,129 +0,0 @@
import re
from datetime import datetime

from sdsvkie.utils import Word_group


YEAR_START = 2000


def construct_word_groups_to_kie_label(list_word_groups: list):
    kie_dict = dict()

    for wg in list_word_groups:
        if wg.kie_label.lower() in ['other', 'others']:
            continue
        if wg.kie_label not in kie_dict:
            kie_dict[wg.kie_label] = [wg]
        else:
            kie_dict[wg.kie_label].append(wg)

    return kie_dict


def near(word_group1: Word_group, word_group2: Word_group):
    # boundingbox is [x1, y1, x2, y2]; `overlap` is the vertical intersection
    # of the two boxes and is negative when there is a gap between them
    min_height = min(
        word_group1.boundingbox[3] - word_group1.boundingbox[1],
        word_group2.boundingbox[3] - word_group2.boundingbox[1],
    )
    overlap = min(word_group1.boundingbox[3], word_group2.boundingbox[3]) - max(
        word_group1.boundingbox[1], word_group2.boundingbox[1]
    )

    if overlap > 0:
        return True
    # a vertical gap smaller than 1.5x the smaller box height still counts as near
    if abs(overlap / min_height) < 1.5:
        return True
    return False

def normalize_number(text_str: str, reserve_dot=False, reserve_plus=False, reserve_minus=False):
    """
    Normalize a string of numbers by removing non-numeric characters
    """
    assert isinstance(text_str, str), "input must be str"
    reserved_chars = ""
    if reserve_dot:
        reserved_chars += ".,"
    if reserve_plus:
        reserved_chars += "+"
    if reserve_minus:
        reserved_chars += "-"
    regex_formula = "[^0-9{}]".format(reserved_chars)
    normalized_text_str = re.sub(r"{}".format(regex_formula), "", text_str)
    return normalized_text_str

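# Illustrative examples (not part of the original file):
#   normalize_number("1.234.567 VND", reserve_dot=True)    -> "1.234.567"
#   normalize_number("+84 912 345 678", reserve_plus=True) -> "+84912345678"
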
def normalize_number_wordgroup(word_group, reserve_dot=False, reserve_plus=False, reserve_minus=False):
    word_group.text = normalize_number(word_group.text, reserve_dot=reserve_dot, reserve_plus=reserve_plus, reserve_minus=reserve_minus)
    return word_group


def tax_code_processing(tax_code_raw: str):
    """
    Normalize a raw tax-code string: if its length (ignoring spaces) does not
    match the expected 13 characters (or 14 with a "-"), keep only the longest
    space-separated chunk to drop a duplicated first/last number.
    """
    if len(tax_code_raw.replace(' ', '')) != 13 or (len(tax_code_raw.replace(' ', '')) != 14 and "-" not in tax_code_raw):
        tax_code_raw = tax_code_raw.split(' ')
        tax_code_raw = sorted(tax_code_raw, key=lambda x: len(x), reverse=True)[0]
    return tax_code_raw.replace(' ', '')


def normalize_tax_wordgroup(word_group, reserve_dot=False, reserve_plus=False, reserve_minus=False):
    word_group.text = tax_code_processing(word_group.text)
    word_group.text = normalize_number(word_group.text, reserve_dot=reserve_dot, reserve_plus=reserve_plus, reserve_minus=reserve_minus)
    return word_group


def _date_format(date_string):
    """Format date string according to the format dd/MM/yyyy"""
    date_string = (
        date_string.replace("ngay ", "")
        .replace(" thang ", "/")
        .replace(" nam ", "/")
    )
    day, month, year = date_string.split("/")
    day = day.rjust(2, "0")
    month = month.rjust(2, "0")
    year = f"20{year}" if len(year) == 2 else year

    # Check valid date
    try:
        _ = datetime(year=int(year), month=int(month), day=int(day))
        if int(year) > YEAR_START:
            return "/".join([day, month, year])
    except ValueError:
        print("Date is invalid", date_string)

    return None


def get_date(list_date):
    """Format regex date matches and drop the invalid ones"""
    list_date = [
        _date_format(date.group(0))
        for date in list_date
    ]
    list_date = [
        date for date in list_date if date is not None
    ]
    return list_date


def merge_multi_page_results(result_pages):
    """Merge the results of multiple pages; the first page wins on conflicts.

    Args:
        result_pages (list[dict]): list of result dicts, one per page
    """
    if len(result_pages) == 0:
        return {}

    result = result_pages[0]
    for result_page in result_pages[1:]:
        for field_name, field_value in result_page.items():
            if field_name not in result:
                result[field_name] = field_value

    return result
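Finally, a minimal usage sketch of the date formatter and the page-merging helper above (all field names and values are illustrative):

```
print(_date_format("ngay 5 thang 7 nam 23"))  # 05/07/2023

pages = [
    {"seller_name_value": "Cong ty A", "total_value": "1.234.500"},
    {"total_value": "9.999", "buyer_name_value": "Nguyen Van A"},
]
print(merge_multi_page_results(pages))
# {'seller_name_value': 'Cong ty A', 'total_value': '1.234.500',
#  'buyer_name_value': 'Nguyen Van A'}  (the first page wins on conflicts)
```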