Compare commits

...

29 Commits

SHA1 Message Date
b1abaed9e3 README.fr.md : Fix typos 2025-04-30 21:33:52 +02:00
2051ae8cdc downloader/convert_dataset.py : Added better typing and formatting 2024-11-08 01:23:38 +01:00
e9df962d7a Changed VENV_PATH from 'venv' to '.venv' 2024-11-08 01:22:49 +01:00
512a21cdb3 Updated dependencies 2024-11-08 01:18:31 +01:00
8c7f2c1c97 python/Makefile : fix test target 2024-07-22 22:07:17 +02:00
b55713cae3 Fix some typos 2024-07-22 22:06:51 +02:00
d83ddd04c1 docker-compose.yaml : disabled pulling for custom images 2024-07-22 22:04:49 +02:00
294eb7fa5b Updated dependencies 2024-07-22 22:03:59 +02:00
95eaffda4d Added testing targets 2024-07-22 22:03:30 +02:00
1ec28f78de Updated Dockerfiles 2024-06-28 20:04:33 +02:00
8565ce782b cpp/Makefile : fixed compute-sanitizer typo 2024-06-10 18:58:12 +02:00
1f5aa0874a Removed deprecated cpp/test.cpp 2024-06-10 18:57:36 +02:00
179f106062 Updated Dockerfiles 2024-06-10 18:57:17 +02:00
2e7b7313c3 cpp : Fixed Docker image execution 2024-05-01 15:14:04 +02:00
4a42747837 python : improved documentation 2024-04-28 22:35:42 +02:00
c71b04f00d cpp : Added documentation 2024-04-28 22:11:33 +02:00
f7ac38b93a python : added comment for potential indices in unit test 2024-04-28 00:26:31 +02:00
434ce20374 python : updated requirements 2024-04-28 00:26:06 +02:00
718724b63b python : Updated code with better display, documentation and format_time 2024-04-28 00:25:13 +02:00
c7d21e1014 cpp : more robust code and added more documentation 2024-04-27 21:08:33 +02:00
45f0f6ab8e Fixed incorrect filemode 2024-04-27 20:52:17 +02:00
466fd0f782 Dockerized every modules 2024-04-27 20:51:36 +02:00
226df0882c Makefiles : checking dependencies beforehand && added help text 2024-04-27 20:50:50 +02:00
ff8142e678 Makefiles : python and cpp will not start without data downloaded by downloader first 2024-04-27 20:50:08 +02:00
8740f7ea4b moved download_data.sh to seperate module downloader 2024-04-27 20:47:30 +02:00
211dcad893 Better handling of printing results board 2024-03-21 00:50:13 +01:00
f65c58d95c configs : added output label description depending on option set 2023-08-03 21:35:03 +02:00
c2f46806c6 python : better format_time_ns & moved unit test to seperate file 2023-08-03 21:34:12 +02:00
c8929e4f89 cpp : fixed format_time_ns microsecond abbreviation 2023-08-03 21:32:29 +02:00
49 changed files with 2507 additions and 1465 deletions

.gitignore vendored Executable file → Normal file

@ -1,6 +1,6 @@
data
*/models*
venv
.venv
*/out*
python/__pycache__
cpp/bin

README.fr.md Executable file → Normal file

@ -1,6 +1,6 @@
# Viola Jones
*Lisez ceci dans d'autres langues: [English](README.md)*
*Lisez ceci dans d'autres langues : [English](README.md)*
## Description
@ -23,7 +23,7 @@ Implémentation de l'algorithme "Viola Jones" en Python et C++.
Vous pouvez configurer l'algorithme avec les variables globales définies au début du fichier *ViolaJones.cpp* puis lancer 'make start'.
Il y a également la commande 'make clean' qui permet de supprimer tout fichiers compilées.
Il y a également la commande 'make clean' qui permet de supprimer tous fichiers compilés.
### Python
@ -34,7 +34,7 @@ Vous pouvez configurer l'algorithme dans le fichier *config.py* puis lancer l'al
## Entraînement
L'algorithme à été entraîné avec un processeur Intel(R) Core(TM) i7-7700K CPU @ 4.20GHz et un GPU NVIDIA GeForce RTX 2080 Ti.
L'algorithme a été entraîné avec un processeur Intel(R) Core(TM) i7-7700K CPU @ 4.20GHz et un GPU NVIDIA GeForce RTX 2080 Ti.
### Tableau de comparaison des temps d'exécution
@ -66,7 +66,7 @@ Il se trouve que le GPU bat systématiquement le CPU en matière de temps d'exé
L'algorithme de ViolaJones étant déterministe, tous les modèles entraînés avec un T donnée, peu importe le moyen (CPU, NJIT ou GPU), seront les mêmes modèles avec les mêmes paramètres.
Rappel: ACC (Accuracy i.e. Précision), F1 (Score F1), FN (Faux Négatif) et FP (Faux Positif).
Rappel : ACC (Accuracy i.e. Précision), F1 (Score F1), FN (Faux Négatif) et FP (Faux Positif).
| Evaluating | ACC (E) | F1 (E) | FN (E) | FP (E) | ACC (T) | F1 (T) | FN (T) | FP (T) |
| ------------------ | ------- | ------ | ------ | ------ | ------- | ------ | ------ | ------ |
@ -202,7 +202,7 @@ L'algorithme de ViolaJones étant déterministe, les fichiers devraient être é
| ViolaJones T = 200 (NJIT) | 3,989,600 | 3ms 989µs 600ns | 15,957,700 | 15ms 957µs 700ns |
| ViolaJones T = 300 (NJIT) | 5,983,900 | 5ms 983µs 900ns | 23,935,500 | 23ms 935µs 500ns |
## Resources additionnels
## Ressources additionnelles
- [Rapid Object Detection using a Boosted Cascade of Simple Features](https://www.cs.cmu.edu/~efros/courses/LBMV07/Papers/viola-cvpr-01.pdf)
- [Chapter 39. Parallel Prefix Sum (Scan) with CUDA](https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda)
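For reference, the metrics recalled above (ACC, F1, FN, FP) reduce to the usual confusion-matrix formulas, writing TP, TN, FP, FN for the true/false positives/negatives; the F1 form matches the doc comment added to f1_score further down in this diff:

$$\mathrm{ACC} = \frac{TP + TN}{TP + TN + FP + FN}, \qquad F_1 = \frac{2\,TP}{2\,TP + FP + FN}$$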

cpp/Dockerfile Normal file

@ -0,0 +1,17 @@
FROM nvidia/cuda:12.6.2-devel-ubi9 AS builder
WORKDIR /home/ViolaJones/cpp
COPY *.cu *.cpp *.hpp Makefile ./
RUN make -j "$(nproc)" && make -j "$(nproc)" ./bin/ViolaJonesTest
FROM nvidia/cuda:12.6.2-base-ubi9
WORKDIR /home/ViolaJones/cpp
RUN dnf install -y make-1:4.3-8.el9 && dnf clean all
COPY --from=builder /home/ViolaJones/cpp/bin ./bin
COPY --from=builder /home/ViolaJones/cpp/Makefile .
ENTRYPOINT ["make"]
CMD ["start"]


@ -1,79 +1,149 @@
CC := nvcc -m64 -std=c++17 -ccbin g++-12 -Xcompiler -m64,-std=c++17
CC := nvcc -m64 -t=0 -std=c++17 -Xcompiler -m64,-std=c++17
OBJ_DIR := bin
$(shell mkdir -p $(OBJ_DIR))
MODELS_DIR := models
OUT_DIR := out
SRC_DIR := .
#CFLAGS := -O0 -Werror=all-warnings -g -G
#CFLAGS := $(CFLAGS) -pg
#CFLAGS := $(CFLAGS) -Xptxas=-w
#CFLAGS := $(CFLAGS) -Xcompiler -Wall,-O0,-g,-Werror,-Werror=implicit-fallthrough=0,-Wextra,-rdynamic
CFLAGS := -O4 -Xcompiler -O4
DATA_PATH := ../data
#CFLAGS := -O0 -g -G -Xptxas=-w -Xcompiler -O0,-rdynamic,-g
#CFLAGS := -O0 -g -G -pg -Xptxas=-w -Xcompiler -O0,-rdynamic,-g
CFLAGS := -dlto -O2 -Xcompiler -O2
#CFLAGS := -dlto -O2 -g -Xcompiler -O2,-g,-ggdb
CFLAGS := $(CFLAGS) -MMD -MP -Werror=all-warnings -Xcompiler -Wall,-Werror,-Wextra
EXEC := $(OBJ_DIR)/ViolaJones
DATA := ../data/X_train.bin ../data/X_test.bin ../data/y_train.bin ../data/y_test.bin
SRC := $(shell find $(SRC_DIR) -name "*.cpp" -o -name "*.cu" )
EXEC_TEST := $(OBJ_DIR)/ViolaJonesTest
DATA := $(DATA_PATH)/X_train.bin $(DATA_PATH)/X_test.bin $(DATA_PATH)/y_train.bin $(DATA_PATH)/y_test.bin
SRC := $(shell find $(SRC_DIR) \( -name '*.cpp' -o -name '*.cu' \) -and -not -name projet_test.cpp)
SRC_TEST := $(shell find $(SRC_DIR) \( -name '*.cpp' -o -name '*.cu' \) -and -not -name projet.cpp)
OBJ_EXT := o
ifeq ($(OS), Windows_NT)
EXEC:=$(EXEC).exe
OBJ_EXT:=obj
EXEC := $(EXEC).exe
OBJ_EXT := obj
endif
OBJ := $(SRC:$(SRC_DIR)/%.cpp=$(OBJ_DIR)/%.$(OBJ_EXT))
OBJ := $(OBJ:$(SRC_DIR)/%.cu=$(OBJ_DIR)/%.$(OBJ_EXT))
OBJ_TEST := $(SRC_TEST:$(SRC_DIR)/%.cpp=$(OBJ_DIR)/%.$(OBJ_EXT))
OBJ_TEST := $(OBJ_TEST:$(SRC_DIR)/%.cu=$(OBJ_DIR)/%.$(OBJ_EXT))
.PHONY: all start reset clean mrproper debug check
.PHONY: all
all: $(EXEC)
all: $(EXEC) $(DATA)
$(OBJ_DIR):
@mkdir -v $@
# Compiling host code
$(OBJ_DIR)/%.$(OBJ_EXT): $(SRC_DIR)/%.cpp
$(OBJ_DIR)/%.$(OBJ_EXT): $(SRC_DIR)/%.cpp | $(OBJ_DIR) check-nvcc-works
@echo Compiling $<
@$(CC) $(CFLAGS) -c $< -o $@
# Compiling gpu code
$(OBJ_DIR)/%.$(OBJ_EXT): $(SRC_DIR)/%.cu
$(OBJ_DIR)/%.$(OBJ_EXT): $(SRC_DIR)/%.cu | $(OBJ_DIR) check-nvcc-works
@echo Compiling $<
@$(CC) $(CFLAGS) -c $< -o $@
# FIXME When using the docker image, Make checks prerequisites even when the target already exists
#$(EXEC): $(OBJ) | check-nvcc-works
$(EXEC): $(OBJ)
@echo Linking object files to $@
@$(CC) $(CFLAGS) $^ -o $@
$(DATA):
@bash ../download_data.sh ..
# FIXME When using the docker image, Make checks prerequisites even when the target already exists
#$(EXEC_TEST): $(OBJ_TEST) | check-nvcc-works
$(EXEC_TEST): $(OBJ_TEST)
@echo Linking object files to $@
@$(CC) $(CFLAGS) $^ -o $@
$(DATA):
@echo 'Missing $(DATA) files, use downloader first' && exit 1
.PHONY: start
start: $(EXEC) $(DATA)
@./$(EXEC)
profile: start
@gprof $(EXEC) gmon.out | gprof2dot | dot -Tpng -o output.png
#@gprof $(EXEC) gmon.out > analysis.txt
.PHONY: test
test: $(EXEC_TEST)
@./$(EXEC_TEST)
.PHONY: debug
debug: $(EXEC) $(DATA)
#@cuda-gdb -q $(EXEC)
@gdb -q --tui $(EXEC)
check: $(EXEC) $(DATA)
.PHONY: profile
profile: start | check-gprof-works check-gprof2dot-works check-dot-works
@gprof $(EXEC) gmon.out | gprof2dot | dot -T png -o output.png
.PHONY: check
check: $(EXEC) $(DATA) | check-valgrind-works
@valgrind -q -s --leak-check=full --show-leak-kinds=all $(EXEC)
cudacheck: $(EXEC) $(DATA)
@cuda-memcheck --destroy-on-device-error kernel --tool memcheck --leak-check full --report-api-errors all $(EXEC)
#@cuda-memcheck --destroy-on-device-error kernel --tool racecheck --racecheck-report all $(EXEC)
#@cuda-memcheck --destroy-on-device-error kernel --tool initcheck --track-unused-memory yes $(EXEC)
#@cuda-memcheck --destroy-on-device-error kernel --tool synccheck $(EXEC)
#@compute-sanitizer --destroy-on-device-error kernel --tool memcheck --leak-check full --report-api-errors all --track-stream-ordered-races all $(EXEC)
#@compute-sanitizer --destroy-on-device-error kernel --tool racecheck --racecheck-detect-level info --racecheck-report all $(EXEC)
#@compute-sanitizer --destroy-on-device-error kernel --tool initcheck --track-unused-memory yes $(EXEC)
#@compute-sanitizer --destroy-on-device-error kernel --tool synccheck $(EXEC)
.PHONY: cudacheck
cudacheck: $(EXEC) $(DATA) | check-compute-sanitizer-works
@compute-sanitizer --destroy-on-device-error kernel --tool memcheck --leak-check full --report-api-errors all --track-stream-ordered-races all --target-processes all $(EXEC)
#@compute-sanitizer --destroy-on-device-error kernel --tool racecheck --racecheck-detect-level info --racecheck-report all $(EXEC)
#@compute-sanitizer --destroy-on-device-error kernel --tool initcheck --track-unused-memory yes $(EXEC)
#@compute-sanitizer --destroy-on-device-error kernel --tool synccheck $(EXEC)
r2: $(EXEC) $(DATA)
@r2 $(EXEC)
.PHONY: log
log: $(DATA) reset
@echo 'Building GPU'
@sed -i 's/GPU_BOOSTED false/GPU_BOOSTED true/' config.hpp
@make -s -j "$(shell nproc)"
@echo 'Logging GPU'
@make -s start > log_gpu
@echo 'Building CPU'
@sed -i 's/GPU_BOOSTED true/GPU_BOOSTED false/' config.hpp
@make -s -j "$(shell nproc)"
@echo 'Logging CPU'
@make -s start > log_cpu
@sed -i 's/GPU_BOOSTED false/GPU_BOOSTED true/' config.hpp
@echo 'Cleaning up'
@make -s reset
.PHONY: reset
reset:
@echo Deleting generated states and models
@rm -rf $(OUT_DIR)/* $(MODELS_DIR)/* | true
@echo 'Deleting generated states and models'
@rm -frv $(OUT_DIR) $(MODELS_DIR)
.PHONY: clean
clean:
@rm $(EXEC)
@rm -fv $(EXEC) log_gpu log_cpu
mrproper:
@rm -r $(OBJ_DIR)
.PHONY: mrproper
mrproper: clean
@rm -rfv $(OBJ_DIR) gmon.out
.PHONY: help
help:
@echo "Available targets:"
@echo "\tall: alias for start, (default target)"
@echo "\tstart: Start the ViolaJones algorithm, require data beforehand downloaded by the downloader."
@echo "\tdebug: Debug the ViolaJones algorithm, require data beforehand downloaded by the downloader."
@echo "\tprofile: Profile the ViolaJones algorithm functions timestamps, require data beforehand downloaded by the downloader."
@echo "\treset: Will delete any saved models and processed data made by ViolaJones."
@echo "\tmrproper: Will remove cpp binary files. Will execute reset target beforehand."
.PHONY: check-nvcc-works
check-nvcc-works:
@nvcc --version >/dev/null 2>&1 || (echo 'Please install the NVIDIA CUDA compiler.' && exit 1)
.PHONY: check-gprof-works
check-gprof-works:
@gprof --version >/dev/null 2>&1 || (echo 'Please install GNU gprof.' && exit 1)
.PHONY: check-gprof2dot-works
check-gprof2dot-works:
@gprof2dot --help >/dev/null 2>&1 || (echo 'Please install gprof2dot.' && exit 1)
.PHONY: check-dot-works
check-dot-works:
@dot --version >/dev/null 2>&1 || (echo 'Please install dot from Graphviz.' && exit 1)
.PHONY: check-valgrind-works
check-valgrind-works:
@valgrind --version >/dev/null 2>&1 || (echo 'Please install valgrind.' && exit 1)
.PHONY: check-compute-sanitizer-works
check-compute-sanitizer-works:
@compute-sanitizer --version >/dev/null 2>&1 || (echo 'Please install Compute Sanitizer from the CUDA toolkit.' && exit 1)
-include $(OBJ:.o=.d)


@ -1,56 +1,61 @@
#include <cmath>
#include "data.hpp"
#include "config.hpp"
#include "ViolaJonesGPU.hpp"
#include "ViolaJonesCPU.hpp"
#include "ViolaJones_device.hpp"
static inline void add_empty_feature(const np::Array<uint8_t>& feats, size_t& n) noexcept {
constexpr static inline void add_empty_feature(const np::Array<uint8_t>& feats, size_t& n) noexcept {
memset(&feats[n], 0, 4 * sizeof(uint8_t));
n += 4;
}
static inline void add_right_feature(const np::Array<uint8_t>& feats, size_t& n, const uint16_t& i, const uint16_t& j, const uint16_t& w, const uint16_t& h) noexcept {
constexpr static inline void add_right_feature(const np::Array<uint8_t>& feats, size_t& n, const uint16_t& i, const uint16_t& j, const uint16_t& w, const uint16_t& h) noexcept {
feats[n++] = i + w;
feats[n++] = j;
feats[n++] = w;
feats[n++] = h;
}
static inline void add_immediate_feature(const np::Array<uint8_t>& feats, size_t& n, const uint16_t& i, const uint16_t& j, const uint16_t& w, const uint16_t& h) noexcept {
constexpr static inline void add_immediate_feature(const np::Array<uint8_t>& feats, size_t& n, const uint16_t& i, const uint16_t& j, const uint16_t& w, const uint16_t& h) noexcept {
feats[n++] = i;
feats[n++] = j;
feats[n++] = w;
feats[n++] = h;
}
static inline void add_bottom_feature(const np::Array<uint8_t>& feats, size_t& n, const uint16_t& i, const uint16_t& j, const uint16_t& w, const uint16_t& h) noexcept {
constexpr static inline void add_bottom_feature(const np::Array<uint8_t>& feats, size_t& n, const uint16_t& i, const uint16_t& j, const uint16_t& w, const uint16_t& h) noexcept {
feats[n++] = i;
feats[n++] = j + h;
feats[n++] = w;
feats[n++] = h;
}
static inline void add_right2_feature(const np::Array<uint8_t>& feats, size_t& n, const uint16_t& i, const uint16_t& j, const uint16_t& w, const uint16_t& h) noexcept {
constexpr static inline void add_right2_feature(const np::Array<uint8_t>& feats, size_t& n, const uint16_t& i, const uint16_t& j, const uint16_t& w, const uint16_t& h) noexcept {
feats[n++] = i + 2 * w;
feats[n++] = j;
feats[n++] = w;
feats[n++] = h;
}
static inline void add_bottom2_feature(const np::Array<uint8_t>& feats, size_t& n, const uint16_t& i, const uint16_t& j, const uint16_t& w, const uint16_t& h) noexcept {
constexpr static inline void add_bottom2_feature(const np::Array<uint8_t>& feats, size_t& n, const uint16_t& i, const uint16_t& j, const uint16_t& w, const uint16_t& h) noexcept {
feats[n++] = i;
feats[n++] = j + 2 * h;
feats[n++] = w;
feats[n++] = h;
}
static inline void add_bottom_right_feature(const np::Array<uint8_t>& feats, size_t& n, const uint16_t& i, const uint16_t& j, const uint16_t& w, const uint16_t& h) noexcept {
constexpr static inline void add_bottom_right_feature(const np::Array<uint8_t>& feats, size_t& n, const uint16_t& i, const uint16_t& j, const uint16_t& w, const uint16_t& h) noexcept {
feats[n++] = i + w;
feats[n++] = j + h;
feats[n++] = w;
feats[n++] = h;
}
/**
* @brief Initialize the features based on the input shape.
*
* @param width Width of the image
* @param height Height of the image
* @return The initialized features
*/
np::Array<uint8_t> build_features(const uint16_t& width, const uint16_t& height) noexcept {
size_t n = 0;
uint16_t w, h, i, j;
@ -110,11 +115,11 @@ np::Array<uint8_t> build_features(const uint16_t& width, const uint16_t& height)
return feats;
}
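As the apply_features kernels in this diff read them, each feature emitted by build_features occupies 16 bytes: four (x, y, w, h) rectangles of one uint8_t each, the first two summed positively (p1, p2) and the last two negatively (n1, n2). A standalone sketch of that layout, with plain structs instead of np::Array and purely illustrative rectangle values; which helper feeds which slot is decided inside build_features and is elided from this hunk:

#include <cstdint>
#include <cstdio>

struct Rect { uint8_t x, y, w, h; }; // one quadruple, as pushed by the add_*_feature helpers

// 16 bytes per feature, matching feats[i + 0..15] in __apply_feature_kernel__:
// value = sum(p1) + sum(p2) - sum(n1) - sum(n2)
struct HaarFeature { Rect p1, p2, n1, n2; };

int main() {
    // A two-rectangle feature built from a 3x4 window at (1, 2): the window to its
    // right (what add_right_feature emits) against the window itself
    // (add_immediate_feature), with the unused slots zeroed as add_empty_feature does.
    const HaarFeature f = { {4, 2, 3, 4}, {0, 0, 0, 0}, {1, 2, 3, 4}, {0, 0, 0, 0} };
    printf("p1 at (%u, %u), n1 at (%u, %u)\n",
           static_cast<unsigned>(f.p1.x), static_cast<unsigned>(f.p1.y),
           static_cast<unsigned>(f.n1.x), static_cast<unsigned>(f.n1.y));
    return 0;
}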
//np::Array<int> select_percentile(const np::Array<uint8_t> X_feat, const np::Array<uint8_t> y) noexcept {
//np::Array<int32_t> select_percentile(const np::Array<uint8_t> X_feat, const np::Array<uint8_t> y) noexcept {
// std::vector<float64_t> class_0, class_1;
//
// const int im_size = X_feat.shape[0] / y.shape[0];
// int idy = 0, n_samples_per_class_0 = 0, n_samples_per_class_1 = 0;
// const int32_t im_size = X_feat.shape[0] / y.shape[0];
// int32_t idy = 0, n_samples_per_class_0 = 0, n_samples_per_class_1 = 0;
// for (size_t i = 0; i < X_feat.shape[0]; i += im_size) {
// if (y[idy] == 0) {
// ++n_samples_per_class_0;
@ -126,24 +131,24 @@ np::Array<uint8_t> build_features(const uint16_t& width, const uint16_t& height)
// }
// ++idy;
// }
// const int n_samples = n_samples_per_class_0 + n_samples_per_class_1;
// const int32_t n_samples = n_samples_per_class_0 + n_samples_per_class_1;
//
// float64_t ss_alldata_0 = 0;
// for (int i = 0;i < n_samples_per_class_0;++i)
// for (int32_t i = 0;i < n_samples_per_class_0;++i)
// ss_alldata_0 += (class_0[i] * class_0[i]);
//
// float64_t ss_alldata_1 = 0;
// for (int i = 0;i < n_samples_per_class_1;++i)
// for (int32_t i = 0;i < n_samples_per_class_1;++i)
// ss_alldata_1 += (class_1[i] * class_1[i]);
//
// const float64_t ss_alldata = ss_alldata_0 + ss_alldata_1;
//
// float64_t sums_classes_0 = 0;
// for (int i = 0;i < n_samples_per_class_0;++i)
// for (int32_t i = 0;i < n_samples_per_class_0;++i)
// sums_classes_0 += class_0[i];
//
// float64_t sums_classes_1 = 0;
// for (int i = 0;i < n_samples_per_class_1;++i)
// for (int32_t i = 0;i < n_samples_per_class_1;++i)
// sums_classes_1 += class_1[i];
//
// float64_t sq_of_sums_alldata = sums_classes_0 + sums_classes_1;
@ -154,15 +159,21 @@ np::Array<uint8_t> build_features(const uint16_t& width, const uint16_t& height)
// const float64_t ss_tot = ss_alldata - sq_of_sums_alldata / n_samples;
// const float64_t sqd_sum_bw_n = sq_of_sums_args_0 / n_samples_per_class_0 + sq_of_sums_args_1 / n_samples_per_class_1 - sq_of_sums_alldata / n_samples;
// const float64_t ss_wn = ss_tot - sqd_sum_bw_n;
// const int df_wn = n_samples - 2;
// const int32_t df_wn = n_samples - 2;
// const float64_t msw = ss_wn / df_wn;
// const float64_t f_values = sqd_sum_bw_n / msw;
//
// const np::Array<int> res = np::empty<int>({ static_cast<size_t>(std::ceil(static_cast<float64_t>(im_size) / 10.0)) });
// const np::Array<int32_t> res = np::empty<int32_t>({ static_cast<size_t>(std::ceil(static_cast<float64_t>(im_size) / 10.0)) });
// // TODO Complete code
// return res;
//}
/**
* @brief Initialize the weights of the weak classifiers based on the training labels.
*
* @param y_train Training labels
* @return The initialized weights
*/
np::Array<float64_t> init_weights(const np::Array<uint8_t>& y_train) noexcept {
np::Array<float64_t> weights = np::empty<float64_t>(y_train.shape);
const uint16_t t = np::sum(np::astype<uint16_t>(y_train));
@ -172,13 +183,30 @@ np::Array<float64_t> init_weights(const np::Array<uint8_t>& y_train) noexcept {
}));
}
np::Array<uint8_t> classify_weak_clf(const np::Array<int32_t>& X_feat_i, const size_t& j, const float64_t& threshold, const float64_t& polarity) noexcept {
/**
* @brief Classify the integrated features based on polarity and threshold.
*
* @param X_feat_i Integrated features
* @param j Index of the classifier
* @param threshold Trained threshold
* @param polarity Trained polarity
* @return Classified features
*/
static np::Array<uint8_t> classify_weak_clf(const np::Array<int32_t>& X_feat_i, const size_t& j, const float64_t& threshold, const float64_t& polarity) noexcept {
np::Array<uint8_t> res = np::empty<uint8_t>({ X_feat_i.shape[1] });
for(size_t i = 0; i < res.shape[0]; ++i)
res[i] = polarity * X_feat_i[j * X_feat_i.shape[1] + i] < polarity * threshold ? 1 : 0;
return res;
}
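A standalone illustration of the polarity/threshold test above, using plain scalars instead of np::Array: with polarity +1 a sample is classified positive when its feature value is below the threshold, and polarity -1 flips the inequality.

#include <cstdio>

static int classify(double polarity, double threshold, double feature) {
    return polarity * feature < polarity * threshold ? 1 : 0; // same test as classify_weak_clf
}

int main() {
    printf("%d\n", classify(+1.0, 5.0, 3.0)); // 1: 3 < 5
    printf("%d\n", classify(+1.0, 5.0, 7.0)); // 0: 7 >= 5
    printf("%d\n", classify(-1.0, 5.0, 7.0)); // 1: -7 < -5, i.e. 7 > 5
    return 0;
}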
/**
* @brief Classify the trained classifiers on the given features.
*
* @param alphas Trained alphas
* @param classifiers Trained classifiers
* @param X_feat Integrated features
* @return Classification results
*/
np::Array<uint8_t> classify_viola_jones(const np::Array<float64_t>& alphas, const np::Array<float64_t>& classifiers, const np::Array<int32_t>& X_feat) noexcept {
np::Array<float64_t> total = np::zeros<float64_t>({ X_feat.shape[1] });
@ -199,6 +227,15 @@ np::Array<uint8_t> classify_viola_jones(const np::Array<float64_t>& alphas, cons
return y_pred;
}
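The body elided from this hunk accumulates alpha-weighted weak votes; the decision threshold of the Viola-Jones strong classifier is half the sum of the alphas (the textbook rule, assumed here since the hunk hides that line):

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    const std::vector<double> alphas = {0.9, 0.4, 0.7}; // illustrative values
    const std::vector<int> votes = {1, 0, 1};           // weak predictions h_t(x) for one sample
    double total = 0.0, alpha_sum = 0.0;
    for (std::size_t t = 0; t < alphas.size(); ++t) {
        total += alphas[t] * votes[t];
        alpha_sum += alphas[t];
    }
    printf("%d\n", total >= 0.5 * alpha_sum ? 1 : 0); // 1.6 >= 1.0 -> positive
    return 0;
}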
/**
* @brief Select the best classifier given their predictions.
*
* @param classifiers The weak classifiers
* @param weights Trained weights of each classifier
* @param X_feat Integrated features
* @param y Feature labels
* @return Index of the best classifier, the best error and the best accuracy
*/
std::tuple<int32_t, float64_t, np::Array<float64_t>> select_best(const np::Array<float64_t>& classifiers, const np::Array<float64_t>& weights, const np::Array<int32_t>& X_feat, const np::Array<uint8_t>& y) noexcept {
std::tuple<int32_t, float64_t, np::Array<float64_t>> res = { -1, np::inf, np::empty<float64_t>({ X_feat.shape[0] }) };
@ -216,6 +253,15 @@ std::tuple<int32_t, float64_t, np::Array<float64_t>> select_best(const np::Array
return res;
}
/**
* @brief Train the weak classifiers.
*
* @param T Number of weak classifiers
* @param X_feat Integrated features
* @param X_feat_argsort Sorted indexes of the integrated features
* @param y Feature labels
* @return List of trained alphas and the list of the final classifiers
*/
std::array<np::Array<float64_t>, 2> train_viola_jones(const size_t& T, const np::Array<int32_t>& X_feat, const np::Array<uint16_t>& X_feat_argsort, const np::Array<uint8_t>& y) noexcept {
np::Array<float64_t> weights = init_weights(y);
np::Array<float64_t> alphas = np::empty<float64_t>({ T });
@ -223,11 +269,7 @@ std::array<np::Array<float64_t>, 2> train_viola_jones(const size_t& T, const np:
for(size_t t = 0; t < T; ++t ){
weights /= np::sum(weights);
#if GPU_BOOSTED
const np::Array<float64_t> classifiers = train_weak_clf_gpu(X_feat, X_feat_argsort, y, weights);
#else
const np::Array<float64_t> classifiers = train_weak_clf_cpu(X_feat, X_feat_argsort, y, weights);
#endif
const np::Array<float64_t> classifiers = train_weak_clf(X_feat, X_feat_argsort, y, weights);
const auto [ clf, error, accuracy ] = select_best(classifiers, weights, X_feat, y);
float64_t beta = error / (1.0 - error);
weights *= np::pow(beta, (1.0 - accuracy));
@ -239,6 +281,13 @@ std::array<np::Array<float64_t>, 2> train_viola_jones(const size_t& T, const np:
return { alphas, final_classifier };
}
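The loop above is the AdaBoost round from the Viola-Jones paper. With $\epsilon_t$ the weighted error of the selected weak classifier:

$$\beta_t = \frac{\epsilon_t}{1 - \epsilon_t}, \qquad w_{t+1,i} = w_{t,i}\,\beta_t^{\,1 - e_i}, \qquad \alpha_t = \log\frac{1}{\beta_t}$$

where $e_i = 1$ if sample $i$ was misclassified and $0$ otherwise (the per-sample accuracy array returned by select_best appears to play the role of $e_i$ here), so correctly classified samples are downweighted whenever $\epsilon_t < 1/2$, i.e. $\beta_t < 1$. Only the $\beta_t$ and weight updates are visible in this hunk; the $\alpha_t$ rule is the paper's.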
/**
* @brief Compute the accuracy score i.e. how close a given set of measurements is to its true value.
*
* @param y Ground truth labels
* @param y_pred Predicted labels
* @return computed accuracy score
*/
float64_t accuracy_score(const np::Array<uint8_t>& y, const np::Array<uint8_t>& y_pred) noexcept {
float64_t res = 0.0;
for(size_t i = 0; i < y.shape[0]; ++i)
@ -247,6 +296,13 @@ float64_t accuracy_score(const np::Array<uint8_t>& y, const np::Array<uint8_t>&
return res / y.shape[0];
}
/**
* @brief Compute the precision score i.e. how close a given set of measurements are to each other.
*
* @param y Ground truth labels
* @param y_pred Predicted labels
* @return computed precision score
*/
float64_t precision_score(const np::Array<uint8_t>& y, const np::Array<uint8_t>& y_pred) noexcept {
uint16_t true_positive = 0, false_positive = 0;
for(size_t i = 0; i < y.shape[0]; ++i)
@ -259,6 +315,13 @@ float64_t precision_score(const np::Array<uint8_t>& y, const np::Array<uint8_t>&
return static_cast<float64_t>(true_positive) / (true_positive + false_positive);
}
/**
* @brief Compute the recall score i.e. the ratio (TP / (TP + FN)) where TP is the number of true positives and FN the number of false negatives.
*
* @param y Ground truth labels
* @param y_pred Predicted labels
* @return computed recall score
*/
float64_t recall_score(const np::Array<uint8_t>& y, const np::Array<uint8_t>& y_pred) noexcept {
uint16_t true_positive = 0, false_negative = 0;
for(size_t i = 0; i < y.shape[0]; ++i)
@ -272,12 +335,35 @@ float64_t recall_score(const np::Array<uint8_t>& y, const np::Array<uint8_t>& y_
return static_cast<float64_t>(true_positive) / (true_positive + false_negative);
}
/**
* @brief Compute the F1 score aka balanced F-score or F-measure.
*
* F1 = (2 * TP) / (2 * TP + FP + FN)
* where TP is the true positives,
* FP is the false positives,
* and FN is the false negatives
*
* @param y Ground truth labels
* @param y_pred Predicted labels
* @return computed F1 score
*/
float64_t f1_score(const np::Array<uint8_t>& y, const np::Array<uint8_t>& y_pred) noexcept {
const float64_t precision = precision_score(y, y_pred);
const float64_t recall = recall_score(y, y_pred);
return 2 * (precision * recall) / (precision + recall);
}
/**
* @brief Compute the confusion matrix to evaluate a given classification.
*
* A confusion matrix of a binary classification consists of a 2x2 matrix containing
* | True negatives | False positives |
* | False negatives | True positives |
*
* @param y Ground truth labels
* @param y_pred Predicted labels
* @return computed confusion matrix
*/
std::tuple<uint16_t, uint16_t, uint16_t, uint16_t> confusion_matrix(const np::Array<uint8_t>& y, const np::Array<uint8_t>& y_pred) noexcept {
uint16_t true_positive = 0, false_positive = 0, true_negative = 0, false_negative = 0;
for(size_t i = 0; i < y.shape[0]; ++i)
@ -293,4 +379,3 @@ std::tuple<uint16_t, uint16_t, uint16_t, uint16_t> confusion_matrix(const np::Ar
++false_positive;
return std::make_tuple(true_negative, false_positive, false_negative, true_positive);
}
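A toy run of the metric functions above, reduced to plain vectors (values chosen only for illustration):

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    const std::vector<int> y      = {1, 1, 0, 0, 1, 0}; // ground truth
    const std::vector<int> y_pred = {1, 0, 0, 1, 1, 0}; // predictions
    int tp = 0, fp = 0, tn = 0, fn = 0;
    for (std::size_t i = 0; i < y.size(); ++i) {
        if (y[i] == 1) (y_pred[i] == 1 ? tp : fn)++;
        else           (y_pred[i] == 1 ? fp : tn)++;
    }
    const double precision = static_cast<double>(tp) / (tp + fp); // 2/3
    const double recall    = static_cast<double>(tp) / (tp + fn); // 2/3
    const double f1 = 2.0 * precision * recall / (precision + recall); // 2/3
    printf("TN=%d FP=%d FN=%d TP=%d F1=%.3f\n", tn, fp, fn, tp, f1);
    return 0;
}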


@ -1,10 +1,16 @@
#pragma once
#include <filesystem>
namespace fs = std::filesystem;
#include "data.hpp"
#include "toolbox.hpp"
//#include "config.hpp"
/**
* @brief Test if an array from a CPU computation is equal to its GPU-computed equivalent.
*
* @tparam T Inner type of the arrays to test
* @param cpu CPU Array
* @param gpu GPU Array
* @return Whether the test was successful
*/
template <typename T>
bool unit_test_cpu_vs_gpu(const np::Array<T>& cpu, const np::Array<T>& gpu) noexcept {
if (cpu.shape != gpu.shape) {
@ -28,6 +34,14 @@ bool unit_test_cpu_vs_gpu(const np::Array<T>& cpu, const np::Array<T>& gpu) noex
return eq == length;
}
/**
* @brief Test if a given 2D array of indices sorts a given 2D array
*
* @tparam T Inner type of the array to test
* @param a 2D Array of data
* @param indices 2D Indices that sort the array
* @return Whether the test was successful
*/
template <typename T>
bool unit_test_argsort_2d(const np::Array<T>& a, const np::Array<uint16_t>& indices) noexcept {
if (a.shape != indices.shape) {
@ -52,68 +66,122 @@ bool unit_test_argsort_2d(const np::Array<T>& a, const np::Array<uint16_t>& indi
return correct == total;
}
/**
* @brief Benchmark a function and display the result on stdout.
*
* @tparam T Resulting type of the function to benchmark
* @tparam F Signature of the function to call
* @tparam Args Arguments variadic of the function to call
* @param step_name Name of the function to log
* @param column_width Width of the column to print during logging
* @param fnc Function to benchmark
* @param args Arguments to pass to the function to call
* @return Result of the benchmarked function
*/
template <typename T, typename F, typename... Args>
T benchmark_function(const char* step_name, const F& fnc, Args &&...args) noexcept {
T benchmark_function(const char* const step_name, const int32_t& column_width, const F& fnc, Args &&...args) noexcept {
#if __DEBUG == false
printf("%s...\r", step_name);
fflush(stdout); // manual flush is mandatory, otherwise it will not be shown immediately because the output is buffered
fprintf(stderr, "%s...\r", step_name);
fflush(stderr); // manual flush is mandatory, otherwise it will not be shown immediately because the output is buffered
#endif
const std::chrono::system_clock::time_point start = perf_counter_ns();
const T res = fnc(std::forward<Args>(args)...);
const long long time_spent = duration_ns(perf_counter_ns() - start);
printf("| %-49s | %18s | %-29s |\n", step_name, thousand_sep(time_spent).c_str(), format_time_ns(time_spent).c_str());
formatted_row<3>({ column_width, -18, 29 }, { step_name, thousand_sep(time_spent).c_str(), format_time_ns(time_spent).c_str() });
return res;
}
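benchmark_function depends on project helpers (perf_counter_ns, duration_ns, formatted_row); the same measure-then-log pattern using only the standard library might look like this sketch:

#include <chrono>
#include <cstdio>
#include <utility>

template <typename F, typename... Args>
auto timed(const char* name, F&& fnc, Args&&... args) {
    const auto start = std::chrono::steady_clock::now();
    auto res = std::forward<F>(fnc)(std::forward<Args>(args)...);
    const auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
        std::chrono::steady_clock::now() - start).count();
    fprintf(stderr, "%s took %lldns\n", name, static_cast<long long>(ns)); // log on stderr, as above
    return res;
}

int main() {
    const int sum = timed("sum", [] {
        int s = 0;
        for (int i = 0; i < 1000; ++i) s += i;
        return s;
    });
    printf("%d\n", sum); // 499500
    return 0;
}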
/**
* @brief Benchmark a function and display the result on stdout.
*
* @tparam F Signature of the function to call
* @tparam Args Arguments variadic of the function to call
* @param step_name Name of the function to log
* @param column_width Width of the column to print during logging
* @param fnc Function to benchmark
* @param args Arguments to pass to the function to call
*/
template <typename F, typename... Args>
void benchmark_function_void(const char* step_name, const F& fnc, Args &&...args) noexcept {
void benchmark_function_void(const char* const step_name, const int32_t& column_width, const F& fnc, Args &&...args) noexcept {
#if __DEBUG == false
printf("%s...\r", step_name);
fflush(stdout); // manual flush is mandatory, otherwise it will not be shown immediately because the output is buffered
fprintf(stderr, "%s...\r", step_name);
fflush(stderr); // manual flush is mandatory, otherwise it will not be shown immediately because the output is buffered
#endif
const std::chrono::system_clock::time_point start = perf_counter_ns();
fnc(std::forward<Args>(args)...);
const long long time_spent = duration_ns(perf_counter_ns() - start);
printf("| %-49s | %18s | %-29s |\n", step_name, thousand_sep(time_spent).c_str(), format_time_ns(time_spent).c_str());
formatted_row<3>({ column_width, -18, 29 }, { step_name, thousand_sep(time_spent).c_str(), format_time_ns(time_spent).c_str() });
}
/**
* @brief Either execute a function then save the result or load the already cached result.
*
* @tparam T Inner type of the resulting array
* @tparam F Signature of the function to call
* @tparam Args Arguments variadic of the function to call
* @param step_name Name of the function to log
* @param column_width Width of the column to print during logging
* @param filename Name of the file where the result is saved
* @param force_redo Recall the function even if the result is already saved, ignored if the result is not cached
* @param save_state Whether the computed result will be saved or not, ignored if loading an already cached result
* @param out_dir Path of the directory to save the result
* @param fnc Function to call
* @param args Arguments to pass to the function to call
* @return The result of the called function
*/
template <typename T, typename F, typename... Args>
np::Array<T> state_saver(const char* step_name, const char* filename, const bool& force_redo, const bool& save_state, const char* out_dir, const F& fnc, Args &&...args) noexcept {
np::Array<T> state_saver(const char* const step_name, const int32_t& column_width, const char* const filename, const bool& force_redo, const bool& save_state, const char* const out_dir, const F& fnc, Args &&...args) noexcept {
char filepath[BUFFER_SIZE] = { 0 };
sprintf(filepath, "%s/%s.bin", out_dir, filename);
snprintf(filepath, BUFFER_SIZE, "%s/%s.bin", out_dir, filename);
np::Array<T> bin;
if (!fs::exists(filepath) || force_redo) {
bin = std::move(benchmark_function<np::Array<T>>(step_name, fnc, std::forward<Args>(args)...));
if (!std::filesystem::exists(filepath) || force_redo) {
bin = benchmark_function<np::Array<T>>(step_name, column_width, fnc, std::forward<Args>(args)...);
if(save_state){
#if __DEBUG == false
printf("Saving results of %s\r", step_name);
fflush(stdout);
fprintf(stderr, "Saving results of %s\r", step_name);
fflush(stderr);
#endif
save<T>(bin, filepath);
#if __DEBUG == false
printf("%*c\r", 100, ' ');
fflush(stdout);
fprintf(stderr, "%*c\r", 100, ' '); // Clear previous clear
fflush(stderr);
#endif
}
} else {
#if __DEBUG == false
printf("Loading results of %s\r", step_name);
fflush(stdout);
fprintf(stderr, "Loading results of %s\r", step_name);
fflush(stderr);
#endif
bin = std::move(load<T>(filepath));
printf("| %-49s | %18s | %-29s |\n", step_name, "None", "loaded saved state");
bin = load<T>(filepath);
formatted_row<3>({ column_width, -18, 29 }, { step_name, "None", "loaded saved state" });
}
return bin;
}
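The compute-or-load flow of state_saver, reduced to a minimal sketch; a raw binary dump of a std::vector stands in for the project's save/load of np::Array:

#include <cstddef>
#include <cstdio>
#include <filesystem>
#include <vector>

static std::vector<double> expensive() { return {1.0, 2.0, 3.0}; } // stand-in for fnc(args...)

static void save(const std::vector<double>& v, const char* path) {
    FILE* const f = std::fopen(path, "wb");
    if (!f) return;
    const std::size_t n = v.size();
    std::fwrite(&n, sizeof n, 1, f);
    std::fwrite(v.data(), sizeof(double), n, f);
    std::fclose(f);
}

static std::vector<double> load(const char* path) {
    std::vector<double> v;
    FILE* const f = std::fopen(path, "rb");
    std::size_t n = 0;
    if (f && std::fread(&n, sizeof n, 1, f) == 1) {
        v.resize(n);
        if (std::fread(v.data(), sizeof(double), n, f) != n)
            v.clear();
    }
    if (f) std::fclose(f);
    return v;
}

int main() {
    const char* const path = "cache.bin";
    std::vector<double> v;
    if (!std::filesystem::exists(path)) { // same existence test as state_saver
        v = expensive();
        save(v, path);  // only when save_state would be true
    } else {
        v = load(path); // "loaded saved state"
    }
    printf("%zu values\n", v.size());
    return 0;
}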
/**
* @brief Either execute a function then save the results or load the already cached results.
*
* @tparam T Inner type of the resulting arrays
* @tparam F Signature of the function to call
* @tparam Args Arguments variadic of the function to call
* @param step_name Name of the function to log
* @param column_width Width of the column to print during logging
* @param filenames List of filenames where the results are saved
* @param force_redo Recall the function even if the results are already saved, ignored if results are not cached
* @param save_state Whether the computed results will be saved or not, ignored if loading already cached results
* @param out_dir Path of the directory to save the results
* @param fnc Function to call
* @param args Arguments to pass to the function to call
* @return The results of the called function
*/
template <typename T, size_t N, typename F, typename... Args>
std::array<np::Array<T>, N> state_saver(const char* step_name, const std::vector<const char*>& filenames, const bool& force_redo, const bool& save_state, const char* out_dir, const F& fnc, Args &&...args) noexcept {
std::array<np::Array<T>, N> state_saver(const char* const step_name, const int32_t& column_width, const std::vector<const char*>& filenames, const bool& force_redo, const bool& save_state, const char* const out_dir, const F& fnc, Args &&...args) noexcept {
char filepath[BUFFER_SIZE] = { 0 };
bool abs = false;
for (const char* filename : filenames){
sprintf(filepath, "%s/%s.bin", out_dir, filename);
if (!fs::exists(filepath)) {
for (const char* const filename : filenames){
snprintf(filepath, BUFFER_SIZE, "%s/%s.bin", out_dir, filename);
if (!std::filesystem::exists(filepath)) {
abs = true;
break;
}
@ -121,47 +189,138 @@ std::array<np::Array<T>, N> state_saver(const char* step_name, const std::vector
std::array<np::Array<T>, N> bin;
if (abs || force_redo) {
bin = std::move(benchmark_function<std::array<np::Array<T>, N>>(step_name, fnc, std::forward<Args>(args)...));
bin = benchmark_function<std::array<np::Array<T>, N>>(step_name, column_width, fnc, std::forward<Args>(args)...);
if (save_state){
#if __DEBUG == false
printf("Saving results of %s\r", step_name);
fflush(stdout);
fprintf(stderr, "Saving results of %s\r", step_name);
fflush(stderr);
#endif
size_t i = 0;
for (const char* filename : filenames){
sprintf(filepath, "%s/%s.bin", out_dir, filename);
for (const char* const filename : filenames){
snprintf(filepath, BUFFER_SIZE, "%s/%s.bin", out_dir, filename);
save<T>(bin[i++], filepath);
}
#if __DEBUG == false
printf("%*c\r", 100, ' ');
fflush(stdout);
fprintf(stderr, "%*c\r", 100, ' '); // Clear previous print
fflush(stderr);
#endif
}
} else {
#if __DEBUG == false
printf("Loading results of %s\r", step_name);
fflush(stdout);
fprintf(stderr, "Loading results of %s\r", step_name);
fflush(stderr);
#endif
size_t i = 0;
for (const char* filename : filenames){
sprintf(filepath, "%s/%s.bin", out_dir, filename);
bin[i++] = std::move(load<T>(filepath));
for (const char* const filename : filenames){
snprintf(filepath, BUFFER_SIZE, "%s/%s.bin", out_dir, filename);
bin[i++] = load<T>(filepath);
}
printf("| %-49s | %18s | %-29s |\n", step_name, "None", "loaded saved state");
formatted_row<3>({ column_width, -18, 29 }, { step_name, "None", "loaded saved state" });
}
return bin;
}
np::Array<uint16_t> argsort_2d_cpu(const np::Array<int32_t>&) noexcept;
/**
* @brief Initialize the features based on the input shape.
*
* @param width Width of the image
* @param height Height of the image
* @return The initialized features
*/
np::Array<uint8_t> build_features(const uint16_t&, const uint16_t&) noexcept;
np::Array<int> select_percentile(const np::Array<uint8_t>&, const np::Array<uint8_t>&) noexcept;
//np::Array<int32_t> select_percentile(const np::Array<uint8_t>&, const np::Array<uint8_t>&) noexcept;
/**
* @brief Classify the trained classifiers on the given features.
*
* @param alphas Trained alphas
* @param classifiers Trained classifiers
* @param X_feat Integrated features
* @return Classification results
*/
np::Array<uint8_t> classify_viola_jones(const np::Array<float64_t>&, const np::Array<float64_t>&, const np::Array<int32_t>&) noexcept;
/**
* @brief Initialize the weights of the weak classifiers based on the training labels.
*
* @param y_train Training labels
* @return The initialized weights
*/
np::Array<float64_t> init_weights(const np::Array<uint8_t>&) noexcept;
/**
* @brief Select the best classifier given their predictions.
*
* @param classifiers The weak classifiers
* @param weights Trained weights of each classifier
* @param X_feat Integrated features
* @param y Feature labels
* @return Index of the best classifier, the best error and the best accuracy
*/
std::tuple<int32_t, float64_t, np::Array<float64_t>> select_best(const np::Array<float64_t>&, const np::Array<float64_t>&, const np::Array<int32_t>&,
const np::Array<uint8_t>&) noexcept;
/**
* @brief Train the weak classifiers.
*
* @param T Number of weak classifiers
* @param X_feat Integrated features
* @param X_feat_argsort Sorted indexes of the integrated features
* @param y Feature labels
* @return List of trained alphas and the list of the final classifiers
*/
std::array<np::Array<float64_t>, 2> train_viola_jones(const size_t&, const np::Array<int32_t>&, const np::Array<uint16_t>&, const np::Array<uint8_t>&) noexcept;
/**
* @brief Compute the accuracy score i.e. how close a given set of measurements is to its true value.
*
* @param y Ground truth labels
* @param y_pred Predicted labels
* @return computed accuracy score
*/
float64_t accuracy_score(const np::Array<uint8_t>&, const np::Array<uint8_t>&) noexcept;
/**
* @brief Compute the precision score i.e. how close a given set of measurements are to each other.
*
* @param y Ground truth labels
* @param y_pred Predicted labels
* @return computed precision score
*/
float64_t precision_score(const np::Array<uint8_t>&, const np::Array<uint8_t>&) noexcept;
/**
* @brief Compute the recall score i.e. the ratio (TP / (TP + FN)) where TP is the number of true positives and FN the number of false negatives.
*
* @param y Ground truth labels
* @param y_pred Predicted labels
* @return computed recall score
*/
float64_t recall_score(const np::Array<uint8_t>&, const np::Array<uint8_t>&) noexcept;
/**
* @brief Compute the F1 score aka balanced F-score or F-measure.
*
* F1 = (2 * TP) / (2 * TP + FP + FN)
* where TP is the true positives,
* FP is the false positives,
* and FN is the false negatives
*
* @param y Ground truth labels
* @param y_pred Predicted labels
* @return computed F1 score
*/
float64_t f1_score(const np::Array<uint8_t>&, const np::Array<uint8_t>&) noexcept;
/**
* @brief Compute the confusion matrix to evaluate a given classification.
*
* A confusion matrix of a binary classification consists of a 2x2 matrix containing
* | True negatives | False positives |
* | False negatives | True positives |
*
* @param y Ground truth labels
* @param y_pred Predicted labels
* @return computed confusion matrix
*/
std::tuple<uint16_t, uint16_t, uint16_t, uint16_t> confusion_matrix(const np::Array<uint8_t>&, const np::Array<uint8_t>&) noexcept;


@ -1,7 +1,15 @@
#include "data.hpp"
#include "toolbox.hpp"
#include "config.hpp"
np::Array<uint32_t> set_integral_image_cpu(const np::Array<uint8_t>& set) noexcept {
#if GPU_BOOSTED == false
/**
* @brief Transform the input images into integrated images (CPU version).
*
* @param X Dataset of images
* @return Dataset of integrated images
*/
np::Array<uint32_t> set_integral_image(const np::Array<uint8_t>& set) noexcept {
np::Array<uint32_t> X_ii = np::empty<uint32_t>(set.shape);
size_t i, y, x, s;
@ -31,7 +39,14 @@ constexpr static inline int16_t __compute_feature__(const np::Array<uint32_t>& X
return X_ii[j + _yh + w] + X_ii[j + _y] - X_ii[j + _yh] - X_ii[j + _y + w];
}
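For intuition: an integral image stores at each position the sum of all pixels above and to the left, so any rectangle sum costs four lookups, exactly the ii[D] + ii[A] - ii[B] - ii[C] pattern computed by __compute_feature__ above. A minimal sketch:

#include <cstdint>
#include <cstdio>

int main() {
    const uint8_t img[3][3] = { {1, 2, 3}, {4, 5, 6}, {7, 8, 9} };
    uint32_t ii[4][4] = {}; // one-pixel zero border simplifies the lookups
    for (int y = 1; y < 4; ++y)
        for (int x = 1; x < 4; ++x)
            ii[y][x] = img[y-1][x-1] + ii[y-1][x] + ii[y][x-1] - ii[y-1][x-1];
    // Sum of the 2x2 rectangle with top-left (1, 1) in img: 5 + 6 + 8 + 9 = 28
    const uint32_t sum = ii[3][3] + ii[1][1] - ii[1][3] - ii[3][1];
    printf("%u\n", sum); // 28
    return 0;
}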
np::Array<int32_t> apply_features_cpu(const np::Array<uint8_t>& feats, const np::Array<uint32_t>& X_ii) noexcept {
/**
* @brief Apply the features on an integrated image dataset (CPU version).
*
* @param feats Features to apply
* @param X_ii Integrated image dataset
* @return Applied features
*/
np::Array<int32_t> apply_features(const np::Array<uint8_t>& feats, const np::Array<uint32_t>& X_ii) noexcept {
np::Array<int32_t> X_feat = np::empty<int32_t>({ feats.shape[0], X_ii.shape[0] });
size_t j, feat_idx = 0;
@ -51,7 +66,7 @@ np::Array<int32_t> apply_features_cpu(const np::Array<uint8_t>& feats, const np:
return X_feat;
}
np::Array<float64_t> train_weak_clf_cpu(const np::Array<int32_t>& X_feat, const np::Array<uint16_t>& X_feat_argsort, const np::Array<uint8_t>& y, const np::Array<float64_t>& weights) noexcept {
np::Array<float64_t> train_weak_clf(const np::Array<int32_t>& X_feat, const np::Array<uint16_t>& X_feat_argsort, const np::Array<uint8_t>& y, const np::Array<float64_t>& weights) noexcept {
float64_t total_pos = 0.0, total_neg = 0.0;
for(size_t i = 0; i < y.shape[0]; ++i)
(y[i] == static_cast<uint8_t>(1) ? total_pos : total_neg) += weights[i];
@ -81,7 +96,69 @@ np::Array<float64_t> train_weak_clf_cpu(const np::Array<int32_t>& X_feat, const
return classifiers;
}
np::Array<uint16_t> argsort_2d_cpu(const np::Array<int32_t>& X_feat) noexcept {
/**
* @brief Perform an indirect sort of a given array within a given bound.
*
* @tparam T Inner type of the array
* @param a Array to sort
* @param indices Array of indices to write to
* @param low Lower bound to sort
* @param high Upper bound to sort
*/
template<typename T>
static void argsort(const T* const a, uint16_t* const indices, size_t low, size_t high) noexcept {
const size_t total = high - low + 1;
size_t* const stack = new size_t[total]{low, high};
//size_t stack[total];
//stack[0] = l;
//stack[1] = h;
size_t top = 1;
while (top <= total) {
high = stack[top--];
low = stack[top--];
if(low >= high)
break;
const size_t p = as_partition(a, indices, low, high);
if (p - 1 > low && p - 1 < total) {
stack[++top] = low;
stack[++top] = p - 1;
}
if (p + 1 < high) {
stack[++top] = p + 1;
stack[++top] = high;
}
}
delete[] stack;
}
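The function above is an explicit-stack quicksort that permutes indices rather than data. The same "argsort" result in its simplest form, via std::sort over an index array:

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    const int32_t a[] = {30, 10, 20};
    uint16_t idx[] = {0, 1, 2};
    // Sort the indices by the values they point to, leaving a untouched.
    std::sort(idx, idx + 3, [&a](uint16_t l, uint16_t r) { return a[l] < a[r]; });
    for (const uint16_t i : idx)
        printf("%d ", i); // 1 2 0, since a[1] <= a[2] <= a[0]
    printf("\n");
    return 0;
}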
/**
* @brief Apply an indirect sort to a given array within a given bound.
*
* @tparam T Inner type of the array
* @param a Array to sort
* @param low Lower bound to sort
* @param high Upper bound to sort
* @return Array of indices that sort the array
*/
template<typename T>
static np::Array<uint16_t> argsort_bounded(const np::Array<T>& a, const size_t& low, const size_t& high) noexcept {
np::Array<uint16_t> indices = np::empty(a.shape);
map(indices, [](const size_t& i, const uint16_t&) -> uint16_t { return i; });
argsort_bounded(a, indices, low, high);
return indices;
}
/**
* @brief Perform an indirect sort on each column of a given 2D array (CPU version).
*
* @param a 2D Array to sort
* @return 2D Array of indices that sort the array
*/
np::Array<uint16_t> argsort_2d(const np::Array<int32_t>& X_feat) noexcept {
const np::Array<uint16_t> indices = np::empty<uint16_t>(X_feat.shape);
const size_t length = np::prod(X_feat.shape);
for (size_t i = 0; i < length; i += X_feat.shape[1]) {
@ -91,3 +168,4 @@ np::Array<uint16_t> argsort_2d_cpu(const np::Array<int32_t>& X_feat) noexcept {
return indices;
}
#endif // GPU_BOOSTED == false


@ -1,8 +0,0 @@
#pragma once
#include "data.hpp"
np::Array<uint32_t> set_integral_image_cpu(const np::Array<uint8_t>&) noexcept;
np::Array<int32_t> apply_features_cpu(const np::Array<uint8_t>&, const np::Array<uint32_t>&) noexcept;
np::Array<float64_t> train_weak_clf_cpu(const np::Array<int32_t>&, const np::Array<uint16_t>&, const np::Array<uint8_t>&,
const np::Array<float64_t>&) noexcept;
np::Array<uint16_t> argsort_2d_cpu(const np::Array<int32_t>&) noexcept;


@ -1,5 +1,14 @@
#include "data.hpp"
#include "config.hpp"
#if GPU_BOOSTED
/**
* @brief Prefix Sum (scan) of a given dataset.
*
* @param X Dataset of images to apply sum
* @return Scanned dataset of images
*/
static np::Array<uint32_t> __scanCPU_3d__(const np::Array<uint32_t>& X) noexcept {
np::Array<uint32_t> X_scan = np::empty<uint32_t>(X.shape);
const size_t total = np::prod(X_scan.shape);
@ -16,6 +25,14 @@ static np::Array<uint32_t> __scanCPU_3d__(const np::Array<uint32_t>& X) noexcept
return X_scan;
}
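__scanCPU_3d__ applies this per image; in one dimension, the prefix sum (shown inclusive here) that the kernels below parallelize is simply:

#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    const std::vector<uint32_t> x = {3, 1, 7, 0, 4};
    std::vector<uint32_t> scan(x.size());
    // Each output element is the sum of all inputs up to and including that index.
    std::inclusive_scan(x.begin(), x.end(), scan.begin());
    for (const uint32_t v : scan)
        printf("%u ", v); // 3 4 11 11 15
    printf("\n");
    return 0;
}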
/**
* @brief GPU kernel used to do a parallel prefix sum (scan).
*
* @param n Number of width blocks
* @param j Temporary sum index
* @param d_inter Temporary sums on device to add
* @param d_X Dataset of images on device to apply sum
*/
static __global__ void __kernel_scan_3d__(const uint16_t n, const uint16_t j, np::Array<uint32_t> d_inter, np::Array<uint32_t> d_X) {
const size_t x_coor = blockIdx.x * blockDim.x + threadIdx.x;
const size_t y_coor = blockIdx.y * blockDim.y + threadIdx.y;
@ -60,6 +77,14 @@ static __global__ void __kernel_scan_3d__(const uint16_t n, const uint16_t j, np
d_X[blockIdx.z * d_X.shape[1] * d_X.shape[2] + y_coor * d_X.shape[2] + x_coor] = sA[threadIdx.x * NB_THREADS_2D_Y + threadIdx.y];
}
/**
* @brief GPU kernel for parallel sum.
*
* @param d_X Dataset of images on device
* @param d_s Temporary sums to add on device
* @param n Number of width blocks
* @param m Height of a block
*/
static __global__ void __add_3d__(np::Array<uint32_t> d_X, const np::Array<uint32_t> d_s, const uint16_t n, const uint16_t m) {
const size_t x_coor = blockIdx.x * blockDim.x + threadIdx.x;
const size_t y_coor = blockIdx.y * blockDim.y + threadIdx.y;
@ -67,6 +92,14 @@ static __global__ void __add_3d__(np::Array<uint32_t> d_X, const np::Array<uint3
d_X[blockIdx.z * d_X.shape[1] * d_X.shape[2] + y_coor * d_X.shape[2] + x_coor] += d_s[blockIdx.z * d_X.shape[1] * d_X.shape[2] + y_coor * d_X.shape[2] + blockIdx.x];
}
/**
* @brief Parallel Prefix Sum (scan) of a given dataset.
*
* Read more: https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
*
* @param X Dataset of images
* @return Scanned dataset of images
*/
static np::Array<uint32_t> __scanGPU_3d__(const np::Array<uint32_t>& X) noexcept {
np::Array<uint32_t> X_scan = np::empty<uint32_t>(X.shape);
@ -112,6 +145,12 @@ static np::Array<uint32_t> __scanGPU_3d__(const np::Array<uint32_t>& X) noexcept
return X_scan;
}
/**
* @brief GPU kernel of the function __transpose_3d__.
*
* @param d_X Dataset of images on device
* @param d_Xt Transposed dataset of images on device
*/
static __global__ void __transpose_kernel__(const np::Array<uint32_t> d_X, np::Array<uint32_t> d_Xt) {
__shared__ uint32_t temp[NB_THREADS_2D_X * NB_THREADS_2D_Y];
@ -128,6 +167,12 @@ static __global__ void __transpose_kernel__(const np::Array<uint32_t> d_X, np::A
d_Xt[blockIdx.z * d_Xt.shape[1] * d_Xt.shape[2] + x * d_X.shape[2] + y] = temp[threadIdx.x * NB_THREADS_2D_Y + threadIdx.y];
}
/**
* @brief Transpose every image in the given dataset.
*
* @param X Dataset of images
* @return Transposed dataset of images
*/
static np::Array<uint32_t> __transpose_3d__(const np::Array<uint32_t>& X) noexcept {
np::Array<uint32_t> Xt = np::empty<uint32_t>({ X.shape[0], X.shape[2], X.shape[1] });
@ -147,7 +192,13 @@ static np::Array<uint32_t> __transpose_3d__(const np::Array<uint32_t>& X) noexce
return Xt;
}
np::Array<uint32_t> set_integral_image_gpu(const np::Array<uint8_t>& X) noexcept {
/**
* @brief Transform the input images into integrated images (GPU version).
*
* @param X Dataset of images
* @return Dataset of integrated images
*/
np::Array<uint32_t> set_integral_image(const np::Array<uint8_t>& X) noexcept {
np::Array<uint32_t> X_ii = np::astype<uint32_t>(X);
X_ii = __scanCPU_3d__(X_ii);
X_ii = __transpose_3d__(X_ii);
@ -155,53 +206,17 @@ np::Array<uint32_t> set_integral_image_gpu(const np::Array<uint8_t>& X) noexcept
return __transpose_3d__(X_ii);
}
static inline __device__ int16_t __compute_feature__(const np::Array<uint32_t>& d_X_ii, const size_t& j, const int16_t& x, const int16_t& y, const int16_t& w, const int16_t& h) noexcept {
const size_t _y = y * d_X_ii.shape[1] + x;
const size_t _yh = _y + h * d_X_ii.shape[1];
return d_X_ii[j + _yh + w] + d_X_ii[j + _y] - d_X_ii[j + _yh] - d_X_ii[j + _y + w];
}
static __global__ void __apply_feature_kernel__(int32_t* d_X_feat, const np::Array<uint8_t> d_feats, const np::Array<uint32_t> d_X_ii) {
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
size_t j = blockIdx.y * blockDim.y + threadIdx.y;
if (i >= d_feats.shape[0] || j >= d_X_ii.shape[0])
return;
const size_t k = i * d_X_ii.shape[0] + j;
i *= np::prod(d_feats.shape, 1);
j *= np::prod(d_X_ii.shape, 1);
const int16_t p1 = __compute_feature__(d_X_ii, j, d_feats[i + 0], d_feats[i + 1], d_feats[i + 2], d_feats[i + 3]);
const int16_t p2 = __compute_feature__(d_X_ii, j, d_feats[i + 4], d_feats[i + 5], d_feats[i + 6], d_feats[i + 7]);
const int16_t n1 = __compute_feature__(d_X_ii, j, d_feats[i + 8], d_feats[i + 9], d_feats[i + 10], d_feats[i + 11]);
const int16_t n2 = __compute_feature__(d_X_ii, j, d_feats[i + 12], d_feats[i + 13], d_feats[i + 14], d_feats[i + 15]);
d_X_feat[k] = static_cast<int32_t>(p1 + p2) - static_cast<int32_t>(n1 + n2);
}
np::Array<int32_t> apply_features_gpu(const np::Array<uint8_t>& feats, const np::Array<uint32_t>& X_ii) noexcept {
const np::Array<int32_t> X_feat = np::empty<int32_t>({ feats.shape[0], X_ii.shape[0] });
int32_t* d_X_feat;
_print_cuda_error_("malloc d_X_feat", cudaMalloc(&d_X_feat, np::prod(X_feat.shape) * sizeof(int32_t)));
np::Array<uint32_t> d_X_ii = copyToDevice<uint32_t>("X_ii", X_ii);
np::Array<uint8_t> d_feats = copyToDevice<uint8_t>("feats", feats);
const size_t dimX = static_cast<size_t>(std::ceil(static_cast<float64_t>(feats.shape[0]) / static_cast<float64_t>(NB_THREADS_2D_X)));
const size_t dimY = static_cast<size_t>(std::ceil(static_cast<float64_t>(X_ii.shape[0]) / static_cast<float64_t>(NB_THREADS_2D_Y)));
const dim3 dimGrid(dimX, dimY);
constexpr const dim3 dimBlock(NB_THREADS_2D_X, NB_THREADS_2D_Y);
__apply_feature_kernel__<<<dimGrid, dimBlock>>>(d_X_feat, d_feats, d_X_ii);
_print_cuda_error_("synchronize", cudaDeviceSynchronize());
_print_cuda_error_("memcpy X_feat", cudaMemcpy(X_feat.data, d_X_feat, np::prod(X_feat.shape) * sizeof(int32_t), cudaMemcpyDeviceToHost));
_print_cuda_error_("free d_X_feat", cudaFree(d_X_feat));
cudaFree("free d_feats", d_feats);
cudaFree("free d_X_11", d_X_ii);
return X_feat;
}
/**
* @brief GPU kernel of the function train_weak_clf.
*
* @param d_classifiers Weak classifiers on device to train
* @param d_y Labels of the features on device
* @param d_X_feat Feature images dataset on device
* @param d_X_feat_argsort Sorted indexes of the integrated features on device
* @param d_weights Weights of the features on device
* @param total_pos Total of positive labels in the dataset
* @param total_neg Total of negative labels in the dataset
*/
static __global__ void __train_weak_clf_kernel__(np::Array<float64_t> d_classifiers, const np::Array<uint8_t> d_y,
const np::Array<int32_t> d_X_feat, const np::Array<uint16_t> d_X_feat_argsort,
const np::Array<float64_t> d_weights, const float64_t total_pos, const float64_t total_neg) {
@ -210,7 +225,7 @@ static __global__ void __train_weak_clf_kernel__(np::Array<float64_t> d_classifi
i += threadIdx.x * blockDim.y * blockDim.z;
i += threadIdx.y * blockDim.z;
i += threadIdx.z;
// const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if(i >= d_classifiers.shape[0])
return;
@ -235,7 +250,16 @@ static __global__ void __train_weak_clf_kernel__(np::Array<float64_t> d_classifi
d_classifiers[i * 2] = best_threshold; d_classifiers[i * 2 + 1] = best_polarity;
}
np::Array<float64_t> train_weak_clf_gpu(const np::Array<int32_t>& X_feat, const np::Array<uint16_t>& X_feat_argsort, const np::Array<uint8_t>& y,
/**
* @brief Train the weak classifiers on a given dataset (GPU version).
*
* @param X_feat Feature images dataset
* @param X_feat_argsort Sorted indexes of the integrated features
* @param y Labels of the features
* @param weights Weights of the features
* @return Trained weak classifiers
*/
np::Array<float64_t> train_weak_clf(const np::Array<int32_t>& X_feat, const np::Array<uint16_t>& X_feat_argsort, const np::Array<uint8_t>& y,
const np::Array<float64_t>& weights) noexcept {
float64_t total_pos = 0.0, total_neg = 0.0;
for(size_t i = 0; i < y.shape[0]; ++i)
@ -251,8 +275,6 @@ np::Array<float64_t> train_weak_clf_gpu(const np::Array<int32_t>& X_feat, const
const size_t n_blocks = static_cast<size_t>(std::ceil(static_cast<float64_t>(X_feat.shape[0]) / static_cast<float64_t>(NB_THREADS_3D_X * NB_THREADS_3D_Y * NB_THREADS_3D_Z)));
constexpr const dim3 dimBlock(NB_THREADS_3D_X, NB_THREADS_3D_Y, NB_THREADS_3D_Z);
// const size_t n_blocks = static_cast<size_t>(std::ceil(static_cast<float64_t>(X_feat.shape[0]) / static_cast<float64_t>(NB_THREADS)));
// constexpr const dim3 dimBlock(NB_THREADS);
__train_weak_clf_kernel__<<<n_blocks, dimBlock>>>(d_classifiers, d_y, d_X_feat, d_X_feat_argsort, d_weights, total_pos, total_neg);
_print_cuda_error_("synchronize", cudaDeviceSynchronize());
@ -267,28 +289,118 @@ np::Array<float64_t> train_weak_clf_gpu(const np::Array<int32_t>& X_feat, const
return classifiers;
}
/**
* @brief Compute a feature on an integrated image at a specific coordinate (GPU version).
*
* @param d_X_ii Dataset of integrated images on device
* @param j Image index in the dataset
* @param x X coordinate
* @param y Y coordinate
* @param w Width of the feature
* @param h Height of the feature
* @return The computed feature value
*/
static inline __device__ int16_t __compute_feature__(const np::Array<uint32_t>& d_X_ii, const size_t& j, const int16_t& x, const int16_t& y, const int16_t& w, const int16_t& h) noexcept {
const size_t _y = y * d_X_ii.shape[1] + x;
const size_t _yh = _y + h * d_X_ii.shape[1];
return d_X_ii[j + _yh + w] + d_X_ii[j + _y] - d_X_ii[j + _yh] - d_X_ii[j + _y + w];
}
/**
* @brief GPU kernel of the function apply_features.
*
* @param d_X_feat Dataset of image features on device
* @param d_feats Features on device to apply
* @param d_X_ii Integrated image dataset on device
*/
static __global__ void __apply_feature_kernel__(int32_t* d_X_feat, const np::Array<uint8_t> d_feats, const np::Array<uint32_t> d_X_ii) {
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
size_t j = blockIdx.y * blockDim.y + threadIdx.y;
if (i >= d_feats.shape[0] || j >= d_X_ii.shape[0])
return;
const size_t k = i * d_X_ii.shape[0] + j;
i *= np::prod(d_feats.shape, 1);
j *= np::prod(d_X_ii.shape, 1);
const int16_t p1 = __compute_feature__(d_X_ii, j, d_feats[i + 0], d_feats[i + 1], d_feats[i + 2], d_feats[i + 3]);
const int16_t p2 = __compute_feature__(d_X_ii, j, d_feats[i + 4], d_feats[i + 5], d_feats[i + 6], d_feats[i + 7]);
const int16_t n1 = __compute_feature__(d_X_ii, j, d_feats[i + 8], d_feats[i + 9], d_feats[i + 10], d_feats[i + 11]);
const int16_t n2 = __compute_feature__(d_X_ii, j, d_feats[i + 12], d_feats[i + 13], d_feats[i + 14], d_feats[i + 15]);
d_X_feat[k] = static_cast<int32_t>(p1 + p2) - static_cast<int32_t>(n1 + n2);
}
/**
* @brief Apply the features on an integrated image dataset (GPU version).
*
* @param feats Features to apply
* @param X_ii Integrated image dataset
* @return Applied features
*/
np::Array<int32_t> apply_features(const np::Array<uint8_t>& feats, const np::Array<uint32_t>& X_ii) noexcept {
const np::Array<int32_t> X_feat = np::empty<int32_t>({ feats.shape[0], X_ii.shape[0] });
int32_t* d_X_feat = nullptr;
_print_cuda_error_("malloc d_X_feat", cudaMalloc(&d_X_feat, np::prod(X_feat.shape) * sizeof(int32_t)));
np::Array<uint32_t> d_X_ii = copyToDevice<uint32_t>("X_ii", X_ii);
np::Array<uint8_t> d_feats = copyToDevice<uint8_t>("feats", feats);
const size_t dimX = static_cast<size_t>(std::ceil(static_cast<float64_t>(feats.shape[0]) / static_cast<float64_t>(NB_THREADS_2D_X)));
const size_t dimY = static_cast<size_t>(std::ceil(static_cast<float64_t>(X_ii.shape[0]) / static_cast<float64_t>(NB_THREADS_2D_Y)));
const dim3 dimGrid(dimX, dimY);
constexpr const dim3 dimBlock(NB_THREADS_2D_X, NB_THREADS_2D_Y);
__apply_feature_kernel__<<<dimGrid, dimBlock>>>(d_X_feat, d_feats, d_X_ii);
_print_cuda_error_("synchronize", cudaDeviceSynchronize());
_print_cuda_error_("memcpy X_feat", cudaMemcpy(X_feat.data, d_X_feat, np::prod(X_feat.shape) * sizeof(int32_t), cudaMemcpyDeviceToHost));
_print_cuda_error_("free d_X_feat", cudaFree(d_X_feat));
cudaFree("free d_feats", d_feats);
cudaFree("free d_X_11", d_X_ii);
return X_feat;
}
/**
* @brief Partition of the argsort algorithm.
*
* @tparam T Inner type of the array
* @param d_a Array on device to sort
* @param d_indices Array of indices on device to write to
* @param low Lower bound to sort
* @param high Upper bound to sort
* @return Last index sorted
*/
template<typename T>
-__device__ inline static int32_t as_partition_gpu(const T* a, uint16_t* indices, const size_t l, const size_t h) noexcept {
-int32_t i = l - 1;
-for (int32_t j = l; j <= h; ++j)
-if (a[indices[j]] < a[indices[h]])
-swap(&indices[++i], &indices[j]);
-swap(&indices[++i], &indices[h]);
+__device__ inline static int32_t _as_partition_(const T* d_a, uint16_t* const d_indices, const size_t low, const size_t high) noexcept {
+int32_t i = low - 1;
+for (int32_t j = low; j <= high; ++j)
+if (d_a[d_indices[j]] < d_a[d_indices[high]])
+swap(&d_indices[++i], &d_indices[j]);
+swap(&d_indices[++i], &d_indices[high]);
return i;
}
/**
* @brief CUDA device function that performs an indirect sort of a given array within given bounds.
*
* @tparam T Inner type of the array
* @param d_a Array on device to sort
* @param d_indices Array of indices on device to write to
* @param low Lower bound of the range to sort
* @param high Upper bound of the range to sort
*/
template<typename T>
-__device__ void argsort_gpu(const T* a, uint16_t* indices, const size_t l, const size_t h) noexcept {
-const size_t total = h - l + 1;
+__device__ void argsort_kernel(const T* d_a, uint16_t* const d_indices, size_t low, size_t high) noexcept {
+const size_t total = high - low + 1;
-//int32_t* stack = new int32_t[total]{l, h};
+//int32_t* stack = new int32_t[total]{low, high};
//int32_t stack[total];
int32_t stack[6977];
//int32_t stack[1<<16];
-stack[0] = l;
-stack[1] = h;
+stack[0] = low;
+stack[1] = high;
-size_t top = 1, low = l, high = h;
+size_t top = 1;
while (top <= total) {
high = stack[top--];
@ -296,7 +408,7 @@ __device__ void argsort_gpu(const T* a, uint16_t* indices, const size_t l, const
if(low >= high)
break;
-const int32_t p = as_partition_gpu(a, indices, low, high);
+const int32_t p = _as_partition_(d_a, d_indices, low, high);
if (p - 1 > low && p - 1 < total) {
stack[++top] = low;
@ -311,42 +423,49 @@ __device__ void argsort_gpu(const T* a, uint16_t* indices, const size_t l, const
//delete[] stack;
}
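The fixed-size `int32_t stack[6977]` exists because heap allocation inside device code is undesirable (the commented-out `new` and VLA variants document the earlier attempts); the constant is presumably sized for this project's worst-case column length. A host-side miniature of the same iterative, stack-based argsort, runnable on its own to check the logic (the loop condition is simplified here):

#include <cstdint>
#include <cstdio>
#include <utility>

int main() {
	const int32_t a[] = { 30, 10, 20 };
	uint16_t indices[] = { 0, 1, 2 };
	int32_t stack[8] = { 0, 2 }; // bounds of the full range
	int32_t top = 1;
	while (top > 0) {
		const int32_t high = stack[top--], low = stack[top--];
		if (low >= high) continue;
		int32_t i = low - 1; // partition around the pivot a[indices[high]]
		for (int32_t j = low; j <= high; ++j)
			if (a[indices[j]] < a[indices[high]])
				std::swap(indices[++i], indices[j]);
		std::swap(indices[++i], indices[high]);
		if (i - 1 > low)  { stack[++top] = low;   stack[++top] = i - 1; }
		if (i + 1 < high) { stack[++top] = i + 1; stack[++top] = high;  }
	}
	for (const uint16_t idx : indices)
		printf("%hu ", idx); // prints "1 2 0": a[1] <= a[2] <= a[0]
	return 0;
}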
/**
* @brief Cuda kernel where argsort is applied to every column of a given 2D array.
*
* @tparam T Inner type of the array
* @param d_a 2D Array on device to sort
* @param d_indices 2D Array of indices on device to write to
*/
template<typename T>
-__global__ void argsort_bounded_gpu(const np::Array<T> a, uint16_t* indices){
+__global__ void argsort_bounded(const np::Array<T> d_a, uint16_t* const d_indices){
const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-if (idx >= a.shape[0])
+if (idx >= d_a.shape[0])
return;
-for(size_t y = 0; y < a.shape[1]; ++y) indices[idx * a.shape[1] + y] = y;
-argsort_gpu(&a[idx * a.shape[1]], &indices[idx * a.shape[1]], 0, a.shape[1] - 1);
+for(size_t y = 0; y < d_a.shape[1]; ++y) d_indices[idx * d_a.shape[1] + y] = y;
+argsort_kernel(&d_a[idx * d_a.shape[1]], &d_indices[idx * d_a.shape[1]], 0, d_a.shape[1] - 1);
}
/**
* @brief Perform an indirect sort on each column of a given 2D array (GPU version).
*
* @param a 2D Array to sort
* @return 2D Array of indices that sort the array
*/
-np::Array<uint16_t> argsort_2d_gpu(const np::Array<int32_t>& X_feat) noexcept {
-const np::Array<uint16_t> indices = np::empty<uint16_t>(X_feat.shape);
+np::Array<uint16_t> argsort_2d(const np::Array<int32_t>& a) noexcept {
+const np::Array<uint16_t> indices = np::empty<uint16_t>(a.shape);
-uint16_t* d_indices;
+uint16_t* d_indices = nullptr;
const size_t indices_size = np::prod(indices.shape) * sizeof(uint16_t);
-np::Array<int32_t> d_X_feat = copyToDevice<int32_t>("X_feat", X_feat);
+np::Array<int32_t> d_a = copyToDevice<int32_t>("X_feat", a);
_print_cuda_error_("malloc d_indices", cudaMalloc(&d_indices, indices_size));
-const size_t dimGrid = static_cast<size_t>(std::ceil(static_cast<float64_t>(X_feat.shape[0]) / static_cast<float64_t>(NB_THREADS)));
+const size_t dimGrid = static_cast<size_t>(std::ceil(static_cast<float64_t>(a.shape[0]) / static_cast<float64_t>(NB_THREADS)));
const dim3 dimBlock(NB_THREADS);
-argsort_bounded_gpu<<<dimGrid, dimBlock>>>(d_X_feat, d_indices);
+argsort_bounded<<<dimGrid, dimBlock>>>(d_a, d_indices);
_print_cuda_error_("synchronize", cudaDeviceSynchronize());
_print_cuda_error_("memcpy d_indices", cudaMemcpy(indices.data, d_indices, indices_size, cudaMemcpyDeviceToHost));
-cudaFree("free d_X_feat", d_X_feat);
+cudaFree("free d_a", d_a);
_print_cuda_error_("free d_indices", cudaFree(d_indices));
return indices;
}
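A usage sketch (hypothetical shapes and values) showing what the returned indices mean:

// For an n x m array `a`, argsort_2d returns `idx` such that, within every
// row i, a[i * m + idx[i * m + 0]] <= a[i * m + idx[i * m + 1]] <= ...
// e.g. the row { 7, 3, 9, 1 } yields the index row { 3, 1, 0, 2 }.
const np::Array<uint16_t> idx = argsort_2d(X_train_feat);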
__host__ __device__
size_t np::prod(const np::Shape& shape, const size_t& offset) noexcept {
size_t result = shape[offset];
for(size_t i = 1 + offset; i < shape.length; ++i)
result *= shape[i];
return result;
}
#endif // GPU_BOOSTED

@ -1,8 +0,0 @@
#pragma once
#include "data.hpp"
np::Array<uint32_t> set_integral_image_gpu(const np::Array<uint8_t>&) noexcept;
np::Array<int32_t> apply_features_gpu(const np::Array<uint8_t>&, const np::Array<uint32_t>&) noexcept;
np::Array<float64_t> train_weak_clf_gpu(const np::Array<int32_t>& X_feat, const np::Array<uint16_t>& X_feat_argsort, const np::Array<uint8_t>& y,
const np::Array<float64_t>& weights) noexcept;
np::Array<uint16_t> argsort_2d_gpu(const np::Array<int32_t>& X_feat) noexcept;

cpp/ViolaJones_device.hpp (new file, +39 lines)
@ -0,0 +1,39 @@
#pragma once
#include "data.hpp"
/**
* @brief Transform the input images into integrated images.
*
* @param X Dataset of images
* @return Dataset of integrated images
*/
np::Array<uint32_t> set_integral_image(const np::Array<uint8_t>&) noexcept;
/**
* @brief Apply the features on an integrated image dataset.
*
* @param feats Features to apply
* @param X_ii Integrated image dataset
* @return Applied features
*/
np::Array<int32_t> apply_features(const np::Array<uint8_t>&, const np::Array<uint32_t>&) noexcept;
/**
* @brief Train the weak classifiers on a given dataset.
*
* @param X_feat Feature images dataset
* @param X_feat_argsort Sorted indexes of the integrated features
* @param y Labels of the features
* @param weights Weights of the features
* @return Trained weak classifiers
*/
np::Array<float64_t> train_weak_clf(const np::Array<int32_t>&, const np::Array<uint16_t>&, const np::Array<uint8_t>&,
const np::Array<float64_t>&) noexcept;
/**
* @brief Perform an indirect sort on each column of a given 2D array
*
* @param a 2D Array to sort
* @return 2D Array of indices that sort the array
*/
np::Array<uint16_t> argsort_2d(const np::Array<int32_t>&) noexcept;
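The practical effect of this shared header: it replaces the macro aliasing that projet.cpp previously needed (`#define apply_features apply_features_gpu`, etc., removed further down), so call sites use one set of names and presumably get the CPU or the GPU implementation depending on which translation unit (ViolaJonesCPU.cpp or ViolaJonesGPU.cu) the build links in. A hypothetical call site:

#include "ViolaJones_device.hpp"

// Resolves to the CPU or the GPU implementation at link time.
const np::Array<uint32_t> X_ii = set_integral_image(X_train);
const np::Array<int32_t> X_feat = apply_features(feats, X_ii);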

@ -1,4 +1,5 @@
#pragma once
#include <array>
#define DATA_DIR "../data"
#define OUT_DIR "./out"
@ -11,21 +12,30 @@
#define NB_THREADS_3D_X 16
#define NB_THREADS_3D_Y 16
#define NB_THREADS_3D_Z 4
-__device__ constexpr const size_t M = 5; //log2(NB_THREADS_2D_Y));
+#define M static_cast<size_t>(log2f(NB_THREADS_2D_Y))
#endif
-// Save state to avoid recalulation on restart
+// Save state to avoid recalculation on restart
#define SAVE_STATE true
// Redo the state even if it's already saved
#define FORCE_REDO false
// Use GPU to greatly accelerate runtime
#define GPU_BOOSTED true
// Depending on what you set, the output label will be as follows:
// ┌─────────────┬───────┐
// │ GPU_BOOSTED │ LABEL │
// ├─────────────┼───────┤
// │ true │ GPU │
// │ false │ CPU │
// └─────────────┴───────┘
// Number of weak classifiers
-// const size_t TS[] = { 1 };
-// const size_t TS[] = { 1, 5, 10 };
-const size_t TS[] = { 1, 5, 10, 25, 50 };
-// const size_t TS[] = { 1, 5, 10, 25, 50, 100, 200, 300 };
-// const size_t TS[] = { 1, 5, 10, 25, 50, 100, 200, 300, 400, 500, 1000 };
+// [[maybe_unused]] constexpr const std::array TS{ 1 };
+// [[maybe_unused]] constexpr const std::array TS{ 1, 5, 10 };
+[[maybe_unused]] constexpr const std::array TS{ 1, 5, 10, 25, 50 };
+// [[maybe_unused]] constexpr const std::array TS{ 1, 5, 10, 25, 50, 100, 200, 300 };
+// [[maybe_unused]] constexpr const std::array TS{ 1, 5, 10, 25, 50, 100, 200, 300, 400, 500, 1000 };
// Enable verbose output (for debugging purposes)
#define __DEBUG false

@ -1,9 +1,7 @@
#include "data.hpp"
//#include "toolbox.hpp"
//#include <cstring>
int print(const np::Shape& shape) noexcept {
int num_written = 0;
int32_t print(const np::Shape& shape) noexcept {
int32_t num_written = 0;
num_written += printf("(");
if (shape.length > 1) {
const size_t length = shape.length - 1;
@ -17,20 +15,12 @@ int print(const np::Shape& shape) noexcept {
}
template<typename T>
-int print(const np::Array<T>& array, const char* format) noexcept {
-//printf("[");
-//const size_t length = np::prod(array.shape);
-//for(size_t i = 0; i < length - 1; ++i)
-//	//std::cout << array[i] << " ";
-//	printf("%f ", array[i]);
-////std::cout << array[array.shape[0] - 1] << "]\n";
-//printf("%f]\n", array[length - 1]);
+int32_t print(const np::Array<T>& array, const char* const format) noexcept {
char format_space[BUFFER_SIZE] = { 0 };
-sprintf(format_space, "%s ", format);
+snprintf(format_space, BUFFER_SIZE,"%s ", format);
char format_close[BUFFER_SIZE] = { 0 };
-sprintf(format_close, "%s]\n", format);
-int num_written = 0;
+snprintf(format_close, BUFFER_SIZE,"%s]\n", format);
+int32_t num_written = 0;
if (array.shape.length == 1) {
const size_t max = array.shape[0] - 1;
@ -53,16 +43,16 @@ int print(const np::Array<T>& array, const char* format) noexcept {
return num_written;
}
-int print(const np::Array<uint8_t>& array) noexcept {
+int32_t print(const np::Array<uint8_t>& array) noexcept {
return print(array, "%hu");
}
-int print(const np::Array<float64_t>& array) noexcept {
+int32_t print(const np::Array<float64_t>& array) noexcept {
return print(array, "%f");
}
-int print_feat(const np::Array<uint8_t>& array, const np::Slice& slice) noexcept {
-int num_written = 0;
+int32_t print_feat(const np::Array<uint8_t>& array, const np::Slice& slice) noexcept {
+int32_t num_written = 0;
num_written += printf("[");
const size_t feat_size = np::prod(array.shape, 1);
const size_t offset = slice.x * feat_size;
@ -74,10 +64,10 @@ int print_feat(const np::Array<uint8_t>& array, const np::Slice& slice) noexcept
return num_written;
}
-int print(const np::Array<uint8_t>& array, const np::Slice& slice) noexcept {
-int num_written = 0;
+int32_t print(const np::Array<uint8_t>& array, const np::Slice& slice) noexcept {
+int32_t num_written = 0;
if (array.shape.length == 1) {
-const size_t max = slice.y - 1; //std::min(slice.y, array.shape[0] - 1);
+const size_t max = slice.y - 1;
num_written += printf("[");
for (size_t i = slice.x; i < max; ++i)
num_written += printf("%hu ", array[i]);
@ -97,10 +87,10 @@ int print(const np::Array<uint8_t>& array, const np::Slice& slice) noexcept {
return num_written;
}
-int print(const np::Array<uint32_t>& array, const np::Slice& slice) noexcept {
-int num_written = 0;
+int32_t print(const np::Array<uint32_t>& array, const np::Slice& slice) noexcept {
+int32_t num_written = 0;
if (array.shape.length == 1) {
-const size_t max = slice.y - 1; //std::min(slice.y, array.shape[0] - 1);
+const size_t max = slice.y - 1;
num_written += printf("[");
for (size_t i = slice.x; i < max; ++i)
num_written += printf("%iu ", array[i]);
@ -115,37 +105,35 @@ int print(const np::Array<uint32_t>& array, const np::Slice& slice) noexcept {
num_written += printf("%5i ", array[k + i * array.shape[1] + j]);
num_written += printf("]\n");
}
num_written += print("]");
num_written += printf("]");
}
return num_written;
}
-int print(const np::Array<int32_t>& array, const np::Slice& slice) noexcept {
-int num_written = 0;
+int32_t print(const np::Array<int32_t>& array, const np::Slice& slice) noexcept {
+int32_t num_written = 0;
num_written += printf("[");
-//size_t k = slice.x * array.shape[1] * array.shape[2] + slice.y * array.shape[2] + slice.z;
size_t k = slice.x * array.shape[1];
for (size_t i = k; i < k + (slice.y - slice.x); ++i) {
num_written += printf("%5i ", array[i]);
}
-num_written += print("]");
+num_written += printf("]");
return num_written;
}
-int print(const np::Array<uint16_t>& array, const np::Slice& slice) noexcept {
-int num_written = 0;
+int32_t print(const np::Array<uint16_t>& array, const np::Slice& slice) noexcept {
+int32_t num_written = 0;
num_written += printf("[");
-//size_t k = slice.x * array.shape[1] * array.shape[2] + slice.y * array.shape[2] + slice.z;
size_t k = slice.x * array.shape[1];
for (size_t i = k; i < k + (slice.y - slice.x); ++i) {
num_written += printf("%5hu ", array[i]);
}
-num_written += print("]");
+num_written += printf("]");
return num_written;
}
-static inline np::Array<uint8_t> load_set(const char* set_name) {
-FILE* file = fopen(set_name, "rb");
+static inline np::Array<uint8_t> load_set(const char* const set_name) {
+FILE* const file = fopen(set_name, "rb");
if (file == NULL) {
print_error_file(set_name);
throw;
@ -156,7 +144,7 @@ static inline np::Array<uint8_t> load_set(const char* set_name) {
fclose(file);
throw;
}
-size_t* dims = new size_t[3]();
+size_t* const dims = new size_t[3]();
if (!sscanf(meta, "%lu %lu %lu", &dims[0], &dims[1], &dims[2])) {
print_error_file(set_name);
fclose(file);
@ -167,13 +155,12 @@ static inline np::Array<uint8_t> load_set(const char* set_name) {
const size_t size = np::prod(a.shape);
size_t i = 0, j = 0;
-int c;
+int32_t c;
char buff[STRING_INT_SIZE] = { 0 };
while ((c = fgetc(file)) != EOF && i < size) {
if (c == ' ' || c == '\n') {
buff[j] = '\0';
a[i++] = static_cast<uint8_t>(atoi(buff));
//memset(buff, 0, STRING_INT_SIZE);
j = 0;
}
else
@ -191,22 +178,20 @@ static inline np::Array<uint8_t> load_set(const char* set_name) {
return a;
}
/**
* @brief Load the datasets.
*
* @return Array containing X_train, y_train, X_test, y_test
*/
-std::array<np::Array<uint8_t>, 4> load_datasets() {
+std::array<np::Array<uint8_t>, 4> load_datasets(void) {
return {
load_set(DATA_DIR "/X_train.bin"), load_set(DATA_DIR "/y_train.bin"),
load_set(DATA_DIR "/X_test.bin"), load_set(DATA_DIR "/y_test.bin")
};
}
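A hedged usage sketch (projet.cpp actually calls this through state_saver, which caches the arrays between runs):

const auto [ X_train, y_train, X_test, y_test ] = load_datasets();
// X_train is presumably shaped (n_images, height, width), as produced by
// the downloader module, and y_train holds one label per image.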
-void print_error_file(const char* file_dir) noexcept {
-const char* buff = strerror(errno);
+void print_error_file(const char* const file_dir) noexcept {
+const char* const buff = strerror(errno);
fprintf(stderr, "Can't open %s, error code = %d : %s\n", file_dir, errno, buff);
-// delete buff;
}
//size_t np::prod(const np::Shape& shape, const size_t& offset) noexcept {
// size_t result = shape[offset];
// for(size_t i = 1 + offset; i < shape.length; ++i)
// result *= shape[i];
// return result;
//}

@ -4,12 +4,11 @@
#include <cmath>
#include <cassert>
#include <functional>
#include <memory>
#include <stdint.h>
#include "config.hpp"
#define BUFFER_SIZE 256
#define STRING_INT_SIZE 8 // Length of a number in log10 (including '-')
#define S(N) std::string(N, '-').c_str()
#ifndef __CUDACC__
#define __host__
@ -20,15 +19,6 @@ typedef float float32_t;
typedef double float64_t;
typedef long double float128_t;
__host__ __device__
constexpr inline int print(const char* str) noexcept {
return printf("%s\n", str);
}
inline int print(const std::string& s) noexcept {
return printf("%s\n", s.c_str());
}
namespace np {
constexpr const float64_t inf = std::numeric_limits<float64_t>::infinity();
@ -45,16 +35,16 @@ namespace np {
#endif
__host__ __device__
-Shape() noexcept {
-// #if __DEBUG
-//	print("Shape created (default)");
-// #endif
+Shape(void) noexcept {
+#if __DEBUG
+printf("Shape created (default)\n");
+#endif
}
__host__ __device__
-Shape(const size_t& length, size_t* data) noexcept : length(length), data(data), refcount(new size_t(1)) {
+Shape(const size_t& length, size_t* const data) noexcept : length(length), data(data), refcount(new size_t(1)) {
#if __DEBUG
-//print("Shape created (raw)");
+printf("Shape created (raw)\n");
for(size_t i = 0; i < length; ++i)
total *= data[i];
#endif
@ -62,10 +52,10 @@ namespace np {
__host__ __device__
Shape(const std::initializer_list<size_t>& dims) noexcept : length(dims.size()), data(new size_t[dims.size()]), refcount(new size_t(1)) {
-// #if __DEBUG
-//	print("Shape created (initializer)");
-// #endif
-const size_t* begin = dims.begin();
+#if __DEBUG
+printf("Shape created (initializer)\n");
+#endif
+const size_t* const begin = dims.begin();
for(size_t i = 0; i < length; ++i){
data[i] = begin[i];
#if __DEBUG
@ -77,52 +67,49 @@ namespace np {
__host__ __device__
Shape(const Shape& shape) noexcept {
#if __DEBUG
print("Shape created (copy)");
printf("Shape created (copy)\n");
#endif
if (data != nullptr && data != shape.data){
#if __DEBUG
print("Former shape deleted (copy)");
printf("Former shape deleted (copy)\n");
#endif
delete[] data;
}
if (refcount != nullptr && refcount != shape.refcount){
#if __DEBUG
print("Former shape refcount freed (copy)");
printf("Former shape refcount freed (copy)\n");
#endif
delete refcount;
}
length = shape.length;
//data = new size_t[length];
//memcpy(data, shape.data, length * sizeof(size_t));
//refcount = new size_t;
//memcpy(refcount, shape.refcount, sizeof(size_t));
data = shape.data;
refcount = shape.refcount;
if (refcount != nullptr)
(*refcount)++;
#if __DEBUG
else
print("Moved shape has null refcount");
printf("Moved shape has null refcount\n");
#endif
#if __DEBUG
total = shape.total;
#endif
}
__host__ __device__
Shape(Shape&& shape) noexcept {
-// #if __DEBUG
-//	print("Shape created (move));
-// #endif
+#if __DEBUG
+printf("Shape created (move)\n");
+#endif
if (data != nullptr && data != shape.data){
#if __DEBUG
print("Former shape deleted (move)");
printf("Former shape deleted (move)\n");
#endif
delete[] data;
}
if (refcount != nullptr && refcount != shape.refcount){
#if __DEBUG
print("Former shape refcount freed (move)");
printf("Former shape refcount freed (move)\n");
#endif
delete refcount;
}
@ -140,29 +127,29 @@ namespace np {
}
__host__ __device__
-~Shape() noexcept {
+~Shape(void) noexcept {
if(refcount == nullptr){
-// #if __DEBUG
-//	print("Shape refcount freed more than once");
-// #endif
+#if __DEBUG
+printf("Shape refcount freed more than once\n");
+#endif
return;
}
--(*refcount);
-// #if __DEBUG
-//	printf("Shape destructed : %lu\n", *refcount);
-// #endif
+#if __DEBUG
+printf("Shape destructed : %lu\n", *refcount);
+#endif
if(*refcount == 0){
if (data != nullptr){
delete[] data;
data = nullptr;
-// #if __DEBUG
-//	print("Shape freeing ...");
-// #endif
+#if __DEBUG
+printf("Shape freeing ...\n");
+#endif
}
-//#if __DEBUG
+#if __DEBUG
else
printf("Shape freed more than once : %lu\n", *refcount);
-//#endif
+#endif
delete refcount;
refcount = nullptr;
#if __DEBUG
@ -174,34 +161,29 @@ namespace np {
__host__ __device__
Shape& operator=(const Shape& shape) noexcept {
#if __DEBUG
print("Shape created (assign copy)");
printf("Shape created (assign copy)\n");
#endif
if (data != nullptr && data != shape.data){
#if __DEBUG
print("Former shape deleted (assign copy)");
printf("Former shape deleted (assign copy)\n");
#endif
delete[] data;
}
if (refcount != nullptr && refcount != shape.refcount){
#if __DEBUG
print("Former shape refcount freed (assign copy)");
printf("Former shape refcount freed (assign copy)\n");
#endif
delete refcount;
}
length = shape.length;
// data = new size_t[length];
// memcpy(data, shape.data, length * sizeof(size_t));
// refcount = new size_t;
// memcpy(refcount, shape.refcount, sizeof(size_t));
data = shape.data;
refcount = shape.refcount;
if (refcount != nullptr)
(*refcount)++;
#if __DEBUG
else
printf("Assigned copy shape has null refcount");
printf("Assigned copy shape has null refcount\n");
total = shape.total;
#endif
return *this;
@ -209,18 +191,18 @@ namespace np {
__host__ __device__
Shape& operator=(Shape&& shape) noexcept {
-// #if __DEBUG
-//	print("Shape created (assign move)");
-// #endif
+#if __DEBUG
+printf("Shape created (assign move)\n");
+#endif
if (data != nullptr && data != shape.data){
#if __DEBUG
print("Former shape deleted (assign move)");
printf("Former shape deleted (assign move)\n");
#endif
delete[] data;
}
if (refcount != nullptr && refcount != shape.refcount){
#if __DEBUG
print("Former shape refcount freed (assign move)");
printf("Former shape refcount freed (assign move)\n");
#endif
delete refcount;
}
@ -228,9 +210,9 @@ namespace np {
data = shape.data;
refcount = shape.refcount;
#if __DEBUG
-total = shape.total;
if (refcount == nullptr)
-print("Assigned copy shape has null refcount");
+printf("Assigned copy shape has null refcount\n");
+total = shape.total;
shape.total = 1;
#endif
shape.length = 0;
@ -281,62 +263,57 @@ namespace np {
size_t* refcount = nullptr;
__host__ __device__
-Array() noexcept {
-// #if __DEBUG
-//	print("Array created (default)");
-// #endif
+Array(void) noexcept {
+#if __DEBUG
+printf("Array created (default)\n");
+#endif
}
__host__ __device__
-Array(const Shape& shape, T* data) noexcept : shape(shape), data(data), refcount(new size_t(1)) {
-// #if __DEBUG
-//	print("Array created (raw, copy shape)");
-// #endif
+Array(const Shape& shape, T* const data) noexcept : shape(shape), data(data), refcount(new size_t(1)) {
+#if __DEBUG
+printf("Array created (raw, copy shape)\n");
+#endif
}
__host__ __device__
Array(const Shape& shape) noexcept : shape(shape), data(new T[np::prod(shape)]), refcount(new size_t(1)) {
-// #if __DEBUG
-//	print("Array created (raw empty, copy shape)");
-// #endif
+#if __DEBUG
+printf("Array created (raw empty, copy shape)\n");
+#endif
}
__host__ __device__
-Array(Shape&& shape, T* data) noexcept : shape(std::move(shape)), data(data), refcount(new size_t(1)) {
-// #if __DEBUG
-//	print("Array created (raw, move shape)");
-// #endif
+Array(Shape&& shape, T* const data) noexcept : shape(shape), data(data), refcount(new size_t(1)) {
+#if __DEBUG
+printf("Array created (raw, move shape)\n");
+#endif
}
__host__ __device__
-Array(Shape&& shape) noexcept : shape(std::move(shape)), data(new T[np::prod(shape)]), refcount(new size_t(1)) {
-// #if __DEBUG
-//	print("Array created (raw empty, move shape)");
-// #endif
+Array(Shape&& shape) noexcept : shape(shape), data(new T[np::prod(shape)]), refcount(new size_t(1)) {
+#if __DEBUG
+printf("Array created (raw empty, move shape)\n");
+#endif
}
__host__ __device__
Array(const Array& array) noexcept : shape(array.shape) {
#if __DEBUG
print("Array created (copy)");
printf("Array created (copy)\n");
#endif
if (data != nullptr && data != array.data){
-#ifdef __debug
-print("Former array deleted (move)");
+#if __DEBUG
+printf("Former array deleted (copy)\n");
#endif
delete[] data;
}
if (refcount != nullptr && refcount != array.refcount){
#if __DEBUG
print("Former array refcount freed (move)");
printf("Former array refcount freed (copy)\n");
#endif
delete refcount;
}
// const size_t size = np::prod(shape);
// data = new T[size];
// memcpy(data, array.data, size);
// refcount = new size_t;
// memcpy(refcount, array.refcount, sizeof(size_t));
data = array.data;
refcount = array.refcount;
@ -344,28 +321,27 @@ namespace np {
(*refcount)++;
#if __DEBUG
else
print("Moved array has null refcount");
printf("Moved array has null refcount\n");
#endif
}
__host__ __device__
-Array(Array&& array) noexcept {
-// #if __DEBUG
-//	print("Array created (move)");
-// #endif
+Array(Array&& array) noexcept : shape(std::move(array.shape)) {
+#if __DEBUG
+printf("Array created (move)\n");
+#endif
if (data != nullptr && data != array.data){
#if __DEBUG
print("Former array deleted (move)");
printf("Former array deleted (move)\n");
#endif
delete[] data;
}
if (refcount != nullptr && refcount != array.refcount){
#if __DEBUG
print("Former array refcount freed (move)");
printf("Former array refcount freed (move)\n");
#endif
delete refcount;
}
shape = std::move(array.shape);
data = array.data;
refcount = array.refcount;
@ -374,24 +350,24 @@ namespace np {
}
__host__ __device__
-~Array() noexcept {
+~Array(void) noexcept {
if(refcount == nullptr){
-// #if __DEBUG
-//	print("Array refcount freed more than once");
-// #endif
+#if __DEBUG
+printf("Array refcount freed more than once\n");
+#endif
return;
}
--(*refcount);
-// #if __DEBUG
-//	printf("Array destructed : %lu\n", *refcount);
-// #endif
+#if __DEBUG
+printf("Array destructed : %lu\n", *refcount);
+#endif
if(*refcount == 0){
if (data != nullptr){
delete[] data;
data = nullptr;
-// #if __DEBUG
-//	print("Array freeing ...");
-// #endif
+#if __DEBUG
+printf("Array freeing ...\n");
+#endif
}
#if __DEBUG
else
@ -405,53 +381,47 @@ namespace np {
__host__ __device__
Array& operator=(const Array& array) noexcept {
#if __DEBUG
print("Array created (assign copy)");
printf("Array created (assign copy)\n");
#endif
if (data != nullptr && data != array.data){
#if __DEBUG
print("Former array deleted (assign copy)");
printf("Former array deleted (assign copy)\n");
#endif
delete[] data;
}
if (refcount != nullptr && refcount != array.refcount){
#if __DEBUG
print("Former array refcount freed (assign copy)");
printf("Former array refcount freed (assign copy)\n");
#endif
delete refcount;
}
shape = array.shape;
// const size_t size = np::prod(shape) * sizeof(T);
// data = new T[size];
// memcpy(data, array.data, size);
// refcount = new size_t;
// memcpy(refcount, array.refcount, sizeof(size_t));
data = array.data;
refcount = array.refcount;
if (refcount != nullptr)
(*refcount)++;
#if __DEBUG
else
print("Assigned array has null refcount");
printf("Assigned array has null refcount\n");
#endif
return *this;
}
__host__ __device__
Array& operator=(Array&& array) noexcept {
-// #if __DEBUG
-//	print("Array created (assign move)");
-// #endif
+#if __DEBUG
+printf("Array created (assign move)\n");
+#endif
if (data != nullptr && data != array.data){
#if __DEBUG
print("Former array deleted (assign move)");
printf("Former array deleted (assign move)\n");
#endif
delete[] data;
}
if (refcount != nullptr && refcount != array.refcount){
#if __DEBUG
print("Former array refcount freed (assign move)");
printf("Former array refcount freed (assign move)\n");
#endif
delete refcount;
}
@ -486,35 +456,39 @@ namespace np {
};
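To make the ownership rules above concrete, here is a small sketch of the reference-counting behaviour implied by the constructors and destructor (copies share the buffer instead of cloning it):

{
	np::Array<int32_t> a = np::empty<int32_t>({ 4 }); // refcount == 1
	{
		const np::Array<int32_t> b = a; // data pointer shared, refcount == 2
	} // b destroyed: refcount back to 1, buffer kept alive
} // a destroyed: refcount reaches 0, buffer deleted exactly once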
template<typename T>
-Array<T> empty(Shape&& shape) noexcept {
-return { std::move(shape), new T[np::prod(shape)] };
+inline Array<T> empty(Shape&& shape) noexcept {
+return Array<T>(shape);
}
template<typename T>
-Array<T> empty(const Shape& shape) noexcept {
-return { std::move(shape), new T[np::prod(shape)] };
+inline Array<T> empty(const Shape& shape) noexcept {
+return Array<T>(shape);
}
template<typename T>
-Array<T> empty(const std::initializer_list<size_t>& dims) noexcept {
-const Shape shape(dims);
-return { std::move(shape), new T[np::prod(shape)] };
+inline Array<T> empty(const std::initializer_list<size_t>& dims) noexcept {
+return Array<T>(dims);
}
template<typename T>
Array<T> zeros(Shape&& shape) noexcept {
-return { std::move(shape), new T[np::prod(shape)]{0} };
+Array<T> res(shape);
+memset(res.data, 0, sizeof(T) * np::prod(res.shape));
+return res;
}
template<typename T>
Array<T> zeros(const Shape& shape) noexcept {
-return { std::move(shape), new T[np::prod(shape)]{0} };
+Array<T> res(shape);
+memset(res.data, 0, sizeof(T) * np::prod(res.shape));
+return res;
}
template<typename T>
Array<T> zeros(const std::initializer_list<size_t>& dims) noexcept {
-const Shape shape(dims);
-return { std::move(shape), new T[np::prod(shape)]{0} };
+Array<T> res(dims);
+memset(res.data, 0, sizeof(T) * np::prod(res.shape));
+return res;
}
template<typename T>
@ -774,7 +748,7 @@ constexpr np::Array<T>& map(np::Array<T>& a, const std::function<T(const size_t&
template<typename T>
__host__ __device__
-constexpr inline static void swap(T* a, T* b) noexcept {
+constexpr inline static void swap(T* const a, T* const b) noexcept {
if (a == b) return;
const T temp = *a;
*a = *b;
@ -806,7 +780,7 @@ void quicksort(const np::Array<T>& a) noexcept {
}
template<typename T>
-static size_t as_partition(const T* a, uint16_t* indices, const size_t& l, const size_t& h) noexcept {
+static size_t as_partition(const T* const a, uint16_t* const indices, const size_t& l, const size_t& h) noexcept {
size_t i = l - 1;
for (size_t j = l; j <= h; ++j)
if (a[indices[j]] < a[indices[h]])
@ -815,69 +789,27 @@ static size_t as_partition(const T* a, uint16_t* indices, const size_t& l, const
return i;
}
template<typename T>
void argsort(const T* a, uint16_t* indices, const size_t& l, const size_t& h) noexcept {
const size_t total = h - l + 1;
size_t* stack = new size_t[total]{l, h};
size_t top = 1, low = l, high = h;
while (top <= total) {
high = stack[top--];
low = stack[top--];
if(low >= high)
break;
const size_t p = as_partition(a, indices, low, high);
if (p - 1 > low && p - 1 < total) {
stack[++top] = low;
stack[++top] = p - 1;
}
if (p + 1 < high) {
stack[++top] = p + 1;
stack[++top] = high;
}
}
delete[] stack;
}
template<typename T>
np::Array<uint16_t> argsort(const np::Array<T>& other, const size_t& l, const size_t& h) noexcept {
np::Array<uint16_t> indices = np::empty(other.shape);
map(indices, [](const size_t& i, const uint16_t&) -> uint16_t { return i; });
argsort(other, indices, l, h);
return indices;
}
template<typename T>
np::Array<uint16_t> argsort(const np::Array<T>* other, const size_t& length) noexcept {
return argsort(other, 0, length - 1);
}
std::array<np::Array<uint8_t>, 4> load_datasets(void);
-void print_error_file(const char*) noexcept;
+void print_error_file(const char* const) noexcept;
template<typename T>
-void save(const np::Array<T>& d, const char* filename) {
-FILE* output = fopen(filename, "wb");
+void save(const np::Array<T>& d, const char* const filename) {
+FILE* const output = fopen(filename, "wb");
if (output == NULL) {
print_error_file(filename);
throw;
}
-assert(d.shape.refcount != 0);//, "Refcount shape is zero !!");
+assert(d.shape.refcount != 0);
fwrite(&d.shape.length, sizeof(size_t), 1, output);
fwrite(d.shape.data, sizeof(size_t), d.shape.length, output);
-assert(d.refcount != 0);//, "Refcount array is zero !!");
+assert(d.refcount != 0);
fwrite(d.data, sizeof(T), np::prod(d.shape), output);
fclose(output);
}
template<typename T>
-np::Array<T> load(const char* filename) {
-FILE* input = fopen(filename, "rb");
+np::Array<T> load(const char* const filename) {
+FILE* const input = fopen(filename, "rb");
if (input == NULL) {
print_error_file(filename);
throw;
@ -888,7 +820,7 @@ np::Array<T> load(const char* filename) {
fclose(input);
throw;
}
-size_t* data = new size_t[length];
+size_t* const data = new size_t[length];
if(!fread(data, sizeof(size_t), length, input)){
print_error_file(filename);
fclose(input);
@ -906,7 +838,7 @@ np::Array<T> load(const char* filename) {
#ifdef __CUDACC__
template<typename T>
-np::Array<T> copyToDevice(const char* name, const np::Array<T>& array) noexcept {
+np::Array<T> copyToDevice(const char* const name, const np::Array<T>& array) noexcept {
const size_t array_size = np::prod(array.shape) * sizeof(T);
const size_t shape_size = array.shape.length * sizeof(size_t);
np::Array<T> d_array;
@ -926,7 +858,7 @@ np::Array<T> copyToDevice(const char* name, const np::Array<T>& array) noexcept
}
template<typename T>
-constexpr void cudaFree(const char* name, np::Array<T>& array) noexcept {
+constexpr void cudaFree(const char* const name, np::Array<T>& array) noexcept {
//_print_cuda_error_(name, cudaFree(array.refcount));
//array.refcount = nullptr;
_print_cuda_error_(name, cudaFree(array.data));
@ -937,16 +869,16 @@ constexpr void cudaFree(const char* name, np::Array<T>& array) noexcept {
array.shape.data = nullptr;
}
-constexpr inline void _print_cuda_error_(const char* name, const cudaError_t& err) noexcept {
+constexpr inline void _print_cuda_error_(const char* const name, const cudaError_t& err) noexcept {
if (err != cudaSuccess) fprintf(stderr, "Error: %s = %d : %s\n", name, err, cudaGetErrorString(err));
}
#endif
-int print(const np::Shape&) noexcept;
-int print(const np::Array<uint8_t>&) noexcept;
-int print(const np::Array<float64_t>&) noexcept;
-int print(const np::Array<uint8_t>&, const np::Slice&) noexcept;
-int print(const np::Array<uint32_t>&, const np::Slice&) noexcept;
-int print(const np::Array<int32_t>&, const np::Slice&) noexcept;
-int print(const np::Array<uint16_t>&, const np::Slice&) noexcept;
-int print_feat(const np::Array<uint8_t>&, const np::Slice&) noexcept;
+int32_t print(const np::Shape&) noexcept;
+int32_t print(const np::Array<uint8_t>&) noexcept;
+int32_t print(const np::Array<float64_t>&) noexcept;
+int32_t print(const np::Array<uint8_t>&, const np::Slice&) noexcept;
+int32_t print(const np::Array<uint32_t>&, const np::Slice&) noexcept;
+int32_t print(const np::Array<int32_t>&, const np::Slice&) noexcept;
+int32_t print(const np::Array<uint16_t>&, const np::Slice&) noexcept;
+int32_t print_feat(const np::Array<uint8_t>&, const np::Slice&) noexcept;

cpp/data_device.cu (new file, +16 lines)
@ -0,0 +1,16 @@
#include "data.hpp"
/**
* @brief Product of every element in a given shape after a given offset.
*
* @param shape Shape to product over
* @param offset Skip offset
* @return Scalar product
*/
__host__ __device__
size_t np::prod(const np::Shape& shape, const size_t& offset) noexcept {
size_t result = shape[offset];
for(size_t i = 1 + offset; i < shape.length; ++i)
result *= shape[i];
return result;
}
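For example, assuming a dataset shape of (128, 19, 19):

// np::prod(shape)    == 128 * 19 * 19 == 46208  -> total number of elements
// np::prod(shape, 1) ==       19 * 19 ==   361  -> elements of a single image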

cpp/docker-compose.yaml (new file, +15 lines)
@ -0,0 +1,15 @@
services:
violajones-cpp:
image: saundersp/violajones-cpp
pull_policy: never
build: .
volumes:
- ./models:/home/ViolaJones/cpp/models
- ./out:/home/ViolaJones/cpp/out
- ../data:/home/ViolaJones/data
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]
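Assuming Docker Compose v2 and the NVIDIA Container Toolkit (required for the `driver: nvidia` device reservation above), the service can presumably be built and started from the cpp directory with `docker compose up --build`.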

@ -11,7 +11,7 @@ void test_working(const size_t& length) noexcept {
const size_t size = length * sizeof(size_t);
#if __DEBUG
print("Estimating memory footprint at : " + format_byte_size(2 * size));
printf("Estimating memory footprint at : %s\n", format_byte_size(2 * size).c_str());
#endif
np::Array<size_t> x = np::empty<size_t>({ length }), y = np::empty<size_t>({ length });
@ -53,7 +53,7 @@ void test_working_2d(const size_t& N1, const size_t& N2) noexcept {
const size_t size = length * sizeof(size_t);
#if __DEBUG
print("Estimating memory footprint at : " + format_byte_size(2 * size));
printf("Estimating memory footprint at : %s\n", format_byte_size(2 * size).c_str());
#endif
np::Array<size_t> x = np::empty<size_t>({ length }), y = np::empty<size_t>({ length });
@ -96,7 +96,7 @@ void test_working_3d(const size_t& N1, const size_t& N2, const size_t& N3) noexc
const size_t size = length * sizeof(size_t);
#if __DEBUG
print("Estimating memory footprint at : " + format_byte_size(2 * size));
printf("Estimating memory footprint at : %s\n", format_byte_size(2 * size).c_str());
#endif
np::Array<size_t> x = np::empty<size_t>({ length }), y = np::empty<size_t>({ length });

@ -1,167 +1,192 @@
#include <filesystem>
-namespace fs = std::filesystem;
#include "data.hpp"
#include "toolbox.hpp"
#include "config.hpp"
-#include "gpu_unit_test.hpp"
#include "toolbox_unit_test.hpp"
#include "ViolaJones.hpp"
-#include "ViolaJonesGPU.hpp"
-#include "ViolaJonesCPU.hpp"
+#include "ViolaJones_device.hpp"
#if GPU_BOOSTED
+#include "gpu_unit_test.hpp"
#define LABEL "GPU"
-#define apply_features apply_features_gpu
-#define set_integral_image set_integral_image_gpu
-#define argsort_2d argsort_2d_gpu
#else
#define LABEL "CPU"
-#define apply_features apply_features_cpu
-#define set_integral_image set_integral_image_cpu
-#define argsort_2d argsort_2d_cpu
#endif
/**
* @brief Execute the preprocessing phase.
*
* The preprocessing phase consists of the following steps :
* - Load the dataset
* - Calculate features
* - Calculate integral images
* - Apply features to images
* - Calculate argsort of the featured images
*
* @return std::tuple<np::Array<int32_t>, np::Array<uint16_t>, np::Array<uint8_t>, np::Array<int32_t>, np::Array<uint8_t>> Tuple containing, in order : training features, training features sorted indexes, training labels, testing features, testing labels
*/
-std::tuple<np::Array<int32_t>, np::Array<uint16_t>, np::Array<uint8_t>, np::Array<int32_t>, np::Array<uint8_t>> preprocessing() {
+std::tuple<np::Array<int32_t>, np::Array<uint16_t>, np::Array<uint8_t>, np::Array<int32_t>, np::Array<uint8_t>> preprocessing(void) {
// Creating state saver folders if they don't exist already
if (SAVE_STATE)
for (const char* const folder_name : { "models", "out" })
-fs::create_directory(folder_name);
+std::filesystem::create_directory(folder_name);
printf("| %-49s | %-18s | %-29s |\n", "Preprocessing", "Time spent (ns)", "Formatted time spent");
printf("|%s|%s|%s|\n", S(51), S(20), S(31));
const std::chrono::system_clock::time_point preproc_timestamp = perf_counter_ns();
const std::array<int32_t, 3> preproc_gaps = { 49, -18, 29 };
header(preproc_gaps, { "Preprocessing", "Time spent (ns)", "Formatted time spent" });
const auto [ X_train, y_train, X_test, y_test ] = state_saver<uint8_t, 4>("Loading sets", {"X_train", "y_train", "X_test", "y_test"},
const auto [ X_train, y_train, X_test, y_test ] = state_saver<uint8_t, 4>("Loading sets", preproc_gaps[0], { "X_train", "y_train", "X_test", "y_test" },
FORCE_REDO, SAVE_STATE, OUT_DIR, load_datasets);
#if __DEBUG
print("X_train");
printf("X_train\n");
print(X_train.shape);
print(X_train, { IDX_INSPECT });
print("X_test");
printf("X_test\n");
print(X_test.shape);
print(X_test, { IDX_INSPECT });
print("y_train");
printf("y_train\n");
print(y_train.shape);
print(y_train, { IDX_INSPECT, IDX_INSPECT + IDX_INSPECT_OFFSET });
print("y_test");
printf("y_test\n");
print(y_test.shape);
print(y_test, { IDX_INSPECT, IDX_INSPECT + IDX_INSPECT_OFFSET });
#endif
const np::Array<uint8_t> feats = state_saver<uint8_t>("Building features", "feats",
const np::Array<uint8_t> feats = state_saver<uint8_t>("Building features", preproc_gaps[0], "feats",
FORCE_REDO, SAVE_STATE, OUT_DIR, build_features, X_train.shape[1], X_train.shape[2]);
#if __DEBUG
print("feats");
printf("feats\n");
print(feats.shape);
print_feat(feats, { IDX_INSPECT });
#endif
const np::Array<uint32_t> X_train_ii = state_saver<uint32_t>("Converting training set to integral images (" LABEL ")", "X_train_ii_" LABEL,
const np::Array<uint32_t> X_train_ii = state_saver<uint32_t>("Converting training set to integral images (" LABEL ")", preproc_gaps[0], "X_train_ii_" LABEL,
FORCE_REDO, SAVE_STATE, OUT_DIR, set_integral_image, X_train);
const np::Array<uint32_t> X_test_ii = state_saver<uint32_t>("Converting testing set to integral images (" LABEL ")", "X_test_ii_" LABEL,
const np::Array<uint32_t> X_test_ii = state_saver<uint32_t>("Converting testing set to integral images (" LABEL ")", preproc_gaps[0], "X_test_ii_" LABEL,
FORCE_REDO, SAVE_STATE, OUT_DIR, set_integral_image, X_test);
#if __DEBUG
print("X_train_ii");
printf("X_train_ii\n");
print(X_train_ii.shape);
print(X_train_ii, { IDX_INSPECT });
print("X_test_ii");
printf("X_test_ii\n");
print(X_test_ii.shape);
print(X_test_ii, { IDX_INSPECT });
#endif
const np::Array<int32_t> X_train_feat = state_saver<int32_t>("Applying features to training set (" LABEL ")", "X_train_feat_" LABEL,
const np::Array<int32_t> X_train_feat = state_saver<int32_t>("Applying features to training set (" LABEL ")", preproc_gaps[0], "X_train_feat_" LABEL,
FORCE_REDO, SAVE_STATE, OUT_DIR, apply_features, feats, X_train_ii);
const np::Array<int32_t> X_test_feat = state_saver<int32_t>("Applying features to testing set (" LABEL ")", "X_test_feat_" LABEL,
const np::Array<int32_t> X_test_feat = state_saver<int32_t>("Applying features to testing set (" LABEL ")", preproc_gaps[0], "X_test_feat_" LABEL,
FORCE_REDO, SAVE_STATE, OUT_DIR, apply_features, feats, X_test_ii);
#if __DEBUG
print("X_train_feat");
printf("X_train_feat\n");
print(X_train_feat.shape);
print(X_train_feat, { IDX_INSPECT, IDX_INSPECT + IDX_INSPECT_OFFSET });
print("X_test_feat");
printf("X_test_feat\n");
print(X_test_feat.shape);
print(X_test_feat, { IDX_INSPECT, IDX_INSPECT + IDX_INSPECT_OFFSET });
#endif
-// const Array<int> indices = measure_time_save<Array<int>>("Selecting best features", "indices", select_percentile, X_train_feat, d.y_train);
-// const Array<int> indices = measure_time<Array<int>>("Selecting best features", select_percentile, X_train_feat, d.y_train);
+// const np::Array<int32_t> indices = state_saver<int32_t>("Selecting best features", preproc_gaps[0], "indices", select_percentile, X_train_feat, d.y_train);
#if __DEBUG
// print_feature(indices);
#endif
const np::Array<uint16_t> X_train_feat_argsort = state_saver<uint16_t>("Precalculating training set argsort (" LABEL ")", "X_train_feat_argsort_" LABEL,
const np::Array<uint16_t> X_train_feat_argsort = state_saver<uint16_t>("Precalculating training set argsort (" LABEL ")", preproc_gaps[0], "X_train_feat_argsort_" LABEL,
FORCE_REDO, SAVE_STATE, OUT_DIR, argsort_2d, X_train_feat);
#if __DEBUG
print("X_train_feat_argsort");
printf("X_train_feat_argsort\n");
print(X_train_feat_argsort.shape);
print(X_train_feat_argsort, { IDX_INSPECT, IDX_INSPECT + IDX_INSPECT_OFFSET });
#endif
const np::Array<uint16_t> X_test_feat_argsort = state_saver<uint16_t>("Precalculating testing set argsort (" LABEL ")", "X_test_feat_argsort_" LABEL,
const np::Array<uint16_t> X_test_feat_argsort = state_saver<uint16_t>("Precalculating testing set argsort (" LABEL ")", preproc_gaps[0], "X_test_feat_argsort_" LABEL,
FORCE_REDO, SAVE_STATE, OUT_DIR, argsort_2d, X_test_feat);
#if __DEBUG
print("X_test_feat_argsort");
printf("X_test_feat_argsort\n");
print(X_test_feat_argsort.shape);
print(X_test_feat_argsort, { IDX_INSPECT, IDX_INSPECT + IDX_INSPECT_OFFSET });
#endif
const long long time_spent = duration_ns(perf_counter_ns() - preproc_timestamp);
formatted_line(preproc_gaps, "", "", "", "");
formatted_row(preproc_gaps, { "Preprocessing summary", thousand_sep(time_spent).c_str(), format_time_ns(time_spent).c_str() });
footer(preproc_gaps);
return { X_train_feat, X_train_feat_argsort, y_train, X_test_feat, y_test };
}
/**
* @brief Train the weak classifiers.
*
* @param X_train_feat Training images
* @param X_train_feat_argsort Sorted indexes of the training images features
* @param y_train Training labels
* @return List of trained models
*/
-void train(const np::Array<int32_t>& X_train_feat, const np::Array<uint16_t>& X_train_feat_argsort, const np::Array<uint8_t>& y_train) {
-printf("\n| %-49s | %-18s | %-29s |\n", "Training", "Time spent (ns)", "Formatted time spent");
-printf("|%s|%s|%s|\n", S(51), S(20), S(31));
+std::array<std::array<np::Array<float64_t>, 2>, TS.size()> train(const np::Array<int32_t>& X_train_feat, const np::Array<uint16_t>& X_train_feat_argsort, const np::Array<uint8_t>& y_train) noexcept {
+const std::chrono::system_clock::time_point training_timestamp = perf_counter_ns();
+const std::array<int32_t, 3> training_gaps = { 26, -18, 29 };
+header(training_gaps, { "Training", "Time spent (ns)", "Formatted time spent" });
-for (const size_t T : TS) {
-char title[BUFFER_SIZE] = { 0 };
-char alphas_title[BUFFER_SIZE] = { 0 };
-char final_classifiers_title[BUFFER_SIZE] = { 0 };
-sprintf(title, "ViolaJones T = %-4lu (%s)", T, LABEL);
-sprintf(alphas_title, "alphas_%lu_%s", T, LABEL);
-sprintf(final_classifiers_title, "final_classifiers_%lu_%s", T, LABEL);
-#if __DEBUG
-const auto [ alphas, final_classifiers ] = state_saver<float64_t, 2>(title, { alphas_title, final_classifiers_title },
-#else
-state_saver<float64_t, 2>(title, { alphas_title, final_classifiers_title },
-#endif
-FORCE_REDO, SAVE_STATE, MODEL_DIR, train_viola_jones, T, X_train_feat, X_train_feat_argsort, y_train);
-#if __DEBUG
-print("alphas");
-print(alphas);
-print("final_classifiers");
-print(final_classifiers);
-#endif
-}
-}
-void testing_and_evaluating(const np::Array<int32_t>& X_train_feat, const np::Array<uint8_t>& y_train, const np::Array<int32_t>& X_test_feat, const np::Array<uint8_t>& y_test) {
-printf("\n| %-26s | Time spent (ns) (E) | %-29s | Time spent (ns) (T) | %-29s |\n", "Testing", "Formatted time spent (E)", "Formatted time spent (T)");
-printf("|%s|%s|%s|%s|%s|\n", S(28), S(21), S(31), S(21), S(31));
-constexpr const size_t NT = sizeof(TS) / sizeof(size_t);
-std::array<std::array<float64_t, 8>, NT> results;
+std::array<std::array<np::Array<float64_t>, 2>, TS.size()> models;
size_t i = 0;
for (const size_t T : TS) {
char title[BUFFER_SIZE] = { 0 };
char alphas_title[BUFFER_SIZE] = { 0 };
char final_classifiers_title[BUFFER_SIZE] = { 0 };
-sprintf(title, "ViolaJones T = %-4lu (%s)", T, LABEL);
-sprintf(alphas_title, MODEL_DIR "/alphas_%lu_%s.bin", T, LABEL);
-sprintf(final_classifiers_title, MODEL_DIR "/final_classifiers_%lu_%s.bin", T, LABEL);
+snprintf(title, BUFFER_SIZE, "ViolaJones T = %-4lu (%s)", T, LABEL);
+snprintf(alphas_title, BUFFER_SIZE, "alphas_%lu_%s", T, LABEL);
+snprintf(final_classifiers_title, BUFFER_SIZE, "final_classifiers_%lu_%s", T, LABEL);
-const np::Array<float64_t> alphas = load<float64_t>(alphas_title);
-const np::Array<float64_t> final_classifiers = load<float64_t>(final_classifiers_title);
+const auto [ alphas, final_classifiers ] = state_saver<float64_t, 2>(title, training_gaps[0], { alphas_title, final_classifiers_title },
+FORCE_REDO, SAVE_STATE, MODEL_DIR, train_viola_jones, T, X_train_feat, X_train_feat_argsort, y_train);
+#if __DEBUG
+printf("alphas\n");
+print(alphas);
+printf("final_classifiers\n");
+print(final_classifiers);
+#endif
+models[i++] = { alphas, final_classifiers };
}
+const long long time_spent = duration_ns(perf_counter_ns() - training_timestamp);
+formatted_line(training_gaps, "", "", "", "");
+formatted_row(training_gaps, { "Training summary", thousand_sep(time_spent).c_str(), format_time_ns(time_spent).c_str() });
+footer(training_gaps);
+return models;
}
/**
* @brief Benchmark the trained classifiers on the training and testing sets.
*
* @param models List of trained models
* @param X_train_feat Training features
* @param y_train Training labels
* @param X_test_feat Testing features
* @param y_test Testing labels
*/
void testing_and_evaluating(const std::array<std::array<np::Array<float64_t>, 2>, TS.size()>& models, const np::Array<int32_t>& X_train_feat, const np::Array<uint8_t>& y_train, const np::Array<int32_t>& X_test_feat, const np::Array<uint8_t>& y_test) {
const std::array<int32_t, 5> testing_gaps = { 26, -19, 24, -19, 24 };
header(testing_gaps, { "Testing", "Time spent (ns) (E)", "Formatted time spent (E)", "Time spent (ns) (T)", "Formatted time spent (T)" });
std::array<std::array<float64_t, 8>, TS.size()> results;
size_t i = 0;
long long total_train_timestamp = 0;
long long total_test_timestamp = 0;
for (const auto& [ alphas, final_classifiers ] : models) {
char title[BUFFER_SIZE] = { 0 };
snprintf(title, BUFFER_SIZE, "ViolaJones T = %-4i (%s)", TS[i], LABEL);
std::chrono::system_clock::time_point start = perf_counter_ns();
const np::Array<uint8_t> y_pred_train = classify_viola_jones(alphas, final_classifiers, X_train_feat);
const long long t_pred_train = duration_ns(perf_counter_ns() - start);
total_train_timestamp += t_pred_train;
const float64_t e_acc = accuracy_score(y_train, y_pred_train);
const float64_t e_f1 = f1_score(y_train, y_pred_train);
float64_t e_FN, e_FP;
@ -170,137 +195,177 @@ void testing_and_evaluating(const np::Array<int32_t>& X_train_feat, const np::Ar
start = perf_counter_ns();
const np::Array<uint8_t> y_pred_test = classify_viola_jones(alphas, final_classifiers, X_test_feat);
const long long t_pred_test = duration_ns(perf_counter_ns() - start);
total_test_timestamp += t_pred_test;
const float64_t t_acc = accuracy_score(y_test, y_pred_test);
const float64_t t_f1 = f1_score(y_test, y_pred_test);
float64_t t_FN, t_FP;
std::tie(std::ignore, t_FN, t_FP, std::ignore) = confusion_matrix(y_test, y_pred_test);
results[i++] = { e_acc, e_f1, e_FN, e_FP, t_acc, t_f1, t_FN, t_FP };
printf("| %-26s | %'19lld | %-29s | %'19lld | %-29s |\n", title, t_pred_train, format_time_ns(t_pred_train).c_str(), t_pred_test, format_time_ns(t_pred_test).c_str());
formatted_row(testing_gaps, { title, thousand_sep(t_pred_train).c_str(), format_time_ns(t_pred_train).c_str(), thousand_sep(t_pred_test).c_str(), format_time_ns(t_pred_test).c_str() });
}
formatted_line(testing_gaps, "", "", "", "");
formatted_row(testing_gaps, { "Testing summary", thousand_sep(total_train_timestamp).c_str(), format_time_ns(total_train_timestamp).c_str(), thousand_sep(total_test_timestamp).c_str(), format_time_ns(total_test_timestamp).c_str() });
footer(testing_gaps);
printf("\n| %-19s | ACC (E) | F1 (E) | FN (E) | FP (E) | ACC (T) | F1 (T) | FN (T) | FP (T) |\n", "Evaluating");
printf("|%s|%s|%s|%s|%s|%s|%s|%s|%s|\n", S(21), S(9), S(8), S(8), S(8), S(9), S(8), S(8), S(8));
const std::array<int32_t, 9> evaluating_gaps = { 19, -7, -6, -6, -6, -7, -6, -6, -6 };
header(evaluating_gaps, { "Evaluating", "ACC (E)", "F1 (E)", "FN (E)", "FP (E)", "ACC (T)", "F1 (T)", "FN (T)", "FP (T)"});
i = 0;
for (const size_t T : TS) {
char title[BUFFER_SIZE] = { 0 };
sprintf(title, "ViolaJones T = %-4lu", T);
snprintf(title, BUFFER_SIZE, "ViolaJones T = %-4lu", T);
const auto [e_acc, e_f1, e_FN, e_FP, t_acc, t_f1, t_FN, t_FP] = results[i++];
printf("| %-19s | %'6.2f%% | %'6.2f | %'6.0f | %'6.0f | %6.2f%% | %'6.2f | %'6.0f | %'6.0f |\n", title, e_acc * 100, e_f1, e_FN, e_FP, t_acc * 100, t_f1, t_FN, t_FP);
printf(" %-19s %'6.2f%% %'6.2f %'6.0f %'6.0f %6.2f%% %'6.2f %'6.0f %'6.0f \n", title, e_acc * 100, e_f1, e_FN, e_FP, t_acc * 100, t_f1, t_FN, t_FP);
}
footer(evaluating_gaps);
}
/**
* @brief Test whether each result is identical across devices.
*
* Since Viola-Jones is a fully deterministic algorithm, the results should be the same
* regardless of the device; this function checks that assertion.
*/
void unit_test(void) {
printf("\n| %-37s | %-10s | %-18s | %-29s |\n", "Unit testing", "Test state", "Time spent (ns)", "Formatted time spent");
printf("|%s|%s|%s|%s|\n", S(39), S(12), S(20), S(31));
const std::chrono::system_clock::time_point unit_timestamp = perf_counter_ns();
const std::array<int32_t, 4> unit_gaps = { 37, -10, -18, 29};
header(unit_gaps, { "Unit testing", "Test state", "Time spent (ns)", "Formatted time spent" });
char title[BUFFER_SIZE] = { 0 };
char tmp_title[BUFFER_SIZE / 2] = { 0 };
char file_cpu[BUFFER_SIZE] = { 0 };
char file_gpu[BUFFER_SIZE] = { 0 };
const std::chrono::system_clock::time_point fnc_s = perf_counter_ns();
uint64_t n_total = 0, n_success = 0;
-auto test_fnc = [&n_total, &n_success](const char* title, const auto& fnc) {
+const auto test_fnc = [&unit_gaps, &n_total, &n_success](const char* const title, const auto& fnc) noexcept {
++n_total;
const std::chrono::system_clock::time_point start = perf_counter_ns();
const bool state = fnc();
const long long time_spent = duration_ns(perf_counter_ns() - start);
if(state){
printf("| %-37s | %10s | %18s | %-29s |\n", title, "Passed", thousand_sep(time_spent).c_str(), format_time_ns(time_spent).c_str());
formatted_row(unit_gaps, { title, "Passed", thousand_sep(time_spent).c_str(), format_time_ns(time_spent).c_str() });
++n_success;
} else
printf("| %-37s | %10s | %18s | %-29s |\n", title, "Failed", thousand_sep(time_spent).c_str(), format_time_ns(time_spent).c_str());
formatted_row(unit_gaps, { title, "Failed", thousand_sep(time_spent).c_str(), format_time_ns(time_spent).c_str() });
};
for (const char* label : { "train", "test" }) {
sprintf(file_cpu, OUT_DIR "/X_%s_ii_CPU.bin", label);
sprintf(file_gpu, OUT_DIR "/X_%s_ii_GPU.bin", label);
if (fs::exists(file_cpu) && fs::exists(file_gpu)) {
const np::Array<uint32_t> X_train_ii_cpu = load<uint32_t>(file_cpu);
const np::Array<uint32_t> X_train_ii_gpu = load<uint32_t>(file_gpu);
sprintf(tmp_title, "X_%s_ii", label);
sprintf(title, "%-22s - CPU vs GPU", tmp_title);
test_fnc(title, [&X_train_ii_cpu, &X_train_ii_gpu]{ return unit_test_cpu_vs_gpu<uint32_t>(X_train_ii_cpu, X_train_ii_gpu); });
for (const char* const label : { "train", "test" }) {
snprintf(file_cpu, BUFFER_SIZE, OUT_DIR "/X_%s_ii_CPU.bin", label);
snprintf(file_gpu, BUFFER_SIZE, OUT_DIR "/X_%s_ii_GPU.bin", label);
if (std::filesystem::exists(file_cpu) && std::filesystem::exists(file_gpu)) {
snprintf(tmp_title, BUFFER_SIZE / 2, "X_%s_ii", label);
snprintf(title, BUFFER_SIZE, "%-22s - CPU vs GPU", tmp_title);
test_fnc(title, [&file_cpu, &file_gpu]{
const np::Array<uint32_t> X_train_ii_cpu = load<uint32_t>(file_cpu);
const np::Array<uint32_t> X_train_ii_gpu = load<uint32_t>(file_gpu);
return unit_test_cpu_vs_gpu<uint32_t>(X_train_ii_cpu, X_train_ii_gpu);
});
}
snprintf(file_cpu, BUFFER_SIZE, OUT_DIR "/X_%s_feat_CPU.bin", label);
snprintf(file_gpu, BUFFER_SIZE, OUT_DIR "/X_%s_feat_GPU.bin", label);
uint8_t feat = 0;
char file_feat[BUFFER_SIZE] = { 0 };
sprintf(file_feat, OUT_DIR "/X_%s_feat_CPU.bin", label);
if (fs::exists(file_feat)) {
if (std::filesystem::exists(file_cpu)) {
strncpy(file_feat, file_cpu, BUFFER_SIZE);
feat = 1;
} else if (std::filesystem::exists(file_gpu)) {
strncpy(file_feat, file_gpu, BUFFER_SIZE);
feat = 2;
}
if (feat != 0) {
const np::Array<int32_t> X_feat = load<int32_t>(file_feat);
sprintf(file_gpu, OUT_DIR "/X_%s_feat_GPU.bin", label);
if (fs::exists(file_gpu)) {
const np::Array<int32_t> X_feat_gpu = load<int32_t>(file_gpu);
sprintf(tmp_title, "X_%s_feat", label);
sprintf(title, "%-22s - CPU vs GPU", tmp_title);
test_fnc(title, [&X_feat, &X_feat_gpu]{ return unit_test_cpu_vs_gpu<int32_t>(X_feat, X_feat_gpu); });
snprintf(file_gpu, BUFFER_SIZE, feat == 1 ? OUT_DIR "/X_%s_feat_GPU.bin" : OUT_DIR "/X_%s_feat_CPU.bin", label);
if (std::filesystem::exists(file_gpu)) {
snprintf(tmp_title, BUFFER_SIZE / 2, "X_%s_feat", label);
snprintf(title, BUFFER_SIZE, "%-22s - CPU vs GPU", tmp_title);
test_fnc(title, [&X_feat, &file_gpu]{
const np::Array<int32_t> X_feat_aux = load<int32_t>(file_gpu);
return unit_test_cpu_vs_gpu<int32_t>(X_feat, X_feat_aux);
});
}
sprintf(file_cpu, OUT_DIR "/X_%s_feat_argsort_CPU.bin", label);
snprintf(file_cpu, BUFFER_SIZE, OUT_DIR "/X_%s_feat_argsort_CPU.bin", label);
np::Array<uint16_t> X_feat_argsort_cpu;
uint8_t loaded = 0;
if (fs::exists(file_cpu)) {
X_feat_argsort_cpu = std::move(load<uint16_t>(file_cpu));
if (std::filesystem::exists(file_cpu)) {
++loaded;
sprintf(tmp_title, "X_%s_feat_argsort", label);
sprintf(title, "%-22s - CPU argsort", tmp_title);
test_fnc(title, [&X_feat, &X_feat_argsort_cpu]{ return unit_test_argsort_2d<int32_t>(X_feat, X_feat_argsort_cpu); });
snprintf(tmp_title, BUFFER_SIZE / 2, "X_%s_feat_argsort", label);
snprintf(title, BUFFER_SIZE, "%-22s - CPU argsort", tmp_title);
test_fnc(title, [&X_feat, &X_feat_argsort_cpu, &file_cpu]{
X_feat_argsort_cpu = load<uint16_t>(file_cpu);
return unit_test_argsort_2d<int32_t>(X_feat, X_feat_argsort_cpu);
});
}
sprintf(file_gpu, OUT_DIR "/X_%s_feat_argsort_GPU.bin", label);
snprintf(file_gpu, BUFFER_SIZE, OUT_DIR "/X_%s_feat_argsort_GPU.bin", label);
np::Array<uint16_t> X_feat_argsort_gpu;
if (fs::exists(file_gpu)) {
X_feat_argsort_gpu = std::move(load<uint16_t>(file_gpu));
if (std::filesystem::exists(file_gpu)) {
++loaded;
sprintf(tmp_title, "X_%s_feat_argsort", label);
sprintf(title, "%-22s - GPU argsort", tmp_title);
test_fnc(title, [&X_feat, &X_feat_argsort_gpu]{ return unit_test_argsort_2d<int32_t>(X_feat, X_feat_argsort_gpu); });
snprintf(tmp_title, BUFFER_SIZE / 2, "X_%s_feat_argsort", label);
snprintf(title, BUFFER_SIZE, "%-22s - GPU argsort", tmp_title);
test_fnc(title, [&X_feat, &X_feat_argsort_gpu, &file_gpu]{
X_feat_argsort_gpu = load<uint16_t>(file_gpu);
return unit_test_argsort_2d<int32_t>(X_feat, X_feat_argsort_gpu);
});
}
if (loaded == 2){
sprintf(tmp_title, "X_%s_feat_argsort", label);
sprintf(title, "%-22s - CPU vs GPU", tmp_title);
snprintf(tmp_title, BUFFER_SIZE / 2, "X_%s_feat_argsort", label);
snprintf(title, BUFFER_SIZE, "%-22s - CPU vs GPU", tmp_title);
test_fnc(title, [&X_feat_argsort_cpu, &X_feat_argsort_gpu]{ return unit_test_cpu_vs_gpu<uint16_t>(X_feat_argsort_cpu, X_feat_argsort_gpu); });
}
}
}
for (const size_t T : TS)
for (const char* label : { "alphas", "final_classifiers" }) {
sprintf(file_cpu, MODEL_DIR "/%s_%lu_CPU.bin", label, T);
sprintf(file_gpu, MODEL_DIR "/%s_%lu_GPU.bin", label, T);
if (fs::exists(file_cpu) && fs::exists(file_gpu)){
const np::Array<float64_t> cpu = load<float64_t>(file_cpu);
const np::Array<float64_t> gpu = load<float64_t>(file_gpu);
sprintf(tmp_title, "%s_%ld", label, T);
sprintf(title, "%-22s - CPU vs GPU", tmp_title);
test_fnc(title, [&cpu, &gpu]{ return unit_test_cpu_vs_gpu<float64_t>(cpu, gpu); });
for (const char* const label : { "alphas", "final_classifiers" }) {
snprintf(file_cpu, BUFFER_SIZE, MODEL_DIR "/%s_%lu_CPU.bin", label, T);
snprintf(file_gpu, BUFFER_SIZE, MODEL_DIR "/%s_%lu_GPU.bin", label, T);
if (std::filesystem::exists(file_cpu) && std::filesystem::exists(file_gpu)){
snprintf(tmp_title, BUFFER_SIZE / 2, "%s_%ld", label, T);
snprintf(title, BUFFER_SIZE, "%-22s - CPU vs GPU", tmp_title);
test_fnc(title, [&file_cpu, &file_gpu]{
const np::Array<float64_t> cpu = load<float64_t>(file_cpu);
const np::Array<float64_t> gpu = load<float64_t>(file_gpu);
return unit_test_cpu_vs_gpu<float64_t>(cpu, gpu);
});
}
}
-const long long time_spent = duration_ns(perf_counter_ns() - fnc_s);
-sprintf(title, "%ld/%ld", n_success, n_total);
+const long long time_spent = duration_ns(perf_counter_ns() - unit_timestamp);
-printf("|%s|%s|%s|%s|\n", S(39), S(12), S(20), S(31));
-printf("| %-37s | %10s | %18s | %-29s |\n", "Unit testing summary", title, thousand_sep(time_spent).c_str(), format_time_ns(time_spent).c_str());
+if (n_total == 0)
+formatted_row(unit_gaps, { "Unit testing summary", "No files", thousand_sep(time_spent).c_str(), format_time_ns(time_spent).c_str() });
+else {
+snprintf(title, BUFFER_SIZE, "%ld/%ld", n_success, n_total);
+formatted_line(unit_gaps, "", "", "", "");
+formatted_row(unit_gaps, { "Unit testing summary", title, thousand_sep(time_spent).c_str(), format_time_ns(time_spent).c_str() });
+}
+footer(unit_gaps);
}
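unit_test_cpu_vs_gpu itself is declared elsewhere (ViolaJones.hpp / gpu_unit_test.hpp); a minimal sketch of the elementwise comparison it presumably performs, under the assumption that equal sizes and bitwise-equal elements define success:

template<typename T>
bool unit_test_cpu_vs_gpu_sketch(const np::Array<T>& cpu, const np::Array<T>& gpu) noexcept {
	if (np::prod(cpu.shape) != np::prod(gpu.shape))
		return false;
	for (size_t i = 0; i < np::prod(cpu.shape); ++i)
		if (cpu[i] != gpu[i])
			return false; // any mismatch means the two devices diverged
	return true;
}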
-int main(){
+int32_t main(void){
setlocale(LC_NUMERIC, ""); // Allow proper number display
-printf("| %-49s | %-18s | %-29s |\n", "Unit testing", "Time spent (ns)", "Formatted time spent");
-printf("|%s|%s|%s|\n", S(51), S(20), S(31));
+const std::chrono::system_clock::time_point unit_timestamp = perf_counter_ns();
+const std::array<int32_t, 3> unit_gaps = { 27, -18, 29 };
+header(unit_gaps, { "Unit testing", "Time spent (ns)", "Formatted time spent" });
#if GPU_BOOSTED
-benchmark_function_void("Testing GPU capabilities 1D", test_working, 3 + (1<<29));
-benchmark_function_void("Testing GPU capabilities 2D", test_working_2d, 3 + (1<<15), 2 + (1<<14));
-benchmark_function_void("Testing GPU capabilities 3D", test_working_3d, 9 + (1<<10), 5 + (1<<10), 7 + (1<<9));
+benchmark_function_void("Testing GPU capabilities 1D", unit_gaps[0], test_working, 50000);
+benchmark_function_void("Testing GPU capabilities 2D", unit_gaps[0], test_working_2d, 200, 500);
+benchmark_function_void("Testing GPU capabilities 3D", unit_gaps[0], test_working_3d, 30, 40, 500);
#endif
benchmark_function_void("Testing format_time", format_time_test);
benchmark_function_void("Testing format_time_ns", format_time_ns_test);
benchmark_function_void("Testing format_byte_size", format_byte_size_test);
benchmark_function_void("Testing thousand_sep", thousand_sep_test);
printf("\n");
benchmark_function_void("Testing format_time", unit_gaps[0], format_time_test);
benchmark_function_void("Testing format_time_ns", unit_gaps[0], format_time_ns_test);
benchmark_function_void("Testing format_byte_size", unit_gaps[0], format_byte_size_test);
benchmark_function_void("Testing thousand_sep", unit_gaps[0], thousand_sep_test);
const long long time_spent = duration_ns(perf_counter_ns() - unit_timestamp);
formatted_line(unit_gaps, "", "", "", "");
formatted_row(unit_gaps, { "Unit testing summary", thousand_sep(time_spent).c_str(), format_time_ns(time_spent).c_str() });
footer(unit_gaps);
const auto [ X_train_feat, X_train_feat_argsort, y_train, X_test_feat, y_test ] = preprocessing();
train(X_train_feat, X_train_feat_argsort, y_train);
testing_and_evaluating(X_train_feat, y_train, X_test_feat, y_test);
const std::array<std::array<np::Array<float64_t>, 2>, TS.size()> models = train(X_train_feat, X_train_feat_argsort, y_train);
testing_and_evaluating(models, X_train_feat, y_train, X_test_feat, y_test);
unit_test();
return EXIT_SUCCESS;
}

31
cpp/projet_test.cpp Normal file
View File

@ -0,0 +1,31 @@
#include "toolbox.hpp"
#include "config.hpp"
#include "toolbox_unit_test.hpp"
#include "ViolaJones.hpp"
#if GPU_BOOSTED
#include "gpu_unit_test.hpp"
#endif
int32_t main(void){
setlocale(LC_NUMERIC, ""); // Allow proper number display
const std::chrono::system_clock::time_point unit_timestamp = perf_counter_ns();
const std::array<int32_t, 3> unit_gaps = { 27, -18, 29 };
header(unit_gaps, { "Unit testing", "Time spent (ns)", "Formatted time spent" });
#if GPU_BOOSTED
benchmark_function_void("Testing GPU capabilities 1D", unit_gaps[0], test_working, 50000);
benchmark_function_void("Testing GPU capabilities 2D", unit_gaps[0], test_working_2d, 200, 500);
benchmark_function_void("Testing GPU capabilities 3D", unit_gaps[0], test_working_3d, 30, 40, 500);
#endif
benchmark_function_void("Testing format_time", unit_gaps[0], format_time_test);
benchmark_function_void("Testing format_time_ns", unit_gaps[0], format_time_ns_test);
benchmark_function_void("Testing format_byte_size", unit_gaps[0], format_byte_size_test);
benchmark_function_void("Testing thousand_sep", unit_gaps[0], thousand_sep_test);
const long long time_spent = duration_ns(perf_counter_ns() - unit_timestamp);
formatted_line(unit_gaps, "", "", "", "");
formatted_row(unit_gaps, { "Unit testing summary", thousand_sep(time_spent).c_str(), format_time_ns(time_spent).c_str() });
footer(unit_gaps);
return EXIT_SUCCESS;
}

View File

@ -1,63 +0,0 @@
#include <iostream>
#include <iomanip>
#include "data.hpp"
#include "toolbox.hpp"
#define PBSTR "||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||"
#define PBWIDTH 60
void printProgress(const float64_t& percentage) noexcept {
const uint64_t val = static_cast<uint64_t>(percentage * 100);
const int lpad = static_cast<int>(percentage * PBWIDTH);
const int rpad = PBWIDTH - lpad;
printf("%3lu%% [%.*s%*s]\r", val, lpad, PBSTR, rpad, "");
fflush(stdout);
}
void clearProgress() noexcept {
// Progress bar width + space before + num space + space after
printf("%*c\r", PBWIDTH + 1 + 3 + 3, ' ');
}
template<typename T>
void test(const uint64_t& N) noexcept {
#if __DEBUG
printf("DETERMINISTIC for N=%s of %s sized %s\n", thousand_sep(N).c_str(), typeid(T).name(), format_byte_size(sizeof(T)).c_str());
print("Estimating memory footprint at : " + format_byte_size(3 * N * sizeof(T)));
#endif
T *a = new T[N], *b = new T[N], *c = new T[N];
T mean = static_cast<T>(0.0);
const size_t percent = N / 100;
for(size_t i = 0; i < N; ++i){
if (i % percent == 0) printProgress(static_cast<float64_t>(i) / N);
a[i] = static_cast<T>(i < N>>1 ? 0.1 : 1.0);
b[i] = static_cast<T>(1.0);
c[i] = a[i] * b[i];
mean += c[i];
}
mean /= static_cast<T>(N);
clearProgress();
std::cout << mean << std::endl;
delete[] a, delete[] b, delete[] c;
}
void test_float() noexcept {
std::cout << std::setprecision(1<<8);
const uint64_t N = static_cast<uint64_t>(1)<<28;
test<float128_t>(N);
test<float64_t>(N);
test<float32_t>(N);
//printf("%.128af\n", static_cast<float64_t>(1) / 3);
//std::cout << static_cast<float64_t>(1) / 3 << std::endl;
//std::cout << std::hexfloat << static_cast<float64_t>(1) / 3 << std::endl;
//printf("%.128Lf\n", static_cast<long float64_t>(1) / 3);
//printf("%.128lf\n", static_cast<float64_t>(1) / 3);
//printf("%.128f\n", static_cast<float>(1) / 3);
}
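For context, the deleted test fills half of the array with 0.1 and the other half with 1.0, multiplies element-wise and averages, so the exact mean is (0.1 + 1.0) / 2 = 0.55; the point was to see how far each float width drifts from that value under naive accumulation. A minimal NumPy sketch of the same experiment (a much smaller, purely illustrative N):

import numpy as np

N = 1 << 16                                    # far smaller than the 1<<28 above, for speed
for dtype in (np.float32, np.float64):
    a = np.where(np.arange(N) < N >> 1, 0.1, 1.0).astype(dtype)
    c = a * np.ones(N, dtype = dtype)
    mean = dtype(0)
    for v in c:                                # naive sequential accumulation, like the C++ loop
        mean += v
    print(np.dtype(dtype).name, mean / dtype(N))   # exact value is 0.55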

View File

@ -1,8 +1,6 @@
#include "toolbox.hpp"
#include <numeric>
#include <algorithm>
static constexpr uint64_t u64(const double& n) noexcept { return static_cast<uint64_t>(n); }
inline static constexpr uint64_t u64(const double& n) noexcept { return static_cast<uint64_t>(n); }
static const constexpr size_t N_TIMES = 11;
static const constexpr std::array<const char*, N_TIMES> time_formats = { "ns", "us", "ms", "s", "m", "h", "j", "w", "M", "y", "c" };
@ -12,8 +10,8 @@ static const constexpr std::array<uint64_t, N_TIMES> time_numbers = { 1, u64(1e3
/**
* @brief Format the time in seconds in human readable format.
*
* @param time Time in seconds
* @return std::string The formatted human readable string.
* @param time number of seconds
* @return The formatted human readable string
*/
std::string format_time(uint64_t time) noexcept {
if (time == 0)
@ -21,8 +19,8 @@ std::string format_time(uint64_t time) noexcept {
std::string s = "";
uint64_t res;
for (int i = N_TIMES - 1; i >= 3; --i) {
const uint64_t time_number = time_numbers[i] / 1e9; // Converting nanosecond timestamp to second
for (int32_t i = N_TIMES - 1; i >= 3; --i) {
const uint64_t time_number = time_numbers[i] / u64(1e9); // Converting nanosecond timestamp to second
if (time >= time_number) {
res = time / time_number;
time %= time_number;
@ -30,8 +28,8 @@ std::string format_time(uint64_t time) noexcept {
}
}
if (s.back() == ' ')
s.pop_back();
// Remove trailing character
s.pop_back();
return s;
}
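The loop above decomposes greedily from the largest unit down: 90 061 s = 86 400 + 3 600 + 60 + 1 prints as "1j 1h 1m 1s". A minimal Python sketch of the same decomposition, assuming the unit table above (j = day, w = week, M = 31-day month, y = 365-day year, c = century):

UNITS = [('c', 3_153_600_000), ('y', 31_536_000), ('M', 2_678_400), ('w', 604_800),
         ('j', 86_400), ('h', 3_600), ('m', 60), ('s', 1)]

def format_time(t: int) -> str:
    if t == 0:
        return '0s'
    parts = []
    for name, size in UNITS:                   # largest unit first, like the C++ loop
        if t >= size:
            parts.append(f'{t // size}{name}')
            t %= size
    return ' '.join(parts)

assert format_time(90_061) == '1j 1h 1m 1s'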
@ -40,7 +38,7 @@ std::string format_time(uint64_t time) noexcept {
* @brief Format the time in nanoseconds in human readable format.
*
* @param time Time in nanoseconds
* @return std::string The formatted human readable string.
* @return std::string The formatted human readable string
*/
std::string format_time_ns(uint64_t time) noexcept {
if (time == 0)
@ -48,7 +46,7 @@ std::string format_time_ns(uint64_t time) noexcept {
std::string s = "";
uint64_t res;
for (int i = N_TIMES - 1; i >= 0; --i) {
for (int32_t i = N_TIMES - 1; i >= 0; --i) {
if (time >= time_numbers[i]) {
res = time / time_numbers[i];
time %= time_numbers[i];
@ -56,8 +54,8 @@ std::string format_time_ns(uint64_t time) noexcept {
}
}
if (s.back() == ' ')
s.pop_back();
// Remove trailing character
s.pop_back();
return s;
}
@ -71,7 +69,7 @@ static const constexpr uint64_t total_bytes = u64(1)<<(10 * (N_BYTES - 1));
* See more : https://en.wikipedia.org/wiki/JEDEC_memory_standards
*
* @param bytes Number of bytes
* @return std::string JEDEC compliant formatted number of bytes
* @return JEDEC compliant formatted number of bytes
*/
std::string format_byte_size(uint64_t bytes) noexcept {
if (bytes == 0)
@ -95,6 +93,13 @@ std::string format_byte_size(uint64_t bytes) noexcept {
return s;
}
/**
* @brief Format a number with a separator (e.g. 1000 as 1,000)
*
* @param k number to format
* @param separator Separator inserted between each group of three digits
* @return Formatted number
*/
std::string thousand_sep(uint64_t k, const char& separator) noexcept {
const std::string n = std::to_string(k);
const uint64_t st_size = n.length() + (n.length() - 1) / 3;
@ -111,4 +116,3 @@ std::string thousand_sep(uint64_t k, const char& separator) noexcept {
return s;
}
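The size computed above counts one separator per started group of three digits: a 7-digit number needs (7 - 1) / 3 = 2 separators, hence a 9-character result such as "1,234,567". An equivalent Python sketch:

def thousand_sep(k: int, sep: str = ',') -> str:
    out = ''
    for i, c in enumerate(reversed(str(k))):
        if i > 0 and i % 3 == 0:               # separator before every new group of three
            out = sep + out
        out = c + out
    return out

assert thousand_sep(18_446_744_073_709_551_615) == '18,446,744,073,709,551,615'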

View File

@ -1,12 +1,102 @@
#pragma once
#include <array>
#include <chrono>
#include <string>
#include <stdint.h>
/**
* @brief Print a formatted row of titles with given gaps, separated by a separator.
*
* @param gaps List of size gaps
* @param titles List of titles
* @param separator Separator character between each gap
*/
template<size_t N>
constexpr void formatted_row(const std::array<int32_t, N>& gaps, const std::array<const char* const, N>& titles,
const char* const separator = "") noexcept {
for(size_t i = 0; i < N; ++i)
printf("%s %*s ", separator, -gaps[i], titles[i]);
printf("%s\n", separator);
}
/**
* @brief Print a formatted line of repeated characters.
*
* @param gaps List of size gaps
* @param left Character on the left
* @param middle Character between each separator
* @param separator Separator character between each gap
* @param right Character on the right
*/
template<size_t N>
constexpr void formatted_line(const std::array<int32_t, N>& gaps, const char* const left, const char* const middle,
const char* const separator, const char* const right) noexcept {
printf("%s", left);
for(size_t i = 0; i < N; ++i){
for(int32_t j = std::abs(gaps[i]) + 2; j > 0; --j)
printf("%s", separator);
if(i != N - 1)
printf("%s", middle);
}
printf("%s\n", right);
}
/**
* @brief Print a formatted header with the given titles and sizes.
*
* @param gaps List of size gaps
* @param titles List of titles
*/
template<size_t N>
constexpr void header(const std::array<int32_t, N>& gaps, const std::array<const char* const, N>& titles) noexcept {
formatted_line(gaps, "", "", "", "");
formatted_row(gaps, titles);
formatted_line(gaps, "", "", "", "");
}
/**
* @brief Print a formatted footer with the given sizes.
*
* @param gaps List of size gaps
*/
template<size_t N>
constexpr inline void footer(const std::array<int32_t, N>& gaps) noexcept {
formatted_line(gaps, "", "", "", "");
}
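Note the sign convention hidden in formatted_row: each gap is negated before being used as a printf field width, and a negative printf width left-aligns, so a positive gap yields a left-aligned column while a negative gap (like the -18 used for "Time spent (ns)") yields a right-aligned one. A small Python sketch of the same convention, illustrative only:

def formatted_row(gaps: list[int], titles: list[str], separator: str = '|') -> str:
    cells = [f'{t:<{g}}' if g > 0 else f'{t:>{-g}}' for g, t in zip(gaps, titles)]
    return separator + separator.join(f' {c} ' for c in cells) + separator

print(formatted_row([27, -18, 29], ['Unit testing', 'Time spent (ns)', 'Formatted time spent']))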
#define duration_ns(a) std::chrono::duration_cast<std::chrono::nanoseconds>(a).count()
#define perf_counter_ns() std::chrono::high_resolution_clock::now()
/**
* @brief Format the time in seconds in human readable format.
*
* @param time number of seconds
* @return The formatted human readable string
*/
std::string format_time(uint64_t) noexcept;
/**
* @brief Format the time in nanoseconds in human readable format.
*
* @param time Time in nanoseconds
* @return std::string The formatted human readable string
*/
std::string format_time_ns(uint64_t) noexcept;
/**
* @brief Convert the number of byte in JEDEC standard form.
* See more : https://en.wikipedia.org/wiki/JEDEC_memory_standards
*
* @param bytes Number of bytes
* @return JEDEC compliant formatted number of bytes
*/
std::string format_byte_size(uint64_t) noexcept;
/**
* @brief Format a number with a separator (e.g. 1000 as 1,000)
*
* @param k number to format
* @param separator Separator inserted between each group of three digits
* @return Formatted number
*/
std::string thousand_sep(uint64_t, const char& = ',') noexcept;

View File

@ -2,30 +2,44 @@
#include <iostream>
#include <assert.h>
/**
* @brief Test if a given result is equal to the expected one and log the result
*
* @tparam T Type of the compared values
* @param name Name of the unit test
* @param expected Expected result of the function call
* @param result Actual result of the function
*/
template<typename T>
void Assert(const char* name, const T& expected, const T& result) noexcept {
static void Assert(const char* const name, const T& expected, const T& result) noexcept {
if(expected != result){
std::cerr << "For test named " << name << " Expected '" << expected << "' but got '" << result << "' instead\n";
assert(false);
}
}
/**
* @brief Test suite for the format_byte_size output
*/
void format_byte_size_test(void) noexcept {
Assert("format_byte_size null", std::string("0B"), format_byte_size(static_cast<uint64_t>(0)));
Assert("format_byte_size byte", std::string("1B"), format_byte_size(static_cast<uint64_t>(1)));
Assert("format_byte_size kilobyte", std::string("1KB"), format_byte_size(static_cast<uint64_t>(1)<<10));
Assert("format_byte_size megabyte", std::string("1MB"), format_byte_size(static_cast<uint64_t>(1)<<20));
Assert("format_byte_size gigabyte", std::string("1GB"), format_byte_size(static_cast<uint64_t>(1)<<30));
Assert("format_byte_size terabyte", std::string("1TB"), format_byte_size(static_cast<uint64_t>(1)<<40));
Assert("format_byte_size petabyte", std::string("1PB"), format_byte_size(static_cast<uint64_t>(1)<<50));
Assert("format_byte_size exabyte", std::string("1EB"), format_byte_size(static_cast<uint64_t>(1)<<60));
Assert("format_byte_size null", std::string("0B"), format_byte_size(static_cast<uint64_t>(0)));
Assert("format_byte_size byte", std::string("1B"), format_byte_size(static_cast<uint64_t>(1)));
Assert("format_byte_size kilobyte", std::string("1KB"), format_byte_size(static_cast<uint64_t>(1)<<10));
Assert("format_byte_size megabyte", std::string("1MB"), format_byte_size(static_cast<uint64_t>(1)<<20));
Assert("format_byte_size gigabyte", std::string("1GB"), format_byte_size(static_cast<uint64_t>(1)<<30));
Assert("format_byte_size terabyte", std::string("1TB"), format_byte_size(static_cast<uint64_t>(1)<<40));
Assert("format_byte_size petabyte", std::string("1PB"), format_byte_size(static_cast<uint64_t>(1)<<50));
Assert("format_byte_size exabyte", std::string("1EB"), format_byte_size(static_cast<uint64_t>(1)<<60));
// Unsupported because the number of bytes is bigger than currently supported by ISO C++
//Assert("format_byte_size zettabyte", std::string("1ZB"), format_byte_size(static_cast<uint64_t>(1)<<70));
//Assert("format_byte_size yottabyte", std::string("1YB"), format_byte_size(static_cast<uint64_t>(1)<<80));
//Assert("format_byte_size zettabyte", std::string("1ZB"), format_byte_size(static_cast<uint64_t>(1)<<70));
//Assert("format_byte_size yottabyte", std::string("1YB"), format_byte_size(static_cast<uint64_t>(1)<<80));
// uint64_t_MAX == 2**64 == 18446744073709551615I64u == -1
Assert("format_byte_size max", std::string("15EB 1023PB 1023TB 1023GB 1023MB 1023KB 1023B"), format_byte_size(static_cast<uint64_t>(-1)));
}
/**
* @brief Test suite for the format_time output
*/
void format_time_test(void) noexcept {
// https://en.wikipedia.org/wiki/Unit_of_time
Assert("format_time null", std::string("0s"), format_time(static_cast<uint64_t>(0)));
@ -80,66 +94,72 @@ void format_time_test(void) noexcept {
Assert("format_time max", std::string("5849424173c 55y 3w 5j 7h 15s"), format_time(static_cast<uint64_t>(-1)));
}
/**
* @brief Test suite for the format_time_ns output
*/
void format_time_ns_test(void) noexcept {
// https://en.wikipedia.org/wiki/Unit_of_time
Assert("format_time_ns null", std::string("0ns"), format_time_ns(static_cast<uint64_t>(0)));
Assert("format_time_ns nanosecond", std::string("1ns"), format_time_ns(static_cast<uint64_t>(1)));
Assert("format_time_ns shake", std::string("10ns"), format_time_ns(static_cast<uint64_t>(10)));
Assert("format_time_ns microsecond", std::string("1us"), format_time_ns(static_cast<uint64_t>(1e3)));
Assert("format_time_ns millisecond", std::string("1ms"), format_time_ns(static_cast<uint64_t>(1e6)));
Assert("format_time_ns centisecond", std::string("10ms"), format_time_ns(static_cast<uint64_t>(1e7)));
Assert("format_time_ns decisecond", std::string("100ms"), format_time_ns(static_cast<uint64_t>(1e8)));
Assert("format_time_ns second", std::string("1s"), format_time_ns(static_cast<uint64_t>(1e9)));
Assert("format_time_ns decasecond", std::string("10s"), format_time_ns(static_cast<uint64_t>(1e10)));
Assert("format_time_ns minute", std::string("1m"), format_time_ns(static_cast<uint64_t>(6e10)));
Assert("format_time_ns milliday", std::string("1m 26s 400ms"), format_time_ns(static_cast<uint64_t>(864e8)));
Assert("format_time_ns hectosecond", std::string("1m 40s"), format_time_ns(static_cast<uint64_t>(1e11)));
Assert("format_time_ns kilosecond", std::string("16m 40s"), format_time_ns(static_cast<uint64_t>(1e12)));
Assert("format_time_ns hour", std::string("1h"), format_time_ns(static_cast<uint64_t>(36e11)));
Assert("format_time_ns day", std::string("1j"), format_time_ns(static_cast<uint64_t>(864e11)));
Assert("format_time_ns week/sennight", std::string("1w"), format_time_ns(static_cast<uint64_t>(6048e11)));
Assert("format_time_ns megasecond", std::string("1w 4j 13h 46m 40s"), format_time_ns(static_cast<uint64_t>(1e15)));
Assert("format_time_ns fortnight", std::string("2w"), format_time_ns(static_cast<uint64_t>(12096e11)));
Assert("format_time_ns lunar month (draconitic)", std::string("3w 6j 5h 5m 35s 800ms"), format_time_ns(static_cast<uint64_t>(23511358e8)));
Assert("format_time_ns lunar month (tropical)", std::string("3w 6j 7h 43m 4s 700ms"), format_time_ns(static_cast<uint64_t>(23605847e8)));
Assert("format_time_ns lunar month (sidereal)", std::string("3w 6j 7h 43m 11s 600ms"), format_time_ns(static_cast<uint64_t>(23605916e8)));
Assert("format_time_ns lunar month (anomalistic)", std::string("3w 6j 13h 18m 33s 200ms"), format_time_ns(static_cast<uint64_t>(23807132e8)));
Assert("format_time_ns lunar month (synodic)", std::string("4w 1j 12h 44m 2s 900ms"), format_time_ns(static_cast<uint64_t>(25514429e8)));
Assert("format_time_ns month", std::string("1M"), format_time_ns(static_cast<uint64_t>(26784e11)));
Assert("format_time_ns quarantine", std::string("1M 1w 2j"), format_time_ns(static_cast<uint64_t>(3456e12)));
Assert("format_time_ns semester", std::string("4M 2j"), format_time_ns(static_cast<uint64_t>(108864e11)));
Assert("format_time_ns lunar year", std::string("11M 1w 6j 8h 52m 48s"), format_time_ns(static_cast<uint64_t>(30617568e9)));
Assert("format_time_ns year", std::string("1y"), format_time_ns(static_cast<uint64_t>(31536e12)));
Assert("format_time_ns tropical year", std::string("1y 5h 48m 45s 216ms"), format_time_ns(static_cast<uint64_t>(31556925216e6)));
Assert("format_time_ns gregorian year", std::string("1y 5h 49m 12s"), format_time_ns(static_cast<uint64_t>(31556952e9)));
Assert("format_time_ns sidereal year", std::string("1y 6h 9m 9s 763ms 545us 600ns"), format_time_ns(static_cast<uint64_t>(315581497635456e2)));
Assert("format_time_ns leap year", std::string("1y 1j"), format_time_ns(static_cast<uint64_t>(316224e11)));
Assert("format_time_ns olympiad", std::string("4y"), format_time_ns(static_cast<uint64_t>(126144e12)));
Assert("format_time_ns lusturm", std::string("5y"), format_time_ns(static_cast<uint64_t>(15768e13)));
Assert("format_time_ns decade", std::string("10y"), format_time_ns(static_cast<uint64_t>(31536e13)));
Assert("format_time_ns indiction", std::string("15y"), format_time_ns(static_cast<uint64_t>(47304e13)));
Assert("format_time_ns score", std::string("20y"), format_time_ns(static_cast<uint64_t>(63072e13)));
Assert("format_time_ns gigasecond", std::string("31y 8M 1w 4j 1h 46m 40s"), format_time_ns(static_cast<uint64_t>(1e18)));
Assert("format_time_ns jubilee", std::string("50y"), format_time_ns(static_cast<uint64_t>(15768e14)));
Assert("format_time_ns century", std::string("1c"), format_time_ns(static_cast<uint64_t>(31536e14)));
Assert("format_time_ns null", std::string("0ns"), format_time_ns(static_cast<uint64_t>(0)));
Assert("format_time_ns nanosecond", std::string("1ns"), format_time_ns(static_cast<uint64_t>(1)));
Assert("format_time_ns shake", std::string("10ns"), format_time_ns(static_cast<uint64_t>(10)));
Assert("format_time_ns microsecond", std::string("1us"), format_time_ns(static_cast<uint64_t>(1e3)));
Assert("format_time_ns millisecond", std::string("1ms"), format_time_ns(static_cast<uint64_t>(1e6)));
Assert("format_time_ns centisecond", std::string("10ms"), format_time_ns(static_cast<uint64_t>(1e7)));
Assert("format_time_ns decisecond", std::string("100ms"), format_time_ns(static_cast<uint64_t>(1e8)));
Assert("format_time_ns second", std::string("1s"), format_time_ns(static_cast<uint64_t>(1e9)));
Assert("format_time_ns decasecond", std::string("10s"), format_time_ns(static_cast<uint64_t>(1e10)));
Assert("format_time_ns minute", std::string("1m"), format_time_ns(static_cast<uint64_t>(6e10)));
Assert("format_time_ns milliday", std::string("1m 26s 400ms"), format_time_ns(static_cast<uint64_t>(864e8)));
Assert("format_time_ns hectosecond", std::string("1m 40s"), format_time_ns(static_cast<uint64_t>(1e11)));
Assert("format_time_ns kilosecond", std::string("16m 40s"), format_time_ns(static_cast<uint64_t>(1e12)));
Assert("format_time_ns hour", std::string("1h"), format_time_ns(static_cast<uint64_t>(36e11)));
Assert("format_time_ns day", std::string("1j"), format_time_ns(static_cast<uint64_t>(864e11)));
Assert("format_time_ns week/sennight", std::string("1w"), format_time_ns(static_cast<uint64_t>(6048e11)));
Assert("format_time_ns megasecond", std::string("1w 4j 13h 46m 40s"), format_time_ns(static_cast<uint64_t>(1e15)));
Assert("format_time_ns fortnight", std::string("2w"), format_time_ns(static_cast<uint64_t>(12096e11)));
Assert("format_time_ns lunar month (draconitic)", std::string("3w 6j 5h 5m 35s 800ms"), format_time_ns(static_cast<uint64_t>(23511358e8)));
Assert("format_time_ns lunar month (tropical)", std::string("3w 6j 7h 43m 4s 700ms"), format_time_ns(static_cast<uint64_t>(23605847e8)));
Assert("format_time_ns lunar month (sidereal)", std::string("3w 6j 7h 43m 11s 600ms"), format_time_ns(static_cast<uint64_t>(23605916e8)));
Assert("format_time_ns lunar month (anomalistic)", std::string("3w 6j 13h 18m 33s 200ms"), format_time_ns(static_cast<uint64_t>(23807132e8)));
Assert("format_time_ns lunar month (synodic)", std::string("4w 1j 12h 44m 2s 900ms"), format_time_ns(static_cast<uint64_t>(25514429e8)));
Assert("format_time_ns month", std::string("1M"), format_time_ns(static_cast<uint64_t>(26784e11)));
Assert("format_time_ns quarantine", std::string("1M 1w 2j"), format_time_ns(static_cast<uint64_t>(3456e12)));
Assert("format_time_ns semester", std::string("4M 2j"), format_time_ns(static_cast<uint64_t>(108864e11)));
Assert("format_time_ns lunar year", std::string("11M 1w 6j 8h 52m 48s"), format_time_ns(static_cast<uint64_t>(30617568e9)));
Assert("format_time_ns year", std::string("1y"), format_time_ns(static_cast<uint64_t>(31536e12)));
Assert("format_time_ns tropical year", std::string("1y 5h 48m 45s 216ms"), format_time_ns(static_cast<uint64_t>(31556925216e6)));
Assert("format_time_ns gregorian year", std::string("1y 5h 49m 12s"), format_time_ns(static_cast<uint64_t>(31556952e9)));
Assert("format_time_ns sidereal year", std::string("1y 6h 9m 9s 763ms 545us 600ns"), format_time_ns(static_cast<uint64_t>(315581497635456e2)));
Assert("format_time_ns leap year", std::string("1y 1j"), format_time_ns(static_cast<uint64_t>(316224e11)));
Assert("format_time_ns olympiad", std::string("4y"), format_time_ns(static_cast<uint64_t>(126144e12)));
Assert("format_time_ns lusturm", std::string("5y"), format_time_ns(static_cast<uint64_t>(15768e13)));
Assert("format_time_ns decade", std::string("10y"), format_time_ns(static_cast<uint64_t>(31536e13)));
Assert("format_time_ns indiction", std::string("15y"), format_time_ns(static_cast<uint64_t>(47304e13)));
Assert("format_time_ns score", std::string("20y"), format_time_ns(static_cast<uint64_t>(63072e13)));
Assert("format_time_ns gigasecond", std::string("31y 8M 1w 4j 1h 46m 40s"), format_time_ns(static_cast<uint64_t>(1e18)));
Assert("format_time_ns jubilee", std::string("50y"), format_time_ns(static_cast<uint64_t>(15768e14)));
Assert("format_time_ns century", std::string("1c"), format_time_ns(static_cast<uint64_t>(31536e14)));
// Cannot use numbers bigger than currently supported by ISO C++
//Assert("format_time_ns millennium", std::string("10c"), format_time_ns(static_cast<uint64_t>(31536e15)));
//Assert("format_time_ns age", std::string("257c 72y"), format_time_ns(static_cast<uint64_t>(812745792e12)));
//Assert("format_time_ns terasecond", std::string("3170c 97y 10M 3w 4j 17h 46m 40s"), format_time_ns(static_cast<uint64_t>(1e22)));
//Assert("format_time_ns megaannum", std::string("10000c"), format_time_ns(static_cast<uint64_t>(31536e18)));
//Assert("format_time_ns petasecond", std::string("317097c 91y 11M 2w 4j 1h 46m 40s"), format_time_ns(static_cast<uint64_t>(1e24)));
//Assert("format_time_ns galactic year", std::string("2300000c"), format_time_ns(static_cast<uint64_t>(725328e19)));
//Assert("format_time_ns eon", std::string("10000000c"), format_time_ns(static_cast<uint64_t>(31536e21)));
//Assert("format_time_ns kalpa", std::string("43200000c"), format_time_ns(static_cast<uint64_t>(13623552e19)));
//Assert("format_time_ns exasecond", std::string("317097919c 83y 9M 1h 46m 40s"), format_time_ns(static_cast<uint64_t>(1e27)));
//Assert("format_time_ns zettasecond", std::string(""), format_time_ns(static_cast<uint64_t>(1e30)));
//Assert("format_time_ns yottasecond", std::string(""), format_time_ns(static_cast<uint64_t>(1e33)));
//Assert("format_time_ns ronnasecond", std::string(""), format_time_ns(static_cast<uint64_t>(1e36)));
//Assert("format_time_ns quettasecond", std::string(""), format_time_ns(static_cast<uint64_t>(1e39)));
//Assert("format_time_ns millennium", std::string("10c"), format_time_ns(static_cast<uint64_t>(31536e15)));
//Assert("format_time_ns age", std::string("257c 72y"), format_time_ns(static_cast<uint64_t>(812745792e12)));
//Assert("format_time_ns terasecond", std::string("3170c 97y 10M 3w 4j 17h 46m 40s"), format_time_ns(static_cast<uint64_t>(1e22)));
//Assert("format_time_ns megaannum", std::string("10000c"), format_time_ns(static_cast<uint64_t>(31536e18)));
//Assert("format_time_ns petasecond", std::string("317097c 91y 11M 2w 4j 1h 46m 40s"), format_time_ns(static_cast<uint64_t>(1e24)));
//Assert("format_time_ns galactic year", std::string("2300000c"), format_time_ns(static_cast<uint64_t>(725328e19)));
//Assert("format_time_ns eon", std::string("10000000c"), format_time_ns(static_cast<uint64_t>(31536e21)));
//Assert("format_time_ns kalpa", std::string("43200000c"), format_time_ns(static_cast<uint64_t>(13623552e19)));
//Assert("format_time_ns exasecond", std::string("317097919c 83y 9M 1h 46m 40s"), format_time_ns(static_cast<uint64_t>(1e27)));
//Assert("format_time_ns zettasecond", std::string(""), format_time_ns(static_cast<uint64_t>(1e30)));
//Assert("format_time_ns yottasecond", std::string(""), format_time_ns(static_cast<uint64_t>(1e33)));
//Assert("format_time_ns ronnasecond", std::string(""), format_time_ns(static_cast<uint64_t>(1e36)));
//Assert("format_time_ns quettasecond", std::string(""), format_time_ns(static_cast<uint64_t>(1e39)));
// uint64_t_MAX == 2**64 == 18446744073709551615I64u == -1
Assert("format_time_ns max", std::string("5c 84y 11M 2j 23h 34m 33s 709ms 551us 615ns"), format_time_ns(static_cast<uint64_t>(-1)));
}
/**
* @brief Test suite for the thousand_sep output
*/
void thousand_sep_test(void) noexcept {
// https://en.wikipedia.org/wiki/Names_of_large_numbers
Assert("thousand_sep null", std::string("0"), thousand_sep(static_cast<uint64_t>(0)));
@ -182,4 +202,3 @@ void thousand_sep_test(void) noexcept {
// uint64_t_MAX == 2**64 == 18446744073709551615I64u == -1
Assert("thousand_sep max", std::string("18,446,744,073,709,551,615"), thousand_sep(static_cast<uint64_t>(-1)));
}

View File

@ -1,6 +1,21 @@
#pragma once
/**
* @brief Test suite for the format_byte_size output
*/
void format_byte_size_test(void) noexcept;
/**
* @brief Test suite for the format_time output
*/
void format_time_test(void) noexcept;
/**
* @brief Test suite for the format_time_ns output
*/
void format_time_ns_test(void) noexcept;
/**
* @brief Test suite for the thousand_sep output
*/
void thousand_sep_test(void) noexcept;

19
docker-compose.yaml Normal file
View File

@ -0,0 +1,19 @@
services:
downloader:
extends:
file: ./downloader/docker-compose.yaml
service: downloader
violajones-cpp:
extends:
file: ./cpp/docker-compose.yaml
service: violajones-cpp
depends_on:
downloader:
condition: service_completed_successfully
violajones-python:
extends:
file: ./python/docker-compose.yaml
service: violajones-python
depends_on:
downloader:
condition: service_completed_successfully

View File

@ -1,42 +0,0 @@
#!/usr/bin/env bash
#!/bin/sh
# Exit if any of the commands doesn't exit with code 0
set -e
EXEC_DIR=$1
test -z $EXEC_DIR && EXEC_DIR=.
DATA_LOCATION=$EXEC_DIR/data
mkdir -p $DATA_LOCATION
if [ ! -f $DATA_LOCATION/X_train.bin ] || [ ! -f $DATA_LOCATION/X_test.bin ] \
|| [ ! -f $DATA_LOCATION/y_train.bin ] || [ ! -f $DATA_LOCATION/y_test.bin ]; then
#if true; then
if [ ! -f $DATA_LOCATION/faces.tar.gz ]; then
echo 'Downloading raw dataset'
curl -o $DATA_LOCATION/faces.tar.gz http://www.ai.mit.edu/courses/6.899/lectures/faces.tar.gz
fi
echo 'Extracting raw files'
tar xzf $DATA_LOCATION/faces.tar.gz -C $DATA_LOCATION
rm $DATA_LOCATION/README
rm $DATA_LOCATION/svm.*
echo 'Extracting raw train set'
tar xzf $DATA_LOCATION/face.train.tar.gz -C $DATA_LOCATION
rm $DATA_LOCATION/face.train.tar.gz
echo 'Extracting raw test set'
tar xzf $DATA_LOCATION/face.test.tar.gz -C $DATA_LOCATION
rm $DATA_LOCATION/face.test.tar.gz
echo 'Converting raw dataset to bin file'
source $EXEC_DIR/python/activate.sh $EXEC_DIR
python $EXEC_DIR/python/convert_dataset.py $DATA_LOCATION
echo 'Removing leftovers'
rm -rf $DATA_LOCATION/train
rm -rf $DATA_LOCATION/test
echo 'Done!'
fi

11
downloader/Dockerfile Normal file
View File

@ -0,0 +1,11 @@
FROM alpine:3.20.3
RUN apk add --no-cache curl=8.11.0-r1 python3=3.12.7-r0 && rm -rf /var/cache/apk*
WORKDIR /home/ViolaJones/downloader
COPY requirements.txt activate.sh ./
RUN ./activate.sh
COPY download_data.sh convert_dataset.py ./
CMD ["./download_data.sh"]

25
downloader/activate.sh Executable file
View File

@ -0,0 +1,25 @@
#!/bin/sh
# Exit if any of the commands doesn't exit with code 0
set -e
test -z "$EXEC_DIR" && EXEC_DIR=.
test -z "$VENV_PATH" && VENV_PATH="$EXEC_DIR/.venv"
activate(){
if [ ! -d "$VENV_PATH" ]; then
echo 'Creating python virtual environment'
python -m venv --upgrade-deps "$VENV_PATH"
echo 'Activating virtual environment'
activate
echo 'Installing requirements'
pip install -r requirements.txt
elif [ -f "$VENV_PATH"/Scripts/activate ]; then . "$VENV_PATH"/Scripts/activate
elif [ -f "$VENV_PATH"/bin/activate ]; then . "$VENV_PATH"/bin/activate
else
echo 'Python virtual environment not detected'
exit 1
fi
}
activate

View File

@ -0,0 +1,62 @@
from io import BufferedReader
from tqdm import tqdm
from functools import partial
from sys import argv
import numpy as np
from numpy.typing import NDArray
from typing import Final, Callable
from os import path, listdir
# Induce determinism
np.random.seed(196_863)
# Makes the 'leave' argument default to False
tqdm: Callable = partial(tqdm, leave = False)
def read_pgm(pgm_file: BufferedReader) -> NDArray[np.uint8]:
"""Read the data of a PGM file
Args:
pgm_file (BufferedReader): PGM File
Returns:
NDArray[np.uint8]: PGM data
"""
assert (f := pgm_file.readline()) == b'P5\n', f'Incorrect file format: {f}'
(width, height) = (int(i) for i in pgm_file.readline().split())
assert width > 0 and height > 0, f'Incorrect dimensions: {width}x{height}'
assert (depth := int(pgm_file.readline())) < 256, f'Incorrect depth: {depth}'
buff: Final[NDArray[np.uint8]] = np.empty(height * width, dtype = np.uint8)
for i in range(buff.shape[0]):
buff[i] = ord(pgm_file.read(1))
return buff.reshape((height, width))
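A minimal usage sketch of read_pgm (the path is hypothetical; the MIT face set fetched by download_data.sh ships 19x19 greyscale PGM images):

with open('data/train/face/face00001.pgm', 'rb') as pgm:   # hypothetical filename
    img = read_pgm(pgm)
print(img.shape, img.dtype)                                # expected: (19, 19) uint8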
def __main__(data_path: str) -> None:
"""Read the data of every PGM file and output it in data files
Args:
data_path (str): Path of the PGM files
"""
for set_name in tqdm(['train', 'test'], desc = 'set name'):
X, y = [], []
for y_i, label in enumerate(tqdm(['non-face', 'face'], desc = 'label')):
for filename in tqdm(listdir(f'{data_path}/{set_name}/{label}'), desc = 'Reading pgm file'):
with open(f'{data_path}/{set_name}/{label}/{filename}', 'rb') as face:
X.append(read_pgm(face))
y.append(y_i)
X, y = np.asarray(X), np.asarray(y)
idx: NDArray[np.int64] = np.random.permutation(y.shape[0])
X, y = X[idx], y[idx]
for org, s in tqdm(zip('Xy', [X, y]), desc = f'Writing {set_name}'):
with open(f'{data_path}/{org}_{set_name}.bin', 'w') as out:
out.write(f'{str(s.shape)[1:-1].replace(',', '')}\n')
raw: NDArray = s.ravel()
for s_i in tqdm(raw[:-1], desc = f'Writing {org}'):
out.write(f'{s_i} ')
out.write(str(raw[-1]))
if __name__ == '__main__':
__main__(argv[1]) if len(argv) == 2 else print(f'Usage: python {__file__[__file__.rfind(path.sep) + 1:]} ./data_location')
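The writer above produces a plain-text format: a header line with the whitespace-separated shape, then every value of the flattened array separated by spaces. A sketch of reading such a file back (load_bin is a hypothetical helper; the repo's own loaders live in the C++ and Python modules):

import numpy as np

def load_bin(filename: str) -> np.ndarray:
    with open(filename, 'r') as f:
        shape = tuple(int(d) for d in f.readline().split())
        return np.loadtxt(f, dtype = np.int64).reshape(shape)

X_train = load_bin('../data/X_train.bin')   # expected shape: (n_images, 19, 19)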

View File

@ -0,0 +1,7 @@
services:
downloader:
image: saundersp/violajones-downloader
pull_policy: never
build: .
volumes:
- ../data:/home/ViolaJones/data

38
downloader/download_data.sh Executable file
View File

@ -0,0 +1,38 @@
#!/bin/sh
# Exit if any of the commands doesn't exit with code 0
set -e
test -z "$EXEC_DIR" && EXEC_DIR=.
DATA_PATH="$EXEC_DIR/../data"
test ! -d "$DATA_PATH" && mkdir -v "$DATA_PATH"
if [ ! -f "$DATA_PATH"/X_train.bin ] || [ ! -f "$DATA_PATH"/X_test.bin ] \
|| [ ! -f "$DATA_PATH"/y_train.bin ] || [ ! -f "$DATA_PATH"/y_test.bin ]; then
if [ ! -f "$DATA_PATH"/faces.tar.gz ]; then
echo 'Downloading raw dataset'
curl -o "$DATA_PATH"/faces.tar.gz http://www.ai.mit.edu/courses/6.899/lectures/faces.tar.gz
fi
echo 'Extracting raw files'
tar xvzf "$DATA_PATH"/faces.tar.gz -C "$DATA_PATH"
rm -v "$DATA_PATH"/README "$DATA_PATH"/svm.*
echo 'Extracting raw train set'
tar xvzf "$DATA_PATH"/face.train.tar.gz -C "$DATA_PATH"
rm -v "$DATA_PATH"/face.train.tar.gz
echo 'Extracting raw test set'
tar xvzf "$DATA_PATH"/face.test.tar.gz -C "$DATA_PATH"
rm -v "$DATA_PATH"/face.test.tar.gz
echo 'Converting raw dataset to bin file'
export EXEC_DIR
. "$EXEC_DIR"/activate.sh
python "$EXEC_DIR"/convert_dataset.py "$DATA_PATH"
echo 'Removing leftovers'
rm -rvf "$DATA_PATH"/train "$DATA_PATH"/test
echo 'Done!'
fi

View File

@ -0,0 +1,2 @@
numpy==2.1.3
tqdm==4.67.0

13
python/Dockerfile Normal file
View File

@ -0,0 +1,13 @@
FROM nvidia/cuda:12.6.2-devel-ubi9
RUN dnf install -y python3.12-3.12.1-4.el9_4.4 \
&& dnf clean all \
&& ln -s /usr/bin/python3.12 /usr/bin/python
WORKDIR /home/ViolaJones/python
COPY Makefile activate.sh requirements.txt ./
RUN make venv
COPY *.py ./
ENTRYPOINT ["make"]
CMD ["start"]

View File

@ -1,34 +1,87 @@
DATA := ../data/X_train.bin ../data/X_test.bin ../data/y_train.bin ../data/y_test.bin
MODELS_DIR := models
OUT_DIR := out
DATA_PATH := ../data
DATA := $(DATA_PATH)/X_train.bin $(DATA_PATH)/X_test.bin $(DATA_PATH)/y_train.bin $(DATA_PATH)/y_test.bin
.PHONY: all start reset
.PHONY: all
all: venv
all: ${DATA}
${DATA}:
@bash ../download_data.sh ..
$(DATA):
@echo 'Missing $(DATA) files, use downloader first' && exit 1
.PHONY: venv
venv:
@bash -c 'source activate.sh'
@sh -c '. ./activate.sh'
start: ${DATA} venv
@bash -c 'source activate.sh && python projet.py'
.PHONY: start
start: $(DATA) | venv check-python-works
@sh -c '. ./activate.sh && python projet.py'
reset:
@echo Deleting generated states and models
@rm -rf out/* models/* | true
.PHONY: test
test: | venv check-python-works
@sh -c '. ./activate.sh && python project_test.py'
debug:
.PHONY: debug
debug: $(DATA) | venv check-python-works check-pudb-works
@bash -c 'source activate.sh && pudb projet.py'
profile:
@bash -c 'source activate.sh && python -m cProfile -o prof.out projet.py && gprof2dot -f pstats prof.out | dot -Tpng -o output.png'
.PHONY: profile
profile: $(DATA) | venv check-python-works check-gprof2dot-works check-dot-works
@bash -c 'source activate.sh && python -m cProfile -o prof.out projet.py && gprof2dot -f pstats prof.out | dot -T png -o output.png'
mrproper: reset
@rm -r __pycache__ venv
.PHONY: log
log: $(DATA) reset | venv
@sed -i 's/GPU_BOOSTED: Final = False/GPU_BOOSTED: Final = True/;s/COMPILE_WITH_C: Final = False/COMPILE_WITH_C: Final = True/' config.py
@echo 'Logging GPU'
@make -s start > log_gpu
@sed -i 's/GPU_BOOSTED: Final = True/GPU_BOOSTED: Final = False/' config.py
@echo 'Logging CPU'
@make -s start > log_cpu
@sed -i 's/GPU_BOOSTED: Final = False/GPU_BOOSTED: Final = True/;s/COMPILE_WITH_C: Final = True/COMPILE_WITH_C: Final = False/' config.py
@echo 'Logging PGPU'
@make -s start > log_pgpu
@sed -i 's/GPU_BOOSTED: Final = True/GPU_BOOSTED: Final = False/' config.py
@echo 'Logging PY'
@make -s start > log_py
@echo 'Cleaning up'
@make -s reset
test:
@bash -c 'source activate.sh && ls out | sed s/.pkl// | xargs -n1 python test_diff.py out'
@bash -c 'source activate.sh && ls models | sed s/.pkl// | xargs -n1 python test_diff.py models'
.PHONY: reset
reset:
@echo 'Deleting generated states and models'
@rm -frv $(OUT_DIR) $(MODELS_DIR)
.PHONY: clean
clean:
@rm -fv log_gpu log_cpu log_pgpu log_py
.PHONY: mrproper
mrproper: clean
@rm -rfv __pycache__ .venv
.PHONY: help
help:
@echo "all start reset mrproper help"
@echo "Available targets:"
@echo "\tall: alias for start, (default target)"
@echo "\tvenv: Create python virtual environnement."
@echo "\tstart: Start the ViolaJones algorithm, require data beforehand downloaded by the downloader."
@echo "\tdebug: Debug the ViolaJones algorithm, require data beforehand downloaded by the downloader."
@echo "\tprofile: Profile the ViolaJones algorithm functions timestamps, require data beforehand downloaded by the downloader."
@echo "\treset: Will delete any saved models and processed data made by ViolaJones."
@echo "\tmrproper: Will remove cpp binary files. Will execute reset target beforehand."
.PHONY: check-python-works
check-python-works:
@python --version >/dev/null 2>&1 || (echo 'Please install Python.' && exit 1)
.PHONY: check-pudb-works
check-pudb-works:
@pudb --version >/dev/null 2>&1 || (echo 'Please install pudb.' && exit 1)
.PHONY: check-gprof2dot-works
check-gprof2dot-works:
@gprof2dot --help >/dev/null 2>&1 || (echo 'Please install gprof2dot.' && exit 1)
.PHONY: check-dot-works
check-dot-works:
@dot --version >/dev/null 2>&1 || (echo 'Please install dot from graphviz.' && exit 1)

View File

@ -18,13 +18,13 @@ else:
@njit('uint8[:, :, :, :](uint16, uint16)')
def build_features(width: int, height: int) -> np.ndarray:
"""Initialize the features base on the input shape.
"""Initialize the features based on the input shape.
Args:
shape (Tuple[int, int]): Shape of the image (Width, Height).
shape (Tuple[int, int]): Shape of the image (Width, Height)
Returns:
np.ndarray: The initialized features.
np.ndarray: The initialized features
"""
feats = []
empty = (0, 0, 0, 0)
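For orientation: each generated feature is stored as two pairs of rectangles, feats[i] = [[p, p'], [n, n']], every rectangle being an (x, y, w, h) tuple and `empty` marking an unused slot. The feature value computed later by apply_features is the difference of the summed areas,

f(x) = (S(p) + S(p')) - (S(n) + S(n'))

where S(r) is the pixel sum of rectangle r taken from the integral image.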
@ -63,10 +63,10 @@ def init_weights(y_train: np.ndarray) -> np.ndarray:
"""Initialize the weights of the weak classifiers based on the training labels.
Args:
y_train (np.ndarray): Training labels.
y_train (np.ndarray): Training labels
Returns:
np.ndarray: The initialized weights.
np.ndarray: The initialized weights
"""
weights = np.empty_like(y_train, dtype = np.float64)
t = y_train.sum()
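The hunk cuts the body off here; the initialization this presumably follows is the usual Viola-Jones scheme, giving each class half of the total weight:

w_i = \frac{1}{2t} \text{ if } y_i = 1, \qquad w_i = \frac{1}{2(n - t)} \text{ if } y_i = 0

with t = y_train.sum() the number of positive samples among n.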
@ -79,26 +79,48 @@ def classify_weak_clf(x_feat_i: np.ndarray, threshold: int, polarity: int) -> np
"""Classify the integrated features based on polarity and threshold.
Args:
x_feat_i (np.ndarray): Integrated features.
threshold (int): Trained threshold.
polarity (int): Trained polarity.
x_feat_i (np.ndarray): Integrated features
threshold (int): Trained threshold
polarity (int): Trained polarity
Returns:
np.ndarray: Classified features.
np.ndarray: Classified features
"""
res = np.zeros_like(x_feat_i, dtype = np.int8)
res[polarity * x_feat_i < polarity * threshold] = 1
return res
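In formula form, the stump above is the standard Viola-Jones weak classifier with feature value f_j(x), threshold \theta_j and polarity p_j \in \{-1, 1\}:

h_j(x) = \mathbb{1}\left[\, p_j f_j(x) < p_j \theta_j \,\right]

which is exactly the `polarity * x_feat_i < polarity * threshold` mask in the code.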
@njit('Tuple((int32, float64, float64[:]))(int32[:, :], float64[:], int32[:, :], uint8[:])')
def select_best(classifiers: np.ndarray, weights: np.ndarray, X_feat: np.ndarray, y: np.ndarray) -> Tuple[int, float, np.ndarray]:
"""Select the best classifier given theirs predictions.
@njit('uint8[:](float64[:], int32[:, :], int32[:, :])')
def classify_viola_jones(alphas: np.ndarray, classifiers: np.ndarray, X_feat: np.ndarray) -> np.ndarray:
"""Classify the trained classifiers on the given features.
Args:
classifiers (np.ndarray): The weak classifiers.
weights (np.ndarray): Trained weights of each classifiers.
X_feat (np.ndarray): Integrated features.
y (np.ndarray): Features labels.
alphas (np.ndarray): Trained alphas
classifiers (np.ndarray): Trained classifiers
X_feat (np.ndarray): Integrated features
Returns:
np.ndarray: Classification results
"""
total = np.zeros(X_feat.shape[1], dtype = np.float64)
for i, alpha in enumerate(tqdm_iter(alphas, "Classifying ViolaJones")):
(j, threshold, polarity) = classifiers[i]
total += alpha * classify_weak_clf(X_feat[j], threshold, polarity)
y_pred = np.zeros(X_feat.shape[1], dtype = np.uint8)
y_pred[total >= 0.5 * np.sum(alphas)] = 1
return y_pred
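The loop above is the standard AdaBoost vote; with trained weights \alpha_t and weak classifiers h_t the prediction is

H(x) = \mathbb{1}\left[\, \sum_{t=1}^{T} \alpha_t h_t(x) \ge \tfrac{1}{2} \sum_{t=1}^{T} \alpha_t \,\right]

which matches the `total >= 0.5 * np.sum(alphas)` test.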
@njit('Tuple((int32, float64, float64[:]))(int32[:, :], float64[:], int32[:, :], uint8[:])')
def select_best(classifiers: np.ndarray, weights: np.ndarray, X_feat: np.ndarray, y: np.ndarray) -> Tuple[int, float, np.ndarray]:
"""Select the best classifier given their predictions.
Args:
classifiers (np.ndarray): The weak classifiers
weights (np.ndarray): Trained weights of each classifiers
X_feat (np.ndarray): Integrated features
y (np.ndarray): Features labels
Returns:
Tuple[int, float, np.ndarray]: Index of the best classifier, the best error and the best accuracy
@ -116,13 +138,13 @@ def train_viola_jones(T: int, X_feat: np.ndarray, X_feat_argsort: np.ndarray, y:
"""Train the weak classifiers.
Args:
T (int): Number of weak classifiers.
X_feat (np.ndarray): Integrated features.
X_feat_argsort (np.ndarray): Sorted indexes of the integrated features.
y (np.ndarray): Features labels.
T (int): Number of weak classifiers
X_feat (np.ndarray): Integrated features
X_feat_argsort (np.ndarray): Sorted indexes of the integrated features
y (np.ndarray): Features labels
Returns:
Tuple[np.ndarray, np.ndarray]: List of trained alphas and the list of the final classifiers.
Tuple[np.ndarray, np.ndarray]: List of trained alphas and the list of the final classifiers
"""
weights = init_weights(y)
alphas, final_classifier = np.empty(T, dtype = np.float64), np.empty((T, 3), dtype = np.int32)
@ -139,44 +161,22 @@ def train_viola_jones(T: int, X_feat: np.ndarray, X_feat_argsort: np.ndarray, y:
return alphas, final_classifier
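The unchanged loop between these two hunks performs the AdaBoost round; for reference, after selecting the weak classifier with the lowest weighted error e, the update used in the original Viola-Jones paper is

\beta = \frac{e}{1 - e}, \qquad \alpha = \ln\frac{1}{\beta}, \qquad w_i \leftarrow w_i\, \beta^{1 - |h(x_i) - y_i|}

followed by renormalizing the weights.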
@njit('uint8[:](float64[:], int32[:, :], int32[:, :])')
def classify_viola_jones(alphas: np.ndarray, classifiers: np.ndarray, X_feat: np.ndarray) -> np.ndarray:
"""Classify the trained classifiers on the given features.
Args:
alphas (np.ndarray): Trained alphas.
classifiers (np.ndarray): Trained classifiers.
X_feat (np.ndarray): Integrated features.
Returns:
np.ndarray: Classification results.
"""
total = np.zeros(X_feat.shape[1], dtype = np.float64)
for i, alpha in enumerate(tqdm_iter(alphas, "Classifying ViolaJones")):
(j, threshold, polarity) = classifiers[i]
total += alpha * classify_weak_clf(X_feat[j], threshold, polarity)
y_pred = np.zeros(X_feat.shape[1], dtype = np.uint8)
y_pred[total >= 0.5 * np.sum(alphas)] = 1
return y_pred
@njit
def get_best_anova_features(X: np.ndarray, y: np.ndarray) -> np.ndarray:
#SelectPercentile(f_classif, percentile = 10).fit(X, y).get_support(indices = True)
classes = [X.T[y == 0].astype(np.float64), X.T[y == 1].astype(np.float64)]
n_samples_per_class = np.asarray([classes[0].shape[0], classes[1].shape[0]])
n_samples = classes[0].shape[0] + classes[1].shape[0]
ss_alldata = (classes[0] ** 2).sum(axis = 0) + (classes[1] ** 2).sum(axis = 0)
sums_classes = [np.asarray(classes[0].sum(axis = 0)), np.asarray(classes[1].sum(axis = 0))]
sq_of_sums_alldata = (sums_classes[0] + sums_classes[1]) ** 2
sq_of_sums_args = [sums_classes[0] ** 2, sums_classes[1] ** 2]
ss_tot = ss_alldata - sq_of_sums_alldata / n_samples
sqd_sum_bw_n = sq_of_sums_args[0] / n_samples_per_class[0] + \
sq_of_sums_args[1] / n_samples_per_class[1] - sq_of_sums_alldata / n_samples
ss_wn = ss_tot - sqd_sum_bw_n
df_wn = n_samples - 2
msw = ss_wn / df_wn
f_values = sqd_sum_bw_n / msw
return np.sort(np.argsort(f_values)[::-1][: int(np.ceil(X.shape[0] / 10.0))])
#@njit
#def get_best_anova_features(X: np.ndarray, y: np.ndarray) -> np.ndarray:
# #SelectPercentile(f_classif, percentile = 10).fit(X, y).get_support(indices = True)
# classes = [X.T[y == 0].astype(np.float64), X.T[y == 1].astype(np.float64)]
# n_samples_per_class = np.asarray([classes[0].shape[0], classes[1].shape[0]])
# n_samples = classes[0].shape[0] + classes[1].shape[0]
# ss_all_data = (classes[0] ** 2).sum(axis = 0) + (classes[1] ** 2).sum(axis = 0)
# sums_classes = [np.asarray(classes[0].sum(axis = 0)), np.asarray(classes[1].sum(axis = 0))]
# sq_of_sums_all_data = (sums_classes[0] + sums_classes[1]) ** 2
# sq_of_sums_args = [sums_classes[0] ** 2, sums_classes[1] ** 2]
# ss_tot = ss_all_data - sq_of_sums_all_data / n_samples
#
# sqd_sum_bw_n = sq_of_sums_args[0] / n_samples_per_class[0] + \
# sq_of_sums_args[1] / n_samples_per_class[1] - sq_of_sums_all_data / n_samples
# ss_wn = ss_tot - sqd_sum_bw_n
# df_wn = n_samples - 2
# msw = ss_wn / df_wn
# f_values = sqd_sum_bw_n / msw
# return np.sort(np.argsort(f_values)[::-1][: int(np.ceil(X.shape[0] / 10.0))])
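For reference, the commented-out block computes a per-feature one-way ANOVA F-statistic over the two classes and keeps the indices of the top 10 % of features:

F_j = \frac{SS_{\text{between},j} / (k - 1)}{SS_{\text{within},j} / (n - k)}

with k = 2 classes (so the between-class divisor k - 1 = 1), matching f_values = sqd_sum_bw_n / msw and df_wn = n - 2 in the code.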

View File

@ -18,10 +18,10 @@ def set_integral_image(X: np.ndarray) -> np.ndarray:
"""Transform the input images in integrated images (CPU version).
Args:
X (np.ndarray): Dataset of images.
X (np.ndarray): Dataset of images
Returns:
np.ndarray: Dataset of integrated images.
np.ndarray: Dataset of integrated images
"""
X_ii = np.empty_like(X, dtype = np.uint32)
for i, Xi in enumerate(tqdm_iter(X, "Applying integral image")):
@ -34,59 +34,18 @@ def set_integral_image(X: np.ndarray) -> np.ndarray:
X_ii[i] = ii
return X_ii
@njit('uint32(uint32[:, :], int16, int16, int16, int16)')
def __compute_feature__(ii: np.ndarray, x: int, y: int, w: int, h: int) -> int:
"""Compute a feature on an integrated image at a specific coordinate (CPU version).
Args:
ii (np.ndarray): Integrated image.
x (int): X coordinate.
y (int): Y coordinate.
w (int): width of the feature.
h (int): height of the feature.
Returns:
int: Computed feature.
"""
return ii[y + h, x + w] + ii[y, x] - ii[y + h, x] - ii[y, x + w]
@njit('int32[:, :](uint8[:, :, :, :], uint32[:, :, :])')
def apply_features(feats: np.ndarray, X_ii: np.ndarray) -> np.ndarray:
"""Apply the features on a integrated image dataset (CPU version).
Args:
feats (np.ndarray): Features to apply.
X_ii (np.ndarray): Integrated image dataset.
Returns:
np.ndarray: Applied features.
"""
X_feat = np.empty((feats.shape[0], X_ii.shape[0]), dtype = np.int32)
for i, (p, n) in enumerate(tqdm_iter(feats, "Applying features")):
for j, x_i in enumerate(X_ii):
p_x, p_y, p_w, p_h = p[0]
p1_x, p1_y, p1_w, p1_h = p[1]
n_x, n_y, n_w, n_h = n[0]
n1_x, n1_y, n1_w, n1_h = n[1]
p1 = __compute_feature__(x_i, p_x, p_y, p_w, p_h) + __compute_feature__(x_i, p1_x, p1_y, p1_w, p1_h)
n1 = __compute_feature__(x_i, n_x, n_y, n_w, n_h) + __compute_feature__(x_i, n1_x, n1_y, n1_w, n1_h)
X_feat[i, j] = int32(p1) - int32(n1)
return X_feat
@njit('int32[:, :](int32[:, :], uint16[:, :], uint8[:], float64[:])')
def train_weak_clf(X_feat: np.ndarray, X_feat_argsort: np.ndarray, y: np.ndarray, weights: np.ndarray) -> np.ndarray:
"""Train the weak classifiers on a given dataset (CPU version).
Args:
X_feat (np.ndarray): Feature images dataset.
X_feat_argsort (np.ndarray): Sorted indexes of the integrated features.
y (np.ndarray): Labels of the features.
weights (np.ndarray): Weights of the features.
X_feat (np.ndarray): Feature images dataset
X_feat_argsort (np.ndarray): Sorted indexes of the integrated features
y (np.ndarray): Labels of the features
weights (np.ndarray): Weights of the features
Returns:
np.ndarray: Trained weak classifiers.
np.ndarray: Trained weak classifiers
"""
total_pos, total_neg = weights[y == 1].sum(), weights[y == 0].sum()
@ -112,29 +71,85 @@ def train_weak_clf(X_feat: np.ndarray, X_feat_argsort: np.ndarray, y: np.ndarray
classifiers[i] = (best_threshold, best_polarity)
return classifiers
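The scan behind this docstring is the classic Viola-Jones threshold search: walking the examples in sorted feature order, the weighted error of placing the threshold at the current position is

e = \min\bigl( S^+ + (T^- - S^-),\; S^- + (T^+ - S^+) \bigr)

where T^+ and T^- are the total weights of positive and negative examples (total_pos and total_neg above) and S^+, S^- the weights seen so far; the stump keeps the threshold and polarity minimizing e.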
@njit('uint32(uint32[:, :], int16, int16, int16, int16)')
def __compute_feature__(ii: np.ndarray, x: int, y: int, w: int, h: int) -> int:
"""Compute a feature on an integrated image at a specific coordinate (CPU version).
Args:
ii (np.ndarray): Integrated image
x (int): X coordinate
y (int): Y coordinate
w (int): width of the feature
h (int): height of the feature
Returns:
int: Computed feature
"""
return ii[y + h, x + w] + ii[y, x] - ii[y + h, x] - ii[y, x + w]
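This is the classic four-lookup trick: whatever its area, a rectangle sum costs only four reads of the integral image,

S(x, y, w, h) = ii(y+h,\, x+w) - ii(y,\, x+w) - ii(y+h,\, x) + ii(y,\, x)

so every feature evaluation in apply_features below stays O(1) per rectangle.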
@njit('int32[:, :](uint8[:, :, :, :], uint32[:, :, :])')
def apply_features(feats: np.ndarray, X_ii: np.ndarray) -> np.ndarray:
"""Apply the features on a integrated image dataset (CPU version).
Args:
feats (np.ndarray): Features to apply
X_ii (np.ndarray): Integrated image dataset
Returns:
np.ndarray: Applied features
"""
X_feat = np.empty((feats.shape[0], X_ii.shape[0]), dtype = np.int32)
for i, (p, n) in enumerate(tqdm_iter(feats, "Applying features")):
for j, x_i in enumerate(X_ii):
p_x, p_y, p_w, p_h = p[0]
p1_x, p1_y, p1_w, p1_h = p[1]
n_x, n_y, n_w, n_h = n[0]
n1_x, n1_y, n1_w, n1_h = n[1]
p1 = __compute_feature__(x_i, p_x, p_y, p_w, p_h) + __compute_feature__(x_i, p1_x, p1_y, p1_w, p1_h)
n1 = __compute_feature__(x_i, n_x, n_y, n_w, n_h) + __compute_feature__(x_i, n1_x, n1_y, n1_w, n1_h)
X_feat[i, j] = int32(p1) - int32(n1)
return X_feat
@njit('int32(int32[:], uint16[:], int32, int32)')
def as_partition(a: np.ndarray, indices: np.ndarray, l: int, h: int) -> int:
i = l - 1
j = l
for j in range(l, h + 1):
if a[indices[j]] < a[indices[h]]:
def _as_partition_(d_a: np.ndarray, d_indices: np.ndarray, low: int, high: int) -> int:
"""Partition of the argsort algorithm.
Args:
d_a (np.ndarray): Array to sort
d_indices (np.ndarray): Array of indices to write to
low (int): lower bound to sort
high (int): higher bound to sort
Returns:
int: Last index sorted
"""
i, j = low - 1, low
for j in range(low, high + 1):
if d_a[d_indices[j]] < d_a[d_indices[high]]:
i += 1
indices[i], indices[j] = indices[j], indices[i]
d_indices[i], d_indices[j] = d_indices[j], d_indices[i]
i += 1
indices[i], indices[j] = indices[j], indices[i]
d_indices[i], d_indices[j] = d_indices[j], d_indices[i]
return i
@njit('void(int32[:], uint16[:], int32, int32)')
def argsort_bounded(a: np.ndarray, indices: np.ndarray, l: int, h: int):
total = h - l + 1;
stack = np.empty((total,), dtype = np.int32)
stack[0] = l
stack[1] = h
top = 1;
def argsort_bounded(d_a: np.ndarray, d_indices: np.ndarray, low: int, high: int) -> None:
"""Perform an indirect sort of a given array within a given bound.
low = l
high = h
Args:
d_a (np.ndarray): Array to sort
d_indices (np.ndarray): Array of indices to write to
low (int): lower bound to sort
high (int): higher bound to sort
"""
total = high - low + 1
stack = np.empty((total,), dtype = np.int32)
stack[0] = low
stack[1] = high
top = 1
while top >= 0:
high = stack[top]
@ -143,24 +158,32 @@ def argsort_bounded(a: np.ndarray, indices: np.ndarray, l: int, h: int):
top -= 1
if low >= high:
break;
break
p = as_partition(a, indices, low, high);
p = _as_partition_(d_a, d_indices, low, high)
if p - 1 > low:
top += 1
stack[top] = low;
stack[top] = low
top += 1
stack[top] = p - 1;
stack[top] = p - 1
if p + 1 < high:
top += 1
stack[top] = p + 1;
stack[top] = p + 1
top += 1
stack[top] = high;
stack[top] = high
@njit('uint16[:, :](int32[:, :])')
def argsort(X_feat: np.ndarray) -> np.ndarray:
def argsort_2d(X_feat: np.ndarray) -> np.ndarray:
"""Perform an indirect sort of a given array.
Args:
X_feat (np.ndarray): Array to sort
Returns:
np.ndarray: Array of indices that sort the array
"""
indices = np.empty_like(X_feat, dtype = np.uint16)
indices[:, :] = np.arange(indices.shape[1])
for i in tqdm_iter(range(X_feat.shape[0]), "argsort"):
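A quick property check for argsort_2d (ties may be ordered differently than in np.argsort, so the sketch checks monotonicity instead of exact indices):

import numpy as np

X = np.random.randint(-1000, 1000, size = (5, 64)).astype(np.int32)
for row, order in zip(X, argsort_2d(X)):
    assert np.all(np.diff(row[order]) >= 0)   # every row comes out non-decreasing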

View File

@ -12,10 +12,10 @@ def __scanCPU_3d__(X: np.ndarray) -> np.ndarray:
"""Prefix Sum (scan) of a given dataset.
Args:
X (np.ndarray): Dataset of images to apply sum.
X (np.ndarray): Dataset of images to apply sum
Returns:
np.ndarray: Scanned dataset of images.
np.ndarray: Scanned dataset of images
"""
for x in range(X.shape[0]):
for y in range(X.shape[1]):
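A scan replaces each element by the running total along the row: the inclusive scan of [3, 1, 7, 0, 4] is [3, 4, 11, 11, 15]. Two such scans with a transpose in between yield the integral image; in NumPy terms:

import numpy as np

X = np.array([[3, 1, 7, 0, 4]])
print(np.cumsum(X, axis = 1))   # [[ 3  4 11 11 15]]

The GPU variant below implements the same operation as the work-efficient parallel scan from GPU Gems 3 (linked in the docstring further down).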
@ -30,10 +30,10 @@ def __kernel_scan_3d__(n: int, j: int, d_inter: np.ndarray, d_a: np.ndarray) ->
"""GPU kernel used to do a parallel prefix sum (scan).
Args:
n (int):
j (int): [description]
d_inter (np.ndarray): [description]
d_a (np.ndarray): [description]
n (int): Number of width blocks
j (int): Temporary sum index
d_inter (np.ndarray): Temporary sums on device to add
d_a (np.ndarray): Dataset of images on device to apply sum
"""
x_coor, y_coor = cuda.grid(2)
@ -76,10 +76,10 @@ def __add_3d__(d_X: np.ndarray, d_s: np.ndarray, n: int, m: int) -> None:
"""GPU kernel for parallel sum.
Args:
d_X (np.ndarray): Dataset of images.
d_s (np.ndarray): Temporary sums to add.
n (int): Number of width blocks.
m (int): Height of a block.
d_X (np.ndarray): Dataset of images on device
d_s (np.ndarray): Temporary sums on device to add
n (int): Number of width blocks
m (int): Height of a block
"""
x_coor, y_coor = cuda.grid(2)
if x_coor < n and y_coor < m:
@ -91,10 +91,10 @@ def __scanGPU_3d__(X: np.ndarray) -> np.ndarray:
Read more: https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
Args:
X (np.ndarray): Dataset of images.
X (np.ndarray): Dataset of images
Returns:
np.ndarray: Scanned dataset of images.
np.ndarray: Scanned dataset of images
"""
k, height, n = X.shape
n_block_x, n_block_y = np.ceil(np.divide(X.shape[1:], NB_THREADS_2D)).astype(np.uint64)
@ -131,10 +131,10 @@ def __transpose_kernel__(d_X: np.ndarray, d_Xt: np.ndarray) -> None:
"""GPU kernel of the function __transpose_3d__.
Args:
d_X (np.ndarray): Dataset of images.
d_Xt(np.ndarray): Transposed dataset of images.
width (int): Width of each images in the dataset.
height (int): Height of each images in the dataset.
d_X (np.ndarray): Dataset of images on device
d_Xt (np.ndarray): Transposed dataset of images
width (int): Width of each image in the dataset
height (int): Height of each image in the dataset
"""
temp = cuda.shared.array(NB_THREADS_2D, dtype = uint32)
@ -152,10 +152,10 @@ def __transpose_3d__(X: np.ndarray) -> np.ndarray:
"""Transpose every images in the given dataset.
Args:
X (np.ndarray): Dataset of images.
X (np.ndarray): Dataset of images
Returns:
np.ndarray: Transposed dataset of images.
np.ndarray: Transposed dataset of images
"""
n_block_x, n_block_z = np.ceil(np.divide(X.shape[1:], NB_THREADS_2D)).astype(np.uint64)
d_X = cuda.to_device(X)
@ -167,10 +167,10 @@ def set_integral_image(X: np.ndarray) -> np.ndarray:
"""Transform the input images in integrated images (GPU version).
Args:
X (np.ndarray): Dataset of images.
X (np.ndarray): Dataset of images
Returns:
np.ndarray: Dataset of integrated images.
np.ndarray: Dataset of integrated images
"""
X = X.astype(np.uint32)
X = __scanGPU_3d__(X)
@ -184,13 +184,13 @@ def __train_weak_clf_kernel__(d_classifiers: np.ndarray, d_y: np.ndarray, d_X_fe
"""GPU kernel of the function train_weak_clf.
Args:
d_classifiers (np.ndarray): Weak classifiers to train.
d_y (np.ndarray): Labels of the features.
d_X_feat (np.ndarray): Feature images dataset.
d_X_feat_argsort (np.ndarray): Sorted indexes of the integrated features.
d_weights (np.ndarray): Weights of the features.
total_pos (float): Total of positive labels in the dataset.
total_neg (float): Total of negative labels in the dataset.
d_classifiers (np.ndarray): Weak classifiers on device to train
d_y (np.ndarray): Labels of the features on device
d_X_feat (np.ndarray): Feature images dataset on device
d_X_feat_argsort (np.ndarray): Sorted indexes of the integrated features on device
d_weights (np.ndarray): Weights of the features on device
total_pos (float): Total of positive labels in the dataset
total_neg (float): Total of negative labels in the dataset
"""
i = cuda.blockIdx.x * cuda.blockDim.x * cuda.blockDim.y * cuda.blockDim.z
i += cuda.threadIdx.x * cuda.blockDim.y * cuda.blockDim.z
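These lines (and the ones the hunk cuts off) flatten the 3D launch configuration into one global index,

i = ((\text{blockIdx.x} \cdot D_x + t_x) \cdot D_y + t_y) \cdot D_z + t_z

with D = blockDim and t = threadIdx, which expands to exactly the sum of products written out in the kernel; each thread then handles one classifier index i.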
@ -224,13 +224,13 @@ def train_weak_clf(X_feat: np.ndarray, X_feat_argsort: np.ndarray, y: np.ndarray
"""Train the weak classifiers on a given dataset (GPU version).
Args:
X_feat (np.ndarray): Feature images dataset.
X_feat_argsort (np.ndarray): Sorted indexes of the integrated features.
y (np.ndarray): Labels of the features.
weights (np.ndarray): Weights of the features.
X_feat (np.ndarray): Feature images dataset
X_feat_argsort (np.ndarray): Sorted indexes of the integrated features
y (np.ndarray): Labels of the features
weights (np.ndarray): Weights of the features
Returns:
np.ndarray: Trained weak classifiers.
np.ndarray: Trained weak classifiers
"""
total_pos, total_neg = weights[y == 1].sum(), weights[y == 0].sum()
d_classifiers = cuda.to_device(np.empty((X_feat.shape[0], 2), dtype = np.int32))
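The kernel body itself is outside this hunk. For orientation, a hedged CPU sketch of the classic Viola-Jones stump selection it presumably parallelizes, sweeping one feature's sorted values and keeping the threshold/polarity with the lowest weighted error:
import numpy as np

def train_stump_cpu(x: np.ndarray, x_argsort: np.ndarray, y: np.ndarray,
                    weights: np.ndarray, total_pos: float, total_neg: float):
    pos_seen, neg_seen = 0.0, 0.0
    best_error, best_threshold, best_polarity = np.inf, 0, 0
    for j in x_argsort:
        # Error when everything below the current value is classified negative,
        # or, symmetrically, positive.
        error = min(pos_seen + total_neg - neg_seen,
                    neg_seen + total_pos - pos_seen)
        if error < best_error:
            best_error, best_threshold = error, x[j]
            best_polarity = 1 if pos_seen > neg_seen else -1
        if y[j] == 1:
            pos_seen += weights[j]
        else:
            neg_seen += weights[j]
    return best_threshold, best_polarity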
@ -247,52 +247,52 @@ def __compute_feature__(ii: np.ndarray, x: int, y: int, w: int, h: int) -> int:
"""Compute a feature on an integrated image at a specific coordinate (GPU version).
Args:
ii (np.ndarray): Integrated image.
x (int): X coordinate.
y (int): Y coordinate.
w (int): width of the feature.
h (int): height of the feature.
ii (np.ndarray): Integrated image
x (int): X coordinate
y (int): Y coordinate
w (int): width of the feature
h (int): height of the feature
Returns:
int: Computed feature.
int: Computed feature
"""
return ii[y + h, x + w] + ii[y, x] - ii[y + h, x] - ii[y, x + w]
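A quick standalone check of the four-corner identity above. Assumption: the integral images follow the exclusive convention (ii[y, x] holds the sum of all pixels strictly above and to the left of (x, y)); this toy version makes the convention explicit by padding with a zero row and column:
import numpy as np

img = np.arange(25, dtype = np.uint32).reshape(5, 5)
ii = np.zeros((6, 6), dtype = np.uint32)
ii[1:, 1:] = img.cumsum(axis = 0).cumsum(axis = 1)

x, y, w, h = 1, 2, 3, 2
rect = ii[y + h, x + w] + ii[y, x] - ii[y + h, x] - ii[y, x + w]
assert rect == img[y: y + h, x: x + w].sum()  # 87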
@cuda.jit('void(int32[:, :], uint8[:, :, :, :], uint32[:, :, :])')
def __apply_feature_kernel__(X_feat: np.ndarray, feats: np.ndarray, X_ii: np.ndarray) -> None:
def __apply_feature_kernel__(d_X_feat: np.ndarray, d_feats: np.ndarray, d_X_ii: np.ndarray) -> None:
"""GPU kernel of the function apply_features.
Args:
X_feat (np.ndarray): Feature images dataset.
feats (np.ndarray): Features to apply.
X_ii (np.ndarray): Integrated image dataset.
n (int): Number of features.
m (int): Number of images of the dataset.
d_X_feat (np.ndarray): Feature images dataset on device
d_feats (np.ndarray): Features on device to apply
d_X_ii (np.ndarray): Integrated image dataset on device
n (int): Number of features
m (int): Number of images in the dataset
"""
x, y = cuda.grid(2)
if x >= feats.shape[0] or y >= X_ii.shape[0]:
if x >= d_feats.shape[0] or y >= d_X_ii.shape[0]:
return
p_x, p_y, p_w, p_h = feats[x, 0, 0]
p1_x, p1_y, p1_w, p1_h = feats[x, 0, 1]
n_x, n_y, n_w, n_h = feats[x, 1, 0]
n1_x, n1_y, n1_w, n1_h = feats[x, 1, 1]
sP = __compute_feature__(X_ii[y], p_x, p_y, p_w, p_h) + \
__compute_feature__(X_ii[y], p1_x, p1_y, p1_w, p1_h)
sN = __compute_feature__(X_ii[y], n_x, n_y, n_w, n_h) + \
__compute_feature__(X_ii[y], n1_x, n1_y, n1_w, n1_h)
X_feat[x, y] = sP - sN
p_x, p_y, p_w, p_h = d_feats[x, 0, 0]
p1_x, p1_y, p1_w, p1_h = d_feats[x, 0, 1]
n_x, n_y, n_w, n_h = d_feats[x, 1, 0]
n1_x, n1_y, n1_w, n1_h = d_feats[x, 1, 1]
sP = __compute_feature__(d_X_ii[y], p_x, p_y, p_w, p_h) + \
__compute_feature__(d_X_ii[y], p1_x, p1_y, p1_w, p1_h)
sN = __compute_feature__(d_X_ii[y], n_x, n_y, n_w, n_h) + \
__compute_feature__(d_X_ii[y], n1_x, n1_y, n1_w, n1_h)
d_X_feat[x, y] = sP - sN
#@njit('int32[:, :](uint8[:, :, :, :], uint32[:, :, :])')
def apply_features(feats: np.ndarray, X_ii: np.ndarray) -> np.ndarray:
"""Apply the features on a integrated image dataset (GPU version).
Args:
feats (np.ndarray): Features to apply.
X_ii (np.ndarray): Integrated image dataset.
feats (np.ndarray): Features to apply
X_ii (np.ndarray): Integrated image dataset
Returns:
np.ndarray: Applied features.
np.ndarray: Applied features
"""
d_X_feat = cuda.to_device(np.empty((feats.shape[0], X_ii.shape[0]), dtype = np.int32))
d_feats = cuda.to_device(feats)
@ -303,28 +303,44 @@ def apply_features(feats: np.ndarray, X_ii: np.ndarray) -> np.ndarray:
return d_X_feat.copy_to_host()
@cuda.jit('int32(int32[:], uint16[:], int32, int32)', device = True)
def as_partition(a: np.ndarray, indices: np.ndarray, l: int, h: int) -> int:
i = l - 1
j = l
for j in range(l, h + 1):
if a[indices[j]] < a[indices[h]]:
def _as_partition_(d_a: np.ndarray, d_indices: np.ndarray, low: int, high: int) -> int:
"""Partition of the argsort algorithm.
Args:
d_a (np.ndarray): Array on device to sort
d_indices (np.ndarray): Array of indices on device to write to
low (int): lower bound to sort
high (int): higher bound to sort
Returns:
int: Last index sorted
"""
i = low - 1
j = low
for j in range(low, high + 1):
if d_a[d_indices[j]] < d_a[d_indices[high]]:
i += 1
indices[i], indices[j] = indices[j], indices[i]
d_indices[i], d_indices[j] = d_indices[j], d_indices[i]
i += 1
indices[i], indices[j] = indices[j], indices[i]
d_indices[i], d_indices[j] = d_indices[j], d_indices[i]
return i
@cuda.jit('void(int32[:], uint16[:], int32, int32)', device = True)
def argsort_bounded(a: np.ndarray, indices: np.ndarray, l: int, h: int) -> None:
#total = h - l + 1;
stack = cuda.local.array(6977, int32)
stack[0] = l
stack[1] = h
top = 1;
def argsort_bounded(d_a: np.ndarray, d_indices: np.ndarray, low: int, high: int) -> None:
"""Perform an indirect sort of a given array within a given bound.
low = l
high = h
Args:
d_a (np.ndarray): Array on device to sort
d_indices (np.ndarray): Array of indices on device to write to
low (int): lower bound to sort
high (int): higher bound to sort
"""
#total = high - low + 1;
stack = cuda.local.array(6977, int32)
stack[0] = low
stack[1] = high
top = 1
while top >= 0:
high = stack[top]
@ -333,35 +349,50 @@ def argsort_bounded(a: np.ndarray, indices: np.ndarray, l: int, h: int) -> None:
top -= 1
if low >= high:
break;
break
p = as_partition(a, indices, low, high);
p = _as_partition_(d_a, d_indices, low, high)
if p - 1 > low:
top += 1
stack[top] = low;
stack[top] = low
top += 1
stack[top] = p - 1;
stack[top] = p - 1
if p + 1 < high:
top += 1
stack[top] = p + 1;
stack[top] = p + 1
top += 1
stack[top] = high;
stack[top] = high
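A host-side sketch of the same iterative Lomuto quicksort, with a plain Python list standing in for the fixed-size cuda.local.array stack (the 6977 bound above presumably matches the number of images being sorted over):
import numpy as np

def _as_partition_py(a, indices, low, high):
    i = low - 1
    for j in range(low, high + 1):
        if a[indices[j]] < a[indices[high]]:
            i += 1
            indices[i], indices[j] = indices[j], indices[i]
    i += 1
    indices[i], indices[high] = indices[high], indices[i]
    return i

def argsort_bounded_py(a, indices, low, high):
    stack = [low, high]
    while stack:
        high = stack.pop()
        low = stack.pop()
        if low < high:
            p = _as_partition_py(a, indices, low, high)
            if p - 1 > low:
                stack += [low, p - 1]
            if p + 1 < high:
                stack += [p + 1, high]

a = np.random.randint(0, 1000, 64).astype(np.int32)
indices = list(range(a.shape[0]))
argsort_bounded_py(a, indices, 0, a.shape[0] - 1)
assert (np.diff(a[indices]) >= 0).all()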
@cuda.jit('void(int32[:, :], uint16[:, :])')
def argsort_flatter(X_feat: np.ndarray, indices: np.ndarray) -> None:
i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
if i < X_feat.shape[0]:
for j in range(indices.shape[1]):
indices[i, j] = j
argsort_bounded(X_feat[i], indices[i], 0, X_feat.shape[1] - 1)
def argsort_flatter(d_a: np.ndarray, d_indices: np.ndarray) -> None:
# TODO Finish doxygen
"""Cuda kernel where argsort is applied to every column of a given 2D array.
def argsort(X_feat: np.ndarray) -> np.ndarray:
indices = np.empty_like(X_feat, dtype = np.uint16)
n_blocks = int(np.ceil(np.divide(X_feat.shape[0], NB_THREADS)))
d_X_feat = cuda.to_device(X_feat)
Args:
d_a (np.ndarray): 2D Array on device to sort
d_indices (np.ndarray): 2D Array of indices on device to write to
"""
i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
if i < d_a.shape[0]:
for j in range(d_indices.shape[1]):
d_indices[i, j] = j
argsort_bounded(d_a[i], d_indices[i], 0, d_a.shape[1] - 1)
def argsort_2d(a: np.ndarray) -> np.ndarray:
"""Perform an indirect sort on each column of a given 2D array
Args:
a (np.ndarray): 2D Array to sort
Returns:
np.ndarray: 2D Array of indices that sort the array
"""
indices = np.empty_like(a, dtype = np.uint16)
n_blocks = int(np.ceil(np.divide(a.shape[0], NB_THREADS)))
d_a = cuda.to_device(a)
d_indices = cuda.to_device(indices)
argsort_flatter[n_blocks, NB_THREADS](d_X_feat, d_indices)
argsort_flatter[n_blocks, NB_THREADS](d_a, d_indices)
cuda.synchronize()
return d_indices.copy_to_host()
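Example usage of argsort_2d (hypothetical shapes; requires a CUDA-capable device). Each row of the returned indices sorts the matching row of the input, and the uint16 index dtype caps the second dimension at 65536 entries:
import numpy as np

X_feat = np.random.randint(-1000, 1000, (128, 400)).astype(np.int32)
indices = argsort_2d(X_feat)
sorted_row = X_feat[0, indices[0]]
assert (np.diff(sorted_row) >= 0).all()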
@ -3,22 +3,19 @@
# Exit if any of the commands doesn't exit with code 0
set -e
EXEC_DIR=$1
test -z "$EXEC_DIR" && EXEC_DIR=..
VENV_PATH=$EXEC_DIR/python/venv
test -z "$EXEC_DIR" && EXEC_DIR=.
test -z "$VENV_PATH" && VENV_PATH="$EXEC_DIR/.venv"
activate(){
if [ ! -d "$VENV_PATH" ]; then
echo 'Creating python virtual environment'
python -m venv "$VENV_PATH"
python -m venv --upgrade-deps "$VENV_PATH"
echo 'Activating virtual environment'
activate
echo 'Updating base pip packages'
python -m pip install -U setuptools pip
echo 'Installing requirements'
pip install -r "$EXEC_DIR"/python/requirements.txt
elif [ -f "$VENV_PATH"/Scripts/activate ]; then source "$VENV_PATH"/Scripts/activate
elif [ -f "$VENV_PATH"/bin/activate ]; then source "$VENV_PATH"/bin/activate
pip install -r requirements.txt
elif [ -f "$VENV_PATH"/Scripts/activate ]; then . "$VENV_PATH"/Scripts/activate
elif [ -f "$VENV_PATH"/bin/activate ]; then . "$VENV_PATH"/bin/activate
else
echo 'Python virtual environment not detected'
exit 1
@ -1,29 +1,29 @@
from toolbox import picke_multi_loader, format_time_ns, unit_test_argsort_2d
from toolbox import pickle_multi_loader, format_time_ns, unit_test_argsort_2d, header, footer, formatted_line, formatted_row
from typing import List, Tuple
from time import perf_counter_ns
from sys import stderr
import numpy as np
from config import OUT_DIR, DATA_DIR, __DEBUG
def unit_test(TS: List[int], labels: List[str] = ["CPU", "GPU", "PY", "PGPU"], tol: float = 1e-8) -> None:
def unit_test(TS: List[int], labels: List[str] = ['CPU', 'GPU', 'PY', 'PGPU'], tol: float = 1e-8) -> None:
"""Test if the each result is equals to other devices.
Given ViolaJones is a deterministic algorithm, the results no matter the device should be the same
Since ViolaJones is a fully deterministic algorithm, the results, regardless of the device, should be the same
(up to floating point fluctuations); this function checks this assertion.
Args:
TS (List[int]): Number of trained weak classifiers.
labels (List[str], optional): List of the trained device names. Defaults to ["CPU", "GPU", "PY", "PGPU"] (see config.py for more info).
tol (float, optional): Float difference tolerance. Defaults to 1e-8.
TS (List[int]): Number of trained weak classifiers
labels (List[str], optional): List of the trained device names. Defaults to ['CPU', 'GPU', 'PY', 'PGPU'] (see config.py for more info)
tol (float, optional): Float difference tolerance. Defaults to 1e-8
"""
if len(labels) < 2:
return print("Not enough devices to test")
return print('Not enough devices to test')
print(f"\n| {'Unit testing':<37} | {'Test state':<10} | {'Time spent (ns)':<18} | {'Formatted time spent':<29} |")
print(f"|{'-'*39}|{'-'*12}|{'-'*20}|{'-'*31}|")
unit_gaps = [37, -10, -18, 29]
header(unit_gaps, ['Unit testing', 'Test state', 'Time spent (ns)', 'Formatted time spent'])
fnc_s = perf_counter_ns()
n_total = 0
n_success = 0
unit_timestamp = perf_counter_ns()
n_total, n_success = 0, 0
def test_fnc(title, fnc):
nonlocal n_total, n_success
@ -32,96 +32,104 @@ def unit_test(TS: List[int], labels: List[str] = ["CPU", "GPU", "PY", "PGPU"], t
state = fnc()
e = perf_counter_ns() - s
if state:
print(f"| {title:<37} | {'Passed':>10} | {e:>18,} | {format_time_ns(e):<29} |")
formatted_row(unit_gaps, [title, 'Passed', f'{e:,}', format_time_ns(e)])
n_success += 1
else:
print(f"| {title:<37} | {'Failed':>10} | {e:>18,} | {format_time_ns(e):<29} |")
formatted_row(unit_gaps, [title, 'Failed', f'{e:,}', format_time_ns(e)])
for set_name in ["train", "test"]:
for filename in ["ii", "feat"]:
title = f"X_{set_name}_{filename}"
print(f"{filename}...", end = "\r")
bs = picke_multi_loader([f"{title}_{label}" for label in labels], OUT_DIR)
for set_name in ['train', 'test']:
for filename in ['ii', 'feat']:
title = f'X_{set_name}_{filename}'
print(f'{filename}...', file = stderr, end = '\r')
bs = pickle_multi_loader([f'{title}_{label}' for label in labels], OUT_DIR)
for i, (b1, l1) in enumerate(zip(bs, labels)):
if b1 is None:
if __DEBUG:
print(f"| {title:<22} - {l1:<12} | {'Skipped':>10} | {'None':>18} | {'None':<29} |")
formatted_row(unit_gaps, [f'{title:<22} - {l1:<12}', 'Skipped', 'None', 'None'])
continue
for j, (b2, l2) in enumerate(zip(bs, labels)):
if i >= j:
continue
if b2 is None:
if __DEBUG:
print(f"| {title:<22} - {l1:<4} vs {l2:<4} | {'Skipped':>10} | {'None':>18} | {'None':<29} |")
formatted_row(unit_gaps, [f'{title:<22} - {l1:<4} vs {l2:<4}', 'Skipped', 'None', 'None'])
continue
test_fnc(f"{title:<22} - {l1:<4} vs {l2:<4}", lambda: np.abs(b1 - b2).mean() < tol)
test_fnc(f'{title:<22} - {l1:<4} vs {l2:<4}', lambda: np.abs(b1 - b2).mean() < tol)
title = f"X_{set_name}_feat_argsort"
print(f"Loading {title}...", end = "\r")
title = f'X_{set_name}_feat_argsort'
print(f'Loading {title}...', file = stderr, end = '\r')
feat = None
#indices = pickle_multi_loader(['indices'], OUT_DIR)[0]
bs = []
for label in labels:
if feat is None:
feat_tmp = picke_multi_loader([f"X_{set_name}_feat_{label}"], OUT_DIR)[0]
feat_tmp = pickle_multi_loader([f'X_{set_name}_feat_{label}'], OUT_DIR)[0]
if feat_tmp is not None:
#feat = feat_tmp[indices]
feat = feat_tmp
bs.append(picke_multi_loader([f"{title}_{label}"], OUT_DIR)[0])
bs.append(pickle_multi_loader([f'{title}_{label}'], OUT_DIR)[0])
for i, (b1, l1) in enumerate(zip(bs, labels)):
if b1 is None:
if __DEBUG:
print(f"| {title:<22} - {l1:<12} | {'Skipped':>10} | {'None':>18} | {'None':<29} |")
formatted_row(unit_gaps, [f'{title:<22} - {l1:<12}', 'Skipped', 'None', 'None'])
continue
if feat is not None:
test_fnc(f"{title:<22} - {l1:<4} argsort", lambda: unit_test_argsort_2d(feat, b1))
test_fnc(f'{title:<22} - {l1:<4} argsort', lambda: unit_test_argsort_2d(feat, b1))
for j, (b2, l2) in enumerate(zip(bs, labels)):
if i >= j:
continue
if b2 is None:
if __DEBUG:
print(f"| {title:<22} - {l1:<4} vs {l2:<4} | {'Skipped':>10} | {'None':>18} | {'None':<29} |")
formatted_row(unit_gaps, [f'{title:<22} - {l1:<4} vs {l2:<4}', 'Skipped', 'None', 'None'])
continue
test_fnc(f"{title:<22} - {l1:<4} vs {l2:<4}", lambda: np.abs(b1 - b2).mean() < tol)
test_fnc(f'{title:<22} - {l1:<4} vs {l2:<4}', lambda: np.abs(b1 - b2).mean() < tol)
for T in TS:
for filename in ["alphas", "final_classifiers"]:
print(f"{filename}_{T}...", end = "\r")
bs = picke_multi_loader([f"{filename}_{T}_{label}" for label in labels])
for filename in ['alphas', 'final_classifiers']:
print(f'{filename}_{T}...', file = stderr, end = '\r')
bs = pickle_multi_loader([f'{filename}_{T}_{label}' for label in labels])
for i, (b1, l1) in enumerate(zip(bs, labels)):
if b1 is None:
if __DEBUG:
print(f"| {filename + '_' + str(T):<22} - {l1:<12} | {'Skipped':>10} | {'None':>18} | {'None':<29} |")
formatted_row(unit_gaps, [f"{filename + '_' + str(T):<22} - {l1:<12}", 'Skipped', 'None', 'None'])
continue
for j, (b2, l2) in enumerate(zip(bs, labels)):
if i >= j:
continue
if b2 is None:
if __DEBUG:
print(f"| {filename + '_' + str(T):<22} - {l1:<4} vs {l2:<4} | {'Skipped':>10} | {'None':>18} | {'None':<29} |")
formatted_row(unit_gaps, [f"{filename + '_' + str(T):<22} - {l1:<4} vs {l2:<4}", 'Skipped', 'None', 'None'])
continue
test_fnc(f"{filename + '_' + str(T):<22} - {l1:<4} vs {l2:<4}", lambda: np.abs(b1 - b2).mean() < tol)
print(f"|{'-'*39}|{'-'*12}|{'-'*20}|{'-'*31}|")
e = perf_counter_ns() - fnc_s
print(f"| {'Unit testing summary':<37} | {str(n_success) + '/' + str(n_total):>10} | {e:>18,} | {format_time_ns(e):<29} |")
time_spent = perf_counter_ns() - unit_timestamp
if n_total == 0:
formatted_row(unit_gaps, ['Unit testing summary', 'No files', f'{time_spent:,}', format_time_ns(time_spent)])
else:
formatted_line(unit_gaps, '', '', '', '')
formatted_row(unit_gaps, ['Unit testing summary', f'{n_success}/{n_total}', f'{time_spent:,}', format_time_ns(time_spent)])
footer(unit_gaps)
def load_datasets(data_dir: str = DATA_DIR) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
"""Load the datasets.
Args:
data_dir (str, optional): [description]. Defaults to DATA_DIR (see config.py).
data_dir (str, optional): Path of the directory containing the datasets. Defaults to DATA_DIR (see config.py)
Returns:
Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: [description]
Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: X_train, y_train, X_test, y_test
"""
bytes_to_int_list = lambda b: list(map(int, b.rstrip().split(" ")))
bytes_to_int_list = lambda b: list(map(int, b.rstrip().split(' ')))
def load(set_name: str) -> np.ndarray:
with open(f"{data_dir}/{set_name}.bin", "r") as f:
with open(f'{data_dir}/{set_name}.bin', 'r') as f:
shape = bytes_to_int_list(f.readline())
return np.asarray(bytes_to_int_list(f.readline()), dtype = np.uint8).reshape(shape)
return load("X_train"), load("y_train"), load("X_test"), load("y_test")
return load('X_train'), load('y_train'), load('X_test'), load('y_test')
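The reader above implies a simple text-based layout, inferred from load(): line one holds the array shape, line two the flattened uint8 values, both space-separated. A hedged sketch of a compatible writer:
import numpy as np

def save_dataset(set_name: str, a: np.ndarray, data_dir: str = '../data') -> None:
    # Line 1: shape, line 2: flattened values, space-separated.
    # The '../data' default mirrors DATA_DIR in config.py.
    with open(f'{data_dir}/{set_name}.bin', 'w') as out:
        out.write(' '.join(map(str, a.shape)) + '\n')
        out.write(' '.join(map(str, a.ravel())) + '\n')

save_dataset('X_demo', np.random.randint(0, 256, (2, 19, 19), dtype = np.uint8))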
@ -1,36 +1,47 @@
from typing import Final
import numpy as np
DATA_DIR = "../data"
OUT_DIR = "./out"
MODEL_DIR = "./models"
DATA_DIR: Final = '../data'
OUT_DIR: Final = './out'
MODEL_DIR: Final = './models'
NB_THREADS = 1024
NB_THREADS_2D = (32, 32)
NB_THREADS_3D = (16, 16, 4)
M = int(np.log2(NB_THREADS_2D[1]))
NB_THREADS: Final = 1024
NB_THREADS_2D: Final = (32, 32)
NB_THREADS_3D: Final = (16, 16, 4)
M: Final = int(np.log2(NB_THREADS_2D[1]))
# Save state to avoid recalulation on restart
SAVE_STATE = True
# Save state to avoid recalculation on restart
SAVE_STATE: Final = True
# Redo the state even if it's already saved
FORCE_REDO = False
FORCE_REDO: Final = False
# Use NJIT to greatly accelerate runtime
COMPILE_WITH_C = True
COMPILE_WITH_C: Final = True
# Use GPU to greatly accelerate runtime (as priority over NJIT)
GPU_BOOSTED = True
GPU_BOOSTED: Final = True
# Depending on what you set, the output label will be as follows:
# ┌────────────────┬─────────────┬───────┐
# │ COMPILE_WITH_C │ GPU_BOOSTED │ LABEL │
# ├────────────────┼─────────────┼───────┤
# │ True │ True │ GPU │
# │ True │ False │ CPU │
# │ False │ True │ PGPU │
# │ False │ False │ PY │
# └────────────────┴─────────────┴───────┘
# Number of weak classifiers
# TS = [1]
# TS = [1, 5, 10]
TS = [1, 5, 10, 25, 50]
# TS = [1, 5, 10, 25, 50, 100, 200]
# TS = [1, 5, 10, 25, 50, 100, 200, 300]
# TS = [1, 5, 10, 25, 50, 100, 200, 300, 400, 500, 1000]
# TS: Final = [1]
# TS: Final = [1, 5, 10]
TS: Final = [1, 5, 10, 25, 50]
# TS: Final = [1, 5, 10, 25, 50, 100, 200]
# TS: Final = [1, 5, 10, 25, 50, 100, 200, 300]
# TS: Final = [1, 5, 10, 25, 50, 100, 200, 300, 400, 500, 1000]
# Enable verbose output (for debugging purposes)
__DEBUG = False
__DEBUG: Final = False
# Debugging options
if __DEBUG:
IDX_INSPECT = 4548
IDX_INSPECT_OFFSET = 100
IDX_INSPECT: Final = 4548
IDX_INSPECT_OFFSET: Final = 100
np.seterr(all = 'raise')
# Debug option (image width * log_10(length) + extra characters)
np.set_printoptions(linewidth = 19 * 6 + 3)
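A small sketch of the mapping in the table above; projet.py derives the label the same way when choosing a backend:
def get_label(compile_with_c: bool, gpu_boosted: bool) -> str:
    if gpu_boosted:
        return 'GPU' if compile_with_c else 'PGPU'
    return 'CPU' if compile_with_c else 'PY'

assert get_label(True, True) == 'GPU'    # the defaults above
assert get_label(False, False) == 'PY'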
@ -5,6 +5,9 @@ from sys import argv
import numpy as np
from os import path, listdir
# Induce determinism
np.random.seed(133742)
# Makes the "leave" argument default to False
tqdm = partial(tqdm, leave = False)
@ -42,8 +45,8 @@ def __main__(data_path: str) -> None:
y.append(y_i)
X, y = np.asarray(X), np.asarray(y)
# idx = np.random.permutation(y.shape[0])
# X, y = X[idx], y[idx]
idx = np.random.permutation(y.shape[0])
X, y = X[idx], y[idx]
for org, s in tqdm(zip("Xy", [X, y]), desc = f"Writing {set_name}"):
with open(f"{data_path}/{org}_{set_name}.bin", "w") as out:
@ -2,6 +2,14 @@ from typing import Callable, Iterable, Union, Any
from tqdm import tqdm
def njit(f: Union[Callable, str] = None, *args, **kwargs) -> Callable:
"""Wrapper for optional numba's njit decorator
Args:
f (Union[Callable, str], optional): Function to wrap with numba. Defaults to None.
Returns:
Callable: Wrapped function.
"""
def decorator(func: Callable) -> Any:
return func
@ -10,4 +18,13 @@ def njit(f: Union[Callable, str] = None, *args, **kwargs) -> Callable:
return decorator
def tqdm_iter(iter: Iterable, desc: str):
return tqdm(iter, leave = False, desc = desc)
"""Wrapper for optional tqdm iterator progress bar.
Args:
iter (Iterable): Object to iterate over.
desc (str): Description written to stdout.
Returns:
tqdm: Wrapped iterator.
"""
return tqdm(iter, leave = False, desc = desc)
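decorators.py supplies the no-op fallbacks, and the modules below switch between them and the real numba at import time. Purely as an illustration, the same effect can be had dynamically; this is a sketch, not how the repository wires it:
from typing import Any, Callable, Union

try:
    from numba import njit            # real JIT when numba is installed
except ImportError:
    def njit(f: Union[Callable, str] = None, *args, **kwargs) -> Callable:
        def decorator(func: Callable) -> Any:
            return func
        return decorator(f) if callable(f) else decorator

@njit('int64(int64)')
def double(x: int) -> int:
    return x * 2

assert double(21) == 42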
@ -0,0 +1,15 @@
services:
violajones-python:
image: saundersp/violajones-python
pull_policy: never
build: .
volumes:
- ./models:/home/ViolaJones/python/models
- ./out:/home/ViolaJones/python/out
- ../data:/home/ViolaJones/data
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]
python/project_test.py Normal file
@ -0,0 +1,15 @@
from toolbox_unit_test import format_time_test, format_time_ns_test
from toolbox import header, footer, formatted_row, formatted_line
from toolbox import format_time_ns, benchmark_function
from time import perf_counter_ns
if __name__ == '__main__':
unit_timestamp = perf_counter_ns()
unit_gaps = [27, -18, 29]
header(unit_gaps, ['Unit testing', 'Time spent (ns)', 'Formatted time spent'])
benchmark_function('testing format_time', unit_gaps[0], format_time_test)
benchmark_function('testing format_time_ns', unit_gaps[0], format_time_ns_test)
time_spent = perf_counter_ns() - unit_timestamp
formatted_line(unit_gaps, '', '', '', '')
formatted_row(unit_gaps, ['Unit testing summary', f'{time_spent:,}', format_time_ns(time_spent)])
footer(unit_gaps)
@ -2,12 +2,15 @@
# Author: @saundersp
from ViolaJones import train_viola_jones, classify_viola_jones
from toolbox import state_saver, picke_multi_loader, format_time_ns, benchmark_function, toolbox_unit_test, unit_test_argsort_2d
#from toolbox import state_saver, pickle_multi_loader, format_time_ns, benchmark_function, unit_test_argsort_2d
from toolbox import state_saver, format_time_ns, benchmark_function, unit_test_argsort_2d
from toolbox import header, footer, formatted_row, formatted_line
from toolbox_unit_test import format_time_test, format_time_ns_test
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.feature_selection import SelectPercentile, f_classif
#from sklearn.feature_selection import SelectPercentile, f_classif
from common import load_datasets, unit_test
from ViolaJones import build_features, get_best_anova_features
from typing import Tuple
from ViolaJones import build_features # , get_best_anova_features
from typing import Tuple, List
from time import perf_counter_ns
from os import makedirs
import numpy as np
@ -17,153 +20,187 @@ if __DEBUG:
from config import IDX_INSPECT, IDX_INSPECT_OFFSET
if GPU_BOOSTED:
from ViolaJonesGPU import apply_features, set_integral_image, argsort
from ViolaJonesGPU import apply_features, set_integral_image, argsort_2d
label = 'GPU' if COMPILE_WITH_C else 'PGPU'
# The parallel prefix sum doesn't use the whole GPU, so numba outputs some annoying warnings; this disables them
from numba import config
config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
else:
from ViolaJonesCPU import apply_features, set_integral_image, argsort
from ViolaJonesCPU import apply_features, set_integral_image, argsort_2d
label = 'CPU' if COMPILE_WITH_C else 'PY'
def preprocessing() -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
"""Load the dataset, calculate features and integral images, apply features to images and calculate argsort of the featured images.
def preprocessing() -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
"""Execute the preprocessing phase
The preprocessing phase consists of the following steps:
- Load the dataset
- Calculate features
- Calculate integral images
- Apply features to images
- Calculate argsort of the featured images
Returns:
Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: X_train_feat, X_train_feat_argsort, y_train, X_test_feat, y_test
Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: Tuple containing, in order: training features, training features sorted indexes, training labels, testing features, testing labels
"""
# Creating state saver folders if they don't exist already
if SAVE_STATE:
for folder_name in ["models", "out"]:
for folder_name in ['models', 'out']:
makedirs(folder_name, exist_ok = True)
print(f"| {'Preprocessing':<49} | {'Time spent (ns)':<18} | {'Formatted time spent':<29} |\n|{'-'*51}|{'-'*20}|{'-'*31}|")
preproc_timestamp = perf_counter_ns()
preproc_gaps = [49, -18, 29]
header(preproc_gaps, ['Preprocessing', 'Time spent (ns)', 'Formatted time spent'])
X_train, y_train, X_test, y_test = state_saver("Loading sets", ["X_train", "y_train", "X_test", "y_test"],
load_datasets, FORCE_REDO, SAVE_STATE)
X_train, y_train, X_test, y_test = state_saver('Loading sets', preproc_gaps[0], ['X_train', 'y_train', 'X_test', 'y_test'],
load_datasets, FORCE_REDO, SAVE_STATE)
if __DEBUG:
print("X_train")
print('X_train')
print(X_train.shape)
print(X_train[IDX_INSPECT])
print("X_test")
print('X_test')
print(X_test.shape)
print(X_test[IDX_INSPECT])
print("y_train")
print('y_train')
print(y_train.shape)
print(y_train[IDX_INSPECT: IDX_INSPECT + IDX_INSPECT_OFFSET])
print("y_test")
print('y_test')
print(y_test.shape)
print(y_test[IDX_INSPECT: IDX_INSPECT + IDX_INSPECT_OFFSET])
feats = state_saver("Building features", "feats", lambda: build_features(X_train.shape[1], X_train.shape[2]), FORCE_REDO, SAVE_STATE)
feats = state_saver('Building features', preproc_gaps[0], 'feats', lambda: build_features(X_train.shape[1], X_train.shape[2]),
FORCE_REDO, SAVE_STATE)
if __DEBUG:
print("feats")
print('feats')
print(feats.shape)
print(feats[IDX_INSPECT].ravel())
X_train_ii = state_saver(f"Converting training set to integral images ({label})", f"X_train_ii_{label}",
X_train_ii = state_saver(f'Converting training set to integral images ({label})', preproc_gaps[0], f'X_train_ii_{label}',
lambda: set_integral_image(X_train), FORCE_REDO, SAVE_STATE)
X_test_ii = state_saver(f"Converting testing set to integral images ({label})", f"X_test_ii_{label}",
X_test_ii = state_saver(f'Converting testing set to integral images ({label})', preproc_gaps[0], f'X_test_ii_{label}',
lambda: set_integral_image(X_test), FORCE_REDO, SAVE_STATE)
if __DEBUG:
print("X_train_ii")
print('X_train_ii')
print(X_train_ii.shape)
print(X_train_ii[IDX_INSPECT])
print("X_test_ii")
print('X_test_ii')
print(X_test_ii.shape)
print(X_test_ii[IDX_INSPECT])
X_train_feat = state_saver(f"Applying features to training set ({label})", f"X_train_feat_{label}",
X_train_feat = state_saver(f'Applying features to training set ({label})', preproc_gaps[0], f'X_train_feat_{label}',
lambda: apply_features(feats, X_train_ii), FORCE_REDO, SAVE_STATE)
X_test_feat = state_saver(f"Applying features to testing set ({label})", f"X_test_feat_{label}",
X_test_feat = state_saver(f'Applying features to testing set ({label})', preproc_gaps[0], f'X_test_feat_{label}',
lambda: apply_features(feats, X_test_ii), FORCE_REDO, SAVE_STATE)
del X_train_ii, X_test_ii, feats
if __DEBUG:
print("X_train_feat")
print('X_train_feat')
print(X_train_feat.shape)
print(X_train_feat[IDX_INSPECT, : IDX_INSPECT_OFFSET])
print("X_test_feat")
print('X_test_feat')
print(X_test_feat.shape)
print(X_test_feat[IDX_INSPECT, : IDX_INSPECT_OFFSET])
#indices = state_saver("Selecting best features training set", "indices", force_redo = True, save_state = SAVE_STATE,
#indices = state_saver('Selecting best features training set', 'indices', force_redo = FORCE_REDO, save_state = SAVE_STATE,
# fnc = lambda: SelectPercentile(f_classif, percentile = 10).fit(X_train_feat.T, y_train).get_support(indices = True))
#indices = state_saver("Selecting best features training set", "indices", force_redo = FORCE_REDO, save_state = SAVE_STATE,
#indices = state_saver('Selecting best features training set', 'indices', force_redo = FORCE_REDO, save_state = SAVE_STATE,
# fnc = lambda: get_best_anova_features(X_train_feat, y_train))
#indices = benchmark_function("Selecting best features (manual)", lambda: get_best_anova_features(X_train_feat, y_train))
#indices = benchmark_function('Selecting best features (manual)', lambda: get_best_anova_features(X_train_feat, y_train))
#if __DEBUG:
# print("indices")
# print('indices')
# print(indices.shape)
# print(indices[IDX_INSPECT: IDX_INSPECT + IDX_INSPECT_OFFSET])
# assert indices.shape[0] == indices_new.shape[0], f"Indices length not equal : {indices.shape} != {indices_new.shape}"
# assert (eq := indices == indices_new).all(), f"Indices not equal : {eq.sum() / indices.shape[0]}"
# assert indices.shape[0] == indices_new.shape[0], f'Indices length not equal : {indices.shape} != {indices_new.shape}'
# assert (eq := indices == indices_new).all(), f'Indices not equal : {eq.sum() / indices.shape[0]}'
# X_train_feat, X_test_feat = X_train_feat[indices], X_test_feat[indices]
X_train_feat_argsort = state_saver(f"Precalculating training set argsort ({label})", f"X_train_feat_argsort_{label}",
lambda: argsort(X_train_feat), FORCE_REDO, SAVE_STATE)
X_train_feat_argsort = state_saver(f'Precalculating training set argsort ({label})', preproc_gaps[0], f'X_train_feat_argsort_{label}',
lambda: argsort_2d(X_train_feat), FORCE_REDO, SAVE_STATE)
if __DEBUG:
print("X_train_feat_argsort")
print('X_train_feat_argsort')
print(X_train_feat_argsort.shape)
print(X_train_feat_argsort[IDX_INSPECT, : IDX_INSPECT_OFFSET])
benchmark_function("Arg unit test", lambda: unit_test_argsort_2d(X_train_feat, X_train_feat_argsort))
benchmark_function('Arg unit test', preproc_gaps[0], lambda: unit_test_argsort_2d(X_train_feat, X_train_feat_argsort))
X_test_feat_argsort = state_saver(f"Precalculating testing set argsort ({label})", f"X_test_feat_argsort_{label}",
lambda: argsort(X_test_feat), FORCE_REDO, SAVE_STATE)
X_test_feat_argsort = state_saver(f'Precalculating testing set argsort ({label})', preproc_gaps[0], f'X_test_feat_argsort_{label}',
lambda: argsort_2d(X_test_feat), FORCE_REDO, SAVE_STATE)
if __DEBUG:
print("X_test_feat_argsort")
print('X_test_feat_argsort')
print(X_test_feat_argsort.shape)
print(X_test_feat_argsort[IDX_INSPECT, : IDX_INSPECT_OFFSET])
benchmark_function("Arg unit test", lambda: unit_test_argsort_2d(X_test_feat, X_test_feat_argsort))
benchmark_function('Arg unit test', preproc_gaps[0], lambda: unit_test_argsort_2d(X_test_feat, X_test_feat_argsort))
time_spent = perf_counter_ns() - preproc_timestamp
formatted_line(preproc_gaps, '', '', '', '')
formatted_row(preproc_gaps, ['Preprocessing summary', f'{time_spent:,}', format_time_ns(time_spent)])
footer(preproc_gaps)
return X_train_feat, X_train_feat_argsort, y_train, X_test_feat, y_test
def train(X_train_feat: np.ndarray, X_train_feat_argsort: np.ndarray, y_train: np.ndarray) -> None:
def train(X_train_feat: np.ndarray, X_train_feat_argsort: np.ndarray, y_train: np.ndarray) -> List[np.ndarray]:
"""Train the weak classifiers.
Args:
X_train (np.ndarray): Training images.
X_test (np.ndarray): Testing Images.
y_train (np.ndarray): Training labels.
X_train_feat (np.ndarray): Training features
X_train_feat_argsort (np.ndarray): Sorted indexes of the training images features
y_train (np.ndarray): Training labels
Returns:
List[np.ndarray]: List of trained models
"""
print(f"\n| {'Training':<49} | {'Time spent (ns)':<18} | {'Formatted time spent':<29} |\n|{'-'*51}|{'-'*20}|{'-'*31}|")
training_timestamp = perf_counter_ns()
training_gaps = [26, -18, 29]
header(training_gaps, ['Training', 'Time spent (ns)', 'Formatted time spent'])
models = []
for T in TS:
alphas, final_classifiers = state_saver(f"ViolaJones T = {T:<3} ({label})", [f"alphas_{T}_{label}", f"final_classifiers_{T}_{label}"],
lambda: train_viola_jones(T, X_train_feat, X_train_feat_argsort, y_train), FORCE_REDO, SAVE_STATE, MODEL_DIR)
alphas, final_classifiers = state_saver(f'ViolaJones T = {T:<4} ({label})', training_gaps[0],
[f'alphas_{T}_{label}', f'final_classifiers_{T}_{label}'],
lambda: train_viola_jones(T, X_train_feat, X_train_feat_argsort, y_train), FORCE_REDO, SAVE_STATE, MODEL_DIR)
models.append([alphas, final_classifiers])
if __DEBUG:
print("alphas")
print('alphas')
print(alphas)
print("final_classifiers")
print('final_classifiers')
print(final_classifiers)
def testing_and_evaluating(X_train_feat: np.ndarray, y_train: np.ndarray, X_test_feat: np.ndarray, y_test: np.ndarray) -> None:
time_spent = perf_counter_ns() - training_timestamp
formatted_line(training_gaps, '', '', '', '')
formatted_row(training_gaps, ['Training summary', f'{time_spent:,}', format_time_ns(time_spent)])
footer(training_gaps)
return models
def testing_and_evaluating(models: List[np.ndarray], X_train_feat: np.ndarray, y_train: np.ndarray, X_test_feat: np.ndarray, y_test: np.ndarray) -> None:
"""Benchmark the trained classifiers on the training and testing sets.
Args:
X_train_feat (np.ndarray): Training features.
y_train (np.ndarray): Training labels.
X_test_feat (np.ndarray): Testing features.
y_test (np.ndarray): Testing labels.
models (List[np.ndarray]): List of trained models
X_train_feat (np.ndarray): Training features
y_train (np.ndarray): Training labels
X_test_feat (np.ndarray): Testing features
y_test (np.ndarray): Testing labels
"""
print(f"\n| {'Testing':<26} | Time spent (ns) (E) | {'Formatted time spent (E)':<29}", end = " | ")
print(f"Time spent (ns) (T) | {'Formatted time spent (T)':<29} |")
print(f"|{'-'*28}|{'-'*21}|{'-'*31}|{'-'*21}|{'-'*31}|")
perfs = []
for T in TS:
(alphas, final_classifiers) = picke_multi_loader([f"alphas_{T}_{label}", f"final_classifiers_{T}_{label}"])
testing_gaps = [26, -19, 24, -19, 24]
header(testing_gaps, ['Testing', 'Time spent (ns) (E)', 'Formatted time spent (E)', 'Time spent (ns) (T)', 'Formatted time spent (T)'])
performances = []
total_train_timestamp = 0
total_test_timestamp = 0
for T, (alphas, final_classifiers) in zip(TS, models):
s = perf_counter_ns()
y_pred_train = classify_viola_jones(alphas, final_classifiers, X_train_feat)
t_pred_train = perf_counter_ns() - s
total_train_timestamp += t_pred_train
e_acc = accuracy_score(y_train, y_pred_train)
e_f1 = f1_score(y_train, y_pred_train)
(_, e_FP), (e_FN, _) = confusion_matrix(y_train, y_pred_train)
@ -171,37 +208,49 @@ def testing_and_evaluating(X_train_feat: np.ndarray, y_train: np.ndarray, X_test
s = perf_counter_ns()
y_pred_test = classify_viola_jones(alphas, final_classifiers, X_test_feat)
t_pred_test = perf_counter_ns() - s
total_test_timestamp += t_pred_test
t_acc = accuracy_score(y_test, y_pred_test)
t_f1 = f1_score(y_test, y_pred_test)
(_, t_FP), (t_FN, _) = confusion_matrix(y_test, y_pred_test)
perfs.append((e_acc, e_f1, e_FN, e_FP, t_acc, t_f1, t_FN, t_FP))
performances.append((e_acc, e_f1, e_FN, e_FP, t_acc, t_f1, t_FN, t_FP))
print(f"| {'ViolaJones T = ' + str(T):<19} {'(' + label + ')':<6}", end = " | ")
print(f"{t_pred_train:>19,} | {format_time_ns(t_pred_train):<29}", end = " | ")
print(f"{t_pred_test:>19,} | {format_time_ns(t_pred_test):<29} |")
formatted_row(testing_gaps, [f"{'ViolaJones T = ' + str(T):<19} {'(' + label + ')':<6}", f'{t_pred_train:,}',
format_time_ns(t_pred_train), f'{t_pred_test:,}', format_time_ns(t_pred_test)])
print(f"\n| {'Evaluating':<19} | ACC (E) | F1 (E) | FN (E) | FP (E) | ACC (T) | F1 (T) | FN (T) | FP (T) | ")
print(f"|{'-'*21}|{'-'*9}|{'-'*8}|{'-'*8}|{'-'*8}|{'-'*9}|{'-'*8}|{'-'*8}|{'-'*8}|")
formatted_line(testing_gaps, '', '', '', '')
formatted_row(testing_gaps, ['Testing summary', f'{total_train_timestamp:,}', format_time_ns(total_train_timestamp), f'{total_test_timestamp:,}',
format_time_ns(total_test_timestamp)])
footer(testing_gaps)
for T, (e_acc, e_f1, e_FN, e_FP, t_acc, t_f1, t_FN, t_FP) in zip(TS, perfs):
print(f"| {'ViolaJones T = ' + str(T):<19} | {e_acc:>7.2%} | {e_f1:>6.2f} | {e_FN:>6,} | {e_FP:>6,}", end = " | ")
print(f"{t_acc:>7.2%} | {t_f1:>6.2f} | {t_FN:>6,} | {t_FP:>6,} |")
evaluating_gaps = [19, 7, 6, 6, 6, 7, 6, 6, 6]
header(evaluating_gaps, ['Evaluating', 'ACC (E)', 'F1 (E)', 'FN (E)', 'FP (E)', 'ACC (T)', 'F1 (T)', 'FN (T)', 'FP (T)'])
for T, (e_acc, e_f1, e_FN, e_FP, t_acc, t_f1, t_FN, t_FP) in zip(TS, performances):
print(f'│ ViolaJones T = {T:<4}{e_acc:>7.2%}{e_f1:>6.2f}{e_FN:>6,}{e_FP:>6,}', end = '')
print(f'{t_acc:>7.2%}{t_f1:>6.2f}{t_FN:>6,}{t_FP:>6,}')
footer(evaluating_gaps)
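The FP/FN destructuring above leans on sklearn's binary confusion_matrix layout [[TN, FP], [FN, TP]]: unpacking the two rows grabs the off-diagonal cells directly. Toy check:
from sklearn.metrics import confusion_matrix

y_true = [0, 0, 1, 1, 1]
y_pred = [0, 1, 1, 1, 0]
(_, FP), (FN, _) = confusion_matrix(y_true, y_pred)
assert (FP, FN) == (1, 1)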
def main() -> None:
print(f"| {'Unit testing':<49} | {'Time spent (ns)':<18} | {'Formatted time spent':<29} |")
print(f"|{'-'*51}|{'-'*20}|{'-'*31}|")
benchmark_function("Testing format_time_ns", format_time_ns_test)
print()
unit_timestamp = perf_counter_ns()
unit_gaps = [27, -18, 29]
header(unit_gaps, ['Unit testing', 'Time spent (ns)', 'Formatted time spent'])
benchmark_function('testing format_time', unit_gaps[0], format_time_test)
benchmark_function('testing format_time_ns', unit_gaps[0], format_time_ns_test)
time_spent = perf_counter_ns() - unit_timestamp
formatted_line(unit_gaps, '', '', '', '')
formatted_row(unit_gaps, ['Unit testing summary', f'{time_spent:,}', format_time_ns(time_spent)])
footer(unit_gaps)
X_train_feat, X_train_feat_argsort, y_train, X_test_feat, y_test = preprocessing()
train(X_train_feat, X_train_feat_argsort, y_train)
models = train(X_train_feat, X_train_feat_argsort, y_train)
# X_train_feat, X_test_feat = picke_multi_loader([f"X_train_feat_{label}", f"X_test_feat_{label}"], OUT_DIR)
# indices = picke_multi_loader(["indices"], OUT_DIR)[0]
# X_train_feat, X_test_feat = pickle_multi_loader([f'X_train_feat_{label}', f'X_test_feat_{label}'], OUT_DIR)
# indices = pickle_multi_loader(['indices'], OUT_DIR)[0]
# X_train_feat, X_test_feat = X_train_feat[indices], X_test_feat[indices]
testing_and_evaluating(X_train_feat, y_train, X_test_feat, y_test)
testing_and_evaluating(models, X_train_feat, y_train, X_test_feat, y_test)
unit_test(TS)
if __name__ == "__main__":
if __name__ == '__main__':
main()
@ -1,3 +1,5 @@
numba
scikit-learn
tqdm
numba==0.60.0
scikit-learn==1.5.2
tqdm==4.67.0
#pudb==2024.1.3
@ -1,189 +0,0 @@
import numpy as np
from numba import cuda, config, njit
config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
#import matplotlib.pyplot as plt
from tqdm import tqdm
from time import perf_counter_ns
from toolbox import format_time_ns
from pickle import load, dump
from sys import argv
def get(a):
with open(f"{a}.pkl", 'rb') as f:
return load(f)
def save(a, name) -> None:
with open(name, 'wb') as f:
dump(a, f)
def diff(folder, a, label1, label2):
af, bf = get(f"{folder}/{a}_{label1}"), get(f"{folder}/{a}_{label2}")
#print(af)
#print(bf)
print((af - bf).mean())
if __name__ == "__main__":
if len(argv) == 5:
diff(argv[1], argv[4], argv[2], argv[3])
def py_mean(a, b):
s = 0.0
for a_i, b_i in zip(a, b):
s += a_i * b_i
return s / a.shape[0]
def np_mean(a, b):
return np.mean(a * b)
@njit('float64(float64[:], float64[:])', fastmath = True, nogil = True)
def nb_mean(a, b):
return np.mean(a * b)
@njit('float64(float64[:], float64[:])', fastmath = True, nogil = True)
def nb_mean_loop(a, b):
s = 0.0
for a_i, b_i in zip(a, b):
s += a_i * b_i
return s / a.shape[0]
@cuda.jit('void(float64[:], float64[:], float64[:])', fastmath = True)
def cuda_mean_kernel(r, a, b):
s = 0.0
for a_i, b_i in zip(a, b):
s += a_i * b_i
r[0] = s / a.shape[0]
def cuda_mean(a, b):
r = cuda.to_device(np.empty(1, dtype = np.float64))
d_a = cuda.to_device(a)
d_b = cuda.to_device(b)
cuda_mean_kernel[1, 1](r, d_a, d_b)
return r.copy_to_host()[0]
def test_and_compare(labels, fncs, a, b):
m = []
for fnc in tqdm(fncs, leave = False, desc = "Calculating..."):
s = perf_counter_ns()
m.append([fnc(a, b), perf_counter_ns() - s])
print("Results:")
[print(f"\t{label:<10} {m_i:<20} {format_time_ns(time_i)}") for ((m_i, time_i), label) in zip(m, labels)]
print("Comparaison:")
for i, (m_i, label_i) in enumerate(zip(m, labels)):
for j, (m_j, label_j) in enumerate(zip(m, labels)):
if i >= j:
continue
print(f"\t{label_i:<10} vs {label_j:<10} - {abs(m_i[0] - m_j[0])}")
if __name__ == "__main__":
np.set_printoptions(linewidth = 10000, threshold = 1000)
N = int(2**20)
labels = ["Python", "Numpy", "Numba", "Numba loop", "CUDA"]
fncs = [py_mean, np_mean, nb_mean, nb_mean_loop, cuda_mean]
print(f"RANDOM for N={N}")
total_size = (2 * 8 * N)
print(f"Size = {total_size} B")
print(f"Size = {total_size // 1024} kB")
print(f"Size = {total_size // 1024 // 1024} MB")
print(f"Size = {total_size // 1024 // 1024 // 1024} GB")
a, b = np.random.rand(N).astype(np.float64), np.random.rand(N).astype(np.float64)
test_and_compare(labels, fncs, a, b)
del a, b
print(f"\nDETERMINSTIC for N={N}")
total_size = (2 * 8 * N) + (8 * N)
print(f"Size = {total_size} B")
print(f"Size = {total_size // 1024} kB")
print(f"Size = {total_size // 1024 // 1024} MB")
print(f"Size = {total_size // 1024 // 1024 // 1024} GB")
mask = np.arange(N, dtype = np.uint64)
a = np.ones(N, dtype = np.float64)
a[mask < N//2] = 0.1
del mask
b = np.ones(N, dtype = np.float64)
test_and_compare(labels, fncs, a, b)
del a, b
#from ViolaJonesGPU import argsort as argsort_GPU
#from ViolaJonesCPU import argsort as argsort_CPU
#from toolbox import unit_test_argsort_2d, benchmark_function
#labels = ["Numpy", "Numba", "CUDA"]
#a = np.random.randint(2**12, size = (2**20, 2**8), dtype = np.int32)
#m = [benchmark_function(f"Argsort {label}", lambda: f(np.copy(a))) for (label, f) in zip(labels, [
# lambda a: np.argsort(a).astype(np.uint16), argsort_CPU, argsort_GPU
#])]
#for i, (m_i, label_i) in enumerate(zip(m, labels)):
# #for j, (m_j, label_j) in enumerate(zip(m, labels)):
# # if i >= j:
# # continue
# # print(f"\t{label_i:<10} vs {label_j:<10} - {(m_i == m_j).mean()}")
# benchmark_function(f"Unit test {label_i}", lambda: unit_test_argsort_2d(a, m_i))
#for i in tqdm(range(X.shape[0]), leave = False, desc = "Extract image"):
# x = X[i]
# y = Y[i]
# fig = plt.figure()
# plt.imshow(x, cmap = 'gray')
# plt.savefig(f"imgs/{y}/{i}.png")
# plt.close(fig)
#def extract_FD(Xy):
# X_c, Y_c = [], []
# for x,y in Xy:
# X_c.append(x)
# Y_c.append(y)
# X_c = np.asarray(X_c)
# Y_c = np.asarray(Y_c)
# return X_c, Y_c
#X_train, y_train = get('out/X_train'), get('out/y_train')
#X_test, y_test = get('out/X_test'), get('out/y_test')
#X_train, y_train = extract_FD(get('/home/_aspil0w/git/FaceDetection/training'))
#X_test, y_test = extract_FD(get('/home/_aspil0w/git/FaceDetection/test'))
#save(X_train, 'out/X_train'), save(y_train, 'out/y_train')
#save(X_test, 'out/X_test'), save(y_test, 'out/y_test')
#print(X_train.shape, X_train_org.shape, X_train.shape == X_train_org.shape)
#print((X_train == X_train_org).mean())
#print(y_train.shape, y_train_org.shape, y_train.shape == y_train_org.shape)
#print((y_train == y_train_org).mean())
#print(X_test.shape, X_test_org.shape, X_test.shape == X_test_org.shape)
#print((X_test == X_test_org).mean())
#print(y_test.shape, y_test_org.shape, y_test.shape == y_test_org.shape)
#print((y_test == y_test_org).mean())
#@njit('uint16[:](uint8[:, :, :], uint8[:, :, :])')
#def arg_find(X, X_org):
# arg = np.empty(X.shape[0], dtype = np.uint16)
# for i, x in enumerate(X_org):
# found = False
# for j, x_org in enumerate(X):
# if np.all(x == x_org):
# arg[i] = j
# found = True
# break
# assert found, "Image not found"
# return arg
#print("Arg find results train")
#arg_train = arg_find(X_train, X_train_org)
#print((X_train[arg_train] == X_train_org).mean())
#print((y_train[arg_train] == y_train_org).mean())
#print("Arg find results test")
#arg_test = arg_find(X_test, X_test_org)
#print((X_test[arg_test] == X_test_org).mean())
#print((y_test[arg_test] == y_test_org).mean())
#for i in tqdm(range(X_c.shape[0]), leave = False, desc = "Extract image"):
# x = X_c[i]
# y = Y_c[i]
# fig = plt.figure()
# plt.imshow(x, cmap = 'gray')
# plt.savefig(f"imgs2/{y}/{i}.png")
# plt.close(fig)
@ -1,154 +1,223 @@
from typing import Any, Callable, List, Union
from typing import Any, Callable, List, Union, Final
from time import perf_counter_ns
from numba import njit
import numpy as np
from sys import stderr
import pickle
import os
from config import MODEL_DIR, OUT_DIR
from config import MODEL_DIR, OUT_DIR, __DEBUG
from decorators import njit
formats = ["ns", "µs", "ms", "s", "m", "h", "j", "w", "M", "y"]
nb = np.array([1, 1000, 1000, 1000, 60, 60, 24, 7, 4, 12], dtype = np.uint16)
def formatted_row(gaps: list[int], titles: list[str], separator: str = '') -> None:
"""Print a formatted row of titles with of gaps seperated by a separator.
Args:
gaps: List of size gaps
titles: List of titles
separator: Separator character between each gap
"""
for gap, title in zip(gaps, titles):
print(f"{separator} {title:{'>' if gap < 0 else '<'}{abs(gap)}} ", end = '')
print(separator)
def formatted_line(gaps: list[int], left: str, middle: str, separator: str, right: str) -> None:
"""Print a formatted line of repeated characters.
Args:
gaps: List of size gaps
left: Character on the left
middle: Character between each separator
separator: Separator character between each gap
right: Character on the right
"""
print(left, end = '')
last_gap = len(gaps) - 1
for i, gap in enumerate(gaps):
print(f'{separator * (abs(gap) + 2)}', end = '')
if i != last_gap:
print(middle, end = '')
print(right)
def header(gaps: list[int], titles: list[str]) -> None:
"""Print a formatted header with the given titles and sizes.
Args:
gaps: List of size gaps
titles: List of titles
"""
formatted_line(gaps, '', '', '', '')
formatted_row(gaps, titles)
formatted_line(gaps, '', '', '', '')
def footer(gaps: list[int]) -> None:
"""Print a formatted footer with the given sizes.
Args:
gaps: List of size gaps
"""
formatted_line(gaps, '', '', '', '')
time_formats: Final = ['ns', 'µs', 'ms', 's', 'm', 'h', 'j', 'w', 'M', 'y', 'c']
time_numbers: Final = np.array([1, 1e3, 1e6, 1e9, 6e10, 36e11, 864e11, 6048e11, 26784e11, 31536e12, 31536e14], dtype = np.uint64)
@njit('str(uint64)')
def format_time_ns(time: int) -> str:
"""Format the time in nanoseconds in human readable format.
Args:
time (int): Time in nanoseconds.
time (int): Time in nanoseconds
Returns:
str: The formatted human readable string.
str: The formatted human readable string
"""
assert time >= 0, "Incorrect time stamp"
assert time >= 0, 'Incorrect time stamp'
if time == 0:
return "0ns"
prod = nb.prod(dtype = np.uint64)
return '0ns'
s = ""
for i in range(nb.shape[0])[::-1]:
if time >= prod:
res = int(time // prod)
time = time % prod
s += f"{res}{formats[i]} "
prod = prod // nb[i]
s = ''
for i in range(time_numbers.shape[0])[::-1]:
if time >= time_numbers[i]:
res = int(time // time_numbers[i])
time = time % time_numbers[i]
s += f'{res}{time_formats[i]} '
assert time == 0, "Leftover in formatting time !"
assert time == 0, 'Leftover in formatting time!'
return s.rstrip()
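Sample outputs, taken from the unit tests added in this changeset (note 'j', jour, is the day suffix):
assert format_time_ns(0) == '0ns'
assert format_time_ns(int(36e11)) == '1h'
assert format_time_ns(int(1e15)) == '1w 4j 13h 46m 40s'
assert format_time_ns(int(31536e14)) == '1c'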
def toolbox_unit_test() -> None:
# FIXME Move unit test to different file
assert "0ns" == format_time_ns(0)
assert "1ns" == format_time_ns(1)
assert "1µs" == format_time_ns(int(1e3))
assert "1ms" == format_time_ns(int(1e6))
assert "1s" == format_time_ns(int(1e9))
assert "1m" == format_time_ns(int(6e10))
assert "1h" == format_time_ns(int(36e11))
assert "1j" == format_time_ns(int(864e11))
assert "1w" == format_time_ns(int(6048e11))
assert "1M" == format_time_ns(int(24192e11))
assert "1y" == format_time_ns(int(290304e11))
# UINT64_MAX == 2^64 = 18446744073709551615 == -1
assert "635y 5M 3j 23h 34m 33s 709ms 551µs 616ns" == format_time_ns(2**64)
@njit('str(uint64)')
def format_time(time: int) -> str:
"""Format the time in seconds in human readable format.
def picke_multi_loader(filenames: List[str], save_dir: str = MODEL_DIR) -> List[Any]:
Args:
time (int): Time in seconds
Returns:
str: The formatted human readable string
"""
assert time >= 0, 'Incorrect time stamp'
if time == 0:
return '0s'
s = ''
for i in range(3, time_numbers.shape[0])[::-1]:
time_number = time_numbers[i] / int(1e9)
if time >= time_number:
res = int(time // time_number)
time = time % time_number
s += f'{res}{time_formats[i]} '
assert time == 0, 'Leftover in formatting time!'
return s.rstrip()
def pickle_multi_loader(filenames: List[str], save_dir: str = MODEL_DIR) -> List[Any]:
"""Load multiple pickle data files.
Args:
filenames (List[str]): List of all the filename to load.
save_dir (str, optional): Path of the files to load. Defaults to MODELS_DIR (see config.py).
filenames (List[str]): List of all the filenames to load
save_dir (str, optional): Path of the files to load. Defaults to MODELS_DIR (see config.py)
Returns:
List[Any]. List of loaded pickle data files.
List[Any]: List of loaded pickle data files
"""
b = []
for f in filenames:
filepath = f"{save_dir}/{f}.pkl"
filepath = f'{save_dir}/{f}.pkl'
if os.path.exists(filepath):
with open(filepath, "rb") as filebyte:
b.append(pickle.load(filebyte))
with open(filepath, 'rb') as file_bytes:
b.append(pickle.load(file_bytes))
else:
b.append(None)
return b
def benchmark_function(step_name: str, fnc: Callable) -> Any:
"""Benchmark a function and display the result of stdout.
def benchmark_function(step_name: str, column_width: int, fnc: Callable) -> Any:
"""Benchmark a function and display the result in stdout.
Args:
step_name (str): Name of the function to call.
fnc (Callable): Function to call.
step_name (str): Name of the function to call
fnc (Callable): Function to call
Returns:
Any: Result of the function.
Any: Result of the function
"""
print(f"{step_name}...", end = "\r")
print(f'{step_name}...', file = stderr, end = '\r')
s = perf_counter_ns()
b = fnc()
e = perf_counter_ns() - s
print(f"| {step_name:<49} | {e:>18,} | {format_time_ns(e):<29} |")
print(f' {step_name:<{column_width}} {e:>18,} {format_time_ns(e):<29}')
return b
def state_saver(step_name: str, filename: Union[str, List[str]], fnc, force_redo: bool = False, save_state: bool = True, save_dir: str = OUT_DIR) -> Any:
def state_saver(step_name: str, column_width: int, filename: Union[str, List[str]], fnc, force_redo: bool = False,
save_state: bool = True, save_dir: str = OUT_DIR) -> Any:
"""Either execute a function then saves the result or load the already existing result.
Args:
step_name (str): Name of the function to call.
filename (Union[str, List[str]]): Name or list of names of the filenames where the result(s) are saved.
fnc ([type]): Function to call.
force_redo (bool, optional): Recall the function even if the result(s) is already saved. Defaults to False.
save_dir (str, optional): Path of the directory to save the result(s). Defaults to OUT_DIR (see config.py).
step_name (str): Name of the function to call
filename (Union[str, List[str]]): Name or list of names of the filenames where the result(s) are saved
fnc (Callable): Function to call
force_redo (bool, optional): Recall the function even if the result(s) is already saved. Defaults to False
save_dir (str, optional): Path of the directory to save the result(s). Defaults to OUT_DIR (see config.py)
Returns:
Any: The result(s) of the called function
"""
if isinstance(filename, str):
if not os.path.exists(f"{save_dir}/{filename}.pkl") or force_redo:
b = benchmark_function(step_name, fnc)
if not os.path.exists(f'{save_dir}/{filename}.pkl') or force_redo:
b = benchmark_function(step_name, column_width, fnc)
if save_state:
print(f"Saving results of {step_name}", end = '\r')
with open(f"{save_dir}/{filename}.pkl", 'wb') as f:
print(f'Saving results of {step_name}', file = stderr, end = '\r')
with open(f'{save_dir}/{filename}.pkl', 'wb') as f:
pickle.dump(b, f)
print(' ' * 100, end = '\r')
print(' ' * 100, file = stderr, end = '\r')
return b
else:
print(f"Loading results of {step_name}", end = '\r')
with open(f"{save_dir}/{filename}.pkl", "rb") as f:
print(f'Loading results of {step_name}', file = stderr, end = '\r')
with open(f'{save_dir}/{filename}.pkl', 'rb') as f:
res = pickle.load(f)
print(f"| {step_name:<49} | {'None':>18} | {'loaded saved state':<29} |")
print(f" {step_name:<{column_width}} {'None':>18} {'loaded saved state':<29} ")
return res
elif isinstance(filename, list):
abs = False
for fn in filename:
if not os.path.exists(f"{save_dir}/{fn}.pkl"):
if not os.path.exists(f'{save_dir}/{fn}.pkl'):
abs = True
break
if abs or force_redo:
b = benchmark_function(step_name, fnc)
b = benchmark_function(step_name, column_width, fnc)
if save_state:
print(f"Saving results of {step_name}", end = '\r')
print(f'Saving results of {step_name}', file = stderr, end = '\r')
for bi, fnI in zip(b, filename):
with open(f"{save_dir}/{fnI}.pkl", 'wb') as f:
with open(f'{save_dir}/{fnI}.pkl', 'wb') as f:
pickle.dump(bi, f)
print(' ' * 100, end = '\r')
print(' ' * 100, file = stderr, end = '\r')
return b
print(f"| {step_name:<49} | {'None':>18} | {'loaded saved state':<29} |")
print(f" {step_name:<{column_width}} {'None':>18} {'loaded saved state':<29} ")
b = []
print(f"Loading results of {step_name}", end = '\r')
print(f'Loading results of {step_name}', file = stderr, end = '\r')
for fn in filename:
with open(f"{save_dir}/{fn}.pkl", "rb") as f:
with open(f'{save_dir}/{fn}.pkl', 'rb') as f:
b.append(pickle.load(f))
print(' ' * 100, end = '\r')
print(' ' * 100, file = stderr, end = '\r')
return b
else:
assert False, f"Incompatible filename type = {type(filename)}"
assert False, f'Incompatible filename type = {type(filename)}'
@njit('boolean(int32[:, :], uint16[:, :])')
def unit_test_argsort_2d(arr: np.ndarray, indices: np.ndarray) -> bool:
"""Test if a given 2D array of indices sort a given 2D array.
Args:
arr (np.ndarray): 2D Array of data
indices (np.ndarray): 2D Indices that sort the array
Returns:
bool: Whether the test was successful
"""
n = indices.shape[0]
total = indices.shape[0] * indices.shape[1]
for i, sub_indices in enumerate(indices):
for j in range(sub_indices.shape[0] - 1):
if arr[i, sub_indices[j]] <= arr[i, sub_indices[j + 1]]:
n += 1
if n != total:
print(n, total, n / (total))
if __DEBUG:
if n != total:
print(n, total, n / (total))
return n == total
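Example: the checker accepts any valid (nondecreasing) ordering, so numpy's own argsort passes, provided the dtypes match the njit signature:
import numpy as np

a = np.random.randint(-100, 100, (8, 50)).astype(np.int32)
indices = np.argsort(a, axis = 1).astype(np.uint16)
assert unit_test_argsort_2d(a, indices)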
python/toolbox_unit_test.py Normal file
@ -0,0 +1,133 @@
from toolbox import format_time, format_time_ns
from typing import Any
def Assert(name: str, expected: Any, result: Any):
"""Test if a given result is equal of the expected one and log result
Args:
name (str): name of the unit test
expected (Any): expected result of the function call
result (Any): result of the function
"""
if expected != result:
print(f"For test name {name} Expected '{expected}' but got '{result}' instead")
assert False
def format_time_test() -> None:
"""Test suite for the format_time output
See https://en.wikipedia.org/wiki/Unit_of_time for details
"""
Assert('format_time null', '0s', format_time(0))
Assert('format_time second', '1s', format_time(1))
Assert('format_time decasecond', '10s', format_time(10))
Assert('format_time minute', '1m', format_time(60))
Assert('format_time milliday', '1m 26s', format_time(86)) # missing 0.4s due to precision
Assert('format_time hectosecond', '1m 40s', format_time(100))
Assert('format_time kilosecond', '16m 40s', format_time(int(1e3)))
Assert('format_time hour', '1h', format_time(3600))
Assert('format_time day', '1j', format_time(86400))
Assert('format_time week/sennight', '1w', format_time(604800))
Assert('format_time megasecond', '1w 4j 13h 46m 40s', format_time(int(1e6)))
Assert('format_time fortnight', '2w', format_time(1209600))
Assert('format_time lunar month (draconitic)', '3w 6j 5h 5m 35s', format_time(2351135)) # missing 0.8 due to precision
Assert('format_time lunar month (tropical)', '3w 6j 7h 43m 4s', format_time(2360584)) # missing 0.7 due to precision
Assert('format_time lunar month (sidereal)', '3w 6j 7h 43m 11s', format_time(2360591)) # missing 0.6 due to precision
Assert('format_time lunar month (anomalistic)', '3w 6j 13h 18m 33s', format_time(2380713)) # missing 0.2 due to precision
Assert('format_time lunar month (synodic)', '4w 1j 12h 44m 2s', format_time(2551442)) # missing 0.9 due to precision
Assert('format_time month', '1M', format_time(2678400))
Assert('format_time quarantine', '1M 1w 2j', format_time(int(3456e3)))
Assert('format_time semester', '4M 2j', format_time(10886400))
Assert('format_time lunar year', '11M 1w 6j 8h 52m 48s', format_time(30617568))
Assert('format_time year', '1y', format_time(int(31536e3)))
Assert('format_time tropical year', '1y 5h 48m 45s', format_time(31556925)) # missing 0.216 due to precision
Assert('format_time gregorian year', '1y 5h 49m 12s', format_time(31556952))
Assert('format_time sidereal year', '1y 6h 9m 9s', format_time(31558149)) # missing 0.7635456 due to precision
Assert('format_time leap year', '1y 1j', format_time(31622400))
Assert('format_time olympiad', '4y', format_time(int(126144e3)))
Assert('format_time lustrum', '5y', format_time(int(15768e4)))
Assert('format_time decade', '10y', format_time(int(31536e4)))
Assert('format_time indiction', '15y', format_time(int(47304e4)))
Assert('format_time score', '20y', format_time(int(63072e4)))
Assert('format_time gigasecond', '31y 8M 1w 4j 1h 46m 40s', format_time(int(1e9)))
Assert('format_time jubilee', '50y', format_time(int(15768e5)))
Assert('format_time century', '1c', format_time(int(31536e5)))
Assert('format_time millennium', '10c', format_time(int(31536e6)))
Assert('format_time age', '257c 72y', format_time(int(812745792e3)))
Assert('format_time terasecond', '3170c 97y 10M 3w 4j 17h 46m 40s', format_time(int(1e13)))
Assert('format_time megaannum', '10000c', format_time(int(31536e9)))
Assert('format_time petasecond', '317097c 91y 11M 2w 4j 1h 46m 40s', format_time(int(1e15)))
Assert('format_time galactic year', '2300000c', format_time(int(725328e10)))
Assert('format_time eon', '10000000c', format_time(int(31536e12)))
Assert('format_time kalpa', '43200000c', format_time(int(13623552e10)))
Assert('format_time exasecond', '317097919c 83y 9M 1h 46m 40s', format_time(int(1e18)))
# Cannot use numbers bigger than the uint64 range currently supported
# Assert('format_time zettasecond', '', format_time(1e21))
# Assert('format_time yottasecond', '', format_time(1e24))
# Assert('format_time ronnasecond', '', format_time(1e27))
# Assert('format_time quettasecond', '', format_time(1e30))
# uint64_t_MAX == 2**64 - 1 == 18446744073709551615 == -1
Assert('format_time max', '5849424173c 55y 3w 5j 7h 16s', format_time(int(2**64 - 1)))
def format_time_ns_test() -> None:
"""Test suite for the format_time_ns output
See https://en.wikipedia.org/wiki/Unit_of_time for details
"""
Assert('format_time_ns null', '0ns', format_time_ns(0))
Assert('format_time_ns nanosecond', '1ns', format_time_ns(1))
Assert('format_time_ns shake', '10ns', format_time_ns(10))
Assert('format_time_ns microsecond', '1µs', format_time_ns(int(1e3)))
Assert('format_time_ns millisecond', '1ms', format_time_ns(int(1e6)))
Assert('format_time_ns centisecond', '10ms', format_time_ns(int(1e7)))
Assert('format_time_ns decisecond', '100ms', format_time_ns(int(1e8)))
Assert('format_time_ns second', '1s', format_time_ns(int(1e9)))
Assert('format_time_ns decasecond', '10s', format_time_ns(int(1e10)))
Assert('format_time_ns minute', '1m', format_time_ns(int(6e10)))
Assert('format_time_ns milliday', '1m 26s 400ms', format_time_ns(int(864e8)))
Assert('format_time_ns hectosecond', '1m 40s', format_time_ns(int(1e11)))
Assert('format_time_ns kilosecond', '16m 40s', format_time_ns(int(1e12)))
Assert('format_time_ns hour', '1h', format_time_ns(int(36e11)))
Assert('format_time_ns day', '1j', format_time_ns(int(864e11)))
Assert('format_time_ns week/sennight', '1w', format_time_ns(int(6048e11)))
Assert('format_time_ns megasecond', '1w 4j 13h 46m 40s', format_time_ns(int(1e15)))
Assert('format_time_ns fortnight', '2w', format_time_ns(int(12096e11)))
Assert('format_time_ns lunar month (draconitic)', '3w 6j 5h 5m 35s 800ms', format_time_ns(int(23511358e8)))
Assert('format_time_ns lunar month (tropical)', '3w 6j 7h 43m 4s 700ms', format_time_ns(int(23605847e8)))
Assert('format_time_ns lunar month (sidereal)', '3w 6j 7h 43m 11s 600ms', format_time_ns(int(23605916e8)))
Assert('format_time_ns lunar month (anomalistic)', '3w 6j 13h 18m 33s 200ms', format_time_ns(int(23807132e8)))
Assert('format_time_ns lunar month (synodic)', '4w 1j 12h 44m 2s 900ms', format_time_ns(int(25514429e8)))
Assert('format_time_ns month', '1M', format_time_ns(int(26784e11)))
Assert('format_time_ns quarantine', '1M 1w 2j', format_time_ns(int(3456e12)))
Assert('format_time_ns semester', '4M 2j', format_time_ns(int(108864e11)))
Assert('format_time_ns lunar year', '11M 1w 6j 8h 52m 48s', format_time_ns(int(30617568e9)))
Assert('format_time_ns year', '1y', format_time_ns(int(31536e12)))
Assert('format_time_ns tropical year', '1y 5h 48m 45s 216ms', format_time_ns(int(31556925216e6)))
Assert('format_time_ns gregorian year', '1y 5h 49m 12s', format_time_ns(int(31556952e9)))
Assert('format_time_ns sidereal year', '1y 6h 9m 9s 763ms 545µs 600ns', format_time_ns(int(315581497635456e2)))
Assert('format_time_ns leap year', '1y 1j', format_time_ns(int(316224e11)))
Assert('format_time_ns olympiad', '4y', format_time_ns(int(126144e12)))
Assert('format_time_ns lustrum', '5y', format_time_ns(int(15768e13)))
Assert('format_time_ns decade', '10y', format_time_ns(int(31536e13)))
Assert('format_time_ns indiction', '15y', format_time_ns(int(47304e13)))
Assert('format_time_ns score', '20y', format_time_ns(int(63072e13)))
Assert('format_time_ns gigasecond', '31y 8M 1w 4j 1h 46m 40s', format_time_ns(int(1e18)))
Assert('format_time_ns jubilee', '50y', format_time_ns(int(15768e14)))
Assert('format_time_ns century', '1c', format_time_ns(int(31536e14)))
# Python int too large to convert to C long
# Assert('format_time_ns millennium', '10c', format_time_ns(int(31536e15)))
# Assert('format_time_ns age', '257c 72y', format_time_ns(int(812745792e12)))
# Assert('format_time_ns terasecond', '3170c 97y 10M 3w 4j 17h 46m 40s', format_time_ns(int(1e22)))
# Assert('format_time_ns megaannum', '10000c', format_time_ns(int(31536e18)))
# Cannot use numbers bigger than currently supported by the implementation
# Assert('format_time_ns petasecond', '317097c 91y 11M 2w 4j 1h 46m 40s', format_time_ns(int(1e24)))
# Assert('format_time_ns galactic year', '2300000c', format_time_ns(int(725328e19)))
# Assert('format_time_ns eon', '10000000c', format_time_ns(int(31536e21)))
# Assert('format_time_ns kalpa', '43200000c', format_time_ns(int(13623552e19)))
# Assert('format_time_ns exasecond', '317097919c 83y 9M 1h 46m 40s', format_time_ns(int(1e27)))
# Assert('format_time_ns zettasecond', '', format_time_ns(int(1e30)))
# Assert('format_time_ns yottasecond', '', format_time_ns(int(1e33)))
# Assert('format_time_ns ronnasecond', '', format_time_ns(int(1e36)))
# Assert('format_time_ns quettasecond', '', format_time_ns(int(1e39)))
# uint64_t_MAX == 2**64 - 1 == 18446744073709551615 == (uint64_t)-1
Assert('format_time_ns max', '5c 84y 11M 2j 23h 34m 33s 709ms 551µs 615ns', format_time_ns(2**64 - 1))
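
# A matching sketch for the nanosecond variant, reusing format_time_sketch
# above for the whole-second part and handling sub-second units separately.
# Again an assumption-labelled illustration, not the repository's
# format_time_ns (which, per the commented-out cases, appears to be bound
# to 64-bit integers).
def format_time_ns_sketch(ns: int) -> str:
    seconds, rest = divmod(ns, 10**9)
    parts = [format_time_sketch(seconds)] if seconds else []
    for suffix, size in (('ms', 10**6), ('µs', 10**3), ('ns', 1)):
        value, rest = divmod(rest, size)
        if value:
            parts.append(f'{value}{suffix}')
    return ' '.join(parts) if parts else '0ns'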