diff --git a/.gitignore b/.gitignore new file mode 100755 index 0000000..d1c03f2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +data +*/models* +venv +*/out* +python/__pycache__ +cpp/bin diff --git a/README.fr.md b/README.fr.md new file mode 100755 index 0000000..c78048e --- /dev/null +++ b/README.fr.md @@ -0,0 +1,211 @@ +# Viola Jones + +*Lisez ceci dans d'autres langues: [English](README.md)* + +## Description + +Implémentation de l'algorithme "Viola Jones" en Python et C++. + +## Dépendances + +- Python +- pip +- Bash +- Make +- curl +- tar +- Cuda toolkit +- Cudnn + +## Utilisation + +### C++ + +Vous pouvez configurer l'algorithme avec les variables globales définies au début du fichier *ViolaJones.cpp* puis lancer 'make start'. + +Il y a également la commande 'make clean' qui permet de supprimer tout fichiers compilées. + +### Python + +Vous pouvez configurer l'algorithme dans le fichier *config.py* puis lancer l'algorithme avec 'make start'. + +**Note : Le script téléchargera automatiquement le set de données.** +**Note : Vous pouvez supprimer la sauvegardes de tout résultat avec la commande 'make reset'** + +## Entraînement + +L'algorithme à été entraîné avec un processeur Intel(R) Core(TM) i7-7700K CPU @ 4.20GHz et un GPU NVIDIA GeForce RTX 2080 Ti. + +### Tableau de comparaison des temps d'exécution + +R = Temps CPU / Temps GPU ou NJIT +Si R >= 1 alors le temps est R fois plus rapide que CPU. +Si R < 1 alors le temps est R^-1 fois plus lent que CPU. + +Il se trouve que le GPU bat systématiquement le CPU en matière de temps d'exécution, alors les chiffres indiqués sont les ratios R. + +| Preprocessing | GPU | NJIT | +| ------------------------------------------ | ------- | ------ | +| Converting training set to integral images | 12.62 | 280.78 | +| Converting testing set to integral images | 15.90 | 251.18 | +| Applying features to training set | 3252.38 | 191.87 | +| Applying features to testing set | 3204.09 | 114.48 | + +| Training | GPU | NJIT | +| ------------------ | ----- | ------ | +| ViolaJones T = 1 | 40.29 | 25.08 | +| ViolaJones T = 5 | 64.64 | 124.17 | +| ViolaJones T = 10 | 64.98 | 121.03 | +| ViolaJones T = 25 | 67.65 | 126.69 | +| ViolaJones T = 50 | 67.35 | 128.80 | +| ViolaJones T = 100 | 66.86 | 128.31 | +| ViolaJones T = 200 | 65.92 | 126.71 | +| ViolaJones T = 300 | 65.47 | 124.91 | + +## Évaluation + +L'algorithme de ViolaJones étant déterministe, tous les modèles entraînés avec un T donnée, peu importe le moyen (CPU, NJIT ou GPU), seront les mêmes modèles avec les mêmes paramètres. + +Rappel: ACC (Accuracy i.e. Précision), F1 (Score F1), FN (Faux Négatif) et FP (Faux Positif). + +| Evaluating | ACC (E) | F1 (E) | FN (E) | FP (E) | ACC (T) | F1 (T) | FN (T) | FP (T) | +| ------------------ | ------- | ------ | ------ | ------ | ------- | ------ | ------ | ------ | +| ViolaJones T = 1 | 86.37% | 0.82 | 753 | 198 | 75.64% | 0.09 | 5,662 | 196 | +| ViolaJones T = 5 | 85.27% | 0.77 | 318 | 710 | 91.86% | 0.09 | 1,582 | 375 | +| ViolaJones T = 10 | 86.01% | 0.80 | 545 | 431 | 93.34% | 0.13 | 1,248 | 354 | +| ViolaJones T = 25 | 92.06% | 0.89 | 373 | 181 | 93.79% | 0.19 | 1,201 | 292 | +| ViolaJones T = 50 | 94.20% | 0.92 | 239 | 166 | 96.23% | 0.25 | 588 | 319 | +| ViolaJones T = 100 | 95.41% | 0.93 | 152 | 168 | 96.54% | 0.22 | 479 | 352 | +| ViolaJones T = 200 | 96.24% | 0.95 | 133 | 129 | 96.78% | 0.17 | 381 | 394 | +| ViolaJones T = 300 | 96.75% | 0.95 | 94 | 133 | 96.93% | 0.17 | 343 | 394 | + +## Annexes + +### Temps d'exécution des parties communes + +| Preprocessing | Time spent (ns) | Formatted time spent | +| ----------------------------------- | --------------- | -------------------- | +| Compiling NJIT and GPU | 6,315,144,200 | 6s 315ms 144µs 200ns | +| Loading sets | 155,582,900 | 155ms 582µs 900ns | +| Building features | 292,216,000 | 292ms 216µs | +| Selecting best features | 3,956,470,800 | 3s 956ms 470µs 800ns | +| Precalculating training set argsort | 1,356,386,000 | 1s 356ms 386µs | +| Precalculating testing set argsort | 4,766,277,500 | 4s 766ms 277µs 500ns | + +### Test unitaires + +Les tests unitaires consistent en la vérification de l'égalité des fichiers résultant des procédés (CPU == NJIT == GPU). +L'algorithme de ViolaJones étant déterministe, les fichiers devraient être égaux (au mieux que le permet la virgule flottante). + +| Unit testing | Test state | Time spent (ns) | Formatted time spent | +| --------------------- | ---------- | --------------- | ------------------------ | +| X_train_feat | Passed | 10,527,171,100 | 10s 527ms 171µs 100ns | +| X_test_feat | Passed | 86,698,306,700 | 1m 26s 698ms 306µs 700ns | +| X_train_ii | Passed | 1,120,532,600 | 1s 120ms 532µs 600ns | +| X_test_ii | Passed | 589,468,800 | 589ms 468µs 800ns | +| alphas_1 | Passed | 15,958,700 | 15ms 958µs 700ns | +| final_classifiers_1 | Passed | 13,961,300 | 13ms 961µs 300ns | +| alphas_5 | Passed | 41,888,300 | 41ms 888µs 300ns | +| final_classifiers_5 | Passed | 23,936,300 | 23ms 936µs 300ns | +| alphas_10 | Passed | 62,881,900 | 62ms 881µs 900ns | +| final_classifiers_10 | Passed | 82,882,100 | 82ms 882µs 100ns | +| alphas_25 | Passed | 11,495,300 | 11ms 495µs 300ns | +| final_classifiers_25 | Passed | 62,827,900 | 62ms 827µs 900ns | +| alphas_50 | Passed | 3,987,200 | 3ms 987µs 200ns | +| final_classifiers_50 | Passed | 46,897,900 | 46ms 897µs 900ns | +| alphas_100 | Passed | 2,991,400 | 2ms 991µs 400ns | +| final_classifiers_100 | Passed | 100,732,100 | 100ms 732µs 100ns | +| alphas_200 | Passed | 6,979,900 | 6ms 979µs 900ns | +| final_classifiers_200 | Passed | 2,991,600 | 2ms 991µs 600ns | +| alphas_300 | Passed | 50,862,500 | 50ms 862µs 500ns | +| final_classifiers_300 | Passed | 997,400 | 997µs 400ns | + +### Temps d'exécution du CPU + +| Preprocessing | Time spent (ns) | Formatted time spent | +| ------------------------------------------------ | ----------------- | ---------------------------- | +| Converting training set to integral images (CPU) | 1,120,022,400 | 1s 120ms 22µs 400ns | +| Converting testing set to integral images (CPU) | 3,757,517,900 | 3s 757ms 517µs 900ns | +| Applying features to training set (CPU) | 2,607,923,836,600 | 43m 27s 923ms 836µs 600ns | +| Applying features to testing set (CPU) | 8,910,858,819,100 | 2h 28m 30s 858ms 819µs 100ns | + +| Training | Time spent (ns) | Formatted time spent | +| ------------------------ | ----------------- | ---------------------------- | +| ViolaJones T = 1 (CPU) | 32,948,442,200 | 32s 948ms 442µs 200ns | +| ViolaJones T = 5 (CPU) | 159,626,648,000 | 2m 39s 626ms 648µs | +| ViolaJones T = 10 (CPU) | 315,165,752,800 | 5m 15s 165ms 752µs 800ns | +| ViolaJones T = 25 (CPU) | 773,419,206,100 | 12m 53s 419ms 206µs 100ns | +| ViolaJones T = 50 (CPU) | 1,531,656,252,200 | 25m 31s 656ms 252µs 200ns | +| ViolaJones T = 100 (CPU) | 3,056,693,435,300 | 50m 56s 693ms 435µs 300ns | +| ViolaJones T = 200 (CPU) | 6,093,482,072,800 | 1h 41m 33s 482ms 72µs 800ns | +| ViolaJones T = 300 (CPU) | 9,139,635,975,200 | 2h 32m 19s 635ms 975µs 200ns | + +| Testing | Time spent (ns) (E) | Formatted time spent (E) | Time spent (ns) (T) | Formatted time spent (T) | +| ------------------------ | ------------------- | ------------------------ | ------------------- | ------------------------ | +| ViolaJones T = 1 (CPU) | 0 | <1ns | 997,200 | 997µs 200ns | +| ViolaJones T = 5 (CPU) | 997,100 | 997µs 100ns | 997,700 | 997µs 700ns | +| ViolaJones T = 10 (CPU) | 997,700 | 997µs 700ns | 2,992,200 | 2ms 992µs 200ns | +| ViolaJones T = 25 (CPU) | 1,994,600 | 1ms 994µs 600ns | 4,986,800 | 4ms 986µs 800ns | +| ViolaJones T = 50 (CPU) | 3,989,400 | 3ms 989µs 400ns | 11,968,000 | 11ms 968µs | +| ViolaJones T = 100 (CPU) | 6,981,500 | 6ms 981µs 500ns | 18,949,800 | 18ms 949µs 800ns | +| ViolaJones T = 200 (CPU) | 12,965,800 | 12ms 965µs 800ns | 36,902,700 | 36ms 902µs 700ns | +| ViolaJones T = 300 (CPU) | 17,951,800 | 17ms 951µs 800ns | 56,848,300 | 56ms 848µs 300ns | + +### Temps d'exécution du GPU + +| Preprocessing | Time spent (ns) | Formatted time spent | +| ------------------------------------------------ | --------------- | -------------------- | +| Converting training set to integral images (GPU) | 88,759,800 | 88ms 759µs 800ns | +| Converting testing set to integral images (GPU) | 236,366,600 | 236ms 366µs 600ns | +| Applying features to training set (GPU) | 801,849,700 | 801ms 849µs 700ns | +| Applying features to testing set (GPU) | 2,781,090,300 | 2s 781ms 90µs 300ns | + +| Training | Time spent (ns) | Formatted time spent | +| ------------------------ | --------------- | ------------------------ | +| ViolaJones T = 1 (GPU) | 817,811,700 | 817ms 811µs 700ns | +| ViolaJones T = 5 (GPU) | 2,469,417,100 | 2s 469ms 417µs 100ns | +| ViolaJones T = 10 (GPU) | 4,850,067,700 | 4s 850ms 67µs 700ns | +| ViolaJones T = 25 (GPU) | 11,432,447,600 | 11s 432ms 447µs 600ns | +| ViolaJones T = 50 (GPU) | 22,742,326,800 | 22s 742ms 326µs 800ns | +| ViolaJones T = 100 (GPU) | 45,714,804,900 | 45s 714ms 804µs 900ns | +| ViolaJones T = 200 (GPU) | 92,438,265,000 | 1m 32s 438ms 265µs | +| ViolaJones T = 300 (GPU) | 139,605,228,600 | 2m 19s 605ms 228µs 600ns | + +### Temps d'exécution du CPU compilé avec NJIT + +| Preprocessing | Time spent (ns) | Formatted time spent | +| ------------------------------------------------- | --------------- | ------------------------ | +| Converting training set to integral images (NJIT) | 3,989,000 | 3ms 989µs | +| Converting testing set to integral images (NJIT) | 14,959,600 | 14ms 959µs 600ns | +| Applying features to training set (NJIT) | 13,592,361,400 | 13s 592ms 361µs 400ns | +| Applying features to testing set (NJIT) | 77,834,323,700 | 1m 17s 834ms 323µs 700ns | + +| Training | Time spent (ns) | Formatted time spent | +| ------------------------- | --------------- | ------------------------ | +| ViolaJones T = 1 (NJIT) | 1,313,497,300 | 1s 313ms 497µs 300ns | +| ViolaJones T = 5 (NJIT) | 1,285,571,700 | 1s 285ms 571µs 700ns | +| ViolaJones T = 10 (NJIT) | 2,604,081,500 | 2s 604ms 81µs 500ns | +| ViolaJones T = 25 (NJIT) | 6,104,721,700 | 6s 104ms 721µs 700ns | +| ViolaJones T = 50 (NJIT) | 11,891,281,600 | 11s 891ms 281µs 600ns | +| ViolaJones T = 100 (NJIT) | 23,822,338,800 | 23s 822ms 338µs 800ns | +| ViolaJones T = 200 (NJIT) | 48,089,174,900 | 48s 89ms 174µs 900ns | +| ViolaJones T = 300 (NJIT) | 73,169,668,200 | 1m 13s 169ms 668µs 200ns | + +| Testing | Time spent (ns) (E) | Formatted time spent (E) | Time spent (ns) (T) | Formatted time spent (T) | +| ------------------------- | ------------------- | ------------------------ | ------------------- | ------------------------ | +| ViolaJones T = 1 (NJIT) | 0 | <1ns | 997,900 | 997µs 900ns | +| ViolaJones T = 5 (NJIT) | 0 | <1ns | 0 | <1ns | +| ViolaJones T = 10 (NJIT) | 0 | <1ns | 997,200 | 997µs 200ns | +| ViolaJones T = 25 (NJIT) | 997,100 | 997µs 100ns | 997,400 | 997µs 400ns | +| ViolaJones T = 50 (NJIT) | 996,800 | 996µs 800ns | 3,989,000 | 3ms 989µs | +| ViolaJones T = 100 (NJIT) | 2,991,900 | 2ms 991µs 900ns | 7,978,900 | 7ms 978µs 900ns | +| ViolaJones T = 200 (NJIT) | 3,989,600 | 3ms 989µs 600ns | 15,957,700 | 15ms 957µs 700ns | +| ViolaJones T = 300 (NJIT) | 5,983,900 | 5ms 983µs 900ns | 23,935,500 | 23ms 935µs 500ns | + +## Resources additionnels + +- [Rapid Object Detection using a Boosted Cascade of Simple Features](https://www.cs.cmu.edu/~efros/courses/LBMV07/Papers/viola-cvpr-01.pdf) +- [Chapter 39. Parallel Prefix Sum (Scan) with CUDA](https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda) +- [Understanding and Implementing the Viola-Jones Image Classification Algorithm](https://medium.datadriveninvestor.com/understanding-and-implementing-the-viola-jones-image-classification-algorithm-85621f7fe20b) + +**2022 Pierre Saunders @saundersp** diff --git a/cpp/Makefile b/cpp/Makefile new file mode 100644 index 0000000..eba79d6 --- /dev/null +++ b/cpp/Makefile @@ -0,0 +1,80 @@ +CC := nvcc -m64 -std=c++17 -ccbin g++-12 -Xcompiler -m64,-std=c++17 +OBJ_DIR := bin +$(shell mkdir -p $(OBJ_DIR)) +MODELS_DIR := models +OUT_DIR := out +SRC_DIR := . +#CFLAGS := -O0 -Werror=all-warnings -g -G +#CFLAGS := $(CFLAGS) -D__DEBUG +#CFLAGS := $(CFLAGS) -pg +#CFLAGS := $(CFLAGS) -Xptxas=-w +#CFLAGS := $(CFLAGS) -Xcompiler -Wall,-O0,-g,-Werror,-Werror=implicit-fallthrough=0,-Wextra,-rdynamic +CFLAGS := -O4 -Xcompiler -O4 +EXEC := $(OBJ_DIR)/ViolaJones +DATA := ../data/X_train.bin ../data/X_test.bin ../data/y_train.bin ../data/y_test.bin +SRC := $(shell find $(SRC_DIR) -name "*.cpp" -o -name "*.cu" ) +OBJ_EXT := o +ifeq ($(OS), Windows_NT) + EXEC:=$(EXEC).exe + OBJ_EXT:=obj +endif +OBJ := $(SRC:$(SRC_DIR)/%.cpp=$(OBJ_DIR)/%.$(OBJ_EXT)) +OBJ := $(OBJ:$(SRC_DIR)/%.cu=$(OBJ_DIR)/%.$(OBJ_EXT)) + +.PHONY: all start reset clean mrproper debug check + +all: $(EXEC) $(DATA) + +# Compiling host code +$(OBJ_DIR)/%.$(OBJ_EXT): $(SRC_DIR)/%.cpp + @echo Compiling $< + @$(CC) $(CFLAGS) -c $< -o $@ + +# Compiling gpu code +$(OBJ_DIR)/%.$(OBJ_EXT): $(SRC_DIR)/%.cu + @echo Compiling $< + @$(CC) $(CFLAGS) -c $< -o $@ + +$(EXEC): $(OBJ) + @echo Linking objects files to $@ + @$(CC) $(CFLAGS) $^ -o $@ + +$(DATA): + @bash ../download_data.sh .. + +start: $(EXEC) $(DATA) + @./$(EXEC) + +profile: start + @gprof $(EXEC) gmon.out | gprof2dot | dot -Tpng -o output.png +#@gprof $(EXEC) gmon.out > analysis.txt + +debug: $(EXEC) $(DATA) + #@cuda-gdb -q $(EXEC) + @gdb -q --tui $(EXEC) + +check: $(EXEC) $(DATA) + @valgrind -q -s --leak-check=full --show-leak-kinds=all $(EXEC) + +cudacheck: $(EXEC) $(DATA) + @cuda-memcheck --destroy-on-device-error kernel --tool memcheck --leak-check full --report-api-errors all $(EXEC) +#@cuda-memcheck --destroy-on-device-error kernel --tool racecheck --racecheck-report all $(EXEC) +#@cuda-memcheck --destroy-on-device-error kernel --tool initcheck --track-unused-memory yes $(EXEC) +#@cuda-memcheck --destroy-on-device-error kernel --tool synccheck $(EXEC) +#@compute-sanitizer --destroy-on-device-error kernel --tool memcheck --leak-check full --report-api-errors all --track-stream-ordered-races all $(EXEC) +#@compute-sanitizer --destroy-on-device-error kernel --tool racecheck --racecheck-detect-level info --racecheck-report all $(EXEC) +#@compute-sanitizer --destroy-on-device-error kernel --tool initcheck --track-unused-memory yes $(EXEC) +#@compute-sanitizer --destroy-on-device-error kernel --tool synccheck $(EXEC) + +r2: $(EXEC) $(DATA) + @r2 $(EXEC) + +reset: + @echo Deleting generated states and models + @rm -rf $(OUT_DIR)/* $(MODELS_DIR)/* | true + +clean: + @rm $(EXEC) + +mrproper: + @rm -r $(OBJ_DIR) diff --git a/cpp/ViolaJones.cpp b/cpp/ViolaJones.cpp new file mode 100644 index 0000000..92f75c3 --- /dev/null +++ b/cpp/ViolaJones.cpp @@ -0,0 +1,296 @@ +#include +#include "data.hpp" +#include "config.hpp" +#include "ViolaJonesGPU.hpp" +#include "ViolaJonesCPU.hpp" + +static inline void add_empty_feature(const np::Array& feats, size_t& n) noexcept { + memset(&feats[n], 0, 4 * sizeof(uint8_t)); + n += 4; +} + +static inline void add_right_feature(const np::Array& feats, size_t& n, const uint16_t& i, const uint16_t& j, const uint16_t& w, const uint16_t& h) noexcept { + feats[n++] = i + w; + feats[n++] = j; + feats[n++] = w; + feats[n++] = h; +} + +static inline void add_immediate_feature(const np::Array& feats, size_t& n, const uint16_t& i, const uint16_t& j, const uint16_t& w, const uint16_t& h) noexcept { + feats[n++] = i; + feats[n++] = j; + feats[n++] = w; + feats[n++] = h; +} + +static inline void add_bottom_feature(const np::Array& feats, size_t& n, const uint16_t& i, const uint16_t& j, const uint16_t& w, const uint16_t& h) noexcept { + feats[n++] = i; + feats[n++] = j + h; + feats[n++] = w; + feats[n++] = h; +} + +static inline void add_right2_feature(const np::Array& feats, size_t& n, const uint16_t& i, const uint16_t& j, const uint16_t& w, const uint16_t& h) noexcept { + feats[n++] = i + 2 * w; + feats[n++] = j; + feats[n++] = w; + feats[n++] = h; +} + +static inline void add_bottom2_feature(const np::Array& feats, size_t& n, const uint16_t& i, const uint16_t& j, const uint16_t& w, const uint16_t& h) noexcept { + feats[n++] = i; + feats[n++] = j + 2 * h; + feats[n++] = w; + feats[n++] = h; +} + +static inline void add_bottom_right_feature(const np::Array& feats, size_t& n, const uint16_t& i, const uint16_t& j, const uint16_t& w, const uint16_t& h) noexcept { + feats[n++] = i + w; + feats[n++] = j + h; + feats[n++] = w; + feats[n++] = h; +} + +np::Array build_features(const uint16_t& width, const uint16_t& height) noexcept { + size_t n = 0; + uint16_t w, h, i, j; + for (w = 1; w < width; ++w) + for (h = 1; h < height; ++h) + for (i = 0; i < width - w; ++i) + for (j = 0; j < height - h; ++j) { + if (i + 2 * w < width) ++n; + if (j + 2 * h < height) ++n; + if (i + 3 * w < width) ++n; + if (j + 3 * h < height) ++n; + if (i + 2 * w < width && j + 2 * h < height) ++n; + } + + np::Array feats = np::empty({ n, 2, 2, 4 }); + + n = 0; + for (w = 1; w < width; ++w) + for (h = 1; h < height; ++h) + for (i = 0; i < width - w; ++i) + for (j = 0; j < height - h; ++j) { + if (i + 2 * w < width) { + add_right_feature(feats, n, i, j, w, h); + add_empty_feature(feats, n); + add_immediate_feature(feats, n, i, j, w, h); + add_empty_feature(feats, n); + } + + if (j + 2 * h < height) { + add_immediate_feature(feats, n, i, j, w, h); + add_empty_feature(feats, n); + add_bottom_feature(feats, n, i, j, w, h); + add_empty_feature(feats, n); + } + + if (i + 3 * w < width) { + add_right_feature(feats, n, i, j, w, h); + add_empty_feature(feats, n); + add_right2_feature(feats, n, i, j, w, h); + add_immediate_feature(feats, n, i, j, w, h); + } + + if (j + 3 * h < height) { + add_bottom_feature(feats, n, i, j, w, h); + add_empty_feature(feats, n); + add_bottom2_feature(feats, n, i, j, w, h); + add_immediate_feature(feats, n, i, j, w, h); + } + + if (i + 2 * w < width && j + 2 * h < height) { + add_right_feature(feats, n, i, j, w, h); + add_bottom_feature(feats, n, i, j, w, h); + add_immediate_feature(feats, n, i, j, w, h); + add_bottom_right_feature(feats, n, i, j, w, h); + } + } + return feats; +} + +//np::Array select_percentile(const np::Array X_feat, const np::Array y) noexcept { +// std::vector class_0, class_1; +// +// const int im_size = X_feat.shape[0] / y.shape[0]; +// int idy = 0, n_samples_per_class_0 = 0, n_samples_per_class_1 = 0; +// for (size_t i = 0; i < X_feat.shape[0]; i += im_size) { +// if (y[idy] == 0) { +// ++n_samples_per_class_0; +// class_0.push_back(static_cast(X_feat[i])); +// } +// else { +// ++n_samples_per_class_1; +// class_1.push_back(static_cast(X_feat[i])); +// } +// ++idy; +// } +// const int n_samples = n_samples_per_class_0 + n_samples_per_class_1; +// +// float64_t ss_alldata_0 = 0; +// for (int i = 0;i < n_samples_per_class_0;++i) +// ss_alldata_0 += (class_0[i] * class_0[i]); +// +// float64_t ss_alldata_1 = 0; +// for (int i = 0;i < n_samples_per_class_1;++i) +// ss_alldata_1 += (class_1[i] * class_1[i]); +// +// const float64_t ss_alldata = ss_alldata_0 + ss_alldata_1; +// +// float64_t sums_classes_0 = 0; +// for (int i = 0;i < n_samples_per_class_0;++i) +// sums_classes_0 += class_0[i]; +// +// float64_t sums_classes_1 = 0; +// for (int i = 0;i < n_samples_per_class_1;++i) +// sums_classes_1 += class_1[i]; +// +// float64_t sq_of_sums_alldata = sums_classes_0 + sums_classes_1; +// sq_of_sums_alldata *= sq_of_sums_alldata; +// +// const float64_t sq_of_sums_args_0 = sums_classes_0 * sums_classes_0; +// const float64_t sq_of_sums_args_1 = sums_classes_1 * sums_classes_1; +// const float64_t ss_tot = ss_alldata - sq_of_sums_alldata / n_samples; +// const float64_t sqd_sum_bw_n = sq_of_sums_args_0 / n_samples_per_class_0 + sq_of_sums_args_1 / n_samples_per_class_1 - sq_of_sums_alldata / n_samples; +// const float64_t ss_wn = ss_tot - sqd_sum_bw_n; +// const int df_wn = n_samples - 2; +// const float64_t msw = ss_wn / df_wn; +// const float64_t f_values = sqd_sum_bw_n / msw; +// +// const np::Array res = np::empty({ static_cast(std::ceil(static_cast(im_size) / 10.0)) }); +// // TODO Complete code +// return res; +//} + +np::Array init_weights(const np::Array& y_train) noexcept { + np::Array weights = np::empty(y_train.shape); + const uint16_t t = np::sum(np::astype(y_train)); + return map(weights, static_cast>( + [&t, &y_train](const size_t& i, const float64_t&) -> float64_t { + return 1.0 / (2 * (y_train[i] == 0 ? t : y_train.shape[0] - t)); + })); +} + +np::Array classify_weak_clf(const np::Array& X_feat_i, const size_t& j, const float64_t& threshold, const float64_t& polarity) noexcept { + np::Array res = np::empty({ X_feat_i.shape[1] }); + for(size_t i = 0; i < res.shape[0]; ++i) + res[i] = polarity * X_feat_i[j * X_feat_i.shape[1] + i] < polarity * threshold ? 1 : 0; + return res; +} + +np::Array classify_viola_jones(const np::Array& alphas, const np::Array& classifiers, const np::Array& X_feat) noexcept { + np::Array total = np::zeros({ X_feat.shape[1] }); + + float64_t clf = 0.0, threshold = 0.0, polarity = 0.0; + for(size_t i = 0; i < alphas.shape[0]; ++i){ + clf = classifiers[i * 3]; threshold = classifiers[i * 3 + 1]; polarity = classifiers[i * 3 + 2]; + //total += alphas * np::astype(classify_weak_clf(X_feat, clf, threshold, polarity)); + const np::Array res = classify_weak_clf(X_feat, clf, threshold, polarity); + for(size_t j = 0; j < res.shape[0]; ++j) + total[j] += alphas[i] * res[j]; + } + + np::Array y_pred = np::empty({ X_feat.shape[1] }); + const float64_t alphas_sum = np::sum(alphas); + for(size_t i = 0; i < X_feat.shape[1]; ++i) + y_pred[i] = total[i] >= 0.5 * alphas_sum ? 1 : 0; + + return y_pred; +} + +std::tuple> select_best(const np::Array& classifiers, const np::Array& weights, const np::Array& X_feat, const np::Array& y) noexcept { + std::tuple> res = { -1, np::inf, np::empty({ X_feat.shape[0] }) }; + + for(size_t j = 0; j < classifiers.shape[0]; ++j){ + const np::Array accuracy = np::abs(np::astype(classify_weak_clf(X_feat, j, classifiers[j * 2], classifiers[j * 2 + 1])) - y); + // const float64_t error = np::mean(weights * accuracy); + float64_t error = 0.0; + for(size_t i = 0; i < weights.shape[0]; ++i) + error += weights[i] * accuracy[i]; + error /= weights.shape[0]; + + if (error < std::get<1>(res)) + res = { j, error, accuracy }; + } + return res; +} + +std::array, 2> train_viola_jones(const size_t& T, const np::Array& X_feat, const np::Array& X_feat_argsort, const np::Array& y) noexcept { + np::Array weights = init_weights(y); + np::Array alphas = np::empty({ T }); + np::Array final_classifier = np::empty({ T, 3 }); + + for(size_t t = 0; t < T; ++t ){ + weights /= np::sum(weights); +#if GPU_BOOSTED + const np::Array classifiers = train_weak_clf_gpu(X_feat, X_feat_argsort, y, weights); +#else + const np::Array classifiers = train_weak_clf_cpu(X_feat, X_feat_argsort, y, weights); +#endif + const auto [ clf, error, accuracy ] = select_best(classifiers, weights, X_feat, y); + float64_t beta = error / (1.0 - error); + weights *= np::pow(beta, (1.0 - accuracy)); + alphas[t] = std::log(1.0 / beta); + final_classifier[t * 3] = clf; + final_classifier[t * 3 + 1] = classifiers[clf * 2]; + final_classifier[t * 3 + 2] = classifiers[clf * 2 + 1]; + } + return { alphas, final_classifier }; +} + +float64_t accuracy_score(const np::Array& y, const np::Array& y_pred) noexcept { + float64_t res = 0.0; + for(size_t i = 0; i < y.shape[0]; ++i) + if(y[i] == y_pred[i]) + ++res; + return res / y.shape[0]; +} + +float64_t precision_score(const np::Array& y, const np::Array& y_pred) noexcept { + uint16_t true_positive = 0, false_positive = 0; + for(size_t i = 0; i < y.shape[0]; ++i) + if(y[i] == 1){ + if(y[i] == y_pred[i]) + ++true_positive; + else + ++false_positive; + } + return static_cast(true_positive) / (true_positive + false_positive); +} + +float64_t recall_score(const np::Array& y, const np::Array& y_pred) noexcept { + uint16_t true_positive = 0, false_negative = 0; + for(size_t i = 0; i < y.shape[0]; ++i) + if(y[i] == 0) { + if(y[i] != y_pred[i]) + ++false_negative; + } else { + if(y[i] == y_pred[i]) + ++true_positive; + } + return static_cast(true_positive) / (true_positive + false_negative); +} + +float64_t f1_score(const np::Array& y, const np::Array& y_pred) noexcept { + const float64_t precision = precision_score(y, y_pred); + const float64_t recall = recall_score(y, y_pred); + return 2 * (precision * recall) / (precision + recall); +} + +std::tuple confusion_matrix(const np::Array& y, const np::Array& y_pred) noexcept { + uint16_t true_positive = 0, false_positive = 0, true_negative = 0, false_negative = 0; + for(size_t i = 0; i < y.shape[0]; ++i) + if(y[i] == 0) + if(y[i] == y_pred[i]) + ++true_negative; + else + ++false_negative; + else + if(y[i] == y_pred[i]) + ++true_positive; + else + ++false_positive; + return std::make_tuple(true_negative, false_positive, false_negative, true_positive); +} + diff --git a/cpp/ViolaJones.hpp b/cpp/ViolaJones.hpp new file mode 100644 index 0000000..d5b51a4 --- /dev/null +++ b/cpp/ViolaJones.hpp @@ -0,0 +1,157 @@ +#pragma once +#include +namespace fs = std::filesystem; +#include "data.hpp" +#include "toolbox.hpp" +//#include "config.hpp" + +template +void unit_test_cpu_vs_gpu(const np::Array& cpu, const np::Array& gpu) noexcept { + if (cpu.shape != gpu.shape) { + fprintf(stderr, "Inequal shape !\n"); + return; + } + size_t eq = 0; + const size_t length = np::prod(cpu.shape); + for (size_t i = 0; i < length; ++i) + if (cpu[i] == gpu[i]) + ++eq; + //else + // std::cout << i << ": " << cpu[i] << " != " << gpu[i] << std::endl; + if (eq != length) + printf("Incorrect results, Number of equalities : %s/%s <=> %.2f%% !\n", thousand_sep(eq).c_str(), thousand_sep(length).c_str(), + static_cast(eq) / static_cast(length) * 100.0); +} + +template +void unit_test_argsort_2d(const np::Array& a, const np::Array& indices) noexcept { + if (a.shape != indices.shape) { + fprintf(stderr, "Inequal shape !\n"); + return; + } + size_t correct = a.shape[0]; // First elements are always correctly sorted + const size_t total = np::prod(a.shape); + for(size_t i = 0; i < total; i += a.shape[1]) + for(size_t j = 0; j < a.shape[1] - 1; ++j){ + const size_t k = i + j; + if(a[i + indices[k]] <= a[i + indices[k + 1]]) + ++correct; + } + if (correct != total) + printf("Incorrect results, Number of equalities : %s/%s <=> %.2f%% !\n", thousand_sep(correct).c_str(), thousand_sep(total).c_str(), + static_cast(correct) / static_cast(total) * 100.0); +} + +template +T benchmark_function(const char* step_name, const F& fnc, Args &&...args) noexcept { +#ifndef __DEBUG + printf("%s...\r", step_name); + fflush(stdout); // manual flush is mandatory, otherwise it will not be shown immediately because the output is buffered +#endif + const auto start = time(); + const T res = fnc(std::forward(args)...); + const long long timespent = duration_ns(time() - start); + printf("| %-49s | %17s | %-29s |\n", step_name, thousand_sep(timespent).c_str(), format_time_ns(timespent).c_str()); + return res; +} + +template +void benchmark_function_void(const char* step_name, const F& fnc, Args &&...args) noexcept { +#ifndef __DEBUG + printf("%s...\r", step_name); + fflush(stdout); // manual flush is mandatory, otherwise it will not be shown immediately because the output is buffered +#endif + const auto start = time(); + fnc(std::forward(args)...); + const long long timespent = duration_ns(time() - start); + printf("| %-49s | %17s | %-29s |\n", step_name, thousand_sep(timespent).c_str(), format_time_ns(timespent).c_str()); +} + +template +np::Array state_saver(const char* step_name, const char* filename, const bool& force_redo, const bool& save_state, const char* out_dir, const F& fnc, Args &&...args) noexcept { + char filepath[BUFFER_SIZE] = { 0 }; + sprintf(filepath, "%s/%s.bin", out_dir, filename); + + np::Array bin; + if (!fs::exists(filepath) || force_redo) { + bin = std::move(benchmark_function>(step_name, fnc, std::forward(args)...)); + if(save_state){ +#ifndef __DEBUG + printf("Saving results of %s\r", step_name); + fflush(stdout); +#endif + save(bin, filepath); +#ifndef __DEBUG + printf("%*c\r", 100, ' '); + fflush(stdout); +#endif + } + } else { +#ifndef __DEBUG + printf("Loading results of %s\r", step_name); + fflush(stdout); +#endif + bin = std::move(load(filepath)); + printf("| %-49s | %17s | %-29s |\n", step_name, "None", "loaded saved state"); + } + return bin; +} + +template +std::array, N> state_saver(const char* step_name, const std::vector& filenames, const bool& force_redo, const bool& save_state, const char* out_dir, const F& fnc, Args &&...args) noexcept { + char filepath[BUFFER_SIZE] = { 0 }; + bool abs = false; + for (const char* filename : filenames){ + sprintf(filepath, "%s/%s.bin", out_dir, filename); + if (!fs::exists(filepath)) { + abs = true; + break; + } + } + + std::array, N> bin; + if (abs || force_redo) { + bin = std::move(benchmark_function, N>>(step_name, fnc, std::forward(args)...)); + if (save_state){ +#ifndef __DEBUG + printf("Saving results of %s\r", step_name); + fflush(stdout); +#endif + size_t i = 0; + for (const char* filename : filenames){ + sprintf(filepath, "%s/%s.bin", out_dir, filename); + save(bin[i++], filepath); + } +#ifndef __DEBUG + printf("%*c\r", 100, ' '); + fflush(stdout); +#endif + } + } else { +#ifndef __DEBUG + printf("Loading results of %s\r", step_name); + fflush(stdout); +#endif + size_t i = 0; + for (const char* filename : filenames){ + sprintf(filepath, "%s/%s.bin", out_dir, filename); + bin[i++] = std::move(load(filepath)); + } + printf("| %-49s | %17s | %-29s |\n", step_name, "None", "loaded saved state"); + } + return bin; +} + +np::Array argsort_2d_cpu(const np::Array&) noexcept; +np::Array build_features(const uint16_t&, const uint16_t&) noexcept; +np::Array select_percentile(const np::Array&, const np::Array&) noexcept; +np::Array classify_viola_jones(const np::Array&, const np::Array&, const np::Array&) noexcept; +np::Array init_weights(const np::Array&) noexcept; +std::tuple> select_best(const np::Array&, const np::Array&, const np::Array&, + const np::Array&) noexcept; +std::array, 2> train_viola_jones(const size_t&, const np::Array&, const np::Array&, const np::Array&) noexcept; +float64_t accuracy_score(const np::Array&, const np::Array&) noexcept; +float64_t precision_score(const np::Array&, const np::Array&) noexcept; +float64_t recall_score(const np::Array&, const np::Array&) noexcept; +float64_t f1_score(const np::Array&, const np::Array&) noexcept; +std::tuple confusion_matrix(const np::Array&, const np::Array&) noexcept; diff --git a/cpp/ViolaJonesCPU.cpp b/cpp/ViolaJonesCPU.cpp new file mode 100644 index 0000000..49cd654 --- /dev/null +++ b/cpp/ViolaJonesCPU.cpp @@ -0,0 +1,93 @@ +#include "data.hpp" +#include "toolbox.hpp" + +np::Array set_integral_image_cpu(const np::Array& set) noexcept { + np::Array X_ii = np::empty(set.shape); + + size_t i, y, x, s; + uint32_t ii[set.shape[1] * set.shape[2]]; + const size_t length = np::prod(set.shape); + for (size_t offset = 0; offset < length; offset += set.shape[1] * set.shape[2]) { + for (i = 0; i < set.shape[1] * set.shape[2]; ++i) + ii[i] = 0; + for (y = 1; y < set.shape[1]; ++y) { + s = 0; + for (x = 0; x < set.shape[2] - 1; ++x) { + s += set[offset + (y - 1) * set.shape[2] + x]; + ii[y * set.shape[2] + x + 1] = s + ii[(y - 1) * set.shape[2] + x + 1]; + } + } + for (y = 0; y < set.shape[1]; ++y) + for (x = 0; x < set.shape[2]; ++x) + X_ii[offset + y * set.shape[2] + x] = ii[y * set.shape[2] + x]; + } + + return X_ii; +} + +constexpr static inline int16_t __compute_feature__(const np::Array& X_ii, const size_t& j, const int16_t& x, const int16_t& y, const int16_t& w, const int16_t& h) noexcept { + const size_t _y = y * X_ii.shape[1] + x; + const size_t _yh = _y + h * X_ii.shape[1]; + return X_ii[j + _yh + w] + X_ii[j + _y] - X_ii[j + _yh] - X_ii[j + _y + w]; +} + +np::Array apply_features_cpu(const np::Array& feats, const np::Array& X_ii) noexcept { + np::Array X_feat = np::empty({ feats.shape[0], X_ii.shape[0] }); + + size_t j, feat_idx = 0; + int16_t p1, p2, n1, n2; + const size_t feats_length = np::prod(feats.shape), X_ii_length = np::prod(X_ii.shape); + const size_t feats_step = np::prod(feats.shape, 1), X_ii_step = np::prod(X_ii.shape, 1); + for (size_t i = 0; i < feats_length; i += feats_step){ + for (j = 0; j < X_ii_length; j += X_ii_step) { + p1 = __compute_feature__(X_ii, j, feats[i + 0], feats[i + 1], feats[i + 2], feats[i + 3]); + p2 = __compute_feature__(X_ii, j, feats[i + 4], feats[i + 5], feats[i + 6], feats[i + 7]); + n1 = __compute_feature__(X_ii, j, feats[i + 8], feats[i + 9], feats[i + 10], feats[i + 11]); + n2 = __compute_feature__(X_ii, j, feats[i + 12], feats[i + 13], feats[i + 14], feats[i + 15]); + X_feat[feat_idx++] = static_cast(p1 + p2) - static_cast(n1 + n2); + } + } + + return X_feat; +} + +np::Array train_weak_clf_cpu(const np::Array& X_feat, const np::Array& X_feat_argsort, const np::Array& y, const np::Array& weights) noexcept { + float64_t total_pos = 0.0, total_neg = 0.0; + for(size_t i = 0; i < y.shape[0]; ++i) + (y[i] == static_cast(1) ? total_pos : total_neg) += weights[i]; + + np::Array classifiers = np::empty({ X_feat.shape[0], 2}); + for(size_t i = 0; i < X_feat.shape[0]; ++i){ + size_t pos_seen = 0, neg_seen = 0; + float64_t pos_weights = 0.0, neg_weights = 0.0; + float64_t min_error = np::inf, best_threshold = 0.0, best_polarity = 0.0; + for(size_t j = 0; j < X_feat_argsort.shape[1]; ++j) { + const float64_t error = std::min(neg_weights + total_pos - pos_weights, pos_weights + total_neg - neg_weights); + if (error < min_error){ + min_error = error; + best_threshold = X_feat[i * X_feat.shape[1] + X_feat_argsort[i * X_feat.shape[1] + j]]; + best_polarity = pos_seen > neg_seen ? 1.0 : -1.0; + } + if(y[X_feat_argsort[i * X_feat.shape[1] + j]] == static_cast(1)){ + ++pos_seen; + pos_weights += weights[X_feat_argsort[i * X_feat.shape[1] + j]]; + } else { + ++neg_seen; + neg_weights += weights[X_feat_argsort[i * X_feat.shape[1] + j]]; + } + } + classifiers[i * 2] = best_threshold; classifiers[i * 2 + 1] = best_polarity; + } + return classifiers; +} + +np::Array argsort_2d_cpu(const np::Array& X_feat) noexcept { + const np::Array indices = np::empty(X_feat.shape); + const size_t length = np::prod(X_feat.shape); + for (size_t i = 0; i < length; i += X_feat.shape[1]) { + for(size_t j = 0; j < X_feat.shape[1]; ++j) indices[i + j] = j; + argsort(&X_feat[i], &indices[i], 0, X_feat.shape[1] - 1); + } + return indices; +} + diff --git a/cpp/ViolaJonesCPU.hpp b/cpp/ViolaJonesCPU.hpp new file mode 100644 index 0000000..e3d7c7c --- /dev/null +++ b/cpp/ViolaJonesCPU.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "data.hpp" + +np::Array set_integral_image_cpu(const np::Array&) noexcept; +np::Array apply_features_cpu(const np::Array&, const np::Array&) noexcept; +np::Array train_weak_clf_cpu(const np::Array&, const np::Array&, const np::Array&, + const np::Array&) noexcept; +np::Array argsort_2d_cpu(const np::Array&) noexcept; diff --git a/cpp/ViolaJonesGPU.cu b/cpp/ViolaJonesGPU.cu new file mode 100644 index 0000000..9041859 --- /dev/null +++ b/cpp/ViolaJonesGPU.cu @@ -0,0 +1,491 @@ +#include +#include "data.hpp" +#include "toolbox.hpp" +#include "ViolaJones.hpp" + +#define NB_THREADS 1024 + +#define NB_THREADS_2D_X 32 +#define NB_THREADS_2D_Y 32 +__device__ constexpr const size_t M = 5; //log2(NB_THREADS_2D_Y)); + +#define NB_THREADS_3D_X 16 +#define NB_THREADS_3D_Y 16 +#define NB_THREADS_3D_Z 4 + +static __global__ void __test_working_kernel__(const np::Array d_x, np::Array d_y, const size_t length) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < length) + d_y[i] = d_x[i] * i; +} + +void test_working(const size_t& length) noexcept { + const size_t size = length * sizeof(size_t); + +#ifdef __DEBUG + print("Estimating memory footprint at : " + format_byte_size(2 * size)); +#endif + + np::Array x = np::empty({ length }), y = np::empty({ length }); + size_t i; + for (i = 0; i < length; ++i) + x[i] = i; + + np::Array d_x = copyToDevice("x", x), d_y = copyToDevice("y", y); + + const size_t dimX = static_cast(std::ceil(static_cast(length) / static_cast(NB_THREADS))); + const dim3 dimGrid(dimX); + constexpr const dim3 dimBlock(NB_THREADS); + __test_working_kernel__<<>>(d_x, d_y, length); + _print_cuda_error_("synchronize", cudaDeviceSynchronize()); + _print_cuda_error_("memcpy d_y", cudaMemcpy(y.data, d_y.data, size, cudaMemcpyDeviceToHost)); + + size_t ne = 0; + for (i = 0; i < length; ++i) + if (y[i] != x[i] * i) + ++ne; + + if (ne != 0) + fprintf(stderr, "Invalid result : %lu/%lu <=> %f%%\n", ne, length, static_cast(ne) / static_cast(length)); + + cudaFree("d_x", d_x); + cudaFree("d_y", d_y); +} + +static __global__ void __test_working_kernel_2d__(const np::Array d_x, np::Array d_y, const size_t length) { + const size_t idx = threadIdx.x * blockDim.y + threadIdx.y; + const size_t idy = blockIdx.x * gridDim.y + blockIdx.y; + const size_t i = idy * NB_THREADS_2D_X * NB_THREADS_2D_Y + idx; + if (i < length) + d_y[i] = d_x[i] * i; +} + +void test_working_2d(const size_t& N1, const size_t& N2) noexcept { + const size_t length = N1 * N2; + const size_t size = length * sizeof(size_t); + +#ifdef __DEBUG + print("Estimating memory footprint at : " + format_byte_size(2 * size)); +#endif + + np::Array x = np::empty({ length }), y = np::empty({ length }); + size_t i; + for (i = 0; i < length; ++i) + x[i] = i; + + np::Array d_x = copyToDevice("x", x), d_y = copyToDevice("y", y); + + const size_t dimX = static_cast(std::ceil(static_cast(N1) / static_cast(NB_THREADS_2D_X))); + const size_t dimY = static_cast(std::ceil(static_cast(N2) / static_cast(NB_THREADS_2D_Y))); + const dim3 dimGrid(dimX, dimY); + constexpr const dim3 dimBlock(NB_THREADS_2D_X, NB_THREADS_2D_Y); + __test_working_kernel_2d__<<>>(d_x, d_y, length); + _print_cuda_error_("synchronize", cudaDeviceSynchronize()); + _print_cuda_error_("memcpy d_y", cudaMemcpy(y.data, d_y.data, size, cudaMemcpyDeviceToHost)); + + size_t ne = 0; + for (i = 0; i < length; ++i) + if (y[i] != x[i] * i) + ++ne; + + if (ne != 0) + fprintf(stderr, "Invalid result : %lu/%lu <=> %f%%\n", ne, length, static_cast(ne) / static_cast(length)); + + cudaFree("d_x", d_x); + cudaFree("d_y", d_y); +} + +static __global__ void __test_working_kernel_3d__(const np::Array d_x, np::Array d_y, const size_t length) { + const size_t idx = (threadIdx.x * blockDim.y + threadIdx.y) * blockDim.z + threadIdx.z; + const size_t idy = (blockIdx.x * gridDim.y + blockIdx.y) * gridDim.z + blockIdx.z; + const size_t i = idy * NB_THREADS_3D_X * NB_THREADS_3D_Y * NB_THREADS_3D_Z + idx; + if (i < length) + d_y[i] = d_x[i] * i; +} + +void test_working_3d(const size_t& N1, const size_t& N2, const size_t& N3) noexcept { + const size_t length = N1 * N2 * N3; + const size_t size = length * sizeof(size_t); + +#ifdef __DEBUG + print("Estimating memory footprint at : " + format_byte_size(2 * size)); +#endif + + np::Array x = np::empty({ length }), y = np::empty({ length }); + size_t i; + for (i = 0; i < length; ++i) + x[i] = i; + + np::Array d_x = copyToDevice("x", x), d_y = copyToDevice("y", y); + + const size_t dimX = static_cast(std::ceil(static_cast(N1) / static_cast(NB_THREADS_3D_X))); + const size_t dimY = static_cast(std::ceil(static_cast(N2) / static_cast(NB_THREADS_3D_Y))); + const size_t dimZ = static_cast(std::ceil(static_cast(N3) / static_cast(NB_THREADS_3D_Z))); + const dim3 dimGrid(dimX, dimY, dimZ); + constexpr const dim3 dimBlock(NB_THREADS_3D_X, NB_THREADS_3D_Y, NB_THREADS_3D_Z); + __test_working_kernel_3d__<<>>(d_x, d_y, length); + _print_cuda_error_("synchronize", cudaDeviceSynchronize()); + _print_cuda_error_("memcpy d_y", cudaMemcpy(y.data, d_y.data, size, cudaMemcpyDeviceToHost)); + + size_t ne = 0; + for (i = 0; i < length; ++i) + if (y[i] != x[i] * i) + ++ne; + + if (ne != 0) + fprintf(stderr, "Invalid result : %lu/%lu <=> %f%%\n", ne, length, static_cast(ne) / static_cast(length)); + + cudaFree("d_x", d_x); + cudaFree("d_y", d_y); +} + +static np::Array __scanCPU_3d__(const np::Array& X) noexcept { + np::Array X_scan = np::empty(X.shape); + const size_t total = np::prod(X_scan.shape); + const size_t i_step = np::prod(X_scan.shape, 1); + for(size_t x = 0; x < total; x += i_step) + for(size_t y = 0; y < i_step; y += X_scan.shape[2]){ + uint32_t cum = 0; + for(size_t z = 0; z < X_scan.shape[2]; ++z){ + const size_t idx = x + y + z; + cum += X[idx]; + X_scan[idx] = cum - X[idx]; + } + } + return X_scan; +} + +static __global__ void __kernel_scan_3d__(const uint16_t n, const uint16_t j, np::Array d_inter, np::Array d_X) { + const size_t x_coor = blockIdx.x * blockDim.x + threadIdx.x; + const size_t y_coor = blockIdx.y * blockDim.y + threadIdx.y; + + __shared__ uint32_t sA[NB_THREADS_2D_X * NB_THREADS_2D_Y]; + sA[threadIdx.x * NB_THREADS_2D_Y + threadIdx.y] = (x_coor < n && y_coor) < j ? + d_X[blockIdx.z * NB_THREADS_2D_X * NB_THREADS_2D_Y + y_coor * NB_THREADS_2D_Y + x_coor] : 0; + __syncthreads(); + + size_t k = threadIdx.x; + for(size_t d = 0; d < M; ++d){ + k *= 2; + const size_t i1 = k + std::pow(2, d) - 1; + const size_t i2 = k + std::pow(2, d + 1) - 1; + if(i2 >= blockDim.x) + break; + sA[i2 * NB_THREADS_2D_Y + threadIdx.y] += sA[i1 * NB_THREADS_2D_Y + threadIdx.y]; + } + __syncthreads(); + + if(threadIdx.x == 0){ + d_inter[blockIdx.z * d_inter.shape[1] * d_inter.shape[2] + y_coor * d_inter.shape[2] + blockIdx.x] = + sA[(blockDim.x - 1) * NB_THREADS_2D_Y + threadIdx.y]; + sA[(blockDim.x - 1) * NB_THREADS_2D_Y + threadIdx.y] = 0; + } + __syncthreads(); + + k = std::pow(2, M + 1) * threadIdx.x; + for(int64_t d = M - 1; d > -1; --d){ + k = k / 2; + const size_t i1 = k + std::pow(2, d) - 1; + const size_t i2 = k + std::pow(2, d + 1) - 1; + if(i2 >= blockDim.x) + continue; + const uint32_t t = sA[i1 * NB_THREADS_2D_Y + threadIdx.y]; + sA[i1 * NB_THREADS_2D_Y + threadIdx.y]= sA[i2 * NB_THREADS_2D_Y + threadIdx.y]; + sA[i2 * NB_THREADS_2D_Y + threadIdx.y] += t; + } + __syncthreads(); + + if(x_coor < n && y_coor < j) + d_X[blockIdx.z * d_X.shape[1] * d_X.shape[2] + y_coor * d_X.shape[2] + x_coor] = sA[threadIdx.x * NB_THREADS_2D_Y + threadIdx.y]; +} + +static __global__ void __add_3d__(np::Array d_X, const np::Array d_s, const uint16_t n, const uint16_t m) { + const size_t x_coor = blockIdx.x * blockDim.x + threadIdx.x; + const size_t y_coor = blockIdx.y * blockDim.y + threadIdx.y; + if(x_coor < n && y_coor < m) + d_X[blockIdx.z * d_X.shape[1] * d_X.shape[2] + y_coor * d_X.shape[2] + x_coor] += d_s[blockIdx.z * d_X.shape[1] * d_X.shape[2] + y_coor * d_X.shape[2] + blockIdx.x]; +} + +static np::Array __scanGPU_3d__(const np::Array& X) noexcept { + np::Array X_scan = np::empty(X.shape); + + const size_t k = X.shape[0]; + const size_t height = X.shape[1]; + const size_t n = X.shape[2]; + const size_t n_block_x = static_cast(std::ceil(static_cast(X.shape[1]) / static_cast(NB_THREADS_2D_X))); + const size_t n_block_y = static_cast(std::ceil(static_cast(X.shape[2]) / static_cast(NB_THREADS_2D_Y))); + + np::Array d_X = copyToDevice("X", X); + np::Array inter = np::empty({ k, height, n_block_x }); + np::Array d_inter = copyToDevice("inter", inter); + + const dim3 dimGrid(n_block_x, n_block_y, k); + constexpr const dim3 dimBlock(NB_THREADS_2D_X, NB_THREADS_2D_Y); + __kernel_scan_3d__<<>>(n, height, d_inter, d_X); + _print_cuda_error_("synchronize", cudaDeviceSynchronize()); + + _print_cuda_error_("memcpy d_inter", cudaMemcpy(inter.data, d_inter.data, np::prod(inter.shape) * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + if(n_block_x >= NB_THREADS_2D_X){ + np::Array sums = __scanGPU_3d__(inter); + + np::Array d_s = copyToDevice("sums", sums); + __add_3d__<<>>(d_X, d_s, n, height); + _print_cuda_error_("synchronize", cudaDeviceSynchronize()); + _print_cuda_error_("memcpy d_X", cudaMemcpy(X_scan.data, d_X.data, np::prod(X_scan.shape) * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + } else { + np::Array sums = __scanCPU_3d__(inter); + _print_cuda_error_("memcpy d_X", cudaMemcpy(X_scan.data, d_X.data, np::prod(X_scan.shape) * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + + for(size_t p = 0; p < k; ++p) + for(size_t h = 0; h < height; ++h) + for(size_t i = 1; i < n_block_x; ++i) + for(size_t j = 0; j < NB_THREADS_2D_X; ++j){ + const size_t idx = i * NB_THREADS_2D_X + j; + if(idx < n){ + const size_t idy = p * X_scan.shape[1] * X_scan.shape[2] + h * X_scan.shape[2]; + X_scan[idy + idx] += sums[idy + i]; + } + } + } + + return X_scan; +} + +static __global__ void __transpose_kernel__(const np::Array d_X, np::Array d_Xt) { + __shared__ uint32_t temp[NB_THREADS_2D_X * NB_THREADS_2D_Y]; + + size_t x = blockIdx.x * blockDim.x + threadIdx.x; + size_t y = blockIdx.y * blockDim.y + threadIdx.y; + if(x < d_X.shape[1] && y < d_X.shape[2]) + temp[threadIdx.y * NB_THREADS_2D_Y + threadIdx.x] = d_X[blockIdx.z * d_X.shape[1] * d_X.shape[2] + x * d_X.shape[2] + y]; + + __syncthreads(); + + x = blockIdx.y * blockDim.y + threadIdx.x; + y = blockIdx.x * blockDim.x + threadIdx.y; + if(x < d_X.shape[2] && y < d_X.shape[1]) + d_Xt[blockIdx.z * d_Xt.shape[1] * d_Xt.shape[2] + x * d_X.shape[2] + y] = temp[threadIdx.x * NB_THREADS_2D_Y + threadIdx.y]; +} + +static np::Array __transpose_3d__(const np::Array& X) noexcept { + np::Array Xt = np::empty({ X.shape[0], X.shape[2], X.shape[1] }); + + np::Array d_X = copyToDevice("X", X); + np::Array d_Xt = copyToDevice("Xt", Xt); + + const size_t n_block_x = static_cast(std::ceil(static_cast(X.shape[1]) / static_cast(NB_THREADS_2D_X))); + const size_t n_block_y = static_cast(std::ceil(static_cast(X.shape[2]) / static_cast(NB_THREADS_2D_Y))); + const dim3 dimGrid(n_block_x, n_block_y, X.shape[0]); + constexpr const dim3 dimBlock(NB_THREADS_2D_X, NB_THREADS_2D_Y); + __transpose_kernel__<<>>(d_X, d_Xt); + _print_cuda_error_("synchronize", cudaDeviceSynchronize()); + _print_cuda_error_("memcpy d_Xt", cudaMemcpy(Xt.data, d_Xt.data, np::prod(Xt.shape) * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaFree("X", d_X); + cudaFree("Xt", d_Xt); + + return Xt; +} + +np::Array set_integral_image_gpu(const np::Array& X) noexcept { + np::Array X_ii = np::astype(X); + X_ii = __scanCPU_3d__(X_ii); + X_ii = __transpose_3d__(X_ii); + X_ii = __scanCPU_3d__(X_ii); + return __transpose_3d__(X_ii); +} + +static inline __device__ int16_t __compute_feature__(const np::Array& d_X_ii, const size_t& j, const int16_t& x, const int16_t& y, const int16_t& w, const int16_t& h) noexcept { + const size_t _y = y * d_X_ii.shape[1] + x; + const size_t _yh = _y + h * d_X_ii.shape[1]; + return d_X_ii[j + _yh + w] + d_X_ii[j + _y] - d_X_ii[j + _yh] - d_X_ii[j + _y + w]; +} + +static __global__ void __apply_feature_kernel__(int32_t* d_X_feat, const np::Array d_feats, const np::Array d_X_ii) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + size_t j = blockIdx.y * blockDim.y + threadIdx.y; + + if (i >= d_feats.shape[0] || j >= d_X_ii.shape[0]) + return; + + const size_t k = i * d_X_ii.shape[0] + j; + i *= np::prod(d_feats.shape, 1); + j *= np::prod(d_X_ii.shape, 1); + const int16_t p1 = __compute_feature__(d_X_ii, j, d_feats[i + 0], d_feats[i + 1], d_feats[i + 2], d_feats[i + 3]); + const int16_t p2 = __compute_feature__(d_X_ii, j, d_feats[i + 4], d_feats[i + 5], d_feats[i + 6], d_feats[i + 7]); + const int16_t n1 = __compute_feature__(d_X_ii, j, d_feats[i + 8], d_feats[i + 9], d_feats[i + 10], d_feats[i + 11]); + const int16_t n2 = __compute_feature__(d_X_ii, j, d_feats[i + 12], d_feats[i + 13], d_feats[i + 14], d_feats[i + 15]); + d_X_feat[k] = static_cast(p1 + p2) - static_cast(n1 + n2); +} + +np::Array apply_features_gpu(const np::Array& feats, const np::Array& X_ii) noexcept { + const np::Array X_feat = np::empty({ feats.shape[0], X_ii.shape[0] }); + int32_t* d_X_feat; + + _print_cuda_error_("malloc d_X_feat", cudaMalloc(&d_X_feat, np::prod(X_feat.shape) * sizeof(int32_t))); + np::Array d_X_ii = copyToDevice("X_ii", X_ii); + np::Array d_feats = copyToDevice("feats", feats); + + const size_t dimX = static_cast(std::ceil(static_cast(feats.shape[0]) / static_cast(NB_THREADS_2D_X))); + const size_t dimY = static_cast(std::ceil(static_cast(X_ii.shape[0]) / static_cast(NB_THREADS_2D_Y))); + const dim3 dimGrid(dimX, dimY); + constexpr const dim3 dimBlock(NB_THREADS_2D_X, NB_THREADS_2D_Y); + __apply_feature_kernel__<<>>(d_X_feat, d_feats, d_X_ii); + _print_cuda_error_("synchronize", cudaDeviceSynchronize()); + + _print_cuda_error_("memcpy X_feat", cudaMemcpy(X_feat.data, d_X_feat, np::prod(X_feat.shape) * sizeof(int32_t), cudaMemcpyDeviceToHost)); + + _print_cuda_error_("free d_X_feat", cudaFree(d_X_feat)); + cudaFree("free d_feats", d_feats); + cudaFree("free d_X_11", d_X_ii); + + return X_feat; +} + +static __global__ void __train_weak_clf_kernel__(np::Array d_classifiers, const np::Array d_y, + const np::Array d_X_feat, const np::Array d_X_feat_argsort, + const np::Array d_weights, const float64_t total_pos, const float64_t total_neg) { + + size_t i = blockIdx.x * blockDim.x * blockDim.y * blockDim.z; + i += threadIdx.x * blockDim.y * blockDim.z; + i += threadIdx.y * blockDim.z; + i += threadIdx.z; + // const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= d_classifiers.shape[0]) + return; + + size_t pos_seen = 0, neg_seen = 0; + float64_t pos_weights = 0.0, neg_weights = 0.0; + float64_t min_error = np::inf, best_threshold = 0.0, best_polarity = 0.0; + for(size_t j = 0; j < d_X_feat_argsort.shape[1]; ++j) { + const float64_t error = np::min(neg_weights + total_pos - pos_weights, pos_weights + total_neg - neg_weights); + if (error < min_error){ + min_error = error; + best_threshold = d_X_feat[i * d_X_feat.shape[1] + d_X_feat_argsort[i * d_X_feat.shape[1] + j]]; + best_polarity = pos_seen > neg_seen ? 1.0 : -1.0; + } + if(d_y[d_X_feat_argsort[i * d_X_feat.shape[1] + j]] == static_cast(1)){ + ++pos_seen; + pos_weights += d_weights[d_X_feat_argsort[i * d_X_feat.shape[1] + j]]; + } else { + ++neg_seen; + neg_weights += d_weights[d_X_feat_argsort[i * d_X_feat.shape[1] + j]]; + } + } + d_classifiers[i * 2] = best_threshold; d_classifiers[i * 2 + 1] = best_polarity; +} + +np::Array train_weak_clf_gpu(const np::Array& X_feat, const np::Array& X_feat_argsort, const np::Array& y, + const np::Array& weights) noexcept { + float64_t total_pos = 0.0, total_neg = 0.0; + for(size_t i = 0; i < y.shape[0]; ++i) + (y[i] == static_cast(1) ? total_pos : total_neg) += weights[i]; + + np::Array classifiers = np::empty({ X_feat.shape[0], 2}); + + np::Array d_classifiers = copyToDevice("classifiers", classifiers); + np::Array d_X_feat = copyToDevice("X_feat", X_feat); + np::Array d_X_feat_argsort = copyToDevice("X_feat_argsort", X_feat_argsort); + np::Array d_weights = copyToDevice("weights", weights); + np::Array d_y = copyToDevice("y", y); + + const size_t n_blocks = static_cast(std::ceil(static_cast(X_feat.shape[0]) / static_cast(NB_THREADS_3D_X * NB_THREADS_3D_Y * NB_THREADS_3D_Z))); + constexpr const dim3 dimBlock(NB_THREADS_3D_X, NB_THREADS_3D_Y, NB_THREADS_3D_Z); + // const size_t n_blocks = static_cast(std::ceil(static_cast(X_feat.shape[0]) / static_cast(NB_THREADS))); + // constexpr const dim3 dimBlock(NB_THREADS); + __train_weak_clf_kernel__<<>>(d_classifiers, d_y, d_X_feat, d_X_feat_argsort, d_weights, total_pos, total_neg); + _print_cuda_error_("synchronize", cudaDeviceSynchronize()); + + _print_cuda_error_("memcpy classifiers", cudaMemcpy(classifiers.data, d_classifiers.data, np::prod(classifiers.shape) * sizeof(float64_t), cudaMemcpyDeviceToHost)); + + cudaFree("free d_classifiers", d_classifiers); + cudaFree("free d_X_feat", d_X_feat); + cudaFree("free d_X_feat_argsort", d_X_feat_argsort); + cudaFree("free d_weights", d_weights); + cudaFree("free d_y", d_y); + + return classifiers; +} + +template +__device__ inline static int32_t as_partition_gpu(const T* a, uint16_t* indices, const size_t l, const size_t h) noexcept { + int32_t i = l - 1; + for (int32_t j = l; j <= h; ++j) + if (a[indices[j]] < a[indices[h]]) + swap(&indices[++i], &indices[j]); + swap(&indices[++i], &indices[h]); + return i; +} + +template +__device__ void argsort_gpu(const T* a, uint16_t* indices, const size_t l, const size_t h) noexcept { + const size_t total = h - l + 1; + + //int32_t* stack = new int32_t[total]{l, h}; + //int32_t stack[total]; + int32_t stack[6977]; + //int32_t stack[1<<16]; + stack[0] = l; + stack[1] = h; + + size_t top = 1, low = l, high = h; + + while (top <= total) { + high = stack[top--]; + low = stack[top--]; + if(low >= high) + break; + + const int32_t p = as_partition_gpu(a, indices, low, high); + + if (p - 1 > low && p - 1 < total) { + stack[++top] = low; + stack[++top] = p - 1; + } + + if (p + 1 < high) { + stack[++top] = p + 1; + stack[++top] = high; + } + } + //delete[] stack; +} + +template +__global__ void argsort_bounded_gpu(const np::Array a, uint16_t* indices){ + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= a.shape[0]) + return; + + for(size_t y = 0; y < a.shape[1]; ++y) indices[idx * a.shape[1] + y] = y; + argsort_gpu(&a[idx * a.shape[1]], &indices[idx * a.shape[1]], 0, a.shape[1] - 1); +} + +np::Array argsort_2d_gpu(const np::Array& X_feat) noexcept { + const np::Array indices = np::empty(X_feat.shape); + + uint16_t* d_indices; + const size_t indices_size = np::prod(indices.shape) * sizeof(uint16_t); + + np::Array d_X_feat = copyToDevice("X_feat", X_feat); + _print_cuda_error_("malloc d_indices", cudaMalloc(&d_indices, indices_size)); + + const size_t dimGrid = static_cast(std::ceil(static_cast(X_feat.shape[0]) / static_cast(NB_THREADS))); + const dim3 dimBlock(NB_THREADS); + argsort_bounded_gpu<<>>(d_X_feat, d_indices); + _print_cuda_error_("synchronize", cudaDeviceSynchronize()); + + _print_cuda_error_("memcpy d_indices", cudaMemcpy(indices.data, d_indices, indices_size, cudaMemcpyDeviceToHost)); + + cudaFree("free d_X_feat", d_X_feat); + _print_cuda_error_("free d_indices", cudaFree(d_indices)); + + return indices; +} + +__host__ __device__ +size_t np::prod(const np::Shape& shape, const size_t& offset) noexcept { + size_t result = shape[offset]; + for(size_t i = 1 + offset; i < shape.length; ++i) + result *= shape[i]; + return result; +} diff --git a/cpp/ViolaJonesGPU.hpp b/cpp/ViolaJonesGPU.hpp new file mode 100644 index 0000000..3d305a6 --- /dev/null +++ b/cpp/ViolaJonesGPU.hpp @@ -0,0 +1,11 @@ +#pragma once +#include "data.hpp" + +void test_working(const size_t&) noexcept; +void test_working_2d(const size_t&, const size_t&) noexcept; +void test_working_3d(const size_t&, const size_t&, const size_t&) noexcept; +np::Array set_integral_image_gpu(const np::Array&) noexcept; +np::Array apply_features_gpu(const np::Array&, const np::Array&) noexcept; +np::Array train_weak_clf_gpu(const np::Array& X_feat, const np::Array& X_feat_argsort, const np::Array& y, + const np::Array& weights) noexcept; +np::Array argsort_2d_gpu(const np::Array& X_feat) noexcept; diff --git a/cpp/config.hpp b/cpp/config.hpp new file mode 100644 index 0000000..52e22e1 --- /dev/null +++ b/cpp/config.hpp @@ -0,0 +1,14 @@ +#pragma once + +// Save state to avoid recalulation on restart +#define SAVE_STATE true +// Redo the state even if it's already saved +#define FORCE_REDO false +// Use GPU to greatly accelerate runtime +#define GPU_BOOSTED true +// Number of weak classifiers +// const size_t TS[] = { 1 }; +// const size_t TS[] = { 1, 5, 10 }; +// const size_t TS[] = { 1, 5, 10, 25, 50 }; +// const size_t TS[] = { 1, 5, 10, 25, 50, 100, 200, 300 }; +const size_t TS[] = { 1, 5, 10, 25, 50, 100, 200, 300, 400, 500, 1000 }; diff --git a/cpp/data.cpp b/cpp/data.cpp new file mode 100644 index 0000000..676da1c --- /dev/null +++ b/cpp/data.cpp @@ -0,0 +1,212 @@ +#include "data.hpp" +//#include "toolbox.hpp" +//#include + +int print(const np::Shape& shape) noexcept { + int num_written = 0; + num_written += printf("("); + if (shape.length > 1) { + const size_t length = shape.length - 1; + for (size_t i = 0; i < length; ++i) + num_written += printf("%lu, ", shape[i]); + num_written += printf("%lu)\n", shape[length]); + } + else + num_written += printf("%lu,)\n", shape[0]); + return num_written; +} + +template +int print(const np::Array& array, const char* format) noexcept { + //printf("["); + //const size_t length = np::prod(array.shape); + //for(size_t i = 0; i < length - 1; ++i) + // //std::cout << array[i] << " "; + // printf("%f ", array[i]); + ////std::cout << array[array.shape[0] - 1] << "]\n"; + //printf("%f]\n", array[length - 1]); + + char format_space[BUFFER_SIZE] = { 0 }; + sprintf(format_space, "%s ", format); + char format_close[BUFFER_SIZE] = { 0 }; + sprintf(format_close, "%s]\n", format); + int num_written = 0; + + if (array.shape.length == 1) { + const size_t max = array.shape[0] - 1; + num_written += printf("["); + for (size_t i = 0; i < max; ++i) + num_written += printf(format_space, array[i]); + num_written += printf(format_close, array[max]); + } + else { + num_written += printf("["); + for (size_t i = 0; i < array.shape[0]; ++i) { + num_written += printf(" ["); + for (size_t j = 0; j < array.shape[1] - 1; ++j) + num_written += printf(format_space, array[i * array.shape[1] + j]); + num_written += printf(format_close, array[i * array.shape[1] + array.shape[1] - 1]); + } + num_written += printf("]\n"); + } + + return num_written; +} + +int print(const np::Array& array) noexcept { + return print(array, "%hu"); +} + +int print(const np::Array& array) noexcept { + return print(array, "%f"); +} + +int print_feat(const np::Array& array, const np::Slice& slice) noexcept { + int num_written = 0; + num_written += printf("["); + const size_t feat_size = np::prod(array.shape, 1); + const size_t offset = slice.x * feat_size; + const size_t length = offset + feat_size - 1; + for (size_t i = offset; i < length; ++i) + num_written += printf("%2hu ", array[i]); + num_written += printf("%2hu]\n", array[length]); + + return num_written; +} + +int print(const np::Array& array, const np::Slice& slice) noexcept { + int num_written = 0; + if (array.shape.length == 1) { + const size_t max = slice.y - 1; //std::min(slice.y, array.shape[0] - 1); + num_written += printf("["); + for (size_t i = slice.x; i < max; ++i) + num_written += printf("%hu ", array[i]); + num_written += printf("%hu]\n", array[max]); + } + else { + num_written += printf("["); + size_t k = slice.x * array.shape[1] * array.shape[2] + slice.y * array.shape[2] + slice.z; + for (size_t i = 0; i < array.shape[1]; ++i) { + num_written += printf(" ["); + for (size_t j = 0; j < array.shape[2]; ++j) + num_written += printf("%3hu ", array[k + i * array.shape[1] + j]); + num_written += printf("]\n"); + } + num_written += printf("]\n"); + } + return num_written; +} + +int print(const np::Array& array, const np::Slice& slice) noexcept { + int num_written = 0; + if (array.shape.length == 1) { + const size_t max = slice.y - 1; //std::min(slice.y, array.shape[0] - 1); + num_written += printf("["); + for (size_t i = slice.x; i < max; ++i) + num_written += printf("%iu ", array[i]); + num_written += printf("%iu]\n", array[max]); + } + else { + num_written += printf("["); + size_t k = slice.x * array.shape[1] * array.shape[2] + slice.y * array.shape[2] + slice.z; + for (size_t i = 0; i < array.shape[1]; ++i) { + num_written += printf(" ["); + for (size_t j = 0; j < array.shape[2]; ++j) + num_written += printf("%5i ", array[k + i * array.shape[1] + j]); + num_written += printf("]\n"); + } + num_written += print("]"); + } + return num_written; +} + +int print(const np::Array& array, const np::Slice& slice) noexcept { + int num_written = 0; + num_written += printf("["); + //size_t k = slice.x * array.shape[1] * array.shape[2] + slice.y * array.shape[2] + slice.z; + size_t k = slice.x * array.shape[1]; + for (size_t i = k; i < k + (slice.y - slice.x); ++i) { + num_written += printf("%5i ", array[i]); + } + num_written += print("]"); + return num_written; +} + +int print(const np::Array& array, const np::Slice& slice) noexcept { + int num_written = 0; + num_written += printf("["); + //size_t k = slice.x * array.shape[1] * array.shape[2] + slice.y * array.shape[2] + slice.z; + size_t k = slice.x * array.shape[1]; + for (size_t i = k; i < k + (slice.y - slice.x); ++i) { + num_written += printf("%5hu ", array[i]); + } + num_written += print("]"); + return num_written; +} + +static inline np::Array load_set(const char* set_name) { + FILE* file = fopen(set_name, "rb"); + if (file == NULL) { + print_error_file(set_name); + throw; + } + char meta[BUFFER_SIZE]; + if (!fgets(meta, BUFFER_SIZE, file)) { + print_error_file(set_name); + fclose(file); + throw; + } + size_t* dims = new size_t[3](); + if (!sscanf(meta, "%lu %lu %lu", &dims[0], &dims[1], &dims[2])) { + print_error_file(set_name); + fclose(file); + throw; + } + np::Shape shape = { static_cast(dims[1] == 0 ? 1 : 3), dims }; + np::Array a = np::empty(std::move(shape)); + + const size_t size = np::prod(a.shape); + size_t i = 0, j = 0; + int c; + char buff[STRING_INT_SIZE] = { 0 }; + while ((c = fgetc(file)) != EOF && i < size) { + if (c == ' ' || c == '\n') { + buff[j] = '\0'; + a[i++] = static_cast(atoi(buff)); + //memset(buff, 0, STRING_INT_SIZE); + j = 0; + } + else + buff[j++] = (char)c; + } + buff[j] = '\0'; + a[i++] = static_cast(atoi(buff)); + if (i != size) { + fprintf(stderr, "Missing loaded data %lu/%lu\n", i, size); + fclose(file); + throw; + } + fclose(file); + + return a; +} + +std::array, 4> load_datasets() { + return { + load_set(DATA_DIR "/X_train.bin"), load_set(DATA_DIR "/y_train.bin"), + load_set(DATA_DIR "/X_test.bin"), load_set(DATA_DIR "/y_test.bin") + }; +} + +void print_error_file(const char* file_dir) noexcept { + const char* buff = strerror(errno); + fprintf(stderr, "Can't open %s, error code = %d : %s\n", file_dir, errno, buff); + // delete buff; +} + +//size_t np::prod(const np::Shape& shape, const size_t& offset) noexcept { +// size_t result = shape[offset]; +// for(size_t i = 1 + offset; i < shape.length; ++i) +// result *= shape[i]; +// return result; +//} diff --git a/cpp/data.hpp b/cpp/data.hpp new file mode 100644 index 0000000..5e45949 --- /dev/null +++ b/cpp/data.hpp @@ -0,0 +1,954 @@ +#pragma once +#include +#include +#include +#include +#include +#include + +#define DATA_DIR "../data" +#define OUT_DIR "./out" +#define MODEL_DIR "./models" +#define BUFFER_SIZE 256 +#define STRING_INT_SIZE 8 // Length of a number in log10 (including '-') +#define S(N) std::string(N, '-').c_str() + +#ifndef __CUDACC__ +#define __host__ +#define __device__ +#endif + +typedef float float32_t; +typedef double float64_t; +typedef long double float128_t; + +__host__ __device__ +constexpr inline int print(const char* str) noexcept { + return printf("%s\n", str); +} + +inline int print(const std::string& s) noexcept { + return printf("%s\n", s.c_str()); +} + +namespace np { + constexpr const float64_t inf = std::numeric_limits::infinity(); + + typedef struct Slice { + size_t x = 0, y = 0, z = 0; + } Slice; + + typedef struct Shape { + size_t length = 0; + size_t* data = nullptr; + size_t* refcount = nullptr; +#ifdef __DEBUG + size_t total = 1; +#endif + + __host__ __device__ + Shape() noexcept { +// #ifdef __DEBUG +// print("Shape created (default)"); +// #endif + } + + __host__ __device__ + Shape(const size_t& length, size_t* data) noexcept : length(length), data(data), refcount(new size_t(1)) { +#ifdef __DEBUG + //print("Shape created (raw)"); + for(size_t i = 0; i < length; ++i) + total *= data[i]; +#endif + } + + __host__ __device__ + Shape(const std::initializer_list& dims) noexcept : length(dims.size()), data(new size_t[dims.size()]), refcount(new size_t(1)) { +// #ifdef __DEBUG +// print("Shape created (initializer)"); +// #endif + const auto* begin = dims.begin(); + for(size_t i = 0; i < length; ++i){ + data[i] = begin[i]; +#ifdef __DEBUG + total *= data[i]; +#endif + } + } + + __host__ __device__ + Shape(const Shape& shape) noexcept { +#ifdef __DEBUG + print("Shape created (copy)"); +#endif + if (data != nullptr && data != shape.data){ +#ifdef __DEBUG + print("Former shape deleted (copy)"); +#endif + delete[] data; + } + if (refcount != nullptr && refcount != shape.refcount){ +#ifdef __DEBUG + print("Former shape refcount freed (copy)"); +#endif + delete refcount; + } + length = shape.length; + + //data = new size_t[length]; + //memcpy(data, shape.data, length * sizeof(size_t)); + //refcount = new size_t; + //memcpy(refcount, shape.refcount, sizeof(size_t)); + + data = shape.data; + refcount = shape.refcount; + if (refcount != nullptr) + (*refcount)++; +#ifdef __DEBUG + else + print("Moved shape has null refcount"); + total = shape.total; +#endif + } + + __host__ __device__ + Shape(Shape&& shape) noexcept { +// #ifdef __DEBUG +// print("Shape created (move)); +// #endif + if (data != nullptr && data != shape.data){ +#ifdef __DEBUG + print("Former shape deleted (move)"); +#endif + delete[] data; + } + if (refcount != nullptr && refcount != shape.refcount){ +#ifdef __DEBUG + print("Former shape refcount freed (move)"); +#endif + delete refcount; + } + length = shape.length; + data = shape.data; + refcount = shape.refcount; + + shape.length = 0; + shape.data = nullptr; + shape.refcount = nullptr; +#ifdef __DEBUG + total = shape.total; + shape.total = 1; +#endif + } + + __host__ __device__ + ~Shape() noexcept { + if(refcount == nullptr){ +// #ifdef __DEBUG +// print("Shape refcount freed more than once"); +// #endif + return; + } + --(*refcount); +// #ifdef __DEBUG +// printf("Shape destructed : %lu\n", *refcount); +// #endif + if(*refcount == 0){ + if (data != nullptr){ + delete[] data; + data = nullptr; +// #ifdef __DEBUG +// print("Shape freeing ..."); +// #endif + } +//#ifdef __DEBUG + else + printf("Shape freed more than once : %lu\n", *refcount); +//#endif + delete refcount; + refcount = nullptr; +#ifdef __DEBUG + total = 1; +#endif + } + } + + __host__ __device__ + Shape& operator=(const Shape& shape) noexcept { +#ifdef __DEBUG + print("Shape created (assign copy)"); +#endif + if (data != nullptr && data != shape.data){ +#ifdef __DEBUG + print("Former shape deleted (assign copy)"); +#endif + delete[] data; + } + if (refcount != nullptr && refcount != shape.refcount){ +#ifdef __DEBUG + print("Former shape refcount freed (assign copy)"); +#endif + delete refcount; + } + length = shape.length; + + // data = new size_t[length]; + // memcpy(data, shape.data, length * sizeof(size_t)); + // refcount = new size_t; + // memcpy(refcount, shape.refcount, sizeof(size_t)); + + data = shape.data; + refcount = shape.refcount; + if (refcount != nullptr) + (*refcount)++; +#ifdef __DEBUG + else + printf("Assigned copy shape has null refcount"); + total = shape.total; +#endif + return *this; + } + + __host__ __device__ + Shape& operator=(Shape&& shape) noexcept { +// #ifdef __DEBUG +// print("Shape created (assign move)"); +// #endif + if (data != nullptr && data != shape.data){ +#ifdef __DEBUG + print("Former shape deleted (assign move)"); +#endif + delete[] data; + } + if (refcount != nullptr && refcount != shape.refcount){ +#ifdef __DEBUG + print("Former shape refcount freed (assign move)"); +#endif + delete refcount; + } + length = shape.length; + data = shape.data; + refcount = shape.refcount; +#ifdef __DEBUG + total = shape.total; + if (refcount == nullptr) + print("Assigned copy shape has null refcount"); + shape.total = 1; +#endif + shape.length = 0; + shape.data = nullptr; + shape.refcount = nullptr; + return *this; + } + + + __host__ __device__ + constexpr size_t& operator[](const size_t& i) const { +#ifdef __DEBUG + if (i > length){ + printf("Index %lu out of shape length %lu\n", i, length); + #ifndef __CUDACC__ + throw std::out_of_range("Index out of shape size"); + #endif + } +#endif + return data[i]; + } + + constexpr bool operator==(const Shape& other) const noexcept { + if (length != other.length) + return false; +#ifdef __DEBUG + if (total != other.total) + return false; +#endif + for(size_t i = 0; i < length; ++i) + if (data[i] != other[i]) + return false; + return true; + } + + constexpr bool operator!=(const Shape& other) const noexcept { + return !(*this == other); + } + } Shape; + + __host__ __device__ + size_t prod(const Shape&, const size_t& = 0) noexcept; + + template + struct Array { + Shape shape; + T* data = nullptr; + size_t* refcount = nullptr; + + __host__ __device__ + Array() noexcept { +// #ifdef __DEBUG +// print("Array created (default)"); +// #endif + } + + __host__ __device__ + Array(const Shape& shape, T* data) noexcept : shape(shape), data(data), refcount(new size_t(1)) { +// #ifdef __DEBUG +// print("Array created (raw, copy shape)"); +// #endif + } + + __host__ __device__ + Array(const Shape& shape) noexcept : shape(shape), data(new T[np::prod(shape)]), refcount(new size_t(1)) { +// #ifdef __DEBUG +// print("Array created (raw empty, copy shape)"); +// #endif + } + + __host__ __device__ + Array(Shape&& shape, T* data) noexcept : shape(std::move(shape)), data(data), refcount(new size_t(1)) { +// #ifdef __DEBUG +// print("Array created (raw, move shape)"); +// #endif + } + + __host__ __device__ + Array(Shape&& shape) noexcept : shape(std::move(shape)), data(new T[np::prod(shape)]), refcount(new size_t(1)) { +// #ifdef __DEBUG +// print("Array created (raw empty, move shape)"); +// #endif + } + + __host__ __device__ + Array(const Array& array) noexcept : shape(array.shape) { +#ifdef __DEBUG + print("Array created (copy)"); +#endif + if (data != nullptr && data != array.data){ +#ifdef __debug + print("Former array deleted (move)"); +#endif + delete[] data; + } + if (refcount != nullptr && refcount != array.refcount){ +#ifdef __DEBUG + print("Former array refcount freed (move)"); +#endif + delete refcount; + } + // const size_t size = np::prod(shape); + // data = new T[size]; + // memcpy(data, array.data, size); + // refcount = new size_t; + // memcpy(refcount, array.refcount, sizeof(size_t)); + + data = array.data; + refcount = array.refcount; + if (refcount != nullptr) + (*refcount)++; +#ifdef __DEBUG + else + print("Moved array has null refcount"); +#endif + } + + __host__ __device__ + Array(Array&& array) noexcept { +// #ifdef __DEBUG +// print("Array created (move)"); +// #endif + if (data != nullptr && data != array.data){ +#ifdef __DEBUG + print("Former array deleted (move)"); +#endif + delete[] data; + } + if (refcount != nullptr && refcount != array.refcount){ +#ifdef __DEBUG + print("Former array refcount freed (move)"); +#endif + delete refcount; + } + shape = std::move(array.shape); + data = array.data; + refcount = array.refcount; + + array.data = nullptr; + array.refcount = nullptr; + } + + __host__ __device__ + ~Array() noexcept { + if(refcount == nullptr){ +// #ifdef __DEBUG +// print("Array refcount freed more than once"); +// #endif + return; + } + --(*refcount); +// #ifdef __DEBUG +// printf("Array destructed : %lu\n", *refcount); +// #endif + if(*refcount == 0){ + if (data != nullptr){ + delete[] data; + data = nullptr; +// #ifdef __DEBUG +// print("Array freeing ..."); +// #endif + } +#ifdef __DEBUG + else + printf("Array freed more than once : %lu\n", *refcount); +#endif + delete refcount; + refcount = nullptr; + } + } + + __host__ __device__ + Array& operator=(const Array& array) noexcept { +#ifdef __DEBUG + print("Array created (assign copy)"); +#endif + if (data != nullptr && data != array.data){ +#ifdef __DEBUG + print("Former array deleted (assign copy)"); +#endif + delete[] data; + } + if (refcount != nullptr && refcount != array.refcount){ +#ifdef __DEBUG + print("Former array refcount freed (assign copy)"); +#endif + delete refcount; + } + shape = array.shape; + + // const size_t size = np::prod(shape) * sizeof(T); + // data = new T[size]; + // memcpy(data, array.data, size); + // refcount = new size_t; + // memcpy(refcount, array.refcount, sizeof(size_t)); + + data = array.data; + refcount = array.refcount; + if (refcount != nullptr) + (*refcount)++; +#ifdef __DEBUG + else + print("Assigned array has null refcount"); +#endif + return *this; + } + + __host__ __device__ + Array& operator=(Array&& array) noexcept { +// #ifdef __DEBUG +// print("Array created (assign move)"); +// #endif + if (data != nullptr && data != array.data){ +#ifdef __DEBUG + print("Former array deleted (assign move)"); +#endif + delete[] data; + } + if (refcount != nullptr && refcount != array.refcount){ +#ifdef __DEBUG + print("Former array refcount freed (assign move)"); +#endif + delete refcount; + } + shape = std::move(array.shape); + data = array.data; + refcount = array.refcount; + + array.refcount = nullptr; + array.data = nullptr; + + return *this; + } + + __host__ __device__ + constexpr T& operator[](const size_t& i) const; + // bool operator==(const Array& other) const noexcept; + template + Array& operator/=(const F& value) noexcept; + // Array& operator*=(const T& value) noexcept; + template + Array& operator*=(const Array& other); + template + Array& operator+=(const Array& other); + template + Array operator*(const Array& other) const; + // template + // Array operator*(const F& other) const; + template + Array operator-(const np::Array& other) const; + template + Array operator-(const F& other) const; + }; + + template + Array empty(Shape&& shape) noexcept { + return { std::move(shape), new T[np::prod(shape)] }; + } + + template + Array empty(const Shape& shape) noexcept { + return { std::move(shape), new T[np::prod(shape)] }; + } + + template + Array empty(const std::initializer_list& dims) noexcept { + const Shape shape(dims); + return { std::move(shape), new T[np::prod(shape)] }; + } + + template + Array zeros(Shape&& shape) noexcept { + return { std::move(shape), new T[np::prod(shape)]{0} }; + } + + template + Array zeros(const Shape& shape) noexcept { + return { std::move(shape), new T[np::prod(shape)]{0} }; + } + + template + Array zeros(const std::initializer_list& dims) noexcept { + const Shape shape(dims); + return { std::move(shape), new T[np::prod(shape)]{0} }; + } + + template + __host__ __device__ + constexpr T& Array::operator[](const size_t& i) const { +#ifdef __DEBUG + if (i > shape.total){ + printf("Index %lu out of array size %lu\n", i, shape.total); + #ifndef __CUDACC__ + throw std::out_of_range("Index out of array size"); + #endif + } +#endif + return data[i]; + } + + // bool Array::operator==(const Array& other) const noexcept { + // if (shape != other.shape) + // return false; + // const size_t lenght = np::prod(shape); + // for(size_t i = 0; i < length; ++i) + // if (data[i] != other[i]) + // return false; + // return true; + // } + + template + template + Array& Array::operator/=(const F& value) noexcept { + const size_t total = prod(shape); + for(size_t i = 0; i < total; ++i) + data[i] /= value; + return *this; + } + + // template + // Array& Array::operator*=(const T& value) noexcept { + // const size_t total = prod(shape); + // for(size_t i = 0; i < total; ++i) + // data[i] *= value; + // return *this; + // } + + template + template + Array Array::operator*(const Array& other) const { +#ifdef __DEBUG + if (shape != other.shape){ + printf("Incompatible shapes\n"); + throw; + } +#endif + np::Array res = np::empty(shape); + const size_t total = prod(shape); + for(size_t i = 0; i < total; ++i) + res[i] = data[i] * other[i]; + return res; + } + + // template + // template + // Array Array::operator*(const F& value) const { + // np::Array res = np::empty(shape); + // const size_t total = prod(shape); + // for(size_t i = 0; i < total; ++i) + // res[i] = data[i] * value; + // return res; + // } + + // template + // Array operator*(const F& value, const Array& other) { + // np::Array res = np::empty(other.shape); + // const size_t total = prod(other.shape); + // for(size_t i = 0; i < total; ++i) + // res[i] = other[i] * value; + // return res; + // } + + template + template + Array& Array::operator*=(const Array& other) { +#ifdef __DEBUG + if (shape != other.shape){ + printf("Incompatible shapes\n"); + throw; + } +#endif + const size_t total = prod(shape); + for(size_t i = 0; i < total; ++i) + data[i] *= other[i]; + return *this; + } + + template + template + Array& Array::operator+=(const Array& other) { +#ifdef __DEBUG + if (shape != other.shape){ + printf("Incompatible shapes\n"); + throw; + } +#endif + const size_t total = prod(shape); + for(size_t i = 0; i < total; ++i) + data[i] += other[i]; + return *this; + } + + template + template + Array Array::operator-(const F& other) const { + np::Array res = np::empty(shape); + const size_t total = prod(shape); + for(size_t i = 0; i < total; ++i) + res[i] = data[i] - other; + return res; + } + + template + template + Array Array::operator-(const np::Array& other) const { +#ifdef __DEBUG + if (shape != other.shape){ + printf("Incompatible shapes\n"); + throw; + } +#endif + np::Array res = np::empty(shape); + const size_t total = prod(shape); + for(size_t i = 0; i < total; ++i) + res[i] = data[i] - other[i]; + return res; + } + + // template + // T prod(const Array& array, const size_t& offset = 0) noexcept { + // T result = array[offset]; + // const size_t total = prod(array.shape); + // for(size_t i = 1 + offset; i < total; ++i) + // result *= array[i]; + // return result; + // } + + template + T sum(const Array& array) noexcept { + T result = array[0]; + const size_t total = prod(array.shape); + for(size_t i = 1; i < total; ++i) + result += array[i]; + return result; + } + + template + F mean(const Array& array) noexcept { + T result = array[0]; + const size_t total = prod(array.shape); + for(size_t i = 1; i < total; ++i) + result += array[i]; + return result / total; + } + + template + float64_t mean(const Array& array) noexcept { + return mean(array); + } + + template + np::Array abs(const Array& array) noexcept { + np::Array result = np::empty(array.shape); + const size_t total = prod(array.shape); + for(size_t i = 0; i < total; ++i) + result[i] = std::abs(array[i]); + return result; + } + + template + np::Array pow(const F& k, const Array& array) noexcept { + np::Array result = np::empty(array.shape); + const size_t total = prod(array.shape); + for(size_t i = 0; i < total; ++i) + result[i] = std::pow(k, array[i]); + return result; + } + + //template + //T max(const Array& array) noexcept { + // T result = array[0]; + // for(size_t i = 1; i < prod(array.shape); ++i) + // if(array[i] > result) + // result = array[i]; + // return result; + //} + + //template + //T min(const Array& array) noexcept { + // T result = array[0]; + // for(size_t i = 1; i < prod(array.shape); ++i) + // if(array[i] < result) + // result = array[i]; + // return result; + //} + + template + Array astype(const Array& array) noexcept { + Array res = empty(array.shape); + const size_t total = prod(array.shape); + for(size_t i = 0; i < total; ++i) + res[i] = static_cast(array[i]); + return res; + } + + template + Array operator-(const T& k, const Array& other) noexcept { + np::Array res = empty(other.shape); + const size_t total = prod(other.shape); + for(size_t i = 0; i < total; ++i) + res[i] = k - other[i]; + return res; + } + + template + __host__ __device__ + constexpr T min(const T& lhs, const T& rhs) noexcept { + return lhs < rhs ? lhs : rhs; + } + + // template + // __host__ __device__ + // constexpr T max(const T& lhs, const T& rhs) noexcept { + // return lhs > rhs ? lhs : rhs; + // } +}; + +template +constexpr np::Array& map(np::Array& a, const T(&fnc)(const size_t& i, const T& v)) noexcept { + return std::function(fnc); +} + +template +constexpr np::Array& map(np::Array& a, const std::function& fnc) noexcept { + const size_t a_length = np::prod(a.shape); + for (size_t i = 0; i < a_length; ++i) + a[i] = fnc(i, a[i]); + return a; +} + +//template +//constexpr void foreach(const np::Array& a, const void(&fnc)(const size_t&, const T&)) noexcept { +// return std::function(fnc); +//} + +//template +//constexpr void foreach(const np::Array& a, const std::function& fnc) noexcept { +// for (size_t i = 0; i < a.length; ++i) +// fnc(i, a[i]); +//} + +template +__host__ __device__ +constexpr inline static void swap(T* a, T* b) noexcept { + if (a == b) return; + const T temp = *a; + *a = *b; + *b = temp; +} + +template +static int32_t qs_partition(const np::Array& a, const int32_t& l, const int32_t& h) noexcept { + int32_t i = l - 1; + for (int32_t j = l; j <= h; ++j) + if (a[j] < a[h]) + swap(&a[++i], &a[j]); + swap(&a[++i], &a[h]); + return i; +} + +template +void quicksort(const np::Array& a, const int32_t& l, const int32_t& h) noexcept { + if (l >= h) + return; + const int32_t p = qs_partition(a, l, h); + quicksort(a, l, p - 1); + quicksort(a, p + 1, h); +} + +template +void quicksort(const np::Array& a) noexcept { + quicksort(a, 0, a.length - 1); +} + +template +static size_t as_partition(const T* a, uint16_t* indices, const size_t& l, const size_t& h) noexcept { + size_t i = l - 1; + for (size_t j = l; j <= h; ++j) + if (a[indices[j]] < a[indices[h]]) + swap(&indices[++i], &indices[j]); + swap(&indices[++i], &indices[h]); + return i; +} + +template +void argsort(const T* a, uint16_t* indices, const size_t& l, const size_t& h) noexcept { + const size_t total = h - l + 1; + + size_t* stack = new size_t[total]{l, h}; + size_t top = 1, low = l, high = h; + + while (top <= total) { + high = stack[top--]; + low = stack[top--]; + if(low >= high) + break; + + const size_t p = as_partition(a, indices, low, high); + + if (p - 1 > low && p - 1 < total) { + stack[++top] = low; + stack[++top] = p - 1; + } + + if (p + 1 < high) { + stack[++top] = p + 1; + stack[++top] = high; + } + } + delete[] stack; +} + +template +np::Array argsort(const np::Array& other, const size_t& l, const size_t& h) noexcept { + np::Array indices = np::empty(other.shape); + map(indices, [](const size_t& i, const uint16_t&) -> uint16_t { return i; }); + + argsort(other, indices, l, h); + return indices; +} + +template +np::Array argsort(const np::Array* other, const size_t& length) noexcept { + return argsort(other, 0, length - 1); +} + +std::array, 4> load_datasets(void); +void print_error_file(const char*) noexcept; + +template +void save(const np::Array& d, const char* filename) { + FILE* output = fopen(filename, "wb"); + if (output == NULL) { + print_error_file(filename); + throw; + } + assert(d.shape.refcount != 0);//, "Refcount shape is zero !!"); + fwrite(&d.shape.length, sizeof(size_t), 1, output); + fwrite(d.shape.data, sizeof(size_t), d.shape.length, output); + assert(d.refcount != 0);//, "Refcount array is zero !!"); + fwrite(d.data, sizeof(T), np::prod(d.shape), output); + fclose(output); +} + +template +np::Array load(const char* filename) { + FILE* input = fopen(filename, "rb"); + if (input == NULL) { + print_error_file(filename); + throw; + } + size_t length = 0; + if(!fread(&length, sizeof(size_t), 1, input)){ + print_error_file(filename); + fclose(input); + throw; + } + size_t* data = new size_t[length]; + if(!fread(data, sizeof(size_t), length, input)){ + print_error_file(filename); + fclose(input); + throw; + } + np::Array d = np::empty(np::Shape(length, data)); + if(!fread(d.data, sizeof(T), np::prod(d.shape), input)){ + print_error_file(filename); + fclose(input); + throw; + } + fclose(input); + return d; +} + +#ifdef __CUDACC__ +template +np::Array copyToDevice(const char* name, const np::Array& array) noexcept { + const size_t array_size = np::prod(array.shape) * sizeof(T); + const size_t shape_size = array.shape.length * sizeof(size_t); + np::Array d_array; + //_print_cuda_error_(name, cudaMalloc(&d_array.refcount, sizeof(size_t))); + _print_cuda_error_(name, cudaMalloc(&d_array.data, array_size)); + //_print_cuda_error_(name, cudaMalloc(&d_array.shape.refcount, sizeof(size_t))); + _print_cuda_error_(name, cudaMalloc(&d_array.shape.data, shape_size)); + d_array.shape.length = array.shape.length; + //_print_cuda_error_(name, cudaMemcpy(d_array.refcount, array.refcount, sizeof(size_t), cudaMemcpyHostToDevice)); + _print_cuda_error_(name, cudaMemcpy(d_array.data, array.data, array_size, cudaMemcpyHostToDevice)); + //_print_cuda_error_(name, cudaMemcpy(d_array.shape.refcount, array.shape.refcount, sizeof(size_t), cudaMemcpyHostToDevice)); + _print_cuda_error_(name, cudaMemcpy(d_array.shape.data, array.shape.data, shape_size, cudaMemcpyHostToDevice)); +#ifdef __DEBUG + d_array.shape.total = np::prod(array.shape); +#endif + return d_array; +} + +template +constexpr void cudaFree(const char* name, np::Array& array) noexcept { + //_print_cuda_error_(name, cudaFree(array.refcount)); + //array.refcount = nullptr; + _print_cuda_error_(name, cudaFree(array.data)); + array.data = nullptr; + //_print_cuda_error_(name, cudaFree(array.shape.refcount)); + //array.shape.refcount = nullptr; + _print_cuda_error_(name, cudaFree(array.shape.data)); + array.shape.data = nullptr; +} + +constexpr inline void _print_cuda_error_(const char* name, const cudaError_t& err) noexcept { + if (err != cudaSuccess) fprintf(stderr, "Error: %s = %d : %s\n", name, err, cudaGetErrorString(err)); +} +#endif + +int print(const np::Shape&) noexcept; +int print(const np::Array&) noexcept; +int print(const np::Array&) noexcept; +int print(const np::Array&, const np::Slice&) noexcept; +int print(const np::Array&, const np::Slice&) noexcept; +int print(const np::Array&, const np::Slice&) noexcept; +int print(const np::Array&, const np::Slice&) noexcept; +int print_feat(const np::Array&, const np::Slice&) noexcept; diff --git a/cpp/projet.cpp b/cpp/projet.cpp new file mode 100644 index 0000000..302baa9 --- /dev/null +++ b/cpp/projet.cpp @@ -0,0 +1,320 @@ +#include +namespace fs = std::filesystem; +#include "data.hpp" +#include "toolbox.hpp" +#include "config.hpp" +#include "ViolaJones.hpp" +#include "ViolaJonesGPU.hpp" +#include "ViolaJonesCPU.hpp" + +void test_float() noexcept; + +#ifdef __DEBUG +// #define IDX_INSPECT 0 +// #define IDX_INSPECT 2 +#define IDX_INSPECT 4548 +#define IDX_INSPECT_OFFSET 100 +#endif + +#if GPU_BOOSTED +#define LABEL "GPU" +#define apply_features apply_features_gpu +#define set_integral_image set_integral_image_gpu +#define argsort_2d argsort_2d_gpu +#else +#define LABEL "CPU" +#define apply_features apply_features_cpu +#define set_integral_image set_integral_image_cpu +#define argsort_2d argsort_2d_cpu +#endif + +std::tuple, np::Array, np::Array, np::Array, np::Array> preprocessing() { + // Creating state saver folders if they don't exist already + if (SAVE_STATE) + for (const char* const folder_name : { "models", "out" }) + fs::create_directory(folder_name); + + printf("| %-49s | %-17s | %-29s |\n", "Preprocessing", "Time spent (ns)", "Formatted time spent"); + printf("|%s|%s|%s|\n", S(51), S(19), S(31)); + + const auto [ X_train, y_train, X_test, y_test ] = state_saver("Loading sets", {"X_train", "y_train", "X_test", "y_test"}, + FORCE_REDO, SAVE_STATE, OUT_DIR, load_datasets); + +#ifdef __DEBUG + // print("X_train"); + // print(X_train.shape); + // print(X_train, { IDX_INSPECT }); + // print("X_test"); + // print(X_test.shape); + // print(X_test, { IDX_INSPECT }); + // print("y_train"); + // print(y_train.shape); + // print(y_train, { IDX_INSPECT, IDX_INSPECT + IDX_INSPECT_OFFSET }); + // print("y_test"); + // print(y_test.shape); + // print(y_test, { IDX_INSPECT, IDX_INSPECT + IDX_INSPECT_OFFSET }); +#endif + + const np::Array feats = state_saver("Building features", "feats", + FORCE_REDO, SAVE_STATE, OUT_DIR, build_features, X_train.shape[1], X_train.shape[2]); + +#ifdef __DEBUG + // print("feats"); + // print(feats.shape); + // print_feat(feats, { IDX_INSPECT }); +#endif + + const np::Array X_train_ii = state_saver("Converting training set to integral images (" LABEL ")", "X_train_ii_" LABEL, + FORCE_REDO, SAVE_STATE, OUT_DIR, set_integral_image, X_train); + const np::Array X_test_ii = state_saver("Converting testing set to integral images (" LABEL ")", "X_test_ii_" LABEL, + FORCE_REDO, SAVE_STATE, OUT_DIR, set_integral_image, X_test); + +#ifdef __DEBUG + // print("X_train_ii"); + // print(X_train_ii.shape); + // print(X_train_ii, { IDX_INSPECT }); + // print("X_test_ii"); + // print(X_test_ii.shape); + // print(X_test_ii, { IDX_INSPECT }); + // return {}; +#endif + + const np::Array X_train_feat = state_saver("Applying features to training set (" LABEL ")", "X_train_feat_" LABEL, + FORCE_REDO, SAVE_STATE, OUT_DIR, apply_features, feats, X_train_ii); + const np::Array X_test_feat = state_saver("Applying features to testing set (" LABEL ")", "X_test_feat_" LABEL, + FORCE_REDO, SAVE_STATE, OUT_DIR, apply_features, feats, X_test_ii); + +#ifdef __DEBUG + // print("X_train_feat"); + // print(X_train_feat.shape); + // print(X_train_feat, { IDX_INSPECT, IDX_INSPECT + IDX_INSPECT_OFFSET }); + // print("X_test_feat"); + // print(X_test_feat.shape); + // print(X_test_feat, { IDX_INSPECT, IDX_INSPECT + IDX_INSPECT_OFFSET }); +#endif + + // const Array indices = measure_time_save>("Selecting best features", "indices", select_percentile, X_train_feat, d.y_train); + // const Array indices = measure_time>("Selecting best features", select_percentile, X_train_feat, d.y_train); + +#ifdef __DEBUG + // print_feature(indices); +#endif + + const np::Array X_train_feat_argsort = state_saver("Precalculating training set argsort (" LABEL ")", "X_train_feat_argsort_" LABEL, + FORCE_REDO, SAVE_STATE, OUT_DIR, argsort_2d, X_train_feat); + +#ifdef __DEBUG + print("X_train_feat_argsort"); + print(X_train_feat_argsort.shape); + print(X_train_feat_argsort, { IDX_INSPECT, IDX_INSPECT + IDX_INSPECT_OFFSET }); +#endif + + // const np::Array X_test_feat_argsort = state_saver("Precalculating testing set argsort (" LABEL ")", "X_test_feat_argsort_" LABEL, + // FORCE_REDO, SAVE_STATE, OUT_DIR, argsort_2d, X_test_feat); + +#ifdef __DEBUG + // print("X_test_feat_argsort"); + // print(X_test_feat_argsort.shape); + // print(X_test_feat_argsort, { IDX_INSPECT, IDX_INSPECT + IDX_INSPECT_OFFSET }); +#endif + + return { X_train_feat, X_train_feat_argsort, y_train, X_test_feat, y_test }; +} + +void train(const np::Array& X_train_feat, const np::Array& X_train_feat_argsort, const np::Array& y_train) { + printf("\n| %-49s | %-17s | %-29s |\n", "Training", "Time spent (ns)", "Formatted time spent"); + printf("|%s|%s|%s|\n", S(51), S(19), S(31)); + + for (const size_t T : TS) { + char title[BUFFER_SIZE] = { 0 }; + char alphas_title[BUFFER_SIZE] = { 0 }; + char final_classifiers_title[BUFFER_SIZE] = { 0 }; + sprintf(title, "ViolaJones T = %-4lu (%s)", T, LABEL); + sprintf(alphas_title, "alphas_%lu_%s", T, LABEL); + sprintf(final_classifiers_title, "final_classifiers_%lu_%s", T, LABEL); + +#ifdef __DEBUG + const auto [ alphas, final_classifiers ] = state_saver(title, { alphas_title, final_classifiers_title }, +#else + state_saver(title, { alphas_title, final_classifiers_title }, +#endif + FORCE_REDO, SAVE_STATE, MODEL_DIR, train_viola_jones, T, X_train_feat, X_train_feat_argsort, y_train); +#ifdef __DEBUG + print("alphas"); + print(alphas); + print("final_classifiers"); + print(final_classifiers); +#endif + } +} + +void testing_and_evaluating(const np::Array& X_train_feat, const np::Array& y_train, const np::Array& X_test_feat, const np::Array& y_test) { + printf("\n| %-26s | Time spent (ns) (E) | %-29s | Time spent (ns) (T) | %-29s |\n", "Testing", "Formatted time spent (E)", "Formatted time spent (T)"); + printf("|%s|%s|%s|%s|%s|\n", S(28), S(21), S(31), S(21), S(31)); + + constexpr const size_t NT = sizeof(TS) / sizeof(size_t); + std::array, NT> results; + + size_t i = 0; + for (const size_t T : TS) { + char title[BUFFER_SIZE] = { 0 }; + char alphas_title[BUFFER_SIZE] = { 0 }; + char final_classifiers_title[BUFFER_SIZE] = { 0 }; + sprintf(title, "ViolaJones T = %-4lu (%s)", T, LABEL); + sprintf(alphas_title, MODEL_DIR "/alphas_%lu_%s.bin", T, LABEL); + sprintf(final_classifiers_title, MODEL_DIR "/final_classifiers_%lu_%s.bin", T, LABEL); + + const np::Array alphas = load(alphas_title); + const np::Array final_classifiers = load(final_classifiers_title); + + auto start = time(); + const np::Array y_pred_train = classify_viola_jones(alphas, final_classifiers, X_train_feat); + const long long t_pred_train = duration_ns(time() - start); + const float64_t e_acc = accuracy_score(y_train, y_pred_train); + const float64_t e_f1 = f1_score(y_train, y_pred_train); + float64_t e_FN, e_FP; + std::tie(std::ignore, e_FN, e_FP, std::ignore) = confusion_matrix(y_train, y_pred_train); + + start = time(); + const np::Array y_pred_test = classify_viola_jones(alphas, final_classifiers, X_test_feat); + const long long t_pred_test = duration_ns(time() - start); + const float64_t t_acc = accuracy_score(y_test, y_pred_test); + const float64_t t_f1 = f1_score(y_test, y_pred_test); + float64_t t_FN, t_FP; + std::tie(std::ignore, t_FN, t_FP, std::ignore) = confusion_matrix(y_test, y_pred_test); + results[i++] = { e_acc, e_f1, e_FN, e_FP, t_acc, t_f1, t_FN, t_FP }; + + printf("| %-26s | %'19lld | %-29s | %'19lld | %-29s |\n", title, t_pred_train, format_time_ns(t_pred_train).c_str(), t_pred_test, format_time_ns(t_pred_test).c_str()); + } + + printf("\n| %-19s | ACC (E) | F1 (E) | FN (E) | FP (E) | ACC (T) | F1 (T) | FN (T) | FP (T) |\n", "Evaluating"); + printf("|%s|%s|%s|%s|%s|%s|%s|%s|%s|\n", S(21), S(9), S(8), S(8), S(8), S(9), S(8), S(8), S(8)); + + i = 0; + for (const size_t T : TS) { + char title[BUFFER_SIZE] = { 0 }; + sprintf(title, "ViolaJones T = %-4lu", T); + const auto [e_acc, e_f1, e_FN, e_FP, t_acc, t_f1, t_FN, t_FP] = results[i++]; + printf("| %-19s | %'6.2f%% | %'6.2f | %'6.0f | %'6.0f | %6.2f%% | %'6.2f | %'6.0f | %'6.0f |\n", title, e_acc * 100, e_f1, e_FN, e_FP, t_acc * 100, t_f1, t_FN, t_FP); + } +} + +void final_unit_test() { + printf("\n| %-49s | %-10s | %-17s | %-29s |\n", "Unit testing", "Test state", "Time spent (ns)", "Formatted time spent"); + printf("|%s|%s|%s|%s|\n", S(51), S(12), S(19), S(31)); + + if(fs::exists(OUT_DIR "/X_train_ii_CPU.bin") && fs::exists(OUT_DIR "/X_train_ii_GPU.bin")){ + const np::Array X_train_ii_cpu = load(OUT_DIR "/X_train_ii_CPU.bin"); + const np::Array X_train_ii_gpu = load(OUT_DIR "/X_train_ii_GPU.bin"); + benchmark_function_void("X_train_ii CPU vs GPU", unit_test_cpu_vs_gpu, X_train_ii_cpu, X_train_ii_gpu); + } + + if(fs::exists(OUT_DIR "/X_test_ii_CPU.bin") && fs::exists(OUT_DIR "/X_test_ii_GPU.bin")){ + const np::Array X_test_ii_cpu = load(OUT_DIR "/X_test_ii_CPU.bin"); + const np::Array X_test_ii_gpu = load(OUT_DIR "/X_test_ii_GPU.bin"); + benchmark_function_void("X_test_ii CPU vs GPU", unit_test_cpu_vs_gpu, X_test_ii_cpu, X_test_ii_gpu); + } + + if(fs::exists(OUT_DIR "/X_train_feat_CPU.bin")){ + const np::Array X_train_feat = load(OUT_DIR "/X_train_feat_CPU.bin"); + + if(fs::exists(OUT_DIR "/X_train_feat_GPU.bin")){ + const np::Array X_train_feat_gpu = load(OUT_DIR "/X_train_feat_CPU.bin"); + benchmark_function_void("X_train_feat CPU vs GPU", unit_test_cpu_vs_gpu, X_train_feat, X_train_feat_gpu); + } + + np::Array X_train_feat_argsort_cpu; + uint8_t loaded = 0; + if(fs::exists(OUT_DIR "/X_train_feat_argsort_CPU.bin")){ + X_train_feat_argsort_cpu = std::move(load(OUT_DIR "/X_train_feat_argsort_CPU.bin")); + ++loaded; + benchmark_function_void("argsort_2D training set (CPU)", unit_test_argsort_2d, X_train_feat, X_train_feat_argsort_cpu); + } + + np::Array X_train_feat_argsort_gpu; + if(fs::exists(OUT_DIR "/X_train_feat_argsort_GPU.bin")){ + X_train_feat_argsort_gpu = std::move(load(OUT_DIR "/X_train_feat_argsort_GPU.bin")); + ++loaded; + benchmark_function_void("argsort_2D training set (GPU)", unit_test_argsort_2d, X_train_feat, X_train_feat_argsort_gpu); + } + + if (loaded == 2) + benchmark_function_void("X_train_feat_argsort CPU vs GPU", unit_test_cpu_vs_gpu, X_train_feat_argsort_cpu, X_train_feat_argsort_gpu); + } + + if(fs::exists(OUT_DIR "/X_test_feat_CPU.bin")){ + const np::Array X_test_feat = load(OUT_DIR "/X_test_feat_CPU.bin"); + + if(fs::exists(OUT_DIR "/X_test_feat_GPU.bin")){ + const np::Array X_test_feat_gpu = load(OUT_DIR "/X_test_feat_GPU.bin"); + benchmark_function_void("X_test_feat CPU vs GPU", unit_test_cpu_vs_gpu, X_test_feat, X_test_feat_gpu); + } + + np::Array X_test_feat_argsort_cpu; + uint8_t loaded = 0; + if(fs::exists(OUT_DIR "/X_test_feat_argsort_CPU.bin")){ + X_test_feat_argsort_cpu = std::move(load(OUT_DIR "/X_test_feat_argsort_CPU.bin")); + ++loaded; + benchmark_function_void("argsort_2D testing set (CPU)", unit_test_argsort_2d, X_test_feat, X_test_feat_argsort_cpu); + } + + np::Array X_test_feat_argsort_gpu; + if(fs::exists(OUT_DIR "/X_test_feat_argsort_GPU.bin")){ + X_test_feat_argsort_gpu = std::move(load(OUT_DIR "/X_test_feat_argsort_GPU.bin")); + ++loaded; + benchmark_function_void("argsort_2D testing set (GPU)", unit_test_argsort_2d, X_test_feat, X_test_feat_argsort_gpu); + } + + if (loaded == 2) + benchmark_function_void("X_test_feat_argsort CPU vs GPU", unit_test_cpu_vs_gpu, X_test_feat_argsort_cpu, X_test_feat_argsort_gpu); + } + + char title[BUFFER_SIZE] = { 0 }; + char alphas_title[BUFFER_SIZE] = { 0 }; + char final_classifiers_title[BUFFER_SIZE] = { 0 }; + + for (const size_t T : TS) { + sprintf(alphas_title, MODEL_DIR "/alphas_%lu_CPU.bin", T); + if(!fs::exists(alphas_title)) continue; + const np::Array alphas = load(alphas_title); + + sprintf(final_classifiers_title, MODEL_DIR "/final_classifiers_%lu_CPU.bin", T); + if(!fs::exists(final_classifiers_title)) continue; + const np::Array final_classifiers = load(final_classifiers_title); + + sprintf(alphas_title, MODEL_DIR "/alphas_%lu_GPU.bin", T); + if(!fs::exists(alphas_title)) continue; + const np::Array alphas_gpu = load(alphas_title); + + sprintf(final_classifiers_title, MODEL_DIR "/final_classifiers_%lu_GPU.bin", T); + if(!fs::exists(final_classifiers_title)) continue; + const np::Array final_classifiers_gpu = load(final_classifiers_title); + + sprintf(title, "alphas %ld CPU vs GPU", T); + benchmark_function_void(title, unit_test_cpu_vs_gpu, alphas, alphas_gpu); + sprintf(title, "final_classifiers %ld CPU vs GPU", T); + benchmark_function_void(title, unit_test_cpu_vs_gpu, final_classifiers, final_classifiers_gpu); + } +} + +int main(){ +#ifdef __DEBUG + printf("| %-49s | %-17s | %-29s |\n", "Unit testing", "Time spent (ns)", "Formatted time spent"); + printf("|%s|%s|%s|\n", S(51), S(19), S(31)); + benchmark_function_void("Testing GPU capabilities 1D", test_working, 3 + (1<<29)); + benchmark_function_void("Testing GPU capabilities 2D", test_working_2d, 3 + (1<<15), 2 + (1<<14)); + benchmark_function_void("Testing GPU capabilities 3D", test_working_3d, 9 + (1<<10), 5 + (1<<10), 7 + (1<<9)); + benchmark_function_void("Testing toolbox", toolbox_unit_test); + // benchmark_function_void("Testing floating capabilities", test_float); + printf("\n"); +#endif + setlocale(LC_NUMERIC, ""); // Allow proper number display + const auto [ X_train_feat, X_train_feat_argsort, y_train, X_test_feat, y_test ] = preprocessing(); + train(X_train_feat, X_train_feat_argsort, y_train); + testing_and_evaluating(X_train_feat, y_train, X_test_feat, y_test); + final_unit_test(); +#ifdef __DEBUG + printf("\nAFTER CLEANUP\n"); +#endif + return EXIT_SUCCESS; +} diff --git a/cpp/test.cpp b/cpp/test.cpp new file mode 100644 index 0000000..c46d2d0 --- /dev/null +++ b/cpp/test.cpp @@ -0,0 +1,63 @@ +#include +#include +#include "data.hpp" +#include "toolbox.hpp" + +#define PBSTR "||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||" +#define PBWIDTH 60 + +void printProgress(const float64_t& percentage) noexcept { + const uint64_t val = static_cast(percentage * 100); + const int lpad = static_cast(percentage * PBWIDTH); + const int rpad = PBWIDTH - lpad; + printf("%3lu%% [%.*s%*s]\r", val, lpad, PBSTR, rpad, ""); + fflush(stdout); +} + +void clearProgress() noexcept { + // Progress bar width + space before + num space + space after + printf("%*c\r", PBWIDTH + 1 + 3 + 3, ' '); +} + +template +void test(const uint64_t& N) noexcept { +#ifdef __DEBUG + printf("DETERMINISTIC for N=%s of %s sized %s\n", thousand_sep(N).c_str(), typeid(T).name(), format_byte_size(sizeof(T)).c_str()); + print("Estimating memory footprint at : " + format_byte_size(3 * N * sizeof(T))); +#endif + + T *a = new T[N], *b = new T[N], *c = new T[N]; + + T mean = static_cast(0.0); + const size_t percent = N / 100; + for(size_t i = 0; i < N; ++i){ + if (i % percent == 0) printProgress(static_cast(i) / N); + a[i] = static_cast(i < N>>1 ? 0.1 : 1.0); + b[i] = static_cast(1.0); + c[i] = a[i] * b[i]; + mean += c[i]; + } + mean /= static_cast(N); + + clearProgress(); + std::cout << mean << std::endl; + + delete[] a, delete[] b, delete[] c; +} + +void test_float() noexcept { + std::cout << std::setprecision(1<<8); + const uint64_t N = static_cast(1)<<28; + test(N); + test(N); + test(N); + + //printf("%.128af\n", static_cast(1) / 3); + //std::cout << static_cast(1) / 3 << std::endl; + //std::cout << std::hexfloat << static_cast(1) / 3 << std::endl; + + //printf("%.128Lf\n", static_cast(1) / 3); + //printf("%.128lf\n", static_cast(1) / 3); + //printf("%.128f\n", static_cast(1) / 3); +} + diff --git a/cpp/toolbox.cpp b/cpp/toolbox.cpp new file mode 100644 index 0000000..ddea4b9 --- /dev/null +++ b/cpp/toolbox.cpp @@ -0,0 +1,133 @@ +#include "toolbox.hpp" +#include +#include +#include +#include + +static constexpr size_t N_TIMES = 10; +static const std::array time_formats = { "ns", "us", "ms", "s", "m", "h", "j", "w", "M", "y" }; +static constexpr std::array time_numbers = { 1, 1000, 1000, 1000, 60, 60, 24, 7, 4, 12 }; +static const uint64_t total_time = std::accumulate(time_numbers.begin(), time_numbers.end(), (uint64_t)1, std::multiplies()); + +/** + * @brief Format the time in seconds in human readable format. + * + * @param time Time in seconds + * @return std::string The formatted human readable string. + */ +std::string format_time(uint64_t time) noexcept { + return time < 2 ? std::to_string(time) + "s" : format_time_ns(time * (uint64_t)1e9); +} + +/** + * @brief Format the time in nanoseconds in human readable format. + * + * @param time Time in nanoseconds + * @return std::string The formatted human readable string. + */ +std::string format_time_ns(uint64_t time) noexcept { + if (time == 0) + return "0ns"; + uint64_t prod = total_time; + + std::string s = ""; + uint64_t res; + for (int i = N_TIMES - 1; i >= 0; --i) { + if (time >= prod) { + res = time / prod; + time %= prod; + s += std::to_string(res) + time_formats[i] + " "; + } + prod /= time_numbers[i]; + } + + if (s.back() == ' ') + s.pop_back(); + + return s; +} + +static const constexpr size_t N_BYTES = 7; +static const constexpr std::array bytes_formats = { "", "K", "M", "G", "P", "E", "Z" }; //, "Y" }; +static const constexpr uint64_t total_bytes = static_cast(1)<<(10 * (N_BYTES - 1)); + +std::string format_byte_size(uint64_t bytes) noexcept { + if (bytes == 0) + return "0B"; + uint64_t prod = total_bytes; + + std::string s = ""; + uint64_t res; + for (size_t i = N_BYTES; i > 0; --i) { + if (bytes >= prod) { + res = bytes / prod; + bytes %= prod; + s += std::to_string(res) + bytes_formats[i - 1] + "B "; + } + prod /= static_cast(1)<<10; + } + + if (s.back() == ' ') + s.pop_back(); + + return s; +} + +void toolbox_unit_test() noexcept { + assert(std::string("0B") == format_byte_size(static_cast(0))); + assert(std::string("1B") == format_byte_size(static_cast(1))); + assert(std::string("1KB") == format_byte_size(static_cast(1)<<10)); + assert(std::string("1MB") == format_byte_size(static_cast(1)<<20)); + assert(std::string("1GB") == format_byte_size(static_cast(1)<<30)); + assert(std::string("1PB") == format_byte_size(static_cast(1)<<40)); + assert(std::string("1EB") == format_byte_size(static_cast(1)<<50)); + assert(std::string("1ZB") == format_byte_size(static_cast(1)<<60)); + //assert(std::string("1YB") == format_byte_size(static_cast(1)<<70)); + // UINT64_MAX == 18446744073709551615I64u == -1 + assert(std::string("15ZB 1023EB 1023PB 1023GB 1023MB 1023KB 1023B") == format_byte_size(static_cast(-1))); + + assert(std::string("0s") == format_time(static_cast(0))); + assert(std::string("1s") == format_time(static_cast(1))); + assert(std::string("1m") == format_time(static_cast(60))); + assert(std::string("1h") == format_time(static_cast(3600))); + assert(std::string("1j") == format_time(static_cast(86400))); + assert(std::string("1w") == format_time(static_cast(604800))); + assert(std::string("1M") == format_time(static_cast(2419200))); + assert(std::string("1y") == format_time(static_cast(29030400))); + + assert(std::string("0ns") == format_time_ns(static_cast(0))); + assert(std::string("1ns") == format_time_ns(static_cast(1))); + assert(std::string("1us") == format_time_ns(static_cast(1e3))); + assert(std::string("1ms") == format_time_ns(static_cast(1e6))); + assert(std::string("1s") == format_time_ns(static_cast(1e9))); + assert(std::string("1m") == format_time_ns(static_cast(6e10))); + assert(std::string("1h") == format_time_ns(static_cast(36e11))); + assert(std::string("1j") == format_time_ns(static_cast(864e11))); + assert(std::string("1w") == format_time_ns(static_cast(6048e11))); + assert(std::string("1M") == format_time_ns(static_cast(24192e11))); + assert(std::string("1y") == format_time_ns(static_cast(290304e11))); + // UINT64_MAX == 18446744073709551615I64u == -1 + assert(std::string("635y 5M 3j 23h 34m 33s 709ms 551us 615ns") == format_time_ns(static_cast(-1))); +} + +std::string thousand_sep(uint64_t k) noexcept { + std::string s = "", n = std::to_string(k); + + uint8_t c = 0; + for (const auto& n_i : n) { + ++c; + s.push_back(n_i); + if (c == 3) { + s.push_back(','); + c = 0; + } + } + + std::reverse(s.begin(), s.end()); + + if (s.size() % 4 == 0) + s.erase(s.begin()); + + return s; +} + diff --git a/cpp/toolbox.hpp b/cpp/toolbox.hpp new file mode 100644 index 0000000..e657712 --- /dev/null +++ b/cpp/toolbox.hpp @@ -0,0 +1,13 @@ +#pragma once +#include +#include +#include + +#define duration_ns(a) std::chrono::duration_cast(a).count() +#define time() std::chrono::high_resolution_clock::now() + +std::string format_time(uint64_t) noexcept; +std::string format_time_ns(uint64_t) noexcept; +std::string format_byte_size(uint64_t) noexcept; +void toolbox_unit_test() noexcept; +std::string thousand_sep(uint64_t) noexcept; diff --git a/download_data.sh b/download_data.sh new file mode 100755 index 0000000..e7f9cd9 --- /dev/null +++ b/download_data.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +#!/bin/sh + +# Exit if any of the command doesn't exit with code 0 +set -e + +EXEC_DIR=$1 +test -z $EXEC_DIR && EXEC_DIR=. +DATA_LOCATION=$EXEC_DIR/data +mkdir -p $DATA_LOCATION + +if [ ! -f $DATA_LOCATION/X_train.bin ] || [ ! -f $DATA_LOCATION/X_test.bin ] \ +|| [ ! -f $DATA_LOCATION/y_train.bin ] || [ ! -f $DATA_LOCATION/y_test.bin ]; then +#if true; then + if [ ! -f $DATA_LOCATION/faces.tar.gz ]; then + echo 'Downloading raw dataset' + curl -o $DATA_LOCATION/faces.tar.gz http://www.ai.mit.edu/courses/6.899/lectures/faces.tar.gz + fi + + echo 'Extracting raw files' + tar xzf $DATA_LOCATION/faces.tar.gz -C $DATA_LOCATION + rm $DATA_LOCATION/README + rm $DATA_LOCATION/svm.* + + echo 'Extracting raw train set' + tar xzf $DATA_LOCATION/face.train.tar.gz -C $DATA_LOCATION + rm $DATA_LOCATION/face.train.tar.gz + + echo 'Extracting raw test set' + tar xzf $DATA_LOCATION/face.test.tar.gz -C $DATA_LOCATION + rm $DATA_LOCATION/face.test.tar.gz + + echo 'Converting raw dataset to bin file' + source $EXEC_DIR/python/activate.sh $EXEC_DIR + python $EXEC_DIR/python/convert_dataset.py $DATA_LOCATION + + echo 'Removing leftovers' + rm -rf $DATA_LOCATION/train + rm -rf $DATA_LOCATION/test + + echo 'Done !' +fi diff --git a/python/Makefile b/python/Makefile new file mode 100644 index 0000000..078e4bd --- /dev/null +++ b/python/Makefile @@ -0,0 +1,34 @@ +DATA := ../data/X_train.bin ../data/X_test.bin ../data/y_train.bin ../data/y_test.bin + +.PHONY: all start reset + +all: ${DATA} + +${DATA}: + @bash ../download_data.sh .. + +venv: + @bash -c 'source activate.sh' + +start: ${DATA} venv + @bash -c 'source activate.sh && python projet.py' + +reset: + @echo Deleting generated states and models + @rm -rf out/* models/* | true + +debug: + @bash -c 'source activate.sh && pudb projet.py' + +profile: + @bash -c 'source activate.sh && python -m cProfile -o prof.out projet.py && gprof2dot -f pstats prof.out | dot -Tpng -o output.png' + +mrproper: reset + @rm -r __pycache__ venv + +test: + @bash -c 'source activate.sh && ls out | sed s/.pkl// | xargs -n1 python test_diff.py out' + @bash -c 'source activate.sh && ls models | sed s/.pkl// | xargs -n1 python test_diff.py models' + +help: + @echo "all start reset mrproper help" diff --git a/python/ViolaJones.py b/python/ViolaJones.py new file mode 100644 index 0000000..1c75e4b --- /dev/null +++ b/python/ViolaJones.py @@ -0,0 +1,182 @@ +from typing import Tuple, Iterable +from tqdm import tqdm +import numpy as np +import config + +if config.GPU_BOOSTED: + from ViolaJonesGPU import train_weak_clf +else: + from ViolaJonesCPU import train_weak_clf + +if config.COMPILE_WITH_C: + from numba import njit + @njit + def tqdm_iter(iter: Iterable, _: str): + return iter +else: + from decorators import njit, tqdm_iter + +@njit('uint8[:, :, :, :](uint16, uint16)') +def build_features(width: int, height: int) -> np.ndarray: + """Initialize the features base on the input shape. + + Args: + shape (Tuple[int, int]): Shape of the image (Width, Height). + + Returns: + np.ndarray: The initialized features. + """ + feats = [] + empty = (0, 0, 0, 0) + for w in range(1, width + 1): + for h in range(1, height + 1): + for i in range(width - w): + for j in range(height - h): + # 2 rectangle features + immediate = (i, j, w, h) + right = (i + w, j, w, h) + if i + 2 * w < width: # Horizontally Adjacent + feats.append(([right, empty], [immediate, empty])) + + bottom = (i, j + h, w, h) + if j + 2 * h < height: # Vertically Adjacent + feats.append((([immediate, empty], [bottom, empty]))) + + right_2 = (i + 2 * w, j, w, h) + # 3 rectangle features + if i + 3 * w < width: # Horizontally Adjacent + feats.append((([right, empty], [right_2, immediate]))) + + bottom_2 = (i, j + 2 * h, w, h) + if j + 3 * h < height: # Vertically Adjacent + feats.append((([bottom, empty], [bottom_2, immediate]))) + + # 4 rectangle features + bottom_right = (i + w, j + h, w, h) + if i + 2 * w < width and j + 2 * h < height: + feats.append((([right, bottom], [immediate, bottom_right]))) + + return np.asarray(feats, dtype = np.uint8) + +@njit('float64[:](uint8[:])') +def init_weights(y_train: np.ndarray) -> np.ndarray: + """Initialize the weights of the weak classifiers based on the training labels. + + Args: + y_train (np.ndarray): Training labels. + + Returns: + np.ndarray: The initialized weights. + """ + weights = np.empty_like(y_train, dtype = np.float64) + t = y_train.sum() + weights[y_train == 0] = 1.0 / (2 * t) + weights[y_train == 1] = 1.0 / (2 * (y_train.shape[0] - t)) + return weights + +@njit('int8[:](int32[:], int32, int32)') +def classify_weak_clf(x_feat_i: np.ndarray, threshold: int, polarity: int) -> np.ndarray: + """Classify the integrated features based on polarity and threshold. + + Args: + x_feat_i (np.ndarray): Integrated features. + threshold (int): Trained threshold. + polarity (int): Trained polarity. + + Returns: + np.ndarray: Classified features. + """ + res = np.zeros_like(x_feat_i, dtype = np.int8) + res[polarity * x_feat_i < polarity * threshold] = 1 + return res + +@njit('Tuple((int32, float64, float64[:]))(int32[:, :], float64[:], int32[:, :], uint8[:])') +def select_best(classifiers: np.ndarray, weights: np.ndarray, X_feat: np.ndarray, y: np.ndarray) -> Tuple[int, float, np.ndarray]: + """Select the best classifier given theirs predictions. + + Args: + classifiers (np.ndarray): The weak classifiers. + weights (np.ndarray): Trained weights of each classifiers. + X_feat (np.ndarray): Integrated features. + y (np.ndarray): Features labels. + + Returns: + Tuple[int, float, np.ndarray]: Index of the best classifier, the best error and the best accuracy + """ + best_clf, best_error, best_accuracy = 0, np.inf, np.empty(X_feat.shape[1], dtype = np.float64) + for j, (threshold, polarity) in enumerate(tqdm_iter(classifiers, "Selecting best classifiers")): + accuracy = np.abs(classify_weak_clf(X_feat[j], threshold, polarity) - y).astype(np.float64) + error = np.mean(weights * accuracy) + if error < best_error: + best_clf, best_error, best_accuracy = j, error, accuracy + return best_clf, best_error, best_accuracy + +#@njit('Tuple((float64[:], int32[:, :]))(uint16, int32[:, :], uint16[:, :], uint8[:])') +def train_viola_jones(T: int, X_feat: np.ndarray, X_feat_argsort: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + """Train the weak classifiers. + + Args: + T (int): Number of weak classifiers. + X_feat (np.ndarray): Integrated features. + X_feat_argsort (np.ndarray): Sorted indexes of the integrated features. + y (np.ndarray): Features labels. + + Returns: + Tuple[np.ndarray, np.ndarray]: List of trained alphas and the list of the final classifiers. + """ + weights = init_weights(y) + alphas, final_classifier = np.empty(T, dtype = np.float64), np.empty((T, 3), dtype = np.int32) + + #for t in tqdm_iter(range(T), "Training ViolaJones"): + for t in tqdm(range(T), desc = "Training ViolaJones", leave = False): + weights /= weights.sum() + classifiers = train_weak_clf(X_feat, X_feat_argsort, y, weights) + clf, error, accuracy = select_best(classifiers, weights, X_feat, y) + beta = error / (1.0 - error) + weights *= beta ** (1.0 - accuracy) + alphas[t] = np.log(1.0 / beta) + final_classifier[t] = (clf, classifiers[clf][0], classifiers[clf][1]) + + return alphas, final_classifier + +@njit('uint8[:](float64[:], int32[:, :], int32[:, :])') +def classify_viola_jones(alphas: np.ndarray, classifiers: np.ndarray, X_feat: np.ndarray) -> np.ndarray: + """Classify the trained classifiers on the given features. + + Args: + alphas (np.ndarray): Trained alphas. + classifiers (np.ndarray): Trained classifiers. + X_feat (np.ndarray): Integrated features. + + Returns: + np.ndarray: Classification results. + """ + total = np.zeros(X_feat.shape[1], dtype = np.float64) + + for i, alpha in enumerate(tqdm_iter(alphas, "Classifying ViolaJones")): + (j, threshold, polarity) = classifiers[i] + total += alpha * classify_weak_clf(X_feat[j], threshold, polarity) + + y_pred = np.zeros(X_feat.shape[1], dtype = np.uint8) + y_pred[total >= 0.5 * np.sum(alphas)] = 1 + return y_pred + +@njit +def get_best_anova_features(X: np.ndarray, y: np.ndarray) -> np.ndarray: + #SelectPercentile(f_classif, percentile = 10).fit(X, y).get_support(indices = True) + classes = [X.T[y == 0].astype(np.float64), X.T[y == 1].astype(np.float64)] + n_samples_per_class = np.asarray([classes[0].shape[0], classes[1].shape[0]]) + n_samples = classes[0].shape[0] + classes[1].shape[0] + ss_alldata = (classes[0] ** 2).sum(axis = 0) + (classes[1] ** 2).sum(axis = 0) + sums_classes = [np.asarray(classes[0].sum(axis = 0)), np.asarray(classes[1].sum(axis = 0))] + sq_of_sums_alldata = (sums_classes[0] + sums_classes[1]) ** 2 + sq_of_sums_args = [sums_classes[0] ** 2, sums_classes[1] ** 2] + ss_tot = ss_alldata - sq_of_sums_alldata / n_samples + + sqd_sum_bw_n = sq_of_sums_args[0] / n_samples_per_class[0] + \ + sq_of_sums_args[1] / n_samples_per_class[1] - sq_of_sums_alldata / n_samples + ss_wn = ss_tot - sqd_sum_bw_n + df_wn = n_samples - 2 + msw = ss_wn / df_wn + f_values = sqd_sum_bw_n / msw + return np.sort(np.argsort(f_values)[::-1][: int(np.ceil(X.shape[0] / 10.0))]) diff --git a/python/ViolaJonesCPU.py b/python/ViolaJonesCPU.py new file mode 100644 index 0000000..21966bf --- /dev/null +++ b/python/ViolaJonesCPU.py @@ -0,0 +1,168 @@ +from config import COMPILE_WITH_C +from typing import Iterable +from numba import int32, float64 +import numpy as np + +if COMPILE_WITH_C: + from numba import njit + @njit + def tqdm_iter(iter: Iterable, _: str): + return iter +else: + from decorators import njit, tqdm_iter + import sys + sys.setrecursionlimit(10000) + +@njit('uint32[:, :, :](uint8[:, :, :])') +def set_integral_image(X: np.ndarray) -> np.ndarray: + """Transform the input images in integrated images (CPU version). + + Args: + X (np.ndarray): Dataset of images. + + Returns: + np.ndarray: Dataset of integrated images. + """ + X_ii = np.empty_like(X, dtype = np.uint32) + for i, Xi in enumerate(tqdm_iter(X, "Applying integral image")): + ii = np.zeros_like(Xi, dtype = np.uint32) + for y in range(1, Xi.shape[0]): + s = 0 + for x in range(Xi.shape[1] - 1): + s += Xi[y - 1, x] + ii[y, x + 1] = s + ii[y - 1, x + 1] + X_ii[i] = ii + return X_ii + +@njit('uint32(uint32[:, :], int16, int16, int16, int16)') +def __compute_feature__(ii: np.ndarray, x: int, y: int, w: int, h: int) -> int: + """Compute a feature on an integrated image at a specific coordinate (CPU version). + + Args: + ii (np.ndarray): Integrated image. + x (int): X coordinate. + y (int): Y coordinate. + w (int): width of the feature. + h (int): height of the feature. + + Returns: + int: Computed feature. + """ + return ii[y + h, x + w] + ii[y, x] - ii[y + h, x] - ii[y, x + w] + +@njit('int32[:, :](uint8[:, :, :, :], uint32[:, :, :])') +def apply_features(feats: np.ndarray, X_ii: np.ndarray) -> np.ndarray: + """Apply the features on a integrated image dataset (CPU version). + + Args: + feats (np.ndarray): Features to apply. + X_ii (np.ndarray): Integrated image dataset. + + Returns: + np.ndarray: Applied features. + """ + X_feat = np.empty((feats.shape[0], X_ii.shape[0]), dtype = np.int32) + + for i, (p, n) in enumerate(tqdm_iter(feats, "Applying features")): + for j, x_i in enumerate(X_ii): + p_x, p_y, p_w, p_h = p[0] + p1_x, p1_y, p1_w, p1_h = p[1] + n_x, n_y, n_w, n_h = n[0] + n1_x, n1_y, n1_w, n1_h = n[1] + p1 = __compute_feature__(x_i, p_x, p_y, p_w, p_h) + __compute_feature__(x_i, p1_x, p1_y, p1_w, p1_h) + n1 = __compute_feature__(x_i, n_x, n_y, n_w, n_h) + __compute_feature__(x_i, n1_x, n1_y, n1_w, n1_h) + X_feat[i, j] = int32(p1) - int32(n1) + + return X_feat + +@njit('int32[:, :](int32[:, :], uint16[:, :], uint8[:], float64[:])') +def train_weak_clf(X_feat: np.ndarray, X_feat_argsort: np.ndarray, y: np.ndarray, weights: np.ndarray) -> np.ndarray: + """Train the weak classifiers on a given dataset (CPU version). + + Args: + X_feat (np.ndarray): Feature images dataset. + X_feat_argsort (np.ndarray): Sorted indexes of the integrated features. + y (np.ndarray): Labels of the features. + weights (np.ndarray): Weights of the features. + + Returns: + np.ndarray: Trained weak classifiers. + """ + total_pos, total_neg = weights[y == 1].sum(), weights[y == 0].sum() + + classifiers = np.empty((X_feat.shape[0], 2), dtype = np.int32) + for i, feature in enumerate(tqdm_iter(X_feat, "Training weak classifiers")): + pos_seen, neg_seen = 0, 0 + pos_weights, neg_weights = 0, 0 + min_error, best_threshold, best_polarity = float64(np.inf), 0, 0 + for j in X_feat_argsort[i]: + error = min(neg_weights + total_pos - pos_weights, pos_weights + total_neg - neg_weights) + if error < min_error: + min_error = error + best_threshold = feature[j] + best_polarity = 1 if pos_seen > neg_seen else -1 + + if y[j] == 1: + pos_seen += 1 + pos_weights += weights[j] + else: + neg_seen += 1 + neg_weights += weights[j] + + classifiers[i] = (best_threshold, best_polarity) + return classifiers + +@njit('int32(int32[:], uint16[:], int32, int32)') +def as_partition(a: np.ndarray, indices: np.ndarray, l: int, h: int) -> int: + i = l - 1 + j = l + for j in range(l, h + 1): + if a[indices[j]] < a[indices[h]]: + i += 1 + indices[i], indices[j] = indices[j], indices[i] + + i += 1 + indices[i], indices[j] = indices[j], indices[i] + return i + +@njit('void(int32[:], uint16[:], int32, int32)') +def argsort_bounded(a: np.ndarray, indices: np.ndarray, l: int, h: int): + total = h - l + 1; + stack = np.empty((total,), dtype = np.int32) + stack[0] = l + stack[1] = h + top = 1; + + low = l + high = h + + while top >= 0: + high = stack[top] + top -= 1 + low = stack[top] + top -= 1 + + if low >= high: + break; + + p = as_partition(a, indices, low, high); + + if p - 1 > low: + top += 1 + stack[top] = low; + top += 1 + stack[top] = p - 1; + + if p + 1 < high: + top += 1 + stack[top] = p + 1; + top += 1 + stack[top] = high; + +@njit('uint16[:, :](int32[:, :])') +def argsort(X_feat: np.ndarray) -> np.ndarray: + indices = np.empty_like(X_feat, dtype = np.uint16) + indices[:, :] = np.arange(indices.shape[1]) + for i in tqdm_iter(range(X_feat.shape[0]), "argsort"): + argsort_bounded(X_feat[i], indices[i], 0, X_feat[i].shape[0] - 1) + return indices diff --git a/python/ViolaJonesGPU.py b/python/ViolaJonesGPU.py new file mode 100644 index 0000000..7318c4c --- /dev/null +++ b/python/ViolaJonesGPU.py @@ -0,0 +1,372 @@ +from numba import float64, uint32, cuda, int32, uint16 +from config import COMPILE_WITH_C +import numpy as np + +NB_THREADS = 1024 +NB_THREADS_2D = (32, 32) +NB_THREADS_3D = (16, 16, 4) +M = int(np.log2(NB_THREADS_2D[1])) + +if COMPILE_WITH_C: + from numba import njit +else: + from decorators import njit + +@njit('uint32[:, :, :](uint32[:, :, :])') +def __scanCPU_3d__(X: np.ndarray) -> np.ndarray: + """Prefix Sum (scan) of a given dataset. + + Args: + X (np.ndarray): Dataset of images to apply sum. + + Returns: + np.ndarray: Scanned dataset of images. + """ + for x in range(X.shape[0]): + for y in range(X.shape[1]): + cum = 0 + for z in range(X.shape[2]): + cum += X[x, y, z] + X[x, y, z] = cum - X[x, y, z] + return X + +@cuda.jit('void(uint16, uint16, uint32[:, :, :], uint32[:, :, :])') +def __kernel_scan_3d__(n: int, j: int, d_inter: np.ndarray, d_a: np.ndarray) -> None: + """GPU kernel used to do a parallel prefix sum (scan). + + Args: + n (int): + j (int): [description] + d_inter (np.ndarray): [description] + d_a (np.ndarray): [description] + """ + x_coor, y_coor = cuda.grid(2) + + sA = cuda.shared.array(NB_THREADS_2D, uint32) + sA[cuda.threadIdx.x, cuda.threadIdx.y] = d_a[cuda.blockIdx.z, y_coor, x_coor] if x_coor < n and y_coor < j else 0 + cuda.syncthreads() + + k = cuda.threadIdx.x + for d in range(M): + k *= 2 + i1 = k + 2**d - 1 + i2 = k + 2**(d + 1) - 1 + if i2 >= cuda.blockDim.x: + break + sA[i2, cuda.threadIdx.y] += sA[i1, cuda.threadIdx.y] + cuda.syncthreads() + + if cuda.threadIdx.x == 0: + d_inter[cuda.blockIdx.z, y_coor, cuda.blockIdx.x] = sA[cuda.blockDim.x - 1, cuda.threadIdx.y] + sA[cuda.blockDim.x - 1, cuda.threadIdx.y] = 0 + cuda.syncthreads() + + k = 2**(M + 1) * cuda.threadIdx.x + for d in range(M - 1, -1, -1): + k //= 2 + i1 = k + 2**d - 1 + i2 = k + 2**(d + 1) - 1 + if i2 >= cuda.blockDim.x: + continue + t = sA[i1, cuda.threadIdx.y] + sA[i1, cuda.threadIdx.y] = sA[i2, cuda.threadIdx.y] + sA[i2, cuda.threadIdx.y] += t + cuda.syncthreads() + + if x_coor < n and y_coor < j: + d_a[cuda.blockIdx.z, y_coor, x_coor] = sA[cuda.threadIdx.x, cuda.threadIdx.y] + +@cuda.jit('void(uint32[:, :, :], uint32[:, :, :], uint16, uint16)') +def __add_3d__(d_X: np.ndarray, d_s: np.ndarray, n: int, m: int) -> None: + """GPU kernel for parallel sum. + + Args: + d_X (np.ndarray): Dataset of images. + d_s (np.ndarray): Temporary sums to add. + n (int): Number of width blocks. + m (int): Height of a block. + """ + x_coor, y_coor = cuda.grid(2) + if x_coor < n and y_coor < m: + d_X[cuda.blockIdx.z, y_coor, x_coor] += d_s[cuda.blockIdx.z, y_coor, cuda.blockIdx.x] + +def __scanGPU_3d__(X: np.ndarray) -> np.ndarray: + """Parallel Prefix Sum (scan) of a given dataset. + + Read more: https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda + + Args: + X (np.ndarray): Dataset of images. + + Returns: + np.ndarray: Scanned dataset of images. + """ + k, height, n = X.shape + n_block_x, n_block_y = np.ceil(np.divide(X.shape[1:], NB_THREADS_2D)).astype(np.uint64) + d_X = cuda.to_device(X) + + d_inter = cuda.to_device(np.empty((k, height, n_block_x), dtype = np.uint32)) + __kernel_scan_3d__[(n_block_x, n_block_y, k), NB_THREADS_2D](n, height, d_inter, d_X) + cuda.synchronize() + + inter = d_inter.copy_to_host() + if n_block_x >= NB_THREADS_2D[0]: + sums = __scanGPU_3d__(inter) + + d_s = cuda.to_device(sums) + __add_3d__[(n_block_x, n_block_y, k), NB_THREADS_2D](d_X, d_s, n, height) + cuda.synchronize() + X_scan = d_X.copy_to_host() + else: + sums = __scanCPU_3d__(inter) + X_scan = d_X.copy_to_host() + + for p in range(k): + for h in range(height): + for i in range(1, n_block_x): + for j in range(NB_THREADS_2D[1]): + idx = i * NB_THREADS_2D[1] + j + if idx < n: + X_scan[p, h, idx] += sums[p, h, i] + + return X_scan + +@cuda.jit('void(uint32[:, :, :], uint32[:, :, :])') +def __transpose_kernel__(d_X: np.ndarray, d_Xt: np.ndarray) -> None: + """GPU kernel of the function __transpose_3d__. + + Args: + d_X (np.ndarray): Dataset of images. + d_Xt(np.ndarray): Transposed dataset of images. + width (int): Width of each images in the dataset. + height (int): Height of each images in the dataset. + """ + temp = cuda.shared.array(NB_THREADS_2D, dtype = uint32) + + x, y = cuda.grid(2) + if x < d_X.shape[1] and y < d_X.shape[2]: + temp[cuda.threadIdx.y, cuda.threadIdx.x] = d_X[cuda.blockIdx.z, x, y] + cuda.syncthreads() + + x = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.x + y = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.y + if x < d_X.shape[2] and y < d_X.shape[1]: + d_Xt[cuda.blockIdx.z, x, y] = temp[cuda.threadIdx.x, cuda.threadIdx.y] + +def __transpose_3d__(X: np.ndarray) -> np.ndarray: + """Transpose every images in the given dataset. + + Args: + X (np.ndarray): Dataset of images. + + Returns: + np.ndarray: Transposed dataset of images. + """ + n_block_x, n_block_z = np.ceil(np.divide(X.shape[1:], NB_THREADS_2D)).astype(np.uint64) + d_X = cuda.to_device(X) + d_Xt = cuda.to_device(np.empty((X.shape[0], X.shape[2], X.shape[1]), dtype = X.dtype)) + __transpose_kernel__[(n_block_x, n_block_z, X.shape[0]), NB_THREADS_2D](d_X, d_Xt) + return d_Xt.copy_to_host() + +def set_integral_image(X: np.ndarray) -> np.ndarray: + """Transform the input images in integrated images (GPU version). + + Args: + X (np.ndarray): Dataset of images. + + Returns: + np.ndarray: Dataset of integrated images. + """ + X = X.astype(np.uint32) + X = __scanGPU_3d__(X) + X = __transpose_3d__(X) + X = __scanGPU_3d__(X) + return __transpose_3d__(X) + +@cuda.jit('void(int32[:, :], uint8[:], int32[:, :], uint16[:, :], float64[:], float64, float64)') +def __train_weak_clf_kernel__(d_classifiers: np.ndarray, d_y: np.ndarray, d_X_feat: np.ndarray, d_X_feat_argsort: np.ndarray, + d_weights: np.ndarray, total_pos: float, total_neg: float) -> None: + """GPU kernel of the function train_weak_clf. + + Args: + d_classifiers (np.ndarray): Weak classifiers to train. + d_y (np.ndarray): Labels of the features. + d_X_feat (np.ndarray): Feature images dataset. + d_X_feat_argsort (np.ndarray): Sorted indexes of the integrated features. + d_weights (np.ndarray): Weights of the features. + total_pos (float): Total of positive labels in the dataset. + total_neg (float): Total of negative labels in the dataset. + """ + i = cuda.blockIdx.x * cuda.blockDim.x * cuda.blockDim.y * cuda.blockDim.z + i += cuda.threadIdx.x * cuda.blockDim.y * cuda.blockDim.z + i += cuda.threadIdx.y * cuda.blockDim.z + i += cuda.threadIdx.z + if i >= d_classifiers.shape[0]: + return + + pos_seen, neg_seen = 0, 0 + pos_weights, neg_weights = 0.0, 0.0 + min_error, best_threshold, best_polarity = float64(np.inf), 0, 0 + + for j in d_X_feat_argsort[i]: + error = min(neg_weights + total_pos - pos_weights, pos_weights + total_neg - neg_weights) + if error < min_error: + min_error = error + best_threshold = d_X_feat[i, j] + best_polarity = 1 if pos_seen > neg_seen else -1 + + if d_y[j] == 1: + pos_seen += 1 + pos_weights += d_weights[j] + else: + neg_seen += 1 + neg_weights += d_weights[j] + + d_classifiers[i] = (best_threshold, best_polarity) + +#@njit('int32[:, :](int32[:, :], uint16[:, :], uint8[:], float64[:])') +def train_weak_clf(X_feat: np.ndarray, X_feat_argsort: np.ndarray, y: np.ndarray, weights: np.ndarray) -> np.ndarray: + """Train the weak classifiers on a given dataset (GPU version). + + Args: + X_feat (np.ndarray): Feature images dataset. + X_feat_argsort (np.ndarray): Sorted indexes of the integrated features. + y (np.ndarray): Labels of the features. + weights (np.ndarray): Weights of the features. + + Returns: + np.ndarray: Trained weak classifiers. + """ + total_pos, total_neg = weights[y == 1].sum(), weights[y == 0].sum() + d_classifiers = cuda.to_device(np.empty((X_feat.shape[0], 2), dtype = np.int32)) + d_X_feat = cuda.to_device(X_feat) + d_X_feat_argsort = cuda.to_device(X_feat_argsort) + d_weights = cuda.to_device(weights) + d_y = cuda.to_device(y) + n_blocks = np.ceil(X_feat.shape[0] / np.prod(NB_THREADS_3D)).astype(np.uint16) + __train_weak_clf_kernel__[n_blocks, NB_THREADS_3D](d_classifiers, d_y, d_X_feat, d_X_feat_argsort, d_weights, total_pos, total_neg) + return d_classifiers.copy_to_host() + +@cuda.jit('uint32(uint32[:, :], int16, int16, int16, int16)', device = True) +def __compute_feature__(ii: np.ndarray, x: int, y: int, w: int, h: int) -> int: + """Compute a feature on an integrated image at a specific coordinate (GPU version). + + Args: + ii (np.ndarray): Integrated image. + x (int): X coordinate. + y (int): Y coordinate. + w (int): width of the feature. + h (int): height of the feature. + + Returns: + int: Computed feature. + """ + return ii[y + h, x + w] + ii[y, x] - ii[y + h, x] - ii[y, x + w] + +@cuda.jit('void(int32[:, :], uint8[:, :, :, :], uint32[:, :, :])') +def __apply_feature_kernel__(X_feat: np.ndarray, feats: np.ndarray, X_ii: np.ndarray) -> None: + """GPU kernel of the function apply_features. + + Args: + X_feat (np.ndarray): Feature images dataset. + feats (np.ndarray): Features to apply. + X_ii (np.ndarray): Integrated image dataset. + n (int): Number of features. + m (int): Number of images of the dataset. + """ + x, y = cuda.grid(2) + if x >= feats.shape[0] or y >= X_ii.shape[0]: + return + + p_x, p_y, p_w, p_h = feats[x, 0, 0] + p1_x, p1_y, p1_w, p1_h = feats[x, 0, 1] + n_x, n_y, n_w, n_h = feats[x, 1, 0] + n1_x, n1_y, n1_w, n1_h = feats[x, 1, 1] + sP = __compute_feature__(X_ii[y], p_x, p_y, p_w, p_h) + \ + __compute_feature__(X_ii[y], p1_x, p1_y, p1_w, p1_h) + sN = __compute_feature__(X_ii[y], n_x, n_y, n_w, n_h) + \ + __compute_feature__(X_ii[y], n1_x, n1_y, n1_w, n1_h) + X_feat[x, y] = sP - sN + +#@njit('int32[:, :](uint8[:, :, :, :], uint32[:, :, :])') +def apply_features(feats: np.ndarray, X_ii: np.ndarray) -> np.ndarray: + """Apply the features on a integrated image dataset (GPU version). + + Args: + feats (np.ndarray): Features to apply. + X_ii (np.ndarray): Integrated image dataset. + + Returns: + np.ndarray: Applied features. + """ + d_X_feat = cuda.to_device(np.empty((feats.shape[0], X_ii.shape[0]), dtype = np.int32)) + d_feats = cuda.to_device(feats) + d_X_ii = cuda.to_device(X_ii) + n_x_blocks, n_y_blocks = np.ceil(np.divide(d_X_feat.shape, NB_THREADS_2D)).astype(np.uint16) + __apply_feature_kernel__[(n_x_blocks, n_y_blocks), NB_THREADS_2D](d_X_feat, d_feats, d_X_ii) + cuda.synchronize() + return d_X_feat.copy_to_host() + +@cuda.jit('int32(int32[:], uint16[:], int32, int32)', device = True) +def as_partition(a: np.ndarray, indices: np.ndarray, l: int, h: int) -> int: + i = l - 1 + j = l + for j in range(l, h + 1): + if a[indices[j]] < a[indices[h]]: + i += 1 + indices[i], indices[j] = indices[j], indices[i] + + i += 1 + indices[i], indices[j] = indices[j], indices[i] + return i + +@cuda.jit('void(int32[:], uint16[:], int32, int32)', device = True) +def argsort_bounded(a: np.ndarray, indices: np.ndarray, l: int, h: int) -> None: + #total = h - l + 1; + stack = cuda.local.array(6977, int32) + stack[0] = l + stack[1] = h + top = 1; + + low = l + high = h + + while top >= 0: + high = stack[top] + top -= 1 + low = stack[top] + top -= 1 + + if low >= high: + break; + + p = as_partition(a, indices, low, high); + + if p - 1 > low: + top += 1 + stack[top] = low; + top += 1 + stack[top] = p - 1; + + if p + 1 < high: + top += 1 + stack[top] = p + 1; + top += 1 + stack[top] = high; + +@cuda.jit('void(int32[:, :], uint16[:, :])') +def argsort_flatter(X_feat: np.ndarray, indices: np.ndarray) -> None: + i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x + if i < X_feat.shape[0]: + for j in range(indices.shape[1]): + indices[i, j] = j + argsort_bounded(X_feat[i], indices[i], 0, X_feat.shape[1] - 1) + +def argsort(X_feat: np.ndarray) -> np.ndarray: + indices = np.empty_like(X_feat, dtype = np.uint16) + n_blocks = int(np.ceil(np.divide(X_feat.shape[0], NB_THREADS))) + d_X_feat = cuda.to_device(X_feat) + d_indices = cuda.to_device(indices) + argsort_flatter[n_blocks, NB_THREADS](d_X_feat, d_indices) + cuda.synchronize() + return d_indices.copy_to_host() diff --git a/python/activate.sh b/python/activate.sh new file mode 100755 index 0000000..965bb8e --- /dev/null +++ b/python/activate.sh @@ -0,0 +1,28 @@ +#!/bin/sh + +# Exit if any of the command doesn't exit with code 0 +set -e + +EXEC_DIR=$1 +test -z "$EXEC_DIR" && EXEC_DIR=.. +VENV_PATH=$EXEC_DIR/python/venv + +activate(){ + if [ ! -d "$VENV_PATH" ]; then + echo 'Creating python virtual environnement' + python -m venv "$VENV_PATH" + echo 'Activating virtual environnement' + activate + echo 'Updating base pip packages' + python -m pip install -U setuptools pip + echo 'Installing requirements' + pip install -r "$EXEC_DIR"/python/requirements.txt + elif [ -f "$VENV_PATH"/Scripts/activate ]; then source "$VENV_PATH"/Scripts/activate + elif [ -f "$VENV_PATH"/bin/activate ]; then source "$VENV_PATH"/bin/activate + else + echo 'Python virtual environnement not detected' + exit 1 + fi +} + +activate diff --git a/python/common.py b/python/common.py new file mode 100644 index 0000000..6c9fc91 --- /dev/null +++ b/python/common.py @@ -0,0 +1,136 @@ +from toolbox import picke_multi_loader, format_time_ns, unit_test_argsort_2d +from typing import List, Tuple +from time import perf_counter_ns +import numpy as np + +def unit_test(TS: List[int], labels: List[str] = ["CPU", "GPU"], tol: float = 1e-8) -> None: + """Test if the each result is equals to other devices. + + Given ViolaJones is a deterministic algorithm, the results no matter the device should be the same + (given the floating point fluctuations), this function check this assertion. + + Args: + TS (List[int]): Number of trained weak classifiers. + labels (List[str], optional): List of the trained device names. Defaults to ["CPU", "GPU"]. + tol (float, optional): Float difference tolerance. Defaults to 1e-8. + """ + if len(labels) < 2: + return print("Not enough devices to test") + + fnc_s = perf_counter_ns() + n_total= 0 + n_success = 0 + print(f"\n| {'Unit testing':<37} | {'Test state':<10} | {'Time spent (ns)':<17} | {'Formatted time spent':<29} |") + print(f"|{'-'*39}|{'-'*12}|{'-'*19}|{'-'*31}|") + + for filename in ["X_train_feat", "X_test_feat", "X_train_ii", "X_test_ii"]: + print(f"{filename}...", end = "\r") + bs = picke_multi_loader([f"{filename}_{label}" for label in labels], "./out") + + for i, (b1, l1) in enumerate(zip(bs, labels)): + if b1 is None: + #print(f"| {filename:<22} - {l1:<4} vs {l2:<4} | {'Skipped':>10} | {'None':>17} | {'None':<29} |") + continue + for j, (b2, l2) in enumerate(zip(bs, labels)): + if i >= j: + continue + if b2 is None: + #print(f"| {filename:<22} - {l1:<4} vs {l2:<4} | {'Skipped':>10} | {'None':>17} | {'None':<29} |") + continue + n_total += 1 + s = perf_counter_ns() + state = np.abs(b1 - b2).mean() < tol + e = perf_counter_ns() - s + if state: + print(f"| {filename:<22} - {l1:<4} vs {l2:<4} | {'Passed':>10} | {e:>17,} | {format_time_ns(e):<29} |") + n_success += 1 + else: + print(f"| {filename:<22} - {l1:<4} vs {l2:<4} | {'Failed':>10} | {e:>17,} | {format_time_ns(e):<29} |") + + for filename, featname in zip(["X_train_feat_argsort", "X_test_feat_argsort"], ["X_train_feat", "X_test_feat"]): + print(f"Loading {filename}...", end = "\r") + feat = None + bs = [] + for label in labels: + if feat is None: + feat_tmp = picke_multi_loader([f"{featname}_{label}"], "./out")[0] + if feat_tmp is not None: + feat = feat_tmp + bs.append(picke_multi_loader([f"{filename}_{label}"], "./out")[0]) + + for i, (b1, l1) in enumerate(zip(bs, labels)): + if b1 is None: + #print(f"| {filename:<22} - {l1:<4} vs {l2:<4} | {'Skipped':>10} | {'None':>17} | {'None':<29} |") + continue + if feat is not None: + n_total += 1 + s = perf_counter_ns() + state = unit_test_argsort_2d(feat, b1) + e = perf_counter_ns() - s + if state: + print(f"| {filename:<22} - {l1:<4} argsort | {'Passed':>10} | {e:>17,} | {format_time_ns(e):<29} |") + n_success += 1 + else: + print(f"| {filename:<22} - {l1:<4} argsort | {'Failed':>10} | {e:>17,} | {format_time_ns(e):<29} |") + + for j, (b2, l2) in enumerate(zip(bs, labels)): + if i >= j: + continue + if b2 is None: + #print(f"| {filename:<22} - {l1:<4} vs {l2:<4} | {'Skipped':>10} | {'None':>17} | {'None':<29} |") + continue + n_total += 1 + s = perf_counter_ns() + state = np.abs(b1 - b2).mean() < tol + e = perf_counter_ns() - s + if state: + print(f"| {filename:<22} - {l1:<4} vs {l2:<4} | {'Passed':>10} | {e:>17,} | {format_time_ns(e):<29} |") + n_success += 1 + else: + print(f"| {filename:<22} - {l1:<4} vs {l2:<4} | {'Failed':>10} | {e:>17,} | {format_time_ns(e):<29} |") + + for T in TS: + for filename in ["alphas", "final_classifiers"]: + print(f"{filename}_{T}...", end = "\r") + bs = picke_multi_loader([f"{filename}_{T}_{label}" for label in labels]) + + for i, (b1, l1) in enumerate(zip(bs, labels)): + if b1 is None: + #print(f"| {filename + '_' + str(T):<22} - {l1:<4} vs {l2:<4} | {'Skipped':>10} | {'None':>17} | {'None':<29} |") + continue + for j, (b2, l2) in enumerate(zip(bs, labels)): + if i >= j: + continue + if b2 is None: + #print(f"| {filename + '_' + str(T):<22} - {l1:<4} vs {l2:<4} | {'Skipped':>10} | {'None':>17} | {'None':<29} |") + continue + n_total += 1 + s = perf_counter_ns() + state = np.abs(b1 - b2).mean() < tol + e = perf_counter_ns() - s + if state: + print(f"| {filename + '_' + str(T):<22} - {l1:<4} vs {l2:<4} | {'Passed':>10} | {e:>17,} | {format_time_ns(e):<29} |") + n_success += 1 + else: + print(f"| {filename + '_' + str(T):<22} - {l1:<4} vs {l2:<4} | {'Failed':>10} | {e:>17,} | {format_time_ns(e):<29} |") + print(f"|{'-'*39}|{'-'*12}|{'-'*19}|{'-'*31}|") + e = perf_counter_ns() - fnc_s + print(f"| {'Unit testing summary':<37} | {str(n_success) + '/' + str(n_total):>10} | {e:>17,} | {format_time_ns(e):<29} |") + +def load_datasets(data_dir: str = "../data") -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """Load the datasets. + + Args: + data_dir (str, optional): [description]. Defaults to "../data". + + Returns: + Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: [description] + """ + bytes_to_int_list = lambda b: list(map(int, b.rstrip().split(" "))) + + def load(set_name: str) -> np.ndarray: + with open(f"{data_dir}/{set_name}.bin", "r") as f: + shape = bytes_to_int_list(f.readline()) + return np.asarray(bytes_to_int_list(f.readline()), dtype = np.uint8).reshape(shape) + + return load("X_train"), load("y_train"), load("X_test"), load("y_test") diff --git a/python/config.py b/python/config.py new file mode 100644 index 0000000..95656ab --- /dev/null +++ b/python/config.py @@ -0,0 +1,15 @@ +# Save state to avoid recalulation on restart +SAVE_STATE = True +# Redo the state even if it's already saved +FORCE_REDO = False +# Use NJIT to greatly accelerate runtime +COMPILE_WITH_C = False +# Use GPU to greatly accelerate runtime (as priority over NJIT) +GPU_BOOSTED = False +# Number of weak classifiers +# TS = [1] +# TS = [1, 5, 10] +# TS = [1, 5, 10, 25, 50] +# TS = [1, 5, 10, 25, 50, 100, 200] +# TS = [1, 5, 10, 25, 50, 100, 200, 300] +TS = [1, 5, 10, 25, 50, 100, 200, 300, 400, 500, 1000] diff --git a/python/convert_dataset.py b/python/convert_dataset.py new file mode 100644 index 0000000..244d16f --- /dev/null +++ b/python/convert_dataset.py @@ -0,0 +1,57 @@ +from io import BufferedReader +from tqdm import tqdm +from functools import partial +from sys import argv +import numpy as np +from os import path, listdir + +# Makes the "leave" argument default to False +tqdm = partial(tqdm, leave = False) + +def read_pgm(pgm_file: BufferedReader) -> np.ndarray: + """Read the data of a PGM file + + Args: + pgm_file (BufferedReader): PGM File + + Returns: + np.ndarray: PGM data + """ + assert (f := pgm_file.readline()) == b'P5\n', f"Incorrect file format: {f}" + (width, height) = [int(i) for i in pgm_file.readline().split()] + assert width > 0 and height > 0, f"Incorrect dimensions: {width}x{height}" + assert (depth := int(pgm_file.readline())) < 256, f"Incorrect depth: {depth}" + + buff = np.empty(height * width, dtype = np.uint8) + for i in range(buff.shape[0]): + buff[i] = ord(pgm_file.read(1)) + return buff.reshape((height, width)) + +def __main__(data_path: str) -> None: + """Read the data of every PGM file and output it in data files + + Args: + data_path (str): Path of the PGM files + """ + for set_name in tqdm(["train", "test"], desc = "set name"): + X, y = [], [] + for y_i, label in enumerate(tqdm(["non-face", "face"], desc = "label")): + for filename in tqdm(listdir(f"{data_path}/{set_name}/{label}"), desc = "Reading pgm file"): + with open(f"{data_path}/{set_name}/{label}/{filename}", "rb") as face: + X.append(read_pgm(face)) + y.append(y_i) + + X, y = np.asarray(X), np.asarray(y) + # idx = np.random.permutation(y.shape[0]) + # X, y = X[idx], y[idx] + + for org, s in tqdm(zip("Xy", [X, y]), desc = f"Writing {set_name}"): + with open(f"{data_path}/{org}_{set_name}.bin", "w") as out: + out.write(f'{str(s.shape)[1:-1].replace(",", "")}\n') + raw = s.ravel() + for s_i in tqdm(raw[:-1], desc = f"Writing {org}"): + out.write(f"{s_i} ") + out.write(str(raw[-1])) + +if __name__ == "__main__": + __main__(argv[1]) if len(argv) == 2 else print(f"Usage: python {__file__[__file__.rfind(path.sep) + 1:]} ./data_location") diff --git a/python/decorators.py b/python/decorators.py new file mode 100644 index 0000000..76dfcd1 --- /dev/null +++ b/python/decorators.py @@ -0,0 +1,13 @@ +from typing import Callable, Iterable, Union, Any +from tqdm import tqdm + +def njit(f: Union[Callable, str] = None, *args, **kwargs) -> Callable: + def decorator(func: Callable) -> Any: + return func + + if callable(f): + return f + return decorator + +def tqdm_iter(iter: Iterable, desc: str): + return tqdm(iter, leave = False, desc = desc) diff --git a/python/projet.py b/python/projet.py new file mode 100644 index 0000000..2653ca3 --- /dev/null +++ b/python/projet.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python +# Author: @saundersp + +from ViolaJones import train_viola_jones, classify_viola_jones +from toolbox import state_saver, picke_multi_loader, format_time_ns, benchmark_function, toolbox_unit_test, unit_test_argsort_2d +from sklearn.metrics import accuracy_score, f1_score, confusion_matrix +from sklearn.feature_selection import SelectPercentile, f_classif +from common import load_datasets, unit_test +from ViolaJones import build_features, get_best_anova_features +from typing import Tuple +from time import perf_counter_ns +from os import makedirs +import numpy as np + +#np.seterr(all = 'raise') + +from config import FORCE_REDO, COMPILE_WITH_C, GPU_BOOSTED, TS, SAVE_STATE + +if GPU_BOOSTED: + from ViolaJonesGPU import apply_features, set_integral_image, argsort + label = 'GPU' if COMPILE_WITH_C else 'PGPU' + # The parallel prefix sum doesn't use the whole GPU so numba output some annoying warnings, this disables it + from numba import config + config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 +else: + from ViolaJonesCPU import apply_features, set_integral_image, argsort + label = 'CPU' if COMPILE_WITH_C else 'PY' + +# FIXME Debug code +# IDX_INSPECT = 0 +# IDX_INSPECT = 2 +IDX_INSPECT = 4548 +IDX_INSPECT_OFFSET = 100 + +def bench_train(X_train: np.ndarray, X_test: np.ndarray, y_train: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + """Train the weak classifiers. + + Args: + X_train (np.ndarray): Training images. + X_test (np.ndarray): Testing Images. + y_train (np.ndarray): Training labels. + + Returns: + Tuple[np.ndarray, np.ndarray]: Training and testing features. + """ + feats = state_saver("Building features", "feats", lambda: build_features(X_train.shape[1], X_train.shape[2]), FORCE_REDO, SAVE_STATE) + + # FIXME Debug code + # print("feats") + # print(feats.shape) + # print(feats[IDX_INSPECT].ravel()) + # return 0, 0 + + X_train_ii = state_saver(f"Converting training set to integral images ({label})", f"X_train_ii_{label}", + lambda: set_integral_image(X_train), FORCE_REDO, SAVE_STATE) + X_test_ii = state_saver(f"Converting testing set to integral images ({label})", f"X_test_ii_{label}", + lambda: set_integral_image(X_test), FORCE_REDO, SAVE_STATE) + + # FIXME Debug code + # print("X_train_ii") + # print(X_train_ii.shape) + # print(X_train_ii[IDX_INSPECT]) + # print("X_test_ii") + # print(X_test_ii.shape) + # print(X_test_ii[IDX_INSPECT]) + # return 0, 0 + + X_train_feat = state_saver(f"Applying features to training set ({label})", f"X_train_feat_{label}", + lambda: apply_features(feats, X_train_ii), FORCE_REDO, SAVE_STATE) + X_test_feat = state_saver(f"Applying features to testing set ({label})", f"X_test_feat_{label}", + lambda: apply_features(feats, X_test_ii), FORCE_REDO, SAVE_STATE) + del X_train_ii, X_test_ii, feats + + # FIXME Debug code + # print("X_train_feat") + # print(X_train_feat.shape) + # print(X_train_feat[IDX_INSPECT, : IDX_INSPECT_OFFSET]) + # print("X_test_feat") + # print(X_test_feat.shape) + # print(X_test_feat[IDX_INSPECT, : IDX_INSPECT_OFFSET]) + # return 0, 0 + + #indices = state_saver("Selecting best features training set", "indices", force_redo = True, save_state = SAVE_STATE, + # fnc = lambda: SelectPercentile(f_classif, percentile = 10).fit(X_train_feat.T, y_train).get_support(indices = True)) + #indices = state_saver("Selecting best features training set", "indices", force_redo = FORCE_REDO, save_state = SAVE_STATE, + # fnc = lambda: get_best_anova_features(X_train_feat, y_train)) + #indices = benchmark_function("Selecting best features (manual)", lambda: get_best_anova_features(X_train_feat, y_train)) + + # FIXME Debug code + # print("indices") + # print(indices.shape) + # print(indices[IDX_INSPECT: IDX_INSPECT + IDX_INSPECT_OFFSET]) + # assert indices.shape[0] == indices_new.shape[0], f"Indices length not equal : {indices.shape} != {indices_new.shape}" + # assert (eq := indices == indices_new).all(), f"Indices not equal : {eq.sum() / indices.shape[0]}" + # return 0, 0 + + # X_train_feat, X_test_feat = X_train_feat[indices], X_test_feat[indices] + + #return 0, 0 + + X_train_feat_argsort = state_saver(f"Precalculating training set argsort ({label})", f"X_train_feat_argsort_{label}", + lambda: argsort(X_train_feat), FORCE_REDO, SAVE_STATE) + + # FIXME Debug code + # print("X_train_feat_argsort") + # print(X_train_feat_argsort.shape) + # print(X_train_feat_argsort[IDX_INSPECT, : IDX_INSPECT_OFFSET]) + # benchmark_function("Arg unit test", lambda: unit_test_argsort_2d(X_train_feat, X_train_feat_argsort)) + # return 0, 0 + + # X_test_feat_argsort = state_saver(f"Precalculating testing set argsort ({label})", f"X_test_feat_argsort_{label}", + # lambda: argsort(X_test_feat), True, False) + + # FIXME Debug code + # print("X_test_feat_argsort") + # print(X_test_feat_argsort.shape) + # print(X_test_feat_argsort[IDX_INSPECT, : IDX_INSPECT_OFFSET]) + # benchmark_function("Arg unit test", lambda: unit_test_argsort_2d(X_test_feat, X_test_feat_argsort)) + # return 0, 0 + # del X_test_feat_argsort + + print(f"\n| {'Training':<49} | {'Time spent (ns)':<17} | {'Formatted time spent':<29} |\n|{'-'*51}|{'-'*19}|{'-'*31}|") + + for T in TS: + # alphas, final_classifiers = state_saver(f"ViolaJones T = {T:<3} ({label})", [f"alphas_{T}_{label}", f"final_classifiers_{T}_{label}"], + state_saver(f"ViolaJones T = {T:<4} ({label})", [f"alphas_{T}_{label}", f"final_classifiers_{T}_{label}"], + lambda: train_viola_jones(T, X_train_feat, X_train_feat_argsort, y_train), FORCE_REDO, SAVE_STATE, "./models") + # FIXME Debug code + # print("alphas") + # print(alphas) + # print("final_classifiers") + # print(final_classifiers) + + return X_train_feat, X_test_feat + +def bench_accuracy(label, X_train_feat: np.ndarray, X_test_feat: np.ndarray, y_train: np.ndarray, y_test: np.ndarray) -> None: + """Benchmark the trained classifiers on the training and testing sets. + + Args: + X_train_feat (np.ndarray): Training features. + X_test_feat (np.ndarray): Testing features. + y_train (np.ndarray): Training labels. + y_test (np.ndarray): Testing labels. + """ + print(f"\n| {'Testing':<26} | Time spent (ns) (E) | {'Formatted time spent (E)':<29}", end = " | ") + print(f"Time spent (ns) (T) | {'Formatted time spent (T)':<29} |") + print(f"|{'-'*28}|{'-'*21}|{'-'*31}|{'-'*21}|{'-'*31}|") + + perfs = [] + for T in TS: + (alphas, final_classifiers) = picke_multi_loader([f"alphas_{T}_{label}", f"final_classifiers_{T}_{label}"]) + + s = perf_counter_ns() + y_pred_train = classify_viola_jones(alphas, final_classifiers, X_train_feat) + t_pred_train = perf_counter_ns() - s + e_acc = accuracy_score(y_train, y_pred_train) + e_f1 = f1_score(y_train, y_pred_train) + (_, e_FP), (e_FN, _) = confusion_matrix(y_train, y_pred_train) + + s = perf_counter_ns() + y_pred_test = classify_viola_jones(alphas, final_classifiers, X_test_feat) + t_pred_test = perf_counter_ns() - s + t_acc = accuracy_score(y_test, y_pred_test) + t_f1 = f1_score(y_test, y_pred_test) + (_, t_FP), (t_FN, _) = confusion_matrix(y_test, y_pred_test) + perfs.append((e_acc, e_f1, e_FN, e_FP, t_acc, t_f1, t_FN, t_FP)) + + print(f"| {'ViolaJones T = ' + str(T):<19} {'(' + label + ')':<6}", end = " | ") + print(f"{t_pred_train:>19,} | {format_time_ns(t_pred_train):<29}", end = " | ") + print(f"{t_pred_test:>19,} | {format_time_ns(t_pred_test):<29} |") + + print(f"\n| {'Evaluating':<19} | ACC (E) | F1 (E) | FN (E) | FP (E) | ACC (T) | F1 (T) | FN (T) | FP (T) | ") + print(f"|{'-'*21}|{'-'*9}|{'-'*8}|{'-'*8}|{'-'*8}|{'-'*9}|{'-'*8}|{'-'*8}|{'-'*8}|") + + for T, (e_acc, e_f1, e_FN, e_FP, t_acc, t_f1, t_FN, t_FP) in zip(TS, perfs): + print(f"| {'ViolaJones T = ' + str(T):<19} | {e_acc:>7.2%} | {e_f1:>6.2f} | {e_FN:>6,} | {e_FP:>6,}", end = " | ") + print(f"{t_acc:>7.2%} | {t_f1:>6.2f} | {t_FN:>6,} | {t_FP:>6,} |") + +def _main_() -> None: + + # Creating state saver folders if they don't exist already + if SAVE_STATE: + for folder_name in ["models", "out"]: + makedirs(folder_name, exist_ok = True) + + print(f"| {'Preprocessing':<49} | {'Time spent (ns)':<17} | {'Formatted time spent':<29} |\n|{'-'*51}|{'-'*19}|{'-'*31}|") + + X_train, y_train, X_test, y_test = state_saver("Loading sets", ["X_train", "y_train", "X_test", "y_test"], + load_datasets, FORCE_REDO, SAVE_STATE) + + # FIXME Debug option (image width * log_10(length) + extra characters) + # np.set_printoptions(linewidth = 19 * 6 + 3) + + # FIXME Debug code + # print("X_train") + # print(X_train.shape) + # print(X_train[IDX_INSPECT]) + # print("X_test") + # print(X_test.shape) + # print(X_test[IDX_INSPECT]) + # print("y_train") + # print(y_train.shape) + # print(y_train[IDX_INSPECT: IDX_INSPECT + IDX_INSPECT_OFFSET]) + # print("y_test") + # print(y_test.shape) + # print(y_test[IDX_INSPECT: IDX_INSPECT + IDX_INSPECT_OFFSET]) + # return + + X_train_feat, X_test_feat = bench_train(X_train, X_test, y_train) + + # FIXME Debug code + # return + + # X_train_feat, X_test_feat = picke_multi_loader([f"X_train_feat_{label}", f"X_test_feat_{label}"], "./out") + # indices = picke_multi_loader(["indices"], "./out")[0] + # X_train_feat, X_test_feat = X_train_feat[indices], X_test_feat[indices] + + bench_accuracy(label, X_train_feat, X_test_feat, y_train, y_test) + +if __name__ == "__main__": + #toolbox_unit_test() + _main_() + + # Only execute unit test after having trained the specified labels + unit_test(TS, ["GPU", "CPU", "PY", "PGPU"]) + pass diff --git a/python/requirements.txt b/python/requirements.txt new file mode 100644 index 0000000..1e57b84 --- /dev/null +++ b/python/requirements.txt @@ -0,0 +1,5 @@ +numba +scikit-learn +tqdm +pudb +nvitop diff --git a/python/test.py b/python/test.py new file mode 100644 index 0000000..a187c4f --- /dev/null +++ b/python/test.py @@ -0,0 +1,189 @@ +import numpy as np +from numba import cuda, config, njit +config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 +#import matplotlib.pyplot as plt +from tqdm import tqdm +from time import perf_counter_ns +from toolbox import format_time_ns +from pickle import load, dump +from sys import argv + +def get(a): + with open(f"{a}.pkl", 'rb') as f: + return load(f) + +def save(a, name) -> None: + with open(name, 'wb') as f: + dump(a, f) + +def diff(folder, a, label1, label2): + af, bf = get(f"{folder}/{a}_{label1}"), get(f"{folder}/{a}_{label2}") + #print(af) + #print(bf) + print((af - bf).mean()) + +if __name__ == "__main__": + if len(argv) == 5: + diff(argv[1], argv[4], argv[2], argv[3]) + +def py_mean(a, b): + s = 0.0 + for a_i, b_i in zip(a, b): + s += a_i * b_i + return s / a.shape[0] + +def np_mean(a, b): + return np.mean(a * b) + +@njit('float64(float64[:], float64[:])', fastmath = True, nogil = True) +def nb_mean(a, b): + return np.mean(a * b) + +@njit('float64(float64[:], float64[:])', fastmath = True, nogil = True) +def nb_mean_loop(a, b): + s = 0.0 + for a_i, b_i in zip(a, b): + s += a_i * b_i + return s / a.shape[0] + +@cuda.jit('void(float64[:], float64[:], float64[:])', fastmath = True) +def cuda_mean_kernel(r, a, b): + s = 0.0 + for a_i, b_i in zip(a, b): + s += a_i * b_i + r[0] = s / a.shape[0] + +def cuda_mean(a, b): + r = cuda.to_device(np.empty(1, dtype = np.float64)) + d_a = cuda.to_device(a) + d_b = cuda.to_device(b) + cuda_mean_kernel[1, 1](r, d_a, d_b) + return r.copy_to_host()[0] + +def test_and_compare(labels, fncs, a, b): + m = [] + for fnc in tqdm(fncs, leave = False, desc = "Calculating..."): + s = perf_counter_ns() + m.append([fnc(a, b), perf_counter_ns() - s]) + print("Results:") + [print(f"\t{label:<10} {m_i:<20} {format_time_ns(time_i)}") for ((m_i, time_i), label) in zip(m, labels)] + print("Comparaison:") + for i, (m_i, label_i) in enumerate(zip(m, labels)): + for j, (m_j, label_j) in enumerate(zip(m, labels)): + if i >= j: + continue + print(f"\t{label_i:<10} vs {label_j:<10} - {abs(m_i[0] - m_j[0])}") + +if __name__ == "__main__": + np.set_printoptions(linewidth = 10000, threshold = 1000) + + N = int(2**20) + labels = ["Python", "Numpy", "Numba", "Numba loop", "CUDA"] + fncs = [py_mean, np_mean, nb_mean, nb_mean_loop, cuda_mean] + + print(f"RANDOM for N={N}") + total_size = (2 * 8 * N) + print(f"Size = {total_size} B") + print(f"Size = {total_size // 1024} kB") + print(f"Size = {total_size // 1024 // 1024} MB") + print(f"Size = {total_size // 1024 // 1024 // 1024} GB") + a, b = np.random.rand(N).astype(np.float64), np.random.rand(N).astype(np.float64) + test_and_compare(labels, fncs, a, b) + del a, b + + print(f"\nDETERMINSTIC for N={N}") + total_size = (2 * 8 * N) + (8 * N) + print(f"Size = {total_size} B") + print(f"Size = {total_size // 1024} kB") + print(f"Size = {total_size // 1024 // 1024} MB") + print(f"Size = {total_size // 1024 // 1024 // 1024} GB") + mask = np.arange(N, dtype = np.uint64) + a = np.ones(N, dtype = np.float64) + a[mask < N//2] = 0.1 + del mask + b = np.ones(N, dtype = np.float64) + test_and_compare(labels, fncs, a, b) + del a, b + + #from ViolaJonesGPU import argsort as argsort_GPU + #from ViolaJonesCPU import argsort as argsort_CPU + #from toolbox import unit_test_argsort_2d, benchmark_function + + #labels = ["Numpy", "Numba", "CUDA"] + #a = np.random.randint(2**12, size = (2**20, 2**8), dtype = np.int32) + #m = [benchmark_function(f"Argsort {label}", lambda: f(np.copy(a))) for (label, f) in zip(labels, [ + # lambda a: np.argsort(a).astype(np.uint16), argsort_CPU, argsort_GPU + #])] + #for i, (m_i, label_i) in enumerate(zip(m, labels)): + # #for j, (m_j, label_j) in enumerate(zip(m, labels)): + # # if i >= j: + # # continue + # # print(f"\t{label_i:<10} vs {label_j:<10} - {(m_i == m_j).mean()}") + # benchmark_function(f"Unit test {label_i}", lambda: unit_test_argsort_2d(a, m_i)) + + #for i in tqdm(range(X.shape[0]), leave = False, desc = "Extract image"): + # x = X[i] + # y = Y[i] + # fig = plt.figure() + # plt.imshow(x, cmap = 'gray') + # plt.savefig(f"imgs/{y}/{i}.png") + # plt.close(fig) + + #def extract_FD(Xy): + # X_c, Y_c = [], [] + # for x,y in Xy: + # X_c.append(x) + # Y_c.append(y) + # X_c = np.asarray(X_c) + # Y_c = np.asarray(Y_c) + # return X_c, Y_c + + #X_train, y_train = get('out/X_train'), get('out/y_train') + #X_test, y_test = get('out/X_test'), get('out/y_test') + + #X_train, y_train = extract_FD(get('/home/_aspil0w/git/FaceDetection/training')) + #X_test, y_test = extract_FD(get('/home/_aspil0w/git/FaceDetection/test')) + #save(X_train, 'out/X_train'), save(y_train, 'out/y_train') + #save(X_test, 'out/X_test'), save(y_test, 'out/y_test') + + #print(X_train.shape, X_train_org.shape, X_train.shape == X_train_org.shape) + #print((X_train == X_train_org).mean()) + #print(y_train.shape, y_train_org.shape, y_train.shape == y_train_org.shape) + #print((y_train == y_train_org).mean()) + + #print(X_test.shape, X_test_org.shape, X_test.shape == X_test_org.shape) + #print((X_test == X_test_org).mean()) + #print(y_test.shape, y_test_org.shape, y_test.shape == y_test_org.shape) + #print((y_test == y_test_org).mean()) + + #@njit('uint16[:](uint8[:, :, :], uint8[:, :, :])') + #def arg_find(X, X_org): + # arg = np.empty(X.shape[0], dtype = np.uint16) + # for i, x in enumerate(X_org): + # found = False + # for j, x_org in enumerate(X): + # if np.all(x == x_org): + # arg[i] = j + # found = True + # break + # assert found, "Image not found" + # return arg + + #print("Arg find results train") + #arg_train = arg_find(X_train, X_train_org) + #print((X_train[arg_train] == X_train_org).mean()) + #print((y_train[arg_train] == y_train_org).mean()) + + #print("Arg find results test") + #arg_test = arg_find(X_test, X_test_org) + #print((X_test[arg_test] == X_test_org).mean()) + #print((y_test[arg_test] == y_test_org).mean()) + + #for i in tqdm(range(X_c.shape[0]), leave = False, desc = "Extract image"): + # x = X_c[i] + # y = Y_c[i] + # fig = plt.figure() + # plt.imshow(x, cmap = 'gray') + # plt.savefig(f"imgs2/{y}/{i}.png") + # plt.close(fig) + diff --git a/python/toolbox.py b/python/toolbox.py new file mode 100644 index 0000000..43c2eb2 --- /dev/null +++ b/python/toolbox.py @@ -0,0 +1,153 @@ +from typing import Any, Callable, List, Union +from time import perf_counter_ns +from numba import njit +import numpy as np +import pickle +import os + +formats = ["ns", "µs", "ms", "s", "m", "h", "j", "w", "M", "y"] +nb = np.array([1, 1000, 1000, 1000, 60, 60, 24, 7, 4, 12], dtype = np.uint16) +def format_time_ns(time: int) -> str: + """Format the time in nanoseconds in human readable format. + + Args: + time (int): Time in nanoseconds. + + Returns: + str: The formatted human readable string. + """ + assert time >= 0, "Incorrect time stamp" + if time == 0: + return "0ns" + prod = nb.prod(dtype = np.uint64) + + s = "" + for i in range(nb.shape[0])[::-1]: + if time >= prod: + res = int(time // prod) + time = time % prod + s += f"{res}{formats[i]} " + prod = prod // nb[i] + + assert time == 0, "Leftover in formatting time !" + return s.rstrip() + +def toolbox_unit_test() -> None: + # FIXME Move unit test to different file + assert "0ns" == format_time_ns(0) + assert "1ns" == format_time_ns(1) + assert "1µs" == format_time_ns(int(1e3)) + assert "1ms" == format_time_ns(int(1e6)) + assert "1s" == format_time_ns(int(1e9)) + assert "1m" == format_time_ns(int(6e10)) + assert "1h" == format_time_ns(int(36e11)) + assert "1j" == format_time_ns(int(864e11)) + assert "1w" == format_time_ns(int(6048e11)) + assert "1M" == format_time_ns(int(24192e11)) + assert "1y" == format_time_ns(int(290304e11)) + # UINT64_MAX == 2^64 = 18446744073709551615 == -1 + assert "635y 5M 3j 23h 34m 33s 709ms 551µs 616ns" == format_time_ns(2**64) + +def picke_multi_loader(filenames: List[str], save_dir: str = "./models") -> List[Any]: + """Load multiple pickle data files. + + Args: + filenames (List[str]): List of all the filename to load. + save_dir (str, optional): Path of the files to load. Defaults to "./models". + + Returns: + List[Any]. List of loaded pickle data files. + """ + b = [] + for f in filenames: + filepath = f"{save_dir}/{f}.pkl" + if os.path.exists(filepath): + with open(filepath, "rb") as filebyte: + b.append(pickle.load(filebyte)) + else: + b.append(None) + return b + +def benchmark_function(step_name: str, fnc: Callable) -> Any: + """Benchmark a function and display the result of stdout. + + Args: + step_name (str): Name of the function to call. + fnc (Callable): Function to call. + + Returns: + Any: Result of the function. + """ + print(f"{step_name}...", end = "\r") + s = perf_counter_ns() + b = fnc() + e = perf_counter_ns() - s + print(f"| {step_name:<49} | {e:>17,} | {format_time_ns(e):<29} |") + return b + +def state_saver(step_name: str, filename: Union[str, List[str]], fnc, force_redo: bool = False, save_state: bool = True, save_dir: str = "./out") -> Any: + """Either execute a function then saves the result or load the already existing result. + + Args: + step_name (str): Name of the function to call. + filename (Union[str, List[str]]): Name or list of names of the filenames where the result(s) are saved. + fnc ([type]): Function to call. + force_redo (bool, optional): Recall the function even if the result(s) is already saved. Defaults to False. + save_dir (str, optional): Path of the directory to save the result(s). Defaults to "./out". + + Returns: + Any: The result(s) of the called function + """ + if isinstance(filename, str): + if not os.path.exists(f"{save_dir}/{filename}.pkl") or force_redo: + b = benchmark_function(step_name, fnc) + if save_state: + print(f"Saving results of {step_name}", end = '\r') + with open(f"{save_dir}/{filename}.pkl", 'wb') as f: + pickle.dump(b, f) + print(' ' * 100, end = '\r') + return b + else: + print(f"Loading results of {step_name}", end = '\r') + with open(f"{save_dir}/{filename}.pkl", "rb") as f: + res = pickle.load(f) + print(f"| {step_name:<49} | {'None':>17} | {'loaded saved state':<29} |") + return res + elif isinstance(filename, list): + abs = False + for fn in filename: + if not os.path.exists(f"{save_dir}/{fn}.pkl"): + abs = True + break + if abs or force_redo: + b = benchmark_function(step_name, fnc) + if save_state: + print(f"Saving results of {step_name}", end = '\r') + for bi, fnI in zip(b, filename): + with open(f"{save_dir}/{fnI}.pkl", 'wb') as f: + pickle.dump(bi, f) + print(' ' * 100, end = '\r') + return b + + print(f"| {step_name:<49} | {'None':>17} | {'loaded saved state':<29} |") + b = [] + print(f"Loading results of {step_name}", end = '\r') + for fn in filename: + with open(f"{save_dir}/{fn}.pkl", "rb") as f: + b.append(pickle.load(f)) + print(' ' * 100, end = '\r') + return b + else: + assert False, f"Incompatible filename type = {type(filename)}" + +@njit('boolean(int32[:, :], uint16[:, :])') +def unit_test_argsort_2d(arr: np.ndarray, indices: np.ndarray) -> bool: + n = indices.shape[0] + total = indices.shape[0] * indices.shape[1] + for i, sub_indices in enumerate(indices): + for j in range(sub_indices.shape[0] - 1): + if arr[i, sub_indices[j]] <= arr[i, sub_indices[j + 1]]: + n += 1 + if n != total: + print(n, total, n / (total)) + return n == total