Added files

saundersp
2023-05-07 20:15:55 +02:00
parent c8e56c1277
commit e6194ac485
30 changed files with 4682 additions and 0 deletions

cpp/Makefile Normal file

@@ -0,0 +1,80 @@
CC := nvcc -m64 -std=c++17 -ccbin g++-12 -Xcompiler -m64,-std=c++17
OBJ_DIR := bin
$(shell mkdir -p $(OBJ_DIR))
MODELS_DIR := models
OUT_DIR := out
SRC_DIR := .
#CFLAGS := -O0 -Werror=all-warnings -g -G
#CFLAGS := $(CFLAGS) -D__DEBUG
#CFLAGS := $(CFLAGS) -pg
#CFLAGS := $(CFLAGS) -Xptxas=-w
#CFLAGS := $(CFLAGS) -Xcompiler -Wall,-O0,-g,-Werror,-Werror=implicit-fallthrough=0,-Wextra,-rdynamic
CFLAGS := -O4 -Xcompiler -O4
EXEC := $(OBJ_DIR)/ViolaJones
DATA := ../data/X_train.bin ../data/X_test.bin ../data/y_train.bin ../data/y_test.bin
SRC := $(shell find $(SRC_DIR) -name "*.cpp" -o -name "*.cu" )
OBJ_EXT := o
ifeq ($(OS), Windows_NT)
EXEC:=$(EXEC).exe
OBJ_EXT:=obj
endif
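# Map every .cpp/.cu source to an object file in $(OBJ_DIR) (one substitution per extension)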
OBJ := $(SRC:$(SRC_DIR)/%.cpp=$(OBJ_DIR)/%.$(OBJ_EXT))
OBJ := $(OBJ:$(SRC_DIR)/%.cu=$(OBJ_DIR)/%.$(OBJ_EXT))
.PHONY: all start profile debug check cudacheck r2 reset clean mrproper
all: $(EXEC) $(DATA)
# Compiling host code
$(OBJ_DIR)/%.$(OBJ_EXT): $(SRC_DIR)/%.cpp
@echo Compiling $<
@$(CC) $(CFLAGS) -c $< -o $@
# Compiling GPU code
$(OBJ_DIR)/%.$(OBJ_EXT): $(SRC_DIR)/%.cu
@echo Compiling $<
@$(CC) $(CFLAGS) -c $< -o $@
$(EXEC): $(OBJ)
@echo Linking object files to $@
@$(CC) $(CFLAGS) $^ -o $@
$(DATA):
@bash ../download_data.sh ..
start: $(EXEC) $(DATA)
@./$(EXEC)
profile: start
@gprof $(EXEC) gmon.out | gprof2dot | dot -Tpng -o output.png
#@gprof $(EXEC) gmon.out > analysis.txt
debug: $(EXEC) $(DATA)
#@cuda-gdb -q $(EXEC)
@gdb -q --tui $(EXEC)
check: $(EXEC) $(DATA)
@valgrind -q -s --leak-check=full --show-leak-kinds=all $(EXEC)
cudacheck: $(EXEC) $(DATA)
@cuda-memcheck --destroy-on-device-error kernel --tool memcheck --leak-check full --report-api-errors all $(EXEC)
#@cuda-memcheck --destroy-on-device-error kernel --tool racecheck --racecheck-report all $(EXEC)
#@cuda-memcheck --destroy-on-device-error kernel --tool initcheck --track-unused-memory yes $(EXEC)
#@cuda-memcheck --destroy-on-device-error kernel --tool synccheck $(EXEC)
#@compute-sanitizer --destroy-on-device-error kernel --tool memcheck --leak-check full --report-api-errors all --track-stream-ordered-races all $(EXEC)
#@compute-sanitizer --destroy-on-device-error kernel --tool racecheck --racecheck-detect-level info --racecheck-report all $(EXEC)
#@compute-sanitizer --destroy-on-device-error kernel --tool initcheck --track-unused-memory yes $(EXEC)
#@compute-sanitizer --destroy-on-device-error kernel --tool synccheck $(EXEC)
r2: $(EXEC) $(DATA)
@r2 $(EXEC)
reset:
@echo Deleting generated states and models
@rm -rf $(OUT_DIR)/* $(MODELS_DIR)/* || true
clean:
@rm -f $(EXEC)
mrproper:
@rm -rf $(OBJ_DIR)

cpp/ViolaJones.cpp Normal file

@@ -0,0 +1,296 @@
#include <cmath>
#include "data.hpp"
#include "config.hpp"
#include "ViolaJonesGPU.hpp"
#include "ViolaJonesCPU.hpp"
static inline void add_empty_feature(const np::Array<uint8_t>& feats, size_t& n) noexcept {
memset(&feats[n], 0, 4 * sizeof(uint8_t));
n += 4;
}
static inline void add_right_feature(const np::Array<uint8_t>& feats, size_t& n, const uint16_t& i, const uint16_t& j, const uint16_t& w, const uint16_t& h) noexcept {
feats[n++] = i + w;
feats[n++] = j;
feats[n++] = w;
feats[n++] = h;
}
static inline void add_immediate_feature(const np::Array<uint8_t>& feats, size_t& n, const uint16_t& i, const uint16_t& j, const uint16_t& w, const uint16_t& h) noexcept {
feats[n++] = i;
feats[n++] = j;
feats[n++] = w;
feats[n++] = h;
}
static inline void add_bottom_feature(const np::Array<uint8_t>& feats, size_t& n, const uint16_t& i, const uint16_t& j, const uint16_t& w, const uint16_t& h) noexcept {
feats[n++] = i;
feats[n++] = j + h;
feats[n++] = w;
feats[n++] = h;
}
static inline void add_right2_feature(const np::Array<uint8_t>& feats, size_t& n, const uint16_t& i, const uint16_t& j, const uint16_t& w, const uint16_t& h) noexcept {
feats[n++] = i + 2 * w;
feats[n++] = j;
feats[n++] = w;
feats[n++] = h;
}
static inline void add_bottom2_feature(const np::Array<uint8_t>& feats, size_t& n, const uint16_t& i, const uint16_t& j, const uint16_t& w, const uint16_t& h) noexcept {
feats[n++] = i;
feats[n++] = j + 2 * h;
feats[n++] = w;
feats[n++] = h;
}
static inline void add_bottom_right_feature(const np::Array<uint8_t>& feats, size_t& n, const uint16_t& i, const uint16_t& j, const uint16_t& w, const uint16_t& h) noexcept {
feats[n++] = i + w;
feats[n++] = j + h;
feats[n++] = w;
feats[n++] = h;
}
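// Enumerate every Haar-like feature fitting in a width x height window: two-rectangle
// (horizontal and vertical), three-rectangle (horizontal and vertical) and
// four-rectangle features. A first pass counts them, a second pass fills the array.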
np::Array<uint8_t> build_features(const uint16_t& width, const uint16_t& height) noexcept {
size_t n = 0;
uint16_t w, h, i, j;
for (w = 1; w < width; ++w)
for (h = 1; h < height; ++h)
for (i = 0; i < width - w; ++i)
for (j = 0; j < height - h; ++j) {
if (i + 2 * w < width) ++n;
if (j + 2 * h < height) ++n;
if (i + 3 * w < width) ++n;
if (j + 3 * h < height) ++n;
if (i + 2 * w < width && j + 2 * h < height) ++n;
}
np::Array<uint8_t> feats = np::empty<uint8_t>({ n, 2, 2, 4 });
n = 0;
for (w = 1; w < width; ++w)
for (h = 1; h < height; ++h)
for (i = 0; i < width - w; ++i)
for (j = 0; j < height - h; ++j) {
if (i + 2 * w < width) {
add_right_feature(feats, n, i, j, w, h);
add_empty_feature(feats, n);
add_immediate_feature(feats, n, i, j, w, h);
add_empty_feature(feats, n);
}
if (j + 2 * h < height) {
add_immediate_feature(feats, n, i, j, w, h);
add_empty_feature(feats, n);
add_bottom_feature(feats, n, i, j, w, h);
add_empty_feature(feats, n);
}
if (i + 3 * w < width) {
add_right_feature(feats, n, i, j, w, h);
add_empty_feature(feats, n);
add_right2_feature(feats, n, i, j, w, h);
add_immediate_feature(feats, n, i, j, w, h);
}
if (j + 3 * h < height) {
add_bottom_feature(feats, n, i, j, w, h);
add_empty_feature(feats, n);
add_bottom2_feature(feats, n, i, j, w, h);
add_immediate_feature(feats, n, i, j, w, h);
}
if (i + 2 * w < width && j + 2 * h < height) {
add_right_feature(feats, n, i, j, w, h);
add_bottom_feature(feats, n, i, j, w, h);
add_immediate_feature(feats, n, i, j, w, h);
add_bottom_right_feature(feats, n, i, j, w, h);
}
}
return feats;
}
//np::Array<int> select_percentile(const np::Array<uint8_t> X_feat, const np::Array<uint8_t> y) noexcept {
// std::vector<float64_t> class_0, class_1;
//
// const int im_size = X_feat.shape[0] / y.shape[0];
// int idy = 0, n_samples_per_class_0 = 0, n_samples_per_class_1 = 0;
// for (size_t i = 0; i < X_feat.shape[0]; i += im_size) {
// if (y[idy] == 0) {
// ++n_samples_per_class_0;
// class_0.push_back(static_cast<float64_t>(X_feat[i]));
// }
// else {
// ++n_samples_per_class_1;
// class_1.push_back(static_cast<float64_t>(X_feat[i]));
// }
// ++idy;
// }
// const int n_samples = n_samples_per_class_0 + n_samples_per_class_1;
//
// float64_t ss_alldata_0 = 0;
// for (int i = 0;i < n_samples_per_class_0;++i)
// ss_alldata_0 += (class_0[i] * class_0[i]);
//
// float64_t ss_alldata_1 = 0;
// for (int i = 0;i < n_samples_per_class_1;++i)
// ss_alldata_1 += (class_1[i] * class_1[i]);
//
// const float64_t ss_alldata = ss_alldata_0 + ss_alldata_1;
//
// float64_t sums_classes_0 = 0;
// for (int i = 0;i < n_samples_per_class_0;++i)
// sums_classes_0 += class_0[i];
//
// float64_t sums_classes_1 = 0;
// for (int i = 0;i < n_samples_per_class_1;++i)
// sums_classes_1 += class_1[i];
//
// float64_t sq_of_sums_alldata = sums_classes_0 + sums_classes_1;
// sq_of_sums_alldata *= sq_of_sums_alldata;
//
// const float64_t sq_of_sums_args_0 = sums_classes_0 * sums_classes_0;
// const float64_t sq_of_sums_args_1 = sums_classes_1 * sums_classes_1;
// const float64_t ss_tot = ss_alldata - sq_of_sums_alldata / n_samples;
// const float64_t sqd_sum_bw_n = sq_of_sums_args_0 / n_samples_per_class_0 + sq_of_sums_args_1 / n_samples_per_class_1 - sq_of_sums_alldata / n_samples;
// const float64_t ss_wn = ss_tot - sqd_sum_bw_n;
// const int df_wn = n_samples - 2;
// const float64_t msw = ss_wn / df_wn;
// const float64_t f_values = sqd_sum_bw_n / msw;
//
// const np::Array<int> res = np::empty<int>({ static_cast<size_t>(std::ceil(static_cast<float64_t>(im_size) / 10.0)) });
// // TODO Complete code
// return res;
//}
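// AdaBoost initial weights: t counts the positive (y == 1) samples and every sample
// gets weight 1 / (2 * count), negatives counted against t and positives against
// shape[0] - t.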
np::Array<float64_t> init_weights(const np::Array<uint8_t>& y_train) noexcept {
np::Array<float64_t> weights = np::empty<float64_t>(y_train.shape);
const uint16_t t = np::sum(np::astype<uint16_t>(y_train));
return map(weights, static_cast<std::function<float64_t(const size_t&, const float64_t&)>>(
[&t, &y_train](const size_t& i, const float64_t&) -> float64_t {
return 1.0 / (2 * (y_train[i] == 0 ? t : y_train.shape[0] - t));
}));
}
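// Weak classifier: h(x) = 1 iff polarity * feature(x) < polarity * threshold,
// evaluated for feature j over every sample column of X_feat_i.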
np::Array<uint8_t> classify_weak_clf(const np::Array<int32_t>& X_feat_i, const size_t& j, const float64_t& threshold, const float64_t& polarity) noexcept {
np::Array<uint8_t> res = np::empty<uint8_t>({ X_feat_i.shape[1] });
for(size_t i = 0; i < res.shape[0]; ++i)
res[i] = polarity * X_feat_i[j * X_feat_i.shape[1] + i] < polarity * threshold ? 1 : 0;
return res;
}
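// Strong classifier: predict 1 when the alpha-weighted vote of the selected weak
// classifiers reaches half of the total alpha mass.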
np::Array<uint8_t> classify_viola_jones(const np::Array<float64_t>& alphas, const np::Array<float64_t>& classifiers, const np::Array<int32_t>& X_feat) noexcept {
np::Array<float64_t> total = np::zeros<float64_t>({ X_feat.shape[1] });
float64_t clf = 0.0, threshold = 0.0, polarity = 0.0;
for(size_t i = 0; i < alphas.shape[0]; ++i){
clf = classifiers[i * 3]; threshold = classifiers[i * 3 + 1]; polarity = classifiers[i * 3 + 2];
//total += alphas * np::astype<float64_t>(classify_weak_clf(X_feat, clf, threshold, polarity));
const np::Array<uint8_t> res = classify_weak_clf(X_feat, clf, threshold, polarity);
for(size_t j = 0; j < res.shape[0]; ++j)
total[j] += alphas[i] * res[j];
}
np::Array<uint8_t> y_pred = np::empty<uint8_t>({ X_feat.shape[1] });
const float64_t alphas_sum = np::sum(alphas);
for(size_t i = 0; i < X_feat.shape[1]; ++i)
y_pred[i] = total[i] >= 0.5 * alphas_sum ? 1 : 0;
return y_pred;
}
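// Keep the weak classifier with the lowest weighted error; `accuracy` holds
// |h(x) - y| per sample (0 = correct, 1 = wrong).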
std::tuple<int32_t, float64_t, np::Array<float64_t>> select_best(const np::Array<float64_t>& classifiers, const np::Array<float64_t>& weights, const np::Array<int32_t>& X_feat, const np::Array<uint8_t>& y) noexcept {
std::tuple<int32_t, float64_t, np::Array<float64_t>> res = { -1, np::inf, np::empty<float64_t>({ X_feat.shape[0] }) };
for(size_t j = 0; j < classifiers.shape[0]; ++j){
const np::Array<float64_t> accuracy = np::abs(np::astype<float64_t>(classify_weak_clf(X_feat, j, classifiers[j * 2], classifiers[j * 2 + 1])) - y);
// const float64_t error = np::mean(weights * accuracy);
float64_t error = 0.0;
for(size_t i = 0; i < weights.shape[0]; ++i)
error += weights[i] * accuracy[i];
error /= weights.shape[0];
if (error < std::get<1>(res))
res = { j, error, accuracy };
}
return res;
}
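// AdaBoost training loop: normalise the weights, fit all weak classifiers (CPU or
// GPU), keep the best one, then re-weight with beta = error / (1 - error) so that
// correctly classified samples lose weight. Each round stores alpha = log(1 / beta)
// and the (feature index, threshold, polarity) triple.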
std::array<np::Array<float64_t>, 2> train_viola_jones(const size_t& T, const np::Array<int32_t>& X_feat, const np::Array<uint16_t>& X_feat_argsort, const np::Array<uint8_t>& y) noexcept {
np::Array<float64_t> weights = init_weights(y);
np::Array<float64_t> alphas = np::empty<float64_t>({ T });
np::Array<float64_t> final_classifier = np::empty<float64_t>({ T, 3 });
for(size_t t = 0; t < T; ++t ){
weights /= np::sum(weights);
#if GPU_BOOSTED
const np::Array<float64_t> classifiers = train_weak_clf_gpu(X_feat, X_feat_argsort, y, weights);
#else
const np::Array<float64_t> classifiers = train_weak_clf_cpu(X_feat, X_feat_argsort, y, weights);
#endif
const auto [ clf, error, accuracy ] = select_best(classifiers, weights, X_feat, y);
float64_t beta = error / (1.0 - error);
weights *= np::pow(beta, (1.0 - accuracy));
alphas[t] = std::log(1.0 / beta);
final_classifier[t * 3] = clf;
final_classifier[t * 3 + 1] = classifiers[clf * 2];
final_classifier[t * 3 + 2] = classifiers[clf * 2 + 1];
}
return { alphas, final_classifier };
}
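// The binary classification metrics below treat class 1 as the positive class.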
float64_t accuracy_score(const np::Array<uint8_t>& y, const np::Array<uint8_t>& y_pred) noexcept {
float64_t res = 0.0;
for(size_t i = 0; i < y.shape[0]; ++i)
if(y[i] == y_pred[i])
++res;
return res / y.shape[0];
}
float64_t precision_score(const np::Array<uint8_t>& y, const np::Array<uint8_t>& y_pred) noexcept {
uint16_t true_positive = 0, false_positive = 0;
for(size_t i = 0; i < y.shape[0]; ++i)
if(y[i] == 1){
if(y[i] == y_pred[i])
++true_positive;
else
++false_positive;
}
return static_cast<float64_t>(true_positive) / (true_positive + false_positive);
}
float64_t recall_score(const np::Array<uint8_t>& y, const np::Array<uint8_t>& y_pred) noexcept {
uint16_t true_positive = 0, false_negative = 0;
for(size_t i = 0; i < y.shape[0]; ++i)
if(y[i] == 0) {
if(y[i] != y_pred[i])
++false_negative;
} else {
if(y[i] == y_pred[i])
++true_positive;
}
return static_cast<float64_t>(true_positive) / (true_positive + false_negative);
}
float64_t f1_score(const np::Array<uint8_t>& y, const np::Array<uint8_t>& y_pred) noexcept {
const float64_t precision = precision_score(y, y_pred);
const float64_t recall = recall_score(y, y_pred);
return 2 * (precision * recall) / (precision + recall);
}
std::tuple<uint16_t, uint16_t, uint16_t, uint16_t> confusion_matrix(const np::Array<uint8_t>& y, const np::Array<uint8_t>& y_pred) noexcept {
uint16_t true_positive = 0, false_positive = 0, true_negative = 0, false_negative = 0;
for(size_t i = 0; i < y.shape[0]; ++i)
if(y[i] == 0)
if(y[i] == y_pred[i])
++true_negative;
else
++false_negative;
else
if(y[i] == y_pred[i])
++true_positive;
else
++false_positive;
return std::make_tuple(true_negative, false_positive, false_negative, true_positive);
}

cpp/ViolaJones.hpp Normal file

@@ -0,0 +1,157 @@
#pragma once
#include <filesystem>
namespace fs = std::filesystem;
#include "data.hpp"
#include "toolbox.hpp"
//#include "config.hpp"
template <typename T>
void unit_test_cpu_vs_gpu(const np::Array<T>& cpu, const np::Array<T>& gpu) noexcept {
if (cpu.shape != gpu.shape) {
fprintf(stderr, "Inequal shape !\n");
return;
}
size_t eq = 0;
const size_t length = np::prod(cpu.shape);
for (size_t i = 0; i < length; ++i)
if (cpu[i] == gpu[i])
++eq;
//else
// std::cout << i << ": " << cpu[i] << " != " << gpu[i] << std::endl;
if (eq != length)
printf("Incorrect results, Number of equalities : %s/%s <=> %.2f%% !\n", thousand_sep(eq).c_str(), thousand_sep(length).c_str(),
static_cast<float64_t>(eq) / static_cast<float64_t>(length) * 100.0);
}
template <typename T>
void unit_test_argsort_2d(const np::Array<T>& a, const np::Array<uint16_t>& indices) noexcept {
if (a.shape != indices.shape) {
fprintf(stderr, "Inequal shape !\n");
return;
}
size_t correct = a.shape[0]; // Each row yields shape[1] - 1 comparisons, so credit one element per row up front
const size_t total = np::prod(a.shape);
for(size_t i = 0; i < total; i += a.shape[1])
for(size_t j = 0; j < a.shape[1] - 1; ++j){
const size_t k = i + j;
if(a[i + indices[k]] <= a[i + indices[k + 1]])
++correct;
}
if (correct != total)
printf("Incorrect results, Number of equalities : %s/%s <=> %.2f%% !\n", thousand_sep(correct).c_str(), thousand_sep(total).c_str(),
static_cast<float64_t>(correct) / static_cast<float64_t>(total) * 100.0);
}
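// Run fnc(args...) once, time it and print one table row: step name, elapsed
// nanoseconds and a human-readable duration.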
template <typename T, typename F, typename... Args>
T benchmark_function(const char* step_name, const F& fnc, Args &&...args) noexcept {
#ifndef __DEBUG
printf("%s...\r", step_name);
fflush(stdout); // manual flush is mandatory: stdout is buffered, so the message would not appear immediately otherwise
#endif
const auto start = time();
const T res = fnc(std::forward<Args>(args)...);
const long long timespent = duration_ns(time() - start);
printf("| %-49s | %17s | %-29s |\n", step_name, thousand_sep(timespent).c_str(), format_time_ns(timespent).c_str());
return res;
}
template <typename F, typename... Args>
void benchmark_function_void(const char* step_name, const F& fnc, Args &&...args) noexcept {
#ifndef __DEBUG
printf("%s...\r", step_name);
fflush(stdout); // manual flush is mandatory: stdout is buffered, so the message would not appear immediately otherwise
#endif
const auto start = time();
fnc(std::forward<Args>(args)...);
const long long timespent = duration_ns(time() - start);
printf("| %-49s | %17s | %-29s |\n", step_name, thousand_sep(timespent).c_str(), format_time_ns(timespent).c_str());
}
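// Disk-memoised step: load <out_dir>/<filename>.bin when it exists (unless
// force_redo is set), otherwise benchmark fnc and optionally save its result.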
template <typename T, typename F, typename... Args>
np::Array<T> state_saver(const char* step_name, const char* filename, const bool& force_redo, const bool& save_state, const char* out_dir, const F& fnc, Args &&...args) noexcept {
char filepath[BUFFER_SIZE] = { 0 };
sprintf(filepath, "%s/%s.bin", out_dir, filename);
np::Array<T> bin;
if (!fs::exists(filepath) || force_redo) {
bin = std::move(benchmark_function<np::Array<T>>(step_name, fnc, std::forward<Args>(args)...));
if(save_state){
#ifndef __DEBUG
printf("Saving results of %s\r", step_name);
fflush(stdout);
#endif
save<T>(bin, filepath);
#ifndef __DEBUG
printf("%*c\r", 100, ' ');
fflush(stdout);
#endif
}
} else {
#ifndef __DEBUG
printf("Loading results of %s\r", step_name);
fflush(stdout);
#endif
bin = std::move(load<T>(filepath));
printf("| %-49s | %17s | %-29s |\n", step_name, "None", "loaded saved state");
}
return bin;
}
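// Same memoisation for a step producing N arrays stored under N filenames;
// recomputes when any of the files is missing.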
template <typename T, size_t N, typename F, typename... Args>
std::array<np::Array<T>, N> state_saver(const char* step_name, const std::vector<const char*>& filenames, const bool& force_redo, const bool& save_state, const char* out_dir, const F& fnc, Args &&...args) noexcept {
char filepath[BUFFER_SIZE] = { 0 };
bool abs = false;
for (const char* filename : filenames){
sprintf(filepath, "%s/%s.bin", out_dir, filename);
if (!fs::exists(filepath)) {
abs = true;
break;
}
}
std::array<np::Array<T>, N> bin;
if (abs || force_redo) {
bin = std::move(benchmark_function<std::array<np::Array<T>, N>>(step_name, fnc, std::forward<Args>(args)...));
if (save_state){
#ifndef __DEBUG
printf("Saving results of %s\r", step_name);
fflush(stdout);
#endif
size_t i = 0;
for (const char* filename : filenames){
sprintf(filepath, "%s/%s.bin", out_dir, filename);
save<T>(bin[i++], filepath);
}
#ifndef __DEBUG
printf("%*c\r", 100, ' ');
fflush(stdout);
#endif
}
} else {
#ifndef __DEBUG
printf("Loading results of %s\r", step_name);
fflush(stdout);
#endif
size_t i = 0;
for (const char* filename : filenames){
sprintf(filepath, "%s/%s.bin", out_dir, filename);
bin[i++] = std::move(load<T>(filepath));
}
printf("| %-49s | %17s | %-29s |\n", step_name, "None", "loaded saved state");
}
return bin;
}
np::Array<uint16_t> argsort_2d_cpu(const np::Array<int32_t>&) noexcept;
np::Array<uint8_t> build_features(const uint16_t&, const uint16_t&) noexcept;
np::Array<int> select_percentile(const np::Array<uint8_t>&, const np::Array<uint8_t>&) noexcept;
np::Array<uint8_t> classify_viola_jones(const np::Array<float64_t>&, const np::Array<float64_t>&, const np::Array<int32_t>&) noexcept;
np::Array<float64_t> init_weights(const np::Array<uint8_t>&) noexcept;
std::tuple<int32_t, float64_t, np::Array<float64_t>> select_best(const np::Array<float64_t>&, const np::Array<float64_t>&, const np::Array<int32_t>&,
const np::Array<uint8_t>&) noexcept;
std::array<np::Array<float64_t>, 2> train_viola_jones(const size_t&, const np::Array<int32_t>&, const np::Array<uint16_t>&, const np::Array<uint8_t>&) noexcept;
float64_t accuracy_score(const np::Array<uint8_t>&, const np::Array<uint8_t>&) noexcept;
float64_t precision_score(const np::Array<uint8_t>&, const np::Array<uint8_t>&) noexcept;
float64_t recall_score(const np::Array<uint8_t>&, const np::Array<uint8_t>&) noexcept;
float64_t f1_score(const np::Array<uint8_t>&, const np::Array<uint8_t>&) noexcept;
std::tuple<uint16_t, uint16_t, uint16_t, uint16_t> confusion_matrix(const np::Array<uint8_t>&, const np::Array<uint8_t>&) noexcept;

cpp/ViolaJonesCPU.cpp Normal file

@@ -0,0 +1,93 @@
#include "data.hpp"
#include "toolbox.hpp"
np::Array<uint32_t> set_integral_image_cpu(const np::Array<uint8_t>& set) noexcept {
np::Array<uint32_t> X_ii = np::empty<uint32_t>(set.shape);
size_t i, y, x, s;
uint32_t ii[set.shape[1] * set.shape[2]];
const size_t length = np::prod(set.shape);
for (size_t offset = 0; offset < length; offset += set.shape[1] * set.shape[2]) {
for (i = 0; i < set.shape[1] * set.shape[2]; ++i)
ii[i] = 0;
for (y = 1; y < set.shape[1]; ++y) {
s = 0;
for (x = 0; x < set.shape[2] - 1; ++x) {
s += set[offset + (y - 1) * set.shape[2] + x];
ii[y * set.shape[2] + x + 1] = s + ii[(y - 1) * set.shape[2] + x + 1];
}
}
for (y = 0; y < set.shape[1]; ++y)
for (x = 0; x < set.shape[2]; ++x)
X_ii[offset + y * set.shape[2] + x] = ii[y * set.shape[2] + x];
}
return X_ii;
}
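// Sum of the w x h rectangle at (x, y) in the integral image of sample j:
// ii(y+h, x+w) + ii(y, x) - ii(y+h, x) - ii(y, x+w). Note the result is truncated
// to int16_t, which assumes the rectangle sums stay small.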
constexpr static inline int16_t __compute_feature__(const np::Array<uint32_t>& X_ii, const size_t& j, const int16_t& x, const int16_t& y, const int16_t& w, const int16_t& h) noexcept {
const size_t _y = y * X_ii.shape[2] + x;
const size_t _yh = _y + h * X_ii.shape[2];
return X_ii[j + _yh + w] + X_ii[j + _y] - X_ii[j + _yh] - X_ii[j + _y + w];
}
np::Array<int32_t> apply_features_cpu(const np::Array<uint8_t>& feats, const np::Array<uint32_t>& X_ii) noexcept {
np::Array<int32_t> X_feat = np::empty<int32_t>({ feats.shape[0], X_ii.shape[0] });
size_t j, feat_idx = 0;
int16_t p1, p2, n1, n2;
const size_t feats_length = np::prod(feats.shape), X_ii_length = np::prod(X_ii.shape);
const size_t feats_step = np::prod(feats.shape, 1), X_ii_step = np::prod(X_ii.shape, 1);
for (size_t i = 0; i < feats_length; i += feats_step){
for (j = 0; j < X_ii_length; j += X_ii_step) {
p1 = __compute_feature__(X_ii, j, feats[i + 0], feats[i + 1], feats[i + 2], feats[i + 3]);
p2 = __compute_feature__(X_ii, j, feats[i + 4], feats[i + 5], feats[i + 6], feats[i + 7]);
n1 = __compute_feature__(X_ii, j, feats[i + 8], feats[i + 9], feats[i + 10], feats[i + 11]);
n2 = __compute_feature__(X_ii, j, feats[i + 12], feats[i + 13], feats[i + 14], feats[i + 15]);
X_feat[feat_idx++] = static_cast<int32_t>(p1 + p2) - static_cast<int32_t>(n1 + n2);
}
}
return X_feat;
}
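// Decision-stump training: walk each feature's samples in ascending feature order
// and keep the threshold minimising
// min(neg_weights + total_pos - pos_weights, pos_weights + total_neg - neg_weights),
// i.e. the cheaper of calling everything below the threshold positive or negative.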
np::Array<float64_t> train_weak_clf_cpu(const np::Array<int32_t>& X_feat, const np::Array<uint16_t>& X_feat_argsort, const np::Array<uint8_t>& y, const np::Array<float64_t>& weights) noexcept {
float64_t total_pos = 0.0, total_neg = 0.0;
for(size_t i = 0; i < y.shape[0]; ++i)
(y[i] == static_cast<uint8_t>(1) ? total_pos : total_neg) += weights[i];
np::Array<float64_t> classifiers = np::empty<float64_t>({ X_feat.shape[0], 2});
for(size_t i = 0; i < X_feat.shape[0]; ++i){
size_t pos_seen = 0, neg_seen = 0;
float64_t pos_weights = 0.0, neg_weights = 0.0;
float64_t min_error = np::inf, best_threshold = 0.0, best_polarity = 0.0;
for(size_t j = 0; j < X_feat_argsort.shape[1]; ++j) {
const float64_t error = std::min(neg_weights + total_pos - pos_weights, pos_weights + total_neg - neg_weights);
if (error < min_error){
min_error = error;
best_threshold = X_feat[i * X_feat.shape[1] + X_feat_argsort[i * X_feat.shape[1] + j]];
best_polarity = pos_seen > neg_seen ? 1.0 : -1.0;
}
if(y[X_feat_argsort[i * X_feat.shape[1] + j]] == static_cast<uint8_t>(1)){
++pos_seen;
pos_weights += weights[X_feat_argsort[i * X_feat.shape[1] + j]];
} else {
++neg_seen;
neg_weights += weights[X_feat_argsort[i * X_feat.shape[1] + j]];
}
}
classifiers[i * 2] = best_threshold; classifiers[i * 2 + 1] = best_polarity;
}
return classifiers;
}
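// Row-wise argsort of a 2D array; delegates the per-row index sort to the shared
// argsort() helper.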
np::Array<uint16_t> argsort_2d_cpu(const np::Array<int32_t>& X_feat) noexcept {
const np::Array<uint16_t> indices = np::empty<uint16_t>(X_feat.shape);
const size_t length = np::prod(X_feat.shape);
for (size_t i = 0; i < length; i += X_feat.shape[1]) {
for(size_t j = 0; j < X_feat.shape[1]; ++j) indices[i + j] = j;
argsort(&X_feat[i], &indices[i], 0, X_feat.shape[1] - 1);
}
return indices;
}

cpp/ViolaJonesCPU.hpp Normal file

@@ -0,0 +1,8 @@
#pragma once
#include "data.hpp"
np::Array<uint32_t> set_integral_image_cpu(const np::Array<uint8_t>&) noexcept;
np::Array<int32_t> apply_features_cpu(const np::Array<uint8_t>&, const np::Array<uint32_t>&) noexcept;
np::Array<float64_t> train_weak_clf_cpu(const np::Array<int32_t>&, const np::Array<uint16_t>&, const np::Array<uint8_t>&,
const np::Array<float64_t>&) noexcept;
np::Array<uint16_t> argsort_2d_cpu(const np::Array<int32_t>&) noexcept;

cpp/ViolaJonesGPU.cu Normal file

@@ -0,0 +1,491 @@
#include <iostream>
#include "data.hpp"
#include "toolbox.hpp"
#include "ViolaJones.hpp"
#define NB_THREADS 1024
#define NB_THREADS_2D_X 32
#define NB_THREADS_2D_Y 32
__device__ constexpr const size_t M = 5; // log2(NB_THREADS_2D_X)
#define NB_THREADS_3D_X 16
#define NB_THREADS_3D_Y 16
#define NB_THREADS_3D_Z 4
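// Sanity-check kernels: fill y[i] = x[i] * i with 1D/2D/3D launch configurations
// and verify the result on the host.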
static __global__ void __test_working_kernel__(const np::Array<size_t> d_x, np::Array<size_t> d_y, const size_t length) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < length)
d_y[i] = d_x[i] * i;
}
void test_working(const size_t& length) noexcept {
const size_t size = length * sizeof(size_t);
#ifdef __DEBUG
print("Estimating memory footprint at : " + format_byte_size(2 * size));
#endif
np::Array<size_t> x = np::empty<size_t>({ length }), y = np::empty<size_t>({ length });
size_t i;
for (i = 0; i < length; ++i)
x[i] = i;
np::Array<size_t> d_x = copyToDevice<size_t>("x", x), d_y = copyToDevice<size_t>("y", y);
const size_t dimX = static_cast<size_t>(std::ceil(static_cast<float64_t>(length) / static_cast<float64_t>(NB_THREADS)));
const dim3 dimGrid(dimX);
constexpr const dim3 dimBlock(NB_THREADS);
__test_working_kernel__<<<dimGrid, dimBlock>>>(d_x, d_y, length);
_print_cuda_error_("synchronize", cudaDeviceSynchronize());
_print_cuda_error_("memcpy d_y", cudaMemcpy(y.data, d_y.data, size, cudaMemcpyDeviceToHost));
size_t ne = 0;
for (i = 0; i < length; ++i)
if (y[i] != x[i] * i)
++ne;
if (ne != 0)
fprintf(stderr, "Invalid result : %lu/%lu <=> %f%%\n", ne, length, static_cast<float64_t>(ne) / static_cast<float64_t>(length));
cudaFree("d_x", d_x);
cudaFree("d_y", d_y);
}
static __global__ void __test_working_kernel_2d__(const np::Array<size_t> d_x, np::Array<size_t> d_y, const size_t length) {
const size_t idx = threadIdx.x * blockDim.y + threadIdx.y;
const size_t idy = blockIdx.x * gridDim.y + blockIdx.y;
const size_t i = idy * NB_THREADS_2D_X * NB_THREADS_2D_Y + idx;
if (i < length)
d_y[i] = d_x[i] * i;
}
void test_working_2d(const size_t& N1, const size_t& N2) noexcept {
const size_t length = N1 * N2;
const size_t size = length * sizeof(size_t);
#ifdef __DEBUG
print("Estimating memory footprint at : " + format_byte_size(2 * size));
#endif
np::Array<size_t> x = np::empty<size_t>({ length }), y = np::empty<size_t>({ length });
size_t i;
for (i = 0; i < length; ++i)
x[i] = i;
np::Array<size_t> d_x = copyToDevice<size_t>("x", x), d_y = copyToDevice<size_t>("y", y);
const size_t dimX = static_cast<size_t>(std::ceil(static_cast<float64_t>(N1) / static_cast<float64_t>(NB_THREADS_2D_X)));
const size_t dimY = static_cast<size_t>(std::ceil(static_cast<float64_t>(N2) / static_cast<float64_t>(NB_THREADS_2D_Y)));
const dim3 dimGrid(dimX, dimY);
constexpr const dim3 dimBlock(NB_THREADS_2D_X, NB_THREADS_2D_Y);
__test_working_kernel_2d__<<<dimGrid, dimBlock>>>(d_x, d_y, length);
_print_cuda_error_("synchronize", cudaDeviceSynchronize());
_print_cuda_error_("memcpy d_y", cudaMemcpy(y.data, d_y.data, size, cudaMemcpyDeviceToHost));
size_t ne = 0;
for (i = 0; i < length; ++i)
if (y[i] != x[i] * i)
++ne;
if (ne != 0)
fprintf(stderr, "Invalid result : %lu/%lu <=> %f%%\n", ne, length, static_cast<float64_t>(ne) / static_cast<float64_t>(length));
cudaFree("d_x", d_x);
cudaFree("d_y", d_y);
}
static __global__ void __test_working_kernel_3d__(const np::Array<size_t> d_x, np::Array<size_t> d_y, const size_t length) {
const size_t idx = (threadIdx.x * blockDim.y + threadIdx.y) * blockDim.z + threadIdx.z;
const size_t idy = (blockIdx.x * gridDim.y + blockIdx.y) * gridDim.z + blockIdx.z;
const size_t i = idy * NB_THREADS_3D_X * NB_THREADS_3D_Y * NB_THREADS_3D_Z + idx;
if (i < length)
d_y[i] = d_x[i] * i;
}
void test_working_3d(const size_t& N1, const size_t& N2, const size_t& N3) noexcept {
const size_t length = N1 * N2 * N3;
const size_t size = length * sizeof(size_t);
#ifdef __DEBUG
print("Estimating memory footprint at : " + format_byte_size(2 * size));
#endif
np::Array<size_t> x = np::empty<size_t>({ length }), y = np::empty<size_t>({ length });
size_t i;
for (i = 0; i < length; ++i)
x[i] = i;
np::Array<size_t> d_x = copyToDevice<size_t>("x", x), d_y = copyToDevice<size_t>("y", y);
const size_t dimX = static_cast<size_t>(std::ceil(static_cast<float64_t>(N1) / static_cast<float64_t>(NB_THREADS_3D_X)));
const size_t dimY = static_cast<size_t>(std::ceil(static_cast<float64_t>(N2) / static_cast<float64_t>(NB_THREADS_3D_Y)));
const size_t dimZ = static_cast<size_t>(std::ceil(static_cast<float64_t>(N3) / static_cast<float64_t>(NB_THREADS_3D_Z)));
const dim3 dimGrid(dimX, dimY, dimZ);
constexpr const dim3 dimBlock(NB_THREADS_3D_X, NB_THREADS_3D_Y, NB_THREADS_3D_Z);
__test_working_kernel_3d__<<<dimGrid, dimBlock>>>(d_x, d_y, length);
_print_cuda_error_("synchronize", cudaDeviceSynchronize());
_print_cuda_error_("memcpy d_y", cudaMemcpy(y.data, d_y.data, size, cudaMemcpyDeviceToHost));
size_t ne = 0;
for (i = 0; i < length; ++i)
if (y[i] != x[i] * i)
++ne;
if (ne != 0)
fprintf(stderr, "Invalid result : %lu/%lu <=> %f%%\n", ne, length, static_cast<float64_t>(ne) / static_cast<float64_t>(length));
cudaFree("d_x", d_x);
cudaFree("d_y", d_y);
}
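// Exclusive prefix sum along the last axis of a (k, m, n) array, computed on the
// host; serves as the reference implementation and as a fallback for small inputs.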
static np::Array<uint32_t> __scanCPU_3d__(const np::Array<uint32_t>& X) noexcept {
np::Array<uint32_t> X_scan = np::empty<uint32_t>(X.shape);
const size_t total = np::prod(X_scan.shape);
const size_t i_step = np::prod(X_scan.shape, 1);
for(size_t x = 0; x < total; x += i_step)
for(size_t y = 0; y < i_step; y += X_scan.shape[2]){
uint32_t cum = 0;
for(size_t z = 0; z < X_scan.shape[2]; ++z){
const size_t idx = x + y + z;
cum += X[idx];
X_scan[idx] = cum - X[idx];
}
}
return X_scan;
}
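// Block-level Blelloch scan over shared memory: an up-sweep builds partial sums,
// each block's total is written to d_inter, the last element is zeroed, and a
// down-sweep turns the tree into an exclusive scan.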
static __global__ void __kernel_scan_3d__(const uint16_t n, const uint16_t j, np::Array<uint32_t> d_inter, np::Array<uint32_t> d_X) {
const size_t x_coor = blockIdx.x * blockDim.x + threadIdx.x;
const size_t y_coor = blockIdx.y * blockDim.y + threadIdx.y;
__shared__ uint32_t sA[NB_THREADS_2D_X * NB_THREADS_2D_Y];
sA[threadIdx.x * NB_THREADS_2D_Y + threadIdx.y] = (x_coor < n && y_coor < j) ?
d_X[blockIdx.z * d_X.shape[1] * d_X.shape[2] + y_coor * d_X.shape[2] + x_coor] : 0;
__syncthreads();
size_t k = threadIdx.x;
for(size_t d = 0; d < M; ++d){
k *= 2;
const size_t i1 = k + (1ULL << d) - 1;
const size_t i2 = k + (1ULL << (d + 1)) - 1;
if(i2 >= blockDim.x)
break;
sA[i2 * NB_THREADS_2D_Y + threadIdx.y] += sA[i1 * NB_THREADS_2D_Y + threadIdx.y];
}
__syncthreads();
if(threadIdx.x == 0){
d_inter[blockIdx.z * d_inter.shape[1] * d_inter.shape[2] + y_coor * d_inter.shape[2] + blockIdx.x] =
sA[(blockDim.x - 1) * NB_THREADS_2D_Y + threadIdx.y];
sA[(blockDim.x - 1) * NB_THREADS_2D_Y + threadIdx.y] = 0;
}
__syncthreads();
k = (1ULL << (M + 1)) * threadIdx.x;
for(int64_t d = M - 1; d > -1; --d){
k = k / 2;
const size_t i1 = k + (1ULL << d) - 1;
const size_t i2 = k + (1ULL << (d + 1)) - 1;
if(i2 >= blockDim.x)
continue;
const uint32_t t = sA[i1 * NB_THREADS_2D_Y + threadIdx.y];
sA[i1 * NB_THREADS_2D_Y + threadIdx.y] = sA[i2 * NB_THREADS_2D_Y + threadIdx.y];
sA[i2 * NB_THREADS_2D_Y + threadIdx.y] += t;
}
__syncthreads();
if(x_coor < n && y_coor < j)
d_X[blockIdx.z * d_X.shape[1] * d_X.shape[2] + y_coor * d_X.shape[2] + x_coor] = sA[threadIdx.x * NB_THREADS_2D_Y + threadIdx.y];
}
static __global__ void __add_3d__(np::Array<uint32_t> d_X, const np::Array<uint32_t> d_s, const uint16_t n, const uint16_t m) {
const size_t x_coor = blockIdx.x * blockDim.x + threadIdx.x;
const size_t y_coor = blockIdx.y * blockDim.y + threadIdx.y;
if(x_coor < n && y_coor < m)
d_X[blockIdx.z * d_X.shape[1] * d_X.shape[2] + y_coor * d_X.shape[2] + x_coor] += d_s[blockIdx.z * d_X.shape[1] * d_X.shape[2] + y_coor * d_X.shape[2] + blockIdx.x];
}
static np::Array<uint32_t> __scanGPU_3d__(const np::Array<uint32_t>& X) noexcept {
np::Array<uint32_t> X_scan = np::empty<uint32_t>(X.shape);
const size_t k = X.shape[0];
const size_t height = X.shape[1];
const size_t n = X.shape[2];
const size_t n_block_x = static_cast<size_t>(std::ceil(static_cast<float64_t>(X.shape[1]) / static_cast<float64_t>(NB_THREADS_2D_X)));
const size_t n_block_y = static_cast<size_t>(std::ceil(static_cast<float64_t>(X.shape[2]) / static_cast<float64_t>(NB_THREADS_2D_Y)));
np::Array<uint32_t> d_X = copyToDevice<uint32_t>("X", X);
np::Array<uint32_t> inter = np::empty<uint32_t>({ k, height, n_block_x });
np::Array<uint32_t> d_inter = copyToDevice<uint32_t>("inter", inter);
const dim3 dimGrid(n_block_x, n_block_y, k);
constexpr const dim3 dimBlock(NB_THREADS_2D_X, NB_THREADS_2D_Y);
__kernel_scan_3d__<<<dimGrid, dimBlock>>>(n, height, d_inter, d_X);
_print_cuda_error_("synchronize", cudaDeviceSynchronize());
_print_cuda_error_("memcpy d_inter", cudaMemcpy(inter.data, d_inter.data, np::prod(inter.shape) * sizeof(uint32_t), cudaMemcpyDeviceToHost));
if(n_block_x >= NB_THREADS_2D_X){
np::Array<uint32_t> sums = __scanGPU_3d__(inter);
np::Array<uint32_t> d_s = copyToDevice<uint32_t>("sums", sums);
__add_3d__<<<dimGrid, dimBlock>>>(d_X, d_s, n, height);
_print_cuda_error_("synchronize", cudaDeviceSynchronize());
_print_cuda_error_("memcpy d_X", cudaMemcpy(X_scan.data, d_X.data, np::prod(X_scan.shape) * sizeof(uint32_t), cudaMemcpyDeviceToHost));
} else {
np::Array<uint32_t> sums = __scanCPU_3d__(inter);
_print_cuda_error_("memcpy d_X", cudaMemcpy(X_scan.data, d_X.data, np::prod(X_scan.shape) * sizeof(uint32_t), cudaMemcpyDeviceToHost));
for(size_t p = 0; p < k; ++p)
for(size_t h = 0; h < height; ++h)
for(size_t i = 1; i < n_block_x; ++i)
for(size_t j = 0; j < NB_THREADS_2D_X; ++j){
const size_t idx = i * NB_THREADS_2D_X + j;
if(idx < n){
const size_t idy = p * X_scan.shape[1] * X_scan.shape[2] + h * X_scan.shape[2];
X_scan[idy + idx] += sums[idy + i];
}
}
}
return X_scan;
}
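// Tiled transpose through shared memory, one matrix per z-layer of the grid.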
static __global__ void __transpose_kernel__(const np::Array<uint32_t> d_X, np::Array<uint32_t> d_Xt) {
__shared__ uint32_t temp[NB_THREADS_2D_X * NB_THREADS_2D_Y];
size_t x = blockIdx.x * blockDim.x + threadIdx.x;
size_t y = blockIdx.y * blockDim.y + threadIdx.y;
if(x < d_X.shape[1] && y < d_X.shape[2])
temp[threadIdx.y * NB_THREADS_2D_Y + threadIdx.x] = d_X[blockIdx.z * d_X.shape[1] * d_X.shape[2] + x * d_X.shape[2] + y];
__syncthreads();
x = blockIdx.y * blockDim.y + threadIdx.x;
y = blockIdx.x * blockDim.x + threadIdx.y;
if(x < d_X.shape[2] && y < d_X.shape[1])
d_Xt[blockIdx.z * d_Xt.shape[1] * d_Xt.shape[2] + x * d_X.shape[2] + y] = temp[threadIdx.x * NB_THREADS_2D_Y + threadIdx.y];
}
static np::Array<uint32_t> __transpose_3d__(const np::Array<uint32_t>& X) noexcept {
np::Array<uint32_t> Xt = np::empty<uint32_t>({ X.shape[0], X.shape[2], X.shape[1] });
np::Array<uint32_t> d_X = copyToDevice<uint32_t>("X", X);
np::Array<uint32_t> d_Xt = copyToDevice<uint32_t>("Xt", Xt);
const size_t n_block_x = static_cast<size_t>(std::ceil(static_cast<float64_t>(X.shape[1]) / static_cast<float64_t>(NB_THREADS_2D_X)));
const size_t n_block_y = static_cast<size_t>(std::ceil(static_cast<float64_t>(X.shape[2]) / static_cast<float64_t>(NB_THREADS_2D_Y)));
const dim3 dimGrid(n_block_x, n_block_y, X.shape[0]);
constexpr const dim3 dimBlock(NB_THREADS_2D_X, NB_THREADS_2D_Y);
__transpose_kernel__<<<dimGrid, dimBlock>>>(d_X, d_Xt);
_print_cuda_error_("synchronize", cudaDeviceSynchronize());
_print_cuda_error_("memcpy d_Xt", cudaMemcpy(Xt.data, d_Xt.data, np::prod(Xt.shape) * sizeof(uint32_t), cudaMemcpyDeviceToHost));
cudaFree("X", d_X);
cudaFree("Xt", d_Xt);
return Xt;
}
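// Integral image as two scan + transpose passes; the scans currently run on the
// host via __scanCPU_3d__, with __scanGPU_3d__ available as the device path.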
np::Array<uint32_t> set_integral_image_gpu(const np::Array<uint8_t>& X) noexcept {
np::Array<uint32_t> X_ii = np::astype<uint32_t>(X);
X_ii = __scanCPU_3d__(X_ii);
X_ii = __transpose_3d__(X_ii);
X_ii = __scanCPU_3d__(X_ii);
return __transpose_3d__(X_ii);
}
static inline __device__ int16_t __compute_feature__(const np::Array<uint32_t>& d_X_ii, const size_t& j, const int16_t& x, const int16_t& y, const int16_t& w, const int16_t& h) noexcept {
const size_t _y = y * d_X_ii.shape[2] + x;
const size_t _yh = _y + h * d_X_ii.shape[2];
return d_X_ii[j + _yh + w] + d_X_ii[j + _y] - d_X_ii[j + _yh] - d_X_ii[j + _y + w];
}
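// One thread per (feature, image) pair: evaluate the Haar-like feature as the sum
// of the two positive rectangles minus the sum of the two negative ones.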
static __global__ void __apply_feature_kernel__(int32_t* d_X_feat, const np::Array<uint8_t> d_feats, const np::Array<uint32_t> d_X_ii) {
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
size_t j = blockIdx.y * blockDim.y + threadIdx.y;
if (i >= d_feats.shape[0] || j >= d_X_ii.shape[0])
return;
const size_t k = i * d_X_ii.shape[0] + j;
i *= np::prod(d_feats.shape, 1);
j *= np::prod(d_X_ii.shape, 1);
const int16_t p1 = __compute_feature__(d_X_ii, j, d_feats[i + 0], d_feats[i + 1], d_feats[i + 2], d_feats[i + 3]);
const int16_t p2 = __compute_feature__(d_X_ii, j, d_feats[i + 4], d_feats[i + 5], d_feats[i + 6], d_feats[i + 7]);
const int16_t n1 = __compute_feature__(d_X_ii, j, d_feats[i + 8], d_feats[i + 9], d_feats[i + 10], d_feats[i + 11]);
const int16_t n2 = __compute_feature__(d_X_ii, j, d_feats[i + 12], d_feats[i + 13], d_feats[i + 14], d_feats[i + 15]);
d_X_feat[k] = static_cast<int32_t>(p1 + p2) - static_cast<int32_t>(n1 + n2);
}
np::Array<int32_t> apply_features_gpu(const np::Array<uint8_t>& feats, const np::Array<uint32_t>& X_ii) noexcept {
const np::Array<int32_t> X_feat = np::empty<int32_t>({ feats.shape[0], X_ii.shape[0] });
int32_t* d_X_feat;
_print_cuda_error_("malloc d_X_feat", cudaMalloc(&d_X_feat, np::prod(X_feat.shape) * sizeof(int32_t)));
np::Array<uint32_t> d_X_ii = copyToDevice<uint32_t>("X_ii", X_ii);
np::Array<uint8_t> d_feats = copyToDevice<uint8_t>("feats", feats);
const size_t dimX = static_cast<size_t>(std::ceil(static_cast<float64_t>(feats.shape[0]) / static_cast<float64_t>(NB_THREADS_2D_X)));
const size_t dimY = static_cast<size_t>(std::ceil(static_cast<float64_t>(X_ii.shape[0]) / static_cast<float64_t>(NB_THREADS_2D_Y)));
const dim3 dimGrid(dimX, dimY);
constexpr const dim3 dimBlock(NB_THREADS_2D_X, NB_THREADS_2D_Y);
__apply_feature_kernel__<<<dimGrid, dimBlock>>>(d_X_feat, d_feats, d_X_ii);
_print_cuda_error_("synchronize", cudaDeviceSynchronize());
_print_cuda_error_("memcpy X_feat", cudaMemcpy(X_feat.data, d_X_feat, np::prod(X_feat.shape) * sizeof(int32_t), cudaMemcpyDeviceToHost));
_print_cuda_error_("free d_X_feat", cudaFree(d_X_feat));
cudaFree("free d_feats", d_feats);
cudaFree("free d_X_11", d_X_ii);
return X_feat;
}
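// One thread per feature: the same threshold search as train_weak_clf_cpu, with
// the thread index flattened from a 3D block layout.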
static __global__ void __train_weak_clf_kernel__(np::Array<float64_t> d_classifiers, const np::Array<uint8_t> d_y,
const np::Array<int32_t> d_X_feat, const np::Array<uint16_t> d_X_feat_argsort,
const np::Array<float64_t> d_weights, const float64_t total_pos, const float64_t total_neg) {
size_t i = blockIdx.x * blockDim.x * blockDim.y * blockDim.z;
i += threadIdx.x * blockDim.y * blockDim.z;
i += threadIdx.y * blockDim.z;
i += threadIdx.z;
// const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if(i >= d_classifiers.shape[0])
return;
size_t pos_seen = 0, neg_seen = 0;
float64_t pos_weights = 0.0, neg_weights = 0.0;
float64_t min_error = np::inf, best_threshold = 0.0, best_polarity = 0.0;
for(size_t j = 0; j < d_X_feat_argsort.shape[1]; ++j) {
const float64_t error = np::min(neg_weights + total_pos - pos_weights, pos_weights + total_neg - neg_weights);
if (error < min_error){
min_error = error;
best_threshold = d_X_feat[i * d_X_feat.shape[1] + d_X_feat_argsort[i * d_X_feat.shape[1] + j]];
best_polarity = pos_seen > neg_seen ? 1.0 : -1.0;
}
if(d_y[d_X_feat_argsort[i * d_X_feat.shape[1] + j]] == static_cast<uint8_t>(1)){
++pos_seen;
pos_weights += d_weights[d_X_feat_argsort[i * d_X_feat.shape[1] + j]];
} else {
++neg_seen;
neg_weights += d_weights[d_X_feat_argsort[i * d_X_feat.shape[1] + j]];
}
}
d_classifiers[i * 2] = best_threshold; d_classifiers[i * 2 + 1] = best_polarity;
}
np::Array<float64_t> train_weak_clf_gpu(const np::Array<int32_t>& X_feat, const np::Array<uint16_t>& X_feat_argsort, const np::Array<uint8_t>& y,
const np::Array<float64_t>& weights) noexcept {
float64_t total_pos = 0.0, total_neg = 0.0;
for(size_t i = 0; i < y.shape[0]; ++i)
(y[i] == static_cast<uint8_t>(1) ? total_pos : total_neg) += weights[i];
np::Array<float64_t> classifiers = np::empty<float64_t>({ X_feat.shape[0], 2});
np::Array<float64_t> d_classifiers = copyToDevice<float64_t>("classifiers", classifiers);
np::Array<int32_t> d_X_feat = copyToDevice<int32_t>("X_feat", X_feat);
np::Array<uint16_t> d_X_feat_argsort = copyToDevice<uint16_t>("X_feat_argsort", X_feat_argsort);
np::Array<float64_t> d_weights = copyToDevice<float64_t>("weights", weights);
np::Array<uint8_t> d_y = copyToDevice<uint8_t>("y", y);
const size_t n_blocks = static_cast<size_t>(std::ceil(static_cast<float64_t>(X_feat.shape[0]) / static_cast<float64_t>(NB_THREADS_3D_X * NB_THREADS_3D_Y * NB_THREADS_3D_Z)));
constexpr const dim3 dimBlock(NB_THREADS_3D_X, NB_THREADS_3D_Y, NB_THREADS_3D_Z);
// const size_t n_blocks = static_cast<size_t>(std::ceil(static_cast<float64_t>(X_feat.shape[0]) / static_cast<float64_t>(NB_THREADS)));
// constexpr const dim3 dimBlock(NB_THREADS);
__train_weak_clf_kernel__<<<n_blocks, dimBlock>>>(d_classifiers, d_y, d_X_feat, d_X_feat_argsort, d_weights, total_pos, total_neg);
_print_cuda_error_("synchronize", cudaDeviceSynchronize());
_print_cuda_error_("memcpy classifiers", cudaMemcpy(classifiers.data, d_classifiers.data, np::prod(classifiers.shape) * sizeof(float64_t), cudaMemcpyDeviceToHost));
cudaFree("free d_classifiers", d_classifiers);
cudaFree("free d_X_feat", d_X_feat);
cudaFree("free d_X_feat_argsort", d_X_feat_argsort);
cudaFree("free d_weights", d_weights);
cudaFree("free d_y", d_y);
return classifiers;
}
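// Device-side quicksort of an index array: Lomuto partition plus an explicit stack
// in place of recursion.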
template<typename T>
__device__ inline static int32_t as_partition_gpu(const T* a, uint16_t* indices, const size_t l, const size_t h) noexcept {
int32_t i = l - 1;
for (int32_t j = l; j <= h; ++j)
if (a[indices[j]] < a[indices[h]])
swap(&indices[++i], &indices[j]);
swap(&indices[++i], &indices[h]);
return i;
}
template<typename T>
__device__ void argsort_gpu(const T* a, uint16_t* indices, const size_t l, const size_t h) noexcept {
const size_t total = h - l + 1;
//int32_t* stack = new int32_t[total]{l, h};
//int32_t stack[total];
int32_t stack[6977];
//int32_t stack[1<<16];
stack[0] = l;
stack[1] = h;
size_t top = 1, low = l, high = h;
while (top <= total) {
high = stack[top--];
low = stack[top--];
if(low >= high)
break;
const int32_t p = as_partition_gpu(a, indices, low, high);
if (p - 1 > low && p - 1 < total) {
stack[++top] = low;
stack[++top] = p - 1;
}
if (p + 1 < high) {
stack[++top] = p + 1;
stack[++top] = high;
}
}
//delete[] stack;
}
template<typename T>
__global__ void argsort_bounded_gpu(const np::Array<T> a, uint16_t* indices){
const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= a.shape[0])
return;
for(size_t y = 0; y < a.shape[1]; ++y) indices[idx * a.shape[1] + y] = y;
argsort_gpu(&a[idx * a.shape[1]], &indices[idx * a.shape[1]], 0, a.shape[1] - 1);
}
np::Array<uint16_t> argsort_2d_gpu(const np::Array<int32_t>& X_feat) noexcept {
const np::Array<uint16_t> indices = np::empty<uint16_t>(X_feat.shape);
uint16_t* d_indices;
const size_t indices_size = np::prod(indices.shape) * sizeof(uint16_t);
np::Array<int32_t> d_X_feat = copyToDevice<int32_t>("X_feat", X_feat);
_print_cuda_error_("malloc d_indices", cudaMalloc(&d_indices, indices_size));
const size_t dimGrid = static_cast<size_t>(std::ceil(static_cast<float64_t>(X_feat.shape[0]) / static_cast<float64_t>(NB_THREADS)));
const dim3 dimBlock(NB_THREADS);
argsort_bounded_gpu<<<dimGrid, dimBlock>>>(d_X_feat, d_indices);
_print_cuda_error_("synchronize", cudaDeviceSynchronize());
_print_cuda_error_("memcpy d_indices", cudaMemcpy(indices.data, d_indices, indices_size, cudaMemcpyDeviceToHost));
cudaFree("free d_X_feat", d_X_feat);
_print_cuda_error_("free d_indices", cudaFree(d_indices));
return indices;
}
__host__ __device__
size_t np::prod(const np::Shape& shape, const size_t& offset) noexcept {
size_t result = shape[offset];
for(size_t i = 1 + offset; i < shape.length; ++i)
result *= shape[i];
return result;
}

cpp/ViolaJonesGPU.hpp Normal file

@@ -0,0 +1,11 @@
#pragma once
#include "data.hpp"
void test_working(const size_t&) noexcept;
void test_working_2d(const size_t&, const size_t&) noexcept;
void test_working_3d(const size_t&, const size_t&, const size_t&) noexcept;
np::Array<uint32_t> set_integral_image_gpu(const np::Array<uint8_t>&) noexcept;
np::Array<int32_t> apply_features_gpu(const np::Array<uint8_t>&, const np::Array<uint32_t>&) noexcept;
np::Array<float64_t> train_weak_clf_gpu(const np::Array<int32_t>& X_feat, const np::Array<uint16_t>& X_feat_argsort, const np::Array<uint8_t>& y,
const np::Array<float64_t>& weights) noexcept;
np::Array<uint16_t> argsort_2d_gpu(const np::Array<int32_t>& X_feat) noexcept;

cpp/config.hpp Normal file

@@ -0,0 +1,14 @@
#pragma once
// Save state to avoid recalculation on restart
#define SAVE_STATE true
// Redo the state even if it's already saved
#define FORCE_REDO false
// Use GPU to greatly accelerate runtime
#define GPU_BOOSTED true
// Number of weak classifiers
// const size_t TS[] = { 1 };
// const size_t TS[] = { 1, 5, 10 };
// const size_t TS[] = { 1, 5, 10, 25, 50 };
// const size_t TS[] = { 1, 5, 10, 25, 50, 100, 200, 300 };
const size_t TS[] = { 1, 5, 10, 25, 50, 100, 200, 300, 400, 500, 1000 };

cpp/data.cpp Normal file

@@ -0,0 +1,212 @@
#include "data.hpp"
//#include "toolbox.hpp"
//#include <cstring>
int print(const np::Shape& shape) noexcept {
int num_written = 0;
num_written += printf("(");
if (shape.length > 1) {
const size_t length = shape.length - 1;
for (size_t i = 0; i < length; ++i)
num_written += printf("%lu, ", shape[i]);
num_written += printf("%lu)\n", shape[length]);
}
else
num_written += printf("%lu,)\n", shape[0]);
return num_written;
}
template<typename T>
int print(const np::Array<T>& array, const char* format) noexcept {
//printf("[");
//const size_t length = np::prod(array.shape);
//for(size_t i = 0; i < length - 1; ++i)
// //std::cout << array[i] << " ";
// printf("%f ", array[i]);
////std::cout << array[array.shape[0] - 1] << "]\n";
//printf("%f]\n", array[length - 1]);
char format_space[BUFFER_SIZE] = { 0 };
sprintf(format_space, "%s ", format);
char format_close[BUFFER_SIZE] = { 0 };
sprintf(format_close, "%s]\n", format);
int num_written = 0;
if (array.shape.length == 1) {
const size_t max = array.shape[0] - 1;
num_written += printf("[");
for (size_t i = 0; i < max; ++i)
num_written += printf(format_space, array[i]);
num_written += printf(format_close, array[max]);
}
else {
num_written += printf("[");
for (size_t i = 0; i < array.shape[0]; ++i) {
num_written += printf(" [");
for (size_t j = 0; j < array.shape[1] - 1; ++j)
num_written += printf(format_space, array[i * array.shape[1] + j]);
num_written += printf(format_close, array[i * array.shape[1] + array.shape[1] - 1]);
}
num_written += printf("]\n");
}
return num_written;
}
int print(const np::Array<uint8_t>& array) noexcept {
return print(array, "%hu");
}
int print(const np::Array<float64_t>& array) noexcept {
return print(array, "%f");
}
int print_feat(const np::Array<uint8_t>& array, const np::Slice& slice) noexcept {
int num_written = 0;
num_written += printf("[");
const size_t feat_size = np::prod(array.shape, 1);
const size_t offset = slice.x * feat_size;
const size_t length = offset + feat_size - 1;
for (size_t i = offset; i < length; ++i)
num_written += printf("%2hu ", array[i]);
num_written += printf("%2hu]\n", array[length]);
return num_written;
}
int print(const np::Array<uint8_t>& array, const np::Slice& slice) noexcept {
int num_written = 0;
if (array.shape.length == 1) {
const size_t max = slice.y - 1; //std::min(slice.y, array.shape[0] - 1);
num_written += printf("[");
for (size_t i = slice.x; i < max; ++i)
num_written += printf("%hu ", array[i]);
num_written += printf("%hu]\n", array[max]);
}
else {
num_written += printf("[");
size_t k = slice.x * array.shape[1] * array.shape[2] + slice.y * array.shape[2] + slice.z;
for (size_t i = 0; i < array.shape[1]; ++i) {
num_written += printf(" [");
for (size_t j = 0; j < array.shape[2]; ++j)
num_written += printf("%3hu ", array[k + i * array.shape[1] + j]);
num_written += printf("]\n");
}
num_written += printf("]\n");
}
return num_written;
}
int print(const np::Array<uint32_t>& array, const np::Slice& slice) noexcept {
int num_written = 0;
if (array.shape.length == 1) {
const size_t max = slice.y - 1; //std::min(slice.y, array.shape[0] - 1);
num_written += printf("[");
for (size_t i = slice.x; i < max; ++i)
num_written += printf("%iu ", array[i]);
num_written += printf("%iu]\n", array[max]);
}
else {
num_written += printf("[");
size_t k = slice.x * array.shape[1] * array.shape[2] + slice.y * array.shape[2] + slice.z;
for (size_t i = 0; i < array.shape[1]; ++i) {
num_written += printf(" [");
for (size_t j = 0; j < array.shape[2]; ++j)
num_written += printf("%5i ", array[k + i * array.shape[1] + j]);
num_written += printf("]\n");
}
num_written += print("]");
}
return num_written;
}
int print(const np::Array<int32_t>& array, const np::Slice& slice) noexcept {
int num_written = 0;
num_written += printf("[");
//size_t k = slice.x * array.shape[1] * array.shape[2] + slice.y * array.shape[2] + slice.z;
size_t k = slice.x * array.shape[1];
for (size_t i = k; i < k + (slice.y - slice.x); ++i) {
num_written += printf("%5i ", array[i]);
}
num_written += print("]");
return num_written;
}
int print(const np::Array<uint16_t>& array, const np::Slice& slice) noexcept {
int num_written = 0;
num_written += printf("[");
//size_t k = slice.x * array.shape[1] * array.shape[2] + slice.y * array.shape[2] + slice.z;
size_t k = slice.x * array.shape[1];
for (size_t i = k; i < k + (slice.y - slice.x); ++i) {
num_written += printf("%5hu ", array[i]);
}
num_written += print("]");
return num_written;
}
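// Parse a dataset stored as ASCII: a header line holding up to three dimensions,
// followed by whitespace-separated integer pixel values.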
static inline np::Array<uint8_t> load_set(const char* set_name) {
FILE* file = fopen(set_name, "rb");
if (file == NULL) {
print_error_file(set_name);
throw;
}
char meta[BUFFER_SIZE];
if (!fgets(meta, BUFFER_SIZE, file)) {
print_error_file(set_name);
fclose(file);
throw;
}
size_t* dims = new size_t[3]();
if (!sscanf(meta, "%lu %lu %lu", &dims[0], &dims[1], &dims[2])) {
print_error_file(set_name);
fclose(file);
throw;
}
np::Shape shape = { static_cast<size_t>(dims[1] == 0 ? 1 : 3), dims };
np::Array<uint8_t> a = np::empty<uint8_t>(std::move(shape));
const size_t size = np::prod(a.shape);
size_t i = 0, j = 0;
int c;
char buff[STRING_INT_SIZE] = { 0 };
while ((c = fgetc(file)) != EOF && i < size) {
if (c == ' ' || c == '\n') {
buff[j] = '\0';
a[i++] = static_cast<uint8_t>(atoi(buff));
//memset(buff, 0, STRING_INT_SIZE);
j = 0;
}
else
buff[j++] = (char)c;
}
buff[j] = '\0';
a[i++] = static_cast<uint8_t>(atoi(buff));
if (i != size) {
fprintf(stderr, "Missing loaded data %lu/%lu\n", i, size);
fclose(file);
throw;
}
fclose(file);
return a;
}
std::array<np::Array<uint8_t>, 4> load_datasets() {
return {
load_set(DATA_DIR "/X_train.bin"), load_set(DATA_DIR "/y_train.bin"),
load_set(DATA_DIR "/X_test.bin"), load_set(DATA_DIR "/y_test.bin")
};
}
void print_error_file(const char* file_dir) noexcept {
const char* buff = strerror(errno);
fprintf(stderr, "Can't open %s, error code = %d : %s\n", file_dir, errno, buff);
// delete buff;
}
//size_t np::prod(const np::Shape& shape, const size_t& offset) noexcept {
// size_t result = shape[offset];
// for(size_t i = 1 + offset; i < shape.length; ++i)
// result *= shape[i];
// return result;
//}

cpp/data.hpp Normal file

@@ -0,0 +1,954 @@
#pragma once
#include <iostream>
#include <cstring>
#include <cmath>
#include <cassert>
#include <functional>
#include <memory>
#define DATA_DIR "../data"
#define OUT_DIR "./out"
#define MODEL_DIR "./models"
#define BUFFER_SIZE 256
#define STRING_INT_SIZE 8 // Maximum length of a number's decimal representation (including '-')
#define S(N) std::string(N, '-').c_str()
#ifndef __CUDACC__
#define __host__
#define __device__
#endif
typedef float float32_t;
typedef double float64_t;
typedef long double float128_t;
__host__ __device__
constexpr inline int print(const char* str) noexcept {
return printf("%s\n", str);
}
inline int print(const std::string& s) noexcept {
return printf("%s\n", s.c_str());
}
namespace np {
constexpr const float64_t inf = std::numeric_limits<float64_t>::infinity();
typedef struct Slice {
size_t x = 0, y = 0, z = 0;
} Slice;
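// Minimal NumPy-like nd-array for host and device code: Shape and Array are
// reference counted, and copies share the underlying buffer instead of deep-copying.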
typedef struct Shape {
size_t length = 0;
size_t* data = nullptr;
size_t* refcount = nullptr;
#ifdef __DEBUG
size_t total = 1;
#endif
__host__ __device__
Shape() noexcept {
// #ifdef __DEBUG
// print("Shape created (default)");
// #endif
}
__host__ __device__
Shape(const size_t& length, size_t* data) noexcept : length(length), data(data), refcount(new size_t(1)) {
#ifdef __DEBUG
//print("Shape created (raw)");
for(size_t i = 0; i < length; ++i)
total *= data[i];
#endif
}
__host__ __device__
Shape(const std::initializer_list<size_t>& dims) noexcept : length(dims.size()), data(new size_t[dims.size()]), refcount(new size_t(1)) {
// #ifdef __DEBUG
// print("Shape created (initializer)");
// #endif
const auto* begin = dims.begin();
for(size_t i = 0; i < length; ++i){
data[i] = begin[i];
#ifdef __DEBUG
total *= data[i];
#endif
}
}
__host__ __device__
Shape(const Shape& shape) noexcept {
#ifdef __DEBUG
print("Shape created (copy)");
#endif
if (data != nullptr && data != shape.data){
#ifdef __DEBUG
print("Former shape deleted (copy)");
#endif
delete[] data;
}
if (refcount != nullptr && refcount != shape.refcount){
#ifdef __DEBUG
print("Former shape refcount freed (copy)");
#endif
delete refcount;
}
length = shape.length;
//data = new size_t[length];
//memcpy(data, shape.data, length * sizeof(size_t));
//refcount = new size_t;
//memcpy(refcount, shape.refcount, sizeof(size_t));
data = shape.data;
refcount = shape.refcount;
if (refcount != nullptr)
(*refcount)++;
#ifdef __DEBUG
else
print("Moved shape has null refcount");
total = shape.total;
#endif
}
__host__ __device__
Shape(Shape&& shape) noexcept {
// #ifdef __DEBUG
// print("Shape created (move));
// #endif
if (data != nullptr && data != shape.data){
#ifdef __DEBUG
print("Former shape deleted (move)");
#endif
delete[] data;
}
if (refcount != nullptr && refcount != shape.refcount){
#ifdef __DEBUG
print("Former shape refcount freed (move)");
#endif
delete refcount;
}
length = shape.length;
data = shape.data;
refcount = shape.refcount;
shape.length = 0;
shape.data = nullptr;
shape.refcount = nullptr;
#ifdef __DEBUG
total = shape.total;
shape.total = 1;
#endif
}
__host__ __device__
~Shape() noexcept {
if(refcount == nullptr){
// #ifdef __DEBUG
// print("Shape refcount freed more than once");
// #endif
return;
}
--(*refcount);
// #ifdef __DEBUG
// printf("Shape destructed : %lu\n", *refcount);
// #endif
if(*refcount == 0){
if (data != nullptr){
delete[] data;
data = nullptr;
// #ifdef __DEBUG
// print("Shape freeing ...");
// #endif
}
//#ifdef __DEBUG
else
printf("Shape freed more than once : %lu\n", *refcount);
//#endif
delete refcount;
refcount = nullptr;
#ifdef __DEBUG
total = 1;
#endif
}
}
__host__ __device__
Shape& operator=(const Shape& shape) noexcept {
#ifdef __DEBUG
print("Shape created (assign copy)");
#endif
if (data != nullptr && data != shape.data){
#ifdef __DEBUG
print("Former shape deleted (assign copy)");
#endif
delete[] data;
}
if (refcount != nullptr && refcount != shape.refcount){
#ifdef __DEBUG
print("Former shape refcount freed (assign copy)");
#endif
delete refcount;
}
length = shape.length;
// data = new size_t[length];
// memcpy(data, shape.data, length * sizeof(size_t));
// refcount = new size_t;
// memcpy(refcount, shape.refcount, sizeof(size_t));
data = shape.data;
refcount = shape.refcount;
if (refcount != nullptr)
(*refcount)++;
#ifdef __DEBUG
else
printf("Assigned copy shape has null refcount");
total = shape.total;
#endif
return *this;
}
__host__ __device__
Shape& operator=(Shape&& shape) noexcept {
// #ifdef __DEBUG
// print("Shape created (assign move)");
// #endif
if (data != nullptr && data != shape.data){
#ifdef __DEBUG
print("Former shape deleted (assign move)");
#endif
delete[] data;
}
if (refcount != nullptr && refcount != shape.refcount){
#ifdef __DEBUG
print("Former shape refcount freed (assign move)");
#endif
delete refcount;
}
length = shape.length;
data = shape.data;
refcount = shape.refcount;
#ifdef __DEBUG
total = shape.total;
if (refcount == nullptr)
print("Assigned copy shape has null refcount");
shape.total = 1;
#endif
shape.length = 0;
shape.data = nullptr;
shape.refcount = nullptr;
return *this;
}
__host__ __device__
constexpr size_t& operator[](const size_t& i) const {
#ifdef __DEBUG
if (i > length){
printf("Index %lu out of shape length %lu\n", i, length);
#ifndef __CUDACC__
throw std::out_of_range("Index out of shape size");
#endif
}
#endif
return data[i];
}
constexpr bool operator==(const Shape& other) const noexcept {
if (length != other.length)
return false;
#ifdef __DEBUG
if (total != other.total)
return false;
#endif
for(size_t i = 0; i < length; ++i)
if (data[i] != other[i])
return false;
return true;
}
constexpr bool operator!=(const Shape& other) const noexcept {
return !(*this == other);
}
} Shape;
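// Element count of a shape: the product of its dimensions, optionally starting at a given axis offset.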
__host__ __device__
size_t prod(const Shape&, const size_t& = 0) noexcept;
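/**
 * @brief Minimal NumPy-like n-dimensional array.
 *
 * Holds a flat, row-major buffer of T plus its Shape; ownership is shared
 * through a manual reference count, so copies alias the same buffer.
 */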
template<typename T>
struct Array {
Shape shape;
T* data = nullptr;
size_t* refcount = nullptr;
__host__ __device__
Array() noexcept {
// #ifdef __DEBUG
// print("Array created (default)");
// #endif
}
__host__ __device__
Array(const Shape& shape, T* data) noexcept : shape(shape), data(data), refcount(new size_t(1)) {
// #ifdef __DEBUG
// print("Array created (raw, copy shape)");
// #endif
}
__host__ __device__
Array(const Shape& shape) noexcept : shape(shape), data(new T[np::prod(shape)]), refcount(new size_t(1)) {
// #ifdef __DEBUG
// print("Array created (raw empty, copy shape)");
// #endif
}
__host__ __device__
Array(Shape&& shape, T* data) noexcept : shape(std::move(shape)), data(data), refcount(new size_t(1)) {
// #ifdef __DEBUG
// print("Array created (raw, move shape)");
// #endif
}
__host__ __device__
Array(Shape&& shape) noexcept : shape(std::move(shape)), data(new T[np::prod(this->shape)]), refcount(new size_t(1)) { // read the member shape: the parameter has been moved from
// #ifdef __DEBUG
// print("Array created (raw empty, move shape)");
// #endif
}
__host__ __device__
Array(const Array& array) noexcept : shape(array.shape) {
#ifdef __DEBUG
print("Array created (copy)");
#endif
if (data != nullptr && data != array.data){
#ifdef __DEBUG
print("Former array deleted (copy)");
#endif
delete[] data;
}
if (refcount != nullptr && refcount != array.refcount){
#ifdef __DEBUG
print("Former array refcount freed (move)");
#endif
delete refcount;
}
// const size_t size = np::prod(shape);
// data = new T[size];
// memcpy(data, array.data, size);
// refcount = new size_t;
// memcpy(refcount, array.refcount, sizeof(size_t));
data = array.data;
refcount = array.refcount;
if (refcount != nullptr)
(*refcount)++;
#ifdef __DEBUG
else
print("Moved array has null refcount");
#endif
}
__host__ __device__
Array(Array&& array) noexcept {
// #ifdef __DEBUG
// print("Array created (move)");
// #endif
if (data != nullptr && data != array.data){
#ifdef __DEBUG
print("Former array deleted (move)");
#endif
delete[] data;
}
if (refcount != nullptr && refcount != array.refcount){
#ifdef __DEBUG
print("Former array refcount freed (move)");
#endif
delete refcount;
}
shape = std::move(array.shape);
data = array.data;
refcount = array.refcount;
array.data = nullptr;
array.refcount = nullptr;
}
__host__ __device__
~Array() noexcept {
if(refcount == nullptr){
// #ifdef __DEBUG
// print("Array refcount freed more than once");
// #endif
return;
}
--(*refcount);
// #ifdef __DEBUG
// printf("Array destructed : %lu\n", *refcount);
// #endif
if(*refcount == 0){
if (data != nullptr){
delete[] data;
data = nullptr;
// #ifdef __DEBUG
// print("Array freeing ...");
// #endif
}
#ifdef __DEBUG
else
printf("Array freed more than once : %lu\n", *refcount);
#endif
delete refcount;
refcount = nullptr;
}
}
__host__ __device__
Array& operator=(const Array& array) noexcept {
#ifdef __DEBUG
print("Array created (assign copy)");
#endif
if (data != nullptr && data != array.data){
#ifdef __DEBUG
print("Former array deleted (assign copy)");
#endif
delete[] data;
}
if (refcount != nullptr && refcount != array.refcount){
#ifdef __DEBUG
print("Former array refcount freed (assign copy)");
#endif
delete refcount;
}
shape = array.shape;
// const size_t size = np::prod(shape) * sizeof(T);
// data = new T[size];
// memcpy(data, array.data, size);
// refcount = new size_t;
// memcpy(refcount, array.refcount, sizeof(size_t));
data = array.data;
refcount = array.refcount;
if (refcount != nullptr)
(*refcount)++;
#ifdef __DEBUG
else
print("Assigned array has null refcount");
#endif
return *this;
}
__host__ __device__
Array& operator=(Array&& array) noexcept {
// #ifdef __DEBUG
// print("Array created (assign move)");
// #endif
if (data != nullptr && data != array.data){
#ifdef __DEBUG
print("Former array deleted (assign move)");
#endif
delete[] data;
}
if (refcount != nullptr && refcount != array.refcount){
#ifdef __DEBUG
print("Former array refcount freed (assign move)");
#endif
delete refcount;
}
shape = std::move(array.shape);
data = array.data;
refcount = array.refcount;
array.refcount = nullptr;
array.data = nullptr;
return *this;
}
__host__ __device__
constexpr T& operator[](const size_t& i) const;
// bool operator==(const Array& other) const noexcept;
template<typename F>
Array<T>& operator/=(const F& value) noexcept;
// Array<T>& operator*=(const T& value) noexcept;
template<typename F>
Array<T>& operator*=(const Array<F>& other);
template<typename F>
Array<T>& operator+=(const Array<F>& other);
template<typename F>
Array<T> operator*(const Array<F>& other) const;
// template<typename F>
// Array<T> operator*(const F& other) const;
template<typename F>
Array<T> operator-(const np::Array<F>& other) const;
template<typename F>
Array<T> operator-(const F& other) const;
};
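// Factory helpers mirroring numpy.empty / numpy.zeros: empty leaves the buffer
// uninitialized, zeros value-initializes it. A minimal usage sketch (the shape
// values are illustrative only):
//   np::Array<float64_t> a = np::zeros<float64_t>({ 3, 2 });
//   a[4] = 1.0; // flat, row-major indexing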
template<typename T>
Array<T> empty(Shape&& shape) noexcept {
return { std::move(shape), new T[np::prod(shape)] };
}
template<typename T>
Array<T> empty(const Shape& shape) noexcept {
return { shape, new T[np::prod(shape)] };
}
template<typename T>
Array<T> empty(const std::initializer_list<size_t>& dims) noexcept {
Shape shape(dims);
const size_t size = np::prod(shape);
return { std::move(shape), new T[size] };
}
template<typename T>
Array<T> zeros(Shape&& shape) noexcept {
return { std::move(shape), new T[np::prod(shape)]{0} };
}
template<typename T>
Array<T> zeros(const Shape& shape) noexcept {
return { shape, new T[np::prod(shape)]{0} };
}
template<typename T>
Array<T> zeros(const std::initializer_list<size_t>& dims) noexcept {
Shape shape(dims);
const size_t size = np::prod(shape);
return { std::move(shape), new T[size]{0} };
}
template<typename T>
__host__ __device__
constexpr T& Array<T>::operator[](const size_t& i) const {
#ifdef __DEBUG
if (i >= shape.total){
printf("Index %lu out of array size %lu\n", i, shape.total);
#ifndef __CUDACC__
throw std::out_of_range("Index out of array size");
#endif
}
#endif
return data[i];
}
// bool Array<T>::operator==(const Array& other) const noexcept {
// if (shape != other.shape)
// return false;
// const size_t length = np::prod(shape);
// for(size_t i = 0; i < length; ++i)
// if (data[i] != other[i])
// return false;
// return true;
// }
template<typename T>
template<typename F>
Array<T>& Array<T>::operator/=(const F& value) noexcept {
const size_t total = prod(shape);
for(size_t i = 0; i < total; ++i)
data[i] /= value;
return *this;
}
// template<typename T>
// Array<T>& Array<T>::operator*=(const T& value) noexcept {
// const size_t total = prod(shape);
// for(size_t i = 0; i < total; ++i)
// data[i] *= value;
// return *this;
// }
template<typename T>
template<typename F>
Array<T> Array<T>::operator*(const Array<F>& other) const {
#ifdef __DEBUG
if (shape != other.shape){
printf("Incompatible shapes\n");
throw;
}
#endif
np::Array<T> res = np::empty<T>(shape);
const size_t total = prod(shape);
for(size_t i = 0; i < total; ++i)
res[i] = data[i] * other[i];
return res;
}
// template<typename T>
// template<typename F>
// Array<T> Array<T>::operator*(const F& value) const {
// np::Array<T> res = np::empty<T>(shape);
// const size_t total = prod(shape);
// for(size_t i = 0; i < total; ++i)
// res[i] = data[i] * value;
// return res;
// }
// template<typename T, typename F>
// Array<T> operator*(const F& value, const Array<T>& other) {
// np::Array<T> res = np::empty<T>(other.shape);
// const size_t total = prod(other.shape);
// for(size_t i = 0; i < total; ++i)
// res[i] = other[i] * value;
// return res;
// }
template<typename T>
template<typename F>
Array<T>& Array<T>::operator*=(const Array<F>& other) {
#ifdef __DEBUG
if (shape != other.shape){
printf("Incompatible shapes\n");
throw;
}
#endif
const size_t total = prod(shape);
for(size_t i = 0; i < total; ++i)
data[i] *= other[i];
return *this;
}
template<typename T>
template<typename F>
Array<T>& Array<T>::operator+=(const Array<F>& other) {
#ifdef __DEBUG
if (shape != other.shape){
printf("Incompatible shapes\n");
throw;
}
#endif
const size_t total = prod(shape);
for(size_t i = 0; i < total; ++i)
data[i] += other[i];
return *this;
}
template<typename T>
template<typename F>
Array<T> Array<T>::operator-(const F& other) const {
np::Array<T> res = np::empty<T>(shape);
const size_t total = prod(shape);
for(size_t i = 0; i < total; ++i)
res[i] = data[i] - other;
return res;
}
template<typename T>
template<typename F>
Array<T> Array<T>::operator-(const np::Array<F>& other) const {
#ifdef __DEBUG
if (shape != other.shape){
printf("Incompatible shapes\n");
throw;
}
#endif
np::Array<T> res = np::empty<T>(shape);
const size_t total = prod(shape);
for(size_t i = 0; i < total; ++i)
res[i] = data[i] - other[i];
return res;
}
// template<typename T>
// T prod(const Array<T>& array, const size_t& offset = 0) noexcept {
// T result = array[offset];
// const size_t total = prod(array.shape);
// for(size_t i = 1 + offset; i < total; ++i)
// result *= array[i];
// return result;
// }
template<typename T>
T sum(const Array<T>& array) noexcept {
T result = array[0];
const size_t total = prod(array.shape);
for(size_t i = 1; i < total; ++i)
result += array[i];
return result;
}
template<typename T, typename F>
F mean(const Array<T>& array) noexcept {
F result = static_cast<F>(array[0]);
const size_t total = prod(array.shape);
for(size_t i = 1; i < total; ++i)
result += static_cast<F>(array[i]);
return result / static_cast<F>(total);
}
template<typename T>
float64_t mean(const Array<T>& array) noexcept {
return mean<T, float64_t>(array);
}
template<typename T>
np::Array<T> abs(const Array<T>& array) noexcept {
np::Array<T> result = np::empty<T>(array.shape);
const size_t total = prod(array.shape);
for(size_t i = 0; i < total; ++i)
result[i] = std::abs(array[i]);
return result;
}
template<typename T, typename F>
np::Array<T> pow(const F& k, const Array<T>& array) noexcept {
np::Array<T> result = np::empty<T>(array.shape);
const size_t total = prod(array.shape);
for(size_t i = 0; i < total; ++i)
result[i] = std::pow(k, array[i]);
return result;
}
//template<typename T>
//T max(const Array<T>& array) noexcept {
// T result = array[0];
// for(size_t i = 1; i < prod(array.shape); ++i)
// if(array[i] > result)
// result = array[i];
// return result;
//}
//template<typename T>
//T min(const Array<T>& array) noexcept {
// T result = array[0];
// for(size_t i = 1; i < prod(array.shape); ++i)
// if(array[i] < result)
// result = array[i];
// return result;
//}
template<typename T, typename F>
Array<T> astype(const Array<F>& array) noexcept {
Array<T> res = empty<T>(array.shape);
const size_t total = prod(array.shape);
for(size_t i = 0; i < total; ++i)
res[i] = static_cast<T>(array[i]);
return res;
}
template<typename T>
Array<T> operator-(const T& k, const Array<T>& other) noexcept {
np::Array<T> res = empty<T>(other.shape);
const size_t total = prod(other.shape);
for(size_t i = 0; i < total; ++i)
res[i] = k - other[i];
return res;
}
template<typename T>
__host__ __device__
constexpr T min(const T& lhs, const T& rhs) noexcept {
return lhs < rhs ? lhs : rhs;
}
// template<typename T>
// __host__ __device__
// constexpr T max(const T& lhs, const T& rhs) noexcept {
// return lhs > rhs ? lhs : rhs;
// }
};
template<typename T>
constexpr np::Array<T>& map(np::Array<T>& a, const T(&fnc)(const size_t& i, const T& v)) noexcept {
const size_t a_length = np::prod(a.shape);
for (size_t i = 0; i < a_length; ++i)
a[i] = fnc(i, a[i]);
return a;
}
template<typename T>
constexpr np::Array<T>& map(np::Array<T>& a, const std::function<T(const size_t&, const T&)>& fnc) noexcept {
const size_t a_length = np::prod(a.shape);
for (size_t i = 0; i < a_length; ++i)
a[i] = fnc(i, a[i]);
return a;
}
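// Usage sketch (illustrative): overwrite each element with its flat index,
// as done for the argsort index arrays below.
//   map(indices, [](const size_t& i, const uint16_t&) -> uint16_t { return static_cast<uint16_t>(i); });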
//template<typename T>
//constexpr void foreach(const np::Array<T>& a, const void(&fnc)(const size_t&, const T&)) noexcept {
// return std::function<void(const size_t&, const T&)>(fnc);
//}
//template<typename T>
//constexpr void foreach(const np::Array<T>& a, const std::function<void(const size_t&, const T&)>& fnc) noexcept {
// for (size_t i = 0; i < a.length; ++i)
// fnc(i, a[i]);
//}
template<typename T>
__host__ __device__
constexpr inline static void swap(T* a, T* b) noexcept {
if (a == b) return;
const T temp = *a;
*a = *b;
*b = temp;
}
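// Lomuto partition scheme: every element strictly smaller than the pivot a[h]
// ends up left of the returned pivot index.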
template<typename T>
static int32_t qs_partition(const np::Array<T>& a, const int32_t& l, const int32_t& h) noexcept {
int32_t i = l - 1;
for (int32_t j = l; j <= h; ++j)
if (a[j] < a[h])
swap(&a[++i], &a[j]);
swap(&a[++i], &a[h]);
return i;
}
template<typename T>
void quicksort(const np::Array<T>& a, const int32_t& l, const int32_t& h) noexcept {
if (l >= h)
return;
const int32_t p = qs_partition(a, l, h);
quicksort(a, l, p - 1);
quicksort(a, p + 1, h);
}
template<typename T>
void quicksort(const np::Array<T>& a) noexcept {
quicksort(a, 0, static_cast<int32_t>(np::prod(a.shape)) - 1);
}
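// Same partition scheme as above, applied to an index array so the values
// themselves are never moved (argsort).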
template<typename T>
static size_t as_partition(const T* a, uint16_t* indices, const size_t& l, const size_t& h) noexcept {
size_t i = l - 1;
for (size_t j = l; j <= h; ++j)
if (a[indices[j]] < a[indices[h]])
swap(&indices[++i], &indices[j]);
swap(&indices[++i], &indices[h]);
return i;
}
template<typename T>
void argsort(const T* a, uint16_t* indices, const size_t& l, const size_t& h) noexcept {
const size_t total = h - l + 1;
size_t* stack = new size_t[total]{l, h};
size_t top = 1, low = l, high = h;
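// Iterative quicksort over an explicit stack; size_t wraparound is intentional:
// popping the last frame wraps top to SIZE_MAX, which ends the loop, and the
// p - 1 < total check guards the p == 0 underflow.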
while (top <= total) {
high = stack[top--];
low = stack[top--];
if(low >= high)
break;
const size_t p = as_partition(a, indices, low, high);
if (p - 1 > low && p - 1 < total) {
stack[++top] = low;
stack[++top] = p - 1;
}
if (p + 1 < high) {
stack[++top] = p + 1;
stack[++top] = high;
}
}
delete[] stack;
}
template<typename T>
np::Array<uint16_t> argsort(const np::Array<T>& other, const size_t& l, const size_t& h) noexcept {
np::Array<uint16_t> indices = np::empty<uint16_t>(other.shape);
map(indices, [](const size_t& i, const uint16_t&) -> uint16_t { return static_cast<uint16_t>(i); });
argsort(other.data, indices.data, l, h);
return indices;
}
template<typename T>
np::Array<uint16_t> argsort(const np::Array<T>* other, const size_t& length) noexcept {
return argsort(*other, 0, length - 1);
}
std::array<np::Array<uint8_t>, 4> load_datasets(void);
void print_error_file(const char*) noexcept;
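// On-disk layout used by save() / load():
//   [size_t shape.length][size_t shape[0] ... shape[length - 1]][T data[prod(shape)]]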
template<typename T>
void save(const np::Array<T>& d, const char* filename) {
FILE* output = fopen(filename, "wb");
if (output == NULL) {
print_error_file(filename);
throw;
}
assert(d.shape.refcount != nullptr); // Refcount shape is zero !!
fwrite(&d.shape.length, sizeof(size_t), 1, output);
fwrite(d.shape.data, sizeof(size_t), d.shape.length, output);
assert(d.refcount != nullptr); // Refcount array is zero !!
fwrite(d.data, sizeof(T), np::prod(d.shape), output);
fclose(output);
}
template<typename T>
np::Array<T> load(const char* filename) {
FILE* input = fopen(filename, "rb");
if (input == NULL) {
print_error_file(filename);
throw;
}
size_t length = 0;
if(!fread(&length, sizeof(size_t), 1, input)){
print_error_file(filename);
fclose(input);
throw;
}
size_t* data = new size_t[length];
if(!fread(data, sizeof(size_t), length, input)){
print_error_file(filename);
fclose(input);
throw;
}
np::Array<T> d = np::empty<T>(np::Shape(length, data));
if(!fread(d.data, sizeof(T), np::prod(d.shape), input)){
print_error_file(filename);
fclose(input);
throw;
}
fclose(input);
return d;
}
#ifdef __CUDACC__
constexpr inline void _print_cuda_error_(const char* name, const cudaError_t& err) noexcept;
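// Deep-copies the data and shape buffers to device memory; the device copy
// carries no refcount, so it must be released manually with cudaFree() below.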
template<typename T>
np::Array<T> copyToDevice(const char* name, const np::Array<T>& array) noexcept {
const size_t array_size = np::prod(array.shape) * sizeof(T);
const size_t shape_size = array.shape.length * sizeof(size_t);
np::Array<T> d_array;
//_print_cuda_error_(name, cudaMalloc(&d_array.refcount, sizeof(size_t)));
_print_cuda_error_(name, cudaMalloc(&d_array.data, array_size));
//_print_cuda_error_(name, cudaMalloc(&d_array.shape.refcount, sizeof(size_t)));
_print_cuda_error_(name, cudaMalloc(&d_array.shape.data, shape_size));
d_array.shape.length = array.shape.length;
//_print_cuda_error_(name, cudaMemcpy(d_array.refcount, array.refcount, sizeof(size_t), cudaMemcpyHostToDevice));
_print_cuda_error_(name, cudaMemcpy(d_array.data, array.data, array_size, cudaMemcpyHostToDevice));
//_print_cuda_error_(name, cudaMemcpy(d_array.shape.refcount, array.shape.refcount, sizeof(size_t), cudaMemcpyHostToDevice));
_print_cuda_error_(name, cudaMemcpy(d_array.shape.data, array.shape.data, shape_size, cudaMemcpyHostToDevice));
#ifdef __DEBUG
d_array.shape.total = np::prod(array.shape);
#endif
return d_array;
}
template<typename T>
constexpr void cudaFree(const char* name, np::Array<T>& array) noexcept {
//_print_cuda_error_(name, cudaFree(array.refcount));
//array.refcount = nullptr;
_print_cuda_error_(name, cudaFree(array.data));
array.data = nullptr;
//_print_cuda_error_(name, cudaFree(array.shape.refcount));
//array.shape.refcount = nullptr;
_print_cuda_error_(name, cudaFree(array.shape.data));
array.shape.data = nullptr;
}
constexpr inline void _print_cuda_error_(const char* name, const cudaError_t& err) noexcept {
if (err != cudaSuccess) fprintf(stderr, "Error: %s = %d : %s\n", name, err, cudaGetErrorString(err));
}
#endif
int print(const np::Shape&) noexcept;
int print(const np::Array<uint8_t>&) noexcept;
int print(const np::Array<float64_t>&) noexcept;
int print(const np::Array<uint8_t>&, const np::Slice&) noexcept;
int print(const np::Array<uint32_t>&, const np::Slice&) noexcept;
int print(const np::Array<int32_t>&, const np::Slice&) noexcept;
int print(const np::Array<uint16_t>&, const np::Slice&) noexcept;
int print_feat(const np::Array<uint8_t>&, const np::Slice&) noexcept;

320
cpp/projet.cpp Normal file
View File

@ -0,0 +1,320 @@
#include <filesystem>
namespace fs = std::filesystem;
#include "data.hpp"
#include "toolbox.hpp"
#include "config.hpp"
#include "ViolaJones.hpp"
#include "ViolaJonesGPU.hpp"
#include "ViolaJonesCPU.hpp"
void test_float() noexcept;
#ifdef __DEBUG
// #define IDX_INSPECT 0
// #define IDX_INSPECT 2
#define IDX_INSPECT 4548
#define IDX_INSPECT_OFFSET 100
#endif
#if GPU_BOOSTED
#define LABEL "GPU"
#define apply_features apply_features_gpu
#define set_integral_image set_integral_image_gpu
#define argsort_2d argsort_2d_gpu
#else
#define LABEL "CPU"
#define apply_features apply_features_cpu
#define set_integral_image set_integral_image_cpu
#define argsort_2d argsort_2d_cpu
#endif
std::tuple<np::Array<int32_t>, np::Array<uint16_t>, np::Array<uint8_t>, np::Array<int32_t>, np::Array<uint8_t>> preprocessing() {
// Creating state saver folders if they don't exist already
if (SAVE_STATE)
for (const char* const folder_name : { "models", "out" })
fs::create_directory(folder_name);
printf("| %-49s | %-17s | %-29s |\n", "Preprocessing", "Time spent (ns)", "Formatted time spent");
printf("|%s|%s|%s|\n", S(51), S(19), S(31));
const auto [ X_train, y_train, X_test, y_test ] = state_saver<uint8_t, 4>("Loading sets", {"X_train", "y_train", "X_test", "y_test"},
FORCE_REDO, SAVE_STATE, OUT_DIR, load_datasets);
#ifdef __DEBUG
// print("X_train");
// print(X_train.shape);
// print(X_train, { IDX_INSPECT });
// print("X_test");
// print(X_test.shape);
// print(X_test, { IDX_INSPECT });
// print("y_train");
// print(y_train.shape);
// print(y_train, { IDX_INSPECT, IDX_INSPECT + IDX_INSPECT_OFFSET });
// print("y_test");
// print(y_test.shape);
// print(y_test, { IDX_INSPECT, IDX_INSPECT + IDX_INSPECT_OFFSET });
#endif
const np::Array<uint8_t> feats = state_saver<uint8_t>("Building features", "feats",
FORCE_REDO, SAVE_STATE, OUT_DIR, build_features, X_train.shape[1], X_train.shape[2]);
#ifdef __DEBUG
// print("feats");
// print(feats.shape);
// print_feat(feats, { IDX_INSPECT });
#endif
const np::Array<uint32_t> X_train_ii = state_saver<uint32_t>("Converting training set to integral images (" LABEL ")", "X_train_ii_" LABEL,
FORCE_REDO, SAVE_STATE, OUT_DIR, set_integral_image, X_train);
const np::Array<uint32_t> X_test_ii = state_saver<uint32_t>("Converting testing set to integral images (" LABEL ")", "X_test_ii_" LABEL,
FORCE_REDO, SAVE_STATE, OUT_DIR, set_integral_image, X_test);
#ifdef __DEBUG
// print("X_train_ii");
// print(X_train_ii.shape);
// print(X_train_ii, { IDX_INSPECT });
// print("X_test_ii");
// print(X_test_ii.shape);
// print(X_test_ii, { IDX_INSPECT });
// return {};
#endif
const np::Array<int32_t> X_train_feat = state_saver<int32_t>("Applying features to training set (" LABEL ")", "X_train_feat_" LABEL,
FORCE_REDO, SAVE_STATE, OUT_DIR, apply_features, feats, X_train_ii);
const np::Array<int32_t> X_test_feat = state_saver<int32_t>("Applying features to testing set (" LABEL ")", "X_test_feat_" LABEL,
FORCE_REDO, SAVE_STATE, OUT_DIR, apply_features, feats, X_test_ii);
#ifdef __DEBUG
// print("X_train_feat");
// print(X_train_feat.shape);
// print(X_train_feat, { IDX_INSPECT, IDX_INSPECT + IDX_INSPECT_OFFSET });
// print("X_test_feat");
// print(X_test_feat.shape);
// print(X_test_feat, { IDX_INSPECT, IDX_INSPECT + IDX_INSPECT_OFFSET });
#endif
// const Array<int> indices = measure_time_save<Array<int>>("Selecting best features", "indices", select_percentile, X_train_feat, d.y_train);
// const Array<int> indices = measure_time<Array<int>>("Selecting best features", select_percentile, X_train_feat, d.y_train);
#ifdef __DEBUG
// print_feature(indices);
#endif
const np::Array<uint16_t> X_train_feat_argsort = state_saver<uint16_t>("Precalculating training set argsort (" LABEL ")", "X_train_feat_argsort_" LABEL,
FORCE_REDO, SAVE_STATE, OUT_DIR, argsort_2d, X_train_feat);
#ifdef __DEBUG
print("X_train_feat_argsort");
print(X_train_feat_argsort.shape);
print(X_train_feat_argsort, { IDX_INSPECT, IDX_INSPECT + IDX_INSPECT_OFFSET });
#endif
// const np::Array<uint16_t> X_test_feat_argsort = state_saver<uint16_t>("Precalculating testing set argsort (" LABEL ")", "X_test_feat_argsort_" LABEL,
// FORCE_REDO, SAVE_STATE, OUT_DIR, argsort_2d, X_test_feat);
#ifdef __DEBUG
// print("X_test_feat_argsort");
// print(X_test_feat_argsort.shape);
// print(X_test_feat_argsort, { IDX_INSPECT, IDX_INSPECT + IDX_INSPECT_OFFSET });
#endif
return { X_train_feat, X_train_feat_argsort, y_train, X_test_feat, y_test };
}
void train(const np::Array<int32_t>& X_train_feat, const np::Array<uint16_t>& X_train_feat_argsort, const np::Array<uint8_t>& y_train) {
printf("\n| %-49s | %-17s | %-29s |\n", "Training", "Time spent (ns)", "Formatted time spent");
printf("|%s|%s|%s|\n", S(51), S(19), S(31));
for (const size_t T : TS) {
char title[BUFFER_SIZE] = { 0 };
char alphas_title[BUFFER_SIZE] = { 0 };
char final_classifiers_title[BUFFER_SIZE] = { 0 };
sprintf(title, "ViolaJones T = %-4lu (%s)", T, LABEL);
sprintf(alphas_title, "alphas_%lu_%s", T, LABEL);
sprintf(final_classifiers_title, "final_classifiers_%lu_%s", T, LABEL);
#ifdef __DEBUG
const auto [ alphas, final_classifiers ] = state_saver<float64_t, 2>(title, { alphas_title, final_classifiers_title },
#else
state_saver<float64_t, 2>(title, { alphas_title, final_classifiers_title },
#endif
FORCE_REDO, SAVE_STATE, MODEL_DIR, train_viola_jones, T, X_train_feat, X_train_feat_argsort, y_train);
#ifdef __DEBUG
print("alphas");
print(alphas);
print("final_classifiers");
print(final_classifiers);
#endif
}
}
void testing_and_evaluating(const np::Array<int32_t>& X_train_feat, const np::Array<uint8_t>& y_train, const np::Array<int32_t>& X_test_feat, const np::Array<uint8_t>& y_test) {
printf("\n| %-26s | Time spent (ns) (E) | %-29s | Time spent (ns) (T) | %-29s |\n", "Testing", "Formatted time spent (E)", "Formatted time spent (T)");
printf("|%s|%s|%s|%s|%s|\n", S(28), S(21), S(31), S(21), S(31));
constexpr const size_t NT = sizeof(TS) / sizeof(size_t);
std::array<std::array<float64_t, 8>, NT> results;
size_t i = 0;
for (const size_t T : TS) {
char title[BUFFER_SIZE] = { 0 };
char alphas_title[BUFFER_SIZE] = { 0 };
char final_classifiers_title[BUFFER_SIZE] = { 0 };
sprintf(title, "ViolaJones T = %-4lu (%s)", T, LABEL);
sprintf(alphas_title, MODEL_DIR "/alphas_%lu_%s.bin", T, LABEL);
sprintf(final_classifiers_title, MODEL_DIR "/final_classifiers_%lu_%s.bin", T, LABEL);
const np::Array<float64_t> alphas = load<float64_t>(alphas_title);
const np::Array<float64_t> final_classifiers = load<float64_t>(final_classifiers_title);
auto start = time();
const np::Array<uint8_t> y_pred_train = classify_viola_jones(alphas, final_classifiers, X_train_feat);
const long long t_pred_train = duration_ns(time() - start);
const float64_t e_acc = accuracy_score(y_train, y_pred_train);
const float64_t e_f1 = f1_score(y_train, y_pred_train);
float64_t e_FN, e_FP;
std::tie(std::ignore, e_FN, e_FP, std::ignore) = confusion_matrix(y_train, y_pred_train);
start = time();
const np::Array<uint8_t> y_pred_test = classify_viola_jones(alphas, final_classifiers, X_test_feat);
const long long t_pred_test = duration_ns(time() - start);
const float64_t t_acc = accuracy_score(y_test, y_pred_test);
const float64_t t_f1 = f1_score(y_test, y_pred_test);
float64_t t_FN, t_FP;
std::tie(std::ignore, t_FN, t_FP, std::ignore) = confusion_matrix(y_test, y_pred_test);
results[i++] = { e_acc, e_f1, e_FN, e_FP, t_acc, t_f1, t_FN, t_FP };
printf("| %-26s | %'19lld | %-29s | %'19lld | %-29s |\n", title, t_pred_train, format_time_ns(t_pred_train).c_str(), t_pred_test, format_time_ns(t_pred_test).c_str());
}
printf("\n| %-19s | ACC (E) | F1 (E) | FN (E) | FP (E) | ACC (T) | F1 (T) | FN (T) | FP (T) |\n", "Evaluating");
printf("|%s|%s|%s|%s|%s|%s|%s|%s|%s|\n", S(21), S(9), S(8), S(8), S(8), S(9), S(8), S(8), S(8));
i = 0;
for (const size_t T : TS) {
char title[BUFFER_SIZE] = { 0 };
sprintf(title, "ViolaJones T = %-4lu", T);
const auto [e_acc, e_f1, e_FN, e_FP, t_acc, t_f1, t_FN, t_FP] = results[i++];
printf("| %-19s | %'6.2f%% | %'6.2f | %'6.0f | %'6.0f | %6.2f%% | %'6.2f | %'6.0f | %'6.0f |\n", title, e_acc * 100, e_f1, e_FN, e_FP, t_acc * 100, t_f1, t_FN, t_FP);
}
}
void final_unit_test() {
printf("\n| %-49s | %-10s | %-17s | %-29s |\n", "Unit testing", "Test state", "Time spent (ns)", "Formatted time spent");
printf("|%s|%s|%s|%s|\n", S(51), S(12), S(19), S(31));
if(fs::exists(OUT_DIR "/X_train_ii_CPU.bin") && fs::exists(OUT_DIR "/X_train_ii_GPU.bin")){
const np::Array<uint32_t> X_train_ii_cpu = load<uint32_t>(OUT_DIR "/X_train_ii_CPU.bin");
const np::Array<uint32_t> X_train_ii_gpu = load<uint32_t>(OUT_DIR "/X_train_ii_GPU.bin");
benchmark_function_void("X_train_ii CPU vs GPU", unit_test_cpu_vs_gpu<uint32_t>, X_train_ii_cpu, X_train_ii_gpu);
}
if(fs::exists(OUT_DIR "/X_test_ii_CPU.bin") && fs::exists(OUT_DIR "/X_test_ii_GPU.bin")){
const np::Array<uint32_t> X_test_ii_cpu = load<uint32_t>(OUT_DIR "/X_test_ii_CPU.bin");
const np::Array<uint32_t> X_test_ii_gpu = load<uint32_t>(OUT_DIR "/X_test_ii_GPU.bin");
benchmark_function_void("X_test_ii CPU vs GPU", unit_test_cpu_vs_gpu<uint32_t>, X_test_ii_cpu, X_test_ii_gpu);
}
if(fs::exists(OUT_DIR "/X_train_feat_CPU.bin")){
const np::Array<int32_t> X_train_feat = load<int32_t>(OUT_DIR "/X_train_feat_CPU.bin");
if(fs::exists(OUT_DIR "/X_train_feat_GPU.bin")){
const np::Array<int32_t> X_train_feat_gpu = load<int32_t>(OUT_DIR "/X_train_feat_GPU.bin");
benchmark_function_void("X_train_feat CPU vs GPU", unit_test_cpu_vs_gpu<int32_t>, X_train_feat, X_train_feat_gpu);
}
np::Array<uint16_t> X_train_feat_argsort_cpu;
uint8_t loaded = 0;
if(fs::exists(OUT_DIR "/X_train_feat_argsort_CPU.bin")){
X_train_feat_argsort_cpu = std::move(load<uint16_t>(OUT_DIR "/X_train_feat_argsort_CPU.bin"));
++loaded;
benchmark_function_void("argsort_2D training set (CPU)", unit_test_argsort_2d<int32_t>, X_train_feat, X_train_feat_argsort_cpu);
}
np::Array<uint16_t> X_train_feat_argsort_gpu;
if(fs::exists(OUT_DIR "/X_train_feat_argsort_GPU.bin")){
X_train_feat_argsort_gpu = std::move(load<uint16_t>(OUT_DIR "/X_train_feat_argsort_GPU.bin"));
++loaded;
benchmark_function_void("argsort_2D training set (GPU)", unit_test_argsort_2d<int32_t>, X_train_feat, X_train_feat_argsort_gpu);
}
if (loaded == 2)
benchmark_function_void("X_train_feat_argsort CPU vs GPU", unit_test_cpu_vs_gpu<uint16_t>, X_train_feat_argsort_cpu, X_train_feat_argsort_gpu);
}
if(fs::exists(OUT_DIR "/X_test_feat_CPU.bin")){
const np::Array<int32_t> X_test_feat = load<int32_t>(OUT_DIR "/X_test_feat_CPU.bin");
if(fs::exists(OUT_DIR "/X_test_feat_GPU.bin")){
const np::Array<int32_t> X_test_feat_gpu = load<int32_t>(OUT_DIR "/X_test_feat_GPU.bin");
benchmark_function_void("X_test_feat CPU vs GPU", unit_test_cpu_vs_gpu<int32_t>, X_test_feat, X_test_feat_gpu);
}
np::Array<uint16_t> X_test_feat_argsort_cpu;
uint8_t loaded = 0;
if(fs::exists(OUT_DIR "/X_test_feat_argsort_CPU.bin")){
X_test_feat_argsort_cpu = std::move(load<uint16_t>(OUT_DIR "/X_test_feat_argsort_CPU.bin"));
++loaded;
benchmark_function_void("argsort_2D testing set (CPU)", unit_test_argsort_2d<int32_t>, X_test_feat, X_test_feat_argsort_cpu);
}
np::Array<uint16_t> X_test_feat_argsort_gpu;
if(fs::exists(OUT_DIR "/X_test_feat_argsort_GPU.bin")){
X_test_feat_argsort_gpu = std::move(load<uint16_t>(OUT_DIR "/X_test_feat_argsort_GPU.bin"));
++loaded;
benchmark_function_void("argsort_2D testing set (GPU)", unit_test_argsort_2d<int32_t>, X_test_feat, X_test_feat_argsort_gpu);
}
if (loaded == 2)
benchmark_function_void("X_test_feat_argsort CPU vs GPU", unit_test_cpu_vs_gpu<uint16_t>, X_test_feat_argsort_cpu, X_test_feat_argsort_gpu);
}
char title[BUFFER_SIZE] = { 0 };
char alphas_title[BUFFER_SIZE] = { 0 };
char final_classifiers_title[BUFFER_SIZE] = { 0 };
for (const size_t T : TS) {
sprintf(alphas_title, MODEL_DIR "/alphas_%lu_CPU.bin", T);
if(!fs::exists(alphas_title)) continue;
const np::Array<float64_t> alphas = load<float64_t>(alphas_title);
sprintf(final_classifiers_title, MODEL_DIR "/final_classifiers_%lu_CPU.bin", T);
if(!fs::exists(final_classifiers_title)) continue;
const np::Array<float64_t> final_classifiers = load<float64_t>(final_classifiers_title);
sprintf(alphas_title, MODEL_DIR "/alphas_%lu_GPU.bin", T);
if(!fs::exists(alphas_title)) continue;
const np::Array<float64_t> alphas_gpu = load<float64_t>(alphas_title);
sprintf(final_classifiers_title, MODEL_DIR "/final_classifiers_%lu_GPU.bin", T);
if(!fs::exists(final_classifiers_title)) continue;
const np::Array<float64_t> final_classifiers_gpu = load<float64_t>(final_classifiers_title);
sprintf(title, "alphas %ld CPU vs GPU", T);
benchmark_function_void(title, unit_test_cpu_vs_gpu<float64_t>, alphas, alphas_gpu);
sprintf(title, "final_classifiers %ld CPU vs GPU", T);
benchmark_function_void(title, unit_test_cpu_vs_gpu<float64_t>, final_classifiers, final_classifiers_gpu);
}
}
int main(){
#ifdef __DEBUG
printf("| %-49s | %-17s | %-29s |\n", "Unit testing", "Time spent (ns)", "Formatted time spent");
printf("|%s|%s|%s|\n", S(51), S(19), S(31));
benchmark_function_void("Testing GPU capabilities 1D", test_working, 3 + (1<<29));
benchmark_function_void("Testing GPU capabilities 2D", test_working_2d, 3 + (1<<15), 2 + (1<<14));
benchmark_function_void("Testing GPU capabilities 3D", test_working_3d, 9 + (1<<10), 5 + (1<<10), 7 + (1<<9));
benchmark_function_void("Testing toolbox", toolbox_unit_test);
// benchmark_function_void("Testing floating capabilities", test_float);
printf("\n");
#endif
setlocale(LC_NUMERIC, ""); // Allow proper number display
const auto [ X_train_feat, X_train_feat_argsort, y_train, X_test_feat, y_test ] = preprocessing();
train(X_train_feat, X_train_feat_argsort, y_train);
testing_and_evaluating(X_train_feat, y_train, X_test_feat, y_test);
final_unit_test();
#ifdef __DEBUG
printf("\nAFTER CLEANUP\n");
#endif
return EXIT_SUCCESS;
}

63
cpp/test.cpp Normal file
View File

@ -0,0 +1,63 @@
#include <iostream>
#include <iomanip>
#include "data.hpp"
#include "toolbox.hpp"
#define PBSTR "||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||"
#define PBWIDTH 60
void printProgress(const float64_t& percentage) noexcept {
const uint64_t val = static_cast<uint64_t>(percentage * 100);
const int lpad = static_cast<int>(percentage * PBWIDTH);
const int rpad = PBWIDTH - lpad;
printf("%3lu%% [%.*s%*s]\r", val, lpad, PBSTR, rpad, "");
fflush(stdout);
}
void clearProgress() noexcept {
// Progress bar width + space before + num space + space after
printf("%*c\r", PBWIDTH + 1 + 3 + 3, ' ');
}
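// Fills three arrays of N elements (half 0.1, half 1.0), multiplies them
// element-wise and accumulates the mean, to expose floating-point drift
// across the tested precisions.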
template<typename T>
void test(const uint64_t& N) noexcept {
#ifdef __DEBUG
printf("DETERMINISTIC for N=%s of %s sized %s\n", thousand_sep(N).c_str(), typeid(T).name(), format_byte_size(sizeof(T)).c_str());
print("Estimating memory footprint at : " + format_byte_size(3 * N * sizeof(T)));
#endif
T *a = new T[N], *b = new T[N], *c = new T[N];
T mean = static_cast<T>(0.0);
const size_t percent = N / 100 ? N / 100 : 1; // avoid a modulo by zero when N < 100
for(size_t i = 0; i < N; ++i){
if (i % percent == 0) printProgress(static_cast<float64_t>(i) / N);
a[i] = static_cast<T>(i < (N >> 1) ? 0.1 : 1.0);
b[i] = static_cast<T>(1.0);
c[i] = a[i] * b[i];
mean += c[i];
}
mean /= static_cast<T>(N);
clearProgress();
std::cout << mean << std::endl;
delete[] a, delete[] b, delete[] c;
}
void test_float() noexcept {
std::cout << std::setprecision(1<<8);
const uint64_t N = static_cast<uint64_t>(1)<<28;
test<float128_t>(N);
test<float64_t>(N);
test<float32_t>(N);
//printf("%.128af\n", static_cast<float64_t>(1) / 3);
//std::cout << static_cast<float64_t>(1) / 3 << std::endl;
//std::cout << std::hexfloat << static_cast<float64_t>(1) / 3 << std::endl;
//printf("%.128Lf\n", static_cast<long float64_t>(1) / 3);
//printf("%.128lf\n", static_cast<float64_t>(1) / 3);
//printf("%.128f\n", static_cast<float>(1) / 3);
}

133
cpp/toolbox.cpp Normal file
View File

@ -0,0 +1,133 @@
#include "toolbox.hpp"
#include <numeric>
#include <assert.h>
#include <stdint.h>
#include <algorithm>
static constexpr size_t N_TIMES = 10;
static const std::array<const char*, N_TIMES> time_formats = { "ns", "us", "ms", "s", "m", "h", "j", "w", "M", "y" };
static constexpr std::array<uint16_t, N_TIMES> time_numbers = { 1, 1000, 1000, 1000, 60, 60, 24, 7, 4, 12 };
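// Each entry of time_numbers is how many of the previous unit make up the next
// one ("j" = days, from the French "jour"; a month counts as 4 weeks, a year as 12 such months).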
static const uint64_t total_time = std::accumulate(time_numbers.begin(), time_numbers.end(), (uint64_t)1, std::multiplies<uint64_t>());
/**
* @brief Format the time in seconds in human readable format.
*
* @param time Time in seconds
* @return std::string The formatted human readable string.
*/
std::string format_time(uint64_t time) noexcept {
return time < 2 ? std::to_string(time) + "s" : format_time_ns(time * (uint64_t)1e9);
}
/**
* @brief Format the time in nanoseconds in human readable format.
*
* @param time Time in nanoseconds
* @return std::string The formatted human readable string.
*/
std::string format_time_ns(uint64_t time) noexcept {
if (time == 0)
return "0ns";
uint64_t prod = total_time;
std::string s = "";
uint64_t res;
for (int i = N_TIMES - 1; i >= 0; --i) {
if (time >= prod) {
res = time / prod;
time %= prod;
s += std::to_string(res) + time_formats[i] + " ";
}
prod /= time_numbers[i];
}
if (s.back() == ' ')
s.pop_back();
return s;
}
static const constexpr size_t N_BYTES = 7;
static const constexpr std::array<const char*, N_BYTES> bytes_formats = { "", "K", "M", "G", "T", "P", "E" }; //, "Z", "Y" };
static const constexpr uint64_t total_bytes = static_cast<uint64_t>(1)<<(10 * (N_BYTES - 1));
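/**
 * @brief Format a size in bytes in human readable format (powers of 1024).
 *
 * @param bytes Number of bytes
 * @return std::string The formatted human readable string.
 */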
std::string format_byte_size(uint64_t bytes) noexcept {
if (bytes == 0)
return "0B";
uint64_t prod = total_bytes;
std::string s = "";
uint64_t res;
for (size_t i = N_BYTES; i > 0; --i) {
if (bytes >= prod) {
res = bytes / prod;
bytes %= prod;
s += std::to_string(res) + bytes_formats[i - 1] + "B ";
}
prod /= static_cast<uint64_t>(1)<<10;
}
if (s.back() == ' ')
s.pop_back();
return s;
}
void toolbox_unit_test() noexcept {
assert(std::string("0B") == format_byte_size(static_cast<uint64_t>(0)));
assert(std::string("1B") == format_byte_size(static_cast<uint64_t>(1)));
assert(std::string("1KB") == format_byte_size(static_cast<uint64_t>(1)<<10));
assert(std::string("1MB") == format_byte_size(static_cast<uint64_t>(1)<<20));
assert(std::string("1GB") == format_byte_size(static_cast<uint64_t>(1)<<30));
assert(std::string("1PB") == format_byte_size(static_cast<uint64_t>(1)<<40));
assert(std::string("1EB") == format_byte_size(static_cast<uint64_t>(1)<<50));
assert(std::string("1ZB") == format_byte_size(static_cast<uint64_t>(1)<<60));
//assert(std::string("1YB") == format_byte_size(static_cast<uint64_t>(1)<<70));
// UINT64_MAX == 18446744073709551615I64u == -1
assert(std::string("15ZB 1023EB 1023PB 1023GB 1023MB 1023KB 1023B") == format_byte_size(static_cast<uint64_t>(-1)));
assert(std::string("0s") == format_time(static_cast<uint64_t>(0)));
assert(std::string("1s") == format_time(static_cast<uint64_t>(1)));
assert(std::string("1m") == format_time(static_cast<uint64_t>(60)));
assert(std::string("1h") == format_time(static_cast<uint64_t>(3600)));
assert(std::string("1j") == format_time(static_cast<uint64_t>(86400)));
assert(std::string("1w") == format_time(static_cast<uint64_t>(604800)));
assert(std::string("1M") == format_time(static_cast<uint64_t>(2419200)));
assert(std::string("1y") == format_time(static_cast<uint64_t>(29030400)));
assert(std::string("0ns") == format_time_ns(static_cast<uint64_t>(0)));
assert(std::string("1ns") == format_time_ns(static_cast<uint64_t>(1)));
assert(std::string("1us") == format_time_ns(static_cast<uint64_t>(1e3)));
assert(std::string("1ms") == format_time_ns(static_cast<uint64_t>(1e6)));
assert(std::string("1s") == format_time_ns(static_cast<uint64_t>(1e9)));
assert(std::string("1m") == format_time_ns(static_cast<uint64_t>(6e10)));
assert(std::string("1h") == format_time_ns(static_cast<uint64_t>(36e11)));
assert(std::string("1j") == format_time_ns(static_cast<uint64_t>(864e11)));
assert(std::string("1w") == format_time_ns(static_cast<uint64_t>(6048e11)));
assert(std::string("1M") == format_time_ns(static_cast<uint64_t>(24192e11)));
assert(std::string("1y") == format_time_ns(static_cast<uint64_t>(290304e11)));
// UINT64_MAX == 18446744073709551615I64u == -1
assert(std::string("635y 5M 3j 23h 34m 33s 709ms 551us 615ns") == format_time_ns(static_cast<uint64_t>(-1)));
}
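/**
 * @brief Format an integer with comma thousands separators (e.g. 1234567 -> "1,234,567").
 *
 * @param k Number to format
 * @return std::string The formatted human readable string.
 */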
std::string thousand_sep(uint64_t k) noexcept {
std::string s = "", n = std::to_string(k);
uint8_t c = 0;
for (const auto& n_i : n) {
++c;
s.push_back(n_i);
if (c == 3) {
s.push_back(',');
c = 0;
}
}
std::reverse(s.begin(), s.end());
if (s.size() % 4 == 0)
s.erase(s.begin());
return s;
}

13
cpp/toolbox.hpp Normal file
View File

@ -0,0 +1,13 @@
#pragma once
#include <array>
#include <chrono>
#include <string>
#define duration_ns(a) std::chrono::duration_cast<std::chrono::nanoseconds>(a).count()
#define time() std::chrono::high_resolution_clock::now()
std::string format_time(uint64_t) noexcept;
std::string format_time_ns(uint64_t) noexcept;
std::string format_byte_size(uint64_t) noexcept;
void toolbox_unit_test() noexcept;
std::string thousand_sep(uint64_t) noexcept;