#include #include "data.hpp" #include "toolbox.hpp" #include "ViolaJones.hpp" #include "config.hpp" static __global__ void __test_working_kernel__(const np::Array d_x, np::Array d_y, const size_t length) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < length) d_y[i] = d_x[i] * i; } void test_working(const size_t& length) noexcept { const size_t size = length * sizeof(size_t); #if __DEBUG print("Estimating memory footprint at : " + format_byte_size(2 * size)); #endif np::Array x = np::empty({ length }), y = np::empty({ length }); size_t i; for (i = 0; i < length; ++i) x[i] = i; np::Array d_x = copyToDevice("x", x), d_y = copyToDevice("y", y); const size_t dimX = static_cast(std::ceil(static_cast(length) / static_cast(NB_THREADS))); const dim3 dimGrid(dimX); constexpr const dim3 dimBlock(NB_THREADS); __test_working_kernel__<<>>(d_x, d_y, length); _print_cuda_error_("synchronize", cudaDeviceSynchronize()); _print_cuda_error_("memcpy d_y", cudaMemcpy(y.data, d_y.data, size, cudaMemcpyDeviceToHost)); size_t ne = 0; for (i = 0; i < length; ++i) if (y[i] != x[i] * i) ++ne; if (ne != 0) fprintf(stderr, "Invalid result : %lu/%lu <=> %f%%\n", ne, length, static_cast(ne) / static_cast(length)); cudaFree("d_x", d_x); cudaFree("d_y", d_y); } static __global__ void __test_working_kernel_2d__(const np::Array d_x, np::Array d_y, const size_t length) { const size_t idx = threadIdx.x * blockDim.y + threadIdx.y; const size_t idy = blockIdx.x * gridDim.y + blockIdx.y; const size_t i = idy * NB_THREADS_2D_X * NB_THREADS_2D_Y + idx; if (i < length) d_y[i] = d_x[i] * i; } void test_working_2d(const size_t& N1, const size_t& N2) noexcept { const size_t length = N1 * N2; const size_t size = length * sizeof(size_t); #if __DEBUG print("Estimating memory footprint at : " + format_byte_size(2 * size)); #endif np::Array x = np::empty({ length }), y = np::empty({ length }); size_t i; for (i = 0; i < length; ++i) x[i] = i; np::Array d_x = copyToDevice("x", x), d_y = copyToDevice("y", y); const size_t dimX = static_cast(std::ceil(static_cast(N1) / static_cast(NB_THREADS_2D_X))); const size_t dimY = static_cast(std::ceil(static_cast(N2) / static_cast(NB_THREADS_2D_Y))); const dim3 dimGrid(dimX, dimY); constexpr const dim3 dimBlock(NB_THREADS_2D_X, NB_THREADS_2D_Y); __test_working_kernel_2d__<<>>(d_x, d_y, length); _print_cuda_error_("synchronize", cudaDeviceSynchronize()); _print_cuda_error_("memcpy d_y", cudaMemcpy(y.data, d_y.data, size, cudaMemcpyDeviceToHost)); size_t ne = 0; for (i = 0; i < length; ++i) if (y[i] != x[i] * i) ++ne; if (ne != 0) fprintf(stderr, "Invalid result : %lu/%lu <=> %f%%\n", ne, length, static_cast(ne) / static_cast(length)); cudaFree("d_x", d_x); cudaFree("d_y", d_y); } static __global__ void __test_working_kernel_3d__(const np::Array d_x, np::Array d_y, const size_t length) { const size_t idx = (threadIdx.x * blockDim.y + threadIdx.y) * blockDim.z + threadIdx.z; const size_t idy = (blockIdx.x * gridDim.y + blockIdx.y) * gridDim.z + blockIdx.z; const size_t i = idy * NB_THREADS_3D_X * NB_THREADS_3D_Y * NB_THREADS_3D_Z + idx; if (i < length) d_y[i] = d_x[i] * i; } void test_working_3d(const size_t& N1, const size_t& N2, const size_t& N3) noexcept { const size_t length = N1 * N2 * N3; const size_t size = length * sizeof(size_t); #if __DEBUG print("Estimating memory footprint at : " + format_byte_size(2 * size)); #endif np::Array x = np::empty({ length }), y = np::empty({ length }); size_t i; for (i = 0; i < length; ++i) x[i] = i; np::Array d_x = copyToDevice("x", x), d_y = 
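/**
 * CPU reference implementation of an exclusive prefix sum (scan) along the last
 * axis of a 3D array : every run of shape[2] elements is replaced by the sum of
 * the elements preceding it. It is also used as the host-side fallback of
 * __scanGPU_3d__ below.
 */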
copyToDevice("y", y); const size_t dimX = static_cast(std::ceil(static_cast(N1) / static_cast(NB_THREADS_3D_X))); const size_t dimY = static_cast(std::ceil(static_cast(N2) / static_cast(NB_THREADS_3D_Y))); const size_t dimZ = static_cast(std::ceil(static_cast(N3) / static_cast(NB_THREADS_3D_Z))); const dim3 dimGrid(dimX, dimY, dimZ); constexpr const dim3 dimBlock(NB_THREADS_3D_X, NB_THREADS_3D_Y, NB_THREADS_3D_Z); __test_working_kernel_3d__<<>>(d_x, d_y, length); _print_cuda_error_("synchronize", cudaDeviceSynchronize()); _print_cuda_error_("memcpy d_y", cudaMemcpy(y.data, d_y.data, size, cudaMemcpyDeviceToHost)); size_t ne = 0; for (i = 0; i < length; ++i) if (y[i] != x[i] * i) ++ne; if (ne != 0) fprintf(stderr, "Invalid result : %lu/%lu <=> %f%%\n", ne, length, static_cast(ne) / static_cast(length)); cudaFree("d_x", d_x); cudaFree("d_y", d_y); } static np::Array __scanCPU_3d__(const np::Array& X) noexcept { np::Array X_scan = np::empty(X.shape); const size_t total = np::prod(X_scan.shape); const size_t i_step = np::prod(X_scan.shape, 1); for(size_t x = 0; x < total; x += i_step) for(size_t y = 0; y < i_step; y += X_scan.shape[2]){ uint32_t cum = 0; for(size_t z = 0; z < X_scan.shape[2]; ++z){ const size_t idx = x + y + z; cum += X[idx]; X_scan[idx] = cum - X[idx]; } } return X_scan; } static __global__ void __kernel_scan_3d__(const uint16_t n, const uint16_t j, np::Array d_inter, np::Array d_X) { const size_t x_coor = blockIdx.x * blockDim.x + threadIdx.x; const size_t y_coor = blockIdx.y * blockDim.y + threadIdx.y; __shared__ uint32_t sA[NB_THREADS_2D_X * NB_THREADS_2D_Y]; sA[threadIdx.x * NB_THREADS_2D_Y + threadIdx.y] = (x_coor < n && y_coor) < j ? d_X[blockIdx.z * NB_THREADS_2D_X * NB_THREADS_2D_Y + y_coor * NB_THREADS_2D_Y + x_coor] : 0; __syncthreads(); size_t k = threadIdx.x; for(size_t d = 0; d < M; ++d){ k *= 2; const size_t i1 = k + std::pow(2, d) - 1; const size_t i2 = k + std::pow(2, d + 1) - 1; if(i2 >= blockDim.x) break; sA[i2 * NB_THREADS_2D_Y + threadIdx.y] += sA[i1 * NB_THREADS_2D_Y + threadIdx.y]; } __syncthreads(); if(threadIdx.x == 0){ d_inter[blockIdx.z * d_inter.shape[1] * d_inter.shape[2] + y_coor * d_inter.shape[2] + blockIdx.x] = sA[(blockDim.x - 1) * NB_THREADS_2D_Y + threadIdx.y]; sA[(blockDim.x - 1) * NB_THREADS_2D_Y + threadIdx.y] = 0; } __syncthreads(); k = std::pow(2, M + 1) * threadIdx.x; for(int64_t d = M - 1; d > -1; --d){ k = k / 2; const size_t i1 = k + std::pow(2, d) - 1; const size_t i2 = k + std::pow(2, d + 1) - 1; if(i2 >= blockDim.x) continue; const uint32_t t = sA[i1 * NB_THREADS_2D_Y + threadIdx.y]; sA[i1 * NB_THREADS_2D_Y + threadIdx.y]= sA[i2 * NB_THREADS_2D_Y + threadIdx.y]; sA[i2 * NB_THREADS_2D_Y + threadIdx.y] += t; } __syncthreads(); if(x_coor < n && y_coor < j) d_X[blockIdx.z * d_X.shape[1] * d_X.shape[2] + y_coor * d_X.shape[2] + x_coor] = sA[threadIdx.x * NB_THREADS_2D_Y + threadIdx.y]; } static __global__ void __add_3d__(np::Array d_X, const np::Array d_s, const uint16_t n, const uint16_t m) { const size_t x_coor = blockIdx.x * blockDim.x + threadIdx.x; const size_t y_coor = blockIdx.y * blockDim.y + threadIdx.y; if(x_coor < n && y_coor < m) d_X[blockIdx.z * d_X.shape[1] * d_X.shape[2] + y_coor * d_X.shape[2] + x_coor] += d_s[blockIdx.z * d_X.shape[1] * d_X.shape[2] + y_coor * d_X.shape[2] + blockIdx.x]; } static np::Array __scanGPU_3d__(const np::Array& X) noexcept { np::Array X_scan = np::empty(X.shape); const size_t k = X.shape[0]; const size_t height = X.shape[1]; const size_t n = X.shape[2]; const size_t n_block_x = 
static np::Array<uint32_t> __scanGPU_3d__(const np::Array<uint32_t>& X) noexcept {
	np::Array<uint32_t> X_scan = np::empty<uint32_t>(X.shape);

	const size_t k = X.shape[0];
	const size_t height = X.shape[1];
	const size_t n = X.shape[2];
	const size_t n_block_x = static_cast<size_t>(std::ceil(static_cast<float64_t>(X.shape[1]) / static_cast<float64_t>(NB_THREADS_2D_X)));
	const size_t n_block_y = static_cast<size_t>(std::ceil(static_cast<float64_t>(X.shape[2]) / static_cast<float64_t>(NB_THREADS_2D_Y)));

	np::Array<uint32_t> d_X = copyToDevice("X", X);
	np::Array<uint32_t> inter = np::empty<uint32_t>({ k, height, n_block_x });
	np::Array<uint32_t> d_inter = copyToDevice("inter", inter);

	const dim3 dimGrid(n_block_x, n_block_y, k);
	constexpr const dim3 dimBlock(NB_THREADS_2D_X, NB_THREADS_2D_Y);
	__kernel_scan_3d__<<<dimGrid, dimBlock>>>(n, height, d_inter, d_X);
	_print_cuda_error_("synchronize", cudaDeviceSynchronize());
	_print_cuda_error_("memcpy d_inter", cudaMemcpy(inter.data, d_inter.data, np::prod(inter.shape) * sizeof(uint32_t), cudaMemcpyDeviceToHost));

	if(n_block_x >= NB_THREADS_2D_X){
		// Many blocks : scan the per-block sums recursively on the GPU and add them back on the device
		np::Array<uint32_t> sums = __scanGPU_3d__(inter);
		np::Array<uint32_t> d_s = copyToDevice("sums", sums);
		__add_3d__<<<dimGrid, dimBlock>>>(d_X, d_s, n, height);
		_print_cuda_error_("synchronize", cudaDeviceSynchronize());
		_print_cuda_error_("memcpy d_X", cudaMemcpy(X_scan.data, d_X.data, np::prod(X_scan.shape) * sizeof(uint32_t), cudaMemcpyDeviceToHost));
	} else {
		// Few blocks : scan the per-block sums on the CPU and add them back on the host
		np::Array<uint32_t> sums = __scanCPU_3d__(inter);
		_print_cuda_error_("memcpy d_X", cudaMemcpy(X_scan.data, d_X.data, np::prod(X_scan.shape) * sizeof(uint32_t), cudaMemcpyDeviceToHost));
		for(size_t p = 0; p < k; ++p)
			for(size_t h = 0; h < height; ++h)
				for(size_t i = 1; i < n_block_x; ++i)
					for(size_t j = 0; j < NB_THREADS_2D_X; ++j){
						const size_t idx = i * NB_THREADS_2D_X + j;
						if(idx < n){
							const size_t idy = p * X_scan.shape[1] * X_scan.shape[2] + h * X_scan.shape[2];
							X_scan[idy + idx] += sums[idy + i];
						}
					}
	}

	return X_scan;
}

static __global__ void __transpose_kernel__(const np::Array<uint32_t> d_X, np::Array<uint32_t> d_Xt) {
	__shared__ uint32_t temp[NB_THREADS_2D_X * NB_THREADS_2D_Y];

	size_t x = blockIdx.x * blockDim.x + threadIdx.x;
	size_t y = blockIdx.y * blockDim.y + threadIdx.y;
	if(x < d_X.shape[1] && y < d_X.shape[2])
		temp[threadIdx.y * NB_THREADS_2D_Y + threadIdx.x] = d_X[blockIdx.z * d_X.shape[1] * d_X.shape[2] + x * d_X.shape[2] + y];
	__syncthreads();

	x = blockIdx.y * blockDim.y + threadIdx.x;
	y = blockIdx.x * blockDim.x + threadIdx.y;
	if(x < d_X.shape[2] && y < d_X.shape[1])
		d_Xt[blockIdx.z * d_Xt.shape[1] * d_Xt.shape[2] + x * d_Xt.shape[2] + y] = temp[threadIdx.x * NB_THREADS_2D_Y + threadIdx.y];
}

static np::Array<uint32_t> __transpose_3d__(const np::Array<uint32_t>& X) noexcept {
	np::Array<uint32_t> Xt = np::empty<uint32_t>({ X.shape[0], X.shape[2], X.shape[1] });

	np::Array<uint32_t> d_X = copyToDevice("X", X);
	np::Array<uint32_t> d_Xt = copyToDevice("Xt", Xt);

	const size_t n_block_x = static_cast<size_t>(std::ceil(static_cast<float64_t>(X.shape[1]) / static_cast<float64_t>(NB_THREADS_2D_X)));
	const size_t n_block_y = static_cast<size_t>(std::ceil(static_cast<float64_t>(X.shape[2]) / static_cast<float64_t>(NB_THREADS_2D_Y)));
	const dim3 dimGrid(n_block_x, n_block_y, X.shape[0]);
	constexpr const dim3 dimBlock(NB_THREADS_2D_X, NB_THREADS_2D_Y);
	__transpose_kernel__<<<dimGrid, dimBlock>>>(d_X, d_Xt);
	_print_cuda_error_("synchronize", cudaDeviceSynchronize());
	_print_cuda_error_("memcpy d_Xt", cudaMemcpy(Xt.data, d_Xt.data, np::prod(Xt.shape) * sizeof(uint32_t), cudaMemcpyDeviceToHost));

	cudaFree("X", d_X);
	cudaFree("Xt", d_Xt);

	return Xt;
}

np::Array<uint32_t> set_integral_image_gpu(const np::Array<uint8_t>& X) noexcept {
	np::Array<uint32_t> X_ii = np::astype<uint32_t>(X);
	X_ii = __scanCPU_3d__(X_ii);
	X_ii = __transpose_3d__(X_ii);
	X_ii = __scanCPU_3d__(X_ii);
	return __transpose_3d__(X_ii);
}
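/**
 * Sums the pixels of the rectangle (x, y, w, h) of one integral image, j being
 * the offset of that image inside d_X_ii :
 * ii(y+h, x+w) + ii(y, x) - ii(y+h, x) - ii(y, x+w).
 */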
static inline __device__ int16_t __compute_feature__(const np::Array<uint32_t>& d_X_ii, const size_t& j, const int16_t& x, const int16_t& y,
	const int16_t& w, const int16_t& h) noexcept {
	const size_t _y = y * d_X_ii.shape[1] + x;
	const size_t _yh = _y + h * d_X_ii.shape[1];
	return d_X_ii[j + _yh + w] + d_X_ii[j + _y] - d_X_ii[j + _yh] - d_X_ii[j + _y + w];
}

static __global__ void __apply_feature_kernel__(int32_t* d_X_feat, const np::Array<uint8_t> d_feats, const np::Array<uint32_t> d_X_ii) {
	size_t i = blockIdx.x * blockDim.x + threadIdx.x;
	size_t j = blockIdx.y * blockDim.y + threadIdx.y;

	if (i >= d_feats.shape[0] || j >= d_X_ii.shape[0])
		return;

	const size_t k = i * d_X_ii.shape[0] + j;
	i *= np::prod(d_feats.shape, 1);
	j *= np::prod(d_X_ii.shape, 1);

	// Each feature is stored as four rectangles (x, y, w, h) : two added, two subtracted
	const int16_t p1 = __compute_feature__(d_X_ii, j, d_feats[i + 0], d_feats[i + 1], d_feats[i + 2], d_feats[i + 3]);
	const int16_t p2 = __compute_feature__(d_X_ii, j, d_feats[i + 4], d_feats[i + 5], d_feats[i + 6], d_feats[i + 7]);
	const int16_t n1 = __compute_feature__(d_X_ii, j, d_feats[i + 8], d_feats[i + 9], d_feats[i + 10], d_feats[i + 11]);
	const int16_t n2 = __compute_feature__(d_X_ii, j, d_feats[i + 12], d_feats[i + 13], d_feats[i + 14], d_feats[i + 15]);
	d_X_feat[k] = static_cast<int32_t>(p1 + p2) - static_cast<int32_t>(n1 + n2);
}

np::Array<int32_t> apply_features_gpu(const np::Array<uint8_t>& feats, const np::Array<uint32_t>& X_ii) noexcept {
	const np::Array<int32_t> X_feat = np::empty<int32_t>({ feats.shape[0], X_ii.shape[0] });

	int32_t* d_X_feat;
	_print_cuda_error_("malloc d_X_feat", cudaMalloc(&d_X_feat, np::prod(X_feat.shape) * sizeof(int32_t)));
	np::Array<uint32_t> d_X_ii = copyToDevice("X_ii", X_ii);
	np::Array<uint8_t> d_feats = copyToDevice("feats", feats);

	const size_t dimX = static_cast<size_t>(std::ceil(static_cast<float64_t>(feats.shape[0]) / static_cast<float64_t>(NB_THREADS_2D_X)));
	const size_t dimY = static_cast<size_t>(std::ceil(static_cast<float64_t>(X_ii.shape[0]) / static_cast<float64_t>(NB_THREADS_2D_Y)));
	const dim3 dimGrid(dimX, dimY);
	constexpr const dim3 dimBlock(NB_THREADS_2D_X, NB_THREADS_2D_Y);
	__apply_feature_kernel__<<<dimGrid, dimBlock>>>(d_X_feat, d_feats, d_X_ii);
	_print_cuda_error_("synchronize", cudaDeviceSynchronize());
	_print_cuda_error_("memcpy X_feat", cudaMemcpy(X_feat.data, d_X_feat, np::prod(X_feat.shape) * sizeof(int32_t), cudaMemcpyDeviceToHost));

	_print_cuda_error_("free d_X_feat", cudaFree(d_X_feat));
	cudaFree("free d_feats", d_feats);
	cudaFree("free d_X_ii", d_X_ii);

	return X_feat;
}
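/**
 * One thread per feature : walks the training samples in ascending order of the
 * feature value (via d_X_feat_argsort) while accumulating positive and negative
 * weights, and keeps the threshold and polarity that minimise the weighted
 * error. This is the weak-classifier (decision stump) selection step of the
 * Viola-Jones / AdaBoost training loop.
 */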
static __global__ void __train_weak_clf_kernel__(np::Array<float64_t> d_classifiers, const np::Array<uint8_t> d_y, const np::Array<int32_t> d_X_feat,
	const np::Array<uint16_t> d_X_feat_argsort, const np::Array<float64_t> d_weights, const float64_t total_pos, const float64_t total_neg) {

	size_t i = blockIdx.x * blockDim.x * blockDim.y * blockDim.z;
	i += threadIdx.x * blockDim.y * blockDim.z;
	i += threadIdx.y * blockDim.z;
	i += threadIdx.z;
	// const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
	if(i >= d_classifiers.shape[0])
		return;

	size_t pos_seen = 0, neg_seen = 0;
	float64_t pos_weights = 0.0, neg_weights = 0.0;
	float64_t min_error = np::inf, best_threshold = 0.0, best_polarity = 0.0;
	for(size_t j = 0; j < d_X_feat_argsort.shape[1]; ++j) {
		const float64_t error = np::min(neg_weights + total_pos - pos_weights, pos_weights + total_neg - neg_weights);
		if (error < min_error){
			min_error = error;
			best_threshold = d_X_feat[i * d_X_feat.shape[1] + d_X_feat_argsort[i * d_X_feat.shape[1] + j]];
			best_polarity = pos_seen > neg_seen ? 1.0 : -1.0;
		}
		if(d_y[d_X_feat_argsort[i * d_X_feat.shape[1] + j]] == static_cast<uint8_t>(1)){
			++pos_seen;
			pos_weights += d_weights[d_X_feat_argsort[i * d_X_feat.shape[1] + j]];
		} else {
			++neg_seen;
			neg_weights += d_weights[d_X_feat_argsort[i * d_X_feat.shape[1] + j]];
		}
	}
	d_classifiers[i * 2] = best_threshold;
	d_classifiers[i * 2 + 1] = best_polarity;
}

np::Array<float64_t> train_weak_clf_gpu(const np::Array<int32_t>& X_feat, const np::Array<uint16_t>& X_feat_argsort, const np::Array<uint8_t>& y,
	const np::Array<float64_t>& weights) noexcept {
	float64_t total_pos = 0.0, total_neg = 0.0;
	for(size_t i = 0; i < y.shape[0]; ++i)
		(y[i] == static_cast<uint8_t>(1) ? total_pos : total_neg) += weights[i];

	np::Array<float64_t> classifiers = np::empty<float64_t>({ X_feat.shape[0], 2 });

	np::Array<float64_t> d_classifiers = copyToDevice("classifiers", classifiers);
	np::Array<int32_t> d_X_feat = copyToDevice("X_feat", X_feat);
	np::Array<uint16_t> d_X_feat_argsort = copyToDevice("X_feat_argsort", X_feat_argsort);
	np::Array<float64_t> d_weights = copyToDevice("weights", weights);
	np::Array<uint8_t> d_y = copyToDevice("y", y);

	const size_t n_blocks = static_cast<size_t>(std::ceil(static_cast<float64_t>(X_feat.shape[0]) / static_cast<float64_t>(NB_THREADS_3D_X * NB_THREADS_3D_Y * NB_THREADS_3D_Z)));
	constexpr const dim3 dimBlock(NB_THREADS_3D_X, NB_THREADS_3D_Y, NB_THREADS_3D_Z);
	// const size_t n_blocks = static_cast<size_t>(std::ceil(static_cast<float64_t>(X_feat.shape[0]) / static_cast<float64_t>(NB_THREADS)));
	// constexpr const dim3 dimBlock(NB_THREADS);
	__train_weak_clf_kernel__<<<n_blocks, dimBlock>>>(d_classifiers, d_y, d_X_feat, d_X_feat_argsort, d_weights, total_pos, total_neg);
	_print_cuda_error_("synchronize", cudaDeviceSynchronize());
	_print_cuda_error_("memcpy classifiers", cudaMemcpy(classifiers.data, d_classifiers.data, np::prod(classifiers.shape) * sizeof(float64_t), cudaMemcpyDeviceToHost));

	cudaFree("free d_classifiers", d_classifiers);
	cudaFree("free d_X_feat", d_X_feat);
	cudaFree("free d_X_feat_argsort", d_X_feat_argsort);
	cudaFree("free d_weights", d_weights);
	cudaFree("free d_y", d_y);

	return classifiers;
}

template<typename T>
__device__ inline static int32_t as_partition_gpu(const T* a, uint16_t* indices, const size_t l, const size_t h) noexcept {
	int32_t i = l - 1;
	for (int32_t j = l; j <= h; ++j)
		if (a[indices[j]] < a[indices[h]])
			swap(&indices[++i], &indices[j]);
	swap(&indices[++i], &indices[h]);
	return i;
}

template<typename T>
__device__ void argsort_gpu(const T* a, uint16_t* indices, const size_t l, const size_t h) noexcept {
	const size_t total = h - l + 1;

	//int32_t* stack = new int32_t[total]{l, h};
	//int32_t stack[total];
	int32_t stack[6977];
	//int32_t stack[1<<16];
	stack[0] = l;
	stack[1] = h;

	size_t top = 1, low = l, high = h;

	while (top <= total) {
		high = stack[top--];
		low = stack[top--];
		if(low >= high)
			break;

		const int32_t p = as_partition_gpu(a, indices, low, high);

		if (p - 1 > low && p - 1 < total) {
			stack[++top] = low;
			stack[++top] = p - 1;
		}

		if (p + 1 < high) {
			stack[++top] = p + 1;
			stack[++top] = high;
		}
	}
	//delete[] stack;
}

template<typename T>
__global__ void argsort_bounded_gpu(const np::Array<T> a, uint16_t* indices){
	const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
	if (idx >= a.shape[0])
		return;

	for(size_t y = 0; y < a.shape[1]; ++y)
		indices[idx * a.shape[1] + y] = y;
	argsort_gpu(&a[idx * a.shape[1]], &indices[idx * a.shape[1]], 0, a.shape[1] - 1);
}
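/**
 * Computes, for every row of X_feat, the indices that would sort that row
 * (one GPU thread per row, using the iterative quicksort above). The result has
 * the same shape as X_feat and is consumed by train_weak_clf_gpu.
 */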
np::Array<uint16_t> argsort_2d_gpu(const np::Array<int32_t>& X_feat) noexcept {
	const np::Array<uint16_t> indices = np::empty<uint16_t>(X_feat.shape);

	uint16_t* d_indices;
	const size_t indices_size = np::prod(indices.shape) * sizeof(uint16_t);

	np::Array<int32_t> d_X_feat = copyToDevice("X_feat", X_feat);
	_print_cuda_error_("malloc d_indices", cudaMalloc(&d_indices, indices_size));

	const size_t dimGrid = static_cast<size_t>(std::ceil(static_cast<float64_t>(X_feat.shape[0]) / static_cast<float64_t>(NB_THREADS)));
	const dim3 dimBlock(NB_THREADS);
	argsort_bounded_gpu<<<dimGrid, dimBlock>>>(d_X_feat, d_indices);
	_print_cuda_error_("synchronize", cudaDeviceSynchronize());
	_print_cuda_error_("memcpy d_indices", cudaMemcpy(indices.data, d_indices, indices_size, cudaMemcpyDeviceToHost));

	cudaFree("free d_X_feat", d_X_feat);
	_print_cuda_error_("free d_indices", cudaFree(d_indices));

	return indices;
}

__host__ __device__
size_t np::prod(const np::Shape& shape, const size_t& offset) noexcept {
	size_t result = shape[offset];
	for(size_t i = 1 + offset; i < shape.length; ++i)
		result *= shape[i];
	return result;
}