cpp : made the code more robust and added more documentation

2024-04-27 21:08:33 +02:00
parent 45f0f6ab8e
commit c7d21e1014
10 changed files with 355 additions and 319 deletions
--- a/cpp/ViolaJonesGPU.cu
+++ b/cpp/ViolaJonesGPU.cu
@ -180,7 +180,7 @@ static __global__ void __apply_feature_kernel__(int32_t* d_X_feat, const np::Arr

 np::Array<int32_t> apply_features_gpu(const np::Array<uint8_t>& feats, const np::Array<uint32_t>& X_ii) noexcept {
 	const np::Array<int32_t> X_feat = np::empty<int32_t>({ feats.shape[0], X_ii.shape[0] });
-	int32_t* d_X_feat;
+	int32_t* d_X_feat = nullptr;

 	_print_cuda_error_("malloc d_X_feat", cudaMalloc(&d_X_feat, np::prod(X_feat.shape) * sizeof(int32_t)));
 	np::Array<uint32_t> d_X_ii = copyToDevice<uint32_t>("X_ii", X_ii);
@ -268,7 +268,7 @@ np::Array<float64_t> train_weak_clf_gpu(const np::Array<int32_t>& X_feat, const
 }

 template<typename T>
-__device__ inline static int32_t as_partition_gpu(const T* a, uint16_t* indices, const size_t l, const size_t h) noexcept {
+__device__ inline static int32_t as_partition_gpu(const T* a, uint16_t* const indices, const size_t l, const size_t h) noexcept {
 	int32_t i = l - 1;
 	for (int32_t j = l; j <= h; ++j)
 		if (a[indices[j]] < a[indices[h]])
@ -278,7 +278,7 @@ __device__ inline static int32_t as_partition_gpu(const T* a, uint16_t* indices,
 }

 template<typename T>
-__device__ void argsort_gpu(const T* a, uint16_t* indices, const size_t l, const size_t h) noexcept {
+__device__ void argsort_gpu(const T* a, uint16_t* const indices, const size_t l, const size_t h) noexcept {
 	const size_t total = h - l + 1;

 	//int32_t* stack = new int32_t[total]{l, h};
@ -312,7 +312,7 @@ __device__ void argsort_gpu(const T* a, uint16_t* indices, const size_t l, const
 }

 template<typename T>
-__global__ void argsort_bounded_gpu(const np::Array<T> a, uint16_t* indices){
+__global__ void argsort_bounded_gpu(const np::Array<T> a, uint16_t* const indices){
 	const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
 	if (idx >= a.shape[0])
 		return;
@ -324,7 +324,7 @@ __global__ void argsort_bounded_gpu(const np::Array<T> a, uint16_t* indices){
 np::Array<uint16_t> argsort_2d_gpu(const np::Array<int32_t>& X_feat) noexcept {
 	const np::Array<uint16_t> indices = np::empty<uint16_t>(X_feat.shape);

-	uint16_t* d_indices;
+	uint16_t* d_indices = nullptr;
 	const size_t indices_size = np::prod(indices.shape) * sizeof(uint16_t);

 	np::Array<int32_t> d_X_feat = copyToDevice<int32_t>("X_feat", X_feat);