cpp : made the code more robust and added more documentation

This commit is contained in:
saundersp
2024-04-27 21:08:33 +02:00
parent 45f0f6ab8e
commit c7d21e1014
10 changed files with 355 additions and 319 deletions

View File

@ -180,7 +180,7 @@ static __global__ void __apply_feature_kernel__(int32_t* d_X_feat, const np::Arr
np::Array<int32_t> apply_features_gpu(const np::Array<uint8_t>& feats, const np::Array<uint32_t>& X_ii) noexcept {
const np::Array<int32_t> X_feat = np::empty<int32_t>({ feats.shape[0], X_ii.shape[0] });
int32_t* d_X_feat;
int32_t* d_X_feat = nullptr;
_print_cuda_error_("malloc d_X_feat", cudaMalloc(&d_X_feat, np::prod(X_feat.shape) * sizeof(int32_t)));
np::Array<uint32_t> d_X_ii = copyToDevice<uint32_t>("X_ii", X_ii);
@ -268,7 +268,7 @@ np::Array<float64_t> train_weak_clf_gpu(const np::Array<int32_t>& X_feat, const
}
template<typename T>
__device__ inline static int32_t as_partition_gpu(const T* a, uint16_t* indices, const size_t l, const size_t h) noexcept {
__device__ inline static int32_t as_partition_gpu(const T* a, uint16_t* const indices, const size_t l, const size_t h) noexcept {
int32_t i = l - 1;
for (int32_t j = l; j <= h; ++j)
if (a[indices[j]] < a[indices[h]])
@ -278,7 +278,7 @@ __device__ inline static int32_t as_partition_gpu(const T* a, uint16_t* indices,
}
template<typename T>
__device__ void argsort_gpu(const T* a, uint16_t* indices, const size_t l, const size_t h) noexcept {
__device__ void argsort_gpu(const T* a, uint16_t* const indices, const size_t l, const size_t h) noexcept {
const size_t total = h - l + 1;
//int32_t* stack = new int32_t[total]{l, h};
@ -312,7 +312,7 @@ __device__ void argsort_gpu(const T* a, uint16_t* indices, const size_t l, const
}
template<typename T>
__global__ void argsort_bounded_gpu(const np::Array<T> a, uint16_t* indices){
__global__ void argsort_bounded_gpu(const np::Array<T> a, uint16_t* const indices){
const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= a.shape[0])
return;
@ -324,7 +324,7 @@ __global__ void argsort_bounded_gpu(const np::Array<T> a, uint16_t* indices){
np::Array<uint16_t> argsort_2d_gpu(const np::Array<int32_t>& X_feat) noexcept {
const np::Array<uint16_t> indices = np::empty<uint16_t>(X_feat.shape);
uint16_t* d_indices;
uint16_t* d_indices = nullptr;
const size_t indices_size = np::prod(indices.shape) * sizeof(uint16_t);
np::Array<int32_t> d_X_feat = copyToDevice<int32_t>("X_feat", X_feat);