#include "data.hpp" #include "toolbox.hpp" static __global__ void __test_working_kernel__(const np::Array d_x, np::Array d_y, const size_t length) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < length) d_y[i] = d_x[i] * i; } void test_working(const size_t& length) noexcept { const size_t size = length * sizeof(size_t); #if __DEBUG print("Estimating memory footprint at : " + format_byte_size(2 * size)); #endif np::Array x = np::empty({ length }), y = np::empty({ length }); size_t i; for (i = 0; i < length; ++i) x[i] = i; np::Array d_x = copyToDevice("x", x), d_y = copyToDevice("y", y); const size_t dimX = static_cast(std::ceil(static_cast(length) / static_cast(NB_THREADS))); const dim3 dimGrid(dimX); constexpr const dim3 dimBlock(NB_THREADS); __test_working_kernel__<<>>(d_x, d_y, length); _print_cuda_error_("synchronize", cudaDeviceSynchronize()); _print_cuda_error_("memcpy d_y", cudaMemcpy(y.data, d_y.data, size, cudaMemcpyDeviceToHost)); size_t ne = 0; for (i = 0; i < length; ++i) if (y[i] != x[i] * i) ++ne; if (ne != 0) fprintf(stderr, "Invalid result : %lu/%lu <=> %f%%\n", ne, length, static_cast(ne) / static_cast(length)); cudaFree("d_x", d_x); cudaFree("d_y", d_y); } static __global__ void __test_working_kernel_2d__(const np::Array d_x, np::Array d_y, const size_t length) { const size_t idx = threadIdx.x * blockDim.y + threadIdx.y; const size_t idy = blockIdx.x * gridDim.y + blockIdx.y; const size_t i = idy * NB_THREADS_2D_X * NB_THREADS_2D_Y + idx; if (i < length) d_y[i] = d_x[i] * i; } void test_working_2d(const size_t& N1, const size_t& N2) noexcept { const size_t length = N1 * N2; const size_t size = length * sizeof(size_t); #if __DEBUG print("Estimating memory footprint at : " + format_byte_size(2 * size)); #endif np::Array x = np::empty({ length }), y = np::empty({ length }); size_t i; for (i = 0; i < length; ++i) x[i] = i; np::Array d_x = copyToDevice("x", x), d_y = copyToDevice("y", y); const size_t dimX = static_cast(std::ceil(static_cast(N1) / static_cast(NB_THREADS_2D_X))); const size_t dimY = static_cast(std::ceil(static_cast(N2) / static_cast(NB_THREADS_2D_Y))); const dim3 dimGrid(dimX, dimY); constexpr const dim3 dimBlock(NB_THREADS_2D_X, NB_THREADS_2D_Y); __test_working_kernel_2d__<<>>(d_x, d_y, length); _print_cuda_error_("synchronize", cudaDeviceSynchronize()); _print_cuda_error_("memcpy d_y", cudaMemcpy(y.data, d_y.data, size, cudaMemcpyDeviceToHost)); size_t ne = 0; for (i = 0; i < length; ++i) if (y[i] != x[i] * i) ++ne; if (ne != 0) fprintf(stderr, "Invalid result : %lu/%lu <=> %f%%\n", ne, length, static_cast(ne) / static_cast(length)); cudaFree("d_x", d_x); cudaFree("d_y", d_y); } static __global__ void __test_working_kernel_3d__(const np::Array d_x, np::Array d_y, const size_t length) { const size_t idx = (threadIdx.x * blockDim.y + threadIdx.y) * blockDim.z + threadIdx.z; const size_t idy = (blockIdx.x * gridDim.y + blockIdx.y) * gridDim.z + blockIdx.z; const size_t i = idy * NB_THREADS_3D_X * NB_THREADS_3D_Y * NB_THREADS_3D_Z + idx; if (i < length) d_y[i] = d_x[i] * i; } void test_working_3d(const size_t& N1, const size_t& N2, const size_t& N3) noexcept { const size_t length = N1 * N2 * N3; const size_t size = length * sizeof(size_t); #if __DEBUG print("Estimating memory footprint at : " + format_byte_size(2 * size)); #endif np::Array x = np::empty({ length }), y = np::empty({ length }); size_t i; for (i = 0; i < length; ++i) x[i] = i; np::Array d_x = copyToDevice("x", x), d_y = copyToDevice("y", y); const size_t dimX = static_cast(std::ceil(static_cast(N1) / static_cast(NB_THREADS_3D_X))); const size_t dimY = static_cast(std::ceil(static_cast(N2) / static_cast(NB_THREADS_3D_Y))); const size_t dimZ = static_cast(std::ceil(static_cast(N3) / static_cast(NB_THREADS_3D_Z))); const dim3 dimGrid(dimX, dimY, dimZ); constexpr const dim3 dimBlock(NB_THREADS_3D_X, NB_THREADS_3D_Y, NB_THREADS_3D_Z); __test_working_kernel_3d__<<>>(d_x, d_y, length); _print_cuda_error_("synchronize", cudaDeviceSynchronize()); _print_cuda_error_("memcpy d_y", cudaMemcpy(y.data, d_y.data, size, cudaMemcpyDeviceToHost)); size_t ne = 0; for (i = 0; i < length; ++i) if (y[i] != x[i] * i) ++ne; if (ne != 0) fprintf(stderr, "Invalid result : %lu/%lu <=> %f%%\n", ne, length, static_cast(ne) / static_cast(length)); cudaFree("d_x", d_x); cudaFree("d_y", d_y); }