python : Updated code with better display, documentation and format_time

2024-04-28 00:25:13 +02:00
parent c7d21e1014
commit 718724b63b
11 changed files with 591 additions and 566 deletions
--- a/python/ViolaJonesGPU.py
+++ b/python/ViolaJonesGPU.py
@@ -12,10 +12,10 @@ def __scanCPU_3d__(X: np.ndarray) -> np.ndarray:
 	"""Prefix Sum (scan) of a given dataset.

 	Args:
-		X (np.ndarray): Dataset of images to apply sum.
+		X (np.ndarray): Dataset of images to apply sum

 	Returns:
-		np.ndarray: Scanned dataset of images.
+		np.ndarray: Scanned dataset of images
 	"""
 	for x in range(X.shape[0]):
 		for y in range(X.shape[1]):
@@ -30,10 +30,10 @@ def __kernel_scan_3d__(n: int, j: int, d_inter: np.ndarray, d_a: np.ndarray) ->
 	"""GPU kernel used to do a parallel prefix sum (scan).

 	Args:
-		n (int):
-		j (int): [description]
-		d_inter (np.ndarray): [description]
-		d_a (np.ndarray): [description]
+		n (int): Number of width blocks
+		j (int): Temporary sum index
+		d_inter (np.ndarray): Temporary sums in device to add
+		d_a (np.ndarray): Dataset of images in device to apply sum
 	"""
 	x_coor, y_coor = cuda.grid(2)

@@ -76,10 +76,10 @@ def __add_3d__(d_X: np.ndarray, d_s: np.ndarray, n: int, m: int) -> None:
 	"""GPU kernel for parallel sum.

 	Args:
-		d_X (np.ndarray): Dataset of images.
-		d_s (np.ndarray): Temporary sums to add.
-		n (int): Number of width blocks.
-		m (int): Height of a block.
+		d_X (np.ndarray): Dataset of images in device
+		d_s (np.ndarray): Temporary sums in device to add
+		n (int): Number of width blocks
+		m (int): Height of a block
 	"""
 	x_coor, y_coor = cuda.grid(2)
 	if x_coor < n and y_coor < m:
@@ -91,10 +91,10 @@ def __scanGPU_3d__(X: np.ndarray) -> np.ndarray:
 	Read more: https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda

 	Args:
-		X (np.ndarray): Dataset of images.
+		X (np.ndarray): Dataset of images

 	Returns:
-		np.ndarray: Scanned dataset of images.
+		np.ndarray: Scanned dataset of images
 	"""
 	k, height, n = X.shape
 	n_block_x, n_block_y = np.ceil(np.divide(X.shape[1:], NB_THREADS_2D)).astype(np.uint64)
@@ -131,10 +131,10 @@ def __transpose_kernel__(d_X: np.ndarray, d_Xt: np.ndarray) -> None:
 	"""GPU kernel of the function __transpose_3d__.

 	Args:
-		d_X (np.ndarray): Dataset of images.
-		d_Xt(np.ndarray): Transposed dataset of images.
-		width (int): Width of each images in the dataset.
-		height (int): Height of each images in the dataset.
+		d_X (np.ndarray): Dataset of images in device
+		d_Xt (np.ndarray): Transposed dataset of images
+		width (int): Width of each images in the dataset
+		height (int): Height of each images in the dataset
 	"""
 	temp = cuda.shared.array(NB_THREADS_2D, dtype = uint32)

@@ -152,10 +152,10 @@ def __transpose_3d__(X: np.ndarray) -> np.ndarray:
 	"""Transpose every images in the given dataset.

 	Args:
-		X (np.ndarray): Dataset of images.
+		X (np.ndarray): Dataset of images

 	Returns:
-		np.ndarray: Transposed dataset of images.
+		np.ndarray: Transposed dataset of images
 	"""
 	n_block_x, n_block_z = np.ceil(np.divide(X.shape[1:], NB_THREADS_2D)).astype(np.uint64)
 	d_X = cuda.to_device(X)
@@ -167,10 +167,10 @@ def set_integral_image(X: np.ndarray) -> np.ndarray:
 	"""Transform the input images in integrated images (GPU version).

 	Args:
-		X (np.ndarray): Dataset of images.
+		X (np.ndarray): Dataset of images

 	Returns:
-		np.ndarray: Dataset of integrated images.
+		np.ndarray: Dataset of integrated images
 	"""
 	X = X.astype(np.uint32)
 	X = __scanGPU_3d__(X)
@@ -184,13 +184,13 @@ def __train_weak_clf_kernel__(d_classifiers: np.ndarray, d_y: np.ndarray, d_X_fe
 	"""GPU kernel of the function train_weak_clf.

 	Args:
-		d_classifiers (np.ndarray): Weak classifiers to train.
-		d_y (np.ndarray): Labels of the features.
-		d_X_feat (np.ndarray): Feature images dataset.
-		d_X_feat_argsort (np.ndarray): Sorted indexes of the integrated features.
-		d_weights (np.ndarray): Weights of the features.
-		total_pos (float): Total of positive labels in the dataset.
-		total_neg (float): Total of negative labels in the dataset.
+		d_classifiers (np.ndarray): Weak classifiers to train
+		d_y (np.ndarray): Labels of the features
+		d_X_feat (np.ndarray): Feature images dataset
+		d_X_feat_argsort (np.ndarray): Sorted indexes of the integrated features
+		d_weights (np.ndarray): Weights of the features
+		total_pos (float): Total of positive labels in the dataset
+		total_neg (float): Total of negative labels in the dataset
 	"""
 	i = cuda.blockIdx.x * cuda.blockDim.x * cuda.blockDim.y * cuda.blockDim.z
 	i += cuda.threadIdx.x * cuda.blockDim.y * cuda.blockDim.z
@@ -224,13 +224,13 @@ def train_weak_clf(X_feat: np.ndarray, X_feat_argsort: np.ndarray, y: np.ndarray
 	"""Train the weak classifiers on a given dataset (GPU version).

 	Args:
-		X_feat (np.ndarray): Feature images dataset.
-		X_feat_argsort (np.ndarray): Sorted indexes of the integrated features.
-		y (np.ndarray): Labels of the features.
-		weights (np.ndarray): Weights of the features.
+		X_feat (np.ndarray): Feature images dataset
+		X_feat_argsort (np.ndarray): Sorted indexes of the integrated features
+		y (np.ndarray): Labels of the features
+		weights (np.ndarray): Weights of the features

 	Returns:
-		np.ndarray: Trained weak classifiers.
+		np.ndarray: Trained weak classifiers
 	"""
 	total_pos, total_neg = weights[y == 1].sum(), weights[y == 0].sum()
 	d_classifiers = cuda.to_device(np.empty((X_feat.shape[0], 2), dtype = np.int32))
@@ -247,14 +247,14 @@ def __compute_feature__(ii: np.ndarray, x: int, y: int, w: int, h: int) -> int:
 	"""Compute a feature on an integrated image at a specific coordinate (GPU version).

 	Args:
-		ii (np.ndarray): Integrated image.
-		x (int): X coordinate.
-		y (int): Y coordinate.
-		w (int): width of the feature.
-		h (int): height of the feature.
+		ii (np.ndarray): Integrated image
+		x (int): X coordinate
+		y (int): Y coordinate
+		w (int): width of the feature
+		h (int): height of the feature

 	Returns:
-		int: Computed feature.
+		int: Computed feature
 	"""
 	return ii[y + h, x + w] + ii[y, x] - ii[y + h, x] - ii[y, x + w]

@@ -263,11 +263,11 @@ def __apply_feature_kernel__(X_feat: np.ndarray, feats: np.ndarray, X_ii: np.nda
 	"""GPU kernel of the function apply_features.

 	Args:
-		X_feat (np.ndarray): Feature images dataset.
-		feats (np.ndarray): Features to apply.
-		X_ii (np.ndarray): Integrated image dataset.
-		n (int): Number of features.
-		m (int): Number of images of the dataset.
+		X_feat (np.ndarray): Feature images dataset on device
+		feats (np.ndarray): Features on device to apply
+		X_ii (np.ndarray): Integrated image dataset on device
+		n (int): Number of features
+		m (int): Number of images of the dataset
 	"""
 	x, y = cuda.grid(2)
 	if x >= feats.shape[0] or y >= X_ii.shape[0]:
@@ -288,11 +288,11 @@ def apply_features(feats: np.ndarray, X_ii: np.ndarray) -> np.ndarray:
 	"""Apply the features on a integrated image dataset (GPU version).

 	Args:
-		feats (np.ndarray): Features to apply.
-		X_ii (np.ndarray): Integrated image dataset.
+		feats (np.ndarray): Features to apply
+		X_ii (np.ndarray): Integrated image dataset

 	Returns:
-		np.ndarray: Applied features.
+		np.ndarray: Applied features
 	"""
 	d_X_feat = cuda.to_device(np.empty((feats.shape[0], X_ii.shape[0]), dtype = np.int32))
 	d_feats = cuda.to_device(feats)
@@ -303,28 +303,44 @@ def apply_features(feats: np.ndarray, X_ii: np.ndarray) -> np.ndarray:
 	return d_X_feat.copy_to_host()

@cuda.jit('int32(int32[:], uint16[:], int32, int32)', device = True)
-def as_partition(a: np.ndarray, indices: np.ndarray, l: int, h: int) -> int:
+def _as_partition_(d_a: np.ndarray, d_indices: np.ndarray, l: int, h: int) -> int:
+	"""Partition of the argsort algorithm.
+
+	Args:
+		d_a (np.ndarray): Array on device to sort
+		d_indices (np.ndarray): Array of indices on device to write to
+		l (int): lower bound to sort
+		h (int): higher bound to sort
+
+	Returns:
+		int: Final index of the pivot element in d_indices
+	"""
 	i = l - 1
 	j = l
 	for j in range(l, h + 1):
-		if a[indices[j]] < a[indices[h]]:
+		if d_a[d_indices[j]] < d_a[d_indices[h]]:
 			i += 1
-			indices[i], indices[j] = indices[j], indices[i]
+			d_indices[i], d_indices[j] = d_indices[j], d_indices[i]

 	i += 1
-	indices[i], indices[j] = indices[j], indices[i]
+	d_indices[i], d_indices[j] = d_indices[j], d_indices[i]
 	return i

@cuda.jit('void(int32[:], uint16[:], int32, int32)', device = True)
-def argsort_bounded(a: np.ndarray, indices: np.ndarray, l: int, h: int) -> None:
-	#total = h - l + 1;
-	stack = cuda.local.array(6977, int32)
-	stack[0] = l
-	stack[1] = h
-	top = 1;
+def argsort_bounded(d_a: np.ndarray, d_indices: np.ndarray, low: int, high: int) -> None:
+	"""Perform an indirect sort of a given array within a given bound.

-	low = l
-	high = h
+	Args:
+		d_a (np.ndarray): Array on device to sort
+		d_indices (np.ndarray): Array of indices on device to write to
+		low (int): lower bound to sort
+		high (int): higher bound to sort
+	"""
+	#total = high - low + 1;
+	stack = cuda.local.array(6977, int32)
+	stack[0] = low
+	stack[1] = high
+	top = 1

 	while top >= 0:
 		high = stack[top]
@@ -333,34 +349,49 @@ def argsort_bounded(a: np.ndarray, indices: np.ndarray, l: int, h: int) -> None:
 		top -= 1

 		if low >= high:
-			break;
+			break

-		p = as_partition(a, indices, low, high);
+		p = _as_partition_(d_a, d_indices, low, high)

 		if p - 1 > low:
 			top += 1
-			stack[top] = low;
+			stack[top] = low
 			top += 1
-			stack[top] = p - 1;
+			stack[top] = p - 1

 		if p + 1 < high:
 			top += 1
-			stack[top] = p + 1;
+			stack[top] = p + 1
 			top += 1
-			stack[top] = high;
+			stack[top] = high

@cuda.jit('void(int32[:, :], uint16[:, :])')
-def argsort_flatter(X_feat: np.ndarray, indices: np.ndarray) -> None:
-	i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
-	if i < X_feat.shape[0]:
-		for j in range(indices.shape[1]):
-			indices[i, j] = j
-		argsort_bounded(X_feat[i], indices[i], 0, X_feat.shape[1] - 1)
+def argsort_flatter(d_a: np.ndarray, d_indices: np.ndarray) -> None:
+	# TODO Finish doxygen
+	"""CUDA kernel where argsort is applied to every row of a given 2D array.

-def argsort(X_feat: np.ndarray) -> np.ndarray:
-	indices = np.empty_like(X_feat, dtype = np.uint16)
-	n_blocks = int(np.ceil(np.divide(X_feat.shape[0], NB_THREADS)))
-	d_X_feat = cuda.to_device(X_feat)
+	Args:
+		d_a (np.ndarray): Array in device to sort
+		d_indices (np.ndarray): Array of indices on device to write to
+	"""
+	i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
+	if i < d_a.shape[0]:
+		for j in range(d_indices.shape[1]):
+			d_indices[i, j] = j
+		argsort_bounded(d_a[i], d_indices[i], 0, d_a.shape[1] - 1)
+
+def argsort(a: np.ndarray) -> np.ndarray:
+	"""Perform an indirect sort of a given array.
+
+	Args:
+		a (np.ndarray): Array to sort
+
+	Returns:
+		np.ndarray: Array of indices that sort the array
+	"""
+	indices = np.empty_like(a, dtype = np.uint16)
+	n_blocks = int(np.ceil(np.divide(a.shape[0], NB_THREADS)))
+	d_X_feat = cuda.to_device(a)
 	d_indices = cuda.to_device(indices)
 	argsort_flatter[n_blocks, NB_THREADS](d_X_feat, d_indices)
 	cuda.synchronize()