ViolaJones/cpp/ViolaJonesCPU.cpp

#include "data.hpp"
#include "config.hpp"

#if GPU_BOOSTED == false

/**
 * @brief Transform the input images in integrated images (CPU version).
 *
 * @param X Dataset of images
 * @return Dataset of integrated images
 */
np::Array<uint32_t> set_integral_image(const np::Array<uint8_t>& set) noexcept {
	np::Array<uint32_t> X_ii = np::empty<uint32_t>(set.shape);

	size_t i, y, x, s;
	uint32_t ii[set.shape[1] * set.shape[2]];
	const size_t length = np::prod(set.shape);
	for (size_t offset = 0; offset < length; offset += set.shape[1] * set.shape[2]) {
		for (i = 0; i < set.shape[1] * set.shape[2]; ++i)
			ii[i] = 0;
		for (y = 1; y < set.shape[1]; ++y) {
			s = 0;
			for (x = 0; x < set.shape[2] - 1; ++x) {
				s += set[offset + (y - 1) * set.shape[2] + x];
				ii[y * set.shape[2] + x + 1] = s + ii[(y - 1) * set.shape[2] + x + 1];
			}
		}
		for (y = 0; y < set.shape[1]; ++y)
			for (x = 0; x < set.shape[2]; ++x)
				X_ii[offset + y * set.shape[2] + x] = ii[y * set.shape[2] + x];
	}

	return X_ii;
}

constexpr static inline int16_t __compute_feature__(const np::Array<uint32_t>& X_ii, const size_t& j, const int16_t& x, const int16_t& y, const int16_t& w, const int16_t& h) noexcept {
	const size_t _y = y * X_ii.shape[1] + x;
	const size_t _yh = _y + h * X_ii.shape[1];
	return X_ii[j + _yh + w] + X_ii[j + _y] - X_ii[j + _yh] - X_ii[j + _y + w];
}

/**
 * @brief Apply the features on a integrated image dataset (CPU version).
 *
 * @param feats Features to apply
 * @param X_ii Integrated image dataset
 * @return Applied features
 */
np::Array<int32_t> apply_features(const np::Array<uint8_t>& feats, const np::Array<uint32_t>& X_ii) noexcept {
	np::Array<int32_t> X_feat = np::empty<int32_t>({ feats.shape[0], X_ii.shape[0] });

	size_t j, feat_idx = 0;
	int16_t p1, p2, n1, n2;
	const size_t feats_length = np::prod(feats.shape), X_ii_length = np::prod(X_ii.shape);
	const size_t feats_step = np::prod(feats.shape, 1), X_ii_step = np::prod(X_ii.shape, 1);
	for (size_t i = 0; i < feats_length; i += feats_step){
		for (j = 0; j < X_ii_length; j += X_ii_step) {
			p1 = __compute_feature__(X_ii, j, feats[i +  0], feats[i +  1], feats[i +  2], feats[i +  3]);
			p2 = __compute_feature__(X_ii, j, feats[i +  4], feats[i +  5], feats[i +  6], feats[i +  7]);
			n1 = __compute_feature__(X_ii, j, feats[i +  8], feats[i +  9], feats[i + 10], feats[i + 11]);
			n2 = __compute_feature__(X_ii, j, feats[i + 12], feats[i + 13], feats[i + 14], feats[i + 15]);
			X_feat[feat_idx++] = static_cast<int32_t>(p1 + p2) - static_cast<int32_t>(n1 + n2);
		}
	}

	return X_feat;
}

np::Array<float64_t> train_weak_clf(const np::Array<int32_t>& X_feat, const np::Array<uint16_t>& X_feat_argsort, const np::Array<uint8_t>& y, const np::Array<float64_t>& weights) noexcept {
	float64_t total_pos = 0.0, total_neg = 0.0;
	for(size_t i = 0; i < y.shape[0]; ++i)
		(y[i] == static_cast<uint8_t>(1) ? total_pos : total_neg) += weights[i];

	np::Array<float64_t> classifiers = np::empty<float64_t>({ X_feat.shape[0], 2});
	for(size_t i = 0; i < X_feat.shape[0]; ++i){
		size_t pos_seen = 0, neg_seen = 0;
		float64_t pos_weights = 0.0, neg_weights = 0.0;
		float64_t min_error = np::inf, best_threshold = 0.0, best_polarity = 0.0;
		for(size_t j = 0; j < X_feat_argsort.shape[1]; ++j) {
			const float64_t error = std::min(neg_weights + total_pos - pos_weights, pos_weights + total_neg - neg_weights);
			if (error < min_error){
				min_error = error;
				best_threshold = X_feat[i * X_feat.shape[1] + X_feat_argsort[i * X_feat.shape[1] + j]];
				best_polarity = pos_seen > neg_seen ? 1.0 : -1.0;
			}
			if(y[X_feat_argsort[i * X_feat.shape[1] + j]] == static_cast<uint8_t>(1)){
				++pos_seen;
				pos_weights += weights[X_feat_argsort[i * X_feat.shape[1] + j]];
			} else {
				++neg_seen;
				neg_weights += weights[X_feat_argsort[i * X_feat.shape[1] + j]];
			}
		}
		classifiers[i * 2] = best_threshold; classifiers[i * 2 + 1] = best_polarity;
	}
	return classifiers;
}

/**
 * @brief Perform an indirect sort of a given array within a given bound.
 *
 * @tparam T Inner type of the array
 * @param a Array to sort
 * @param indices Array of indices to write to
 * @param low lower bound to sort
 * @param high higher bound to sort
 */
template<typename T>
static void argsort(const T* const a, uint16_t* const indices, size_t low, size_t high) noexcept {
	const size_t total = high - low + 1;

	size_t* const stack = new size_t[total]{low, high};
	//size_t stack[total];
	//stack[0] = l;
	//stack[1] = h;
	size_t top = 1;

	while (top <= total) {
		high = stack[top--];
		low = stack[top--];
		if(low >= high)
			break;

		const size_t p = as_partition(a, indices, low, high);

		if (p - 1 > low && p - 1 < total) {
			stack[++top] = low;
			stack[++top] = p - 1;
		}

		if (p + 1 < high) {
			stack[++top] = p + 1;
			stack[++top] = high;
		}
	}
	delete[] stack;
}

/**
 * @brief Apply argsort to every column of a given 2D array.
 *
 * @tparam T Inner type of the array
 * @param a 2D Array to sort
 * @return 2D Array of indices that sort the array
 */
template<typename T>
static np::Array<uint16_t> argsort_bounded(const np::Array<T>& a, const size_t& low, const size_t& high) noexcept {
	np::Array<uint16_t> indices = np::empty(a.shape);
	map(indices, [](const size_t& i, const uint16_t&) -> uint16_t { return i; });

	argsort_bounded(a, indices, low, high);
	return indices;
}

/**
 * @brief Perform an indirect sort on each column of a given 2D array (CPU version).
 *
 * @param a 2D Array to sort
 * @return 2D Array of indices that sort the array
 */
np::Array<uint16_t> argsort_2d(const np::Array<int32_t>& X_feat) noexcept {
	const np::Array<uint16_t> indices = np::empty<uint16_t>(X_feat.shape);
	const size_t length = np::prod(X_feat.shape);
	for (size_t i = 0; i < length; i += X_feat.shape[1]) {
		for(size_t j = 0; j < X_feat.shape[1]; ++j) indices[i + j] = j;
		argsort(&X_feat[i], &indices[i], 0, X_feat.shape[1] - 1);
	}
	return indices;
}

#endif // GPU_BOOSTED == false