#include "data.hpp"
#include "toolbox.hpp"

/**
 * @brief Build, for every image of `set`, a zero-padded summed-area table of the same shape.
 *
 * Each image occupies shape[1] * shape[2] consecutive elements, stored row by row with
 * shape[2] elements per row. For an image, the output entry at (y, x) is the sum of the input
 * pixels located at rows < y and columns < x. Row 0 and column 0 of every output image are
 * therefore 0, and the last input row and column never contribute to the result.
 *
 * The table is accumulated directly in the output array, so no temporary buffer sized by the
 * image dimensions is needed.
 *
 * @param set Input pixels, indexed as [image][row][column]
 * @return Array with the same shape as `set` containing one table per image
 */
np::Array<uint32_t> set_integral_image_cpu(const np::Array<uint8_t>& set) noexcept {
	np::Array<uint32_t> X_ii = np::empty<uint32_t>(set.shape);

	const size_t height = set.shape[1];
	const size_t width = set.shape[2];
	const size_t image_size = height * width;
	// NOTE(review): assumes np::prod(shape) is the total number of elements — confirm in data.hpp
	const size_t length = np::prod(set.shape);

	// A zero stride would never advance the image loop below
	if (image_size == 0)
		return X_ii;

	for (size_t offset = 0; offset < length; offset += image_size) {
		// Row 0 has no rows above it: every entry is 0
		for (size_t x = 0; x < width; ++x)
			X_ii[offset + x] = 0;

		for (size_t y = 1; y < height; ++y) {
			const size_t row = offset + y * width;
			const size_t prev_row = row - width;
			// Column 0 has no columns to its left
			X_ii[row] = 0;
			// Running sum of the input row just above the one being written
			uint32_t s = 0;
			for (size_t x = 0; x + 1 < width; ++x) {
				s += set[prev_row + x];
				X_ii[row + x + 1] = s + X_ii[prev_row + x + 1];
			}
		}
	}

	return X_ii;
}

/**
 * @brief Combine the four corner entries of a rectangle in one image of `X_ii`.
 *
 * With ii(r, c) denoting the entry at row r, column c of the image that starts at flat offset
 * `j`, the function evaluates ii(y + h, x + w) + ii(y, x) - ii(y + h, x) - ii(y, x + w).
 *
 * @param X_ii Array indexed as [image][row][column]
 * @param j Flat offset of the first element of the image to read
 * @param x Column of the rectangle's top-left corner
 * @param y Row of the rectangle's top-left corner
 * @param w Horizontal distance between the left and right corners
 * @param h Vertical distance between the top and bottom corners
 * @return The corner combination converted to int16_t.
 *         NOTE(review): the sum is formed from uint32_t entries and narrowed on return, so
 *         values outside the int16_t range do not survive — confirm this is intended.
 */
constexpr static inline int16_t __compute_feature__(const np::Array<uint32_t>& X_ii, const size_t& j, const int16_t& x, const int16_t& y, const int16_t& w, const int16_t& h) noexcept {
	// Rows are shape[2] elements apart (the image width); using shape[1] here is only
	// correct when images are square.
	const size_t row_stride = X_ii.shape[2];
	const size_t top_left = j + y * row_stride + x;
	const size_t bottom_left = top_left + h * row_stride;
	return X_ii[bottom_left + w] + X_ii[top_left] - X_ii[bottom_left] - X_ii[top_left + w];
}

/**
 * @brief Evaluate every feature of `feats` on every image of `X_ii`.
 *
 * A feature is read as 16 consecutive values starting at its flat offset: four groups of
 * (x, y, w, h). The first two groups are added and the last two are subtracted.
 * NOTE(review): assumes each feature spans at least 16 elements — confirm against the caller.
 *
 * Results are written feature by feature, and within a feature image by image.
 *
 * @param feats Feature descriptions, one feature per index of the first dimension
 * @param X_ii Integral images, one image per index of the first dimension
 * @return Array of shape { feats.shape[0], X_ii.shape[0] } holding the feature values
 */
np::Array<int32_t> apply_features_cpu(const np::Array<uint8_t>& feats, const np::Array<uint32_t>& X_ii) noexcept {
	np::Array<int32_t> X_feat = np::empty<int32_t>({ feats.shape[0], X_ii.shape[0] });

	// Total element counts and per-item strides of both inputs
	const size_t feats_total = np::prod(feats.shape);
	const size_t images_total = np::prod(X_ii.shape);
	const size_t feat_stride = np::prod(feats.shape, 1);
	const size_t image_stride = np::prod(X_ii.shape, 1);

	size_t out = 0;
	for (size_t f = 0; f < feats_total; f += feat_stride) {
		for (size_t img = 0; img < images_total; img += image_stride) {
			// Two rectangles counted positively...
			const int16_t pos_a = __compute_feature__(X_ii, img, feats[f + 0], feats[f + 1], feats[f + 2], feats[f + 3]);
			const int16_t pos_b = __compute_feature__(X_ii, img, feats[f + 4], feats[f + 5], feats[f + 6], feats[f + 7]);
			// ...and two counted negatively
			const int16_t neg_a = __compute_feature__(X_ii, img, feats[f + 8], feats[f + 9], feats[f + 10], feats[f + 11]);
			const int16_t neg_b = __compute_feature__(X_ii, img, feats[f + 12], feats[f + 13], feats[f + 14], feats[f + 15]);

			const int32_t positive = static_cast<int32_t>(pos_a + pos_b);
			const int32_t negative = static_cast<int32_t>(neg_a + neg_b);
			X_feat[out] = positive - negative;
			++out;
		}
	}

	return X_feat;
}

/**
 * @brief Pick, for every feature, the threshold and polarity with the lowest weighted error.
 *
 * The samples of each feature row are visited in the order given by `X_feat_argsort`. At each
 * position the error is the smaller of
 *   (negative weight seen + total positive weight - positive weight seen) and
 *   (positive weight seen + total negative weight - negative weight seen),
 * computed before the current sample is counted as seen. When the error strictly improves, the
 * current sample's feature value becomes the threshold, and the polarity is 1.0 if more
 * positive than negative samples have been seen so far, otherwise -1.0.
 *
 * @param X_feat Feature values, one row per feature
 * @param X_feat_argsort Per-row sample indices giving the visiting order
 * @param y Labels; a sample is positive when its label equals 1
 * @param weights Weight of each sample
 * @return Array of shape { X_feat.shape[0], 2 } storing (threshold, polarity) per feature
 */
np::Array<float64_t> train_weak_clf_cpu(const np::Array<int32_t>& X_feat, const np::Array<uint16_t>& X_feat_argsort, const np::Array<uint8_t>& y, const np::Array<float64_t>& weights) noexcept {
	// Total weight carried by each class
	float64_t total_pos = 0.0, total_neg = 0.0;
	for (size_t n = 0; n < y.shape[0]; ++n) {
		if (y[n] == static_cast<uint8_t>(1))
			total_pos += weights[n];
		else
			total_neg += weights[n];
	}

	np::Array<float64_t> classifiers = np::empty<float64_t>({ X_feat.shape[0], 2});
	for (size_t feat = 0; feat < X_feat.shape[0]; ++feat) {
		// Flat offset of this feature's row
		const size_t row = feat * X_feat.shape[1];

		size_t pos_seen = 0, neg_seen = 0;
		float64_t pos_weights = 0.0, neg_weights = 0.0;
		float64_t min_error = np::inf, best_threshold = 0.0, best_polarity = 0.0;

		for (size_t k = 0; k < X_feat_argsort.shape[1]; ++k) {
			// Sample visited at position k
			const size_t sample = X_feat_argsort[row + k];

			const float64_t error = std::min(neg_weights + total_pos - pos_weights, pos_weights + total_neg - neg_weights);
			if (error < min_error) {
				min_error = error;
				best_threshold = X_feat[row + sample];
				if (pos_seen > neg_seen)
					best_polarity = 1.0;
				else
					best_polarity = -1.0;
			}

			// Only now does the current sample count as seen
			if (y[sample] == static_cast<uint8_t>(1)) {
				++pos_seen;
				pos_weights += weights[sample];
			} else {
				++neg_seen;
				neg_weights += weights[sample];
			}
		}

		classifiers[feat * 2] = best_threshold;
		classifiers[feat * 2 + 1] = best_polarity;
	}
	return classifiers;
}

/**
 * @brief Produce, for every row of `X_feat`, an index array processed by `argsort`.
 *
 * Each row of the result is first filled with 0, 1, ..., shape[1] - 1 and then handed to
 * `argsort` together with the matching row of `X_feat` and the inclusive bounds
 * 0 and shape[1] - 1.
 * NOTE(review): `argsort` is defined elsewhere; presumably it reorders the indices so that
 * they sort the row's values — confirm against toolbox.hpp.
 *
 * Indices are stored as uint16_t, so column numbers above 65535 cannot be represented.
 *
 * @param X_feat Values to order, one row per index of the first dimension
 * @return Array with the same shape as `X_feat` holding the per-row indices
 */
np::Array<uint16_t> argsort_2d_cpu(const np::Array<int32_t>& X_feat) noexcept {
	// Not const: the array is filled below, and a const local cannot be moved from on return
	np::Array<uint16_t> indices = np::empty<uint16_t>(X_feat.shape);
	const size_t row_width = X_feat.shape[1];
	const size_t length = np::prod(X_feat.shape);

	// With empty rows there is nothing to sort, and shape[1] - 1 would wrap around
	if (row_width == 0)
		return indices;

	for (size_t row = 0; row < length; row += row_width) {
		for (size_t col = 0; col < row_width; ++col)
			indices[row + col] = static_cast<uint16_t>(col);
		argsort(&X_feat[row], &indices[row], 0, X_feat.shape[1] - 1);
	}
	return indices;
}