#include <filesystem>
#include "data.hpp"
#include "toolbox.hpp"
#include "config.hpp"
#include "toolbox_unit_test.hpp"
#include "ViolaJones.hpp"
#include "ViolaJones_device.hpp"

#if GPU_BOOSTED
#include "gpu_unit_test.hpp"
#define LABEL "GPU"
#else
#define LABEL "CPU"
#endif

/**
 * @brief Execute the preprocessing phase
 *
 * The preprocessing phase consist of the following steps :
 * - Load the dataset
 * - Calculate features
 * - Calculate integral images
 * - Apply features to images
 * - Calculate argsort of the featured images
 *
 * @return std::tuple<np::Array<int32_t>, np::Array<uint16_t>, np::Array<uint8_t>, np::Array<int32_t>, np::Array<uint8_t>> Tuple containing in order : training features, training features sorted indexes, training labels, testing features, testing labels
 */
std::tuple<np::Array<int32_t>, np::Array<uint16_t>, np::Array<uint8_t>, np::Array<int32_t>, np::Array<uint8_t>> preprocessing(void) {
	// Creating state saver folders if they don't exist already
	if (SAVE_STATE)
		for (const char* const folder_name : { "models", "out" })
			std::filesystem::create_directory(folder_name);

	const std::chrono::system_clock::time_point preproc_timestamp = perf_counter_ns();
	const std::array<int32_t, 3> preproc_gaps = { 49, -18, 29 };
	header(preproc_gaps, { "Preprocessing", "Time spent (ns)", "Formatted time spent" });

	const auto [ X_train, y_train, X_test, y_test ] = state_saver<uint8_t, 4>("Loading sets", preproc_gaps[0], { "X_train", "y_train", "X_test", "y_test" },
			FORCE_REDO, SAVE_STATE, OUT_DIR, load_datasets);

#if __DEBUG
	printf("X_train\n");
	print(X_train.shape);
	print(X_train, { IDX_INSPECT });
	printf("X_test\n");
	print(X_test.shape);
	print(X_test, { IDX_INSPECT });
	printf("y_train\n");
	print(y_train.shape);
	print(y_train, { IDX_INSPECT, IDX_INSPECT + IDX_INSPECT_OFFSET });
	printf("y_test\n");
	print(y_test.shape);
	print(y_test, { IDX_INSPECT, IDX_INSPECT + IDX_INSPECT_OFFSET });
#endif

	const np::Array<uint8_t> feats = state_saver<uint8_t>("Building features", preproc_gaps[0], "feats",
			FORCE_REDO, SAVE_STATE, OUT_DIR, build_features, X_train.shape[1], X_train.shape[2]);

#if __DEBUG
	printf("feats\n");
	print(feats.shape);
	print_feat(feats, { IDX_INSPECT });
#endif

	const np::Array<uint32_t> X_train_ii = state_saver<uint32_t>("Converting training set to integral images (" LABEL ")", preproc_gaps[0], "X_train_ii_" LABEL,
			FORCE_REDO, SAVE_STATE, OUT_DIR, set_integral_image, X_train);
	const np::Array<uint32_t> X_test_ii = state_saver<uint32_t>("Converting testing set to integral images (" LABEL ")", preproc_gaps[0], "X_test_ii_" LABEL,
			FORCE_REDO, SAVE_STATE, OUT_DIR, set_integral_image, X_test);

#if __DEBUG
	printf("X_train_ii\n");
	print(X_train_ii.shape);
	print(X_train_ii, { IDX_INSPECT });
	printf("X_test_ii\n");
	print(X_test_ii.shape);
	print(X_test_ii, { IDX_INSPECT });
#endif

	const np::Array<int32_t> X_train_feat = state_saver<int32_t>("Applying features to training set (" LABEL ")", preproc_gaps[0], "X_train_feat_" LABEL,
			FORCE_REDO, SAVE_STATE, OUT_DIR, apply_features, feats, X_train_ii);
	const np::Array<int32_t> X_test_feat = state_saver<int32_t>("Applying features to testing set (" LABEL ")", preproc_gaps[0], "X_test_feat_" LABEL,
			FORCE_REDO, SAVE_STATE, OUT_DIR, apply_features, feats, X_test_ii);

#if __DEBUG
	printf("X_train_feat\n");
	print(X_train_feat.shape);
	print(X_train_feat, { IDX_INSPECT, IDX_INSPECT + IDX_INSPECT_OFFSET });
	printf("X_test_feat\n");
	print(X_test_feat.shape);
	print(X_test_feat, { IDX_INSPECT, IDX_INSPECT + IDX_INSPECT_OFFSET });
#endif

	// const np::Array<int32_t> indices = state_saver<int32_t>("Selecting best features", preproc_gaps[0], "indices", select_percentile, X_train_feat, d.y_train);

#if __DEBUG
	// print_feature(indices);
#endif

	const np::Array<uint16_t> X_train_feat_argsort = state_saver<uint16_t>("Precalculating training set argsort (" LABEL ")", preproc_gaps[0], "X_train_feat_argsort_" LABEL,
			FORCE_REDO, SAVE_STATE, OUT_DIR, argsort_2d, X_train_feat);

#if __DEBUG
	printf("X_train_feat_argsort\n");
	print(X_train_feat_argsort.shape);
	print(X_train_feat_argsort, { IDX_INSPECT, IDX_INSPECT + IDX_INSPECT_OFFSET });
#endif

	const np::Array<uint16_t> X_test_feat_argsort = state_saver<uint16_t>("Precalculating testing set argsort (" LABEL ")", preproc_gaps[0], "X_test_feat_argsort_" LABEL,
			FORCE_REDO, SAVE_STATE, OUT_DIR, argsort_2d, X_test_feat);

#if __DEBUG
	printf("X_test_feat_argsort\n");
	print(X_test_feat_argsort.shape);
	print(X_test_feat_argsort, { IDX_INSPECT, IDX_INSPECT + IDX_INSPECT_OFFSET });
#endif
	const long long time_spent = duration_ns(perf_counter_ns() - preproc_timestamp);
	formatted_line(preproc_gaps, "├", "┼", "─", "┤");
	formatted_row(preproc_gaps, { "Preprocessing summary", thousand_sep(time_spent).c_str(), format_time_ns(time_spent).c_str() });
	footer(preproc_gaps);
	return { X_train_feat, X_train_feat_argsort, y_train, X_test_feat, y_test };
}

/**
 * @brief Train the weak classifiers.
 *
 * One model is trained for each value T of TS, through state_saver with MODEL_DIR as folder.
 * A table with the time spent for each model is printed along the way.
 *
 * @param X_train_feat Training features
 * @param X_train_feat_argsort Sorted indexes of the training features
 * @param y_train Training labels
 * @return List of trained models, one { alphas, final_classifiers } pair per value of TS, in the order of TS
 */
std::array<std::array<np::Array<float64_t>, 2>, TS.size()> train(const np::Array<int32_t>& X_train_feat, const np::Array<uint16_t>& X_train_feat_argsort, const np::Array<uint8_t>& y_train) noexcept {
	// NOTE(review): the function is noexcept, any exception escaping the calls below would terminate the program
	const std::chrono::system_clock::time_point training_timestamp = perf_counter_ns();
	const std::array<int32_t, 3> training_gaps = { 26, -18, 29 };
	header(training_gaps, { "Training", "Time spent (ns)", "Formatted time spent" });

	std::array<std::array<np::Array<float64_t>, 2>, TS.size()> models;

	size_t i = 0;
	for (const size_t T : TS) {
		char title[BUFFER_SIZE] = { 0 };
		char alphas_title[BUFFER_SIZE] = { 0 };
		char final_classifiers_title[BUFFER_SIZE] = { 0 };
		// T is a size_t, %zu is the matching conversion on every platform
		snprintf(title, BUFFER_SIZE, "ViolaJones T = %-4zu (%s)", T, LABEL);
		snprintf(alphas_title, BUFFER_SIZE, "alphas_%zu_%s", T, LABEL);
		snprintf(final_classifiers_title, BUFFER_SIZE, "final_classifiers_%zu_%s", T, LABEL);

		const auto [ alphas, final_classifiers ] = state_saver<float64_t, 2>(title, training_gaps[0], { alphas_title, final_classifiers_title },
				FORCE_REDO, SAVE_STATE, MODEL_DIR, train_viola_jones, T, X_train_feat, X_train_feat_argsort, y_train);
#if __DEBUG
		printf("alphas\n");
		print(alphas);
		printf("final_classifiers\n");
		print(final_classifiers);
#endif
		// Models are stored in the same order as TS
		models[i++] = { alphas, final_classifiers };
	}
	// Summary row with the total time spent since the start of the training
	const long long time_spent = duration_ns(perf_counter_ns() - training_timestamp);
	formatted_line(training_gaps, "├", "┼", "─", "┤");
	formatted_row(training_gaps, { "Training summary", thousand_sep(time_spent).c_str(), format_time_ns(time_spent).c_str() });
	footer(training_gaps);

	return models;
}

/**
 * @brief Benchmark the trained classifiers on the training and testing sets.
 *
 * Two tables are printed :
 * - "Testing" : time spent classifying the training set (columns marked E) and the testing set (columns marked T)
 * - "Evaluating" : accuracy, F1 score, false negatives and false positives on both sets
 *
 * @param models List of trained models, expected in the same order as TS
 * @param X_train_feat Training features
 * @param y_train Training labels
 * @param X_test_feat Testing features
 * @param y_test Testing labels
 */
void testing_and_evaluating(const std::array<std::array<np::Array<float64_t>, 2>, TS.size()>& models, const np::Array<int32_t>& X_train_feat, const np::Array<uint8_t>& y_train, const np::Array<int32_t>& X_test_feat, const np::Array<uint8_t>& y_test) {
	const std::array<int32_t, 5> testing_gaps = { 26, -19, 24, -19, 24 };
	header(testing_gaps, { "Testing", "Time spent (ns) (E)", "Formatted time spent (E)", "Time spent (ns) (T)", "Formatted time spent (T)" });
	// One row of metrics per model, in order : e_acc, e_f1, e_FN, e_FP, t_acc, t_f1, t_FN, t_FP
	std::array<std::array<float64_t, 8>, TS.size()> results;

	size_t i = 0;
	long long total_train_timestamp = 0;
	long long total_test_timestamp = 0;
	for (const auto& [ alphas, final_classifiers ] : models) {
		char title[BUFFER_SIZE] = { 0 };
		// The explicit cast makes the argument match %zu whatever the element type of TS is
		snprintf(title, BUFFER_SIZE, "ViolaJones T = %-4zu (%s)", static_cast<size_t>(TS[i]), LABEL);

		// Only the classification itself is timed, the metrics are computed afterwards
		std::chrono::system_clock::time_point start = perf_counter_ns();
		const np::Array<uint8_t> y_pred_train = classify_viola_jones(alphas, final_classifiers, X_train_feat);
		const long long t_pred_train = duration_ns(perf_counter_ns() - start);
		total_train_timestamp += t_pred_train;
		const float64_t e_acc = accuracy_score(y_train, y_pred_train);
		const float64_t e_f1 = f1_score(y_train, y_pred_train);
		float64_t e_FN, e_FP;
		// Only the second and third values of the confusion matrix are kept
		std::tie(std::ignore, e_FN, e_FP, std::ignore) = confusion_matrix(y_train, y_pred_train);

		start = perf_counter_ns();
		const np::Array<uint8_t> y_pred_test = classify_viola_jones(alphas, final_classifiers, X_test_feat);
		const long long t_pred_test = duration_ns(perf_counter_ns() - start);
		total_test_timestamp += t_pred_test;
		const float64_t t_acc = accuracy_score(y_test, y_pred_test);
		const float64_t t_f1 = f1_score(y_test, y_pred_test);
		float64_t t_FN, t_FP;
		std::tie(std::ignore, t_FN, t_FP, std::ignore) = confusion_matrix(y_test, y_pred_test);
		results[i++] = { e_acc, e_f1, e_FN, e_FP, t_acc, t_f1, t_FN, t_FP };

		formatted_row(testing_gaps, { title, thousand_sep(t_pred_train).c_str(), format_time_ns(t_pred_train).c_str(), thousand_sep(t_pred_test).c_str(), format_time_ns(t_pred_test).c_str() });
	}
	formatted_line(testing_gaps, "├", "┼", "─", "┤");
	formatted_row(testing_gaps, { "Testing summary", thousand_sep(total_train_timestamp).c_str(), format_time_ns(total_train_timestamp).c_str(), thousand_sep(total_test_timestamp).c_str(), format_time_ns(total_test_timestamp).c_str() });
	footer(testing_gaps);

	const std::array<int32_t, 9> evaluating_gaps = { 19, -7, -6, -6, -6, -7, -6, -6, -6 };
	header(evaluating_gaps, { "Evaluating", "ACC (E)", "F1 (E)", "FN (E)", "FP (E)", "ACC (T)", "F1 (T)", "FN (T)", "FP (T)"});

	// Second pass over the stored metrics, the rows are printed by hand to control the number formats
	i = 0;
	for (const size_t T : TS) {
		char title[BUFFER_SIZE] = { 0 };
		snprintf(title, BUFFER_SIZE, "ViolaJones T = %-4zu", T);
		const auto [e_acc, e_f1, e_FN, e_FP, t_acc, t_f1, t_FN, t_FP] = results[i++];
		printf("│ %-19s │ %'6.2f%% │ %'6.2f │ %'6.0f │ %'6.0f │ %6.2f%% │ %'6.2f │ %'6.0f │ %'6.0f │\n", title, e_acc * 100, e_f1, e_FN, e_FP, t_acc * 100, t_f1, t_FN, t_FP);
	}
	footer(evaluating_gaps);
}

/**
 * @brief Test if the each result is equals to other devices.
 *
 * Given ViolaJones is a fully deterministic algorithm. The results, regardless the device, should be the same,
 * this function check this assertion.
 */
void unit_test(void) {
	const std::chrono::system_clock::time_point unit_timestamp = perf_counter_ns();
	const std::array<int32_t, 4> unit_gaps = { 37, -10, -18, 29};
	header(unit_gaps, { "Unit testing", "Test state", "Time spent (ns)", "Formatted time spent" });

	char title[BUFFER_SIZE] = { 0 };
	char tmp_title[BUFFER_SIZE / 2] = { 0 };
	char file_cpu[BUFFER_SIZE] = { 0 };
	char file_gpu[BUFFER_SIZE] = { 0 };
	uint64_t n_total = 0, n_success = 0;

	const auto test_fnc = [&unit_gaps, &n_total, &n_success](const char* const title, const auto& fnc) noexcept {
		++n_total;
		const std::chrono::system_clock::time_point start = perf_counter_ns();
		const bool state = fnc();
		const long long time_spent = duration_ns(perf_counter_ns() - start);
		if(state){
			formatted_row(unit_gaps, { title, "Passed", thousand_sep(time_spent).c_str(), format_time_ns(time_spent).c_str() });
			++n_success;
		} else
			formatted_row(unit_gaps, { title, "Failed", thousand_sep(time_spent).c_str(), format_time_ns(time_spent).c_str() });
	};

	for (const char* const label : { "train", "test" }) {
		snprintf(file_cpu, BUFFER_SIZE, OUT_DIR "/X_%s_ii_CPU.bin", label);
		snprintf(file_gpu, BUFFER_SIZE, OUT_DIR "/X_%s_ii_GPU.bin", label);
		if (std::filesystem::exists(file_cpu) && std::filesystem::exists(file_gpu)) {
			snprintf(tmp_title, BUFFER_SIZE / 2, "X_%s_ii", label);
			snprintf(title, BUFFER_SIZE, "%-22s - CPU  vs GPU", tmp_title);
			test_fnc(title, [&file_cpu, &file_gpu]{
				const np::Array<uint32_t> X_train_ii_cpu = load<uint32_t>(file_cpu);
				const np::Array<uint32_t> X_train_ii_gpu = load<uint32_t>(file_gpu);
				return unit_test_cpu_vs_gpu<uint32_t>(X_train_ii_cpu, X_train_ii_gpu);
			});
		}
		snprintf(file_cpu, BUFFER_SIZE, OUT_DIR "/X_%s_feat_CPU.bin", label);
		snprintf(file_gpu, BUFFER_SIZE, OUT_DIR "/X_%s_feat_GPU.bin", label);
		uint8_t feat = 0;
		char file_feat[BUFFER_SIZE] = { 0 };
		if (std::filesystem::exists(file_cpu)) {
			strncpy(file_feat, file_cpu, BUFFER_SIZE);
			feat = 1;
		} else if (std::filesystem::exists(file_gpu)) {
			strncpy(file_feat, file_gpu, BUFFER_SIZE);
			feat = 2;
		}
		if (feat != 0) {
			const np::Array<int32_t> X_feat = load<int32_t>(file_feat);
			snprintf(file_gpu, BUFFER_SIZE, feat == 1 ? OUT_DIR "/X_%s_feat_GPU.bin" : OUT_DIR "/X_%s_feat_CPU.bin", label);
			if (std::filesystem::exists(file_gpu)) {
				snprintf(tmp_title, BUFFER_SIZE / 2, "X_%s_feat", label);
				snprintf(title, BUFFER_SIZE, "%-22s - CPU  vs GPU", tmp_title);
				test_fnc(title, [&X_feat, &file_gpu]{
					const np::Array<int32_t> X_feat_aux = load<int32_t>(file_gpu);
					return unit_test_cpu_vs_gpu<int32_t>(X_feat, X_feat_aux);
				});
			}
			snprintf(file_cpu, BUFFER_SIZE, OUT_DIR "/X_%s_feat_argsort_CPU.bin", label);
			np::Array<uint16_t> X_feat_argsort_cpu;
			uint8_t loaded = 0;
			if (std::filesystem::exists(file_cpu)) {
				++loaded;
				snprintf(tmp_title, BUFFER_SIZE / 2, "X_%s_feat_argsort", label);
				snprintf(title, BUFFER_SIZE, "%-22s - CPU  argsort", tmp_title);
				test_fnc(title, [&X_feat, &X_feat_argsort_cpu, &file_cpu]{
					X_feat_argsort_cpu = load<uint16_t>(file_cpu);
					return unit_test_argsort_2d<int32_t>(X_feat, X_feat_argsort_cpu);
				});
			}
			snprintf(file_gpu, BUFFER_SIZE, OUT_DIR "/X_%s_feat_argsort_GPU.bin", label);
			np::Array<uint16_t> X_feat_argsort_gpu;
			if (std::filesystem::exists(file_gpu)) {
				++loaded;
				snprintf(tmp_title, BUFFER_SIZE / 2, "X_%s_feat_argsort", label);
				snprintf(title, BUFFER_SIZE, "%-22s - GPU  argsort", tmp_title);
				test_fnc(title, [&X_feat, &X_feat_argsort_gpu, &file_gpu]{
					X_feat_argsort_gpu = load<uint16_t>(file_gpu);
					return unit_test_argsort_2d<int32_t>(X_feat, X_feat_argsort_gpu);
				});
			}
			if (loaded == 2){
				snprintf(tmp_title, BUFFER_SIZE / 2, "X_%s_feat_argsort", label);
				snprintf(title, BUFFER_SIZE, "%-22s - CPU  vs GPU", tmp_title);
				test_fnc(title, [&X_feat_argsort_cpu, &X_feat_argsort_gpu]{ return unit_test_cpu_vs_gpu<uint16_t>(X_feat_argsort_cpu, X_feat_argsort_gpu); });
			}
		}
	}

	for (const size_t T : TS)
		for (const char* const label : { "alphas", "final_classifiers" }) {
			snprintf(file_cpu, BUFFER_SIZE, MODEL_DIR "/%s_%lu_CPU.bin", label, T);
			snprintf(file_gpu, BUFFER_SIZE, MODEL_DIR "/%s_%lu_GPU.bin", label, T);
			if (std::filesystem::exists(file_cpu) && std::filesystem::exists(file_gpu)){
				snprintf(tmp_title, BUFFER_SIZE / 2, "%s_%ld", label, T);
				snprintf(title, BUFFER_SIZE, "%-22s - CPU  vs GPU", tmp_title);
				test_fnc(title, [&file_cpu, &file_gpu]{
					const np::Array<float64_t> cpu = load<float64_t>(file_cpu);
					const np::Array<float64_t> gpu = load<float64_t>(file_gpu);
					return unit_test_cpu_vs_gpu<float64_t>(cpu, gpu);
				});
			}
		}

	const long long time_spent = duration_ns(perf_counter_ns() - unit_timestamp);

	if (n_total == 0)
		formatted_row(unit_gaps, { "Unit testing summary", "No files", thousand_sep(time_spent).c_str(), format_time_ns(time_spent).c_str() });
	else {
		snprintf(title, BUFFER_SIZE, "%ld/%ld", n_success, n_total);
		formatted_line(unit_gaps, "├", "┼", "─", "┤");
		formatted_row(unit_gaps, { "Unit testing summary", title, thousand_sep(time_spent).c_str(), format_time_ns(time_spent).c_str() });
	}
	footer(unit_gaps);
}

/**
 * @brief Program entry point.
 *
 * Runs the toolbox self checks, then the whole pipeline : preprocessing, training,
 * testing/evaluating and finally the comparison of the saved results.
 *
 * @return EXIT_SUCCESS once every phase has been executed
 */
int32_t main(void){
	// Use the numeric locale of the environment so that numbers are displayed properly
	setlocale(LC_NUMERIC, "");

	// First table : checks of the formatting helpers, preceded by the GPU checks on a GPU build
	const std::chrono::system_clock::time_point checks_start = perf_counter_ns();
	const std::array<int32_t, 3> checks_gaps = { 27, -18, 29 };
	header(checks_gaps, { "Unit testing", "Time spent (ns)", "Formatted time spent" });
#if GPU_BOOSTED
	benchmark_function_void("Testing GPU capabilities 1D", checks_gaps[0], test_working, 50000);
	benchmark_function_void("Testing GPU capabilities 2D", checks_gaps[0], test_working_2d, 200, 500);
	benchmark_function_void("Testing GPU capabilities 3D", checks_gaps[0], test_working_3d, 30, 40, 500);
#endif
	benchmark_function_void("Testing format_time", checks_gaps[0], format_time_test);
	benchmark_function_void("Testing format_time_ns", checks_gaps[0], format_time_ns_test);
	benchmark_function_void("Testing format_byte_size", checks_gaps[0], format_byte_size_test);
	benchmark_function_void("Testing thousand_sep", checks_gaps[0], thousand_sep_test);

	// Summary row of the first table
	const long long checks_elapsed = duration_ns(perf_counter_ns() - checks_start);
	formatted_line(checks_gaps, "├", "┼", "─", "┤");
	formatted_row(checks_gaps, { "Unit testing summary", thousand_sep(checks_elapsed).c_str(), format_time_ns(checks_elapsed).c_str() });
	footer(checks_gaps);

	// Pipeline : each phase consumes the results of the previous ones
	const auto [ train_feat, train_feat_argsort, train_labels, test_feat, test_labels ] = preprocessing();
	const auto models = train(train_feat, train_feat_argsort, train_labels);
	testing_and_evaluating(models, train_feat, train_labels, test_feat, test_labels);

	// Cross device checks on whatever the previous phases (and earlier runs) left on disk
	unit_test();

	return EXIT_SUCCESS;
}