import numpy as np
from numba import cuda, config, njit
config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
#import matplotlib.pyplot as plt
from tqdm import tqdm
from time import perf_counter_ns
from toolbox import format_time_ns
from pickle import load, dump
from sys import argv

def get(a):
	"""Load and return the object pickled in the file ``{a}.pkl``.

	Args:
		a: Path of the pickle file without its ``.pkl`` extension.

	Returns:
		Whatever object was stored in the file.

	Note:
		Unpickling can execute arbitrary code, so only use this on trusted files.
	"""
	path = f"{a}.pkl"
	with open(path, 'rb') as handle:
		content = load(handle)
	return content

def save(a, name) -> None:
	"""Pickle the object ``a`` into the file ``name``.

	Args:
		a: Object to serialize.
		name: Destination path, used verbatim (no extension is appended).
			Any existing file at that path is overwritten.
	"""
	with open(name, 'wb') as handle:
		dump(a, handle)

def diff(folder, a, label1, label2):
	"""Print the mean difference between two pickled objects.

	Loads ``{folder}/{a}_{label1}.pkl`` and ``{folder}/{a}_{label2}.pkl``,
	subtracts the second from the first and prints the ``.mean()`` of the result.

	Args:
		folder: Directory containing both pickle files.
		a: Base name shared by both files.
		label1: Suffix identifying the first file.
		label2: Suffix identifying the second file.
	"""
	first = get(f"{folder}/{a}_{label1}")
	second = get(f"{folder}/{a}_{label2}")
	delta = first - second
	print(delta.mean())

# Optional comparison mode, triggered by exactly four command-line arguments:
#   argv[1] = folder, argv[2] = label1, argv[3] = label2, argv[4] = pickle base name.
# There is no exit here, so the rest of the module still runs afterwards.
if __name__ == "__main__" and len(argv) == 5:
	diff(argv[1], argv[4], argv[2], argv[3])

def py_mean(a, b):
	"""Mean of the element-wise product of ``a`` and ``b``, using a plain Python loop.

	Args:
		a: Array-like exposing ``shape``; its first dimension is the divisor.
		b: Iterable paired element by element with ``a``.

	Returns:
		The accumulated sum of ``a_i * b_i`` divided by ``a.shape[0]``.
	"""
	total = 0.0
	pairs = zip(a, b)
	for x, y in pairs:
		total = total + x * y
	count = a.shape[0]
	return total / count

def np_mean(a, b):
	"""Mean of the element-wise product of ``a`` and ``b``, computed with NumPy.

	Args:
		a: First operand of the product.
		b: Second operand of the product.

	Returns:
		``np.mean`` of ``a * b``.
	"""
	products = a * b
	return np.mean(products)

@njit('float64(float64[:], float64[:])', fastmath = True, nogil = True)
def nb_mean(a, b):
	"""Mean of the element-wise product of ``a`` and ``b``, compiled with Numba.

	The decorator declares two 1-D float64 arrays as input and a float64 result.
	``fastmath`` is enabled, so the value may differ slightly from the other
	implementations.
	"""
	products = a * b
	return np.mean(products)

@njit('float64(float64[:], float64[:])', fastmath = True, nogil = True)
def nb_mean_loop(a, b):
	"""Mean of the element-wise product of ``a`` and ``b``, as an explicit Numba loop.

	The decorator declares two 1-D float64 arrays as input and a float64 result.
	The sum of ``a_i * b_i`` is divided by ``a.shape[0]``.
	"""
	total = 0.0
	for x, y in zip(a, b):
		total += x * y
	count = a.shape[0]
	return total / count

@cuda.jit('void(float64[:], float64[:], float64[:])', fastmath = True)
def cuda_mean_kernel(r, a, b):
	"""CUDA kernel storing the mean of the element-wise product of ``a`` and ``b`` in ``r[0]``.

	The kernel does no thread indexing: each thread that runs it walks both
	arrays in full and writes ``r[0]``.

	Args:
		r: Output array; only index 0 is written.
		a: First input array; its first dimension is the divisor.
		b: Second input array.
	"""
	total = 0.0
	for x, y in zip(a, b):
		total += x * y
	count = a.shape[0]
	r[0] = total / count

def cuda_mean(a, b):
	"""Mean of the element-wise product of ``a`` and ``b``, computed on the GPU.

	Copies both inputs to the device, launches ``cuda_mean_kernel`` with one
	block of one thread, and copies the one-element result back to the host.

	Args:
		a: First host array.
		b: Second host array.

	Returns:
		The single value written by the kernel.
	"""
	d_a = cuda.to_device(a)
	d_b = cuda.to_device(b)
	d_result = cuda.to_device(np.empty(1, dtype = np.float64))
	cuda_mean_kernel[1, 1](d_result, d_a, d_b)
	host_result = d_result.copy_to_host()
	return host_result[0]

def test_and_compare(labels, fncs, a, b):
	"""Run every function on ``(a, b)`` and print results, timings and pairwise gaps.

	Args:
		labels: Display names, one per entry of ``fncs`` and in the same order.
		fncs: Callables invoked as ``fnc(a, b)``. Their return values must
			support subtraction and ``abs`` for the comparison table.
		a: First argument forwarded to every callable.
		b: Second argument forwarded to every callable.

	The report is printed to stdout; nothing is returned.
	"""
	runs = []
	for fnc in tqdm(fncs, leave = False, desc = "Calculating..."):
		start = perf_counter_ns()
		value = fnc(a, b)
		runs.append((value, perf_counter_ns() - start))

	# Pair labels with measurements once so both reports walk the same rows.
	rows = list(zip(labels, runs))

	print("Results:")
	for label, (value, elapsed) in rows:
		print(f"\t{label:<10} {value:<20} {format_time_ns(elapsed)}")

	print("Comparaison:")
	for i, (label_i, (value_i, _)) in enumerate(rows):
		# Only rows after the current one: each unordered pair is reported once.
		for label_j, (value_j, _) in rows[i + 1:]:
			print(f"\t{label_i:<10} vs {label_j:<10} - {abs(value_i - value_j)}")

# Benchmark driver: runs every mean implementation on two data sets and prints the report.
if __name__ == "__main__":
	np.set_printoptions(linewidth = 10000, threshold = 1000)

	N = int(2**20)
	fncs = [py_mean, np_mean, nb_mean, nb_mean_loop, cuda_mean]
	labels = ["Python", "Numpy", "Numba", "Numba loop", "CUDA"]
	units = ("B", "kB", "MB", "GB")

	# Case 1: two arrays of N random float64 values (8 bytes each).
	print(f"RANDOM for N={N}")
	total_size = (2 * 8 * N)
	for power, unit in enumerate(units):
		print(f"Size = {total_size // 1024**power} {unit}")
	a = np.random.rand(N).astype(np.float64)
	b = np.random.rand(N).astype(np.float64)
	test_and_compare(labels, fncs, a, b)
	del a, b

	# Case 2: a is 0.1 on its first half and 1.0 on the rest, b is all ones,
	# so the mathematically exact mean is 0.55. The size also counts the uint64 mask.
	print(f"\nDETERMINSTIC for N={N}")
	total_size = (2 * 8 * N) + (8 * N)
	for power, unit in enumerate(units):
		print(f"Size = {total_size // 1024**power} {unit}")
	mask = np.arange(N, dtype = np.uint64)
	a = np.ones(N, dtype = np.float64)
	a[mask < N//2] = 0.1
	del mask
	b = np.ones(N, dtype = np.float64)
	test_and_compare(labels, fncs, a, b)
	del a, b

	#from ViolaJonesGPU import argsort as argsort_GPU
	#from ViolaJonesCPU import argsort as argsort_CPU
	#from toolbox import unit_test_argsort_2d, benchmark_function

	#labels = ["Numpy", "Numba", "CUDA"]
	#a = np.random.randint(2**12, size = (2**20, 2**8), dtype = np.int32)
	#m = [benchmark_function(f"Argsort {label}", lambda: f(np.copy(a))) for (label, f) in zip(labels, [
	#	lambda a: np.argsort(a).astype(np.uint16), argsort_CPU, argsort_GPU
	#])]
	#for i, (m_i, label_i) in enumerate(zip(m, labels)):
	#	#for j, (m_j, label_j) in enumerate(zip(m, labels)):
	#	#	if i >= j:
	#	#		continue
	#	#	print(f"\t{label_i:<10} vs {label_j:<10} - {(m_i == m_j).mean()}")
	#	benchmark_function(f"Unit test {label_i}", lambda: unit_test_argsort_2d(a, m_i))

	#for i in tqdm(range(X.shape[0]), leave = False, desc = "Extract image"):
	#	x = X[i]
	#	y = Y[i]
	#	fig = plt.figure()
	#	plt.imshow(x, cmap = 'gray')
	#	plt.savefig(f"imgs/{y}/{i}.png")
	#	plt.close(fig)

	#def extract_FD(Xy):
	#	X_c, Y_c = [], []
	#	for x,y in Xy:
	#		X_c.append(x)
	#		Y_c.append(y)
	#	X_c = np.asarray(X_c)
	#	Y_c = np.asarray(Y_c)
	#	return X_c, Y_c

	#X_train, y_train = get('out/X_train'), get('out/y_train')
	#X_test, y_test = get('out/X_test'), get('out/y_test')

	#X_train, y_train = extract_FD(get('/home/_aspil0w/git/FaceDetection/training'))
	#X_test, y_test = extract_FD(get('/home/_aspil0w/git/FaceDetection/test'))
	#save(X_train, 'out/X_train'), save(y_train, 'out/y_train')
	#save(X_test, 'out/X_test'), save(y_test, 'out/y_test')

	#print(X_train.shape, X_train_org.shape, X_train.shape == X_train_org.shape)
	#print((X_train == X_train_org).mean())
	#print(y_train.shape, y_train_org.shape, y_train.shape == y_train_org.shape)
	#print((y_train == y_train_org).mean())

	#print(X_test.shape, X_test_org.shape, X_test.shape == X_test_org.shape)
	#print((X_test == X_test_org).mean())
	#print(y_test.shape, y_test_org.shape, y_test.shape == y_test_org.shape)
	#print((y_test == y_test_org).mean())

	#@njit('uint16[:](uint8[:, :, :], uint8[:, :, :])')
	#def arg_find(X, X_org):
	#	arg = np.empty(X.shape[0], dtype = np.uint16)
	#	for i, x in enumerate(X_org):
	#		found = False
	#		for j, x_org in enumerate(X):
	#			if np.all(x == x_org):
	#				arg[i] = j
	#				found = True
	#				break
	#		assert found, "Image not found"
	#	return arg

	#print("Arg find results train")
	#arg_train = arg_find(X_train, X_train_org)
	#print((X_train[arg_train] == X_train_org).mean())
	#print((y_train[arg_train] == y_train_org).mean())

	#print("Arg find results test")
	#arg_test = arg_find(X_test, X_test_org)
	#print((X_test[arg_test] == X_test_org).mean())
	#print((y_test[arg_test] == y_test_org).mean())

	#for i in tqdm(range(X_c.shape[0]), leave = False, desc = "Extract image"):
	#	x = X_c[i]
	#	y = Y_c[i]
	#	fig = plt.figure()
	#	plt.imshow(x, cmap = 'gray')
	#	plt.savefig(f"imgs2/{y}/{i}.png")
	#	plt.close(fig)