From 8740f7ea4bce567f535ebbbdbaead42d4b206866 Mon Sep 17 00:00:00 2001
From: saundersp <pierre.saundersgb@gmail.com>
Date: Sat, 27 Apr 2024 20:47:30 +0200
Subject: [PATCH] moved download_data.sh to separate module downloader

---
 download_data.sh              | 42 ------------------------
 downloader/activate.sh        | 27 ++++++++++++++++
 downloader/convert_dataset.py | 60 +++++++++++++++++++++++++++++++++++
 downloader/download_data.sh   | 38 ++++++++++++++++++++++
 downloader/requirements.txt   |  2 ++
 5 files changed, 127 insertions(+), 42 deletions(-)
 delete mode 100755 download_data.sh
 create mode 100755 downloader/activate.sh
 create mode 100644 downloader/convert_dataset.py
 create mode 100755 downloader/download_data.sh
 create mode 100644 downloader/requirements.txt

diff --git a/download_data.sh b/download_data.sh
deleted file mode 100755
index e7f9cd9..0000000
--- a/download_data.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/usr/bin/env bash
-#!/bin/sh
-
-# Exit if any of the command doesn't exit with code 0
-set -e
-
-EXEC_DIR=$1
-test -z $EXEC_DIR && EXEC_DIR=.
-DATA_LOCATION=$EXEC_DIR/data
-mkdir -p $DATA_LOCATION
-
-if [ ! -f $DATA_LOCATION/X_train.bin ] || [ ! -f $DATA_LOCATION/X_test.bin ] \
-|| [ ! -f $DATA_LOCATION/y_train.bin ] || [ ! -f $DATA_LOCATION/y_test.bin ]; then
-#if true; then
-	if [ ! -f $DATA_LOCATION/faces.tar.gz ]; then
-		echo 'Downloading raw dataset'
-		curl -o $DATA_LOCATION/faces.tar.gz http://www.ai.mit.edu/courses/6.899/lectures/faces.tar.gz
-	fi
-
-	echo 'Extracting raw files'
-	tar xzf $DATA_LOCATION/faces.tar.gz -C $DATA_LOCATION
-	rm $DATA_LOCATION/README
-	rm $DATA_LOCATION/svm.*
-
-	echo 'Extracting raw train set'
-	tar xzf $DATA_LOCATION/face.train.tar.gz -C $DATA_LOCATION
-	rm $DATA_LOCATION/face.train.tar.gz
-
-	echo 'Extracting raw test set'
-	tar xzf $DATA_LOCATION/face.test.tar.gz -C $DATA_LOCATION
-	rm $DATA_LOCATION/face.test.tar.gz
-
-	echo 'Converting raw dataset to bin file'
-	source $EXEC_DIR/python/activate.sh $EXEC_DIR
-	python $EXEC_DIR/python/convert_dataset.py $DATA_LOCATION
-
-	echo 'Removing leftovers'
-	rm -rf $DATA_LOCATION/train
-	rm -rf $DATA_LOCATION/test
-
-	echo 'Done !'
-fi
diff --git a/downloader/activate.sh b/downloader/activate.sh
new file mode 100755
index 0000000..b8433b1
--- /dev/null
+++ b/downloader/activate.sh
@@ -0,0 +1,27 @@
+#!/bin/sh
+# Create (when missing) and activate the python virtual environment located at "$VENV_PATH"
+# Exit if any command doesn't exit with code 0
+set -e
+
+test -z "$EXEC_DIR" && EXEC_DIR=.
+test -z "$VENV_PATH" && VENV_PATH="$EXEC_DIR/venv"
+
+activate(){ # Create the virtual environment on first use, then source its activation script
+	if [ ! -d "$VENV_PATH" ]; then
+		echo 'Creating python virtual environnement'
+		python -m venv "$VENV_PATH"
+		echo 'Activating virtual environnement'
+		activate # "$VENV_PATH" exists now, so this nested call takes the elif/else path
+		echo 'Updating base pip packages'
+		python -m pip install -U setuptools pip
+		echo 'Installing requirements'
+		pip install -r "$EXEC_DIR"/requirements.txt # resolved from EXEC_DIR, not from the caller's working directory
+	elif [ -f "$VENV_PATH"/Scripts/activate ]; then . "$VENV_PATH"/Scripts/activate # venv layout on Windows
+	elif [ -f "$VENV_PATH"/bin/activate ]; then . "$VENV_PATH"/bin/activate # venv layout on POSIX
+	else
+		echo 'Python virtual environnement not detected'
+		exit 1 # when this file is sourced, this also terminates the sourcing script
+	fi
+}
+
+activate
diff --git a/downloader/convert_dataset.py b/downloader/convert_dataset.py
new file mode 100644
index 0000000..f756b01
--- /dev/null
+++ b/downloader/convert_dataset.py
@@ -0,0 +1,60 @@
+from io import BufferedReader
+from tqdm import tqdm
+from functools import partial
+from sys import argv
+import numpy as np
+from os import path, listdir
+
+# Seed NumPy's global generator so np.random.permutation yields the same shuffle on every run
+np.random.seed(133742)
+
+# Rebind tqdm so that every progress bar created below defaults to leave = False
+tqdm = partial(tqdm, leave = False)
+
+def read_pgm(pgm_file: BufferedReader) -> np.ndarray:
+	"""Read the data of a binary (P5) PGM file
+
+	Args:
+		pgm_file (BufferedReader): PGM file opened in binary mode, positioned at its first byte
+
+	Returns:
+		np.ndarray: Pixel values as a writable uint8 array of shape (height, width)
+	"""
+	assert (f := pgm_file.readline()) == b'P5\n', f"Incorrect file format: {f}"
+	(width, height) = [int(i) for i in pgm_file.readline().split()]
+	assert width > 0 and height > 0, f"Incorrect dimensions: {width}x{height}"
+	assert (depth := int(pgm_file.readline())) < 256, f"Incorrect depth: {depth}"
+
+	# Read the whole raster in one call instead of one byte at a time
+	raw = pgm_file.read(height * width)
+	assert len(raw) == height * width, f"Truncated pixel data: got {len(raw)} bytes for {width}x{height}"
+	return np.frombuffer(raw, dtype = np.uint8).reshape((height, width)).copy() # copy: frombuffer on bytes is read-only
+
+def __main__(data_path: str) -> None:
+	"""Read every PGM file of the train and test sets and write them shuffled to text data files
+
+	Args:
+		data_path (str): Directory holding the train and test folders; the X_*.bin and y_*.bin files are written there
+	"""
+	for set_name in tqdm(["train", "test"], desc = "set name"):
+		X, y = [], []
+		for y_i, label in enumerate(tqdm(["non-face", "face"], desc = "label")):
+			# listdir order is arbitrary: sort it so the seeded shuffle is the same on every machine
+			for filename in tqdm(sorted(listdir(f"{data_path}/{set_name}/{label}")), desc = "Reading pgm file"):
+				with open(f"{data_path}/{set_name}/{label}/{filename}", "rb") as face:
+					X.append(read_pgm(face))
+					y.append(y_i)
+
+		idx = np.random.permutation(len(y))
+		X, y = np.asarray(X)[idx], np.asarray(y)[idx]
+
+		for org, s in tqdm(zip("Xy", [X, y]), desc = f"Writing {set_name}"):
+			with open(f"{data_path}/{org}_{set_name}.bin", "w") as out:
+				out.write(" ".join(map(str, s.shape)) + "\n") # first line: space separated shape
+				raw = s.ravel()
+				for s_i in tqdm(raw[:-1], desc = f"Writing {org}"):
+					out.write(f"{s_i} ")
+				out.write(str(raw[-1])) # last value is written without a trailing space
+
+if __name__ == "__main__": # Expects exactly one argument (the data directory), prints the usage otherwise
+	__main__(argv[1]) if len(argv) == 2 else print(f"Usage: python {path.basename(__file__)} ./data_location")
diff --git a/downloader/download_data.sh b/downloader/download_data.sh
new file mode 100755
index 0000000..121b8d1
--- /dev/null
+++ b/downloader/download_data.sh
@@ -0,0 +1,38 @@
+#!/bin/sh
+# Fetch the raw faces archive and convert it into the X/y .bin files of "$EXEC_DIR"/../data
+# Exit if any command doesn't exit with code 0
+set -e
+
+test -z "$EXEC_DIR" && EXEC_DIR=.
+DATA_PATH="$EXEC_DIR/../data"
+test ! -d "$DATA_PATH" && mkdir -v "$DATA_PATH"
+
+if [ ! -f "$DATA_PATH"/X_train.bin ] || [ ! -f "$DATA_PATH"/X_test.bin ] \
+|| [ ! -f "$DATA_PATH"/y_train.bin ] || [ ! -f "$DATA_PATH"/y_test.bin ]; then
+	if [ ! -f "$DATA_PATH"/faces.tar.gz ]; then
+		echo 'Downloading raw dataset'
+		# -f: fail on HTTP errors instead of caching the error page as the archive, -L: follow redirects
+		curl -fL -o "$DATA_PATH"/faces.tar.gz http://www.ai.mit.edu/courses/6.899/lectures/faces.tar.gz
+	fi
+	echo 'Extracting raw files'
+	tar xvzf "$DATA_PATH"/faces.tar.gz -C "$DATA_PATH"
+	rm -v "$DATA_PATH"/README "$DATA_PATH"/svm.*
+
+	echo 'Extracting raw train set'
+	tar xvzf "$DATA_PATH"/face.train.tar.gz -C "$DATA_PATH"
+	rm -v "$DATA_PATH"/face.train.tar.gz
+
+	echo 'Extracting raw test set'
+	tar xvzf "$DATA_PATH"/face.test.tar.gz -C "$DATA_PATH"
+	rm -v "$DATA_PATH"/face.test.tar.gz
+
+	echo 'Converting raw dataset to bin file'
+	export EXEC_DIR
+	. "$EXEC_DIR"/activate.sh
+	python "$EXEC_DIR"/convert_dataset.py "$DATA_PATH"
+
+	echo 'Removing leftovers'
+	rm -rvf "$DATA_PATH"/train "$DATA_PATH"/test
+
+	echo 'Done !'
+fi
diff --git a/downloader/requirements.txt b/downloader/requirements.txt
new file mode 100644
index 0000000..10d4523
--- /dev/null
+++ b/downloader/requirements.txt
@@ -0,0 +1,2 @@
+numpy==1.26.4
+tqdm==4.66.2