diff --git a/download_data.sh b/download_data.sh
deleted file mode 100755
index e7f9cd9..0000000
--- a/download_data.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/usr/bin/env bash
-#!/bin/sh
-
-# Exit if any of the command doesn't exit with code 0
-set -e
-
-EXEC_DIR=$1
-test -z $EXEC_DIR && EXEC_DIR=.
-DATA_LOCATION=$EXEC_DIR/data
-mkdir -p $DATA_LOCATION
-
-if [ ! -f $DATA_LOCATION/X_train.bin ] || [ ! -f $DATA_LOCATION/X_test.bin ] \
-|| [ ! -f $DATA_LOCATION/y_train.bin ] || [ ! -f $DATA_LOCATION/y_test.bin ]; then
-#if true; then
-    if [ ! -f $DATA_LOCATION/faces.tar.gz ]; then
-        echo 'Downloading raw dataset'
-        curl -o $DATA_LOCATION/faces.tar.gz http://www.ai.mit.edu/courses/6.899/lectures/faces.tar.gz
-    fi
-
-    echo 'Extracting raw files'
-    tar xzf $DATA_LOCATION/faces.tar.gz -C $DATA_LOCATION
-    rm $DATA_LOCATION/README
-    rm $DATA_LOCATION/svm.*
-
-    echo 'Extracting raw train set'
-    tar xzf $DATA_LOCATION/face.train.tar.gz -C $DATA_LOCATION
-    rm $DATA_LOCATION/face.train.tar.gz
-
-    echo 'Extracting raw test set'
-    tar xzf $DATA_LOCATION/face.test.tar.gz -C $DATA_LOCATION
-    rm $DATA_LOCATION/face.test.tar.gz
-
-    echo 'Converting raw dataset to bin file'
-    source $EXEC_DIR/python/activate.sh $EXEC_DIR
-    python $EXEC_DIR/python/convert_dataset.py $DATA_LOCATION
-
-    echo 'Removing leftovers'
-    rm -rf $DATA_LOCATION/train
-    rm -rf $DATA_LOCATION/test
-
-    echo 'Done !'
-fi
diff --git a/downloader/activate.sh b/downloader/activate.sh
new file mode 100755
--- /dev/null
+++ b/downloader/activate.sh
@@ -0,0 +1,37 @@
+#!/bin/sh
+
+# Create (on first use) and activate the Python virtual environment.
+# Meant to be sourced (". activate.sh") so the activation applies to the caller.
+#
+# Environment:
+#   EXEC_DIR  - directory containing requirements.txt (default: .)
+#   VENV_PATH - location of the virtual environment (default: $EXEC_DIR/venv)
+
+# Exit if any of the command doesn't exit with code 0
+set -e
+
+test -z "$EXEC_DIR" && EXEC_DIR=.
+test -z "$VENV_PATH" && VENV_PATH="$EXEC_DIR/venv"
+
+# Activate the virtual environment, creating and provisioning it when missing
+activate(){
+    if [ ! -d "$VENV_PATH" ]; then
+        echo 'Creating python virtual environnement'
+        python -m venv "$VENV_PATH"
+        echo 'Activating virtual environnement'
+        # The directory exists now, so this nested call skips the creation branch
+        activate
+        echo 'Updating base pip packages'
+        python -m pip install -U setuptools pip
+        echo 'Installing requirements'
+        # Look for requirements.txt in EXEC_DIR, not in the caller's working directory
+        python -m pip install -r "$EXEC_DIR"/requirements.txt
+    elif [ -f "$VENV_PATH"/Scripts/activate ]; then . "$VENV_PATH"/Scripts/activate
+    elif [ -f "$VENV_PATH"/bin/activate ]; then . "$VENV_PATH"/bin/activate
+    else
+        echo 'Python virtual environnement not detected'
+        exit 1
+    fi
+}
+
+activate
diff --git a/downloader/convert_dataset.py b/downloader/convert_dataset.py
new file mode 100644
--- /dev/null
+++ b/downloader/convert_dataset.py
@@ -0,0 +1,84 @@
+from io import BufferedReader
+from tqdm import tqdm
+from functools import partial
+from sys import argv
+import numpy as np
+from os import path, listdir
+
+# Induce determinism
+np.random.seed(133742)
+
+# Makes the "leave" argument default to False
+tqdm = partial(tqdm, leave = False)
+
+def read_pgm(pgm_file: BufferedReader) -> np.ndarray:
+    """Read the data of a binary (P5) PGM file
+
+    The header is expected on three lines (magic number, "width height",
+    maximum gray value); header comments are not supported.
+
+    Args:
+        pgm_file (BufferedReader): PGM File opened in binary mode
+
+    Returns:
+        np.ndarray: PGM data as a (height, width) uint8 array
+
+    Raises:
+        ValueError: If the header is invalid or the pixel data is truncated
+    """
+    # Read the header outside of "assert": asserts are stripped by "python -O",
+    # which would skip the reads and desynchronise the parser
+    magic = pgm_file.readline()
+    if magic != b'P5\n':
+        raise ValueError(f"Incorrect file format: {magic}")
+    (width, height) = [int(i) for i in pgm_file.readline().split()]
+    if width <= 0 or height <= 0:
+        raise ValueError(f"Incorrect dimensions: {width}x{height}")
+    depth = int(pgm_file.readline())
+    if depth >= 256:
+        raise ValueError(f"Incorrect depth: {depth}")
+
+    # Fetch every pixel with a single read instead of one call per byte
+    pixels = pgm_file.read(height * width)
+    if len(pixels) != height * width:
+        raise ValueError(f"Truncated pixel data: {len(pixels)}/{height * width} bytes")
+    # frombuffer gives a read-only view of the bytes; copy to return a writable array
+    return np.frombuffer(pixels, dtype = np.uint8).reshape((height, width)).copy()
+
+def __main__(data_path: str) -> None:
+    """Read the data of every PGM file and output it in data files
+
+    For each set ("train", "test"), the images of the "non-face" (label 0)
+    and "face" (label 1) directories are shuffled together, then written to
+    X_<set>.bin (pixels) and y_<set>.bin (labels) as text: the array shape
+    on the first line, then the flattened values separated by spaces.
+
+    Args:
+        data_path (str): Path of the PGM files
+    """
+    for set_name in tqdm(["train", "test"], desc = "set name"):
+        X, y = [], []
+        for y_i, label in enumerate(tqdm(["non-face", "face"], desc = "label")):
+            # listdir() order is arbitrary: sort it so that the seeded shuffle
+            # below produces the same dataset on every machine
+            for filename in tqdm(sorted(listdir(f"{data_path}/{set_name}/{label}")), desc = "Reading pgm file"):
+                with open(f"{data_path}/{set_name}/{label}/{filename}", "rb") as face:
+                    X.append(read_pgm(face))
+                    y.append(y_i)
+
+        X, y = np.asarray(X), np.asarray(y)
+        idx = np.random.permutation(y.shape[0])
+        X, y = X[idx], y[idx]
+
+        for org, s in tqdm(zip("Xy", [X, y]), desc = f"Writing {set_name}"):
+            with open(f"{data_path}/{org}_{set_name}.bin", "w") as out:
+                out.write(" ".join(map(str, s.shape)) + "\n")
+                # join() puts no separator after the last value and copes with an empty array
+                out.write(" ".join(str(s_i) for s_i in tqdm(s.ravel(), desc = f"Writing {org}")))
+
+if __name__ == "__main__":
+    if len(argv) != 2:
+        # SystemExit prints its message on stderr and exits with status 1,
+        # so a calling script running under "set -e" stops on misuse
+        raise SystemExit(f"Usage: python {path.basename(__file__)} ./data_location")
+    __main__(argv[1])
diff --git a/downloader/download_data.sh b/downloader/download_data.sh
new file mode 100755
--- /dev/null
+++ b/downloader/download_data.sh
@@ -0,0 +1,49 @@
+#!/bin/sh
+
+# Download the raw faces dataset and convert it to X_/y_ train/test .bin files
+# in $EXEC_DIR/../data, unless these four files already exist.
+#
+# Environment:
+#   EXEC_DIR - directory containing activate.sh and convert_dataset.py (default: .)
+
+# Exit if any of the command doesn't exit with code 0
+set -e
+
+test -z "$EXEC_DIR" && EXEC_DIR=.
+DATA_PATH="$EXEC_DIR/../data"
+test ! -d "$DATA_PATH" && mkdir -v "$DATA_PATH"
+
+if [ ! -f "$DATA_PATH"/X_train.bin ] || [ ! -f "$DATA_PATH"/X_test.bin ] \
+|| [ ! -f "$DATA_PATH"/y_train.bin ] || [ ! -f "$DATA_PATH"/y_test.bin ]; then
+    if [ ! -f "$DATA_PATH"/faces.tar.gz ]; then
+        echo 'Downloading raw dataset'
+        # -f: fail on HTTP errors instead of saving the error page as the archive
+        # -L: follow redirects
+        # The ".part" name keeps an interrupted download from being mistaken
+        # for a complete archive on the next run
+        curl -fL -o "$DATA_PATH"/faces.tar.gz.part http://www.ai.mit.edu/courses/6.899/lectures/faces.tar.gz
+        mv "$DATA_PATH"/faces.tar.gz.part "$DATA_PATH"/faces.tar.gz
+    fi
+
+    echo 'Extracting raw files'
+    tar xvzf "$DATA_PATH"/faces.tar.gz -C "$DATA_PATH"
+    rm -v "$DATA_PATH"/README "$DATA_PATH"/svm.*
+
+    echo 'Extracting raw train set'
+    tar xvzf "$DATA_PATH"/face.train.tar.gz -C "$DATA_PATH"
+    rm -v "$DATA_PATH"/face.train.tar.gz
+
+    echo 'Extracting raw test set'
+    tar xvzf "$DATA_PATH"/face.test.tar.gz -C "$DATA_PATH"
+    rm -v "$DATA_PATH"/face.test.tar.gz
+
+    echo 'Converting raw dataset to bin file'
+    export EXEC_DIR
+    . "$EXEC_DIR"/activate.sh
+    python "$EXEC_DIR"/convert_dataset.py "$DATA_PATH"
+
+    echo 'Removing leftovers'
+    rm -rvf "$DATA_PATH"/train "$DATA_PATH"/test
+
+    echo 'Done !'
+fi
diff --git a/downloader/requirements.txt b/downloader/requirements.txt
new file mode 100644
index 0000000..10d4523
--- /dev/null
+++ b/downloader/requirements.txt
@@ -0,0 +1,2 @@
+numpy==1.26.4
+tqdm==4.66.2