moved download_data.sh to separate module downloader
This commit is contained in:
parent
211dcad893
commit
8740f7ea4b
@ -1,42 +0,0 @@
|
||||
#!/usr/bin/env bash
#
# Download the raw faces dataset and convert it to the *.bin files
# (X_train, X_test, y_train, y_test) stored in "$EXEC_DIR/data".
#
# Usage: download_data.sh [EXEC_DIR]
#   EXEC_DIR  project root (defaults to the current directory)

# Exit if any command exits with a non-zero code
set -e

# First positional argument, or "." when it is missing or empty
EXEC_DIR=${1:-.}
DATA_LOCATION="$EXEC_DIR/data"
mkdir -p "$DATA_LOCATION"

# Only rebuild when at least one of the converted files is missing
if [ ! -f "$DATA_LOCATION"/X_train.bin ] || [ ! -f "$DATA_LOCATION"/X_test.bin ] \
  || [ ! -f "$DATA_LOCATION"/y_train.bin ] || [ ! -f "$DATA_LOCATION"/y_test.bin ]; then
#if true; then
  if [ ! -f "$DATA_LOCATION"/faces.tar.gz ]; then
    echo 'Downloading raw dataset'
    # -f: fail on HTTP errors instead of saving the error page.
    # Download to a temporary name so an interrupted or failed transfer is
    # never mistaken for a complete archive on the next run.
    curl -fL -o "$DATA_LOCATION"/faces.tar.gz.part http://www.ai.mit.edu/courses/6.899/lectures/faces.tar.gz
    mv "$DATA_LOCATION"/faces.tar.gz.part "$DATA_LOCATION"/faces.tar.gz
  fi

  echo 'Extracting raw files'
  tar xzf "$DATA_LOCATION"/faces.tar.gz -C "$DATA_LOCATION"
  rm "$DATA_LOCATION"/README
  rm "$DATA_LOCATION"/svm.*

  echo 'Extracting raw train set'
  tar xzf "$DATA_LOCATION"/face.train.tar.gz -C "$DATA_LOCATION"
  rm "$DATA_LOCATION"/face.train.tar.gz

  echo 'Extracting raw test set'
  tar xzf "$DATA_LOCATION"/face.test.tar.gz -C "$DATA_LOCATION"
  rm "$DATA_LOCATION"/face.test.tar.gz

  echo 'Converting raw dataset to bin file'
  # The activation script is sourced so the virtual environment applies to
  # the python call below
  source "$EXEC_DIR"/python/activate.sh "$EXEC_DIR"
  python "$EXEC_DIR"/python/convert_dataset.py "$DATA_LOCATION"

  echo 'Removing leftovers'
  rm -rf "$DATA_LOCATION"/train
  rm -rf "$DATA_LOCATION"/test

  echo 'Done !'
fi
|
27
downloader/activate.sh
Executable file
27
downloader/activate.sh
Executable file
@ -0,0 +1,27 @@
|
||||
#!/bin/sh
#
# Create (on first use) and activate the Python virtual environment.
# Meant to be sourced. Reads two optional variables:
#   EXEC_DIR   directory holding requirements.txt (default: .)
#   VENV_PATH  location of the virtual environment (default: $EXEC_DIR/venv)

# Exit if any command exits with a non-zero code
set -e

test -z "$EXEC_DIR" && EXEC_DIR=.
test -z "$VENV_PATH" && VENV_PATH="$EXEC_DIR/venv"

# Activate the virtual environment at $VENV_PATH, creating it and installing
# the requirements first when the directory does not exist yet.
# Globals: EXEC_DIR (read), VENV_PATH (read)
# Exits with status 1 when the directory exists but holds no activate script.
activate(){
  if [ ! -d "$VENV_PATH" ]; then
    echo 'Creating python virtual environnement'
    python -m venv "$VENV_PATH"
    echo 'Activating virtual environnement'
    # The directory exists now, so this call takes one of the branches below
    activate
    echo 'Updating base pip packages'
    python -m pip install -U setuptools pip
    echo 'Installing requirements'
    # Resolve requirements.txt against EXEC_DIR, not the caller's working
    # directory, so the script also works when sourced from elsewhere
    pip install -r "$EXEC_DIR"/requirements.txt
  elif [ -f "$VENV_PATH"/Scripts/activate ]; then
    # Layout with a Scripts/ directory
    . "$VENV_PATH"/Scripts/activate
  elif [ -f "$VENV_PATH"/bin/activate ]; then
    # Layout with a bin/ directory
    . "$VENV_PATH"/bin/activate
  else
    echo 'Python virtual environnement not detected' >&2
    exit 1
  fi
}
|
||||
|
||||
activate
|
60
downloader/convert_dataset.py
Normal file
60
downloader/convert_dataset.py
Normal file
@ -0,0 +1,60 @@
|
||||
from io import BufferedReader
|
||||
from tqdm import tqdm
|
||||
from functools import partial
|
||||
from sys import argv
|
||||
import numpy as np
|
||||
from os import path, listdir
|
||||
|
||||
# Induce determinism
|
||||
np.random.seed(133742)
|
||||
|
||||
# Makes the "leave" argument default to False
|
||||
tqdm = partial(tqdm, leave = False)
|
||||
|
||||
def read_pgm(pgm_file: BufferedReader) -> np.ndarray:
    """Read the pixel data of a binary (P5) PGM file.

    The header must be exactly three lines: the magic number ``P5``,
    ``width height`` and the maximum gray value. Comment lines in the
    header are not handled.

    Args:
        pgm_file (BufferedReader): PGM file opened in binary mode

    Returns:
        np.ndarray: Writable uint8 array of shape (height, width)

    Raises:
        AssertionError: If the magic number, dimensions or depth are invalid
        ValueError: If fewer than width * height pixel bytes are available
    """
    assert (f := pgm_file.readline()) == b'P5\n', f"Incorrect file format: {f}"
    (width, height) = [int(i) for i in pgm_file.readline().split()]
    assert width > 0 and height > 0, f"Incorrect dimensions: {width}x{height}"
    assert (depth := int(pgm_file.readline())) < 256, f"Incorrect depth: {depth}"

    # Read all pixels in one call instead of one byte at a time
    expected = height * width
    raw = pgm_file.read(expected)
    if len(raw) != expected:
        raise ValueError(f"Truncated pixel data: expected {expected} bytes, got {len(raw)}")

    # frombuffer returns a read-only view over the bytes; copy so callers get
    # a writable array that owns its data
    return np.frombuffer(raw, dtype = np.uint8).reshape((height, width)).copy()
|
||||
|
||||
def __main__(data_path: str) -> None:
    """Read the data of every PGM file and output it in data files

    For each of the "train" and "test" sets, the images found under
    ``{data_path}/{set_name}/non-face`` (label 0) and
    ``{data_path}/{set_name}/face`` (label 1) are loaded, shuffled together
    with their labels, and written to ``{data_path}/X_{set_name}.bin`` and
    ``{data_path}/y_{set_name}.bin``.

    Each output file is text: the first line holds the array shape as
    space-separated integers, the second line holds the flattened values
    separated by single spaces.

    Args:
        data_path (str): Path of the PGM files
    """
    for set_name in tqdm(["train", "test"], desc = "set name"):
        X, y = [], []
        for y_i, label in enumerate(tqdm(["non-face", "face"], desc = "label")):
            # listdir() returns entries in arbitrary order; sort them so the
            # seeded permutation below yields the same output on every machine
            for filename in tqdm(sorted(listdir(f"{data_path}/{set_name}/{label}")), desc = "Reading pgm file"):
                with open(f"{data_path}/{set_name}/{label}/{filename}", "rb") as face:
                    X.append(read_pgm(face))
                y.append(y_i)

        # Shuffle samples and labels with the same permutation
        X, y = np.asarray(X), np.asarray(y)
        idx = np.random.permutation(y.shape[0])
        X, y = X[idx], y[idx]

        for org, s in tqdm(zip("Xy", [X, y]), desc = f"Writing {set_name}"):
            with open(f"{data_path}/{org}_{set_name}.bin", "w") as out:
                # "(a, b, c)" -> "a b c" and "(a,)" -> "a"
                out.write(f'{str(s.shape)[1:-1].replace(",", "")}\n')
                raw = s.ravel()
                for s_i in tqdm(raw[:-1], desc = f"Writing {org}"):
                    out.write(f"{s_i} ")
                # The last value is written without a trailing space; skip it
                # when the set is empty instead of failing on raw[-1]
                if raw.size > 0:
                    out.write(str(raw[-1]))

if __name__ == "__main__":
    if len(argv) == 2:
        __main__(argv[1])
    else:
        # Show only the script's file name in the usage message
        print(f"Usage: python {__file__[__file__.rfind(path.sep) + 1:]} ./data_location")
|
38
downloader/download_data.sh
Executable file
38
downloader/download_data.sh
Executable file
@ -0,0 +1,38 @@
|
||||
#!/bin/sh
#
# Download the raw faces dataset and convert it to the *.bin files
# (X_train, X_test, y_train, y_test) stored in "$EXEC_DIR/../data".
#
# Reads one optional variable:
#   EXEC_DIR  directory holding activate.sh and convert_dataset.py (default: .)

# Exit if any command exits with a non-zero code
set -e

test -z "$EXEC_DIR" && EXEC_DIR=.
DATA_PATH="$EXEC_DIR/../data"
test ! -d "$DATA_PATH" && mkdir -v "$DATA_PATH"

# Only rebuild when at least one of the converted files is missing
if [ ! -f "$DATA_PATH"/X_train.bin ] || [ ! -f "$DATA_PATH"/X_test.bin ] \
  || [ ! -f "$DATA_PATH"/y_train.bin ] || [ ! -f "$DATA_PATH"/y_test.bin ]; then
  if [ ! -f "$DATA_PATH"/faces.tar.gz ]; then
    echo 'Downloading raw dataset'
    # -f: fail on HTTP errors instead of saving the error page.
    # Download to a temporary name so an interrupted or failed transfer is
    # never mistaken for a complete archive on the next run.
    curl -fL -o "$DATA_PATH"/faces.tar.gz.part http://www.ai.mit.edu/courses/6.899/lectures/faces.tar.gz
    mv "$DATA_PATH"/faces.tar.gz.part "$DATA_PATH"/faces.tar.gz
  fi

  echo 'Extracting raw files'
  tar xvzf "$DATA_PATH"/faces.tar.gz -C "$DATA_PATH"
  # -f: these files are only clutter, so their absence must not abort the run
  rm -fv "$DATA_PATH"/README "$DATA_PATH"/svm.*

  echo 'Extracting raw train set'
  tar xvzf "$DATA_PATH"/face.train.tar.gz -C "$DATA_PATH"
  rm -v "$DATA_PATH"/face.train.tar.gz

  echo 'Extracting raw test set'
  tar xvzf "$DATA_PATH"/face.test.tar.gz -C "$DATA_PATH"
  rm -v "$DATA_PATH"/face.test.tar.gz

  echo 'Converting raw dataset to bin file'
  # activate.sh reads EXEC_DIR; it is sourced so the virtual environment
  # applies to the python call below
  export EXEC_DIR
  . "$EXEC_DIR"/activate.sh
  python "$EXEC_DIR"/convert_dataset.py "$DATA_PATH"

  echo 'Removing leftovers'
  rm -rvf "$DATA_PATH"/train "$DATA_PATH"/test

  echo 'Done !'
fi
|
2
downloader/requirements.txt
Normal file
2
downloader/requirements.txt
Normal file
@ -0,0 +1,2 @@
|
||||
numpy==1.26.4
|
||||
tqdm==4.66.2
|
Loading…
x
Reference in New Issue
Block a user