Loads the fashion_mnist training and test data from the data directory.
The function returns four numpy arrays containing the training and test data
respectively.
If specified it can also return the standard scaled version of the data or
the first 10 principal components of the data.
The dimensions of the returned data are shown below:
| | Raw | Scaled | PCA |
|:------------:|:---------------------:|:---------------------:|:--------------------:|
| **Training** | | | |
| \(X\) | \((10.000 \times 784)\) | \((10.000 \times 784)\) | \((10.000 \times 10)\) |
| \(Y\) | \((10.000 \times 1)\) | \((10.000 \times 1)\) | \((10.000 \times 1)\) |
| **Test** | | | |
| \(X\) | \((5.000 \times 784)\) | \((5.000 \times 784)\) | \((5.000 \times 10)\) |
| \(Y\) | \((5.000 \times 1)\) | \((5.000 \times 1)\) | \((5.000 \times 1)\) |
Returns:
| Type | Description |
|:-----|:------------|
| 2d ndarrays | numpy data arrays in the order X_train, X_test, y_train, y_test. |
Source code in mlproject/helpers/_data_loader.py
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
def data_loader(raw=True, scaled=False, pca=False):
    r"""Load one variant of the fashion_mnist train/test data from the data directory.

    Exactly one of ``raw``, ``scaled`` and ``pca`` must be truthy; it selects
    which pair of ``.npy`` files under ``{ROOT_DIR}/data`` is read. Note that
    ``raw`` defaults to ``True``, so it has to be passed as ``False`` when the
    scaled or PCA variant is requested.

    Each file is split column-wise: the last column is returned as the labels
    and all preceding columns as the features. For the scaled and PCA variants
    the labels are cast to ``int`` after loading; the raw labels are returned
    as loaded.

    The dimensions of the returned data, per variant:

    |              |          Raw          |         Scaled        |          PCA         |
    |:------------:|:---------------------:|:---------------------:|:--------------------:|
    | **Training** |                       |                       |                      |
    |      $X$     | $(10.000 \times 784)$ | $(10.000 \times 784)$ | $(10.000 \times 10)$ |
    |      $Y$     |  $(10.000 \times 1)$  |  $(10.000 \times 1)$  |  $(10.000 \times 1)$ |
    |   **Test**   |                       |                       |                      |
    |      $X$     |  $(5.000 \times 784)$ |  $(5.000 \times 784)$ |  $(5.000 \times 10)$ |
    |      $Y$     |   $(5.000 \times 1)$  |   $(5.000 \times 1)$  |  $(5.000 \times 1)$  |

    Parameters
    ----------
    raw
        Load the unprocessed data (``fashion_train.npy`` / ``fashion_test.npy``).
    scaled
        Load the standard scaled data (``fashion_*_scaled.npy``).
    pca
        Load the first 10 principal components (``fashion_*_pca.npy``).

    Returns
    -------
    2d ndarrays
        numpy data arrays in the order X_train, X_test, y_train, y_test.

    Raises
    ------
    ValueError
        If none, or more than one, of ``raw``, ``scaled`` and ``pca`` is truthy.
    """
    # One row per variant: (requested?, file-name suffix, cast labels to int?).
    variants = (
        (raw, "", False),
        (scaled, "_scaled", True),
        (pca, "_pca", True),
    )
    chosen = [variant for variant in variants if variant[0]]
    if len(chosen) != 1:
        raise ValueError("If raw, scaled or pca is True, then all other arguments must be False.")

    _, suffix, labels_to_int = chosen[0]

    # Splitting at index -1 peels the last column (labels) off the features.
    train_features, train_labels = np.hsplit(
        np.load(f"{ROOT_DIR}/data/fashion_train{suffix}.npy"), [-1]
    )
    test_features, test_labels = np.hsplit(
        np.load(f"{ROOT_DIR}/data/fashion_test{suffix}.npy"), [-1]
    )

    if labels_to_int:
        # converting the labels back to integers from floats to avoid issues
        train_labels = train_labels.astype(int)
        test_labels = test_labels.astype(int)

    return train_features, test_features, train_labels, test_labels
|