From c7fae479822ffb9f23351d038f5f0fe53e213bcf Mon Sep 17 00:00:00 2001 From: Robin Date: Thu, 26 Sep 2024 20:00:49 +0200 Subject: [PATCH] Restructuring part 2 (renamed datasets folder, moved data utils) --- codes/__init__.py | 2 +- codes/benchmark/bench_fcts.py | 2 +- codes/benchmark/bench_utils.py | 2 +- codes/dataset/__init__.py | 17 ------- codes/surrogates/DeepONet/don_utils.py | 2 +- codes/train/train_fcts.py | 3 +- codes/utils/__init__.py | 30 ++++++++---- codes/{dataset => utils}/data_utils.py | 10 ++-- config.yaml | 4 +- datasets/data_analysis/__init__.py | 8 ++++ .../data_analysis}/analyse_dataset.py | 8 ++-- .../data_analysis}/data_plots.py | 10 ++-- .../generate_simple_ode_data.py | 44 +++++------------- .../data_generation}/make_new_dataset.py | 12 +++-- {data => datasets}/data_sources.yaml | 0 .../osu2008/example_trajectories.png | Bin .../osu2008/surrogates_config.py | 0 run_training.py | 2 +- test/test_data.py | 4 +- 19 files changed, 77 insertions(+), 83 deletions(-) delete mode 100644 codes/dataset/__init__.py rename codes/{dataset => utils}/data_utils.py (98%) create mode 100644 datasets/data_analysis/__init__.py rename {codes/dataset => datasets/data_analysis}/analyse_dataset.py (84%) rename {codes/dataset => datasets/data_analysis}/data_plots.py (97%) rename {data_gen => datasets/data_generation}/generate_simple_ode_data.py (80%) rename {data_gen => datasets/data_generation}/make_new_dataset.py (57%) rename {data => datasets}/data_sources.yaml (100%) rename {data => datasets}/osu2008/example_trajectories.png (100%) rename {data => datasets}/osu2008/surrogates_config.py (100%) diff --git a/codes/__init__.py b/codes/__init__.py index 3f405d6..35e36ed 100644 --- a/codes/__init__.py +++ b/codes/__init__.py @@ -1,4 +1,4 @@ from .benchmark import * from .surrogates import * from .train import * -from .utils import * \ No newline at end of file +from .utils import * diff --git a/codes/benchmark/bench_fcts.py b/codes/benchmark/bench_fcts.py index 14608b7..c0ab58d 100644 --- a/codes/benchmark/bench_fcts.py +++ b/codes/benchmark/bench_fcts.py @@ -8,7 +8,7 @@ from tabulate import tabulate from torch.utils.data import DataLoader -from codes.dataset import check_and_load_data +from codes.utils import check_and_load_data from .bench_plots import ( inference_time_bar_plot, diff --git a/codes/benchmark/bench_utils.py b/codes/benchmark/bench_utils.py index 3429759..8683b1b 100644 --- a/codes/benchmark/bench_utils.py +++ b/codes/benchmark/bench_utils.py @@ -627,7 +627,7 @@ def get_model_config(surr_name: str, config: dict) -> dict: return {} dataset_name = config["dataset"]["name"].lower() - dataset_folder = f"data/{dataset_name}" + dataset_folder = f"datasets/{dataset_name}" config_file = f"{dataset_folder}/surrogates_config.py" if os.path.exists(config_file): diff --git a/codes/dataset/__init__.py b/codes/dataset/__init__.py deleted file mode 100644 index f01980c..0000000 --- a/codes/dataset/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -from .data_utils import ( - create_hdf5_dataset, - check_and_load_data, - get_data_subset, - create_dataset, - normalize_data, - download_data -) - -__all__ = [ - "create_hdf5_dataset", - "check_and_load_data", - "get_data_subset", - "create_dataset", - "normalize_data", - "download_data" -] diff --git a/codes/surrogates/DeepONet/don_utils.py b/codes/surrogates/DeepONet/don_utils.py index f8e9145..ae167ca 100644 --- a/codes/surrogates/DeepONet/don_utils.py +++ b/codes/surrogates/DeepONet/don_utils.py @@ -56,7 +56,7 @@ def get_project_path(relative_path): 
""" Construct the absolute path to a project resource (data or model) based on a relative path. - :param relative_path: A relative path to the resource, e.g., "data/dataset100" or "models/02-28/model.pth". + :param relative_path: A relative path to the resource, e.g., "datasets/dataset100" or "models/02-28/model.pth". :return: The absolute path to the resource. """ import os diff --git a/codes/train/train_fcts.py b/codes/train/train_fcts.py index 341ba8b..3088e54 100644 --- a/codes/train/train_fcts.py +++ b/codes/train/train_fcts.py @@ -5,8 +5,9 @@ from tqdm import tqdm from codes.benchmark.bench_utils import get_model_config, get_surrogate -from codes.dataset import check_and_load_data, get_data_subset from codes.utils import ( + check_and_load_data, + get_data_subset, get_progress_bar, load_and_save_config, make_description, diff --git a/codes/utils/__init__.py b/codes/utils/__init__.py index 01f7365..8fc81d5 100644 --- a/codes/utils/__init__.py +++ b/codes/utils/__init__.py @@ -1,19 +1,33 @@ +from .data_utils import ( + check_and_load_data, + create_dataset, + create_hdf5_dataset, + download_data, + get_data_subset, + normalize_data, +) from .utils import ( - read_yaml_config, - time_execution, + check_training_status, create_model_dir, + get_progress_bar, load_and_save_config, - set_random_seeds, - nice_print, + load_task_list, make_description, - get_progress_bar, - worker_init_fn, + nice_print, + read_yaml_config, save_task_list, - load_task_list, - check_training_status, + set_random_seeds, + time_execution, + worker_init_fn, ) __all__ = [ + "check_and_load_data", + "create_dataset", + "create_hdf5_dataset", + "download_data", + "get_data_subset", + "normalize_data", "read_yaml_config", "time_execution", "create_model_dir", diff --git a/codes/dataset/data_utils.py b/codes/utils/data_utils.py similarity index 98% rename from codes/dataset/data_utils.py rename to codes/utils/data_utils.py index 12a7864..6a35d4a 100644 --- a/codes/dataset/data_utils.py +++ b/codes/utils/data_utils.py @@ -36,7 +36,7 @@ def check_and_load_data( Raises: DatasetError: If the dataset or required data is missing or if the data shape is incorrect. """ - data_dir = "data" + data_dir = "datasets" dataset_name_lower = dataset_name.lower() # Check if dataset exists @@ -231,7 +231,7 @@ def create_hdf5_dataset( test_data: np.ndarray, val_data: np.ndarray, dataset_name: str, - data_dir: str = "data", + data_dir: str = "datasets", timesteps: np.ndarray | None = None, labels: list[str] | None = None, ): @@ -337,7 +337,7 @@ def create_dataset( TypeError: If the train_data is not a numpy array or torch tensor. ValueError: If the train_data, test_data, and val_data do not have the correct shape. """ - base_dir = "data" + base_dir = "datasets" dataset_dir = os.path.join(base_dir, name) if os.path.exists(dataset_dir): @@ -440,14 +440,14 @@ def download_data(dataset_name: str, path: str | None = None): path (str, optional): The path to save the dataset. If None, the default data directory is used. 
""" data_path = ( - os.path.abspath(f"data/{dataset_name.lower()}/data.hdf5") + os.path.abspath(f"datasets/{dataset_name.lower()}/data.hdf5") if path is None else os.path.abspath(path) ) if os.path.isfile(data_path): return - with open("data/data_sources.yaml", "r", encoding="utf-8") as file: + with open("datasets/data_sources.yaml", "r", encoding="utf-8") as file: data_sources = yaml.safe_load(file) try: diff --git a/config.yaml b/config.yaml index 1f54eec..5af6d32 100644 --- a/config.yaml +++ b/config.yaml @@ -1,11 +1,11 @@ # Global settings for the benchmark -training_id: "delete_me2" +training_id: "delete_me3" surrogates: ["LatentPoly", "LatentNeuralODE", "FullyConnected", "MultiONet"] batch_size: [256, 256, 256, 256] epochs: [2, 2, 2, 2] dataset: name: "osu2008" - log10_transform: True + log10_transform: False normalise: "minmax" # "standardise", "minmax", "disable" use_optimal_params: True devices: ["cuda:1"] diff --git a/datasets/data_analysis/__init__.py b/datasets/data_analysis/__init__.py new file mode 100644 index 0000000..aecd829 --- /dev/null +++ b/datasets/data_analysis/__init__.py @@ -0,0 +1,8 @@ +__all__ = [ + "create_hdf5_dataset", + "check_and_load_data", + "get_data_subset", + "create_dataset", + "normalize_data", + "download_data" +] diff --git a/codes/dataset/analyse_dataset.py b/datasets/data_analysis/analyse_dataset.py similarity index 84% rename from codes/dataset/analyse_dataset.py rename to datasets/data_analysis/analyse_dataset.py index 2730bfd..d6dc40b 100644 --- a/codes/dataset/analyse_dataset.py +++ b/datasets/data_analysis/analyse_dataset.py @@ -1,11 +1,11 @@ -import os import sys from argparse import ArgumentParser -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) +sys.path.insert(1, "../..") -from data import check_and_load_data -from data.data_plots import plot_example_trajectories, plot_example_trajectories_paper +from codes import check_and_load_data + +from .data_plots import plot_example_trajectories, plot_example_trajectories_paper def main(args): diff --git a/codes/dataset/data_plots.py b/datasets/data_analysis/data_plots.py similarity index 97% rename from codes/dataset/data_plots.py rename to datasets/data_analysis/data_plots.py index 864615a..990334f 100644 --- a/codes/dataset/data_plots.py +++ b/datasets/data_analysis/data_plots.py @@ -1,7 +1,11 @@ +import sys + import matplotlib.pyplot as plt import numpy as np -from benchmark import save_plot +sys.path.insert(1, "../..") + +from codes import save_plot def plot_example_trajectories( @@ -72,7 +76,7 @@ def plot_example_trajectories( "example_trajectories.png", conf, dpi=300, - base_dir="data", + base_dir="datasets", increase_count=False, ) @@ -196,7 +200,7 @@ def plot_example_trajectories_paper( "example_trajectories_paper.png", conf, dpi=300, - base_dir="data", + base_dir="datasets", increase_count=False, ) diff --git a/data_gen/generate_simple_ode_data.py b/datasets/data_generation/generate_simple_ode_data.py similarity index 80% rename from data_gen/generate_simple_ode_data.py rename to datasets/data_generation/generate_simple_ode_data.py index 065dcf5..3718ed8 100644 --- a/data_gen/generate_simple_ode_data.py +++ b/datasets/data_generation/generate_simple_ode_data.py @@ -1,13 +1,18 @@ # TODO: move this to an appropriate location import os + +# Add codes package to the path (two keys up) +import sys from argparse import ArgumentParser from typing import Callable import numpy as np from scipy.integrate import solve_ivp -from codes.dataset.data_utils import create_dataset 
+sys.path.insert(1, "../..") + +from codes.utils.data_utils import create_dataset def lotka_volterra(t, n): @@ -55,7 +60,7 @@ def reaction(t, n): array Array of the derivatives of the abundances of species s1, s2, s3, s4, s5, and s6. """ - s1, s2, s3, s4, s5, s6 = n[0], n[1], n[2], n[3], n[4], n[5] + s1, s2, s3, s4, s5, _ = n[0], n[1], n[2], n[3], n[4], n[5] return np.array( [ -0.1 * s1 + 0.1 * s2, @@ -68,33 +73,6 @@ def reaction(t, n): ) -# def func(t, n): -# """ -# Differential equations for a simple ODE system. - -# Parameters -# ---------- -# t : float -# Time -# n : array -# Array of concentrations of species A, B, C, D, and E. - -# Returns -# ------- -# array -# Array of the derivatives of the concentrations of species A, B, C, D, and E. -# """ -# k = np.array([0.8, 0.5, 0.2]) -# return np.array( -# [ -# -k[0] * n[0] - k[2] * n[0] * n[2], -# k[0] * n[0] - k[1] * n[1] + 2 * k[2] * n[0] * n[2], -# k[1] * n[1] - k[2] * n[0] * n[2], -# k[2] * n[0] + k[1] / k[0] * n[1], -# k[0] * n[0] / k[1] * n[2] - k[1] * n[0] * n[2], -# ] -# ) - FUNCS = { "lotka_volterra": { "func": lotka_volterra, @@ -130,13 +108,15 @@ def create_data(num: int, func: Callable, timesteps: np.ndarray, dim: int): def main(args): - if os.path.exists(f"data/{args.name}"): + # Switch cwd to the root directory + os.chdir("../..") + if os.path.exists(f"datasets/{args.name}"): res = input( - f"The data directory 'data/{args.name}' already exists. Press Enter to overwrite it." + f"The data directory 'datasets/{args.name}' already exists. Press Enter to overwrite it." ) if res != "": return - os.system(f"rm -r data/data/{args.name}") + os.system(f"rm -r datasets/{args.name}") if not FUNCS.get(args.func): print(f"Function {args.func} not found") diff --git a/data_gen/make_new_dataset.py b/datasets/data_generation/make_new_dataset.py similarity index 57% rename from data_gen/make_new_dataset.py rename to datasets/data_generation/make_new_dataset.py index 9757b01..1e48d74 100644 --- a/data_gen/make_new_dataset.py +++ b/datasets/data_generation/make_new_dataset.py @@ -1,16 +1,20 @@ +import sys + import numpy as np -from data import create_dataset +sys.path.insert(1, "../..") + +from codes import create_dataset if __name__ == "__main__": # Create a new dataset - train_data = np.load("data/osu2008_old/train_data.npy") - test_data = np.load("data/osu2008_old/test_data.npy") + train_data = np.load("datasets/osu2008_old/train_data.npy") + test_data = np.load("datasets/osu2008_old/test_data.npy") full_dataset = np.concatenate((train_data, test_data), axis=0) np.random.shuffle(full_dataset) labels = None create_dataset( - "osu2008", + "osu2008_test", full_dataset, timesteps=np.linspace(0, 1, 100), labels=labels, diff --git a/data/data_sources.yaml b/datasets/data_sources.yaml similarity index 100% rename from data/data_sources.yaml rename to datasets/data_sources.yaml diff --git a/data/osu2008/example_trajectories.png b/datasets/osu2008/example_trajectories.png similarity index 100% rename from data/osu2008/example_trajectories.png rename to datasets/osu2008/example_trajectories.png diff --git a/data/osu2008/surrogates_config.py b/datasets/osu2008/surrogates_config.py similarity index 100% rename from data/osu2008/surrogates_config.py rename to datasets/osu2008/surrogates_config.py diff --git a/run_training.py b/run_training.py index 24171c3..fe849c4 100644 --- a/run_training.py +++ b/run_training.py @@ -3,7 +3,7 @@ from tqdm import tqdm -from codes.dataset.data_utils import download_data +from codes.utils.data_utils import download_data 
from codes.train import create_task_list_for_surrogate, parallel_training, sequential_training from codes.utils import ( check_training_status, diff --git a/test/test_data.py b/test/test_data.py index 258efa1..2f0f728 100644 --- a/test/test_data.py +++ b/test/test_data.py @@ -3,9 +3,9 @@ import pytest -from codes.dataset.data_utils import check_and_load_data, download_data +from codes.utils.data_utils import check_and_load_data, download_data -paths = glob.glob("data/*/data.hdf5") +paths = glob.glob("datasets/*/data.hdf5") dataset_names = [path.split("/")[1] for path in paths]
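
Note (outside the diff): a minimal smoke-test sketch, in Python, of the
relocated data helpers. It assumes the repository root as working directory,
since the "datasets" paths in codes/utils/data_utils.py are relative, and
network access for the osu2008 entry in datasets/data_sources.yaml; the
download logic itself sits outside the hunks shown above.

    import os

    # Helpers formerly under codes.dataset are now re-exported by codes.utils.
    from codes.utils import check_and_load_data, download_data

    # Signature per this patch: download_data(dataset_name, path=None).
    # Looks up the source in datasets/data_sources.yaml and is expected to
    # fetch datasets/osu2008/data.hdf5 unless the file already exists.
    download_data("osu2008")
    assert os.path.isfile("datasets/osu2008/data.hdf5")

    # check_and_load_data loads and validates the dataset, raising
    # DatasetError on missing data or incorrect shapes; its remaining
    # parameters are unchanged by this patch (see codes/utils/data_utils.py).

Any leftover imports of codes.dataset or paths under data/ will break after
this patch; the updates to run_training.py and test/test_data.py above show
the required changes.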