From c7fae479822ffb9f23351d038f5f0fe53e213bcf Mon Sep 17 00:00:00 2001 From: Robin Date: Thu, 26 Sep 2024 20:00:49 +0200 Subject: [PATCH] Restructuring part 2 (renamed datasets folder, moved data utils) --- codes/__init__.py | 2 +- codes/benchmark/bench_fcts.py | 2 +- codes/benchmark/bench_utils.py | 2 +- codes/dataset/__init__.py | 17 ------- codes/surrogates/DeepONet/don_utils.py | 2 +- codes/train/train_fcts.py | 3 +- codes/utils/__init__.py | 30 ++++++++---- codes/{dataset => utils}/data_utils.py | 10 ++-- config.yaml | 4 +- datasets/data_analysis/__init__.py | 8 ++++ .../data_analysis}/analyse_dataset.py | 8 ++-- .../data_analysis}/data_plots.py | 10 ++-- .../generate_simple_ode_data.py | 44 +++++------------- .../data_generation}/make_new_dataset.py | 12 +++-- {data => datasets}/data_sources.yaml | 0 .../osu2008/example_trajectories.png | Bin .../osu2008/surrogates_config.py | 0 run_training.py | 2 +- test/test_data.py | 4 +- 19 files changed, 77 insertions(+), 83 deletions(-) delete mode 100644 codes/dataset/__init__.py rename codes/{dataset => utils}/data_utils.py (98%) create mode 100644 datasets/data_analysis/__init__.py rename {codes/dataset => datasets/data_analysis}/analyse_dataset.py (84%) rename {codes/dataset => datasets/data_analysis}/data_plots.py (97%) rename {data_gen => datasets/data_generation}/generate_simple_ode_data.py (80%) rename {data_gen => datasets/data_generation}/make_new_dataset.py (57%) rename {data => datasets}/data_sources.yaml (100%) rename {data => datasets}/osu2008/example_trajectories.png (100%) rename {data => datasets}/osu2008/surrogates_config.py (100%) diff --git a/codes/__init__.py b/codes/__init__.py index 3f405d6..35e36ed 100644 --- a/codes/__init__.py +++ b/codes/__init__.py @@ -1,4 +1,4 @@ from .benchmark import * from .surrogates import * from .train import * -from .utils import * \ No newline at end of file +from .utils import * diff --git a/codes/benchmark/bench_fcts.py b/codes/benchmark/bench_fcts.py index 14608b7..c0ab58d 100644 --- a/codes/benchmark/bench_fcts.py +++ b/codes/benchmark/bench_fcts.py @@ -8,7 +8,7 @@ from tabulate import tabulate from torch.utils.data import DataLoader -from codes.dataset import check_and_load_data +from codes.utils import check_and_load_data from .bench_plots import ( inference_time_bar_plot, diff --git a/codes/benchmark/bench_utils.py b/codes/benchmark/bench_utils.py index 3429759..8683b1b 100644 --- a/codes/benchmark/bench_utils.py +++ b/codes/benchmark/bench_utils.py @@ -627,7 +627,7 @@ def get_model_config(surr_name: str, config: dict) -> dict: return {} dataset_name = config["dataset"]["name"].lower() - dataset_folder = f"data/{dataset_name}" + dataset_folder = f"datasets/{dataset_name}" config_file = f"{dataset_folder}/surrogates_config.py" if os.path.exists(config_file): diff --git a/codes/dataset/__init__.py b/codes/dataset/__init__.py deleted file mode 100644 index f01980c..0000000 --- a/codes/dataset/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -from .data_utils import ( - create_hdf5_dataset, - check_and_load_data, - get_data_subset, - create_dataset, - normalize_data, - download_data -) - -__all__ = [ - "create_hdf5_dataset", - "check_and_load_data", - "get_data_subset", - "create_dataset", - "normalize_data", - "download_data" -] diff --git a/codes/surrogates/DeepONet/don_utils.py b/codes/surrogates/DeepONet/don_utils.py index f8e9145..ae167ca 100644 --- a/codes/surrogates/DeepONet/don_utils.py +++ b/codes/surrogates/DeepONet/don_utils.py @@ -56,7 +56,7 @@ def get_project_path(relative_path): 
""" Construct the absolute path to a project resource (data or model) based on a relative path. - :param relative_path: A relative path to the resource, e.g., "data/dataset100" or "models/02-28/model.pth". + :param relative_path: A relative path to the resource, e.g., "datasets/dataset100" or "models/02-28/model.pth". :return: The absolute path to the resource. """ import os diff --git a/codes/train/train_fcts.py b/codes/train/train_fcts.py index 341ba8b..3088e54 100644 --- a/codes/train/train_fcts.py +++ b/codes/train/train_fcts.py @@ -5,8 +5,9 @@ from tqdm import tqdm from codes.benchmark.bench_utils import get_model_config, get_surrogate -from codes.dataset import check_and_load_data, get_data_subset from codes.utils import ( + check_and_load_data, + get_data_subset, get_progress_bar, load_and_save_config, make_description, diff --git a/codes/utils/__init__.py b/codes/utils/__init__.py index 01f7365..8fc81d5 100644 --- a/codes/utils/__init__.py +++ b/codes/utils/__init__.py @@ -1,19 +1,33 @@ +from .data_utils import ( + check_and_load_data, + create_dataset, + create_hdf5_dataset, + download_data, + get_data_subset, + normalize_data, +) from .utils import ( - read_yaml_config, - time_execution, + check_training_status, create_model_dir, + get_progress_bar, load_and_save_config, - set_random_seeds, - nice_print, + load_task_list, make_description, - get_progress_bar, - worker_init_fn, + nice_print, + read_yaml_config, save_task_list, - load_task_list, - check_training_status, + set_random_seeds, + time_execution, + worker_init_fn, ) __all__ = [ + "check_and_load_data", + "create_dataset", + "create_hdf5_dataset", + "download_data", + "get_data_subset", + "normalize_data", "read_yaml_config", "time_execution", "create_model_dir", diff --git a/codes/dataset/data_utils.py b/codes/utils/data_utils.py similarity index 98% rename from codes/dataset/data_utils.py rename to codes/utils/data_utils.py index 12a7864..6a35d4a 100644 --- a/codes/dataset/data_utils.py +++ b/codes/utils/data_utils.py @@ -36,7 +36,7 @@ def check_and_load_data( Raises: DatasetError: If the dataset or required data is missing or if the data shape is incorrect. """ - data_dir = "data" + data_dir = "datasets" dataset_name_lower = dataset_name.lower() # Check if dataset exists @@ -231,7 +231,7 @@ def create_hdf5_dataset( test_data: np.ndarray, val_data: np.ndarray, dataset_name: str, - data_dir: str = "data", + data_dir: str = "datasets", timesteps: np.ndarray | None = None, labels: list[str] | None = None, ): @@ -337,7 +337,7 @@ def create_dataset( TypeError: If the train_data is not a numpy array or torch tensor. ValueError: If the train_data, test_data, and val_data do not have the correct shape. """ - base_dir = "data" + base_dir = "datasets" dataset_dir = os.path.join(base_dir, name) if os.path.exists(dataset_dir): @@ -440,14 +440,14 @@ def download_data(dataset_name: str, path: str | None = None): path (str, optional): The path to save the dataset. If None, the default data directory is used. 
""" data_path = ( - os.path.abspath(f"data/{dataset_name.lower()}/data.hdf5") + os.path.abspath(f"datasets/{dataset_name.lower()}/data.hdf5") if path is None else os.path.abspath(path) ) if os.path.isfile(data_path): return - with open("data/data_sources.yaml", "r", encoding="utf-8") as file: + with open("datasets/data_sources.yaml", "r", encoding="utf-8") as file: data_sources = yaml.safe_load(file) try: diff --git a/config.yaml b/config.yaml index 1f54eec..5af6d32 100644 --- a/config.yaml +++ b/config.yaml @@ -1,11 +1,11 @@ # Global settings for the benchmark -training_id: "delete_me2" +training_id: "delete_me3" surrogates: ["LatentPoly", "LatentNeuralODE", "FullyConnected", "MultiONet"] batch_size: [256, 256, 256, 256] epochs: [2, 2, 2, 2] dataset: name: "osu2008" - log10_transform: True + log10_transform: False normalise: "minmax" # "standardise", "minmax", "disable" use_optimal_params: True devices: ["cuda:1"] diff --git a/datasets/data_analysis/__init__.py b/datasets/data_analysis/__init__.py new file mode 100644 index 0000000..aecd829 --- /dev/null +++ b/datasets/data_analysis/__init__.py @@ -0,0 +1,8 @@ +__all__ = [ + "create_hdf5_dataset", + "check_and_load_data", + "get_data_subset", + "create_dataset", + "normalize_data", + "download_data" +] diff --git a/codes/dataset/analyse_dataset.py b/datasets/data_analysis/analyse_dataset.py similarity index 84% rename from codes/dataset/analyse_dataset.py rename to datasets/data_analysis/analyse_dataset.py index 2730bfd..d6dc40b 100644 --- a/codes/dataset/analyse_dataset.py +++ b/datasets/data_analysis/analyse_dataset.py @@ -1,11 +1,11 @@ -import os import sys from argparse import ArgumentParser -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) +sys.path.insert(1, "../..") -from data import check_and_load_data -from data.data_plots import plot_example_trajectories, plot_example_trajectories_paper +from codes import check_and_load_data + +from .data_plots import plot_example_trajectories, plot_example_trajectories_paper def main(args): diff --git a/codes/dataset/data_plots.py b/datasets/data_analysis/data_plots.py similarity index 97% rename from codes/dataset/data_plots.py rename to datasets/data_analysis/data_plots.py index 864615a..990334f 100644 --- a/codes/dataset/data_plots.py +++ b/datasets/data_analysis/data_plots.py @@ -1,7 +1,11 @@ +import sys + import matplotlib.pyplot as plt import numpy as np -from benchmark import save_plot +sys.path.insert(1, "../..") + +from codes import save_plot def plot_example_trajectories( @@ -72,7 +76,7 @@ def plot_example_trajectories( "example_trajectories.png", conf, dpi=300, - base_dir="data", + base_dir="datasets", increase_count=False, ) @@ -196,7 +200,7 @@ def plot_example_trajectories_paper( "example_trajectories_paper.png", conf, dpi=300, - base_dir="data", + base_dir="datasets", increase_count=False, ) diff --git a/data_gen/generate_simple_ode_data.py b/datasets/data_generation/generate_simple_ode_data.py similarity index 80% rename from data_gen/generate_simple_ode_data.py rename to datasets/data_generation/generate_simple_ode_data.py index 065dcf5..3718ed8 100644 --- a/data_gen/generate_simple_ode_data.py +++ b/datasets/data_generation/generate_simple_ode_data.py @@ -1,13 +1,18 @@ # TODO: move this to an appropriate location import os + +# Add codes package to the path (two keys up) +import sys from argparse import ArgumentParser from typing import Callable import numpy as np from scipy.integrate import solve_ivp -from codes.dataset.data_utils import create_dataset 
+sys.path.insert(1, "../..") + +from codes.utils.data_utils import create_dataset def lotka_volterra(t, n): @@ -55,7 +60,7 @@ def reaction(t, n): array Array of the derivatives of the abundances of species s1, s2, s3, s4, s5, and s6. """ - s1, s2, s3, s4, s5, s6 = n[0], n[1], n[2], n[3], n[4], n[5] + s1, s2, s3, s4, s5, _ = n[0], n[1], n[2], n[3], n[4], n[5] return np.array( [ -0.1 * s1 + 0.1 * s2, @@ -68,33 +73,6 @@ def reaction(t, n): ) -# def func(t, n): -# """ -# Differential equations for a simple ODE system. - -# Parameters -# ---------- -# t : float -# Time -# n : array -# Array of concentrations of species A, B, C, D, and E. - -# Returns -# ------- -# array -# Array of the derivatives of the concentrations of species A, B, C, D, and E. -# """ -# k = np.array([0.8, 0.5, 0.2]) -# return np.array( -# [ -# -k[0] * n[0] - k[2] * n[0] * n[2], -# k[0] * n[0] - k[1] * n[1] + 2 * k[2] * n[0] * n[2], -# k[1] * n[1] - k[2] * n[0] * n[2], -# k[2] * n[0] + k[1] / k[0] * n[1], -# k[0] * n[0] / k[1] * n[2] - k[1] * n[0] * n[2], -# ] -# ) - FUNCS = { "lotka_volterra": { "func": lotka_volterra, @@ -130,13 +108,15 @@ def create_data(num: int, func: Callable, timesteps: np.ndarray, dim: int): def main(args): - if os.path.exists(f"data/{args.name}"): + # Switch cwd to the root directory + os.chdir("../..") + if os.path.exists(f"datasets/{args.name}"): res = input( - f"The data directory 'data/{args.name}' already exists. Press Enter to overwrite it." + f"The data directory 'datasets/{args.name}' already exists. Press Enter to overwrite it." ) if res != "": return - os.system(f"rm -r data/data/{args.name}") + os.system(f"rm -r datasets/{args.name}") if not FUNCS.get(args.func): print(f"Function {args.func} not found") diff --git a/data_gen/make_new_dataset.py b/datasets/data_generation/make_new_dataset.py similarity index 57% rename from data_gen/make_new_dataset.py rename to datasets/data_generation/make_new_dataset.py index 9757b01..1e48d74 100644 --- a/data_gen/make_new_dataset.py +++ b/datasets/data_generation/make_new_dataset.py @@ -1,16 +1,20 @@ +import sys + import numpy as np -from data import create_dataset +sys.path.insert(1, "../..") + +from codes import create_dataset if __name__ == "__main__": # Create a new dataset - train_data = np.load("data/osu2008_old/train_data.npy") - test_data = np.load("data/osu2008_old/test_data.npy") + train_data = np.load("datasets/osu2008_old/train_data.npy") + test_data = np.load("datasets/osu2008_old/test_data.npy") full_dataset = np.concatenate((train_data, test_data), axis=0) np.random.shuffle(full_dataset) labels = None create_dataset( - "osu2008", + "osu2008_test", full_dataset, timesteps=np.linspace(0, 1, 100), labels=labels, diff --git a/data/data_sources.yaml b/datasets/data_sources.yaml similarity index 100% rename from data/data_sources.yaml rename to datasets/data_sources.yaml diff --git a/data/osu2008/example_trajectories.png b/datasets/osu2008/example_trajectories.png similarity index 100% rename from data/osu2008/example_trajectories.png rename to datasets/osu2008/example_trajectories.png diff --git a/data/osu2008/surrogates_config.py b/datasets/osu2008/surrogates_config.py similarity index 100% rename from data/osu2008/surrogates_config.py rename to datasets/osu2008/surrogates_config.py diff --git a/run_training.py b/run_training.py index 24171c3..fe849c4 100644 --- a/run_training.py +++ b/run_training.py @@ -3,7 +3,7 @@ from tqdm import tqdm -from codes.dataset.data_utils import download_data +from codes.utils.data_utils import download_data 
from codes.train import create_task_list_for_surrogate, parallel_training, sequential_training from codes.utils import ( check_training_status, diff --git a/test/test_data.py b/test/test_data.py index 258efa1..2f0f728 100644 --- a/test/test_data.py +++ b/test/test_data.py @@ -3,9 +3,9 @@ import pytest -from codes.dataset.data_utils import check_and_load_data, download_data +from codes.utils.data_utils import check_and_load_data, download_data -paths = glob.glob("data/*/data.hdf5") +paths = glob.glob("datasets/*/data.hdf5") dataset_names = [path.split("/")[1] for path in paths]
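
Note (outside the diff): a minimal smoke-test sketch, in Python, of the
relocated data helpers. It assumes the repository root as working directory,
since the "datasets" paths in codes/utils/data_utils.py are relative, and
network access for the osu2008 entry in datasets/data_sources.yaml; the
download logic itself sits outside the hunks shown above.

    import os

    # Helpers formerly under codes.dataset are now re-exported by codes.utils.
    from codes.utils import check_and_load_data, download_data

    # Signature per this patch: download_data(dataset_name, path=None).
    # Looks up the source in datasets/data_sources.yaml and is expected to
    # fetch datasets/osu2008/data.hdf5 unless the file already exists.
    download_data("osu2008")
    assert os.path.isfile("datasets/osu2008/data.hdf5")

    # check_and_load_data loads and validates the dataset, raising
    # DatasetError on missing data or incorrect shapes; its remaining
    # parameters are unchanged by this patch (see codes/utils/data_utils.py).

Any leftover imports of codes.dataset or paths under data/ will break after
this patch; the updates to run_training.py and test/test_data.py above show
the required changes.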