Skip to content

Commit

Permalink
Make Simulation stateful
Browse files Browse the repository at this point in the history
Parsed data is now stored in attributes of mdx.ingest.Simulation and data paths are read from metadata by default
Data path is now stored as a default in simulation metadata
Documented mdx.ingest

Minor changes:
Add pre-commit hooks
Add requirements.txt for dev
  • Loading branch information
ashenoy463 committed May 1, 2024
1 parent 5b38f1f commit 4021ecb
Show file tree
Hide file tree
Showing 6 changed files with 220 additions and 45 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
testbench.ipynb

.vscode/
.pre-commit-config.yaml

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
2 changes: 2 additions & 0 deletions examples/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ Metadata:
sim_desc: "Preliminary run of 1 nanosecond at 300 K — test bed for refactoring and further processing code"
# execution dates of chunks (yyyy-mm-ddhh:mmZ,yyyy-mm-ddhh:mmZ)
exec_times: []
# path of folder containing simulation chunks
data_path: "/home/Work/ayush/sim_data/prelim"

partition:
# stepsize in real units
Expand Down
23 changes: 19 additions & 4 deletions mdx/format/meta.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from pydantic import BaseModel, PositiveFloat, PositiveInt, ValidationError
from typing import Literal, Dict, Any, Optional, Union
from datetime import datetime
from pydantic import BaseModel, PositiveFloat, PositiveInt
from typing import Literal, Dict, Any, Optional

import os
from pydantic.functional_validators import AfterValidator
from typing_extensions import Annotated

# Allowed values

Expand All @@ -19,6 +23,16 @@
]
# fmt: on

# Validators


def check_path(path: os.PathLike) -> os.PathLike:
    """Validate that *path* exists on the filesystem.

    Used as a pydantic ``AfterValidator`` (see ``ValidPath``); pydantic
    wraps the raised ``ValueError`` into a ``ValidationError`` for callers.

    Args:
        path (os.PathLike): candidate filesystem path

    Returns:
        os.PathLike: the unchanged, validated path

    Raises:
        ValueError: if the path does not exist. An explicit raise is used
            instead of ``assert`` because asserts are stripped under
            ``python -O``, which would silently disable validation.
    """
    if not os.path.exists(path):
        raise ValueError(f"{path} is not a valid path")
    return path


ValidPath = Annotated[os.PathLike, AfterValidator(check_path)]

# Format for simulation metadata


Expand All @@ -28,7 +42,7 @@ class MetaPartition(BaseModel):
n_chunks: PositiveInt # Chunks in a simulation


# TEMPFIX: Find a
# TEMPFIX: Everything that would be here is left to the user currently
class MetaExperimental(BaseModel):
pass

Expand All @@ -51,7 +65,8 @@ class FormatMeta(BaseModel):
sim_id: str
sim_desc: str
exec_times: list[tuple[datetime, datetime]]
data_path: ValidPath
partition: MetaPartition
box: MetaBox
experimental: Optional[Dict[str, Any]]
experimental: Optional[Dict[str, Any]]
output: MetaOutput
155 changes: 116 additions & 39 deletions mdx/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@
from .format.meta import FormatMeta

# ASSUMPTIONS:
#
# GRID is never used
# LP is type dependent
# N is constant
# Molecule ID is useless
# LP is type dependent; have to ask
# N is constant ; good assumption by and large
# Molecule ID is useless ; have to ask


# Exceptions


class InvalidItem(Exception):
Expand All @@ -23,37 +25,93 @@ class InvalidFormat(Exception):
pass


class Simulation:
# handle invalid glob error
# classdocstring
# record traj resolution
# write comments at each step
def __init__(self, meta_file, chunks=[], block="250MiB", eager=False) -> None:
class InvalidChunks(Exception):
    """Raised when requested chunk indices fall outside the range of chunks
    declared in the simulation metadata (``partition.n_chunks``)."""

    pass

self.chunks = chunks
self.eager = eager
self.block = block

# TODO
#
# Write intermediate bag step formats
# Overridable blocksize
class Simulation:
"""
Representation of an entire simulation run, with its metadata.
The Simulation class is initialized with a metadata file describing
the parameters of a given simulation's chosen chunks. Trajectories
and other outputs are stored as attributes (default: None) which
can be initialized through the appropriate methods.
Args:
meta_file (os.PathLike): File specifying simulation metadata
chunks (list[int]): Chosen simulation chunks to handle
block (str): Block size for dask
eager (bool): Whether to compute attributes immediately
Attributes:
trajectory: Atomic trajectories
bonds: ReaxFF bond data
species: ReaxFF species data
thermo: Thermodynamic data
"""

def __init__(
self,
meta_file: os.PathLike,
chunks: list[int] = None,
block_size: str = "250MiB",
eager: bool = False,
) -> None:

with open(meta_file, "r") as f:
self.meta = FormatMeta(**yaml.safe_load(f)["Metadata"]).model_dump()

if not chunks:
self.chunks = [i for i in range(self.meta["partition"]["n_chunks"])]
valid_chunks = [i for i in range(self.meta["partition"]["n_chunks"])]

self.eager = eager
self.block = block_size

if chunks is not None:
if set(chunks) <= set(valid_chunks):
self.chunks = chunks
else:
raise InvalidChunks(
f"Valid chunks are in range [0,...,{valid_chunks[-1]}]. Was supplied {chunks}"
)
else:
self.chunks = valid_chunks

self.trajectory = None
self.bonds = None
self.species = None
self.thermo = None
self.other_data = (
{k: None for k in self.meta["output"]["other"].keys()}
if self.meta["output"]["other"]
else None
)

# End user methods

def read_bonds(self, bond_path="."):
def read_bonds(self, data_path: os.PathLike = None):
"""
Read bond files, parse and store in class attribute
bond_paths = [
Args:
data_path (os.PathLike): alternate base path containing chosen chunks
"""
base_path = self.meta["data_path"] if data_path is None else data_path
file_paths = [
os.path.join(
bond_path,
f"{self.meta['sim_id']}/{chunk}/dat_bonds_{self.meta['sim_id']}_{chunk}.reaxff",
base_path,
f"{chunk}/dat_bonds_{self.meta['sim_id']}_{chunk}.reaxff",
)
for chunk in self.chunks
]

corpus = (
db.read_text(bond_paths, linedelimiter="# Timestep", blocksize=self.block)
db.read_text(file_paths, linedelimiter="# Timestep", blocksize=self.block)
.remove(lambda x: x == "# Timestep")
.map(
lambda x: [
Expand All @@ -67,47 +125,53 @@ def read_bonds(self, bond_path="."):
.distinct(key=lambda x: x["timestep"])
)

if self.eager:
return corpus.compute()
else:
return corpus
self.bonds = corpus.compute() if self.eager else corpus

def read_trajectory(self, traj_path=".", atomic_format="frame"):
traj_paths = [
def read_trajectory(
self, data_path: os.PathLike = None, atomic_format: str = "frame"
):
"""
Read trajectory files, parse and store in class attribute
Args:
data_path (os.PathLike): alternate base path containing chosen chunks
atomic_format: format to project trajectories into
"""
base_path = self.meta["data_path"] if data_path is None else data_path
file_paths = [
os.path.join(
traj_path,
f"{self.meta['sim_id']}/{chunk}/dat_trajectory_{self.meta['sim_id']}_{chunk}.dump",
base_path,
f"{chunk}/dat_trajectory_{self.meta['sim_id']}_{chunk}.dump",
)
for chunk in self.chunks
]

corpus = (
db.read_text(traj_paths, linedelimiter="TIMESTEP", blocksize=self.block)
db.read_text(file_paths, linedelimiter="TIMESTEP", blocksize=self.block)
.remove(lambda x: x == "ITEM: TIMESTEP")
.map(lambda x: x.split("ITEM: "))
.map(lambda x: x[:-1] if (x[-1] == "TIMESTEP") else x)
.map(self.__process_traj_step, atomic_format=atomic_format)
# .distinct(key=lambda x: x["timestep"]) ; causes memory leak on nanosecond scale data
)

if self.eager:
return corpus.compute()
else:
return corpus
self.trajectory = corpus.compute() if self.eager else corpus

def read_species(self):
    """Read ReaxFF species data; placeholder — not implemented yet."""
    return None

def read_ave(self):
    """Read averaged (fix ave/time style) output; placeholder — not implemented yet."""
    return None

def read_log(self):
def read_thermo(self):
    """Read thermodynamic output; placeholder — not implemented yet."""
    return None

# Intermediate processing steps

def __process_traj_step(self, step_text, atomic_format):

def __process_traj_step(self, step_text: str, atomic_format: str):
"""
Parse raw trajectory data text of one frame into chosen format
"""
frame = {"timestep": "", "n_atoms": "", "atomic": ""}
item_regex = "([A-Z ]*)([A-z ]*)\n((.*[\n]?)*)"
valid_items = ["NUMBER OF ATOMS", "BOX BOUNDS", "ATOMS", "DIMENSIONS"]
Expand Down Expand Up @@ -138,8 +202,19 @@ def __process_traj_step(self, step_text, atomic_format):

elif label == "ATOMS":

# Almost intentionally bad implementation, never use this
if atomic_format == "frame":
frame["atomic"] = [x for x in data.split("\n")]
frame["atomic"] = [
[
(
float(num)
if any(mark in num for mark in [".", "e"])
else int(num)
)
for num in x.split()
]
for x in data.split("\n")
]

elif atomic_format == "pandas":
dataf = pd.read_csv(
Expand All @@ -156,8 +231,10 @@ def __process_traj_step(self, step_text, atomic_format):

return frame

def __process_bond_step(self, step_text):

def __process_bond_step(self, step_text: str):
"""
Parse raw bond data text of one frame into chosen format
"""
# TODO Leverage symmetry and halve time
# atomids start from 0 (actualid -1)
timestep = int(step_text.pop(0))
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ dependencies = [
"PyYAML",
"scipy",
"sparse",
"xarray"
"xarray",
"pre-commit"
]

[project.urls]
Expand Down
79 changes: 79 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
annotated-types==0.6.0
asttokens==2.4.1
black==24.4.2
bokeh==3.4.1
cfgv==3.4.0
click==8.1.7
cloudpickle==3.0.0
comm==0.2.2
contourpy==1.2.1
dask==2024.4.2
debugpy==1.8.1
decorator==5.1.1
distlib==0.3.8
distributed==2024.4.2
exceptiongroup==1.2.1
executing==2.0.1
filelock==3.14.0
fsspec==2024.3.1
h5netcdf==1.3.0
h5py==3.11.0
identify==2.5.36
importlib_metadata==7.1.0
ipykernel==6.29.4
ipython==8.18.1
jedi==0.19.1
Jinja2==3.1.3
jupyter_client==8.6.1
jupyter_core==5.7.2
llvmlite==0.42.0
locket==1.0.0
MarkupSafe==2.1.5
matplotlib-inline==0.1.7
-e git+ssh://[email protected]/ashenoy463/mdx.git@5b38f1fd1eb0a996ed5aabcba52ce3124e4c1fbd#egg=mdx
msgpack==1.0.8
mypy-extensions==1.0.0
nest-asyncio==1.6.0
nodeenv==1.8.0
numba==0.59.1
numpy==1.26.4
packaging==24.0
pandas==2.2.2
parso==0.8.4
partd==1.4.1
pathspec==0.12.1
pexpect==4.9.0
pillow==10.3.0
pkg_resources==0.0.0
platformdirs==4.2.1
pre-commit==3.7.0
prompt-toolkit==3.0.43
psutil==5.9.8
ptyprocess==0.7.0
pure-eval==0.2.2
pydantic==2.7.1
pydantic_core==2.18.2
Pygments==2.17.2
python-dateutil==2.9.0.post0
pytz==2024.1
PyYAML==6.0.1
pyzmq==26.0.2
scipy==1.13.0
six==1.16.0
sortedcontainers==2.4.0
sparse==0.15.1
stack-data==0.6.3
tblib==3.0.0
tomli==2.0.1
toolz==0.12.1
tornado==6.4
traitlets==5.14.3
typing_extensions==4.11.0
tzdata==2024.1
urllib3==2.2.1
virtualenv==20.26.1
wcwidth==0.2.13
xarray==2024.3.0
xyzservices==2024.4.0
zict==3.0.0
zipp==3.18.1

0 comments on commit 4021ecb

Please sign in to comment.