Skip to content

Commit

Permalink
Make Simulation stateful
Browse files Browse the repository at this point in the history
Parsed data is now stored in attributes of mdx.ingest.Simulation and data paths are read from metadata by default
Data path is now stored as a default in simulation metadata
Documented mdx.ingest

Minor changes:
Add pre-commit hooks
Add requirements.txt for dev
  • Loading branch information
ashenoy463 committed May 1, 2024
1 parent 5b38f1f commit 4021ecb
Show file tree
Hide file tree
Showing 6 changed files with 220 additions and 45 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
testbench.ipynb

.vscode/
.pre-commit-config.yaml

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
2 changes: 2 additions & 0 deletions examples/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ Metadata:
sim_desc: "Preliminary run of 1 nanosecond at 300 K — test bed for refactoring and further processing code"
# execution dates of chunks (yyyy-mm-ddhh:mmZ,yyyy-mm-ddhh:mmZ)
exec_times: []
# path of folder containing simulation chunks
data_path: "/home/Work/ayush/sim_data/prelim"

partition:
# stepsize in real units
Expand Down
23 changes: 19 additions & 4 deletions mdx/format/meta.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from pydantic import BaseModel, PositiveFloat, PositiveInt, ValidationError
from typing import Literal, Dict, Any, Optional, Union
from datetime import datetime
from pydantic import BaseModel, PositiveFloat, PositiveInt
from typing import Literal, Dict, Any, Optional

import os
from pydantic.functional_validators import AfterValidator
from typing_extensions import Annotated

# Allowed values

Expand All @@ -19,6 +23,16 @@
]
# fmt: on

# Validators


def check_path(path: os.PathLike) -> os.PathLike:
    """Validate that *path* exists on the filesystem.

    Used as a pydantic ``AfterValidator`` (see ``ValidPath``); pydantic
    wraps the raised ``ValueError`` into a ``ValidationError`` for callers.

    Args:
        path (os.PathLike): candidate filesystem path

    Returns:
        os.PathLike: the unchanged, validated path

    Raises:
        ValueError: if the path does not exist. An explicit raise is used
            instead of ``assert`` because asserts are stripped under
            ``python -O``, which would silently disable validation.
    """
    if not os.path.exists(path):
        raise ValueError(f"{path} is not a valid path")
    return path


ValidPath = Annotated[os.PathLike, AfterValidator(check_path)]

# Format for simulation metadata


Expand All @@ -28,7 +42,7 @@ class MetaPartition(BaseModel):
n_chunks: PositiveInt # Chunks in a simulation


# TEMPFIX: Find a
# TEMPFIX: Everything that would be here is left to the user currently
class MetaExperimental(BaseModel):
pass

Expand All @@ -51,7 +65,8 @@ class FormatMeta(BaseModel):
sim_id: str
sim_desc: str
exec_times: list[tuple[datetime, datetime]]
data_path: ValidPath
partition: MetaPartition
box: MetaBox
experimental: Optional[Dict[str, Any]]
experimental: Optional[Dict[str, Any]]
output: MetaOutput
155 changes: 116 additions & 39 deletions mdx/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@
from .format.meta import FormatMeta

# ASSUMPTIONS:
#
# GRID is never used
# LP is type dependent
# N is constant
# Molecule ID is useless
# LP is type dependent; have to ask
# N is constant ; good assumption by and large
# Molecule ID is useless ; have to ask


# Exceptions


class InvalidItem(Exception):
Expand All @@ -23,37 +25,93 @@ class InvalidFormat(Exception):
pass


class Simulation:
# handle invalid glob error
# classdocstring
# record traj resolution
# write comments at each step
def __init__(self, meta_file, chunks=[], block="250MiB", eager=False) -> None:
class InvalidChunks(Exception):
    """Raised when requested chunk indices fall outside the range of chunks
    declared in the simulation metadata (``partition.n_chunks``)."""

    pass

self.chunks = chunks
self.eager = eager
self.block = block

# TODO
#
# Write intermediate bag step formats
# Overridable blocksize
class Simulation:
"""
Representation of an entire simulation run, with its metadata.
The Simulation class is initialized with a metadata file describing
the parameters of a given simulation's chosen chunks. Trajectories
and other outputs are stored as attributes (default: None) which
can be initialized through the appropriate methods.
Args:
meta_file (os.PathLike): File specifying simulation metadata
chunks (list[int]): Chosen simulation chunks to handle
block (str): Block size for dask
eager (bool): Whether to compute attributes immediately
Attributes:
trajectory: Atomic trajectories
bonds: ReaxFF bond data
species: ReaxFF species data
thermo: Thermodynamic data
"""

def __init__(
self,
meta_file: os.PathLike,
chunks: list[int] = None,
block_size: str = "250MiB",
eager: bool = False,
) -> None:

with open(meta_file, "r") as f:
self.meta = FormatMeta(**yaml.safe_load(f)["Metadata"]).model_dump()

if not chunks:
self.chunks = [i for i in range(self.meta["partition"]["n_chunks"])]
valid_chunks = [i for i in range(self.meta["partition"]["n_chunks"])]

self.eager = eager
self.block = block_size

if chunks is not None:
if set(chunks) <= set(valid_chunks):
self.chunks = chunks
else:
raise InvalidChunks(
f"Valid chunks are in range [0,...,{valid_chunks[-1]}]. Was supplied {chunks}"
)
else:
self.chunks = valid_chunks

self.trajectory = None
self.bonds = None
self.species = None
self.thermo = None
self.other_data = (
{k: None for k in self.meta["output"]["other"].keys()}
if self.meta["output"]["other"]
else None
)

# End user methods

def read_bonds(self, bond_path="."):
def read_bonds(self, data_path: os.PathLike = None):
"""
Read bond files, parse and store in class attribute
bond_paths = [
Args:
data_path (os.PathLike): alternate base path containing chosen chunks
"""
base_path = self.meta["data_path"] if data_path is None else data_path
file_paths = [
os.path.join(
bond_path,
f"{self.meta['sim_id']}/{chunk}/dat_bonds_{self.meta['sim_id']}_{chunk}.reaxff",
base_path,
f"{chunk}/dat_bonds_{self.meta['sim_id']}_{chunk}.reaxff",
)
for chunk in self.chunks
]

corpus = (
db.read_text(bond_paths, linedelimiter="# Timestep", blocksize=self.block)
db.read_text(file_paths, linedelimiter="# Timestep", blocksize=self.block)
.remove(lambda x: x == "# Timestep")
.map(
lambda x: [
Expand All @@ -67,47 +125,53 @@ def read_bonds(self, bond_path="."):
.distinct(key=lambda x: x["timestep"])
)

if self.eager:
return corpus.compute()
else:
return corpus
self.bonds = corpus.compute() if self.eager else corpus

def read_trajectory(self, traj_path=".", atomic_format="frame"):
traj_paths = [
def read_trajectory(
self, data_path: os.PathLike = None, atomic_format: str = "frame"
):
"""
Read trajectory files, parse and store in class attribute
Args:
data_path (os.PathLike): alternate base path containing chosen chunks
atomic_format: format to project trajectories into
"""
base_path = self.meta["data_path"] if data_path is None else data_path
file_paths = [
os.path.join(
traj_path,
f"{self.meta['sim_id']}/{chunk}/dat_trajectory_{self.meta['sim_id']}_{chunk}.dump",
base_path,
f"{chunk}/dat_trajectory_{self.meta['sim_id']}_{chunk}.dump",
)
for chunk in self.chunks
]

corpus = (
db.read_text(traj_paths, linedelimiter="TIMESTEP", blocksize=self.block)
db.read_text(file_paths, linedelimiter="TIMESTEP", blocksize=self.block)
.remove(lambda x: x == "ITEM: TIMESTEP")
.map(lambda x: x.split("ITEM: "))
.map(lambda x: x[:-1] if (x[-1] == "TIMESTEP") else x)
.map(self.__process_traj_step, atomic_format=atomic_format)
# .distinct(key=lambda x: x["timestep"]) ; causes memory leak on nanosecond scale data
)

if self.eager:
return corpus.compute()
else:
return corpus
self.trajectory = corpus.compute() if self.eager else corpus

def read_species(self):
    """Read ReaxFF species data; placeholder — not implemented yet."""
    return None

def read_ave(self):
    """Read averaged (fix ave/time style) output; placeholder — not implemented yet."""
    return None

def read_log(self):
def read_thermo(self):
    """Read thermodynamic output; placeholder — not implemented yet."""
    return None

# Intermediate processing steps

def __process_traj_step(self, step_text, atomic_format):

def __process_traj_step(self, step_text: str, atomic_format: str):
"""
Parse raw trajectory data text of one frame into chosen format
"""
frame = {"timestep": "", "n_atoms": "", "atomic": ""}
item_regex = "([A-Z ]*)([A-z ]*)\n((.*[\n]?)*)"
valid_items = ["NUMBER OF ATOMS", "BOX BOUNDS", "ATOMS", "DIMENSIONS"]
Expand Down Expand Up @@ -138,8 +202,19 @@ def __process_traj_step(self, step_text, atomic_format):

elif label == "ATOMS":

# Almost intentionally bad implementation, never use this
if atomic_format == "frame":
frame["atomic"] = [x for x in data.split("\n")]
frame["atomic"] = [
[
(
float(num)
if any(mark in num for mark in [".", "e"])
else int(num)
)
for num in x.split()
]
for x in data.split("\n")
]

elif atomic_format == "pandas":
dataf = pd.read_csv(
Expand All @@ -156,8 +231,10 @@ def __process_traj_step(self, step_text, atomic_format):

return frame

def __process_bond_step(self, step_text):

def __process_bond_step(self, step_text: str):
"""
Parse raw bond data text of one frame into chosen format
"""
# TODO Leverage symmetry and halve time
# atomids start from 0 (actualid -1)
timestep = int(step_text.pop(0))
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ dependencies = [
"PyYAML",
"scipy",
"sparse",
"xarray"
"xarray",
"pre-commit"
]

[project.urls]
Expand Down
79 changes: 79 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
annotated-types==0.6.0
asttokens==2.4.1
black==24.4.2
bokeh==3.4.1
cfgv==3.4.0
click==8.1.7
cloudpickle==3.0.0
comm==0.2.2
contourpy==1.2.1
dask==2024.4.2
debugpy==1.8.1
decorator==5.1.1
distlib==0.3.8
distributed==2024.4.2
exceptiongroup==1.2.1
executing==2.0.1
filelock==3.14.0
fsspec==2024.3.1
h5netcdf==1.3.0
h5py==3.11.0
identify==2.5.36
importlib_metadata==7.1.0
ipykernel==6.29.4
ipython==8.18.1
jedi==0.19.1
Jinja2==3.1.3
jupyter_client==8.6.1
jupyter_core==5.7.2
llvmlite==0.42.0
locket==1.0.0
MarkupSafe==2.1.5
matplotlib-inline==0.1.7
-e git+ssh://[email protected]/ashenoy463/mdx.git@5b38f1fd1eb0a996ed5aabcba52ce3124e4c1fbd#egg=mdx
msgpack==1.0.8
mypy-extensions==1.0.0
nest-asyncio==1.6.0
nodeenv==1.8.0
numba==0.59.1
numpy==1.26.4
packaging==24.0
pandas==2.2.2
parso==0.8.4
partd==1.4.1
pathspec==0.12.1
pexpect==4.9.0
pillow==10.3.0
pkg_resources==0.0.0
platformdirs==4.2.1
pre-commit==3.7.0
prompt-toolkit==3.0.43
psutil==5.9.8
ptyprocess==0.7.0
pure-eval==0.2.2
pydantic==2.7.1
pydantic_core==2.18.2
Pygments==2.17.2
python-dateutil==2.9.0.post0
pytz==2024.1
PyYAML==6.0.1
pyzmq==26.0.2
scipy==1.13.0
six==1.16.0
sortedcontainers==2.4.0
sparse==0.15.1
stack-data==0.6.3
tblib==3.0.0
tomli==2.0.1
toolz==0.12.1
tornado==6.4
traitlets==5.14.3
typing_extensions==4.11.0
tzdata==2024.1
urllib3==2.2.1
virtualenv==20.26.1
wcwidth==0.2.13
xarray==2024.3.0
xyzservices==2024.4.0
zict==3.0.0
zipp==3.18.1

0 comments on commit 4021ecb

Please sign in to comment.