diff --git a/.gitignore b/.gitignore index 874f093..156a490 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ testbench.ipynb - +.vscode/ +.pre-commit-config.yaml # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/examples/meta.yaml b/examples/meta.yaml index 46c7bb7..fca1d4b 100644 --- a/examples/meta.yaml +++ b/examples/meta.yaml @@ -5,6 +5,8 @@ Metadata: sim_desc: "Preliminary run of 1 nanosecond at 300 K — test bed for refactoring and further processing code" # execution dates of chunks (yyyy-mm-ddhh:mmZ,yyyy-mm-ddhh:mmZ) exec_times: [] + # path of folder containing simulation chunks + data_path: "/home/Work/ayush/sim_data/prelim" partition: # stepsize in real units diff --git a/mdx/format/meta.py b/mdx/format/meta.py index 9bb4ddc..cafb068 100644 --- a/mdx/format/meta.py +++ b/mdx/format/meta.py @@ -1,6 +1,10 @@ +from pydantic import BaseModel, PositiveFloat, PositiveInt, ValidationError +from typing import Literal, Dict, Any, Optional, Union from datetime import datetime -from pydantic import BaseModel, PositiveFloat, PositiveInt -from typing import Literal, Dict, Any, Optional + +import os +from pydantic.functional_validators import AfterValidator +from typing_extensions import Annotated # Allowed values @@ -19,6 +23,16 @@ ] # fmt: on +# Validators + + +def check_path(path: os.PathLike) -> os.PathLike: + assert os.path.exists(path), f"{path} is not a valid path" + return path + + +ValidPath = Annotated[os.PathLike, AfterValidator(check_path)] + # Format for simulation metadata @@ -28,7 +42,7 @@ class MetaPartition(BaseModel): n_chunks: PositiveInt # Chunks in a simulation -# TEMPFIX: Find a +# TEMPFIX: Everything that would be here is left to the user currently class MetaExperimental(BaseModel): pass @@ -51,7 +65,8 @@ class FormatMeta(BaseModel): sim_id: str sim_desc: str exec_times: list[tuple[datetime, datetime]] + data_path: ValidPath partition: MetaPartition box: MetaBox - experimental: Optional[Dict[str, Any]] + experimental: 
Optional[Dict[str, Any]] output: MetaOutput diff --git a/mdx/ingest.py b/mdx/ingest.py index adabdb6..7881fdc 100755 --- a/mdx/ingest.py +++ b/mdx/ingest.py @@ -8,11 +8,13 @@ from .format.meta import FormatMeta # ASSUMPTIONS: -# # GRID is never used -# LP is type dependent -# N is constant -# Molecule ID is useless +# LP is type dependent; have to ask +# N is constant ; good assumption by and large +# Molecule ID is useless ; have to ask + + +# Exceptions class InvalidItem(Exception): @@ -23,37 +25,93 @@ class InvalidFormat(Exception): pass -class Simulation: - # handle invalid glob error - # classdocstring - # record traj resolution - # write comments at each step - def __init__(self, meta_file, chunks=[], block="250MiB", eager=False) -> None: +class InvalidChunks(Exception): + pass - self.chunks = chunks - self.eager = eager - self.block = block + +# TODO +# +# Write intermediate bag step formats +# Overridable blocksize class Simulation: + """ + Representation of an entire simulation run, with its metadata. + + The Simulation class is initialized with a metadata file describing + the parameters of a given simulation's chosen chunks. Trajectories + and other outputs are stored as attributes (default: None) which + can be initialized through the appropriate methods. 
+ + Args: + meta_file (os.PathLike): File specifying simulation metadata + chunks (list[int]): Chosen simulation chunks to handle + block (str): Block size for dask + eager (bool): Whether to compute attributes immediately + + Attributes: + trajectory: Atomic trajectories + bonds: ReaxFF bond data + species: ReaxFF species data + thermo: Thermodynamic data + + """ + + def __init__( + self, + meta_file: os.PathLike, + chunks: list[int] = None, + block_size: str = "250MiB", + eager: bool = False, + ) -> None: with open(meta_file, "r") as f: self.meta = FormatMeta(**yaml.safe_load(f)["Metadata"]).model_dump() - if not chunks: - self.chunks = [i for i in range(self.meta["partition"]["n_chunks"])] + valid_chunks = [i for i in range(self.meta["partition"]["n_chunks"])] + + self.eager = eager + self.block = block_size + + if chunks is not None: + if set(chunks) <= set(valid_chunks): + self.chunks = chunks + else: + raise InvalidChunks( + f"Valid chunks are in range [0,...,{valid_chunks[-1]}]. Was supplied {chunks}" + ) + else: + self.chunks = valid_chunks + + self.trajectory = None + self.bonds = None + self.species = None + self.thermo = None + self.other_data = ( + {k: None for k in self.meta["output"]["other"].keys()} + if self.meta["output"]["other"] + else None + ) # End user methods - def read_bonds(self, bond_path="."): + def read_bonds(self, data_path: os.PathLike = None): + """ + Read bond files, parse and store in class attribute - bond_paths = [ + Args: + data_path (os.PathLike): alternate base path containing chosen chunks + """ + base_path = self.meta["data_path"] if data_path is None else data_path + file_paths = [ os.path.join( - bond_path, - f"{self.meta['sim_id']}/{chunk}/dat_bonds_{self.meta['sim_id']}_{chunk}.reaxff", + base_path, + f"{chunk}/dat_bonds_{self.meta['sim_id']}_{chunk}.reaxff", ) for chunk in self.chunks ] corpus = ( - db.read_text(bond_paths, linedelimiter="# Timestep", blocksize=self.block) + db.read_text(file_paths, linedelimiter="# 
Timestep", blocksize=self.block) .remove(lambda x: x == "# Timestep") .map( lambda x: [ @@ -67,22 +125,29 @@ def read_bonds(self, bond_path="."): .distinct(key=lambda x: x["timestep"]) ) - if self.eager: - return corpus.compute() - else: - return corpus + self.bonds = corpus.compute() if self.eager else corpus - def read_trajectory(self, traj_path=".", atomic_format="frame"): - traj_paths = [ + def read_trajectory( + self, data_path: os.PathLike = None, atomic_format: str = "frame" + ): + """ + Read trajectory files, parse and store in class attribute + + Args: + data_path (os.PathLike): alternate base path containing chosen chunks + atomic_format: format to project trajectories into + """ + base_path = self.meta["data_path"] if data_path is None else data_path + file_paths = [ os.path.join( - traj_path, - f"{self.meta['sim_id']}/{chunk}/dat_trajectory_{self.meta['sim_id']}_{chunk}.dump", + base_path, + f"{chunk}/dat_trajectory_{self.meta['sim_id']}_{chunk}.dump", ) for chunk in self.chunks ] corpus = ( - db.read_text(traj_paths, linedelimiter="TIMESTEP", blocksize=self.block) + db.read_text(file_paths, linedelimiter="TIMESTEP", blocksize=self.block) .remove(lambda x: x == "ITEM: TIMESTEP") .map(lambda x: x.split("ITEM: ")) .map(lambda x: x[:-1] if (x[-1] == "TIMESTEP") else x) @@ -90,10 +155,7 @@ def read_trajectory(self, traj_path=".", atomic_format="frame"): # .distinct(key=lambda x: x["timestep"]) ; causes memory leak on nanosecond scale data ) - if self.eager: - return corpus.compute() - else: - return corpus + self.trajectory = corpus.compute() if self.eager else corpus def read_species(self): pass @@ -101,13 +163,15 @@ def read_species(self): def read_ave(self): pass - def read_log(self): + def read_thermo(self): pass # Intermediate processing steps - def __process_traj_step(self, step_text, atomic_format): - + def __process_traj_step(self, step_text: str, atomic_format: str): + """ + Parse raw trajectory data text of one frame into chosen format + """ frame 
= {"timestep": "", "n_atoms": "", "atomic": ""} item_regex = "([A-Z ]*)([A-z ]*)\n((.*[\n]?)*)" valid_items = ["NUMBER OF ATOMS", "BOX BOUNDS", "ATOMS", "DIMENSIONS"] @@ -138,8 +202,19 @@ def __process_traj_step(self, step_text, atomic_format): elif label == "ATOMS": + # Almost intentionally bad implementation, never use this if atomic_format == "frame": - frame["atomic"] = [x for x in data.split("\n")] + frame["atomic"] = [ + [ + ( + float(num) + if any(mark in num for mark in [".", "e"]) + else int(num) + ) + for num in x.split() + ] + for x in data.split("\n") + ] elif atomic_format == "pandas": dataf = pd.read_csv( @@ -156,8 +231,10 @@ def __process_traj_step(self, step_text, atomic_format): return frame - def __process_bond_step(self, step_text): - + def __process_bond_step(self, step_text: str): + """ + Parse raw bond data text of one frame into chosen format + """ # TODO Leverage symmetry and halve time # atomids start from 0 (actualid -1) timestep = int(step_text.pop(0)) diff --git a/pyproject.toml b/pyproject.toml index defa9fd..8fc5bc9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,8 @@ dependencies = [ "PyYAML", "scipy", "sparse", -"xarray" +"xarray", +"pre-commit" ] [project.urls] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..039e163 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,79 @@ +annotated-types==0.6.0 +asttokens==2.4.1 +black==24.4.2 +bokeh==3.4.1 +cfgv==3.4.0 +click==8.1.7 +cloudpickle==3.0.0 +comm==0.2.2 +contourpy==1.2.1 +dask==2024.4.2 +debugpy==1.8.1 +decorator==5.1.1 +distlib==0.3.8 +distributed==2024.4.2 +exceptiongroup==1.2.1 +executing==2.0.1 +filelock==3.14.0 +fsspec==2024.3.1 +h5netcdf==1.3.0 +h5py==3.11.0 +identify==2.5.36 +importlib_metadata==7.1.0 +ipykernel==6.29.4 +ipython==8.18.1 +jedi==0.19.1 +Jinja2==3.1.3 +jupyter_client==8.6.1 +jupyter_core==5.7.2 +llvmlite==0.42.0 +locket==1.0.0 +MarkupSafe==2.1.5 +matplotlib-inline==0.1.7 +-e 
git+ssh://git@github.com/ashenoy463/mdx.git@5b38f1fd1eb0a996ed5aabcba52ce3124e4c1fbd#egg=mdx +msgpack==1.0.8 +mypy-extensions==1.0.0 +nest-asyncio==1.6.0 +nodeenv==1.8.0 +numba==0.59.1 +numpy==1.26.4 +packaging==24.0 +pandas==2.2.2 +parso==0.8.4 +partd==1.4.1 +pathspec==0.12.1 +pexpect==4.9.0 +pillow==10.3.0 +pkg_resources==0.0.0 +platformdirs==4.2.1 +pre-commit==3.7.0 +prompt-toolkit==3.0.43 +psutil==5.9.8 +ptyprocess==0.7.0 +pure-eval==0.2.2 +pydantic==2.7.1 +pydantic_core==2.18.2 +Pygments==2.17.2 +python-dateutil==2.9.0.post0 +pytz==2024.1 +PyYAML==6.0.1 +pyzmq==26.0.2 +scipy==1.13.0 +six==1.16.0 +sortedcontainers==2.4.0 +sparse==0.15.1 +stack-data==0.6.3 +tblib==3.0.0 +tomli==2.0.1 +toolz==0.12.1 +tornado==6.4 +traitlets==5.14.3 +typing_extensions==4.11.0 +tzdata==2024.1 +urllib3==2.2.1 +virtualenv==20.26.1 +wcwidth==0.2.13 +xarray==2024.3.0 +xyzservices==2024.4.0 +zict==3.0.0 +zipp==3.18.1