Source code for autojob.harvest.patch
"""Supplement harvested data with data patches.
Oftentimes, you may have additional data that you either
a. can't determine a priori (and thus mark the task with it prior to
submission), or
b. extract programmatically (these may be analyses that require fuzzy
intuition).
but nonetheless want to store with your data. This module defines some
simple routines and classes to facilitate the latter use-case.
A :class:`Patch` is just that, a "patch" - it fills in the gap in data
that may exist. To define one, you specify a feature of the data
to which it should be applied and what data should be added when it is
applied.
.. code-block:: python
from ase import Atoms
from autojob.harvest.patch import Patch
pch = Patch(match_path=["study_id"],
match_value="123456789",
patch_path=["atoms", "positions"],
patch_value=[0.0, 0.0, 0.0],
)
datapoint1 = {
"study_id": None,
"atoms": None
}
atoms = Atoms("C", positions=[[0.0, 1.0, 2.0]])
datapoint2 = {
"study_id": None,
"atoms": atoms
}
pch.apply(datapoint1)
print(datapoint1["atoms"])
None
pch.apply(datapoint2)
print(datapoint2["atoms"].positions)
[0.0, 0.0, 0.0]
To what data the :class:`Patch` will apply is specified by ``match_path`` and
``match_value``. While, what will be applied is specified by ``patch_path`` and
``patch_value``.
Note:
Patch applies to both dictionaries and objects alike!
Example:
Apply a set of patches in batch
.. code-block:: python
from autojob.task import Task
tasks = [Task(...), Task(...), ...]
patches = [Patch(...), Patch(...), ...]
for task in tasks:
for patch in patches:
patch.apply(task)
"""
import json
import logging
from pathlib import Path
from typing import Any
from typing import Literal
from typing import NamedTuple
from autojob import SETTINGS
from autojob.parametrizations import VariableReference
from autojob.parametrizations import getattrpath
from autojob.task import Task
from autojob.utils.files import find_calculation_dirs
from autojob.utils.files import find_study_dirs
from autojob.utils.files import find_study_group_dirs
# Module-level logger named after this module; build_metadata_patches uses it
# to warn when a metadata file cannot be found and strict mode is off.
logger = logging.getLogger(__name__)
[docs]
class Patch(NamedTuple):
    """A conditional, in-place data patch.

    A patch pairs a *match* (a path into the data plus the value expected
    there) with a *payload* (a path into the data plus the value to write
    there). The payload is only written when the match succeeds.

    Attributes:
        match_path: A list of attribute/key names locating the value that
            decides whether the patch applies.
        match_value: The value that must be found at ``match_path`` for the
            patch to be applied.
        patch_path: A list of attribute/key names locating the value to be
            patched.
        patch_value: The value to set at ``patch_path``.
    """

    match_path: list[str]
    match_value: Any
    patch_path: list[str]
    patch_value: Any

    def apply(self, data: object) -> None:
        """Apply this patch to ``data`` if ``data`` matches.

        Args:
            data: The data to which the patch may be applied. Nothing is
                written unless the value found at ``match_path`` equals
                ``match_value``; when it does, ``data`` is modified in
                place.
        """
        found = getattrpath(data, self.match_path)
        is_match = found == self.match_value
        if not is_match:
            return

        # The patch value is written through a constant-valued reference:
        # no value is read from a context (``get_path=None``), so an empty
        # context is passed along with the data to modify.
        writer = VariableReference(
            set_path=self.patch_path,
            get_path=None,
            constant=self.patch_value,
        )
        writer.set_input_value({}, data)
[docs]
def patch_tasks(patches: list[Patch], tasks: list[Task]) -> None:
    """Apply every patch to every task.

    The tasks are modified in place; nothing is returned. Tasks are visited
    in order and, for each task, the patches are applied in order.

    Args:
        patches: The patches to apply.
        tasks: The tasks to which the patches will be applied.
    """
    # Task-major ordering: all patches are tried on one task before moving
    # on to the next task.
    pairs = ((patch, task) for task in tasks for patch in patches)
    for patch, task in pairs:
        patch.apply(task)
[docs]
def build_metadata_patches(
dir_name: Path,
*,
metadata_type: Literal[
"study_group", "study", "calculation"
] = "study_group",
strict_mode: bool = SETTINGS.STRICT_MODE,
legacy_mode: bool = False,
) -> list[Patch]:
"""Create patches from metadata files.
Args:
dir_name: The name of the directory under which to search for
metadata. Defaults to the current working directory.
strict_mode: Whether or not to abort metadata collection if
metadata cannot be found. Defaults to ``SETTINGS.STRICT_MODE``.
metadata_type: The type of metadata file from which patches are to be
built. Must be one of ``"study_group"``, ``"study"``,
``"calculation"``. Defaults to ``"study_group"``.
legacy_mode: Whether or not to assume the legacy format for the
directory. Defaults to False.
Returns:
A list of :class:`Patch` objects which will add metadata
to :attr:`TaskMetadata.__pydantic_extra__`. Further, patch paths are
defined such that study group, study, and calculation metadata will be
added under the ``"study_group_metadata"``, ``"study_metadata"``, and
``"calculation_metadata"`` keys, respectively.
Example:
Patch study group and study metadata for all tasks in a subdirectory.
.. code-block:: python
from pathlib import Path
from autojob.harvest.harvest import harvest
from autojob.harvest.patch import build_metadata_patches
from autojob.harvest.patch import patch_tasks
dir_name = Path().cwd()
tasks = harvest(dir_name)
patches = build_metadata_patches(dir_name)
patch_tasks(patches, tasks)
"""
match metadata_type:
case "study_group":
finder = find_study_group_dirs
filename = SETTINGS.STUDY_GROUP_FILE
subsidiary: Literal["study", "calculation", ""] = "study"
case "study":
finder = find_study_dirs
filename = SETTINGS.STUDY_FILE
subsidiary = "calculation" if legacy_mode else ""
case "calculation":
finder = find_calculation_dirs
filename = SETTINGS.CALCULATION_FILE
subsidiary = ""
source_dirs = finder(dir_name)
patches: list[Patch] = []
for source_dir in source_dirs:
try:
with source_dir.joinpath(filename).open(
mode="r", encoding="utf-8"
) as file:
metadata = json.load(file)
match_path = ["task_metadata", f"{metadata_type}_id"]
patch_path = [
"task_metadata",
"__pydantic_extra__",
f"{metadata_type}_metadata",
]
patches.append(
Patch(
match_path=match_path,
match_value=source_dir.name,
patch_path=patch_path,
patch_value=metadata,
)
)
if subsidiary:
patches.extend(
build_metadata_patches(
dir_name=source_dir,
metadata_type=subsidiary,
strict_mode=strict_mode,
legacy_mode=legacy_mode,
)
)
except FileNotFoundError:
if strict_mode:
raise
logger.warning(
"Unable to build metadata patches for %s %s",
metadata_type,
source_dir,
)
return patches