"""Harvest data from the directories of completed calculations.Example: Harvest the results in the current working directory as vibrational calculations .. code-block:: python from pathlib import Path from autojob.calculation.vibration import Vibration from autojob.harvest.harvest import harvest harvest(dir_name=Path.cwd(), strictness="relaxed", preferred=Vibration).. important:: Always verify the units of harvested quantities."""importloggingfrompathlibimportPathfromtypingimportLiteralfromtqdmimporttqdmfromautojobimportSETTINGSfromautojob.taskimportTaskfromautojob.utils.filesimportfind_job_dirslogger=logging.getLogger(__name__)def_concatenate_list_sources(sources:list[str]|list[Path])->list[str]:"""Read the lines from a list of files and concatentate their lines. Args: sources: A list of filenames. Returns: The unique, non-empty lines in the files provided. """res=[]forsourceinsources:withPath(source).open(mode="r",encoding="utf-8")asfile:lines=[line.rstrip()forlineinfile.readlines()ifline.rstrip()]res.extend(lines)returnres
def harvest(
    dir_name: str | Path,
    *,
    strictness: Literal["strict", "relaxed", "atomic"] | None = None,
    whitelists: list[str] | list[Path] | None = None,
    blacklists: list[str] | list[Path] | None = None,
    preferred: type[Task] | None = None,
) -> list[Task]:
    """Collect all data in subdirectories of the given directory.

    Args:
        dir_name: The directory under which to collect data.
        strictness: How to treat tasks for which errors are thrown during
            their harvesting. If ``"strict"``, all harvesting will abort. If
            ``"atomic"``, only calculations for which errors are not thrown
            will be harvested. If ``"relaxed"``, every attempt is made to
            harvest all calculations. The default behaviour is controlled by
            the value of ``SETTINGS.STRICT_MODE``. If
            ``SETTINGS.STRICT_MODE=True``, the default behaviour will be that
            of ``strictness="strict"``. Otherwise, the default behaviour will
            be that of ``strictness="relaxed"``.
        whitelists: A list of strings or paths representing whitelist
            filenames, where each whitelist points to a list of task IDs that
            should be harvested. When specified, only tasks with task IDs
            matching these IDs will be harvested. IDs are compared against
            the ``name`` of each job directory. Defaults to None in which
            case all tasks are eligible for harvesting.
        blacklists: A list of strings or paths representing blacklist
            filenames, where each blacklist points to a list of task IDs that
            should not be harvested. When specified, no tasks with task IDs
            in this list will be harvested. IDs are compared against the
            ``name`` of each job directory. Defaults to None in which case
            all tasks will be harvested.
        preferred: A preferred Task type to use to harvest each calculation.
            Defaults to :class:`autojob.task.Task`.

    Returns:
        A list of :class:`~task.Task` s containing the data within
        ``dir_name``.

    Raises:
        FileNotFoundError: If harvesting a task raises this error and the
            effective mode is strict (``strictness="strict"``, or
            ``strictness=None`` with a truthy ``SETTINGS.STRICT_MODE``).
            Also raised if a whitelist or blacklist file does not exist.
    """
    logger.debug(f"Harvesting calculations from: {dir_name}")

    # "atomic" also enables strict_mode for the builder; it differs from
    # "strict" only in how a failure is handled in the loop below.
    strict_mode = (
        SETTINGS.STRICT_MODE
        if strictness is None
        else strictness in ("strict", "atomic")
    )
    jobs = find_job_dirs(Path(dir_name))
    builder = preferred or Task

    # Read each list file once, up front, and test membership against a set.
    # Calling the reader inside the filter condition would re-open every
    # list file for every job directory.
    if whitelists is not None:
        allowed_ids = set(_concatenate_list_sources(whitelists))
        jobs = [j for j in jobs if j.name in allowed_ids]

    if blacklists is not None:
        excluded_ids = set(_concatenate_list_sources(blacklists))
        jobs = [j for j in jobs if j.name not in excluded_ids]

    harvested = []

    for job in tqdm(jobs):
        try:
            harvested_task = builder.from_directory(
                job, strict_mode=strict_mode, magic_mode=True
            )
            harvested.append(harvested_task)
        # NOTE(review): only FileNotFoundError is intercepted; any other
        # exception type propagates regardless of ``strictness``. Confirm
        # this matches the intended "relaxed"/"atomic" semantics.
        except FileNotFoundError as e:
            # Abort everything in strict mode; "atomic" skips just this task.
            if strict_mode and strictness != "atomic":
                raise

            logger.warning(
                f"Unable to harvest task in directory {job} due to "
                "following error"
            )
            logger.error(e)

    logger.info(f"{len(harvested)} calculations harvested")
    return harvested