"""Utilities for handling files and directories."""importcontextlibimportloggingimportpathlibimportreimportsocketimportsubprocessfromtypingimportTextIOimportjinja2fromautojobimportSETTINGSfromautojob.coordinatorimportjoblogger=logging.getLogger(__name__)JOB_STATS_FILE="job_stats.txt"
[docs]defget_uri(dir_name:str|pathlib.Path)->str:"""Return the URI path for a directory. This allows files hosted on different file servers to have distinct locations. Adapted from Atomate2. Arg: dir_name: A directory name. Returns: Full URI path, e.g., "fileserver.host.com:/full/path/of/dir_name". """fullpath=pathlib.Path(dir_name).absolute()hostname=socket.gethostname()withcontextlib.suppress(socket.gaierror,socket.herror):hostname=socket.gethostbyaddr(hostname)[0]returnf"{hostname}:{fullpath}"
def extract_structure_name(python_script: TextIO) -> str:
    """Determine the structure filename from a Python script file.

    The structure must be assigned to ``atoms`` at the start of a line via
    a call to ase.io.read spelled as one of:

        1) atoms = ase.io.read(structure_name)
        2) atoms = io.read(structure_name)
        3) atoms = read(structure_name)

    The filename must be a quoted string literal given as the first
    argument; any further arguments (e.g., ``index=-1``) are ignored. The
    structure present in the first such occurrence is returned.

    Args:
        python_script: A stream containing the contents of the Python script
            used to run the calculation. It must expose a ``name`` attribute
            (used in log and error messages).

    Raises:
        RuntimeError: No structure name found.

    Returns:
        A string representing the filename of the structure read in the
        Python script.
    """
    logger.debug(
        "Extracting structure filename from %s", python_script.name
    )
    # The dots are escaped so that only the literal prefixes "ase." and
    # "io." are accepted, and the filename may not contain quotes so that
    # additional arguments are never swallowed into the captured name.
    structure_re = re.compile(
        r'^atoms = (?:ase\.)?(?:io\.)?read\('
        r'["\'](?P<structure_name>[^"\']+)["\']'
        r'(?:\s*,.*)?\)\s*$'
    )

    for line in python_script:
        match = structure_re.search(line)
        if match:
            structure_name = match.group("structure_name")
            logger.debug(
                "Successfully extracted structure filename from %s: %s",
                python_script.name,
                structure_name,
            )
            return structure_name

    msg = f"Unable to determine structure name from {python_script.name}"
    raise RuntimeError(msg)
def find_slurm_file(dir_name: pathlib.Path) -> pathlib.Path:
    """Retrieve the path to the newest slurm output file.

    The directory is searched recursively for files named "slurm-*.out".

    Args:
        dir_name: The directory in which to search.

    Returns:
        The path to the slurm output file. If multiple slurm output files
        exist, the one corresponding to the job with the highest slurm job
        ID will be returned.

    Raises:
        FileNotFoundError: No valid slurm file found.
    """
    job_id_re = re.compile(r"slurm-(\d+)")

    def _rank(slurm_file: pathlib.Path) -> tuple[int, str]:
        # Compare job IDs numerically: a plain path sort is lexicographic
        # and would place "slurm-9.out" after "slurm-10.out". Files without
        # a numeric ID rank lowest; the path breaks ties deterministically.
        match = job_id_re.match(slurm_file.name)
        job_id = int(match[1]) if match else -1
        return job_id, str(slurm_file)

    candidates = list(dir_name.rglob("slurm-*.out"))
    if not candidates:
        msg = "No valid slurm file found."
        raise FileNotFoundError(msg)
    return max(candidates, key=_rank)
def get_slurm_job_id(job_dir: pathlib.Path) -> int:
    """Return the SLURM job id for the job run in the directory "job_dir".

    Only files directly inside "job_dir" named exactly "slurm-<digits>.out"
    are considered. If several such files exist, the highest job id is
    returned so that the result does not depend on directory listing order.

    Args:
        job_dir: The directory containing the slurm output file.

    Raises:
        FileNotFoundError: SLURM output file not found.

    Returns:
        The SLURM job id.
    """
    # The dot is escaped so that only a literal ".out" suffix is accepted.
    slurm_re = re.compile(r"slurm-(\d+)\.out")
    job_ids = [
        int(match[1])
        for entry in job_dir.iterdir()
        if (match := slurm_re.fullmatch(entry.name))
    ]
    if job_ids:
        return max(job_ids)

    # Only the last four path components are shown to keep the message short.
    msg = f"No slurm output file found in {'/'.join(job_dir.parts[-4:])}"
    raise FileNotFoundError(msg)
def create_job_stats_file(
    slurm_job_id: int, job_dir: str | pathlib.Path
) -> pathlib.Path:
    """Write the ``sacct`` statistics of a Slurm job to a file.

    The file is named after ``JOB_STATS_FILE`` and placed in "job_dir". It
    is opened in exclusive-creation mode, so a pre-existing file causes a
    FileExistsError to propagate. Nothing is written when ``sacct``
    produces no output; the path is returned regardless.

    Args:
        slurm_job_id: The Slurm job ID for the job.
        job_dir: The job directory.

    Raises:
        RuntimeError: Unable to create job stats file (``sacct`` exited
            with a non-zero status).

    Returns:
        A pathlib.Path to the file containing the job statistics.
    """
    logger.debug(f"Creating job stats file for Slurm job: {slurm_job_id}")
    stats_path = pathlib.Path(job_dir) / JOB_STATS_FILE

    # "%20" is used as the separator, so every field except the last one
    # carries the "%20" suffix.
    format_spec = "%20,".join(job.JOB_STATS_FIELDS)
    sacct_cmd = [
        "/usr/bin/env",
        "sacct",
        f"--jobs={slurm_job_id}",
        f"--format={format_spec}",
    ]

    try:
        completed = subprocess.run(
            sacct_cmd,
            text=True,
            check=True,
            capture_output=True,
        )
    except subprocess.CalledProcessError as err:
        msg = f"Unable to create job stats file for job in {job_dir}"
        raise RuntimeError(msg) from err
    else:
        if completed.stdout:
            with stats_path.open(mode="x", encoding="utf-8") as stream:
                stream.write(completed.stdout)

    logger.debug(
        f"Successfully created job stats file for Slurm job: {slurm_job_id}"
    )
    return stats_path
def find_study_group_dirs(
    path: pathlib.Path | None = None,
) -> list[pathlib.Path]:
    """Collect every study group directory at or below "path".

    A study group directory is one whose name is "g" followed by exactly
    nine alphanumeric characters. Directories that match are not searched
    any further.

    Args:
        path: Top level directory to be searched. Defaults to current
            working directory.

    Returns:
        All study group directories below "path".
    """
    study_group_re = re.compile(r"g[a-zA-Z0-9]{9}")
    return _find_template_dir(pattern=study_group_re, path=path)
def find_study_dirs(path: pathlib.Path | None = None) -> list[pathlib.Path]:
    """Collect every study directory at or below "path".

    A study directory is one whose name is "s" followed by exactly nine
    alphanumeric characters. Directories that match are not searched any
    further.

    Args:
        path: Top level directory to be searched. Defaults to current
            working directory.

    Returns:
        A list of Paths to all study directories below path.
    """
    study_re = re.compile(r"s[a-zA-Z0-9]{9}")
    return _find_template_dir(pattern=study_re, path=path)
def find_calculation_dirs(
    path: pathlib.Path | None = None,
) -> list[pathlib.Path]:
    """Collect every calculation directory at or below "path".

    A calculation directory is one whose name is "c" followed by exactly
    nine alphanumeric characters. Directories that match are not searched
    any further.

    Args:
        path: Top level directory to be searched. Defaults to current
            working directory.

    Returns:
        A list of Paths to all calculation directories below path.
    """
    calculation_re = re.compile(r"c[a-zA-Z0-9]{9}")
    return _find_template_dir(pattern=calculation_re, path=path)
def find_job_dirs(path: pathlib.Path | None = None) -> list[pathlib.Path]:
    """Collect every job directory at or below "path".

    A job directory is one whose name is "j" followed by exactly nine
    alphanumeric characters. Directories that match are not searched any
    further.

    Args:
        path: Top level directory to be searched. Defaults to current
            working directory.

    Returns:
        A list of all job directories below path.
    """
    job_re = re.compile(r"j[a-zA-Z0-9]{9}")
    return _find_template_dir(pattern=job_re, path=path)
def find_last_submitted_jobs(
    path: pathlib.Path | None = None,
    ignore_unrun_jobs: bool = False,
) -> list[pathlib.Path]:
    """Return the directories of the most recently submitted jobs.

    For every calculation directory at or below "path", the subdirectory
    whose slurm output file carries the highest SLURM job id is selected.

    Args:
        path: The directory specifying or containing calculations. Defaults
            to current working directory.
        ignore_unrun_jobs: If true, no job will be reported for calculation
            directories containing jobs that have not yet been run (i.e.,
            subdirectories without a slurm output file). Otherwise, such
            subdirectories are skipped and the most recently submitted job
            among the remaining ones is reported. Defaults to False.

    Returns:
        A list of Paths to directories containing newest jobs for each
        calculation in path or subdirectories of path.
    """
    newest_jobs: list[pathlib.Path] = []

    for calc_dir in find_calculation_dirs(path):
        newest_job_dir = None
        newest_job_id = None

        for job_dir in calc_dir.iterdir():
            if not job_dir.is_dir():
                continue

            try:
                job_id = get_slurm_job_id(job_dir)
            except FileNotFoundError:
                if ignore_unrun_jobs:
                    # Discard any candidate found so far; otherwise the
                    # outcome would depend on the (arbitrary) order in
                    # which the directory entries are listed.
                    newest_job_dir = None
                    break
                continue

            if newest_job_id is None or job_id > newest_job_id:
                newest_job_id = job_id
                newest_job_dir = job_dir

        if newest_job_dir is not None:
            newest_jobs.append(newest_job_dir)

    return newest_jobs
def check_job_status(
    job_id: int,
) -> str:
    """Look up the state of a SLURM job via the ``seff`` command.

    The first output line of the form "State: <STATUS> (exit code <N>)"
    determines the result.

    Args:
        job_id: The Slurm job ID.

    Raises:
        ValueError: No state line was found in the ``seff`` output.

    Returns:
        A string indicating the job status.
    """
    seff_cmd = ["/usr/bin/env", "seff", str(job_id)]
    report = subprocess.check_output(seff_cmd, encoding="utf-8")

    state_re = re.compile(r"^State: (?P<status>\w+) \(exit code \d*\)$")
    attempts = (state_re.match(line) for line in report.splitlines())
    state_match = next((found for found in attempts if found), None)

    if state_match is None:
        msg = (
            f"Unable to determine the status of job: {job_id}. Please verify "
            "that this is a valid SLURM job ID"
        )
        raise ValueError(msg)

    return state_match.group("status")
def find_finished_jobs(path: pathlib.Path | None = None) -> list[pathlib.Path]:
    """Find the directories and subdirectories containing finished jobs.

    These jobs may have terminated due to errors, but they are no longer
    running. Only the most recently submitted job of each calculation is
    considered, and calculations containing jobs that have not been run are
    left out.

    Args:
        path: The directory in which to search. Defaults to None (in which
            case the current working directory is searched).

    Returns:
        A list of Paths pointing to directories containing jobs that have
        finished.
    """
    # Lower-cased Slurm job states describing a job that is still queued or
    # executing. "idle" is not a Slurm job state but remains excluded, as it
    # always has been, for backward compatibility.
    unfinished_states = frozenset(
        {
            "idle",
            "pending",
            "running",
            "suspended",
            "requeued",
            "resizing",
            "configuring",
            "completing",
        }
    )

    last_submitted = find_last_submitted_jobs(
        path=path, ignore_unrun_jobs=True
    )
    finished_jobs: list[pathlib.Path] = []

    for job_dir in last_submitted:
        job_id = get_slurm_job_id(job_dir=job_dir)
        status = check_job_status(job_id=job_id)
        if status.lower() not in unfinished_states:
            finished_jobs.append(job_dir)

    return finished_jobs
def_find_template_dir(pattern:re.Pattern,path:pathlib.Path|None=None)->list[pathlib.Path]:"""Returns list of directories. Note that if the supplied path matches the specified pattern, its subdirectories are not searched. Args: path: The starting directory for the search. pattern: A regular expression to match with directory names. Returns: The list of directories matching pattern. """ifpathisNone:path=pathlib.Path.cwd()ifpattern.fullmatch(path.name):return[path]dirs:list[pathlib.Path]=[]forsub_pathinpath.iterdir():ifnotsub_path.is_dir():continueifpattern.fullmatch(sub_path.name):dirs.append(sub_path)else:dirs.extend(_find_template_dir(pattern,sub_path))returndirs
def get_loader() -> jinja2.BaseLoader:
    """Return the Jinja template loader.

    Templates are loaded from ``SETTINGS.TEMPLATE_DIR`` when that setting
    is truthy; otherwise they are loaded from the top-level package that
    contains this module.
    """
    template_dir = SETTINGS.TEMPLATE_DIR
    if template_dir:
        return jinja2.FileSystemLoader(template_dir)

    # The first dotted component of this module's name is the root package.
    root_package = __name__.split(".", maxsplit=1)[0]
    return jinja2.PackageLoader(root_package)