/rlpy/Tools/run.py
Python | 259 lines | 251 code | 3 blank | 5 comment | 10 complexity | c6c843d4293a935924a9944bb7eb481b MD5 | raw file
- import joblib
- import os
- import glob
- import re
- from rlpy.Tools import __rlpy_location__
- import rlpy.Tools.condor as ct
- from time import sleep
- import cProfile
- import pstats
- import platform
- import sys
- import subprocess
- __copyright__ = "Copyright 2013, RLPy http://acl.mit.edu/RLPy"
- __credits__ = ["Alborz Geramifard", "Robert H. Klein", "Christoph Dann",
- "William Dabney", "Jonathan P. How"]
- __license__ = "BSD 3-Clause"
# Template of the stand-alone script ("main.py") that executes one experiment.
# Placeholders filled in via str.format:
#   {rlpy_location}   - directory prepended to sys.path of the generated script
#   {setting_content} - source of the settings file (without its __main__ part)
#   {variables}       - source line(s) defining the ``hyper_param`` dict
# The location is inserted with the ``!r`` conversion so that it is emitted as
# a correctly escaped Python string literal; pasting it raw between quotes
# breaks for paths containing backslashes or quotes (e.g. ``C:\temp\rlpy``).
template = """#!/usr/bin/env python
import sys
sys.path = [{rlpy_location!r}] + sys.path
{setting_content}
{variables}
if __name__ == "__main__":
    exp = make_experiment(int(sys.argv[1]), ".", **hyper_param)
    exp.log_interval = 60
    exp.run()
    exp.save()
"""
# Pieces of the HTCondor submit description file written by run_condor().
# The header is written once per submit file.
# NOTE(review): the interpreter path below is hard-coded to one specific
# installation; it has to exist on the machines that execute the jobs.
condor_submit_template_start = "\n".join([
    "",
    "executable = /data/scratch/cdann/anaconda/bin/python",
    "universe = vanilla",
    "priority = 0",
    "Notification = Never",
    'requirements = OpSys == "LINUX" && ARCH == "X86_64"',
    "getenv = True",
])
# Section appended once per job; {fn} is the script to run and {id} the
# run id, which is also used to name the per-job err/log/out files.
condor_submit_template_each_job = "\n".join([
    "",
    'arguments = "{fn} {id}"',
    "Error = condor/{id}.err",
    "Log = condor/{id}.log",
    "Output = condor/{id}.out",
    "queue 1",
    "",
])
# Shell name of the null device, used to discard the output of spawned runs:
# "nul" on Windows and Cygwin, "/dev/null" everywhere else.
devnull = "nul" if sys.platform.startswith(("win", "cygwin")) else "/dev/null"
def run_profiled(make_exp_fun, profile_location="Profiling",
                 out="Test.pdf", **kwargs):
    r"""Run an experiment under cProfile and visualize where the time goes.

    The experiment results are not stored. Three files are written to
    ``profile_location``: ``profile.dat`` (the raw profiling statistics),
    ``graph.txt`` (the call graph in dot format, produced by gprof2dot) and
    the pdf rendering of that graph (produced by Graphviz ``dot``). The five
    most time-consuming entries are also printed to the console.

    :param make_exp_fun: function that returns an Experiment instance which
        is then executed. All remaining keyword parameters are passed to
        this function.
    :param profile_location: directory used to store the profiling result
        files; it is created if it does not exist yet.
    :param out: filename of the generated pdf file.
    :param \*\*kwargs: remaining parameters passed to the experiment
        generator function
    """
    # cProfile dumps its statistics only after the experiment has finished;
    # create the target directory up front so that a complete run is not
    # lost because the output file cannot be written.
    if profile_location and not os.path.isdir(profile_location):
        os.makedirs(profile_location)

    dat_fn = os.path.join(profile_location, "profile.dat")
    pdf_fn = os.path.join(profile_location, out)
    dot_fn = os.path.join(profile_location, "graph.txt")

    exp = make_exp_fun(**kwargs)
    cProfile.runctx('exp.run()', {}, {"exp": exp}, dat_fn)
    p = pstats.Stats(dat_fn)
    p.sort_stats('time').print_stats(5)

    gprof2dot_fn = os.path.join(__rlpy_location__, 'Tools', 'gprof2dot.py')
    if platform.system() == 'Windows':
        # the script is invoked directly on Windows
        launcher = gprof2dot_fn
    else:
        launcher = '/usr/bin/env python ' + gprof2dot_fn
    # Load the stats and prepare the dot file for Graphviz
    os.system(launcher + ' -f pstats {dat_fn} > {dot_fn}'.format(
        dat_fn=dat_fn,
        dot_fn=dot_fn))

    # Call Graphviz to generate the pdf
    os.system('dot -T pdf {dot_fn} -o {pdf_fn}'.format(dot_fn=dot_fn,
                                                       pdf_fn=pdf_fn))
def get_finished_ids(path):
    """Return the sorted experiment ids whose result file exists in ``path``.

    A result file is a file named ``<digits>-results.json``. Only the file
    name is inspected, so digits occurring in directory names cannot be
    mistaken for an id, and files matching ``*-results.json`` without a
    numeric id directly in front of ``-results.json`` are skipped.

    :param path: directory to scan (a missing directory yields ``[]``)
    :return: sorted list of ids as integers
    """
    id_pattern = re.compile(r"([0-9]+)-results\.json$")
    finished = []
    for result_file in glob.glob(os.path.join(path, "*-results.json")):
        match = id_pattern.search(os.path.basename(result_file))
        if match is not None:
            finished.append(int(match.group(1)))
    return sorted(finished)
def read_setting_content(filename):
    """Return the content of a settings file up to its ``__main__`` section.

    Everything from the first line containing ``if __name__ ==`` onwards is
    dropped, i.e. the ``__main__`` block is assumed to be the last part of
    the file.

    :param filename: filename where the settings are specified
    :return: the retained source text as a single string
    """
    kept = []
    with open(filename) as handle:
        for line in handle:
            if "if __name__ ==" in line:
                break
            kept.append(line)
    return "".join(kept)
def prepare_directory(setting, path, **hyperparam):
    r"""Write the executable ``main.py`` for a setting into ``path``.

    The directory is created if it does not exist. The generated script is
    the module-level ``template`` filled with the content of the settings
    file (without its ``__main__`` section) and a ``hyper_param`` dict
    holding the given hyperparameters.

    :param setting: filename which contains a make_experiment method that
        gets the id and hyperparameters and returns an instance of
        Experiment ready to run
    :param path: specifies where to create the directory
    :param \*\*hyperparam: all hyperparameters passed to the setting's
        ``make_experiment()``
    :return: filename of the file to execute in path
    """
    # source code of the dict literal handed to make_experiment()
    assignments = ["{}={}".format(name, repr(value))
                   for name, value in hyperparam.items()]
    variables = "hyper_param = dict(" + ",\n".join(assignments) + ")"

    if not os.path.exists(path):
        os.makedirs(path)
    fn = os.path.join(path, "main.py")

    setting_content = read_setting_content(setting)
    with open(fn, "w") as script:
        script.write(template.format(setting=setting,
                                     rlpy_location=__rlpy_location__,
                                     variables=variables,
                                     setting_content=setting_content))
    return fn
def run(filename, location, ids, parallelization="sequential",
        force_rerun=False, block=True, n_jobs=-2, verbose=10, **hyperparam):
    r"""Run a file containing an RLPy experiment description in batch mode.

    The file has to provide a ``make_experiment`` function. Note that the
    ``__main__`` section of this file is ignored.

    :param filename: file to run
    :param location: directory (does not need to exist), where all outputs
        and a copy of the file to execute is stored
    :param ids: list of ids / seeds which should be executed
    :param parallelization: either **sequential** (running the experiment on
        one core for each seed in sequence), **joblib** (run using multiple
        cores in parallel, no console output of the individual runs)
        or **condor** (submit jobs to a HTCondor job scheduling system)
    :param force_rerun: if False, seeds for which the results exist are not
        executed
    :param block: if True, the function returns when all jobs are done
        (only passed on to the condor mode)
    :param n_jobs: if parallelized with joblib, this specifies the number of
        cores to use; -1 means all cores, -2 means all but one core
    :param verbose: controls the amount of outputs
    :param \*\*hyperparam: hyperparameter values which are passed to
        make_experiment as keyword arguments.
    :raises ValueError: if ``parallelization`` is not one of the supported
        modes
    """
    # Fail before touching the file system: an unknown mode used to be
    # ignored silently, so no experiment was run at all.
    if parallelization not in ("sequential", "joblib", "condor"):
        raise ValueError(
            "unknown parallelization {!r}; expected 'sequential', "
            "'joblib' or 'condor'".format(parallelization))

    fn = prepare_directory(filename, location, **hyperparam)

    # filter ids if necessary
    if not force_rerun:
        finished_ids = set(get_finished_ids(location))
        ids = [idtmp for idtmp in ids if idtmp not in finished_ids]
    if len(ids) == 0:
        return

    # spawn jobs
    if parallelization == "condor":
        run_condor(fn, ids, force_rerun=force_rerun, block=block,
                   verbose=verbose)
    elif parallelization == "joblib":
        run_joblib(fn, ids, n_jobs=n_jobs, verbose=verbose)
    else:
        # sequential execution is joblib restricted to a single worker
        run_joblib(fn, ids, n_jobs=1, verbose=verbose)
def _run_helper(fn, job_id, verbose):
    """Execute the script ``fn`` for one job in a subprocess and wait for it.

    The script is started with ``python`` through the shell from within its
    own directory and receives ``job_id + 1`` as its only argument.
    NOTE(review): the ``+ 1`` offset is applied only here; the condor submit
    template passes the id unchanged - confirm that this is intended.

    :param fn: path of the script to execute
    :param job_id: integer id of the job
    :param verbose: for values below 15 the standard output of the script is
        redirected to the null device
    :return: exit status of the subprocess, as reported by ``Popen.wait()``
    """
    out = "" if verbose >= 15 else "> " + devnull
    path, filen = os.path.split(fn)
    command = "python {} {} {}".format(filen, job_id + 1, out)
    # hand the status back so that callers collecting results (run_joblib)
    # get real exit codes instead of a list of None
    return subprocess.Popen(command, shell=True, cwd=path).wait()
def run_joblib(fn, ids, n_jobs=-2, verbose=10):
    """Execute the script ``fn`` once per id using a joblib worker pool.

    :param fn: path of the script to execute
    :param ids: iterable of job ids, each one is handed to ``_run_helper``
    :param n_jobs: number of workers, passed on to ``joblib.Parallel``
    :param verbose: verbosity, passed on to ``joblib.Parallel`` and to
        ``_run_helper``
    :return: list with the value returned by ``_run_helper`` for each id
    """
    launch = joblib.delayed(_run_helper)
    pool = joblib.Parallel(n_jobs=n_jobs, verbose=verbose)
    return pool(launch(fn, job_id, verbose) for job_id in ids)
def run_condor(fn, ids,
               force_rerun=False, block=False, verbose=10, poll_duration=30):
    """Submit one HTCondor job per id that executes the script ``fn``.

    A ``condor`` subdirectory is created next to ``fn``; it receives the
    generated submit file and the err/log/out files of the jobs.

    :param fn: path of the script to execute
    :param ids: ids of the runs to submit; the passed list is not modified
    :param force_rerun: if False, ids for which the current user already has
        a submitted job in the same directory are not submitted again
    :param block: if True, wait until the result files of all ids submitted
        by this call exist.
        WARNING: this does not recognize killed or dead jobs and would wait
        infinitely long
    :param verbose: any true value prints status messages; values above 100
        additionally print the progress while waiting
    :param poll_duration: seconds to sleep between two checks for finished
        jobs in blocking mode
    """
    # create condor subdirectory
    job_dir = os.path.dirname(fn)
    script_name = os.path.basename(fn)
    outdir = os.path.join(job_dir, "condor")
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # work on a copy: ids are removed below and the caller's list must not
    # change as a side effect
    ids = list(ids)

    # exclude running jobs from ids
    if not force_rerun:
        own_dir = os.path.abspath(job_dir)
        for job in ct.submitted_jobs_user():
            if os.path.abspath(job["directory"]) != own_dir:
                continue
            if job["run_id"] in ids:
                ids.remove(job["run_id"])
                if verbose:
                    print("Jobs #{} already submitted".format(job["run_id"]))

    if len(ids) > 0:
        # write submit file
        with open(os.path.join(outdir, "submit"), "w") as f:
            f.write(condor_submit_template_start)
            for exp_id in ids:
                # the placeholder in the per-job template is named ``id``
                f.write(condor_submit_template_each_job.format(
                    fn=script_name, id=exp_id))
        exit_code = os.system(
            "cd {dir} && condor_submit condor/submit".format(dir=job_dir))
        if verbose:
            print("Jobs submitted with exit code {}".format(exit_code))
    else:
        if verbose:
            print("All jobs have been already submitted")

    # if blocking mode is enabled, wait until all result files are there
    # WARNING: this does not recognize killed or dead jobs and would wait
    # infinitely long
    if block and ids:
        # compare sets so that duplicate ids cannot keep the loop alive
        pending = set(ids)
        while True:
            sleep(poll_duration)
            finished = pending.intersection(get_finished_ids(job_dir))
            if verbose > 100:
                print("{} of {} jobs finished".format(
                    len(finished), len(pending)))
            if len(finished) == len(pending):
                return