
/rlpy/Tools/run.py

https://bitbucket.org/okkhoy/rlpy
import joblib
import os
import glob
import re
from rlpy.Tools import __rlpy_location__
import rlpy.Tools.condor as ct
from time import sleep
import cProfile
import pstats
import platform
import sys
import subprocess

__copyright__ = "Copyright 2013, RLPy http://acl.mit.edu/RLPy"
__credits__ = ["Alborz Geramifard", "Robert H. Klein", "Christoph Dann",
               "William Dabney", "Jonathan P. How"]
__license__ = "BSD 3-Clause"

# template for the executable file used to run experiments
template = """#!/usr/bin/env python
import sys
sys.path = ["{rlpy_location}"] + sys.path
{setting_content}
{variables}
if __name__ == "__main__":
    exp = make_experiment(int(sys.argv[1]), ".", **hyper_param)
    exp.log_interval = 60
    exp.run()
    exp.save()
"""
# templates for submission files of the HTCondor queueing system
condor_submit_template_start = """
executable = /data/scratch/cdann/anaconda/bin/python
universe = vanilla
priority = 0
Notification = Never
requirements = OpSys == "LINUX" && ARCH == "X86_64"
getenv = True"""
condor_submit_template_each_job = """
arguments = "{fn} {id}"
Error = condor/{id}.err
Log = condor/{id}.log
Output = condor/{id}.out
queue 1
"""
if sys.platform.startswith("win") or sys.platform.startswith("cygwin"):
    devnull = "nul"
else:
    devnull = "/dev/null"
def run_profiled(make_exp_fun, profile_location="Profiling",
                 out="Test.pdf", **kwargs):
    """runs an experiment (without storing its results) and profiles the
    execution. A gprof file is created, along with a pdf visualizing the
    most time-consuming functions of the experiment execution.

    :param make_exp_fun: function that returns an Experiment instance
        which is then executed. All remaining keyword parameters are
        passed to this function.
    :param profile_location: directory used to store the profiling result files.
    :param out: filename of the generated pdf file.
    :param \*\*kwargs: remaining parameters passed to the experiment generator function
    """
    dat_fn = os.path.join(profile_location, "profile.dat")
    pdf_fn = os.path.join(profile_location, out)
    dot_fn = os.path.join(profile_location, "graph.txt")
    exp = make_exp_fun(**kwargs)
    cProfile.runctx('exp.run()', {}, {"exp": exp}, dat_fn)
    p = pstats.Stats(dat_fn)
    p.sort_stats('time').print_stats(5)
    gprof2dot_fn = os.path.join(__rlpy_location__, 'Tools', 'gprof2dot.py')
    if platform.system() == 'Windows':
        # load the stats and prepare the dot file for graphviz
        command = gprof2dot_fn + ' -f pstats {dat_fn} > {dot_fn}'.format(
            dat_fn=dat_fn, dot_fn=dot_fn)
        os.system(command)
    else:
        # load the stats and prepare the dot file for graphviz
        command = '/usr/bin/env python ' + gprof2dot_fn \
            + ' -f pstats {dat_fn} > {dot_fn}'.format(
                dat_fn=dat_fn, dot_fn=dot_fn)
        os.system(command)
    # call graphviz to generate the pdf
    command = 'dot -T pdf {dot_fn} -o {pdf_fn}'.format(dot_fn=dot_fn,
                                                       pdf_fn=pdf_fn)
    os.system(command)
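# usage sketch (assumes a hypothetical settings module, here
# examples.gridworld, that exposes a make_experiment() function; also
# requires gprof2dot.py and the graphviz "dot" binary to be available):
#
#   from examples.gridworld import make_experiment
#   run_profiled(make_experiment, profile_location="Profiling",
#                out="gridworld.pdf")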
def get_finished_ids(path):
    """returns all experiment ids for which the result file exists in
    the given directory"""
    return sorted([int(re.findall("([0-9]*)-results.json", p)[0])
                   for p in glob.glob(os.path.join(path, "*-results.json"))])
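# e.g. a directory containing "1-results.json" and "3-results.json"
# (plus unrelated files) yields [1, 3]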
def read_setting_content(filename):
    """reads the file content without the __main__ section

    :param filename: filename where the settings are specified"""
    setting_content = ""
    with open(filename) as f:
        for line in f:
            if "if __name__ ==" in line:
                # beware: we assume that the __main__ execution block is
                # the last one in the file
                break
            setting_content += line
    return setting_content
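# e.g. for a settings file whose last block is
#
#   if __name__ == "__main__":
#       run_experiment(make_experiment())
#
# everything above the "if __name__ ==" line is returned and the block
# itself is dropped (run_experiment here is purely illustrative)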
def prepare_directory(setting, path, **hyperparam):
    """
    Creates a directory in path with a file for executing a given
    setting. The function returns the executable python script file.

    :param setting: filename which contains a make_experiment method that
        gets an id and hyperparameters and returns an instance of
        Experiment ready to run
    :param path: specifies where to create the directory
    :param \*\*hyperparam: all hyperparameters passed to the setting's
        ``make_experiment()``
    :return: filename of the file to execute in path
    """
    # create file to execute
    variables = "hyper_param = dict(" + ",\n".join(["{}={}".format(k, repr(v))
                                                    for k, v in hyperparam.items()]) + ")"
    final_path = path
    if not os.path.exists(final_path):
        os.makedirs(final_path)
    fn = os.path.join(final_path, "main.py")
    setting_content = read_setting_content(setting)
    with open(fn, "w") as f:
        f.write(template.format(setting=setting,
                                rlpy_location=__rlpy_location__,
                                variables=variables,
                                setting_content=setting_content))
    return fn
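# usage sketch (hypothetical settings file and hyperparameter):
#
#   fn = prepare_directory("examples/gridworld.py", "./Results/Test",
#                          lambda_=0.9)
#
# fn is then "./Results/Test/main.py", runnable as "python main.py <seed>"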
def run(filename, location, ids, parallelization="sequential",
        force_rerun=False, block=True, n_jobs=-2, verbose=10, **hyperparam):
    """
    Runs a file containing an RLPy experiment description (a
    make_experiment function) in batch mode. Note that the __main__
    section of this file is ignored.

    :param filename: file to run
    :param location: directory (does not need to exist) where all outputs
        and a copy of the file to execute are stored
    :param ids: list of ids / seeds which should be executed
    :param parallelization: either **sequential** (run the experiment on
        one core for each seed in sequence), **joblib** (run using
        multiple cores in parallel, no console output of the individual
        runs) or **condor** (submit jobs to an HTCondor job scheduling
        system)
    :param force_rerun: if False, seeds for which results already exist
        are not executed
    :param block: if True, the function returns only when all jobs are done
    :param n_jobs: if parallelized with joblib, the number of cores to
        use; -1 means all cores, -2 means all but one
    :param verbose: controls the amount of output
    :param \*\*hyperparam: hyperparameter values which are passed to
        make_experiment as keyword arguments.
    """
    setting = filename
    fn = prepare_directory(setting, location, **hyperparam)

    # filter ids if necessary
    if not force_rerun:
        finished_ids = get_finished_ids(location)
        ids = [idtmp for idtmp in ids if idtmp not in finished_ids]

    if len(ids):
        # spawn jobs
        if parallelization == "joblib":
            run_joblib(fn, ids, n_jobs=n_jobs, verbose=verbose)
        elif parallelization == "condor":
            run_condor(fn, ids, force_rerun=force_rerun, block=block)
        elif parallelization == "sequential":
            run_joblib(fn, ids, n_jobs=1, verbose=verbose)
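# usage sketch (hypothetical settings file; lambda_ stands in for any
# hyperparameter accepted by its make_experiment):
#
#   run("examples/gridworld.py", "./Results/Test", ids=range(1, 6),
#       parallelization="joblib", n_jobs=-2, lambda_=0.9)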
def _run_helper(fn, job_id, verbose):
    # run the generated script in its own directory; suppress console
    # output unless verbosity is high
    if verbose >= 15:
        out = ""
    else:
        out = "> " + devnull
    path, filen = os.path.split(fn)
    subprocess.Popen(
        "python {} {} {}".format(filen, job_id + 1, out),
        shell=True, cwd=path).wait()
def run_joblib(fn, ids, n_jobs=-2, verbose=10):
    # execute one process per id, parallelized over n_jobs cores
    jobs = (joblib.delayed(_run_helper)(fn, i, verbose) for i in ids)
    exit_codes = joblib.Parallel(n_jobs=n_jobs, verbose=verbose)(jobs)
    return exit_codes
def run_condor(fn, ids,
               force_rerun=False, block=False, verbose=10, poll_duration=30):
    # create condor subdirectory
    dir = os.path.dirname(fn)
    fn = os.path.basename(fn)
    outdir = os.path.join(dir, "condor")
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # exclude already running jobs from ids
    if not force_rerun:
        submitted_jobs = ct.submitted_jobs_user()
        for job in submitted_jobs:
            if os.path.abspath(job["directory"]) == os.path.abspath(dir):
                if job["run_id"] in ids:
                    ids.remove(job["run_id"])
                    if verbose:
                        print("Job #{} already submitted".format(job["run_id"]))

    if len(ids) > 0:
        # write submit file
        with open(os.path.join(outdir, "submit"), "w") as f:
            f.write(condor_submit_template_start)
            for exp_id in ids:
                f.write(condor_submit_template_each_job.format(fn=fn, id=exp_id))
        exit_code = os.system(
            "cd {dir} && condor_submit condor/submit".format(dir=dir))
        if verbose:
            print("Jobs submitted with exit code {}".format(exit_code))
    else:
        if verbose:
            print("All jobs have already been submitted")

    # if blocking mode is enabled, wait until all result files are there
    # WARNING: this does not recognize killed or dead jobs and would wait
    # indefinitely
    if block:
        while True:
            sleep(poll_duration)
            finished_ids = set(get_finished_ids(dir))
            finished_ids &= set(ids)
            if verbose > 100:
                print("{} of {} jobs finished".format(len(finished_ids), len(ids)))
            if len(finished_ids) == len(ids):
                return
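# usage sketch for an HTCondor pool (hypothetical settings file);
# block=True makes the call poll for result files until every seed in
# ids has finished:
#
#   run("examples/gridworld.py", "./Results/Test", ids=range(1, 31),
#       parallelization="condor", block=True)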