/ruffus/test/test_N_x_M_and_collate.py

https://code.google.com/p/ruffus/ · Python · 417 lines · 171 code · 96 blank · 150 comment · 27 complexity · 01731cd036f1bbb7462e8e2e868a98b6 MD5 · raw file

  1. #!/usr/bin/env python
  2. """
  3. test_N_x_M_and_collate.py
  4. This script takes N pairs of input file pairs
  5. (with the suffices .gene and .gwas)
  6. and runs them against M sets of simulation data
  7. (with the suffix .simulation)
  8. A summary per input file pair is then produced
  9. In pseudo-code:
  10. STEP_1:
  11. for n_file in NNN_pairs_of_input_files:
  12. for m_file in MMM_simulation_data:
  13. [n_file.gene,
  14. n_file.gwas,
  15. m_file.simulation] -> n_file.m_file.simulation_res
  16. STEP_2:
  17. for n_file in NNN_pairs_of_input_files:
  18. n_file.*.simulation_res -> n_file.mean
  19. n = CNT_GENE_GWAS_FILES
  20. m = CNT_SIMULATION_FILES
  21. """
  22. CNT_GENE_GWAS_FILES = 2
  23. CNT_SIMULATION_FILES = 3
  24. import os, sys
  25. exe_path = os.path.split(os.path.abspath(sys.argv[0]))[0]
  26. sys.path.insert(0, os.path.abspath(os.path.join(exe_path,"..", "..")))
  27. from ruffus import *
  28. import random
  29. from itertools import izip
  30. #88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
  31. # options
  32. #88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
  33. from optparse import OptionParser
  34. parser = OptionParser(version="%prog 1.0")
  35. parser.add_option("-D", "--debug", dest = "debug",
  36. action="store_true", default=False,
  37. help="Run as unit test with default values.")
  38. parser.add_option("-k", "--keep", dest = "keep",
  39. action="store_true", default=False,
  40. help="Do not cleanup after unit test runs.")
  41. parser.add_option("-t", "--target_tasks", dest="target_tasks",
  42. action="append",
  43. default = ["statistical_summary"],
  44. metavar="JOBNAME",
  45. type="string",
  46. help="Target task(s) of pipeline.")
  47. parser.add_option("-f", "--forced_tasks", dest="forced_tasks",
  48. action="append",
  49. default = list(),
  50. metavar="JOBNAME",
  51. type="string",
  52. help="Pipeline task(s) which will be included even if they are up to date.")
  53. parser.add_option("-j", "--jobs", dest="jobs",
  54. default=5,
  55. metavar="jobs",
  56. type="int",
  57. help="Specifies the number of jobs (commands) to run simultaneously.")
  58. parser.add_option("-g", "--gene_data_dir", dest="gene_data_dir",
  59. default="%s/temp_gene_data_for_intermediate_example" % exe_path,
  60. metavar="PATH",
  61. type="string",
  62. help="Directory with gene data [*.genes / *.gwas].")
  63. parser.add_option("-s", "--simulation_data_dir", dest="simulation_data_dir",
  64. default="%s/temp_simulation_data_for_intermediate_example" % exe_path,
  65. metavar="PATH",
  66. type="string",
  67. help="Directory with simulation data [*.simulation].")
  68. parser.add_option("-w", "--working_dir", dest="working_dir",
  69. default="%s/working_dir_for_intermediate_example" % exe_path,
  70. metavar="PATH",
  71. type="string",
  72. help="Working directory.")
  73. parser.add_option("-v", "--verbose", dest = "verbose",
  74. action="count", default=0,
  75. help="Print more verbose messages for each additional verbose level.")
  76. parser.add_option("-d", "--dependency", dest="dependency_file",
  77. metavar="FILE",
  78. type="string",
  79. help="Print a dependency graph of the pipeline that would be executed "
  80. "to FILE, but do not execute it.")
  81. parser.add_option("-F", "--dependency_graph_format", dest="dependency_graph_format",
  82. metavar="FORMAT",
  83. type="string",
  84. default = 'svg',
  85. help="format of dependency graph file. Can be 'ps' (PostScript), "+
  86. "'svg' 'svgz' (Structured Vector Graphics), " +
  87. "'png' 'gif' (bitmap graphics) etc ")
  88. parser.add_option("-n", "--just_print", dest="just_print",
  89. action="store_true", default=False,
  90. help="Print a description of the jobs that would be executed, "
  91. "but do not execute them.")
  92. #88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
  93. # imports
  94. #88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
  95. import StringIO
  96. import re
  97. import operator
  98. import sys
  99. from collections import defaultdict
  100. import glob
  101. #88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
  102. # Functions
  103. #88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
  104. #_________________________________________________________________________________________
  105. #
  106. # get gene gwas file pairs
  107. #
  108. #_________________________________________________________________________________________
  109. def get_gene_gwas_file_pairs( ):
  110. """
  111. Helper function to get all *.gene, *.gwas from the direction specified
  112. in --gene_data_dir
  113. Returns
  114. file pairs with both .gene and .gwas extensions,
  115. corresponding roots (no extension) of each file
  116. """
  117. gene_files = glob.glob(os.path.join(options.gene_data_dir, "*.gene"))
  118. gwas_files = glob.glob(os.path.join(options.gene_data_dir, "*.gwas"))
  119. common_roots = set(map(lambda x: os.path.splitext(os.path.split(x)[1])[0], gene_files))
  120. common_roots &=set(map(lambda x: os.path.splitext(os.path.split(x)[1])[0], gwas_files))
  121. common_roots = list(common_roots)
  122. p = os.path; g_dir = options.gene_data_dir
  123. file_pairs = [[p.join(g_dir, x + ".gene"), p.join(g_dir, x + ".gwas")] for x in common_roots]
  124. return file_pairs, common_roots
  125. #_________________________________________________________________________________________
  126. #
  127. # get simulation files
  128. #
  129. #_________________________________________________________________________________________
  130. def get_simulation_files( ):
  131. """
  132. Helper function to get all *.simulation from the direction specified
  133. in --simulation_data_dir
  134. Returns
  135. file with .simulation extensions,
  136. corresponding roots (no extension) of each file
  137. """
  138. simulation_files = glob.glob(os.path.join(options.simulation_data_dir, "*.simulation"))
  139. simulation_roots =map(lambda x: os.path.splitext(os.path.split(x)[1])[0], simulation_files)
  140. return simulation_files, simulation_roots
  141. #88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
  142. # Main logic
  143. #88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
  144. # get help string
  145. f =StringIO.StringIO()
  146. parser.print_help(f)
  147. helpstr = f.getvalue()
  148. (options, remaining_args) = parser.parse_args()
  149. working_dir = options.working_dir
  150. #_________________________________________________________________________________________
  151. #
  152. # setup_simulation_data
  153. #
  154. #_________________________________________________________________________________________
  155. #
  156. # mkdir: makes sure output directories exist before task
  157. #
  158. @follows(mkdir(options.gene_data_dir, options.simulation_data_dir))
  159. def setup_simulation_data ():
  160. """
  161. create simulation files
  162. """
  163. for i in range(CNT_GENE_GWAS_FILES):
  164. open(os.path.join(options.gene_data_dir, "%03d.gene" % i), "w")
  165. open(os.path.join(options.gene_data_dir, "%03d.gwas" % i), "w")
  166. # gene files without corresponding gwas and vice versa
  167. open(os.path.join(options.gene_data_dir, "orphan1.gene"), "w")
  168. open(os.path.join(options.gene_data_dir, "orphan2.gwas"), "w")
  169. open(os.path.join(options.gene_data_dir, "orphan3.gwas"), "w")
  170. for i in range(CNT_SIMULATION_FILES):
  171. open(os.path.join(options.simulation_data_dir, "%03d.simulation" % i), "w")
  172. #_________________________________________________________________________________________
  173. #
  174. # cleanup_simulation_data
  175. #
  176. #_________________________________________________________________________________________
  177. def try_rmdir (d):
  178. if os.path.exists(d):
  179. try:
  180. os.rmdir(d)
  181. except OSError:
  182. sys.stderr.write("Warning:\t%s is not empty and will not be removed.\n" % d)
  183. def cleanup_simulation_data ():
  184. """
  185. cleanup files
  186. """
  187. if options.verbose:
  188. sys.stderr.write("Cleanup working directory and simulation files.\n")
  189. #
  190. # cleanup gene and gwas files
  191. #
  192. for f in glob.glob(os.path.join(options.gene_data_dir, "*.gene")):
  193. os.unlink(f)
  194. for f in glob.glob(os.path.join(options.gene_data_dir, "*.gwas")):
  195. os.unlink(f)
  196. try_rmdir(options.gene_data_dir)
  197. #
  198. # cleanup simulation
  199. #
  200. for f in glob.glob(os.path.join(options.simulation_data_dir, "*.simulation")):
  201. os.unlink(f)
  202. try_rmdir(options.simulation_data_dir)
  203. #
  204. # cleanup working_dir
  205. #
  206. for f in glob.glob(os.path.join(working_dir, "simulation_results", "*.simulation_res")):
  207. os.unlink(f)
  208. try_rmdir(os.path.join(working_dir, "simulation_results"))
  209. for f in glob.glob(os.path.join(working_dir, "*.mean")):
  210. os.unlink(f)
  211. try_rmdir(working_dir)
  212. #_________________________________________________________________________________________
  213. #
  214. # Step 1:
  215. #
  216. # for n_file in NNN_pairs_of_input_files:
  217. # for m_file in MMM_simulation_data:
  218. #
  219. # [n_file.gene,
  220. # n_file.gwas,
  221. # m_file.simulation] -> working_dir/n_file.m_file.simulation_res
  222. #
  223. #_________________________________________________________________________________________
  224. def generate_simulation_params ():
  225. """
  226. Custom function to generate
  227. file names for gene/gwas simulation study
  228. """
  229. simulation_files, simulation_file_roots = get_simulation_files()
  230. gene_gwas_file_pairs, gene_gwas_file_roots = get_gene_gwas_file_pairs()
  231. for sim_file, sim_file_root in izip(simulation_files, simulation_file_roots):
  232. for (gene, gwas), gene_file_root in izip(gene_gwas_file_pairs, gene_gwas_file_roots):
  233. result_file = "%s.%s.simulation_res" % (gene_file_root, sim_file_root)
  234. result_file_path = os.path.join(working_dir, "simulation_results", result_file)
  235. yield [gene, gwas, sim_file], result_file_path, gene_file_root, sim_file_root, result_file
  236. #
  237. # mkdir: makes sure output directories exist before task
  238. #
  239. @follows(mkdir(options.working_dir, os.path.join(working_dir, "simulation_results")))
  240. @files(generate_simulation_params)
  241. def gwas_simulation(input_files, result_file_path, gene_file_root, sim_file_root, result_file):
  242. """
  243. Dummy calculation of gene gwas vs simulation data
  244. Normally runs in parallel on a computational cluster
  245. """
  246. (gene_file,
  247. gwas_file,
  248. simulation_data_file) = input_files
  249. simulation_res_file = open(result_file_path, "w")
  250. simulation_res_file.write("%s + %s -> %s\n" % (gene_file_root, sim_file_root, result_file))
  251. #_________________________________________________________________________________________
  252. #
  253. # Step 2:
  254. #
  255. # Statistical summary per gene/gwas file pair
  256. #
  257. # for n_file in NNN_pairs_of_input_files:
  258. # working_dir/simulation_results/n.*.simulation_res
  259. # -> working_dir/n.mean
  260. #
  261. #_________________________________________________________________________________________
  262. @collate(gwas_simulation, regex(r"simulation_results/(\d+).\d+.simulation_res"), r"\1.mean")
  263. @posttask(lambda : sys.stdout.write("\nOK\n"))
  264. def statistical_summary (result_files, summary_file):
  265. """
  266. Simulate statistical summary
  267. """
  268. summary_file = open(summary_file, "w")
  269. for f in result_files:
  270. summary_file.write(open(f).read())
  271. #888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
  272. #
  273. # print pipeline or run pipeline
  274. #
  275. #
  276. # Necessary to protect the "entry point" of the program under windows.
  277. # see: http://docs.python.org/library/multiprocessing.html#multiprocessing-programming
  278. #
  279. if __name__ == '__main__':
  280. try:
  281. if options.debug:
  282. if not len(options.target_tasks):
  283. options.target_tasks.append([statistical_summary])
  284. pipeline_run([setup_simulation_data], [setup_simulation_data], multiprocess = options.jobs, verbose = 0)
  285. else:
  286. if (not len(get_gene_gwas_file_pairs( )[0]) or
  287. not len (get_simulation_files( )[0])):
  288. print "Warning!!\n\n\tNo *.gene / *.gwas or *.simulation: Run --debug to create simulation files first\n\n"
  289. sys.exit(1)
  290. if options.just_print:
  291. pipeline_printout(sys.stdout, options.target_tasks, options.forced_tasks, verbose=options.verbose)
  292. elif options.dependency_file:
  293. graph_printout ( open(options.dependency_file, "w"),
  294. options.dependency_graph_format,
  295. options.target_tasks,
  296. options.forced_tasks)
  297. else:
  298. pipeline_run(options.target_tasks, options.forced_tasks, multiprocess = options.jobs, verbose = options.verbose)
  299. if options.debug and not options.keep:
  300. cleanup_simulation_data ()
  301. except Exception, e:
  302. print e.args
  303. raise