PageRenderTime 65ms CodeModel.GetById 44ms app.highlight 17ms RepoModel.GetById 1ms app.codeStats 0ms

/ruffus/test/test_N_x_M_and_collate.py

https://code.google.com/p/ruffus/
Python | 417 lines | 171 code | 96 blank | 150 comment | 34 complexity | 01731cd036f1bbb7462e8e2e868a98b6 MD5 | raw file
  1#!/usr/bin/env python
  2"""
  3
  4    test_N_x_M_and_collate.py
  5
  6
  7        This script takes N pairs of input file pairs
  8                                (with the suffices .gene and .gwas)
  9                          and runs them against M sets of simulation data
 10                                (with the suffix .simulation)
 11        A summary per input file pair is then produced
 12
 13
 14    In pseudo-code:
 15
 16        STEP_1:
 17
 18        for n_file in NNN_pairs_of_input_files:
 19            for m_file in MMM_simulation_data:
 20
 21                [n_file.gene,
 22                 n_file.gwas,
 23                 m_file.simulation] -> n_file.m_file.simulation_res
 24
 25
 26        STEP_2:
 27
 28        for n_file in NNN_pairs_of_input_files:
 29
 30            n_file.*.simulation_res -> n_file.mean
 31
 32
 33        n = CNT_GENE_GWAS_FILES
 34        m = CNT_SIMULATION_FILES
 35
 36
 37
 38"""
 39
 40CNT_GENE_GWAS_FILES     = 2
 41CNT_SIMULATION_FILES    = 3
 42
 43
 44
 45import os, sys
 46exe_path = os.path.split(os.path.abspath(sys.argv[0]))[0]
 47sys.path.insert(0, os.path.abspath(os.path.join(exe_path,"..", "..")))
 48
 49
 50
 51from ruffus import *
 52import random
 53from itertools import izip
 54
 55
 56#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
 57
 58#   options
 59
 60
 61#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
 62
 63from optparse import OptionParser
 64parser = OptionParser(version="%prog 1.0")
 65parser.add_option("-D", "--debug", dest = "debug",
 66                  action="store_true", default=False,
 67                  help="Run as unit test with default values.")
 68parser.add_option("-k", "--keep", dest = "keep",
 69                  action="store_true", default=False,
 70                  help="Do not cleanup after unit test runs.")
 71parser.add_option("-t", "--target_tasks", dest="target_tasks",
 72                  action="append",
 73                  default = ["statistical_summary"],
 74                  metavar="JOBNAME",
 75                  type="string",
 76                  help="Target task(s) of pipeline.")
 77parser.add_option("-f", "--forced_tasks", dest="forced_tasks",
 78                  action="append",
 79                  default = list(),
 80                  metavar="JOBNAME",
 81                  type="string",
 82                  help="Pipeline task(s) which will be included even if they are up to date.")
 83parser.add_option("-j", "--jobs", dest="jobs",
 84                  default=5,
 85                  metavar="jobs",
 86                  type="int",
 87                  help="Specifies the number of jobs (commands) to run simultaneously.")
 88
 89parser.add_option("-g", "--gene_data_dir", dest="gene_data_dir",
 90                  default="%s/temp_gene_data_for_intermediate_example" % exe_path,
 91                  metavar="PATH",
 92                  type="string",
 93                  help="Directory with gene data [*.genes / *.gwas].")
 94parser.add_option("-s", "--simulation_data_dir", dest="simulation_data_dir",
 95                  default="%s/temp_simulation_data_for_intermediate_example" % exe_path,
 96                  metavar="PATH",
 97                  type="string",
 98                  help="Directory with simulation data [*.simulation].")
 99parser.add_option("-w", "--working_dir", dest="working_dir",
100                  default="%s/working_dir_for_intermediate_example" % exe_path,
101                  metavar="PATH",
102                  type="string",
103                  help="Working directory.")
104
105
106parser.add_option("-v", "--verbose", dest = "verbose",
107                  action="count", default=0,
108                  help="Print more verbose messages for each additional verbose level.")
109parser.add_option("-d", "--dependency", dest="dependency_file",
110                  metavar="FILE",
111                  type="string",
112                  help="Print a dependency graph of the pipeline that would be executed "
113                        "to FILE, but do not execute it.")
114parser.add_option("-F", "--dependency_graph_format", dest="dependency_graph_format",
115                  metavar="FORMAT",
116                  type="string",
117                  default = 'svg',
118                  help="format of dependency graph file. Can be 'ps' (PostScript), "+
119                  "'svg' 'svgz' (Structured Vector Graphics), " +
120                  "'png' 'gif' (bitmap  graphics) etc ")
121parser.add_option("-n", "--just_print", dest="just_print",
122                    action="store_true", default=False,
123                    help="Print a description of the jobs that would be executed, "
124                        "but do not execute them.")
125
126
127
128
129
130#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
131
132#   imports
133
134
135#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
136
137import StringIO
138import re
139import operator
140import sys
141from collections import defaultdict
142import glob
143#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
144
145#   Functions
146
147
148#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
149
150#_________________________________________________________________________________________
151#
152#   get gene gwas file pairs
153#
154#_________________________________________________________________________________________
155def get_gene_gwas_file_pairs(  ):
156    """
157    Helper function to get all *.gene, *.gwas from the direction specified
158        in --gene_data_dir
159
160    Returns
161        file pairs with both .gene and .gwas extensions,
162        corresponding roots (no extension) of each file
163    """
164
165
166    gene_files = glob.glob(os.path.join(options.gene_data_dir, "*.gene"))
167    gwas_files = glob.glob(os.path.join(options.gene_data_dir, "*.gwas"))
168
169    common_roots = set(map(lambda x: os.path.splitext(os.path.split(x)[1])[0], gene_files))
170    common_roots &=set(map(lambda x: os.path.splitext(os.path.split(x)[1])[0], gwas_files))
171    common_roots = list(common_roots)
172
173    p = os.path; g_dir = options.gene_data_dir
174
175    file_pairs = [[p.join(g_dir, x + ".gene"), p.join(g_dir, x + ".gwas")] for x in common_roots]
176
177    return file_pairs, common_roots
178
179#_________________________________________________________________________________________
180#
181#   get simulation files
182#
183#_________________________________________________________________________________________
184def get_simulation_files(  ):
185    """
186    Helper function to get all *.simulation from the direction specified
187        in --simulation_data_dir
188        Returns
189            file with .simulation extensions,
190            corresponding roots (no extension) of each file
191    """
192    simulation_files = glob.glob(os.path.join(options.simulation_data_dir, "*.simulation"))
193    simulation_roots =map(lambda x: os.path.splitext(os.path.split(x)[1])[0], simulation_files)
194    return simulation_files, simulation_roots
195
196
197
198#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
199
200#   Main logic
201
202
203#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
204
205
206
207
208
209# get help string
210f =StringIO.StringIO()
211parser.print_help(f)
212helpstr = f.getvalue()
213(options, remaining_args) = parser.parse_args()
214
215
216working_dir = options.working_dir
217
218
219
220
221#_________________________________________________________________________________________
222#
223#   setup_simulation_data
224#
225#_________________________________________________________________________________________
226
227#
228# mkdir: makes sure output directories exist before task
229#
230@follows(mkdir(options.gene_data_dir, options.simulation_data_dir))
231def setup_simulation_data ():
232    """
233    create simulation files
234    """
235    for i in range(CNT_GENE_GWAS_FILES):
236        open(os.path.join(options.gene_data_dir, "%03d.gene" % i), "w")
237        open(os.path.join(options.gene_data_dir, "%03d.gwas" % i), "w")
238
239    # gene files without corresponding gwas and vice versa
240    open(os.path.join(options.gene_data_dir, "orphan1.gene"), "w")
241    open(os.path.join(options.gene_data_dir, "orphan2.gwas"), "w")
242    open(os.path.join(options.gene_data_dir, "orphan3.gwas"), "w")
243
244    for i in range(CNT_SIMULATION_FILES):
245        open(os.path.join(options.simulation_data_dir, "%03d.simulation" % i), "w")
246
247
248
249
250#_________________________________________________________________________________________
251#
252#   cleanup_simulation_data
253#
254#_________________________________________________________________________________________
255def try_rmdir (d):
256    if os.path.exists(d):
257        try:
258            os.rmdir(d)
259        except OSError:
260            sys.stderr.write("Warning:\t%s is not empty and will not be removed.\n" % d)
261
262def cleanup_simulation_data ():
263    """
264    cleanup files
265    """
266    if options.verbose:
267        sys.stderr.write("Cleanup working directory and simulation files.\n")
268
269    #
270    #   cleanup gene and gwas files
271    #
272    for f in glob.glob(os.path.join(options.gene_data_dir, "*.gene")):
273        os.unlink(f)
274    for f in glob.glob(os.path.join(options.gene_data_dir, "*.gwas")):
275        os.unlink(f)
276    try_rmdir(options.gene_data_dir)
277
278    #
279    #   cleanup simulation
280    #
281    for f in glob.glob(os.path.join(options.simulation_data_dir, "*.simulation")):
282        os.unlink(f)
283    try_rmdir(options.simulation_data_dir)
284
285
286    #
287    #   cleanup working_dir
288    #
289    for f in glob.glob(os.path.join(working_dir, "simulation_results", "*.simulation_res")):
290        os.unlink(f)
291    try_rmdir(os.path.join(working_dir, "simulation_results"))
292
293    for f in glob.glob(os.path.join(working_dir, "*.mean")):
294        os.unlink(f)
295    try_rmdir(working_dir)
296
297
298#_________________________________________________________________________________________
299#
300#   Step 1:
301#
302#        for n_file in NNN_pairs_of_input_files:
303#            for m_file in MMM_simulation_data:
304#
305#                [n_file.gene,
306#                 n_file.gwas,
307#                 m_file.simulation] -> working_dir/n_file.m_file.simulation_res
308#
309#_________________________________________________________________________________________
310def generate_simulation_params ():
311    """
312    Custom function to generate
313    file names for gene/gwas simulation study
314    """
315
316    simulation_files, simulation_file_roots    = get_simulation_files()
317    gene_gwas_file_pairs, gene_gwas_file_roots =  get_gene_gwas_file_pairs()
318
319    for sim_file, sim_file_root in izip(simulation_files, simulation_file_roots):
320        for (gene, gwas), gene_file_root in izip(gene_gwas_file_pairs, gene_gwas_file_roots):
321
322            result_file = "%s.%s.simulation_res" % (gene_file_root, sim_file_root)
323            result_file_path = os.path.join(working_dir, "simulation_results", result_file)
324
325            yield [gene, gwas, sim_file], result_file_path, gene_file_root, sim_file_root, result_file
326
327#
328# mkdir: makes sure output directories exist before task
329#
330@follows(mkdir(options.working_dir, os.path.join(working_dir, "simulation_results")))
331@files(generate_simulation_params)
332def gwas_simulation(input_files, result_file_path, gene_file_root, sim_file_root, result_file):
333    """
334    Dummy calculation of gene gwas vs simulation data
335    Normally runs in parallel on a computational cluster
336    """
337    (gene_file,
338    gwas_file,
339    simulation_data_file) = input_files
340
341    simulation_res_file = open(result_file_path, "w")
342    simulation_res_file.write("%s + %s -> %s\n" % (gene_file_root, sim_file_root, result_file))
343
344
345#_________________________________________________________________________________________
346#
347#   Step 2:
348#
349#       Statistical summary per gene/gwas file pair
350#
351#        for n_file in NNN_pairs_of_input_files:
352#            working_dir/simulation_results/n.*.simulation_res
353#               -> working_dir/n.mean
354#
355#_________________________________________________________________________________________
356
357
358@collate(gwas_simulation, regex(r"simulation_results/(\d+).\d+.simulation_res"), r"\1.mean")
359@posttask(lambda : sys.stdout.write("\nOK\n"))
360def statistical_summary (result_files, summary_file):
361    """
362    Simulate statistical summary
363    """
364
365    summary_file = open(summary_file, "w")
366    for f in result_files:
367        summary_file.write(open(f).read())
368
369
370
371
372
373
374
375
376
377#888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
378#
379#   print pipeline or run pipeline
380#
381
382#
383#   Necessary to protect the "entry point" of the program under windows.
384#       see: http://docs.python.org/library/multiprocessing.html#multiprocessing-programming
385#
386if __name__ == '__main__':
387    try:
388        if options.debug:
389            if not len(options.target_tasks):
390                options.target_tasks.append([statistical_summary])
391            pipeline_run([setup_simulation_data], [setup_simulation_data], multiprocess = options.jobs, verbose = 0)
392        else:
393            if (not len(get_gene_gwas_file_pairs(  )[0]) or
394                not len (get_simulation_files(  )[0])):
395                print "Warning!!\n\n\tNo *.gene / *.gwas or *.simulation: Run --debug to create simulation files first\n\n"
396                sys.exit(1)
397
398
399        if options.just_print:
400            pipeline_printout(sys.stdout, options.target_tasks, options.forced_tasks, verbose=options.verbose)
401
402        elif options.dependency_file:
403            graph_printout (     open(options.dependency_file, "w"),
404                                 options.dependency_graph_format,
405                                 options.target_tasks,
406                                 options.forced_tasks)
407        else:
408            pipeline_run(options.target_tasks, options.forced_tasks, multiprocess = options.jobs, verbose = options.verbose)
409
410
411        if options.debug and not options.keep:
412            cleanup_simulation_data ()
413
414    except Exception, e:
415        print e.args
416        raise
417