PageRenderTime 39ms CodeModel.GetById 2ms app.highlight 31ms RepoModel.GetById 2ms app.codeStats 0ms

/tools/new_operations/gops_cluster.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 132 lines | 102 code | 12 blank | 18 comment | 41 complexity | 4c056b62a5b4d01487f1260d90d3ba50 MD5 | raw file
  1#!/usr/bin/env python
  2"""
  3Cluster regions of intervals.
  4
  5usage: %prog in_file out_file
  6    -1, --cols1=N,N,N,N: Columns for start, end, strand in file
  7    -d, --distance=N: Maximum distance between clustered intervals
  8    -v, --overlap=N: Minimum overlap require (negative distance)
  9    -m, --minregions=N: Minimum regions per cluster
 10    -o, --output=N: 1)merged 2)filtered 3)clustered 4) minimum 5) maximum
 11"""
 12from galaxy import eggs
 13import pkg_resources
 14pkg_resources.require( "bx-python" )
 15import sys, traceback, fileinput
 16from warnings import warn
 17from bx.intervals import *
 18from bx.intervals.io import *
 19from bx.intervals.operations.find_clusters import *
 20from bx.cookbook import doc_optparse
 21from galaxy.tools.util.galaxyops import *
 22
 23assert sys.version_info[:2] >= ( 2, 4 )
 24
 25def main():
 26    distance = 0
 27    minregions = 2
 28    output = 1
 29    upstream_pad = 0
 30    downstream_pad = 0
 31
 32    options, args = doc_optparse.parse( __doc__ )
 33    try:
 34        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
 35        if options.distance: distance = int( options.distance )
 36        if options.overlap: distance = -1 * int( options.overlap )
 37        if options.output: output = int( options.output )
 38        if options.minregions: minregions = int( options.minregions )
 39        in_fname, out_fname = args
 40    except:
 41        doc_optparse.exception()
 42
 43    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
 44                            chrom_col=chr_col_1,
 45                            start_col=start_col_1,
 46                            end_col=end_col_1,
 47                            strand_col=strand_col_1,
 48                            fix_strand=True )
 49
 50    # Get the cluster tree
 51    try:
 52        clusters, extra = find_clusters( g1, mincols=distance, minregions=minregions)
 53    except ParseError, exc:
 54        fail( "Invalid file format: %s" % str( exc ) )
 55
 56    f1 = open( in_fname, "r" )
 57    out_file = open( out_fname, "w" )
 58    
 59    # If "merge"
 60    if output == 1:
 61        fields = ["."  for x in range(max(g1.chrom_col, g1.start_col, g1.end_col)+1)]
 62        for chrom, tree in clusters.items():
 63            for start, end, lines in tree.getregions():
 64                fields[g1.chrom_col] = chrom
 65                fields[g1.start_col] = str(start)
 66                fields[g1.end_col] = str(end)
 67                out_file.write( "%s\n" % "\t".join( fields ) )
 68
 69    # If "filtered" we preserve order of file and comments, etc.
 70    if output == 2:
 71        linenums = dict()
 72        for chrom, tree in clusters.items():
 73            for linenum in tree.getlines():
 74                linenums[linenum] = 0
 75        linenum = -1
 76        f1.seek(0)
 77        for line in f1.readlines():
 78            linenum += 1
 79            if linenum in linenums or linenum in extra:
 80                out_file.write( "%s\n" % line.rstrip( "\n\r" ) )
 81
 82    # If "clustered" we output original intervals, but near each other (i.e. clustered)
 83    if output == 3:
 84        linenums = list()
 85        f1.seek(0)
 86        fileLines = f1.readlines()
 87        for chrom, tree in clusters.items():
 88            for linenum in tree.getlines():
 89                out_file.write( "%s\n" % fileLines[linenum].rstrip( "\n\r" ) )
 90
 91    # If "minimum" we output the smallest interval in each cluster
 92    if output == 4 or output == 5:
 93        linenums = list()
 94        f1.seek(0)
 95        fileLines = f1.readlines()
 96        for chrom, tree in clusters.items():
 97            regions = tree.getregions()
 98            for start, end, lines in tree.getregions():
 99                outsize = -1
100                outinterval = None
101                for line in lines:
102                    # three nested for loops?
103                    # should only execute this code once per line
104                    fileline = fileLines[line].rstrip("\n\r")
105                    try:
106                        cluster_interval = GenomicInterval( g1, fileline.split("\t"), 
107                                                            g1.chrom_col, 
108                                                            g1.start_col,
109                                                            g1.end_col, 
110                                                            g1.strand_col, 
111                                                            g1.default_strand,
112                                                            g1.fix_strand )
113                    except Exception, exc:
114                        print >> sys.stderr, str( exc )
115                        f1.close()
116                        sys.exit()
117                    interval_size = cluster_interval.end - cluster_interval.start
118                    if outsize == -1 or \
119                       ( outsize > interval_size and output == 4 ) or \
120                       ( outsize < interval_size and output == 5 ) :
121                        outinterval = cluster_interval
122                        outsize = interval_size
123                out_file.write( "%s\n" % outinterval )
124
125    f1.close()
126    out_file.close()
127    
128    if g1.skipped > 0:
129        print skipped( g1, filedesc="" )
130
131if __name__ == "__main__":
132    main()