PageRenderTime 18ms CodeModel.GetById 2ms app.highlight 13ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/galaxy/datatypes/converters/interval_to_coverage.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 152 lines | 116 code | 3 blank | 33 comment | 2 complexity | 7f978cdc14cc94ece76ccacbb4f4a55f MD5 | raw file
  1#!/usr/bin/env python
  2"""
  3Converter to generate 3 (or 4) column base-pair coverage from an interval file.
  4
  5usage: %prog bed_file out_file
  6    -1, --cols1=N,N,N,N: Columns for chrom, start, end, strand in interval file
  7    -2, --cols2=N,N,N,N: Columns for chrom, start, end, strand in coverage file
  8"""
  9import sys
 10from galaxy import eggs
 11import pkg_resources; pkg_resources.require( "bx-python" )
 12from bx.intervals import io
 13from bx.cookbook import doc_optparse
 14import psyco_full
 15import commands
 16import os
 17from os import environ
 18import tempfile
 19from bisect import bisect
 20
 21INTERVAL_METADATA = ('chromCol',
 22                     'startCol',
 23                     'endCol',
 24                     'strandCol',)
 25
 26COVERAGE_METADATA = ('chromCol',
 27                     'positionCol',
 28                     'forwardCol',
 29                     'reverseCol',)
 30
 31def main( interval, coverage ):
 32    """
 33    Uses a sliding window of partitions to count coverages.
 34    Every interval record adds its start and end to the partitions.  The result
 35    is a list of partitions, or every position that has a (maybe) different
 36    number of basepairs covered.  We don't worry about merging because we pop
 37    as the sorted intervals are read in.  As the input start positions exceed
 38    the partition positions in partitions, coverages are kicked out in bulk.
 39    """
 40    partitions = []
 41    forward_covs = []
 42    reverse_covs = []
 43    offset = 0
 44    chrom = None
 45    lastchrom = None
 46    for record in interval:
 47        chrom = record.chrom
 48        if lastchrom and not lastchrom == chrom and partitions:
 49            for partition in xrange(0, len(partitions)-1):
 50                forward = forward_covs[partition]
 51                reverse = reverse_covs[partition]
 52                if forward+reverse > 0:
 53                    coverage.write(chrom=chrom, position=xrange(partitions[partition],partitions[partition+1]),
 54                                   forward=forward, reverse=reverse)
 55            partitions = []
 56            forward_covs = []
 57            reverse_covs = []
 58
 59        start_index = bisect(partitions, record.start)
 60        forward = int(record.strand == "+")
 61        reverse = int(record.strand == "-")
 62        forward_base = 0
 63        reverse_base = 0
 64        if start_index > 0:
 65            forward_base = forward_covs[start_index-1]
 66            reverse_base = reverse_covs[start_index-1]
 67        partitions.insert(start_index, record.start)
 68        forward_covs.insert(start_index, forward_base)
 69        reverse_covs.insert(start_index, reverse_base)
 70        end_index = bisect(partitions, record.end)
 71        for index in xrange(start_index, end_index):
 72            forward_covs[index] += forward
 73            reverse_covs[index] += reverse
 74        partitions.insert(end_index, record.end)
 75        forward_covs.insert(end_index, forward_covs[end_index-1] - forward )
 76        reverse_covs.insert(end_index, reverse_covs[end_index-1] - reverse )
 77
 78        if partitions:
 79            for partition in xrange(0, start_index):
 80                forward = forward_covs[partition]
 81                reverse = reverse_covs[partition]
 82                if forward+reverse > 0:
 83                    coverage.write(chrom=chrom, position=xrange(partitions[partition],partitions[partition+1]),
 84                                   forward=forward, reverse=reverse)
 85            partitions = partitions[start_index:]
 86            forward_covs = forward_covs[start_index:]
 87            reverse_covs = reverse_covs[start_index:]
 88
 89        lastchrom = chrom
 90
 91    # Finish the last chromosome
 92    if partitions:
 93        for partition in xrange(0, len(partitions)-1):
 94            forward = forward_covs[partition]
 95            reverse = reverse_covs[partition]
 96            if forward+reverse > 0:
 97                coverage.write(chrom=chrom, position=xrange(partitions[partition],partitions[partition+1]),
 98                               forward=forward, reverse=reverse)
 99
100class CoverageWriter( object ):
101    def __init__( self, out_stream=None, chromCol=0, positionCol=1, forwardCol=2, reverseCol=3 ):
102        self.out_stream = out_stream
103        self.reverseCol = reverseCol
104        self.nlines = 0
105        positions = {str(chromCol):'%(chrom)s',
106                     str(positionCol):'%(position)d',
107                     str(forwardCol):'%(forward)d',
108                     str(reverseCol):'%(reverse)d'}
109        if reverseCol < 0:
110            self.template = "%(0)s\t%(1)s\t%(2)s\n" % positions
111        else:
112            self.template = "%(0)s\t%(1)s\t%(2)s\t%(3)s\n" % positions
113
114    def write(self, **kwargs ):
115        if self.reverseCol < 0: kwargs['forward'] += kwargs['reverse']
116        posgen = kwargs['position']
117        for position in posgen:
118            kwargs['position'] = position
119            self.out_stream.write(self.template % kwargs)
120
121    def close(self):
122        self.out_stream.flush()
123        self.out_stream.close()
124
125if __name__ == "__main__":
126    options, args = doc_optparse.parse( __doc__ )
127    try:
128        chr_col_1, start_col_1, end_col_1, strand_col_1 = [int(x)-1 for x in options.cols1.split(',')]
129        chr_col_2, position_col_2, forward_col_2, reverse_col_2 = [int(x)-1 for x in options.cols2.split(',')]
130        in_fname, out_fname = args
131    except:
132        doc_optparse.exception()
133
134    # Sort through a tempfile first
135    temp_file = tempfile.NamedTemporaryFile(mode="r")
136    environ['LC_ALL'] = 'POSIX'
137    commandline = "sort -f -n -k %d -k %d -k %d -o %s %s" % (chr_col_1+1,start_col_1+1,end_col_1+1, temp_file.name, in_fname)
138    errorcode, stdout = commands.getstatusoutput(commandline)
139
140    coverage = CoverageWriter( out_stream = open(out_fname, "a"),
141                               chromCol = chr_col_2, positionCol = position_col_2,
142                               forwardCol = forward_col_2, reverseCol = reverse_col_2, )
143    temp_file.seek(0)
144    interval = io.NiceReaderWrapper( temp_file,
145                                     chrom_col=chr_col_1,
146                                     start_col=start_col_1,
147                                     end_col=end_col_1,
148                                     strand_col=strand_col_1,
149                                     fix_strand=True )
150    main( interval, coverage )
151    temp_file.close()
152    coverage.close()