/lib/galaxy/datatypes/converters/interval_to_coverage.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 152 lines · 116 code · 3 blank · 33 comment · 0 complexity · 7f978cdc14cc94ece76ccacbb4f4a55f MD5 · raw file

  1. #!/usr/bin/env python
  2. """
  3. Converter to generate 3 (or 4) column base-pair coverage from an interval file.
  4. usage: %prog bed_file out_file
  5. -1, --cols1=N,N,N,N: Columns for chrom, start, end, strand in interval file
  6. -2, --cols2=N,N,N,N: Columns for chrom, start, end, strand in coverage file
  7. """
  8. import sys
  9. from galaxy import eggs
  10. import pkg_resources; pkg_resources.require( "bx-python" )
  11. from bx.intervals import io
  12. from bx.cookbook import doc_optparse
  13. import psyco_full
  14. import commands
  15. import os
  16. from os import environ
  17. import tempfile
  18. from bisect import bisect
  19. INTERVAL_METADATA = ('chromCol',
  20. 'startCol',
  21. 'endCol',
  22. 'strandCol',)
  23. COVERAGE_METADATA = ('chromCol',
  24. 'positionCol',
  25. 'forwardCol',
  26. 'reverseCol',)
  27. def main( interval, coverage ):
  28. """
  29. Uses a sliding window of partitions to count coverages.
  30. Every interval record adds its start and end to the partitions. The result
  31. is a list of partitions, or every position that has a (maybe) different
  32. number of basepairs covered. We don't worry about merging because we pop
  33. as the sorted intervals are read in. As the input start positions exceed
  34. the partition positions in partitions, coverages are kicked out in bulk.
  35. """
  36. partitions = []
  37. forward_covs = []
  38. reverse_covs = []
  39. offset = 0
  40. chrom = None
  41. lastchrom = None
  42. for record in interval:
  43. chrom = record.chrom
  44. if lastchrom and not lastchrom == chrom and partitions:
  45. for partition in xrange(0, len(partitions)-1):
  46. forward = forward_covs[partition]
  47. reverse = reverse_covs[partition]
  48. if forward+reverse > 0:
  49. coverage.write(chrom=chrom, position=xrange(partitions[partition],partitions[partition+1]),
  50. forward=forward, reverse=reverse)
  51. partitions = []
  52. forward_covs = []
  53. reverse_covs = []
  54. start_index = bisect(partitions, record.start)
  55. forward = int(record.strand == "+")
  56. reverse = int(record.strand == "-")
  57. forward_base = 0
  58. reverse_base = 0
  59. if start_index > 0:
  60. forward_base = forward_covs[start_index-1]
  61. reverse_base = reverse_covs[start_index-1]
  62. partitions.insert(start_index, record.start)
  63. forward_covs.insert(start_index, forward_base)
  64. reverse_covs.insert(start_index, reverse_base)
  65. end_index = bisect(partitions, record.end)
  66. for index in xrange(start_index, end_index):
  67. forward_covs[index] += forward
  68. reverse_covs[index] += reverse
  69. partitions.insert(end_index, record.end)
  70. forward_covs.insert(end_index, forward_covs[end_index-1] - forward )
  71. reverse_covs.insert(end_index, reverse_covs[end_index-1] - reverse )
  72. if partitions:
  73. for partition in xrange(0, start_index):
  74. forward = forward_covs[partition]
  75. reverse = reverse_covs[partition]
  76. if forward+reverse > 0:
  77. coverage.write(chrom=chrom, position=xrange(partitions[partition],partitions[partition+1]),
  78. forward=forward, reverse=reverse)
  79. partitions = partitions[start_index:]
  80. forward_covs = forward_covs[start_index:]
  81. reverse_covs = reverse_covs[start_index:]
  82. lastchrom = chrom
  83. # Finish the last chromosome
  84. if partitions:
  85. for partition in xrange(0, len(partitions)-1):
  86. forward = forward_covs[partition]
  87. reverse = reverse_covs[partition]
  88. if forward+reverse > 0:
  89. coverage.write(chrom=chrom, position=xrange(partitions[partition],partitions[partition+1]),
  90. forward=forward, reverse=reverse)
  91. class CoverageWriter( object ):
  92. def __init__( self, out_stream=None, chromCol=0, positionCol=1, forwardCol=2, reverseCol=3 ):
  93. self.out_stream = out_stream
  94. self.reverseCol = reverseCol
  95. self.nlines = 0
  96. positions = {str(chromCol):'%(chrom)s',
  97. str(positionCol):'%(position)d',
  98. str(forwardCol):'%(forward)d',
  99. str(reverseCol):'%(reverse)d'}
  100. if reverseCol < 0:
  101. self.template = "%(0)s\t%(1)s\t%(2)s\n" % positions
  102. else:
  103. self.template = "%(0)s\t%(1)s\t%(2)s\t%(3)s\n" % positions
  104. def write(self, **kwargs ):
  105. if self.reverseCol < 0: kwargs['forward'] += kwargs['reverse']
  106. posgen = kwargs['position']
  107. for position in posgen:
  108. kwargs['position'] = position
  109. self.out_stream.write(self.template % kwargs)
  110. def close(self):
  111. self.out_stream.flush()
  112. self.out_stream.close()
  113. if __name__ == "__main__":
  114. options, args = doc_optparse.parse( __doc__ )
  115. try:
  116. chr_col_1, start_col_1, end_col_1, strand_col_1 = [int(x)-1 for x in options.cols1.split(',')]
  117. chr_col_2, position_col_2, forward_col_2, reverse_col_2 = [int(x)-1 for x in options.cols2.split(',')]
  118. in_fname, out_fname = args
  119. except:
  120. doc_optparse.exception()
  121. # Sort through a tempfile first
  122. temp_file = tempfile.NamedTemporaryFile(mode="r")
  123. environ['LC_ALL'] = 'POSIX'
  124. commandline = "sort -f -n -k %d -k %d -k %d -o %s %s" % (chr_col_1+1,start_col_1+1,end_col_1+1, temp_file.name, in_fname)
  125. errorcode, stdout = commands.getstatusoutput(commandline)
  126. coverage = CoverageWriter( out_stream = open(out_fname, "a"),
  127. chromCol = chr_col_2, positionCol = position_col_2,
  128. forwardCol = forward_col_2, reverseCol = reverse_col_2, )
  129. temp_file.seek(0)
  130. interval = io.NiceReaderWrapper( temp_file,
  131. chrom_col=chr_col_1,
  132. start_col=start_col_1,
  133. end_col=end_col_1,
  134. strand_col=strand_col_1,
  135. fix_strand=True )
  136. main( interval, coverage )
  137. temp_file.close()
  138. coverage.close()