/tools/filters/gff_to_bed_converter.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 133 lines · 78 code · 18 blank · 37 comment · 20 complexity · c5910f4d713d4d0fb80cbb5740a679e6 MD5 · raw file

  1. #!/usr/bin/env python
  2. import sys
  3. from galaxy import eggs
  4. from galaxy.datatypes.util.gff_util import parse_gff_attributes
  5. assert sys.version_info[:2] >= ( 2, 4 )
  6. def get_bed_line( chrom, name, strand, blocks ):
  7. """ Returns a BED line for given data. """
  8. if len( blocks ) == 1:
  9. # Use simple BED format if there is only a single block:
  10. # chrom, chromStart, chromEnd, name, score, strand
  11. #
  12. start, end = blocks[0]
  13. return "%s\t%i\t%i\t%s\t0\t%s\n" % ( chrom, start, end, name, strand )
  14. #
  15. # Build lists for transcript blocks' starts, sizes.
  16. #
  17. # Get transcript start, end.
  18. t_start = sys.maxint
  19. t_end = -1
  20. for block_start, block_end in blocks:
  21. if block_start < t_start:
  22. t_start = block_start
  23. if block_end > t_end:
  24. t_end = block_end
  25. # Get block starts, sizes.
  26. block_starts = []
  27. block_sizes = []
  28. for block_start, block_end in blocks:
  29. block_starts.append( str( block_start - t_start ) )
  30. block_sizes.append( str( block_end - block_start ) )
  31. #
  32. # Create BED entry.
  33. # Bed format: chrom, chromStart, chromEnd, name, score, strand, \
  34. # thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts
  35. #
  36. # Render complete feature with thick blocks. There's no clear way to do this unless
  37. # we analyze the block names, but making everything thick makes more sense than
  38. # making everything thin.
  39. #
  40. return "%s\t%i\t%i\t%s\t0\t%s\t%i\t%i\t0\t%i\t%s\t%s\n" % \
  41. ( chrom, t_start, t_end, name, strand, t_start, t_end, len( block_starts ),
  42. ",".join( block_sizes ), ",".join( block_starts ) )
  43. def __main__():
  44. input_name = sys.argv[1]
  45. output_name = sys.argv[2]
  46. skipped_lines = 0
  47. first_skipped_line = 0
  48. out = open( output_name, 'w' )
  49. i = 0
  50. cur_transcript_chrom = None
  51. cur_transcript_id = None
  52. cur_transcript_strand = None
  53. cur_transcripts_blocks = [] # (start, end) for each block.
  54. for i, line in enumerate( file( input_name ) ):
  55. line = line.rstrip( '\r\n' )
  56. if line and not line.startswith( '#' ):
  57. try:
  58. # GFF format: chrom source, name, chromStart, chromEnd, score, strand, attributes
  59. elems = line.split( '\t' )
  60. start = str( long( elems[3] ) - 1 )
  61. coords = [ long( start ), long( elems[4] ) ]
  62. strand = elems[6]
  63. if strand not in ['+', '-']:
  64. strand = '+'
  65. attributes = parse_gff_attributes( elems[8] )
  66. t_id = attributes.get( "transcript_id", None )
  67. if not t_id:
  68. #
  69. # No transcript ID, so write last transcript and write current line as its own line.
  70. #
  71. # Write previous transcript.
  72. if cur_transcript_id:
  73. # Write BED entry.
  74. out.write( get_bed_line( cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) )
  75. # Replace any spaces in the name with underscores so UCSC will not complain.
  76. name = elems[2].replace(" ", "_")
  77. out.write( get_bed_line( elems[0], name, strand, [ coords ] ) )
  78. continue
  79. # There is a transcript ID, so process line at transcript level.
  80. if t_id == cur_transcript_id:
  81. # Line is element of transcript and will be a block in the BED entry.
  82. cur_transcripts_blocks.append( coords )
  83. continue
  84. #
  85. # Line is part of new transcript; write previous transcript and start
  86. # new transcript.
  87. #
  88. # Write previous transcript.
  89. if cur_transcript_id:
  90. # Write BED entry.
  91. out.write( get_bed_line( cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) )
  92. # Start new transcript.
  93. cur_transcript_chrome = elems[0]
  94. cur_transcript_id = t_id
  95. cur_transcript_strand = strand
  96. cur_transcripts_blocks = []
  97. cur_transcripts_blocks.append( coords )
  98. except:
  99. skipped_lines += 1
  100. if not first_skipped_line:
  101. first_skipped_line = i + 1
  102. else:
  103. skipped_lines += 1
  104. if not first_skipped_line:
  105. first_skipped_line = i + 1
  106. # Write last transcript.
  107. if cur_transcript_id:
  108. # Write BED entry.
  109. out.write( get_bed_line( cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) )
  110. out.close()
  111. info_msg = "%i lines converted to BED. " % ( i + 1 - skipped_lines )
  112. if skipped_lines > 0:
  113. info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line )
  114. print info_msg
  115. if __name__ == "__main__": __main__()