PageRenderTime 55ms CodeModel.GetById 24ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/filters/bed_to_gff_converter.py

https://bitbucket.org/chapmanb/galaxy-central
Python | 73 lines | 67 code | 3 blank | 3 comment | 22 complexity | 784cb409f233a9aa1036937a7d36b388 MD5 | raw file
Possible License(s): CC-BY-3.0
  1. #!/usr/bin/env python
  2. # This code exists in 2 places: ~/datatypes/converters and ~/tools/filters
  3. import sys
  4. assert sys.version_info[:2] >= ( 2, 4 )
  5. def __main__():
  6. input_name = sys.argv[1]
  7. output_name = sys.argv[2]
  8. skipped_lines = 0
  9. first_skipped_line = 0
  10. out = open( output_name, 'w' )
  11. out.write( "##gff-version 2\n" )
  12. out.write( "##bed_to_gff_converter.py\n\n" )
  13. i = 0
  14. for i, line in enumerate( file( input_name ) ):
  15. complete_bed = False
  16. line = line.rstrip( '\r\n' )
  17. if line and not line.startswith( '#' ) and not line.startswith( 'track' ) and not line.startswith( 'browser' ):
  18. try:
  19. elems = line.split( '\t' )
  20. if len( elems ) == 12:
  21. complete_bed = True
  22. chrom = elems[0]
  23. if complete_bed:
  24. feature = "mRNA"
  25. else:
  26. try:
  27. feature = elems[3]
  28. except:
  29. feature = 'feature%d' % ( i + 1 )
  30. start = int( elems[1] ) + 1
  31. end = int( elems[2] )
  32. try:
  33. score = elems[4]
  34. except:
  35. score = '0'
  36. try:
  37. strand = elems[5]
  38. except:
  39. strand = '+'
  40. try:
  41. group = elems[3]
  42. except:
  43. group = 'group%d' % ( i + 1 )
  44. if complete_bed:
  45. out.write( '%s\tbed2gff\t%s\t%d\t%d\t%s\t%s\t.\t%s %s;\n' % ( chrom, feature, start, end, score, strand, feature, group ) )
  46. else:
  47. out.write( '%s\tbed2gff\t%s\t%d\t%d\t%s\t%s\t.\t%s;\n' % ( chrom, feature, start, end, score, strand, group ) )
  48. if complete_bed:
  49. # We have all the info necessary to annotate exons for genes and mRNAs
  50. block_count = int( elems[9] )
  51. block_sizes = elems[10].split( ',' )
  52. block_starts = elems[11].split( ',' )
  53. for j in range( block_count ):
  54. exon_start = int( start ) + int( block_starts[j] )
  55. exon_end = exon_start + int( block_sizes[j] ) - 1
  56. out.write( '%s\tbed2gff\texon\t%d\t%d\t%s\t%s\t.\texon %s;\n' % ( chrom, exon_start, exon_end, score, strand, group ) )
  57. except:
  58. skipped_lines += 1
  59. if not first_skipped_line:
  60. first_skipped_line = i + 1
  61. else:
  62. skipped_lines += 1
  63. if not first_skipped_line:
  64. first_skipped_line = i + 1
  65. out.close()
  66. info_msg = "%i lines converted to GFF version 2. " % ( i + 1 - skipped_lines )
  67. if skipped_lines > 0:
  68. info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line )
  69. print info_msg
  70. if __name__ == "__main__": __main__()