/tools/filters/gtf_to_bedgraph_converter.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 80 lines · 60 code · 10 blank · 10 comment · 16 complexity · fffc489c0adff4d8a42850499e31652c MD5 · raw file

  1. #!/usr/bin/env python
  2. import os, sys, tempfile
  3. assert sys.version_info[:2] >= ( 2, 4 )
  4. def __main__():
  5. # Read parms.
  6. input_name = sys.argv[1]
  7. output_name = sys.argv[2]
  8. attribute_name = sys.argv[3]
  9. # Create temp files.
  10. tmp_name1 = tempfile.NamedTemporaryFile().name
  11. tmp_name2 = tempfile.NamedTemporaryFile().name
  12. # Do conversion.
  13. skipped_lines = 0
  14. first_skipped_line = 0
  15. out = open( tmp_name1, 'w' )
  16. # Write track data to temporary file.
  17. i = 0
  18. for i, line in enumerate( file( input_name ) ):
  19. line = line.rstrip( '\r\n' )
  20. if line and not line.startswith( '#' ):
  21. try:
  22. elems = line.split( '\t' )
  23. start = str( int( elems[3] ) - 1 ) # GTF coordinates are 1-based, BedGraph are 0-based.
  24. strand = elems[6]
  25. if strand not in ['+', '-']:
  26. strand = '+'
  27. attributes_list = elems[8].split(";")
  28. attributes = {}
  29. for name_value_pair in attributes_list:
  30. pair = name_value_pair.strip().split(" ")
  31. name = pair[0].strip()
  32. if name == '':
  33. continue
  34. # Need to strip double quote from values
  35. value = pair[1].strip(" \"")
  36. attributes[name] = value
  37. value = attributes[ attribute_name ]
  38. # GTF format: chrom source, name, chromStart, chromEnd, score, strand, frame, attributes.
  39. # BedGraph format: chrom, chromStart, chromEnd, value
  40. out.write( "%s\t%s\t%s\t%s\n" %( elems[0], start, elems[4], value ) )
  41. except:
  42. skipped_lines += 1
  43. if not first_skipped_line:
  44. first_skipped_line = i + 1
  45. else:
  46. skipped_lines += 1
  47. if not first_skipped_line:
  48. first_skipped_line = i + 1
  49. out.close()
  50. # Sort tmp file by chromosome name and chromosome start to create ordered track data.
  51. cmd = "sort -k1,1 -k2,2n < %s > %s" % ( tmp_name1, tmp_name2 )
  52. try:
  53. os.system(cmd)
  54. os.remove(tmp_name1)
  55. except Exception, ex:
  56. sys.stderr.write( "%s\n" % ex )
  57. sys.exit(1)
  58. # Create bedgraph file by combining track definition with ordered track data.
  59. cmd = "echo 'track type=bedGraph' | cat - %s > %s " % ( tmp_name2, output_name )
  60. try:
  61. os.system(cmd)
  62. os.remove(tmp_name2)
  63. except Exception, ex:
  64. sys.stderr.write( "%s\n" % ex )
  65. sys.exit(1)
  66. info_msg = "%i lines converted to BEDGraph. " % ( i + 1 - skipped_lines )
  67. if skipped_lines > 0:
  68. info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line )
  69. print info_msg
  70. if __name__ == "__main__": __main__()