PageRenderTime 11ms CodeModel.GetById 0ms RepoModel.GetById 0ms app.codeStats 0ms

/tools/filters/axt_to_lav.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 176 lines | 153 code | 12 blank | 11 comment | 6 complexity | 529af9fc08e343352fdd8bbfdc037321 MD5 | raw file
  1. #!/usr/bin/env python
  2. """
  3. Application to convert AXT file to LAV file
  4. -------------------------------------------
  5. :Author: Bob Harris (rsharris@bx.psu.edu)
  6. :Version: $Revision: $
  7. The application reads an AXT file from standard input and writes a LAV file to
  8. standard out; some statistics are written to standard error.
  9. """
  10. import sys, copy
  11. from galaxy import eggs
  12. import pkg_resources
  13. pkg_resources.require( "bx-python" )
  14. import bx.align.axt
  15. import bx.align.lav
  16. assert sys.version_info[:2] >= ( 2, 4 )
  17. def usage(s=None):
  18. message = """
  19. axt_to_lav primary_spec secondary_spec [--silent] < axt_file > lav_file
  20. Each spec is of the form seq_file[:species_name]:lengths_file.
  21. seq_file should be a format string for the file names for the individual
  22. sequences, with %s to be replaced by the alignment's src field. For example,
  23. "hg18/%s.nib" would prescribe files named "hg18/chr1.nib", "hg18/chr2.nib",
  24. etc.
  25. species_name is optional. If present, it is prepended to the alignment's src
  26. field.
  27. Lengths files provide the length of each chromosome (lav format needs this
  28. information but axt file does not contain it). The format is a series of
  29. lines of the form
  30. <chromosome name> <length>
  31. The chromosome field in each axt block must match some <chromosome name> in
  32. the lengths file.
  33. """
  34. if (s == None): sys.exit (message)
  35. else: sys.exit ("%s\n%s" % (s,message))
  36. def main():
  37. global debug
  38. # parse the command line
  39. primary = None
  40. secondary = None
  41. silent = False
  42. # pick off options
  43. args = sys.argv[1:]
  44. seq_file2 = open(args.pop(-1),'w')
  45. seq_file1 = open(args.pop(-1),'w')
  46. lav_out = args.pop(-1)
  47. axt_in = args.pop(-1)
  48. while (len(args) > 0):
  49. arg = args.pop(0)
  50. val = None
  51. fields = arg.split("=",1)
  52. if (len(fields) == 2):
  53. arg = fields[0]
  54. val = fields[1]
  55. if (val == ""):
  56. usage("missing a value in %s=" % arg)
  57. if (arg == "--silent") and (val == None):
  58. silent = True
  59. elif (primary == None) and (val == None):
  60. primary = arg
  61. elif (secondary == None) and (val == None):
  62. secondary = arg
  63. else:
  64. usage("unknown argument: %s" % arg)
  65. if (primary == None):
  66. usage("missing primary file name and length")
  67. if (secondary == None):
  68. usage("missing secondary file name and length")
  69. try:
  70. (primaryFile,primary,primaryLengths) = parse_spec(primary)
  71. except:
  72. usage("bad primary spec (must be seq_file[:species_name]:lengths_file")
  73. try:
  74. (secondaryFile,secondary,secondaryLengths) = parse_spec(secondary)
  75. except:
  76. usage("bad secondary spec (must be seq_file[:species_name]:lengths_file")
  77. # read the lengths
  78. speciesToLengths = {}
  79. speciesToLengths[primary] = read_lengths (primaryLengths)
  80. speciesToLengths[secondary] = read_lengths (secondaryLengths)
  81. # read the alignments
  82. out = bx.align.lav.Writer(open(lav_out,'w'), \
  83. attributes = { "name_format_1" : primaryFile,
  84. "name_format_2" : secondaryFile })
  85. axtsRead = 0
  86. axtsWritten = 0
  87. for axtBlock in bx.align.axt.Reader(open(axt_in), \
  88. species_to_lengths = speciesToLengths,
  89. species1 = primary,
  90. species2 = secondary,
  91. support_ids = True):
  92. axtsRead += 1
  93. out.write (axtBlock)
  94. primary_c = axtBlock.get_component_by_src_start(primary)
  95. secondary_c = axtBlock.get_component_by_src_start(secondary)
  96. print >>seq_file1, ">%s_%s_%s_%s" % (primary_c.src,secondary_c.strand,primary_c.start,primary_c.start+primary_c.size)
  97. print >>seq_file1,primary_c.text
  98. print >>seq_file1
  99. print >>seq_file2, ">%s_%s_%s_%s" % (secondary_c.src,secondary_c.strand,secondary_c.start,secondary_c.start+secondary_c.size)
  100. print >>seq_file2,secondary_c.text
  101. print >>seq_file2
  102. axtsWritten += 1
  103. out.close()
  104. seq_file1.close()
  105. seq_file2.close()
  106. if (not silent):
  107. sys.stdout.write ("%d blocks read, %d written\n" % (axtsRead,axtsWritten))
  108. def parse_spec(spec): # returns (seq_file,species_name,lengths_file)
  109. fields = spec.split(":")
  110. if (len(fields) == 2): return (fields[0],"",fields[1])
  111. elif (len(fields) == 3): return (fields[0],fields[1],fields[2])
  112. else: raise ValueError
  113. def read_lengths (fileName):
  114. chromToLength = {}
  115. f = file (fileName, "r")
  116. for lineNumber,line in enumerate(f):
  117. line = line.strip()
  118. if (line == ""): continue
  119. if (line.startswith("#")): continue
  120. fields = line.split ()
  121. if (len(fields) != 2):
  122. raise "bad lengths line (%s:%d): %s" % (fileName,lineNumber,line)
  123. chrom = fields[0]
  124. try:
  125. length = int(fields[1])
  126. except:
  127. raise "bad lengths line (%s:%d): %s" % (fileName,lineNumber,line)
  128. if (chrom in chromToLength):
  129. raise "%s appears more than once (%s:%d): %s" \
  130. % (chrom,fileName,lineNumber)
  131. chromToLength[chrom] = length
  132. f.close ()
  133. return chromToLength
  134. if __name__ == "__main__": main()