/tools/filters/lav_to_bed.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 54 lines · 40 code · 11 blank · 3 comment · 11 complexity · bbfcb4c3d20fad5ce5cae9f937bbc2f3 MD5 · raw file

  1. #!/usr/bin/env python
  2. #Reads a LAV file and writes two BED files.
  3. import sys
  4. from galaxy import eggs
  5. import pkg_resources
  6. pkg_resources.require( "bx-python" )
  7. import bx.align.lav
  8. assert sys.version_info[:2] >= ( 2, 4 )
  9. def stop_err( msg ):
  10. sys.stderr.write( msg )
  11. sys.exit()
  12. def main():
  13. try:
  14. lav_file = open(sys.argv[1],'r')
  15. bed_file1 = open(sys.argv[2],'w')
  16. bed_file2 = open(sys.argv[3],'w')
  17. except Exception, e:
  18. stop_err( str( e ) )
  19. lavsRead = 0
  20. bedsWritten = 0
  21. species = {}
  22. # TODO: this is really bad since everything is read into memory. Can we eliminate this tool?
  23. for lavBlock in bx.align.lav.Reader( lav_file ):
  24. lavsRead += 1
  25. for c in lavBlock.components:
  26. spec, chrom = bx.align.lav.src_split( c.src )
  27. if bedsWritten < 1:
  28. if len( species )==0:
  29. species[spec]=bed_file1
  30. elif len( species )==1:
  31. species[spec]=bed_file2
  32. else:
  33. continue #this is a pairwise alignment...
  34. if spec in species:
  35. species[spec].write( "%s\t%i\t%i\t%s_%s\t%i\t%s\n" % ( chrom, c.start, c.end, spec, str( bedsWritten ), 0, c.strand ) )
  36. bedsWritten += 1
  37. for spec,file in species.items():
  38. print "#FILE\t%s\t%s" % (file.name, spec)
  39. lav_file.close()
  40. bed_file1.close()
  41. bed_file2.close()
  42. print "%d lav blocks read, %d regions written\n" % (lavsRead,bedsWritten)
  43. if __name__ == "__main__": main()