/tools/fasta_tools/fasta_concatenate_by_species.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 40 lines · 29 code · 3 blank · 8 comment · 8 complexity · 4a69aa08d9f6e0d9409e2f79795b1522 MD5 · raw file

  1. #!/usr/bin/env python
  2. #Dan Blankenberg
  3. """
  4. Takes a Multiple Alignment FASTA file and concatenates
  5. sequences for each species, resulting in one sequence
  6. alignment per species.
  7. """
  8. import sys, tempfile
  9. from galaxy import eggs
  10. from galaxy.tools.util.maf_utilities import iter_fasta_alignment
  11. from galaxy.util.odict import odict
  12. def __main__():
  13. input_filename = sys.argv[1]
  14. output_filename = sys.argv[2]
  15. species = odict()
  16. cur_size = 0
  17. for components in iter_fasta_alignment( input_filename ):
  18. species_not_written = species.keys()
  19. for component in components:
  20. if component.species not in species:
  21. species[component.species] = tempfile.TemporaryFile()
  22. species[component.species].write( "-" * cur_size )
  23. species[component.species].write( component.text )
  24. try:
  25. species_not_written.remove( component.species )
  26. except ValueError:
  27. #this is a new species
  28. pass
  29. for spec in species_not_written:
  30. species[spec].write( "-" * len( components[0].text ) )
  31. cur_size += len( components[0].text )
  32. out = open( output_filename, 'wb' )
  33. for spec, f in species.iteritems():
  34. f.seek( 0 )
  35. out.write( ">%s\n%s\n" % ( spec, f.read() ) )
  36. out.close()
  37. if __name__ == "__main__" : __main__()