PageRenderTime 22ms CodeModel.GetById 13ms app.highlight 7ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/fasta_tools/fasta_concatenate_by_species.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 40 lines | 29 code | 3 blank | 8 comment | 12 complexity | 4a69aa08d9f6e0d9409e2f79795b1522 MD5 | raw file
 1#!/usr/bin/env python
 2#Dan Blankenberg
 3"""
 4Takes a Multiple Alignment FASTA file and concatenates 
 5sequences for each species, resulting in one sequence 
 6alignment per species.
 7"""
 8
 9import sys, tempfile
10from galaxy import eggs
11from galaxy.tools.util.maf_utilities import iter_fasta_alignment
12from galaxy.util.odict import odict
13
def __main__():
    """
    Concatenate the blocks of a Multiple Alignment FASTA file per species.

    Command line arguments:
        sys.argv[1] -- path of the input alignment FASTA file
        sys.argv[2] -- path of the output FASTA file

    Each alignment block yielded by iter_fasta_alignment is appended to one
    temporary file per species. A species that is absent from a block (or
    that first appears after earlier blocks were processed) is padded with
    "-" so every species ends up with the same number of columns. The block
    width is taken from the first component of each block.

    The output contains one ">species" header line followed by that species'
    concatenated text, in the order the species were first encountered.
    """
    # Local import so this function does not depend on edits to the
    # file-level import block.
    import shutil

    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    species = odict()  # species name -> temp file holding its concatenated text
    cur_size = 0  # total number of columns written so far for every known species
    try:
        for components in iter_fasta_alignment( input_filename ):
            if not components:
                # No component to measure the block width from; skip instead
                # of failing on components[0].
                continue
            block_size = len( components[0].text )
            # Species already known before this block; whatever is left in
            # this set after the loop below needs gap padding.
            species_not_written = set( species.keys() )
            for component in components:
                if component.species not in species:
                    # New species: back-fill gaps for all earlier blocks.
                    species[component.species] = tempfile.TemporaryFile()
                    species[component.species].write( "-" * cur_size )
                species[component.species].write( component.text )
                # discard() is a no-op for a species first seen in this block.
                species_not_written.discard( component.species )
            for spec in species_not_written:
                species[spec].write( "-" * block_size )
            cur_size += block_size
        # The output file is closed even if writing fails part way through.
        with open( output_filename, 'wb' ) as out:
            for spec, f in species.iteritems():
                f.seek( 0 )
                out.write( ">%s\n" % spec )
                # Stream in chunks rather than loading the whole sequence
                # into memory with f.read().
                shutil.copyfileobj( f, out )
                out.write( "\n" )
    finally:
        # Release the temporary files promptly, on success or on error.
        for f in species.values():
            f.close()
39
# Run the tool only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    __main__()