PageRenderTime 48ms CodeModel.GetById 1ms app.highlight 41ms RepoModel.GetById 1ms app.codeStats 1ms

/lib/galaxy/datatypes/assembly.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 226 lines | 176 code | 8 blank | 42 comment | 15 complexity | 33b1ec232a7d15035cbf857101d809bd MD5 | raw file
  1"""
  2velvet datatypes
  3James E Johnson - University of Minnesota
  4for velvet assembler tool in galaxy
  5"""
  6
  7import data
  8import logging
  9import os
 10import re
 11import sys
 12from galaxy.datatypes import sequence
 13from galaxy.datatypes.images import Html
 14from galaxy.datatypes.metadata import MetadataElement
 15
 16
 17log = logging.getLogger(__name__)
 18
 19class Amos( data.Text ):
 20    """Class describing the AMOS assembly file """
 21    file_ext = 'afg'
 22
 23    def sniff( self, filename ):
 24        # FIXME: this method will read the entire file.
 25        # It should call get_headers() like other sniff methods.
 26        """
 27        Determines whether the file is an amos assembly file format
 28        Example::
 29
 30          {CTG
 31          iid:1
 32          eid:1
 33          seq:
 34          CCTCTCCTGTAGAGTTCAACCGA-GCCGGTAGAGTTTTATCA
 35          .
 36          qlt:
 37          DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD
 38          .
 39          {TLE
 40          src:1027
 41          off:0
 42          clr:618,0
 43          gap:
 44          250 612
 45          .
 46          }
 47          }
 48        """
 49        isAmos = False
 50        try:
 51            fh = open( filename )
 52            while not isAmos:
 53                line = fh.readline()
 54                if not line:
 55                    break #EOF
 56                line = line.strip()
 57                if line: #first non-empty line
 58                    if line.startswith( '{' ):
 59                        if re.match(r'{(RED|CTG|TLE)$',line):
 60                            isAmos = True
 61            fh.close()
 62        except:
 63            pass
 64        return isAmos
 65
 66class Sequences( sequence.Fasta ):
 67    """Class describing the Sequences file generated by velveth """
 68
 69    def sniff( self, filename ):
 70        """
 71        Determines whether the file is a velveth produced  fasta format
 72        The id line has 3 fields separated by tabs: sequence_name  sequence_index cataegory::
 73
 74          >SEQUENCE_0_length_35   1       1
 75          GGATATAGGGCCAACCCAACTCAACGGCCTGTCTT
 76          >SEQUENCE_1_length_35   2       1
 77          CGACGAATGACAGGTCACGAATTTGGCGGGGATTA
 78        """
 79
 80        try:
 81            fh = open( filename )
 82            while True:
 83                line = fh.readline()
 84                if not line:
 85                    break #EOF
 86                line = line.strip()
 87                if line: #first non-empty line
 88                    if line.startswith( '>' ):
 89                        if not re.match(r'>[^\t]+\t\d+\t\d+$',line):
 90                            break
 91                        #The next line.strip() must not be '', nor startwith '>'
 92                        line = fh.readline().strip()
 93                        if line == '' or line.startswith( '>' ):
 94                            break
 95                        return True
 96                    else:
 97                        break #we found a non-empty line, but its not a fasta header
 98            fh.close()
 99        except:
100            pass
101        return False
102
class Roadmaps( data.Text ):
    """Class describing the Roadmaps file generated by velveth """

    def sniff( self, filename ):
        """
        Determines whether the file is a velveth produced RoadMap.

        The first non-empty line must consist of three tab-separated
        integers and the line right after it must be ``ROADMAP <number>``::

          142858  21      1
          ROADMAP 1
          ROADMAP 2
          ...

        Returns True when both conditions hold and False otherwise,
        including when the file cannot be opened or read.
        """
        try:
            # The context manager closes the handle on every exit path; the
            # previous implementation returned True without closing it.
            with open( filename ) as fh:
                # iter(readline, '') keeps all reads on readline(), so the
                # explicit readline() below can be combined with the loop.
                for line in iter( fh.readline, '' ):
                    line = line.strip()
                    if not line:
                        # Skip leading blank lines, as Sequences.sniff does.
                        # Previously a blank first line aborted the sniff,
                        # which made the surrounding loop dead code.
                        continue
                    # First non-empty line: the three-integer header.
                    if not re.match( r'\d+\t\d+\t\d+$', line ):
                        return False
                    # The next line.strip() should be 'ROADMAP 1'
                    following = fh.readline().strip()
                    return re.match( r'ROADMAP \d+$', following ) is not None
        except Exception:
            # Sniffing is best-effort: an unreadable file is reported as
            # "not a velveth Roadmaps file" instead of failing the caller.
            pass
        return False
136
class Velvet( Html ):
    """
    Composite datatype whose primary file is a generated HTML index of its
    component files: 'Sequences', 'Roadmaps' and an optional 'Log'.
    """
    MetadataElement( name="base_name", desc="base name for velveth dataset", default="velvet", readonly=True, set_in_upload=True)
    MetadataElement( name="paired_end_reads", desc="has paired-end reads", default="False", readonly=False, set_in_upload=True)
    MetadataElement( name="long_reads", desc="has long reads", default="False", readonly=False, set_in_upload=True)
    MetadataElement( name="short2_reads", desc="has 2nd short reads", default="False", readonly=False, set_in_upload=True)
    composite_type = 'auto_primary_file'
    allow_datatype_change = False
    file_ext = 'html'

    def __init__( self, **kwd ):
        """Register the component files that make up the composite dataset."""
        Html.__init__( self, **kwd )
        self.add_composite_file( 'Sequences', mimetype = 'text/html', description = 'Sequences', substitute_name_with_metadata = None, is_binary = False )
        self.add_composite_file( 'Roadmaps', mimetype = 'text/html', description = 'Roadmaps', substitute_name_with_metadata = None, is_binary = False )
        self.add_composite_file( 'Log', mimetype = 'text/html', description = 'Log', optional = 'True', substitute_name_with_metadata = None, is_binary = False )

    def _composite_file_links( self, dataset, caller, skip_log = False ):
        """
        Build the ``<li>`` link entries for the composite files of `dataset`.

        `caller` is only used to tag the debug log lines.  When `skip_log`
        is true, files whose name contains 'Log' are logged but not listed.
        Returns a list of HTML strings, one per listed file.
        """
        links = []
        for composite_name, composite_file in self.get_composite_files( dataset = dataset ).iteritems():
            log.debug( "Velvet log info  %s %s %s", caller, composite_name, composite_file )
            if skip_log and re.search( 'Log', composite_name ) is not None:
                continue
            opt_text = ' (optional)' if composite_file.optional else ''
            description = composite_file.get( 'description' )
            if description:
                links.append( '<li><a href="%s" type="text/plain">%s (%s)</a>%s</li>' % ( composite_name, composite_name, description, opt_text ) )
            else:
                links.append( '<li><a href="%s" type="text/plain">%s</a>%s</li>' % ( composite_name, composite_name, opt_text ) )
        return links

    def generate_primary_file( self, dataset = None ):
        """
        Return the HTML text of the primary file: a list linking to every
        composite file of `dataset`.
        """
        log.debug( "Velvet log info  %s %s", 'JJ generate_primary_file', dataset )
        rval = ['<html><head><title>Velvet Galaxy Composite Dataset </title></head><p/>']
        rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
        rval.extend( self._composite_file_links( dataset, 'JJ generate_primary_file' ) )
        rval.append( '</ul></div></html>' )
        return "\n".join( rval )

    def regenerate_primary_file( self, dataset ):
        """
        Rewrite the primary HTML file of `dataset` and refresh its metadata.

        Reads up to the first 1000 characters of the 'Log' file in the
        dataset's extra files path and uses the velveth options found there
        to set the paired_end_reads, long_reads and short2_reads metadata
        and `dataset.info`.  A missing or unreadable Log is only logged; the
        primary file is rewritten either way.

        cannot do this until we are setting metadata
        """
        log.debug( "Velvet log info  %s", 'JJ regenerate_primary_file' )
        gen_msg = ''
        # Bound before the try block so the except handler can always report
        # it, even when looking up extra_files_path itself is what failed.
        efp = None
        try:
            efp = dataset.extra_files_path
            log_path = os.path.join( efp, 'Log' )
            with open( log_path, 'r' ) as log_fh:
                log_content = log_fh.read( 1000 )
            # Strip directory components so only file base names remain.
            log_msg = re.sub( r'/\S*/', '', log_content )
            log.debug( "Velveth log info  %s", log_msg )
            paired_end_reads = re.search( '-(short|long)Paired', log_msg ) is not None
            dataset.metadata.paired_end_reads = paired_end_reads
            long_reads = re.search( '-long', log_msg ) is not None
            dataset.metadata.long_reads = long_reads
            short2_reads = re.search( '-short(Paired)?2', log_msg ) is not None
            dataset.metadata.short2_reads = short2_reads
            dataset.info = re.sub( r'.*velveth \S+', 'hash_length', log_msg.replace( '\n', ' ' ) )
            if paired_end_reads:
                gen_msg = gen_msg + ' Paired-End Reads'
            if long_reads:
                gen_msg = gen_msg + ' Long Reads'
            if len( gen_msg ) > 0:
                gen_msg = 'Uses: ' + gen_msg
        except Exception:
            # Best-effort: the Log file is optional, so a failure here must
            # not prevent the primary file from being regenerated.
            log.debug( "Velveth could not read Log file in %s", efp )
        log.debug( "Velveth log info  %s", gen_msg )
        rval = ['<html><head><title>Velvet Galaxy Composite Dataset </title></head><p/>']
        rval.append('<div>Generated:<p/> %s </div>' %(gen_msg))
        rval.append('<div>Velveth dataset:<p/><ul>')
        rval.extend( self._composite_file_links( dataset, 'JJ regenerate_primary_file', skip_log = True ) )
        rval.append( '</ul></div></html>' )
        # open() instead of the Python-2-only file() builtin; the context
        # manager guarantees the handle is flushed and closed.
        with open( dataset.file_name, 'w' ) as out:
            out.write( "\n".join( rval ) )
            out.write( '\n' )

    def set_meta( self, dataset, **kwd ):
        """Set the inherited Html metadata, then rebuild the primary file."""
        Html.set_meta( self, dataset, **kwd )
        self.regenerate_primary_file( dataset )
223
if __name__ == '__main__':
    # Executed as a script: run the doctests found in this module's docstrings.
    from doctest import testmod
    testmod( sys.modules[__name__] )