/lib/galaxy/datatypes/assembly.py
https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 226 lines · 154 code · 17 blank · 55 comment · 36 complexity · 33b1ec232a7d15035cbf857101d809bd MD5 · raw file
- """
- velvet datatypes
- James E Johnson - University of Minnesota
- for velvet assembler tool in galaxy
- """
- import data
- import logging
- import os
- import re
- import sys
- from galaxy.datatypes import sequence
- from galaxy.datatypes.images import Html
- from galaxy.datatypes.metadata import MetadataElement
- log = logging.getLogger(__name__)
class Amos( data.Text ):
    """Class describing the AMOS assembly file """
    file_ext = 'afg'

    def sniff( self, filename ):
        """
        Determines whether the file is an amos assembly file format

        The file is accepted as soon as one line, once stripped of
        surrounding whitespace, is exactly ``{RED``, ``{CTG`` or ``{TLE``.
        Returns True in that case; returns False when no such line exists
        or when the file cannot be opened or read.

        Example::

          {CTG
          iid:1
          eid:1
          seq:
          CCTCTCCTGTAGAGTTCAACCGA-GCCGGTAGAGTTTTATCA
          .
          qlt:
          DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD
          .
          {TLE
          src:1027
          off:0
          clr:618,0
          gap:
          250 612
          .
          }
          }
        """
        # FIXME: a file that contains no matching line is read to the end.
        # It should call get_headers() like other sniff methods.
        try:
            # The context manager closes the file on every exit path,
            # including the early return and any read error.
            with open( filename ) as fh:
                for raw_line in fh:
                    if re.match( r'{(RED|CTG|TLE)$', raw_line.strip() ):
                        return True
        except ( IOError, OSError, ValueError ):
            # Sniffing is best-effort: an unreadable or undecodable file
            # (decode errors are ValueError subclasses) is simply not AMOS.
            pass
        return False
class Sequences( sequence.Fasta ):
    """Class describing the Sequences file generated by velveth """

    def sniff( self, filename ):
        """
        Determines whether the file is a velveth produced fasta format

        The id line has 3 fields separated by tabs:
        sequence_name sequence_index category::

          >SEQUENCE_0_length_35	1	1
          GGATATAGGGCCAACCCAACTCAACGGCCTGTCTT
          >SEQUENCE_1_length_35	2	1
          CGACGAATGACAGGTCACGAATTTGGCGGGGATTA

        Leading blank lines are skipped.  The first non-empty line must be
        such an id line, and the line right after it must be non-empty and
        must not itself start with '>'.  Returns True when both conditions
        hold; returns False otherwise, including when the file cannot be
        opened or read.
        """
        try:
            # The context manager closes the file on every exit path; the
            # previous implementation leaked the handle when it returned True.
            with open( filename ) as fh:
                header = ''
                while not header:
                    raw_line = fh.readline()
                    if not raw_line:
                        return False  # EOF before any non-empty line
                    header = raw_line.strip()
                # The pattern starts with '>', so this also rejects a first
                # non-empty line that is not a fasta header at all.
                if not re.match( r'>[^\t]+\t\d+\t\d+$', header ):
                    return False
                # The next line.strip() must not be '', nor start with '>'
                first_sequence = fh.readline().strip()
                return first_sequence != '' and not first_sequence.startswith( '>' )
        except ( IOError, OSError, ValueError ):
            # Sniffing is best-effort: an unreadable or undecodable file
            # (decode errors are ValueError subclasses) is not a match.
            return False
class Roadmaps( data.Text ):
    """Class describing the Roadmaps file generated by velveth """

    def sniff( self, filename ):
        """
        Determines whether the file is a velveth produced RoadMap

        The first line holds three tab-separated integers and the second
        line is a ``ROADMAP <n>`` marker::

          142858	21	1
          ROADMAP 1
          ROADMAP 2
          ...

        Only the first two lines of the file are examined; a blank or
        missing first line is rejected.  Returns True when both lines have
        the expected form; returns False otherwise, including when the file
        cannot be opened or read.
        """
        try:
            # The context manager closes the file on every exit path; the
            # previous implementation leaked the handle when it returned True.
            with open( filename ) as fh:
                # An empty string (EOF or blank line) cannot match the header
                # pattern, so both cases are rejected here.
                header = fh.readline().strip()
                if not re.match( r'\d+\t\d+\t\d+$', header ):
                    return False
                # The next line.strip() should be 'ROADMAP 1'
                marker = fh.readline().strip()
                return re.match( r'ROADMAP \d+$', marker ) is not None
        except ( IOError, OSError, ValueError ):
            # Sniffing is best-effort: an unreadable or undecodable file
            # (decode errors are ValueError subclasses) is not a match.
            return False
class Velvet( Html ):
    """
    Composite datatype whose primary file is an HTML index linking to the
    component files registered in __init__ (Sequences, Roadmaps and an
    optional Log).
    """
    MetadataElement( name="base_name", desc="base name for velveth dataset", default="velvet", readonly=True, set_in_upload=True)
    MetadataElement( name="paired_end_reads", desc="has paired-end reads", default="False", readonly=False, set_in_upload=True)
    MetadataElement( name="long_reads", desc="has long reads", default="False", readonly=False, set_in_upload=True)
    MetadataElement( name="short2_reads", desc="has 2nd short reads", default="False", readonly=False, set_in_upload=True)
    composite_type = 'auto_primary_file'
    allow_datatype_change = False
    file_ext = 'html'

    def __init__( self, **kwd ):
        """Register the component files that make up the composite dataset."""
        Html.__init__( self, **kwd )
        self.add_composite_file( 'Sequences', mimetype = 'text/html', description = 'Sequences', substitute_name_with_metadata = None, is_binary = False )
        self.add_composite_file( 'Roadmaps', mimetype = 'text/html', description = 'Roadmaps', substitute_name_with_metadata = None, is_binary = False )
        self.add_composite_file( 'Log', mimetype = 'text/html', description = 'Log', optional = 'True', substitute_name_with_metadata = None, is_binary = False )

    def _composite_links( self, dataset, caller, skip_log = False ):
        """
        Build one ``<li>`` link per composite file of ``dataset``.

        ``caller`` is only used to tag the debug log lines.  When
        ``skip_log`` is true, entries whose name contains 'Log' are logged
        but left out of the result.  Returns a list of HTML strings.
        """
        links = []
        # .items() works on both Python 2 and 3, unlike .iteritems().
        for fn, composite_file in self.get_composite_files( dataset = dataset ).items():
            log.debug( "Velvet log info %s %s %s", caller, fn, composite_file )
            if skip_log and 'Log' in fn:
                continue
            opt_text = ' (optional)' if composite_file.optional else ''
            description = composite_file.get( 'description' )
            if description:
                links.append( '<li><a href="%s" type="text/plain">%s (%s)</a>%s</li>' % ( fn, fn, description, opt_text ) )
            else:
                links.append( '<li><a href="%s" type="text/plain">%s</a>%s</li>' % ( fn, fn, opt_text ) )
        return links

    def generate_primary_file( self, dataset = None ):
        """Return the HTML index page listing every composite file as a string."""
        log.debug( "Velvet log info %s %s", 'JJ generate_primary_file', dataset )
        rval = ['<html><head><title>Velvet Galaxy Composite Dataset </title></head><p/>']
        rval.append( '<div>This composite dataset is composed of the following files:<p/><ul>' )
        rval.extend( self._composite_links( dataset, 'JJ generate_primary_file' ) )
        rval.append( '</ul></div></html>' )
        return "\n".join( rval )

    def regenerate_primary_file( self, dataset ):
        """
        Rewrite the primary HTML file of ``dataset`` from its Log file.

        Reads up to the first 1000 characters of ``Log`` in the dataset's
        extra files directory, derives the paired_end_reads, long_reads and
        short2_reads metadata and ``dataset.info`` from the options found
        there, then writes a fresh index page (without the Log entry) to
        ``dataset.file_name``.  A missing or unreadable Log is logged at
        debug level and leaves the metadata untouched.

        cannot do this until we are setting metadata
        """
        log.debug( "Velvet log info %s", 'JJ regenerate_primary_file' )
        gen_msg = ''
        # Bound before the try so the except handler can always report it.
        efp = None
        try:
            efp = dataset.extra_files_path
            log_path = os.path.join( efp, 'Log' )
            with open( log_path, 'r' ) as log_file:
                log_content = log_file.read( 1000 )
            # Drop every '/.../' run of non-whitespace, i.e. the directory
            # part of any path mentioned in the log.
            log_msg = re.sub( r'/\S*/', '', log_content )
            log.debug( "Velveth log info %s", log_msg )
            paired_end_reads = re.search( '-(short|long)Paired', log_msg ) is not None
            dataset.metadata.paired_end_reads = paired_end_reads
            long_reads = re.search( '-long', log_msg ) is not None
            dataset.metadata.long_reads = long_reads
            short2_reads = re.search( '-short(Paired)?2', log_msg ) is not None
            dataset.metadata.short2_reads = short2_reads
            dataset.info = re.sub( r'.*velveth \S+', 'hash_length', log_msg.replace( '\n', ' ' ) )
            if paired_end_reads:
                gen_msg = gen_msg + ' Paired-End Reads'
            if long_reads:
                gen_msg = gen_msg + ' Long Reads'
            if len( gen_msg ) > 0:
                gen_msg = 'Uses: ' + gen_msg
        except Exception:
            # Best-effort: the page is still regenerated without the summary.
            log.debug( "Velveth could not read Log file in %s", efp )
        log.debug( "Velveth log info %s", gen_msg )
        rval = ['<html><head><title>Velvet Galaxy Composite Dataset </title></head><p/>']
        rval.append( '<div>Generated:<p/> %s </div>' % ( gen_msg ) )
        rval.append( '<div>Velveth dataset:<p/><ul>' )
        rval.extend( self._composite_links( dataset, 'JJ regenerate_primary_file', skip_log = True ) )
        rval.append( '</ul></div></html>' )
        # open() replaces the Python-2-only file() builtin; the context
        # manager guarantees the page is flushed and closed.
        with open( dataset.file_name, 'w' ) as out:
            out.write( "\n".join( rval ) )
            out.write( '\n' )

    def set_meta( self, dataset, **kwd ):
        """Set the inherited Html metadata, then rebuild the primary file."""
        Html.set_meta( self, dataset, **kwd )
        self.regenerate_primary_file( dataset )
if __name__ == '__main__':
    # Executed as a script: run the doctests found in this module.
    import doctest
    this_module = sys.modules[__name__]
    doctest.testmod( this_module )