assembly.py - This is a Python script that defines a class …

/lib/galaxy/datatypes/assembly.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 226 lines · 154 code · 17 blank · 55 comment · 36 complexity · 33b1ec232a7d15035cbf857101d809bd MD5 · raw file


"""
velvet datatypes
James E Johnson - University of Minnesota
for velvet assembler tool in galaxy
"""

import data
import logging
import os
import re
import sys
from galaxy.datatypes import sequence
from galaxy.datatypes.images import Html
from galaxy.datatypes.metadata import MetadataElement


log = logging.getLogger(__name__)

class Amos( data.Text ):
    """Class describing the AMOS assembly file """
    file_ext = 'afg'

    def sniff( self, filename ):
        """
        Determines whether the file is an amos assembly file format

        The file is scanned line by line and accepted as soon as one line,
        with surrounding whitespace stripped, is exactly ``{RED``, ``{CTG``
        or ``{TLE``.  Returns True on such a match and False when the end of
        the file is reached without one, or when the file cannot be read.

        Example::

          {CTG
          iid:1
          eid:1
          seq:
          CCTCTCCTGTAGAGTTCAACCGA-GCCGGTAGAGTTTTATCA
          .
          qlt:
          DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD
          .
          {TLE
          src:1027
          off:0
          clr:618,0
          gap:
          250 612
          .
          }
          }
        """
        # FIXME: when no record marker is present this still scans the
        # entire file.  It should call get_headers() like other sniff methods.
        try:
            # 'with' guarantees the handle is closed on every exit path,
            # including the early return and any read error.
            with open( filename ) as fh:
                for line in fh:
                    # The pattern is anchored on '{', so blank lines and
                    # lines not starting with '{' simply fail to match.
                    if re.match( r'{(RED|CTG|TLE)$', line.strip() ):
                        return True
        except ( IOError, OSError, UnicodeDecodeError ):
            # An unreadable or undecodable file is simply "not AMOS".
            pass
        return False

class Sequences( sequence.Fasta ):
    """Class describing the Sequences file generated by velveth """

    def sniff( self, filename ):
        """
        Determines whether the file is a velveth produced fasta format
        The id line has 3 fields separated by tabs: sequence_name  sequence_index category::

          >SEQUENCE_0_length_35   1       1
          GGATATAGGGCCAACCCAACTCAACGGCCTGTCTT
          >SEQUENCE_1_length_35   2       1
          CGACGAATGACAGGTCACGAATTTGGCGGGGATTA

        Only the first non-empty line and the line directly after it are
        examined.  Returns True when the first non-empty line is a header of
        the form above and the following line is non-blank and is not another
        header; returns False otherwise, including when the file cannot be read.
        """
        try:
            # 'with' closes the handle on every exit path; the original left
            # it open whenever it returned True or an exception was raised.
            with open( filename ) as fh:
                # Drive the loop through readline() (not file iteration) so
                # the explicit readline() below can be mixed in safely.
                for line in iter( fh.readline, '' ):
                    line = line.strip()
                    if not line:
                        continue  # skip leading blank lines
                    # First non-empty line: the pattern is anchored on '>',
                    # so this also rejects anything that is not a header.
                    if not re.match( r'>[^\t]+\t\d+\t\d+$', line ):
                        return False
                    # The next line must be sequence data: neither blank
                    # (which also covers EOF) nor another header.
                    following = fh.readline().strip()
                    return following != '' and not following.startswith( '>' )
        except ( IOError, OSError, UnicodeDecodeError ):
            # An unreadable or undecodable file is simply "not a match".
            pass
        return False

class Roadmaps( data.Text ):
    """Class describing the Roadmaps file generated by velveth """

    def sniff( self, filename ):
        """
        Determines whether the file is a velveth produced RoadMap::
          142858  21      1
          ROADMAP 1
          ROADMAP 2
          ...

        Only the first non-empty line and the line directly after it are
        examined.  Returns True when the first non-empty line consists of
        three tab-separated integers and the following line is
        ``ROADMAP <number>``; returns False otherwise, including when the
        file cannot be read.
        """
        try:
            # 'with' closes the handle on every exit path; the original left
            # it open whenever it returned True or an exception was raised.
            with open( filename ) as fh:
                # Drive the loop through readline() (not file iteration) so
                # the explicit readline() below can be mixed in safely.
                for line in iter( fh.readline, '' ):
                    line = line.strip()
                    if not line:
                        # Skip leading blank lines, as Sequences.sniff does.
                        # Previously a blank first line rejected the file,
                        # contradicting the "first non-empty line" intent.
                        continue
                    # First non-empty line: the three-integer header
                    if not re.match( r'\d+\t\d+\t\d+$', line ):
                        return False
                    # The next line.strip() should be 'ROADMAP 1'
                    following = fh.readline().strip()
                    return re.match( r'ROADMAP \d+$', following ) is not None
        except ( IOError, OSError, UnicodeDecodeError ):
            # An unreadable or undecodable file is simply "not a match".
            pass
        return False

class Velvet( Html ):
    """
    Composite datatype for a velveth output directory.

    The primary file is an HTML page listing the composite files registered
    in __init__ (Sequences, Roadmaps and an optional Log).
    """
    # NOTE(review): the three read-type defaults below are the *string*
    # "False", which is truthy in Python - confirm that is what consumers of
    # this metadata expect.
    MetadataElement( name="base_name", desc="base name for velveth dataset", default="velvet", readonly=True, set_in_upload=True)
    MetadataElement( name="paired_end_reads", desc="has paired-end reads", default="False", readonly=False, set_in_upload=True)
    MetadataElement( name="long_reads", desc="has long reads", default="False", readonly=False, set_in_upload=True)
    MetadataElement( name="short2_reads", desc="has 2nd short reads", default="False", readonly=False, set_in_upload=True)
    composite_type = 'auto_primary_file'
    allow_datatype_change = False
    file_ext = 'html'

    def __init__( self, **kwd ):
        """Initialise the Html base class and register the composite files."""
        Html.__init__( self, **kwd )
        self.add_composite_file( 'Sequences', mimetype = 'text/html', description = 'Sequences', substitute_name_with_metadata = None, is_binary = False )
        self.add_composite_file( 'Roadmaps', mimetype = 'text/html', description = 'Roadmaps', substitute_name_with_metadata = None, is_binary = False )
        # NOTE(review): optional is passed as the string 'True' (truthy), not a bool.
        self.add_composite_file( 'Log', mimetype = 'text/html', description = 'Log', optional = 'True', substitute_name_with_metadata = None, is_binary = False )

    def _composite_file_items( self, dataset, caller, skip_log = False ):
        """
        Return the ``<li>`` lines linking to each composite file of *dataset*.

        caller   -- label written to the debug log for each file
        skip_log -- when True, files whose name contains 'Log' are omitted
        """
        items = []
        for fn, composite_file in self.get_composite_files( dataset = dataset ).items():
            log.debug( "Velvet log info  %s %s %s" % ( caller, fn, composite_file ) )
            if skip_log and 'Log' in fn:
                continue
            opt_text = ' (optional)' if composite_file.optional else ''
            description = composite_file.get( 'description' )
            if description:
                items.append( '<li><a href="%s" type="text/plain">%s (%s)</a>%s</li>' % ( fn, fn, description, opt_text ) )
            else:
                items.append( '<li><a href="%s" type="text/plain">%s</a>%s</li>' % ( fn, fn, opt_text ) )
        return items

    def generate_primary_file( self, dataset = None ):
        """Return the HTML text of a primary file listing every composite file."""
        log.debug( "Velvet log info  %s %s" % ('JJ generate_primary_file',dataset))
        rval = ['<html><head><title>Velvet Galaxy Composite Dataset </title></head><p/>']
        rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
        rval.extend( self._composite_file_items( dataset, 'JJ generate_primary_file' ) )
        rval.append( '</ul></div></html>' )
        return "\n".join( rval )

    def regenerate_primary_file(self,dataset):
        """
        cannot do this until we are setting metadata

        Reads up to the first 1000 characters of the 'Log' file in the
        dataset's extra files path, derives the paired_end_reads, long_reads
        and short2_reads metadata and dataset.info from it, then rewrites
        dataset.file_name as an HTML page listing the non-Log composite
        files.  If the Log cannot be processed, the failure is logged at
        debug level and the page is still written, with an empty summary.
        """
        log.debug( "Velvet log info  %s" % 'JJ regenerate_primary_file')
        gen_msg = ''
        # Bound before the try so the except handler can always reference it
        # (previously a failure on extra_files_path raised NameError there).
        efp = None
        try:
            efp = dataset.extra_files_path
            log_path = os.path.join(efp,'Log')
            with open(log_path,'r') as f:
                log_content = f.read(1000)
            # Drop directory portions of paths (non-whitespace runs between slashes)
            log_msg = re.sub(r'/\S*/','',log_content)
            log.debug( "Velveth log info  %s" % log_msg)
            paired_end_reads = re.search('-(short|long)Paired', log_msg) is not None
            dataset.metadata.paired_end_reads = paired_end_reads
            # Note: '-long' also matches '-longPaired'
            long_reads = re.search('-long', log_msg) is not None
            dataset.metadata.long_reads = long_reads
            short2_reads = re.search('-short(Paired)?2', log_msg) is not None
            dataset.metadata.short2_reads = short2_reads
            # Flatten to one line, then replace everything up to and including
            # "velveth <first argument>" with the literal 'hash_length'
            dataset.info = re.sub(r'.*velveth \S+','hash_length',re.sub('\n',' ',log_msg))
            if paired_end_reads:
                gen_msg = gen_msg + ' Paired-End Reads'
            if long_reads:
                gen_msg = gen_msg + ' Long Reads'
            if len(gen_msg) > 0:
                gen_msg = 'Uses: ' + gen_msg
        except Exception:
            # Best effort: a missing or unreadable Log must not prevent the
            # primary file from being regenerated.
            log.debug( "Velveth could not read Log file in %s" % efp)
        log.debug( "Velveth log info  %s" % gen_msg)
        rval = ['<html><head><title>Velvet Galaxy Composite Dataset </title></head><p/>']
        rval.append('<div>Generated:<p/> %s </div>' %(gen_msg))
        rval.append('<div>Velveth dataset:<p/><ul>')
        rval.extend( self._composite_file_items( dataset, 'JJ regenerate_primary_file', skip_log = True ) )
        rval.append( '</ul></div></html>' )
        # open() instead of the Python-2-only file() builtin; 'with' closes
        # the handle even if a write fails.
        with open(dataset.file_name,'w') as f:
            f.write("\n".join( rval ))
            f.write('\n')

    def set_meta( self, dataset, **kwd ):
        """Run Html.set_meta, then rebuild the primary file from the velveth Log."""
        Html.set_meta( self, dataset, **kwd )
        self.regenerate_primary_file(dataset)

if __name__ == '__main__':
    # Executed as a script: run the doctests found in this module's docstrings.
    from doctest import testmod
    this_module = sys.modules[__name__]
    testmod(this_module)

Summary ✨

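This is a Python module that defines Galaxy datatypes for the Velvet assembler: Amos (AMOS assembly files), Sequences and Roadmaps (files produced by velveth), and Velvet, a composite datatype that inherits from Galaxy's Html class. Velvet has several methods, including __init__, generate_primary_file, regenerate_primary_file, and set_meta. The __init__ method registers the Sequences, Roadmaps, and optional Log composite files; generate_primary_file returns an HTML listing of those files; regenerate_primary_file reads the velveth Log, updates the dataset's metadata, and rewrites the primary HTML file; set_meta calls the parent class's set_meta and then regenerate_primary_file.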

Tech Fingerprint

Alerts (20)

'def' Ensure functions have docstrings for documentation
23 152 220
'open(' Use 'with open()' to ensure Files are properly closed
51 116 178
Complexity hotspot; lines 57 to 59 (total complexity: 3)
57 58 59
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
62 99 133 196
'try:' Ensure try blocks have corresponding except or finally blocks
80 115 175
Complexity hotspot; lines 87 to 89 (total complexity: 3)
87 88 89
'== None' Use 'is' for None comparisons (e.g., x is None)
206