/lib/galaxy/datatypes/assembly.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 226 lines · 154 code · 17 blank · 55 comment · 36 complexity · 33b1ec232a7d15035cbf857101d809bd MD5 · raw file

  1. """
  2. velvet datatypes
  3. James E Johnson - University of Minnesota
  4. for velvet assembler tool in galaxy
  5. """
  6. import data
  7. import logging
  8. import os
  9. import re
  10. import sys
  11. from galaxy.datatypes import sequence
  12. from galaxy.datatypes.images import Html
  13. from galaxy.datatypes.metadata import MetadataElement
  14. log = logging.getLogger(__name__)
  class Amos( data.Text ):
      """Class describing the AMOS assembly file (file extension ``afg``)."""
      file_ext = 'afg'

      def sniff( self, filename ):
          """
          Determines whether the file is an amos assembly file format.

          The file is scanned line by line until a line is found that, once
          stripped of surrounding whitespace, consists solely of one of the
          opening tags ``{RED``, ``{CTG`` or ``{TLE``.

          Example::

            {CTG
            iid:1
            eid:1
            seq:
            CCTCTCCTGTAGAGTTCAACCGA-GCCGGTAGAGTTTTATCA
            .
            qlt:
            DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD
            .
            {TLE
            src:1027
            off:0
            clr:618,0
            gap:
            250 612
            .
            }
            }

          :param filename: path of the file to inspect
          :returns: True as soon as a matching tag line is seen; False when no
                    line matches or the file cannot be read
          """
          # FIXME: when no tag line is present this method will read the entire file.
          # It should call get_headers() like other sniff methods.
          try:
              # 'with' releases the handle on every exit path, including the
              # early return and any read error.
              with open( filename ) as fh:
                  for line in fh:
                      # The pattern itself requires the leading '{', so blank
                      # and non-tag lines simply fail to match.
                      if re.match( r'{(RED|CTG|TLE)$', line.strip() ):
                          return True
          except Exception as e:
              # Best effort: a file that cannot be read is reported as "not
              # AMOS".  Catching Exception (rather than a bare except) lets
              # KeyboardInterrupt and SystemExit propagate.
              log.debug( "Amos sniff could not read %s: %s", filename, e )
          return False
  class Sequences( sequence.Fasta ):
      """Class describing the Sequences file generated by velveth """

      def sniff( self, filename ):
          """
          Determines whether the file is a velveth produced fasta format

          The id line has 3 fields separated by tabs: sequence_name sequence_index category::

            >SEQUENCE_0_length_35 1 1
            GGATATAGGGCCAACCCAACTCAACGGCCTGTCTT
            >SEQUENCE_1_length_35 2 1
            CGACGAATGACAGGTCACGAATTTGGCGGGGATTA

          Leading blank lines are skipped.  Only the first non-empty line and
          the line directly after it are examined.

          :param filename: path of the file to inspect
          :returns: True when the first non-empty line is a tab separated
                    velveth id line and the following line is a non-empty
                    line that is not another header; False otherwise, or
                    when the file cannot be read
          """
          try:
              # 'with' closes the handle on every return below; the original
              # success path returned before reaching its close() call.
              with open( filename ) as fh:
                  # Use a single iterator for both the loop and the look-ahead
                  # so that iteration and explicit reads are never mixed.
                  lines = iter( fh )
                  for raw in lines:
                      header = raw.strip()
                      if not header:
                          continue  # still looking for the first non-empty line
                      # The pattern starts with '>', so a non-fasta first line
                      # is rejected here as well.
                      if not re.match( r'>[^\t]+\t\d+\t\d+$', header ):
                          return False
                      # The next line.strip() must not be '', nor startwith '>'
                      following = next( lines, '' ).strip()
                      return following != '' and not following.startswith( '>' )
          except Exception as e:
              # Best effort: an unreadable file is reported as "not velveth
              # Sequences".  Catching Exception (rather than a bare except)
              # lets KeyboardInterrupt and SystemExit propagate.
              log.debug( "Sequences sniff could not read %s: %s", filename, e )
          return False
  class Roadmaps( data.Text ):
      """Class describing the Roadmaps file generated by velveth """

      def sniff( self, filename ):
          """
          Determines whether the file is a velveth produced RoadMap::

            142858 21 1
            ROADMAP 1
            ROADMAP 2
            ...

          The three numbers on the first line must be separated by tabs.
          Only the first two lines of the file are examined; a blank first
          line is NOT skipped and makes the check fail.

          :param filename: path of the file to inspect
          :returns: True when line one is three tab separated integers and
                    line two reads ``ROADMAP <number>``; False otherwise, or
                    when the file cannot be read
          """
          try:
              # 'with' closes the handle on every return below; the original
              # success path returned before reaching its close() call.
              with open( filename ) as fh:
                  # An empty file yields '' here, which fails the match just
                  # like a blank or malformed first line.
                  first = fh.readline().strip()
                  if not re.match( r'\d+\t\d+\t\d+$', first ):
                      return False
                  # The next line.strip() should be 'ROADMAP 1'
                  second = fh.readline().strip()
                  return re.match( r'ROADMAP \d+$', second ) is not None
          except Exception as e:
              # Best effort: an unreadable file is reported as "not a velveth
              # Roadmaps file".  Catching Exception (rather than a bare
              # except) lets KeyboardInterrupt and SystemExit propagate.
              log.debug( "Roadmaps sniff could not read %s: %s", filename, e )
          return False
  class Velvet( Html ):
      """
      Composite HTML datatype for a velveth dataset.

      The primary file is an HTML index.  The composite members registered in
      __init__ are 'Sequences', 'Roadmaps' and an optional 'Log'.
      """
      # NOTE(review): the three flag elements below default to the *string*
      # "False" (which is truthy in Python) rather than the boolean False.
      # Left untouched here -- confirm what consumers of this metadata expect.
      MetadataElement( name="base_name", desc="base name for velveth dataset", default="velvet", readonly=True, set_in_upload=True)
      MetadataElement( name="paired_end_reads", desc="has paired-end reads", default="False", readonly=False, set_in_upload=True)
      MetadataElement( name="long_reads", desc="has long reads", default="False", readonly=False, set_in_upload=True)
      MetadataElement( name="short2_reads", desc="has 2nd short reads", default="False", readonly=False, set_in_upload=True)
      composite_type = 'auto_primary_file'
      allow_datatype_change = False
      file_ext = 'html'

      def __init__( self, **kwd ):
          """Register the composite members that make up a velveth dataset."""
          Html.__init__( self, **kwd )
          self.add_composite_file( 'Sequences', mimetype = 'text/html', description = 'Sequences', substitute_name_with_metadata = None, is_binary = False )
          self.add_composite_file( 'Roadmaps', mimetype = 'text/html', description = 'Roadmaps', substitute_name_with_metadata = None, is_binary = False )
          # NOTE(review): 'optional' is passed as the string 'True', not the
          # boolean -- left untouched; confirm what add_composite_file expects.
          self.add_composite_file( 'Log', mimetype = 'text/html', description = 'Log', optional = 'True', substitute_name_with_metadata = None, is_binary = False )

      def _composite_file_items( self, dataset, caller, skip_log = False ):
          """
          Build the ``<li>`` link entries for the composite members of a dataset.

          :param dataset: dataset handed on to get_composite_files()
          :param caller: name of the calling method, used only in debug logging
          :param skip_log: when True, members whose name contains 'Log' are
                           left out of the result
          :returns: list of HTML ``<li>`` strings, one per listed member
          """
          items = []
          for composite_name, composite_file in self.get_composite_files( dataset = dataset ).iteritems():
              fn = composite_name
              log.debug( "Velvet log info %s %s %s", 'JJ ' + caller, fn, composite_file )
              if skip_log and re.search( 'Log', fn ) is not None:
                  continue
              opt_text = ''
              if composite_file.optional:
                  opt_text = ' (optional)'
              if composite_file.get('description'):
                  items.append( '<li><a href="%s" type="text/plain">%s (%s)</a>%s</li>' % ( fn, fn, composite_file.get('description'), opt_text ) )
              else:
                  items.append( '<li><a href="%s" type="text/plain">%s</a>%s</li>' % ( fn, fn, opt_text ) )
          return items

      def generate_primary_file( self, dataset = None ):
          """
          Return the HTML index that lists every composite member.

          :param dataset: dataset handed on to get_composite_files()
          :returns: the HTML document as a single newline-joined string
          """
          log.debug( "Velvet log info %s %s", 'JJ generate_primary_file', dataset )
          rval = ['<html><head><title>Velvet Galaxy Composite Dataset </title></head><p/>']
          rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
          rval.extend( self._composite_file_items( dataset, 'generate_primary_file' ) )
          rval.append( '</ul></div></html>' )
          return "\n".join( rval )

      def regenerate_primary_file(self,dataset):
          """
          cannot do this until we are setting metadata

          Reads up to the first 1000 characters of the 'Log' member in the
          dataset's extra files path, derives the paired_end_reads, long_reads
          and short2_reads metadata flags and dataset.info from it, and then
          rewrites the primary HTML file (dataset.file_name).  The 'Log' member
          itself is not listed in the rewritten index.

          If the Log cannot be read or processed, the metadata is left alone
          and the index is still rewritten with an empty "Generated" section.

          :param dataset: dataset whose metadata and primary file are updated
          """
          log.debug( "Velvet log info %s", 'JJ regenerate_primary_file' )
          gen_msg = ''
          # Bound before the try so the except handler can always report it,
          # even when reading dataset.extra_files_path is what failed.
          efp = None
          try:
              efp = dataset.extra_files_path
              log_path = os.path.join(efp,'Log')
              with open( log_path, 'r' ) as f:
                  log_content = f.read(1000)
              # Drop anything that looks like a directory prefix (/.../).
              log_msg = re.sub( r'/\S*/', '', log_content )
              log.debug( "Velveth log info %s", log_msg )
              paired_end_reads = re.search( r'-(short|long)Paired', log_msg ) is not None
              dataset.metadata.paired_end_reads = paired_end_reads
              # Note: '-long' also matches inside '-longPaired'.
              long_reads = re.search( r'-long', log_msg ) is not None
              dataset.metadata.long_reads = long_reads
              short2_reads = re.search( r'-short(Paired)?2', log_msg ) is not None
              dataset.metadata.short2_reads = short2_reads
              dataset.info = re.sub( r'.*velveth \S+', 'hash_length', log_msg.replace( '\n', ' ' ) )
              if paired_end_reads:
                  gen_msg = gen_msg + ' Paired-End Reads'
              if long_reads:
                  gen_msg = gen_msg + ' Long Reads'
              if len(gen_msg) > 0:
                  gen_msg = 'Uses: ' + gen_msg
          except Exception:
              # Best effort: the index is still regenerated without the
              # "Uses:" summary.  Catching Exception (rather than a bare
              # except) lets KeyboardInterrupt and SystemExit propagate.
              log.debug( "Velveth could not read Log file in %s", efp )
          log.debug( "Velveth log info %s", gen_msg )
          rval = ['<html><head><title>Velvet Galaxy Composite Dataset </title></head><p/>']
          rval.append('<div>Generated:<p/> %s </div>' %(gen_msg))
          rval.append('<div>Velveth dataset:<p/><ul>')
          rval.extend( self._composite_file_items( dataset, 'regenerate_primary_file', skip_log = True ) )
          rval.append( '</ul></div></html>' )
          # open() replaces the Python 2-only file() builtin; 'with' closes the
          # handle even if a write fails.
          with open( dataset.file_name, 'w' ) as f:
              f.write("\n".join( rval ))
              f.write('\n')

      def set_meta( self, dataset, **kwd ):
          """Set the inherited Html metadata, then rebuild the primary file from the Log."""
          Html.set_meta( self, dataset, **kwd )
          self.regenerate_primary_file(dataset)
  # Run this module's doctests when the file is executed as a script.
  if __name__ == '__main__':
      import doctest
      this_module = sys.modules[__name__]
      doctest.testmod( this_module )