/lib/galaxy_utils/sequence/fastq.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 752 lines · 694 code · 26 blank · 32 comment · 191 complexity · 1272cbccc204b9274f9767aeb80058c9 MD5 · raw file

  1. ##Dan Blankenberg
  2. import math
  3. import string
  4. import transform
  5. from sequence import SequencingRead
  6. from fasta import fastaSequence
  7. class fastqSequencingRead( SequencingRead ):
  8. format = 'sanger' #sanger is default
  9. ascii_min = 33
  10. ascii_max = 126
  11. quality_min = 0
  12. quality_max = 93
  13. score_system = 'phred' #phred or solexa
  14. sequence_space = 'base' #base or color
  15. @classmethod
  16. def get_class_by_format( cls, format ):
  17. assert format in FASTQ_FORMATS, 'Unknown format type specified: %s' % format
  18. return FASTQ_FORMATS[ format ]
  19. @classmethod
  20. def convert_score_phred_to_solexa( cls, decimal_score_list ):
  21. def phred_to_solexa( score ):
  22. if score <= 0: #can't take log10( 1 - 1 ); make <= 0 into -5
  23. return -5
  24. return int( round( 10.0 * math.log10( math.pow( 10.0, ( float( score ) / 10.0 ) ) - 1.0 ) ) )
  25. return map( phred_to_solexa, decimal_score_list )
  26. @classmethod
  27. def convert_score_solexa_to_phred( cls, decimal_score_list ):
  28. def solexa_to_phred( score ):
  29. return int( round( 10.0 * math.log10( math.pow( 10.0, ( float( score ) / 10.0 ) ) + 1.0 ) ) )
  30. return map( solexa_to_phred, decimal_score_list )
  31. @classmethod
  32. def restrict_scores_to_valid_range( cls, decimal_score_list ):
  33. def restrict_score( score ):
  34. return max( min( score, cls.quality_max ), cls.quality_min )
  35. return map( restrict_score, decimal_score_list )
  36. @classmethod
  37. def transform_scores_to_valid_range( cls, decimal_score_list):
  38. cls_quality_max = cls.quality_max
  39. cls_quality_min = cls.quality_min
  40. for i in range( len( decimal_score_list ) ):
  41. score = decimal_score_list[i]
  42. if(score > cls_quality_max):
  43. transformed_score = cls_quality_max
  44. elif( score < cls_quality_min ):
  45. transformed_score = cls_quality_min
  46. else:
  47. transformed_score = score
  48. decimal_score_list[i] = str(transformed_score)
  49. @classmethod
  50. def transform_scores_to_valid_range_ascii( cls, decimal_score_list ):
  51. cls_quality_max = cls.quality_max
  52. cls_quality_min = cls.quality_min
  53. to_quality = cls.ascii_min - cls.quality_min
  54. for i in range( len( decimal_score_list ) ):
  55. score = decimal_score_list[i]
  56. if(score > cls_quality_max):
  57. transformed_score = cls_quality_max
  58. elif( score < cls_quality_min ):
  59. transformed_score = cls_quality_min
  60. else:
  61. transformed_score = score
  62. transformed_score = chr(transformed_score + to_quality)
  63. decimal_score_list[i] = transformed_score
  64. @classmethod
  65. def convert_base_to_color_space( cls, sequence ):
  66. return cls.color_space_converter.to_color_space( sequence )
  67. @classmethod
  68. def convert_color_to_base_space( cls, sequence ):
  69. return cls.color_space_converter.to_base_space( sequence )
  70. def is_ascii_encoded( self ):
  71. #as per fastq definition only decimal quality strings can have spaces (and TABs for our purposes) in them (and must have a trailing space)
  72. if ' ' in self.quality:
  73. return False
  74. if '\t' in self.quality:
  75. return False
  76. return True
  77. def get_ascii_quality_scores( self ):
  78. if self.is_ascii_encoded():
  79. return list( self.quality )
  80. else:
  81. quality = self.quality.rstrip() #decimal scores should have a trailing space
  82. if quality:
  83. try:
  84. to_quality = self.ascii_min - self.quality_min
  85. return [ chr( int( val ) + to_quality ) for val in quality.split() ]
  86. except ValueError, e:
  87. raise ValueError( 'Error Parsing quality String. ASCII quality strings cannot contain spaces (%s): %s' % ( self.quality, e ) )
  88. else:
  89. return []
  90. def get_ascii_quality_scores_len( self ):
  91. """
  92. Compute ascii quality score length, without generating relatively
  93. expensive qualty score array.
  94. """
  95. if self.is_ascii_encoded():
  96. return len( self.quality )
  97. else:
  98. quality = self.quality.rstrip()
  99. if quality:
  100. try:
  101. return len( quality.split() )
  102. except ValueError, e:
  103. raise ValueError( 'Error Parsing quality String. ASCII quality strings cannot contain spaces (%s): %s' % ( self.quality, e ) )
  104. else:
  105. return 0
  106. def get_decimal_quality_scores( self ):
  107. return self.__get_decimal_quality_scores(self.is_ascii_encoded())
  108. def __get_decimal_quality_scores( self, ascii ):
  109. if ascii:
  110. to_quality = self.quality_min - self.ascii_min
  111. return [ ord( val ) + to_quality for val in self.quality ]
  112. else:
  113. quality = self.quality.rstrip() #decimal scores should have a trailing space
  114. if quality:
  115. return [ int( val ) for val in quality.split() if val.strip() ]
  116. else:
  117. return []
  118. def convert_read_to_format( self, format, force_quality_encoding = None ):
  119. assert format in FASTQ_FORMATS, 'Unknown format type specified: %s' % format
  120. assert force_quality_encoding in [ None, 'ascii', 'decimal' ], 'Invalid force_quality_encoding: %s' % force_quality_encoding
  121. new_class = FASTQ_FORMATS[ format ]
  122. new_read = new_class()
  123. new_read.identifier = self.identifier
  124. if self.sequence_space == new_class.sequence_space:
  125. new_read.sequence = self.sequence
  126. else:
  127. if self.sequence_space == 'base':
  128. new_read.sequence = self.convert_base_to_color_space( self.sequence )
  129. else:
  130. new_read.sequence = self.convert_color_to_base_space( self.sequence )
  131. new_read.description = self.description
  132. is_ascii = self.is_ascii_encoded()
  133. if self.score_system != new_read.score_system:
  134. if self.score_system == 'phred':
  135. score_list = self.convert_score_phred_to_solexa( self.__get_decimal_quality_scores(is_ascii) )
  136. else:
  137. score_list = self.convert_score_solexa_to_phred( self.__get_decimal_quality_scores(is_ascii) )
  138. else:
  139. score_list = self.__get_decimal_quality_scores(is_ascii)
  140. if force_quality_encoding is None:
  141. if is_ascii:
  142. new_encoding = 'ascii'
  143. else:
  144. new_encoding = 'decimal'
  145. else:
  146. new_encoding = force_quality_encoding
  147. if new_encoding == 'ascii':
  148. new_class.transform_scores_to_valid_range_ascii( score_list )
  149. new_read.quality = "".join( score_list )
  150. else: # decimal
  151. new_class.transform_scores_to_valid_range( score_list )
  152. new_read.quality = "%s " % " ".join( score_list ) #need trailing space to be valid decimal fastq
  153. return new_read
  154. def get_sequence( self ):
  155. return self.sequence
  156. def slice( self, left_column_offset, right_column_offset ):
  157. new_read = fastqSequencingRead.get_class_by_format( self.format )()
  158. new_read.identifier = self.identifier
  159. new_read.sequence = self.get_sequence()[left_column_offset:right_column_offset]
  160. new_read.description = self.description
  161. if self.is_ascii_encoded():
  162. new_read.quality = self.quality[left_column_offset:right_column_offset]
  163. else:
  164. quality = map( str, self.get_decimal_quality_scores()[left_column_offset:right_column_offset] )
  165. if quality:
  166. new_read.quality = "%s " % " ".join( quality )
  167. else:
  168. new_read.quality = ''
  169. return new_read
  170. def is_valid_format( self ):
  171. if self.is_ascii_encoded():
  172. for val in self.get_ascii_quality_scores():
  173. val = ord( val )
  174. if val < self.ascii_min or val > self.ascii_max:
  175. return False
  176. else:
  177. for val in self.get_decimal_quality_scores():
  178. if val < self.quality_min or val > self.quality_max:
  179. return False
  180. if not self.is_valid_sequence():
  181. return False
  182. return True
  183. def is_valid_sequence( self ):
  184. for base in self.get_sequence():
  185. if base not in self.valid_sequence_list:
  186. return False
  187. return True
  188. def insufficient_quality_length( self ):
  189. return self.get_ascii_quality_scores_len() < len( self.sequence )
  190. def assert_sequence_quality_lengths( self ):
  191. qual_len = self.get_ascii_quality_scores_len()
  192. seq_len = len( self.sequence )
  193. assert qual_len == seq_len, "Invalid FASTQ file: quality score length (%i) does not match sequence length (%i)" % ( qual_len, seq_len )
  194. def reverse( self, clone = True ):
  195. #need to override how decimal quality scores are reversed
  196. if clone:
  197. rval = self.clone()
  198. else:
  199. rval = self
  200. rval.sequence = transform.reverse( self.sequence )
  201. if rval.is_ascii_encoded():
  202. rval.quality = rval.quality[::-1]
  203. else:
  204. rval.quality = reversed( rval.get_decimal_quality_scores() )
  205. rval.quality = "%s " % " ".join( map( str, rval.quality ) )
  206. return rval
  207. def apply_galaxy_conventions( self ):
  208. pass
  209. class fastqSangerRead( fastqSequencingRead ):
  210. format = 'sanger'
  211. ascii_min = 33
  212. ascii_max = 126
  213. quality_min = 0
  214. quality_max = 93
  215. score_system = 'phred'
  216. sequence_space = 'base'
  217. class fastqIlluminaRead( fastqSequencingRead ):
  218. format = 'illumina'
  219. ascii_min = 64
  220. ascii_max = 126
  221. quality_min = 0
  222. quality_max = 62
  223. score_system = 'phred'
  224. sequence_space = 'base'
  225. class fastqSolexaRead( fastqSequencingRead ):
  226. format = 'solexa'
  227. ascii_min = 59
  228. ascii_max = 126
  229. quality_min = -5
  230. quality_max = 62
  231. score_system = 'solexa'
  232. sequence_space = 'base'
  233. class fastqCSSangerRead( fastqSequencingRead ):
  234. format = 'cssanger' #color space
  235. ascii_min = 33
  236. ascii_max = 126
  237. quality_min = 0
  238. quality_max = 93
  239. score_system = 'phred'
  240. sequence_space = 'color'
  241. valid_sequence_list = map( str, range( 7 ) ) + [ '.' ]
  242. def __len__( self ):
  243. if self.has_adapter_base(): #Adapter base is not counted in length of read
  244. return len( self.sequence ) - 1
  245. return fastqSequencingRead.__len__( self )
  246. def has_adapter_base( self ):
  247. if self.sequence and self.sequence[0] in string.letters: #adapter base must be a letter
  248. return True
  249. return False
  250. def insufficient_quality_length( self ):
  251. if self.has_adapter_base():
  252. return self.get_ascii_quality_scores_len() + 1 < len( self.sequence )
  253. return fastqSequencingRead.insufficient_quality_length( self )
  254. def assert_sequence_quality_lengths( self ):
  255. if self.has_adapter_base():
  256. qual_len = self.get_ascii_quality_scores_len()
  257. seq_len = len( self.sequence )
  258. assert ( qual_len + 1 == seq_len ) or ( qual_len == seq_len ), "Invalid FASTQ file: quality score length (%i) does not match sequence length (%i with adapter base)" % ( qual_len, seq_len ) #SRA adds FAKE/DUMMY quality scores to the adapter base, we'll allow the reading of the Improper score here, but remove it in the Reader when "apply_galaxy_conventions" is set to True
  259. else:
  260. return fastqSequencingRead.assert_sequence_quality_lengths( self )
  261. def get_sequence( self ):
  262. if self.has_adapter_base():
  263. return self.sequence[1:]
  264. return self.sequence
  265. def reverse( self, clone = True ):
  266. #need to override how color space is reversed
  267. if clone:
  268. rval = self.clone()
  269. else:
  270. rval = self
  271. if rval.has_adapter_base():
  272. adapter = rval.sequence[0]
  273. #sequence = rval.sequence[1:]
  274. rval.sequence = self.color_space_converter.to_color_space( transform.reverse( self.color_space_converter.to_base_space( rval.sequence ) ), adapter_base = adapter )
  275. else:
  276. rval.sequence = transform.reverse( rval.sequence )
  277. if rval.is_ascii_encoded():
  278. rval.quality = rval.quality[::-1]
  279. else:
  280. rval.quality = reversed( rval.get_decimal_quality_scores() )
  281. rval.quality = "%s " % " ".join( map( str, rval.quality ) )
  282. return rval
  283. def complement( self, clone = True ):
  284. #need to override how color space is complemented
  285. if clone:
  286. rval = self.clone()
  287. else:
  288. rval = self
  289. if rval.has_adapter_base(): #No adapter, color space stays the same
  290. adapter = rval.sequence[0]
  291. sequence = rval.sequence[1:]
  292. if adapter.lower() != 'u':
  293. adapter = transform.DNA_complement( adapter )
  294. else:
  295. adapter = transform.RNA_complement( adapter )
  296. rval.sequence = "%s%s" % ( adapter, sequence )
  297. return rval
  298. def change_adapter( self, new_adapter, clone = True ):
  299. #if new_adapter is empty, remove adapter, otherwise replace with new_adapter
  300. if clone:
  301. rval = self.clone()
  302. else:
  303. rval = self
  304. if rval.has_adapter_base():
  305. if new_adapter:
  306. if new_adapter != rval.sequence[0]:
  307. rval.sequence = rval.color_space_converter.to_color_space( rval.color_space_converter.to_base_space( rval.sequence ), adapter_base = new_adapter )
  308. else:
  309. rval.sequence = rval.sequence[1:]
  310. elif new_adapter:
  311. rval.sequence = "%s%s" % ( new_adapter, rval.sequence )
  312. return rval
  313. def apply_galaxy_conventions( self ):
  314. if self.has_adapter_base() and len( self.sequence ) == len( self.get_ascii_quality_scores() ): #SRA adds FAKE/DUMMY quality scores to the adapter base, we remove them here
  315. if self.is_ascii_encoded():
  316. self.quality = self.quality[1:]
  317. else:
  318. self.quality = " ".join( map( str, self.get_decimal_quality_scores()[1:] ) )
  319. FASTQ_FORMATS = {}
  320. for format in [ fastqIlluminaRead, fastqSolexaRead, fastqSangerRead, fastqCSSangerRead ]:
  321. FASTQ_FORMATS[ format.format ] = format
  322. class fastqAggregator( object ):
  323. VALID_FORMATS = FASTQ_FORMATS.keys()
  324. def __init__( self, ):
  325. self.ascii_values_used = [] #quick lookup of all ascii chars used
  326. self.seq_lens = {} #counts of seqs by read len
  327. self.nuc_index_quality = [] #counts of scores by read column
  328. self.nuc_index_base = [] #counts of bases by read column
  329. def consume_read( self, fastq_read ):
  330. #ascii values used
  331. for val in fastq_read.get_ascii_quality_scores():
  332. if val not in self.ascii_values_used:
  333. self.ascii_values_used.append( val )
  334. #lengths
  335. seq_len = len( fastq_read )
  336. self.seq_lens[ seq_len ] = self.seq_lens.get( seq_len, 0 ) + 1
  337. #decimal qualities by column
  338. for i, val in enumerate( fastq_read.get_decimal_quality_scores() ):
  339. if i == len( self.nuc_index_quality ):
  340. self.nuc_index_quality.append( {} )
  341. self.nuc_index_quality[ i ][ val ] = self.nuc_index_quality[ i ].get( val, 0 ) + 1
  342. #bases by column
  343. for i, nuc in enumerate( fastq_read.get_sequence() ):
  344. if i == len( self.nuc_index_base ):
  345. self.nuc_index_base.append( {} )
  346. nuc = nuc.upper()
  347. self.nuc_index_base[ i ][ nuc ] = self.nuc_index_base[ i ].get( nuc, 0 ) + 1
  348. def get_valid_formats( self, check_list = None ):
  349. if not check_list:
  350. check_list = self.VALID_FORMATS
  351. rval = []
  352. sequence = []
  353. for nuc_dict in self.nuc_index_base:
  354. for nuc in nuc_dict.keys():
  355. if nuc not in sequence:
  356. sequence.append( nuc )
  357. sequence = "".join( sequence )
  358. quality = "".join( self.ascii_values_used )
  359. for fastq_format in check_list:
  360. fastq_read = fastqSequencingRead.get_class_by_format( fastq_format )()
  361. fastq_read.quality = quality
  362. fastq_read.sequence = sequence
  363. if fastq_read.is_valid_format():
  364. rval.append( fastq_format )
  365. return rval
  366. def get_ascii_range( self ):
  367. if not self.ascii_values_used:
  368. return None
  369. return ( min( self.ascii_values_used ), max( self.ascii_values_used ) )
  370. def get_decimal_range( self ):
  371. if not self.nuc_index_quality:
  372. return None
  373. decimal_values_used = []
  374. for scores in self.nuc_index_quality:
  375. decimal_values_used.extend( scores.keys() )
  376. return ( min( decimal_values_used ), max( decimal_values_used ) )
  377. def get_length_counts( self ):
  378. return self.seq_lens
  379. def get_max_read_length( self ):
  380. return len( self.nuc_index_quality )
  381. def get_read_count_for_column( self, column ):
  382. if column >= len( self.nuc_index_quality ):
  383. return 0
  384. return sum( self.nuc_index_quality[ column ].values() )
  385. def get_read_count( self ):
  386. return self.get_read_count_for_column( 0 )
  387. def get_base_counts_for_column( self, column ):
  388. return self.nuc_index_base[ column ]
  389. def get_score_list_for_column( self, column ):
  390. return self.nuc_index_quality[ column ].keys()
  391. def get_score_min_for_column( self, column ):
  392. return min( self.nuc_index_quality[ column ].keys() )
  393. def get_score_max_for_column( self, column ):
  394. return max( self.nuc_index_quality[ column ].keys() )
  395. def get_score_sum_for_column( self, column ):
  396. return sum( score * count for score, count in self.nuc_index_quality[ column ].iteritems() )
  397. def get_score_at_position_for_column( self, column, position ):
  398. score_value_dict = self.nuc_index_quality[ column ]
  399. scores = sorted( score_value_dict.keys() )
  400. for score in scores:
  401. if score_value_dict[ score ] <= position:
  402. position -= score_value_dict[ score ]
  403. else:
  404. return score
  405. def get_summary_statistics_for_column( self, i ):
  406. def _get_med_pos( size ):
  407. halfed = int( size / 2 )
  408. if size % 2 == 1:
  409. return [ halfed ]
  410. return[ halfed - 1, halfed ]
  411. read_count = self.get_read_count_for_column( i )
  412. min_score = self.get_score_min_for_column( i )
  413. max_score = self.get_score_max_for_column( i )
  414. sum_score = self.get_score_sum_for_column( i )
  415. mean_score = float( sum_score ) / float( read_count )
  416. #get positions
  417. med_pos = _get_med_pos( read_count )
  418. if 0 in med_pos:
  419. q1_pos = [ 0 ]
  420. q3_pos = [ read_count - 1 ]
  421. else:
  422. q1_pos = _get_med_pos( min( med_pos ) )
  423. q3_pos = []
  424. for pos in q1_pos:
  425. q3_pos.append( max( med_pos ) + 1 + pos )
  426. #get scores at position
  427. med_score = float( sum( [ self.get_score_at_position_for_column( i, pos ) for pos in med_pos ] ) ) / float( len( med_pos ) )
  428. q1 = float( sum( [ self.get_score_at_position_for_column( i, pos ) for pos in q1_pos ] ) ) / float( len( q1_pos ) )
  429. q3 = float( sum( [ self.get_score_at_position_for_column( i, pos ) for pos in q3_pos ] ) ) / float( len( q3_pos ) )
  430. #determine iqr and step
  431. iqr = q3 - q1
  432. step = 1.5 * iqr
  433. #Determine whiskers and outliers
  434. outliers = []
  435. score_list = sorted( self.get_score_list_for_column( i ) )
  436. left_whisker = q1 - step
  437. for score in score_list:
  438. if left_whisker <= score:
  439. left_whisker = score
  440. break
  441. else:
  442. outliers.append( score )
  443. right_whisker = q3 + step
  444. score_list.reverse()
  445. for score in score_list:
  446. if right_whisker >= score:
  447. right_whisker = score
  448. break
  449. else:
  450. outliers.append( score )
  451. column_stats = { 'read_count': read_count,
  452. 'min_score': min_score,
  453. 'max_score': max_score,
  454. 'sum_score': sum_score,
  455. 'mean_score': mean_score,
  456. 'q1': q1,
  457. 'med_score': med_score,
  458. 'q3': q3,
  459. 'iqr': iqr,
  460. 'left_whisker': left_whisker,
  461. 'right_whisker': right_whisker,
  462. 'outliers': outliers }
  463. return column_stats
  464. class fastqReader( object ):
  465. def __init__( self, fh, format = 'sanger', apply_galaxy_conventions = False ):
  466. self.file = fh
  467. self.format = format
  468. self.apply_galaxy_conventions = apply_galaxy_conventions
  469. def close( self ):
  470. return self.file.close()
  471. def next(self):
  472. while True:
  473. fastq_header = self.file.readline()
  474. if not fastq_header:
  475. raise StopIteration
  476. fastq_header = fastq_header.rstrip( '\n\r' )
  477. #remove empty lines, apparently extra new lines at end of file is common?
  478. if fastq_header:
  479. break
  480. assert fastq_header.startswith( '@' ), 'Invalid fastq header: %s' % fastq_header
  481. rval = fastqSequencingRead.get_class_by_format( self.format )()
  482. rval.identifier = fastq_header
  483. while True:
  484. line = self.file.readline()
  485. if not line:
  486. raise Exception( 'Invalid FASTQ file: could not find quality score of sequence identifier %s.' % rval.identifier )
  487. line = line.rstrip( '\n\r' )
  488. if line.startswith( '+' ) and ( len( line ) == 1 or line[1:].startswith( fastq_header[1:] ) ):
  489. rval.description = line
  490. break
  491. rval.append_sequence( line )
  492. while rval.insufficient_quality_length():
  493. line = self.file.readline()
  494. if not line:
  495. break
  496. rval.append_quality( line )
  497. rval.assert_sequence_quality_lengths()
  498. if self.apply_galaxy_conventions:
  499. rval.apply_galaxy_conventions()
  500. return rval
  501. def __iter__( self ):
  502. while True:
  503. yield self.next()
  504. class ReadlineCountFile( object ):
  505. def __init__( self, f ):
  506. self.__file = f
  507. self.readline_count = 0
  508. def readline( self, *args, **kwds ):
  509. self.readline_count += 1
  510. return self.__file.readline( *args, **kwds )
  511. def __getattr__( self, name ):
  512. return getattr( self.__file, name )
  513. class fastqVerboseErrorReader( fastqReader ):
  514. MAX_PRINT_ERROR_BYTES = 1024
  515. def __init__( self, fh, **kwds ):
  516. super( fastqVerboseErrorReader, self ).__init__( ReadlineCountFile( fh ), **kwds )
  517. self.last_good_identifier = None
  518. def next( self ):
  519. last_good_end_offset = self.file.tell()
  520. last_readline_count = self.file.readline_count
  521. try:
  522. block = super( fastqVerboseErrorReader, self ).next()
  523. self.last_good_identifier = block.identifier
  524. return block
  525. except StopIteration, e:
  526. raise e
  527. except Exception, e:
  528. print "There was an error reading your input file. Your input file is likely malformed.\nIt is suggested that you double-check your original input file for errors -- helpful information for this purpose has been provided below.\nHowever, if you think that you have encountered an actual error with this tool, please do tell us by using the bug reporting mechanism.\n\nThe reported error is: '%s'." % e
  529. if self.last_good_identifier is not None:
  530. print "The last valid FASTQ read had an identifier of '%s'." % self.last_good_identifier
  531. else:
  532. print "The error occurred at the start of your file and no valid FASTQ reads were found."
  533. error_offset = self.file.tell()
  534. error_byte_count = error_offset - last_good_end_offset
  535. print_error_bytes = min( self.MAX_PRINT_ERROR_BYTES, error_byte_count )
  536. print "The error in your file occurs between lines '%i' and '%i', which corresponds to byte-offsets '%i' and '%i', and contains the text (%i of %i bytes shown):\n" % ( last_readline_count + 1, self.file.readline_count, last_good_end_offset, error_offset, print_error_bytes, error_byte_count )
  537. self.file.seek( last_good_end_offset )
  538. print self.file.read( print_error_bytes )
  539. raise e
  540. class fastqNamedReader( object ):
  541. def __init__( self, fh, format = 'sanger', apply_galaxy_conventions = False ):
  542. self.file = fh
  543. self.format = format
  544. self.reader = fastqReader( self.file, self.format )
  545. #self.last_offset = self.file.tell()
  546. self.offset_dict = {}
  547. self.eof = False
  548. self.apply_galaxy_conventions = apply_galaxy_conventions
  549. def close( self ):
  550. return self.file.close()
  551. def get( self, sequence_identifier ):
  552. # Input is either a sequence ID or a sequence object
  553. if not isinstance( sequence_identifier, basestring ):
  554. # Input was a sequence object (not a sequence ID). Get the sequence ID
  555. sequence_identifier = sequence_identifier.identifier
  556. # Get only the ID part of the sequence header
  557. sequence_id, sequence_sep, sequence_desc = sequence_identifier.partition(' ')
  558. rval = None
  559. if sequence_id in self.offset_dict:
  560. initial_offset = self.file.tell()
  561. seq_offset = self.offset_dict[ sequence_id ].pop( 0 )
  562. if not self.offset_dict[ sequence_id ]:
  563. del self.offset_dict[ sequence_id ]
  564. self.file.seek( seq_offset )
  565. rval = self.reader.next()
  566. #assert rval.id == sequence_id, 'seq id mismatch' #should be able to remove this
  567. self.file.seek( initial_offset )
  568. else:
  569. while True:
  570. offset = self.file.tell()
  571. try:
  572. fastq_read = self.reader.next()
  573. except StopIteration:
  574. self.eof = True
  575. break #eof, id not found, will return None
  576. fastq_read_id, fastq_read_sep, fastq_read_desc = fastq_read.identifier.partition(' ')
  577. if fastq_read_id == sequence_id:
  578. rval = fastq_read
  579. break
  580. else:
  581. if fastq_read_id not in self.offset_dict:
  582. self.offset_dict[ fastq_read_id ] = []
  583. self.offset_dict[ fastq_read_id ].append( offset )
  584. if rval is not None and self.apply_galaxy_conventions:
  585. rval.apply_galaxy_conventions()
  586. return rval
  587. def has_data( self ):
  588. #returns a string representation of remaining data, or empty string (False) if no data remaining
  589. eof = self.eof
  590. count = 0
  591. rval = ''
  592. if self.offset_dict:
  593. count = sum( map( len, self.offset_dict.values() ) )
  594. if not eof:
  595. offset = self.file.tell()
  596. try:
  597. fastq_read = self.reader.next()
  598. except StopIteration:
  599. eof = True
  600. self.file.seek( offset )
  601. if count:
  602. rval = "There were %i known sequence reads not utilized. " % count
  603. if not eof:
  604. rval = "%s%s" % ( rval, "An additional unknown number of reads exist in the input that were not utilized." )
  605. return rval
  606. class fastqWriter( object ):
  607. def __init__( self, fh, format = None, force_quality_encoding = None ):
  608. self.file = fh
  609. self.format = format
  610. self.force_quality_encoding = force_quality_encoding
  611. def write( self, fastq_read ):
  612. if self.format:
  613. fastq_read = fastq_read.convert_read_to_format( self.format, force_quality_encoding = self.force_quality_encoding )
  614. self.file.write( str( fastq_read ) )
  615. def close( self ):
  616. return self.file.close()
  617. class fastqJoiner( object ):
  618. def __init__( self, format, force_quality_encoding = None ):
  619. self.format = format
  620. self.force_quality_encoding = force_quality_encoding
  621. def join( self, read1, read2 ):
  622. read1_id, read1_sep, read1_desc = read1.identifier.partition(' ')
  623. read2_id, read2_sep, read2_desc = read2.identifier.partition(' ')
  624. if read1_id.endswith( '/2' ) and read2_id.endswith( '/1' ):
  625. #swap 1 and 2
  626. tmp = read1
  627. read1 = read2
  628. read2 = tmp
  629. del tmp
  630. if read1_id.endswith( '/1' ) and read2_id.endswith( '/2' ):
  631. read1_id = read1_id[:-2]
  632. identifier = read1_id
  633. if read1_desc:
  634. identifier = identifier + ' ' + read1_desc
  635. #use force quality encoding, if not present force to encoding of first read
  636. force_quality_encoding = self.force_quality_encoding
  637. if not force_quality_encoding:
  638. if read1.is_ascii_encoded():
  639. force_quality_encoding = 'ascii'
  640. else:
  641. force_quality_encoding = 'decimal'
  642. new_read1 = read1.convert_read_to_format( self.format, force_quality_encoding = force_quality_encoding )
  643. new_read2 = read2.convert_read_to_format( self.format, force_quality_encoding = force_quality_encoding )
  644. rval = FASTQ_FORMATS[ self.format ]()
  645. rval.identifier = identifier
  646. if len( read1.description ) > 1:
  647. rval.description = "+%s" % ( identifier[1:] )
  648. else:
  649. rval.description = '+'
  650. if rval.sequence_space == 'color':
  651. #need to handle color space joining differently
  652. #convert to nuc space, join, then convert back
  653. rval.sequence = rval.convert_base_to_color_space( new_read1.convert_color_to_base_space( new_read1.sequence ) + new_read2.convert_color_to_base_space( new_read2.sequence ) )
  654. else:
  655. rval.sequence = new_read1.sequence + new_read2.sequence
  656. if force_quality_encoding == 'ascii':
  657. rval.quality = new_read1.quality + new_read2.quality
  658. else:
  659. rval.quality = "%s %s" % ( new_read1.quality.strip(), new_read2.quality.strip() )
  660. return rval
  661. def get_paired_identifier( self, fastq_read ):
  662. read_id, read_sep, read_desc = fastq_read.identifier.partition(' ')
  663. if read_id[-2] == '/':
  664. if read_id[-1] == "1":
  665. read_id = "%s2" % read_id[:-1]
  666. elif read_id[-1] == "2":
  667. read_id = "%s1" % read_id[:-1]
  668. return read_id
  669. def is_first_mate( self, sequence_id ):
  670. is_first = None
  671. if not isinstance( sequence_id, basestring ):
  672. sequence_id = sequence_id.identifier
  673. sequence_id, sequence_sep, sequence_desc = sequence_id.partition(' ')
  674. if sequence_id[-2] == '/':
  675. if sequence_id[-1] == "1":
  676. is_first = True
  677. else:
  678. is_first = False
  679. return is_first
  680. class fastqSplitter( object ):
  681. def split( self, fastq_read ):
  682. length = len( fastq_read )
  683. #Only reads of even lengths can be split
  684. if length % 2 != 0:
  685. return None, None
  686. half = int( length / 2 )
  687. read1 = fastq_read.slice( 0, half )
  688. read1.identifier += "/1"
  689. if len( read1.description ) > 1:
  690. read1.description += "/1"
  691. read2 = fastq_read.slice( half, None )
  692. read2.identifier += "/2"
  693. if len( read2.description ) > 1:
  694. read2.description += "/2"
  695. return read1, read2
  696. class fastqCombiner( object ):
  697. def __init__( self, format ):
  698. self.format = format
  699. def combine(self, fasta_seq, quality_seq ):
  700. fastq_read = fastqSequencingRead.get_class_by_format( self.format )()
  701. fastq_read.identifier = "@%s" % fasta_seq.identifier[1:]
  702. fastq_read.description = '+'
  703. fastq_read.sequence = fasta_seq.sequence
  704. fastq_read.quality = quality_seq.sequence
  705. return fastq_read
  706. class fastqFakeFastaScoreReader( object ):
  707. def __init__( self, format = 'sanger', quality_encoding = None ):
  708. self.fastq_read = fastqSequencingRead.get_class_by_format( format )()
  709. if quality_encoding != 'decimal':
  710. quality_encoding = 'ascii'
  711. self.quality_encoding = quality_encoding
  712. def close( self ):
  713. return #nothing to close
  714. def get( self, sequence ):
  715. assert isinstance( sequence, fastaSequence ), 'fastqFakeFastaScoreReader requires a fastaSequence object as the parameter'
  716. #add sequence to fastq_read, then get_sequence(), color space adapters do not have quality score values
  717. self.fastq_read.sequence = sequence.sequence
  718. new_sequence = fastaSequence()
  719. new_sequence.identifier = sequence.identifier
  720. if self.quality_encoding == 'ascii':
  721. new_sequence.sequence = chr( self.fastq_read.ascii_max ) * len( self.fastq_read.get_sequence() )
  722. else:
  723. new_sequence.sequence = ( "%i " % self.fastq_read.quality_max ) * len( self.fastq_read.get_sequence() )
  724. return new_sequence
  725. def has_data( self ):
  726. return '' #No actual data exist, none can be remaining