/tools/fastq/fastq_masker_by_quality.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 83 lines · 65 code · 17 blank · 1 comment · 15 complexity · 03c266b3b797243d51efbe9f3ba79a72 MD5 · raw file

  1. #Dan Blankenberg
  2. import string
  3. from optparse import OptionParser
  4. from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
  5. def get_score_comparer( operator ):
  6. if operator == 'gt':
  7. return compare_gt
  8. elif operator == 'ge':
  9. return compare_ge
  10. elif operator == 'eq':
  11. return compare_eq
  12. elif operator == 'lt':
  13. return compare_lt
  14. elif operator == 'le':
  15. return compare_le
  16. elif operator == 'ne':
  17. return compare_ne
  18. raise 'Invalid operator provided: %s' % operator
  19. def compare_gt( quality_score, threshold_value ):
  20. return quality_score > threshold_value
  21. def compare_ge( quality_score, threshold_value ):
  22. return quality_score >= threshold_value
  23. def compare_eq( quality_score, threshold_value ):
  24. return quality_score == threshold_value
  25. def compare_ne( quality_score, threshold_value ):
  26. return quality_score != threshold_value
  27. def compare_lt( quality_score, threshold_value ):
  28. return quality_score < threshold_value
  29. def compare_le( quality_score, threshold_value ):
  30. return quality_score <= threshold_value
  31. class BaseReplacer( object ):
  32. def __init__( self, replace_character ):
  33. self.replace_character = replace_character
  34. def __call__( self, base_character ):
  35. return self.replace_character
  36. def main():
  37. usage = "usage: %prog [options] input_file output_file"
  38. parser = OptionParser( usage=usage )
  39. parser.add_option( '-f', '--format', dest='format', type='choice', default='sanger', choices=( 'sanger', 'solexa', 'illumina' ), help='FASTQ variant type' )
  40. parser.add_option( '-m', '--mask_character', dest='mask_character', default='N', help='Mask Character to use' )
  41. parser.add_option( '-c', '--score_comparison', type="choice", dest='score_comparison', default='le', choices=('gt','ge','eq','lt', 'le', 'ne' ), help='Mask base when score is' )
  42. parser.add_option( '-s', '--quality_score', type="float", dest='quality_score', default='0', help='Quality Score' )
  43. parser.add_option( "-l", "--lowercase", action="store_true", dest="lowercase", default=False, help="Use lowercase masking")
  44. ( options, args ) = parser.parse_args()
  45. if len ( args ) != 2:
  46. parser.error( "Need to specify an input file and an output file" )
  47. score_comparer = get_score_comparer( options.score_comparison )
  48. if options.lowercase:
  49. base_masker = string.lower
  50. else:
  51. base_masker = BaseReplacer( options.mask_character )
  52. out = fastqWriter( open( args[1], 'wb' ), format = options.format )
  53. num_reads = None
  54. num_reads_excluded = 0
  55. for num_reads, fastq_read in enumerate( fastqReader( open( args[0] ), format = options.format ) ):
  56. sequence_list = list( fastq_read.sequence )
  57. for i, quality_score in enumerate( fastq_read.get_decimal_quality_scores() ):
  58. if score_comparer( quality_score, options.quality_score ):
  59. sequence_list[ i ] = base_masker( sequence_list[ i ] )
  60. fastq_read.sequence = "".join( sequence_list )
  61. out.write( fastq_read )
  62. if num_reads is not None:
  63. print "Processed %i %s reads." % ( num_reads + 1, options.format )
  64. else:
  65. print "No valid FASTQ reads were provided."
  66. if __name__ == "__main__": main()