/tools/next_gen_conversion/solid2fastq.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 214 lines · 161 code · 44 blank · 9 comment · 36 complexity · df0c875e235646ef0629c19128f5eed6 MD5 · raw file

  1. #!/usr/bin/env python
  2. import sys
  3. import string
  4. import optparse
  5. import tempfile
  6. import sqlite3
  7. def stop_err( msg ):
  8. sys.stderr.write( msg )
  9. sys.exit()
  10. def solid2sanger( quality_string, min_qual = 0 ):
  11. sanger = ""
  12. quality_string = quality_string.rstrip( " " )
  13. for qv in quality_string.split(" "):
  14. try:
  15. if int( qv ) < 0:
  16. qv = '0'
  17. if int( qv ) < min_qual:
  18. return False
  19. break
  20. sanger += chr( int( qv ) + 33 )
  21. except:
  22. pass
  23. return sanger
  24. def Translator(frm='', to='', delete='', keep=None):
  25. allchars = string.maketrans('','')
  26. if len(to) == 1:
  27. to = to * len(frm)
  28. trans = string.maketrans(frm, to)
  29. if keep is not None:
  30. delete = allchars.translate(allchars, keep.translate(allchars, delete))
  31. def callable(s):
  32. return s.translate(trans, delete)
  33. return callable
  34. def merge_reads_qual( f_reads, f_qual, f_out, trim_name=False, out='fastq', double_encode = False, trim_first_base = False, pair_end_flag = '', min_qual = 0, table_name=None ):
  35. # Reads from two files f_csfasta (reads) and f_qual (quality values) and produces output in three formats depending on out parameter,
  36. # which can have three values: fastq, txt, and db
  37. # fastq = fastq format
  38. # txt = space delimited format with defline, reads, and qvs
  39. # dp = dump data into sqlite3 db.
  40. # IMPORTNAT! If out = db two optins must be provided:
  41. # 1. f_out must be a db connection object initialized with sqlite3.connect()
  42. # 2. table_name must be provided
  43. if out == 'db':
  44. cursor = f_out.cursor()
  45. sql = "create table %s (name varchar(50) not null, read blob, qv blob)" % table_name
  46. cursor.execute(sql)
  47. lines = []
  48. line = " "
  49. while line:
  50. for f in [ f_reads, f_qual ]:
  51. line = f.readline().rstrip( '\n\r' )
  52. while line.startswith( '#' ):
  53. line = f.readline().rstrip( '\n\r' )
  54. lines.append( line )
  55. if lines[0].startswith( '>' ) and lines[1].startswith( '>' ):
  56. if lines[0] != lines[1]:
  57. stop_err('Files reads and quality score files are out of sync and likely corrupted. Please, check your input data')
  58. defline = lines[0][1:]
  59. if trim_name and ( defline[ len( defline )-3: ] == "_F3" or defline[ len( defline )-3: ] == "_R3" ):
  60. defline = defline[ : len( defline )-3 ]
  61. elif ( not lines[0].startswith( '>' ) and not lines[1].startswith( '>' ) and len( lines[0] ) > 0 and len( lines[1] ) > 0 ):
  62. if trim_first_base:
  63. lines[0] = lines[0][1:]
  64. if double_encode:
  65. de = Translator(frm="0123.", to="ACGTN")
  66. lines[0] = de(lines[0])
  67. qual = solid2sanger( lines[1], int( min_qual ) )
  68. if qual:
  69. if out == 'fastq':
  70. f_out.write( "@%s%s\n%s\n+\n%s\n" % ( defline, pair_end_flag, lines[0], qual ) )
  71. if out == 'txt':
  72. f_out.write( '%s %s %s\n' % (defline, lines[0], qual ) )
  73. if out == 'db':
  74. cursor.execute('insert into %s values("%s","%s","%s")' % (table_name, defline, lines[0], qual ) )
  75. lines = []
  76. def main():
  77. usage = "%prog --fr F3.csfasta --fq R3.csfasta --fout fastq_output_file [option]"
  78. parser = optparse.OptionParser(usage=usage)
  79. parser.add_option(
  80. '--fr','--f_reads',
  81. metavar="F3_CSFASTA_FILE",
  82. dest='fr',
  83. help='Name of F3 file with color space reads')
  84. parser.add_option(
  85. '--fq','--f_qual',
  86. metavar="F3_QUAL_FILE",
  87. dest='fq',
  88. help='Name of F3 file with color quality values')
  89. parser.add_option(
  90. '--fout','--f3_fastq_output',
  91. metavar="F3_OUTPUT",
  92. dest='fout',
  93. help='Name for F3 output file')
  94. parser.add_option(
  95. '--rr','--r_reads',
  96. metavar="R3_CSFASTA_FILE",
  97. dest='rr',
  98. default = False,
  99. help='Name of R3 file with color space reads')
  100. parser.add_option(
  101. '--rq','--r_qual',
  102. metavar="R3_QUAL_FILE",
  103. dest='rq',
  104. default = False,
  105. help='Name of R3 file with color quality values')
  106. parser.add_option(
  107. '--rout',
  108. metavar="R3_OUTPUT",
  109. dest='rout',
  110. help='Name for F3 output file')
  111. parser.add_option(
  112. '-q','--min_qual',
  113. dest='min_qual',
  114. default = '-1000',
  115. help='Minimum quality threshold for printing reads. If a read contains a single call with QV lower than this value, it will not be reported. Default is -1000')
  116. parser.add_option(
  117. '-t','--trim_name',
  118. dest='trim_name',
  119. action='store_true',
  120. default = False,
  121. help='Trim _R3 and _F3 off read names. Default is False')
  122. parser.add_option(
  123. '-f','--trim_first_base',
  124. dest='trim_first_base',
  125. action='store_true',
  126. default = False,
  127. help='Remove the first base of reads in color-space. Default is False')
  128. parser.add_option(
  129. '-d','--double_encode',
  130. dest='de',
  131. action='store_true',
  132. default = False,
  133. help='Double encode color calls as nucleotides: 0123. becomes ACGTN. Default is False')
  134. options, args = parser.parse_args()
  135. if not ( options.fout and options.fr and options.fq ):
  136. parser.error("""
  137. One or more of the three required paremetrs is missing:
  138. (1) --fr F3.csfasta file
  139. (2) --fq F3.qual file
  140. (3) --fout name of output file
  141. Use --help for more info
  142. """)
  143. fr = open ( options.fr , 'r' )
  144. fq = open ( options.fq , 'r' )
  145. f_out = open ( options.fout , 'w' )
  146. if options.rr and options.rq:
  147. rr = open ( options.rr , 'r' )
  148. rq = open ( options.rq , 'r' )
  149. if not options.rout:
  150. parser.error("Provide the name for f3 output using --rout option. Use --help for more info")
  151. r_out = open ( options.rout, 'w' )
  152. db = tempfile.NamedTemporaryFile()
  153. try:
  154. con = sqlite3.connect(db.name)
  155. cur = con.cursor()
  156. except:
  157. stop_err('Cannot connect to %s\n') % db.name
  158. merge_reads_qual( fr, fq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="f3" )
  159. merge_reads_qual( rr, rq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="r3" )
  160. cur.execute('create index f3_name on f3( name )')
  161. cur.execute('create index r3_name on r3( name )')
  162. cur.execute('select * from f3,r3 where f3.name = r3.name')
  163. for item in cur:
  164. f_out.write( "@%s%s\n%s\n+\n%s\n" % (item[0], "/1", item[1], item[2]) )
  165. r_out.write( "@%s%s\n%s\n+\n%s\n" % (item[3], "/2", item[4], item[5]) )
  166. else:
  167. merge_reads_qual( fr, fq, f_out, trim_name=options.trim_name, out='fastq', double_encode = options.de, trim_first_base = options.trim_first_base, min_qual=options.min_qual )
  168. f_out.close()
  169. if __name__ == "__main__":
  170. main()