/tools/next_gen_conversion/solid2fastq.py

https://bitbucket.org/h_morita_dbcls/galaxy-central · Python · 209 lines · 159 code · 41 blank · 9 comment · 31 complexity · 41a0372b075cb8f65df81ac8bbb6b8dd MD5 · raw file

  1. #!/usr/bin/env python
  2. import sys
  3. import string
  4. import optparse
  5. import tempfile
  6. import sqlite3
  7. def stop_err( msg ):
  8. sys.stderr.write( msg )
  9. sys.exit()
  10. def solid2sanger( quality_string, min_qual = 0 ):
  11. sanger = ""
  12. quality_string = quality_string.rstrip( " " )
  13. for qv in quality_string.split(" "):
  14. try:
  15. if int( qv ) < 0:
  16. qv = '0'
  17. if int( qv ) < min_qual:
  18. return False
  19. break
  20. sanger += chr( int( qv ) + 33 )
  21. except:
  22. pass
  23. return sanger
  24. def Translator(frm='', to='', delete='', keep=None):
  25. allchars = string.maketrans('','')
  26. if len(to) == 1:
  27. to = to * len(frm)
  28. trans = string.maketrans(frm, to)
  29. if keep is not None:
  30. delete = allchars.translate(allchars, keep.translate(allchars, delete))
  31. def callable(s):
  32. return s.translate(trans, delete)
  33. return callable
  34. def merge_reads_qual( f_reads, f_qual, f_out, trim_name=False, out='fastq', double_encode = False, trim_first_base = False, pair_end_flag = '', min_qual = 0, table_name=None ):
  35. # Reads from two files f_csfasta (reads) and f_qual (quality values) and produces output in three formats depending on out parameter,
  36. # which can have three values: fastq, txt, and db
  37. # fastq = fastq format
  38. # txt = space delimited format with defline, reads, and qvs
  39. # dp = dump data into sqlite3 db.
  40. # IMPORTNAT! If out = db two optins must be provided:
  41. # 1. f_out must be a db connection object initialized with sqlite3.connect()
  42. # 2. table_name must be provided
  43. if out == 'db':
  44. cursor = f_out.cursor()
  45. sql = "create table %s (name varchar(50) not null, read blob, qv blob)" % table_name
  46. cursor.execute(sql)
  47. lines = []
  48. line = " "
  49. while line:
  50. for f in [ f_reads, f_qual ]:
  51. line = f.readline().rstrip( '\n\r' )
  52. while line.startswith( '#' ):
  53. line = f.readline().rstrip( '\n\r' )
  54. lines.append( line )
  55. if lines[0].startswith( '>' ):
  56. defline = lines[0][1:]
  57. if trim_name and ( defline[ len( defline )-3: ] == "_F3" or defline[ len( defline )-3: ] == "_R3" ):
  58. defline = defline[ : len( defline )-3 ]
  59. else:
  60. if trim_first_base:
  61. lines[0] = lines[0][1:]
  62. if double_encode:
  63. de = Translator(frm="0123.", to="ACGTN")
  64. lines[0] = de(lines[0])
  65. qual = solid2sanger( lines[1], int( min_qual ) )
  66. if qual:
  67. if out == 'fastq':
  68. f_out.write( "@%s%s\n%s\n+\n%s\n" % ( defline, pair_end_flag, lines[0], qual ) )
  69. if out == 'txt':
  70. f_out.write( '%s %s %s\n' % (defline, lines[0], qual ) )
  71. if out == 'db':
  72. cursor.execute('insert into %s values("%s","%s","%s")' % (table_name, defline, lines[0], qual ) )
  73. lines = []
  74. def main():
  75. usage = "%prog --fr F3.csfasta --fq R3.csfasta --fout fastq_output_file [option]"
  76. parser = optparse.OptionParser(usage=usage)
  77. parser.add_option(
  78. '--fr','--f_reads',
  79. metavar="F3_CSFASTA_FILE",
  80. dest='fr',
  81. help='Name of F3 file with color space reads')
  82. parser.add_option(
  83. '--fq','--f_qual',
  84. metavar="F3_QUAL_FILE",
  85. dest='fq',
  86. help='Name of F3 file with color quality values')
  87. parser.add_option(
  88. '--fout','--f3_fastq_output',
  89. metavar="F3_OUTPUT",
  90. dest='fout',
  91. help='Name for F3 output file')
  92. parser.add_option(
  93. '--rr','--r_reads',
  94. metavar="R3_CSFASTA_FILE",
  95. dest='rr',
  96. default = False,
  97. help='Name of R3 file with color space reads')
  98. parser.add_option(
  99. '--rq','--r_qual',
  100. metavar="R3_QUAL_FILE",
  101. dest='rq',
  102. default = False,
  103. help='Name of R3 file with color quality values')
  104. parser.add_option(
  105. '--rout',
  106. metavar="R3_OUTPUT",
  107. dest='rout',
  108. help='Name for F3 output file')
  109. parser.add_option(
  110. '-q','--min_qual',
  111. dest='min_qual',
  112. default = '-1000',
  113. help='Minimum quality threshold for printing reads. If a read contains a single call with QV lower than this value, it will not be reported. Default is -1000')
  114. parser.add_option(
  115. '-t','--trim_name',
  116. dest='trim_name',
  117. action='store_true',
  118. default = False,
  119. help='Trim _R3 and _F3 off read names. Default is False')
  120. parser.add_option(
  121. '-f','--trim_first_base',
  122. dest='trim_first_base',
  123. action='store_true',
  124. default = False,
  125. help='Remove the first base of reads in color-space. Default is False')
  126. parser.add_option(
  127. '-d','--double_encode',
  128. dest='de',
  129. action='store_true',
  130. default = False,
  131. help='Double encode color calls as nucleotides: 0123. becomes ACGTN. Default is False')
  132. options, args = parser.parse_args()
  133. if not ( options.fout and options.fr and options.fq ):
  134. parser.error("""
  135. One or more of the three required paremetrs is missing:
  136. (1) --fr F3.csfasta file
  137. (2) --fq F3.qual file
  138. (3) --fout name of output file
  139. Use --help for more info
  140. """)
  141. fr = open ( options.fr , 'r' )
  142. fq = open ( options.fq , 'r' )
  143. f_out = open ( options.fout , 'w' )
  144. if options.rr and options.rq:
  145. rr = open ( options.rr , 'r' )
  146. rq = open ( options.rq , 'r' )
  147. if not options.rout:
  148. parser.error("Provide the name for f3 output using --rout option. Use --help for more info")
  149. r_out = open ( options.rout, 'w' )
  150. db = tempfile.NamedTemporaryFile()
  151. try:
  152. con = sqlite3.connect(db.name)
  153. cur = con.cursor()
  154. except:
  155. stop_err('Cannot connect to %s\n') % db.name
  156. merge_reads_qual( fr, fq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="f3" )
  157. merge_reads_qual( rr, rq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="r3" )
  158. cur.execute('create index f3_name on f3( name )')
  159. cur.execute('create index r3_name on r3( name )')
  160. cur.execute('select * from r3,f3 where f3.name = r3.name')
  161. for item in cur:
  162. f_out.write( "@%s%s\n%s\n+\n%s\n" % (item[0], "/1", item[1], item[2]) )
  163. r_out.write( "@%s%s\n%s\n+\n%s\n" % (item[3], "/2", item[4], item[5]) )
  164. else:
  165. merge_reads_qual( fr, fq, f_out, trim_name=options.trim_name, out='fastq', double_encode = options.de, trim_first_base = options.trim_first_base, min_qual=options.min_qual )
  166. f_out.close()
  167. if __name__ == "__main__":
  168. main()