PageRenderTime 56ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/tools/filters/sff_extract.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 1512 lines | 1308 code | 65 blank | 139 comment | 75 complexity | e7d96e6b8e7d717a0647332becc288e9 MD5 | raw file
#!/usr/bin/python
'''This software extracts the seq, qual and ancillary information from an sff
file, like the ones used by the 454 sequencer.

Optionally, it can also split paired-end reads if given the linker sequence.
The splitting is done with maximum match, i.e., every occurrence of the linker
sequence will be removed, even if occurring multiple times.'''

#copyright Jose Blanca and Bastien Chevreux
#COMAV institute, Universidad Politecnica de Valencia (UPV)
#Valencia, Spain

# additions to handle paired end reads by Bastien Chevreux
# bugfixes for linker specific lengths: Lionel Guy

#This program is free software: you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation, either version 3 of the License, or
#(at your option) any later version.
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU General Public License for more details.
#You should have received a copy of the GNU General Public License
#along with this program. If not, see <http://www.gnu.org/licenses/>.
__author__ = 'Jose Blanca and Bastien Chevreux'
__copyright__ = 'Copyright 2008, Jose Blanca, COMAV, and Bastien Chevreux'
__license__ = 'GPLv3 or later'
__version__ = '0.2.10'
__email__ = 'jblanca@btc.upv.es'
__status__ = 'beta'

import struct
import sys
import os
import subprocess
import tempfile

# Placeholder key used in the xml_info config dict for extra XML annotations
# that were given without being tied to a concrete sff file name.
fake_sff_name = 'fake_sff_name'

# readname as key: lines with matches from SSAHA, one best match
ssahapematches = {}
# linker readname as key: length of linker sequence
linkerlengths = {}

# set to true if something really fishy is going on with the sequences
stern_warning = True
  40. def read_bin_fragment(struct_def, fileh, offset=0, data=None,
  41. byte_padding=None):
  42. '''It reads a chunk of a binary file.
  43. You have to provide the struct, a file object, the offset (where to start
  44. reading).
  45. Also you can provide an optional dict that will be populated with the
  46. extracted data.
  47. If a byte_padding is given the number of bytes read will be a multiple of
  48. that number, adding the required pad at the end.
  49. It returns the number of bytes reads and the data dict.
  50. '''
  51. if data is None:
  52. data = {}
  53. #we read each item
  54. bytes_read = 0
  55. for item in struct_def:
  56. #we go to the place and read
  57. fileh.seek(offset + bytes_read)
  58. n_bytes = struct.calcsize(item[1])
  59. buffer = fileh.read(n_bytes)
  60. read = struct.unpack('>' + item[1], buffer)
  61. if len(read) == 1:
  62. read = read[0]
  63. data[item[0]] = read
  64. bytes_read += n_bytes
  65. #if there is byte_padding the bytes_to_read should be a multiple of the
  66. #byte_padding
  67. if byte_padding is not None:
  68. pad = byte_padding
  69. bytes_read = ((bytes_read + pad - 1) // pad) * pad
  70. return (bytes_read, data)
  71. def check_magic(magic):
  72. '''It checks that the magic number of the file matches the sff magic.'''
  73. if magic != 779314790:
  74. raise RuntimeError('This file does not seems to be an sff file.')
  75. def check_version(version):
  76. '''It checks that the version is supported, otherwise it raises an error.'''
  77. supported = ('\x00', '\x00', '\x00', '\x01')
  78. i = 0
  79. for item in version:
  80. if version[i] != supported[i]:
  81. raise RuntimeError('SFF version not supported. Please contact the author of the software.')
  82. i += 1
  83. def read_header(fileh):
  84. '''It reads the header from the sff file and returns a dict with the
  85. information'''
  86. #first we read the first part of the header
  87. head_struct = [
  88. ('magic_number', 'I'),
  89. ('version', 'cccc'),
  90. ('index_offset', 'Q'),
  91. ('index_length', 'I'),
  92. ('number_of_reads', 'I'),
  93. ('header_length', 'H'),
  94. ('key_length', 'H'),
  95. ('number_of_flows_per_read', 'H'),
  96. ('flowgram_format_code', 'B'),
  97. ]
  98. data = {}
  99. first_bytes, data = read_bin_fragment(struct_def=head_struct, fileh=fileh,
  100. offset=0, data=data)
  101. check_magic(data['magic_number'])
  102. check_version(data['version'])
  103. #now that we know the number_of_flows_per_read and the key_length
  104. #we can read the second part of the header
  105. struct2 = [
  106. ('flow_chars', str(data['number_of_flows_per_read']) + 'c'),
  107. ('key_sequence', str(data['key_length']) + 'c')
  108. ]
  109. read_bin_fragment(struct_def=struct2, fileh=fileh, offset=first_bytes,
  110. data=data)
  111. return data
  112. def read_sequence(header, fileh, fposition):
  113. '''It reads one read from the sff file located at the fposition and
  114. returns a dict with the information.'''
  115. header_length = header['header_length']
  116. index_offset = header['index_offset']
  117. index_length = header['index_length']
  118. #the sequence struct
  119. read_header_1 = [
  120. ('read_header_length', 'H'),
  121. ('name_length', 'H'),
  122. ('number_of_bases', 'I'),
  123. ('clip_qual_left', 'H'),
  124. ('clip_qual_right', 'H'),
  125. ('clip_adapter_left', 'H'),
  126. ('clip_adapter_right', 'H'),
  127. ]
  128. def read_header_2(name_length):
  129. '''It returns the struct definition for the second part of the header'''
  130. return [('name', str(name_length) +'c')]
  131. def read_data(number_of_bases):
  132. '''It returns the struct definition for the read data section.'''
  133. #size = {'c': 1, 'B':1, 'H':2, 'I':4, 'Q':8}
  134. if header['flowgram_format_code'] == 1:
  135. flow_type = 'H'
  136. else:
  137. raise Error('file version not supported')
  138. number_of_bases = str(number_of_bases)
  139. return [
  140. ('flowgram_values', str(header['number_of_flows_per_read']) +
  141. flow_type),
  142. ('flow_index_per_base', number_of_bases + 'B'),
  143. ('bases', number_of_bases + 'c'),
  144. ('quality_scores', number_of_bases + 'B'),
  145. ]
  146. data = {}
  147. #we read the first part of the header
  148. bytes_read, data = read_bin_fragment(struct_def=read_header_1,
  149. fileh=fileh, offset=fposition, data=data)
  150. read_bin_fragment(struct_def=read_header_2(data['name_length']),
  151. fileh=fileh, offset=fposition + bytes_read, data=data)
  152. #we join the letters of the name
  153. data['name'] = ''.join(data['name'])
  154. offset = data['read_header_length']
  155. #we read the sequence and the quality
  156. read_data_st = read_data(data['number_of_bases'])
  157. bytes_read, data = read_bin_fragment(struct_def=read_data_st,
  158. fileh=fileh, offset=fposition + offset,
  159. data=data, byte_padding=8)
  160. #we join the bases
  161. data['bases'] = ''.join(data['bases'])
  162. #print data
  163. #print "pre cqr: ", data['clip_qual_right']
  164. #print "pre car: ", data['clip_adapter_right']
  165. #print "pre cql: ", data['clip_qual_left']
  166. #print "pre cal: ", data['clip_adapter_left']
  167. # correct for the case the right clip is <= than the left clip
  168. # in this case, left clip is 0 are set to 0 (right clip == 0 means
  169. # "whole sequence")
  170. if data['clip_qual_right'] <= data['clip_qual_left'] :
  171. data['clip_qual_right'] = 0
  172. data['clip_qual_left'] = 0
  173. if data['clip_adapter_right'] <= data['clip_adapter_left'] :
  174. data['clip_adapter_right'] = 0
  175. data['clip_adapter_left'] = 0
  176. #the clipping section follows the NCBI's guidelines Trace Archive RFC
  177. #http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=rfc&m=doc&s=rfc
  178. #if there's no adapter clip: qual -> vector
  179. #else: qual-> qual
  180. # adapter -> vector
  181. if not data['clip_adapter_left']:
  182. data['clip_adapter_left'], data['clip_qual_left'] = data['clip_qual_left'], data['clip_adapter_left']
  183. if not data['clip_adapter_right']:
  184. data['clip_adapter_right'], data['clip_qual_right'] = data['clip_qual_right'], data['clip_adapter_right']
  185. # see whether we have to override the minimum left clips
  186. if config['min_leftclip'] > 0:
  187. if data['clip_adapter_left'] >0 and data['clip_adapter_left'] < config['min_leftclip']:
  188. data['clip_adapter_left'] = config['min_leftclip']
  189. if data['clip_qual_left'] >0 and data['clip_qual_left'] < config['min_leftclip']:
  190. data['clip_qual_left'] = config['min_leftclip']
  191. #print "post cqr: ", data['clip_qual_right']
  192. #print "post car: ", data['clip_adapter_right']
  193. #print "post cql: ", data['clip_qual_left']
  194. #print "post cal: ", data['clip_adapter_left']
  195. # for handling the -c (clip) option gently, we already clip here
  196. # and set all clip points to the sequence end points
  197. if config['clip']:
  198. data['bases'], data['quality_scores'] = clip_read(data)
  199. data['number_of_bases']=len(data['bases'])
  200. data['clip_qual_right'] = data['number_of_bases']
  201. data['clip_adapter_right'] = data['number_of_bases']
  202. data['clip_qual_left'] = 0
  203. data['clip_adapter_left'] = 0
  204. return data['read_header_length'] + bytes_read, data
  205. def sequences(fileh, header):
  206. '''It returns a generator with the data for each read.'''
  207. #now we can read all the sequences
  208. fposition = header['header_length'] #position in the file
  209. reads_read = 0
  210. while True:
  211. if fposition == header['index_offset']:
  212. #we have to skip the index section
  213. fposition += index_length
  214. continue
  215. else:
  216. bytes_read, seq_data = read_sequence(header=header, fileh=fileh,
  217. fposition=fposition)
  218. yield seq_data
  219. fposition += bytes_read
  220. reads_read += 1
  221. if reads_read >= header['number_of_reads']:
  222. break
  223. def remove_last_xmltag_in_file(fname, tag=None):
  224. '''Given an xml file name and a tag, it removes the last tag of the
  225. file if it matches the given tag. Tag removal is performed via file
  226. truncation.
  227. It the given tag is not the last in the file, a RunTimeError will be
  228. raised.
  229. The resulting xml file will be not xml valid. This function is a hack
  230. that allows to append records to xml files in a quick and dirty way.
  231. '''
  232. fh = open(fname, 'r+')
  233. #we have to read from the end to the start of the file and keep the
  234. #string enclosed by </ >
  235. i = -1
  236. last_tag = [] #the chars that form the last tag
  237. start_offset = None #in which byte does the last tag starts?
  238. end_offset = None #in which byte does the last tag ends?
  239. while True:
  240. fh.seek(i, 2)
  241. char = fh.read(1)
  242. if not char.isspace():
  243. last_tag.append(char)
  244. if char == '>':
  245. end_offset = i
  246. if char == '<':
  247. start_offset = i
  248. break
  249. i -= 1
  250. #we have read the last tag backwards
  251. last_tag = ''.join(last_tag[::-1])
  252. #we remove the </ and >
  253. last_tag = last_tag.rstrip('>').lstrip('</')
  254. #we check that we're removing the asked tag
  255. if tag is not None and tag != last_tag:
  256. etxt=join('The given xml tag (',tag,') was not the last one in the file');
  257. raise RuntimeError(etxt)
  258. # while we are at it: also remove all white spaces in that line :-)
  259. i -= 1
  260. while True:
  261. fh.seek(i, 2)
  262. char = fh.read(1)
  263. if not char == ' ' and not char == '\t':
  264. break;
  265. if fh.tell() == 1:
  266. break;
  267. i -= 1
  268. fh.truncate();
  269. fh.close()
  270. return last_tag
  271. def create_basic_xml_info(readname, fname):
  272. '''Formats a number of read specific infos into XML format.
  273. Currently formated: name and the tags set from command line
  274. '''
  275. to_print = [' <trace>\n']
  276. to_print.append(' <trace_name>')
  277. to_print.append(readname)
  278. to_print.append('</trace_name>\n')
  279. #extra information
  280. #do we have extra info for this file?
  281. info = None
  282. if config['xml_info']:
  283. #with this name?
  284. if fname in config['xml_info']:
  285. info = config['xml_info'][fname]
  286. else:
  287. #with no name?
  288. try:
  289. info = config['xml_info'][fake_sff_name]
  290. except KeyError:
  291. pass
  292. #we print the info that we have
  293. if info:
  294. for key in info:
  295. to_print.append(' <' + key + '>' + info[key] + \
  296. '</' + key +'>\n')
  297. return ''.join(to_print)
  298. def create_clip_xml_info(readlen, adapl, adapr, quall, qualr):
  299. '''Takes the clip values of the read and formats them into XML
  300. Corrects "wrong" values that might have resulted through
  301. simplified calculations earlier in the process of conversion
  302. (especially during splitting of paired-end reads)
  303. '''
  304. to_print = [""]
  305. # if right borders are >= to read length, they don't need
  306. # to be printed
  307. if adapr >= readlen:
  308. adapr = 0
  309. if qualr >= readlen:
  310. qualr = 0
  311. # BaCh
  312. # when called via split_paired_end(), some values may be < 0
  313. # (when clip values were 0 previously)
  314. # instead of putting tons of if clauses for different calculations there,
  315. # I centralise corrective measure here
  316. # set all values <0 to 0
  317. if adapr < 0:
  318. adapr = 0
  319. if qualr <0:
  320. qualr = 0
  321. if adapl < 0:
  322. adapl = 0
  323. if quall <0:
  324. quall = 0
  325. if quall:
  326. to_print.append(' <clip_quality_left>')
  327. to_print.append(str(quall))
  328. to_print.append('</clip_quality_left>\n')
  329. if qualr:
  330. to_print.append(' <clip_quality_right>')
  331. to_print.append(str(qualr))
  332. to_print.append('</clip_quality_right>\n')
  333. if adapl:
  334. to_print.append(' <clip_vector_left>')
  335. to_print.append(str(adapl))
  336. to_print.append('</clip_vector_left>\n')
  337. if adapr:
  338. to_print.append(' <clip_vector_right>')
  339. to_print.append(str(adapr))
  340. to_print.append('</clip_vector_right>\n')
  341. return ''.join(to_print)
  342. def create_xml_for_unpaired_read(data, fname):
  343. '''Given the data for one read it returns an str with the xml ancillary
  344. data.'''
  345. to_print = [create_basic_xml_info(data['name'],fname)]
  346. #clippings in the XML only if we do not hard clip
  347. if not config['clip']:
  348. to_print.append(create_clip_xml_info(data['number_of_bases'],data['clip_adapter_left'], data['clip_adapter_right'], data['clip_qual_left'], data['clip_qual_right']));
  349. to_print.append(' </trace>\n')
  350. return ''.join(to_print)
  351. def format_as_fasta(name,seq,qual):
  352. name_line = ''.join(('>', name,'\n'))
  353. seqstring = ''.join((name_line, seq, '\n'))
  354. qual_line = ' '.join([str(q) for q in qual])
  355. qualstring = ''.join((name_line, qual_line, '\n'))
  356. return seqstring, qualstring
  357. def format_as_fastq(name,seq,qual):
  358. qual_line = ''.join([chr(q+33) for q in qual])
  359. #seqstring = ''.join(('@', name,'\n', seq, '\n+', name,'\n', qual_line, '\n'))
  360. seqstring = ''.join(('@', name,'\n', seq, '\n+\n', qual_line, '\n'))
  361. return seqstring
  362. def get_read_data(data):
  363. '''Given the data for one read it returns 2 strs with the fasta seq
  364. and fasta qual.'''
  365. #seq and qual
  366. if config['mix_case']:
  367. seq = sequence_case(data)
  368. qual = data['quality_scores']
  369. else :
  370. seq = data['bases']
  371. qual = data['quality_scores']
  372. return seq, qual
  373. def extract_read_info(data, fname):
  374. '''Given the data for one read it returns 3 strs with the fasta seq, fasta
  375. qual and xml ancillary data.'''
  376. seq,qual = get_read_data(data)
  377. seqstring, qualstring = format_as_fasta(data['name'],seq,qual)
  378. #name_line = ''.join(('>', data['name'],'\n'))
  379. #seq = ''.join((name_line, seq, '\n'))
  380. #qual_line = ' '.join([str(q) for q in qual])
  381. #qual = ''.join((name_line, qual_line, '\n'))
  382. xmlstring = create_xml_for_unpaired_read(data, fname)
  383. return seqstring, qualstring, xmlstring
  384. def write_sequence(name,seq,qual,seq_fh,qual_fh):
  385. '''Write sequence and quality FASTA and FASTA qual filehandles
  386. (or into FASTQ and XML)
  387. if sequence length is 0, don't write'''
  388. if len(seq) == 0 : return
  389. if qual_fh is None:
  390. seq_fh.write(format_as_fastq(name,seq,qual))
  391. else:
  392. seqstring, qualstring = format_as_fasta(name,seq,qual)
  393. seq_fh.write(seqstring)
  394. qual_fh.write(qualstring)
  395. return
  396. def write_unpaired_read(data, sff_fh, seq_fh, qual_fh, xml_fh):
  397. '''Writes an unpaired read into FASTA, FASTA qual and XML filehandles
  398. (or into FASTQ and XML)
  399. if sequence length is 0, don't write'''
  400. seq,qual = get_read_data(data)
  401. if len(seq) == 0 : return
  402. write_sequence(data['name'],seq,qual,seq_fh,qual_fh)
  403. anci = create_xml_for_unpaired_read(data, sff_fh.name)
  404. if anci is not None:
  405. xml_fh.write(anci)
  406. return
  407. def reverse_complement(seq):
  408. '''Returns the reverse complement of a DNA sequence as string'''
  409. compdict = {
  410. 'a': 't',
  411. 'c': 'g',
  412. 'g': 'c',
  413. 't': 'a',
  414. 'u': 't',
  415. 'm': 'k',
  416. 'r': 'y',
  417. 'w': 'w',
  418. 's': 's',
  419. 'y': 'r',
  420. 'k': 'm',
  421. 'v': 'b',
  422. 'h': 'd',
  423. 'd': 'h',
  424. 'b': 'v',
  425. 'x': 'x',
  426. 'n': 'n',
  427. 'A': 'T',
  428. 'C': 'G',
  429. 'G': 'C',
  430. 'T': 'A',
  431. 'U': 'T',
  432. 'M': 'K',
  433. 'R': 'Y',
  434. 'W': 'W',
  435. 'S': 'S',
  436. 'Y': 'R',
  437. 'K': 'M',
  438. 'V': 'B',
  439. 'H': 'D',
  440. 'D': 'H',
  441. 'B': 'V',
  442. 'X': 'X',
  443. 'N': 'N',
  444. '*': '*'
  445. }
  446. complseq = ''.join([compdict[base] for base in seq])
  447. # python hack to reverse a list/string/etc
  448. complseq = complseq[::-1]
  449. return complseq
  450. def mask_sequence(seq, maskchar, fpos, tpos):
  451. '''Given a sequence, mask it with maskchar starting at fpos (including) and
  452. ending at tpos (excluding)
  453. '''
  454. if len(maskchar) > 1:
  455. raise RuntimeError("Internal error: more than one character given to mask_sequence")
  456. if fpos<0:
  457. fpos = 0
  458. if tpos > len(seq):
  459. tpos = len(seq)
  460. newseq = ''.join((seq[:fpos],maskchar*(tpos-fpos), seq[tpos:]))
  461. return newseq
  462. def fragment_sequences(sequence, qualities, splitchar):
  463. '''Works like split() on strings, except it does this on a sequence
  464. and the corresponding list with quality values.
  465. Returns a tuple for each fragment, each sublist has the fragment
  466. sequence as first and the fragment qualities as second elemnt'''
  467. # this is slow (due to zip and list appends... use an iterator over
  468. # the sequence find find variations and splices on seq and qual
  469. if len(sequence) != len(qualities):
  470. print sequence, qualities
  471. raise RuntimeError("Internal error: length of sequence and qualities don't match???")
  472. retlist = ([])
  473. if len(sequence) == 0:
  474. return retlist
  475. actseq = ([])
  476. actqual = ([])
  477. if sequence[0] != splitchar:
  478. inseq = True
  479. else:
  480. inseq = False
  481. for char,qual in zip(sequence,qualities):
  482. if inseq:
  483. if char != splitchar:
  484. actseq.append(char)
  485. actqual.append(qual)
  486. else:
  487. retlist.append((''.join(actseq), actqual))
  488. actseq = ([])
  489. actqual = ([])
  490. inseq = False
  491. else:
  492. if char != splitchar:
  493. inseq = True
  494. actseq.append(char)
  495. actqual.append(qual)
  496. if inseq and len(actseq):
  497. retlist.append((''.join(actseq), actqual))
  498. return retlist
  499. def calc_subseq_boundaries(maskedseq, maskchar):
  500. '''E.g.:
  501. ........xxxxxxxx..........xxxxxxxxxxxxxxxxxxxxx.........
  502. to
  503. (0,8),(8,16),(16,26),(26,47),(47,56)
  504. '''
  505. blist = ([])
  506. if len(maskedseq) == 0:
  507. return blist
  508. inmask = True
  509. if maskedseq[0] != maskchar:
  510. inmask = False
  511. start = 0
  512. for spos in range(len(maskedseq)):
  513. if inmask and maskedseq[spos] != maskchar:
  514. blist.append(([start,spos]))
  515. start = spos
  516. inmask = False
  517. elif not inmask and maskedseq[spos] == maskchar:
  518. blist.append(([start,spos]))
  519. start = spos
  520. inmask = True
  521. blist.append(([start,spos+1]))
  522. return blist
def correct_for_smallhits(maskedseq, maskchar, linkername):
    '''If partial hits were found, take preventive measure: grow
    the masked areas by 20 bases in each direction

    Returns either unchanged "maskedseq" or a new sequence
    with some more characters masked.
    '''
    global linkerlengths

    # CEBUG is a local debug switch (Python 2 print statements below)
    CEBUG = 0

    if CEBUG : print "correct_for_smallhits"
    if CEBUG : print "Masked seq\n", maskedseq
    if CEBUG : print "Linkername: ", linkername

    if len(maskedseq) == 0:
        return maskedseq

    # total growth of 40 characters, i.e. growl2 == 20 to each side
    growl=40
    growl2=growl/2

    boundaries = calc_subseq_boundaries(maskedseq,maskchar)
    if CEBUG : print "Boundaries: ", boundaries

    # a "partial hit" is a masked stretch in the interior of the read that
    # is noticeably shorter than the linker it should stem from
    foundpartial = False
    for bounds in boundaries:
        if CEBUG : print "\tbounds: ", bounds
        left, right = bounds
        if left != 0 and right != len(maskedseq):
            if maskedseq[left] == maskchar:
                # allow 10% discrepancy
                #    -linkerlengths[linkername]/10
                # that's a kind of safety net if there are slight sequencing
                # errors in the linker itself
                if right-left < linkerlengths[linkername]-linkerlengths[linkername]/10:
                    if CEBUG : print "\t\tPartial: found " + str(right-left) + " gaps, " + linkername + " is " + str(linkerlengths[linkername]) + " nt long."
                    foundpartial = True

    if not foundpartial:
        return maskedseq

    # grow: rebuild the sequence, masking growl2 extra characters on every
    # side of an unmasked stretch that touches a masked area
    newseq = ""
    for bounds in boundaries:
        if CEBUG : print "Bounds: ", bounds
        left, right = bounds
        if maskedseq[left] == maskchar:
            # already-masked stretches are copied verbatim
            newseq += maskedseq[left:right]
        else:
            # unmasked stretch: shrink it on each side that does not touch
            # a read end
            clearstart = 0
            if left > 0 :
                clearstart = left+growl2
            clearstop = len(maskedseq)
            if right < len(maskedseq):
                clearstop = right-growl2

            if CEBUG : print "clearstart, clearstop: ",clearstart, clearstop

            if clearstop <= clearstart:
                # the growth consumed the whole stretch: mask it completely
                newseq += maskchar * (right-left)
            else:
                if clearstart != left:
                    newseq += maskchar * growl2
                newseq += maskedseq[clearstart:clearstop]
                if clearstop != right:
                    newseq += maskchar * growl2

    #print "newseq\n",newseq
    return newseq
def split_paired_end(data, sff_fh, seq_fh, qual_fh, xml_fh):
    '''Splits a paired end read and writes sequences into FASTA, FASTA qual
    and XML traceinfo file. Returns the number of sequences created.

    As the linker sequence may be anywhere in the read, including the ends
    and overlapping with bad quality sequence, we need to perform some
    computing and eventually set new clip points.

    If the resulting split yields only one sequence (because linker
    was not present or overlapping with left or right clip), only one
    sequence will be written with ".fn" appended to the name.

    If the read can be split, two reads will be written. The side left of
    the linker will be named ".r" and will be written in reverse complement
    into the file to conform with what approximately all assemblers expect
    when reading paired-end data: reads in forward direction in file. The side
    right of the linker will be named ".f"

    If SSAHA found partial linker (linker sequences < length of linker),
    the sequences will get a "_pl" furthermore be cut back thoroughly.

    If SSAHA found multiple occurences of the linker, the names will get an
    additional "_mlc" within the name to show that there was "multiple
    linker contamination".

    For multiple or partial linker, the "good" parts of the reads are
    stored with a ".part<number>" name, additionally they will not get
    template information in the XML
    '''
    global ssahapematches

    # CEBUG is a local debug switch (Python 2 print statements below)
    CEBUG = 0

    maskchar = "#"

    if CEBUG : print "Need to split: " + data['name']

    numseqs = 0;
    readname = data['name']
    readlen = data['number_of_bases']

    leftclip, rightclip = return_merged_clips(data)
    seq, qual = get_read_data(data)

    if CEBUG : print "Original read:\n",seq

    # mask the clipped ends of the read so that linker hits inside the
    # clips do not produce extra fragments below
    maskedseq = seq
    if leftclip > 0:
        maskedseq = mask_sequence(maskedseq, maskchar, 0, leftclip-1)
    if rightclip < len(maskedseq):
        maskedseq = mask_sequence(maskedseq, maskchar, rightclip, len(maskedseq))

    leftclip, rightclip = return_merged_clips(data)
    readlen = data['number_of_bases']

    if CEBUG : print "Readname:", readname
    if CEBUG : print "Readlen:", readlen
    if CEBUG : print "Num matches:", str(len(ssahapematches[data['name']]))
    if CEBUG : print "matches:", ssahapematches[data['name']]

    # mask every SSAHA2 linker hit in the read (SSAHA coordinates are
    # 1-based, hence the -1 on the left border)
    for match in ssahapematches[data['name']]:
        score = int(match[0])
        linkername = match[2]
        leftreadhit = int(match[3])
        rightreadhit = int(match[4])
        #leftlinkerhit = int(match[5])
        #rightlinkerhit = int(match[6])
        #direction = match[7]
        #hitlen = int(match[8])
        #hitidentity = float(match[9])

        if CEBUG : print match
        if CEBUG : print "Match with score:", score
        if CEBUG : print "Read before:\n", maskedseq
        maskedseq = mask_sequence(maskedseq, maskchar, leftreadhit-1, rightreadhit)
        if CEBUG : print "Masked seq:\n", maskedseq

    # partial linker hits make the masked areas grow as a safety measure
    # NOTE(review): 'linkername' here is the one of the last match in the
    # loop above -- assumes at least one SSAHA match exists for this read
    correctedseq = correct_for_smallhits(maskedseq, maskchar, linkername)

    if len(maskedseq) != len(correctedseq):
        raise RuntimeError("Internal error: maskedseq != correctedseq")

    partialhits = False
    if correctedseq != maskedseq:
        if CEBUG : print "Partial hits in", readname
        if CEBUG : print "Original seq:\n", seq
        if CEBUG : print "Masked seq:\n", maskedseq
        if CEBUG : print "Corrected seq\n", correctedseq
        partialhits = True
        readname += "_pl"
        maskedseq = correctedseq

    # cut the read at the masked stretches
    fragments = fragment_sequences(maskedseq, qual, maskchar)

    if CEBUG : print "Fragments (", len(fragments), "): ", fragments

    mlcflag = False
    #if len(ssahapematches[data['name']]) > 1:
    #    #print "Multi linker contamination"
    #    mlcflag = True
    #    readname += "_mlc"

    if len(fragments) > 2:
        if CEBUG : print "Multi linker contamination"
        mlcflag = True
        readname += "_mlc"

    #print fragments
    if mlcflag or partialhits:
        # multiple or partial linker: write each sufficiently long fragment
        # as an own hard-clipped ".part<number>" read without template info
        fragcounter = 1
        readname += ".part"
        for frag in fragments:
            actseq = frag[0]
            if len(actseq) >= 20:
                actqual = frag[1]
                oname = readname + str(fragcounter)
                write_sequence(oname,actseq,actqual,seq_fh,qual_fh)
                to_print = [create_basic_xml_info(oname,sff_fh.name)]
                # No clipping in XML ... the multiple and partial fragments
                # are clipped "hard"
                # No template ID and trace_end: we don't know the
                # orientation of the fragments. Even if it were
                # only two, the fact we had multiple linkers
                # says something went wrong, so simply do not
                # write any paired-end information for all these fragments
                to_print.append(' </trace>\n')
                xml_fh.write(''.join(to_print))
                numseqs += 1
                fragcounter += 1
    else:
        if len(fragments) >2:
            raise RuntimeError("Unexpected: more than two fragments detected in " + readname + ". please contact the authors.")
        # nothing will happen for 0 fragments
        if len(fragments) == 1:
            # linker was absent or overlapped a clip: write a single ".fn"
            # read with clip points adjusted to the unmasked stretch
            boundaries = calc_subseq_boundaries(maskedseq,maskchar)
            if len(boundaries) < 1 or len(boundaries) >3:
                raise RuntimeError("Unexpected case: ", str(len(boundaries)), "boundaries for 1 fragment of ", readname)
            if len(boundaries) == 3:
                # case: mask char on both sides of sequence
                data['clip_adapter_left']=boundaries[0][1]
                data['clip_adapter_right']=boundaries[2][0]
            elif len(boundaries) == 2:
                # case: mask char left or right of sequence
                if maskedseq[0] == maskchar :
                    # case: mask char left
                    data['clip_adapter_left']=boundaries[0][1]
                else:
                    # case: mask char right
                    data['clip_adapter_right']=boundaries[1][0]
            data['name'] = data['name'] + ".fn"
            write_unpaired_read(data, sff_fh, seq_fh, qual_fh, xml_fh)
            numseqs = 1
        elif len(fragments) == 2:
            # proper split: left side becomes the reverse-complemented ".r"
            # read, right side the forward ".f" read
            oname = readname + ".r"
            seq, qual = get_read_data(data)

            # find the first masked position after the leading unmasked run
            startsearch = False
            for spos in range(len(maskedseq)):
                if maskedseq[spos] != maskchar:
                    startsearch = True;
                else:
                    if startsearch:
                        break

            #print "\nspos: ", spos
            lseq=seq[:spos]
            #print "lseq:", lseq
            actseq = reverse_complement(lseq)
            lreadlen = len(actseq)
            lqual = qual[:spos];
            # python hack to reverse a list/string/etc
            lqual = lqual[::-1];

            write_sequence(oname,actseq,lqual,seq_fh,qual_fh)

            to_print = [create_basic_xml_info(oname,sff_fh.name)]
            # clip values are mirrored because the read was reversed
            to_print.append(create_clip_xml_info(lreadlen, 0, lreadlen+1-data['clip_adapter_left'], 0, lreadlen+1-data['clip_qual_left']));
            to_print.append(' <template_id>')
            to_print.append(readname)
            to_print.append('</template_id>\n')
            to_print.append(' <trace_end>r</trace_end>\n')
            to_print.append(' </trace>\n')
            xml_fh.write(''.join(to_print))

            oname = readname + ".f"

            # find the last masked position before the trailing unmasked run
            startsearch = False
            for spos in range(len(maskedseq)-1,-1,-1):
                if maskedseq[spos] != maskchar:
                    startsearch = True;
                else:
                    if startsearch:
                        break

            actseq = seq[spos+1:]
            actqual = qual[spos+1:];
            #print "\nspos: ", spos
            #print "rseq:", actseq

            write_sequence(oname,actseq,actqual,seq_fh,qual_fh)

            rreadlen = len(actseq)
            to_print = [create_basic_xml_info(oname,sff_fh.name)]
            # clip values shifted by the number of bases cut away on the left
            to_print.append(create_clip_xml_info(rreadlen, 0, rreadlen-(readlen-data['clip_adapter_right']), 0, rreadlen-(readlen-data['clip_qual_right'])));
            to_print.append(' <template_id>')
            to_print.append(readname)
            to_print.append('</template_id>\n')
            to_print.append(' <trace_end>f</trace_end>\n')
            to_print.append(' </trace>\n')
            xml_fh.write(''.join(to_print))
            numseqs = 2

    return numseqs
def extract_reads_from_sff(config, sff_files):
    '''Given the configuration and the list of sff_files it writes the seqs,
    qualities and ancillary data into the output file(s).

    If file for paired-end linker was given, first extracts all sequences
    of an SFF and searches these against the linker(s) with SSAHA2 to
    create needed information to split reads.

    config -- dict produced by read_config() (output names, flags, linker file)
    sff_files -- list of SFF file paths to convert

    Raises RuntimeError for an empty argument list or an empty input file.
    Side effects: writes/appends the sequence, quality and XML trace files
    named in config, and updates the module-global ssahapematches dict.
    '''
    global ssahapematches

    if len(sff_files) == 0 :
        raise RuntimeError("No SFF file given?")

    # sanity pass over all input files before any output is opened:
    # fail early on empty/unreadable files
    for sff_file in sff_files:
        if not os.path.getsize(sff_file):
            raise RuntimeError('Empty file? : ' + sff_file)
        fh = open(sff_file, 'r')
        fh.close()

    openmode = 'w'
    if config['append']:
        openmode = 'a'

    seq_fh = open(config['seq_fname'], openmode)
    xml_fh = open(config['xml_fname'], openmode)
    if config['want_fastq']:
        # FASTQ embeds qualities, so no separate quality file is written;
        # best-effort removal of a stale quality file from a previous run
        qual_fh = None
        try:
            os.remove(config['qual_fname'])
        except :
            python_formattingwithoutbracesisdumb_dummy = 1
    else:
        qual_fh = open(config['qual_fname'], openmode)

    if not config['append']:
        xml_fh.write('<?xml version="1.0"?>\n<trace_volume>\n')
    else:
        # appending: strip the previous closing tag so new <trace> entries
        # land inside the existing <trace_volume> element
        remove_last_xmltag_in_file(config['xml_fname'], "trace_volume")

    # main conversion pass over all input files
    for sff_file in sff_files:
        #print "Working on '" + sff_file + "':"
        ssahapematches.clear()

        # first 50 bases of every clipped read, used afterwards to detect
        # suspiciously identical read starts (bad left clips in the SFF)
        seqcheckstore = ([])

        debug = 0

        if not debug and config['pelinker_fname']:
            # paired-end mode: dump all reads to a temporary FASTA and run
            # SSAHA2 against the linker(s) to find linker positions
            #print "Creating temporary sequences from reads in '" + sff_file + "' ... ",
            sys.stdout.flush()

            if 0 :
                # for debugging
                pid = os.getpid()
                tmpfasta_fname = 'sffe.tmp.'+ str(pid)+'.fasta'
                tmpfasta_fh = open(tmpfasta_fname, 'w')
            else:
                tmpfasta_fh = tempfile.NamedTemporaryFile(prefix = 'sffeseqs_',
                                                          suffix = '.fasta')

            sff_fh = open(sff_file, 'rb')
            header_data = read_header(fileh=sff_fh)
            for seq_data in sequences(fileh=sff_fh, header=header_data):
                seq,qual = get_read_data(seq_data)
                seqstring, qualstring = format_as_fasta(seq_data['name'],seq,qual)
                tmpfasta_fh.write(seqstring)
                #seq, qual, anci = extract_read_info(seq_data, sff_fh.name)
                #tmpfasta_fh.write(seq)
            #print "done."
            tmpfasta_fh.seek(0)

            if 0 :
                # for debugging
                tmpssaha_fname = 'sffe.tmp.'+str(pid)+'.ssaha2'
                tmpssaha_fh = open(tmpssaha_fname, 'w+')
            else:
                tmpssaha_fh = tempfile.NamedTemporaryFile(prefix = 'sffealig_',
                                                          suffix = '.ssaha2')

            launch_ssaha(config['pelinker_fname'], tmpfasta_fh.name, tmpssaha_fh)
            tmpfasta_fh.close()

            tmpssaha_fh.seek(0)
            # fills the module-global ssahapematches dict
            read_ssaha_data(tmpssaha_fh)
            tmpssaha_fh.close()

        if debug:
            tmpssaha_fh = open("sffe.tmp.10634.ssaha2", 'r')
            read_ssaha_data(tmpssaha_fh)

        #print "Converting '" + sff_file + "' ... ",
        sys.stdout.flush()
        sff_fh = open(sff_file, 'rb')
        #read_header(infile)
        header_data = read_header(fileh=sff_fh)

        # now convert all reads; reads with a linker hit get split into
        # paired halves, all others are written as single sequences
        nseqs_sff = 0
        nseqs_out = 0
        for seq_data in sequences(fileh=sff_fh, header=header_data):
            nseqs_sff += 1

            seq, qual = clip_read(seq_data)
            seqcheckstore.append(seq[0:50])

            #if nseqs_sff >1000:
            #    check_for_dubious_startseq(seqcheckstore,sff_file,seq_data)
            #    sys.exit()

            if ssahapematches.has_key(seq_data['name']):
                #print "Paired end:",seq_data['name']
                nseqs_out += split_paired_end(seq_data, sff_fh, seq_fh, qual_fh, xml_fh)
            else:
                #print "Normal:",seq_data['name']
                if config['pelinker_fname']:
                    # ".fn" marks an unsplit read in paired-end mode
                    seq_data['name'] = seq_data['name'] + ".fn"
                write_unpaired_read(seq_data, sff_fh, seq_fh, qual_fh, xml_fh)
                nseqs_out += 1
        #print "done."
        #print 'Converted', str(nseqs_sff), 'reads into', str(nseqs_out), 'sequences.'
        sff_fh.close()

        # warn (via stdout and the stern_warning global) if the reads of
        # this file share suspiciously identical start sequences
        check_for_dubious_startseq(seqcheckstore,sff_file,seq_data)
        seqcheckstore = ([])

    xml_fh.write('</trace_volume>\n')
    xml_fh.close()
    seq_fh.close()
    if qual_fh is not None:
        qual_fh.close()

    return
  889. def check_for_dubious_startseq(seqcheckstore, sffname,seqdata):
  890. global stern_warning
  891. foundproblem = ""
  892. for checklen in range(1,len(seqcheckstore[0])):
  893. foundinloop = False
  894. seqdict = {}
  895. for seq in seqcheckstore:
  896. shortseq = seq[0:checklen]
  897. if shortseq in seqdict:
  898. seqdict[shortseq] += 1
  899. else:
  900. seqdict[shortseq] = 1
  901. for shortseq, count in seqdict.items():
  902. if float(count)/len(seqcheckstore) >= 0.5:
  903. foundinloop = True
  904. stern_warning
  905. foundproblem = "\n"+"*" * 80
  906. foundproblem += "\nWARNING: "
  907. foundproblem += "weird sequences in file " + sffname + "\n\n"
  908. foundproblem += "After applying left clips, " + str(count) + " sequences (="
  909. foundproblem += '%.0f'%(100.0*float(count)/len(seqcheckstore))
  910. foundproblem += "%) start with these bases:\n" + shortseq
  911. foundproblem += "\n\nThis does not look sane.\n\n"
  912. foundproblem += "Countermeasures you *probably* must take:\n"
  913. foundproblem += "1) Make your sequence provider aware of that problem and ask whether this can be\n corrected in the SFF.\n"
  914. foundproblem += "2) If you decide that this is not normal and your sequence provider does not\n react, use the --min_left_clip of sff_extract.\n"
  915. left,right = return_merged_clips(seqdata)
  916. foundproblem += " (Probably '--min_left_clip="+ str(left+len(shortseq))+"' but you should cross-check that)\n"
  917. foundproblem += "*" * 80 + "\n"
  918. if not foundinloop :
  919. break
  920. if len(foundproblem):
  921. print foundproblem
  922. def parse_extra_info(info):
  923. '''It parses the information that will go in the xml file.
  924. There are two formats accepted for the extra information:
  925. key1:value1, key2:value2
  926. or:
  927. file1.sff{key1:value1, key2:value2};file2.sff{key3:value3}
  928. '''
  929. if not info:
  930. return info
  931. finfos = info.split(';') #information for each file
  932. data_for_files = {}
  933. for finfo in finfos:
  934. #we split the file name from the rest
  935. items = finfo.split('{')
  936. if len(items) == 1:
  937. fname = fake_sff_name
  938. info = items[0]
  939. else:
  940. fname = items[0]
  941. info = items[1]
  942. #now we get each key,value pair in the info
  943. info = info.replace('}', '')
  944. data = {}
  945. for item in info.split(','):
  946. key, value = item.strip().split(':')
  947. key = key.strip()
  948. value = value.strip()
  949. data[key] = value
  950. data_for_files[fname] = data
  951. return data_for_files
  952. def return_merged_clips(data):
  953. '''It returns the left and right positions to clip.'''
  954. def max(a, b):
  955. '''It returns the max of the two given numbers.
  956. It won't take into account the zero values.
  957. '''
  958. if not a and not b:
  959. return None
  960. if not a:
  961. return b
  962. if not b:
  963. return a
  964. if a >= b:
  965. return a
  966. else:
  967. return b
  968. def min(a, b):
  969. '''It returns the min of the two given numbers.
  970. It won't take into account the zero values.
  971. '''
  972. if not a and not b:
  973. return None
  974. if not a:
  975. return b
  976. if not b:
  977. return a
  978. if a <= b:
  979. return a
  980. else:
  981. return b
  982. left = max(data['clip_adapter_left'], data['clip_qual_left'])
  983. right = min(data['clip_adapter_right'], data['clip_qual_right'])
  984. #maybe both clips where zero
  985. if left is None:
  986. left = 1
  987. if right is None:
  988. right = data['number_of_bases']
  989. return left, right
  990. def sequence_case(data):
  991. '''Given the data for one read it returns the seq with mixed case.
  992. The regions to be clipped will be lower case and the rest upper case.
  993. '''
  994. left, right = return_merged_clips(data)
  995. seq = data['bases']
  996. if left >= right:
  997. new_seq = seq.lower()
  998. else:
  999. new_seq = ''.join((seq[:left-1].lower(), seq[left-1:right], seq[right:].lower()))
  1000. return new_seq
  1001. def clip_read(data):
  1002. '''Given the data for one read it returns clipped seq and qual.'''
  1003. qual = data['quality_scores']
  1004. left, right = return_merged_clips(data)
  1005. seq = data['bases']
  1006. qual = data['quality_scores']
  1007. new_seq = seq[left-1:right]
  1008. new_qual = qual[left-1:right]
  1009. return new_seq, new_qual
  1010. def tests_for_ssaha():
  1011. '''Tests whether SSAHA2 can be successfully called.'''
  1012. try:
  1013. print "Testing whether SSAHA2 is installed and can be launched ... ",
  1014. sys.stdout.flush()
  1015. fh = open('/dev/null', 'w')
  1016. retcode = subprocess.call(["ssaha2"], stdout = fh)
  1017. fh.close()
  1018. print "ok."
  1019. except :
  1020. print "nope? Uh oh ...\n\n"
  1021. raise RuntimeError('Could not launch ssaha2. Have you installed it? Is it in your path?')
  1022. def load_linker_sequences(linker_fname):
  1023. '''Loads all linker sequences into memory, storing only the length
  1024. of each linker.'''
  1025. global linkerlengths
  1026. if not os.path.getsize(linker_fname):
  1027. raise RuntimeError("File empty? '" + linker_fname + "'")
  1028. fh = open(linker_fname, 'r')
  1029. linkerseqs = read_fasta(fh)
  1030. if len(linkerseqs) == 0:
  1031. raise RuntimeError(linker_fname + ": no sequence found?")
  1032. for i in linkerseqs:
  1033. if linkerlengths.has_key(i.name):
  1034. raise RuntimeError(linker_fname + ": sequence '" + i.name + "' present multiple times. Aborting.")
  1035. linkerlengths[i.name] = len(i.sequence)
  1036. fh.close()
  1037. def launch_ssaha(linker_fname, query_fname, output_fh):
  1038. '''Launches SSAHA2 on the linker and query file, string SSAHA2 output
  1039. into the output filehandle'''
  1040. tests_for_ssaha()
  1041. try:
  1042. print "Searching linker sequences with SSAHA2 (this may take a while) ... ",
  1043. sys.stdout.flush()
  1044. retcode = subprocess.call(["ssaha2", "-output", "ssaha2", "-solexa", "-kmer", "4", "-skip", "1", linker_fname, query_fname], stdout = output_fh)
  1045. if retcode:
  1046. raise RuntimeError('Ups.')
  1047. else:
  1048. print "ok."
  1049. except:
  1050. print "\n"
  1051. raise RuntimeError('An error occured during the SSAHA2 execution, aborting.')
  1052. def read_ssaha_data(ssahadata_fh):
  1053. '''Given file handle, reads file generated with SSAHA2 (with default
  1054. output format) and stores all matches as list ssahapematches
  1055. (ssaha paired-end matches) dictionary'''
  1056. global ssahapematches
  1057. print "Parsing SSAHA2 result file ... ",
  1058. sys.stdout.flush()
  1059. for line in ssahadata_fh:
  1060. if line.startswith('ALIGNMENT'):
  1061. ml = line.split()
  1062. if len(ml) != 12 :
  1063. print "\n", line,
  1064. raise RuntimeError('Expected 12 elements in the SSAHA2 line with ALIGMENT keyword, but found ' + str(len(ml)))
  1065. if not ssahapematches.has_key(ml[2]) :
  1066. ssahapematches[ml[2]] = ([])
  1067. if ml[8] == 'F':
  1068. #print line,
  1069. # store everything except the first element (output
  1070. # format name (ALIGNMENT)) and the last element
  1071. # (length)
  1072. ssahapematches[ml[2]].append(ml[1:-1])
  1073. else:
  1074. #print ml
  1075. ml[4],ml[5] = ml[5],ml[4]
  1076. #print ml
  1077. ssahapematches[ml[2]].append(ml[1:-1])
  1078. print "done."
  1079. ##########################################################################
  1080. #
  1081. # BaCh: This block was shamelessly copied from
  1082. # http://python.genedrift.org/2007/07/04/reading-fasta-files-conclusion/
  1083. # and then subsequently modified to read fasta correctly
  1084. # It's still not fool proof, but should be good enough
  1085. #
  1086. ##########################################################################
  1087. class Fasta:
  1088. def __init__(self, name, sequence):
  1089. self.name = name
  1090. self.sequence = sequence
  1091. def read_fasta(file):
  1092. items = []
  1093. aninstance = Fasta('', '')
  1094. linenum = 0
  1095. for line in file:
  1096. linenum += 1
  1097. if line.startswith(">"):
  1098. if len(aninstance.sequence):
  1099. items.append(aninstance)
  1100. aninstance = Fasta('', '')
  1101. # name == all characters until the first whitespace
  1102. # (split()[0]) but without the starting ">" ([1:])
  1103. aninstance.name = line.split()[0][1:]
  1104. aninstance.sequence = ''
  1105. if len(aninstance.name) == 0:
  1106. raise RuntimeError(file.name + ': no name in line ' + str(linenum) + '?')
  1107. else:
  1108. if len(aninstance.name) == 0:
  1109. raise RuntimeError(file.name + ': no sequence header at line ' + str(linenum) + '?')
  1110. aninstance.sequence += line.strip()
  1111. if len(aninstance.name) and len(aninstance.sequence):
  1112. items.append(aninstance)
  1113. return items
  1114. ##########################################################################
  1115. def version_string ():
  1116. return "sff_extract " + __version__
def read_config():
    '''It reads the configuration options from the command line arguments and
    it returns a dict with them.

    Returns (config, args): config is a dict of all option values (also
    stored in the module-global 'config'), args is the list of positional
    arguments (the SFF file names).
    '''
    from optparse import OptionParser, OptionGroup
    usage = "usage: %prog [options] sff1 sff2 ..."
    desc = "Extract sequences from 454 SFF files into FASTA, FASTA quality"\
           " and XML traceinfo format. When a paired-end linker sequence"\
           " is given (-l), use SSAHA2 to scan the sequences for the linker,"\
           " then split the sequences, removing the linker."
    parser = OptionParser(usage = usage, version = version_string(), description = desc)
    parser.add_option('-a', '--append', action="store_true", dest='append',
            help='append output to existing files', default=False)
    parser.add_option('-i', '--xml_info', dest='xml_info',
            help='extra info to write in the xml file')
    parser.add_option("-l", "--linker_file", dest="pelinker_fname",
            help="FASTA file with paired-end linker sequences", metavar="FILE")

    # options controlling how sequences are transformed
    group = OptionGroup(parser, "File name options","")
    group.add_option('-c', '--clip', action="store_true", dest='clip',
            help='clip (completely remove) ends with low qual and/or adaptor sequence', default=False)
    group.add_option('-u', '--upper_case', action="store_false", dest='mix_case',
            help='all bases in upper case, including clipped ends', default=True)
    group.add_option('', '--min_left_clip', dest='min_leftclip',
            metavar="INTEGER", type = "int",
            help='if the left clip coming from the SFF is smaller than this value, override it', default=0)
    group.add_option('-Q', '--fastq', action="store_true", dest='want_fastq',
            help='store as FASTQ file instead of FASTA + FASTA quality file', default=False)
    parser.add_option_group(group)

    # options controlling the output file names
    group = OptionGroup(parser, "File name options","")
    group.add_option("-o", "--out_basename", dest="basename",
            help="base name for all output files")
    group.add_option("-s", "--seq_file", dest="seq_fname",
            help="output sequence file name", metavar="FILE")
    group.add_option("-q", "--qual_file", dest="qual_fname",
            help="output quality file name", metavar="FILE")
    group.add_option("-x", "--xml_file", dest="xml_fname",
            help="output ancillary xml file name", metavar="FILE")
    parser.add_option_group(group)

    #default fnames
    #is there an sff file? then derive the default basename from it
    basename = 'reads'
    if sys.argv[-1][-4:].lower() == '.sff':
        basename = sys.argv[-1][:-4]
    def_seq_fname = basename + '.fasta'
    def_qual_fname = basename + '.fasta.qual'
    def_xml_fname = basename + '.xml'
    def_pelinker_fname = ''
    parser.set_defaults(seq_fname = def_seq_fname)
    parser.set_defaults(qual_fname = def_qual_fname)
    parser.set_defaults(xml_fname = def_xml_fname)
    parser.set_defaults(pelinker_fname = def_pelinker_fname)

    #we parse the cmd line
    (options, args) = parser.parse_args()

    #we put the result in a dict, skipping optparse's internal attributes
    global config
    config = {}
    for property in dir(options):
        if property[0] == '_' or property in ('ensure_value', 'read_file',
                                              'read_module'):
            continue
        config[property] = getattr(options, property)

    if config['basename'] is None:
        config['basename']=basename

    #if we have not set a file name with -s, -q or -x we set the basename
    #based file name
    if config['want_fastq']:
        # FASTQ mode: qualities go into the sequence file, no quality file
        config['qual_fname'] = ''
        if config['seq_fname'] == def_seq_fname:
            config['seq_fname'] = config['basename'] + '.fastq'
    else:
        if config['seq_fname'] == def_seq_fname:
            config['seq_fname'] = config['basename'] + '.fasta'
        if config['qual_fname'] == def_qual_fname:
            config['qual_fname'] = config['basename'] + '.fasta.qual'

    if config['xml_fname'] == def_xml_fname:
        config['xml_fname'] = config['basename'] + '.xml'

    #we parse the extra info for the xml file
    config['xml_info'] = parse_extra_info(config['xml_info'])
    return config, args
  1195. ##########################################################################
def testsome():
    # Developer scratch hook: exits immediately; not called in normal runs.
    sys.exit()
    return
def debug():
    # Developer-only harness: replays the linker-search pipeline against
    # hard-coded local files and a fixed pid, then exits. Not called in
    # normal runs (only from the commented-out call in main()).
    try:
        dummy = 1
        #debug()
        #testsome()

        config, args = read_config()
        load_linker_sequences(config['pelinker_fname'])

        #pid = os.getpid()
        pid = 15603

        #tmpfasta_fname = 'sffe.tmp.'+ str(pid)+'.fasta'
        #tmpfasta_fh = open(tmpfasta_fname, 'w')
        tmpfasta_fname = 'FLVI58L05.fa'
        tmpfasta_fh = open(tmpfasta_fname, 'r')

        tmpssaha_fname = 'sffe.tmp.'+str(pid)+'.ssaha2'
        tmpssaha_fh = open(tmpssaha_fname, 'w')

        launch_ssaha(config['pelinker_fname'], tmpfasta_fh.name, tmpssaha_fh)

        # re-open the SSAHA2 result for reading and parse it
        tmpssaha_fh = open("sffe.tmp.15603.ssaha2", 'r')
        read_ssaha_data(tmpssaha_fh)

        sys.exit()

        # unreachable: kept from the original debugging flow
        extract_reads_from_sff(config, args)
    except (OSError, IOError, RuntimeError), errval:
        print errval
        sys.exit()
    sys.exit()
def main():
    '''Command-line entry point.

    Returns the process exit status: 0 on success, 1 on error or when a
    stern warning (dubious read starts) was raised during extraction.
    '''
    argv = sys.argv
    if len(argv) == 1:
        # no arguments: show the help text and exit
        sys.argv.append('-h')
        read_config()
        sys.exit()
    try:
        #debug();

        config, args = read_config()
        if config['pelinker_fname']:
            #tests_for_ssaha(config['pelinker_fname'])
            load_linker_sequences(config['pelinker_fname'])
        if len(args) == 0:
            raise RuntimeError("No SFF file given?")
        extract_reads_from_sff(config, args)
    except (OSError, IOError, RuntimeError), errval:
        print errval
        return 1

    # set by check_for_dubious_startseq() when read starts look suspicious
    if stern_warning:
        return 1

    return 0
# Script entry point: propagate main()'s status code as the exit status.
if __name__ == "__main__":
    sys.exit(main())