PageRenderTime 155ms CodeModel.GetById 73ms app.highlight 68ms RepoModel.GetById 2ms app.codeStats 0ms

/tools/filters/sff_extract.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 1512 lines | 1301 code | 67 blank | 144 comment | 89 complexity | e7d96e6b8e7d717a0647332becc288e9 MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1#!/usr/bin/python
   2'''This software extracts the seq, qual and ancillary information from an sff
   3file, like the ones used by the 454 sequencer.
   4
   5Optionally, it can also split paired-end reads if given the linker sequence.
The splitting is done with maximum match, i.e., every occurrence of the linker
sequence will be removed, even if occurring multiple times.'''
   8
   9#copyright Jose Blanca and Bastien Chevreux
  10#COMAV institute, Universidad Politecnica de Valencia (UPV)
  11#Valencia, Spain
  12
  13# additions to handle paired end reads by Bastien Chevreux
  14# bugfixes for linker specific lengths: Lionel Guy
  15
  16#This program is free software: you can redistribute it and/or modify
  17#it under the terms of the GNU General Public License as published by
  18#the Free Software Foundation, either version 3 of the License, or
  19#(at your option) any later version.
  20#This program is distributed in the hope that it will be useful,
  21#but WITHOUT ANY WARRANTY; without even the implied warranty of
  22#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  23#GNU General Public License for more details.
  24#You should have received a copy of the GNU General Public License
  25#along with this program.  If not, see <http://www.gnu.org/licenses/>.
  26
  27__author__ = 'Jose Blanca and Bastien Chevreux'
  28__copyright__ = 'Copyright 2008, Jose Blanca, COMAV, and Bastien Chevreux'
  29__license__ = 'GPLv3 or later'
  30__version__ = '0.2.10'
  31__email__ = 'jblanca@btc.upv.es'
  32__status__ = 'beta'
  33
  34import struct
  35import sys
  36import os
  37import subprocess
  38import tempfile
  39
  40
# fallback key into config['xml_info'] for tags that were given on the
# command line without being tied to a specific sff file name
# (see create_basic_xml_info)
fake_sff_name = 'fake_sff_name'


# readname as key: lines with matches from SSAHA, one best match
ssahapematches = {}
# linker readname as key: length of linker sequence
linkerlengths = {}

# set to true if something really fishy is going on with the sequences
# NOTE(review): initialised to True even though the comment reads like a
#  flag that gets raised on problems -- confirm against the rest of the
#  file (this chunk is truncated)
stern_warning = True
  51
  52def read_bin_fragment(struct_def, fileh, offset=0, data=None,
  53                                                             byte_padding=None):
  54    '''It reads a chunk of a binary file.
  55
  56    You have to provide the struct, a file object, the offset (where to start
  57    reading).
  58    Also you can provide an optional dict that will be populated with the
  59    extracted data.
  60    If a byte_padding is given the number of bytes read will be a multiple of
  61    that number, adding the required pad at the end.
  62    It returns the number of bytes reads and the data dict.
  63    '''
  64    if data is None:
  65        data = {}
  66
  67    #we read each item
  68    bytes_read = 0
  69    for item in struct_def:
  70        #we go to the place and read
  71        fileh.seek(offset + bytes_read)
  72        n_bytes = struct.calcsize(item[1])
  73        buffer = fileh.read(n_bytes)
  74        read = struct.unpack('>' + item[1], buffer)
  75        if len(read) == 1:
  76            read = read[0]
  77        data[item[0]] = read
  78        bytes_read += n_bytes
  79
  80    #if there is byte_padding the bytes_to_read should be a multiple of the
  81    #byte_padding
  82    if byte_padding is not None:
  83        pad = byte_padding
  84        bytes_read = ((bytes_read + pad - 1) // pad) * pad
  85
  86    return (bytes_read, data)
  87
  88
  89def check_magic(magic):
  90    '''It checks that the magic number of the file matches the sff magic.'''
  91    if magic != 779314790:
  92        raise RuntimeError('This file does not seems to be an sff file.')
  93
  94def check_version(version):
  95    '''It checks that the version is supported, otherwise it raises an error.'''
  96    supported = ('\x00', '\x00', '\x00', '\x01')
  97    i = 0
  98    for item in version:
  99        if version[i] != supported[i]:
 100            raise RuntimeError('SFF version not supported. Please contact the author of the software.')
 101        i += 1
 102
 103def read_header(fileh):
 104    '''It reads the header from the sff file and returns a dict with the
 105    information'''
 106    #first we read the first part of the header
 107    head_struct = [
 108        ('magic_number', 'I'),
 109        ('version', 'cccc'),
 110        ('index_offset', 'Q'),
 111        ('index_length', 'I'),
 112        ('number_of_reads', 'I'),
 113        ('header_length', 'H'),
 114        ('key_length', 'H'),
 115        ('number_of_flows_per_read', 'H'),
 116        ('flowgram_format_code', 'B'),
 117    ]
 118    data = {}
 119    first_bytes, data = read_bin_fragment(struct_def=head_struct, fileh=fileh,
 120                                                            offset=0, data=data)
 121    check_magic(data['magic_number'])
 122    check_version(data['version'])
 123    #now that we know the number_of_flows_per_read and the key_length
 124    #we can read the second part of the header
 125    struct2 = [
 126        ('flow_chars', str(data['number_of_flows_per_read']) + 'c'),
 127        ('key_sequence', str(data['key_length']) + 'c')
 128    ]
 129    read_bin_fragment(struct_def=struct2, fileh=fileh, offset=first_bytes,
 130                                                                      data=data)
 131    return data
 132
 133
 134def read_sequence(header, fileh, fposition):
 135    '''It reads one read from the sff file located at the fposition and
 136    returns a dict with the information.'''
 137    header_length = header['header_length']
 138    index_offset = header['index_offset']
 139    index_length = header['index_length']
 140
 141    #the sequence struct
 142    read_header_1 = [
 143        ('read_header_length', 'H'),
 144        ('name_length', 'H'),
 145        ('number_of_bases', 'I'),
 146        ('clip_qual_left', 'H'),
 147        ('clip_qual_right', 'H'),
 148        ('clip_adapter_left', 'H'),
 149        ('clip_adapter_right', 'H'),
 150    ]
 151    def read_header_2(name_length):
 152        '''It returns the struct definition for the second part of the header'''
 153        return [('name', str(name_length) +'c')]
 154    def read_data(number_of_bases):
 155        '''It returns the struct definition for the read data section.'''
 156        #size = {'c': 1, 'B':1, 'H':2, 'I':4, 'Q':8}
 157        if header['flowgram_format_code'] == 1:
 158            flow_type = 'H'
 159        else:
 160            raise Error('file version not supported')
 161        number_of_bases = str(number_of_bases)
 162        return [
 163            ('flowgram_values', str(header['number_of_flows_per_read']) +
 164                                                                     flow_type),
 165            ('flow_index_per_base', number_of_bases + 'B'),
 166            ('bases', number_of_bases + 'c'),
 167            ('quality_scores', number_of_bases + 'B'),
 168        ]
 169
 170    data = {}
 171    #we read the first part of the header
 172    bytes_read, data = read_bin_fragment(struct_def=read_header_1,
 173                                    fileh=fileh, offset=fposition, data=data)
 174
 175    read_bin_fragment(struct_def=read_header_2(data['name_length']),
 176                          fileh=fileh, offset=fposition + bytes_read, data=data)
 177    #we join the letters of the name
 178    data['name'] = ''.join(data['name'])
 179    offset = data['read_header_length']
 180    #we read the sequence and the quality
 181    read_data_st = read_data(data['number_of_bases'])
 182    bytes_read, data = read_bin_fragment(struct_def=read_data_st,
 183                                    fileh=fileh, offset=fposition + offset,
 184                                    data=data, byte_padding=8)
 185    #we join the bases
 186    data['bases'] = ''.join(data['bases'])
 187
 188    #print data
 189    #print "pre cqr: ", data['clip_qual_right']
 190    #print "pre car: ", data['clip_adapter_right']
 191    #print "pre cql: ", data['clip_qual_left']
 192    #print "pre cal: ", data['clip_adapter_left']
 193
 194    # correct for the case the right clip is <= than the left clip
 195    # in this case, left clip is 0 are set to 0 (right clip == 0 means
 196    #  "whole sequence")
 197    if data['clip_qual_right'] <= data['clip_qual_left'] :
 198        data['clip_qual_right'] = 0
 199        data['clip_qual_left'] = 0
 200    if data['clip_adapter_right'] <= data['clip_adapter_left'] :
 201        data['clip_adapter_right'] = 0
 202        data['clip_adapter_left'] = 0
 203
 204    #the clipping section follows the NCBI's guidelines Trace Archive RFC
 205    #http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=rfc&m=doc&s=rfc
 206    #if there's no adapter clip: qual -> vector
 207    #else:  qual-> qual
 208    #       adapter -> vector
 209
 210    if not data['clip_adapter_left']:
 211        data['clip_adapter_left'], data['clip_qual_left'] = data['clip_qual_left'], data['clip_adapter_left']
 212    if not data['clip_adapter_right']:
 213        data['clip_adapter_right'], data['clip_qual_right'] = data['clip_qual_right'], data['clip_adapter_right']
 214
 215    # see whether we have to override the minimum left clips
 216    if config['min_leftclip'] > 0:
 217        if data['clip_adapter_left'] >0 and data['clip_adapter_left'] < config['min_leftclip']:
 218            data['clip_adapter_left'] = config['min_leftclip']
 219        if data['clip_qual_left'] >0 and data['clip_qual_left'] < config['min_leftclip']:
 220            data['clip_qual_left'] = config['min_leftclip']
 221
 222        
 223    #print "post cqr: ", data['clip_qual_right']
 224    #print "post car: ", data['clip_adapter_right']
 225    #print "post cql: ", data['clip_qual_left']
 226    #print "post cal: ", data['clip_adapter_left']
 227
 228
 229    # for handling the -c (clip) option gently, we already clip here
 230    #  and set all clip points to the sequence end points
 231    if config['clip']:
 232        data['bases'], data['quality_scores'] = clip_read(data)
 233
 234        data['number_of_bases']=len(data['bases'])
 235        data['clip_qual_right'] = data['number_of_bases']
 236        data['clip_adapter_right'] = data['number_of_bases']
 237        data['clip_qual_left'] = 0
 238        data['clip_adapter_left'] = 0
 239        
 240    return data['read_header_length'] + bytes_read, data
 241
 242
 243def sequences(fileh, header):
 244    '''It returns a generator with the data for each read.'''
 245    #now we can read all the sequences
 246    fposition = header['header_length']    #position in the file
 247    reads_read = 0
 248    while True:
 249        if fposition == header['index_offset']:
 250            #we have to skip the index section
 251            fposition += index_length
 252            continue
 253        else:
 254            bytes_read, seq_data = read_sequence(header=header, fileh=fileh,
 255                                                            fposition=fposition)
 256            yield seq_data
 257            fposition += bytes_read
 258            reads_read += 1
 259            if reads_read >= header['number_of_reads']:
 260                break
 261
 262
 263def remove_last_xmltag_in_file(fname, tag=None):
 264    '''Given an xml file name and a tag, it removes the last tag of the
 265    file if it matches the given tag. Tag removal is performed via file
 266    truncation.
 267    
 268    It the given tag is not the last in the file, a RunTimeError will be
 269    raised.
 270
 271    The resulting xml file will be not xml valid. This function is a hack
 272    that allows to append records to xml files in a quick and dirty way.
 273    '''
 274
 275    fh = open(fname, 'r+')
 276    #we have to read from the end to the start of the file and keep the
 277    #string enclosed by </ >
 278    i = -1
 279    last_tag = []   #the chars that form the last tag
 280    start_offset = None     #in which byte does the last tag starts?
 281    end_offset = None     #in which byte does the last tag ends?
 282    while True:
 283        fh.seek(i, 2)
 284        char = fh.read(1)
 285        if not char.isspace():
 286            last_tag.append(char)
 287        if char == '>':
 288            end_offset = i
 289        if char == '<':
 290            start_offset = i
 291            break
 292        i -= 1
 293
 294    #we have read the last tag backwards
 295    last_tag = ''.join(last_tag[::-1])
 296    #we remove the </ and >
 297    last_tag = last_tag.rstrip('>').lstrip('</')
 298    
 299    #we check that we're removing the asked tag
 300    if tag is not None and tag != last_tag:
 301        etxt=join('The given xml tag (',tag,') was not the last one in the file');
 302        raise RuntimeError(etxt)
 303
 304    # while we are at it: also remove all white spaces in that line :-)
 305    i -= 1
 306    while True:
 307        fh.seek(i, 2)
 308        char = fh.read(1)
 309        if not char == ' ' and not char == '\t':
 310            break;
 311        if fh.tell() == 1:
 312            break;
 313        i -= 1
 314
 315    fh.truncate();
 316
 317    fh.close()
 318    return last_tag
 319
 320
 321def create_basic_xml_info(readname, fname):
 322    '''Formats a number of read specific infos into XML format.
 323    Currently formated: name and the tags set from command line
 324    '''
 325    to_print = ['    <trace>\n']
 326    to_print.append('        <trace_name>')
 327    to_print.append(readname)
 328    to_print.append('</trace_name>\n')
 329
 330    #extra information
 331    #do we have extra info for this file?
 332    info = None
 333    if config['xml_info']:
 334        #with this name?
 335        if fname in config['xml_info']:
 336            info = config['xml_info'][fname]
 337        else:
 338        #with no name?
 339            try:
 340                info = config['xml_info'][fake_sff_name]
 341            except KeyError:
 342                pass
 343    #we print the info that we have
 344    if info:
 345        for key in info:
 346            to_print.append('        <' + key + '>' + info[key] + \
 347                            '</' + key +'>\n')
 348
 349    return ''.join(to_print)
 350
 351
 352def create_clip_xml_info(readlen, adapl, adapr, quall, qualr):
 353    '''Takes the clip values of the read and formats them into XML
 354    Corrects "wrong" values that might have resulted through
 355    simplified calculations earlier in the process of conversion
 356    (especially during splitting of paired-end reads)
 357    '''
 358
 359    to_print = [""]
 360
 361    # if right borders are >= to read length, they don't need
 362    #  to be printed
 363    if adapr >= readlen:
 364        adapr = 0
 365    if qualr >= readlen:
 366        qualr = 0
 367
 368    # BaCh
 369    # when called via split_paired_end(), some values may be < 0
 370    #  (when clip values were 0 previously)
 371    # instead of putting tons of if clauses for different calculations there,
 372    #  I centralise corrective measure here
 373    # set all values <0 to 0
 374
 375    if adapr < 0:
 376        adapr = 0
 377    if qualr <0:
 378        qualr = 0
 379    if adapl < 0:
 380        adapl = 0
 381    if quall <0:
 382        quall = 0
 383
 384    if quall:
 385        to_print.append('        <clip_quality_left>')
 386        to_print.append(str(quall))
 387        to_print.append('</clip_quality_left>\n')
 388    if qualr:
 389        to_print.append('        <clip_quality_right>')
 390        to_print.append(str(qualr))
 391        to_print.append('</clip_quality_right>\n')
 392    if adapl:
 393        to_print.append('        <clip_vector_left>')
 394        to_print.append(str(adapl))
 395        to_print.append('</clip_vector_left>\n')
 396    if adapr:
 397        to_print.append('        <clip_vector_right>')
 398        to_print.append(str(adapr))
 399        to_print.append('</clip_vector_right>\n')
 400    return ''.join(to_print)
 401
 402
 403def create_xml_for_unpaired_read(data, fname):
 404    '''Given the data for one read it returns an str with the xml ancillary
 405    data.'''
 406    to_print = [create_basic_xml_info(data['name'],fname)]
 407    #clippings in the XML only if we do not hard clip
 408    if not config['clip']:
 409        to_print.append(create_clip_xml_info(data['number_of_bases'],data['clip_adapter_left'], data['clip_adapter_right'], data['clip_qual_left'], data['clip_qual_right']));
 410    to_print.append('    </trace>\n')
 411    return ''.join(to_print)
 412
 413
 414def format_as_fasta(name,seq,qual):
 415    name_line = ''.join(('>', name,'\n'))
 416    seqstring = ''.join((name_line, seq, '\n'))
 417    qual_line = ' '.join([str(q) for q in qual]) 
 418    qualstring = ''.join((name_line, qual_line, '\n'))
 419    return seqstring, qualstring
 420
 421def format_as_fastq(name,seq,qual):
 422    qual_line = ''.join([chr(q+33) for q in qual]) 
 423    #seqstring = ''.join(('@', name,'\n', seq, '\n+', name,'\n', qual_line, '\n'))
 424    seqstring = ''.join(('@', name,'\n', seq, '\n+\n', qual_line, '\n'))
 425    return seqstring
 426
 427
 428def get_read_data(data):
 429    '''Given the data for one read it returns 2 strs with the fasta seq
 430    and fasta qual.'''
 431    #seq and qual
 432    if config['mix_case']:
 433        seq = sequence_case(data)
 434        qual = data['quality_scores']
 435    else :
 436        seq = data['bases']
 437        qual = data['quality_scores']
 438
 439    return seq, qual
 440
 441def extract_read_info(data, fname):
 442    '''Given the data for one read it returns 3 strs with the fasta seq, fasta
 443    qual and xml ancillary data.'''
 444
 445    seq,qual = get_read_data(data)
 446    seqstring, qualstring = format_as_fasta(data['name'],seq,qual)
 447
 448    #name_line = ''.join(('>', data['name'],'\n'))
 449    #seq = ''.join((name_line, seq, '\n'))
 450    #qual_line = ' '.join([str(q) for q in qual]) 
 451    #qual = ''.join((name_line, qual_line, '\n'))
 452
 453    xmlstring = create_xml_for_unpaired_read(data, fname)
 454
 455    return seqstring, qualstring, xmlstring
 456
 457def write_sequence(name,seq,qual,seq_fh,qual_fh):
 458    '''Write sequence and quality FASTA and FASTA qual filehandles
 459    (or into FASTQ and XML)
 460    if sequence length is 0, don't write'''
 461
 462    if len(seq) == 0 : return
 463
 464    if qual_fh is None:
 465        seq_fh.write(format_as_fastq(name,seq,qual))
 466    else:
 467        seqstring, qualstring = format_as_fasta(name,seq,qual)
 468        seq_fh.write(seqstring)
 469        qual_fh.write(qualstring)
 470    return
 471
 472def write_unpaired_read(data, sff_fh, seq_fh, qual_fh, xml_fh):
 473    '''Writes an unpaired read into FASTA, FASTA qual and XML filehandles
 474    (or into FASTQ and XML)
 475    if sequence length is 0, don't write'''
 476
 477    seq,qual = get_read_data(data)
 478    if len(seq) == 0 : return
 479
 480    write_sequence(data['name'],seq,qual,seq_fh,qual_fh)
 481
 482    anci = create_xml_for_unpaired_read(data, sff_fh.name)
 483    if anci is not None:
 484        xml_fh.write(anci)
 485    return
 486
 487
 488def reverse_complement(seq):
 489    '''Returns the reverse complement of a DNA sequence as string'''
 490
 491    compdict = {
 492        'a': 't', 
 493        'c': 'g',
 494        'g': 'c',
 495        't': 'a',
 496        'u': 't',
 497        'm': 'k',
 498        'r': 'y',
 499        'w': 'w',
 500        's': 's',
 501        'y': 'r',
 502        'k': 'm',
 503        'v': 'b',
 504        'h': 'd',
 505        'd': 'h',
 506        'b': 'v',
 507        'x': 'x',
 508        'n': 'n',
 509        'A': 'T',
 510        'C': 'G',
 511        'G': 'C',
 512        'T': 'A',
 513        'U': 'T',
 514        'M': 'K',
 515        'R': 'Y',
 516        'W': 'W',
 517        'S': 'S',
 518        'Y': 'R',
 519        'K': 'M',
 520        'V': 'B',
 521        'H': 'D',
 522        'D': 'H',
 523        'B': 'V',
 524        'X': 'X',
 525        'N': 'N', 
 526        '*': '*'
 527        }
 528
 529    complseq = ''.join([compdict[base] for base in seq])
 530    # python hack to reverse a list/string/etc
 531    complseq = complseq[::-1]
 532    return complseq
 533
 534
 535def mask_sequence(seq, maskchar, fpos, tpos):
 536    '''Given a sequence, mask it with maskchar starting at fpos (including) and
 537    ending at tpos (excluding)
 538    '''
 539
 540    if len(maskchar) > 1:
 541        raise RuntimeError("Internal error: more than one character given to mask_sequence")
 542    if fpos<0:
 543        fpos = 0
 544    if tpos > len(seq):
 545        tpos = len(seq)
 546
 547    newseq = ''.join((seq[:fpos],maskchar*(tpos-fpos), seq[tpos:]))
 548
 549    return newseq
 550
 551
def fragment_sequences(sequence, qualities, splitchar):
    '''Works like split() on strings, except it does this on a sequence
    and the corresponding list with quality values.

    sequence: the (possibly masked) base string.
    qualities: list of quality values, one per base.
    splitchar: the single masking character separating the fragments.

    Returns a list with one (fragment string, fragment quality list)
    tuple per unmasked stretch of the sequence.
    Raises RuntimeError if sequence and qualities differ in length.
    '''

    # this is slow (due to zip and list appends... use an iterator over
    #  the sequence find find variations and splices on seq and qual

    if len(sequence) != len(qualities):
        print sequence, qualities
        raise RuntimeError("Internal error: length of sequence and qualities don't match???")

    retlist = ([])
    if len(sequence) == 0:
        return retlist

    #actseq/actqual accumulate the fragment currently being collected;
    #inseq tracks whether we are inside a fragment or inside a masked run
    actseq = ([])
    actqual = ([])
    if sequence[0] != splitchar:
        inseq = True
    else:
        inseq = False
    for char,qual in zip(sequence,qualities):
        if inseq:
            if char != splitchar:
                actseq.append(char)
                actqual.append(qual)
            else:
                #fragment ended: store it and start collecting anew
                retlist.append((''.join(actseq), actqual))
                actseq = ([])
                actqual = ([])
                inseq = False
        else:
            if char != splitchar:
                inseq = True
                actseq.append(char)
                actqual.append(qual)

    #flush the last fragment if the sequence did not end on a mask char
    if inseq and len(actseq):
        retlist.append((''.join(actseq), actqual))
        
    return retlist
 595
 596
 597def calc_subseq_boundaries(maskedseq, maskchar):
 598    '''E.g.:
 599       ........xxxxxxxx..........xxxxxxxxxxxxxxxxxxxxx.........
 600       to
 601         (0,8),(8,16),(16,26),(26,47),(47,56)
 602    '''
 603
 604    blist = ([])
 605    if len(maskedseq) == 0:
 606        return blist
 607
 608    inmask = True
 609    if maskedseq[0] != maskchar:
 610        inmask = False
 611
 612    start = 0
 613    for spos in range(len(maskedseq)):
 614        if inmask and maskedseq[spos] != maskchar:
 615            blist.append(([start,spos]))
 616            start = spos
 617            inmask = False
 618        elif not inmask and maskedseq[spos] == maskchar:
 619            blist.append(([start,spos]))
 620            start = spos
 621            inmask = True
 622
 623    blist.append(([start,spos+1]))
 624
 625    return blist
 626
 627
def correct_for_smallhits(maskedseq, maskchar, linkername):
    '''If partial hits were found, take preventive measure: grow
        the masked areas by 20 bases in each direction
       Returns either unchanged "maskedseq" or a new sequence
        with some more characters masked.

    maskedseq: sequence with linker hits already masked by maskchar.
    linkername: key into the module-global linkerlengths dict.
    '''
    global linkerlengths

    # debug switch; the prints below use Python 2 statement syntax
    CEBUG = 0

    if CEBUG : print "correct_for_smallhits"
    if CEBUG : print "Masked seq\n", maskedseq
    if CEBUG : print "Linkername: ", linkername
    
    if len(maskedseq) == 0:
        return maskedseq

    # total growth of a masked area; growl/2 is Python 2 integer
    #  division, i.e. 20 bases on each side
    growl=40
    growl2=growl/2

    boundaries = calc_subseq_boundaries(maskedseq,maskchar)
    if CEBUG : print "Boundaries: ", boundaries

    # a masked stretch strictly inside the read that is noticeably
    #  shorter than the linker counts as a partial hit
    foundpartial = False
    for bounds in boundaries:
        if CEBUG : print "\tbounds: ", bounds
        left, right = bounds
        if left != 0 and right != len(maskedseq):
            if maskedseq[left] == maskchar:
                # allow 10% discrepancy
                #    -linkerlengths[linkername]/10
                # that's a kind of safety net if there are slight sequencing 
                #  errors in the linker itself 
                if right-left < linkerlengths[linkername]-linkerlengths[linkername]/10:
                    if CEBUG : print "\t\tPartial: found " + str(right-left) + " gaps, " + linkername + " is " + str(linkerlengths[linkername]) + " nt long."
                    foundpartial = True

    if not foundpartial:
        return maskedseq

    # grow: rebuild the sequence, shrinking each unmasked stretch by
    #  growl2 on every side that touches a masked stretch
    newseq = ""
    for bounds in boundaries:
        if CEBUG : print "Bounds: ", bounds
        left, right = bounds
        if maskedseq[left] == maskchar:
            # masked stretches are copied through unchanged
            newseq += maskedseq[left:right]
        else:
            # the read boundaries themselves are not grown into
            clearstart = 0
            if left > 0 :
                clearstart = left+growl2
            clearstop = len(maskedseq)
            if right < len(maskedseq):
                clearstop = right-growl2

            if CEBUG : print "clearstart, clearstop: ",clearstart, clearstop

            if clearstop <= clearstart:
                # stretch too short: mask it completely
                newseq += maskchar * (right-left)
            else:
                if clearstart != left:
                    newseq += maskchar * growl2
                newseq += maskedseq[clearstart:clearstop]
                if clearstop != right:
                    newseq += maskchar * growl2
            
        #print "newseq\n",newseq

    return newseq
 697
 698
 699def split_paired_end(data, sff_fh, seq_fh, qual_fh, xml_fh):
 700    '''Splits a paired end read and writes sequences into FASTA, FASTA qual
 701    and XML traceinfo file. Returns the number of sequences created.
 702
 703    As the linker sequence may be anywhere in the read, including the ends
 704    and overlapping with bad quality sequence, we need to perform some
 705    computing and eventually set new clip points.
 706
 707    If the resulting split yields only one sequence (because linker
 708    was not present or overlapping with left or right clip), only one
 709    sequence will be written with ".fn" appended to the name.
 710
 711    If the read can be split, two reads will be written. The side left of
 712    the linker will be named ".r" and will be written in reverse complement
 713    into the file to conform with what approximately all assemblers expect
 714    when reading paired-end data: reads in forward direction in file. The side
 715    right of the linker will be named ".f"
 716
 717    If SSAHA found partial linker (linker sequences < length of linker),
 718    the sequences will get a "_pl" furthermore be cut back thoroughly.
 719
 720    If SSAHA found multiple occurences of the linker, the names will get an
 721    additional "_mlc" within the name to show that there was "multiple
 722    linker contamination".
 723
 724    For multiple or partial linker, the "good" parts of the reads are
 725    stored with a ".part<number>" name, additionally they will not get
 726    template information in the XML
 727    '''
 728
 729    global ssahapematches
 730
 731    CEBUG = 0
 732
 733    maskchar = "#"
 734
 735    if CEBUG : print "Need to split: " + data['name']
 736
 737    numseqs = 0;
 738    readname = data['name']
 739    readlen = data['number_of_bases']
 740
 741    leftclip, rightclip = return_merged_clips(data)
 742    seq, qual = get_read_data(data)
 743
 744    if CEBUG : print "Original read:\n",seq
 745    
 746    maskedseq = seq
 747    if leftclip > 0:
 748        maskedseq = mask_sequence(maskedseq, maskchar, 0, leftclip-1)
 749    if rightclip < len(maskedseq):
 750        maskedseq = mask_sequence(maskedseq, maskchar, rightclip, len(maskedseq))
 751    
 752    leftclip, rightclip = return_merged_clips(data)
 753    readlen = data['number_of_bases']
 754    
 755    if CEBUG : print "Readname:", readname
 756    if CEBUG : print "Readlen:", readlen
 757    if CEBUG : print "Num matches:", str(len(ssahapematches[data['name']]))
 758    if CEBUG : print "matches:", ssahapematches[data['name']]
 759
 760    for match in ssahapematches[data['name']]:
 761        score = int(match[0])
 762        linkername = match[2]
 763        leftreadhit = int(match[3])
 764        rightreadhit = int(match[4])
 765        #leftlinkerhit = int(match[5])
 766        #rightlinkerhit = int(match[6])
 767        #direction = match[7]
 768        #hitlen = int(match[8])
 769        #hitidentity = float(match[9])
 770
 771        if CEBUG : print match
 772        if CEBUG : print "Match with score:", score
 773        if CEBUG : print "Read before:\n", maskedseq
 774        maskedseq = mask_sequence(maskedseq, maskchar, leftreadhit-1, rightreadhit)
 775        if CEBUG : print "Masked seq:\n", maskedseq
 776
 777    correctedseq = correct_for_smallhits(maskedseq, maskchar, linkername)
 778
 779    if len(maskedseq) != len(correctedseq):
 780        raise RuntimeError("Internal error: maskedseq != correctedseq")
 781
 782    partialhits = False
 783    if correctedseq != maskedseq:
 784        if CEBUG : print "Partial hits in", readname
 785        if CEBUG : print "Original seq:\n", seq
 786        if CEBUG : print "Masked seq:\n", maskedseq
 787        if CEBUG : print "Corrected seq\n", correctedseq
 788        partialhits = True
 789        readname += "_pl"
 790        maskedseq = correctedseq
 791
 792    fragments = fragment_sequences(maskedseq, qual, maskchar)
 793
 794    if CEBUG : print "Fragments (", len(fragments), "): ", fragments
 795
 796    mlcflag = False
 797    #if len(ssahapematches[data['name']]) > 1:
 798    #    #print "Multi linker contamination"
 799    #    mlcflag = True
 800    #    readname += "_mlc"
 801
 802    if len(fragments) > 2:
 803        if CEBUG : print "Multi linker contamination"
 804        mlcflag = True
 805        readname += "_mlc"
 806
 807
 808    #print fragments
 809    if mlcflag or partialhits:
 810        fragcounter = 1
 811        readname += ".part"
 812        for frag in fragments:
 813            actseq = frag[0]
 814            if len(actseq) >= 20:
 815                actqual = frag[1]
 816                oname = readname + str(fragcounter)
 817                #seq_fh.write(">"+oname+"\n")
 818                #seq_fh.write(actseq+"\n")
 819                #qual_fh.write(">"+oname+"\n")
 820                #qual_fh.write(' '.join((str(q) for q in actqual)))
 821                #qual_fh.write("\n")
 822                write_sequence(oname,actseq,actqual,seq_fh,qual_fh)
 823                to_print = [create_basic_xml_info(oname,sff_fh.name)]
 824                # No clipping in XML ... the multiple and partial fragments
 825                #  are clipped "hard"
 826                # No template ID and trace_end: we don't know the
 827                #  orientation of the frahments. Even if it were
 828                #  only two, the fact we had multiple linkers
 829                #  says something went wrong, so simply do not
 830                #  write any paired-end information for all these fragments
 831                to_print.append('    </trace>\n')
 832                xml_fh.write(''.join(to_print))
 833                numseqs += 1
 834                fragcounter += 1
 835    else:
 836        if len(fragments) >2:
 837            raise RuntimeError("Unexpected: more than two fragments detected in " + readname + ". please contact the authors.")
 838        # nothing will happen for 0 fragments
 839        if len(fragments) == 1:
 840            #print "Tada1"
 841            boundaries = calc_subseq_boundaries(maskedseq,maskchar)
 842            if len(boundaries) < 1 or len(boundaries) >3:
 843                raise RuntimeError("Unexpected case: ", str(len(boundaries)), "boundaries for 1 fragment of ", readname)
 844            if len(boundaries) == 3:
 845                # case: mask char on both sides of sequence
 846                #print "bounds3"
 847                data['clip_adapter_left']=boundaries[0][1]
 848                data['clip_adapter_right']=boundaries[2][0]
 849            elif len(boundaries) == 2:
 850                # case: mask char left or right of sequence
 851                #print "bounds2",
 852                if maskedseq[0] == maskchar :
 853                    # case: mask char left
 854                    #print "left"
 855                    data['clip_adapter_left']=boundaries[0][1]
 856                else:
 857                    # case: mask char right
 858                    #print "right"
 859                    data['clip_adapter_right']=boundaries[1][0]
 860            data['name'] = data['name'] + ".fn"
 861            write_unpaired_read(data, sff_fh, seq_fh, qual_fh, xml_fh)
 862            numseqs = 1
 863        elif len(fragments) == 2:
 864            #print "Tada2"
 865            oname = readname + ".r"
 866            seq, qual = get_read_data(data)
 867
 868            startsearch = False
 869            for spos in range(len(maskedseq)):
 870                if maskedseq[spos] != maskchar:
 871                    startsearch = True;
 872                else:
 873                    if startsearch:
 874                        break
 875
 876            #print "\nspos: ", spos
 877            lseq=seq[:spos]
 878            #print "lseq:", lseq
 879            actseq = reverse_complement(lseq)
 880            lreadlen = len(actseq)
 881            lqual = qual[:spos];
 882            # python hack to reverse a list/string/etc
 883            lqual = lqual[::-1];
 884
 885            #seq_fh.write(">"+oname+"\n")
 886            #seq_fh.write(actseq+"\n")
 887            #qual_fh.write(">"+oname+"\n")
 888            #qual_fh.write(' '.join((str(q) for q in lqual)))
 889            #qual_fh.write("\n")
 890
 891            write_sequence(oname,actseq,lqual,seq_fh,qual_fh)
 892
 893            to_print = [create_basic_xml_info(oname,sff_fh.name)]
 894            to_print.append(create_clip_xml_info(lreadlen, 0, lreadlen+1-data['clip_adapter_left'], 0, lreadlen+1-data['clip_qual_left']));
 895            to_print.append('        <template_id>')
 896            to_print.append(readname)
 897            to_print.append('</template_id>\n')
 898            to_print.append('        <trace_end>r</trace_end>\n')
 899            to_print.append('    </trace>\n')
 900            xml_fh.write(''.join(to_print))
 901
 902            oname = readname + ".f"
 903            startsearch = False
 904            for spos in range(len(maskedseq)-1,-1,-1):
 905                if maskedseq[spos] != maskchar:
 906                    startsearch = True;
 907                else:
 908                    if startsearch:
 909                        break
 910
 911            actseq = seq[spos+1:]
 912            actqual = qual[spos+1:];
 913        
 914            #print "\nspos: ", spos
 915            #print "rseq:", actseq
 916
 917            #seq_fh.write(">"+oname+"\n")
 918            #seq_fh.write(actseq+"\n")
 919            #qual_fh.write(">"+oname+"\n")
 920            #qual_fh.write(' '.join((str(q) for q in actqual)))
 921            #qual_fh.write("\n")
 922            write_sequence(oname,actseq,actqual,seq_fh,qual_fh)
 923            
 924            rreadlen = len(actseq)
 925            to_print = [create_basic_xml_info(oname,sff_fh.name)]
 926            to_print.append(create_clip_xml_info(rreadlen, 0, rreadlen-(readlen-data['clip_adapter_right']), 0, rreadlen-(readlen-data['clip_qual_right'])));
 927            to_print.append('        <template_id>')
 928            to_print.append(readname)
 929            to_print.append('</template_id>\n')
 930            to_print.append('        <trace_end>f</trace_end>\n')
 931            to_print.append('    </trace>\n')
 932            xml_fh.write(''.join(to_print))
 933            numseqs = 2
 934
 935    return numseqs
 936
 937
 938
def extract_reads_from_sff(config, sff_files):
    '''Given the configuration and the list of sff_files it writes the seqs,
    qualities and ancillary data into the output file(s).

    If file for paired-end linker was given, first extracts all sequences
    of an SFF and searches these against the linker(s) with SSAHA2 to
    create needed information to split reads.
    '''

    global ssahapematches

    
    if len(sff_files) == 0 :
        raise RuntimeError("No SFF file given?")

    #we go through all input files
    # pre-flight check: every SFF must exist, be non-empty and be openable
    # before any output file is touched
    for sff_file in sff_files:
        if not os.path.getsize(sff_file):
            raise RuntimeError('Empty file? : ' + sff_file)
        fh = open(sff_file, 'r')
        fh.close()

    # append mode extends previously written output instead of replacing it
    openmode = 'w'
    if config['append']:
        openmode = 'a'

    seq_fh = open(config['seq_fname'], openmode)
    xml_fh = open(config['xml_fname'], openmode)
    if config['want_fastq']:
        # FASTQ embeds the qualities, so no separate .qual file is written;
        # remove any stale one (best effort, failure deliberately ignored)
        qual_fh = None
        try:
            os.remove(config['qual_fname'])
        except :
            python_formattingwithoutbracesisdumb_dummy = 1
    else:
        qual_fh = open(config['qual_fname'], openmode)

    if not config['append']:
        xml_fh.write('<?xml version="1.0"?>\n<trace_volume>\n')
    else:
        # when appending, reopen the existing XML document by stripping
        # its closing </trace_volume> tag; it is re-added at the end
        remove_last_xmltag_in_file(config['xml_fname'], "trace_volume")

    #we go through all input files
    for sff_file in sff_files:
        #print "Working on '" + sff_file + "':"
        # linker matches are per-SFF; reset before each file
        ssahapematches.clear()

        # collects the first 50 clipped bases of every read, used at the
        # end of the file to detect suspicious common start sequences
        seqcheckstore = ([])

        debug = 0

        if not debug and config['pelinker_fname']:
            #print "Creating temporary sequences from reads in '" + sff_file + "' ... ",
            sys.stdout.flush()

            if 0 :
                # for debugging
                pid = os.getpid()
                tmpfasta_fname = 'sffe.tmp.'+ str(pid)+'.fasta'
                tmpfasta_fh = open(tmpfasta_fname, 'w')
            else:
                tmpfasta_fh = tempfile.NamedTemporaryFile(prefix = 'sffeseqs_',
                                                          suffix = '.fasta')

            # first pass over the SFF: dump all reads as FASTA so SSAHA2
            # can scan them for the paired-end linker(s)
            sff_fh = open(sff_file, 'rb')
            header_data = read_header(fileh=sff_fh)
            for seq_data in sequences(fileh=sff_fh, header=header_data):
                seq,qual = get_read_data(seq_data)
                seqstring, qualstring = format_as_fasta(seq_data['name'],seq,qual)
                tmpfasta_fh.write(seqstring)
                #seq, qual, anci = extract_read_info(seq_data, sff_fh.name)
                #tmpfasta_fh.write(seq)
            #print "done."
            tmpfasta_fh.seek(0)

            if 0 :
                # for debugging
                tmpssaha_fname = 'sffe.tmp.'+str(pid)+'.ssaha2'
                tmpssaha_fh = open(tmpssaha_fname, 'w+')
            else:
                tmpssaha_fh = tempfile.NamedTemporaryFile(prefix = 'sffealig_',
                                                          suffix = '.ssaha2')

            # run SSAHA2; its matches are then parsed into the global
            # ssahapematches dictionary by read_ssaha_data()
            launch_ssaha(config['pelinker_fname'], tmpfasta_fh.name, tmpssaha_fh)
            tmpfasta_fh.close()

            tmpssaha_fh.seek(0)
            read_ssaha_data(tmpssaha_fh)
            tmpssaha_fh.close()

        if debug:
            tmpssaha_fh = open("sffe.tmp.10634.ssaha2", 'r')
            read_ssaha_data(tmpssaha_fh)

        #print "Converting '" + sff_file + "' ... ",
        sys.stdout.flush()
        # second pass over the SFF: actually convert and write the reads
        sff_fh = open(sff_file, 'rb')
        #read_header(infile)
        header_data = read_header(fileh=sff_fh)

        #now convert all reads
        nseqs_sff = 0
        nseqs_out = 0
        for seq_data in sequences(fileh=sff_fh, header=header_data):
            nseqs_sff += 1

            seq, qual = clip_read(seq_data)
            seqcheckstore.append(seq[0:50])

            #if nseqs_sff >1000:
            #    check_for_dubious_startseq(seqcheckstore,sff_file,seq_data)
            #    sys.exit()

            # reads with a linker hit are split into paired-end parts
            # (possibly more than one sequence); all others are written
            # out unchanged, with a ".fn" suffix when linker mode is on
            if ssahapematches.has_key(seq_data['name']):
                #print "Paired end:",seq_data['name']
                nseqs_out += split_paired_end(seq_data, sff_fh, seq_fh, qual_fh, xml_fh)
            else:
                #print "Normal:",seq_data['name']
                if config['pelinker_fname']:
                    seq_data['name'] = seq_data['name'] + ".fn"
                write_unpaired_read(seq_data, sff_fh, seq_fh, qual_fh, xml_fh)
                nseqs_out += 1
        #print "done."
        #print 'Converted', str(nseqs_sff), 'reads into', str(nseqs_out), 'sequences.'
        sff_fh.close()

        check_for_dubious_startseq(seqcheckstore,sff_file,seq_data)
        seqcheckstore = ([])

    xml_fh.write('</trace_volume>\n')

    xml_fh.close()
    seq_fh.close()
    if qual_fh is not None:
        qual_fh.close()

    return
1076
def check_for_dubious_startseq(seqcheckstore, sffname, seqdata):
    '''Warns (on stdout) when many reads of an SFF share an identical prefix.

    seqcheckstore -- list of the first bases (up to 50) of every clipped read
    sffname       -- name of the SFF file the reads came from
    seqdata       -- data dict of one read, used to suggest a --min_left_clip

    For growing prefix lengths, if 50% or more of the reads start with the
    same bases, a prominent warning with a suggested countermeasure is
    printed and the global stern_warning flag is raised.
    '''

    global stern_warning

    # Nothing to check, e.g. an SFF file that yielded no sequences.
    # The original crashed here with an IndexError on seqcheckstore[0].
    if not seqcheckstore:
        return

    foundproblem = ""
    for checklen in range(1, len(seqcheckstore[0])):
        foundinloop = False

        # count how often each prefix of length 'checklen' occurs
        seqdict = {}
        for seq in seqcheckstore:
            shortseq = seq[0:checklen]
            if shortseq in seqdict:
                seqdict[shortseq] += 1
            else:
                seqdict[shortseq] = 1

        for shortseq, count in seqdict.items():
            if float(count) / len(seqcheckstore) >= 0.5:
                foundinloop = True
                # Bugfix: the original evaluated the bare name
                # 'stern_warning' (a no-op expression), so the module
                # level flag was never actually set.
                stern_warning = True
                foundproblem = "\n" + "*" * 80
                foundproblem += "\nWARNING: "
                foundproblem += "weird sequences in file " + sffname + "\n\n"
                foundproblem += "After applying left clips, " + str(count) + " sequences (="
                foundproblem += '%.0f' % (100.0 * float(count) / len(seqcheckstore))
                foundproblem += "%) start with these bases:\n" + shortseq
                foundproblem += "\n\nThis does not look sane.\n\n"
                foundproblem += "Countermeasures you *probably* must take:\n"
                foundproblem += "1) Make your sequence provider aware of that problem and ask whether this can be\n    corrected in the SFF.\n"
                foundproblem += "2) If you decide that this is not normal and your sequence provider does not\n    react, use the --min_left_clip of sff_extract.\n"
                left, right = return_merged_clips(seqdata)
                foundproblem += "    (Probably '--min_left_clip=" + str(left + len(shortseq)) + "' but you should cross-check that)\n"
                foundproblem += "*" * 80 + "\n"
        if not foundinloop:
            # once no prefix of this length is common, longer ones cannot be
            break
    if len(foundproblem):
        print(foundproblem)
1113            
1114
def parse_extra_info(info):
    '''It parses the information that will go in the xml file.

    There are two formats accepted for the extra information:
    key1:value1, key2:value2
    or:
    file1.sff{key1:value1, key2:value2};file2.sff{key3:value3}

    Returns a dict mapping each file name (or the fake default name when
    no file name was given) to a dict of its key/value pairs.
    '''
    if not info:
        return info

    data_for_files = {}
    # one chunk per file, separated by ';'
    for file_chunk in info.split(';'):
        pieces = file_chunk.split('{')
        if len(pieces) == 1:
            # bare "key:value" list without a file name
            fname = fake_sff_name
            pair_text = pieces[0]
        else:
            fname = pieces[0]
            pair_text = pieces[1]
        pair_text = pair_text.replace('}', '')

        # collect every "key:value" pair of this chunk
        entries = {}
        for pair in pair_text.split(','):
            key, value = pair.strip().split(':')
            entries[key.strip()] = value.strip()
        data_for_files[fname] = entries
    return data_for_files
1146
1147
def return_merged_clips(data):
    '''It returns the left and right positions to clip.

    The adapter and quality clips are merged: the larger of the two left
    clips and the smaller of the two right clips win.  A clip value of
    zero (or None) means "no clip set" and is ignored; if neither side
    has a clip, the defaults are 1 and the read length.
    '''
    def _merge(a, b, prefer_larger):
        '''Combine two clip values, ignoring zero/None ("not set").'''
        if not a and not b:
            return None
        if not a:
            return b
        if not b:
            return a
        if prefer_larger:
            return a if a >= b else b
        return a if a <= b else b

    left = _merge(data['clip_adapter_left'], data['clip_qual_left'], True)
    right = _merge(data['clip_adapter_right'], data['clip_qual_right'], False)

    # maybe both clips were zero: fall back to the full read
    if left is None:
        left = 1
    if right is None:
        right = data['number_of_bases']
    return left, right
1188
def sequence_case(data):
    '''Given the data for one read it returns the seq with mixed case.

    Bases outside the merged clip region are lower-cased; the region
    between the clips keeps its original (upper) case.  If the clips
    overlap completely, the whole sequence is lower case.
    '''
    left, right = return_merged_clips(data)
    seq = data['bases']
    if left >= right:
        return seq.lower()
    # lower-case head and tail, untouched middle (positions are 1-based)
    head = seq[:left - 1].lower()
    middle = seq[left - 1:right]
    tail = seq[right:].lower()
    return head + middle + tail
1202
def clip_read(data):
    '''Given the data for one read it returns clipped seq and qual.

    The clip positions come from return_merged_clips() and are 1-based
    and inclusive, hence the left-1 slice start.
    '''
    # cleanup: the original read data['quality_scores'] twice (dead code)
    left, right = return_merged_clips(data)
    seq = data['bases']
    qual = data['quality_scores']
    new_seq = seq[left - 1:right]
    new_qual = qual[left - 1:right]

    return new_seq, new_qual
1214
1215
1216
1217def tests_for_ssaha():
1218    '''Tests whether SSAHA2 can be successfully called.'''
1219    
1220    try:
1221        print "Testing whether SSAHA2 is installed and can be launched ... ",
1222        sys.stdout.flush()
1223        fh = open('/dev/null', 'w')
1224        retcode = subprocess.call(["ssaha2"], stdout = fh)
1225        fh.close()
1226        print "ok."
1227    except :
1228        print "nope? Uh oh ...\n\n"
1229        raise RuntimeError('Could not launch ssaha2. Have you installed it? Is it in your path?')
1230
1231
def load_linker_sequences(linker_fname):
    '''Loads all linker sequences into memory, storing only the length
    of each linker, keyed by linker name in the global linkerlengths.

    Raises RuntimeError for an empty file, a FASTA file without any
    sequence, or duplicated linker names.
    '''

    global linkerlengths

    if not os.path.getsize(linker_fname):
        raise RuntimeError("File empty? '" + linker_fname + "'")
    fh = open(linker_fname, 'r')
    try:
        linkerseqs = read_fasta(fh)
    finally:
        # bugfix: the original leaked the handle when read_fasta() raised
        fh.close()
    if len(linkerseqs) == 0:
        raise RuntimeError(linker_fname + ": no sequence found?")
    for linker in linkerseqs:
        # dict.has_key() was removed in Python 3; 'in' works everywhere
        if linker.name in linkerlengths:
            raise RuntimeError(linker_fname + ": sequence '" + linker.name + "' present multiple times. Aborting.")
        linkerlengths[linker.name] = len(linker.sequence)
1249
1250
def launch_ssaha(linker_fname, query_fname, output_fh):
    '''Launches SSAHA2 on the linker and query file, streaming SSAHA2
    output into the output filehandle.

    Raises RuntimeError when ssaha2 cannot be started or exits with a
    non-zero return code.
    '''

    tests_for_ssaha()

    print("Searching linker sequences with SSAHA2 (this may take a while) ... ")
    sys.stdout.flush()
    try:
        retcode = subprocess.call(["ssaha2", "-output", "ssaha2", "-solexa", "-kmer", "4", "-skip", "1", linker_fname, query_fname], stdout = output_fh)
    except (OSError, IOError):
        # narrowed from a bare 'except:'; the original also used an
        # internal RuntimeError('Ups.') that it immediately caught and
        # re-raised, obscuring the real failure path
        print("\n")
        raise RuntimeError('An error occured during the SSAHA2 execution, aborting.')
    if retcode:
        print("\n")
        raise RuntimeError('An error occured during the SSAHA2 execution, aborting.')
    print("ok.")
1268
def read_ssaha_data(ssahadata_fh):
    '''Given file handle, reads file generated with SSAHA2 (with default
    output format) and stores all matches as list ssahapematches
    (ssaha paired-end matches) dictionary.

    Each stored entry drops the leading output-format tag ('ALIGNMENT')
    and the trailing length field; for reverse ('R') matches the start
    and end coordinates are swapped so they are always in forward order.
    '''

    global ssahapematches

    print("Parsing SSAHA2 result file ... ")
    sys.stdout.flush()

    for line in ssahadata_fh:
        if not line.startswith('ALIGNMENT'):
            continue
        ml = line.split()
        if len(ml) != 12:
            sys.stdout.write("\n" + line)
            # message typo fixed: 'ALIGMENT' -> 'ALIGNMENT'
            raise RuntimeError('Expected 12 elements in the SSAHA2 line with ALIGNMENT keyword, but found ' + str(len(ml)))
        if ml[8] != 'F':
            # reverse match: swap start/end into forward orientation
            ml[4], ml[5] = ml[5], ml[4]
        # setdefault replaces the Python-2-only has_key() check and the
        # duplicated append present in both branches of the original
        ssahapematches.setdefault(ml[2], []).append(ml[1:-1])

    print("done.")
1301
1302
1303##########################################################################
1304#
1305# BaCh: This block was shamelessly copied from
1306#  http://python.genedrift.org/2007/07/04/reading-fasta-files-conclusion/
1307# and then subsequently modified to read fasta correctly
1308# It's still not fool proof, but should be good enough
1309#
1310##########################################################################
1311
class Fasta:
    '''A minimal FASTA record: a sequence name plus its bases.'''
    def __init__(self, name, sequence):
        self.name = name
        self.sequence = sequence

def read_fasta(file):
    '''Parses FASTA records from an open file (or any iterable of lines)
    and returns them as a list of Fasta objects.

    The record name is everything between ">" and the first whitespace.
    Raises RuntimeError for a header without a name or for sequence data
    appearing before the first header.
    '''
    records = []
    current = Fasta('', '')
    lineno = 0
    for line in file:
        lineno += 1
        if not line.startswith(">"):
            # plain sequence line: must belong to an open record
            if len(current.name) == 0:
                raise RuntimeError(file.name + ': no sequence header at line ' + str(lineno) + '?')
            current.sequence += line.strip()
            continue
        # header line: flush the previous record if it collected sequence
        if len(current.sequence):
            records.append(current)
            current = Fasta('', '')
        # name == all characters until the first whitespace (split()[0])
        # but without the starting ">" ([1:])
        current.name = line.split()[0][1:]
        current.sequence = ''
        if len(current.name) == 0:
            raise RuntimeError(file.name + ': no name in line ' + str(lineno) + '?')

    # flush the trailing record, if it is complete
    if len(current.name) and len(current.sequence):
        records.append(current)

    return records
1343##########################################################################
1344
def version_string():
    '''Return the program name followed by its version number.'''
    return "sff_extract %s" % __version__
1347
1348def read_config():
1349    '''It reads the configuration options from the command line arguments and
1350    it returns a dict with them.'''
1351    from optparse import OptionParser, OptionGroup
1352    usage = "usage: %prog [options] sff1 sff2 ..."
1353    desc = "Extract sequences from 454 SFF files into FASTA, FASTA quality"\
1354           " and XML traceinfo format. When a paired-end linker sequence"\
1355           " is given (-l), use SSAHA2 to scan the sequences for the linker,"\
1356           " then split the sequences, removing the linker."
1357    parser = OptionParser(usage = usage, version = version_string(), description = desc)
1358    parser.add_option('-a', '--append', action="store_true", dest='append',
1359            help='append output to existing files', default=False)
1360    parser.add_option('-i', '--xml_info', dest='xml_info',
1361            help='extra info to write in the xml file')
1362    parser.add_option("-l", "--linker_file", dest="pelinker_fname",
1363            help="FASTA file with paired-end linker sequences", metavar="FILE")
1364
1365    group = OptionGroup(parser, "File name options","")
1366    group.add_option('-c', '--clip', action="store_true", dest='clip',
1367                     help='clip (completely remove) ends with low qual and/or adaptor sequence', default=False)
1368    group.add_option('-u', '--upper_case', action="store_false", dest='mix_case',
1369                      help='all bases in upper case, including clipped ends', default=True)
1370    group.add_option('', '--min_left_clip', dest='min_leftclip',
1371                     metavar="INTEGER", type = "int",
1372                     help='if the left clip coming from the SFF is smaller than this value, override it', default=0)
1373    group.add_option('-Q', '--fastq', action="store_true", dest='want_fastq',
1374                      help='store as FASTQ file instead of FASTA + FASTA quality file', default=False)
1375    parser.add_option_group(group)
1376
1377    group = OptionGroup(parser, "File name options","")
1378    group.add_option("-o", "--out_basename", dest="basename",
1379            help="base name for all output files")
1380    group.add_option("-s", "--seq_file", dest="seq_fname",
1381            help="output sequence file name", metavar="FILE")
1382    group.add_option("-q", "--qual_file", dest="qual_fname",
1383            help="output quality file name", metavar="FILE")
1384    group.add_option("-x", "--xml_file", dest="xml_fname",
1385            help="output ancillary xml file name", metavar="FILE")
1386    parser.add_option_group(group)
1387
1388    #default fnames
1389    #is there an sff file?
1390    basename = 'reads'
1391    if sys.argv[-1][-4:].lower() == '.sff':
1392        basename = sys.argv[-1][:-4]
1393    def_seq_fname = basename + '.fasta'
1394    def_qual_fname = basename + '.fasta.qual'
1395    def_xml_fname = basename + '.xml'
1396    def_pelinker_fname = ''
1397    parser.set_defaults(seq_fname = def_seq_fname)
1398    parser.set_defaults(qual_fname = def_qual_fname)
1399    parser.set_defaults(xml_fname = def_xml_fname)
1400    parser.set_defaults(pelinker_fname = def_pelinker_fname)
1401
1402    #we parse the cmd line
1403    (options, args) = parser.parse_args()
1404
1405    #we put the result in a dict
1406    global config
1407    config = {}
1408    for property in dir(options):
1409        if property[0] == '_' or property in ('ensure_value', 'read_file', 
1410                                                                'read_module'):
1411            continue
1412        config[property] = getattr(options, property)
1413
1414    if config['basename'] is None:

Large files are truncated, but you can click here to view the full file