/Bio/SearchIO/FastaIO.py
Python | 579 lines | 533 code | 19 blank | 27 comment | 27 complexity | 942af16a55fa910414131c0d1d1a42c3 MD5 | raw file
- # Adapted from Bio.AlignIO.FastaIO copyright 2008-2011 by Peter Cock.
- # Copyright 2012 by Wibowo Arindrarto.
- # All rights reserved.
- # This code is part of the Biopython distribution and governed by its
- # license. Please see the LICENSE file that should have been included
- # as part of this package.
- """Bio.SearchIO support for Bill Pearson's FASTA tools.
- This module adds support for parsing FASTA outputs. FASTA is a suite of
- programs that finds regions of local or global similarity between protein
- or nucleotide sequences, either by searching databases or identifying
- local duplications.
- Bio.SearchIO.FastaIO was tested on the following FASTA flavors and versions:
- - flavors: fasta, ssearch, tfastx
- - versions: 35, 36
- Other flavors and/or versions may introduce some bugs. Please file a bug report
- if you see such problems to Biopython's bug tracker.
- More information on FASTA are available through these links:
- - Website: http://fasta.bioch.virginia.edu/fasta_www2/fasta_list2.shtml
- - User guide: http://fasta.bioch.virginia.edu/fasta_www2/fasta_guide.pdf
- Supported Formats
- =================
- Bio.SearchIO.FastaIO supports parsing and indexing FASTA outputs triggered by
- the -m 10 flag. Other formats that mimic other programs (e.g. the BLAST tabular
- format using the -m 8 flag) may be parseable but using SearchIO's other parsers
- (in this case, using the 'blast-tab' parser).
- fasta-m10
- =========
- Note that in FASTA -m 10 outputs, HSPs from different strands are considered to
- be from different hits. They are listed as two separate entries in the hit
- table. FastaIO recognizes this and will group HSPs with the same hit ID into a
- single Hit object, regardless of strand.
- FASTA also sometimes output extra sequences adjacent to the HSP match. These
- extra sequences are discarded by FastaIO. Only regions containing the actual
- sequence match are extracted.
- The following object attributes are provided:
- +-----------------+-------------------------+----------------------------------+
- | Object | Attribute | Value |
- +=================+=========================+==================================+
- | QueryResult | description | query sequence description |
- | +-------------------------+----------------------------------+
- | | id | query sequence ID |
- | +-------------------------+----------------------------------+
- | | program | FASTA flavor |
- | +-------------------------+----------------------------------+
- | | seq_len | full length of query sequence |
- | +-------------------------+----------------------------------+
- | | target | target search database |
- | +-------------------------+----------------------------------+
- | | version | FASTA version |
- +-----------------+-------------------------+----------------------------------+
- | Hit | seq_len | full length of the hit sequence |
- +-----------------+-------------------------+----------------------------------+
- | HSP | bitscore | \*_bits line |
- | +-------------------------+----------------------------------+
- | | evalue | \*_expect line |
- | +-------------------------+----------------------------------+
- | | ident_pct | \*_ident line |
- | +-------------------------+----------------------------------+
- | | init1_score | \*_init1 line |
- | +-------------------------+----------------------------------+
- | | initn_score | \*_initn line |
- | +-------------------------+----------------------------------+
- | | opt_score | \*_opt line, \*_s-w opt line |
- | +-------------------------+----------------------------------+
- | | pos_pct | \*_sim line |
- | +-------------------------+----------------------------------+
- | | sw_score | \*_score line |
- | +-------------------------+----------------------------------+
- | | z_score | \*_z-score line |
- +-----------------+-------------------------+----------------------------------+
- | HSPFragment | aln_annotation | al_cons block, if present |
- | (also via HSP) +-------------------------+----------------------------------+
- | | hit | hit sequence |
- | +-------------------------+----------------------------------+
- | | hit_end | hit sequence end coordinate |
- | +-------------------------+----------------------------------+
- | | hit_start | hit sequence start coordinate |
- | +-------------------------+----------------------------------+
- | | hit_strand | hit sequence strand |
- | +-------------------------+----------------------------------+
- | | query | query sequence |
- | +-------------------------+----------------------------------+
- | | query_end | query sequence end coordinate |
- | +-------------------------+----------------------------------+
- | | query_start | query sequence start coordinate |
- | +-------------------------+----------------------------------+
- | | query_strand | query sequence strand |
- +-----------------+-------------------------+----------------------------------+
- """
- import re
- from Bio._py3k import _as_bytes, _bytes_to_string
- from Bio.Alphabet import generic_dna, generic_protein
- from Bio.File import UndoHandle
- from Bio.SearchIO._index import SearchIndexer
- from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment
- __all__ = ['FastaM10Parser', 'FastaM10Indexer']
- __docformat__ = "restructuredtext en"
- # precompile regex patterns
- # regex for program name
- _RE_FLAVS = re.compile(r't?fast[afmsxy]|pr[sf][sx]|lalign|[gs]?[glso]search')
- # regex for sequence ID and length ~ deals with both \n and \r\n
- _PTR_ID_DESC_SEQLEN = r'>>>(.+?)\s+(.*?) *- (\d+) (?:aa|nt)\s*$'
- _RE_ID_DESC_SEQLEN = re.compile(_PTR_ID_DESC_SEQLEN)
- _RE_ID_DESC_SEQLEN_IDX = re.compile(_as_bytes(_PTR_ID_DESC_SEQLEN))
- # regex for qresult, hit, or hsp attribute value
- _RE_ATTR = re.compile(r'^; [a-z]+(_[ \w-]+):\s+(.*)$')
- # regex for capturing excess start and end sequences in alignments
- _RE_START_EXC = re.compile(r'^-*')
- _RE_END_EXC = re.compile(r'-*$')
- # attribute name mappings
- _HSP_ATTR_MAP = {
- '_initn': ('initn_score', int),
- '_init1': ('init1_score', int),
- '_opt': ('opt_score', int),
- '_s-w opt': ('opt_score', int),
- '_z-score': ('z_score', float),
- '_bits': ('bitscore', float),
- '_expect': ('evalue', float),
- '_score': ('sw_score', int),
- '_ident': ('ident_pct', float),
- '_sim': ('pos_pct', float),
- }
- # state flags
- _STATE_NONE = 0
- _STATE_QUERY_BLOCK = 1
- _STATE_HIT_BLOCK = 2
- _STATE_CONS_BLOCK = 3
- def _set_qresult_hits(qresult, hit_rows=[]):
- """Helper function for appending Hits without alignments into QueryResults."""
- for hit_row in hit_rows:
- hit_id, remainder = hit_row.split(' ', 1)
- # TODO: parse hit and hsp properties properly; by dealing with:
- # - any character in the description (brackets, spaces, etc.)
- # - possible [f] or [r] presence (for frame info)
- # - possible presence of E2() column
- # - possible incomplete hit_id due to column length limit
- # The current method only looks at the Hit ID, none of the things above
- if hit_id not in qresult:
- frag = HSPFragment(hit_id, qresult.id)
- hsp = HSP([frag])
- hit = Hit([hsp])
- qresult.append(hit)
- return qresult
- def _set_hsp_seqs(hsp, parsed, program):
- """Helper function for the main parsing code.
- :param hsp: HSP whose properties will be set
- :type hsp: HSP
- :param parsed: parsed values of the HSP attributes
- :type parsed: dictionary {string: object}
- :param program: program name
- :type program: string
- """
- # get aligned sequences and check if they have equal lengths
- start = 0
- for seq_type in ('hit', 'query'):
- if 'tfast' not in program:
- pseq = parsed[seq_type]
- # adjust start and end coordinates based on the amount of
- # filler characters
- start, stop = _get_aln_slice_coords(pseq)
- start_adj = len(re.search(_RE_START_EXC, pseq['seq']).group(0))
- stop_adj = len(re.search(_RE_END_EXC, pseq['seq']).group(0))
- start = start + start_adj
- stop = stop + start_adj - stop_adj
- parsed[seq_type]['seq'] = pseq['seq'][start:stop]
- assert len(parsed['query']['seq']) == len(parsed['hit']['seq']), "%r %r" \
- % (len(parsed['query']['seq']), len(parsed['hit']['seq']))
- if 'similarity' in hsp.aln_annotation:
- # only using 'start' since FASTA seems to have trimmed the 'excess'
- # end part
- hsp.aln_annotation['similarity'] = hsp.aln_annotation['similarity'][start:]
- # hit or query works equally well here
- assert len(hsp.aln_annotation['similarity']) == len(parsed['hit']['seq'])
- # query and hit sequence types must be the same
- assert parsed['query']['_type'] == parsed['hit']['_type']
- type_val = parsed['query']['_type'] # hit works fine too
- alphabet = generic_dna if type_val == 'D' else generic_protein
- setattr(hsp.fragment, 'alphabet', alphabet)
- for seq_type in ('hit', 'query'):
- # get and set start and end coordinates
- start = int(parsed[seq_type]['_start'])
- end = int(parsed[seq_type]['_stop'])
- setattr(hsp.fragment, seq_type + '_start', min(start, end) - 1)
- setattr(hsp.fragment, seq_type + '_end', max(start, end))
- # set seq and alphabet
- setattr(hsp.fragment, seq_type, parsed[seq_type]['seq'])
- if alphabet is not generic_protein:
- # get strand from coordinate; start <= end is plus
- # start > end is minus
- if start <= end:
- setattr(hsp.fragment, seq_type + '_strand', 1)
- else:
- setattr(hsp.fragment, seq_type + '_strand', -1)
- else:
- setattr(hsp.fragment, seq_type + '_strand', 0)
- def _get_aln_slice_coords(parsed_hsp):
- """Helper function for the main parsing code.
- To get the actual pairwise alignment sequences, we must first
- translate the un-gapped sequence based coordinates into positions
- in the gapped sequence (which may have a flanking region shown
- using leading - characters). To date, I have never seen any
- trailing flanking region shown in the m10 file, but the
- following code should also cope with that.
- Note that this code seems to work fine even when the "sq_offset"
- entries are prsent as a result of using the -X command line option.
- """
- seq = parsed_hsp['seq']
- seq_stripped = seq.strip('-')
- disp_start = int(parsed_hsp['_display_start'])
- start = int(parsed_hsp['_start'])
- stop = int(parsed_hsp['_stop'])
- if start <= stop:
- start = start - disp_start
- stop = stop - disp_start + 1
- else:
- start = disp_start - start
- stop = disp_start - stop + 1
- stop += seq_stripped.count('-')
- assert 0 <= start and start < stop and stop <= len(seq_stripped), \
- "Problem with sequence start/stop,\n%s[%i:%i]\n%s" \
- % (seq, start, stop, parsed_hsp)
- return start, stop
- class FastaM10Parser(object):
- """Parser for Bill Pearson's FASTA suite's -m 10 output."""
- def __init__(self, handle, __parse_hit_table=False):
- self.handle = UndoHandle(handle)
- self._preamble = self._parse_preamble()
- def __iter__(self):
- for qresult in self._parse_qresult():
- # re-set desc, for hsp query description
- qresult.description = qresult.description
- yield qresult
- def _parse_preamble(self):
- """Parses the Fasta preamble for Fasta flavor and version."""
- preamble = {}
- while True:
- self.line = self.handle.readline()
- # this should be the line just before the first qresult
- if self.line.startswith('Query'):
- break
- # try to match for version line
- elif self.line.startswith(' version'):
- preamble['version'] = self.line.split(' ')[2]
- else:
- # try to match for flavor line
- flav_match = re.match(_RE_FLAVS, self.line.lower())
- if flav_match:
- preamble['program'] = flav_match.group(0)
- return preamble
- def __parse_hit_table(self):
- """Parses hit table rows."""
- # move to the first row
- self.line = self.handle.readline()
- # parse hit table until we see an empty line
- hit_rows = []
- while self.line and not self.line.strip():
- hit_rows.append(self.line.strip())
- self.line = self.handle.readline()
- return hit_rows
- def _parse_qresult(self):
- # initial qresult value
- qresult = None
- hit_rows = []
- # state values
- state_QRES_NEW = 1
- state_QRES_HITTAB = 3
- state_QRES_CONTENT = 5
- state_QRES_END = 7
- while True:
- # one line before the hit table
- if self.line.startswith('The best scores are:'):
- qres_state = state_QRES_HITTAB
- # the end of a query or the file altogether
- elif self.line.strip() == '>>>///' or not self.line:
- qres_state = state_QRES_END
- # the beginning of a new query
- elif not self.line.startswith('>>>') and '>>>' in self.line:
- qres_state = state_QRES_NEW
- # the beginning of the query info and its hits + hsps
- elif self.line.startswith('>>>') and not \
- self.line.strip() == '>>><<<':
- qres_state = state_QRES_CONTENT
- # default qres mark
- else:
- qres_state = None
- if qres_state is not None:
- if qres_state == state_QRES_HITTAB:
- # parse hit table if flag is set
- hit_rows = self.__parse_hit_table()
- elif qres_state == state_QRES_END:
- yield _set_qresult_hits(qresult, hit_rows)
- break
- elif qres_state == state_QRES_NEW:
- # if qresult is filled, yield it first
- if qresult is not None:
- yield _set_qresult_hits(qresult, hit_rows)
- regx = re.search(_RE_ID_DESC_SEQLEN, self.line)
- query_id = regx.group(1)
- seq_len = regx.group(3)
- desc = regx.group(2)
- qresult = QueryResult(id=query_id)
- qresult.seq_len = int(seq_len)
- # get target from the next line
- self.line = self.handle.readline()
- qresult.target = [x for x in self.line.split(' ') if x][1].strip()
- if desc is not None:
- qresult.description = desc
- # set values from preamble
- for key, value in self._preamble.items():
- setattr(qresult, key, value)
- elif qres_state == state_QRES_CONTENT:
- assert self.line[3:].startswith(qresult.id), self.line
- for hit, strand in self._parse_hit(query_id):
- # HACK: re-set desc, for hsp hit and query description
- hit.description = hit.description
- hit.query_description = qresult.description
- # if hit is not in qresult, append it
- if hit.id not in qresult:
- qresult.append(hit)
- # otherwise, it might be the same hit with a different strand
- else:
- # make sure strand is different and then append hsp to
- # existing hit
- for hsp in hit.hsps:
- assert strand != hsp.query_strand
- qresult[hit.id].append(hsp)
- self.line = self.handle.readline()
- def _parse_hit(self, query_id):
- while True:
- self.line = self.handle.readline()
- if self.line.startswith('>>'):
- break
- strand = None
- hsp_list = []
- while True:
- peekline = self.handle.peekline()
- # yield hit if we've reached the start of a new query or
- # the end of the search
- if peekline.strip() in [">>><<<", ">>>///"] or \
- (not peekline.startswith('>>>') and '>>>' in peekline):
- # append last parsed_hsp['hit']['seq'] line
- if state == _STATE_HIT_BLOCK:
- parsed_hsp['hit']['seq'] += self.line.strip()
- elif state == _STATE_CONS_BLOCK:
- hsp.aln_annotation['similarity'] += \
- self.line.strip('\r\n')
- # process HSP alignment and coordinates
- _set_hsp_seqs(hsp, parsed_hsp, self._preamble['program'])
- hit = Hit(hsp_list)
- hit.description = hit_desc
- hit.seq_len = seq_len
- yield hit, strand
- hsp_list = []
- break
- # yield hit and create a new one if we're still in the same query
- elif self.line.startswith('>>'):
- # try yielding, if we have hsps
- if hsp_list:
- _set_hsp_seqs(hsp, parsed_hsp, self._preamble['program'])
- hit = Hit(hsp_list)
- hit.description = hit_desc
- hit.seq_len = seq_len
- yield hit, strand
- hsp_list = []
- # try to get the hit id and desc, and handle cases without descs
- try:
- hit_id, hit_desc = self.line[2:].strip().split(' ', 1)
- except ValueError:
- hit_id = self.line[2:].strip().split(' ', 1)[0]
- hit_desc = ''
- # create the HSP object for Hit
- frag = HSPFragment(hit_id, query_id)
- hsp = HSP([frag])
- hsp_list.append(hsp)
- # set or reset the state to none
- state = _STATE_NONE
- parsed_hsp = {'query':{}, 'hit': {}}
- # create and append a new HSP if line starts with '>--'
- elif self.line.startswith('>--'):
- # set seq attributes of previous hsp
- _set_hsp_seqs(hsp, parsed_hsp, self._preamble['program'])
- # and create a new one
- frag = HSPFragment(hit_id, query_id)
- hsp = HSP([frag])
- hsp_list.append(hsp)
- # set the state ~ none yet
- state = _STATE_NONE
- parsed_hsp = {'query':{}, 'hit': {}}
- # this is either query or hit data in the HSP, depending on the state
- elif self.line.startswith('>'):
- if state == _STATE_NONE:
- # make sure it's the correct query
- assert query_id.startswith(self.line[1:].split(' ')[0]), \
- "%r vs %r" % (query_id, self.line)
- state = _STATE_QUERY_BLOCK
- parsed_hsp['query']['seq'] = ''
- elif state == _STATE_QUERY_BLOCK:
- # make sure it's the correct hit
- assert hit_id.startswith(self.line[1:].split(' ')[0])
- state = _STATE_HIT_BLOCK
- parsed_hsp['hit']['seq'] = ''
- # check for conservation block
- elif self.line.startswith('; al_cons'):
- state = _STATE_CONS_BLOCK
- hsp.fragment.aln_annotation['similarity'] = ''
- elif self.line.startswith(';'):
- # Fasta outputs do not make a clear distinction between Hit
- # and HSPs, so we check the attribute names to determine
- # whether it belongs to a Hit or HSP
- regx = re.search(_RE_ATTR, self.line.strip())
- name = regx.group(1)
- value = regx.group(2)
- # for values before the '>...' query block
- if state == _STATE_NONE:
- if name in _HSP_ATTR_MAP:
- attr_name, caster = _HSP_ATTR_MAP[name]
- if caster is not str:
- value = caster(value)
- if name in ['_ident', '_sim']:
- value *= 100
- setattr(hsp, attr_name, value)
- # otherwise, pool the values for processing later
- elif state == _STATE_QUERY_BLOCK:
- parsed_hsp['query'][name] = value
- elif state == _STATE_HIT_BLOCK:
- if name == '_len':
- seq_len = int(value)
- else:
- parsed_hsp['hit'][name] = value
- # for values in the hit block
- else:
- raise ValueError("Unexpected line: %r" % self.line)
- # otherwise, it must be lines containing the sequences
- else:
- assert '>' not in self.line
- # if we're in hit, parse into hsp.hit
- if state == _STATE_HIT_BLOCK:
- parsed_hsp['hit']['seq'] += self.line.strip()
- elif state == _STATE_QUERY_BLOCK:
- parsed_hsp['query']['seq'] += self.line.strip()
- elif state == _STATE_CONS_BLOCK:
- hsp.fragment.aln_annotation['similarity'] += \
- self.line.strip('\r\n')
- # we should not get here!
- else:
- raise ValueError("Unexpected line: %r" % self.line)
- self.line = self.handle.readline()
- class FastaM10Indexer(SearchIndexer):
- """Indexer class for Bill Pearson's FASTA suite's -m 10 output."""
- _parser = FastaM10Parser
- def __init__(self, filename):
- SearchIndexer.__init__(self, filename)
- self._handle = UndoHandle(self._handle)
- def __iter__(self):
- handle = self._handle
- handle.seek(0)
- start_offset = handle.tell()
- qresult_key = None
- query_mark = _as_bytes('>>>')
- while True:
- line = handle.readline()
- peekline = handle.peekline()
- end_offset = handle.tell()
- if not line.startswith(query_mark) and query_mark in line:
- regx = re.search(_RE_ID_DESC_SEQLEN_IDX, line)
- qresult_key = _bytes_to_string(regx.group(1))
- start_offset = end_offset - len(line)
- # yield whenever we encounter a new query or at the end of the file
- if qresult_key is not None:
- if (not peekline.startswith(query_mark)
- and query_mark in peekline) or not line:
- yield qresult_key, start_offset, end_offset - start_offset
- if not line:
- break
- start_offset = end_offset
- def get_raw(self, offset):
- handle = self._handle
- qresult_raw = _as_bytes('')
- query_mark = _as_bytes('>>>')
- # read header first
- handle.seek(0)
- while True:
- line = handle.readline()
- peekline = handle.peekline()
- qresult_raw += line
- if not peekline.startswith(query_mark) and query_mark in peekline:
- break
- # and read the qresult raw string
- handle.seek(offset)
- while True:
- # preserve whitespace, don't use read_forward
- line = handle.readline()
- peekline = handle.peekline()
- qresult_raw += line
- # break when we've reached qresult end
- if (not peekline.startswith(query_mark) and query_mark in peekline) or \
- not line:
- break
- # append mock end marker to qresult_raw, since it's not always present
- return qresult_raw + _as_bytes('>>><<<\n')
- # if not used as a module, run the doctest
- if __name__ == "__main__":
- from Bio._utils import run_doctest
- run_doctest()