PageRenderTime 35ms CodeModel.GetById 12ms app.highlight 19ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/ncbi_blast_plus/blastxml_to_tabular.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 254 lines | 247 code | 0 blank | 7 comment | 0 complexity | 5387fabc8e21c304e4b2bc15d0f0edf4 MD5 | raw file
  1#!/usr/bin/env python
  2"""Convert a BLAST XML file to 12 column tabular output
  3
  4Takes three command line options, input BLAST XML filename, output tabular
  5BLAST filename, output format (std for standard 12 columns, or ext for the
  6extended 24 columns offered in the BLAST+ wrappers).
  7
  8The 12 columns output are 'qseqid sseqid pident length mismatch gapopen qstart
  9qend sstart send evalue bitscore' or 'std' at the BLAST+ command line, which
 10mean:
 11   
 12====== ========= ============================================
 13Column NCBI name Description
 14------ --------- --------------------------------------------
 15     1 qseqid    Query Seq-id (ID of your sequence)
 16     2 sseqid    Subject Seq-id (ID of the database hit)
 17     3 pident    Percentage of identical matches
 18     4 length    Alignment length
 19     5 mismatch  Number of mismatches
 20     6 gapopen   Number of gap openings
 21     7 qstart    Start of alignment in query
 22     8 qend      End of alignment in query
 23     9 sstart    Start of alignment in subject (database hit)
 24    10 send      End of alignment in subject (database hit)
 25    11 evalue    Expectation value (E-value)
 26    12 bitscore  Bit score
 27====== ========= ============================================
 28
 29The additional columns offered in the Galaxy BLAST+ wrappers are:
 30
 31====== ============= ===========================================
 32Column NCBI name     Description
 33------ ------------- -------------------------------------------
 34    13 sallseqid     All subject Seq-id(s), separated by a ';'
 35    14 score         Raw score
 36    15 nident        Number of identical matches
 37    16 positive      Number of positive-scoring matches
 38    17 gaps          Total number of gaps
 39    18 ppos          Percentage of positive-scoring matches
 40    19 qframe        Query frame
 41    20 sframe        Subject frame
 42    21 qseq          Aligned part of query sequence
 43    22 sseq          Aligned part of subject sequence
 44    23 qlen          Query sequence length
 45    24 slen          Subject sequence length
 46====== ============= ===========================================
 47
 48Most of these fields are given explicitly in the XML file, others some like
 49the percentage identity and the number of gap openings must be calculated.
 50
 51Be aware that the sequence in the extended tabular output or XML direct from
 52BLAST+ may or may not use XXXX masking on regions of low complexity. This
 53can throw the off the calculation of percentage identity and gap openings.
 54[In fact, both BLAST 2.2.24+ and 2.2.25+ have a subtle bug in this regard,
 55with these numbers changing depending on whether or not the low complexity
 56filter is used.]
 57
 58This script attempts to produce identical output to what BLAST+ would have done.
 59However, check this with "diff -b ..." since BLAST+ sometimes includes an extra
 60space character (probably a bug).
 61"""
 62import sys
 63import re
 64
 65if sys.version_info[:2] >= ( 2, 5 ):
 66    import xml.etree.cElementTree as ElementTree
 67else:
 68    from galaxy import eggs
 69    import pkg_resources; pkg_resources.require( "elementtree" )
 70    from elementtree import ElementTree
 71
 72def stop_err( msg ):
 73    sys.stderr.write("%s\n" % msg)
 74    sys.exit(1)
 75
 76#Parse Command Line
 77try:
 78    in_file, out_file, out_fmt = sys.argv[1:]
 79except:
 80    stop_err("Expect 3 arguments: input BLAST XML file, output tabular file, out format (std or ext)")
 81
 82if out_fmt == "std":
 83    extended = False
 84elif out_fmt == "x22":
 85    stop_err("Format argument x22 has been replaced with ext (extended 24 columns)")
 86elif out_fmt == "ext":
 87    extended = True
 88else:
 89    stop_err("Format argument should be std (12 column) or ext (extended 24 columns)")
 90
 91
 92# get an iterable
 93try: 
 94    context = ElementTree.iterparse(in_file, events=("start", "end"))
 95except:
 96    stop_err("Invalid data format.")
 97# turn it into an iterator
 98context = iter(context)
 99# get the root element
100try:
101    event, root = context.next()
102except:
103    stop_err( "Invalid data format." )
104
105
106re_default_query_id = re.compile("^Query_\d+$")
107assert re_default_query_id.match("Query_101")
108assert not re_default_query_id.match("Query_101a")
109assert not re_default_query_id.match("MyQuery_101")
110re_default_subject_id = re.compile("^Subject_\d+$")
111assert re_default_subject_id.match("Subject_1")
112assert not re_default_subject_id.match("Subject_")
113assert not re_default_subject_id.match("Subject_12a")
114assert not re_default_subject_id.match("TheSubject_1")
115
116
117outfile = open(out_file, 'w')
118blast_program = None
119for event, elem in context:
120    if event == "end" and elem.tag == "BlastOutput_program":
121        blast_program = elem.text
122    # for every <Iteration> tag
123    if event == "end" and elem.tag == "Iteration":
124        #Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA
125        # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>
126        # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>
127        # <Iteration_query-len>406</Iteration_query-len>
128        # <Iteration_hits></Iteration_hits>
129        #
130        #Or, from BLAST 2.2.24+ run online
131        # <Iteration_query-ID>Query_1</Iteration_query-ID>
132        # <Iteration_query-def>Sample</Iteration_query-def>
133        # <Iteration_query-len>516</Iteration_query-len>
134        # <Iteration_hits>...
135        qseqid = elem.findtext("Iteration_query-ID")
136        if re_default_query_id.match(qseqid):
137            #Place holder ID, take the first word of the query definition
138            qseqid = elem.findtext("Iteration_query-def").split(None,1)[0]
139        qlen = int(elem.findtext("Iteration_query-len"))
140                                        
141        # for every <Hit> within <Iteration>
142        for hit in elem.findall("Iteration_hits/Hit"):
143            #Expecting either this,
144            # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id>
145            # <Hit_def>RecName: Full=Rhodopsin</Hit_def>
146            # <Hit_accession>P56514</Hit_accession>
147            #or,
148            # <Hit_id>Subject_1</Hit_id>
149            # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def>
150            # <Hit_accession>Subject_1</Hit_accession>
151            #
152            #apparently depending on the parse_deflines switch
153            sseqid = hit.findtext("Hit_id").split(None,1)[0]
154            hit_def = sseqid + " " + hit.findtext("Hit_def")
155            if re_default_subject_id.match(sseqid) \
156            and sseqid == hit.findtext("Hit_accession"):
157                #Place holder ID, take the first word of the subject definition
158                hit_def = hit.findtext("Hit_def")
159                sseqid = hit_def.split(None,1)[0]
160            # for every <Hsp> within <Hit>
161            for hsp in hit.findall("Hit_hsps/Hsp"):
162                nident = hsp.findtext("Hsp_identity")
163                length = hsp.findtext("Hsp_align-len")
164                pident = "%0.2f" % (100*float(nident)/float(length))
165
166                q_seq = hsp.findtext("Hsp_qseq")
167                h_seq = hsp.findtext("Hsp_hseq")
168                m_seq = hsp.findtext("Hsp_midline")
169                assert len(q_seq) == len(h_seq) == len(m_seq) == int(length)
170                gapopen = str(len(q_seq.replace('-', ' ').split())-1  + \
171                              len(h_seq.replace('-', ' ').split())-1)
172
173                mismatch = m_seq.count(' ') + m_seq.count('+') \
174                         - q_seq.count('-') - h_seq.count('-')
175                #TODO - Remove this alternative mismatch calculation and test
176                #once satisifed there are no problems
177                expected_mismatch = len(q_seq) \
178                                  - sum(1 for q,h in zip(q_seq, h_seq) \
179                                        if q == h or q == "-" or h == "-")
180                xx = sum(1 for q,h in zip(q_seq, h_seq) if q=="X" and h=="X")
181                if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx):
182                    stop_err("%s vs %s mismatches, expected %i <= %i <= %i" \
183                             % (qseqid, sseqid, expected_mismatch - q_seq.count("X"),
184                                int(mismatch), expected_mismatch))
185
186                #TODO - Remove this alternative identity calculation and test
187                #once satisifed there are no problems
188                expected_identity = sum(1 for q,h in zip(q_seq, h_seq) if q == h)
189                if not (expected_identity - xx <= int(nident) <= expected_identity + q_seq.count("X")):
190                    stop_err("%s vs %s identities, expected %i <= %i <= %i" \
191                             % (qseqid, sseqid, expected_identity, int(nident),
192                                expected_identity + q_seq.count("X")))
193                
194
195                evalue = hsp.findtext("Hsp_evalue")
196                if evalue == "0":
197                    evalue = "0.0"
198                else:
199                    evalue = "%0.0e" % float(evalue)
200                
201                bitscore = float(hsp.findtext("Hsp_bit-score"))
202                if bitscore < 100:
203                    #Seems to show one decimal place for lower scores
204                    bitscore = "%0.1f" % bitscore
205                else:
206                    #Note BLAST does not round to nearest int, it truncates
207                    bitscore = "%i" % bitscore
208
209                values = [qseqid,
210                          sseqid,
211                          pident,
212                          length, #hsp.findtext("Hsp_align-len")
213                          str(mismatch),
214                          gapopen,
215                          hsp.findtext("Hsp_query-from"), #qstart,
216                          hsp.findtext("Hsp_query-to"), #qend,
217                          hsp.findtext("Hsp_hit-from"), #sstart,
218                          hsp.findtext("Hsp_hit-to"), #send,
219                          evalue, #hsp.findtext("Hsp_evalue") in scientific notation
220                          bitscore, #hsp.findtext("Hsp_bit-score") rounded
221                          ]
222
223                if extended:
224                    sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(">"))
225                    #print hit_def, "-->", sallseqid
226                    positive = hsp.findtext("Hsp_positive")
227                    ppos = "%0.2f" % (100*float(positive)/float(length))
228                    qframe = hsp.findtext("Hsp_query-frame")
229                    sframe = hsp.findtext("Hsp_hit-frame")
230                    if blast_program == "blastp":
231                        #Probably a bug in BLASTP that they use 0 or 1 depending on format
232                        if qframe == "0": qframe = "1"
233                        if sframe == "0": sframe = "1"
234                    slen = int(hit.findtext("Hit_len"))
235                    values.extend([sallseqid,
236                                   hsp.findtext("Hsp_score"), #score,
237                                   nident,
238                                   positive,
239                                   hsp.findtext("Hsp_gaps"), #gaps,
240                                   ppos,
241                                   qframe,
242                                   sframe,
243                                   #NOTE - for blastp, XML shows original seq, tabular uses XXX masking
244                                   q_seq,
245                                   h_seq,
246                                   str(qlen),
247                                   str(slen),
248                                   ])
249                #print "\t".join(values) 
250                outfile.write("\t".join(values) + "\n")
251        # prevents ElementTree from growing large datastructure
252        root.clear()
253        elem.clear()
254outfile.close()