PageRenderTime 31ms CodeModel.GetById 7ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/ncbi_blast_plus/blastxml_to_tabular.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 254 lines | 247 code | 0 blank | 7 comment | 0 complexity | 5387fabc8e21c304e4b2bc15d0f0edf4 MD5 | raw file
  1. #!/usr/bin/env python
  2. """Convert a BLAST XML file to 12 column tabular output
  3. Takes three command line options, input BLAST XML filename, output tabular
  4. BLAST filename, output format (std for standard 12 columns, or ext for the
  5. extended 24 columns offered in the BLAST+ wrappers).
  6. The 12 columns output are 'qseqid sseqid pident length mismatch gapopen qstart
  7. qend sstart send evalue bitscore' or 'std' at the BLAST+ command line, which
  8. mean:
  9. ====== ========= ============================================
  10. Column NCBI name Description
  11. ------ --------- --------------------------------------------
  12. 1 qseqid Query Seq-id (ID of your sequence)
  13. 2 sseqid Subject Seq-id (ID of the database hit)
  14. 3 pident Percentage of identical matches
  15. 4 length Alignment length
  16. 5 mismatch Number of mismatches
  17. 6 gapopen Number of gap openings
  18. 7 qstart Start of alignment in query
  19. 8 qend End of alignment in query
  20. 9 sstart Start of alignment in subject (database hit)
  21. 10 send End of alignment in subject (database hit)
  22. 11 evalue Expectation value (E-value)
  23. 12 bitscore Bit score
  24. ====== ========= ============================================
  25. The additional columns offered in the Galaxy BLAST+ wrappers are:
  26. ====== ============= ===========================================
  27. Column NCBI name Description
  28. ------ ------------- -------------------------------------------
  29. 13 sallseqid All subject Seq-id(s), separated by a ';'
  30. 14 score Raw score
  31. 15 nident Number of identical matches
  32. 16 positive Number of positive-scoring matches
  33. 17 gaps Total number of gaps
  34. 18 ppos Percentage of positive-scoring matches
  35. 19 qframe Query frame
  36. 20 sframe Subject frame
  37. 21 qseq Aligned part of query sequence
  38. 22 sseq Aligned part of subject sequence
  39. 23 qlen Query sequence length
  40. 24 slen Subject sequence length
  41. ====== ============= ===========================================
  42. Most of these fields are given explicitly in the XML file, others some like
  43. the percentage identity and the number of gap openings must be calculated.
  44. Be aware that the sequence in the extended tabular output or XML direct from
  45. BLAST+ may or may not use XXXX masking on regions of low complexity. This
  46. can throw the off the calculation of percentage identity and gap openings.
  47. [In fact, both BLAST 2.2.24+ and 2.2.25+ have a subtle bug in this regard,
  48. with these numbers changing depending on whether or not the low complexity
  49. filter is used.]
  50. This script attempts to produce identical output to what BLAST+ would have done.
  51. However, check this with "diff -b ..." since BLAST+ sometimes includes an extra
  52. space character (probably a bug).
  53. """
  54. import sys
  55. import re
  56. if sys.version_info[:2] >= ( 2, 5 ):
  57. import xml.etree.cElementTree as ElementTree
  58. else:
  59. from galaxy import eggs
  60. import pkg_resources; pkg_resources.require( "elementtree" )
  61. from elementtree import ElementTree
  62. def stop_err( msg ):
  63. sys.stderr.write("%s\n" % msg)
  64. sys.exit(1)
  65. #Parse Command Line
  66. try:
  67. in_file, out_file, out_fmt = sys.argv[1:]
  68. except:
  69. stop_err("Expect 3 arguments: input BLAST XML file, output tabular file, out format (std or ext)")
  70. if out_fmt == "std":
  71. extended = False
  72. elif out_fmt == "x22":
  73. stop_err("Format argument x22 has been replaced with ext (extended 24 columns)")
  74. elif out_fmt == "ext":
  75. extended = True
  76. else:
  77. stop_err("Format argument should be std (12 column) or ext (extended 24 columns)")
  78. # get an iterable
  79. try:
  80. context = ElementTree.iterparse(in_file, events=("start", "end"))
  81. except:
  82. stop_err("Invalid data format.")
  83. # turn it into an iterator
  84. context = iter(context)
  85. # get the root element
  86. try:
  87. event, root = context.next()
  88. except:
  89. stop_err( "Invalid data format." )
  90. re_default_query_id = re.compile("^Query_\d+$")
  91. assert re_default_query_id.match("Query_101")
  92. assert not re_default_query_id.match("Query_101a")
  93. assert not re_default_query_id.match("MyQuery_101")
  94. re_default_subject_id = re.compile("^Subject_\d+$")
  95. assert re_default_subject_id.match("Subject_1")
  96. assert not re_default_subject_id.match("Subject_")
  97. assert not re_default_subject_id.match("Subject_12a")
  98. assert not re_default_subject_id.match("TheSubject_1")
  99. outfile = open(out_file, 'w')
  100. blast_program = None
  101. for event, elem in context:
  102. if event == "end" and elem.tag == "BlastOutput_program":
  103. blast_program = elem.text
  104. # for every <Iteration> tag
  105. if event == "end" and elem.tag == "Iteration":
  106. #Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA
  107. # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>
  108. # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>
  109. # <Iteration_query-len>406</Iteration_query-len>
  110. # <Iteration_hits></Iteration_hits>
  111. #
  112. #Or, from BLAST 2.2.24+ run online
  113. # <Iteration_query-ID>Query_1</Iteration_query-ID>
  114. # <Iteration_query-def>Sample</Iteration_query-def>
  115. # <Iteration_query-len>516</Iteration_query-len>
  116. # <Iteration_hits>...
  117. qseqid = elem.findtext("Iteration_query-ID")
  118. if re_default_query_id.match(qseqid):
  119. #Place holder ID, take the first word of the query definition
  120. qseqid = elem.findtext("Iteration_query-def").split(None,1)[0]
  121. qlen = int(elem.findtext("Iteration_query-len"))
  122. # for every <Hit> within <Iteration>
  123. for hit in elem.findall("Iteration_hits/Hit"):
  124. #Expecting either this,
  125. # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id>
  126. # <Hit_def>RecName: Full=Rhodopsin</Hit_def>
  127. # <Hit_accession>P56514</Hit_accession>
  128. #or,
  129. # <Hit_id>Subject_1</Hit_id>
  130. # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def>
  131. # <Hit_accession>Subject_1</Hit_accession>
  132. #
  133. #apparently depending on the parse_deflines switch
  134. sseqid = hit.findtext("Hit_id").split(None,1)[0]
  135. hit_def = sseqid + " " + hit.findtext("Hit_def")
  136. if re_default_subject_id.match(sseqid) \
  137. and sseqid == hit.findtext("Hit_accession"):
  138. #Place holder ID, take the first word of the subject definition
  139. hit_def = hit.findtext("Hit_def")
  140. sseqid = hit_def.split(None,1)[0]
  141. # for every <Hsp> within <Hit>
  142. for hsp in hit.findall("Hit_hsps/Hsp"):
  143. nident = hsp.findtext("Hsp_identity")
  144. length = hsp.findtext("Hsp_align-len")
  145. pident = "%0.2f" % (100*float(nident)/float(length))
  146. q_seq = hsp.findtext("Hsp_qseq")
  147. h_seq = hsp.findtext("Hsp_hseq")
  148. m_seq = hsp.findtext("Hsp_midline")
  149. assert len(q_seq) == len(h_seq) == len(m_seq) == int(length)
  150. gapopen = str(len(q_seq.replace('-', ' ').split())-1 + \
  151. len(h_seq.replace('-', ' ').split())-1)
  152. mismatch = m_seq.count(' ') + m_seq.count('+') \
  153. - q_seq.count('-') - h_seq.count('-')
  154. #TODO - Remove this alternative mismatch calculation and test
  155. #once satisifed there are no problems
  156. expected_mismatch = len(q_seq) \
  157. - sum(1 for q,h in zip(q_seq, h_seq) \
  158. if q == h or q == "-" or h == "-")
  159. xx = sum(1 for q,h in zip(q_seq, h_seq) if q=="X" and h=="X")
  160. if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx):
  161. stop_err("%s vs %s mismatches, expected %i <= %i <= %i" \
  162. % (qseqid, sseqid, expected_mismatch - q_seq.count("X"),
  163. int(mismatch), expected_mismatch))
  164. #TODO - Remove this alternative identity calculation and test
  165. #once satisifed there are no problems
  166. expected_identity = sum(1 for q,h in zip(q_seq, h_seq) if q == h)
  167. if not (expected_identity - xx <= int(nident) <= expected_identity + q_seq.count("X")):
  168. stop_err("%s vs %s identities, expected %i <= %i <= %i" \
  169. % (qseqid, sseqid, expected_identity, int(nident),
  170. expected_identity + q_seq.count("X")))
  171. evalue = hsp.findtext("Hsp_evalue")
  172. if evalue == "0":
  173. evalue = "0.0"
  174. else:
  175. evalue = "%0.0e" % float(evalue)
  176. bitscore = float(hsp.findtext("Hsp_bit-score"))
  177. if bitscore < 100:
  178. #Seems to show one decimal place for lower scores
  179. bitscore = "%0.1f" % bitscore
  180. else:
  181. #Note BLAST does not round to nearest int, it truncates
  182. bitscore = "%i" % bitscore
  183. values = [qseqid,
  184. sseqid,
  185. pident,
  186. length, #hsp.findtext("Hsp_align-len")
  187. str(mismatch),
  188. gapopen,
  189. hsp.findtext("Hsp_query-from"), #qstart,
  190. hsp.findtext("Hsp_query-to"), #qend,
  191. hsp.findtext("Hsp_hit-from"), #sstart,
  192. hsp.findtext("Hsp_hit-to"), #send,
  193. evalue, #hsp.findtext("Hsp_evalue") in scientific notation
  194. bitscore, #hsp.findtext("Hsp_bit-score") rounded
  195. ]
  196. if extended:
  197. sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(">"))
  198. #print hit_def, "-->", sallseqid
  199. positive = hsp.findtext("Hsp_positive")
  200. ppos = "%0.2f" % (100*float(positive)/float(length))
  201. qframe = hsp.findtext("Hsp_query-frame")
  202. sframe = hsp.findtext("Hsp_hit-frame")
  203. if blast_program == "blastp":
  204. #Probably a bug in BLASTP that they use 0 or 1 depending on format
  205. if qframe == "0": qframe = "1"
  206. if sframe == "0": sframe = "1"
  207. slen = int(hit.findtext("Hit_len"))
  208. values.extend([sallseqid,
  209. hsp.findtext("Hsp_score"), #score,
  210. nident,
  211. positive,
  212. hsp.findtext("Hsp_gaps"), #gaps,
  213. ppos,
  214. qframe,
  215. sframe,
  216. #NOTE - for blastp, XML shows original seq, tabular uses XXX masking
  217. q_seq,
  218. h_seq,
  219. str(qlen),
  220. str(slen),
  221. ])
  222. #print "\t".join(values)
  223. outfile.write("\t".join(values) + "\n")
  224. # prevents ElementTree from growing large datastructure
  225. root.clear()
  226. elem.clear()
  227. outfile.close()