megablast_xml_parser.py - This is a Python script that takes a BLAST XML file and an output file name and writes the results as tab-delimited text.

/tools/metag_tools/megablast_xml_parser.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 78 lines · 60 code · 8 blank · 10 comment · 13 complexity · ce64d58eceab6a8d45762afabb901377 MD5 · raw file


#!/usr/bin/env python
    
import sys, os, re

if sys.version_info[:2] >= ( 2, 5 ):
    import xml.etree.cElementTree as ElementTree
else:
    from galaxy import eggs
    import pkg_resources; pkg_resources.require( "elementtree" )
    from elementtree import ElementTree

def stop_err( msg ):
    """Report a fatal error and terminate the script.

    Writes ``msg`` followed by a newline to standard error and then raises
    ``SystemExit`` with exit status 1, so the failure is visible through
    the process exit code as well as through the stderr text.
    """
    sys.stderr.write( "%s\n" % msg )
    # A bare sys.exit() would exit with status 0, i.e. report success.
    sys.exit( 1 )

def __main__():
    """Convert a BLAST XML report into a tab-delimited table.

    The input path is read from ``sys.argv[1]`` and the output path from
    ``sys.argv[2]``. For every ``<Hsp>`` of every ``<Hit>`` of every
    ``<Iteration>`` element one line is written, holding the query
    definition, query length, subject id, subject length and then the text
    of each tag listed in ``hspTags``, in that order. A tag that is missing
    from the XML is written as the string ``None``.

    Subject ids that start with ``gi`` are reduced to their second
    ``|``-separated field.

    If the input cannot be opened or parsed, or an error occurs while
    rows are being written, an error message is handed to ``stop_err``.
    """
    source = sys.argv[1]
    hspTags = [
        "Hsp_bit-score",
        "Hsp_evalue",
        "Hsp_query-from",
        "Hsp_query-to",
        "Hsp_hit-from",
        "Hsp_hit-to",
        "Hsp_query-frame",
        "Hsp_hit-frame",
        "Hsp_identity",
        "Hsp_align-len",
        "Hsp_qseq",
        "Hsp_hseq",
        "Hsp_midline",
    ]

    # Parse incrementally; "start" events are requested so that the very
    # first event hands us the root element, which is cleared later on.
    try:
        context = iter( ElementTree.iterparse( source, events=( "start", "end" ) ) )
        # The next() builtin works on Python 2.6+ and Python 3, whereas the
        # .next() method only exists on Python 2 iterators.
        event, root = next( context )
    except Exception:
        stop_err( "Invalid data format." )

    # The with-block closes the output file on every exit path, including
    # the SystemExit raised by stop_err below.
    with open( sys.argv[2], 'w' ) as outfile:
        try:
            for event, elem in context:
                # Only completed <Iteration> elements carry a full set of hits.
                if event != "end" or elem.tag != "Iteration":
                    continue
                query = elem.findtext( "Iteration_query-def" )
                qLen = elem.findtext( "Iteration_query-len" )
                # for every <Hit> within <Iteration>
                for hit in elem.findall( "Iteration_hits/Hit" ):
                    subject = hit.findtext( "Hit_id" )
                    if re.search( '^gi', subject ):
                        subject = subject.split( '|' )[1]
                    sLen = hit.findtext( "Hit_len" )
                    # for every <Hsp> within <Hit>: one output row
                    for hsp in hit.findall( "Hit_hsps/Hsp" ):
                        fields = [ query, qLen, subject, sLen ]
                        fields.extend( hsp.findtext( tag ) for tag in hspTags )
                        outfile.write( "\t".join( "%s" % field for field in fields ) + "\n" )
                # prevents ElementTree from growing a large data structure
                root.clear()
                elem.clear()
        except Exception:
            stop_err( "The input data is malformed, or there is more than one dataset in the input file. Error: %s" % sys.exc_info()[1] )

# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    __main__()

Summary ✨

This is a Python script that takes two command-line arguments: a BLAST XML file and an output file name. It reads the BLAST XML file, extracts information from every HSP of every hit in each iteration, and writes the results to the output file in tab-delimited format, one line per HSP. The script uses the xml.etree.cElementTree module (falling back to the standalone elementtree package on Python versions older than 2.5) to parse the XML file and the re module for regular expression matching.

Tech Fingerprint

Alerts (5)

'def' Ensure functions have docstrings for documentation
12
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
38 45 72
'open(' Use 'with open()' to ensure files are properly closed
48