find_diag_hits.py - This Python script takes a taxonomy fil…

/tools/taxonomy/find_diag_hits.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 170 lines · 149 code · 4 blank · 17 comment · 6 complexity · 90e16dbaa2f441bf8e6547899aff00a6 MD5 · raw file


#!/usr/bin/env python

"""
tax_read_grouping.py <file in taxonomy format> <id column> <taxonomic ranks> <output format> <output file>
    finds reads that only hit one taxonomic group. For example, consider the folliowing example:
    
    read1   mammalia
    read1   insecta
    read2   insecta
    
    in this case only read2 will be selected becuase it stays within insecta
    
    This program takes the following options:
    
    file in taxonomy format - dataset that complies with Galaxy's taxonomy format
    id column               - integer specifying the number of column containing seq id (starting with 1)
    taxonomic ranks         - a comma separated list of ranks from this list:
    
         superkingdom
         kingdom
         subkingdom
         superphylum
         phylum
         subphylum
         superclass
         class
         subclass
         superorder
         order
         suborder
         superfamily
         family
         subfamily
         tribe
         subtribe
         genus
         subgenus
         species
         subspecies
    
    output format           - reads or counts

"""

from galaxy import eggs
import pkg_resources
pkg_resources.require( 'pysqlite' )
from pysqlite2 import dbapi2 as sqlite
import string, sys, tempfile

# This dictionary maps taxonomic ranks to fields of Taxonomy file
taxRank = {
        'root'        :2, 
        'superkingdom':3, 
        'kingdom'     :4, 
        'subkingdom'  :5, 
        'superphylum' :6, 
        'phylum'      :7, 
        'subphylum'   :8, 
        'superclass'  :9, 
        'class'       :10, 
        'subclass'    :11, 
        'superorder'  :12, 
        'ord'         :13, 
        'suborder'    :14, 
        'superfamily' :15,
        'family'      :16,
        'subfamily'   :17,
        'tribe'       :18,
        'subtribe'    :19,
        'genus'       :20,
        'subgenus'    :21,
        'species'     :22,
        'subspecies'  :23,
        'order'       :13
    }


def stop_err(msg):
    sys.stderr.write(msg)
    sys.exit()


db = tempfile.NamedTemporaryFile('w')

try:
    con = sqlite.connect(db.name)
    cur = con.cursor()
except:
    stop_err('Cannot connect to %s\n') % db.name
    
try:
    tax_file   = open(sys.argv[1], 'r')
    id_col     = int(sys.argv[2]) - 1
    taxa       = string.split(sys.argv[3].rstrip(),',')
    
    if sys.argv[4] == 'reads':
        out_format = True
    elif sys.argv[4] == 'counts':
        out_format = False
    else:
        stop_err('Please specify "reads" or "counts" for output format\n')
    out_file = open(sys.argv[5], 'w')
    
except:
    stop_err('Check arguments\n')
    
if taxa[0] == 'None': stop_err('Please, use checkboxes to specify taxonomic ranks.\n')

sql = ""
for i in range(len(taxa)):
        if taxa[i] == 'order': taxa[i] = 'ord' # SQL does not like fields to be named 'order'
        sql += '%s text, ' % taxa[i]

sql = sql.strip(', ')
sql = 'create table tax (name varchar(50) not null, ' + sql + ')'

    
cur.execute(sql)

invalid_line_number = 0

try:
    for line in tax_file:
        fields = string.split(line.rstrip(), '\t')
        if len(fields) < 24: 
            invalid_line_number += 1
            continue # Skipping malformed taxonomy lines
        
        val_string = '"' + fields[id_col] + '", '
        
        for rank in taxa:
            taxon = fields[taxRank[rank]]
            val_string += '"%s", ' % taxon
                
        val_string = val_string.strip(', ')
        val_string = "insert into tax values(" + val_string + ")"
        cur.execute(val_string)
except Exception, e:
    stop_err('%s\n' % e)

tax_file.close()    

try:    
    for rank in taxa:
        cur.execute('create temporary table %s (name varchar(50), id text, rank text)' % rank  )
        cur.execute('insert into %s select name, name || %s as id, %s from tax group by id' % ( rank, rank, rank ) )
        cur.execute('create temporary table %s_count(name varchar(50), id text, rank text, N int)' % rank)
        cur.execute('insert into %s_count select name, id, rank, count(*) from %s group by name' % ( rank, rank) )
        
        if rank == 'ord':
            rankName = 'order'
        else:
            rankName = rank
    
        if out_format:
            cur.execute('select name,rank from %s_count where N = 1 and length(rank)>1' % rank)
            for item in cur.fetchall():
                out_string = '%s\t%s\t' % ( item[0], item[1] )
                out_string += rankName
                print >>out_file, out_string
        else:
            cur.execute('select rank, count(*) from %s_count where N = 1 and length(rank)>1 group by rank' % rank)
            for item in cur.fetchall():
                out_string = '%s\t%s\t' % ( item[0], item[1] )
                out_string += rankName
                print >>out_file, out_string
except Exception, e:
    stop_err("%s\n" % e)

Summary ✨

This Python script takes a taxonomy file and outputs reads that only hit one taxonomic group. It uses SQLite to create temporary tables for each rank in the taxonomy and counts the number of reads that match each rank. The output is either a list of reads that only hit one taxonomic group or a count of reads that only hit one taxonomic group for each rank.

Alerts (6)

'def' Ensure functions have docstrings for documentation
79
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
89 105
'open(' Use 'with open()' to ensure Files are properly closed
93 103
'try:' Ensure try blocks have corresponding except or finally blocks
123