PageRenderTime 35ms CodeModel.GetById 18ms app.highlight 14ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/taxonomy/find_diag_hits.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 170 lines | 149 code | 4 blank | 17 comment | 6 complexity | 90e16dbaa2f441bf8e6547899aff00a6 MD5 | raw file
  1#!/usr/bin/env python
  2
  3"""
  4tax_read_grouping.py <file in taxonomy format> <id column> <taxonomic ranks> <output format> <output file>
  5    finds reads that only hit one taxonomic group. For example, consider the folliowing example:
  6    
  7    read1   mammalia
  8    read1   insecta
  9    read2   insecta
 10    
 11    in this case only read2 will be selected becuase it stays within insecta
 12    
 13    This program takes the following options:
 14    
 15    file in taxonomy format - dataset that complies with Galaxy's taxonomy format
 16    id column               - integer specifying the number of column containing seq id (starting with 1)
 17    taxonomic ranks         - a comma separated list of ranks from this list:
 18    
 19         superkingdom
 20         kingdom
 21         subkingdom
 22         superphylum
 23         phylum
 24         subphylum
 25         superclass
 26         class
 27         subclass
 28         superorder
 29         order
 30         suborder
 31         superfamily
 32         family
 33         subfamily
 34         tribe
 35         subtribe
 36         genus
 37         subgenus
 38         species
 39         subspecies
 40    
 41    output format           - reads or counts
 42
 43"""
 44
 45from galaxy import eggs
 46import pkg_resources
 47pkg_resources.require( 'pysqlite' )
 48from pysqlite2 import dbapi2 as sqlite
 49import string, sys, tempfile
 50
 51# This dictionary maps taxonomic ranks to fields of Taxonomy file
 52taxRank = {
 53        'root'        :2, 
 54        'superkingdom':3, 
 55        'kingdom'     :4, 
 56        'subkingdom'  :5, 
 57        'superphylum' :6, 
 58        'phylum'      :7, 
 59        'subphylum'   :8, 
 60        'superclass'  :9, 
 61        'class'       :10, 
 62        'subclass'    :11, 
 63        'superorder'  :12, 
 64        'ord'         :13, 
 65        'suborder'    :14, 
 66        'superfamily' :15,
 67        'family'      :16,
 68        'subfamily'   :17,
 69        'tribe'       :18,
 70        'subtribe'    :19,
 71        'genus'       :20,
 72        'subgenus'    :21,
 73        'species'     :22,
 74        'subspecies'  :23,
 75        'order'       :13
 76    }
 77
 78
 79def stop_err(msg):
 80    sys.stderr.write(msg)
 81    sys.exit()
 82
 83
 84db = tempfile.NamedTemporaryFile('w')
 85
 86try:
 87    con = sqlite.connect(db.name)
 88    cur = con.cursor()
 89except:
 90    stop_err('Cannot connect to %s\n') % db.name
 91    
 92try:
 93    tax_file   = open(sys.argv[1], 'r')
 94    id_col     = int(sys.argv[2]) - 1
 95    taxa       = string.split(sys.argv[3].rstrip(),',')
 96    
 97    if sys.argv[4] == 'reads':
 98        out_format = True
 99    elif sys.argv[4] == 'counts':
100        out_format = False
101    else:
102        stop_err('Please specify "reads" or "counts" for output format\n')
103    out_file = open(sys.argv[5], 'w')
104    
105except:
106    stop_err('Check arguments\n')
107    
108if taxa[0] == 'None': stop_err('Please, use checkboxes to specify taxonomic ranks.\n')
109
110sql = ""
111for i in range(len(taxa)):
112        if taxa[i] == 'order': taxa[i] = 'ord' # SQL does not like fields to be named 'order'
113        sql += '%s text, ' % taxa[i]
114
115sql = sql.strip(', ')
116sql = 'create table tax (name varchar(50) not null, ' + sql + ')'
117
118    
119cur.execute(sql)
120
121invalid_line_number = 0
122
123try:
124    for line in tax_file:
125        fields = string.split(line.rstrip(), '\t')
126        if len(fields) < 24: 
127            invalid_line_number += 1
128            continue # Skipping malformed taxonomy lines
129        
130        val_string = '"' + fields[id_col] + '", '
131        
132        for rank in taxa:
133            taxon = fields[taxRank[rank]]
134            val_string += '"%s", ' % taxon
135                
136        val_string = val_string.strip(', ')
137        val_string = "insert into tax values(" + val_string + ")"
138        cur.execute(val_string)
139except Exception, e:
140    stop_err('%s\n' % e)
141
142tax_file.close()    
143
144try:    
145    for rank in taxa:
146        cur.execute('create temporary table %s (name varchar(50), id text, rank text)' % rank  )
147        cur.execute('insert into %s select name, name || %s as id, %s from tax group by id' % ( rank, rank, rank ) )
148        cur.execute('create temporary table %s_count(name varchar(50), id text, rank text, N int)' % rank)
149        cur.execute('insert into %s_count select name, id, rank, count(*) from %s group by name' % ( rank, rank) )
150        
151        if rank == 'ord':
152            rankName = 'order'
153        else:
154            rankName = rank
155    
156        if out_format:
157            cur.execute('select name,rank from %s_count where N = 1 and length(rank)>1' % rank)
158            for item in cur.fetchall():
159                out_string = '%s\t%s\t' % ( item[0], item[1] )
160                out_string += rankName
161                print >>out_file, out_string
162        else:
163            cur.execute('select rank, count(*) from %s_count where N = 1 and length(rank)>1 group by rank' % rank)
164            for item in cur.fetchall():
165                out_string = '%s\t%s\t' % ( item[0], item[1] )
166                out_string += rankName
167                print >>out_file, out_string
168except Exception, e:
169    stop_err("%s\n" % e)
170