PageRenderTime 26ms CodeModel.GetById 12ms app.highlight 11ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/rgenetics/rgRegion.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 80 lines | 67 code | 0 blank | 13 comment | 14 complexity | e450092533558698b36cafdfc69a9434 MD5 | raw file
 1"""
 2released under the terms of the LGPL
 3copyright ross lazarus August 2007 
 4for the rgenetics project
 5
 6Special galaxy tool for the camp2007 data
 7Allows grabbing arbitrary columns from an arbitrary region
 8
 9Needs a mongo results file in the location hardwired below or could be passed in as
10a library parameter - but this file must have a very specific structure
11rs chrom offset float1...floatn
12
13called as
14    <command interpreter="python">
15        rsRegion.py $infile '$cols' $r $tag $out_file1
16    </command>
17
18cols is a delimited list of chosen column names for the subset
19r is a ucsc location region pasted into the tool
20
21"""
22
23
24import sys,string       
25
26trantab = string.maketrans(string.punctuation,'_'*len(string.punctuation))
27print >> sys.stdout, '##rgRegion.py started'
28if len(sys.argv) <> 6: 
29  print >> sys.stdout, '##!expected  params in sys.argv, got %d - %s' % (len(sys.argv),sys.argv)
30  sys.exit(1)
31print '##got %d - %s' % (len(sys.argv),sys.argv)
32# quick and dirty for galaxy - we always get something for each parameter
33fname = sys.argv[1]
34wewant = sys.argv[2].split(',')
35region = sys.argv[3].lower()
36tag = sys.argv[4].translate(trantab)
37ofname = sys.argv[5] 
38myname = 'rgRegion'
39if len(wewant) == 0: # no columns selected?
40  print >> sys.stdout, '##!%s:  no columns selected - cannot run' % myname
41  sys.exit(1)
42try:
43  f = open(fname,'r')
44except: # bad input file name?
45  print >> sys.stdout, '##!%s unable to open file %s' % (myname, fname)
46  sys.exit(1)
47try: # TODO make a regexp?
48  c,rest = region.split(':')
49  c = c.replace('chr','') # leave although will break strict genome graphs  
50  rest = rest.replace(',','') # remove commas
51  spos,epos = rest.split('-')
52  spos = int(spos)
53  epos = int(epos)
54except:
55  print >> sys.stdout, '##!%s unable to parse region %s - MUST look like "chr8:10,000-100,000' % (myname,region)
56  sys.exit(1)
57print >> sys.stdout, '##%s parsing chrom %s from %d to %d' % (myname, c,spos,epos)
58res = []
59cnames = f.next().strip().split() # column titles for output
60linelen = len(cnames)
61wewant = [int(x) - 1 for x in wewant] # need col numbers base 0
62for n,l in enumerate(f):
63  ll = l.strip().split()
64  thisc = ll[1]
65  thispos = int(ll[2])
66  if (thisc == c) and (thispos >= spos) and (thispos <= epos):
67     if len(ll) == linelen:
68        res.append([ll[x] for x in wewant]) # subset of columns!
69     else:
70        print >> sys.stdout, '##! looking for %d fields - found %d in ll=%s' % (linelen,len(ll),str(ll))
71o = file(ofname,'w')
72res = ['%s\n' % '\t'.join(x) for x in res] # turn into tab delim string
73print >> sys.stdout, '##%s selected and returning %d data rows' % (myname,len(res))
74head = [cnames[x] for x in wewant] # ah, list comprehensions - list of needed column names
75o.write('%s\n' % '\t'.join(head)) # header row for output
76o.write(''.join(res))
77o.close()
78f.close()    
79
80