/tools/rgenetics/rgRegion.py
https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 80 lines · 61 code · 5 blank · 14 comment · 14 complexity · e450092533558698b36cafdfc69a9434 MD5 · raw file
- """
- released under the terms of the LGPL
- copyright ross lazarus August 2007
- for the rgenetics project
- Special galaxy tool for the camp2007 data
- Allows grabbing arbitrary columns from an arbitrary region
- Needs a mongo results file in the location hardwired below or could be passed in as
- a library parameter - but this file must have a very specific structure
- rs chrom offset float1...floatn
- called as
- <command interpreter="python">
- rsRegion.py $infile '$cols' $r $tag $out_file1
- </command>
- cols is a delimited list of chosen column names for the subset
- r is a ucsc location region pasted into the tool
- """
- import sys,string
-
- trantab = string.maketrans(string.punctuation,'_'*len(string.punctuation))
- print >> sys.stdout, '##rgRegion.py started'
- if len(sys.argv) <> 6:
- print >> sys.stdout, '##!expected params in sys.argv, got %d - %s' % (len(sys.argv),sys.argv)
- sys.exit(1)
- print '##got %d - %s' % (len(sys.argv),sys.argv)
- # quick and dirty for galaxy - we always get something for each parameter
- fname = sys.argv[1]
- wewant = sys.argv[2].split(',')
- region = sys.argv[3].lower()
- tag = sys.argv[4].translate(trantab)
- ofname = sys.argv[5]
- myname = 'rgRegion'
- if len(wewant) == 0: # no columns selected?
- print >> sys.stdout, '##!%s: no columns selected - cannot run' % myname
- sys.exit(1)
- try:
- f = open(fname,'r')
- except: # bad input file name?
- print >> sys.stdout, '##!%s unable to open file %s' % (myname, fname)
- sys.exit(1)
- try: # TODO make a regexp?
- c,rest = region.split(':')
- c = c.replace('chr','') # leave although will break strict genome graphs
- rest = rest.replace(',','') # remove commas
- spos,epos = rest.split('-')
- spos = int(spos)
- epos = int(epos)
- except:
- print >> sys.stdout, '##!%s unable to parse region %s - MUST look like "chr8:10,000-100,000' % (myname,region)
- sys.exit(1)
- print >> sys.stdout, '##%s parsing chrom %s from %d to %d' % (myname, c,spos,epos)
- res = []
- cnames = f.next().strip().split() # column titles for output
- linelen = len(cnames)
- wewant = [int(x) - 1 for x in wewant] # need col numbers base 0
- for n,l in enumerate(f):
- ll = l.strip().split()
- thisc = ll[1]
- thispos = int(ll[2])
- if (thisc == c) and (thispos >= spos) and (thispos <= epos):
- if len(ll) == linelen:
- res.append([ll[x] for x in wewant]) # subset of columns!
- else:
- print >> sys.stdout, '##! looking for %d fields - found %d in ll=%s' % (linelen,len(ll),str(ll))
- o = file(ofname,'w')
- res = ['%s\n' % '\t'.join(x) for x in res] # turn into tab delim string
- print >> sys.stdout, '##%s selected and returning %d data rows' % (myname,len(res))
- head = [cnames[x] for x in wewant] # ah, list comprehensions - list of needed column names
- o.write('%s\n' % '\t'.join(head)) # header row for output
- o.write(''.join(res))
- o.close()
- f.close()