/tools/rgenetics/rgRegion.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 80 lines · 61 code · 5 blank · 14 comment · 14 complexity · e450092533558698b36cafdfc69a9434 MD5 · raw file

  1. """
  2. released under the terms of the LGPL
  3. copyright ross lazarus August 2007
  4. for the rgenetics project
  5. Special galaxy tool for the camp2007 data
  6. Allows grabbing arbitrary columns from an arbitrary region
  7. Needs a mongo results file in the location hardwired below or could be passed in as
  8. a library parameter - but this file must have a very specific structure
  9. rs chrom offset float1...floatn
  10. called as
  11. <command interpreter="python">
  12. rsRegion.py $infile '$cols' $r $tag $out_file1
  13. </command>
  14. cols is a delimited list of chosen column names for the subset
  15. r is a ucsc location region pasted into the tool
  16. """
  17. import sys,string
  18. trantab = string.maketrans(string.punctuation,'_'*len(string.punctuation))
  19. print >> sys.stdout, '##rgRegion.py started'
  20. if len(sys.argv) <> 6:
  21. print >> sys.stdout, '##!expected params in sys.argv, got %d - %s' % (len(sys.argv),sys.argv)
  22. sys.exit(1)
  23. print '##got %d - %s' % (len(sys.argv),sys.argv)
  24. # quick and dirty for galaxy - we always get something for each parameter
  25. fname = sys.argv[1]
  26. wewant = sys.argv[2].split(',')
  27. region = sys.argv[3].lower()
  28. tag = sys.argv[4].translate(trantab)
  29. ofname = sys.argv[5]
  30. myname = 'rgRegion'
  31. if len(wewant) == 0: # no columns selected?
  32. print >> sys.stdout, '##!%s: no columns selected - cannot run' % myname
  33. sys.exit(1)
  34. try:
  35. f = open(fname,'r')
  36. except: # bad input file name?
  37. print >> sys.stdout, '##!%s unable to open file %s' % (myname, fname)
  38. sys.exit(1)
  39. try: # TODO make a regexp?
  40. c,rest = region.split(':')
  41. c = c.replace('chr','') # leave although will break strict genome graphs
  42. rest = rest.replace(',','') # remove commas
  43. spos,epos = rest.split('-')
  44. spos = int(spos)
  45. epos = int(epos)
  46. except:
  47. print >> sys.stdout, '##!%s unable to parse region %s - MUST look like "chr8:10,000-100,000' % (myname,region)
  48. sys.exit(1)
  49. print >> sys.stdout, '##%s parsing chrom %s from %d to %d' % (myname, c,spos,epos)
  50. res = []
  51. cnames = f.next().strip().split() # column titles for output
  52. linelen = len(cnames)
  53. wewant = [int(x) - 1 for x in wewant] # need col numbers base 0
  54. for n,l in enumerate(f):
  55. ll = l.strip().split()
  56. thisc = ll[1]
  57. thispos = int(ll[2])
  58. if (thisc == c) and (thispos >= spos) and (thispos <= epos):
  59. if len(ll) == linelen:
  60. res.append([ll[x] for x in wewant]) # subset of columns!
  61. else:
  62. print >> sys.stdout, '##! looking for %d fields - found %d in ll=%s' % (linelen,len(ll),str(ll))
  63. o = file(ofname,'w')
  64. res = ['%s\n' % '\t'.join(x) for x in res] # turn into tab delim string
  65. print >> sys.stdout, '##%s selected and returning %d data rows' % (myname,len(res))
  66. head = [cnames[x] for x in wewant] # ah, list comprehensions - list of needed column names
  67. o.write('%s\n' % '\t'.join(head)) # header row for output
  68. o.write(''.join(res))
  69. o.close()
  70. f.close()