/tools/rgenetics/rgWebLogo3.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 157 lines · 124 code · 14 blank · 19 comment · 28 complexity · aa4e3a2c86b07640948b1e3422acaa77 MD5 · raw file

"""
# modified june 2 by ross lazarus to add the units option at Assaf Gordon's suggestion
# rgWebLogo3.py
# wrapper to check that all sequences in a fasta file are the same length
"""
  6. import optparse, os, sys, subprocess, tempfile
  7. WEBLOGO = 'weblogo' # executable name for weblogo3 - confusing isn't it?
  8. class WL3:
  9. """
  10. simple wrapper class to check fasta sequence lengths are all identical
  11. """
  12. FASTASTARTSYM = '>'
  13. badseq = '## error - sequences in file %s are not all the same length - cannot proceed. Please read the tool documentation carefully'
  14. def __init__(self,opts=None):
  15. assert opts<>None,'WL3 class needs opts passed in - got None'
  16. self.opts = opts
  17. self.fastaf = file(self.opts.input,'r')
  18. self.clparams = {}
  19. def whereis(self,program):
  20. for path in os.environ.get('PATH', '').split(':'):
  21. if os.path.exists(os.path.join(path, program)) and not os.path.isdir(os.path.join(path, program)):
  22. return os.path.join(path, program)
  23. return None
  24. def runCL(self):
  25. """ construct and run a command line
  26. """
  27. wl = self.whereis(WEBLOGO)
  28. if not wl:
  29. print >> sys.stderr, '## rgWebLogo3.py error - cannot locate the weblogo binary %s on the current path' % WEBLOGO
  30. print >> sys.stderr, '## Please ensure it is installed and working from http://code.google.com/p/weblogo'
  31. sys.exit(1)
  32. cll = [WEBLOGO,]
  33. cll += [' '.join(it) for it in list(self.clparams.items())]
  34. cl = ' '.join(cll)
  35. assert cl > '', 'runCL needs a command line as clparms'
  36. fd,templog = tempfile.mkstemp(suffix='rgtempRun.txt')
  37. tlf = open(templog,'w')
  38. process = subprocess.Popen(cl, shell=True, stderr=tlf, stdout=tlf)
  39. rval = process.wait()
  40. tlf.close()
  41. tlogs = ''.join(open(templog,'r').readlines())
  42. if len(tlogs) > 1:
  43. s = '## executing %s returned status %d and log (stdout/stderr) records: \n%s\n' % (cl,rval,tlogs)
  44. else:
  45. s = '## executing %s returned status %d. Nothing appeared on stderr/stdout\n' % (cl,rval)
  46. os.unlink(templog) # always
  47. if rval <> 0:
  48. print >> sys.stderr, '## rgWebLogo3.py error - executing %s returned error code %d' % (cl,rval)
  49. print >> sys.stderr, '## This may be a data problem or a tool dependency (%s) installation problem' % WEBLOGO
  50. print >> sys.stderr, '## Please ensure %s is correctly installed and working on the command line -see http://code.google.com/p/weblogo' % WEBLOGO
  51. sys.exit(1)
  52. return s
  53. def iter_fasta(self):
  54. """
  55. generator for fasta sequences from a file
  56. """
  57. aseq = []
  58. seqname = None
  59. for i,row in enumerate(self.fastaf):
  60. if row.startswith(self.FASTASTARTSYM):
  61. if seqname <> None: # already in a sequence
  62. s = ''.join(aseq)
  63. l = len(s)
  64. yield (seqname,l)
  65. seqname = row[1:].strip()
  66. aseq = []
  67. else:
  68. if i > 0:
  69. print >> sys.stderr,'Invalid fasta file %s - does not start with %s - please read the tool documentation carefully' % (self.opts.input,self.FASTASTARTSYM)
  70. sys.exit(1)
  71. else:
  72. seqname = row[1:].strip()
  73. else: # sequence row
  74. if seqname == None:
  75. print >> sys.stderr,'Invalid fasta file %s - does not start with %s - please read the tool documentation carefully' % (self.opts.input,self.FASTASTARTSYM)
  76. sys.exit(1)
  77. else:
  78. aseq.append(row.strip())
  79. if seqname <> None: # last one
  80. l = len(''.join(aseq))
  81. yield (seqname,l)
  82. def fcheck(self):
  83. """ are all fasta sequence same length?
  84. might be mongo big
  85. """
  86. flen = None
  87. lasti = None
  88. f = self.iter_fasta()
  89. for i,(seqname,seqlen) in enumerate(f):
  90. lasti = i
  91. if i == 0:
  92. flen = seqlen
  93. else:
  94. if seqlen <> flen:
  95. print >> sys.stderr,self.badseq % self.opts.input
  96. sys.exit(1)
  97. return '# weblogo input %s has %d sequences all of length %d' % (self.opts.input,lasti,flen)
  98. def run(self):
  99. check = self.fcheck()
  100. self.clparams['-f'] = self.opts.input
  101. self.clparams['-o'] = self.opts.output
  102. self.clparams['-t'] = '"%s"' % self.opts.logoname # must be wrapped as a string
  103. self.clparams['-F'] = self.opts.outformat
  104. if self.opts.size <> None:
  105. self.clparams['-s'] = self.opts.size
  106. if self.opts.lower <> None:
  107. self.clparams['-l'] = self.opts.lower
  108. if self.opts.upper <> None:
  109. self.clparams['-u'] = self.opts.upper
  110. if self.opts.colours <> None:
  111. self.clparams['-c'] = self.opts.colours
  112. if self.opts.units <> None:
  113. self.clparams['-U'] = self.opts.units
  114. s = self.runCL()
  115. return check,s
  116. if __name__ == '__main__':
  117. '''
  118. called as
  119. <command interpreter="python">
  120. rgWebLogo3.py --outformat $outformat -s $size -i $input -o $output -t "$logoname" -c "$colours"
  121. #if $range.mode == 'part'
  122. -l "$range.seqstart" -u "$range.seqend"
  123. #end if
  124. </command>
  125. '''
  126. op = optparse.OptionParser()
  127. op.add_option('-i', '--input', default=None)
  128. op.add_option('-F', '--outformat', default='png')
  129. op.add_option('-s', '--size', default=None)
  130. op.add_option('-o', '--output', default='rgWebLogo3')
  131. op.add_option('-t', '--logoname', default='rgWebLogo3')
  132. op.add_option('-c', '--colours', default=None)
  133. op.add_option('-l', '--lower', default=None)
  134. op.add_option('-u', '--upper', default=None)
  135. op.add_option('-U', '--units', default=None)
  136. opts, args = op.parse_args()
  137. assert opts.input <> None,'weblogo3 needs a -i parameter with a fasta input file - cannot open'
  138. assert os.path.isfile(opts.input),'weblogo3 needs a valid fasta input file - cannot open %s' % opts.input
  139. w = WL3(opts)
  140. checks,s = w.run()
  141. print >> sys.stdout, checks # for info