PageRenderTime 82ms CodeModel.GetById 19ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/expression/diffexp_code.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 186 lines | 177 code | 5 blank | 4 comment | 5 complexity | 3ced7ac0aaf9561975a64a4c3c9819d8 MD5 | raw file
  1. #build list of available data
  2. import string, os, sys, glob, shutil
  3. import galaxy.util, shlex
  4. from galaxy import datatypes, config, jobs
  5. from com.lilly.cistrome import htmlParser
  6. #repository = "/usr/local/galaxy/data/rg/library"
  7. def getHasPheno(phef):
  8. if not phef:
  9. return [('Missing input','nothing',False)]
  10. hasPheno = phef.metadata.pheno
  11. phenoOptions = []
  12. if (hasPheno == None):
  13. phenoOptions.append(('Manual', 'manual', True))
  14. else:
  15. phenoOptions.append(('Use uploaded pheno file', 'file', True))
  16. phenoOptions.append(('Manual', 'manual', False))
  17. return phenoOptions
  18. def getSamples(phef):
  19. if not phef:
  20. return []
  21. #phe = phef.metadata.pheno
  22. infile = open(phef.get_file_name())
  23. header = infile.readline().strip()
  24. # samples = shlex.split(header)
  25. samples = header.split('\t')
  26. if samples[0]=="":
  27. samples = samples[1:]
  28. samples = map(lambda x : (x, x, False), samples)
  29. infile.close()
  30. return samples
  31. def get_phecols(phef):
  32. """ this version only gives dichotomous columns
  33. return phenotype column names for an rexpression - eg eset or affybatch
  34. A column with only 1 value doesn't change, so is not interesting for
  35. analysis. A column with a different value in every row is equivalent to a unique
  36. identifier so is also not interesting for anova or limma analysis so both these
  37. are removed after the concordance (count of unique terms) is constructed for each
  38. column.
  39. """
  40. sep = 'XXX'
  41. phe = phef.metadata.pheno
  42. if (phe == None):
  43. res = [('1. No dichotomous phenotype columns found','',False),]
  44. return res
  45. phelist = phe.split('\n')
  46. head = None
  47. for nrows,row in enumerate(phelist): # construct concordance
  48. row = row.strip().split('\t')
  49. if nrows == 0:
  50. head = row
  51. totcols = len(row)
  52. concordance = [{} for x in head] # list of dicts
  53. else:
  54. for col,code in enumerate(row): # keep column order correct
  55. if not concordance[col].get(code,None):
  56. concordance[col][code] = 1 # first
  57. else:
  58. concordance[col][code] += 1
  59. useCols = []
  60. useConc = [] # columns of interest to keep
  61. colNames = []
  62. nrows -= 1 # drop head from count
  63. for c,conc in enumerate(concordance): # c is column number
  64. ck = len(conc.keys())
  65. if (ck > 1) and (ck < nrows): # not all same and not all different!!
  66. useConc.append(conc) # keep concordance
  67. useCols.append(c) # keep column
  68. colNames.append(head[c])
  69. nuse = len(useCols)
  70. res = []
  71. for c,conc in enumerate(useConc): # these are all unique columns for the design matrix
  72. if len(conc.keys()) == 2: # dichotomous
  73. ccounts = [(conc.get(code,0),code) for code in conc.keys()] # decorate
  74. ccounts.sort()
  75. cc = ['%s (%d)' % (x[1],x[0]) for x in ccounts]
  76. retc = [x[1] for x in ccounts] # for design matrix, just want values, not counts
  77. show = '%s%s%s' % (colNames[c],sep,sep.join(cc)) # undecorate to get codes + counts in order
  78. rets = '%s%s%s' % (colNames[c],sep,sep.join(retc)) # for design
  79. res.append((rets,rets,False))
  80. if len(res) >= 1:
  81. x,y,z = res[0] # 0,1 = fid,iid
  82. res[0] = (x,y,True) # set second selected
  83. else:
  84. res = [('2. No dichotomous phenotype columns found','',False),]
  85. return res
  86. def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr):
  87. """Sets the name of the html file to the title"""
  88. trantab = string.maketrans(string.punctuation,'_'*len(string.punctuation))
  89. job_name = param_dict.get( 'title', 'AffyExpress' ).translate(trantab)
  90. data = out_data['outfile']
  91. newname = '%s.html' % job_name
  92. data = app.datatypes_registry.change_datatype(data,'html')
  93. data.name = newname
  94. out_data['outfile'] = data
  95. logpath = out_data['outfile'].file_name
  96. loglist = file(logpath, 'r').readlines()
  97. mng = '### makenewgalaxy'
  98. txtpath = out_data['txtoutfile'].file_name
  99. # txtf = file(txtpath, 'w')
  100. # txtf.write('Probe\tSymbol\tDescription\tGene\tCytoband\tLog2Ratio\tPValue\n')
  101. atLeastOne = False
  102. newfile = [x for x in loglist if x.split('\t')[0] == mng]
  103. newfile = [x.strip().split('\t')[1:] for x in newfile]
  104. for (file_path, title) in newfile:
  105. okToOpen = os.path.exists(file_path)
  106. atLeastOne = okToOpen or atLeastOne
  107. if (not okToOpen):
  108. continue
  109. htmlParser.html2Txt(file_path, txtpath)
  110. # html = file(file_path, 'r').readlines()
  111. """
  112. data = [x for x in html if x.split(' ')[0] == '<td']
  113. i = 0
  114. for (line) in data:
  115. i = i + 1
  116. if (i == 1):
  117. td = string.split(line, '>')
  118. probeset = td[2]
  119. td = string.split(probeset, '<')
  120. probeset = td[0].strip()
  121. if (i == 2):
  122. td = string.split(line, '>')
  123. geneSymbol = td[1]
  124. td = string.split(geneSymbol, '<')
  125. geneSymbol = td[0].strip()
  126. if (i == 3):
  127. td = string.split(line, '>')
  128. description = td[1]
  129. td = string.split(description , '<')
  130. description = td[0].strip()
  131. if (i == 4):
  132. td = string.split(line, '>')
  133. entrezId = td[2]
  134. td = string.split(entrezId , '<')
  135. entrezId = td[0].strip()
  136. if (i == 5):
  137. td = string.split(line, '>')
  138. cytoband = td[2]
  139. td = string.split(cytoband , '<')
  140. cytoband = td[0].strip()
  141. if (i == 6):
  142. td = string.split(line, '>')
  143. log_2_ratio = td[1]
  144. td = string.split(log_2_ratio , '<')
  145. log_2_ratio = td[0].strip()
  146. if (i == 7):
  147. td = string.split(line, '>')
  148. pvalue = td[1]
  149. td = string.split(pvalue , '<')
  150. pvalue = td[0].strip()
  151. i = 0
  152. s = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (probeset, geneSymbol, description, entrezId, cytoband, log_2_ratio, pvalue)
  153. txtf.write(s)
  154. txtf.close()
  155. """
  156. if (not atLeastOne):
  157. # overwrite file and output a message
  158. txtf = file(txtpath, 'w')
  159. infname = param_dict['rexprIn'].name
  160. pthresh = param_dict['pthresh']
  161. txtf.write("No differentially expressed gene found in %s at p-value %s" % (infname, pthresh))
  162. txtf.close()
  163. out_data['txtoutfile'].set_peek()
  164. out_data['txtoutfile'].set_meta()
  165. out_data['txtoutfile'].set_size()
  166. title = param_dict.get('title', 'Differential_expression')
  167. out_data['txtoutfile'].name = title + "TargetGenes.txt"
  168. out_data['txtoutfile'].info = title + " target genes"
  169. app.model.context.flush()