PageRenderTime 34ms CodeModel.GetById 16ms app.highlight 14ms RepoModel.GetById 2ms app.codeStats 0ms

/tools/expression/diffexp_code.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 186 lines | 177 code | 5 blank | 4 comment | 7 complexity | 3ced7ac0aaf9561975a64a4c3c9819d8 MD5 | raw file
  1#build list of available data
  2import string, os, sys, glob, shutil
  3import galaxy.util, shlex
  4from galaxy import datatypes, config, jobs 
  5
  6from com.lilly.cistrome import htmlParser
  7
  8#repository = "/usr/local/galaxy/data/rg/library"
  9
 10def getHasPheno(phef):
 11  if not phef:
 12    return [('Missing input','nothing',False)]
 13  hasPheno = phef.metadata.pheno
 14  phenoOptions = [] 
 15  if (hasPheno == None):
 16    phenoOptions.append(('Manual', 'manual', True))
 17  else:
 18    phenoOptions.append(('Use uploaded pheno file', 'file', True))
 19    phenoOptions.append(('Manual', 'manual', False))
 20  return phenoOptions
 21
 22
 23def getSamples(phef):
 24  if not phef:
 25    return []
 26  #phe = phef.metadata.pheno
 27  infile = open(phef.get_file_name())
 28  header = infile.readline().strip()
 29#  samples = shlex.split(header)
 30  samples = header.split('\t')
 31  if samples[0]=="":
 32    samples = samples[1:]
 33  samples = map(lambda x : (x, x, False), samples)
 34  infile.close()
 35  return samples
 36
 37def get_phecols(phef):
 38   """ this version only gives dichotomous columns
 39   return phenotype column names for an rexpression - eg eset or affybatch
 40   A column with only 1 value doesn't change, so is not interesting for
 41   analysis. A column with a different value in every row is equivalent to a unique
 42   identifier so is also not interesting for anova or limma analysis so both these
 43   are removed after the concordance (count of unique terms) is constructed for each
 44   column. 
 45   """
 46   sep = 'XXX'
 47   phe = phef.metadata.pheno
 48   if (phe == None):
 49      res = [('1. No dichotomous phenotype columns found','',False),]
 50      return res
 51   phelist = phe.split('\n')
 52   head = None 
 53   for nrows,row in enumerate(phelist): # construct concordance
 54        row = row.strip().split('\t')
 55        if nrows == 0:
 56           head = row
 57           totcols = len(row) 
 58           concordance = [{} for x in head] # list of dicts
 59        else:
 60            for col,code in enumerate(row): # keep column order correct
 61                if not concordance[col].get(code,None):
 62                    concordance[col][code] = 1 # first
 63                else:
 64                    concordance[col][code] += 1
 65                
 66   useCols = []
 67   useConc = [] # columns of interest to keep
 68   colNames = []
 69   nrows -= 1 # drop head from count
 70   for c,conc in enumerate(concordance): # c is column number
 71        ck = len(conc.keys())
 72        if (ck > 1) and (ck < nrows): # not all same and not all different!!
 73            useConc.append(conc) # keep concordance
 74            useCols.append(c) # keep column
 75            colNames.append(head[c])
 76   nuse = len(useCols) 
 77   res = [] 
 78   for c,conc in enumerate(useConc): # these are all unique columns for the design matrix
 79            if len(conc.keys()) == 2: # dichotomous
 80                ccounts = [(conc.get(code,0),code) for code in conc.keys()] # decorate
 81                ccounts.sort()
 82                cc = ['%s (%d)' % (x[1],x[0]) for x in ccounts]
 83                retc = [x[1] for x in ccounts] # for design matrix, just want values, not counts
 84                show = '%s%s%s' % (colNames[c],sep,sep.join(cc)) # undecorate to get codes + counts in order
 85                rets = '%s%s%s' % (colNames[c],sep,sep.join(retc)) # for design
 86                res.append((rets,rets,False))
 87   if len(res) >= 1:
 88      x,y,z = res[0] # 0,1 = fid,iid
 89      res[0] = (x,y,True) # set second selected
 90   else:
 91      res = [('2. No dichotomous phenotype columns found','',False),]
 92   return res
 93
 94
 95def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr):
 96    """Sets the name of the html file to the title"""
 97    trantab = string.maketrans(string.punctuation,'_'*len(string.punctuation))
 98    job_name = param_dict.get( 'title', 'AffyExpress' ).translate(trantab)
 99    data = out_data['outfile']
100    newname = '%s.html' % job_name
101    data = app.datatypes_registry.change_datatype(data,'html')
102    data.name = newname
103    out_data['outfile'] = data
104
105    logpath = out_data['outfile'].file_name
106    loglist = file(logpath, 'r').readlines()
107    mng = '### makenewgalaxy'
108    txtpath = out_data['txtoutfile'].file_name
109#    txtf = file(txtpath, 'w')
110#    txtf.write('Probe\tSymbol\tDescription\tGene\tCytoband\tLog2Ratio\tPValue\n')
111    atLeastOne = False
112    newfile = [x for x in loglist if x.split('\t')[0] == mng]
113    newfile = [x.strip().split('\t')[1:] for x in newfile]
114    for (file_path, title) in newfile:
115        okToOpen = os.path.exists(file_path)
116        atLeastOne = okToOpen or atLeastOne
117        if (not okToOpen):
118            continue
119
120        htmlParser.html2Txt(file_path, txtpath)
121#        html = file(file_path, 'r').readlines()
122        """
123        data = [x for x in html if x.split(' ')[0] == '<td']
124        i = 0
125        for (line) in data:
126            i = i + 1
127            if (i == 1):
128                td = string.split(line, '>')
129                probeset = td[2]
130                td = string.split(probeset, '<')
131                probeset = td[0].strip()
132            if (i == 2):
133                td = string.split(line, '>')
134                geneSymbol = td[1]
135                td = string.split(geneSymbol, '<')
136                geneSymbol = td[0].strip()
137            if (i == 3):
138                td = string.split(line, '>')
139                description = td[1]
140                td = string.split(description , '<')
141                description = td[0].strip()
142            if (i == 4):
143                td = string.split(line, '>')
144                entrezId = td[2]
145                td = string.split(entrezId , '<')
146                entrezId = td[0].strip()
147            if (i == 5):
148                td = string.split(line, '>')
149                cytoband = td[2]
150                td = string.split(cytoband , '<')
151                cytoband = td[0].strip()
152            if (i == 6):
153                td = string.split(line, '>')
154                log_2_ratio = td[1]
155                td = string.split(log_2_ratio , '<')
156                log_2_ratio = td[0].strip()
157            if (i == 7):
158                td = string.split(line, '>')
159                pvalue = td[1]
160                td = string.split(pvalue , '<')
161                pvalue = td[0].strip()
162
163                i = 0
164                s = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (probeset, geneSymbol, description, entrezId, cytoband, log_2_ratio, pvalue)
165                txtf.write(s)
166
167    txtf.close()
168"""
169
170    if (not atLeastOne):
171        # overwrite file and output a message
172        txtf = file(txtpath, 'w')
173        infname = param_dict['rexprIn'].name
174        pthresh = param_dict['pthresh']
175        txtf.write("No differentially expressed gene found in %s at p-value %s" % (infname, pthresh))
176        txtf.close()
177
178    out_data['txtoutfile'].set_peek()
179    out_data['txtoutfile'].set_meta()
180    out_data['txtoutfile'].set_size()
181    title = param_dict.get('title', 'Differential_expression')
182    out_data['txtoutfile'].name = title + "TargetGenes.txt"
183    out_data['txtoutfile'].info = title + " target genes"
184
185    app.model.context.flush()
186