/tools/expression/diffexp_code.py
Python | 186 lines | 177 code | 5 blank | 4 comment | 5 complexity | 3ced7ac0aaf9561975a64a4c3c9819d8 MD5 | raw file
- #build list of available data
- import string, os, sys, glob, shutil
- import galaxy.util, shlex
- from galaxy import datatypes, config, jobs
- from com.lilly.cistrome import htmlParser
- #repository = "/usr/local/galaxy/data/rg/library"
- def getHasPheno(phef):
- if not phef:
- return [('Missing input','nothing',False)]
- hasPheno = phef.metadata.pheno
- phenoOptions = []
- if (hasPheno == None):
- phenoOptions.append(('Manual', 'manual', True))
- else:
- phenoOptions.append(('Use uploaded pheno file', 'file', True))
- phenoOptions.append(('Manual', 'manual', False))
- return phenoOptions
- def getSamples(phef):
- if not phef:
- return []
- #phe = phef.metadata.pheno
- infile = open(phef.get_file_name())
- header = infile.readline().strip()
- # samples = shlex.split(header)
- samples = header.split('\t')
- if samples[0]=="":
- samples = samples[1:]
- samples = map(lambda x : (x, x, False), samples)
- infile.close()
- return samples
- def get_phecols(phef):
- """ this version only gives dichotomous columns
- return phenotype column names for an rexpression - eg eset or affybatch
- A column with only 1 value doesn't change, so is not interesting for
- analysis. A column with a different value in every row is equivalent to a unique
- identifier so is also not interesting for anova or limma analysis so both these
- are removed after the concordance (count of unique terms) is constructed for each
- column.
- """
- sep = 'XXX'
- phe = phef.metadata.pheno
- if (phe == None):
- res = [('1. No dichotomous phenotype columns found','',False),]
- return res
- phelist = phe.split('\n')
- head = None
- for nrows,row in enumerate(phelist): # construct concordance
- row = row.strip().split('\t')
- if nrows == 0:
- head = row
- totcols = len(row)
- concordance = [{} for x in head] # list of dicts
- else:
- for col,code in enumerate(row): # keep column order correct
- if not concordance[col].get(code,None):
- concordance[col][code] = 1 # first
- else:
- concordance[col][code] += 1
-
- useCols = []
- useConc = [] # columns of interest to keep
- colNames = []
- nrows -= 1 # drop head from count
- for c,conc in enumerate(concordance): # c is column number
- ck = len(conc.keys())
- if (ck > 1) and (ck < nrows): # not all same and not all different!!
- useConc.append(conc) # keep concordance
- useCols.append(c) # keep column
- colNames.append(head[c])
- nuse = len(useCols)
- res = []
- for c,conc in enumerate(useConc): # these are all unique columns for the design matrix
- if len(conc.keys()) == 2: # dichotomous
- ccounts = [(conc.get(code,0),code) for code in conc.keys()] # decorate
- ccounts.sort()
- cc = ['%s (%d)' % (x[1],x[0]) for x in ccounts]
- retc = [x[1] for x in ccounts] # for design matrix, just want values, not counts
- show = '%s%s%s' % (colNames[c],sep,sep.join(cc)) # undecorate to get codes + counts in order
- rets = '%s%s%s' % (colNames[c],sep,sep.join(retc)) # for design
- res.append((rets,rets,False))
- if len(res) >= 1:
- x,y,z = res[0] # 0,1 = fid,iid
- res[0] = (x,y,True) # set second selected
- else:
- res = [('2. No dichotomous phenotype columns found','',False),]
- return res
- def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr):
- """Sets the name of the html file to the title"""
- trantab = string.maketrans(string.punctuation,'_'*len(string.punctuation))
- job_name = param_dict.get( 'title', 'AffyExpress' ).translate(trantab)
- data = out_data['outfile']
- newname = '%s.html' % job_name
- data = app.datatypes_registry.change_datatype(data,'html')
- data.name = newname
- out_data['outfile'] = data
- logpath = out_data['outfile'].file_name
- loglist = file(logpath, 'r').readlines()
- mng = '### makenewgalaxy'
- txtpath = out_data['txtoutfile'].file_name
- # txtf = file(txtpath, 'w')
- # txtf.write('Probe\tSymbol\tDescription\tGene\tCytoband\tLog2Ratio\tPValue\n')
- atLeastOne = False
- newfile = [x for x in loglist if x.split('\t')[0] == mng]
- newfile = [x.strip().split('\t')[1:] for x in newfile]
- for (file_path, title) in newfile:
- okToOpen = os.path.exists(file_path)
- atLeastOne = okToOpen or atLeastOne
- if (not okToOpen):
- continue
- htmlParser.html2Txt(file_path, txtpath)
- # html = file(file_path, 'r').readlines()
- """
- data = [x for x in html if x.split(' ')[0] == '<td']
- i = 0
- for (line) in data:
- i = i + 1
- if (i == 1):
- td = string.split(line, '>')
- probeset = td[2]
- td = string.split(probeset, '<')
- probeset = td[0].strip()
- if (i == 2):
- td = string.split(line, '>')
- geneSymbol = td[1]
- td = string.split(geneSymbol, '<')
- geneSymbol = td[0].strip()
- if (i == 3):
- td = string.split(line, '>')
- description = td[1]
- td = string.split(description , '<')
- description = td[0].strip()
- if (i == 4):
- td = string.split(line, '>')
- entrezId = td[2]
- td = string.split(entrezId , '<')
- entrezId = td[0].strip()
- if (i == 5):
- td = string.split(line, '>')
- cytoband = td[2]
- td = string.split(cytoband , '<')
- cytoband = td[0].strip()
- if (i == 6):
- td = string.split(line, '>')
- log_2_ratio = td[1]
- td = string.split(log_2_ratio , '<')
- log_2_ratio = td[0].strip()
- if (i == 7):
- td = string.split(line, '>')
- pvalue = td[1]
- td = string.split(pvalue , '<')
- pvalue = td[0].strip()
- i = 0
- s = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (probeset, geneSymbol, description, entrezId, cytoband, log_2_ratio, pvalue)
- txtf.write(s)
- txtf.close()
- """
- if (not atLeastOne):
- # overwrite file and output a message
- txtf = file(txtpath, 'w')
- infname = param_dict['rexprIn'].name
- pthresh = param_dict['pthresh']
- txtf.write("No differentially expressed gene found in %s at p-value %s" % (infname, pthresh))
- txtf.close()
- out_data['txtoutfile'].set_peek()
- out_data['txtoutfile'].set_meta()
- out_data['txtoutfile'].set_size()
- title = param_dict.get('title', 'Differential_expression')
- out_data['txtoutfile'].name = title + "TargetGenes.txt"
- out_data['txtoutfile'].info = title + " target genes"
- app.model.context.flush()