PageRenderTime 29ms CodeModel.GetById 13ms app.highlight 6ms RepoModel.GetById 2ms app.codeStats 0ms

/tools/motif/seqpos.xml

https://bitbucket.org/cistrome/cistrome-harvard/
XML | 231 lines | 198 code | 33 blank | 0 comment | 0 complexity | 9f641e6bb05ab957915e2a8c816827ad MD5 | raw file
  1<tool name="SeqPos motif tool" id="motif_denovo">
  2  <description>Find motifs from given regions enriched near the centers</description>
  3  <command interpreter="command">/bin/bash $shscript </command> <!-- runs the script rendered from the "shscript" configfile defined further down -->
  4  <inputs> <!-- Form parameters; the shscript configfile reads them by name (e.g. $bfile, $width, $mdb.search_type) -->
  5      <param format="bed" name="bfile" type="data" label="BED file (at most 5K lines.If you have more than 5K lines,please sort them and pick top 5k lines first)" help="Tip: the chromosome in bed file cannot be something like 'chr1_xxxx'. You need to filter them out using the tool 'Filter and Sort -> Select' by 'NOT matching' for the pattern '^chr([0-9A-Za-z])+_'">
  6        <validator type="unspecified_build" />
  7      </param>
  8      <conditional name="mdb"> <!-- mdb_select only chooses which search_type checkbox list is shown; the script itself reads $mdb.search_type -->
  9        <param name="mdb_select" type="select" label="select one to get the list of database" help="The curated database only includes human and mouse data. for detail, see below.">
 10          <option value="cistrome">show our curated cistrome motif database</option>
 11          <option value="public">show the list of public database. (pbm, JASPAR etc.)</option>
 12        </param>
 13        <when value="cistrome">
 14          <param name="search_type" type="select" multiple="true" display="checkboxes" force_select="true" optional="false" label="Select which motif database(s) to use">
 15            <option value="cistrome.xml">cistrome</option>
 16            <option value="denovo">de novo motif search</option> <!-- "denovo" is not a database file: the script turns it into the -d flag -->
 17          </param>
 18        </when>
 19        <when value="public">
 20          <param name="search_type" type="select" multiple="true" display="checkboxes" force_select="true" optional="false" label="Select which motif database(s) to use">
 21            <option value="pbm.xml">pbm</option>
 22            <option value="y1h.xml">y1h</option>
 23            <option value="transfac.xml">transfac</option>
 24            <option value="hpdi.xml">hpdi</option>
 25            <option value="jaspar.xml">jaspar</option>
 26	        <option value="denovo">de novo motif search</option>
 27          </param>
 28        </when>
 29      </conditional>
 30      <param name="species_list" type="select" multiple="true" display="checkboxes" force_select="true" optional="false" label="Select which species to filter the results by (Optional)"> <!-- NOTE(review): label says "(Optional)" but optional="false"; confirm which is intended -->
 31          <option value="hs,mm">Homo Sapien or Mus Musculus</option>
 32          <option value="ce">Caenorhabditis Elegans</option>
 33          <option value="dm">Drosophila Melanogaster</option>
 34      </param>
 35      <param name="width" type="integer" label="width of region to be scanned" value="600">
 36      	<validator type="in_range" max="10000" min="100" message="width is out of range, width has to be between 100 to 10000" />
 37      </param>
 38      <param name="pval" type="float" label="p-value cutoff" value="0.001">
 39        <validator type="in_range" max="1" min="0" message="Pvalue is out of range, Pvalue has to be between 0 to 1" />
 40      </param>
 41      <param name="maxmotif" type="text" label="max output hits. (0 means output all fit the pvalue cutoff)" value="0" optional="true" /> <!-- NOTE(review): a count declared as type="text"; the script hands it to the maxmotif option as typed -->
 42      <param name="hcluster" type="text" label="The similarity cutoff for hierarchical clustering of the output (The higher, the more groups, 0 ~ 1)" value="0.8"/> <!-- NOTE(review): a 0 ~ 1 cutoff declared as type="text", so no range validator applies -->
 43  </inputs>
 44  <outputs> <!-- output_xml and output_html are filled by the copy steps at the end of shscript; log receives the MDSeqPos.py output -->
 45      <data format="xml" name="output_xml"
 46            label="SeqPos xml output on ${bfile.name}"/>
 47      <data format="html" name="output_html"
 48            label="SeqPos html output on ${bfile.name}"/>
 49      <data format="txt" name="log" label="SeqPos Log on ${bfile.name}"/>
 50  </outputs>
 51  <configfiles>
 52    <configfile name="shscript">
 53#!/bin/bash
 54#import os
 55## SeqPos wrapper script. Galaxy renders this template and the tool command runs the result with /bin/bash.
 56## Steps: validate the BED input, turn the form selections into MDSeqPos.py
 57## flags, run MDSeqPos.py, then copy results/ into the Galaxy outputs.
 58## Double-hash lines are template comments and never reach the rendered script;
 59## shell variables are written as ${dollar}NAME so the template engine does not try to substitute them.
 60## The characters below are built with chr(), presumably so the XML tool file needs no escaped angle brackets or ampersands.
 61#set $dollar = chr(36)
 62#set $gt = chr(62)
 63#set $lt = chr(60)
 64#set $ad = chr(38)
 65
 66#set $path = os.path.abspath($__app__.config.tool_path)
 67
 68##NOTE: $mdb.search_type arrives either as a comma-separated list, e.g.
 69##pbm.xml,transfac.xml,denovo, OR as a singleton, e.g. pbm.xml.
 70##The database names are passed through with -m, BUT denovo is not a
 71##database: it is stripped from the list and turned into the -d flag.
 72
 73## check the number of lines (taken from the dataset metadata)
 74lines=$bfile.metadata.data_lines
 75## check the format of bed file; backticks are used because dollar-paren would be read as a template placeholder
 76format=`"$path/validation/fcfunc.py" "$bfile"`
 77
 78if [[ ${dollar}lines -gt 5000 ]];then
 79    echo "Total lines of the bed file exceed the limit of 5000 lines!" ${gt}${ad}2;
 80    exit 1;
 81elif [[ "${dollar}format" != "passed" ]];then
 82    echo "${dollar}format" ${gt}${ad}2;
 83    exit 1;
 84else
 85
 86if [[ "$mdb.search_type" == "None" ]]; then ##ERROR: no search type selected
 87   echo "Please specify what type of motif database to use OR select de novo" ${gt} tmp.txt
 88   cp tmp.txt "$output_xml"
 89   cp tmp.txt "$output_html"
 90   cp tmp.txt "$output_html.extra_files_path" ##NOTE(review): this path is used as a directory further down; confirm this copy is intended
 91else
 92    DENOVO=""
 93    DB=""
 94    ##parse out to search_types, use regex ","
 95    if [[ "$mdb.search_type" =~ "," ]]; then ##list
 96        if [[ "$mdb.search_type" =~ "denovo" ]]; then
 97            DENOVO="-d"
 98            ##REMOVE ',denovo' from the list (evaluated by the template engine at render time, not by bash)
 99            #set $tmp = str($mdb.search_type).replace(',denovo','')
100            DB="-m $tmp"
101        else
102            DB="-m $mdb.search_type"
103        fi
104    else ##singleton
105        if [[ "$mdb.search_type" != "denovo" ]]; then
106            DB="-m $mdb.search_type"
107        else
108            DENOVO="-d"
109        fi
110    fi
111    ##NOTE(review): only an exact single-option value is passed on with -s; a combined selection such as hs,mm,ce matches nothing here and runs unfiltered
112    SPECIESLIST=""
113    if [[ "$species_list" == "hs,mm" || "$species_list" == "ce" || "$species_list" == "dm" ]]; then
114        SPECIESLIST="-s $species_list"
115    fi
116    ##DB, DENOVO and SPECIESLIST stay unquoted below on purpose: each holds "flag value" and must split into two words (or vanish when empty)
117    ##AT this point DENOVO and DB are set correctly, we now make the call
118    echo ${dollar}DB
119    echo ${dollar}DENOVO
120    echo ${dollar}SPECIESLIST
121    MDSeqPos.py "$bfile" "$bfile.metadata.dbkey" ${dollar}DB ${dollar}DENOVO ${dollar}SPECIESLIST -v -c --hcluster="$hcluster" -w "$width" --maxmotif="$maxmotif" -p "$pval" ${ad}${gt} "$log"
122
123    ##SPECIAL case, if no denovo, then create an empty xml file
124    if [[ -z "${dollar}DENOVO" ]]; then
125        touch results/denovo.xml
126    fi
127
128    cp results/denovo.xml "$output_xml"
129    cp results/mdseqpos_index.html "$output_html"
130    ## copy over the extra files; mkdir -p so an already existing directory is not reported as an error
131    EXTRA_FILE_DIR="$output_html.extra_files_path"
132    mkdir -p "${dollar}EXTRA_FILE_DIR"
133    cp -R results/* "${dollar}EXTRA_FILE_DIR"
134    ####cp results/*.js ${dollar}EXTRA_FILE_DIR
135    ####cp results/*.css ${dollar}EXTRA_FILE_DIR
136    ####cp results/*.png ${dollar}EXTRA_FILE_DIR
137
138    ##copy over the motif logos if there are any
139    ##save a list of *.png files to list.txt
140    ##redirect errors for ls to /dev/null, b/c we don't want the msg in list.txt
141    ####ls *.png ${gt} list.txt ${gt} list.txt 2${gt} /dev/null
142    ####if [ -s 'list.txt' ]; then
143    ##IF list is NOT an empty file
144    ####cp *.png ${dollar}EXTRA_FILE_DIR
145    ####fi
146
147fi
148
149fi
150    </configfile>
151  </configfiles>
152  <help>
153The **SeqPos** tool finds motifs enriched in a set of
154regions. **SeqPos** uses the distances from motif positions to the peak
155summits (the centers of the regions) to find the motifs most enriched near
156peak summits. **SeqPos** can scan all the motifs in TRANSFAC, Martha's
157Protein Binding Microarray (a.k.a. PBM) and Scot Wolfe's protein-DNA
158binding database (y1h). **SeqPos** can also try to find *de novo*
159motifs using the MDscan algorithm. Finally, **SeqPos** can cluster
160similar motifs in a cluster tree to help the user filter out redundant
161motifs. This tool was made by Cliff Meyer and Len Taing. A detailed
162explanation of the algorithm can be found in the supplementary
163material of the paper "Nucleosome dynamics define transcriptional
164enhancers." (Nat Genet, 42(4):343-347) The tool was later modified by
165Jian Ma and Tao Liu. Version: 0.590.
166
167About our curated cistrome motif database: This database only
168includes human and mouse data. It combines data from Transfac,
169JASPAR, UniPROBE (pbm) and hPDI, and also includes motifs derived
170from ChIP-seq data. Motifs that look similar to each other are then
171removed to keep the database clean and small. This is the
172recommended database and it is continuously updated.
173
174.. class:: infomark
175
176**TIP:** Please make sure the regions in your BED file are valid! If
177a region extends beyond the boundary of its chromosome, it will cause an error. Also
178please avoid abnormal chromosome names.
179
180.. class:: infomark
181
182**TIP:** The running time increases with the number of
183regions. BED files with more than 5000 lines are rejected by the tool.
184
185.. class:: warningmark
186
187**NEED IMPROVEMENT**
188
189-----
190
191**Parameters**
192
193- **BED file** is the input file. It can be the output of peak-calling
194  software. Please make sure that the regions in the BED
195  file do not extend beyond the chromosome boundaries.
196  *This file can contain at most 5000 lines. If it has more, please
197  filter it using the Galaxy "Filter and Sort" tools*.
198
199- **Genome Assembly version** is the UCSC database version.
200- **Motif databases** are the known motif collections in Cistrome,
201  including TRANSFAC, PBM and Scot Wolfe's database. You can select
202  *de novo motif search* to enable the *de novo* motif scan.
203- **Species list** is the set of species that you want to filter the results
204  by.  Select none of the species to see all of the results.
205- **Width of regions** is the width of the region to scan for motifs around peak
206  summits (centers of input regions).
207- **P-value cutoff** can be used to filter the results.
208
209.. class:: infomark
210
211**TIP:** To browse the known motif databases, click here_
212
213.. _here: http://cistrome.org/motif/
214
215-----
216
217**Output**
218
219- **HTML output** can be opened in a web browser. Users can browse the
220  results in either the middle list view of the page or the bottom
221  cluster tree view, and the details of a motif can be seen in the top
222  detail view. The list view is sortable on every field. The detail
223  view provides two buttons to open the detailed information in a
224  separate webpage, or to show the PSSM of the motif.
225- **XML output** is the XML-formatted output.
226- **LOG file** is the job log. If you see errors, please attach this
227  to the bug report.
228
229  </help>
230
231</tool>