/tools/motif/seqpos.xml
https://bitbucket.org/cistrome/cistrome-harvard/ · XML · 231 lines · 198 code · 33 blank · 0 comment · 0 complexity · 9f641e6bb05ab957915e2a8c816827ad MD5 · raw file
- <tool name="SeqPos motif tool" id="motif_denovo">
- <description>Find motifs from given regions enriched near the centers</description>
- <!-- Runs the rendered "shscript" configfile (declared in the configfiles section of this tool) with bash. -->
- <command interpreter="command">/bin/bash $shscript </command>
- <inputs>
- <!-- Regions to scan. The generated script rejects files with more than 5000 lines. -->
- <param format="bed" name="bfile" type="data" label="BED file (at most 5K lines.If you have more than 5K lines,please sort them and pick top 5k lines first)" help="Tip: the chromosome in bed file cannot be something like 'chr1_xxxx'. You need to filter them out using the tool 'Filter and Sort -> Select' by 'NOT matching' for the pattern '^chr([0-9A-Za-z])+_'">
- <validator type="unspecified_build" />
- </param>
- <!-- The selected database group decides which search_type checkboxes are offered. -->
- <conditional name="mdb">
- <param name="mdb_select" type="select" label="select one to get the list of database" help="The curated database only includes human and mouse data. for detail, see below.">
- <option value="cistrome">show our curated cistrome motif database</option>
- <option value="public">show the list of public database. (pbm, JASPAR etc.)</option>
- </param>
- <when value="cistrome">
- <param name="search_type" type="select" multiple="true" display="checkboxes" force_select="true" optional="false" label="Select which motif database(s) to use">
- <option value="cistrome.xml">cistrome</option>
- <option value="denovo">de novo motif search</option>
- </param>
- </when>
- <when value="public">
- <param name="search_type" type="select" multiple="true" display="checkboxes" force_select="true" optional="false" label="Select which motif database(s) to use">
- <option value="pbm.xml">pbm</option>
- <option value="y1h.xml">y1h</option>
- <option value="transfac.xml">transfac</option>
- <option value="hpdi.xml">hpdi</option>
- <option value="jaspar.xml">jaspar</option>
- <option value="denovo">de novo motif search</option>
- </param>
- </when>
- </conditional>
- <!-- Optional species filter: the label and the help text both promise that selecting nothing is allowed,
-      and the script passes no species option in that case, so the parameter is declared optional. -->
- <param name="species_list" type="select" multiple="true" display="checkboxes" optional="true" label="Select which species to filter the results by (Optional)">
- <option value="hs,mm">Homo Sapien or Mus Musculus</option>
- <option value="ce">Caenorhabditis Elegans</option>
- <option value="dm">Drosophila Melanogaster</option>
- </param>
- <param name="width" type="integer" label="width of region to be scanned" value="600">
- <validator type="in_range" max="10000" min="100" message="width is out of range, width has to be between 100 to 10000" />
- </param>
- <param name="pval" type="float" label="p-value cutoff" value="0.001">
- <validator type="in_range" max="1" min="0" message="Pvalue is out of range, Pvalue has to be between 0 to 1" />
- </param>
- <!-- Numeric options are typed and range-checked so that only numbers reach the shell command line. -->
- <param name="maxmotif" type="integer" label="max output hits. (0 means output all fit the pvalue cutoff)" value="0" optional="true">
- <validator type="in_range" min="0" message="max output hits is out of range, it has to be 0 or a positive integer" />
- </param>
- <param name="hcluster" type="float" label="The similarity cutoff for hierarchical clustering of the output (The higher, the more groups, 0 ~ 1)" value="0.8">
- <validator type="in_range" max="1" min="0" message="similarity cutoff is out of range, it has to be between 0 to 1" />
- </param>
- </inputs>
- <outputs>
- <!-- Each label is kept on a single line: a line break inside an attribute value ends up in the displayed label. -->
- <data format="xml" name="output_xml" label="SeqPos xml output on ${bfile.name}"/>
- <data format="html" name="output_html" label="SeqPos html output on ${bfile.name}"/>
- <data format="txt" name="log" label="SeqPos Log on ${bfile.name}"/>
- </outputs>
- <configfiles>
- <configfile name="shscript">
- #!/bin/bash
- #import os
- ## Cheetah template that is rendered into the bash script executed by the tool command.
- ## Flow: validate the BED input, translate the database selection into MDSeqPos.py
- ## options, run MDSeqPos.py, then copy its results into the Galaxy outputs.
- ## #DEBUG: dump params
- ## echo ${1} ${2} ${3} ${4} ${5} ${6} > tmp.txt
- ## cp tmp.txt ${4}
- ## cp tmp.txt ${5}
- ## The characters below are produced with chr() so that the rendered script can
- ## contain a shell dollar sign and redirection characters without Cheetah trying
- ## to interpolate them and without writing XML-special characters in this file.
- #set $dollar = chr(36)
- #set $gt = chr(62)
- #set $lt = chr(60)
- #set $ad = chr(38)
- #set $path = os.path.abspath($__app__.config.tool_path)
- ##NOTE: ${3} will come as as a list: pbm.xml,transfac.xml,denovo OR as a
- ##singleton: pbm.xml
- ##we can send this list using the -m param, BUT we need to handle the
- ##denovo special case
- ## check the number of lines...
- lines=$bfile.metadata.data_lines
- ## check the format of bed file
- format=`$path/validation/fcfunc.py $bfile`
- ## Both validation failures report on stderr and exit with a non-zero status;
- ## a bare exit would return the status of the preceding echo, which is 0.
- if [[ ${dollar}lines -gt 5000 ]];then
- echo "Total lines of the bed file exceed the limit of 5000 lines!" ${gt}${ad}2;
- exit 1;
- elif [[ ${dollar}format != "passed" ]];then
- echo "${dollar}format" ${gt}${ad}2;
- exit 1;
- else
- if [ "$mdb.search_type" == "None" ]; then ##ERROR: no search type selected
- echo "Please specify what type of motif database to use OR select de novo" ${gt} tmp.txt
- cp tmp.txt $output_xml
- cp tmp.txt $output_html
- cp tmp.txt $output_html.extra_files_path
- else
- DENOVO=""
- DB=""
- ##parse out to search_types, use regex ","
- if [[ "$mdb.search_type" =~ "," ]]; then #list
- if [[ "$mdb.search_type" =~ "denovo" ]]; then
- DENOVO="-d"
- ##REMOVE ',denovo' from the list
- #set $tmp = str($mdb.search_type).replace(',denovo','')
- DB="-m $tmp"
- else
- DB="-m $mdb.search_type"
- fi
- else ##singleton
- ## quoted so the test stays well-formed whatever the rendered value is
- if [ "$mdb.search_type" != "denovo" ]; then
- DB="-m $mdb.search_type"
- else
- DENOVO="-d"
- fi
- fi
- 
- ## NOTE(review): only an exact single selection is forwarded with -s; a combined
- ## selection such as hs,mm,ce matches none of the tests and is dropped. Confirm
- ## whether MDSeqPos.py accepts a longer comma-separated list before widening this.
- SPECIESLIST=""
- if [ "$species_list" == "hs,mm" -o "$species_list" == "ce" -o "$species_list" == "dm" ]; then
- SPECIESLIST="-s $species_list"
- fi
- ##AT this point DENOVO and DB are set correctly, we now make the call
- echo ${dollar}DB
- echo ${dollar}DENOVO
- echo ${dollar}SPECIESLIST
- ## DB, DENOVO and SPECIESLIST are left unquoted on purpose: each holds either
- ## nothing or an option plus its value and must be split into separate words.
- MDSeqPos.py $bfile $bfile.metadata.dbkey ${dollar}DB ${dollar}DENOVO ${dollar}SPECIESLIST -v -c --hcluster="$hcluster" -w "$width" --maxmotif="$maxmotif" -p "$pval" ${ad}${gt} $log
- ##SPECIAL case, if no denovo, then create an empty xml file
- if [ "${dollar}DENOVO" == "" ]; then
- touch results/denovo.xml
- fi
- cp results/denovo.xml $output_xml
- cp results/mdseqpos_index.html $output_html
- ## copy over the extra files
- EXTRA_FILE_DIR=$output_html.extra_files_path
- ## -p: do not fail when the directory already exists
- mkdir -p ${dollar}EXTRA_FILE_DIR
- cp -R results/* ${dollar}EXTRA_FILE_DIR
- ####cp results/*.js ${dollar}EXTRA_FILE_DIR
- ####cp results/*.css ${dollar}EXTRA_FILE_DIR
- ####cp results/*.png ${dollar}EXTRA_FILE_DIR
- ##copy over the motif logos if there are any
- ##save a list of *.png files to list.txt
- ##redirect errors for ls to /dev/null, b/c we don't want the msg in list.txt
- ####ls *.png ${gt} list.txt ${gt} list.txt 2${gt} /dev/null
- ####if [ -s 'list.txt' ]; then
- ##IF list is NOT and empty file
- ####cp *.png ${dollar}EXTRA_FILE_DIR
- ####fi
- fi
- fi
- </configfile>
- </configfiles>
- <help>
- The **SeqPos** tool finds motifs enriched in a set of
- regions. **SeqPos** uses the distances from motif positions to the peak
- summits (the centers of the regions) to find the motifs most enriched near
- peak summits. **SeqPos** can scan all the motifs in TRANSFAC, Martha's
- Protein Binding Microarray (a.k.a. PBM) and Scot Wolfe's protein-DNA
- binding database (y1h). **SeqPos** can also try to find *de novo*
- motifs using the MDscan algorithm. Finally, **SeqPos** can cluster
- similar motifs in a cluster tree to help the user filter out redundant
- motifs. This tool was made by Cliff Meyer and Len Taing. A detailed
- explanation of the algorithm can be found in the supplementary
- material of the paper "Nucleosome dynamics define transcriptional
- enhancers." (Nat Genet, 42(4):343-347). The tool was later modified by
- Jian Ma and Tao Liu. Version: 0.590.
- About our curated cistrome motif database: This database only
- includes human and mouse data. It combines data from Transfac,
- JASPAR, UniPROBE (pbm) and hPDI, and it also includes motifs derived
- from ChIP-seq data. Motifs that look similar to each other are then
- removed to keep the database clean and small. This is the
- recommended database and it is continually updated.
- .. class:: infomark
- **TIP:** Please make sure the regions in your BED file are valid! If
- a region lies outside the boundary of its chromosome, it will cause an error. Also
- please avoid abnormal chromosome names.
- .. class:: infomark
- **TIP:** The running time increases with the number of
- regions. Input files with more than 5000 regions (lines) are rejected.
- .. class:: warningmark
- **NEED IMPROVEMENT**
- -----
- **Parameters**
- - **BED file** is the input file. It can be the output of
- peak-calling software. Please make sure that the regions in the BED
- file do not extend beyond the boundaries of their chromosomes.
- *This file can contain at most 5000 lines. If it has more, please
- filter it using the Galaxy "Filter and Sort" tools*.
- - **Genome Assembly version** is the UCSC database version.
- - **Motif databases** are the known motif collections in Cistrome,
- including TRANSFAC, PBM and Scot Wolfe's database. You can select
- *de novo motif search* to enable a *de novo* motif scan.
- - **Species list** is the set of species by which you want to filter the
- results. Select none of the species to see all of the results.
- - **Width of regions** is the region to scan for motifs around peak
- summits ( centers of input regions).
- - **P-value cutoff** can be used to filter the results.
- .. class:: infomark
- **TIP:** To browse the known motif databases, click here_
- .. _here: http://cistrome.org/motif/
- -----
- **Output**
- - **HTML output** can be opened in a web browser. Users can browse the
- results in either the list view in the middle of the page or the cluster
- tree view at the bottom, and the details of a motif are shown in the
- detail view at the top. The list view is sortable by every field. The detail
- view provides two buttons: one opens the detailed information in a
- separate webpage, the other shows the PSSM of the motif.
- - **XML output** is the XML-formatted output.
- - **LOG file** is for job log. If you see errors, please attach this
- in the bug report.
- </help>
- </tool>