seqpos.xml | searchcode

/tools/motif/seqpos.xml

https://bitbucket.org/cistrome/cistrome-harvard/ · XML · 231 lines · 198 code · 33 blank · 0 comment · 0 complexity · 9f641e6bb05ab957915e2a8c816827ad MD5 · raw file

<tool name="SeqPos motif tool" id="motif_denovo">
  <!-- The command line runs /bin/bash on $shscript, the script generated from the configfile named "shscript" in this file. -->
  <description>Find motifs from given regions enriched near the centers</description>
  <command interpreter="command">/bin/bash $shscript </command>
  <inputs>
      <!-- Input regions in BED format. The shscript configfile refuses files with more than 5000 lines. -->
      <param format="bed" name="bfile" type="data" label="BED file (at most 5K lines.If you have more than 5K lines,please sort them and pick top 5k lines first)" help="Tip: the chromosome in bed file cannot be something like 'chr1_xxxx'. You need to filter them out using the tool 'Filter and Sort -> Select' by 'NOT matching' for the pattern '^chr([0-9A-Za-z])+_'">
        <validator type="unspecified_build" />
      </param>
      <!-- Motif database chooser: the value of mdb_select decides which of the two search_type checkbox lists applies. -->
      <conditional name="mdb">
        <param name="mdb_select" type="select" label="select one to get the list of database" help="The curated database only includes human and mouse data. for detail, see below.">
          <option value="cistrome">show our curated cistrome motif database</option>
          <option value="public">show the list of public database. (pbm, JASPAR etc.)</option>
        </param>
        <!-- Curated database. "denovo" is listed last; the script strips the text ",denovo" from the joined selection. -->
        <when value="cistrome">
          <param name="search_type" type="select" multiple="true" display="checkboxes" force_select="true" optional="false" label="Select which motif database(s) to use">
            <option value="cistrome.xml">cistrome</option>
            <option value="denovo">de novo motif search</option>
          </param>
        </when>
        <!-- Public databases. "denovo" is again the last option, for the same reason. -->
        <when value="public">
          <param name="search_type" type="select" multiple="true" display="checkboxes" force_select="true" optional="false" label="Select which motif database(s) to use">
            <option value="pbm.xml">pbm</option>
            <option value="y1h.xml">y1h</option>
            <option value="transfac.xml">transfac</option>
            <option value="hpdi.xml">hpdi</option>
            <option value="jaspar.xml">jaspar</option>
	        <option value="denovo">de novo motif search</option>
          </param>
        </when>
      </conditional>
      <!-- Species filter. The script passes -s only when the selection equals exactly "hs,mm", "ce" or "dm"; any other combination runs unfiltered. -->
      <!-- NOTE(review): the label says "(Optional)" but force_select="true" and optional="false" are set; confirm which is intended. -->
      <param name="species_list" type="select" multiple="true" display="checkboxes" force_select="true" optional="false" label="Select which species to filter the results by (Optional)">
          <option value="hs,mm">Homo Sapien or Mus Musculus</option>
          <option value="ce">Caenorhabditis Elegans</option>
          <option value="dm">Drosophila Melanogaster</option>
      </param>
      <!-- Scan width, passed to MDSeqPos.py as -w; limited to 100..10000 by the validator. -->
      <param name="width" type="integer" label="width of region to be scanned" value="600">
      	<validator type="in_range" max="10000" min="100" message="width is out of range, width has to be between 100 to 10000" />
      </param>
      <!-- P-value cutoff, passed to MDSeqPos.py as -p; limited to 0..1 by the validator. -->
      <param name="pval" type="float" label="p-value cutoff" value="0.001">
        <validator type="in_range" max="1" min="0" message="Pvalue is out of range, Pvalue has to be between 0 to 1" />
      </param>
      <!-- maxmotif and hcluster are free-text parameters with no validator; the script forwards them as the maxmotif and hcluster long options. -->
      <param name="maxmotif" type="text" label="max output hits. (0 means output all fit the pvalue cutoff)" value="0" optional="true" />
      <param name="hcluster" type="text" label="The similarity cutoff for hierarchical clustering of the output (The higher, the more groups, 0 ~ 1)" value="0.8"/>
  </inputs>
  <outputs>
      <!-- Datasets written by the shscript configfile: the de novo XML result, the HTML index page, and the MDSeqPos.py log. -->
      <!-- Each label stays on a single line: a line break inside an attribute value is normalised to a space, so the indentation of a wrapped line would end up inside the dataset name. -->
      <data format="xml" name="output_xml" label="SeqPos xml output on ${bfile.name}"/>
      <data format="html" name="output_html" label="SeqPos html output on ${bfile.name}"/>
      <data format="txt" name="log" label="SeqPos Log on ${bfile.name}"/>
  </outputs>
  <configfiles>
    <configfile name="shscript">
#!/bin/bash
#import os

## Galaxy renders this configfile through Cheetah before bash executes it:
##   * lines that start with a double hash are template comments and never
##     reach the generated script;
##   * dollar-prefixed names are replaced with tool parameter values;
##   * characters that are special to XML or to Cheetah are emitted through
##     the variables defined right below instead of being written literally.
##
## Flow: validate the BED file, translate the database / species selections
## into MDSeqPos.py flags, run MDSeqPos.py, then copy its results into the
## Galaxy output datasets.

#set $dollar = chr(36)
#set $gt = chr(62)
#set $lt = chr(60)
#set $ad = chr(38)

#set $path = os.path.abspath($__app__.config.tool_path)

## Line count comes from the dataset metadata; the format check is delegated
## to validation/fcfunc.py, which is expected to print "passed" on success.
## Backticks are kept on purpose: a dollar sign followed by a parenthesis
## would presumably be parsed by Cheetah as a placeholder.
lines=$bfile.metadata.data_lines
format=`$path/validation/fcfunc.py $bfile`

## Both validation failures report on stderr and stop with a non-zero status
## (a bare exit would return the status of the echo, i.e. 0).
if [[ ${dollar}lines -gt 5000 ]]; then
    echo "Total lines of the bed file exceed the limit of 5000 lines!" ${gt}${ad}2
    exit 1
elif [[ "${dollar}format" != "passed" ]]; then
    echo "${dollar}format" ${gt}${ad}2
    exit 1
else

if [[ "$mdb.search_type" == "None" ]]; then
   ## No database selected: the message is written into the outputs instead
   ## of running the search.
   echo "Please specify what type of motif database to use OR select de novo" ${gt} tmp.txt
   cp tmp.txt $output_xml
   cp tmp.txt $output_html
   cp tmp.txt $output_html.extra_files_path
else
    ## The selection arrives either as a comma-separated list such as
    ## pbm.xml,transfac.xml,denovo or as a single value such as pbm.xml.
    ## Database files are forwarded with -m; "denovo" is not a database and
    ## is turned into the -d flag instead.
    DENOVO=""
    DB=""
    if [[ "$mdb.search_type" =~ "," ]]; then
        if [[ "$mdb.search_type" =~ "denovo" ]]; then
            DENOVO="-d"
            ## Drop ",denovo" from the list; this relies on denovo never
            ## being the first selected option.
            #set $tmp = str($mdb.search_type).replace(',denovo','')
            DB="-m $tmp"
        else
            DB="-m $mdb.search_type"
        fi
    else
        if [[ "$mdb.search_type" != "denovo" ]]; then
            DB="-m $mdb.search_type"
        else
            DENOVO="-d"
        fi
    fi

    ## The species filter is applied only when exactly one option is ticked.
    SPECIESLIST=""
    if [[ "$species_list" == "hs,mm" || "$species_list" == "ce" || "$species_list" == "dm" ]]; then
        SPECIESLIST="-s $species_list"
    fi

    ## Echo the computed flags to the job's stdout for troubleshooting.
    echo ${dollar}DB
    echo ${dollar}DENOVO
    echo ${dollar}SPECIESLIST

    ## DB, DENOVO and SPECIESLIST stay unquoted so that each one splits into
    ## a flag plus its value, or disappears when empty. stdout and stderr of
    ## MDSeqPos.py both go to the log dataset.
    MDSeqPos.py "$bfile" $bfile.metadata.dbkey ${dollar}DB ${dollar}DENOVO ${dollar}SPECIESLIST -v -c --hcluster="$hcluster" -w "$width" --maxmotif="$maxmotif" -p "$pval" ${ad}${gt} "$log"

    ## Without a de novo search there is no denovo.xml; create an empty one
    ## so the XML output can still be filled.
    if [[ -z "${dollar}DENOVO" ]]; then
        touch results/denovo.xml
    fi

    cp results/denovo.xml "$output_xml"
    cp results/mdseqpos_index.html "$output_html"

    ## Copy everything the HTML page references into the dataset's extra
    ## files directory; -p keeps mkdir from failing when it already exists.
    EXTRA_FILE_DIR="$output_html.extra_files_path"
    mkdir -p "${dollar}EXTRA_FILE_DIR"
    cp -R results/* "${dollar}EXTRA_FILE_DIR"

fi

fi
    </configfile>
  </configfiles>
  <help>
The **SeqPos** tool finds motifs enriched in a set of
regions. **SeqPos** uses the distances from motif positions to the peak
summits (centers of the regions) to find the motifs most enriched near
peak summits. **SeqPos** can scan all the motifs in TRANSFAC, Martha's
Protein Binding Microarray (a.k.a. PBM) and Scot Wolfe's protein-DNA
binding database (y1h). **SeqPos** can also try to find *de novo*
motifs using the MDscan algorithm. Finally, **SeqPos** can cluster
similar motifs in a cluster tree to help the user filter out redundant
motifs. This tool was written by Cliff Meyer and Len Taing. A detailed
explanation of the algorithm can be found in the supplementary
material of the paper "Nucleosome dynamics define transcriptional
enhancers." (Nat Genet, 42(4):343-347). The tool was later modified by
Jian Ma and Tao Liu. Version: 0.590.

About our curated cistrome motif database: This database only
includes human and mouse data. It combines data from Transfac,
JASPAR, UniPROBE (pbm) and hPDI, and it also includes motifs derived
from ChIP-seq data. Motifs that look similar to each other are then
removed to keep the database clean and small. This database is the
recommended one and is continually updated.

.. class:: infomark

**TIP:** Please make sure the regions in your BED file are valid! If
a region extends beyond the boundary of its chromosome, it will cause an
error. Also please avoid abnormal chromosome names.

.. class:: infomark

**TIP:** The running time increases with the number of
regions. BED files with more than 5000 lines are rejected.

.. class:: warningmark

**NEED IMPROVEMENT**

-----

**Parameters**

- **BED file** is the input file. It can be the output of peak-calling
  software. Please make sure that the regions in the BED
  file do not extend beyond the chromosome boundaries.
  *This file can contain at most 5000 lines. If it has more, please
  filter it using the Galaxy Filter and Sort tool*.

- **Genome Assembly version** is the UCSC database version.
- **Motif databases** are the known motif collections in Cistrome,
  including TRANSFAC, PBM and Scot Wolfe's database. You can select
  *de novo motif search* to enable the *de novo* motif scan.
- **Species list** are the species that you want to filter the results
  with.  Select none of the species to see all of the results.
- **Width of regions** is the region to scan for motifs around peak
  summits ( centers of input regions).
- **P-value cutoff** can be used to filter the results.

.. class:: infomark

**TIP:** To browse the known motif databases, click here_

.. _here: http://cistrome.org/motif/

-----

**Output**

- **HTML output** can be opened in a web browser. Users can browse the
  results in either the list view in the middle of the page or the
  cluster tree view at the bottom, and the details of a motif are shown
  in the detail view at the top. The list view is sortable by every field.
  The detail view provides two buttons: one opens the detailed information
  in a separate webpage, the other shows the PSSM of the motif.
- **XML output** is the XML-formatted output.
- **LOG file** is the job log. If you see errors, please attach it
  to the bug report.

  </help>

</tool>