/tools/motif/seqpos.xml

https://bitbucket.org/cistrome/cistrome-harvard/ · XML · 231 lines · 198 code · 33 blank · 0 comment · 0 complexity · 9f641e6bb05ab957915e2a8c816827ad MD5 · raw file

  1. <tool name="SeqPos motif tool" id="motif_denovo">
  2. <description>Find motifs from given regions enriched near the centers</description>
  3. <command interpreter="command">/bin/bash $shscript </command> <!-- Runs, with bash, the script generated from the "shscript" config file declared under configfiles. -->
  4. <inputs> <!-- Form fields of the tool; the shscript config file refers to them by name. -->
  5. <param format="bed" name="bfile" type="data" label="BED file (at most 5K lines.If you have more than 5K lines,please sort them and pick top 5k lines first)" help="Tip: the chromosome in bed file cannot be something like 'chr1_xxxx'. You need to filter them out using the tool 'Filter and Sort -> Select' by 'NOT matching' for the pattern '^chr([0-9A-Za-z])+_'">
  6. <validator type="unspecified_build" />
  7. </param>
  8. <conditional name="mdb"> <!-- The set of motif databases offered below depends on this choice. -->
  9. <param name="mdb_select" type="select" label="select one to get the list of database" help="The curated database only includes human and mouse data. for detail, see below.">
  10. <option value="cistrome">show our curated cistrome motif database</option>
  11. <option value="public">show the list of public database. (pbm, JASPAR etc.)</option>
  12. </param>
  13. <when value="cistrome">
  14. <param name="search_type" type="select" multiple="true" display="checkboxes" force_select="true" optional="false" label="Select which motif database(s) to use">
  15. <option value="cistrome.xml">cistrome</option>
  16. <option value="denovo">de novo motif search</option>
  17. </param>
  18. </when>
  19. <when value="public">
  20. <param name="search_type" type="select" multiple="true" display="checkboxes" force_select="true" optional="false" label="Select which motif database(s) to use">
  21. <option value="pbm.xml">pbm</option>
  22. <option value="y1h.xml">y1h</option>
  23. <option value="transfac.xml">transfac</option>
  24. <option value="hpdi.xml">hpdi</option>
  25. <option value="jaspar.xml">jaspar</option>
  26. <option value="denovo">de novo motif search</option>
  27. </param>
  28. </when>
  29. </conditional>
  30. <param name="species_list" type="select" multiple="true" display="checkboxes" optional="true" label="Select which species to filter the results by (Optional)"> <!-- Optional, as the label and the help text state: ticking no species means the results are not filtered by species. -->
  31. <option value="hs,mm">Homo Sapien or Mus Musculus</option>
  32. <option value="ce">Caenorhabditis Elegans</option>
  33. <option value="dm">Drosophila Melanogaster</option>
  34. </param>
  35. <param name="width" type="integer" label="width of region to be scanned" value="600">
  36. <validator type="in_range" max="10000" min="100" message="width is out of range, width has to be between 100 to 10000" />
  37. </param>
  38. <param name="pval" type="float" label="p-value cutoff" value="0.001">
  39. <validator type="in_range" max="1" min="0" message="Pvalue is out of range, Pvalue has to be between 0 to 1" />
  40. </param>
  41. <param name="maxmotif" type="text" label="max output hits. (0 means output all fit the pvalue cutoff)" value="0" optional="true" />
  42. <param name="hcluster" type="text" label="The similarity cutoff for hierarchical clustering of the output (The higher, the more groups, 0 ~ 1)" value="0.8"/>
  43. </inputs>
  44. <outputs> <!-- Result datasets; every label includes the name of the input BED dataset. -->
  45. <data format="xml" name="output_xml"
  46. label="SeqPos xml output on ${bfile.name}"/>
  47. <data format="html" name="output_html"
  48. label="SeqPos html output on ${bfile.name}"/>
  49. <data name="log" format="txt" label="SeqPos Log on ${bfile.name}"/>
  50. </outputs>
  51. <configfiles>
  52. <configfile name="shscript">
  53. #!/bin/bash
  54. #import os
  55. ## Wrapper around MDSeqPos.py: checks the BED input, turns the selected motif
  56. ## databases and species into command-line flags, runs the scan and copies the
  57. ## files it leaves under results/ into the Galaxy outputs.
  58. ## The variables below emit a literal dollar sign, angle bracket or ampersand
  59. ## into the generated shell script.
  60. #set $dollar = chr(36)
  61. #set $gt = chr(62)
  62. #set $lt = chr(60)
  63. #set $ad = chr(38)
  64. ## absolute tool directory; the BED validator is looked up beneath it
  65. #set $path = os.path.abspath($__app__.config.tool_path)
  66. ##NOTE: the search type comes either as a list: pbm.xml,transfac.xml,denovo
  67. ##OR as a singleton: pbm.xml
  68. ##The list can be handed to the -m param as is, BUT "denovo" is not a
  69. ##database file: it is taken out of the list and becomes the -d flag
  70. ## check the number of lines...
  71. lines=$bfile.metadata.data_lines
  72. ## check the format of bed file; any output other than "passed" is treated as an error message
  73. format=`$path/validation/fcfunc.py $bfile`
  74. ## refuse oversized or malformed input before running anything
  75. if [[ ${dollar}lines -gt 5000 ]];then
  76. echo "Total lines of the bed file exceed the limit of 5000 lines!" ${gt}${ad}2;
  77. ## exit non-zero so the failure is also visible in the exit status
  78. exit 1;
  79. elif [[ ${dollar}format != "passed" ]];then
  80. echo "${dollar}format" ${gt}${ad}2;
  81. exit 1;
  82. else
  83. if [ "$mdb.search_type" == "None" ]; then ##ERROR: no search type selected
  84. ## the message is written into each output in place of real results
  85. echo "Please specify what type of motif database to use OR select de novo" ${gt} tmp.txt
  86. cp tmp.txt $output_xml
  87. cp tmp.txt $output_html
  88. cp tmp.txt $output_html.extra_files_path
  89. else
  90. ## DENOVO becomes "-d" when de novo search is requested, DB becomes "-m LIST"
  91. DENOVO=""
  92. DB=""
  93. ##parse out to search_types, use regex ","
  94. if [[ "$mdb.search_type" =~ "," ]]; then ##list
  95. if [[ "$mdb.search_type" =~ "denovo" ]]; then
  96. DENOVO="-d"
  97. ##REMOVE 'denovo' from the list, wherever it sits in it
  98. #set $tmp = str($mdb.search_type).replace(',denovo','').replace('denovo,','')
  99. DB="-m $tmp"
  100. else
  101. DB="-m $mdb.search_type"
  102. fi
  103. else ##singleton
  104. ## quoted so that an empty value cannot break the test
  105. if [ "$mdb.search_type" != "denovo" ]; then
  106. DB="-m $mdb.search_type"
  107. else
  108. DENOVO="-d"
  109. fi
  110. fi
  111. ## Pass on whatever species were ticked (one option or several joined by
  112. ## commas); "None" or an empty value means no species filter.
  113. SPECIESLIST=""
  114. if [ -n "$species_list" -a "$species_list" != "None" ]; then
  115. SPECIESLIST="-s $species_list"
  116. fi
  117. ##AT this point DENOVO and DB are set correctly, we now make the call
  118. echo ${dollar}DB
  119. echo ${dollar}DENOVO
  120. echo ${dollar}SPECIESLIST
  121. ## DB, DENOVO and SPECIESLIST stay unquoted on purpose: each must expand to nothing or to separate words
  122. MDSeqPos.py $bfile $bfile.metadata.dbkey ${dollar}DB ${dollar}DENOVO ${dollar}SPECIESLIST -v -c --hcluster="$hcluster" -w "$width" --maxmotif="$maxmotif" -p "$pval" ${ad}${gt} $log
  123. ##SPECIAL case, if no denovo, then create an empty xml file
  124. if [ "${dollar}DENOVO" == "" ]; then
  125. touch results/denovo.xml
  126. fi
  127. ## the de novo XML and the HTML index become the two main outputs
  128. cp results/denovo.xml $output_xml
  129. cp results/mdseqpos_index.html $output_html
  130. ## copy over the extra files; -p keeps mkdir from failing if the directory already exists
  131. EXTRA_FILE_DIR=$output_html.extra_files_path
  132. mkdir -p "${dollar}EXTRA_FILE_DIR"
  133. cp -R results/* "${dollar}EXTRA_FILE_DIR"
  134. fi
  135. fi
  136. </configfile>
  137. </configfiles>
  138. <help>
  139. The **SeqPos** tool will find motifs enriched in a set of
  140. regions. **SeqPos** uses the distances from motif positions to the peak
  141. summits ( center of the regions) to find the most enriched motifs near
  142. peak summits. **SeqPos** can scan all the motifs in TRANSFAC, Martha Bulyk's
  143. Protein Binding Microarray ( a.k.a PBM) and Scot Wolfe's protein DNA
  144. binding database ( y1h). Also **SeqPos** can try to find *de novo*
  145. motifs using the MDscan algorithm. Finally, **SeqPos** can cluster
  146. similar motifs in a cluster tree to help users filter out redundant
  147. motifs. This tool was made by Cliff Meyer and Len Taing. A detailed
  148. explanation of the algorithm can be found in the supplementary
  149. material of the paper "Nucleosome dynamics define transcriptional
  150. enhancers." (Nat Genet, 42(4):343-347) The tool was later modified by
  151. Jian Ma and Tao Liu. Version: 0.590.
  152. About our curated cistrome motif database: This database only
  153. includes human and mouse data. It combines data from Transfac,
  154. JASPAR, UniPROBE (pbm) and hPDI, and also includes motifs derived
  155. from ChIP-seq data. Motifs that look similar to one another are then
  156. removed to keep the database clean and small. This is the
  157. recommended database and it is continuously updated.
  158. .. class:: infomark
  159. **TIP:** Please make sure the regions in your BED file are valid! If
  160. a region is out of the boundary of its chromosome, it will cause an error. Also
  161. please avoid abnormal chromosome names.
  162. .. class:: infomark
  163. **TIP:** The running time increases with the number of
  164. regions. Input BED files with more than 5000 lines are rejected.
  165. .. class:: warningmark
  166. **NEED IMPROVEMENT**
  167. -----
  168. **Parameters**
  169. - **BED file** is the input file. It can be the output of
  170. peak-calling software. Please make sure that the regions in the BED
  171. file are not out of the boundary of the chromosome.
  172. *This file can contain at most 5000 lines. If it has more, please
  173. filter it using the Galaxy "Filter and Sort" tools*.
  174. - **Genome Assembly version** is the UCSC database version.
  175. - **Motif databases** are the known motif collections in Cistrome,
  176. including TRANSFAC, PBM and Scot Wolfe's database. You can select
  177. *de novo motif search* to enable *de novo* motif scan.
  178. - **Species list** are the species that you want to filter the results
  179. with. Select none of the species to see all of the results.
  180. - **Width of regions** is the region to scan for motifs around peak
  181. summits ( centers of input regions).
  182. - **P-value cutoff** can be used to filter the results.
  183. .. class:: infomark
  184. **TIP:** To browse the known motif databases, click here_
  185. .. _here: http://cistrome.org/motif/
  186. -----
  187. **Output**
  188. - **HTML output** can be opened in a web browser. Users can browse the
  189. results in either the middle list view of the page or the bottom
  190. cluster tree view, and the details of a motif can be seen in the top
  191. detail view. The list view is sortable on every field. The detail
  192. view provides two buttons to open the detailed information in a
  193. separate webpage, or to show the PSSM of the motif.
  194. - **XML output** is the XML-formatted output.
  195. - **LOG file** is the job log. If you see errors, please attach it
  196. to the bug report.
  197. </help>
  198. </tool>