/tools/ceas/gca.xml

https://bitbucket.org/cistrome/cistrome-harvard/ · XML · 134 lines · 116 code · 18 blank · 0 comment · 0 complexity · 981e788ed5c8365e0d7357e96cf673df MD5 · raw file

  1. <tool name="GCA: Gene centered annotation" id="ceas_gca">
  2. <description>Find the nearest interval in the given interval set for every annotated coding gene</description>
  3. <command interpreter="command">/bin/bash $shscript</command> <!-- runs the generated "shscript" config file with bash; the interpreter path must not end in a slash -->
  4. <inputs> <!-- user-supplied parameters: the BED dataset and the search span -->
  5. <param ftype="bed" format="bed" name="bfile" type="data" label="BED file(100,000 lines max)"> <!-- input intervals; the 100,000-line cap is enforced by the shscript config file -->
  6. <validator type="unspecified_build" /> <!-- the dataset needs a genome build: the script uses its dbkey to locate the gene table -->
  7. </param>
  8. <param name="span" type="integer" label="Span" value="3000"> <!-- declared as integer so the in_range validator below receives a numeric value -->
  9. <validator type="in_range" max="1000000" min="100" message="Span is out of range, Span has to be between 100 to 1000000" />
  10. </param>
  11. </inputs>
  12. <outputs> <!-- datasets produced by the job -->
  13. <data name="output" format="xls"/> <!-- receives a copy of gca_out.xls written by gca -->
  14. <data name="log" format="txt" label="job log"/> <!-- receives gca's combined stdout and stderr -->
  15. </outputs>
  16. <configfiles> <!-- shscript is a Cheetah-templated bash script. The chr(36)/chr(62)/chr(60)/chr(38) variables hold the literal characters $ > < and ampersand (presumably so they pass through Cheetah and XML unescaped). The script rejects BED files over 100000 lines or failing fcfunc.py validation by writing to stderr and exiting with status 1; otherwise it runs gca against the gene table selected by the input's dbkey, sends gca's stdout and stderr to the log dataset, and copies gca_out.xls to the output dataset. -->
  17. <configfile name="shscript">
  18. #!/bin/bash
  19. #import os
  20. #set $dollar = chr(36)
  21. #set $gt = chr(62)
  22. #set $lt = chr(60)
  23. #set $ad = chr(38)
  24. #set $path = $os.path.abspath($__app__.config.tool_path)
  25. ##check line count and file format of the BED file; on failure report on stderr and exit with a non-zero status
  26. lines=`wc -l $bfile | tail -1 | awk '{print ${dollar}1}'`
  27. format=`$path/validation/fcfunc.py $bfile`
  28. if [[ ${dollar}lines -gt 100000 ]];then
  29. echo "Total lines of the files exceed the limit of 100000 lines!" ${gt}${ad}2;
  30. exit 1;
  31. elif [[ ${dollar}format != "passed" ]];then
  32. echo ${dollar}format ${gt}${ad}2
  33. exit 1;
  34. else
  35. #set $gtpath = os.path.join( os.path.abspath($__app__.config.cistrome_static_library_path), "ceaslib", "GeneTable", $bfile.metadata.dbkey )
  36. gca -b $bfile --span=$span -g $gtpath --name=gca_out ${gt}${ad} $log
  37. cp gca_out.xls $output
  38. fi
  39. </configfile>
  40. </configfiles>
  41. <tests> <!-- five runs over the same BED file with different span values; each checks both declared outputs ("output" and "log") -->
  42. <test maxseconds="3600" name="GCA_1">
  43. <param name="bfile" value="bedfile.bed" dbkey="hg18" /> <!-- dbkey supplies the genome build the tool requires of its input -->
  44. <param name="span" value="3000" />
  45. <param name="genome" value="hg18" /> <!-- NOTE(review): the tool declares no input named "genome"; confirm whether this param can be dropped from all tests -->
  46. <output name="output" file="gca_1/gca_1.xls" />
  47. <output name="log" file="gca_1/gca_1.log" lines_diff="200" />
  48. </test>
  49. <test maxseconds="3600" name="GCA_2">
  50. <param name="bfile" value="bedfile.bed" dbkey="hg18" />
  51. <param name="span" value="100" />
  52. <param name="genome" value="hg18" />
  53. <output name="output" file="gca_2/gca_2.xls" />
  54. <output name="log" file="gca_2/gca_2.log" lines_diff="200" />
  55. </test>
  56. <test maxseconds="3600" name="GCA_3">
  57. <param name="bfile" value="bedfile.bed" dbkey="hg18" />
  58. <param name="span" value="500" />
  59. <param name="genome" value="hg18" />
  60. <output name="output" file="gca_3/gca_3.xls" />
  61. <output name="log" file="gca_3/gca_3.log" lines_diff="200" />
  62. </test>
  63. <test maxseconds="3600" name="GCA_4">
  64. <param name="bfile" value="bedfile.bed" dbkey="hg18" />
  65. <param name="span" value="1000" />
  66. <param name="genome" value="hg18" />
  67. <output name="output" file="gca_4/gca_4.xls" />
  68. <output name="log" file="gca_4/gca_4.log" lines_diff="200" />
  69. </test>
  70. <test maxseconds="3600" name="GCA_5">
  71. <param name="bfile" value="bedfile.bed" dbkey="hg18" />
  72. <param name="span" value="10000" />
  73. <param name="genome" value="hg18" />
  74. <output name="output" file="gca_5/gca_5.xls" />
  75. <output name="log" file="gca_5/gca_5.log" lines_diff="200" />
  76. </test>
  77. </tests>
  78. <help>
  79. This tool finds the nearest binding sites in the given BED file for
  80. every annotated coding gene. It is a module of the CEAS package,
  81. written by Hyunjin Gene Shin and published in Bioinformatics (PubMed
  82. ID: 19689956).
  83. .. class:: warningmark
  84. **NEED IMPROVEMENT**
  85. -----
  86. **Parameters**
  87. - **BED file** contains the transcription factor binding sites,
  88. generally the BED files for peaks from peak calling tools.
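  89. - **Span** is the distance (in bp) from the TSS and TTS within which ChIP regions are searched (allowed range 100 to 1000000; default 3000).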
  90. - **Genome Annotation Version** determines which gene annotations are used; it is taken from
  91. the genome build (dbkey) of the input BED file. The annotations are downloaded from the UCSC genome site.
  92. -----
  93. **Output**
  94. - **XLS file** is a tab-delimited text file containing the gene-centered annotation.
  95. -----
  96. **script parameter list of GCA**
  97. Options:
  98. --version show program's version number and exit
  99. -h, --help Show this help message and exit.
  100. -b BED, --bed=BED BED file of ChIP regions.
  101. -g GDB, --gt=GDB Gene annotation table. This can be a sqlite3 local db
  102. file, BED file or genome version of UCSC. The BED file
  103. must have an extension of '.bed'
  104. --span=SPAN Span in search of ChIP regions from TSS and TTS,
  105. DEFAULT=3000bp
  106. --name=NAME Experiment name. This will be used to name the output
  107. file. If an experiment name is not given, input BED
  108. file name will be used instead.
  109. --gn-group=GN_GROUP A particular group of genes of interest. If a txt file
  110. with one column of gene names (eg RefSeq IDs in case of
  111. using a refGene table) is given, gca returns the gene-
  112. centered annotation of this particular gene group.
  113. --gname2=NAME2 The gene names of --gn-group will be regarded as
  114. 'name2.' See the schema of the gene annotation table.
  115. </help>
  116. </tool>