/tools/ceas/gca.xml
https://bitbucket.org/cistrome/cistrome-harvard/ · XML · 134 lines · 116 code · 18 blank · 0 comment · 0 complexity · 981e788ed5c8365e0d7357e96cf673df MD5 · raw file
- <tool name="GCA: Gene centered annotation" id="ceas_gca">
- <description>Find the nearest interval in the given interval set for every annotated coding gene</description>
- <command interpreter="command">/bin/bash $shscript</command> <!-- runs the rendered "shscript" configfile with bash; the interpreter path must not end in a slash -->
- <inputs>
- <!-- BED intervals to annotate. The dataset must have a genome build assigned; the job script uses that build to pick the gene annotation table. -->
- <param ftype="bed" format="bed" name="bfile" type="data" label="BED file(100,000 lines max)">
- <validator type="unspecified_build" />
- </param>
- <!-- Declared as an integer so that the in_range validator always compares a number; the value is passed to gca as its span option. -->
- <param name="span" type="integer" label="Span" value="3000">
- <validator type="in_range" max="1000000" min="100" message="Span is out of range, Span has to be between 100 to 1000000" />
- </param>
- </inputs>
- <outputs>
- <data format="xls" name="output" /> <!-- receives a copy of gca_out.xls at the end of the job script -->
- <data format="txt" name="log" label="job log"/> <!-- receives the stdout and stderr of the gca run -->
- </outputs>
- <configfiles>
- <configfile name="shscript">
- #!/bin/bash
- ## Job script rendered by Cheetah: validate the BED input, then run gca.
- ## Shell metacharacters are generated with chr() so that the template
- ## itself never contains a literal dollar sign, angle bracket or ampersand.
- #import os
- #set $dollar = chr(36)
- #set $gt = chr(62)
- #set $lt = chr(60)
- #set $ad = chr(38)
- #set $path = $os.path.abspath($__app__.config.tool_path)
- ## Gene table location, selected by the genome build of the input dataset.
- #set $gtpath = os.path.join( os.path.abspath($__app__.config.cistrome_static_library_path), "ceaslib", "GeneTable", $bfile.metadata.dbkey )
- ## Reject oversized inputs first, so the format validator never runs on them.
- lines=`wc -l ${lt} "$bfile"`
- if [[ "${dollar}lines" -gt 100000 ]]; then
- echo "Total lines of the files exceed the limit of 100000 lines!" ${gt}${ad}2
- exit 1
- fi
- ## The validator is expected to print "passed"; any other output is treated
- ## as the error message and reported on stderr.
- format=`"$path/validation/fcfunc.py" "$bfile"`
- if [[ "${dollar}format" != "passed" ]]; then
- echo "${dollar}format" ${gt}${ad}2
- exit 1
- fi
- ## gca writes gca_out.xls in the working directory; its stdout and stderr go to the job log.
- gca -b "$bfile" --span=$span -g "$gtpath" --name=gca_out ${gt}${ad} "$log"
- cp gca_out.xls "$output"
- </configfile>
- </configfiles>
- <tests>
- <!-- Each test runs gca on the same BED file with a different span. The genome build is attached to the input dataset through dbkey, and the two expected files are matched to the "output" and "log" datasets respectively. -->
- <test maxseconds="3600" name="GCA_1">
- <param name="bfile" value="bedfile.bed" dbkey="hg18" />
- <param name="span" value="3000" />
- <output name="output" file="gca_1/gca_1.xls" />
- <output name="log" file="gca_1/gca_1.log" lines_diff="200" />
- </test>
- <test maxseconds="3600" name="GCA_2">
- <param name="bfile" value="bedfile.bed" dbkey="hg18" />
- <param name="span" value="100" />
- <output name="output" file="gca_2/gca_2.xls" />
- <output name="log" file="gca_2/gca_2.log" lines_diff="200" />
- </test>
- <test maxseconds="3600" name="GCA_3">
- <param name="bfile" value="bedfile.bed" dbkey="hg18" />
- <param name="span" value="500" />
- <output name="output" file="gca_3/gca_3.xls" />
- <output name="log" file="gca_3/gca_3.log" lines_diff="200" />
- </test>
- <test maxseconds="3600" name="GCA_4">
- <param name="bfile" value="bedfile.bed" dbkey="hg18" />
- <param name="span" value="1000" />
- <output name="output" file="gca_4/gca_4.xls" />
- <output name="log" file="gca_4/gca_4.log" lines_diff="200" />
- </test>
- <test maxseconds="3600" name="GCA_5">
- <param name="bfile" value="bedfile.bed" dbkey="hg18" />
- <param name="span" value="10000" />
- <output name="output" file="gca_5/gca_5.xls" />
- <output name="log" file="gca_5/gca_5.log" lines_diff="200" />
- </test>
- </tests>
- <help>
- This tool finds the nearest binding site in the given BED file for
- every annotated coding gene. It is a module of the CEAS package,
- written by Hyunjin Gene Shin and published in Bioinformatics (PubMed
- ID: 19689956).
- .. class:: warningmark
- **NEED IMPROVEMENT**
- -----
- **Parameters**
- - **BED file** contains the transcription factor binding sites,
- generally the BED files for peaks from peak calling tools.
- - **Span** is the distance (in bp) searched from the TSS and TTS for ChIP regions (default 3000; allowed range 100 to 1000000).
- - **Genome Annotation Version** specifies the annotations matching the
- data set. It is taken from the genome build assigned to the BED file, so the
- build must be set. The annotations are downloaded from the UCSC genome site.
- -----
- **Output**
- - **XLS file** is a tab-delimited file containing the gene-centered annotation.
- -----
- **script parameter list of GCA**
- Options:
- --version show program's version number and exit
- -h, --help Show this help message and exit.
- -b BED, --bed=BED BED file of ChIP regions.
- -g GDB, --gt=GDB Gene annotation table. This can be a sqlite3 local db
- file, BED file or genome version of UCSC. The BED file
- must have an extension of '.bed'
- --span=SPAN Span in search of ChIP regions from TSS and TTS,
- DEFAULT=3000bp
- --name=NAME Experiment name. This will be used to name the output
- file. If an experiment name is not given, input BED
- file name will be used instead.
- --gn-group=GN_GROUP A particular group of genes of interest. If a txt file
- with one column of gene names (eg RefSeq IDs in case of
- using a refGene table) is given, gca returns the gene-
- centered annotation of this particular gene group.
- --gname2=NAME2 The gene names of --gn-group will be regarded as
- 'name2.' See the schema of the gene annotation table.
- </help>
- </tool>