/tools/stats/grouping.xml
https://bitbucket.org/cistrome/cistrome-harvard/ · XML · 142 lines · 105 code · 16 blank · 21 comment · 0 complexity · ae39d0d234e26043f4dd2d8e2995d94e MD5 · raw file
<tool id="Grouping1" name="Group" version="2.1.0">
  <!--
    Galaxy tool wrapper for grouping.py: groups a tabular dataset by one
    column and applies one or more aggregate operations to other columns.
    The characters "less than", "greater than" and "ampersand" are written
    as entity references wherever they appear as text, so that the
    document stays well-formed.
  -->
  <description>data by a column and perform aggregate operation on other columns.</description>

  <!--
    Positional arguments handed to grouping.py, in order: output file,
    input dataset, group-by column, ignore-case flag (1 or 0), the selected
    "ignore lines" codes, then one quoted argument per Operation entry
    holding its type, column and rounding choice.
    Do not place XML comments inside the command element: its text is the
    template itself.
  -->
  <command interpreter="python">
    grouping.py
      $out_file1
      $input1
      $groupcol
      $ignorecase
      $ignorelines
      #for $op in $operations
       '${op.optype}
        ${op.opcol}
        ${op.opround}'
      #end for
  </command>

  <inputs>
    <param format="tabular" name="input1" type="data" label="Select data" help="Dataset missing? See TIP below."/>
    <param name="groupcol" label="Group by column" type="data_column" data_ref="input1" />
    <param name="ignorecase" type="boolean" truevalue="1" falsevalue="0">
      <label>Ignore case while grouping?</label>
    </param>
    <!-- Each option value is the decimal ASCII code of the character shown as its label. -->
    <param name="ignorelines" type="select" display="checkboxes" multiple="True" label="Ignore lines beginning with these characters" help="lines beginning with these are not grouped">
      <option value="62">&gt;</option>
      <option value="64">@</option>
      <option value="43">+</option>
      <option value="60">&lt;</option>
      <option value="42">*</option>
      <option value="45">-</option>
      <option value="61">=</option>
      <option value="124">|</option>
      <option value="63">?</option>
      <option value="36">$</option>
      <option value="46">.</option>
      <option value="58">:</option>
      <option value="38">&amp;</option>
      <option value="37">%</option>
      <option value="94">^</option>
      <option value="35">#</option>
    </param>
    <!-- One entry per aggregate operation; each entry becomes one quoted command-line argument. -->
    <repeat name="operations" title="Operation">
      <param name="optype" type="select" label="Type">
        <option value="mean">Mean</option>
        <option value="median">Median</option>
        <option value="mode">Mode</option>
        <option value="max">Maximum</option>
        <option value="min">Minimum</option>
        <option value="sum">Sum</option>
        <option value="length">Count</option>
        <option value="unique">Count Distinct</option>
        <option value="cat">Concatenate</option>
        <option value="cat_uniq">Concatenate Distinct</option>
        <option value="random">Randomly pick</option>
        <option value="std">Standard deviation</option>
      </param>
      <param name="opcol" label="On column" type="data_column" data_ref="input1" />
      <param name="opround" type="select" label="Round result to nearest integer?">
        <option value="no">NO</option>
        <option value="yes">YES</option>
      </param>
    </repeat>
  </inputs>

  <outputs>
    <data format="tabular" name="out_file1" />
  </outputs>

  <requirements>
    <requirement type="python-module">numpy</requirement>
  </requirements>

  <tests>
    <!-- Test valid data -->
    <test>
      <param name="input1" value="1.bed"/>
      <param name="groupcol" value="1"/>
      <param name="ignorecase" value="true"/>
      <param name="optype" value="mean"/>
      <param name="opcol" value="2"/>
      <param name="opround" value="no"/>
      <output name="out_file1" file="groupby_out1.dat"/>
    </test>
    <!-- Long case but test framework doesn't allow yet
    <test>
      <param name="input1" value="1.bed"/>
      <param name="groupcol" value="1"/>
      <param name="ignorecase" value="false"/>
      <param name="operations" value='[{"opcol": "2", "__index__": 0, "optype": "mean", "opround": "no"}, {"opcol": "2", "__index__": 1, "optype": "median", "opround": "no"}, {"opcol": "6", "__index__": 2, "optype": "mode", "opround": "no"}, {"opcol": "2", "__index__": 3, "optype": "max", "opround": "no"}, {"opcol": "2", "__index__": 4, "optype": "min", "opround": "no"}, {"opcol": "2", "__index__": 5, "optype": "sum", "opround": "no"}, {"opcol": "1", "__index__": 6, "optype": "length", "opround": "no"}, {"opcol": "1", "__index__": 7, "optype": "unique", "opround": "no"}, {"opcol": "1", "__index__": 8, "optype": "cat", "opround": "no"}, {"opcol": "6", "__index__": 9, "optype": "cat_uniq", "opround": "no"}, {"opcol": "2", "__index__": 10, "optype": "random", "opround": "no"}, {"opcol": "2", "__index__": 11, "optype": "std", "opround": "no"}]'/>
      <output name="out_file1" file="groupby_out3.tabular"/>
    </test>
    -->
    <!-- Test data with an invalid value in a column. Can't do it because test framework doesn't allow testing of errors
    <test>
      <param name="input1" value="1.tabular"/>
      <param name="groupcol" value="1"/>
      <param name="ignorecase" value="true"/>
      <param name="optype" value="mean"/>
      <param name="opcol" value="2"/>
      <param name="opround" value="no"/>
      <output name="out_file1" file="groupby_out2.dat"/>
    </test>
    -->
  </tests>

  <help>

.. class:: infomark

**TIP:** If your data is not TAB delimited, use *Text Manipulation-&gt;Convert*

-----

**Syntax**

This tool allows you to group the input dataset by a particular column and perform aggregate functions: Mean, Median, Mode, Sum, Max, Min, Count, Count Distinct, Concatenate, Concatenate Distinct, Randomly pick, and Standard deviation on any column(s).

The Concatenate function will take, for each group, each item in the specified column and build a comma delimited list. Concatenate Distinct will do the same but will build a list of unique items with no repetition.

Count and Count Distinct are equivalent to Concatenate and Concatenate Distinct, but will only count the number of items and will return an integer.

- If multiple modes are present, all are reported.

-----

**Example**

- For the following input::

   chr22  1000  1003  TTT
   chr22  2000  2003  aaa
   chr10  2200  2203  TTT
   chr10  1200  1203  ttt
   chr22  1600  1603  AAA

- **Grouping on column 4** while ignoring case, and performing operation **Count on column 1** will return::

   AAA    2
   TTT    3

- **Grouping on column 4** while not ignoring case, and performing operation **Count on column 1** will return::

   aaa    1
   AAA    1
   ttt    1
   TTT    2

  </help>
</tool>