PageRenderTime 12ms CodeModel.GetById 2ms app.highlight 6ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/new_operations/column_join.xml

https://bitbucket.org/cistrome/cistrome-harvard/
XML | 260 lines | 226 code | 18 blank | 16 comment | 0 complexity | 64c62b6f695252b6d76434f87641ae29 MD5 | raw file
  1<tool id="column_join" name="Column Join" version="1.1.0">
  2  <description></description>
     <!-- Command template (Cheetah). Runs column_join.py with the output path,
          the first and second input datasets, the hinge column and the list of
          selected columns. The fill options file is passed only when
          "Fill empty columns" is set to Yes. Every dataset chosen in the
          "Additional Input" repeat is appended as a positional argument. -->
  3  <command interpreter="python">
  4    column_join.py
  5        --output=$output
  6        --input1=$input1
  7        --input2=$input2
  8        --hinge=$hinge
  9        --columns=$columns
 10        #if $fill_empty_columns.fill_empty_columns_switch == "fill_empty":
 11            --fill_options_file=$fill_options_file
 12        #end if
 13        #for $f in $file_chooser:
 14            ${f.input}
 15        #end for
 16  </command>
 17  <inputs>
     <!-- First dataset of the join. Its columns populate the hinge selector,
          the column selector and the per-column fill selector below. -->
 18    <param name="input1" type="data" format="tabular" label="Choose the first file for the join" />
     <!-- Key columns: the selected column plus every column to its left. -->
 19    <param name="hinge" type="data_column" data_ref="input1" multiple="false" numerical="false" label="Use this column and the columns to its left as the 'hinge' (matching data for each join)" help="All columns to the left of the selected column (plus the selected column) will be used. Select 2 for pileup" />
 20    <param name="columns" type="data_column" data_ref="input1" multiple="true" numerical="false" label="Include these columns" help="Multi-select list - hold the appropriate key while clicking to select multiple columns" />
     <!-- Optional filling of empty columns, either with one value for all
          columns or with values chosen per column. The selection is
          serialised by the fill_options_file configfile. -->
 21    <conditional name="fill_empty_columns">
 22      <param name="fill_empty_columns_switch" type="select" label="Fill empty columns">
 23        <option value="no_fill" selected="True">No</option>
 24        <option value="fill_empty">Yes</option>
 25      </param>
 26      <when value="no_fill" />
 27      <when value="fill_empty">
 28        <conditional name="do_fill_empty_columns">
 29          <param name="column_fill_type" type="select" label="Fill Columns by">
 30            <option value="single_fill_value" selected="True">Single fill value</option>
 31            <option value="fill_value_by_column">Values by column</option>
 32          </param>
 33          <when value="single_fill_value">
 34            <param type="text" name="fill_value" label="Fill value" value="." />
 35          </when>
 36          <when value="fill_value_by_column">
 37            <repeat name="column_fill" title="Fill Column">
 38              <param name="column_number" label="Column" type="data_column" data_ref="input1" />
 39              <param type="text" name="fill_value" label="Fill value" value="." />
 40            </repeat>
 41          </when>
 42        </conditional>
 43      </when>
 44    </conditional>
 45    <param name="input2" type="data" format="tabular" label="Choose the second file for the join" />
     <!-- Any number of further datasets; each one is appended to the command
          line as a positional argument. -->
 46    <repeat name="file_chooser" title="Additional Input">
 47      <param name="input" label="Additional input file" type="data" format="tabular" />
 48    </repeat>
 49  </inputs>
 50  <configfiles>
     <!-- fill_options_file: Cheetah template that writes the fill settings as
          JSON. It starts from an empty dictionary. When "Fill empty columns"
          is Yes it adds a "file1_columns" list with one entry per column of
          input1 (taken from the dataset's column-count metadata). Each entry
          is the single fill value, or an empty string when filling by column.
          In the by-column case, every "Fill Column" repeat entry then
          overwrites the slot at its 1-based column number. When filling is
          off the result is an empty JSON object. -->
 51    <configfile name="fill_options_file">&lt;%
 52import simplejson
 53%&gt;
 54#set $__fill_options = {}
 55#if $fill_empty_columns['fill_empty_columns_switch'] == 'fill_empty':
 56    #if $fill_empty_columns['do_fill_empty_columns']['column_fill_type'] == 'single_fill_value':
 57        #set $__start_fill = $fill_empty_columns['do_fill_empty_columns']['fill_value'].value
 58    #else:
 59        #set $__start_fill = ""
 60    #end if
 61    #set $__fill_options['file1_columns'] = [ __start_fill for i in range( int( $input1.metadata.columns ) ) ]
 62    #if $fill_empty_columns['do_fill_empty_columns']['column_fill_type'] == 'fill_value_by_column':
 63        #for column_fill in $fill_empty_columns['do_fill_empty_columns']['column_fill']:
 64            #set $__fill_options['file1_columns'][ int( column_fill['column_number'].value ) - 1 ] = column_fill['fill_value'].value
 65        #end for
 66    #end if
 67#end if
 68${simplejson.dumps( __fill_options )}
 69    </configfile>
 70  </configfiles>
 71  <outputs>
     <!-- Single tabular result; its path is handed to column_join.py through
          the output option of the command template. -->
 72    <data name="output" format="tabular" />
 73  </outputs>
 74  <tests>
 75    <test>
         <!-- Three pileup inputs, hinge column 2, keeping columns 1-5 and 7,
              with empty columns filled by the single value "?". -->
 76      <param name="input1" value="column_join_in1.pileup" ftype="pileup" />
 77      <param name="hinge" value="2" />
 78      <param name="columns" value="1,2,3,4,5,7" />
 79      <param name="fill_empty_columns_switch" value="fill_empty" />
 80      <param name="column_fill_type" value="single_fill_value" />
 81      <param name="fill_value" value="?" />
 82      <param name="input2" value="column_join_in2.pileup" ftype="pileup" />
 83      <param name="input" value="column_join_in3.pileup" ftype="pileup" />
 84      <output name="output" file="column_join_out1.pileup" ftype="tabular" />
 85    </test>
 86    <test>
         <!-- Three pileup inputs, hinge column 2, keeping columns 1-4, with
              filling of empty columns switched off. -->
 87      <param name="input1" value="column_join_in4.pileup" ftype="pileup" />
 88      <param name="hinge" value="2" />
 89      <param name="columns" value="1,2,3,4" />
 90      <param name="fill_empty_columns_switch" value="no_fill" />
 91      <param name="input2" value="column_join_in5.pileup" ftype="pileup" />
 92      <param name="input" value="column_join_in6.pileup" ftype="pileup" />
 93      <output name="output" file="column_join_out2.pileup" ftype="tabular" />
 94    </test>
 95<!--  This test is failing for an unclear reason (the column values do not get
 96      passed into the script), but passes in the browser
 97    <test>
 98      <param name="input1" value="column_join_in7.pileup" ftype="tabular" />
 99      <param name="hinge" value="2" />
100      <param name="columns" value="3,4,5" />
101      <param name="fill_empty_columns_switch" value="fill_empty" />
102      <param name="column_fill_type" value="fill_value_by_column" />
103      <param name="column_number" value="5" />
104      <param name="fill_value" value="X" />
105      <param name="input2" value="column_join_in8.pileup" ftype="tabular" />
106      <param name="input" value="column_join_in9.pileup" ftype="tabular" />
107      <output name="output" file="column_join_out3.pileup" ftype="tabular" />
108    </test>
109-->
110    <test>
         <!-- Three pileup inputs, hinge column 1, keeping columns 2 and 7,
              with filling of empty columns switched off. -->
111      <param name="input1" value="column_join_in10.pileup" ftype="pileup" />
112      <param name="hinge" value="1" />
113      <param name="columns" value="2,7" />
114      <param name="fill_empty_columns_switch" value="no_fill" />
115      <param name="input2" value="column_join_in11.pileup" ftype="pileup" />
116      <param name="input" value="column_join_in12.pileup" ftype="pileup" />
117      <output name="output" file="column_join_out4.pileup" ftype="tabular" />
118    </test>
119    <test>
120      <!-- Test for handling a missing column: three tabular inputs, hinge column 1, keeping only column 5, with empty columns filled by the single value "0" -->
121      <param name="input1" value="column_join_in13.tabular" ftype="tabular" />
122      <param name="hinge" value="1" />
123      <param name="columns" value="5" />
124      <param name="fill_empty_columns_switch" value="fill_empty" />
125      <param name="column_fill_type" value="single_fill_value" />
126      <param name="fill_value" value="0" />
127      <param name="input2" value="column_join_in14.tabular" ftype="tabular" />
128      <param name="input" value="column_join_in15.tabular" ftype="tabular" />
129      <output name="output" file="column_join_out5.tabular" ftype="tabular" />
130    </test>
131  </tests>
132  <help>
133**What it does**
134
135This tool allows you to join several files with the same column structure into one file, removing certain columns if necessary. The user needs to select a 'hinge', which is the number of left-most columns to match on. They also need to select the columns to include in the join, which should include the hinge columns, too.
136
137Note that the files are expected to have the same number of columns. If for some reason the join column is missing (this only applies to the last column(s)), the tool attempts to handle this situation by inserting an empty item (or the appropriate filler) for that column on that row. This could lead to the situation where a row has a hinge but entirely empty or filled columns, if the hinge exists in at least one file but every file that has it is missing the join column. Also, note that the tool does not distinguish between a file missing the hinge altogether and a file having the hinge but missing the column (in both cases the column would be empty or filled). There is an example of this below.
138
139-----
140
141**General Example**
142
143Given the following files::
144
145  FILE 1
146  chr2    1    T    6    .C...,     I$$III
147  chr2    2    G    6    ..N..,     III@II
148  chr2    3    C    7    ..C...,    I$IIIII
149  chr2    4    G    7    .G....,    I#IIIII
150  chr2    5    G    7    ...N..,    IIII#BI
151  chr2    6    A    7    ..T...,    I$IDIII
152  chr1    1    C    1    ^:.        I
153  chr1    2    G    2    .^:.       $I
154  chr1    3    A    2    ..         I%
155  chr1    4    C    2    ..         I$
156  chr1    5    T    3    ..^:.      I#I
157  chr1    6    G    3    ..^:,      I#I
158
159  FILE 2
160  chr1    3    T    1    ^:.        I
161  chr1    4    G    2    .^:.       $I
162  chr1    5    T    2    ..         I%
163  chr1    6    C    3    ..^:.      III
164  chr1    7    G    3    ..^:.      I#I
165  chr1    8    T    4    ...^:,     I#II
166  chr2    77   C    6    .G...,     I$$III
167  chr2    78   G    6    ..N..,     III@II
168  chr2    79   T    7    ..N...,    I$IIIII
169  chr2    80   C    7    .G....,    I#IIIII
170  chr2    81   G    7    ...A..,    IIII#BI
171  chr2    82   A    8    ...G...,   I$IDIIII
172  chr2    83   T    8    .A.....N   IIIIIIII
173  chr2    84   A    9    ......T.   I$IIIIIII
174
175  FILE 3
176  chr1    1    A    1    .          I
177  chr1    2    T    2    G.         I$
178  chr1    3    C    2    .,         I@
179  chr1    4    C    3    ..N        III
180  chr1    42   C    5    ...N^:.    III@I
181  chr1    43   C    5    .N..^:.    IIIII
182  chr1    44   T    5    .A..,      IA@II
183  chr1    45   A    6    .N...^:.   IIIII$
184  chr1    46   G    6    .GN..^:.   I@IIII
185  chr1    47   A    7    ....^:..,  IIIII$I
186  chr2    73   T    5    .N..,      II$II
187  chr2    74   A    5    ....,      IIIII
188  chr2    75   T    5    ....,      IIIII
189  chr2    76   T    5    ....,      IIIII
190  chr2    77   C    5    ....,      IIIBI
191  chr2    78   T    5    ....,      IDIII
192
193To join on columns 3 and 4 combining on columns 1 and 2, columns 1-4 should be selected for the 'Include these columns' option, and column 2 selected for the 'hinge'. With these settings, the following would be output::
194
195  chr1    1    C    1              A    1
196  chr1    2    G    2              T    2
197  chr1    3    A    2    T    1    C    2
198  chr1    4    C    2    G    2    C    3
199  chr1    5    T    3    T    2
200  chr1    6    G    3    C    3
201  chr1    7              G    3
202  chr1    8              T    4
203  chr1    42                       C    5
204  chr1    43                       C    5
205  chr1    44                       T    5
206  chr1    45                       A    6
207  chr1    46                       G    6
208  chr1    47                       A    7
209  chr2    1    T    6
210  chr2    2    G    6
211  chr2    3    C    7
212  chr2    4    G    7
213  chr2    5    G    7
214  chr2    6    A    7
215  chr2    73                       T    5
216  chr2    74                       A    5
217  chr2    75                       T    5
218  chr2    76                       T    5
219  chr2    77             C    6    C    5
220  chr2    78             G    6    T    5
221  chr2    79             T    7
222  chr2    80             C    7
223  chr2    81             G    7
224  chr2    82             A    8
225  chr2    83             T    8
226  chr2    84             A    9
227
228**Example with missing columns**
229
230Given the following input files::
231
232  FILE 1
233  1   A
234  2   B   b
235  4   C   c
236  5   D
237  6   E   e
238
239  FILE 2
240  1   M   m
241  2   N
242  3   O   o
243  4   P   p
244  5   Q
245  7   R   r
246
247If we join only column 3 using column 1 as the hinge and with a fill value of '0', this is what will be output::
248
249  1   0   m
250  2   b   0
251  3   0   o
252  4   c   p
253  5   0   0
254  6   e   0
255  7   0   r
256
257Row 5 appears in both files with the missing column, so it's got nothing but fill values in the output file.
258
259  </help>
260</tool>