PageRenderTime 29ms CodeModel.GetById 18ms app.highlight 4ms RepoModel.GetById 2ms app.codeStats 0ms

/tools/extract/extract_genomic_dna.xml

https://bitbucket.org/cistrome/cistrome-harvard/
XML | 189 lines | 161 code | 24 blank | 4 comment | 0 complexity | b320d4e4bf1691e981095771738afc15 MD5 | raw file
  1<tool id="Extract genomic DNA 1" name="Extract Genomic DNA" version="2.2.3">
  2  <description>using coordinates from assembled/unassembled genomes</description>
  3  <command interpreter="python">
  4      extract_genomic_dna.py $input $out_file1 -o $out_format -d $dbkey 
  5      
  6      #if str( $interpret_features ) == "yes":
  7        -I
  8      #end if
  9      
 10      ## Columns to use in input file: GFF datasets use the fixed columns 1,4,5,7 (plus --gff); all other inputs take the chrom, start, end, strand and name columns from the dataset metadata.
 11      #if isinstance( $input.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__):
 12        -1 1,4,5,7 --gff
 13      #else:
 14        -1 ${input.metadata.chromCol},${input.metadata.startCol},${input.metadata.endCol},${input.metadata.strandCol},${input.metadata.nameCol}
 15      #end if
 16
 17      #if $seq_source.index_source == "cached":
 18        ## Genomic data from cache: pass GALAXY_DATA_INDEX_DIR to the script via -g.
 19        -g ${GALAXY_DATA_INDEX_DIR}
 20      #else:
 21        ## Genomic data from history: pass the user-selected reference file (seq_source.ref_file) via -F.
 22        -F $seq_source.ref_file
 23      #end if
 24  </command>
 25  <inputs>
 26      <param format="interval,gff" name="input" type="data" label="Fetch sequences for intervals in"/>
 27      <param name="interpret_features" type="select" label="Interpret features when possible" help="Only meaningful for GFF, GTF datasets.">
 28          <option value="yes">Yes</option>
 29          <option value="no">No</option>
 30      </param>
 31      <conditional name="seq_source">
 32          <param name="index_source" type="select" label="Source for Genomic Data">
 33              <option value="cached">Locally cached</option>
 34              <option value="history">History</option>
 35          </param>
 36          <when value="cached">
 37          </when>
 38          <when value="history">
 39              <param name="ref_file" type="data" format="fasta" label="Using reference file" />
 40          </when>
 41      </conditional>
 42	  <param name="out_format" type="select" label="Output data type">
 43    	  <option value="fasta">FASTA</option>
 44    	  <option value="interval">Interval</option>
 45	  </param>
 46  </inputs>
 47  <outputs>
 48      <data format="input" name="out_file1" metadata_source="input">
 49          <change_format>
 50              <when input="out_format" value="fasta" format="fasta" />
 51          </change_format>
 52      </data>
 53  </outputs>
 54  <requirements>
 55      <requirement type="package">ucsc_tools</requirement>
 56      <requirement type="binary">faToTwoBit</requirement>
 57  </requirements>
 58  <tests>
 59    <test>
 60      <param name="input" value="1.bed" dbkey="hg17" ftype="bed" />
 61      <param name="interpret_features" value="yes"/>
 62      <param name="index_source" value="cached"/>
 63      <param name="out_format" value="fasta"/>
 64      <output name="out_file1">
 65	<assert_contents>
 66	  <!-- First few lines... -->
 67	  <has_text text=">hg17_chr1_147962192_147962580_- CCDS989.1_cds_0_0_chr1_147962193_r" />
 68	  <has_text text="ACTTGATCCTGCTCCCTCGGTGTCTGCATTGACTCCTCATGCTGGGACTG" />
 69	  <has_text text="GACCCGTCAACCCCCCTGCTCGCTGCTCACGTACCTTCATCACTTTTAGT" />
 70	  <has_text text="GATGATGCAACTTTCGAGGAATGGTTCCCCCAAGGGCGGCCCCCAAAAGT" />
 71	  <!-- Last few lines... -->
 72	  <has_text text="GCTGTGGCACAGAACATGGACTCTGTGTTTAAGGAGCTCTTGGGAAAGAC" />
 73	  <has_text text="CTCTGTCCGCCAGGGCCTTGGGCCAGCATCTACCACCTCTCCCAGTCCTG" />
 74	  <has_text text="GGCCCCGAAGCCCAAAGGCCCCGCCCAGCAGCCGCCTGGGCAGGAACAAA" />
 75	  <has_text text="GGCTTCTCCCGGGGCCCTGGGGCCCCAGCCTCACCCTCAGCTTCCCACCC" />
 76	  <has_text text="CCAGGGCCTAGACACGACCCCCAAGCCACACTGA" />
 77	</assert_contents>
 78      </output>
 79    </test>
 80    <test>
 81      <param name="input" value="droPer1.bed" dbkey="droPer1" ftype="bed" />
 82      <param name="interpret_features" value="yes"/>
 83      <param name="index_source" value="cached"/>
 84      <param name="out_format" value="fasta"/>
 85      <output name="out_file1" file="extract_genomic_dna_out2.fasta" />
 86    </test>
 87    <test>
 88      <param name="input" value="1.bed" dbkey="hg17" ftype="bed" />
 89      <param name="interpret_features" value="yes"/>
 90      <param name="index_source" value="cached"/>
 91      <param name="out_format" value="interval"/>
 92      <output name="out_file1" file="extract_genomic_dna_out3.interval" />
 93    </test>
 94    <!-- Test GFF file support. -->
 95    <test>
 96      <param name="input" value="gff_filter_by_attribute_out1.gff" dbkey="mm9" ftype="gff" />
 97      <param name="interpret_features" value="no"/>
 98      <param name="index_source" value="cached"/>
 99      <param name="out_format" value="interval"/>
100      <output name="out_file1" file="extract_genomic_dna_out4.gff" />
101    </test>
102    <test>
103      <param name="input" value="gff_filter_by_attribute_out1.gff" dbkey="mm9" ftype="gff" />
104      <param name="interpret_features" value="no"/>
105      <param name="out_format" value="fasta"/>
106      <param name="index_source" value="cached"/>
107      <output name="out_file1" file="extract_genomic_dna_out5.fasta" />
108    </test>
109    <!-- Test support for custom reference sequences (a FASTA file from the history) and GFF feature interpretation. -->
110    <test>
111      <param name="input" value="cufflinks_out1.gtf" dbkey="mm9" ftype="gff" />
112      <param name="interpret_features" value="no"/>
113      <param name="index_source" value="history"/>
114      <param name="ref_file" value="tophat_in1.fasta"/>
115      <param name="out_format" value="fasta"/>
116      <output name="out_file1" file="extract_genomic_dna_out6.fasta" />
117    </test>
118    <test>
119      <param name="input" value="cufflinks_out1.gtf" dbkey="mm9" ftype="gff" />
120      <param name="interpret_features" value="yes"/>
121      <param name="index_source" value="history"/>
122      <param name="ref_file" value="tophat_in1.fasta"/>
123      <param name="out_format" value="fasta"/>
124      <output name="out_file1" file="extract_genomic_dna_out7.fasta" />
125    </test>
126  </tests>
127  <help>
128
129.. class:: warningmark
130
131This tool requires interval or GFF input (both are specially formatted tabular data).  If your data is not TAB delimited, first use *Text Manipulation-&gt;Convert*.
132
133.. class:: warningmark
134
135Make sure that the genome build is specified for the dataset from which you are extracting sequences (click the pencil icon in the history item if it is not specified). 
136
137.. class:: warningmark
138
139All of the following will cause a line from the input dataset to be skipped and a warning generated.  The number of warnings and skipped lines is documented in the resulting history item.
140 - Any lines that do not contain at least 3 columns: a chromosome and numerical start and end coordinates.
141 - Sequences that fall outside of the range of a line's start and end coordinates. 
142 - Chromosome, start or end coordinates that are invalid for the specified build.
143 - Any lines whose data columns are not separated by a **TAB** character ( other white-space characters are invalid ).
144
145.. class:: infomark
146
147 **Extract genomic DNA using coordinates from ASSEMBLED genomes and UNassembled genomes** were previously provided as two separate tools.
148
149-----
150
151**What it does**
152
153This tool uses coordinate, strand, and build information to fetch genomic DNA sequences in FASTA or interval format.
154
155If strand is not defined, the default value is "+".
156
157-----
158
159**Example**
160
161If the input dataset is::
162
163    chr7  127475281  127475310  NM_000230  0  +
164    chr7  127485994  127486166  NM_000230  0  +
165    chr7  127486011  127486166  D49487     0  +
166
167Extracting sequences with **FASTA** output data type returns::
168
169    &gt;hg17_chr7_127475281_127475310_+ NM_000230
170    GTAGGAATCGCAGCGCCAGCGGTTGCAAG
171    &gt;hg17_chr7_127485994_127486166_+ NM_000230
172    GCCCAAGAAGCCCATCCTGGGAAGGAAAATGCATTGGGGAACCCTGTGCG
173    GATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATC
174    CAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAG
175    GATCAATGACATTTCACACACG
176    &gt;hg17_chr7_127486011_127486166_+ D49487
177    TGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGG
178    CCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGA
179    CACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCAC
180    ACACG
181
182Extracting sequences with **Interval** output data type returns::
183
184    chr7    127475281       127475310       NM_000230       0       +       GTAGGAATCGCAGCGCCAGCGGTTGCAAG
185    chr7    127485994       127486166       NM_000230       0       +       GCCCAAGAAGCCCATCCTGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCACACACG
186    chr7    127486011       127486166       D49487  0       +       TGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCACACACG
187
188</help>
189</tool>