PageRenderTime 2ms CodeModel.GetById 36ms app.highlight 3ms RepoModel.GetById 2ms app.codeStats 0ms

/tools/ncbi_blast_plus/blastxml_to_tabular.xml

https://bitbucket.org/cistrome/cistrome-harvard/
XML | 127 lines | 111 code | 8 blank | 8 comment | 0 complexity | f5900e3943083b3c926fe193a6a9c7f5 MD5 | raw file
  1<tool id="blastxml_to_tabular" name="BLAST XML to tabular" version="0.0.8">
  2    <description>Convert BLAST XML output to tabular</description>
  3    <command interpreter="python">
  4      blastxml_to_tabular.py "$blastxml_file" "$tabular_file" "$out_format"
  5    </command> <!-- Arguments are quoted so each substituted value reaches the script as a single argument. -->
  6    <inputs>
  7        <param name="blastxml_file" type="data" format="blastxml" label="BLAST results as XML"/>
  8        <param name="out_format" type="select" label="Output format">
  9            <option value="std" selected="True">Tabular (standard 12 columns)</option>
 10            <option value="ext">Tabular (extended 24 columns)</option>
 11        </param>
 12    </inputs>
 13    <outputs>
 14        <data name="tabular_file" format="tabular" label="BLAST results as tabular" />
 15    </outputs>
 16    <requirements>
 17    </requirements>
 18    <tests>
 19        <test>
 20            <param name="blastxml_file" value="blastp_four_human_vs_rhodopsin.xml" ftype="blastxml" />
 21            <param name="out_format" value="std" />
 22            <!-- Note this has some white space differences from the actual blastp output blast_four_human_vs_rhodopsin.tabular -->
 23            <output name="tabular_file" file="blastp_four_human_vs_rhodopsin_converted.tabular" ftype="tabular" />
 24        </test>
 25        <test>
 26            <param name="blastxml_file" value="blastp_four_human_vs_rhodopsin.xml" ftype="blastxml" />
 27            <param name="out_format" value="ext" />
 28            <!-- Note this has some white space differences from the actual blastp output blast_four_human_vs_rhodopsin_22c.tabular -->
 29            <output name="tabular_file" file="blastp_four_human_vs_rhodopsin_converted_ext.tabular" ftype="tabular" />
 30        </test>
 31        <test>
 32            <param name="blastxml_file" value="blastp_sample.xml" ftype="blastxml" />
 33            <param name="out_format" value="std" />
 34            <!-- Note this has some white space differences from the actual blastp output -->
 35            <output name="tabular_file" file="blastp_sample_converted.tabular" ftype="tabular" />
 36        </test>
 37        <test>
 38            <param name="blastxml_file" value="blastx_rhodopsin_vs_four_human.xml" ftype="blastxml" />
 39            <param name="out_format" value="std" />
 40            <!-- Note this has some white space differences from the actual blastx output -->
 41            <output name="tabular_file" file="blastx_rhodopsin_vs_four_human_converted.tabular" ftype="tabular" />
 42        </test>
 43        <test>
 44            <param name="blastxml_file" value="blastx_rhodopsin_vs_four_human.xml" ftype="blastxml" />
 45            <param name="out_format" value="ext" />
 46            <!-- Note this has some white space and XXXX masking differences from the actual blastx output -->
 47            <output name="tabular_file" file="blastx_rhodopsin_vs_four_human_converted_ext.tabular" ftype="tabular" />
 48        </test>
 49        <test>
 50            <param name="blastxml_file" value="blastx_sample.xml" ftype="blastxml" />
 51            <param name="out_format" value="std" />
 52            <!-- Note this has some white space differences from the actual blastx output -->
 53            <output name="tabular_file" file="blastx_sample_converted.tabular" ftype="tabular" />
 54        </test>
 55        <test>
 56            <param name="blastxml_file" value="blastp_human_vs_pdb_seg_no.xml" ftype="blastxml" />
 57            <param name="out_format" value="std" />
 58            <!-- Note this has some white space differences from the actual blastp output -->
 59            <output name="tabular_file" file="blastp_human_vs_pdb_seg_no_converted_std.tabular" ftype="tabular" />
 60        </test>
 61        <test>
 62            <param name="blastxml_file" value="blastp_human_vs_pdb_seg_no.xml" ftype="blastxml" />
 63            <param name="out_format" value="ext" />
 64            <!-- Note this has some white space differences from the actual blastp output -->
 65            <output name="tabular_file" file="blastp_human_vs_pdb_seg_no_converted_ext.tabular" ftype="tabular" />
 66        </test>
 67    </tests>
 68    <help>
 69    
 70**What it does**
 71
 72NCBI BLAST+ (and the older NCBI 'legacy' BLAST) can output in a range of
 73formats including tabular and a more detailed XML format. A complex workflow
 74may need both the XML and the tabular output - but running BLAST twice is
 75slow and wasteful.
 76
 77This tool takes the BLAST XML output and by default converts it into the
 78standard 12 column tabular equivalent:
 79
 80====== ========= ============================================
 81Column NCBI name Description
 82------ --------- --------------------------------------------
 83     1 qseqid    Query Seq-id (ID of your sequence)
 84     2 sseqid    Subject Seq-id (ID of the database hit)
 85     3 pident    Percentage of identical matches
 86     4 length    Alignment length
 87     5 mismatch  Number of mismatches
 88     6 gapopen   Number of gap openings
 89     7 qstart    Start of alignment in query
 90     8 qend      End of alignment in query
 91     9 sstart    Start of alignment in subject (database hit)
 92    10 send      End of alignment in subject (database hit)
 93    11 evalue    Expectation value (E-value)
 94    12 bitscore  Bit score
 95====== ========= ============================================
 96
 97The BLAST+ tools can optionally output additional columns of information,
 98but this takes longer to calculate. Most (but not all) of these columns are
 99included by selecting the extended tabular output. The extra columns are
100included *after* the standard 12 columns. This is so that you can write
101workflow filtering steps that accept either the 12 or 24 column tabular
102BLAST output.
103
104====== ============= ===========================================
105Column NCBI name     Description
106------ ------------- -------------------------------------------
107    13 sallseqid     All subject Seq-id(s), separated by a ';'
108    14 score         Raw score
109    15 nident        Number of identical matches
110    16 positive      Number of positive-scoring matches
111    17 gaps          Total number of gaps
112    18 ppos          Percentage of positive-scoring matches
113    19 qframe        Query frame
114    20 sframe        Subject frame
115    21 qseq          Aligned part of query sequence
116    22 sseq          Aligned part of subject sequence
117    23 qlen          Query sequence length
118    24 slen          Subject sequence length
119====== ============= ===========================================
120
121Beware that the XML file (and thus the conversion) and the tabular output
122direct from BLAST+ may differ in the presence of XXXX masking on regions
123of low complexity (columns 21 and 22), and thus also calculated figures like
124the percentage identity (column 3).
125
126    </help>
127</tool>