<!-- Source: /tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml
     https://bitbucket.org/cistrome/cistrome-harvard/ · XML · 276 lines · 246 code · 18 blank · 12 comment · 0 complexity · 4b7985cccb23bb316af8a52ac3cd1bef MD5 · raw file -->
<tool id="ncbi_blastp_wrapper" name="NCBI BLAST+ blastp" version="0.0.11">
    <!--
    Galaxy tool definition for the NCBI BLAST+ "blastp" program.
    The protein query is searched either against a BLAST database chosen from
    blastdb_p.loc or against a FASTA dataset supplied as the subject. A single
    output dataset is produced; its datatype follows the selected output format
    (see the change_format rules in the outputs section).
    -->
    <description>Search protein database with protein query sequence(s)</description>
    <version_command>blastp -version</version_command>
    <!-- The command is launched through hide_stderr.py using the python interpreter. -->
    <command interpreter="python">hide_stderr.py
## The command is a Cheetah template which allows some Python based syntax.
## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
blastp
-query "$query"
## Search either a pre-built BLAST database or a FASTA dataset from the history
#if $db_opts.db_opts_selector == "db":
-db "${db_opts.database.fields.path}"
#else:
-subject "$db_opts.subject"
#end if
-task $blast_type
-evalue $evalue_cutoff
## Quoted like the other file arguments so the path is always passed as one word
-out "$output1"
##Set the extended list here so if/when we add things, saved workflows are not affected
#if str($out_format)=="ext":
-outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen"
#else:
## Deliberately unquoted: the HTML choices carry an extra switch (for example "0 -html")
-outfmt $out_format
#end if
## Thread count: taken from the GALAXY_SLOTS environment variable when it is set,
## otherwise 8 (the previous fixed value). The backslash keeps Cheetah from
## treating the shell expansion as a template placeholder.
-num_threads "\${GALAXY_SLOTS:-8}"
#if $adv_opts.adv_opts_selector=="advanced":
$adv_opts.filter_query
-matrix $adv_opts.matrix
## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string
## Note -max_target_seqs overrides -num_descriptions and -num_alignments
#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0):
-max_target_seqs $adv_opts.max_hits
#end if
#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0):
-word_size $adv_opts.word_size
#end if
##Ungapped disabled for now - see comments below
##$adv_opts.ungapped
$adv_opts.parse_deflines
## End of advanced options:
#end if
    </command>
    <inputs>
        <param name="query" type="data" format="fasta" label="Protein query sequence(s)"/>
        <!-- Each branch defines both "database" and "subject"; the unused one is a hidden empty value. -->
        <conditional name="db_opts">
            <param name="db_opts_selector" type="select" label="Subject database/sequences">
                <option value="db" selected="true">BLAST Database</option>
                <option value="file">FASTA file</option>
            </param>
            <when value="db">
                <param name="database" type="select" label="Protein BLAST database">
                    <options from_file="blastdb_p.loc">
                        <column name="value" index="0"/>
                        <column name="name" index="1"/>
                        <column name="path" index="2"/>
                    </options>
                </param>
                <param name="subject" type="hidden" value="" />
            </when>
            <when value="file">
                <param name="database" type="hidden" value="" />
                <param name="subject" type="data" format="fasta" label="Protein FASTA file to use as database"/>
            </when>
        </conditional>
        <param name="blast_type" type="select" display="radio" label="Type of BLAST">
            <option value="blastp">blastp</option>
            <option value="blastp-short">blastp-short</option>
        </param>
        <param name="evalue_cutoff" type="float" size="15" value="0.001" label="Set expectation value cutoff" />
        <!-- Except for "ext" (expanded in the command template), the option value is passed verbatim after -outfmt. -->
        <param name="out_format" type="select" label="Output format">
            <option value="6" selected="true">Tabular (standard 12 columns)</option>
            <option value="ext">Tabular (extended 24 columns)</option>
            <option value="5">BLAST XML</option>
            <option value="0">Pairwise text</option>
            <option value="0 -html">Pairwise HTML</option>
            <option value="2">Query-anchored text</option>
            <option value="2 -html">Query-anchored HTML</option>
            <option value="4">Flat query-anchored text</option>
            <option value="4 -html">Flat query-anchored HTML</option>
            <!--
            <option value="-outfmt 11">BLAST archive format (ASN.1)</option>
            -->
        </param>
        <conditional name="adv_opts">
            <param name="adv_opts_selector" type="select" label="Advanced Options">
                <option value="basic" selected="true">Hide Advanced Options</option>
                <option value="advanced">Show Advanced Options</option>
            </param>
            <when value="basic" />
            <when value="advanced">
                <!-- Could use a select (yes, no, other) where other allows setting 'window locut hicut' -->
                <param name="filter_query" type="boolean" label="Filter out low complexity regions (with SEG)" truevalue="-seg yes" falsevalue="-seg no" checked="false" />
                <param name="matrix" type="select" label="Scoring matrix">
                    <option value="BLOSUM90">BLOSUM90</option>
                    <option value="BLOSUM80">BLOSUM80</option>
                    <option value="BLOSUM62" selected="true">BLOSUM62 (default)</option>
                    <option value="BLOSUM50">BLOSUM50</option>
                    <option value="BLOSUM45">BLOSUM45</option>
                    <option value="PAM250">PAM250</option>
                    <option value="PAM70">PAM70</option>
                    <option value="PAM30">PAM30</option>
                </param>
                <!-- Why doesn't optional override a validator? I want to accept an empty string OR a non-negative integer -->
                <param name="max_hits" type="integer" value="0" label="Maximum hits to show" help="Use zero for default limits">
                    <validator type="in_range" min="0" />
                </param>
                <!-- I'd like word_size to be optional, with minimum 2 for blastp -->
                <param name="word_size" type="integer" value="0" label="Word size for wordfinder algorithm" help="Use zero for default, otherwise minimum 2.">
                    <validator type="in_range" min="0" />
                </param>
                <!--
                Can't use '-ungapped' on its own, error back is:
                Composition-adjusted searched are not supported with an ungapped search, please add -comp_based_stats F or do a gapped search
                Tried using '-ungapped -comp_based_stats F' and blastp crashed with 'Attempt to access NULL pointer.'
                <param name="ungapped" type="boolean" label="Perform ungapped alignment only?" truevalue="-ungapped -comp_based_stats F" falsevalue="" checked="false" />
                -->
                <param name="parse_deflines" type="boolean" label="Should the query and subject defline(s) be parsed?" truevalue="-parse_deflines" falsevalue="" checked="false" help="This affects the formatting of the query/subject ID strings"/>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <!-- Tabular by default (out_format "6" and "ext"); the rules below switch the datatype for the other formats. -->
        <data name="output1" format="tabular" label="${blast_type.value_label} on ${db_opts.db_opts_selector}">
            <change_format>
                <when input="out_format" value="0" format="txt"/>
                <when input="out_format" value="0 -html" format="html"/>
                <when input="out_format" value="2" format="txt"/>
                <when input="out_format" value="2 -html" format="html"/>
                <when input="out_format" value="4" format="txt"/>
                <when input="out_format" value="4 -html" format="html"/>
                <when input="out_format" value="5" format="blastxml"/>
            </change_format>
        </data>
    </outputs>
    <requirements>
        <requirement type="binary">blastp</requirement>
    </requirements>
    <tests>
        <!-- Four human proteins against the rhodopsin set, advanced options shown, BLAST XML output. -->
        <test>
            <param name="query" value="four_human_proteins.fasta" ftype="fasta" />
            <param name="db_opts_selector" value="file" />
            <param name="subject" value="rhodopsin_proteins.fasta" ftype="fasta" />
            <param name="database" value="" />
            <param name="evalue_cutoff" value="1e-8" />
            <param name="blast_type" value="blastp" />
            <param name="out_format" value="5" />
            <param name="adv_opts_selector" value="advanced" />
            <param name="filter_query" value="False" />
            <param name="matrix" value="BLOSUM62" />
            <param name="max_hits" value="0" />
            <param name="word_size" value="0" />
            <param name="parse_deflines" value="True" />
            <output name="output1" file="blastp_four_human_vs_rhodopsin.xml" ftype="blastxml" />
        </test>
        <!-- Same search with the standard 12 column tabular output. -->
        <test>
            <param name="query" value="four_human_proteins.fasta" ftype="fasta" />
            <param name="db_opts_selector" value="file" />
            <param name="subject" value="rhodopsin_proteins.fasta" ftype="fasta" />
            <param name="database" value="" />
            <param name="evalue_cutoff" value="1e-8" />
            <param name="blast_type" value="blastp" />
            <param name="out_format" value="6" />
            <param name="adv_opts_selector" value="advanced" />
            <param name="filter_query" value="False" />
            <param name="matrix" value="BLOSUM62" />
            <param name="max_hits" value="0" />
            <param name="word_size" value="0" />
            <param name="parse_deflines" value="True" />
            <output name="output1" file="blastp_four_human_vs_rhodopsin.tabular" ftype="tabular" />
        </test>
        <!-- Same search with the extended 24 column tabular output. -->
        <test>
            <param name="query" value="four_human_proteins.fasta" ftype="fasta" />
            <param name="db_opts_selector" value="file" />
            <param name="subject" value="rhodopsin_proteins.fasta" ftype="fasta" />
            <param name="database" value="" />
            <param name="evalue_cutoff" value="1e-8" />
            <param name="blast_type" value="blastp" />
            <param name="out_format" value="ext" />
            <param name="adv_opts_selector" value="advanced" />
            <param name="filter_query" value="False" />
            <param name="matrix" value="BLOSUM62" />
            <param name="max_hits" value="0" />
            <param name="word_size" value="0" />
            <param name="parse_deflines" value="True" />
            <output name="output1" file="blastp_four_human_vs_rhodopsin_ext.tabular" ftype="tabular" />
        </test>
        <!-- Query and subject swapped, advanced options hidden, standard tabular output. -->
        <test>
            <param name="query" value="rhodopsin_proteins.fasta" ftype="fasta" />
            <param name="db_opts_selector" value="file" />
            <param name="subject" value="four_human_proteins.fasta" ftype="fasta" />
            <param name="database" value="" />
            <param name="evalue_cutoff" value="1e-8" />
            <param name="blast_type" value="blastp" />
            <param name="out_format" value="6" />
            <param name="adv_opts_selector" value="basic" />
            <output name="output1" file="blastp_rhodopsin_vs_four_human.tabular" ftype="tabular" />
        </test>
    </tests>
    <help>

.. class:: warningmark

**Note**. Database searches may take a substantial amount of time.
For large input datasets it is advisable to allow overnight processing.

-----

**What it does**

Search a *protein database* using a *protein query*,
using the NCBI BLAST+ blastp command line tool.

-----

**Output format**

Because Galaxy focuses on processing tabular data, the default output of this
tool is tabular. The standard BLAST+ tabular output contains 12 columns:

====== ========= ============================================
Column NCBI name Description
------ --------- --------------------------------------------
     1 qseqid    Query Seq-id (ID of your sequence)
     2 sseqid    Subject Seq-id (ID of the database hit)
     3 pident    Percentage of identical matches
     4 length    Alignment length
     5 mismatch  Number of mismatches
     6 gapopen   Number of gap openings
     7 qstart    Start of alignment in query
     8 qend      End of alignment in query
     9 sstart    Start of alignment in subject (database hit)
    10 send      End of alignment in subject (database hit)
    11 evalue    Expectation value (E-value)
    12 bitscore  Bit score
====== ========= ============================================

The BLAST+ tools can optionally output additional columns of information,
but this takes longer to calculate. Most (but not all) of these columns are
included by selecting the extended tabular output. The extra columns are
included *after* the standard 12 columns. This is so that you can write
workflow filtering steps that accept either the 12 or 24 column tabular
BLAST output.

====== ============= ===========================================
Column NCBI name     Description
------ ------------- -------------------------------------------
    13 sallseqid     All subject Seq-id(s), separated by a ';'
    14 score         Raw score
    15 nident        Number of identical matches
    16 positive      Number of positive-scoring matches
    17 gaps          Total number of gaps
    18 ppos          Percentage of positive-scoring matches
    19 qframe        Query frame
    20 sframe        Subject frame
    21 qseq          Aligned part of query sequence
    22 sseq          Aligned part of subject sequence
    23 qlen          Query sequence length
    24 slen          Subject sequence length
====== ============= ===========================================

The third option is BLAST XML output, which is designed to be parsed by
another program, and is understood by some Galaxy tools.

You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program).
The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website.
The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query.
The two query anchored outputs show a multiple sequence alignment between the query and all the matches,
and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences).

-------

**References**

Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402.

Schaffer et al. Improving the accuracy of PSI-BLAST protein database searches with composition-based statistics and other refinements. 2001. Nucleic Acids Res. 29:2994-3005.

    </help>
</tool>