PageRenderTime 22ms CodeModel.GetById 9ms app.highlight 4ms RepoModel.GetById 2ms app.codeStats 0ms

/tools/indels/indel_analysis.xml

https://bitbucket.org/cistrome/cistrome-harvard/
XML | 167 lines | 146 code | 21 blank | 0 comment | 0 complexity | ea1ca12e1e6ee0dcfa81669074831cda MD5 | raw file
  1<tool id="indel_analysis" name="Indel Analysis" version="1.0.0">
  2  <description></description>
  3  <command interpreter="python">
  4    indel_analysis.py
  5      --input=$input1
  6      --threshold=$threshold
  7      --out_ins=$out_ins
  8      --out_del=$out_del
  9  </command>
 10  <inputs>
 11    <param format="sam" name="input1" type="data" label="Select sam file to analyze" />
 12    <param name="threshold" type="float" value="0.015" size="5" label="Frequency threshold" help="Cutoff" />
 13  </inputs>
 14  <outputs>
 15    <data format="interval" name="out_del" />
 16    <data format="interval" name="out_ins" />
 17  </outputs>
 18  <tests>
 19    <test>
 20      <param name="input1" value="indel_analysis_in1.sam" ftype="sam"/>
 21      <param name="threshold" value="0.017"/>
 22      <output name="out_del" file="indel_analysis_out1.interval" ftype="interval"/>
 23      <output name="out_ins" file="indel_analysis_out2.interval" ftype="interval"/>
 24    </test>
 25    <test>
 26      <param name="input1" value="indel_analysis_in2.sam" ftype="sam"/>
 27      <param name="threshold" value="0.09"/>
 28      <output name="out_del" file="indel_analysis_out3.interval" ftype="interval"/>
 29      <output name="out_ins" file="indel_analysis_out4.interval" ftype="interval"/>
 30    </test>
 31  </tests>
 32  <help>
 33
 34**What it does**
 35
 36Given an input sam file, this tool provides analysis of the indels. It filters out matches that do not meet the frequency threshold. The way this frequency of occurrence is calculated is different for deletions and insertions. The CIGAR string's "M" can indicate an exact match or a mismatch. For a SAM file containing the following bits of information (assuming the reference "ACTGCTCGAT")::
 37
 38 CHROM  POS   CIGAR  SEQ
 39   ref    3  2M1I3M  TACTTC
 40   ref    1  2M1D3M  ACGCT
 41   ref    4  4M2I3M  GTTCAAGAT
 42   ref    2  2M2D3M  CTCCG
 43   ref    1  3M1D4M  AACCTGG
 44   ref    6  3M1I2M  TTCAAT
 45   ref    5  3M1I3M  CTCTGTT
 46   ref    7      4M  CTAT
 47   ref    5      5M  CGCTA
 48   ref    3  2M1D2M  TGCC
 49
 50The following totals would be calculated (this is an intermediate step and not output)::
 51
 52 -------------------------------------------------------------------------------------------------------
 53  POS  BASE  NUMREADS  DELPROPCALC  DELPROP  INSPROPSTARTCALC  INSSTARTPROP  INSPROPENDCALC  INSENDPROP
 54 -------------------------------------------------------------------------------------------------------
 55    1     A         2          2/2     1.00               ---           ---             ---         ---
 56    2     A         1          1/3     0.33               ---           ---             ---         ---
 57          C         2          2/3     0.67               ---           ---             ---         ---
 58    3     C         1          1/5     0.20               ---           ---             ---         ---
 59          T         3          3/5     0.60               ---           ---             ---         ---
 60          -         1          1/5     0.20               ---           ---             ---         ---
 61    4     A         1          1/6     0.17               ---           ---             ---         ---
 62          G         3          3/6     0.50               ---           ---             ---         ---
 63          -         1          1/6     0.17               ---           ---             ---         ---
 64         --         1          1/6     0.17               ---           ---             ---         ---
 65    5     C         4          4/7     0.57               ---           ---             ---         ---
 66          T         2          2/7     0.29               ---           ---             ---         ---
 67          -         1          1/7     0.14               ---           ---             ---         ---
 68         +C         1          ---      ---               1/7          0.14             1/9        0.11
 69    6     C         2          2/9     0.22               ---           ---             ---         ---
 70          G         1          1/9     0.11               ---           ---             ---         ---
 71          T         6          6/9     0.67               ---           ---             ---         ---
 72    7     C         7          7/9     0.78               ---           ---             ---         ---
 73          G         1          1/9     0.11               ---           ---             ---         ---
 74          T         1          1/9     0.11               ---           ---             ---         ---
 75    8     C         1          1/7     0.14               ---           ---             ---         ---
 76          G         4          4/7     0.57               ---           ---             ---         ---
 77          T         2          2/7     0.29               ---           ---             ---         ---
 78         +T         1          ---      ---               1/8          0.13             1/6        0.17
 79        +AA         1          ---      ---               1/8          0.13             1/6        0.17
 80    9     A         4          4/5     0.80               ---           ---             ---         ---
 81          T         1          1/5     0.20               ---           ---             ---         ---
 82         +A         1          ---      ---               1/6          0.17             1/5        0.20
 83   10     T         4          4/4     1.00               ---           ---             ---         ---
 84
 85The general idea for calculating these is that we want to find out the proportion of times a particular event occurred at a position among all reads that touch that base in some way. First, the basic total number of reads at a given position is the number of reads with each particular base plus the number of reads with a deletion at that given position (including the bases that are "mismatches"). Note that deletions of two bases and one base would be counted completely separately. Insertions are not counted in this total. For position 4 above, the reference base is G, and there are 3 occurrences of it along with one mismatching base, A. Also, there is a 1-base deletion and another 2-base deletion. So there are a total of 6 matches/mismatches/deletions, and the proportions for each base are 1/6 = 0.17 (A) and 3/6 = 0.50 (G), and for each deletion it is 1/6 = 0.17.
 86
 87Insertions are slightly more complicated. We actually want to get the frequency of occurrence for both the associated start and end positions, since an insertion appears between those two bases. Each insertion is regarded individually, and the total number of occurrences of that insertion is divided by the sum of the number of its occurrences and the basic total for either the start or end. So for the insertions at position 8, there are a total of 7 matches/mismatches/deletions at position 8, and two insertions that each occur once, so each has an INSSTARTPROP of 1/8 = 0.13. For the end position there are 5 matches/mismatches/deletions, so the INSENDPROP is 1/6 = 0.17 for both insertions (T and AA).
 88
 89These proportions (DELPROP and either INSSTARTPROP or INSENDPROP) need to be greater than the threshold frequency specified by the user in order for that base, deletion or insertion to be included in the output.
 90
 91
 92**Output format**
 93
 94The output varies for deletions and insertions, although for both, the first three columns are chromosome, start position, and end position.
 95
 96Columns in the deletions file::
 97
 98                        Column  Description
 99 -----------------------------  ---------------------------------------------------------------------------------------------------
100  1                      Chrom  Chromosome
101  2                      Start  Starting position
102  3                        End  Ending position
103  4                   Coverage  Number of reads containing this exact deletion
104  5       Frequency Percentage  Frequency of this exact deletion (2 and 1 are mutually exclusive, for instance), as percentage (%)
105
106Columns in the insertions file::
107
108                   Column  Description
109 ------------------------  -----------------------------------------------------------------------------------------------------------------
110  1                 Chrom  Chromosome
111  2                 Start  Starting position
112  3                   End  Ending position (always Start + 1 for insertions)
113  4      Inserted Base(s)  The exact base(s) inserted at Start position
114  5              Coverage  Number of reads containing this exact insertion
115  6  Freq. Perc. at Start  Frequency of this exact insertion given Start position ("GG" and "G" are considered distinct), as percentage (%)
116  7    Freq. Perc. at End  Frequency of this exact insertion given End position ("GG" and "G" are considered distinct), as percentage (%)
117
118Before using this tool, you may want to use the Filter SAM for indels tool to filter out indels on bases with insufficient quality scores, but this is not required.
119
120
121-----
122
123**Example**
124
125If you set the threshold to 0.0 and have the following SAM file::
126
127 r327     16   chrM   11   37      8M1D10M   *   0   0             CTTACCAGATAGTCATCA   -+&lt;2;?@BA@?-,.+4=4             XT:A:U  NM:i:1  X0:i:1  X1:i:0  XM:i:0  XO:i:1  XG:i:1  MD:Z:41^C35
128 r457      0   chr1   14   37          14M   *   0   0                 ACCTGACAGATATC   =/DF;?@1A@?-,.                 XT:A:U  NM:i:0  X0:i:1  X1:i:0  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
129 r501     16   chrM    6   23      7M1I13M   *   0   0          TCTGTGCCTACCAGACATTCA   +=$2;?@BA@?-,.+4=4=4A          XT:A:U  NM:i:3  X0:i:1  X1:i:1  XM:i:2  XO:i:1  XG:i:1  MD:Z:28C36G9        XA:Z:chrM,+134263658,14M1I61M,4;
130 r1288    16   chrM    8   37      11M1I7M   *   0   0            TCACTTACCTGTACACACA   /*F2;?@%A@?-,.+4=4=            XT:A:U  NM:i:4  X0:i:1  X1:i:0  XM:i:3  XO:i:1  XG:i:1  MD:Z:2T0T1A69
131 r1902     0   chr1    4   37      7M2D18M   *   0   0        AGTCTCTTACCTGACGGTTATGA   &lt;2;?@BA@?-,.+4=4=4AA663        XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:17^CA58A0
132 r2204    16   chrM    9    0          19M   *   0   0            CTGGTACCTGACAGGTATC   2;?@BA@?-,.+4=4=4AA            XT:A:R  NM:i:1  X0:i:2  X1:i:0  XM:i:1  XO:i:0  XG:i:0  MD:Z:0T75           XA:Z:chrM,-564927,76M,1;
133 r2314    16   chrM    6   37      10M2D8M   *   0   0               TCACTCTTACGTCTGA   &lt;2;?@BA@?-,.+4=4               XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:25A5^CA45
134 r3001     0   chrM   13   37   3M1D5M2I7M   *   0   0              TACAGTCACCCTCATCA   &lt;2;?@BA/(@?-,$&amp;                XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:17^CA58A0
135 r3218     0   chr1   13   37       8M1D7M   *   0   0                TACAGTCACTCATCA   &lt;2;?@BA/(@?-,$&amp;                XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:17^CA58A0
136 r4767    16   chr2    3   37      15M2I7M   *   0   0       CAGACTCTCTTACCAAAGACAGAC   &lt;2;?@BA/(@?-,.+4=4=4AA66       XT:A:U  NM:i:4  X0:i:1  X1:i:0  XM:i:3  XO:i:1  XG:i:1  MD:Z:2T1A4T65
137 r5333     0   chrM    5   37      17M1D8M   *   0   0       GTCTCTCATACCAGACAACGGCAT   FB3$@BA/(@?-,.+4=4=4AA66       XT:A:U  NM:i:4  X0:i:1  X1:i:0  XM:i:3  XO:i:1  XG:i:1  MD:Z:45C10^C0C5C13
138 r6690    16   chrM    7   23          20M   *   0   0           CTCTCTTACCAGACAGACAT   2;?@BA/(@?-,.+4=4=4A           XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76             XA:Z:chrM,-568532,76M,1;
139 r7211     0   chrM    7   37          24M   *   0   0       CGACAGAGACAAAATAACATTTAA   //&lt;2;?@BA@?-,.+4=442;;6:       XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:2  XO:i:1  XG:i:1  MD:Z:73G0G0
140 r9922    16   chrM    4    0       7M3I9M   *   0   0            CCAGACATTTGAAATCAGG   F/D4=44^D++26632;;6            XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
141 r9987    16   chrM    4    0      9M1I18M   *   0   0   AGGTTCTCATTACCTGACACTCATCTTG   G/AD6"/+4=4426632;;6:&lt;2;?@BA   XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
142 r10145   16   chr1   16    0       5M2D7M   *   0   0                   CACATTGTTGTA   G//+4=44=4AA                   XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
143 r10324   16   chrM   15    0       6M1D5M   *   0   0                   CCGTTCTACTTG   A@??8.G//+4=                   XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
144 r12331   16   chrM   17    0       4M2I6M   *   0   0                  AGTCGAATACGTG   632;;6:&lt;2;?@B                  XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
145 r12914   16   chr2   24    0       4M3I3M   *   0   0                     ACTACCCCAA   G//+4=42,.                     XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
146
147The following will be produced (deletions file followed by insertions file)::
148
149 chr1   11   13   1   100.00
150 chr1   21   22   1    25.00
151 chr1   21   23   1    25.00
152 chrM   16   18   1     9.09
153 chrM   19   20   1     8.33
154 chrM   21   22   1     9.09
155 chrM   22   23   1     9.09
156
157 chr2   18   19    AA   1   50.00   50.00
158 chr2   28   29   CCC   1   50.00   50.00
159 chrM   11   12   TTT   1    9.09    9.09
160 chrM   13   14     C   1    9.09    9.09
161 chrM   13   14     T   1    9.09    9.09
162 chrM   19   20     T   1    7.69    8.33
163 chrM   21   22    GA   1    8.33    8.33
164
165
166  </help>
167</tool>