PageRenderTime 51ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/code_lic/phylogeny.pl

http://research-code-base-animesh.googlecode.com/
Perl | 263 lines | 225 code | 13 blank | 25 comment | 24 complexity | 578b4fa9c372b1bd976e45540ca602d8 MD5 | raw file
  1. # This program is free software: you can redistribute it and/or modify
  2. # it under the terms of the GNU General Public License as published by
  3. # the Free Software Foundation, either version 3 of the License, or
  4. # (at your option) any later version.
  5. #
  6. # This program is distributed in the hope that it will be useful,
  7. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  8. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  9. # GNU General Public License for more details.
  10. #
  11. # You should have received a copy of the GNU General Public License
  12. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  13. #
  14. # Code base of Animesh Sharma [ sharma.animesh@gmail.com ]
  15. #!/usr/bin/perl
  16. print "Name of the file containing multiple sequences in FASTA format : \n";
  17. $file=<STDIN>;
  18. chomp $file; $least=10000;
  19. $seq="";
  20. open (F,"testseq")||die "cant open :$!";
  21. while ($line = <F>)
  22. {
  23. chomp ($line);
  24. if ($line =~ /^>/)
  25. {
  26. $line =~ s/>//;
  27. push(@seqname,$line);
  28. if ($seq ne "")
  29. {
  30. push(@seq,$seq);
  31. $seq = "";
  32. }
  33. }
  34. else
  35. {
  36. $seq=$seq.$line;
  37. }
  38. }
  39. push(@seq,$seq);
  40. unshift ( @seq,0);
  41. $num=0;
  42. for ($i=1;$i<=$#seq;$i++)
  43. {
  44. for($k=1;$k<=$#seq;$k++)
  45. {
  46. if ($i==$k)
  47. {
  48. $distij[$i][$k]=10000;
  49. print "\t$distij[$i][$k]";
  50. }
  51. else
  52. {
  53. $s=$seq[$i];
  54. $t=$seq[$k];
  55. alignment($s,$t);
  56. $dist=$counter/(length($foutputmat));
  57. if ($dist<$least)
  58. {
  59. $least=$dist;
  60. $leastrow=$i;
  61. $leastcol=$k;
  62. }
  63. $distij[$i][$k]=$dist;
  64. print "\t$distij[$i][$k]";
  65. }
  66. }
  67. print "\n";
  68. }
  69. $least[$num]=$least;
  70. $leastrow[$num]=$leastrow;
  71. $leastcol[$num]=$leastcol;
  72. print " \n$least[$num], row=$leastrow[$num], col=$leastcol[$num]\n\n";
  73. # to begin clustering
  74. $count=$#seq-1;
  75. #print "total seq=$#seq\n";
  76. while ( $count!=1)
  77. { @copydistij=@distij;
  78. $least=10000; $dist=$least;
  79. for ($i=1;$i<=$#seq;$i++)
  80. { # print "i=$i ";
  81. for($k=1;$k<=$#seq;$k++)
  82. {
  83. # print "\tk=$k ";
  84. if ($i==$leastrow[$num])
  85. {
  86. if ($k==$leastcol[$num] or $i==$k)
  87. {
  88. $distij[$i][$k]=10000;
  89. print "\t$distij[$i][$k]";
  90. }
  91. else
  92. {
  93. $dist=(($copydistij[$i][$k])+($copydistij[$leastcol[$num]][$k]))/2;
  94. $distij[$i][$k]=$dist;
  95. print "\t$distij[$i][$k]";
  96. }
  97. }
  98. elsif ($i==$leastcol[$num])
  99. {
  100. $distij[$i][$k]=10000;
  101. print "\t$distij[$i][$k]";
  102. }
  103. elsif ($k==$leastcol[$num] )
  104. { print "c";
  105. $dist=(($copydistij[$i][$k])+($copydistij[$i][$leastrow[$num]]))/2;
  106. $distij[$i][$k]=$dist;
  107. print "\t$distij[$i][$k]";
  108. }
  109. elsif ($k==$leastrow[$num])
  110. { print "r";
  111. $dist=(($copydistij[$i][$k])+($copydistij[$i][$leastcol[$num]]))/2;
  112. $distij[$i][$k]=$dist;
  113. print "\t$distij[$i][$k]";
  114. }
  115. else
  116. { $dist=$distij[$i][$k];
  117. print "\t$distij[$i][$k]";
  118. }
  119. if ($dist<$least)
  120. { print "le";
  121. $least=$dist;
  122. $leastrow=$i;
  123. $leastcol=$k;
  124. }
  125. }
  126. print "\n";}
  127. $num++;
  128. $least[$num]=$least;
  129. $leastrow[$num]=$leastrow;
  130. $leastcol[$num]=$leastcol;
  131. $count--;
  132. print " \n$least[$num], row=$leastrow[$num], col=$leastcol[$num]\n\n";
  133. }
  134. #print the least pairs
  135. @leastrow=reverse(@leastrow);
  136. @leastcol=reverse(@leastcol);
  137. $j=0;
  138. foreach $r(@leastrow)
  139. {
  140. print " \nleast pair = $r ,$leastcol[$j]";
  141. $j++;
  142. }
  143. sub alignment {
  144. $gap=-2;
  145. $match=2;
  146. $counter=0; @sequencecol=();@sequencerow=(); @point1=();@fscore1=(); @mysymbol=();
  147. $point;
  148. $foutputmat="";
  149. $foutputrow="";
  150. $foutputcol="";
  151. $outputrow="";
  152. $outputcol="";
  153. $outputmat="";
  154. $sequence=shift;
  155. @sequencerow=split(//,$sequence); #split each element and store in array
  156. unshift(@sequencerow,0); #add 0 as first element
  157. $sequence1=shift;
  158. @sequencecol=split(//,$sequence1);
  159. unshift(@sequencecol,0);
  160. for ($row=0;$row<=$#sequencerow;$row++) {
  161. for ($column=0;$column<=$#sequencecol;$column++){
  162. if ($row==0 )
  163. {
  164. $fscore1[$row][$column]=$gap*$column;
  165. $point1[$row][$column]="h";
  166. #print "\t $fscore1[$row][$column] ";
  167. }
  168. else{
  169. if ($row>0 and $column==0)
  170. {
  171. $fscore1[$row][$column]=$gap*$row;
  172. $point1[$row][$column]="v";
  173. # print "\t $fscore1[$row][$column] ";
  174. }
  175. else #calculating values for the matching score for i,j
  176. {
  177. if (($sequencecol[$column]) eq ($sequencerow[$row]))
  178. {
  179. $score=$match;
  180. $mysymbol[$row][$column]="|";
  181. }
  182. else
  183. {
  184. $score=0;
  185. $mysymbol[$row][$column]="X";
  186. }
  187. $scoreij=$score+$fscore1[$row-1][$column-1]; #calculating the final score for each cell and storing the pointer
  188. if ($scoreij>(($fscore1[$row-1][$column])+$gap))
  189. {
  190. $fscore=$scoreij;
  191. $point='d';
  192. }
  193. else
  194. {
  195. $fscore=($fscore1[$row-1][$column])+$gap;
  196. $point="v";
  197. }
  198. if ($fscore>(($fscore1[$row][$column-1])+$gap))
  199. {
  200. $fscore1[$row][$column]=$fscore;
  201. $point1[$row][$column]=$point;
  202. }
  203. else
  204. {
  205. $fscore1[$row][$column]=($fscore1[$row][$column- 1])+$gap;
  206. $point1[$row][$column]="h";
  207. }
  208. # print "\t $fscore1[$row][$column] ";
  209. #print "$point1[$row][$column] ";
  210. }
  211. }
  212. }
  213. # print "\n";
  214. }
  215. # TRACEBACK
  216. $row--,$column--;
  217. $scorealignment=$fscore1[$row][$column];
  218. while ($row!=0 or $column!=0)
  219. {
  220. if ($point1[$row][$column] eq "d")
  221. {
  222. $outputrow=$outputrow.$sequencerow[$row];
  223. $outputcol=$outputcol.$sequencecol[$column];
  224. $outputmat=$outputmat.$mysymbol[$row][$column];
  225. $row--;$column--;
  226. }
  227. else
  228. {
  229. if ( $point1[$row][$column] eq "h")
  230. {
  231. $outputrow=$outputrow."-";
  232. $outputcol=$outputcol.$sequencecol[$column];
  233. $outputmat=$outputmat."X";
  234. $column--;
  235. }
  236. else
  237. {
  238. $outputrow=$outputrow.$sequencerow[$row];
  239. $outputcol=$outputcol."-";
  240. $outputmat=$outputmat."X";
  241. $row--;
  242. }
  243. }
  244. }
  245. $foutputrow=uc(reverse($outputrow));
  246. $foutputcol=uc(reverse($outputcol));
  247. $foutputmat=reverse($outputmat);
  248. $counter= $foutputmat =~ s/X/X/g;
  249. return $counter;
  250. }