PageRenderTime 70ms CodeModel.GetById 30ms RepoModel.GetById 0ms app.codeStats 0ms

/tools/regVariation/parseMAF_smallIndels.pl

https://bitbucket.org/chapmanb/galaxy-central/
Perl | 698 lines | 552 code | 73 blank | 73 comment | 101 complexity | 52a8c0d056cf5c5467079a7707586383 MD5 | raw file
  1. #!/usr/bin/perl -w
  2. # a program to get indels
  3. # input is a MAF format 3-way alignment file
  4. # from 3-way blocks only at this time
  5. # translate seq2, seq3, etc coordinates to + if align orient is reverse complement
  6. use strict;
  7. use warnings;
  8. # declare and initialize variables
  9. my $fh; # variable to store filehandle
  10. my $record;
  11. my $offset;
  12. my $library = $ARGV[0];
  13. my $count = 0;
  14. my $count2 = 0;
  15. my $count3 = 0;
  16. my $count4 = 0;
  17. my $start1 = my $start2 = my $start3 = my $start4 = my $start5 = my $start6 = 0;
  18. my $orient = "";
  19. my $outgroup = $ARGV[2];
  20. my $ingroup1 = my $ingroup2 = "";
  21. my $count_seq1insert = my $count_seq1delete = 0;
  22. my $count_seq2insert = my $count_seq2delete = 0;
  23. my $count_seq3insert = my $count_seq3delete = 0;
  24. my @seq1_insert_lengths = my @seq1_delete_lengths = ();
  25. my @seq2_insert_lengths = my @seq2_delete_lengths = ();
  26. my @seq3_insert_lengths = my @seq3_delete_lengths = ();
  27. my @seq1_insert = my @seq1_delete = my @seq2_insert = my @seq2_delete = my @seq3_insert = my @seq3_delete = ();
  28. my @seq1_insert_startOnly = my @seq1_delete_startOnly = my @seq2_insert_startOnly = my @seq2_delete_startOnly = ();
  29. my @seq3_insert_startOnly = my @seq3_delete_startOnly = ();
  30. my @indels = ();
  31. # check to make sure correct files
  32. my $usage = "usage: parseMAF_smallIndels.pl [MAF.in] [small_Indels_summary.out] [outgroup]\n";
  33. die $usage unless @ARGV == 3;
  34. # perform some standard subroutines
  35. $fh = open_file($library);
  36. $offset = tell($fh);
  37. #my $ofile = $ARGV[2];
  38. #unless (open(OFILE, ">$ofile")){
  39. # print "Cannot open output file \"$ofile\"\n\n";
  40. # exit;
  41. #}
  42. my $ofile2 = $ARGV[1];
  43. unless (open(OFILE2, ">$ofile2")){
  44. print "Cannot open output file \"$ofile2\"\n\n";
  45. exit;
  46. }
  47. # header line for output files
  48. #print OFILE "# small indel events, parsed from MAF 3-way alignment file, coords are translated from (-) to (+) if necessary\n";
  49. #print OFILE "#align\tingroup1\tingroup1_coord\tingroup1_orient\tingroup2\tingroup2_coord\tingroup2_orient\toutgroup\toutgroup_coord\toutgroup_orient\tindel_type\n";
  50. #print OFILE2 "# small indels summary, parsed from MAF 3-way alignment file, coords are translated from (-) to (+) if necessary\n";
  51. print OFILE2 "#block\tindel_type\tindel_length\tingroup1\tingroup1_start\tingroup1_end\tingroup1_alignSize\tingroup1_orient\tingroup2\tingroup2_start\tingroup2_end\tingroup2_alignSize\tingroup2_orient\toutgroup\toutgroup_start\toutgroup_end\toutgroup_alignSize\toutgroup_orient\n";
  52. # main body of program
  53. while ($record = get_next_record($fh) ){
  54. if ($record=~ m/\s*##maf(.*)\s*# maf/s){
  55. next;
  56. }
  57. my @sequences = get_sequences_within_block($record);
  58. my @seq_info = get_indels_within_block(@sequences);
  59. get_indels_lengths(@seq_info);
  60. $offset = tell($fh);
  61. $count++;
  62. }
  63. get_starts_only(@seq1_insert);
  64. get_starts_only(@seq1_delete);
  65. get_starts_only(@seq2_insert);
  66. get_starts_only(@seq2_delete);
  67. get_starts_only(@seq3_insert);
  68. get_starts_only(@seq3_delete);
  69. # print some things to keep track of progress
  70. #print "# $library\n";
  71. #print "# number of records = $count\n";
  72. #print "# number of sequence \"s\" lines = $count2\n";
  73. if ($count3 != 0){
  74. print "Skipped $count3 blocks with only 2 seqs;\n";
  75. }
  76. #print "# number of records with only h-m = $count4\n\n";
  77. print "Ingroup1 = $ingroup1; Ingroup2 = $ingroup2; Outgroup = $outgroup;\n";
  78. print "# of ingroup1 inserts = $count_seq1insert;\n";
  79. print "# of ingroup1 deletes = $count_seq1delete;\n";
  80. print "# of ingroup2 inserts = $count_seq2insert;\n";
  81. print "# of ingroup2 deletes = $count_seq2delete;\n";
  82. print "# of outgroup3 inserts = $count_seq3insert;\n";
  83. print "# of outgroup3 deletes = $count_seq3delete\n";
  84. #close OFILE;
  85. if ($count == $count3){
  86. print STDERR "Skipped all blocks since none of them contain 3-way alignments.\n";
  87. exit -1;
  88. }
  89. ###################SUBROUTINES#####################################
  90. # subroutine to open file
  91. sub open_file {
  92. my($filename) = @_;
  93. my $fh;
  94. unless (open($fh, $filename)){
  95. print "Cannot open file $filename\n";
  96. exit;
  97. }
  98. return $fh;
  99. }
  100. # get next record
  101. sub get_next_record {
  102. my ($fh) = @_;
  103. my ($offset);
  104. my ($record) = "";
  105. my ($save_input_separator) = $/;
  106. $/ = "a score";
  107. $record = <$fh>;
  108. $/ = $save_input_separator;
  109. return $record;
  110. }
  111. # get header info
  112. sub get_sequences_within_block{
  113. my (@alignment) = @_;
  114. my @lines = ();
  115. my @sequences = ();
  116. @lines = split ("\n", $record);
  117. foreach (@lines){
  118. chomp($_);
  119. if (m/^\s*$/){
  120. next;
  121. }
  122. elsif (m/^\s*=(\d+\.*\d*)/){
  123. }elsif (m/^\s*s(.*)$/){
  124. $count2++;
  125. push (@sequences,$_);
  126. }
  127. }
  128. return @sequences;
  129. }
  130. sub get_indels_within_block{
  131. my (@sequences) = @_;
  132. my $line1 = my $line2 = my $line3 = "";
  133. my @line1 = my @line2 = my @line3 = ();
  134. my $score = 0;
  135. my $start1 = my $align_length1 = my $end1 = my $seq_length1 = 0;
  136. my $start2 = my $align_length2 = my $end2 = my $seq_length2 = 0;
  137. my $start3 = my $align_length3 = my $end3 = my $seq_length3 = 0;
  138. my $seq1 = my $orient1 = "";
  139. my $seq2 = my $orient2 = "";
  140. my $seq3 = my $orient3 = "";
  141. my $start1_plus = my $end1_plus = 0;
  142. my $start2_plus = my $end2_plus = 0;
  143. my $start3_plus = my $end3_plus = 0;
  144. my @test = ();
  145. my $test = "";
  146. my $header = "";
  147. my @header = ();
  148. my $sequence1 = my $sequence2 = my $sequence3 ="";
  149. my @array_return = ();
  150. my $test1 = 0;
  151. my $line1_stat = my $line2_stat = my $line3_stat = "";
  152. # process 3-way blocks only
  153. if (scalar(@sequences) == 3){
  154. $line1 = $sequences[0];
  155. chomp ($line1);
  156. $line2 = $sequences[1];
  157. chomp ($line2);
  158. $line3 = $sequences[2];
  159. chomp ($line3);
  160. # check order of sequences and assign uniformly seq1= human, seq2 = chimp, seq3 = macaque
  161. if ($line1 =~ m/$outgroup/){
  162. $line1_stat = "out";
  163. $line2=~ s/^\s*//;
  164. $line2 =~ s/\s+/\t/g;
  165. @line2 = split(/\t/, $line2);
  166. if (($ingroup1 eq "") || ($line2[1] =~ m/$ingroup1/)){
  167. $line2_stat = "in1";
  168. $line3_stat = "in2";
  169. }
  170. else{
  171. $line3_stat = "in1";
  172. $line2_stat = "in2"; }
  173. }
  174. elsif ($line2 =~ m/$outgroup/){
  175. $line2_stat = "out";
  176. $line1=~ s/^\s*//;
  177. $line1 =~ s/\s+/\t/g;
  178. @line1 = split(/\t/, $line1);
  179. if (($ingroup1 eq "") || ($line1[1] =~ m/$ingroup1/)){
  180. $line1_stat = "in1";
  181. $line3_stat = "in2";
  182. }
  183. else{
  184. $line3_stat = "in1";
  185. $line1_stat = "in2"; }
  186. }
  187. elsif ($line3 =~ m/$outgroup/){
  188. $line3_stat = "out";
  189. $line1=~ s/^\s*//;
  190. $line1 =~ s/\s+/\t/g;
  191. @line1 = split(/\t/, $line1);
  192. if (($ingroup1 eq "") || ($line1[1] =~ m/$ingroup1/)){
  193. $line1_stat = "in1";
  194. $line2_stat = "in2";
  195. }
  196. else{
  197. $line2_stat = "in1";
  198. $line1_stat = "in2"; }
  199. }
  200. #print "# l1 = $line1_stat\n";
  201. #print "# l2 = $line2_stat\n";
  202. #print "# l3 = $line3_stat\n";
  203. my $linei1 = my $linei2 = my $lineo = "";
  204. my @linei1 = my @linei2 = my @lineo = ();
  205. if ($line1_stat eq "out"){
  206. $lineo = $line1;
  207. }
  208. elsif ($line1_stat eq "in1"){
  209. $linei1 = $line1;
  210. }
  211. else{
  212. $linei2 = $line1;
  213. }
  214. if ($line2_stat eq "out"){
  215. $lineo = $line2;
  216. }
  217. elsif ($line2_stat eq "in1"){
  218. $linei1 = $line2;
  219. }
  220. else{
  221. $linei2 = $line2;
  222. }
  223. if ($line3_stat eq "out"){
  224. $lineo = $line3;
  225. }
  226. elsif ($line3_stat eq "in1"){
  227. $linei1 = $line3;
  228. }
  229. else{
  230. $linei2 = $line3;
  231. }
  232. $linei1=~ s/^\s*//;
  233. $linei1 =~ s/\s+/\t/g;
  234. @linei1 = split(/\t/, $linei1);
  235. $end1 =($linei1[2]+$linei1[3]-1);
  236. $seq1 = $linei1[1].":".$linei1[3];
  237. $ingroup1 = (split(/\./, $seq1))[0];
  238. $start1 = $linei1[2];
  239. $align_length1 = $linei1[3];
  240. $orient1 = $linei1[4];
  241. $seq_length1 = $linei1[5];
  242. $sequence1 = $linei1[6];
  243. $test1 = length($sequence1);
  244. my $total_length1 = $test1+$start1;
  245. my @array1 = ($start1,$end1,$orient1,$seq_length1);
  246. ($start1_plus, $end1_plus) = convert_coords(@array1);
  247. $linei2=~ s/^\s*//;
  248. $linei2 =~ s/\s+/\t/g;
  249. @linei2 = split(/\t/, $linei2);
  250. $end2 =($linei2[2]+$linei2[3]-1);
  251. $seq2 = $linei2[1].":".$linei2[3];
  252. $ingroup2 = (split(/\./, $seq2))[0];
  253. $start2 = $linei2[2];
  254. $align_length2 = $linei2[3];
  255. $orient2 = $linei2[4];
  256. $seq_length2 = $linei2[5];
  257. $sequence2 = $linei2[6];
  258. my $test2 = length($sequence2);
  259. my $total_length2 = $test2+$start2;
  260. my @array2 = ($start2,$end2,$orient2,$seq_length2);
  261. ($start2_plus, $end2_plus) = convert_coords(@array2);
  262. $lineo=~ s/^\s*//;
  263. $lineo =~ s/\s+/\t/g;
  264. @lineo = split(/\t/, $lineo);
  265. $end3 =($lineo[2]+$lineo[3]-1);
  266. $seq3 = $lineo[1].":".$lineo[3];
  267. $start3 = $lineo[2];
  268. $align_length3 = $lineo[3];
  269. $orient3 = $lineo[4];
  270. $seq_length3 = $lineo[5];
  271. $sequence3 = $lineo[6];
  272. my $test3 = length($sequence3);
  273. my $total_length3 = $test3+$start3;
  274. my @array3 = ($start3,$end3,$orient3,$seq_length3);
  275. ($start3_plus, $end3_plus) = convert_coords(@array3);
  276. #print "# l1 = $ingroup1\n";
  277. #print "# l2 = $ingroup2\n";
  278. #print "# l3 = $outgroup\n";
  279. my $ABC = "";
  280. my $coord1 = my $coord2 = my $coord3 = 0;
  281. $coord1 = $start1_plus;
  282. $coord2 = $start2_plus;
  283. $coord3 = $start3_plus;
  284. for (my $position = 0; $position < $test1; $position++) {
  285. my $indelType = "";
  286. my $indel_line = "";
  287. # seq1 deletes
  288. if ((substr($sequence1,$position,1) eq "-")
  289. && (substr($sequence2,$position,1) !~ m/[-*\#$?^@]/)
  290. && (substr($sequence3,$position,1) !~ m/[-*\#$?^@]/)){
  291. $ABC = join("",($ABC,"X"));
  292. my @s = split(/:/, $seq1);
  293. $indelType = $s[0]."_delete";
  294. #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n";
  295. $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType));
  296. push (@indels,$indel_line);
  297. push (@seq1_delete,$indel_line);
  298. $coord2++; $coord3++;
  299. }
  300. # seq2 deletes
  301. elsif ((substr($sequence1,$position,1) !~ m/[-*\#$?^@]/)
  302. && (substr($sequence2,$position,1) eq "-")
  303. && (substr($sequence3,$position,1) !~ m/[-*\$?^]/)){
  304. $ABC = join("",($ABC,"Y"));
  305. my @s = split(/:/, $seq2);
  306. $indelType = $s[0]."_delete";
  307. #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n";
  308. $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType));
  309. push (@indels,$indel_line);
  310. push (@seq2_delete,$indel_line);
  311. $coord1++;
  312. $coord3++;
  313. }
  314. # seq1 inserts
  315. elsif ((substr($sequence1,$position,1) !~ m/[-*\#$?^@]/)
  316. && (substr($sequence2,$position,1) eq "-")
  317. && (substr($sequence3,$position,1) eq "-")){
  318. $ABC = join("",($ABC,"Z"));
  319. my @s = split(/:/, $seq1);
  320. $indelType = $s[0]."_insert";
  321. #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n";
  322. $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType));
  323. push (@indels,$indel_line);
  324. push (@seq1_insert,$indel_line);
  325. $coord1++;
  326. }
  327. # seq2 inserts
  328. elsif ((substr($sequence1,$position,1) eq "-")
  329. && (substr($sequence2,$position,1) !~ m/[-*\#$?^@]/)
  330. && (substr($sequence3,$position,1) eq "-")){
  331. $ABC = join("",($ABC,"W"));
  332. my @s = split(/:/, $seq2);
  333. $indelType = $s[0]."_insert";
  334. #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n";
  335. $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType));
  336. push (@indels,$indel_line);
  337. push (@seq2_insert,$indel_line);
  338. $coord2++;
  339. }
  340. # seq3 deletes
  341. elsif ((substr($sequence1,$position,1) !~ m/[-*\#$?^@]/)
  342. && (substr($sequence2,$position,1) !~ m/[-*\#$?^@]/)
  343. && (substr($sequence3,$position,1) eq "-")){
  344. $ABC = join("",($ABC,"S"));
  345. my @s = split(/:/, $seq3);
  346. $indelType = $s[0]."_delete";
  347. #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n";
  348. $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType));
  349. push (@indels,$indel_line);
  350. push (@seq3_delete,$indel_line);
  351. $coord1++; $coord2++;
  352. }
  353. # seq3 inserts
  354. elsif ((substr($sequence1,$position,1) eq "-")
  355. && (substr($sequence2,$position,1) eq "-")
  356. && (substr($sequence3,$position,1) !~ m/[-*\#$?^@]/)){
  357. $ABC = join("",($ABC,"T"));
  358. my @s = split(/:/, $seq3);
  359. $indelType = $s[0]."_insert";
  360. #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n";
  361. $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType));
  362. push (@indels,$indel_line);
  363. push (@seq3_insert,$indel_line);
  364. $coord3++;
  365. }else{
  366. $ABC = join("",($ABC,"N"));
  367. $coord1++; $coord2++; $coord3++;
  368. }
  369. }
  370. @array_return=($seq1,$seq2,$seq3,$ABC);
  371. return (@array_return);
  372. }
  373. # ignore pairwise cases for now, just count the number of blocks
  374. elsif (scalar(@sequences) == 2){
  375. my $ABC = "";
  376. my $coord1 = my $coord2 = my $coord3 = 0;
  377. $count3++;
  378. $line1 = $sequences[0];
  379. $line2 = $sequences[1];
  380. chomp ($line1);
  381. chomp ($line2);
  382. if ($line2 !~ m/$ingroup2/){
  383. $count4++;
  384. }
  385. }
  386. }
  387. sub get_indels_lengths{
  388. my (@array) = @_;
  389. if (scalar(@array) == 4){
  390. my $seq1 = $array[0]; my $seq2 = $array[1]; my $seq3 = $array[2]; my $ABC = $array[3];
  391. while ($ABC =~ m/(X+)/g) {
  392. push (@seq1_delete_lengths,length($1));
  393. $count_seq1delete++;
  394. }
  395. while ($ABC =~ m/(Y+)/g) {
  396. push (@seq2_delete_lengths,length($1));
  397. $count_seq2delete++;
  398. }
  399. while ($ABC =~ m/(S+)/g) {
  400. push (@seq3_delete_lengths,length($1));
  401. $count_seq3delete++;
  402. }
  403. while ($ABC =~ m/(Z+)/g) {
  404. push (@seq1_insert_lengths,length($1));
  405. $count_seq1insert++;
  406. }
  407. while ($ABC =~ m/(W+)/g) {
  408. push(@seq2_insert_lengths,length($1));
  409. $count_seq2insert++;
  410. }
  411. while ($ABC =~ m/(T+)/g) {
  412. push (@seq3_insert_lengths,length($1));
  413. $count_seq3insert++;
  414. }
  415. }
  416. elsif (scalar(@array) == 0){
  417. next;
  418. }
  419. }
  420. # convert to coordinates to + strand if align orient = -
  421. sub convert_coords{
  422. my (@array) = @_;
  423. my $s = $array[0];
  424. my $e = $array[1];
  425. my $o = $array[2];
  426. my $l = $array[3];
  427. my $start_plus = my $end_plus = 0;
  428. if ($o eq "-"){
  429. $start_plus = ($l - $e);
  430. $end_plus = ($l - $s);
  431. }elsif ($o eq "+"){
  432. $start_plus = $s;
  433. $end_plus = $e;
  434. }
  435. return ($start_plus, $end_plus);
  436. }
  437. # get first line only for each event
  438. sub get_starts_only{
  439. my (@test) = @_;
  440. my $seq1 = my $seq2 = my $seq3 = my $indelType = my $old_seq1 = my $old_seq2 = my $old_seq3 = my $old_indelType = my $old_line = "";
  441. my $coord1 = my $coord2 = my $coord3 = my $old_coord1 = my $old_coord2 = my $old_coord3 = 0;
  442. my @matches = ();
  443. my @seq1_insert = my @seq1_delete = my @seq2_insert = my @seq2_delete = my @seq3_insert = my @seq3_delete = ();
  444. foreach my $line (@test){
  445. chomp($line);
  446. $line =~ s/^\s*//;
  447. $line =~ s/\s+/\t/g;
  448. my @line1 = split(/\t/, $line);
  449. $seq1 = $line1[1];
  450. $coord1 = $line1[2];
  451. $seq2 = $line1[4];
  452. $coord2 = $line1[5];
  453. $seq3 = $line1[7];
  454. $coord3 = $line1[8];
  455. $indelType = $line1[10];
  456. if ($indelType =~ m/$ingroup1/ && $indelType =~ m/insert/){
  457. if ($coord1 != $old_coord1+1 || ($coord2 != $old_coord2 || $coord3 != $old_coord3)){
  458. $start1++;
  459. push (@seq1_insert_startOnly,$line);
  460. }
  461. }
  462. elsif ($indelType =~ m/$ingroup1/ && $indelType =~ m/delete/){
  463. if ($coord1 != $old_coord1 || ($coord2 != $old_coord2+1 || $coord3 != $old_coord3+1)){
  464. $start2++;
  465. push(@seq1_delete_startOnly,$line);
  466. }
  467. }
  468. elsif ($indelType =~ m/$ingroup2/ && $indelType =~ m/insert/){
  469. if ($coord2 != $old_coord2+1 || ($coord1 != $old_coord1 || $coord3 != $old_coord3)){
  470. $start3++;
  471. push(@seq2_insert_startOnly,$line);
  472. }
  473. }
  474. elsif ($indelType =~ m/$ingroup2/ && $indelType =~ m/delete/){
  475. if ($coord2 != $old_coord2 || ($coord1 != $old_coord1+1 || $coord3 != $old_coord3+1)){
  476. $start4++;
  477. push(@seq2_delete_startOnly,$line);
  478. }
  479. }
  480. elsif ($indelType =~ m/$outgroup/ && $indelType =~ m/insert/){
  481. if ($coord3 != $old_coord3+1 || ($coord1 != $old_coord1 || $coord2 != $old_coord2)){
  482. $start5++;
  483. push(@seq3_insert_startOnly,$line);
  484. }
  485. }
  486. elsif ($indelType =~ m/$outgroup/ && $indelType =~ m/delete/){
  487. if ($coord3 != $old_coord3 || ($coord1 != $old_coord1+1 ||$coord2 != $old_coord2+1)){
  488. $start6++;
  489. push(@seq3_delete_startOnly,$line);
  490. }
  491. }
  492. $old_indelType = $indelType;
  493. $old_seq1 = $seq1;
  494. $old_coord1 = $coord1;
  495. $old_seq2 = $seq2;
  496. $old_coord2 = $coord2;
  497. $old_seq3 = $seq3;
  498. $old_coord3 = $coord3;
  499. $old_line = $line;
  500. }
  501. }
  502. # append lengths to each event start line
  503. my $counter1; my $counter2; my $counter3; my $counter4; my $counter5; my $counter6;
  504. my @final1 = my @final2 = my @final3 = my @final4 = my @final5 = my @final6 = ();
  505. my $final_line1 = my $final_line2 = my $final_line3 = my $final_line4 = my $final_line5 = my $final_line6 = "";
  506. for ($counter1 = 0; $counter1 < @seq3_insert_startOnly; $counter1++){
  507. $final_line1 = join("\t",($seq3_insert_startOnly[$counter1],$seq3_insert_lengths[$counter1]));
  508. push (@final1,$final_line1);
  509. }
  510. for ($counter2 = 0; $counter2 < @seq3_delete_startOnly; $counter2++){
  511. $final_line2 = join("\t",($seq3_delete_startOnly[$counter2],$seq3_delete_lengths[$counter2]));
  512. push(@final2,$final_line2);
  513. }
  514. for ($counter3 = 0; $counter3 < @seq2_insert_startOnly; $counter3++){
  515. $final_line3 = join("\t",($seq2_insert_startOnly[$counter3],$seq2_insert_lengths[$counter3]));
  516. push(@final3,$final_line3);
  517. }
  518. for ($counter4 = 0; $counter4 < @seq2_delete_startOnly; $counter4++){
  519. $final_line4 = join("\t",($seq2_delete_startOnly[$counter4],$seq2_delete_lengths[$counter4]));
  520. push(@final4,$final_line4);
  521. }
  522. for ($counter5 = 0; $counter5 < @seq1_insert_startOnly; $counter5++){
  523. $final_line5 = join("\t",($seq1_insert_startOnly[$counter5],$seq1_insert_lengths[$counter5]));
  524. push(@final5,$final_line5);
  525. }
  526. for ($counter6 = 0; $counter6 < @seq1_delete_startOnly; $counter6++){
  527. $final_line6 = join("\t",($seq1_delete_startOnly[$counter6],$seq1_delete_lengths[$counter6]));
  528. push(@final6,$final_line6);
  529. }
  530. # format final output
  531. # # if inserts, increase coords for the sequence inserted, other sequences give coords for 5' and 3' bases flanking the gap
  532. # # for deletes, increase coords for other 2 sequences and the one deleted give coords for 5' and 3' bases flanking the gap
  533. get_final_format(@final5);
  534. get_final_format(@final6);
  535. get_final_format(@final3);
  536. get_final_format(@final4);
  537. get_final_format(@final1);
  538. get_final_format(@final2);
  539. sub get_final_format{
  540. my (@final) = @_;
  541. foreach (@final){
  542. my $event_line = $_;
  543. my @events = split(/\t/, $event_line);
  544. my $event_type = $events[10];
  545. my @name_align1 = split(/:/, $events[1]);
  546. my @name_align2 = split(/:/, $events[4]);
  547. my @name_align3 = split(/:/, $events[7]);
  548. my $seq1_event_start = my $seq1_event_end = my $seq2_event_start = my $seq2_event_end = my $seq3_event_start = my $seq3_event_end = 0;
  549. my $final_event_line = "";
  550. # seq1_insert
  551. if ($event_type =~ m/$ingroup1/ && $event_type =~ m/insert/){
  552. # only increase coord for human
  553. # remember that other two sequnences, the gap spans (coord - 1) --> coord
  554. $seq1_event_start = ($events[2]);
  555. $seq1_event_end = ($events[2]+$events[11]-1);
  556. $seq2_event_start = ($events[5]-1);
  557. $seq2_event_end = ($events[5]);
  558. $seq3_event_start = ($events[8]-1);
  559. $seq3_event_end = ($events[8]);
  560. $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9]));
  561. }
  562. # seq1_delete
  563. elsif ($event_type =~ m/$ingroup1/ && $event_type =~ m/delete/){
  564. # only increase coords for seq2 and seq3
  565. # remember for seq1, the gap spans (coord - 1) --> coord
  566. $seq1_event_start = ($events[2]-1);
  567. $seq1_event_end = ($events[2]);
  568. $seq2_event_start = ($events[5]);
  569. $seq2_event_end = ($events[5]+$events[11]-1);
  570. $seq3_event_start = ($events[8]);
  571. $seq3_event_end = ($events[8]+$events[11]-1);
  572. $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9]));
  573. }
  574. # seq2_insert
  575. elsif ($event_type =~ m/$ingroup2/ && $event_type =~ m/insert/){
  576. # only increase coords for seq2
  577. # remember that other two sequnences, the gap spans (coord - 1) --> coord
  578. $seq1_event_start = ($events[2]-1);
  579. $seq1_event_end = ($events[2]);
  580. $seq2_event_start = ($events[5]);
  581. $seq2_event_end = ($events[5]+$events[11]-1);
  582. $seq3_event_start = ($events[8]-1);
  583. $seq3_event_end = ($events[8]);
  584. $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9]));
  585. }
  586. # seq2_delete
  587. elsif ($event_type =~ m/$ingroup2/ && $event_type =~ m/delete/){
  588. # only increase coords for seq1 and seq3
  589. # remember for seq2, the gap spans (coord - 1) --> coord
  590. $seq1_event_start = ($events[2]);
  591. $seq1_event_end = ($events[2]+$events[11]-1);
  592. $seq2_event_start = ($events[5]-1);
  593. $seq2_event_end = ($events[5]);
  594. $seq3_event_start = ($events[8]);
  595. $seq3_event_end = ($events[8]+$events[11]-1);
  596. $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9]));
  597. }
  598. # start testing w/seq3_insert
  599. elsif ($event_type =~ m/$outgroup/ && $event_type =~ m/insert/){
  600. # only increase coord for rheMac
  601. # remember that other two sequnences, the gap spans (coord - 1) --> coord
  602. $seq1_event_start = ($events[2]-1);
  603. $seq1_event_end = ($events[2]);
  604. $seq2_event_start = ($events[5]-1);
  605. $seq2_event_end = ($events[5]);
  606. $seq3_event_start = ($events[8]);
  607. $seq3_event_end = ($events[8]+$events[11]-1);
  608. $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9]));
  609. }
  610. # seq3_delete
  611. elsif ($event_type =~ m/$outgroup/ && $event_type =~ m/delete/){
  612. # only increase coords for seq1 and seq2
  613. # remember for seq3, the gap spans (coord - 1) --> coord
  614. $seq1_event_start = ($events[2]);
  615. $seq1_event_end = ($events[2]+$events[11]-1);
  616. $seq2_event_start = ($events[5]);
  617. $seq2_event_end = ($events[5]+$events[11]-1);
  618. $seq3_event_start = ($events[8]-1);
  619. $seq3_event_end = ($events[8]);
  620. $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9]));
  621. }
  622. print OFILE2 "$final_event_line\n";
  623. }
  624. }
  625. close OFILE2;