/tools/regVariation/delete_overlapping_indels.pl

https://bitbucket.org/cistrome/cistrome-harvard/ · Perl · 94 lines · 48 code · 19 blank · 27 comment · 10 complexity · 848e2ae2ebfafc25f57556a96c74f167 MD5 · raw file

  1. #!/usr/bin/perl -w
  2. # This program detects overlapping indels in a chromosome and keeps all non-overlapping indels. As for overlapping indels,
  3. # the first encountered one is kept and all others are removed. It requires three inputs:
  4. # The first input is a TABULAR format file containing coordinates of indels in blocks extracted from multi-alignment.
  5. # The second input is an integer number representing the number of the column where indel start coordinates are stored in the input file.
  6. # The third input is an integer number representing the number of the column where indel end coordinates are stored in the input file.
  7. # The output is a TABULAR format file containing all non-overlapping indels in the input file, and the first encountered indel of overlapping ones.
  8. # Note: The number of the first column is 1.
  9. use strict;
  10. use warnings;
  11. #varaibles to handle information related to indels
  12. my $indel1 = "";
  13. my $indel2 = "";
  14. my @indelArray1 = ();
  15. my @indelArray2 = ();
  16. my $lineCounter1 = 0;
  17. my $lineCounter2 = 0;
  18. my $totalNumberofNonOverlappingIndels = 0;
  19. # check to make sure having correct files
  20. my $usage = "usage: delete_overlapping_indels.pl [TABULAR.in] [indelStartColumn] [indelEndColumn] [TABULAR.out]\n";
  21. die $usage unless @ARGV == 4;
  22. my $inputFile = $ARGV[0];
  23. my $indelStartColumn = $ARGV[1] - 1;
  24. my $indelEndColumn = $ARGV[2] - 1;
  25. my $outputFile = $ARGV[3];
  26. #verifie column numbers
  27. if ($indelStartColumn < 0 ){
  28. die ("The indel start column number is invalid \n");
  29. }
  30. if ($indelEndColumn < 0 ){
  31. die ("The indel end column number is invalid \n");
  32. }
  33. #open the input and output files
  34. open (INPUT, "<", $inputFile) || die ("Could not open file $inputFile \n");
  35. open (OUTPUT, ">", $outputFile) || die ("Could not open file $outputFile \n");
  36. #store the input file in the array @rawData
  37. my @indelsRawData = <INPUT>;
  38. #iterated through the indels of the input file
  39. INDEL1:
  40. foreach $indel1 (@indelsRawData){
  41. chomp ($indel1);
  42. $lineCounter1++;
  43. #get the first indel
  44. @indelArray1 = split(/\t/, $indel1);
  45. #our purpose is to detect overlapping indels and to store one copy of them only in the output file
  46. #all other non-overlapping indels will stored in the output file also
  47. $lineCounter2 = 0;
  48. #iterated through the indels of the input file
  49. INDEL2:
  50. foreach $indel2 (@indelsRawData){
  51. chomp ($indel2);
  52. $lineCounter2++;
  53. if ($lineCounter2 > $lineCounter1){
  54. #get the second indel
  55. @indelArray2 = split(/\t/, $indel2);
  56. #check if the two indels are overlapping
  57. if (($indelArray2[$indelEndColumn] >= $indelArray1[$indelStartColumn] && $indelArray2[$indelEndColumn] <= $indelArray1[$indelEndColumn]) || ($indelArray2[$indelStartColumn] >= $indelArray1[$indelStartColumn] && $indelArray2[$indelStartColumn] <= $indelArray1[$indelEndColumn])){
  58. #print ("There is an overlap between" . "\n" . $indel1 . "\n" . $indel2 . "\n");
  59. #print("The two overlapping indels are located at the lines: " . $lineCounter1 . " " . $lineCounter2 . "\n\n");
  60. #break out of the loop and go back to the outerloop
  61. next INDEL1;
  62. }
  63. else{
  64. #print("The two non-overlaapping indels are located at the lines: " . $lineCounter1 . " " . $lineCounter2 . "\n");
  65. }
  66. }
  67. }
  68. print OUTPUT $indel1 . "\n";
  69. $totalNumberofNonOverlappingIndels++;
  70. }
  71. #print("The total number of indels is: " . $lineCounter1 . "\n");
  72. #print("The total number of non-overlapping indels is: " . $totalNumberofNonOverlappingIndels . "\n");
  73. #close the input and output files
  74. close(OUTPUT);
  75. close(INPUT);