PageRenderTime 44ms CodeModel.GetById 33ms app.highlight 9ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/regVariation/delete_overlapping_indels.pl

https://bitbucket.org/cistrome/cistrome-harvard/
Perl | 94 lines | 48 code | 19 blank | 27 comment | 10 complexity | 848e2ae2ebfafc25f57556a96c74f167 MD5 | raw file
 1#!/usr/bin/perl -w
 2
 3# This program detects overlapping indels in a chromosome and keeps all non-overlapping indels. As for overlapping indels, 
 4# the first encountered one is kept and all others are removed. It requires three inputs: 
 5# The first input is a TABULAR format file containing coordinates of indels in blocks extracted from multi-alignment.
 6# The second input is an integer number representing the number of the column where indel start coordinates are stored in the input file.
 7# The third input is an integer number representing the number of the column where indel end coordinates are stored in the input file.
 8# The output is a TABULAR format file containing all non-overlapping indels in the input file, and the first encountered indel of overlapping ones.
 9# Note: The number of the first column is 1.
10 
11use strict;
12use warnings;
13
14#varaibles to handle information related to indels
15my $indel1 = "";
16my $indel2 = "";
17my @indelArray1 = ();
18my @indelArray2 = ();
19my $lineCounter1 = 0;
20my $lineCounter2 = 0;
21my $totalNumberofNonOverlappingIndels = 0;
22
23# check to make sure having correct files
24my $usage = "usage: delete_overlapping_indels.pl [TABULAR.in] [indelStartColumn] [indelEndColumn] [TABULAR.out]\n";
25die $usage unless @ARGV == 4;
26
27my $inputFile = $ARGV[0];
28my $indelStartColumn = $ARGV[1] - 1;
29my $indelEndColumn = $ARGV[2] - 1;
30my $outputFile = $ARGV[3];
31
32#verifie column numbers
33if ($indelStartColumn < 0 ){
34	die ("The indel start column number is invalid \n"); 
35}
36if ($indelEndColumn < 0 ){
37	die ("The indel end column number is invalid \n"); 
38}
39
40#open the input and output files
41open (INPUT, "<", $inputFile) || die ("Could not open file $inputFile \n"); 
42open (OUTPUT, ">", $outputFile) || die ("Could not open file $outputFile \n"); 
43
44#store the input file in the array @rawData
45my @indelsRawData = <INPUT>;
46
47#iterated through the indels of the input file
48INDEL1:
49foreach $indel1 (@indelsRawData){
50	chomp ($indel1);
51	$lineCounter1++;
52	
53	#get the first indel
54	@indelArray1 = split(/\t/, $indel1);
55	 
56	#our purpose is to detect overlapping indels and to store one copy of them only in the output file
57	#all other non-overlapping indels will stored in the output file also
58			 
59	$lineCounter2 = 0;
60		 
61	#iterated through the indels of the input file
62	INDEL2:
63	foreach $indel2 (@indelsRawData){
64		chomp ($indel2);
65		$lineCounter2++;
66				
67		if ($lineCounter2 > $lineCounter1){
68			#get the second indel
69			@indelArray2 = split(/\t/, $indel2);
70		 				
71 			#check if the two indels are overlapping
72 			if (($indelArray2[$indelEndColumn] >= $indelArray1[$indelStartColumn] && $indelArray2[$indelEndColumn] <= $indelArray1[$indelEndColumn]) || ($indelArray2[$indelStartColumn] >= $indelArray1[$indelStartColumn] && $indelArray2[$indelStartColumn] <= $indelArray1[$indelEndColumn])){
73 				#print ("There is an overlap between" . "\n" . $indel1 . "\n" . $indel2 . "\n");
74 				#print("The two overlapping indels are located at the lines: " . $lineCounter1 . " " . $lineCounter2 . "\n\n");
75 				
76 				#break out of the loop and go back to the outerloop
77 				next INDEL1;
78 			}
79 			else{
80 				#print("The two non-overlaapping indels are located at the lines: " . $lineCounter1 . " " . $lineCounter2 . "\n");
81 			}
82		}
83	}
84		 
85	print OUTPUT $indel1 . "\n";
86	$totalNumberofNonOverlappingIndels++;
87}
88
89#print("The total number of indels is: " . $lineCounter1 . "\n");
90#print("The total number of non-overlapping indels is: " . $totalNumberofNonOverlappingIndels . "\n");
91
92#close the input and output files
93close(OUTPUT);
94close(INPUT);