PageRenderTime 27ms CodeModel.GetById 12ms app.highlight 12ms RepoModel.GetById 2ms app.codeStats 0ms

/tools/stats/correlation.pl

https://bitbucket.org/cistrome/cistrome-harvard/
Perl | 84 lines | 68 code | 9 blank | 7 comment | 10 complexity | 322bef75f4269461cb05178b5c41101f MD5 | raw file
 1#!/usr/bin/perl
 2
 3###########################################################################
 4# Purpose: To calculate the correlation of two sets of scores in one file.
 5# Usage: correlation.pl infile.bed output.txt column1 column2
 6#        (column start from 1)
 7# Written by: Yi Zhang  (June, 2005)
 8###########################################################################
 9if (!$ARGV[0] || !$ARGV[1] || !defined($ARGV[2]) || !defined($ARGV[3]) ) {
10   print STDERR "Usage: correlation.pl infile.bed output.txt column1 column2\n";
11   print STDERR "       (column start from 1)\n"; 
12   exit;
13}
14my $file = $ARGV[0];
15my $out = $ARGV[1];
16
17die "<font color=\"yellow\">The input columns contain numerical values: $ARGV[2], $ARGV[3]</font>.\n" if ($ARGV[2] =~ /[a-zA-Z]+/ || $ARGV[3] =~ /[a-zA-Z]+/);
18
19my $col1 = $ARGV[2] - 1;
20my $col2 = $ARGV[3] - 1;
21
22my ($f, $o);
23my (@a, @b);
24
25my $n_t = 0;
26open($f, $file) or die "Could't open $file, $!\n";
27while(<$f>) {
28  chomp;
29  my @t = split(/\t/);
30  if ($n_t == 0) { 
31     $n_t = scalar(@t) - 1; 
32     die "<font color=\"yellow\">The input column number exceeds the size of the file: $col1, $col2, $n_t</font>\n" if ( $col1 > $n_t || $col2 > $n_t );
33  }
34  die "<font color=\"yellow\">The columns you have selected contain non numeric characters:$t[$col1] and $t[$col2] \n</font>" if ($t[$col1] =~ /[a-zA-Z]+/ || $t[$col2] =~ /[a-zA-Z]+/);  
35  push(@a, $t[$col1]);
36  push(@b, $t[$col2]);
37}
38close($f);
39
40my $result = correlation(\@a, \@b);
41
42open($o, ">$out") or die "Couldn't open $out, $!\n";
43$col1 = $col1 + 1;
44$col2 = $col2 + 1;
45print $o "The correlation of column $col1 and $col2 is $result\n";
46close($o);
47print "The correlation of column $col1 and $col2 is $result\n";
48
49sub correlation {
50   my ($array1ref, $array2ref) = @_;
51   my ($sum1, $sum2);
52   my ($sum1_squared, $sum2_squared); 
53   foreach (@$array1ref) { $sum1 += $_;  $sum1_squared += $_**2; }
54   foreach (@$array2ref) { $sum2 += $_;  $sum2_squared += $_**2; }
55   my $numerator = (@$array1ref**2) * covariance($array1ref, $array2ref);
56   my $denominator = sqrt(((@$array1ref * $sum1_squared) - ($sum1**2)) *
57                          ((@$array1ref * $sum2_squared) - ($sum2**2)));
58   my $r;
59   if ($denominator == 0) {
60     print STDERR "The denominator is 0.\n";
61	 exit 0; 
62   } else {
63      $r = $numerator / $denominator;
64   }
65    return $r;
66}
67
68sub covariance {
69   my ($array1ref, $array2ref) = @_;
70   my ($i, $result);
71   for ($i = 0; $i < @$array1ref; $i++) {
72       $result += $array1ref->[$i] * $array2ref->[$i];
73   }
74   $result /= @$array1ref;
75   $result -= mean($array1ref) * mean($array2ref);
76}
77
78sub mean {
79  my ($arrayref) = @_;
80  my $result;
81  foreach (@$arrayref) { $result += $_; }
82  return $result/@$arrayref;
83}
84