/tools/stats/correlation.pl

https://bitbucket.org/cistrome/cistrome-harvard/ · Perl · 84 lines · 66 code · 11 blank · 7 comment · 16 complexity · 322bef75f4269461cb05178b5c41101f MD5 · raw file

  1. #!/usr/bin/perl
  2. ###########################################################################
  3. # Purpose: To calculate the correlation of two sets of scores in one file.
  4. # Usage: correlation.pl infile.bed output.txt column1 column2
  5. # (column start from 1)
  6. # Written by: Yi Zhang (June, 2005)
  7. ###########################################################################
  8. if (!$ARGV[0] || !$ARGV[1] || !defined($ARGV[2]) || !defined($ARGV[3]) ) {
  9. print STDERR "Usage: correlation.pl infile.bed output.txt column1 column2\n";
  10. print STDERR " (column start from 1)\n";
  11. exit;
  12. }
  13. my $file = $ARGV[0];
  14. my $out = $ARGV[1];
  15. die "<font color=\"yellow\">The input columns contain numerical values: $ARGV[2], $ARGV[3]</font>.\n" if ($ARGV[2] =~ /[a-zA-Z]+/ || $ARGV[3] =~ /[a-zA-Z]+/);
  16. my $col1 = $ARGV[2] - 1;
  17. my $col2 = $ARGV[3] - 1;
  18. my ($f, $o);
  19. my (@a, @b);
  20. my $n_t = 0;
  21. open($f, $file) or die "Could't open $file, $!\n";
  22. while(<$f>) {
  23. chomp;
  24. my @t = split(/\t/);
  25. if ($n_t == 0) {
  26. $n_t = scalar(@t) - 1;
  27. die "<font color=\"yellow\">The input column number exceeds the size of the file: $col1, $col2, $n_t</font>\n" if ( $col1 > $n_t || $col2 > $n_t );
  28. }
  29. die "<font color=\"yellow\">The columns you have selected contain non numeric characters:$t[$col1] and $t[$col2] \n</font>" if ($t[$col1] =~ /[a-zA-Z]+/ || $t[$col2] =~ /[a-zA-Z]+/);
  30. push(@a, $t[$col1]);
  31. push(@b, $t[$col2]);
  32. }
  33. close($f);
  34. my $result = correlation(\@a, \@b);
  35. open($o, ">$out") or die "Couldn't open $out, $!\n";
  36. $col1 = $col1 + 1;
  37. $col2 = $col2 + 1;
  38. print $o "The correlation of column $col1 and $col2 is $result\n";
  39. close($o);
  40. print "The correlation of column $col1 and $col2 is $result\n";
  41. sub correlation {
  42. my ($array1ref, $array2ref) = @_;
  43. my ($sum1, $sum2);
  44. my ($sum1_squared, $sum2_squared);
  45. foreach (@$array1ref) { $sum1 += $_; $sum1_squared += $_**2; }
  46. foreach (@$array2ref) { $sum2 += $_; $sum2_squared += $_**2; }
  47. my $numerator = (@$array1ref**2) * covariance($array1ref, $array2ref);
  48. my $denominator = sqrt(((@$array1ref * $sum1_squared) - ($sum1**2)) *
  49. ((@$array1ref * $sum2_squared) - ($sum2**2)));
  50. my $r;
  51. if ($denominator == 0) {
  52. print STDERR "The denominator is 0.\n";
  53. exit 0;
  54. } else {
  55. $r = $numerator / $denominator;
  56. }
  57. return $r;
  58. }
  59. sub covariance {
  60. my ($array1ref, $array2ref) = @_;
  61. my ($i, $result);
  62. for ($i = 0; $i < @$array1ref; $i++) {
  63. $result += $array1ref->[$i] * $array2ref->[$i];
  64. }
  65. $result /= @$array1ref;
  66. $result -= mean($array1ref) * mean($array2ref);
  67. }
  68. sub mean {
  69. my ($arrayref) = @_;
  70. my $result;
  71. foreach (@$arrayref) { $result += $_; }
  72. return $result/@$arrayref;
  73. }