PageRenderTime 64ms CodeModel.GetById 33ms RepoModel.GetById 0ms app.codeStats 0ms

/tools/evolution/codingSnps.pl

https://bitbucket.org/cistrome/cistrome-harvard/
Perl | 569 lines | 518 code | 18 blank | 33 comment | 95 complexity | b7694560eb4693214d20e6a7e83c9dbe MD5 | raw file
  1. #!/usr/bin/perl -w
  2. use strict;
  3. #########################################################################
  4. # codingSnps.pl
  5. # This takes a bed file with the names being / separated nts
  6. # and a gene bed file with cds start and stop.
  7. # It then checks for changes in coding regions, reporting
  8. # those that cause a frameshift or substitution in the amino acid.
  9. # Output columns:
  10. # chrom, start, end, allele as given (amb code translated)
  11. # Gene ID from genes file, ref amino acid:variant amino acids,
  12. # codon number, (in strand of gene)ref nt, refCodon:variantCodons
  13. #########################################################################
  14. my $seqFlag = "2bit"; #flag to set sequence type 2bit|nib
  15. if (!@ARGV or scalar @ARGV < 3) {
  16. print "Usage: codingSnps.pl snps.bed genes.bed (/dir/*$seqFlag|Galaxy build= loc=) [chr=# start=# end=# snp=# strand=#|-|+ keepColumns=1 synon=1 unique=1] > codingSnps.txt\n";
  17. exit;
  18. }
  19. my $uniq = 0; #flag for whether want uniq positions
  20. my $syn = 0; #flag for if want synonomous changes rather than non-syn
  21. my $keep = 0; #keep old columns and append new ones
  22. my $snpFile = shift @ARGV;
  23. my $geneFile = shift @ARGV;
  24. my $nibDir = shift @ARGV; #2bit or nib, depending on flag above
  25. if ($nibDir eq 'Galaxy') { getGalaxyInfo(); }
  26. my $col0 = 0; #bed like columns in default positions
  27. my $col1 = 1;
  28. my $col2 = 2;
  29. my $col3 = 3;
  30. my $strand = -1;
  31. #column positions 1 based coming in (for Galaxy)
  32. foreach (@ARGV) {
  33. if (/chr=(\d+)/) { $col0 = $1 -1; }
  34. elsif (/start=(\d+)/) { $col1 = $1 -1; }
  35. elsif (/end=(\d+)/) { $col2 = $1 -1; }
  36. elsif (/snp=(\d+)/) { $col3 = $1 -1; }
  37. elsif (/keepColumns=1/) { $keep = 1; }
  38. elsif (/synon=1/) { $syn = 1; }
  39. elsif (/unique=1/) { $uniq = 1; }
  40. elsif (/strand=(\d+)/) { $strand = $1 -1; } #0 based column
  41. elsif (/strand=-/) { $strand = -99; } #special case of all minus
  42. }
  43. if ($col0 < 0 || $col1 < 0 || $col2 < 0 || $col3 < 0) {
  44. print STDERR "ERROR column numbers are given with origin 1\n";
  45. exit 1;
  46. }
  47. my @genes; #bed lines for genes, sorted by chrom and start
  48. my %chrSt; #index in array where each chrom starts
  49. my %codon; #hash of codon amino acid conversions
  50. my $ends = 0; #ends vs sizes in bed 11 position, starts relative to chrom
  51. my $ignoreN = 1; #skip N
  52. my $origAll; #alleles from input file (before changes for strand)
  53. my %amb = (
  54. "R" => "A/G",
  55. "Y" => "C/T",
  56. "S" => "C/G",
  57. "W" => "A/T",
  58. "K" => "G/T",
  59. "M" => "A/C",
  60. "B" => "C/G/T",
  61. "D" => "A/G/T",
  62. "H" => "A/C/T",
  63. "V" => "A/C/G",
  64. "N" => "A/C/G/T"
  65. );
  66. fill_codon();
  67. open(FH, "cat $geneFile | sort -k1,1 -k2,2n |")
  68. or die "Couldn't open and sort $geneFile, $!\n";
  69. my $i = 0;
  70. while(<FH>) {
  71. chomp;
  72. if (/refGene.cdsEnd|ccdsGene.exonEnds/) { $ends = 1; next; }
  73. push(@genes, "$_");
  74. my @f = split(/\t/);
  75. if (!exists $chrSt{$f[0]}) { $chrSt{$f[0]} = $i; }
  76. $i++;
  77. }
  78. close FH or die "Couldn't close $geneFile, $!\n";
  79. if ($ends) { print STDERR "WARNING using block ends rather than sizes\n"; }
  80. #open snps sorted as well
  81. my $s1 = $col0 + 1; #sort order is origin 1
  82. my $s2 = $col1 + 1;
  83. open(FH, "cat $snpFile | sort -k$s1,$s1 -k$s2,${s2}n |")
  84. or die "Couldn't open and sort $snpFile, $!\n";
  85. $i = 0;
  86. my @g; #one genes fields, should be used repeatedly
  87. my %done;
  88. while(<FH>) {
  89. chomp;
  90. if (/^\s*#/) { next; } #comment
  91. my @s = split(/\t/); #SNP fields
  92. if (!@s or !$s[$col0]) { die "ERROR missing SNP data, $_\n"; }
  93. my $size = $#s;
  94. if ($col0 > $size || $col1 > $size || $col2 > $size || $col3 > $size) {
  95. print STDERR "ERROR file has fewer columns than requested, requested columns (0 based) $col0 $col1 $col2 $col3, file has $size\n";
  96. exit 1;
  97. }
  98. if ($strand >= 0 && $strand > $size) {
  99. print STDERR "ERROR file has fewer columns than requested, requested strand in $strand (0 based), file has $size\n";
  100. exit 1;
  101. }
  102. if ($s[$col1] =~ /\D/) {
  103. print STDERR "ERROR the start point must be an integer not $s[$col1]\n";
  104. exit 1;
  105. }
  106. if ($s[$col2] =~ /\D/) {
  107. print STDERR "ERROR the start point must be an integer not $s[$col2]\n";
  108. exit 1;
  109. }
  110. if ($s[$col3] eq 'N' && $ignoreN) { next; }
  111. if (exists $amb{$s[$col3]}) { $s[$col3] = $amb{$s[$col3]}; }
  112. if (($strand >= 0 && $s[$strand] eq '-') or $strand == -99) {
  113. #reverse complement nts
  114. $origAll = $s[$col3];
  115. $s[$col3] = reverseCompAlleles($s[$col3]);
  116. }else { undef $origAll }
  117. if (!@g && exists $chrSt{$s[$col0]}) { #need to fetch first gene row
  118. $i = $chrSt{$s[$col0]};
  119. @g = split(/\t/, $genes[$i]);
  120. if (scalar @g < 12) {
  121. print STDERR "ERROR the gene file must be the whole genes in BED format\n";
  122. exit 1;
  123. }
  124. }elsif (!@g) {
  125. next; #no gene for this chrom
  126. }elsif ($s[$col0] ne $g[0] && exists $chrSt{$s[$col0]}) { #new chrom
  127. $i = $chrSt{$s[$col0]};
  128. @g = split(/\t/, $genes[$i]);
  129. }elsif ($s[$col0] ne $g[0]) {
  130. next; #no gene for this chrom
  131. }elsif ($s[$col1] < $g[1] && $i == $chrSt{$s[$col0]}) {
  132. next; #before any genes
  133. }elsif ($s[$col1] > $g[2] && ($i == $#genes or $genes[$i+1] !~ $s[$col0])) {
  134. next; #after all genes on chr
  135. }else {
  136. while ($s[$col1] > $g[2] && $i < $#genes) {
  137. $i++;
  138. @g = split(/\t/, $genes[$i]);
  139. if ($s[$col0] ne $g[0]) { last; } #end of gene
  140. }
  141. if ($s[$col0] ne $g[0] or $s[$col1] < $g[1] or $s[$col1] > $g[2]) {
  142. next; #no overlap with genes
  143. }
  144. }
  145. processSnp(\@s, \@g);
  146. if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { next; }
  147. my $k = $i + 1; #check for more genes without losing data of first
  148. if ($k <= $#genes) {
  149. my @g2 = split(/\t/, $genes[$k]);
  150. while (@g2 && $k <= $#genes) {
  151. @g2 = split(/\t/, $genes[$k]);
  152. if ($s[$col0] ne $g2[0]) {
  153. undef @g2;
  154. last; #not same chrom
  155. }else {
  156. while ($s[$col1] > $g2[2] && $k < $#genes) {
  157. $k++;
  158. @g2 = split(/\t/, $genes[$k]);
  159. if ($s[$col0] ne $g2[0]) { last; } #end of chrom
  160. }
  161. if ($s[$col0] ne $g2[0] or $s[$col1] < $g2[1] or $s[$col1] > $g2[2]) {
  162. undef @g2;
  163. last; #no overlap with more genes
  164. }
  165. processSnp(\@s, \@g2);
  166. if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { last; }
  167. }
  168. $k++;
  169. }
  170. }
  171. }
  172. close FH or die "Couldn't close $snpFile, $!\n";
  173. exit;
  174. ########################################################################
  175. sub processSnp {
  176. my $sref = shift;
  177. my $gref = shift;
  178. #overlaps gene, but maybe not coding seq
  179. #inside cds
  180. if ($sref->[$col1] + 1 < $gref->[6] or $sref->[$col2] > $gref->[7]) {
  181. return; #outside of coding
  182. }
  183. #now check exon
  184. my $i = 0;
  185. my @st = split(/,/, $gref->[11]);
  186. my @size = split(/,/, $gref->[10]);
  187. if (scalar @st ne $gref->[9]) { return; } #cant do this gene #die "bad gene $gref->[3]\n"; }
  188. my @pos;
  189. my $in = 0;
  190. for($i = 0; $i < $gref->[9]; $i++) {
  191. my $sta = $gref->[1] + $st[$i] + 1; #1 based position
  192. my $end = $sta + $size[$i] - 1; #
  193. if ($ends) { $end = $size[$i]; $sta = $st[$i] + 1; } #ends instead of sizes
  194. if ($end < $gref->[6]) { next; } #utr only
  195. if ($sta > $gref->[7]) { next; } #utr only
  196. #shorten to coding only
  197. if ($sta < $gref->[6]) { $sta = $gref->[6] + 1; }
  198. if ($end > $gref->[7]) { $end = $gref->[7]; }
  199. if ($sref->[$col1] + 1 >= $sta && $sref->[$col2] <= $end) { $in = 1; }
  200. elsif ($sref->[$col1] == $sref->[$col2] && $sref->[$col2] <= $end && $sref->[$col2] >= $sta) { $in = 1; }
  201. push(@pos, ($sta .. $end)); #add exon worth of positions
  202. }
  203. #@pos has coding positions for whole gene (chr coors),
  204. #and $in has whether we need to continue
  205. if (!$in) { return; } #not in coding exon
  206. if ((scalar @pos) % 3 != 0) { return; } #partial gene? not even codons
  207. if ($sref->[$col3] =~ /^-+\/[ACTG]+$/ or $sref->[$col3] =~ /^[ACTG]+\/-+$/ or
  208. $sref->[$col3] =~ /^-+$/) { #indel or del
  209. my $copy = $sref->[$col3];
  210. my $c = ($copy =~ tr/-//);
  211. if ($c % 3 == 0) { return; } #not frameshift
  212. #handle bed4 or any interval file
  213. if (!$keep) {
  214. print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]";
  215. print "\t$gref->[3]\tframeshift\n";
  216. }else {
  217. my @s = @{$sref};
  218. print join("\t", @s), "\t$gref->[3]\tframeshift\n";
  219. }
  220. $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++;
  221. return;
  222. }elsif ($sref->[$col1] == $sref->[$col2]) { #insertion
  223. my $copy = $sref->[$col3];
  224. my $c = ($copy =~ tr/\[ACTG]+//);
  225. if ($c % 3 == 0) { return; } #not frameshift
  226. #handle bed4 or any interval file
  227. if (!$keep) {
  228. print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]";
  229. print "\t$gref->[3]\tframeshift\n";
  230. }else {
  231. my @s = @{$sref};
  232. print join("\t", @s), "\t$gref->[3]\tframeshift\n";
  233. }
  234. $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++;
  235. return;
  236. }elsif ($sref->[$col3] =~ /-/) { #indel and sub?
  237. return; #skip
  238. }
  239. #check for amino acid substitutions
  240. my $s = $sref->[$col1] + 1;
  241. my $e = $sref->[$col2];
  242. my $len = $sref->[$col2] - $sref->[$col1];
  243. if ($gref->[5] eq '-') {
  244. @pos = reverse(@pos);
  245. my $t = $s;
  246. $s = $e;
  247. $e = $t;
  248. }
  249. $i = 0;
  250. my $found = 0;
  251. foreach (@pos) {
  252. if ($s == $_) {
  253. $found = 1;
  254. last;
  255. }
  256. $i++;
  257. }
  258. if ($found) {
  259. my $fs = $i; #keep original start index
  260. #have index where substitution starts
  261. my $cp = $i % 3;
  262. $i -= $cp; #i is now first position in codon
  263. my $cdNum = int($i / 3) + 1;
  264. my $ls = $i;
  265. if (!defined $ls) { die "ERROR not defined ls for $fs $sref->[$col2]\n"; }
  266. if (!@pos) { die "ERROR not defined array pos\n"; }
  267. if (!defined $pos[$ls]) { die "ERROR not defined pos at $ls\n"; }
  268. if (!defined $e) { die "ERROR not defined e for $pos[0] $pos[1] $pos[2]\n"; }
  269. while ($ls <= $#pos && $pos[$ls] ne $e) {
  270. $ls++;
  271. }
  272. my $i2 = $ls + (2 - ($ls % 3));
  273. if ($i2 > $#pos) { return; } #not a full codon, partial gene?
  274. if ($i2 - $i < 2) { die "not a full codon positions $i to $i2 for $sref->[3]\n"; }
  275. my $oldnts = getnts($sref->[$col0], @pos[$i..$i2]);
  276. if (!$oldnts) { die "Failed to get sequence for $sref->[$col0] $pos[$i] .. $pos[$i2]\n"; }
  277. my @vars = split(/\//, $sref->[$col3]);
  278. if ($gref->[5] eq '-') { #complement oldnts and revcomp vars
  279. $oldnts = compl($oldnts);
  280. if (!$oldnts) { return; } #skip this one
  281. $oldnts = join('', (reverse(split(/ */, $oldnts))));
  282. foreach (@vars) {
  283. $_ = reverse(split(/ */)); #needed for indels
  284. $_ = compl($_);
  285. }
  286. }
  287. my $r = $fs - $i; #difference in old indexes gives new index
  288. my @newnts;
  289. my $changed = '';
  290. foreach my $v (@vars) {
  291. if (!$v or length($v) != 1) { return; } #only simple changes
  292. my @new = split(/ */, $oldnts);
  293. $changed = splice(@new, $r, $len, split(/ */, $v));
  294. #should only change single nt
  295. push(@newnts, join("", @new));
  296. }
  297. #now compute amino acids
  298. my $oldaa = getaa($oldnts);
  299. my $codon = "$oldnts:";
  300. my @newaa;
  301. my $change = 0; #flag for if there is a change
  302. foreach my $v (@newnts) {
  303. my $t = getaa($v);
  304. if ($t ne $oldaa) { $change = 1; }
  305. push(@newaa, "$t");
  306. $codon .= "$v/";
  307. }
  308. $codon =~ s/\/$//;
  309. if (!$change && $syn) {
  310. if (!$keep) {
  311. print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]";
  312. print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\t$cdNum\t$changed\t$codon\n";
  313. }else {
  314. my @s = @{$sref};
  315. print join("\t", @s),
  316. "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\t$cdNum\t$changed\t$codon\n";
  317. }
  318. $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++;
  319. return;
  320. }elsif ($syn) { return; } #only want synonymous changes
  321. if (!$change) { return; } #no change in amino acids
  322. if (!$keep) {
  323. my $a = $sref->[$col3];
  324. if (($strand >= 0 && $origAll) or $strand == -99) { $a = $origAll; }
  325. print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$a";
  326. #my $minus = $changed; #in case minus strand and change back
  327. #if ($gref->[5] eq '-') { $changed = compl($changed); } #use plus for ref
  328. if (!$changed) { return; } #skip this one
  329. print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\t$cdNum\t$changed\t$codon\n";
  330. }else {
  331. my @s = @{$sref};
  332. if (($strand >= 0 && $origAll) or $strand == -99) { $s[$col3] = $origAll; }
  333. print join("\t", @s);
  334. #my $minus = $changed; #in case minus strand and change back
  335. #if ($gref->[5] eq '-') { $changed = compl($changed); } #use plus for ref
  336. if (!$changed) { return; } #skip this one
  337. print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\t$cdNum\t$changed\t$codon\n";
  338. }
  339. $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++;
  340. }
  341. }
  342. sub getnts {
  343. my $chr = shift;
  344. my @pos = @_; #list of positions not necessarily in order
  345. #list may be reversed or have gaps(introns), at least 3 bps
  346. my $seq = '';
  347. if (scalar @pos < 3) { die "too small region for $chr $pos[0]\n"; }
  348. if ($pos[0] < $pos[1]) { #not reversed
  349. my $s = $pos[0];
  350. for(my $i = 1; $i <= $#pos; $i++) {
  351. if ($pos[$i] == $pos[$i-1] + 1) { next; }
  352. if ($seqFlag eq '2bit') {
  353. $seq .= fetchSeq2bit($chr, $s, $pos[$i-1]);
  354. }else {
  355. $seq .= fetchSeqNib($chr, $s, $pos[$i-1]);
  356. }
  357. $s = $pos[$i];
  358. }
  359. if (length $seq != scalar @pos) { #still need to fetch seq
  360. if ($seqFlag eq '2bit') {
  361. $seq .= fetchSeq2bit($chr, $s, $pos[$#pos]);
  362. }else {
  363. $seq .= fetchSeqNib($chr, $s, $pos[$#pos]);
  364. }
  365. }
  366. }else { #reversed
  367. my $s = $pos[$#pos];
  368. for(my $i = $#pos -1; $i >= 0; $i--) {
  369. if ($pos[$i] == $pos[$i+1] + 1) { next; }
  370. if ($seqFlag eq '2bit') {
  371. $seq .= fetchSeq2bit($chr, $s, $pos[$i+1]);
  372. }else {
  373. $seq .= fetchSeqNib($chr, $s, $pos[$i+1]);
  374. }
  375. $s = $pos[$i];
  376. }
  377. if (length $seq != scalar @pos) { #still need to fetch seq
  378. if ($seqFlag eq '2bit') {
  379. $seq .= fetchSeq2bit($chr, $s, $pos[0]);
  380. }else {
  381. $seq .= fetchSeqNib($chr, $s, $pos[0]);
  382. }
  383. }
  384. }
  385. }
  386. sub fetchSeq2bit {
  387. my $chr = shift;
  388. my $st = shift;
  389. my $end = shift;
  390. my $strand = '+';
  391. $st--; #change to UCSC numbering
  392. open (BIT, "twoBitToFa -seq=$chr -start=$st -end=$end $nibDir stdout |") or
  393. die "Couldn't run twoBitToFa, $!\n";
  394. my $seq = '';
  395. while (<BIT>) {
  396. chomp;
  397. if (/^>/) { next; } #header
  398. $seq .= uc($_);
  399. }
  400. close BIT or die "Couldn't finish twoBitToFa on $chr $st $end, $!\n";
  401. return $seq;
  402. }
  403. sub fetchSeqNib {
  404. my $chr = shift;
  405. my $st = shift;
  406. my $end = shift;
  407. my $strand = '+';
  408. $st--; #change to UCSC numbering
  409. open (NIB, "nibFrag -upper $nibDir/${chr}.nib $st $end $strand stdout |") or die "Couldn't run nibFrag, $!\n";
  410. my $seq = '';
  411. while (<NIB>) {
  412. chomp;
  413. if (/^>/) { next; } #header
  414. $seq .= $_;
  415. }
  416. close NIB or die "Couldn't finish nibFrag on $chr $st $end, $!\n";
  417. return $seq;
  418. }
  419. sub compl {
  420. my $nts = shift;
  421. my $comp = '';
  422. if (!$nts) { die "ERROR called compl with nts undefined"; }
  423. foreach my $n (split(/ */, $nts)) {
  424. if ($n eq 'A') { $comp .= 'T'; }
  425. elsif ($n eq 'T') { $comp .= 'A'; }
  426. elsif ($n eq 'C') { $comp .= 'G'; }
  427. elsif ($n eq 'G') { $comp .= 'C'; }
  428. elsif ($n eq 'N') { $comp .= 'N'; }
  429. elsif ($n eq '-') { $comp .= '-'; } #deletion
  430. else { $comp = undef; }
  431. }
  432. return $comp;
  433. }
  434. sub reverseCompAlleles {
  435. my $all = shift;
  436. my @nt = split(/\//, $all);
  437. my $rv = '';
  438. foreach my $n (@nt) {
  439. $n = reverse(split(/ */, $n)); #needed for indels
  440. $n = compl($n);
  441. $rv .= "$n/";
  442. }
  443. $rv =~ s/\/$//;
  444. return $rv;
  445. }
  446. sub getaa {
  447. my $nts = shift; #in multiples of 3
  448. my $aa = '';
  449. my @n = split(/ */, $nts);
  450. while (@n) {
  451. my @t = splice(@n, 0, 3);
  452. my $n = uc(join("", @t));
  453. if (!exists $codon{$n}) { $aa .= 'N'; next; }
  454. $aa .= $codon{$n};
  455. }
  456. return $aa;
  457. }
  458. sub fill_codon {
  459. $codon{GCA} = 'Ala';
  460. $codon{GCC} = 'Ala';
  461. $codon{GCG} = 'Ala';
  462. $codon{GCT} = 'Ala';
  463. $codon{CGG} = 'Arg';
  464. $codon{CGT} = 'Arg';
  465. $codon{CGC} = 'Arg';
  466. $codon{AGA} = 'Arg';
  467. $codon{AGG} = 'Arg';
  468. $codon{CGA} = 'Arg';
  469. $codon{AAC} = 'Asn';
  470. $codon{AAT} = 'Asn';
  471. $codon{GAC} = 'Asp';
  472. $codon{GAT} = 'Asp';
  473. $codon{TGC} = 'Cys';
  474. $codon{TGT} = 'Cys';
  475. $codon{CAG} = 'Gln';
  476. $codon{CAA} = 'Gln';
  477. $codon{GAA} = 'Glu';
  478. $codon{GAG} = 'Glu';
  479. $codon{GGG} = 'Gly';
  480. $codon{GGA} = 'Gly';
  481. $codon{GGC} = 'Gly';
  482. $codon{GGT} = 'Gly';
  483. $codon{CAC} = 'His';
  484. $codon{CAT} = 'His';
  485. $codon{ATA} = 'Ile';
  486. $codon{ATT} = 'Ile';
  487. $codon{ATC} = 'Ile';
  488. $codon{CTA} = 'Leu';
  489. $codon{CTC} = 'Leu';
  490. $codon{CTG} = 'Leu';
  491. $codon{CTT} = 'Leu';
  492. $codon{TTG} = 'Leu';
  493. $codon{TTA} = 'Leu';
  494. $codon{AAA} = 'Lys';
  495. $codon{AAG} = 'Lys';
  496. $codon{ATG} = 'Met';
  497. $codon{TTC} = 'Phe';
  498. $codon{TTT} = 'Phe';
  499. $codon{CCT} = 'Pro';
  500. $codon{CCA} = 'Pro';
  501. $codon{CCC} = 'Pro';
  502. $codon{CCG} = 'Pro';
  503. $codon{TCA} = 'Ser';
  504. $codon{AGC} = 'Ser';
  505. $codon{AGT} = 'Ser';
  506. $codon{TCC} = 'Ser';
  507. $codon{TCT} = 'Ser';
  508. $codon{TCG} = 'Ser';
  509. $codon{TGA} = 'Stop';
  510. $codon{TAG} = 'Stop';
  511. $codon{TAA} = 'Stop';
  512. $codon{ACT} = 'Thr';
  513. $codon{ACA} = 'Thr';
  514. $codon{ACC} = 'Thr';
  515. $codon{ACG} = 'Thr';
  516. $codon{TGG} = 'Trp';
  517. $codon{TAT} = 'Tyr';
  518. $codon{TAC} = 'Tyr';
  519. $codon{GTC} = 'Val';
  520. $codon{GTA} = 'Val';
  521. $codon{GTG} = 'Val';
  522. $codon{GTT} = 'Val';
  523. }
  524. sub getGalaxyInfo {
  525. my $build;
  526. my $locFile;
  527. foreach (@ARGV) {
  528. if (/build=(.*)/) { $build = $1; }
  529. elsif (/loc=(.*)/) { $locFile = $1; }
  530. }
  531. if (!$build or !$locFile) {
  532. print STDERR "ERROR missing build or locfile for Galaxy input\n";
  533. exit 1;
  534. }
  535. # read $locFile to get $nibDir (ignoring commets)
  536. open(LF, "< $locFile") || die "open($locFile): $!\n";
  537. while(<LF>) {
  538. s/#.*$//;
  539. s/(?:^\s+|\s+$)//g;
  540. next if (/^$/);
  541. my @t = split(/\t/);
  542. if ($t[0] eq $build) { $nibDir = $t[1]; }
  543. }
  544. close(LF);
  545. if ($nibDir eq 'Galaxy') {
  546. print STDERR "Failed to find sequence directory in locfile $locFile\n";
  547. }
  548. $nibDir .= "/$build.2bit"; #we want full path and filename
  549. }