PageRenderTime 58ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/scripts/back/VenSeq.Back3.pl

http://dawg-paws.googlecode.com/
Perl | 770 lines | 417 code | 80 blank | 273 comment | 43 complexity | 77c91617dde365712d59eaf906f4476b MD5 | raw file
  1. #!/usr/bin/perl -w
  2. #-----------------------------------------------------------+
  3. # |
  4. # VennSeq.pl |
  5. # |
  6. #-----------------------------------------------------------+
  7. # AUTHOR: James C. Estill |
  8. # CONTACT: jestill_at_sourceforge.net |
  9. # STARTED: 03/06/2007 |
  10. # UPDATED: 03/12/2007 |
  11. # |
  12. # SHORT DESCRIPTION: |
  13. # Creates a Venn Diagram comparing the overlap of sequence |
  14. # features along a query sequence. The purpose is to allow |
  15. # me to visualize the overlap of Repeat Databases or gene |
  16. # models along a BAC. |
  17. # |
  18. # DEPENDENCIES: |
  19. # -BioPERL |
  20. # -NCBI BLAST |
  21. # -Venn Master |
  22. # |
  23. # USAGE: |
  24. # |
  25. # |
  26. # NOTES: |
  27. # -All array indices start at one. |
  28. # This will facilitate parsing information from sequence |
  29. # features that are all indexed from 1 to L |
  30. # -q is quiet, Q is super quiet |
  31. #-----------------------------------------------------------+
  32. # To do. If no fasta file then the sequence length may
  33. # be passed as a variable from the command line.
  34. print "The VennSeq Program has started.\n";
  35. =head1 SETUP
  36. Add includes and set variable scope
  37. =cut
  38. #-----------------------------+
  39. # INCLUDES |
  40. #-----------------------------+
  41. use Getopt::Std; # Get options from the command line
  42. use Bio::SearchIO; # Parse BLAST output
  43. use Bio::SeqIO; # Parse fasta sequence input file
  44. use GD; # Draw using the GD program
  45. # In case I want to draw a coverage map
  46. #-----------------------------+
  47. # SET VARIABLE SCOPE |
  48. #-----------------------------+
  49. my $Infile; # Path to sequence file in fasta format
  50. my $Out; # Output file path
  51. my $FeatDir; # Directory containing sequence features
  52. my $Format; # Format of the sequence feature files
  53. # - blast or b
  54. # - tab or t
  55. # - gff or g
  56. my $SeqLen; # Full length of the sequence that the
  57. # features are being mapped onto
  58. my @FeatFiles; # Array to hold the path of the
  59. # feature files.
  60. my @FeatFilesList; # The original list of potential feature
  61. # files. The zero length files will
  62. # be ignored leaving only files with data
  63. my @EmptyFiles; # Files with no hits (0 length files)
  64. my $NumFeatFiles; # The number of feature files
  65. my @FeatMatrix; # Count of feature occurrences
  66. my @BinFeatMatrix; # Presence/abscence binary form of the
  67. # feature matrix
  68. my @FeatLabels; # Category labels for the features
  69. # this will take the
  70. my @CrossTab; # Cross table matrix
  71. my $FileTestNum = 0; # Initialize counter of file test number
  72. my $EmptyFileNum = 0; # Initialize counter of empty files
  73. =head1 GET VARIABLES
  74. Get variables from the command line.
  75. =cut
  76. #-----------------------------------------------------------+
  77. # COMMAND LINE VARIABLES |
  78. #-----------------------------------------------------------+
  79. getopts('i:o:d:f:x:y:C:q', \%Options);
  80. # x and y will possibly be used later to draw coverage map
  81. # using x and y scale factor where x and y are integers
  82. # greater then 1.
  83. # may also use a "MAX" varialble
  84. my $Usage = "\nVennSeq.pl -i InputFastaFile -o OutputFile\n".
  85. "-d FeatureFileDirecotry \n";
  86. my $Config = $Options{C};
  87. my $quiet = $Options{q};
  88. if ($Config)
  89. {
  90. print "Varialbes will be loaded from the config file";
  91. #-----------------------------+
  92. # GET CONFIG FILE OPTIONS |
  93. #-----------------------------+
  94. $Infile = &ParseConfigFile($Config,"SeqIn")
  95. || die "Config file must include a fasta file\n$Usage";
  96. $Out = &ParseConfigFile($Config,"Out")
  97. || die "Config file must include an output file\n$Usage";
  98. $FeatDir = &ParseConfigFile($Config,"FeatDir")
  99. || die "Config file must include a sequence feature directory\n$Usage";
  100. $Format = &ParseConfigFile($Config,"Format")
  101. || die "Config file must indicate a feature file format\n$Usage";
  102. }else{
  103. #-----------------------------+
  104. # GET COMMAND LINE OPTIONS |
  105. #-----------------------------+
  106. $Infile = $Options{i} ||
  107. die "Command line must include an input fasta file path.\n$Usage\n";
  108. $Out = $Options{o} ||
  109. die "Command line must include and output file path.\n$Usage\n";
  110. $FeatDir = $Options{d} ||
  111. die "Command line must include a sequence feature directory\n$Usage\n";
  112. $Format = $Options{f} ||
  113. die "Command line must include a feature file format\n$Usage\n";
  114. }
  115. #-----------------------------+
  116. # CHECK FILE EXISTENCE |
  117. #-----------------------------+
  118. unless (-e $Infile)
  119. {
  120. print "ERROR:The input sequence file could not be found:\n$Infile\n";
  121. }
  122. #-----------------------------+
  123. # OPEN THE OUTPUT FILE |
  124. #-----------------------------+
  125. open (OUT, ">$Out") ||
  126. die "Could not open output file for writing at:\n$Out\n";
  127. #-----------------------------+
  128. # GET LENGTH OF THE SEQUENCE |
  129. # FROM THE INPUT FILE |
  130. #-----------------------------+
  131. # This will be used to determine the length of the array
  132. # This includes an internal check for only one sequence
  133. # in the input file.
  134. my $seq_in = Bio::SeqIO->new('-file' => "<$Infile",
  135. '-format' => 'fasta') ||
  136. die "Can not open input file:n\$Infile\n";
  137. my $NumSeq = 0;
  138. while (my $inseq = $seq_in->next_seq)
  139. {
  140. $NumSeq++;
  141. $SeqLen = $inseq->length();
  142. if ($NumSeq >> 1)
  143. {
  144. print "ERROR. Input sequence file contains more then one sequence.\n";
  145. exit;
  146. }
  147. }
  148. =head1 GET SEQUENCE FEATURES
  149. =cut
  150. #-----------------------------------------------------------+
  151. # GET SEQUENCE FEATURES |
  152. #-----------------------------------------------------------+
  153. if ($Format =~ "tab")
  154. {
  155. #-----------------------------------------------------------+
  156. # TAB DELIM BLAST OUTPUT |
  157. #-----------------------------------------------------------+
  158. # Currently only supporting the blo extension
  159. opendir(INDIR, $FeatDir) ||
  160. die "Can not open input directory:\n$FeatDir: $!";
  161. @FeatFilesList = grep /blo$/, readdir INDIR;
  162. #-----------------------------+
  163. # TEST FOR FILE LENGTH |
  164. #-----------------------------+
  165. # Check for files with no hits here by checking for zero length
  166. #
  167. # Can load indexes to delete ??
  168. foreach my $FileTest (@FeatFilesList)
  169. {
  170. $FileTestNum++;
  171. $FileSize = -s $FeatDir.$FileTest || 0;
  172. print $FileTest." ".$FileSize."\n";
  173. #LOAD ZERO LENGTH FILE TO EmptyFiles array
  174. # if ($FileSize == 0)
  175. if (-z $FeatDir.$FileTest) # If the file is zero length
  176. {
  177. $EmptyFileNum++;
  178. $EmptyFiles[$EmptyFileNum][1] = $EmptyFileNum;
  179. $EmptyFiles[$EmptyFileNum][2] = $FileTestNum;
  180. $EmptyFiles[$EmptyFileNum][3] = $FileTest;
  181. }else{
  182. push;
  183. }# End of if FileSize = 0
  184. } # End of for each file in File test
  185. #-----------------------------+
  186. # PRINT OUT THE EMPTY FILES |
  187. #-----------------------------+
  188. # This is a place to drop empty files from the array
  189. if ($EmptyFileNum > 0)
  190. {
  191. print "The following files had no BLAST hits:\n";
  192. for (my $i++; $i<=$EmptyFileNum;$i++)
  193. {
  194. print $EmptyFiles[$i][1]."\t";
  195. print $EmptyFiles[$i][2]."\t";
  196. print $EmptyFiles[$i][3]."\n";
  197. }
  198. }else{
  199. # I can get rid of this else when things are working
  200. print "Empty File num is: $EmptyFileNum\n";
  201. }
  202. # Temp exit while I test working with file size
  203. exit;
  204. $NumFeatFiles = @FeatFiles;
  205. if ($NumFeatFiles == 0)
  206. {
  207. print "No feature files of type \"$Format\" were found in:\n".
  208. "$FeatDir\n";
  209. exit;
  210. }else{
  211. print "$NumFeatFiles of type $Format were found in\n".
  212. "$FeatDir\n";
  213. # temp exit for debug
  214. #exit;
  215. }
  216. #-----------------------------+
  217. # FILL THE FEATURE MATRIX |
  218. # WITH ZEROS |
  219. #-----------------------------+
  220. print "Initializing feature matrix and binary matrix with zeros\n";
  221. for (my $row=1; $row<=$NumFeatFiles; $row++)
  222. {
  223. my $NumCol=0; # Varialbe to count number of cols
  224. # for debug purpsoses
  225. print "\tInitializing row $row...";
  226. for (my $col=1; $col<=$SeqLen; $col++)
  227. {
  228. $FeatMatrix[$row][$col] = 0;
  229. $BinFeatMatrix[$row][$col] = 0;
  230. $NumCol++;
  231. }
  232. print "$NumCol cols.\n";
  233. }
  234. #-----------------------------+
  235. # FOR EACH OF THE FEATURE |
  236. # FILES INCREMENT THE PROPER |
  237. # ROW IN @FeatMatrix |
  238. #-----------------------------+
  239. # Use uppercase Row and Col here to avoid
  240. # any problems with var names
  241. my $Row = 0; # Reset row count to zero
  242. # Each feature has a unique row
  243. print "Loading data from feature files.\n";
  244. foreach my $IndFile (@FeatFiles)
  245. {
  246. $Row++; # Increment row for each file
  247. $FeatLabels[$Row] = &GetShortName($IndFile);
  248. print "\tLoading data from feature file $FeatLabels[$Row]\n";
  249. my $FeatPath = $FeatDir.$IndFile;
  250. my $NumLines = 0; # Number of lines in the feat file
  251. my $BinFeatCount = 0; # Counter for the "binary" features
  252. open (FEATIN, "<$FeatPath") ||
  253. die "Can not open the feature file:\n$FeatPath\n";
  254. while (<FEATIN>)
  255. {
  256. chomp;
  257. $NumLines++;
  258. # Use unless to ignore comment lines from -m 9 output
  259. unless (m/^\#.*/)
  260. {
  261. # TEST THAT THE LENGTH OF THE ARRAY IS
  262. # WHAT IS EXPECT
  263. my @TabSplit = split(/\t/);
  264. my $TestLen = @TabSplit;
  265. if ($TestLen > '12')
  266. {
  267. print "ERROR: The BLAST file has an unexpected\n".
  268. "number of tab delimitedy columns.";
  269. #exit;
  270. # Currently just skip this record and move forward
  271. }
  272. my ($QryId, $SubId, $PID, $Len,
  273. $MisMatch, $GapOpen,
  274. $QStart,$QEnd, $SStart, $SEnd,
  275. $EVal, $BitScore) = split(/\t/);
  276. #-----------------------------+
  277. # PRINT SOME FEATURE VALUES |
  278. # FOR DEBUG PURPOSES |
  279. #-----------------------------+
  280. # Just do this for the first few hits
  281. unless ($quiet)
  282. {
  283. if ($NumLines < '3')
  284. {
  285. print "\n\t\t LINE: $NumLines\n";
  286. print "\t\t QRY: $QryId\n";
  287. print "\t\t SUB: $SubId\n";
  288. print "\t\tQSTART: $QStart\n";
  289. print "\t\t QEND: $QEnd\n\n";
  290. } # End of If NumLines less then three
  291. } # End of unless quiet
  292. for (my $Col=$QStart; $Col<=$QEnd; $Col++)
  293. {
  294. $BinFeatCount++;
  295. $FeatMatrix[$Row][$Col] = $FeatMatrix[$Row][$Col] + 1;
  296. } # End of for my $col from Query Start to End
  297. } # End of unless # (unless comment line)
  298. } # End of while FEATIN
  299. close FEATIN;
  300. #-----------------------------+
  301. # REPORT NUMBER OF FEATURES |
  302. # THAT WERE PROCESSED |
  303. #-----------------------------+
  304. # For tab delimited blast the number of features
  305. # is roughly equal to the number of lines == number of HSPS
  306. #print "\t\tFILE: $FeatPath\n";
  307. unless ($quiet)
  308. {
  309. print "\t\tFILE: $IndFile\n";
  310. print "\t\tFEAT: $NumLines\n";
  311. print "\t\t BIN: $BinFeatCount\n";
  312. } # End of unless quiet
  313. } # End of For each $IndFile in @FeatFiles
  314. }else{
  315. print "A proper sequence feature file format was not selected.";
  316. exit;
  317. } # END OF IF $Format
  318. =head1 FILL BINARY MATRIX
  319. Fill the binary matrix and print output for Venn Master
  320. =cut
  321. #-----------------------------------------------------------+
  322. # FILL THE BINARY MATRIX AND PRINT OUTPUT FOR VENN MASTER |
  323. #-----------------------------------------------------------+
  324. print "Filling the binary matrix.\n";
  325. for ($row=1; $row<=$NumFeatFiles; $row++)
  326. #for ($row=1; $row<=3; $row++)
  327. {
  328. print "\tLoading row $row\n";
  329. my $BinFeatLen = 0; # Binary feature length
  330. my $BinFeatOccur = 0; # Occurrence of features
  331. for ($col=1; $col<=$SeqLen; $col++)
  332. {
  333. $BinFeatLen++;
  334. # If the feature matrix contains data
  335. if ($FeatMatrix[$row][$col] >> 0)
  336. {
  337. $BinFeatOccur++;
  338. $BinFeatMatrix[$row][$col] = 1;
  339. # Print to screen first to see if this actually
  340. # does work
  341. print OUT "$col\t$FeatLabels[$row]\n";
  342. # print "$col\t$FeatLabels[$row]\n";
  343. }
  344. } # End of for $col
  345. #-----------------------------+
  346. # REPORT THE PERCENT COVERAGE |
  347. # OF THE FEATURE AFTER |
  348. # TESTING FOR DIVIDE BY ZERO |
  349. #-----------------------------+
  350. if ($BinFeatOccur == 0)
  351. {
  352. print "\t\tERROR: The feature has zero length.\n";
  353. } else {
  354. # Calculate feature coverage
  355. my $BinFeatCov = $BinFeatOccur/$BinFeatLen;
  356. # Convert feature coverage to percent
  357. $BinFeatCov = sprintf("%.4f",$BinFeatCov);
  358. print "\t\tLEN: $BinFeatLen\tCOV: $BinFeatCov\n"
  359. }
  360. } # End of for row
  361. =head1 CROSS TABLE
  362. Generate a Cross Table comparing sequence feature coverage.
  363. =cut
  364. #-----------------------------------------------------------+
  365. # GENERATE COMPARISON MATRIX @CrossTab |
  366. #-----------------------------------------------------------+
  367. # This should be made and option, ie. don't do under quiet
  368. # Generates the NxN CrossTab matrix
  369. # Where N is the number of sequence features being comparedh zeros
  370. print "Generating the Cross tab\n";
  371. print "\n";
  372. #-----------------------------+
  373. # PRINT TABLE HEADER |
  374. #-----------------------------+
  375. #-----------------------------+
  376. # CREATE SEPERATOR STRING AND |
  377. # PRINT TOP OF CROSS TABLE |
  378. #-----------------------------+
  379. my $Sep = "+";
  380. for (my $i=1; $i<=$NumFeatFiles ; $i++)
  381. {$Sep = $Sep."--------+";}
  382. print $Sep."\n";
  383. #-----------------------------+
  384. # PRINT BODY OF CROSS TABLE |
  385. #-----------------------------+
  386. for (my $i=1; $i<=$NumFeatFiles ; $i++)
  387. {
  388. print "|"; # Print the left hand side of the cross tab
  389. for (my $j=1; $j<=$NumFeatFiles ; $j++)
  390. {
  391. my $SharedCount=0; # Initialize the count of shared
  392. my $iCount=0; # Initialize the i feature count
  393. my $CrossRatio; # Set scope of the cross ratio
  394. if ($i != $j)
  395. {
  396. #-----------------------------+
  397. # CALCULATE CROSS TABLE RATIO |
  398. #-----------------------------+
  399. # k is the sequence position
  400. for (my $k=1; $k<=$SeqLen; $k++)
  401. {
  402. if ($BinFeatMatrix[$i][$k] == 1)
  403. {
  404. # Increment the i feature count
  405. $iCount++;
  406. if ($BinFeatMatrix[$j][$k] == 1)
  407. {
  408. # Increment the shared feature count
  409. $SharedCount++;
  410. } # End of if j = 1 in binary matrix
  411. } # End of if i = 1 in binary matrix
  412. } # End of for k
  413. #-----------------------------+
  414. # LOAD RESULT TO CROSS TABLE |
  415. # AFTER DIVIDE BY ZERO TEST |
  416. #-----------------------------+
  417. if ($iCount >0)
  418. {
  419. $CrossRatio = $SharedCount/$iCount;
  420. $CrossRatio = sprintf("%.4f",$CrossRatio);
  421. }else{
  422. # If if = 0 this is the NULL case
  423. $CrossRatio = " NULL ";
  424. } # End of if $iCount > 0
  425. # Load to the Cross Tab Array
  426. $CrossTab[$i][$j]=$CrossRatio;
  427. # To print in table format
  428. print " ".$CrossRatio." |";
  429. }else{
  430. # If i = j then print the data source
  431. # label in the table, this will need to
  432. # be formatted to fit in the box
  433. # - sign below indicates left justification
  434. my $DatSrcLab = sprintf ('%-*s', 8, $FeatLabels[$i]);
  435. print $DatSrcLab."|";
  436. }
  437. } # End of for j (Seq feature set two)
  438. print "\n"; # Print new line for end of row in cross tab
  439. print $Sep."\n";
  440. } # End of for i (Seq feature set one)
  441. #print $Sep."\n";
  442. print "\n"; # Print final new row at the very end of cross tab
  443. =head1 OPEN VENN MASTER
  444. Open the Venn master program and display the results.
  445. =cut
  446. #-----------------------------------------------------------+
  447. # OPEN VENN MASTER TO DISPLAY VENN DIAGRAMS |
  448. #-----------------------------------------------------------+
  449. =head1 COVERAGE DIAGRAM & HEAT MAP
  450. Create an image that represents the coverage diagram and a
  451. heat map of the distribution of sequence features.
  452. =cut
  453. #-----------------------------------------------------------+
  454. # CREATE COVERAGE DIAGRAM AS A HEAT MAP |
  455. #-----------------------------------------------------------+
  456. # This can me made to be an option
  457. =head1 SUBFUNCTIONS
  458. Subfunctions for the program
  459. =cut
  460. #-----------------------------------------------------------+
  461. # SUBFUNCTIONS |
  462. #-----------------------------------------------------------+
  463. sub ParseConfigFile
  464. {
  465. #-----------------------------------------------------------+
  466. # This will parase a configuration text file to set |
  467. # variables that would normally be set at the command line |
  468. # This is not the fastest way to parse the config file |
  469. # but this makes the subfuction reusable. |
  470. # If the variable name occurs more then once in the text |
  471. # file, the last occurrence will be used. |
  472. #-----------------------------------------------------------+
  473. my $ConfigFilePath = $_[0];
  474. my $VarName = $_[1];
  475. my $VarValue;
  476. open (CONFILE, $ConfigFilePath) ||
  477. die "Could not open config file:\n\t$ConfigFilePath";
  478. while (<CONFILE>)
  479. {
  480. chomp; # Remove newline character
  481. unless (m/\#.*/) # Ignore comment lines
  482. {
  483. my @SplitLine = split;
  484. if ($SplitLine[0] =~ $VarName){
  485. $VarValue = $SplitLine[1];}
  486. }
  487. }
  488. close CONFILE;
  489. return $VarValue;
  490. }
  491. sub GetShortName
  492. {
  493. #-----------------------------------------------------------+
  494. # CONVERT THE NAME TO A SHORTENED FORM |
  495. #-----------------------------------------------------------+
  496. # This will be used to write short names for the CrossTab
  497. # table or any other use of the name in the program. The short
  498. # form of the name will be limited to 8 characters
  499. # NOTE:
  500. # This is a kluge that will only work for the BLAST databases
  501. # that I am working with in wheat (03/12/2007 - JCE)
  502. my $InName = $_[0]; # Input name string
  503. my $OutName; # Set scope for the name to return
  504. #-----------------------------------------------------------+
  505. # REPEAT DATABASES |
  506. #-----------------------------------------------------------+
  507. if ($InName =~ m/TREP9_nr/)
  508. {
  509. $OutName = "TREP9nr";
  510. }elsif ($InName =~ m/TREP9_total/){
  511. $OutName = "TREP9tot";
  512. }elsif ($InName =~ m/mips_REdat/){
  513. $OutName = "MIPS";
  514. }elsif ($InName =~ m/TIGR_Gram/){
  515. $OutName = "TIGRGram";
  516. }elsif ($InName =~ m/RB_pln/){
  517. $OutName = "RB_pln";
  518. }elsif ($InName =~ m/TATrans/){
  519. $OutName = "TATran";
  520. }elsif ($InName =~ m/Wessler/){
  521. $OutName = "Wessler ";
  522. }elsif ($InName =~ m/SanMiguel/){
  523. $OutName = "SanMig";
  524. #-----------------------------------------------------------+
  525. # EST DATABASES |
  526. #-----------------------------------------------------------+
  527. # TIGR GENE INDICES
  528. }elsif ($InName =~ m/TaGI_10/){
  529. $OutName = "TaGI_10";
  530. }elsif ($InName =~ m/AtGI_13/){
  531. $OutName = "AtGI_13";
  532. }elsif ($InName =~ m/ZmGI_17/){
  533. $OutName = "ZmGI_17";
  534. }elsif ($InName =~ m/SbGI_8/){
  535. $OutName = "SbGI_8";
  536. }elsif ($InName =~ m/OsGI_17/){
  537. $OutName = "OsGI_17";
  538. }elsif ($InName =~ m/HvGI_9/){
  539. $OutName = "HvGI_17";
  540. #-----------------------------+
  541. # EST: TIGR TAs |
  542. #-----------------------------+
  543. }elsif ($InName =~ m/AcTA_1/){
  544. $OutName = "AcTA_1";
  545. }elsif ($InName =~ m/AsTA_1/){
  546. $OutName = "AsTA_1";
  547. }elsif ($InName =~ m/AsTA_1/){
  548. $OutName = "AsTA_1";
  549. }elsif ($InName =~ m/AvenSatTA_2/){
  550. $OutName = "AvenSaTa";
  551. }elsif ($InName =~ m/CdTA_2/){
  552. $OutName = "CeTA_2";
  553. }elsif ($InName =~ m/FaTA_2/){
  554. $OutName = "FaTA_2";
  555. }elsif ($InName =~ m/HvTA_2/){
  556. $OutName = "HvTA_2";
  557. }elsif ($InName =~ m/OsTA_2/){
  558. $OutName = "OsTA_2";
  559. }elsif ($InName =~ m/PgTA_2/){
  560. $OutName = "PgTA_2";
  561. }elsif ($InName =~ m/SbTA_2/){
  562. $OutName = "SbTA_2";
  563. }elsif ($InName =~ m/ShTA_2/){
  564. $OutName = "ShTA_2";
  565. }elsif ($InName =~ m/SoTA_2/){
  566. $OutName = "SoTA_2";
  567. }elsif ($InName =~ m/SpTA_2/){
  568. $OutName = "SpTA_2";
  569. }elsif ($InName =~ m/TaTA_2/){
  570. $OutName = "TaTA_2";
  571. }elsif ($InName =~ m/TmTA_2/){
  572. $OutName = "TmTA_2";
  573. }elsif ($InName =~ m/TtTA_1/){
  574. $OutName = "TtTA_1";
  575. }elsif ($InName =~ m/ZmB73TA_1/){
  576. $OutName = "ZmB71TA_1";
  577. #-----------------------------+
  578. # EST: NCBI UniGenes |
  579. #-----------------------------+
  580. }elsif ($InName =~ m/AtUniGene_56/){
  581. $OutName = "AtUn_56";
  582. }elsif ($InName =~ m/HvUniGene_47/){
  583. $OutName = "HvUn_47";
  584. }elsif ($InName =~ m/OsUniGene_65/){
  585. $OutName = "OsUn_65";
  586. }elsif ($InName =~ m/SbUniGene_21/){
  587. $OutName = "SbUn_21";
  588. }elsif ($InName =~ m/SoUniGene_8/){
  589. $OutName = "SoUn_8";
  590. }elsif ($InName =~ m/TaUniGene_46/){
  591. $OutName = "TaUn_46";
  592. }elsif ($InName =~ m/ZmUniGene_61/){
  593. $OutName = "ZmUn_61";
  594. #-----------------------------+
  595. # EST: PLANT GDB PUTS |
  596. #-----------------------------+
  597. }elsif ($InName =~ m/AsPUT_157/){
  598. $OutName = "AsPUT_157";
  599. }elsif ($InName =~ m/AtPUT_157/){
  600. $OutName = "AtPUT_157";
  601. }elsif ($InName =~ m/HvPUT_157/){
  602. $OutName = "HvPUT_157";
  603. }elsif ($InName =~ m/OsIndPUT_157/){
  604. $OutName = "OsIndPUT";
  605. }elsif ($InName =~ m/OsJap_157/){
  606. $OutName = "OsJapPUT";
  607. }elsif ($InName =~ m/OsPut_157/){
  608. $OutName = "OsPUT_157";
  609. }elsif ($InName =~ m/SbPUT_157/){
  610. $OutName = "SbPUT_157";
  611. }elsif ($InName =~ m/SpPUT_157/){
  612. $OutName = "SpPUT_157";
  613. }elsif ($InName =~ m/TaPUT_157/){
  614. $OutName = "TaPUT_157";
  615. }elsif ($InName =~ m/TmPUT_157/){
  616. $OutName = "TmPUT_157";
  617. }elsif ($InName =~ m/ZmPUT_157/){
  618. $OutName = "ZmPUT_157";
  619. #-----------------------------------------------------------+
  620. # PLANT PROTEIN DATABASES |
  621. #-----------------------------------------------------------+
  622. #-----------------------------------------------------------+
  623. # FINISHED GENOMES PROTEINS BLASTX |
  624. #-----------------------------------------------------------+
  625. }elsif ($InName =~ m/TAIR6/){
  626. $OutName = "TAIR6";
  627. }elsif ($InName =~ m/Os_5_cds_pep/){
  628. $OutName = "OS_5_cds_pep";
  629. }elsif ($InName =~ m/Os_5_RAP1_pep/){
  630. $OutName = "OS_5_cds_pep";
  631. #-----------------------------------------------------------+
  632. # UNIPROT |
  633. #-----------------------------------------------------------+
  634. }elsif ($InName =~ m/Os_5_RAP1_pep/){
  635. $OutName = "OS_5_cds_pep";
  636. #-----------------------------------------------------------+
  637. # UNKNOWN DATABASES |
  638. #-----------------------------------------------------------+
  639. # Use the input name
  640. # It may be better to flag this as UNK
  641. }else{
  642. $OutName = "UNK";
  643. #$OutName = $InName;
  644. }
  645. return $OutName;
  646. }
  647. =head1 HISTORY
  648. The history of modifications of this file.
  649. =cut
  650. #-----------------------------------------------------------+
  651. # HISTORY |
  652. #-----------------------------------------------------------+
  653. # 03/06/2007
  654. # - Program started
  655. # - Included ParseConfigFile from previous work
  656. # - Main body of program written
  657. # Reading input fasta file and sequence features and
  658. # loading to the FeatMatrix 2d array.
  659. # - Printing output to screen for testing purposes
  660. # - Ability to read tab delim blast output
  661. #
  662. # 03/10/2007
  663. # - Working with test tabl delim blast output
  664. # - Adding a coverage calculation and report
  665. # - Adding quiet flag to turn of debug print information
  666. # - Fixed problem parsing the tab delimited BLAST files:
  667. # I was using The regular expression: (m/\#.*/) to ignore
  668. # comment lines. However this will ignore any line that
  669. # CONTAINS the # symbol. Changing the reg exp to
  670. # (m/^\#.*/) fixes the problem. This is a particular issue with
  671. # BLAST databases that have been formatted in the
  672. # RepBase format. (ie EnSpm1_TD#EnSpm)
  673. # - Added CrosTab function
  674. #
  675. # 03/12/2007
  676. # - Added GetShortName subfunction
  677. # - Got GetShortName working for Repeat databases and
  678. # the for all other BLAST databases used for the
  679. # Wheat Annotation project
  680. # - Added the EmptyFiles array. This will prevent
  681. # wasting time doing counts for zero length files
  682. =head1 TO DO
  683. To do list
  684. =cut
  685. #-----------------------------------------------------------+
  686. # TO DO LIST
  687. #-----------------------------------------------------------+
  688. # - Use GD to draw image of feature coverage and richness
  689. # - TEST CMD IS:
  690. # ./VennSeq.pl -i /home/jestill/projects/wheat_annotation/VennTest/HEX0075G21.fasta.masked -o /home/jestill/projects/wheat_annotation/VennTest/TestOutTwo.txt -d /home/jestill/projects/wheat_annotation/VennTest/ -f tab