/scripts/back/VenSeq.Back3.pl
Perl | 770 lines | 417 code | 80 blank | 273 comment | 43 complexity | 77c91617dde365712d59eaf906f4476b MD5 | raw file
- #!/usr/bin/perl -w
- #-----------------------------------------------------------+
- # |
- # VennSeq.pl |
- # |
- #-----------------------------------------------------------+
- # AUTHOR: James C. Estill |
- # CONTACT: jestill_at_sourceforge.net |
- # STARTED: 03/06/2007 |
- # UPDATED: 03/12/2007 |
- # |
- # SHORT DESCRIPTION: |
- # Creates a Venn Diagram comparing the overlap of sequence |
- # features along a query sequence. The purpose is to allow |
- # me to visualize the overlap of Repeat Databases or gene |
- # models along a BAC. |
- # |
- # DEPENDENCIES: |
- # -BioPERL |
- # -NCBI BLAST |
- # -Venn Master |
- # |
- # USAGE: |
- # |
- # |
- # NOTES: |
- # -All array indices start at one. |
- # This will facilitate parsing information from sequence |
- # features that are all indexed from 1 to L |
- # -q is quiet, Q is super quiet |
- #-----------------------------------------------------------+
- # To do. If no fasta file then the sequence length may
- # be passed as a variable from the command line.
print "The VennSeq Program has started.\n";

=head1 SETUP

Load the required modules and declare the file-scoped variables.

=cut

#-----------------------------+
# INCLUDES                    |
#-----------------------------+
use Getopt::Std;               # Get options from the command line
use Bio::SearchIO;             # Parse BLAST output
use Bio::SeqIO;                # Parse fasta sequence input file
use GD;                        # Draw using the GD program
                               # In case I want to draw a coverage map

#-----------------------------+
# SET VARIABLE SCOPE          |
#-----------------------------+
# User supplied settings (from the command line or the config file)
my ($Infile,                   # Path to sequence file in fasta format
    $Out,                      # Output file path
    $FeatDir,                  # Directory containing sequence features
    $Format);                  # Format of the sequence feature files;
                               # the planned values are
                               #  - blast or b
                               #  - tab or t
                               #  - gff or g

# Full length of the sequence that the features are being mapped onto
my $SeqLen;

# Feature file bookkeeping
my (@FeatFiles,                # Paths of the feature files that hold data
    @FeatFilesList,            # The original list of potential feature
                               # files; the zero length files are meant
                               # to be dropped from it, leaving only
                               # the files with data
    @EmptyFiles);              # Files with no hits (0 length files)
my $NumFeatFiles;              # The number of feature files

# Matrices, one row per feature file
my (@FeatMatrix,               # Count of feature occurrences
    @BinFeatMatrix,            # Presence/absence (binary) form of the
                               # feature matrix
    @FeatLabels,               # Category labels for the features
    @CrossTab);                # Cross table matrix

# Counters
my ($FileTestNum,              # Number of files tested so far
    $EmptyFileNum) = (0, 0);   # Number of empty files found so far
- =head1 GET VARIABLES
- Get variables from the command line.
- =cut
#-----------------------------------------------------------+
# COMMAND LINE VARIABLES                                    |
#-----------------------------------------------------------+
# The x and y options are accepted here but are not read anywhere
# else yet.  They may later be used as integer scale factors
# (greater than 1) for drawing a coverage map, possibly together
# with a "MAX" variable.
getopts('i:o:d:f:x:y:C:q', \%Options);

my $Usage = "\nVennSeq.pl -i InputFastaFile -o OutputFile\n".
    "-d FeatureFileDirecotry \n";
my $Config = $Options{C};
my $quiet = $Options{q};

# One entry for every required setting:
#   target variable, config file key, command line flag,
#   message when missing from the config file,
#   message when missing from the command line
my @Required = (
    [ \$Infile, "SeqIn", "i",
      "Config file must include a fasta file\n$Usage",
      "Command line must include an input fasta file path.\n$Usage\n" ],
    [ \$Out, "Out", "o",
      "Config file must include an output file\n$Usage",
      "Command line must include and output file path.\n$Usage\n" ],
    [ \$FeatDir, "FeatDir", "d",
      "Config file must include a sequence feature directory\n$Usage",
      "Command line must include a sequence feature directory\n$Usage\n" ],
    [ \$Format, "Format", "f",
      "Config file must indicate a feature file format\n$Usage",
      "Command line must include a feature file format\n$Usage\n" ],
);

if ($Config)
{
    #-----------------------------+
    # GET CONFIG FILE OPTIONS     |
    #-----------------------------+
    print "Varialbes will be loaded from the config file";
    foreach my $Item (@Required)
    {
        my ($Target, $Key, $Flag, $NoConfig, $NoOption) = @{$Item};
        ${$Target} = ParseConfigFile($Config, $Key) || die $NoConfig;
    }
}
else
{
    #-----------------------------+
    # GET COMMAND LINE OPTIONS    |
    #-----------------------------+
    foreach my $Item (@Required)
    {
        my ($Target, $Key, $Flag, $NoConfig, $NoOption) = @{$Item};
        ${$Target} = $Options{$Flag} || die $NoOption;
    }
}
#-----------------------------+
# CHECK FILE EXISTENCE        |
#-----------------------------+
# Stop here when the input is missing, before the output file is
# created (and any existing file at that path is truncated).
unless (-e $Infile)
{
    die "ERROR:The input sequence file could not be found:\n$Infile\n";
}

#-----------------------------+
# OPEN THE OUTPUT FILE        |
#-----------------------------+
# The OUT handle stays a bareword because later sections of the
# program print to it by that name.
open (OUT, '>', $Out) ||
    die "Could not open output file for writing at:\n$Out\n";

#-----------------------------+
# GET LENGTH OF THE SEQUENCE  |
# FROM THE INPUT FILE         |
#-----------------------------+
# The length sets the number of columns of the feature matrices.
# The input file must hold exactly one sequence.
my $seq_in = Bio::SeqIO->new('-file'   => "<$Infile",
                             '-format' => 'fasta') ||
    die "Can not open input file:\n$Infile\n";

my $NumSeq = 0;
while (my $inseq = $seq_in->next_seq)
{
    $NumSeq++;
    if ($NumSeq > 1)
    {
        die "ERROR. Input sequence file contains more then one sequence.\n";
    }
    $SeqLen = $inseq->length();
}

# Without a sequence there is no length to size the matrices with
if ($NumSeq == 0)
{
    die "ERROR. Input sequence file contains no sequences:\n$Infile\n";
}
- =head1 GET SEQUENCE FEATURES
- =cut
#-----------------------------------------------------------+
# GET SEQUENCE FEATURES                                     |
#-----------------------------------------------------------+
# Only the tab delimited BLAST format is handled.  Every non-empty
# file ending in "blo" in $FeatDir gets one row in @FeatMatrix; each
# column of that row counts the hits that cover that position of the
# query sequence.
if ($Format =~ "tab")
{
    #-----------------------------------------------------------+
    # TAB DELIM BLAST OUTPUT                                    |
    #-----------------------------------------------------------+
    # Currently only supporting the blo extension

    # Feature file paths are built by concatenation, so make sure
    # the directory part ends with a path separator.
    my $FeatDirPath = $FeatDir;
    $FeatDirPath .= "/" unless $FeatDirPath =~ m{/$};

    opendir(INDIR, $FeatDir) ||
        die "Can not open input directory:\n$FeatDir: $!";
    my @BloFiles = grep { /blo$/ } readdir(INDIR);
    closedir(INDIR);
    # readdir returns names in no particular order; sort them so
    # that the row order is the same from run to run
    @FeatFilesList = sort @BloFiles;

    #-----------------------------+
    # TEST FOR FILE LENGTH        |
    #-----------------------------+
    # Zero length files (no hits) are recorded in @EmptyFiles, all
    # other files are kept in @FeatFiles for processing.
    foreach my $FileTest (@FeatFilesList)
    {
        $FileTestNum++;
        my $FilePath = $FeatDirPath.$FileTest;
        my $FileSize = (-s $FilePath) || 0;
        print $FileTest." ".$FileSize."\n";

        if (-z $FilePath)    # If the file is zero length
        {
            # @EmptyFiles is indexed from one like the other arrays
            $EmptyFileNum++;
            $EmptyFiles[$EmptyFileNum][1] = $EmptyFileNum;
            $EmptyFiles[$EmptyFileNum][2] = $FileTestNum;
            $EmptyFiles[$EmptyFileNum][3] = $FileTest;
        }
        else
        {
            push @FeatFiles, $FileTest;
        }
    } # End of for each file in File test

    #-----------------------------+
    # PRINT OUT THE EMPTY FILES   |
    #-----------------------------+
    if ($EmptyFileNum > 0)
    {
        print "The following files had no BLAST hits:\n";
        for (my $i = 1; $i <= $EmptyFileNum; $i++)
        {
            print $EmptyFiles[$i][1]."\t";
            print $EmptyFiles[$i][2]."\t";
            print $EmptyFiles[$i][3]."\n";
        }
    }
    else
    {
        # I can get rid of this else when things are working
        print "Empty File num is: $EmptyFileNum\n";
    }

    $NumFeatFiles = @FeatFiles;
    if ($NumFeatFiles == 0)
    {
        print "No feature files of type \"$Format\" were found in:\n".
            "$FeatDir\n";
        exit;
    }
    else
    {
        print "$NumFeatFiles of type $Format were found in\n".
            "$FeatDir\n";
    }

    #-----------------------------+
    # FILL THE FEATURE MATRIX     |
    # WITH ZEROS                  |
    #-----------------------------+
    print "Initializing feature matrix and binary matrix with zeros\n";
    for (my $row = 1; $row <= $NumFeatFiles; $row++)
    {
        my $NumCol = 0;    # Number of columns, for debug purposes
        print "\tInitializing row $row...";
        for (my $col = 1; $col <= $SeqLen; $col++)
        {
            $FeatMatrix[$row][$col] = 0;
            $BinFeatMatrix[$row][$col] = 0;
            $NumCol++;
        }
        print "$NumCol cols.\n";
    }

    #-----------------------------+
    # FOR EACH OF THE FEATURE     |
    # FILES INCREMENT THE PROPER  |
    # ROW IN @FeatMatrix          |
    #-----------------------------+
    # Use uppercase Row and Col here to avoid
    # any problems with var names
    my $Row = 0;    # Each feature file has a unique row
    print "Loading data from feature files.\n";
    foreach my $IndFile (@FeatFiles)
    {
        $Row++;
        $FeatLabels[$Row] = GetShortName($IndFile);
        print "\tLoading data from feature file $FeatLabels[$Row]\n";

        my $FeatPath = $FeatDirPath.$IndFile;
        my $NumLines = 0;        # Number of lines in the feat file
        my $BinFeatCount = 0;    # Number of matrix cells incremented

        open (my $FeatIn, '<', $FeatPath) ||
            die "Can not open the feature file:\n$FeatPath\n";
        while (<$FeatIn>)
        {
            chomp;
            $NumLines++;

            # Ignore comment lines from -m 9 output.  The pattern is
            # anchored because subject ids may contain the # symbol.
            next if m/^\#/;
            # Blank lines carry no record
            next unless m/\S/;

            # The coordinates are read by position, so a record with
            # any other number of columns can not be trusted; report
            # it and move on to the next record.
            my @TabSplit = split(/\t/);
            if (@TabSplit != 12)
            {
                print "ERROR: The BLAST file has an unexpected\n".
                    "number of tab delimitedy columns.";
                next;
            }
            my ($QryId, $SubId, $PID, $Len,
                $MisMatch, $GapOpen,
                $QStart, $QEnd, $SStart, $SEnd,
                $EVal, $BitScore) = @TabSplit;

            #-----------------------------+
            # PRINT SOME FEATURE VALUES   |
            # FOR DEBUG PURPOSES          |
            #-----------------------------+
            # Just do this for the first few lines
            if (!$quiet && $NumLines < 3)
            {
                print "\n\t\t  LINE: $NumLines\n";
                print "\t\t   QRY: $QryId\n";
                print "\t\t   SUB: $SubId\n";
                print "\t\tQSTART: $QStart\n";
                print "\t\t  QEND: $QEnd\n\n";
            }

            # A hit may be reported with its query start greater than
            # its query end (presumably for reverse frame hits --
            # confirm against the BLAST output in use).  Put the
            # coordinates in ascending order so the hit is counted.
            ($QStart, $QEnd) = ($QEnd, $QStart) if $QStart > $QEnd;

            for (my $Col = $QStart; $Col <= $QEnd; $Col++)
            {
                $BinFeatCount++;
                $FeatMatrix[$Row][$Col]++;
            }
        } # End of while reading the feature file
        close $FeatIn;

        #-----------------------------+
        # REPORT NUMBER OF FEATURES   |
        # THAT WERE PROCESSED         |
        #-----------------------------+
        # FEAT is the number of lines read (comment lines included),
        # which for tab delimited blast is roughly the number of HSPs
        unless ($quiet)
        {
            print "\t\tFILE: $IndFile\n";
            print "\t\tFEAT: $NumLines\n";
            print "\t\t BIN: $BinFeatCount\n";
        }
    } # End of For each $IndFile in @FeatFiles
}
else
{
    print "A proper sequence feature file format was not selected.";
    exit;
} # END OF IF $Format
- =head1 FILL BINARY MATRIX
- Fill the binary matrix and print output for Venn Master
- =cut
#-----------------------------------------------------------+
# FILL THE BINARY MATRIX AND PRINT OUTPUT FOR VENN MASTER   |
#-----------------------------------------------------------+
# For every feature row, each sequence position with at least one
# hit is flagged in @BinFeatMatrix and written to the output file as
# "position<TAB>label".  The coverage of the row is then reported.
print "Filling the binary matrix.\n";
for (my $row = 1; $row <= $NumFeatFiles; $row++)
{
    print "\tLoading row $row\n";
    my $BinFeatLen = 0;      # Number of positions examined
    my $BinFeatOccur = 0;    # Number of positions with a feature
    for (my $col = 1; $col <= $SeqLen; $col++)
    {
        $BinFeatLen++;
        # Numeric comparison: does the feature matrix hold any hits
        # at this position
        if ($FeatMatrix[$row][$col] > 0)
        {
            $BinFeatOccur++;
            $BinFeatMatrix[$row][$col] = 1;
            print OUT "$col\t$FeatLabels[$row]\n";
        }
    } # End of for $col

    #-----------------------------+
    # REPORT THE COVERAGE OF THE  |
    # FEATURE AFTER TESTING FOR   |
    # DIVIDE BY ZERO              |
    #-----------------------------+
    if ($BinFeatOccur == 0)
    {
        print "\t\tERROR: The feature has zero length.\n";
    }
    else
    {
        # Coverage is reported as a fraction of the sequence length
        # (not a percentage), rounded to four decimal places
        my $BinFeatCov = sprintf("%.4f", $BinFeatOccur/$BinFeatLen);
        print "\t\tLEN: $BinFeatLen\tCOV: $BinFeatCov\n";
    }
} # End of for row
- =head1 CROSS TABLE
- Generate a Cross Table comparing sequence feature coverage.
- =cut
#-----------------------------------------------------------+
# GENERATE COMPARISON MATRIX @CrossTab                      |
#-----------------------------------------------------------+
# Builds and prints the NxN cross table, where N is the number of
# sequence feature sets being compared.  The cell in row i, column j
# is the fraction of the positions covered by set i that are also
# covered by set j; the diagonal shows the label of the set.
# This should be made an option, ie. don't do under quiet.
print "Generating the Cross tab\n";
print "\n";

#-----------------------------+
# CREATE SEPARATOR STRING AND |
# PRINT TOP OF CROSS TABLE    |
#-----------------------------+
my $Sep = "+".("--------+" x $NumFeatFiles);
print $Sep."\n";

#-----------------------------+
# PRINT BODY OF CROSS TABLE   |
#-----------------------------+
foreach my $SetOne (1 .. $NumFeatFiles)
{
    print "|";    # Left hand side of the cross tab
    foreach my $SetTwo (1 .. $NumFeatFiles)
    {
        # The diagonal holds the data source label, left justified
        # and padded to the width of a cell
        if ($SetOne == $SetTwo)
        {
            my $CellLabel = sprintf('%-8s', $FeatLabels[$SetOne]);
            print $CellLabel."|";
            next;
        }

        #-----------------------------+
        # CALCULATE CROSS TABLE RATIO |
        #-----------------------------+
        my $InFirst = 0;    # Positions covered by set one
        my $InBoth = 0;     # Positions covered by both sets
        foreach my $Pos (1 .. $SeqLen)
        {
            next unless $BinFeatMatrix[$SetOne][$Pos] == 1;
            $InFirst++;
            $InBoth++ if $BinFeatMatrix[$SetTwo][$Pos] == 1;
        }

        #-----------------------------+
        # LOAD RESULT TO CROSS TABLE  |
        # AFTER DIVIDE BY ZERO TEST   |
        #-----------------------------+
        # A set without any covered position gives the NULL case
        my $Cell = $InFirst > 0
            ? sprintf("%.4f", $InBoth/$InFirst)
            : " NULL ";
        $CrossTab[$SetOne][$SetTwo] = $Cell;
        print " ".$Cell." |";
    } # End of for set two

    print "\n";    # End of the row in the cross tab
    print $Sep."\n";
} # End of for set one

print "\n";    # Final new line at the very end of the cross tab
- =head1 OPEN VENN MASTER
- Open the Venn master program and display the results.
- =cut
- #-----------------------------------------------------------+
- # OPEN VENN MASTER TO DISPLAY VENN DIAGRAMS |
- #-----------------------------------------------------------+
- =head1 COVERAGE DIAGRAM & HEAT MAP
- Create an image that represents the coverage diagram and a
- heat map of the distribution of sequence features.
- =cut
- #-----------------------------------------------------------+
- # CREATE COVERAGE DIAGRAM AS A HEAT MAP |
- #-----------------------------------------------------------+
- # This can be made to be an option
- =head1 SUBFUNCTIONS
- Subfunctions for the program
- =cut
- #-----------------------------------------------------------+
- # SUBFUNCTIONS |
- #-----------------------------------------------------------+
sub ParseConfigFile
{
    #-----------------------------------------------------------+
    # Parse a configuration text file to get the value of a     |
    # variable that would normally be set at the command line.  |
    # The whole file is read on every call; this is not the     |
    # fastest way to do it but it keeps the function reusable.  |
    #                                                           |
    # ARGUMENTS:                                                |
    #   $_[0]  Path to the config file                          |
    #   $_[1]  Name of the variable to look up                  |
    # RETURNS:                                                  |
    #   The second whitespace delimited field of the last line  |
    #   whose first field is exactly the variable name, or      |
    #   undef when there is no such line.                       |
    # DIES:                                                     |
    #   When the config file can not be opened.                 |
    #-----------------------------------------------------------+
    my ($ConfigFilePath, $VarName) = @_;
    my $VarValue;

    open (my $ConFile, '<', $ConfigFilePath) ||
        die "Could not open config file:\n\t$ConfigFilePath";

    while (my $Line = <$ConFile>)
    {
        chomp $Line;

        # Only lines that START with # are comments; a # further
        # along the line may be part of a value such as a path
        next if $Line =~ m/^\s*\#/;

        my @SplitLine = split(' ', $Line);
        next unless @SplitLine;    # Blank line

        # Compare the whole name so that looking up "Out" does not
        # pick up the value of a longer name that contains it
        if ($SplitLine[0] eq $VarName)
        {
            $VarValue = $SplitLine[1];
        }
    }
    close $ConFile;

    return $VarValue;
}
sub GetShortName
{
    #-----------------------------------------------------------+
    # CONVERT THE NAME TO A SHORTENED FORM                      |
    #-----------------------------------------------------------+
    # Returns the short label that is used for a feature file in
    # the CrossTab table and in the Venn Master output.
    #
    # ARGUMENTS:
    #   $_[0]  Input name string, normally a feature file name
    # RETURNS:
    #   The label of the first database tag in the table below
    #   that occurs anywhere in the input name, or "UNK" when no
    #   tag occurs (or the name is undefined).
    #
    # NOTE:
    # This is a kluge that will only work for the BLAST databases
    # that I am working with in wheat (03/12/2007 - JCE)
    # The labels were meant to fit in 8 characters but several are
    # longer.  Every database has its own label, since two sets
    # with the same label can not be told apart in the output.
    my $InName = $_[0];    # Input name string
    return "UNK" unless defined $InName;

    # Database tag => short label.  The order matters, the first
    # tag found in the name wins.  The table lives inside the sub
    # because the sub is called from code that runs before any
    # statement placed down here at file level would be executed.
    my @ShortNames = (
        #-----------------------------+
        # REPEAT DATABASES            |
        #-----------------------------+
        [ "TREP9_nr",      "TREP9nr"       ],
        [ "TREP9_total",   "TREP9tot"      ],
        [ "mips_REdat",    "MIPS"          ],
        [ "TIGR_Gram",     "TIGRGram"      ],
        [ "RB_pln",        "RB_pln"        ],
        [ "TATrans",       "TATran"        ],
        [ "Wessler",       "Wessler "      ],  # trailing space kept as is
        [ "SanMiguel",     "SanMig"        ],
        #-----------------------------+
        # EST: TIGR GENE INDICES      |
        #-----------------------------+
        [ "TaGI_10",       "TaGI_10"       ],
        [ "AtGI_13",       "AtGI_13"       ],
        [ "ZmGI_17",       "ZmGI_17"       ],
        [ "SbGI_8",        "SbGI_8"        ],
        [ "OsGI_17",       "OsGI_17"       ],
        [ "HvGI_9",        "HvGI_9"        ],
        #-----------------------------+
        # EST: TIGR TAs               |
        #-----------------------------+
        [ "AcTA_1",        "AcTA_1"        ],
        [ "AsTA_1",        "AsTA_1"        ],
        [ "AvenSatTA_2",   "AvenSaTa"      ],
        [ "CdTA_2",        "CdTA_2"        ],
        [ "FaTA_2",        "FaTA_2"        ],
        [ "HvTA_2",        "HvTA_2"        ],
        [ "OsTA_2",        "OsTA_2"        ],
        [ "PgTA_2",        "PgTA_2"        ],
        [ "SbTA_2",        "SbTA_2"        ],
        [ "ShTA_2",        "ShTA_2"        ],
        [ "SoTA_2",        "SoTA_2"        ],
        [ "SpTA_2",        "SpTA_2"        ],
        [ "TaTA_2",        "TaTA_2"        ],
        [ "TmTA_2",        "TmTA_2"        ],
        [ "TtTA_1",        "TtTA_1"        ],
        [ "ZmB73TA_1",     "ZmB73TA_1"     ],
        #-----------------------------+
        # EST: NCBI UniGenes          |
        #-----------------------------+
        [ "AtUniGene_56",  "AtUn_56"       ],
        [ "HvUniGene_47",  "HvUn_47"       ],
        [ "OsUniGene_65",  "OsUn_65"       ],
        [ "SbUniGene_21",  "SbUn_21"       ],
        [ "SoUniGene_8",   "SoUn_8"        ],
        [ "TaUniGene_46",  "TaUn_46"       ],
        [ "ZmUniGene_61",  "ZmUn_61"       ],
        #-----------------------------+
        # EST: PLANT GDB PUTS         |
        #-----------------------------+
        [ "AsPUT_157",     "AsPUT_157"     ],
        [ "AtPUT_157",     "AtPUT_157"     ],
        [ "HvPUT_157",     "HvPUT_157"     ],
        [ "OsIndPUT_157",  "OsIndPUT"      ],
        [ "OsJap_157",     "OsJapPUT"      ],
        [ "OsPut_157",     "OsPUT_157"     ],
        [ "SbPUT_157",     "SbPUT_157"     ],
        [ "SpPUT_157",     "SpPUT_157"     ],
        [ "TaPUT_157",     "TaPUT_157"     ],
        [ "TmPUT_157",     "TmPUT_157"     ],
        [ "ZmPUT_157",     "ZmPUT_157"     ],
        #-----------------------------+
        # PLANT PROTEIN DATABASES:    |
        # FINISHED GENOMES, BLASTX    |
        #-----------------------------+
        [ "TAIR6",         "TAIR6"         ],
        [ "Os_5_cds_pep",  "OS_5_cds_pep"  ],
        [ "Os_5_RAP1_pep", "OS_5_RAP1_pep" ],
    );

    foreach my $Entry (@ShortNames)
    {
        my ($Tag, $ShortName) = @{$Entry};
        # The tags are plain strings, so a substring search is all
        # that is needed
        return $ShortName if index($InName, $Tag) >= 0;
    }

    #-----------------------------+
    # UNKNOWN DATABASES           |
    #-----------------------------+
    # Unknown names are flagged instead of returning the input name
    return "UNK";
}
- =head1 HISTORY
- The history of modifications of this file.
- =cut
- #-----------------------------------------------------------+
- # HISTORY |
- #-----------------------------------------------------------+
- # 03/06/2007
- # - Program started
- # - Included ParseConfigFile from previous work
- # - Main body of program written
- # Reading input fasta file and sequence features and
- # loading to the FeatMatrix 2d array.
- # - Printing output to screen for testing purposes
- # - Ability to read tab delim blast output
- #
- # 03/10/2007
- # - Working with test tab delim blast output
- # - Adding a coverage calculation and report
- # - Adding quiet flag to turn off debug print information
- # - Fixed problem parsing the tab delimited BLAST files:
- # I was using The regular expression: (m/\#.*/) to ignore
- # comment lines. However this will ignore any line that
- # CONTAINS the # symbol. Changing the reg exp to
- # (m/^\#.*/) fixes the problem. This is a particular issue with
- # BLAST databases that have been formatted in the
- # RepBase format. (ie EnSpm1_TD#EnSpm)
- # - Added CrossTab function
- #
- # 03/12/2007
- # - Added GetShortName subfunction
- # - Got GetShortName working for Repeat databases and
- # for all other BLAST databases used for the
- # Wheat Annotation project
- # - Added the EmptyFiles array. This will prevent
- # wasting time doing counts for zero length files
- =head1 TO DO
- To do list
- =cut
- #-----------------------------------------------------------+
- # TO DO LIST
- #-----------------------------------------------------------+
- # - Use GD to draw image of feature coverage and richness
- # - TEST CMD IS:
- # ./VennSeq.pl -i /home/jestill/projects/wheat_annotation/VennTest/HEX0075G21.fasta.masked -o /home/jestill/projects/wheat_annotation/VennTest/TestOutTwo.txt -d /home/jestill/projects/wheat_annotation/VennTest/ -f tab