makeZoo3.pl | searchcode

/src/hg/makeDb/makeZoo3.pl

https://github.com/rpique/UCSC-Browser-code-add-ons
Perl | 324 lines | 85 code | 64 blank | 175 comment | 10 complexity | 43ad318b2429c26e63ef1b2ecc4b74dc MD5 | raw file

#!/usr/bin/perl -w
use strict;

#
# This script creates the zoo browser databases
# It runs in 2 large main loops. The first loop distributes input files and sets up
# blat jobs. It may need some babysitting or incremental running to make sure
# that it works correctly since it does a lot. Uncomment each track are you want to 
# create. The first loop works only when run on kk, because some of the data
# it references lives there.
# After you create the blat job input files you need to ssh to kk and run the blat jobs by hand.
# I have not scripted this because it requires lots of user intervention.
# After the blat jobs are done you can comment out the first loop and run the second loop.
# This loads the database with the blat results.
#




# Hash keyed by genus species, containing values indicating the project name
my %orgHash = (
    "Papio_cynocephalus_anubis" => "zooBaboon3",
    "Felis_catus" => "zooCat3",
    "Gallus_gallus" => "zooChicken3",
    "Pan_troglodytes" => "zooChimp3",
    "Bos_taurus" => "zooCow3",
    "Canis_familiaris" => "zooDog3",
    "Takifugu_rubripes" => "zooFugu3",
    "Homo_sapiens" => "zooHuman3",
    "Mus_musculus" => "zooMouse3", 
    "Sus_scrofa" => "zooPig3",
    "Rattus_norvegicus" => "zooRat3",
    "Tetraodon_nigroviridis" => "zooTetra3",
    "Danio_rerio" => "zooZebrafish3",       
               );

# Hash keyed by genus species, containing values indicating the common organism name
my %orgNameHash = (
    "Papio_cynocephalus_anubis" => "baboon",
    "Felis_catus" => "cat",
    "Gallus_gallus" => "chicken",
    "Pan_troglodytes" => "chimp",
    "Bos_taurus" => "cow",
    "Canis_familiaris" => "dog",
    "Takifugu_rubripes" => "fugu",
    "Homo_sapiens" => "human",
    "Mus_musculus" => "mouse", 
    "Sus_scrofa" => "pig",
    "Rattus_norvegicus" => "rat",
    "Tetraodon_nigroviridis" => "tetra",
    "Danio_rerio" => "zfish",
                   );

my $zooDir = "/cluster/store2/zoo3";
my $mrnaRoot = "/cluster/store1/mrna.130/org";
my $xenoRoot = "/cluster/store1/mrna.130/zoo/work";
my $refSeqDir = "/cluster/store1/mrna.130/refSeq";
my $org;
my $orgName;
my $dbName;
my $dirName;
my $blatDir;
my $workingDir;

# One time step - create all the databases
##run("mysql -u SECRET_USER -pBIG_SECRET -A < ${zooDir}/createDbs.sql\n");

# Set up the blat and repeat masker directories
##run("rm -rf work");
##run("mkdir -p work");
##run("mkdir -p work/blat");
##run("mkdir -p work/blatFish");
##run("mkdir -p work/simpleRepeat");
##run("mkdir -p work/xeno");
##run("mkdir -p work/refSeq");
##run("cp scripts/blatFish-gsub work/blatFish/gsub");
##run("cp scripts/simpleRepeat-gsub work/simpleRepeat/gsub");

for $org (keys(%orgHash)) {
    $dirName = $dbName = $orgHash{$org};
    $orgName = $orgNameHash{$org};

# pjt.pl is a one-off script for a prorietary manually curated data set 
#  I got from Pamela Jacques-Thomas
##    run("$zooDir/pjt.pl data/$orgName/${orgName}_target1.annot.ff ${orgName}_pjt");
##    run("hgsql $dbName < sql/dropCrap.sql");
##    run("hgLoadBed -noBin $dbName pjt_gene ${orgName}_pjt_all.bed");
##    run("hgLoadBed -noBin $dbName pjt_genscan ${orgName}_pjt_genscan.bed");
 
##    run("mkdir -p $dirName");
##    run("mkdir -p $dirName/bed");
##    run("mkdir -p $dirName/1");
##    run("mkdir -p $dirName/bed/rmskS");
##    run("cp $zooDir/data/${orgName}/repeat/${orgName}_target1.fasta.masked $dirName/1/target1.fa");
##    run("cp $zooDir/data/${orgName}/repeat/${orgName}_target1.fasta.out $dirName/bed/rmskS/target1.fa.out");
##    run("cp $zooDir/data/${orgName}/repeat/${orgName}_target1.fasta.masked $dirName/1/target1.fa");
##    run("cp $zooDir/data/${orgName}/repeat/${orgName}_target1.fasta.out $dirName/bed/rmskS/target1.fa.out");
    run("cp $zooDir/data/${orgName}/${orgName}_target1.agp $dirName/1/target1.agp");
    run("hgGoldGapGl $dbName -noGl $zooDir $dirName");
# Uncomment this next line only for a major db rebuild
# Add the trackDb database table

##    run("hgLoadRna drop $dbName");
##    run("hgTrackDb $dbName trackDb ~/kent/src/hg/lib/trackDb.sql ~/kent/src/hg/makeDb/hgTrackDb/hgRoot");

# Create the zooSeq ordering table
##    run("hgsql $dbName < sql/zooSeqOrder.sql");

##    run("hgLoadRna new $dbName");
##    run("hgLoadRna add -type=mRna $dbName $mrnaRoot/$org/mrna.fa $mrnaRoot/$org/mrna.ra");

##    if (-e "$refSeqDir/$orgName/refSeq.fa") {
##        run("hgLoadRna add -type=refSeq $dbName $refSeqDir/$orgName/refSeq.fa $refSeqDir/$orgName/refSeq.ra");
##     }

# If the file exists, load it
##    if (-e "$mrnaRoot/$org/est.fa") {
##       run("hgLoadRna add -type=EST $dbName $mrnaRoot/$org/est.fa $mrnaRoot/$org/est.ra");
##   } else {
##       print("No est file in $mrnaRoot/$org/est.fa\n");
##   }

## run("hgLoadRna add -type=xenoRna $dbName /cluster/store1/mrna.129/zoo/work/${orgName}XenoRna.fa /cluster/store1/mrna.129/zoo/work/${orgName}XenoRna.ra");
## run("hgLoadRna add -type=xenoEst $dbName /cluster/store1/mrna.129/zoo/work/${orgName}XenoEst.fa /cluster/store1/mrna.129/zoo/work/${orgName}XenoEst.ra");

##    foreach $blatDir ("mrna", "est") {
##        if (-e "$mrnaRoot/$org/$blatDir.fa") {
##           run("mkdir -p $dirName/bed/$blatDir");
##           run("mkdir -p $dirName/bed/$blatDir/psl");            
            # Make the mrna/est alignment BLAT spec file
##           run("echo /cluster/home/kent/bin/i386/blat -t=dna -q=rna -ooc=/cluster/home/kent/hg/h/11.ooc mask=lower $zooDir/$dirName/1/target1.fa $mrnaRoot/$org/${blatDir}.fa $zooDir/$dirName/bed/${blatDir}/psl/target1_${blatDir}.psl >> work/blat/spec");
##        }
##    }

# Set up refSeq track if a refSeq sequence exists
##        if (-e "$refSeqDir/$orgName/$orgName.fa") {
##           run("mkdir -p $dirName/bed/refSeq");
##           run("mkdir -p $dirName/bed/refSeq/psl");            
##           run("echo /cluster/home/kent/bin/i386/blat -t=dna -q=rna -ooc=/cluster/home/kent/hg/h/11.ooc mask=lower $zooDir/$dirName/1/target1.fa $refSeqDir/$orgName/refSeq.fa $zooDir/$dirName/bed/refSeq/psl/target1_refSeq.psl >> work/refSeq/spec");
##}

##    run("mkdir -p $dirName/bed/xenoRna");
##    run("mkdir -p $dirName/bed/xenoRna/psl");
##    run("mkdir -p $dirName/bed/xenoEst");
##    run("mkdir -p $dirName/bed/xenoEst/psl");
##    run("echo /cluster/home/kent/bin/i386/blat -t=dna -q=rna mask=lower {check in line+ $zooDir/$dirName/1/target1.fa} {check in line+ $xenoRoot/${orgName}XenoRna.fa} {check out line+  $zooDir/$dirName/bed/xenoRna/psl/target1_xenoRna.psl} >> work/xeno/spec");
##    run("echo /cluster/home/kent/bin/i386/blat -t=dna -q=rna mask=lower {check in line+ $zooDir/$dirName/1/target1.fa} {check in line+ $xenoRoot/${orgName}XenoEst.fa} {check out line+  $zooDir/$dirName/bed/xenoEst/psl/target1_xenoEst.psl} >> work/xeno/spec");

    
# Load repeat masker and GCPercent tracks
##   run("scripts/makeNib.csh $dirName");
##   run("hgLoadOut $dbName $zooDir/$dirName/bed/rmskS/*.fa.out");
##   run("mysql -u SECRET_USER -pBIG_SECRET -A $dbName  < ~/kent/src/hg/lib/chromInfo.sql");
##   run("hgNibSeq -preMadeNib $dbName $zooDir/$dirName/nib $zooDir/$dirName/?/*.fa");
##   run("hgGoldGapGl $dbName $zooDir/ $dirName -noGl");

    # Make and load GC percent table
##   run("mkdir -p $zooDir/$dirName/bed/gcPercent");
##   run("mysql -u SECRET_USER -pBIG_SECRET -A $dbName < ~/src/hg/lib/gcPercent.sql");
##   run("hgGcPercent $dbName $dirName/nib");
##   run("mv gcPercent.bed $dirName/bed/gcPercent");

##    run("mkdir -p $dirName/bed/blatFish");
##    run("mkdir -p $dirName/bed/blatFish/psl");
##    run("mkdir -p $dirName/bed/blatFish/chrom");
##    run("echo $zooDir/$dirName | wordLine stdin >> work/blatFish/genome.lst");

##    run("ls -1S $zooDir/$dirName/ >> work/simpleRepeat/genome.lst"); # HACK - need to generalize the gsub file
##    run("mkdir -p $dirName/bed/simpleRepeat");
##    run("mkdir -p $dirName/bed/simpleRepeat/trf");

##    run("gbToFaRa dna.fil $orgName.fa $orgName.ra $orgName.ta data/$orgName/${orgName}_target1.annot.ff");

##    if ($dbName eq "zooHuman3") {
##        run("hgLoadBed $dbName mcs_u $zooDir/data/bed/mcs_u.bed");
##        run("hgLoadBed $dbName mcs_b $zooDir/data/bed/mcs_b.bed");
##        run("hgLoadBed $dbName dhs $zooDir/data/bed/dhs.bed");
##    }
} # END OF FIRST WHILE LOOP

# The line below only works on kk - that's where the fish is stored
##run("ls -1S /scratch/hg/tetFish/*.fa > work/blatFish/blatFish.lst");
##run("gensub2 work/blatFish/genome.lst work/blatFish/blatFish.lst work/blatFish/gsub work/blatFish/spec");
##run("gensub2 work/simpleRepeat/genome.lst single work/simpleRepeat/gsub work/simpleRepeat/spec");









# NOW RUN BLAT BY HAND USING PARASOL
# Second main loop iteration
# This will work ONLY AFTER BLAT is run and you have results in the desired output dirs

for $org (keys(%orgHash)) {
    $dirName = $dbName = $orgHash{$org};
    $orgName = $orgNameHash{$org};

    # Process psl alignment files
#    run("cp $dirName/bed/mrna/psl/target1_mrna.psl $dirName/bed/mrna/chrom/target1_mrna.psl");

#    if (-e "$dirName/bed/est") {
#        run("cp $dirName/bed/est/psl/target1_est.psl $dirName/bed/est/chrom/target1_est.psl");
#    }
    
    $workingDir = "$dirName/bed/mrna";
##    run("pslSort dirs $workingDir/raw.psl /tmp $workingDir/psl");
##    run("pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 $workingDir/raw.psl $workingDir/all_mrna.psl /dev/null");
##    run("pslSortAcc nohead $workingDir/chrom /tmp $workingDir/all_mrna.psl");

    $workingDir = "$dirName/bed/est";
    if (-e $workingDir) {
##        run("pslSort dirs $workingDir/raw.psl /tmp $workingDir/psl");
##        run("pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 $workingDir/raw.psl $workingDir/all_est.psl /dev/null");
##        run("pslSortAcc nohead $workingDir/chrom /tmp $workingDir/all_est.psl");
    }

    $workingDir = "$dirName/bed/refSeq";
    if (-e "$workingDir/psl") {
##        run("pslSort dirs $workingDir/raw.psl /tmp $workingDir/psl");
##        run("pslReps -minCover=0.2 -sizeMatters -minAli=0.98 -nearTop=0.002 $workingDir/raw.psl $workingDir/all_refSeq.psl /dev/null");
##        run("pslSortAcc nohead $workingDir/chrom /tmp $workingDir/all_refSeq.psl");
    }

    $workingDir = "$dirName/bed/xenoRna";
##    run("pslSort dirs $workingDir/raw.psl /tmp $workingDir/psl");
##    run("pslReps -minAli=0.25 $workingDir/raw.psl $workingDir/xenoMrna.psl /dev/null");
##    run("pslSortAcc nohead $workingDir/chrom /tmp $workingDir/xenoMrna.psl");

    $workingDir = "$dirName/bed/xenoEst";
##    run("pslSort dirs $workingDir/raw.psl /tmp $workingDir/psl");
##    run("pslReps -minAli=0.25 $workingDir/raw.psl $workingDir/xenoEst.psl /dev/null");
##    run("pslSortAcc nohead $workingDir/chrom /tmp $workingDir/xenoEst.psl");

    # Load psl alignment files into the DB
    $workingDir = "$dirName/bed/mrna";
##    run("hgLoadPsl $dbName $workingDir/chrom/*.psl");
##    run("hgLoadPsl $dbName $workingDir/all_mrna.psl -nobin");

    $workingDir = "$dirName/bed/est";
    if (-e $workingDir) {
##    run("hgLoadPsl $dbName $workingDir/chrom/*.psl");
##    run("hgLoadPsl $dbName $workingDir/all_est.psl -nobin");
    }

    $workingDir = "$dirName/bed/xenoRna";
    if (-e $workingDir) {
##    run("hgLoadPsl $dbName $workingDir/xenoMrna.psl -nobin");
    }

    $workingDir = "$dirName/bed/xenoEst";
    if (-e $workingDir) {
##    run("hgLoadPsl $dbName $workingDir/xenoEst.psl -nobin");
    }

    $workingDir = "$dirName/bed/refSeq";
    if (-e "$workingDir/chrom") {
##        run("pslCat -dir $workingDir/chrom > $workingDir/refSeqAli.psl");
##        run("hgLoadPsl $dbName -tNameIx $workingDir/refSeqAli.psl");
    }

    $workingDir = "$dirName/bed/refSeq";
##    print("$refSeqDir/$orgName\n\n");
##    if (-e "$refSeqDir/$orgName") {
##        run("hgRefSeqMrna $dbName $refSeqDir/$orgName/refSeq.fa $refSeqDir/$orgName/refSeq.ra $workingDir/all_refSeq.psl $refSeqDir/loc2ref $refSeqDir/$orgName/$orgName.prot.fa $refSeqDir/mim2loc");
##        run("hgRefSeqStatus $dbName $refSeqDir/loc2ref");
##    }

#    $workingDir = "$dirName/bed/xeno";
#    run("hgLoadPsl -tNameIx $dbName $workingDir/psl/xenoMrna.psl");    
#    run("hgLoadRna add $dbName /cluster/store1/mrna.129/mrna.fa /cluster/store1/mrna.129/mrna.ra");

# Make spliced ESTs track
##    run("scripts/makeIntronEst.csh $dirName");
##    run("hgLoadPsl $dbName $dirName/bed/est/intronEst/*.psl");

    # Sort the fish blat alignments
    $workingDir = "$zooDir/$dirName/bed/blatFish";
##    run("pslCat -dir $workingDir/psl | pslSortAcc nohead $workingDir/chrom temp stdin");
##    run("scripts/loadFish.csh $dirName");

    # Make fish blat track sequence data
##    run("hgLoadRna addSeq $dbName /cluster/store3/matt/tetFish/*.fa");
##    run("rm *.tab");

#    $workingDir = $dirName;
#    run("hgLoadPsl $dbName $workingDir/bed/crossAlign/psl/*.psl");

##    run("rm -f $dirName/bed/simpleRepeat/*.bed");
##    run("cat $dirName/bed/simpleRepeat/trf/*.bed >> $dirName/bed/simpleRepeat/simpleRepeat.bed");
##    run("hgLoadBed $dbName simpleRepeat $dirName/bed/simpleRepeat/simpleRepeat.bed -sqlTable=/cluster/home/kent/src/hg/lib/simpleRepeat.sql");

##    my $tablePrefix;
##    for $tablePrefix (values(%orgNameHash)) {
##        if ($tablePrefix eq "zfish") {
##            $tablePrefix = "zebrafish";
##        }

# Create the tables and load the NIB files
##        run("echo \"DROP TABLE ${tablePrefix}Chrom\" | hgsql $dbName");
##        run("echo \"CREATE TABLE ${tablePrefix}Chrom (chrom varchar(255) not null, size int unsigned not null, fileName varchar(255) not null, PRIMARY KEY(chrom(16)))\" | hgsql $dbName");
##        run("hgNibSeq -preMadeNib -table=${tablePrefix}Chrom $dbName $dirName/nib $dirName/?/chr*.fa $dirName/??/chr*.fa");
#    }

# RYAN GO HERE
} # END OF LOOP


################## END OF MAIN ROUTINE #######################################

################## SUBROUTINES ###############################################
sub run
{
    my ($cmd) = @_;
    print $cmd . "\n";
    system $cmd;
    if ($? != 0) {
        die("$cmd FAILED!!!\n");
        print("$cmd FAILED!!!\n");
    }
}