PageRenderTime 114ms CodeModel.GetById 35ms RepoModel.GetById 0ms app.codeStats 0ms

/src/hg/makeDb/makeZoo3.pl

https://github.com/rpique/UCSC-Browser-code-add-ons
Perl | 324 lines | 85 code | 64 blank | 175 comment | 10 complexity | 43ad318b2429c26e63ef1b2ecc4b74dc MD5 | raw file
  1. #!/usr/bin/perl -w
  2. use strict;
  3. #
  4. # This script creates the zoo browser databases
  5. # It runs in 2 large main loops. The first loop distributes input files and sets up
  6. # blat jobs. It may need some babysitting or incremental running to make sure
  7. # that it works correctly since it does a lot. Uncomment each track are you want to
  8. # create. The first loop works only when run on kk, because some of the data
  9. # it references lives there.
  10. # After you create the blat job input files you need to ssh to kk and run the blat jobs by hand.
  11. # I have not scripted this because it requires lots of user intervention.
  12. # After the blat jobs are done you can comment out the first loop and run the second loop.
  13. # This loads the database with the blat results.
  14. #
  15. # Hash keyed by genus species, containing values indicating the project name
  16. my %orgHash = (
  17. "Papio_cynocephalus_anubis" => "zooBaboon3",
  18. "Felis_catus" => "zooCat3",
  19. "Gallus_gallus" => "zooChicken3",
  20. "Pan_troglodytes" => "zooChimp3",
  21. "Bos_taurus" => "zooCow3",
  22. "Canis_familiaris" => "zooDog3",
  23. "Takifugu_rubripes" => "zooFugu3",
  24. "Homo_sapiens" => "zooHuman3",
  25. "Mus_musculus" => "zooMouse3",
  26. "Sus_scrofa" => "zooPig3",
  27. "Rattus_norvegicus" => "zooRat3",
  28. "Tetraodon_nigroviridis" => "zooTetra3",
  29. "Danio_rerio" => "zooZebrafish3",
  30. );
  31. # Hash keyed by genus species, containing values indicating the common organism name
  32. my %orgNameHash = (
  33. "Papio_cynocephalus_anubis" => "baboon",
  34. "Felis_catus" => "cat",
  35. "Gallus_gallus" => "chicken",
  36. "Pan_troglodytes" => "chimp",
  37. "Bos_taurus" => "cow",
  38. "Canis_familiaris" => "dog",
  39. "Takifugu_rubripes" => "fugu",
  40. "Homo_sapiens" => "human",
  41. "Mus_musculus" => "mouse",
  42. "Sus_scrofa" => "pig",
  43. "Rattus_norvegicus" => "rat",
  44. "Tetraodon_nigroviridis" => "tetra",
  45. "Danio_rerio" => "zfish",
  46. );
  47. my $zooDir = "/cluster/store2/zoo3";
  48. my $mrnaRoot = "/cluster/store1/mrna.130/org";
  49. my $xenoRoot = "/cluster/store1/mrna.130/zoo/work";
  50. my $refSeqDir = "/cluster/store1/mrna.130/refSeq";
  51. my $org;
  52. my $orgName;
  53. my $dbName;
  54. my $dirName;
  55. my $blatDir;
  56. my $workingDir;
  57. # One time step - create all the databases
  58. ##run("mysql -u SECRET_USER -pBIG_SECRET -A < ${zooDir}/createDbs.sql\n");
  59. # Set up the blat and repeat masker directories
  60. ##run("rm -rf work");
  61. ##run("mkdir -p work");
  62. ##run("mkdir -p work/blat");
  63. ##run("mkdir -p work/blatFish");
  64. ##run("mkdir -p work/simpleRepeat");
  65. ##run("mkdir -p work/xeno");
  66. ##run("mkdir -p work/refSeq");
  67. ##run("cp scripts/blatFish-gsub work/blatFish/gsub");
  68. ##run("cp scripts/simpleRepeat-gsub work/simpleRepeat/gsub");
  69. for $org (keys(%orgHash)) {
  70. $dirName = $dbName = $orgHash{$org};
  71. $orgName = $orgNameHash{$org};
  72. # pjt.pl is a one-off script for a prorietary manually curated data set
  73. # I got from Pamela Jacques-Thomas
  74. ## run("$zooDir/pjt.pl data/$orgName/${orgName}_target1.annot.ff ${orgName}_pjt");
  75. ## run("hgsql $dbName < sql/dropCrap.sql");
  76. ## run("hgLoadBed -noBin $dbName pjt_gene ${orgName}_pjt_all.bed");
  77. ## run("hgLoadBed -noBin $dbName pjt_genscan ${orgName}_pjt_genscan.bed");
  78. ## run("mkdir -p $dirName");
  79. ## run("mkdir -p $dirName/bed");
  80. ## run("mkdir -p $dirName/1");
  81. ## run("mkdir -p $dirName/bed/rmskS");
  82. ## run("cp $zooDir/data/${orgName}/repeat/${orgName}_target1.fasta.masked $dirName/1/target1.fa");
  83. ## run("cp $zooDir/data/${orgName}/repeat/${orgName}_target1.fasta.out $dirName/bed/rmskS/target1.fa.out");
  84. ## run("cp $zooDir/data/${orgName}/repeat/${orgName}_target1.fasta.masked $dirName/1/target1.fa");
  85. ## run("cp $zooDir/data/${orgName}/repeat/${orgName}_target1.fasta.out $dirName/bed/rmskS/target1.fa.out");
  86. run("cp $zooDir/data/${orgName}/${orgName}_target1.agp $dirName/1/target1.agp");
  87. run("hgGoldGapGl $dbName -noGl $zooDir $dirName");
  88. # Uncomment this next line only for a major db rebuild
  89. # Add the trackDb database table
  90. ## run("hgLoadRna drop $dbName");
  91. ## run("hgTrackDb $dbName trackDb ~/kent/src/hg/lib/trackDb.sql ~/kent/src/hg/makeDb/hgTrackDb/hgRoot");
  92. # Create the zooSeq ordering table
  93. ## run("hgsql $dbName < sql/zooSeqOrder.sql");
  94. ## run("hgLoadRna new $dbName");
  95. ## run("hgLoadRna add -type=mRna $dbName $mrnaRoot/$org/mrna.fa $mrnaRoot/$org/mrna.ra");
  96. ## if (-e "$refSeqDir/$orgName/refSeq.fa") {
  97. ## run("hgLoadRna add -type=refSeq $dbName $refSeqDir/$orgName/refSeq.fa $refSeqDir/$orgName/refSeq.ra");
  98. ## }
  99. # If the file exists, load it
  100. ## if (-e "$mrnaRoot/$org/est.fa") {
  101. ## run("hgLoadRna add -type=EST $dbName $mrnaRoot/$org/est.fa $mrnaRoot/$org/est.ra");
  102. ## } else {
  103. ## print("No est file in $mrnaRoot/$org/est.fa\n");
  104. ## }
  105. ## run("hgLoadRna add -type=xenoRna $dbName /cluster/store1/mrna.129/zoo/work/${orgName}XenoRna.fa /cluster/store1/mrna.129/zoo/work/${orgName}XenoRna.ra");
  106. ## run("hgLoadRna add -type=xenoEst $dbName /cluster/store1/mrna.129/zoo/work/${orgName}XenoEst.fa /cluster/store1/mrna.129/zoo/work/${orgName}XenoEst.ra");
  107. ## foreach $blatDir ("mrna", "est") {
  108. ## if (-e "$mrnaRoot/$org/$blatDir.fa") {
  109. ## run("mkdir -p $dirName/bed/$blatDir");
  110. ## run("mkdir -p $dirName/bed/$blatDir/psl");
  111. # Make the mrna/est alignment BLAT spec file
  112. ## run("echo /cluster/home/kent/bin/i386/blat -t=dna -q=rna -ooc=/cluster/home/kent/hg/h/11.ooc mask=lower $zooDir/$dirName/1/target1.fa $mrnaRoot/$org/${blatDir}.fa $zooDir/$dirName/bed/${blatDir}/psl/target1_${blatDir}.psl >> work/blat/spec");
  113. ## }
  114. ## }
  115. # Set up refSeq track if a refSeq sequence exists
  116. ## if (-e "$refSeqDir/$orgName/$orgName.fa") {
  117. ## run("mkdir -p $dirName/bed/refSeq");
  118. ## run("mkdir -p $dirName/bed/refSeq/psl");
  119. ## run("echo /cluster/home/kent/bin/i386/blat -t=dna -q=rna -ooc=/cluster/home/kent/hg/h/11.ooc mask=lower $zooDir/$dirName/1/target1.fa $refSeqDir/$orgName/refSeq.fa $zooDir/$dirName/bed/refSeq/psl/target1_refSeq.psl >> work/refSeq/spec");
  120. ##}
  121. ## run("mkdir -p $dirName/bed/xenoRna");
  122. ## run("mkdir -p $dirName/bed/xenoRna/psl");
  123. ## run("mkdir -p $dirName/bed/xenoEst");
  124. ## run("mkdir -p $dirName/bed/xenoEst/psl");
  125. ## run("echo /cluster/home/kent/bin/i386/blat -t=dna -q=rna mask=lower {check in line+ $zooDir/$dirName/1/target1.fa} {check in line+ $xenoRoot/${orgName}XenoRna.fa} {check out line+ $zooDir/$dirName/bed/xenoRna/psl/target1_xenoRna.psl} >> work/xeno/spec");
  126. ## run("echo /cluster/home/kent/bin/i386/blat -t=dna -q=rna mask=lower {check in line+ $zooDir/$dirName/1/target1.fa} {check in line+ $xenoRoot/${orgName}XenoEst.fa} {check out line+ $zooDir/$dirName/bed/xenoEst/psl/target1_xenoEst.psl} >> work/xeno/spec");
  127. # Load repeat masker and GCPercent tracks
  128. ## run("scripts/makeNib.csh $dirName");
  129. ## run("hgLoadOut $dbName $zooDir/$dirName/bed/rmskS/*.fa.out");
  130. ## run("mysql -u SECRET_USER -pBIG_SECRET -A $dbName < ~/kent/src/hg/lib/chromInfo.sql");
  131. ## run("hgNibSeq -preMadeNib $dbName $zooDir/$dirName/nib $zooDir/$dirName/?/*.fa");
  132. ## run("hgGoldGapGl $dbName $zooDir/ $dirName -noGl");
  133. # Make and load GC percent table
  134. ## run("mkdir -p $zooDir/$dirName/bed/gcPercent");
  135. ## run("mysql -u SECRET_USER -pBIG_SECRET -A $dbName < ~/src/hg/lib/gcPercent.sql");
  136. ## run("hgGcPercent $dbName $dirName/nib");
  137. ## run("mv gcPercent.bed $dirName/bed/gcPercent");
  138. ## run("mkdir -p $dirName/bed/blatFish");
  139. ## run("mkdir -p $dirName/bed/blatFish/psl");
  140. ## run("mkdir -p $dirName/bed/blatFish/chrom");
  141. ## run("echo $zooDir/$dirName | wordLine stdin >> work/blatFish/genome.lst");
  142. ## run("ls -1S $zooDir/$dirName/ >> work/simpleRepeat/genome.lst"); # HACK - need to generalize the gsub file
  143. ## run("mkdir -p $dirName/bed/simpleRepeat");
  144. ## run("mkdir -p $dirName/bed/simpleRepeat/trf");
  145. ## run("gbToFaRa dna.fil $orgName.fa $orgName.ra $orgName.ta data/$orgName/${orgName}_target1.annot.ff");
  146. ## if ($dbName eq "zooHuman3") {
  147. ## run("hgLoadBed $dbName mcs_u $zooDir/data/bed/mcs_u.bed");
  148. ## run("hgLoadBed $dbName mcs_b $zooDir/data/bed/mcs_b.bed");
  149. ## run("hgLoadBed $dbName dhs $zooDir/data/bed/dhs.bed");
  150. ## }
  151. } # END OF FIRST WHILE LOOP
  152. # The line below only works on kk - that's where the fish is stored
  153. ##run("ls -1S /scratch/hg/tetFish/*.fa > work/blatFish/blatFish.lst");
  154. ##run("gensub2 work/blatFish/genome.lst work/blatFish/blatFish.lst work/blatFish/gsub work/blatFish/spec");
  155. ##run("gensub2 work/simpleRepeat/genome.lst single work/simpleRepeat/gsub work/simpleRepeat/spec");
  156. # NOW RUN BLAT BY HAND USING PARASOL
  157. # Second main loop iteration
  158. # This will work ONLY AFTER BLAT is run and you have results in the desired output dirs
  159. for $org (keys(%orgHash)) {
  160. $dirName = $dbName = $orgHash{$org};
  161. $orgName = $orgNameHash{$org};
  162. # Process psl alignment files
  163. # run("cp $dirName/bed/mrna/psl/target1_mrna.psl $dirName/bed/mrna/chrom/target1_mrna.psl");
  164. # if (-e "$dirName/bed/est") {
  165. # run("cp $dirName/bed/est/psl/target1_est.psl $dirName/bed/est/chrom/target1_est.psl");
  166. # }
  167. $workingDir = "$dirName/bed/mrna";
  168. ## run("pslSort dirs $workingDir/raw.psl /tmp $workingDir/psl");
  169. ## run("pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 $workingDir/raw.psl $workingDir/all_mrna.psl /dev/null");
  170. ## run("pslSortAcc nohead $workingDir/chrom /tmp $workingDir/all_mrna.psl");
  171. $workingDir = "$dirName/bed/est";
  172. if (-e $workingDir) {
  173. ## run("pslSort dirs $workingDir/raw.psl /tmp $workingDir/psl");
  174. ## run("pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 $workingDir/raw.psl $workingDir/all_est.psl /dev/null");
  175. ## run("pslSortAcc nohead $workingDir/chrom /tmp $workingDir/all_est.psl");
  176. }
  177. $workingDir = "$dirName/bed/refSeq";
  178. if (-e "$workingDir/psl") {
  179. ## run("pslSort dirs $workingDir/raw.psl /tmp $workingDir/psl");
  180. ## run("pslReps -minCover=0.2 -sizeMatters -minAli=0.98 -nearTop=0.002 $workingDir/raw.psl $workingDir/all_refSeq.psl /dev/null");
  181. ## run("pslSortAcc nohead $workingDir/chrom /tmp $workingDir/all_refSeq.psl");
  182. }
  183. $workingDir = "$dirName/bed/xenoRna";
  184. ## run("pslSort dirs $workingDir/raw.psl /tmp $workingDir/psl");
  185. ## run("pslReps -minAli=0.25 $workingDir/raw.psl $workingDir/xenoMrna.psl /dev/null");
  186. ## run("pslSortAcc nohead $workingDir/chrom /tmp $workingDir/xenoMrna.psl");
  187. $workingDir = "$dirName/bed/xenoEst";
  188. ## run("pslSort dirs $workingDir/raw.psl /tmp $workingDir/psl");
  189. ## run("pslReps -minAli=0.25 $workingDir/raw.psl $workingDir/xenoEst.psl /dev/null");
  190. ## run("pslSortAcc nohead $workingDir/chrom /tmp $workingDir/xenoEst.psl");
  191. # Load psl alignment files into the DB
  192. $workingDir = "$dirName/bed/mrna";
  193. ## run("hgLoadPsl $dbName $workingDir/chrom/*.psl");
  194. ## run("hgLoadPsl $dbName $workingDir/all_mrna.psl -nobin");
  195. $workingDir = "$dirName/bed/est";
  196. if (-e $workingDir) {
  197. ## run("hgLoadPsl $dbName $workingDir/chrom/*.psl");
  198. ## run("hgLoadPsl $dbName $workingDir/all_est.psl -nobin");
  199. }
  200. $workingDir = "$dirName/bed/xenoRna";
  201. if (-e $workingDir) {
  202. ## run("hgLoadPsl $dbName $workingDir/xenoMrna.psl -nobin");
  203. }
  204. $workingDir = "$dirName/bed/xenoEst";
  205. if (-e $workingDir) {
  206. ## run("hgLoadPsl $dbName $workingDir/xenoEst.psl -nobin");
  207. }
  208. $workingDir = "$dirName/bed/refSeq";
  209. if (-e "$workingDir/chrom") {
  210. ## run("pslCat -dir $workingDir/chrom > $workingDir/refSeqAli.psl");
  211. ## run("hgLoadPsl $dbName -tNameIx $workingDir/refSeqAli.psl");
  212. }
  213. $workingDir = "$dirName/bed/refSeq";
  214. ## print("$refSeqDir/$orgName\n\n");
  215. ## if (-e "$refSeqDir/$orgName") {
  216. ## run("hgRefSeqMrna $dbName $refSeqDir/$orgName/refSeq.fa $refSeqDir/$orgName/refSeq.ra $workingDir/all_refSeq.psl $refSeqDir/loc2ref $refSeqDir/$orgName/$orgName.prot.fa $refSeqDir/mim2loc");
  217. ## run("hgRefSeqStatus $dbName $refSeqDir/loc2ref");
  218. ## }
  219. # $workingDir = "$dirName/bed/xeno";
  220. # run("hgLoadPsl -tNameIx $dbName $workingDir/psl/xenoMrna.psl");
  221. # run("hgLoadRna add $dbName /cluster/store1/mrna.129/mrna.fa /cluster/store1/mrna.129/mrna.ra");
  222. # Make spliced ESTs track
  223. ## run("scripts/makeIntronEst.csh $dirName");
  224. ## run("hgLoadPsl $dbName $dirName/bed/est/intronEst/*.psl");
  225. # Sort the fish blat alignments
  226. $workingDir = "$zooDir/$dirName/bed/blatFish";
  227. ## run("pslCat -dir $workingDir/psl | pslSortAcc nohead $workingDir/chrom temp stdin");
  228. ## run("scripts/loadFish.csh $dirName");
  229. # Make fish blat track sequence data
  230. ## run("hgLoadRna addSeq $dbName /cluster/store3/matt/tetFish/*.fa");
  231. ## run("rm *.tab");
  232. # $workingDir = $dirName;
  233. # run("hgLoadPsl $dbName $workingDir/bed/crossAlign/psl/*.psl");
  234. ## run("rm -f $dirName/bed/simpleRepeat/*.bed");
  235. ## run("cat $dirName/bed/simpleRepeat/trf/*.bed >> $dirName/bed/simpleRepeat/simpleRepeat.bed");
  236. ## run("hgLoadBed $dbName simpleRepeat $dirName/bed/simpleRepeat/simpleRepeat.bed -sqlTable=/cluster/home/kent/src/hg/lib/simpleRepeat.sql");
  237. ## my $tablePrefix;
  238. ## for $tablePrefix (values(%orgNameHash)) {
  239. ## if ($tablePrefix eq "zfish") {
  240. ## $tablePrefix = "zebrafish";
  241. ## }
  242. # Create the tables and load the NIB files
  243. ## run("echo \"DROP TABLE ${tablePrefix}Chrom\" | hgsql $dbName");
  244. ## run("echo \"CREATE TABLE ${tablePrefix}Chrom (chrom varchar(255) not null, size int unsigned not null, fileName varchar(255) not null, PRIMARY KEY(chrom(16)))\" | hgsql $dbName");
  245. ## run("hgNibSeq -preMadeNib -table=${tablePrefix}Chrom $dbName $dirName/nib $dirName/?/chr*.fa $dirName/??/chr*.fa");
  246. # }
  247. # RYAN GO HERE
  248. } # END OF LOOP
  249. ################## END OF MAIN ROUTINE #######################################
  250. ################## SUBROUTINES ###############################################
  251. sub run
  252. {
  253. my ($cmd) = @_;
  254. print $cmd . "\n";
  255. system $cmd;
  256. if ($? != 0) {
  257. die("$cmd FAILED!!!\n");
  258. print("$cmd FAILED!!!\n");
  259. }
  260. }