PageRenderTime 44ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/Phase/phase_trio.sh

https://github.com/BioinformaticsArchive/fCNV
Shell | 87 lines | 37 code | 21 blank | 29 comment | 2 complexity | 381d3181a62b5f4993d29ca92b939711 MD5 | raw file
  1. #!/bin/bash
  2. # --> genotype & phase the trio
  3. # $1 - path to maternal bam files
  4. # $2 - path to paternal bam files
  5. # $3 - path to fetal bam files
  6. # $4 - reference sequence .fa file
  7. # $5 - region to extract
  8. #logfile=log_phaseTrio_$5.$$.log
  9. #exec > $logfile 2>&1
  10. date
  11. if [ $# -ne 5 ]; then
  12. echo "Wrong number of arguments: got $#, expected 5"
  13. exit
  14. fi
  15. bindir="/dupa-filer/laci/bin"
  16. #parse out chromosome ID
  17. region=$5
  18. chr=`echo $region | awk 'BEGIN {FS=":"}{print $1}'`
  19. chrno=`echo $chr | awk 'BEGIN {FS="chr"}{print $2}'`
  20. reference=$4
  21. <<comment
  22. # (1) extract the region and remove PCR duplicates
  23. echo "merging and removing PCR duplicates:"
  24. samtools merge -R $region - $1*/*/*.bam | samtools rmdup - __M.part.bam &
  25. samtools merge -R $region - $2*/*/*.bam | samtools rmdup - __P.part.bam &
  26. samtools merge -R $region - $3*/*/*.bam | samtools rmdup - __F.part.bam &
  27. wait
  28. samtools index __M.part.bam &
  29. samtools index __P.part.bam &
  30. samtools index __F.part.bam &
  31. wait
  32. #samtools view -h __M.part.bam > __M.part.sam &
  33. #samtools view -h __P.part.bam > __P.part.sam &
  34. #wait
  35. echo "-------- step 1 done ----------"
  36. comment
  37. # (2) genotype M, P, F, filter and phase
  38. prefix=trio
  39. echo "genotyping the trio"
  40. time samtools mpileup -uDSI -C50 -r $region -f $reference __M.allreads.part.bam __P.part.bam __F.part.bam | bcftools view -bvcg - > $prefix.genotype.raw.bcf
  41. time bcftools view $prefix.genotype.raw.bcf | vcfutils.pl varFilter -d60 -D200 -Q20 > $prefix.genotype.vcf
  42. # TODO: ???? what limit for depth of coverage to use?
  43. #annotate SNPs by rs-ids from dbSNP
  44. echo "annotating called SNPs"
  45. java -Xmx24000m -jar ~/apps/snpEff/SnpSift.jar annotate -v /dupa-filer/laci/dbSnp.vcf $prefix.genotype.vcf > $prefix.genotype.annot.vcf
  46. #extract only SNPs with reasonable quality score TODO: change qlimit depending on nummber of samples
  47. snpsFile=$prefix.snps.annot
  48. cat $prefix.genotype.annot.vcf | extract_annot_snps.awk -v qlimit=100 > $snpsFile.vcf
  49. #compress and index
  50. #bgzip $prefix.snps.annot.vcf
  51. #tabix -p vcf $prefix.snps.annot.vcf.gz
  52. refpanels=/dupa-filer/laci/reference_panels/$chr.1kg.ref.phase1_release_v3.20101123.vcf.gz
  53. #modify our trio VCF file so that its records are consistent with the reference VCF file
  54. #...this doesn't seems to work, rather use custom implementation - conform.py
  55. #time java -jar ~/apps/jar/conform-gt.jar gt=$prefix.snps.annot.vcf.gz out=conform chrom=$chr ref=$refpanels
  56. #exclude markers that are not in the reference
  57. echo "conforming markers with the reference panels"
  58. zcat $refpanels | conform.py $snpsFile.vcf /dev/stdin
  59. sed "s/$chr/$chrno/g" $snpsFile.ftr.vcf > $snpsFile.ftr2.vcf
  60. #phase by Beagle
  61. echo "invoking Beagle for trio phasing w/ reference panels"
  62. time java -Xmx24000m -jar $bindir/b4.r1128.jar gt=$snpsFile.ftr2.vcf ped=pedigree.txt out=$prefix.phase ref=$refpanels impute=false
  63. #time java -jar $bindir/b4.r1128.jar gt=$snpsFile.vcf.gz ped=pedigree.txt out=$prefix.phase
  64. zcat $prefix.phase.vcf.gz > $prefix.phase.vcf
  65. # CLEAN-UP
  66. #rm __*.*