/tools/solid_tools/maq_cs_wrapper.py
https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 270 lines · 238 code · 26 blank · 6 comment · 53 complexity · 9593f2f461ff43d0e32e268dbbb88dc0 MD5 · raw file
- #!/usr/bin/env python
- #Guruprasad Ananda
- #MAQ mapper for SOLiD colourspace-reads
- import sys, os, zipfile, tempfile, subprocess
- def stop_err( msg ):
- sys.stderr.write( "%s\n" % msg )
- sys.exit()
-
- def __main__():
- out_fname = sys.argv[1].strip()
- out_f2 = open(sys.argv[2].strip(),'r+')
- ref_fname = sys.argv[3].strip()
- f3_read_fname = sys.argv[4].strip()
- f3_qual_fname = sys.argv[5].strip()
- paired = sys.argv[6]
- if paired == 'yes':
- r3_read_fname = sys.argv[7].strip()
- r3_qual_fname = sys.argv[8].strip()
- min_mapqual = int(sys.argv[9].strip())
- max_mismatch = int(sys.argv[10].strip())
- out_f3name = sys.argv[11].strip()
- subprocess_dict = {}
- ref_csfa = tempfile.NamedTemporaryFile()
- ref_bfa = tempfile.NamedTemporaryFile()
- ref_csbfa = tempfile.NamedTemporaryFile()
- cmd2_1 = 'maq fasta2csfa %s > %s 2>&1' %(ref_fname,ref_csfa.name)
- cmd2_2 = 'maq fasta2bfa %s %s 2>&1' %(ref_csfa.name,ref_csbfa.name)
- cmd2_3 = 'maq fasta2bfa %s %s 2>&1' %(ref_fname,ref_bfa.name)
- try:
- os.system(cmd2_1)
- os.system(cmd2_2)
- os.system(cmd2_3)
- except Exception, erf:
- stop_err(str(erf)+"Error processing reference sequence")
-
- if paired == 'yes': #paired end reads
- tmpf = tempfile.NamedTemporaryFile() #forward reads
- tmpr = tempfile.NamedTemporaryFile() #reverse reads
- tmps = tempfile.NamedTemporaryFile() #single reads
- tmpffastq = tempfile.NamedTemporaryFile()
- tmprfastq = tempfile.NamedTemporaryFile()
- tmpsfastq = tempfile.NamedTemporaryFile()
- cmd1 = "solid2fastq_modified.pl 'yes' %s %s %s %s %s %s %s 2>&1" %(tmpf.name,tmpr.name,tmps.name,f3_read_fname,f3_qual_fname,r3_read_fname,r3_qual_fname)
- try:
- os.system(cmd1)
- os.system('gunzip -c %s >> %s' %(tmpf.name,tmpffastq.name))
- os.system('gunzip -c %s >> %s' %(tmpr.name,tmprfastq.name))
- os.system('gunzip -c %s >> %s' %(tmps.name,tmpsfastq.name))
- except Exception, eq:
- stop_err("Error converting data to fastq format." + str(eq))
-
- #make a temp directory where the split fastq files will be stored
- try:
- split_dir = tempfile.mkdtemp()
- split_file_prefix_f = tempfile.mktemp(dir=split_dir)
- split_file_prefix_r = tempfile.mktemp(dir=split_dir)
- splitcmd_f = 'split -a 2 -l %d %s %s' %(32000000,tmpffastq.name,split_file_prefix_f) #32M lines correspond to 8M reads
- splitcmd_r = 'split -a 2 -l %d %s %s' %(32000000,tmprfastq.name,split_file_prefix_r) #32M lines correspond to 8M reads
- os.system(splitcmd_f)
- os.system(splitcmd_r)
- os.chdir(split_dir)
- ii = 0
- for fastq in os.listdir(split_dir):
- if not fastq.startswith(split_file_prefix_f.split("/")[-1]):
- continue
- fastq_r = split_file_prefix_r + fastq.split(split_file_prefix_f.split("/")[-1])[1] #find the reverse strand fastq corresponding to formward strand fastq
- tmpbfq_f = tempfile.NamedTemporaryFile()
- tmpbfq_r = tempfile.NamedTemporaryFile()
- cmd3 = 'maq fastq2bfq %s %s 2>&1; maq fastq2bfq %s %s 2>&1; maq map -c %s.csmap %s %s %s 1>/dev/null 2>&1; maq mapview %s.csmap > %s.txt' %(fastq,tmpbfq_f.name,fastq_r,tmpbfq_r.name,fastq,ref_csbfa.name,tmpbfq_f.name,tmpbfq_r.name,fastq,fastq)
- subprocess_dict['sp'+str(ii+1)] = subprocess.Popen([cmd3],shell=True,stdout=subprocess.PIPE)
- ii += 1
- while True:
- all_done = True
- for j,k in enumerate(subprocess_dict.keys()):
- if subprocess_dict['sp'+str(j+1)].wait() != 0:
- err = subprocess_dict['sp'+str(j+1)].communicate()[1]
- if err != None:
- stop_err("Mapping error: %s" %err)
- all_done = False
- if all_done:
- break
- cmdout = "for map in *.txt; do cat $map >> %s; done" %(out_fname)
- os.system(cmdout)
-
- tmpcsmap = tempfile.NamedTemporaryFile()
- cmd_cat_csmap = "for csmap in *.csmap; do cat $csmap >> %s; done" %(tmpcsmap.name)
- os.system(cmd_cat_csmap)
-
- tmppileup = tempfile.NamedTemporaryFile()
- cmdpileup = "maq pileup -m %s -q %s %s %s > %s" %(max_mismatch,min_mapqual,ref_bfa.name,tmpcsmap.name,tmppileup.name)
- os.system(cmdpileup)
- tmppileup.seek(0)
- print >> out_f2, "#chr\tposition\tref_nt\tcoverage\tSNP_count\tA_count\tT_count\tG_count\tC_count"
- for line in file(tmppileup.name):
- elems = line.strip().split()
- ref_nt = elems[2].capitalize()
- read_nt = elems[4]
- coverage = int(elems[3])
- a,t,g,c = 0,0,0,0
- ref_nt_count = 0
- for ch in read_nt:
- ch = ch.capitalize()
- if ch not in ['A','T','G','C',',','.']:
- continue
- if ch in [',','.']:
- ch = ref_nt
- ref_nt_count += 1
- try:
- nt_ind = ['A','T','G','C'].index(ch)
- if nt_ind == 0:
- a+=1
- elif nt_ind == 1:
- t+=1
- elif nt_ind == 2:
- g+=1
- else:
- c+=1
- except ValueError, we:
- print >>sys.stderr, we
- print >> out_f2, "%s\t%s\t%s\t%s\t%s\t%s" %("\t".join(elems[:4]),coverage-ref_nt_count,a,t,g,c)
- except Exception, er2:
- stop_err("Encountered error while mapping: %s" %(str(er2)))
-
-
- else: #single end reads
- tmpf = tempfile.NamedTemporaryFile()
- tmpfastq = tempfile.NamedTemporaryFile()
- cmd1 = "solid2fastq_modified.pl 'no' %s %s %s %s %s %s %s 2>&1" %(tmpf.name,None,None,f3_read_fname,f3_qual_fname,None,None)
- try:
- os.system(cmd1)
- os.system('gunzip -c %s >> %s' %(tmpf.name,tmpfastq.name))
- tmpf.close()
- except:
- stop_err("Error converting data to fastq format.")
-
- #make a temp directory where the split fastq files will be stored
- try:
- split_dir = tempfile.mkdtemp()
- split_file_prefix = tempfile.mktemp(dir=split_dir)
- splitcmd = 'split -a 2 -l %d %s %s' %(32000000,tmpfastq.name,split_file_prefix) #32M lines correspond to 8M reads
- os.system(splitcmd)
- os.chdir(split_dir)
- for i,fastq in enumerate(os.listdir(split_dir)):
- tmpbfq = tempfile.NamedTemporaryFile()
- cmd3 = 'maq fastq2bfq %s %s 2>&1; maq map -c %s.csmap %s %s 1>/dev/null 2>&1; maq mapview %s.csmap > %s.txt' %(fastq,tmpbfq.name,fastq,ref_csbfa.name,tmpbfq.name,fastq,fastq)
- subprocess_dict['sp'+str(i+1)] = subprocess.Popen([cmd3],shell=True,stdout=subprocess.PIPE)
-
- while True:
- all_done = True
- for j,k in enumerate(subprocess_dict.keys()):
- if subprocess_dict['sp'+str(j+1)].wait() != 0:
- err = subprocess_dict['sp'+str(j+1)].communicate()[1]
- if err != None:
- stop_err("Mapping error: %s" %err)
- all_done = False
- if all_done:
- break
- cmdout = "for map in *.txt; do cat $map >> %s; done" %(out_fname)
- os.system(cmdout)
-
- tmpcsmap = tempfile.NamedTemporaryFile()
- cmd_cat_csmap = "for csmap in *.csmap; do cat $csmap >> %s; done" %(tmpcsmap.name)
- os.system(cmd_cat_csmap)
-
- tmppileup = tempfile.NamedTemporaryFile()
- cmdpileup = "maq pileup -m %s -q %s %s %s > %s" %(max_mismatch,min_mapqual,ref_bfa.name,tmpcsmap.name,tmppileup.name)
- os.system(cmdpileup)
- tmppileup.seek(0)
- print >> out_f2, "#chr\tposition\tref_nt\tcoverage\tSNP_count\tA_count\tT_count\tG_count\tC_count"
- for line in file(tmppileup.name):
- elems = line.strip().split()
- ref_nt = elems[2].capitalize()
- read_nt = elems[4]
- coverage = int(elems[3])
- a,t,g,c = 0,0,0,0
- ref_nt_count = 0
- for ch in read_nt:
- ch = ch.capitalize()
- if ch not in ['A','T','G','C',',','.']:
- continue
- if ch in [',','.']:
- ch = ref_nt
- ref_nt_count += 1
- try:
- nt_ind = ['A','T','G','C'].index(ch)
- if nt_ind == 0:
- a+=1
- elif nt_ind == 1:
- t+=1
- elif nt_ind == 2:
- g+=1
- else:
- c+=1
- except:
- pass
- print >> out_f2, "%s\t%s\t%s\t%s\t%s\t%s" %("\t".join(elems[:4]),coverage-ref_nt_count,a,t,g,c)
- except Exception, er2:
- stop_err("Encountered error while mapping: %s" %(str(er2)))
-
- #Build custom track from pileup
- chr_list=[]
- out_f2.seek(0)
- fcov = tempfile.NamedTemporaryFile()
- fout_a = tempfile.NamedTemporaryFile()
- fout_t = tempfile.NamedTemporaryFile()
- fout_g = tempfile.NamedTemporaryFile()
- fout_c = tempfile.NamedTemporaryFile()
- fcov.write('''track type=wiggle_0 name="Coverage track" description="Coverage track (from Galaxy)" color=0,0,0 visibility=2\n''')
- fout_a.write('''track type=wiggle_0 name="Track A" description="Track A (from Galaxy)" color=255,0,0 visibility=2\n''')
- fout_t.write('''track type=wiggle_0 name="Track T" description="Track T (from Galaxy)" color=0,255,0 visibility=2\n''')
- fout_g.write('''track type=wiggle_0 name="Track G" description="Track G (from Galaxy)" color=0,0,255 visibility=2\n''')
- fout_c.write('''track type=wiggle_0 name="Track C" description="Track C (from Galaxy)" color=255,0,255 visibility=2\n''')
-
- for line in out_f2:
- if line.startswith("#"):
- continue
- elems = line.split()
- chr = elems[0]
-
- if chr not in chr_list:
- chr_list.append(chr)
- if not (chr.startswith('chr') or chr.startswith('scaffold')):
- chr = 'chr'
- header = "variableStep chrom=%s" %(chr)
- fcov.write("%s\n" %(header))
- fout_a.write("%s\n" %(header))
- fout_t.write("%s\n" %(header))
- fout_g.write("%s\n" %(header))
- fout_c.write("%s\n" %(header))
- try:
- pos = int(elems[1])
- cov = int(elems[3])
- a = int(elems[5])
- t = int(elems[6])
- g = int(elems[7])
- c = int(elems[8])
- except:
- continue
- fcov.write("%s\t%s\n" %(pos,cov))
- try:
- a_freq = a*100./cov
- t_freq = t*100./cov
- g_freq = g*100./cov
- c_freq = c*100./cov
- except ZeroDivisionError:
- a_freq=t_freq=g_freq=c_freq=0
- fout_a.write("%s\t%s\n" %(pos,a_freq))
- fout_t.write("%s\t%s\n" %(pos,t_freq))
- fout_g.write("%s\t%s\n" %(pos,g_freq))
- fout_c.write("%s\t%s\n" %(pos,c_freq))
- fcov.seek(0)
- fout_a.seek(0)
- fout_g.seek(0)
- fout_t.seek(0)
- fout_c.seek(0)
- os.system("cat %s %s %s %s %s | cat > %s" %(fcov.name,fout_a.name,fout_t.name,fout_g.name,fout_c.name,out_f3name))
- if __name__=="__main__":
- __main__()