PageRenderTime 22ms CodeModel.GetById 10ms app.highlight 9ms RepoModel.GetById 1ms app.codeStats 0ms

/scripts/microbes/ncbi_to_ucsc.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 146 lines | 130 code | 9 blank | 7 comment | 9 complexity | 52708119824d70490f40b3a11a5632e6 MD5 | raw file
  1#!/usr/bin/env python
  2
  3"""
  4Walk downloaded Genome Projects and Convert, in place, IDs to match the UCSC Archaea browser, where applicable.
  5Uses UCSC Archaea DSN.
  6"""
  7
  8import sys, os
  9import urllib
 10from elementtree import ElementTree
 11from BeautifulSoup import BeautifulSoup
 12from shutil import move
 13
 14def __main__():
 15    base_dir = os.path.join( os.getcwd(), "bacteria" )
 16    try:
 17        base_dir = sys.argv[1]
 18    except:
 19        print "using default base_dir:", base_dir
 20    
 21    organisms = {}
 22    for result in os.walk(base_dir):
 23        this_base_dir,sub_dirs,files = result
 24        for file in files:
 25            if file[-5:] == ".info":
 26                dict = {}
 27                info_file = open(os.path.join(this_base_dir,file),'r')
 28                info = info_file.readlines()
 29                info_file.close()
 30                for line in info:
 31                    fields = line.replace("\n","").split("=")
 32                    dict[fields[0]]="=".join(fields[1:])
 33                if 'genome project id' in dict.keys():
 34                    if dict['genome project id'] not in organisms.keys():
 35                        organisms[dict['genome project id']] = {'chrs':{},'base_dir':this_base_dir}
 36                    for key in dict.keys():
 37                        organisms[dict['genome project id']][key]=dict[key]
 38                else:
 39                    if dict['organism'] not in organisms.keys():
 40                        organisms[dict['organism']] = {'chrs':{},'base_dir':this_base_dir}
 41                    organisms[dict['organism']]['chrs'][dict['chromosome']]=dict
 42
 43    ##get UCSC data
 44
 45    URL = "http://archaea.ucsc.edu/cgi-bin/das/dsn"
 46
 47
 48    try:
 49        page = urllib.urlopen(URL)
 50    except:
 51        print "#Unable to open " + URL
 52        print "?\tunspecified (?)"
 53        sys.exit(1)
 54
 55    text = page.read()
 56    try:
 57        tree = ElementTree.fromstring(text)
 58    except:
 59        print "#Invalid xml passed back from " + URL
 60        print "?\tunspecified (?)"
 61        sys.exit(1)
 62
 63
 64    builds = {}
 65
 66
 67    #print "#Harvested from http://archaea.ucsc.edu/cgi-bin/das/dsn"
 68    #print "?\tunspecified (?)"
 69    for dsn in tree:
 70        build = dsn.find("SOURCE").attrib['id']
 71        try:
 72            org_page = urllib.urlopen("http://archaea.ucsc.edu/cgi-bin/hgGateway?db="+build).read().replace("\n","").split("<table border=2 cellspacing=2 cellpadding=2>")[1].split("</table>")[0].split("</tr>")
 73        except:
 74            print "NO CHROMS FOR",build
 75            continue
 76        org_page.pop(0)
 77        if org_page[-1]=="":
 78            org_page.pop(-1)
 79        
 80        for row in org_page:
 81            chr = row.split("</a>")[0].split(">")[-1]
 82            refseq = row.split("</a>")[-2].split(">")[-1]
 83            for org in organisms:
 84                for org_chr in organisms[org]['chrs']:
 85                    if organisms[org]['chrs'][org_chr]['chromosome']==refseq:
 86                        if org not in builds:
 87                            builds[org]={'chrs':{},'build':build}
 88                        builds[org]['chrs'][refseq]=chr
 89                        #print build,org,chr,refseq
 90        
 91    print
 92    ext_to_edit = ['bed', 'info', ]
 93    for org in builds:
 94        print org,"changed to",builds[org]['build']
 95        
 96        #org info file
 97        info_file_old = os.path.join(base_dir+org,org+".info")
 98        info_file_new = os.path.join(base_dir+org,builds[org]['build']+".info")
 99        
100        
101        old_dir = base_dir+org
102        new_dir = base_dir+builds[org]['build']
103        
104        #open and edit org info file
105        info_file_contents = open(info_file_old).read()
106        info_file_contents = info_file_contents+"build="+builds[org]['build']+"\n"
107        for chrom in builds[org]['chrs']:
108            info_file_contents = info_file_contents.replace(chrom,builds[org]['chrs'][chrom])
109            for result in os.walk(base_dir+org):
110                this_base_dir,sub_dirs,files = result
111                for file in files:
112                    if file[0:len(chrom)]==chrom:
113                        #rename file
114                        old_name = os.path.join(this_base_dir,file)
115                        new_name = os.path.join(this_base_dir,builds[org]['chrs'][chrom]+file[len(chrom):])
116                        move(old_name,new_name)
117                        
118                        #edit contents of file, skiping those in list
119                        if file.split(".")[-1] not in ext_to_edit: continue
120                        
121                        file_contents = open(new_name).read()
122                        file_contents = file_contents.replace(chrom,builds[org]['chrs'][chrom])
123                        
124                        #special case fixes...
125                        if file[-5:] == ".info":
126                            file_contents = file_contents.replace("organism="+org,"organism="+builds[org]['build'])
127                            file_contents = file_contents.replace("refseq="+builds[org]['chrs'][chrom],"refseq="+chrom)
128                        
129                        #write out new file
130                        file_out = open(new_name,'w')
131                        file_out.write(file_contents)
132                        file_out.close()
133                        
134        
135        
136        #write out org info file and remove old file
137        org_info_out = open(info_file_new,'w')
138        org_info_out.write(info_file_contents)
139        org_info_out.close()
140        os.unlink(info_file_old)
141        
142        #change org directory name
143        move(old_dir,new_dir)
144        
145
146if __name__ == "__main__": __main__()