PageRenderTime 55ms CodeModel.GetById 12ms app.highlight 38ms RepoModel.GetById 1ms app.codeStats 0ms

/scripts/microbes/harvest_bacteria.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 246 lines | 232 code | 7 blank | 7 comment | 3 complexity | 74ff2857ec7ce928d9c219f14981b546 MD5 | raw file
  1#!/usr/bin/env python
  2#Dan Blankenberg
  3
  4#Harvest Bacteria
  5#Connects to NCBI's Microbial Genome Projects website and scrapes it for information.
  6#Downloads and converts annotations for each Genome
  7
  8import sys, os, time
  9from urllib2 import urlopen
 10from urllib import urlretrieve
 11from ftplib import FTP
 12from BeautifulSoup import BeautifulSoup
 13from util import get_bed_from_genbank, get_bed_from_glimmer3, get_bed_from_GeneMarkHMM, get_bed_from_GeneMark
 14
 15assert sys.version_info[:2] >= ( 2, 4 )
 16
 17#this defines the types of ftp files we are interested in, and how to process/convert them to a form for our use
 18desired_ftp_files = {'GeneMark':{'ext':'GeneMark-2.5f','parser':'process_GeneMark'},
 19                    'GeneMarkHMM':{'ext':'GeneMarkHMM-2.6m','parser':'process_GeneMarkHMM'},
 20                    'Glimmer3':{'ext':'Glimmer3','parser':'process_Glimmer3'},
 21                    'fna':{'ext':'fna','parser':'process_FASTA'},
 22                    'gbk':{'ext':'gbk','parser':'process_Genbank'} }
 23
 24
 25
 26#number, name, chroms, kingdom, group, genbank, refseq, info_url, ftp_url
 27def iter_genome_projects( url = "http://www.ncbi.nlm.nih.gov/genomes/lproks.cgi?view=1", info_url_base = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=genomeprj&cmd=Retrieve&dopt=Overview&list_uids=" ):
 28    for row in BeautifulSoup( urlopen( url ) ).findAll( name = 'tr', bgcolor = ["#EEFFDD", "#E8E8DD"] ):
 29        row = str( row ).replace( "\n", "" ).replace( "\r", "" )
 30        
 31        fields = row.split( "</td>" )
 32        
 33        org_num = fields[0].split( "list_uids=" )[-1].split( "\"" )[0]
 34        
 35        name = fields[1].split( "\">" )[-1].split( "<" )[0]
 36        
 37        kingdom = "archaea"
 38        if "<td class=\"bacteria\" align=\"center\">B" in fields[2]:
 39            kingdom = "bacteria"
 40            
 41        group = fields[3].split( ">" )[-1]
 42        
 43        info_url = "%s%s" % ( info_url_base, org_num )
 44        
 45        org_genbank = fields[7].split( "\">" )[-1].split( "<" )[0].split( "." )[0]
 46        org_refseq = fields[8].split( "\">" )[-1].split( "<" )[0].split( "." )[0]
 47        
 48        #seems some things donot have an ftp url, try and except it here:
 49        try:
 50            ftp_url = fields[22].split( "href=\"" )[1].split( "\"" )[0]
 51        except:
 52            print "FAILED TO AQUIRE FTP ADDRESS:", org_num, info_url
 53            ftp_url = None
 54        
 55        chroms = get_chroms_by_project_id( org_num )
 56        
 57        yield org_num, name, chroms, kingdom, group, org_genbank, org_refseq, info_url, ftp_url
 58
 59def get_chroms_by_project_id( org_num, base_url = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=genomeprj&cmd=Retrieve&dopt=Overview&list_uids=" ):
 60    html_count = 0
 61    html = None
 62    while html_count < 500 and html == None:
 63        html_count += 1
 64        url = "%s%s" % ( base_url, org_num )
 65        try:
 66            html = urlopen( url )
 67        except:
 68            print "GENOME PROJECT FAILED:", html_count, "org:", org_num, url
 69            html = None
 70            time.sleep( 1 ) #Throttle Connection
 71    if html is None:
 72        "GENOME PROJECT COMPLETELY FAILED TO LOAD", "org:", org_num,"http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=genomeprj&cmd=Retrieve&dopt=Overview&list_uids="+org_num
 73        return None
 74    
 75    chroms = []
 76    for chr_row in BeautifulSoup( html ).findAll( "tr", { "class" : "vvv" } ):
 77        chr_row = str( chr_row ).replace( "\n","" ).replace( "\r", "" )
 78        fields2 = chr_row.split( "</td>" )
 79        refseq = fields2[1].split( "</a>" )[0].split( ">" )[-1]
 80        #genbank = fields2[2].split( "</a>" )[0].split( ">" )[-1]
 81        chroms.append( refseq )
 82    
 83    return chroms
 84
 85def get_ftp_contents( ftp_url ):
 86    ftp_count = 0
 87    ftp_contents = None
 88    while ftp_count < 500 and ftp_contents == None:
 89        ftp_count += 1
 90        try:
 91            ftp = FTP( ftp_url.split("/")[2] )
 92            ftp.login()
 93            ftp.cwd( ftp_url.split( ftp_url.split( "/" )[2] )[-1] )
 94            ftp_contents = ftp.nlst()
 95            ftp.close()
 96        except:
 97            ftp_contents = None
 98            time.sleep( 1 ) #Throttle Connection
 99    return ftp_contents
100
101def scrape_ftp( ftp_contents, org_dir, org_num, refseq, ftp_url ):
102    for file_type, items in desired_ftp_files.items():
103        ext = items['ext']
104        ftp_filename = "%s.%s" % ( refseq, ext )
105        target_filename = os.path.join( org_dir, "%s.%s" % ( refseq, ext ) )
106        if ftp_filename in ftp_contents:
107            url_count = 0
108            url = "%s/%s" % ( ftp_url, ftp_filename )
109            results = None
110            while url_count < 500 and results is None:
111                url_count += 1
112                try:
113                    results = urlretrieve( url, target_filename )
114                except:
115                    results = None
116                    time.sleep(1) #Throttle Connection
117            if results is None:
118                "URL COMPLETELY FAILED TO LOAD:", url
119                return
120            
121            #do special processing for each file type:
122            if items['parser'] is not None:
123                parser_results = globals()[items['parser']]( target_filename, org_num, refseq )
124        else:
125            print "FTP filetype:", file_type, "not found for", org_num, refseq
126    #FTP Files have been Loaded
127    
128
def process_FASTA( filename, org_num, refseq ):
    """Parse a single-sequence FASTA file and write a sibling "<refseq>.info"
    metadata file (chromosome, name, length, organism, gi, gb, refseq).

    The FASTA header is expected to look like ">gi|1234|gb|U00096.2| name";
    the pipe-separated fields pair up as tag/value accessions."""
    #read and strip all lines, closing the handle (was left open before)
    fasta_file = open( filename, 'r' )
    try:
        lines = [ line.strip() for line in fasta_file ]
    finally:
        fasta_file.close()
    #removed dead "fasta = []" that was immediately overwritten
    fasta_header = lines.pop( 0 )[1:]
    fasta_header_split = fasta_header.split( "|" )
    chr_name = fasta_header_split.pop( -1 ).strip()
    #remaining fields pair up as tag:value, e.g. {'gi': '1234', 'gb': 'U00096.2'}
    accessions = { fasta_header_split[0]: fasta_header_split[1], fasta_header_split[2]: fasta_header_split[3] }
    sequence = "".join( lines )

    #Create Chrom Info File (dict.get replaces the try/except-per-key blocks;
    #missing keys still yield the literal text "None" as before)
    chrom_info_file = open( os.path.join( os.path.split( filename )[0], "%s.info" % refseq ), 'w' )
    try:
        chrom_info_file.write( "chromosome=%s\nname=%s\nlength=%s\norganism=%s\n" % ( refseq, chr_name, len( sequence ), org_num ) )
        chrom_info_file.write( "gi=%s\n" % accessions.get( 'gi', 'None' ) )
        chrom_info_file.write( "gb=%s\n" % accessions.get( 'gb', 'None' ) )
        #refseq is a plain parameter; the old try/except around it could never fire
        chrom_info_file.write( "refseq=%s\n" % refseq )
    finally:
        chrom_info_file.close()
154
155def process_Genbank( filename, org_num, refseq ):
156    #extracts 'CDS', 'tRNA', 'rRNA' features from genbank file
157    features = get_bed_from_genbank( filename, refseq, ['CDS', 'tRNA', 'rRNA'] )
158    for feature in features.keys():
159        feature_file = open( os.path.join( os.path.split( filename )[0], "%s.%s.bed" % ( refseq, feature ) ), 'wb+' )
160        feature_file.write( '\n'.join( features[feature] ) )
161        feature_file.close()
162    print "Genbank extraction finished for chrom:", refseq, "file:", filename
163
164def process_Glimmer3( filename, org_num, refseq ):
165    try:
166        glimmer3_bed = get_bed_from_glimmer3( filename, refseq )
167    except Exception, e:
168        print "Converting Glimmer3 to bed FAILED! For chrom:", refseq, "file:", filename, e
169        glimmer3_bed = []
170    glimmer3_bed_file = open( os.path.join( os.path.split( filename )[0], "%s.Glimmer3.bed" % refseq ), 'wb+' )
171    glimmer3_bed_file.write( '\n'.join( glimmer3_bed ) )
172    glimmer3_bed_file.close()
173
174def process_GeneMarkHMM( filename, org_num, refseq ):
175    try:
176        geneMarkHMM_bed = get_bed_from_GeneMarkHMM( filename, refseq )
177    except Exception, e:
178        print "Converting GeneMarkHMM to bed FAILED! For chrom:", refseq, "file:", filename, e
179        geneMarkHMM_bed = []
180    geneMarkHMM_bed_bed_file = open( os.path.join( os.path.split( filename )[0], "%s.GeneMarkHMM.bed" % refseq ), 'wb+' )
181    geneMarkHMM_bed_bed_file.write( '\n'.join( geneMarkHMM_bed ) )
182    geneMarkHMM_bed_bed_file.close()
183
184def process_GeneMark( filename, org_num, refseq ):
185    try:
186        geneMark_bed = get_bed_from_GeneMark( filename, refseq )
187    except Exception, e:
188        print "Converting GeneMark to bed FAILED! For chrom:", refseq, "file:", filename, e
189        geneMark_bed = []
190    geneMark_bed_bed_file = open( os.path.join( os.path.split( filename )[0], "%s.GeneMark.bed" % refseq ), 'wb+' )
191    geneMark_bed_bed_file.write( '\n'.join( geneMark_bed ) )
192    geneMark_bed_bed_file.close()
193
194
195
196def __main__():
197    start_time = time.time()
198    base_dir = os.path.join( os.getcwd(), "bacteria" )
199    try:
200        base_dir = sys.argv[1]
201    except:
202        print "using default base_dir:", base_dir
203    
204    try:
205        os.mkdir( base_dir )
206        print "path '%s' has been created" % base_dir
207    except:
208        print "path '%s' seems to already exist" % base_dir
209    
210    for org_num, name, chroms, kingdom, group, org_genbank, org_refseq, info_url, ftp_url in iter_genome_projects():
211        if chroms is None:
212            continue #No chrom information, we can't really do anything with this organism
213        #Create org directory, if exists, assume it is done and complete --> skip it
214        try:
215            org_dir = os.path.join( base_dir, org_num )
216            os.mkdir( org_dir )
217        except:
218            print "Organism %s already exists on disk, skipping" % org_num
219            continue
220        
221        #get ftp contents
222        ftp_contents = get_ftp_contents( ftp_url )
223        if ftp_contents is None:
224            "FTP COMPLETELY FAILED TO LOAD", "org:", org_num, "ftp:", ftp_url
225        else:
226            for refseq in chroms:
227                ftp_result = scrape_ftp( ftp_contents, org_dir, org_num, refseq, ftp_url )
228                #FTP Files have been Loaded
229                print "Org:", org_num, "chrom:", refseq, "[", time.time() - start_time, "seconds elapsed. ]"
230        
231        #Create org info file
232        info_file = open( os.path.join( org_dir, "%s.info" % org_num ), 'wb+' )
233        info_file.write("genome project id=%s\n" % org_num )
234        info_file.write("name=%s\n" % name )
235        info_file.write("kingdom=%s\n" % kingdom )
236        info_file.write("group=%s\n" % group )
237        info_file.write("chromosomes=%s\n" % ",".join( chroms ) )
238        info_file.write("info url=%s\n" % info_url )
239        info_file.write("ftp url=%s\n" % ftp_url )
240        info_file.close()
241        
242    print "Finished Harvesting", "[", time.time() - start_time, "seconds elapsed. ]"
243    print "[", ( time.time() - start_time )/60, "minutes. ]"
244    print "[", ( time.time() - start_time )/60/60, "hours. ]"
245
246if __name__ == "__main__": __main__()