PageRenderTime 44ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/scripts/microbes/harvest_bacteria.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 246 lines | 232 code | 7 blank | 7 comment | 2 complexity | 74ff2857ec7ce928d9c219f14981b546 MD5 | raw file
  1. #!/usr/bin/env python
  2. #Dan Blankenberg
  3. #Harvest Bacteria
  4. #Connects to NCBI's Microbial Genome Projects website and scrapes it for information.
  5. #Downloads and converts annotations for each Genome
  6. import sys, os, time
  7. from urllib2 import urlopen
  8. from urllib import urlretrieve
  9. from ftplib import FTP
  10. from BeautifulSoup import BeautifulSoup
  11. from util import get_bed_from_genbank, get_bed_from_glimmer3, get_bed_from_GeneMarkHMM, get_bed_from_GeneMark
  12. assert sys.version_info[:2] >= ( 2, 4 )
  13. #this defines the types of ftp files we are interested in, and how to process/convert them to a form for our use
  14. desired_ftp_files = {'GeneMark':{'ext':'GeneMark-2.5f','parser':'process_GeneMark'},
  15. 'GeneMarkHMM':{'ext':'GeneMarkHMM-2.6m','parser':'process_GeneMarkHMM'},
  16. 'Glimmer3':{'ext':'Glimmer3','parser':'process_Glimmer3'},
  17. 'fna':{'ext':'fna','parser':'process_FASTA'},
  18. 'gbk':{'ext':'gbk','parser':'process_Genbank'} }
  19. #number, name, chroms, kingdom, group, genbank, refseq, info_url, ftp_url
  20. def iter_genome_projects( url = "http://www.ncbi.nlm.nih.gov/genomes/lproks.cgi?view=1", info_url_base = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=genomeprj&cmd=Retrieve&dopt=Overview&list_uids=" ):
  21. for row in BeautifulSoup( urlopen( url ) ).findAll( name = 'tr', bgcolor = ["#EEFFDD", "#E8E8DD"] ):
  22. row = str( row ).replace( "\n", "" ).replace( "\r", "" )
  23. fields = row.split( "</td>" )
  24. org_num = fields[0].split( "list_uids=" )[-1].split( "\"" )[0]
  25. name = fields[1].split( "\">" )[-1].split( "<" )[0]
  26. kingdom = "archaea"
  27. if "<td class=\"bacteria\" align=\"center\">B" in fields[2]:
  28. kingdom = "bacteria"
  29. group = fields[3].split( ">" )[-1]
  30. info_url = "%s%s" % ( info_url_base, org_num )
  31. org_genbank = fields[7].split( "\">" )[-1].split( "<" )[0].split( "." )[0]
  32. org_refseq = fields[8].split( "\">" )[-1].split( "<" )[0].split( "." )[0]
  33. #seems some things donot have an ftp url, try and except it here:
  34. try:
  35. ftp_url = fields[22].split( "href=\"" )[1].split( "\"" )[0]
  36. except:
  37. print "FAILED TO AQUIRE FTP ADDRESS:", org_num, info_url
  38. ftp_url = None
  39. chroms = get_chroms_by_project_id( org_num )
  40. yield org_num, name, chroms, kingdom, group, org_genbank, org_refseq, info_url, ftp_url
  41. def get_chroms_by_project_id( org_num, base_url = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=genomeprj&cmd=Retrieve&dopt=Overview&list_uids=" ):
  42. html_count = 0
  43. html = None
  44. while html_count < 500 and html == None:
  45. html_count += 1
  46. url = "%s%s" % ( base_url, org_num )
  47. try:
  48. html = urlopen( url )
  49. except:
  50. print "GENOME PROJECT FAILED:", html_count, "org:", org_num, url
  51. html = None
  52. time.sleep( 1 ) #Throttle Connection
  53. if html is None:
  54. "GENOME PROJECT COMPLETELY FAILED TO LOAD", "org:", org_num,"http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=genomeprj&cmd=Retrieve&dopt=Overview&list_uids="+org_num
  55. return None
  56. chroms = []
  57. for chr_row in BeautifulSoup( html ).findAll( "tr", { "class" : "vvv" } ):
  58. chr_row = str( chr_row ).replace( "\n","" ).replace( "\r", "" )
  59. fields2 = chr_row.split( "</td>" )
  60. refseq = fields2[1].split( "</a>" )[0].split( ">" )[-1]
  61. #genbank = fields2[2].split( "</a>" )[0].split( ">" )[-1]
  62. chroms.append( refseq )
  63. return chroms
  64. def get_ftp_contents( ftp_url ):
  65. ftp_count = 0
  66. ftp_contents = None
  67. while ftp_count < 500 and ftp_contents == None:
  68. ftp_count += 1
  69. try:
  70. ftp = FTP( ftp_url.split("/")[2] )
  71. ftp.login()
  72. ftp.cwd( ftp_url.split( ftp_url.split( "/" )[2] )[-1] )
  73. ftp_contents = ftp.nlst()
  74. ftp.close()
  75. except:
  76. ftp_contents = None
  77. time.sleep( 1 ) #Throttle Connection
  78. return ftp_contents
  79. def scrape_ftp( ftp_contents, org_dir, org_num, refseq, ftp_url ):
  80. for file_type, items in desired_ftp_files.items():
  81. ext = items['ext']
  82. ftp_filename = "%s.%s" % ( refseq, ext )
  83. target_filename = os.path.join( org_dir, "%s.%s" % ( refseq, ext ) )
  84. if ftp_filename in ftp_contents:
  85. url_count = 0
  86. url = "%s/%s" % ( ftp_url, ftp_filename )
  87. results = None
  88. while url_count < 500 and results is None:
  89. url_count += 1
  90. try:
  91. results = urlretrieve( url, target_filename )
  92. except:
  93. results = None
  94. time.sleep(1) #Throttle Connection
  95. if results is None:
  96. "URL COMPLETELY FAILED TO LOAD:", url
  97. return
  98. #do special processing for each file type:
  99. if items['parser'] is not None:
  100. parser_results = globals()[items['parser']]( target_filename, org_num, refseq )
  101. else:
  102. print "FTP filetype:", file_type, "not found for", org_num, refseq
  103. #FTP Files have been Loaded
  104. def process_FASTA( filename, org_num, refseq ):
  105. fasta = []
  106. fasta = [line.strip() for line in open( filename, 'rb' ).readlines()]
  107. fasta_header = fasta.pop( 0 )[1:]
  108. fasta_header_split = fasta_header.split( "|" )
  109. chr_name = fasta_header_split.pop( -1 ).strip()
  110. accesions = {fasta_header_split[0]:fasta_header_split[1], fasta_header_split[2]:fasta_header_split[3]}
  111. fasta = "".join( fasta )
  112. #Create Chrom Info File:
  113. chrom_info_file = open( os.path.join( os.path.split( filename )[0], "%s.info" % refseq ), 'wb+' )
  114. chrom_info_file.write( "chromosome=%s\nname=%s\nlength=%s\norganism=%s\n" % ( refseq, chr_name, len( fasta ), org_num ) )
  115. try:
  116. chrom_info_file.write( "gi=%s\n" % accesions['gi'] )
  117. except:
  118. chrom_info_file.write( "gi=None\n" )
  119. try:
  120. chrom_info_file.write( "gb=%s\n" % accesions['gb'] )
  121. except:
  122. chrom_info_file.write( "gb=None\n" )
  123. try:
  124. chrom_info_file.write( "refseq=%s\n" % refseq )
  125. except:
  126. chrom_info_file.write( "refseq=None\n" )
  127. chrom_info_file.close()
  128. def process_Genbank( filename, org_num, refseq ):
  129. #extracts 'CDS', 'tRNA', 'rRNA' features from genbank file
  130. features = get_bed_from_genbank( filename, refseq, ['CDS', 'tRNA', 'rRNA'] )
  131. for feature in features.keys():
  132. feature_file = open( os.path.join( os.path.split( filename )[0], "%s.%s.bed" % ( refseq, feature ) ), 'wb+' )
  133. feature_file.write( '\n'.join( features[feature] ) )
  134. feature_file.close()
  135. print "Genbank extraction finished for chrom:", refseq, "file:", filename
  136. def process_Glimmer3( filename, org_num, refseq ):
  137. try:
  138. glimmer3_bed = get_bed_from_glimmer3( filename, refseq )
  139. except Exception, e:
  140. print "Converting Glimmer3 to bed FAILED! For chrom:", refseq, "file:", filename, e
  141. glimmer3_bed = []
  142. glimmer3_bed_file = open( os.path.join( os.path.split( filename )[0], "%s.Glimmer3.bed" % refseq ), 'wb+' )
  143. glimmer3_bed_file.write( '\n'.join( glimmer3_bed ) )
  144. glimmer3_bed_file.close()
  145. def process_GeneMarkHMM( filename, org_num, refseq ):
  146. try:
  147. geneMarkHMM_bed = get_bed_from_GeneMarkHMM( filename, refseq )
  148. except Exception, e:
  149. print "Converting GeneMarkHMM to bed FAILED! For chrom:", refseq, "file:", filename, e
  150. geneMarkHMM_bed = []
  151. geneMarkHMM_bed_bed_file = open( os.path.join( os.path.split( filename )[0], "%s.GeneMarkHMM.bed" % refseq ), 'wb+' )
  152. geneMarkHMM_bed_bed_file.write( '\n'.join( geneMarkHMM_bed ) )
  153. geneMarkHMM_bed_bed_file.close()
  154. def process_GeneMark( filename, org_num, refseq ):
  155. try:
  156. geneMark_bed = get_bed_from_GeneMark( filename, refseq )
  157. except Exception, e:
  158. print "Converting GeneMark to bed FAILED! For chrom:", refseq, "file:", filename, e
  159. geneMark_bed = []
  160. geneMark_bed_bed_file = open( os.path.join( os.path.split( filename )[0], "%s.GeneMark.bed" % refseq ), 'wb+' )
  161. geneMark_bed_bed_file.write( '\n'.join( geneMark_bed ) )
  162. geneMark_bed_bed_file.close()
  163. def __main__():
  164. start_time = time.time()
  165. base_dir = os.path.join( os.getcwd(), "bacteria" )
  166. try:
  167. base_dir = sys.argv[1]
  168. except:
  169. print "using default base_dir:", base_dir
  170. try:
  171. os.mkdir( base_dir )
  172. print "path '%s' has been created" % base_dir
  173. except:
  174. print "path '%s' seems to already exist" % base_dir
  175. for org_num, name, chroms, kingdom, group, org_genbank, org_refseq, info_url, ftp_url in iter_genome_projects():
  176. if chroms is None:
  177. continue #No chrom information, we can't really do anything with this organism
  178. #Create org directory, if exists, assume it is done and complete --> skip it
  179. try:
  180. org_dir = os.path.join( base_dir, org_num )
  181. os.mkdir( org_dir )
  182. except:
  183. print "Organism %s already exists on disk, skipping" % org_num
  184. continue
  185. #get ftp contents
  186. ftp_contents = get_ftp_contents( ftp_url )
  187. if ftp_contents is None:
  188. "FTP COMPLETELY FAILED TO LOAD", "org:", org_num, "ftp:", ftp_url
  189. else:
  190. for refseq in chroms:
  191. ftp_result = scrape_ftp( ftp_contents, org_dir, org_num, refseq, ftp_url )
  192. #FTP Files have been Loaded
  193. print "Org:", org_num, "chrom:", refseq, "[", time.time() - start_time, "seconds elapsed. ]"
  194. #Create org info file
  195. info_file = open( os.path.join( org_dir, "%s.info" % org_num ), 'wb+' )
  196. info_file.write("genome project id=%s\n" % org_num )
  197. info_file.write("name=%s\n" % name )
  198. info_file.write("kingdom=%s\n" % kingdom )
  199. info_file.write("group=%s\n" % group )
  200. info_file.write("chromosomes=%s\n" % ",".join( chroms ) )
  201. info_file.write("info url=%s\n" % info_url )
  202. info_file.write("ftp url=%s\n" % ftp_url )
  203. info_file.close()
  204. print "Finished Harvesting", "[", time.time() - start_time, "seconds elapsed. ]"
  205. print "[", ( time.time() - start_time )/60, "minutes. ]"
  206. print "[", ( time.time() - start_time )/60/60, "hours. ]"
  207. if __name__ == "__main__": __main__()