/finalSpliceGraph/DownloadData.py

https://github.com/JasonAng/ResearchScripts · Python · 150 lines · 104 code · 27 blank · 19 comment · 44 complexity · 94450e4b5fed95f416b32fc0c87d2846 MD5 · raw file

  1. #---------------------------------------------------------------------------------------
  2. # $Id: DownloadData.py,v 1.17 2005/10/14 20:56:31 stadler Exp $
  3. #---------------------------------------------------------------------------------------
  4. """Framework for downloading files from the internet
  5. Synopsis:
  6. import DownloadData
  7. slurp = DownloadData.DownloadData()
  8. slurp.getAllForSpecies("hsa")
  9. slurp.getAllForAll()
  10. """
  11. import sys, os, re, subprocess, configuration, urllib
  12. class DownloadData:
  13. "DownloadData object"
  14. chromosomePat = re.compile('\$c\$')
  15. ESTsuffixPat = re.compile('\$e\$')
  16. compressSuffix = '.gz'
  17. def __init__(self, conf=None, replace=True, log=sys.stderr, gunzip=True):
  18. if conf is not None and isinstance(conf, configuration.Configuration):
  19. self.conf = conf
  20. elif conf is not None and os.path.exists(conf):
  21. self.conf = configuration.Configuration(filename=conf)
  22. else:
  23. self.conf = configuration.Configuration()
  24. self.replace = bool(replace)
  25. self.species = None
  26. self.log = log
  27. self.gunzip = gunzip
  28. self.tablenames = {} #format: self.tablenames[species][tablename] = dir_with_sql_and_data_files
  29. def __getURL(self, url, path):
  30. "Get file from URL and store locally --> boolean"
  31. if self.replace or \
  32. (not path.endswith(self.compressSuffix) and not os.path.exists(path)) or \
  33. ( path.endswith(self.compressSuffix) and not self.gunzip and not os.path.exists(path)) or \
  34. ( path.endswith(self.compressSuffix) and self.gunzip and not os.path.exists(path[0:-len(self.compressSuffix)])):
  35. try:
  36. if os.path.exists(path):
  37. os.remove(path)
  38. urllib.urlretrieve(url, path)
  39. except:
  40. self.log.write("\tERROR downloading %s: %s\n" % (url, sys.exc_info()[1]))
  41. self.log.flush()
  42. return False
  43. else:
  44. os.chmod(path, 0660)
  45. if self.gunzip and path.endswith('.gz'):
  46. self.__gunzip(path)
  47. #retain table name and path for later use
  48. if path.endswith('.sql') and not self.tablenames[self.species].has_key(os.path.basename(path)[0:-4]):
  49. self.tablenames[self.species][os.path.basename(path)[0:-4]] = os.path.dirname(path)
  50. self.log.write("\tsuccess downloading %s\n" % url)
  51. self.log.flush()
  52. return True
  53. else:
  54. self.log.write("\tskipping existing %s\n" % url)
  55. self.log.flush()
  56. return True
  57. def __gunzip(self, path):
  58. "Uncompress a local file --> boolean"
  59. retcode = subprocess.Popen(['gunzip','-q',path]).wait()
  60. return True
  61. def getAllForSpecies(self, species):
  62. "Download all data for a given species --> boolean"
  63. result = True
  64. self.tablenames[species] = {}
  65. if(species in self.conf.getSpeciesList()):
  66. self.species = species
  67. for file in self.conf[species]["RawDataFiles"].split(","):
  68. print file, '1'
  69. file = self.ESTsuffixPat.sub(self.conf[species]["ESTsuffix"], file)
  70. if self.chromosomePat.match(file):
  71. print file, '2'
  72. for chr in self.conf[species]["Chromosomes"].split(","):
  73. newfile = self.chromosomePat.sub(chr, file)
  74. result = self.__getURL("%s/%s" % (self.conf[species]["RawDataDownloadURL"],newfile),
  75. "%s/%s" % (self.conf[species]["RawDataStoragePath"],newfile)) and result
  76. ## if newfile.endswith('.sql') and not self.tablenames[species].has_key(newfile[0:-4]):
  77. ## self.tablenames[species][newfile[0:-4]] = self.conf[species]["RawDataStoragePath"]
  78. else:
  79. print file, '2here'
  80. result = self.__getURL("%s/%s" % (self.conf[species]["RawDataDownloadURL"],file),
  81. "%s/%s" % (self.conf[species]["RawDataStoragePath"],file)) and result
  82. ## if file.endswith('.sql') and not self.tablenames[species].has_key(file[0:-4]):
  83. ## self.tablenames[species][file[0:-4]] = self.conf[species]["RawDataStoragePath"]
  84. if self.conf[species].has_key("MetaDataStoragePath") and \
  85. self.conf[species].has_key("MetaDataFiles"):
  86. for url in self.conf[species]["MetaDataFiles"].split(","):
  87. url = self.ESTsuffixPat.sub(self.conf[species]["ESTsuffix"], url)
  88. if self.chromosomePat.match(url):
  89. for chr in self.conf[species]["Chromosomes"].split(","):
  90. newurl = chromosomePat.sub(chr, url)
  91. file = "%s/%s" % (self.conf[species]["MetaDataStoragePath"],os.path.basename(newurl))
  92. result = self.__getURL(newurl, file) and result
  93. ## if os.path.basename(newurl).endswith('.sql') and not self.tablenames[species].has_key(os.path.basename(newurl)[0:-4]):
  94. ## self.tablenames[species][os.path.basename(newurl)[0:-4]] = self.conf[species]["MetaDataStoragePath"]
  95. else:
  96. file = "%s/%s" % (self.conf[species]["MetaDataStoragePath"],os.path.basename(url))
  97. result = self.__getURL(url, file) and result
  98. ## if os.path.basename(url).endswith('.sql') and not self.tablenames[species].has_key(os.path.basename(url)[0:-4]):
  99. ## self.tablenames[species][os.path.basename(url)[0:-4]] = self.conf[species]["MetaDataStoragePath"]
  100. else:
  101. result = False
  102. self.species = None
  103. return result
  104. def getAllForAll(self):
  105. "Download all data for all species --> boolean"
  106. result = True
  107. for species in self.conf.getSpeciesList():
  108. result = self.getAllForSpecies(species) and result
  109. return result
  110. if __name__ == "__main__":
  111. import DownloadData
  112. slurp = DownloadData.DownloadData()
  113. print >> sys.stdout, DownloadData.__doc__
  114. print >> sys.stdout
  115. print >> sys.stdout, "Download all data as specified in %s (y/n)?" % slurp.conf.configFile()
  116. if(sys.stdin.readline().startswith("y")):
  117. if slurp.getAllForAll():
  118. print >> sys.stderr, "download finished successfully."
  119. else:
  120. print >> sys.stderr, "download finished (WARNING: there were errors)."