
/batch_submit/pymodules/python2.7/lib/python/statsmodels-0.5.0-py2.7-linux-x86_64.egg/statsmodels/datasets/utils.py

https://gitlab.com/pooja043/Globus_Docker
import sys
import shutil
import pickle
from os import environ
from os import makedirs
from os.path import basename
from os.path import expanduser
from os.path import exists
from os.path import join
from StringIO import StringIO
import time
from urllib2 import urlopen, HTTPError

import numpy as np
from numpy import genfromtxt, array
from pandas import read_csv


class Dataset(dict):
    def __init__(self, **kw):
        # define some default attributes, so pylint can find them
        self.endog = None
        self.exog = None
        self.data = None
        self.names = None
        dict.__init__(self, kw)
        self.__dict__ = self
        # Some datasets have string variables. If you want a raw_data
        # attribute you must create this in the dataset's load function.
        try:  # some datasets have string variables
            self.raw_data = self.data.view((float, len(self.names)))
        except:
            pass

    def __repr__(self):
        return str(self.__class__)
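
# Usage sketch: Dataset subclasses dict and aliases __dict__ to itself, so
# keys and attributes are interchangeable. A minimal illustration (the
# values below are arbitrary placeholders):
#
#   >>> ds = Dataset(endog=[1, 2, 3], exog=[[1.], [2.], [3.]])
#   >>> ds.endog is ds["endog"]
#   True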


def process_recarray(data, endog_idx=0, exog_idx=None, stack=True, dtype=None):
    names = list(data.dtype.names)

    if isinstance(endog_idx, int):
        endog = array(data[names[endog_idx]], dtype=dtype)
        endog_name = names[endog_idx]
        endog_idx = [endog_idx]
    else:
        endog_name = [names[i] for i in endog_idx]
        if stack:
            endog = np.column_stack(data[field] for field in endog_name)
        else:
            endog = data[endog_name]

    if exog_idx is None:
        exog_name = [names[i] for i in xrange(len(names))
                     if i not in endog_idx]
    else:
        exog_name = [names[i] for i in exog_idx]

    if stack:
        exog = np.column_stack(data[field] for field in exog_name)
    else:
        exog = data[exog_name]

    if dtype:
        endog = endog.astype(dtype)
        exog = exog.astype(dtype)

    dataset = Dataset(data=data, names=names, endog=endog, exog=exog,
                      endog_name=endog_name, exog_name=exog_name)
    return dataset
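
# Usage sketch: split a small structured array into endog/exog by column
# position; the array below is a made-up example with column 0 as the
# response variable.
#
#   >>> rec = np.array([(1., 2., 3.), (4., 5., 6.)],
#   ...                dtype=[('y', float), ('x1', float), ('x2', float)])
#   >>> ds = process_recarray(rec, endog_idx=0)
#   >>> ds.endog_name, ds.exog_name
#   ('y', ['x1', 'x2'])
#   >>> ds.exog.shape
#   (2, 2)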


def process_recarray_pandas(data, endog_idx=0, exog_idx=None, dtype=None,
                            index_idx=None):
    from pandas import DataFrame

    data = DataFrame(data, dtype=dtype)
    names = data.columns

    if isinstance(endog_idx, int):
        endog_name = names[endog_idx]
        endog = data[endog_name]
        if exog_idx is None:
            exog = data.drop([endog_name], axis=1)
        else:
            exog = data.filter(names[exog_idx])
    else:
        endog = data.ix[:, endog_idx]
        endog_name = list(endog.columns)
        if exog_idx is None:
            exog = data.drop(endog_name, axis=1)
        elif isinstance(exog_idx, int):
            exog = data.filter([names[exog_idx]])
        else:
            exog = data.filter(names[exog_idx])

    if index_idx is not None:  # NOTE: will have to be improved for dates
        from pandas import Index
        endog.index = Index(data.ix[:, index_idx])
        exog.index = Index(data.ix[:, index_idx])
        data = data.set_index(names[index_idx])

    exog_name = list(exog.columns)
    dataset = Dataset(data=data, names=list(names), endog=endog, exog=exog,
                      endog_name=endog_name, exog_name=exog_name)
    return dataset
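
# Usage sketch: the pandas variant returns endog/exog as pandas objects
# rather than ndarrays; reusing the made-up record array from the sketch
# above:
#
#   >>> ds = process_recarray_pandas(rec, endog_idx=0)
#   >>> ds.endog.name, list(ds.exog.columns)
#   ('y', ['x1', 'x2'])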


def _maybe_reset_index(data):
    """
    All the Rdatasets have the integer row.labels from R if there is no
    real index. Strip this for a zero-based index
    """
    from pandas import Index
    if data.index.equals(Index(range(1, len(data) + 1))):
        data = data.reset_index(drop=True)
    return data
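
# Usage sketch: R data frames carry 1-based row labels, which are dropped in
# favour of a zero-based index; the frame below is a made-up example.
#
#   >>> from pandas import DataFrame
#   >>> df = DataFrame({'x': [10, 20]}, index=[1, 2])
#   >>> _maybe_reset_index(df).index.tolist()
#   [0, 1]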


def _get_cache(cache):
    if cache is False:
        # do not do any caching or load from cache
        cache = None
    elif cache is True:  # use default dir for cache
        cache = get_data_home(None)
    else:
        cache = get_data_home(cache)
    return cache


def _cache_it(data, cache_path):
    if sys.version_info[0] >= 3:
        # for some reason encode("zip") won't work for me in Python 3?
        import zlib
        open(cache_path, "wb").write(zlib.compress(pickle.dumps(data)))
    else:
        open(cache_path, "wb").write(pickle.dumps(data).encode("zip"))


def _open_cache(cache_path):
    if sys.version_info[0] >= 3:
        # NOTE: don't know why but decode('zip') doesn't work on my
        # Python 3 build
        import zlib
        data = zlib.decompress(open(cache_path, 'rb').read())
        data = pickle.loads(data)
    else:
        data = open(cache_path, 'rb').read().decode('zip')
        data = pickle.loads(data)
    return data
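
# Usage sketch: the cache is just a zlib-compressed pickle on disk, so
# _cache_it and _open_cache are inverses; the temporary path and payload
# below are placeholders.
#
#   >>> import os, tempfile
#   >>> path = os.path.join(tempfile.mkdtemp(), "example.zip")
#   >>> _cache_it("payload", path)
#   >>> _open_cache(path)
#   'payload'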


def _urlopen_cached(url, cache):
    """
    Tries to load data from cache location otherwise downloads it. If it
    downloads the data and cache is not None then it will put the downloaded
    data in the cache path.
    """
    from_cache = False
    if cache is not None:
        cache_path = join(cache,
                          url.split("://")[-1].replace('/', ',') + ".zip")
        try:
            data = _open_cache(cache_path)
            from_cache = True
        except:
            pass

    # not using the cache or didn't find it in cache
    if not from_cache:
        data = urlopen(url).read()
        if cache is not None:  # then put it in the cache
            _cache_it(data, cache_path)
    return data, from_cache
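
# Usage sketch: cache file names are derived from the URL by stripping the
# scheme and replacing '/' with ','; the URL below is a placeholder.
#
#   >>> url = "http://example.com/a/b.csv"
#   >>> url.split("://")[-1].replace('/', ',') + ".zip"
#   'example.com,a,b.csv.zip'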


def _get_data(base_url, dataname, cache, extension="csv"):
    url = base_url + (dataname + ".%s") % extension
    try:
        data, from_cache = _urlopen_cached(url, cache)
    except HTTPError, err:
        if '404' in str(err):
            raise ValueError("Dataset %s was not found." % dataname)
        else:
            raise err

    # Python 3, don't think there will be any unicode in r datasets
    if sys.version[0] == '3':  # pragma: no cover
        data = data.decode('ascii', errors='strict')
    return StringIO(data), from_cache


def _get_dataset_meta(dataname, package, cache):
    # get the index, you'll probably want this cached because you have
    # to download info about all the data to get info about any of the data...
    index_url = ("https://raw.github.com/vincentarelbundock/Rdatasets/master/"
                 "datasets.csv")
    data, _ = _urlopen_cached(index_url, cache)
    # Python 3
    if sys.version[0] == '3':  # pragma: no cover
        data = data.decode('ascii', errors='strict')
    index = read_csv(StringIO(data))
    idx = np.logical_and(index.Item == dataname, index.Package == package)
    dataset_meta = index.ix[idx]
    return dataset_meta["Title"].item()


def get_rdataset(dataname, package="datasets", cache=False):
    """download and return R dataset

    Parameters
    ----------
    dataname : str
        The name of the dataset you want to download
    package : str
        The package in which the dataset is found. The default is the core
        'datasets' package.
    cache : bool or str
        If True, will download this data into the STATSMODELS_DATA folder.
        The default location is a folder called statsmodels_data in the
        user home folder. Otherwise, you can specify a path to a folder to
        use for caching the data. If False, the data will not be cached.

    Returns
    -------
    dataset : Dataset instance
        A `statsmodels.data.utils.Dataset` instance. This object has
        attributes::

        * data - A pandas DataFrame containing the data
        * title - The dataset title
        * package - The package from which the data came
        * from_cache - Whether or not cached data was retrieved
        * __doc__ - The verbatim R documentation.

    Notes
    -----
    If the R dataset has an integer index, it is reset to be zero-based.
    Otherwise the index is preserved. The caching facilities are dumb. That
    is, no download dates, e-tags, or other identifying information
    is checked to see if the data should be downloaded again. If the
    dataset is in the cache, it's used.
    """
    # NOTE: use raw github bc html site might not be most up to date
    data_base_url = ("https://raw.github.com/vincentarelbundock/Rdatasets/"
                     "master/csv/" + package + "/")
    docs_base_url = ("https://raw.github.com/vincentarelbundock/Rdatasets/"
                     "master/doc/" + package + "/rst/")
    cache = _get_cache(cache)
    data, from_cache = _get_data(data_base_url, dataname, cache)
    data = read_csv(data, index_col=0)
    data = _maybe_reset_index(data)

    title = _get_dataset_meta(dataname, package, cache)

    doc, _ = _get_data(docs_base_url, dataname, cache, "rst")

    return Dataset(data=data, __doc__=doc.read(), package=package, title=title,
                   from_cache=from_cache)
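
# Usage sketch: a typical call, assuming the 'Duncan' dataset from the R
# 'car' package is listed in the Rdatasets index (needs network access the
# first time; cache=True stores it under the statsmodels data home).
#
#   >>> duncan = get_rdataset("Duncan", package="car", cache=True)
#   >>> duncan.data.head()         # pandas DataFrame
#   >>> print(duncan.__doc__)      # verbatim R documentation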


### The functions below were taken from sklearn
def get_data_home(data_home=None):
    """Return the path of the statsmodels data dir.

    This folder is used by some large dataset loaders to avoid
    downloading the data several times.

    By default the data dir is set to a folder named 'statsmodels_data'
    in the user home folder.

    Alternatively, it can be set by the 'STATSMODELS_DATA' environment
    variable or programmatically by giving an explicit folder path. The
    '~' symbol is expanded to the user home folder.

    If the folder does not already exist, it is automatically created.
    """
    if data_home is None:
        data_home = environ.get('STATSMODELS_DATA',
                                join('~', 'statsmodels_data'))
    data_home = expanduser(data_home)
    if not exists(data_home):
        makedirs(data_home)
    return data_home
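
# Usage sketch: the cache directory is resolved from the explicit argument,
# then the STATSMODELS_DATA environment variable, then ~/statsmodels_data;
# the path below is a placeholder.
#
#   >>> get_data_home()              # defaults to ~/statsmodels_data
#   >>> get_data_home("~/my_cache")  # '~' is expanded and the dir is created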


def clear_data_home(data_home=None):
    """Delete all the content of the data home cache."""
    data_home = get_data_home(data_home)
    shutil.rmtree(data_home)