PageRenderTime 99ms CodeModel.GetById 73ms app.highlight 20ms RepoModel.GetById 1ms app.codeStats 0ms

/statsmodels/tsa/base/tsa_model.py

http://github.com/statsmodels/statsmodels
Python | 651 lines | 647 code | 4 blank | 0 comment | 0 complexity | 5488cfee83cb8c0e8ae8b900799d346c MD5 | raw file
  1from statsmodels.compat.pandas import is_numeric_dtype
  2
  3import numbers
  4
  5import warnings
  6import numpy as np
  7from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  8                    PeriodIndex, RangeIndex, Timestamp, Series, Index,
  9                    Float64Index, date_range, period_range)
 10from pandas.tseries.frequencies import to_offset
 11
 12from statsmodels.base import data
 13import statsmodels.base.model as base
 14import statsmodels.base.wrapper as wrap
 15from statsmodels.tools.sm_exceptions import ValueWarning
 16
# Docstring template for time-series model classes. The %(model)s,
# %(params)s, %(extra_params)s and %(extra_sections)s placeholders are
# filled in with the ``%`` operator and a dict (see the ``__doc__``
# assignment in ``TimeSeriesModel``).
_tsa_doc = """
    %(model)s

    Parameters
    ----------
    %(params)s
    dates : array_like, optional
        An array-like object of datetime objects. If a pandas object is given
        for endog or exog, it is assumed to have a DateIndex.
    freq : str, optional
        The frequency of the time-series. A Pandas offset or 'B', 'D', 'W',
        'M', 'A', or 'Q'. This is optional if dates are given.
    %(extra_params)s
    %(extra_sections)s"""

# One-line summary substituted for %(model)s in the base class docstring.
_model_doc = "Timeseries model base class"

# Parameter documentation fragments re-used from statsmodels.base.model;
# they are substituted for %(params)s and %(extra_params)s respectively.
_generic_params = base._model_params_doc
_missing_param_doc = base._missing_param_doc
 36
 37
 38class TimeSeriesModel(base.LikelihoodModel):
 39
 40    __doc__ = _tsa_doc % {"model": _model_doc, "params": _generic_params,
 41                          "extra_params": _missing_param_doc,
 42                          "extra_sections": ""}
 43
 44    def __init__(self, endog, exog=None, dates=None, freq=None,
 45                 missing='none', **kwargs):
 46        super(TimeSeriesModel, self).__init__(endog, exog, missing=missing,
 47                                              **kwargs)
 48
 49        # Date handling in indexes
 50        self._init_dates(dates, freq)
 51
 52    def _init_dates(self, dates=None, freq=None):
 53        """
 54        Initialize dates
 55
 56        Parameters
 57        ----------
 58        dates : array_like, optional
 59            An array like object containing dates.
 60        freq : str, tuple, datetime.timedelta, DateOffset or None, optional
 61            A frequency specification for either `dates` or the row labels from
 62            the endog / exog data.
 63
 64        Notes
 65        -----
 66        Creates `self._index` and related attributes. `self._index` is always
 67        a Pandas index, and it is always Int64Index, DatetimeIndex, or
 68        PeriodIndex.
 69
 70        If Pandas objects, endog / exog may have any type of index. If it is
 71        an Int64Index with values 0, 1, ..., nobs-1 or if it is (coerceable to)
 72        a DatetimeIndex or PeriodIndex *with an associated frequency*, then it
 73        is called a "supported" index. Otherwise it is called an "unsupported"
 74        index.
 75
 76        Supported indexes are standardized (i.e. a list of date strings is
 77        converted to a DatetimeIndex) and the result is put in `self._index`.
 78
 79        Unsupported indexes are ignored, and a supported Int64Index is
 80        generated and put in `self._index`. Warnings are issued in this case
 81        to alert the user if the returned index from some operation (e.g.
 82        forecasting) is different from the original data's index. However,
 83        whenever possible (e.g. purely in-sample prediction), the original
 84        index is returned.
 85
 86        The benefit of supported indexes is that they allow *forecasting*, i.e.
 87        it is possible to extend them in a reasonable way. Thus every model
 88        must have an underlying supported index, even if it is just a generated
 89        Int64Index.
 90        """
 91
 92        # Get our index from `dates` if available, otherwise from whatever
 93        # Pandas index we might have retrieved from endog, exog
 94        if dates is not None:
 95            index = dates
 96        else:
 97            index = self.data.row_labels
 98
 99        # Sanity check that we do not have a `freq` without an index
100        if index is None and freq is not None:
101            raise ValueError('Frequency provided without associated index.')
102
103        # If an index is available, see if it is a date-based index or if it
104        # can be coerced to one. (If it cannot we'll fall back, below, to an
105        # internal, 0, 1, ... nobs-1 integer index for modeling purposes)
106        inferred_freq = False
107        if index is not None:
108            # Try to coerce to date-based index
109            if not isinstance(index, (DatetimeIndex, PeriodIndex)):
110                try:
111                    # Only try to coerce non-numeric index types (string,
112                    # list of date-times, etc.)
113                    # Note that np.asarray(Float64Index([...])) yields an
114                    # object dtype array in earlier versions of Pandas (and so
115                    # will not have is_numeric_dtype == True), so explicitly
116                    # check for it here. But note also that in very early
117                    # Pandas (~0.12), Float64Index does not exist (and so the
118                    # statsmodels compat makes it an empty tuple, so in that
119                    # case also check if the first element is a float.
120                    _index = np.asarray(index)
121                    if (is_numeric_dtype(_index) or
122                            isinstance(index, Float64Index) or
123                            (Float64Index == tuple() and
124                             isinstance(_index[0], float))):
125                        raise ValueError('Numeric index given')
126                    # If a non-index Pandas series was given, only keep its
127                    # values (because we must have a pd.Index type, below, and
128                    # pd.to_datetime will return a Series when passed
129                    # non-list-like objects)
130                    if isinstance(index, Series):
131                        index = index.values
132                    # All coercion is done via pd.to_datetime
133                    # Note: date coercion via pd.to_datetime does not handle
134                    # string versions of PeriodIndex objects most of the time.
135                    _index = to_datetime(index)
136                    # Older versions of Pandas can sometimes fail here and
137                    # return a numpy array - check to make sure it's an index
138                    if not isinstance(_index, Index):
139                        raise ValueError('Could not coerce to date index')
140                    index = _index
141                except:
142                    # Only want to actually raise an exception if `dates` was
143                    # provided but cannot be coerced. If we got the index from
144                    # the row_labels, we'll just ignore it and use the integer
145                    # index below
146                    if dates is not None:
147                        raise ValueError('Non-date index index provided to'
148                                         ' `dates` argument.')
149            # Now, if we were given, or coerced, a date-based index, make sure
150            # it has an associated frequency
151            if isinstance(index, (DatetimeIndex, PeriodIndex)):
152                # If no frequency, try to get an inferred frequency
153                if freq is None and index.freq is None:
154                    freq = index.inferred_freq
155                    # If we got an inferred frequncy, alert the user
156                    if freq is not None:
157                        inferred_freq = True
158                        if freq is not None:
159                            warnings.warn('No frequency information was'
160                                          ' provided, so inferred frequency %s'
161                                          ' will be used.'
162                                          % freq, ValueWarning)
163
164                # Convert the passed freq to a pandas offset object
165                if freq is not None:
166                    freq = to_offset(freq)
167
168                # Now, if no frequency information is available from the index
169                # itself or from the `freq` argument, raise an exception
170                if freq is None and index.freq is None:
171                    # But again, only want to raise the exception if `dates`
172                    # was provided.
173                    if dates is not None:
174                        raise ValueError('No frequency information was'
175                                         ' provided with date index and no'
176                                         ' frequency could be inferred.')
177                # However, if the index itself has no frequency information but
178                # the `freq` argument is available (or was inferred), construct
179                # a new index with an associated frequency
180                elif freq is not None and index.freq is None:
181                    resampled_index = date_range(
182                        start=index[0], end=index[-1], freq=freq)
183                    if not inferred_freq and not resampled_index.equals(index):
184                        raise ValueError('The given frequency argument could'
185                                         ' not be matched to the given index.')
186                    index = resampled_index
187                # Finally, if the index itself has a frequency and there was
188                # also a given frequency, raise an exception if they are not
189                # equal
190                elif (freq is not None and not inferred_freq and
191                        not (index.freq == freq)):
192                    raise ValueError('The given frequency argument is'
193                                     ' incompatible with the given index.')
194            # Finally, raise an exception if we could not coerce to date-based
195            # but we were given a frequency argument
196            elif freq is not None:
197                raise ValueError('Given index could not be coerced to dates'
198                                 ' but `freq` argument was provided.')
199
200        # Get attributes of the index
201        has_index = index is not None
202        date_index = isinstance(index, (DatetimeIndex, PeriodIndex))
203        period_index = isinstance(index, PeriodIndex)
204        int_index = isinstance(index, Int64Index)
205        range_index = isinstance(index, RangeIndex)
206        has_freq = index.freq is not None if date_index else None
207        increment = Index(range(self.endog.shape[0]))
208        is_increment = index.equals(increment) if int_index else None
209        is_monotonic = index.is_monotonic if date_index else None
210
211        # Issue warnings for unsupported indexes
212        if has_index and not (date_index or range_index or is_increment):
213            warnings.warn('An unsupported index was provided and will be'
214                          ' ignored when e.g. forecasting.', ValueWarning)
215        if date_index and not has_freq:
216            warnings.warn('A date index has been provided, but it has no'
217                          ' associated frequency information and so will be'
218                          ' ignored when e.g. forecasting.', ValueWarning)
219        if date_index and not is_monotonic:
220            warnings.warn('A date index has been provided, but it is not'
221                          ' monotonic and so will be ignored when e.g.'
222                          ' forecasting.', ValueWarning)
223
224        # Construct the internal index
225        index_generated = False
226        valid_index = ((date_index and has_freq and is_monotonic) or
227                       (int_index and is_increment) or range_index)
228
229        if valid_index:
230            _index = index
231        else:
232            _index = increment
233            index_generated = True
234        self._index = _index
235        self._index_generated = index_generated
236        self._index_none = index is None
237        self._index_int64 = int_index and not range_index and not date_index
238        self._index_dates = date_index and not index_generated
239        self._index_freq = self._index.freq if self._index_dates else None
240        self._index_inferred_freq = inferred_freq
241
242        # For backwards compatibility, set data.dates, data.freq
243        self.data.dates = self._index if self._index_dates else None
244        self.data.freq = self._index.freqstr if self._index_dates else None
245
246    def _get_index_loc(self, key, base_index=None):
247        """
248        Get the location of a specific key in an index
249
250        Parameters
251        ----------
252        key : label
253            The key for which to find the location if the underlying index is
254            a DateIndex or a location if the underlying index is a RangeIndex
255            or an Int64Index.
256        base_index : pd.Index, optional
257            Optionally the base index to search. If None, the model's index is
258            searched.
259
260        Returns
261        -------
262        loc : int
263            The location of the key
264        index : pd.Index
265            The index including the key; this is a copy of the original index
266            unless the index had to be expanded to accommodate `key`.
267        index_was_expanded : bool
268            Whether or not the index was expanded to accommodate `key`.
269
270        Notes
271        -----
272        If `key` is past the end of of the given index, and the index is either
273        an Int64Index or a date index, this function extends the index up to
274        and including key, and then returns the location in the new index.
275        """
276        if base_index is None:
277            base_index = self._index
278
279        index = base_index
280        date_index = isinstance(base_index, (PeriodIndex, DatetimeIndex))
281        int_index = isinstance(base_index, Int64Index)
282        range_index = isinstance(base_index, RangeIndex)
283        index_class = type(base_index)
284        nobs = len(index)
285
286        # Special handling for RangeIndex
287        if range_index and isinstance(key, (int, np.integer)):
288            # Negative indices (that lie in the Index)
289            if key < 0 and -key <= nobs:
290                key = nobs + key
291            # Out-of-sample (note that we include key itself in the new index)
292            elif key > nobs - 1:
293                # See gh5835. Remove the except after pandas 0.25 required.
294                try:
295                    base_index_start = base_index.start
296                    base_index_step = base_index.step
297                except AttributeError:
298                    base_index_start = base_index._start
299                    base_index_step = base_index._step
300                stop = base_index_start + (key + 1) * base_index_step
301                index = RangeIndex(start=base_index_start,
302                                   stop=stop,
303                                   step=base_index_step)
304
305        # Special handling for Int64Index
306        if (not range_index and int_index and not date_index and
307                isinstance(key, (int, np.integer))):
308            # Negative indices (that lie in the Index)
309            if key < 0 and -key <= nobs:
310                key = nobs + key
311            # Out-of-sample (note that we include key itself in the new index)
312            elif key > base_index[-1]:
313                index = Int64Index(np.arange(base_index[0], int(key + 1)))
314
315        # Special handling for date indexes
316        if date_index:
317            # Use index type to choose creation function
318            if index_class is DatetimeIndex:
319                index_fn = date_range
320            else:
321                index_fn = period_range
322            # Integer key (i.e. already given a location)
323            if isinstance(key, (int, np.integer)):
324                # Negative indices (that lie in the Index)
325                if key < 0 and -key < nobs:
326                    key = index[nobs + key]
327                # Out-of-sample (note that we include key itself in the new
328                # index)
329                elif key > len(base_index) - 1:
330                    index = index_fn(start=base_index[0],
331                                     periods=int(key + 1),
332                                     freq=base_index.freq)
333                    key = index[-1]
334                else:
335                    key = index[key]
336            # Other key types (i.e. string date or some datetime-like object)
337            else:
338                # Covert the key to the appropriate date-like object
339                if index_class is PeriodIndex:
340                    date_key = Period(key, freq=base_index.freq)
341                else:
342                    date_key = Timestamp(key, freq=base_index.freq)
343
344                # Out-of-sample
345                if date_key > base_index[-1]:
346                    # First create an index that may not always include `key`
347                    index = index_fn(start=base_index[0], end=date_key,
348                                     freq=base_index.freq)
349
350                    # Now make sure we include `key`
351                    if not index[-1] == date_key:
352                        index = index_fn(start=base_index[0],
353                                         periods=len(index) + 1,
354                                         freq=base_index.freq)
355
356                    # To avoid possible inconsistencies with `get_loc` below,
357                    # set the key directly equal to the last index location
358                    key = index[-1]
359
360        # Get the location
361        if date_index:
362            # (note that get_loc will throw a KeyError if key is invalid)
363            loc = index.get_loc(key)
364        elif int_index or range_index:
365            # For Int64Index and RangeIndex, key is assumed to be the location
366            # and not an index value (this assumption is required to support
367            # RangeIndex)
368            try:
369                index[key]
370            # We want to raise a KeyError in this case, to keep the exception
371            # consistent across index types.
372            # - Attempting to index with an out-of-bound location (e.g.
373            #   index[10] on an index of length 9) will raise an IndexError
374            #   (as of Pandas 0.22)
375            # - Attemtping to index with a type that cannot be cast to integer
376            #   (e.g. a non-numeric string) will raise a ValueError if the
377            #   index is RangeIndex (otherwise will raise an IndexError)
378            #   (as of Pandas 0.22)
379            except (IndexError, ValueError) as e:
380                raise KeyError(str(e))
381            loc = key
382        else:
383            loc = index.get_loc(key)
384
385        # Check if we now have a modified index
386        index_was_expanded = index is not base_index
387
388        # Return the index through the end of the loc / slice
389        if isinstance(loc, slice):
390            end = loc.stop - 1
391        else:
392            end = loc
393
394        return loc, index[:end + 1], index_was_expanded
395
396    def _get_index_label_loc(self, key, base_index=None):
397        """
398        Get the location of a specific key in an index or model row labels
399
400        Parameters
401        ----------
402        key : label
403            The key for which to find the location if the underlying index is
404            a DateIndex or is only being used as row labels, or a location if
405            the underlying index is a RangeIndex or an Int64Index.
406        base_index : pd.Index, optional
407            Optionally the base index to search. If None, the model's index is
408            searched.
409
410        Returns
411        -------
412        loc : int
413            The location of the key
414        index : pd.Index
415            The index including the key; this is a copy of the original index
416            unless the index had to be expanded to accommodate `key`.
417        index_was_expanded : bool
418            Whether or not the index was expanded to accommodate `key`.
419
420        Notes
421        -----
422        This method expands on `_get_index_loc` by first trying the given
423        base index (or the model's index if the base index was not given) and
424        then falling back to try again with the model row labels as the base
425        index.
426        """
427        try:
428            loc, index, index_was_expanded = (
429                self._get_index_loc(key, base_index))
430        except KeyError as e:
431            try:
432                if not isinstance(key, (int, np.integer)):
433                    loc = self.data.row_labels.get_loc(key)
434                else:
435                    raise
436                # Require scalar
437                # Pandas may return a slice if there are multiple matching
438                # locations that are monotonic increasing (otherwise it may
439                # return an array of integer locations, see below).
440                if isinstance(loc, slice):
441                    loc = loc.start
442                if isinstance(loc, np.ndarray):
443                    # Pandas may return a mask (boolean array), for e.g.:
444                    # pd.Index(list('abcb')).get_loc('b')
445                    if loc.dtype == bool:
446                        # Return the first True value
447                        # (we know there is at least one True value if we're
448                        # here because otherwise the get_loc call would have
449                        # raised an exception)
450                        loc = np.argmax(loc)
451                    # Finally, Pandas may return an integer array of
452                    # locations that match the given value, for e.g.
453                    # pd.DatetimeIndex(['2001-02', '2001-01']).get_loc('2001')
454                    # (this appears to be slightly undocumented behavior, since
455                    # only int, slice, and mask are mentioned in docs for
456                    # pandas.Index.get_loc as of 0.23.4)
457                    else:
458                        loc = loc[0]
459                if not isinstance(loc, numbers.Integral):
460                    raise
461
462                index = self.data.row_labels[:loc + 1]
463                index_was_expanded = False
464            except:
465                raise e
466        return loc, index, index_was_expanded
467
468    def _get_prediction_index(self, start, end, index=None, silent=False):
469        """
470        Get the location of a specific key in an index or model row labels
471
472        Parameters
473        ----------
474        start : label
475            The key at which to start prediction. Depending on the underlying
476            model's index, may be an integer, a date (string, datetime object,
477            pd.Timestamp, or pd.Period object), or some other object in the
478            model's row labels.
479        end : label
480            The key at which to end prediction (note that this key will be
481            *included* in prediction). Depending on the underlying
482            model's index, may be an integer, a date (string, datetime object,
483            pd.Timestamp, or pd.Period object), or some other object in the
484            model's row labels.
485        index : pd.Index, optional
486            Optionally an index to associate the predicted results to. If None,
487            an attempt is made to create an index for the predicted results
488            from the model's index or model's row labels.
489        silent : bool, optional
490            Argument to silence warnings.
491
492        Returns
493        -------
494        start : int
495            The index / observation location at which to begin prediction.
496        end : int
497            The index / observation location at which to end in-sample
498            prediction. The maximum value for this is nobs-1.
499        out_of_sample : int
500            The number of observations to forecast after the end of the sample.
501        prediction_index : pd.Index or None
502            The index associated with the prediction results. This index covers
503            the range [start, end + out_of_sample]. If the model has no given
504            index and no given row labels (i.e. endog/exog is not Pandas), then
505            this will be None.
506
507        Notes
508        -----
509        The arguments `start` and `end` behave differently, depending on if
510        they are integer or not. If either is an integer, then it is assumed
511        to refer to a *location* in the index, not to an index value. On the
512        other hand, if it is a date string or some other type of object, then
513        it is assumed to refer to an index *value*. In all cases, the returned
514        `start` and `end` values refer to index *locations* (so in the former
515        case, the given location is validated and returned whereas in the
516        latter case a location is found that corresponds to the given index
517        value).
518
519        This difference in behavior is necessary to support `RangeIndex`. This
520        is because integers for a RangeIndex could refer either to index values
521        or to index locations in an ambiguous way (while for `Int64Index`,
522        since we have required them to be full indexes, there is no ambiguity).
523        """
524
525        # Convert index keys (start, end) to index locations and get associated
526        # indexes.
527        try:
528            start, start_index, start_oos = self._get_index_label_loc(start)
529        except KeyError:
530            raise KeyError('The `start` argument could not be matched to a'
531                           ' location related to the index of the data.')
532        if end is None:
533            end = max(start, len(self._index) - 1)
534        try:
535            end, end_index, end_oos = self._get_index_label_loc(end)
536        except KeyError:
537            raise KeyError('The `end` argument could not be matched to a'
538                           ' location related to the index of the data.')
539
540        # Handle slices (if the given index keys cover more than one date)
541        if isinstance(start, slice):
542            start = start.start
543        if isinstance(end, slice):
544            end = end.stop - 1
545
546        # Get the actual index for the prediction
547        prediction_index = end_index[start:]
548
549        # Validate prediction options
550        if end < start:
551            raise ValueError('Prediction must have `end` after `start`.')
552
553        # Handle custom prediction index
554        # First, if we were given an index, check that it's the right size and
555        # use it if so
556        if index is not None:
557            if not len(prediction_index) == len(index):
558                raise ValueError('Invalid `index` provided in prediction.'
559                                 ' Must have length consistent with `start`'
560                                 ' and `end` arguments.')
561            # But if we weren't given Pandas input, this index will not be
562            # used because the data will not be wrapped; in that case, issue
563            # a warning
564            if not isinstance(self.data, data.PandasData) and not silent:
565                warnings.warn('Because the model data (`endog`, `exog`) were'
566                              ' not given as Pandas objects, the prediction'
567                              ' output will be Numpy arrays, and the given'
568                              ' `index` argument will only be used'
569                              ' internally.', ValueWarning)
570            prediction_index = Index(index)
571        # Now, if we *do not* have a supported index, but we were given some
572        # kind of index...
573        elif self._index_generated and not self._index_none:
574            # If we are in sample, and have row labels, use them
575            if self.data.row_labels is not None and not (start_oos or end_oos):
576                prediction_index = self.data.row_labels[start:end + 1]
577            # Otherwise, warn the user that they will get an Int64Index
578            else:
579                if not silent:
580                    warnings.warn('No supported index is available.'
581                                  ' Prediction results will be given with'
582                                  ' an integer index beginning at `start`.',
583                                  ValueWarning)
584                warnings.warn('No supported index is available. In the next'
585                              ' version, calling this method in a model'
586                              ' without a supported index will result in an'
587                              ' exception.', DeprecationWarning)
588        elif self._index_none:
589            prediction_index = None
590
591        # For backwards compatibility, set `predict_*` values
592        if prediction_index is not None:
593            self.data.predict_start = prediction_index[0]
594            self.data.predict_end = prediction_index[-1]
595            self.data.predict_dates = prediction_index
596        else:
597            self.data.predict_start = None
598            self.data.predict_end = None
599            self.data.predict_dates = None
600
601        # Compute out-of-sample observations
602        nobs = len(self.endog)
603        out_of_sample = max(end - (nobs - 1), 0)
604        end -= out_of_sample
605
606        return start, end, out_of_sample, prediction_index
607
608    def _get_exog_names(self):
609        return self.data.xnames
610
611    def _set_exog_names(self, vals):
612        if not isinstance(vals, list):
613            vals = [vals]
614        self.data.xnames = vals
615
616    # overwrite with writable property for (V)AR models
617    exog_names = property(_get_exog_names, _set_exog_names, None,
618                          'The names of the exogenous variables.')
619
620
class TimeSeriesModelResults(base.LikelihoodModelResults):
    """
    Results class for time-series models.

    Keeps a reference to the model's data object in `self.data` and passes
    all constructor arguments on to the parent results class.
    """

    def __init__(self, model, params, normalized_cov_params, scale=1.):
        # The data reference is attached before the parent initializer runs
        self.data = model.data
        parent = super(TimeSeriesModelResults, self)
        parent.__init__(model, params, normalized_cov_params, scale)
626
627
class TimeSeriesResultsWrapper(wrap.ResultsWrapper):
    # No wrapped attributes of its own: the attribute map is the one of the
    # likelihood results wrapper merged with an empty dict
    _attrs = {}
    _wrap_attrs = wrap.union_dicts(
        base.LikelihoodResultsWrapper._wrap_attrs, _attrs)
    # `predict` is registered with the 'dates' key, in addition to the
    # methods of the likelihood results wrapper
    _methods = {'predict': 'dates'}
    _wrap_methods = wrap.union_dicts(
        base.LikelihoodResultsWrapper._wrap_methods, _methods)


# Associate the wrapper class with the results class it wraps
wrap.populate_wrapper(TimeSeriesResultsWrapper, TimeSeriesModelResults)
637
638
if __name__ == "__main__":
    import pandas
    import statsmodels.api as sm

    # Load the macrodata dataset without conversion to pandas objects
    mdata = sm.datasets.macrodata.load(as_pandas=False)

    # Make a DataFrame whose row labels are "<year>:<quarter>" strings
    # TODO: attach a DataFrame to some of the datasets, for quicker use
    dates = ['%d:%d' % (int(x[0]), int(x[1]))
             for x in mdata.data[['year', 'quarter']]]

    df = pandas.DataFrame(mdata.data[['realgdp', 'realinv', 'realcons']],
                          index=dates)
    ex_mod = TimeSeriesModel(df)