/pandas/tseries/tools.py
Python | 593 lines | 441 code | 79 blank | 73 comment | 115 complexity | 478241ad239204c1d94119b029f4ffe9 MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
- from datetime import datetime, timedelta
- import re
- import sys
- import numpy as np
- import pandas.lib as lib
- import pandas.tslib as tslib
- import pandas.core.common as com
- from pandas.compat import StringIO, callable
- import pandas.compat as compat
- try:
- import dateutil
- from dateutil.parser import parse, DEFAULTPARSER
- from dateutil.relativedelta import relativedelta
- # raise exception if dateutil 2.0 install on 2.x platform
- if (sys.version_info[0] == 2 and
- dateutil.__version__ == '2.0'): # pragma: no cover
- raise Exception('dateutil 2.0 incompatible with Python 2.x, you must '
- 'install version 1.5 or 2.1+!')
- except ImportError: # pragma: no cover
- print('Please install python-dateutil via easy_install or some method!')
- raise # otherwise a 2nd import won't show the message
# Optional tokenizer borrowed from dateutil's private lexer.  It stays None
# whenever the private API is missing, in which case format guessing is
# simply disabled instead of failing at import time.
_DATEUTIL_LEXER_SPLIT = None

try:
    # _timelex is private to dateutil, so its absence must be tolerated.
    from dateutil.parser import _timelex
except (ImportError, AttributeError):
    pass
else:
    if hasattr(_timelex, 'split'):
        def _lexer_split_from_str(dt_str):
            # Wrapping in StringIO(str(...)) keeps dateutil 2.2 working.
            stream = StringIO(str(dt_str))
            return _timelex.split(stream)

        _DATEUTIL_LEXER_SPLIT = _lexer_split_from_str
- def _infer_tzinfo(start, end):
- def _infer(a, b):
- tz = a.tzinfo
- if b and b.tzinfo:
- if not (tslib.get_timezone(tz) == tslib.get_timezone(b.tzinfo)):
- raise AssertionError('Inputs must both have the same timezone,'
- ' {0} != {1}'.format(tz, b.tzinfo))
- return tz
- tz = None
- if start is not None:
- tz = _infer(start, end)
- elif end is not None:
- tz = _infer(end, start)
- return tz
def _maybe_get_tz(tz, date=None):
    """
    Normalise `tz` into a tzinfo object, optionally specialised for `date`.

    `tz` is first passed through ``tslib.maybe_get_tz``.  An integer result
    is divided by 60 and turned into a ``pytz.FixedOffset``
    (NOTE(review): this implies the integer is an offset in seconds --
    confirm against callers).  When `date` is tz-aware and the timezone
    offers ``localize``, the tzinfo obtained by localizing the naive form of
    `date` is returned instead.
    """
    resolved = tslib.maybe_get_tz(tz)

    if com.is_integer(resolved):
        import pytz
        resolved = pytz.FixedOffset(resolved / 60)

    # Nothing to localize against, or no timezone at all.
    if date is None or resolved is None:
        return resolved

    # Only tz-aware dates combined with localize-capable zones are refined.
    if date.tzinfo is None or not hasattr(resolved, 'localize'):
        return resolved

    naive_date = date.replace(tzinfo=None)
    return resolved.localize(naive_date).tzinfo
def _guess_datetime_format(dt_str, dayfirst=False,
                           dt_str_parse=compat.parse_date,
                           dt_str_split=_DATEUTIL_LEXER_SPLIT):
    """
    Guess the datetime format of a given datetime string.

    The string is parsed into a datetime and split into tokens; each token
    is then matched against the ``strftime`` rendering of the parsed
    datetime for a list of candidate directives.  The guess is accepted
    only if re-formatting the parsed datetime with it reproduces `dt_str`
    exactly.

    Parameters
    ----------
    dt_str : string, datetime string to guess the format of
    dayfirst : boolean, default False
        If True parses dates with the day first, eg 20/01/2005
        Warning: dayfirst=True is not strict, but will prefer to parse
        with day first (this is a known bug).
    dt_str_parse : function, defaults to `compat.parse_date` (dateutil)
        This function should take in a datetime string and return
        a `datetime.datetime` guess that the datetime string represents
    dt_str_split : function, defaults to `_DATEUTIL_LEXER_SPLIT` (dateutil)
        This function should take in a datetime string and return
        a list of strings, the guess of the various specific parts
        e.g. '2011/12/30' -> ['2011', '/', '12', '/', '30']

    Returns
    -------
    ret : datetime format string (for `strftime` or `strptime`), or None
        when no format could be guessed
    """
    if dt_str_parse is None or dt_str_split is None:
        return None

    if not isinstance(dt_str, compat.string_types):
        return None

    day_attribute_and_format = (('day',), '%d')

    # Order matters: earlier entries win when several directives could
    # describe the same attribute.
    datetime_attrs_to_format = [
        (('year', 'month', 'day'), '%Y%m%d'),
        (('year',), '%Y'),
        (('month',), '%B'),
        (('month',), '%b'),
        (('month',), '%m'),
        day_attribute_and_format,
        (('hour',), '%H'),
        (('minute',), '%M'),
        (('second',), '%S'),
        (('microsecond',), '%f'),
        (('second', 'microsecond'), '%S.%f'),
    ]

    if dayfirst:
        # Give the day directive first pick of the tokens.
        datetime_attrs_to_format.remove(day_attribute_and_format)
        datetime_attrs_to_format.insert(0, day_attribute_and_format)

    try:
        parsed_datetime = dt_str_parse(dt_str, dayfirst=dayfirst)
    except Exception:
        # In case the datetime can't be parsed, its format cannot be guessed.
        # (Exception, not a bare except, so Ctrl-C / SystemExit propagate.)
        return None

    if parsed_datetime is None:
        return None

    try:
        tokens = dt_str_split(dt_str)
    except Exception:
        # In case the datetime string can't be split, its format cannot
        # be guessed
        return None

    format_guess = [None] * len(tokens)
    found_attrs = set()

    for attrs, attr_format in datetime_attrs_to_format:
        # If a given attribute has been placed in the format string, skip
        # over other formats for that same underlying attribute (IE, month
        # can be represented in multiple different ways)
        if set(attrs) & found_attrs:
            continue

        if all(getattr(parsed_datetime, attr) is not None for attr in attrs):
            for i, token_format in enumerate(format_guess):
                if (token_format is None and
                        tokens[i] == parsed_datetime.strftime(attr_format)):
                    format_guess[i] = attr_format
                    found_attrs.update(attrs)
                    break

    # Only consider it a valid guess if we have a year, month and day
    if len(set(['year', 'month', 'day']) & found_attrs) != 3:
        return None

    output_format = []
    for i, guess in enumerate(format_guess):
        if guess is not None:
            # Either fill in the format placeholder (like %Y)
            output_format.append(guess)
        else:
            # Or just the token separator (IE, the dashes in "01-01-2013")
            try:
                float(tokens[i])
            except ValueError:
                pass
            else:
                # If the token is numeric, then we likely didn't parse it
                # properly, so our guess is wrong
                return None

            output_format.append(tokens[i])

    guessed_format = ''.join(output_format)

    # Accept the guess only if it round-trips to the original string.
    if parsed_datetime.strftime(guessed_format) == dt_str:
        return guessed_format

    return None
def _guess_datetime_format_for_array(arr, **kwargs):
    """
    Guess a datetime format from the first non-null element of `arr`.

    Extra keyword arguments are forwarded to ``_guess_datetime_format``.
    Returns None when every element is null.
    """
    valid_positions = com.notnull(arr).nonzero()[0]
    if not len(valid_positions):
        return None

    first_valid = arr[valid_positions[0]]
    return _guess_datetime_format(first_valid, **kwargs)
def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True,
                format=None, coerce=False, unit='ns',
                infer_datetime_format=False):
    """
    Convert argument to datetime

    Parameters
    ----------
    arg : string, datetime, array of strings (with possible NAs)
    errors : {'ignore', 'raise'}, default 'ignore'
        Errors are ignored by default (values left untouched)
    dayfirst : boolean, default False
        If True parses dates with the day first, eg 20/01/2005
        Warning: dayfirst=True is not strict, but will prefer to parse
        with day first (this is a known bug).
    utc : boolean, default None
        Return UTC DatetimeIndex if True (converting any tz-aware
        datetime.datetime objects as well)
    box : boolean, default True
        If True returns a DatetimeIndex, if False returns ndarray of values
    format : string, default None
        strftime to parse time, eg "%d/%m/%Y"
    coerce : force errors to NaT (False by default)
    unit : unit of the arg (D,s,ms,us,ns) denote the unit in epoch
        (e.g. a unix timestamp), which is an integer/float number
    infer_datetime_format : boolean, default False
        If no `format` is given, try to infer the format based on the first
        datetime string. Provides a large speed-up in many cases.

    Returns
    -------
    ret : datetime if parsing succeeded.  None and Timestamp inputs are
        returned unchanged; a Series input yields a Series with the same
        index and name; other list-likes are converted element-wise and
        scalars are converted through a one-element array.

    Examples
    --------
    Take separate series and convert to datetime

    >>> import pandas as pd
    >>> i = pd.date_range('20000101',periods=100)
    >>> df = pd.DataFrame(dict(year = i.year, month = i.month, day = i.day))
    >>> pd.to_datetime(df.year*10000 + df.month*100 + df.day, format='%Y%m%d')

    Or from strings

    >>> df = df.astype(str)
    >>> pd.to_datetime(df.day + df.month + df.year, format="%d%m%Y")
    """
    from pandas import Timestamp
    from pandas.core.series import Series
    from pandas.tseries.index import DatetimeIndex

    def _convert_listlike(arg, box, format):
        # Convert a list-like of values; `box` selects DatetimeIndex output.

        if isinstance(arg, (list, tuple)):
            arg = np.array(arg, dtype='O')

        # Already datetime64[ns]: at most wrap it in a DatetimeIndex.
        if com.is_datetime64_ns_dtype(arg):
            if box and not isinstance(arg, DatetimeIndex):
                try:
                    return DatetimeIndex(arg, tz='utc' if utc else None)
                except ValueError:
                    pass

            return arg

        arg = com._ensure_object(arg)

        if infer_datetime_format and format is None:
            format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)

            if format is not None:
                # There is a special fast-path for iso8601 formatted
                # datetime strings, so in those cases don't use the inferred
                # format because this path makes process slower in this
                # special case
                format_is_iso8601 = (
                    '%Y-%m-%dT%H:%M:%S.%f'.startswith(format) or
                    '%Y-%m-%d %H:%M:%S.%f'.startswith(format)
                )
                if format_is_iso8601:
                    format = None

        try:
            result = None

            if format is not None:
                # shortcut formatting here
                if format == '%Y%m%d':
                    try:
                        result = _attempt_YYYYMMDD(arg)
                    except Exception:
                        # Exception rather than a bare except, so that
                        # KeyboardInterrupt/SystemExit are not turned into
                        # a ValueError.
                        raise ValueError("cannot convert the input to '%Y%m%d' date format")

                # fallback
                if result is None:
                    try:
                        result = tslib.array_strptime(
                            arg, format, coerce=coerce
                        )
                    except (tslib.OutOfBoundsDatetime):
                        if errors == 'raise':
                            raise
                        # errors='ignore': hand the input back untouched
                        result = arg
                    except ValueError:
                        # Only raise this error if the user provided the
                        # datetime format, and not when it was inferred
                        if not infer_datetime_format:
                            raise

            # Generic parser: used when there is no format, or when the
            # inferred format failed above and left `result` unset.
            if result is None and (format is None or infer_datetime_format):
                result = tslib.array_to_datetime(arg, raise_=errors == 'raise',
                                                 utc=utc, dayfirst=dayfirst,
                                                 coerce=coerce, unit=unit)

            if com.is_datetime64_dtype(result) and box:
                result = DatetimeIndex(result, tz='utc' if utc else None)
            return result

        except ValueError as e:
            # Last resort: convert via tslib.datetime_to_datetime64; if that
            # fails too, surface the original error.
            try:
                values, tz = tslib.datetime_to_datetime64(arg)
                return DatetimeIndex._simple_new(values, None, tz=tz)
            except (ValueError, TypeError):
                raise e

    if arg is None:
        return arg
    elif isinstance(arg, Timestamp):
        return arg
    elif isinstance(arg, Series):
        values = _convert_listlike(arg.values, False, format)
        return Series(values, index=arg.index, name=arg.name)
    elif com.is_list_like(arg):
        return _convert_listlike(arg, box, format)

    # Scalar: convert through a one-element array and unwrap.
    return _convert_listlike(np.array([arg]), box, format)[0]
class DateParseError(ValueError):
    """ValueError subclass raised when a date/time string cannot be parsed."""
def _attempt_YYYYMMDD(arg):
    """
    Try to parse values in the YYYYMMDD (``%Y%m%d``) layout, tolerating
    NaT-like entries.

    `arg` is passed in as an object dtype array, but could really hold
    ints/strings with nan-like values, or floats (e.g. with nan).

    Three strategies are tried in turn: integer-like values, floats with
    actual ``np.nan``, and strings containing NaT-like markers.  A strategy
    that raises is skipped.

    Returns
    -------
    The converted values from the first strategy that succeeds, or None
    when all of them fail.
    """

    def calc(carg):
        # calculate the actual result
        carg = carg.astype(object)
        return lib.try_parse_year_month_day(carg / 10000,
                                            carg / 100 % 100,
                                            carg % 100)

    def calc_with_mask(carg, mask):
        # Positions excluded by `mask` become NaT; the rest are parsed.
        result = np.empty(carg.shape, dtype='M8[ns]')
        iresult = result.view('i8')
        iresult[~mask] = tslib.iNaT
        valid = carg[mask].astype(np.float64).astype(np.int64)
        result[mask] = calc(valid).astype('M8[ns]')
        return result

    # Each strategy is best-effort, but only Exception is swallowed so that
    # KeyboardInterrupt/SystemExit still propagate.

    # try intlike / strings that are ints
    try:
        return calc(arg.astype(np.int64))
    except Exception:
        pass

    # a float with actual np.nan
    try:
        carg = arg.astype(np.float64)
        return calc_with_mask(carg, com.notnull(carg))
    except Exception:
        pass

    # string with NaN-like
    try:
        mask = ~lib.ismember(arg, tslib._nat_strings)
        return calc_with_mask(arg, mask)
    except Exception:
        pass

    return None
# patterns for quarters like '4Q2005', '05Q1'
qpat1full = re.compile(r'(\d)Q(\d\d\d\d)')   # quarter first, 4-digit year
qpat2full = re.compile(r'(\d\d\d\d)Q(\d)')   # 4-digit year first
qpat1 = re.compile(r'(\d)Q(\d\d)')           # quarter first, 2-digit year
qpat2 = re.compile(r'(\d\d)Q(\d)')           # 2-digit year first
# exactly four digits, i.e. a bare year
ypat = re.compile(r'(\d\d\d\d)$')
# date part, whitespace or 'T' separator, time part.  Raw string so that
# '\s' is not treated as an (invalid) string escape sequence.
has_time = re.compile(r'(.+)([\s]|T)+(.+)')
def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None):
    """
    Parse a datetime string, using dateutil plus extra handling for bare
    years, quarters (e.g. '2Q2005', '05Q1') and year-month strings.

    Parameters
    ----------
    arg : compat.string_types
        Anything that is not a string is returned unchanged.
    freq : str or DateOffset, default None
        Helps with interpreting time string if supplied
    dayfirst : bool, default None
        If None uses the "display.date_dayfirst" option
    yearfirst : bool, default None
        If None uses the "display.date_yearfirst" option

    Returns
    -------
    datetime, datetime/dateutil.parser._result, str
        The last element names the resolution ('year', 'quarter', 'month'
        or whatever ``dateutil_parse`` reports).

    Raises
    ------
    DateParseError
        If the dateutil based fallback fails or yields nothing.
    """
    from pandas.core.config import get_option
    from pandas.tseries.offsets import DateOffset
    from pandas.tseries.frequencies import (_get_rule_month, _month_numbers,
                                            _get_freq_str)

    if not isinstance(arg, compat.string_types):
        return arg

    arg = arg.upper()
    n_chars = len(arg)

    # Midnight on 0001-01-01; parsed fields are substituted into it.
    default = datetime(1, 1, 1)

    # special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1
    if n_chars in (4, 6):
        year_hit = ypat.match(arg)
        if year_hit:
            stamp = default.replace(year=int(year_hit.group(1)))
            return stamp, stamp, 'year'

        # Four characters means a two-digit year that needs a century.
        two_digit_year = n_chars == 4
        if two_digit_year:
            candidates = ((qpat1, True), (qpat2, False))
        else:
            candidates = ((qpat1full, True), (qpat2full, False))

        for pattern, quarter_leads in candidates:
            hit = pattern.match(arg)
            if hit is None:
                continue

            if quarter_leads:
                quarter_text, year_text = hit.group(1), hit.group(2)
            else:
                quarter_text, year_text = hit.group(2), hit.group(1)

            quarter = int(quarter_text)
            year = int(year_text)
            if two_digit_year:
                year += 2000

            if freq is None:
                month = (quarter - 1) * 3 + 1
            else:
                # hack attack, #1228
                mnum = _month_numbers[_get_rule_month(freq)] + 1
                month = (mnum + (quarter - 1) * 3) % 12 + 1
                if month > mnum:
                    year -= 1

            stamp = default.replace(year=year, month=month)
            return stamp, stamp, 'quarter'

        monthly_by_name = freq is not None and freq == 'M'
        monthly_by_offset = getattr(freq, 'rule_code', None) == 'M'
        if n_chars == 6 and (monthly_by_name or monthly_by_offset):
            try:
                stamp = _try_parse_monthly(arg)
                if stamp is not None:
                    return stamp, stamp, 'month'
            except Exception:
                pass

    # explicit year-month layouts such as '2012-01' or 'JAN 2012'
    monthly = _attempt_monthly(arg)
    if monthly:
        return monthly

    if dayfirst is None:
        dayfirst = get_option("display.date_dayfirst")
    if yearfirst is None:
        yearfirst = get_option("display.date_yearfirst")

    try:
        parsed, reso = dateutil_parse(arg, default, dayfirst=dayfirst,
                                      yearfirst=yearfirst)
    except Exception as e:
        # TODO: allow raise of errors within instead
        raise DateParseError(e)

    if parsed is None:
        raise DateParseError("Could not parse %s" % arg)

    return parsed, parsed, reso  # datetime, resolution
def dateutil_parse(timestr, default,
                   ignoretz=False, tzinfos=None,
                   **kwargs):
    """
    Parse `timestr` with dateutil's default parser and also report the
    resolution of the result (lifted from dateutil to get resolution).

    Parameters
    ----------
    timestr : object
        Converted with ``str()`` before parsing.
    default : datetime
        Supplies every field that the string does not specify.
    ignoretz : bool, default False
        If True no timezone is attached to the result.
    tzinfos : callable or mapping, default None
        Either ``tzinfos(tzname, tzoffset)`` or a mapping keyed by timezone
        name; the value must be a tzinfo instance, a tz string or an int
        offset.
    **kwargs
        Forwarded to ``DEFAULTPARSER._parse`` (e.g. dayfirst, yearfirst).

    Returns
    -------
    (datetime, str)
        The parsed datetime and the finest field that was parsed: one of
        'year', 'month', 'day', 'hour', 'minute', 'second', 'millisecond'
        or 'microsecond'.

    Raises
    ------
    ValueError
        If the string cannot be parsed, yields no date field, or `tzinfos`
        provides an unsupported value.
    """
    # The module-level name ``datetime`` is the class, on which ``tzinfo``
    # is an attribute descriptor rather than a type, so import the base
    # class explicitly for the isinstance check below.
    from datetime import tzinfo as tzinfo_base
    from dateutil import tz
    import time

    fobj = StringIO(str(timestr))
    res = DEFAULTPARSER._parse(fobj, **kwargs)

    # dateutil 2.2 compat
    if isinstance(res, tuple):
        res, _ = res

    if res is None:
        raise ValueError("unknown string format")

    # Collect the parsed fields; the last one present (coarse to fine)
    # determines the resolution.
    repl = {}
    reso = None
    for attr in ["year", "month", "day", "hour",
                 "minute", "second", "microsecond"]:
        value = getattr(res, attr)
        if value is not None:
            repl[attr] = value
            reso = attr

    if reso is None:
        raise ValueError("Cannot parse date.")

    if reso == 'microsecond':
        if repl['microsecond'] == 0:
            reso = 'second'
        elif repl['microsecond'] % 1000 == 0:
            reso = 'millisecond'

    ret = default.replace(**repl)
    if res.weekday is not None and not res.day:
        # ``relativedelta`` is the class imported at module level; it has
        # no ``relativedelta`` attribute of its own.
        ret = ret + relativedelta(weekday=res.weekday)

    if not ignoretz:
        if callable(tzinfos) or (tzinfos and res.tzname in tzinfos):
            if callable(tzinfos):
                tzdata = tzinfos(res.tzname, res.tzoffset)
            else:
                tzdata = tzinfos.get(res.tzname)

            if isinstance(tzdata, tzinfo_base):
                tzinfo = tzdata
            elif isinstance(tzdata, compat.string_types):
                tzinfo = tz.tzstr(tzdata)
            elif isinstance(tzdata, int):
                tzinfo = tz.tzoffset(res.tzname, tzdata)
            else:
                raise ValueError("offset must be tzinfo subclass, "
                                 "tz string, or int offset")
            ret = ret.replace(tzinfo=tzinfo)
        elif res.tzname and res.tzname in time.tzname:
            ret = ret.replace(tzinfo=tz.tzlocal())
        elif res.tzoffset == 0:
            ret = ret.replace(tzinfo=tz.tzutc())
        elif res.tzoffset:
            ret = ret.replace(tzinfo=tz.tzoffset(res.tzname, res.tzoffset))

    return ret, reso
- def _attempt_monthly(val):
- pats = ['%Y-%m', '%m-%Y', '%b %Y', '%b-%Y']
- for pat in pats:
- try:
- ret = datetime.strptime(val, pat)
- return ret, ret, 'month'
- except Exception:
- pass
- def _try_parse_monthly(arg):
- base = 2000
- add_base = False
- default = datetime(1, 1, 1).replace(hour=0, minute=0, second=0,
- microsecond=0)
- if len(arg) == 4:
- add_base = True
- y = int(arg[:2])
- m = int(arg[2:4])
- elif len(arg) >= 6: # 201201
- y = int(arg[:4])
- m = int(arg[4:6])
- if add_base:
- y += base
- ret = default.replace(year=y, month=m)
- return ret
# Expose tslib.normalize_date under this module's namespace.
normalize_date = tslib.normalize_date
def format(dt):
    """Render `dt` as a compact YYYYMMDD string via its ``strftime``."""
    compact_layout = '%Y%m%d'
    return dt.strftime(compact_layout)
# Day zero of the OLE / Excel serial date scale.
OLE_TIME_ZERO = datetime(1899, 12, 30)


def ole2datetime(oledt):
    """
    Convert an OLE/Excel serial date into a ``datetime``.

    `oledt` is coerced with ``float()`` and interpreted as a (possibly
    fractional) number of days after ``OLE_TIME_ZERO``.

    Raises
    ------
    ValueError
        If the serial number is below 61.  Excel believes 2/29/1900 exists,
        so everything before 3/1/1900 is rejected.
    """
    serial = float(oledt)

    if serial < 61:
        message = "Value is outside of acceptable range: %s " % serial
        raise ValueError(message)

    elapsed = timedelta(days=serial)
    return OLE_TIME_ZERO + elapsed