PageRenderTime 40ms CodeModel.GetById 12ms RepoModel.GetById 1ms app.codeStats 0ms

/pandas/tseries/tools.py

http://github.com/pydata/pandas
Python | 593 lines | 441 code | 79 blank | 73 comment | 115 complexity | 478241ad239204c1d94119b029f4ffe9 MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
  1. from datetime import datetime, timedelta
  2. import re
  3. import sys
  4. import numpy as np
  5. import pandas.lib as lib
  6. import pandas.tslib as tslib
  7. import pandas.core.common as com
  8. from pandas.compat import StringIO, callable
  9. import pandas.compat as compat
  10. try:
  11. import dateutil
  12. from dateutil.parser import parse, DEFAULTPARSER
  13. from dateutil.relativedelta import relativedelta
  14. # raise exception if dateutil 2.0 install on 2.x platform
  15. if (sys.version_info[0] == 2 and
  16. dateutil.__version__ == '2.0'): # pragma: no cover
  17. raise Exception('dateutil 2.0 incompatible with Python 2.x, you must '
  18. 'install version 1.5 or 2.1+!')
  19. except ImportError: # pragma: no cover
  20. print('Please install python-dateutil via easy_install or some method!')
  21. raise # otherwise a 2nd import won't show the message
  22. _DATEUTIL_LEXER_SPLIT = None
  23. try:
  24. # Since these are private methods from dateutil, it is safely imported
  25. # here so in case this interface changes, pandas will just fallback
  26. # to not using the functionality
  27. from dateutil.parser import _timelex
  28. if hasattr(_timelex, 'split'):
  29. def _lexer_split_from_str(dt_str):
  30. # The StringIO(str(_)) is for dateutil 2.2 compatibility
  31. return _timelex.split(StringIO(str(dt_str)))
  32. _DATEUTIL_LEXER_SPLIT = _lexer_split_from_str
  33. except (ImportError, AttributeError):
  34. pass
  35. def _infer_tzinfo(start, end):
  36. def _infer(a, b):
  37. tz = a.tzinfo
  38. if b and b.tzinfo:
  39. if not (tslib.get_timezone(tz) == tslib.get_timezone(b.tzinfo)):
  40. raise AssertionError('Inputs must both have the same timezone,'
  41. ' {0} != {1}'.format(tz, b.tzinfo))
  42. return tz
  43. tz = None
  44. if start is not None:
  45. tz = _infer(start, end)
  46. elif end is not None:
  47. tz = _infer(end, start)
  48. return tz
  49. def _maybe_get_tz(tz, date=None):
  50. tz = tslib.maybe_get_tz(tz)
  51. if com.is_integer(tz):
  52. import pytz
  53. tz = pytz.FixedOffset(tz / 60)
  54. # localize and get the tz
  55. if date is not None and tz is not None:
  56. if date.tzinfo is not None and hasattr(tz,'localize'):
  57. tz = tz.localize(date.replace(tzinfo=None)).tzinfo
  58. return tz
  59. def _guess_datetime_format(dt_str, dayfirst=False,
  60. dt_str_parse=compat.parse_date,
  61. dt_str_split=_DATEUTIL_LEXER_SPLIT):
  62. """
  63. Guess the datetime format of a given datetime string.
  64. Parameters
  65. ----------
  66. dt_str : string, datetime string to guess the format of
  67. dayfirst : boolean, default False
  68. If True parses dates with the day first, eg 20/01/2005
  69. Warning: dayfirst=True is not strict, but will prefer to parse
  70. with day first (this is a known bug).
  71. dt_str_parse : function, defaults to `compate.parse_date` (dateutil)
  72. This function should take in a datetime string and return
  73. a `datetime.datetime` guess that the datetime string represents
  74. dt_str_split : function, defaults to `_DATEUTIL_LEXER_SPLIT` (dateutil)
  75. This function should take in a datetime string and return
  76. a list of strings, the guess of the various specific parts
  77. e.g. '2011/12/30' -> ['2011', '/', '12', '/', '30']
  78. Returns
  79. -------
  80. ret : datetime formatt string (for `strftime` or `strptime`)
  81. """
  82. if dt_str_parse is None or dt_str_split is None:
  83. return None
  84. if not isinstance(dt_str, compat.string_types):
  85. return None
  86. day_attribute_and_format = (('day',), '%d')
  87. datetime_attrs_to_format = [
  88. (('year', 'month', 'day'), '%Y%m%d'),
  89. (('year',), '%Y'),
  90. (('month',), '%B'),
  91. (('month',), '%b'),
  92. (('month',), '%m'),
  93. day_attribute_and_format,
  94. (('hour',), '%H'),
  95. (('minute',), '%M'),
  96. (('second',), '%S'),
  97. (('microsecond',), '%f'),
  98. (('second', 'microsecond'), '%S.%f'),
  99. ]
  100. if dayfirst:
  101. datetime_attrs_to_format.remove(day_attribute_and_format)
  102. datetime_attrs_to_format.insert(0, day_attribute_and_format)
  103. try:
  104. parsed_datetime = dt_str_parse(dt_str, dayfirst=dayfirst)
  105. except:
  106. # In case the datetime can't be parsed, its format cannot be guessed
  107. return None
  108. if parsed_datetime is None:
  109. return None
  110. try:
  111. tokens = dt_str_split(dt_str)
  112. except:
  113. # In case the datetime string can't be split, its format cannot
  114. # be guessed
  115. return None
  116. format_guess = [None] * len(tokens)
  117. found_attrs = set()
  118. for attrs, attr_format in datetime_attrs_to_format:
  119. # If a given attribute has been placed in the format string, skip
  120. # over other formats for that same underlying attribute (IE, month
  121. # can be represented in multiple different ways)
  122. if set(attrs) & found_attrs:
  123. continue
  124. if all(getattr(parsed_datetime, attr) is not None for attr in attrs):
  125. for i, token_format in enumerate(format_guess):
  126. if (token_format is None and
  127. tokens[i] == parsed_datetime.strftime(attr_format)):
  128. format_guess[i] = attr_format
  129. found_attrs.update(attrs)
  130. break
  131. # Only consider it a valid guess if we have a year, month and day
  132. if len(set(['year', 'month', 'day']) & found_attrs) != 3:
  133. return None
  134. output_format = []
  135. for i, guess in enumerate(format_guess):
  136. if guess is not None:
  137. # Either fill in the format placeholder (like %Y)
  138. output_format.append(guess)
  139. else:
  140. # Or just the token separate (IE, the dashes in "01-01-2013")
  141. try:
  142. # If the token is numeric, then we likely didn't parse it
  143. # properly, so our guess is wrong
  144. float(tokens[i])
  145. return None
  146. except ValueError:
  147. pass
  148. output_format.append(tokens[i])
  149. guessed_format = ''.join(output_format)
  150. if parsed_datetime.strftime(guessed_format) == dt_str:
  151. return guessed_format
  152. def _guess_datetime_format_for_array(arr, **kwargs):
  153. # Try to guess the format based on the first non-NaN element
  154. non_nan_elements = com.notnull(arr).nonzero()[0]
  155. if len(non_nan_elements):
  156. return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)
  157. def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True,
  158. format=None, coerce=False, unit='ns',
  159. infer_datetime_format=False):
  160. """
  161. Convert argument to datetime
  162. Parameters
  163. ----------
  164. arg : string, datetime, array of strings (with possible NAs)
  165. errors : {'ignore', 'raise'}, default 'ignore'
  166. Errors are ignored by default (values left untouched)
  167. dayfirst : boolean, default False
  168. If True parses dates with the day first, eg 20/01/2005
  169. Warning: dayfirst=True is not strict, but will prefer to parse
  170. with day first (this is a known bug).
  171. utc : boolean, default None
  172. Return UTC DatetimeIndex if True (converting any tz-aware
  173. datetime.datetime objects as well)
  174. box : boolean, default True
  175. If True returns a DatetimeIndex, if False returns ndarray of values
  176. format : string, default None
  177. strftime to parse time, eg "%d/%m/%Y"
  178. coerce : force errors to NaT (False by default)
  179. unit : unit of the arg (D,s,ms,us,ns) denote the unit in epoch
  180. (e.g. a unix timestamp), which is an integer/float number
  181. infer_datetime_format: boolean, default False
  182. If no `format` is given, try to infer the format based on the first
  183. datetime string. Provides a large speed-up in many cases.
  184. Returns
  185. -------
  186. ret : datetime if parsing succeeded
  187. Examples
  188. --------
  189. Take separate series and convert to datetime
  190. >>> import pandas as pd
  191. >>> i = pd.date_range('20000101',periods=100)
  192. >>> df = pd.DataFrame(dict(year = i.year, month = i.month, day = i.day))
  193. >>> pd.to_datetime(df.year*10000 + df.month*100 + df.day, format='%Y%m%d')
  194. Or from strings
  195. >>> df = df.astype(str)
  196. >>> pd.to_datetime(df.day + df.month + df.year, format="%d%m%Y")
  197. """
  198. from pandas import Timestamp
  199. from pandas.core.series import Series
  200. from pandas.tseries.index import DatetimeIndex
  201. def _convert_listlike(arg, box, format):
  202. if isinstance(arg, (list,tuple)):
  203. arg = np.array(arg, dtype='O')
  204. if com.is_datetime64_ns_dtype(arg):
  205. if box and not isinstance(arg, DatetimeIndex):
  206. try:
  207. return DatetimeIndex(arg, tz='utc' if utc else None)
  208. except ValueError:
  209. pass
  210. return arg
  211. arg = com._ensure_object(arg)
  212. if infer_datetime_format and format is None:
  213. format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)
  214. if format is not None:
  215. # There is a special fast-path for iso8601 formatted
  216. # datetime strings, so in those cases don't use the inferred
  217. # format because this path makes process slower in this
  218. # special case
  219. format_is_iso8601 = (
  220. '%Y-%m-%dT%H:%M:%S.%f'.startswith(format) or
  221. '%Y-%m-%d %H:%M:%S.%f'.startswith(format)
  222. )
  223. if format_is_iso8601:
  224. format = None
  225. try:
  226. result = None
  227. if format is not None:
  228. # shortcut formatting here
  229. if format == '%Y%m%d':
  230. try:
  231. result = _attempt_YYYYMMDD(arg)
  232. except:
  233. raise ValueError("cannot convert the input to '%Y%m%d' date format")
  234. # fallback
  235. if result is None:
  236. try:
  237. result = tslib.array_strptime(
  238. arg, format, coerce=coerce
  239. )
  240. except (tslib.OutOfBoundsDatetime):
  241. if errors == 'raise':
  242. raise
  243. result = arg
  244. except ValueError:
  245. # Only raise this error if the user provided the
  246. # datetime format, and not when it was inferred
  247. if not infer_datetime_format:
  248. raise
  249. if result is None and (format is None or infer_datetime_format):
  250. result = tslib.array_to_datetime(arg, raise_=errors == 'raise',
  251. utc=utc, dayfirst=dayfirst,
  252. coerce=coerce, unit=unit)
  253. if com.is_datetime64_dtype(result) and box:
  254. result = DatetimeIndex(result, tz='utc' if utc else None)
  255. return result
  256. except ValueError as e:
  257. try:
  258. values, tz = tslib.datetime_to_datetime64(arg)
  259. return DatetimeIndex._simple_new(values, None, tz=tz)
  260. except (ValueError, TypeError):
  261. raise e
  262. if arg is None:
  263. return arg
  264. elif isinstance(arg, Timestamp):
  265. return arg
  266. elif isinstance(arg, Series):
  267. values = _convert_listlike(arg.values, False, format)
  268. return Series(values, index=arg.index, name=arg.name)
  269. elif com.is_list_like(arg):
  270. return _convert_listlike(arg, box, format)
  271. return _convert_listlike(np.array([ arg ]), box, format)[0]
  272. class DateParseError(ValueError):
  273. pass
  274. def _attempt_YYYYMMDD(arg):
  275. """ try to parse the YYYYMMDD/%Y%m%d format, try to deal with NaT-like,
  276. arg is a passed in as an object dtype, but could really be ints/strings with nan-like/or floats (e.g. with nan) """
  277. def calc(carg):
  278. # calculate the actual result
  279. carg = carg.astype(object)
  280. return lib.try_parse_year_month_day(carg/10000,carg/100 % 100, carg % 100)
  281. def calc_with_mask(carg,mask):
  282. result = np.empty(carg.shape, dtype='M8[ns]')
  283. iresult = result.view('i8')
  284. iresult[~mask] = tslib.iNaT
  285. result[mask] = calc(carg[mask].astype(np.float64).astype(np.int64)).astype('M8[ns]')
  286. return result
  287. # try intlike / strings that are ints
  288. try:
  289. return calc(arg.astype(np.int64))
  290. except:
  291. pass
  292. # a float with actual np.nan
  293. try:
  294. carg = arg.astype(np.float64)
  295. return calc_with_mask(carg,com.notnull(carg))
  296. except:
  297. pass
  298. # string with NaN-like
  299. try:
  300. mask = ~lib.ismember(arg, tslib._nat_strings)
  301. return calc_with_mask(arg,mask)
  302. except:
  303. pass
  304. return None
  305. # patterns for quarters like '4Q2005', '05Q1'
  306. qpat1full = re.compile(r'(\d)Q(\d\d\d\d)')
  307. qpat2full = re.compile(r'(\d\d\d\d)Q(\d)')
  308. qpat1 = re.compile(r'(\d)Q(\d\d)')
  309. qpat2 = re.compile(r'(\d\d)Q(\d)')
  310. ypat = re.compile(r'(\d\d\d\d)$')
  311. has_time = re.compile('(.+)([\s]|T)+(.+)')
def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None):
    """
    Try hard to parse datetime string, leveraging dateutil plus some extra
    goodies like quarter recognition.

    Parameters
    ----------
    arg : compat.string_types
        Anything that is not a string is returned unchanged.
    freq : str or DateOffset, default None
        Helps with interpreting time string if supplied
    dayfirst : bool, default None
        If None uses the "display.date_dayfirst" option
    yearfirst : bool, default None
        If None uses the "display.date_yearfirst" option

    Returns
    -------
    (parsed, parsed, resolution) : tuple
        The parsed datetime (twice) and a resolution string such as
        'year', 'quarter' or 'month', or whatever resolution
        ``dateutil_parse`` reports.  Non-string input is returned as-is
        (not as a tuple).

    Raises
    ------
    DateParseError
        If the final dateutil-based parse fails or yields None.
    """
    from pandas.core.config import get_option
    from pandas.tseries.offsets import DateOffset
    from pandas.tseries.frequencies import (_get_rule_month, _month_numbers,
                                            _get_freq_str)

    if not isinstance(arg, compat.string_types):
        return arg

    # Patterns below are written in upper case (e.g. the 'Q' separator).
    arg = arg.upper()

    default = datetime(1, 1, 1).replace(hour=0, minute=0,
                                        second=0, microsecond=0)

    # special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1
    if len(arg) in [4, 6]:
        # A bare four-digit year.
        m = ypat.match(arg)
        if m:
            ret = default.replace(year=int(m.group(1)))
            return ret, ret, 'year'

        add_century = False
        if len(arg) == 4:
            # Two-digit years get 2000 added below.
            add_century = True
            qpats = [(qpat1, 1), (qpat2, 0)]
        else:
            qpats = [(qpat1full, 1), (qpat2full, 0)]

        # NOTE: despite its name, a truthy ``yfirst`` marks the patterns
        # whose FIRST group is the quarter; ``yi`` indexes the quarter
        # group and ``qi`` the year group.
        for pat, yfirst in qpats:
            qparse = pat.match(arg)
            if qparse is not None:
                if yfirst:
                    yi, qi = 1, 2
                else:
                    yi, qi = 2, 1
                q = int(qparse.group(yi))
                y_str = qparse.group(qi)
                y = int(y_str)
                if add_century:
                    y += 2000

                if freq is not None:
                    # hack attack, #1228
                    # NOTE(review): presumably shifts the quarter to the
                    # fiscal year implied by freq's rule month -- confirm
                    # against issue #1228.
                    mnum = _month_numbers[_get_rule_month(freq)] + 1
                    month = (mnum + (q - 1) * 3) % 12 + 1
                    if month > mnum:
                        y -= 1
                else:
                    # Calendar quarters: first month of quarter q.
                    month = (q - 1) * 3 + 1

                ret = default.replace(year=y, month=month)
                return ret, ret, 'quarter'

        # Six-character strings with a monthly frequency ('M' string or an
        # offset whose rule_code is 'M') are tried as YYYYMM.
        is_mo_str = freq is not None and freq == 'M'
        is_mo_off = getattr(freq, 'rule_code', None) == 'M'
        is_monthly = is_mo_str or is_mo_off
        if len(arg) == 6 and is_monthly:
            try:
                ret = _try_parse_monthly(arg)
                if ret is not None:
                    return ret, ret, 'month'
            except Exception:
                pass

    # explicit month-and-year formats (e.g. '2012-01', 'JAN 2012')
    mresult = _attempt_monthly(arg)
    if mresult:
        return mresult

    if dayfirst is None:
        dayfirst = get_option("display.date_dayfirst")
    if yearfirst is None:
        yearfirst = get_option("display.date_yearfirst")

    try:
        parsed, reso = dateutil_parse(arg, default, dayfirst=dayfirst,
                                      yearfirst=yearfirst)
    except Exception as e:
        # TODO: allow raise of errors within instead
        raise DateParseError(e)

    if parsed is None:
        raise DateParseError("Could not parse %s" % arg)

    return parsed, parsed, reso  # datetime, resolution
  399. def dateutil_parse(timestr, default,
  400. ignoretz=False, tzinfos=None,
  401. **kwargs):
  402. """ lifted from dateutil to get resolution"""
  403. from dateutil import tz
  404. import time
  405. fobj = StringIO(str(timestr))
  406. res = DEFAULTPARSER._parse(fobj, **kwargs)
  407. # dateutil 2.2 compat
  408. if isinstance(res, tuple):
  409. res, _ = res
  410. if res is None:
  411. raise ValueError("unknown string format")
  412. repl = {}
  413. reso = None
  414. for attr in ["year", "month", "day", "hour",
  415. "minute", "second", "microsecond"]:
  416. value = getattr(res, attr)
  417. if value is not None:
  418. repl[attr] = value
  419. reso = attr
  420. if reso is None:
  421. raise ValueError("Cannot parse date.")
  422. if reso == 'microsecond':
  423. if repl['microsecond'] == 0:
  424. reso = 'second'
  425. elif repl['microsecond'] % 1000 == 0:
  426. reso = 'millisecond'
  427. ret = default.replace(**repl)
  428. if res.weekday is not None and not res.day:
  429. ret = ret + relativedelta.relativedelta(weekday=res.weekday)
  430. if not ignoretz:
  431. if callable(tzinfos) or tzinfos and res.tzname in tzinfos:
  432. if callable(tzinfos):
  433. tzdata = tzinfos(res.tzname, res.tzoffset)
  434. else:
  435. tzdata = tzinfos.get(res.tzname)
  436. if isinstance(tzdata, datetime.tzinfo):
  437. tzinfo = tzdata
  438. elif isinstance(tzdata, compat.string_types):
  439. tzinfo = tz.tzstr(tzdata)
  440. elif isinstance(tzdata, int):
  441. tzinfo = tz.tzoffset(res.tzname, tzdata)
  442. else:
  443. raise ValueError("offset must be tzinfo subclass, "
  444. "tz string, or int offset")
  445. ret = ret.replace(tzinfo=tzinfo)
  446. elif res.tzname and res.tzname in time.tzname:
  447. ret = ret.replace(tzinfo=tz.tzlocal())
  448. elif res.tzoffset == 0:
  449. ret = ret.replace(tzinfo=tz.tzutc())
  450. elif res.tzoffset:
  451. ret = ret.replace(tzinfo=tz.tzoffset(res.tzname, res.tzoffset))
  452. return ret, reso
  453. def _attempt_monthly(val):
  454. pats = ['%Y-%m', '%m-%Y', '%b %Y', '%b-%Y']
  455. for pat in pats:
  456. try:
  457. ret = datetime.strptime(val, pat)
  458. return ret, ret, 'month'
  459. except Exception:
  460. pass
  461. def _try_parse_monthly(arg):
  462. base = 2000
  463. add_base = False
  464. default = datetime(1, 1, 1).replace(hour=0, minute=0, second=0,
  465. microsecond=0)
  466. if len(arg) == 4:
  467. add_base = True
  468. y = int(arg[:2])
  469. m = int(arg[2:4])
  470. elif len(arg) >= 6: # 201201
  471. y = int(arg[:4])
  472. m = int(arg[4:6])
  473. if add_base:
  474. y += base
  475. ret = default.replace(year=y, month=m)
  476. return ret
# Expose tslib's ``normalize_date`` under this module's namespace.
normalize_date = tslib.normalize_date
  478. def format(dt):
  479. """Returns date in YYYYMMDD format."""
  480. return dt.strftime('%Y%m%d')
  481. OLE_TIME_ZERO = datetime(1899, 12, 30, 0, 0, 0)
  482. def ole2datetime(oledt):
  483. """function for converting excel date to normal date format"""
  484. val = float(oledt)
  485. # Excel has a bug where it thinks the date 2/29/1900 exists
  486. # we just reject any date before 3/1/1900.
  487. if val < 61:
  488. raise ValueError("Value is outside of acceptable range: %s " % val)
  489. return OLE_TIME_ZERO + timedelta(days=val)