PageRenderTime 53ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/perlinpinpin.py

https://github.com/cyberdelia/perlinpinpin
Python | 414 lines | 401 code | 11 blank | 2 comment | 0 complexity | ad7e1b6c5f8bea9f5171b13d2bd7e9c5 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. # -*- coding: utf-8 -*-
  2. import calendar
  3. import datetime
  4. import os
  5. import re
  6. import time
  7. __version__ = "0.9.1"
  8. class Perlinpinpin(object):
  9. days = "Lundi Mardi Mercredi Jeudi Vendredi Samedi Dimanche".split(' ')
  10. months = "Janvier Fevrier Mars Avril Mai Juin Juillet Aout Septembre Octobre Novembre Decembre".split(' ')
  11. def __init__(self):
  12. relative = r"(?:aujourd[']?hui|hier|maintenant|matin|soir|apres[\s-]?midi|demain|apres[\s-]?demain|avant[\s-]?hier)"
  13. relative_spec = r"(?:suivant[es]?|précédent[es]?|prochain[es]?|dernier[es]?)"
  14. relative_delta = r"(?:dans|il\sy\sa)"
  15. delta = r"(?:%s\s+(?:\d+\s+(?:(?:semaine|jour|heure|minute|seconde)[s]?)+[^\d]*)+)" % relative_delta
  16. weekday = r"(?:le|%s)" % '|'.join(Perlinpinpin.days)
  17. month = r"(?:%s)" % '|'.join(Perlinpinpin.months)
  18. relative_weekday = r"(?:%s\s*%s)" % (weekday, relative_spec)
  19. relative_week = r"(?:semaine\s*%s)" % relative_spec
  20. # 1 - 31
  21. cardinal_monthday = r"(?:[1-9]|[0-2][0-9]|3[01])"
  22. monthday = r"(?:%s\s*(ier|er|iere)?)" % cardinal_monthday
  23. day_month = r"(?:(%s)?\s*%s\s*%s)" % (
  24. weekday, monthday, month
  25. )
  26. month_day = r"(?:%s\s*%s)" % (month, monthday)
  27. day_month_year = r"(?:(?:%s|%s)[-\s]*\d{4})" % (
  28. day_month, month_day
  29. )
  30. day = r"(?:(le|%s)+\s*%s)" % (weekday, monthday)
  31. yyyymmdd = r"(?:\d{4}[-/]?\d{1,2}[-/]?\d{1,2})"
  32. ddmmyy = r"(?:\d{1,2}[-/]?\d{1,2}[-/]?\d{2})"
  33. ddmmyyyy = r"(?:\d{1,2}[-/]?\d{1,2}[-/]?\d{4})"
  34. self.detect = re.compile(r"""
  35. \b(
  36. %(relative)s
  37. | %(relative_weekday)s # Vendredi dernier
  38. | %(relative_week)s # Semaine suivante
  39. | %(delta)s # Dans 1 semaine
  40. | %(day_month_year)s # 12 Décembre, 1985
  41. | %(day_month)s # 12 Septembre
  42. | %(day)s # le 12
  43. | %(month_day)s # Novembre 13
  44. | %(yyyymmdd)s # 1986/11/13
  45. | %(ddmmyyyy)s # 11-13-1986
  46. | %(ddmmyy)s # 11-13-86
  47. )\b
  48. """ % {
  49. 'relative': relative,
  50. 'relative_weekday': relative_weekday,
  51. 'relative_week': relative_week,
  52. 'delta': delta,
  53. 'weekday': weekday,
  54. 'day': day,
  55. 'day_month_year': day_month_year,
  56. 'day_month': day_month,
  57. 'month_day': month_day,
  58. 'yyyymmdd': yyyymmdd,
  59. 'ddmmyy': ddmmyy,
  60. 'ddmmyyyy': ddmmyyyy,
  61. }, (re.VERBOSE | re.IGNORECASE))
  62. self.convert = [
  63. # Il y a x
  64. (re.compile(
  65. r'''^
  66. il\sy\sa\s
  67. ((?P<weeks>\d+) \s semaine(s?)?)?
  68. [^\d]*
  69. ((?P<days>\d+) \s jour(s?)?)?
  70. [^\d]*
  71. ((?P<hours>\d+) \s heure(s?)?)?
  72. [^\d]*
  73. ((?P<minutes>\d+) \s minute(s?)?)?
  74. [^\d]*
  75. ((?P<seconds>\d+) \s seconde(s?)?)?
  76. ''',
  77. (re.VERBOSE | re.IGNORECASE)),
  78. lambda m: datetime.date.today() - datetime.timedelta(
  79. days=int(m.group('days') or 0),
  80. seconds=int(m.group('seconds') or 0),
  81. minutes=int(m.group('minutes') or 0),
  82. hours=int(m.group('hours') or 0),
  83. weeks=int(m.group('weeks') or 0))),
  84. # Dans x
  85. (re.compile(
  86. r'''^
  87. dans\s
  88. ((?P<weeks>\d+) \s semaine(s?)?)?
  89. [^\d]*
  90. ((?P<days>\d+) \s jour(s?)?)?
  91. [^\d]*
  92. ((?P<hours>\d+) \s heure(s?)?)?
  93. [^\d]*
  94. ((?P<minutes>\d+) \s minute(s?)?)?
  95. [^\d]*
  96. ((?P<seconds>\d+) \s seconde(s?)?)?
  97. ''',
  98. (re.VERBOSE | re.IGNORECASE)),
  99. lambda m: datetime.date.today() + datetime.timedelta(
  100. days=int(m.group('days') or 0),
  101. seconds=int(m.group('seconds') or 0),
  102. minutes=int(m.group('minutes') or 0),
  103. hours=int(m.group('hours') or 0),
  104. weeks=int(m.group('weeks') or 0))),
  105. # Today
  106. (re.compile(
  107. r'''^
  108. aujourd[']?hui # Today
  109. ''',
  110. (re.VERBOSE | re.IGNORECASE)),
  111. lambda m: datetime.date.today()),
  112. # Now
  113. (re.compile(
  114. r'''^
  115. maintenant # Now
  116. ''',
  117. (re.VERBOSE | re.IGNORECASE)),
  118. lambda m: datetime.date.today()),
  119. # Tomorrow
  120. (re.compile(
  121. r'''^
  122. demain # Tomorrow
  123. ''',
  124. (re.VERBOSE | re.IGNORECASE)),
  125. lambda m: datetime.date.today() + datetime.timedelta(days=1)),
  126. # Yesterday
  127. (re.compile(
  128. r'''^
  129. hier # Yesterday
  130. ''',
  131. (re.VERBOSE | re.IGNORECASE)),
  132. lambda m: datetime.date.today() - datetime.timedelta(days=1)),
  133. # After-tomorrow
  134. (re.compile(
  135. r'''^
  136. apres[\s-]?demain # After-tomorrow
  137. ''',
  138. (re.VERBOSE | re.IGNORECASE)),
  139. lambda m: datetime.date.today() + datetime.timedelta(days=2)),
  140. # Before-yesterday
  141. (re.compile(
  142. r'''^
  143. avant[\s-]?hier # Before-yesterday
  144. ''',
  145. (re.VERBOSE | re.IGNORECASE)),
  146. lambda m: datetime.date.today() - datetime.timedelta(days=2)),
  147. # This morning
  148. (re.compile(
  149. r'''^
  150. matin # morning
  151. $ # EOL
  152. ''',
  153. (re.VERBOSE | re.IGNORECASE)),
  154. lambda m: datetime.date.today()),
  155. # This afternoon
  156. (re.compile(
  157. r'''^
  158. apres[\s-]?midi # afternoon
  159. $ # EOL
  160. ''',
  161. (re.VERBOSE | re.IGNORECASE)),
  162. lambda m: datetime.date.today()),
  163. # This evening
  164. (re.compile(
  165. r'''^
  166. soir # evening
  167. $ # EOL
  168. ''',
  169. (re.VERBOSE | re.IGNORECASE)),
  170. lambda m: datetime.date.today()),
  171. # 4
  172. (re.compile(
  173. r'''^
  174. (le\s*)? # le
  175. (%s\s*)? # vendredi
  176. (?P<day>[1-9]|[0-2][0-9]|3[01]) # 4
  177. (?:\s*(ier|er|iere)?) # optional suffix
  178. $ # EOL
  179. ''' % weekday,
  180. (re.VERBOSE | re.IGNORECASE)),
  181. lambda m: datetime.date.today().replace(
  182. day=int(m.group('day')))),
  183. # 4 Janvier
  184. (re.compile(
  185. r'''^
  186. (%s\s*)? # vendredi
  187. (?P<day>[1-9]|[0-2][0-9]|3[01]) # 4
  188. (?:\s*(ier|er|iere)?) # optional suffix
  189. \s+ # whitespace
  190. (?P<month>%s+) # Janvier
  191. $ # EOL
  192. ''' % (weekday, '|'.join(Perlinpinpin.months)),
  193. (re.VERBOSE | re.IGNORECASE)),
  194. lambda m: datetime.date.today().replace(
  195. day=int(m.group('day')),
  196. month=self._month(m.group('month')))),
  197. # 4 Janvier 2003
  198. (re.compile(
  199. r'''^
  200. (%s\s*)? # vendredi
  201. (?P<day>[1-9]|[0-2][0-9]|3[01]) # 4
  202. (?:\s*(ier|er|iere)?) # optional suffix
  203. \s+ # whitespace
  204. (?P<month>%s+) # Janvier
  205. ,? # optional comma
  206. \s+ # whitespace
  207. (?P<year>\d{4}) # 2003
  208. $ # EOL
  209. ''' % (weekday, '|'.join(Perlinpinpin.months)),
  210. (re.VERBOSE | re.IGNORECASE)),
  211. lambda m: datetime.date(
  212. year=int(m.group('year')),
  213. month=self._month(m.group('month')),
  214. day=int(m.group('day')))),
  215. # dd/mm/yyyy (European style, default in case of doubt)
  216. (re.compile(
  217. r'''^
  218. (?P<day>[1-9]|[0-2][0-9]|3[01]) # d or dd
  219. [-/]? #
  220. (?P<month>[1-9]|0[0-9]|1[0-2]) # m or mm
  221. [-/]? #
  222. (?P<year>\d{4}) # yyyy
  223. $ # EOL
  224. ''',
  225. (re.VERBOSE | re.IGNORECASE)),
  226. lambda m: datetime.date(*time.strptime(
  227. "%s %s %s" % (m.group('year'), m.group('month'), m.group('day')), "%Y %m %d")[:3]
  228. )),
  229. # dd/mm/yy (European short style)
  230. (re.compile(
  231. r'''^
  232. (?P<day>[1-9]|[0-2][0-9]|3[01]) # d or dd
  233. [-/]? #
  234. (?P<month>[1-9]|0[0-9]|1[0-2]) # m or mm
  235. [-/]? #
  236. (?P<year>\d{2}) # yy
  237. $ # EOL
  238. ''',
  239. (re.VERBOSE | re.IGNORECASE)),
  240. lambda m: datetime.date(*time.strptime(
  241. "%s %s %s" % (m.group('year'), m.group('month'), m.group('day')), "%y %m %d")[:3]
  242. )),
  243. # mm/dd/yyyy (American style)
  244. (re.compile(
  245. r'''^
  246. (?P<month>[1-9]|0[0-9]|1[0-2]) # m or mm
  247. [-/]? #
  248. (?P<day>[1-9]|[0-2][0-9]|3[01]) # d or dd
  249. [-/]? #
  250. (?P<year>\d{4}) # yyyy
  251. $ # EOL
  252. ''',
  253. (re.VERBOSE | re.IGNORECASE)),
  254. lambda m: datetime.date(*time.strptime(
  255. "%s %s %s" % (m.group('year'), m.group('month'), m.group('day')), "%Y %m %d")[:3]
  256. )),
  257. # mm/dd/yy (American short style)
  258. (re.compile(
  259. r'''^
  260. (?P<month>[1-9]|0[0-9]|1[0-2]) # m or mm
  261. [-/]? #
  262. (?P<day>[1-9]|[0-2][0-9]|3[01]) # d or dd
  263. [-/]? #
  264. (?P<year>\d{2}) # yy
  265. $ # EOL
  266. ''',
  267. (re.VERBOSE | re.IGNORECASE)),
  268. lambda m: datetime.date(*time.strptime(
  269. "%s %s %s" % (m.group('year'), m.group('month'), m.group('day')), "%y %m %d")[:3]
  270. )),
  271. # yyyy-mm-dd (ISO style)
  272. (re.compile(
  273. r'''^
  274. (?P<year>\d{4}) # yyyy
  275. [-/]? #
  276. (?P<month>[1-9]|0[0-9]|1[0-2]) # m or mm
  277. [-/]? #
  278. (?P<day>[1-9]|[0-2][0-9]|3[01]) # d or dd
  279. $ # EOL
  280. ''',
  281. (re.VERBOSE | re.IGNORECASE)),
  282. lambda m: datetime.date(
  283. year=int(m.group('year')),
  284. month=int(m.group('month')),
  285. day=int(m.group('day')))),
  286. # Semaine dernière
  287. (re.compile(
  288. r'''^
  289. semaine # week
  290. \s+ # whitespace
  291. (derniere|precedente)? # last
  292. $ # EOL
  293. ''',
  294. (re.VERBOSE | re.IGNORECASE)),
  295. lambda m: datetime.date.today() - datetime.timedelta(days=7)),
  296. # Semaine prochaine
  297. (re.compile(
  298. r'''^
  299. semaine # week
  300. \s+ # whitespace
  301. (prochaine|suivante)? # last
  302. $ # EOL
  303. ''',
  304. (re.VERBOSE | re.IGNORECASE)),
  305. lambda m: datetime.date.today() + datetime.timedelta(days=7)),
  306. # Mardi prochain
  307. (re.compile(
  308. r'''^
  309. (?P<weekday>\w+) # Mardi
  310. \s+ # whitespace
  311. (prochain|suivant)? # next
  312. $ # EOL
  313. ''',
  314. (re.VERBOSE | re.IGNORECASE)),
  315. lambda m: self._next_weekday(self._weekday(m.group('weekday')))),
  316. # Mardi dernier
  317. (re.compile(
  318. r'''^
  319. (?P<weekday>\w+) # Mardi
  320. \s+ # whitespace
  321. (dernier|precedent)? # last
  322. $ # EOL
  323. ''',
  324. (re.VERBOSE | re.IGNORECASE)),
  325. lambda m: self._last_weekday(self._weekday(m.group('weekday')))),
  326. ]
  327. def extract(self, text, timezone=None):
  328. """Extract dates from fuzzy text with respect to the given timezone"""
  329. text = self._normalize(text)
  330. if timezone:
  331. os.environ['TZ'] = timezone
  332. matches = []
  333. for match in self.detect.finditer(text.strip()):
  334. if match:
  335. date = self.parse(match.group())
  336. if date:
  337. matches.append(date)
  338. return matches
  339. def parse(self, date, timezone=None):
  340. """Parse fuzzy date with respect to the given timezone"""
  341. date = self._normalize(date)
  342. if timezone:
  343. os.environ['TZ'] = timezone
  344. for regexp, func in self.convert:
  345. match = regexp.match(date.strip())
  346. if match:
  347. return func(match)
  348. return None
  349. def _normalize(self, text):
  350. """Remove accents from text"""
  351. import unicodedata
  352. return unicodedata.normalize('NFKD', unicode(text)).encode('ASCII', 'ignore')
  353. def _month(self, text):
  354. """Get the month as a decimal number"""
  355. for i, month in enumerate(Perlinpinpin.months):
  356. regexp = re.compile(text, re.IGNORECASE)
  357. if regexp.match(month):
  358. return i + 1
  359. else:
  360. raise ValueError
  361. def _weekday(self, text):
  362. """Get weekday as a decimal number"""
  363. for i, day in enumerate(Perlinpinpin.days):
  364. regexp = re.compile(text, re.IGNORECASE)
  365. if regexp.match(day):
  366. return i
  367. else:
  368. raise ValueError
  369. def _next_weekday(self, weekday):
  370. """Get next weekday as a date"""
  371. day = datetime.date.today() + datetime.timedelta(days=1)
  372. while calendar.weekday(*day.timetuple()[:3]) != weekday:
  373. day = day + datetime.timedelta(days=1)
  374. return day
  375. def _last_weekday(self, weekday):
  376. """Get previous weekday as a date"""
  377. day = datetime.date.today() - datetime.timedelta(days=1)
  378. while calendar.weekday(*day.timetuple()[:3]) != weekday:
  379. day = day - datetime.timedelta(days=1)
  380. return day
  381. def perlinpinpin(text, timezone=None):
  382. dates = Perlinpinpin().extract(text, timezone)
  383. if dates:
  384. return dates[0]
  385. raise ValueError
  386. def parse(text, timezone=None):
  387. return Perlinpinpin().parse(text, timezone)
  388. def extract(text, timezone=None):
  389. return Perlinpinpin().extract(text, timezone)