PageRenderTime 44ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/corehq/apps/es/aggregations.py

https://github.com/dimagi/commcare-hq
Python | 630 lines | 617 code | 0 blank | 13 comment | 2 complexity | 3cb495ab544f9f1a279510ae4f0da9b4 MD5 | raw file
Possible License(s): BSD-3-Clause, LGPL-2.1
  1. """
  2. Aggregate Queries
  3. -----------------
  4. Aggregations are a replacement for Facets
  5. Here is an example used to calculate how many new pregnancy cases each user has
  6. opened in a certain date range.
  7. .. code-block:: python
  8. res = (CaseES()
  9. .domain(self.domain)
  10. .case_type('pregnancy')
  11. .date_range('opened_on', gte=startdate, lte=enddate)
  12. .aggregation(TermsAggregation('by_user', 'opened_by'))
  13. .size(0).run())
  14. buckets = res.aggregations.by_user.buckets
  15. buckets.user1.doc_count
  16. There's a bit of magic happening here - you can access the raw json data from
  17. this aggregation via ``res.aggregation('by_user')`` if you'd prefer to skip it.
  18. The ``res`` object has an ``aggregations`` property, which returns a namedtuple
  19. pointing to the wrapped aggregation results. The name provided at instantiation is
  20. used here (``by_user`` in this example).
  21. The wrapped ``aggregation_result`` object has a ``result`` property containing the
  22. aggregation data, as well as utilities for parsing that data into something more
  23. useful. For example, the ``TermsAggregation`` result also has a ``counts_by_bucket``
  24. method that returns a ``{bucket: count}`` dictionary, which is normally what you
  25. want.
  26. As of this writing, there's not much else developed, but it's pretty easy to
  27. add support for other aggregation types and more results processing
  28. """
  29. import datetime
  30. import re
  31. from collections import defaultdict, namedtuple
  32. from copy import deepcopy
  33. from corehq.apps.es.const import SIZE_LIMIT
# Value reported by ``Bucket.key`` when a bucket's raw result has no 'key' entry.
MISSING_KEY = None
  35. class AggregationResult(object):
  36. def __init__(self, raw, aggregation):
  37. self.aggregation = aggregation
  38. self.raw = raw
  39. self.result = raw.get(self.aggregation.name, {})
  40. self._aggregations = self.aggregation.aggregations
  41. class Aggregation(object):
  42. name = None
  43. type = None
  44. body = None
  45. result_class = AggregationResult
  46. aggregations = None
  47. def __init__(self):
  48. raise NotImplementedError()
  49. def aggregation(self, aggregation):
  50. if not self.aggregations:
  51. self.aggregations = []
  52. self.aggregations.append(aggregation)
  53. return self
  54. def assemble(self):
  55. if self.type == "case_property":
  56. assembled = self.body
  57. else:
  58. assembled = {self.type: self.body}
  59. if self.aggregations:
  60. assembled['aggs'] = {}
  61. for agg in self.aggregations:
  62. assembled['aggs'][agg.name] = agg.assemble()
  63. return assembled
  64. def parse_result(self, result):
  65. return self.result_class(result, self)
  66. class BucketResult(AggregationResult):
  67. @property
  68. def keys(self):
  69. return [b['key'] for b in self.normalized_buckets]
  70. @property
  71. def buckets(self):
  72. n_buckets = self.normalized_buckets
  73. buckets = namedtuple('buckets', [b['key'] for b in n_buckets])
  74. return buckets(**{b['key']: Bucket(b, self._aggregations) for b in n_buckets})
  75. @property
  76. def buckets_dict(self):
  77. return {b['key']: Bucket(b, self._aggregations) for b in self.normalized_buckets}
  78. @property
  79. def buckets_list(self):
  80. return [Bucket(b, self._aggregations) for b in self.normalized_buckets]
  81. @property
  82. def raw_buckets(self):
  83. return self.result['buckets']
  84. @property
  85. def normalized_buckets(self):
  86. return self.raw_buckets
  87. def counts_by_bucket(self):
  88. return {b['key']: b['doc_count'] for b in self.normalized_buckets}
  89. class MissingResult(AggregationResult):
  90. @property
  91. def bucket(self):
  92. return Bucket(self.result, self._aggregations)
  93. class TopHitsResult(AggregationResult):
  94. @property
  95. def raw_hits(self):
  96. return self.result['hits']['hits']
  97. @property
  98. def doc_ids(self):
  99. """Return just the docs ids from the response."""
  100. return [r['_id'] for r in self.raw_hits]
  101. @property
  102. def hits(self):
  103. """Return the docs from the response."""
  104. return [r['_source'] for r in self.raw_hits]
  105. @property
  106. def total(self):
  107. """Return the total number of docs matching the query."""
  108. return self.result['hits']['total']
  109. class StatsResult(AggregationResult):
  110. @property
  111. def count(self):
  112. return self.result['count']
  113. @property
  114. def max(self):
  115. return self.result['max']
  116. @property
  117. def min(self):
  118. return self.result['min']
  119. @property
  120. def avg(self):
  121. return self.result['avg']
  122. class ExtendedStatsResult(StatsResult):
  123. @property
  124. def std_dev(self):
  125. return self.result['std_deviation']
  126. class Bucket(object):
  127. def __init__(self, result, aggregations):
  128. self.result = result
  129. self.aggregations = aggregations
  130. @property
  131. def key(self):
  132. return self.result.get('key', MISSING_KEY)
  133. @property
  134. def doc_count(self):
  135. return self.result['doc_count']
  136. def __getattr__(self, attr):
  137. sub_aggregation = list(filter(lambda a: a.name == attr, self.aggregations))[0]
  138. if sub_aggregation:
  139. return sub_aggregation.parse_result(self.result)
  140. def __repr__(self):
  141. return "Bucket(key='{}', doc_count='{}')".format(self.key, self.doc_count)
  142. class TermsAggregation(Aggregation):
  143. """
  144. Bucket aggregation that aggregates by field
  145. :param name: aggregation name
  146. :param field: name of the field to bucket on
  147. :param size:
  148. :param missing: define how documents that are missing a value should be treated.
  149. By default, they will be ignored. If a value is supplied here it will be used where
  150. the value is missing.
  151. """
  152. type = "terms"
  153. result_class = BucketResult
  154. def __init__(self, name, field, size=None, missing=None):
  155. assert re.match(r'\w+$', name), \
  156. "Names must be valid python variable names, was {}".format(name)
  157. self.name = name
  158. self.body = {
  159. "field": field,
  160. "size": size if size is not None else SIZE_LIMIT,
  161. }
  162. if missing:
  163. self.body["missing"] = missing
  164. def order(self, field, order="asc", reset=True):
  165. query = deepcopy(self)
  166. order_field = {field: order}
  167. if reset:
  168. query.body['order'] = [order_field]
  169. else:
  170. if not query.body.get('order'):
  171. query.body['order'] = []
  172. query.body['order'].append(order_field)
  173. return query
  174. def size(self, size):
  175. query = deepcopy(self)
  176. query.body['size'] = size
  177. return query
  178. class SumResult(AggregationResult):
  179. @property
  180. def value(self):
  181. return self.result['value']
  182. class SumAggregation(Aggregation):
  183. """
  184. Bucket aggregation that sums a field
  185. :param name: aggregation name
  186. :param field: name of the field to sum
  187. """
  188. type = "sum"
  189. result_class = SumResult
  190. def __init__(self, name, field):
  191. assert re.match(r'\w+$', name), \
  192. "Names must be valid python variable names, was {}".format(name)
  193. self.name = name
  194. self.body = {
  195. "field": field,
  196. }
  197. class MinAggregation(SumAggregation):
  198. """
  199. Bucket aggregation that returns the minumum value of a field
  200. :param name: aggregation name
  201. :param field: name of the field to min
  202. """
  203. type = "min"
  204. class MaxAggregation(SumAggregation):
  205. type = "max"
  206. class AvgAggregation(SumAggregation):
  207. type = "avg"
  208. class ValueCountAggregation(SumAggregation):
  209. type = "value_count"
  210. class CardinalityAggregation(SumAggregation):
  211. type = "cardinality"
  212. class MissingAggregation(Aggregation):
  213. """
  214. A field data based single bucket aggregation, that creates a bucket of all
  215. documents in the current document set context that are missing a field value
  216. (effectively, missing a field or having the configured NULL value set).
  217. :param name: aggregation name
  218. :param field: name of the field to bucket on
  219. """
  220. type = "missing"
  221. result_class = MissingResult
  222. def __init__(self, name, field):
  223. assert re.match(r'\w+$', name), \
  224. "Names must be valid python variable names, was {}".format(name)
  225. self.name = name
  226. self.body = {"field": field}
  227. class StatsAggregation(Aggregation):
  228. """
  229. Stats aggregation that computes a stats aggregation by field
  230. :param name: aggregation name
  231. :param field: name of the field to collect stats on
  232. :param script: an optional field to allow you to script the computed field
  233. """
  234. type = "stats"
  235. result_class = StatsResult
  236. def __init__(self, name, field, script=None):
  237. assert re.match(r'\w+$', name), \
  238. "Names must be valid python variable names, was {}".format(name)
  239. self.name = name
  240. self.body = {"field": field}
  241. if script:
  242. self.body.update({'script': script})
  243. class ExtendedStatsAggregation(StatsAggregation):
  244. """
  245. Extended stats aggregation that computes an extended stats aggregation by field
  246. """
  247. type = "extended_stats"
  248. result_class = ExtendedStatsResult
  249. class TopHitsAggregation(Aggregation):
  250. """
  251. A top_hits metric aggregator keeps track of the most relevant document being aggregated
  252. This aggregator is intended to be used as a sub aggregator, so that the top matching
  253. documents can be aggregated per bucket.
  254. :param name: Aggregation name
  255. :param field: This is the field to sort the top hits by. If None, defaults to sorting
  256. by score.
  257. :param is_ascending: Whether to sort the hits in ascending or descending order.
  258. :param size: The number of hits to include. Defaults to 1.
  259. :param include: An array of fields to include in the hit. Defaults to returning the whole document.
  260. """
  261. type = "top_hits"
  262. result_class = TopHitsResult
  263. def __init__(self, name, field=None, is_ascending=True, size=1, include=None):
  264. assert re.match(r'\w+$', name), \
  265. "Names must be valid python variable names, was {}".format(name)
  266. self.name = name
  267. self.body = {
  268. 'size': size,
  269. }
  270. if field:
  271. self.body["sort"] = [{
  272. field: {
  273. "order": 'asc' if is_ascending else 'desc'
  274. },
  275. }]
  276. if include:
  277. self.body["_source"] = {"include": include}
  278. class FilterResult(AggregationResult):
  279. def __getattr__(self, attr):
  280. sub_aggregation = list([a for a in self._aggregations if a.name == attr])[0]
  281. if sub_aggregation:
  282. return sub_aggregation.parse_result(self.result)
  283. @property
  284. def doc_count(self):
  285. return self.result['doc_count']
  286. class FilterAggregation(Aggregation):
  287. """
  288. Bucket aggregation that creates a single bucket for the specified filter
  289. :param name: aggregation name
  290. :param filter: filter body
  291. """
  292. type = "filter"
  293. result_class = FilterResult
  294. def __init__(self, name, filter):
  295. self.name = name
  296. self.body = filter
  297. class FiltersAggregation(Aggregation):
  298. """
  299. Bucket aggregation that creates a bucket for each filter specified using
  300. the filter name.
  301. :param name: aggregation name
  302. """
  303. type = "filters"
  304. result_class = BucketResult
  305. def __init__(self, name, filters=None):
  306. self.name = name
  307. self.body = {"filters": (filters or {})}
  308. def add_filter(self, name, filter):
  309. """
  310. :param name: filter name
  311. :param filter: filter body
  312. """
  313. self.body["filters"][name] = filter
  314. return self
  315. class AggregationRange(namedtuple('AggregationRange', 'start end key')):
  316. """
  317. Note that a range includes the "start" value and excludes the "end" value.
  318. i.e. start <= X < end
  319. :param start: range start
  320. :param end: range end
  321. :param key: optional key name for the range
  322. """
  323. def __new__(cls, start=None, end=None, key=None):
  324. assert start or end, "At least one of 'from' or 'to' are required"
  325. return super(AggregationRange, cls).__new__(cls, start, end, key)
  326. def assemble(self):
  327. range_ = {}
  328. for key, attr in {'from': 'start', 'to': 'end', 'key': 'key'}.items():
  329. value = getattr(self, attr)
  330. if value:
  331. if isinstance(value, datetime.date):
  332. value = value.isoformat()
  333. elif not isinstance(value, str):
  334. value = str(value)
  335. range_[key] = value
  336. return range_
  337. class RangeResult(BucketResult):
  338. @property
  339. def normalized_buckets(self):
  340. buckets = self.raw_buckets
  341. if self.aggregation.keyed:
  342. def _add_key(key, bucket):
  343. bucket['key'] = key
  344. return bucket
  345. return [_add_key(k, b) for k, b in buckets.items()]
  346. else:
  347. def _add_key(bucket):
  348. key = '{}-{}'.format(bucket.get('from', '*'), bucket.get('to', '*'))
  349. bucket['key'] = key
  350. return bucket
  351. return [_add_key(b) for b in buckets]
  352. class RangeAggregation(Aggregation):
  353. """
  354. Bucket aggregation that creates one bucket for each range
  355. :param name: the aggregation name
  356. :param field: the field to perform the range aggregations on
  357. :param ranges: list of AggregationRange objects
  358. :param keyed: set to True to have the results returned by key instead of as
  359. a list (see RangeResult.normalized_buckets)
  360. """
  361. type = "range"
  362. result_class = RangeResult
  363. def __init__(self, name, field, ranges=None, keyed=True):
  364. self.keyed = keyed
  365. self.name = name
  366. self.body = {
  367. 'field': field,
  368. 'keyed': keyed,
  369. 'ranges': []
  370. }
  371. if ranges:
  372. for range_ in ranges:
  373. self.add_range(range_)
  374. def add_range(self, range_):
  375. if isinstance(range_, AggregationRange):
  376. range_ = range_.assemble()
  377. if range_.get('key'):
  378. self.body['keyed'] = True
  379. self.body["ranges"].append(range_)
  380. return self
  381. class DateHistogramResult(BucketResult):
  382. @property
  383. def normalized_buckets(self):
  384. return [{
  385. 'key': b['key_as_string'],
  386. 'doc_count': b['doc_count'],
  387. } for b in self.raw_buckets]
  388. _Interval = namedtuple('_Interval', 'interval result_format')
  389. class DateHistogram(Aggregation):
  390. """
  391. Aggregate by date range. This can answer questions like "how many forms
  392. were created each day?".
  393. :param name: what do you want to call this aggregation
  394. :param datefield: the document's date field to look at
  395. :param interval: the date interval to use - from DateHistogram.Interval
  396. :param timezone: do bucketing using this time zone instead of UTC
  397. """
  398. type = "date_histogram"
  399. result_class = DateHistogramResult
  400. class Interval:
  401. # Feel free to add more options here
  402. # year, quarter, month, week, day, hour, minute, second
  403. YEAR = _Interval('year', 'yyyy')
  404. MONTH = _Interval('month', 'yyyy-MM')
  405. DAY = _Interval('day', 'yyyy-MM-dd')
  406. def __init__(self, name, datefield, interval, timezone=None):
  407. self.name = name
  408. self.body = {
  409. 'field': datefield,
  410. 'interval': interval.interval,
  411. 'format': interval.result_format,
  412. 'min_doc_count': 1, # Only include buckets with results
  413. }
  414. if timezone:
  415. self.body['time_zone'] = timezone
  416. class NestedAggregation(Aggregation):
  417. """
  418. A special single bucket aggregation that enables aggregating nested documents.
  419. :param path: Path to nested document
  420. """
  421. type = "nested"
  422. result_class = FilterResult
  423. def __init__(self, name, path):
  424. self.name = name
  425. self.body = {
  426. "path": path
  427. }
  428. AggregationTerm = namedtuple('AggregationTerm', ['name', 'field'])
  429. class NestedTermAggregationsHelper(object):
  430. """
  431. Helper to run nested term-based queries (equivalent to SQL group-by clauses).
  432. This is not at all related to the ES 'nested aggregation'. The final aggregation
  433. is a count of documents.
  434. Example usage:
  435. .. code-block:: python
  436. # counting all forms submitted in a domain grouped by app id and user id
  437. NestedTermAggregationsHelper(
  438. base_query=FormES().domain(domain_name),
  439. terms=[
  440. AggregationTerm('app_id', 'app_id'),
  441. AggregationTerm('user_id', 'form.meta.userID'),
  442. ]
  443. ).get_data()
  444. This works by bucketing docs first by one terms aggregation, then within
  445. that bucket, bucketing further by the next term, and so on. This is then
  446. flattened out to appear like a group-by-multiple.
  447. """
  448. def __init__(self, base_query, terms):
  449. self.base_query = base_query
  450. self.terms = terms
  451. @property
  452. def query(self):
  453. previous_term = None
  454. for name, field in reversed(self.terms):
  455. term = TermsAggregation(name, field)
  456. if previous_term is not None:
  457. term = term.aggregation(previous_term)
  458. previous_term = term
  459. return self.base_query.aggregation(term)
  460. def get_data(self):
  461. def _add_terms(aggregation_bucket, term, remaining_terms, current_counts, current_key=None):
  462. for bucket in getattr(aggregation_bucket, term.name).buckets_list:
  463. key = (bucket.key,) if current_key is None else current_key + (bucket.key,)
  464. if remaining_terms:
  465. _add_terms(bucket, remaining_terms[0], remaining_terms[1:], current_counts, current_key=key)
  466. else:
  467. current_counts[key] += bucket.doc_count
  468. counts = defaultdict(lambda: 0)
  469. _add_terms(self.query.size(0).run().aggregations, self.terms[0], self.terms[1:], current_counts=counts)
  470. return self._format_counts(counts)
  471. def _format_counts(self, counts):
  472. final_aggregation_name = ('doc_count')
  473. row_class = namedtuple('NestedQueryRow', [term.name for term in self.terms] + [final_aggregation_name])
  474. for combined_key, count in counts.items():
  475. yield row_class(*(combined_key + (count,)))