/be/src/util/streaming-sampler.h

https://gitlab.com/s9perween/Impala · C Header · 150 lines · 86 code · 22 blank · 42 comment · 7 complexity · 796c8f0f1b4fbf851ba1a05051831471 MD5 · raw file

  1. // Copyright 2012 Cloudera Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. #ifndef IMPALA_UTIL_STREAMING_SAMPLER_H
  15. #define IMPALA_UTIL_STREAMING_SAMPLER_H
  16. #include <string.h>
  17. #include <iostream>
  18. #include "util/spinlock.h"
  19. namespace impala {
  20. // A fixed-size sampler to collect samples over time. AddSample should be
  21. // called periodically with the sampled value. Samples are added at the max
  22. // resolution possible. When the sample buffer is full, the current samples
  23. // are collapsed and the collection period is doubled.
  24. // The input period and the streaming sampler period do not need to match, the
  25. // streaming sampler will average values.
  26. // T is the type of the sample and must be a native numerical type (e.g. int or float).
  27. template<typename T, int MAX_SAMPLES>
  28. class StreamingSampler {
  29. public:
  30. StreamingSampler(int initial_period = 500)
  31. : samples_collected_(0) ,
  32. period_(initial_period),
  33. current_sample_sum_(0),
  34. current_sample_count_(0),
  35. current_sample_total_time_(0) {
  36. }
  37. // Initialize the sampler with values.
  38. StreamingSampler(int period, const std::vector<T>& initial_samples)
  39. : samples_collected_(initial_samples.size()),
  40. period_(period),
  41. current_sample_sum_(0),
  42. current_sample_count_(0),
  43. current_sample_total_time_(0) {
  44. DCHECK_LE(samples_collected_, MAX_SAMPLES);
  45. memcpy(samples_, &initial_samples[0], sizeof(T) * samples_collected_);
  46. }
  47. // Add a sample to the sampler. 'ms' is the time elapsed since the last time this
  48. // was called.
  49. // The input value is accumulated into current_*. If the total time elapsed
  50. // in current_sample_total_time_ is higher than the storage period, the value is
  51. // stored. 'sample' should be interpreted as a representative sample from
  52. // (now - ms, now].
  53. // TODO: we can make this more complex by taking a weighted average of samples
  54. // accumulated in a period.
  55. void AddSample(T sample, int ms) {
  56. ScopedSpinLock l(&lock_);
  57. ++current_sample_count_;
  58. current_sample_sum_ += sample;
  59. current_sample_total_time_ += ms;
  60. if (current_sample_total_time_ >= period_) {
  61. samples_[samples_collected_++] = current_sample_sum_ / current_sample_count_;
  62. current_sample_count_ = 0;
  63. current_sample_sum_ = 0;
  64. current_sample_total_time_ = 0;
  65. if (samples_collected_ == MAX_SAMPLES) {
  66. // collapse the samples in half by averaging them and doubling the storage period
  67. period_ *= 2;
  68. for (int i = 0; i < MAX_SAMPLES / 2; ++i) {
  69. samples_[i] = (samples_[i * 2] + samples_[i * 2 + 1]) / 2;
  70. }
  71. samples_collected_ /= 2;
  72. }
  73. }
  74. }
  75. // Get the samples collected. Returns the number of samples and
  76. // the period they were collected at.
  77. // If lock is non-null, the lock will be taken before returning. The caller
  78. // must unlock it.
  79. const T* GetSamples(int* num_samples, int* period, SpinLock** lock = NULL) const {
  80. if (lock != NULL) {
  81. lock_.Lock();
  82. *lock = &lock_;
  83. }
  84. *num_samples = samples_collected_;
  85. *period = period_;
  86. return samples_;
  87. }
  88. // Set the underlying data to period/samples
  89. void SetSamples(int period, const std::vector<T>& samples) {
  90. DCHECK_LE(samples.size(), MAX_SAMPLES);
  91. ScopedSpinLock l(&lock_);
  92. period_ = period;
  93. samples_collected_ = samples.size();
  94. memcpy(samples_, &samples[0], sizeof(T) * samples_collected_);
  95. current_sample_sum_ = 0;
  96. current_sample_count_ = 0;
  97. current_sample_total_time_ = 0;
  98. }
  99. std::string DebugString(const std::string& prefix="") const {
  100. ScopedSpinLock l(&lock_);
  101. std::stringstream ss;
  102. ss << prefix << "Period = " << period_ << std::endl
  103. << prefix << "Num = " << samples_collected_ << std::endl
  104. << prefix << "Samples = {";
  105. for (int i = 0; i < samples_collected_; ++i) {
  106. ss << samples_[i] << ", ";
  107. }
  108. ss << prefix << "}" << std::endl;
  109. return ss.str();
  110. }
  111. private:
  112. mutable SpinLock lock_;
  113. // Aggregated samples collected. Note: this is not all the input samples from
  114. // AddSample(), as logically, those samples get resampled and aggregated.
  115. T samples_[MAX_SAMPLES];
  116. // Number of samples collected <= MAX_SAMPLES.
  117. int samples_collected_;
  118. // Storage period in ms.
  119. int period_;
  120. // The sum of input samples that makes up the next stored sample.
  121. T current_sample_sum_;
  122. // The number of input samples that contribute to current_sample_sum_.
  123. int current_sample_count_;
  124. // The total time that current_sample_sum_ represents
  125. int current_sample_total_time_;
  126. };
  127. }
  128. #endif