PageRenderTime 41ms CodeModel.GetById 14ms app.highlight 23ms RepoModel.GetById 1ms app.codeStats 0ms

/tags/release-0.1-rc2/hive/external/ql/src/java/org/apache/hadoop/hive/ql/udf/UDAFPercentile.java

#
Java | 311 lines | 196 code | 46 blank | 69 comment | 66 complexity | 9fcfd43445aaa588a1bd5edd08a62da9 MD5 | raw file
  1/**
  2 * Licensed to the Apache Software Foundation (ASF) under one
  3 * or more contributor license agreements.  See the NOTICE file
  4 * distributed with this work for additional information
  5 * regarding copyright ownership.  The ASF licenses this file
  6 * to you under the Apache License, Version 2.0 (the
  7 * "License"); you may not use this file except in compliance
  8 * with the License.  You may obtain a copy of the License at
  9 *
 10 *     http://www.apache.org/licenses/LICENSE-2.0
 11 *
 12 * Unless required by applicable law or agreed to in writing, software
 13 * distributed under the License is distributed on an "AS IS" BASIS,
 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 * See the License for the specific language governing permissions and
 16 * limitations under the License.
 17 */
 18
 19package org.apache.hadoop.hive.ql.udf;
 20
 21import java.util.ArrayList;
 22import java.util.Collections;
 23import java.util.Comparator;
 24import java.util.HashMap;
 25import java.util.List;
 26import java.util.Map;
 27import java.util.Set;
 28
 29import org.apache.hadoop.hive.ql.exec.Description;
 30import org.apache.hadoop.hive.ql.exec.UDAF;
 31import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
 32import org.apache.hadoop.hive.serde2.io.DoubleWritable;
 33import org.apache.hadoop.io.LongWritable;
 34
 35/**
 36 * UDAF for calculating the percentile values.
 37 * There are several definitions of percentile, and we take the method recommended by
 38 * NIST.
 39 * @see http://en.wikipedia.org/wiki/Percentile#Alternative_methods
 40 */
 41@Description(name = "percentile",
 42    value = "_FUNC_(expr, pc) - Returns the percentile(s) of expr at pc (range: [0,1])."
 43      + "pc can be a double or double array")
 44public class UDAFPercentile extends UDAF {
 45
 46  /**
 47   * A state class to store intermediate aggregation results.
 48   */
 49  public static class State {
 50    private Map<LongWritable, LongWritable> counts;
 51    private List<DoubleWritable> percentiles;
 52  }
 53
 54  /**
 55   * A comparator to sort the entries in order.
 56   */
 57  public static class MyComparator implements Comparator<Map.Entry<LongWritable, LongWritable>> {
 58    @Override
 59    public int compare(Map.Entry<LongWritable, LongWritable> o1,
 60        Map.Entry<LongWritable, LongWritable> o2) {
 61      return o1.getKey().compareTo(o2.getKey());
 62    }
 63  }
 64
 65  /**
 66   * Increment the State object with o as the key, and i as the count.
 67   */
 68  private static void increment(State s, LongWritable o, long i) {
 69    if (s.counts == null) {
 70      s.counts = new HashMap<LongWritable, LongWritable>();
 71    }
 72    LongWritable count = s.counts.get(o);
 73    if (count == null) {
 74      // We have to create a new object, because the object o belongs
 75      // to the code that creates it and may get its value changed.
 76      LongWritable key = new LongWritable();
 77      key.set(o.get());
 78      s.counts.put(key, new LongWritable(i));
 79    } else {
 80      count.set(count.get() + i);
 81    }
 82  }
 83
 84  /**
 85   * Get the percentile value.
 86   */
 87  private static double getPercentile(List<Map.Entry<LongWritable, LongWritable>> entriesList,
 88      double position) {
 89    // We may need to do linear interpolation to get the exact percentile
 90    long lower = (long)Math.floor(position);
 91    long higher = (long)Math.ceil(position);
 92
 93    // Linear search since this won't take much time from the total execution anyway
 94    // lower has the range of [0 .. total-1]
 95    // The first entry with accumulated count (lower+1) corresponds to the lower position.
 96    int i = 0;
 97    while (entriesList.get(i).getValue().get() < lower + 1) {
 98      i++;
 99    }
100
101    long lowerKey = entriesList.get(i).getKey().get();
102    if (higher == lower) {
103      // no interpolation needed because position does not have a fraction
104      return lowerKey;
105    }
106
107    if (entriesList.get(i).getValue().get() < higher + 1) {
108      i++;
109    }
110    long higherKey = entriesList.get(i).getKey().get();
111
112    if (higherKey == lowerKey) {
113      // no interpolation needed because lower position and higher position has the same key
114      return lowerKey;
115    }
116
117    // Linear interpolation to get the exact percentile
118    return (higher - position) * lowerKey + (position - lower) * higherKey;
119  }
120
121
122  /**
123   * The evaluator for percentile computation based on long.
124   */
125  public static class PercentileLongEvaluator implements UDAFEvaluator {
126
127    private final State state;
128
129    public PercentileLongEvaluator() {
130      state = new State();
131    }
132
133    public void init() {
134      if (state.counts != null) {
135        // We reuse the same hashmap to reduce new object allocation.
136        // This means counts can be empty when there is no input data.
137        state.counts.clear();
138      }
139    }
140
141    /** Note that percentile can be null in a global aggregation with
142     *  0 input rows:  "select percentile(col, 0.5) from t where false"
143     *  In that case, iterate(null, null) will be called once.
144     */
145    public boolean iterate(LongWritable o, Double percentile) {
146      if (o == null && percentile == null) {
147        return false;
148      }
149      if (state.percentiles == null) {
150        if (percentile < 0.0 || percentile > 1.0) {
151          throw new RuntimeException("Percentile value must be wihin the range of 0 to 1.");
152        }
153        state.percentiles = new ArrayList<DoubleWritable>(1);
154        state.percentiles.add(new DoubleWritable(percentile.doubleValue()));
155      }
156      if (o != null) {
157        increment(state, o, 1);
158      }
159      return true;
160    }
161
162    public State terminatePartial() {
163      return state;
164    }
165
166    public boolean merge(State other) {
167      if (other == null || other.counts == null || other.percentiles == null) {
168        return false;
169      }
170
171      if (state.percentiles == null) {
172        state.percentiles = new ArrayList<DoubleWritable>(other.percentiles);
173      }
174
175      for (Map.Entry<LongWritable, LongWritable> e: other.counts.entrySet()) {
176        increment(state, e.getKey(), e.getValue().get());
177      }
178      return true;
179    }
180
181    private DoubleWritable result;
182
183    public DoubleWritable terminate() {
184      // No input data.
185      if (state.counts == null || state.counts.size() == 0) {
186        return null;
187      }
188
189      // Get all items into an array and sort them.
190      Set<Map.Entry<LongWritable, LongWritable>> entries = state.counts.entrySet();
191      List<Map.Entry<LongWritable, LongWritable>> entriesList =
192        new ArrayList<Map.Entry<LongWritable, LongWritable>>(entries);
193      Collections.sort(entriesList, new MyComparator());
194
195      // Accumulate the counts.
196      long total = 0;
197      for (int i = 0; i < entriesList.size(); i++) {
198        LongWritable count = entriesList.get(i).getValue();
199        total += count.get();
200        count.set(total);
201      }
202
203      // Initialize the result.
204      if (result == null) {
205        result = new DoubleWritable();
206      }
207
208      // maxPosition is the 1.0 percentile
209      long maxPosition = total - 1;
210      double position = maxPosition * state.percentiles.get(0).get();
211      result.set(getPercentile(entriesList, position));
212      return result;
213    }
214  }
215
216  /**
217   * The evaluator for percentile computation based on long for an array of percentiles.
218   */
219  public static class PercentileLongArrayEvaluator implements UDAFEvaluator {
220
221    private final State state;
222
223    public PercentileLongArrayEvaluator() {
224      state = new State();
225    }
226
227    public void init() {
228      if (state.counts != null) {
229        // We reuse the same hashmap to reduce new object allocation.
230        // This means counts can be empty when there is no input data.
231        state.counts.clear();
232      }
233    }
234
235    public boolean iterate(LongWritable o, List<DoubleWritable> percentiles) {
236      if (state.percentiles == null) {
237        for (int i = 0; i < percentiles.size(); i++) {
238          if (percentiles.get(i).get() < 0.0 || percentiles.get(i).get() > 1.0) {
239            throw new RuntimeException("Percentile value must be wihin the range of 0 to 1.");
240          }
241        }
242        state.percentiles = new ArrayList<DoubleWritable>(percentiles);
243      }
244      if (o != null) {
245        increment(state, o, 1);
246      }
247      return true;
248    }
249
250    public State terminatePartial() {
251      return state;
252    }
253
254    public boolean merge(State other) {
255      if (other == null || other.counts == null || other.percentiles == null) {
256        return true;
257      }
258
259      if (state.percentiles == null) {
260        state.percentiles = new ArrayList<DoubleWritable>(other.percentiles);
261      }
262
263      for (Map.Entry<LongWritable, LongWritable> e: other.counts.entrySet()) {
264        increment(state, e.getKey(), e.getValue().get());
265      }
266      return true;
267    }
268
269
270    private List<DoubleWritable> results;
271
272    public List<DoubleWritable> terminate() {
273      // No input data
274      if (state.counts == null || state.counts.size() == 0) {
275        return null;
276      }
277
278      // Get all items into an array and sort them
279      Set<Map.Entry<LongWritable, LongWritable>> entries = state.counts.entrySet();
280      List<Map.Entry<LongWritable, LongWritable>> entriesList =
281        new ArrayList<Map.Entry<LongWritable, LongWritable>>(entries);
282      Collections.sort(entriesList, new MyComparator());
283
284      // accumulate the counts
285      long total = 0;
286      for (int i = 0; i < entriesList.size(); i++) {
287        LongWritable count = entriesList.get(i).getValue();
288        total += count.get();
289        count.set(total);
290      }
291
292      // maxPosition is the 1.0 percentile
293      long maxPosition = total - 1;
294
295      // Initialize the results
296      if (results == null) {
297        results = new ArrayList<DoubleWritable>();
298        for (int i = 0; i < state.percentiles.size(); i++) {
299          results.add(new DoubleWritable());
300        }
301      }
302      // Set the results
303      for (int i = 0; i < state.percentiles.size(); i++) {
304        double position = maxPosition * state.percentiles.get(i).get();
305        results.get(i).set(getPercentile(entriesList, position));
306      }
307      return results;
308    }
309  }
310
311}