PageRenderTime 77ms CodeModel.GetById 13ms RepoModel.GetById 3ms app.codeStats 1ms

/tags/release-0.1-rc2/hive/external/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFVariance.java

#
Java | 289 lines | 200 code | 40 blank | 49 comment | 34 complexity | bcc6aabcf735745cc315f1b0ebb46d13 MD5 | raw file
Possible License(s): Apache-2.0, BSD-3-Clause, JSON, CPL-1.0
  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one
  3. * or more contributor license agreements. See the NOTICE file
  4. * distributed with this work for additional information
  5. * regarding copyright ownership. The ASF licenses this file
  6. * to you under the Apache License, Version 2.0 (the
  7. * "License"); you may not use this file except in compliance
  8. * with the License. You may obtain a copy of the License at
  9. *
  10. * http://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. package org.apache.hadoop.hive.ql.udf.generic;
  19. import java.util.ArrayList;
  20. import org.apache.commons.logging.Log;
  21. import org.apache.commons.logging.LogFactory;
  22. import org.apache.hadoop.hive.ql.exec.Description;
  23. import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
  24. import org.apache.hadoop.hive.ql.metadata.HiveException;
  25. import org.apache.hadoop.hive.ql.parse.SemanticException;
  26. import org.apache.hadoop.hive.serde2.io.DoubleWritable;
  27. import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
  28. import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
  29. import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
  30. import org.apache.hadoop.hive.serde2.objectinspector.StructField;
  31. import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
  32. import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
  33. import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
  34. import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
  35. import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
  36. import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
  37. import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
  38. import org.apache.hadoop.io.LongWritable;
  39. import org.apache.hadoop.util.StringUtils;
  40. /**
  41. * Compute the variance. This class is extended by: GenericUDAFVarianceSample
  42. * GenericUDAFStd GenericUDAFStdSample
  43. *
  44. */
  45. @Description(name = "variance,var_pop",
  46. value = "_FUNC_(x) - Returns the variance of a set of numbers")
  47. public class GenericUDAFVariance extends AbstractGenericUDAFResolver {
  48. static final Log LOG = LogFactory.getLog(GenericUDAFVariance.class.getName());
  49. @Override
  50. public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException {
  51. if (parameters.length != 1) {
  52. throw new UDFArgumentTypeException(parameters.length - 1,
  53. "Exactly one argument is expected.");
  54. }
  55. if (parameters[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
  56. throw new UDFArgumentTypeException(0,
  57. "Only primitive type arguments are accepted but "
  58. + parameters[0].getTypeName() + " is passed.");
  59. }
  60. switch (((PrimitiveTypeInfo) parameters[0]).getPrimitiveCategory()) {
  61. case BYTE:
  62. case SHORT:
  63. case INT:
  64. case LONG:
  65. case FLOAT:
  66. case DOUBLE:
  67. case STRING:
  68. return new GenericUDAFVarianceEvaluator();
  69. case BOOLEAN:
  70. default:
  71. throw new UDFArgumentTypeException(0,
  72. "Only numeric or string type arguments are accepted but "
  73. + parameters[0].getTypeName() + " is passed.");
  74. }
  75. }
  76. /**
  77. * Evaluate the variance using the algorithm described by Chan, Golub, and LeVeque in
  78. * "Algorithms for computing the sample variance: analysis and recommendations"
  79. * The American Statistician, 37 (1983) pp. 242--247.
  80. *
  81. * variance = variance1 + variance2 + n/(m*(m+n)) * pow(((m/n)*t1 - t2),2)
  82. *
  83. * where: - variance is sum[x-avg^2] (this is actually n times the variance)
  84. * and is updated at every step. - n is the count of elements in chunk1 - m is
  85. * the count of elements in chunk2 - t1 = sum of elements in chunk1, t2 =
  86. * sum of elements in chunk2.
  87. *
  88. * This algorithm was proven to be numerically stable by J.L. Barlow in
  89. * "Error analysis of a pairwise summation algorithm to compute sample variance"
  90. * Numer. Math, 58 (1991) pp. 583--590
  91. *
  92. */
  93. public static class GenericUDAFVarianceEvaluator extends GenericUDAFEvaluator {
  94. // For PARTIAL1 and COMPLETE
  95. private PrimitiveObjectInspector inputOI;
  96. // For PARTIAL2 and FINAL
  97. private StructObjectInspector soi;
  98. private StructField countField;
  99. private StructField sumField;
  100. private StructField varianceField;
  101. private LongObjectInspector countFieldOI;
  102. private DoubleObjectInspector sumFieldOI;
  103. private DoubleObjectInspector varianceFieldOI;
  104. // For PARTIAL1 and PARTIAL2
  105. private Object[] partialResult;
  106. // For FINAL and COMPLETE
  107. private DoubleWritable result;
  108. @Override
  109. public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
  110. assert (parameters.length == 1);
  111. super.init(m, parameters);
  112. // init input
  113. if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) {
  114. inputOI = (PrimitiveObjectInspector) parameters[0];
  115. } else {
  116. soi = (StructObjectInspector) parameters[0];
  117. countField = soi.getStructFieldRef("count");
  118. sumField = soi.getStructFieldRef("sum");
  119. varianceField = soi.getStructFieldRef("variance");
  120. countFieldOI = (LongObjectInspector) countField
  121. .getFieldObjectInspector();
  122. sumFieldOI = (DoubleObjectInspector) sumField.getFieldObjectInspector();
  123. varianceFieldOI = (DoubleObjectInspector) varianceField
  124. .getFieldObjectInspector();
  125. }
  126. // init output
  127. if (mode == Mode.PARTIAL1 || mode == Mode.PARTIAL2) {
  128. // The output of a partial aggregation is a struct containing
  129. // a long count and doubles sum and variance.
  130. ArrayList<ObjectInspector> foi = new ArrayList<ObjectInspector>();
  131. foi.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector);
  132. foi.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector);
  133. foi.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector);
  134. ArrayList<String> fname = new ArrayList<String>();
  135. fname.add("count");
  136. fname.add("sum");
  137. fname.add("variance");
  138. partialResult = new Object[3];
  139. partialResult[0] = new LongWritable(0);
  140. partialResult[1] = new DoubleWritable(0);
  141. partialResult[2] = new DoubleWritable(0);
  142. return ObjectInspectorFactory.getStandardStructObjectInspector(fname,
  143. foi);
  144. } else {
  145. setResult(new DoubleWritable(0));
  146. return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector;
  147. }
  148. }
  149. static class StdAgg implements AggregationBuffer {
  150. long count; // number of elements
  151. double sum; // sum of elements
  152. double variance; // sum[x-avg^2] (this is actually n times the variance)
  153. };
  154. @Override
  155. public AggregationBuffer getNewAggregationBuffer() throws HiveException {
  156. StdAgg result = new StdAgg();
  157. reset(result);
  158. return result;
  159. }
  160. @Override
  161. public void reset(AggregationBuffer agg) throws HiveException {
  162. StdAgg myagg = (StdAgg) agg;
  163. myagg.count = 0;
  164. myagg.sum = 0;
  165. myagg.variance = 0;
  166. }
  167. private boolean warned = false;
  168. @Override
  169. public void iterate(AggregationBuffer agg, Object[] parameters)
  170. throws HiveException {
  171. assert (parameters.length == 1);
  172. Object p = parameters[0];
  173. if (p != null) {
  174. StdAgg myagg = (StdAgg) agg;
  175. try {
  176. double v = PrimitiveObjectInspectorUtils.getDouble(p, inputOI);
  177. myagg.count++;
  178. myagg.sum += v;
  179. if(myagg.count > 1) {
  180. double t = myagg.count*v - myagg.sum;
  181. myagg.variance += (t*t) / ((double)myagg.count*(myagg.count-1));
  182. }
  183. } catch (NumberFormatException e) {
  184. if (!warned) {
  185. warned = true;
  186. LOG.warn(getClass().getSimpleName() + " "
  187. + StringUtils.stringifyException(e));
  188. LOG.warn(getClass().getSimpleName()
  189. + " ignoring similar exceptions.");
  190. }
  191. }
  192. }
  193. }
  194. @Override
  195. public Object terminatePartial(AggregationBuffer agg) throws HiveException {
  196. StdAgg myagg = (StdAgg) agg;
  197. ((LongWritable) partialResult[0]).set(myagg.count);
  198. ((DoubleWritable) partialResult[1]).set(myagg.sum);
  199. ((DoubleWritable) partialResult[2]).set(myagg.variance);
  200. return partialResult;
  201. }
  202. @Override
  203. public void merge(AggregationBuffer agg, Object partial) throws HiveException {
  204. if (partial != null) {
  205. StdAgg myagg = (StdAgg) agg;
  206. Object partialCount = soi.getStructFieldData(partial, countField);
  207. Object partialSum = soi.getStructFieldData(partial, sumField);
  208. Object partialVariance = soi.getStructFieldData(partial, varianceField);
  209. long n = myagg.count;
  210. long m = countFieldOI.get(partialCount);
  211. if (n == 0) {
  212. // Just copy the information since there is nothing so far
  213. myagg.variance = sumFieldOI.get(partialVariance);
  214. myagg.count = countFieldOI.get(partialCount);
  215. myagg.sum = sumFieldOI.get(partialSum);
  216. }
  217. if (m != 0 && n != 0) {
  218. // Merge the two partials
  219. double a = myagg.sum;
  220. double b = sumFieldOI.get(partialSum);
  221. myagg.count += m;
  222. myagg.sum += b;
  223. double t = (m/(double)n)*a - b;
  224. myagg.variance += sumFieldOI.get(partialVariance) + ((n/(double)m)/((double)n+m)) * t * t;
  225. }
  226. }
  227. }
  228. @Override
  229. public Object terminate(AggregationBuffer agg) throws HiveException {
  230. StdAgg myagg = (StdAgg) agg;
  231. if (myagg.count == 0) { // SQL standard - return null for zero elements
  232. return null;
  233. } else {
  234. if (myagg.count > 1) {
  235. getResult().set(myagg.variance / (myagg.count));
  236. } else { // for one element the variance is always 0
  237. getResult().set(0);
  238. }
  239. return getResult();
  240. }
  241. }
  242. public void setResult(DoubleWritable result) {
  243. this.result = result;
  244. }
  245. public DoubleWritable getResult() {
  246. return result;
  247. }
  248. }
  249. }