PageRenderTime 51ms CodeModel.GetById 26ms RepoModel.GetById 0ms app.codeStats 0ms

/common/mahout-distribution-0.7/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/integration/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java

https://bitbucket.org/AlexeyD/hibench
Java | 340 lines | 216 code | 19 blank | 105 comment | 33 complexity | 5d01c210b38a5cfa796c7588fd6ca698 MD5 | raw file
Possible License(s): Apache-2.0, BSD-3-Clause
  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. package org.apache.mahout.clustering.cdbw;
  18. import java.util.Iterator;
  19. import java.util.List;
  20. import java.util.Map;
  21. import com.google.common.collect.Lists;
  22. import com.google.common.collect.Maps;
  23. import org.apache.hadoop.conf.Configuration;
  24. import org.apache.hadoop.fs.Path;
  25. import org.apache.mahout.clustering.Cluster;
  26. import org.apache.mahout.clustering.GaussianAccumulator;
  27. import org.apache.mahout.clustering.OnlineGaussianAccumulator;
  28. import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
  29. import org.apache.mahout.clustering.evaluation.RepresentativePointsMapper;
  30. import org.apache.mahout.clustering.iterator.ClusterWritable;
  31. import org.apache.mahout.common.ClassUtils;
  32. import org.apache.mahout.common.distance.DistanceMeasure;
  33. import org.apache.mahout.common.iterator.sequencefile.PathFilters;
  34. import org.apache.mahout.common.iterator.sequencefile.PathType;
  35. import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
  36. import org.apache.mahout.math.Vector;
  37. import org.apache.mahout.math.VectorWritable;
  38. import org.slf4j.Logger;
  39. import org.slf4j.LoggerFactory;
  40. /**
  41. * This class calculates the CDbw metric as defined in
  42. * http://www.db-net.aueb.gr/index.php/corporate/content/download/227/833/file/HV_poster2002.pdf
  43. */
  44. public class CDbwEvaluator {
  45. private static final Logger log = LoggerFactory.getLogger(CDbwEvaluator.class);
  46. private final Map<Integer, List<VectorWritable>> representativePoints;
  47. private final Map<Integer, Double> stDevs = Maps.newHashMap();
  48. private final List<Cluster> clusters;
  49. private final DistanceMeasure measure;
  50. private boolean pruned;
  51. /**
  52. * For testing only
  53. *
  54. * @param representativePoints
  55. * a Map<Integer,List<VectorWritable>> of representative points keyed by clusterId
  56. * @param clusters
  57. * a Map<Integer,Cluster> of the clusters keyed by clusterId
  58. * @param measure
  59. * an appropriate DistanceMeasure
  60. */
  61. public CDbwEvaluator(Map<Integer, List<VectorWritable>> representativePoints,
  62. List<Cluster> clusters,
  63. DistanceMeasure measure) {
  64. this.representativePoints = representativePoints;
  65. this.clusters = clusters;
  66. this.measure = measure;
  67. for (Integer cId : representativePoints.keySet()) {
  68. computeStd(cId);
  69. }
  70. }
  71. /**
  72. * Initialize a new instance from job information
  73. *
  74. * @param conf
  75. * a Configuration with appropriate parameters
  76. * @param clustersIn
  77. * a String path to the input clusters directory
  78. */
  79. public CDbwEvaluator(Configuration conf, Path clustersIn) {
  80. measure = ClassUtils.instantiateAs(conf.get(RepresentativePointsDriver.DISTANCE_MEASURE_KEY),
  81. DistanceMeasure.class);
  82. representativePoints = RepresentativePointsMapper.getRepresentativePoints(conf);
  83. clusters = loadClusters(conf, clustersIn);
  84. for (Integer cId : representativePoints.keySet()) {
  85. computeStd(cId);
  86. }
  87. }
  88. /**
  89. * Load the clusters from their sequence files
  90. *
  91. * @param clustersIn
  92. * a String pathname to the directory containing input cluster files
  93. * @return a List<Cluster> of the clusters
  94. */
  95. private static List<Cluster> loadClusters(Configuration conf, Path clustersIn) {
  96. List<Cluster> clusters = Lists.newArrayList();
  97. for (ClusterWritable clusterWritable :
  98. new SequenceFileDirValueIterable<ClusterWritable>(clustersIn, PathType.LIST, PathFilters.logsCRCFilter(), conf)) {
  99. Cluster cluster = clusterWritable.getValue();
  100. clusters.add(cluster);
  101. }
  102. return clusters;
  103. }
  104. /**
  105. * Compute the standard deviation of the representative points for the given cluster.
  106. * Store these in stDevs, indexed by cI
  107. *
  108. * @param cI a int clusterId.
  109. */
  110. private void computeStd(int cI) {
  111. List<VectorWritable> repPts = representativePoints.get(cI);
  112. GaussianAccumulator accumulator = new OnlineGaussianAccumulator();
  113. for (VectorWritable vw : repPts) {
  114. accumulator.observe(vw.get(), 1.0);
  115. }
  116. accumulator.compute();
  117. double d = accumulator.getAverageStd();
  118. stDevs.put(cI, d);
  119. }
  120. /**
  121. * Return if the cluster is valid. Valid clusters must have more than 2 representative points,
  122. * and at least one of them must be different than the cluster center. This is because the
  123. * representative points extraction will duplicate the cluster center if it is empty.
  124. *
  125. * @param clusterI a Cluster
  126. * @return a boolean
  127. */
  128. private boolean invalidCluster(Cluster clusterI) {
  129. List<VectorWritable> repPts = representativePoints.get(clusterI.getId());
  130. if (repPts.size() < 2) {
  131. return true;
  132. }
  133. for (VectorWritable vw : repPts) {
  134. Vector vector = vw.get();
  135. if (!vector.equals(clusterI.getCenter())) {
  136. return false;
  137. }
  138. }
  139. return true;
  140. }
  141. private void pruneInvalidClusters() {
  142. if (pruned) {
  143. return;
  144. }
  145. for (Iterator<Cluster> it = clusters.iterator(); it.hasNext();) {
  146. Cluster cluster = it.next();
  147. if (invalidCluster(cluster)) {
  148. log.info("Pruning cluster Id={}", cluster.getId());
  149. it.remove();
  150. representativePoints.remove(cluster.getId());
  151. }
  152. }
  153. pruned = true;
  154. }
  155. /**
  156. * Compute the term density (eqn 2) used for inter-cluster density calculation
  157. *
  158. * @param uIJ the Vector midpoint between the closest representative of the clusters
  159. * @param cI the int clusterId of the i-th cluster
  160. * @param cJ the int clusterId of the j-th cluster
  161. * @return a double
  162. */
  163. double interDensity(Vector uIJ, int cI, int cJ) {
  164. List<VectorWritable> repI = representativePoints.get(cI);
  165. List<VectorWritable> repJ = representativePoints.get(cJ);
  166. double sum = 0.0;
  167. Double stdevI = stDevs.get(cI);
  168. Double stdevJ = stDevs.get(cJ);
  169. // count the number of representative points of the clusters which are within the
  170. // average std of the two clusters from the midpoint uIJ (eqn 3)
  171. double avgStd = (stdevI + stdevJ) / 2.0;
  172. for (VectorWritable vwI : repI) {
  173. if (measure.distance(uIJ, vwI.get()) <= avgStd) {
  174. sum++;
  175. }
  176. }
  177. for (VectorWritable vwJ : repJ) {
  178. if (measure.distance(uIJ, vwJ.get()) <= avgStd) {
  179. sum++;
  180. }
  181. }
  182. int nI = repI.size();
  183. int nJ = repJ.size();
  184. return sum / (nI + nJ);
  185. }
  186. /**
  187. * Compute the CDbw validity metric (eqn 8). The goal of this metric is to reward clusterings which
  188. * have a high intraClusterDensity and also a high cluster separation.
  189. *
  190. * @return a double
  191. */
  192. public double getCDbw() {
  193. pruneInvalidClusters();
  194. return intraClusterDensity() * separation();
  195. }
  196. /**
  197. * The average density within clusters is defined as the percentage of representative points that reside
  198. * in the neighborhood of the clusters' centers. The goal is the density within clusters to be
  199. * significantly high. (eqn 5)
  200. *
  201. * @return a double
  202. */
  203. public double intraClusterDensity() {
  204. pruneInvalidClusters();
  205. // compute the average standard deviation of the clusters
  206. double stdev = 0.0;
  207. for (Integer cI : representativePoints.keySet()) {
  208. stdev += stDevs.get(cI);
  209. }
  210. int c = representativePoints.size();
  211. stdev /= c;
  212. // accumulate the summations
  213. double sumI = 0.0;
  214. for (Cluster cluster : clusters) {
  215. Integer cI = cluster.getId();
  216. List<VectorWritable> repPtsI = representativePoints.get(cI);
  217. int r = repPtsI.size();
  218. double sumJ = 0.0;
  219. // compute the term density (eqn 6)
  220. for (VectorWritable pt : repPtsI) {
  221. // compute f(x, vIJ) (eqn 7)
  222. Vector repJ = pt.get();
  223. double densityIJ = measure.distance(cluster.getCenter(), repJ) <= stdev ? 1.0 : 0.0;
  224. // accumulate sumJ
  225. sumJ += densityIJ / stdev;
  226. }
  227. // accumulate sumI
  228. sumI += sumJ / r;
  229. }
  230. return sumI / c;
  231. }
  232. /**
  233. * Calculate the separation of clusters (eqn 4) taking into account both the distances between the
  234. * clusters' closest points and the Inter-cluster density. The goal is the distances between clusters
  235. * to be high while the representative point density in the areas between them are low.
  236. *
  237. * @return a double
  238. */
  239. public double separation() {
  240. pruneInvalidClusters();
  241. double minDistanceSum = 0;
  242. for (int i = 0; i < clusters.size(); i++) {
  243. Integer cI = clusters.get(i).getId();
  244. List<VectorWritable> closRepI = representativePoints.get(cI);
  245. for (int j = 0; j < clusters.size(); j++) {
  246. if (i == j) {
  247. continue;
  248. }
  249. // find min{d(closRepI, closRepJ)}
  250. Integer cJ = clusters.get(j).getId();
  251. List<VectorWritable> closRepJ = representativePoints.get(cJ);
  252. double minDistance = Double.MAX_VALUE;
  253. for (VectorWritable aRepI : closRepI) {
  254. for (VectorWritable aRepJ : closRepJ) {
  255. double distance = measure.distance(aRepI.get(), aRepJ.get());
  256. if (distance < minDistance) {
  257. minDistance = distance;
  258. }
  259. }
  260. }
  261. minDistanceSum += minDistance;
  262. }
  263. }
  264. return minDistanceSum / (1.0 + interClusterDensity());
  265. }
  266. /**
  267. * This function evaluates the average density of points in the regions between clusters (eqn 1).
  268. * The goal is the density in the area between clusters to be significant low.
  269. *
  270. * @return a double
  271. */
  272. public double interClusterDensity() {
  273. pruneInvalidClusters();
  274. double sum = 0.0;
  275. // find the closest representative points between the clusters
  276. for (int i = 0; i < clusters.size(); i++) {
  277. Integer cI = clusters.get(i).getId();
  278. List<VectorWritable> repI = representativePoints.get(cI);
  279. for (int j = 1; j < clusters.size(); j++) {
  280. Integer cJ = clusters.get(j).getId();
  281. if (i == j) {
  282. continue;
  283. }
  284. List<VectorWritable> repJ = representativePoints.get(cJ);
  285. double minDistance = Double.MAX_VALUE; // the distance between the closest representative points
  286. Vector uIJ = null; // the midpoint between the closest representative points
  287. // find the closest representative points between the i-th and j-th clusters
  288. for (VectorWritable aRepI : repI) {
  289. for (VectorWritable aRepJ : repJ) {
  290. Vector closRepI = aRepI.get();
  291. Vector closRepJ = aRepJ.get();
  292. double distance = measure.distance(closRepI, closRepJ);
  293. if (distance < minDistance) {
  294. // set the distance and compute the midpoint
  295. minDistance = distance;
  296. uIJ = closRepI.plus(closRepJ).divide(2);
  297. }
  298. }
  299. }
  300. double stDevI = stDevs.get(cI);
  301. double stDevJ = stDevs.get(cJ);
  302. double interDensity = interDensity(uIJ, cI, cJ);
  303. double stdSum = stDevI + stDevJ;
  304. double density = 0.0;
  305. if (stdSum > 0.0) {
  306. density = minDistance * interDensity / stdSum;
  307. }
  308. if (log.isDebugEnabled()) {
  309. log.debug("minDistance[{},{}]={}", new Object[] {cI, cJ, minDistance});
  310. log.debug("stDev[{}]={}", cI, stDevI);
  311. log.debug("stDev[{}]={}", cJ, stDevJ);
  312. log.debug("interDensity[{},{}]={}", new Object[] {cI, cJ, interDensity});
  313. log.debug("density[{},{}]={}", new Object[] {cI, cJ, density});
  314. }
  315. sum += density;
  316. }
  317. }
  318. log.debug("interClusterDensity={}", sum);
  319. return sum;
  320. }
  321. }