/common/mahout-distribution-0.7/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/integration/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java
Java | 340 lines | 216 code | 19 blank | 105 comment | 33 complexity | 5d01c210b38a5cfa796c7588fd6ca698 MD5 | raw file
Possible License(s): Apache-2.0, BSD-3-Clause
- /**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- package org.apache.mahout.clustering.cdbw;
- import java.util.Iterator;
- import java.util.List;
- import java.util.Map;
- import com.google.common.collect.Lists;
- import com.google.common.collect.Maps;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.Path;
- import org.apache.mahout.clustering.Cluster;
- import org.apache.mahout.clustering.GaussianAccumulator;
- import org.apache.mahout.clustering.OnlineGaussianAccumulator;
- import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
- import org.apache.mahout.clustering.evaluation.RepresentativePointsMapper;
- import org.apache.mahout.clustering.iterator.ClusterWritable;
- import org.apache.mahout.common.ClassUtils;
- import org.apache.mahout.common.distance.DistanceMeasure;
- import org.apache.mahout.common.iterator.sequencefile.PathFilters;
- import org.apache.mahout.common.iterator.sequencefile.PathType;
- import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
- import org.apache.mahout.math.Vector;
- import org.apache.mahout.math.VectorWritable;
- import org.slf4j.Logger;
- import org.slf4j.LoggerFactory;
- /**
- * This class calculates the CDbw metric as defined in
- * http://www.db-net.aueb.gr/index.php/corporate/content/download/227/833/file/HV_poster2002.pdf
- */
- public class CDbwEvaluator {
- private static final Logger log = LoggerFactory.getLogger(CDbwEvaluator.class);
- private final Map<Integer, List<VectorWritable>> representativePoints;
- private final Map<Integer, Double> stDevs = Maps.newHashMap();
- private final List<Cluster> clusters;
- private final DistanceMeasure measure;
- private boolean pruned;
- /**
- * For testing only
- *
- * @param representativePoints
- * a Map<Integer,List<VectorWritable>> of representative points keyed by clusterId
- * @param clusters
- * a Map<Integer,Cluster> of the clusters keyed by clusterId
- * @param measure
- * an appropriate DistanceMeasure
- */
- public CDbwEvaluator(Map<Integer, List<VectorWritable>> representativePoints,
- List<Cluster> clusters,
- DistanceMeasure measure) {
- this.representativePoints = representativePoints;
- this.clusters = clusters;
- this.measure = measure;
- for (Integer cId : representativePoints.keySet()) {
- computeStd(cId);
- }
- }
- /**
- * Initialize a new instance from job information
- *
- * @param conf
- * a Configuration with appropriate parameters
- * @param clustersIn
- * a String path to the input clusters directory
- */
- public CDbwEvaluator(Configuration conf, Path clustersIn) {
- measure = ClassUtils.instantiateAs(conf.get(RepresentativePointsDriver.DISTANCE_MEASURE_KEY),
- DistanceMeasure.class);
- representativePoints = RepresentativePointsMapper.getRepresentativePoints(conf);
- clusters = loadClusters(conf, clustersIn);
- for (Integer cId : representativePoints.keySet()) {
- computeStd(cId);
- }
- }
- /**
- * Load the clusters from their sequence files
- *
- * @param clustersIn
- * a String pathname to the directory containing input cluster files
- * @return a List<Cluster> of the clusters
- */
- private static List<Cluster> loadClusters(Configuration conf, Path clustersIn) {
- List<Cluster> clusters = Lists.newArrayList();
- for (ClusterWritable clusterWritable :
- new SequenceFileDirValueIterable<ClusterWritable>(clustersIn, PathType.LIST, PathFilters.logsCRCFilter(), conf)) {
- Cluster cluster = clusterWritable.getValue();
- clusters.add(cluster);
- }
- return clusters;
- }
- /**
- * Compute the standard deviation of the representative points for the given cluster.
- * Store these in stDevs, indexed by cI
- *
- * @param cI a int clusterId.
- */
- private void computeStd(int cI) {
- List<VectorWritable> repPts = representativePoints.get(cI);
- GaussianAccumulator accumulator = new OnlineGaussianAccumulator();
- for (VectorWritable vw : repPts) {
- accumulator.observe(vw.get(), 1.0);
- }
- accumulator.compute();
- double d = accumulator.getAverageStd();
- stDevs.put(cI, d);
- }
- /**
- * Return if the cluster is valid. Valid clusters must have more than 2 representative points,
- * and at least one of them must be different than the cluster center. This is because the
- * representative points extraction will duplicate the cluster center if it is empty.
- *
- * @param clusterI a Cluster
- * @return a boolean
- */
- private boolean invalidCluster(Cluster clusterI) {
- List<VectorWritable> repPts = representativePoints.get(clusterI.getId());
- if (repPts.size() < 2) {
- return true;
- }
- for (VectorWritable vw : repPts) {
- Vector vector = vw.get();
- if (!vector.equals(clusterI.getCenter())) {
- return false;
- }
- }
- return true;
- }
- private void pruneInvalidClusters() {
- if (pruned) {
- return;
- }
- for (Iterator<Cluster> it = clusters.iterator(); it.hasNext();) {
- Cluster cluster = it.next();
- if (invalidCluster(cluster)) {
- log.info("Pruning cluster Id={}", cluster.getId());
- it.remove();
- representativePoints.remove(cluster.getId());
- }
- }
- pruned = true;
- }
- /**
- * Compute the term density (eqn 2) used for inter-cluster density calculation
- *
- * @param uIJ the Vector midpoint between the closest representative of the clusters
- * @param cI the int clusterId of the i-th cluster
- * @param cJ the int clusterId of the j-th cluster
- * @return a double
- */
- double interDensity(Vector uIJ, int cI, int cJ) {
- List<VectorWritable> repI = representativePoints.get(cI);
- List<VectorWritable> repJ = representativePoints.get(cJ);
- double sum = 0.0;
- Double stdevI = stDevs.get(cI);
- Double stdevJ = stDevs.get(cJ);
- // count the number of representative points of the clusters which are within the
- // average std of the two clusters from the midpoint uIJ (eqn 3)
- double avgStd = (stdevI + stdevJ) / 2.0;
- for (VectorWritable vwI : repI) {
- if (measure.distance(uIJ, vwI.get()) <= avgStd) {
- sum++;
- }
- }
- for (VectorWritable vwJ : repJ) {
- if (measure.distance(uIJ, vwJ.get()) <= avgStd) {
- sum++;
- }
- }
- int nI = repI.size();
- int nJ = repJ.size();
- return sum / (nI + nJ);
- }
- /**
- * Compute the CDbw validity metric (eqn 8). The goal of this metric is to reward clusterings which
- * have a high intraClusterDensity and also a high cluster separation.
- *
- * @return a double
- */
- public double getCDbw() {
- pruneInvalidClusters();
- return intraClusterDensity() * separation();
- }
- /**
- * The average density within clusters is defined as the percentage of representative points that reside
- * in the neighborhood of the clusters' centers. The goal is the density within clusters to be
- * significantly high. (eqn 5)
- *
- * @return a double
- */
- public double intraClusterDensity() {
- pruneInvalidClusters();
- // compute the average standard deviation of the clusters
- double stdev = 0.0;
- for (Integer cI : representativePoints.keySet()) {
- stdev += stDevs.get(cI);
- }
- int c = representativePoints.size();
- stdev /= c;
- // accumulate the summations
- double sumI = 0.0;
- for (Cluster cluster : clusters) {
- Integer cI = cluster.getId();
- List<VectorWritable> repPtsI = representativePoints.get(cI);
- int r = repPtsI.size();
- double sumJ = 0.0;
- // compute the term density (eqn 6)
- for (VectorWritable pt : repPtsI) {
- // compute f(x, vIJ) (eqn 7)
- Vector repJ = pt.get();
- double densityIJ = measure.distance(cluster.getCenter(), repJ) <= stdev ? 1.0 : 0.0;
- // accumulate sumJ
- sumJ += densityIJ / stdev;
- }
- // accumulate sumI
- sumI += sumJ / r;
- }
- return sumI / c;
- }
- /**
- * Calculate the separation of clusters (eqn 4) taking into account both the distances between the
- * clusters' closest points and the Inter-cluster density. The goal is the distances between clusters
- * to be high while the representative point density in the areas between them are low.
- *
- * @return a double
- */
- public double separation() {
- pruneInvalidClusters();
- double minDistanceSum = 0;
- for (int i = 0; i < clusters.size(); i++) {
- Integer cI = clusters.get(i).getId();
- List<VectorWritable> closRepI = representativePoints.get(cI);
- for (int j = 0; j < clusters.size(); j++) {
- if (i == j) {
- continue;
- }
- // find min{d(closRepI, closRepJ)}
- Integer cJ = clusters.get(j).getId();
- List<VectorWritable> closRepJ = representativePoints.get(cJ);
- double minDistance = Double.MAX_VALUE;
- for (VectorWritable aRepI : closRepI) {
- for (VectorWritable aRepJ : closRepJ) {
- double distance = measure.distance(aRepI.get(), aRepJ.get());
- if (distance < minDistance) {
- minDistance = distance;
- }
- }
- }
- minDistanceSum += minDistance;
- }
- }
- return minDistanceSum / (1.0 + interClusterDensity());
- }
- /**
- * This function evaluates the average density of points in the regions between clusters (eqn 1).
- * The goal is the density in the area between clusters to be significant low.
- *
- * @return a double
- */
- public double interClusterDensity() {
- pruneInvalidClusters();
- double sum = 0.0;
- // find the closest representative points between the clusters
- for (int i = 0; i < clusters.size(); i++) {
- Integer cI = clusters.get(i).getId();
- List<VectorWritable> repI = representativePoints.get(cI);
- for (int j = 1; j < clusters.size(); j++) {
- Integer cJ = clusters.get(j).getId();
- if (i == j) {
- continue;
- }
- List<VectorWritable> repJ = representativePoints.get(cJ);
- double minDistance = Double.MAX_VALUE; // the distance between the closest representative points
- Vector uIJ = null; // the midpoint between the closest representative points
- // find the closest representative points between the i-th and j-th clusters
- for (VectorWritable aRepI : repI) {
- for (VectorWritable aRepJ : repJ) {
- Vector closRepI = aRepI.get();
- Vector closRepJ = aRepJ.get();
- double distance = measure.distance(closRepI, closRepJ);
- if (distance < minDistance) {
- // set the distance and compute the midpoint
- minDistance = distance;
- uIJ = closRepI.plus(closRepJ).divide(2);
- }
- }
- }
- double stDevI = stDevs.get(cI);
- double stDevJ = stDevs.get(cJ);
- double interDensity = interDensity(uIJ, cI, cJ);
- double stdSum = stDevI + stDevJ;
- double density = 0.0;
- if (stdSum > 0.0) {
- density = minDistance * interDensity / stdSum;
- }
- if (log.isDebugEnabled()) {
- log.debug("minDistance[{},{}]={}", new Object[] {cI, cJ, minDistance});
- log.debug("stDev[{}]={}", cI, stDevI);
- log.debug("stDev[{}]={}", cJ, stDevJ);
- log.debug("interDensity[{},{}]={}", new Object[] {cI, cJ, interDensity});
- log.debug("density[{},{}]={}", new Object[] {cI, cJ, density});
- }
- sum += density;
- }
- }
- log.debug("interClusterDensity={}", sum);
- return sum;
- }
- }