CDbwEvaluator.java | searchcode

/common/mahout-distribution-0.7/distribution/target/mahout-distribution-0.7-src/mahout-distribution-0.7/integration/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java

https://bitbucket.org/AlexeyD/hibench · Java · 340 lines · 216 code · 19 blank · 105 comment · 33 complexity · 5d01c210b38a5cfa796c7588fd6ca698 MD5 · raw file

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.clustering.cdbw;

import java.util.Iterator;
import java.util.List;
import java.util.Map;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.GaussianAccumulator;
import org.apache.mahout.clustering.OnlineGaussianAccumulator;
import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
import org.apache.mahout.clustering.evaluation.RepresentativePointsMapper;
import org.apache.mahout.clustering.iterator.ClusterWritable;
import org.apache.mahout.common.ClassUtils;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This class calculates the CDbw metric as defined in
 * http://www.db-net.aueb.gr/index.php/corporate/content/download/227/833/file/HV_poster2002.pdf 
 */
public class CDbwEvaluator {

  private static final Logger log = LoggerFactory.getLogger(CDbwEvaluator.class);

  private final Map<Integer, List<VectorWritable>> representativePoints;
  private final Map<Integer, Double> stDevs = Maps.newHashMap();
  private final List<Cluster> clusters;
  private final DistanceMeasure measure;
  private boolean pruned;

  /**
   * For testing only
   * 
   * @param representativePoints
   *            a Map<Integer,List<VectorWritable>> of representative points keyed by clusterId
   * @param clusters
   *            a Map<Integer,Cluster> of the clusters keyed by clusterId
   * @param measure
   *            an appropriate DistanceMeasure
   */
  public CDbwEvaluator(Map<Integer, List<VectorWritable>> representativePoints,
                       List<Cluster> clusters,
                       DistanceMeasure measure) {
    this.representativePoints = representativePoints;
    this.clusters = clusters;
    this.measure = measure;
    for (Integer cId : representativePoints.keySet()) {
      computeStd(cId);
    }
  }

  /**
   * Initialize a new instance from job information
   * 
   * @param conf
   *            a Configuration with appropriate parameters
   * @param clustersIn
   *            a String path to the input clusters directory
   */
  public CDbwEvaluator(Configuration conf, Path clustersIn) {
    measure = ClassUtils.instantiateAs(conf.get(RepresentativePointsDriver.DISTANCE_MEASURE_KEY),
                                       DistanceMeasure.class);
    representativePoints = RepresentativePointsMapper.getRepresentativePoints(conf);
    clusters = loadClusters(conf, clustersIn);
    for (Integer cId : representativePoints.keySet()) {
      computeStd(cId);
    }
  }

  /**
   * Load the clusters from their sequence files
   * 
   * @param clustersIn 
   *            a String pathname to the directory containing input cluster files
   * @return a List<Cluster> of the clusters
   */
  private static List<Cluster> loadClusters(Configuration conf, Path clustersIn) {
    List<Cluster> clusters = Lists.newArrayList();
    for (ClusterWritable clusterWritable :
         new SequenceFileDirValueIterable<ClusterWritable>(clustersIn, PathType.LIST, PathFilters.logsCRCFilter(), conf)) {
    	Cluster cluster = clusterWritable.getValue();    	
        clusters.add(cluster);
    }
    return clusters;
  }

  /**
   * Compute the standard deviation of the representative points for the given cluster.
   * Store these in stDevs, indexed by cI
   * 
   * @param cI a int clusterId. 
   */
  private void computeStd(int cI) {
    List<VectorWritable> repPts = representativePoints.get(cI);
    GaussianAccumulator accumulator = new OnlineGaussianAccumulator();
    for (VectorWritable vw : repPts) {
      accumulator.observe(vw.get(), 1.0);
    }
    accumulator.compute();
    double d = accumulator.getAverageStd();
    stDevs.put(cI, d);
  }

  /**
   * Return if the cluster is valid. Valid clusters must have more than 2 representative points,
   * and at least one of them must be different than the cluster center. This is because the
   * representative points extraction will duplicate the cluster center if it is empty.
   * 
   * @param clusterI a Cluster
   * @return a boolean
   */
  private boolean invalidCluster(Cluster clusterI) {
    List<VectorWritable> repPts = representativePoints.get(clusterI.getId());
    if (repPts.size() < 2) {
      return true;
    }
    for (VectorWritable vw : repPts) {
      Vector vector = vw.get();
      if (!vector.equals(clusterI.getCenter())) {
        return false;
      }
    }
    return true;
  }

  private void pruneInvalidClusters() {
    if (pruned) {
      return;
    }
    for (Iterator<Cluster> it = clusters.iterator(); it.hasNext();) {
      Cluster cluster = it.next();
      if (invalidCluster(cluster)) {
        log.info("Pruning cluster Id={}", cluster.getId());
        it.remove();
        representativePoints.remove(cluster.getId());
      }
    }
    pruned = true;
  }

  /**
   * Compute the term density (eqn 2) used for inter-cluster density calculation
   * 
   * @param uIJ the Vector midpoint between the closest representative of the clusters
   * @param cI the int clusterId of the i-th cluster
   * @param cJ the int clusterId of the j-th cluster
   * @return a double
   */
  double interDensity(Vector uIJ, int cI, int cJ) {
    List<VectorWritable> repI = representativePoints.get(cI);
    List<VectorWritable> repJ = representativePoints.get(cJ);
    double sum = 0.0;
    Double stdevI = stDevs.get(cI);
    Double stdevJ = stDevs.get(cJ);
    // count the number of representative points of the clusters which are within the
    // average std of the two clusters from the midpoint uIJ (eqn 3)
    double avgStd = (stdevI + stdevJ) / 2.0;
    for (VectorWritable vwI : repI) {
      if (measure.distance(uIJ, vwI.get()) <= avgStd) {
        sum++;
      }
    }
    for (VectorWritable vwJ : repJ) {
      if (measure.distance(uIJ, vwJ.get()) <= avgStd) {
        sum++;
      }
    }
    int nI = repI.size();
    int nJ = repJ.size();
    return sum / (nI + nJ);
  }

  /**
   * Compute the CDbw validity metric (eqn 8). The goal of this metric is to reward clusterings which
   * have a high intraClusterDensity and also a high cluster separation.
   * 
   * @return a double
   */
  public double getCDbw() {
    pruneInvalidClusters();
    return intraClusterDensity() * separation();
  }

  /**
   * The average density within clusters is defined as the percentage of representative points that reside 
   * in the neighborhood of the clusters' centers. The goal is the density within clusters to be 
   * significantly high. (eqn 5)
   * 
   * @return a double
   */
  public double intraClusterDensity() {
    pruneInvalidClusters();
    // compute the average standard deviation of the clusters
    double stdev = 0.0;
    for (Integer cI : representativePoints.keySet()) {
      stdev += stDevs.get(cI);
    }
    int c = representativePoints.size();
    stdev /= c;
    // accumulate the summations
    double sumI = 0.0;
    for (Cluster cluster : clusters) {
      Integer cI = cluster.getId();
      List<VectorWritable> repPtsI = representativePoints.get(cI);
      int r = repPtsI.size();
      double sumJ = 0.0;
      // compute the term density (eqn 6)
      for (VectorWritable pt : repPtsI) {
        // compute f(x, vIJ) (eqn 7)
        Vector repJ = pt.get();
        double densityIJ = measure.distance(cluster.getCenter(), repJ) <= stdev ? 1.0 : 0.0;
        // accumulate sumJ
        sumJ += densityIJ / stdev;
      }
      // accumulate sumI
      sumI += sumJ / r;
    }
    return sumI / c;
  }

  /**
   * Calculate the separation of clusters (eqn 4) taking into account both the distances between the
   * clusters' closest points and the Inter-cluster density. The goal is the distances between clusters 
   * to be high while the representative point density in the areas between them are low.
   * 
   * @return a double
   */
  public double separation() {
    pruneInvalidClusters();
    double minDistanceSum = 0;
    for (int i = 0; i < clusters.size(); i++) {
      Integer cI = clusters.get(i).getId();
      List<VectorWritable> closRepI = representativePoints.get(cI);
      for (int j = 0; j < clusters.size(); j++) {
        if (i == j) {
          continue;
        }
        // find min{d(closRepI, closRepJ)}
        Integer cJ = clusters.get(j).getId();
        List<VectorWritable> closRepJ = representativePoints.get(cJ);
        double minDistance = Double.MAX_VALUE;
        for (VectorWritable aRepI : closRepI) {
          for (VectorWritable aRepJ : closRepJ) {
            double distance = measure.distance(aRepI.get(), aRepJ.get());
            if (distance < minDistance) {
              minDistance = distance;
            }
          }
        }
        minDistanceSum += minDistance;
      }
    }
    return minDistanceSum / (1.0 + interClusterDensity());
  }

  /**
   * This function evaluates the average density of points in the regions between clusters (eqn 1). 
   * The goal is the density in the area between clusters to be significant low.
   * 
   * @return a double
   */
  public double interClusterDensity() {
    pruneInvalidClusters();
    double sum = 0.0;
    // find the closest representative points between the clusters
    for (int i = 0; i < clusters.size(); i++) {
      Integer cI = clusters.get(i).getId();
      List<VectorWritable> repI = representativePoints.get(cI);
      for (int j = 1; j < clusters.size(); j++) {
        Integer cJ = clusters.get(j).getId();
        if (i == j) {
          continue;
        }
        List<VectorWritable> repJ = representativePoints.get(cJ);
        double minDistance = Double.MAX_VALUE; // the distance between the closest representative points
        Vector uIJ = null; // the midpoint between the closest representative points
        // find the closest representative points between the i-th and j-th clusters
        for (VectorWritable aRepI : repI) {
          for (VectorWritable aRepJ : repJ) {
            Vector closRepI = aRepI.get();
            Vector closRepJ = aRepJ.get();
            double distance = measure.distance(closRepI, closRepJ);
            if (distance < minDistance) {
              // set the distance and compute the midpoint
              minDistance = distance;
              uIJ = closRepI.plus(closRepJ).divide(2);
            }
          }
        }
        double stDevI = stDevs.get(cI);
        double stDevJ = stDevs.get(cJ);
        double interDensity = interDensity(uIJ, cI, cJ);
        double stdSum = stDevI + stDevJ;
        double density = 0.0;
        if (stdSum > 0.0) {
          density = minDistance * interDensity / stdSum;
        }

        if (log.isDebugEnabled()) {
          log.debug("minDistance[{},{}]={}", new Object[] {cI, cJ, minDistance});
          log.debug("stDev[{}]={}", cI, stDevI);
          log.debug("stDev[{}]={}", cJ, stDevJ);
          log.debug("interDensity[{},{}]={}", new Object[] {cI, cJ, interDensity});
          log.debug("density[{},{}]={}", new Object[] {cI, cJ, density});
        }

        sum += density;
      }
    }
    log.debug("interClusterDensity={}", sum);
    return sum;
  }
}
Tech Fingerprint

Alerts (8)

'<' Maintainability Info: Avoid using unnamed 'magic' numbers directly in comparisons or assignments. Use named constants (static final variables) instead to improve readability and maintainability.
141
'=' Maintainability Info: Avoid using unnamed 'magic' numbers directly in comparisons or assignments. Use named constants (static final variables) instead to improve readability and maintainability.
184 313
Complexity hotspot; lines 261 to 262 (total complexity: 3)
261 262
'Object[]' Avoid using raw 'Object[]'. Prefer typed arrays (e.g., String[]) or generic Collections (e.g., List<Object> or List<SpecificType>). If converting a collection, use `toArray(new Type[0])` or `toArray(Type[]::new)`.
327 330 331