toolkit/components/places/ClusterLib.js

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/toolkit/components/places/ClusterLib.js	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,248 @@
     1.4 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.5 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.6 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.7 +
     1.8 +/**
     1.9 + * Class that can run the hierarchical clustering algorithm with the given
    1.10 + * parameters.
    1.11 + *
    1.12 + * @param distance
    1.13 + *        Function that should return the distance between two items.
    1.14 + *        Defaults to clusterlib.euclidean_distance.
    1.15 + * @param merge
    1.16 + *        Function that should take in two items and return a merged one.
    1.17 + *        Defaults to clusterlib.average_linkage.
    1.18 + * @param threshold
    1.19 + *        The maximum distance between two items for which their clusters
    1.20 + *        can be merged.
    1.21 + */
    1.22 +function HierarchicalClustering(distance, merge, threshold) {
    1.23 +  this.distance = distance || clusterlib.euclidean_distance;
    1.24 +  this.merge = merge || clusterlib.average_linkage;
    1.25 +  this.threshold = threshold == undefined ? Infinity : threshold;
    1.26 +}
    1.27 +
    1.28 +HierarchicalClustering.prototype = {
    1.29 +  /**
    1.30 +   * Run the hierarchical clustering algorithm on the given items to produce
    1.31 +   * a final set of clusters.  Uses the parameters set in the constructor.
    1.32 +   *
    1.33 +   * @param items
    1.34 +   *        An array of "things" to cluster - this is the domain-specific
    1.35 +   *        collection you're trying to cluster (colors, points, etc.)
    1.36 +   * @param snapshotGap
    1.37 +   *        How many iterations of the clustering algorithm to wait between
    1.38 +   *        calling the snapshotCallback
    1.39 +   * @param snapshotCallback
    1.40 +   *        If provided, will be called as clusters are merged to let you view
    1.41 +   *        the progress of the algorithm.  Passed the current array of
    1.42 +   *        clusters, cached distances, and cached closest clusters.
    1.43 +   *
    1.44 +   * @return An array of merged clusters.  The represented item can be
    1.45 +   *         found in the "item" property of the cluster.
    1.46 +   */
    1.47 +  cluster: function HC_cluster(items, snapshotGap, snapshotCallback) {
    1.48 +    // array of all remaining clusters
    1.49 +    let clusters = [];
    1.50 +    // 2D matrix of distances between each pair of clusters, indexed by key
    1.51 +    let distances = [];
    1.52 +    // closest cluster key for each cluster, indexed by key
    1.53 +    let neighbors = [];
    1.54 +    // an array of all clusters, but indexed by key
    1.55 +    let clustersByKey = [];
    1.56 +
    1.57 +    // set up clusters from the initial items array
    1.58 +    for (let index = 0; index < items.length; index++) {
    1.59 +      let cluster = {
    1.60 +        // the item this cluster represents
    1.61 +        item: items[index],
    1.62 +        // a unique key for this cluster, stays constant unless merged itself
    1.63 +        key: index,
    1.64 +        // index of cluster in clusters array, can change during any merge
    1.65 +        index: index,
    1.66 +        // how many clusters have been merged into this one
    1.67 +        size: 1
    1.68 +      };
    1.69 +      clusters[index] = cluster;
    1.70 +      clustersByKey[index] = cluster;
    1.71 +      distances[index] = [];
    1.72 +      neighbors[index] = 0;
    1.73 +    }
    1.74 +
    1.75 +    // initialize distance matrix and cached neighbors
    1.76 +    for (let i = 0; i < clusters.length; i++) {
    1.77 +      for (let j = 0; j <= i; j++) {
    1.78 +        var dist = (i == j) ? Infinity :
    1.79 +          this.distance(clusters[i].item, clusters[j].item);
    1.80 +        distances[i][j] = dist;
    1.81 +        distances[j][i] = dist;
    1.82 +
    1.83 +        if (dist < distances[i][neighbors[i]]) {
    1.84 +          neighbors[i] = j;
    1.85 +        }
    1.86 +      }
    1.87 +    }
    1.88 +
    1.89 +    // merge the next two closest clusters until none of them are close enough
    1.90 +    let next = null, i = 0;
    1.91 +    for (; next = this.closestClusters(clusters, distances, neighbors); i++) {
    1.92 +      if (snapshotCallback && (i % snapshotGap) == 0) {
    1.93 +        snapshotCallback(clusters);
    1.94 +      }
    1.95 +      this.mergeClusters(clusters, distances, neighbors, clustersByKey,
    1.96 +                         clustersByKey[next[0]], clustersByKey[next[1]]);
    1.97 +    }
    1.98 +    return clusters;
    1.99 +  },
   1.100 +
   1.101 +  /**
   1.102 +   * Once we decide to merge two clusters in the cluster method, actually
   1.103 +   * merge them.  Alters the given state of the algorithm.
   1.104 +   *
   1.105 +   * @param clusters
   1.106 +   *        The array of all remaining clusters
   1.107 +   * @param distances
   1.108 +   *        Cached distances between pairs of clusters
   1.109 +   * @param neighbors
   1.110 +   *        Cached closest clusters
   1.111 +   * @param clustersByKey
   1.112 +   *        Array of all clusters, indexed by key
   1.113 +   * @param cluster1
   1.114 +   *        First cluster to merge
   1.115 +   * @param cluster2
   1.116 +   *        Second cluster to merge
   1.117 +   */
   1.118 +  mergeClusters: function HC_mergeClus(clusters, distances, neighbors,
   1.119 +                                       clustersByKey, cluster1, cluster2) {
   1.120 +    let merged = { item: this.merge(cluster1.item, cluster2.item),
   1.121 +                   left: cluster1,
   1.122 +                   right: cluster2,
   1.123 +                   key: cluster1.key,
   1.124 +                   size: cluster1.size + cluster2.size };
   1.125 +
   1.126 +    clusters[cluster1.index] = merged;
   1.127 +    clusters.splice(cluster2.index, 1);
   1.128 +    clustersByKey[cluster1.key] = merged;
   1.129 +
   1.130 +    // update distances with new merged cluster
   1.131 +    for (let i = 0; i < clusters.length; i++) {
   1.132 +      var ci = clusters[i];
   1.133 +      var dist;
   1.134 +      if (cluster1.key == ci.key) {
   1.135 +        dist = Infinity;
   1.136 +      } else if (this.merge == clusterlib.single_linkage) {
   1.137 +        dist = distances[cluster1.key][ci.key];
   1.138 +        if (distances[cluster1.key][ci.key] >
   1.139 +            distances[cluster2.key][ci.key]) {
   1.140 +          dist = distances[cluster2.key][ci.key];
   1.141 +        }
   1.142 +      } else if (this.merge == clusterlib.complete_linkage) {
   1.143 +        dist = distances[cluster1.key][ci.key];
   1.144 +        if (distances[cluster1.key][ci.key] <
   1.145 +            distances[cluster2.key][ci.key]) {
   1.146 +          dist = distances[cluster2.key][ci.key];
   1.147 +        }
   1.148 +      } else if (this.merge == clusterlib.average_linkage) {
   1.149 +        dist = (distances[cluster1.key][ci.key] * cluster1.size
   1.150 +              + distances[cluster2.key][ci.key] * cluster2.size)
   1.151 +              / (cluster1.size + cluster2.size);
   1.152 +      } else {
   1.153 +        dist = this.distance(ci.item, cluster1.item);
   1.154 +      }
   1.155 +
   1.156 +      distances[cluster1.key][ci.key] = distances[ci.key][cluster1.key]
   1.157 +                                      = dist;
   1.158 +    }
   1.159 +
   1.160 +    // update cached neighbors
   1.161 +    for (let i = 0; i < clusters.length; i++) {
   1.162 +      var key1 = clusters[i].key;
   1.163 +      if (neighbors[key1] == cluster1.key ||
   1.164 +          neighbors[key1] == cluster2.key) {
   1.165 +        let minKey = key1;
   1.166 +        for (let j = 0; j < clusters.length; j++) {
   1.167 +          var key2 = clusters[j].key;
   1.168 +          if (distances[key1][key2] < distances[key1][minKey]) {
   1.169 +            minKey = key2;
   1.170 +          }
   1.171 +        }
   1.172 +        neighbors[key1] = minKey;
   1.173 +      }
   1.174 +      clusters[i].index = i;
   1.175 +    }
   1.176 +  },
   1.177 +
   1.178 +  /**
   1.179 +   * Given the current state of the algorithm, return the keys of the two
   1.180 +   * clusters that are closest to each other so we know which ones to merge
   1.181 +   * next.
   1.182 +   *
   1.183 +   * @param clusters
   1.184 +   *        The array of all remaining clusters
   1.185 +   * @param distances
   1.186 +   *        Cached distances between pairs of clusters
   1.187 +   * @param neighbors
   1.188 +   *        Cached closest clusters
   1.189 +   *
   1.190 +   * @return An array of two keys of clusters to merge, or null if there are
   1.191 +   *         no more clusters close enough to merge
   1.192 +   */
   1.193 +  closestClusters: function HC_closestClus(clusters, distances, neighbors) {
   1.194 +    let minKey = 0, minDist = Infinity;
   1.195 +    for (let i = 0; i < clusters.length; i++) {
   1.196 +      var key = clusters[i].key;
   1.197 +      if (distances[key][neighbors[key]] < minDist) {
   1.198 +        minKey = key;
   1.199 +        minDist = distances[key][neighbors[key]];
   1.200 +      }
   1.201 +    }
   1.202 +    if (minDist < this.threshold) {
   1.203 +      return [minKey, neighbors[minKey]];
   1.204 +    }
   1.205 +    return null;
   1.206 +  }
   1.207 +};
   1.208 +
   1.209 +let clusterlib = {
   1.210 +  hcluster: function hcluster(items, distance, merge, threshold, snapshotGap,
   1.211 +                              snapshotCallback) {
   1.212 +    return (new HierarchicalClustering(distance, merge, threshold))
   1.213 +           .cluster(items, snapshotGap, snapshotCallback);
   1.214 +  },
   1.215 +
   1.216 +  single_linkage: function single_linkage(cluster1, cluster2) {
   1.217 +    return cluster1;
   1.218 +  },
   1.219 +
   1.220 +  complete_linkage: function complete_linkage(cluster1, cluster2) {
   1.221 +    return cluster1;
   1.222 +  },
   1.223 +
   1.224 +  average_linkage: function average_linkage(cluster1, cluster2) {
   1.225 +    return cluster1;
   1.226 +  },
   1.227 +
   1.228 +  euclidean_distance: function euclidean_distance(v1, v2) {
   1.229 +    let total = 0;
   1.230 +    for (let i = 0; i < v1.length; i++) {
   1.231 +      total += Math.pow(v2[i] - v1[i], 2);
   1.232 +    }
   1.233 +    return Math.sqrt(total);
   1.234 +  },
   1.235 +
   1.236 +  manhattan_distance: function manhattan_distance(v1, v2) {
   1.237 +    let total = 0;
   1.238 +    for (let i = 0; i < v1.length; i++) {
   1.239 +      total += Math.abs(v2[i] - v1[i]);
   1.240 +    }
   1.241 +    return total;
   1.242 +  },
   1.243 +
   1.244 +  max_distance: function max_distance(v1, v2) {
   1.245 +    let max = 0;
   1.246 +    for (let i = 0; i < v1.length; i++) {
   1.247 +      max = Math.max(max, Math.abs(v2[i] - v1[i]));
   1.248 +    }
   1.249 +    return max;
   1.250 +  }
   1.251 +};

mercurial