michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: /** michael@0: * Class that can run the hierarchical clustering algorithm with the given michael@0: * parameters. michael@0: * michael@0: * @param distance michael@0: * Function that should return the distance between two items. michael@0: * Defaults to clusterlib.euclidean_distance. michael@0: * @param merge michael@0: * Function that should take in two items and return a merged one. michael@0: * Defaults to clusterlib.average_linkage. michael@0: * @param threshold michael@0: * The maximum distance between two items for which their clusters michael@0: * can be merged. michael@0: */ michael@0: function HierarchicalClustering(distance, merge, threshold) { michael@0: this.distance = distance || clusterlib.euclidean_distance; michael@0: this.merge = merge || clusterlib.average_linkage; michael@0: this.threshold = threshold == undefined ? Infinity : threshold; michael@0: } michael@0: michael@0: HierarchicalClustering.prototype = { michael@0: /** michael@0: * Run the hierarchical clustering algorithm on the given items to produce michael@0: * a final set of clusters. Uses the parameters set in the constructor. michael@0: * michael@0: * @param items michael@0: * An array of "things" to cluster - this is the domain-specific michael@0: * collection you're trying to cluster (colors, points, etc.) michael@0: * @param snapshotGap michael@0: * How many iterations of the clustering algorithm to wait between michael@0: * calling the snapshotCallback michael@0: * @param snapshotCallback michael@0: * If provided, will be called as clusters are merged to let you view michael@0: * the progress of the algorithm. Passed the current array of michael@0: * clusters, cached distances, and cached closest clusters. michael@0: * michael@0: * @return An array of merged clusters. The represented item can be michael@0: * found in the "item" property of the cluster. michael@0: */ michael@0: cluster: function HC_cluster(items, snapshotGap, snapshotCallback) { michael@0: // array of all remaining clusters michael@0: let clusters = []; michael@0: // 2D matrix of distances between each pair of clusters, indexed by key michael@0: let distances = []; michael@0: // closest cluster key for each cluster, indexed by key michael@0: let neighbors = []; michael@0: // an array of all clusters, but indexed by key michael@0: let clustersByKey = []; michael@0: michael@0: // set up clusters from the initial items array michael@0: for (let index = 0; index < items.length; index++) { michael@0: let cluster = { michael@0: // the item this cluster represents michael@0: item: items[index], michael@0: // a unique key for this cluster, stays constant unless merged itself michael@0: key: index, michael@0: // index of cluster in clusters array, can change during any merge michael@0: index: index, michael@0: // how many clusters have been merged into this one michael@0: size: 1 michael@0: }; michael@0: clusters[index] = cluster; michael@0: clustersByKey[index] = cluster; michael@0: distances[index] = []; michael@0: neighbors[index] = 0; michael@0: } michael@0: michael@0: // initialize distance matrix and cached neighbors michael@0: for (let i = 0; i < clusters.length; i++) { michael@0: for (let j = 0; j <= i; j++) { michael@0: var dist = (i == j) ? Infinity : michael@0: this.distance(clusters[i].item, clusters[j].item); michael@0: distances[i][j] = dist; michael@0: distances[j][i] = dist; michael@0: michael@0: if (dist < distances[i][neighbors[i]]) { michael@0: neighbors[i] = j; michael@0: } michael@0: } michael@0: } michael@0: michael@0: // merge the next two closest clusters until none of them are close enough michael@0: let next = null, i = 0; michael@0: for (; next = this.closestClusters(clusters, distances, neighbors); i++) { michael@0: if (snapshotCallback && (i % snapshotGap) == 0) { michael@0: snapshotCallback(clusters); michael@0: } michael@0: this.mergeClusters(clusters, distances, neighbors, clustersByKey, michael@0: clustersByKey[next[0]], clustersByKey[next[1]]); michael@0: } michael@0: return clusters; michael@0: }, michael@0: michael@0: /** michael@0: * Once we decide to merge two clusters in the cluster method, actually michael@0: * merge them. Alters the given state of the algorithm. michael@0: * michael@0: * @param clusters michael@0: * The array of all remaining clusters michael@0: * @param distances michael@0: * Cached distances between pairs of clusters michael@0: * @param neighbors michael@0: * Cached closest clusters michael@0: * @param clustersByKey michael@0: * Array of all clusters, indexed by key michael@0: * @param cluster1 michael@0: * First cluster to merge michael@0: * @param cluster2 michael@0: * Second cluster to merge michael@0: */ michael@0: mergeClusters: function HC_mergeClus(clusters, distances, neighbors, michael@0: clustersByKey, cluster1, cluster2) { michael@0: let merged = { item: this.merge(cluster1.item, cluster2.item), michael@0: left: cluster1, michael@0: right: cluster2, michael@0: key: cluster1.key, michael@0: size: cluster1.size + cluster2.size }; michael@0: michael@0: clusters[cluster1.index] = merged; michael@0: clusters.splice(cluster2.index, 1); michael@0: clustersByKey[cluster1.key] = merged; michael@0: michael@0: // update distances with new merged cluster michael@0: for (let i = 0; i < clusters.length; i++) { michael@0: var ci = clusters[i]; michael@0: var dist; michael@0: if (cluster1.key == ci.key) { michael@0: dist = Infinity; michael@0: } else if (this.merge == clusterlib.single_linkage) { michael@0: dist = distances[cluster1.key][ci.key]; michael@0: if (distances[cluster1.key][ci.key] > michael@0: distances[cluster2.key][ci.key]) { michael@0: dist = distances[cluster2.key][ci.key]; michael@0: } michael@0: } else if (this.merge == clusterlib.complete_linkage) { michael@0: dist = distances[cluster1.key][ci.key]; michael@0: if (distances[cluster1.key][ci.key] < michael@0: distances[cluster2.key][ci.key]) { michael@0: dist = distances[cluster2.key][ci.key]; michael@0: } michael@0: } else if (this.merge == clusterlib.average_linkage) { michael@0: dist = (distances[cluster1.key][ci.key] * cluster1.size michael@0: + distances[cluster2.key][ci.key] * cluster2.size) michael@0: / (cluster1.size + cluster2.size); michael@0: } else { michael@0: dist = this.distance(ci.item, cluster1.item); michael@0: } michael@0: michael@0: distances[cluster1.key][ci.key] = distances[ci.key][cluster1.key] michael@0: = dist; michael@0: } michael@0: michael@0: // update cached neighbors michael@0: for (let i = 0; i < clusters.length; i++) { michael@0: var key1 = clusters[i].key; michael@0: if (neighbors[key1] == cluster1.key || michael@0: neighbors[key1] == cluster2.key) { michael@0: let minKey = key1; michael@0: for (let j = 0; j < clusters.length; j++) { michael@0: var key2 = clusters[j].key; michael@0: if (distances[key1][key2] < distances[key1][minKey]) { michael@0: minKey = key2; michael@0: } michael@0: } michael@0: neighbors[key1] = minKey; michael@0: } michael@0: clusters[i].index = i; michael@0: } michael@0: }, michael@0: michael@0: /** michael@0: * Given the current state of the algorithm, return the keys of the two michael@0: * clusters that are closest to each other so we know which ones to merge michael@0: * next. michael@0: * michael@0: * @param clusters michael@0: * The array of all remaining clusters michael@0: * @param distances michael@0: * Cached distances between pairs of clusters michael@0: * @param neighbors michael@0: * Cached closest clusters michael@0: * michael@0: * @return An array of two keys of clusters to merge, or null if there are michael@0: * no more clusters close enough to merge michael@0: */ michael@0: closestClusters: function HC_closestClus(clusters, distances, neighbors) { michael@0: let minKey = 0, minDist = Infinity; michael@0: for (let i = 0; i < clusters.length; i++) { michael@0: var key = clusters[i].key; michael@0: if (distances[key][neighbors[key]] < minDist) { michael@0: minKey = key; michael@0: minDist = distances[key][neighbors[key]]; michael@0: } michael@0: } michael@0: if (minDist < this.threshold) { michael@0: return [minKey, neighbors[minKey]]; michael@0: } michael@0: return null; michael@0: } michael@0: }; michael@0: michael@0: let clusterlib = { michael@0: hcluster: function hcluster(items, distance, merge, threshold, snapshotGap, michael@0: snapshotCallback) { michael@0: return (new HierarchicalClustering(distance, merge, threshold)) michael@0: .cluster(items, snapshotGap, snapshotCallback); michael@0: }, michael@0: michael@0: single_linkage: function single_linkage(cluster1, cluster2) { michael@0: return cluster1; michael@0: }, michael@0: michael@0: complete_linkage: function complete_linkage(cluster1, cluster2) { michael@0: return cluster1; michael@0: }, michael@0: michael@0: average_linkage: function average_linkage(cluster1, cluster2) { michael@0: return cluster1; michael@0: }, michael@0: michael@0: euclidean_distance: function euclidean_distance(v1, v2) { michael@0: let total = 0; michael@0: for (let i = 0; i < v1.length; i++) { michael@0: total += Math.pow(v2[i] - v1[i], 2); michael@0: } michael@0: return Math.sqrt(total); michael@0: }, michael@0: michael@0: manhattan_distance: function manhattan_distance(v1, v2) { michael@0: let total = 0; michael@0: for (let i = 0; i < v1.length; i++) { michael@0: total += Math.abs(v2[i] - v1[i]); michael@0: } michael@0: return total; michael@0: }, michael@0: michael@0: max_distance: function max_distance(v1, v2) { michael@0: let max = 0; michael@0: for (let i = 0; i < v1.length; i++) { michael@0: max = Math.max(max, Math.abs(v2[i] - v1[i])); michael@0: } michael@0: return max; michael@0: } michael@0: };