Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
michael@0 | 1 | /* This Source Code Form is subject to the terms of the Mozilla Public |
michael@0 | 2 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
michael@0 | 3 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
michael@0 | 4 | |
michael@0 | 5 | /** |
michael@0 | 6 | * Class that can run the hierarchical clustering algorithm with the given |
michael@0 | 7 | * parameters. |
michael@0 | 8 | * |
michael@0 | 9 | * @param distance |
michael@0 | 10 | * Function that should return the distance between two items. |
michael@0 | 11 | * Defaults to clusterlib.euclidean_distance. |
michael@0 | 12 | * @param merge |
michael@0 | 13 | * Function that should take in two items and return a merged one. |
michael@0 | 14 | * Defaults to clusterlib.average_linkage. |
michael@0 | 15 | * @param threshold |
michael@0 | 16 | * The maximum distance between two items for which their clusters |
michael@0 | 17 | * can be merged. |
michael@0 | 18 | */ |
michael@0 | 19 | function HierarchicalClustering(distance, merge, threshold) { |
michael@0 | 20 | this.distance = distance || clusterlib.euclidean_distance; |
michael@0 | 21 | this.merge = merge || clusterlib.average_linkage; |
michael@0 | 22 | this.threshold = threshold == undefined ? Infinity : threshold; |
michael@0 | 23 | } |
michael@0 | 24 | |
michael@0 | 25 | HierarchicalClustering.prototype = { |
michael@0 | 26 | /** |
michael@0 | 27 | * Run the hierarchical clustering algorithm on the given items to produce |
michael@0 | 28 | * a final set of clusters. Uses the parameters set in the constructor. |
michael@0 | 29 | * |
michael@0 | 30 | * @param items |
michael@0 | 31 | * An array of "things" to cluster - this is the domain-specific |
michael@0 | 32 | * collection you're trying to cluster (colors, points, etc.) |
michael@0 | 33 | * @param snapshotGap |
michael@0 | 34 | * How many iterations of the clustering algorithm to wait between |
michael@0 | 35 | * calling the snapshotCallback |
michael@0 | 36 | * @param snapshotCallback |
michael@0 | 37 | * If provided, will be called as clusters are merged to let you view |
michael@0 | 38 | * the progress of the algorithm. Passed the current array of |
michael@0 | 39 | * clusters, cached distances, and cached closest clusters. |
michael@0 | 40 | * |
michael@0 | 41 | * @return An array of merged clusters. The represented item can be |
michael@0 | 42 | * found in the "item" property of the cluster. |
michael@0 | 43 | */ |
michael@0 | 44 | cluster: function HC_cluster(items, snapshotGap, snapshotCallback) { |
michael@0 | 45 | // array of all remaining clusters |
michael@0 | 46 | let clusters = []; |
michael@0 | 47 | // 2D matrix of distances between each pair of clusters, indexed by key |
michael@0 | 48 | let distances = []; |
michael@0 | 49 | // closest cluster key for each cluster, indexed by key |
michael@0 | 50 | let neighbors = []; |
michael@0 | 51 | // an array of all clusters, but indexed by key |
michael@0 | 52 | let clustersByKey = []; |
michael@0 | 53 | |
michael@0 | 54 | // set up clusters from the initial items array |
michael@0 | 55 | for (let index = 0; index < items.length; index++) { |
michael@0 | 56 | let cluster = { |
michael@0 | 57 | // the item this cluster represents |
michael@0 | 58 | item: items[index], |
michael@0 | 59 | // a unique key for this cluster, stays constant unless merged itself |
michael@0 | 60 | key: index, |
michael@0 | 61 | // index of cluster in clusters array, can change during any merge |
michael@0 | 62 | index: index, |
michael@0 | 63 | // how many clusters have been merged into this one |
michael@0 | 64 | size: 1 |
michael@0 | 65 | }; |
michael@0 | 66 | clusters[index] = cluster; |
michael@0 | 67 | clustersByKey[index] = cluster; |
michael@0 | 68 | distances[index] = []; |
michael@0 | 69 | neighbors[index] = 0; |
michael@0 | 70 | } |
michael@0 | 71 | |
michael@0 | 72 | // initialize distance matrix and cached neighbors |
michael@0 | 73 | for (let i = 0; i < clusters.length; i++) { |
michael@0 | 74 | for (let j = 0; j <= i; j++) { |
michael@0 | 75 | var dist = (i == j) ? Infinity : |
michael@0 | 76 | this.distance(clusters[i].item, clusters[j].item); |
michael@0 | 77 | distances[i][j] = dist; |
michael@0 | 78 | distances[j][i] = dist; |
michael@0 | 79 | |
michael@0 | 80 | if (dist < distances[i][neighbors[i]]) { |
michael@0 | 81 | neighbors[i] = j; |
michael@0 | 82 | } |
michael@0 | 83 | } |
michael@0 | 84 | } |
michael@0 | 85 | |
michael@0 | 86 | // merge the next two closest clusters until none of them are close enough |
michael@0 | 87 | let next = null, i = 0; |
michael@0 | 88 | for (; next = this.closestClusters(clusters, distances, neighbors); i++) { |
michael@0 | 89 | if (snapshotCallback && (i % snapshotGap) == 0) { |
michael@0 | 90 | snapshotCallback(clusters); |
michael@0 | 91 | } |
michael@0 | 92 | this.mergeClusters(clusters, distances, neighbors, clustersByKey, |
michael@0 | 93 | clustersByKey[next[0]], clustersByKey[next[1]]); |
michael@0 | 94 | } |
michael@0 | 95 | return clusters; |
michael@0 | 96 | }, |
michael@0 | 97 | |
michael@0 | 98 | /** |
michael@0 | 99 | * Once we decide to merge two clusters in the cluster method, actually |
michael@0 | 100 | * merge them. Alters the given state of the algorithm. |
michael@0 | 101 | * |
michael@0 | 102 | * @param clusters |
michael@0 | 103 | * The array of all remaining clusters |
michael@0 | 104 | * @param distances |
michael@0 | 105 | * Cached distances between pairs of clusters |
michael@0 | 106 | * @param neighbors |
michael@0 | 107 | * Cached closest clusters |
michael@0 | 108 | * @param clustersByKey |
michael@0 | 109 | * Array of all clusters, indexed by key |
michael@0 | 110 | * @param cluster1 |
michael@0 | 111 | * First cluster to merge |
michael@0 | 112 | * @param cluster2 |
michael@0 | 113 | * Second cluster to merge |
michael@0 | 114 | */ |
michael@0 | 115 | mergeClusters: function HC_mergeClus(clusters, distances, neighbors, |
michael@0 | 116 | clustersByKey, cluster1, cluster2) { |
michael@0 | 117 | let merged = { item: this.merge(cluster1.item, cluster2.item), |
michael@0 | 118 | left: cluster1, |
michael@0 | 119 | right: cluster2, |
michael@0 | 120 | key: cluster1.key, |
michael@0 | 121 | size: cluster1.size + cluster2.size }; |
michael@0 | 122 | |
michael@0 | 123 | clusters[cluster1.index] = merged; |
michael@0 | 124 | clusters.splice(cluster2.index, 1); |
michael@0 | 125 | clustersByKey[cluster1.key] = merged; |
michael@0 | 126 | |
michael@0 | 127 | // update distances with new merged cluster |
michael@0 | 128 | for (let i = 0; i < clusters.length; i++) { |
michael@0 | 129 | var ci = clusters[i]; |
michael@0 | 130 | var dist; |
michael@0 | 131 | if (cluster1.key == ci.key) { |
michael@0 | 132 | dist = Infinity; |
michael@0 | 133 | } else if (this.merge == clusterlib.single_linkage) { |
michael@0 | 134 | dist = distances[cluster1.key][ci.key]; |
michael@0 | 135 | if (distances[cluster1.key][ci.key] > |
michael@0 | 136 | distances[cluster2.key][ci.key]) { |
michael@0 | 137 | dist = distances[cluster2.key][ci.key]; |
michael@0 | 138 | } |
michael@0 | 139 | } else if (this.merge == clusterlib.complete_linkage) { |
michael@0 | 140 | dist = distances[cluster1.key][ci.key]; |
michael@0 | 141 | if (distances[cluster1.key][ci.key] < |
michael@0 | 142 | distances[cluster2.key][ci.key]) { |
michael@0 | 143 | dist = distances[cluster2.key][ci.key]; |
michael@0 | 144 | } |
michael@0 | 145 | } else if (this.merge == clusterlib.average_linkage) { |
michael@0 | 146 | dist = (distances[cluster1.key][ci.key] * cluster1.size |
michael@0 | 147 | + distances[cluster2.key][ci.key] * cluster2.size) |
michael@0 | 148 | / (cluster1.size + cluster2.size); |
michael@0 | 149 | } else { |
michael@0 | 150 | dist = this.distance(ci.item, cluster1.item); |
michael@0 | 151 | } |
michael@0 | 152 | |
michael@0 | 153 | distances[cluster1.key][ci.key] = distances[ci.key][cluster1.key] |
michael@0 | 154 | = dist; |
michael@0 | 155 | } |
michael@0 | 156 | |
michael@0 | 157 | // update cached neighbors |
michael@0 | 158 | for (let i = 0; i < clusters.length; i++) { |
michael@0 | 159 | var key1 = clusters[i].key; |
michael@0 | 160 | if (neighbors[key1] == cluster1.key || |
michael@0 | 161 | neighbors[key1] == cluster2.key) { |
michael@0 | 162 | let minKey = key1; |
michael@0 | 163 | for (let j = 0; j < clusters.length; j++) { |
michael@0 | 164 | var key2 = clusters[j].key; |
michael@0 | 165 | if (distances[key1][key2] < distances[key1][minKey]) { |
michael@0 | 166 | minKey = key2; |
michael@0 | 167 | } |
michael@0 | 168 | } |
michael@0 | 169 | neighbors[key1] = minKey; |
michael@0 | 170 | } |
michael@0 | 171 | clusters[i].index = i; |
michael@0 | 172 | } |
michael@0 | 173 | }, |
michael@0 | 174 | |
michael@0 | 175 | /** |
michael@0 | 176 | * Given the current state of the algorithm, return the keys of the two |
michael@0 | 177 | * clusters that are closest to each other so we know which ones to merge |
michael@0 | 178 | * next. |
michael@0 | 179 | * |
michael@0 | 180 | * @param clusters |
michael@0 | 181 | * The array of all remaining clusters |
michael@0 | 182 | * @param distances |
michael@0 | 183 | * Cached distances between pairs of clusters |
michael@0 | 184 | * @param neighbors |
michael@0 | 185 | * Cached closest clusters |
michael@0 | 186 | * |
michael@0 | 187 | * @return An array of two keys of clusters to merge, or null if there are |
michael@0 | 188 | * no more clusters close enough to merge |
michael@0 | 189 | */ |
michael@0 | 190 | closestClusters: function HC_closestClus(clusters, distances, neighbors) { |
michael@0 | 191 | let minKey = 0, minDist = Infinity; |
michael@0 | 192 | for (let i = 0; i < clusters.length; i++) { |
michael@0 | 193 | var key = clusters[i].key; |
michael@0 | 194 | if (distances[key][neighbors[key]] < minDist) { |
michael@0 | 195 | minKey = key; |
michael@0 | 196 | minDist = distances[key][neighbors[key]]; |
michael@0 | 197 | } |
michael@0 | 198 | } |
michael@0 | 199 | if (minDist < this.threshold) { |
michael@0 | 200 | return [minKey, neighbors[minKey]]; |
michael@0 | 201 | } |
michael@0 | 202 | return null; |
michael@0 | 203 | } |
michael@0 | 204 | }; |
michael@0 | 205 | |
michael@0 | 206 | let clusterlib = { |
michael@0 | 207 | hcluster: function hcluster(items, distance, merge, threshold, snapshotGap, |
michael@0 | 208 | snapshotCallback) { |
michael@0 | 209 | return (new HierarchicalClustering(distance, merge, threshold)) |
michael@0 | 210 | .cluster(items, snapshotGap, snapshotCallback); |
michael@0 | 211 | }, |
michael@0 | 212 | |
michael@0 | 213 | single_linkage: function single_linkage(cluster1, cluster2) { |
michael@0 | 214 | return cluster1; |
michael@0 | 215 | }, |
michael@0 | 216 | |
michael@0 | 217 | complete_linkage: function complete_linkage(cluster1, cluster2) { |
michael@0 | 218 | return cluster1; |
michael@0 | 219 | }, |
michael@0 | 220 | |
michael@0 | 221 | average_linkage: function average_linkage(cluster1, cluster2) { |
michael@0 | 222 | return cluster1; |
michael@0 | 223 | }, |
michael@0 | 224 | |
michael@0 | 225 | euclidean_distance: function euclidean_distance(v1, v2) { |
michael@0 | 226 | let total = 0; |
michael@0 | 227 | for (let i = 0; i < v1.length; i++) { |
michael@0 | 228 | total += Math.pow(v2[i] - v1[i], 2); |
michael@0 | 229 | } |
michael@0 | 230 | return Math.sqrt(total); |
michael@0 | 231 | }, |
michael@0 | 232 | |
michael@0 | 233 | manhattan_distance: function manhattan_distance(v1, v2) { |
michael@0 | 234 | let total = 0; |
michael@0 | 235 | for (let i = 0; i < v1.length; i++) { |
michael@0 | 236 | total += Math.abs(v2[i] - v1[i]); |
michael@0 | 237 | } |
michael@0 | 238 | return total; |
michael@0 | 239 | }, |
michael@0 | 240 | |
michael@0 | 241 | max_distance: function max_distance(v1, v2) { |
michael@0 | 242 | let max = 0; |
michael@0 | 243 | for (let i = 0; i < v1.length; i++) { |
michael@0 | 244 | max = Math.max(max, Math.abs(v2[i] - v1[i])); |
michael@0 | 245 | } |
michael@0 | 246 | return max; |
michael@0 | 247 | } |
michael@0 | 248 | }; |