|
1 /* This Source Code Form is subject to the terms of the Mozilla Public |
|
2 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
4 |
|
5 /** |
|
6 * Class that can run the hierarchical clustering algorithm with the given |
|
7 * parameters. |
|
8 * |
|
9 * @param distance |
|
10 * Function that should return the distance between two items. |
|
11 * Defaults to clusterlib.euclidean_distance. |
|
12 * @param merge |
|
13 * Function that should take in two items and return a merged one. |
|
14 * Defaults to clusterlib.average_linkage. |
|
15 * @param threshold |
|
16 * The maximum distance between two items for which their clusters |
|
17 * can be merged. |
|
18 */ |
|
19 function HierarchicalClustering(distance, merge, threshold) { |
|
20 this.distance = distance || clusterlib.euclidean_distance; |
|
21 this.merge = merge || clusterlib.average_linkage; |
|
22 this.threshold = threshold == undefined ? Infinity : threshold; |
|
23 } |
|
24 |
|
25 HierarchicalClustering.prototype = { |
|
26 /** |
|
27 * Run the hierarchical clustering algorithm on the given items to produce |
|
28 * a final set of clusters. Uses the parameters set in the constructor. |
|
29 * |
|
30 * @param items |
|
31 * An array of "things" to cluster - this is the domain-specific |
|
32 * collection you're trying to cluster (colors, points, etc.) |
|
33 * @param snapshotGap |
|
34 * How many iterations of the clustering algorithm to wait between |
|
35 * calling the snapshotCallback |
|
36 * @param snapshotCallback |
|
37 * If provided, will be called as clusters are merged to let you view |
|
38 * the progress of the algorithm. Passed the current array of |
|
39 * clusters, cached distances, and cached closest clusters. |
|
40 * |
|
41 * @return An array of merged clusters. The represented item can be |
|
42 * found in the "item" property of the cluster. |
|
43 */ |
|
44 cluster: function HC_cluster(items, snapshotGap, snapshotCallback) { |
|
45 // array of all remaining clusters |
|
46 let clusters = []; |
|
47 // 2D matrix of distances between each pair of clusters, indexed by key |
|
48 let distances = []; |
|
49 // closest cluster key for each cluster, indexed by key |
|
50 let neighbors = []; |
|
51 // an array of all clusters, but indexed by key |
|
52 let clustersByKey = []; |
|
53 |
|
54 // set up clusters from the initial items array |
|
55 for (let index = 0; index < items.length; index++) { |
|
56 let cluster = { |
|
57 // the item this cluster represents |
|
58 item: items[index], |
|
59 // a unique key for this cluster, stays constant unless merged itself |
|
60 key: index, |
|
61 // index of cluster in clusters array, can change during any merge |
|
62 index: index, |
|
63 // how many clusters have been merged into this one |
|
64 size: 1 |
|
65 }; |
|
66 clusters[index] = cluster; |
|
67 clustersByKey[index] = cluster; |
|
68 distances[index] = []; |
|
69 neighbors[index] = 0; |
|
70 } |
|
71 |
|
72 // initialize distance matrix and cached neighbors |
|
73 for (let i = 0; i < clusters.length; i++) { |
|
74 for (let j = 0; j <= i; j++) { |
|
75 var dist = (i == j) ? Infinity : |
|
76 this.distance(clusters[i].item, clusters[j].item); |
|
77 distances[i][j] = dist; |
|
78 distances[j][i] = dist; |
|
79 |
|
80 if (dist < distances[i][neighbors[i]]) { |
|
81 neighbors[i] = j; |
|
82 } |
|
83 } |
|
84 } |
|
85 |
|
86 // merge the next two closest clusters until none of them are close enough |
|
87 let next = null, i = 0; |
|
88 for (; next = this.closestClusters(clusters, distances, neighbors); i++) { |
|
89 if (snapshotCallback && (i % snapshotGap) == 0) { |
|
90 snapshotCallback(clusters); |
|
91 } |
|
92 this.mergeClusters(clusters, distances, neighbors, clustersByKey, |
|
93 clustersByKey[next[0]], clustersByKey[next[1]]); |
|
94 } |
|
95 return clusters; |
|
96 }, |
|
97 |
|
98 /** |
|
99 * Once we decide to merge two clusters in the cluster method, actually |
|
100 * merge them. Alters the given state of the algorithm. |
|
101 * |
|
102 * @param clusters |
|
103 * The array of all remaining clusters |
|
104 * @param distances |
|
105 * Cached distances between pairs of clusters |
|
106 * @param neighbors |
|
107 * Cached closest clusters |
|
108 * @param clustersByKey |
|
109 * Array of all clusters, indexed by key |
|
110 * @param cluster1 |
|
111 * First cluster to merge |
|
112 * @param cluster2 |
|
113 * Second cluster to merge |
|
114 */ |
|
115 mergeClusters: function HC_mergeClus(clusters, distances, neighbors, |
|
116 clustersByKey, cluster1, cluster2) { |
|
117 let merged = { item: this.merge(cluster1.item, cluster2.item), |
|
118 left: cluster1, |
|
119 right: cluster2, |
|
120 key: cluster1.key, |
|
121 size: cluster1.size + cluster2.size }; |
|
122 |
|
123 clusters[cluster1.index] = merged; |
|
124 clusters.splice(cluster2.index, 1); |
|
125 clustersByKey[cluster1.key] = merged; |
|
126 |
|
127 // update distances with new merged cluster |
|
128 for (let i = 0; i < clusters.length; i++) { |
|
129 var ci = clusters[i]; |
|
130 var dist; |
|
131 if (cluster1.key == ci.key) { |
|
132 dist = Infinity; |
|
133 } else if (this.merge == clusterlib.single_linkage) { |
|
134 dist = distances[cluster1.key][ci.key]; |
|
135 if (distances[cluster1.key][ci.key] > |
|
136 distances[cluster2.key][ci.key]) { |
|
137 dist = distances[cluster2.key][ci.key]; |
|
138 } |
|
139 } else if (this.merge == clusterlib.complete_linkage) { |
|
140 dist = distances[cluster1.key][ci.key]; |
|
141 if (distances[cluster1.key][ci.key] < |
|
142 distances[cluster2.key][ci.key]) { |
|
143 dist = distances[cluster2.key][ci.key]; |
|
144 } |
|
145 } else if (this.merge == clusterlib.average_linkage) { |
|
146 dist = (distances[cluster1.key][ci.key] * cluster1.size |
|
147 + distances[cluster2.key][ci.key] * cluster2.size) |
|
148 / (cluster1.size + cluster2.size); |
|
149 } else { |
|
150 dist = this.distance(ci.item, cluster1.item); |
|
151 } |
|
152 |
|
153 distances[cluster1.key][ci.key] = distances[ci.key][cluster1.key] |
|
154 = dist; |
|
155 } |
|
156 |
|
157 // update cached neighbors |
|
158 for (let i = 0; i < clusters.length; i++) { |
|
159 var key1 = clusters[i].key; |
|
160 if (neighbors[key1] == cluster1.key || |
|
161 neighbors[key1] == cluster2.key) { |
|
162 let minKey = key1; |
|
163 for (let j = 0; j < clusters.length; j++) { |
|
164 var key2 = clusters[j].key; |
|
165 if (distances[key1][key2] < distances[key1][minKey]) { |
|
166 minKey = key2; |
|
167 } |
|
168 } |
|
169 neighbors[key1] = minKey; |
|
170 } |
|
171 clusters[i].index = i; |
|
172 } |
|
173 }, |
|
174 |
|
175 /** |
|
176 * Given the current state of the algorithm, return the keys of the two |
|
177 * clusters that are closest to each other so we know which ones to merge |
|
178 * next. |
|
179 * |
|
180 * @param clusters |
|
181 * The array of all remaining clusters |
|
182 * @param distances |
|
183 * Cached distances between pairs of clusters |
|
184 * @param neighbors |
|
185 * Cached closest clusters |
|
186 * |
|
187 * @return An array of two keys of clusters to merge, or null if there are |
|
188 * no more clusters close enough to merge |
|
189 */ |
|
190 closestClusters: function HC_closestClus(clusters, distances, neighbors) { |
|
191 let minKey = 0, minDist = Infinity; |
|
192 for (let i = 0; i < clusters.length; i++) { |
|
193 var key = clusters[i].key; |
|
194 if (distances[key][neighbors[key]] < minDist) { |
|
195 minKey = key; |
|
196 minDist = distances[key][neighbors[key]]; |
|
197 } |
|
198 } |
|
199 if (minDist < this.threshold) { |
|
200 return [minKey, neighbors[minKey]]; |
|
201 } |
|
202 return null; |
|
203 } |
|
204 }; |
|
205 |
|
206 let clusterlib = { |
|
207 hcluster: function hcluster(items, distance, merge, threshold, snapshotGap, |
|
208 snapshotCallback) { |
|
209 return (new HierarchicalClustering(distance, merge, threshold)) |
|
210 .cluster(items, snapshotGap, snapshotCallback); |
|
211 }, |
|
212 |
|
213 single_linkage: function single_linkage(cluster1, cluster2) { |
|
214 return cluster1; |
|
215 }, |
|
216 |
|
217 complete_linkage: function complete_linkage(cluster1, cluster2) { |
|
218 return cluster1; |
|
219 }, |
|
220 |
|
221 average_linkage: function average_linkage(cluster1, cluster2) { |
|
222 return cluster1; |
|
223 }, |
|
224 |
|
225 euclidean_distance: function euclidean_distance(v1, v2) { |
|
226 let total = 0; |
|
227 for (let i = 0; i < v1.length; i++) { |
|
228 total += Math.pow(v2[i] - v1[i], 2); |
|
229 } |
|
230 return Math.sqrt(total); |
|
231 }, |
|
232 |
|
233 manhattan_distance: function manhattan_distance(v1, v2) { |
|
234 let total = 0; |
|
235 for (let i = 0; i < v1.length; i++) { |
|
236 total += Math.abs(v2[i] - v1[i]); |
|
237 } |
|
238 return total; |
|
239 }, |
|
240 |
|
241 max_distance: function max_distance(v1, v2) { |
|
242 let max = 0; |
|
243 for (let i = 0; i < v1.length; i++) { |
|
244 max = Math.max(max, Math.abs(v2[i] - v1[i])); |
|
245 } |
|
246 return max; |
|
247 } |
|
248 }; |