mirror of
https://github.moeyy.xyz/https://github.com/trekhleb/javascript-algorithms.git
synced 2024-12-25 22:46:20 +08:00
Added kmeans clustering (#595)
* added kmeans * added kmeans * added kmeans Co-authored-by: Oleksii Trekhleb <trehleb@gmail.com>
This commit is contained in:
parent
90ec1b76d0
commit
b7cd425ce9
@ -147,6 +147,7 @@ a set of rules that precisely define a sequence of operations.
|
||||
* **Machine Learning**
|
||||
* `B` [NanoNeuron](https://github.com/trekhleb/nano-neuron) - 7 simple JS functions that illustrate how machines can actually learn (forward/backward propagation)
|
||||
* `B` [k-NN](src/algorithms/ml/knn) - k-nearest neighbors classification algorithm
|
||||
* `B` [k-Means](src/algorithms/ml/kmeans) - k-Means clustering algorithm
|
||||
* **Uncategorized**
|
||||
* `B` [Tower of Hanoi](src/algorithms/uncategorized/hanoi-tower)
|
||||
* `B` [Square Matrix Rotation](src/algorithms/uncategorized/square-matrix-rotation) - in-place algorithm
|
||||
|
32
src/algorithms/ml/kmeans/README.md
Normal file
32
src/algorithms/ml/kmeans/README.md
Normal file
@ -0,0 +1,32 @@
|
||||
# k-Means Algorithm
|
||||
|
||||
The **k-Means algorithm** is an unsupervised Machine Learning algorithm. It's a clustering algorithm, which groups the sample data on the basis of similarity between dimensions of vectors.
|
||||
|
||||
In k-Means clustering, the output is a set of classes assigned to each vector. Each cluster location is continuously optimized in order to get the accurate locations of each cluster such that they represent each group clearly.
|
||||
|
||||
The idea is to calculate the similarity between cluster location and data vectors, and reassign clusters based on it. [Euclidean distance](https://en.wikipedia.org/wiki/Euclidean_distance) is used mostly for this task.
|
||||
|
||||
![Euclidean distance between two points](https://upload.wikimedia.org/wikipedia/commons/5/55/Euclidean_distance_2d.svg)
|
||||
|
||||
_Image source: [Wikipedia](https://en.wikipedia.org/wiki/Euclidean_distance)_
|
||||
|
||||
The algorithm is as follows:
|
||||
|
||||
1. Check for errors like invalid/inconsistent data
|
||||
2. Initialize the k cluster locations with initial/random k points
|
||||
3. Calculate the distance of each data point from each cluster
|
||||
4. Assign the cluster label of each data point equal to that of the cluster at its minimum distance
|
||||
5. Calculate the centroid of each cluster based on the data points it contains
|
||||
6. Repeat steps 3-5 for as long as the centroid locations keep changing
|
||||
|
||||
Here is a visualization of k-Means clustering for better understanding:
|
||||
|
||||
![k-Means Visualization](https://upload.wikimedia.org/wikipedia/commons/e/ea/K-means_convergence.gif)
|
||||
|
||||
_Image source: [Wikipedia](https://en.wikipedia.org/wiki/K-means_clustering)_
|
||||
|
||||
The centroids are moving continuously in order to create better distinction between the different sets of data points. As we can see, after a few iterations, the difference in centroids is quite low between iterations. For example, between iterations `13` and `14` the difference is quite small because there the optimizer is tuning boundary cases.
|
||||
|
||||
## References
|
||||
|
||||
- [k-Means clustering algorithm on Wikipedia](https://en.wikipedia.org/wiki/K-means_clustering)
|
36
src/algorithms/ml/kmeans/__test__/kmeans.test.js
Normal file
36
src/algorithms/ml/kmeans/__test__/kmeans.test.js
Normal file
@ -0,0 +1,36 @@
|
||||
import kMeans from '../kmeans';
|
||||
|
||||
// Test suite for the k-Means clustering implementation.
// Each clustering case passes a data set and a cluster count `k`, and expects
// one cluster index per input point, in input order.
describe('kMeans', () => {
  it('should throw an error on invalid data', () => {
    // Calling without a data set must be rejected.
    expect(() => {
      kMeans();
    }).toThrowError('Either dataSet or labels or toClassify were not set');
  });

  it('should throw an error on inconsistent data', () => {
    // The second vector has fewer dimensions than the first one.
    expect(() => {
      kMeans([[1, 2], [1]], 2);
    }).toThrowError('Inconsistent vector lengths');
  });

  it('should assign each point to one of two clusters', () => {
    const dataSet = [[1, 1], [6, 2], [3, 3], [4, 5], [9, 2], [2, 4], [8, 7]];
    const k = 2;
    const expectedCluster = [0, 1, 0, 1, 1, 0, 1];
    expect(kMeans(dataSet, k)).toEqual(expectedCluster);
  });

  it('should put every point into its own cluster when k equals the number of points', () => {
    const dataSet = [[0, 0], [1, 1], [2, 2]];
    const k = 3;
    const expectedCluster = [0, 1, 2];
    expect(kMeans(dataSet, k)).toEqual(expectedCluster);
  });

  it('should find the clusters in 3D space', () => {
    const dataSet = [[0, 0, 0], [0, 1, 0], [2, 0, 2]];
    const k = 2;
    const expectedCluster = [1, 1, 0];
    expect(kMeans(dataSet, k)).toEqual(expectedCluster);
  });
});
|
98
src/algorithms/ml/kmeans/kmeans.js
Normal file
98
src/algorithms/ml/kmeans/kmeans.js
Normal file
@ -0,0 +1,98 @@
|
||||
/**
 * Calculates the euclidean distance between 2 vectors.
 *
 * The exact distance is returned without rounding, so that distances which
 * differ only by a small amount still compare correctly.
 *
 * @param {number[]} x1
 * @param {number[]} x2
 * @returns {number}
 * @throws {Error} when the two vectors have different lengths.
 */
function euclideanDistance(x1, x2) {
  // Vectors of different dimensionality cannot be compared.
  if (x1.length !== x2.length) {
    throw new Error('Inconsistent vector lengths');
  }
  // Sum the squared per-dimension differences, then take the square root.
  let squaresTotal = 0;
  for (let i = 0; i < x1.length; i += 1) {
    squaresTotal += (x1[i] - x2[i]) ** 2;
  }
  return Math.sqrt(squaresTotal);
}
|
||||
/**
 * Groups the points of a data set into k clusters using the k-Means algorithm.
 *
 * The first k points seed the cluster centers. Every point is then assigned to
 * its nearest center and each center is moved to the mean of the points that
 * were assigned to it. This repeats until no assignment changes anymore.
 * The passed data set is only read, never modified.
 *
 * @param {number[][]} dataSetm - array of data points, i.e. [[0, 1], [3, 4], [5, 7]]
 * @param {number} k - number of clusters to build (integer from 1 to the number of data points)
 * @return {number[]} - the cluster index (0..k-1) of each data point, in input order
 * @throws {Error} when the data set is missing, when k is not a valid cluster
 *   count for the data set, or when the vectors have inconsistent lengths
 */
export default function kMeans(
  dataSetm,
  k = 1,
) {
  const dataSet = dataSetm;
  if (!dataSet) {
    // The wording is kept verbatim because the existing test suite matches on it.
    throw new Error('Either dataSet or labels or toClassify were not set');
  }
  if (!Number.isInteger(k) || k < 1 || k > dataSet.length) {
    throw new Error('k must be a positive integer not greater than the number of data points');
  }

  // All vectors must share the dimensionality of the first one.
  const nDim = dataSet[0].length;
  if (dataSet.some((point) => point.length !== nDim)) {
    throw new Error('Inconsistent vector lengths');
  }

  // Seed the cluster centers with copies of the first k points.
  const clusterCenters = [];
  for (let i = 0; i < k; i += 1) {
    clusterCenters.push(Array.from(dataSet[i]));
  }

  // Cluster labels live in their own array so the caller's vectors stay untouched.
  // -1 marks "not assigned yet", which forces at least one full iteration.
  const assignments = Array(dataSet.length).fill(-1);

  // Continue optimization till convergence: stop once no point changes its cluster.
  let hasChanged = true;
  while (hasChanged) {
    hasChanged = false;

    // Assign every point to its nearest cluster center.
    for (let i = 0; i < dataSet.length; i += 1) {
      let nearestCluster = 0;
      let nearestDistance = Infinity;
      for (let n = 0; n < k; n += 1) {
        const distance = euclideanDistance(clusterCenters[n], dataSet[i]);
        // Strict comparison: on equal distances the lowest cluster index wins.
        if (distance < nearestDistance) {
          nearestDistance = distance;
          nearestCluster = n;
        }
      }
      if (assignments[i] !== nearestCluster) {
        assignments[i] = nearestCluster;
        hasChanged = true;
      }
    }

    // Move every cluster center to the mean of the points assigned to it.
    for (let c = 0; c < k; c += 1) {
      const sums = Array(nDim).fill(0);
      let memberCount = 0;
      for (let i = 0; i < dataSet.length; i += 1) {
        if (assignments[i] === c) {
          memberCount += 1;
          for (let n = 0; n < nDim; n += 1) {
            sums[n] += dataSet[i][n];
          }
        }
      }
      // A cluster without points keeps its previous center. Dividing by zero
      // would turn the center into NaN, after which no distance comparison
      // could succeed and the labels would never settle.
      if (memberCount > 0) {
        clusterCenters[c] = sums.map((total) => total / memberCount);
      }
    }
  }

  // Return the clusters assigned.
  return assignments;
}
|
Loading…
Reference in New Issue
Block a user