diff --git a/lib/node_modules/@stdlib/ml/cluster/strided/dkmeansld/lib/closest_centroid.js b/lib/node_modules/@stdlib/ml/cluster/strided/dkmeansld/lib/closest_centroid.js new file mode 100644 index 000000000000..1e0cc282cad3 --- /dev/null +++ b/lib/node_modules/@stdlib/ml/cluster/strided/dkmeansld/lib/closest_centroid.js @@ -0,0 +1,62 @@ +/** +* @license Apache-2.0 +* +* Copyright (c) 2026 The Stdlib Authors. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +'use strict'; + +// MODULES // + + +// MAIN // + +/** +* Find closest centroid. +* @private +* @param {Function} dist - distance method. +* @param {PositiveInteger} N - number of features. +* @param {PositiveInteger} k - number of clusters. +* @param {Float64Array} X - input strided matrix. +* @param {integer} sx - stride length of X. +* @param {integer} ox - starting index of X. +* @param {Float64Array} c - strided array centroid locations. +* @param {integer} sc1 - stride of the third dimension. +* @param {integer} sc2 - stride of second dimension. +* @param {integer} oc - initial index of centroids. +* @returns {NonNegativeInteger} closest centroid. 
/**
* Returns the index of the centroid nearest to a single sample.
*
* ## Notes
*
* -   Ties are resolved in favor of the lowest centroid index, as a candidate only replaces the current best when its distance is strictly smaller.
*
* @private
* @param {Function} dist - strided distance function invoked as `dist( N, X, sx, ox, c, sc2, offset )`
* @param {PositiveInteger} N - number of features
* @param {PositiveInteger} k - number of centroids to compare against
* @param {Float64Array} X - array containing the sample
* @param {integer} sx - stride length of `X` forwarded to `dist`
* @param {integer} ox - index offset of the sample in `X`
* @param {Float64Array} c - array containing the centroids
* @param {integer} sc1 - index increment between consecutive centroids in `c`
* @param {integer} sc2 - stride length of `c` forwarded to `dist`
* @param {integer} oc - index offset of the first centroid in `c`
* @returns {NonNegativeInteger} index of the closest centroid
*/
function closestCentroid( dist, N, k, X, sx, ox, c, sc1, sc2, oc ) {
	var minDist;
	var offset;
	var idx;
	var cur;
	var j;

	// Seed the search with the first centroid:
	offset = oc;
	minDist = dist( N, X, sx, ox, c, sc2, offset );
	idx = 0;

	// Visit the remaining centroids in order, stepping `sc1` elements at a time:
	for ( j = 1; j < k; j++ ) {
		offset += sc1;
		cur = dist( N, X, sx, ox, c, sc2, offset );
		if ( cur < minDist ) {
			minDist = cur;
			idx = j;
		}
	}
	return idx;
}
/**
* Computes the inertia of a labeling.
*
* ## Notes
*
* -   The result is the sum, over all samples, of the value returned by `drss` for a sample and the centroid selected by the sample's label.
* -   Labels are read from consecutive elements of `labels`, starting at index `0`.
*
* @private
* @param {PositiveInteger} M - number of samples
* @param {PositiveInteger} N - number of features
* @param {Float64Array} X - array containing the samples
* @param {integer} strideX1 - index increment between consecutive samples in `X`
* @param {integer} strideX2 - stride length of `X` forwarded to `drss`
* @param {integer} offsetX - index offset of the first sample in `X`
* @param {Float64Array} centroids - array containing the centroids
* @param {integer} strideC1 - index increment between consecutive centroids in `centroids`
* @param {integer} strideC2 - stride length of `centroids` forwarded to `drss`
* @param {integer} offsetC - index offset of the first centroid in `centroids`
* @param {Int32Array} labels - centroid index assigned to each sample
* @returns {number} inertia
*/
function computeInertia( M, N, X, strideX1, strideX2, offsetX, centroids, strideC1, strideC2, offsetC, labels ) { // eslint-disable-line max-len, max-params
	var total;
	var row;
	var s;

	total = 0.0;
	row = offsetX;
	for ( s = 0; s < M; s++ ) {
		// Locate the centroid assigned to sample `s` and accumulate its contribution:
		total += drss( N, X, strideX2, row, centroids, strideC2, offsetC + ( labels[ s ] * strideC1 ) ); // eslint-disable-line max-len
		row += strideX1;
	}
	return total;
}
+* +* @module @stdlib/ml/cluster/strided/dkmeansld +* +* @example +* var Float64Array = require( '@stdlib/array/float64' ); +* var ndarray = require( '@stdlib/ndarray/ctor' ); +* var kmeans = require( '@stdlib/ml/cluster/strided/dkmeansld' ); +* +*/ + +// MAIN // + +var main = require( './main.js' ); + + +// EXPORTS // + +module.exports = main; diff --git a/lib/node_modules/@stdlib/ml/cluster/strided/dkmeansld/lib/main.js b/lib/node_modules/@stdlib/ml/cluster/strided/dkmeansld/lib/main.js new file mode 100644 index 000000000000..3a9915150b91 --- /dev/null +++ b/lib/node_modules/@stdlib/ml/cluster/strided/dkmeansld/lib/main.js @@ -0,0 +1,239 @@ +/** +* @license Apache-2.0 +* +* Copyright (c) 2026 The Stdlib Authors. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +'use strict'; + +// MODULES // + +var Int32Array = require( '@stdlib/array/int32' ); +var Float64Array = require( '@stdlib/array/float64' ); +var isEqualArray = require( '@stdlib/assert/is-equal-array' ); +var gfill = require( '@stdlib/blas/ext/base/gfill' ).ndarray; +var gcopy = require( '@stdlib/blas/base/gcopy' ).ndarray; +var dlacpy = require( '@stdlib/lapack/base/dlacpy' ).ndarray; +var isnan = require( '@stdlib/assert/is-nan' ); +var deuclidean = require( '@stdlib/stats/strided/distances/deuclidean' ).ndarray; +var dcosine = require( '@stdlib/stats/strided/distances/dcosine-distance' ).ndarray; +var dcityblock = require( '@stdlib/stats/strided/distances/dcityblock' ).ndarray; +var closestCentroid = require( './closest_centroid.js' ); +var computeInertia = require( './compute_inertia.js' ); + + +// VARIABLES // + +var METHOD = 'lloyd'; + + +// MAIN // + +/** +* Compute fitted cluster results using Lloyd algorithm. +* @param {PositiveInteger} M - number of samples +* @param {PositiveInteger} N - number of features +* @param {PositiveInteger} k - number of clusters +* @param {NonNegativeInteger} replicates - number of times to repeat clustering with different centroids. +* @param {String} metric - distance metric +* @param {NonNegativeInteger} maxIter - maximum number of iterations. +* @param {integer} tol - relative tolerance before declaring convergence. +* @param {Float64Array} X - input strided matrix +* @param {integer} strideX1 - stride of the first dimension. +* @param {integer} strideX2 - stride of the second dimension. +* @param {integer} offsetX - starting index. +* @param {Float64Array} init - strided array containing initial centroid locations. +* @param {integer} strideInit1 - stride of first dimension. +* @param {integer} strideInit2 - stride of second dimension. +* @param {integer} strideInit3 - stride of the third dimension. +* @param {integer} offsetInit - initial index. 
/**
* Computes fitted cluster results using Lloyd's algorithm.
*
* ## Notes
*
* -   For each replicate, the function alternates between assigning every sample to its closest centroid and recomputing each centroid as the mean of its assigned samples. A replicate stops once the summed squared centroid shift drops below `tol`, the labels stop changing, or `maxIter` iterations have been performed.
* -   A centroid having no assigned samples keeps its previous location.
* -   The replicate having the smallest inertia is written to `bestCentroids` and `bestLabels`, and summary fields are assigned to `out`.
* -   The `init` array is overwritten: after a replicate has run, its slice of `init` holds that replicate's most recent centroids.
* -   The `statistics`, `strideS1`, `strideS2`, and `offsetS` arguments are accepted but are currently neither read nor written.
* -   The `metric` argument is not validated here; a value other than `'euclidean'`, `'cosine'`, or `'cityblock'` leaves the distance function unset.
*
* @param {PositiveInteger} M - number of samples
* @param {PositiveInteger} N - number of features
* @param {PositiveInteger} k - number of clusters
* @param {NonNegativeInteger} replicates - number of times to repeat clustering with different initial centroids
* @param {string} metric - distance metric (`'euclidean'`, `'cosine'`, or `'cityblock'`)
* @param {NonNegativeInteger} maxIter - maximum number of iterations per replicate
* @param {number} tol - threshold on the summed squared centroid shift below which a replicate is considered converged
* @param {Float64Array} X - array containing the samples
* @param {integer} strideX1 - index increment between consecutive samples in `X`
* @param {integer} strideX2 - index increment between consecutive features in `X`
* @param {integer} offsetX - index offset of the first sample in `X`
* @param {Float64Array} init - array containing the initial centroid locations for every replicate (overwritten)
* @param {integer} strideInit1 - index increment between consecutive replicates in `init`
* @param {integer} strideInit2 - index increment between consecutive centroids in `init`
* @param {integer} strideInit3 - index increment between consecutive features in `init`
* @param {integer} offsetInit - index offset of the first replicate in `init`
* @param {Float64Array} bestCentroids - output array for the centroid locations of the best replicate
* @param {integer} strideC1 - stride length of the first dimension of `bestCentroids`
* @param {integer} strideC2 - stride length of the second dimension of `bestCentroids`
* @param {integer} offsetC - starting index for `bestCentroids`
* @param {Float64Array} statistics - array for cluster statistics (currently unused)
* @param {integer} strideS1 - stride length of the first dimension of `statistics` (currently unused)
* @param {integer} strideS2 - stride length of the second dimension of `statistics` (currently unused)
* @param {integer} offsetS - starting index for `statistics` (currently unused)
* @param {Int32Array} bestLabels - output array for the sample labels of the best replicate
* @param {integer} strideL - stride length of `bestLabels`
* @param {integer} offsetL - starting index for `bestLabels`
* @param {Object} out - object to which result fields are assigned
* @returns {void}
*/
function dkmeansld( M, N, k, replicates, metric, maxIter, tol, X, strideX1, strideX2, offsetX, init, strideInit1, strideInit2, strideInit3, offsetInit, bestCentroids, strideC1, strideC2, offsetC, statistics, strideS1, strideS2, offsetS, bestLabels, strideL, offsetL, out ) { // eslint-disable-line max-len, max-params, max-statements
	var centroidsNew;
	var bestInertia;
	var strictConv;
	var labelsOld;
	var centroids;
	var bestIter;
	var inertia;
	var labels;
	var counts;
	var bestR;
	var shift;
	var dist;
	var best;
	var iter;
	var ocj;
	var oij;
	var oxj;
	var si;
	var oc;
	var ox;
	var oi;
	var r;
	var i;
	var j;
	var c;
	var d;

	if ( metric === 'euclidean' ) {
		dist = deuclidean; // TODO: change it to dsquared-euclidean once implemented
	} else if ( metric === 'cosine' ) {
		dist = dcosine; // TODO: change it to dsquared-cosine once implemented
	} else if ( metric === 'cityblock' ) {
		dist = dcityblock;
	}

	// Scratch buffers shared by all replicates (contiguous, zero offset):
	labelsOld = new Int32Array( M );
	labels = new Int32Array( M );
	counts = new Float64Array( k ); // TODO: decide whether to support sample weights (as sklearn does); a floating-point buffer already accommodates weighted counts
	centroidsNew = new Float64Array( k*N );
	centroids = new Float64Array( k*N );

	oi = offsetInit; // index offset of the current replicate's centroids in `init`
	bestInertia = NaN;
	for ( r = 0; r < replicates; r++ ) {
		gfill( M, -1, labels, 1, 0 );
		gfill( M, -1, labelsOld, 1, 0 );

		// TODO: initialize centroids (`initCentroids`)

		// Seed the working centroids from this replicate's initial locations so that they are well-defined even when no iteration runs (i.e., when `maxIter` is `0`) and never leak from a previous replicate:
		dlacpy( 'all', k, N, init, strideInit2, strideInit3, oi, centroids, N, 1, 0 );

		strictConv = false;
		for ( iter = 1; iter <= maxIter; iter++ ) {
			gfill( k, 0, counts, 1, 0 );
			gfill( k*N, 0, centroidsNew, 1, 0 );

			// Assignment step: label each sample with its closest centroid...
			ox = offsetX;
			for ( i = 0; i < M; i++ ) {
				best = closestCentroid( dist, N, k, X, strideX2, ox, init, strideInit2, strideInit3, oi ); // eslint-disable-line max-len

				labels[ i ] = best;
				counts[ best ] += 1;
				ox += strideX1; // move to the next sample
			}

			// Accumulate the per-cluster feature sums...
			ox = offsetX;
			for ( i = 0; i < M; i++ ) {
				c = labels[ i ];
				oc = c * N; // move to starting position of respective centroid in buf
				oxj = ox;
				for ( j = 0; j < N; j++ ) {
					centroidsNew[ oc ] += X[ oxj ]; // accumulate sum of each sample in the cluster
					oc += 1; // move to the next feature of the respective centroid
					oxj += strideX2; // move to the next feature of the respective sample
				}
				ox += strideX1;
			}

			// Update step: convert sums to means, reading fallback locations from the CURRENT replicate's slice of `init`...
			oc = 0;
			si = oi;
			for ( c = 0; c < k; c++ ) {
				if ( counts[ c ] > 0 ) {
					ocj = oc;
					for ( j = 0; j < N; j++ ) {
						centroidsNew[ ocj ] /= counts[ c ];
						ocj += 1; // move to the next feature of the respective centroid
					}
				} else { // if empty cluster, retain the old features
					ocj = oc;
					oij = si;
					for ( j = 0; j < N; j++ ) {
						centroidsNew[ ocj ] = init[ oij ];
						ocj += 1; // move to the next feature of the respective centroid
						oij += strideInit3; // move to the next feature of the respective centroid's old feature set
					}
				}
				oc += N; // move to the next centroid
				si += strideInit2; // move to the next centroid's old feature set
			}

			// Measure how far the centroids moved and store the new locations back into the CURRENT replicate's slice of `init` for the next assignment step...
			oc = 0;
			shift = 0.0;
			si = oi;
			for ( c = 0; c < k; c++ ) {
				oij = si;
				ocj = oc;
				for ( j = 0; j < N; j++ ) {
					d = centroidsNew[ ocj ] - init[ oij ];
					shift += d * d;
					init[ oij ] = centroidsNew[ ocj ]; // update init for next iteration
					ocj += 1;
					oij += strideInit3;
				}
				oc += N;
				si += strideInit2;
			}
			dlacpy( 'all', k, N, centroidsNew, N, 1, 0, centroids, N, 1, 0 );

			// We know stride for labels is `1` and offset is `0`, so this method for similarity checking works
			if ( shift < tol || isEqualArray( labels, labelsOld ) ) {
				strictConv = true;
				break;
			}
			gcopy( M, labels, 1, 0, labelsOld, 1, 0 );
		}

		// If the replicate did not converge, re-run the assignment step so that the labels are consistent with the final centroids...
		if ( !strictConv ) {
			ox = offsetX;
			for ( i = 0; i < M; i++ ) {
				best = closestCentroid( dist, N, k, X, strideX2, ox, init, strideInit2, strideInit3, oi ); // eslint-disable-line max-len
				labels[ i ] = best;
				ox += strideX1; // move to the next sample
			}
		}

		inertia = computeInertia( M, N, X, strideX1, strideX2, offsetX, centroids, N, 1, 0, labels ); // eslint-disable-line max-len
		if ( isnan( bestInertia ) || inertia < bestInertia ) {
			gcopy( M, labels, 1, 0, bestLabels, strideL, offsetL );
			dlacpy( 'all', k, N, centroids, N, 1, 0, bestCentroids, strideC1, strideC2, offsetC );
			bestInertia = inertia;
			bestR = r;

			// The loop counter overshoots by one when the iteration limit is reached:
			if ( iter > maxIter ) {
				bestIter = maxIter;
			} else {
				bestIter = iter;
			}
		}
		oi += strideInit1; // move to the next replicate
	}

	out.replicates = replicates;
	out.replicate = bestR;
	out.metric = metric;
	out.iterations = bestIter;
	out.method = METHOD;
	out.inertia = bestInertia;
	out.k = k;
	out.samples = M;
	out.features = N;
}