diff --git a/README.md b/README.md index d0490f8..070b0bd 100644 --- a/README.md +++ b/README.md @@ -134,7 +134,7 @@ pyplot.scatter(centroids[:, 0], centroids[:, 1], c="white", s=150) You should see something like this: ![Clustered dots](img/cls_euclidean.png) -#### K-means, angular (cosine) distance +#### K-means, angular (cosine) distance + average ```python import numpy @@ -146,7 +146,9 @@ arr = numpy.empty((10000, 2), dtype=numpy.float32) angs = numpy.random.rand(10000) * 2 * numpy.pi for i in range(10000): arr[i] = numpy.sin(angs[i]), numpy.cos(angs[i]) -centroids, assignments = kmeans_cuda(arr, 4, metric="cos", verbosity=1, seed=3) +centroids, assignments, avg_distance = kmeans_cuda( + arr, 4, metric="cos", verbosity=1, seed=3, average_distance=True) +print("Average distance between centroids and members:", avg_distance) print(centroids) pyplot.scatter(arr[:, 0], arr[:, 1], c=assignments) pyplot.scatter(centroids[:, 0], centroids[:, 1], c="white", s=150) @@ -192,7 +194,8 @@ Python API ---------- ```python def kmeans_cuda(samples, clusters, tolerance=0.0, init="k-means++", - yinyang_t=0.1, metric="L2", seed=time(), device=0, verbosity=0) + yinyang_t=0.1, metric="L2", average_distance=False, + seed=time(), device=0, verbosity=0) ``` **samples** numpy array of shape \[number of samples, number of features\] or tuple(raw device pointer (int), device index (int), shape (tuple(number of samples, number of features\[, fp16x2 marker\]))). @@ -212,9 +215,14 @@ def kmeans_cuda(samples, clusters, tolerance=0.0, init="k-means++", **yinyang_t** float, the relative number of cluster groups, usually 0.1. **metric** str, the name of the distance metric to use. The default is Euclidean (L2), - can be changed to "cos" to behave as Spherical K-means with the - angular distance. Please note that samples *must* be normalized in that - case. + can be changed to "cos" to behave as Spherical K-means with the + angular distance. Please note that samples *must* be normalized in that + case. + +**average_distance** boolean, the value indicating whether to calculate + the average distance between cluster elements and + the corresponding centroids. Useful for finding + the best K. Returned as the third tuple element. **seed** integer, random generator seed for reproducible results. @@ -276,7 +284,7 @@ KMCUDAResult kmeans_cuda( KMCUDADistanceMetric metric, uint32_t samples_size, uint16_t features_size, uint32_t clusters_size, uint32_t seed, uint32_t device, int32_t device_ptrs, int32_t fp16x2, int32_t verbosity, const float *samples, float *centroids, - uint32_t *assignments) + uint32_t *assignments, float *average_distance) ``` **init** specifies the centroids initialization method: k-means++, random or import (in the latter case, **centroids** is read). @@ -322,6 +330,9 @@ KMCUDAResult kmeans_cuda( **assignments** output array of cluster indices for each sample of size samples_size x 1. +**average_distance** output mean distance between cluster elements and + the corresponding centroids. If nullptr, not calculated. + Returns KMCUDAResult (see `kmcuda.h`); ```C diff --git a/kmcuda.cc b/kmcuda.cc index 0412474..1642337 100644 --- a/kmcuda.cc +++ b/kmcuda.cc @@ -317,11 +317,12 @@ KMCUDAResult kmeans_cuda( KMCUDADistanceMetric metric, uint32_t samples_size, uint16_t features_size, uint32_t clusters_size, uint32_t seed, uint32_t device, int32_t device_ptrs, int32_t fp16x2, int32_t verbosity, const float *samples, float *centroids, - uint32_t *assignments) { + uint32_t *assignments, float *average_distance) { DEBUG("arguments: %d %.3f %.2f %d %" PRIu32 " %" PRIu16 " %" PRIu32 " %" - PRIu32 " %" PRIu32 " %d %" PRIi32 " %p %p %p\n", init, tolerance, + PRIu32 " %" PRIu32 " %d %" PRIi32 " %p %p %p %p\n", init, tolerance, yinyang_t, metric, samples_size, features_size, clusters_size, seed, - device, fp16x2, verbosity, samples, centroids, assignments); + device, fp16x2, verbosity, samples, centroids, assignments, + average_distance); RETERR(check_kmeans_args( tolerance, yinyang_t, samples_size, features_size, clusters_size, device, fp16x2, verbosity, samples, centroids, assignments)); @@ -402,7 +403,13 @@ KMCUDAResult kmeans_cuda( metric, devs, fp16x2, verbosity, device_samples, &device_centroids, &device_ccounts, &device_assignments_prev, &device_assignments, &device_assignments_yy, &device_centroids_yy, &device_bounds_yy, &device_drifts_yy, &device_passed_yy), - DEBUG("kmeans_cuda_internal failed: %s\n", CUERRSTR())); + DEBUG("kmeans_cuda_yy failed: %s\n", CUERRSTR())); + if (average_distance) { + RETERR(kmeans_cuda_calc_average_distance( + samples_size, features_size, metric, devs, fp16x2, verbosity, + device_samples, device_centroids, device_assignments, average_distance), + DEBUG("kmeans_cuda_calc_average_distance failed: %s\n", CUERRSTR())); + } #ifdef PROFILE FOR_EACH_DEV(cudaProfilerStop()); #endif diff --git a/kmcuda.h b/kmcuda.h index cc745c8..b4a6c01 100644 --- a/kmcuda.h +++ b/kmcuda.h @@ -49,13 +49,15 @@ extern "C" { /// in row major format. /// @param assignments output array of cluster indices for each sample of size /// samples_size x 1. +/// @param average_distance output mean distance between cluster elements and +/// the corresponding centroids. If nullptr, not calculated. /// @return KMCUDAResult. KMCUDAResult kmeans_cuda( KMCUDAInitMethod init, float tolerance, float yinyang_t, KMCUDADistanceMetric metric, uint32_t samples_size, uint16_t features_size, uint32_t clusters_size, uint32_t seed, uint32_t device, int32_t device_ptrs, int32_t fp16x2, int32_t verbosity, const float *samples, float *centroids, - uint32_t *assignments); + uint32_t *assignments, float *average_distance); /// @brief Calculates K nearest neighbors for every sample using /// the precalculated K-means clusters. diff --git a/kmeans.cu b/kmeans.cu index 2f94698..8aa17ec 100644 --- a/kmeans.cu +++ b/kmeans.cu @@ -45,11 +45,11 @@ __device__ __forceinline__ uint32_t atomicAggInc(uint32_t *ctr) { template __global__ void kmeans_plus_plus( - const uint32_t border, const uint32_t cc, const F *__restrict__ samples, + const uint32_t length, const uint32_t cc, const F *__restrict__ samples, const F *__restrict__ centroids, float *__restrict__ dists, float *__restrict__ dist_sums) { uint32_t sample = blockIdx.x * blockDim.x + threadIdx.x; - if (sample >= border) { + if (sample >= length) { return; } samples += static_cast(sample) * d_features_size; @@ -67,8 +67,8 @@ __global__ void kmeans_plus_plus( } local_dists[threadIdx.x] = dist; uint32_t end = blockDim.x; - if ((blockIdx.x + 1) * blockDim.x > border) { - end = border - blockIdx.x * blockDim.x; + if ((blockIdx.x + 1) * blockDim.x > length) { + end = length - blockIdx.x * blockDim.x; } __syncthreads(); if (threadIdx.x % 16 == 0) { @@ -90,12 +90,12 @@ __global__ void kmeans_plus_plus( template __global__ void kmeans_assign_lloyd_smallc( - const uint32_t border, const F *__restrict__ samples, + const uint32_t length, const F *__restrict__ samples, const F *__restrict__ centroids, uint32_t *__restrict__ assignments_prev, uint32_t * __restrict__ assignments) { using HF = typename HALF::type; uint32_t sample = blockIdx.x * blockDim.x + threadIdx.x; - if (sample >= border) { + if (sample >= length) { return; } samples += static_cast(sample) * d_features_size; @@ -108,7 +108,7 @@ __global__ void kmeans_assign_lloyd_smallc( (d_features_size + 1); F *csqrs = shared_centroids + cstep * d_features_size; const uint32_t size_each = cstep / - min(blockDim.x, border - blockIdx.x * blockDim.x) + 1; + min(blockDim.x, length - blockIdx.x * blockDim.x) + 1; bool insane = _neq(samples[0], samples[0]); F ssqr = _const(0); const uint32_t soffset = threadIdx.x * d_features_size; @@ -168,12 +168,12 @@ __global__ void kmeans_assign_lloyd_smallc( template __global__ void kmeans_assign_lloyd( - const uint32_t border, const F *__restrict__ samples, + const uint32_t length, const F *__restrict__ samples, const F *__restrict__ centroids, uint32_t *__restrict__ assignments_prev, uint32_t * __restrict__ assignments) { using HF = typename HALF::type; uint32_t sample = blockIdx.x * blockDim.x + threadIdx.x; - if (sample >= border) { + if (sample >= length) { return; } samples += static_cast(sample) * d_features_size; @@ -184,7 +184,7 @@ __global__ void kmeans_assign_lloyd( const uint32_t cstep = d_shmem_size / (d_features_size + 1); F *csqrs = shared_centroids + cstep * d_features_size; const uint32_t size_each = cstep / - min(blockDim.x, border - blockIdx.x * blockDim.x) + 1; + min(blockDim.x, length - blockIdx.x * blockDim.x) + 1; bool insane = _neq(samples[0], samples[0]); F ssqr = _const(0); if (!insane) { @@ -243,13 +243,13 @@ __global__ void kmeans_assign_lloyd( template __global__ void kmeans_adjust( - const uint32_t border, const uint32_t coffset, + const uint32_t coffset, const uint32_t length, const F *__restrict__ samples, const uint32_t *__restrict__ assignments_prev, const uint32_t *__restrict__ assignments, F *__restrict__ centroids, uint32_t *__restrict__ ccounts) { uint32_t c = blockIdx.x * blockDim.x + threadIdx.x; - if (c >= border) { + if (c >= length) { return; } c += coffset; @@ -308,11 +308,11 @@ __global__ void kmeans_adjust( template __global__ void kmeans_yy_init( - const uint32_t border, const F *__restrict__ samples, + const uint32_t length, const F *__restrict__ samples, const F *__restrict__ centroids, const uint32_t *__restrict__ assignments, const uint32_t *__restrict__ groups, float *__restrict__ volatile bounds) { uint32_t sample = blockIdx.x * blockDim.x + threadIdx.x; - if (sample >= border) { + if (sample >= length) { return; } bounds += static_cast(sample) * (d_yy_groups_size + 1); @@ -326,7 +326,7 @@ __global__ void kmeans_yy_init( F *volatile shared_centroids = reinterpret_cast(shared_memory); const uint32_t cstep = d_shmem_size / d_features_size; const uint32_t size_each = cstep / - min(blockDim.x, border - blockIdx.x * blockDim.x) + 1; + min(blockDim.x, length - blockIdx.x * blockDim.x) + 1; for (uint32_t gc = 0; gc < d_clusters_size; gc += cstep) { uint32_t coffset = gc * d_features_size; @@ -365,10 +365,10 @@ __global__ void kmeans_yy_init( template __global__ void kmeans_yy_calc_drifts( - const uint32_t border, const uint32_t offset, + const uint32_t offset, const uint32_t length, const F *__restrict__ centroids, F *__restrict__ drifts) { uint32_t c = blockIdx.x * blockDim.x + threadIdx.x; - if (c >= border) { + if (c >= length) { return; } c += offset; @@ -378,17 +378,17 @@ __global__ void kmeans_yy_calc_drifts( } __global__ void kmeans_yy_find_group_max_drifts( - const uint32_t border, const uint32_t offset, + const uint32_t offset, const uint32_t length, const uint32_t *__restrict__ groups, float *__restrict__ drifts) { uint32_t group = blockIdx.x * blockDim.x + threadIdx.x; - if (group >= border) { + if (group >= length) { return; } group += offset; const uint32_t doffset = d_clusters_size * d_features_size; const uint32_t step = d_shmem_size / 2; const uint32_t size_each = d_shmem_size / - (2 * min(blockDim.x, border - blockIdx.x * blockDim.x)); + (2 * min(blockDim.x, length - blockIdx.x * blockDim.x)); extern __shared__ uint32_t shmem[]; float *cd = (float *)shmem; uint32_t *cg = shmem + step; @@ -418,13 +418,13 @@ __global__ void kmeans_yy_find_group_max_drifts( template __global__ void kmeans_yy_global_filter( - const uint32_t border, const F *__restrict__ samples, + const uint32_t length, const F *__restrict__ samples, const F *__restrict__ centroids, const uint32_t *__restrict__ groups, const float *__restrict__ drifts, const uint32_t *__restrict__ assignments, uint32_t *__restrict__ assignments_prev, float *__restrict__ bounds, uint32_t *__restrict__ passed) { uint32_t sample = blockIdx.x * blockDim.x + threadIdx.x; - if (sample >= border) { + if (sample >= length) { return; } bounds += static_cast(sample) * (d_yy_groups_size + 1); @@ -549,6 +549,30 @@ __global__ void kmeans_yy_local_filter( } } +template +__global__ void kmeans_calc_average_distance( + uint32_t offset, uint32_t length, const F *__restrict__ samples, + const F *__restrict__ centroids, const uint32_t *__restrict__ assignments, + float *distance) { + uint64_t sample = blockIdx.x * blockDim.x + threadIdx.x; + float dist = 0; + if (sample < length) { + sample += offset; + dist = METRIC::distance( + samples + sample * d_features_size, + centroids + assignments[sample] * d_features_size); + } + __shared__ float dists[BLOCK_SIZE]; + dists[threadIdx.x] = dist; + if (threadIdx.x == 0) { + float sum = 0; + for (int i = 0; i < BLOCK_SIZE; i++) { + sum += dists[i]; + } + atomicAdd(distance, sum); + } +} + ////////////////////------------------------------------------------------------ // Host functions //------------------------------------------------------------ ////////////////////------------------------------------------------------------ @@ -763,7 +787,7 @@ KMCUDAResult kmeans_cuda_lloyd( } dim3 cgrid(upper(length, cblock.x), 1, 1); KERNEL_SWITCH(kmeans_adjust, <<>>( - length, offset, reinterpret_cast(samples[devi].get()), + offset, length, reinterpret_cast(samples[devi].get()), (*assignments_prev)[devi].get(), (*assignments)[devi].get(), reinterpret_cast((*centroids)[devi].get()), (*ccounts)[devi].get())); ); @@ -917,7 +941,7 @@ KMCUDAResult kmeans_cuda_yy( } dim3 cgrid(upper(length, cblock.x), 1, 1); KERNEL_SWITCH(kmeans_adjust, <<>>( - length, offset, reinterpret_cast(samples[devi].get()), + offset, length, reinterpret_cast(samples[devi].get()), (*assignments_prev)[devi].get(), (*assignments)[devi].get(), reinterpret_cast((*centroids)[devi].get()), (*ccounts)[devi].get())); ); @@ -940,7 +964,7 @@ KMCUDAResult kmeans_cuda_yy( } dim3 cgrid(upper(length, cblock.x), 1, 1); KERNEL_SWITCH(kmeans_yy_calc_drifts, <<>>( - length, offset, reinterpret_cast((*centroids)[devi].get()), + offset, length, reinterpret_cast((*centroids)[devi].get()), reinterpret_cast((*drifts_yy)[devi].get()))); ); FOR_EACH_DEVI( @@ -961,7 +985,7 @@ KMCUDAResult kmeans_cuda_yy( } dim3 ggrid(upper(length, gblock.x), 1, 1); kmeans_yy_find_group_max_drifts<<>>( - length, offset, (*assignments_yy)[devi].get(), + offset, length, (*assignments_yy)[devi].get(), (*drifts_yy)[devi].get()); ); FOR_EACH_DEVI( @@ -1013,4 +1037,39 @@ KMCUDAResult kmeans_cuda_yy( ); } } + +KMCUDAResult kmeans_cuda_calc_average_distance( + uint32_t h_samples_size, uint16_t h_features_size, + KMCUDADistanceMetric metric, const std::vector &devs, int fp16x2, + int32_t verbosity, const udevptrs &samples, + const udevptrs ¢roids, const udevptrs &assignments, + float *average_distance) { + INFO("calculating the average distance...\n"); + auto plans = distribute(h_samples_size, h_features_size * sizeof(float), devs); + float *dev_dists[devs.size()]; + FOR_EACH_DEVI( + uint32_t offset, length; + std::tie(offset, length) = plans[devi]; + CUCH(cudaMalloc(&dev_dists[devi], sizeof(float)), + kmcudaMemoryAllocationFailure); + CUCH(cudaMemsetAsync(dev_dists[devi], 0, sizeof(float)), + kmcudaRuntimeError); + dim3 block(BLOCK_SIZE, 1, 1); + dim3 grid(upper(length, block.x), 1, 1); + KERNEL_SWITCH(kmeans_calc_average_distance, <<>>( + offset, length, reinterpret_cast(samples[devi].get()), + reinterpret_cast(centroids[devi].get()), + assignments[devi].get(), dev_dists[devi])); + ); + float sum = 0; + FOR_EACH_DEVI( + float hdist; + CUCH(cudaMemcpy(&hdist, dev_dists[devi], sizeof(float), cudaMemcpyDeviceToHost), + kmcudaMemoryCopyError); + sum += hdist; + ); + *average_distance = sum / h_samples_size; + return kmcudaSuccess; +} + } // extern "C" diff --git a/private.h b/private.h index b5d404b..54766e8 100644 --- a/private.h +++ b/private.h @@ -261,6 +261,13 @@ KMCUDAResult kmeans_cuda_setup( uint32_t samples_size, uint16_t features_size, uint32_t clusters_size, uint32_t yy_groups_size, const std::vector &devs, int32_t verbosity); +KMCUDAResult kmeans_init_centroids( + KMCUDAInitMethod method, uint32_t samples_size, uint16_t features_size, + uint32_t clusters_size, KMCUDADistanceMetric metric, uint32_t seed, + const std::vector &devs, int device_ptrs, int fp16x2, int32_t verbosity, + const float *host_centroids, const udevptrs &samples, + udevptrs *dists, udevptrs *dev_sums, udevptrs *centroids); + KMCUDAResult kmeans_cuda_yy( float tolerance, uint32_t yy_groups_size, uint32_t samples_size, uint32_t clusters_size, uint16_t features_size, KMCUDADistanceMetric metric, @@ -271,12 +278,12 @@ KMCUDAResult kmeans_cuda_yy( udevptrs *centroids_yy, udevptrs *bounds_yy, udevptrs *drifts_yy, udevptrs *passed_yy); -KMCUDAResult kmeans_init_centroids( - KMCUDAInitMethod method, uint32_t samples_size, uint16_t features_size, - uint32_t clusters_size, KMCUDADistanceMetric metric, uint32_t seed, - const std::vector &devs, int device_ptrs, int fp16x2, int32_t verbosity, - const float *host_centroids, const udevptrs &samples, - udevptrs *dists, udevptrs *dev_sums, udevptrs *centroids); +KMCUDAResult kmeans_cuda_calc_average_distance( + uint32_t samples_size, uint16_t features_size, + KMCUDADistanceMetric metric, const std::vector &devs, int fp16x2, + int32_t verbosity, const udevptrs &samples, + const udevptrs ¢roids, const udevptrs &assignments, + float *average_distance); KMCUDAResult knn_cuda_setup( uint32_t samples_size, uint16_t features_size, uint32_t clusters_size, diff --git a/python.cc b/python.cc index f40d02e..7674a39 100644 --- a/python.cc +++ b/python.cc @@ -172,17 +172,18 @@ static PyObject *py_kmeans_cuda(PyObject *self, PyObject *args, PyObject *kwargs device = 0; int32_t verbosity = 0; bool fp16x2 = false; + int adflag = 0; float tolerance = .01, yinyang_t = .1; PyObject *samples_obj, *init_obj = Py_None, *metric_obj = Py_None; static const char *kwlist[] = { - "samples", "clusters", "tolerance", "init", "yinyang_t", "metric", "seed", - "device", "verbosity", NULL}; + "samples", "clusters", "tolerance", "init", "yinyang_t", "metric", + "average_distance", "seed", "device", "verbosity", NULL}; /* Parse the input tuple */ if (!PyArg_ParseTupleAndKeywords( - args, kwargs, "OI|fOfOIIi", const_cast(kwlist), &samples_obj, - &clusters_size, &tolerance, &init_obj, &yinyang_t, &metric_obj, &seed, - &device, &verbosity)) { + args, kwargs, "OI|fOfOpIIi", const_cast(kwlist), &samples_obj, + &clusters_size, &tolerance, &init_obj, &yinyang_t, &metric_obj, &adflag, + &seed, &device, &verbosity)) { return NULL; } @@ -340,12 +341,14 @@ static PyObject *py_kmeans_cuda(PyObject *self, PyObject *args, PyObject *kwargs } } } + float average_distance = 0; int result; Py_BEGIN_ALLOW_THREADS result = kmeans_cuda( init, tolerance, yinyang_t, metric, samples_size, static_cast(features_size), clusters_size, seed, device, - device_ptrs, fp16x2, verbosity, samples, centroids, assignments); + device_ptrs, fp16x2, verbosity, samples, centroids, assignments, + adflag? &average_distance : nullptr); Py_END_ALLOW_THREADS switch (result) { @@ -367,12 +370,27 @@ static PyObject *py_kmeans_cuda(PyObject *self, PyObject *args, PyObject *kwargs return NULL; case kmcudaSuccess: if (device_ptrs < 0) { - return Py_BuildValue("OO", centroids_array.get(), assignments_array.get()); + if (!adflag) { + return Py_BuildValue( + "OO", centroids_array.get(), assignments_array.get()); + } else { + return Py_BuildValue( + "OOf", centroids_array.get(), assignments_array.get(), + average_distance); + } + } + if (!adflag) { + return Py_BuildValue( + "KK", + static_cast(reinterpret_cast(centroids)), + static_cast(reinterpret_cast(assignments))); + } else { + return Py_BuildValue( + "KKf", + static_cast(reinterpret_cast(centroids)), + static_cast(reinterpret_cast(assignments)), + average_distance); } - return Py_BuildValue( - "KK", - static_cast(reinterpret_cast(centroids)), - static_cast(reinterpret_cast(assignments))); default: PyErr_SetString(PyExc_AssertionError, "Unknown error code returned from kmeans_cuda"); diff --git a/setup.py b/setup.py index ea0c03d..ee06e9f 100644 --- a/setup.py +++ b/setup.py @@ -46,7 +46,7 @@ def is_pure(self): setup( name="libKMCUDA", description="Accelerated K-means and K-nn on GPU", - version="5.0.0", + version="5.1.0", license="MIT", author="Vadim Markovtsev", author_email="vadim@sourced.tech", diff --git a/test.py b/test.py index 0f628dc..bd270d9 100644 --- a/test.py +++ b/test.py @@ -479,6 +479,23 @@ def test_fp16_cosine_metric(self): self.assertEqual(numpy.min(assignments), 0) self.assertEqual(numpy.max(assignments), 3) + def _test_average_distance(self, dev): + centroids, assignments, distance = kmeans_cuda( + self.samples, 50, init="kmeans++", device=dev, + verbosity=2, seed=3, tolerance=0.05, yinyang_t=0, + average_distance=True) + valid_dist = 0.0 + for sample, ass in zip(self.samples, assignments): + valid_dist += numpy.linalg.norm(sample - centroids[ass]) + valid_dist /= self.samples.shape[0] + self.assertLess(numpy.abs(valid_dist - distance), 1e-6) + + def test_average_distance_single_dev(self): + self._test_average_distance(1) + + def test_average_distance_multiple_dev(self): + self._test_average_distance(0) + class KnnTests(unittest.TestCase): @classmethod