Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
vmarkovtsev committed Jan 24, 2017
2 parents e562632 + 66805d8 commit 2617e0e
Show file tree
Hide file tree
Showing 8 changed files with 177 additions and 56 deletions.
25 changes: 18 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ pyplot.scatter(centroids[:, 0], centroids[:, 1], c="white", s=150)
You should see something like this:
![Clustered dots](img/cls_euclidean.png)

#### K-means, angular (cosine) distance
#### K-means, angular (cosine) distance + average

```python
import numpy
Expand All @@ -146,7 +146,9 @@ arr = numpy.empty((10000, 2), dtype=numpy.float32)
angs = numpy.random.rand(10000) * 2 * numpy.pi
for i in range(10000):
arr[i] = numpy.sin(angs[i]), numpy.cos(angs[i])
centroids, assignments = kmeans_cuda(arr, 4, metric="cos", verbosity=1, seed=3)
centroids, assignments, avg_distance = kmeans_cuda(
arr, 4, metric="cos", verbosity=1, seed=3, average_distance=True)
print("Average distance between centroids and members:", avg_distance)
print(centroids)
pyplot.scatter(arr[:, 0], arr[:, 1], c=assignments)
pyplot.scatter(centroids[:, 0], centroids[:, 1], c="white", s=150)
Expand Down Expand Up @@ -192,7 +194,8 @@ Python API
----------
```python
def kmeans_cuda(samples, clusters, tolerance=0.0, init="k-means++",
yinyang_t=0.1, metric="L2", seed=time(), device=0, verbosity=0)
yinyang_t=0.1, metric="L2", average_distance=False,
seed=time(), device=0, verbosity=0)
```
**samples** numpy array of shape \[number of samples, number of features\]
or tuple(raw device pointer (int), device index (int), shape (tuple(number of samples, number of features\[, fp16x2 marker\]))).
Expand All @@ -212,9 +215,14 @@ def kmeans_cuda(samples, clusters, tolerance=0.0, init="k-means++",
**yinyang_t** float, the relative number of cluster groups, usually 0.1.

**metric** str, the name of the distance metric to use. The default is Euclidean (L2),
can be changed to "cos" to behave as Spherical K-means with the
angular distance. Please note that samples *must* be normalized in that
case.
can be changed to "cos" to behave as Spherical K-means with the
angular distance. Please note that samples *must* be normalized in that
case.

**average_distance** boolean, the value indicating whether to calculate
the average distance between cluster elements and
the corresponding centroids. Useful for finding
the best K. Returned as the third tuple element.

**seed** integer, random generator seed for reproducible results.

Expand Down Expand Up @@ -276,7 +284,7 @@ KMCUDAResult kmeans_cuda(
KMCUDADistanceMetric metric, uint32_t samples_size, uint16_t features_size,
uint32_t clusters_size, uint32_t seed, uint32_t device, int32_t device_ptrs,
int32_t fp16x2, int32_t verbosity, const float *samples, float *centroids,
uint32_t *assignments)
uint32_t *assignments, float *average_distance)
```
**init** specifies the centroids initialization method: k-means++, random or import
(in the latter case, **centroids** is read).
Expand Down Expand Up @@ -322,6 +330,9 @@ KMCUDAResult kmeans_cuda(
**assignments** output array of cluster indices for each sample of size
samples_size x 1.
**average_distance** output mean distance between cluster elements and
the corresponding centroids. If nullptr, not calculated.
Returns KMCUDAResult (see `kmcuda.h`);
```C
Expand Down
15 changes: 11 additions & 4 deletions kmcuda.cc
Original file line number Diff line number Diff line change
Expand Up @@ -317,11 +317,12 @@ KMCUDAResult kmeans_cuda(
KMCUDADistanceMetric metric, uint32_t samples_size, uint16_t features_size,
uint32_t clusters_size, uint32_t seed, uint32_t device, int32_t device_ptrs,
int32_t fp16x2, int32_t verbosity, const float *samples, float *centroids,
uint32_t *assignments) {
uint32_t *assignments, float *average_distance) {
DEBUG("arguments: %d %.3f %.2f %d %" PRIu32 " %" PRIu16 " %" PRIu32 " %"
PRIu32 " %" PRIu32 " %d %" PRIi32 " %p %p %p\n", init, tolerance,
PRIu32 " %" PRIu32 " %d %" PRIi32 " %p %p %p %p\n", init, tolerance,
yinyang_t, metric, samples_size, features_size, clusters_size, seed,
device, fp16x2, verbosity, samples, centroids, assignments);
device, fp16x2, verbosity, samples, centroids, assignments,
average_distance);
RETERR(check_kmeans_args(
tolerance, yinyang_t, samples_size, features_size, clusters_size,
device, fp16x2, verbosity, samples, centroids, assignments));
Expand Down Expand Up @@ -402,7 +403,13 @@ KMCUDAResult kmeans_cuda(
metric, devs, fp16x2, verbosity, device_samples, &device_centroids, &device_ccounts,
&device_assignments_prev, &device_assignments, &device_assignments_yy,
&device_centroids_yy, &device_bounds_yy, &device_drifts_yy, &device_passed_yy),
DEBUG("kmeans_cuda_internal failed: %s\n", CUERRSTR()));
DEBUG("kmeans_cuda_yy failed: %s\n", CUERRSTR()));
if (average_distance) {
RETERR(kmeans_cuda_calc_average_distance(
samples_size, features_size, metric, devs, fp16x2, verbosity,
device_samples, device_centroids, device_assignments, average_distance),
DEBUG("kmeans_cuda_calc_average_distance failed: %s\n", CUERRSTR()));
}
#ifdef PROFILE
FOR_EACH_DEV(cudaProfilerStop());
#endif
Expand Down
4 changes: 3 additions & 1 deletion kmcuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,15 @@ extern "C" {
/// in row major format.
/// @param assignments output array of cluster indices for each sample of size
/// samples_size x 1.
/// @param average_distance output mean distance between cluster elements and
/// the corresponding centroids. If nullptr, not calculated.
/// @return KMCUDAResult.
KMCUDAResult kmeans_cuda(
KMCUDAInitMethod init, float tolerance, float yinyang_t,
KMCUDADistanceMetric metric, uint32_t samples_size, uint16_t features_size,
uint32_t clusters_size, uint32_t seed, uint32_t device, int32_t device_ptrs,
int32_t fp16x2, int32_t verbosity, const float *samples, float *centroids,
uint32_t *assignments);
uint32_t *assignments, float *average_distance);

/// @brief Calculates K nearest neighbors for every sample using
/// the precalculated K-means clusters.
Expand Down
111 changes: 85 additions & 26 deletions kmeans.cu
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,11 @@ __device__ __forceinline__ uint32_t atomicAggInc(uint32_t *ctr) {

template <KMCUDADistanceMetric M, typename F>
__global__ void kmeans_plus_plus(
const uint32_t border, const uint32_t cc, const F *__restrict__ samples,
const uint32_t length, const uint32_t cc, const F *__restrict__ samples,
const F *__restrict__ centroids, float *__restrict__ dists,
float *__restrict__ dist_sums) {
uint32_t sample = blockIdx.x * blockDim.x + threadIdx.x;
if (sample >= border) {
if (sample >= length) {
return;
}
samples += static_cast<uint64_t>(sample) * d_features_size;
Expand All @@ -67,8 +67,8 @@ __global__ void kmeans_plus_plus(
}
local_dists[threadIdx.x] = dist;
uint32_t end = blockDim.x;
if ((blockIdx.x + 1) * blockDim.x > border) {
end = border - blockIdx.x * blockDim.x;
if ((blockIdx.x + 1) * blockDim.x > length) {
end = length - blockIdx.x * blockDim.x;
}
__syncthreads();
if (threadIdx.x % 16 == 0) {
Expand All @@ -90,12 +90,12 @@ __global__ void kmeans_plus_plus(

template <KMCUDADistanceMetric M, typename F>
__global__ void kmeans_assign_lloyd_smallc(
const uint32_t border, const F *__restrict__ samples,
const uint32_t length, const F *__restrict__ samples,
const F *__restrict__ centroids, uint32_t *__restrict__ assignments_prev,
uint32_t * __restrict__ assignments) {
using HF = typename HALF<F>::type;
uint32_t sample = blockIdx.x * blockDim.x + threadIdx.x;
if (sample >= border) {
if (sample >= length) {
return;
}
samples += static_cast<uint64_t>(sample) * d_features_size;
Expand All @@ -108,7 +108,7 @@ __global__ void kmeans_assign_lloyd_smallc(
(d_features_size + 1);
F *csqrs = shared_centroids + cstep * d_features_size;
const uint32_t size_each = cstep /
min(blockDim.x, border - blockIdx.x * blockDim.x) + 1;
min(blockDim.x, length - blockIdx.x * blockDim.x) + 1;
bool insane = _neq(samples[0], samples[0]);
F ssqr = _const<F>(0);
const uint32_t soffset = threadIdx.x * d_features_size;
Expand Down Expand Up @@ -168,12 +168,12 @@ __global__ void kmeans_assign_lloyd_smallc(

template <KMCUDADistanceMetric M, typename F>
__global__ void kmeans_assign_lloyd(
const uint32_t border, const F *__restrict__ samples,
const uint32_t length, const F *__restrict__ samples,
const F *__restrict__ centroids, uint32_t *__restrict__ assignments_prev,
uint32_t * __restrict__ assignments) {
using HF = typename HALF<F>::type;
uint32_t sample = blockIdx.x * blockDim.x + threadIdx.x;
if (sample >= border) {
if (sample >= length) {
return;
}
samples += static_cast<uint64_t>(sample) * d_features_size;
Expand All @@ -184,7 +184,7 @@ __global__ void kmeans_assign_lloyd(
const uint32_t cstep = d_shmem_size / (d_features_size + 1);
F *csqrs = shared_centroids + cstep * d_features_size;
const uint32_t size_each = cstep /
min(blockDim.x, border - blockIdx.x * blockDim.x) + 1;
min(blockDim.x, length - blockIdx.x * blockDim.x) + 1;
bool insane = _neq(samples[0], samples[0]);
F ssqr = _const<F>(0);
if (!insane) {
Expand Down Expand Up @@ -243,13 +243,13 @@ __global__ void kmeans_assign_lloyd(

template <KMCUDADistanceMetric M, typename F>
__global__ void kmeans_adjust(
const uint32_t border, const uint32_t coffset,
const uint32_t coffset, const uint32_t length,
const F *__restrict__ samples,
const uint32_t *__restrict__ assignments_prev,
const uint32_t *__restrict__ assignments,
F *__restrict__ centroids, uint32_t *__restrict__ ccounts) {
uint32_t c = blockIdx.x * blockDim.x + threadIdx.x;
if (c >= border) {
if (c >= length) {
return;
}
c += coffset;
Expand Down Expand Up @@ -308,11 +308,11 @@ __global__ void kmeans_adjust(

template <KMCUDADistanceMetric M, typename F>
__global__ void kmeans_yy_init(
const uint32_t border, const F *__restrict__ samples,
const uint32_t length, const F *__restrict__ samples,
const F *__restrict__ centroids, const uint32_t *__restrict__ assignments,
const uint32_t *__restrict__ groups, float *__restrict__ volatile bounds) {
uint32_t sample = blockIdx.x * blockDim.x + threadIdx.x;
if (sample >= border) {
if (sample >= length) {
return;
}
bounds += static_cast<uint64_t>(sample) * (d_yy_groups_size + 1);
Expand All @@ -326,7 +326,7 @@ __global__ void kmeans_yy_init(
F *volatile shared_centroids = reinterpret_cast<F*>(shared_memory);
const uint32_t cstep = d_shmem_size / d_features_size;
const uint32_t size_each = cstep /
min(blockDim.x, border - blockIdx.x * blockDim.x) + 1;
min(blockDim.x, length - blockIdx.x * blockDim.x) + 1;

for (uint32_t gc = 0; gc < d_clusters_size; gc += cstep) {
uint32_t coffset = gc * d_features_size;
Expand Down Expand Up @@ -365,10 +365,10 @@ __global__ void kmeans_yy_init(

template <KMCUDADistanceMetric M, typename F>
__global__ void kmeans_yy_calc_drifts(
const uint32_t border, const uint32_t offset,
const uint32_t offset, const uint32_t length,
const F *__restrict__ centroids, F *__restrict__ drifts) {
uint32_t c = blockIdx.x * blockDim.x + threadIdx.x;
if (c >= border) {
if (c >= length) {
return;
}
c += offset;
Expand All @@ -378,17 +378,17 @@ __global__ void kmeans_yy_calc_drifts(
}

__global__ void kmeans_yy_find_group_max_drifts(
const uint32_t border, const uint32_t offset,
const uint32_t offset, const uint32_t length,
const uint32_t *__restrict__ groups, float *__restrict__ drifts) {
uint32_t group = blockIdx.x * blockDim.x + threadIdx.x;
if (group >= border) {
if (group >= length) {
return;
}
group += offset;
const uint32_t doffset = d_clusters_size * d_features_size;
const uint32_t step = d_shmem_size / 2;
const uint32_t size_each = d_shmem_size /
(2 * min(blockDim.x, border - blockIdx.x * blockDim.x));
(2 * min(blockDim.x, length - blockIdx.x * blockDim.x));
extern __shared__ uint32_t shmem[];
float *cd = (float *)shmem;
uint32_t *cg = shmem + step;
Expand Down Expand Up @@ -418,13 +418,13 @@ __global__ void kmeans_yy_find_group_max_drifts(

template <KMCUDADistanceMetric M, typename F>
__global__ void kmeans_yy_global_filter(
const uint32_t border, const F *__restrict__ samples,
const uint32_t length, const F *__restrict__ samples,
const F *__restrict__ centroids, const uint32_t *__restrict__ groups,
const float *__restrict__ drifts, const uint32_t *__restrict__ assignments,
uint32_t *__restrict__ assignments_prev, float *__restrict__ bounds,
uint32_t *__restrict__ passed) {
uint32_t sample = blockIdx.x * blockDim.x + threadIdx.x;
if (sample >= border) {
if (sample >= length) {
return;
}
bounds += static_cast<uint64_t>(sample) * (d_yy_groups_size + 1);
Expand Down Expand Up @@ -549,6 +549,30 @@ __global__ void kmeans_yy_local_filter(
}
}

template <KMCUDADistanceMetric M, typename F>
__global__ void kmeans_calc_average_distance(
uint32_t offset, uint32_t length, const F *__restrict__ samples,
const F *__restrict__ centroids, const uint32_t *__restrict__ assignments,
float *distance) {
uint64_t sample = blockIdx.x * blockDim.x + threadIdx.x;
float dist = 0;
if (sample < length) {
sample += offset;
dist = METRIC<M, F>::distance(
samples + sample * d_features_size,
centroids + assignments[sample] * d_features_size);
}
__shared__ float dists[BLOCK_SIZE];
dists[threadIdx.x] = dist;
if (threadIdx.x == 0) {
float sum = 0;
for (int i = 0; i < BLOCK_SIZE; i++) {
sum += dists[i];
}
atomicAdd(distance, sum);
}
}

////////////////////------------------------------------------------------------
// Host functions //------------------------------------------------------------
////////////////////------------------------------------------------------------
Expand Down Expand Up @@ -763,7 +787,7 @@ KMCUDAResult kmeans_cuda_lloyd(
}
dim3 cgrid(upper(length, cblock.x), 1, 1);
KERNEL_SWITCH(kmeans_adjust, <<<cgrid, cblock, shmem_sizes[devi]>>>(
length, offset, reinterpret_cast<const F*>(samples[devi].get()),
offset, length, reinterpret_cast<const F*>(samples[devi].get()),
(*assignments_prev)[devi].get(), (*assignments)[devi].get(),
reinterpret_cast<F*>((*centroids)[devi].get()), (*ccounts)[devi].get()));
);
Expand Down Expand Up @@ -917,7 +941,7 @@ KMCUDAResult kmeans_cuda_yy(
}
dim3 cgrid(upper(length, cblock.x), 1, 1);
KERNEL_SWITCH(kmeans_adjust, <<<cgrid, cblock, shmem_sizes[devi]>>>(
length, offset, reinterpret_cast<const F*>(samples[devi].get()),
offset, length, reinterpret_cast<const F*>(samples[devi].get()),
(*assignments_prev)[devi].get(), (*assignments)[devi].get(),
reinterpret_cast<F*>((*centroids)[devi].get()), (*ccounts)[devi].get()));
);
Expand All @@ -940,7 +964,7 @@ KMCUDAResult kmeans_cuda_yy(
}
dim3 cgrid(upper(length, cblock.x), 1, 1);
KERNEL_SWITCH(kmeans_yy_calc_drifts, <<<cgrid, cblock>>>(
length, offset, reinterpret_cast<const F*>((*centroids)[devi].get()),
offset, length, reinterpret_cast<const F*>((*centroids)[devi].get()),
reinterpret_cast<F*>((*drifts_yy)[devi].get())));
);
FOR_EACH_DEVI(
Expand All @@ -961,7 +985,7 @@ KMCUDAResult kmeans_cuda_yy(
}
dim3 ggrid(upper(length, gblock.x), 1, 1);
kmeans_yy_find_group_max_drifts<<<ggrid, gblock, shmem_sizes[devi]>>>(
length, offset, (*assignments_yy)[devi].get(),
offset, length, (*assignments_yy)[devi].get(),
(*drifts_yy)[devi].get());
);
FOR_EACH_DEVI(
Expand Down Expand Up @@ -1013,4 +1037,39 @@ KMCUDAResult kmeans_cuda_yy(
);
}
}

KMCUDAResult kmeans_cuda_calc_average_distance(
uint32_t h_samples_size, uint16_t h_features_size,
KMCUDADistanceMetric metric, const std::vector<int> &devs, int fp16x2,
int32_t verbosity, const udevptrs<float> &samples,
const udevptrs<float> &centroids, const udevptrs<uint32_t> &assignments,
float *average_distance) {
INFO("calculating the average distance...\n");
auto plans = distribute(h_samples_size, h_features_size * sizeof(float), devs);
float *dev_dists[devs.size()];
FOR_EACH_DEVI(
uint32_t offset, length;
std::tie(offset, length) = plans[devi];
CUCH(cudaMalloc(&dev_dists[devi], sizeof(float)),
kmcudaMemoryAllocationFailure);
CUCH(cudaMemsetAsync(dev_dists[devi], 0, sizeof(float)),
kmcudaRuntimeError);
dim3 block(BLOCK_SIZE, 1, 1);
dim3 grid(upper(length, block.x), 1, 1);
KERNEL_SWITCH(kmeans_calc_average_distance, <<<grid, block>>>(
offset, length, reinterpret_cast<const F*>(samples[devi].get()),
reinterpret_cast<const F*>(centroids[devi].get()),
assignments[devi].get(), dev_dists[devi]));
);
float sum = 0;
FOR_EACH_DEVI(
float hdist;
CUCH(cudaMemcpy(&hdist, dev_dists[devi], sizeof(float), cudaMemcpyDeviceToHost),
kmcudaMemoryCopyError);
sum += hdist;
);
*average_distance = sum / h_samples_size;
return kmcudaSuccess;
}

} // extern "C"
Loading

0 comments on commit 2617e0e

Please sign in to comment.