make README the cluster function docstring; remove boilerplate code

JuliaML · Jun 18, 2024 · b163d24 · b163d24
1 parent 468bd68
commit b163d24
Show file tree

Hide file tree

Showing 3 changed files with 45 additions and 57 deletions.
diff --git a/README.md b/README.md
@@ -13,14 +13,51 @@ which interplay with the functions:
 - `cluster_labels`
 - `cluster_probs`
 
+## `cluster` documentation
+
+```julia
+cluster(ca::ClusteringAlgorithm, data) → cr::ClusteringResults
+```
+
+Cluster input `data` according to the algorithm specified by `ca`.
+All options related to the algorithm are given as keyword arguments when
+constructing `ca`.
+
+The input `data` is a length-m iterable of "vectors" (data points).
+"Vector" here is meant in the generalized sense, i.e., any objects on which
+a distance can be defined, so that they can be clustered.
+In the majority of cases these are vectors of real numbers.
+If you have a matrix with each row a data point, simply pass in `eachrow(matrix)`.
+
+The output is always a subtype of `ClusteringResults` that can be further queried.
+The cluster labels are always the
+positive integers `1:n` with `n::Int` the number of created clusters.
+Data points that couldn't get clustered (e.g., outliers or noise)
+get assigned negative integers, typically just `-1`.
+
+`ClusteringResults` subtypes always implement the following functions:
+
+- `cluster_labels(cr)` returns a length-m vector `labels::Vector{Int}` containing
+  the clustering labels, so that `data[i]` has label `labels[i]`.
+- `cluster_probs(cr)` returns `probs` a length-m vector of length-`n` vectors
+  containing the "probabilities" or "score" of each point belonging to one of
+  the created clusters (useful for fuzzy clustering algorithms).
+- `cluster_number(cr)` returns `n`.
+
+Other algorithm-related output can be obtained as a field of the result type,
+or by using other specific functions of the result type.
+This is described in the docstrings of the individual algorithm implementations.
+
+## For developers
+
 To create new clustering algorithms simply create a new
 subtype of `ClusteringAlgorithm` that extends `cluster`
 so that it returns a new subtype of `ClusteringResults`.
 This result must extend `cluster_number, cluster_labels`
 and optionally `cluster_probs`.
 
-For developers: see two helper functions `each_data_point, input_data_size`
-so that you can support matrix input while abiding the declared api
+See also the two helper functions `each_data_point, input_data_size`
+which help you support matrix input while abiding by the declared API
 of iterable of vectors as input.
 
 For more, see the docstring of `cluster`.
diff --git a/src/ClusteringAPI.jl b/src/ClusteringAPI.jl
@@ -6,41 +6,17 @@ export cluster, cluster_number, cluster_labels, cluster_probs
 abstract type ClusteringAlgorithm end
 abstract type ClusteringResults end
 
-"""
-    cluster(ca::ClusteringAlgortihm, data) → cr::ClusteringResults
-
-Cluster input `data` according to the algorithm specified by `ca`.
-All options related to the algorithm are given as keyword arguments when
-constructing `ca`.
-
-The input `data` is a length-m iterable of "vectors" (data points).
-"Vector" here is considered in the generalized sense, i.e., any objects that
-a distance can be defined on them so that they can be clustered.
-In the majority of cases these are vectors of real numbers.
-
-The output is always a subtype of `ClusteringResults` that can be further queried.
-The cluster labels are always the
-positive integers `1:n` with `n::Int` the number of created clusters,
-Data points that couldn't get clustered (e.g., outliers or noise)
-get assigned negative integers, typically just `-1`.
 
-`ClusteringResults` subtypes always implement the following functions:
-
-- `cluster_labels(cr)` returns a length-m vector `labels::Vector{Int}` containing
-  the clustering labels , so that `data[i]` has label `labels[i]`.
-- `cluster_probs(cr)` returns `probs` a length-m vector of length-`n` vectors
-  containing the "probabilities" or "score" of each point belonging to one of
-  the created clusters (useful for fuzzy clustering algorithms).
-- `cluster_number(cr)` returns `n`.
-
-Other algorithm-related output can be obtained as a field of the result type,
-or by using other specific functions of the result type.
-This is described in the individual algorithm implementations docstrings.
-"""
 function cluster(ca::ClusteringAlgorithm, data) # generic fallback for algorithm types that define no `cluster` method of their own
     throw(ArgumentError("No implementation for `cluster` for $(typeof(ca))."))
 end
 
+@doc let # make README the `cluster` function docstring.
+    path = joinpath(dirname(@__DIR__), "README.md") # README.md is looked up one directory above this source file's directory
+    include_dependency(path) # declare README.md a precompilation dependency so the module is recompiled when it changes
+    read(path, String) # the file's full text is the value of the `let` block, which `@doc` attaches to `cluster`
+end cluster
+
 """
     cluster_number(cr::ClusteringResults) → n::Int
 
@@ -76,22 +52,4 @@ function cluster_probs(cr::ClusteringResults)
     return probs
 end
 
-# two helper functions for agnostic input data type
-"""
-    input_data_size(data) → (d, m)
-
-Return the data point dimension and number of data points.
-"""
-input_data_size(A::AbstractMatrix) = size(A)
-input_data_size(A::AbstractVector{<:AbstractVector}) = (length(first(A)), length(A))
-
-"""
-    each_data_point(data)
-
-Return an indexable iterator over each data point in `data`, that can be
-indexed with indices `1:m`.
-"""
-each_data_point(A::AbstractMatrix) = eachcol(A)
-each_data_point(A::AbstractVector{<:AbstractVector}) = A
-
 end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -16,10 +16,3 @@ cr = cluster(TestClustering(), randn(100))
 @test cluster_number(cr) == 1
 @test cluster_labels(cr) == fill(1, 100)
 @test cluster_probs(cr) == fill([1.0], 100)
-
-@test ClusteringAPI.input_data_size([rand(3) for _ in 1:30]) == (3, 30)
-@test ClusteringAPI.input_data_size(rand(3,30)) == (3, 30)
-
-v = [ones(3) for _ in 1:30]
-@test ClusteringAPI.each_data_point(v) == v
-@test ClusteringAPI.each_data_point(ones(3,30)) == v