"""
    MethodDescriber

Metadata record for one dataset-generating method: its `name`, an optional
human-readable `description`, an optional `problem_type` (e.g. `:Classification`
or `:Regression`), and the generator function `f` itself.
"""
mutable struct MethodDescriber
    name::String
    description::Union{String, Nothing}
    problem_type::Union{Symbol, Nothing}
    f::Union{Function, Nothing}

    # Incremental inner constructor: fields are populated by the outer
    # keyword constructor below.
    MethodDescriber() = new()
end

"""
    MethodDescriber(name; description = nothing, problem_type = nothing, f = nothing)

Build a fully-populated `MethodDescriber` for the method called `name`.
"""
function MethodDescriber(name::String;
                         description = nothing,
                         problem_type = nothing,
                         f = nothing)
    method = MethodDescriber()
    method.name = name
    method.description = description
    method.problem_type = problem_type
    method.f = f
    return method
end

function Base.show(io::IO, method::MethodDescriber)
    println(io, "$(method.name)")
    method.problem_type !== nothing && println(io, "problem type: " * string(method.problem_type))
    method.description !== nothing && println(io, "Description:\n" * method.description)
end

"""
    MethodDescriberSet

An ordered collection of `MethodDescriber`s.
"""
mutable struct MethodDescriberSet
    describers::Array{MethodDescriber, 1}
end

MethodDescriberSet(args...) = MethodDescriberSet([args...])

"""
    methodsFilter(methods::MethodDescriberSet, parameters)

Return the subset of `methods` whose fields match ANY of the `field => value`
pairs in `parameters` (union semantics). Pairs whose first element is not a
field of `MethodDescriber` are skipped with a warning.
"""
function methodsFilter(methods::MethodDescriberSet, parameters::Union{Pair, Array{Pair}})
    if !(parameters isa Array)
        parameters = [parameters]
    end

    # Typed set: avoids an abstract Set{Any} and de-duplicates describers
    # matched by more than one pair.
    filtered_methods = Set{MethodDescriber}()

    for parameter in parameters
        if !(parameter[1] in fieldnames(MethodDescriber))
            @warn "$(parameter[1]) isn't a property of MethodDescriber"
            continue
        end
        for method in methods.describers
            property = getfield(method, parameter[1])
            if property == parameter[2]
                push!(filtered_methods, method)
            end
        end
    end

    return MethodDescriberSet(collect(filtered_methods))
end

# Convenience overload that filters the global registry.
methodsFilter(parameters::Union{Pair, Array{Pair}}) = methodsFilter(METHODS, parameters)

Base.getindex(methods::MethodDescriberSet, index::Int) = methods.describers[index]

function Base.show(io::IO, methods::MethodDescriberSet)
    for method in methods.describers
        print(io, method)
    end
end

# Registry of all described generators. Entry names match the actual
# function names so filtering by :name works.
const METHODS = MethodDescriberSet(
    MethodDescriber(
        "generate_blobs",
        problem_type = :Classification,
        description = """
        Generate isotropic Gaussian blobs for clustering. Sklearn interface to make_blobs.
        """,
        f = generate_blobs),
    MethodDescriber(
        "generate_moons",
        problem_type = :Classification,
        # Fixed: was a copy-paste of the generate_blobs description.
        description = """
        Make two interleaving half circles. Sklearn interface to make_moons.
        """,
        f = generate_moons),
    MethodDescriber(
        # Fixed: was "make_s_curve", inconsistent with the bound function name.
        "generate_s_curve",
        problem_type = :Regression,
        description = """
        Generate an S curve dataset. Sklearn interface to make_s_curve.
        """,
        f = generate_s_curve),
    MethodDescriber(
        "generate_regression",
        problem_type = :Regression,
        description = """
        Generate a random regression problem. Sklearn interface to make_regression.
        """,
        f = generate_regression)
)

# Accessor for the full registry.
methods() = METHODS
using SyntheticDatasets
using DataFrames
using Test

# NOTE(review): despite the file name (describer.jl) these tests only exercise
# the sklearn-style generators; MethodDescriber/methodsFilter are untested here —
# consider adding coverage for the descriptor API.
@testset "SkLearn Generators" begin
    samples  = 20000
    features = 20

    # generate_blobs: n_samples rows, 2 coordinate columns + 1 label column.
    data = SyntheticDatasets.generate_blobs(centers = [-1 1; -0.5 0.75],
                                            cluster_std = 0.225,
                                            n_samples = samples,  # was a hard-coded 20000
                                            center_box = (-1.5, 1.5))
    @test size(data) == (samples, 3)

    data = SyntheticDatasets.generate_moons(n_samples = samples)  # was a hard-coded 20000
    @test size(data) == (samples, 3)

    # generate_s_curve: 3 coordinate columns + 1 position column.
    data = SyntheticDatasets.generate_s_curve(n_samples = samples,
                                              noise = 2.2,
                                              random_state = 5)
    @test size(data) == (samples, 4)

    data = SyntheticDatasets.generate_circles(n_samples = samples)
    @test size(data) == (samples, 3)

    # Feature-matrix generators: n_features columns + 1 target column.
    data = SyntheticDatasets.generate_regression(n_samples = samples,
                                                 n_features = features,
                                                 noise = 2.2,
                                                 random_state = 5)
    @test size(data) == (samples, features + 1)

    data = SyntheticDatasets.generate_classification(n_samples = samples,
                                                     n_features = features,
                                                     n_classes = 1)
    @test size(data) == (samples, features + 1)

    data = SyntheticDatasets.generate_friedman1(n_samples = samples,
                                                n_features = features)
    @test size(data) == (samples, features + 1)

    data = SyntheticDatasets.generate_friedman2(n_samples = samples)
    @test size(data) == (samples, 5)

    data = SyntheticDatasets.generate_friedman3(n_samples = samples)
    @test size(data) == (samples, 5)

    # Low-rank matrix has no target column.
    data = SyntheticDatasets.generate_low_rank_matrix(n_samples = samples,
                                                      n_features = features,
                                                      effective_rank = 10,
                                                      tail_strength = 0.5,
                                                      random_state = 5)
    @test size(data) == (samples, features)

    data = SyntheticDatasets.generate_swiss_roll(n_samples = samples,
                                                 noise = 2.2,
                                                 random_state = 5)
    @test size(data) == (samples, 4)
end