diff --git a/src/MLJFlux.jl b/src/MLJFlux.jl index 84bce73f..981bc4d4 100644 --- a/src/MLJFlux.jl +++ b/src/MLJFlux.jl @@ -1,4 +1,4 @@ -module MLJFlux +module MLJFlux export CUDALibs, CPU1 @@ -14,6 +14,8 @@ using ColorTypes using ComputationalResources using Random +const MMI=MLJModelInterface + include("penalizers.jl") include("core.jl") include("builders.jl") @@ -24,7 +26,7 @@ include("image.jl") include("mlj_model_interface.jl") ### Package specific model traits: -MLJModelInterface.metadata_pkg.((NeuralNetworkRegressor, +MMI.metadata_pkg.((NeuralNetworkRegressor, MultitargetNeuralNetworkRegressor, NeuralNetworkClassifier, ImageClassifier), @@ -37,4 +39,6 @@ MLJModelInterface.metadata_pkg.((NeuralNetworkRegressor, export NeuralNetworkRegressor, MultitargetNeuralNetworkRegressor export NeuralNetworkClassifier, ImageClassifier + + end #module diff --git a/src/classifier.jl b/src/classifier.jl index 82d4efc9..2825dff7 100644 --- a/src/classifier.jl +++ b/src/classifier.jl @@ -31,8 +31,4 @@ end MLJModelInterface.metadata_model(NeuralNetworkClassifier, input=Table(Continuous), target=AbstractVector{<:Finite}, - path="MLJFlux.NeuralNetworkClassifier", - descr="A neural network model for making "* - "probabilistic predictions of a "* - "`Multiclass` or `OrderedFactor` target, "* - "given a table of `Continuous` features. ") + path="MLJFlux.NeuralNetworkClassifier") diff --git a/src/image.jl b/src/image.jl index 5c973eb7..dc8d5637 100644 --- a/src/image.jl +++ b/src/image.jl @@ -29,7 +29,4 @@ end MLJModelInterface.metadata_model(ImageClassifier, input=AbstractVector{<:MLJModelInterface.Image}, target=AbstractVector{<:Multiclass}, - path="MLJFlux.ImageClassifier", - descr="A neural network model for making probabilistic "* - "predictions of a `GrayImage` target, "* - "given a table of `Continuous` features. ") + path="MLJFlux.ImageClassifier") diff --git a/src/regressor.jl b/src/regressor.jl index f932bff7..85a431aa 100644 --- a/src/regressor.jl +++ b/src/regressor.jl @@ -23,11 +23,7 @@ end MLJModelInterface.metadata_model(NeuralNetworkRegressor, input=Table(Continuous), target=AbstractVector{<:Continuous}, - path="MLJFlux.NeuralNetworkRegressor", - descr="A neural network model for making "* - "deterministic predictions of a "* - "`Continuous` target, given a table of "* - "`Continuous` features. ") + path="MLJFlux.NeuralNetworkRegressor") # # MULTITARGET NEURAL NETWORK REGRESSOR @@ -59,9 +55,4 @@ end MLJModelInterface.metadata_model(MultitargetNeuralNetworkRegressor, input=Table(Continuous), target=Table(Continuous), - path="MLJFlux.MultitargetNeuralNetworkRegressor", - descr = "A neural network model for making "* - "deterministic predictions of a "* - "`Continuous` multi-target, presented "* - "as a table, given a table of "* - "`Continuous` features. ") + path="MLJFlux.MultitargetNeuralNetworkRegressor") diff --git a/src/types.jl b/src/types.jl index bf5674af..1f454a63 100644 --- a/src/types.jl +++ b/src/types.jl @@ -3,51 +3,6 @@ abstract type MLJFluxDeterministic <: MLJModelInterface.Deterministic end const MLJFluxModel = Union{MLJFluxProbabilistic,MLJFluxDeterministic} -const doc_regressor(model_name) = """ - - $model_name(; hyparameters...) - -Instantiate an MLJFlux model. Available hyperparameters: - -- `builder`: Default = `MLJFlux.Linear(σ=Flux.relu)` (regressors) or - `MLJFlux.Short(n_hidden=0, dropout=0.5, σ=Flux.σ)` (classifiers) - -- `optimiser`: The optimiser to use for training. Default = - `Flux.ADAM()` - -- `loss`: The loss function used for training. 
Default = `Flux.mse` - (regressors) and `Flux.crossentropy` (classifiers) - -- `epochs`: Number of epochs to train for. Default = `10` - -- `batch_size`: The batch_size for the data. Default = 1 - -- `lambda`: The regularization strength. Default = 0. Range = [0, ∞) - -- `alpha`: The L2/L1 mix of regularization. Default = 0. Range = [0, 1] - -- `rng`: The random number generator (RNG) passed to builders, for - weight intitialization, for example. Can be any `AbstractRNG` or - the seed (integer) for a `MersenneTwister` that is reset on every - cold restart of model (machine) training. Default = - `GLOBAL_RNG`. - -- `acceleration`: Use `CUDALibs()` for training on GPU; default is `CPU1()`. - -- `optimiser_changes_trigger_retraining`: True if fitting an - associated machine should trigger retraining from scratch whenever - the optimiser changes. Default = `false` - -""" - -doc_classifier(model_name) = doc_regressor(model_name)*""" -- `finaliser`: Operation applied to the unnormalized output of the - final layer to obtain probabilities (outputs summing to - one). The shape of the inputs and outputs - of this operator must match. Default = `Flux.softmax`. - -""" - for Model in [:NeuralNetworkClassifier, :ImageClassifier] ex = quote @@ -67,7 +22,7 @@ for Model in [:NeuralNetworkClassifier, :ImageClassifier] function $Model(; builder::B = Short() , finaliser::F = Flux.softmax - , optimiser::O = Flux.Optimise.ADAM() + , optimiser::O = Flux.Optimise.Adam() , loss::L = Flux.crossentropy , epochs = 10 , batch_size = 1 @@ -97,13 +52,340 @@ for Model in [:NeuralNetworkClassifier, :ImageClassifier] return model end - @doc doc_classifier($Model) $Model - end eval(ex) end +""" +$(MMI.doc_header(NeuralNetworkClassifier)) + +`NeuralNetworkClassifier` is for training a data-dependent Flux.jl neural network +for making probabilistic predictions of a `Multiclass` or `OrderedFactor` target, +given a table of `Continuous` features. Users provide a recipe for constructing + the network, based on properties of the data that is encountered, by specifying + an appropriate `builder`. See MLJFlux documentation for more on builders. + +# Training data + +In MLJ or MLJBase, bind an instance `model` to data with + + mach = machine(model, X, y) + +Where + +- `X`: is any table of input features (eg, a `DataFrame`) whose columns + are of scitype `Continuous`; check the column scitypes with `schema(X)`. +- `y`: is the target, which can be any `AbstractVector` whose element + scitype is `Multiclass` or `OrderedFactor` with `n_out` classes; + check the scitype with `scitype(y)` + + +# Hyper-parameters + +- `builder=MLJFlux.Short()`: An MLJFlux builder that constructs a neural + network. Possible `builders` include: `MLJFlux.Linear`, `MLJFlux.Short`, + and `MLJFlux.MLP`. See MLJFlux documentation for examples of + user-defined builders. +- `optimiser::Flux.Adam()`: A `Flux.Optimise` optimiser. The optimiser performs the updating of the weights of the network. For further reference, see either the examples or [the Flux optimiser documentation](https://fluxml.ai/Flux.jl/stable/training/optimisers/). To choose a learning rate (the update rate of the optimizer), a good rule of thumb is to start out at `10e-3`, and tune using powers of 10 between `1` and `1e-7`. +- `loss=Flux.crossentropy`: The loss function which the network will optimize. Should be a function which can be called in the form `loss(yhat, y)`. 
+  Possible loss functions are listed in [the Flux loss function documentation](https://fluxml.ai/Flux.jl/stable/models/losses/). For a classification task, the most natural loss functions are:
+    - `Flux.crossentropy`: Typically used as loss in multiclass classification, with labels in a 1-hot encoded format.
+    - `Flux.logitcrossentropy`: Mathematically equal to crossentropy, but numerically more stable than finalising the outputs with `softmax` and then calculating crossentropy.
+    - `Flux.binarycrossentropy`: Typically used as loss in binary classification, with labels encoded as `0` or `1`.
+    - `Flux.logitbinarycrossentropy`: Mathematically equal to binary crossentropy, but numerically more stable than finalising the outputs with `sigmoid` and then calculating binary crossentropy.
+    - `Flux.tversky_loss`: Used with imbalanced data to give more weight to false negatives.
+    - `Flux.focal_loss`: Used with highly imbalanced data. Weights harder examples more than easier examples.
+    - `Flux.binary_focal_loss`: Binary version of the above.
+  Currently MLJ measures are not supported as loss functions here.
+- `epochs::Int=10`: The number of epochs to train for. Typically, one epoch represents one pass through the entirety of the training dataset.
+- `batch_size::Int=1`: The batch size to be used for training. The batch size represents
+  the number of samples per update of the network's weights. Typically, the batch size should be
+  somewhere between 8 and 512. Smaller batch sizes lead to noisier training loss curves,
+  while larger batch sizes lead to smoother training loss curves.
+  In general, it is a good idea to pick one fairly large batch size (e.g. 32, 64, 128),
+  stick with it, and only tune the learning rate. In most examples, the batch size is set
+  to a power of two, but this is fairly arbitrary.
+- `lambda::Float64=0`: The strength of the regularization used during training. Can be any value in the range `[0, ∞)`.
+- `alpha::Float64=0`: The L2/L1 mix of regularization, in the range `[0, 1]`. A value of 0 represents L2 regularization, and a value of 1 represents L1 regularization.
+- `rng::Union{AbstractRNG, Int64}`: The random number generator/seed used during training.
+- `optimiser_changes_trigger_retraining::Bool=false`: Defines what happens when fitting a machine if the associated optimiser has changed. If true, the associated machine will retrain from scratch on `fit!`; otherwise it will not.
+- `acceleration::AbstractResource=CPU1()`: Defines on what hardware training is done. For training on GPU, use `CUDALibs()`.
+- `finaliser=Flux.softmax`: The final activation function of the neural network. Defaults to `Flux.softmax`. For a classification task, `softmax` is used for multiclass, single-label classification; `sigmoid` is used for either binary classification or multi-label classification (when there are multiple possible labels for a given sample).
+
+
+# Operations
+
+- `predict(mach, Xnew)`: return predictions of the target given new
+  features `Xnew` having the same scitype as `X` above. Predictions are
+  probabilistic but uncalibrated.
+- `predict_mode(mach, Xnew)`: Return the modes of the probabilistic predictions
+  returned above.
+
+
+# Fitted parameters
+
+The fields of `fitted_params(mach)` are:
+
+- `chain`: The trained "chain" (Flux.jl model), namely the series of layers,
+  functions, and activations which make up the neural network. This includes
+  the final layer specified by `finaliser` (eg, `softmax`).
+
+
+# Report
+
+The fields of `report(mach)` are:
+
+- `training_losses`: A vector of training losses (penalised if `lambda != 0`) in
+  historical order, of length `epochs + 1`. The first element is the pre-training loss.
+
+# Examples
+
+In this example we build a classification model using the Iris dataset.
+```julia
+using MLJ
+using Flux
+import RDatasets
+
+using Random
+Random.seed!(123)
+
+```
+This is a very basic example, using a default builder and no standardization.
+For a more advanced illustration, see [`NeuralNetworkRegressor`](@ref) or [`ImageClassifier`](@ref). First, we can load the data:
+```julia
+iris = RDatasets.dataset("datasets", "iris");
+y, X = unpack(iris, ==(:Species), rng=123);
+NeuralNetworkClassifier = @load NeuralNetworkClassifier pkg=MLJFlux
+clf = NeuralNetworkClassifier()
+```
+Next, we can train the model:
+```julia
+import Random.seed!; seed!(123)
+mach = machine(clf, X, y)
+fit!(mach)
+```
+We can train the model in an incremental fashion, provided the `optimiser_changes_trigger_retraining` flag is set to `false` (the default). Here, we change the number of iterations and the learning rate of the optimiser:
+```julia
+clf.optimiser.eta = clf.optimiser.eta * 2
+clf.epochs = clf.epochs + 5
+
+# note that if the `optimiser_changes_trigger_retraining` flag were set to true
+# the model would be completely retrained from scratch because the optimiser was
+# updated
+fit!(mach, verbosity=2);
+```
+We can inspect the mean training loss using the `cross_entropy` function:
+```julia
+
+training_loss = cross_entropy(predict(mach, X), y) |> mean
+
+```
+And we can access the Flux chain (model) using `fitted_params`:
+```julia
+chain = fitted_params(mach).chain
+```
+Finally, we can see how the out-of-sample performance changes over time, using the `learning_curve` function:
+```julia
+r = range(clf, :epochs, lower=1, upper=200, scale=:log10)
+curve = learning_curve(clf, X, y,
+                       range=r,
+                       resampling=Holdout(fraction_train=0.7),
+                       measure=cross_entropy)
+using Plots
+plot(curve.parameter_values,
+     curve.measurements,
+     xlab=curve.parameter_name,
+     xscale=curve.parameter_scale,
+     ylab = "Cross Entropy")
+
+```
+See also
+[`ImageClassifier`](@ref)
+"""
+NeuralNetworkClassifier
+
+"""
+$(MMI.doc_header(ImageClassifier))
+
+`ImageClassifier` classifies images using a neural network adapted to the type
+of images provided (color or greyscale). Predictions are probabilistic. Users
+provide a recipe for constructing the network, based on properties of the image
+encountered, by specifying an appropriate `builder`. See MLJFlux documentation
+for more on builders.
+
+# Training data
+
+In MLJ or MLJBase, bind an instance `model` to data with
+
+    mach = machine(model, X, y)
+
+Where
+- `X`: is any `AbstractVector` of images with `ColorImage` or `GrayImage`
+  scitype; check the scitype with `scitype(X)` and refer to ScientificTypes.jl
+  documentation on coercing typical image formats into an appropriate type.
+- `y`: is the target, which can be any `AbstractVector` whose element
+  scitype is `Multiclass`; check the scitype with `scitype(y)`.
+
+
+# Hyper-parameters
+
+- `builder`: An MLJFlux builder that constructs the neural network.
+  The fallback builds a depth-16 VGG architecture adapted to the image
+  size and number of target classes, with no batch normalisation; see the
+  Metalhead.jl documentation for details. See the example below for a
+  user-specified builder.
+- `optimiser::Flux.Adam()`: A `Flux.Optimise` optimiser.
+  The optimiser performs the updating of the weights of the network. For further reference, see either the examples or [the Flux optimiser documentation](https://fluxml.ai/Flux.jl/stable/training/optimisers/). To choose a learning rate (the update rate of the optimiser), a good rule of thumb is to start out at `10e-3`, and tune using powers of 10 between `1` and `1e-7`.
+- `loss=Flux.crossentropy`: The loss function which the network will optimize. Should be a function which can be called in the form `loss(yhat, y)`. Possible loss functions are listed in [the Flux loss function documentation](https://fluxml.ai/Flux.jl/stable/models/losses/). For a classification task, the most natural loss functions are:
+    - `Flux.crossentropy`: Typically used as loss in multiclass classification, with labels in a 1-hot encoded format.
+    - `Flux.logitcrossentropy`: Mathematically equal to crossentropy, but numerically more stable than finalising the outputs with `softmax` and then calculating crossentropy.
+    - `Flux.binarycrossentropy`: Typically used as loss in binary classification, with labels encoded as `0` or `1`.
+    - `Flux.logitbinarycrossentropy`: Mathematically equal to binary crossentropy, but numerically more stable than finalising the outputs with `sigmoid` and then calculating binary crossentropy.
+    - `Flux.tversky_loss`: Used with imbalanced data to give more weight to false negatives.
+    - `Flux.focal_loss`: Used with highly imbalanced data. Weights harder examples more than easier examples.
+    - `Flux.binary_focal_loss`: Binary version of the above.
+  Currently MLJ measures are not supported as loss functions here.
+- `epochs::Int=10`: The number of epochs to train for. Typically, one epoch represents one pass through the entirety of the training dataset.
+- `batch_size::Int=1`: The batch size to be used for training. The batch size
+  represents the number of samples per update of the network's weights. Batch
+  sizes between 8 and 512 are typical. Increasing batch size can speed up
+  training, especially on a GPU (`acceleration=CUDALibs()`).
+- `lambda::Float64=0`: The strength of the regularization used during training. Can be any value in the range `[0, ∞)`.
+- `alpha::Float64=0`: The L2/L1 mix of regularization, in the range `[0, 1]`. A value of 0 represents L2 regularization, and a value of 1 represents L1 regularization.
+- `rng::Union{AbstractRNG, Int64}`: The random number generator/seed used during training.
+- `optimiser_changes_trigger_retraining::Bool=false`: Defines what happens when fitting a machine if the associated optimiser has changed. If true, the associated machine will retrain from scratch on `fit!`; otherwise it will not.
+- `acceleration::AbstractResource=CPU1()`: Defines on what hardware training is done. For training on GPU, use `CUDALibs()`.
+- `finaliser=Flux.softmax`: The final activation function of the neural network,
+  needed to convert outputs to probabilities (builders do not provide this).
+
+
+# Operations
+
+- `predict(mach, Xnew)`: return predictions of the target given new
+  features `Xnew` having the same scitype as `X` above. Predictions are
+  probabilistic but uncalibrated.
+- `predict_mode(mach, Xnew)`: Return the modes of the probabilistic predictions
+  returned above.
+
+
+# Fitted parameters
+
+The fields of `fitted_params(mach)` are:
+
+- `chain`: The trained "chain" (Flux.jl model), namely the series of layers,
+  functions, and activations which make up the neural network.
This includes + the final layer specified by `finaliser` (eg, `softmax`). + + +# Report + +The fields of `report(mach)` are: +- `training_losses`: A vector of training losses (penalised if `lambda != 0`) in + historical order, of length `epochs + 1`. The first element is the pre-training loss. + +# Examples + +In this example we use MLJ to classify the MNIST image dataset +```julia +using MLJ +using Flux +import MLJFlux +import MLJIteration # for `skip` + +``` +First we want to download the MNIST dataset, and unpack into images and labels +```julia +import MLDatasets: MNIST + +images, labels = MNIST.traindata(); +``` +In MLJ, integers cannot be used for encoding categorical data, so we must coerce them into the `Multiclass` scitype: +```julia +labels = coerce(labels, Multiclass); +images = coerce(images, GrayImage); + +images[1] +``` +We start by defining a suitable `builder` object. This is a recipe +for building the neural network. Our builder will work for images of +any (constant) size, whether they be color or black and white (ie, +single or multi-channel). The architecture always consists of six +alternating convolution and max-pool layers, and a final dense +layer; the filter size and the number of channels after each +convolution layer is customisable. +```julia +import MLJFlux + +struct MyConvBuilder + filter_size::Int + channels1::Int + channels2::Int + channels3::Int +end + +make2d(x::AbstractArray) = reshape(x, :, size(x)[end]) + +function MLJFlux.build(b::MyConvBuilder, rng, n_in, n_out, n_channels) + k, c1, c2, c3 = b.filter_size, b.channels1, b.channels2, b.channels3 + mod(k, 2) == 1 || error("`filter_size` must be odd. ") + p = div(k - 1, 2) # padding to preserve image size + init = Flux.glorot_uniform(rng) + front = Chain( + Conv((k, k), n_channels => c1, pad=(p, p), relu, init=init), + MaxPool((2, 2)), + Conv((k, k), c1 => c2, pad=(p, p), relu, init=init), + MaxPool((2, 2)), + Conv((k, k), c2 => c3, pad=(p, p), relu, init=init), + MaxPool((2 ,2)), + make2d) + d = Flux.outputsize(front, (n_in..., n_channels, 1)) |> first + return Chain(front, Dense(d, n_out, init=init)) +end +``` +It is important to note that in our `build` function, there is no final `softmax`. This is applied by default in all MLJFlux classifiers (override this using the `finaliser` hyperparameter). Now that we have our builder defined, we can define the actual model. If you have a GPU, you can substitute in `acceleration=CUDALibs()` below to greatly speed up training. +```julia +ImageClassifier = @load ImageClassifier +clf = ImageClassifier(builder=MyConvBuilder(3, 16, 32, 32), + batch_size=50, + epochs=10, + rng=123) +``` +You can add flux options such as `optimiser` and `loss` in the snippet above. Currently, `loss` must be a flux-compatible loss, and not an MLJ measure. 
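+For example, to lower the learning rate and make the loss explicit, you could instead
+instantiate the model as follows (a minimal sketch; the learning-rate value shown is
+arbitrary, not a recommendation):
+```julia
+clf = ImageClassifier(builder=MyConvBuilder(3, 16, 32, 32),
+                      batch_size=50,
+                      epochs=10,
+                      rng=123,
+                      optimiser=Flux.Optimise.Adam(0.0003),  # arbitrary learning rate
+                      loss=Flux.crossentropy)
+```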
+Next, we can bind the model with the data in a machine, and fit the first 500 or so images:
+```julia
+mach = machine(clf, images, labels);
+
+fit!(mach, rows=1:500, verbosity=2);
+
+report(mach)
+
+chain = fitted_params(mach).chain
+
+Flux.params(chain)[2]
+```
+We can tack on 20 more epochs by modifying the `epochs` field, and iteratively fit some more:
+```julia
+clf.epochs = clf.epochs + 20
+fit!(mach, rows=1:500);
+```
+We can also make predictions and calculate an out-of-sample loss estimate:
+```julia
+predicted_labels = predict(mach, rows=501:1000);
+cross_entropy(predicted_labels, labels[501:1000]) |> mean
+```
+The preceding `fit!`/`predict`/evaluate workflow can alternatively be executed as follows:
+
+```julia
+evaluate!(mach,
+          resampling=Holdout(fraction_train=0.5),
+          measure=cross_entropy,
+          rows=1:1000,
+          verbosity=0)
+```
+See also
+[`NeuralNetworkClassifier`](@ref)
+"""
+ImageClassifier
+
 for Model in [:NeuralNetworkRegressor, :MultitargetNeuralNetworkRegressor]

     ex = quote
@@ -121,7 +403,7 @@ for Model in [:NeuralNetworkRegressor, :MultitargetNeuralNetworkRegressor]
         end

         function $Model(; builder::B   = Linear()
-                        , optimiser::O = Flux.Optimise.ADAM()
+                        , optimiser::O = Flux.Optimise.Adam()
                         , loss::L      = Flux.mse
                         , epochs       = 10
                         , batch_size   = 1
@@ -149,12 +431,368 @@ for Model in [:NeuralNetworkRegressor, :MultitargetNeuralNetworkRegressor]
         return model
     end

-    @doc $doc_regressor($Model) $Model
-
 end

 eval(ex)

 end

+
+"""
+$(MMI.doc_header(NeuralNetworkRegressor))
+
+`NeuralNetworkRegressor` is for training a data-dependent Flux.jl neural
+network to predict a `Continuous` target, given a table of
+`Continuous` features. Users provide a recipe for constructing the
+network, based on properties of the data that is encountered, by specifying
+an appropriate `builder`. See MLJFlux documentation for more on builders.
+
+# Training data
+
+In MLJ or MLJBase, bind an instance `model` to data with
+
+    mach = machine(model, X, y)
+
+Where
+
+- `X`: is any table of input features (eg, a `DataFrame`) whose columns
+  are of scitype `Continuous`; check the column scitypes with `schema(X)`.
+- `y`: is the target, which can be any `AbstractVector` whose element
+  scitype is `Continuous`; check the scitype with `scitype(y)`.
+
+
+# Hyper-parameters
+
+- `builder=MLJFlux.Linear(σ=Flux.relu)`: An MLJFlux builder that constructs
+  a neural network. Possible `builders` include: `MLJFlux.Linear`, `MLJFlux.Short`,
+  and `MLJFlux.MLP`. See below for an example of a user-specified builder.
+- `optimiser::Flux.Adam()`: A `Flux.Optimise` optimiser. The optimiser performs the updating
+  of the weights of the network. For further reference, see either the examples or
+  [the Flux optimiser documentation](https://fluxml.ai/Flux.jl/stable/training/optimisers/).
+  To choose a learning rate (the update rate of the optimiser), a good rule of thumb is to
+  start out at `10e-3`, and tune using powers of 10 between `1` and `1e-7`.
+- `loss=Flux.mse`: The loss function which the network will optimize. Should be a function
+  which can be called in the form `loss(yhat, y)`.
+  Possible loss functions are listed in [the Flux loss function documentation](https://fluxml.ai/Flux.jl/stable/models/losses/).
+  For a regression task, the most natural loss functions are:
+    - `Flux.mse`
+    - `Flux.mae`
+    - `Flux.msle`
+    - `Flux.huber_loss`
+  Currently MLJ measures are not supported as loss functions here.
+- `epochs::Int=10`: The number of epochs to train for.
+  Typically, one epoch represents
+  one pass through the entirety of the training dataset.
+- `batch_size::Int=1`: The batch size to be used for training. The batch size
+  represents the number of samples per update of the network's weights. Batch
+  sizes between 8 and 512 are typical. Increasing batch size can speed up
+  training, especially on a GPU (`acceleration=CUDALibs()`).
+- `lambda::Float64=0`: The strength of the regularization used during training. Can be any value
+  in the range `[0, ∞)`.
+- `alpha::Float64=0`: The L2/L1 mix of regularization, in the range `[0, 1]`. A value of 0
+  represents L2 regularization, and a value of 1 represents L1 regularization.
+- `rng::Union{AbstractRNG, Int64}`: The random number generator/seed used during training.
+- `optimiser_changes_trigger_retraining::Bool=false`: Defines what happens when fitting a machine if the associated optimiser has changed. If true, the associated machine will retrain from scratch on `fit!`; otherwise it will not.
+- `acceleration::AbstractResource=CPU1()`: Defines on what hardware training is done.
+  For training on GPU, use `CUDALibs()`.
+
+
+# Operations
+
+- `predict(mach, Xnew)`: return predictions of the target given new
+  features `Xnew` having the same scitype as `X` above. Predictions are
+  deterministic.
+
+
+# Fitted parameters
+
+The fields of `fitted_params(mach)` are:
+
+- `chain`: The trained "chain" (Flux.jl model), namely the series of layers,
+  functions, and activations which make up the neural network.
+
+
+# Report
+
+The fields of `report(mach)` are:
+
+- `training_losses`: A vector of training losses (penalised if `lambda != 0`) in
+  historical order, of length `epochs + 1`. The first element is the pre-training loss.
+
+# Examples
+
+In this example we build a regression model using the Boston house price dataset.
+```julia
+using MLJ
+using MLJFlux
+using Flux
+```
+First, we load in the data, with target `:MEDV`. We load in all features except `:CHAS`:
+```julia
+data = OpenML.load(531); # Loads from https://www.openml.org/d/531
+
+y, X = unpack(data, ==(:MEDV), !=(:CHAS); rng=123);
+
+scitype(y)
+schema(X)
+```
+Since MLJFlux models do not handle ordered factors, we can treat `:RAD` as `Continuous`:
+```julia
+X = coerce(X, :RAD=>Continuous)
+```
+Let's also make a test set:
+```julia
+(X, Xtest), (y, ytest) = partition((X, y), 0.7, multi=true);
+```
+Next, we can define a `builder`. In the following macro call, `n_in` is the number of
+expected input features, and `rng` is a RNG. `init` is the function used to generate the
+random initial weights of the network.
+```julia
+builder = MLJFlux.@builder begin
+    init=Flux.glorot_uniform(rng)
+    Chain(Dense(n_in, 64, relu, init=init),
+          Dense(64, 32, relu, init=init),
+          Dense(32, 1, init=init))
+end
+```
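+Incidentally, a similar architecture can be obtained without the macro by using one of the
+pre-defined builders mentioned above (a rough sketch for comparison only, assuming the
+`MLJFlux.MLP` builder's `hidden` and `σ` keyword arguments; it applies the same activation
+to every hidden layer and is not used further in this example):
+```julia
+# two hidden layers of sizes 64 and 32, relu activations, as in the builder above
+mlp_builder = MLJFlux.MLP(hidden=(64, 32), σ=Flux.relu)
+```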
+Finally, we can define the model:
+```julia
+NeuralNetworkRegressor = @load NeuralNetworkRegressor pkg=MLJFlux
+model = NeuralNetworkRegressor(builder=builder,
+                               rng=123,
+                               epochs=20)
+```
+We will arrange for standardization of the target by wrapping our model
+in `TransformedTargetModel`, and standardization of the features by
+inserting the wrapped model in a pipeline:
+```julia
+pipe = Standardizer |> TransformedTargetModel(model, target=Standardizer)
+```
+If we fit with a high verbosity (>1), we will see the losses during training. We can
+also see the losses in the output of `report(mach)`:
+```julia
+mach = machine(pipe, X, y)
+fit!(mach, verbosity=2)
+
+# first element initial loss, 2:end per epoch training losses
+report(mach).transformed_target_model_deterministic.training_losses
+
+```
+
+## Experimenting with learning rate
+
+We can visually compare how the learning rate affects the predictions:
+```julia
+using Plots
+
+rates = 10. .^ (-5:0)
+
+foreach(rates) do η
+    pipe.transformed_target_model_deterministic.model.optimiser.eta = η
+    fit!(mach, force=true, verbosity=0)
+    losses =
+        report(mach).transformed_target_model_deterministic.model.training_losses[3:end]
+    plot!(1:length(losses), losses, label=η)
+end
+
+
+pipe.transformed_target_model_deterministic.model.optimiser.eta = 0.0001
+
+# CV estimate, based on `(X, y)`:
+evaluate!(mach, resampling=CV(nfolds=5), measure=l2)
+
+# loss for `(Xtest, ytest)`:
+fit!(mach) # train on `(X, y)`
+yhat = predict(mach, Xtest)
+l2(yhat, ytest) |> mean
+```
+
+For implementing stopping criteria and other iteration controls, refer to examples linked
+from the MLJFlux documentation.
+
+See also
+[`MultitargetNeuralNetworkRegressor`](@ref)
+"""
+NeuralNetworkRegressor
+
+"""
+$(MMI.doc_header(MultitargetNeuralNetworkRegressor))
+
+`MultitargetNeuralNetworkRegressor` is for training a data-dependent Flux.jl
+neural network to predict a multivalued `Continuous` target, represented as a table,
+given a table of `Continuous` features. Users provide a recipe for constructing the
+network, based on properties of the data that is encountered, by specifying an
+appropriate `builder`. See MLJFlux documentation for more on builders.
+
+# Training data
+
+In MLJ or MLJBase, bind an instance `model` to data with
+
+    mach = machine(model, X, y)
+
+Where
+
+- `X`: is any table of input features (eg, a `DataFrame`) whose columns
+  are of scitype `Continuous`; check the column scitypes with `schema(X)`.
+- `y`: is the target, which can be any table of output targets whose element
+  scitype is `Continuous`; check the column scitypes with `schema(y)`.
+
+
+# Hyper-parameters
+
+- `builder=MLJFlux.Linear(σ=Flux.relu)`: An MLJFlux builder that constructs a neural
+  network. Possible `builders` include: `Linear`, `Short`, and `MLP`. You can construct
+  your own builder using the `@builder` macro; see the examples for further information.
+- `optimiser::Flux.Adam()`: A `Flux.Optimise` optimiser. The optimiser performs the
+  updating of the weights of the network. For further reference, see either the examples
+  or [the Flux optimiser
+  documentation](https://fluxml.ai/Flux.jl/stable/training/optimisers/). To choose a
+  learning rate (the update rate of the optimiser), a good rule of thumb is to start out
+  at `10e-3`, and tune using powers of 10 between `1` and `1e-7`.
+- `loss=Flux.mse`: The loss function which the network will optimize. Should be a
+  function which can be called in the form `loss(yhat, y)`.
+  Possible loss functions are
+  listed in [the Flux loss function
+  documentation](https://fluxml.ai/Flux.jl/stable/models/losses/). For a regression task,
+  the most natural loss functions are:
+    - `Flux.mse`
+    - `Flux.mae`
+    - `Flux.msle`
+    - `Flux.huber_loss`
+  Currently MLJ measures are not supported as loss functions here.
+- `epochs::Int=10`: The number of epochs to train for. Typically, one epoch represents
+  one pass through the entirety of the training dataset.
+- `batch_size::Int=1`: The batch size to be used for training. The batch size
+  represents the number of samples per update of the network's weights. Batch
+  sizes between 8 and 512 are typical. Increasing batch size can speed up
+  training, especially on a GPU (`acceleration=CUDALibs()`).
+- `lambda::Float64=0`: The strength of the regularization used during training. Can be
+  any value in the range `[0, ∞)`.
+- `alpha::Float64=0`: The L2/L1 mix of regularization, in the range `[0, 1]`. A value of
+  0 represents L2 regularization, and a value of 1 represents L1 regularization.
+- `rng::Union{AbstractRNG, Int64}`: The random number generator/seed used during
+  training.
+- `optimiser_changes_trigger_retraining::Bool=false`: Defines what happens when fitting a machine if the associated optimiser has changed. If true, the associated machine will retrain from scratch on `fit!`; otherwise it will not.
+- `acceleration::AbstractResource=CPU1()`: Defines on what hardware training is done.
+  For training on GPU, use `CUDALibs()`.
+
+# Operations
+
+- `predict(mach, Xnew)`: return predictions of the target given new
+  features `Xnew` having the same scitype as `X` above. Predictions are
+  deterministic.
+
+
+# Fitted parameters
+
+The fields of `fitted_params(mach)` are:
+
+- `chain`: The trained "chain" (Flux.jl model), namely the series of layers,
+  functions, and activations which make up the neural network.
+
+
+# Report
+
+The fields of `report(mach)` are:
+
+- `training_losses`: A vector of training losses (penalised if `lambda != 0`) in
+  historical order, of length `epochs + 1`. The first element is the pre-training loss.
+
+# Examples
+
+In this example we build a regression model using a toy dataset.
+```julia
+using MLJ
+using MLJFlux
+using Flux
+using MLJBase: augment_X
+```
+First, we generate some data:
+```julia
+X = augment_X(randn(10000, 8), true);
+θ = randn((9,2));
+y = X * θ;
+X = MLJ.table(X)
+y = MLJ.table(y)
+
+schema(y)
+schema(X)
+```
+Let's also make a test set:
+```julia
+(X, Xtest), (y, ytest) = partition((X, y), 0.7, multi=true);
+```
+Next, we can define a `builder`. In the following macro call, `n_in` is the number of
+expected input features, `n_out` is the number of target columns, and `rng` is a RNG.
+`init` is the function used to generate the random initial weights of the network.
+```julia
+builder = MLJFlux.@builder begin
+    init=Flux.glorot_uniform(rng)
+    Chain(Dense(n_in, 64, relu, init=init),
+          Dense(64, 32, relu, init=init),
+          Dense(32, n_out, init=init))  # n_out == 2 for this two-column target
+end
+```
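+As a quick sanity check of the architecture, you can reproduce the same stack of layers in
+plain Flux and confirm its output dimension matches the two-column target (an illustrative
+aside only; the layer sizes simply mirror the builder above):
+```julia
+# 8 input features, two hidden layers, 2 outputs (one per target column)
+chain = Chain(Dense(8, 64, relu), Dense(64, 32, relu), Dense(32, 2))
+size(chain(rand(Float32, 8, 5)))  # (2, 5): one row per target, one column per observation
+```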
+Finally, we can define the model:
+```julia
+MultitargetNeuralNetworkRegressor = @load MultitargetNeuralNetworkRegressor pkg=MLJFlux
+model = MultitargetNeuralNetworkRegressor(builder=builder, rng=123, epochs=20)
+```
+We will arrange for standardization of the target by wrapping our model
+in `TransformedTargetModel`, and standardization of the features by
+inserting the wrapped model in a pipeline:
+```julia
+pipe = Standardizer |> TransformedTargetModel(model, target=Standardizer)
+```
+If we fit with a high verbosity (>1), we will see the losses during training. We can
+also see the losses in the output of `report(mach)`:
+
+```julia
+mach = machine(pipe, X, y)
+fit!(mach, verbosity=2)
+
+# first element initial loss, 2:end per epoch training losses
+report(mach).transformed_target_model_deterministic.training_losses
+
+```
+
+## Experimenting with learning rate
+
+We can visually compare how the learning rate affects the predictions:
+```julia
+using Plots
+
+rates = 10. .^ (-5:0)
+
+foreach(rates) do η
+    pipe.transformed_target_model_deterministic.model.optimiser.eta = η
+    fit!(mach, force=true, verbosity=0)
+    losses =
+        report(mach).transformed_target_model_deterministic.model.training_losses[3:end]
+    plot!(1:length(losses), losses, label=η)
+end
+
+
+pipe.transformed_target_model_deterministic.model.optimiser.eta = 0.0001
+
+```
+
+With the learning rate fixed, we can now compute a CV estimate of the performance (using
+all data bound to `mach`) and compare this with performance on the test set:
+```julia
+# custom MLJ loss:
+multi_loss(yhat, y) = l2(MLJ.matrix(yhat), MLJ.matrix(y)) |> mean
+
+# CV estimate, based on `(X, y)`:
+evaluate!(mach, resampling=CV(nfolds=5), measure=multi_loss)
+
+# loss for `(Xtest, ytest)`:
+fit!(mach)
+yhat = predict(mach, Xtest)
+multi_loss(yhat, ytest)
+```
+
+See also
+[`NeuralNetworkRegressor`](@ref)
+"""
+MultitargetNeuralNetworkRegressor
+
 const Regressor = Union{NeuralNetworkRegressor, MultitargetNeuralNetworkRegressor}