diff --git a/NEWS.md b/NEWS.md
index 9b0ef1caf..86cd981f7 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,9 +1,7 @@
 # v4 Breaking changes
 
-1. The main change in this breaking release has been the way mini-batching is handled. The data argument in the solve call and the implicit iteration of that in the callback has been removed,
-the stochastic solvers (Optimisers.jl and Sophia) now handle it explicitly. You would now pass in a DataLoader to OptimziationProblem as the second argument to the objective etc (p) if you
-want to do minibatching, else for full batch just pass in the full data.
+ 1. The main change in this breaking release is how mini-batching is handled. The data argument to the solve call and the implicit iteration over it in the callback have been removed;
+    the stochastic solvers (Optimisers.jl and Sophia) now handle it explicitly. To do mini-batching, pass a DataLoader to OptimizationProblem as the second argument to the objective (p);
+    for full batch, simply pass in the full data.
 
-2. The support for extra returns from objective function has been removed. Now the objective should only return a scalar loss value, hence callback doesn't take extra arguments other than the state and loss value.
-
-
+ 2. Support for extra return values from the objective function has been removed. The objective should now return only a scalar loss value, so the callback no longer takes extra arguments beyond the state and the loss value.
diff --git a/Project.toml b/Project.toml
index a7f52a394..e484ab75c 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "Optimization"
 uuid = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
-version = "4.0.2"
+version = "4.0.3"
 
 [deps]
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
@@ -11,7 +11,6 @@ LBFGSB = "5be7bae1-8223-5378-bac3-9e7378a2f6e6"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 LoggingExtras = "e6f89c97-d47a-5376-807f-9c37f3926c36"
-MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
 OptimizationBase = "bca83a33-5cc9-4baa-983d-23429ab6bcbb"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 ProgressLogging = "33c8b6b6-d38a-422a-b730-caa89a2f386c"
@@ -29,16 +28,11 @@ LBFGSB = "0.4.1"
 LinearAlgebra = "1.10"
 Logging = "1.10"
 LoggingExtras = "0.4, 1"
-MLUtils = "0.4.4"
 OptimizationBase = "2"
 Printf = "1.10"
 ProgressLogging = "0.1"
 Reexport = "1.2"
 SciMLBase = "2.39.0"
 SparseArrays = "1.10"
-Symbolics = "5.12"
 TerminalLoggers = "0.1"
 julia = "1.9"
-
-[extras]
-Symbolics = "0c5d862f-8b57-4792-8d23-62f2024744c7"
diff --git a/docs/src/index.md b/docs/src/index.md
index 95473e8e8..34f3edd07 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -54,21 +54,21 @@ to add the specific wrapper packages.
 ```@raw html
BlackBoxOptim - - **Global Methods** + - Global Methods - Zeroth order - Unconstrained - Box Constraints
CMAEvolutionaryStrategy - - **Global Methods** + - Global Methods - Zeroth order - Unconstrained - Box Constraints
Evolutionary - - **Global Methods** + - Global Methods - Zeroth order - Unconstrained - Box Constraints @@ -76,38 +76,38 @@ to add the specific wrapper packages.
GCMAES - - **Global Methods** + - Global Methods - First order - Box Constraints - Unconstrained
Manopt - - **Local Methods** + - Local Methods - First order - Second order - Zeroth order - Box Constraints - Constrained 🟡 - - **Global Methods** + - Global Methods - Zeroth order - Unconstrained
MathOptInterface - - **Local Methods** + - Local Methods - First order - Second order - Box Constraints - Constrained - - **Global Methods** + - Global Methods - First order - Second order - Constrained
MultistartOptimization - - **Global Methods** + - Global Methods - Zeroth order - First order - Second order @@ -115,14 +115,14 @@ to add the specific wrapper packages.
Metaheuristics - - **Global Methods** + - Global Methods - Zeroth order - Unconstrained - Box Constraints
NOMAD - - **Global Methods** + - Global Methods - Zeroth order - Unconstrained - Box Constraints @@ -130,13 +130,13 @@ to add the specific wrapper packages.
NLopt - - **Local Methods** + - Local Methods - First order - Zeroth order - Second order 🟡 - Box Constraints - Local Constrained 🟡 - - **Global Methods** + - Global Methods - Zeroth order - First order - Unconstrained @@ -144,20 +144,20 @@ to add the specific wrapper packages.
Optim - - **Local Methods** + - Local Methods - Zeroth order - First order - Second order - Box Constraints - Constrained - - **Global Methods** + - Global Methods - Zeroth order - Unconstrained - Box Constraints
PRIMA - - **Local Methods** + - Local Methods - Derivative-Free: ✅ - **Constraints** - Box Constraints: ✅ @@ -167,13 +167,15 @@ to add the specific wrapper packages. QuadDIRECT - **Constraints** - Box Constraints: ✅ - - **Global Methods** + - Global Methods - Unconstrained: ✅
``` + 🟡 = supported in downstream library but not yet implemented in `Optimization.jl`; PR to add this functionality are welcome ## Citation + ``` @software{vaibhav_kumar_dixit_2023_7738525, author = {Vaibhav Kumar Dixit and Christopher Rackauckas}, @@ -185,37 +187,48 @@ to add the specific wrapper packages. url = {https://doi.org/10.5281/zenodo.7738525}, year = 2023} ``` + ## Reproducibility + ```@raw html
The documentation of this SciML package was built using these direct dependencies, ``` + ```@example using Pkg # hide Pkg.status() # hide ``` + ```@raw html
``` + ```@raw html
and using this machine and Julia version. ``` + ```@example using InteractiveUtils # hide versioninfo() # hide ``` + ```@raw html
``` + ```@raw html
A more complete overview of all dependencies and their versions is also provided. ``` + ```@example using Pkg # hide Pkg.status(; mode = PKGMODE_MANIFEST) # hide ``` + ```@raw html
``` + ```@eval using TOML using Markdown diff --git a/docs/src/tutorials/certification.md b/docs/src/tutorials/certification.md index 9ecdc0c35..09132c2f3 100644 --- a/docs/src/tutorials/certification.md +++ b/docs/src/tutorials/certification.md @@ -7,7 +7,7 @@ This works with the `structural_analysis` keyword argument to `OptimizationProbl We'll use a simple example to illustrate the convexity structure certification process. ```@example symanalysis -using SymbolicAnalysis, Zygote, LinearAlgebra, Optimization, OptimizationMOI +using SymbolicAnalysis, Zygote, LinearAlgebra, Optimization function f(x, p = nothing) return exp(x[1]) + x[1]^2 diff --git a/docs/src/tutorials/minibatch.md b/docs/src/tutorials/minibatch.md index 08f362f71..8748bd066 100644 --- a/docs/src/tutorials/minibatch.md +++ b/docs/src/tutorials/minibatch.md @@ -54,7 +54,7 @@ end function loss_adjoint(fullp, data) batch, time_batch = data pred = predict_adjoint(fullp, time_batch) - sum(abs2, batch .- pred), pred + sum(abs2, batch .- pred) end k = 10 diff --git a/lib/OptimizationOptimJL/Project.toml b/lib/OptimizationOptimJL/Project.toml index b8bcedb5c..5349f7885 100644 --- a/lib/OptimizationOptimJL/Project.toml +++ b/lib/OptimizationOptimJL/Project.toml @@ -1,7 +1,7 @@ name = "OptimizationOptimJL" uuid = "36348300-93cb-4f02-beb5-3c3902f8871e" authors = ["Vaibhav Dixit and contributors"] -version = "0.4.0" +version = "0.4.1" [deps] Optim = "429524aa-4258-5aef-a3af-852621145aeb" diff --git a/lib/OptimizationOptimJL/src/OptimizationOptimJL.jl b/lib/OptimizationOptimJL/src/OptimizationOptimJL.jl index aea9ada02..b2d6db4f8 100644 --- a/lib/OptimizationOptimJL/src/OptimizationOptimJL.jl +++ b/lib/OptimizationOptimJL/src/OptimizationOptimJL.jl @@ -26,6 +26,7 @@ function SciMLBase.requireshessian(opt::Union{ true end SciMLBase.requiresgradient(opt::Optim.Fminbox) = true +# SciMLBase.allowsfg(opt::Union{Optim.AbstractOptimizer, Optim.ConstrainedOptimizer, Optim.Fminbox, Optim.SAMIN}) = true function __map_optimizer_args(cache::OptimizationCache, opt::Union{Optim.AbstractOptimizer, Optim.Fminbox, @@ -142,11 +143,11 @@ function SciMLBase.__solve(cache::OptimizationCache{ θ = metadata[cache.opt isa Optim.NelderMead ? "centroid" : "x"] opt_state = Optimization.OptimizationState(iter = trace.iteration, u = θ, - objective = x[1], + objective = trace.value, grad = get(metadata, "g(x)", nothing), hess = get(metadata, "h(x)", nothing), original = trace) - cb_call = cache.callback(opt_state, x...) + cb_call = cache.callback(opt_state, trace.value) if !(cb_call isa Bool) error("The callback should return a boolean `halt` for whether to stop the optimization process.") end @@ -261,11 +262,11 @@ function SciMLBase.__solve(cache::OptimizationCache{ metadata["x"] opt_state = Optimization.OptimizationState(iter = trace.iteration, u = θ, - objective = x[1], + objective = trace.value, grad = get(metadata, "g(x)", nothing), hess = get(metadata, "h(x)", nothing), original = trace) - cb_call = cache.callback(opt_state, x...) + cb_call = cache.callback(opt_state, trace.value) if !(cb_call isa Bool) error("The callback should return a boolean `halt` for whether to stop the optimization process.") end @@ -277,14 +278,19 @@ function SciMLBase.__solve(cache::OptimizationCache{ __x = first(x) return cache.sense === Optimization.MaxSense ? -__x : __x end - fg! = function (G, θ) - if G !== nothing - cache.f.grad(G, θ) - if cache.sense === Optimization.MaxSense - G .*= -one(eltype(G)) + + if cache.f.fg === nothing + fg! 
= function (G, θ) + if G !== nothing + cache.f.grad(G, θ) + if cache.sense === Optimization.MaxSense + G .*= -one(eltype(G)) + end end + return _loss(θ) end - return _loss(θ) + else + fg! = cache.f.fg end gg = function (G, θ) @@ -344,9 +350,9 @@ function SciMLBase.__solve(cache::OptimizationCache{ u = metadata["x"], grad = get(metadata, "g(x)", nothing), hess = get(metadata, "h(x)", nothing), - objective = x[1], + objective = trace.value, original = trace) - cb_call = cache.callback(opt_state, x...) + cb_call = cache.callback(opt_state, trace.value) if !(cb_call isa Bool) error("The callback should return a boolean `halt` for whether to stop the optimization process.") end @@ -358,15 +364,21 @@ function SciMLBase.__solve(cache::OptimizationCache{ __x = first(x) return cache.sense === Optimization.MaxSense ? -__x : __x end - fg! = function (G, θ) - if G !== nothing - cache.f.grad(G, θ) - if cache.sense === Optimization.MaxSense - G .*= -one(eltype(G)) + + if cache.f.fg === nothing + fg! = function (G, θ) + if G !== nothing + cache.f.grad(G, θ) + if cache.sense === Optimization.MaxSense + G .*= -one(eltype(G)) + end end + return _loss(θ) end - return _loss(θ) + else + fg! = cache.f.fg end + gg = function (G, θ) cache.f.grad(G, θ) if cache.sense === Optimization.MaxSense @@ -434,7 +446,7 @@ PrecompileTools.@compile_workload begin function obj_f(x, p) A = p[1] b = p[2] - return sum((A * x - b) .^ 2) + return sum((A * x .- b) .^ 2) end function solve_nonnegative_least_squares(A, b, solver) diff --git a/lib/OptimizationOptimisers/Project.toml b/lib/OptimizationOptimisers/Project.toml index e03709fd9..b0e763c2f 100644 --- a/lib/OptimizationOptimisers/Project.toml +++ b/lib/OptimizationOptimisers/Project.toml @@ -1,7 +1,7 @@ name = "OptimizationOptimisers" uuid = "42dfb2eb-d2b4-4451-abcd-913932933ac1" authors = ["Vaibhav Dixit and contributors"] -version = "0.3.2" +version = "0.3.3" [deps] Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2" @@ -10,17 +10,7 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" ProgressLogging = "33c8b6b6-d38a-422a-b730-caa89a2f386c" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" -[weakdeps] -MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40" -MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54" - -[extensions] -OptimizationOptimisersMLDataDevicesExt = "MLDataDevices" -OptimizationOptimisersMLUtilsExt = "MLUtils" - [compat] -MLDataDevices = "1.1" -MLUtils = "0.4.4" Optimisers = "0.2, 0.3" Optimization = "4" ProgressLogging = "0.1" diff --git a/lib/OptimizationOptimisers/ext/OptimizationOptimisersMLDataDevicesExt.jl b/lib/OptimizationOptimisers/ext/OptimizationOptimisersMLDataDevicesExt.jl deleted file mode 100644 index 545f73c6c..000000000 --- a/lib/OptimizationOptimisers/ext/OptimizationOptimisersMLDataDevicesExt.jl +++ /dev/null @@ -1,8 +0,0 @@ -module OptimizationOptimisersMLDataDevicesExt - -using MLDataDevices -using OptimizationOptimisers - -OptimizationOptimisers.isa_dataiterator(::DeviceIterator) = true - -end diff --git a/lib/OptimizationOptimisers/ext/OptimizationOptimisersMLUtilsExt.jl b/lib/OptimizationOptimisers/ext/OptimizationOptimisersMLUtilsExt.jl deleted file mode 100644 index 1790d7aea..000000000 --- a/lib/OptimizationOptimisers/ext/OptimizationOptimisersMLUtilsExt.jl +++ /dev/null @@ -1,8 +0,0 @@ -module OptimizationOptimisersMLUtilsExt - -using MLUtils -using OptimizationOptimisers - -OptimizationOptimisers.isa_dataiterator(::MLUtils.DataLoader) = true - -end diff --git a/lib/OptimizationOptimisers/src/OptimizationOptimisers.jl 
b/lib/OptimizationOptimisers/src/OptimizationOptimisers.jl index 67583ce1c..99743d24d 100644 --- a/lib/OptimizationOptimisers/src/OptimizationOptimisers.jl +++ b/lib/OptimizationOptimisers/src/OptimizationOptimisers.jl @@ -2,7 +2,7 @@ module OptimizationOptimisers using Reexport, Printf, ProgressLogging @reexport using Optimisers, Optimization -using Optimization.SciMLBase +using Optimization.SciMLBase, Optimization.OptimizationBase SciMLBase.supports_opt_cache_interface(opt::AbstractRule) = true SciMLBase.requiresgradient(opt::AbstractRule) = true @@ -16,8 +16,6 @@ function SciMLBase.__init( kwargs...) end -isa_dataiterator(data) = false - function SciMLBase.__solve(cache::OptimizationCache{ F, RC, @@ -59,7 +57,7 @@ function SciMLBase.__solve(cache::OptimizationCache{ throw(ArgumentError("The number of epochs must be specified as the epochs or maxiters kwarg.")) end - if isa_dataiterator(cache.p) + if OptimizationBase.isa_dataiterator(cache.p) data = cache.p dataiterate = true else diff --git a/src/sophia.jl b/src/sophia.jl index 5419b87d7..88b0812c3 100644 --- a/src/sophia.jl +++ b/src/sophia.jl @@ -1,5 +1,3 @@ -using Optimization.LinearAlgebra, MLUtils - struct Sophia η::Float64 βs::Tuple{Float64, Float64} @@ -64,7 +62,7 @@ function SciMLBase.__solve(cache::OptimizationCache{ maxiters = Optimization._check_and_convert_maxiters(cache.solver_args.maxiters) - if cache.p isa MLUtils.DataLoader + if OptimizationBase.isa_dataiterator(cache.p) data = cache.p dataiterate = true else diff --git a/test/diffeqfluxtests.jl b/test/diffeqfluxtests.jl index 243027246..2e5142991 100644 --- a/test/diffeqfluxtests.jl +++ b/test/diffeqfluxtests.jl @@ -31,11 +31,11 @@ end function loss_adjoint(p) prediction = predict_adjoint(p) loss = sum(abs2, x - 1 for x in prediction) - return loss, prediction + return loss end iter = 0 -callback = function (state, l, pred) +callback = function (state, l) display(l) # using `remake` to re-create our `prob` with current parameters `p` @@ -81,11 +81,11 @@ end function loss_neuralode(p) pred = predict_neuralode(p) loss = sum(abs2, ode_data .- pred) - return loss, pred + return loss end iter = 0 -callback = function (st, l, pred...) +callback = function (st, l) global iter iter += 1 diff --git a/test/minibatch.jl b/test/minibatch.jl index a1b08a439..4e0ca6ce8 100644 --- a/test/minibatch.jl +++ b/test/minibatch.jl @@ -45,7 +45,7 @@ end function loss_adjoint(fullp, p) (batch, time_batch) = p pred = predict_adjoint(fullp, time_batch) - sum(abs2, batch .- pred), pred + sum(abs2, batch .- pred) end k = 10
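To make the new v4 calling convention concrete, here is a minimal usage sketch matching the NEWS.md entry and the updated minibatch tutorial and tests in this diff: the objective returns only a scalar loss, the callback takes just the state and that loss, and mini-batching is requested by passing an `MLUtils.DataLoader` as the `p` argument of `OptimizationProblem`. The toy regression data, parameter names, Adam step size, and the `AutoZygote`/Zygote choice below are illustrative assumptions, not part of this PR.

```julia
using Optimization, OptimizationOptimisers, MLUtils, Zygote

# Illustrative data: recover w ≈ 2 from mini-batches of (x, y) pairs.
x = collect(range(0.0, 1.0, length = 100))
y = 2.0 .* x
loader = MLUtils.DataLoader((x, y), batchsize = 10)

# v4: the objective returns only a scalar loss; the current batch arrives as the second argument.
function loss(w, data)
    xb, yb = data
    return sum(abs2, yb .- w[1] .* xb)
end

# v4: the callback receives the optimization state and the scalar loss, nothing else.
callback = function (state, l)
    return false  # return true to halt early
end

optf = OptimizationFunction(loss, Optimization.AutoZygote())
prob = OptimizationProblem(optf, [0.0], loader)  # DataLoader passed where `p` used to go
sol = solve(prob, Optimisers.Adam(0.05); callback = callback, epochs = 100)
```

For full-batch optimization, the same code applies with the raw data tuple passed as `p` in place of the `DataLoader`.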