diff --git a/src/discretize.jl b/src/discretize.jl
index f7a6daa30f..9015f31562 100644
--- a/src/discretize.jl
+++ b/src/discretize.jl
@@ -443,7 +443,12 @@ function SciMLBase.symbolic_discretize(pde_system::PDESystem,
         # the aggregation happens on cpu even if the losses are gpu, probably fine since it's only a few of them
         pde_losses = [pde_loss_function(θ) for pde_loss_function in pde_loss_functions]
-        bc_losses = [bc_loss_function(θ) for bc_loss_function in bc_loss_functions]
+
+        # When the BCs are imposed as hard constraints (`cons` in the OptimizationProblem),
+        # they must not also contribute to the scalar loss, so use an empty, eltype-stable vector.
+        if discretization.constrained
+            bc_losses = eltype(θ)[]
+        else
+            bc_losses = [bc_loss_function(θ) for bc_loss_function in bc_loss_functions]
+        end
 
         # this is kind of a hack, and means that whenever the outer function is evaluated the increment goes up, even if it's not being optimized
         # that's why we prefer the user to maintain the increment in the outer loop callback during optimization
@@ -454,7 +461,12 @@ function SciMLBase.symbolic_discretize(pde_system::PDESystem,
         Zygote.@ignore begin reweight_losses_func(θ, pde_losses, bc_losses) end
 
         weighted_pde_losses = adaloss.pde_loss_weights .* pde_losses
-        weighted_bc_losses = adaloss.bc_loss_weights .* bc_losses
+
+        # `sum` of the empty eltype(θ) vector below is zero(eltype(θ)), so the constrained
+        # path contributes nothing to the aggregated loss.
+        if discretization.constrained
+            weighted_bc_losses = eltype(θ)[]
+        else
+            weighted_bc_losses = adaloss.bc_loss_weights .* bc_losses
+        end
 
         sum_weighted_pde_losses = sum(weighted_pde_losses)
         sum_weighted_bc_losses = sum(weighted_bc_losses)
@@ -514,7 +526,23 @@ end
 # Convert a PDE problem into an OptimizationProblem
 function SciMLBase.discretize(pde_system::PDESystem, discretization::PhysicsInformedNN)
     pinnrep = symbolic_discretize(pde_system, discretization)
-    f = OptimizationFunction(pinnrep.loss_functions.full_loss_function,
-                             Optimization.AutoZygote())
-    Optimization.OptimizationProblem(f, pinnrep.flat_initθ)
+
+    if discretization.constrained
+        # Each BC residual becomes an equality constraint: lcons = ucons = 0.
+        function constraint_equations(θ, p)
+            [bc_loss_function(θ) for bc_loss_function in pinnrep.loss_functions.bc_loss_functions]
+        end
+        lcons = zeros(length(pinnrep.loss_functions.bc_loss_functions))
+        ucons = zeros(length(pinnrep.loss_functions.bc_loss_functions))
+        f = OptimizationFunction(pinnrep.loss_functions.full_loss_function,
+                                 Optimization.AutoZygote(),
+                                 cons = constraint_equations)
+    else
+        # `lcons`/`ucons` default to `nothing` in OptimizationProblem, so passing them
+        # through unconditionally below is safe for the unconstrained path.
+        lcons = nothing
+        ucons = nothing
+        f = OptimizationFunction(pinnrep.loss_functions.full_loss_function,
+                                 Optimization.AutoZygote())
+    end
+    Optimization.OptimizationProblem(f, pinnrep.flat_initθ, lcons = lcons, ucons = ucons)
 end
diff --git a/src/pinn_types.jl b/src/pinn_types.jl
index d25a87d4f9..57baf7abf9 100644
--- a/src/pinn_types.jl
+++ b/src/pinn_types.jl
@@ -38,6 +38,7 @@ PhysicsInformedNN(chain,
                   logger = nothing,
                   log_options = LogOptions(),
                   iteration = nothing,
+                  constrained = false,
                   kwargs...) where {iip}
 
 A `discretize` algorithm for the ModelingToolkit PDESystem interface which transforms a
@@ -63,6 +64,11 @@ methodology.
   of the neural network defining `phi`). By default this is generated from the `chain`. This
   should only be used to more directly impose functional information in the training problem,
   for example imposing the boundary condition by the test function formulation.
+* `constrained`: whether the optimization process should treat boundary conditions as
+  constraints. Defaults to `false`, which means that the boundary conditions are treated
+  as additions to the loss values. When `true`, the boundary conditions will specify equations
+  in the `OptimizationProblem` via `cons`, but will require an optimizer that can handle
+  constraint equations.
 
 (to be added)
 
@@ -90,6 +96,7 @@ struct PhysicsInformedNN{T, P, PH, DER, PE, AL, ADA, LOG, K} <: AbstractPINN
     iteration::Vector{Int64}
     self_increment::Bool
     multioutput::Bool
+    constrained::Bool
     kwargs::K
 
     @add_kwonly function PhysicsInformedNN(chain,
@@ -103,6 +110,7 @@ struct PhysicsInformedNN{T, P, PH, DER, PE, AL, ADA, LOG, K} <: AbstractPINN
                                            logger = nothing,
                                            log_options = LogOptions(),
                                            iteration = nothing,
+                                           constrained = false,
                                            kwargs...) where {iip}
         if init_params === nothing
             if chain isa AbstractArray
@@ -164,6 +172,7 @@ struct PhysicsInformedNN{T, P, PH, DER, PE, AL, ADA, LOG, K} <: AbstractPINN
                    iteration,
                    self_increment,
                    multioutput,
+                   constrained,
                    kwargs)
     end
 end
diff --git a/test/NNPDE_tests.jl b/test/NNPDE_tests.jl
index 3cdcf0c0e1..28f8f93ca7 100644
--- a/test/NNPDE_tests.jl
+++ b/test/NNPDE_tests.jl
@@ -441,6 +441,19 @@ u_predict = [[phi[i]([x, y], minimizers[i])[1] for x in xs for y in ys] for i in
 u_predict = [[phi[i]([x, y], minimizers[i])[1] for x in xs for y in ys] for i in 1:2]
 
 @test u_predict[1]≈u_real[1] atol=0.1
 @test u_predict[2]≈u_real[2] atol=0.1
+
+## Now do it with constrained
+
+discretization = NeuralPDE.PhysicsInformedNN(chain, quadrature_strategy;
+                                             init_params = initθ,
+                                             constrained = true)
+
+@named pde_system = PDESystem(eqs, bcs, domains, [x, y], [u1(x, y), u2(x, y)])
+
+prob = NeuralPDE.discretize(pde_system, discretization)
+# BFGS cannot handle the `cons` equality constraints, so constructing the problem
+# succeeds but solving it must throw; no `res` is produced to check for accuracy here.
+@test_throws Any Optimization.solve(prob, BFGS(); maxiters = 1000)
+
 # p1 =plot(xs, ys, u_predict, st=:surface);
 # p2 = plot(xs, ys, u_real, st=:surface);
 # plot(p1,p2)