diff --git a/replication/debiasing/copy.sh b/replication/debiasing/copy.sh
index 6036ab5..3df973e 100644
--- a/replication/debiasing/copy.sh
+++ b/replication/debiasing/copy.sh
@@ -1,11 +1,7 @@
 DEST="/home/will/overleaf/CKU_2023_MondrianRF/"
-cp table_d1_n1000_B800_reps3000.tex $DEST
-cp table_d2_n1000_B800_reps3000.tex $DEST
+#cp table_d1_n1000_B800_reps3000.tex $DEST
+#cp table_d2_n1000_B800_reps3000.tex $DEST
 cp table_d1_n1000_B10_reps3000.tex $DEST
 cp table_d2_n1000_B10_reps3000.tex $DEST
-cp table_d1_n1000_B5_reps3000.tex $DEST
-cp table_d2_n1000_B5_reps3000.tex $DEST
-cp table_d1_n1000_B2_reps3000.tex $DEST
-cp table_d2_n1000_B2_reps3000.tex $DEST
 cp table_d1_n1000_B1_reps3000.tex $DEST
 cp table_d2_n1000_B1_reps3000.tex $DEST
diff --git a/replication/debiasing/debiasing.jl b/replication/debiasing/debiasing.jl
index 7c1666c..64d472c 100644
--- a/replication/debiasing/debiasing.jl
+++ b/replication/debiasing/debiasing.jl
@@ -60,18 +60,14 @@ end
 function run_all()
     # tables format is (d, n, B)
     tables::Vector{Tuple{Int,Int,Int}} = [
-        #(1, 1000, 800),
-        #(2, 1000, 800),
-        #(1, 1000, 10),
+        (2, 1000, 800),
+        (1, 1000, 800),
         #(2, 1000, 10),
-        #(2, 1000, 5),
-        #(1, 1000, 5),
-        #(2, 1000, 2),
-        #(1, 1000, 2),
+        #(1, 1000, 10),
         #(2, 1000, 1),
-        (1, 1000, 1),
+        #(1, 1000, 1),
     ]
-    n_reps = 300
+    n_reps = 3000
     lifetime_methods::Vector{LifetimeMethod} = [opt::LifetimeMethod, pol::LifetimeMethod]
     lifetime_multipliers::Vector{Float64} = [0.8, 0.9, 1.0, 1.1, 1.2]
     X_dist::Distribution = Uniform(0, 1)
@@ -114,7 +110,7 @@ function run_all()
     for rep in 1:n_reps
         X::Vector{NTuple{d,Float64}} = [ntuple(j -> rand(X_dist), d) for i in 1:n]
         Y::Vector{Float64} = [mu(X[i]) + rand(eps_dist) for i in 1:n]
-        valid_indices = [i for i in 1:n_experiments
+        valid_indices = [i for i in 1:n_experiments
                          if (experiments[i].rep, experiments[i].d, experiments[i].n, experiments[i].B) == (rep, d, n, B)]
         Threads.@threads for i in valid_indices
@@ -125,16 +121,13 @@ function run_all()
             f *= "rep = $(experiment.rep)"
             rate = count / t1
             t_left = (n_experiments - count) / rate
-            println(round(t_left, digits=0), "s left, ",
-                    round(t_left / 60, digits=2), "min left")
-            println(f)
-            println("$count / $n_experiments")
-            #println(Base.summarysize(X)/1e6, " MB")
-            #mem_use = get_mem_use()
-            #println(mem_use, " MB")
-            #println()
-            #display(varinfo())
-            println()
+            if count % 100 == 0
+                println(round(t_left, digits=0), "s left, ",
+                        round(t_left / 60, digits=2), "min left")
+                println(f)
+                println("$count / $n_experiments")
+                println()
+            end
             run(experiment, X, Y)
             count += 1
         end
@@ -210,7 +203,7 @@ function get_theory(experiment::Experiment)
         experiment.sd_theory = sqrt(lambda^d * sigma2 * 0.4091^d / n)
         experiment.bias_theory = - pi^2 * d / (2 * lambda^2)
     elseif experiment.J_estimator == 1
-        C = 3.2 * 0.4091^d - 2.88 * 0.4932^d + 3.24 * 0.6137^d
+        C = 0.64 * 0.4091^d - 2.88 * 0.4932^d + 3.24 * 0.6137^d
         experiment.sd_theory = sqrt(lambda^d * sigma2 * C / n)
         experiment.bias_theory = -4 * pi^4 * d / (27 * lambda^4)
     end
@@ -229,7 +222,7 @@ function select_lifetime(X::Vector{NTuple{d,Float64}}, Y::Vector{Float64},
         denominator = sigma2 * 0.4091^d
         return (numerator / denominator)^(1 / (4+d))
     elseif J_lifetime == 1
-        C = 3.2 * 0.4091^d - 2.88 * 0.4932^d + 3.24 * 0.6137^d
+        C = 0.64 * 0.4091^d - 2.88 * 0.4932^d + 3.24 * 0.6137^d
         numerator = 128 * d * pi^8 * n
         denominator = 27^2 * sigma2 * C
         return (numerator / denominator)^(1 / (8+d))
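Note on the constant change in get_theory and select_lifetime above: for debias
order J = 1, get_debias_scaling in fast.jl (deleted below) gives scalings
[1.0, 1.5], and get_debias_coeffs solves A * coeffs = e0 with
A[r, s] = scaling[r]^(2 - 2s), which yields coeffs = [-0.8, 1.8] exactly. The
corrected leading constant 0.64 = (-0.8)^2 is consistent with the other two
terms being the cross and squared coefficients (2.88 = 2 * 0.8 * 1.8 and
3.24 = 1.8^2), while the previous 3.2 breaks that pattern. A standalone Julia
cross-check (a sketch, not part of the patch; d = 1 is an arbitrary choice):

    # Reproduce the J = 1 debias coefficients and the variance constant C.
    scaling = [1.5^r for r in 0:1]                       # [1.0, 1.5]
    A = [scaling[r]^(2 - 2 * s) for r in 1:2, s in 1:2]  # A[r, s] = scaling[r]^(2 - 2s)
    coeffs = A \ [1.0, 0.0]                              # exactly [-0.8, 1.8]
    d = 1
    C = coeffs[1]^2 * 0.4091^d +
        2 * coeffs[1] * coeffs[2] * 0.4932^d +
        coeffs[2]^2 * 0.6137^d
    # C == 0.64 * 0.4091^d - 2.88 * 0.4932^d + 3.24 * 0.6137^d, matching the patch.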
diff --git a/replication/debiasing/fast.jl b/replication/debiasing/fast.jl
deleted file mode 100644
index 03dcbe3..0000000
--- a/replication/debiasing/fast.jl
+++ /dev/null
@@ -1,256 +0,0 @@
-using Distributions
-
-mutable struct FastDebiasedMondrianForest
-    # parameters
-    const lambda::Float64
-    const n_trees::Int
-    const n_data::Int
-    const x_eval::Vector{Float64}
-    const debias_order::Int
-    const significance_level::Float64
-    # data
-    const X_data::Vector{Vector{Float64}}
-    const Y_data::Vector{Float64}
-    # estimates
-    debias_scaling::Vector{Float64}
-    debias_coeffs::Vector{Float64}
-    #trees::Matrix{MondrianTree{d}}
-    mu_hat::Float64
-    sigma2_hat::Float64
-    Sigma_hat::Float64
-    confidence_band::Vector{Tuple{Float64,Float64}}
-end
-
-function FastDebiasedMondrianForest(lambda::Float64, n_trees::Int,
-                                    x_eval::Vector{Float64},
-                                    debias_order::Int,
-                                    X_data::Vector{Vector{Float64}}, Y_data::Vector{Float64},
-                                    significance_level::Float64=0.05)
-
-    n_data = length(X_data)
-
-    forest = FastDebiasedMondrianForest(lambda, n_trees, n_data, x_eval,
-                                        debias_order, significance_level, X_data, Y_data,
-                                        Float64[], Float64[], NaN, NaN, NaN,
-                                        Tuple{Float64,Float64}[])
-
-    forest.debias_scaling = get_debias_scaling(debias_order)
-    forest.debias_coeffs = get_debias_coeffs(debias_order)
-
-    estimate_mu_hat(forest)
-    estimate_sigma2_hat(forest)
-    estimate_Sigma_hat(forest)
-    #construct_confidence_band(forest)
-    return forest
-end
-
-
-function get_debias_scaling(debias_order::Int)
-    J = debias_order
-    debias_scaling = [1.5^r for r in 0:J]
-    return debias_scaling
-end
-
-function get_debias_coeffs(debias_order::Int)
-    J = debias_order
-    debias_scaling = get_debias_scaling(debias_order)
-    A = zeros(J + 1, J + 1)
-    for r in 1:(J + 1)
-        for s in 1:(J + 1)
-            A[r, s] = debias_scaling[r]^(2 - 2 * s)
-        end
-    end
-    e0 = [[1]; [0 for _ in 1:J]]
-    debias_coeffs = A \ e0
-    return debias_coeffs
-end
-
-function estimate_mu_hat(forest::FastDebiasedMondrianForest)
-    mu_hat = 0.0
-    Y_bar = sum(forest.Y_data) / forest.n_data
-    x_eval = forest.x_eval
-    lambda = forest.lambda
-
-    @inbounds for j in 0:(forest.debias_order)
-        scaling = forest.debias_scaling[j + 1]
-        E_dist = Exponential(1 / (lambda * scaling))
-        coeff = forest.debias_coeffs[j + 1]
-        @inbounds for b in 1:(forest.n_trees)
-            E_lower = rand(E_dist, d)
-            E_upper = rand(E_dist, d)
-            cell_lower = max.(0, x_eval .- E_lower)
-            cell_upper = min.(1, x_eval .+ E_lower)
-            denom = sum(all(cell_lower .<= forest.X_data[i] .<= cell_upper)
-                        for i in 1:(forest.n_data))
-            if denom > 0
-                numer = sum(all(cell_lower .<= forest.X_data[i] .<= cell_upper)
-                            * forest.Y_data[i] for i in 1:(forest.n_data))
-                mu_hat += coeff * numer / denom
-            else
-                mu_hat += coeff * Y_bar
-            end
-        end
-    end
-
-    forest.mu_hat = mu_hat / forest.n_trees
-    return nothing
-end
-
-
-function estimate_sigma2_hat(forest::FastDebiasedMondrianForest)
-    sigma2_hat = 0.0
-    x_eval = forest.x_eval
-    lambda = forest.lambda
-    E_dist = Exponential(1 / lambda)
-
-    @inbounds for b in 1:(forest.n_trees)
-        E_lower = rand(E_dist, d)
-        E_upper = rand(E_dist, d)
-        cell_lower = max.(0, x_eval .- E_lower)
-        cell_upper = min.(1, x_eval .+ E_lower)
-        denom = sum(all(cell_lower .<= forest.X_data[i] .<= cell_upper)
-                    for i in 1:(forest.n_data))
-        if denom > 0
-            numer = sum(all(cell_lower .<= forest.X_data[i] .<= cell_upper)
-                        * (forest.Y_data[i] - forest.mu_hat)^2
-                        for i in 1:(forest.n_data))
-            sigma2_hat += numer / denom
-        end
-    end
-
-    forest.sigma2_hat = sigma2_hat / forest.n_trees
-    return nothing
-end
-
-function estimate_Sigma_hat(forest::FastDebiasedMondrianForest)
-    Sigma_hat = 0.0
-    x_eval = forest.x_eval
-    lambda = forest.lambda
-
-    @inbounds for i in 1:(forest.n_data)
-        X = forest.X_data[i]
-        A = 0.0
-        @inbounds for j in 0:(forest.debias_order)
-            scaling = forest.debias_scaling[j + 1]
-            E_dist = Exponential(1 / (lambda * scaling))
-            coeff = forest.debias_coeffs[j + 1]
-            @inbounds for b in 1:(forest.n_trees)
-                E_lower = rand(E_dist, d)
-                E_upper = rand(E_dist, d)
-                cell_lower = max.(0, x_eval .- E_lower)
-                cell_upper = min.(1, x_eval .+ E_upper)
-                if all(cell_lower .<= X .<= cell_upper)
-                    A += coeff / Ns[b, j + 1, s]
-                end
-            end
-        end
-        Sigma_hat[s] += (A / forest.n_trees)^2
-    end
-
-    @inbounds for b in 1:(forest.n_trees)
-        E_lower = rand(E_dist, d)
-        E_upper = rand(E_dist, d)
-        cell_lower = max.(0, x_eval .- E_lower)
-        cell_upper = min.(1, x_eval .+ E_upper)
-        denom = sum(all(cell_lower .<= forest.X_data[i] .<= cell_upper)
-                    for i in 1:(forest.n_data))
-        if denom > 0
-            numer = sum(all(cell_lower .<= forest.X_data[i] .<= cell_upper)
-                        * (forest.Y_data[i] - forest.mu_hat)^2
-                        for i in 1:(forest.n_data))
-            sigma2_hat += numer / denom
-        end
-    end
-
-    forest.sigma2_hat = sigma2_hat / forest.n_trees
-    return nothing
-
-
-
-
-end
-
-#=
-
-function estimate_Sigma_hat(forest::DebiasedMondrianForest{d}, Ns::Array{Int,3}) where {d}
-    Sigma_hat = [0.0 for _ in 1:(forest.n_evals)]
-
-    @inbounds Threads.@threads for s in 1:(forest.n_evals)
-        x_eval = forest.x_evals[s]
-        @inbounds for i in 1:(forest.n_data)
-            X = forest.X_data[i]
-            A = 0.0
-            @inbounds for j in 0:(forest.debias_order)
-                coeff = forest.debias_coeffs[j + 1]
-                @inbounds for b in 1:(forest.n_trees)
-                    tree = forest.trees[b, j + 1]
-                    if are_in_same_leaf(X, x_eval, tree)
-                        A += coeff / Ns[b, j + 1, s]
-                    end
-                end
-            end
-            Sigma_hat[s] += (A / forest.n_trees)^2
-        end
-    end # COV_EXCL_LINE
-
-    n_data = forest.n_data
-    lambda = forest.lambda
-    Sigma_hat .*= forest.sigma2_hat .* n_data / lambda^d
-    forest.Sigma_hat = Sigma_hat
-    return nothing
-end
-
-function construct_confidence_band(forest::DebiasedMondrianForest{d}) where {d}
-    n_data = forest.n_data
-    n_evals = forest.n_evals
-    lambda = forest.lambda
-    mu_hat = forest.mu_hat
-    q = quantile(Normal(0, 1), 1 - forest.significance_level / 2)
-    width = q .* sqrt.(forest.Sigma_hat) .* sqrt(lambda^d / n_data)
-    confidence_band = [(mu_hat[s] - width[s], mu_hat[s] + width[s]) for s in 1:n_evals]
-    forest.confidence_band = confidence_band
-    return nothing
-end
-
-"""
-Base.show(forest::DebiasedMondrianForest{d}) where {d}
-
-Show a debiased Mondrian random forest.
-"""
-function Base.show(forest::DebiasedMondrianForest{d}) where {d}
-    println("lambda: ", forest.lambda)
-    println("n_data: ", forest.n_data)
-    println("n_trees: ", forest.n_trees)
-    println("n_evals: ", length(forest.x_evals))
-    println("x_evals: ", forest.x_evals)
-    println("debias_order: ", forest.debias_order)
-    println("debias_scaling: ", forest.debias_scaling)
-    println("debias_coeffs: ", forest.debias_coeffs)
-    println("mu_hat: ", forest.mu_hat)
-    println("sigma2_hat: ", forest.sigma2_hat)
-    return println("Sigma_hat: ", forest.Sigma_hat)
-end
-=#
-
-
-lambda = 10.0
-n_trees = 1000
-d = 1
-n = 1000
-x_eval = [0.5 for _ in 1:d]
-debias_order = 0
-significance_level = 0.05
-
-X_dist = Uniform(0, 1)
-mu = (x -> sum(sin.(pi .* x)))
-sigma = 0.1
-eps_dist = Normal(0, sigma)
-X = [[rand(X_dist) for j in 1:d] for i in 1:n]
-Y = [mu(X[i]) + rand(eps_dist) for i in 1:n]
-
-forest = FastDebiasedMondrianForest(lambda, n_trees,
-                                    x_eval,
-                                    debias_order,
-                                    X, Y,
-                                    significance_level)
-println(forest.mu_hat)
-println(forest.sigma2_hat)
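Note on the fast.jl deletion above: the file avoided growing Mondrian trees by
sampling, for each tree, the cell containing x_eval directly, using the fact
that the cell of a fixed point in a Mondrian partition with lifetime lambda has
boundary distances that are independent Exponential variables with rate lambda,
censored at [0, 1]. The deleted code had visible bugs, which presumably
motivated the removal: estimate_mu_hat and estimate_sigma2_hat sample E_upper
but reuse E_lower for the upper cell boundary, the dimension d is read from a
global rather than passed in, and estimate_Sigma_hat references the undefined
Ns and s, indexes the scalar Sigma_hat, and contains a pasted copy of the
sigma2_hat loop. A minimal corrected cell sampler, as a sketch
(sample_mondrian_cell is a hypothetical name, not part of the codebase):

    using Distributions

    # Sample the Mondrian cell containing x_eval: each boundary distance is
    # Exponential with rate lambda (scale 1 / lambda), censored at [0, 1].
    function sample_mondrian_cell(x_eval::Vector{Float64}, lambda::Float64)
        d = length(x_eval)  # dimension from the input, not a global
        E_dist = Exponential(1 / lambda)
        E_lower = rand(E_dist, d)
        E_upper = rand(E_dist, d)
        cell_lower = max.(0.0, x_eval .- E_lower)
        cell_upper = min.(1.0, x_eval .+ E_upper)  # E_upper, not E_lower
        return cell_lower, cell_upper
    end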
diff --git a/replication/debiasing/tables.jl b/replication/debiasing/tables.jl
index 70de4c6..6422a82 100644
--- a/replication/debiasing/tables.jl
+++ b/replication/debiasing/tables.jl
@@ -77,6 +77,10 @@ function make_table(df)
             cell = "$cell\\%"
         elseif col == :lifetime_multiplier
             cell = @sprintf "%.1f" cell
+        elseif col == :lambda
+            cell = @sprintf "%.2f" cell
+        elseif col == :average_width
+            cell = @sprintf "%.3f" cell
         elseif isa(cell, Float64)
             cell = @sprintf "%.4f" cell
         end
diff --git a/replication/debiasing/test_sigma.jl b/replication/debiasing/test_sigma.jl
deleted file mode 100644
index f1c3728..0000000
--- a/replication/debiasing/test_sigma.jl
+++ /dev/null
@@ -1,26 +0,0 @@
-using Distributions
-using MondrianForests
-
-d = 1
-n = 200
-B = 200
-n_reps = 20
-X_dist = Uniform(0, 1)
-mu = (x -> sum(sin.(pi .* x)))
-sigma = 0.3
-eps_dist = Normal(0, sigma)
-lambda = 10.0
-x_evals = [ntuple(j -> 0.5, d)]
-J = 0
-Sigma_hat = 0.0
-
-t0 = time()
-for rep in 1:n_reps
-    println(rep)
-    X = [ntuple(j -> rand(X_dist), d) for i in 1:n]
-    Y = [mu(X[i]) + rand(eps_dist) for i in 1:n]
-    forest = DebiasedMondrianForest(lambda, B, x_evals, J, X, Y, true)
-    global Sigma_hat += forest.Sigma_hat[] / n_reps
-end
-println(time() - t0)
-println(Sigma_hat)
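Note on the tables.jl change above: lambda cells are now printed with two
decimal places and average_width with three, ahead of the existing four-decimal
default for other Float64 cells. Branch order matters here, since :lambda and
:average_width cells are themselves Float64 and would otherwise fall through to
the "%.4f" default. A minimal illustration of the dispatch (format_cell is a
hypothetical helper; the real logic lives inline in make_table):

    using Printf

    function format_cell(col::Symbol, cell)
        if col == :lifetime_multiplier
            return @sprintf("%.1f", cell)
        elseif col == :lambda
            return @sprintf("%.2f", cell)
        elseif col == :average_width
            return @sprintf("%.3f", cell)
        elseif isa(cell, Float64)
            return @sprintf("%.4f", cell)
        end
        return string(cell)
    end

    format_cell(:lambda, 7.34567)         # "7.35"
    format_cell(:average_width, 0.12345)  # "0.123"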