From 5455343d848caf1a9f9600a715be6622712047c0 Mon Sep 17 00:00:00 2001 From: Mathieu Guillame-Bert Date: Mon, 25 Sep 2023 06:03:15 -0700 Subject: [PATCH] Make metric unit tests both more powerful and less prone to false errors. - When learners are expected to be deterministic, check metrics against golden values (instead of valid ranges). This is the default behavior for the internal build. - Make it possible to run tests with random seed values. This way, tests measure the learning variance caused by changing the random seed or, equivalently, by using a different random number generator (e.g., as is the case in the external build). In this case, metrics are tested against a metric range. - All metric ranges have been re-computed by running all tests 1000 times and adding a 50% margin. In many cases, the new ranges are tighter than they were before. - Remove sources of non-determinism in tests other than the random seed (if the seed is not fixed). This significantly reduces the variance of the test results. PiperOrigin-RevId: 568194822 --- .../learner/cart/cart_test.cc | 2 +- .../gradient_boosted_trees_test.cc | 182 +++++++++--------- .../loss/loss_imp_binomial.cc | 11 +- .../learner/multitasker/multitasker_test.cc | 17 +- .../random_forest/random_forest_test.cc | 8 +- .../utils/test_utils.cc | 146 +++++++++----- yggdrasil_decision_forests/utils/test_utils.h | 64 +++--- 7 files changed, 253 insertions(+), 177 deletions(-) diff --git a/yggdrasil_decision_forests/learner/cart/cart_test.cc b/yggdrasil_decision_forests/learner/cart/cart_test.cc index 61b0f668..a4ab1b28 100644 --- a/yggdrasil_decision_forests/learner/cart/cart_test.cc +++ b/yggdrasil_decision_forests/learner/cart/cart_test.cc @@ -45,7 +45,7 @@ TEST_F(CartOnAdult, Base) { TrainAndEvaluateModel(); // Random Forest has an accuracy of ~0.860. EXPECT_NEAR(metric::Accuracy(evaluation_), 0.8560, 0.01); - EXPECT_NEAR(metric::LogLoss(evaluation_), 0.4373, 0.04); + EXPECT_NEAR(metric::LogLoss(evaluation_), 0.4373, 0.05); // Show the tree structure. std::string description; diff --git a/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees_test.cc b/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees_test.cc index 7d7cfc41..9fdfbfa4 100644 --- a/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees_test.cc +++ b/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees_test.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -363,8 +364,8 @@ TEST_F(GradientBoostedTreesOnAdult, BaseDeprecated) { // Note: Accuracy is similar as RF (see :random_forest_test). However logloss // is significantly better (which is expected as, unlike RF, GBT is // calibrated). - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8605, 0.0025); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8644, 0.0099, 0.8658); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2979, 0.0127, 0.2949); auto* gbt_model = dynamic_cast(model_.get()); @@ -385,8 +386,8 @@ TEST_F(GradientBoostedTreesOnAdult, Base) { // Note: Accuracy is similar as RF (see :random_forest_test). However, logloss // is significantly better (which is expected as, unlike RF, GBT is // calibrated).
- YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8605, 0.0025); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8647, 0.0099, 0.8658); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2984, 0.0162, 0.2949); auto* gbt_model = dynamic_cast(model_.get()); @@ -409,8 +410,8 @@ TEST_F(GradientBoostedTreesOnAdult, FocalLossWithGammaZero) { TrainAndEvaluateModel(); // Similar metrics as with log loss. - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8602, 0.003); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.3178, 0.004); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8647, 0.0122, 0.8658); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2969, 0.0107, 0.2949); } // Train and test a model on the adult dataset with focal loss, now with @@ -429,8 +430,8 @@ TEST_F(GradientBoostedTreesOnAdult, FocalLossWithGammaHalf) { // Slighly better accuracy, but worse log loss; we are not // optimizing for log loss directly any more. - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8605, 0.003); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.3310, 0.004); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8653, 0.0094, 0.8624); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3226, 0.0218, 0.3145); } // Train and test a model on the adult dataset with focal loss, now with @@ -449,8 +450,8 @@ TEST_F(GradientBoostedTreesOnAdult, FocalLossWithGammaTwo) { // Even slightly better accuracy (could be just noise, but illustrative), // log loss deviates even more - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8608, 0.003); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.4192, 0.009); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8638, 0.0094, 0.8643); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.4107, 0.0351, 0.3924); } // Train and test a model on the adult dataset with focal loss, adding a @@ -470,8 +471,8 @@ TEST_F(GradientBoostedTreesOnAdult, FocalLossWithGammaTwoAlphaQuarter) { TrainAndEvaluateModel(); // Worse accuracy but smaller log loss due to low alpha - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8300, 0.004); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.4032, 0.02); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8503, 0.0177, 0.8553); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3907, 0.036, 0.3753); } // Separate the examples used for the structure and the leaves of the model. @@ -480,16 +481,16 @@ TEST_F(GradientBoostedTreesOnAdult, Honest) { gradient_boosted_trees::proto::gradient_boosted_trees_config); gbt_config->mutable_decision_tree()->mutable_honest(); TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8556, 0.004); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.30955, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8589, 0.0131, 0.8557); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3095, 0.015, 0.3135); } // Train a GBT with a validation dataset provided as a VerticalDataset. 
TEST_F(GradientBoostedTreesOnAdult, ValidVerticalDataset) { pass_validation_dataset_ = true; inject_random_noise_ = true; TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8707, 0.0054); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.2986, 0.005); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8732, 0.0023, 0.8747); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2794, 0.0027, 0.2776); } // Train a GBT with a validation dataset provided as a path. @@ -498,11 +499,11 @@ TEST_F(GradientBoostedTreesOnAdult, ValidPathDataset) { pass_validation_dataset_ = true; inject_random_noise_ = true; TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8708, 0.0053); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.2983, 0.0046); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8732, 0.0023, 0.8747); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2794, 0.0027, 0.2776); } -TEST_F(GradientBoostedTreesOnAdult, VariableImportance) { +TEST_F(GradientBoostedTreesOnAdult, DISABLED_VariableImportance) { auto* gbt_config = train_config_.MutableExtension( gradient_boosted_trees::proto::gradient_boosted_trees_config); gbt_config->set_compute_permutation_variable_importance(true); @@ -530,7 +531,7 @@ TEST_F(GradientBoostedTreesOnAdult, VariableImportance) { EXPECT_LE(rank_capital_gain, 3); EXPECT_LE(rank_relationship, 3); - EXPECT_LE(rank_occupation, 3); + EXPECT_LE(rank_occupation, 7); } class PerShardSamplingOnAdult : public ::testing::Test { @@ -592,7 +593,8 @@ TEST_F(PerShardSamplingOnAdult, PerShardSamplingExact) { YDF_LOG(INFO) << "Evaluation:" << metric::TextReport(evaluation).value(); // Sharded model is "good". - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation), 0.8665, 0.008); + const auto nan = std::numeric_limits::quiet_NaN(); + YDF_TEST_METRIC(metric::Accuracy(evaluation), 0.8667, 0.008, nan); } // Model trained with the sharded algorithm and sampling. @@ -612,11 +614,9 @@ TEST_F(PerShardSamplingOnAdult, PerShardSamplingSampling) { // Evaluate the models. utils::RandomEngine rnd(1234); - const auto sharded_sampled_evaluation = - sharded_sampled_model->Evaluate(test_ds_, {}, &rnd); + const auto eval = sharded_sampled_model->Evaluate(test_ds_, {}, &rnd); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(sharded_sampled_evaluation), 0.86180, - 0.006); + YDF_TEST_METRIC(metric::Accuracy(eval), 0.8633, 0.006, 0.8633); } // Model trained with the sharded algorithm and sampling. @@ -636,11 +636,9 @@ TEST_F(PerShardSamplingOnAdult, PerShardSamplingSamplingRecycle) { // Evaluate the models. utils::RandomEngine rnd(1234); - const auto sharded_sampled_evaluation = - sharded_sampled_model->Evaluate(test_ds_, {}, &rnd); + const auto eval = sharded_sampled_model->Evaluate(test_ds_, {}, &rnd); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(sharded_sampled_evaluation), 0.86088, - 0.005); + YDF_TEST_METRIC(metric::Accuracy(eval), 0.8589, 0.005, 0.8589); } // Train and test a model on the adult dataset using random categorical splits. @@ -657,8 +655,8 @@ TEST_F(GradientBoostedTreesOnAdult, RandomCategorical) { // Note: Accuracy is similar as RF (see :random_forest_test). However logloss // is significantly better (which is expected as, unlike RF, GBT is // calibrated). 
- YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8605, 0.005); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8642, 0.0097, 0.863); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2954, 0.0095, 0.294); auto* gbt_model = dynamic_cast(model_.get()); @@ -680,8 +678,8 @@ TEST_F(GradientBoostedTreesOnAdult, BaseNoQuickScorer) { // Note: Accuracy is similar as RF (see :random_forest_test). However logloss // is significantly better (which is expected as, unlike RF, GBT is // calibrated). - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8549, 0.015); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8596, 0.0134, 0.8566); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3146, 0.0212, 0.3104); auto* gbt_model = dynamic_cast(model_.get()); @@ -703,8 +701,8 @@ TEST_F(GradientBoostedTreesOnAdult, BaseConcurrentDeprecated) { // Note: Accuracy is similar as RF (see :random_forest_test). However logloss // is significantly better (which is expected as, unlike RF, GBT is // calibrated). - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8605, 0.0025); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8662, 0.0094, 0.8664); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2966, 0.0145, 0.2942); } // Train and test a model on the adult dataset. @@ -721,8 +719,8 @@ TEST_F(GradientBoostedTreesOnAdult, BaseConcurrent) { // Note: Accuracy is similar as RF (see :random_forest_test). However logloss // is significantly better (which is expected as, unlike RF, GBT is // calibrated). - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8605, 0.0025); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8656, 0.0094, 0.8664); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.296, 0.0117, 0.2942); } // Train and test a model on the adult dataset with Goss sampling. @@ -735,8 +733,8 @@ TEST_F(GradientBoostedTreesOnAdult, GossDeprecated) { gbt_config->set_use_goss(true); TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8528, 0.015); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.86, 0.012, 0.86); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3106, 0.0168, 0.3048); } // Train and test a model on the adult dataset with Goss sampling. @@ -749,8 +747,8 @@ TEST_F(GradientBoostedTreesOnAdult, Goss) { gbt_config->mutable_gradient_one_side_sampling(); TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8528, 0.015); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8601, 0.0127, 0.86); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3095, 0.0138, 0.3048); } // Train and test a model on the adult dataset. @@ -768,8 +766,8 @@ TEST_F(GradientBoostedTreesOnAdult, BaseDiscretizedNumerical) { // Note: Accuracy is similar as RF (see :random_forest_test). However logloss // is significantly better (which is expected as, unlike RF, GBT is // calibrated). 
- YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8605, 0.015); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8649, 0.0097, 0.8658); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2986, 0.0148, 0.2955); } // Train and test a model on the adult dataset. @@ -790,8 +788,9 @@ TEST_F(GradientBoostedTreesOnAdult, BaseAggresiveDiscretizedNumerical) { // Note: Accuracy is similar as RF (see :random_forest_test). However logloss // is significantly better (which is expected as, unlike RF, GBT is // calibrated). - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8562, 0.005); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + const auto nan = std::numeric_limits::quiet_NaN(); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8607, 0.0131, nan); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3099, 0.0183, nan); } TEST_F(GradientBoostedTreesOnAdult, BaseWithWeights) { @@ -804,8 +803,8 @@ TEST_F(GradientBoostedTreesOnAdult, BaseWithWeights) { TrainAndEvaluateModel(/*numerical_weight_attribute=*/"age"); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.845, 0.01); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8388, 0.0146, 0.8375); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3614, 0.0313, 0.3534); } TEST_F(GradientBoostedTreesOnAdult, NumCandidateAttributeRatio) { @@ -822,8 +821,8 @@ TEST_F(GradientBoostedTreesOnAdult, NumCandidateAttributeRatio) { // Note: Accuracy is similar as RF (see :random_forest_test). However logloss // is significantly better (which is expected as, unlike RF, GBT is // calibrated). - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.860, 0.01); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8644, 0.0108, 0.8649); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3011, 0.0151, 0.2972); } // Train and test a model on the adult dataset. @@ -839,8 +838,8 @@ TEST_F(GradientBoostedTreesOnAdult, LeafWiseGrow) { TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.860, 0.01); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8662, 0.0094, 0.8639); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3026, 0.0218, 0.2931); } // Train and test a model on the adult dataset with L2 regularization. @@ -858,8 +857,8 @@ TEST_F(GradientBoostedTreesOnAdult, L2Regularization) { // Note: Accuracy is similar as RF (see :random_forest_test). However logloss // is significantly better (which is expected as, unlike RF, GBT is // calibrated). - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.860, 0.01); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8639, 0.0097, 0.8621); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2977, 0.011, 0.2953); } // Multiclass version of the algorithm on the binary class adult dataset. @@ -876,8 +875,8 @@ TEST_F(GradientBoostedTreesOnAdult, FakeMulticlass) { // Note: As expected, the results are similar to the binary class // implementation. 
- YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.860, 0.01); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8641, 0.0099, 0.8658); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2979, 0.0126, 0.2969); } // Multiclass version of the algorithm on the binary class adult dataset with L2 @@ -896,8 +895,8 @@ TEST_F(GradientBoostedTreesOnAdult, FakeMulticlassL2Regularization) { // Note: As expected, the results are similar to the binary class // implementation. - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.860, 0.01); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8649, 0.0092, 0.8658); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3002, 0.0152, 0.2952); } // Train and test a model on the adult dataset for a maximum given duration. @@ -1016,8 +1015,9 @@ TEST_F(GradientBoostedTreesOnAdult, Dart) { TrainAndEvaluateModel(); // Note: Dart seems to be unstable. - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.874, 0.015); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.283, 0.04); + const auto nan = std::numeric_limits::quiet_NaN(); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8459, 0.0449, nan); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3293, 0.0727, nan); } TEST_F(GradientBoostedTreesOnAdult, Hessian) { @@ -1030,8 +1030,8 @@ TEST_F(GradientBoostedTreesOnAdult, Hessian) { TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.86, 0.015); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.283, 0.05); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8664, 0.0101, 0.8661); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2962, 0.0159, 0.2907); } TEST_F(GradientBoostedTreesOnAdult, HessianRandomCategorical) { @@ -1045,8 +1045,8 @@ TEST_F(GradientBoostedTreesOnAdult, HessianRandomCategorical) { TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.86, 0.01); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.283, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8636, 0.0092, 0.8664); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2924, 0.0112, 0.2904); } TEST_F(GradientBoostedTreesOnAdult, HessianDiscretizedNumerical) { @@ -1060,8 +1060,8 @@ TEST_F(GradientBoostedTreesOnAdult, HessianDiscretizedNumerical) { TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.86, 0.015); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.283, 0.05); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8662, 0.0104, 0.8664); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2938, 0.0116, 0.2899); } TEST_F(GradientBoostedTreesOnAdult, HessianL2Categorical) { @@ -1075,8 +1075,8 @@ TEST_F(GradientBoostedTreesOnAdult, HessianL2Categorical) { TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.86, 0.015); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.283, 0.05); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8652, 0.0124, 0.867); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2962, 0.0125, 0.2922); } TEST_F(GradientBoostedTreesOnAdult, PureServingModel) { @@ -1086,8 +1086,8 @@ TEST_F(GradientBoostedTreesOnAdult, PureServingModel) { train_config_.set_pure_serving_model(true); TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8605, 0.0025); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + 
YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8661, 0.0134, 0.8615); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3001, 0.0204, 0.2975); } TEST_F(GradientBoostedTreesOnAdult, MakingAModelPurePureServingModel) { @@ -1096,15 +1096,15 @@ TEST_F(GradientBoostedTreesOnAdult, MakingAModelPurePureServingModel) { gbt_config->set_num_trees(100); TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8605, 0.0025); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8676, 0.0129, 0.8615); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2977, 0.0167, 0.2975); const auto pre_pruning_size = model_->ModelSizeInBytes().value(); YDF_LOG(INFO) << "pre_pruning_size:" << pre_pruning_size; CHECK_OK(model_->MakePureServing()); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8605, 0.0025); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8676, 0.0129, 0.8615); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2977, 0.0167, 0.2975); const auto post_pruning_size = model_->ModelSizeInBytes().value(); YDF_LOG(INFO) << "post_pruning_size:" << post_pruning_size; @@ -1129,7 +1129,7 @@ class GradientBoostedTreesOnAbalone : public utils::TrainAndTestTester { TEST_F(GradientBoostedTreesOnAbalone, Base) { TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::RMSE(evaluation_), 2.119, 0.01); + YDF_TEST_METRIC(metric::RMSE(evaluation_), 2.1684, 0.0979, 2.1138); } TEST_F(GradientBoostedTreesOnAbalone, L2Regularization) { @@ -1137,7 +1137,7 @@ TEST_F(GradientBoostedTreesOnAbalone, L2Regularization) { gradient_boosted_trees::proto::gradient_boosted_trees_config); gbt_config->set_l2_regularization(0.1f); TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::RMSE(evaluation_), 2.1339, 0.01); + YDF_TEST_METRIC(metric::RMSE(evaluation_), 2.1593, 0.0776, 2.1145); } TEST_F(GradientBoostedTreesOnAbalone, SparseOblique) { @@ -1146,7 +1146,7 @@ TEST_F(GradientBoostedTreesOnAbalone, SparseOblique) { gradient_boosted_trees::proto::gradient_boosted_trees_config); gbt_config->mutable_decision_tree()->mutable_sparse_oblique_split(); TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::RMSE(evaluation_), 2.079, 0.02); + YDF_TEST_METRIC(metric::RMSE(evaluation_), 2.1155, 0.0988, 2.1001); } TEST_F(GradientBoostedTreesOnAbalone, PoissonLoss) { @@ -1156,7 +1156,7 @@ TEST_F(GradientBoostedTreesOnAbalone, PoissonLoss) { gbt_config->set_loss(proto::Loss::POISSON); TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::RMSE(evaluation_), 2.15, 0.05); + YDF_TEST_METRIC(metric::RMSE(evaluation_), 2.1563, 0.0852, 2.1232); } class GradientBoostedTreesOnIris : public utils::TrainAndTestTester { @@ -1175,8 +1175,8 @@ class GradientBoostedTreesOnIris : public utils::TrainAndTestTester { TEST_F(GradientBoostedTreesOnIris, Base) { TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.9599, 0.02); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.22079, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.9533, 0.03, 0.96); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2988, 0.2562, 0.2193); // Note: R RandomForest has an OOB accuracy of 0.9467. 
} @@ -1185,8 +1185,8 @@ TEST_F(GradientBoostedTreesOnIris, Hessian) { gradient_boosted_trees::proto::gradient_boosted_trees_config); gbt_config->set_use_hessian_gain(true); TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.9599, 0.02); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.1360, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.94, 0.05, 0.9733); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3225, 0.3002, 0.138); } TEST_F(GradientBoostedTreesOnIris, Dart) { @@ -1196,8 +1196,8 @@ TEST_F(GradientBoostedTreesOnIris, Dart) { gbt_config->mutable_dart()->set_dropout_rate(0.1f); gbt_config->mutable_decision_tree()->set_num_candidate_attributes(8); TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.9599, 0.03); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.1618, 0.06); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.9467, 0.04, 0.9733); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.1925, 0.1226, 0.18); // Note: R RandomForest has an OOB accuracy of 0.9467. } @@ -1217,8 +1217,8 @@ class GradientBoostedTreesOnDNA : public utils::TrainAndTestTester { TEST_F(GradientBoostedTreesOnDNA, Base) { TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.9529, 0.02); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.1465, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.9507, 0.0108, 0.9517); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.1933, 0.08, 0.1446); // Note: R RandomForest has an OOB accuracy of 0.909. } @@ -1227,15 +1227,15 @@ TEST_F(GradientBoostedTreesOnDNA, Hessian) { gradient_boosted_trees::proto::gradient_boosted_trees_config); gbt_config->set_use_hessian_gain(true); TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.9554, 0.02); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.1397, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.9539, 0.0099, 0.9573); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.1831, 0.0743, 0.1403); } TEST_F(GradientBoostedTreesOnDNA, BaseBooleanAsNumerical) { guide_filename_ = "dna_guide.pbtxt"; TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.9529, 0.02); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.1465, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.9514, 0.0118, 0.9517); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.1757, 0.0568, 0.1446); // Note: R RandomForest has an OOB accuracy of 0.909. 
} @@ -1245,8 +1245,8 @@ TEST_F(GradientBoostedTreesOnDNA, HessianBooleanAsNumerical) { gbt_config->set_use_hessian_gain(true); guide_filename_ = "dna_guide.pbtxt"; TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.9548, 0.02); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.1443, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.9532, 0.0118, 0.9567); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.1813, 0.0716, 0.1422); } TEST(GradientBoostedTrees, SetHyperParameters) { diff --git a/yggdrasil_decision_forests/learner/gradient_boosted_trees/loss/loss_imp_binomial.cc b/yggdrasil_decision_forests/learner/gradient_boosted_trees/loss/loss_imp_binomial.cc index 4d7b3b54..d3c98509 100644 --- a/yggdrasil_decision_forests/learner/gradient_boosted_trees/loss/loss_imp_binomial.cc +++ b/yggdrasil_decision_forests/learner/gradient_boosted_trees/loss/loss_imp_binomial.cc @@ -240,6 +240,7 @@ void BinomialLogLikelihoodLoss::TemplatedLossImp( const std::vector& weights, size_t begin_example_idx, size_t end_example_idx, double* __restrict sum_loss, utils::IntegersConfusionMatrixDouble* confusion_matrix) { + double local_sum_loss = 0; for (size_t example_idx = begin_example_idx; example_idx < end_example_idx; example_idx++) { // The loss function expects a 0/1 label. @@ -250,19 +251,19 @@ void BinomialLogLikelihoodLoss::TemplatedLossImp( if constexpr (use_weights) { const float weight = weights[example_idx]; confusion_matrix->Add(labels[example_idx], predicted_label, weight); - *sum_loss -= + local_sum_loss -= 2 * weight * (label_for_loss * prediction - std::log(1.f + std::exp(prediction))); } else { confusion_matrix->Add(labels[example_idx], predicted_label, 1.f); // Loss: // -2 * ( label * prediction - log(1+exp(prediction))) - *sum_loss -= 2 * (label_for_loss * prediction - - std::log(1.f + std::exp(prediction))); - DCheckIsFinite(*sum_loss); + local_sum_loss -= 2 * (label_for_loss * prediction - + std::log(1.f + std::exp(prediction))); } - DCheckIsFinite(*sum_loss); + DCheckIsFinite(local_sum_loss); } + *sum_loss += local_sum_loss; } template diff --git a/yggdrasil_decision_forests/learner/multitasker/multitasker_test.cc b/yggdrasil_decision_forests/learner/multitasker/multitasker_test.cc index c552d27c..fc6d2aac 100644 --- a/yggdrasil_decision_forests/learner/multitasker/multitasker_test.cc +++ b/yggdrasil_decision_forests/learner/multitasker/multitasker_test.cc @@ -15,6 +15,8 @@ #include "yggdrasil_decision_forests/learner/multitasker/multitasker.h" +#include + #include "gmock/gmock.h" #include "gtest/gtest.h" #include "absl/flags/flag.h" @@ -79,7 +81,8 @@ TEST_F(MultitaskerOnAdult, Base) { t3->set_task(model::proto::Task::CLASSIFICATION); TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.860, 0.01); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.860, 0.01, + std::numeric_limits::quiet_NaN()); utils::RandomEngine rnd(1234); @@ -98,7 +101,8 @@ TEST_F(MultitaskerOnAdult, Base) { metric::proto::EvaluationOptions eval_options; eval_options.set_task(model::proto::Task::CLASSIFICATION); auto eval = submodel->Evaluate(test_dataset_, eval_options, &rnd); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(eval), 0.860, 0.01); + YDF_TEST_METRIC(metric::Accuracy(eval), 0.860, 0.01, + std::numeric_limits::quiet_NaN()); } { @@ -113,7 +117,8 @@ TEST_F(MultitaskerOnAdult, Base) { metric::proto::EvaluationOptions eval_options; eval_options.set_task(model::proto::Task::REGRESSION); auto eval = submodel->Evaluate(test_dataset_, 
eval_options, &rnd); - YDF_EXPECT_METRIC_NEAR(metric::RMSE(eval), 10.2048, 0.05); + YDF_TEST_METRIC(metric::RMSE(eval), 10.2048, 0.05, + std::numeric_limits::quiet_NaN()); } { @@ -128,7 +133,8 @@ TEST_F(MultitaskerOnAdult, Base) { metric::proto::EvaluationOptions eval_options; eval_options.set_task(model::proto::Task::CLASSIFICATION); auto eval = submodel->Evaluate(test_dataset_, eval_options, &rnd); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(eval), 0.76474, 0.01); + YDF_TEST_METRIC(metric::Accuracy(eval), 0.76474, 0.01, + std::numeric_limits::quiet_NaN()); } { @@ -185,7 +191,8 @@ TEST_F(MultitaskerOnAdult, Stacked) { t3->set_task(model::proto::Task::CLASSIFICATION); TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.860, 0.01); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.860, 0.01, + std::numeric_limits::quiet_NaN()); utils::RandomEngine rnd(1234); diff --git a/yggdrasil_decision_forests/learner/random_forest/random_forest_test.cc b/yggdrasil_decision_forests/learner/random_forest/random_forest_test.cc index f501b3f8..1621fe10 100644 --- a/yggdrasil_decision_forests/learner/random_forest/random_forest_test.cc +++ b/yggdrasil_decision_forests/learner/random_forest/random_forest_test.cc @@ -238,7 +238,7 @@ TEST_F(RandomForestOnAdult, Base) { EXPECT_LE(rank_capital_gain, 5); EXPECT_LE(rank_relationship, 5); - EXPECT_LE(rank_occupation, 5); + EXPECT_LE(rank_occupation, 6); // Worst 2 variables. const int rank_fnlwgt = utils::GetVariableImportanceRank( @@ -397,7 +397,7 @@ TEST_F(RandomForestOnAdult, NoWinnerTakeAllRandomCategorical) { rf_config->mutable_decision_tree()->mutable_categorical()->mutable_random(); TrainAndEvaluateModel(); EXPECT_NEAR(metric::Accuracy(evaluation_), 0.82618, 0.005); - EXPECT_NEAR(metric::LogLoss(evaluation_), 0.40623, 0.02); + EXPECT_NEAR(metric::LogLoss(evaluation_), 0.3817, 0.02); } TEST_F(RandomForestOnAdult, NoWinnerTakeAllExampleSampling) { @@ -570,7 +570,7 @@ TEST_F(RandomForestOnAdult, MaxNumNodes) { EXPECT_NEAR(metric::Accuracy(evaluation_), 0.862, 0.015); // Disabling winner take all reduce the logloss (as expected). - EXPECT_NEAR(metric::LogLoss(evaluation_), 0.368, 0.045); + EXPECT_NEAR(metric::LogLoss(evaluation_), 0.368, 0.06); } TEST_F(RandomForestOnAdult, SparseOblique) { @@ -634,7 +634,7 @@ TEST_F(RandomForestOnAbalone, Base) { absl::StrCat("csv:", oob_prediction_path)); TrainAndEvaluateModel(); - EXPECT_NEAR(metric::RMSE(evaluation_), 2.0825, 0.01); + EXPECT_NEAR(metric::RMSE(evaluation_), 2.0926, 0.01); // Check the oob predictions. const auto oob_predictions = file::GetContent(oob_prediction_path).value(); diff --git a/yggdrasil_decision_forests/utils/test_utils.cc b/yggdrasil_decision_forests/utils/test_utils.cc index 5e9387e1..1dbc4797 100644 --- a/yggdrasil_decision_forests/utils/test_utils.cc +++ b/yggdrasil_decision_forests/utils/test_utils.cc @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -69,6 +70,7 @@ namespace utils { namespace { +// Shuffles a dataset randomly. Does not rely on a static seed. void ShuffleDataset(dataset::VerticalDataset* dataset) { absl::BitGen bitgen; std::vector example_idxs(dataset->nrow()); @@ -77,6 +79,45 @@ void ShuffleDataset(dataset::VerticalDataset* dataset) { *dataset = dataset->Extract(example_idxs).value(); } +// Generates a random seed. Does not rely on a static seed. 
+int64_t RandomSeed() { + absl::BitGen bitgen; + return bitgen(); +} + +// Generates a deterministic sequence of boolean values that coarsely +// approximates sampling from a binomial distribution. +class DeterministicBinomial { + public: + DeterministicBinomial(const float rate) : rate_(rate) {} + + bool Sample() { + if (num_total_ == 0) { + // Always return false first, unless the rate is 1. + num_total_++; + if (rate_ == 1) { + num_pos_++; + return true; + } + return false; + } + + if (num_pos_ > rate_ * num_total_) { + num_total_++; + return false; + } else { + num_pos_++; + num_total_++; + return true; + } + } + + private: + float rate_; + int num_pos_ = 0; + int num_total_ = 0; +}; + } // namespace void TrainAndTestTester::ConfigureForSyntheticDataset() { @@ -136,9 +177,8 @@ void TrainAndTestTester::TrainAndEvaluateModel( // Configure the learner. CHECK_OK(model::GetLearner(train_config_, &learner_, deployment_config_)); - if (inject_random_noise_ && !learner_->training_config().has_random_seed()) { - absl::BitGen bitgen; - learner_->mutable_training_config()->set_random_seed(bitgen()); + if (change_random_seed_ && !learner_->training_config().has_random_seed()) { + learner_->mutable_training_config()->set_random_seed(RandomSeed()); } if (generic_parameters_.has_value()) { @@ -204,7 +244,7 @@ void TrainAndTestTester::TrainAndEvaluateModel( YDF_LOG(INFO) << "Training duration: " << training_duration_; // Evaluate the model. - utils::RandomEngine rnd(1234); + utils::RandomEngine rnd(1234); // Not used evaluation_ = model_->Evaluate(test_dataset_, eval_options_, &rnd); // Print the model evaluation. @@ -406,21 +446,22 @@ void TrainAndTestTester::BuildTrainValidTestDatasets( CHECK_OK(LoadVerticalDataset(train_path, data_spec, &dataset)); // Split the dataset in two folds: training and testing. - std::vector train_example_idxs, - test_example_idxs, valid_example_idxs; + std::vector train_example_idxs; + std::vector test_example_idxs; + std::vector valid_example_idxs; + + DeterministicBinomial sampling(dataset_sampling_); + DeterministicBinomial train_test_split(split_train_ratio_); + DeterministicBinomial test_valid_split(0.5f); + + // TODO: Make deterministic. utils::RandomEngine rnd(1234); std::uniform_real_distribution dist_01; - // If a validation example should be generated (i.e. - // pass_validation_dataset_=true), next_example_is_valid indicates if the next - // example will be used for validation or testing. - bool next_example_is_valid = true; for (dataset::VerticalDataset::row_t example_idx = 0; example_idx < dataset.nrow(); example_idx++) { // Down-sampling of examples. - // TODO: Make the split deterministic. - if (dataset_sampling_ < dist_01(rnd)) { + if (!sampling.Sample()) { continue; } @@ -438,21 +479,13 @@ void TrainAndTestTester::BuildTrainValidTestDatasets( } } - bool is_training_example; - if (split_train_ratio_ == 0.5f) { - // Deterministic split. - is_training_example = (example_idx % 2) == 0; - } else { - is_training_example = dist_01(rnd) < split_train_ratio_; - } + const bool is_training_example = train_test_split.Sample(); if (is_training_example) { train_example_idxs.push_back(example_idx); } else { - if (pass_validation_dataset_) { - (next_example_is_valid ?
valid_example_idxs : test_example_idxs) - .push_back(example_idx); - next_example_is_valid ^= true; + if (pass_validation_dataset_ && test_valid_split.Sample()) { + valid_example_idxs.push_back(example_idx); } else { test_example_idxs.push_back(example_idx); } @@ -720,7 +753,7 @@ void TestPredefinedHyperParameters( // Evaluate the model. if (min_accuracy.has_value()) { - utils::RandomEngine rnd(1234); + utils::RandomEngine rnd(1234); // Not used. const auto evaluation = model->Evaluate(test_ds, {}, &rnd); EXPECT_GE(metric::Accuracy(evaluation), min_accuracy.value()); } @@ -847,32 +880,53 @@ absl::Status ExportUpliftPredictionsToTFUpliftCsvFormat( void InternalExportMetricCondition(const absl::string_view test, const double value, const double center, - const double margin, + const double margin, const double golden, const absl::string_view metric, const int line, const absl::string_view file) { + // Margin of error when comparing golden metric values. + constexpr double kGoldenMargin = 0.0001; + const auto filename = file::GetBasename(file); - const auto abs_diff = std::abs(value - center); - const auto success = abs_diff < margin; -#ifdef EXPORT_METRIC_CONDITION - const auto uid = GenUniqueId(); - const auto path = - file::JoinPath(EXPORT_METRIC_CONDITION, absl::StrCat(uid, ".csv")); - std::string content = - absl::StrCat("test,value,center,margin,metric,line,filename,success\n", - test, ",", value, ",", center, ",", margin, ",", metric, ",", - line, ",", filename, ",", success); - CHECK_OK(file::SetContent(path, content)); -#endif - if (!success) { - EXPECT_TRUE(false) << "Non satified range condition for " << metric - << " in " << test << "\ndefined at\n" - << file << ":" << line << "\nThe metric value " << value - << " is not in " << center << " +- " << margin - << ".\ni.e. not in [" << (center - margin) << " , " - << (center + margin) - << "].\nThe absolute value of the difference is " - << abs_diff << "."; + const bool golden_test = kYdfTestMetricCheckGold && !std::isnan(golden); + + double abs_diff_margin = std::abs(value - center); + double abs_diff_golden = std::abs(value - golden); + bool success_margin = abs_diff_margin < margin; + bool success_golden = abs_diff_golden < kGoldenMargin; + + if (strlen(kYdfTestMetricDumpDir) > 0) { + // Export metric to csv file. + const auto uid = GenUniqueId(); + const auto path = + file::JoinPath(kYdfTestMetricDumpDir, absl::StrCat(uid, ".csv")); + std::string content = absl::StrCat( + "test,value,center,margin,metric,line,filename,success_margin,success_" "golden,golden\n", + test, ",", value, ",", center, ",", margin, ",", metric, ",", line, ",", + filename, ",", success_margin, ",", success_golden, ",", golden); + CHECK_OK(file::SetContent(path, content)); + } else { + if (!success_margin) { + EXPECT_TRUE(false) << "Unsatisfied range condition for " << metric + << " in " << test << "\ndefined at\n" + << file << ":" << line << "\nThe metric value " + << value << " is not in " << center << " +- " << margin + << ".\ni.e. 
not in [" << (center - margin) << " , " + << (center + margin) + << "].\nThe absolute value of the difference is " + << abs_diff_margin << "."; + } + + if (golden_test && !success_golden) { + EXPECT_TRUE(false) << "Unsatisfied golden value condition for " << metric + << " in " << test << "\ndefined at\n" + << file << ":" << line << "\nThe metric value " + << value << " is different from " << golden + << " (margin:" << kGoldenMargin + << ").\nThe absolute value of the difference is " + << abs_diff_golden << "."; + } } } diff --git a/yggdrasil_decision_forests/utils/test_utils.h b/yggdrasil_decision_forests/utils/test_utils.h index a2003dab..f281d884 100644 --- a/yggdrasil_decision_forests/utils/test_utils.h +++ b/yggdrasil_decision_forests/utils/test_utils.h @@ -168,15 +168,12 @@ class TrainAndTestTester : public ::testing::Test { // the logs. bool show_full_model_structure_ = false; - // If false, models trained and and evaluated in unit tests are expected to - // always be the same for a given implementation of the pseudo random number - // generator and a given version of the code. If true, training noise in - // injected through initial dataset shuffeling and randomization of the pseudo - // random number generator seed. - // - // TODO: Default to true. + // If true, shuffle the datasets in unit tests. bool inject_random_noise_ = false; + // If true, randomize learner seeds in unit tests. + bool change_random_seed_ = false; + private: std::pair GetTrainAndTestDatasetPaths(); @@ -335,13 +332,11 @@ absl::Status ExportUpliftPredictionsToTFUpliftCsvFormat( const model::AbstractModel& model, const dataset::VerticalDataset& dataset, absl::string_view output_csv_path); -// Internal implementation of "YDF_EXPECT_METRIC_NEAR". -void InternalExportMetricCondition(const absl::string_view test, - const double value, const double center, - const double margin, - const absl::string_view metric, - const int line, - const absl::string_view file); +// Internal implementation of "YDF_TEST_METRIC". +void InternalExportMetricCondition(absl::string_view test, double value, + double center, double margin, double gold, + absl::string_view metric, int line, + absl::string_view file); // Gets the name of the current test. template @@ -370,21 +365,40 @@ int GetVariableImportanceRank( } // namespace utils } // namespace yggdrasil_decision_forests -// Checks that "value" is in [center-margin, center+margin]. -#define YDF_EXPECT_METRIC_NEAR(value, center, margin) \ +// Checks that "value" is in [center-margin, center+margin] (margin test) and +// equal to "golden". If "kYdfTestMetricCheckGold=false" or if "golden=NaN", +// only the margin test is checked. +// +#define YDF_TEST_METRIC(value, center, margin, golden) \ ::yggdrasil_decision_forests::utils::InternalExportMetricCondition( \ ::yggdrasil_decision_forests::utils::InternalGetTestName(this), value, \ - center, margin, #value, __LINE__, __FILE__) + center, margin, golden, #value, __LINE__, __FILE__) -// If set, exports the metric conditions (both valid and invalid) tested by -// "YDF_EXPECT_METRIC_NEAR" in csv files in the directory specified by -// "EXPORT_METRIC_CONDITION". Note: The directory should be already existing. -// This command is compatible with "--runs_per_test" (e.g. --runs_per_test=50). +// TODO: Simplify protocol. +// +// The following block makes it possible to export unit-test evaluation metrics +// to csv files, analyse their distribution in a notebook, and possibly update +// the valid margins. 
// -// EXPORT_METRIC_CONDITION is especially useful with tests with -// "inject_random_noise_=true" in order to study the distibution of metrics and -// better adjust the valid range. +// If "kYdfTestMetricDumpDir" is set, the result of unit test metrics +// tested with "YDF_TEST_METRIC" are exported to csv files in the +// directory specified by "kYdfTestMetricDumpDir" (Note: The directory +// should already exist) and the tests become non-failing (i.e., if a +// metric is not in a valid range, the test does not fail). // -// #define EXPORT_METRIC_CONDITION "/tmp/metric_condition" +// YDF training is deterministic modulo changes in implementation of the random +// generator (or equivalent, e.g. change of default random seed, change of query +// order of the random generator) and floating point compiler optimizations. +// Stability of unit tests to random seeds can be tested with +// "change_random_seed_=True" in conjunction with value for "--runs_per_test" +// e.g. "--runs_per_test=100". +// + +// If set, export metrics to disk, and disable metric unit tests. +constexpr char kYdfTestMetricDumpDir[] = ""; +// To enable logging of unit test metrics. +// constexpr char kYdfTestMetricDumpDir[] = "/tmp/metric_condition"; + +constexpr bool kYdfTestMetricCheckGold = false; #endif // YGGDRASIL_DECISION_FORESTS_TOOL_TEST_UTILS_H_
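For reference, the range-recomputation recipe described above (run each test many times, e.g. with --runs_per_test and a randomized seed, center the check on the observed mean, and inflate the largest observed deviation by 50%) can be sketched as follows. This sketch is illustrative only and is not part of the patch: the SuggestRange helper, the safety_factor parameter, and the sample values are assumptions, and the exact aggregation used to recompute the committed centers and margins is not shown here.

// Illustrative sketch (not part of the patch): suggests a (center, margin)
// pair for YDF_TEST_METRIC from metric values collected over repeated runs,
// e.g. parsed from the csv files dumped via kYdfTestMetricDumpDir.
#include <algorithm>
#include <cmath>
#include <iostream>
#include <numeric>
#include <vector>

struct MetricRange {
  double center;
  double margin;
};

// A safety_factor of 1.5 corresponds to "adding a 50% margin" on top of the
// largest deviation observed across runs. Assumes "values" is non-empty.
MetricRange SuggestRange(const std::vector<double>& values,
                         double safety_factor = 1.5) {
  // Center: mean of the observed metric values.
  const double center =
      std::accumulate(values.begin(), values.end(), 0.0) / values.size();
  // Margin: largest observed deviation from the center, inflated.
  double max_deviation = 0.0;
  for (const double value : values) {
    max_deviation = std::max(max_deviation, std::abs(value - center));
  }
  return {center, max_deviation * safety_factor};
}

int main() {
  // Example: accuracies observed over several runs with randomized seeds.
  const std::vector<double> accuracies = {0.8641, 0.8652, 0.8630, 0.8660};
  const MetricRange range = SuggestRange(accuracies);
  std::cout << "center=" << range.center << " margin=" << range.margin << "\n";
  return 0;
}

The mean as center and an inflated maximum deviation as margin mirror the recipe stated in the commit message; other aggregations (e.g. quantiles) would work as well. When the learner is deterministic and kYdfTestMetricCheckGold is enabled, the fourth YDF_TEST_METRIC argument is additionally compared against the recorded golden value.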