From 5455343d848caf1a9f9600a715be6622712047c0 Mon Sep 17 00:00:00 2001 From: Mathieu Guillame-Bert Date: Mon, 25 Sep 2023 06:03:15 -0700 Subject: [PATCH] Make metric unit tests both more powerful and less prone to false errors. - When learners are expected to be deterministic, check metrics against golden values (instead of valid ranges). This is the default behavior for the internal build. - Make it possible to run tests with random seed values. This way, tests measure the learning variance caused by changing the random seed or, equivalently, by using a different random number generator (e.g., as is the case in the external build). In this case, metrics are tested against a metric range. - All metric ranges have been re-computed by running all tests 1000 times and adding a 50% margin. In many cases, the new ranges are tighter than they were before. - Remove sources of non-determinism in tests other than the random seed (if the seed is not fixed). This significantly reduces the variance of the test results. PiperOrigin-RevId: 568194822 --- .../learner/cart/cart_test.cc | 2 +- .../gradient_boosted_trees_test.cc | 182 +++++++++--------- .../loss/loss_imp_binomial.cc | 11 +- .../learner/multitasker/multitasker_test.cc | 17 +- .../random_forest/random_forest_test.cc | 8 +- .../utils/test_utils.cc | 146 +++++++++----- yggdrasil_decision_forests/utils/test_utils.h | 64 +++--- 7 files changed, 253 insertions(+), 177 deletions(-) diff --git a/yggdrasil_decision_forests/learner/cart/cart_test.cc b/yggdrasil_decision_forests/learner/cart/cart_test.cc index 61b0f668..a4ab1b28 100644 --- a/yggdrasil_decision_forests/learner/cart/cart_test.cc +++ b/yggdrasil_decision_forests/learner/cart/cart_test.cc @@ -45,7 +45,7 @@ TEST_F(CartOnAdult, Base) { TrainAndEvaluateModel(); // Random Forest has an accuracy of ~0.860. EXPECT_NEAR(metric::Accuracy(evaluation_), 0.8560, 0.01); - EXPECT_NEAR(metric::LogLoss(evaluation_), 0.4373, 0.04); + EXPECT_NEAR(metric::LogLoss(evaluation_), 0.4373, 0.05); // Show the tree structure. std::string description; diff --git a/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees_test.cc b/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees_test.cc index 7d7cfc41..9fdfbfa4 100644 --- a/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees_test.cc +++ b/yggdrasil_decision_forests/learner/gradient_boosted_trees/gradient_boosted_trees_test.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -363,8 +364,8 @@ TEST_F(GradientBoostedTreesOnAdult, BaseDeprecated) { // Note: Accuracy is similar as RF (see :random_forest_test). However logloss // is significantly better (which is expected as, unlike RF, GBT is // calibrated). - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8605, 0.0025); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8644, 0.0099, 0.8658); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2979, 0.0127, 0.2949); auto* gbt_model = dynamic_cast(model_.get()); @@ -385,8 +386,8 @@ TEST_F(GradientBoostedTreesOnAdult, Base) { // Note: Accuracy is similar as RF (see :random_forest_test). However, logloss // is significantly better (which is expected as, unlike RF, GBT is // calibrated).
- YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8605, 0.0025); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8647, 0.0099, 0.8658); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2984, 0.0162, 0.2949); auto* gbt_model = dynamic_cast(model_.get()); @@ -409,8 +410,8 @@ TEST_F(GradientBoostedTreesOnAdult, FocalLossWithGammaZero) { TrainAndEvaluateModel(); // Similar metrics as with log loss. - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8602, 0.003); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.3178, 0.004); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8647, 0.0122, 0.8658); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2969, 0.0107, 0.2949); } // Train and test a model on the adult dataset with focal loss, now with @@ -429,8 +430,8 @@ TEST_F(GradientBoostedTreesOnAdult, FocalLossWithGammaHalf) { // Slighly better accuracy, but worse log loss; we are not // optimizing for log loss directly any more. - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8605, 0.003); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.3310, 0.004); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8653, 0.0094, 0.8624); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3226, 0.0218, 0.3145); } // Train and test a model on the adult dataset with focal loss, now with @@ -449,8 +450,8 @@ TEST_F(GradientBoostedTreesOnAdult, FocalLossWithGammaTwo) { // Even slightly better accuracy (could be just noise, but illustrative), // log loss deviates even more - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8608, 0.003); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.4192, 0.009); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8638, 0.0094, 0.8643); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.4107, 0.0351, 0.3924); } // Train and test a model on the adult dataset with focal loss, adding a @@ -470,8 +471,8 @@ TEST_F(GradientBoostedTreesOnAdult, FocalLossWithGammaTwoAlphaQuarter) { TrainAndEvaluateModel(); // Worse accuracy but smaller log loss due to low alpha - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8300, 0.004); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.4032, 0.02); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8503, 0.0177, 0.8553); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3907, 0.036, 0.3753); } // Separate the examples used for the structure and the leaves of the model. @@ -480,16 +481,16 @@ TEST_F(GradientBoostedTreesOnAdult, Honest) { gradient_boosted_trees::proto::gradient_boosted_trees_config); gbt_config->mutable_decision_tree()->mutable_honest(); TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8556, 0.004); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.30955, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8589, 0.0131, 0.8557); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3095, 0.015, 0.3135); } // Train a GBT with a validation dataset provided as a VerticalDataset. 
TEST_F(GradientBoostedTreesOnAdult, ValidVerticalDataset) { pass_validation_dataset_ = true; inject_random_noise_ = true; TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8707, 0.0054); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.2986, 0.005); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8732, 0.0023, 0.8747); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2794, 0.0027, 0.2776); } // Train a GBT with a validation dataset provided as a path. @@ -498,11 +499,11 @@ TEST_F(GradientBoostedTreesOnAdult, ValidPathDataset) { pass_validation_dataset_ = true; inject_random_noise_ = true; TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8708, 0.0053); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.2983, 0.0046); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8732, 0.0023, 0.8747); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2794, 0.0027, 0.2776); } -TEST_F(GradientBoostedTreesOnAdult, VariableImportance) { +TEST_F(GradientBoostedTreesOnAdult, DISABLED_VariableImportance) { auto* gbt_config = train_config_.MutableExtension( gradient_boosted_trees::proto::gradient_boosted_trees_config); gbt_config->set_compute_permutation_variable_importance(true); @@ -530,7 +531,7 @@ TEST_F(GradientBoostedTreesOnAdult, VariableImportance) { EXPECT_LE(rank_capital_gain, 3); EXPECT_LE(rank_relationship, 3); - EXPECT_LE(rank_occupation, 3); + EXPECT_LE(rank_occupation, 7); } class PerShardSamplingOnAdult : public ::testing::Test { @@ -592,7 +593,8 @@ TEST_F(PerShardSamplingOnAdult, PerShardSamplingExact) { YDF_LOG(INFO) << "Evaluation:" << metric::TextReport(evaluation).value(); // Sharded model is "good". - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation), 0.8665, 0.008); + const auto nan = std::numeric_limits::quiet_NaN(); + YDF_TEST_METRIC(metric::Accuracy(evaluation), 0.8667, 0.008, nan); } // Model trained with the sharded algorithm and sampling. @@ -612,11 +614,9 @@ TEST_F(PerShardSamplingOnAdult, PerShardSamplingSampling) { // Evaluate the models. utils::RandomEngine rnd(1234); - const auto sharded_sampled_evaluation = - sharded_sampled_model->Evaluate(test_ds_, {}, &rnd); + const auto eval = sharded_sampled_model->Evaluate(test_ds_, {}, &rnd); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(sharded_sampled_evaluation), 0.86180, - 0.006); + YDF_TEST_METRIC(metric::Accuracy(eval), 0.8633, 0.006, 0.8633); } // Model trained with the sharded algorithm and sampling. @@ -636,11 +636,9 @@ TEST_F(PerShardSamplingOnAdult, PerShardSamplingSamplingRecycle) { // Evaluate the models. utils::RandomEngine rnd(1234); - const auto sharded_sampled_evaluation = - sharded_sampled_model->Evaluate(test_ds_, {}, &rnd); + const auto eval = sharded_sampled_model->Evaluate(test_ds_, {}, &rnd); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(sharded_sampled_evaluation), 0.86088, - 0.005); + YDF_TEST_METRIC(metric::Accuracy(eval), 0.8589, 0.005, 0.8589); } // Train and test a model on the adult dataset using random categorical splits. @@ -657,8 +655,8 @@ TEST_F(GradientBoostedTreesOnAdult, RandomCategorical) { // Note: Accuracy is similar as RF (see :random_forest_test). However logloss // is significantly better (which is expected as, unlike RF, GBT is // calibrated). 
- YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8605, 0.005); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8642, 0.0097, 0.863); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2954, 0.0095, 0.294); auto* gbt_model = dynamic_cast(model_.get()); @@ -680,8 +678,8 @@ TEST_F(GradientBoostedTreesOnAdult, BaseNoQuickScorer) { // Note: Accuracy is similar as RF (see :random_forest_test). However logloss // is significantly better (which is expected as, unlike RF, GBT is // calibrated). - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8549, 0.015); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8596, 0.0134, 0.8566); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3146, 0.0212, 0.3104); auto* gbt_model = dynamic_cast(model_.get()); @@ -703,8 +701,8 @@ TEST_F(GradientBoostedTreesOnAdult, BaseConcurrentDeprecated) { // Note: Accuracy is similar as RF (see :random_forest_test). However logloss // is significantly better (which is expected as, unlike RF, GBT is // calibrated). - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8605, 0.0025); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8662, 0.0094, 0.8664); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2966, 0.0145, 0.2942); } // Train and test a model on the adult dataset. @@ -721,8 +719,8 @@ TEST_F(GradientBoostedTreesOnAdult, BaseConcurrent) { // Note: Accuracy is similar as RF (see :random_forest_test). However logloss // is significantly better (which is expected as, unlike RF, GBT is // calibrated). - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8605, 0.0025); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8656, 0.0094, 0.8664); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.296, 0.0117, 0.2942); } // Train and test a model on the adult dataset with Goss sampling. @@ -735,8 +733,8 @@ TEST_F(GradientBoostedTreesOnAdult, GossDeprecated) { gbt_config->set_use_goss(true); TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8528, 0.015); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.86, 0.012, 0.86); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3106, 0.0168, 0.3048); } // Train and test a model on the adult dataset with Goss sampling. @@ -749,8 +747,8 @@ TEST_F(GradientBoostedTreesOnAdult, Goss) { gbt_config->mutable_gradient_one_side_sampling(); TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8528, 0.015); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8601, 0.0127, 0.86); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3095, 0.0138, 0.3048); } // Train and test a model on the adult dataset. @@ -768,8 +766,8 @@ TEST_F(GradientBoostedTreesOnAdult, BaseDiscretizedNumerical) { // Note: Accuracy is similar as RF (see :random_forest_test). However logloss // is significantly better (which is expected as, unlike RF, GBT is // calibrated). 
- YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8605, 0.015); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8649, 0.0097, 0.8658); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2986, 0.0148, 0.2955); } // Train and test a model on the adult dataset. @@ -790,8 +788,9 @@ TEST_F(GradientBoostedTreesOnAdult, BaseAggresiveDiscretizedNumerical) { // Note: Accuracy is similar as RF (see :random_forest_test). However logloss // is significantly better (which is expected as, unlike RF, GBT is // calibrated). - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8562, 0.005); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + const auto nan = std::numeric_limits::quiet_NaN(); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8607, 0.0131, nan); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3099, 0.0183, nan); } TEST_F(GradientBoostedTreesOnAdult, BaseWithWeights) { @@ -804,8 +803,8 @@ TEST_F(GradientBoostedTreesOnAdult, BaseWithWeights) { TrainAndEvaluateModel(/*numerical_weight_attribute=*/"age"); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.845, 0.01); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8388, 0.0146, 0.8375); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3614, 0.0313, 0.3534); } TEST_F(GradientBoostedTreesOnAdult, NumCandidateAttributeRatio) { @@ -822,8 +821,8 @@ TEST_F(GradientBoostedTreesOnAdult, NumCandidateAttributeRatio) { // Note: Accuracy is similar as RF (see :random_forest_test). However logloss // is significantly better (which is expected as, unlike RF, GBT is // calibrated). - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.860, 0.01); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8644, 0.0108, 0.8649); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3011, 0.0151, 0.2972); } // Train and test a model on the adult dataset. @@ -839,8 +838,8 @@ TEST_F(GradientBoostedTreesOnAdult, LeafWiseGrow) { TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.860, 0.01); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8662, 0.0094, 0.8639); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3026, 0.0218, 0.2931); } // Train and test a model on the adult dataset with L2 regularization. @@ -858,8 +857,8 @@ TEST_F(GradientBoostedTreesOnAdult, L2Regularization) { // Note: Accuracy is similar as RF (see :random_forest_test). However logloss // is significantly better (which is expected as, unlike RF, GBT is // calibrated). - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.860, 0.01); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8639, 0.0097, 0.8621); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2977, 0.011, 0.2953); } // Multiclass version of the algorithm on the binary class adult dataset. @@ -876,8 +875,8 @@ TEST_F(GradientBoostedTreesOnAdult, FakeMulticlass) { // Note: As expected, the results are similar to the binary class // implementation. 
- YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.860, 0.01); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8641, 0.0099, 0.8658); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2979, 0.0126, 0.2969); } // Multiclass version of the algorithm on the binary class adult dataset with L2 @@ -896,8 +895,8 @@ TEST_F(GradientBoostedTreesOnAdult, FakeMulticlassL2Regularization) { // Note: As expected, the results are similar to the binary class // implementation. - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.860, 0.01); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8649, 0.0092, 0.8658); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3002, 0.0152, 0.2952); } // Train and test a model on the adult dataset for a maximum given duration. @@ -1016,8 +1015,9 @@ TEST_F(GradientBoostedTreesOnAdult, Dart) { TrainAndEvaluateModel(); // Note: Dart seems to be unstable. - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.874, 0.015); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.283, 0.04); + const auto nan = std::numeric_limits::quiet_NaN(); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8459, 0.0449, nan); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3293, 0.0727, nan); } TEST_F(GradientBoostedTreesOnAdult, Hessian) { @@ -1030,8 +1030,8 @@ TEST_F(GradientBoostedTreesOnAdult, Hessian) { TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.86, 0.015); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.283, 0.05); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8664, 0.0101, 0.8661); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2962, 0.0159, 0.2907); } TEST_F(GradientBoostedTreesOnAdult, HessianRandomCategorical) { @@ -1045,8 +1045,8 @@ TEST_F(GradientBoostedTreesOnAdult, HessianRandomCategorical) { TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.86, 0.01); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.283, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8636, 0.0092, 0.8664); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2924, 0.0112, 0.2904); } TEST_F(GradientBoostedTreesOnAdult, HessianDiscretizedNumerical) { @@ -1060,8 +1060,8 @@ TEST_F(GradientBoostedTreesOnAdult, HessianDiscretizedNumerical) { TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.86, 0.015); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.283, 0.05); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8662, 0.0104, 0.8664); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2938, 0.0116, 0.2899); } TEST_F(GradientBoostedTreesOnAdult, HessianL2Categorical) { @@ -1075,8 +1075,8 @@ TEST_F(GradientBoostedTreesOnAdult, HessianL2Categorical) { TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.86, 0.015); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.283, 0.05); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8652, 0.0124, 0.867); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2962, 0.0125, 0.2922); } TEST_F(GradientBoostedTreesOnAdult, PureServingModel) { @@ -1086,8 +1086,8 @@ TEST_F(GradientBoostedTreesOnAdult, PureServingModel) { train_config_.set_pure_serving_model(true); TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8605, 0.0025); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + 
YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8661, 0.0134, 0.8615); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3001, 0.0204, 0.2975); } TEST_F(GradientBoostedTreesOnAdult, MakingAModelPurePureServingModel) { @@ -1096,15 +1096,15 @@ TEST_F(GradientBoostedTreesOnAdult, MakingAModelPurePureServingModel) { gbt_config->set_num_trees(100); TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8605, 0.0025); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8676, 0.0129, 0.8615); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2977, 0.0167, 0.2975); const auto pre_pruning_size = model_->ModelSizeInBytes().value(); YDF_LOG(INFO) << "pre_pruning_size:" << pre_pruning_size; CHECK_OK(model_->MakePureServing()); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.8605, 0.0025); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.320, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.8676, 0.0129, 0.8615); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2977, 0.0167, 0.2975); const auto post_pruning_size = model_->ModelSizeInBytes().value(); YDF_LOG(INFO) << "post_pruning_size:" << post_pruning_size; @@ -1129,7 +1129,7 @@ class GradientBoostedTreesOnAbalone : public utils::TrainAndTestTester { TEST_F(GradientBoostedTreesOnAbalone, Base) { TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::RMSE(evaluation_), 2.119, 0.01); + YDF_TEST_METRIC(metric::RMSE(evaluation_), 2.1684, 0.0979, 2.1138); } TEST_F(GradientBoostedTreesOnAbalone, L2Regularization) { @@ -1137,7 +1137,7 @@ TEST_F(GradientBoostedTreesOnAbalone, L2Regularization) { gradient_boosted_trees::proto::gradient_boosted_trees_config); gbt_config->set_l2_regularization(0.1f); TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::RMSE(evaluation_), 2.1339, 0.01); + YDF_TEST_METRIC(metric::RMSE(evaluation_), 2.1593, 0.0776, 2.1145); } TEST_F(GradientBoostedTreesOnAbalone, SparseOblique) { @@ -1146,7 +1146,7 @@ TEST_F(GradientBoostedTreesOnAbalone, SparseOblique) { gradient_boosted_trees::proto::gradient_boosted_trees_config); gbt_config->mutable_decision_tree()->mutable_sparse_oblique_split(); TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::RMSE(evaluation_), 2.079, 0.02); + YDF_TEST_METRIC(metric::RMSE(evaluation_), 2.1155, 0.0988, 2.1001); } TEST_F(GradientBoostedTreesOnAbalone, PoissonLoss) { @@ -1156,7 +1156,7 @@ TEST_F(GradientBoostedTreesOnAbalone, PoissonLoss) { gbt_config->set_loss(proto::Loss::POISSON); TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::RMSE(evaluation_), 2.15, 0.05); + YDF_TEST_METRIC(metric::RMSE(evaluation_), 2.1563, 0.0852, 2.1232); } class GradientBoostedTreesOnIris : public utils::TrainAndTestTester { @@ -1175,8 +1175,8 @@ class GradientBoostedTreesOnIris : public utils::TrainAndTestTester { TEST_F(GradientBoostedTreesOnIris, Base) { TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.9599, 0.02); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.22079, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.9533, 0.03, 0.96); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.2988, 0.2562, 0.2193); // Note: R RandomForest has an OOB accuracy of 0.9467. 
} @@ -1185,8 +1185,8 @@ TEST_F(GradientBoostedTreesOnIris, Hessian) { gradient_boosted_trees::proto::gradient_boosted_trees_config); gbt_config->set_use_hessian_gain(true); TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.9599, 0.02); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.1360, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.94, 0.05, 0.9733); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.3225, 0.3002, 0.138); } TEST_F(GradientBoostedTreesOnIris, Dart) { @@ -1196,8 +1196,8 @@ TEST_F(GradientBoostedTreesOnIris, Dart) { gbt_config->mutable_dart()->set_dropout_rate(0.1f); gbt_config->mutable_decision_tree()->set_num_candidate_attributes(8); TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.9599, 0.03); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.1618, 0.06); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.9467, 0.04, 0.9733); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.1925, 0.1226, 0.18); // Note: R RandomForest has an OOB accuracy of 0.9467. } @@ -1217,8 +1217,8 @@ class GradientBoostedTreesOnDNA : public utils::TrainAndTestTester { TEST_F(GradientBoostedTreesOnDNA, Base) { TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.9529, 0.02); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.1465, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.9507, 0.0108, 0.9517); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.1933, 0.08, 0.1446); // Note: R RandomForest has an OOB accuracy of 0.909. } @@ -1227,15 +1227,15 @@ TEST_F(GradientBoostedTreesOnDNA, Hessian) { gradient_boosted_trees::proto::gradient_boosted_trees_config); gbt_config->set_use_hessian_gain(true); TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.9554, 0.02); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.1397, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.9539, 0.0099, 0.9573); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.1831, 0.0743, 0.1403); } TEST_F(GradientBoostedTreesOnDNA, BaseBooleanAsNumerical) { guide_filename_ = "dna_guide.pbtxt"; TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.9529, 0.02); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.1465, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.9514, 0.0118, 0.9517); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.1757, 0.0568, 0.1446); // Note: R RandomForest has an OOB accuracy of 0.909. 
} @@ -1245,8 +1245,8 @@ TEST_F(GradientBoostedTreesOnDNA, HessianBooleanAsNumerical) { gbt_config->set_use_hessian_gain(true); guide_filename_ = "dna_guide.pbtxt"; TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.9548, 0.02); - YDF_EXPECT_METRIC_NEAR(metric::LogLoss(evaluation_), 0.1443, 0.04); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.9532, 0.0118, 0.9567); + YDF_TEST_METRIC(metric::LogLoss(evaluation_), 0.1813, 0.0716, 0.1422); } TEST(GradientBoostedTrees, SetHyperParameters) { diff --git a/yggdrasil_decision_forests/learner/gradient_boosted_trees/loss/loss_imp_binomial.cc b/yggdrasil_decision_forests/learner/gradient_boosted_trees/loss/loss_imp_binomial.cc index 4d7b3b54..d3c98509 100644 --- a/yggdrasil_decision_forests/learner/gradient_boosted_trees/loss/loss_imp_binomial.cc +++ b/yggdrasil_decision_forests/learner/gradient_boosted_trees/loss/loss_imp_binomial.cc @@ -240,6 +240,7 @@ void BinomialLogLikelihoodLoss::TemplatedLossImp( const std::vector& weights, size_t begin_example_idx, size_t end_example_idx, double* __restrict sum_loss, utils::IntegersConfusionMatrixDouble* confusion_matrix) { + double local_sum_loss = 0; for (size_t example_idx = begin_example_idx; example_idx < end_example_idx; example_idx++) { // The loss function expects a 0/1 label. @@ -250,19 +251,19 @@ void BinomialLogLikelihoodLoss::TemplatedLossImp( if constexpr (use_weights) { const float weight = weights[example_idx]; confusion_matrix->Add(labels[example_idx], predicted_label, weight); - *sum_loss -= + local_sum_loss -= 2 * weight * (label_for_loss * prediction - std::log(1.f + std::exp(prediction))); } else { confusion_matrix->Add(labels[example_idx], predicted_label, 1.f); // Loss: // -2 * ( label * prediction - log(1+exp(prediction))) - *sum_loss -= 2 * (label_for_loss * prediction - - std::log(1.f + std::exp(prediction))); - DCheckIsFinite(*sum_loss); + local_sum_loss -= 2 * (label_for_loss * prediction - + std::log(1.f + std::exp(prediction))); } - DCheckIsFinite(*sum_loss); + DCheckIsFinite(local_sum_loss); } + *sum_loss += local_sum_loss; } template diff --git a/yggdrasil_decision_forests/learner/multitasker/multitasker_test.cc b/yggdrasil_decision_forests/learner/multitasker/multitasker_test.cc index c552d27c..fc6d2aac 100644 --- a/yggdrasil_decision_forests/learner/multitasker/multitasker_test.cc +++ b/yggdrasil_decision_forests/learner/multitasker/multitasker_test.cc @@ -15,6 +15,8 @@ #include "yggdrasil_decision_forests/learner/multitasker/multitasker.h" +#include + #include "gmock/gmock.h" #include "gtest/gtest.h" #include "absl/flags/flag.h" @@ -79,7 +81,8 @@ TEST_F(MultitaskerOnAdult, Base) { t3->set_task(model::proto::Task::CLASSIFICATION); TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.860, 0.01); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.860, 0.01, + std::numeric_limits::quiet_NaN()); utils::RandomEngine rnd(1234); @@ -98,7 +101,8 @@ TEST_F(MultitaskerOnAdult, Base) { metric::proto::EvaluationOptions eval_options; eval_options.set_task(model::proto::Task::CLASSIFICATION); auto eval = submodel->Evaluate(test_dataset_, eval_options, &rnd); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(eval), 0.860, 0.01); + YDF_TEST_METRIC(metric::Accuracy(eval), 0.860, 0.01, + std::numeric_limits::quiet_NaN()); } { @@ -113,7 +117,8 @@ TEST_F(MultitaskerOnAdult, Base) { metric::proto::EvaluationOptions eval_options; eval_options.set_task(model::proto::Task::REGRESSION); auto eval = submodel->Evaluate(test_dataset_, 
eval_options, &rnd); - YDF_EXPECT_METRIC_NEAR(metric::RMSE(eval), 10.2048, 0.05); + YDF_TEST_METRIC(metric::RMSE(eval), 10.2048, 0.05, + std::numeric_limits::quiet_NaN()); } { @@ -128,7 +133,8 @@ TEST_F(MultitaskerOnAdult, Base) { metric::proto::EvaluationOptions eval_options; eval_options.set_task(model::proto::Task::CLASSIFICATION); auto eval = submodel->Evaluate(test_dataset_, eval_options, &rnd); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(eval), 0.76474, 0.01); + YDF_TEST_METRIC(metric::Accuracy(eval), 0.76474, 0.01, + std::numeric_limits::quiet_NaN()); } { @@ -185,7 +191,8 @@ TEST_F(MultitaskerOnAdult, Stacked) { t3->set_task(model::proto::Task::CLASSIFICATION); TrainAndEvaluateModel(); - YDF_EXPECT_METRIC_NEAR(metric::Accuracy(evaluation_), 0.860, 0.01); + YDF_TEST_METRIC(metric::Accuracy(evaluation_), 0.860, 0.01, + std::numeric_limits::quiet_NaN()); utils::RandomEngine rnd(1234); diff --git a/yggdrasil_decision_forests/learner/random_forest/random_forest_test.cc b/yggdrasil_decision_forests/learner/random_forest/random_forest_test.cc index f501b3f8..1621fe10 100644 --- a/yggdrasil_decision_forests/learner/random_forest/random_forest_test.cc +++ b/yggdrasil_decision_forests/learner/random_forest/random_forest_test.cc @@ -238,7 +238,7 @@ TEST_F(RandomForestOnAdult, Base) { EXPECT_LE(rank_capital_gain, 5); EXPECT_LE(rank_relationship, 5); - EXPECT_LE(rank_occupation, 5); + EXPECT_LE(rank_occupation, 6); // Worst 2 variables. const int rank_fnlwgt = utils::GetVariableImportanceRank( @@ -397,7 +397,7 @@ TEST_F(RandomForestOnAdult, NoWinnerTakeAllRandomCategorical) { rf_config->mutable_decision_tree()->mutable_categorical()->mutable_random(); TrainAndEvaluateModel(); EXPECT_NEAR(metric::Accuracy(evaluation_), 0.82618, 0.005); - EXPECT_NEAR(metric::LogLoss(evaluation_), 0.40623, 0.02); + EXPECT_NEAR(metric::LogLoss(evaluation_), 0.3817, 0.02); } TEST_F(RandomForestOnAdult, NoWinnerTakeAllExampleSampling) { @@ -570,7 +570,7 @@ TEST_F(RandomForestOnAdult, MaxNumNodes) { EXPECT_NEAR(metric::Accuracy(evaluation_), 0.862, 0.015); // Disabling winner take all reduce the logloss (as expected). - EXPECT_NEAR(metric::LogLoss(evaluation_), 0.368, 0.045); + EXPECT_NEAR(metric::LogLoss(evaluation_), 0.368, 0.06); } TEST_F(RandomForestOnAdult, SparseOblique) { @@ -634,7 +634,7 @@ TEST_F(RandomForestOnAbalone, Base) { absl::StrCat("csv:", oob_prediction_path)); TrainAndEvaluateModel(); - EXPECT_NEAR(metric::RMSE(evaluation_), 2.0825, 0.01); + EXPECT_NEAR(metric::RMSE(evaluation_), 2.0926, 0.01); // Check the oob predictions. const auto oob_predictions = file::GetContent(oob_prediction_path).value(); diff --git a/yggdrasil_decision_forests/utils/test_utils.cc b/yggdrasil_decision_forests/utils/test_utils.cc index 5e9387e1..1dbc4797 100644 --- a/yggdrasil_decision_forests/utils/test_utils.cc +++ b/yggdrasil_decision_forests/utils/test_utils.cc @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -69,6 +70,7 @@ namespace utils { namespace { +// Shuffles a dataset randomly. Does not rely on a static seed. void ShuffleDataset(dataset::VerticalDataset* dataset) { absl::BitGen bitgen; std::vector example_idxs(dataset->nrow()); @@ -77,6 +79,45 @@ void ShuffleDataset(dataset::VerticalDataset* dataset) { *dataset = dataset->Extract(example_idxs).value(); } +// Generates a random seed. Does not rely on a static seed. 
+int64_t RandomSeed() { + absl::BitGen bitgen; + return bitgen(); +} + +// Generates a deterministic sequence of boolean values that coarsely +// approximates sampling from a binomial distribution. +class DeterministicBinomial { + public: + DeterministicBinomial(const float rate) : rate_(rate) {} + + bool Sample() { + if (num_total_ == 0) { + // Always return false first, unless the rate is 1. + num_total_++; + if (rate_ == 1) { + num_pos_++; + return true; + } + return false; + } + + if (num_pos_ > rate_ * num_total_) { + num_total_++; + return false; + } else { + num_pos_++; + num_total_++; + return true; + } + } + + private: + float rate_; + int num_pos_ = 0; + int num_total_ = 0; +}; + } // namespace void TrainAndTestTester::ConfigureForSyntheticDataset() { @@ -136,9 +177,8 @@ void TrainAndTestTester::TrainAndEvaluateModel( // Configure the learner. CHECK_OK(model::GetLearner(train_config_, &learner_, deployment_config_)); - if (inject_random_noise_ && !learner_->training_config().has_random_seed()) { - absl::BitGen bitgen; - learner_->mutable_training_config()->set_random_seed(bitgen()); + if (change_random_seed_ && !learner_->training_config().has_random_seed()) { + learner_->mutable_training_config()->set_random_seed(RandomSeed()); } if (generic_parameters_.has_value()) { @@ -204,7 +244,7 @@ void TrainAndTestTester::TrainAndEvaluateModel( YDF_LOG(INFO) << "Training duration: " << training_duration_; // Evaluate the model. - utils::RandomEngine rnd(1234); + utils::RandomEngine rnd(1234); // Not used evaluation_ = model_->Evaluate(test_dataset_, eval_options_, &rnd); // Print the model evaluation. @@ -406,21 +446,22 @@ void TrainAndTestTester::BuildTrainValidTestDatasets( CHECK_OK(LoadVerticalDataset(train_path, data_spec, &dataset)); // Split the dataset in two folds: training and testing. - std::vector train_example_idxs, - test_example_idxs, valid_example_idxs; + std::vector train_example_idxs; + std::vector test_example_idxs; + std::vector valid_example_idxs; + + DeterministicBinomial sampling(dataset_sampling_); + DeterministicBinomial train_test_split(split_train_ratio_); + DeterministicBinomial test_valid_split(0.5f); + + // TODO: Make deterministic. utils::RandomEngine rnd(1234); std::uniform_real_distribution dist_01; - // If a validation example should be generated (i.e. - // pass_validation_dataset_=true), next_example_is_valid indicates if the next - // example will be used for validation or testing. - bool next_example_is_valid = true; for (dataset::VerticalDataset::row_t example_idx = 0; example_idx < dataset.nrow(); example_idx++) { // Down-sampling of examples. - // TODO: Make the split deterministic. - if (dataset_sampling_ < dist_01(rnd)) { + if (!sampling.Sample()) { continue; } @@ -438,21 +479,13 @@ void TrainAndTestTester::BuildTrainValidTestDatasets( } } - bool is_training_example; - if (split_train_ratio_ == 0.5f) { - // Deterministic split. - is_training_example = (example_idx % 2) == 0; - } else { - is_training_example = dist_01(rnd) < split_train_ratio_; - } + const bool is_training_example = train_test_split.Sample(); if (is_training_example) { train_example_idxs.push_back(example_idx); } else { - if (pass_validation_dataset_) { - (next_example_is_valid ?
valid_example_idxs : test_example_idxs) - .push_back(example_idx); - next_example_is_valid ^= true; + if (pass_validation_dataset_ && test_valid_split.Sample()) { + valid_example_idxs.push_back(example_idx); } else { test_example_idxs.push_back(example_idx); } @@ -720,7 +753,7 @@ void TestPredefinedHyperParameters( // Evaluate the model. if (min_accuracy.has_value()) { - utils::RandomEngine rnd(1234); + utils::RandomEngine rnd(1234); // Not used. const auto evaluation = model->Evaluate(test_ds, {}, &rnd); EXPECT_GE(metric::Accuracy(evaluation), min_accuracy.value()); } @@ -847,32 +880,53 @@ absl::Status ExportUpliftPredictionsToTFUpliftCsvFormat( void InternalExportMetricCondition(const absl::string_view test, const double value, const double center, - const double margin, + const double margin, const double golden, const absl::string_view metric, const int line, const absl::string_view file) { + // Margin of error when comparing golden metric values. + constexpr double kGoldenMargin = 0.0001; + const auto filename = file::GetBasename(file); - const auto abs_diff = std::abs(value - center); - const auto success = abs_diff < margin; -#ifdef EXPORT_METRIC_CONDITION - const auto uid = GenUniqueId(); - const auto path = - file::JoinPath(EXPORT_METRIC_CONDITION, absl::StrCat(uid, ".csv")); - std::string content = - absl::StrCat("test,value,center,margin,metric,line,filename,success\n", - test, ",", value, ",", center, ",", margin, ",", metric, ",", - line, ",", filename, ",", success); - CHECK_OK(file::SetContent(path, content)); -#endif - if (!success) { - EXPECT_TRUE(false) << "Non satified range condition for " << metric - << " in " << test << "\ndefined at\n" - << file << ":" << line << "\nThe metric value " << value - << " is not in " << center << " +- " << margin - << ".\ni.e. not in [" << (center - margin) << " , " - << (center + margin) - << "].\nThe absolute value of the difference is " - << abs_diff << "."; + const bool golden_test = kYdfTestMetricCheckGold && !std::isnan(golden); + + double abs_diff_margin = std::abs(value - center); + double abs_diff_golden = std::abs(value - golden); + bool success_margin = abs_diff_margin < margin; + bool success_golden = abs_diff_golden < kGoldenMargin; + + if (strlen(kYdfTestMetricDumpDir) > 0) { + // Export metric to csv file. + const auto uid = GenUniqueId(); + const auto path = + file::JoinPath(kYdfTestMetricDumpDir, absl::StrCat(uid, ".csv")); + std::string content = absl::StrCat( + "test,value,center,margin,metric,line,filename,success_margin,success_" "golden,golden\n", + test, ",", value, ",", center, ",", margin, ",", metric, ",", line, ",", + filename, ",", success_margin, ",", success_golden, ",", golden); + CHECK_OK(file::SetContent(path, content)); + } else { + if (!success_margin) { + EXPECT_TRUE(false) << "Unsatisfied range condition for " << metric + << " in " << test << "\ndefined at\n" + << file << ":" << line << "\nThe metric value " + << value << " is not in " << center << " +- " << margin + << ".\ni.e. 
not in [" << (center - margin) << " , " + << (center + margin) + << "].\nThe absolute value of the difference is " + << abs_diff_margin << "."; + } + + if (golden_test && !success_golden) { + EXPECT_TRUE(false) << "Unsatisfied golden value condition for " << metric + << " in " << test << "\ndefined at\n" + << file << ":" << line << "\nThe metric value " + << value << " is different from " << golden + << " (margin:" << kGoldenMargin + << ").\nThe absolute value of the difference is " + << abs_diff_golden << "."; + } } } diff --git a/yggdrasil_decision_forests/utils/test_utils.h b/yggdrasil_decision_forests/utils/test_utils.h index a2003dab..f281d884 100644 --- a/yggdrasil_decision_forests/utils/test_utils.h +++ b/yggdrasil_decision_forests/utils/test_utils.h @@ -168,15 +168,12 @@ class TrainAndTestTester : public ::testing::Test { // the logs. bool show_full_model_structure_ = false; - // If false, models trained and and evaluated in unit tests are expected to - // always be the same for a given implementation of the pseudo random number - // generator and a given version of the code. If true, training noise in - // injected through initial dataset shuffeling and randomization of the pseudo - // random number generator seed. - // - // TODO: Default to true. + // If true, shuffle the datasets in unit tests. bool inject_random_noise_ = false; + // If true, randomize learner seeds in unit tests. + bool change_random_seed_ = false; + private: std::pair GetTrainAndTestDatasetPaths(); @@ -335,13 +332,11 @@ absl::Status ExportUpliftPredictionsToTFUpliftCsvFormat( const model::AbstractModel& model, const dataset::VerticalDataset& dataset, absl::string_view output_csv_path); -// Internal implementation of "YDF_EXPECT_METRIC_NEAR". -void InternalExportMetricCondition(const absl::string_view test, - const double value, const double center, - const double margin, - const absl::string_view metric, - const int line, - const absl::string_view file); +// Internal implementation of "YDF_TEST_METRIC". +void InternalExportMetricCondition(absl::string_view test, double value, + double center, double margin, double gold, + absl::string_view metric, int line, + absl::string_view file); // Gets the name of the current test. template @@ -370,21 +365,40 @@ int GetVariableImportanceRank( } // namespace utils } // namespace yggdrasil_decision_forests -// Checks that "value" is in [center-margin, center+margin]. -#define YDF_EXPECT_METRIC_NEAR(value, center, margin) \ +// Checks that "value" is in [center-margin, center+margin] (margin test) and +// equal to "golden". If "kYdfTestMetricCheckGold=false" or if "golden=NaN", +// only the margin test is checked. +// +#define YDF_TEST_METRIC(value, center, margin, golden) \ ::yggdrasil_decision_forests::utils::InternalExportMetricCondition( \ ::yggdrasil_decision_forests::utils::InternalGetTestName(this), value, \ - center, margin, #value, __LINE__, __FILE__) + center, margin, golden, #value, __LINE__, __FILE__) -// If set, exports the metric conditions (both valid and invalid) tested by -// "YDF_EXPECT_METRIC_NEAR" in csv files in the directory specified by -// "EXPORT_METRIC_CONDITION". Note: The directory should be already existing. -// This command is compatible with "--runs_per_test" (e.g. --runs_per_test=50). +// TODO: Simplify protocol. +// +// The following block makes it possible to export unit-test evaluation metrics +// to csv files, analyse their distribution in a notebook, and possibly update +// the valid margins. 
// -// EXPORT_METRIC_CONDITION is especially useful with tests with -// "inject_random_noise_=true" in order to study the distibution of metrics and -// better adjust the valid range. +// If "kYdfTestMetricDumpDir" is set, the result of unit test metrics +// tested with "YDF_TEST_METRIC" are exported to csv files in the +// directory specified by "kYdfTestMetricDumpDir" (Note: The directory +// should already exist) and the tests become non-failing (i.e., if a +// metric is not in a valid range, the test does not fail). // -// #define EXPORT_METRIC_CONDITION "/tmp/metric_condition" +// YDF training is deterministic modulo changes in implementation of the random +// generator (or equivalent, e.g. change of default random seed, change of query +// order of the random generator) and floating point compiler optimizations. +// Stability of unit tests to random seeds can be tested with +// "change_random_seed_=True" in conjunction with value for "--runs_per_test" +// e.g. "--runs_per_test=100". +// + +// If set, export metrics to disk, and disable metric unit tests. +constexpr char kYdfTestMetricDumpDir[] = ""; +// To enable logging of unit test metrics. +// constexpr char kYdfTestMetricDumpDir[] = "/tmp/metric_condition"; + +constexpr bool kYdfTestMetricCheckGold = false; #endif // YGGDRASIL_DECISION_FORESTS_TOOL_TEST_UTILS_H_
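For reference, the range-recomputation recipe described above (run each test many times, e.g. with --runs_per_test and a randomized seed, center the check on the observed mean, and inflate the largest observed deviation by 50%) can be sketched as follows. This sketch is illustrative only and is not part of the patch: the SuggestRange helper, the safety_factor parameter, and the sample values are assumptions, and the exact aggregation used to recompute the committed centers and margins is not shown here.

// Illustrative sketch (not part of the patch): suggests a (center, margin)
// pair for YDF_TEST_METRIC from metric values collected over repeated runs,
// e.g. parsed from the csv files dumped via kYdfTestMetricDumpDir.
#include <algorithm>
#include <cmath>
#include <iostream>
#include <numeric>
#include <vector>

struct MetricRange {
  double center;
  double margin;
};

// A safety_factor of 1.5 corresponds to "adding a 50% margin" on top of the
// largest deviation observed across runs. Assumes "values" is non-empty.
MetricRange SuggestRange(const std::vector<double>& values,
                         double safety_factor = 1.5) {
  // Center: mean of the observed metric values.
  const double center =
      std::accumulate(values.begin(), values.end(), 0.0) / values.size();
  // Margin: largest observed deviation from the center, inflated.
  double max_deviation = 0.0;
  for (const double value : values) {
    max_deviation = std::max(max_deviation, std::abs(value - center));
  }
  return {center, max_deviation * safety_factor};
}

int main() {
  // Example: accuracies observed over several runs with randomized seeds.
  const std::vector<double> accuracies = {0.8641, 0.8652, 0.8630, 0.8660};
  const MetricRange range = SuggestRange(accuracies);
  std::cout << "center=" << range.center << " margin=" << range.margin << "\n";
  return 0;
}

The mean as center and an inflated maximum deviation as margin mirror the recipe stated in the commit message; other aggregations (e.g. quantiles) would work as well. When the learner is deterministic and kYdfTestMetricCheckGold is enabled, the fourth YDF_TEST_METRIC argument is additionally compared against the recorded golden value.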