Testing i386 commit

stephematician · Oct 27, 2023 · f85871e · f85871e
1 parent 2e8977c
commit f85871e
Show file tree

Hide file tree

Showing 6 changed files with 48 additions and 32 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,38 +1,49 @@
-literanger NEWS
-===============
+# Changelog - literanger 
 
-# version 0.0.2
+[![Common Changelog](https://common-changelog.org/badge.svg)](https://common-changelog.org)
 
-Performance enhancements
--   Faster (correct) test for number of candidate values in node splitting.
--   Remove lock on log gamma (beta splitting rule).
+## [0.0.3]() - 2023-xx-xx
 
-Bug-fixes
--   Fix container overrun and incorrect (unweighted) sampling without
-    replacement.
 
-Documentation fixes
--   Incorrect spelling of Breiman and missing reference in README
--   Added github links
 
+## [0.0.2](https://github.com/stephematician/literanger/releases/tag/v0.0.2) - 2023-07-11
 
-# version 0.0.1
+Update to pass CRAN's ASAN check
 
-This is the initial release of literanger, a refactoring and adaptation of the
-ranger package <https://github.com/imbs-hl/ranger> for random forests. The
-purpose of this update was to refactor the prediction code to enable efficient
-prediction when embedded into the multiple imputation algorithm proposed by
+### Changed
+
+-   Improve performance of node splitting ([`d3f6424`](https://github.com/stephematician/literanger/commit/d3f64245))
+
+### Added
+
+-   Add re-entrant log gamma to speed up beta splitting rule
+    ([`d7f058d`](https://github.com/stephematician/literanger/commit/d7f058dd))
+-   Minor fixes to documentation ([`91b6c6d`](https://github.com/stephematician/literanger/commit/91b6c6d),
+    [`0f62d02`](https://github.com/stephematician/literanger/commit/0f62d027))
+
+### Fixed
+
+-   Fix potential illegal access and incorrect unweighted sampling without
+    replacement ([`b6df5d9`](https://github.com/stephematician/literanger/commit/b6df5d9))
+
+
+## [0.0.1](https://github.com/stephematician/literanger/releases/tag/v0.0.1) - 2023-06-25
+
+_First release_
+
+A refactoring and adaptation of the ranger package
+<https://github.com/imbs-hl/ranger> for random forests. Has faster prediction
+mode intended for embedding into the multiple imputation algorithm proposed by
 Doove et al in:
 
 Doove, L. L., Van Buuren, S., & Dusseldorp, E. (2014). Recursive partitioning
 for missing data imputation in the presence of interaction effects.
 _Computational statistics & data analysis_, 72, 92-104.
 
-Currently supports:
--   Classification and regression trees/forests.
--   Prediction types:
-    -   Conventional 'bagged' prediction (most frequent value or mean).
-    -   Terminal node identifiers for all trees.
-    -   Prediction given by drawing a tree for each prediction and then drawing
-        an in-bag value from the terminal node.
+### Added
+
+-   Fit classification and regression trees
+-   Prediction via most frequent value or mean
+-   Get predictions as terminal node identifiers in each tree or as a random
+    draw from inbag values in a random tree
 
diff --git a/cran-comments.md b/cran-comments.md
@@ -7,7 +7,7 @@ Submit version 0.0.2: bug fix and performance improvements.
     https://www.stats.ox.ac.uk/pub/bdr/memtests/clang-ASAN/literanger/00check.log
 
 
-# `win-builder` R CMD CHECK results 
+## `win-builder` R CMD CHECK results 
 
 ```
   Maintainer: 'Stephen Wade <[email protected]>'
@@ -32,7 +32,7 @@ Submit version 0.0.2: bug fix and performance improvements.
 -   The misspelled words above are names.
 
 
-# Ubuntu 22.04 (my machine) R CMD CHECK results
+## Ubuntu 22.04 (my machine) R CMD CHECK results
 
 ```
 ── R CMD check results ─────────────────────────────────── literanger 0.0.2 ────

diff --git a/src/DataSparse.h b/src/DataSparse.h
@@ -114,7 +114,7 @@ inline double DataSparse::get_x(const size_t sample_key,
     /* test this TODO: */
     using int_t = cpp11::integers::value_type;
     const int_t j_start = x_p[predictor_key];
-    const int_t j_end = x_p[predictor_key + 1l];
+    const int_t j_end = x_p[predictor_key + (size_t)1];
     if (j_start == j_end) return 0.0;
 
     const int_t row_offset = as_row_offset(sample_key, permute);

diff --git a/src/Tree.defn.h b/src/Tree.defn.h
@@ -71,7 +71,8 @@ void Tree<ImplT>::predict(const std::shared_ptr<const Data> data,
             }
         } else {
           /* NOTE: probably unsafe */
-            const ull_bitenc split_enc = *((size_t *)(&split_values[node_key]));
+            const ull_bitenc split_enc =
+                *((unsigned long long *)(&split_values[node_key]));
             if (!split_enc.test(std::floor(value) - 1)) {
                 node_key = left_children[node_key];
             } else {
@@ -275,11 +276,12 @@ void Tree<ImplT>::best_decrease_by_value_extratrees_unordered(
    * is drawn randomly from all available partitions that put at least one of
    * the observed levels to the right. */
     auto to_partition_key = [&](size_t j){
+        using ull_rng_t = std::uniform_int_distribution<unsigned long long>;
         ull_bitenc key = 0;
         { /* don't allow full or empty for splitting on present values */
             const size_t n_partition =
                 (2ull << (is_in_node.count() - 1ull)) - 2ull;
-            std::uniform_int_distribution<size_t> U_rng(1, n_partition);
+            ull_rng_t U_rng(1, n_partition);
 
             const ull_bitenc drawn_in_partition = U_rng(gen);
             size_t key_j = 0;
@@ -293,7 +295,7 @@ void Tree<ImplT>::best_decrease_by_value_extratrees_unordered(
         { /* allow full or empty for splitting on non-present values */
             const size_t n_partition =
                 (2ull << (is_ex_node.count() - 1ull)) - 1ull;
-            std::uniform_int_distribution<size_t> U_rng(0, n_partition);
+            ull_rng_t U_rng(0, n_partition);
 
             const ull_bitenc drawn_ex_partition = U_rng(gen);
             size_t key_j = 0;

diff --git a/src/TreeParameters.h b/src/TreeParameters.h
@@ -47,7 +47,7 @@ struct TreeParameters {
      * each tree.
      * @param[in] n_try The number of candidate predictors for each split.
      * @param[in] draw_always_predictor_keys The key of each predictor that will
-     * always be a candidate for splitting.
+     * always be a candidate for splitting (sorted by key).
      * @param[in] draw_predictor_weights Weights for each predictor when drawing
      * candidates.
      * @param[in] split_rule The rule for identifying the best split.

diff --git a/src/utility.h b/src/utility.h
@@ -115,7 +115,7 @@ PtrT<std::vector<bool>> make_is_ordered(
 
 
 /** Make a container of keys for the predictors that are always
- * candidates.
+ * candidates (sorted by key).
  * @param[in] predictor_names The names of the predictor variables in the order
  * they (will) appear in the data.
  * @param[in] names_of_always_draw The names of predictor that will always
@@ -293,6 +293,9 @@ PtrT<std::vector<size_t>> make_draw_always_predictor_keys(
             "splitting plus 'n_try' cannot be larger than total number of "
             "predictors (columns)");
 
+  /* must be sorted to pass to draw_no_replace */
+    std::sort(result.begin(), result.end());
+
     return result;
 }