From 1f65943ec7334bc60b58ff0b80e12b28a16d53ba Mon Sep 17 00:00:00 2001 From: Sebastian Funk Date: Mon, 16 Sep 2024 13:56:38 +0100 Subject: [PATCH 01/18] remove check for model column --- R/check-input-helpers.R | 29 ---------------- R/forecast.R | 40 ++++++----------------- man/as_forecast.Rd | 4 --- man/as_forecast_binary.Rd | 4 --- man/as_forecast_generic.Rd | 7 +--- man/as_forecast_nominal.Rd | 4 --- man/as_forecast_point.Rd | 4 --- man/as_forecast_quantile.Rd | 4 --- man/as_forecast_sample.Rd | 4 --- man/ensure_model_column.Rd | 21 ------------ tests/testthat/test-forecast.R | 13 ++------ tests/testthat/test-input-check-helpers.R | 14 -------- 12 files changed, 14 insertions(+), 134 deletions(-) delete mode 100644 man/ensure_model_column.Rd diff --git a/R/check-input-helpers.R b/R/check-input-helpers.R index 1a2c71da5..ea16b0c18 100644 --- a/R/check-input-helpers.R +++ b/R/check-input-helpers.R @@ -40,35 +40,6 @@ check_try <- function(expr) { return(msg) } - -#' @title Assure that data has a `model` column -#' -#' @description -#' Check whether the data.table has a column called `model`. -#' If not, a column called `model` is added with the value `Unspecified model`. -#' @inheritParams as_forecast -#' @importFrom cli cli_inform -#' @importFrom checkmate assert_data_table -#' @return The data.table with a column called `model` -#' @keywords internal_input_check -ensure_model_column <- function(data) { - assert_data_table(data) - if (!("model" %in% colnames(data))) { - #nolint start: keyword_quote_linter - cli_warn( - c( - "!" = "There is no column called `model` in the data.", - "i" = "scoringutils assumes that all forecasts come from the - same model" - ) - ) - #nolint end - data[, model := "Unspecified model"] - } - return(data[]) -} - - #' Check that all forecasts have the same number of quantiles or samples #' @description #' Function checks the number of quantiles or samples per forecast. diff --git a/R/forecast.R b/R/forecast.R index 4165d2d7f..bc5e24a19 100644 --- a/R/forecast.R +++ b/R/forecast.R @@ -12,7 +12,7 @@ #' #' The `as_forecast_()` functions give users some control over how their #' data is parsed. -#' Using the arguments `observed`, `predicted`, `model`, etc. users can rename +#' Using the arguments `observed`, `predicted`, etc. users can rename #' existing columns of their input data to match the required columns for a #' forecast object. Using the argument `forecast_unit`, users can specify the #' the columns that uniquely identify a single forecast (and remove the others, @@ -37,9 +37,6 @@ #' observed values. This column will be renamed to "observed". #' @param predicted (optional) Name of the column in `data` that contains the #' predicted values. This column will be renamed to "predicted". -#' @param model (optional) Name of the column in `data` that contains the names -#' of the models/forecasters that generated the predicted values. -#' This column will be renamed to "model". 
#' @inheritSection forecast_types Forecast types and input formats #' @inheritSection forecast_types Forecast unit #' @return @@ -72,8 +69,7 @@ NULL as_forecast_generic <- function(data, forecast_unit = NULL, observed = NULL, - predicted = NULL, - model = NULL) { + predicted = NULL) { # check inputs - general data <- ensure_data.table(data) assert_character(observed, len = 1, null.ok = TRUE) @@ -82,9 +78,6 @@ as_forecast_generic <- function(data, assert_character(predicted, len = 1, null.ok = TRUE) assert_subset(predicted, names(data), empty.ok = TRUE) - assert_character(model, len = 1, null.ok = TRUE) - assert_subset(model, names(data), empty.ok = TRUE) - # rename columns - general if (!is.null(observed)) { setnames(data, old = observed, new = "observed") @@ -92,12 +85,6 @@ as_forecast_generic <- function(data, if (!is.null(predicted)) { setnames(data, old = predicted, new = "predicted") } - if (!is.null(model)) { - setnames(data, old = model, new = "model") - } - - # ensure that a model column is present after renaming - ensure_model_column(data) # set forecast unit (error handling is done in `set_forecast_unit()`) if (!is.null(forecast_unit)) { @@ -119,9 +106,8 @@ as_forecast_generic <- function(data, as_forecast_binary <- function(data, forecast_unit = NULL, observed = NULL, - predicted = NULL, - model = NULL) { - data <- as_forecast_generic(data, forecast_unit, observed, predicted, model) + predicted = NULL) { + data <- as_forecast_generic(data, forecast_unit, observed, predicted) data <- new_forecast(data, "forecast_binary") assert_forecast(data) return(data) @@ -149,9 +135,8 @@ as_forecast_point.default <- function(data, forecast_unit = NULL, observed = NULL, predicted = NULL, - model = NULL, ...) { - data <- as_forecast_generic(data, forecast_unit, observed, predicted, model) + data <- as_forecast_generic(data, forecast_unit, observed, predicted) data <- new_forecast(data, "forecast_point") assert_forecast(data) return(data) @@ -206,7 +191,6 @@ as_forecast_quantile.default <- function(data, forecast_unit = NULL, observed = NULL, predicted = NULL, - model = NULL, quantile_level = NULL, ...) 
{ assert_character(quantile_level, len = 1, null.ok = TRUE) @@ -215,7 +199,7 @@ as_forecast_quantile.default <- function(data, setnames(data, old = quantile_level, new = "quantile_level") } - data <- as_forecast_generic(data, forecast_unit, observed, predicted, model) + data <- as_forecast_generic(data, forecast_unit, observed, predicted) data <- new_forecast(data, "forecast_quantile") assert_forecast(data) return(data) @@ -280,7 +264,6 @@ as_forecast_sample <- function(data, forecast_unit = NULL, observed = NULL, predicted = NULL, - model = NULL, sample_id = NULL) { assert_character(sample_id, len = 1, null.ok = TRUE) assert_subset(sample_id, names(data), empty.ok = TRUE) @@ -288,7 +271,7 @@ as_forecast_sample <- function(data, setnames(data, old = sample_id, new = "sample_id") } - data <- as_forecast_generic(data, forecast_unit, observed, predicted, model) + data <- as_forecast_generic(data, forecast_unit, observed, predicted) data <- new_forecast(data, "forecast_sample") assert_forecast(data) return(data) @@ -312,7 +295,6 @@ as_forecast_nominal <- function(data, forecast_unit = NULL, observed = NULL, predicted = NULL, - model = NULL, predicted_label = NULL) { assert_character(predicted_label, len = 1, null.ok = TRUE) assert_subset(predicted_label, names(data), empty.ok = TRUE) @@ -320,7 +302,7 @@ as_forecast_nominal <- function(data, setnames(data, old = predicted_label, new = "predicted_label") } - data <- as_forecast_generic(data, forecast_unit, observed, predicted, model) + data <- as_forecast_generic(data, forecast_unit, observed, predicted) data <- new_forecast(data, "forecast_nominal") assert_forecast(data) return(data) @@ -523,7 +505,7 @@ validate_forecast <- function(forecast, forecast_type = NULL, verbose = TRUE) { #' The function runs input checks that apply to all input data, regardless of #' forecast type. The function #' - asserts that the forecast is a data.table which has columns `observed` and -#' `predicted`, as well as a column called `model`. +#' `predicted` #' - checks the forecast type and forecast unit #' - checks there are no duplicate forecasts #' - if appropriate, checks the number of samples / quantiles is the same @@ -539,7 +521,7 @@ validate_forecast <- function(forecast, forecast_type = NULL, verbose = TRUE) { assert_forecast_generic <- function(data, verbose = TRUE) { # check that data is a data.table and that the columns look fine assert_data_table(data, min.rows = 1) - assert(check_columns_present(data, c("observed", "predicted", "model"))) + assert(check_columns_present(data, c("observed", "predicted"))) problem <- test_columns_present(data, c("sample_id", "quantile_level")) if (problem) { cli_abort( @@ -616,7 +598,6 @@ clean_forecast <- function(forecast, copy = FALSE, na.omit = FALSE) { #' @description #' Construct a class based on a data.frame or similar. The constructor #' - coerces the data into a data.table -#' - makes sure that a column called `model` exists and if not creates one #' - assigns a class #' #' @inheritParams as_forecast @@ -626,7 +607,6 @@ clean_forecast <- function(forecast, copy = FALSE, na.omit = FALSE) { #' @keywords internal new_forecast <- function(data, classname) { data <- as.data.table(data) - data <- ensure_model_column(data) class(data) <- c(classname, "forecast", class(data)) data <- copy(data) return(data[]) diff --git a/man/as_forecast.Rd b/man/as_forecast.Rd index f57886dca..ce041f6cc 100644 --- a/man/as_forecast.Rd +++ b/man/as_forecast.Rd @@ -20,10 +20,6 @@ observed values. 
This column will be renamed to "observed".} \item{predicted}{(optional) Name of the column in \code{data} that contains the predicted values. This column will be renamed to "predicted".} - -\item{model}{(optional) Name of the column in \code{data} that contains the names -of the models/forecasters that generated the predicted values. -This column will be renamed to "model".} } \value{ Depending on the forecast type, an object of the following class will be diff --git a/man/as_forecast_binary.Rd b/man/as_forecast_binary.Rd index e856a9bd1..b64788b73 100644 --- a/man/as_forecast_binary.Rd +++ b/man/as_forecast_binary.Rd @@ -29,10 +29,6 @@ observed values. This column will be renamed to "observed".} \item{predicted}{(optional) Name of the column in \code{data} that contains the predicted values. This column will be renamed to "predicted".} - -\item{model}{(optional) Name of the column in \code{data} that contains the names -of the models/forecasters that generated the predicted values. -This column will be renamed to "model".} } \description{ Create a \code{forecast} object for binary forecasts. See more information on diff --git a/man/as_forecast_generic.Rd b/man/as_forecast_generic.Rd index c69663df2..3895e756e 100644 --- a/man/as_forecast_generic.Rd +++ b/man/as_forecast_generic.Rd @@ -8,8 +8,7 @@ as_forecast_generic( data, forecast_unit = NULL, observed = NULL, - predicted = NULL, - model = NULL + predicted = NULL ) } \arguments{ @@ -29,10 +28,6 @@ observed values. This column will be renamed to "observed".} \item{predicted}{(optional) Name of the column in \code{data} that contains the predicted values. This column will be renamed to "predicted".} - -\item{model}{(optional) Name of the column in \code{data} that contains the names -of the models/forecasters that generated the predicted values. -This column will be renamed to "model".} } \description{ Common functionality for \verb{as_forecast_} functions diff --git a/man/as_forecast_nominal.Rd b/man/as_forecast_nominal.Rd index 178859f12..b08f17fe7 100644 --- a/man/as_forecast_nominal.Rd +++ b/man/as_forecast_nominal.Rd @@ -31,10 +31,6 @@ observed values. This column will be renamed to "observed".} \item{predicted}{(optional) Name of the column in \code{data} that contains the predicted values. This column will be renamed to "predicted".} -\item{model}{(optional) Name of the column in \code{data} that contains the names -of the models/forecasters that generated the predicted values. -This column will be renamed to "model".} - \item{predicted_label}{(optional) Name of the column in \code{data} that denotes the outcome to which a predicted probability corresponds to. This column will be renamed to "predicted_label". Only applicable to diff --git a/man/as_forecast_point.Rd b/man/as_forecast_point.Rd index 690679c92..f829697a8 100644 --- a/man/as_forecast_point.Rd +++ b/man/as_forecast_point.Rd @@ -38,10 +38,6 @@ observed values. This column will be renamed to "observed".} \item{predicted}{(optional) Name of the column in \code{data} that contains the predicted values. This column will be renamed to "predicted".} - -\item{model}{(optional) Name of the column in \code{data} that contains the names -of the models/forecasters that generated the predicted values. -This column will be renamed to "model".} } \description{ Create a \code{forecast} object for point forecasts. 
See more information on diff --git a/man/as_forecast_quantile.Rd b/man/as_forecast_quantile.Rd index 599e29e4a..ac0363eac 100644 --- a/man/as_forecast_quantile.Rd +++ b/man/as_forecast_quantile.Rd @@ -45,10 +45,6 @@ observed values. This column will be renamed to "observed".} \item{predicted}{(optional) Name of the column in \code{data} that contains the predicted values. This column will be renamed to "predicted".} -\item{model}{(optional) Name of the column in \code{data} that contains the names -of the models/forecasters that generated the predicted values. -This column will be renamed to "model".} - \item{quantile_level}{(optional) Name of the column in \code{data} that contains the quantile level of the predicted values. This column will be renamed to "quantile_level". Only applicable to quantile-based forecasts.} diff --git a/man/as_forecast_sample.Rd b/man/as_forecast_sample.Rd index 443c1d29e..d74a99e60 100644 --- a/man/as_forecast_sample.Rd +++ b/man/as_forecast_sample.Rd @@ -31,10 +31,6 @@ observed values. This column will be renamed to "observed".} \item{predicted}{(optional) Name of the column in \code{data} that contains the predicted values. This column will be renamed to "predicted".} -\item{model}{(optional) Name of the column in \code{data} that contains the names -of the models/forecasters that generated the predicted values. -This column will be renamed to "model".} - \item{sample_id}{(optional) Name of the column in \code{data} that contains the sample id. This column will be renamed to "sample_id". Only applicable to sample-based forecasts.} diff --git a/man/ensure_model_column.Rd b/man/ensure_model_column.Rd deleted file mode 100644 index 221df0304..000000000 --- a/man/ensure_model_column.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/check-input-helpers.R -\name{ensure_model_column} -\alias{ensure_model_column} -\title{Assure that data has a \code{model} column} -\usage{ -ensure_model_column(data) -} -\arguments{ -\item{data}{A data.frame (or similar) with predicted and observed values. -See the details section of \code{\link[=as_forecast]{as_forecast()}} for additional information -on required input formats.} -} -\value{ -The data.table with a column called \code{model} -} -\description{ -Check whether the data.table has a column called \code{model}. -If not, a column called \code{model} is added with the value \verb{Unspecified model}. 
-} -\keyword{internal_input_check} diff --git a/tests/testthat/test-forecast.R b/tests/testthat/test-forecast.R index 2ae498805..a116e97f7 100644 --- a/tests/testthat/test-forecast.R +++ b/tests/testthat/test-forecast.R @@ -43,12 +43,12 @@ test_that("as_forecast() works as expected", { # additional test with renaming the model column test <- na.omit(data.table::copy(example_sample_continuous)) data.table::setnames(test, - old = c("observed", "predicted", "sample_id", "model"), - new = c("obs", "pred", "sample", "mod") + old = c("observed", "predicted", "sample_id"), + new = c("obs", "pred", "sample") ) expect_no_condition( as_forecast_sample(test, - observed = "obs", predicted = "pred", model = "mod", + observed = "obs", predicted = "pred", forecast_unit = c( "location", "model", "target_type", "target_end_date", "horizon" @@ -128,13 +128,6 @@ test_that("as_forecast() function throws an error with duplicate forecasts", { ) }) -test_that("as_forecast_quantile() function warns when no model column is present", { - no_model <- as.data.table(example_quantile[model == "EuroCOVIDhub-ensemble"])[, model := NULL][] - expect_warning( - as_forecast_quantile(no_model), - "There is no column called `model` in the data.") -}) - test_that("as_forecast_quantile() function throws an error when no predictions or observed values are present", { expect_error(suppressMessages(suppressWarnings(as_forecast_quantile( data.table::copy(example_quantile)[, predicted := NULL] diff --git a/tests/testthat/test-input-check-helpers.R b/tests/testthat/test-input-check-helpers.R index 0b200d2b8..1c00215cf 100644 --- a/tests/testthat/test-input-check-helpers.R +++ b/tests/testthat/test-input-check-helpers.R @@ -10,20 +10,6 @@ test_that("Check equal length works if all arguments have length 1", { expect_equal(out, 0.05) }) -test_that("ensure_model_column works", { - test <- as.data.table(example_binary) - expect_warning( - ensure_model_column(test[, model := NULL]), - "There is no column called `model` in the data." - ) - expect_true( - setequal( - ensure_model_column(example_binary), - example_binary - ) - ) -}) - test_that("check_number_per_forecast works", { expect_identical( capture.output( From 42d149c9395ae4ac0be285bef825ed2decf9117d Mon Sep 17 00:00:00 2001 From: Sebastian Funk Date: Mon, 16 Sep 2024 14:02:31 +0100 Subject: [PATCH 02/18] remove default from summarise scores --- R/summarise_scores.R | 8 +++++--- man/as_forecast.Rd | 2 +- man/as_forecast_binary.Rd | 3 +-- man/as_forecast_nominal.Rd | 1 - man/as_forecast_point.Rd | 1 - man/as_forecast_quantile.Rd | 1 - man/as_forecast_sample.Rd | 1 - man/assert_forecast_generic.Rd | 2 +- man/new_forecast.Rd | 1 - man/summarise_scores.Rd | 10 ++++++---- 10 files changed, 14 insertions(+), 16 deletions(-) diff --git a/R/summarise_scores.R b/R/summarise_scores.R index 50f192c98..b87c89b13 100644 --- a/R/summarise_scores.R +++ b/R/summarise_scores.R @@ -13,8 +13,10 @@ #' #' @param scores An object of class `scores` (a data.table with #' scores and an additional attribute `metrics` as produced by [score()]). -#' @param by Character vector with column names to summarise scores by. Default -#' is `model`, meaning that there will be one score per model in the output. +#' @param by Character vector with column names to summarise scores by. An +#' example here would be something like a `model` column when summarising +#' scores by model. Default is an empty character vector, which means that +#' scores are summarised without grouping. 
#' @param fun A function used for summarising scores. Default is [mean()]. #' @param ... Additional parameters that can be passed to the summary function #' provided to `fun`. For more information see the documentation of the @@ -50,7 +52,7 @@ #' @keywords scoring summarise_scores <- function(scores, - by = "model", + by = character(0), fun = mean, ...) { # input checking ------------------------------------------------------------ diff --git a/man/as_forecast.Rd b/man/as_forecast.Rd index ce041f6cc..42f1525ef 100644 --- a/man/as_forecast.Rd +++ b/man/as_forecast.Rd @@ -40,7 +40,7 @@ on the expected input formats. The \verb{as_forecast_()} functions give users some control over how their data is parsed. -Using the arguments \code{observed}, \code{predicted}, \code{model}, etc. users can rename +Using the arguments \code{observed}, \code{predicted}, etc. users can rename existing columns of their input data to match the required columns for a forecast object. Using the argument \code{forecast_unit}, users can specify the the columns that uniquely identify a single forecast (and remove the others, diff --git a/man/as_forecast_binary.Rd b/man/as_forecast_binary.Rd index b64788b73..160b3ffcb 100644 --- a/man/as_forecast_binary.Rd +++ b/man/as_forecast_binary.Rd @@ -8,8 +8,7 @@ as_forecast_binary( data, forecast_unit = NULL, observed = NULL, - predicted = NULL, - model = NULL + predicted = NULL ) } \arguments{ diff --git a/man/as_forecast_nominal.Rd b/man/as_forecast_nominal.Rd index b08f17fe7..badc43b8d 100644 --- a/man/as_forecast_nominal.Rd +++ b/man/as_forecast_nominal.Rd @@ -9,7 +9,6 @@ as_forecast_nominal( forecast_unit = NULL, observed = NULL, predicted = NULL, - model = NULL, predicted_label = NULL ) } diff --git a/man/as_forecast_point.Rd b/man/as_forecast_point.Rd index f829697a8..983ca7d89 100644 --- a/man/as_forecast_point.Rd +++ b/man/as_forecast_point.Rd @@ -13,7 +13,6 @@ as_forecast_point(data, ...) forecast_unit = NULL, observed = NULL, predicted = NULL, - model = NULL, ... ) diff --git a/man/as_forecast_quantile.Rd b/man/as_forecast_quantile.Rd index ac0363eac..96ffe1c71 100644 --- a/man/as_forecast_quantile.Rd +++ b/man/as_forecast_quantile.Rd @@ -13,7 +13,6 @@ as_forecast_quantile(data, ...) forecast_unit = NULL, observed = NULL, predicted = NULL, - model = NULL, quantile_level = NULL, ... ) diff --git a/man/as_forecast_sample.Rd b/man/as_forecast_sample.Rd index d74a99e60..78bc80952 100644 --- a/man/as_forecast_sample.Rd +++ b/man/as_forecast_sample.Rd @@ -9,7 +9,6 @@ as_forecast_sample( forecast_unit = NULL, observed = NULL, predicted = NULL, - model = NULL, sample_id = NULL ) } diff --git a/man/assert_forecast_generic.Rd b/man/assert_forecast_generic.Rd index 2f1247227..89fa6aaae 100644 --- a/man/assert_forecast_generic.Rd +++ b/man/assert_forecast_generic.Rd @@ -21,7 +21,7 @@ The function runs input checks that apply to all input data, regardless of forecast type. The function \itemize{ \item asserts that the forecast is a data.table which has columns \code{observed} and -\code{predicted}, as well as a column called \code{model}. 
+\code{predicted} \item checks the forecast type and forecast unit \item checks there are no duplicate forecasts \item if appropriate, checks the number of samples / quantiles is the same diff --git a/man/new_forecast.Rd b/man/new_forecast.Rd index 4f8bc2acb..d22e48fff 100644 --- a/man/new_forecast.Rd +++ b/man/new_forecast.Rd @@ -20,7 +20,6 @@ An object of the class indicated by \code{classname} Construct a class based on a data.frame or similar. The constructor \itemize{ \item coerces the data into a data.table -\item makes sure that a column called \code{model} exists and if not creates one \item assigns a class } } diff --git a/man/summarise_scores.Rd b/man/summarise_scores.Rd index 8be524a8d..42a4fae2b 100644 --- a/man/summarise_scores.Rd +++ b/man/summarise_scores.Rd @@ -5,16 +5,18 @@ \alias{summarize_scores} \title{Summarise scores as produced by \code{\link[=score]{score()}}} \usage{ -summarise_scores(scores, by = "model", fun = mean, ...) +summarise_scores(scores, by = character(0), fun = mean, ...) -summarize_scores(scores, by = "model", fun = mean, ...) +summarize_scores(scores, by = character(0), fun = mean, ...) } \arguments{ \item{scores}{An object of class \code{scores} (a data.table with scores and an additional attribute \code{metrics} as produced by \code{\link[=score]{score()}}).} -\item{by}{Character vector with column names to summarise scores by. Default -is \code{model}, meaning that there will be one score per model in the output.} +\item{by}{Character vector with column names to summarise scores by. An +example here would be something like a \code{model} column when summarising +scores by model. Default is an empty character vector, which means that +scores are summarised without grouping.} \item{fun}{A function used for summarising scores. Default is \code{\link[=mean]{mean()}}.} From 72b05c0e16319755daa113253b3f6c5f49830cf0 Mon Sep 17 00:00:00 2001 From: Sebastian Funk Date: Wed, 18 Sep 2024 13:55:47 +0100 Subject: [PATCH 03/18] split `by` argument to `pairwise_comparison()` --- R/pairwise-comparisons.R | 199 +++++++++++++++++++++------------------ 1 file changed, 107 insertions(+), 92 deletions(-) diff --git a/R/pairwise-comparisons.R b/R/pairwise-comparisons.R index 9a295a045..e2af84c0e 100644 --- a/R/pairwise-comparisons.R +++ b/R/pairwise-comparisons.R @@ -58,14 +58,15 @@ #' `permutationTest` from the `surveillance` package by Michael Höhle, #' Andrea Riebler and Michaela Paul. #' -#' @param by Character vector with column names that define the grouping level -#' for the pairwise comparisons. By default (`model`), there will be one -#' relative skill score per model. If, for example, -#' `by = c("model", "location")`. Then you will get a -#' separate relative skill score for every model in every location. Internally, -#' the data.table with scores will be split according `by` (removing "model" -#' before splitting) and the pairwise comparisons will be computed separately -#' for the split data.tables. +#' @param compare Character vector with a single colum name that defines the +#' elements for the pairwise comparison. For example, if this is set to +#' "model", then elements of the "model" column will be compared. +#' @param by Character vector with column names that define further grouping +#' levels for the pairwise comparisons. By default this is an empty character +#' vector and there will be one relative skill score per distinct entry of the +#' column selected in `compare`. 
If further columns are given here, for +#' example, `by = "location"`, then you a separate relative skill score is +#' calculated for every model in every location. #' @param metric A string with the name of the metric for which #' a relative skill shall be computed. By default this is either "crps", #' "wis" or "brier_score" if any of these are available. @@ -102,9 +103,12 @@ #' as_forecast_quantile() %>% #' score() #' -#' pairwise <- get_pairwise_comparisons(scores, by = "target_type") +#' pairwise <- get_pairwise_comparisons( +#' scores, compare = "model", by = "target_type" +#' ) #' pairwise2 <- get_pairwise_comparisons( -#' scores, by = "target_type", baseline = "EuroCOVIDhub-baseline" +#' scores, compare = "model", by = target_type", +#' baseline = "EuroCOVIDhub-baseline" #' ) #' #' library(ggplot2) @@ -113,7 +117,8 @@ get_pairwise_comparisons <- function( scores, - by = "model", + compare, + by = character(0), metric = intersect(c("wis", "crps", "brier_score"), names(scores)), baseline = NULL, ... @@ -130,25 +135,29 @@ get_pairwise_comparisons <- function( assert_subset(metric, metrics, empty.ok = FALSE) assert_character(metric, len = 1) - # check that model column + columns in 'by' are present + # check that columns in 'by' are present #nolint start: keyword_quote_linter object_usage_linter - by_cols <- check_columns_present(scores, by) - if (!isTRUE(by_cols)) { - cli_abort( - c( - "!" = "Not all columns specified in `by` are present: {.var {by_cols}}" + if (length(by) > 0) { + by_cols <- check_columns_present(scores, by) + if (!isTRUE(by_cols)) { + cli_abort( + c( + "!" = "Not all columns specified in `by` are present: {.var {by_cols}}" + ) ) - ) - #nolint end + #nolint end + } } - assert(check_columns_present(scores, "model")) - # check that baseline is one of the existing models - models <- as.vector(unique(scores$model)) - assert_subset(baseline, models) + # check that 'compare' column in is are present + assert(check_columns_present(scores, compare)) + + # check that baseline exists + comparators <- as.vector(unique(scores[[compare]])) + assert_subset(baseline, comparators) - # check there are enough models - if (length(setdiff(models, baseline)) < 2) { + # check there are enough comparators + if (length(setdiff(comparators, baseline)) < 2) { #nolint start: keyword_quote_linter cli_abort( c( @@ -196,36 +205,35 @@ get_pairwise_comparisons <- function( # identify unit of single observation. forecast_unit <- get_forecast_unit(scores) - # if by is equal to forecast_unit, then pairwise comparisons don't make sense - # if by == forecast_unit == "model" then this will pass and all relative skill + # if compare is equal to forecast_unit, then pairwise comparisons don't make + # sense + # if compare == forecast_unit then this all relative skill # scores will simply be 1. - if (setequal(by, forecast_unit)) { - if (setequal(by, "model")) { - #nolint start: keyword_quote_linter - cli_warn( - c( - "!" = "`by` is set to 'model', which is also the unit of a single - forecast. This doesn't look right.", + if (setequal(compare, forecast_unit)) { + #nolint start: keyword_quote_linter + cli_warn( + c( + "!" = "`compare` is set to the unit of a single forecast. This doesn't + look right.", "i" = "All relative skill scores will be equal to 1." - ) ) - } else { - by <- "model" - cli_inform( - c( - "!" 
= "relative skill can only be computed if `by` is different from the - unit of a single forecast.", - "i" = "`by` was set to 'model'" - ) + ) + } else if (setequal(c(compare, by), forecast_unit)) { + #nolint start: keyword_quote_linter + cli_inform( + c( + "!" = "relative skill can only be computed if the combination of + `compare` and `by` is different from the unit of a single forecast.", + "i" = "`by` was set to an empty character vector" ) - #nolint end - } + ) + #nolint end + by <- character(0) } # do the pairwise comparison ------------------------------------------------- # split data set into groups determined by 'by' - split_by <- setdiff(by, "model") - split_scores <- split(scores, by = split_by) + split_scores <- split(scores, by = by) results <- lapply(split_scores, FUN = function(scores) { @@ -233,6 +241,7 @@ get_pairwise_comparisons <- function( scores = scores, metric = metric, baseline = baseline, + compare = compare, by = by, ... ) @@ -262,21 +271,22 @@ get_pairwise_comparisons <- function( pairwise_comparison_one_group <- function(scores, metric, baseline, + compare, by, ...) { - if (!("model" %in% names(scores))) { + if (!(compare %in% names(scores))) { cli_abort( - "pairwise comparisons require a column called 'model'" + "pairwise comparisons require a column as given by `compare`" ) } # get list of models - models <- unique(scores$model) + comparators <- unique(scores[[compare]]) # if there aren't enough models to do any comparison, abort - if (length(models) < 2) { + if (length(comparators) < 2) { cli_abort( - c("!" = "There are not enough models to do any comparison") + c("!" = "There are not enough comparators to do any comparison") ) } @@ -286,13 +296,14 @@ pairwise_comparison_one_group <- function(scores, # be the same # set up initial data.frame with all possible pairwise comparisons - combinations <- data.table::as.data.table(t(combn(models, m = 2))) - colnames(combinations) <- c("model", "compare_against") + combinations <- data.table::as.data.table(t(combn(comparators, m = 2))) + colnames(combinations) <- c("..compare", "compare_against") - combinations[, c("ratio", "pval") := compare_two_models( + combinations[, c("ratio", "pval") := compare_forecasts( + compare = compare, scores = scores, - name_model1 = model, - name_model2 = compare_against, + name_comparator1 = ..compare, + name_comparator2 = compare_against, metric = metric, ... 
), @@ -305,20 +316,19 @@ pairwise_comparison_one_group <- function(scores, # mirror computations combinations_mirrored <- data.table::copy(combinations) data.table::setnames(combinations_mirrored, - old = c("model", "compare_against"), - new = c("compare_against", "model") + old = c("..compare", "compare_against"), + new = c("compare_against", "..compare") ) combinations_mirrored[, ratio := 1 / ratio] # add a one for those that are the same combinations_equal <- data.table::data.table( - model = models, - compare_against = models, + ..compare = comparators, + compare_against = comparators, ratio = 1, pval = 1, adj_pval = 1 ) - result <- data.table::rbindlist(list( combinations, combinations_mirrored, @@ -329,35 +339,37 @@ pairwise_comparison_one_group <- function(scores, # make result character instead of factor result[, `:=`( - model = as.character(model), + ..compare = as.character(..compare), compare_against = as.character(compare_against) )] + data.table::setnames(result, old = "..compare", new = compare) + # calculate relative skill as geometric mean # small theta is again better (assuming that the score is negatively oriented) result[, `:=`( theta = geometric_mean(ratio) ), - by = "model" + by = compare, ] if (!is.null(baseline)) { - baseline_theta <- unique(result[model == baseline, ]$theta) + baseline_theta <- unique(result[get(compare) == baseline, ]$theta) if (length(baseline_theta) == 0) { cli_abort( - "Baseline model {.var {baseline}} missing." + "Baseline comparator {.var {baseline}} missing." ) } result[, rel_to_baseline := theta / baseline_theta] } # remove all the rows that are not present in by before merging - cols_to_keep <- unique(c(by, "model")) + cols_to_keep <- unique(c(by, compare)) cols_to_remove <- colnames(scores)[!(colnames(scores) %in% cols_to_keep)] scores[, eval(cols_to_remove) := NULL] scores <- unique(scores) # allow.cartesian needs to be set as sometimes rows will be duplicated a lot - out <- merge(scores, result, by = "model", all = TRUE) + out <- merge(scores, result, by = compare, all = TRUE) # rename ratio to mean_scores_ratio data.table::setnames(out, @@ -377,21 +389,21 @@ pairwise_comparison_one_group <- function(scores, return(out[]) } -#' @title Compare two models based on subset of common forecasts +#' @title Compare a subset of common forecasts #' #' @description -#' This function compares two models based on the subset of forecasts for which -#' both models have made a prediction. It gets called +#' This function compares two comparators based on the subset of forecasts for which +#' both comparators have made a prediction. It gets called #' from [pairwise_comparison_one_group()], which handles the -#' comparison of multiple models on a single set of forecasts (there are no +#' comparison of multiple comparators on a single set of forecasts (there are no #' subsets of forecasts to be distinguished). [pairwise_comparison_one_group()] #' in turn gets called from from [get_pairwise_comparisons()] which can handle #' pairwise comparisons for a set of forecasts with multiple subsets, e.g. #' pairwise comparisons for one set of forecasts, but done separately for two #' different forecast targets. 
#' @inheritParams get_pairwise_comparisons -#' @param name_model1 Character, name of the first model -#' @param name_model2 Character, name of the model to compare against +#' @param name_comparator1 Character, name of the first comparator +#' @param name_comparator2 Character, name of the comparator to compare against #' @param one_sided Boolean, default is `FALSE`, whether two conduct a one-sided #' instead of a two-sided test to determine significance in a pairwise #' comparison. @@ -401,38 +413,39 @@ pairwise_comparison_one_group <- function(scores, #' @param n_permutations Numeric, the number of permutations for a #' permutation test. Default is 999. #' @return A list with mean score ratios and p-values for the comparison -#' between two models +#' between two comparators #' @importFrom cli cli_abort #' @author Johannes Bracher, \email{johannes.bracher@@kit.edu} #' @author Nikos Bosse \email{nikosbosse@@gmail.com} #' @keywords internal -compare_two_models <- function(scores, - name_model1, - name_model2, - metric, - one_sided = FALSE, - test_type = c("non_parametric", "permutation"), - n_permutations = 999) { +compare_forecasts <- function(scores, + compare, + name_comparator1, + name_comparator2, + metric, + one_sided = FALSE, + test_type = c("non_parametric", "permutation"), + n_permutations = 999) { scores <- data.table::as.data.table(scores) forecast_unit <- get_forecast_unit(scores) - if (!("model" %in% names(scores))) { + if (!(compare %in% names(scores))) { cli_abort( - "pairwise comparisons require a column called 'model'" + "pairwise comparisons require a column as given by `compare`" ) } # select only columns in c(by, var) - a <- scores[model == name_model1] - b <- scores[model == name_model2] + a <- scores[get(compare) == name_comparator1] + b <- scores[get(compare) == name_comparator2] - # remove "model" from 'by' before merging - merge_by <- setdiff(forecast_unit, "model") + # remove compare column from 'by' before merging + merge_by <- setdiff(forecast_unit, compare) overlap <- merge(a, b, by = merge_by, allow.cartesian = TRUE) - unique(overlap) + overlap <- unique(overlap) if (nrow(overlap) == 0) { return(list(ratio = NA_real_, pval = NA_real_)) @@ -441,9 +454,9 @@ compare_two_models <- function(scores, values_x <- overlap[[paste0(metric, ".x")]] values_y <- overlap[[paste0(metric, ".y")]] - # calculate ratio to of average scores achieved by both models. + # calculate ratio to of average scores achieved by both comparator. # this should be equivalent to theta_ij in Johannes Bracher's document. - # ratio < 1 --> model1 is better. + # ratio < 1 --> comparator 1 is better. 
# note we could also take mean(values_x) / mean(values_y), as it cancels out ratio <- sum(values_x) / sum(values_y) @@ -562,7 +575,8 @@ permutation_test <- function(scores1, #' @keywords keyword scoring add_relative_skill <- function( scores, - by = "model", + compare, + by = character(0), metric = intersect(c("wis", "crps", "brier_score"), names(scores)), baseline = NULL ) { @@ -573,6 +587,7 @@ add_relative_skill <- function( scores = scores, metric = metric, baseline = baseline, + compare = compare, by = by ) From 6cec8bea42ff45055e3f9f449ea2755c2cf1a5e2 Mon Sep 17 00:00:00 2001 From: Sebastian Funk Date: Wed, 18 Sep 2024 15:01:59 +0100 Subject: [PATCH 04/18] update unit tests --- R/pairwise-comparisons.R | 4 +- tests/testthat/test-pairwise_comparison.R | 90 +++++++++++-------- .../testthat/test-plot_pairwise_comparison.R | 2 +- 3 files changed, 54 insertions(+), 42 deletions(-) diff --git a/R/pairwise-comparisons.R b/R/pairwise-comparisons.R index e2af84c0e..dec27e859 100644 --- a/R/pairwise-comparisons.R +++ b/R/pairwise-comparisons.R @@ -207,8 +207,7 @@ get_pairwise_comparisons <- function( # if compare is equal to forecast_unit, then pairwise comparisons don't make # sense - # if compare == forecast_unit then this all relative skill - # scores will simply be 1. + # if compare == forecast_unit then all relative skill scores will simply be 1. if (setequal(compare, forecast_unit)) { #nolint start: keyword_quote_linter cli_warn( @@ -218,6 +217,7 @@ get_pairwise_comparisons <- function( "i" = "All relative skill scores will be equal to 1." ) ) + #nolint end } else if (setequal(c(compare, by), forecast_unit)) { #nolint start: keyword_quote_linter cli_inform( diff --git a/tests/testthat/test-pairwise_comparison.R b/tests/testthat/test-pairwise_comparison.R index 1e93ca514..7d635a3a6 100644 --- a/tests/testthat/test-pairwise_comparison.R +++ b/tests/testthat/test-pairwise_comparison.R @@ -65,11 +65,13 @@ test_that("get_pairwise_comparisons() works", { ) ) eval_without_baseline <- suppressMessages( - add_relative_skill(eval_without_rel_skill) + add_relative_skill(eval_without_rel_skill, compare = "model") ) eval_with_baseline <- suppressMessages( - add_relative_skill(eval_without_rel_skill, baseline = "m1") + add_relative_skill( + eval_without_rel_skill, compare = "model", baseline = "m1" + ) ) @@ -203,7 +205,9 @@ test_that("get_pairwise_comparisons() works", { eval <- score(data_formatted) eval_summarised <- summarise_scores(eval, by = c("model", "location")) - eval_with_baseline <- add_relative_skill(eval, by = c("model", "location"), baseline = "m1") + eval_with_baseline <- add_relative_skill( + eval, compare = "model", by = "location", baseline = "m1" +) eval_with_baseline <- summarise_scores(eval_with_baseline, by = c("model", "location")) relative_skills_with <- eval_with_baseline[ @@ -220,7 +224,7 @@ test_that("get_pairwise_comparisons() works", { test_that("get_pairwise_comparisons() work in score() with integer data", { eval <- suppressMessages(score(forecast = as_forecast_sample(example_sample_discrete))) eval_summarised <- summarise_scores(eval, by = c("model", "target_type")) - eval <- add_relative_skill(eval_summarised) + eval <- add_relative_skill(eval_summarised, compare = "model") expect_true("crps_relative_skill" %in% colnames(eval)) }) @@ -228,7 +232,7 @@ test_that("get_pairwise_comparisons() work in score() with integer data", { test_that("get_pairwise_comparisons() work in score() with binary data", { eval <- suppressMessages(score(forecast = 
as_forecast_binary(example_binary))) eval_summarised <- summarise_scores(eval, by = c("model", "target_type")) - eval <- add_relative_skill(eval_summarised) + eval <- add_relative_skill(eval_summarised, compare = "model") expect_true("brier_score_relative_skill" %in% colnames(eval)) }) @@ -245,7 +249,9 @@ test_that("get_pairwise_comparisons() works", { ) attr(df, "metrics") <- c("wis", "ae_median") - res <- suppressMessages(get_pairwise_comparisons(df, baseline = "model1")) + res <- suppressMessages(get_pairwise_comparisons( + df, compare = "model", baseline = "model1") + ) colnames <- c( "model", "compare_against", "mean_scores_ratio", @@ -257,7 +263,7 @@ test_that("get_pairwise_comparisons() works", { # output class is as expected expect_s3_class(res, c("data.table", "data.frame"), exact = TRUE) expect_s3_class( - get_pairwise_comparisons(scores_quantile), + get_pairwise_comparisons(scores_quantile, compare = "model"), c("data.table", "data.frame"), exact = TRUE ) }) @@ -267,11 +273,11 @@ test_that("get_pairwise_comparisons() and `add_relative_skill()` give same resul eval <- scores_sample_continuous pairwise <- get_pairwise_comparisons(eval, - by = "model", + compare = "model", metric = "crps" ) - eval2 <- add_relative_skill(scores_sample_continuous, by = "model") + eval2 <- add_relative_skill(scores_sample_continuous, compare = "model") eval2 <- summarise_scores(eval2, by = "model") expect_equal( @@ -281,7 +287,9 @@ test_that("get_pairwise_comparisons() and `add_relative_skill()` give same resul test_that("get_pairwise_comparisons() realises when there is no baseline model", { expect_error( - get_pairwise_comparisons(scores_quantile, baseline = "missing_model"), + get_pairwise_comparisons( + scores_quantile, compare = "model", baseline = "missing_model" + ), "Assertion on 'baseline' failed: Must be a subset of" ) }) @@ -292,7 +300,7 @@ test_that("Basic input checks for `add_relative_skill() work", { # check that model column + columns in 'by' + baseline model are present expect_error( add_relative_skill( - eval, by = c("model", "missing"), metric = "crps" + eval, compare = "model", by = "missing", metric = "crps" ), "Not all columns specified in `by` are present:" ) @@ -300,7 +308,7 @@ test_that("Basic input checks for `add_relative_skill() work", { # error if baseline is not present expect_error( add_relative_skill( - eval, by = "model", baseline = "missing", metric = "crps" + eval, compare = "model", baseline = "missing", metric = "crps" ), "Assertion on 'baseline' failed: Must be a subset of" ) @@ -309,12 +317,12 @@ test_that("Basic input checks for `add_relative_skill() work", { eval_few <- eval[model %in% c("EuroCOVIDhub-ensemble", "EuroCOVIDhub-baseline")] expect_no_error( add_relative_skill( - eval_few, by = "model", metric = "crps" + eval_few, compare = "model", metric = "crps" ) ) expect_error( add_relative_skill( - eval_few, by = "model", baseline = "EuroCOVIDhub-baseline", + eval_few, compare = "model", baseline = "EuroCOVIDhub-baseline", metric = "crps" ), "More than one non-baseline model is needed to compute pairwise compairisons." 
@@ -323,14 +331,14 @@ test_that("Basic input checks for `add_relative_skill() work", { # error if no relative skill metric is found expect_error( add_relative_skill( - eval, by = "model", + eval, compare = "model", metric = "missing" ) ) eval_nometric <- data.table::copy(eval)[, "crps" := NULL] expect_error( suppressWarnings(add_relative_skill( - eval_nometric, by = "model" + eval_nometric, compare = "model" )), "Assertion on 'metric' failed: Must be a subset of " ) @@ -339,7 +347,7 @@ test_that("Basic input checks for `add_relative_skill() work", { eval_nomodel <- data.table::copy(eval)[, "model" := NULL] expect_error( add_relative_skill( - eval_nomodel, by = "target_type", metric = "crps" + eval_nomodel, compare = "model", by = "target_type", metric = "crps" ), "Assertion on 'scores' failed: Column 'model' not found in data." ) @@ -349,7 +357,7 @@ test_that("Basic input checks for `add_relative_skill() work", { attr(eval_noattribute, "metrics") <- NULL expect_error( add_relative_skill( - eval_noattribute, by = "model", metric = "crps" + eval_noattribute, compare = "model", metric = "crps" ), "needs an attribute `metrics`" ) @@ -359,7 +367,7 @@ test_that("Basic input checks for `add_relative_skill() work", { eval_nas[1:10, "crps" := NA] expect_warning( add_relative_skill( - eval_nas, by = "model", metric = "crps" + eval_nas, compare = "model", metric = "crps" ), "Some values for the metric `crps` are NA. These have been removed." ) @@ -368,7 +376,7 @@ test_that("Basic input checks for `add_relative_skill() work", { eval_nas[, "crps" := NA] expect_error( add_relative_skill( - eval_nas, by = "model", metric = "crps" + eval_nas, compare = "model", metric = "crps" ), "After removing \"NA\" values for `crps`, no values were left." ) @@ -378,7 +386,7 @@ test_that("Basic input checks for `add_relative_skill() work", { eval_diffsign[1:10, "crps" := -eval_diffsign[1:10, "crps"]] expect_error( add_relative_skill( - eval_diffsign, by = "model", metric = "crps" + eval_diffsign, compare = "model", metric = "crps" ), "To compute pairwise comparisons, all values of `crps` must have the same sign." ) @@ -387,17 +395,18 @@ test_that("Basic input checks for `add_relative_skill() work", { fu <- get_forecast_unit(eval) expect_message( add_relative_skill( - eval, by = fu, metric = "crps"), - "relative skill can only be computed if `by` is different from the unit of a single forecast." + eval, compare = "model", by = setdiff(fu, "model"), metric = "crps" + ), + "relative skill can only be computed if the combination of `compare` and `by` is different from the unit of a single forecast." ) # warning if by is equal to the forecast unit and also by is "model" eval_summ <- summarise_scores(eval, by = "model") expect_warning( add_relative_skill( - eval_summ, by = "model", metric = "crps" + eval_summ, compare = "model", metric = "crps" ), - "`by` is set to 'model', which is also the unit of a single forecast." + "`compare` is set to the unit of a single forecast." ) }) @@ -407,7 +416,7 @@ test_that("get_pairwise_comparisons() throws errors with wrong inputs", { # expect error if no model column is found expect_error( - get_pairwise_comparisons(test, by = "location", metric = "crps"), + get_pairwise_comparisons(test, compare = "model", metric = "crps"), "Assertion on 'scores' failed: Column 'model' not found in data." 
) }) @@ -418,40 +427,40 @@ test_that("pairwise_comparison_one_group() throws error with wrong inputs", { # expect error if no model column is found expect_error( - pairwise_comparison_one_group(test, by = "location", metric = "crps"), - "pairwise comparisons require a column called 'model'" + pairwise_comparison_one_group(test, compare = "model", metric = "crps"), + "pairwise comparisons require a column as given by `compare`" ) # expect error as a result if scores has zero rows test <- data.table::copy(scores_sample_continuous)[model == "impossible"] expect_error( - pairwise_comparison_one_group(test, by = "model", metric = "crps"), - "not enough models" + pairwise_comparison_one_group(test, compare = "model", metric = "crps"), + "not enough comparators" ) # expect error if there aren't enough models test <- data.table::copy(scores_sample_continuous)[model == "EuroCOVIDhub-ensemble"] expect_error( - pairwise_comparison_one_group(test, by = "model", metric = "crps"), - "not enough models" + pairwise_comparison_one_group(test, compare = "model", metric = "crps"), + "not enough comparators" ) # expect error if baseline model is missing test <- data.table::copy(scores_sample_continuous) expect_error( - pairwise_comparison_one_group(test, by = "model", baseline = "missing", metric = "crps"), - "Baseline model `missing` missing" + pairwise_comparison_one_group(test, compare = "model", baseline = "missing", metric = "crps"), + "Baseline comparator `missing` missing" ) }) -test_that("compare_two_models() throws error with wrong inputs", { +test_that("compare_forecasts() throws error with wrong inputs", { test <- data.table::copy(scores_sample_continuous) test <- test[, "model" := NULL] # expect error if no model column is found expect_error( - compare_two_models(test, metric = "crps"), - "pairwise comparisons require a column called 'model'" + compare_forecasts(test, compare = "model", metric = "crps"), + "pairwise comparisons require a column as given by `compare`" ) }) @@ -459,13 +468,14 @@ test_that("add_relative_skill() works with point forecasts", { expect_no_condition( pw_point <- add_relative_skill( scores_point, + compare = "model", metric = "se_point" ) ) pw_point <- summarise_scores(pw_point, by = "model") pw_manual <- get_pairwise_comparisons( - scores_point, by = "model", metric = "se_point" + scores_point, compare = "model", metric = "se_point" ) expect_equal( @@ -477,6 +487,7 @@ test_that("add_relative_skill() works with point forecasts", { test_that("add_relative_skill() can compute relative measures", { scores_with <- add_relative_skill( scores_quantile, + compare = "model" ) expect_s3_class( scores_with, @@ -492,7 +503,7 @@ test_that("add_relative_skill() can compute relative measures", { ) scores_with <- add_relative_skill( - scores_quantile, by = "model", + scores_quantile, compare = "model", metric = "ae_median" ) scores_with <- summarise_scores(scores_with, by = "model") @@ -508,6 +519,7 @@ test_that("permutation_tests work as expected", { expect_no_condition( get_pairwise_comparisons( scores_quantile, + compare = "model", test_type = "permutation", one_sided = TRUE, n_permutations = 50 diff --git a/tests/testthat/test-plot_pairwise_comparison.R b/tests/testthat/test-plot_pairwise_comparison.R index 88bac4fa5..50add6b14 100644 --- a/tests/testthat/test-plot_pairwise_comparison.R +++ b/tests/testthat/test-plot_pairwise_comparison.R @@ -1,5 +1,5 @@ pairwise <- suppressMessages( - get_pairwise_comparisons(scores_quantile, by = "target_type") + 
get_pairwise_comparisons(scores_quantile, compare = "model", by = "target_type") ) test_that("plot_pairwise_comparisons() works as expected", { From f7ce235e47396f17205287c186465676c1f31ef3 Mon Sep 17 00:00:00 2001 From: Sebastian Funk Date: Wed, 18 Sep 2024 15:03:31 +0100 Subject: [PATCH 05/18] add global --- R/z_globalVariables.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/z_globalVariables.R b/R/z_globalVariables.R index 08db6c24f..dd69ae466 100644 --- a/R/z_globalVariables.R +++ b/R/z_globalVariables.R @@ -1,4 +1,5 @@ globalVariables(c( + "..compare", "..index", "..probs", "..type", From 9c4d7eab84586715aefd94b250aa3c77b1f10a67 Mon Sep 17 00:00:00 2001 From: Sebastian Funk Date: Wed, 18 Sep 2024 15:09:21 +0100 Subject: [PATCH 06/18] update documentation --- man/add_relative_skill.Rd | 21 +++++----- man/compare_two_models.Rd | 60 ---------------------------- man/get_metrics.forecast_point.Rd | 2 +- man/get_pairwise_comparisons.Rd | 41 ++++++------------- man/pairwise_comparison_one_group.Rd | 20 +++++----- 5 files changed, 36 insertions(+), 108 deletions(-) delete mode 100644 man/compare_two_models.Rd diff --git a/man/add_relative_skill.Rd b/man/add_relative_skill.Rd index 04fc0e41b..528ca4ca4 100644 --- a/man/add_relative_skill.Rd +++ b/man/add_relative_skill.Rd @@ -6,7 +6,8 @@ \usage{ add_relative_skill( scores, - by = "model", + compare, + by = character(0), metric = intersect(c("wis", "crps", "brier_score"), names(scores)), baseline = NULL ) @@ -15,14 +16,16 @@ add_relative_skill( \item{scores}{An object of class \code{scores} (a data.table with scores and an additional attribute \code{metrics} as produced by \code{\link[=score]{score()}}).} -\item{by}{Character vector with column names that define the grouping level -for the pairwise comparisons. By default (\code{model}), there will be one -relative skill score per model. If, for example, -\code{by = c("model", "location")}. Then you will get a -separate relative skill score for every model in every location. Internally, -the data.table with scores will be split according \code{by} (removing "model" -before splitting) and the pairwise comparisons will be computed separately -for the split data.tables.} +\item{compare}{Character vector with a single colum name that defines the +elements for the pairwise comparison. For example, if this is set to +"model", then elements of the "model" column will be compared.} + +\item{by}{Character vector with column names that define further grouping +levels for the pairwise comparisons. By default this is an empty character +vector and there will be one relative skill score per distinct entry of the +column selected in \code{compare}. If further columns are given here, for +example, \code{by = "location"}, then you a separate relative skill score is +calculated for every model in every location.} \item{metric}{A string with the name of the metric for which a relative skill shall be computed. 
By default this is either "crps", diff --git a/man/compare_two_models.Rd b/man/compare_two_models.Rd deleted file mode 100644 index 7f47b6342..000000000 --- a/man/compare_two_models.Rd +++ /dev/null @@ -1,60 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pairwise-comparisons.R -\name{compare_two_models} -\alias{compare_two_models} -\title{Compare two models based on subset of common forecasts} -\usage{ -compare_two_models( - scores, - name_model1, - name_model2, - metric, - one_sided = FALSE, - test_type = c("non_parametric", "permutation"), - n_permutations = 999 -) -} -\arguments{ -\item{scores}{An object of class \code{scores} (a data.table with -scores and an additional attribute \code{metrics} as produced by \code{\link[=score]{score()}}).} - -\item{name_model1}{Character, name of the first model} - -\item{name_model2}{Character, name of the model to compare against} - -\item{metric}{A string with the name of the metric for which -a relative skill shall be computed. By default this is either "crps", -"wis" or "brier_score" if any of these are available.} - -\item{one_sided}{Boolean, default is \code{FALSE}, whether two conduct a one-sided -instead of a two-sided test to determine significance in a pairwise -comparison.} - -\item{test_type}{Character, either "non_parametric" (the default) or -"permutation". This determines which kind of test shall be conducted to -determine p-values.} - -\item{n_permutations}{Numeric, the number of permutations for a -permutation test. Default is 999.} -} -\value{ -A list with mean score ratios and p-values for the comparison -between two models -} -\description{ -This function compares two models based on the subset of forecasts for which -both models have made a prediction. It gets called -from \code{\link[=pairwise_comparison_one_group]{pairwise_comparison_one_group()}}, which handles the -comparison of multiple models on a single set of forecasts (there are no -subsets of forecasts to be distinguished). \code{\link[=pairwise_comparison_one_group]{pairwise_comparison_one_group()}} -in turn gets called from from \code{\link[=get_pairwise_comparisons]{get_pairwise_comparisons()}} which can handle -pairwise comparisons for a set of forecasts with multiple subsets, e.g. -pairwise comparisons for one set of forecasts, but done separately for two -different forecast targets. -} -\author{ -Johannes Bracher, \email{johannes.bracher@kit.edu} - -Nikos Bosse \email{nikosbosse@gmail.com} -} -\keyword{internal} diff --git a/man/get_metrics.forecast_point.Rd b/man/get_metrics.forecast_point.Rd index 4bc4dae2f..43f8143e5 100644 --- a/man/get_metrics.forecast_point.Rd +++ b/man/get_metrics.forecast_point.Rd @@ -34,7 +34,7 @@ The mean squared error, for example, is only a meaningful scoring rule if the forecaster actually reported the mean of their predictive distribution as a point forecast. If the forecaster reported the median, then the mean absolute error would be the appropriate scoring rule. If the scoring rule -and the predictive task do not align, misleading results ensue. +and the predictive task do not align, the results will be misleading. Failure to respect this correspondence can lead to grossly misleading results! Consider the example in the section below. 
diff --git a/man/get_pairwise_comparisons.Rd b/man/get_pairwise_comparisons.Rd index bc1be6cb1..221b7fa38 100644 --- a/man/get_pairwise_comparisons.Rd +++ b/man/get_pairwise_comparisons.Rd @@ -6,7 +6,8 @@ \usage{ get_pairwise_comparisons( scores, - by = "model", + compare, + by = character(0), metric = intersect(c("wis", "crps", "brier_score"), names(scores)), baseline = NULL, ... @@ -16,14 +17,16 @@ get_pairwise_comparisons( \item{scores}{An object of class \code{scores} (a data.table with scores and an additional attribute \code{metrics} as produced by \code{\link[=score]{score()}}).} -\item{by}{Character vector with column names that define the grouping level -for the pairwise comparisons. By default (\code{model}), there will be one -relative skill score per model. If, for example, -\code{by = c("model", "location")}. Then you will get a -separate relative skill score for every model in every location. Internally, -the data.table with scores will be split according \code{by} (removing "model" -before splitting) and the pairwise comparisons will be computed separately -for the split data.tables.} +\item{compare}{Character vector with a single colum name that defines the +elements for the pairwise comparison. For example, if this is set to +"model", then elements of the "model" column will be compared.} + +\item{by}{Character vector with column names that define further grouping +levels for the pairwise comparisons. By default this is an empty character +vector and there will be one relative skill score per distinct entry of the +column selected in \code{compare}. If further columns are given here, for +example, \code{by = "location"}, then you a separate relative skill score is +calculated for every model in every location.} \item{metric}{A string with the name of the metric for which a relative skill shall be computed. By default this is either "crps", @@ -102,26 +105,6 @@ The implementation of the permutation test follows the function \code{permutationTest} from the \code{surveillance} package by Michael Höhle, Andrea Riebler and Michaela Paul. } -\examples{ -\dontshow{ - data.table::setDTthreads(2) # restricts number of cores used on CRAN -} - -library(magrittr) # pipe operator - -scores <- example_quantile \%>\% - as_forecast_quantile() \%>\% - score() - -pairwise <- get_pairwise_comparisons(scores, by = "target_type") -pairwise2 <- get_pairwise_comparisons( - scores, by = "target_type", baseline = "EuroCOVIDhub-baseline" -) - -library(ggplot2) -plot_pairwise_comparisons(pairwise, type = "mean_scores_ratio") + - facet_wrap(~target_type) -} \author{ Nikos Bosse \email{nikosbosse@gmail.com} diff --git a/man/pairwise_comparison_one_group.Rd b/man/pairwise_comparison_one_group.Rd index d8dc304ea..935d0c2e3 100644 --- a/man/pairwise_comparison_one_group.Rd +++ b/man/pairwise_comparison_one_group.Rd @@ -4,7 +4,7 @@ \alias{pairwise_comparison_one_group} \title{Do pairwise comparison for one set of forecasts} \usage{ -pairwise_comparison_one_group(scores, metric, baseline, by, ...) +pairwise_comparison_one_group(scores, metric, baseline, compare, by, ...) } \arguments{ \item{scores}{An object of class \code{scores} (a data.table with @@ -19,14 +19,16 @@ given, then a scaled relative skill with respect to the baseline will be returned. By default (\code{NULL}), relative skill will not be scaled with respect to a baseline model.} -\item{by}{Character vector with column names that define the grouping level -for the pairwise comparisons. 
By default (\code{model}), there will be one -relative skill score per model. If, for example, -\code{by = c("model", "location")}. Then you will get a -separate relative skill score for every model in every location. Internally, -the data.table with scores will be split according \code{by} (removing "model" -before splitting) and the pairwise comparisons will be computed separately -for the split data.tables.} +\item{compare}{Character vector with a single colum name that defines the +elements for the pairwise comparison. For example, if this is set to +"model", then elements of the "model" column will be compared.} + +\item{by}{Character vector with column names that define further grouping +levels for the pairwise comparisons. By default this is an empty character +vector and there will be one relative skill score per distinct entry of the +column selected in \code{compare}. If further columns are given here, for +example, \code{by = "location"}, then you a separate relative skill score is +calculated for every model in every location.} \item{...}{Additional arguments for the comparison between two models. See \code{\link[=compare_two_models]{compare_two_models()}} for more information.} From 0258d8657b6d7e9935d8e3b308a35c91b6de0c2f Mon Sep 17 00:00:00 2001 From: Sebastian Funk Date: Wed, 18 Sep 2024 15:18:36 +0100 Subject: [PATCH 07/18] fix check errors --- R/pairwise-comparisons.R | 6 +-- R/plot.R | 4 +- man/compare_forecasts.Rd | 65 ++++++++++++++++++++++++++++ man/get_pairwise_comparisons.Rd | 25 ++++++++++- man/pairwise_comparison_one_group.Rd | 4 +- man/plot_pairwise_comparisons.Rd | 4 +- 6 files changed, 100 insertions(+), 8 deletions(-) create mode 100644 man/compare_forecasts.Rd diff --git a/R/pairwise-comparisons.R b/R/pairwise-comparisons.R index dec27e859..c30a029eb 100644 --- a/R/pairwise-comparisons.R +++ b/R/pairwise-comparisons.R @@ -75,7 +75,7 @@ #' returned. By default (`NULL`), relative skill will not be scaled with #' respect to a baseline model. #' @param ... Additional arguments for the comparison between two models. See -#' [compare_two_models()] for more information. +#' [compare_forecasts()] for more information. #' @inheritParams summarise_scores #' @return A data.table with the results of pairwise comparisons #' containing the mean score ratios (`mean_scores_ratio`), @@ -107,7 +107,7 @@ #' scores, compare = "model", by = "target_type" #' ) #' pairwise2 <- get_pairwise_comparisons( -#' scores, compare = "model", by = target_type", +#' scores, compare = "model", by = "target_type", #' baseline = "EuroCOVIDhub-baseline" #' ) #' @@ -263,7 +263,7 @@ get_pairwise_comparisons <- function( #' for different forecast targets) and then the actual pairwise comparison for #' that subgroup is managed from [pairwise_comparison_one_group()]. In order to #' actually do the comparison between two models over a subset of common -#' forecasts it calls [compare_two_models()]. +#' forecasts it calls [compare_forecasts()]. 
#' @inherit get_pairwise_comparisons params return #' @importFrom cli cli_abort #' @keywords internal diff --git a/R/plot.R b/R/plot.R index d9b85054d..d3e88d063 100644 --- a/R/plot.R +++ b/R/plot.R @@ -319,7 +319,9 @@ plot_quantile_coverage <- function(coverage, #' scores <- example_quantile %>% #' as_forecast_quantile %>% #' score() -#' pairwise <- get_pairwise_comparisons(scores, by = "target_type") +#' pairwise <- get_pairwise_comparisons( +#' scores, compare = "model", by = "target_type" +#' ) #' plot_pairwise_comparisons(pairwise, type = "mean_scores_ratio") + #' facet_wrap(~target_type) diff --git a/man/compare_forecasts.Rd b/man/compare_forecasts.Rd new file mode 100644 index 000000000..c648e24c6 --- /dev/null +++ b/man/compare_forecasts.Rd @@ -0,0 +1,65 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pairwise-comparisons.R +\name{compare_forecasts} +\alias{compare_forecasts} +\title{Compare a subset of common forecasts} +\usage{ +compare_forecasts( + scores, + compare, + name_comparator1, + name_comparator2, + metric, + one_sided = FALSE, + test_type = c("non_parametric", "permutation"), + n_permutations = 999 +) +} +\arguments{ +\item{scores}{An object of class \code{scores} (a data.table with +scores and an additional attribute \code{metrics} as produced by \code{\link[=score]{score()}}).} + +\item{compare}{Character vector with a single colum name that defines the +elements for the pairwise comparison. For example, if this is set to +"model", then elements of the "model" column will be compared.} + +\item{name_comparator1}{Character, name of the first comparator} + +\item{name_comparator2}{Character, name of the comparator to compare against} + +\item{metric}{A string with the name of the metric for which +a relative skill shall be computed. By default this is either "crps", +"wis" or "brier_score" if any of these are available.} + +\item{one_sided}{Boolean, default is \code{FALSE}, whether two conduct a one-sided +instead of a two-sided test to determine significance in a pairwise +comparison.} + +\item{test_type}{Character, either "non_parametric" (the default) or +"permutation". This determines which kind of test shall be conducted to +determine p-values.} + +\item{n_permutations}{Numeric, the number of permutations for a +permutation test. Default is 999.} +} +\value{ +A list with mean score ratios and p-values for the comparison +between two comparators +} +\description{ +This function compares two comparators based on the subset of forecasts for which +both comparators have made a prediction. It gets called +from \code{\link[=pairwise_comparison_one_group]{pairwise_comparison_one_group()}}, which handles the +comparison of multiple comparators on a single set of forecasts (there are no +subsets of forecasts to be distinguished). \code{\link[=pairwise_comparison_one_group]{pairwise_comparison_one_group()}} +in turn gets called from from \code{\link[=get_pairwise_comparisons]{get_pairwise_comparisons()}} which can handle +pairwise comparisons for a set of forecasts with multiple subsets, e.g. +pairwise comparisons for one set of forecasts, but done separately for two +different forecast targets. +} +\author{ +Johannes Bracher, \email{johannes.bracher@kit.edu} + +Nikos Bosse \email{nikosbosse@gmail.com} +} +\keyword{internal} diff --git a/man/get_pairwise_comparisons.Rd b/man/get_pairwise_comparisons.Rd index 221b7fa38..0b18d4618 100644 --- a/man/get_pairwise_comparisons.Rd +++ b/man/get_pairwise_comparisons.Rd @@ -38,7 +38,7 @@ returned. 
By default (\code{NULL}), relative skill will not be scaled with respect to a baseline model.} \item{...}{Additional arguments for the comparison between two models. See -\code{\link[=compare_two_models]{compare_two_models()}} for more information.} +\code{\link[=compare_forecasts]{compare_forecasts()}} for more information.} } \value{ A data.table with the results of pairwise comparisons @@ -105,6 +105,29 @@ The implementation of the permutation test follows the function \code{permutationTest} from the \code{surveillance} package by Michael Höhle, Andrea Riebler and Michaela Paul. } +\examples{ +\dontshow{ + data.table::setDTthreads(2) # restricts number of cores used on CRAN +} + +library(magrittr) # pipe operator + +scores <- example_quantile \%>\% + as_forecast_quantile() \%>\% + score() + +pairwise <- get_pairwise_comparisons( + scores, compare = "model", by = "target_type" +) +pairwise2 <- get_pairwise_comparisons( + scores, compare = "model", by = "target_type", + baseline = "EuroCOVIDhub-baseline" +) + +library(ggplot2) +plot_pairwise_comparisons(pairwise, type = "mean_scores_ratio") + + facet_wrap(~target_type) +} \author{ Nikos Bosse \email{nikosbosse@gmail.com} diff --git a/man/pairwise_comparison_one_group.Rd b/man/pairwise_comparison_one_group.Rd index 935d0c2e3..1aae9588f 100644 --- a/man/pairwise_comparison_one_group.Rd +++ b/man/pairwise_comparison_one_group.Rd @@ -31,7 +31,7 @@ example, \code{by = "location"}, then you a separate relative skill score is calculated for every model in every location.} \item{...}{Additional arguments for the comparison between two models. See -\code{\link[=compare_two_models]{compare_two_models()}} for more information.} +\code{\link[=compare_forecasts]{compare_forecasts()}} for more information.} } \value{ A data.table with the results of pairwise comparisons @@ -49,6 +49,6 @@ specified by the user (e.g. if pairwise comparison should be done separately for different forecast targets) and then the actual pairwise comparison for that subgroup is managed from \code{\link[=pairwise_comparison_one_group]{pairwise_comparison_one_group()}}. In order to actually do the comparison between two models over a subset of common -forecasts it calls \code{\link[=compare_two_models]{compare_two_models()}}. +forecasts it calls \code{\link[=compare_forecasts]{compare_forecasts()}}. 
} \keyword{internal} diff --git a/man/plot_pairwise_comparisons.Rd b/man/plot_pairwise_comparisons.Rd index fd175c424..be1b6778a 100644 --- a/man/plot_pairwise_comparisons.Rd +++ b/man/plot_pairwise_comparisons.Rd @@ -32,7 +32,9 @@ library(magrittr) # pipe operator scores <- example_quantile \%>\% as_forecast_quantile \%>\% score() -pairwise <- get_pairwise_comparisons(scores, by = "target_type") +pairwise <- get_pairwise_comparisons( + scores, compare = "model", by = "target_type" +) plot_pairwise_comparisons(pairwise, type = "mean_scores_ratio") + facet_wrap(~target_type) } From 3d0cca251b65b7b29fe835041e85bbf5ec42cc4d Mon Sep 17 00:00:00 2001 From: Sebastian Funk Date: Wed, 18 Sep 2024 15:18:46 +0100 Subject: [PATCH 08/18] update manuscript --- inst/manuscript/R/00-standalone-Figure-replication.R | 2 +- inst/manuscript/manuscript.Rmd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/manuscript/R/00-standalone-Figure-replication.R b/inst/manuscript/R/00-standalone-Figure-replication.R index c4ffdc788..23e457fc7 100644 --- a/inst/manuscript/R/00-standalone-Figure-replication.R +++ b/inst/manuscript/R/00-standalone-Figure-replication.R @@ -575,7 +575,7 @@ score(example_quantile) |> # Figure 9 # =============================================================================# score(example_quantile) |> - get_pairwise_comparisons(by = c("model", "target_type"), + get_pairwise_comparisons(compare = "model", by = "target_type", baseline = "EuroCOVIDhub-baseline") |> plot_pairwise_comparisons() + facet_wrap(~ target_type) diff --git a/inst/manuscript/manuscript.Rmd b/inst/manuscript/manuscript.Rmd index f8d697196..ecb4e1083 100644 --- a/inst/manuscript/manuscript.Rmd +++ b/inst/manuscript/manuscript.Rmd @@ -467,7 +467,7 @@ The mean score ratios resulting from \fct{pairwise\_comparison} can then be visu ```{r pairwise-plot, echo=TRUE, fig.width = 8, fig.cap="Ratios of mean weighted interval scores based on overlapping forecast sets. When interpreting the plot one should look at the model on the y-axis, and the model on the x-axis is the one it is compared against. If a tile is blue, then the model on the y-axis performed better (assuming that scores are negatively oriented, i.e., that lower scores are better). If it is red, the model on the x-axis performed better in direct comparison. In the example above, the EuroCOVIDhub-ensemble performs best (it only has values smaller than one), while the EuroCOVIDhub-baseline performs worst (and only has values larger than one). 
For cases, the UMass-MechBayes model is excluded as there are no case forecasts available and therefore the set of overlapping forecasts is empty."} forecast_quantile |> score() |> - get_pairwise_comparisons(by = c("model", "target_type")) |> + get_pairwise_comparisons(compare = "model", by = "target_type") |> plot_pairwise_comparisons() + facet_wrap(~ target_type) ``` From a6da128e0126f1d3714f978cbd96aa2527d03712 Mon Sep 17 00:00:00 2001 From: Sebastian Funk Date: Wed, 18 Sep 2024 20:30:58 +0100 Subject: [PATCH 09/18] add warnings if `compare`/`by` is unset --- R/pairwise-comparisons.R | 13 +++++++++++++ R/summarise_scores.R | 13 +++++++++++++ 2 files changed, 26 insertions(+) diff --git a/R/pairwise-comparisons.R b/R/pairwise-comparisons.R index c30a029eb..1a985941d 100644 --- a/R/pairwise-comparisons.R +++ b/R/pairwise-comparisons.R @@ -127,6 +127,19 @@ get_pairwise_comparisons <- function( # input checks --------------------------------------------------------------- scores <- ensure_data.table(scores) + # allow legacy fixed column name for mdoels + if (missing(compare) && "model" %in% colnames(scores)) { + compare <- "model" + cli_warn( + c( + "!" = "Setting `compare` to \"model\" to reflect previous behaviour. + In the future this behaviour will be deprecated, and users will have + to specify which column to use for comparison. To silence this message + set `compare = \"model\"` explicitly." + ) + ) + } + # we need the score names attribute to make sure we can determine the # forecast unit correctly, so here we check it exists metrics <- get_metrics.scores(scores, error = TRUE) diff --git a/R/summarise_scores.R b/R/summarise_scores.R index b87c89b13..479e30aff 100644 --- a/R/summarise_scores.R +++ b/R/summarise_scores.R @@ -61,6 +61,19 @@ summarise_scores <- function(scores, assert_subset(by, names(scores), empty.ok = TRUE) assert_function(fun) + # allow legacy fixed column name for mdoels + if (missing(by) && "model" %in% colnames(scores)) { + by <- "model" + cli_warn( + c( + "!" = "Setting `by` to \"model\" to reflect previous behaviour. + In the future this behaviour will be deprecated, and users will have + to specify which column to use for comparison. To silence this message + set `by = \"model\"` explicitly." + ) + ) + } + metrics <- get_metrics.scores(scores, error = TRUE) # summarise scores ----------------------------------------------------------- From bcc7d69867d3b3d5a6410406853615b876642039 Mon Sep 17 00:00:00 2001 From: Sebastian Funk Date: Thu, 19 Sep 2024 11:04:32 +0100 Subject: [PATCH 10/18] update tests --- tests/testthat/test-plot_correlation.R | 2 +- tests/testthat/test-summarise_scores.R | 8 +------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/tests/testthat/test-plot_correlation.R b/tests/testthat/test-plot_correlation.R index 5c03329a7..018b33a9c 100644 --- a/tests/testthat/test-plot_correlation.R +++ b/tests/testthat/test-plot_correlation.R @@ -12,7 +12,7 @@ test_that("plot_correlations() works as expected", { # expect an error if you forgot to compute correlations expect_error( - plot_correlations(summarise_scores(scores_quantile)), + plot_correlations(summarise_scores(scores_quantile, by = "model")), "Did you forget to call `scoringutils::get_correlations()`?" 
) }) diff --git a/tests/testthat/test-summarise_scores.R b/tests/testthat/test-summarise_scores.R index 37ca68df9..ec403b513 100644 --- a/tests/testthat/test-summarise_scores.R +++ b/tests/testthat/test-summarise_scores.R @@ -1,10 +1,3 @@ -test_that("summarise_scores() works as expected with by = forecast unit", { - expect_no_condition( - summarised_scores <- summarise_scores(scores_quantile) - ) - expect_s3_class(summarised_scores, c("scores", "data.table", "data.frame"), exact = TRUE) -}) - test_that("summarise_scores() works as expected with by = forecast unit", { # the only effect of running summarise_scores with by = forecast unit is # that coverage is now a numeric instead of a boolean @@ -13,6 +6,7 @@ test_that("summarise_scores() works as expected with by = forecast unit", { by = get_forecast_unit(scores_quantile) ) + expect_s3_class(summarised_scores, c("scores", "data.table", "data.frame"), exact = TRUE) expect_equal(dim(summarised_scores), dim(scores_quantile)) expect_equal(summarised_scores$wis, scores_quantile$wis) From d8e123a96e8ac315d884f2213940a5cb99aa73d6 Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Mon, 23 Sep 2024 21:49:17 +0200 Subject: [PATCH 11/18] add test and fix bug --- R/summarise_scores.R | 2 +- tests/testthat/test-summarise_scores.R | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/R/summarise_scores.R b/R/summarise_scores.R index 479e30aff..8d5f8239b 100644 --- a/R/summarise_scores.R +++ b/R/summarise_scores.R @@ -58,7 +58,7 @@ summarise_scores <- function(scores, # input checking ------------------------------------------------------------ assert_data_frame(scores) scores <- ensure_data.table(scores) - assert_subset(by, names(scores), empty.ok = TRUE) + assert_subset(by, names(scores), empty.ok = FALSE) assert_function(fun) # allow legacy fixed column name for mdoels diff --git a/tests/testthat/test-summarise_scores.R b/tests/testthat/test-summarise_scores.R index ec403b513..4ed5dfc75 100644 --- a/tests/testthat/test-summarise_scores.R +++ b/tests/testthat/test-summarise_scores.R @@ -59,3 +59,13 @@ test_that("summarise_scores() handles data.frames correctly", { summarise_scores(test, by = "model") ) }) + + +test_that("summarise_scores() errors if `by = NULL", { + expect_error( + summarise_scores(scores_quantile, by = NULL), + "Assertion on 'by' failed: Must be a subset of" + ) +}) + + From d776de27df76070132df0320c112c6214bc4fb3f Mon Sep 17 00:00:00 2001 From: Sebastian Funk Date: Wed, 25 Sep 2024 10:08:09 +0100 Subject: [PATCH 12/18] improve explanation in the docs Co-authored-by: Nikos Bosse <37978797+nikosbosse@users.noreply.github.com> --- R/pairwise-comparisons.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pairwise-comparisons.R b/R/pairwise-comparisons.R index 1a985941d..2a64fe117 100644 --- a/R/pairwise-comparisons.R +++ b/R/pairwise-comparisons.R @@ -65,8 +65,8 @@ #' levels for the pairwise comparisons. By default this is an empty character #' vector and there will be one relative skill score per distinct entry of the #' column selected in `compare`. If further columns are given here, for -#' example, `by = "location"`, then you a separate relative skill score is -#' calculated for every model in every location. +#' example, `by = "location"` and `compare = "model"`, then one separate relative +#' skill score is calculated for every model in every location. #' @param metric A string with the name of the metric for which #' a relative skill shall be computed. 
By default this is either "crps", #' "wis" or "brier_score" if any of these are available. From 2be8f53155cbf9c257bb62b2236c5e3fa3631de9 Mon Sep 17 00:00:00 2001 From: Sebastian Funk Date: Wed, 25 Sep 2024 10:41:53 +0100 Subject: [PATCH 13/18] restore model default --- NAMESPACE | 1 + R/pairwise-comparisons.R | 60 ++++++++----------- R/plot.R | 4 +- R/summarise_scores.R | 23 ++----- man/add_relative_skill.Rd | 14 ++--- man/compare_forecasts.Rd | 5 +- man/get_pairwise_comparisons.Rd | 23 ++++--- man/pairwise_comparison_one_group.Rd | 21 ++++--- man/plot_pairwise_comparisons.Rd | 4 +- man/summarise_scores.Rd | 12 ++-- tests/testthat/test-pairwise_comparison.R | 39 ++++++------ tests/testthat/test-plot_correlation.R | 2 +- .../testthat/test-plot_pairwise_comparison.R | 2 +- tests/testthat/test-summarise_scores.R | 8 ++- 14 files changed, 98 insertions(+), 120 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index e24968a9e..ad4dbe690 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -102,6 +102,7 @@ importFrom(checkmate,assert_character) importFrom(checkmate,assert_class) importFrom(checkmate,assert_data_frame) importFrom(checkmate,assert_data_table) +importFrom(checkmate,assert_disjunct) importFrom(checkmate,assert_factor) importFrom(checkmate,assert_function) importFrom(checkmate,assert_list) diff --git a/R/pairwise-comparisons.R b/R/pairwise-comparisons.R index 2a64fe117..07e996f54 100644 --- a/R/pairwise-comparisons.R +++ b/R/pairwise-comparisons.R @@ -60,13 +60,14 @@ #' #' @param compare Character vector with a single colum name that defines the #' elements for the pairwise comparison. For example, if this is set to -#' "model", then elements of the "model" column will be compared. +#' "model" (the default), then elements of the "model" column will be +#' compared. #' @param by Character vector with column names that define further grouping -#' levels for the pairwise comparisons. By default this is an empty character -#' vector and there will be one relative skill score per distinct entry of the -#' column selected in `compare`. If further columns are given here, for -#' example, `by = "location"` and `compare = "model"`, then one separate relative -#' skill score is calculated for every model in every location. +#' levels for the pairwise comparisons. By default this is `NULL` and there +#' will be one relative skill score per distinct entry of the column selected +#' in `compare`. If further columns are given here, for example, `by = +#' "location"` with `compare = "model"`, then one separate relative skill +#' score is calculated for every model in every location. #' @param metric A string with the name of the metric for which #' a relative skill shall be computed. By default this is either "crps", #' "wis" or "brier_score" if any of these are available. 
@@ -86,7 +87,7 @@ #' @importFrom data.table as.data.table data.table setnames copy #' @importFrom stats sd rbinom wilcox.test p.adjust #' @importFrom utils combn -#' @importFrom checkmate assert_subset assert_character +#' @importFrom checkmate assert_subset assert_character assert_disjunct #' @importFrom cli cli_abort cli_inform cli_warn #' @export #' @author Nikos Bosse \email{nikosbosse@@gmail.com} @@ -103,12 +104,9 @@ #' as_forecast_quantile() %>% #' score() #' -#' pairwise <- get_pairwise_comparisons( -#' scores, compare = "model", by = "target_type" -#' ) +#' pairwise <- get_pairwise_comparisons(scores, by = "target_type") #' pairwise2 <- get_pairwise_comparisons( -#' scores, compare = "model", by = "target_type", -#' baseline = "EuroCOVIDhub-baseline" +#' scores, by = "target_type", baseline = "EuroCOVIDhub-baseline" #' ) #' #' library(ggplot2) @@ -117,8 +115,8 @@ get_pairwise_comparisons <- function( scores, - compare, - by = character(0), + compare = "model", + by = NULL, metric = intersect(c("wis", "crps", "brier_score"), names(scores)), baseline = NULL, ... @@ -126,19 +124,10 @@ get_pairwise_comparisons <- function( # input checks --------------------------------------------------------------- scores <- ensure_data.table(scores) - - # allow legacy fixed column name for mdoels - if (missing(compare) && "model" %in% colnames(scores)) { - compare <- "model" - cli_warn( - c( - "!" = "Setting `compare` to \"model\" to reflect previous behaviour. - In the future this behaviour will be deprecated, and users will have - to specify which column to use for comparison. To silence this message - set `compare = \"model\"` explicitly." - ) - ) - } + # check that 'compare' column in is are present + assert(check_columns_present(scores, compare)) + # check that column(s) in `by` ar not in `compare` + assert_disjunct(by, compare) # we need the score names attribute to make sure we can determine the # forecast unit correctly, so here we check it exists @@ -160,11 +149,11 @@ get_pairwise_comparisons <- function( ) #nolint end } + } else { + ## set explicitly to character(0) in case it was given as NULL + by <- character(0) } - # check that 'compare' column in is are present - assert(check_columns_present(scores, compare)) - # check that baseline exists comparators <- as.vector(unique(scores[[compare]])) assert_subset(baseline, comparators) @@ -279,12 +268,13 @@ get_pairwise_comparisons <- function( #' forecasts it calls [compare_forecasts()]. #' @inherit get_pairwise_comparisons params return #' @importFrom cli cli_abort +#' @importFrom data.table setnames #' @keywords internal pairwise_comparison_one_group <- function(scores, metric, baseline, - compare, + compare = "model", by, ...) 
{ if (!(compare %in% names(scores))) { @@ -328,7 +318,7 @@ pairwise_comparison_one_group <- function(scores, # mirror computations combinations_mirrored <- data.table::copy(combinations) - data.table::setnames(combinations_mirrored, + setnames(combinations_mirrored, old = c("..compare", "compare_against"), new = c("compare_against", "..compare") ) @@ -356,7 +346,7 @@ pairwise_comparison_one_group <- function(scores, compare_against = as.character(compare_against) )] - data.table::setnames(result, old = "..compare", new = compare) + setnames(result, old = "..compare", new = compare) # calculate relative skill as geometric mean # small theta is again better (assuming that the score is negatively oriented) @@ -433,7 +423,7 @@ pairwise_comparison_one_group <- function(scores, #' @keywords internal compare_forecasts <- function(scores, - compare, + compare = "model", name_comparator1, name_comparator2, metric, @@ -588,7 +578,7 @@ permutation_test <- function(scores1, #' @keywords keyword scoring add_relative_skill <- function( scores, - compare, + compare = "model", by = character(0), metric = intersect(c("wis", "crps", "brier_score"), names(scores)), baseline = NULL diff --git a/R/plot.R b/R/plot.R index d7bb1d3c6..e382dcc55 100644 --- a/R/plot.R +++ b/R/plot.R @@ -319,9 +319,7 @@ plot_quantile_coverage <- function(coverage, #' scores <- example_quantile %>% #' as_forecast_quantile %>% #' score() -#' pairwise <- get_pairwise_comparisons( -#' scores, compare = "model", by = "target_type" -#' ) +#' pairwise <- get_pairwise_comparisons(scores, by = "target_type") #' plot_pairwise_comparisons(pairwise, type = "mean_scores_ratio") + #' facet_wrap(~target_type) diff --git a/R/summarise_scores.R b/R/summarise_scores.R index 8d5f8239b..ddd29c14f 100644 --- a/R/summarise_scores.R +++ b/R/summarise_scores.R @@ -13,10 +13,8 @@ #' #' @param scores An object of class `scores` (a data.table with #' scores and an additional attribute `metrics` as produced by [score()]). -#' @param by Character vector with column names to summarise scores by. An -#' example here would be something like a `model` column when summarising -#' scores by model. Default is an empty character vector, which means that -#' scores are summarised without grouping. +#' @param by Character vector with column names to summarise scores by. Default +#' is "model", i.e. scores are summarised by the "model" column. #' @param fun A function used for summarising scores. Default is [mean()]. #' @param ... Additional parameters that can be passed to the summary function #' provided to `fun`. For more information see the documentation of the @@ -44,7 +42,7 @@ #' summarise_scores(scores, by = "model", fun = sd) #' #' # round digits -#' summarise_scores(scores,by = "model") %>% +#' summarise_scores(scores, by = "model") %>% #' summarise_scores(fun = signif, digits = 2) #' @export #' @importFrom checkmate assert_subset assert_function test_subset @@ -52,7 +50,7 @@ #' @keywords scoring summarise_scores <- function(scores, - by = character(0), + by = "model", fun = mean, ...) { # input checking ------------------------------------------------------------ @@ -61,19 +59,6 @@ summarise_scores <- function(scores, assert_subset(by, names(scores), empty.ok = FALSE) assert_function(fun) - # allow legacy fixed column name for mdoels - if (missing(by) && "model" %in% colnames(scores)) { - by <- "model" - cli_warn( - c( - "!" = "Setting `by` to \"model\" to reflect previous behaviour. 
- In the future this behaviour will be deprecated, and users will have - to specify which column to use for comparison. To silence this message - set `by = \"model\"` explicitly." - ) - ) - } - metrics <- get_metrics.scores(scores, error = TRUE) # summarise scores ----------------------------------------------------------- diff --git a/man/add_relative_skill.Rd b/man/add_relative_skill.Rd index 528ca4ca4..6c196edad 100644 --- a/man/add_relative_skill.Rd +++ b/man/add_relative_skill.Rd @@ -6,7 +6,7 @@ \usage{ add_relative_skill( scores, - compare, + compare = "model", by = character(0), metric = intersect(c("wis", "crps", "brier_score"), names(scores)), baseline = NULL @@ -18,14 +18,14 @@ scores and an additional attribute \code{metrics} as produced by \code{\link[=sc \item{compare}{Character vector with a single colum name that defines the elements for the pairwise comparison. For example, if this is set to -"model", then elements of the "model" column will be compared.} +"model" (the default), then elements of the "model" column will be +compared.} \item{by}{Character vector with column names that define further grouping -levels for the pairwise comparisons. By default this is an empty character -vector and there will be one relative skill score per distinct entry of the -column selected in \code{compare}. If further columns are given here, for -example, \code{by = "location"}, then you a separate relative skill score is -calculated for every model in every location.} +levels for the pairwise comparisons. By default this is \code{NULL} and there +will be one relative skill score per distinct entry of the column selected +in \code{compare}. If further columns are given here, for example, \code{by = "location"} with \code{compare = "model"}, then one separate relative skill +score is calculated for every model in every location.} \item{metric}{A string with the name of the metric for which a relative skill shall be computed. By default this is either "crps", diff --git a/man/compare_forecasts.Rd b/man/compare_forecasts.Rd index c648e24c6..5350fbe81 100644 --- a/man/compare_forecasts.Rd +++ b/man/compare_forecasts.Rd @@ -6,7 +6,7 @@ \usage{ compare_forecasts( scores, - compare, + compare = "model", name_comparator1, name_comparator2, metric, @@ -21,7 +21,8 @@ scores and an additional attribute \code{metrics} as produced by \code{\link[=sc \item{compare}{Character vector with a single colum name that defines the elements for the pairwise comparison. For example, if this is set to -"model", then elements of the "model" column will be compared.} +"model" (the default), then elements of the "model" column will be +compared.} \item{name_comparator1}{Character, name of the first comparator} diff --git a/man/get_pairwise_comparisons.Rd b/man/get_pairwise_comparisons.Rd index 0b18d4618..3bc06c557 100644 --- a/man/get_pairwise_comparisons.Rd +++ b/man/get_pairwise_comparisons.Rd @@ -6,8 +6,8 @@ \usage{ get_pairwise_comparisons( scores, - compare, - by = character(0), + compare = "model", + by = NULL, metric = intersect(c("wis", "crps", "brier_score"), names(scores)), baseline = NULL, ... @@ -19,14 +19,14 @@ scores and an additional attribute \code{metrics} as produced by \code{\link[=sc \item{compare}{Character vector with a single colum name that defines the elements for the pairwise comparison. 
For example, if this is set to -"model", then elements of the "model" column will be compared.} +"model" (the default), then elements of the "model" column will be +compared.} \item{by}{Character vector with column names that define further grouping -levels for the pairwise comparisons. By default this is an empty character -vector and there will be one relative skill score per distinct entry of the -column selected in \code{compare}. If further columns are given here, for -example, \code{by = "location"}, then you a separate relative skill score is -calculated for every model in every location.} +levels for the pairwise comparisons. By default this is \code{NULL} and there +will be one relative skill score per distinct entry of the column selected +in \code{compare}. If further columns are given here, for example, \code{by = "location"} with \code{compare = "model"}, then one separate relative skill +score is calculated for every model in every location.} \item{metric}{A string with the name of the metric for which a relative skill shall be computed. By default this is either "crps", @@ -116,12 +116,9 @@ scores <- example_quantile \%>\% as_forecast_quantile() \%>\% score() -pairwise <- get_pairwise_comparisons( - scores, compare = "model", by = "target_type" -) +pairwise <- get_pairwise_comparisons(scores, by = "target_type") pairwise2 <- get_pairwise_comparisons( - scores, compare = "model", by = "target_type", - baseline = "EuroCOVIDhub-baseline" + scores, by = "target_type", baseline = "EuroCOVIDhub-baseline" ) library(ggplot2) diff --git a/man/pairwise_comparison_one_group.Rd b/man/pairwise_comparison_one_group.Rd index 1aae9588f..2982a753f 100644 --- a/man/pairwise_comparison_one_group.Rd +++ b/man/pairwise_comparison_one_group.Rd @@ -4,7 +4,14 @@ \alias{pairwise_comparison_one_group} \title{Do pairwise comparison for one set of forecasts} \usage{ -pairwise_comparison_one_group(scores, metric, baseline, compare, by, ...) +pairwise_comparison_one_group( + scores, + metric, + baseline, + compare = "model", + by, + ... +) } \arguments{ \item{scores}{An object of class \code{scores} (a data.table with @@ -21,14 +28,14 @@ respect to a baseline model.} \item{compare}{Character vector with a single colum name that defines the elements for the pairwise comparison. For example, if this is set to -"model", then elements of the "model" column will be compared.} +"model" (the default), then elements of the "model" column will be +compared.} \item{by}{Character vector with column names that define further grouping -levels for the pairwise comparisons. By default this is an empty character -vector and there will be one relative skill score per distinct entry of the -column selected in \code{compare}. If further columns are given here, for -example, \code{by = "location"}, then you a separate relative skill score is -calculated for every model in every location.} +levels for the pairwise comparisons. By default this is \code{NULL} and there +will be one relative skill score per distinct entry of the column selected +in \code{compare}. If further columns are given here, for example, \code{by = "location"} with \code{compare = "model"}, then one separate relative skill +score is calculated for every model in every location.} \item{...}{Additional arguments for the comparison between two models. 
See \code{\link[=compare_forecasts]{compare_forecasts()}} for more information.} diff --git a/man/plot_pairwise_comparisons.Rd b/man/plot_pairwise_comparisons.Rd index be1b6778a..fd175c424 100644 --- a/man/plot_pairwise_comparisons.Rd +++ b/man/plot_pairwise_comparisons.Rd @@ -32,9 +32,7 @@ library(magrittr) # pipe operator scores <- example_quantile \%>\% as_forecast_quantile \%>\% score() -pairwise <- get_pairwise_comparisons( - scores, compare = "model", by = "target_type" -) +pairwise <- get_pairwise_comparisons(scores, by = "target_type") plot_pairwise_comparisons(pairwise, type = "mean_scores_ratio") + facet_wrap(~target_type) } diff --git a/man/summarise_scores.Rd b/man/summarise_scores.Rd index 42a4fae2b..5e7cedb54 100644 --- a/man/summarise_scores.Rd +++ b/man/summarise_scores.Rd @@ -5,18 +5,16 @@ \alias{summarize_scores} \title{Summarise scores as produced by \code{\link[=score]{score()}}} \usage{ -summarise_scores(scores, by = character(0), fun = mean, ...) +summarise_scores(scores, by = "model", fun = mean, ...) -summarize_scores(scores, by = character(0), fun = mean, ...) +summarize_scores(scores, by = "model", fun = mean, ...) } \arguments{ \item{scores}{An object of class \code{scores} (a data.table with scores and an additional attribute \code{metrics} as produced by \code{\link[=score]{score()}}).} -\item{by}{Character vector with column names to summarise scores by. An -example here would be something like a \code{model} column when summarising -scores by model. Default is an empty character vector, which means that -scores are summarised without grouping.} +\item{by}{Character vector with column names to summarise scores by. Default +is "model", i.e. scores are summarised by the "model" column.} \item{fun}{A function used for summarising scores. 
Default is \code{\link[=mean]{mean()}}.} @@ -59,7 +57,7 @@ summarise_scores(scores, by = c("model", "target_type")) summarise_scores(scores, by = "model", fun = sd) # round digits -summarise_scores(scores,by = "model") \%>\% +summarise_scores(scores, by = "model") \%>\% summarise_scores(fun = signif, digits = 2) } \keyword{scoring} diff --git a/tests/testthat/test-pairwise_comparison.R b/tests/testthat/test-pairwise_comparison.R index 7d635a3a6..9228b88e7 100644 --- a/tests/testthat/test-pairwise_comparison.R +++ b/tests/testthat/test-pairwise_comparison.R @@ -65,13 +65,11 @@ test_that("get_pairwise_comparisons() works", { ) ) eval_without_baseline <- suppressMessages( - add_relative_skill(eval_without_rel_skill, compare = "model") + add_relative_skill(eval_without_rel_skill) ) eval_with_baseline <- suppressMessages( - add_relative_skill( - eval_without_rel_skill, compare = "model", baseline = "m1" - ) + add_relative_skill(eval_without_rel_skill, baseline = "m1") ) @@ -205,9 +203,7 @@ test_that("get_pairwise_comparisons() works", { eval <- score(data_formatted) eval_summarised <- summarise_scores(eval, by = c("model", "location")) - eval_with_baseline <- add_relative_skill( - eval, compare = "model", by = "location", baseline = "m1" -) + eval_with_baseline <- add_relative_skill(eval, by = "location", baseline = "m1") eval_with_baseline <- summarise_scores(eval_with_baseline, by = c("model", "location")) relative_skills_with <- eval_with_baseline[ @@ -224,7 +220,7 @@ test_that("get_pairwise_comparisons() works", { test_that("get_pairwise_comparisons() work in score() with integer data", { eval <- suppressMessages(score(forecast = as_forecast_sample(example_sample_discrete))) eval_summarised <- summarise_scores(eval, by = c("model", "target_type")) - eval <- add_relative_skill(eval_summarised, compare = "model") + eval <- add_relative_skill(eval_summarised) expect_true("crps_relative_skill" %in% colnames(eval)) }) @@ -232,7 +228,7 @@ test_that("get_pairwise_comparisons() work in score() with integer data", { test_that("get_pairwise_comparisons() work in score() with binary data", { eval <- suppressMessages(score(forecast = as_forecast_binary(example_binary))) eval_summarised <- summarise_scores(eval, by = c("model", "target_type")) - eval <- add_relative_skill(eval_summarised, compare = "model") + eval <- add_relative_skill(eval_summarised) expect_true("brier_score_relative_skill" %in% colnames(eval)) }) @@ -249,9 +245,7 @@ test_that("get_pairwise_comparisons() works", { ) attr(df, "metrics") <- c("wis", "ae_median") - res <- suppressMessages(get_pairwise_comparisons( - df, compare = "model", baseline = "model1") - ) + res <- suppressMessages(get_pairwise_comparisons(df, baseline = "model1")) colnames <- c( "model", "compare_against", "mean_scores_ratio", @@ -263,7 +257,7 @@ test_that("get_pairwise_comparisons() works", { # output class is as expected expect_s3_class(res, c("data.table", "data.frame"), exact = TRUE) expect_s3_class( - get_pairwise_comparisons(scores_quantile, compare = "model"), + get_pairwise_comparisons(scores_quantile), c("data.table", "data.frame"), exact = TRUE ) }) @@ -287,9 +281,7 @@ test_that("get_pairwise_comparisons() and `add_relative_skill()` give same resul test_that("get_pairwise_comparisons() realises when there is no baseline model", { expect_error( - get_pairwise_comparisons( - scores_quantile, compare = "model", baseline = "missing_model" - ), + get_pairwise_comparisons(scores_quantile, baseline = "missing_model"), "Assertion on 'baseline' failed: Must be a 
subset of" ) }) @@ -305,6 +297,14 @@ test_that("Basic input checks for `add_relative_skill() work", { "Not all columns specified in `by` are present:" ) + # check that none of the columns in `by` are in `compare` + expect_error( + add_relative_skill( + eval, by = c("model", "target_type"), metric = "crps" + ), + "Must be disjunct from \\{'model'\\}" + ) + # error if baseline is not present expect_error( add_relative_skill( @@ -347,7 +347,7 @@ test_that("Basic input checks for `add_relative_skill() work", { eval_nomodel <- data.table::copy(eval)[, "model" := NULL] expect_error( add_relative_skill( - eval_nomodel, compare = "model", by = "target_type", metric = "crps" + eval_nomodel, by = "target_type", metric = "crps" ), "Assertion on 'scores' failed: Column 'model' not found in data." ) @@ -468,7 +468,6 @@ test_that("add_relative_skill() works with point forecasts", { expect_no_condition( pw_point <- add_relative_skill( scores_point, - compare = "model", metric = "se_point" ) ) @@ -486,8 +485,7 @@ test_that("add_relative_skill() works with point forecasts", { test_that("add_relative_skill() can compute relative measures", { scores_with <- add_relative_skill( - scores_quantile, - compare = "model" + scores_quantile ) expect_s3_class( scores_with, @@ -519,7 +517,6 @@ test_that("permutation_tests work as expected", { expect_no_condition( get_pairwise_comparisons( scores_quantile, - compare = "model", test_type = "permutation", one_sided = TRUE, n_permutations = 50 diff --git a/tests/testthat/test-plot_correlation.R b/tests/testthat/test-plot_correlation.R index 018b33a9c..5c03329a7 100644 --- a/tests/testthat/test-plot_correlation.R +++ b/tests/testthat/test-plot_correlation.R @@ -12,7 +12,7 @@ test_that("plot_correlations() works as expected", { # expect an error if you forgot to compute correlations expect_error( - plot_correlations(summarise_scores(scores_quantile, by = "model")), + plot_correlations(summarise_scores(scores_quantile)), "Did you forget to call `scoringutils::get_correlations()`?" 
) }) diff --git a/tests/testthat/test-plot_pairwise_comparison.R b/tests/testthat/test-plot_pairwise_comparison.R index 50add6b14..88bac4fa5 100644 --- a/tests/testthat/test-plot_pairwise_comparison.R +++ b/tests/testthat/test-plot_pairwise_comparison.R @@ -1,5 +1,5 @@ pairwise <- suppressMessages( - get_pairwise_comparisons(scores_quantile, compare = "model", by = "target_type") + get_pairwise_comparisons(scores_quantile, by = "target_type") ) test_that("plot_pairwise_comparisons() works as expected", { diff --git a/tests/testthat/test-summarise_scores.R b/tests/testthat/test-summarise_scores.R index 4ed5dfc75..420781fea 100644 --- a/tests/testthat/test-summarise_scores.R +++ b/tests/testthat/test-summarise_scores.R @@ -1,3 +1,10 @@ +test_that("summarise_scores() works as expected with by = forecast unit", { + expect_no_condition( + summarised_scores <- summarise_scores(scores_quantile) + ) + expect_s3_class(summarised_scores, c("scores", "data.table", "data.frame"), exact = TRUE) +}) + test_that("summarise_scores() works as expected with by = forecast unit", { # the only effect of running summarise_scores with by = forecast unit is # that coverage is now a numeric instead of a boolean @@ -6,7 +13,6 @@ test_that("summarise_scores() works as expected with by = forecast unit", { by = get_forecast_unit(scores_quantile) ) - expect_s3_class(summarised_scores, c("scores", "data.table", "data.frame"), exact = TRUE) expect_equal(dim(summarised_scores), dim(scores_quantile)) expect_equal(summarised_scores$wis, scores_quantile$wis) From 0df417338de3453f08a4fce7f5b6088a3b873692 Mon Sep 17 00:00:00 2001 From: Nikos Bosse <37978797+nikosbosse@users.noreply.github.com> Date: Tue, 24 Sep 2024 15:16:06 +0200 Subject: [PATCH 14/18] fix names (#920) --- DESCRIPTION | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index f15bcf91c..d1f7ada04 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -8,7 +8,8 @@ Authors@R: c( role = c("aut", "cre"), email = "nikosbosse@gmail.com", comment = c(ORCID = "0000-0002-7750-5280")), - person(given = "Sam Abbott", + person(given = "Sam", + family = "Abbott", role = c("aut"), email = "contact@samabbott.co.uk", comment = c(ORCID = "0000-0001-8057-8037")), @@ -17,7 +18,8 @@ Authors@R: c( role = c("aut"), email = "hugo.gruson+R@normalesup.org", comment = c(ORCID = "0000-0002-4094-1476")), - person(given = "Johannes Bracher", + person(given = "Johannes", + family = "Bracher", role = c("ctb"), email = "johannes.bracher@kit.edu", comment = c(ORCID = "0000-0002-3777-1410")), From 8244923988a046d69c8479bb1e888c6d9abd5226 Mon Sep 17 00:00:00 2001 From: Nikos Bosse <37978797+nikosbosse@users.noreply.github.com> Date: Tue, 24 Sep 2024 19:24:47 +0200 Subject: [PATCH 15/18] #896 Update documentation of log score (#918) * use neg_log_score * update docs, revert previous change * update docs --- R/metrics-binary.R | 1 + R/metrics-nominal.R | 1 + R/metrics-sample.R | 14 +++++++++----- man/logs_sample.Rd | 19 ++++++++++++++----- man/scoring-functions-binary.Rd | 6 ++++++ man/scoring-functions-nominal.Rd | 6 ++++++ 6 files changed, 37 insertions(+), 10 deletions(-) diff --git a/R/metrics-binary.R b/R/metrics-binary.R index 7a6b92b3b..49cc800c9 100644 --- a/R/metrics-binary.R +++ b/R/metrics-binary.R @@ -80,6 +80,7 @@ brier_score <- function(observed, predicted) { #' @importFrom methods hasArg #' @export #' @keywords metric +#' @family log score functions #' @rdname scoring-functions-binary logs_binary <- function(observed, predicted) { 
assert_input_binary(observed, predicted) diff --git a/R/metrics-nominal.R b/R/metrics-nominal.R index 103ef9d4a..63afc6c75 100644 --- a/R/metrics-nominal.R +++ b/R/metrics-nominal.R @@ -18,6 +18,7 @@ #' @export #' @keywords metric #' @rdname scoring-functions-nominal +#' @family log score functions #' @examples #' factor_levels <- c("one", "two", "three") #' predicted_label <- factor(c("one", "two", "three"), levels = factor_levels) diff --git a/R/metrics-sample.R b/R/metrics-sample.R index 38c3f0b21..74479d1c1 100644 --- a/R/metrics-sample.R +++ b/R/metrics-sample.R @@ -149,19 +149,23 @@ se_mean_sample <- function(observed, predicted) { #' [`logs_sample()`][scoringRules::scores_sample_univ] function from the #' \pkg{scoringRules} package. #' +#' The log score is the negative logarithm of the predictive density evaluated +#' at the observed value. +#' #' The function should be used to score continuous predictions only. #' While the Log Score is in theory also applicable -#' to discrete forecasts, the problem lies in the implementation: The Log score -#' needs a kernel density estimation, which is not well defined with -#' integer-valued Monte Carlo Samples. The Log score can be used for specific -#' discrete probability distributions. See the scoringRules package for -#' more details. +#' to discrete forecasts, the problem lies in the implementation: The function +#' uses a kernel density estimation, which is not well defined with +#' integer-valued Monte Carlo Samples. +#' See the scoringRules package for more details and alternatives, e.g. +#' calculating scores for specific discrete probability distributions. #' @inheritParams ae_median_sample #' @param ... Additional arguments passed to #' [logs_sample()][scoringRules::logs_sample()] from the scoringRules package. #' @inheritSection illustration-input-metric-sample Input format #' @return Vector with scores. #' @importFrom scoringRules logs_sample +#' @family log score functions #' @examples #' observed <- rpois(30, lambda = 1:30) #' predicted <- replicate(200, rpois(n = 30, lambda = 1:30)) diff --git a/man/logs_sample.Rd b/man/logs_sample.Rd index 00812c507..f5d71e66d 100644 --- a/man/logs_sample.Rd +++ b/man/logs_sample.Rd @@ -24,13 +24,16 @@ This function is a wrapper around the \code{\link[scoringRules:scores_sample_univ]{logs_sample()}} function from the \pkg{scoringRules} package. +The log score is the negative logarithm of the predictive density evaluated +at the observed value. + The function should be used to score continuous predictions only. While the Log Score is in theory also applicable -to discrete forecasts, the problem lies in the implementation: The Log score -needs a kernel density estimation, which is not well defined with -integer-valued Monte Carlo Samples. The Log score can be used for specific -discrete probability distributions. See the scoringRules package for -more details. +to discrete forecasts, the problem lies in the implementation: The function +uses a kernel density estimation, which is not well defined with +integer-valued Monte Carlo Samples. +See the scoringRules package for more details and alternatives, e.g. +calculating scores for specific discrete probability distributions. 
} \section{Input format}{ \if{html}{ @@ -52,4 +55,10 @@ logs_sample(observed, predicted) Alexander Jordan, Fabian Krüger, Sebastian Lerch, Evaluating Probabilistic Forecasts with scoringRules, \url{https://www.jstatsoft.org/article/view/v090i12} } +\seealso{ +Other log score functions: +\code{\link{logs_nominal}()}, +\code{\link{scoring-functions-binary}} +} +\concept{log score functions} \keyword{metric} diff --git a/man/scoring-functions-binary.Rd b/man/scoring-functions-binary.Rd index 4956d400f..371f7eab5 100644 --- a/man/scoring-functions-binary.Rd +++ b/man/scoring-functions-binary.Rd @@ -85,4 +85,10 @@ predicted <- runif(n = 30, min = 0, max = 1) brier_score(observed, predicted) logs_binary(observed, predicted) } +\seealso{ +Other log score functions: +\code{\link{logs_nominal}()}, +\code{\link{logs_sample}()} +} +\concept{log score functions} \keyword{metric} diff --git a/man/scoring-functions-nominal.Rd b/man/scoring-functions-nominal.Rd index dfc400556..a7b6d81ed 100644 --- a/man/scoring-functions-nominal.Rd +++ b/man/scoring-functions-nominal.Rd @@ -37,4 +37,10 @@ predicted <- matrix(c(0.8, 0.1, 0.4, nrow = 3) logs_nominal(observed, predicted, predicted_label) } +\seealso{ +Other log score functions: +\code{\link{logs_sample}()}, +\code{\link{scoring-functions-binary}} +} +\concept{log score functions} \keyword{metric} From b1785030b969aa7f0c5a46dd6f346439a14f4bf2 Mon Sep 17 00:00:00 2001 From: Sebastian Funk Date: Wed, 25 Sep 2024 17:48:46 +0100 Subject: [PATCH 16/18] fix typo Co-authored-by: Nikos Bosse <37978797+nikosbosse@users.noreply.github.com> --- R/pairwise-comparisons.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pairwise-comparisons.R b/R/pairwise-comparisons.R index 07e996f54..4a00b050c 100644 --- a/R/pairwise-comparisons.R +++ b/R/pairwise-comparisons.R @@ -124,7 +124,7 @@ get_pairwise_comparisons <- function( # input checks --------------------------------------------------------------- scores <- ensure_data.table(scores) - # check that 'compare' column in is are present + # check that column in 'compare' is present assert(check_columns_present(scores, compare)) # check that column(s) in `by` ar not in `compare` assert_disjunct(by, compare) From 2c5a2983f82e7777d1486906378a033d88125f7c Mon Sep 17 00:00:00 2001 From: Sebastian Funk Date: Wed, 25 Sep 2024 17:49:23 +0100 Subject: [PATCH 17/18] make NULL the default --- R/pairwise-comparisons.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pairwise-comparisons.R b/R/pairwise-comparisons.R index 4a00b050c..d467f2e98 100644 --- a/R/pairwise-comparisons.R +++ b/R/pairwise-comparisons.R @@ -579,7 +579,7 @@ permutation_test <- function(scores1, add_relative_skill <- function( scores, compare = "model", - by = character(0), + by = NULL, metric = intersect(c("wis", "crps", "brier_score"), names(scores)), baseline = NULL ) { From 26a6270f13a2e9d6d5a2ef193af19a852ca0a9c6 Mon Sep 17 00:00:00 2001 From: Sebastian Funk Date: Wed, 25 Sep 2024 17:58:08 +0100 Subject: [PATCH 18/18] render doc --- man/add_relative_skill.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/add_relative_skill.Rd b/man/add_relative_skill.Rd index 6c196edad..62bf180ad 100644 --- a/man/add_relative_skill.Rd +++ b/man/add_relative_skill.Rd @@ -7,7 +7,7 @@ add_relative_skill( scores, compare = "model", - by = character(0), + by = NULL, metric = intersect(c("wis", "crps", "brier_score"), names(scores)), baseline = NULL )
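A minimal usage sketch of the revised pairwise-comparison interface once this patch series is applied (an illustrative example, not part of the patches; it assumes the package's bundled `example_quantile` data with its "model" and "target_type" columns, as used in the examples above):

library(scoringutils)

scores <- example_quantile |>
  as_forecast_quantile() |>
  score()

# With the new defaults (compare = "model", by = NULL) there is one
# relative skill score per model, computed across all forecasts.
pairwise_overall <- get_pairwise_comparisons(scores)

# Supplying `by` adds further grouping: one relative skill score per
# model and target type.
pairwise_by_target <- get_pairwise_comparisons(scores, by = "target_type")

# add_relative_skill() uses the same compare/by interface and appends
# relative skill columns directly to the scores.
scores_with_skill <- add_relative_skill(scores, by = "target_type")

Setting `compare` to a column other than "model" would instead compare the distinct entries of that column, as described in the updated argument documentation.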