diff --git a/.Rbuildignore b/.Rbuildignore index 0ad7fbb..ba0759a 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -3,12 +3,10 @@ inst/tests.qmd inst/SGT.R inst/OpenTopics.qmd token -random.Rmd -selection.Rmd -spatial.Rmd -s_pb.Rmd -topmodels.Rmd +README.qmd .github ^_quarto$ ^altdoc$ ^docs$ +^.*\.Rproj$ +^\.Rproj\.user$ diff --git a/.gitignore b/.gitignore index 2a90233..96e0265 100644 --- a/.gitignore +++ b/.gitignore @@ -3,12 +3,11 @@ .Rhistory .Rapp.history .Rproj.user/ - # vignette output vignettes/*.html vignettes/*.pdf - # altdoc infrastructure altdoc/freeze.rds _quarto/ docs/ +.Rbuildignore diff --git a/DESCRIPTION b/DESCRIPTION index 0f94831..bea7ed8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -19,7 +19,7 @@ URL: https://gamlss-dev.github.io/gamlss2/ BugReports: https://github.com/gamlss-dev/gamlss2/issues Depends: R (>= 4.1.0), gamlss.dist, mgcv Imports: parallel, Formula, mvtnorm -Suggests: gamlss, gamlss.data, colorspace, knitr, scoringRules, partykit, Matrix, nnet, lattice, rpart, distributions3, nlme +Suggests: gamlss, gamlss.data, colorspace, ggplot2, knitr, scoringRules, partykit, Matrix, nnet, lattice, rpart, distributions3, nlme, sf, spdep LazyLoad: yes VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index 64ea76e..d4aa1e2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -46,7 +46,8 @@ export( "new_formula", "softplus", "shiftlog", - "Kumaraswamy" + "Kumaraswamy", + "make.link2" ) S3method(gamlss2, formula) diff --git a/R/specials.R b/R/specials.R index 7f568e9..821abd2 100644 --- a/R/specials.R +++ b/R/specials.R @@ -98,6 +98,16 @@ special_terms <- function(x, data, binning = FALSE, digits = Inf, ...) } } + if(any(dups <- duplicated(names(sterms)))) { + dups <- names(sterms)[dups] + for(j in dups) { + dn <- grep(j, names(sterms), fixed = TRUE, value = TRUE) + dn <- paste0(dn, ".", 1:length(dn)) + i <- grep(j, names(sterms), fixed = TRUE) + names(sterms)[i] <- dn + } + } + return(sterms) } diff --git a/README.md b/README.md index 3b5b98f..1483d05 100644 --- a/README.md +++ b/README.md @@ -1,45 +1,37 @@ - - -# gamlss2: Infrastructure for Distributional Regression +# gamlss2: GAMLSS Infrastructure for Flexible Distributional Regression ## Installation -The development version of *gamlss2* can be installed via +The development version of _gamlss2_ can be installed via -``` r +```{r installation, eval = FALSE} install.packages("gamlss2", repos = c("https://gamlss-dev.R-universe.dev", "https://cloud.R-project.org")) ``` -## Overview - -The primary purpose of this package is to facilitate the creation of -advanced infrastructures designed to enhance the GAMLSS modeling -framework. Notably, the *gamlss2* package represents a significant -overhaul of its predecessor, *gamlss*, with a key emphasis on improving -estimation speed and incorporating more flexible infrastructures. These -enhancements enable the seamless integration of various algorithms into -GAMLSS, including gradient boosting, Bayesian estimation, regression -trees, and forests, fostering a more versatile and powerful modeling -environment. - -Moreover, the package expands its compatibility by supporting all model -terms from the base R *mgcv* package. Additionally, the *gamlss2* -package introduces the capability to accommodate more than four -parameter families. Essentially, this means that users can now specify -any type of model using these new infrastructures, making the package -highly flexible and accommodating to a wide range of modeling -requirements. - -- The main model function is `gamlss2()`. -- The default optimizer functions is `RS()`. Optimizer functions can - be exchanged. -- Most important methods: `summary()`, `plot()`, `predict()`. -- Easy development of new family objects, see `?family.gamlss2`. -- User-specific “special” terms are possible, see `?special_terms`. +## Overview + +The primary purpose of this package is to facilitate the creation of advanced infrastructures +designed to enhance the GAMLSS modeling framework. Notably, the _gamlss2_ package represents a +significant overhaul of its predecessor, _gamlss_, with a key emphasis on improving estimation +speed and incorporating more flexible infrastructures. These enhancements enable the seamless +integration of various algorithms into GAMLSS, including gradient boosting, Bayesian estimation, +regression trees, and forests, fostering a more versatile and powerful modeling environment. + +Moreover, the package expands its compatibility by supporting all model terms from the base +R _mgcv_ package. Additionally, the _gamlss2_ package introduces the capability to +accommodate more than four parameter families. Essentially, this means that users can now +specify any type of model using these new infrastructures, making the package highly +flexible and accommodating to a wide range of modeling requirements. + +* The main model function is `gamlss2()`. +* The default optimizer functions is `RS()`. Optimizer functions can be exchanged. +* Most important methods: `summary()`, `plot()`, `predict()`. +* Easy development of new family objects, see `?family.gamlss2`. +* User-specific "special" terms are possible, see `?special_terms`. For examples, please visit the manual pages. -``` r +```{r help, eval = FALSE} help(package = "gamlss2") ``` diff --git a/data/Germany.rda b/data/Germany.rda new file mode 100644 index 0000000..c259532 Binary files /dev/null and b/data/Germany.rda differ diff --git a/data/storms.rda b/data/storms.rda new file mode 100644 index 0000000..344c621 Binary files /dev/null and b/data/storms.rda differ diff --git a/inst/extra/GermanyElevation.grd b/inst/extra/GermanyElevation.grd new file mode 100644 index 0000000..81d7813 --- /dev/null +++ b/inst/extra/GermanyElevation.grd @@ -0,0 +1,27 @@ +[general] +creator=R package 'raster' +created=2024-09-30 08:52:19 +[data] +datatype=FLT4S +byteorder=little +nbands=1 +bandorder=BIL +categorical=FALSE +minvalue=-33 +maxvalue=1992 +nodatavalue=-3.4e+38 +[legend] +legendtype= +values= +color= +[description] +layername=file29ba723b9492a +[georeference] +nrows=209 +ncols=246 +xmin=5.87622623088577 +ymin=47.2597641144092 +xmax=15.0252873245434 +ymax=55.0327469134923 +projection=+proj=longlat +datum=WGS84 +no_defs +wkt=GEOGCRS["unknown", DATUM["World Geodetic System 1984", ELLIPSOID["WGS 84",6378137,298.257223563, LENGTHUNIT["metre",1]], ID["EPSG",6326]], PRIMEM["Greenwich",0, ANGLEUNIT["degree",0.0174532925199433], ID["EPSG",8901]], CS[ellipsoidal,2], AXIS["longitude",east, ORDER[1], ANGLEUNIT["degree",0.0174532925199433, ID["EPSG",9122]]], AXIS["latitude",north, ORDER[2], ANGLEUNIT["degree",0.0174532925199433, ID["EPSG",9122]]]] diff --git a/inst/extra/GermanyElevation.gri b/inst/extra/GermanyElevation.gri new file mode 100644 index 0000000..1349f2f Binary files /dev/null and b/inst/extra/GermanyElevation.gri differ diff --git a/man/Germany.Rd b/man/Germany.Rd new file mode 100644 index 0000000..68ceaa6 --- /dev/null +++ b/man/Germany.Rd @@ -0,0 +1,47 @@ +\name{Germany} +\alias{Germany} + +\encoding{UTF-8} + +\title{Map of Germany} + +\description{ + The map contains the counties of Germany. The data was originally taken from + GADM (\url{https://gadm.org/}) and slightly simplified to reduce disk space. +} + +\usage{data("storms", package = "gamlss2")} + +\format{ +A class \code{"sf"} data frame containing 403 counties of Germany. +\describe{ + \item{id}{Factor, a county id.} + \item{county}{Character, the county in Germany where the weather station is located.} + \item{state}{Character, the state in Germany where the weather station is located.} + \item{geometry}{The polygon information.} +} +} + +\source{ + Map of Germany: + \describe{ + \item{Data Source:}{GADM} + \item{Licence:}{CC BY} + \item{URL:}{\url{https://gadm.org/}} + \item{Coordinate Reference System:}{Longitude/latitude and the WGS84 datum.} + } +} + +\examples{\dontshow{ if(!requireNamespace("sf")) { if(interactive() || is.na(Sys.getenv("_R_CHECK_PACKAGE_NAME_", NA))) { stop("not all packages required for the example are installed") } else q() }} +## load sf package for plotting +library("sf") + +## load the data +data("Germany", package = "gamlss2") + +## plot the map +plot(st_geometry(Germany)) +} + +\keyword{datasets} + diff --git a/man/make.link2.Rd b/man/make.link2.Rd new file mode 100644 index 0000000..09d6de6 --- /dev/null +++ b/man/make.link2.Rd @@ -0,0 +1,44 @@ +\name{make.link2} +\alias{make.link2} + +\title{Create a Link for Families} + +\description{ + This function is used with the \code{family} functions in \code{gamlss2()}. + Given the name of a link, it returns a link function, an inverse + link function, the derivative \eqn{d\mu/d\eta}, and a function for domain + checking. Note that \code{make.link2()} is slightly more flexible and also allows + functions as arguments. +} + +\usage{ +make.link2(link) +} + +\arguments{ + \item{link}{ + A character string, see function \code{\link[stats]{make.link}}, or function. + } +} + +\value{ + A list containing the following components: + \item{linkfun}{Link function \code{function(mu)}.} + \item{linkinv}{Inverse link function \code{function(eta)}.} + \item{mu.eta}{Derivative \code{function(eta)}: \eqn{d\mu/d\eta}.} + \item{valideta}{Function \code{function(eta)} that returns \code{TRUE} if \code{eta} is in the domain of \code{linkinv}.} + \item{name}{A character string representing the name of the link function.} +} + +\seealso{ + \code{\link[stats]{make.link}}, \code{\link{gamlss2}}, \code{\link{gamlss2.family}}. +} + +\examples{ +## character specification +utils::str(make.link2("logit")) + +## functions +utils::str(make.link2(softplus)) +} + diff --git a/man/storms.Rd b/man/storms.Rd new file mode 100644 index 0000000..5088a80 --- /dev/null +++ b/man/storms.Rd @@ -0,0 +1,66 @@ +\name{storms} +\alias{storms} + +\encoding{UTF-8} + +\title{Severe Storms in Germany} + +\description{ + According to the Beaufort scale, severe storms occur from a wind speed of 24.5-28.4 m/s. + This dataset contains annual severe storm counts from weather stations in Germany + from 1981 to 2021. +} + +\usage{data("storms", package = "gamlss2")} + +\format{ +A data frame containing 3494 observations on 8 variables. +\describe{ + \item{id}{Factor, the weather station id.} + \item{county}{Character, the county in Germany where the weather station is located.} + \item{state}{Character, the state in Germany where the weather station is located.} + \item{year}{Integer, the year the observation was measured.} + \item{counts}{Integer, the number of severe storms in this year.} + \item{alt}{Numeric, the altitude in meters above sea level of the weather station.} + \item{lon}{Numeric, the longitude coordinate of the weather station.} + \item{lat}{Numeric, the latitude coordinate of the weather station.} +} +} + +\source{ + Severe Storms Data: + \describe{ + \item{Data Source:}{Deutscher Wetterdienst (DWD), Climate Data Center (CDC).} + \item{Licence:}{CC BY 4.0} + \item{URL:}{\url{https://opendata.dwd.de/climate_environment/CDC/}} + \item{Coordinate Reference System:}{Longitude/latitude and the WGS84 datum.} + } +} + +\examples{ +## load the data +data("storms", package = "gamlss2") + +## yearly observations +plot(counts ~ year, data = storms) + +## count distribution +barplot(table(storms$counts)) + +## NBI model +## model formula including spatial effect +\dontrun{f <- counts ~ s(year) + s(alt) + s(lon, lat) | + s(year) + s(alt) + s(lon, lat) + +## estimate model +b <- gamlss2(f, data = storms, family = NBI) + +## estimated effects +plot(b) + +## residual diagnostics +plot(b, which = "resid") +}} + +\keyword{datasets} + diff --git a/vignettes/families.Rmd b/vignettes/families.Rmd new file mode 100644 index 0000000..c94c2c4 --- /dev/null +++ b/vignettes/families.Rmd @@ -0,0 +1,234 @@ +--- +title: "Family Objects" +output: + html_document: + toc: true + toc_float: true + theme: flatly +bibliography: gamlss2.bib +nocite: | + @Rigby+Stasinopoulos:2005 +vignette: > + %\VignetteIndexEntry{Family Objects} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteDepends{gamlss2} + %\VignetteKeywords{distributional regression, family objects} + %\VignettePackage{gamlss2} +--- + +```{r preliminaries, echo=FALSE, message=FALSE, results="hide"} +library("gamlss2") +``` + +Note that all family objects of the _gamlss.dist_ package can be used for modeling. However, for +users wanting to specify their own (new) distribution model, this document provides a guide +on how to define custom family objects within the gamlss2 framework. + +Family objects in the _gamlss2_ package play an essential role in defining the models used for +fitting data to distributions. These objects encapsulate the necessary details about the +distribution and the parameters, such as: + +* The names of the parameters. +* The link functions that map the parameters to the predictor. +* Functions for the density, log-likelihood, and their derivatives. + +This document provides an overview of how to construct and use family objects within _gamlss2_. +By the end, you should have a good understanding of how to implement a custom family for +use in statistical models. + +### Defining Family Objects + +A family object in _gamlss2_ is a list that must meet the following minimum criteria: + +* Family Name: The object must contain the family name as a character string. +* Parameters: The object must list the parameters of the distribution (e.g., `"mu"` and `"sigma"` + for a normal distribution). +* Link Functions: It must specify the link functions associated with each parameter. +* Density Function: A `d()` function must be provided to evaluate the (log-)density of the + distribution. + +Optionally, a family object can include functions to calculate the log-likelihood, +random number generation, cumulative distribution function (CDF), and quantile function. + +Here's an example of a minimal family object for the normal distribution. + +```{r} +Normal <- function(...) { + fam <- list( + "family" = "Normal", + "names" = c("mu", "sigma"), + "links" = c("mu" = "identity", "sigma" = "log"), + "d" = function(y, par, log = FALSE, ...) { + dnorm(y, par$mu, par$sigma, log = log) + } + ) + class(fam) <- "gamlss2.family" + return(fam) +} +``` + +In this example, we define a normal distribution with two parameters: `"mu"` (mean) and `"sigma"` +(standard deviation). The link function for `"mu"` is the identity, and for `"sigma"`, +it is the log function. The density function uses the standard `dnorm()` function from +to calculate the normal density. + +### Density Function + +The density function must accept the following arguments: + +```{r, eval=FALSE} +d(y, par, log = FALSE, ...) +``` + +* `y`: The response variable. +* `par`: A named list of parameters (e.g., `"mu"`, `"sigma"` for the normal distribution). +* `log`: A logical value indicating whether to return the log-density. + +### Optional Derivatives + +Family objects can optionally include functions to compute the first and second derivatives of +the log-likelihood with respect to the predictors (or its expectations). These derivatives are used for +optimization during model fitting. + +The derivative functions follow the form: + +```{r, eval=FALSE} +function(y, par, ...) +``` + +The derivate functions of first order must be provided as a named list, one list element for +each parameter of the distribution, and is named `"score"`. The second order derivative list +is named `"hess"`. Note that these functions must return the derivative **w.r.t. predictor** +and the `"hess"` functions must return the **negative** (expected) second derivatives + +An example of setting up first and second order derivatives for the normal is +provided in the following code: + +```{r} +Normal <- function(...) { + fam <- list( + "family" = "Normal", + "names" = c("mu", "sigma"), + "links" = c("mu" = "identity", "sigma" = "log"), + "d" = function(y, par, log = FALSE, ...) { + dnorm(y, par$mu, par$sigma, log = log) + }, + "score" = list( + "mu" = function(y, par, ...) { + (y - par$mu) / (par$sigma^2) + }, + "sigma" = function(y, par, ...) { + -1 + (y - par$mu)^2 / (par$sigma^2) + } + ), + "hess" = list( + "mu" = function(y, par, ...) { + 1 / (par$sigma^2) + }, + "sigma" = function(y, par, ...) { + rep(2, length(y)) + } + ) + ) + class(fam) <- "gamlss2.family" + return(fam) +} +``` + +If no derivatives are provided, numerical approximations will be used by the package. + +### Additional Functions + +Family objects can also include other functions such as: + +* Cumulative distribution function (`p()`). +* Quantile function (`q()`). +* Random number generation (`r()`). + +These functions should adhere to the same structure as the density function, taking the response +(`y`), parameters (`par`), and other relevant arguments. + +### Flexible Links + +Note that the example above used static link functions to define the family object. +However, users can easily create families with flexible link functions as well. +A helpful example of how to implement such flexibility can be found in the Kumaraswamy +distribution implementation, which provides a clear template for setting up families with +customizable link functions. + +The Kumaraswamy distribution is a continuous distribution defined on the interval $(0, 1)$. +It is similar to the Beta distribution but has simpler forms for its cumulative distribution and +inverse cumulative distribution functions, making it more computationally efficient for certain applications. + +The probability density function (PDF) of the Kumaraswamy distribution is: + +$$ +f(y; a, b) = aby^{a-1}(1 - y^a)^{b-1} +$$ + +where $y \in (0, 1)$ is the response, and $a$ and $b$ are non-negative parameters that +determine the shape of the distribution. The complete implementation, including flexible link +functions is provided in the `Kumaraswamy()` family. + +In the following example, we will create the family object for the Kumaraswamy distribution +using the `Kumaraswamy()` function and estimate a model using this distribution. + +#### Example: Modeling with the Kumaraswamy Distribution + +In this example, we will: + +* Define the Kumaraswamy family object. +* Simulate data based on this distribution. +* Estimate the model and plot the results. + +```{r Kumaraswamy-example, eval=TRUE, echo=TRUE} +## Define the Kumaraswamy family object with specific link functions. +fam <- Kumaraswamy(a.link = shiftlog, b.link = "log") + +## Set seed for reproducibility. +set.seed(123) + +## Simulate data for 1000 observations. +n <- 1000 +d <- data.frame("x" = runif(n, -pi, pi)) + +## Specify the true parameters. +par <- data.frame( + "a" = exp(1.2 + sin(d$x)) + 1, # Parameter 'a' depends on 'x' + "b" = 1 # Parameter 'b' is constant +) + +## Sample response values using the family object. +d$y <- fam$r(1, par) + +## Estimate a model using the Kumaraswamy family. +b <- gamlss2(y ~ s(x), data = d, family = fam) + +## Plot the estimated effect. +plot(b) + +## Plot residual diagnostics. +plot(b, which = "resid") +``` + +In this example, we simulated a dataset where the parameter `a` of the Kumaraswamy +distribution varies with `x` following a sinusoidal pattern. We then used the `gamlss2()` function +to fit a smooth model that estimates this relationship. The effect of `x` on `y` is plotted, +followed by a diagnostic plot to assess residuals. + +The `Kumaraswamy()` family in _gamlss2_ is flexible, allowing the user to specify +different link functions for its parameters, such as the default `shiftlog` link function for +parameter `a`, which ensures non-negative values. + +### Conclusion + +Family objects in the _gamlss2_ package are a fundamental component for defining flexible, +distribution-based regression models, and beyond. By encapsulating the necessary elements, +such as parameters, link functions, and density functions, they provide a powerful framework +for customizing models to fit specific data. The flexibility to define custom families, +as demonstrated with the `Kumaraswamy()` distribution, enables users to extend the package +beyond its default families, making it adaptable to a wide range of modeling scenarios. +Furthermore, the ability to define both static and dynamic link functions enhances the +versatility of _gamlss2_ for distributional regression, empowering users to tailor +models to their unique data and research needs. + diff --git a/vignettes/random.Rmd b/vignettes/random.Rmd index 2e1be70..859f9a5 100644 --- a/vignettes/random.Rmd +++ b/vignettes/random.Rmd @@ -9,7 +9,7 @@ bibliography: gamlss2.bib nocite: | @Rigby+Stasinopoulos:2005 vignette: > - %\VignetteIndexEntry{Top Models} + %\VignetteIndexEntry{Random Effects} %\VignetteEngine{knitr::rmarkdown} %\VignetteDepends{gamlss2} %\VignetteKeywords{distributional regression, random effects} @@ -20,7 +20,5 @@ vignette: > library("gamlss2") ``` -### Random Effects - Describe how to incorporate random effects in _gamlss2_ diff --git a/vignettes/s_pb.Rmd b/vignettes/s_pb.Rmd index 3910611..55e37e3 100644 --- a/vignettes/s_pb.Rmd +++ b/vignettes/s_pb.Rmd @@ -24,7 +24,7 @@ nocite: | @Mayretal2012 vignette: > - %\VignetteIndexEntry{Top Models} + %\VignetteIndexEntry{Smooth Terms} %\VignetteEngine{knitr::rmarkdown} %\VignetteDepends{gamlss2} %\VignetteKeywords{distributional regression, inference, forecasting} @@ -35,7 +35,5 @@ vignette: > library("gamlss2") ``` -### Smooth Terms using s() and pb() - Examples on how to set up models using smooth terms with `s()` and `pb()`. diff --git a/vignettes/selection.Rmd b/vignettes/selection.Rmd index e04950d..941f487 100644 --- a/vignettes/selection.Rmd +++ b/vignettes/selection.Rmd @@ -9,7 +9,7 @@ bibliography: gamlss2.bib nocite: | @Rigby+Stasinopoulos:2005 vignette: > - %\VignetteIndexEntry{Top Models} + %\VignetteIndexEntry{Selection} %\VignetteEngine{knitr::rmarkdown} %\VignetteDepends{gamlss2} %\VignetteKeywords{distributional regression, model selection, variable selection} @@ -20,8 +20,6 @@ vignette: > library("gamlss2") ``` -### Variable and Model Selection - * Find the best fitting distribution. * Find relevant variables. diff --git a/vignettes/spatial.Rmd b/vignettes/spatial.Rmd index a65325d..3e5e930 100644 --- a/vignettes/spatial.Rmd +++ b/vignettes/spatial.Rmd @@ -9,7 +9,7 @@ bibliography: gamlss2.bib nocite: | @Rigby+Stasinopoulos:2005 vignette: > - %\VignetteIndexEntry{Top Models} + %\VignetteIndexEntry{Spatial Effects} %\VignetteEngine{knitr::rmarkdown} %\VignetteDepends{gamlss2} %\VignetteKeywords{distributional regression, spatial effects} @@ -18,9 +18,192 @@ vignette: > ```{r preliminaries, echo=FALSE, message=FALSE, results="hide"} library("gamlss2") +library("sf") ``` -### Spatial Effects +Spatial data analysis is important in many fields such as environmental science, epidemiology, +and climatology, where observations are collected across different geographical locations. +A key challenge in spatial modeling is accounting for the dependence structure among nearby +regions, which often display correlated patterns in outcomes. -Describe how to incorporate spatial effects in _gamlss2_ +In generalized additive models for location, scale, and shape (GAMLSS), spatial effects can be +incorporated in various ways, such as through Markov random fields (MRFs). MRFs handle spatial +correlation by applying penalties that reflect the neighborhood structure of the spatial data. +This vignette is divided into two parts: the first part demonstrates how to estimate discrete +spatial effects using MRFs with the gamlss2 package, while the second part provides examples +of modeling spatial effects with smooth functions like thin-plate splines or tensor product splines. + +### Example: Modeling Severe Storm Counts in Germany + +In this example, we analyze severe storm counts recorded at various weather stations across +Germany over multiple years. Our goal is to model these storm counts while accounting for the +spatial dependence between stations. To achieve this, we associate each weather station with +its respective county in Germany, enabling us to incorporate the geographical structure into +the model. + +```{r} +## load the Germany severe storm data +data("storms", package = "gamlss2") + +## plot storm counts per station and year +plot(range(storms$year), range(storms$counts), type = "n", + xlab = "Year", ylab = "Counts") +for(j in levels(storms$id)) { + dj <- subset(storms, id == j) + dj <- dj[order(dj$year), ] + with(dj, lines(counts ~ year, type = "b", pch = 16, + col = rgb(0.1, 0.1, 0.1, alpha = 0.4))) +} +``` + +The data contains storm counts per year for each station. A preliminary visualization +of these counts allows us to inspect patterns of storm frequency over time and across stations. + +### Visualizing the Spatial Structure + +We begin by plotting the locations of weather stations on a map of Germany. The _sf_ package is +used to manage and plot spatial data. + +```{r} +## load map of Germany +## needs sf package for plotting +library("sf") +data("Germany", package = "gamlss2") + +## plot station locations +plot(st_geometry(Germany)) +co <- unique(storms[, c("lat", "lon")]) +points(co, col = 2, pch = 4, lwd = 2) +``` + +This map shows the geographical distribution of weather stations. The spatial structure will +be incorporated into our model to account for the proximity of stations when estimating storm counts. + +### Defining the Neighborhood Structure + +Next, we define the neighborhood structure among the weather stations using a distance-based +criterion. This is crucial for the Markov random field, as it specifies how spatial +correlation should be penalized. + +```{r} +## estimate spatial count model using +## a Markov random field, first a neighbor matrix +## needs to be computed, here we use distance based +## neighbors +library("spdep") +nb <- dnearneigh(st_centroid(st_geometry(Germany)), d1 = 0, d2 = 80) +plot(st_geometry(Germany), border = "lightgray") +plot.nb(nb, st_geometry(Germany), add = TRUE) +``` + +The neighbor matrix is constructed using the `poly2nb()` function from the _spdep_ package, +which calculates the adjacency structure of the geographical regions. We then visualize +the spatial network of neighbors on the map. + +### Constructing the Penalty Matrix + +The penalty matrix defines the spatial penalties imposed by the Markov random field. +The matrix is constructed based on the neighbor relationships defined earlier. + +```{r} +## compute final neighbor penalty matrix +K <- nb2mat(nb, style = "B", zero.policy = TRUE) + +## assign region names +rownames(K) <- colnames(K) <- levels(Germany$id) + +## set up final penalty matrix +K <- -1 * K +diag(K) <- -1 * rowSums(K) + +## remove regions not in data +i <- which(rownames(K) %in% levels(storms$id)) +K <- K[i, i] +``` + +The penalty matrix `K` is set up such that it reflects the neighborhood relationships between the +regions. Each element of the matrix represents how strongly each region is connected to its +neighbors. The diagonal entries represent the total number of neighbors for each region. + +### Estimating the Model + +We now estimate the spatial count model using the Negative Binomial distribution (`NBI`). +The model includes smooth functions of altitude, year, and an interaction between altitude and year. +Spatial effects are incorporated using the `bs = "mrf"` option. + +```{r, results="hide"} +## estimate count model using the NBI family, +## model formula is +f <- ~ s(alt) + s(year) + s(id, bs = "mrf", xt = list("penalty" = K)) + + te(alt, year) +f <- list(update(f, counts ~ .), f) + +## estimate model using BIC for shrinkage parameter selection +b <- gamlss2(f, data = storms, family = NBI, criterion = "BIC") +``` + +```{r} +## model summary +summary(b) +``` + +Here, the spatial effect is modeled as an MRF smooth (`s(id, bs = "mrf")`), where the penalty +matrix `K` enforces spatial structure based on neighboring stations. We use the +Bayesian Information Criterion (BIC) to select the optimal smoothing parameters. + +### Visualizing the Estimated Effects + +Finally, we visualize the estimated effects from the fitted model. + +```{r} +plot(b) +``` + +The plot shows the estimated smooth functions for altitude, year, and the spatial effect. +These visualizations help us interpret how storm counts vary across space and time. + +### Prediction + +First we predict the spatial effects on the scale of the predictors. Therefore, we +set up a new data frame containing only the unique loactions. + +```{r} +nd <- unique(storms[, "id", drop = FALSE]) + +## set some dummy values, not needed when predicting single model term +nd$alt <- 1000 +nd$year <- 2020 + +## prediction for the mu predictor +nd$fmu <- predict(b, newdata = nd, + model = "mu", term = "s(id)", + type = "link") + +## same for sigma +nd$fsigma <- predict(b, newdata = nd, + model = "sigma", term = "s(id)", + type = "link") + +## add fitted values to map of Germany +m <- merge(Germany, nd, by = "id", all.x = TRUE) + +## plot spatial effects +library("ggplot2") +library("colorspace") +ggplot(m) + geom_sf(aes(fill = fmu)) + +scale_fill_continuous_diverging("Blue-Red 3") + theme_bw() + +## note that because of the discrete spatial effect, +## there are a lot of NAs, therefore, we need to compute +## predictions in such regions by averaging using the neighbors +## of a region. we use the neighbour list object nb to compute +## the predictions, this is iterated. +while(any(is.na(m$fmu))) { + m$fmu <- sapply(nb, function(i) mean(m$fmu[i], na.rm = TRUE)) +} + +## plot final spatial effect +ggplot(m) + geom_sf(aes(fill = fmu)) + +scale_fill_continuous_diverging("Blue-Red 3") + theme_bw() +``` diff --git a/vignettes/topmodels.Rmd b/vignettes/topmodels.Rmd index b6d3edf..69a9524 100644 --- a/vignettes/topmodels.Rmd +++ b/vignettes/topmodels.Rmd @@ -20,7 +20,5 @@ vignette: > library("gamlss2") ``` -### Top Models - Introduction on how to use the _topmodels_ package with _gamlss2_.