Skip to content

Commit

Permalink
feat: add gaussian kde
Browse files Browse the repository at this point in the history
  • Loading branch information
dancixx committed Jan 13, 2025
1 parent e0a9d6e commit 96a8cdd
Show file tree
Hide file tree
Showing 18 changed files with 355 additions and 11 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ ndarray = { version = "0.16.1", features = [
"matrixmultiply-threading",
"blas",
] }
ndarray-linalg = { version = "0.16.0", features = ["openblas-static"] }
ndarray-npy = "0.9.1"
ndarray-rand = "0.15.0"
ndarray-stats = "0.6.0"
Expand Down
2 changes: 1 addition & 1 deletion src/stats.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
pub mod cir;
pub mod copula;
pub mod copulas;
pub mod double_exp;
pub mod fd;
pub mod fou_estimator;
pub mod gaussian_kde;
pub mod mle;
pub mod non_central_chi_squared;
2 changes: 1 addition & 1 deletion src/stats/copulas.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ pub mod bivariate;
pub mod correlation;
pub mod empirical;
pub mod multivariate;
pub mod optimize;
pub mod samples;
pub mod univariate;
2 changes: 1 addition & 1 deletion src/stats/copulas/empirical.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ mod tests {
use rand::thread_rng;
use rand_distr::{Distribution, Uniform};

use crate::{stats::copula::plot_copula_samples, stochastic::N};
use crate::{stats::copulas::samples::plot_copula_samples, stochastic::N};

use super::EmpiricalCopula2D;

Expand Down
41 changes: 41 additions & 0 deletions src/stats/copulas/multivariate/gaussian.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
use std::error::Error;

use ndarray::{Array1, Array2};
use statrs::distribution::{Continuous, ContinuousCDF, Normal};

use crate::stats::copulas::univariate::gaussian::GaussianUnivariate;

use super::{CopulaType, Multivariate};

/// Gaussian (normal) copula over an arbitrary number of dimensions.
///
/// NOTE(review): every trait method below is still `todo!()` — this type is a
/// stub committed ahead of the actual implementation, so none of the imports
/// (`Normal`, `GaussianUnivariate`, …) are exercised yet.
#[derive(Debug)]
pub struct GaussianMultivariate;

impl Multivariate for GaussianMultivariate {
    /// Identifies this copula family as Gaussian.
    fn r#type(&self) -> CopulaType {
        CopulaType::Gaussian
    }

    /// Draw `n` samples from the fitted copula. Unimplemented.
    fn sample(&self, n: usize) -> Result<Array2<f64>, Box<dyn Error>> {
        todo!()
    }

    /// Fit the copula to data matrix `X` (rows = observations). Unimplemented.
    fn fit(&mut self, X: Array2<f64>) -> Result<(), Box<dyn Error>> {
        todo!()
    }

    /// Validate `X` against the fitted model. Unimplemented.
    fn check_fit(&self, X: &Array2<f64>) -> Result<(), Box<dyn Error>> {
        todo!()
    }

    /// Copula density for each row of `X`. Unimplemented.
    fn pdf(&self, X: Array2<f64>) -> Result<Array1<f64>, Box<dyn Error>> {
        todo!()
    }

    /// Log-density for each row of `X`. Unimplemented.
    fn log_pdf(&self, X: Array2<f64>) -> Result<Array1<f64>, Box<dyn Error>> {
        todo!()
    }

    /// Copula CDF for each row of `X`. Unimplemented.
    fn cdf(&self, X: Array2<f64>) -> Result<Array1<f64>, Box<dyn Error>> {
        todo!()
    }
}
Empty file removed src/stats/copulas/optimize.rs
Empty file.
File renamed without changes.
8 changes: 0 additions & 8 deletions src/stats/copulas/univariate.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1 @@
pub mod beta;
pub mod gamma;
pub mod gaussian;
pub mod gaussian_kde;
pub mod log_laplace;
pub mod selection;
pub mod student_t;
pub mod truncated_gaussian;
pub mod uniform;
Empty file.
Empty file.
60 changes: 60 additions & 0 deletions src/stats/copulas/univariate/gaussian.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
use ndarray::Array1;
use statrs::distribution::{Continuous, ContinuousCDF, Normal};

/// Univariate Gaussian marginal backed by a `statrs` `Normal` distribution.
///
/// Construct with [`GaussianUnivariate::new`], then call
/// [`GaussianUnivariate::fit`] before using `cdf`/`pdf`/`ppf`; the accessors
/// return neutral fallback values while unfitted.
#[derive(Debug, Clone)]
pub struct GaussianUnivariate {
    /// Fitted distribution; `None` until `fit` has been called.
    dist: Option<Normal>,
}

impl Default for GaussianUnivariate {
    /// Equivalent to [`GaussianUnivariate::new`] (clippy: `new_without_default`).
    fn default() -> Self {
        Self::new()
    }
}

impl GaussianUnivariate {
    /// Creates an unfitted instance (`dist` is `None`).
    pub fn new() -> Self {
        Self { dist: None }
    }

    /// Fits mean and sample standard deviation to `column`, then stores the
    /// resulting normal distribution.
    ///
    /// With fewer than two observations a variance cannot be estimated, so
    /// this falls back to the standard normal `N(0, 1)`.
    pub fn fit(&mut self, column: &Array1<f64>) {
        let n = column.len() as f64;
        if n < 2.0 {
            // Trivial fallback: standard normal.
            self.dist = Some(Normal::new(0.0, 1.0).unwrap());
            return;
        }
        let mean = column.mean().unwrap_or(0.0);
        // Unbiased (n - 1) sample variance.
        let var = column.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / (n - 1.0);
        // Clamp the std away from zero so `Normal::new` cannot fail on a
        // constant column.
        let std = var.sqrt().max(1e-12);
        self.dist = Some(Normal::new(mean, std).unwrap());
    }

    /// Returns `true` once `fit` has stored a distribution.
    pub fn is_fitted(&self) -> bool {
        self.dist.is_some()
    }

    /// CDF of the fitted normal; returns 0.5 (the CDF at the mean) when unfitted.
    pub fn cdf(&self, x: f64) -> f64 {
        self.dist.as_ref().map_or(0.5, |d| d.cdf(x))
    }

    /// PDF of the fitted normal; returns 1.0 when unfitted.
    pub fn pdf(&self, x: f64) -> f64 {
        self.dist.as_ref().map_or(1.0, |d| d.pdf(x))
    }

    /// Inverse CDF (quantile / PPF); returns 0.0 when unfitted.
    pub fn ppf(&self, p: f64) -> f64 {
        self.dist.as_ref().map_or(0.0, |d| d.inverse_cdf(p))
    }
}
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
250 changes: 250 additions & 0 deletions src/stats/gaussian_kde.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
use ndarray::{Array, Array1};
use std::f64::consts::PI;

/// A struct representing a Gaussian Kernel Density Estimator (KDE).
///
/// # Fields
/// - `data`: 1D array of data points.
/// - `bandwidth`: The bandwidth (smoothing parameter) for the Gaussian kernel.
#[derive(Debug)]
pub struct GaussianKDE {
    // Sample points the density estimate is built from.
    data: Array1<f64>,
    // Gaussian kernel bandwidth; larger values give a smoother estimate.
    bandwidth: f64,
}

impl GaussianKDE {
    /// Builds a KDE over `data` with an explicitly chosen `bandwidth`.
    ///
    /// # Arguments
    ///
    /// * `data` - 1D array of data points.
    /// * `bandwidth` - Smoothing parameter for the Gaussian kernel.
    ///
    /// # Examples
    ///
    /// ```
    /// use ndarray::Array1;
    /// // let kde = GaussianKDE::new(Array1::from(vec![1.0, 2.0, 3.0]), 0.5);
    /// ```
    pub fn new(data: Array1<f64>, bandwidth: f64) -> Self {
        Self { data, bandwidth }
    }

    /// Builds a KDE whose bandwidth is selected automatically via Silverman's
    /// rule of thumb: `h = 0.9 * min(σ, IQR/1.34) * n^(-1/5)` (see
    /// [`silverman_bandwidth`]).
    ///
    /// # Arguments
    ///
    /// * `data` - 1D array of data points.
    pub fn with_silverman_bandwidth(data: Array1<f64>) -> Self {
        let bandwidth = silverman_bandwidth(&data);
        Self { data, bandwidth }
    }

    /// Normal PDF with mean `xi` and standard deviation `self.bandwidth`,
    /// evaluated at `x`.
    fn gaussian_kernel(&self, x: f64, xi: f64) -> f64 {
        let z = (x - xi) / self.bandwidth;
        let norm_factor = 1.0 / (self.bandwidth * (2.0 * PI).sqrt());
        norm_factor * (-0.5 * z.powi(2)).exp()
    }

    /// Estimated density at a single point `x`: the mean of all kernels
    /// centered at the data points.
    ///
    /// # Examples
    ///
    /// ```
    /// // let density = kde.evaluate(1.5);
    /// ```
    pub fn evaluate(&self, x: f64) -> f64 {
        let total = self
            .data
            .iter()
            .fold(0.0_f64, |acc, &xi| acc + self.gaussian_kernel(x, xi));
        total / self.data.len() as f64
    }

    /// Estimated density at every point of `x_values`.
    ///
    /// # Examples
    ///
    /// ```
    /// // let xs = Array1::linspace(0.0, 5.0, 50);
    /// // let ys = kde.evaluate_array(&xs);
    /// ```
    pub fn evaluate_array(&self, x_values: &Array1<f64>) -> Array1<f64> {
        x_values.iter().map(|&x| self.evaluate(x)).collect()
    }
}

/// Computes the bandwidth using Silverman's rule of thumb for 1D data.
///
/// Silverman’s rule of thumb (for 1D) is:
///
/// `h = 0.9 * min(σ, IQR/1.34) * n^(-1/5)`,
///
/// - `σ` = standard deviation of the data,
/// - `IQR` = interquartile range = 75th percentile - 25th percentile,
/// - `n` = number of data points.
///
/// # Arguments
///
/// * `data` - 1D array of data points.
///
/// # Returns
///
/// The estimated bandwidth.
pub fn silverman_bandwidth(data: &Array1<f64>) -> f64 {
let n = data.len() as f64;
if n < 2.0 {
// If there's only one data point or empty, fallback to something minimal.
return 1e-6;
}

// Compute the standard deviation.
let mean = data.mean().unwrap_or(0.0);
let std = (data.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / (n - 1.0)).sqrt();

// Compute the interquartile range.
let mut sorted = data.to_vec();
sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
let q1 = percentile(&sorted, 25.0);
let q3 = percentile(&sorted, 75.0);
let iqr = q3 - q1;

let scale = std.min(iqr / 1.34);
let h = 0.9 * scale * (n.powf(-1.0 / 5.0));
// Avoid extremely small bandwidth.
if h < 1e-8 {
1e-8
} else {
h
}
}

/// Returns the p-th percentile of a sorted slice, with linear interpolation
/// between neighbouring ranks.
///
/// Takes `&[f64]` rather than `&Vec<f64>` (clippy `ptr_arg`); existing callers
/// passing `&Vec<f64>` still compile via deref coercion.
///
/// # Arguments
///
/// * `sorted_data` - A slice of floating-point values sorted ascending.
/// * `p` - The percentile (0..100); values outside the range are clamped.
///
/// # Returns
///
/// The value at the p-th percentile, or `0.0` for an empty slice.
pub fn percentile(sorted_data: &[f64], p: f64) -> f64 {
    if sorted_data.is_empty() {
        return 0.0;
    }
    if p <= 0.0 {
        return sorted_data[0];
    }
    if p >= 100.0 {
        return sorted_data[sorted_data.len() - 1];
    }

    // Fractional rank in [0, len - 1].
    let rank = (p / 100.0) * (sorted_data.len() as f64 - 1.0);
    let lower_index = rank.floor() as usize;
    let upper_index = rank.ceil() as usize;

    if lower_index == upper_index {
        sorted_data[lower_index]
    } else {
        // Linear interpolation between lower and upper index.
        let weight = rank - lower_index as f64;
        let lower_val = sorted_data[lower_index];
        let upper_val = sorted_data[upper_index];
        lower_val + weight * (upper_val - lower_val)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::Array1;

    /// End-to-end check of the auto-bandwidth KDE. The original version only
    /// printed values; this asserts the properties a density estimate must have.
    #[test]
    fn test_kde() {
        let data = Array1::from(vec![1.0, 1.5, 2.0, 2.5, 3.0]);

        let kde_auto = GaussianKDE::with_silverman_bandwidth(data.clone());
        assert!(kde_auto.bandwidth > 0.0, "auto bandwidth must be positive");

        // Density at the sample centre: positive and finite.
        let density_single = kde_auto.evaluate(2.0);
        assert!(density_single.is_finite() && density_single > 0.0);

        // Every grid estimate is finite and non-negative, one per input point.
        let x_values = Array::linspace(0.0, 4.0, 11);
        let density_values = kde_auto.evaluate_array(&x_values);
        assert_eq!(density_values.len(), x_values.len());
        assert!(density_values.iter().all(|d| d.is_finite() && *d >= 0.0));

        // This sample is symmetric about 2.0, so the density there should
        // exceed the density at the edge of the grid.
        assert!(density_single > kde_auto.evaluate(0.0));
    }

    /// Fixed-bandwidth evaluation at a single point stays finite/non-negative.
    #[test]
    fn test_kde_evaluate_single() {
        let data = Array1::from(vec![1.0, 2.0, 3.0]);
        let kde = GaussianKDE::new(data.clone(), 0.5);
        let density = kde.evaluate(2.0);

        assert!(density.is_finite());
        assert!(density >= 0.0);
    }

    /// Silverman bandwidth is positive and of sensible magnitude.
    #[test]
    fn test_silverman_bandwidth() {
        let data = Array1::from(vec![1.0, 2.0, 3.0, 4.0, 5.0]);
        let h = silverman_bandwidth(&data);

        assert!(h > 0.0, "Bandwidth should be positive");
        assert!(h < 10.0, "Bandwidth seems unexpectedly large");
    }
}

0 comments on commit 96a8cdd

Please sign in to comment.