Skip to content

Commit

Permalink
feat: add gaussian kde
Browse files Browse the repository at this point in the history
  • Loading branch information
dancixx committed Jan 13, 2025
1 parent e0a9d6e commit 96a8cdd
Show file tree
Hide file tree
Showing 18 changed files with 355 additions and 11 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ ndarray = { version = "0.16.1", features = [
"matrixmultiply-threading",
"blas",
] }
ndarray-linalg = { version = "0.16.0", features = ["openblas-static"] }
ndarray-npy = "0.9.1"
ndarray-rand = "0.15.0"
ndarray-stats = "0.6.0"
Expand Down
2 changes: 1 addition & 1 deletion src/stats.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
pub mod cir;
pub mod copula;
pub mod copulas;
pub mod double_exp;
pub mod fd;
pub mod fou_estimator;
pub mod gaussian_kde;
pub mod mle;
pub mod non_central_chi_squared;
2 changes: 1 addition & 1 deletion src/stats/copulas.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ pub mod bivariate;
pub mod correlation;
pub mod empirical;
pub mod multivariate;
pub mod optimize;
pub mod samples;
pub mod univariate;
2 changes: 1 addition & 1 deletion src/stats/copulas/empirical.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ mod tests {
use rand::thread_rng;
use rand_distr::{Distribution, Uniform};

use crate::{stats::copula::plot_copula_samples, stochastic::N};
use crate::{stats::copulas::samples::plot_copula_samples, stochastic::N};

use super::EmpiricalCopula2D;

Expand Down
41 changes: 41 additions & 0 deletions src/stats/copulas/multivariate/gaussian.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
use std::error::Error;

use ndarray::{Array1, Array2};
use statrs::distribution::{Continuous, ContinuousCDF, Normal};

use crate::stats::copulas::univariate::gaussian::GaussianUnivariate;

use super::{CopulaType, Multivariate};

/// Gaussian (normal) copula over an arbitrary number of dimensions.
///
/// NOTE(review): every trait method below is still `todo!()` — this type is a
/// stub committed ahead of the actual implementation, so none of the imports
/// (`Normal`, `GaussianUnivariate`, …) are exercised yet.
#[derive(Debug)]
pub struct GaussianMultivariate;

impl Multivariate for GaussianMultivariate {
    /// Identifies this copula family as Gaussian.
    fn r#type(&self) -> CopulaType {
        CopulaType::Gaussian
    }

    /// Draw `n` samples from the fitted copula. Unimplemented.
    fn sample(&self, n: usize) -> Result<Array2<f64>, Box<dyn Error>> {
        todo!()
    }

    /// Fit the copula to data matrix `X` (rows = observations). Unimplemented.
    fn fit(&mut self, X: Array2<f64>) -> Result<(), Box<dyn Error>> {
        todo!()
    }

    /// Validate `X` against the fitted model. Unimplemented.
    fn check_fit(&self, X: &Array2<f64>) -> Result<(), Box<dyn Error>> {
        todo!()
    }

    /// Copula density for each row of `X`. Unimplemented.
    fn pdf(&self, X: Array2<f64>) -> Result<Array1<f64>, Box<dyn Error>> {
        todo!()
    }

    /// Log-density for each row of `X`. Unimplemented.
    fn log_pdf(&self, X: Array2<f64>) -> Result<Array1<f64>, Box<dyn Error>> {
        todo!()
    }

    /// Copula CDF for each row of `X`. Unimplemented.
    fn cdf(&self, X: Array2<f64>) -> Result<Array1<f64>, Box<dyn Error>> {
        todo!()
    }
}
Empty file removed src/stats/copulas/optimize.rs
Empty file.
File renamed without changes.
8 changes: 0 additions & 8 deletions src/stats/copulas/univariate.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1 @@
pub mod beta;
pub mod gamma;
pub mod gaussian;
pub mod gaussian_kde;
pub mod log_laplace;
pub mod selection;
pub mod student_t;
pub mod truncated_gaussian;
pub mod uniform;
Empty file.
Empty file.
60 changes: 60 additions & 0 deletions src/stats/copulas/univariate/gaussian.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
use ndarray::Array1;
use statrs::distribution::{Continuous, ContinuousCDF, Normal};

/// Univariate Gaussian marginal backed by a `statrs` `Normal` distribution.
///
/// Construct with [`GaussianUnivariate::new`], then call
/// [`GaussianUnivariate::fit`] before using `cdf`/`pdf`/`ppf`; the accessors
/// return neutral fallback values while unfitted.
#[derive(Debug, Clone)]
pub struct GaussianUnivariate {
    /// Fitted distribution; `None` until `fit` has been called.
    dist: Option<Normal>,
}

impl Default for GaussianUnivariate {
    /// Equivalent to [`GaussianUnivariate::new`] (clippy: `new_without_default`).
    fn default() -> Self {
        Self::new()
    }
}

impl GaussianUnivariate {
    /// Creates an unfitted instance (`dist` is `None`).
    pub fn new() -> Self {
        Self { dist: None }
    }

    /// Fits mean and sample standard deviation to `column`, then stores the
    /// resulting normal distribution.
    ///
    /// With fewer than two observations a variance cannot be estimated, so
    /// this falls back to the standard normal `N(0, 1)`.
    pub fn fit(&mut self, column: &Array1<f64>) {
        let n = column.len() as f64;
        if n < 2.0 {
            // Trivial fallback: standard normal.
            self.dist = Some(Normal::new(0.0, 1.0).unwrap());
            return;
        }
        let mean = column.mean().unwrap_or(0.0);
        // Unbiased (n - 1) sample variance.
        let var = column.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / (n - 1.0);
        // Clamp the std away from zero so `Normal::new` cannot fail on a
        // constant column.
        let std = var.sqrt().max(1e-12);
        self.dist = Some(Normal::new(mean, std).unwrap());
    }

    /// Returns `true` once `fit` has stored a distribution.
    pub fn is_fitted(&self) -> bool {
        self.dist.is_some()
    }

    /// CDF of the fitted normal; returns 0.5 (the CDF at the mean) when unfitted.
    pub fn cdf(&self, x: f64) -> f64 {
        self.dist.as_ref().map_or(0.5, |d| d.cdf(x))
    }

    /// PDF of the fitted normal; returns 1.0 when unfitted.
    pub fn pdf(&self, x: f64) -> f64 {
        self.dist.as_ref().map_or(1.0, |d| d.pdf(x))
    }

    /// Inverse CDF (quantile / PPF); returns 0.0 when unfitted.
    pub fn ppf(&self, p: f64) -> f64 {
        self.dist.as_ref().map_or(0.0, |d| d.inverse_cdf(p))
    }
}
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
250 changes: 250 additions & 0 deletions src/stats/gaussian_kde.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
use ndarray::{Array, Array1};
use std::f64::consts::PI;

/// A struct representing a Gaussian Kernel Density Estimator (KDE).
///
/// # Fields
/// - `data`: 1D array of data points.
/// - `bandwidth`: The bandwidth (smoothing parameter) for the Gaussian kernel.
#[derive(Debug)]
pub struct GaussianKDE {
    // Sample points the density estimate is built from.
    data: Array1<f64>,
    // Gaussian kernel bandwidth; larger values give a smoother estimate.
    bandwidth: f64,
}

impl GaussianKDE {
    /// Builds a KDE over `data` with an explicitly chosen `bandwidth`.
    ///
    /// # Arguments
    ///
    /// * `data` - 1D array of data points.
    /// * `bandwidth` - Smoothing parameter for the Gaussian kernel.
    ///
    /// # Examples
    ///
    /// ```
    /// use ndarray::Array1;
    /// // let kde = GaussianKDE::new(Array1::from(vec![1.0, 2.0, 3.0]), 0.5);
    /// ```
    pub fn new(data: Array1<f64>, bandwidth: f64) -> Self {
        Self { data, bandwidth }
    }

    /// Builds a KDE whose bandwidth is selected automatically via Silverman's
    /// rule of thumb: `h = 0.9 * min(σ, IQR/1.34) * n^(-1/5)` (see
    /// [`silverman_bandwidth`]).
    ///
    /// # Arguments
    ///
    /// * `data` - 1D array of data points.
    pub fn with_silverman_bandwidth(data: Array1<f64>) -> Self {
        let bandwidth = silverman_bandwidth(&data);
        Self { data, bandwidth }
    }

    /// Normal PDF with mean `xi` and standard deviation `self.bandwidth`,
    /// evaluated at `x`.
    fn gaussian_kernel(&self, x: f64, xi: f64) -> f64 {
        let z = (x - xi) / self.bandwidth;
        let norm_factor = 1.0 / (self.bandwidth * (2.0 * PI).sqrt());
        norm_factor * (-0.5 * z.powi(2)).exp()
    }

    /// Estimated density at a single point `x`: the mean of all kernels
    /// centered at the data points.
    ///
    /// # Examples
    ///
    /// ```
    /// // let density = kde.evaluate(1.5);
    /// ```
    pub fn evaluate(&self, x: f64) -> f64 {
        let total = self
            .data
            .iter()
            .fold(0.0_f64, |acc, &xi| acc + self.gaussian_kernel(x, xi));
        total / self.data.len() as f64
    }

    /// Estimated density at every point of `x_values`.
    ///
    /// # Examples
    ///
    /// ```
    /// // let xs = Array1::linspace(0.0, 5.0, 50);
    /// // let ys = kde.evaluate_array(&xs);
    /// ```
    pub fn evaluate_array(&self, x_values: &Array1<f64>) -> Array1<f64> {
        x_values.iter().map(|&x| self.evaluate(x)).collect()
    }
}

/// Computes the bandwidth using Silverman's rule of thumb for 1D data.
///
/// Silverman’s rule of thumb (for 1D) is:
///
/// `h = 0.9 * min(σ, IQR/1.34) * n^(-1/5)`,
///
/// - `σ` = standard deviation of the data,
/// - `IQR` = interquartile range = 75th percentile - 25th percentile,
/// - `n` = number of data points.
///
/// # Arguments
///
/// * `data` - 1D array of data points.
///
/// # Returns
///
/// The estimated bandwidth.
pub fn silverman_bandwidth(data: &Array1<f64>) -> f64 {
let n = data.len() as f64;
if n < 2.0 {
// If there's only one data point or empty, fallback to something minimal.
return 1e-6;
}

// Compute the standard deviation.
let mean = data.mean().unwrap_or(0.0);
let std = (data.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / (n - 1.0)).sqrt();

// Compute the interquartile range.
let mut sorted = data.to_vec();
sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
let q1 = percentile(&sorted, 25.0);
let q3 = percentile(&sorted, 75.0);
let iqr = q3 - q1;

let scale = std.min(iqr / 1.34);
let h = 0.9 * scale * (n.powf(-1.0 / 5.0));
// Avoid extremely small bandwidth.
if h < 1e-8 {
1e-8
} else {
h
}
}

/// Returns the p-th percentile of a sorted slice, with linear interpolation
/// between neighbouring ranks.
///
/// Takes `&[f64]` rather than `&Vec<f64>` (clippy `ptr_arg`); existing callers
/// passing `&Vec<f64>` still compile via deref coercion.
///
/// # Arguments
///
/// * `sorted_data` - A slice of floating-point values sorted ascending.
/// * `p` - The percentile (0..100); values outside the range are clamped.
///
/// # Returns
///
/// The value at the p-th percentile, or `0.0` for an empty slice.
pub fn percentile(sorted_data: &[f64], p: f64) -> f64 {
    if sorted_data.is_empty() {
        return 0.0;
    }
    if p <= 0.0 {
        return sorted_data[0];
    }
    if p >= 100.0 {
        return sorted_data[sorted_data.len() - 1];
    }

    // Fractional rank in [0, len - 1].
    let rank = (p / 100.0) * (sorted_data.len() as f64 - 1.0);
    let lower_index = rank.floor() as usize;
    let upper_index = rank.ceil() as usize;

    if lower_index == upper_index {
        sorted_data[lower_index]
    } else {
        // Linear interpolation between lower and upper index.
        let weight = rank - lower_index as f64;
        let lower_val = sorted_data[lower_index];
        let upper_val = sorted_data[upper_index];
        lower_val + weight * (upper_val - lower_val)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::Array1;

    /// End-to-end check of the auto-bandwidth KDE. The original version only
    /// printed values; this asserts the properties a density estimate must have.
    #[test]
    fn test_kde() {
        let data = Array1::from(vec![1.0, 1.5, 2.0, 2.5, 3.0]);

        let kde_auto = GaussianKDE::with_silverman_bandwidth(data.clone());
        assert!(kde_auto.bandwidth > 0.0, "auto bandwidth must be positive");

        // Density at the sample centre: positive and finite.
        let density_single = kde_auto.evaluate(2.0);
        assert!(density_single.is_finite() && density_single > 0.0);

        // Every grid estimate is finite and non-negative, one per input point.
        let x_values = Array::linspace(0.0, 4.0, 11);
        let density_values = kde_auto.evaluate_array(&x_values);
        assert_eq!(density_values.len(), x_values.len());
        assert!(density_values.iter().all(|d| d.is_finite() && *d >= 0.0));

        // This sample is symmetric about 2.0, so the density there should
        // exceed the density at the edge of the grid.
        assert!(density_single > kde_auto.evaluate(0.0));
    }

    /// Fixed-bandwidth evaluation at a single point stays finite/non-negative.
    #[test]
    fn test_kde_evaluate_single() {
        let data = Array1::from(vec![1.0, 2.0, 3.0]);
        let kde = GaussianKDE::new(data.clone(), 0.5);
        let density = kde.evaluate(2.0);

        assert!(density.is_finite());
        assert!(density >= 0.0);
    }

    /// Silverman bandwidth is positive and of sensible magnitude.
    #[test]
    fn test_silverman_bandwidth() {
        let data = Array1::from(vec![1.0, 2.0, 3.0, 4.0, 5.0]);
        let h = silverman_bandwidth(&data);

        assert!(h > 0.0, "Bandwidth should be positive");
        assert!(h < 10.0, "Bandwidth seems unexpectedly large");
    }
}

0 comments on commit 96a8cdd

Please sign in to comment.