Fix variance gradient computation (#177)

relf · web-flow · commit a23503c4f245 · 2024-07-31T00:04:16.000+02:00
* Initial fix

* Tight test for variance derivatives and cleanup

* Fix doc indentation

* Remove dead code

* Fix doc indentation

* Allow dead code

* Fix python var gradient test
diff --git a/doe/src/lhs.rs b/doe/src/lhs.rs
@@ -89,7 +89,7 @@ impl<F: Float, R: Rng + Clone> SamplingMethod<F> for Lhs<F, R> {
 impl<F: Float, R: Rng + Clone> Lhs<F, R> {
     /// Constructor with given design space and random generator.
     /// * `xlimits`: (nx, 2) matrix where nx is the dimension of the samples and the ith row
-    /// is the definition interval of the ith component of x.
+    ///   is the definition interval of the ith component of x.
     /// * `rng`: random generator used for [LhsKind::Classic] and [LhsKind::Optimized] LHS
     pub fn new_with_rng(xlimits: &ArrayBase<impl Data<Elem = F>, Ix2>, rng: R) -> Self {
         if xlimits.ncols() != 2 {
diff --git a/doe/src/traits.rs b/doe/src/traits.rs
@@ -23,7 +23,7 @@ pub trait SamplingMethod<F: Float> {
     /// # Returns
     ///
     /// * A (ns, nx) matrix of samples where nx is the dimension of the sample space
-    /// each sample belongs to `[0., 1.]^nx` hypercube
+    ///   each sample belongs to `[0., 1.]^nx` hypercube
     fn normalized_sample(&self, ns: usize) -> Array2<F>;
 
     /// Generates a (ns, nx)-shaped array of samples belonging to `[lower_bound_xi, upper_bound_xi]^nx`
@@ -35,8 +35,8 @@ pub trait SamplingMethod<F: Float> {
     /// # Returns
     ///
     /// * A (ns, nx) matrix where nx is the dimension of the sample space.
-    /// each sample belongs to `[lower_bound_xi, upper_bound_xi]^nx` where bounds
-    /// are defined as returned values of `sampling_space` function.
+    ///   each sample belongs to `[lower_bound_xi, upper_bound_xi]^nx` where bounds
+    ///   are defined as returned values of `sampling_space` function.
     fn sample(&self, ns: usize) -> Array2<F> {
         let xlimits = self.sampling_space();
         let lower = xlimits.column(0);
diff --git a/ego/src/lib.rs b/ego/src/lib.rs
@@ -96,8 +96,8 @@
 //! Some of the most useful options are:
 //!
 //! * Specification of the size of the initial DoE. The default is nx+1 where nx is the dimension of x.
-//! If your objective function is not expensive you can take `3*nx` to help the optimizer
-//! approximating your objective function.
+//!   If your objective function is not expensive you can take `3*nx` to help the optimizer
+//!   approximating your objective function.
 //!
 //! ```no_run
 //! # use egobox_ego::{EgorConfig};
@@ -108,26 +108,26 @@
 //! You can also provide your initial doe though the `egor.doe(your_doe)` method.
 //!
 //! * As the dimension increase the gaussian process surrogate building may take longer or even fail
-//! in this case you can specify a PLS dimension reduction \[[Bartoli2019](#Bartoli2019)\].
-//! Gaussian process will be built using the `ndim` (usually 3 or 4) main components in the PLS projected space.
+//!   in this case you can specify a PLS dimension reduction \[[Bartoli2019](#Bartoli2019)\].
+//!   Gaussian process will be built using the `ndim` (usually 3 or 4) main components in the PLS projected space.
 //!
 //! ```no_run
 //! # let egor_config = egobox_ego::EgorConfig::default();
 //!     egor_config.kpls_dim(3);
 //! ```
 //!
 //! * Specifications of constraints (expected to be negative at the end of the optimization)
-//! In this example below we specify that 2 constraints will be computed with the objective values meaning
-//! the objective function is expected to return an array '\[nsamples, 1 obj value + 2 const values\]'.
+//!   In this example below we specify that 2 constraints will be computed with the objective values meaning
+//!   the objective function is expected to return an array '\[nsamples, 1 obj value + 2 const values\]'.
 //!
 //! ```no_run
 //! # let egor_config = egobox_ego::EgorConfig::default();
 //!     egor_config.n_cstr(2);
 //! ```
 //!
 //! * If the default infill strategy (WB2, Watson and Barnes 2nd criterion),
-//! you can switch for either EI (Expected Improvement) or WB2S (scaled version of WB2).
-//! See \[[Priem2019](#Priem2019)\]
+//!   you can switch for either EI (Expected Improvement) or WB2S (scaled version of WB2).
+//!   See \[[Priem2019](#Priem2019)\]
 //!
 //! ```no_run
 //! # use egobox_ego::{EgorConfig, InfillStrategy};
@@ -136,9 +136,9 @@
 //! ```
 //!
 //! * The default gaussian process surrogate is parameterized with a constant trend and a squared exponential correlation kernel, also
-//! known as Kriging. The optimizer use such surrogates to approximate objective and constraint functions. The kind of surrogate
-//! can be changed using `regression_spec` and `correlation_spec()` methods to specify trend and kernels tested to get the best
-//! approximation (quality tested through cross validation).
+//!   known as Kriging. The optimizer use such surrogates to approximate objective and constraint functions. The kind of surrogate
+//!   can be changed using `regression_spec` and `correlation_spec()` methods to specify trend and kernels tested to get the best
+//!   approximation (quality tested through cross validation).
 //!
 //! ```no_run
 //! # use egobox_ego::{EgorConfig, RegressionSpec, CorrelationSpec};
diff --git a/ego/src/utils/sort_axis.rs b/ego/src/utils/sort_axis.rs
@@ -166,25 +166,6 @@ where
     }
 }
 
-#[cfg(feature = "std")]
-fn main() {
-    let a = Array::linspace(0., 63., 64).into_shape((8, 8)).unwrap();
-    let strings = a.map(|x| x.to_string());
-
-    let perm = a.sort_axis_by(Axis(1), |i, j| a[[i, 0]] > a[[j, 0]]);
-    println!("{:?}", perm);
-    let b = a.permute_axis(Axis(0), &perm);
-    println!("{:?}", b);
-
-    println!("{:?}", strings);
-    let c = strings.permute_axis(Axis(1), &perm);
-    println!("{:?}", c);
-}
-
-#[cfg(not(feature = "std"))]
-#[allow(dead_code)]
-fn main() {}
-
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/gp/src/algorithm.rs b/gp/src/algorithm.rs
@@ -85,21 +85,21 @@ impl<F: Float> Clone for GpInnerParams<F> {
 /// * `regr(x)` a vector of polynomial basis functions
 /// * `sigma^2` is the process variance
 /// * `corr(x, x')` is a correlation function which depends on `distance(x, x')`
-/// and a set of unknown parameters `thetas` to be determined.
+///   and a set of unknown parameters `thetas` to be determined.
 ///
 /// # Implementation
 ///
 /// * Based on [ndarray](https://github.com/rust-ndarray/ndarray)
-/// and [linfa](https://github.com/rust-ml/linfa) and strive to follow [linfa guidelines](https://github.com/rust-ml/linfa/blob/master/CONTRIBUTE.md)
+///   and [linfa](https://github.com/rust-ml/linfa) and strive to follow [linfa guidelines](https://github.com/rust-ml/linfa/blob/master/CONTRIBUTE.md)
 /// * GP mean model can be constant, linear or quadratic
 /// * GP correlation model can be build the following kernels: squared exponential, absolute exponential, matern 3/2, matern 5/2    
-/// cf. [SMT Kriging](https://smt.readthedocs.io/en/latest/_src_docs/surrogate_models/krg.html)
+///   cf. [SMT Kriging](https://smt.readthedocs.io/en/latest/_src_docs/surrogate_models/krg.html)
 /// * For high dimensional problems, the classic GP algorithm does not perform well as
-/// it depends on the inversion of a correlation (n, n) matrix which is an O(n3) operation.
-/// To work around this problem the library implements dimension reduction using
-/// Partial Least Squares method upon Kriging method also known as KPLS algorithm (see Reference)
+///   it depends on the inversion of a correlation (n, n) matrix which is an O(n3) operation.
+///   To work around this problem the library implements dimension reduction using
+///   Partial Least Squares method upon Kriging method also known as KPLS algorithm (see Reference)
 /// * GP models can be saved and loaded using [serde](https://serde.rs/).
-/// See `serializable` feature section below.
+///   See `serializable` feature section below.
 ///
 /// # Features
 ///
@@ -517,19 +517,18 @@ impl<F: Float, Mean: RegressionModel<F>, Corr: CorrelationModel<F>> GaussianProc
         let x = &(x.to_owned().insert_axis(Axis(0)));
         let xnorm = (x - &self.xt_norm.mean) / &self.xt_norm.std;
         let dx = pairwise_differences(&xnorm, &self.xt_norm.data);
-
         let sigma2 = self.inner_params.sigma2;
         let r_chol = &self.inner_params.r_chol;
 
         let r = self.params.corr.value(&dx, &self.theta, &self.w_star);
         let dr =
             self.params
                 .corr
-                .jacobian(&xnorm.row(0), &self.xt_norm.data, &self.theta, &self.w_star)
-                / &self.xt_norm.std.to_owned().insert_axis(Axis(0));
+                .jacobian(&xnorm.row(0), &self.xt_norm.data, &self.theta, &self.w_star);
 
         // rho1 = Rc^-1 . r(x, X)
         let rho1 = r_chol.solve_triangular(&r, UPLO::Lower).unwrap();
+
         // inv_kr = Rc^t^-1 . Rc^-1 . r(x, X) = R^-1 . r(x, X)
         let inv_kr = r_chol.t().solve_triangular(&rho1, UPLO::Upper).unwrap();
 
@@ -569,12 +568,11 @@ impl<F: Float, Mean: RegressionModel<F>, Corr: CorrelationModel<F>> GaussianProc
 
         // p4 = (B^-1 . A)^t . dA/dx^t = A^t . B^-1 . dA/dx^t = p3
         let p4 = d_mat.t().dot(&d_a.t());
-
         let two = F::cast(2.);
-        let prime_t = (-p2 + p4).mapv(|v| two * v).t().to_owned();
+        let prime = (p4 - p2).mapv(|v| two * v);
 
         let x_std = &self.xt_norm.std;
-        let dvar = (prime_t / x_std).mapv(|v| v * sigma2);
+        let dvar = (prime / x_std).mapv(|v| v * sigma2);
         dvar.row(0).into_owned()
     }
 
@@ -693,6 +691,7 @@ where
 }
 
 /// Gausssian Process adaptator to implement `linfa::Predict` trait for variance prediction.
+#[allow(dead_code)]
 pub struct GpVariancePredictor<'a, F, Mean, Corr>(&'a GaussianProcess<F, Mean, Corr>)
 where
     F: Float,
@@ -1091,7 +1090,7 @@ mod tests {
     use super::*;
     use approx::{assert_abs_diff_eq, assert_abs_diff_ne};
     use argmin_testfunctions::rosenbrock;
-    use egobox_doe::{Lhs, SamplingMethod};
+    use egobox_doe::{Lhs, LhsKind, SamplingMethod};
     use linfa::prelude::Predict;
     #[cfg(not(feature = "blas"))]
     use linfa_linalg::norm::Norm;
@@ -1436,7 +1435,7 @@ mod tests {
             paste! {
 
                 #[test]
-                fn [<test_gp_variance_derivatives_ $regr:snake _ $corr:snake>]() {
+                fn [<test_gp_variance_derivatives_ $regr:snake _ $corr:snake _ $func:snake>]() {
                     let mut rng = Xoshiro256Plus::seed_from_u64(42);
                     let xt = egobox_doe::Lhs::new(&array![[-$limit, $limit], [-$limit, $limit]]).with_rng(rng.clone()).sample($nt);
                     let yt = [<$func>](&xt);
@@ -1602,15 +1601,88 @@ mod tests {
 
     fn assert_rel_or_abs_error(y_deriv: f64, fdiff: f64) {
         println!("analytic deriv = {y_deriv}, fdiff = {fdiff}");
-        if fdiff.abs() < 2e-1 || y_deriv.abs() < 2e-1 {
-            let atol = 2e-1;
+        if fdiff.abs() < 6e-1 {
+            let atol = 6e-1;
             println!("Check absolute error: abs({y_deriv}) should be < {atol}");
             assert_abs_diff_eq!(y_deriv, 0.0, epsilon = atol); // check absolute when close to zero
         } else {
-            let rtol = 3e-1;
+            let rtol = 6e-1;
             let rel_error = (y_deriv - fdiff).abs() / fdiff.abs(); // check relative
             println!("Check relative error: {rel_error} should be < {rtol}");
             assert_abs_diff_eq!(rel_error, 0.0, epsilon = rtol);
         }
     }
+
+    fn sin_linear(x: &Array2<f64>) -> Array2<f64> {
+        // sin + linear trend
+        let x1 = x.column(0).to_owned().mapv(|v| v.sin());
+        let x2 = x.column(0).mapv(|v| 2. * v) + x.column(1).mapv(|v| 5. * v);
+        (x1 + x2)
+            .mapv(|v| v + 10.)
+            .into_shape((x.nrows(), 1))
+            .unwrap()
+    }
+
+    #[test]
+    fn test_bug_var_derivatives() {
+        let _xt = egobox_doe::Lhs::new(&array![[-5., 10.], [-5., 10.]])
+            .kind(LhsKind::Centered)
+            .sample(12);
+        let _yt = sin_linear(&_xt);
+
+        let xt = array![
+            [6.875, -4.375],
+            [-3.125, 1.875],
+            [1.875, -1.875],
+            [-4.375, 3.125],
+            [8.125, 9.375],
+            [4.375, 4.375],
+            [0.625, 0.625],
+            [9.375, 6.875],
+            [5.625, 8.125],
+            [-0.625, -3.125],
+            [3.125, 5.625],
+            [-1.875, -0.625]
+        ];
+        let yt = array![
+            [2.43286801],
+            [13.10840811],
+            [5.32908578],
+            [17.81862219],
+            [74.08849877],
+            [39.68137781],
+            [14.96009727],
+            [63.17475741],
+            [61.26331775],
+            [-7.46009727],
+            [44.39159189],
+            [2.17091422],
+        ];
+
+        let gp = GaussianProcess::<f64, ConstantMean, SquaredExponentialCorr>::params(
+            ConstantMean::default(),
+            SquaredExponentialCorr::default(),
+        )
+        .theta_tuning(ThetaTuning::Fixed(vec![0.0437386, 0.00697978]))
+        .fit(&Dataset::new(xt, yt))
+        .expect("GP fitting");
+
+        let e = 5e-6;
+        let xa = -1.3;
+        let xb = 2.5;
+        let x = array![
+            [xa, xb],
+            [xa + e, xb],
+            [xa - e, xb],
+            [xa, xb + e],
+            [xa, xb - e]
+        ];
+        let y_pred = gp.predict_var(&x).unwrap();
+        let y_deriv = gp.predict_var_gradients(&array![[xa, xb]]);
+        let diff_g = (y_pred[[1, 0]] - y_pred[[2, 0]]) / (2. * e);
+        let diff_d = (y_pred[[3, 0]] - y_pred[[4, 0]]) / (2. * e);
+
+        assert_abs_diff_eq!(y_deriv[[0, 0]], diff_g, epsilon = 1e-5);
+        assert_abs_diff_eq!(y_deriv[[0, 1]], diff_d, epsilon = 1e-5);
+    }
 }
diff --git a/gp/src/sparse_algorithm.rs b/gp/src/sparse_algorithm.rs
@@ -377,6 +377,7 @@ where
 }
 
 /// Sparse Gausssian Process adaptator to implement `linfa::Predict` trait for variance prediction.
+#[allow(dead_code)]
 pub struct SparseGpVariancePredictor<'a, F, Corr>(&'a SparseGaussianProcess<F, Corr>)
 where
     F: Float,
diff --git a/moe/src/algorithm.rs b/moe/src/algorithm.rs
@@ -917,6 +917,7 @@ impl<D: Data<Elem = f64>> PredictInplace<ArrayBase<D, Ix2>, Array2<f64>> for GpM
 }
 
 /// Adaptator to implement `linfa::Predict` for variance prediction
+#[allow(dead_code)]
 pub struct MoeVariancePredictor<'a>(&'a GpMixture);
 impl<'a, D: Data<Elem = f64>> PredictInplace<ArrayBase<D, Ix2>, Array2<f64>>
     for MoeVariancePredictor<'a>
diff --git a/moe/src/lib.rs b/moe/src/lib.rs
@@ -7,23 +7,23 @@
 //!
 //! The recombination between the GP models can be either:
 //! * `hard`: one GP model is being responsible to provide the predicted value
-//! at the given point. GP selection is done by taking the largest probability of the
-//! given point being part of the cluster corresponding to the expert GP.
-//! In hard mode, transition between models leads to discontinuity.
+//!   at the given point. GP selection is done by taking the largest probability of the
+//!   given point being part of the cluster corresponding to the expert GP.
+//!   In hard mode, transition between models leads to discontinuity.
 //! * `smooth`: all GPs models are taken and their predicted values at a given point are
-//! weighted regarding their responsability (probability of the given point being part
-//! of the cluster corresponding to the expert GP). In this case the MoE model is continuous.
-//! The smoothness is automatically adjusted using a factor, the heaviside factor,
-//! which can also be set manually.
+//!   weighted regarding their responsability (probability of the given point being part
+//!   of the cluster corresponding to the expert GP). In this case the MoE model is continuous.
+//!   The smoothness is automatically adjusted using a factor, the heaviside factor,
+//!    which can also be set manually.
 //!
 //! # Implementation
 //!
 //! * Clusters are defined by clustering the training data with
-//! [linfa-clustering](https://docs.rs/linfa-clustering/latest/linfa_clustering/)
-//! gaussian mixture model.
+//!   [linfa-clustering](https://docs.rs/linfa-clustering/latest/linfa_clustering/)
+//!   gaussian mixture model.
 //! * This library is a port of the
-//! [SMT MoE method](https://smt.readthedocs.io/en/latest/_src_docs/applications/moe.html)
-//! using egobox GP models as experts.
+//!   [SMT MoE method](https://smt.readthedocs.io/en/latest/_src_docs/applications/moe.html)
+//!   using egobox GP models as experts.
 //! * It leverages on the egobox GP PLS reduction feature to handle high dimensional problems.
 //! * MoE trained model can be save to disk and reloaded. See
 //!  
diff --git a/python/egobox/tests/test_gpmix.py b/python/egobox/tests/test_gpmix.py
@@ -50,7 +50,7 @@ def test_gpx_kriging(self):
             1.1204, gpx.predict_gradients(np.array([[1.1]])).item(), delta=1e-3
         )
         self.assertAlmostEqual(
-            0.0092, gpx.predict_var_gradients(np.array([[1.1]])).item(), delta=1e-3
+            0.0145, gpx.predict_var_gradients(np.array([[1.1]])).item(), delta=1e-3
         )
 
     def test_gpx_save_load(self):

Original file line number	Diff line number	Diff line change
`@@ -377,6 +377,7 @@ where`
`377`	`377`	`}`
`378`	`378`
`379`	`379`	/// Sparse Gausssian Process adaptator to implement `linfa::Predict` trait for variance prediction.
	`380`	`+#[allow(dead_code)]`
`380`	`381`	`pub struct SparseGpVariancePredictor<'a, F, Corr>(&'a SparseGaussianProcess<F, Corr>)`
`381`	`382`	`where`
`382`	`383`	`F: Float,`
Original file line number	Diff line number	Diff line change
`@@ -917,6 +917,7 @@ impl<D: Data<Elem = f64>> PredictInplace<ArrayBase<D, Ix2>, Array2<f64>> for GpM`
`917`	`917`	`}`
`918`	`918`
`919`	`919`	/// Adaptator to implement `linfa::Predict` for variance prediction
	`920`	`+#[allow(dead_code)]`
`920`	`921`	`pub struct MoeVariancePredictor<'a>(&'a GpMixture);`
`921`	`922`	`impl<'a, D: Data<Elem = f64>> PredictInplace<ArrayBase<D, Ix2>, Array2<f64>>`
`922`	`923`	`for MoeVariancePredictor<'a>`
Original file line number	Diff line number	Diff line change
`@@ -50,7 +50,7 @@ def test_gpx_kriging(self):`
`50`	`50`	`1.1204, gpx.predict_gradients(np.array([[1.1]])).item(), delta=1e-3`
`51`	`51`	`)`
`52`	`52`	`self.assertAlmostEqual(`
`53`		`- 0.0092, gpx.predict_var_gradients(np.array([[1.1]])).item(), delta=1e-3`
	`53`	`+ 0.0145, gpx.predict_var_gradients(np.array([[1.1]])).item(), delta=1e-3`
`54`	`54`	`)`
`55`	`55`
`56`	`56`	`def test_gpx_save_load(self):`