From e8580de4880793270c3a0f3e35809ea8e86eaae5 Mon Sep 17 00:00:00 2001 From: josephsdavid Date: Tue, 31 May 2022 23:14:51 -0500 Subject: [PATCH 01/22] getting started --- src/MLJMultivariateStatsInterface.jl | 91 +++++++++++++++++++++------- 1 file changed, 68 insertions(+), 23 deletions(-) diff --git a/src/MLJMultivariateStatsInterface.jl b/src/MLJMultivariateStatsInterface.jl index 05c7bfb..97c79f5 100644 --- a/src/MLJMultivariateStatsInterface.jl +++ b/src/MLJMultivariateStatsInterface.jl @@ -45,13 +45,13 @@ const FactorAnalysis_DESCR = "Factor Analysis" const LDA_DESCR = """ Multiclass linear discriminant analysis. The algorithm learns a projection matrix `P` that projects a feature matrix `Xtrain` onto a lower dimensional - space of dimension `out_dim` such that the trace of the transformed between-class - scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the transformed - within-class scatter matrix (`Pᵀ*Sw*P`).The projection matrix is scaled such that + space of dimension `out_dim` such that the trace of the transformed between-class + scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the transformed + within-class scatter matrix (`Pᵀ*Sw*P`).The projection matrix is scaled such that `Pᵀ*Sw*P=I` or `Pᵀ*Σw*P=I`(where `Σw` is the within-class covariance matrix) . - Predicted class posterior probability for feature matrix `Xtest` are derived by - applying a softmax transformationto a matrix `Pr`, such that rowᵢ of `Pr` contains - computed distances(based on a distance metric) in the transformed space of rowᵢ in + Predicted class posterior probability for feature matrix `Xtest` are derived by + applying a softmax transformationto a matrix `Pr`, such that rowᵢ of `Pr` contains + computed distances(based on a distance metric) in the transformed space of rowᵢ in `Xtest` to the centroid of each class. """ const BayesianLDA_DESCR = """ @@ -59,10 +59,10 @@ const BayesianLDA_DESCR = """ learns a projection matrix `P` that projects a feature matrix `Xtrain` onto a lower dimensional space of dimension `out_dim` such that the trace of the transformed between-class scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the - transformed within-class scatter matrix (`Pᵀ*Sw*P`). The projection matrix is scaled - such that `Pᵀ*Sw*P = n` or `Pᵀ*Σw*P=I` (Where `n` is the number of training samples + transformed within-class scatter matrix (`Pᵀ*Sw*P`). The projection matrix is scaled + such that `Pᵀ*Sw*P = n` or `Pᵀ*Σw*P=I` (Where `n` is the number of training samples and `Σw` is the within-class covariance matrix). - Predicted class posterior probability distibution are derived by applying Bayes rule + Predicted class posterior probability distibution are derived by applying Bayes rule with a multivariate Gaussian class-conditional distribution. """ const SubspaceLDA_DESCR = """ @@ -71,24 +71,24 @@ const SubspaceLDA_DESCR = """ projection matrix `P = W*L` that projects a feature matrix `Xtrain` onto a lower dimensional space of dimension `nc - 1` such that the trace of the transformed between-class scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the - transformed within-class scatter matrix (`Pᵀ*Sw*P`). The projection matrix is scaled - such that `Pᵀ*Sw*P = mult*I` or `Pᵀ*Σw*P=mult/(n-nc)*I` (where `n` is the number of - training samples, mult` is one of `n` or `1` depending on whether `Sb` is normalized, - `Σw` is the within-class covariance matrix, and `nc` is the number of unique classes + transformed within-class scatter matrix (`Pᵀ*Sw*P`). 
The projection matrix is scaled + such that `Pᵀ*Sw*P = mult*I` or `Pᵀ*Σw*P=mult/(n-nc)*I` (where `n` is the number of + training samples, mult` is one of `n` or `1` depending on whether `Sb` is normalized, + `Σw` is the within-class covariance matrix, and `nc` is the number of unique classes in `y`) and also obeys `Wᵀ*Sb*p = λ*Wᵀ*Sw*p`, for every column `p` in `P`. - Predicted class posterior probability for feature matrix `Xtest` are derived by - applying a softmax transformation to a matrix `Pr`, such that rowᵢ of `Pr` contains - computed distances(based on a distance metric) in the transformed space of rowᵢ in + Predicted class posterior probability for feature matrix `Xtest` are derived by + applying a softmax transformation to a matrix `Pr`, such that rowᵢ of `Pr` contains + computed distances(based on a distance metric) in the transformed space of rowᵢ in `Xtest` to the centroid of each class. """ const BayesianSubspaceLDA_DESCR = """ - Bayesian Multiclass linear discriminant analysis. Suitable for high dimensional data - (Avoids computing scatter matrices `Sw` ,`Sb`). The algorithm learns a projection - matrix `P = W*L` (`Sw`), that projects a feature matrix `Xtrain` onto a lower - dimensional space of dimension `nc-1` such that the trace of the transformed - between-class scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the - transformed within-class scatter matrix (`Pᵀ*Sw*P`). The projection matrix is scaled - such that `Pᵀ*Sw*P = mult*I` or `Pᵀ*Σw*P=mult/(n-nc)*I` (where `n` is the number of + Bayesian Multiclass linear discriminant analysis. Suitable for high dimensional data + (Avoids computing scatter matrices `Sw` ,`Sb`). The algorithm learns a projection + matrix `P = W*L` (`Sw`), that projects a feature matrix `Xtrain` onto a lower + dimensional space of dimension `nc-1` such that the trace of the transformed + between-class scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the + transformed within-class scatter matrix (`Pᵀ*Sw*P`). The projection matrix is scaled + such that `Pᵀ*Sw*P = mult*I` or `Pᵀ*Σw*P=mult/(n-nc)*I` (where `n` is the number of training samples, `mult` is one of `n` or `1` depending on whether `Sb` is normalized, `Σw` is the within-class covariance matrix, and `nc` is the number of unique classes in `y`) and also obeys `Wᵀ*Sb*p = λ*Wᵀ*Sw*p`, for every column `p` in `P`. @@ -150,4 +150,49 @@ metadata_pkg.( is_wrapper = false ) +""" +$(MMI.doc_header(LinearRegressor)) +`LinearRegressor` implements the $TODO. 
+ +# Training data + +In MLJ or MLJBase, bind an instance `model` to data with + mach = machine(model, X, y) + +Where + +TODO: Check if we can put factors here +- `X`: any table of input features (eg, a `DataFrame`) whose columns + each have one of the following element scitypes: `Continuous`, + `Count`, or `<:OrderedFactor`; check column scitypes with `schema(X)` + +- `y`: is the target, which can be any `AbstractVector` whose element + scitype is `Continuous`; check the scitype with `scitype(y)` + +# Hyper-parameters + +- `bias=true`: include bias term if true, else fit without bias term + +# Operations + +- `predict(mach, Xnew)`: + +# Fitted parameters +The fields of `fitted_params(mach)` are: + +- `coefficients`: +- `intercept`: + +# Report +The fields of `report(mach)` are: + +# Examples +``` + +``` +See also +TODO: ADD REFERENCES +""" +LinearRegressor + end From bf5dc3157ec35c6eae7e2fafbec3f7ae0861f256 Mon Sep 17 00:00:00 2001 From: josephsdavid Date: Tue, 31 May 2022 23:20:58 -0500 Subject: [PATCH 02/22] only continuous for linreg --- src/MLJMultivariateStatsInterface.jl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/MLJMultivariateStatsInterface.jl b/src/MLJMultivariateStatsInterface.jl index 97c79f5..4c069eb 100644 --- a/src/MLJMultivariateStatsInterface.jl +++ b/src/MLJMultivariateStatsInterface.jl @@ -161,10 +161,8 @@ In MLJ or MLJBase, bind an instance `model` to data with Where -TODO: Check if we can put factors here - `X`: any table of input features (eg, a `DataFrame`) whose columns - each have one of the following element scitypes: `Continuous`, - `Count`, or `<:OrderedFactor`; check column scitypes with `schema(X)` + are of scitype `Continuous` - `y`: is the target, which can be any `AbstractVector` whose element scitype is `Continuous`; check the scitype with `scitype(y)` From 315acdc81bff2690382148065d9c621cdd15b60a Mon Sep 17 00:00:00 2001 From: josephsdavid Date: Tue, 31 May 2022 23:33:42 -0500 Subject: [PATCH 03/22] linreg example --- src/MLJMultivariateStatsInterface.jl | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/MLJMultivariateStatsInterface.jl b/src/MLJMultivariateStatsInterface.jl index 4c069eb..d35e2b1 100644 --- a/src/MLJMultivariateStatsInterface.jl +++ b/src/MLJMultivariateStatsInterface.jl @@ -152,7 +152,7 @@ metadata_pkg.( """ $(MMI.doc_header(LinearRegressor)) -`LinearRegressor` implements the $TODO. +`LinearRegressor` implements the TODO. # Training data @@ -176,18 +176,32 @@ Where - `predict(mach, Xnew)`: # Fitted parameters + The fields of `fitted_params(mach)` are: - `coefficients`: - `intercept`: -# Report -The fields of `report(mach)` are: - # Examples + ``` +# example from [JuliaStats](https://juliastats.org/MultivariateStats.jl/dev/lreg/#Examples) +using MLJ + + +LinearRegressor = @load LinearRegressor pkg=MultivariateStats +linear_regressor = LinearRegressor() + +X, y = make_regression(100, 2) # synthetic data +mach = machine(linear_regressor, X, y) |> fit! 
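# a machine binds the model to the data; fit! then learns the least-squares coefficients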
+ + +Xnew, _ = make_regression(3, 2) +yhat = predict(mach, Xnew) # new predictions +yhat_point = predict_mean(mach, Xnew) # new predictions ``` + See also TODO: ADD REFERENCES """ From 8920b5a85e8073bcc94403fdaf16d49108886497 Mon Sep 17 00:00:00 2001 From: josephsdavid Date: Mon, 6 Jun 2022 00:13:35 -0500 Subject: [PATCH 04/22] working example --- src/MLJMultivariateStatsInterface.jl | 88 ++++++++++++++++++++++++---- 1 file changed, 77 insertions(+), 11 deletions(-) diff --git a/src/MLJMultivariateStatsInterface.jl b/src/MLJMultivariateStatsInterface.jl index d35e2b1..b4cc484 100644 --- a/src/MLJMultivariateStatsInterface.jl +++ b/src/MLJMultivariateStatsInterface.jl @@ -152,7 +152,12 @@ metadata_pkg.( """ $(MMI.doc_header(LinearRegressor)) -`LinearRegressor` implements the TODO. + +`LinearRegressor` assumes the target is a continuous variable +whose conditional distribution is normal with constant variance, and whose +expected value is a linear combination of the features. Linear coefficients +are calculated using least squares. +Options exist to specify a bias term. # Training data @@ -161,11 +166,11 @@ In MLJ or MLJBase, bind an instance `model` to data with Where -- `X`: any table of input features (eg, a `DataFrame`) whose columns - are of scitype `Continuous` +- `X`: is any table of input features (eg, a `DataFrame`) whose columns + are of scitype `Continuous`; check the scitype with `schema(X)` - `y`: is the target, which can be any `AbstractVector` whose element - scitype is `Continuous`; check the scitype with `scitype(y)` + scitype is `Continuous`; check the scitype with `schema(y)` # Hyper-parameters @@ -173,22 +178,21 @@ Where # Operations -- `predict(mach, Xnew)`: +- `predict(mach, Xnew)`: Return predictions of the target given new + features `Xnew` having the same Scitype as `X` above. # Fitted parameters The fields of `fitted_params(mach)` are: -- `coefficients`: -- `intercept`: +- `coefficients`: The linear coefficients determined by the model. +- `intercept`: The intercept determined by the model. # Examples ``` -# example from [JuliaStats](https://juliastats.org/MultivariateStats.jl/dev/lreg/#Examples) using MLJ - LinearRegressor = @load LinearRegressor pkg=MultivariateStats linear_regressor = LinearRegressor() @@ -196,10 +200,8 @@ linear_regressor = LinearRegressor() X, y = make_regression(100, 2) # synthetic data mach = machine(linear_regressor, X, y) |> fit! - Xnew, _ = make_regression(3, 2) yhat = predict(mach, Xnew) # new predictions -yhat_point = predict_mean(mach, Xnew) # new predictions ``` See also @@ -207,4 +209,68 @@ TODO: ADD REFERENCES """ LinearRegressor +""" +$(MMI.doc_header(MultitargetLinearRegressor)) + +`MultitargetLinearRegressor` assumes the target is a continuous variable +whose conditional distribution is normal with constant variance, and whose +expected value is a linear combination of the features. Linear coefficients +are calculated using least squares. In this case, the output represents a +response vector. +Options exist to specify a bias term. 
+ +# Training data + +In MLJ or MLJBase, bind an instance `model` to data with + mach = machine(model, X, y) + +Where + +- `X`: is any table of input features (eg, a `DataFrame`) whose columns + are of scitype `Continuous`; check the scitype with `schema(X)` + +- `y`: is the target, which can be any `AbstractMatrix` whose element + scitype is `Continuous`; check the scitype with `schema(y)` + +# Hyper-parameters + +- `bias=true`: include bias term if true, else fit without bias term + +# Operations + +- `predict(mach, Xnew)`: Return predictions of the target given new + features `Xnew` having the same Scitype as `X` above. + +# Fitted parameters + +The fields of `fitted_params(mach)` are: + +- `coefficients`: The linear coefficients determined by the model. +- `intercept`: The intercept determined by the model. + +# Examples + +``` +using MLJ +using MLJBase: augment_X +using DataFrames + +LinearRegressor = @load MultitargetLinearRegressor pkg=MultivariateStats +linear_regressor = LinearRegressor() + +X = augment_X(randn(100, 8), true) +θ = randn((9,2)) +y = X * θ +X, y = map(x -> DataFrame(x, :auto), (X, y)) + +mach = machine(linear_regressor, X, y) |> fit! + +Xnew, _ = make_regression(3, 9) +yhat = predict(mach, Xnew) # new predictions +``` + +See also +TODO: ADD REFERENCES +""" +MultitargetLinearRegressor end From 1e81a325e134056b31389051368e6eb6087e5a83 Mon Sep 17 00:00:00 2001 From: josephsdavid Date: Mon, 6 Jun 2022 00:40:11 -0500 Subject: [PATCH 05/22] no more regression --- src/MLJMultivariateStatsInterface.jl | 160 ++++++++++++++++++++++++++- 1 file changed, 158 insertions(+), 2 deletions(-) diff --git a/src/MLJMultivariateStatsInterface.jl b/src/MLJMultivariateStatsInterface.jl index b4cc484..54fd1fd 100644 --- a/src/MLJMultivariateStatsInterface.jl +++ b/src/MLJMultivariateStatsInterface.jl @@ -174,7 +174,7 @@ Where # Hyper-parameters -- `bias=true`: include bias term if true, else fit without bias term +- `bias=true`: Include the bias term if true, otherwise fit without bias term. # Operations @@ -234,7 +234,7 @@ Where # Hyper-parameters -- `bias=true`: include bias term if true, else fit without bias term +- `bias=true`: Include the bias term if true, otherwise fit without bias term. # Operations @@ -273,4 +273,160 @@ See also TODO: ADD REFERENCES """ MultitargetLinearRegressor + +""" +$(MMI.doc_header(RidgeRegressor)) + +`RidgeRegressor` adds a quadratic penalty term to least squares regression, +for regularization. Ridge regression is particularly useful in the case of +multicollinearity. +Options exist to specify a bias term, and to adjust the strength of the penalty term. + +# Training data + +In MLJ or MLJBase, bind an instance `model` to data with + mach = machine(model, X, y) + +Where + +- `X`: is any table of input features (eg, a `DataFrame`) whose columns + are of scitype `Continuous`; check the scitype with `schema(X)` + +- `y`: is the target, which can be any `AbstractVector` whose element + scitype is `Continuous`; check the scitype with `schema(y)` + +# Hyper-parameters + +- `lambda=1.0`: Is the non-negative parameter for the + regularization strength. If lambda is 0, ridge regression is equivalent + to linear least squares regression, and as lambda approaches infinity, + all the linear coefficients approach 0. + +- `bias=true`: Include the bias term if true, otherwise fit without bias term. + +# Operations + +- `predict(mach, Xnew)`: Return predictions of the target given new + features `Xnew` having the same Scitype as `X` above. 
+ +# Fitted parameters + +The fields of `fitted_params(mach)` are: + +- `coefficients`: The linear coefficients determined by the model. +- `intercept`: The intercept determined by the model. + +# Examples + +``` +using MLJ + +LinearRegressor = @load LinearRegressor pkg=MultivariateStats +RidgeRegressor = @load RidgeRegressor pkg=MultivariateStats + +X, y = make_regression(100, 60) # synthetic data + +linear_regressor = LinearRegressor() +mach = machine(linear_regressor, X, y) |> fit! +llsq_coef = fitted_params(mach).coefficients + +ridge_regressor = RidgeRegressor(lambda=0) +ridge_mach = machine(ridge_regressor, X, y) |> fit! +coef = fitted_params(ridge_mach).coefficients +difference = llsq_coef - coef +@info "difference between λ=0 ridge and llsq" mean(difference) std(difference) + + +ridge_regressor = RidgeRegressor(lambda=1.5) +ridge_mach = machine(ridge_regressor, X, y) |> fit! + +Xnew, _ = make_regression(3, 60) +yhat = predict(mach, Xnew) # new predictions +``` + +See also +TODO: ADD REFERENCES +""" +RidgeRegressor + +""" +$(MMI.doc_header(MultitargetRidgeRegressor)) + +`MultitargetRidgeRegressor` adds a quadratic penalty term to least squares regression, +for regularization. Ridge regression is particularly useful in the case of +multicollinearity. In this case, the output represents a response vector. +Options exist to specify a bias term, and to adjust the strength of the penalty term. + +# Training data + +In MLJ or MLJBase, bind an instance `model` to data with + mach = machine(model, X, y) + +Where + +- `X`: is any table of input features (eg, a `DataFrame`) whose columns + are of scitype `Continuous`; check the scitype with `schema(X)` + +- `y`: is the target, which can be any `AbstractMatrix` whose element + scitype is `Continuous`; check the scitype with `schema(y)` + +# Hyper-parameters + +- `lambda=1.0`: Is the non-negative parameter for the + regularization strength. If lambda is 0, ridge regression is equivalent + to linear least squares regression, and as lambda approaches infinity, + all the linear coefficients approach 0. + +- `bias=true`: Include the bias term if true, otherwise fit without bias term. + +# Operations + +- `predict(mach, Xnew)`: Return predictions of the target given new + features `Xnew` having the same Scitype as `X` above. + +# Fitted parameters + +The fields of `fitted_params(mach)` are: + +- `coefficients`: The linear coefficients determined by the model. +- `intercept`: The intercept determined by the model. + +# Examples + +``` +using MLJ +using MLJBase: augment_X +using DataFrames + +LinearRegressor = @load MultitargetLinearRegressor pkg=MultivariateStats +RidgeRegressor = @load MultitargetRidgeRegressor pkg=MultivariateStats + +X = augment_X(randn(100, 80), true) +θ = randn((81,4)) +y = X * θ +X, y = map(x -> DataFrame(x, :auto), (X, y)) + +linear_regressor = LinearRegressor() +mach = machine(linear_regressor, X, y) |> fit! +llsq_coef = fitted_params(mach).coefficients + +ridge_regressor = RidgeRegressor(lambda=0) +ridge_mach = machine(ridge_regressor, X, y) |> fit! +coef = fitted_params(ridge_mach).coefficients +difference = llsq_coef - coef +@info "difference between λ=0 ridge and llsq" mean(difference) std(difference) + + +ridge_regressor = RidgeRegressor(lambda=1.5) +ridge_mach = machine(ridge_regressor, X, y) |> fit! 
+ +Xnew, _ = make_regression(3, 60) +yhat = predict(mach, Xnew) # new predictions +``` + +See also +TODO: ADD REFERENCES +""" +MultitargetRidgeRegressor + end From ffda05f5c95bedb0eabe8ecb4c4bcd92c5d1a1ef Mon Sep 17 00:00:00 2001 From: josephsdavid Date: Mon, 6 Jun 2022 00:43:14 -0500 Subject: [PATCH 06/22] regression knocked out --- src/MLJMultivariateStatsInterface.jl | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/MLJMultivariateStatsInterface.jl b/src/MLJMultivariateStatsInterface.jl index 54fd1fd..c099a2e 100644 --- a/src/MLJMultivariateStatsInterface.jl +++ b/src/MLJMultivariateStatsInterface.jl @@ -406,15 +406,15 @@ X = augment_X(randn(100, 80), true) y = X * θ X, y = map(x -> DataFrame(x, :auto), (X, y)) -linear_regressor = LinearRegressor() -mach = machine(linear_regressor, X, y) |> fit! -llsq_coef = fitted_params(mach).coefficients - -ridge_regressor = RidgeRegressor(lambda=0) -ridge_mach = machine(ridge_regressor, X, y) |> fit! -coef = fitted_params(ridge_mach).coefficients -difference = llsq_coef - coef -@info "difference between λ=0 ridge and llsq" mean(difference) std(difference) +# linear_regressor = LinearRegressor() # positive semi definite error for cholesky :( +# mach = machine(linear_regressor, X, y) |> fit! +# llsq_coef = fitted_params(mach).coefficients +# +# ridge_regressor = RidgeRegressor(lambda=0) +# ridge_mach = machine(ridge_regressor, X, y) |> fit! +# coef = fitted_params(ridge_mach).coefficients +# difference = llsq_coef - coef +# @info "difference between λ=0 ridge and llsq" mean(difference) std(difference) ridge_regressor = RidgeRegressor(lambda=1.5) @@ -429,4 +429,13 @@ TODO: ADD REFERENCES """ MultitargetRidgeRegressor +PCA +KernelPCA +ICA +LDA +BayesianLDA +SubspaceLDA +BayesianSubspaceLDA +FactorAnalysis +PPCA end From fa5ab053f728d3037512b77ad1dd6f897e609d5f Mon Sep 17 00:00:00 2001 From: josephsdavid Date: Mon, 6 Jun 2022 00:52:59 -0500 Subject: [PATCH 07/22] matrix -> table --- src/MLJMultivariateStatsInterface.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/MLJMultivariateStatsInterface.jl b/src/MLJMultivariateStatsInterface.jl index c099a2e..0583821 100644 --- a/src/MLJMultivariateStatsInterface.jl +++ b/src/MLJMultivariateStatsInterface.jl @@ -229,7 +229,7 @@ Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns are of scitype `Continuous`; check the scitype with `schema(X)` -- `y`: is the target, which can be any `AbstractMatrix` whose element +- `y`: is the target, which can be any table of responses whose element scitype is `Continuous`; check the scitype with `schema(y)` # Hyper-parameters @@ -367,7 +367,7 @@ Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns are of scitype `Continuous`; check the scitype with `schema(X)` -- `y`: is the target, which can be any `AbstractMatrix` whose element +- `y`: is the target, which can be any table of responses whose element scitype is `Continuous`; check the scitype with `schema(y)` # Hyper-parameters From 276427fc428869063f47482985ac5e44aeb0a20a Mon Sep 17 00:00:00 2001 From: josephsdavid Date: Mon, 6 Jun 2022 01:14:48 -0500 Subject: [PATCH 08/22] pca underway! 
--- src/MLJMultivariateStatsInterface.jl | 55 ++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/src/MLJMultivariateStatsInterface.jl b/src/MLJMultivariateStatsInterface.jl index 0583821..aeee1d0 100644 --- a/src/MLJMultivariateStatsInterface.jl +++ b/src/MLJMultivariateStatsInterface.jl @@ -429,6 +429,61 @@ TODO: ADD REFERENCES """ MultitargetRidgeRegressor +""" +$(MMI.doc_header(PCA)) + +`PCA` + +# Training data + +In MLJ or MLJBase, bind an instance `model` to data with + mach = machine(model, X) + +Where + +- `X`: is any table of input features (eg, a `DataFrame`) whose columns + are of scitype `Continuous`; check the scitype with `schema(X)` + +# Hyper-parameters + +# XXX: Would it be more consistent to use nothing or something as default? +- `maxoutdim=0`: The maximum number of output dimensions. If not set, defaults to + 0, where all components are kept (e.g., the number of components/output dimensions + is equal to the size of the smallest dimension of the training matrix) +- `method=:auto`: The method to use to solve the problem. Choices are + - `:svd`: Support Vector Decomposition of the matrix. + - `:cov`: Covariance matrix decomposition. + - `:auto`: Use `:cov` if the matrices first dimension is smaller than its second dimension + otherwise use `:svd` +- `pratio::Float64=0.99`: The ratio of variance preserved after the transformation +- `mean=nothing`: if set to nothing(default) centering will be computed and applied, + if set to `0` no centering(assumed pre-centered), if a vector is passed, + the centering is done with that vector. + +# Operations + +- `predict(mach, Xnew)`: Return predictions of the target given new + features `Xnew` having the same Scitype as `X` above. + +# Fitted parameters + +TODO: Example, coeff, report + +The fields of `fitted_params(mach)` are: + + +# Examples + +``` +using MLJ + +PCA = @load PCA pkg=MultivariateStats + +``` + +See also +TODO: ADD REFERENCES +""" PCA KernelPCA ICA From cd43eff46a2306e5a18a0cb0357df8aadaf9f38c Mon Sep 17 00:00:00 2001 From: josephsdavid Date: Mon, 6 Jun 2022 01:16:21 -0500 Subject: [PATCH 09/22] pca header --- src/MLJMultivariateStatsInterface.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/MLJMultivariateStatsInterface.jl b/src/MLJMultivariateStatsInterface.jl index aeee1d0..0766e1f 100644 --- a/src/MLJMultivariateStatsInterface.jl +++ b/src/MLJMultivariateStatsInterface.jl @@ -432,7 +432,9 @@ MultitargetRidgeRegressor """ $(MMI.doc_header(PCA)) -`PCA` +`PCA` Principal component analysis. Learns a linear transformation to +project the data on a lower dimensional space while preserving most of the initial +variance. # Training data From 06455a4e34759b1281919f0a92bb429745381763 Mon Sep 17 00:00:00 2001 From: josephsdavid Date: Mon, 13 Jun 2022 14:43:36 -0500 Subject: [PATCH 10/22] found bug --- src/MLJMultivariateStatsInterface.jl | 7 +++++-- src/models/decomposition_models.jl | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/MLJMultivariateStatsInterface.jl b/src/MLJMultivariateStatsInterface.jl index 0766e1f..f974f4f 100644 --- a/src/MLJMultivariateStatsInterface.jl +++ b/src/MLJMultivariateStatsInterface.jl @@ -448,7 +448,6 @@ Where # Hyper-parameters -# XXX: Would it be more consistent to use nothing or something as default? - `maxoutdim=0`: The maximum number of output dimensions. 
If not set, defaults to 0, where all components are kept (e.g., the number of components/output dimensions is equal to the size of the smallest dimension of the training matrix) @@ -464,7 +463,7 @@ Where # Operations -- `predict(mach, Xnew)`: Return predictions of the target given new +- `transform(mach, Xnew)`: Return predictions of the target given new features `Xnew` having the same Scitype as `X` above. # Fitted parameters @@ -481,6 +480,10 @@ using MLJ PCA = @load PCA pkg=MultivariateStats +X, y = @load_iris +model = PCA(maxoutdim=2) +mach = machine(pca, X) |> fit! + ``` See also diff --git a/src/models/decomposition_models.jl b/src/models/decomposition_models.jl index 3fda057..68d5368 100644 --- a/src/models/decomposition_models.jl +++ b/src/models/decomposition_models.jl @@ -43,8 +43,8 @@ function MMI.fit(model::PCA, verbosity::Int, X) ) cache = nothing report = ( - indim=MS.size(fitresult,1), - outdim=MS.size(fitresult,2), + indim=MS.size(fitresult)[1], + outdim=MS.size(fitresult)[2], tprincipalvar=MS.tprincipalvar(fitresult), tresidualvar=MS.tresidualvar(fitresult), tvar=MS.var(fitresult), From bc97ab7beff12b473d012f590f60a37e040ffad6 Mon Sep 17 00:00:00 2001 From: josephsdavid Date: Mon, 13 Jun 2022 15:11:03 -0500 Subject: [PATCH 11/22] hot fix --- Project.toml | 2 +- src/MLJMultivariateStatsInterface.jl | 176 ++++++++++++++++++++++++++- src/models/decomposition_models.jl | 1 + 3 files changed, 175 insertions(+), 4 deletions(-) diff --git a/Project.toml b/Project.toml index f1fad47..c969a92 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "MLJMultivariateStatsInterface" uuid = "1b6a4a23-ba22-4f51-9698-8599985d3728" authors = ["Anthony D. Blaom ", "Thibaut Lienart ", "Okon Samuel "] -version = "0.3.1" +version = "0.3.2" [deps] Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" diff --git a/src/MLJMultivariateStatsInterface.jl b/src/MLJMultivariateStatsInterface.jl index f974f4f..86ab0a8 100644 --- a/src/MLJMultivariateStatsInterface.jl +++ b/src/MLJMultivariateStatsInterface.jl @@ -468,10 +468,24 @@ Where # Fitted parameters -TODO: Example, coeff, report - The fields of `fitted_params(mach)` are: +- `projection`: Returns the projection matrix (of size `(d, p)`). + Each column of the projection matrix corresponds to a principal component. + The principal components are arranged in descending order of + the corresponding variances. + +# Report + +The fields of `report(mach)` are: + +- `indim`: Dimensions of the provided data. +- `outdim`: Dimensions of the transformed result. +- `tprincipalvar`: Total variance of the principal components. +- `tresidualvar`: Total residual variance. +- `tvar`: Total observation variance (principal + residual variance). +- `mean`: The mean vector (of length `d`). +- `principalvars`: The variance of the principal components. # Examples @@ -481,16 +495,172 @@ using MLJ PCA = @load PCA pkg=MultivariateStats X, y = @load_iris + model = PCA(maxoutdim=2) -mach = machine(pca, X) |> fit! +mach = machine(model, X) |> fit! +projection = transform(mach, X) ``` See also TODO: ADD REFERENCES """ PCA +""" +$(MMI.doc_header(KernelPCA)) + +`KernelPCA` Principal component analysis. Learns a linear transformation to +project the data on a lower dimensional space while preserving most of the initial +variance. 
+ +# Training data + +In MLJ or MLJBase, bind an instance `model` to data with + mach = machine(model, X) + +Where + +- `X`: is any table of input features (eg, a `DataFrame`) whose columns + are of scitype `Continuous`; check the scitype with `schema(X)` + +# Hyper-parameters + +- `maxoutdim=0`: The maximum number of output dimensions. If not set, defaults to + 0, where all components are kept (e.g., the number of components/output dimensions + is equal to the size of the smallest dimension of the training matrix). +- `kernel::Function=(x,y)->x'y`: The kernel function, takes in 2 vector arguments + x and y, returns a scalar value. Defaults to the dot product of X and Y. +- `solver::Symbol=:auto`: solver to use for the eigenvalues, one of `:eig`(default), + `:eigs`. +- `inverse::Bool=true`: perform calculations needed for inverse transform +- `beta::Real=1.0`: strength of the ridge regression that learns the inverse transform + when inverse is true. +- `tol::Real=0.0`: Convergence tolerance for eigs solver. +- `maxiter::Int=300`: maximum number of iterations for eigs solver. + +# Operations + +- `transform(mach, Xnew)`: Return predictions of the target given new + features `Xnew` having the same Scitype as `X` above. + +# Fitted parameters + +The fields of `fitted_params(mach)` are: + +- `projection`: Returns the projection matrix (of size `(d, p)`). + Each column of the projection matrix corresponds to a principal component. + The principal components are arranged in descending order of + the corresponding variances. + +# Report + +The fields of `report(mach)` are: + +- `indim`: Dimensions of the provided data. +- `outdim`: Dimensions of the transformed result. +- `principalvars`: The variance of the principal components. + +# Examples + +``` +using MLJ +using LinearAlgebra + +KPCA = @load KernelPCA pkg=MultivariateStats + +X, y = @load_iris + +function rbf_kernel(length_scale) + return (x,y) -> norm(x-y)^2 / ((2 * length_scale)^2) +end + +model = KPCA(maxoutdim=2, kernel = rbf_kernel(1)) +mach = machine(model, X) |> fit! + +projection = transform(mach, X) +``` + +See also +TODO: ADD REFERENCES +""" KernelPCA +""" +$(MMI.doc_header(ICA)) + +`ICA` Principal component analysis. Learns a linear transformation to +project the data on a lower dimensional space while preserving most of the initial +variance. + +# Training data + +In MLJ or MLJBase, bind an instance `model` to data with + mach = machine(model, X) + +Where + +- `X`: is any table of input features (eg, a `DataFrame`) whose columns + are of scitype `Continuous`; check the scitype with `schema(X)` + +# Hyper-parameters + +- `maxoutdim=0`: The maximum number of output dimensions. If not set, defaults to + 0, where all components are kept (e.g., the number of components/output dimensions + is equal to the size of the smallest dimension of the training matrix). +- `kernel::Function=(x,y)->x'y`: The kernel function, takes in 2 vector arguments + x and y, returns a scalar value. Defaults to the dot product of X and Y. +- `solver::Symbol=:auto`: solver to use for the eigenvalues, one of `:eig`(default), + `:eigs`. +- `inverse::Bool=true`: perform calculations needed for inverse transform +- `beta::Real=1.0`: strength of the ridge regression that learns the inverse transform + when inverse is true. +- `tol::Real=0.0`: Convergence tolerance for eigs solver. +- `maxiter::Int=300`: maximum number of iterations for eigs solver. 
+ +# Operations + +- `transform(mach, Xnew)`: Return predictions of the target given new + features `Xnew` having the same Scitype as `X` above. + +# Fitted parameters + +The fields of `fitted_params(mach)` are: + +- `projection`: Returns the projection matrix (of size `(d, p)`). + Each column of the projection matrix corresponds to a principal component. + The principal components are arranged in descending order of + the corresponding variances. + +# Report + +The fields of `report(mach)` are: + +- `indim`: Dimensions of the provided data. +- `outdim`: Dimensions of the transformed result. +- `principalvars`: The variance of the principal components. + +# Examples + +``` +using MLJ +using LinearAlgebra + +KPCA = @load KernelPCA pkg=MultivariateStats + +X, y = @load_iris + +function rbf_kernel(length_scale) + return (x,y) -> norm(x-y)^2 / ((2 * length_scale)^2) +end + +model = KPCA(maxoutdim=2, kernel = rbf_kernel(1)) +mach = machine(model, X) |> fit! + +projection = transform(mach, X) +``` + +See also +TODO: ADD REFERENCES +""" ICA LDA BayesianLDA diff --git a/src/models/decomposition_models.jl b/src/models/decomposition_models.jl index 68d5368..584e0e2 100644 --- a/src/models/decomposition_models.jl +++ b/src/models/decomposition_models.jl @@ -43,6 +43,7 @@ function MMI.fit(model::PCA, verbosity::Int, X) ) cache = nothing report = ( + # TODO: Make PR to MultivariateStats indim=MS.size(fitresult)[1], outdim=MS.size(fitresult)[2], tprincipalvar=MS.tprincipalvar(fitresult), From a0f090b9bc2ea89a982734b04d9dacc9f345cd5d Mon Sep 17 00:00:00 2001 From: josephsdavid Date: Mon, 13 Jun 2022 15:41:10 -0500 Subject: [PATCH 12/22] wording --- src/MLJMultivariateStatsInterface.jl | 138 +++++++++++++++++++++------ 1 file changed, 107 insertions(+), 31 deletions(-) diff --git a/src/MLJMultivariateStatsInterface.jl b/src/MLJMultivariateStatsInterface.jl index 86ab0a8..0a227e3 100644 --- a/src/MLJMultivariateStatsInterface.jl +++ b/src/MLJMultivariateStatsInterface.jl @@ -463,7 +463,7 @@ Where # Operations -- `transform(mach, Xnew)`: Return predictions of the target given new +- `transform(mach, Xnew)`: Return lower dimensional projection of the target given new features `Xnew` having the same Scitype as `X` above. # Fitted parameters @@ -509,9 +509,8 @@ PCA """ $(MMI.doc_header(KernelPCA)) -`KernelPCA` Principal component analysis. Learns a linear transformation to -project the data on a lower dimensional space while preserving most of the initial -variance. +`KernelPCA` Kernel principal component analysis. Using a kernel, the linear +operations of PCA are performed in a [reproducing Hilbert space](https://en.wikipedia.org/wiki/Reproducing_kernel_Hilbert_space). # Training data @@ -587,9 +586,9 @@ KernelPCA """ $(MMI.doc_header(ICA)) -`ICA` Principal component analysis. Learns a linear transformation to -project the data on a lower dimensional space while preserving most of the initial -variance. +`ICA` is a computational technique for separating a multivariate signal into +additive subcomponents, with the assumption that the subcomponents are +non-Gaussian and independent from each other. # Training data @@ -603,32 +602,31 @@ Where # Hyper-parameters -- `maxoutdim=0`: The maximum number of output dimensions. If not set, defaults to - 0, where all components are kept (e.g., the number of components/output dimensions - is equal to the size of the smallest dimension of the training matrix). 
-- `kernel::Function=(x,y)->x'y`: The kernel function, takes in 2 vector arguments - x and y, returns a scalar value. Defaults to the dot product of X and Y. -- `solver::Symbol=:auto`: solver to use for the eigenvalues, one of `:eig`(default), - `:eigs`. -- `inverse::Bool=true`: perform calculations needed for inverse transform -- `beta::Real=1.0`: strength of the ridge regression that learns the inverse transform - when inverse is true. -- `tol::Real=0.0`: Convergence tolerance for eigs solver. -- `maxiter::Int=300`: maximum number of iterations for eigs solver. +- `k::Int=0`: The number of independent components to recover, set automatically if `0`. +- `alg::Symbol=:fastica`: The algorithm to use (only `:fastica` is supported at the moment). +- `fun::Symbol=:tanh`: The approximate neg-entropy function, one of `:tanh`, `:gaus`. +- `do_whiten::Bool=true`: Whether or not to perform pre-whitening. +- `maxiter::Int=100`: The maximum number of iterations. +- `tol::Real=1e-6`: The convergence tolerance for change in matrix W. +- `mean::Union{Nothing, Real, Vector{Float64}}=nothing`: mean to use, if nothing (default) + centering is computed and applied, if zero, no centering, a vector of means can + be passed. +- `winit::Union{Nothing,Matrix{<:Real}}=nothing`: Initial guess for matrix `W` either + an empty matrix (random initilization of `W`), a matrix of size `k × k` (if `do_whiten` + is true), a matrix of size `m × k` otherwise. If unspecified i.e `nothing` an empty + `Matrix{<:Real}` is used. # Operations -- `transform(mach, Xnew)`: Return predictions of the target given new +- `transform(mach, Xnew)`: Return lower dimensional projection of the target given new features `Xnew` having the same Scitype as `X` above. # Fitted parameters The fields of `fitted_params(mach)` are: -- `projection`: Returns the projection matrix (of size `(d, p)`). - Each column of the projection matrix corresponds to a principal component. - The principal components are arranged in descending order of - the corresponding variances. + BUG: Does not have a projection class. It would also be cool to see the whitened +matrix in fitted_params, to show how the covariance is the identity # Report @@ -636,7 +634,7 @@ The fields of `report(mach)` are: - `indim`: Dimensions of the provided data. - `outdim`: Dimensions of the transformed result. -- `principalvars`: The variance of the principal components. +- `mean`: The mean vector. # Examples @@ -644,15 +642,11 @@ The fields of `report(mach)` are: using MLJ using LinearAlgebra -KPCA = @load KernelPCA pkg=MultivariateStats +ICA = @load ICA pkg=MultivariateStats X, y = @load_iris -function rbf_kernel(length_scale) - return (x,y) -> norm(x-y)^2 / ((2 * length_scale)^2) -end - -model = KPCA(maxoutdim=2, kernel = rbf_kernel(1)) +model = ICA(k = 2, tol=0.1) mach = machine(model, X) |> fit! projection = transform(mach, X) @@ -662,6 +656,88 @@ See also TODO: ADD REFERENCES """ ICA +""" +$(MMI.doc_header(LDA)) + +`LDA`: Multiclass linear discriminant analysis. The algorithm learns a +projection matrix `P` that projects a feature matrix `Xtrain` onto a lower dimensional +space of dimension `out_dim` such that the trace of the transformed between-class +scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the transformed +within-class scatter matrix (`Pᵀ*Sw*P`).The projection matrix is scaled such that +`Pᵀ*Sw*P=I` or `Pᵀ*Σw*P=I`(where `Σw` is the within-class covariance matrix) . 
+Predicted class posterior probability for feature matrix `Xtest` are derived by +applying a softmax transformationto a matrix `Pr`, such that rowᵢ of `Pr` contains +computed distances(based on a distance metric) in the transformed space of rowᵢ in +`Xtest` to the centroid of each class. + +# Training data + +In MLJ or MLJBase, bind an instance `model` to data with + mach = machine(model, X) + +Where + +- `X`: is any table of input features (eg, a `DataFrame`) whose columns + are of scitype `Continuous`; check the scitype with `schema(X)` + +# Hyper-parameters + +- `method::Symbol=:gevd`: The solver, one of `:gevd` or `:whiten` methods. +- `cov_w::CovarianceEstimator`=SimpleCovariance: An estimator for the within-class + covariance (used in computing within-class scatter matrix, Sw), by default set + to the standard `MultivariateStats.SimpleCovariance()` but + could be set to any robust estimator from `CovarianceEstimation.jl`.. +- `cov_b::CovarianceEstimator`=SimpleCovariance: The same as `cov_w` but for the between-class + covariance (used in computing between-class scatter matrix, Sb). +- `out_dim::Int=0`: The output dimension, i.e dimension of the transformed space, + automatically set if 0 is given (default). +- `regcoef::Float64=1e-6`: The regularization coefficient (default value 1e-6). A positive + value `regcoef * eigmax(Sw)` where `Sw` is the within-class scatter matrix, is added + to the diagonal of Sw to improve numerical stability. This can be useful if using + the standard covariance estimator. +- `dist::SemiMetric=SqEuclidean`: The distance metric to use when performing classification + (to compare the distance between a new point and centroids in the transformed space), + an alternative choice can be the `CosineDist`.Defaults to `SqEuclidean`. + +# Operations + +- `transform(mach, Xnew)`: Return lower dimensional projection of the target given new + features `Xnew` having the same Scitype as `X` above. + +# Fitted parameters + +The fields of `fitted_params(mach)` are: + + BUG: Does not have a projection class. It would also be cool to see the whitened +matrix in fitted_params, to show how the covariance is the identity + +# Report + +The fields of `report(mach)` are: + +- `indim`: Dimensions of the provided data. +- `outdim`: Dimensions of the transformed result. +- `mean`: The mean vector. + +# Examples + +``` +using MLJ +using LinearAlgebra + +LA = @load LDA pkg=MultivariateStats + +X, y = @load_iris + +model = ICA(k = 2, tol=0.1) +mach = machine(model, X) |> fit! 
+ +projection = transform(mach, X) +``` + +See also +TODO: ADD REFERENCES +""" LDA BayesianLDA SubspaceLDA From 49b127381cc537f6dd36b0d61782277f68e7eb3e Mon Sep 17 00:00:00 2001 From: josephsdavid Date: Mon, 13 Jun 2022 16:34:29 -0500 Subject: [PATCH 13/22] done --- src/MLJMultivariateStatsInterface.jl | 475 ++++++++++++++++++++++++++- 1 file changed, 465 insertions(+), 10 deletions(-) diff --git a/src/MLJMultivariateStatsInterface.jl b/src/MLJMultivariateStatsInterface.jl index 0a227e3..07faa2f 100644 --- a/src/MLJMultivariateStatsInterface.jl +++ b/src/MLJMultivariateStatsInterface.jl @@ -679,6 +679,9 @@ Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns are of scitype `Continuous`; check the scitype with `schema(X)` +- `y`: is the target, which can be any `AbstractVector` whose element + scitype is `<:OrderedFactor(2)` or `<:Multiclass(2)`; check the scitype + with `schema(y)` # Hyper-parameters @@ -686,7 +689,7 @@ Where - `cov_w::CovarianceEstimator`=SimpleCovariance: An estimator for the within-class covariance (used in computing within-class scatter matrix, Sw), by default set to the standard `MultivariateStats.SimpleCovariance()` but - could be set to any robust estimator from `CovarianceEstimation.jl`.. + could be set to any robust estimator from `CovarianceEstimation.jl`. - `cov_b::CovarianceEstimator`=SimpleCovariance: The same as `cov_w` but for the between-class covariance (used in computing between-class scatter matrix, Sb). - `out_dim::Int=0`: The output dimension, i.e dimension of the transformed space, @@ -702,46 +705,498 @@ Where # Operations - `transform(mach, Xnew)`: Return lower dimensional projection of the target given new - features `Xnew` having the same Scitype as `X` above. + features `Xnew` having Scitype as `X` above. +- `predict(mach, Xnew)`: Return predictions of the target given + features `Xnew` having the same scitype as `X` above. Predictions + are probabilistic. +- `predict_mode(mach, Xnew)`: Return the modes of the probabilistic predictions + returned above. + # Fitted parameters The fields of `fitted_params(mach)` are: - BUG: Does not have a projection class. It would also be cool to see the whitened -matrix in fitted_params, to show how the covariance is the identity +- `projected_class_means`: The matrix comprised of class-specific means as + columns (of size `(d,m)`), where d corresponds to input features and m corresponds to class. +- `projection_matrix`: The matrix used to project `X` into a lower dimensional space. # Report The fields of `report(mach)` are: -- `indim`: Dimensions of the provided data. -- `outdim`: Dimensions of the transformed result. +- `classes`: The classes seen during model fitting. +- `out_dim`: The dimensions the model is projected to. +- `class_means`: The matrix comprised of class-specific means as + columns (of size `(d,m)`), where d corresponds to input features and m corresponds to class. - `mean`: The mean vector. +- `class_weights`: The weights of each class. +- `Sb`: The between class scatter matrix. +- `Sw`: The within class scatter matrix. +- `nc`: The number of classes. # Examples ``` using MLJ -using LinearAlgebra -LA = @load LDA pkg=MultivariateStats +LDA = @load LDA pkg=MultivariateStats X, y = @load_iris -model = ICA(k = 2, tol=0.1) -mach = machine(model, X) |> fit! +model = LDA() +mach = machine(model, X, y) |> fit! 
projection = transform(mach, X) +y_hat = predict(mach, x) +labels = predict_mode(mach, x) ``` See also TODO: ADD REFERENCES """ LDA +""" +$(MMI.doc_header(BayesianLDA)) + +`BayesianLDA`: Bayesian Multiclass linear discriminant analysis. The algorithm +learns a projection matrix `P` that projects a feature matrix `Xtrain` onto a lower +dimensional space of dimension `out_dim` such that the trace of the transformed +between-class scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the +transformed within-class scatter matrix (`Pᵀ*Sw*P`). The projection matrix is scaled +such that `Pᵀ*Sw*P = n` or `Pᵀ*Σw*P=I` (Where `n` is the number of training samples +and `Σw` is the within-class covariance matrix). +Predicted class posterior probability distibution are derived by applying Bayes rule +with a multivariate Gaussian class-conditional distribution. + +See also the [package documentation]( +https://multivariatestatsjl.readthedocs.io/en/latest/lda.html). +For more information about the algorithm, see the paper by Li, Zhu and Ogihara, +[Using Discriminant Analysis for Multi-class Classification: An Experimental Investigation]( +http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.89.7068&rep=rep1&type=pdf). + +# Training data + +In MLJ or MLJBase, bind an instance `model` to data with + mach = machine(model, X) + +Where + +- `X`: is any table of input features (eg, a `DataFrame`) whose columns + are of scitype `Continuous`; check the scitype with `schema(X)` +- `y`: is the target, which can be any `AbstractVector` whose element + scitype is `<:OrderedFactor(2)` or `<:Multiclass(2)`; check the scitype + with `schema(y)` + +# Hyper-parameters + +- `method::Symbol=:gevd`: choice of solver, one of `:gevd` or `:whiten` methods +- `cov_w::CovarianceEstimator`=SimpleCovariance: An estimator for the within-class + covariance (used in computing within-class scatter matrix, Sw), by default set + to the standard `MultivariateStats.SimpleCovariance()` but + could be set to any robust estimator from `CovarianceEstimation.jl`. +- `cov_b::CovarianceEstimator`=SimpleCovariance: The same as `cov_w` but for the between-class + covariance (used in computing between-class scatter matrix, Sb). +- `out_dim::Int=0`: The output dimension, i.e dimension of the transformed space, + automatically set if 0 is given (default). +- `regcoef::Float64=1e-6`: The regularization coefficient (default value 1e-6). A positive +value `regcoef * eigmax(Sw)` where `Sw` is the within-class covariance estimator, is added + to the diagonal of Sw to improve numerical stability. This can be useful if using the + standard covariance estimator. +- `priors::Union{Nothing, Vector{Float64}}=nothing`: For use in prediction with Baye's rule. If `priors = nothing` then + `priors` are estimated from the class proportions in the training data. Otherwise it + requires a `Vector` containing class probabilities with probabilities specified using + the order given by `levels(y)` where y is the target vector. + + +# Operations + +- `transform(mach, Xnew)`: Return lower dimensional projection of the target given new + features `Xnew` having Scitype as `X` above. +- `predict(mach, Xnew)`: Return predictions of the target given + features `Xnew` having the same scitype as `X` above. Predictions + are probabilistic. +- `predict_mode(mach, Xnew)`: Return the modes of the probabilistic predictions + returned above. 
+ + +# Fitted parameters + +The fields of `fitted_params(mach)` are: + +- `projected_class_means`: The matrix comprised of class-specific means as + columns (of size `(d,m)`), where d corresponds to input features and m corresponds to class. +- `projection_matrix`: The matrix used to project `X` into a lower dimensional space. +- `priors`: The estimated class priors. + +# Report + +The fields of `report(mach)` are: + +- `classes`: The classes seen during model fitting. +- `out_dim`: The dimensions the model is projected to. +- `class_means`: The matrix comprised of class-specific means as + columns (of size `(d,m)`), where d corresponds to input features and m corresponds to class. +- `mean`: The mean vector. +- `class_weights`: The weights of each class. +- `Sb`: The between class scatter matrix. +- `Sw`: The within class scatter matrix. +- `nc`: The number of classes. + +# Examples + +``` +using MLJ + +BLDA = @load BayesianLDA pkg=MultivariateStats + +X, y = @load_iris + +model = BLDA() +mach = machine(model, X, y) |> fit! + +projection = transform(mach, X) +y_hat = predict(mach, x) +labels = predict_mode(mach, x) +``` + +See also +TODO: ADD REFERENCES +""" BayesianLDA +""" +$(MMI.doc_header(SubspaceLDA)) + +`SubspaceLDA`: Multiclass linear discriminant analysis. Suitable for high +dimensional data (Avoids computing scatter matrices `Sw` ,`Sb`). The algorithm learns a +projection matrix `P = W*L` that projects a feature matrix `Xtrain` onto a lower +dimensional space of dimension `nc - 1` such that the trace of the transformed +between-class scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the +transformed within-class scatter matrix (`Pᵀ*Sw*P`). The projection matrix is scaled +such that `Pᵀ*Sw*P = mult*I` or `Pᵀ*Σw*P=mult/(n-nc)*I` (where `n` is the number of +training samples, mult` is one of `n` or `1` depending on whether `Sb` is normalized, +`Σw` is the within-class covariance matrix, and `nc` is the number of unique classes +in `y`) and also obeys `Wᵀ*Sb*p = λ*Wᵀ*Sw*p`, for every column `p` in `P`. +Predicted class posterior probability for feature matrix `Xtest` are derived by +applying a softmax transformation to a matrix `Pr`, such that rowᵢ of `Pr` contains +computed distances(based on a distance metric) in the transformed space of rowᵢ in +`Xtest` to the centroid of each class. + +# Training data + +In MLJ or MLJBase, bind an instance `model` to data with + mach = machine(model, X) + +Where + +- `X`: is any table of input features (eg, a `DataFrame`) whose columns + are of scitype `Continuous`; check the scitype with `schema(X)` +- `y`: is the target, which can be any `AbstractVector` whose element + scitype is `<:OrderedFactor(2)` or `<:Multiclass(2)`; check the scitype + with `schema(y)` + +# Hyper-parameters + +- `normalize=true`: Option to normalize the between class variance for the number of + observations in each class, one of `true` or `false`. +- `out_dim`: The dimension of the transformed space to be used by `predict` and + `transform` methods, automatically set if 0 is given (default). +- `dist=SqEuclidean`: The distance metric to use when performing classification + (to compare the distance between a new point and centroids in the transformed space), + an alternative choice can be the `CosineDist`. + + +# Operations + +- `transform(mach, Xnew)`: Return lower dimensional projection of the target given new + features `Xnew` having Scitype as `X` above. 
+- `predict(mach, Xnew)`: Return predictions of the target given + features `Xnew` having the same scitype as `X` above. Predictions + are probabilistic. +- `predict_mode(mach, Xnew)`: Return the modes of the probabilistic predictions + returned above. + + +# Fitted parameters + +The fields of `fitted_params(mach)` are: + +BUG: RENAME +- `class_means`: The matrix comprised of class-specific means as + columns (of size `(d,m)`), where d corresponds to input features and m corresponds to class. +- `projection_matrix`: The matrix used to project `X` into a lower dimensional space. + +# Report + +The fields of `report(mach)` are: + +- `explained_variance_ratio`: The ratio of explained variance to total variance. Each dimension corresponds to an eigenvalue. +- `classes`: The classes seen during model fitting. +- `class_means`: The matrix comprised of class-specific means as + columns (of size `(d,m)`), where d corresponds to input features and m corresponds to class. +- `mean`: The mean vector. +- `class_weights`: The weights of each class. +- `nc`: The number of classes. + +# Examples + +``` +using MLJ + +sLDA = @load SubspaceLDA pkg=MultivariateStats + +X, y = @load_iris + +model = sLDA() +mach = machine(model, X, y) |> fit! + +projection = transform(mach, X) +y_hat = predict(mach, X) +labels = predict_mode(mach, X) +``` + +See also +TODO: ADD REFERENCES +""" SubspaceLDA +""" +$(MMI.doc_header(BayesianSubspaceLDA)) + + +`SubspaceLDA`: Bayesian Multiclass linear discriminant analysis. Suitable for high dimensional data +(Avoids computing scatter matrices `Sw` ,`Sb`). The algorithm learns a projection +matrix `P = W*L` (`Sw`), that projects a feature matrix `Xtrain` onto a lower +dimensional space of dimension `nc-1` such that the trace of the transformed +between-class scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the +transformed within-class scatter matrix (`Pᵀ*Sw*P`). The projection matrix is scaled +such that `Pᵀ*Sw*P = mult*I` or `Pᵀ*Σw*P=mult/(n-nc)*I` (where `n` is the number of +training samples, `mult` is one of `n` or `1` depending on whether `Sb` is normalized, +`Σw` is the within-class covariance matrix, and `nc` is the number of unique classes in +`y`) and also obeys `Wᵀ*Sb*p = λ*Wᵀ*Sw*p`, for every column `p` in `P`. +Posterior class probability distibution are derived by applying Bayes rule with a +multivariate Gaussian class-conditional distribution + +# Training data + +In MLJ or MLJBase, bind an instance `model` to data with + mach = machine(model, X) + +Where + +- `X`: is any table of input features (eg, a `DataFrame`) whose columns + are of scitype `Continuous`; check the scitype with `schema(X)` +- `y`: is the target, which can be any `AbstractVector` whose element + scitype is `<:OrderedFactor(2)` or `<:Multiclass(2)`; check the scitype + with `schema(y)` + +# Hyper-parameters + +- `normalize=true`: Option to normalize the between class variance for the number of + observations in each class, one of `true` or `false`. +- `out_dim`: The dimension of the transformed space to be used by `predict` and + `transform` methods, automatically set if 0 is given (default). +- `priors::Union{Nothing, Vector{Float64}}=nothing`: For use in prediction with Baye's + rule. If `priors = nothing` then `priors` are estimated from the class proportions + in the training data. Otherwise it requires a `Vector` containing class + probabilities with probabilities specified using the order given by `levels(y)` + where y is the target vector. 
+ + +# Operations + +- `transform(mach, Xnew)`: Return lower dimensional projection of the target given new + features `Xnew` having Scitype as `X` above. +- `predict(mach, Xnew)`: Return predictions of the target given + features `Xnew` having the same scitype as `X` above. Predictions + are probabilistic. +- `predict_mode(mach, Xnew)`: Return the modes of the probabilistic predictions + returned above. + + +# Fitted parameters + +The fields of `fitted_params(mach)` are: + +- `projected_class_means`: The matrix comprised of class-specific means as + columns (of size `(d,m)`), where d corresponds to input features and m corresponds to class. +- `projection_matrix`: The matrix used to project `X` into a lower dimensional space. +- `priors`: The estimated class priors. + +# Report + +The fields of `report(mach)` are: + +- `explained_variance_ratio`: The ratio of explained variance to total variance. Each dimension corresponds to an eigenvalue. +- `classes`: The classes seen during model fitting. +- `class_means`: The matrix comprised of class-specific means as + columns (of size `(d,m)`), where d corresponds to input features and m corresponds to class. +- `mean`: The mean vector. +- `class_weights`: The weights of each class. +- `nc`: The number of classes. + +# Examples + +``` +using MLJ + +bsLDA = @load BayesianSubspaceLDA pkg=MultivariateStats + +X, y = @load_iris + +model = bsLDA() +mach = machine(model, X, y) |> fit! + +projection = transform(mach, X) +y_hat = predict(mach, X) +labels = predict_mode(mach, X) +``` + +See also +TODO: ADD REFERENCES +""" BayesianSubspaceLDA +""" +$(MMI.doc_header(FactorAnalysis)) + +`FactorAnalysis`(FA) is a linear-Gaussian latent variable model that is +closely related to probabilistic PCA. In contrast to the probabilistic PCA model, +the covariance of conditional distribution of the observed variable given the latent variable is diagonal rather than isotropic + +# Training data + +In MLJ or MLJBase, bind an instance `model` to data with + mach = machine(model, X) + +Where + +- `X`: is any table of input features (eg, a `DataFrame`) whose columns + are of scitype `Continuous`; check the scitype with `schema(X)` + +# Hyper-parameters + +- `method::Symbol=:cm`: Method to use to solve the problem, one of `:ml`, `:em`, `:bayes`. +- `maxoutdim::Int=0`: Maximum number of output dimensions, uses max(no_of_features - 1, 1) + if 0 (default). +- `maxiter::Int=1000`: Maximum number of iterations. +- `tol::Real=1e-6`: Convergence tolerance. +- `eta::Real=tol`: Variance lower bound. +- `mean::Union{Nothing, Real, Vector{Float64}}=nothing`: If set to nothing(default) + centering will be computed and applied, if set to `0` no + centering(assumed pre-centered), if a vector is passed, the centering is done with + that vector. + +# Operations + +- `transform(mach, Xnew)`: Return predictions of the target given new + features `Xnew` having the same Scitype as `X` above. + +# Fitted parameters + +The fields of `fitted_params(mach)` are: + +- `projection`: Returns the projection matrix (of size `(d, m)`). + Each column of the projection matrix corresponds to a factor. + +# Report + +The fields of `report(mach)` are: + +- `indim`: Dimensions of the provided data. +- `outdim`: Dimensions of the transformed result. +- `variance`: The variance of the factors. +- `covariance_matrix`: The estimated covariance matrix. +- `mean`: The mean vector. +- `loadings`: The factor loadings. 
+ +# Examples + +``` +using MLJ + +FA = @load FactorAnalysis pkg=MultivariateStats + +X, y = @load_iris + +model = FA(maxoutdim=2) +mach = machine(model, X) |> fit! + +projection = transform(mach, X) +``` + +See also +TODO: ADD REFERENCES +""" FactorAnalysis +""" +$(MMI.doc_header(PPCA)) + +`PPCA`(Probabilistic principal component analysis) represents a constrained +form of the Gaussian distribution in which the number of free parameters can be +restricted while still allowing the model to capture the dominant correlations +in a data set. It is expressed as the maximum likelihood solution of a probabilistic +latent variable mode. + +# Training data + +In MLJ or MLJBase, bind an instance `model` to data with + mach = machine(model, X) + +Where + +- `X`: is any table of input features (eg, a `DataFrame`) whose columns + are of scitype `Continuous`; check the scitype with `schema(X)` + +# Hyper-parameters + +- `maxoutdim::Int=0`: The maximum number of output dimensions, uses max(no_of_features - 1, 1) + if 0 (default). +- `method::Symbol=:ml`: The method to use to solve the problem, one of `:ml`, `:em`, `:bayes`. +- `maxiter::Int=1000`: The maximum number of iterations. +- `tol::Real=1e-6`: The convergence tolerance. +- `mean::Union{Nothing, Real, Vector{Float64}}=nothing`: If set to nothing(default) + centering will be computed and applied, if set to `0` no + centering(assumed pre-centered), if a vector is passed, the centering is done with + that vector. + +# Operations + +- `transform(mach, Xnew)`: Return predictions of the target given new + features `Xnew` having the same Scitype as `X` above. + +# Fitted parameters + +The fields of `fitted_params(mach)` are: + +- `projection`: Returns the projection matrix (of size `(d, m)`). + Each column of the projection matrix corresponds to a principal component. + +# Report + +The fields of `report(mach)` are: + +- `indim`: Dimensions of the provided data. +- `outdim`: Dimensions of the transformed result. +- `tvat`: The variance of the components. +- `loadings`: The models loadings, weights for each variable used when calculating + principal components. + +# Examples + +``` +using MLJ + +PPCA = @load PPCA pkg=MultivariateStats + +X, y = @load_iris + +model = PPCA(maxoutdim=2) +mach = machine(model, X) |> fit! + +projection = transform(mach, X) +``` + +See also +TODO: ADD REFERENCES +""" PPCA end From a495da58de540d1e7e4315e28bf6f98340506387 Mon Sep 17 00:00:00 2001 From: josephsdavid Date: Mon, 13 Jun 2022 17:28:01 -0500 Subject: [PATCH 14/22] making progress --- src/MLJMultivariateStatsInterface.jl | 2 +- src/models/decomposition_models.jl | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/MLJMultivariateStatsInterface.jl b/src/MLJMultivariateStatsInterface.jl index 07faa2f..f50592e 100644 --- a/src/MLJMultivariateStatsInterface.jl +++ b/src/MLJMultivariateStatsInterface.jl @@ -921,7 +921,7 @@ Where The fields of `fitted_params(mach)` are: -BUG: RENAME +BUG: RENAME, Make note on top of PR for inconsistency - `class_means`: The matrix comprised of class-specific means as columns (of size `(d,m)`), where d corresponds to input features and m corresponds to class. - `projection_matrix`: The matrix used to project `X` into a lower dimensional space. 
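For context on the `BUG: RENAME` note above: as currently documented in this PR, `SubspaceLDA` reports its class means under `class_means`, while the other discriminant models use `projected_class_means`. A hedged sketch of how the inconsistency would surface (field names are taken from the docstrings in this patch series and may change once renamed):

```
using MLJ

LDA = @load LDA pkg=MultivariateStats
SubspaceLDA = @load SubspaceLDA pkg=MultivariateStats

X, y = @load_iris

keys(fitted_params(machine(LDA(), X, y) |> fit!))          # includes :projected_class_means
keys(fitted_params(machine(SubspaceLDA(), X, y) |> fit!))  # documented above as :class_means
```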
diff --git a/src/models/decomposition_models.jl b/src/models/decomposition_models.jl index 584e0e2..4008d29 100644 --- a/src/models/decomposition_models.jl +++ b/src/models/decomposition_models.jl @@ -43,8 +43,7 @@ function MMI.fit(model::PCA, verbosity::Int, X) ) cache = nothing report = ( - # TODO: Make PR to MultivariateStats - indim=MS.size(fitresult)[1], + indim=MS.size(fitresult)[1] outdim=MS.size(fitresult)[2], tprincipalvar=MS.tprincipalvar(fitresult), tresidualvar=MS.tresidualvar(fitresult), From ac1152df9e865da42f2d06a1f6b6213b8ebbc8c4 Mon Sep 17 00:00:00 2001 From: josephsdavid Date: Mon, 20 Jun 2022 16:04:31 -0500 Subject: [PATCH 15/22] code review polished --- src/MLJMultivariateStatsInterface.jl | 43 ++++++++++++++-------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/src/MLJMultivariateStatsInterface.jl b/src/MLJMultivariateStatsInterface.jl index f50592e..e4e218f 100644 --- a/src/MLJMultivariateStatsInterface.jl +++ b/src/MLJMultivariateStatsInterface.jl @@ -170,7 +170,7 @@ Where are of scitype `Continuous`; check the scitype with `schema(X)` - `y`: is the target, which can be any `AbstractVector` whose element - scitype is `Continuous`; check the scitype with `schema(y)` + scitype is `Continuous`; check the scitype with `scitype(y)` # Hyper-parameters @@ -196,7 +196,6 @@ using MLJ LinearRegressor = @load LinearRegressor pkg=MultivariateStats linear_regressor = LinearRegressor() - X, y = make_regression(100, 2) # synthetic data mach = machine(linear_regressor, X, y) |> fit! @@ -205,7 +204,7 @@ yhat = predict(mach, Xnew) # new predictions ``` See also -TODO: ADD REFERENCES +[`MultitargetLinearRegressor`](@ref), [`RidgeRegressor`](@ref), [`MultitargetRidgeRegressor`](@ref) """ LinearRegressor @@ -230,7 +229,7 @@ Where are of scitype `Continuous`; check the scitype with `schema(X)` - `y`: is the target, which can be any table of responses whose element - scitype is `Continuous`; check the scitype with `schema(y)` + scitype is `Continuous`; check the scitype with `scitype(y)` # Hyper-parameters @@ -270,7 +269,7 @@ yhat = predict(mach, Xnew) # new predictions ``` See also -TODO: ADD REFERENCES +[`LinearRegressor`](@ref), [`RidgeRegressor`](@ref), [`MultitargetRidgeRegressor`](@ref) """ MultitargetLinearRegressor @@ -293,7 +292,7 @@ Where are of scitype `Continuous`; check the scitype with `schema(X)` - `y`: is the target, which can be any `AbstractVector` whose element - scitype is `Continuous`; check the scitype with `schema(y)` + scitype is `Continuous`; check the scitype with `scitype(y)` # Hyper-parameters @@ -345,7 +344,7 @@ yhat = predict(mach, Xnew) # new predictions ``` See also -TODO: ADD REFERENCES +[`LinearRegressor`](@ref), [`MultitargetLinearRegressor`](@ref), [`MultitargetRidgeRegressor`](@ref) """ RidgeRegressor @@ -368,7 +367,7 @@ Where are of scitype `Continuous`; check the scitype with `schema(X)` - `y`: is the target, which can be any table of responses whose element - scitype is `Continuous`; check the scitype with `schema(y)` + scitype is `Continuous`; check the scitype with `scitype(y)` # Hyper-parameters @@ -425,7 +424,7 @@ yhat = predict(mach, Xnew) # new predictions ``` See also -TODO: ADD REFERENCES +[`LinearRegressor`](@ref), [`MultitargetLinearRegressor`](@ref), [`RidgeRegressor`](@ref) """ MultitargetRidgeRegressor @@ -503,7 +502,7 @@ projection = transform(mach, X) ``` See also -TODO: ADD REFERENCES +[`KernelPCA`](@ref), [`ICA`](@ref), [`FactorAnalysis`](@ref), [`PPCA`](@ref) """ PCA """ @@ -580,7 +579,7 @@ projection = 
transform(mach, X) ``` See also -TODO: ADD REFERENCES +[`PCA`](@ref), [`ICA`](@ref), [`FactorAnalysis`](@ref), [`PPCA`](@ref) """ KernelPCA """ @@ -653,7 +652,7 @@ projection = transform(mach, X) ``` See also -TODO: ADD REFERENCES +[`PCA`](@ref), [`KernelPCA`](@ref), [`FactorAnalysis`](@ref), [`PPCA`](@ref) """ ICA """ @@ -681,7 +680,7 @@ Where are of scitype `Continuous`; check the scitype with `schema(X)` - `y`: is the target, which can be any `AbstractVector` whose element scitype is `<:OrderedFactor(2)` or `<:Multiclass(2)`; check the scitype - with `schema(y)` + with `scitype(y)` # Hyper-parameters @@ -753,7 +752,7 @@ labels = predict_mode(mach, x) ``` See also -TODO: ADD REFERENCES +[`BayesianLDA`](@ref), [`SubspaceLDA`](@ref), [`BayesianSubspaceLDA`](@ref) """ LDA """ @@ -786,7 +785,7 @@ Where are of scitype `Continuous`; check the scitype with `schema(X)` - `y`: is the target, which can be any `AbstractVector` whose element scitype is `<:OrderedFactor(2)` or `<:Multiclass(2)`; check the scitype - with `schema(y)` + with `scitype(y)` # Hyper-parameters @@ -861,7 +860,7 @@ labels = predict_mode(mach, x) ``` See also -TODO: ADD REFERENCES +[`LDA`](@ref), [`SubspaceLDA`](@ref), [`BayesianSubspaceLDA`](@ref) """ BayesianLDA """ @@ -893,7 +892,7 @@ Where are of scitype `Continuous`; check the scitype with `schema(X)` - `y`: is the target, which can be any `AbstractVector` whose element scitype is `<:OrderedFactor(2)` or `<:Multiclass(2)`; check the scitype - with `schema(y)` + with `scitype(y)` # Hyper-parameters @@ -956,7 +955,7 @@ labels = predict_mode(mach, X) ``` See also -TODO: ADD REFERENCES +[`LDA`](@ref), [`BayesianLDA`](@ref), [`BayesianSubspaceLDA`](@ref) """ SubspaceLDA """ @@ -987,7 +986,7 @@ Where are of scitype `Continuous`; check the scitype with `schema(X)` - `y`: is the target, which can be any `AbstractVector` whose element scitype is `<:OrderedFactor(2)` or `<:Multiclass(2)`; check the scitype - with `schema(y)` + with `scitype(y)` # Hyper-parameters @@ -1052,7 +1051,7 @@ labels = predict_mode(mach, X) ``` See also -TODO: ADD REFERENCES +[`LDA`](@ref), [`BayesianLDA`](@ref), [`SubspaceLDA`](@ref) """ BayesianSubspaceLDA """ @@ -1124,7 +1123,7 @@ projection = transform(mach, X) ``` See also -TODO: ADD REFERENCES +[`KernelPCA`](@ref), [`ICA`](@ref), [`PPCA`](@ref), [`PCA`](@ref) """ FactorAnalysis """ @@ -1196,7 +1195,7 @@ projection = transform(mach, X) ``` See also -TODO: ADD REFERENCES +[`KernelPCA`](@ref), [`ICA`](@ref), [`FactorAnalysis`](@ref), [`PCA`](@ref) """ PPCA end From 3cb5583d9c2b677dfcf010e102fcaa1660020154 Mon Sep 17 00:00:00 2001 From: josephsdavid Date: Wed, 6 Jul 2022 16:24:05 -0500 Subject: [PATCH 16/22] internet access at last! --- src/MLJMultivariateStatsInterface.jl | 132 ++++++++++++++++----------- src/models/decomposition_models.jl | 4 +- 2 files changed, 83 insertions(+), 53 deletions(-) diff --git a/src/MLJMultivariateStatsInterface.jl b/src/MLJMultivariateStatsInterface.jl index e4e218f..e124f23 100644 --- a/src/MLJMultivariateStatsInterface.jl +++ b/src/MLJMultivariateStatsInterface.jl @@ -153,25 +153,26 @@ metadata_pkg.( """ $(MMI.doc_header(LinearRegressor)) -`LinearRegressor` assumes the target is a continuous variable -whose conditional distribution is normal with constant variance, and whose -expected value is a linear combination of the features. Linear coefficients -are calculated using least squares. -Options exist to specify a bias term. 
+`LinearRegressor` assumes the target is a continuous variable and trains a linear +prediction function using the least squares algorithm. Options exist to specify +a bias term.` # Training data In MLJ or MLJBase, bind an instance `model` to data with + mach = machine(model, X, y) Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns - are of scitype `Continuous`; check the scitype with `schema(X)` + are of scitype `Continuous`; check the scitypes with `schema(X)` - `y`: is the target, which can be any `AbstractVector` whose element scitype is `Continuous`; check the scitype with `scitype(y)` +Train the machine using `fit!(mach, rows=...)`. + # Hyper-parameters - `bias=true`: Include the bias term if true, otherwise fit without bias term. @@ -211,26 +212,26 @@ LinearRegressor """ $(MMI.doc_header(MultitargetLinearRegressor)) -`MultitargetLinearRegressor` assumes the target is a continuous variable -whose conditional distribution is normal with constant variance, and whose -expected value is a linear combination of the features. Linear coefficients -are calculated using least squares. In this case, the output represents a -response vector. -Options exist to specify a bias term. +`MultitargetLinearRegressor` assumes the target variable is vector-valued with +continuous components. It trains a linear prediction function using the +least squares algorithm. Options exist to specify a bias term. # Training data In MLJ or MLJBase, bind an instance `model` to data with + mach = machine(model, X, y) Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns - are of scitype `Continuous`; check the scitype with `schema(X)` + are of scitype `Continuous`; check the scitypes with `schema(X)` - `y`: is the target, which can be any table of responses whose element scitype is `Continuous`; check the scitype with `scitype(y)` +Train the machine using `fit!(mach, rows=...)`. + # Hyper-parameters - `bias=true`: Include the bias term if true, otherwise fit without bias term. @@ -238,7 +239,7 @@ Where # Operations - `predict(mach, Xnew)`: Return predictions of the target given new - features `Xnew` having the same Scitype as `X` above. + features `Xnew` having the same scitype as `X` above. # Fitted parameters @@ -284,16 +285,19 @@ Options exist to specify a bias term, and to adjust the strength of the penalty # Training data In MLJ or MLJBase, bind an instance `model` to data with + mach = machine(model, X, y) Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns - are of scitype `Continuous`; check the scitype with `schema(X)` + are of scitype `Continuous`; check the scitypes with `schema(X)` - `y`: is the target, which can be any `AbstractVector` whose element scitype is `Continuous`; check the scitype with `scitype(y)` +Train the machine using `fit!(mach, rows=...)`. + # Hyper-parameters - `lambda=1.0`: Is the non-negative parameter for the @@ -306,7 +310,7 @@ Where # Operations - `predict(mach, Xnew)`: Return predictions of the target given new - features `Xnew` having the same Scitype as `X` above. + features `Xnew` having the same scitype as `X` above. 
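To make the effect of `lambda` concrete, a small sketch along the lines of the other regressor examples (synthetic data from `make_regression`; the `coefficients` field name below is an assumption about this interface's fitted parameters, so treat it as illustrative):

```
using MLJ
using LinearAlgebra: norm

RidgeRegressor = @load RidgeRegressor pkg=MultivariateStats

X, y = make_regression(100, 5)  # synthetic data

for lambda in (0.0, 1.0, 100.0)
    mach = machine(RidgeRegressor(lambda=lambda), X, y) |> fit!
    # larger penalties shrink the coefficient vector toward zero
    println(lambda, " => ", norm(fitted_params(mach).coefficients))
end
```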
# Fitted parameters @@ -359,16 +363,19 @@ Options exist to specify a bias term, and to adjust the strength of the penalty # Training data In MLJ or MLJBase, bind an instance `model` to data with + mach = machine(model, X, y) Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns - are of scitype `Continuous`; check the scitype with `schema(X)` + are of scitype `Continuous`; check the scitypes with `schema(X)` - `y`: is the target, which can be any table of responses whose element scitype is `Continuous`; check the scitype with `scitype(y)` +Train the machine using `fit!(mach, rows=...)`. + # Hyper-parameters - `lambda=1.0`: Is the non-negative parameter for the @@ -381,7 +388,7 @@ Where # Operations - `predict(mach, Xnew)`: Return predictions of the target given new - features `Xnew` having the same Scitype as `X` above. + features `Xnew` having the same scitype as `X` above. # Fitted parameters @@ -438,12 +445,15 @@ variance. # Training data In MLJ or MLJBase, bind an instance `model` to data with + mach = machine(model, X) Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns - are of scitype `Continuous`; check the scitype with `schema(X)` + are of scitype `Continuous`; check the scitypes with `schema(X)` + +Train the machine using `fit!(mach, rows=...)`. # Hyper-parameters @@ -463,16 +473,14 @@ Where # Operations - `transform(mach, Xnew)`: Return lower dimensional projection of the target given new - features `Xnew` having the same Scitype as `X` above. + features `Xnew` having the same scitype as `X` above. # Fitted parameters The fields of `fitted_params(mach)` are: -- `projection`: Returns the projection matrix (of size `(d, p)`). - Each column of the projection matrix corresponds to a principal component. - The principal components are arranged in descending order of - the corresponding variances. +- `projection`: Returns the projection matrix, which has size `(p, p_out)`), where + `p` and `p_out` are the number of features of the input and ouput respectively. # Report @@ -514,41 +522,42 @@ operations of PCA are performed in a [reproducing Hilbert space](https://en.wiki # Training data In MLJ or MLJBase, bind an instance `model` to data with + mach = machine(model, X) Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns - are of scitype `Continuous`; check the scitype with `schema(X)` + are of scitype `Continuous`; check the scitypes with `schema(X)` + +Train the machine using `fit!(mach, rows=...)`. # Hyper-parameters -- `maxoutdim=0`: The maximum number of output dimensions. If not set, defaults to - 0, where all components are kept (e.g., the number of components/output dimensions - is equal to the size of the smallest dimension of the training matrix). +- `maxoutdim=0`: Controls the the dimension (number of columns) of the output, + `outdim`. Specifically, `outdim = min(n, indim, maxoutdim)`, where `n` is the + number of observations and `indim` the input dimension. - `kernel::Function=(x,y)->x'y`: The kernel function, takes in 2 vector arguments - x and y, returns a scalar value. Defaults to the dot product of X and Y. -- `solver::Symbol=:auto`: solver to use for the eigenvalues, one of `:eig`(default), - `:eigs`. + x and y, returns a scalar value. Defaults to the dot product of `x` and `y`. +- `solver::Symbol=:auto`: solver to use for the eigenvalues, one of `:eig`(default, uses `LinearAlgebra.eigen`), + `:eigs`(uses `Arpack.eigs`). 
- `inverse::Bool=true`: perform calculations needed for inverse transform - `beta::Real=1.0`: strength of the ridge regression that learns the inverse transform when inverse is true. -- `tol::Real=0.0`: Convergence tolerance for eigs solver. -- `maxiter::Int=300`: maximum number of iterations for eigs solver. +- `tol::Real=0.0`: Convergence tolerance for eigenvalue solver. +- `maxiter::Int=300`: maximum number of iterations for eigenvalue solver. # Operations - `transform(mach, Xnew)`: Return predictions of the target given new - features `Xnew` having the same Scitype as `X` above. + features `Xnew` having the same scitype as `X` above. # Fitted parameters The fields of `fitted_params(mach)` are: -- `projection`: Returns the projection matrix (of size `(d, p)`). - Each column of the projection matrix corresponds to a principal component. - The principal components are arranged in descending order of - the corresponding variances. +- `projection`: Returns the projection matrix, which has size `(p, p_out)`), where + `p` and `p_out` are the number of features of the input and ouput respectively. # Report @@ -592,12 +601,15 @@ non-Gaussian and independent from each other. # Training data In MLJ or MLJBase, bind an instance `model` to data with + mach = machine(model, X) Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns - are of scitype `Continuous`; check the scitype with `schema(X)` + are of scitype `Continuous`; check the scitypes with `schema(X)` + +Train the machine using `fit!(mach, rows=...)`. # Hyper-parameters @@ -618,7 +630,7 @@ Where # Operations - `transform(mach, Xnew)`: Return lower dimensional projection of the target given new - features `Xnew` having the same Scitype as `X` above. + features `Xnew` having the same scitype as `X` above. # Fitted parameters @@ -672,16 +684,19 @@ computed distances(based on a distance metric) in the transformed space of row # Training data In MLJ or MLJBase, bind an instance `model` to data with + mach = machine(model, X) Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns - are of scitype `Continuous`; check the scitype with `schema(X)` + are of scitype `Continuous`; check the scitypes with `schema(X)` - `y`: is the target, which can be any `AbstractVector` whose element scitype is `<:OrderedFactor(2)` or `<:Multiclass(2)`; check the scitype with `scitype(y)` +Train the machine using `fit!(mach, rows=...)`. + # Hyper-parameters - `method::Symbol=:gevd`: The solver, one of `:gevd` or `:whiten` methods. @@ -704,7 +719,7 @@ Where # Operations - `transform(mach, Xnew)`: Return lower dimensional projection of the target given new - features `Xnew` having Scitype as `X` above. + features `Xnew` having scitype as `X` above. - `predict(mach, Xnew)`: Return predictions of the target given features `Xnew` having the same scitype as `X` above. Predictions are probabilistic. @@ -777,16 +792,19 @@ http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.89.7068&rep=rep1&type=p # Training data In MLJ or MLJBase, bind an instance `model` to data with + mach = machine(model, X) Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns - are of scitype `Continuous`; check the scitype with `schema(X)` + are of scitype `Continuous`; check the scitypes with `schema(X)` - `y`: is the target, which can be any `AbstractVector` whose element scitype is `<:OrderedFactor(2)` or `<:Multiclass(2)`; check the scitype with `scitype(y)` +Train the machine using `fit!(mach, rows=...)`. 
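A hedged sketch of the `rows=...` idiom mentioned just above (a simple hold-out split on the iris data used elsewhere in these docstrings):

```
using MLJ

BayesianLDA = @load BayesianLDA pkg=MultivariateStats

X, y = @load_iris

mach = machine(BayesianLDA(), X, y)
fit!(mach, rows=1:120)               # train on the first 120 rows only
yhat = predict(mach, rows=121:150)   # probabilistic predictions for the held-out rows
```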
+ # Hyper-parameters - `method::Symbol=:gevd`: choice of solver, one of `:gevd` or `:whiten` methods @@ -811,7 +829,7 @@ value `regcoef * eigmax(Sw)` where `Sw` is the within-class covariance estimator # Operations - `transform(mach, Xnew)`: Return lower dimensional projection of the target given new - features `Xnew` having Scitype as `X` above. + features `Xnew` having scitype as `X` above. - `predict(mach, Xnew)`: Return predictions of the target given features `Xnew` having the same scitype as `X` above. Predictions are probabilistic. @@ -884,16 +902,19 @@ computed distances(based on a distance metric) in the transformed space of row # Training data In MLJ or MLJBase, bind an instance `model` to data with + mach = machine(model, X) Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns - are of scitype `Continuous`; check the scitype with `schema(X)` + are of scitype `Continuous`; check the scitypes with `schema(X)` - `y`: is the target, which can be any `AbstractVector` whose element scitype is `<:OrderedFactor(2)` or `<:Multiclass(2)`; check the scitype with `scitype(y)` +Train the machine using `fit!(mach, rows=...)`. + # Hyper-parameters - `normalize=true`: Option to normalize the between class variance for the number of @@ -908,7 +929,7 @@ Where # Operations - `transform(mach, Xnew)`: Return lower dimensional projection of the target given new - features `Xnew` having Scitype as `X` above. + features `Xnew` having scitype as `X` above. - `predict(mach, Xnew)`: Return predictions of the target given features `Xnew` having the same scitype as `X` above. Predictions are probabilistic. @@ -978,16 +999,19 @@ multivariate Gaussian class-conditional distribution # Training data In MLJ or MLJBase, bind an instance `model` to data with + mach = machine(model, X) Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns - are of scitype `Continuous`; check the scitype with `schema(X)` + are of scitype `Continuous`; check the scitypes with `schema(X)` - `y`: is the target, which can be any `AbstractVector` whose element scitype is `<:OrderedFactor(2)` or `<:Multiclass(2)`; check the scitype with `scitype(y)` +Train the machine using `fit!(mach, rows=...)`. + # Hyper-parameters - `normalize=true`: Option to normalize the between class variance for the number of @@ -1004,7 +1028,7 @@ Where # Operations - `transform(mach, Xnew)`: Return lower dimensional projection of the target given new - features `Xnew` having Scitype as `X` above. + features `Xnew` having scitype as `X` above. - `predict(mach, Xnew)`: Return predictions of the target given features `Xnew` having the same scitype as `X` above. Predictions are probabilistic. @@ -1064,12 +1088,15 @@ the covariance of conditional distribution of the observed variable given the la # Training data In MLJ or MLJBase, bind an instance `model` to data with + mach = machine(model, X) Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns - are of scitype `Continuous`; check the scitype with `schema(X)` + are of scitype `Continuous`; check the scitypes with `schema(X)` + +Train the machine using `fit!(mach, rows=...)`. # Hyper-parameters @@ -1087,7 +1114,7 @@ Where # Operations - `transform(mach, Xnew)`: Return predictions of the target given new - features `Xnew` having the same Scitype as `X` above. + features `Xnew` having the same scitype as `X` above. # Fitted parameters @@ -1138,12 +1165,15 @@ latent variable mode. 
# Training data In MLJ or MLJBase, bind an instance `model` to data with + mach = machine(model, X) Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns - are of scitype `Continuous`; check the scitype with `schema(X)` + are of scitype `Continuous`; check the scitypes with `schema(X)` + +Train the machine using `fit!(mach, rows=...)`. # Hyper-parameters diff --git a/src/models/decomposition_models.jl b/src/models/decomposition_models.jl index 4008d29..d763404 100644 --- a/src/models/decomposition_models.jl +++ b/src/models/decomposition_models.jl @@ -43,8 +43,8 @@ function MMI.fit(model::PCA, verbosity::Int, X) ) cache = nothing report = ( - indim=MS.size(fitresult)[1] - outdim=MS.size(fitresult)[2], + indim=MS.size(fitresult,1) + outdim=MS.size(fitresult,2), tprincipalvar=MS.tprincipalvar(fitresult), tresidualvar=MS.tresidualvar(fitresult), tvar=MS.var(fitresult), From 449abff93ba2129894a326640826bc4ed6deab0a Mon Sep 17 00:00:00 2001 From: josephsdavid Date: Wed, 6 Jul 2022 16:27:55 -0500 Subject: [PATCH 17/22] wrapping up --- src/MLJMultivariateStatsInterface.jl | 108 +++++++++++++-------------- 1 file changed, 52 insertions(+), 56 deletions(-) diff --git a/src/MLJMultivariateStatsInterface.jl b/src/MLJMultivariateStatsInterface.jl index e124f23..4acdf68 100644 --- a/src/MLJMultivariateStatsInterface.jl +++ b/src/MLJMultivariateStatsInterface.jl @@ -457,9 +457,9 @@ Train the machine using `fit!(mach, rows=...)`. # Hyper-parameters -- `maxoutdim=0`: The maximum number of output dimensions. If not set, defaults to - 0, where all components are kept (e.g., the number of components/output dimensions - is equal to the size of the smallest dimension of the training matrix) +- `maxoutdim=0`: Controls the the dimension (number of columns) of the output, + `outdim`. Specifically, `outdim = min(n, indim, maxoutdim)`, where `n` is the + number of observations and `indim` the input dimension. - `method=:auto`: The method to use to solve the problem. Choices are - `:svd`: Support Vector Decomposition of the matrix. - `:cov`: Covariance matrix decomposition. @@ -472,8 +472,7 @@ Train the machine using `fit!(mach, rows=...)`. # Operations -- `transform(mach, Xnew)`: Return lower dimensional projection of the target given new - features `Xnew` having the same scitype as `X` above. +- `transform(mach, Xnew)`: Return a lower dimentional projection of the input `Xnew` having the same scitype as `X` above. # Fitted parameters @@ -485,9 +484,11 @@ The fields of `fitted_params(mach)` are: # Report The fields of `report(mach)` are: +`outdim = min(n, indim, maxoutdim)`, where `n` is the + number of observations and `indim` the input dimension. -- `indim`: Dimensions of the provided data. -- `outdim`: Dimensions of the transformed result. +- `indim`: The input dimensions. +- `outdim`: `min(n, indim, maxoutdim)`, where `n` is the number of observations. - `tprincipalvar`: Total variance of the principal components. - `tresidualvar`: Total residual variance. - `tvar`: Total observation variance (principal + residual variance). @@ -506,7 +507,7 @@ X, y = @load_iris model = PCA(maxoutdim=2) mach = machine(model, X) |> fit! -projection = transform(mach, X) +Xproj = transform(mach, X) ``` See also @@ -549,8 +550,7 @@ Train the machine using `fit!(mach, rows=...)`. # Operations -- `transform(mach, Xnew)`: Return predictions of the target given new - features `Xnew` having the same scitype as `X` above. 
+- `transform(mach, Xnew)`: Return a lower dimentional projection of the input `Xnew` having the same scitype as `X` above. # Fitted parameters @@ -563,8 +563,8 @@ The fields of `fitted_params(mach)` are: The fields of `report(mach)` are: -- `indim`: Dimensions of the provided data. -- `outdim`: Dimensions of the transformed result. +- `indim`: The input dimensions. +- `outdim`: `min(n, indim, maxoutdim)`, where `n` is the number of observations. - `principalvars`: The variance of the principal components. # Examples @@ -584,7 +584,7 @@ end model = KPCA(maxoutdim=2, kernel = rbf_kernel(1)) mach = machine(model, X) |> fit! -projection = transform(mach, X) +Xproj = transform(mach, X) ``` See also @@ -594,9 +594,9 @@ KernelPCA """ $(MMI.doc_header(ICA)) -`ICA` is a computational technique for separating a multivariate signal into -additive subcomponents, with the assumption that the subcomponents are -non-Gaussian and independent from each other. +`ICA` (independent component analysis) is a computational technique for separating a +multivariate signal into additive subcomponents, with the assumption that the subcomponents +are non-Gaussian and independent from each other. # Training data @@ -618,34 +618,34 @@ Train the machine using `fit!(mach, rows=...)`. - `fun::Symbol=:tanh`: The approximate neg-entropy function, one of `:tanh`, `:gaus`. - `do_whiten::Bool=true`: Whether or not to perform pre-whitening. - `maxiter::Int=100`: The maximum number of iterations. -- `tol::Real=1e-6`: The convergence tolerance for change in matrix W. +- `tol::Real=1e-6`: The convergence tolerance for change in the unmixing matrix W. - `mean::Union{Nothing, Real, Vector{Float64}}=nothing`: mean to use, if nothing (default) - centering is computed and applied, if zero, no centering, a vector of means can + centering is computed and applied, if zero, no centering; otherwise a vector of means can be passed. -- `winit::Union{Nothing,Matrix{<:Real}}=nothing`: Initial guess for matrix `W` either - an empty matrix (random initilization of `W`), a matrix of size `k × k` (if `do_whiten` - is true), a matrix of size `m × k` otherwise. If unspecified i.e `nothing` an empty - `Matrix{<:Real}` is used. +- `winit::Union{Nothing,Matrix{<:Real}}=nothing`: Initial guess for the unmixing matrix + `W`: either an empty matrix (for random initilization of `W`), a matrix of size `m × k` + (if `do_whiten` is true), or a matrix of size `m × k`. Here `m` is the number + of components (columns) of the input. # Operations -- `transform(mach, Xnew)`: Return lower dimensional projection of the target given new - features `Xnew` having the same scitype as `X` above. +- `transform(mach, Xnew)`: Return the component-separated version of input + `Xnew`, which should have the same scitype as `X` above. # Fitted parameters The fields of `fitted_params(mach)` are: - BUG: Does not have a projection class. It would also be cool to see the whitened -matrix in fitted_params, to show how the covariance is the identity +# TODO: Now that this is fixed, document # Report The fields of `report(mach)` are: -- `indim`: Dimensions of the provided data. -- `outdim`: Dimensions of the transformed result. -- `mean`: The mean vector. +- `indim`: Dimension (number of columns/components) of the training + data and new data to be transformed. +- `outdim`: Dimension of transformed data (number of separated components). +- `mean`: The mean vector, which has length `indim`. 
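Since the iris example below does not really show source separation, here is an additional hedged sketch (the synthetic sources and mixing matrix are made up for illustration only):

```
using MLJ

ICA = @load ICA pkg=MultivariateStats

t = range(0, 8π, length=500)
S = [sin.(t) sign.(sin.(3 .* t))]   # two independent source signals
A = [1.0 0.6; 0.4 1.0]              # mixing matrix
X = MLJ.table(S * A')               # observed, mixed signals

mach = machine(ICA(k=2, tol=0.1), X) |> fit!
S_hat = transform(mach, X)          # recovered components (up to order and scale)
```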
# Examples @@ -660,7 +660,7 @@ X, y = @load_iris model = ICA(k = 2, tol=0.1) mach = machine(model, X) |> fit! -projection = transform(mach, X) +Xproj = transform(mach, X) ``` See also @@ -718,8 +718,7 @@ Train the machine using `fit!(mach, rows=...)`. # Operations -- `transform(mach, Xnew)`: Return lower dimensional projection of the target given new - features `Xnew` having scitype as `X` above. +- `transform(mach, Xnew)`: Return a lower dimentional projection of the input `Xnew` having the same scitype as `X` above. - `predict(mach, Xnew)`: Return predictions of the target given features `Xnew` having the same scitype as `X` above. Predictions are probabilistic. @@ -761,7 +760,7 @@ X, y = @load_iris model = LDA() mach = machine(model, X, y) |> fit! -projection = transform(mach, X) +Xproj = transform(mach, X) y_hat = predict(mach, x) labels = predict_mode(mach, x) ``` @@ -828,8 +827,7 @@ value `regcoef * eigmax(Sw)` where `Sw` is the within-class covariance estimator # Operations -- `transform(mach, Xnew)`: Return lower dimensional projection of the target given new - features `Xnew` having scitype as `X` above. +- `transform(mach, Xnew)`: Return a lower dimentional projection of the input `Xnew` having the same scitype as `X` above. - `predict(mach, Xnew)`: Return predictions of the target given features `Xnew` having the same scitype as `X` above. Predictions are probabilistic. @@ -872,7 +870,7 @@ X, y = @load_iris model = BLDA() mach = machine(model, X, y) |> fit! -projection = transform(mach, X) +Xproj = transform(mach, X) y_hat = predict(mach, x) labels = predict_mode(mach, x) ``` @@ -928,8 +926,7 @@ Train the machine using `fit!(mach, rows=...)`. # Operations -- `transform(mach, Xnew)`: Return lower dimensional projection of the target given new - features `Xnew` having scitype as `X` above. +- `transform(mach, Xnew)`: Return a lower dimentional projection of the input `Xnew` having the same scitype as `X` above. - `predict(mach, Xnew)`: Return predictions of the target given features `Xnew` having the same scitype as `X` above. Predictions are probabilistic. @@ -970,7 +967,7 @@ X, y = @load_iris model = sLDA() mach = machine(model, X, y) |> fit! -projection = transform(mach, X) +Xproj = transform(mach, X) y_hat = predict(mach, X) labels = predict_mode(mach, X) ``` @@ -1027,8 +1024,7 @@ Train the machine using `fit!(mach, rows=...)`. # Operations -- `transform(mach, Xnew)`: Return lower dimensional projection of the target given new - features `Xnew` having scitype as `X` above. +- `transform(mach, Xnew)`: Return a lower dimentional projection of the input `Xnew` having the same scitype as `X` above. - `predict(mach, Xnew)`: Return predictions of the target given features `Xnew` having the same scitype as `X` above. Predictions are probabilistic. @@ -1069,7 +1065,7 @@ X, y = @load_iris model = bsLDA() mach = machine(model, X, y) |> fit! -projection = transform(mach, X) +Xproj = transform(mach, X) y_hat = predict(mach, X) labels = predict_mode(mach, X) ``` @@ -1101,8 +1097,9 @@ Train the machine using `fit!(mach, rows=...)`. # Hyper-parameters - `method::Symbol=:cm`: Method to use to solve the problem, one of `:ml`, `:em`, `:bayes`. -- `maxoutdim::Int=0`: Maximum number of output dimensions, uses max(no_of_features - 1, 1) - if 0 (default). +- `maxoutdim=0`: Controls the the dimension (number of columns) of the output, + `outdim`. Specifically, `outdim = min(n, indim, maxoutdim)`, where `n` is the + number of observations and `indim` the input dimension. 
- `maxiter::Int=1000`: Maximum number of iterations. - `tol::Real=1e-6`: Convergence tolerance. - `eta::Real=tol`: Variance lower bound. @@ -1113,8 +1110,7 @@ Train the machine using `fit!(mach, rows=...)`. # Operations -- `transform(mach, Xnew)`: Return predictions of the target given new - features `Xnew` having the same scitype as `X` above. +- `transform(mach, Xnew)`: Return a lower dimentional projection of the input `Xnew` having the same scitype as `X` above. # Fitted parameters @@ -1127,8 +1123,8 @@ The fields of `fitted_params(mach)` are: The fields of `report(mach)` are: -- `indim`: Dimensions of the provided data. -- `outdim`: Dimensions of the transformed result. +- `indim`: The input dimensions. +- `outdim`: `min(n, indim, maxoutdim)`, where `n` is the number of observations. - `variance`: The variance of the factors. - `covariance_matrix`: The estimated covariance matrix. - `mean`: The mean vector. @@ -1146,7 +1142,7 @@ X, y = @load_iris model = FA(maxoutdim=2) mach = machine(model, X) |> fit! -projection = transform(mach, X) +Xproj = transform(mach, X) ``` See also @@ -1177,8 +1173,9 @@ Train the machine using `fit!(mach, rows=...)`. # Hyper-parameters -- `maxoutdim::Int=0`: The maximum number of output dimensions, uses max(no_of_features - 1, 1) - if 0 (default). +- `maxoutdim=0`: Controls the the dimension (number of columns) of the output, + `outdim`. Specifically, `outdim = min(n, indim, maxoutdim)`, where `n` is the + number of observations and `indim` the input dimension. - `method::Symbol=:ml`: The method to use to solve the problem, one of `:ml`, `:em`, `:bayes`. - `maxiter::Int=1000`: The maximum number of iterations. - `tol::Real=1e-6`: The convergence tolerance. @@ -1189,8 +1186,7 @@ Train the machine using `fit!(mach, rows=...)`. # Operations -- `transform(mach, Xnew)`: Return predictions of the target given new - features `Xnew` having the same Scitype as `X` above. +- `transform(mach, Xnew)`: Return a lower dimentional projection of the input `Xnew` having the same scitype as `X` above. # Fitted parameters @@ -1203,8 +1199,8 @@ The fields of `fitted_params(mach)` are: The fields of `report(mach)` are: -- `indim`: Dimensions of the provided data. -- `outdim`: Dimensions of the transformed result. +- `indim`: The input dimensions. +- `outdim`: `min(n, indim, maxoutdim)`, where `n` is the number of observations. - `tvat`: The variance of the components. - `loadings`: The models loadings, weights for each variable used when calculating principal components. @@ -1221,7 +1217,7 @@ X, y = @load_iris model = PPCA(maxoutdim=2) mach = machine(model, X) |> fit! -projection = transform(mach, X) +Xproj = transform(mach, X) ``` See also From f7bd29f639958bd41da9fc00025055cbcdeccfdb Mon Sep 17 00:00:00 2001 From: josephsdavid Date: Mon, 18 Jul 2022 15:41:23 -0500 Subject: [PATCH 18/22] code review update --- src/MLJMultivariateStatsInterface.jl | 223 +++++++++++++++------------ 1 file changed, 122 insertions(+), 101 deletions(-) diff --git a/src/MLJMultivariateStatsInterface.jl b/src/MLJMultivariateStatsInterface.jl index 4acdf68..d246ccb 100644 --- a/src/MLJMultivariateStatsInterface.jl +++ b/src/MLJMultivariateStatsInterface.jl @@ -88,10 +88,7 @@ const BayesianSubspaceLDA_DESCR = """ dimensional space of dimension `nc-1` such that the trace of the transformed between-class scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the transformed within-class scatter matrix (`Pᵀ*Sw*P`). 
The projection matrix is scaled - such that `Pᵀ*Sw*P = mult*I` or `Pᵀ*Σw*P=mult/(n-nc)*I` (where `n` is the number of - training samples, `mult` is one of `n` or `1` depending on whether `Sb` is normalized, - `Σw` is the within-class covariance matrix, and `nc` is the number of unique classes in - `y`) and also obeys `Wᵀ*Sb*p = λ*Wᵀ*Sw*p`, for every column `p` in `P`. + such that `Pᵀ*Sw*P = mult*I` or `Pᵀ*Σw*P=mult/(n-nc)*I` and also obeys `Wᵀ*Sb*p = λ*Wᵀ*Sw*p`, for every column `p` in `P`. Posterior class probability distibution are derived by applying Bayes rule with a multivariate Gaussian class-conditional distribution """ @@ -151,6 +148,7 @@ metadata_pkg.( ) """ + $(MMI.doc_header(LinearRegressor)) `LinearRegressor` assumes the target is a continuous variable and trains a linear @@ -210,6 +208,7 @@ See also LinearRegressor """ + $(MMI.doc_header(MultitargetLinearRegressor)) `MultitargetLinearRegressor` assumes the target variable is vector-valued with @@ -275,6 +274,7 @@ See also MultitargetLinearRegressor """ + $(MMI.doc_header(RidgeRegressor)) `RidgeRegressor` adds a quadratic penalty term to least squares regression, @@ -353,6 +353,7 @@ See also RidgeRegressor """ + $(MMI.doc_header(MultitargetRidgeRegressor)) `MultitargetRidgeRegressor` adds a quadratic penalty term to least squares regression, @@ -436,6 +437,7 @@ See also MultitargetRidgeRegressor """ + $(MMI.doc_header(PCA)) `PCA` Principal component analysis. Learns a linear transformation to @@ -472,14 +474,14 @@ Train the machine using `fit!(mach, rows=...)`. # Operations -- `transform(mach, Xnew)`: Return a lower dimentional projection of the input `Xnew` having the same scitype as `X` above. +- `transform(mach, Xnew)`: Return a lower dimensional projection of the input `Xnew` having the same scitype as `X` above. # Fitted parameters The fields of `fitted_params(mach)` are: -- `projection`: Returns the projection matrix, which has size `(p, p_out)`), where - `p` and `p_out` are the number of features of the input and ouput respectively. +- `projection`: Returns the projection matrix, which has size `(indim, outdim)`), where + `indim` and `outdim` are the number of features of the input and ouput respectively. # Report @@ -492,7 +494,7 @@ The fields of `report(mach)` are: - `tprincipalvar`: Total variance of the principal components. - `tresidualvar`: Total residual variance. - `tvar`: Total observation variance (principal + residual variance). -- `mean`: The mean vector (of length `d`). +- `mean`: The mean of the untransformed training data, of length `in_dim`. - `principalvars`: The variance of the principal components. # Examples @@ -514,7 +516,9 @@ See also [`KernelPCA`](@ref), [`ICA`](@ref), [`FactorAnalysis`](@ref), [`PPCA`](@ref) """ PCA + """ + $(MMI.doc_header(KernelPCA)) `KernelPCA` Kernel principal component analysis. Using a kernel, the linear @@ -550,14 +554,14 @@ Train the machine using `fit!(mach, rows=...)`. # Operations -- `transform(mach, Xnew)`: Return a lower dimentional projection of the input `Xnew` having the same scitype as `X` above. +- `transform(mach, Xnew)`: Return a lower dimensional projection of the input `Xnew` having the same scitype as `X` above. # Fitted parameters The fields of `fitted_params(mach)` are: -- `projection`: Returns the projection matrix, which has size `(p, p_out)`), where - `p` and `p_out` are the number of features of the input and ouput respectively. 
+- `projection`: Returns the projection matrix, which has size `(indim, outdim)`), where + `indim` and `outdim` are the number of features of the input and ouput respectively. # Report @@ -591,7 +595,9 @@ See also [`PCA`](@ref), [`ICA`](@ref), [`FactorAnalysis`](@ref), [`PPCA`](@ref) """ KernelPCA + """ + $(MMI.doc_header(ICA)) `ICA` (independent component analysis) is a computational technique for separating a @@ -645,7 +651,7 @@ The fields of `report(mach)` are: - `indim`: Dimension (number of columns/components) of the training data and new data to be transformed. - `outdim`: Dimension of transformed data (number of separated components). -- `mean`: The mean vector, which has length `indim`. +- `mean`: The mean of the untransformed training data, of length `in_dim`. # Examples @@ -667,7 +673,9 @@ See also [`PCA`](@ref), [`KernelPCA`](@ref), [`FactorAnalysis`](@ref), [`PPCA`](@ref) """ ICA + """ + $(MMI.doc_header(LDA)) `LDA`: Multiclass linear discriminant analysis. The algorithm learns a @@ -692,7 +700,7 @@ Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns are of scitype `Continuous`; check the scitypes with `schema(X)` - `y`: is the target, which can be any `AbstractVector` whose element - scitype is `<:OrderedFactor(2)` or `<:Multiclass(2)`; check the scitype + scitype is `OrderedFactor` or `Multiclass`; check the scitype with `scitype(y)` Train the machine using `fit!(mach, rows=...)`. @@ -712,16 +720,17 @@ Train the machine using `fit!(mach, rows=...)`. value `regcoef * eigmax(Sw)` where `Sw` is the within-class scatter matrix, is added to the diagonal of Sw to improve numerical stability. This can be useful if using the standard covariance estimator. -- `dist::SemiMetric=SqEuclidean`: The distance metric to use when performing classification - (to compare the distance between a new point and centroids in the transformed space), - an alternative choice can be the `CosineDist`.Defaults to `SqEuclidean`. +- `dist=Distances.SqEuclidean()`: The distance metric to use when performing + classification (to compare the distance between a new point and centroids in + the transformed space); must be a subtype of `Distances.SemiMetric` from + Distances.jl, e.g., `Distances.CosineDist`. # Operations -- `transform(mach, Xnew)`: Return a lower dimentional projection of the input `Xnew` having the same scitype as `X` above. +- `transform(mach, Xnew)`: Return a lower dimensional projection of the input `Xnew` having the same scitype as `X` above. - `predict(mach, Xnew)`: Return predictions of the target given features `Xnew` having the same scitype as `X` above. Predictions - are probabilistic. + are probabilistic but uncalibrated. - `predict_mode(mach, Xnew)`: Return the modes of the probabilistic predictions returned above. @@ -730,9 +739,11 @@ Train the machine using `fit!(mach, rows=...)`. The fields of `fitted_params(mach)` are: -- `projected_class_means`: The matrix comprised of class-specific means as - columns (of size `(d,m)`), where d corresponds to input features and m corresponds to class. -- `projection_matrix`: The matrix used to project `X` into a lower dimensional space. +- `projected_class_means`: The matrix comprised of class-specific means as columns, + of size `(in_dim, nc)`, where `in_dim` is the number of input features (columns) and + `nc` the number of target classes. +- `projection_matrix`: The learned projection matrix, of size `(in_dim, out_dim)`, where + `in_dim` and `out_dim` are the input and output dimensions respectively. 
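As a hedged check of the sizes quoted above (iris has 4 input features and 3 classes, so one would expect at most `nc - 1 = 2` discriminant directions; the exact defaults are determined by the package):

```
using MLJ

LDA = @load LDA pkg=MultivariateStats

X, y = @load_iris

mach = machine(LDA(), X, y) |> fit!

fp = fitted_params(mach)
size(fp.projected_class_means)   # (in_dim, nc)      -- expected (4, 3)
size(fp.projection_matrix)       # (in_dim, out_dim) -- expected (4, 2) here
```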
# Report @@ -741,12 +752,13 @@ The fields of `report(mach)` are: - `classes`: The classes seen during model fitting. - `out_dim`: The dimensions the model is projected to. - `class_means`: The matrix comprised of class-specific means as - columns (of size `(d,m)`), where d corresponds to input features and m corresponds to class. -- `mean`: The mean vector. + columns (see above). +- `mean`: The mean of the untransformed training data, of length `in_dim`. - `class_weights`: The weights of each class. - `Sb`: The between class scatter matrix. - `Sw`: The within class scatter matrix. -- `nc`: The number of classes. +- `nc`: The number of classes directly observed in the training data (which can be + less than the total number of classes in the class pool) # Examples @@ -761,26 +773,24 @@ model = LDA() mach = machine(model, X, y) |> fit! Xproj = transform(mach, X) -y_hat = predict(mach, x) -labels = predict_mode(mach, x) +y_hat = predict(mach, X) +labels = predict_mode(mach, X) ``` See also [`BayesianLDA`](@ref), [`SubspaceLDA`](@ref), [`BayesianSubspaceLDA`](@ref) """ LDA + """ + $(MMI.doc_header(BayesianLDA)) -`BayesianLDA`: Bayesian Multiclass linear discriminant analysis. The algorithm -learns a projection matrix `P` that projects a feature matrix `Xtrain` onto a lower -dimensional space of dimension `out_dim` such that the trace of the transformed -between-class scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the -transformed within-class scatter matrix (`Pᵀ*Sw*P`). The projection matrix is scaled -such that `Pᵀ*Sw*P = n` or `Pᵀ*Σw*P=I` (Where `n` is the number of training samples -and `Σw` is the within-class covariance matrix). +`BayesianLDA`: Bayesian Multiclass linear discriminant analysis. The algorithm learns a +projection matrix as described in [`LDA`](@ref) Predicted class posterior probability distibution are derived by applying Bayes rule -with a multivariate Gaussian class-conditional distribution. +with a multivariate Gaussian class-conditional distribution. A prior class distribution +can be specified from by the user or inferred from training data class frequency. See also the [package documentation]( https://multivariatestatsjl.readthedocs.io/en/latest/lda.html). @@ -799,7 +809,7 @@ Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns are of scitype `Continuous`; check the scitypes with `schema(X)` - `y`: is the target, which can be any `AbstractVector` whose element - scitype is `<:OrderedFactor(2)` or `<:Multiclass(2)`; check the scitype + scitype is `OrderedFactor` or `Multiclass`; check the scitype with `scitype(y)` Train the machine using `fit!(mach, rows=...)`. @@ -827,10 +837,10 @@ value `regcoef * eigmax(Sw)` where `Sw` is the within-class covariance estimator # Operations -- `transform(mach, Xnew)`: Return a lower dimentional projection of the input `Xnew` having the same scitype as `X` above. +- `transform(mach, Xnew)`: Return a lower dimensional projection of the input `Xnew` having the same scitype as `X` above. - `predict(mach, Xnew)`: Return predictions of the target given features `Xnew` having the same scitype as `X` above. Predictions - are probabilistic. + are probabilistic but uncalibrated. - `predict_mode(mach, Xnew)`: Return the modes of the probabilistic predictions returned above. 
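Because the predictions are probabilistic (though uncalibrated), they can be queried class by class before taking modes; a brief sketch, with class names as in the iris data used further down:

```
using MLJ

BayesianLDA = @load BayesianLDA pkg=MultivariateStats

X, y = @load_iris

mach = machine(BayesianLDA(), X, y) |> fit!

yhat = predict(mach, X)    # vector of discrete probability distributions
pdf.(yhat, "virginica")    # probability assigned to one class
predict_mode(mach, X)      # most probable class labels
```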
@@ -839,10 +849,13 @@ value `regcoef * eigmax(Sw)` where `Sw` is the within-class covariance estimator The fields of `fitted_params(mach)` are: -- `projected_class_means`: The matrix comprised of class-specific means as - columns (of size `(d,m)`), where d corresponds to input features and m corresponds to class. -- `projection_matrix`: The matrix used to project `X` into a lower dimensional space. -- `priors`: The estimated class priors. +- `projected_class_means`: The matrix comprised of class-specific means as columns, + of size `(in_dim, nc)`, where `in_dim` is the number of input features (columns) and + `nc` the number of target classes. +- `projection_matrix`: The learned projection matrix, of size `(in_dim, out_dim)`, where + `in_dim` and `out_dim` are the input and output dimensions respectively. +- `priors`: The class priors for classification. As inferred from training target `y`, + if not user-specified. A vector with order consistent with `levels(y)`. # Report @@ -851,12 +864,13 @@ The fields of `report(mach)` are: - `classes`: The classes seen during model fitting. - `out_dim`: The dimensions the model is projected to. - `class_means`: The matrix comprised of class-specific means as - columns (of size `(d,m)`), where d corresponds to input features and m corresponds to class. -- `mean`: The mean vector. + columns (see above). +- `mean`: The mean of the untransformed training data, of length `in_dim`. - `class_weights`: The weights of each class. - `Sb`: The between class scatter matrix. - `Sw`: The within class scatter matrix. -- `nc`: The number of classes. +- `nc`: The number of classes directly observed in the training data (which can be + less than the total number of classes in the class pool) # Examples @@ -871,31 +885,32 @@ model = BLDA() mach = machine(model, X, y) |> fit! Xproj = transform(mach, X) -y_hat = predict(mach, x) -labels = predict_mode(mach, x) +y_hat = predict(mach, X) +labels = predict_mode(mach, X) ``` See also [`LDA`](@ref), [`SubspaceLDA`](@ref), [`BayesianSubspaceLDA`](@ref) """ BayesianLDA + """ + $(MMI.doc_header(SubspaceLDA)) -`SubspaceLDA`: Multiclass linear discriminant analysis. Suitable for high -dimensional data (Avoids computing scatter matrices `Sw` ,`Sb`). The algorithm learns a -projection matrix `P = W*L` that projects a feature matrix `Xtrain` onto a lower -dimensional space of dimension `nc - 1` such that the trace of the transformed -between-class scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the -transformed within-class scatter matrix (`Pᵀ*Sw*P`). The projection matrix is scaled -such that `Pᵀ*Sw*P = mult*I` or `Pᵀ*Σw*P=mult/(n-nc)*I` (where `n` is the number of -training samples, mult` is one of `n` or `1` depending on whether `Sb` is normalized, -`Σw` is the within-class covariance matrix, and `nc` is the number of unique classes -in `y`) and also obeys `Wᵀ*Sb*p = λ*Wᵀ*Sw*p`, for every column `p` in `P`. -Predicted class posterior probability for feature matrix `Xtest` are derived by -applying a softmax transformation to a matrix `Pr`, such that rowᵢ of `Pr` contains -computed distances(based on a distance metric) in the transformed space of rowᵢ in -`Xtest` to the centroid of each class. +`SubspaceLDA`: Multiclass subspace linear discriminant analysis (LDA) is a variation on +ordinary LDA suitable for high dimensional data, as it avoids storing scatter +matrices. For details, refer the +[MultivariateStats.jl documentation](https://juliastats.org/MultivariateStats.jl/stable/). 
+In addition to dimension reduction, probabilistic classification is provided. +In the case of classification, the class probability for a new observation +reflects the proximity of that observation to training observations +associated with that class, and how far away the observation is from those +associated with other classes. Specifically, the distances, in the transformed +(projected) space, of a new observation, from the centroid of each target class, +is computed; the resulting vector of distances (times minus one) is passed to a +softmax function to obtain a class probability prediction. Here "distance" +is computed using a user-specified distance function. # Training data @@ -908,7 +923,7 @@ Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns are of scitype `Continuous`; check the scitypes with `schema(X)` - `y`: is the target, which can be any `AbstractVector` whose element - scitype is `<:OrderedFactor(2)` or `<:Multiclass(2)`; check the scitype + scitype is `OrderedFactor` or `Multiclass`; check the scitype with `scitype(y)` Train the machine using `fit!(mach, rows=...)`. @@ -919,17 +934,18 @@ Train the machine using `fit!(mach, rows=...)`. observations in each class, one of `true` or `false`. - `out_dim`: The dimension of the transformed space to be used by `predict` and `transform` methods, automatically set if 0 is given (default). -- `dist=SqEuclidean`: The distance metric to use when performing classification - (to compare the distance between a new point and centroids in the transformed space), - an alternative choice can be the `CosineDist`. +- `dist=Distances.SqEuclidean()`: The distance metric to use when performing + classification (to compare the distance between a new point and centroids in + the transformed space); must be a subtype of `Distances.SemiMetric` from + Distances.jl, e.g., `Distances.CosineDist`. # Operations -- `transform(mach, Xnew)`: Return a lower dimentional projection of the input `Xnew` having the same scitype as `X` above. +- `transform(mach, Xnew)`: Return a lower dimensional projection of the input `Xnew` having the same scitype as `X` above. - `predict(mach, Xnew)`: Return predictions of the target given features `Xnew` having the same scitype as `X` above. Predictions - are probabilistic. + are probabilistic but uncalibrated. - `predict_mode(mach, Xnew)`: Return the modes of the probabilistic predictions returned above. @@ -938,7 +954,6 @@ Train the machine using `fit!(mach, rows=...)`. The fields of `fitted_params(mach)` are: -BUG: RENAME, Make note on top of PR for inconsistency - `class_means`: The matrix comprised of class-specific means as columns (of size `(d,m)`), where d corresponds to input features and m corresponds to class. - `projection_matrix`: The matrix used to project `X` into a lower dimensional space. @@ -950,21 +965,22 @@ The fields of `report(mach)` are: - `explained_variance_ratio`: The ratio of explained variance to total variance. Each dimension corresponds to an eigenvalue. - `classes`: The classes seen during model fitting. - `class_means`: The matrix comprised of class-specific means as - columns (of size `(d,m)`), where d corresponds to input features and m corresponds to class. -- `mean`: The mean vector. + columns (see above). +- `mean`: The mean of the untransformed training data, of length `in_dim`. - `class_weights`: The weights of each class. -- `nc`: The number of classes. 
+- `nc`: The number of classes directly observed in the training data (which can be + less than the total number of classes in the class pool) # Examples ``` using MLJ -sLDA = @load SubspaceLDA pkg=MultivariateStats +SLDA = @load SubspaceLDA pkg=MultivariateStats X, y = @load_iris -model = sLDA() +model = SLDA() mach = machine(model, X, y) |> fit! Xproj = transform(mach, X) @@ -976,22 +992,16 @@ See also [`LDA`](@ref), [`BayesianLDA`](@ref), [`BayesianSubspaceLDA`](@ref) """ SubspaceLDA + """ + $(MMI.doc_header(BayesianSubspaceLDA)) -`SubspaceLDA`: Bayesian Multiclass linear discriminant analysis. Suitable for high dimensional data -(Avoids computing scatter matrices `Sw` ,`Sb`). The algorithm learns a projection -matrix `P = W*L` (`Sw`), that projects a feature matrix `Xtrain` onto a lower -dimensional space of dimension `nc-1` such that the trace of the transformed -between-class scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the -transformed within-class scatter matrix (`Pᵀ*Sw*P`). The projection matrix is scaled -such that `Pᵀ*Sw*P = mult*I` or `Pᵀ*Σw*P=mult/(n-nc)*I` (where `n` is the number of -training samples, `mult` is one of `n` or `1` depending on whether `Sb` is normalized, -`Σw` is the within-class covariance matrix, and `nc` is the number of unique classes in -`y`) and also obeys `Wᵀ*Sb*p = λ*Wᵀ*Sw*p`, for every column `p` in `P`. -Posterior class probability distibution are derived by applying Bayes rule with a -multivariate Gaussian class-conditional distribution +`BayesianSubspaceLDA`: Bayesian Multiclass Subspace linear discriminant analysis. The algorithm learns a +projection matrix as described in [`SubspaceLDA`](@ref). Posterior class probability +distribution is derived as in [`BayesianLDA`](@ref). + # Training data @@ -1004,7 +1014,7 @@ Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns are of scitype `Continuous`; check the scitypes with `schema(X)` - `y`: is the target, which can be any `AbstractVector` whose element - scitype is `<:OrderedFactor(2)` or `<:Multiclass(2)`; check the scitype + scitype is `OrderedFactor` or `Multiclass`; check the scitype with `scitype(y)` Train the machine using `fit!(mach, rows=...)`. @@ -1024,10 +1034,10 @@ Train the machine using `fit!(mach, rows=...)`. # Operations -- `transform(mach, Xnew)`: Return a lower dimentional projection of the input `Xnew` having the same scitype as `X` above. +- `transform(mach, Xnew)`: Return a lower dimensional projection of the input `Xnew` having the same scitype as `X` above. - `predict(mach, Xnew)`: Return predictions of the target given features `Xnew` having the same scitype as `X` above. Predictions - are probabilistic. + are probabilistic but uncalibrated. - `predict_mode(mach, Xnew)`: Return the modes of the probabilistic predictions returned above. @@ -1036,10 +1046,13 @@ Train the machine using `fit!(mach, rows=...)`. The fields of `fitted_params(mach)` are: -- `projected_class_means`: The matrix comprised of class-specific means as - columns (of size `(d,m)`), where d corresponds to input features and m corresponds to class. -- `projection_matrix`: The matrix used to project `X` into a lower dimensional space. -- `priors`: The estimated class priors. +- `projected_class_means`: The matrix comprised of class-specific means as columns, + of size `(in_dim, nc)`, where `in_dim` is the number of input features (columns) and + `nc` the number of target classes. 
+- `projection_matrix`: The learned projection matrix, of size `(in_dim, out_dim)`, where + `in_dim` and `out_dim` are the input and output dimensions respectively. +- `priors`: The class priors for classification. As inferred from training target `y`, + if not user-specified. A vector with order consistent with `levels(y)`. # Report @@ -1048,21 +1061,22 @@ The fields of `report(mach)` are: - `explained_variance_ratio`: The ratio of explained variance to total variance. Each dimension corresponds to an eigenvalue. - `classes`: The classes seen during model fitting. - `class_means`: The matrix comprised of class-specific means as - columns (of size `(d,m)`), where d corresponds to input features and m corresponds to class. -- `mean`: The mean vector. + columns (see above). +- `mean`: The mean of the untransformed training data, of length `in_dim`. - `class_weights`: The weights of each class. -- `nc`: The number of classes. +- `nc`: The number of classes directly observed in the training data (which can be + less than the total number of classes in the class pool) # Examples ``` using MLJ -bsLDA = @load BayesianSubspaceLDA pkg=MultivariateStats +BSLDA = @load BayesianSubspaceLDA pkg=MultivariateStats X, y = @load_iris -model = bsLDA() +model = BSLDA() mach = machine(model, X, y) |> fit! Xproj = transform(mach, X) @@ -1074,7 +1088,9 @@ See also [`LDA`](@ref), [`BayesianLDA`](@ref), [`SubspaceLDA`](@ref) """ BayesianSubspaceLDA + """ + $(MMI.doc_header(FactorAnalysis)) `FactorAnalysis`(FA) is a linear-Gaussian latent variable model that is @@ -1110,13 +1126,15 @@ Train the machine using `fit!(mach, rows=...)`. # Operations -- `transform(mach, Xnew)`: Return a lower dimentional projection of the input `Xnew` having the same scitype as `X` above. +- `transform(mach, Xnew)`: Return a lower dimensional projection of the input `Xnew` having the same scitype as `X` above. # Fitted parameters The fields of `fitted_params(mach)` are: -- `projection`: Returns the projection matrix (of size `(d, m)`). + +- `projection`: Returns the projection matrix, which has size `(indim, outdim)`), where + `indim` and `outdim` are the number of features of the input and ouput respectively. Each column of the projection matrix corresponds to a factor. # Report @@ -1127,7 +1145,7 @@ The fields of `report(mach)` are: - `outdim`: `min(n, indim, maxoutdim)`, where `n` is the number of observations. - `variance`: The variance of the factors. - `covariance_matrix`: The estimated covariance matrix. -- `mean`: The mean vector. +- `mean`: The mean of the untransformed training data, of length `in_dim`. - `loadings`: The factor loadings. # Examples @@ -1149,7 +1167,9 @@ See also [`KernelPCA`](@ref), [`ICA`](@ref), [`PPCA`](@ref), [`PCA`](@ref) """ FactorAnalysis + """ + $(MMI.doc_header(PPCA)) `PPCA`(Probabilistic principal component analysis) represents a constrained @@ -1186,13 +1206,14 @@ Train the machine using `fit!(mach, rows=...)`. # Operations -- `transform(mach, Xnew)`: Return a lower dimentional projection of the input `Xnew` having the same scitype as `X` above. +- `transform(mach, Xnew)`: Return a lower dimensional projection of the input `Xnew` having the same scitype as `X` above. # Fitted parameters The fields of `fitted_params(mach)` are: -- `projection`: Returns the projection matrix (of size `(d, m)`). +- `projection`: Returns the projection matrix, which has size `(indim, outdim)`), where + `indim` and `outdim` are the number of features of the input and ouput respectively. 
Each column of the projection matrix corresponds to a principal component. # Report From 9d760eb77d3a33c8dd538893dc6cc63ac083bf55 Mon Sep 17 00:00:00 2001 From: josephsdavid Date: Mon, 8 Aug 2022 15:43:37 -0500 Subject: [PATCH 19/22] review response --- src/MLJMultivariateStatsInterface.jl | 380 ++++++++++++++------------- src/models/decomposition_models.jl | 2 +- 2 files changed, 198 insertions(+), 184 deletions(-) diff --git a/src/MLJMultivariateStatsInterface.jl b/src/MLJMultivariateStatsInterface.jl index d246ccb..f7234a4 100644 --- a/src/MLJMultivariateStatsInterface.jl +++ b/src/MLJMultivariateStatsInterface.jl @@ -45,7 +45,7 @@ const FactorAnalysis_DESCR = "Factor Analysis" const LDA_DESCR = """ Multiclass linear discriminant analysis. The algorithm learns a projection matrix `P` that projects a feature matrix `Xtrain` onto a lower dimensional - space of dimension `out_dim` such that the trace of the transformed between-class + space of dimension `outdim` such that the trace of the transformed between-class scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the transformed within-class scatter matrix (`Pᵀ*Sw*P`).The projection matrix is scaled such that `Pᵀ*Sw*P=I` or `Pᵀ*Σw*P=I`(where `Σw` is the within-class covariance matrix) . @@ -57,7 +57,7 @@ const LDA_DESCR = """ const BayesianLDA_DESCR = """ Bayesian Multiclass linear discriminant analysis. The algorithm learns a projection matrix `P` that projects a feature matrix `Xtrain` onto a lower - dimensional space of dimension `out_dim` such that the trace of the transformed + dimensional space of dimension `outdim` such that the trace of the transformed between-class scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the transformed within-class scatter matrix (`Pᵀ*Sw*P`). The projection matrix is scaled such that `Pᵀ*Sw*P = n` or `Pᵀ*Σw*P=I` (Where `n` is the number of training samples @@ -69,7 +69,7 @@ const SubspaceLDA_DESCR = """ Multiclass linear discriminant analysis. Suitable for high dimensional data (Avoids computing scatter matrices `Sw` ,`Sb`). The algorithm learns a projection matrix `P = W*L` that projects a feature matrix `Xtrain` onto a lower - dimensional space of dimension `nc - 1` such that the trace of the transformed + dimensional space of dimension `min(rank(Sw), nc - 1)` such that the trace of the transformed between-class scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the transformed within-class scatter matrix (`Pᵀ*Sw*P`). The projection matrix is scaled such that `Pᵀ*Sw*P = mult*I` or `Pᵀ*Σw*P=mult/(n-nc)*I` (where `n` is the number of @@ -164,7 +164,7 @@ In MLJ or MLJBase, bind an instance `model` to data with Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns - are of scitype `Continuous`; check the scitypes with `schema(X)` +are of scitype `Continuous`; check the column scitypes with `schema(X)`. - `y`: is the target, which can be any `AbstractVector` whose element scitype is `Continuous`; check the scitype with `scitype(y)` @@ -178,7 +178,7 @@ Train the machine using `fit!(mach, rows=...)`. # Operations - `predict(mach, Xnew)`: Return predictions of the target given new - features `Xnew` having the same Scitype as `X` above. + features `Xnew` having the same scitype as `X` above. 
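As a quick sanity check of what `predict` returns, the linear predictions can be reconstructed by hand from the fitted parameters. A minimal sketch, not part of this diff; the `coefficients` and `intercept` field names of `fitted_params` are assumed here, not taken from the patch:

```julia
using MLJ

LinearRegressor = @load LinearRegressor pkg=MultivariateStats

X, y = make_regression(100, 2)  # a table and a vector (synthetic data)
mach = machine(LinearRegressor(bias=true), X, y) |> fit!

# Rebuild the predictions from the (assumed) fitted parameter fields:
fp = fitted_params(mach)
yhat = MLJ.matrix(X) * fp.coefficients .+ fp.intercept

yhat ≈ predict(mach, X)  # expected to be true, up to floating-point error
```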
# Fitted parameters @@ -195,7 +195,7 @@ using MLJ LinearRegressor = @load LinearRegressor pkg=MultivariateStats linear_regressor = LinearRegressor() -X, y = make_regression(100, 2) # synthetic data +X, y = make_regression(100, 2) # a table and a vector (synthetic data) mach = machine(linear_regressor, X, y) |> fit! Xnew, _ = make_regression(3, 2) @@ -224,7 +224,7 @@ In MLJ or MLJBase, bind an instance `model` to data with Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns - are of scitype `Continuous`; check the scitypes with `schema(X)` +are of scitype `Continuous`; check the column scitypes with `schema(X)`. - `y`: is the target, which can be any table of responses whose element scitype is `Continuous`; check the scitype with `scitype(y)` @@ -251,16 +251,12 @@ The fields of `fitted_params(mach)` are: ``` using MLJ -using MLJBase: augment_X using DataFrames LinearRegressor = @load MultitargetLinearRegressor pkg=MultivariateStats linear_regressor = LinearRegressor() -X = augment_X(randn(100, 8), true) -θ = randn((9,2)) -y = X * θ -X, y = map(x -> DataFrame(x, :auto), (X, y)) +X, y = make_regression(100, 9; n_targets = 2) # a table and a table (synthetic data) mach = machine(linear_regressor, X, y) |> fit! @@ -291,7 +287,7 @@ In MLJ or MLJBase, bind an instance `model` to data with Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns - are of scitype `Continuous`; check the scitypes with `schema(X)` +are of scitype `Continuous`; check the column scitypes with `schema(X)`. - `y`: is the target, which can be any `AbstractVector` whose element scitype is `Continuous`; check the scitype with `scitype(y)` @@ -327,24 +323,14 @@ using MLJ LinearRegressor = @load LinearRegressor pkg=MultivariateStats RidgeRegressor = @load RidgeRegressor pkg=MultivariateStats -X, y = make_regression(100, 60) # synthetic data - -linear_regressor = LinearRegressor() -mach = machine(linear_regressor, X, y) |> fit! -llsq_coef = fitted_params(mach).coefficients - -ridge_regressor = RidgeRegressor(lambda=0) -ridge_mach = machine(ridge_regressor, X, y) |> fit! -coef = fitted_params(ridge_mach).coefficients -difference = llsq_coef - coef -@info "difference between λ=0 ridge and llsq" mean(difference) std(difference) +X, y = @load_boston +model1 = RidgeRegressor(lambda=10) +model2 = Standardizer() |> model1 +mach1 = machine(model1, X, y) |> fit! +mach2 = machine(model2, X, y) |> fit! +predict(mach1, X) ≈ predict(mach2, X) # false, would be true for LinearRegressor -ridge_regressor = RidgeRegressor(lambda=1.5) -ridge_mach = machine(ridge_regressor, X, y) |> fit! - -Xnew, _ = make_regression(3, 60) -yhat = predict(mach, Xnew) # new predictions ``` See also @@ -356,10 +342,11 @@ RidgeRegressor $(MMI.doc_header(MultitargetRidgeRegressor)) -`MultitargetRidgeRegressor` adds a quadratic penalty term to least squares regression, -for regularization. Ridge regression is particularly useful in the case of -multicollinearity. In this case, the output represents a response vector. -Options exist to specify a bias term, and to adjust the strength of the penalty term. +`MultitargetRidgeRegressor` adds a quadratic penalty term to multi-target +least squares regression, for regularization. Ridge regression is particularly +useful in the case of multicollinearity. In this case, the output represents a +response vector. Options exist to specify a bias term, and to adjust the +strength of the penalty term. 
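Concretely, with regularization strength `lambda` the coefficients minimize `‖Y - X*θ‖² + lambda*‖θ‖²`. A minimal sketch, not part of this diff, comparing against the textbook closed-form ridge solution; the `bias` hyper-parameter and the `coefficients` field of `fitted_params` are assumed here:

```julia
using MLJ, LinearAlgebra

RidgeRegressor = @load MultitargetRidgeRegressor pkg=MultivariateStats

X, Y = make_regression(100, 5; n_targets=2)  # a table and a table (synthetic data)
λ = 1.5
mach = machine(RidgeRegressor(lambda=λ, bias=false), X, Y) |> fit!

# Closed-form ridge solution θ = (XᵀX + λI) \ XᵀY, for comparison:
Xm, Ym = MLJ.matrix(X), MLJ.matrix(Y)
θ = (Xm'Xm + λ*I) \ (Xm'Ym)

θ ≈ fitted_params(mach).coefficients  # expected to hold, approximately
```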
# Training data @@ -370,7 +357,7 @@ In MLJ or MLJBase, bind an instance `model` to data with Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns - are of scitype `Continuous`; check the scitypes with `schema(X)` +are of scitype `Continuous`; check the column scitypes with `schema(X)`. - `y`: is the target, which can be any table of responses whose element scitype is `Continuous`; check the scitype with `scitype(y)` @@ -402,27 +389,11 @@ The fields of `fitted_params(mach)` are: ``` using MLJ -using MLJBase: augment_X using DataFrames -LinearRegressor = @load MultitargetLinearRegressor pkg=MultivariateStats RidgeRegressor = @load MultitargetRidgeRegressor pkg=MultivariateStats -X = augment_X(randn(100, 80), true) -θ = randn((81,4)) -y = X * θ -X, y = map(x -> DataFrame(x, :auto), (X, y)) - -# linear_regressor = LinearRegressor() # positive semi definite error for cholesky :( -# mach = machine(linear_regressor, X, y) |> fit! -# llsq_coef = fitted_params(mach).coefficients -# -# ridge_regressor = RidgeRegressor(lambda=0) -# ridge_mach = machine(ridge_regressor, X, y) |> fit! -# coef = fitted_params(ridge_mach).coefficients -# difference = llsq_coef - coef -# @info "difference between λ=0 ridge and llsq" mean(difference) std(difference) - +X, y = make_regression(100, 60; n_targets = 2) # a table and a table (synthetic data) ridge_regressor = RidgeRegressor(lambda=1.5) ridge_mach = machine(ridge_regressor, X, y) |> fit! @@ -453,48 +424,58 @@ In MLJ or MLJBase, bind an instance `model` to data with Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns - are of scitype `Continuous`; check the scitypes with `schema(X)` +are of scitype `Continuous`; check the column scitypes with `schema(X)`. Train the machine using `fit!(mach, rows=...)`. # Hyper-parameters -- `maxoutdim=0`: Controls the the dimension (number of columns) of the output, - `outdim`. Specifically, `outdim = min(n, indim, maxoutdim)`, where `n` is the - number of observations and `indim` the input dimension. +- `maxoutdim=0`: Together with `pratio`, controls the output dimension outdim chosen +by the model. Specifically, suppose that k is the smallest integer such that retaining +the k most significant principal components accounts for `pratio` of the total variance +in the training data. Then outdim = min(k, maxoutdim). If maxoutdim=0 (default) then the +effective maxoutdim is min(n, indim - 1) where n is the number of observations and indim +the number of features in the training data. +- `pratio::Float64=0.99`: The ratio of variance preserved after the transformation - `method=:auto`: The method to use to solve the problem. Choices are - `:svd`: Support Vector Decomposition of the matrix. - `:cov`: Covariance matrix decomposition. - `:auto`: Use `:cov` if the matrices first dimension is smaller than its second dimension otherwise use `:svd` -- `pratio::Float64=0.99`: The ratio of variance preserved after the transformation -- `mean=nothing`: if set to nothing(default) centering will be computed and applied, +- `mean=nothing`: if set to nothing (default) centering will be computed and applied, if set to `0` no centering(assumed pre-centered), if a vector is passed, the centering is done with that vector. # Operations - `transform(mach, Xnew)`: Return a lower dimensional projection of the input `Xnew` having the same scitype as `X` above. 
+- `inverse_transform(mach, Xsmall)`: For a dimension-reduced table `Xsmall`, + such as returned by `transform`, reconstruct a table, having same the number + of columns as the original training data `X`, that transforms to `Xsmall`. + Mathematically, `inverse_transform` is a right-inverse for the PCA projection + map, whose image is orthogonal to the kernel of that map. In particular, if + `Xsmall = transform(mach, Xnew)`, then `inverse_transform(Xsmall)` is + only an approximation to `Xnew`. # Fitted parameters The fields of `fitted_params(mach)` are: -- `projection`: Returns the projection matrix, which has size `(indim, outdim)`), where +- `projection`: Returns the projection matrix, which has size `(indim, outdim)`, where `indim` and `outdim` are the number of features of the input and ouput respectively. # Report The fields of `report(mach)` are: `outdim = min(n, indim, maxoutdim)`, where `n` is the - number of observations and `indim` the input dimension. + number of observations and `indim` the input dimension. -- `indim`: The input dimensions. -- `outdim`: `min(n, indim, maxoutdim)`, where `n` is the number of observations. +- `indim`: Dimension (number of columns) of the training data and new data to be transformed. +- `outdim`: Dimension of transformed data. - `tprincipalvar`: Total variance of the principal components. - `tresidualvar`: Total residual variance. - `tvar`: Total observation variance (principal + residual variance). -- `mean`: The mean of the untransformed training data, of length `in_dim`. +- `mean`: The mean of the untransformed training data, of length `indim`. - `principalvars`: The variance of the principal components. # Examples @@ -504,7 +485,7 @@ using MLJ PCA = @load PCA pkg=MultivariateStats -X, y = @load_iris +X, y = @load_iris # a table and a vector model = PCA(maxoutdim=2) mach = machine(model, X) |> fit! @@ -533,17 +514,17 @@ In MLJ or MLJBase, bind an instance `model` to data with Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns - are of scitype `Continuous`; check the scitypes with `schema(X)` +are of scitype `Continuous`; check the column scitypes with `schema(X)`. Train the machine using `fit!(mach, rows=...)`. # Hyper-parameters - `maxoutdim=0`: Controls the the dimension (number of columns) of the output, - `outdim`. Specifically, `outdim = min(n, indim, maxoutdim)`, where `n` is the - number of observations and `indim` the input dimension. + `outdim`. Specifically, `outdim = min(n, indim, maxoutdim)`, where `n` is the + number of observations and `indim` the input dimension. - `kernel::Function=(x,y)->x'y`: The kernel function, takes in 2 vector arguments - x and y, returns a scalar value. Defaults to the dot product of `x` and `y`. + x and y, returns a scalar value. Defaults to the dot product of `x` and `y`. - `solver::Symbol=:auto`: solver to use for the eigenvalues, one of `:eig`(default, uses `LinearAlgebra.eigen`), `:eigs`(uses `Arpack.eigs`). - `inverse::Bool=true`: perform calculations needed for inverse transform @@ -555,20 +536,27 @@ Train the machine using `fit!(mach, rows=...)`. # Operations - `transform(mach, Xnew)`: Return a lower dimensional projection of the input `Xnew` having the same scitype as `X` above. +- `inverse_transform(mach, Xsmall)`: For a dimension-reduced table `Xsmall`, + such as returned by `transform`, reconstruct a table, having same the number + of columns as the original training data `X`, that transforms to `Xsmall`. 
+ Mathematically, `inverse_transform` is a right-inverse for the PCA projection + map, whose image is orthogonal to the kernel of that map. In particular, if + `Xsmall = transform(mach, Xnew)`, then `inverse_transform(Xsmall)` is + only an approximation to `Xnew`. # Fitted parameters The fields of `fitted_params(mach)` are: -- `projection`: Returns the projection matrix, which has size `(indim, outdim)`), where - `indim` and `outdim` are the number of features of the input and ouput respectively. +- `projection`: Returns the projection matrix, which has size `(indim, outdim)`, where + `indim` and `outdim` are the number of features of the input and ouput respectively. # Report The fields of `report(mach)` are: -- `indim`: The input dimensions. -- `outdim`: `min(n, indim, maxoutdim)`, where `n` is the number of observations. +- `indim`: Dimension (number of columns) of the training data and new data to be transformed. +- `outdim`: Dimension of transformed data. - `principalvars`: The variance of the principal components. # Examples @@ -579,7 +567,7 @@ using LinearAlgebra KPCA = @load KernelPCA pkg=MultivariateStats -X, y = @load_iris +X, y = @load_iris # a table and a vector function rbf_kernel(length_scale) return (x,y) -> norm(x-y)^2 / ((2 * length_scale)^2) @@ -613,7 +601,7 @@ In MLJ or MLJBase, bind an instance `model` to data with Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns - are of scitype `Continuous`; check the scitypes with `schema(X)` +are of scitype `Continuous`; check the column scitypes with `schema(X)`. Train the machine using `fit!(mach, rows=...)`. @@ -626,17 +614,17 @@ Train the machine using `fit!(mach, rows=...)`. - `maxiter::Int=100`: The maximum number of iterations. - `tol::Real=1e-6`: The convergence tolerance for change in the unmixing matrix W. - `mean::Union{Nothing, Real, Vector{Float64}}=nothing`: mean to use, if nothing (default) - centering is computed and applied, if zero, no centering; otherwise a vector of means can - be passed. + centering is computed and applied, if zero, no centering; otherwise a vector of means can + be passed. - `winit::Union{Nothing,Matrix{<:Real}}=nothing`: Initial guess for the unmixing matrix - `W`: either an empty matrix (for random initilization of `W`), a matrix of size `m × k` - (if `do_whiten` is true), or a matrix of size `m × k`. Here `m` is the number - of components (columns) of the input. + `W`: either an empty matrix (for random initilization of `W`), a matrix of size `m × k` + (if `do_whiten` is true), or a matrix of size `m × k`. Here `m` is the number + of components (columns) of the input. # Operations - `transform(mach, Xnew)`: Return the component-separated version of input - `Xnew`, which should have the same scitype as `X` above. + `Xnew`, which should have the same scitype as `X` above. # Fitted parameters @@ -648,10 +636,9 @@ The fields of `fitted_params(mach)` are: The fields of `report(mach)` are: -- `indim`: Dimension (number of columns/components) of the training - data and new data to be transformed. -- `outdim`: Dimension of transformed data (number of separated components). -- `mean`: The mean of the untransformed training data, of length `in_dim`. +- `indim`: Dimension (number of columns) of the training data and new data to be transformed. +- `outdim`: Dimension of transformed data. +- `mean`: The mean of the untransformed training data, of length `indim`. 
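The `transform`/`inverse_transform` pair documented for PCA and KernelPCA above can be sanity-checked with a round trip. A minimal sketch, not part of this diff, using PCA for simplicity:

```julia
using MLJ

PCA = @load PCA pkg=MultivariateStats

X, _ = @load_iris  # a table and a vector
mach = machine(PCA(maxoutdim=2), X) |> fit!

Xsmall  = transform(mach, X)               # 4 features reduced to 2
Xapprox = inverse_transform(mach, Xsmall)  # back to 4 columns

# The reconstruction is only approximate, because two principal components
# do not account for all of the variance in `X`:
maximum(abs, MLJ.matrix(Xapprox) - MLJ.matrix(X))  # small, but not zero
```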
# Examples

```
using MLJ
using LinearAlgebra


ICA = @load ICA pkg=MultivariateStats

-X, y = @load_iris
+time = 8 .\ 0:2001

-model = ICA(k = 2, tol=0.1)
-mach = machine(model, X) |> fit!
+sine_wave = sin.(2*time)
+square_wave = sign.(sin.(3*time))
+sawtooth_wave = repeat(collect(4 .\ 0:10), 182)
+signal = [sine_wave, square_wave, sawtooth_wave]
+add_noise(x) = x + randn()
+signal = map((x -> add_noise.(x)), signal)
+signal = permutedims(hcat(signal...))'
+
+mixing_matrix = [ 1 1 1; 0.5 2 1; 1.5 1 2]
+X = MLJ.table(signal * mixing_matrix)
+
+model = ICA(k = 3, tol=0.1)
+mach = machine(model, X) |> fit! # this errors ERROR: MethodError: no method matching size(::MultivariateStats.ICA{Float64}, ::Int64)
 Xproj = transform(mach, X)
+sum(Xproj - signal)
 ```

See also
@@ -678,17 +677,14 @@ ICA

"""

$(MMI.doc_header(LDA))

-`LDA`: Multiclass linear discriminant analysis. The algorithm learns a
-projection matrix `P` that projects a feature matrix `Xtrain` onto a lower dimensional
-space of dimension `out_dim` such that the trace of the transformed between-class
-scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the transformed
-within-class scatter matrix (`Pᵀ*Sw*P`).The projection matrix is scaled such that
-`Pᵀ*Sw*P=I` or `Pᵀ*Σw*P=I`(where `Σw` is the within-class covariance matrix) .
-Predicted class posterior probability for feature matrix `Xtest` are derived by
-applying a softmax transformationto a matrix `Pr`, such that rowᵢ of `Pr` contains
-computed distances(based on a distance metric) in the transformed space of rowᵢ in
-`Xtest` to the centroid of each class.
+`LDA`: Multiclass linear discriminant analysis learns a projection from a space of
+features to a lower dimensional space, in a way that attempts to preserve as much as
+possible the degree to which the target classes can be discriminated
+[(reference)](https://en.wikipedia.org/wiki/Linear_discriminant_analysis). This can be used
+either for dimension reduction of the features (see transform below) or for probabilistic
+classification of the target (see predict below).

+In the case of prediction, the class probability for a new observation reflects the proximity of that observation to training observations associated with that class, and how far away the observation is from those associated with other classes. Specifically, the distances, in the transformed (projected) space, of a new observation, from the centroid of each target class, are computed; the resulting vector of distances (times minus one) is passed to a softmax function to obtain a class probability prediction. Here "distance" is computed using a user-specified distance function.

# Training data

In MLJ or MLJBase, bind an instance `model` to data with
@@ -698,7 +694,7 @@ In MLJ or MLJBase, bind an instance `model` to data with
Where

- `X`: is any table of input features (eg, a `DataFrame`) whose columns
-  are of scitype `Continuous`; check the scitypes with `schema(X)`
+are of scitype `Continuous`; check the column scitypes with `schema(X)`.
- `y`: is the target, which can be any `AbstractVector` whose element
  scitype is `OrderedFactor` or `Multiclass`; check the scitype
  with `scitype(y)`

Train the machine using `fit!(mach, rows=...)`.

# Hyper-parameters

- `method::Symbol=:gevd`: The solver, one of `:gevd` or `:whiten` methods.
- `cov_w::CovarianceEstimator`=SimpleCovariance: An estimator for the within-class - covariance (used in computing within-class scatter matrix, Sw), by default set - to the standard `MultivariateStats.SimpleCovariance()` but - could be set to any robust estimator from `CovarianceEstimation.jl`. + covariance (used in computing within-class scatter matrix, Sw), by default set + to the standard `MultivariateStats.SimpleCovariance()` but + could be set to any robust estimator from `CovarianceEstimation.jl`. - `cov_b::CovarianceEstimator`=SimpleCovariance: The same as `cov_w` but for the between-class - covariance (used in computing between-class scatter matrix, Sb). + covariance (used in computing between-class scatter matrix, Sb). - `out_dim::Int=0`: The output dimension, i.e dimension of the transformed space, - automatically set if 0 is given (default). + automatically set if 0 is given (default). - `regcoef::Float64=1e-6`: The regularization coefficient (default value 1e-6). A positive - value `regcoef * eigmax(Sw)` where `Sw` is the within-class scatter matrix, is added - to the diagonal of Sw to improve numerical stability. This can be useful if using - the standard covariance estimator. + value `regcoef * eigmax(Sw)` where `Sw` is the within-class scatter matrix, is added + to the diagonal of Sw to improve numerical stability. This can be useful if using + the standard covariance estimator. - `dist=Distances.SqEuclidean()`: The distance metric to use when performing - classification (to compare the distance between a new point and centroids in - the transformed space); must be a subtype of `Distances.SemiMetric` from - Distances.jl, e.g., `Distances.CosineDist`. + classification (to compare the distance between a new point and centroids in + the transformed space); must be a subtype of `Distances.SemiMetric` from + Distances.jl, e.g., `Distances.CosineDist`. # Operations @@ -732,7 +728,7 @@ Train the machine using `fit!(mach, rows=...)`. features `Xnew` having the same scitype as `X` above. Predictions are probabilistic but uncalibrated. - `predict_mode(mach, Xnew)`: Return the modes of the probabilistic predictions - returned above. + returned above. # Fitted parameters @@ -740,25 +736,25 @@ Train the machine using `fit!(mach, rows=...)`. The fields of `fitted_params(mach)` are: - `projected_class_means`: The matrix comprised of class-specific means as columns, - of size `(in_dim, nc)`, where `in_dim` is the number of input features (columns) and - `nc` the number of target classes. -- `projection_matrix`: The learned projection matrix, of size `(in_dim, out_dim)`, where - `in_dim` and `out_dim` are the input and output dimensions respectively. + of size `(indim, nc)`, where `indim` is the number of input features (columns) and + `nc` the number of target classes. +- `projection_matrix`: The learned projection matrix, of size `(indim, outdim)`, where + `indim` and `outdim` are the input and output dimensions respectively. # Report The fields of `report(mach)` are: - `classes`: The classes seen during model fitting. -- `out_dim`: The dimensions the model is projected to. +- `outdim`: The dimensions the model is projected to. - `class_means`: The matrix comprised of class-specific means as columns (see above). -- `mean`: The mean of the untransformed training data, of length `in_dim`. +- `mean`: The mean of the untransformed training data, of length `indim`. - `class_weights`: The weights of each class. - `Sb`: The between class scatter matrix. - `Sw`: The within class scatter matrix. 
- `nc`: The number of classes directly observed in the training data (which can be - less than the total number of classes in the class pool) + less than the total number of classes in the class pool) # Examples @@ -767,7 +763,7 @@ using MLJ LDA = @load LDA pkg=MultivariateStats -X, y = @load_iris +X, y = @load_iris # a table and a vector model = LDA() mach = machine(model, X, y) |> fit! @@ -807,7 +803,7 @@ In MLJ or MLJBase, bind an instance `model` to data with Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns - are of scitype `Continuous`; check the scitypes with `schema(X)` +are of scitype `Continuous`; check the column scitypes with `schema(X)`. - `y`: is the target, which can be any `AbstractVector` whose element scitype is `OrderedFactor` or `Multiclass`; check the scitype with `scitype(y)` @@ -842,7 +838,7 @@ value `regcoef * eigmax(Sw)` where `Sw` is the within-class covariance estimator features `Xnew` having the same scitype as `X` above. Predictions are probabilistic but uncalibrated. - `predict_mode(mach, Xnew)`: Return the modes of the probabilistic predictions - returned above. + returned above. # Fitted parameters @@ -850,27 +846,27 @@ value `regcoef * eigmax(Sw)` where `Sw` is the within-class covariance estimator The fields of `fitted_params(mach)` are: - `projected_class_means`: The matrix comprised of class-specific means as columns, - of size `(in_dim, nc)`, where `in_dim` is the number of input features (columns) and - `nc` the number of target classes. -- `projection_matrix`: The learned projection matrix, of size `(in_dim, out_dim)`, where - `in_dim` and `out_dim` are the input and output dimensions respectively. + of size `(indim, nc)`, where `indim` is the number of input features (columns) and + `nc` the number of target classes. +- `projection_matrix`: The learned projection matrix, of size `(indim, outdim)`, where + `indim` and `outdim` are the input and output dimensions respectively. - `priors`: The class priors for classification. As inferred from training target `y`, - if not user-specified. A vector with order consistent with `levels(y)`. + if not user-specified. A vector with order consistent with `levels(y)`. # Report The fields of `report(mach)` are: - `classes`: The classes seen during model fitting. -- `out_dim`: The dimensions the model is projected to. +- `outdim`: The dimensions the model is projected to. - `class_means`: The matrix comprised of class-specific means as columns (see above). -- `mean`: The mean of the untransformed training data, of length `in_dim`. +- `mean`: The mean of the untransformed training data, of length `indim`. - `class_weights`: The weights of each class. - `Sb`: The between class scatter matrix. - `Sw`: The within class scatter matrix. - `nc`: The number of classes directly observed in the training data (which can be - less than the total number of classes in the class pool) + less than the total number of classes in the class pool) # Examples @@ -879,7 +875,7 @@ using MLJ BLDA = @load BayesianLDA pkg=MultivariateStats -X, y = @load_iris +X, y = @load_iris # a table and a vector model = BLDA() mach = machine(model, X, y) |> fit! @@ -921,7 +917,7 @@ In MLJ or MLJBase, bind an instance `model` to data with Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns - are of scitype `Continuous`; check the scitypes with `schema(X)` +are of scitype `Continuous`; check the column scitypes with `schema(X)`. 
- `y`: is the target, which can be any `AbstractVector` whose element scitype is `OrderedFactor` or `Multiclass`; check the scitype with `scitype(y)` @@ -931,13 +927,15 @@ Train the machine using `fit!(mach, rows=...)`. # Hyper-parameters - `normalize=true`: Option to normalize the between class variance for the number of - observations in each class, one of `true` or `false`. -- `out_dim`: The dimension of the transformed space to be used by `predict` and - `transform` methods, automatically set if 0 is given (default). + observations in each class, one of `true` or `false`. +- `out_dim`: the dimension of the space to be used by `predict` and + `transform` methods, automatically set if `0` is given (default). If a non-zero + `out_dim` is passed, then the actual output dimension used is `min(rank, out_dim)` + where `rank` is the rank of the within-class covariance matrix. - `dist=Distances.SqEuclidean()`: The distance metric to use when performing - classification (to compare the distance between a new point and centroids in - the transformed space); must be a subtype of `Distances.SemiMetric` from - Distances.jl, e.g., `Distances.CosineDist`. + classification (to compare the distance between a new point and centroids in + the transformed space); must be a subtype of `Distances.SemiMetric` from + Distances.jl, e.g., `Distances.CosineDist`. # Operations @@ -947,7 +945,7 @@ Train the machine using `fit!(mach, rows=...)`. features `Xnew` having the same scitype as `X` above. Predictions are probabilistic but uncalibrated. - `predict_mode(mach, Xnew)`: Return the modes of the probabilistic predictions - returned above. + returned above. # Fitted parameters @@ -966,10 +964,10 @@ The fields of `report(mach)` are: - `classes`: The classes seen during model fitting. - `class_means`: The matrix comprised of class-specific means as columns (see above). -- `mean`: The mean of the untransformed training data, of length `in_dim`. +- `mean`: The mean of the untransformed training data, of length `indim`. - `class_weights`: The weights of each class. - `nc`: The number of classes directly observed in the training data (which can be - less than the total number of classes in the class pool) + less than the total number of classes in the class pool) # Examples @@ -978,7 +976,7 @@ using MLJ SLDA = @load SubspaceLDA pkg=MultivariateStats -X, y = @load_iris +X, y = @load_iris # a table and a vector model = SLDA() mach = machine(model, X, y) |> fit! @@ -1012,7 +1010,7 @@ In MLJ or MLJBase, bind an instance `model` to data with Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns - are of scitype `Continuous`; check the scitypes with `schema(X)` +are of scitype `Continuous`; check the column scitypes with `schema(X)`. - `y`: is the target, which can be any `AbstractVector` whose element scitype is `OrderedFactor` or `Multiclass`; check the scitype with `scitype(y)` @@ -1022,14 +1020,16 @@ Train the machine using `fit!(mach, rows=...)`. # Hyper-parameters - `normalize=true`: Option to normalize the between class variance for the number of - observations in each class, one of `true` or `false`. -- `out_dim`: The dimension of the transformed space to be used by `predict` and - `transform` methods, automatically set if 0 is given (default). + observations in each class, one of `true` or `false`. +- `out_dim`: the dimension of the space to be used by `predict` and + `transform` methods, automatically set if `0` is given (default). 
If a non-zero + `out_dim` is passed, then the actual output dimension used is `min(rank, out_dim)` + where `rank` is the rank of the within-class covariance matrix. - `priors::Union{Nothing, Vector{Float64}}=nothing`: For use in prediction with Baye's - rule. If `priors = nothing` then `priors` are estimated from the class proportions - in the training data. Otherwise it requires a `Vector` containing class - probabilities with probabilities specified using the order given by `levels(y)` - where y is the target vector. + rule. If `priors = nothing` then `priors` are estimated from the class proportions + in the training data. Otherwise it requires a `Vector` containing class + probabilities with probabilities specified using the order given by `levels(y)` + where y is the target vector. # Operations @@ -1039,7 +1039,7 @@ Train the machine using `fit!(mach, rows=...)`. features `Xnew` having the same scitype as `X` above. Predictions are probabilistic but uncalibrated. - `predict_mode(mach, Xnew)`: Return the modes of the probabilistic predictions - returned above. + returned above. # Fitted parameters @@ -1047,12 +1047,12 @@ Train the machine using `fit!(mach, rows=...)`. The fields of `fitted_params(mach)` are: - `projected_class_means`: The matrix comprised of class-specific means as columns, - of size `(in_dim, nc)`, where `in_dim` is the number of input features (columns) and - `nc` the number of target classes. -- `projection_matrix`: The learned projection matrix, of size `(in_dim, out_dim)`, where - `in_dim` and `out_dim` are the input and output dimensions respectively. + of size `(indim, nc)`, where `indim` is the number of input features (columns) and + `nc` the number of target classes. +- `projection_matrix`: The learned projection matrix, of size `(indim, outdim)`, where + `indim` and `outdim` are the input and output dimensions respectively. - `priors`: The class priors for classification. As inferred from training target `y`, - if not user-specified. A vector with order consistent with `levels(y)`. + if not user-specified. A vector with order consistent with `levels(y)`. # Report @@ -1062,10 +1062,10 @@ The fields of `report(mach)` are: - `classes`: The classes seen during model fitting. - `class_means`: The matrix comprised of class-specific means as columns (see above). -- `mean`: The mean of the untransformed training data, of length `in_dim`. +- `mean`: The mean of the untransformed training data, of length `indim`. - `class_weights`: The weights of each class. - `nc`: The number of classes directly observed in the training data (which can be - less than the total number of classes in the class pool) + less than the total number of classes in the class pool) # Examples @@ -1074,7 +1074,7 @@ using MLJ BSLDA = @load BayesianSubspaceLDA pkg=MultivariateStats -X, y = @load_iris +X, y = @load_iris # a table and a vector model = BSLDA() mach = machine(model, X, y) |> fit! @@ -1106,7 +1106,7 @@ In MLJ or MLJBase, bind an instance `model` to data with Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns - are of scitype `Continuous`; check the scitypes with `schema(X)` +are of scitype `Continuous`; check the column scitypes with `schema(X)`. Train the machine using `fit!(mach, rows=...)`. @@ -1114,38 +1114,45 @@ Train the machine using `fit!(mach, rows=...)`. - `method::Symbol=:cm`: Method to use to solve the problem, one of `:ml`, `:em`, `:bayes`. - `maxoutdim=0`: Controls the the dimension (number of columns) of the output, - `outdim`. 
Specifically, `outdim = min(n, indim, maxoutdim)`, where `n` is the - number of observations and `indim` the input dimension. + `outdim`. Specifically, `outdim = min(n, indim, maxoutdim)`, where `n` is the + number of observations and `indim` the input dimension. - `maxiter::Int=1000`: Maximum number of iterations. - `tol::Real=1e-6`: Convergence tolerance. - `eta::Real=tol`: Variance lower bound. -- `mean::Union{Nothing, Real, Vector{Float64}}=nothing`: If set to nothing(default) - centering will be computed and applied, if set to `0` no - centering(assumed pre-centered), if a vector is passed, the centering is done with - that vector. +- `mean::Union{Nothing, Real, Vector{Float64}}=nothing`: If set to nothing (default) + centering will be computed and applied, if set to `0` no + centering(assumed pre-centered), if a vector is passed, the centering is done with + that vector. # Operations - `transform(mach, Xnew)`: Return a lower dimensional projection of the input `Xnew` having the same scitype as `X` above. +- `inverse_transform(mach, Xsmall)`: For a dimension-reduced table `Xsmall`, + such as returned by `transform`, reconstruct a table, having same the number + of columns as the original training data `X`, that transforms to `Xsmall`. + Mathematically, `inverse_transform` is a right-inverse for the PCA projection + map, whose image is orthogonal to the kernel of that map. In particular, if + `Xsmall = transform(mach, Xnew)`, then `inverse_transform(Xsmall)` is + only an approximation to `Xnew`. # Fitted parameters The fields of `fitted_params(mach)` are: -- `projection`: Returns the projection matrix, which has size `(indim, outdim)`), where - `indim` and `outdim` are the number of features of the input and ouput respectively. +- `projection`: Returns the projection matrix, which has size `(indim, outdim)`, where + `indim` and `outdim` are the number of features of the input and ouput respectively. Each column of the projection matrix corresponds to a factor. # Report The fields of `report(mach)` are: -- `indim`: The input dimensions. -- `outdim`: `min(n, indim, maxoutdim)`, where `n` is the number of observations. +- `indim`: Dimension (number of columns) of the training data and new data to be transformed. +- `outdim`: Dimension of transformed data (number of factors). - `variance`: The variance of the factors. - `covariance_matrix`: The estimated covariance matrix. -- `mean`: The mean of the untransformed training data, of length `in_dim`. +- `mean`: The mean of the untransformed training data, of length `indim`. - `loadings`: The factor loadings. # Examples @@ -1155,7 +1162,7 @@ using MLJ FA = @load FactorAnalysis pkg=MultivariateStats -X, y = @load_iris +X, y = @load_iris # a table and a vector model = FA(maxoutdim=2) mach = machine(model, X) |> fit! @@ -1176,7 +1183,7 @@ $(MMI.doc_header(PPCA)) form of the Gaussian distribution in which the number of free parameters can be restricted while still allowing the model to capture the dominant correlations in a data set. It is expressed as the maximum likelihood solution of a probabilistic -latent variable mode. +latent variable model. # Training data @@ -1187,44 +1194,51 @@ In MLJ or MLJBase, bind an instance `model` to data with Where - `X`: is any table of input features (eg, a `DataFrame`) whose columns - are of scitype `Continuous`; check the scitypes with `schema(X)` +are of scitype `Continuous`; check the column scitypes with `schema(X)`. Train the machine using `fit!(mach, rows=...)`. 
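The `outdim = min(n, indim, maxoutdim)` rule quoted for these decomposition models can be checked directly in the report. A minimal sketch, not part of this diff, using FactorAnalysis on the 4-feature iris table; the `report` field names are as documented above:

```julia
using MLJ

FactorAnalysis = @load FactorAnalysis pkg=MultivariateStats

X, _ = @load_iris                       # n = 150 observations, indim = 4 features
mach = machine(FactorAnalysis(maxoutdim=2), X) |> fit!

r = report(mach)
r.indim, r.outdim   # expected: (4, 2), since outdim = min(150, 4, 2) = 2
# Per the rule above, a `maxoutdim` larger than `indim` would instead be
# capped at `indim` (here 4).
```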
# Hyper-parameters - `maxoutdim=0`: Controls the the dimension (number of columns) of the output, - `outdim`. Specifically, `outdim = min(n, indim, maxoutdim)`, where `n` is the - number of observations and `indim` the input dimension. + `outdim`. Specifically, `outdim = min(n, indim, maxoutdim)`, where `n` is the + number of observations and `indim` the input dimension. - `method::Symbol=:ml`: The method to use to solve the problem, one of `:ml`, `:em`, `:bayes`. - `maxiter::Int=1000`: The maximum number of iterations. - `tol::Real=1e-6`: The convergence tolerance. -- `mean::Union{Nothing, Real, Vector{Float64}}=nothing`: If set to nothing(default) - centering will be computed and applied, if set to `0` no - centering(assumed pre-centered), if a vector is passed, the centering is done with - that vector. +- `mean::Union{Nothing, Real, Vector{Float64}}=nothing`: If set to nothing (default) + centering will be computed and applied, if set to `0` no + centering(assumed pre-centered), if a vector is passed, the centering is done with + that vector. # Operations - `transform(mach, Xnew)`: Return a lower dimensional projection of the input `Xnew` having the same scitype as `X` above. +- `inverse_transform(mach, Xsmall)`: For a dimension-reduced table `Xsmall`, + such as returned by `transform`, reconstruct a table, having same the number + of columns as the original training data `X`, that transforms to `Xsmall`. + Mathematically, `inverse_transform` is a right-inverse for the PCA projection + map, whose image is orthogonal to the kernel of that map. In particular, if + `Xsmall = transform(mach, Xnew)`, then `inverse_transform(Xsmall)` is + only an approximation to `Xnew`. # Fitted parameters The fields of `fitted_params(mach)` are: -- `projection`: Returns the projection matrix, which has size `(indim, outdim)`), where - `indim` and `outdim` are the number of features of the input and ouput respectively. +- `projection`: Returns the projection matrix, which has size `(indim, outdim)`, where + `indim` and `outdim` are the number of features of the input and ouput respectively. Each column of the projection matrix corresponds to a principal component. # Report The fields of `report(mach)` are: -- `indim`: The input dimensions. -- `outdim`: `min(n, indim, maxoutdim)`, where `n` is the number of observations. +- `indim`: Dimension (number of columns) of the training data and new data to be transformed. +- `outdim`: Dimension of transformed data. - `tvat`: The variance of the components. - `loadings`: The models loadings, weights for each variable used when calculating - principal components. + principal components. # Examples @@ -1233,7 +1247,7 @@ using MLJ PPCA = @load PPCA pkg=MultivariateStats -X, y = @load_iris +X, y = @load_iris # a table and a vector model = PPCA(maxoutdim=2) mach = machine(model, X) |> fit! 
diff --git a/src/models/decomposition_models.jl b/src/models/decomposition_models.jl index d763404..3fda057 100644 --- a/src/models/decomposition_models.jl +++ b/src/models/decomposition_models.jl @@ -43,7 +43,7 @@ function MMI.fit(model::PCA, verbosity::Int, X) ) cache = nothing report = ( - indim=MS.size(fitresult,1) + indim=MS.size(fitresult,1), outdim=MS.size(fitresult,2), tprincipalvar=MS.tprincipalvar(fitresult), tresidualvar=MS.tresidualvar(fitresult), From 00e752c58ecc26b50ea95951919175c26fe27782 Mon Sep 17 00:00:00 2001 From: josephsdavid Date: Mon, 8 Aug 2022 17:08:25 -0500 Subject: [PATCH 20/22] outdim --- src/MLJMultivariateStatsInterface.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/MLJMultivariateStatsInterface.jl b/src/MLJMultivariateStatsInterface.jl index f7234a4..35bec3c 100644 --- a/src/MLJMultivariateStatsInterface.jl +++ b/src/MLJMultivariateStatsInterface.jl @@ -710,7 +710,7 @@ Train the machine using `fit!(mach, rows=...)`. could be set to any robust estimator from `CovarianceEstimation.jl`. - `cov_b::CovarianceEstimator`=SimpleCovariance: The same as `cov_w` but for the between-class covariance (used in computing between-class scatter matrix, Sb). -- `out_dim::Int=0`: The output dimension, i.e dimension of the transformed space, +- `outdim::Int=0`: The output dimension, i.e dimension of the transformed space, automatically set if 0 is given (default). - `regcoef::Float64=1e-6`: The regularization coefficient (default value 1e-6). A positive value `regcoef * eigmax(Sw)` where `Sw` is the within-class scatter matrix, is added @@ -819,7 +819,7 @@ Train the machine using `fit!(mach, rows=...)`. could be set to any robust estimator from `CovarianceEstimation.jl`. - `cov_b::CovarianceEstimator`=SimpleCovariance: The same as `cov_w` but for the between-class covariance (used in computing between-class scatter matrix, Sb). -- `out_dim::Int=0`: The output dimension, i.e dimension of the transformed space, +- `outdim::Int=0`: The output dimension, i.e dimension of the transformed space, automatically set if 0 is given (default). - `regcoef::Float64=1e-6`: The regularization coefficient (default value 1e-6). A positive value `regcoef * eigmax(Sw)` where `Sw` is the within-class covariance estimator, is added @@ -928,9 +928,9 @@ Train the machine using `fit!(mach, rows=...)`. - `normalize=true`: Option to normalize the between class variance for the number of observations in each class, one of `true` or `false`. -- `out_dim`: the dimension of the space to be used by `predict` and +- `outdim`: the dimension of the space to be used by `predict` and `transform` methods, automatically set if `0` is given (default). If a non-zero - `out_dim` is passed, then the actual output dimension used is `min(rank, out_dim)` + `outdim` is passed, then the actual output dimension used is `min(rank, outdim)` where `rank` is the rank of the within-class covariance matrix. - `dist=Distances.SqEuclidean()`: The distance metric to use when performing classification (to compare the distance between a new point and centroids in @@ -1021,9 +1021,9 @@ Train the machine using `fit!(mach, rows=...)`. - `normalize=true`: Option to normalize the between class variance for the number of observations in each class, one of `true` or `false`. -- `out_dim`: the dimension of the space to be used by `predict` and +- `outdim`: the dimension of the space to be used by `predict` and `transform` methods, automatically set if `0` is given (default). 
If a non-zero - `out_dim` is passed, then the actual output dimension used is `min(rank, out_dim)` + `outdim` is passed, then the actual output dimension used is `min(rank, outdim)` where `rank` is the rank of the within-class covariance matrix. - `priors::Union{Nothing, Vector{Float64}}=nothing`: For use in prediction with Baye's rule. If `priors = nothing` then `priors` are estimated from the class proportions From 49f1173484cd997b2a3bb08ae4323c553f97e896 Mon Sep 17 00:00:00 2001 From: josephsdavid Date: Mon, 8 Aug 2022 22:37:38 -0500 Subject: [PATCH 21/22] wrapping up --- src/MLJMultivariateStatsInterface.jl | 131 ++++++--------------------- 1 file changed, 28 insertions(+), 103 deletions(-) diff --git a/src/MLJMultivariateStatsInterface.jl b/src/MLJMultivariateStatsInterface.jl index 35bec3c..35fcb8e 100644 --- a/src/MLJMultivariateStatsInterface.jl +++ b/src/MLJMultivariateStatsInterface.jl @@ -33,81 +33,6 @@ const FactorAnalysisResultType = MS.FactorAnalysis const default_kernel = (x, y) -> x'y #default kernel used in KernelPCA # Definitions of model descriptions for use in model doc-strings. -const PCA_DESCR = """ - Principal component analysis. Learns a linear transformation to - project the data on a lower dimensional space while preserving most of the initial - variance. - """ -const KPCA_DESCR = "Kernel principal component analysis." -const ICA_DESCR = "Independent component analysis." -const PPCA_DESCR = "Probabilistic principal component analysis" -const FactorAnalysis_DESCR = "Factor Analysis" -const LDA_DESCR = """ - Multiclass linear discriminant analysis. The algorithm learns a - projection matrix `P` that projects a feature matrix `Xtrain` onto a lower dimensional - space of dimension `outdim` such that the trace of the transformed between-class - scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the transformed - within-class scatter matrix (`Pᵀ*Sw*P`).The projection matrix is scaled such that - `Pᵀ*Sw*P=I` or `Pᵀ*Σw*P=I`(where `Σw` is the within-class covariance matrix) . - Predicted class posterior probability for feature matrix `Xtest` are derived by - applying a softmax transformationto a matrix `Pr`, such that rowᵢ of `Pr` contains - computed distances(based on a distance metric) in the transformed space of rowᵢ in - `Xtest` to the centroid of each class. - """ -const BayesianLDA_DESCR = """ - Bayesian Multiclass linear discriminant analysis. The algorithm - learns a projection matrix `P` that projects a feature matrix `Xtrain` onto a lower - dimensional space of dimension `outdim` such that the trace of the transformed - between-class scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the - transformed within-class scatter matrix (`Pᵀ*Sw*P`). The projection matrix is scaled - such that `Pᵀ*Sw*P = n` or `Pᵀ*Σw*P=I` (Where `n` is the number of training samples - and `Σw` is the within-class covariance matrix). - Predicted class posterior probability distibution are derived by applying Bayes rule - with a multivariate Gaussian class-conditional distribution. - """ -const SubspaceLDA_DESCR = """ - Multiclass linear discriminant analysis. Suitable for high - dimensional data (Avoids computing scatter matrices `Sw` ,`Sb`). 
The algorithm learns a - projection matrix `P = W*L` that projects a feature matrix `Xtrain` onto a lower - dimensional space of dimension `min(rank(Sw), nc - 1)` such that the trace of the transformed - between-class scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the - transformed within-class scatter matrix (`Pᵀ*Sw*P`). The projection matrix is scaled - such that `Pᵀ*Sw*P = mult*I` or `Pᵀ*Σw*P=mult/(n-nc)*I` (where `n` is the number of - training samples, mult` is one of `n` or `1` depending on whether `Sb` is normalized, - `Σw` is the within-class covariance matrix, and `nc` is the number of unique classes - in `y`) and also obeys `Wᵀ*Sb*p = λ*Wᵀ*Sw*p`, for every column `p` in `P`. - Predicted class posterior probability for feature matrix `Xtest` are derived by - applying a softmax transformation to a matrix `Pr`, such that rowᵢ of `Pr` contains - computed distances(based on a distance metric) in the transformed space of rowᵢ in - `Xtest` to the centroid of each class. - """ -const BayesianSubspaceLDA_DESCR = """ - Bayesian Multiclass linear discriminant analysis. Suitable for high dimensional data - (Avoids computing scatter matrices `Sw` ,`Sb`). The algorithm learns a projection - matrix `P = W*L` (`Sw`), that projects a feature matrix `Xtrain` onto a lower - dimensional space of dimension `nc-1` such that the trace of the transformed - between-class scatter matrix(`Pᵀ*Sb*P`) is maximized relative to the trace of the - transformed within-class scatter matrix (`Pᵀ*Sw*P`). The projection matrix is scaled - such that `Pᵀ*Sw*P = mult*I` or `Pᵀ*Σw*P=mult/(n-nc)*I` and also obeys `Wᵀ*Sb*p = λ*Wᵀ*Sw*p`, for every column `p` in `P`. - Posterior class probability distibution are derived by applying Bayes rule with a - multivariate Gaussian class-conditional distribution - """ -const LinearRegressor_DESCR = """ - Linear Regression. Learns a linear combination of given - variables to fit the response by minimizing the squared error between. - """ -const MultitargetLinearRegressor_DESCR = """ - Multitarget Linear Regression. Learns linear combinations of given - variables to fit the responses by minimizing the squared error between. - """ -const RidgeRegressor_DESCR = """ - Ridge regressor with regularization parameter lambda. Learns a - linear regression with a penalty on the l2 norm of the coefficients. - """ -const MultitargetRidgeRegressor_DESCR = """ - Multitarget Ridge regressor with regularization parameter lambda. Learns a - Multitarget linear regression with a penalty on the l2 norm of the coefficients. - """ const PKG = "MLJMultivariateStatsInterface" # =================================================================== @@ -430,13 +355,13 @@ Train the machine using `fit!(mach, rows=...)`. # Hyper-parameters -- `maxoutdim=0`: Together with `pratio`, controls the output dimension outdim chosen +- `maxoutdim=0`: Together with `variance_ratio`, controls the output dimension outdim chosen by the model. Specifically, suppose that k is the smallest integer such that retaining -the k most significant principal components accounts for `pratio` of the total variance -in the training data. Then outdim = min(k, maxoutdim). If maxoutdim=0 (default) then the +the k most significant principal components accounts for `variance_ratio` of the total variance +in the training data. Then outdim = min(outdim, maxoutdim). If maxoutdim=0 (default) then the effective maxoutdim is min(n, indim - 1) where n is the number of observations and indim the number of features in the training data. 
-- `pratio::Float64=0.99`: The ratio of variance preserved after the transformation +- `variance_ratio::Float64=0.99`: The ratio of variance preserved after the transformation - `method=:auto`: The method to use to solve the problem. Choices are - `:svd`: Support Vector Decomposition of the matrix. - `:cov`: Covariance matrix decomposition. @@ -607,7 +532,7 @@ Train the machine using `fit!(mach, rows=...)`. # Hyper-parameters -- `k::Int=0`: The number of independent components to recover, set automatically if `0`. +- `outdim::Int=0`: The number of independent components to recover, set automatically if `0`. - `alg::Symbol=:fastica`: The algorithm to use (only `:fastica` is supported at the moment). - `fun::Symbol=:tanh`: The approximate neg-entropy function, one of `:tanh`, `:gaus`. - `do_whiten::Bool=true`: Whether or not to perform pre-whitening. @@ -630,7 +555,8 @@ Train the machine using `fit!(mach, rows=...)`. The fields of `fitted_params(mach)` are: -# TODO: Now that this is fixed, document +- `projection`: The estimated component matrix. +- `mean`: The estimated mean vector. # Report @@ -661,7 +587,7 @@ signal = permutedims(hcat(signal...))' mixing_matrix = [ 1 1 1; 0.5 2 1; 1.5 1 2] X = MLJ.table(signal * mixing_matrix) -model = ICA(k = 3, tol=0.1) +model = ICA(outim = 3, tol=0.1) mach = machine(model, X) |> fit! # this errors ERROR: MethodError: no method matching size(::MultivariateStats.ICA{Float64}, ::Int64) Xproj = transform(mach, X) @@ -737,7 +663,7 @@ The fields of `fitted_params(mach)` are: - `projected_class_means`: The matrix comprised of class-specific means as columns, of size `(indim, nc)`, where `indim` is the number of input features (columns) and - `nc` the number of target classes. + `nclasses` the number of target classes. - `projection_matrix`: The learned projection matrix, of size `(indim, outdim)`, where `indim` and `outdim` are the input and output dimensions respectively. @@ -747,13 +673,13 @@ The fields of `report(mach)` are: - `classes`: The classes seen during model fitting. - `outdim`: The dimensions the model is projected to. -- `class_means`: The matrix comprised of class-specific means as +- `projected_class_means`: The matrix comprised of class-specific means as columns (see above). - `mean`: The mean of the untransformed training data, of length `indim`. - `class_weights`: The weights of each class. - `Sb`: The between class scatter matrix. - `Sw`: The within class scatter matrix. -- `nc`: The number of classes directly observed in the training data (which can be +- `nclasses`: The number of classes directly observed in the training data (which can be less than the total number of classes in the class pool) # Examples @@ -847,7 +773,7 @@ The fields of `fitted_params(mach)` are: - `projected_class_means`: The matrix comprised of class-specific means as columns, of size `(indim, nc)`, where `indim` is the number of input features (columns) and - `nc` the number of target classes. + `nclasses` the number of target classes. - `projection_matrix`: The learned projection matrix, of size `(indim, outdim)`, where `indim` and `outdim` are the input and output dimensions respectively. - `priors`: The class priors for classification. As inferred from training target `y`, @@ -859,13 +785,13 @@ The fields of `report(mach)` are: - `classes`: The classes seen during model fitting. - `outdim`: The dimensions the model is projected to. 
@@ -859,13 +785,13 @@ The fields of `report(mach)` are:

 - `classes`: The classes seen during model fitting.
 - `outdim`: The dimensions the model is projected to.
-- `class_means`: The matrix comprised of class-specific means as
+- `projected_class_means`: The matrix comprised of class-specific means as
   columns (see above).
 - `mean`: The mean of the untransformed training data, of length `indim`.
 - `class_weights`: The weights of each class.
 - `Sb`: The between class scatter matrix.
 - `Sw`: The within class scatter matrix.
-- `nc`: The number of classes directly observed in the training data (which can be
+- `nclasses`: The number of classes directly observed in the training data (which can be
   less than the total number of classes in the class pool)

 # Examples

@@ -873,11 +799,11 @@ The fields of `report(mach)` are:

 ```
 using MLJ

-BLDA = @load BayesianLDA pkg=MultivariateStats
+BayesianLDA = @load BayesianLDA pkg=MultivariateStats

 X, y = @load_iris # a table and a vector

-model = BLDA()
+model = BayesianLDA()
 mach = machine(model, X, y) |> fit!

 Xproj = transform(mach, X)
@@ -903,7 +829,6 @@ In the case of classification, the class probability for a new
 observation reflects the proximity of that observation to training
 observations associated with that class, and how far away the observation is
 from those associated with other classes. Specifically, the distances, in the transformed
-(projected) space, of a new observation, from the centroid of each target class, is
 computed; the resulting vector of distances (times minus one) is passed to a
 softmax function to obtain a class probability prediction. Here "distance" is
 computed using a user-specified distance function.
@@ -952,9 +877,10 @@ Train the machine using `fit!(mach, rows=...)`.

 The fields of `fitted_params(mach)` are:

-- `class_means`: The matrix comprised of class-specific means as
+- `projected_class_means`: The matrix comprised of class-specific means as
   columns (of size `(d,m)`), where d corresponds to input features and m corresponds to class.
-- `projection_matrix`: The matrix used to project `X` into a lower dimensional space.
+- `projection_matrix`: The learned projection matrix, of size `(indim, outdim)`, where
+  `indim` and `outdim` are the input and output dimensions respectively.

 # Report

@@ -962,11 +888,11 @@ The fields of `report(mach)` are:

 - `explained_variance_ratio`: The ratio of explained variance to total variance. Each dimension corresponds to an eigenvalue.
 - `classes`: The classes seen during model fitting.
-- `class_means`: The matrix comprised of class-specific means as
+- `projected_class_means`: The matrix comprised of class-specific means as
   columns (see above).
 - `mean`: The mean of the untransformed training data, of length `indim`.
 - `class_weights`: The weights of each class.
-- `nc`: The number of classes directly observed in the training data (which can be
+- `nclasses`: The number of classes directly observed in the training data (which can be
   less than the total number of classes in the class pool)

 # Examples
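The "softmax of minus the distances" rule described in the classification-probability hunk above is compact enough to sketch directly. This is an illustrative reconstruction only, not the package's internal code; plain Euclidean distance and a hand-rolled softmax are assumptions:

```
using LinearAlgebra

# Class probabilities for one observation already projected into the
# discriminant space: distance to each class centroid, negated, softmaxed.
function class_probabilities(x_proj::AbstractVector, centroids::AbstractMatrix)
    dists = [norm(x_proj - centroids[:, j]) for j in 1:size(centroids, 2)]
    scores = exp.(-dists)
    return scores ./ sum(scores)
end

# Toy usage: a 2-D projected point and three class centroids (columns).
x = [0.1, 0.2]
C = [0.0 1.0 3.0;
     0.0 1.0 3.0]
class_probabilities(x, C)   # largest probability for the nearest centroid
```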
@@ -974,11 +900,11 @@ The fields of `report(mach)` are:

 ```
 using MLJ

-SLDA = @load SubspaceLDA pkg=MultivariateStats
+SubspaceLDA = @load SubspaceLDA pkg=MultivariateStats

 X, y = @load_iris # a table and a vector

-model = SLDA()
+model = SubspaceLDA()
 mach = machine(model, X, y) |> fit!

 Xproj = transform(mach, X)
@@ -992,7 +918,6 @@ See also
 SubspaceLDA
 """

-
 $(MMI.doc_header(BayesianSubspaceLDA))
@@ -1048,7 +973,7 @@ The fields of `fitted_params(mach)` are:

 - `projected_class_means`: The matrix comprised of class-specific means as
   columns, of size `(indim, nc)`, where `indim` is the number of input features (columns) and
-  `nc` the number of target classes.
+  `nclasses` the number of target classes.
 - `projection_matrix`: The learned projection matrix, of size `(indim, outdim)`, where
   `indim` and `outdim` are the input and output dimensions respectively.
 - `priors`: The class priors for classification. As inferred from training target `y`,
@@ -1060,11 +985,11 @@ The fields of `report(mach)` are:

 - `explained_variance_ratio`: The ratio of explained variance to total variance. Each dimension corresponds to an eigenvalue.
 - `classes`: The classes seen during model fitting.
-- `class_means`: The matrix comprised of class-specific means as
+- `projected_class_means`: The matrix comprised of class-specific means as
   columns (see above).
 - `mean`: The mean of the untransformed training data, of length `indim`.
 - `class_weights`: The weights of each class.
-- `nc`: The number of classes directly observed in the training data (which can be
+- `nclasses`: The number of classes directly observed in the training data (which can be
   less than the total number of classes in the class pool)

 # Examples

@@ -1072,11 +997,11 @@ The fields of `report(mach)` are:

 ```
 using MLJ

-BSLDA = @load BayesianSubspaceLDA pkg=MultivariateStats
+BayesianSubspaceLDA = @load BayesianSubspaceLDA pkg=MultivariateStats

 X, y = @load_iris # a table and a vector

-model = BSLDA()
+model = BayesianSubspaceLDA()
 mach = machine(model, X, y) |> fit!

 Xproj = transform(mach, X)

From fd5b524b9bd0ef88c40dd4b37279a16226710af3 Mon Sep 17 00:00:00 2001
From: josephsdavid
Date: Mon, 8 Aug 2022 22:43:09 -0500
Subject: [PATCH 22/22] updated!

---
 src/MLJMultivariateStatsInterface.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/MLJMultivariateStatsInterface.jl b/src/MLJMultivariateStatsInterface.jl
index 35fcb8e..bbc30c6 100644
--- a/src/MLJMultivariateStatsInterface.jl
+++ b/src/MLJMultivariateStatsInterface.jl
@@ -578,7 +578,7 @@ time = 8 .\ 0:2001
 sine_wave = sin.(2*time)
 square_wave = sign.(sin.(3*time))
-sawtooth_wave = repeat(collect(4 .\ 0:10), 182)
+sawtooth_wave = repeat(collect(0:10) / 4, 182)
 signal = [sine_wave, square_wave, sawtooth_wave]
 add_noise(x) = x + randn()
 signal = map((x -> add_noise.(x)), signal)
@@ -587,11 +587,11 @@ signal = permutedims(hcat(signal...))'
 mixing_matrix = [ 1 1 1; 0.5 2 1; 1.5 1 2]
 X = MLJ.table(signal * mixing_matrix)

-model = ICA(outim = 3, tol=0.1)
+model = ICA(k = 3, tol=0.1)
 mach = machine(model, X) |> fit! # this errors ERROR: MethodError: no method matching size(::MultivariateStats.ICA{Float64}, ::Int64)

 Xproj = transform(mach, X)
-sum(Xproj - signal)
+@info sum(abs, Xproj - signal)
 ```

 See also