Validation Suite Modules #81

Open · wants to merge 20 commits into base: main
28 changes: 28 additions & 0 deletions Validation/README.md
@@ -0,0 +1,28 @@
## List of validation tests provided by Vectice (source code from PiML)
| **Category** | **Test Name** | **Function** |
|------------------------------|----------------------------------|--------------------------------------|
| **Classification Tests** | ROC Curve | `plot_roc_curve` |
| | Confusion Matrix | `conf_matrix` |
| | Explainability | `explainability` |
| | Feature Importance | `feature_importance` |
| | Label Drift | `label_drift` |
| | Prediction Drift | `prediction_drift` |
|                              | Recall by Class                  | `recall_by_class`                     |
|                              | Precision by Class               | `precision_by_class`                  |
|                              | **Binary Classification suite**  | `plot_roc_curve`, `conf_matrix`, `explainability`, `feature_importance`, `label_drift`, `prediction_drift` |
|                              | **Multiclass Classification suite** | `plot_roc_curve`, `conf_matrix`, `explainability`, `feature_importance`, `label_drift`, `prediction_drift`, `recall_by_class`, `precision_by_class` |
| **Data Privacy Tests** | Sensitive Data Check | `sensitive_data_check` |
| | PII Check | `pii_check` |
| | Sensitive Data Type Check | `sensitive_data_type_check` |
| **Data Quality Tests** | Dataset Split Validation | `test_dataset_split` |
| | IQR and Outliers | `iqr_and_outliers` |
| | **Dataset Quality suite** | `test_dataset_split`, `iqr_and_outliers` |
| **Regression Tests** | Residuals Plot | `plot_residuals` |
| | R² Score | `r2_score` |
| | Explainability | `explainability` |
| | Feature Importance | `feature_importance` |
| | Target Drift | `target_drift` |
| | Prediction Drift | `prediction_drift` |
| | **Regression suite** | `plot_residuals`, `r2_score`, `explainability`, `feature_importance`, `target_drift`, `prediction_drift` |
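
These suites can be combined with custom tests in a master configuration file (see `master_config_test_suites.py`). A minimal sketch, using the import path from this repository:

```python
# Bundle Vectice-provided binary classification tests into a named suite.
from vectice.models.test_library.binary_classification_test import (
    plot_roc_curve,
    conf_matrix,
)

# Map a suite name to the list of test functions it should run.
my_quick_suite = {
    "quick_checks": [plot_roc_curve, conf_matrix],
}
```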


65 changes: 65 additions & 0 deletions Validation/master_config_test_suites.py
@@ -0,0 +1,65 @@
# Import the Vectice-provided probability of default (PD) validation tests
from vectice.models.test_library.binary_classification_test import (
plot_roc_curve,
conf_matrix,
explainability,
feature_importance,
label_drift,
prediction_drift,
)


# custom data quality validation tests
from test_modules.data_quality_modules import (
test_dataset_split,
iqr_and_outliers,
)

# custom data privacy validation tests
from test_modules.data_privacy_modules import (
sensitive_data_check,
sensitive_data_type_check,
pii_check,
)

# custom correlation matrix validation test
from test_modules.correlation_matrix_module import (
    plot_correlation_matrix,
)


# This master test suite file maps all ADDITIONAL suites of tests that can be run.
# The tests can be provided by Vectice or be custom functions from your own modules.
# Vectice uses this configuration to identify and bundle the available tests into
# suites when you run your validations in your notebook.

# Map all validation tests to be run for the PD model suites
PD_model_suite = {
"binary_suite": [
plot_roc_curve,
conf_matrix,
explainability,
feature_importance,
label_drift,
prediction_drift,
],
"data_quality_ext": [
test_dataset_split,
iqr_and_outliers,
plot_correlation_matrix,
],
"corr_matrix_ext": [
plot_correlation_matrix,
],
}

# Map the tests to be used for data privacy validation (bundled here as the robustness suite)
Robustness_suite = {
"sensitive_data_check": sensitive_data_check,
"pii_check": pii_check,
"sensitive_data_type_check": sensitive_data_type_check,
"data_privacy_full_suite": [
sensitive_data_check,
pii_check,
sensitive_data_type_check,
],
}
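
# --- Usage sketch (an assumption, not part of the documented Vectice API) ---
# The custom tests in this repository follow the signature illustrated in
# test_modules/correlation_matrix_module.py. Assuming a suite's tests share that
# convention, a hypothetical helper can exercise a suite manually for debugging:
def run_suite_manually(suite, training_df, testing_df, target_column, predictor):
    """Call each test in a suite list and collect the returned results."""
    results = []
    for test in suite:
        results.append(
            test(
                training_df=training_df,
                testing_df=testing_df,
                target_column=target_column,
                predictor=predictor,
                predict_proba_train=None,
                predict_proba_test=None,
            )
        )
    return results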
58 changes: 58 additions & 0 deletions Validation/test_modules/correlation_matrix_module.py
@@ -0,0 +1,58 @@
from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Any, Dict

import matplotlib.pyplot as plt
import seaborn as sns


if TYPE_CHECKING:
from numpy.typing import ArrayLike
from pandas import DataFrame

from vectice.models.validation import TestSuiteReturnType

_logger = logging.getLogger(__name__)

def plot_correlation_matrix(
    training_df: DataFrame,
    testing_df: DataFrame,
    target_column: str,
    predictor: Any,
    predict_proba_train: ArrayLike | None,
    predict_proba_test: ArrayLike | None,
    internal_parameters: Dict[str, Any] | None = None,
) -> TestSuiteReturnType:
    from vectice.models.validation import TestSuiteReturnType

    # Avoid a mutable default argument; fall back to the documented defaults
    if internal_parameters is None:
        internal_parameters = {"subset_columns": None, "cmap": "Blues"}

    # Default to the target column plus the first ten other columns
    subset_columns = internal_parameters.get("subset_columns") or (
        [target_column]
        + [col for col in training_df.columns[:10] if col != target_column]
    )
    cmap = internal_parameters.get("cmap", "Blues")

# Select subset of columns
training_df = training_df[subset_columns]

# Calculate the correlation matrix
corr_matrix = training_df.corr()

# Plot the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap=cmap, fmt=".2f", annot_kws={"fontsize": 12}, cbar=True)
plt.title("Correlation Matrix")

# Save the plot
file_path = "Correlation_matrix_plot.png"
plt.savefig(file_path)
plt.close()

    # Return in the format expected by Vectice
return TestSuiteReturnType(
metrics={},
properties={},
tables=[],
attachments=[file_path],
)
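

if __name__ == "__main__":
    # Usage sketch with toy data (assumes vectice is installed). The predictor
    # and probability arguments are required by the signature but unused here.
    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    demo_df = pd.DataFrame(
        rng.normal(size=(100, 4)), columns=["a", "b", "c", "TARGET"]
    )
    plot_correlation_matrix(
        training_df=demo_df,
        testing_df=demo_df,
        target_column="TARGET",
        predictor=None,
        predict_proba_train=None,
        predict_proba_test=None,
    )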
148 changes: 148 additions & 0 deletions Validation/test_modules/data_privacy_modules.py
@@ -0,0 +1,148 @@
# Custom tests that can be used to validate your dataset's security
from __future__ import annotations

from typing import TYPE_CHECKING

import pandas as pd

if TYPE_CHECKING:
from numpy.typing import ArrayLike
from pandas import DataFrame

from vectice.models.validation_dataset import TestSuiteReturnType


def sensitive_data_check(
dataset: DataFrame | None = None,
training_df: DataFrame | None = None,
testing_df: DataFrame | None = None,
feature_columns: ArrayLike | list | None = None,
target_column: ArrayLike | str | None = None,
sensitive_keywords: list | None = None,
) -> TestSuiteReturnType | None:
from vectice import Table
from vectice.models.validation_dataset import TestSuiteReturnType

if dataset is None or sensitive_keywords is None:
return None

# Initialize a dictionary to hold counts of sensitive data
sensitive_counts = {keyword: 0 for keyword in sensitive_keywords}

# Check each cell in the DataFrame for sensitive keywords
for keyword in sensitive_keywords:
sensitive_counts[keyword] = dataset.apply(
lambda x: x.astype(str).str.contains(keyword, case=False).sum()
).sum()

# Create a DataFrame with the results
sensitive_counts_df = pd.DataFrame(
{
"Sensitive Keyword": list(sensitive_counts.keys()),
"Count": list(sensitive_counts.values()),
}
)

table = Table(sensitive_counts_df)

return TestSuiteReturnType(
properties={},
tables=[table],
attachments=[],
)


def pii_check(
dataset: DataFrame | None = None,
training_df: DataFrame | None = None,
testing_df: DataFrame | None = None,
feature_columns: ArrayLike | list | None = None,
target_column: ArrayLike | str | None = None,
) -> TestSuiteReturnType | None:
from vectice import Table
from vectice.models.validation_dataset import TestSuiteReturnType

if dataset is None:
return None

    # Define common PII patterns (the "name" pattern is intentionally broad
    # and will match any capitalized word, so expect false positives)
    pii_patterns = {
        "name": r"\b[A-Z][a-z]*\b",
        "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b",
        "phone": r"\b(\+?\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9})\b",
    }

# Initialize a dictionary to hold counts of PII matches
pii_counts = {key: 0 for key in pii_patterns.keys()}

# Check each column in the DataFrame for PII patterns
for column in dataset.columns:
for key, pattern in pii_patterns.items():
pii_counts[key] += (
dataset[column]
.astype(str)
.str.contains(pattern, case=False, regex=True)
.sum()
)

# Create a DataFrame with the results
pii_counts_df = pd.DataFrame(
{"PII Type": list(pii_counts.keys()), "Count": list(pii_counts.values())}
)

table = Table(pii_counts_df)

return TestSuiteReturnType(
properties={},
tables=[table],
attachments=[],
)


def sensitive_data_type_check(
dataset: DataFrame | None = None,
training_df: DataFrame | None = None,
testing_df: DataFrame | None = None,
feature_columns: ArrayLike | list | None = None,
target_column: ArrayLike | str | None = None,
) -> TestSuiteReturnType | None:
from vectice import Table
from vectice.models.validation_dataset import TestSuiteReturnType

if dataset is None:
return None

# Define patterns for sensitive data types
sensitive_data_patterns = {
"credit_card": r"\b(?:\d[ -]*?){13,16}\b",
"ssn": r"\b\d{3}-\d{2}-\d{4}\b",
}

# Initialize a dictionary to hold counts of sensitive data type matches
sensitive_data_counts = {key: 0 for key in sensitive_data_patterns.keys()}

# Check each column in the DataFrame for sensitive data type patterns
for column in dataset.columns:
for key, pattern in sensitive_data_patterns.items():
sensitive_data_counts[key] += (
dataset[column]
.astype(str)
.str.contains(pattern, case=False, regex=True)
.sum()
)

# Create a DataFrame with the results
sensitive_data_counts_df = pd.DataFrame(
{
"Sensitive Data Type": list(sensitive_data_counts.keys()),
"Count": list(sensitive_data_counts.values()),
}
)

table = Table(sensitive_data_counts_df)

return TestSuiteReturnType(
properties={},
tables=[table],
attachments=[],
)
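

if __name__ == "__main__":
    # Usage sketch with toy data (assumes vectice is installed for Table and
    # TestSuiteReturnType). The keyword list and DataFrame are illustrative.
    demo_df = pd.DataFrame(
        {
            "notes": ["Patient has diabetes", "no conditions reported"],
            "contact": ["alice@example.com", "555-123-4567"],
        }
    )
    print(sensitive_data_check(dataset=demo_df, sensitive_keywords=["diabetes"]))
    print(pii_check(dataset=demo_df))
    print(sensitive_data_type_check(dataset=demo_df))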