Validation Suite Modules #81

Open · wants to merge 20 commits into base: main
28 changes: 28 additions & 0 deletions Validation/README.md
@@ -0,0 +1,28 @@
## List of validation tests provided by Vectice (source code from PiML)
| **Category** | **Test Name** | **Function** |
|------------------------------|----------------------------------|--------------------------------------|
| **Classification Tests** | ROC Curve | `plot_roc_curve` |
| | Confusion Matrix | `conf_matrix` |
| | Explainability | `explainability` |
| | Feature Importance | `feature_importance` |
| | Label Drift | `label_drift` |
| | Prediction Drift | `prediction_drift` |
|                              | Recall by Class                  | `recall_by_class`                     |
|                              | Precision by Class               | `precision_by_class`                  |
|                              | **Binary Classification suite**  | `plot_roc_curve`, `conf_matrix`, `explainability`, `feature_importance`, `label_drift`, `prediction_drift` |
|                              | **Multiclass Classification suite** | `plot_roc_curve`, `conf_matrix`, `explainability`, `feature_importance`, `label_drift`, `prediction_drift`, `recall_by_class`, `precision_by_class` |
| **Data Privacy Tests** | Sensitive Data Check | `sensitive_data_check` |
| | PII Check | `pii_check` |
| | Sensitive Data Type Check | `sensitive_data_type_check` |
| **Data Quality Tests** | Dataset Split Validation | `test_dataset_split` |
| | IQR and Outliers | `iqr_and_outliers` |
| | **Dataset Quality suite** | `test_dataset_split`, `iqr_and_outliers` |
| **Regression Tests** | Residuals Plot | `plot_residuals` |
| | R² Score | `r2_score` |
| | Explainability | `explainability` |
| | Feature Importance | `feature_importance` |
| | Target Drift | `target_drift` |
| | Prediction Drift | `prediction_drift` |
| | **Regression suite** | `plot_residuals`, `r2_score`, `explainability`, `feature_importance`, `target_drift`, `prediction_drift` |
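
These suites can be combined with custom tests in a master configuration file (see `master_config_test_suites.py`). A minimal sketch, using the import path from this repository:

```python
# Bundle Vectice-provided binary classification tests into a named suite.
from vectice.models.test_library.binary_classification_test import (
    plot_roc_curve,
    conf_matrix,
)

# Map a suite name to the list of test functions it should run.
my_quick_suite = {
    "quick_checks": [plot_roc_curve, conf_matrix],
}
```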


65 changes: 65 additions & 0 deletions Validation/master_config_test_suites.py
@@ -0,0 +1,65 @@
# Import the Vectice-provided probability of default (PD) validation tests
from vectice.models.test_library.binary_classification_test import (
plot_roc_curve,
conf_matrix,
explainability,
feature_importance,
label_drift,
prediction_drift,
)


# custom data quality validation tests
from test_modules.data_quality_modules import (
test_dataset_split,
iqr_and_outliers,
)

# custom data privacy validation tests
from test_modules.data_privacy_modules import (
sensitive_data_check,
sensitive_data_type_check,
pii_check,
)

# custom correlation matrix validation test
from test_modules.correlation_matrix_module import (
    plot_correlation_matrix,
)


# This master test suite file maps all ADDITIONAL suites of tests that can be run.
# The tests can be provided by Vectice or be custom functions from your own modules.
# Vectice uses this configuration to identify and bundle the available tests into
# suites when you run your validations in your notebook.

# Map all validation tests to be run for the PD model suites
PD_model_suite = {
"binary_suite": [
plot_roc_curve,
conf_matrix,
explainability,
feature_importance,
label_drift,
prediction_drift,
],
"data_quality_ext": [
test_dataset_split,
iqr_and_outliers,
plot_correlation_matrix,
],
"corr_matrix_ext": [
plot_correlation_matrix,
],
}

# Map the tests to be used for data privacy validation (bundled here as the robustness suite)
Robustness_suite = {
"sensitive_data_check": sensitive_data_check,
"pii_check": pii_check,
"sensitive_data_type_check": sensitive_data_type_check,
"data_privacy_full_suite": [
sensitive_data_check,
pii_check,
sensitive_data_type_check,
],
}
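
# --- Usage sketch (an assumption, not part of the documented Vectice API) ---
# The custom tests in this repository follow the signature illustrated in
# test_modules/correlation_matrix_module.py. Assuming a suite's tests share that
# convention, a hypothetical helper can exercise a suite manually for debugging:
def run_suite_manually(suite, training_df, testing_df, target_column, predictor):
    """Call each test in a suite list and collect the returned results."""
    results = []
    for test in suite:
        results.append(
            test(
                training_df=training_df,
                testing_df=testing_df,
                target_column=target_column,
                predictor=predictor,
                predict_proba_train=None,
                predict_proba_test=None,
            )
        )
    return results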
58 changes: 58 additions & 0 deletions Validation/test_modules/correlation_matrix_module.py
@@ -0,0 +1,58 @@
from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Any, Dict

import matplotlib.pyplot as plt
import seaborn as sns


if TYPE_CHECKING:
from numpy.typing import ArrayLike
from pandas import DataFrame

from vectice.models.validation import TestSuiteReturnType

_logger = logging.getLogger(__name__)

def plot_correlation_matrix(
    training_df: DataFrame,
    testing_df: DataFrame,
    target_column: str,
    predictor: Any,
    predict_proba_train: ArrayLike | None,
    predict_proba_test: ArrayLike | None,
    internal_parameters: Dict[str, Any] | None = None,
) -> TestSuiteReturnType:
    from vectice.models.validation import TestSuiteReturnType

    # Avoid a mutable default argument; fall back to the documented defaults
    if internal_parameters is None:
        internal_parameters = {"subset_columns": None, "cmap": "Blues"}

    # Default to the target column plus the first ten other columns
    subset_columns = internal_parameters.get("subset_columns") or (
        [target_column]
        + [col for col in training_df.columns[:10] if col != target_column]
    )
    cmap = internal_parameters.get("cmap", "Blues")

# Select subset of columns
training_df = training_df[subset_columns]

# Calculate the correlation matrix
corr_matrix = training_df.corr()

# Plot the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap=cmap, fmt=".2f", annot_kws={"fontsize": 12}, cbar=True)
plt.title("Correlation Matrix")

# Save the plot
file_path = "Correlation_matrix_plot.png"
plt.savefig(file_path)
plt.close()

    # Return in the format expected by Vectice
return TestSuiteReturnType(
metrics={},
properties={},
tables=[],
attachments=[file_path],
)
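

if __name__ == "__main__":
    # Usage sketch with toy data (assumes vectice is installed). The predictor
    # and probability arguments are required by the signature but unused here.
    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    demo_df = pd.DataFrame(
        rng.normal(size=(100, 4)), columns=["a", "b", "c", "TARGET"]
    )
    plot_correlation_matrix(
        training_df=demo_df,
        testing_df=demo_df,
        target_column="TARGET",
        predictor=None,
        predict_proba_train=None,
        predict_proba_test=None,
    )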
148 changes: 148 additions & 0 deletions Validation/test_modules/data_privacy_modules.py
@@ -0,0 +1,148 @@
# Custom tests that can be used to validate your dataset's security
from __future__ import annotations

from typing import TYPE_CHECKING

import pandas as pd

if TYPE_CHECKING:
from numpy.typing import ArrayLike
from pandas import DataFrame

from vectice.models.validation_dataset import TestSuiteReturnType


def sensitive_data_check(
dataset: DataFrame | None = None,
training_df: DataFrame | None = None,
testing_df: DataFrame | None = None,
feature_columns: ArrayLike | list | None = None,
target_column: ArrayLike | str | None = None,
sensitive_keywords: list | None = None,
) -> TestSuiteReturnType | None:
from vectice import Table
from vectice.models.validation_dataset import TestSuiteReturnType

if dataset is None or sensitive_keywords is None:
return None

# Initialize a dictionary to hold counts of sensitive data
sensitive_counts = {keyword: 0 for keyword in sensitive_keywords}

# Check each cell in the DataFrame for sensitive keywords
for keyword in sensitive_keywords:
sensitive_counts[keyword] = dataset.apply(
lambda x: x.astype(str).str.contains(keyword, case=False).sum()
).sum()

# Create a DataFrame with the results
sensitive_counts_df = pd.DataFrame(
{
"Sensitive Keyword": list(sensitive_counts.keys()),
"Count": list(sensitive_counts.values()),
}
)

table = Table(sensitive_counts_df)

return TestSuiteReturnType(
properties={},
tables=[table],
attachments=[],
)


def pii_check(
dataset: DataFrame | None = None,
training_df: DataFrame | None = None,
testing_df: DataFrame | None = None,
feature_columns: ArrayLike | list | None = None,
target_column: ArrayLike | str | None = None,
) -> TestSuiteReturnType | None:
from vectice import Table
from vectice.models.validation_dataset import TestSuiteReturnType

if dataset is None:
return None

    # Define common PII patterns (the "name" pattern is intentionally broad
    # and will match any capitalized word, so expect false positives)
    pii_patterns = {
        "name": r"\b[A-Z][a-z]*\b",
        "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b",
        "phone": r"\b(\+?\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9})\b",
    }

# Initialize a dictionary to hold counts of PII matches
pii_counts = {key: 0 for key in pii_patterns.keys()}

# Check each column in the DataFrame for PII patterns
for column in dataset.columns:
for key, pattern in pii_patterns.items():
pii_counts[key] += (
dataset[column]
.astype(str)
.str.contains(pattern, case=False, regex=True)
.sum()
)

# Create a DataFrame with the results
pii_counts_df = pd.DataFrame(
{"PII Type": list(pii_counts.keys()), "Count": list(pii_counts.values())}
)

table = Table(pii_counts_df)

return TestSuiteReturnType(
properties={},
tables=[table],
attachments=[],
)


def sensitive_data_type_check(
dataset: DataFrame | None = None,
training_df: DataFrame | None = None,
testing_df: DataFrame | None = None,
feature_columns: ArrayLike | list | None = None,
target_column: ArrayLike | str | None = None,
) -> TestSuiteReturnType | None:
from vectice import Table
from vectice.models.validation_dataset import TestSuiteReturnType

if dataset is None:
return None

# Define patterns for sensitive data types
sensitive_data_patterns = {
"credit_card": r"\b(?:\d[ -]*?){13,16}\b",
"ssn": r"\b\d{3}-\d{2}-\d{4}\b",
}

# Initialize a dictionary to hold counts of sensitive data type matches
sensitive_data_counts = {key: 0 for key in sensitive_data_patterns.keys()}

# Check each column in the DataFrame for sensitive data type patterns
for column in dataset.columns:
for key, pattern in sensitive_data_patterns.items():
sensitive_data_counts[key] += (
dataset[column]
.astype(str)
.str.contains(pattern, case=False, regex=True)
.sum()
)

# Create a DataFrame with the results
sensitive_data_counts_df = pd.DataFrame(
{
"Sensitive Data Type": list(sensitive_data_counts.keys()),
"Count": list(sensitive_data_counts.values()),
}
)

table = Table(sensitive_data_counts_df)

return TestSuiteReturnType(
properties={},
tables=[table],
attachments=[],
)
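

if __name__ == "__main__":
    # Usage sketch with toy data (assumes vectice is installed for Table and
    # TestSuiteReturnType). The keyword list and DataFrame are illustrative.
    demo_df = pd.DataFrame(
        {
            "notes": ["Patient has diabetes", "no conditions reported"],
            "contact": ["alice@example.com", "555-123-4567"],
        }
    )
    print(sensitive_data_check(dataset=demo_df, sensitive_keywords=["diabetes"]))
    print(pii_check(dataset=demo_df))
    print(sensitive_data_type_check(dataset=demo_df))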