diff --git a/Validation/README.md b/Validation/README.md
new file mode 100644
index 0000000..349c78f
--- /dev/null
+++ b/Validation/README.md
@@ -0,0 +1,28 @@
+## List of validation tests provided by Vectice (source code from PiML)
+| **Category** | **Test Name** | **Function** |
+|------------------------------|----------------------------------|--------------------------------------|
+| **Classification Tests** | ROC Curve | `plot_roc_curve` |
+| | Confusion Matrix | `conf_matrix` |
+| | Explainability | `explainability` |
+| | Feature Importance | `feature_importance` |
+| | Label Drift | `label_drift` |
+| | Prediction Drift | `prediction_drift` |
+| | Recall by class | `recall_by_class` |
+| | Precision by class | `precision_by_class` |
+| | **Binary Classification suite** | `plot_roc_curve`, `conf_matrix`, `explainability`, `feature_importance`, `label_drift`, `prediction_drift` |
+| | **Multiclass Classification suite** | `plot_roc_curve`, `conf_matrix`, `explainability`, `feature_importance`, `label_drift`, `prediction_drift`, `recall_by_class`, `precision_by_class` |
+| **Data Privacy Tests** | Sensitive Data Check | `sensitive_data_check` |
+| | PII Check | `pii_check` |
+| | Sensitive Data Type Check | `sensitive_data_type_check` |
+| **Data Quality Tests** | Dataset Split Validation | `test_dataset_split` |
+| | IQR and Outliers | `iqr_and_outliers` |
+| | **Dataset Quality suite** | `test_dataset_split`, `iqr_and_outliers` |
+| **Regression Tests** | Residuals Plot | `plot_residuals` |
+| | R² Score | `r2_score` |
+| | Explainability | `explainability` |
+| | Feature Importance | `feature_importance` |
+| | Target Drift | `target_drift` |
+| | Prediction Drift | `prediction_drift` |
+| | **Regression suite** | `plot_residuals`, `r2_score`, `explainability`, `feature_importance`, `target_drift`, `prediction_drift` |
+
+
diff --git a/Validation/master_config_test_suites.py b/Validation/master_config_test_suites.py
new file mode 100644
index 0000000..24a5005
--- /dev/null
+++ b/Validation/master_config_test_suites.py
@@ -0,0 +1,65 @@
+# import the Vectice-provided probability of default validation tests
+from vectice.models.test_library.binary_classification_test import (
+    plot_roc_curve,
+    conf_matrix,
+    explainability,
+    feature_importance,
+    label_drift,
+    prediction_drift,
+)
+
+
+# custom data quality validation tests
+from test_modules.data_quality_modules import (
+    test_dataset_split,
+    iqr_and_outliers,
+)
+
+# custom data privacy validation tests
+from test_modules.data_privacy_modules import (
+    sensitive_data_check,
+    sensitive_data_type_check,
+    pii_check,
+)
+
+from test_modules.correlation_matrix_module import (
+    plot_correlation_matrix
+)
+
+
+# The master test suite file maps all ADDITIONAL suites of tests which can be run.
+# The tests can be provided by Vectice or be custom functions from your modules.
+# Vectice uses this configuration to identify and bundle the available tests into suites when you run
+# your validations in your notebook.
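Every callable referenced in the suite mappings below follows the signature convention used by the modules in this repository (training and testing DataFrames, the target column, a fitted predictor, optional predicted probabilities, and an `internal_parameters` dict) and returns a `TestSuiteReturnType`. The sketch below shows a minimal custom test written in that style; `accuracy_check` and the `_accuracy_test` metric name are illustrative assumptions, not part of the Vectice library, and the import path simply mirrors the one used in the bundled modules.

```python
# Minimal sketch of a custom test that could be registered in a suite mapping.
# Assumption: any callable with this signature works; names are illustrative only.
from __future__ import annotations

from typing import Any, Dict

from pandas import DataFrame
from sklearn.metrics import accuracy_score


def accuracy_check(
    training_df: DataFrame,
    testing_df: DataFrame,
    target_column: str,
    predictor: Any,                      # fitted model exposing predict()
    predict_proba_train=None,
    predict_proba_test=None,
    internal_parameters: Dict[str, Any] = {},
):
    # Same local-import style as the bundled modules
    from vectice.models.validation import TestSuiteReturnType

    X_test = testing_df.drop(columns=[target_column])
    accuracy = accuracy_score(testing_df[target_column], predictor.predict(X_test))

    return TestSuiteReturnType(
        metrics={"_accuracy_test": accuracy},
        properties={},
        tables=[],
        attachments=[],
    )
```

A function like this could then be appended to, for example, `PD_model_suite["binary_suite"]` next to the Vectice-provided tests.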
+
+# Accumulation and mapping of all validation tests to be run for the PD model suite
+PD_model_suite = {
+    "binary_suite": [
+        plot_roc_curve,
+        conf_matrix,
+        explainability,
+        feature_importance,
+        label_drift,
+        prediction_drift,
+    ],
+    "data_quality_ext": [
+        test_dataset_split,
+        iqr_and_outliers,
+        plot_correlation_matrix,
+    ],
+    "corr_matrix_ext": [
+        plot_correlation_matrix,
+    ],
+}
+
+# Map the data privacy tests, individually and bundled as a full suite
+Robustness_suite = {
+    "sensitive_data_check": sensitive_data_check,
+    "pii_check": pii_check,
+    "sensitive_data_type_check": sensitive_data_type_check,
+    "data_privacy_full_suite": [
+        sensitive_data_check,
+        pii_check,
+        sensitive_data_type_check,
+    ],
+}
diff --git a/Validation/test_modules/correlation_matrix_module.py b/Validation/test_modules/correlation_matrix_module.py
new file mode 100644
index 0000000..3935f9a
--- /dev/null
+++ b/Validation/test_modules/correlation_matrix_module.py
@@ -0,0 +1,58 @@
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any, Dict
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+
+
+if TYPE_CHECKING:
+    from matplotlib.container import BarContainer
+    from numpy import ndarray
+    from numpy.typing import ArrayLike
+    from pandas import DataFrame
+
+    from vectice.models.validation import TestSuiteReturnType
+
+_logger = logging.getLogger(__name__)
+
+def plot_correlation_matrix(
+    training_df: DataFrame,
+    testing_df: DataFrame,
+    target_column: str,
+    predictor: Any,
+    predict_proba_train: ArrayLike | None,
+    predict_proba_test: ArrayLike | None,
+    internal_parameters: Dict[str, Any] = {"subset_columns": None, "cmap": "Blues"},
+) -> TestSuiteReturnType:
+    from vectice.models.validation import TestSuiteReturnType
+
+    # Fall back to the target column plus the first ten feature columns when no subset is configured
+    subset_columns = internal_parameters.get("subset_columns") or ([target_column] + [col for col in training_df.columns[:10] if col != target_column])
+    cmap = internal_parameters.get("cmap", "Blues")
+
+    # Select subset of columns
+    training_df = training_df[subset_columns]
+
+    # Calculate the correlation matrix
+    corr_matrix = training_df.corr()
+
+    # Plot the correlation matrix
+    plt.figure(figsize=(10, 8))
+    sns.heatmap(corr_matrix, annot=True, cmap=cmap, fmt=".2f", annot_kws={"fontsize": 12}, cbar=True)
+    plt.title("Correlation Matrix")
+
+    # Save the plot
+    file_path = "Correlation_matrix_plot.png"
+    plt.savefig(file_path)
+    plt.close()
+
+    # RETURN IN THE VECTICE EXPECTED FORMAT
+    return TestSuiteReturnType(
+        metrics={},
+        properties={},
+        tables=[],
+        attachments=[file_path],
+    )
\ No newline at end of file
diff --git a/Validation/test_modules/data_privacy_modules.py b/Validation/test_modules/data_privacy_modules.py
new file mode 100644
index 0000000..90d851f
--- /dev/null
+++ b/Validation/test_modules/data_privacy_modules.py
@@ -0,0 +1,148 @@
+# Write custom tests which can be used to validate your dataset's security
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pandas as pd
+
+if TYPE_CHECKING:
+    from numpy.typing import ArrayLike
+    from pandas import DataFrame
+
+    from vectice.models.validation_dataset import TestSuiteReturnType
+
+
+def sensitive_data_check(
+    dataset: DataFrame | None = None,
+    training_df: DataFrame | None = None,
+    testing_df: DataFrame | None = None,
+    feature_columns: ArrayLike | list | None = None,
+    target_column: ArrayLike | str | None = None,
+    sensitive_keywords: list | None = None,
+) -> TestSuiteReturnType | None:
+    from vectice import Table
from vectice.models.validation_dataset import TestSuiteReturnType + + if dataset is None or sensitive_keywords is None: + return None + + # Initialize a dictionary to hold counts of sensitive data + sensitive_counts = {keyword: 0 for keyword in sensitive_keywords} + + # Check each cell in the DataFrame for sensitive keywords + for keyword in sensitive_keywords: + sensitive_counts[keyword] = dataset.apply( + lambda x: x.astype(str).str.contains(keyword, case=False).sum() + ).sum() + + # Create a DataFrame with the results + sensitive_counts_df = pd.DataFrame( + { + "Sensitive Keyword": list(sensitive_counts.keys()), + "Count": list(sensitive_counts.values()), + } + ) + + table = Table(sensitive_counts_df) + + return TestSuiteReturnType( + properties={}, + tables=[table], + attachments=[], + ) + + +def pii_check( + dataset: DataFrame | None = None, + training_df: DataFrame | None = None, + testing_df: DataFrame | None = None, + feature_columns: ArrayLike | list | None = None, + target_column: ArrayLike | str | None = None, +) -> TestSuiteReturnType | None: + from vectice import Table + from vectice.models.validation_dataset import TestSuiteReturnType + + if dataset is None: + return None + + # Define common PII patterns + pii_patterns = { + "name": r"\b[A-Z][a-z]*\b", + "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b", + "phone": r"\b(\+?[\d]{1,3}[-.\s]?[\d]{1,4}[-.\s]?[\d]{1,4}[-.\s]?[\d]{1,9})\b", + } + + # Initialize a dictionary to hold counts of PII matches + pii_counts = {key: 0 for key in pii_patterns.keys()} + + # Check each column in the DataFrame for PII patterns + for column in dataset.columns: + for key, pattern in pii_patterns.items(): + pii_counts[key] += ( + dataset[column] + .astype(str) + .str.contains(pattern, case=False, regex=True) + .sum() + ) + + # Create a DataFrame with the results + pii_counts_df = pd.DataFrame( + {"PII Type": list(pii_counts.keys()), "Count": list(pii_counts.values())} + ) + + table = Table(pii_counts_df) + + return TestSuiteReturnType( + properties={}, + tables=[table], + attachments=[], + ) + + +def sensitive_data_type_check( + dataset: DataFrame | None = None, + training_df: DataFrame | None = None, + testing_df: DataFrame | None = None, + feature_columns: ArrayLike | list | None = None, + target_column: ArrayLike | str | None = None, +) -> TestSuiteReturnType | None: + from vectice import Table + from vectice.models.validation_dataset import TestSuiteReturnType + + if dataset is None: + return None + + # Define patterns for sensitive data types + sensitive_data_patterns = { + "credit_card": r"\b(?:\d[ -]*?){13,16}\b", + "ssn": r"\b\d{3}-\d{2}-\d{4}\b", + } + + # Initialize a dictionary to hold counts of sensitive data type matches + sensitive_data_counts = {key: 0 for key in sensitive_data_patterns.keys()} + + # Check each column in the DataFrame for sensitive data type patterns + for column in dataset.columns: + for key, pattern in sensitive_data_patterns.items(): + sensitive_data_counts[key] += ( + dataset[column] + .astype(str) + .str.contains(pattern, case=False, regex=True) + .sum() + ) + + # Create a DataFrame with the results + sensitive_data_counts_df = pd.DataFrame( + { + "Sensitive Data Type": list(sensitive_data_counts.keys()), + "Count": list(sensitive_data_counts.values()), + } + ) + + table = Table(sensitive_data_counts_df) + + return TestSuiteReturnType( + properties={}, + tables=[table], + attachments=[], + ) diff --git a/Validation/test_modules/data_quality_modules.py 
b/Validation/test_modules/data_quality_modules.py new file mode 100644 index 0000000..05b3ae5 --- /dev/null +++ b/Validation/test_modules/data_quality_modules.py @@ -0,0 +1,116 @@ +# Write custom tests which can be used to validate your datasets quality +from __future__ import annotations + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from pandas import DataFrame +from vectice.models.validation_dataset import TestSuiteReturnType + + +# custom test which can be used for dataset validation +def test_dataset_split( + dataset: DataFrame | None, + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + feature_columns: list | None = None, + threshold: float | None = None, +) -> TestSuiteReturnType: + from vectice import Table + from vectice.models.validation_dataset import TestSuiteReturnType + + if dataset is None: + return None + + total_df = len(training_df) + len(testing_df) + + # Create a DataFrame with the results + datasplit_df = pd.DataFrame( + { + "Dataset": ["Train", "Test", "Total"], + "Size": [len(training_df), len(testing_df), total_df], + "Percentage": [ + (len(training_df) / total_df * 100), + (len(testing_df) / total_df * 100), + 100, + ], + } + ) + + table = Table(datasplit_df) + + return TestSuiteReturnType( + properties={}, + tables=[table], + attachments=[], + ) + + +# custom test which can be used for dataset validation +def iqr_and_outliers( + dataset: DataFrame | None = None, + training_df: DataFrame | None = None, + testing_df: DataFrame | None = None, + feature_columns: list | None = None, + target_column: str | None = None, + threshold: float | None = None, +) -> TestSuiteReturnType | None: + from vectice.models.validation_dataset import TestSuiteReturnType + + if dataset is None: + return None + + files = [] + # disable plots showing + plt.ioff() + for column in dataset.select_dtypes(include=[np.number]).columns: + file_name = f"iqr_and_outliers_{column}.png" + + temp_file_path = file_name + + Q1 = dataset[column].quantile(0.25) + Q3 = dataset[column].quantile(0.75) + IQR = Q3 - Q1 + lower_bound = Q1 - 1.5 * IQR + upper_bound = Q3 + 1.5 * IQR + + plt.figure(figsize=(10, 6)) + plt.hist(dataset[column], bins=20, edgecolor="k", alpha=0.7) + plt.axvline( + Q1, color="r", linestyle="--", label=f"Q1 (25th percentile): {Q1:.2f}" + ) + plt.axvline( + Q3, color="b", linestyle="--", label=f"Q3 (75th percentile): {Q3:.2f}" + ) + plt.axvline( + dataset[column].median(), + color="g", + linestyle="-", + label=f"Median: {dataset[column].median():.2f}", + ) + plt.fill_betweenx( + [0, plt.ylim()[1]], Q1, Q3, color="gray", alpha=0.3, label=f"IQR: {IQR:.2f}" + ) + + # Highlight outliers + outliers = dataset[ + (dataset[column] < lower_bound) | (dataset[column] > upper_bound) + ][column] + plt.scatter( + outliers, [0] * len(outliers), color="red", label="Outliers", zorder=5 + ) + + plt.title(f"Histogram with IQR and Outliers for {column}") + plt.xlabel(column) + plt.ylabel("Frequency") + plt.legend() + plt.savefig(temp_file_path, bbox_inches="tight") + files.append(temp_file_path) + + plt.ion() + return TestSuiteReturnType( + properties={}, + tables=[], + attachments=files, + ) diff --git a/Validation/test_modules/default_tests_vectice.py b/Validation/test_modules/default_tests_vectice.py new file mode 100644 index 0000000..5ec37db --- /dev/null +++ b/Validation/test_modules/default_tests_vectice.py @@ -0,0 +1,523 @@ +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING, Any, Dict + +import matplotlib.pyplot 
as plt +import numpy as np +import pandas as pd +import seaborn as sns +import shap +from scipy.stats import chi2_contingency, ks_2samp +from sklearn.metrics import auc, confusion_matrix, precision_score, recall_score, roc_curve + +if TYPE_CHECKING: + from matplotlib.container import BarContainer + from numpy import ndarray + from numpy.typing import ArrayLike + from pandas import DataFrame + + from vectice.models.validation import TestSuiteReturnType + +_logger = logging.getLogger(__name__) + + +def plot_roc_curve( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + internal_parameters: Dict[str, Any] = {"train_color": "green", "test_color": "blue", "threshold": 0.5}, +) -> TestSuiteReturnType | None: + from vectice.models.validation import TestSuiteReturnType + + X_train = training_df.drop(columns=[target_column]) + X_test = testing_df.drop(columns=[target_column]) + training_prediction_proba = predictor.predict_proba(X_train)[:, 1] + testing_prediction_proba = predictor.predict_proba(X_test)[:, 1] + + if predict_proba_train is not None: + training_prediction_proba = predict_proba_train + + if predict_proba_test is not None: + testing_prediction_proba = predict_proba_test + + fpr_train, tpr_train, _ = roc_curve(training_df[target_column], training_prediction_proba) + roc_auc_train = auc(fpr_train, tpr_train) + + fpr_test, tpr_test, _ = roc_curve(testing_df[target_column], testing_prediction_proba) + roc_auc_test = auc(fpr_test, tpr_test) + + file_path = "ROC_CURVE.png" + + plt.figure(figsize=(8, 6)) + plt.plot( + fpr_train, + tpr_train, + color=internal_parameters["train_color"], + linestyle="--", + label=f"Train ROC curve (AUC = {roc_auc_train:.2f})", + ) + plt.plot( + fpr_test, + tpr_test, + color=internal_parameters["test_color"], + label=f"Test ROC curve (AUC = {roc_auc_test:.2f})", + ) + plt.plot([0, 1], [0, 1], color="red", linestyle="--") + plt.xlabel("False Positive Rate") + plt.ylabel("True Positive Rate") + plt.title("Receiver Operating Characteristic (ROC) Curve") + plt.legend() + plt.grid(True) + plt.savefig(file_path) + plt.close() + + return TestSuiteReturnType( + metrics={"_ROC_auc_train": roc_auc_train, "_ROC_auc_test": roc_auc_test}, + properties={}, + tables=[], + attachments=[file_path], + ) + + +def conf_matrix( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + internal_parameters: Dict[str, Any] = {"threshold": 0.5, "cmap": "Blues"}, +) -> TestSuiteReturnType: + from vectice.models.validation import TestSuiteReturnType + + threshold = internal_parameters["threshold"] + cmap = internal_parameters.get("cmap", "Blues") + + X_test = testing_df.drop(columns=[target_column]) + testing_prediction_proba = predictor.predict_proba(X_test)[:, 1] + + if predict_proba_test is not None: + testing_prediction_proba = predict_proba_test + + testing_prediction = (testing_prediction_proba >= threshold).astype(int) + + cm = confusion_matrix(testing_df[target_column], testing_prediction) + total_samples = np.sum(cm) + + precision = precision_score(testing_df[target_column], testing_prediction) + recall = recall_score(testing_df[target_column], testing_prediction) + + # Plot confusion matrix + plt.figure(figsize=(10, 8)) + sns.heatmap(cm, annot=True, cmap=cmap, fmt="d", annot_kws={"fontsize": 12}, cbar=False) + for i in range(len(cm)): + for j in 
range(len(cm)): + plt.text( + j + 0.5, + i + 0.75, + f"{cm[i][j]/total_samples*100:.2f}%", + ha="center", + va="center", + color="black", + fontsize=12, + ) + plt.xlabel("Predicted Label") + plt.ylabel("True Label") + plt.title(f"Confusion Matrix\nPrecision: {precision:.2f}, Recall: {recall:.2f}") + + # Save the plot + file_path = "Confusion_matrix_plot.png" + plt.savefig(file_path) + plt.close() + + return TestSuiteReturnType( + metrics={"_precision_test": precision, "_recall_test": recall}, + properties={"Threshold": threshold}, + tables=[], + attachments=[file_path], + ) + + +def explainability( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + internal_parameters: Dict[str, Any] = {}, +) -> TestSuiteReturnType: + from vectice.models.validation import TestSuiteReturnType + + explainer = shap.Explainer(predictor, training_df.drop(columns=[target_column])) + shap_values = explainer(training_df.drop(columns=[target_column]).head(1000)) + shap.summary_plot( + shap_values[:, :, 0], training_df.drop(columns=[target_column]).head(1000), max_display=10, show=False + ) + summary_plot_path = "SHAP_summary_plot.png" + plt.savefig(summary_plot_path, bbox_inches="tight") + plt.close() + + return TestSuiteReturnType(metrics={}, properties={}, tables=[], attachments=[summary_plot_path]) + + +def feature_importance( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + internal_parameters: Dict[str, Any] = {}, +) -> TestSuiteReturnType: + from vectice.models.validation import TestSuiteReturnType + + explainer = shap.Explainer(predictor, training_df.drop(columns=[target_column])) + shap_values = explainer(training_df.drop(columns=[target_column]).head(1000)) + clustering = shap.utils.hclust( + training_df.drop(columns=[target_column]).head(1000), training_df[target_column].head(1000) + ) + shap.plots.bar(shap_values[:, :, 0], clustering=clustering, max_display=10, show=False) + + feature_importance_path = "feature_importance.png" + plt.savefig(feature_importance_path, bbox_inches="tight") + plt.close() + + return TestSuiteReturnType(metrics={}, properties={}, tables=[], attachments=[feature_importance_path]) + + +def cramers_v_score(x: ndarray[Any, Any], y: ndarray[Any, Any]) -> float: + + min_length = min(len(x), len(y), 4000) + x = x[:min_length] + y = y[:min_length] + confusion_matrix = pd.crosstab(x, y) + chi2 = chi2_contingency(confusion_matrix)[0] + n = confusion_matrix.sum().sum() + phi2 = chi2 / n + r, k = confusion_matrix.shape + phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1)) + rcorr = r - ((r - 1) ** 2) / (n - 1) + kcorr = k - ((k - 1) ** 2) / (n - 1) + return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1))) + + +def ks_score(x: ndarray[Any, Any], y: ndarray[Any, Any]) -> float: + min_length = min(len(x), len(y), 4000) + x = x[:min_length] + y = y[:min_length] + ks_statistic, _ = ks_2samp(x, y) + + return ks_statistic + + +def prediction_drift( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + threshold: float, + internal_parameters: Dict[str, Any] = {}, +) -> TestSuiteReturnType: + from vectice.models.validation import TestSuiteReturnType + + X_train = training_df.drop(columns=[target_column]) + X_test = 
testing_df.drop(columns=[target_column]) + training_prediction_proba = predictor.predict_proba(X_train)[:, 1] + testing_prediction_proba = predictor.predict_proba(X_test)[:, 1] + + if predict_proba_train is not None: + training_prediction_proba = predict_proba_train + + if predict_proba_test is not None: + testing_prediction_proba = predict_proba_test + + train_predictions = np.array(training_prediction_proba) + test_predictions = np.array(testing_prediction_proba) + + light_red = "#FF8A80" # Light Red + darker_blue = "#1565C0" # Darker Blue + sns.set_palette([darker_blue, light_red]) + + _, ax = plt.subplots(figsize=(8, 6)) + + sns.kdeplot(train_predictions, color=light_red, label="Train Predictions", fill=True) + sns.kdeplot(test_predictions, color=darker_blue, label="Test Predictions", fill=True) + + # Plot vertical lines for means using the specified colors + ax.axvline( # pyright: ignore[reportAttributeAccessIssue] + np.mean(train_predictions), # pyright: ignore[reportArgumentType] + color=light_red, + linestyle="--", + label="Train Mean", + ) + ax.axvline( # pyright: ignore[reportAttributeAccessIssue] + np.mean(test_predictions), # pyright: ignore[reportArgumentType] + color=darker_blue, + linestyle="--", + label="Test Mean", + ) + + plt.xlabel("Predictions") + plt.ylabel("Density") + plt.title("Prediction Drift Plot (Kolmogorov-Smirnov drift score)") + plt.legend() + plt.grid(True) + path = "Prediction_drift.png" + + # Calculate and print drift score + drift_score = ks_score(train_predictions, test_predictions) + + # Set text position at the top + text_x = 0.5 + text_y = 0.95 + if drift_score < 0.1: + score_color = "green" + elif 0.1 <= drift_score <= 0.2: + score_color = "orange" + else: + score_color = "red" + + plt.text( + text_x, + text_y, + f"Drift score = {drift_score:.2f}", + ha="center", + va="top", + color=score_color, + transform=ax.transAxes, # pyright: ignore[reportAttributeAccessIssue] + ) + + plt.savefig(path, bbox_inches="tight") + plt.close() + + return TestSuiteReturnType( + metrics={}, properties={"_prediction_drift_score": drift_score}, tables=[], attachments=[path] + ) + + +def label_drift( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + internal_parameters: Dict[str, Any] = {}, +) -> TestSuiteReturnType: + from vectice.models.validation import TestSuiteReturnType + + train_labels = np.array(training_df[target_column]) + test_labels = np.array(testing_df[target_column]) + + light_red = "#FF8A80" # Light Red + darker_blue = "#1565C0" # Darker Blue + sns.set_palette([darker_blue, light_red]) + + _, ax = plt.subplots(figsize=(8, 6)) + + bar_width = 0.35 + index = np.arange(2) + + train_counts = [np.sum(train_labels == 0) / len(train_labels), np.sum(train_labels == 1) / len(train_labels)] + test_counts = [np.sum(test_labels == 0) / len(test_labels), np.sum(test_labels == 1) / len(test_labels)] + + train_bar = ax.bar( # pyright: ignore[reportAttributeAccessIssue] + index, train_counts, bar_width, label="Train Labels" + ) + test_bar = ax.bar( # pyright: ignore[reportAttributeAccessIssue] + index + bar_width, test_counts, bar_width, label="Test Labels" + ) + + ax.set_xlabel("Labels") # pyright: ignore[reportAttributeAccessIssue] + ax.set_ylabel("Frequency") # pyright: ignore[reportAttributeAccessIssue] + ax.set_title("Label Drift Plot (Cramer's V drift score)") # pyright: ignore[reportAttributeAccessIssue] + ax.set_xticks(index + bar_width / 2) # 
pyright: ignore[reportAttributeAccessIssue] + ax.set_xticklabels(["0", "1"]) # pyright: ignore[reportAttributeAccessIssue] + ax.legend() # pyright: ignore[reportAttributeAccessIssue] + + def autolabel(bars: BarContainer): + """Attach a text label above each bar in *bars*, displaying its height.""" + for bar in bars: + height = bar.get_height() + ax.annotate( # pyright: ignore[reportAttributeAccessIssue] + f"{height:.2f}", + xy=(bar.get_x() + bar.get_width() / 2, height), + xytext=(0, 3), + textcoords="offset points", + ha="center", + va="bottom", + ) + + autolabel(train_bar) + autolabel(test_bar) + + drift_score = cramers_v_score(train_labels, test_labels) + if drift_score < 0.1: + score_color = "green" + elif 0.1 <= drift_score <= 0.2: + score_color = "orange" + else: + score_color = "red" + + ax.text( # pyright: ignore[reportAttributeAccessIssue] + 0.5, + 0.95, + f"Drift score = {drift_score:.2f}", + ha="center", + va="top", + color=score_color, + transform=ax.transAxes, # pyright: ignore[reportAttributeAccessIssue] + ) + + plt.tight_layout() + path = "Label_drift.png" + plt.savefig(path, bbox_inches="tight") + plt.close() + + return TestSuiteReturnType( + metrics={}, properties={"_label_drift_score": drift_score}, tables=[], attachments=[path] + ) + + +def plot_correlation_matrix( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + internal_parameters: Dict[str, Any] = {"subset_columns": None, "cmap": "Blues"}, +) -> TestSuiteReturnType: + from vectice.models.validation import TestSuiteReturnType + + subset_columns = internal_parameters.get( + "subset_columns", [target_column] + [col for col in training_df.columns[:10] if col != "TARGET"] + ) + cmap = internal_parameters.get("cmap", "Blues") + + # Select subset of columns + training_df = training_df[subset_columns] + + # Calculate the correlation matrix + corr_matrix = training_df.corr() + + # Plot the correlation matrix + plt.figure(figsize=(10, 8)) + sns.heatmap(corr_matrix, annot=True, cmap=cmap, fmt=".2f", annot_kws={"fontsize": 12}, cbar=True) + plt.title("Correlation Matrix") + + # Save the plot + file_path = "Correlation_matrix_plot.png" + plt.savefig(file_path, bbox_inches="tight") + plt.close() + + return TestSuiteReturnType( + metrics={}, + properties={}, + tables=[], + attachments=[file_path], + ) + + +# custom test which can be used for dataset validation +def test_dataset_split( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + internal_parameters: Dict[str, Any] = {"subset_columns": None, "cmap": "Blues"}, +) -> TestSuiteReturnType: + from vectice import Table + from vectice.models.validation import TestSuiteReturnType + + total_df = len(training_df) + len(testing_df) + + # Create a DataFrame with the results + datasplit_df = pd.DataFrame( + { + "Dataset": ["Train", "Test", "Total"], + "Size": [len(training_df), len(testing_df), total_df], + "Percentage": [ + (len(training_df) / total_df * 100), + (len(testing_df) / total_df * 100), + 100, + ], + } + ) + + table = Table(datasplit_df) + + return TestSuiteReturnType(metrics={}, properties={}, tables=[table], attachments=[]) + + +# custom test which can be used for dataset validation +def iqr_and_outliers( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | 
None, + predict_proba_test: ArrayLike | None, + internal_parameters: Dict[str, Any] = {"subset_columns": None, "cmap": "Blues"}, +) -> TestSuiteReturnType | None: + from vectice.models.validation import TestSuiteReturnType + + dataset = training_df + + files = [] + # disable plots showing + if internal_parameters.get("subset_columns") is not None: + columns = internal_parameters.get("subset_columns") + else: + columns = dataset.select_dtypes(include=[np.number]).columns[:10] + plt.ioff() + for column in columns: # type: ignore + file_name = f"iqr_and_outliers_{column}.png" + + temp_file_path = file_name + + Q1 = dataset[column].quantile(0.25) + Q3 = dataset[column].quantile(0.75) + IQR = Q3 - Q1 + lower_bound = Q1 - 1.5 * IQR + upper_bound = Q3 + 1.5 * IQR + + plt.figure(figsize=(10, 6)) + plt.hist(dataset[column], bins=20, edgecolor="k", alpha=0.7) + plt.axvline(Q1, color="r", linestyle="--", label=f"Q1 (25th percentile): {Q1:.2f}") + plt.axvline(Q3, color="b", linestyle="--", label=f"Q3 (75th percentile): {Q3:.2f}") + plt.axvline( + dataset[column].median(), + color="g", + linestyle="-", + label=f"Median: {dataset[column].median():.2f}", + ) + plt.fill_betweenx([0, plt.ylim()[1]], Q1, Q3, color="gray", alpha=0.3, label=f"IQR: {IQR:.2f}") + + # Highlight outliers + outliers = dataset[(dataset[column] < lower_bound) | (dataset[column] > upper_bound)][column] + plt.scatter(outliers, [0] * len(outliers), color="red", label="Outliers", zorder=5) + + plt.title(f"Histogram with IQR and Outliers for {column}") + plt.xlabel(column) + plt.ylabel("Frequency") + plt.legend() + plt.savefig(temp_file_path, bbox_inches="tight") + files.append(temp_file_path) + + plt.ion() + return TestSuiteReturnType( + metrics={}, + properties={}, + tables=[], + attachments=files, + ) \ No newline at end of file diff --git a/Validation/vectice_wrappers.py b/Validation/vectice_wrappers.py new file mode 100644 index 0000000..503fbc7 --- /dev/null +++ b/Validation/vectice_wrappers.py @@ -0,0 +1,73 @@ +from __future__ import annotations + +import inspect + +from typing import Any, Dict +from vectice.models.validation import TestSuiteReturnType + + +## You just pass your function as an argument +def Vectice_wrapper_function( + module: callable, + internal_functions_param: Dict[str, Any], +) -> TestSuiteReturnType: + + # Inspect the signature of the internal function + signature = inspect.signature(module) + + # Validate that all required parameters are provided + for param_name, param in signature.parameters.items(): + if param.default == inspect.Parameter.empty and param_name not in internal_functions_param: + raise ValueError(f"Missing required parameter: {param_name}") + + # Filter out any extra parameters not in the signature + filtered_params = {param_name: internal_functions_param[param_name] for param_name in signature.parameters if param_name in internal_functions_param} + + # Run the provided callable with filtered parameters + result = module(**filtered_params) + + # Helper function to extract paths + def extract_paths(obj): + paths = [] + if isinstance(obj, dict): + for key, value in obj.items(): + paths.extend(extract_paths(value)) + elif isinstance(obj, list): + for item in obj: + paths.extend(extract_paths(item)) + elif isinstance(obj, str): + paths.append(obj) + elif hasattr(obj, 'attachments'): + paths.extend(extract_paths(obj.attachments)) + return paths + + # Extract paths from the result + extracted_paths = extract_paths(result) + + # Convert the result to a dictionary + output_files = { + "paths": 
extracted_paths,
+    }
+
+    # Return in the expected format, passing the extracted file paths as attachments
+    return TestSuiteReturnType(metrics={}, properties={}, tables=[], attachments=output_files["paths"])
+
+
+
+def Vectice_wrapper(
+    output_files: Dict[str, Any] = {"paths": None, "dataframes": None, "metrics": None, "properties": None},
+) -> TestSuiteReturnType:
+
+    ####
+    ##### Paste your code here
+    #####
+
+
+
+    # RETURN IN THE VECTICE EXPECTED FORMAT
+    return TestSuiteReturnType(
+        metrics=output_files["metrics"],
+        properties=output_files["properties"],
+        tables=output_files["dataframes"],
+        attachments=output_files["paths"],
+    )
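Because `Vectice_wrapper_function` introspects the wrapped callable with `inspect.signature` and only forwards the parameters that callable declares, an existing plotting helper can be reused without modification, as long as every required parameter is supplied in `internal_functions_param`. A possible usage sketch follows; `my_distribution_plot`, `"train.csv"`, and `"AMT_CREDIT"` are placeholders, and the flat import assumes the notebook runs from the `Validation/` folder.

```python
# Hedged usage sketch for Vectice_wrapper_function: wrap an existing plotting helper
# so the file it writes is collected as an attachment. Names and paths below are
# placeholders, not part of this repository or the Vectice library.
import matplotlib.pyplot as plt
import pandas as pd

from vectice_wrappers import Vectice_wrapper_function


def my_distribution_plot(dataset: pd.DataFrame, column: str) -> str:
    """Save a histogram for one column and return the file path."""
    path = f"distribution_{column}.png"
    plt.figure(figsize=(8, 5))
    plt.hist(dataset[column].dropna(), bins=30, edgecolor="k", alpha=0.7)
    plt.title(f"Distribution of {column}")
    plt.savefig(path, bbox_inches="tight")
    plt.close()
    return path


# The wrapper checks that both required arguments (dataset, column) are supplied,
# runs the helper, and extracts the returned file path into the attachments list.
result = Vectice_wrapper_function(
    module=my_distribution_plot,
    internal_functions_param={"dataset": pd.read_csv("train.csv"), "column": "AMT_CREDIT"},
)
```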
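`pii_check` and `sensitive_data_type_check` both work the same way: cast every column to string and count the cells matching each regex. The self-contained sketch below reproduces that counting loop on a toy DataFrame, reusing the SSN pattern from the module and a slightly tightened email character class (`[A-Za-z]` instead of `[A-Z|a-z]`).

```python
# Standalone sketch of the regex-counting approach used by pii_check and
# sensitive_data_type_check, run on a small made-up DataFrame.
import pandas as pd

patterns = {
    "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b",
    "ssn": r"\b\d{3}-\d{2}-\d{4}\b",
}

toy_df = pd.DataFrame(
    {
        "notes": ["contact me at jane.doe@example.com", "no pii here", "ssn 123-45-6789"],
        "amount": ["1000", "2500", "1800"],
    }
)

counts = {key: 0 for key in patterns}
for column in toy_df.columns:
    for key, pattern in patterns.items():
        # Same idea as the modules: count cells per column that match the pattern
        counts[key] += toy_df[column].astype(str).str.contains(pattern, case=False, regex=True).sum()

print(pd.DataFrame({"PII Type": list(counts), "Count": list(counts.values())}))
# Expected counts: email -> 1, ssn -> 1
```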
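`prediction_drift` and `label_drift` reduce drift to a single score (Kolmogorov-Smirnov for predicted probabilities, Cramér's V for labels) and colour it green below 0.1, orange between 0.1 and 0.2, and red above 0.2. A quick sketch of that scoring and banding on synthetic scores (the beta-distributed arrays are made-up stand-ins for `predict_proba` outputs):

```python
# Sketch of the drift scoring used by prediction_drift, with the same traffic-light bands.
import numpy as np
from scipy.stats import ks_2samp

rng = np.random.default_rng(0)
train_scores = rng.beta(2, 5, size=2000)    # stand-in for train predicted probabilities
test_scores = rng.beta(2.2, 5, size=2000)   # slightly shifted test scores

drift_score, _ = ks_2samp(train_scores, test_scores)

if drift_score < 0.1:
    band = "green (little drift)"
elif drift_score <= 0.2:
    band = "orange (moderate drift)"
else:
    band = "red (investigate)"

print(f"KS drift score = {drift_score:.3f} -> {band}")
```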
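`iqr_and_outliers` flags values outside `Q1 - 1.5 * IQR` and `Q3 + 1.5 * IQR` before plotting each histogram. The arithmetic on a small hand-checked series:

```python
# Worked example of the IQR outlier rule applied in iqr_and_outliers.
import pandas as pd

values = pd.Series([10, 12, 12, 13, 12, 11, 14, 13, 15, 102])  # 102 is the planted outlier

q1 = values.quantile(0.25)      # 12.0
q3 = values.quantile(0.75)      # 13.75
iqr = q3 - q1                   # 1.75
lower_bound = q1 - 1.5 * iqr    # 9.375
upper_bound = q3 + 1.5 * iqr    # 16.375

outliers = values[(values < lower_bound) | (values > upper_bound)]
print(list(outliers))           # [102]
```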