From 90dc7ee7564493109059753d34d070e4d7236b43 Mon Sep 17 00:00:00 2001 From: AidanNell Date: Tue, 9 Jul 2024 16:50:15 +0700 Subject: [PATCH 01/20] Validation suite modules --- .../binary_classification_full_suite.py | 26 +++ .../data_privacy_full_suite.py | 161 ++++++++++++++++++ .../data_quality_full_suite.py | 127 ++++++++++++++ .../validation_suites/master_test_suites.py | 23 +++ .../regression_full_suite.py | 26 +++ .../time_series_full_suite.py | 24 +++ 6 files changed, 387 insertions(+) create mode 100644 24.2/samples/validation_suites/binary_classification_full_suite.py create mode 100644 24.2/samples/validation_suites/data_privacy_full_suite.py create mode 100644 24.2/samples/validation_suites/data_quality_full_suite.py create mode 100644 24.2/samples/validation_suites/master_test_suites.py create mode 100644 24.2/samples/validation_suites/regression_full_suite.py create mode 100644 24.2/samples/validation_suites/time_series_full_suite.py diff --git a/24.2/samples/validation_suites/binary_classification_full_suite.py b/24.2/samples/validation_suites/binary_classification_full_suite.py new file mode 100644 index 0000000..8427247 --- /dev/null +++ b/24.2/samples/validation_suites/binary_classification_full_suite.py @@ -0,0 +1,26 @@ +# import the Vectice provided binary classification tests +from vectice.models.test_library.binary_classification_test import ( + plot_roc_curve, + conf_matrix, + explainability, + feature_importance, + label_drift, + prediction_drift, +) + +# Map the tests to be used +BINARY_CLASSIFICATION_FULL_SUITE_MAP_TEST = { + "roc": plot_roc_curve, + "cm": conf_matrix, + "explainability": explainability, + "feature_importance": feature_importance, + "drift": [label_drift, prediction_drift], + "binary_full_suite": [ + plot_roc_curve, + conf_matrix, + explainability, + feature_importance, + label_drift, + prediction_drift, + ], +} diff --git a/24.2/samples/validation_suites/data_privacy_full_suite.py b/24.2/samples/validation_suites/data_privacy_full_suite.py new file mode 100644 index 0000000..4fd2215 --- /dev/null +++ b/24.2/samples/validation_suites/data_privacy_full_suite.py @@ -0,0 +1,161 @@ +# Write custom tests which can be used to validate your datasets security +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pandas as pd + +if TYPE_CHECKING: + from numpy.typing import ArrayLike + from pandas import DataFrame + + from vectice.models.validation_dataset import TestSuiteReturnType + + +def sensitive_data_check( + dataset: DataFrame | None = None, + training_df: DataFrame | None = None, + testing_df: DataFrame | None = None, + feature_columns: ArrayLike | list | None = None, + target_column: ArrayLike | str | None = None, + sensitive_keywords: list | None = None, +) -> TestSuiteReturnType | None: + from vectice import Table + from vectice.models.validation_dataset import TestSuiteReturnType + + if dataset is None or sensitive_keywords is None: + return None + + # Initialize a dictionary to hold counts of sensitive data + sensitive_counts = {keyword: 0 for keyword in sensitive_keywords} + + # Check each cell in the DataFrame for sensitive keywords + for keyword in sensitive_keywords: + sensitive_counts[keyword] = dataset.apply( + lambda x: x.astype(str).str.contains(keyword, case=False).sum() + ).sum() + + # Create a DataFrame with the results + sensitive_counts_df = pd.DataFrame( + { + "Sensitive Keyword": list(sensitive_counts.keys()), + "Count": list(sensitive_counts.values()), + } + ) + + table = Table(sensitive_counts_df) + + 
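+    # Return the keyword counts as a Vectice table in the validation results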
return TestSuiteReturnType( + properties={}, + tables=[table], + attachments=[], + ) + + +def pii_check( + dataset: DataFrame | None = None, + training_df: DataFrame | None = None, + testing_df: DataFrame | None = None, + feature_columns: ArrayLike | list | None = None, + target_column: ArrayLike | str | None = None, +) -> TestSuiteReturnType | None: + from vectice import Table + from vectice.models.validation_dataset import TestSuiteReturnType + + if dataset is None: + return None + + # Define common PII patterns + pii_patterns = { + "name": r"\b[A-Z][a-z]*\b", + "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b", + "phone": r"\b(\+?[\d]{1,3}[-.\s]?[\d]{1,4}[-.\s]?[\d]{1,4}[-.\s]?[\d]{1,9})\b", + } + + # Initialize a dictionary to hold counts of PII matches + pii_counts = {key: 0 for key in pii_patterns.keys()} + + # Check each column in the DataFrame for PII patterns + for column in dataset.columns: + for key, pattern in pii_patterns.items(): + pii_counts[key] += ( + dataset[column] + .astype(str) + .str.contains(pattern, case=False, regex=True) + .sum() + ) + + # Create a DataFrame with the results + pii_counts_df = pd.DataFrame( + {"PII Type": list(pii_counts.keys()), "Count": list(pii_counts.values())} + ) + + table = Table(pii_counts_df) + + return TestSuiteReturnType( + properties={}, + tables=[table], + attachments=[], + ) + + +def sensitive_data_type_check( + dataset: DataFrame | None = None, + training_df: DataFrame | None = None, + testing_df: DataFrame | None = None, + feature_columns: ArrayLike | list | None = None, + target_column: ArrayLike | str | None = None, +) -> TestSuiteReturnType | None: + from vectice import Table + from vectice.models.validation_dataset import TestSuiteReturnType + + if dataset is None: + return None + + # Define patterns for sensitive data types + sensitive_data_patterns = { + "credit_card": r"\b(?:\d[ -]*?){13,16}\b", + "ssn": r"\b\d{3}-\d{2}-\d{4}\b", + } + + # Initialize a dictionary to hold counts of sensitive data type matches + sensitive_data_counts = {key: 0 for key in sensitive_data_patterns.keys()} + + # Check each column in the DataFrame for sensitive data type patterns + for column in dataset.columns: + for key, pattern in sensitive_data_patterns.items(): + sensitive_data_counts[key] += ( + dataset[column] + .astype(str) + .str.contains(pattern, case=False, regex=True) + .sum() + ) + + # Create a DataFrame with the results + sensitive_data_counts_df = pd.DataFrame( + { + "Sensitive Data Type": list(sensitive_data_counts.keys()), + "Count": list(sensitive_data_counts.values()), + } + ) + + table = Table(sensitive_data_counts_df) + + return TestSuiteReturnType( + properties={}, + tables=[table], + attachments=[], + ) + + +# Map the tests to be used +DATA_PRIVACY_SUITE_MAP_TEST = { + "sensitive_data_check": sensitive_data_check, + "pii_check": pii_check, + "sensitive_data_type_check": sensitive_data_type_check, + "data_privacy_full_suite": [ + sensitive_data_check, + pii_check, + sensitive_data_type_check, + ], +} diff --git a/24.2/samples/validation_suites/data_quality_full_suite.py b/24.2/samples/validation_suites/data_quality_full_suite.py new file mode 100644 index 0000000..b946b94 --- /dev/null +++ b/24.2/samples/validation_suites/data_quality_full_suite.py @@ -0,0 +1,127 @@ +# Write custom tests which can be used to validate your datasets quality +from __future__ import annotations + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from pandas import DataFrame +from 
vectice.models.validation_dataset import TestSuiteReturnType + + +# custom test which can be used for dataset validation +def test_dataset_split( + dataset: DataFrame | None, + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + feature_columns: list | None = None, + threshold: float | None = None, +) -> TestSuiteReturnType: + from vectice import Table + from vectice.models.validation_dataset import TestSuiteReturnType + + if dataset is None: + return None + + total_df = len(training_df) + len(testing_df) + + # Create a DataFrame with the results + datasplit_df = pd.DataFrame( + { + "Dataset": ["Train", "Test", "Total"], + "Size": [len(training_df), len(testing_df), total_df], + "Percentage": [ + (len(training_df) / total_df * 100), + (len(testing_df) / total_df * 100), + 100, + ], + } + ) + + table = Table(datasplit_df) + + return TestSuiteReturnType( + properties={}, + tables=[table], + attachments=[], + ) + + +# custom test which can be used for dataset validation +def iqr_and_outliers( + dataset: DataFrame | None = None, + training_df: DataFrame | None = None, + testing_df: DataFrame | None = None, + feature_columns: list | None = None, + target_column: str | None = None, + threshold: float | None = None, +) -> TestSuiteReturnType | None: + from vectice.models.validation_dataset import TestSuiteReturnType + + if dataset is None: + return None + + files = [] + # disable plots showing + plt.ioff() + for column in dataset.select_dtypes(include=[np.number]).columns: + file_name = f"iqr_and_outliers_{column}.png" + + temp_file_path = file_name + + Q1 = dataset[column].quantile(0.25) + Q3 = dataset[column].quantile(0.75) + IQR = Q3 - Q1 + lower_bound = Q1 - 1.5 * IQR + upper_bound = Q3 + 1.5 * IQR + + plt.figure(figsize=(10, 6)) + plt.hist(dataset[column], bins=20, edgecolor="k", alpha=0.7) + plt.axvline( + Q1, color="r", linestyle="--", label=f"Q1 (25th percentile): {Q1:.2f}" + ) + plt.axvline( + Q3, color="b", linestyle="--", label=f"Q3 (75th percentile): {Q3:.2f}" + ) + plt.axvline( + dataset[column].median(), + color="g", + linestyle="-", + label=f"Median: {dataset[column].median():.2f}", + ) + plt.fill_betweenx( + [0, plt.ylim()[1]], Q1, Q3, color="gray", alpha=0.3, label=f"IQR: {IQR:.2f}" + ) + + # Highlight outliers + outliers = dataset[ + (dataset[column] < lower_bound) | (dataset[column] > upper_bound) + ][column] + plt.scatter( + outliers, [0] * len(outliers), color="red", label="Outliers", zorder=5 + ) + + plt.title(f"Histogram with IQR and Outliers for {column}") + plt.xlabel(column) + plt.ylabel("Frequency") + plt.legend() + plt.savefig(temp_file_path, bbox_inches="tight") + files.append(temp_file_path) + + plt.ion() + return TestSuiteReturnType( + properties={}, + tables=[], + attachments=files, + ) + + +# Map the tests to be used +DATA_QUALITY_SUITE_MAP_TEST = { + "dataset_split": test_dataset_split, + "iqr_and_outliers": iqr_and_outliers, + "full_dataset_validation": [ + test_dataset_split, + iqr_and_outliers, + ], +} diff --git a/24.2/samples/validation_suites/master_test_suites.py b/24.2/samples/validation_suites/master_test_suites.py new file mode 100644 index 0000000..e6c8c41 --- /dev/null +++ b/24.2/samples/validation_suites/master_test_suites.py @@ -0,0 +1,23 @@ +# Vectice provided model validation tests +from binary_classification_full_suite import BINARY_CLASSIFICATION_FULL_SUITE_MAP_TEST + +# custom data quality tests +from data_quality_full_suite import ( + test_dataset_split, + iqr_and_outliers, +) + + +# The master test suite file is used to 
map all tests which can be run. +# The tests can be provided by Vectice or custom functions from your test suite modules. +# Vectice uses this configuration to simply identify available tests, when you run +# your validations in your notebook. + +# Accumulation and mapping of all tests to be run +MASTER_FULL_SUITE_MAP_TEST = { + "binary_full_suite": BINARY_CLASSIFICATION_FULL_SUITE_MAP_TEST["binary_full_suite"], + "full_dataset_validation": [ + test_dataset_split, + iqr_and_outliers, + ], +} diff --git a/24.2/samples/validation_suites/regression_full_suite.py b/24.2/samples/validation_suites/regression_full_suite.py new file mode 100644 index 0000000..d27e00e --- /dev/null +++ b/24.2/samples/validation_suites/regression_full_suite.py @@ -0,0 +1,26 @@ +# import the Vectice provided test +from vectice.models.test_library.regression_test import ( + plot_residuals, + r2_score, + explainability, + feature_importance, + target_drift, + prediction_drift, +) + +# Map the tests to be used +REGRESSION_FULL_SUITE_MAP_TEST = { + "roc": plot_residuals, + "cm": r2_score, + "explainability": explainability, + "feature_importance": feature_importance, + "drift": [target_drift, prediction_drift], + "binary_full_suite": [ + plot_residuals, + r2_score, + explainability, + feature_importance, + target_drift, + prediction_drift, + ], +} diff --git a/24.2/samples/validation_suites/time_series_full_suite.py b/24.2/samples/validation_suites/time_series_full_suite.py new file mode 100644 index 0000000..0fd7b0d --- /dev/null +++ b/24.2/samples/validation_suites/time_series_full_suite.py @@ -0,0 +1,24 @@ +# import the Vectice provided time series tests +from vectice.models.test_library.time_series_test import ( + trend_analysis, + seasonality_check, + autocorrelation_test, + stationarity_test, + missing_value_analysis, +) + +# Map the tests to be used +TIME_SERIES_FULL_SUITE_MAP_TEST = { + "trend": trend_analysis, + "seasonality": seasonality_check, + "autocorrelation": autocorrelation_test, + "stationarity": stationarity_test, + "missing_value": missing_value_analysis, + "time_series_full_suite": [ + trend_analysis, + seasonality_check, + autocorrelation_test, + stationarity_test, + missing_value_analysis, + ], +} From 59a31342dced9f6ee904b094a3228e3b5df0a993 Mon Sep 17 00:00:00 2001 From: AidanNell Date: Wed, 10 Jul 2024 09:03:21 +0700 Subject: [PATCH 02/20] updated file structure --- .../binary_classification_full_suite.py | 26 ------ ...cy_full_suite.py => data_privacy_tests.py} | 13 --- ...ty_full_suite.py => data_quality_tests.py} | 11 --- .../validation_suites/master_test_suites.py | 80 +++++++++++++++++-- .../regression_full_suite.py | 26 ------ .../time_series_full_suite.py | 24 ------ 6 files changed, 74 insertions(+), 106 deletions(-) delete mode 100644 24.2/samples/validation_suites/binary_classification_full_suite.py rename 24.2/samples/validation_suites/{data_privacy_full_suite.py => data_privacy_tests.py} (93%) rename 24.2/samples/validation_suites/{data_quality_full_suite.py => data_quality_tests.py} (93%) delete mode 100644 24.2/samples/validation_suites/regression_full_suite.py delete mode 100644 24.2/samples/validation_suites/time_series_full_suite.py diff --git a/24.2/samples/validation_suites/binary_classification_full_suite.py b/24.2/samples/validation_suites/binary_classification_full_suite.py deleted file mode 100644 index 8427247..0000000 --- a/24.2/samples/validation_suites/binary_classification_full_suite.py +++ /dev/null @@ -1,26 +0,0 @@ -# import the Vectice provided binary 
classification tests -from vectice.models.test_library.binary_classification_test import ( - plot_roc_curve, - conf_matrix, - explainability, - feature_importance, - label_drift, - prediction_drift, -) - -# Map the tests to be used -BINARY_CLASSIFICATION_FULL_SUITE_MAP_TEST = { - "roc": plot_roc_curve, - "cm": conf_matrix, - "explainability": explainability, - "feature_importance": feature_importance, - "drift": [label_drift, prediction_drift], - "binary_full_suite": [ - plot_roc_curve, - conf_matrix, - explainability, - feature_importance, - label_drift, - prediction_drift, - ], -} diff --git a/24.2/samples/validation_suites/data_privacy_full_suite.py b/24.2/samples/validation_suites/data_privacy_tests.py similarity index 93% rename from 24.2/samples/validation_suites/data_privacy_full_suite.py rename to 24.2/samples/validation_suites/data_privacy_tests.py index 4fd2215..90d851f 100644 --- a/24.2/samples/validation_suites/data_privacy_full_suite.py +++ b/24.2/samples/validation_suites/data_privacy_tests.py @@ -146,16 +146,3 @@ def sensitive_data_type_check( tables=[table], attachments=[], ) - - -# Map the tests to be used -DATA_PRIVACY_SUITE_MAP_TEST = { - "sensitive_data_check": sensitive_data_check, - "pii_check": pii_check, - "sensitive_data_type_check": sensitive_data_type_check, - "data_privacy_full_suite": [ - sensitive_data_check, - pii_check, - sensitive_data_type_check, - ], -} diff --git a/24.2/samples/validation_suites/data_quality_full_suite.py b/24.2/samples/validation_suites/data_quality_tests.py similarity index 93% rename from 24.2/samples/validation_suites/data_quality_full_suite.py rename to 24.2/samples/validation_suites/data_quality_tests.py index b946b94..05b3ae5 100644 --- a/24.2/samples/validation_suites/data_quality_full_suite.py +++ b/24.2/samples/validation_suites/data_quality_tests.py @@ -114,14 +114,3 @@ def iqr_and_outliers( tables=[], attachments=files, ) - - -# Map the tests to be used -DATA_QUALITY_SUITE_MAP_TEST = { - "dataset_split": test_dataset_split, - "iqr_and_outliers": iqr_and_outliers, - "full_dataset_validation": [ - test_dataset_split, - iqr_and_outliers, - ], -} diff --git a/24.2/samples/validation_suites/master_test_suites.py b/24.2/samples/validation_suites/master_test_suites.py index e6c8c41..25c9ea0 100644 --- a/24.2/samples/validation_suites/master_test_suites.py +++ b/24.2/samples/validation_suites/master_test_suites.py @@ -1,22 +1,90 @@ -# Vectice provided model validation tests -from binary_classification_full_suite import BINARY_CLASSIFICATION_FULL_SUITE_MAP_TEST +# import the Vectice provided probability of default tests +from vectice.models.test_library.probability_of_default_test import ( + plot_roc_curve, + conf_matrix, + explainability, + feature_importance, + label_drift, + prediction_drift, +) + +# import the Vectice provided regression tests +from vectice.models.test_library.regression_test import ( + plot_residuals, + r2_score, + explainability, + feature_importance, + target_drift, + prediction_drift, +) + +# import the Vectice provided time series tests +from vectice.models.test_library.time_series_test import ( + trend_analysis, + seasonality_check, + autocorrelation_test, + stationarity_test, + missing_value_analysis, +) + # custom data quality tests -from data_quality_full_suite import ( +from data_quality_tests import ( test_dataset_split, iqr_and_outliers, ) +# Map the tests to be used for regression +REGRESSION_FULL_SUITE_MAP_TEST = { + "roc": plot_residuals, + "cm": r2_score, + "explainability": explainability, + 
"feature_importance": feature_importance, + "drift": [target_drift, prediction_drift], + "binary_full_suite": [ + plot_residuals, + r2_score, + explainability, + feature_importance, + target_drift, + prediction_drift, + ], +} + +# Map the tests to be used for time series +TIME_SERIES_FULL_SUITE_MAP_TEST = { + "trend": trend_analysis, + "seasonality": seasonality_check, + "autocorrelation": autocorrelation_test, + "stationarity": stationarity_test, + "missing_value": missing_value_analysis, + "time_series_full_suite": [ + trend_analysis, + seasonality_check, + autocorrelation_test, + stationarity_test, + missing_value_analysis, + ], +} + + # The master test suite file is used to map all tests which can be run. # The tests can be provided by Vectice or custom functions from your test suite modules. # Vectice uses this configuration to simply identify available tests, when you run # your validations in your notebook. # Accumulation and mapping of all tests to be run -MASTER_FULL_SUITE_MAP_TEST = { - "binary_full_suite": BINARY_CLASSIFICATION_FULL_SUITE_MAP_TEST["binary_full_suite"], - "full_dataset_validation": [ +MASTER_SUITE_MAP_TEST = { + "probability_of_default_validation": [ + plot_roc_curve, + conf_matrix, + explainability, + feature_importance, + label_drift, + prediction_drift, + ], + "data_quality": [ test_dataset_split, iqr_and_outliers, ], diff --git a/24.2/samples/validation_suites/regression_full_suite.py b/24.2/samples/validation_suites/regression_full_suite.py deleted file mode 100644 index d27e00e..0000000 --- a/24.2/samples/validation_suites/regression_full_suite.py +++ /dev/null @@ -1,26 +0,0 @@ -# import the Vectice provided test -from vectice.models.test_library.regression_test import ( - plot_residuals, - r2_score, - explainability, - feature_importance, - target_drift, - prediction_drift, -) - -# Map the tests to be used -REGRESSION_FULL_SUITE_MAP_TEST = { - "roc": plot_residuals, - "cm": r2_score, - "explainability": explainability, - "feature_importance": feature_importance, - "drift": [target_drift, prediction_drift], - "binary_full_suite": [ - plot_residuals, - r2_score, - explainability, - feature_importance, - target_drift, - prediction_drift, - ], -} diff --git a/24.2/samples/validation_suites/time_series_full_suite.py b/24.2/samples/validation_suites/time_series_full_suite.py deleted file mode 100644 index 0fd7b0d..0000000 --- a/24.2/samples/validation_suites/time_series_full_suite.py +++ /dev/null @@ -1,24 +0,0 @@ -# import the Vectice provided time series tests -from vectice.models.test_library.time_series_test import ( - trend_analysis, - seasonality_check, - autocorrelation_test, - stationarity_test, - missing_value_analysis, -) - -# Map the tests to be used -TIME_SERIES_FULL_SUITE_MAP_TEST = { - "trend": trend_analysis, - "seasonality": seasonality_check, - "autocorrelation": autocorrelation_test, - "stationarity": stationarity_test, - "missing_value": missing_value_analysis, - "time_series_full_suite": [ - trend_analysis, - seasonality_check, - autocorrelation_test, - stationarity_test, - missing_value_analysis, - ], -} From a8fa33fee8549e1ccdb73fab3bc7a9558af825d4 Mon Sep 17 00:00:00 2001 From: AidanNell Date: Wed, 10 Jul 2024 09:10:01 +0700 Subject: [PATCH 03/20] fixed suite map in master file --- .../validation_suites/master_test_suites.py | 41 +++++++++++++++---- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/24.2/samples/validation_suites/master_test_suites.py b/24.2/samples/validation_suites/master_test_suites.py index 25c9ea0..6370541 
100644 --- a/24.2/samples/validation_suites/master_test_suites.py +++ b/24.2/samples/validation_suites/master_test_suites.py @@ -1,4 +1,4 @@ -# import the Vectice provided probability of default tests +# import the Vectice provided probability of default validation tests from vectice.models.test_library.probability_of_default_test import ( plot_roc_curve, conf_matrix, @@ -8,7 +8,7 @@ prediction_drift, ) -# import the Vectice provided regression tests +# import the Vectice provided regression validation tests from vectice.models.test_library.regression_test import ( plot_residuals, r2_score, @@ -18,7 +18,7 @@ prediction_drift, ) -# import the Vectice provided time series tests +# import the Vectice provided time series validation tests from vectice.models.test_library.time_series_test import ( trend_analysis, seasonality_check, @@ -28,14 +28,20 @@ ) -# custom data quality tests +# custom data quality validation tests from data_quality_tests import ( test_dataset_split, iqr_and_outliers, ) +# custom data privacy validation tests +from data_privacy_tests import ( + sensitive_data_check, + sensitive_data_type_check, + pii_check, +) -# Map the tests to be used for regression +# Map the tests to be used for regression validation REGRESSION_FULL_SUITE_MAP_TEST = { "roc": plot_residuals, "cm": r2_score, @@ -52,7 +58,7 @@ ], } -# Map the tests to be used for time series +# Map the tests to be used for time series validation TIME_SERIES_FULL_SUITE_MAP_TEST = { "trend": trend_analysis, "seasonality": seasonality_check, @@ -68,13 +74,34 @@ ], } +# Map the tests to be used for data quality +DATA_QUALITY_SUITE_MAP_TEST = { + "dataset_split": test_dataset_split, + "iqr_and_outliers": iqr_and_outliers, + "full_dataset_validation": [ + test_dataset_split, + iqr_and_outliers, + ], +} + +# Map the tests to be used for data privacy validation +DATA_PRIVACY_SUITE_MAP_TEST = { + "sensitive_data_check": sensitive_data_check, + "pii_check": pii_check, + "sensitive_data_type_check": sensitive_data_type_check, + "data_privacy_full_suite": [ + sensitive_data_check, + pii_check, + sensitive_data_type_check, + ], +} # The master test suite file is used to map all tests which can be run. # The tests can be provided by Vectice or custom functions from your test suite modules. # Vectice uses this configuration to simply identify available tests, when you run # your validations in your notebook. -# Accumulation and mapping of all tests to be run +# Accumulation and mapping of all validation tests to be run MASTER_SUITE_MAP_TEST = { "probability_of_default_validation": [ plot_roc_curve, From 79c2591c3fd84db8902c088d22eaf8ff7c0dc7a7 Mon Sep 17 00:00:00 2001 From: AidanNell Date: Wed, 10 Jul 2024 09:44:20 +0700 Subject: [PATCH 04/20] updated naming --- 24.2/samples/validation_suites/master_test_suites.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/24.2/samples/validation_suites/master_test_suites.py b/24.2/samples/validation_suites/master_test_suites.py index 6370541..b79944b 100644 --- a/24.2/samples/validation_suites/master_test_suites.py +++ b/24.2/samples/validation_suites/master_test_suites.py @@ -1,5 +1,5 @@ # import the Vectice provided probability of default validation tests -from vectice.models.test_library.probability_of_default_test import ( +from vectice.models.test_library.binary_classification_test import ( plot_roc_curve, conf_matrix, explainability, @@ -102,8 +102,8 @@ # your validations in your notebook. 
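# Illustrative sketch (not a documented Vectice API): one way a mapped suite can
# be run directly from a notebook. The names `raw_df`, `train_df`, `test_df` and
# the "TARGET" column are assumptions from your own pipeline; logging the returned
# TestSuiteReturnType objects back to Vectice is not shown here.
#
#     results = []
#     for test in CUSTOM_TEST_PD_MODEL["data_quality"]:
#         result = test(
#             dataset=raw_df,
#             training_df=train_df,
#             testing_df=test_df,
#             target_column="TARGET",
#         )
#         if result is not None:
#             results.append(result)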
# Accumulation and mapping of all validation tests to be run -MASTER_SUITE_MAP_TEST = { - "probability_of_default_validation": [ +CUSTOM_TEST_PD_MODEL = { + "binary_suite": [ plot_roc_curve, conf_matrix, explainability, From a83eb1ef3b668c66446ee875cd4389f4ef5fd87b Mon Sep 17 00:00:00 2001 From: BDaversa Date: Tue, 9 Jul 2024 22:28:26 -0700 Subject: [PATCH 05/20] add more file --- .../samples/validation_suites/PiML_wrapper.py | 0 .../samples/validation_suites/custom_tests.py | 56 +++++++++++++++++++ .../validation_suites/master_test_suites.py | 7 +++ 3 files changed, 63 insertions(+) create mode 100644 24.2/samples/validation_suites/PiML_wrapper.py create mode 100644 24.2/samples/validation_suites/custom_tests.py diff --git a/24.2/samples/validation_suites/PiML_wrapper.py b/24.2/samples/validation_suites/PiML_wrapper.py new file mode 100644 index 0000000..e69de29 diff --git a/24.2/samples/validation_suites/custom_tests.py b/24.2/samples/validation_suites/custom_tests.py new file mode 100644 index 0000000..83510ad --- /dev/null +++ b/24.2/samples/validation_suites/custom_tests.py @@ -0,0 +1,56 @@ +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING, Any, Dict + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns + + +if TYPE_CHECKING: + from matplotlib.container import BarContainer + from numpy import ndarray + from numpy.typing import ArrayLike + from pandas import DataFrame + + from vectice.models.validation import TestSuiteReturnType + +_logger = logging.getLogger(__name__) + +def plot_correlation_matrix( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + internal_parameters: Dict[str, Any] = {"subset_columns": None, "cmap": "Blues"}, +) -> TestSuiteReturnType: + from vectice.models.validation import TestSuiteReturnType + subset_columns = internal_parameters.get("subset_columns", [target_column] + [col for col in training_df.columns[:10] if col != "TARGET"]) + cmap = internal_parameters.get("cmap", "Blues") + + # Select subset of columns + training_df = training_df[subset_columns] + + # Calculate the correlation matrix + corr_matrix = training_df.corr() + + # Plot the correlation matrix + plt.figure(figsize=(10, 8)) + sns.heatmap(corr_matrix, annot=True, cmap=cmap, fmt=".2f", annot_kws={"fontsize": 12}, cbar=True) + plt.title("Correlation Matrix") + + # Save the plot + file_path = "Correlation_matrix_plot.png" + plt.savefig(file_path) + plt.close() + + return TestSuiteReturnType( + metrics={}, + properties={}, + tables=[], + attachments=[file_path], + ) \ No newline at end of file diff --git a/24.2/samples/validation_suites/master_test_suites.py b/24.2/samples/validation_suites/master_test_suites.py index b79944b..4a6123d 100644 --- a/24.2/samples/validation_suites/master_test_suites.py +++ b/24.2/samples/validation_suites/master_test_suites.py @@ -41,6 +41,10 @@ pii_check, ) +from custom_tests import ( + plot_correlation_matrix +) + # Map the tests to be used for regression validation REGRESSION_FULL_SUITE_MAP_TEST = { "roc": plot_residuals, @@ -115,4 +119,7 @@ test_dataset_split, iqr_and_outliers, ], + "custom":[ + plot_correlation_matrix, + ] } From 433bf30132fb31b24e1e9b549df724cbed561e11 Mon Sep 17 00:00:00 2001 From: BDaversa Date: Thu, 11 Jul 2024 10:58:37 -0700 Subject: [PATCH 06/20] change name --- ..._tests.py => correlation_matrix_module.py} | 0 ...ivacy_tests.py => 
data_privacy_modules.py} | 0 ...ality_tests.py => data_quality_modules.py} | 0 .../master_config_test_suites.py | 64 +++++++++ .../validation_suites/master_test_suites.py | 125 ------------------ 5 files changed, 64 insertions(+), 125 deletions(-) rename 24.2/samples/validation_suites/{custom_tests.py => correlation_matrix_module.py} (100%) rename 24.2/samples/validation_suites/{data_privacy_tests.py => data_privacy_modules.py} (100%) rename 24.2/samples/validation_suites/{data_quality_tests.py => data_quality_modules.py} (100%) create mode 100644 24.2/samples/validation_suites/master_config_test_suites.py delete mode 100644 24.2/samples/validation_suites/master_test_suites.py diff --git a/24.2/samples/validation_suites/custom_tests.py b/24.2/samples/validation_suites/correlation_matrix_module.py similarity index 100% rename from 24.2/samples/validation_suites/custom_tests.py rename to 24.2/samples/validation_suites/correlation_matrix_module.py diff --git a/24.2/samples/validation_suites/data_privacy_tests.py b/24.2/samples/validation_suites/data_privacy_modules.py similarity index 100% rename from 24.2/samples/validation_suites/data_privacy_tests.py rename to 24.2/samples/validation_suites/data_privacy_modules.py diff --git a/24.2/samples/validation_suites/data_quality_tests.py b/24.2/samples/validation_suites/data_quality_modules.py similarity index 100% rename from 24.2/samples/validation_suites/data_quality_tests.py rename to 24.2/samples/validation_suites/data_quality_modules.py diff --git a/24.2/samples/validation_suites/master_config_test_suites.py b/24.2/samples/validation_suites/master_config_test_suites.py new file mode 100644 index 0000000..0f13a6a --- /dev/null +++ b/24.2/samples/validation_suites/master_config_test_suites.py @@ -0,0 +1,64 @@ +# import the Vectice provided probability of default validation tests +from vectice.models.test_library.binary_classification_test import ( + plot_roc_curve, + conf_matrix, + explainability, + feature_importance, + label_drift, + prediction_drift, +) + + +# custom data quality validation tests +from data_quality_modules import ( + test_dataset_split, + iqr_and_outliers, +) + +# custom data privacy validation tests +from data_privacy_modules import ( + sensitive_data_check, + sensitive_data_type_check, + pii_check, +) + +from correlation_matrix_module import ( + plot_correlation_matrix +) + + +# The master test suite file is used to map all suite of test which can be run. +# The tests can be provided by Vectice or custom functions from your modules. +# Vectice uses this configuration to simply identify and bundle available tests into suite, when you run +# your validations in your notebook. 
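+# Illustrative sketch: any callable that returns a TestSuiteReturnType can be
+# bundled into one of the suites below. For example, assuming a hypothetical
+# `my_stability_check` defined in your own module:
+#
+#     from my_custom_module import my_stability_check  # hypothetical import
+#
+#     PD_model_suite["custom"].append(my_stability_check)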
+ +# Accumulation and mapping of all validation tests to be run for the PD model suite +PD_model_suite= { + "binary_suite": [ + plot_roc_curve, + conf_matrix, + explainability, + feature_importance, + label_drift, + prediction_drift, + ], + "data_quality": [ + test_dataset_split, + iqr_and_outliers, + ], + "custom":[ + plot_correlation_matrix, + ] +} + +# Map the tests to be used for data privacy validation +Robustness_suite = { + "sensitive_data_check": sensitive_data_check, + "pii_check": pii_check, + "sensitive_data_type_check": sensitive_data_type_check, + "data_privacy_full_suite": [ + sensitive_data_check, + pii_check, + sensitive_data_type_check, + ], +} diff --git a/24.2/samples/validation_suites/master_test_suites.py b/24.2/samples/validation_suites/master_test_suites.py deleted file mode 100644 index 4a6123d..0000000 --- a/24.2/samples/validation_suites/master_test_suites.py +++ /dev/null @@ -1,125 +0,0 @@ -# import the Vectice provided probability of default validation tests -from vectice.models.test_library.binary_classification_test import ( - plot_roc_curve, - conf_matrix, - explainability, - feature_importance, - label_drift, - prediction_drift, -) - -# import the Vectice provided regression validation tests -from vectice.models.test_library.regression_test import ( - plot_residuals, - r2_score, - explainability, - feature_importance, - target_drift, - prediction_drift, -) - -# import the Vectice provided time series validation tests -from vectice.models.test_library.time_series_test import ( - trend_analysis, - seasonality_check, - autocorrelation_test, - stationarity_test, - missing_value_analysis, -) - - -# custom data quality validation tests -from data_quality_tests import ( - test_dataset_split, - iqr_and_outliers, -) - -# custom data privacy validation tests -from data_privacy_tests import ( - sensitive_data_check, - sensitive_data_type_check, - pii_check, -) - -from custom_tests import ( - plot_correlation_matrix -) - -# Map the tests to be used for regression validation -REGRESSION_FULL_SUITE_MAP_TEST = { - "roc": plot_residuals, - "cm": r2_score, - "explainability": explainability, - "feature_importance": feature_importance, - "drift": [target_drift, prediction_drift], - "binary_full_suite": [ - plot_residuals, - r2_score, - explainability, - feature_importance, - target_drift, - prediction_drift, - ], -} - -# Map the tests to be used for time series validation -TIME_SERIES_FULL_SUITE_MAP_TEST = { - "trend": trend_analysis, - "seasonality": seasonality_check, - "autocorrelation": autocorrelation_test, - "stationarity": stationarity_test, - "missing_value": missing_value_analysis, - "time_series_full_suite": [ - trend_analysis, - seasonality_check, - autocorrelation_test, - stationarity_test, - missing_value_analysis, - ], -} - -# Map the tests to be used for data quality -DATA_QUALITY_SUITE_MAP_TEST = { - "dataset_split": test_dataset_split, - "iqr_and_outliers": iqr_and_outliers, - "full_dataset_validation": [ - test_dataset_split, - iqr_and_outliers, - ], -} - -# Map the tests to be used for data privacy validation -DATA_PRIVACY_SUITE_MAP_TEST = { - "sensitive_data_check": sensitive_data_check, - "pii_check": pii_check, - "sensitive_data_type_check": sensitive_data_type_check, - "data_privacy_full_suite": [ - sensitive_data_check, - pii_check, - sensitive_data_type_check, - ], -} - -# The master test suite file is used to map all tests which can be run. -# The tests can be provided by Vectice or custom functions from your test suite modules. 
-# Vectice uses this configuration to simply identify available tests, when you run -# your validations in your notebook. - -# Accumulation and mapping of all validation tests to be run -CUSTOM_TEST_PD_MODEL = { - "binary_suite": [ - plot_roc_curve, - conf_matrix, - explainability, - feature_importance, - label_drift, - prediction_drift, - ], - "data_quality": [ - test_dataset_split, - iqr_and_outliers, - ], - "custom":[ - plot_correlation_matrix, - ] -} From c98e12f3c7061887c6e2f7415a0e438d5fadef25 Mon Sep 17 00:00:00 2001 From: BDaversa <130710586+BDaversa@users.noreply.github.com> Date: Thu, 11 Jul 2024 11:00:06 -0700 Subject: [PATCH 07/20] Create README.md --- 24.2/samples/validation_suites/README.md | 30 ++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 24.2/samples/validation_suites/README.md diff --git a/24.2/samples/validation_suites/README.md b/24.2/samples/validation_suites/README.md new file mode 100644 index 0000000..90e15ec --- /dev/null +++ b/24.2/samples/validation_suites/README.md @@ -0,0 +1,30 @@ +## All test modules below can be added to your test suite to by run on any models or datasets +| **Category** | **Test Name** | **Function** | +|------------------------------|----------------------------------|--------------------------------------| +| **Classification Tests** | ROC Curve | `plot_roc_curve` | +| | Confusion Matrix | `conf_matrix` | +| | Explainability | `explainability` | +| | Feature Importance | `feature_importance` | +| | Label Drift | `label_drift` | +| | Prediction Drift | `prediction_drift` | +| | **Full Binary Classification Test** | `plot_roc_curve`, `conf_matrix`, `explainability`, `feature_importance`, `label_drift`, `prediction_drift` | +| **Data Privacy Tests** | Sensitive Data Check | `sensitive_data_check` | +| | PII Check | `pii_check` | +| | Sensitive Data Type Check | `sensitive_data_type_check` | +| | **Full Data Privacy Test** | `sensitive_data_check`, `pii_check`, `sensitive_data_type_check` | +| **Data Quality Tests** | Dataset Split Validation | `test_dataset_split` | +| | IQR and Outliers | `iqr_and_outliers` | +| | **Full Dataset Quality Test** | `test_dataset_split`, `iqr_and_outliers` | +| **Regression Tests** | Residuals Plot | `plot_residuals` | +| | R² Score | `r2_score` | +| | Explainability | `explainability` | +| | Feature Importance | `feature_importance` | +| | Target Drift | `target_drift` | +| | Prediction Drift | `prediction_drift` | +| | **Full Regression Test** | `plot_residuals`, `r2_score`, `explainability`, `feature_importance`, `target_drift`, `prediction_drift` | +| **Time Series Tests** | Trend Analysis | `trend_analysis` | +| | Seasonality Check | `seasonality_check` | +| | Autocorrelation Test | `autocorrelation_test` | +| | Stationarity Test | `stationarity_test` | +| | Missing Value Analysis | `missing_value_analysis` | +| | **Full Time Series Test** | `trend_analysis`, `seasonality_check`, `autocorrelation_test`, `stationarity_test`, `missing_value_analysis` | From 726ade4a5e071a26ef36ce437ae23dc55a4727cb Mon Sep 17 00:00:00 2001 From: BDaversa <130710586+BDaversa@users.noreply.github.com> Date: Thu, 11 Jul 2024 11:00:54 -0700 Subject: [PATCH 08/20] Update README.md --- 24.2/samples/validation_suites/README.md | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/24.2/samples/validation_suites/README.md b/24.2/samples/validation_suites/README.md index 90e15ec..ab6fba0 100644 --- a/24.2/samples/validation_suites/README.md +++ 
b/24.2/samples/validation_suites/README.md @@ -14,17 +14,12 @@ | | **Full Data Privacy Test** | `sensitive_data_check`, `pii_check`, `sensitive_data_type_check` | | **Data Quality Tests** | Dataset Split Validation | `test_dataset_split` | | | IQR and Outliers | `iqr_and_outliers` | -| | **Full Dataset Quality Test** | `test_dataset_split`, `iqr_and_outliers` | +| | **Full Dataset Quality suiteest** | `test_dataset_split`, `iqr_and_outliers` | | **Regression Tests** | Residuals Plot | `plot_residuals` | | | R² Score | `r2_score` | | | Explainability | `explainability` | | | Feature Importance | `feature_importance` | | | Target Drift | `target_drift` | | | Prediction Drift | `prediction_drift` | -| | **Full Regression Test** | `plot_residuals`, `r2_score`, `explainability`, `feature_importance`, `target_drift`, `prediction_drift` | -| **Time Series Tests** | Trend Analysis | `trend_analysis` | -| | Seasonality Check | `seasonality_check` | -| | Autocorrelation Test | `autocorrelation_test` | -| | Stationarity Test | `stationarity_test` | -| | Missing Value Analysis | `missing_value_analysis` | -| | **Full Time Series Test** | `trend_analysis`, `seasonality_check`, `autocorrelation_test`, `stationarity_test`, `missing_value_analysis` | +| | **Full Regression suite** | `plot_residuals`, `r2_score`, `explainability`, `feature_importance`, `target_drift`, `prediction_drift` | + From e70a6f11e9c73bad43c71074787a0fec8d92f86e Mon Sep 17 00:00:00 2001 From: BDaversa Date: Thu, 11 Jul 2024 13:03:43 -0700 Subject: [PATCH 09/20] package modules --- .../samples/validation_suites/PiML_wrapper.py | 523 ++++++++++++++++++ .../master_config_test_suites.py | 6 +- .../test_modules/correlation_matrix_module.py | 56 ++ .../test_modules/data_privacy_modules.py | 148 +++++ .../test_modules/data_quality_modules.py | 116 ++++ 5 files changed, 846 insertions(+), 3 deletions(-) create mode 100644 24.2/samples/validation_suites/test_modules/correlation_matrix_module.py create mode 100644 24.2/samples/validation_suites/test_modules/data_privacy_modules.py create mode 100644 24.2/samples/validation_suites/test_modules/data_quality_modules.py diff --git a/24.2/samples/validation_suites/PiML_wrapper.py b/24.2/samples/validation_suites/PiML_wrapper.py index e69de29..5ec37db 100644 --- a/24.2/samples/validation_suites/PiML_wrapper.py +++ b/24.2/samples/validation_suites/PiML_wrapper.py @@ -0,0 +1,523 @@ +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING, Any, Dict + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns +import shap +from scipy.stats import chi2_contingency, ks_2samp +from sklearn.metrics import auc, confusion_matrix, precision_score, recall_score, roc_curve + +if TYPE_CHECKING: + from matplotlib.container import BarContainer + from numpy import ndarray + from numpy.typing import ArrayLike + from pandas import DataFrame + + from vectice.models.validation import TestSuiteReturnType + +_logger = logging.getLogger(__name__) + + +def plot_roc_curve( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + internal_parameters: Dict[str, Any] = {"train_color": "green", "test_color": "blue", "threshold": 0.5}, +) -> TestSuiteReturnType | None: + from vectice.models.validation import TestSuiteReturnType + + X_train = training_df.drop(columns=[target_column]) + X_test = testing_df.drop(columns=[target_column]) + 
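+    # Score both splits with the fitted predictor; precomputed probability arrays,
+    # when supplied, replace these scores below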
training_prediction_proba = predictor.predict_proba(X_train)[:, 1] + testing_prediction_proba = predictor.predict_proba(X_test)[:, 1] + + if predict_proba_train is not None: + training_prediction_proba = predict_proba_train + + if predict_proba_test is not None: + testing_prediction_proba = predict_proba_test + + fpr_train, tpr_train, _ = roc_curve(training_df[target_column], training_prediction_proba) + roc_auc_train = auc(fpr_train, tpr_train) + + fpr_test, tpr_test, _ = roc_curve(testing_df[target_column], testing_prediction_proba) + roc_auc_test = auc(fpr_test, tpr_test) + + file_path = "ROC_CURVE.png" + + plt.figure(figsize=(8, 6)) + plt.plot( + fpr_train, + tpr_train, + color=internal_parameters["train_color"], + linestyle="--", + label=f"Train ROC curve (AUC = {roc_auc_train:.2f})", + ) + plt.plot( + fpr_test, + tpr_test, + color=internal_parameters["test_color"], + label=f"Test ROC curve (AUC = {roc_auc_test:.2f})", + ) + plt.plot([0, 1], [0, 1], color="red", linestyle="--") + plt.xlabel("False Positive Rate") + plt.ylabel("True Positive Rate") + plt.title("Receiver Operating Characteristic (ROC) Curve") + plt.legend() + plt.grid(True) + plt.savefig(file_path) + plt.close() + + return TestSuiteReturnType( + metrics={"_ROC_auc_train": roc_auc_train, "_ROC_auc_test": roc_auc_test}, + properties={}, + tables=[], + attachments=[file_path], + ) + + +def conf_matrix( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + internal_parameters: Dict[str, Any] = {"threshold": 0.5, "cmap": "Blues"}, +) -> TestSuiteReturnType: + from vectice.models.validation import TestSuiteReturnType + + threshold = internal_parameters["threshold"] + cmap = internal_parameters.get("cmap", "Blues") + + X_test = testing_df.drop(columns=[target_column]) + testing_prediction_proba = predictor.predict_proba(X_test)[:, 1] + + if predict_proba_test is not None: + testing_prediction_proba = predict_proba_test + + testing_prediction = (testing_prediction_proba >= threshold).astype(int) + + cm = confusion_matrix(testing_df[target_column], testing_prediction) + total_samples = np.sum(cm) + + precision = precision_score(testing_df[target_column], testing_prediction) + recall = recall_score(testing_df[target_column], testing_prediction) + + # Plot confusion matrix + plt.figure(figsize=(10, 8)) + sns.heatmap(cm, annot=True, cmap=cmap, fmt="d", annot_kws={"fontsize": 12}, cbar=False) + for i in range(len(cm)): + for j in range(len(cm)): + plt.text( + j + 0.5, + i + 0.75, + f"{cm[i][j]/total_samples*100:.2f}%", + ha="center", + va="center", + color="black", + fontsize=12, + ) + plt.xlabel("Predicted Label") + plt.ylabel("True Label") + plt.title(f"Confusion Matrix\nPrecision: {precision:.2f}, Recall: {recall:.2f}") + + # Save the plot + file_path = "Confusion_matrix_plot.png" + plt.savefig(file_path) + plt.close() + + return TestSuiteReturnType( + metrics={"_precision_test": precision, "_recall_test": recall}, + properties={"Threshold": threshold}, + tables=[], + attachments=[file_path], + ) + + +def explainability( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + internal_parameters: Dict[str, Any] = {}, +) -> TestSuiteReturnType: + from vectice.models.validation import TestSuiteReturnType + + explainer = shap.Explainer(predictor, training_df.drop(columns=[target_column])) + 
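+    # Compute SHAP values on the first 1,000 training rows only, to keep the explanation tractable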
shap_values = explainer(training_df.drop(columns=[target_column]).head(1000)) + shap.summary_plot( + shap_values[:, :, 0], training_df.drop(columns=[target_column]).head(1000), max_display=10, show=False + ) + summary_plot_path = "SHAP_summary_plot.png" + plt.savefig(summary_plot_path, bbox_inches="tight") + plt.close() + + return TestSuiteReturnType(metrics={}, properties={}, tables=[], attachments=[summary_plot_path]) + + +def feature_importance( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + internal_parameters: Dict[str, Any] = {}, +) -> TestSuiteReturnType: + from vectice.models.validation import TestSuiteReturnType + + explainer = shap.Explainer(predictor, training_df.drop(columns=[target_column])) + shap_values = explainer(training_df.drop(columns=[target_column]).head(1000)) + clustering = shap.utils.hclust( + training_df.drop(columns=[target_column]).head(1000), training_df[target_column].head(1000) + ) + shap.plots.bar(shap_values[:, :, 0], clustering=clustering, max_display=10, show=False) + + feature_importance_path = "feature_importance.png" + plt.savefig(feature_importance_path, bbox_inches="tight") + plt.close() + + return TestSuiteReturnType(metrics={}, properties={}, tables=[], attachments=[feature_importance_path]) + + +def cramers_v_score(x: ndarray[Any, Any], y: ndarray[Any, Any]) -> float: + + min_length = min(len(x), len(y), 4000) + x = x[:min_length] + y = y[:min_length] + confusion_matrix = pd.crosstab(x, y) + chi2 = chi2_contingency(confusion_matrix)[0] + n = confusion_matrix.sum().sum() + phi2 = chi2 / n + r, k = confusion_matrix.shape + phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1)) + rcorr = r - ((r - 1) ** 2) / (n - 1) + kcorr = k - ((k - 1) ** 2) / (n - 1) + return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1))) + + +def ks_score(x: ndarray[Any, Any], y: ndarray[Any, Any]) -> float: + min_length = min(len(x), len(y), 4000) + x = x[:min_length] + y = y[:min_length] + ks_statistic, _ = ks_2samp(x, y) + + return ks_statistic + + +def prediction_drift( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + threshold: float, + internal_parameters: Dict[str, Any] = {}, +) -> TestSuiteReturnType: + from vectice.models.validation import TestSuiteReturnType + + X_train = training_df.drop(columns=[target_column]) + X_test = testing_df.drop(columns=[target_column]) + training_prediction_proba = predictor.predict_proba(X_train)[:, 1] + testing_prediction_proba = predictor.predict_proba(X_test)[:, 1] + + if predict_proba_train is not None: + training_prediction_proba = predict_proba_train + + if predict_proba_test is not None: + testing_prediction_proba = predict_proba_test + + train_predictions = np.array(training_prediction_proba) + test_predictions = np.array(testing_prediction_proba) + + light_red = "#FF8A80" # Light Red + darker_blue = "#1565C0" # Darker Blue + sns.set_palette([darker_blue, light_red]) + + _, ax = plt.subplots(figsize=(8, 6)) + + sns.kdeplot(train_predictions, color=light_red, label="Train Predictions", fill=True) + sns.kdeplot(test_predictions, color=darker_blue, label="Test Predictions", fill=True) + + # Plot vertical lines for means using the specified colors + ax.axvline( # pyright: ignore[reportAttributeAccessIssue] + np.mean(train_predictions), # pyright: ignore[reportArgumentType] + 
color=light_red, + linestyle="--", + label="Train Mean", + ) + ax.axvline( # pyright: ignore[reportAttributeAccessIssue] + np.mean(test_predictions), # pyright: ignore[reportArgumentType] + color=darker_blue, + linestyle="--", + label="Test Mean", + ) + + plt.xlabel("Predictions") + plt.ylabel("Density") + plt.title("Prediction Drift Plot (Kolmogorov-Smirnov drift score)") + plt.legend() + plt.grid(True) + path = "Prediction_drift.png" + + # Calculate and print drift score + drift_score = ks_score(train_predictions, test_predictions) + + # Set text position at the top + text_x = 0.5 + text_y = 0.95 + if drift_score < 0.1: + score_color = "green" + elif 0.1 <= drift_score <= 0.2: + score_color = "orange" + else: + score_color = "red" + + plt.text( + text_x, + text_y, + f"Drift score = {drift_score:.2f}", + ha="center", + va="top", + color=score_color, + transform=ax.transAxes, # pyright: ignore[reportAttributeAccessIssue] + ) + + plt.savefig(path, bbox_inches="tight") + plt.close() + + return TestSuiteReturnType( + metrics={}, properties={"_prediction_drift_score": drift_score}, tables=[], attachments=[path] + ) + + +def label_drift( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + internal_parameters: Dict[str, Any] = {}, +) -> TestSuiteReturnType: + from vectice.models.validation import TestSuiteReturnType + + train_labels = np.array(training_df[target_column]) + test_labels = np.array(testing_df[target_column]) + + light_red = "#FF8A80" # Light Red + darker_blue = "#1565C0" # Darker Blue + sns.set_palette([darker_blue, light_red]) + + _, ax = plt.subplots(figsize=(8, 6)) + + bar_width = 0.35 + index = np.arange(2) + + train_counts = [np.sum(train_labels == 0) / len(train_labels), np.sum(train_labels == 1) / len(train_labels)] + test_counts = [np.sum(test_labels == 0) / len(test_labels), np.sum(test_labels == 1) / len(test_labels)] + + train_bar = ax.bar( # pyright: ignore[reportAttributeAccessIssue] + index, train_counts, bar_width, label="Train Labels" + ) + test_bar = ax.bar( # pyright: ignore[reportAttributeAccessIssue] + index + bar_width, test_counts, bar_width, label="Test Labels" + ) + + ax.set_xlabel("Labels") # pyright: ignore[reportAttributeAccessIssue] + ax.set_ylabel("Frequency") # pyright: ignore[reportAttributeAccessIssue] + ax.set_title("Label Drift Plot (Cramer's V drift score)") # pyright: ignore[reportAttributeAccessIssue] + ax.set_xticks(index + bar_width / 2) # pyright: ignore[reportAttributeAccessIssue] + ax.set_xticklabels(["0", "1"]) # pyright: ignore[reportAttributeAccessIssue] + ax.legend() # pyright: ignore[reportAttributeAccessIssue] + + def autolabel(bars: BarContainer): + """Attach a text label above each bar in *bars*, displaying its height.""" + for bar in bars: + height = bar.get_height() + ax.annotate( # pyright: ignore[reportAttributeAccessIssue] + f"{height:.2f}", + xy=(bar.get_x() + bar.get_width() / 2, height), + xytext=(0, 3), + textcoords="offset points", + ha="center", + va="bottom", + ) + + autolabel(train_bar) + autolabel(test_bar) + + drift_score = cramers_v_score(train_labels, test_labels) + if drift_score < 0.1: + score_color = "green" + elif 0.1 <= drift_score <= 0.2: + score_color = "orange" + else: + score_color = "red" + + ax.text( # pyright: ignore[reportAttributeAccessIssue] + 0.5, + 0.95, + f"Drift score = {drift_score:.2f}", + ha="center", + va="top", + color=score_color, + transform=ax.transAxes, # pyright: 
ignore[reportAttributeAccessIssue] + ) + + plt.tight_layout() + path = "Label_drift.png" + plt.savefig(path, bbox_inches="tight") + plt.close() + + return TestSuiteReturnType( + metrics={}, properties={"_label_drift_score": drift_score}, tables=[], attachments=[path] + ) + + +def plot_correlation_matrix( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + internal_parameters: Dict[str, Any] = {"subset_columns": None, "cmap": "Blues"}, +) -> TestSuiteReturnType: + from vectice.models.validation import TestSuiteReturnType + + subset_columns = internal_parameters.get( + "subset_columns", [target_column] + [col for col in training_df.columns[:10] if col != "TARGET"] + ) + cmap = internal_parameters.get("cmap", "Blues") + + # Select subset of columns + training_df = training_df[subset_columns] + + # Calculate the correlation matrix + corr_matrix = training_df.corr() + + # Plot the correlation matrix + plt.figure(figsize=(10, 8)) + sns.heatmap(corr_matrix, annot=True, cmap=cmap, fmt=".2f", annot_kws={"fontsize": 12}, cbar=True) + plt.title("Correlation Matrix") + + # Save the plot + file_path = "Correlation_matrix_plot.png" + plt.savefig(file_path, bbox_inches="tight") + plt.close() + + return TestSuiteReturnType( + metrics={}, + properties={}, + tables=[], + attachments=[file_path], + ) + + +# custom test which can be used for dataset validation +def test_dataset_split( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + internal_parameters: Dict[str, Any] = {"subset_columns": None, "cmap": "Blues"}, +) -> TestSuiteReturnType: + from vectice import Table + from vectice.models.validation import TestSuiteReturnType + + total_df = len(training_df) + len(testing_df) + + # Create a DataFrame with the results + datasplit_df = pd.DataFrame( + { + "Dataset": ["Train", "Test", "Total"], + "Size": [len(training_df), len(testing_df), total_df], + "Percentage": [ + (len(training_df) / total_df * 100), + (len(testing_df) / total_df * 100), + 100, + ], + } + ) + + table = Table(datasplit_df) + + return TestSuiteReturnType(metrics={}, properties={}, tables=[table], attachments=[]) + + +# custom test which can be used for dataset validation +def iqr_and_outliers( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + internal_parameters: Dict[str, Any] = {"subset_columns": None, "cmap": "Blues"}, +) -> TestSuiteReturnType | None: + from vectice.models.validation import TestSuiteReturnType + + dataset = training_df + + files = [] + # disable plots showing + if internal_parameters.get("subset_columns") is not None: + columns = internal_parameters.get("subset_columns") + else: + columns = dataset.select_dtypes(include=[np.number]).columns[:10] + plt.ioff() + for column in columns: # type: ignore + file_name = f"iqr_and_outliers_{column}.png" + + temp_file_path = file_name + + Q1 = dataset[column].quantile(0.25) + Q3 = dataset[column].quantile(0.75) + IQR = Q3 - Q1 + lower_bound = Q1 - 1.5 * IQR + upper_bound = Q3 + 1.5 * IQR + + plt.figure(figsize=(10, 6)) + plt.hist(dataset[column], bins=20, edgecolor="k", alpha=0.7) + plt.axvline(Q1, color="r", linestyle="--", label=f"Q1 (25th percentile): {Q1:.2f}") + plt.axvline(Q3, color="b", linestyle="--", label=f"Q3 
(75th percentile): {Q3:.2f}") + plt.axvline( + dataset[column].median(), + color="g", + linestyle="-", + label=f"Median: {dataset[column].median():.2f}", + ) + plt.fill_betweenx([0, plt.ylim()[1]], Q1, Q3, color="gray", alpha=0.3, label=f"IQR: {IQR:.2f}") + + # Highlight outliers + outliers = dataset[(dataset[column] < lower_bound) | (dataset[column] > upper_bound)][column] + plt.scatter(outliers, [0] * len(outliers), color="red", label="Outliers", zorder=5) + + plt.title(f"Histogram with IQR and Outliers for {column}") + plt.xlabel(column) + plt.ylabel("Frequency") + plt.legend() + plt.savefig(temp_file_path, bbox_inches="tight") + files.append(temp_file_path) + + plt.ion() + return TestSuiteReturnType( + metrics={}, + properties={}, + tables=[], + attachments=files, + ) \ No newline at end of file diff --git a/24.2/samples/validation_suites/master_config_test_suites.py b/24.2/samples/validation_suites/master_config_test_suites.py index 0f13a6a..8211612 100644 --- a/24.2/samples/validation_suites/master_config_test_suites.py +++ b/24.2/samples/validation_suites/master_config_test_suites.py @@ -10,19 +10,19 @@ # custom data quality validation tests -from data_quality_modules import ( +from test_modules.data_quality_modules import ( test_dataset_split, iqr_and_outliers, ) # custom data privacy validation tests -from data_privacy_modules import ( +from test_modules.data_privacy_modules import ( sensitive_data_check, sensitive_data_type_check, pii_check, ) -from correlation_matrix_module import ( +from test_modules.correlation_matrix_module import ( plot_correlation_matrix ) diff --git a/24.2/samples/validation_suites/test_modules/correlation_matrix_module.py b/24.2/samples/validation_suites/test_modules/correlation_matrix_module.py new file mode 100644 index 0000000..83510ad --- /dev/null +++ b/24.2/samples/validation_suites/test_modules/correlation_matrix_module.py @@ -0,0 +1,56 @@ +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING, Any, Dict + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns + + +if TYPE_CHECKING: + from matplotlib.container import BarContainer + from numpy import ndarray + from numpy.typing import ArrayLike + from pandas import DataFrame + + from vectice.models.validation import TestSuiteReturnType + +_logger = logging.getLogger(__name__) + +def plot_correlation_matrix( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + internal_parameters: Dict[str, Any] = {"subset_columns": None, "cmap": "Blues"}, +) -> TestSuiteReturnType: + from vectice.models.validation import TestSuiteReturnType + subset_columns = internal_parameters.get("subset_columns", [target_column] + [col for col in training_df.columns[:10] if col != "TARGET"]) + cmap = internal_parameters.get("cmap", "Blues") + + # Select subset of columns + training_df = training_df[subset_columns] + + # Calculate the correlation matrix + corr_matrix = training_df.corr() + + # Plot the correlation matrix + plt.figure(figsize=(10, 8)) + sns.heatmap(corr_matrix, annot=True, cmap=cmap, fmt=".2f", annot_kws={"fontsize": 12}, cbar=True) + plt.title("Correlation Matrix") + + # Save the plot + file_path = "Correlation_matrix_plot.png" + plt.savefig(file_path) + plt.close() + + return TestSuiteReturnType( + metrics={}, + properties={}, + tables=[], + attachments=[file_path], + ) \ No newline at end of file diff --git 
a/24.2/samples/validation_suites/test_modules/data_privacy_modules.py b/24.2/samples/validation_suites/test_modules/data_privacy_modules.py new file mode 100644 index 0000000..90d851f --- /dev/null +++ b/24.2/samples/validation_suites/test_modules/data_privacy_modules.py @@ -0,0 +1,148 @@ +# Write custom tests which can be used to validate your datasets security +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pandas as pd + +if TYPE_CHECKING: + from numpy.typing import ArrayLike + from pandas import DataFrame + + from vectice.models.validation_dataset import TestSuiteReturnType + + +def sensitive_data_check( + dataset: DataFrame | None = None, + training_df: DataFrame | None = None, + testing_df: DataFrame | None = None, + feature_columns: ArrayLike | list | None = None, + target_column: ArrayLike | str | None = None, + sensitive_keywords: list | None = None, +) -> TestSuiteReturnType | None: + from vectice import Table + from vectice.models.validation_dataset import TestSuiteReturnType + + if dataset is None or sensitive_keywords is None: + return None + + # Initialize a dictionary to hold counts of sensitive data + sensitive_counts = {keyword: 0 for keyword in sensitive_keywords} + + # Check each cell in the DataFrame for sensitive keywords + for keyword in sensitive_keywords: + sensitive_counts[keyword] = dataset.apply( + lambda x: x.astype(str).str.contains(keyword, case=False).sum() + ).sum() + + # Create a DataFrame with the results + sensitive_counts_df = pd.DataFrame( + { + "Sensitive Keyword": list(sensitive_counts.keys()), + "Count": list(sensitive_counts.values()), + } + ) + + table = Table(sensitive_counts_df) + + return TestSuiteReturnType( + properties={}, + tables=[table], + attachments=[], + ) + + +def pii_check( + dataset: DataFrame | None = None, + training_df: DataFrame | None = None, + testing_df: DataFrame | None = None, + feature_columns: ArrayLike | list | None = None, + target_column: ArrayLike | str | None = None, +) -> TestSuiteReturnType | None: + from vectice import Table + from vectice.models.validation_dataset import TestSuiteReturnType + + if dataset is None: + return None + + # Define common PII patterns + pii_patterns = { + "name": r"\b[A-Z][a-z]*\b", + "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b", + "phone": r"\b(\+?[\d]{1,3}[-.\s]?[\d]{1,4}[-.\s]?[\d]{1,4}[-.\s]?[\d]{1,9})\b", + } + + # Initialize a dictionary to hold counts of PII matches + pii_counts = {key: 0 for key in pii_patterns.keys()} + + # Check each column in the DataFrame for PII patterns + for column in dataset.columns: + for key, pattern in pii_patterns.items(): + pii_counts[key] += ( + dataset[column] + .astype(str) + .str.contains(pattern, case=False, regex=True) + .sum() + ) + + # Create a DataFrame with the results + pii_counts_df = pd.DataFrame( + {"PII Type": list(pii_counts.keys()), "Count": list(pii_counts.values())} + ) + + table = Table(pii_counts_df) + + return TestSuiteReturnType( + properties={}, + tables=[table], + attachments=[], + ) + + +def sensitive_data_type_check( + dataset: DataFrame | None = None, + training_df: DataFrame | None = None, + testing_df: DataFrame | None = None, + feature_columns: ArrayLike | list | None = None, + target_column: ArrayLike | str | None = None, +) -> TestSuiteReturnType | None: + from vectice import Table + from vectice.models.validation_dataset import TestSuiteReturnType + + if dataset is None: + return None + + # Define patterns for sensitive data types + sensitive_data_patterns = { + 
"credit_card": r"\b(?:\d[ -]*?){13,16}\b", + "ssn": r"\b\d{3}-\d{2}-\d{4}\b", + } + + # Initialize a dictionary to hold counts of sensitive data type matches + sensitive_data_counts = {key: 0 for key in sensitive_data_patterns.keys()} + + # Check each column in the DataFrame for sensitive data type patterns + for column in dataset.columns: + for key, pattern in sensitive_data_patterns.items(): + sensitive_data_counts[key] += ( + dataset[column] + .astype(str) + .str.contains(pattern, case=False, regex=True) + .sum() + ) + + # Create a DataFrame with the results + sensitive_data_counts_df = pd.DataFrame( + { + "Sensitive Data Type": list(sensitive_data_counts.keys()), + "Count": list(sensitive_data_counts.values()), + } + ) + + table = Table(sensitive_data_counts_df) + + return TestSuiteReturnType( + properties={}, + tables=[table], + attachments=[], + ) diff --git a/24.2/samples/validation_suites/test_modules/data_quality_modules.py b/24.2/samples/validation_suites/test_modules/data_quality_modules.py new file mode 100644 index 0000000..05b3ae5 --- /dev/null +++ b/24.2/samples/validation_suites/test_modules/data_quality_modules.py @@ -0,0 +1,116 @@ +# Write custom tests which can be used to validate your datasets quality +from __future__ import annotations + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from pandas import DataFrame +from vectice.models.validation_dataset import TestSuiteReturnType + + +# custom test which can be used for dataset validation +def test_dataset_split( + dataset: DataFrame | None, + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + feature_columns: list | None = None, + threshold: float | None = None, +) -> TestSuiteReturnType: + from vectice import Table + from vectice.models.validation_dataset import TestSuiteReturnType + + if dataset is None: + return None + + total_df = len(training_df) + len(testing_df) + + # Create a DataFrame with the results + datasplit_df = pd.DataFrame( + { + "Dataset": ["Train", "Test", "Total"], + "Size": [len(training_df), len(testing_df), total_df], + "Percentage": [ + (len(training_df) / total_df * 100), + (len(testing_df) / total_df * 100), + 100, + ], + } + ) + + table = Table(datasplit_df) + + return TestSuiteReturnType( + properties={}, + tables=[table], + attachments=[], + ) + + +# custom test which can be used for dataset validation +def iqr_and_outliers( + dataset: DataFrame | None = None, + training_df: DataFrame | None = None, + testing_df: DataFrame | None = None, + feature_columns: list | None = None, + target_column: str | None = None, + threshold: float | None = None, +) -> TestSuiteReturnType | None: + from vectice.models.validation_dataset import TestSuiteReturnType + + if dataset is None: + return None + + files = [] + # disable plots showing + plt.ioff() + for column in dataset.select_dtypes(include=[np.number]).columns: + file_name = f"iqr_and_outliers_{column}.png" + + temp_file_path = file_name + + Q1 = dataset[column].quantile(0.25) + Q3 = dataset[column].quantile(0.75) + IQR = Q3 - Q1 + lower_bound = Q1 - 1.5 * IQR + upper_bound = Q3 + 1.5 * IQR + + plt.figure(figsize=(10, 6)) + plt.hist(dataset[column], bins=20, edgecolor="k", alpha=0.7) + plt.axvline( + Q1, color="r", linestyle="--", label=f"Q1 (25th percentile): {Q1:.2f}" + ) + plt.axvline( + Q3, color="b", linestyle="--", label=f"Q3 (75th percentile): {Q3:.2f}" + ) + plt.axvline( + dataset[column].median(), + color="g", + linestyle="-", + label=f"Median: {dataset[column].median():.2f}", + ) + 
plt.fill_betweenx( + [0, plt.ylim()[1]], Q1, Q3, color="gray", alpha=0.3, label=f"IQR: {IQR:.2f}" + ) + + # Highlight outliers + outliers = dataset[ + (dataset[column] < lower_bound) | (dataset[column] > upper_bound) + ][column] + plt.scatter( + outliers, [0] * len(outliers), color="red", label="Outliers", zorder=5 + ) + + plt.title(f"Histogram with IQR and Outliers for {column}") + plt.xlabel(column) + plt.ylabel("Frequency") + plt.legend() + plt.savefig(temp_file_path, bbox_inches="tight") + files.append(temp_file_path) + + plt.ion() + return TestSuiteReturnType( + properties={}, + tables=[], + attachments=files, + ) From e7b4b46973a5dfcd73fe8e481d09dab4a53df222 Mon Sep 17 00:00:00 2001 From: BDaversa <130710586+BDaversa@users.noreply.github.com> Date: Thu, 11 Jul 2024 13:05:48 -0700 Subject: [PATCH 10/20] Update README.md --- 24.2/samples/validation_suites/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/24.2/samples/validation_suites/README.md b/24.2/samples/validation_suites/README.md index ab6fba0..afec820 100644 --- a/24.2/samples/validation_suites/README.md +++ b/24.2/samples/validation_suites/README.md @@ -7,19 +7,19 @@ | | Feature Importance | `feature_importance` | | | Label Drift | `label_drift` | | | Prediction Drift | `prediction_drift` | -| | **Full Binary Classification Test** | `plot_roc_curve`, `conf_matrix`, `explainability`, `feature_importance`, `label_drift`, `prediction_drift` | +| | **Binary Classification suite** | `plot_roc_curve`, `conf_matrix`, `explainability`, `feature_importance`, `label_drift`, `prediction_drift` | | **Data Privacy Tests** | Sensitive Data Check | `sensitive_data_check` | | | PII Check | `pii_check` | | | Sensitive Data Type Check | `sensitive_data_type_check` | -| | **Full Data Privacy Test** | `sensitive_data_check`, `pii_check`, `sensitive_data_type_check` | +| | **Data Privacy suite** | `sensitive_data_check`, `pii_check`, `sensitive_data_type_check` | | **Data Quality Tests** | Dataset Split Validation | `test_dataset_split` | | | IQR and Outliers | `iqr_and_outliers` | -| | **Full Dataset Quality suiteest** | `test_dataset_split`, `iqr_and_outliers` | +| | **Dataset Quality suite** | `test_dataset_split`, `iqr_and_outliers` | | **Regression Tests** | Residuals Plot | `plot_residuals` | | | R² Score | `r2_score` | | | Explainability | `explainability` | | | Feature Importance | `feature_importance` | | | Target Drift | `target_drift` | | | Prediction Drift | `prediction_drift` | -| | **Full Regression suite** | `plot_residuals`, `r2_score`, `explainability`, `feature_importance`, `target_drift`, `prediction_drift` | +| | **Regression suite** | `plot_residuals`, `r2_score`, `explainability`, `feature_importance`, `target_drift`, `prediction_drift` | From b6b672cfba62fe6387126e76aedaf27bdd745aab Mon Sep 17 00:00:00 2001 From: BDaversa <130710586+BDaversa@users.noreply.github.com> Date: Thu, 11 Jul 2024 14:12:43 -0700 Subject: [PATCH 11/20] Update README.md --- 24.2/samples/validation_suites/README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/24.2/samples/validation_suites/README.md b/24.2/samples/validation_suites/README.md index afec820..20584b1 100644 --- a/24.2/samples/validation_suites/README.md +++ b/24.2/samples/validation_suites/README.md @@ -7,11 +7,13 @@ | | Feature Importance | `feature_importance` | | | Label Drift | `label_drift` | | | Prediction Drift | `prediction_drift` | +| | Recall by class | `recall_by_class ` | +| | Precision by class | `precision_by_class ` 
| | | **Binary Classification suite** | `plot_roc_curve`, `conf_matrix`, `explainability`, `feature_importance`, `label_drift`, `prediction_drift` | +| | **Multiclass Classification suite** | `plot_roc_curve`, `conf_matrix`, `explainability`, `feature_importance`, `label_drift`, `prediction_drift`, `recall_by_class `, `precision_by_class ` | | **Data Privacy Tests** | Sensitive Data Check | `sensitive_data_check` | | | PII Check | `pii_check` | | | Sensitive Data Type Check | `sensitive_data_type_check` | -| | **Data Privacy suite** | `sensitive_data_check`, `pii_check`, `sensitive_data_type_check` | | **Data Quality Tests** | Dataset Split Validation | `test_dataset_split` | | | IQR and Outliers | `iqr_and_outliers` | | | **Dataset Quality suite** | `test_dataset_split`, `iqr_and_outliers` | @@ -23,3 +25,4 @@ | | Prediction Drift | `prediction_drift` | | | **Regression suite** | `plot_residuals`, `r2_score`, `explainability`, `feature_importance`, `target_drift`, `prediction_drift` | + From 91072e56c18314bad08b26c49711709943093ba7 Mon Sep 17 00:00:00 2001 From: BDaversa <130710586+BDaversa@users.noreply.github.com> Date: Thu, 11 Jul 2024 14:14:13 -0700 Subject: [PATCH 12/20] Update README.md --- 24.2/samples/validation_suites/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/24.2/samples/validation_suites/README.md b/24.2/samples/validation_suites/README.md index 20584b1..34f5df3 100644 --- a/24.2/samples/validation_suites/README.md +++ b/24.2/samples/validation_suites/README.md @@ -1,4 +1,4 @@ -## All test modules below can be added to your test suite to by run on any models or datasets +## All tests and suites below can be added to your test suite to by run on any models or datasets and available inside Vectice default validation Library | **Category** | **Test Name** | **Function** | |------------------------------|----------------------------------|--------------------------------------| | **Classification Tests** | ROC Curve | `plot_roc_curve` | From 5b6027e08be62a9fe399fbdc7a1536c0e4a284b2 Mon Sep 17 00:00:00 2001 From: BDaversa <130710586+BDaversa@users.noreply.github.com> Date: Thu, 11 Jul 2024 14:41:42 -0700 Subject: [PATCH 13/20] Update README.md --- 24.2/samples/validation_suites/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/24.2/samples/validation_suites/README.md b/24.2/samples/validation_suites/README.md index 34f5df3..1dd831e 100644 --- a/24.2/samples/validation_suites/README.md +++ b/24.2/samples/validation_suites/README.md @@ -1,4 +1,4 @@ -## All tests and suites below can be added to your test suite to by run on any models or datasets and available inside Vectice default validation Library +## List of default validation tests provided by Vectice with source code from PiML | **Category** | **Test Name** | **Function** | |------------------------------|----------------------------------|--------------------------------------| | **Classification Tests** | ROC Curve | `plot_roc_curve` | From 8c72c49fba7445aaf80b46de3c4f6cd7b0cae04e Mon Sep 17 00:00:00 2001 From: BDaversa <130710586+BDaversa@users.noreply.github.com> Date: Thu, 11 Jul 2024 14:42:13 -0700 Subject: [PATCH 14/20] Update README.md --- 24.2/samples/validation_suites/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/24.2/samples/validation_suites/README.md b/24.2/samples/validation_suites/README.md index 1dd831e..6822cdc 100644 --- a/24.2/samples/validation_suites/README.md +++ b/24.2/samples/validation_suites/README.md @@ -1,4 +1,4 
@@ -## List of default validation tests provided by Vectice with source code from PiML +## List of validation tests provided by Vectice with source code from PiML | **Category** | **Test Name** | **Function** | |------------------------------|----------------------------------|--------------------------------------| | **Classification Tests** | ROC Curve | `plot_roc_curve` | From 0702506d3d472dafb1feac2545faed12ad383ebf Mon Sep 17 00:00:00 2001 From: BDaversa <130710586+BDaversa@users.noreply.github.com> Date: Thu, 11 Jul 2024 14:42:33 -0700 Subject: [PATCH 15/20] Update README.md --- 24.2/samples/validation_suites/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/24.2/samples/validation_suites/README.md b/24.2/samples/validation_suites/README.md index 6822cdc..349c78f 100644 --- a/24.2/samples/validation_suites/README.md +++ b/24.2/samples/validation_suites/README.md @@ -1,4 +1,4 @@ -## List of validation tests provided by Vectice with source code from PiML +## List of validation tests provided by Vectice (source code from PiML) | **Category** | **Test Name** | **Function** | |------------------------------|----------------------------------|--------------------------------------| | **Classification Tests** | ROC Curve | `plot_roc_curve` | From de22e95b4e96226874c684c25aef6d4687ae4e5d Mon Sep 17 00:00:00 2001 From: BDaversa Date: Thu, 11 Jul 2024 14:48:47 -0700 Subject: [PATCH 16/20] reshape --- .../README.md | 0 .../master_config_test_suites.py | 0 .../correlation_matrix_module.py | 0 .../test_modules}/data_privacy_modules.py | 0 .../test_modules}/data_quality_modules.py | 0 .../test_modules/default_tests_vectice.py} | 0 .../test_modules/correlation_matrix_module.py | 56 ------- .../test_modules/data_privacy_modules.py | 148 ------------------ .../test_modules/data_quality_modules.py | 116 -------------- 9 files changed, 320 deletions(-) rename 24.2/samples/{validation_suites => test_suites_config}/README.md (100%) rename 24.2/samples/{validation_suites => test_suites_config}/master_config_test_suites.py (100%) rename 24.2/samples/{validation_suites => test_suites_config/test_modules}/correlation_matrix_module.py (100%) rename 24.2/samples/{validation_suites => test_suites_config/test_modules}/data_privacy_modules.py (100%) rename 24.2/samples/{validation_suites => test_suites_config/test_modules}/data_quality_modules.py (100%) rename 24.2/samples/{validation_suites/PiML_wrapper.py => test_suites_config/test_modules/default_tests_vectice.py} (100%) delete mode 100644 24.2/samples/validation_suites/test_modules/correlation_matrix_module.py delete mode 100644 24.2/samples/validation_suites/test_modules/data_privacy_modules.py delete mode 100644 24.2/samples/validation_suites/test_modules/data_quality_modules.py diff --git a/24.2/samples/validation_suites/README.md b/24.2/samples/test_suites_config/README.md similarity index 100% rename from 24.2/samples/validation_suites/README.md rename to 24.2/samples/test_suites_config/README.md diff --git a/24.2/samples/validation_suites/master_config_test_suites.py b/24.2/samples/test_suites_config/master_config_test_suites.py similarity index 100% rename from 24.2/samples/validation_suites/master_config_test_suites.py rename to 24.2/samples/test_suites_config/master_config_test_suites.py diff --git a/24.2/samples/validation_suites/correlation_matrix_module.py b/24.2/samples/test_suites_config/test_modules/correlation_matrix_module.py similarity index 100% rename from 
24.2/samples/validation_suites/correlation_matrix_module.py rename to 24.2/samples/test_suites_config/test_modules/correlation_matrix_module.py diff --git a/24.2/samples/validation_suites/data_privacy_modules.py b/24.2/samples/test_suites_config/test_modules/data_privacy_modules.py similarity index 100% rename from 24.2/samples/validation_suites/data_privacy_modules.py rename to 24.2/samples/test_suites_config/test_modules/data_privacy_modules.py diff --git a/24.2/samples/validation_suites/data_quality_modules.py b/24.2/samples/test_suites_config/test_modules/data_quality_modules.py similarity index 100% rename from 24.2/samples/validation_suites/data_quality_modules.py rename to 24.2/samples/test_suites_config/test_modules/data_quality_modules.py diff --git a/24.2/samples/validation_suites/PiML_wrapper.py b/24.2/samples/test_suites_config/test_modules/default_tests_vectice.py similarity index 100% rename from 24.2/samples/validation_suites/PiML_wrapper.py rename to 24.2/samples/test_suites_config/test_modules/default_tests_vectice.py diff --git a/24.2/samples/validation_suites/test_modules/correlation_matrix_module.py b/24.2/samples/validation_suites/test_modules/correlation_matrix_module.py deleted file mode 100644 index 83510ad..0000000 --- a/24.2/samples/validation_suites/test_modules/correlation_matrix_module.py +++ /dev/null @@ -1,56 +0,0 @@ -from __future__ import annotations - -import logging -from typing import TYPE_CHECKING, Any, Dict - -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -import seaborn as sns - - -if TYPE_CHECKING: - from matplotlib.container import BarContainer - from numpy import ndarray - from numpy.typing import ArrayLike - from pandas import DataFrame - - from vectice.models.validation import TestSuiteReturnType - -_logger = logging.getLogger(__name__) - -def plot_correlation_matrix( - training_df: DataFrame, - testing_df: DataFrame, - target_column: str, - predictor: Any, - predict_proba_train: ArrayLike | None, - predict_proba_test: ArrayLike | None, - internal_parameters: Dict[str, Any] = {"subset_columns": None, "cmap": "Blues"}, -) -> TestSuiteReturnType: - from vectice.models.validation import TestSuiteReturnType - subset_columns = internal_parameters.get("subset_columns", [target_column] + [col for col in training_df.columns[:10] if col != "TARGET"]) - cmap = internal_parameters.get("cmap", "Blues") - - # Select subset of columns - training_df = training_df[subset_columns] - - # Calculate the correlation matrix - corr_matrix = training_df.corr() - - # Plot the correlation matrix - plt.figure(figsize=(10, 8)) - sns.heatmap(corr_matrix, annot=True, cmap=cmap, fmt=".2f", annot_kws={"fontsize": 12}, cbar=True) - plt.title("Correlation Matrix") - - # Save the plot - file_path = "Correlation_matrix_plot.png" - plt.savefig(file_path) - plt.close() - - return TestSuiteReturnType( - metrics={}, - properties={}, - tables=[], - attachments=[file_path], - ) \ No newline at end of file diff --git a/24.2/samples/validation_suites/test_modules/data_privacy_modules.py b/24.2/samples/validation_suites/test_modules/data_privacy_modules.py deleted file mode 100644 index 90d851f..0000000 --- a/24.2/samples/validation_suites/test_modules/data_privacy_modules.py +++ /dev/null @@ -1,148 +0,0 @@ -# Write custom tests which can be used to validate your datasets security -from __future__ import annotations - -from typing import TYPE_CHECKING - -import pandas as pd - -if TYPE_CHECKING: - from numpy.typing import ArrayLike - from pandas import DataFrame - 
- from vectice.models.validation_dataset import TestSuiteReturnType - - -def sensitive_data_check( - dataset: DataFrame | None = None, - training_df: DataFrame | None = None, - testing_df: DataFrame | None = None, - feature_columns: ArrayLike | list | None = None, - target_column: ArrayLike | str | None = None, - sensitive_keywords: list | None = None, -) -> TestSuiteReturnType | None: - from vectice import Table - from vectice.models.validation_dataset import TestSuiteReturnType - - if dataset is None or sensitive_keywords is None: - return None - - # Initialize a dictionary to hold counts of sensitive data - sensitive_counts = {keyword: 0 for keyword in sensitive_keywords} - - # Check each cell in the DataFrame for sensitive keywords - for keyword in sensitive_keywords: - sensitive_counts[keyword] = dataset.apply( - lambda x: x.astype(str).str.contains(keyword, case=False).sum() - ).sum() - - # Create a DataFrame with the results - sensitive_counts_df = pd.DataFrame( - { - "Sensitive Keyword": list(sensitive_counts.keys()), - "Count": list(sensitive_counts.values()), - } - ) - - table = Table(sensitive_counts_df) - - return TestSuiteReturnType( - properties={}, - tables=[table], - attachments=[], - ) - - -def pii_check( - dataset: DataFrame | None = None, - training_df: DataFrame | None = None, - testing_df: DataFrame | None = None, - feature_columns: ArrayLike | list | None = None, - target_column: ArrayLike | str | None = None, -) -> TestSuiteReturnType | None: - from vectice import Table - from vectice.models.validation_dataset import TestSuiteReturnType - - if dataset is None: - return None - - # Define common PII patterns - pii_patterns = { - "name": r"\b[A-Z][a-z]*\b", - "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b", - "phone": r"\b(\+?[\d]{1,3}[-.\s]?[\d]{1,4}[-.\s]?[\d]{1,4}[-.\s]?[\d]{1,9})\b", - } - - # Initialize a dictionary to hold counts of PII matches - pii_counts = {key: 0 for key in pii_patterns.keys()} - - # Check each column in the DataFrame for PII patterns - for column in dataset.columns: - for key, pattern in pii_patterns.items(): - pii_counts[key] += ( - dataset[column] - .astype(str) - .str.contains(pattern, case=False, regex=True) - .sum() - ) - - # Create a DataFrame with the results - pii_counts_df = pd.DataFrame( - {"PII Type": list(pii_counts.keys()), "Count": list(pii_counts.values())} - ) - - table = Table(pii_counts_df) - - return TestSuiteReturnType( - properties={}, - tables=[table], - attachments=[], - ) - - -def sensitive_data_type_check( - dataset: DataFrame | None = None, - training_df: DataFrame | None = None, - testing_df: DataFrame | None = None, - feature_columns: ArrayLike | list | None = None, - target_column: ArrayLike | str | None = None, -) -> TestSuiteReturnType | None: - from vectice import Table - from vectice.models.validation_dataset import TestSuiteReturnType - - if dataset is None: - return None - - # Define patterns for sensitive data types - sensitive_data_patterns = { - "credit_card": r"\b(?:\d[ -]*?){13,16}\b", - "ssn": r"\b\d{3}-\d{2}-\d{4}\b", - } - - # Initialize a dictionary to hold counts of sensitive data type matches - sensitive_data_counts = {key: 0 for key in sensitive_data_patterns.keys()} - - # Check each column in the DataFrame for sensitive data type patterns - for column in dataset.columns: - for key, pattern in sensitive_data_patterns.items(): - sensitive_data_counts[key] += ( - dataset[column] - .astype(str) - .str.contains(pattern, case=False, regex=True) - .sum() - ) - - # Create a DataFrame 
with the results - sensitive_data_counts_df = pd.DataFrame( - { - "Sensitive Data Type": list(sensitive_data_counts.keys()), - "Count": list(sensitive_data_counts.values()), - } - ) - - table = Table(sensitive_data_counts_df) - - return TestSuiteReturnType( - properties={}, - tables=[table], - attachments=[], - ) diff --git a/24.2/samples/validation_suites/test_modules/data_quality_modules.py b/24.2/samples/validation_suites/test_modules/data_quality_modules.py deleted file mode 100644 index 05b3ae5..0000000 --- a/24.2/samples/validation_suites/test_modules/data_quality_modules.py +++ /dev/null @@ -1,116 +0,0 @@ -# Write custom tests which can be used to validate your datasets quality -from __future__ import annotations - -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -from pandas import DataFrame -from vectice.models.validation_dataset import TestSuiteReturnType - - -# custom test which can be used for dataset validation -def test_dataset_split( - dataset: DataFrame | None, - training_df: DataFrame, - testing_df: DataFrame, - target_column: str, - feature_columns: list | None = None, - threshold: float | None = None, -) -> TestSuiteReturnType: - from vectice import Table - from vectice.models.validation_dataset import TestSuiteReturnType - - if dataset is None: - return None - - total_df = len(training_df) + len(testing_df) - - # Create a DataFrame with the results - datasplit_df = pd.DataFrame( - { - "Dataset": ["Train", "Test", "Total"], - "Size": [len(training_df), len(testing_df), total_df], - "Percentage": [ - (len(training_df) / total_df * 100), - (len(testing_df) / total_df * 100), - 100, - ], - } - ) - - table = Table(datasplit_df) - - return TestSuiteReturnType( - properties={}, - tables=[table], - attachments=[], - ) - - -# custom test which can be used for dataset validation -def iqr_and_outliers( - dataset: DataFrame | None = None, - training_df: DataFrame | None = None, - testing_df: DataFrame | None = None, - feature_columns: list | None = None, - target_column: str | None = None, - threshold: float | None = None, -) -> TestSuiteReturnType | None: - from vectice.models.validation_dataset import TestSuiteReturnType - - if dataset is None: - return None - - files = [] - # disable plots showing - plt.ioff() - for column in dataset.select_dtypes(include=[np.number]).columns: - file_name = f"iqr_and_outliers_{column}.png" - - temp_file_path = file_name - - Q1 = dataset[column].quantile(0.25) - Q3 = dataset[column].quantile(0.75) - IQR = Q3 - Q1 - lower_bound = Q1 - 1.5 * IQR - upper_bound = Q3 + 1.5 * IQR - - plt.figure(figsize=(10, 6)) - plt.hist(dataset[column], bins=20, edgecolor="k", alpha=0.7) - plt.axvline( - Q1, color="r", linestyle="--", label=f"Q1 (25th percentile): {Q1:.2f}" - ) - plt.axvline( - Q3, color="b", linestyle="--", label=f"Q3 (75th percentile): {Q3:.2f}" - ) - plt.axvline( - dataset[column].median(), - color="g", - linestyle="-", - label=f"Median: {dataset[column].median():.2f}", - ) - plt.fill_betweenx( - [0, plt.ylim()[1]], Q1, Q3, color="gray", alpha=0.3, label=f"IQR: {IQR:.2f}" - ) - - # Highlight outliers - outliers = dataset[ - (dataset[column] < lower_bound) | (dataset[column] > upper_bound) - ][column] - plt.scatter( - outliers, [0] * len(outliers), color="red", label="Outliers", zorder=5 - ) - - plt.title(f"Histogram with IQR and Outliers for {column}") - plt.xlabel(column) - plt.ylabel("Frequency") - plt.legend() - plt.savefig(temp_file_path, bbox_inches="tight") - files.append(temp_file_path) - - plt.ion() - return 
TestSuiteReturnType( - properties={}, - tables=[], - attachments=files, - ) From d5c7c8ea3d71590951320c29247e5cf9fe48c40d Mon Sep 17 00:00:00 2001 From: BDaversa Date: Thu, 11 Jul 2024 15:53:12 -0700 Subject: [PATCH 17/20] rephrase suite --- .../test_suites_config/master_config_test_suites.py | 9 +++++---- .../test_modules/correlation_matrix_module.py | 2 ++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/24.2/samples/test_suites_config/master_config_test_suites.py b/24.2/samples/test_suites_config/master_config_test_suites.py index 8211612..24a5005 100644 --- a/24.2/samples/test_suites_config/master_config_test_suites.py +++ b/24.2/samples/test_suites_config/master_config_test_suites.py @@ -27,7 +27,7 @@ ) -# The master test suite file is used to map all suite of test which can be run. +# The master test suite file is used to map all ADDITIONAL suite of test which can be run. # The tests can be provided by Vectice or custom functions from your modules. # Vectice uses this configuration to simply identify and bundle available tests into suite, when you run # your validations in your notebook. @@ -42,13 +42,14 @@ label_drift, prediction_drift, ], - "data_quality": [ + "data_quality_ext": [ test_dataset_split, iqr_and_outliers, + plot_correlation_matrix, ], - "custom":[ + "corr_matrix_ext": [ plot_correlation_matrix, - ] + ], } # Map the tests to be used for data privacy validation diff --git a/24.2/samples/test_suites_config/test_modules/correlation_matrix_module.py b/24.2/samples/test_suites_config/test_modules/correlation_matrix_module.py index 83510ad..3935f9a 100644 --- a/24.2/samples/test_suites_config/test_modules/correlation_matrix_module.py +++ b/24.2/samples/test_suites_config/test_modules/correlation_matrix_module.py @@ -29,6 +29,7 @@ def plot_correlation_matrix( internal_parameters: Dict[str, Any] = {"subset_columns": None, "cmap": "Blues"}, ) -> TestSuiteReturnType: from vectice.models.validation import TestSuiteReturnType + subset_columns = internal_parameters.get("subset_columns", [target_column] + [col for col in training_df.columns[:10] if col != "TARGET"]) cmap = internal_parameters.get("cmap", "Blues") @@ -48,6 +49,7 @@ def plot_correlation_matrix( plt.savefig(file_path) plt.close() + # RETURN IN THE VECTICE EXPECTED FORMART return TestSuiteReturnType( metrics={}, properties={}, From 8b500550bbec3c9a6134eeea6cff49f8134bfce2 Mon Sep 17 00:00:00 2001 From: BDaversa Date: Thu, 11 Jul 2024 17:36:22 -0700 Subject: [PATCH 18/20] reorganize folder --- Validation test/test_suites_config/README.md | 28 + .../master_config_test_suites.py | 65 +++ .../test_modules/correlation_matrix_module.py | 58 ++ .../test_modules/data_privacy_modules.py | 148 +++++ .../test_modules/data_quality_modules.py | 116 ++++ .../test_modules/default_tests_vectice.py | 523 ++++++++++++++++++ 6 files changed, 938 insertions(+) create mode 100644 Validation test/test_suites_config/README.md create mode 100644 Validation test/test_suites_config/master_config_test_suites.py create mode 100644 Validation test/test_suites_config/test_modules/correlation_matrix_module.py create mode 100644 Validation test/test_suites_config/test_modules/data_privacy_modules.py create mode 100644 Validation test/test_suites_config/test_modules/data_quality_modules.py create mode 100644 Validation test/test_suites_config/test_modules/default_tests_vectice.py diff --git a/Validation test/test_suites_config/README.md b/Validation test/test_suites_config/README.md new file mode 100644 index 0000000..349c78f --- /dev/null 
+++ b/Validation test/test_suites_config/README.md @@ -0,0 +1,28 @@ +## List of validation tests provided by Vectice (source code from PiML) +| **Category** | **Test Name** | **Function** | +|------------------------------|----------------------------------|--------------------------------------| +| **Classification Tests** | ROC Curve | `plot_roc_curve` | +| | Confusion Matrix | `conf_matrix` | +| | Explainability | `explainability` | +| | Feature Importance | `feature_importance` | +| | Label Drift | `label_drift` | +| | Prediction Drift | `prediction_drift` | +| | Recall by class | `recall_by_class ` | +| | Precision by class | `precision_by_class ` | +| | **Binary Classification suite** | `plot_roc_curve`, `conf_matrix`, `explainability`, `feature_importance`, `label_drift`, `prediction_drift` | +| | **Multiclass Classification suite** | `plot_roc_curve`, `conf_matrix`, `explainability`, `feature_importance`, `label_drift`, `prediction_drift`, `recall_by_class `, `precision_by_class ` | +| **Data Privacy Tests** | Sensitive Data Check | `sensitive_data_check` | +| | PII Check | `pii_check` | +| | Sensitive Data Type Check | `sensitive_data_type_check` | +| **Data Quality Tests** | Dataset Split Validation | `test_dataset_split` | +| | IQR and Outliers | `iqr_and_outliers` | +| | **Dataset Quality suite** | `test_dataset_split`, `iqr_and_outliers` | +| **Regression Tests** | Residuals Plot | `plot_residuals` | +| | R² Score | `r2_score` | +| | Explainability | `explainability` | +| | Feature Importance | `feature_importance` | +| | Target Drift | `target_drift` | +| | Prediction Drift | `prediction_drift` | +| | **Regression suite** | `plot_residuals`, `r2_score`, `explainability`, `feature_importance`, `target_drift`, `prediction_drift` | + + diff --git a/Validation test/test_suites_config/master_config_test_suites.py b/Validation test/test_suites_config/master_config_test_suites.py new file mode 100644 index 0000000..24a5005 --- /dev/null +++ b/Validation test/test_suites_config/master_config_test_suites.py @@ -0,0 +1,65 @@ +# import the Vectice provided probability of default validation tests +from vectice.models.test_library.binary_classification_test import ( + plot_roc_curve, + conf_matrix, + explainability, + feature_importance, + label_drift, + prediction_drift, +) + + +# custom data quality validation tests +from test_modules.data_quality_modules import ( + test_dataset_split, + iqr_and_outliers, +) + +# custom data privacy validation tests +from test_modules.data_privacy_modules import ( + sensitive_data_check, + sensitive_data_type_check, + pii_check, +) + +from test_modules.correlation_matrix_module import ( + plot_correlation_matrix +) + + +# The master test suite file is used to map all ADDITIONAL suite of test which can be run. +# The tests can be provided by Vectice or custom functions from your modules. +# Vectice uses this configuration to simply identify and bundle available tests into suite, when you run +# your validations in your notebook. 
+ +# Accumulation and mapping of all validation tests to be run for the PD model suite +PD_model_suite= { + "binary_suite": [ + plot_roc_curve, + conf_matrix, + explainability, + feature_importance, + label_drift, + prediction_drift, + ], + "data_quality_ext": [ + test_dataset_split, + iqr_and_outliers, + plot_correlation_matrix, + ], + "corr_matrix_ext": [ + plot_correlation_matrix, + ], +} + +# Map the tests to be used for data privacy validation +Robustness_suite = { + "sensitive_data_check": sensitive_data_check, + "pii_check": pii_check, + "sensitive_data_type_check": sensitive_data_type_check, + "data_privacy_full_suite": [ + sensitive_data_check, + pii_check, + sensitive_data_type_check, + ], +} diff --git a/Validation test/test_suites_config/test_modules/correlation_matrix_module.py b/Validation test/test_suites_config/test_modules/correlation_matrix_module.py new file mode 100644 index 0000000..3935f9a --- /dev/null +++ b/Validation test/test_suites_config/test_modules/correlation_matrix_module.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING, Any, Dict + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns + + +if TYPE_CHECKING: + from matplotlib.container import BarContainer + from numpy import ndarray + from numpy.typing import ArrayLike + from pandas import DataFrame + + from vectice.models.validation import TestSuiteReturnType + +_logger = logging.getLogger(__name__) + +def plot_correlation_matrix( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + internal_parameters: Dict[str, Any] = {"subset_columns": None, "cmap": "Blues"}, +) -> TestSuiteReturnType: + from vectice.models.validation import TestSuiteReturnType + + subset_columns = internal_parameters.get("subset_columns", [target_column] + [col for col in training_df.columns[:10] if col != "TARGET"]) + cmap = internal_parameters.get("cmap", "Blues") + + # Select subset of columns + training_df = training_df[subset_columns] + + # Calculate the correlation matrix + corr_matrix = training_df.corr() + + # Plot the correlation matrix + plt.figure(figsize=(10, 8)) + sns.heatmap(corr_matrix, annot=True, cmap=cmap, fmt=".2f", annot_kws={"fontsize": 12}, cbar=True) + plt.title("Correlation Matrix") + + # Save the plot + file_path = "Correlation_matrix_plot.png" + plt.savefig(file_path) + plt.close() + + # RETURN IN THE VECTICE EXPECTED FORMART + return TestSuiteReturnType( + metrics={}, + properties={}, + tables=[], + attachments=[file_path], + ) \ No newline at end of file diff --git a/Validation test/test_suites_config/test_modules/data_privacy_modules.py b/Validation test/test_suites_config/test_modules/data_privacy_modules.py new file mode 100644 index 0000000..90d851f --- /dev/null +++ b/Validation test/test_suites_config/test_modules/data_privacy_modules.py @@ -0,0 +1,148 @@ +# Write custom tests which can be used to validate your datasets security +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pandas as pd + +if TYPE_CHECKING: + from numpy.typing import ArrayLike + from pandas import DataFrame + + from vectice.models.validation_dataset import TestSuiteReturnType + + +def sensitive_data_check( + dataset: DataFrame | None = None, + training_df: DataFrame | None = None, + testing_df: DataFrame | None = None, + feature_columns: ArrayLike | list | None = None, + 
target_column: ArrayLike | str | None = None, + sensitive_keywords: list | None = None, +) -> TestSuiteReturnType | None: + from vectice import Table + from vectice.models.validation_dataset import TestSuiteReturnType + + if dataset is None or sensitive_keywords is None: + return None + + # Initialize a dictionary to hold counts of sensitive data + sensitive_counts = {keyword: 0 for keyword in sensitive_keywords} + + # Check each cell in the DataFrame for sensitive keywords + for keyword in sensitive_keywords: + sensitive_counts[keyword] = dataset.apply( + lambda x: x.astype(str).str.contains(keyword, case=False).sum() + ).sum() + + # Create a DataFrame with the results + sensitive_counts_df = pd.DataFrame( + { + "Sensitive Keyword": list(sensitive_counts.keys()), + "Count": list(sensitive_counts.values()), + } + ) + + table = Table(sensitive_counts_df) + + return TestSuiteReturnType( + properties={}, + tables=[table], + attachments=[], + ) + + +def pii_check( + dataset: DataFrame | None = None, + training_df: DataFrame | None = None, + testing_df: DataFrame | None = None, + feature_columns: ArrayLike | list | None = None, + target_column: ArrayLike | str | None = None, +) -> TestSuiteReturnType | None: + from vectice import Table + from vectice.models.validation_dataset import TestSuiteReturnType + + if dataset is None: + return None + + # Define common PII patterns + pii_patterns = { + "name": r"\b[A-Z][a-z]*\b", + "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b", + "phone": r"\b(\+?[\d]{1,3}[-.\s]?[\d]{1,4}[-.\s]?[\d]{1,4}[-.\s]?[\d]{1,9})\b", + } + + # Initialize a dictionary to hold counts of PII matches + pii_counts = {key: 0 for key in pii_patterns.keys()} + + # Check each column in the DataFrame for PII patterns + for column in dataset.columns: + for key, pattern in pii_patterns.items(): + pii_counts[key] += ( + dataset[column] + .astype(str) + .str.contains(pattern, case=False, regex=True) + .sum() + ) + + # Create a DataFrame with the results + pii_counts_df = pd.DataFrame( + {"PII Type": list(pii_counts.keys()), "Count": list(pii_counts.values())} + ) + + table = Table(pii_counts_df) + + return TestSuiteReturnType( + properties={}, + tables=[table], + attachments=[], + ) + + +def sensitive_data_type_check( + dataset: DataFrame | None = None, + training_df: DataFrame | None = None, + testing_df: DataFrame | None = None, + feature_columns: ArrayLike | list | None = None, + target_column: ArrayLike | str | None = None, +) -> TestSuiteReturnType | None: + from vectice import Table + from vectice.models.validation_dataset import TestSuiteReturnType + + if dataset is None: + return None + + # Define patterns for sensitive data types + sensitive_data_patterns = { + "credit_card": r"\b(?:\d[ -]*?){13,16}\b", + "ssn": r"\b\d{3}-\d{2}-\d{4}\b", + } + + # Initialize a dictionary to hold counts of sensitive data type matches + sensitive_data_counts = {key: 0 for key in sensitive_data_patterns.keys()} + + # Check each column in the DataFrame for sensitive data type patterns + for column in dataset.columns: + for key, pattern in sensitive_data_patterns.items(): + sensitive_data_counts[key] += ( + dataset[column] + .astype(str) + .str.contains(pattern, case=False, regex=True) + .sum() + ) + + # Create a DataFrame with the results + sensitive_data_counts_df = pd.DataFrame( + { + "Sensitive Data Type": list(sensitive_data_counts.keys()), + "Count": list(sensitive_data_counts.values()), + } + ) + + table = Table(sensitive_data_counts_df) + + return TestSuiteReturnType( + 
properties={}, + tables=[table], + attachments=[], + ) diff --git a/Validation test/test_suites_config/test_modules/data_quality_modules.py b/Validation test/test_suites_config/test_modules/data_quality_modules.py new file mode 100644 index 0000000..05b3ae5 --- /dev/null +++ b/Validation test/test_suites_config/test_modules/data_quality_modules.py @@ -0,0 +1,116 @@ +# Write custom tests which can be used to validate your datasets quality +from __future__ import annotations + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from pandas import DataFrame +from vectice.models.validation_dataset import TestSuiteReturnType + + +# custom test which can be used for dataset validation +def test_dataset_split( + dataset: DataFrame | None, + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + feature_columns: list | None = None, + threshold: float | None = None, +) -> TestSuiteReturnType: + from vectice import Table + from vectice.models.validation_dataset import TestSuiteReturnType + + if dataset is None: + return None + + total_df = len(training_df) + len(testing_df) + + # Create a DataFrame with the results + datasplit_df = pd.DataFrame( + { + "Dataset": ["Train", "Test", "Total"], + "Size": [len(training_df), len(testing_df), total_df], + "Percentage": [ + (len(training_df) / total_df * 100), + (len(testing_df) / total_df * 100), + 100, + ], + } + ) + + table = Table(datasplit_df) + + return TestSuiteReturnType( + properties={}, + tables=[table], + attachments=[], + ) + + +# custom test which can be used for dataset validation +def iqr_and_outliers( + dataset: DataFrame | None = None, + training_df: DataFrame | None = None, + testing_df: DataFrame | None = None, + feature_columns: list | None = None, + target_column: str | None = None, + threshold: float | None = None, +) -> TestSuiteReturnType | None: + from vectice.models.validation_dataset import TestSuiteReturnType + + if dataset is None: + return None + + files = [] + # disable plots showing + plt.ioff() + for column in dataset.select_dtypes(include=[np.number]).columns: + file_name = f"iqr_and_outliers_{column}.png" + + temp_file_path = file_name + + Q1 = dataset[column].quantile(0.25) + Q3 = dataset[column].quantile(0.75) + IQR = Q3 - Q1 + lower_bound = Q1 - 1.5 * IQR + upper_bound = Q3 + 1.5 * IQR + + plt.figure(figsize=(10, 6)) + plt.hist(dataset[column], bins=20, edgecolor="k", alpha=0.7) + plt.axvline( + Q1, color="r", linestyle="--", label=f"Q1 (25th percentile): {Q1:.2f}" + ) + plt.axvline( + Q3, color="b", linestyle="--", label=f"Q3 (75th percentile): {Q3:.2f}" + ) + plt.axvline( + dataset[column].median(), + color="g", + linestyle="-", + label=f"Median: {dataset[column].median():.2f}", + ) + plt.fill_betweenx( + [0, plt.ylim()[1]], Q1, Q3, color="gray", alpha=0.3, label=f"IQR: {IQR:.2f}" + ) + + # Highlight outliers + outliers = dataset[ + (dataset[column] < lower_bound) | (dataset[column] > upper_bound) + ][column] + plt.scatter( + outliers, [0] * len(outliers), color="red", label="Outliers", zorder=5 + ) + + plt.title(f"Histogram with IQR and Outliers for {column}") + plt.xlabel(column) + plt.ylabel("Frequency") + plt.legend() + plt.savefig(temp_file_path, bbox_inches="tight") + files.append(temp_file_path) + + plt.ion() + return TestSuiteReturnType( + properties={}, + tables=[], + attachments=files, + ) diff --git a/Validation test/test_suites_config/test_modules/default_tests_vectice.py b/Validation test/test_suites_config/test_modules/default_tests_vectice.py new file mode 100644 
index 0000000..5ec37db --- /dev/null +++ b/Validation test/test_suites_config/test_modules/default_tests_vectice.py @@ -0,0 +1,523 @@ +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING, Any, Dict + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns +import shap +from scipy.stats import chi2_contingency, ks_2samp +from sklearn.metrics import auc, confusion_matrix, precision_score, recall_score, roc_curve + +if TYPE_CHECKING: + from matplotlib.container import BarContainer + from numpy import ndarray + from numpy.typing import ArrayLike + from pandas import DataFrame + + from vectice.models.validation import TestSuiteReturnType + +_logger = logging.getLogger(__name__) + + +def plot_roc_curve( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + internal_parameters: Dict[str, Any] = {"train_color": "green", "test_color": "blue", "threshold": 0.5}, +) -> TestSuiteReturnType | None: + from vectice.models.validation import TestSuiteReturnType + + X_train = training_df.drop(columns=[target_column]) + X_test = testing_df.drop(columns=[target_column]) + training_prediction_proba = predictor.predict_proba(X_train)[:, 1] + testing_prediction_proba = predictor.predict_proba(X_test)[:, 1] + + if predict_proba_train is not None: + training_prediction_proba = predict_proba_train + + if predict_proba_test is not None: + testing_prediction_proba = predict_proba_test + + fpr_train, tpr_train, _ = roc_curve(training_df[target_column], training_prediction_proba) + roc_auc_train = auc(fpr_train, tpr_train) + + fpr_test, tpr_test, _ = roc_curve(testing_df[target_column], testing_prediction_proba) + roc_auc_test = auc(fpr_test, tpr_test) + + file_path = "ROC_CURVE.png" + + plt.figure(figsize=(8, 6)) + plt.plot( + fpr_train, + tpr_train, + color=internal_parameters["train_color"], + linestyle="--", + label=f"Train ROC curve (AUC = {roc_auc_train:.2f})", + ) + plt.plot( + fpr_test, + tpr_test, + color=internal_parameters["test_color"], + label=f"Test ROC curve (AUC = {roc_auc_test:.2f})", + ) + plt.plot([0, 1], [0, 1], color="red", linestyle="--") + plt.xlabel("False Positive Rate") + plt.ylabel("True Positive Rate") + plt.title("Receiver Operating Characteristic (ROC) Curve") + plt.legend() + plt.grid(True) + plt.savefig(file_path) + plt.close() + + return TestSuiteReturnType( + metrics={"_ROC_auc_train": roc_auc_train, "_ROC_auc_test": roc_auc_test}, + properties={}, + tables=[], + attachments=[file_path], + ) + + +def conf_matrix( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + internal_parameters: Dict[str, Any] = {"threshold": 0.5, "cmap": "Blues"}, +) -> TestSuiteReturnType: + from vectice.models.validation import TestSuiteReturnType + + threshold = internal_parameters["threshold"] + cmap = internal_parameters.get("cmap", "Blues") + + X_test = testing_df.drop(columns=[target_column]) + testing_prediction_proba = predictor.predict_proba(X_test)[:, 1] + + if predict_proba_test is not None: + testing_prediction_proba = predict_proba_test + + testing_prediction = (testing_prediction_proba >= threshold).astype(int) + + cm = confusion_matrix(testing_df[target_column], testing_prediction) + total_samples = np.sum(cm) + + precision = precision_score(testing_df[target_column], 
testing_prediction) + recall = recall_score(testing_df[target_column], testing_prediction) + + # Plot confusion matrix + plt.figure(figsize=(10, 8)) + sns.heatmap(cm, annot=True, cmap=cmap, fmt="d", annot_kws={"fontsize": 12}, cbar=False) + for i in range(len(cm)): + for j in range(len(cm)): + plt.text( + j + 0.5, + i + 0.75, + f"{cm[i][j]/total_samples*100:.2f}%", + ha="center", + va="center", + color="black", + fontsize=12, + ) + plt.xlabel("Predicted Label") + plt.ylabel("True Label") + plt.title(f"Confusion Matrix\nPrecision: {precision:.2f}, Recall: {recall:.2f}") + + # Save the plot + file_path = "Confusion_matrix_plot.png" + plt.savefig(file_path) + plt.close() + + return TestSuiteReturnType( + metrics={"_precision_test": precision, "_recall_test": recall}, + properties={"Threshold": threshold}, + tables=[], + attachments=[file_path], + ) + + +def explainability( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + internal_parameters: Dict[str, Any] = {}, +) -> TestSuiteReturnType: + from vectice.models.validation import TestSuiteReturnType + + explainer = shap.Explainer(predictor, training_df.drop(columns=[target_column])) + shap_values = explainer(training_df.drop(columns=[target_column]).head(1000)) + shap.summary_plot( + shap_values[:, :, 0], training_df.drop(columns=[target_column]).head(1000), max_display=10, show=False + ) + summary_plot_path = "SHAP_summary_plot.png" + plt.savefig(summary_plot_path, bbox_inches="tight") + plt.close() + + return TestSuiteReturnType(metrics={}, properties={}, tables=[], attachments=[summary_plot_path]) + + +def feature_importance( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + internal_parameters: Dict[str, Any] = {}, +) -> TestSuiteReturnType: + from vectice.models.validation import TestSuiteReturnType + + explainer = shap.Explainer(predictor, training_df.drop(columns=[target_column])) + shap_values = explainer(training_df.drop(columns=[target_column]).head(1000)) + clustering = shap.utils.hclust( + training_df.drop(columns=[target_column]).head(1000), training_df[target_column].head(1000) + ) + shap.plots.bar(shap_values[:, :, 0], clustering=clustering, max_display=10, show=False) + + feature_importance_path = "feature_importance.png" + plt.savefig(feature_importance_path, bbox_inches="tight") + plt.close() + + return TestSuiteReturnType(metrics={}, properties={}, tables=[], attachments=[feature_importance_path]) + + +def cramers_v_score(x: ndarray[Any, Any], y: ndarray[Any, Any]) -> float: + + min_length = min(len(x), len(y), 4000) + x = x[:min_length] + y = y[:min_length] + confusion_matrix = pd.crosstab(x, y) + chi2 = chi2_contingency(confusion_matrix)[0] + n = confusion_matrix.sum().sum() + phi2 = chi2 / n + r, k = confusion_matrix.shape + phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1)) + rcorr = r - ((r - 1) ** 2) / (n - 1) + kcorr = k - ((k - 1) ** 2) / (n - 1) + return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1))) + + +def ks_score(x: ndarray[Any, Any], y: ndarray[Any, Any]) -> float: + min_length = min(len(x), len(y), 4000) + x = x[:min_length] + y = y[:min_length] + ks_statistic, _ = ks_2samp(x, y) + + return ks_statistic + + +def prediction_drift( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | 
None, + predict_proba_test: ArrayLike | None, + threshold: float, + internal_parameters: Dict[str, Any] = {}, +) -> TestSuiteReturnType: + from vectice.models.validation import TestSuiteReturnType + + X_train = training_df.drop(columns=[target_column]) + X_test = testing_df.drop(columns=[target_column]) + training_prediction_proba = predictor.predict_proba(X_train)[:, 1] + testing_prediction_proba = predictor.predict_proba(X_test)[:, 1] + + if predict_proba_train is not None: + training_prediction_proba = predict_proba_train + + if predict_proba_test is not None: + testing_prediction_proba = predict_proba_test + + train_predictions = np.array(training_prediction_proba) + test_predictions = np.array(testing_prediction_proba) + + light_red = "#FF8A80" # Light Red + darker_blue = "#1565C0" # Darker Blue + sns.set_palette([darker_blue, light_red]) + + _, ax = plt.subplots(figsize=(8, 6)) + + sns.kdeplot(train_predictions, color=light_red, label="Train Predictions", fill=True) + sns.kdeplot(test_predictions, color=darker_blue, label="Test Predictions", fill=True) + + # Plot vertical lines for means using the specified colors + ax.axvline( # pyright: ignore[reportAttributeAccessIssue] + np.mean(train_predictions), # pyright: ignore[reportArgumentType] + color=light_red, + linestyle="--", + label="Train Mean", + ) + ax.axvline( # pyright: ignore[reportAttributeAccessIssue] + np.mean(test_predictions), # pyright: ignore[reportArgumentType] + color=darker_blue, + linestyle="--", + label="Test Mean", + ) + + plt.xlabel("Predictions") + plt.ylabel("Density") + plt.title("Prediction Drift Plot (Kolmogorov-Smirnov drift score)") + plt.legend() + plt.grid(True) + path = "Prediction_drift.png" + + # Calculate and print drift score + drift_score = ks_score(train_predictions, test_predictions) + + # Set text position at the top + text_x = 0.5 + text_y = 0.95 + if drift_score < 0.1: + score_color = "green" + elif 0.1 <= drift_score <= 0.2: + score_color = "orange" + else: + score_color = "red" + + plt.text( + text_x, + text_y, + f"Drift score = {drift_score:.2f}", + ha="center", + va="top", + color=score_color, + transform=ax.transAxes, # pyright: ignore[reportAttributeAccessIssue] + ) + + plt.savefig(path, bbox_inches="tight") + plt.close() + + return TestSuiteReturnType( + metrics={}, properties={"_prediction_drift_score": drift_score}, tables=[], attachments=[path] + ) + + +def label_drift( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + internal_parameters: Dict[str, Any] = {}, +) -> TestSuiteReturnType: + from vectice.models.validation import TestSuiteReturnType + + train_labels = np.array(training_df[target_column]) + test_labels = np.array(testing_df[target_column]) + + light_red = "#FF8A80" # Light Red + darker_blue = "#1565C0" # Darker Blue + sns.set_palette([darker_blue, light_red]) + + _, ax = plt.subplots(figsize=(8, 6)) + + bar_width = 0.35 + index = np.arange(2) + + train_counts = [np.sum(train_labels == 0) / len(train_labels), np.sum(train_labels == 1) / len(train_labels)] + test_counts = [np.sum(test_labels == 0) / len(test_labels), np.sum(test_labels == 1) / len(test_labels)] + + train_bar = ax.bar( # pyright: ignore[reportAttributeAccessIssue] + index, train_counts, bar_width, label="Train Labels" + ) + test_bar = ax.bar( # pyright: ignore[reportAttributeAccessIssue] + index + bar_width, test_counts, bar_width, label="Test Labels" + ) + + ax.set_xlabel("Labels") # 
pyright: ignore[reportAttributeAccessIssue] + ax.set_ylabel("Frequency") # pyright: ignore[reportAttributeAccessIssue] + ax.set_title("Label Drift Plot (Cramer's V drift score)") # pyright: ignore[reportAttributeAccessIssue] + ax.set_xticks(index + bar_width / 2) # pyright: ignore[reportAttributeAccessIssue] + ax.set_xticklabels(["0", "1"]) # pyright: ignore[reportAttributeAccessIssue] + ax.legend() # pyright: ignore[reportAttributeAccessIssue] + + def autolabel(bars: BarContainer): + """Attach a text label above each bar in *bars*, displaying its height.""" + for bar in bars: + height = bar.get_height() + ax.annotate( # pyright: ignore[reportAttributeAccessIssue] + f"{height:.2f}", + xy=(bar.get_x() + bar.get_width() / 2, height), + xytext=(0, 3), + textcoords="offset points", + ha="center", + va="bottom", + ) + + autolabel(train_bar) + autolabel(test_bar) + + drift_score = cramers_v_score(train_labels, test_labels) + if drift_score < 0.1: + score_color = "green" + elif 0.1 <= drift_score <= 0.2: + score_color = "orange" + else: + score_color = "red" + + ax.text( # pyright: ignore[reportAttributeAccessIssue] + 0.5, + 0.95, + f"Drift score = {drift_score:.2f}", + ha="center", + va="top", + color=score_color, + transform=ax.transAxes, # pyright: ignore[reportAttributeAccessIssue] + ) + + plt.tight_layout() + path = "Label_drift.png" + plt.savefig(path, bbox_inches="tight") + plt.close() + + return TestSuiteReturnType( + metrics={}, properties={"_label_drift_score": drift_score}, tables=[], attachments=[path] + ) + + +def plot_correlation_matrix( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + internal_parameters: Dict[str, Any] = {"subset_columns": None, "cmap": "Blues"}, +) -> TestSuiteReturnType: + from vectice.models.validation import TestSuiteReturnType + + subset_columns = internal_parameters.get( + "subset_columns", [target_column] + [col for col in training_df.columns[:10] if col != "TARGET"] + ) + cmap = internal_parameters.get("cmap", "Blues") + + # Select subset of columns + training_df = training_df[subset_columns] + + # Calculate the correlation matrix + corr_matrix = training_df.corr() + + # Plot the correlation matrix + plt.figure(figsize=(10, 8)) + sns.heatmap(corr_matrix, annot=True, cmap=cmap, fmt=".2f", annot_kws={"fontsize": 12}, cbar=True) + plt.title("Correlation Matrix") + + # Save the plot + file_path = "Correlation_matrix_plot.png" + plt.savefig(file_path, bbox_inches="tight") + plt.close() + + return TestSuiteReturnType( + metrics={}, + properties={}, + tables=[], + attachments=[file_path], + ) + + +# custom test which can be used for dataset validation +def test_dataset_split( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + internal_parameters: Dict[str, Any] = {"subset_columns": None, "cmap": "Blues"}, +) -> TestSuiteReturnType: + from vectice import Table + from vectice.models.validation import TestSuiteReturnType + + total_df = len(training_df) + len(testing_df) + + # Create a DataFrame with the results + datasplit_df = pd.DataFrame( + { + "Dataset": ["Train", "Test", "Total"], + "Size": [len(training_df), len(testing_df), total_df], + "Percentage": [ + (len(training_df) / total_df * 100), + (len(testing_df) / total_df * 100), + 100, + ], + } + ) + + table = Table(datasplit_df) + + return 
TestSuiteReturnType(metrics={}, properties={}, tables=[table], attachments=[]) + + +# custom test which can be used for dataset validation +def iqr_and_outliers( + training_df: DataFrame, + testing_df: DataFrame, + target_column: str, + predictor: Any, + predict_proba_train: ArrayLike | None, + predict_proba_test: ArrayLike | None, + internal_parameters: Dict[str, Any] = {"subset_columns": None, "cmap": "Blues"}, +) -> TestSuiteReturnType | None: + from vectice.models.validation import TestSuiteReturnType + + dataset = training_df + + files = [] + # disable plots showing + if internal_parameters.get("subset_columns") is not None: + columns = internal_parameters.get("subset_columns") + else: + columns = dataset.select_dtypes(include=[np.number]).columns[:10] + plt.ioff() + for column in columns: # type: ignore + file_name = f"iqr_and_outliers_{column}.png" + + temp_file_path = file_name + + Q1 = dataset[column].quantile(0.25) + Q3 = dataset[column].quantile(0.75) + IQR = Q3 - Q1 + lower_bound = Q1 - 1.5 * IQR + upper_bound = Q3 + 1.5 * IQR + + plt.figure(figsize=(10, 6)) + plt.hist(dataset[column], bins=20, edgecolor="k", alpha=0.7) + plt.axvline(Q1, color="r", linestyle="--", label=f"Q1 (25th percentile): {Q1:.2f}") + plt.axvline(Q3, color="b", linestyle="--", label=f"Q3 (75th percentile): {Q3:.2f}") + plt.axvline( + dataset[column].median(), + color="g", + linestyle="-", + label=f"Median: {dataset[column].median():.2f}", + ) + plt.fill_betweenx([0, plt.ylim()[1]], Q1, Q3, color="gray", alpha=0.3, label=f"IQR: {IQR:.2f}") + + # Highlight outliers + outliers = dataset[(dataset[column] < lower_bound) | (dataset[column] > upper_bound)][column] + plt.scatter(outliers, [0] * len(outliers), color="red", label="Outliers", zorder=5) + + plt.title(f"Histogram with IQR and Outliers for {column}") + plt.xlabel(column) + plt.ylabel("Frequency") + plt.legend() + plt.savefig(temp_file_path, bbox_inches="tight") + files.append(temp_file_path) + + plt.ion() + return TestSuiteReturnType( + metrics={}, + properties={}, + tables=[], + attachments=files, + ) \ No newline at end of file From 33e485cd7dada0265530892c3d0180a82120569a Mon Sep 17 00:00:00 2001 From: BDaversa Date: Thu, 11 Jul 2024 17:44:24 -0700 Subject: [PATCH 19/20] change path --- {Validation test/test_suites_config => Validation}/README.md | 0 .../master_config_test_suites.py | 0 .../test_modules/correlation_matrix_module.py | 0 .../test_modules/data_privacy_modules.py | 0 .../test_modules/data_quality_modules.py | 0 .../test_modules/default_tests_vectice.py | 0 6 files changed, 0 insertions(+), 0 deletions(-) rename {Validation test/test_suites_config => Validation}/README.md (100%) rename {Validation test/test_suites_config => Validation}/master_config_test_suites.py (100%) rename {Validation test/test_suites_config => Validation}/test_modules/correlation_matrix_module.py (100%) rename {Validation test/test_suites_config => Validation}/test_modules/data_privacy_modules.py (100%) rename {Validation test/test_suites_config => Validation}/test_modules/data_quality_modules.py (100%) rename {Validation test/test_suites_config => Validation}/test_modules/default_tests_vectice.py (100%) diff --git a/Validation test/test_suites_config/README.md b/Validation/README.md similarity index 100% rename from Validation test/test_suites_config/README.md rename to Validation/README.md diff --git a/Validation test/test_suites_config/master_config_test_suites.py b/Validation/master_config_test_suites.py similarity index 100% rename from Validation 
test/test_suites_config/master_config_test_suites.py rename to Validation/master_config_test_suites.py diff --git a/Validation test/test_suites_config/test_modules/correlation_matrix_module.py b/Validation/test_modules/correlation_matrix_module.py similarity index 100% rename from Validation test/test_suites_config/test_modules/correlation_matrix_module.py rename to Validation/test_modules/correlation_matrix_module.py diff --git a/Validation test/test_suites_config/test_modules/data_privacy_modules.py b/Validation/test_modules/data_privacy_modules.py similarity index 100% rename from Validation test/test_suites_config/test_modules/data_privacy_modules.py rename to Validation/test_modules/data_privacy_modules.py diff --git a/Validation test/test_suites_config/test_modules/data_quality_modules.py b/Validation/test_modules/data_quality_modules.py similarity index 100% rename from Validation test/test_suites_config/test_modules/data_quality_modules.py rename to Validation/test_modules/data_quality_modules.py diff --git a/Validation test/test_suites_config/test_modules/default_tests_vectice.py b/Validation/test_modules/default_tests_vectice.py similarity index 100% rename from Validation test/test_suites_config/test_modules/default_tests_vectice.py rename to Validation/test_modules/default_tests_vectice.py From 6868c9bf0b12d6309a3735dfaf36976c51d2c6fc Mon Sep 17 00:00:00 2001 From: BDaversa Date: Sun, 14 Jul 2024 14:37:57 -0700 Subject: [PATCH 20/20] save wrappers --- 24.2/samples/test_suites_config/README.md | 28 - .../master_config_test_suites.py | 65 --- .../test_modules/correlation_matrix_module.py | 58 -- .../test_modules/data_privacy_modules.py | 148 ----- .../test_modules/data_quality_modules.py | 116 ---- .../test_modules/default_tests_vectice.py | 523 ------------------ Validation/vectice_wrappers.py | 73 +++ 7 files changed, 73 insertions(+), 938 deletions(-) delete mode 100644 24.2/samples/test_suites_config/README.md delete mode 100644 24.2/samples/test_suites_config/master_config_test_suites.py delete mode 100644 24.2/samples/test_suites_config/test_modules/correlation_matrix_module.py delete mode 100644 24.2/samples/test_suites_config/test_modules/data_privacy_modules.py delete mode 100644 24.2/samples/test_suites_config/test_modules/data_quality_modules.py delete mode 100644 24.2/samples/test_suites_config/test_modules/default_tests_vectice.py create mode 100644 Validation/vectice_wrappers.py diff --git a/24.2/samples/test_suites_config/README.md b/24.2/samples/test_suites_config/README.md deleted file mode 100644 index 349c78f..0000000 --- a/24.2/samples/test_suites_config/README.md +++ /dev/null @@ -1,28 +0,0 @@ -## List of validation tests provided by Vectice (source code from PiML) -| **Category** | **Test Name** | **Function** | -|------------------------------|----------------------------------|--------------------------------------| -| **Classification Tests** | ROC Curve | `plot_roc_curve` | -| | Confusion Matrix | `conf_matrix` | -| | Explainability | `explainability` | -| | Feature Importance | `feature_importance` | -| | Label Drift | `label_drift` | -| | Prediction Drift | `prediction_drift` | -| | Recall by class | `recall_by_class ` | -| | Precision by class | `precision_by_class ` | -| | **Binary Classification suite** | `plot_roc_curve`, `conf_matrix`, `explainability`, `feature_importance`, `label_drift`, `prediction_drift` | -| | **Multiclass Classification suite** | `plot_roc_curve`, `conf_matrix`, `explainability`, `feature_importance`, `label_drift`, 
`prediction_drift`, `recall_by_class `, `precision_by_class ` | -| **Data Privacy Tests** | Sensitive Data Check | `sensitive_data_check` | -| | PII Check | `pii_check` | -| | Sensitive Data Type Check | `sensitive_data_type_check` | -| **Data Quality Tests** | Dataset Split Validation | `test_dataset_split` | -| | IQR and Outliers | `iqr_and_outliers` | -| | **Dataset Quality suite** | `test_dataset_split`, `iqr_and_outliers` | -| **Regression Tests** | Residuals Plot | `plot_residuals` | -| | R² Score | `r2_score` | -| | Explainability | `explainability` | -| | Feature Importance | `feature_importance` | -| | Target Drift | `target_drift` | -| | Prediction Drift | `prediction_drift` | -| | **Regression suite** | `plot_residuals`, `r2_score`, `explainability`, `feature_importance`, `target_drift`, `prediction_drift` | - - diff --git a/24.2/samples/test_suites_config/master_config_test_suites.py b/24.2/samples/test_suites_config/master_config_test_suites.py deleted file mode 100644 index 24a5005..0000000 --- a/24.2/samples/test_suites_config/master_config_test_suites.py +++ /dev/null @@ -1,65 +0,0 @@ -# import the Vectice provided probability of default validation tests -from vectice.models.test_library.binary_classification_test import ( - plot_roc_curve, - conf_matrix, - explainability, - feature_importance, - label_drift, - prediction_drift, -) - - -# custom data quality validation tests -from test_modules.data_quality_modules import ( - test_dataset_split, - iqr_and_outliers, -) - -# custom data privacy validation tests -from test_modules.data_privacy_modules import ( - sensitive_data_check, - sensitive_data_type_check, - pii_check, -) - -from test_modules.correlation_matrix_module import ( - plot_correlation_matrix -) - - -# The master test suite file is used to map all ADDITIONAL suite of test which can be run. -# The tests can be provided by Vectice or custom functions from your modules. -# Vectice uses this configuration to simply identify and bundle available tests into suite, when you run -# your validations in your notebook. 
- -# Accumulation and mapping of all validation tests to be run for the PD model suite -PD_model_suite= { - "binary_suite": [ - plot_roc_curve, - conf_matrix, - explainability, - feature_importance, - label_drift, - prediction_drift, - ], - "data_quality_ext": [ - test_dataset_split, - iqr_and_outliers, - plot_correlation_matrix, - ], - "corr_matrix_ext": [ - plot_correlation_matrix, - ], -} - -# Map the tests to be used for data privacy validation -Robustness_suite = { - "sensitive_data_check": sensitive_data_check, - "pii_check": pii_check, - "sensitive_data_type_check": sensitive_data_type_check, - "data_privacy_full_suite": [ - sensitive_data_check, - pii_check, - sensitive_data_type_check, - ], -} diff --git a/24.2/samples/test_suites_config/test_modules/correlation_matrix_module.py b/24.2/samples/test_suites_config/test_modules/correlation_matrix_module.py deleted file mode 100644 index 3935f9a..0000000 --- a/24.2/samples/test_suites_config/test_modules/correlation_matrix_module.py +++ /dev/null @@ -1,58 +0,0 @@ -from __future__ import annotations - -import logging -from typing import TYPE_CHECKING, Any, Dict - -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -import seaborn as sns - - -if TYPE_CHECKING: - from matplotlib.container import BarContainer - from numpy import ndarray - from numpy.typing import ArrayLike - from pandas import DataFrame - - from vectice.models.validation import TestSuiteReturnType - -_logger = logging.getLogger(__name__) - -def plot_correlation_matrix( - training_df: DataFrame, - testing_df: DataFrame, - target_column: str, - predictor: Any, - predict_proba_train: ArrayLike | None, - predict_proba_test: ArrayLike | None, - internal_parameters: Dict[str, Any] = {"subset_columns": None, "cmap": "Blues"}, -) -> TestSuiteReturnType: - from vectice.models.validation import TestSuiteReturnType - - subset_columns = internal_parameters.get("subset_columns", [target_column] + [col for col in training_df.columns[:10] if col != "TARGET"]) - cmap = internal_parameters.get("cmap", "Blues") - - # Select subset of columns - training_df = training_df[subset_columns] - - # Calculate the correlation matrix - corr_matrix = training_df.corr() - - # Plot the correlation matrix - plt.figure(figsize=(10, 8)) - sns.heatmap(corr_matrix, annot=True, cmap=cmap, fmt=".2f", annot_kws={"fontsize": 12}, cbar=True) - plt.title("Correlation Matrix") - - # Save the plot - file_path = "Correlation_matrix_plot.png" - plt.savefig(file_path) - plt.close() - - # RETURN IN THE VECTICE EXPECTED FORMART - return TestSuiteReturnType( - metrics={}, - properties={}, - tables=[], - attachments=[file_path], - ) \ No newline at end of file diff --git a/24.2/samples/test_suites_config/test_modules/data_privacy_modules.py b/24.2/samples/test_suites_config/test_modules/data_privacy_modules.py deleted file mode 100644 index 90d851f..0000000 --- a/24.2/samples/test_suites_config/test_modules/data_privacy_modules.py +++ /dev/null @@ -1,148 +0,0 @@ -# Write custom tests which can be used to validate your datasets security -from __future__ import annotations - -from typing import TYPE_CHECKING - -import pandas as pd - -if TYPE_CHECKING: - from numpy.typing import ArrayLike - from pandas import DataFrame - - from vectice.models.validation_dataset import TestSuiteReturnType - - -def sensitive_data_check( - dataset: DataFrame | None = None, - training_df: DataFrame | None = None, - testing_df: DataFrame | None = None, - feature_columns: ArrayLike | list | None = None, - target_column: 
ArrayLike | str | None = None, - sensitive_keywords: list | None = None, -) -> TestSuiteReturnType | None: - from vectice import Table - from vectice.models.validation_dataset import TestSuiteReturnType - - if dataset is None or sensitive_keywords is None: - return None - - # Initialize a dictionary to hold counts of sensitive data - sensitive_counts = {keyword: 0 for keyword in sensitive_keywords} - - # Check each cell in the DataFrame for sensitive keywords - for keyword in sensitive_keywords: - sensitive_counts[keyword] = dataset.apply( - lambda x: x.astype(str).str.contains(keyword, case=False).sum() - ).sum() - - # Create a DataFrame with the results - sensitive_counts_df = pd.DataFrame( - { - "Sensitive Keyword": list(sensitive_counts.keys()), - "Count": list(sensitive_counts.values()), - } - ) - - table = Table(sensitive_counts_df) - - return TestSuiteReturnType( - properties={}, - tables=[table], - attachments=[], - ) - - -def pii_check( - dataset: DataFrame | None = None, - training_df: DataFrame | None = None, - testing_df: DataFrame | None = None, - feature_columns: ArrayLike | list | None = None, - target_column: ArrayLike | str | None = None, -) -> TestSuiteReturnType | None: - from vectice import Table - from vectice.models.validation_dataset import TestSuiteReturnType - - if dataset is None: - return None - - # Define common PII patterns - pii_patterns = { - "name": r"\b[A-Z][a-z]*\b", - "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b", - "phone": r"\b(\+?[\d]{1,3}[-.\s]?[\d]{1,4}[-.\s]?[\d]{1,4}[-.\s]?[\d]{1,9})\b", - } - - # Initialize a dictionary to hold counts of PII matches - pii_counts = {key: 0 for key in pii_patterns.keys()} - - # Check each column in the DataFrame for PII patterns - for column in dataset.columns: - for key, pattern in pii_patterns.items(): - pii_counts[key] += ( - dataset[column] - .astype(str) - .str.contains(pattern, case=False, regex=True) - .sum() - ) - - # Create a DataFrame with the results - pii_counts_df = pd.DataFrame( - {"PII Type": list(pii_counts.keys()), "Count": list(pii_counts.values())} - ) - - table = Table(pii_counts_df) - - return TestSuiteReturnType( - properties={}, - tables=[table], - attachments=[], - ) - - -def sensitive_data_type_check( - dataset: DataFrame | None = None, - training_df: DataFrame | None = None, - testing_df: DataFrame | None = None, - feature_columns: ArrayLike | list | None = None, - target_column: ArrayLike | str | None = None, -) -> TestSuiteReturnType | None: - from vectice import Table - from vectice.models.validation_dataset import TestSuiteReturnType - - if dataset is None: - return None - - # Define patterns for sensitive data types - sensitive_data_patterns = { - "credit_card": r"\b(?:\d[ -]*?){13,16}\b", - "ssn": r"\b\d{3}-\d{2}-\d{4}\b", - } - - # Initialize a dictionary to hold counts of sensitive data type matches - sensitive_data_counts = {key: 0 for key in sensitive_data_patterns.keys()} - - # Check each column in the DataFrame for sensitive data type patterns - for column in dataset.columns: - for key, pattern in sensitive_data_patterns.items(): - sensitive_data_counts[key] += ( - dataset[column] - .astype(str) - .str.contains(pattern, case=False, regex=True) - .sum() - ) - - # Create a DataFrame with the results - sensitive_data_counts_df = pd.DataFrame( - { - "Sensitive Data Type": list(sensitive_data_counts.keys()), - "Count": list(sensitive_data_counts.values()), - } - ) - - table = Table(sensitive_data_counts_df) - - return TestSuiteReturnType( - properties={}, - 
tables=[table], - attachments=[], - ) diff --git a/24.2/samples/test_suites_config/test_modules/data_quality_modules.py b/24.2/samples/test_suites_config/test_modules/data_quality_modules.py deleted file mode 100644 index 05b3ae5..0000000 --- a/24.2/samples/test_suites_config/test_modules/data_quality_modules.py +++ /dev/null @@ -1,116 +0,0 @@ -# Write custom tests which can be used to validate your datasets quality -from __future__ import annotations - -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -from pandas import DataFrame -from vectice.models.validation_dataset import TestSuiteReturnType - - -# custom test which can be used for dataset validation -def test_dataset_split( - dataset: DataFrame | None, - training_df: DataFrame, - testing_df: DataFrame, - target_column: str, - feature_columns: list | None = None, - threshold: float | None = None, -) -> TestSuiteReturnType: - from vectice import Table - from vectice.models.validation_dataset import TestSuiteReturnType - - if dataset is None: - return None - - total_df = len(training_df) + len(testing_df) - - # Create a DataFrame with the results - datasplit_df = pd.DataFrame( - { - "Dataset": ["Train", "Test", "Total"], - "Size": [len(training_df), len(testing_df), total_df], - "Percentage": [ - (len(training_df) / total_df * 100), - (len(testing_df) / total_df * 100), - 100, - ], - } - ) - - table = Table(datasplit_df) - - return TestSuiteReturnType( - properties={}, - tables=[table], - attachments=[], - ) - - -# custom test which can be used for dataset validation -def iqr_and_outliers( - dataset: DataFrame | None = None, - training_df: DataFrame | None = None, - testing_df: DataFrame | None = None, - feature_columns: list | None = None, - target_column: str | None = None, - threshold: float | None = None, -) -> TestSuiteReturnType | None: - from vectice.models.validation_dataset import TestSuiteReturnType - - if dataset is None: - return None - - files = [] - # disable plots showing - plt.ioff() - for column in dataset.select_dtypes(include=[np.number]).columns: - file_name = f"iqr_and_outliers_{column}.png" - - temp_file_path = file_name - - Q1 = dataset[column].quantile(0.25) - Q3 = dataset[column].quantile(0.75) - IQR = Q3 - Q1 - lower_bound = Q1 - 1.5 * IQR - upper_bound = Q3 + 1.5 * IQR - - plt.figure(figsize=(10, 6)) - plt.hist(dataset[column], bins=20, edgecolor="k", alpha=0.7) - plt.axvline( - Q1, color="r", linestyle="--", label=f"Q1 (25th percentile): {Q1:.2f}" - ) - plt.axvline( - Q3, color="b", linestyle="--", label=f"Q3 (75th percentile): {Q3:.2f}" - ) - plt.axvline( - dataset[column].median(), - color="g", - linestyle="-", - label=f"Median: {dataset[column].median():.2f}", - ) - plt.fill_betweenx( - [0, plt.ylim()[1]], Q1, Q3, color="gray", alpha=0.3, label=f"IQR: {IQR:.2f}" - ) - - # Highlight outliers - outliers = dataset[ - (dataset[column] < lower_bound) | (dataset[column] > upper_bound) - ][column] - plt.scatter( - outliers, [0] * len(outliers), color="red", label="Outliers", zorder=5 - ) - - plt.title(f"Histogram with IQR and Outliers for {column}") - plt.xlabel(column) - plt.ylabel("Frequency") - plt.legend() - plt.savefig(temp_file_path, bbox_inches="tight") - files.append(temp_file_path) - - plt.ion() - return TestSuiteReturnType( - properties={}, - tables=[], - attachments=files, - ) diff --git a/24.2/samples/test_suites_config/test_modules/default_tests_vectice.py b/24.2/samples/test_suites_config/test_modules/default_tests_vectice.py deleted file mode 100644 index 5ec37db..0000000 
--- a/24.2/samples/test_suites_config/test_modules/default_tests_vectice.py +++ /dev/null @@ -1,523 +0,0 @@ -from __future__ import annotations - -import logging -from typing import TYPE_CHECKING, Any, Dict - -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -import seaborn as sns -import shap -from scipy.stats import chi2_contingency, ks_2samp -from sklearn.metrics import auc, confusion_matrix, precision_score, recall_score, roc_curve - -if TYPE_CHECKING: - from matplotlib.container import BarContainer - from numpy import ndarray - from numpy.typing import ArrayLike - from pandas import DataFrame - - from vectice.models.validation import TestSuiteReturnType - -_logger = logging.getLogger(__name__) - - -def plot_roc_curve( - training_df: DataFrame, - testing_df: DataFrame, - target_column: str, - predictor: Any, - predict_proba_train: ArrayLike | None, - predict_proba_test: ArrayLike | None, - internal_parameters: Dict[str, Any] = {"train_color": "green", "test_color": "blue", "threshold": 0.5}, -) -> TestSuiteReturnType | None: - from vectice.models.validation import TestSuiteReturnType - - X_train = training_df.drop(columns=[target_column]) - X_test = testing_df.drop(columns=[target_column]) - training_prediction_proba = predictor.predict_proba(X_train)[:, 1] - testing_prediction_proba = predictor.predict_proba(X_test)[:, 1] - - if predict_proba_train is not None: - training_prediction_proba = predict_proba_train - - if predict_proba_test is not None: - testing_prediction_proba = predict_proba_test - - fpr_train, tpr_train, _ = roc_curve(training_df[target_column], training_prediction_proba) - roc_auc_train = auc(fpr_train, tpr_train) - - fpr_test, tpr_test, _ = roc_curve(testing_df[target_column], testing_prediction_proba) - roc_auc_test = auc(fpr_test, tpr_test) - - file_path = "ROC_CURVE.png" - - plt.figure(figsize=(8, 6)) - plt.plot( - fpr_train, - tpr_train, - color=internal_parameters["train_color"], - linestyle="--", - label=f"Train ROC curve (AUC = {roc_auc_train:.2f})", - ) - plt.plot( - fpr_test, - tpr_test, - color=internal_parameters["test_color"], - label=f"Test ROC curve (AUC = {roc_auc_test:.2f})", - ) - plt.plot([0, 1], [0, 1], color="red", linestyle="--") - plt.xlabel("False Positive Rate") - plt.ylabel("True Positive Rate") - plt.title("Receiver Operating Characteristic (ROC) Curve") - plt.legend() - plt.grid(True) - plt.savefig(file_path) - plt.close() - - return TestSuiteReturnType( - metrics={"_ROC_auc_train": roc_auc_train, "_ROC_auc_test": roc_auc_test}, - properties={}, - tables=[], - attachments=[file_path], - ) - - -def conf_matrix( - training_df: DataFrame, - testing_df: DataFrame, - target_column: str, - predictor: Any, - predict_proba_train: ArrayLike | None, - predict_proba_test: ArrayLike | None, - internal_parameters: Dict[str, Any] = {"threshold": 0.5, "cmap": "Blues"}, -) -> TestSuiteReturnType: - from vectice.models.validation import TestSuiteReturnType - - threshold = internal_parameters["threshold"] - cmap = internal_parameters.get("cmap", "Blues") - - X_test = testing_df.drop(columns=[target_column]) - testing_prediction_proba = predictor.predict_proba(X_test)[:, 1] - - if predict_proba_test is not None: - testing_prediction_proba = predict_proba_test - - testing_prediction = (testing_prediction_proba >= threshold).astype(int) - - cm = confusion_matrix(testing_df[target_column], testing_prediction) - total_samples = np.sum(cm) - - precision = precision_score(testing_df[target_column], testing_prediction) - recall = 
recall_score(testing_df[target_column], testing_prediction) - - # Plot confusion matrix - plt.figure(figsize=(10, 8)) - sns.heatmap(cm, annot=True, cmap=cmap, fmt="d", annot_kws={"fontsize": 12}, cbar=False) - for i in range(len(cm)): - for j in range(len(cm)): - plt.text( - j + 0.5, - i + 0.75, - f"{cm[i][j]/total_samples*100:.2f}%", - ha="center", - va="center", - color="black", - fontsize=12, - ) - plt.xlabel("Predicted Label") - plt.ylabel("True Label") - plt.title(f"Confusion Matrix\nPrecision: {precision:.2f}, Recall: {recall:.2f}") - - # Save the plot - file_path = "Confusion_matrix_plot.png" - plt.savefig(file_path) - plt.close() - - return TestSuiteReturnType( - metrics={"_precision_test": precision, "_recall_test": recall}, - properties={"Threshold": threshold}, - tables=[], - attachments=[file_path], - ) - - -def explainability( - training_df: DataFrame, - testing_df: DataFrame, - target_column: str, - predictor: Any, - predict_proba_train: ArrayLike | None, - predict_proba_test: ArrayLike | None, - internal_parameters: Dict[str, Any] = {}, -) -> TestSuiteReturnType: - from vectice.models.validation import TestSuiteReturnType - - explainer = shap.Explainer(predictor, training_df.drop(columns=[target_column])) - shap_values = explainer(training_df.drop(columns=[target_column]).head(1000)) - shap.summary_plot( - shap_values[:, :, 0], training_df.drop(columns=[target_column]).head(1000), max_display=10, show=False - ) - summary_plot_path = "SHAP_summary_plot.png" - plt.savefig(summary_plot_path, bbox_inches="tight") - plt.close() - - return TestSuiteReturnType(metrics={}, properties={}, tables=[], attachments=[summary_plot_path]) - - -def feature_importance( - training_df: DataFrame, - testing_df: DataFrame, - target_column: str, - predictor: Any, - predict_proba_train: ArrayLike | None, - predict_proba_test: ArrayLike | None, - internal_parameters: Dict[str, Any] = {}, -) -> TestSuiteReturnType: - from vectice.models.validation import TestSuiteReturnType - - explainer = shap.Explainer(predictor, training_df.drop(columns=[target_column])) - shap_values = explainer(training_df.drop(columns=[target_column]).head(1000)) - clustering = shap.utils.hclust( - training_df.drop(columns=[target_column]).head(1000), training_df[target_column].head(1000) - ) - shap.plots.bar(shap_values[:, :, 0], clustering=clustering, max_display=10, show=False) - - feature_importance_path = "feature_importance.png" - plt.savefig(feature_importance_path, bbox_inches="tight") - plt.close() - - return TestSuiteReturnType(metrics={}, properties={}, tables=[], attachments=[feature_importance_path]) - - -def cramers_v_score(x: ndarray[Any, Any], y: ndarray[Any, Any]) -> float: - - min_length = min(len(x), len(y), 4000) - x = x[:min_length] - y = y[:min_length] - confusion_matrix = pd.crosstab(x, y) - chi2 = chi2_contingency(confusion_matrix)[0] - n = confusion_matrix.sum().sum() - phi2 = chi2 / n - r, k = confusion_matrix.shape - phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1)) - rcorr = r - ((r - 1) ** 2) / (n - 1) - kcorr = k - ((k - 1) ** 2) / (n - 1) - return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1))) - - -def ks_score(x: ndarray[Any, Any], y: ndarray[Any, Any]) -> float: - min_length = min(len(x), len(y), 4000) - x = x[:min_length] - y = y[:min_length] - ks_statistic, _ = ks_2samp(x, y) - - return ks_statistic - - -def prediction_drift( - training_df: DataFrame, - testing_df: DataFrame, - target_column: str, - predictor: Any, - predict_proba_train: ArrayLike | None, - predict_proba_test: 
ArrayLike | None, - threshold: float, - internal_parameters: Dict[str, Any] = {}, -) -> TestSuiteReturnType: - from vectice.models.validation import TestSuiteReturnType - - X_train = training_df.drop(columns=[target_column]) - X_test = testing_df.drop(columns=[target_column]) - training_prediction_proba = predictor.predict_proba(X_train)[:, 1] - testing_prediction_proba = predictor.predict_proba(X_test)[:, 1] - - if predict_proba_train is not None: - training_prediction_proba = predict_proba_train - - if predict_proba_test is not None: - testing_prediction_proba = predict_proba_test - - train_predictions = np.array(training_prediction_proba) - test_predictions = np.array(testing_prediction_proba) - - light_red = "#FF8A80" # Light Red - darker_blue = "#1565C0" # Darker Blue - sns.set_palette([darker_blue, light_red]) - - _, ax = plt.subplots(figsize=(8, 6)) - - sns.kdeplot(train_predictions, color=light_red, label="Train Predictions", fill=True) - sns.kdeplot(test_predictions, color=darker_blue, label="Test Predictions", fill=True) - - # Plot vertical lines for means using the specified colors - ax.axvline( # pyright: ignore[reportAttributeAccessIssue] - np.mean(train_predictions), # pyright: ignore[reportArgumentType] - color=light_red, - linestyle="--", - label="Train Mean", - ) - ax.axvline( # pyright: ignore[reportAttributeAccessIssue] - np.mean(test_predictions), # pyright: ignore[reportArgumentType] - color=darker_blue, - linestyle="--", - label="Test Mean", - ) - - plt.xlabel("Predictions") - plt.ylabel("Density") - plt.title("Prediction Drift Plot (Kolmogorov-Smirnov drift score)") - plt.legend() - plt.grid(True) - path = "Prediction_drift.png" - - # Calculate and print drift score - drift_score = ks_score(train_predictions, test_predictions) - - # Set text position at the top - text_x = 0.5 - text_y = 0.95 - if drift_score < 0.1: - score_color = "green" - elif 0.1 <= drift_score <= 0.2: - score_color = "orange" - else: - score_color = "red" - - plt.text( - text_x, - text_y, - f"Drift score = {drift_score:.2f}", - ha="center", - va="top", - color=score_color, - transform=ax.transAxes, # pyright: ignore[reportAttributeAccessIssue] - ) - - plt.savefig(path, bbox_inches="tight") - plt.close() - - return TestSuiteReturnType( - metrics={}, properties={"_prediction_drift_score": drift_score}, tables=[], attachments=[path] - ) - - -def label_drift( - training_df: DataFrame, - testing_df: DataFrame, - target_column: str, - predictor: Any, - predict_proba_train: ArrayLike | None, - predict_proba_test: ArrayLike | None, - internal_parameters: Dict[str, Any] = {}, -) -> TestSuiteReturnType: - from vectice.models.validation import TestSuiteReturnType - - train_labels = np.array(training_df[target_column]) - test_labels = np.array(testing_df[target_column]) - - light_red = "#FF8A80" # Light Red - darker_blue = "#1565C0" # Darker Blue - sns.set_palette([darker_blue, light_red]) - - _, ax = plt.subplots(figsize=(8, 6)) - - bar_width = 0.35 - index = np.arange(2) - - train_counts = [np.sum(train_labels == 0) / len(train_labels), np.sum(train_labels == 1) / len(train_labels)] - test_counts = [np.sum(test_labels == 0) / len(test_labels), np.sum(test_labels == 1) / len(test_labels)] - - train_bar = ax.bar( # pyright: ignore[reportAttributeAccessIssue] - index, train_counts, bar_width, label="Train Labels" - ) - test_bar = ax.bar( # pyright: ignore[reportAttributeAccessIssue] - index + bar_width, test_counts, bar_width, label="Test Labels" - ) - - ax.set_xlabel("Labels") # pyright: 
ignore[reportAttributeAccessIssue] - ax.set_ylabel("Frequency") # pyright: ignore[reportAttributeAccessIssue] - ax.set_title("Label Drift Plot (Cramer's V drift score)") # pyright: ignore[reportAttributeAccessIssue] - ax.set_xticks(index + bar_width / 2) # pyright: ignore[reportAttributeAccessIssue] - ax.set_xticklabels(["0", "1"]) # pyright: ignore[reportAttributeAccessIssue] - ax.legend() # pyright: ignore[reportAttributeAccessIssue] - - def autolabel(bars: BarContainer): - """Attach a text label above each bar in *bars*, displaying its height.""" - for bar in bars: - height = bar.get_height() - ax.annotate( # pyright: ignore[reportAttributeAccessIssue] - f"{height:.2f}", - xy=(bar.get_x() + bar.get_width() / 2, height), - xytext=(0, 3), - textcoords="offset points", - ha="center", - va="bottom", - ) - - autolabel(train_bar) - autolabel(test_bar) - - drift_score = cramers_v_score(train_labels, test_labels) - if drift_score < 0.1: - score_color = "green" - elif 0.1 <= drift_score <= 0.2: - score_color = "orange" - else: - score_color = "red" - - ax.text( # pyright: ignore[reportAttributeAccessIssue] - 0.5, - 0.95, - f"Drift score = {drift_score:.2f}", - ha="center", - va="top", - color=score_color, - transform=ax.transAxes, # pyright: ignore[reportAttributeAccessIssue] - ) - - plt.tight_layout() - path = "Label_drift.png" - plt.savefig(path, bbox_inches="tight") - plt.close() - - return TestSuiteReturnType( - metrics={}, properties={"_label_drift_score": drift_score}, tables=[], attachments=[path] - ) - - -def plot_correlation_matrix( - training_df: DataFrame, - testing_df: DataFrame, - target_column: str, - predictor: Any, - predict_proba_train: ArrayLike | None, - predict_proba_test: ArrayLike | None, - internal_parameters: Dict[str, Any] = {"subset_columns": None, "cmap": "Blues"}, -) -> TestSuiteReturnType: - from vectice.models.validation import TestSuiteReturnType - - subset_columns = internal_parameters.get( - "subset_columns", [target_column] + [col for col in training_df.columns[:10] if col != "TARGET"] - ) - cmap = internal_parameters.get("cmap", "Blues") - - # Select subset of columns - training_df = training_df[subset_columns] - - # Calculate the correlation matrix - corr_matrix = training_df.corr() - - # Plot the correlation matrix - plt.figure(figsize=(10, 8)) - sns.heatmap(corr_matrix, annot=True, cmap=cmap, fmt=".2f", annot_kws={"fontsize": 12}, cbar=True) - plt.title("Correlation Matrix") - - # Save the plot - file_path = "Correlation_matrix_plot.png" - plt.savefig(file_path, bbox_inches="tight") - plt.close() - - return TestSuiteReturnType( - metrics={}, - properties={}, - tables=[], - attachments=[file_path], - ) - - -# custom test which can be used for dataset validation -def test_dataset_split( - training_df: DataFrame, - testing_df: DataFrame, - target_column: str, - predictor: Any, - predict_proba_train: ArrayLike | None, - predict_proba_test: ArrayLike | None, - internal_parameters: Dict[str, Any] = {"subset_columns": None, "cmap": "Blues"}, -) -> TestSuiteReturnType: - from vectice import Table - from vectice.models.validation import TestSuiteReturnType - - total_df = len(training_df) + len(testing_df) - - # Create a DataFrame with the results - datasplit_df = pd.DataFrame( - { - "Dataset": ["Train", "Test", "Total"], - "Size": [len(training_df), len(testing_df), total_df], - "Percentage": [ - (len(training_df) / total_df * 100), - (len(testing_df) / total_df * 100), - 100, - ], - } - ) - - table = Table(datasplit_df) - - return TestSuiteReturnType(metrics={}, 
properties={}, tables=[table], attachments=[]) - - -# custom test which can be used for dataset validation -def iqr_and_outliers( - training_df: DataFrame, - testing_df: DataFrame, - target_column: str, - predictor: Any, - predict_proba_train: ArrayLike | None, - predict_proba_test: ArrayLike | None, - internal_parameters: Dict[str, Any] = {"subset_columns": None, "cmap": "Blues"}, -) -> TestSuiteReturnType | None: - from vectice.models.validation import TestSuiteReturnType - - dataset = training_df - - files = [] - # disable plots showing - if internal_parameters.get("subset_columns") is not None: - columns = internal_parameters.get("subset_columns") - else: - columns = dataset.select_dtypes(include=[np.number]).columns[:10] - plt.ioff() - for column in columns: # type: ignore - file_name = f"iqr_and_outliers_{column}.png" - - temp_file_path = file_name - - Q1 = dataset[column].quantile(0.25) - Q3 = dataset[column].quantile(0.75) - IQR = Q3 - Q1 - lower_bound = Q1 - 1.5 * IQR - upper_bound = Q3 + 1.5 * IQR - - plt.figure(figsize=(10, 6)) - plt.hist(dataset[column], bins=20, edgecolor="k", alpha=0.7) - plt.axvline(Q1, color="r", linestyle="--", label=f"Q1 (25th percentile): {Q1:.2f}") - plt.axvline(Q3, color="b", linestyle="--", label=f"Q3 (75th percentile): {Q3:.2f}") - plt.axvline( - dataset[column].median(), - color="g", - linestyle="-", - label=f"Median: {dataset[column].median():.2f}", - ) - plt.fill_betweenx([0, plt.ylim()[1]], Q1, Q3, color="gray", alpha=0.3, label=f"IQR: {IQR:.2f}") - - # Highlight outliers - outliers = dataset[(dataset[column] < lower_bound) | (dataset[column] > upper_bound)][column] - plt.scatter(outliers, [0] * len(outliers), color="red", label="Outliers", zorder=5) - - plt.title(f"Histogram with IQR and Outliers for {column}") - plt.xlabel(column) - plt.ylabel("Frequency") - plt.legend() - plt.savefig(temp_file_path, bbox_inches="tight") - files.append(temp_file_path) - - plt.ion() - return TestSuiteReturnType( - metrics={}, - properties={}, - tables=[], - attachments=files, - ) \ No newline at end of file diff --git a/Validation/vectice_wrappers.py b/Validation/vectice_wrappers.py new file mode 100644 index 0000000..503fbc7 --- /dev/null +++ b/Validation/vectice_wrappers.py @@ -0,0 +1,73 @@ +from __future__ import annotations + +import inspect + +from typing import Any, Dict +from vectice.models.validation import TestSuiteReturnType + + +## You just pass your function as an argument +def Vectice_wrapper_function( + module: callable, + internal_functions_param: Dict[str, Any], +) -> TestSuiteReturnType: + + # Inspect the signature of the internal function + signature = inspect.signature(module) + + # Validate that all required parameters are provided + for param_name, param in signature.parameters.items(): + if param.default == inspect.Parameter.empty and param_name not in internal_functions_param: + raise ValueError(f"Missing required parameter: {param_name}") + + # Filter out any extra parameters not in the signature + filtered_params = {param_name: internal_functions_param[param_name] for param_name in signature.parameters if param_name in internal_functions_param} + + # Run the provided callable with filtered parameters + result = module(**filtered_params) + + # Helper function to extract paths + def extract_paths(obj): + paths = [] + if isinstance(obj, dict): + for key, value in obj.items(): + paths.extend(extract_paths(value)) + elif isinstance(obj, list): + for item in obj: + paths.extend(extract_paths(item)) + elif isinstance(obj, str): + 
paths.append(obj)
+        elif hasattr(obj, 'attachments'):
+            paths.extend(extract_paths(obj.attachments))
+        return paths
+
+    # Extract attachment file paths from whatever the wrapped function returned
+    extracted_paths = extract_paths(result)
+
+    # Collect them into the keyword arguments TestSuiteReturnType expects
+    output_files = {
+        "metrics": {},
+        "properties": {},
+        "tables": [],
+        "attachments": extracted_paths,
+    }
+
+    # Return in the expected format
+    return TestSuiteReturnType(**output_files)
+
+
+def Vectice_wrapper(
+    output_files: Dict[str, Any] = {"paths": None, "dataframes": None, "metrics": None, "properties": None},
+) -> TestSuiteReturnType:
+
+    ####
+    # Paste your validation code here and fill output_files with the produced
+    # plot paths, tables, metrics and properties before returning.
+    ####
+
+    # RETURN IN THE VECTICE EXPECTED FORMAT
+    return TestSuiteReturnType(
+        metrics=output_files["metrics"],
+        properties=output_files["properties"],
+        tables=output_files["dataframes"],
+        attachments=output_files["paths"],
+    )
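
Note: a minimal usage sketch for the wrapper added in Validation/vectice_wrappers.py (not part of the patches above). The helper my_distribution_plot, the CSV path and the column name are hypothetical, and the import assumes the Validation folder is on the Python path.

import matplotlib.pyplot as plt
import pandas as pd

from vectice_wrappers import Vectice_wrapper_function


def my_distribution_plot(df: pd.DataFrame, column: str) -> str:
    # Hypothetical helper: draws a histogram for one column and returns the saved file path.
    path = f"{column}_distribution.png"
    plt.figure(figsize=(8, 6))
    plt.hist(df[column].dropna(), bins=20, edgecolor="k", alpha=0.7)
    plt.title(f"Distribution of {column}")
    plt.savefig(path, bbox_inches="tight")
    plt.close()
    return path


# Vectice_wrapper_function inspects the helper's signature, forwards only the
# matching keyword arguments, and returns a TestSuiteReturnType whose
# attachments list contains the file path returned by the helper.
result = Vectice_wrapper_function(
    module=my_distribution_plot,
    internal_functions_param={"df": pd.read_csv("train.csv"), "column": "AMT_INCOME_TOTAL"},
)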