gmgeorg
diff --git a/‎notebooks/pypsps_binary_survival_example.ipynb
Lines changed: 1868 additions & 0 deletions b/‎notebooks/pypsps_binary_survival_example.ipynb
Lines changed: 1868 additions & 0 deletions
diff --git a/‎poetry.lock
Lines changed: 190 additions & 1 deletion b/‎poetry.lock
Lines changed: 190 additions & 1 deletion
diff --git a/‎pyproject.toml
Lines changed: 1 addition & 0 deletions b/‎pyproject.toml
Lines changed: 1 addition & 0 deletions
diff --git a/‎pypsps/datasets/base.py
Lines changed: 7 additions & 0 deletions b/‎pypsps/datasets/base.py
Lines changed: 7 additions & 0 deletions
diff --git a/‎pypsps/datasets/binary_survival.py
Lines changed: 111 additions & 0 deletions b/‎pypsps/datasets/binary_survival.py
Lines changed: 111 additions & 0 deletions
diff --git a/‎pypsps/keras/callbacks.py
Lines changed: 1 addition & 1 deletion b/‎pypsps/keras/callbacks.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎pypsps/keras/metrics.py
Lines changed: 95 additions & 11 deletions b/‎pypsps/keras/metrics.py
Lines changed: 95 additions & 11 deletions
@@ -27,6 +27,7 @@ seaborn = "^0.13.2"
 scikit-learn = "^1.6.1"
 optuna = "^4.2.1"
 pydot = "^3.0.4"
+scikit-survival = "^0.24.0"
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 
@@ -24,6 +24,7 @@ def __init__(
         true_ate: Optional[float] = None,
         true_ute: Optional[pd.DataFrame] = None,
         true_propensity_score: Optional[pd.DataFrame] = None,
+        true_outcomes: Optional[pd.DataFrame] = None,
     ):
         """Initializes the class."""
         if isinstance(treatments, pd.Series):
@@ -40,12 +41,18 @@ def __init__(
         self.true_ate = true_ate
         self.true_ute = true_ute
         self.true_propensity_score = true_propensity_score
+        self.true_outcomes = true_outcomes
 
     def to_data_frame(self) -> pd.DataFrame:
         """Returns all data as a concatenated DataFrame.
+
+        Outcomes, treatments and features are always included, in that order.
+        Latent features, true outcomes and the true propensity score are appended
+        (in that order) only when they are not None.
+        """
         list_dfs = [self.outcomes, self.treatments, self.features]
         if self.latent_features is not None:
             list_dfs.append(self.latent_features)
+        if self.true_outcomes is not None:
+            list_dfs.append(self.true_outcomes)
+        if self.true_propensity_score is not None:
+            list_dfs.append(self.true_propensity_score)
+
+        # axis=1: pieces are placed side by side as columns.
         return pd.concat(list_dfs, axis=1)
 
     def to_keras_inputs_outputs(
 
@@ -0,0 +1,111 @@
+"""Toy example simulating time-to-event (survival) data under a cancer treatment."""
+
+import math
+
+import numpy as np
+import pandas as pd
+from scipy.special import expit  # logistic sigmoid
+
+from . import base
+
+_FEAT_COLS = ["gender", "age", "comorbidity", "cancer_severity"]
+
+
+def _simple_custom_uuid(val):
+    """Builds a short identifier string from the hash of ``val``.
+
+    The absolute value of the built-in ``hash(val)`` is written in base 36,
+    using the digits ``0-9`` followed by the lowercase letters ``a-z``.
+
+    Args:
+      val: any hashable value.
+
+    Returns:
+      The base-36 representation of ``abs(hash(val))``; ``"0"`` when the hash is 0.
+    """
+    digits = "0123456789abcdefghijklmnopqrstuvwxyz"
+    # abs() keeps the encoding sign-free; a hash h and -h map to the same string.
+    remaining = abs(hash(val))
+    if not remaining:
+        return digits[0]
+    encoded = ""
+    while remaining:
+        remaining, digit = divmod(remaining, 36)
+        # Remainders come out least-significant first, so prepend each digit.
+        encoded = digits[digit] + encoded
+    return encoded
+
+
+class CancerSurvivalSimulator(base.BaseSimulator):
+    """Cancer survival simulation.
+
+    Simulates patients with four features (gender, age, comorbidity, cancer
+    severity), a binary chemotherapy treatment whose probability depends on
+    severity and age, and an exponentially distributed recovery time that is
+    censored at a fixed follow-up horizon.
+    """
+
+    def __init__(self, **kwargs):
+        """Initializes the simulator; all keyword arguments go to the base class."""
+        super().__init__(**kwargs)
+        # All random draws in sample() come from this generator.
+        # NOTE(review): `self._seed` is presumably set by base.BaseSimulator -- confirm.
+        self._rng = np.random.RandomState(self._seed)
+
+    def sample(self, n_samples: int):
+        """Samples example dataset.
+
+        Args:
+          n_samples: number of patients (rows) to simulate.
+
+        Returns:
+          A `base.CausalDataset` with `chemotherapy` as treatment, the columns
+          `event_time` and `event_indicator` as outcomes, and `_FEAT_COLS` as
+          features. The uncensored recovery times and the treatment probabilities
+          are passed as `true_outcomes` and `true_propensity_score`.
+        """
+
+        # 1. Generate features
+        # Gender: "male" / "female", drawn with equal probability.
+        genders = self._rng.choice(["male", "female"], size=n_samples)
+
+        # Age: uniformly distributed from 30 to 80.
+        ages = self._rng.uniform(30, 80, size=n_samples)
+
+        # Comorbidity: categorical; probabilities: Low (50%), Medium (30%), High (20%)
+        comorbidity = self._rng.choice(["low", "medium", "high"], size=n_samples, p=[0.5, 0.3, 0.2])
+
+        # Cancer severity: uniformly from 1 to 10.
+        cancer_severity = self._rng.uniform(1, 10, size=n_samples)
+
+        # 2. Treatment assignment (chemotherapy)
+        # Logistic model: more likely if cancer_severity is high and age is low.
+        # Linear predictor: lp = 1.0 + 0.5 * cancer_severity - 0.05 * age.
+        lp = 1.0 + 0.5 * cancer_severity - 0.05 * ages
+        p_chemo = expit(lp)
+        chemo = self._rng.binomial(1, p_chemo, size=n_samples)
+
+        # 3. Simulate true recovery time from exponential distribution.
+        # For untreated: median = 365 days => scale = 365/ln2
+        # For treated: median = 100 days => scale = 100/ln2
+        # The exponential sampler's "scale" parameter = 1/lambda = mean.
+        scale_untreated = 365 / math.log(2)  # ~526.6 days
+        scale_treated = (100.0) / math.log(2)  # ~144.3 days
+
+        # For each patient, choose scale based on treatment.
+        scales = np.where(chemo == 1, scale_treated, scale_untreated)
+        # Simulate recovery time from exponential distribution.
+        true_recovery_time = self._rng.exponential(scale=scales)
+
+        # 4. Impose study follow-up: end at 540 days.
+        # Observed time is min(true_recovery_time, 540)
+        observed_time = np.minimum(true_recovery_time, 540)
+        # Natural event indicator: 1 if true_recovery_time <= 540, else 0.
+        event_indicator = (true_recovery_time <= 540).astype(int)
+
+        # 5. Assemble DataFrame
+        df = pd.DataFrame(
+            {
+                "gender": genders,
+                "age": ages,
+                "comorbidity": comorbidity,
+                "cancer_severity": cancer_severity,
+                "chemotherapy": chemo,
+                "true_recovery_time": true_recovery_time,
+                "event_time": observed_time,
+                "event_indicator": event_indicator,
+                "prob_chemotherapy": p_chemo,
+            }
+        )
+        # Order rows from highest to lowest treatment probability.
+        df = df.sort_values("prob_chemotherapy", ascending=False)
+        # Patient IDs are the base-36 encoded hash of (original row index + 1e6).
+        df["patient_id"] = (df.index.to_series() + 1e6).apply(_simple_custom_uuid)
+        # verify_integrity=True raises if two rows ended up with the same ID.
+        df = df.set_index("patient_id", verify_integrity=True)
+
+        # Encode the categorical features as integers.
+        df["gender"] = df["gender"].map({"male": 0, "female": 1})
+        df["comorbidity"] = df["comorbidity"].map({"low": 0, "medium": 1, "high": 2})
+
+        # The same constant, (365/2)/ln2 (~263.3), is used for every patient.
+        # NOTE(review): this value is not derived from scale_untreated/scale_treated
+        # above. With medians 365 and 100 the difference in mean recovery time is
+        # (365 - 100)/ln2 (~382.3) -- confirm the intended value and sign convention.
+        true_ute = pd.Series((365.0 / 2.0) / math.log(2.0), index=df.index, name="true_ute")
+
+        return base.CausalDataset(
+            treatments=df["chemotherapy"],
+            outcomes=df[["event_time", "event_indicator"]],
+            features=df[_FEAT_COLS],
+            true_ate=true_ute.mean(),
+            true_ute=true_ute,
+            true_propensity_score=df["prob_chemotherapy"],
+            true_outcomes=df["true_recovery_time"],
+        )
@@ -17,7 +17,7 @@ def __init__(self, n: int = 10):
     def on_epoch_end(self, epoch, logs=None):
         """call at end of epoch"""
         # logs is a dictionary containing metric names and values.
-        if (epoch + 1) % self.n == 0:
+        if (epoch == 0) or ((epoch + 1) % self.n == 0):
             logs = logs or {}
             log_str = f"Epoch {epoch + 1}: " + ", ".join(
                 f"{key}={value:.4f}" for key, value in logs.items()
 
@@ -1,6 +1,6 @@
 """Module for metrics from pypsps predictions."""
 
-import pypress
+import pypress.utils
 import tensorflow as tf
 
 from .. import utils
@@ -11,10 +11,23 @@
 class PropensityScoreBinaryCrossentropy(tf.keras.metrics.BinaryCrossentropy):
     """Computes cross entropy for the propensity score. Used as a metric in pypsps model."""
 
+    def __init__(
+        self,
+        n_outcome_pred_cols: int,
+        n_treatment_pred_cols: int,
+        name="propensity_score_binary_crossentropy",
+        **kwargs,
+    ):
+        """Initializes the metric.
+
+        Args:
+          n_outcome_pred_cols: number of outcome prediction columns; passed to
+            `utils.split_y_pred` when splitting `y_pred`.
+          n_treatment_pred_cols: number of treatment prediction columns; passed to
+            `utils.split_y_pred` when splitting `y_pred`.
+          name: name of the metric.
+          **kwargs: additional keyword arguments forwarded to the parent
+            `tf.keras.metrics.BinaryCrossentropy` constructor.
+        """
+        # Forward **kwargs so parent options are honored instead of silently dropped.
+        super().__init__(name=name, **kwargs)
+        self._n_outcome_pred_cols = n_outcome_pred_cols
+        self._n_treatment_pred_cols = n_treatment_pred_cols
+
     def update_state(self, y_true, y_pred, sample_weight=None):
         """Updates state."""
         _, _, propensity_score = utils.split_y_pred(
-            y_pred, n_outcome_pred_cols=2, n_treatment_pred_cols=1
+            y_pred,
+            n_outcome_pred_cols=self._n_outcome_pred_cols,
+            n_treatment_pred_cols=self._n_treatment_pred_cols,
         )
         treatment_true = y_true[:, 1:]
         super().update_state(
@@ -26,10 +39,17 @@ def update_state(self, y_true, y_pred, sample_weight=None):
 class PropensityScoreAUC(tf.keras.metrics.AUC):
     """AUC computed on the ouptut for propensity part."""
 
+    def __init__(self, n_outcome_pred_cols: int, n_treatment_pred_cols: int, **kwargs):
+        """Initializes the metric.
+
+        Args:
+          n_outcome_pred_cols: number of outcome prediction columns; passed to
+            `utils.split_y_pred` when splitting `y_pred`.
+          n_treatment_pred_cols: number of treatment prediction columns; passed to
+            `utils.split_y_pred` when splitting `y_pred`.
+          **kwargs: additional keyword arguments (e.g. `name`) forwarded to the
+            parent `tf.keras.metrics.AUC` constructor.
+        """
+        # Forward **kwargs so parent options are honored instead of silently dropped.
+        super().__init__(**kwargs)
+        self._n_outcome_pred_cols = n_outcome_pred_cols
+        self._n_treatment_pred_cols = n_treatment_pred_cols
+
     def update_state(self, y_true, y_pred, sample_weight=None):
         """Updates state"""
         _, _, propensity_score = utils.split_y_pred(
-            y_pred, n_outcome_pred_cols=2, n_treatment_pred_cols=1
+            y_pred,
+            n_outcome_pred_cols=self._n_outcome_pred_cols,
+            n_treatment_pred_cols=self._n_treatment_pred_cols,
         )
         treatment_true = y_true[:, 1:]
         super().update_state(
@@ -41,10 +61,26 @@ def update_state(self, y_true, y_pred, sample_weight=None):
 class TreatmentMeanSquaredError(tf.keras.metrics.MeanSquaredError):
     """MSE computed on continuous treatment prediction."""
 
+    def __init__(
+        self,
+        n_outcome_pred_cols: int,
+        n_treatment_pred_cols: int,
+        n_outcome_true_cols: int,
+        **kwargs,
+    ):
+        """Initializes the metric.
+
+        Args:
+          n_outcome_pred_cols: number of outcome prediction columns; passed to
+            `utils.split_y_pred` when splitting `y_pred`.
+          n_treatment_pred_cols: number of treatment prediction columns; passed to
+            `utils.split_y_pred` when splitting `y_pred`.
+          n_outcome_true_cols: number of outcome columns; passed to
+            `utils.split_y_true` when splitting `y_true`.
+          **kwargs: additional keyword arguments (e.g. `name`) forwarded to the
+            parent metric constructor.
+        """
+        # Forward **kwargs so parent options are honored instead of silently dropped.
+        super().__init__(**kwargs)
+        self._n_outcome_true_cols = n_outcome_true_cols
+        self._n_outcome_pred_cols = n_outcome_pred_cols
+        self._n_treatment_pred_cols = n_treatment_pred_cols
+
+    def get_config(self):
+        """Returns the parent config plus the column counts required by `__init__`."""
+        config = super().get_config()
+        config.update(
+            {
+                "n_outcome_pred_cols": self._n_outcome_pred_cols,
+                "n_treatment_pred_cols": self._n_treatment_pred_cols,
+                "n_outcome_true_cols": self._n_outcome_true_cols,
+            }
+        )
+        return config
+
     def update_state(self, y_true, y_pred, sample_weight=None):
         """Updates state"""
-        treat_pred = utils.split_y_pred(y_pred, n_outcome_pred_cols=1, n_treatment_pred_cols=2)[2]
-        treat_true = utils.split_y_true(y_true, n_outcome_true_cols=1)[1]
+        treat_pred = utils.split_y_pred(
+            y_pred,
+            n_outcome_pred_cols=self._n_outcome_pred_cols,
+            n_treatment_pred_cols=self._n_treatment_pred_cols,
+        )[2]
+        treat_true = utils.split_y_true(y_true, n_outcome_true_cols=self._n_outcome_true_cols)[1]
         super().update_state(y_true=treat_true, y_pred=treat_pred, sample_weight=sample_weight)
 
 
@@ -53,21 +89,53 @@ def update_state(self, y_true, y_pred, sample_weight=None):
 class TreatmentMeanAbsoluteError(tf.keras.metrics.MeanAbsoluteError):
     """MAE computed on continuous treatment prediction."""
 
+    def __init__(
+        self,
+        n_outcome_pred_cols: int,
+        n_treatment_pred_cols: int,
+        n_outcome_true_cols: int,
+        **kwargs,
+    ):
+        """Initializes the metric.
+
+        Args:
+          n_outcome_pred_cols: number of outcome prediction columns; passed to
+            `utils.split_y_pred` when splitting `y_pred`.
+          n_treatment_pred_cols: number of treatment prediction columns; passed to
+            `utils.split_y_pred` when splitting `y_pred`.
+          n_outcome_true_cols: number of outcome columns; passed to
+            `utils.split_y_true` when splitting `y_true`.
+          **kwargs: additional keyword arguments (e.g. `name`) forwarded to the
+            parent metric constructor.
+        """
+        # Forward **kwargs so parent options are honored instead of silently dropped.
+        super().__init__(**kwargs)
+        self._n_outcome_true_cols = n_outcome_true_cols
+        self._n_outcome_pred_cols = n_outcome_pred_cols
+        self._n_treatment_pred_cols = n_treatment_pred_cols
+
+    def get_config(self):
+        """Returns the parent config plus the column counts required by `__init__`."""
+        config = super().get_config()
+        config.update(
+            {
+                "n_outcome_pred_cols": self._n_outcome_pred_cols,
+                "n_treatment_pred_cols": self._n_treatment_pred_cols,
+                "n_outcome_true_cols": self._n_outcome_true_cols,
+            }
+        )
+        return config
+
     def update_state(self, y_true, y_pred, sample_weight=None):
         """Updates state"""
-        treat_pred = utils.split_y_pred(y_pred, n_outcome_pred_cols=1, n_treatment_pred_cols=2)[2]
-        treat_true = utils.split_y_true(y_true, n_outcome_true_cols=1)[1]
+        treat_pred = utils.split_y_pred(
+            y_pred,
+            n_outcome_pred_cols=self._n_outcome_pred_cols,
+            n_treatment_pred_cols=self._n_treatment_pred_cols,
+        )[2]
+        # y_true is split by the number of *true outcome* columns, not by the
+        # number of treatment prediction columns.
+        treat_true = utils.split_y_true(y_true, n_outcome_true_cols=self._n_outcome_true_cols)[1]
         super().update_state(y_true=treat_true, y_pred=treat_pred, sample_weight=sample_weight)
 
 
 @tf.keras.utils.register_keras_serializable(package="pypsps")
 class OutcomeMeanSquaredError(tf.keras.metrics.MeanSquaredError):
     """MSE computed on the output for weighted average outcome prediction."""
 
+    def __init__(
+        self,
+        n_outcome_pred_cols: int,
+        n_treatment_pred_cols: int,
+        n_outcome_true_cols: int,
+        **kwargs,
+    ):
+        """Initializes the metric.
+
+        Args:
+          n_outcome_pred_cols: number of outcome prediction columns; passed to
+            `utils.agg_outcome_pred` when aggregating `y_pred`.
+          n_treatment_pred_cols: number of treatment prediction columns; passed to
+            `utils.agg_outcome_pred` when aggregating `y_pred`.
+          n_outcome_true_cols: number of outcome columns; passed to
+            `utils.split_y_true` when splitting `y_true`.
+          **kwargs: additional keyword arguments (e.g. `name`) forwarded to the
+            parent metric constructor.
+        """
+        # Forward **kwargs so parent options are honored instead of silently dropped.
+        super().__init__(**kwargs)
+        self._n_outcome_true_cols = n_outcome_true_cols
+        self._n_outcome_pred_cols = n_outcome_pred_cols
+        self._n_treatment_pred_cols = n_treatment_pred_cols
+
+    def get_config(self):
+        """Returns the parent config plus the column counts required by `__init__`."""
+        config = super().get_config()
+        config.update(
+            {
+                "n_outcome_pred_cols": self._n_outcome_pred_cols,
+                "n_treatment_pred_cols": self._n_treatment_pred_cols,
+                "n_outcome_true_cols": self._n_outcome_true_cols,
+            }
+        )
+        return config
+
     def update_state(self, y_true, y_pred, sample_weight=None):
         """Updates state"""
-        avg_outcome = utils.agg_outcome_pred(y_pred, n_outcome_pred_cols=2, n_treatment_pred_cols=1)
-        outcome_true = utils.split_y_true(y_true, n_outcome_true_cols=1)[0]
+        avg_outcome = utils.agg_outcome_pred(
+            y_pred,
+            n_outcome_pred_cols=self._n_outcome_pred_cols,
+            n_treatment_pred_cols=self._n_treatment_pred_cols,
+        )
+        outcome_true = utils.split_y_true(y_true, n_outcome_true_cols=self._n_outcome_true_cols)[0]
         super().update_state(y_true=outcome_true, y_pred=avg_outcome, sample_weight=sample_weight)
 
 
@@ -76,10 +144,26 @@ def update_state(self, y_true, y_pred, sample_weight=None):
 class OutcomeMeanAbsoluteError(tf.keras.metrics.MeanAbsoluteError):
     """MAE computed on the output for weighted average outcome prediction."""
 
+    def __init__(
+        self,
+        n_outcome_pred_cols: int,
+        n_treatment_pred_cols: int,
+        n_outcome_true_cols: int,
+        **kwargs,
+    ):
+        """Initializes the metric.
+
+        Args:
+          n_outcome_pred_cols: number of outcome prediction columns; passed to
+            `utils.agg_outcome_pred` when aggregating `y_pred`.
+          n_treatment_pred_cols: number of treatment prediction columns; passed to
+            `utils.agg_outcome_pred` when aggregating `y_pred`.
+          n_outcome_true_cols: number of outcome columns; passed to
+            `utils.split_y_true` when splitting `y_true`.
+          **kwargs: additional keyword arguments (e.g. `name`) forwarded to the
+            parent metric constructor.
+        """
+        # Forward **kwargs so parent options are honored instead of silently dropped.
+        super().__init__(**kwargs)
+        self._n_outcome_true_cols = n_outcome_true_cols
+        self._n_outcome_pred_cols = n_outcome_pred_cols
+        self._n_treatment_pred_cols = n_treatment_pred_cols
+
+    def get_config(self):
+        """Returns the parent config plus the column counts required by `__init__`."""
+        config = super().get_config()
+        config.update(
+            {
+                "n_outcome_pred_cols": self._n_outcome_pred_cols,
+                "n_treatment_pred_cols": self._n_treatment_pred_cols,
+                "n_outcome_true_cols": self._n_outcome_true_cols,
+            }
+        )
+        return config
+
     def update_state(self, y_true, y_pred, sample_weight=None):
         """Updates state"""
-        avg_outcome = utils.agg_outcome_pred(y_pred, n_outcome_pred_cols=2, n_treatment_pred_cols=1)
-        outcome_true = utils.split_y_true(y_true, n_outcome_true_cols=1)[0]
+        avg_outcome = utils.agg_outcome_pred(
+            y_pred,
+            n_outcome_pred_cols=self._n_outcome_pred_cols,
+            n_treatment_pred_cols=self._n_treatment_pred_cols,
+        )
+        outcome_true = utils.split_y_true(y_true, n_outcome_true_cols=self._n_outcome_true_cols)[0]
         super().update_state(y_true=outcome_true, y_pred=avg_outcome, sample_weight=sample_weight)