@@ -85,6 +85,7 @@ def __init__(self,
                  priority_replay_eps=1e-6,
                  offline_buffer_dir=None,
                  offline_buffer_length=None,
+                 offline_loss_weight=1.0,
                  rl_train_after_update_steps=0,
                  rl_train_every_update_steps=1,
                  empty_cache: bool = False,
@@ -335,6 +336,12 @@ def __init__(self,
                 buffer length is offline_buffer_length * len(offline_buffer_dir).
                 If None, all the samples from all the provided replay buffer
                 checkpoints will be loaded.
+            offline_loss_weight (float|Scheduler): weight for the offline loss.
+                The current behavior is that whenever the offline loss weight
+                becomes 0, the offline replay buffer will be released and we
+                switch from performing hybrid updates to online updates for
+                speed. In other words, it is assumed that the weight never goes
+                to zero and then climbs back up afterwards.
             rl_train_after_update_steps (int): only used in the hybrid training
                 mode. It is used as a starting criterion for the normal (non-offline)
                 part of the RL training, which only starts after so many number
@@ -437,6 +444,7 @@ def __init__(self,
         # offline options
         self.offline_buffer_dir = offline_buffer_dir
         self.offline_buffer_length = offline_buffer_length
+        self.offline_loss_weight = as_scheduler(offline_loss_weight)
         self.rl_train_after_update_steps = rl_train_after_update_steps
         self.rl_train_every_update_steps = rl_train_every_update_steps
         self.empty_cache = empty_cache
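
The new parameter accepts either a plain float or a scheduler, and `as_scheduler` normalizes it before use. Below is a minimal, self-contained sketch of the intended semantics; `as_scheduler`, `HybridTrainerSketch`, and `update` here are simplified stand-ins for illustration, not the library's actual implementation. A float is wrapped into a constant schedule, and once the scheduled weight reaches 0 the offline buffer is released and all subsequent updates are online-only.

```python
# Hypothetical, simplified sketch of a scheduler-valued offline_loss_weight.
# The real as_scheduler() and hybrid-update logic live in the library; all
# names below are illustrative assumptions.

def as_scheduler(value):
    """Pass schedulers (callables) through; wrap a plain float as a constant."""
    if callable(value):
        return value
    return lambda: value


class HybridTrainerSketch:
    def __init__(self, offline_loss_weight=1.0):
        self.offline_loss_weight = as_scheduler(offline_loss_weight)
        self._offline_buffer = ["offline batch"]  # placeholder for the offline replay buffer

    def update(self, online_loss, offline_loss):
        w = self.offline_loss_weight()
        if w == 0 and self._offline_buffer is not None:
            # Once the weight hits 0, the offline buffer is released and we
            # permanently switch from hybrid updates to online-only updates.
            self._offline_buffer = None
        if self._offline_buffer is None:
            return online_loss
        return online_loss + w * offline_loss


# Example: a weight that linearly decays to 0 over 100 training iterations.
step = 0
decaying_weight = lambda: max(0.0, 1.0 - step / 100)
trainer = HybridTrainerSketch(offline_loss_weight=decaying_weight)
print(trainer.update(online_loss=0.5, offline_loss=0.3))  # hybrid: 0.5 + 1.0 * 0.3
```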