
Commit ac142f0

Summarize the first importance ratio for PPO (#1795)
* Summarize the first importance ratio for PPO

  The importance ratio for the first gradient update in each iteration should be exactly 1 (or very close to it), since the policy has not yet changed since the rollout. An importance ratio away from 1 indicates something is wrong, so it is useful to summarize it to uncover hidden bugs.

* Address comments

  Also summarize importance_ratio - 1 for better visualization.
1 parent 0682417 commit ac142f0
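
To make the check concrete, here is a minimal sketch (not part of this commit; the tensor values are made up) of why the first-step ratio is 1 and why plotting importance_ratio - 1 is easier to read:

import torch

# Minimal sketch, not code from this commit. The PPO importance ratio is
# exp(log_prob_new - log_prob_old). At the first gradient step of an
# iteration, the policy computing log_prob_new is still identical to the
# rollout policy that produced log_prob_old, so the ratio is 1 up to
# floating point error.
log_prob_old = torch.randn(5)        # log-probs recorded at rollout time
log_prob_new = log_prob_old.clone()  # first gradient step: same policy
importance_ratio = torch.exp(log_prob_new - log_prob_old)

# Summarizing importance_ratio - 1 centers the histogram at 0, so tiny
# deviations from 1 are much easier to spot in the summaries.
print((importance_ratio - 1).abs().max())  # expected: 0 (in training, merely very close to 0)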

File tree: 3 files changed, +58 -3 lines changed

  alf/algorithms/algorithm.py
  alf/algorithms/ppo_loss.py
  alf/summary/summary_ops.py

alf/algorithms/algorithm.py

Lines changed: 3 additions & 0 deletions
@@ -1640,12 +1640,15 @@ def _train_experience(self,
         if self._config.empty_cache:
             torch.cuda.empty_cache()

+        grad_step = 0
         indices = None
         for u in range(num_updates):
             if mini_batch_size < batch_size:
                 indices = torch.randperm(batch_size,
                                          device=experience.step_type.device)
             for b in range(0, batch_size, mini_batch_size):
+                alf.summary.set_grad_step_counter(grad_step)
+                grad_step += 1

                 is_last_mini_batch = (u == num_updates - 1
                                       and b + mini_batch_size >= batch_size)
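
For illustration, a tiny standalone sketch of how grad_step advances inside this loop (the numbers are made up; the real values come from the trainer config and the batch):

# Hypothetical values only, to show how the counter advances; in ALF they
# come from TrainerConfig and the sampled batch, not from these literals.
num_updates, batch_size, mini_batch_size = 2, 6, 3

grad_step = 0
for u in range(num_updates):
    for b in range(0, batch_size, mini_batch_size):
        # Here the real code calls alf.summary.set_grad_step_counter(grad_step).
        print(u, b, grad_step)
        grad_step += 1
# grad_step takes the values 0, 1, 2, 3 within one training iteration; only
# step 0 uses the policy exactly as it was at rollout time, which is why the
# importance ratio check in ppo_loss.py only applies there.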

alf/algorithms/ppo_loss.py

Lines changed: 22 additions & 3 deletions
@@ -135,6 +135,22 @@ def _pg_loss(self, info, advantages):
             log_prob_clipping=self._log_prob_clipping,
             check_numerics=self._check_numerics,
             debug_summaries=self._debug_summaries)
+        if alf.summary.get_grad_step_counter() == 0:
+            # For the first gradient step in one iteration, the importance ratios
+            # should be 1. Summarize them so that we can notice something is wrong
+            # if they are not 1. Note that due to floating point precision,
+            # importance_ratio0 may not be exactly 1, but it should be very close
+            # to 1.
+            global_step = alf.summary.get_global_counter()
+            summary_interval = alf.get_config_value(
+                'TrainerConfig.summary_interval')
+            if global_step < summary_interval or global_step % summary_interval == 0:
+                with alf.summary.record_if(lambda: True), scope:
+                    alf.summary.histogram('importance_ratio0_minus1',
+                                          importance_ratio - 1)
+                    alf.summary.scalar('importance_ratio0_minus1_abs',
+                                       (importance_ratio - 1).abs().mean())
+
         # Pessimistically choose the maximum objective value for clipped and
         # unclipped importance ratios.
         pg_objective = -importance_ratio * advantages
@@ -143,9 +159,12 @@ def _pg_loss(self, info, advantages):

         if self._debug_summaries and alf.summary.should_record_summaries():
             with scope:
-                alf.summary.histogram('pg_objective', pg_objective)
-                alf.summary.histogram('pg_objective_clipped',
-                                      pg_objective_clipped)
+                alf.summary.scalar('pg_objective', pg_objective.mean())
+                alf.summary.scalar('pg_objective_clipped',
+                                   pg_objective_clipped.mean())
+                alf.summary.scalar('objective_clip_fraction',
+                                   (pg_objective_clipped
+                                    > pg_objective).float().mean())

         if self._check_numerics:
             assert torch.all(torch.isfinite(policy_gradient_loss))
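
A small standalone sketch (with a made-up summary_interval) of the gating condition used above, which writes the extra summary during the first summary_interval iterations and then once every summary_interval iterations:

# Assumed value for illustration; the real one is read from
# TrainerConfig.summary_interval via alf.get_config_value().
summary_interval = 100

def should_write(global_step):
    # Same condition as in the diff: always write during the warm-up phase,
    # then only on iterations that are multiples of the interval.
    return global_step < summary_interval or global_step % summary_interval == 0

print([s for s in range(0, 301, 50) if should_write(s)])
# -> [0, 50, 100, 200, 300]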

alf/summary/summary_ops.py

Lines changed: 33 additions & 0 deletions
@@ -336,6 +336,39 @@ def set_global_counter(counter):
     update_progress("global_counter", counter)


+_grad_step_counter = 0
+
+
+def get_grad_step_counter():
+    """Get which gradient step we are at in the current global step.
+
+    For many algorithms, each global step (iteration) may have multiple gradient
+    steps. Typically, only the last gradient step will be recorded in summary.
+    This function returns the current gradient step counter. If an algorithm needs
+    to record summaries at a different gradient step, it can use
+    `with record_if(lambda: alf.summary.get_grad_step_counter() == n):`
+    to record summaries at gradient step `n`.
+
+    Returns:
+        int: the current gradient step counter. The first gradient step in a
+        global step is 0, the second is 1, etc.
+    """
+    return _grad_step_counter
+
+
+def set_grad_step_counter(counter):
+    """Set the current gradient step counter.
+
+    This function is used by the ALF framework to set the gradient step counter
+    before running the gradient step.
+
+    Args:
+        counter (int): the gradient step counter to set
+    """
+    global _grad_step_counter
+    _grad_step_counter = counter
+
+
 class record_if(object):
     """Context manager to set summary recording on or off according to `cond`."""
