diff --git a/gymnasium/wrappers/stateful_reward.py b/gymnasium/wrappers/stateful_reward.py
index 67e2b784f..96f0a2b16 100644
--- a/gymnasium/wrappers/stateful_reward.py
+++ b/gymnasium/wrappers/stateful_reward.py
@@ -20,10 +20,7 @@ class NormalizeReward(
     gym.Wrapper[ObsType, ActType, ObsType, ActType], gym.utils.RecordConstructorArgs
 ):
-    r"""This wrapper will scale rewards s.t. the discounted returns have a mean of 0 and std of 1.
-
-    In a nutshell, the rewards are divided through by the standard deviation of a rolling discounted sum of the reward.
-    The exponential moving average will have variance :math:`(1 - \gamma)^2`.
+    r"""Normalizes immediate rewards such that their exponential moving average has an approximately fixed variance.

     The property `_update_running_mean` allows to freeze/continue the running mean calculation of the reward
     statistics. If `True` (default), the `RunningMeanStd` will get updated every time `self.normalize()` is called.
@@ -31,11 +28,6 @@ class NormalizeReward(
     A vector version of the wrapper exists :class:`gymnasium.wrappers.vector.NormalizeReward`.

-    Important note:
-        Contrary to what the name suggests, this wrapper does not normalize the rewards to have a mean of 0 and a standard
-        deviation of 1. Instead, it scales the rewards such that **discounted returns** have approximately unit variance.
-        See [Engstrom et al.](https://openreview.net/forum?id=r1etN1rtPB) on "reward scaling" for more information.
-
     Note:
         In v0.27, NormalizeReward was updated as the forward discounted reward estimate was incorrectly computed in Gym v0.25+.
         For more detail, read [#3154](https://github.com/openai/gym/pull/3152).
@@ -74,7 +66,6 @@ class NormalizeReward(
         ...     episode_rewards.append(reward)
         ...
         >>> env.close()
-        >>> # will approach 0.99 with more episodes
         >>> np.var(episode_rewards)
         np.float64(0.010162116476634746)

@@ -89,7 +80,7 @@ def __init__(
         gamma: float = 0.99,
         epsilon: float = 1e-8,
     ):
-        """This wrapper will normalize immediate rewards s.t. their exponential moving average has a fixed variance.
+        """This wrapper will normalize immediate rewards s.t. their exponential moving average has an approximately fixed variance.

         Args:
             env (env): The environment to apply the wrapper
diff --git a/gymnasium/wrappers/vector/stateful_reward.py b/gymnasium/wrappers/vector/stateful_reward.py
index 2e0e8ea50..c0f70e6c2 100644
--- a/gymnasium/wrappers/vector/stateful_reward.py
+++ b/gymnasium/wrappers/vector/stateful_reward.py
@@ -19,20 +19,12 @@
 class NormalizeReward(VectorWrapper, gym.utils.RecordConstructorArgs):
-    r"""This wrapper will scale rewards s.t. the discounted returns have a mean of 0 and std of 1.
-
-    In a nutshell, the rewards are divided through by the standard deviation of a rolling discounted sum of the reward.
-    The exponential moving average will have variance :math:`(1 - \gamma)^2`.
+    r"""This wrapper will scale rewards s.t. their exponential moving average has an approximately fixed variance.

     The property `_update_running_mean` allows to freeze/continue the running mean calculation of the reward
     statistics. If `True` (default), the `RunningMeanStd` will get updated every time `self.normalize()` is called.
     If False, the calculated statistics are used but not updated anymore; this may be used during evaluation.

-    Important note:
-        Contrary to what the name suggests, this wrapper does not normalize the rewards to have a mean of 0 and a standard
-        deviation of 1. Instead, it scales the rewards such that **discounted returns** have approximately unit variance.
-        See [Engstrom et al.](https://openreview.net/forum?id=r1etN1rtPB) on "reward scaling" for more information.
-
     Note:
         The scaling depends on past trajectories and rewards will not be scaled correctly if the wrapper was newly
         instantiated or the policy was changed recently.
@@ -79,7 +71,7 @@ def __init__(
         gamma: float = 0.99,
         epsilon: float = 1e-8,
     ):
-        """This wrapper will normalize immediate rewards s.t. their exponential moving average has a fixed variance.
+        """This wrapper will normalize immediate rewards s.t. their exponential moving average has an approximately fixed variance.

         Args:
             env (env): The environment to apply the wrapper
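For reference, the scaling rule the updated docstrings describe can be sketched in a few lines. This is a simplified illustration, not the wrapper's actual implementation: the names `RunningVariance`, `make_normalizer`, and `discounted_return` are invented for this sketch, and gymnasium's real `RunningMeanStd` performs batched updates. The idea is to keep a rolling discounted sum of rewards and divide each immediate reward by the standard deviation of that sum, so only the scale of the rewards changes, not their mean.

```python
import numpy as np


class RunningVariance:
    """Single-sample running mean/variance (a simplified stand-in for RunningMeanStd)."""

    def __init__(self):
        self.mean, self.var, self.count = 0.0, 1.0, 1e-4

    def update(self, x: float) -> None:
        # Welford-style update of the running moments with one new sample.
        self.count += 1
        delta = x - self.mean
        self.mean += delta / self.count
        self.var += (delta * (x - self.mean) - self.var) / self.count


def make_normalizer(gamma: float = 0.99, epsilon: float = 1e-8):
    """Returns a function that scales rewards as the updated docstrings describe."""
    stats = RunningVariance()
    discounted_return = 0.0

    def normalize(reward: float, terminated: bool) -> float:
        nonlocal discounted_return
        # Rolling discounted sum of rewards, reset when an episode terminates.
        discounted_return = discounted_return * gamma * (1.0 - terminated) + reward
        stats.update(discounted_return)
        # Divide the immediate reward by the std of that rolling sum;
        # the mean is never subtracted, so this is scaling, not centering.
        return reward / np.sqrt(stats.var + epsilon)

    return normalize
```

Because the statistics start from scratch, early rewards are divided by a poor variance estimate, which is the caveat the vector docstring's Note retains about newly instantiated wrappers and recently changed policies.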