Source code for torchrl.modules.tensordict_module.exploration
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import annotations
import warnings
from typing import Optional, Union
import numpy as np
import torch
from tensordict import TensorDictBase
from tensordict.nn import (
TensorDictModule,
TensorDictModuleBase,
TensorDictModuleWrapper,
)
from tensordict.utils import expand_as_right, expand_right, NestedKey
from torch import nn
from torchrl.data.tensor_specs import Composite, TensorSpec
from torchrl.envs.utils import exploration_type, ExplorationType
from torchrl.modules.tensordict_module.common import _forward_hook_safe_action
__all__ = [
"EGreedyWrapper",
"EGreedyModule",
"AdditiveGaussianModule",
"AdditiveGaussianWrapper",
"OrnsteinUhlenbeckProcessModule",
"OrnsteinUhlenbeckProcessWrapper",
]
[docs]class EGreedyModule(TensorDictModuleBase):
"""Epsilon-Greedy exploration module.
This module randomly updates the action(s) in a tensordict given an epsilon greedy exploration strategy.
At each call, random draws (one per action) are executed given a certain probability threshold. If successful,
the corresponding actions are being replaced by random samples drawn from the action spec provided.
Others are left unchanged.
Args:
spec (TensorSpec): the spec used for sampling actions.
eps_init (scalar, optional): initial epsilon value.
default: 1.0
eps_end (scalar, optional): final epsilon value.
default: 0.1
annealing_num_steps (int, optional): number of steps it will take for epsilon to reach
the ``eps_end`` value. Defaults to `1000`.
Keyword Args:
action_key (NestedKey, optional): the key where the action can be found in the input tensordict.
Default is ``"action"``.
action_mask_key (NestedKey, optional): the key where the action mask can be found in the input tensordict.
Default is ``None`` (corresponding to no mask).
device (torch.device, optional): the device of the exploration module.
.. note::
It is crucial to incorporate a call to :meth:`~.step` in the training loop
to update the exploration factor.
Since it is not easy to capture this omission no warning or exception
will be raised if this is ommitted!
Examples:
>>> import torch
>>> from tensordict import TensorDict
>>> from tensordict.nn import TensorDictSequential
>>> from torchrl.modules import EGreedyModule, Actor
>>> from torchrl.data import Bounded
>>> torch.manual_seed(0)
>>> spec = Bounded(-1, 1, torch.Size([4]))
>>> module = torch.nn.Linear(4, 4, bias=False)
>>> policy = Actor(spec=spec, module=module)
>>> explorative_policy = TensorDictSequential(policy, EGreedyModule(eps_init=0.2))
>>> td = TensorDict({"observation": torch.zeros(10, 4)}, batch_size=[10])
>>> print(explorative_policy(td).get("action"))
tensor([[ 0.0000, 0.0000, 0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000],
[ 0.9055, -0.9277, -0.6295, -0.2532],
[ 0.0000, 0.0000, 0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000]], grad_fn=<AddBackward0>)
"""
def __init__(
self,
spec: TensorSpec,
eps_init: float = 1.0,
eps_end: float = 0.1,
annealing_num_steps: int = 1000,
*,
action_key: Optional[NestedKey] = "action",
action_mask_key: Optional[NestedKey] = None,
device: torch.device | None = None,
):
if not isinstance(eps_init, float):
warnings.warn("eps_init should be a float.")
if eps_end > eps_init:
raise RuntimeError("eps should decrease over time or be constant")
self.action_key = action_key
self.action_mask_key = action_mask_key
in_keys = [self.action_key]
if self.action_mask_key is not None:
in_keys.append(self.action_mask_key)
self.in_keys = in_keys
self.out_keys = [self.action_key]
super().__init__()
self.register_buffer("eps_init", torch.as_tensor(eps_init, device=device))
self.register_buffer("eps_end", torch.as_tensor(eps_end, device=device))
self.annealing_num_steps = annealing_num_steps
self.register_buffer(
"eps", torch.as_tensor(eps_init, dtype=torch.float32, device=device)
)
if spec is not None:
if not isinstance(spec, Composite) and len(self.out_keys) >= 1:
spec = Composite({action_key: spec}, shape=spec.shape[:-1])
if device is not None:
spec = spec.to(device)
self._spec = spec
@property
def spec(self):
return self._spec
[docs] def step(self, frames: int = 1) -> None:
"""A step of epsilon decay.
After `self.annealing_num_steps` calls to this method, calls result in no-op.
Args:
frames (int, optional): number of frames since last step. Defaults to ``1``.
"""
for _ in range(frames):
self.eps.data.copy_(
torch.maximum(
self.eps_end,
(
self.eps
- (self.eps_init - self.eps_end) / self.annealing_num_steps
),
)
)
[docs] def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
expl = exploration_type()
if expl in (ExplorationType.RANDOM, None):
if isinstance(self.action_key, tuple) and len(self.action_key) > 1:
action_tensordict = tensordict.get(self.action_key[:-1])
action_key = self.action_key[-1]
else:
action_tensordict = tensordict
action_key = self.action_key
out = action_tensordict.get(action_key)
eps = self.eps
cond = torch.rand(action_tensordict.shape, device=out.device) < eps
# cond = torch.zeros(action_tensordict.shape, device=out.device, dtype=torch.bool).bernoulli_(eps)
cond = expand_as_right(cond, out)
spec = self.spec
if spec is not None:
if isinstance(spec, Composite):
spec = spec[self.action_key]
if spec.shape != out.shape:
# In batched envs if the spec is passed unbatched, the rand() will not
# cover all batched dims
if (
not len(spec.shape)
or out.shape[-len(spec.shape) :] == spec.shape
):
spec = spec.expand(out.shape)
else:
raise ValueError(
"Action spec shape does not match the action shape"
)
if self.action_mask_key is not None:
action_mask = tensordict.get(self.action_mask_key, None)
if action_mask is None:
raise KeyError(
f"Action mask key {self.action_mask_key} not found in {tensordict}."
)
spec.update_mask(action_mask)
r = spec.rand()
if r.device != out.device:
r = r.to(out.device)
out = torch.where(cond, r, out)
else:
raise RuntimeError("spec must be provided to the exploration wrapper.")
action_tensordict.set(action_key, out)
return tensordict
[docs]class EGreedyWrapper(TensorDictModuleWrapper):
"""[Deprecated] Epsilon-Greedy PO wrapper."""
def __init__(
self,
policy: TensorDictModule,
*,
eps_init: float = 1.0,
eps_end: float = 0.1,
annealing_num_steps: int = 1000,
action_key: Optional[NestedKey] = "action",
action_mask_key: Optional[NestedKey] = None,
spec: Optional[TensorSpec] = None,
):
raise RuntimeError(
"This class has been deprecated in favor of torchrl.modules.EGreedyModule."
)
[docs]class AdditiveGaussianWrapper(TensorDictModuleWrapper):
"""Additive Gaussian PO wrapper.
Args:
policy (TensorDictModule): a policy.
Keyword Args:
sigma_init (scalar, optional): initial epsilon value.
default: 1.0
sigma_end (scalar, optional): final epsilon value.
default: 0.1
annealing_num_steps (int, optional): number of steps it will take for
sigma to reach the :obj:`sigma_end` value.
mean (:obj:`float`, optional): mean of each output element’s normal distribution.
std (:obj:`float`, optional): standard deviation of each output element’s normal distribution.
action_key (NestedKey, optional): if the policy module has more than one output key,
its output spec will be of type Composite. One needs to know where to
find the action spec.
Default is "action".
spec (TensorSpec, optional): if provided, the sampled action will be
projected onto the valid action space once explored. If not provided,
the exploration wrapper will attempt to recover it from the policy.
safe (boolean, optional): if False, the TensorSpec can be None. If it
is set to False but the spec is passed, the projection will still
happen.
Default is True.
device (torch.device, optional): the device where the buffers have to be stored.
.. note::
Once an environment has been wrapped in :class:`AdditiveGaussianWrapper`, it is
crucial to incorporate a call to :meth:`~.step` in the training loop
to update the exploration factor.
Since it is not easy to capture this omission no warning or exception
will be raised if this is ommitted!
"""
def __init__(
self,
policy: TensorDictModule,
*,
sigma_init: float = 1.0,
sigma_end: float = 0.1,
annealing_num_steps: int = 1000,
mean: float = 0.0,
std: float = 1.0,
action_key: Optional[NestedKey] = "action",
spec: Optional[TensorSpec] = None,
safe: Optional[bool] = True,
device: torch.device | None = None,
):
warnings.warn(
"AdditiveGaussianWrapper is deprecated and will be removed "
"in v0.7. Please use torchrl.modules.AdditiveGaussianModule "
"instead.",
category=DeprecationWarning,
)
if device is None and hasattr(policy, "parameters"):
for p in policy.parameters():
device = p.device
break
super().__init__(policy)
if sigma_end > sigma_init:
raise RuntimeError("sigma should decrease over time or be constant")
self.register_buffer("sigma_init", torch.tensor(sigma_init, device=device))
self.register_buffer("sigma_end", torch.tensor(sigma_end, device=device))
self.annealing_num_steps = annealing_num_steps
self.register_buffer("mean", torch.tensor(mean, device=device))
self.register_buffer("std", torch.tensor(std, device=device))
self.register_buffer(
"sigma", torch.tensor(sigma_init, dtype=torch.float32, device=device)
)
self.action_key = action_key
self.out_keys = list(self.td_module.out_keys)
if action_key not in self.out_keys:
raise RuntimeError(
f"The action key {action_key} was not found in the td_module out_keys {self.td_module.out_keys}."
)
if spec is not None:
if not isinstance(spec, Composite) and len(self.out_keys) >= 1:
spec = Composite({action_key: spec}, shape=spec.shape[:-1])
self._spec = spec
elif hasattr(self.td_module, "_spec"):
self._spec = self.td_module._spec.clone()
if action_key not in self._spec.keys(True, True):
self._spec[action_key] = None
elif hasattr(self.td_module, "spec"):
self._spec = self.td_module.spec.clone()
if action_key not in self._spec.keys(True, True):
self._spec[action_key] = None
else:
self._spec = Composite({key: None for key in policy.out_keys})
self.safe = safe
if self.safe:
self.register_forward_hook(_forward_hook_safe_action)
@property
def spec(self):
return self._spec
[docs] def step(self, frames: int = 1) -> None:
"""A step of sigma decay.
After self.annealing_num_steps, this function is a no-op.
Args:
frames (int): number of frames since last step.
"""
for _ in range(frames):
self.sigma.data.copy_(
torch.maximum(
self.sigma_end,
self.sigma
- (self.sigma_init - self.sigma_end) / self.annealing_num_steps,
),
)
def _add_noise(self, action: torch.Tensor) -> torch.Tensor:
sigma = self.sigma
mean = self.mean.expand(action.shape)
std = self.std.expand(action.shape)
if not mean.dtype.is_floating_point:
mean = mean.to(torch.get_default_dtype())
if not std.dtype.is_floating_point:
std = std.to(torch.get_default_dtype())
noise = torch.normal(mean=mean, std=std)
if noise.device != action.device:
noise = noise.to(action.device)
action = action + noise * sigma
spec = self.spec
spec = spec[self.action_key]
if spec is not None:
action = spec.project(action)
elif self.safe:
raise RuntimeError(
"the action spec must be provided to AdditiveGaussianWrapper unless "
"the `safe` keyword argument is turned off at initialization."
)
return action
[docs] def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
tensordict = self.td_module.forward(tensordict)
if exploration_type() is ExplorationType.RANDOM or exploration_type() is None:
out = tensordict.get(self.action_key)
out = self._add_noise(out)
tensordict.set(self.action_key, out)
return tensordict
[docs]class AdditiveGaussianModule(TensorDictModuleBase):
"""Additive Gaussian PO module.
Args:
spec (TensorSpec): the spec used for sampling actions. The sampled
action will be projected onto the valid action space once explored.
sigma_init (scalar, optional): initial epsilon value.
default: 1.0
sigma_end (scalar, optional): final epsilon value.
default: 0.1
annealing_num_steps (int, optional): number of steps it will take for
sigma to reach the :obj:`sigma_end` value.
default: 1000
mean (:obj:`float`, optional): mean of each output element’s normal distribution.
default: 0.0
std (:obj:`float`, optional): standard deviation of each output element’s normal distribution.
default: 1.0
Keyword Args:
action_key (NestedKey, optional): if the policy module has more than one output key,
its output spec will be of type Composite. One needs to know where to
find the action spec.
default: "action"
safe (bool): if ``True``, actions that are out of bounds given the action specs will be projected in the space
given the :obj:`TensorSpec.project` heuristic.
default: False
device (torch.device, optional): the device where the buffers have to be stored.
.. note::
It is
crucial to incorporate a call to :meth:`~.step` in the training loop
to update the exploration factor.
Since it is not easy to capture this omission no warning or exception
will be raised if this is ommitted!
"""
def __init__(
self,
spec: TensorSpec,
sigma_init: float = 1.0,
sigma_end: float = 0.1,
annealing_num_steps: int = 1000,
mean: float = 0.0,
std: float = 1.0,
*,
action_key: Optional[NestedKey] = "action",
# safe is already implemented because we project in the noise addition
safe: bool = False,
device: torch.device | None = None,
):
if not isinstance(sigma_init, float):
warnings.warn("eps_init should be a float.")
if sigma_end > sigma_init:
raise RuntimeError("sigma should decrease over time or be constant")
self.action_key = action_key
self.in_keys = [self.action_key]
self.out_keys = [self.action_key]
super().__init__()
self.register_buffer("sigma_init", torch.tensor(sigma_init, device=device))
self.register_buffer("sigma_end", torch.tensor(sigma_end, device=device))
self.annealing_num_steps = annealing_num_steps
self.register_buffer("mean", torch.tensor(mean, device=device))
self.register_buffer("std", torch.tensor(std, device=device))
self.register_buffer(
"sigma", torch.tensor(sigma_init, dtype=torch.float32, device=device)
)
if spec is not None:
if not isinstance(spec, Composite) and len(self.out_keys) >= 1:
spec = Composite({action_key: spec}, shape=spec.shape[:-1])
else:
raise RuntimeError("spec cannot be None.")
self._spec = spec
self.safe = safe
if self.safe:
self.register_forward_hook(_forward_hook_safe_action)
@property
def spec(self):
return self._spec
[docs] def step(self, frames: int = 1) -> None:
"""A step of sigma decay.
After `self.annealing_num_steps` calls to this method, calls result in no-op.
Args:
frames (int): number of frames since last step. Defaults to ``1``.
"""
for _ in range(frames):
self.sigma.data.copy_(
torch.maximum(
self.sigma_end,
(
self.sigma
- (self.sigma_init - self.sigma_end) / self.annealing_num_steps
),
)
)
def _add_noise(self, action: torch.Tensor) -> torch.Tensor:
sigma = self.sigma
mean = self.mean.expand(action.shape)
std = self.std.expand(action.shape)
if not mean.dtype.is_floating_point:
mean = mean.to(torch.get_default_dtype())
if not std.dtype.is_floating_point:
std = std.to(torch.get_default_dtype())
noise = torch.normal(mean=mean, std=std)
if noise.device != action.device:
noise = noise.to(action.device)
action = action + noise * sigma
spec = self.spec[self.action_key]
action = spec.project(action)
return action
[docs] def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
if exploration_type() is ExplorationType.RANDOM or exploration_type() is None:
out = tensordict.get(self.action_key)
out = self._add_noise(out)
tensordict.set(self.action_key, out)
return tensordict
[docs]class OrnsteinUhlenbeckProcessWrapper(TensorDictModuleWrapper):
r"""Ornstein-Uhlenbeck exploration policy wrapper.
Presented in "CONTINUOUS CONTROL WITH DEEP REINFORCEMENT LEARNING", https://arxiv.org/pdf/1509.02971.pdf.
The OU exploration is to be used with continuous control policies and introduces a auto-correlated exploration
noise. This enables a sort of 'structured' exploration.
Noise equation:
.. math::
noise_t = noise_{t-1} + \theta * (mu - noise_{t-1}) * dt + \sigma_t * \sqrt{dt} * W
Sigma equation:
.. math::
\sigma_t = max(\sigma^{min, (-(\sigma_{t-1} - \sigma^{min}) / (n^{\text{steps annealing}}) * n^{\text{steps}} + \sigma))
To keep track of the steps and noise from sample to sample, an :obj:`"ou_prev_noise{id}"` and :obj:`"ou_steps{id}"` keys
will be written in the input/output tensordict. It is expected that the tensordict will be zeroed at reset,
indicating that a new trajectory is being collected. If not, and is the same tensordict is used for consecutive
trajectories, the step count will keep on increasing across rollouts. Note that the collector classes take care of
zeroing the tensordict at reset time.
.. note::
Once an environment has been wrapped in :class:`OrnsteinUhlenbeckProcessWrapper`, it is
crucial to incorporate a call to :meth:`~.step` in the training loop
to update the exploration factor.
Since it is not easy to capture this omission no warning or exception
will be raised if this is ommitted!
Args:
policy (TensorDictModule): a policy
Keyword Args:
eps_init (scalar): initial epsilon value, determining the amount of noise to be added.
default: 1.0
eps_end (scalar): final epsilon value, determining the amount of noise to be added.
default: 0.1
annealing_num_steps (int): number of steps it will take for epsilon to reach the eps_end value.
default: 1000
theta (scalar): theta factor in the noise equation
default: 0.15
mu (scalar): OU average (mu in the noise equation).
default: 0.0
sigma (scalar): sigma value in the sigma equation.
default: 0.2
dt (scalar): dt in the noise equation.
default: 0.01
x0 (Tensor, ndarray, optional): initial value of the process.
default: 0.0
sigma_min (number, optional): sigma_min in the sigma equation.
default: None
n_steps_annealing (int): number of steps for the sigma annealing.
default: 1000
action_key (NestedKey, optional): key of the action to be modified.
default: "action"
is_init_key (NestedKey, optional): key where to find the is_init flag used to reset the noise steps.
default: "is_init"
spec (TensorSpec, optional): if provided, the sampled action will be
projected onto the valid action space once explored. If not provided,
the exploration wrapper will attempt to recover it from the policy.
safe (bool): if ``True``, actions that are out of bounds given the action specs will be projected in the space
given the :obj:`TensorSpec.project` heuristic.
default: True
device (torch.device, optional): the device where the buffers have to be stored.
Examples:
>>> import torch
>>> from tensordict import TensorDict
>>> from torchrl.data import Bounded
>>> from torchrl.modules import OrnsteinUhlenbeckProcessWrapper, Actor
>>> torch.manual_seed(0)
>>> spec = Bounded(-1, 1, torch.Size([4]))
>>> module = torch.nn.Linear(4, 4, bias=False)
>>> policy = Actor(module=module, spec=spec)
>>> explorative_policy = OrnsteinUhlenbeckProcessWrapper(policy)
>>> td = TensorDict({"observation": torch.zeros(10, 4)}, batch_size=[10])
>>> print(explorative_policy(td))
TensorDict(
fields={
_ou_prev_noise: Tensor(torch.Size([10, 4]), dtype=torch.float32),
_ou_steps: Tensor(torch.Size([10, 1]), dtype=torch.int64),
action: Tensor(torch.Size([10, 4]), dtype=torch.float32),
observation: Tensor(torch.Size([10, 4]), dtype=torch.float32)},
batch_size=torch.Size([10]),
device=None,
is_shared=False)
"""
def __init__(
self,
policy: TensorDictModule,
*,
eps_init: float = 1.0,
eps_end: float = 0.1,
annealing_num_steps: int = 1000,
theta: float = 0.15,
mu: float = 0.0,
sigma: float = 0.2,
dt: float = 1e-2,
x0: Optional[Union[torch.Tensor, np.ndarray]] = None,
sigma_min: Optional[float] = None,
n_steps_annealing: int = 1000,
action_key: Optional[NestedKey] = "action",
is_init_key: Optional[NestedKey] = "is_init",
spec: TensorSpec = None,
safe: bool = True,
key: Optional[NestedKey] = None,
device: torch.device | None = None,
):
warnings.warn(
"OrnsteinUhlenbeckProcessWrapper is deprecated and will be removed "
"in v0.7. Please use torchrl.modules.OrnsteinUhlenbeckProcessModule "
"instead.",
category=DeprecationWarning,
)
if device is None and hasattr(policy, "parameters"):
for p in policy.parameters():
device = p.device
break
if key is not None:
action_key = key
warnings.warn(
f"the 'key' keyword argument of {type(self)} has been renamed 'action_key'. The 'key' entry will be deprecated soon."
)
super().__init__(policy)
self.ou = _OrnsteinUhlenbeckProcess(
theta=theta,
mu=mu,
sigma=sigma,
dt=dt,
x0=x0,
sigma_min=sigma_min,
n_steps_annealing=n_steps_annealing,
key=action_key,
device=device,
)
self.register_buffer("eps_init", torch.tensor(eps_init, device=device))
self.register_buffer("eps_end", torch.tensor(eps_end, device=device))
if self.eps_end > self.eps_init:
raise ValueError(
"eps should decrease over time or be constant, "
f"got eps_init={eps_init} and eps_end={eps_end}"
)
self.annealing_num_steps = annealing_num_steps
self.register_buffer(
"eps", torch.tensor(eps_init, dtype=torch.float32, device=device)
)
self.out_keys = list(self.td_module.out_keys) + self.ou.out_keys
self.is_init_key = is_init_key
noise_key = self.ou.noise_key
steps_key = self.ou.steps_key
if spec is not None:
if not isinstance(spec, Composite) and len(self.out_keys) >= 1:
spec = Composite({action_key: spec}, shape=spec.shape[:-1])
self._spec = spec
elif hasattr(self.td_module, "_spec"):
self._spec = self.td_module._spec.clone()
if action_key not in self._spec.keys(True, True):
self._spec[action_key] = None
elif hasattr(self.td_module, "spec"):
self._spec = self.td_module.spec.clone()
if action_key not in self._spec.keys(True, True):
self._spec[action_key] = None
else:
self._spec = Composite({key: None for key in policy.out_keys})
ou_specs = {
noise_key: None,
steps_key: None,
}
self._spec.update(ou_specs)
if len(set(self.out_keys)) != len(self.out_keys):
raise RuntimeError(f"Got multiple identical output keys: {self.out_keys}")
self.safe = safe
if self.safe:
self.register_forward_hook(_forward_hook_safe_action)
@property
def spec(self):
return self._spec
[docs] def step(self, frames: int = 1) -> None:
"""Updates the eps noise factor.
Args:
frames (int): number of frames of the current batch (corresponding to the number of updates to be made).
"""
for _ in range(frames):
if self.annealing_num_steps > 0:
self.eps.data.copy_(
torch.maximum(
self.eps_end,
(
self.eps
- (self.eps_init - self.eps_end) / self.annealing_num_steps
),
)
)
else:
raise ValueError(
f"{self.__class__.__name__}.step() called when "
f"self.annealing_num_steps={self.annealing_num_steps}. Expected a strictly positive "
f"number of frames."
)
[docs] def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
tensordict = super().forward(tensordict)
if exploration_type() == ExplorationType.RANDOM or exploration_type() is None:
is_init = tensordict.get(self.is_init_key, None)
if is_init is None:
warnings.warn(
f"The tensordict passed to {self.__class__.__name__} appears to be "
f"missing the '{self.is_init_key}' entry. This entry is used to "
f"reset the noise at the beginning of a trajectory, without it "
f"the behavior of this exploration method is undefined. "
f"This is allowed for BC compatibility purposes but it will be deprecated soon! "
f"To create a '{self.is_init_key}' entry, simply append an torchrl.envs.InitTracker "
f"transform to your environment with `env = TransformedEnv(env, InitTracker())`."
)
tensordict = self.ou.add_sample(tensordict, self.eps, is_init=is_init)
return tensordict
[docs]class OrnsteinUhlenbeckProcessModule(TensorDictModuleBase):
r"""Ornstein-Uhlenbeck exploration policy module.
Presented in "CONTINUOUS CONTROL WITH DEEP REINFORCEMENT LEARNING", https://arxiv.org/pdf/1509.02971.pdf.
The OU exploration is to be used with continuous control policies and introduces a auto-correlated exploration
noise. This enables a sort of 'structured' exploration.
Noise equation:
.. math::
noise_t = noise_{t-1} + \theta * (mu - noise_{t-1}) * dt + \sigma_t * \sqrt{dt} * W
Sigma equation:
.. math::
\sigma_t = max(\sigma^{min, (-(\sigma_{t-1} - \sigma^{min}) / (n^{\text{steps annealing}}) * n^{\text{steps}} + \sigma))
To keep track of the steps and noise from sample to sample, an :obj:`"ou_prev_noise{id}"` and :obj:`"ou_steps{id}"` keys
will be written in the input/output tensordict. It is expected that the tensordict will be zeroed at reset,
indicating that a new trajectory is being collected. If not, and is the same tensordict is used for consecutive
trajectories, the step count will keep on increasing across rollouts. Note that the collector classes take care of
zeroing the tensordict at reset time.
.. note::
It is
crucial to incorporate a call to :meth:`~.step` in the training loop
to update the exploration factor.
Since it is not easy to capture this omission no warning or exception
will be raised if this is ommitted!
Args:
spec (TensorSpec): the spec used for sampling actions. The sampled
action will be projected onto the valid action space once explored.
eps_init (scalar): initial epsilon value, determining the amount of noise to be added.
default: 1.0
eps_end (scalar): final epsilon value, determining the amount of noise to be added.
default: 0.1
annealing_num_steps (int): number of steps it will take for epsilon to reach the eps_end value.
default: 1000
theta (scalar): theta factor in the noise equation
default: 0.15
mu (scalar): OU average (mu in the noise equation).
default: 0.0
sigma (scalar): sigma value in the sigma equation.
default: 0.2
dt (scalar): dt in the noise equation.
default: 0.01
x0 (Tensor, ndarray, optional): initial value of the process.
default: 0.0
sigma_min (number, optional): sigma_min in the sigma equation.
default: None
n_steps_annealing (int): number of steps for the sigma annealing.
default: 1000
Keyword Args:
action_key (NestedKey, optional): key of the action to be modified.
default: "action"
is_init_key (NestedKey, optional): key where to find the is_init flag used to reset the noise steps.
default: "is_init"
safe (boolean, optional): if False, the TensorSpec can be None. If it
is set to False but the spec is passed, the projection will still
happen.
Default is True.
device (torch.device, optional): the device where the buffers have to be stored.
Examples:
>>> import torch
>>> from tensordict import TensorDict
>>> from tensordict.nn import TensorDictSequential
>>> from torchrl.data import Bounded
>>> from torchrl.modules import OrnsteinUhlenbeckProcessModule, Actor
>>> torch.manual_seed(0)
>>> spec = Bounded(-1, 1, torch.Size([4]))
>>> module = torch.nn.Linear(4, 4, bias=False)
>>> policy = Actor(module=module, spec=spec)
>>> ou = OrnsteinUhlenbeckProcessModule(spec=spec)
>>> explorative_policy = TensorDictSequential(policy, ou)
>>> td = TensorDict({"observation": torch.zeros(10, 4)}, batch_size=[10])
>>> print(explorative_policy(td))
TensorDict(
fields={
_ou_prev_noise: Tensor(shape=torch.Size([10, 4]), device=cpu, dtype=torch.float32, is_shared=False),
_ou_steps: Tensor(shape=torch.Size([10]), device=cpu, dtype=torch.int64, is_shared=False),
action: Tensor(shape=torch.Size([10, 4]), device=cpu, dtype=torch.float32, is_shared=False),
observation: Tensor(shape=torch.Size([10, 4]), device=cpu, dtype=torch.float32, is_shared=False)},
batch_size=torch.Size([10]),
device=None,
is_shared=False)
"""
def __init__(
self,
spec: TensorSpec,
eps_init: float = 1.0,
eps_end: float = 0.1,
annealing_num_steps: int = 1000,
theta: float = 0.15,
mu: float = 0.0,
sigma: float = 0.2,
dt: float = 1e-2,
x0: torch.Tensor | np.ndarray | None = None,
sigma_min: float | None = None,
n_steps_annealing: int = 1000,
*,
action_key: NestedKey = "action",
is_init_key: NestedKey = "is_init",
safe: bool = True,
device: torch.device | None = None,
):
super().__init__()
self.ou = _OrnsteinUhlenbeckProcess(
theta=theta,
mu=mu,
sigma=sigma,
dt=dt,
x0=x0,
sigma_min=sigma_min,
n_steps_annealing=n_steps_annealing,
key=action_key,
device=device,
)
self.register_buffer("eps_init", torch.tensor(eps_init, device=device))
self.register_buffer("eps_end", torch.tensor(eps_end, device=device))
if self.eps_end > self.eps_init:
raise ValueError(
"eps should decrease over time or be constant, "
f"got eps_init={eps_init} and eps_end={eps_end}"
)
self.annealing_num_steps = annealing_num_steps
self.register_buffer(
"eps", torch.tensor(eps_init, dtype=torch.float32, device=device)
)
self.in_keys = [self.ou.key]
self.out_keys = [self.ou.key] + self.ou.out_keys
self.is_init_key = is_init_key
noise_key = self.ou.noise_key
steps_key = self.ou.steps_key
if spec is not None:
if not isinstance(spec, Composite) and len(self.out_keys) >= 1:
spec = Composite({action_key: spec}, shape=spec.shape[:-1])
self._spec = spec
else:
raise RuntimeError("spec cannot be None.")
ou_specs = {
noise_key: None,
steps_key: None,
}
self._spec.update(ou_specs)
if len(set(self.out_keys)) != len(self.out_keys):
raise RuntimeError(f"Got multiple identical output keys: {self.out_keys}")
self.safe = safe
if self.safe:
self.register_forward_hook(_forward_hook_safe_action)
@property
def spec(self):
return self._spec
[docs] def step(self, frames: int = 1) -> None:
"""Updates the eps noise factor.
Args:
frames (int): number of frames of the current batch (corresponding to the number of updates to be made).
"""
for _ in range(frames):
if self.annealing_num_steps > 0:
self.eps.data.copy_(
torch.maximum(
self.eps_end,
(
self.eps
- (self.eps_init - self.eps_end) / self.annealing_num_steps
),
)
)
else:
raise ValueError(
f"{self.__class__.__name__}.step() called when "
f"self.annealing_num_steps={self.annealing_num_steps}. Expected a strictly positive "
f"number of frames."
)
[docs] def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
if exploration_type() == ExplorationType.RANDOM or exploration_type() is None:
is_init = tensordict.get(self.is_init_key, None)
if is_init is None:
warnings.warn(
f"The tensordict passed to {self.__class__.__name__} appears to be "
f"missing the '{self.is_init_key}' entry. This entry is used to "
f"reset the noise at the beginning of a trajectory, without it "
f"the behavior of this exploration method is undefined. "
f"This is allowed for BC compatibility purposes but it will be deprecated soon! "
f"To create a '{self.is_init_key}' entry, simply append an torchrl.envs.InitTracker "
f"transform to your environment with `env = TransformedEnv(env, InitTracker())`."
)
tensordict = self.ou.add_sample(tensordict, self.eps, is_init=is_init)
return tensordict
# Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab
class _OrnsteinUhlenbeckProcess(nn.Module):
def __init__(
self,
theta: float,
mu: float = 0.0,
sigma: float = 0.2,
dt: float = 1e-2,
x0: Optional[Union[torch.Tensor, np.ndarray]] = None,
sigma_min: Optional[float] = None,
n_steps_annealing: int = 1000,
key: Optional[NestedKey] = "action",
is_init_key: Optional[NestedKey] = "is_init",
device: torch.device | None = None,
):
super().__init__()
self.register_buffer("_empty_tensor_device", torch.zeros(0, device=device))
self.mu = mu
self.sigma = sigma
if sigma_min is not None:
self.m = -float(sigma - sigma_min) / float(n_steps_annealing)
self.c = sigma
self.sigma_min = sigma_min
else:
self.m = 0.0
self.c = sigma
self.sigma_min = sigma
self.theta = theta
self.mu = mu
self.dt = dt
self.x0 = x0 if x0 is not None else 0.0
self.key = key
self.is_init_key = is_init_key
self._noise_key = "_ou_prev_noise"
self._steps_key = "_ou_steps"
self.out_keys = [self.noise_key, self.steps_key]
self._auto_buffer()
def _auto_buffer(self):
for key, item in list(self.__dict__.items()):
if isinstance(item, torch.Tensor):
delattr(self, key)
self.register_buffer(key, item)
@property
def noise_key(self):
return self._noise_key # + str(id(self))
@property
def steps_key(self):
return self._steps_key # + str(id(self))
def _make_noise_pair(
self,
action_tensordict: TensorDictBase,
tensordict: TensorDictBase,
is_init: torch.Tensor,
):
device = tensordict.device
if device is None:
device = self._empty_tensor_device.device
if self.steps_key not in tensordict.keys():
noise = torch.zeros(tensordict.get(self.key).shape, device=device)
steps = torch.zeros(
action_tensordict.batch_size, dtype=torch.long, device=device
)
tensordict.set(self.noise_key, noise)
tensordict.set(self.steps_key, steps)
else:
# We must clone for cudagraph, otherwise the same tensor may re-enter the compiled region
noise = tensordict.get(self.noise_key).clone()
steps = tensordict.get(self.steps_key).clone()
if is_init is not None:
noise = torch.masked_fill(noise, expand_right(is_init, noise.shape), 0)
steps = torch.masked_fill(steps, expand_right(is_init, steps.shape), 0)
return noise, steps
def add_sample(
self,
tensordict: TensorDictBase,
eps: float = 1.0,
is_init: Optional[torch.Tensor] = None,
) -> TensorDictBase:
# Get the nested tensordict where the action lives
if isinstance(self.key, tuple) and len(self.key) > 1:
action_tensordict = tensordict.get(self.key[:-1])
else:
action_tensordict = tensordict
if is_init is None:
is_init = tensordict.get(self.is_init_key, None)
if (
is_init is not None
): # is_init has the shape of done_spec, let's bring it to the action_tensordict shape
if is_init.ndim > 1 and is_init.shape[-1] == 1:
is_init = is_init.squeeze(-1) # Squeeze dangling dim
if (
action_tensordict.ndim >= is_init.ndim
): # if is_init has fewer dimensions than action_tensordict we expand it
is_init = expand_right(is_init, action_tensordict.shape)
else:
is_init = is_init.sum(
tuple(range(action_tensordict.batch_dims, is_init.ndim)),
dtype=torch.bool,
) # otherwise we reduce it to that batch_size
if is_init.shape != action_tensordict.shape:
raise ValueError(
f"'{self.is_init_key}' shape not compatible with action tensordict shape, "
f"got {tensordict.get(self.is_init_key).shape} and {action_tensordict.shape}"
)
prev_noise, n_steps = self._make_noise_pair(
action_tensordict, tensordict, is_init
)
prev_noise = prev_noise + self.x0
noise = (
prev_noise
+ self.theta * (self.mu - prev_noise) * self.dt
+ self.current_sigma(expand_as_right(n_steps, prev_noise))
* np.sqrt(self.dt)
* torch.randn_like(prev_noise)
)
tensordict.set(self.noise_key, noise - self.x0)
tensordict.set(self.key, tensordict.get(self.key) + eps * noise)
tensordict.set(self.steps_key, n_steps + 1)
return tensordict
def current_sigma(self, n_steps: torch.Tensor) -> torch.Tensor:
sigma = (self.m * n_steps + self.c).clamp_min(self.sigma_min)
return sigma