Shortcuts

Source code for torchrl.envs.custom.tictactoeenv

# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import annotations

from typing import Optional

import torch
from tensordict import TensorDict, TensorDictBase

from torchrl.data.tensor_specs import Categorical, Composite, Unbounded
from torchrl.envs.common import EnvBase


class TicTacToeEnv(EnvBase):
    """A Tic-Tac-Toe implementation.

    Keyword Args:
        single_player (bool, optional): whether one or two players have to be
            accounted for. ``single_player=True`` means that ``"player1"`` is
            playing randomly. If ``False`` (default), at each turn,
            one of the two players has to play.
        device (torch.device, optional): the device where to put the tensors.
            Defaults to ``None`` (default device).

    The environment is stateless. To run it across multiple batches, call

        >>> env.reset(TensorDict(batch_size=desired_batch_size))

    If the ``"mask"`` entry is present, ``rand_action`` takes it into account to
    generate the next action. Any policy executed on this env should take this
    mask into account, as well as the turn of the player (stored in the ``"turn"``
    output entry).

    Board encoding (as implemented by ``_reset`` and ``_step``): an empty cell
    holds ``-1``; when a player moves, their cell is set to ``1``; before the
    board is handed to the next player every non-empty cell ``v`` is replaced
    by ``1 - v``, so the marks just placed read ``0`` and the other player's
    marks read ``1`` in the returned board.

    Specs:
        >>> print(env.specs)
        Composite(
            output_spec: Composite(
                full_observation_spec: Composite(
                    board: Categorical(
                        shape=torch.Size([3, 3]),
                        space=DiscreteBox(n=2),
                        dtype=torch.int32,
                        domain=discrete),
                    turn: Categorical(
                        shape=torch.Size([1]),
                        space=DiscreteBox(n=2),
                        dtype=torch.int32,
                        domain=discrete),
                    mask: Categorical(
                        shape=torch.Size([9]),
                        space=DiscreteBox(n=2),
                        dtype=torch.bool,
                        domain=discrete),
                    shape=torch.Size([])),
                full_reward_spec: Composite(
                    player0: Composite(
                        reward: UnboundedContinuous(
                            shape=torch.Size([1]),
                            space=ContinuousBox(
                                low=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True),
                                high=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True)),
                            dtype=torch.float32,
                            domain=continuous),
                        shape=torch.Size([])),
                    player1: Composite(
                        reward: UnboundedContinuous(
                            shape=torch.Size([1]),
                            space=ContinuousBox(
                                low=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True),
                                high=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True)),
                            dtype=torch.float32,
                            domain=continuous),
                        shape=torch.Size([])),
                    shape=torch.Size([])),
                full_done_spec: Composite(
                    done: Categorical(
                        shape=torch.Size([1]),
                        space=DiscreteBox(n=2),
                        dtype=torch.bool,
                        domain=discrete),
                    terminated: Categorical(
                        shape=torch.Size([1]),
                        space=DiscreteBox(n=2),
                        dtype=torch.bool,
                        domain=discrete),
                    truncated: Categorical(
                        shape=torch.Size([1]),
                        space=DiscreteBox(n=2),
                        dtype=torch.bool,
                        domain=discrete),
                    shape=torch.Size([])),
                shape=torch.Size([])),
            input_spec: Composite(
                full_state_spec: Composite(
                    board: Categorical(
                        shape=torch.Size([3, 3]),
                        space=DiscreteBox(n=2),
                        dtype=torch.int32,
                        domain=discrete),
                    turn: Categorical(
                        shape=torch.Size([1]),
                        space=DiscreteBox(n=2),
                        dtype=torch.int32,
                        domain=discrete),
                    mask: Categorical(
                        shape=torch.Size([9]),
                        space=DiscreteBox(n=2),
                        dtype=torch.bool,
                        domain=discrete),
                    shape=torch.Size([])),
                full_action_spec: Composite(
                    action: Categorical(
                        shape=torch.Size([1]),
                        space=DiscreteBox(n=9),
                        dtype=torch.int64,
                        domain=discrete),
                    shape=torch.Size([])),
                shape=torch.Size([])),
            shape=torch.Size([]))

    To run a dummy rollout, execute the following command:

    Examples:
        >>> env = TicTacToeEnv()
        >>> env.rollout(10)
        TensorDict(
            fields={
                action: Tensor(shape=torch.Size([9, 1]), device=cpu, dtype=torch.int64, is_shared=False),
                board: Tensor(shape=torch.Size([9, 3, 3]), device=cpu, dtype=torch.int32, is_shared=False),
                done: Tensor(shape=torch.Size([9, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                mask: Tensor(shape=torch.Size([9, 9]), device=cpu, dtype=torch.bool, is_shared=False),
                next: TensorDict(
                    fields={
                        board: Tensor(shape=torch.Size([9, 3, 3]), device=cpu, dtype=torch.int32, is_shared=False),
                        done: Tensor(shape=torch.Size([9, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                        mask: Tensor(shape=torch.Size([9, 9]), device=cpu, dtype=torch.bool, is_shared=False),
                        player0: TensorDict(
                            fields={
                                reward: Tensor(shape=torch.Size([9, 1]), device=cpu, dtype=torch.float32, is_shared=False)},
                            batch_size=torch.Size([9]),
                            device=None,
                            is_shared=False),
                        player1: TensorDict(
                            fields={
                                reward: Tensor(shape=torch.Size([9, 1]), device=cpu, dtype=torch.float32, is_shared=False)},
                            batch_size=torch.Size([9]),
                            device=None,
                            is_shared=False),
                        terminated: Tensor(shape=torch.Size([9, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                        truncated: Tensor(shape=torch.Size([9, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                        turn: Tensor(shape=torch.Size([9, 1]), device=cpu, dtype=torch.int32, is_shared=False)},
                    batch_size=torch.Size([9]),
                    device=None,
                    is_shared=False),
                terminated: Tensor(shape=torch.Size([9, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                truncated: Tensor(shape=torch.Size([9, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                turn: Tensor(shape=torch.Size([9, 1]), device=cpu, dtype=torch.int32, is_shared=False)},
            batch_size=torch.Size([9]),
            device=None,
            is_shared=False)

    """

    # batch_locked is set to False since various batch sizes can be provided to the env
    batch_locked: bool = False

    def __init__(self, *, single_player: bool = False, device=None):
        super().__init__(device=device)
        self.single_player = single_player

        # One action per move: the flat index (0..8) of the cell to play.
        self.action_spec: Categorical = Categorical(
            n=9,
            shape=(),
            device=device,
        )
        self.full_observation_spec: Composite = Composite(
            board=Unbounded(shape=(3, 3), dtype=torch.int, device=device),
            turn=Categorical(
                2,
                shape=(1,),
                dtype=torch.int,
                device=device,
            ),
            mask=Categorical(
                2,
                shape=(9,),
                dtype=torch.bool,
                device=device,
            ),
            device=device,
        )
        # The env is stateless: everything observed is also required as input.
        self.state_spec: Composite = self.observation_spec.clone()

        self.reward_spec: Composite = Composite(
            {
                ("player0", "reward"): Unbounded(shape=(1,), device=device),
                ("player1", "reward"): Unbounded(shape=(1,), device=device),
            },
            device=device,
        )

        self.full_done_spec: Composite = Composite(
            done=Categorical(2, shape=(1,), dtype=torch.bool, device=device),
            device=device,
        )
        self.full_done_spec["terminated"] = self.full_done_spec["done"].clone()
        self.full_done_spec["truncated"] = self.full_done_spec["done"].clone()

    def _reset(self, reset_td: TensorDict | None) -> TensorDict:
        """Build a fresh game state.

        Args:
            reset_td: optional tensordict whose ``shape`` gives the batch size
                of the state to create; ``None`` yields an unbatched state.

        Returns:
            A tensordict holding an empty board (all ``-1``), ``turn`` set to
            zero, an all-``True`` mask and zeroed done flags.
        """
        shape = reset_td.shape if reset_td is not None else ()
        state = self.state_spec.zero(shape)
        # zero() gives an all-0 board; shift it so that every cell is "empty" (-1).
        state["board"] -= 1
        state["mask"].fill_(True)
        return state.update(self.full_done_spec.zero(shape))

    def _step(self, state: TensorDict) -> TensorDict:
        """Play ``state["action"]`` for the player whose turn it is.

        Args:
            state: tensordict with the ``"board"``, ``"turn"`` and ``"action"``
                entries.

        Returns:
            A new tensordict with the done flags, both players' rewards
            (``1.0`` for the player who just completed a line, ``0.0``
            otherwise), the board as seen by the next player, the next turn
            and the mask of still-empty cells. With ``single_player=True`` the
            random reply of ``"player1"`` is played before returning.
        """
        board = state["board"].clone()
        turn = state["turn"].clone()
        action = state["action"]

        # Mark the played cell with 1. NOTE(review): this relies on
        # flatten(-2, -1) returning a view of the freshly cloned board so the
        # in-place scatter_ lands in `board` - holds when the clone is contiguous.
        board.flatten(-2, -1).scatter_(index=action.unsqueeze(-1), dim=-1, value=1)

        wins = self.win(board, action)
        mask = board.flatten(-2, -1) == -1
        # The game stops on a win or when no empty cell is left (draw).
        done = wins | ~mask.any(-1, keepdim=True)
        terminated = done.clone()

        reward_0 = wins & (turn == 0)
        reward_1 = wins & (turn == 1)

        next_state = TensorDict(
            {
                "done": done,
                "terminated": terminated,
                ("player0", "reward"): reward_0.float(),
                ("player1", "reward"): reward_1.float(),
                # Swap 0 <-> 1 on occupied cells, leave empty cells at -1.
                "board": torch.where(board == -1, board, 1 - board),
                "turn": 1 - turn,
                "mask": mask,
            },
            batch_size=state.batch_size,
        )
        if not self.single_player:
            return next_state

        # Boards where player0 just moved and the game goes on need a random
        # reply from player1. The recursive call sees turn == 1, so it never
        # recurses further.
        needs_reply = (~done & (turn == 0)).squeeze(-1)
        if not needs_reply.any():
            return next_state
        if needs_reply.all():
            return self._step(self.rand_action(next_state))
        # Mixed batch: step only the boards that need a reply and write the
        # result back in place, leaving the other boards untouched.
        replies = self._step(self.rand_action(next_state[needs_reply]))
        next_state[needs_reply] = replies
        return next_state

    def _set_seed(self, seed: int | None):
        """No-op: the environment holds no random state of its own."""
        ...

    @staticmethod
    def win(board: torch.Tensor, action: torch.Tensor):
        """Tell, board by board, whether a line of three ``1`` cells exists.

        Args:
            board: tensor whose last two dimensions are the 3x3 grid, evaluated
                right after a move (the mover's cells hold ``1``).
            action: the move that was just played. Kept for interface
                compatibility; every row, column and both diagonals are
                checked, so it is not needed to locate the line.

        Returns:
            A boolean tensor of shape ``(*batch, 1)``: ``True`` where a row, a
            column or a diagonal of that board is entirely ``1``.
        """
        marks = board == 1
        any_row = marks.all(-1).any(-1)
        any_col = marks.all(-2).any(-1)
        main_diag = marks.diagonal(0, -2, -1).all(-1)
        anti_diag = marks.flip(-1).diagonal(0, -2, -1).all(-1)
        # Trailing singleton dim matches the shape of the done/turn entries.
        return (any_row | any_col | main_diag | anti_diag).unsqueeze(-1)

    @staticmethod
    def full(board: torch.Tensor) -> bool:
        """Return ``True`` when ``board`` has no empty (``-1``) cell left."""
        return not bool((board == -1).any())

    @staticmethod
    def get_action_mask():
        """Placeholder; the mask is produced by ``_reset`` and ``_step``."""
        pass

    def rand_action(self, tensordict: Optional[TensorDictBase] = None):
        """Sample a random action, restricted to the ``"mask"`` entry if present.

        Args:
            tensordict: the current state. When ``None``, the parent class
                implementation is used since there is no mask to read.

        Returns:
            ``tensordict`` with the sampled action written under the env's
            action key.
        """
        if tensordict is None:
            return super().rand_action(tensordict)
        # Explicit default so a missing mask yields None instead of raising.
        mask = tensordict.get("mask", None)
        action_spec = self.action_spec
        if tensordict.ndim:
            action_spec = action_spec.expand(tensordict.shape)
        else:
            # Clone so that setting the mask does not alter the env's own spec.
            action_spec = action_spec.clone()
        action_spec.update_mask(mask)
        tensordict.set(self.action_key, action_spec.rand())
        return tensordict

Docs

Access comprehensive developer documentation for PyTorch

View Docs

Tutorials

Get in-depth tutorials for beginners and advanced developers

View Tutorials

Resources

Find development resources and get your questions answered

View Resources