Source code for torchrl.data.datasets.openml

# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import annotations

import os
from pathlib import Path
from typing import Callable

import numpy as np
from tensordict import TensorDict
from torchrl.data.datasets.common import BaseDatasetExperienceReplay

from torchrl.data.datasets.utils import _get_root_dir
from torchrl.data.replay_buffers import (
    Sampler,
    SamplerWithoutReplacement,
    TensorStorage,
    Writer,
)


class OpenMLExperienceReplay(BaseDatasetExperienceReplay):
    """An experience replay for OpenML data.

    This class provides an easy entry point for public datasets.
    See "Dua, D. and Graff, C. (2017) UCI Machine Learning Repository.
    http://archive.ics.uci.edu/ml"

    The data format follows the :ref:`TED convention <TED-format>`.

    The data is accessed via scikit-learn. Make sure scikit-learn and pandas
    are installed before retrieving the data:

    .. code-block::

      $ pip install scikit-learn pandas -U

    Args:
        name (str): the following datasets are supported:
            ``"adult_num"``, ``"adult_onehot"``, ``"mushroom_num"``,
            ``"mushroom_onehot"``, ``"covertype"``, ``"shuttle"`` and
            ``"magic"``.
        batch_size (int): the batch size to use during sampling.
        sampler (Sampler, optional): the sampler to be used. If none is
            provided, a default RandomSampler() will be used.
        writer (Writer, optional): the writer to be used. If none is provided,
            a default :class:`~torchrl.data.replay_buffers.writers.ImmutableDatasetWriter`
            will be used.
        collate_fn (callable, optional): merges a list of samples to form a
            mini-batch of Tensor(s)/outputs. Used when using batched loading
            from a map-style dataset.
        pin_memory (bool): whether pin_memory() should be called on the rb
            samples.
        prefetch (int, optional): number of next batches to be prefetched
            using multithreading.
        transform (Transform, optional): Transform to be executed when
            sample() is called. To chain transforms, use the
            :class:`~torchrl.envs.transforms.transforms.Compose` class.

    """

    def __init__(
        self,
        name: str,
        batch_size: int,
        root: Path | None = None,
        sampler: Sampler | None = None,
        writer: Writer | None = None,
        collate_fn: Callable | None = None,
        pin_memory: bool = False,
        prefetch: int | None = None,
        transform: "Transform" | None = None,  # noqa: F821
    ):
        if sampler is None:
            sampler = SamplerWithoutReplacement()
        if root is None:
            root = _get_root_dir("openml")
        self.root = Path(root)
        self.dataset_id = name

        if not self._is_downloaded():
            dataset = self._get_data(name)
            storage = TensorStorage(dataset.memmap(self._dataset_path))
        else:
            dataset = TensorDict.load_memmap(self._dataset_path)
            storage = TensorStorage(dataset)

        self.max_outcome_val = dataset["y"].max().item()

        super().__init__(
            batch_size=batch_size,
            storage=storage,
            sampler=sampler,
            writer=writer,
            collate_fn=collate_fn,
            pin_memory=pin_memory,
            prefetch=prefetch,
            transform=transform,
        )

    @property
    def _dataset_path(self):
        return self.root / self.dataset_id

    def _is_downloaded(self):
        return os.path.exists(self._dataset_path)

    @classmethod
    def _get_data(cls, dataset_name):
        try:
            import pandas  # noqa: F401
            from sklearn.datasets import fetch_openml
            from sklearn.preprocessing import (
                LabelEncoder,
                OneHotEncoder,
                StandardScaler,
            )
        except ModuleNotFoundError:
            raise ModuleNotFoundError(
                "Make sure scikit-learn and pandas are installed before "
                f"creating a {cls.__name__} instance."
            )
        if dataset_name in ["adult_num", "adult_onehot"]:
            X, y = fetch_openml("adult", version=1, return_X_y=True)
            is_NaN = X.isna()
            row_has_NaN = is_NaN.any(axis=1)
            X = X[~row_has_NaN]
            # y = y[~row_has_NaN]
            y = X["occupation"]
            X = X.drop(["occupation"], axis=1)
            cat_ix = X.select_dtypes(include=["category"]).columns
            num_ix = X.select_dtypes(include=["int64", "float64"]).columns
            encoder = LabelEncoder()
            # now apply the transformation to all the columns:
            for col in cat_ix:
                X[col] = encoder.fit_transform(X[col])
            y = encoder.fit_transform(y)
            if dataset_name == "adult_onehot":
                cat_features = OneHotEncoder(sparse_output=False).fit_transform(
                    X[cat_ix]
                )
                num_features = StandardScaler().fit_transform(X[num_ix])
                X = np.concatenate((num_features, cat_features), axis=1)
            else:
                X = StandardScaler().fit_transform(X)
        elif dataset_name in ["mushroom_num", "mushroom_onehot"]:
            X, y = fetch_openml("mushroom", version=1, return_X_y=True)
            encoder = LabelEncoder()
            # now apply the transformation to all the columns:
            for col in X.columns:
                X[col] = encoder.fit_transform(X[col])
            # X = X.drop(["veil-type"],axis=1)
            y = encoder.fit_transform(y)
            if dataset_name == "mushroom_onehot":
                X = OneHotEncoder(sparse_output=False).fit_transform(X)
            else:
                X = StandardScaler().fit_transform(X)
        elif dataset_name == "covertype":
            # https://www.openml.org/d/150
            # there are some 0/1 features -> consider just numeric
            X, y = fetch_openml("covertype", version=3, return_X_y=True)
            X = StandardScaler().fit_transform(X)
            y = LabelEncoder().fit_transform(y)
        elif dataset_name == "shuttle":
            # https://www.openml.org/d/40685
            # all numeric, no missing values
            X, y = fetch_openml("shuttle", version=1, return_X_y=True)
            X = StandardScaler().fit_transform(X)
            y = LabelEncoder().fit_transform(y)
        elif dataset_name == "magic":
            # https://www.openml.org/d/1120
            # all numeric, no missing values
            X, y = fetch_openml("MagicTelescope", version=1, return_X_y=True)
            X = StandardScaler().fit_transform(X)
            y = LabelEncoder().fit_transform(y)
        else:
            raise RuntimeError("Dataset does not exist")
        return TensorDict({"X": X, "y": y}, X.shape[:1])
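For reference, a minimal usage sketch (not part of the module source; the dataset name and batch size below are illustrative, and scikit-learn and pandas must be installed):

from torchrl.data.datasets.openml import OpenMLExperienceReplay

# The first call downloads the dataset via scikit-learn and memory-maps it
# under the default root; later calls reload the memmapped TensorDict.
buffer = OpenMLExperienceReplay("adult_onehot", batch_size=64)

# sample() returns a TensorDict with keys "X" (features) and "y" (labels),
# drawn by the default SamplerWithoutReplacement.
batch = buffer.sample()
print(batch["X"].shape, batch["y"].shape)
print(buffer.max_outcome_val)  # highest encoded label value in "y"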
