Source code for torchtune.datasets._text_completion
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from typing import Any, Callable, Dict, List, Mapping, Optional, Union

from datasets import load_dataset
from torch.utils.data import Dataset
from torchtune.data._utils import truncate
from torchtune.datasets._packed import PackedDataset
from torchtune.modules.tokenizers import ModelTokenizer


class TextCompletionDataset(Dataset):
"""
Freeform dataset for any unstructured text corpus. Quickly load any dataset
from Hugging Face or local disk and tokenize it for your model.

    Args:
        tokenizer (ModelTokenizer): Tokenizer used by the model that implements the ``encode`` method.
source (str): path to dataset repository on Hugging Face. For local datasets,
define source as the data file type (e.g. "json", "csv", "text") and pass
in the filepath in ``data_files``. See Hugging Face's ``load_dataset``
(https://huggingface.co/docs/datasets/en/package_reference/loading_methods#datasets.load_dataset.path)
for more details.
column (str): name of column in the sample that contains the text data. This is typically required
for Hugging Face datasets or tabular data. For local datasets with a single column
(e.g. unstructured txt files), use the default "text" which is used by Hugging Face datasets
when loaded into memory. Default is "text".
add_eos (bool): Whether to add an EOS token to the end of the sequence. Default is True.
filter_fn (Optional[Callable]): callable used to filter the dataset prior to any pre-processing. See
the Hugging Face `docs <https://huggingface.co/docs/datasets/v2.20.0/process#select-and-filter>`_ for more
details.
**load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to ``load_dataset``. See Hugging
Face's `API ref <https://huggingface.co/docs/datasets/en/package_reference/loading_methods#datasets.load_dataset>`_
for more details.
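
    Example:
        >>> # Illustrative sketch only: ``tokenizer`` and the local file path
        >>> # below are assumptions for this example, not defined in this module.
        >>> ds = TextCompletionDataset(
        ...     tokenizer=tokenizer,
        ...     source="text",
        ...     data_files="my_corpus.txt",
        ...     split="train",
        ... )
        >>> ds[0]["tokens"][:5]  # token ids of the first sample, BOS id first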
"""

    def __init__(
self,
tokenizer: ModelTokenizer,
source: str,
column: str = "text",
add_eos: bool = True,
filter_fn: Optional[Callable] = None,
**load_dataset_kwargs: Dict[str, Any],
) -> None:
self._tokenizer = tokenizer
self._data = load_dataset(source, **load_dataset_kwargs)
self._column = column
self.add_eos = add_eos
if filter_fn is not None:
self._data = self._data.filter(filter_fn)

    def __len__(self):
        return len(self._data)

    def __getitem__(self, index: int) -> Dict[str, List[int]]:
        sample = self._data[index]
        return self._prepare_sample(sample)

    def _prepare_sample(self, sample: Mapping[str, Any]) -> Dict[str, List[int]]:
        prompt = sample[self._column]
        tokens = self._tokenizer.encode(text=prompt, add_bos=True, add_eos=self.add_eos)

        # Truncate if needed, but don't coerce EOS id
        if self._tokenizer.max_seq_len is not None:
            tokens = truncate(tokens, self._tokenizer.max_seq_len - 1)

        # No need to offset labels by 1 - happens in the recipe
        labels = tokens.copy()

        return {"tokens": tokens, "labels": labels}


def text_completion_dataset(
tokenizer: ModelTokenizer,
source: str,
column: str = "text",
add_eos: bool = True,
packed: bool = False,
split_across_pack: bool = True,
split: str = "train",
filter_fn: Optional[Callable] = None,
**load_dataset_kwargs: Dict[str, Any],
) -> Union[TextCompletionDataset, PackedDataset]:
"""
Build a configurable dataset from a freeform, unstructured text corpus similar
    to datasets used in pre-training. Use this builder, rather than
    :class:`~torchtune.datasets.TextCompletionDataset` directly, when configuring
    a custom text dataset from a yaml config, as it is designed to be config-friendly.

    Args:
        tokenizer (ModelTokenizer): Tokenizer used by the model that implements the ``encode`` method.
source (str): path to dataset repository on Hugging Face. For local datasets,
define source as the data file type (e.g. "json", "csv", "text") and pass
in the filepath in ``data_files``. See Hugging Face's ``load_dataset``
(https://huggingface.co/docs/datasets/en/package_reference/loading_methods#datasets.load_dataset.path)
for more details.
column (str): name of column in the sample that contains the text data. This is typically required
for Hugging Face datasets or tabular data. For local datasets with a single column
(e.g. unstructured txt files), use the default "text" which is used by Hugging Face datasets
when loaded into memory. Default is "text".
add_eos (bool): Whether to add an EOS token to the end of the sequence. Default is True.
packed (bool): Whether or not to pack the dataset to ``max_seq_len`` prior to training. Default is False.
        split_across_pack (bool): if the last sample in a pack does not fit in ``max_seq_len``,
            whether to split the sample across this pack and the next (True), or move it
            entirely to the beginning of the next pack (False).
For pre-training, typically this is set to True for general text completion. For
fine-tuning, typically this is set to False to avoid truncating sentences in instruct
tuning. This argument is ignored if ``packed=False``. Default is True.
split (str): ``split`` argument for ``datasets.load_dataset``. You can use this argument to load a subset
of a given split, e.g. ``split="train[:10%]"``. Default is "train".
filter_fn (Optional[Callable]): callable used to filter the dataset prior to any pre-processing. See
the Hugging Face `docs <https://huggingface.co/docs/datasets/v2.20.0/process#select-and-filter>`_ for more
details.
**load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to ``load_dataset``.

    Examples:
>>> from torchtune.datasets import text_completion_dataset
>>> dataset = text_completion_dataset(
... tokenizer=tokenizer,
... source="allenai/c4",
... column="text",
... data_dir="realnewslike",
... packed=False,
... split="train",
... )
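        >>> # A second, hypothetical configuration: drop very short documents
        >>> # before tokenization; ``filter_fn`` receives raw samples from the
        >>> # loaded Hugging Face dataset.
        >>> filtered = text_completion_dataset(
        ...     tokenizer=tokenizer,
        ...     source="allenai/c4",
        ...     column="text",
        ...     data_dir="realnewslike",
        ...     filter_fn=lambda example: len(example["text"]) > 100,
        ...     split="train",
        ... )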

    This can also be accomplished via the yaml config::

        dataset:
            _component_: torchtune.datasets.text_completion_dataset
            source: allenai/c4
            column: text
            data_dir: realnewslike
            packed: False
            split: train

    Returns:
        Union[TextCompletionDataset, PackedDataset]: the configured :class:`~torchtune.datasets.TextCompletionDataset`
        or :class:`~torchtune.datasets.PackedDataset` if ``packed=True``.

    Raises:
        ValueError: If ``packed=True`` and ``tokenizer.max_seq_len`` is not set.
    """
ds = TextCompletionDataset(
tokenizer=tokenizer,
source=source,
column=column,
add_eos=add_eos,
split=split,
filter_fn=filter_fn,
**load_dataset_kwargs,
)

    if packed:
if tokenizer.max_seq_len is None:
raise ValueError(
"PackedDataset requires a max_seq_len to be set on the tokenizer."
)
return PackedDataset(
ds, max_seq_len=tokenizer.max_seq_len, split_across_pack=split_across_pack
)
return ds
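
# Illustrative sketch of enabling packing (the values below are assumptions):
# packing tokenizes and concatenates samples up to ``max_seq_len`` at
# construction time, so the tokenizer must be built with ``max_seq_len`` set,
# otherwise this builder raises ValueError.
#
#   packed_ds = text_completion_dataset(
#       tokenizer=tokenizer,        # e.g. built with max_seq_len=2048
#       source="allenai/c4",
#       column="text",
#       data_dir="realnewslike",
#       packed=True,
#       split_across_pack=True,     # split samples across pack boundaries
#       split="train",
#   )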