# Source code for torchaudio.datasets.librilight_limited

import os
from pathlib import Path
from typing import List, Tuple, Union

import torchaudio
from torch import Tensor
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file
from torchaudio.datasets.librispeech import _get_librispeech_metadata
from torchaudio.datasets.utils import _extract_tar


# Base name shared by the downloaded archive (``<name>.tgz``) and the directory
# under ``root`` where the dataset is expected to be found.
_ARCHIVE_NAME = "librispeech_finetuning"
# Location the archive is downloaded from when ``download=True``.
_URL = "https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz"
# Passed as ``hash_prefix`` to the downloader to validate the archive
# (64 hex characters -- presumably a SHA-256 digest; confirm against the downloader).
_CHECKSUM = "5d1efdc777b548194d7e09ba89126e2188026df9fd57aa57eb14408d2b2342af"
# Maps each public subset name to the folder glob patterns (relative to the
# dataset directory) that are searched for audio files.
_SUBSET_MAP = {"10min": ["1h/0"], "1h": ["1h/*"], "10h": ["1h/*", "9h"]}


def _get_fileids_paths(path: Path, folders: List[str], _ext_audio: str) -> List[Tuple[str, str]]:
    """Collect ``(subset folder, file id)`` pairs for every audio file in ``folders``.

    Audio files are looked up three directory levels below each entry of
    ``folders``; the two innermost levels (the ``speaker_id`` and ``chapter_id``
    directories) are dropped from the returned folder. On disk the layout is:
        {root}/{_ARCHIVE_NAME}/1h/[0-5]/[clean, other] or
        {root}/{_ARCHIVE_NAME}/9h/[clean, other]

    Args:
        path (Path): Root path to the dataset.
        folders (List[str]): Folders (glob patterns are allowed) that contain
            the desired audio files.
        _ext_audio (str): Extension of audio files.

    Returns:
        List[Tuple[str, str]]:
            Sorted list of tuples. The first element is the folder of the audio
            file relative to ``path``, e.g.
            1h/[0-5]/[clean, other] or 9h/[clean, other].
            The second element is the file name without the audio extension.
    """
    root = Path(path)
    entries = []
    for folder in folders:
        pattern = f"{folder}/*/*/*/*{_ext_audio}"
        for audio_file in root.glob(pattern):
            relative = audio_file.relative_to(root)
            # parents[2] climbs past the file's chapter and speaker directories,
            # leaving only the subset folder.
            entries.append((str(relative.parents[2]), str(relative.stem)))
    # Order by the concatenation of folder and file id.
    return sorted(entries, key=lambda entry: entry[0] + entry[1])


class LibriLightLimited(Dataset):
    """Subset of Libri-light :cite:`librilight` dataset,
    which was used in HuBERT :cite:`hsu2021hubert` for supervised fine-tuning.

    Args:
        root (str or Path): Path to the directory where the dataset is found or downloaded.
        subset (str, optional): The subset to use. Options: [``"10min"``, ``"1h"``, ``"10h"``]
            (Default: ``"10min"``).
        download (bool, optional):
            Whether to download the dataset if it is not found at root path. (Default: ``False``).

    Raises:
        ValueError: If ``subset`` is not one of the supported options.
        RuntimeError: If the dataset directory does not exist under ``root``
            and ``download`` is ``False``.
    """

    # Suffix of the transcript files and of the audio files; both are handed to
    # the LibriSpeech metadata helper when a sample is loaded.
    _ext_txt = ".trans.txt"
    _ext_audio = ".flac"

    def __init__(
        self,
        root: Union[str, Path],
        subset: str = "10min",
        download: bool = False,
    ) -> None:
        if subset not in _SUBSET_MAP:
            # List the options explicitly; interpolating ``dict.keys()`` would
            # render as ``dict_keys([...])`` in the message.
            raise ValueError(f"`subset` must be one of {list(_SUBSET_MAP)}. Found: {subset}")
        folders = _SUBSET_MAP[subset]

        root = os.fspath(root)
        self._path = os.path.join(root, _ARCHIVE_NAME)
        archive = os.path.join(root, f"{_ARCHIVE_NAME}.tgz")
        if not os.path.isdir(self._path):
            if not download:
                raise RuntimeError("Dataset not found. Please use `download=True` to download")
            # Reuse an archive that is already on disk instead of downloading it again.
            if not os.path.isfile(archive):
                download_url_to_file(_URL, archive, hash_prefix=_CHECKSUM)
            _extract_tar(archive)
        # Index of (subset folder, file id) pairs; one entry per sample.
        self._fileids_paths = _get_fileids_paths(self._path, folders, self._ext_audio)

    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]:
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            Tuple of the following items;

            Tensor:
                Waveform
            int:
                Sample rate
            str:
                Transcript
            int:
                Speaker ID
            int:
                Chapter ID
            int:
                Utterance ID
        """
        file_path, fileid = self._fileids_paths[n]
        # metadata[0] is the audio path relative to the dataset directory; the
        # remaining entries are returned to the caller unchanged.
        metadata = _get_librispeech_metadata(fileid, self._path, file_path, self._ext_audio, self._ext_txt)
        # The sample rate reported by ``torchaudio.load`` is discarded; the one
        # returned to the caller is taken from ``metadata``.
        waveform, _ = torchaudio.load(os.path.join(self._path, metadata[0]))
        return (waveform,) + metadata[1:]

    def __len__(self) -> int:
        """Return the number of samples in the selected subset."""
        return len(self._fileids_paths)
# Source code for torchaudio.datasets.librilight_limited
#
# Docs
#
# Tutorials
#
# Resources