Note
Click here to download the full example code
Audio Datasets
`torchaudio` provides easy access to common, publicly accessible datasets.
Please refer to the official documentation for the list of available datasets.
# When running this tutorial in Google Colab, install the required packages
# with the following.
# !pip install torchaudio
import torch
import torchaudio
# Print the installed library versions so the output below can be tied to a
# specific torch / torchaudio release.
print(torch.__version__)
print(torchaudio.__version__)
Out:
1.12.0
0.12.0
Preparing data and utility functions (skip this section)
# @title Prepare data and utility functions. {display-mode: "form"}
# @markdown
# @markdown You do not need to look into this cell.
# @markdown Just execute once and you are good to go.
# -------------------------------------------------------------------------------
# Preparation of data and helper functions.
# -------------------------------------------------------------------------------
import multiprocessing
import os
import matplotlib.pyplot as plt
from IPython.display import Audio, display
# Directory (relative to the working directory) that holds sample assets.
_SAMPLE_DIR = "_assets"
# Sub-directory of _SAMPLE_DIR passed to the YESNO dataset as its root.
YESNO_DATASET_PATH = os.path.join(_SAMPLE_DIR, "yes_no")
# Create the directory, including parents; exist_ok=True makes re-running
# this cell a no-op instead of an error.
os.makedirs(YESNO_DATASET_PATH, exist_ok=True)
def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
    """Plot one spectrogram per channel of ``waveform`` in a single figure.

    Args:
        waveform: 2-D tensor; each entry along its first dimension is drawn
            as a separate channel.
        sample_rate: Sampling frequency, passed to ``Axes.specgram`` as ``Fs``.
        title: Figure-level title.
        xlim: Optional x-axis limits applied to every channel's axes; skipped
            when falsy.
    """
    # detach()/cpu() allow tensors that track gradients or live on another
    # device to be converted; a plain CPU tensor is converted as before.
    waveform = waveform.detach().cpu().numpy()

    # Unpacking (rather than indexing shape[0]) rejects non-2-D input early.
    num_channels, _ = waveform.shape

    # squeeze=False always returns a 2-D array of Axes, so the single-channel
    # case needs no special handling.
    figure, axes = plt.subplots(num_channels, 1, squeeze=False)
    for c, (ax, channel) in enumerate(zip(axes[:, 0], waveform)):
        ax.specgram(channel, Fs=sample_rate)
        if num_channels > 1:
            ax.set_ylabel(f"Channel {c+1}")
        if xlim:
            ax.set_xlim(xlim)
    figure.suptitle(title)
    # Non-blocking so the calling script continues after the figure is shown.
    plt.show(block=False)
def play_audio(waveform, sample_rate):
    """Display an IPython audio player for a mono or stereo waveform.

    Args:
        waveform: 2-D tensor whose first dimension is the channel count.
        sample_rate: Playback rate, forwarded to ``Audio`` as ``rate``.

    Raises:
        ValueError: If the waveform is not 2-D, or its channel count is
            neither 1 nor 2.
    """
    # detach()/cpu() allow tensors that track gradients or live on another
    # device to be converted; a plain CPU tensor is converted as before.
    waveform = waveform.detach().cpu().numpy()

    # Unpacking (rather than indexing shape[0]) rejects non-2-D input early.
    num_channels, _ = waveform.shape

    if num_channels == 1:
        display(Audio(waveform[0], rate=sample_rate))
    elif num_channels == 2:
        # Stereo: hand the two channels to Audio as a (left, right) pair.
        display(Audio((waveform[0], waveform[1]), rate=sample_rate))
    else:
        raise ValueError("Waveform with more than 2 channels are not supported.")
Here, we show how to use the `torchaudio.datasets.YESNO()` dataset.
# Build the YESNO dataset rooted at YESNO_DATASET_PATH; download=True asks
# torchaudio to fetch the data (see the progress bar in the output below).
dataset = torchaudio.datasets.YESNO(YESNO_DATASET_PATH, download=True)

# Show the spectrogram and an audio player for a few samples. Each item
# unpacks into (waveform, sample_rate, label).
for i in [1, 3, 5]:
    waveform, sample_rate, label = dataset[i]
    plot_specgram(waveform, sample_rate, title=f"Sample {i}: {label}")
    play_audio(waveform, sample_rate)
Out:
0%| | 0.00/4.49M [00:00<?, ?B/s]
1%| | 32.0k/4.49M [00:00<00:25, 186kB/s]
5%|4 | 208k/4.49M [00:00<00:06, 669kB/s]
19%|#9 | 888k/4.49M [00:00<00:01, 2.14MB/s]
56%|#####6 | 2.52M/4.49M [00:00<00:00, 6.18MB/s]
97%|#########7| 4.36M/4.49M [00:00<00:00, 9.14MB/s]
100%|##########| 4.49M/4.49M [00:00<00:00, 6.15MB/s]
<IPython.lib.display.Audio object>
<IPython.lib.display.Audio object>
<IPython.lib.display.Audio object>
Total running time of the script: ( 0 minutes 2.060 seconds)