# Source code for torchaudio.prototype.transforms._transforms
from typing import Callable, Optional
import torch
from torchaudio.prototype.functional import barkscale_fbanks, chroma_filterbank
from torchaudio.transforms import Spectrogram
class BarkScale(torch.nn.Module):
    r"""Project a linear-frequency STFT onto the Bark scale with triangular filter banks.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    Args:
        n_barks (int, optional): Number of bark filterbanks. (Default: ``128``)
        sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
        f_min (float, optional): Minimum frequency. (Default: ``0.``)
        f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
        n_stft (int, optional): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`. (Default: ``201``)
        bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)

    Raises:
        ValueError: If ``f_min`` is greater than the resolved ``f_max``.

    Example
        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
        >>> spectrogram_transform = transforms.Spectrogram(n_fft=1024)
        >>> spectrogram = spectrogram_transform(waveform)
        >>> barkscale_transform = transforms.BarkScale(sample_rate=sample_rate, n_stft=1024 // 2 + 1)
        >>> barkscale_spectrogram = barkscale_transform(spectrogram)

    See also:
        :py:func:`torchaudio.prototype.functional.barkscale_fbanks` - The function used to
        generate the filter banks.
    """

    __constants__ = ["n_barks", "sample_rate", "f_min", "f_max"]

    def __init__(
        self,
        n_barks: int = 128,
        sample_rate: int = 16000,
        f_min: float = 0.0,
        f_max: Optional[float] = None,
        n_stft: int = 201,
        bark_scale: str = "traunmuller",
    ) -> None:
        super().__init__()
        self.n_barks = n_barks
        self.sample_rate = sample_rate
        # An unspecified upper bound falls back to the Nyquist frequency.
        self.f_max = float(sample_rate // 2) if f_max is None else f_max
        self.f_min = f_min
        self.bark_scale = bark_scale

        if f_min > self.f_max:
            raise ValueError("Require f_min: {} <= f_max: {}".format(f_min, self.f_max))

        # The filter bank is a buffer so it follows the module across devices
        # and is saved in the state dict without being trained.
        filter_bank = barkscale_fbanks(
            n_stft, self.f_min, self.f_max, self.n_barks, self.sample_rate, self.bark_scale
        )
        self.register_buffer("fb", filter_bank)

    def forward(self, specgram: torch.Tensor) -> torch.Tensor:
        r"""Apply the bark filter bank to a spectrogram.

        Args:
            specgram (torch.Tensor): A spectrogram STFT of dimension (..., freq, time).

        Returns:
            torch.Tensor: Bark frequency spectrogram of size (..., ``n_barks``, time).
        """
        # (..., time, freq) dot (freq, n_barks) -> (..., time, n_barks)
        time_major = specgram.transpose(-1, -2)
        projected = torch.matmul(time_major, self.fb)
        # Swap back so the bark axis precedes the time axis.
        return projected.transpose(-1, -2)
class InverseBarkScale(torch.nn.Module):
    r"""Estimate a STFT in normal frequency domain from bark frequency domain.

    .. devices:: CPU CUDA

    It minimizes the Euclidean norm between the input bark-spectrogram and the product between
    the estimated spectrogram and the filter banks using SGD.

    The optimisation always runs with autograd enabled and on a detached copy of the
    input, so the transform can be called inside ``torch.no_grad()`` and never
    back-propagates into the caller's graph. The returned tensor does not require grad.

    Args:
        n_stft (int): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`.
        n_barks (int, optional): Number of bark filterbanks. (Default: ``128``)
        sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
        f_min (float, optional): Minimum frequency. (Default: ``0.``)
        f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
        max_iter (int, optional): Maximum number of optimization iterations. (Default: ``100000``)
        tolerance_loss (float, optional): Value of loss to stop optimization at. (Default: ``1e-5``)
        tolerance_change (float, optional): Difference in losses to stop optimization at. (Default: ``1e-8``)
        sgdargs (dict or None, optional): Arguments for the SGD optimizer.
            (Default: ``None``, which uses ``{"lr": 0.1, "momentum": 0.9}``)
        bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)

    Raises:
        ValueError: If ``f_min`` is greater than the resolved ``f_max``.

    Example
        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
        >>> bark_spectrogram_transform = transforms.BarkSpectrogram(sample_rate, n_fft=1024)
        >>> bark_spectrogram = bark_spectrogram_transform(waveform)
        >>> inverse_barkscale_transform = transforms.InverseBarkScale(n_stft=1024 // 2 + 1)
        >>> spectrogram = inverse_barkscale_transform(bark_spectrogram)
    """

    __constants__ = [
        "n_stft",
        "n_barks",
        "sample_rate",
        "f_min",
        "f_max",
        "max_iter",
        "tolerance_loss",
        "tolerance_change",
        "sgdargs",
    ]

    def __init__(
        self,
        n_stft: int,
        n_barks: int = 128,
        sample_rate: int = 16000,
        f_min: float = 0.0,
        f_max: Optional[float] = None,
        max_iter: int = 100000,
        tolerance_loss: float = 1e-5,
        tolerance_change: float = 1e-8,
        sgdargs: Optional[dict] = None,
        bark_scale: str = "traunmuller",
    ) -> None:
        super().__init__()
        # Stored because ``__constants__`` names it.
        self.n_stft = n_stft
        self.n_barks = n_barks
        self.sample_rate = sample_rate
        # Only ``None`` means "unset"; an explicit 0.0 must not be silently
        # replaced by the Nyquist frequency (matches BarkScale).
        self.f_max = f_max if f_max is not None else float(sample_rate // 2)
        self.f_min = f_min
        self.max_iter = max_iter
        self.tolerance_loss = tolerance_loss
        self.tolerance_change = tolerance_change
        self.sgdargs = sgdargs or {"lr": 0.1, "momentum": 0.9}
        self.bark_scale = bark_scale

        if f_min > self.f_max:
            raise ValueError("Require f_min: {} <= f_max: {}".format(f_min, self.f_max))

        fb = barkscale_fbanks(n_stft, self.f_min, self.f_max, self.n_barks, self.sample_rate, bark_scale)
        self.register_buffer("fb", fb)

    def forward(self, barkspec: torch.Tensor) -> torch.Tensor:
        r"""Recover a linear-frequency spectrogram from a bark spectrogram.

        Args:
            barkspec (torch.Tensor): A Bark frequency spectrogram of dimension (..., ``n_barks``, time)

        Returns:
            torch.Tensor: Linear scale spectrogram of size (..., freq, time)

        Raises:
            ValueError: If the bark axis of ``barkspec`` does not have ``n_barks`` bins.
        """
        shape = barkspec.size()
        n_barks, time = shape[-2], shape[-1]
        # Validate before doing any reshaping work.
        if self.n_barks != n_barks:
            raise ValueError("Expected an input with {} bark bins. Found: {}".format(self.n_barks, n_barks))
        freq, _ = self.fb.size()  # (freq, n_barks)

        # Pack batch. ``reshape`` (rather than ``view``) also accepts
        # non-contiguous inputs; ``detach`` keeps the inner optimisation from
        # back-propagating into whatever graph produced the input.
        target = barkspec.detach().reshape(-1, n_barks, time).transpose(-1, -2)

        # The solver needs gradients even when the caller disabled them.
        with torch.enable_grad():
            specgram = torch.rand(
                target.size()[0], time, freq, requires_grad=True, dtype=target.dtype, device=target.device
            )
            optim = torch.optim.SGD([specgram], **self.sgdargs)

            loss = float("inf")
            for _ in range(self.max_iter):
                optim.zero_grad()
                diff = target - specgram.matmul(self.fb)
                # take sum over bark-frequency then average over other dimensions
                # so that loss threshold is applied per unit timeframe
                step_loss = diff.pow(2).sum(dim=-1).mean()
                step_loss.backward()
                optim.step()
                # Project back onto the non-negative orthant after each step.
                with torch.no_grad():
                    specgram.clamp_(min=0)

                new_loss = step_loss.item()
                if new_loss < self.tolerance_loss or abs(loss - new_loss) < self.tolerance_change:
                    break
                loss = new_loss

        specgram.requires_grad_(False)
        specgram = specgram.clamp(min=0).transpose(-1, -2)

        # unpack batch
        return specgram.reshape(shape[:-2] + (freq, time))
class BarkSpectrogram(torch.nn.Module):
    r"""Compute a bark-scale spectrogram directly from a raw audio signal.

    .. devices:: CPU CUDA

    .. properties:: Autograd TorchScript

    This chains :py:func:`torchaudio.transforms.Spectrogram` with
    :py:func:`torchaudio.prototype.transforms.BarkScale`.

    Sources
        * https://www.fon.hum.uva.nl/praat/manual/BarkSpectrogram.html
        * Traunmüller, Hartmut. "Analytical Expressions for the Tonotopic Sensory Scale." Journal of the Acoustical
          Society of America. Vol. 88, Issue 1, 1990, pp. 97–100.
        * https://ccrma.stanford.edu/courses/120-fall-2003/lecture-5.html

    Args:
        sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
        n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
        win_length (int or None, optional): Window size. (Default: ``n_fft``)
        hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
        f_min (float, optional): Minimum frequency. (Default: ``0.``)
        f_max (float or None, optional): Maximum frequency. (Default: ``None``)
        pad (int, optional): Two sided padding of signal. (Default: ``0``)
        n_barks (int, optional): Number of bark filterbanks. (Default: ``128``)
        window_fn (Callable[..., torch.Tensor], optional): A function to create a window tensor
            that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
        power (float, optional): Exponent for the magnitude spectrogram,
            (must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: ``2``)
        normalized (bool, optional): Whether to normalize by magnitude after stft. (Default: ``False``)
        wkwargs (Dict[..., ...] or None, optional): Arguments for window function. (Default: ``None``)
        center (bool, optional): whether to pad :attr:`waveform` on both sides so
            that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`.
            (Default: ``True``)
        pad_mode (string, optional): controls the padding method used when
            :attr:`center` is ``True``. (Default: ``"reflect"``)
        bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)

    Example
        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
        >>> transform = transforms.BarkSpectrogram(sample_rate)
        >>> bark_specgram = transform(waveform)  # (channel, n_barks, time)

    See also:
        :py:func:`torchaudio.prototype.functional.barkscale_fbanks` - The function used to
        generate the filter banks.
    """

    __constants__ = ["sample_rate", "n_fft", "win_length", "hop_length", "pad", "n_barks", "f_min"]

    def __init__(
        self,
        sample_rate: int = 16000,
        n_fft: int = 400,
        win_length: Optional[int] = None,
        hop_length: Optional[int] = None,
        f_min: float = 0.0,
        f_max: Optional[float] = None,
        pad: int = 0,
        n_barks: int = 128,
        window_fn: Callable[..., torch.Tensor] = torch.hann_window,
        power: float = 2.0,
        normalized: bool = False,
        wkwargs: Optional[dict] = None,
        center: bool = True,
        pad_mode: str = "reflect",
        bark_scale: str = "traunmuller",
    ) -> None:
        super().__init__()
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        # Window defaults to the FFT size; hop defaults to half the window.
        self.win_length = n_fft if win_length is None else win_length
        self.hop_length = self.win_length // 2 if hop_length is None else hop_length
        self.pad = pad
        self.power = power
        self.normalized = normalized
        self.n_barks = n_barks  # number of bark frequency bins
        self.f_max = f_max
        self.f_min = f_min

        # Stage 1: one-sided linear-frequency spectrogram.
        self.spectrogram = Spectrogram(
            n_fft=self.n_fft,
            win_length=self.win_length,
            hop_length=self.hop_length,
            pad=self.pad,
            window_fn=window_fn,
            power=self.power,
            normalized=self.normalized,
            wkwargs=wkwargs,
            center=center,
            pad_mode=pad_mode,
            onesided=True,
        )
        # Stage 2: bark projection sized for the ``n_fft // 2 + 1`` one-sided bins.
        self.bark_scale = BarkScale(
            n_barks=self.n_barks,
            sample_rate=self.sample_rate,
            f_min=self.f_min,
            f_max=self.f_max,
            n_stft=self.n_fft // 2 + 1,
            bark_scale=bark_scale,
        )

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        r"""Transform a waveform into a bark spectrogram.

        Args:
            waveform (torch.Tensor): torch.Tensor of audio of dimension (..., time).

        Returns:
            torch.Tensor: Bark frequency spectrogram of size (..., ``n_barks``, time).
        """
        return self.bark_scale(self.spectrogram(waveform))
class ChromaScale(torch.nn.Module):
    r"""Map a spectrogram onto chroma bins, producing a chromagram.

    .. devices:: CPU CUDA

    .. properties:: Autograd

    Args:
        sample_rate (int): Sample rate of audio signal.
        n_freqs (int): Number of frequency bins in STFT. See ``n_fft`` in :class:`Spectrogram`.
        n_chroma (int, optional): Number of chroma. (Default: ``12``)
        tuning (float, optional): Tuning deviation from A440 in fractions of a chroma bin. (Default: 0.0)
        ctroct (float, optional): Center of Gaussian dominance window to weight filters by, in octaves. (Default: 5.0)
        octwidth (float or None, optional): Width of Gaussian dominance window to weight filters by, in octaves.
            If ``None``, then disable weighting altogether. (Default: 2.0)
        norm (int, optional): order of norm to normalize filter bank by. (Default: 2)
        base_c (bool, optional): If True, then start filter bank at C. Otherwise, start at A. (Default: True)

    Example
        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
        >>> spectrogram_transform = transforms.Spectrogram(n_fft=1024)
        >>> spectrogram = spectrogram_transform(waveform)
        >>> chroma_transform = transforms.ChromaScale(sample_rate=sample_rate, n_freqs=1024 // 2 + 1)
        >>> chroma_spectrogram = chroma_transform(spectrogram)

    See also:
        :py:func:`torchaudio.prototype.functional.chroma_filterbank` — function used to
        generate the filter bank.
    """

    def __init__(
        self,
        sample_rate: int,
        n_freqs: int,
        *,
        n_chroma: int = 12,
        tuning: float = 0.0,
        ctroct: float = 5.0,
        octwidth: Optional[float] = 2.0,
        norm: int = 2,
        base_c: bool = True,
    ):
        super().__init__()
        filter_bank = chroma_filterbank(
            sample_rate,
            n_freqs,
            n_chroma,
            tuning=tuning,
            ctroct=ctroct,
            octwidth=octwidth,
            norm=norm,
            base_c=base_c,
        )
        # Registered as a buffer so it moves with the module and is not trained.
        self.register_buffer("fb", filter_bank)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        r"""Apply the chroma filter bank to a spectrogram.

        Args:
            x (torch.Tensor): Spectrogram of dimension (..., ``n_freqs``, time).

        Returns:
            torch.Tensor: Chroma spectrogram of size (..., ``n_chroma``, time).
        """
        # Put frequency last for the matmul, then restore (..., bins, time).
        time_major = x.transpose(-1, -2)
        chroma = torch.matmul(time_major, self.fb)
        return chroma.transpose(-1, -2)
class ChromaSpectrogram(torch.nn.Module):
    r"""Compute a chromagram directly from an audio signal.

    .. devices:: CPU CUDA

    .. properties:: Autograd

    Composes :py:func:`torchaudio.transforms.Spectrogram` and
    :py:func:`torchaudio.prototype.transforms.ChromaScale`.

    Args:
        sample_rate (int): Sample rate of audio signal.
        n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins.
        win_length (int or None, optional): Window size. (Default: ``n_fft``)
        hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
        pad (int, optional): Two sided padding of signal. (Default: ``0``)
        window_fn (Callable[..., torch.Tensor], optional): A function to create a window tensor
            that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``)
        power (float, optional): Exponent for the magnitude spectrogram,
            (must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: ``2``)
        normalized (bool, optional): Whether to normalize by magnitude after stft. (Default: ``False``)
        wkwargs (Dict[..., ...] or None, optional): Arguments for window function. (Default: ``None``)
        center (bool, optional): whether to pad :attr:`waveform` on both sides so
            that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`.
            (Default: ``True``)
        pad_mode (string, optional): controls the padding method used when
            :attr:`center` is ``True``. (Default: ``"reflect"``)
        n_chroma (int, optional): Number of chroma. (Default: ``12``)
        tuning (float, optional): Tuning deviation from A440 in fractions of a chroma bin. (Default: 0.0)
        ctroct (float, optional): Center of Gaussian dominance window to weight filters by, in octaves. (Default: 5.0)
        octwidth (float or None, optional): Width of Gaussian dominance window to weight filters by, in octaves.
            If ``None``, then disable weighting altogether. (Default: 2.0)
        norm (int, optional): order of norm to normalize filter bank by. (Default: 2)
        base_c (bool, optional): If True, then start filter bank at C. Otherwise, start at A. (Default: True)

    Example
        >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True)
        >>> transform = transforms.ChromaSpectrogram(sample_rate=sample_rate, n_fft=400)
        >>> chromagram = transform(waveform)  # (channel, n_chroma, time)
    """

    def __init__(
        self,
        sample_rate: int,
        n_fft: int,
        *,
        win_length: Optional[int] = None,
        hop_length: Optional[int] = None,
        pad: int = 0,
        window_fn: Callable[..., torch.Tensor] = torch.hann_window,
        power: float = 2.0,
        normalized: bool = False,
        wkwargs: Optional[dict] = None,
        center: bool = True,
        pad_mode: str = "reflect",
        n_chroma: int = 12,
        tuning: float = 0.0,
        ctroct: float = 5.0,
        octwidth: Optional[float] = 2.0,
        norm: int = 2,
        base_c: bool = True,
    ):
        super().__init__()
        # Stage 1: one-sided linear-frequency spectrogram.
        self.spectrogram = Spectrogram(
            n_fft=n_fft,
            win_length=win_length,
            hop_length=hop_length,
            pad=pad,
            window_fn=window_fn,
            power=power,
            normalized=normalized,
            wkwargs=wkwargs,
            center=center,
            pad_mode=pad_mode,
            onesided=True,
        )
        # Stage 2: chroma projection sized for the ``n_fft // 2 + 1`` one-sided bins.
        one_sided_bins = n_fft // 2 + 1
        self.chroma_scale = ChromaScale(
            sample_rate,
            one_sided_bins,
            n_chroma=n_chroma,
            tuning=tuning,
            ctroct=ctroct,
            octwidth=octwidth,
            norm=norm,
            base_c=base_c,
        )

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        r"""Transform a waveform into a chromagram.

        Args:
            waveform (Tensor): Tensor of audio of dimension (..., time).

        Returns:
            Tensor: Chromagram of size (..., ``n_chroma``, time).
        """
        return self.chroma_scale(self.spectrogram(waveform))