Source code for torchaudio.prototype.pipelines.hifigan_pipeline
from dataclasses import dataclass
from typing import Any, Dict, Optional
import torch
import torch.nn.functional as F
from torch.nn import Module
from torchaudio._internal import load_state_dict_from_url
from torchaudio.prototype.models.hifi_gan import hifigan_vocoder, HiFiGANVocoder
from torchaudio.transforms import MelSpectrogram
@dataclass
class HiFiGANVocoderBundle:
"""Data class that bundles associated information to use pretrained
:py:class:`~torchaudio.prototype.models.HiFiGANVocoder`.
This class provides interfaces for instantiating the pretrained model along with
the information necessary to retrieve pretrained weights and additional data
to be used with the model.
Torchaudio library instantiates objects of this class, each of which represents
a different pretrained model. Client code should access pretrained models via these
instances.
    This bundle can convert mel spectrograms to waveforms and vice versa. A typical use case would be a flow like
    `text -> mel spectrogram -> waveform`, where one can use an external component, e.g. Tacotron2,
    to generate the mel spectrogram from text. Please see below for the code examples.
Example: Transform synthetic mel spectrogram to audio.
>>> import torch
>>> import torchaudio
    >>> # Since the HiFiGAN bundle is a prototype, it needs to be imported explicitly
>>> from torchaudio.prototype.pipelines import HIFIGAN_VOCODER_V3_LJSPEECH as bundle
>>>
>>> # Load the HiFiGAN bundle
>>> vocoder = bundle.get_vocoder()
Downloading: "https://download.pytorch.org/torchaudio/models/hifigan_vocoder_v3_ljspeech.pth"
100%|████████████| 5.59M/5.59M [00:00<00:00, 18.7MB/s]
>>>
>>> # Generate synthetic mel spectrogram
>>> specgram = torch.sin(0.5 * torch.arange(start=0, end=100)).expand(bundle._vocoder_params["in_channels"], 100)
>>>
>>> # Transform mel spectrogram into audio
>>> waveform = vocoder(specgram)
>>> torchaudio.save('sample.wav', waveform, bundle.sample_rate)
Example: Usage together with Tacotron2, text to audio.
>>> import torch
>>> import torchaudio
    >>> # Since the HiFiGAN bundle is a prototype, it needs to be imported explicitly
>>> from torchaudio.prototype.pipelines import HIFIGAN_VOCODER_V3_LJSPEECH as bundle_hifigan
>>>
>>> # Load Tacotron2 bundle
    >>> bundle_tacotron2 = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH
    >>> processor = bundle_tacotron2.get_text_processor()
    >>> tacotron2 = bundle_tacotron2.get_tacotron2()
>>>
>>> # Use Tacotron2 to convert text to mel spectrogram
>>> text = "A quick brown fox jumped over a lazy dog"
>>> input, lengths = processor(text)
>>> specgram, lengths, _ = tacotron2.infer(input, lengths)
>>>
>>> # Load HiFiGAN bundle
>>> vocoder = bundle_hifigan.get_vocoder()
Downloading: "https://download.pytorch.org/torchaudio/models/hifigan_vocoder_v3_ljspeech.pth"
100%|████████████| 5.59M/5.59M [00:03<00:00, 1.55MB/s]
>>>
>>> # Use HiFiGAN to convert mel spectrogram to audio
>>> waveform = vocoder(specgram).squeeze(0)
>>> torchaudio.save('sample.wav', waveform, bundle_hifigan.sample_rate)
""" # noqa: E501
_path: str
_vocoder_params: Dict[str, Any] # Vocoder parameters
_mel_params: Dict[str, Any] # Mel transformation parameters
_sample_rate: float
def _get_state_dict(self, dl_kwargs):
url = f"https://download.pytorch.org/torchaudio/models/{self._path}"
dl_kwargs = {} if dl_kwargs is None else dl_kwargs
state_dict = load_state_dict_from_url(url, **dl_kwargs)
return state_dict
    def get_vocoder(self, *, dl_kwargs=None) -> HiFiGANVocoder:
"""Construct the HiFiGAN Generator model, which can be used a vocoder, and load the pretrained weight.
The weight file is downloaded from the internet and cached with
:func:`torch.hub.load_state_dict_from_url`
Args:
dl_kwargs (dictionary of keyword arguments): Passed to :func:`torch.hub.load_state_dict_from_url`.
Returns:
Variation of :py:class:`~torchaudio.prototype.models.HiFiGANVocoder`.
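
        Example: generate a waveform from a random mel spectrogram (a minimal sketch; the
        spectrogram is random and only illustrates the expected input shape).

        >>> vocoder = bundle.get_vocoder()
        >>> # ``in_channels`` mel bins, 100 time frames
        >>> specgram = torch.rand(bundle._vocoder_params["in_channels"], 100)
        >>> waveform = vocoder(specgram)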
"""
model = hifigan_vocoder(**self._vocoder_params)
model.load_state_dict(self._get_state_dict(dl_kwargs))
model.eval()
return model
    def get_mel_transform(self) -> Module:
"""Construct an object which transforms waveforms into mel spectrograms."""
return _HiFiGANMelSpectrogram(
n_mels=self._vocoder_params["in_channels"],
sample_rate=self._sample_rate,
**self._mel_params,
)
@property
def sample_rate(self):
"""Sample rate of the audio that the model is trained on.
:type: float
"""
return self._sample_rate
class _HiFiGANMelSpectrogram(torch.nn.Module):
"""
Generate mel spectrogram in a way equivalent to the original HiFiGAN implementation:
https://github.com/jik876/hifi-gan/blob/4769534d45265d52a904b850da5a622601885777/meldataset.py#L49-L72
    This class wraps around :py:class:`torchaudio.transforms.MelSpectrogram`, but performs extra steps to achieve
    equivalence with the HiFiGAN implementation: the input waveform is reflection-padded, the magnitude
    spectrogram is derived from the power spectrogram, and the mel bins are log-compressed.
Args:
hop_size (int): Length of hop between STFT windows.
n_fft (int): Size of FFT, creates ``n_fft // 2 + 1`` bins.
win_length (int): Window size.
f_min (float or None): Minimum frequency.
f_max (float or None): Maximum frequency.
sample_rate (int): Sample rate of audio signal.
n_mels (int): Number of mel filterbanks.
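
    Example: this class is internal and is normally obtained via
    :py:meth:`HiFiGANVocoderBundle.get_mel_transform` (a minimal sketch; the waveform is random).

    >>> import torch
    >>> from torchaudio.prototype.pipelines import HIFIGAN_VOCODER_V3_LJSPEECH
    >>> mel_transform = HIFIGAN_VOCODER_V3_LJSPEECH.get_mel_transform()
    >>> waveform = torch.rand(1, 22050)  # one second of audio at 22050 Hz
    >>> specgram = mel_transform(waveform)  # (1, 80, n_frames)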
"""
def __init__(
self,
hop_size: int,
n_fft: int,
win_length: int,
f_min: Optional[float],
f_max: Optional[float],
sample_rate: float,
n_mels: int,
):
        super().__init__()
self.mel_transform = MelSpectrogram(
sample_rate=sample_rate,
n_fft=n_fft,
win_length=win_length,
hop_length=hop_size,
f_min=f_min,
f_max=f_max,
n_mels=n_mels,
normalized=False,
pad=0,
mel_scale="slaney",
norm="slaney",
center=False,
)
self.sample_rate = sample_rate
self.hop_size = hop_size
self.n_fft = n_fft
self.win_length = win_length
self.f_min = f_min
self.f_max = f_max
self.n_mels = n_mels
        # With ``center=False`` in the STFT, this reflection padding reproduces the framing
        # of the original HiFiGAN implementation
        self.pad_size = int((n_fft - hop_size) / 2)
def forward(self, waveform: torch.Tensor) -> torch.Tensor:
"""Generate mel spectrogram from a waveform. Should have same sample rate as ``self.sample_rate``.
Args:
waveform (Tensor): waveform of shape ``(batch_size, time_length)``.
Returns:
            Tensor: mel spectrogram of shape ``(batch_size, n_mels, n_frames)``.
"""
        # Reflection-pad the waveform to match the framing of the original implementation,
        # which pads manually and calls ``torch.stft`` with ``center=False``
        ref_waveform = F.pad(waveform.unsqueeze(1), (self.pad_size, self.pad_size), mode="reflect")
        ref_waveform = ref_waveform.squeeze(1)
        # The wrapped transform computes a power spectrogram, so take the square root to get
        # the magnitude; 1e-9 is the epsilon used by the original implementation
        spectr = (self.mel_transform.spectrogram(ref_waveform) + 1e-9) ** 0.5
        mel_spectrogram = self.mel_transform.mel_scale(spectr)
        # Log-compress the mel bins, clamping to avoid taking the logarithm of zero
        mel_spectrogram = torch.log(torch.clamp(mel_spectrogram, min=1e-5))
return mel_spectrogram
HIFIGAN_VOCODER_V3_LJSPEECH = HiFiGANVocoderBundle(
"hifigan_vocoder_v3_ljspeech.pth",
_vocoder_params={
"upsample_rates": (8, 8, 4),
"upsample_kernel_sizes": (16, 16, 8),
"upsample_initial_channel": 256,
"resblock_kernel_sizes": (3, 5, 7),
"resblock_dilation_sizes": ((1, 2), (2, 6), (3, 12)),
"resblock_type": 2,
"in_channels": 80,
"lrelu_slope": 0.1,
},
_mel_params={
"hop_size": 256,
"n_fft": 1024,
"win_length": 1024,
"f_min": 0,
"f_max": 8000,
},
_sample_rate=22050,
)
HIFIGAN_VOCODER_V3_LJSPEECH.__doc__ = """HiFiGAN Vocoder pipeline, trained on *The LJ Speech Dataset*
:cite:`ljspeech17`.
This pipeline can be used with an external component which generates mel spectrograms from text, for example,
Tacotron2 - see examples in :py:class:`HiFiGANVocoderBundle`.
Although this works with the existing Tacotron2 bundles, for the best results one needs to retrain Tacotron2
using the same data preprocessing pipeline which was used for training HiFiGAN. In particular, the original
HiFiGAN implementation uses a custom method of generating mel spectrograms from waveforms, different from
:py:class:`torchaudio.transforms.MelSpectrogram`. We reimplemented this transform as
:py:meth:`HiFiGANVocoderBundle.get_mel_transform`, making sure it is equivalent to the original HiFiGAN code `here
<https://github.com/jik876/hifi-gan/blob/4769534d45265d52a904b850da5a622601885777/meldataset.py#L49-L72>`_.
The underlying vocoder is constructed by
:py:func:`torchaudio.prototype.models.hifigan_vocoder`. The weights are converted from the ones published
with the original paper :cite:`NEURIPS2020_c5d73680` under `MIT License
<https://github.com/jik876/hifi-gan/blob/4769534d45265d52a904b850da5a622601885777/LICENSE>`__. See links to
pre-trained models on `GitHub <https://github.com/jik876/hifi-gan#pretrained-model>`__.
Please refer to :py:class:`HiFiGANVocoderBundle` for usage instructions.
"""