• Docs >
  • Module code >
  • torchaudio.prototype.pipelines.hifigan_pipeline >
  • Nightly (unstable)

Source code for torchaudio.prototype.pipelines.hifigan_pipeline

from dataclasses import dataclass
from typing import Any, Dict, Optional

import torch
import torch.nn.functional as F
from torch.nn import Module
from torchaudio._internal import load_state_dict_from_url

from torchaudio.prototype.models.hifi_gan import hifigan_vocoder, HiFiGANVocoder
from torchaudio.transforms import MelSpectrogram

@dataclass
class HiFiGANVocoderBundle:
    """Bundle of the information needed to use a pretrained
    :py:class:`~torchaudio.prototype.models.HiFiGANVocoder`.

    An instance carries the name of the pretrained weight file, the parameters used to
    construct the vocoder, the parameters of the matching mel spectrogram transform and
    the sample rate. It exposes methods that build the pretrained model and the transform
    from that information.

    Torchaudio library instantiates objects of this class, each of which represents
    a different pretrained model. Client code should access pretrained models via these
    instances.

    This bundle can convert mel spectrograms to waveforms (:py:meth:`get_vocoder`) and
    waveforms to mel spectrograms (:py:meth:`get_mel_transform`). A typical use case is a flow like
    `text -> mel spectrogram -> waveform`, where an external component, e.g. Tacotron2,
    generates the mel spectrogram from text. Please see below for the code examples.

    Example: Transform synthetic mel spectrogram to audio.
        >>> import torch
        >>> import torchaudio
        >>> # Since HiFiGAN bundle is in prototypes, it needs to be imported explicitly
        >>> from torchaudio.prototype.pipelines import HIFIGAN_VOCODER_V3_LJSPEECH as bundle
        >>>
        >>> # Load the HiFiGAN bundle
        >>> vocoder = bundle.get_vocoder()
        Downloading: "https://download.pytorch.org/torchaudio/models/hifigan_vocoder_v3_ljspeech.pth"
        100%|████████████| 5.59M/5.59M [00:00<00:00, 18.7MB/s]
        >>>
        >>> # Generate synthetic mel spectrogram
        >>> specgram = torch.sin(0.5 * torch.arange(start=0, end=100)).expand(bundle._vocoder_params["in_channels"], 100)
        >>>
        >>> # Transform mel spectrogram into audio
        >>> waveform = vocoder(specgram)
        >>> torchaudio.save('sample.wav', waveform, bundle.sample_rate)

    Example: Usage together with Tacotron2, text to audio.
        >>> import torch
        >>> import torchaudio
        >>> # Since HiFiGAN bundle is in prototypes, it needs to be imported explicitly
        >>> from torchaudio.prototype.pipelines import HIFIGAN_VOCODER_V3_LJSPEECH as bundle_hifigan
        >>>
        >>> # Load Tacotron2 bundle
        >>> bundle_tacotron2 = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH
        >>> processor = bundle_tacotron2.get_text_processor()
        >>> tacotron2 = bundle_tacotron2.get_tacotron2()
        >>>
        >>> # Use Tacotron2 to convert text to mel spectrogram
        >>> text = "A quick brown fox jumped over a lazy dog"
        >>> input, lengths = processor(text)
        >>> specgram, lengths, _ = tacotron2.infer(input, lengths)
        >>>
        >>> # Load HiFiGAN bundle
        >>> vocoder = bundle_hifigan.get_vocoder()
        Downloading: "https://download.pytorch.org/torchaudio/models/hifigan_vocoder_v3_ljspeech.pth"
        100%|████████████| 5.59M/5.59M [00:03<00:00, 1.55MB/s]
        >>>
        >>> # Use HiFiGAN to convert mel spectrogram to audio
        >>> waveform = vocoder(specgram).squeeze(0)
        >>> torchaudio.save('sample.wav', waveform, bundle_hifigan.sample_rate)
    """  # noqa: E501

    _path: str
    _vocoder_params: Dict[str, Any]  # Vocoder parameters
    _mel_params: Dict[str, Any]  # Mel transformation parameters
    _sample_rate: float

    def _get_state_dict(self, dl_kwargs):
        """Fetch the pretrained weights that belong to this bundle.

        Args:
            dl_kwargs (dict or None): Keyword arguments forwarded to
                ``load_state_dict_from_url``. ``None`` means "no extra arguments".

        Returns:
            Whatever ``load_state_dict_from_url`` returns for this bundle's weight URL.
        """
        extra_kwargs = dl_kwargs if dl_kwargs is not None else {}
        url = f"https://download.pytorch.org/torchaudio/models/{self._path}"
        return load_state_dict_from_url(url, **extra_kwargs)

    def get_vocoder(self, *, dl_kwargs=None) -> HiFiGANVocoder:
        """Construct the HiFiGAN Generator model, which can be used as a vocoder, and load the pretrained weight.

        The weight file is downloaded from the internet and cached with
        :func:`torch.hub.load_state_dict_from_url`

        Args:
            dl_kwargs (dictionary of keyword arguments): Passed to :func:`torch.hub.load_state_dict_from_url`.

        Returns:
            Variation of :py:class:`~torchaudio.prototype.models.HiFiGANVocoder`, switched to
            evaluation mode.
        """
        # Build the architecture first; the weights are only fetched afterwards.
        vocoder = hifigan_vocoder(**self._vocoder_params)
        pretrained_weights = self._get_state_dict(dl_kwargs)
        vocoder.load_state_dict(pretrained_weights)
        vocoder.eval()
        return vocoder

    def get_mel_transform(self) -> Module:
        """Construct an object which transforms waveforms into mel spectrograms.

        The number of mel bins is taken from the vocoder's ``in_channels`` parameter and the
        sample rate from the bundle; the remaining arguments come from the bundle's mel parameters.
        """
        mel_bins = self._vocoder_params["in_channels"]
        rate = self._sample_rate
        return _HiFiGANMelSpectrogram(n_mels=mel_bins, sample_rate=rate, **self._mel_params)

    @property
    def sample_rate(self):
        """Sample rate of the audio that the model is trained on.

        :type: float
        """
        return self._sample_rate
class _HiFiGANMelSpectrogram(torch.nn.Module):
    """
    Generate mel spectrogram in a way equivalent to the original HiFiGAN implementation:
    https://github.com/jik876/hifi-gan/blob/4769534d45265d52a904b850da5a622601885777/meldataset.py#L49-L72

    This class wraps around :py:class:`torchaudio.transforms.MelSpectrogram`, but performs extra steps to achieve
    equivalence with the HiFiGAN implementation.

    Args:
        hop_size (int): Length of hop between STFT windows.
        n_fft (int): Size of FFT, creates ``n_fft // 2 + 1`` bins.
        win_length (int): Window size.
        f_min (float or None): Minimum frequency. ``None`` is treated as ``0.0``.
        f_max (float or None): Maximum frequency.
        sample_rate (float): Sample rate of audio signal.
        n_mels (int): Number of mel filterbanks.
    """

    def __init__(
        self,
        hop_size: int,
        n_fft: int,
        win_length: int,
        f_min: Optional[float],
        f_max: Optional[float],
        sample_rate: float,
        n_mels: int,
    ):
        super().__init__()
        # This signature advertises ``f_min`` as optional, but
        # ``torchaudio.transforms.MelSpectrogram`` takes a plain float lower bound
        # (default ``0.0``), so resolve ``None`` here instead of forwarding it.
        if f_min is None:
            f_min = 0.0
        self.mel_transform = MelSpectrogram(
            sample_rate=sample_rate,
            n_fft=n_fft,
            win_length=win_length,
            hop_length=hop_size,
            f_min=f_min,
            f_max=f_max,
            n_mels=n_mels,
            normalized=False,
            pad=0,
            mel_scale="slaney",
            norm="slaney",
            # Centering is disabled; ``forward`` applies its own reflect padding instead.
            center=False,
        )
        self.sample_rate = sample_rate
        self.hop_size = hop_size
        self.n_fft = n_fft
        self.win_length = win_length
        self.f_min = f_min
        self.f_max = f_max
        self.n_mels = n_mels
        # Amount of reflect padding added to each side of the waveform in ``forward``.
        self.pad_size = int((n_fft - hop_size) / 2)

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """Generate mel spectrogram from a waveform. Should have same sample rate as ``self.sample_rate``.

        Args:
            waveform (Tensor): waveform of shape ``(batch_size, time_length)``.

        Returns:
            Tensor of shape ``(batch_size, n_mels, n_frames)``, where ``n_frames`` is the number of
            STFT frames: the natural logarithm of the mel spectrogram, with values below ``1e-5``
            clamped to ``1e-5`` before the logarithm.
        """
        # A temporary channel axis is inserted for the reflect padding and removed right after.
        ref_waveform = F.pad(waveform.unsqueeze(1), (self.pad_size, self.pad_size), mode="reflect")
        ref_waveform = ref_waveform.squeeze(1)

        # Square root of the spectrogram, with a small epsilon added before the root.
        spectr = (self.mel_transform.spectrogram(ref_waveform) + 1e-9) ** 0.5
        mel_spectrogram = self.mel_transform.mel_scale(spectr)
        # Clamp from below so the logarithm stays finite.
        mel_spectrogram = torch.log(torch.clamp(mel_spectrogram, min=1e-5))
        return mel_spectrogram


# Pretrained HiFiGAN "V3" vocoder bundle: weight file name, vocoder construction parameters,
# mel spectrogram parameters and sample rate.
HIFIGAN_VOCODER_V3_LJSPEECH = HiFiGANVocoderBundle(
    "hifigan_vocoder_v3_ljspeech.pth",
    _vocoder_params={
        "upsample_rates": (8, 8, 4),
        "upsample_kernel_sizes": (16, 16, 8),
        "upsample_initial_channel": 256,
        "resblock_kernel_sizes": (3, 5, 7),
        "resblock_dilation_sizes": ((1, 2), (2, 6), (3, 12)),
        "resblock_type": 2,
        "in_channels": 80,
        "lrelu_slope": 0.1,
    },
    _mel_params={
        "hop_size": 256,
        "n_fft": 1024,
        "win_length": 1024,
        "f_min": 0,
        "f_max": 8000,
    },
    _sample_rate=22050,
)
HIFIGAN_VOCODER_V3_LJSPEECH.__doc__ = """HiFiGAN Vocoder pipeline, trained on *The LJ Speech Dataset*
    :cite:`ljspeech17`.

    This pipeline can be used with an external component which generates mel spectrograms from text, for example,
    Tacotron2 - see examples in :py:class:`HiFiGANVocoderBundle`.
    Although this works with the existing Tacotron2 bundles, for the best results one needs to retrain Tacotron2
    using the same data preprocessing pipeline which was used for training HiFiGAN. In particular, the original
    HiFiGAN implementation uses a custom method of generating mel spectrograms from waveforms, different from
    :py:class:`torchaudio.transforms.MelSpectrogram`. We reimplemented this transform as
    :py:meth:`HiFiGANVocoderBundle.get_mel_transform`, making sure it is equivalent to the original HiFiGAN code `here
    <https://github.com/jik876/hifi-gan/blob/4769534d45265d52a904b850da5a622601885777/meldataset.py#L49-L72>`_.

    The underlying vocoder is constructed by
    :py:func:`torchaudio.prototype.models.hifigan_vocoder`. The weights are converted from the ones published
    with the original paper :cite:`NEURIPS2020_c5d73680` under `MIT License
    <https://github.com/jik876/hifi-gan/blob/4769534d45265d52a904b850da5a622601885777/LICENSE>`__. See links to
    pre-trained models on `GitHub <https://github.com/jik876/hifi-gan#pretrained-model>`__.

    Please refer to :py:class:`HiFiGANVocoderBundle` for usage instructions.
"""


Access comprehensive developer documentation for PyTorch

View Docs


Get in-depth tutorials for beginners and advanced developers

View Tutorials


Find development resources and get your questions answered

View Resources