# Source code for deeppavlov.models.nemo.vocoder

# Copyright 2020 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from typing import List

import librosa
import numpy as np
from nemo.core.neural_types import NmTensor
from nemo.collections.tts import WaveGlowInferNM
from numpy import ndarray

log = getLogger(__name__)


class BaseVocoder:
    """Base class that keeps the vocoders of the NeMo-based TTS pipeline consistent with each other.

    Both methods only raise `NotImplementedError`; concrete vocoders are expected to override them.
    """

    def __call__(self, tensor: NmTensor) -> NmTensor:
        """Should return the tensor which, once evaluated, can be turned into speech by `get_audio`.

        Raises:
            NotImplementedError: Always, in this base implementation.

        """
        raise NotImplementedError()

    def get_audio(self, evaluated_tensor: list, mel_len: list):
        """Should synthesize audio from the evaluated tensor that was constructed by `__call__`.

        Raises:
            NotImplementedError: Always, in this base implementation.

        """
        raise NotImplementedError()


class WaveGlow(BaseVocoder):
    """Vocoder that delegates speech synthesis to a wrapped `WaveGlowInferNM` module."""

    def __init__(self, *, denoiser_strength: float = 0.0, n_window_stride: int = 160, **kwargs) -> None:
        """Wraps WaveGlowInferNM module.

        Args:
            denoiser_strength: Denoiser strength for waveglow. The denoiser is set up and applied only when this
                value is greater than zero.
            n_window_stride: Stride of window for FFT in samples used in model training.
            kwargs: Named arguments for WaveGlowInferNM constructor.

        """
        self.waveglow = WaveGlowInferNM(**kwargs)
        self.denoiser_strength = denoiser_strength
        self.n_window_stride = n_window_stride

    def __call__(self, mel_postnet: NmTensor) -> NmTensor:
        """Passes `mel_postnet` to the wrapped module as its `mel_spectrogram` argument and returns the result."""
        return self.waveglow(mel_spectrogram=mel_postnet)

    def __str__(self) -> str:
        """Returns the string representation of the wrapped WaveGlowInferNM module."""
        return str(self.waveglow)

    def restore_from(self, path: str) -> None:
        """Wraps WaveGlowInferNM restore_from method.

        After restoring, the denoiser of the wrapped module is set up if `denoiser_strength` > 0.

        Args:
            path: Path forwarded to the `restore_from` method of the wrapped module.

        """
        self.waveglow.restore_from(path)
        if self.denoiser_strength > 0:
            log.info('Setup denoiser for WaveGlow')
            self.waveglow.setup_denoiser()

    def get_audio(self, evaluated_audio: list, mel_len: list) -> List[ndarray]:
        """Unpacks audio data from evaluated tensor and denoises it if `denoiser_strength` > 0.

        Args:
            evaluated_audio: Evaluated batches; each item must support `.cpu().numpy()` and iterate over samples.
            mel_len: Spectrogram lengths indexed as `mel_len[batch_index][sample_index]`.

        Returns:
            Flat list with one audio array per sample, in batch order.

        """
        audios = []
        for i, batch in enumerate(evaluated_audio):
            audio = batch.cpu().numpy()
            for j, sample in enumerate(audio):
                # Every spectrogram frame corresponds to `n_window_stride` audio samples; anything beyond that
                # length is dropped (presumably batch padding - TODO confirm).
                sample_len = mel_len[i][j] * self.n_window_stride
                sample = sample[:sample_len]
                if self.denoiser_strength > 0:
                    # The second value returned by `denoise` is not needed here.
                    sample, _ = self.waveglow.denoise(sample, strength=self.denoiser_strength)
                audios.append(sample)
        return audios


class GriffinLim(BaseVocoder):
    """Vocoder that reconstructs audio from mel spectrograms with the Griffin-Lim algorithm."""

    def __init__(self, *,
                 sample_rate: float = 16000.0,
                 n_fft: int = 1024,
                 mag_scale: float = 2048.0,
                 power: float = 1.2,
                 n_iters: int = 50,
                 **kwargs) -> None:
        """Uses Griffin Lim algorithm to generate speech from spectrograms.

        Args:
            sample_rate:  Generated audio data sample rate.
            n_fft: The number of points to use for the FFT.
            mag_scale: Multiplied with the linear spectrogram to avoid audio sounding muted due to mel filter
                normalization.
            power: The linear spectrogram is raised to this power prior to running the Griffin Lim algorithm. A power
                of greater than 1 has been shown to improve audio quality.
            n_iters: Number of iterations of conversion of magnitude spectrograms to audio signal.
            kwargs: Additional named arguments passed to `librosa.filters.mel` when building the mel filterbank.

        """
        self.mag_scale = mag_scale
        self.power = power
        self.n_iters = n_iters
        self.n_fft = n_fft
        self.filterbank = librosa.filters.mel(sr=sample_rate, n_fft=n_fft, **kwargs)

    def __call__(self, mel_postnet: NmTensor) -> NmTensor:
        """Returns `mel_postnet` unchanged; the synthesis itself happens in `get_audio`."""
        return mel_postnet

    def get_audio(self, mel_spec: list, mel_len: list) -> List[ndarray]:
        """Converts evaluated log-mel spectrogram batches to audio signals.

        Args:
            mel_spec: Evaluated batches; each item must support `.cpu().numpy()` and be three-dimensional
                (assumed layout is (batch, n_mels, time) - TODO confirm).
            mel_len: Spectrogram lengths indexed as `mel_len[batch_index][sample_index]`.

        Returns:
            Flat list with one audio array per sample, in batch order.

        """
        audios = []
        for i, batch in enumerate(mel_spec):
            # Swap the last two axes so that the last axis can be multiplied with the filterbank.
            log_mel = batch.cpu().numpy().transpose(0, 2, 1)
            mel = np.exp(log_mel)
            magnitudes = np.dot(mel, self.filterbank) * self.mag_scale
            for j, sample in enumerate(magnitudes):
                # Keep only the first `mel_len[i][j]` frames of the sample.
                sample = sample[:mel_len[i][j], :]
                audio = self.griffin_lim(sample.T ** self.power)
                audios.append(audio)
        return audios

    def griffin_lim(self, magnitudes):
        """Griffin-Lim algorithm to convert magnitude spectrograms to audio signals.

        Args:
            magnitudes: Magnitude spectrogram in the layout accepted by `librosa.istft`.

        Returns:
            Audio signal obtained after `n_iters` phase re-estimation iterations.

        """
        # Start from a random phase because only the magnitudes are known.
        phase = np.exp(2j * np.pi * np.random.rand(*magnitudes.shape))
        complex_spec = magnitudes * phase
        signal = librosa.istft(complex_spec)

        for _ in range(self.n_iters):
            # Take the phase of the current signal estimate and combine it with the fixed magnitudes.
            _, phase = librosa.magphase(librosa.stft(signal, n_fft=self.n_fft))
            complex_spec = magnitudes * phase
            signal = librosa.istft(complex_spec)
        return signal