Is there a pretrained HiFi-GAN vocoder?
netuserjun opened this issue · 10 comments
Thank you for sharing this great repository.
I tried voice conversion on some songs, but there is a problem with high pitch: the pretrained vocoder cannot reproduce high-pitched audio.
So if you have a pretrained HiFi-GAN .pkl file with a matching config, please share it.
Thanks.
I have pre-trained a HiFi-GAN model on LibriTTS for my TTS project. Not sure if it works well for VCTK or other datasets. Here is the link: https://drive.google.com/file/d/1odvXt8w_cjHDoYMzJMZCJtwR2r7yRaSY/view?usp=sharing
If you would like to fine-tune using the preprocessing of this repo, try replacing meldataset.py in the HiFi-GAN repo with the following file:
```python
import math
import os
import random

import torch
import torch.utils.data
import numpy as np
import torchaudio
import librosa
from librosa.util import normalize
from librosa.filters import mel as librosa_mel_fn
from scipy.io.wavfile import read

MAX_WAV_VALUE = 32768.0


def load_wav(full_path):
    sampling_rate, data = read(full_path)
    return data, sampling_rate


def dynamic_range_compression(x, C=1, clip_val=1e-5):
    return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)


def dynamic_range_decompression(x, C=1):
    return np.exp(x) / C


def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    return torch.log(torch.clamp(x, min=clip_val) * C)


def dynamic_range_decompression_torch(x, C=1):
    return torch.exp(x) / C


def spectral_normalize_torch(magnitudes):
    return dynamic_range_compression_torch(magnitudes)


def spectral_de_normalize_torch(magnitudes):
    return dynamic_range_decompression_torch(magnitudes)


mel_basis = {}
hann_window = {}

# StarGANv2-VC preprocessing: 80-band mel with n_fft=2048,
# win_length=1200, hop_length=300, log-compressed and normalized
# with mean=-4, std=4
to_mel = torchaudio.transforms.MelSpectrogram(
    n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
mean, std = -4, 4


def preprocess(wave_tensor):
    mel_tensor = to_mel(wave_tensor)
    mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
    return mel_tensor


def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size,
                    fmin, fmax, center=False):
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    global mel_basis, hann_window
    key = str(fmax) + '_' + str(y.device)
    if key not in mel_basis:  # cache the filterbank per (fmax, device)
        # keyword arguments keep this compatible with librosa >= 0.10
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels,
                             fmin=fmin, fmax=fmax)
        mel_basis[key] = torch.from_numpy(mel).float().to(y.device)
        hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)

    y = torch.nn.functional.pad(
        y.unsqueeze(1),
        (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
        mode='reflect')
    y = y.squeeze(1)

    # return_complex=True is required on recent PyTorch versions;
    # view_as_real recovers the old (real, imag) layout
    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size,
                      window=hann_window[str(y.device)], center=center,
                      pad_mode='reflect', normalized=False, onesided=True,
                      return_complex=True)
    spec = torch.view_as_real(spec)
    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-9)

    spec = torch.matmul(mel_basis[key], spec)
    spec = spectral_normalize_torch(spec)
    return spec


def get_dataset_filelist(a):
    with open(a.input_training_file, 'r', encoding='utf-8') as fi:
        training_files = [x.split('|')[0]
                          for x in fi.read().split('\n') if len(x) > 0]
    with open(a.input_validation_file, 'r', encoding='utf-8') as fi:
        validation_files = [x.split('|')[0]
                            for x in fi.read().split('\n') if len(x) > 0]
    return training_files, validation_files


class MelDataset(torch.utils.data.Dataset):
    def __init__(self, training_files, segment_size, n_fft, num_mels,
                 hop_size, win_size, sampling_rate, fmin, fmax, split=True,
                 shuffle=True, n_cache_reuse=1, device=None, fmax_loss=None,
                 fine_tuning=False, base_mels_path=None):
        self.audio_files = training_files
        random.seed(1234)
        if shuffle:
            random.shuffle(self.audio_files)
        self.segment_size = segment_size
        self.sampling_rate = sampling_rate
        self.split = split
        self.n_fft = n_fft
        self.num_mels = num_mels
        self.hop_size = hop_size
        self.win_size = win_size
        self.fmin = fmin
        self.fmax = fmax
        self.fmax_loss = fmax_loss
        self.cached_wav = None
        self.n_cache_reuse = n_cache_reuse
        self._cache_ref_count = 0
        self.device = device
        self.fine_tuning = fine_tuning
        self.base_mels_path = base_mels_path

    def __getitem__(self, index):
        filename = self.audio_files[index]
        if self._cache_ref_count == 0:
            audio, sampling_rate = load_wav(filename)
            audio = audio / MAX_WAV_VALUE
            if not self.fine_tuning:
                audio = normalize(audio) * 0.95
            self.cached_wav = audio
            if sampling_rate != self.sampling_rate:
                # resample instead of raising, so mixed-rate corpora work
                audio = librosa.resample(audio, orig_sr=sampling_rate,
                                         target_sr=self.sampling_rate)
                # raise ValueError("{} SR doesn't match target {} SR, {}".format(
                #     sampling_rate, self.sampling_rate, filename))
            self._cache_ref_count = self.n_cache_reuse
        else:
            audio = self.cached_wav
            self._cache_ref_count -= 1

        audio = torch.FloatTensor(audio)
        audio = audio.unsqueeze(0)

        if not self.fine_tuning:
            if self.split:
                if audio.size(1) >= self.segment_size:
                    max_audio_start = audio.size(1) - self.segment_size
                    audio_start = random.randint(0, max_audio_start)
                    audio = audio[:, audio_start:audio_start + self.segment_size]
                else:
                    audio = torch.nn.functional.pad(
                        audio, (0, self.segment_size - audio.size(1)), 'constant')
            # input mel comes from the StarGANv2-VC preprocessing
            mel = preprocess(audio)
        else:
            mel = np.load(filename + '.npy')
            mel = torch.from_numpy(mel)
            if len(mel.shape) < 3:
                mel = mel.unsqueeze(0)
            if self.split:
                frames_per_seg = math.ceil(self.segment_size / self.hop_size)
                if audio.size(1) >= self.segment_size:
                    mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1)
                    mel = mel[:, :, mel_start:mel_start + frames_per_seg]
                    audio = audio[:, mel_start * self.hop_size:
                                  (mel_start + frames_per_seg) * self.hop_size]
                else:
                    mel = torch.nn.functional.pad(
                        mel, (0, frames_per_seg - mel.size(2)), 'constant')
                    audio = torch.nn.functional.pad(
                        audio, (0, self.segment_size - audio.size(1)), 'constant')

        mel_loss = mel_spectrogram(audio, self.n_fft, self.num_mels,
                                   self.sampling_rate, self.hop_size,
                                   self.win_size, self.fmin, self.fmax_loss,
                                   center=False)

        # the torchaudio transform (center=True) can yield one extra frame;
        # trim the input mel to match the loss mel
        if mel.shape[-1] != mel_loss.shape[-1]:
            mel = mel[..., :mel_loss.shape[-1]]

        return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())

    def __len__(self):
        return len(self.audio_files)
```
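A quick way to sanity-check the two transforms (a usage sketch I'm adding, not part of the original file): the HiFi-GAN training config has to match the preprocessing above, i.e. sampling_rate=24000, num_mels=80, n_fft=2048, win_size=1200, hop_size=300. Note that preprocess uses torchaudio's centered STFT while mel_spectrogram uses center=False, so the frame counts can differ by one; __getitem__ trims that off.

```python
import torch

wave = torch.randn(24000)  # 1 s of dummy audio at 24 kHz
mel = preprocess(wave)     # (1, 80, 81): centered STFT
mel_l = mel_spectrogram(wave.unsqueeze(0), n_fft=2048, num_mels=80,
                        sampling_rate=24000, hop_size=300, win_size=1200,
                        fmin=0, fmax=None, center=False)  # (1, 80, 80)
print(mel.shape, mel_l.shape)
```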
Thank you so much!
I'll try it and share the result.
The pretrained vocoder has a problem: "RuntimeError: PytorchStreamReader failed locating file constants.pkl: file not found". constants.pkl is not included in the zip file.
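In case it helps anyone else: that error typically shows up when a plain PyTorch checkpoint is opened with torch.jit.load. HiFi-GAN checkpoints are ordinary state dicts, loaded roughly the way the hifi-gan repo's inference.py does it (a sketch; Generator and AttrDict come from that repo, and the file names here are placeholders):

```python
import json
import torch
from env import AttrDict      # from the hifi-gan repo
from models import Generator  # from the hifi-gan repo

with open('config.json') as f:  # config shipped with the checkpoint
    h = AttrDict(json.load(f))

generator = Generator(h)
state = torch.load('g_02500000', map_location='cpu')  # torch.load, not torch.jit.load
generator.load_state_dict(state['generator'])
generator.eval()
generator.remove_weight_norm()
```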
Hi @yl4579,
Thanks for sharing the pretrained model.
I want to train HiFi-GAN on mel features matching the StarGANv2-VC setting. Can you share the meldataset.py, or guide me on what changes need to be made?
Thanks
@yl4579 I tried the code I posted above, but for me the losses go to NaN/Inf.
Can you please tell me whether the HiFi-GAN features I pass as input to StarGANv2-VC are wrong?
```
--- epoch 44 ---
train/real        : nan
train/fake        : nan
train/reg         : inf
train/real_adv_cls: 0.0000
train/con_reg     : nan
train/adv         : nan
train/sty         : nan
train/ds          : 0.0005
train/cyc         : nan
train/norm        : inf
train/asr         : 0.2963
train/f0          : 0.6593
train/adv_cls     : 0.0000
eval/real         : nan
eval/fake         : nan
eval/reg          : 0.0000
eval/real_adv_cls : 0.0000
eval/con_reg      : 0.0000
eval/adv          : nan
eval/sty          : nan
eval/ds           : nan
eval/cyc          : nan
eval/norm         : nan
eval/asr          : nan
eval/f0           : nan
eval/adv_cls      : 0.0000
```
Thanks
@mraj96 please see the meldataset.py I shared above.
As for NaN, please refer to #60 and see if it helps.
@yl4579 With that meldataset.py, I am facing this issue:
```
python3 train.py --fine_tuning False --config config.json

Traceback (most recent call last):
  File "/root/hifigan_v2vc/hifi-gan/train.py", line 271, in <module>
    main()
  File "/root/hifigan_v2vc/hifi-gan/train.py", line 267, in main
    train(0, a, h)
  File "/root/hifigan_v2vc/hifi-gan/train.py", line 149, in train
    loss_fm_f = feature_loss(fmap_f_r, fmap_f_g)
  File "/root/hifigan_v2vc/hifi-gan/models.py", line 255, in feature_loss
    loss += torch.mean(torch.abs(rl - gl))
RuntimeError: The size of tensor a (9600) must match the size of tensor b (9602) at non-singleton dimension 2
```
When training HiFi-GAN, have you modified the models.py code with these changes? jik876/hifi-gan#12 (comment)
I fixed it: cut the output to (conv_in_size * u) samples after each transposed convolution.
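For anyone hitting the same mismatch, here is my reading of that fix as a sketch (names are from the hifi-gan repo's models.py Generator; only the two trim lines are new, everything else is the original forward):

```python
import torch
import torch.nn.functional as F

def forward(self, x):  # Generator.forward in hifi-gan's models.py
    x = self.conv_pre(x)
    for i in range(self.num_upsamples):
        x = F.leaky_relu(x, LRELU_SLOPE)
        in_frames = x.size(-1)            # length before upsampling
        x = self.ups[i](x)
        # added: cut the transposed-conv overshoot so each layer outputs
        # exactly in_frames * upsample_rate samples (9600 instead of 9602)
        x = x[..., :in_frames * self.h.upsample_rates[i]]
        xs = None
        for j in range(self.num_kernels):
            if xs is None:
                xs = self.resblocks[i * self.num_kernels + j](x)
            else:
                xs += self.resblocks[i * self.num_kernels + j](x)
        x = xs / self.num_kernels
    x = F.leaky_relu(x)
    x = self.conv_post(x)
    return torch.tanh(x)
```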
> If you would like to fine-tune using the preprocessing of this repo, try replacing meldataset.py in the HiFi-GAN repo with the following file: [meldataset.py quoted above]
Thanks for sharing the code.
I have two questions:
- sampling_rate in config.yaml is 24000, but torchaudio.transforms.MelSpectrogram defaults to sample_rate=16000; should I set sample_rate=24000 when initializing to_mel?
- Why does mel_loss use a different transform function from mel?
@1nlplearner For 1, see #10. For 2, it doesn't matter which transform you use for the loss; I just kept the default mel_loss from HiFi-GAN because at that point you are working in the waveform domain. You can use any form of mel spectrogram to compute the loss.
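To make question 1 concrete, the two options look like this (an illustrative sketch; sample_rate in torchaudio's MelSpectrogram only affects the mel filterbank edges, not the STFT itself, and #10 discusses which setting the checkpoints assume):

```python
import torchaudio

# (a) as in the snippet above: sample_rate left at torchaudio's
#     default of 16000, so the filterbank tops out at 8 kHz
to_mel_default = torchaudio.transforms.MelSpectrogram(
    n_mels=80, n_fft=2048, win_length=1200, hop_length=300)

# (b) explicitly matching the 24 kHz audio (filterbank up to 12 kHz)
to_mel_24k = torchaudio.transforms.MelSpectrogram(
    sample_rate=24000, n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
```

Whichever is used, the vocoder has to be trained on mels produced by exactly the same transform as the converter's preprocessing.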