Error with 24khz data utils
steven850 opened this issue ยท 27 comments
So when trying to resume one of the existing 24khz models I am getting the following error.
File "Z:\FreeVCtrain24khz\train_24.py", line 65, in run train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps) File "Z:\FreeVCtrain24khz\data_utils_24.py", line 34, in __init__ self._filter() File "Z:\FreeVCtrain24khz\data_utils_24.py", line 46, in _filter lengths.append(os.path.getsize(audiopath[0]) // (2 * self.hop_length)) File "C:\Users\steven\AppData\Local\Programs\Python\Python37\lib\genericpath.py", line 50, in getsize return os.stat(filename).st_size FileNotFoundError: [WinError 3] The system cannot find the path specified: 'DUMMY\\p337\\p337_014.wav'
I noticed this line is missing in the 24khz datautils
audiopath = audiopath[0].replace("\\","/").replace("DUMMY", "dataset/vctk-16k")
yeah it's a way to solve this error.
data_utils_24.py
is a dirty code i wrote casually. the lengths
, which is used to bucketize the dataset, is obtained from DUMMY
dir. if the DUMMY
dir exists this error won't occur. ln -s dataset/vctk-16k DUMMY
or ln -s dataset/vctk-24k DUMMY
or etc. can solve this too.
If I create the dummy folder, I now get this error
File "Z:\FreeVCtrain24khz\data_utils_24.py", line 175, in __call__ wav_padded = commons.slice_segments(wav_padded, ids_slice * 480, wav_seglen) File "Z:\FreeVCtrain24khz\commons.py", line 53, in slice_segments ret[i] = x[i, :, idx_str:idx_end] RuntimeError: The expanded size of the tensor (61920) must match the existing size (26880) at non-singleton dimension 1. Target sizes: [1, 61920]. Tensor sizes: [26880]
ah sorry i forgot that spectrograms are in 16k dir. so only ln -s dataset/vctk-16k DUMMY
works.
I get that error regardless of what folder I use.
well similar, diff size listed.
RuntimeError: The expanded size of the tensor (61920) must match the existing size (33824) at non-singleton dimension 1. Target sizes: [1, 61920]. Tensor sizes: [33824]
I'm getting this error too. I ran all 16khz pre-processing steps then the 24khz downsample. Using the config file from the tips-for-synthesizing-24KHz-wavs-from-16kHz-wavs folder. Help pls!
./logs/freevc-24/G_0.pth
INFO:freevc-24:Loaded checkpoint './logs/freevc-24/G_0.pth' (iteration 3461)
./logs/freevc-24/D_0.pth
INFO:freevc-24:Loaded checkpoint './logs/freevc-24/D_0.pth' (iteration 3461)
INFO:torch.nn.parallel.distributed:Reducer buckets have been rebuilt in this iteration.
Traceback (most recent call last):
File "train_24.py", line 292, in <module>
main()
File "train_24.py", line 49, in main
mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,))
File "/home/manjaro/.conda/envs/free-vc/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 240, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/manjaro/.conda/envs/free-vc/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 198, in start_processes
while not context.join():
File "/home/manjaro/.conda/envs/free-vc/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 160, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:
-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/home/manjaro/.conda/envs/free-vc/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
fn(i, *args)
File "/media/manjaro/NVME_2tb/NeuralNetworks/FreeVC/train_24.py", line 115, in run
train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, eval_loader], logger, [writer, writer_eval])
File "/media/manjaro/NVME_2tb/NeuralNetworks/FreeVC/train_24.py", line 136, in train_and_evaluate
for batch_idx, items in enumerate(train_loader):
File "/home/manjaro/.conda/envs/free-vc/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 628, in __next__
data = self._next_data()
File "/home/manjaro/.conda/envs/free-vc/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1333, in _next_data
return self._process_data(data)
File "/home/manjaro/.conda/envs/free-vc/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1359, in _process_data
data.reraise()
File "/home/manjaro/.conda/envs/free-vc/lib/python3.7/site-packages/torch/_utils.py", line 543, in reraise
raise exception
RuntimeError: Caught RuntimeError in DataLoader worker process 1.
Original Traceback (most recent call last):
File "/home/manjaro/.conda/envs/free-vc/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
data = fetcher.fetch(index)
File "/home/manjaro/.conda/envs/free-vc/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 61, in fetch
return self.collate_fn(data)
File "/media/manjaro/NVME_2tb/NeuralNetworks/FreeVC/data_utils_24.py", line 174, in __call__
wav_padded = commons.slice_segments(wav_padded, ids_slice * 480, wav_seglen)
File "/media/manjaro/NVME_2tb/NeuralNetworks/FreeVC/commons.py", line 53, in slice_segments
ret[i] = x[i, :, idx_str:idx_end]
RuntimeError: The expanded size of the tensor (15840) must match the existing size (8125) at non-singleton dimension 1. Target sizes: [1, 15840]. Tensor sizes: [8125]
hmmm i think it because your 'DUMMY' ('dataset/vctk-16k') dir does not contain those 16k '.spec.pt' files.
if os.path.exists(spec_filename):
spec = torch.load(spec_filename)
else:
audio, sampling_rate = load_wav_to_torch(filename)
if sampling_rate != 16000:
raise ValueError("{} SR doesn't match target {} SR".format(
sampling_rate, self.sampling_rate))
audio_16 = audio / self.max_wav_value
audio_16 = audio_16.unsqueeze(0)
spec = spectrogram_torch(audio_16, self.filter_length,
self.sampling_rate, self.hop_length, self.win_length,
center=False)
spec = torch.squeeze(spec, 0)
torch.save(spec, spec_filename)
But the spec.pt files are present in the 16khz folder. They also appeared in the 24khz folder when I had linked that one to the dummy.
Still getting the error with those changes
File "Z:\FreeVCtrain24khz\data_utils_24.py", line 174, in __call__ wav_padded = commons.slice_segments(wav_padded, ids_slice * 480, wav_seglen) File "Z:\FreeVCtrain24khz\commons.py", line 53, in slice_segments ret[i] = x[i, :, idx_str:idx_end] RuntimeError: The expanded size of the tensor (61920) must match the existing size (33824) at non-singleton dimension 1. Target sizes: [1, 61920]. Tensor sizes: [33824]
Still getting the error with those changes
File "Z:\FreeVCtrain24khz\data_utils_24.py", line 174, in __call__ wav_padded = commons.slice_segments(wav_padded, ids_slice * 480, wav_seglen) File "Z:\FreeVCtrain24khz\commons.py", line 53, in slice_segments ret[i] = x[i, :, idx_str:idx_end] RuntimeError: The expanded size of the tensor (61920) must match the existing size (33824) at non-singleton dimension 1. Target sizes: [1, 61920]. Tensor sizes: [33824]
Have u tried maybe removing all spec.pt files before running it again. i havent tried the changes myself yet
Ye this doesn't help
INFO:freevc-24:Loaded checkpoint './logs/freevc-24/G_0.pth' (iteration 3461)
./logs/freevc-24/D_0.pth
INFO:freevc-24:Loaded checkpoint './logs/freevc-24/D_0.pth' (iteration 3461)
INFO:torch.nn.parallel.distributed:Reducer buckets have been rebuilt in this iteration.
INFO:torch.nn.parallel.distributed:Reducer buckets have been rebuilt in this iteration.
Traceback (most recent call last):
File "train_24.py", line 292, in <module>
main()
File "train_24.py", line 49, in main
mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,))
File "/home/manjaro/.conda/envs/free-vc/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 240, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/manjaro/.conda/envs/free-vc/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 198, in start_processes
while not context.join():
File "/home/manjaro/.conda/envs/free-vc/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 160, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:
-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/home/manjaro/.conda/envs/free-vc/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
fn(i, *args)
File "/media/manjaro/NVME_2tb/NeuralNetworks/FreeVC/train_24.py", line 115, in run
train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, eval_loader], logger, [writer, writer_eval])
File "/media/manjaro/NVME_2tb/NeuralNetworks/FreeVC/train_24.py", line 136, in train_and_evaluate
for batch_idx, items in enumerate(train_loader):
File "/home/manjaro/.conda/envs/free-vc/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 628, in __next__
data = self._next_data()
File "/home/manjaro/.conda/envs/free-vc/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1333, in _next_data
return self._process_data(data)
File "/home/manjaro/.conda/envs/free-vc/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1359, in _process_data
data.reraise()
File "/home/manjaro/.conda/envs/free-vc/lib/python3.7/site-packages/torch/_utils.py", line 543, in reraise
raise exception
RuntimeError: Caught RuntimeError in DataLoader worker process 7.
Original Traceback (most recent call last):
File "/home/manjaro/.conda/envs/free-vc/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
data = fetcher.fetch(index)
File "/home/manjaro/.conda/envs/free-vc/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 61, in fetch
return self.collate_fn(data)
File "/media/manjaro/NVME_2tb/NeuralNetworks/FreeVC/data_utils_24.py", line 180, in __call__
wav_padded = commons.slice_segments(wav_padded, ids_slice * 480, wav_seglen)
File "/media/manjaro/NVME_2tb/NeuralNetworks/FreeVC/commons.py", line 53, in slice_segments
ret[i] = x[i, :, idx_str:idx_end]
RuntimeError: The expanded size of the tensor (16800) must match the existing size (13732) at non-singleton dimension 1. Target sizes: [1, 16800]. Tensor sizes: [13732]
If I set the hop_length to 480 manually:
spec = spectrogram_torch(audio_norm, self.filter_length,
self.sampling_rate, 480, self.win_length,
center=False)
It seems that the sizes almost match:
RuntimeError: The expanded size of the tensor (297) must match the existing size (298) at non-singleton dimension 1. Target sizes: [1024, 297]. Tensor sizes: [1024, 298]
self.hop_length seems to be 320 by default. Also, the audio_norm is actually the 24kHZ version of the audio. Is that supposed to be like this? I've tried importing the 16KHz versions and doing the spectrograms for them instead but the error persists.
Sorry, I'm not really smart enough for all this especially this 16 to 24KHz hack thing.
Alright, I got it to work!
I edited the data_utils_24.py like I said in post above.
Then I was thinking about it and I decided to compare the 24kHZ audio and the 16kHZ audio tracks. The 24kHZ track ended up considerably shorter than the 16 one. I'm not sure if it's because of the changes I made to the downsample_24k.py before or not but I changed it to be this and now it works and the training starts:
def process(wav_name):
# speaker 's5', 'p280', 'p315' are excluded,
speaker = wav_name[:4]
wav_path = os.path.join(args.in_dir, speaker, wav_name)
if os.path.exists(wav_path) and '.wav' in wav_path:
os.makedirs(os.path.join(args.out_dir1, speaker), exist_ok=True)
wav, sr = librosa.load(wav_path)
wav, index = librosa.effects.trim(wav, top_db=20)
peak = np.abs(wav).max()
if peak <= 0.01:
print(wav_name + " - Empty file?")
return
if peak > 1.0:
wav = 0.98 * wav / peak
wav = librosa.resample(wav, orig_sr=sr, target_sr=args.sr1)
# wav1, sr = librosa.load(wav_path, sr=args.sr1)
# wav = wav[int(index[0]*args.sr1/22050): int(index[1]*args.sr1/22050)]
# save_name = wav_name.replace("_mic2.flac", ".wav")
save_path1 = os.path.join(args.out_dir1, speaker, wav_name)
wavfile.write(
save_path1,
args.sr1,
(wav * np.iinfo(np.int16).max).astype(np.int16)
)
See I noticed the short lengths right away and had already changed my downsample.py to more or less match yours.
all my file lengths match but i still get that same error.
If I set the hop_length to 480 manually:
spec = spectrogram_torch(audio_norm, self.filter_length, self.sampling_rate, 480, self.win_length, center=False)
It seems that the sizes almost match:
RuntimeError: The expanded size of the tensor (297) must match the existing size (298) at non-singleton dimension 1. Target sizes: [1024, 297]. Tensor sizes: [1024, 298]
self.hop_length seems to be 320 by default. Also, the audio_norm is actually the 24kHZ version of the audio. Is that supposed to be like this? I've tried importing the 16KHz versions and doing the spectrograms for them instead but the error persists. Sorry, I'm not really smart enough for all this especially this 16 to 24KHz hack thing.
It works! Thanks a lot
@Likkkez Can you post your full datautils_24 I changed the hop to 480 manually, but im still getting the same error.
@Likkkez Can you post your full datautils_24 I changed the hop to 480 manually, but im still getting the same error.
import time
import os
import random
import numpy as np
import torch
import torch.utils.data
import commons
from mel_processing import spectrogram_torch, spec_to_mel_torch
from utils import load_wav_to_torch, load_filepaths_and_text, transform
#import h5py
"""Multi speaker version"""
class TextAudioSpeakerLoader(torch.utils.data.Dataset):
"""
1) loads audio, speaker_id, text pairs
2) normalizes text and converts them to sequences of integers
3) computes spectrograms from audio files.
"""
def __init__(self, audiopaths, hparams):
self.audiopaths = load_filepaths_and_text(audiopaths)
self.max_wav_value = hparams.data.max_wav_value
self.sampling_rate = hparams.data.sampling_rate
self.filter_length = hparams.data.filter_length
self.hop_length = hparams.data.hop_length
self.win_length = hparams.data.win_length
self.use_sr = hparams.train.use_sr
self.use_spk = hparams.model.use_spk
self.spec_len = hparams.train.max_speclen
random.seed(1234)
random.shuffle(self.audiopaths)
self._filter()
def _filter(self):
"""
Filter text & store spec lengths
"""
# Store spectrogram lengths for Bucketing
# wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
# spec_length = wav_length // hop_length
lengths = []
for audiopath in self.audiopaths:
lengths.append(os.path.getsize(audiopath[0]) // (2 * self.hop_length))
self.lengths = lengths
def get_audio(self, filename):
audio, sampling_rate = load_wav_to_torch(filename.replace("DUMMY", "dataset/vctk-24k"))
if sampling_rate != 24000:
raise ValueError("{} SR doesn't match target {} SR".format(
sampling_rate, self.sampling_rate))
audio_norm = audio / self.max_wav_value
audio_norm = audio_norm.unsqueeze(0)
spec_filename = filename.replace(".wav", ".spec.pt")
if os.path.exists(spec_filename):
spec = torch.load(spec_filename)
else:
spec = spectrogram_torch(audio_norm, self.filter_length,
self.sampling_rate, 480, self.win_length,
center=False)
spec = torch.squeeze(spec, 0)
torch.save(spec, spec_filename)
if self.use_spk:
spk_filename = filename.replace(".wav", ".npy")
spk_filename = spk_filename.replace("DUMMY", "dataset/spk")
spk = torch.from_numpy(np.load(spk_filename))
if not self.use_sr:
c_filename = filename.replace(".wav", ".pt")
c_filename = c_filename.replace("DUMMY", "dataset/wavlm")
c = torch.load(c_filename).squeeze(0)
else:
i = random.randint(68,92)
'''
basename = os.path.basename(filename)[:-4]
spkname = basename[:4]
#print(basename, spkname)
with h5py.File(f"dataset/rs/wavlm/{spkname}/{i}.hdf5","r") as f:
c = torch.from_numpy(f[basename][()]).squeeze(0)
#print(c)
'''
c_filename = filename.replace(".wav", f"_{i}.pt")
c_filename = c_filename.replace("DUMMY", "dataset/sr/wavlm")
c = torch.load(c_filename).squeeze(0)
'''
lmin = min(c.size(-1), spec.size(-1))
spec, c = spec[:, :lmin], c[:, :lmin]
audio_norm = audio_norm[:, :lmin*480]
_spec, _c, _audio_norm = spec, c, audio_norm
while spec.size(-1) < self.spec_len:
spec = torch.cat((spec, _spec), -1)
c = torch.cat((c, _c), -1)
audio_norm = torch.cat((audio_norm, _audio_norm), -1)
start = random.randint(0, spec.size(-1) - self.spec_len)
end = start + self.spec_len
spec = spec[:, start:end]
c = c[:, start:end]
audio_norm = audio_norm[:, start*480:end*480]
'''
if self.use_spk:
return c, spec, audio_norm, spk
else:
return c, spec, audio_norm
def __getitem__(self, index):
return self.get_audio(self.audiopaths[index][0])
def __len__(self):
return len(self.audiopaths)
class TextAudioSpeakerCollate():
""" Zero-pads model inputs and targets
"""
def __init__(self, hps):
self.hps = hps
self.use_sr = hps.train.use_sr
self.use_spk = hps.model.use_spk
def __call__(self, batch):
"""Collate's training batch from normalized text, audio and speaker identities
PARAMS
------
batch: [text_normalized, spec_normalized, wav_normalized, sid]
"""
# Right zero-pad all one-hot text sequences to max input length
_, ids_sorted_decreasing = torch.sort(
torch.LongTensor([x[0].size(1) for x in batch]),
dim=0, descending=True)
max_spec_len = max([x[1].size(1) for x in batch])
max_wav_len = max([x[2].size(1) for x in batch])
spec_lengths = torch.LongTensor(len(batch))
wav_lengths = torch.LongTensor(len(batch))
if self.use_spk:
spks = torch.FloatTensor(len(batch), batch[0][3].size(0))
else:
spks = None
c_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len)
spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
c_padded.zero_()
spec_padded.zero_()
wav_padded.zero_()
for i in range(len(ids_sorted_decreasing)):
row = batch[ids_sorted_decreasing[i]]
c = row[0]
c_padded[i, :, :c.size(1)] = c
spec = row[1]
spec_padded[i, :, :spec.size(1)] = spec
spec_lengths[i] = spec.size(1)
wav = row[2]
wav_padded[i, :, :wav.size(1)] = wav
wav_lengths[i] = wav.size(1)
if self.use_spk:
spks[i] = row[3]
spec_seglen = spec_lengths[-1] if spec_lengths[-1] < self.hps.train.max_speclen + 1 else self.hps.train.max_speclen + 1
wav_seglen = spec_seglen * 480
spec_padded, ids_slice = commons.rand_spec_segments(spec_padded, spec_lengths, spec_seglen)
wav_padded = commons.slice_segments(wav_padded, ids_slice * 480, wav_seglen)
c_padded = commons.slice_segments(c_padded, ids_slice, spec_seglen)[:,:,:-1]
spec_padded = spec_padded[:,:,:-1]
wav_padded = wav_padded[:,:,:-480]
if self.use_spk:
return c_padded, spec_padded, wav_padded, spks
else:
return c_padded, spec_padded, wav_padded
class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
"""
Maintain similar input lengths in a batch.
Length groups are specified by boundaries.
Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}.
It removes samples which are not included in the boundaries.
Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded.
"""
def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True):
super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
self.lengths = dataset.lengths
self.batch_size = batch_size
self.boundaries = boundaries
self.buckets, self.num_samples_per_bucket = self._create_buckets()
self.total_size = sum(self.num_samples_per_bucket)
self.num_samples = self.total_size // self.num_replicas
def _create_buckets(self):
buckets = [[] for _ in range(len(self.boundaries) - 1)]
for i in range(len(self.lengths)):
length = self.lengths[i]
idx_bucket = self._bisect(length)
if idx_bucket != -1:
buckets[idx_bucket].append(i)
for i in range(len(buckets) - 1, 0, -1):
if len(buckets[i]) == 0:
buckets.pop(i)
self.boundaries.pop(i+1)
num_samples_per_bucket = []
for i in range(len(buckets)):
len_bucket = len(buckets[i])
total_batch_size = self.num_replicas * self.batch_size
rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size
num_samples_per_bucket.append(len_bucket + rem)
return buckets, num_samples_per_bucket
def __iter__(self):
# deterministically shuffle based on epoch
g = torch.Generator()
g.manual_seed(self.epoch)
indices = []
if self.shuffle:
for bucket in self.buckets:
indices.append(torch.randperm(len(bucket), generator=g).tolist())
else:
for bucket in self.buckets:
indices.append(list(range(len(bucket))))
batches = []
for i in range(len(self.buckets)):
bucket = self.buckets[i]
len_bucket = len(bucket)
ids_bucket = indices[i]
num_samples_bucket = self.num_samples_per_bucket[i]
# add extra samples to make it evenly divisible
rem = num_samples_bucket - len_bucket
ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)]
# subsample
ids_bucket = ids_bucket[self.rank::self.num_replicas]
# batching
for j in range(len(ids_bucket) // self.batch_size):
batch = [bucket[idx] for idx in ids_bucket[j*self.batch_size:(j+1)*self.batch_size]]
batches.append(batch)
if self.shuffle:
batch_ids = torch.randperm(len(batches), generator=g).tolist()
batches = [batches[i] for i in batch_ids]
self.batches = batches
assert len(self.batches) * self.batch_size == self.num_samples
return iter(self.batches)
def _bisect(self, x, lo=0, hi=None):
if hi is None:
hi = len(self.boundaries) - 1
if hi > lo:
mid = (hi + lo) // 2
if self.boundaries[mid] < x and x <= self.boundaries[mid+1]:
return mid
elif x <= self.boundaries[mid]:
return self._bisect(x, lo, mid)
else:
return self._bisect(x, mid + 1, hi)
else:
return -1
def __len__(self):
return self.num_samples // self.batch_size
Make sure that before you run this you remove all spec.pt files from ur dataset
find . -name "*.spec.pt" -type f -delete
Well it runs and starts, but after 300 steps it errors out with this
RuntimeError: The expanded size of the tensor (298) must match the existing size (299) at non-singleton dimension 1. Target sizes: [1024, 298]. Tensor sizes: [1024, 299]
Well it runs and starts, but after 300 steps it errors out with this
RuntimeError: The expanded size of the tensor (298) must match the existing size (299) at non-singleton dimension 1. Target sizes: [1024, 298]. Tensor sizes: [1024, 299]
I think that means one of your 24khz files doesnt match the 16khz version. Can you show me your downsampling scripts?
Sure
import os
import argparse
import librosa
import numpy as np
from multiprocessing import Pool, cpu_count
from scipy.io import wavfile
from tqdm import tqdm
def process(wav_name):
# speaker 's5', 'p280', 'p315' are excluded,
speaker = wav_name[:4]
wav_path = os.path.join("VCTK", speaker, wav_name)
if os.path.exists(wav_path) and ('.wav' in wav_path or '_mic2.flac' in wav_path):
os.makedirs(os.path.join("./dataset/vctk-24k", speaker), exist_ok=True)
wav, sr = librosa.load(wav_path,sr=48000)
wav, index = librosa.effects.trim(wav, top_db=20)
peak = np.abs(wav).max()
if peak > 1.0:
wav = 0.98 * wav / peak
wav1 = librosa.resample(wav, orig_sr=sr, target_sr=24000)
save_name = wav_name.replace("_mic2.flac", ".wav")
save_path1 = os.path.join("./dataset/vctk-24k", speaker, save_name)
wavfile.write(
save_path1,
24000,
(wav1 * np.iinfo(np.int16).max).astype(np.int16)
)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--sr1", type=int, default=24000, help="sampling rate")
parser.add_argument("--in_dir", type=str, default="./VCTK", help="path to source dir")
parser.add_argument("--out_dir1", type=str, default="./dataset/vctk-24k", help="path to target dir")
args = parser.parse_args()
pool = Pool(processes=cpu_count()-2)
for speaker in os.listdir("VCTK"):
spk_dir = os.path.join("VCTK", speaker)
if os.path.isdir(spk_dir):
for _ in tqdm(pool.imap_unordered(process, os.listdir(spk_dir))):
pass
The 16/22 is basically the same
def process(wav_name):
# speaker 's5', 'p280', 'p315' are excluded,
speaker = wav_name[:4]
wav_path = os.path.join("VCTK", speaker, wav_name)
if os.path.exists(wav_path) and ('.wav' in wav_path or '_mic2.flac' in wav_path):
os.makedirs(os.path.join("./dataset/vctk-16k", speaker), exist_ok=True)
os.makedirs(os.path.join("./dataset/vctk-22k", speaker), exist_ok=True)
wav, sr = librosa.load(wav_path)
wav, _ = librosa.effects.trim(wav, top_db=20)
peak = np.abs(wav).max()
if peak > 1.0:
wav = 0.98 * wav / peak
wav1 = librosa.resample(wav, orig_sr=sr, target_sr=16000)
wav2 = librosa.resample(wav, orig_sr=sr, target_sr=22050)
save_name = wav_name.replace("_mic2.flac", ".wav")
save_path1 = os.path.join("./dataset/vctk-16k", speaker, save_name)
save_path2 = os.path.join("./dataset/vctk-22k", speaker, save_name)
wavfile.write(
save_path1,
16000,
(wav1 * np.iinfo(np.int16).max).astype(np.int16)
)
wavfile.write(
save_path2,
22050,
(wav2 * np.iinfo(np.int16).max).astype(np.int16)
)
The 16/22 is basically the same
def process(wav_name): # speaker 's5', 'p280', 'p315' are excluded, speaker = wav_name[:4] wav_path = os.path.join("VCTK", speaker, wav_name) if os.path.exists(wav_path) and ('.wav' in wav_path or '_mic2.flac' in wav_path): os.makedirs(os.path.join("./dataset/vctk-16k", speaker), exist_ok=True) os.makedirs(os.path.join("./dataset/vctk-22k", speaker), exist_ok=True) wav, sr = librosa.load(wav_path) wav, _ = librosa.effects.trim(wav, top_db=20) peak = np.abs(wav).max() if peak > 1.0: wav = 0.98 * wav / peak wav1 = librosa.resample(wav, orig_sr=sr, target_sr=16000) wav2 = librosa.resample(wav, orig_sr=sr, target_sr=22050) save_name = wav_name.replace("_mic2.flac", ".wav") save_path1 = os.path.join("./dataset/vctk-16k", speaker, save_name) save_path2 = os.path.join("./dataset/vctk-22k", speaker, save_name) wavfile.write( save_path1, 16000, (wav1 * np.iinfo(np.int16).max).astype(np.int16) ) wavfile.write( save_path2, 22050, (wav2 * np.iinfo(np.int16).max).astype(np.int16) )
Huh, that is quite odd. I don't see anything off about this. Maybe in the second script just change it from 22 to 24 since 22 isn't used anyway I think. So that it saves both in one script.
The 16/22 is basically the same
def process(wav_name): # speaker 's5', 'p280', 'p315' are excluded, speaker = wav_name[:4] wav_path = os.path.join("VCTK", speaker, wav_name) if os.path.exists(wav_path) and ('.wav' in wav_path or '_mic2.flac' in wav_path): os.makedirs(os.path.join("./dataset/vctk-16k", speaker), exist_ok=True) os.makedirs(os.path.join("./dataset/vctk-22k", speaker), exist_ok=True) wav, sr = librosa.load(wav_path) wav, _ = librosa.effects.trim(wav, top_db=20) peak = np.abs(wav).max() if peak > 1.0: wav = 0.98 * wav / peak wav1 = librosa.resample(wav, orig_sr=sr, target_sr=16000) wav2 = librosa.resample(wav, orig_sr=sr, target_sr=22050) save_name = wav_name.replace("_mic2.flac", ".wav") save_path1 = os.path.join("./dataset/vctk-16k", speaker, save_name) save_path2 = os.path.join("./dataset/vctk-22k", speaker, save_name) wavfile.write( save_path1, 16000, (wav1 * np.iinfo(np.int16).max).astype(np.int16) ) wavfile.write( save_path2, 22050, (wav2 * np.iinfo(np.int16).max).astype(np.int16) )
Huh, that is quite odd. I don't see anything off about this. Maybe in the second script just change it from 22 to 24 since 22 isn't used anyway I think. So that it saves both in one script.
nvm preprocess_sr uses 22 khz
@OlaWod
Alright I have figured out the problem......
The original preprocess is incorrect, for both the 24 and 16khz.
so the problem is this line here wav, sr = librosa.load(wav_path)
Librosa.load load the file with a default sampling rate of 22khz.
From the librosa documentation
librosa.load
Load an audio file as a floating point time series.
Audio will be automatically resampled to the given rate (default sr=22050).
To preserve the native sampling rate of the file, use sr=None.
so what happens is the audio is loaded with only 22khz, in both the 16 and 24khz preprocess. This is theoretically fine when doing the 16khz preprocess, but obviously this is a problem when trying to save at 24khz. I noticed this when I first ran the preprocess already and had changed the loading of the wav file, as i noticed that the output of the downsampled files was blank above 10khz. So the 24khz files where sampled at 24khz, but only had the audio information of a 22khz file. See below image of 2 24khz files. Top file was loaded with wav, sr = librosa.load(wav_path)
Bottom file was loaded with wav, sr = librosa.load(wav_path, sr=None)
Can see the upper freq range is blank on the file that used the default load. So this is a problem for trying to train a 24khz model as you are actually training on a 22khz file. The other problem is the way the trim feature works, as it changes based on the sample rate the file is loaded at. So I said it was theoretically fine for the 16khz files, but it isn't because of the trim feature. So if you look at the 2 files, you can see that the file loaded with the correct SR is slightly shorter then the one loaded with 22khz. Well the same thing happens with the 16khz files, they are longer then they should be.
This image shows a 16khz files from the original preprocess, along with the 2 versions of the 24khz files.
So here you can see the the files loaded at the 22khz are longer then the one loaded at 48khz. So this length variance is what causes the tensor size errors, but also look closely at the circled section, the trimming on the 16khz is stretching and mashing words together. You no longer have clean breaks on formants between sounds, everything gets blurred together, this hurts the intelligibility of the model.
We see this is even more apparent when we compare to the original file below.
So the phrase is "please call stella", what we see here is that the trim has removed the beginning of the word please, it has removed the P from please. This explains why the model has a hard time with S and P pronounciations, as the trim on preprocess removes all of these sounds. So if you listen to the 2 files below, you can clearly hear how the trimmed files has a very weak P sound.
https://drive.google.com/file/d/1r-pra0feL3aWpWUQDWxGJ7QNf9f4xxj3/view?usp=share_link
https://drive.google.com/file/d/1q4m_to27nYUqpu9EOWpi7Ol5UwsqF2PX/view?usp=sharing
So the trim function needs to be made less aggressive or left out completely as its removing key sounds from words that the model needs to be able to learn properly.
and the preprocess scripts for both 16 and 24khz need to be adjusted to use wav, sr = librosa.load(wav_path, sr=None)
So that the files load with the correct sampling rate.
After having rerun the preprocess for both the 16 and 24khz with sr=None the 24khz training now works. Of course when resuming the original model the mel_loss is over 30, as the new files actually contain audio data above 10khz and the existing model was trained on files with the blank section. Olawod, you may want to train a new 24khz model.
This code here gives a true 24khz file, and the top db of 30 seems to retain the plosives in the speech. With the handful of files I tested anyway.... Might be some files where 30 is still insufficient, but at 35 its not removing anything so..... As good as its gonna get I guess
def process(wav_name):
# speaker 's5', 'p280', 'p315' are excluded,
speaker = wav_name[:4]
wav_path = os.path.join("VCTK", speaker, wav_name)
if os.path.exists(wav_path) and ('.wav' in wav_path or '_mic2.flac' in wav_path):
os.makedirs(os.path.join("./dataset/vctk-24k", speaker), exist_ok=True)
wav, sr = librosa.load(wav_path,sr=None)
wav, index = librosa.effects.trim(wav, top_db=30)
peak = np.abs(wav).max()
if peak > 1.0:
wav = 0.98 * wav / peak
wav1 = librosa.resample(wav, orig_sr=sr, target_sr=24000)
save_name = wav_name.replace("_mic2.flac", ".wav")
save_path1 = os.path.join("./dataset/vctk-24k", speaker, save_name)
wavfile.write(
save_path1,
24000,
(wav1 * np.iinfo(np.int16).max).astype(np.int16)
)
thank you for your effort and experiments!
for the downsampling issue i think i've fixed it: link
a lazy way to get 24k wav with original 22k trimed index โ
and i agree that top_db=20 is too aggressive, which removes some sounds and so makes some pronunciations bad. thank you for pointing out this!
The fix you suggested there does not work. It results in overflow issues
RuntimeWarning: overflow encountered in long_scalars
wav1 = wav1[int(index[0]*24000/22050): int(index[1]*24000/22050)]
All of the outputs are only 1-2 seconds long and much shorter then the 16k files.
Best solution is to downsample all files at the same time using the code I posted above but with all sample rates. This results in all necessary files being created, and all of the file lengths matching.
the problem is this here in dautautils
lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
It is inferring the audio length from the filesize. If the header length of the wav changes then this will break, Workaround is to make sure all wavs are written by the same process that ensures a consistent header length.
so running this here is the best solution at the moment
def process(wav_name):
# speaker 's5', 'p280', 'p315' are excluded,
speaker = wav_name[:4]
wav_path = os.path.join("VCTK", speaker, wav_name)
if os.path.exists(wav_path) and ('.wav' in wav_path or '_mic2.flac' in wav_path):
os.makedirs(os.path.join("./dataset/vctk-16k", speaker), exist_ok=True)
os.makedirs(os.path.join("./dataset/vctk-22k", speaker), exist_ok=True)
os.makedirs(os.path.join("./dataset/vctk-48k", speaker), exist_ok=True)
wav, sr = librosa.load(wav_path,sr=None)
wav, _ = librosa.effects.trim(wav, top_db=30)
peak = np.abs(wav).max()
if peak > 1.0:
wav = 0.98 * wav / peak
wav1 = librosa.resample(wav, orig_sr=sr, target_sr=16000)
wav2 = librosa.resample(wav, orig_sr=sr, target_sr=22050)
wav3 = librosa.resample(wav, orig_sr=sr, target_sr=24000)
save_name = wav_name.replace("_mic2.flac", ".wav")
save_path1 = os.path.join("./dataset/vctk-16k", speaker, save_name)
save_path2 = os.path.join("./dataset/vctk-22k", speaker, save_name)
save_path3 = os.path.join("./dataset/vctk-24k", speaker, save_name)
wavfile.write(
save_path1,
16000,
(wav1 * np.iinfo(np.int16).max).astype(np.int16)
)
wavfile.write(
save_path2,
22050,
(wav2 * np.iinfo(np.int16).max).astype(np.int16)
)
wavfile.write(
save_path3,
24000,
(wav3 * np.iinfo(np.int16).max).astype(np.int16)
)
yes your def process(wav_name): code is the best solution if process from scratch. i'm just lazy to reprocess all those files, i want to use my old processed files, so i get the trimed index from 22khz wav (which is what my old process did), and apply the index to 24khz wav. and this works fine for me, i did not encounter any errors, and training with these 24khz wavs is also going fine.