Size compatibility issue when voice cloning
Closed this issue · 1 comment
Hey, I'm getting the following error when trying to run `clone_utterance`: `RuntimeError: The size of tensor a (35) must match the size of tensor b (36) at non-singleton dimension 1`
I have tried this with several different input files, and every input file has an associated transcribed sentence from when it was originally recorded. I have also tried resampling my audio, but that hasn't helped. Any ideas?
```
RuntimeError Traceback (most recent call last)
Cell In[84], line 19
9 sentence = find_sentence(input_file)
11 uc.clone_utterance(
12 input_file,
13 speaker['path'],
(...)
16 filename_of_result=output_file
17 )
---> 19 apply_func_to_all_wavs(input_path, output_path, create_realistic_voice_clones)
File ~/i3_speech_emotion_recognition/utils/utils.py:20, in apply_func_to_all_wavs(input_path, output_path, func)
18 if not os.path.exists(output_path):
19 os.makedirs(output_path)
---> 20 func(input_file, output_file)
Cell In[84], line 11, in create_realistic_voice_clones(input_file, output_file)
8 resample(input_file, input_file)
9 sentence = find_sentence(input_file)
---> 11 uc.clone_utterance(
12 input_file,
13 speaker['path'],
14 sentence,
15 lang='it',
16 filename_of_result=output_file
17 )
File ~/i3_speech_emotion_recognition/voice_cloning/IMS-Toucan/InferenceInterfaces/UtteranceCloner.py:163, in UtteranceCloner.clone_utterance(self, path_to_reference_audio_for_intonation, path_to_reference_audio_for_voice, transcription_of_intonation_reference, filename_of_result, lang)
161 start_sil = torch.zeros([silence_frames_start * 3]).to(self.device) # timestamps are from 16kHz, but now we're using 48kHz, so upsampling required
162 end_sil = torch.zeros([silence_frames_end * 3]).to(self.device) # timestamps are from 16kHz, but now we're using 48kHz, so upsampling required
--> 163 cloned_speech = self.tts(transcription_of_intonation_reference, view=False, durations=duration, pitch=pitch, energy=energy)
164 cloned_utt = torch.cat((start_sil, cloned_speech, end_sil), dim=0).cpu().numpy()
165 if filename_of_result is not None:
File /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File ~/i3_speech_emotion_recognition/voice_cloning/IMS-Toucan/InferenceInterfaces/ToucanTTSInterface.py:157, in ToucanTTSInterface.forward(self, text, view, duration_scaling_factor, pitch_variance_scale, energy_variance_scale, pause_duration_scaling_factor, durations, pitch, energy, input_is_phones, return_plot_as_filepath)
155 with torch.inference_mode():
156 phones = self.text2phone.string_to_tensor(text, input_phonemes=input_is_phones).to(torch.device(self.device))
--> 157 mel, durations, pitch, energy = self.phone2mel(phones,
158 return_duration_pitch_energy=True,
159 utterance_embedding=self.default_utterance_embedding,
160 durations=durations,
161 pitch=pitch,
162 energy=energy,
163 lang_id=self.lang_id,
164 duration_scaling_factor=duration_scaling_factor,
165 pitch_variance_scale=pitch_variance_scale,
166 energy_variance_scale=energy_variance_scale,
167 pause_duration_scaling_factor=pause_duration_scaling_factor)
168 mel = mel.transpose(0, 1)
169 wave = self.mel2wav(mel)
File /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
File ~/i3_speech_emotion_recognition/voice_cloning/IMS-Toucan/InferenceInterfaces/InferenceArchitectures/InferenceToucanTTS.py:307, in ToucanTTS.forward(self, text, durations, pitch, energy, utterance_embedding, return_duration_pitch_energy, lang_id, duration_scaling_factor, pitch_variance_scale, energy_variance_scale, pause_duration_scaling_factor)
300 if lang_id is not None:
301 lang_id = lang_id.unsqueeze(0).to(text.device)
303 before_outs,
304 after_outs,
305 predicted_durations,
306 pitch_predictions,
--> 307 energy_predictions = self._forward(text.unsqueeze(0),
308 text_length,
309 gold_durations=durations,
310 gold_pitch=pitch,
311 gold_energy=energy,
312 utterance_embedding=utterance_embedding.unsqueeze(0) if utterance_embedding is not None else None, lang_ids=lang_id,
313 duration_scaling_factor=duration_scaling_factor,
314 pitch_variance_scale=pitch_variance_scale,
315 energy_variance_scale=energy_variance_scale,
316 pause_duration_scaling_factor=pause_duration_scaling_factor)
317 if return_duration_pitch_energy:
318 return after_outs, predicted_durations, pitch_predictions, energy_predictions
File ~/i3_speech_emotion_recognition/voice_cloning/IMS-Toucan/InferenceInterfaces/InferenceArchitectures/InferenceToucanTTS.py:232, in ToucanTTS._forward(self, text_tensors, text_lengths, gold_durations, gold_pitch, gold_energy, duration_scaling_factor, utterance_embedding, lang_ids, pitch_variance_scale, energy_variance_scale, pause_duration_scaling_factor)
230 embedded_pitch_curve = self.pitch_embed(pitch_predictions.transpose(1, 2)).transpose(1, 2)
231 embedded_energy_curve = self.energy_embed(energy_predictions.transpose(1, 2)).transpose(1, 2)
--> 232 enriched_encoded_texts = encoded_texts + embedded_pitch_curve + embedded_energy_curve
234 # predicting durations for text and upsampling accordingly
235 upsampled_enriched_encoded_texts = self.length_regulator(enriched_encoded_texts, predicted_durations)
RuntimeError: The size of tensor a (35) must match the size of tensor b (36) at non-singleton dimension 1
```
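For completeness, the wrapper around `clone_utterance` (reconstructed from the notebook cells in the traceback above) looks roughly like this; `resample`, `find_sentence`, `speaker`, and `apply_func_to_all_wavs` are my own helpers and variables, and `uc` is an IMS-Toucan `UtteranceCloner` instance:

```python
def create_realistic_voice_clones(input_file, output_file):
    resample(input_file, input_file)       # resample the reference audio in place
    sentence = find_sentence(input_file)   # look up the original transcription

    uc.clone_utterance(
        input_file,                        # reference audio for intonation
        speaker['path'],                   # reference audio for voice
        sentence,                          # transcription of the intonation reference
        lang='it',
        filename_of_result=output_file,
    )

apply_func_to_all_wavs(input_path, output_path, create_realistic_voice_clones)
```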
There seems to be one more phoneme than there are durations. So somehow the number of phoneme inputs to the aligner and the number of phoneme inputs to the TTS must differ. I don't know how that could happen, though; it works for me and I haven't found a way to break it.
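If you want to narrow it down, something along these lines should show whether the two phoneme sequences really differ in length. This is only a sketch: `extract_prosody` is my guess at the name of the internal step that produces the aligner's per-phoneme durations, so check `UtteranceCloner.clone_utterance` for the exact call and signature.

```python
# Rough debugging sketch, not a fix: compare the number of phonemes the TTS
# front-end produces for the transcription with the number of per-phoneme
# durations coming from the aligner. `uc`, `sentence` and `input_file` are the
# objects from the snippet above; `extract_prosody` is an assumed helper name.
phones = uc.tts.text2phone.string_to_tensor(sentence)

duration, pitch, energy, *_ = uc.extract_prosody(sentence, input_file, lang='it')  # assumed helper

print("phonemes from TTS front-end:", phones.shape[0])
print("durations from aligner:     ", duration.shape[0])
# If the two counts differ (e.g. 36 vs. 35), the aligner and the TTS are
# phonemizing the transcription differently -- often an extra punctuation or
# silence token on one side.
```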