How can I add a sample voice and a seed, like in the webUI, to this script?
Closed this issue · 6 comments
I prefer to use a script and the CLI to generate audio with ChatTTS rather than opening the webUI, and I want these features in my script:
The ability to supply a sample audio file, and to set and view the specific seed used to generate the audio.
This is the script I use (taken mostly from the ChatTTS repository documentation):
import ChatTTS
import torch
import torchaudio
# Create the ChatTTS chat object and load it with compilation disabled.
chat = ChatTTS.Chat()
chat.load(compile=False) # Set to True for better performance
# Text to synthesize. The newlines of the triple-quoted literal are removed,
# so the two lines are joined without a separator.
inputs_en = """
This is chat T T S voice, this is an example of a laugh [laugh]
now an example of a pause [lbreak] and now an example of ending a sentence.[lbreak]
""".replace('\n', '') # English is still experimental.
# Prompt of control tokens used for text refinement.
# [oral_1] hints oral strength level 1 to the model (per the reply in this
# thread); the laugh_/break_ tokens presumably set analogous levels — TODO confirm.
params_refine_text = ChatTTS.Chat.RefineTextParams(
prompt='[oral_1][laugh_2][break_6]',
)
# Run inference; only the refine-text parameters are customized here.
audio_array_en = chat.infer(inputs_en, params_refine_text=params_refine_text)
# Write the result to a WAV file, passing 24000 as the sample rate.
# NOTE(review): this assumes chat.infer returned a single NumPy array that
# torch.from_numpy accepts — confirm against the installed ChatTTS version.
torchaudio.save("self_introduction_output.wav", torch.from_numpy(audio_array_en), 24000)
How can I edit this so that I can specify a sample audio file location and a specific seed?
I believe the seed determines whether you get a male or a female voice, so it is important.
Also, does anyone know what "oral_1" is there? I'm not sure where to add it or what it does.
See the notebooks in the `example` folder for more command-line details. The seed is passed through the `manual_seed` parameter of `ChatTTS.Chat.InferCodeParams`; just refer to the webui code. `[oral_1]` gives the model the hint of oral strength level 1.
See the notebooks in the `example` folder for more command-line details. The seed is passed through the `manual_seed` parameter of `ChatTTS.Chat.InferCodeParams`; just refer to the webui code. `[oral_1]` gives the model the hint of oral strength level 1.
Thank you for the answer.
Yes, I checked the example files for the webUI and the command line.
But I still cannot properly use a sample .wav file for the inference.
I tried to come up with a script, but it does NOT take my .wav audio file into account when doing inference.
This is the script in question:
#importing
import ChatTTS
import torch
import torchaudio
import numpy as np
from typing import Optional
from tools.audio import float_to_int16, load_audio
# Create the ChatTTS chat object and load it with compilation disabled.
chat = ChatTTS.Chat()
chat.load(compile=False) # Set to True for better performance
###using an 7 second audio file as a sample (DOES NOT WORK)
def on_upload_sample_(sample_audio_input: Optional[str]) -> np.ndarray:
    """Sample a speaker from an audio file so it can be used as a voice prompt.

    Args:
        sample_audio_input: Path of the sample audio file, or ``None`` when
            no sample was provided.

    Returns:
        An empty array when ``sample_audio_input`` is ``None``; otherwise
        whatever ``chat.sample_audio_speaker`` returns for the loaded audio.
    """
    if sample_audio_input is None:
        return np.array([])
    # 24000 is forwarded to load_audio; presumably the target sample rate,
    # matching the rate used when saving the output — TODO confirm.
    sample_audio = load_audio(sample_audio_input, 24000)
    # NOTE(review): the value returned below comes straight from
    # chat.sample_audio_speaker; confirm it really is an ndarray as annotated.
    spk_smp = chat.sample_audio_speaker(sample_audio)
    # Drop the reference to the loaded audio once the speaker is sampled.
    del sample_audio
    return spk_smp
# Alias the sampling function and run it once on the sample file.
# NOTE(review): `result` is never used again below.
voice = on_upload_sample_
result = voice(r"C:\Users\Desktop\ChatTTS\MY_7_SECOND_AUDIO_FILE_FOR_INFERENCE.wav")
###
# Inference text; the newlines of the literal are removed before use.
inputs_en = """
This is chat T T S voice, this is an example of a laugh [laugh]
now an example of a pause [lbreak] and now an example of ending a sentence.[lbreak]
""".replace('\n', '') # English is still experimental.
# Prompt of control tokens used for text refinement.
params_refine_text = ChatTTS.Chat.RefineTextParams(
prompt='[laugh_2][break_6]',
)
###
# Here I got multiple numpy errors when getting an output, and
# went through a convoluted solution that probably breaks the whole process.
###
# Convert every item returned by chat.infer to a NumPy array (np.array does
# not reshape, so nothing here forces a [channels, samples] layout).
# NOTE(review): the second positional argument is the function object
# on_upload_sample_ itself, not the `result` computed above. The reply in this
# thread says the sample has to be passed through params_infer_code instead.
audio_arrays = [np.array(arr) for arr in chat.infer(inputs_en, on_upload_sample_, params_refine_text=params_refine_text)]
# Determine the maximum length along the last axis.
max_length = max(arr.shape[-1] for arr in audio_arrays)
# Zero-pad every array at the end of its last axis up to max_length;
# all leading axes are left unpadded.
padded_audio_arrays = [
np.pad(
arr,
[(0, 0)] * (arr.ndim - 1) + [(0, max_length - arr.shape[-1])],
mode='constant'
)
for arr in audio_arrays
]
# Join the padded arrays end to end along the last axis.
audio_array_en = np.concatenate(padded_audio_arrays, axis=-1)
# Save to a WAV file, passing 24000 as the sample rate. torch.from_numpy keeps
# the array's shape as-is; a 2D [channels, samples] shape is not enforced here.
torchaudio.save("output.wav", torch.from_numpy(audio_array_en), 24000)
I realize it's a big mess after my attempt to use a .wav audio file.
Any suggestions on how I can load a sample .wav audio file for inference?
You need to pass the `params_infer_code` param to `infer`.
You need to pass the `params_infer_code` param to `infer`.
Thanks for the help; sadly, I still cannot get it to work properly.
I tried to simplify my code and pass params_infer_code, but to no avail.
import ChatTTS
import torch
import torchaudio
from typing import Optional
from tools.audio import float_to_int16, load_audio
# Create the ChatTTS chat object and load it with compilation disabled.
chat = ChatTTS.Chat()
chat.load(compile=False) # Set to True for better performance
# Define the function
def on_upload_sample(sample_audio_input: Optional[str]) -> str:
    """Sample a speaker from an audio file so it can be used as a voice prompt.

    Args:
        sample_audio_input: Path of the sample audio file, or ``None`` when
            no sample was provided.

    Returns:
        An empty string when ``sample_audio_input`` is ``None``; otherwise
        whatever ``chat.sample_audio_speaker`` returns for the loaded audio.
    """
    if sample_audio_input is None:
        return ""
    # 24000 is forwarded to load_audio; presumably the target sample rate,
    # matching the rate used when saving the output — TODO confirm.
    sample_audio = load_audio(sample_audio_input, 24000)
    spk_smp = chat.sample_audio_speaker(sample_audio)
    # Drop the reference to the loaded audio once the speaker is sampled.
    del sample_audio
    return spk_smp
# Sample the speaker from the reference recording.
sample_spk = on_upload_sample(r"C:\Audio_example.wav")
# NOTE(review): the sampled speaker is passed as spk_emb here. The traceback
# reported for this script fails inside _decode_spk_emb (lzma.decompress:
# "Corrupt input data"), and the maintainer's reply says to use spk_smp for the
# sampled audio code together with txt_smp, the exact transcript of the audio.
params_infer_code = ChatTTS.Chat.InferCodeParams(
spk_emb = sample_spk, # add sampled speaker
temperature = .3, # using custom temperature
top_P = 0.7, # top P decode
top_K = 20, # top K decode
)
# Prompt of control tokens used for text refinement.
params_refine_text = ChatTTS.Chat.RefineTextParams(
prompt='[oral_1][laugh_2][break_6]',
)
# Text with word-level control tokens already embedded.
text = 'What is [uv_break]your favorite english food?[laugh][lbreak][uv_break]'
# First inference call (the one the reported traceback points at).
wavs = chat.infer(
text,
params_refine_text=params_refine_text,
params_infer_code=params_infer_code,
)
# Second inference call with skip_refine_text=True.
# NOTE(review): this rebinds `wavs`, so the result of the first call is discarded.
wavs = chat.infer(text, skip_refine_text=True, params_refine_text=params_refine_text, params_infer_code=params_infer_code)
# Save the first returned waveform with a leading axis added by unsqueeze(0),
# passing 24000 as the sample rate.
torchaudio.save("word_level_output.wav", torch.from_numpy(wavs[0]).unsqueeze(0), 24000)
Everything looks fine to me, but it obviously is not.
When I run the script, I get a Python library error involving the tokenizer, among other things.
Traceback (most recent call last):
File "C:\ChatTTS\TTS_try6.py", line 42, in <module>
wavs = chat.infer(
File "C:\ChatTTS\core.py", line 230, in infer
return next(res_gen)
File "C:\ChatTTS\ChatTTS\core.py", line 388, in _infer
for result in self._infer_code(
File "C:\ChatTTS\venv\lib\site-packages\torch\utils\_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "C:\ChatTTS\ChatTTS\core.py", line 560, in _infer_code
self.tokenizer.apply_spk_emb(
File "C:\ChatTTS\venv\lib\site-packages\torch\utils\_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "C:\ChatTTS\ChatTTS\model\tokenizer.py", line 153, in apply_spk_emb
self._decode_spk_emb(spk_emb),
File "C:\ChatTTS\ChatTTS\model\tokenizer.py", line 134, in _decode_spk_emb
lzma.decompress(
File "C:\Python310\lib\lzma.py", line 343, in decompress
res = decomp.decompress(data)
_lzma.LZMAError: Corrupt input data
I wonder what I am doing wrong.
Thanks for the help nonetheless!
You should use `spk_smp` and `txt_smp`: the former is the sampled audio code, and the latter is the exact transcript of your audio.
This issue was closed because it has been inactive for 15 days since being marked as stale.