Are there plans to enable training in other languages, or to extend support for additional languages?
I see that Chatterbox multilingual was updated today; do we need to update the weights too?
Thank you for your open-source work and effort! I tested the TTS speed in Chinese, and it currently takes around 2–5 seconds to generate. It seems difficult to achieve the 200ms performance.
You can implement streaming, and you can train on your own dataset for other languages.
Streaming code change:
-> chatterbox/src/chatterbox/vc.py
def generate(
    self,
    audio,
    target_voice_path=None,
):
    # ... existing generate() body unchanged ...
# add these two methods (make sure numpy is imported as np at the top of vc.py)
def _to_numpy_1xn(self, audio):
    """
    Input:  list[float] | np.ndarray | torch.Tensor
    Output: np.ndarray of shape (1, N), dtype=float32
    """
    if isinstance(audio, torch.Tensor):
        x = audio.detach().cpu().numpy()
    else:
        x = np.asarray(audio, dtype=np.float32)
    if x.ndim == 1:
        x = x[None, :]  # (N,) -> (1, N)
    elif x.ndim == 2 and x.shape[0] != 1 and x.shape[1] == 1:
        x = x.T  # (N, 1) -> (1, N)
    elif x.ndim != 2 or x.shape[0] != 1:
        raise ValueError(f"Unsupported audio shape {x.shape}, expected [N] or [1, N].")
    return x.astype(np.float32, copy=False)
def generateArray(
    self,
    audio,
    target_voice_path=None,
):
    if target_voice_path:
        self.set_target_voice(target_voice_path)
    else:
        assert self.ref_dict is not None, "Please `prepare_conditionals` first or specify `target_voice_path`"

    with torch.inference_mode():
        # The caller passes raw float samples already at 16 kHz, so no librosa.load here.
        audio_16 = self._to_numpy_1xn(audio)
        # _to_numpy_1xn already returns shape (1, N), so no extra batch dim is added.
        audio_16 = torch.from_numpy(audio_16).float().to(self.device)
        s3_tokens, _ = self.s3gen.tokenizer(audio_16)
        wav, _ = self.s3gen.inference(
            speech_tokens=s3_tokens,
            ref_dict=self.ref_dict,
        )
        wav = wav.squeeze(0).detach().cpu().numpy()
        # Watermarking is skipped here; re-enable it if you need it:
        # watermarked_wav = self.watermarker.apply_watermark(wav, sample_rate=self.sr)
        # return torch.from_numpy(watermarked_wav).unsqueeze(0)
        return torch.from_numpy(wav).unsqueeze(0)
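With generateArray in place, a caller can approximate streaming by feeding fixed-size chunks of 16 kHz samples and consuming each converted chunk as it returns. This is only a sketch under my own assumptions: chunk_size, stream_convert, and the surrounding names are not part of the repo, and naive per-chunk conversion may produce audible seams at chunk boundaries.

import numpy as np

CHUNK_SECONDS = 1.0
SR_IN = 16000  # generateArray expects 16 kHz samples
chunk_len = int(CHUNK_SECONDS * SR_IN)

def stream_convert(model, samples, target_voice_path=None):
    """Yield converted audio chunks as 1-D float32 numpy arrays."""
    for start in range(0, len(samples), chunk_len):
        chunk = samples[start:start + chunk_len]
        # Pass target_voice_path only on the first chunk if set_target_voice is expensive.
        wav = model.generateArray(chunk, target_voice_path=target_voice_path)  # torch [1, N]
        target_voice_path = None
        yield wav.squeeze(0).numpy().astype(np.float32)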
# Instead of model.generate, call model.generateArray.
# Assumes the usual Flask server setup: `from flask import request, jsonify`,
# `import numpy as np`, and `app`, `model`, `TARGET_VOICE_PATH` defined elsewhere.
@app.route('/generatearr', methods=['POST'])
def generatearr():
    """
    Expected JSON:
    {
        "audio": [0.0, 0.01, ...],       // float[]
        "target_voice": "speakerA.wav"   // optional; required if prepare_conditionals was not called
    }
    """
    data = request.get_json(force=True, silent=False)
    if data is None or "audio" not in data:
        return jsonify({"error": "JSON body must include 'audio' float array"}), 400

    audio_np = np.asarray(data["audio"], dtype=np.float32)
    target_voice = data.get("target_voice", None)

    # generateArray accepts a float array directly, so it can be passed through as-is.
    try:
        wav_tensor = model.generateArray(
            audio=audio_np,  # float[]
            target_voice_path=(TARGET_VOICE_PATH + target_voice) if target_voice else None
        )  # torch [1, N]
    except Exception as e:
        return jsonify({"error": f"generation failed: {str(e)}"}), 500

    # Return the output as a float[] (JSON).
    wav_np = wav_tensor.squeeze(0).cpu().numpy().astype(np.float32)  # [N]
    return jsonify(wav_np.tolist())
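For completeness, a client can exercise the endpoint like this; the URL, port, and voice filename are assumptions about your deployment, not part of the repo.

import requests
import numpy as np

# Hypothetical client for the /generatearr route above; adjust the URL to your server.
samples = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz as a smoke test
resp = requests.post(
    "http://localhost:5000/generatearr",
    json={"audio": samples.tolist(), "target_voice": "speakerA.wav"},
    timeout=60,
)
resp.raise_for_status()
wav = np.asarray(resp.json(), dtype=np.float32)  # converted audio as float32 samples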