Will there be plans to enable training in other languages in the future, or to extend support for additional languages?

Question

Will there be plans to enable training in other languages in the future, or to extend support for additional languages?

Opened this issue 2 months ago · 3 comments

Answer 1 · 2025-09-04T15:47:02.000Z

I see that Chatterbox Multilingual was updated today. Do we need to update the weights too?

Answer 2 · 2025-09-04T16:58:32.000Z

Thank you for your open-source work and effort! I tested the TTS speed in Chinese, and it currently takes around 2–5 seconds to generate. It seems difficult to achieve the 200ms performance.

Answer 3 · 2025-09-05T13:27:36.000Z

Thank you for your open-source work and effort! I tested the TTS speed in Chinese, and it currently takes around 2–5 seconds to generate. It seems difficult to achieve the 200ms performance.

@shao-ssq @Idiotabtcodes

You can implement streaming, and you can train on your own dataset to support other languages.

streaming code change

-> chatterbox/src/chatterbox/vc.py

    def generate(
        self,
        audio,
        target_voice_path=None,
    ):
  ......................................

# add these two methods

    def _to_numpy_1xn(self, audio):
        """
        Normalize raw audio samples to a float32 NumPy array of shape (1, N).

        Args:
            audio: list[float] | np.ndarray | torch.Tensor holding the samples,
                laid out as [N], [1, N] or [N, 1].

        Returns:
            np.ndarray of shape (1, N) with dtype float32.

        Raises:
            ValueError: if the input is neither 1-D, (1, N) nor (N, 1).
        """
        if isinstance(audio, torch.Tensor):
            # Cast on the torch side before exporting: Tensor.numpy() raises for
            # dtypes NumPy cannot represent (e.g. bfloat16).
            x = audio.detach().to(torch.float32).cpu().numpy()
        else:
            x = np.asarray(audio, dtype=np.float32)

        if x.ndim == 1:
            x = x[None, :]           # (N,) -> (1, N)
        elif x.ndim == 2 and x.shape[0] != 1 and x.shape[1] == 1:
            x = x.T                  # (N, 1) -> (1, N)
        elif x.ndim != 2 or x.shape[0] != 1:
            raise ValueError(f"Unsupported audio shape {x.shape}, expected [N] or [1,N].")

        # No copy is made when the array is already float32.
        return x.astype(np.float32, copy=False)


    def generateArray(
        self,
        audio,
        target_voice_path=None,
    ):
        """
        Run the s3gen tokenizer + inference pipeline on in-memory samples.

        Args:
            audio: raw samples in any form accepted by ``_to_numpy_1xn``
                (list of floats, NumPy array or torch tensor).
                NOTE(review): presumably the samples must already be at the
                tokenizer's sample rate, since no resampling happens here -- confirm.
            target_voice_path: optional path handed to ``set_target_voice``;
                when falsy, ``self.ref_dict`` must already be set.

        Returns:
            A CPU ``torch.Tensor``: the inference output with its leading axis
            squeezed and then re-added (the HTTP caller treats it as ``[1, N]``).

        Raises:
            AssertionError: if no target voice is given and ``self.ref_dict`` is None.
        """
        if not target_voice_path:
            assert self.ref_dict is not None, "Please `prepare_conditionals` first or specify `target_voice_path`"
        else:
            self.set_target_voice(target_voice_path)

        with torch.inference_mode():
            samples = self._to_numpy_1xn(audio)
            # _to_numpy_1xn yields (1, N); the extra leading axis makes the
            # tokenizer input (1, 1, N).
            # NOTE(review): confirm the tokenizer expects this rank.
            source = torch.from_numpy(samples).float().to(self.device).unsqueeze(0)

            speech_tokens, _ = self.s3gen.tokenizer(source)
            converted, _ = self.s3gen.inference(
                speech_tokens=speech_tokens,
                ref_dict=self.ref_dict,
            )
            converted_np = converted.squeeze(0).detach().cpu().numpy()

        # NOTE: the result is returned as-is; no watermark is applied on this path.
        return torch.from_numpy(converted_np).unsqueeze(0)

# Instead of model.generate, call model.generateArray.


def _safe_voice_path(voice_name):
    """Return ``TARGET_VOICE_PATH + voice_name`` if it stays inside the voice directory, else None."""
    import os

    if not isinstance(voice_name, str) or "\x00" in voice_name:
        return None

    joined = TARGET_VOICE_PATH + voice_name
    base_dir = os.path.realpath(TARGET_VOICE_PATH)
    resolved = os.path.realpath(joined)
    try:
        inside = resolved != base_dir and os.path.commonpath([base_dir, resolved]) == base_dir
    except ValueError:
        # commonpath raises when the paths cannot be compared (e.g. different drives).
        inside = False
    return joined if inside else None


@app.route('/generatearr', methods=['POST'])
def generatearr():
    """
    Convert a float sample array posted as JSON and return the result as JSON.

    Expected JSON body:
    {
      "audio": [0.0, 0.01, ...],      // float[]
      "target_voice": "speakerA.wav"  // optional; required if the model has no
                                      // prepared conditionals yet
    }

    Responses:
      200 -- JSON array of float samples.
      400 -- body is not a JSON object, "audio" is missing/non-numeric/empty,
             or "target_voice" points outside TARGET_VOICE_PATH.
      500 -- the model raised during generation.
    """
    data = request.get_json(force=True, silent=False)

    # Only a JSON object can carry the expected keys; a bare string or list
    # must be rejected here rather than crash on the lookups below.
    if not isinstance(data, dict) or "audio" not in data:
        return jsonify({"error": "JSON body must include 'audio' float array"}), 400

    # The payload is untrusted: a non-numeric or ragged array is a client error.
    try:
        audio_np = np.asarray(data["audio"], dtype=np.float32)
    except (TypeError, ValueError):
        return jsonify({"error": "'audio' must be an array of numbers"}), 400
    if audio_np.size == 0:
        return jsonify({"error": "'audio' must not be empty"}), 400

    # A falsy target_voice means "use the already prepared voice", as before.
    target_voice = data.get("target_voice", None)
    target_voice_path = None
    if target_voice:
        # The name comes from the client; refuse anything that escapes the
        # voice directory (e.g. "../../etc/passwd").
        target_voice_path = _safe_voice_path(target_voice)
        if target_voice_path is None:
            return jsonify({"error": "invalid 'target_voice'"}), 400

    try:
        wav_tensor = model.generateArray(
            audio=audio_np,
            target_voice_path=target_voice_path,
        )  # torch [1, N]
    except Exception as e:
        return jsonify({"error": f"generation failed: {str(e)}"}), 500

    # Return the output as a flat float[] in JSON.
    wav_np = wav_tensor.squeeze(0).cpu().numpy().astype(np.float32)  # [N]
    return jsonify(wav_np.tolist())