Realtime streaming Voice clone sample
Opened this issue · 0 comments
karayakar commented
I am sharing the changes needed for real-time voice conversion (VC) from a streamed float[] audio buffer.
-> chatterbox/src/chatterbox/vc.py
def generate(
self,
audio,
target_voice_path=None,
):
......................................
# add these two methods
def _to_numpy_1xn(self, audio):
    """Coerce raw audio samples into a float32 NumPy array of shape (1, N).

    Args:
        audio: A ``list[float]``, ``np.ndarray`` or ``torch.Tensor`` holding
            the samples, laid out as ``[N]``, ``[1, N]`` or ``[N, 1]``.

    Returns:
        ``np.ndarray`` with shape ``(1, N)`` and dtype ``float32``.

    Raises:
        ValueError: If the input has any other shape.
    """
    if isinstance(audio, torch.Tensor):
        # Cast inside torch before converting: NumPy has no equivalent for
        # some torch dtypes (e.g. bfloat16), so ``.numpy()`` would raise.
        x = audio.detach().to(device="cpu", dtype=torch.float32).numpy()
    else:
        x = np.asarray(audio, dtype=np.float32)

    if x.ndim == 1:
        x = x[None, :]  # (N,) -> (1, N)
    elif x.ndim == 2 and x.shape[0] != 1 and x.shape[1] == 1:
        x = x.T  # (N, 1) -> (1, N)
    elif x.ndim != 2 or x.shape[0] != 1:
        raise ValueError(f"Unsupported audio shape {x.shape}, expected [N] or [1,N].")

    return x.astype(np.float32, copy=False)
def generateArray(
    self,
    audio,
    target_voice_path=None,
):
    """Run voice conversion on in-memory audio samples instead of a file.

    Args:
        audio: Source samples as ``list[float]``, ``np.ndarray`` or
            ``torch.Tensor``; normalised through ``self._to_numpy_1xn``.
            NOTE(review): no resampling is done here, so the samples are
            presumably expected at the tokenizer's sample rate — confirm.
        target_voice_path: Optional reference-voice file. When given it is
            handed to ``self.set_target_voice``; otherwise ``self.ref_dict``
            must already be populated.

    Returns:
        A CPU ``torch.Tensor``: the model output with its leading dimension
        squeezed and re-added (``[1, N]`` for a ``[1, N]`` model output).

    Raises:
        AssertionError: If no target voice is available.
        ValueError: If ``audio`` contains no samples (or has a shape that
            ``self._to_numpy_1xn`` rejects).
    """
    if target_voice_path:
        self.set_target_voice(target_voice_path)
    elif self.ref_dict is None:
        # Raised explicitly rather than via ``assert`` so the precondition
        # still holds when Python runs with ``-O`` (which strips asserts).
        raise AssertionError("Please `prepare_conditionals` first or specify `target_voice_path`")

    samples = self._to_numpy_1xn(audio)
    if samples.size == 0:
        # Fail early with a clear message instead of passing an empty
        # buffer to the tokenizer.
        raise ValueError("Cannot convert empty audio: no samples were provided.")

    with torch.inference_mode():
        # NOTE(review): ``samples`` is already (1, N), so the extra leading
        # axis makes this (1, 1, N) — confirm the tokenizer expects that.
        audio_16 = torch.from_numpy(samples).float().to(self.device)[None, ]
        s3_tokens, _ = self.s3gen.tokenizer(audio_16)
        wav, _ = self.s3gen.inference(
            speech_tokens=s3_tokens,
            ref_dict=self.ref_dict,
        )
        wav = wav.squeeze(0).detach().cpu().numpy()
        # NOTE(review): the raw model output is returned; the watermarker
        # calls were commented out in the original snippet.
        return torch.from_numpy(wav).unsqueeze(0)
example_vc.py | implemented with Flask
# Instead of model.generate, call model.generateArray.
@app.route('/generatearr', methods=['POST'])
def generatearr():
    """Convert a JSON array of audio samples and return the result as JSON.

    Expected JSON body::

        {
            "audio": [0.0, 0.01, ...],      // float[] of samples
            "target_voice": "speakerA.wav"  // optional; required when no
                                            // target voice was prepared
        }

    Returns:
        On success, a JSON array of floats (the converted waveform).
        ``({"error": ...}, 400)`` for a malformed request and
        ``({"error": ...}, 500)`` when generation itself fails.
    """
    data = request.get_json(force=True, silent=False)
    # The body must be a JSON object; a list or string would otherwise get
    # past the ``in`` test and crash on ``data["audio"]``.
    if not isinstance(data, dict) or "audio" not in data:
        return jsonify({"error": "JSON body must include 'audio' float array"}), 400

    try:
        audio_np = np.asarray(data["audio"], dtype=np.float32)
    except (TypeError, ValueError):
        # Non-numeric or ragged input is a client error, not a server fault.
        return jsonify({"error": "'audio' must be an array of numbers"}), 400
    if audio_np.size == 0:
        return jsonify({"error": "'audio' must not be empty"}), 400

    target_voice = data.get("target_voice", None)
    target_voice_path = None
    if target_voice:
        # ``target_voice`` is untrusted and gets appended to a filesystem
        # path, so reject any ".." component (either separator style) to
        # keep it from escaping the voice directory.
        if not isinstance(target_voice, str) or ".." in target_voice.replace("\\", "/").split("/"):
            return jsonify({"error": "'target_voice' must be a file name without '..' components"}), 400
        target_voice_path = TARGET_VOICE_PATH + target_voice

    # generateArray() accepts the float array directly, so no temp file or
    # explicit torch conversion is needed here.
    try:
        wav_tensor = model.generateArray(
            audio=audio_np,  # float[]
            target_voice_path=target_voice_path,
        )  # torch [1, N]
    except Exception as e:
        return jsonify({"error": f"generation failed: {str(e)}"}), 500

    # Return the output as a plain float[] (JSON).
    wav_np = wav_tensor.squeeze(0).cpu().numpy().astype(np.float32)  # [N]
    return jsonify(wav_np.tolist())