

MisakaMikoto-o opened this issue · 5 comments





Thank you for providing the information. Could you please assist in enhancing the related functions and contribute to EmotiVoice?

Thank you for providing the information. Could you please assist in enhancing the related functions and contribute to EmotiVoice?

Sure, it's my pleasure. I'll provide my code here for you to refer to.

# Play a local audio file at a different speed
import pyrubberband as pyrb
import soundfile as sf
import simpleaudio as sa
import numpy as np

def play_audio_at_speed(file_path, speed_factor=1.0):
    """Play an audio file at a different speed while keeping its pitch.

    The file is read with ``soundfile``, time-stretched with
    ``pyrubberband``, converted to 16-bit PCM and played through
    ``simpleaudio``. The call blocks until playback has finished.

    Args:
        file_path: Path of the audio file to play.
        speed_factor: Rate passed to ``pyrb.time_stretch``; for example
            1.5 plays the audio at 1.5x speed.
    """
    # Read the samples and the sample rate of the file.
    data, samplerate = sf.read(file_path)

    # Change the speed with pyrubberband without changing the pitch.
    new_data = pyrb.time_stretch(data, samplerate, speed_factor)

    # simpleaudio plays raw PCM, so convert to 16-bit integers (int16).
    # Clip before casting: a sample at or above 1.0 scales to 32768 or
    # more, which does not fit in int16 and would wrap around.
    if new_data.dtype != np.int16:
        new_data = np.clip(new_data * 32768, -32768, 32767).astype(np.int16)

    # soundfile yields a 1-D array for mono input and a
    # (frames, channels) array otherwise; tell simpleaudio which it is.
    num_channels = 1 if new_data.ndim == 1 else new_data.shape[1]

    # Play the processed audio (2 bytes per sample) and wait for it to
    # finish so the caller does not exit while audio is still playing.
    play_obj = sa.play_buffer(
        np.ascontiguousarray(new_data), num_channels, 2, samplerate
    )
    play_obj.wait_done()

# Play the audio at 1.5x speed without changing its pitch.
play_audio_at_speed(
    '/mnt/hgfs/reflection/EmotiVoice/outputs/prompt_tts_open_source_joint/test_audio/audio/g_00140000/16000.wav',
    speed_factor=1.5,
)

In actual use, it can be written like this:

from demo_page import get_models, tts
from frontend import g2p_cn_en, ROOT_DIR, read_lexicon, G2p
from config.joint.config import Config
import pyrubberband as pyrb
import simpleaudio as sa
import numpy as np

# Synthesize a short sentence, slow it down to 0.75x speed without changing
# the pitch, and play it back.

# Sample rate (Hz) used for both time-stretching and playback.
SAMPLE_RATE = 16_000

config = Config()
speakers = config.speakers
models = get_models()
lexicon = read_lexicon(f"{ROOT_DIR}/lexicon/librispeech-lexicon.txt")
g2p = G2p()

content = "hello"
text = g2p_cn_en(content, g2p, lexicon)
# NOTE(review): pyrb.time_stretch expects an array of samples; confirm that
# tts() returns audio samples here and not a file path, despite the name.
path = tts(text, "开心", content, "8051", models)
# Stretch to 0.75x speed.
new_data = pyrb.time_stretch(path, SAMPLE_RATE, 0.75)
# Convert to int16 PCM for simpleaudio. Clip before casting so samples at or
# above 1.0 do not overflow int16 and wrap around.
if new_data.dtype != np.int16:
    new_data = np.clip(new_data * 32768, -32768, 32767).astype(np.int16)
# Play as mono, 2 bytes per sample, and wait so the script does not exit
# before playback has finished.
play_obj = sa.play_buffer(new_data, 1, 2, SAMPLE_RATE)
play_obj.wait_done()