tazz4843/whisper-rs

Different results on different platforms

IvanProg00 opened this issue · 1 comments

I've used the base model https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin.
I've tested on macOS Sonoma (Apple M2 Pro) and Debian 12 (Intel x86-64).

With the same code I get different transcribed text and different token log probabilities (plog) on the two platforms.

My code:

use std::env;

use hound::WavReader;
use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters};

fn main() {
    let args: Vec<String> = env::args().collect();

    // load a context and model
    let ctx = WhisperContext::new_with_params("ggml-base.bin", WhisperContextParameters::default())
        .expect("failed to load model");

    let mut state = ctx.create_state().unwrap();
    // let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 1 });
    let mut params = FullParams::new(SamplingStrategy::BeamSearch { beam_size: 5, patience: 1.0 });

    params.set_translate(false);
    params.set_detect_language(false);
    params.set_language(Some("es"));
    params.set_suppress_non_speech_tokens(true);
    params.set_suppress_blank(true);
    params.set_single_segment(true);

    params.set_print_special(false);
    params.set_print_progress(false);
    params.set_print_realtime(false);
    params.set_print_timestamps(false);

    // tokenize the prompt and feed the tokens to the decoder as initial context
    let prompt = args.get(2).expect("args [file] [prompt] are required");
    let prompt = format!("[system] {} [user] ", prompt);
    // prompt.len() (bytes) is used here as an upper bound on the number of tokens
    let vector = ctx.tokenize(&prompt, prompt.len()).unwrap();
    let tokens = vector.as_slice();
    params.set_tokens(tokens);

    // read 16-bit PCM samples from the WAV file and convert them to f32
    let path = args.get(1).unwrap();
    let mut reader = WavReader::open(path).unwrap();
    let audio = whisper_rs::convert_integer_to_float_audio(
        &reader
            .samples::<i16>()
            .collect::<Result<Vec<_>, hound::Error>>()
            .unwrap(),
    );

    // run the full transcription on the converted audio
    state.full(params, &audio[..]).unwrap();

    println!("===== output =====");

    let phrase = state.full_get_segment_text(0).unwrap();
    println!("phrase: {}", phrase);

    // token ids at or above EOT are special tokens (EOT, SOT, language, timestamps, ...)
    let eot = ctx.token_eot();

    for i in 0..state.full_n_tokens(0).unwrap() {
        let id = state.full_get_token_id(0, i).unwrap();
        let data = state.full_get_token_data(0, i).unwrap();
        let token_text = state.full_get_token_text(0, i).unwrap();
        let is_special = id >= eot;

        if !is_special {
            println!("token: {}", token_text);
            println!("plog: {}", data.plog);
        }
    }
}
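
For reference, the program takes two positional arguments: the path to the WAV file (whisper expects 16 kHz mono; the code above does not resample) and the prompt text.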

I think whisper.cpp has some decent level of hardware nondeterminism, and the two machines are likely using different backends/APIs (e.g. Metal on Apple Silicon vs. the plain CPU path on x86-64), which also influences the result. Going to call this not a bug and, unfortunately, more or less unfixable.
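
For anyone who wants to narrow the gap, here is a minimal sketch of a lower-variance configuration. The function name is made up for illustration, and it assumes your whisper-rs version exposes set_temperature and set_n_threads on FullParams (check the docs for the version you use):

use whisper_rs::{FullParams, SamplingStrategy};

// Sketch: decoding parameters that minimize the controllable sources of
// nondeterminism. Floating-point differences between backends still remain.
fn low_variance_params<'a, 'b>() -> FullParams<'a, 'b> {
    // Greedy decoding with a single candidate removes beam tie-breaking variance.
    let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 1 });
    // Temperature 0.0 makes token selection a pure argmax (assumed setter).
    params.set_temperature(0.0);
    // A single thread avoids differences from parallel reduction order (assumed setter).
    params.set_n_threads(1);
    params.set_language(Some("es"));
    params.set_single_segment(true);
    params
}

Even with these settings, Metal on the M2 and the CPU path on x86-64 will typically produce slightly different logits, so plog values will still not match exactly across machines.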