Anush008/fastembed-rs

Falling back to CPU.

vi1world opened this issue · 8 comments

Is anyone else having this issue? How can we use the GPU?

[2023-12-01T14:17:40Z WARN ort::execution_providers] No execution providers registered successfully. Falling back to CPU.

This is my setup code:

```rust
use anyhow::{Context, Result};
use fastembed::{EmbeddingBase, EmbeddingModel, ExecutionProvider, FlagEmbedding, InitOptions};
use log::{error, info, warn};
use ort::execution_providers::{
    ArenaExtendStrategy, CUDAExecutionProviderCuDNNConvAlgoSearch, CUDAExecutionProviderOptions,
};
use std::path::PathBuf;

pub struct EmbeddingGenerator {
    model: FlagEmbedding,
}

impl EmbeddingGenerator {
    /// Creates a new instance of the EmbeddingGenerator with the specified model.
    pub fn new() -> Result<Self> {
        info!("Initializing the EmbeddingGenerator with CUDA support");

        // Initialize CUDA execution provider options
        let cuda_options = CUDAExecutionProviderOptions {
            device_id: 0,              // GPU device ID, typically 0 for a single-GPU system
            gpu_mem_limit: usize::MAX, // Maximum GPU memory limit
            arena_extend_strategy: ArenaExtendStrategy::NextPowerOfTwo, // Strategy for extending the memory arena
            cudnn_conv_algo_search: CUDAExecutionProviderCuDNNConvAlgoSearch::Exhaustive, // Search strategy for cuDNN convolution algorithms
            do_copy_in_default_stream: true, // Whether to do copies in the default stream
            cudnn_conv_use_max_workspace: true, // Whether to use the maximum workspace for cuDNN operations
            cudnn_conv1d_pad_to_nc1d: false, // Padding strategy for 1D convolutions in cuDNN
            enable_cuda_graph: false,        // Whether to enable the CUDA Graphs feature
            enable_skip_layer_norm_strict_mode: false, // Whether to use strict mode in SkipLayerNormalization
        };

        let init_options = InitOptions {
            model_name: EmbeddingModel::BGEBaseENV15, // The v1.5 release of the Base English model
            execution_providers: vec![ExecutionProvider::CUDA(cuda_options)], // Add CUDA as the execution provider
            max_length: 2048, // Maximum length of tokenized sequences
            cache_dir: PathBuf::from("./local_cache"), // Cache directory for the model
            show_download_message: true,
        };

        match FlagEmbedding::try_new(init_options) {
            Ok(model) => {
                info!("Successfully initialized the EmbeddingGenerator with CUDA");
                Ok(Self { model })
            }
            Err(e) => {
                error!("Failed to initialize the EmbeddingGenerator with CUDA: {}", e);
                Err(e.into())
            }
        }
    }

    /// Generates embeddings for a given list of documents.
    pub fn generate_embeddings(&self, documents: Vec<&str>) -> Result<Vec<Vec<f32>>> {
        self.model
            .embed(documents, None)
            .context("Failed to generate embeddings")
    }
}
```

Hey. Since I don't have access to CUDA, I can't reproduce this.
The snippet you provided seems fine.
ort falls back to CPU when it can't resolve an execution provider.
Can you try with the default CUDA provider options?

Hi,
Thank you for your response. I still see the same issue with the code below:

```rust
use anyhow::{Context, Result};
use fastembed::{EmbeddingBase, EmbeddingModel, FlagEmbedding, InitOptions};
use log::info;

pub struct EmbeddingGenerator {
    model: FlagEmbedding,
}

impl EmbeddingGenerator {
    /// Creates a new instance of the EmbeddingGenerator with the specified model.
    pub fn new() -> Result<Self> {
        info!("Initializing the embedding model...");
        let model = FlagEmbedding::try_new(InitOptions {
            model_name: EmbeddingModel::BGEBaseENV15, // The v1.5 release of the Base English model
            show_download_message: true,
            ..Default::default()
        })
        .context("Failed to initialize the embedding model")?;

        Ok(Self { model })
    }

    /// Generates passage embeddings for a given list of documents.
    pub fn generate_passage_embedding(&self, documents: Vec<&str>) -> Result<Vec<Vec<f32>>> {
        self.model
            .passage_embed(documents, None)
            .context("Failed to generate passage embeddings")
    }

    pub fn generate_query_embedding(&self, query: &str) -> Result<Vec<f32>> {
        self.model
            .query_embed(query)
            .context("Failed to generate query embedding")
    }
}
```

I meant using ExecutionProvider::CUDA with its default options.
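Something like this, roughly (a sketch using the same fastembed/ort v1 types as your first snippet; I can't verify it against CUDA myself):

```rust
let init_options = InitOptions {
    model_name: EmbeddingModel::BGEBaseENV15,
    // Register CUDA, leaving every provider option at its default value
    execution_providers: vec![ExecutionProvider::CUDA(
        CUDAExecutionProviderOptions::default(),
    )],
    ..Default::default()
};
let model = FlagEmbedding::try_new(init_options)?;
```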

Still falling back to CPU with the code below:

```rust
let cuda_options = CUDAExecutionProviderOptions::default();

let init_options = InitOptions {
    model_name: EmbeddingModel::BGEBaseENV15, // The v1.5 release of the Base English model
    execution_providers: vec![ExecutionProvider::CUDA(cuda_options)], // Add CUDA as the execution provider
    max_length: 2048, // Maximum length of tokenized sequences
    cache_dir: PathBuf::from("./local_cache"), // Cache directory for the model
    show_download_message: true,
};

match FlagEmbedding::try_new(init_options) {
    Ok(model) => {
        info!("Successfully initialized the EmbeddingGenerator with CUDA");
        Ok(Self { model })
    }
    Err(e) => {
        error!("Failed to initialize the EmbeddingGenerator with CUDA: {}", e);
        Err(e.into())
    }
}
```

Can you try the following snippet with CUDA?
https://github.com/Anush008/fastembed-rs/blob/main/src/lib.rs#L214-L222

With ort v1.
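For reference, the linked snippet boils down to roughly this (paraphrased; see the link above for the exact code):

```rust
// Build an ort environment with the requested execution providers,
// then open the ONNX model in a session (ort v1 API).
let environment = Environment::builder()
    .with_name("Fastembed")
    .with_execution_providers(execution_providers)
    .build()?;

let session = SessionBuilder::new(&environment.into())?
    .with_optimization_level(GraphOptimizationLevel::Level3)?
    .with_intra_threads(threads)?
    .with_model_from_file(model_path.join("model_optimized.onnx"))?;
```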

Doesn't work. Maybe I'm doing something wrong?

```rust
use anyhow::Error;
use fastembed::{Embedding, EmbeddingBase, EmbeddingModel, FlagEmbedding, InitOptions};
use log::{error, info};
use ort::execution_providers::CUDAExecutionProviderOptions;
use ort::{Environment, ExecutionProvider, GraphOptimizationLevel, Session, SessionBuilder};
use std::path::PathBuf;
use std::sync::Arc;

pub fn generate_embeddings(
    model: &FlagEmbedding,
    texts: Vec<String>,
    embedding_type: &str,
) -> Result<Vec<Vec<f32>>, Error> {
    match embedding_type {
        "passage" => {
            // Directly return the embeddings from passage_embed
            model.passage_embed(texts, Some(1))
        }
        "query" => {
            // Directly return the embedding from query_embed, which takes a
            // single String and returns a single Vec<f32>
            let query_embedding = model.query_embed(texts[0].clone())?;
            Ok(vec![query_embedding]) // Wrap it, since we return Vec<Vec<f32>>
        }
        _ => Err(Error::msg("Invalid embedding type")),
    }
}

pub fn initialize_model() -> Result<FlagEmbedding, Box<dyn std::error::Error>> {
    info!("Starting model initialization");

    let model_path = PathBuf::from("./local_cache/fast-bge-base-en-v1.5");
    info!("Model path set to: {:?}", model_path);

    let threads = 4;
    info!("Number of threads: {}", threads);

    let cuda_provider = ExecutionProvider::CUDA(CUDAExecutionProviderOptions::default());
    info!("CUDA execution provider initialized");

    let execution_providers = vec![cuda_provider];
    info!("Execution providers set");

    // Using the suggested snippet for environment and session setup
    let environment = Environment::builder()
        .with_name("Fastembed")
        .with_execution_providers(execution_providers)
        .build()?;

    let session = SessionBuilder::new(&environment.into())?
        .with_optimization_level(GraphOptimizationLevel::Level3)?
        .with_intra_threads(threads)?
        .with_model_from_file(model_path.join("model_optimized.onnx"))?;

    info!("Session created successfully");

    // Note: the `session` built above is never handed to FlagEmbedding;
    // try_new builds its own session from InitOptions
    let model = FlagEmbedding::try_new(InitOptions {
        model_name: EmbeddingModel::BGEBaseENV15,
        show_download_message: true,
        ..Default::default()
    })?;

    info!("FlagEmbedding model initialized successfully");

    Ok(model)
}
```

So the ort session was not initiated with CUDA? Since I'm unable to reproduce this, I recommend raising an issue at ort with your machine specs; maybe they can help better.
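To isolate ort from fastembed, a minimal repro along these lines might help (a sketch against the ort v1 API, assuming env_logger for log output; run it with `RUST_LOG=ort=debug` so ort prints why the provider failed to register):

```rust
use ort::execution_providers::CUDAExecutionProviderOptions;
use ort::{Environment, ExecutionProvider};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    env_logger::init();
    // If CUDA cannot be registered, ort logs the reason and falls back to CPU.
    let _environment = Environment::builder()
        .with_name("cuda-check")
        .with_execution_providers([ExecutionProvider::CUDA(
            CUDAExecutionProviderOptions::default(),
        )])
        .build()?;
    Ok(())
}
```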

This should be retested now that ort has been updated to 2.0, to see whether it is still an issue.
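For anyone retesting, a hypothetical sketch of what that could look like (names assume fastembed's post-rewrite TextEmbedding/InitOptions API and ort 2.0's CUDAExecutionProvider; check them against the versions you actually have):

```rust
use fastembed::{EmbeddingModel, InitOptions, TextEmbedding};
use ort::execution_providers::CUDAExecutionProvider;

fn main() -> anyhow::Result<()> {
    // ort 2.0 moved to builder-style providers: `.build()` yields a dispatch
    // object that fastembed forwards to the underlying session.
    let model = TextEmbedding::try_new(InitOptions {
        model_name: EmbeddingModel::BGEBaseENV15,
        execution_providers: vec![CUDAExecutionProvider::default().build()],
        ..Default::default()
    })?;

    let embeddings = model.embed(vec!["Hello, world!"], None)?;
    println!("embedding dims: {}", embeddings[0].len());
    Ok(())
}
```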