zama-ai/tfhe-rs

BorrowMutError: 'already borrowed' while executing CompactFheBool operations within a rayon parallel iterator

0xalexbel opened this issue · 6 comments

Unable to execute CompactFheBool operations within a rayon iterator
A BorrowMutError is raised intermittently when executing a CompactFheBool operation inside a rayon parallel iterator. The panic occurs when trying to access the thread-local RefCell that encapsulates the ShortintEngine. It happens in both Release AND Debug builds.
The problem does not occur with FheBool or CompressedFheBool.

To Reproduce

  1. Copy/paste the code below
  2. Adjust the number of iterations in the main loop (100 is the default, may not be enough)
  3. Use any of the following Cargo.toml files below

The main.rs file

use tfhe::set_server_key;
use tfhe::{
    prelude::FheEncrypt, CompactFheBool, CompactPublicKey,
};

use rayon::iter::IntoParallelRefIterator;
use rayon::iter::ParallelIterator;

/// Minimal reproduction: encrypts `CompactFheBool` values from inside a rayon
/// parallel iterator, which intermittently panics with a `BorrowMutError`.
fn main() {
    // Observed failure:
    //   panic at tfhe-0.5.3/src/shortint/engine/mod.rs:216:63
    //   already borrowed: BorrowMutError
    let (client_key, server_key) = tfhe::generate_keys(tfhe::ConfigBuilder::default().build());

    // Hand a clone of the server key to every thread of the rayon pool,
    // then set it for the current (main) thread as well.
    let worker_key = server_key.clone();
    rayon::broadcast(move |_| set_server_key(worker_key.clone()));
    set_server_key(server_key);

    let compact_public_key = CompactPublicKey::try_new(&client_key).unwrap();

    // The panic is intermittent: you may have to increase the number of
    // iterations to hit it. 100 was enough on the reporter's (not very
    // powerful) machine.
    let inputs = vec![true; 100];
    inputs.par_iter().for_each(|&value| {
        let _ciphertext = CompactFheBool::encrypt(value, &compact_public_key);
    });
}

The Cargo.toml debug file

[package]
name = "tfhe_bug"
version = "0.1.0"
edition = "2021"

[dependencies]
tfhe = { version = "0.5.3", features = [ "boolean", "shortint", "integer", "x86_64-unix" ] }
rayon = { version = "1.8.1" }

The Cargo.toml release file

[package]
name = "tfhe_bug"
version = "0.1.0"
edition = "2021"

[dependencies]
tfhe = { version = "0.5.3", features = [ "boolean", "shortint", "integer", "x86_64-unix" ] }
rayon = { version = "1.8.1" }

[profile.dev.package."*"]
opt-level = 3
debug = false
split-debuginfo = '...'  # Platform-specific.
strip = "none"
debug-assertions = false
overflow-checks = false
incremental = false
codegen-units = 16

Logs

stack backtrace:
   0: rust_begin_unwind
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/panicking.rs:645:5
   1: core::panicking::panic_fmt
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/panicking.rs:72:14
   2: core::cell::panic_already_borrowed
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/cell.rs:761:5
   3: core::cell::RefCell<T>::borrow_mut
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/cell.rs:1051:25
   4: tfhe::shortint::engine::ShortintEngine::with_thread_local_mut::{{closure}}
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tfhe-0.5.3/src/shortint/engine/mod.rs:216:51
   5: std::thread::local::LocalKey<T>::try_with
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/thread/local.rs:270:16
   6: std::thread::local::LocalKey<T>::with
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/thread/local.rs:246:9
   7: tfhe::shortint::engine::ShortintEngine::with_thread_local_mut
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tfhe-0.5.3/src/shortint/engine/mod.rs:216:9
thread '<unnamed>' panicked at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tfhe-0.5.3/src/shortint/engine/mod.rs:216:63:
already borrowed: BorrowMutError
   8: tfhe::shortint::public_key::compact::CompactPublicKey::encrypt_iter
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tfhe-0.5.3/src/shortint/public_key/compact.rs:240:13
   9: tfhe::integer::public_key::compact::CompactPublicKey::encrypt_radix_compact
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tfhe-0.5.3/src/integer/public_key/compact.rs:74:23
  10: tfhe::integer::public_key::compact::CompactPublicKey::encrypt_iter_radix_compact
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tfhe-0.5.3/src/integer/public_key/compact.rs:100:24
  11: tfhe::integer::public_key::compact::CompactPublicKey::encrypt_slice_radix_compact
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tfhe-0.5.3/src/integer/public_key/compact.rs:86:9
  12: tfhe::high_level_api::keys::inner::IntegerCompactPublicKey::try_encrypt_compact
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tfhe-0.5.3/src/high_level_api/keys/inner.rs:228:9
  13: <tfhe::high_level_api::booleans::compact::CompactFheBool as tfhe::high_level_api::traits::FheTryEncrypt<bool,tfhe::high_level_api::keys::public::CompactPublicKey>>::try_encrypt
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tfhe-0.5.3/src/high_level_api/booleans/compact.rs:59:26
  14: <T as tfhe::high_level_api::traits::FheEncrypt<Clear,Key>>::encrypt
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/tfhe-0.5.3/src/high_level_api/traits.rs:21:9
  15: tfhe_bug::main::{{closure}}
             at ./src/main.rs:26:35
  16: core::ops::function::impls::<impl core::ops::function::FnMut<A> for &F>::call_mut
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ops/function.rs:272:13
  17: <core::slice::iter::Iter<T> as core::iter::traits::iterator::Iterator>::for_each
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/slice/iter/macros.rs:254:21
  18: <rayon::iter::for_each::ForEachConsumer<F> as rayon::iter::plumbing::Folder<T>>::consume_iter
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-1.9.0/src/iter/for_each.rs:55:9
  19: rayon::iter::plumbing::Producer::fold_with
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-1.9.0/src/iter/plumbing/mod.rs:110:9
  20: rayon::iter::plumbing::bridge_producer_consumer::helper
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-1.9.0/src/iter/plumbing/mod.rs:438:13
  21: rayon::iter::plumbing::bridge_producer_consumer::helper::{{closure}}
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-1.9.0/src/iter/plumbing/mod.rs:427:21
  22: rayon_core::join::join_context::call_b::{{closure}}
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/join/mod.rs:129:25
  23: rayon_core::job::JobResult<T>::call::{{closure}}
             at /Users/alex/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/job.rs:218:41
  24: <core::panic::unwind_safe::AssertUnwindSafe<F> as core::ops::function::FnOnce<()>>::call_once
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/panic/unwind_safe.rs:272:9
  25: std::panicking::try::do_call
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/panicking.rs:552:40
  26: ___rust_try

Configuration:

  • OS: MacOS Big Sur 11.7.10
  • rustc 1.76.0 (07dca489a 2024-02-04)
  • tfhe 0.5.3
  • VSCode Version: 1.85.2

Hello, thanks for the detailed report. We are going to investigate and see what we can do about it.

Sadly for now I don't have any workaround other than not using par_iter when encrypting Compact ciphertexts

There is potentially a way to fix this by using a Mutex instead of a RefCell. I'm not sure it's going to be great, or that it won't become a source of deadlocks, so we will have to test it and make sure we understand what rayon does with tasks.

but a Mutex essentially defeats the thread local storage so, not great

and it deadlocks of course, it's the well known rayon bug from here rayon-rs/rayon#592

using this issue as a bit of a notepad on that issue

IIRC the problem arises when there are nested rayon calls, as the recent examples/addition proposal for a fully blocking thread pool in the rayon-rs/rayon#592 issue (e.g. rayon-rs/rayon#592 (comment)) seems to indicate.

In our case, some threads seem to be stealing tasks from other threads where the engine has already been borrowed; I'm still unclear on the exact sequence of events.

could be

  • thread 0 starts task 0, borrows engine from its thread local storage
  • thread 0 stop/yields from task 0 while still borrowing the engine
  • thread 0 starts task 1, borrows engine from its thread local storage -> borrow mut error

could be

  • thread 0 starts task 0, 1, keeps executing task 1 and borrows engine from task 1 local storage which happens to be the one from thread 0 (? not sure that makes any sense)
  • thread 1 sees task 0 is not being run, steals it, borrows engine from the local storage which still is the one from thread 0 given it started the task, borrows the engine -> crash

example log

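looks to be the first case 🤔: in the log below, right before each panic the same thread (e.g. ThreadId(2)) logs a second "borrow cell" on its own cell address without an intervening "stops borrow cell"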

Thread #ThreadId(1), borrow cell: 0x7f40d3d6dc40
Thread #ThreadId(1), stops borrow cell
Thread #ThreadId(1), borrow cell: 0x7f40d3d6dc40
Thread #ThreadId(1), stops borrow cell
Thread #ThreadId(1), borrow cell: 0x7f40d3d6dc40
Thread #ThreadId(1), stops borrow cell
Thread #ThreadId(8), borrow cell: 0x7f40d02fc1c0
Thread #ThreadId(13), borrow cell: 0x7f40bb7fb1c0
Thread #ThreadId(6), borrow cell: 0x7f40d07041c0
Thread #ThreadId(11), borrow cell: 0x7f40bbbfd1c0
Thread #ThreadId(5), borrow cell: 0x7f40d09081c0
Thread #ThreadId(7), borrow cell: 0x7f40d05001c0
Thread #ThreadId(4), borrow cell: 0x7f40d0b091c0
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
Thread #ThreadId(2), borrow cell: 0x7f40d0f0b1c0
Thread #ThreadId(10), borrow cell: 0x7f40bbdfe1c0
Thread #ThreadId(12), borrow cell: 0x7f40bb9fc1c0
Thread #ThreadId(9), borrow cell: 0x7f40bbfff1c0
Thread #ThreadId(4), stops borrow cell
Thread #ThreadId(4), borrow cell: 0x7f40d0b091c0
Thread #ThreadId(5), stops borrow cell
Thread #ThreadId(5), borrow cell: 0x7f40d09081c0
Thread #ThreadId(12), stops borrow cell
Thread #ThreadId(12), borrow cell: 0x7f40bb9fc1c0
Thread #ThreadId(11), stops borrow cell
Thread #ThreadId(11), borrow cell: 0x7f40bbbfd1c0
Thread #ThreadId(3), stops borrow cell
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
Thread #ThreadId(8), stops borrow cell
Thread #ThreadId(8), borrow cell: 0x7f40d02fc1c0
Thread #ThreadId(10), stops borrow cell
Thread #ThreadId(10), borrow cell: 0x7f40bbdfe1c0
Thread #ThreadId(13), stops borrow cell
Thread #ThreadId(9), stops borrow cell
Thread #ThreadId(9), borrow cell: 0x7f40bbfff1c0
Thread #ThreadId(13), borrow cell: 0x7f40bb7fb1c0
Thread #ThreadId(2), stops borrow cell
Thread #ThreadId(2), borrow cell: 0x7f40d0f0b1c0
Thread #ThreadId(6), stops borrow cell
Thread #ThreadId(6), borrow cell: 0x7f40d07041c0
Thread #ThreadId(4), stops borrow cell
Thread #ThreadId(4), borrow cell: 0x7f40d0b091c0
Thread #ThreadId(5), stops borrow cell
Thread #ThreadId(5), borrow cell: 0x7f40d09081c0
Thread #ThreadId(12), stops borrow cell
Thread #ThreadId(12), borrow cell: 0x7f40bb9fc1c0
Thread #ThreadId(11), stops borrow cell
Thread #ThreadId(11), borrow cell: 0x7f40bbbfd1c0
Thread #ThreadId(8), stops borrow cell
Thread #ThreadId(8), borrow cell: 0x7f40d02fc1c0
Thread #ThreadId(3), stops borrow cell
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
Thread #ThreadId(10), stops borrow cell
Thread #ThreadId(10), borrow cell: 0x7f40bbdfe1c0
Thread #ThreadId(13), stops borrow cell
Thread #ThreadId(13), borrow cell: 0x7f40bb7fb1c0
Thread #ThreadId(9), stops borrow cell
Thread #ThreadId(9), borrow cell: 0x7f40bbfff1c0
Thread #ThreadId(2), stops borrow cell
Thread #ThreadId(2), borrow cell: 0x7f40d0f0b1c0
Thread #ThreadId(6), stops borrow cell
Thread #ThreadId(6), borrow cell: 0x7f40d07041c0
Thread #ThreadId(4), stops borrow cell
Thread #ThreadId(4), borrow cell: 0x7f40d0b091c0
Thread #ThreadId(5), stops borrow cell
Thread #ThreadId(5), borrow cell: 0x7f40d09081c0
Thread #ThreadId(12), stops borrow cell
Thread #ThreadId(12), borrow cell: 0x7f40bb9fc1c0
Thread #ThreadId(11), stops borrow cell
Thread #ThreadId(11), borrow cell: 0x7f40bbbfd1c0
Thread #ThreadId(8), stops borrow cell
Thread #ThreadId(8), borrow cell: 0x7f40d02fc1c0
Thread #ThreadId(3), stops borrow cell
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
Thread #ThreadId(10), stops borrow cell
Thread #ThreadId(10), borrow cell: 0x7f40bbdfe1c0
Thread #ThreadId(13), stops borrow cell
Thread #ThreadId(13), borrow cell: 0x7f40bb7fb1c0
Thread #ThreadId(9), stops borrow cell
Thread #ThreadId(9), borrow cell: 0x7f40bbfff1c0
Thread #ThreadId(2), stops borrow cell
Thread #ThreadId(2), borrow cell: 0x7f40d0f0b1c0
Thread #ThreadId(7), stops borrow cell
Thread #ThreadId(7), borrow cell: 0x7f40d05001c0
Thread #ThreadId(6), stops borrow cell
Thread #ThreadId(6), borrow cell: 0x7f40d07041c0
Thread #ThreadId(4), stops borrow cell
Thread #ThreadId(4), borrow cell: 0x7f40d0b091c0
Thread #ThreadId(5), stops borrow cell
Thread #ThreadId(5), borrow cell: 0x7f40d09081c0
Thread #ThreadId(12), stops borrow cell
Thread #ThreadId(12), borrow cell: 0x7f40bb9fc1c0
Thread #ThreadId(11), stops borrow cell
Thread #ThreadId(11), borrow cell: 0x7f40bbbfd1c0
Thread #ThreadId(8), stops borrow cell
Thread #ThreadId(8), borrow cell: 0x7f40d02fc1c0
Thread #ThreadId(3), stops borrow cell
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
Thread #ThreadId(10), stops borrow cell
Thread #ThreadId(10), borrow cell: 0x7f40bbdfe1c0
Thread #ThreadId(13), stops borrow cell
Thread #ThreadId(13), borrow cell: 0x7f40bb7fb1c0
Thread #ThreadId(9), stops borrow cell
Thread #ThreadId(9), borrow cell: 0x7f40bbfff1c0
Thread #ThreadId(2), stops borrow cell
Thread #ThreadId(2), borrow cell: 0x7f40d0f0b1c0
Thread #ThreadId(7), stops borrow cell
Thread #ThreadId(7), borrow cell: 0x7f40d05001c0
Thread #ThreadId(6), stops borrow cell
Thread #ThreadId(6), borrow cell: 0x7f40d07041c0
Thread #ThreadId(4), stops borrow cell
Thread #ThreadId(4), borrow cell: 0x7f40d0b091c0
Thread #ThreadId(5), stops borrow cell
Thread #ThreadId(5), borrow cell: 0x7f40d09081c0
Thread #ThreadId(12), stops borrow cell
Thread #ThreadId(12), borrow cell: 0x7f40bb9fc1c0
Thread #ThreadId(11), stops borrow cell
Thread #ThreadId(11), borrow cell: 0x7f40bbbfd1c0
Thread #ThreadId(8), stops borrow cell
Thread #ThreadId(8), borrow cell: 0x7f40d02fc1c0
Thread #ThreadId(3), stops borrow cell
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
Thread #ThreadId(10), stops borrow cell
Thread #ThreadId(10), borrow cell: 0x7f40bbdfe1c0
Thread #ThreadId(13), stops borrow cell
Thread #ThreadId(9), stops borrow cell
Thread #ThreadId(9), borrow cell: 0x7f40bbfff1c0
Thread #ThreadId(2), stops borrow cell
Thread #ThreadId(2), borrow cell: 0x7f40d0f0b1c0
Thread #ThreadId(7), stops borrow cell
Thread #ThreadId(7), borrow cell: 0x7f40d05001c0
Thread #ThreadId(6), stops borrow cell
Thread #ThreadId(6), borrow cell: 0x7f40d07041c0
Thread #ThreadId(13), borrow cell: 0x7f40bb7fb1c0
Thread #ThreadId(4), stops borrow cell
Thread #ThreadId(4), borrow cell: 0x7f40d0b091c0
Thread #ThreadId(2), borrow cell: 0x7f40d0f0b1c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
Thread #ThreadId(2), borrow cell: 0x7f40d0f0b1c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
Thread #ThreadId(2), borrow cell: 0x7f40d0f0b1c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
Thread #ThreadId(12), stops borrow cell
Thread #ThreadId(12), borrow cell: 0x7f40bb9fc1c0
Thread #ThreadId(8), stops borrow cell
Thread #ThreadId(8), borrow cell: 0x7f40d02fc1c0
Thread #ThreadId(11), stops borrow cell
Thread #ThreadId(11), borrow cell: 0x7f40bbbfd1c0
Thread #ThreadId(3), stops borrow cell
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
Thread #ThreadId(10), stops borrow cell
Thread #ThreadId(5), stops borrow cell
Thread #ThreadId(5), borrow cell: 0x7f40d09081c0
Thread #ThreadId(9), stops borrow cell
Thread #ThreadId(9), borrow cell: 0x7f40bbfff1c0
Thread #ThreadId(7), stops borrow cell
Thread #ThreadId(7), borrow cell: 0x7f40d05001c0
Thread #ThreadId(2), stops borrow cell
Thread #ThreadId(2), borrow cell: 0x7f40d0f0b1c0
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
Thread #ThreadId(13), stops borrow cell
Thread #ThreadId(13), borrow cell: 0x7f40bb7fb1c0
Thread #ThreadId(9), borrow cell: 0x7f40bbfff1c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
Thread #ThreadId(9), borrow cell: 0x7f40bbfff1c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
Thread #ThreadId(4), stops borrow cell
Thread #ThreadId(4), borrow cell: 0x7f40d0b091c0
Thread #ThreadId(3), stops borrow cell
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
Thread #ThreadId(7), borrow cell: 0x7f40d05001c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
Thread #ThreadId(11), stops borrow cell
Thread #ThreadId(11), borrow cell: 0x7f40bbbfd1c0
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
Thread #ThreadId(12), stops borrow cell
Thread #ThreadId(12), borrow cell: 0x7f40bb9fc1c0
Thread #ThreadId(3), borrow cell: 0x7f40d0d0a1c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
Thread #ThreadId(8), stops borrow cell
Thread #ThreadId(8), borrow cell: 0x7f40d02fc1c0
Thread #ThreadId(7), stops borrow cell
Thread #ThreadId(7), borrow cell: 0x7f40d05001c0
Thread #ThreadId(11), borrow cell: 0x7f40bbbfd1c0
thread '<unnamed>' panicked at tfhe/src/shortint/engine/mod.rs:219:45:
already borrowed: BorrowMutError
Thread #ThreadId(13), stops borrow cell
Thread #ThreadId(6), stops borrow cell
Thread #ThreadId(4), stops borrow cell
Thread #ThreadId(5), stops borrow cell
Thread #ThreadId(2), stops borrow cell
Thread #ThreadId(11), stops borrow cell
Thread #ThreadId(8), stops borrow cell
Thread #ThreadId(9), stops borrow cell
Thread #ThreadId(7), stops borrow cell
Thread #ThreadId(12), stops borrow cell
Thread #ThreadId(3), stops borrow cell