Race leads to panic in `oneshot::Sender::send()`
acfoltzer opened this issue · 4 comments
Version
Reproduced with tokio 1.12.0 and 1.13.0
Platform
Linux hostname 5.11.0-38-generic #42~20.04.1-Ubuntu SMP Tue Sep 28 20:41:07 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux
Description
There is a race between `send()`, `try_recv()`, and `oneshot::Receiver::close()`. The following program yields a panic roughly every 3 seconds on my 18c/36t workstation, compiled with Rust 1.56.1 in release mode:
#[tokio::main]
async fn main() {
    loop {
        let (tx, mut rx) = tokio::sync::oneshot::channel();
        tokio::spawn(async move {
            let _ = tx.send(());
        });
        tokio::spawn(async move {
            rx.close();
            let _ = rx.try_recv();
        });
    }
}
All of the panics occur when `send()` attempts `inner.consume_value().unwrap()`. For example:
thread 'tokio-runtime-worker' panicked at 'called `Option::unwrap()` on a `None` value', /home/acfoltzer/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.13.0/src/sync/oneshot.rs:498:50
I suspect this is a race where the `rx.close(); rx.try_recv()` happens between the `if !inner.complete()` check and the `inner.consume_value().unwrap()`.
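For context, here is a rough sketch of the `send()` path, paraphrased from the identifiers in the panic message rather than copied from the tokio source, with the window I have in mind annotated:

```rust
// Rough, hypothetical sketch of Sender::send(), based on the identifiers in
// the panic message above; not the literal tokio source.
pub fn send(mut self, t: T) -> Result<(), T> {
    let inner = self.inner.take().unwrap();

    // 1. Write the value into the shared slot.
    inner.value.with_mut(|ptr| unsafe { *ptr = Some(t) });

    // 2. Try to mark the channel complete; this returns false if the
    //    receiver has already closed the channel.
    if !inner.complete() {
        // 3. The sender takes its value back so it can hand it to the caller
        //    as Err(t). But a racing `rx.close(); rx.try_recv()` can consume
        //    the value out of the same slot concurrently, so this unwrap()
        //    can observe None (the unwrap at oneshot.rs:498 in the panic).
        return Err(inner.consume_value().unwrap());
    }
    Ok(())
}
```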
I threw together a quick loom test based on your repro that appears to fail with a causality violation: https://github.com/tokio-rs/tokio/compare/eliza/oneshot-race?expand=1
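The test just runs the two halves of the repro on separate loom threads so that loom can enumerate the possible interleavings. A minimal sketch (illustrative only; the actual test is on the branch linked above):

```rust
// Illustrative sketch of the loom test; the actual test is on the branch
// linked above. loom::model re-runs the closure under every permitted
// interleaving of the two threads, so the racy schedule is found
// deterministically rather than by brute force.
use loom::thread;

#[test]
fn recv_close() {
    loom::model(|| {
        let (tx, mut rx) = tokio::sync::oneshot::channel::<i32>();

        let th = thread::spawn(move || {
            let _ = tx.send(1);
        });

        rx.close();
        let _ = rx.try_recv();

        th.join().unwrap();
    });
}
```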
Test output
:; LOOM_LOG=trace LOOM_MAX_PREEMPTIONS=3 RUSTFLAGS="--cfg loom" cargo test -p tokio --lib --release --features full -- recv_close
Finished release [optimized] target(s) in 0.04s
Running unittests (target/release/deps/tokio-468ba47bf2fe69d7)
running 1 test
test sync::tests::loom_oneshot::recv_close ... FAILED
failures:
---- sync::tests::loom_oneshot::recv_close stdout ----
[ ~~~~~~~~ THREAD 1 ~~~~~~~~ / ~~~~~~~~ THREAD 2 ~~~~~~~~ banners from the LOOM_LOG=trace output, alternating between the two threads; per-step trace lines omitted ]
thread 'sync::tests::loom_oneshot::recv_close' panicked at 'Causality violation: Concurrent write accesses to `UnsafeCell`.
', /home/eliza/.cargo/registry/src/github.com-1ecc6299db9ec823/loom-0.5.1/src/rt/location.rs:115:9
stack backtrace:
0: 0x55e1eef79d1c - std::backtrace_rs::backtrace::libunwind::trace::h3fea1eb2e0ba2ac9
at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/../../backtrace/src/backtrace/libunwind.rs:90:5
1: 0x55e1eef79d1c - std::backtrace_rs::backtrace::trace_unsynchronized::h849d83492cbc0d59
at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/../../backtrace/src/backtrace/mod.rs:66:5
2: 0x55e1eef79d1c - std::sys_common::backtrace::_print_fmt::he3179d37290f23d3
at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/sys_common/backtrace.rs:67:5
3: 0x55e1eef79d1c - <std::sys_common::backtrace::_print::DisplayBacktrace as core::fmt::Display>::fmt::h140f6925cad14324
at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/sys_common/backtrace.rs:46:22
4: 0x55e1eef9d5ac - core::fmt::write::h31b9cd1bedd7ea38
at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/core/src/fmt/mod.rs:1150:17
5: 0x55e1eef76765 - std::io::Write::write_fmt::h001fc83b62f3fb85
at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/io/mod.rs:1667:15
6: 0x55e1eef7bc70 - std::sys_common::backtrace::_print::he7ac492cd19c3189
at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/sys_common/backtrace.rs:49:5
7: 0x55e1eef7bc70 - std::sys_common::backtrace::print::hba20f8920229d8e8
at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/sys_common/backtrace.rs:36:9
8: 0x55e1eef7bc70 - std::panicking::default_hook::{{closure}}::h714d63979ae18678
at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/panicking.rs:210:50
9: 0x55e1eef7b862 - std::panicking::default_hook::hf1afb64e69563ca8
at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/panicking.rs:224:9
10: 0x55e1eef7c324 - std::panicking::rust_panic_with_hook::h02231a501e274a13
at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/panicking.rs:624:17
11: 0x55e1eef7be00 - std::panicking::begin_panic_handler::{{closure}}::h5b5e738edf361af7
at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/panicking.rs:521:13
12: 0x55e1eef7a1e4 - std::sys_common::backtrace::__rust_end_short_backtrace::h601a115def7987b1
at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/sys_common/backtrace.rs:141:18
13: 0x55e1eef7bd69 - rust_begin_unwind
at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/panicking.rs:517:5
14: 0x55e1eedb0abb - std::panicking::begin_panic_fmt::hc15cd62c02c70986
at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/panicking.rs:460:5
15: 0x55e1eef54c8f - loom::rt::location::PanicBuilder::fire::hd2aa6024f9cde508
16: 0x55e1eef49d74 - loom::rt::cell::State::track_write::hfb356d64ceb7d0d7
17: 0x55e1eeddbc28 - scoped_tls::ScopedKey<T>::with::he7c6540a05f6201e
18: 0x55e1eee9a67f - loom::cell::unsafe_cell::UnsafeCell<T>::with_mut::h2576b690fe758e36
19: 0x55e1eeef64bd - tokio::sync::oneshot::Receiver<T>::try_recv::h0d7824d96857ae71
20: 0x55e1eee1130c - core::ops::function::FnOnce::call_once{{vtable.shim}}::he6f36eb67e2ec228
21: 0x55e1eef5561a - loom::rt::scheduler::spawn_threads::{{closure}}::{{closure}}::h4e7415b3d604446b
22: 0x55e1eef54656 - generator::stack::StackBox<F>::call_once::h0c92f84febab0523
23: 0x55e1eef5ca66 - generator::gen_impl::gen_init::hd40ac4efc0687d23
24: 0x0 - <unknown>
failures:
sync::tests::loom_oneshot::recv_close
test result: FAILED. 0 passed; 1 failed; 0 ignored; 0 measured; 77 filtered out; finished in 0.00s
I am fairly sure this is the same race condition; loom is detecting the race in the call to `consume_value()`, but prior to the `unwrap()` that fails outside of loom.
I think this may also deserve the I-Unsound issue tag. In @acfoltzer's repro case, the failure mode here is fairly "clean": the `Sender` sees the `None` value, unwraps it, and panics, which is not too bad. In that repro the values in the channel are `()`, which is zero-sized, so the `UnsafeCell` only contains the `Option` discriminant, and `take`ing it is just zeroing a single word, so the other side will always see a `None` and fail cleanly with a panic. But in the case where a large struct is inside the `UnsafeCell`, I'm pretty sure it's possible for one side or the other to see a partially-deinitialized value, which seems bad. And if that struct contains an owning pointer, it might also be possible for both sides to end up with ownership of that pointer. So the data race could potentially result in arbitrarily bad things happening...
I mean, yes, it is a race. Races are unsound.
Yup.
Modifying the repro to send a string through the channel and actually mutate/dereference it results in a more or less instantaneous segfault:
#[tokio::main]
async fn main() {
    loop {
        let (tx, mut rx) = tokio::sync::oneshot::channel();
        tokio::spawn(async move {
            if let Err(mut s) = tx.send(String::from("hello")) {
                s.push_str(" world");
                assert_eq!(&s[..], "hello world");
            }
        });
        tokio::spawn(async move {
            rx.close();
            if let Ok(mut s) = rx.try_recv() {
                s.push_str(" san francisco");
                assert_eq!(&s[..], "hello san francisco");
            }
        });
    }
}
A few runs, showing various amusingly mangled strings I got it to produce:
:# eliza at noctis in adam-repro on main [?] is 📦 v0.1.0 via ⚙️ v1.56.1
:; time cargo run --release
Compiling adam-repro v0.1.0 (/home/eliza/Code/adam-repro)
Finished release [optimized] target(s) in 0.91s
Running `target/release/adam-repro`
thread 'tokio-runtime-worker' panicked at 'byte index 1 is not a char boundary; it is inside '\u{10}' (bytes 0..1) of `&�sa��sco`', library/core/src/fmt/mod.rs:2133:30
stack backtrace:
zsh: segmentation fault (core dumped) cargo run --release
cargo run --release 2.30s user 0.24s system 219% cpu 1.157 total
:# eliza at noctis in adam-repro on main [?] is 📦 v0.1.0 via ⚙️ v1.56.1
:; time cargo run --release
Finished release [optimized] target(s) in 0.01s
Running `target/release/adam-repro`
thread 'tokio-runtime-worker' panicked at 'assertion failed: `(left == right)`
left: `"�C~\u{0} francisco"`,
right: `"hello san francisco"`', src/main.rs:15:17
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
zsh: segmentation fault (core dumped) cargo run --release
cargo run --release 0.19s user 0.08s system 108% cpu 0.245 total
:# eliza at noctis in adam-repro on main [?] is 📦 v0.1.0 via ⚙️ v1.56.1
:; time cargo run --release
Finished release [optimized] target(s) in 0.01s
Running `target/release/adam-repro`
thread 'tokio-runtime-worker' panicked at 'assertion failed: `(left == right)`
left: `"hello sa\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}sco"`,
right: `"hello san francisco"`', src/main.rs:15:17
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
zsh: segmentation fault (core dumped) cargo run --release
cargo run --release 0.23s user 0.09s system 125% cpu 0.249 total
:# eliza at noctis in adam-repro on main [?] is 📦 v0.1.0 via ⚙️ v1.56.1
:; time cargo run --release
Finished release [optimized] target(s) in 0.01s
Running `target/release/adam-repro`
zsh: segmentation fault (core dumped) cargo run --release
cargo run --release 0.07s user 0.05s system 61% cpu 0.192 total
:# eliza at noctis in adam-repro on main [?] is 📦 v0.1.0 via ⚙️ v1.56.1