tokio-rs/tokio

Race leads to panic in `oneshot::Sender::send()`

acfoltzer opened this issue · 4 comments

Version

Reproduced with tokio 1.12.0 and 1.13.0

Platform

Linux hostname 5.11.0-38-generic #42~20.04.1-Ubuntu SMP Tue Sep 28 20:41:07 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux

Description

There is a race between send(), try_recv(), and oneshot::Receiver::close(). The following program yields a panic roughly every 3 seconds on my 18c/36t workstation, compiled with Rust 1.56.1 in release mode:

#[tokio::main]
async fn main() {
    loop {
        let (tx, mut rx) = tokio::sync::oneshot::channel();
        tokio::spawn(async move {
            let _ = tx.send(());
        });
        tokio::spawn(async move {
            rx.close();
            let _ = rx.try_recv();
        });
    }
}

All of the panics occur when send() attempts inner.consume_value().unwrap(). For example:

thread 'tokio-runtime-worker' panicked at 'called `Option::unwrap()` on a `None` value', /home/acfoltzer/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.13.0/src/sync/oneshot.rs:498:50

I suspect this is a race where the rx.close(); rx.try_recv() happens between the if !inner.complete() check and the inner.consume_value().unwrap().

hawkw commented

i threw together a quick Loom test based on your repro that appears to fail with a causality violation: https://github.com/tokio-rs/tokio/compare/eliza/oneshot-race?expand=1

Test output
:; LOOM_LOG=trace LOOM_MAX_PREEMPTIONS=3 RUSTFLAGS="--cfg loom" cargo test -p tokio --lib --release --features full -- recv_close
    Finished release [optimized] target(s) in 0.04s
     Running unittests (target/release/deps/tokio-468ba47bf2fe69d7)

running 1 test
test sync::tests::loom_oneshot::recv_close ... FAILED

failures:

---- sync::tests::loom_oneshot::recv_close stdout ----
~~~~~~~~ THREAD 1 ~~~~~~~~
~~~~~~~~ THREAD 2 ~~~~~~~~
~~~~~~~~ THREAD 1 ~~~~~~~~
~~~~~~~~ THREAD 2 ~~~~~~~~
~~~~~~~~ THREAD 1 ~~~~~~~~
~~~~~~~~ THREAD 1 ~~~~~~~~
~~~~~~~~ THREAD 2 ~~~~~~~~
~~~~~~~~ THREAD 1 ~~~~~~~~
~~~~~~~~ THREAD 2 ~~~~~~~~
~~~~~~~~ THREAD 1 ~~~~~~~~
~~~~~~~~ THREAD 2 ~~~~~~~~
~~~~~~~~ THREAD 1 ~~~~~~~~
~~~~~~~~ THREAD 1 ~~~~~~~~
~~~~~~~~ THREAD 2 ~~~~~~~~
~~~~~~~~ THREAD 1 ~~~~~~~~
~~~~~~~~ THREAD 2 ~~~~~~~~
~~~~~~~~ THREAD 1 ~~~~~~~~
~~~~~~~~ THREAD 2 ~~~~~~~~
~~~~~~~~ THREAD 1 ~~~~~~~~
~~~~~~~~ THREAD 2 ~~~~~~~~
~~~~~~~~ THREAD 1 ~~~~~~~~
~~~~~~~~ THREAD 1 ~~~~~~~~
~~~~~~~~ THREAD 2 ~~~~~~~~
~~~~~~~~ THREAD 1 ~~~~~~~~
~~~~~~~~ THREAD 2 ~~~~~~~~
~~~~~~~~ THREAD 1 ~~~~~~~~
~~~~~~~~ THREAD 2 ~~~~~~~~
~~~~~~~~ THREAD 1 ~~~~~~~~
~~~~~~~~ THREAD 2 ~~~~~~~~
thread 'sync::tests::loom_oneshot::recv_close' panicked at 'Causality violation: Concurrent write accesses to `UnsafeCell`.
', /home/eliza/.cargo/registry/src/github.com-1ecc6299db9ec823/loom-0.5.1/src/rt/location.rs:115:9
stack backtrace:
   0:     0x55e1eef79d1c - std::backtrace_rs::backtrace::libunwind::trace::h3fea1eb2e0ba2ac9
                               at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/../../backtrace/src/backtrace/libunwind.rs:90:5
   1:     0x55e1eef79d1c - std::backtrace_rs::backtrace::trace_unsynchronized::h849d83492cbc0d59
                               at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/../../backtrace/src/backtrace/mod.rs:66:5
   2:     0x55e1eef79d1c - std::sys_common::backtrace::_print_fmt::he3179d37290f23d3
                               at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/sys_common/backtrace.rs:67:5
   3:     0x55e1eef79d1c - <std::sys_common::backtrace::_print::DisplayBacktrace as core::fmt::Display>::fmt::h140f6925cad14324
                               at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/sys_common/backtrace.rs:46:22
   4:     0x55e1eef9d5ac - core::fmt::write::h31b9cd1bedd7ea38
                               at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/core/src/fmt/mod.rs:1150:17
   5:     0x55e1eef76765 - std::io::Write::write_fmt::h001fc83b62f3fb85
                               at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/io/mod.rs:1667:15
   6:     0x55e1eef7bc70 - std::sys_common::backtrace::_print::he7ac492cd19c3189
                               at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/sys_common/backtrace.rs:49:5
   7:     0x55e1eef7bc70 - std::sys_common::backtrace::print::hba20f8920229d8e8
                               at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/sys_common/backtrace.rs:36:9
   8:     0x55e1eef7bc70 - std::panicking::default_hook::{{closure}}::h714d63979ae18678
                               at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/panicking.rs:210:50
   9:     0x55e1eef7b862 - std::panicking::default_hook::hf1afb64e69563ca8
                               at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/panicking.rs:224:9
  10:     0x55e1eef7c324 - std::panicking::rust_panic_with_hook::h02231a501e274a13
                               at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/panicking.rs:624:17
  11:     0x55e1eef7be00 - std::panicking::begin_panic_handler::{{closure}}::h5b5e738edf361af7
                               at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/panicking.rs:521:13
  12:     0x55e1eef7a1e4 - std::sys_common::backtrace::__rust_end_short_backtrace::h601a115def7987b1
                               at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/sys_common/backtrace.rs:141:18
  13:     0x55e1eef7bd69 - rust_begin_unwind
                               at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/panicking.rs:517:5
  14:     0x55e1eedb0abb - std::panicking::begin_panic_fmt::hc15cd62c02c70986
                               at /rustc/59eed8a2aac0230a8b53e89d4e99d55912ba6b35/library/std/src/panicking.rs:460:5
  15:     0x55e1eef54c8f - loom::rt::location::PanicBuilder::fire::hd2aa6024f9cde508
  16:     0x55e1eef49d74 - loom::rt::cell::State::track_write::hfb356d64ceb7d0d7
  17:     0x55e1eeddbc28 - scoped_tls::ScopedKey<T>::with::he7c6540a05f6201e
  18:     0x55e1eee9a67f - loom::cell::unsafe_cell::UnsafeCell<T>::with_mut::h2576b690fe758e36
  19:     0x55e1eeef64bd - tokio::sync::oneshot::Receiver<T>::try_recv::h0d7824d96857ae71
  20:     0x55e1eee1130c - core::ops::function::FnOnce::call_once{{vtable.shim}}::he6f36eb67e2ec228
  21:     0x55e1eef5561a - loom::rt::scheduler::spawn_threads::{{closure}}::{{closure}}::h4e7415b3d604446b
  22:     0x55e1eef54656 - generator::stack::StackBox<F>::call_once::h0c92f84febab0523
  23:     0x55e1eef5ca66 - generator::gen_impl::gen_init::hd40ac4efc0687d23
  24:                0x0 - <unknown>


failures:
    sync::tests::loom_oneshot::recv_close

test result: FAILED. 0 passed; 1 failed; 0 ignored; 0 measured; 77 filtered out; finished in 0.00s

I am fairly sure this is the same race condition; loom is detecting the race in the call to consume_value() but prior to the unwrap() that fails outside of loom.

hawkw commented

I think this may also deserve the I-Unsound issue tag --- in @acfoltzer's repro case, the failure mode here is fairly "clean"; the Sender sees the None value, unwraps it, and panics, which is not too bad. But, in the repro, the values in the channel are (), which is zero-sized, so the UnsafeCell only contains the Option discriminant, and takeing it is just zeroing a single word...so, the other side will always see a None and fail cleanly with a panic. But, in the case where a large struct or something is inside the UnsafeCell, I'm pretty sure it's possible for one side or the other to see a partially-deinitialized value, which seems bad. And, if a large struct or something contains an owning pointer, it might also be possible for both sides to end up with ownership over that pointer? So, the data race could potentially result in arbitrarily bad things happening...

I mean, yes, it is a race. Races are unsound.

hawkw commented

Yup.

Modifying the repro to send a string through the channel and actually mutate/dereference it results in a more or less instantaneous segfault:

#[tokio::main]
async fn main() {
    loop {
        let (tx, mut rx) = tokio::sync::oneshot::channel();
        tokio::spawn(async move {
            if let Err(mut s) = tx.send(String::from("hello")) {
                s.push_str(" world");
                assert_eq!(&s[..], "hello world");
            }
        });
        tokio::spawn(async move {
            rx.close();
            if let Ok(mut s) = rx.try_recv() {
                s.push_str(" san francisco");
                assert_eq!(&s[..], "hello san francisco");
            }
        });
    }
}

A few runs with various amusingly mangled strings I got it to produce:

:# eliza at noctis in adam-repro on  main [?] is 📦 v0.1.0 via ⚙️ v1.56.1
:; time cargo run --release
   Compiling adam-repro v0.1.0 (/home/eliza/Code/adam-repro)
    Finished release [optimized] target(s) in 0.91s
     Running `target/release/adam-repro`
thread 'tokio-runtime-worker' panicked at 'byte index 1 is not a char boundary; it is inside '\u{10}' (bytes 0..1) of `&�sa��sco`', library/core/src/fmt/mod.rs:2133:30
stack backtrace:
zsh: segmentation fault (core dumped)  cargo run --release
cargo run --release  2.30s user 0.24s system 219% cpu 1.157 total

:# eliza at noctis in adam-repro on  main [?] is 📦 v0.1.0 via ⚙️ v1.56.1
:; time cargo run --release
    Finished release [optimized] target(s) in 0.01s
     Running `target/release/adam-repro`
thread 'tokio-runtime-worker' panicked at 'assertion failed: `(left == right)`
  left: `"�C~\u{0} francisco"`,
 right: `"hello san francisco"`', src/main.rs:15:17
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
zsh: segmentation fault (core dumped)  cargo run --release
cargo run --release  0.19s user 0.08s system 108% cpu 0.245 total

:# eliza at noctis in adam-repro on  main [?] is 📦 v0.1.0 via ⚙️ v1.56.1
:; time cargo run --release
    Finished release [optimized] target(s) in 0.01s
     Running `target/release/adam-repro`
thread 'tokio-runtime-worker' panicked at 'assertion failed: `(left == right)`
  left: `"hello sa\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}sco"`,
 right: `"hello san francisco"`', src/main.rs:15:17
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
zsh: segmentation fault (core dumped)  cargo run --release
cargo run --release  0.23s user 0.09s system 125% cpu 0.249 total

:# eliza at noctis in adam-repro on  main [?] is 📦 v0.1.0 via ⚙️ v1.56.1
:; time cargo run --release
    Finished release [optimized] target(s) in 0.01s
     Running `target/release/adam-repro`
zsh: segmentation fault (core dumped)  cargo run --release
cargo run --release  0.07s user 0.05s system 61% cpu 0.192 total

:# eliza at noctis in adam-repro on  main [?] is 📦 v0.1.0 via ⚙️ v1.56.1