shmem backend thread panic when running examples
wjhorne opened this issue · 3 comments
Hi,
I wasn't sure if this project was at a point where issues would be appropriate, but figured it couldn't hurt. When I run many of the examples using the shmem backend (without rofi) on 2 or more PEs on a single node, I currently get the following error. I am able to run some of the array examples without issue, and, generally speaking, running with 1 PE works.
RUST_BACKTRACE=full ./lamellar_run.sh -N=2 -T=10 target/debug/examples/am_return_am
thread '<unnamed>' panicked at 'attempt to add with overflow', /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/core/src/iter/traits/accum.rs:141:1
stack backtrace:
0: 0x55f174ef41dc - std::backtrace_rs::backtrace::libunwind::trace::h91c465e73bf6c785
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/std/src/../../backtrace/src/backtrace/libunwind.rs:93:5
1: 0x55f174ef41dc - std::backtrace_rs::backtrace::trace_unsynchronized::hae9da36f5d58b5f3
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/std/src/../../backtrace/src/backtrace/mod.rs:66:5
2: 0x55f174ef41dc - std::sys_common::backtrace::_print_fmt::h7f499fa126a7effb
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/std/src/sys_common/backtrace.rs:67:5
3: 0x55f174ef41dc - <std::sys_common::backtrace::_print::DisplayBacktrace as core::fmt::Display>::fmt::h3e2b509ce2ce6007
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/std/src/sys_common/backtrace.rs:46:22
4: 0x55f174f1631c - core::fmt::write::h753c7571fa063ecb
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/core/src/fmt/mod.rs:1168:17
5: 0x55f174ef0cd3 - std::io::Write::write_fmt::h2815c0519c99ba09
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/std/src/io/mod.rs:1660:15
6: 0x55f174ef6612 - std::sys_common::backtrace::_print::h64941a6fc8b0ed9b
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/std/src/sys_common/backtrace.rs:49:5
7: 0x55f174ef6612 - std::sys_common::backtrace::print::hcf25e43e1a9b0766
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/std/src/sys_common/backtrace.rs:36:9
8: 0x55f174ef6612 - std::panicking::default_hook::{{closure}}::h78d3e6cf97fc623d
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/std/src/panicking.rs:211:50
9: 0x55f174ef61f5 - std::panicking::default_hook::hda898f8d3ad1a5ae
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/std/src/panicking.rs:228:9
10: 0x55f17471ded3 - <alloc::boxed::Box<F,A> as core::ops::function::Fn<Args>>::call::h432af52895dbab9c
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/alloc/src/boxed.rs:1868:9
11: 0x55f1746ec55c - lamellar::scheduler::work_stealing::WorkStealingInner::init::{{closure}}::h200072a14ace5090
at /home/nixes/dev_work/lamellar-runtime/src/scheduler/work_stealing.rs:362:13
12: 0x55f174ef6c85 - std::panicking::rust_panic_with_hook::h1a5ea2d6c23051aa
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/std/src/panicking.rs:610:17
13: 0x55f174ef6952 - std::panicking::begin_panic_handler::{{closure}}::h07f549390938b73f
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/std/src/panicking.rs:500:13
14: 0x55f174ef4684 - std::sys_common::backtrace::__rust_end_short_backtrace::h5ec3758a92cfb00d
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/std/src/sys_common/backtrace.rs:139:18
15: 0x55f174ef66b9 - rust_begin_unwind
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/std/src/panicking.rs:498:5
16: 0x55f1742d98e1 - core::panicking::panic_fmt::h3a79a6a99affe1d5
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/core/src/panicking.rs:116:14
17: 0x55f1742d982d - core::panicking::panic::h97167cd315d19cd4
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/core/src/panicking.rs:48:5
18: 0x55f17477f7c8 - ::sum::{{closure}}::h6f36b1ba0736544d
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/core/src/iter/traits/accum.rs:45:28
19: 0x55f1744507d1 - core::iter::adapters::map::map_fold::{{closure}}::h5ebf1fb0cac52e67
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/core/src/iter/adapters/map.rs:84:21
20: 0x55f1744ee609 - core::iter::traits::iterator::Iterator::fold::h2015bcf3735fbf56
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/core/src/iter/traits/iterator.rs:2171:21
21: 0x55f174457b63 - <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::fold::h138ac22ff629d74b
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/core/src/iter/adapters/map.rs:124:9
22: 0x55f17477f718 - ::sum::hea1767c71216a7d1
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/core/src/iter/traits/accum.rs:42:17
23: 0x55f174465ca4 - core::iter::traits::iterator::Iterator::sum::hf0e6c90242aea12b
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/core/src/iter/traits/iterator.rs:3088:9
24: 0x55f17476db58 - lamellar::lamellae::command_queues::calc_hash::h4e6ac3bbf895c966
at /home/nixes/dev_work/lamellar-runtime/src/lamellae/command_queues.rs:60:5
25: 0x55f17476ef87 - lamellar::lamellae::command_queues::CmdMsgBuffer::flush_buffer::hd391a19071b8fce6
at /home/nixes/dev_work/lamellar-runtime/src/lamellae/command_queues.rs:251:28
26: 0x55f174772026 - lamellar::lamellae::command_queues::InnerCQ::try_sending_buffer::hde60e6bdb568879e
at /home/nixes/dev_work/lamellar-runtime/src/lamellae/command_queues.rs:561:17
27: 0x55f174772e13 - lamellar::lamellae::command_queues::InnerCQ::send::{{closure}}::ha9c4ef14e1c03bd7
at /home/nixes/dev_work/lamellar-runtime/src/lamellae/command_queues.rs:640:24
28: 0x55f17493df7d - <core::future::from_generator::GenFuture<T> as core::future::future::Future>::poll::hb9698704e3ece047
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/core/src/future/mod.rs:84:19
29: 0x55f1747795ba - lamellar::lamellae::command_queues::CommandQueue::send_data::{{closure}}::h0431491961bae8f1
at /home/nixes/dev_work/lamellar-runtime/src/lamellae/command_queues.rs:1074:70
30: 0x55f174943b5d - <core::future::from_generator::GenFuture<T> as core::future::future::Future>::poll::hdf6642be1199e606
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/core/src/future/mod.rs:84:19
31: 0x55f1745a5eef - <lamellar::lamellae::shmem_lamellae::Shmem as lamellar::lamellae::LamellaeAM>::send_to_pes_async::{{closure}}::h7450ba257cc90dfc
at /home/nixes/dev_work/lamellar-runtime/src/lamellae/shmem_lamellae.rs:149:40
32: 0x55f17493613c - <core::future::from_generator::GenFuture<T> as core::future::future::Future>::poll::h7cf96167d09b9d9e
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/core/src/future/mod.rs:84:19
33: 0x55f174592875 - <core::pin::Pin<P> as core::future::future::Future>::poll::h6bd2cc5f3b69113e
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/core/src/future/future.rs:123:9
34: 0x55f17452baad - lamellar::active_messaging::registered_active_message::RegisteredActiveMessages::add_req_to_batch::{{closure}}::h4771c74641be2b30
at /home/nixes/dev_work/lamellar-runtime/src/active_messaging/registered_active_message.rs:413:91
35: 0x55f17493e5ac - <core::future::from_generator::GenFuture<T> as core::future::future::Future>::poll::hbb4472c25f073069
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/core/src/future/mod.rs:84:19
36: 0x55f1746e9192 - <lamellar::scheduler::work_stealing::WorkStealingInner as lamellar::scheduler::AmeSchedulerQueue>::submit_task::{{closure}}::h245eed26e28f7c85
at /home/nixes/dev_work/lamellar-runtime/src/scheduler/work_stealing.rs:222:19
37: 0x55f17493688c - <core::future::from_generator::GenFuture<T> as core::future::future::Future>::poll::h80ea87dbd92d3eaa
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/core/src/future/mod.rs:84:19
38: 0x55f174633eab - async_task::raw::RawTask<F,T,S>::run::h835262f5039256d8
at /home/nixes/.cargo/registry/src/github.com-1ecc6299db9ec823/async-task-4.2.0/src/raw.rs:489:20
39: 0x55f174e08061 - async_task::runnable::Runnable::run::hadecd10fa8c50bbf
at /home/nixes/.cargo/registry/src/github.com-1ecc6299db9ec823/async-task-4.2.0/src/runnable.rs:309:18
40: 0x55f1746e51a2 - lamellar::scheduler::work_stealing::WorkStealingThread::run::{{closure}}::hd510133d8eb64179
at /home/nixes/dev_work/lamellar-runtime/src/scheduler/work_stealing.rs:89:21
41: 0x55f1744169e3 - std::sys_common::backtrace::__rust_begin_short_backtrace::h0da881cbb3dcf4a8
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/std/src/sys_common/backtrace.rs:123:18
42: 0x55f1743cc28d - std::thread::Builder::spawn_unchecked::{{closure}}::{{closure}}::h9b75cd87d68c1c6a
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/std/src/thread/mod.rs:477:17
43: 0x55f174415f31 - <core::panic::unwind_safe::AssertUnwindSafe<F> as core::ops::function::FnOnce<()>>::call_once::hbebabbb798283b2c
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/core/src/panic/unwind_safe.rs:271:9
44: 0x55f17491d94c - std::panicking::try::do_call::h27158e6086ce710b
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/std/src/panicking.rs:406:40
45: 0x55f17495c13b - __rust_try
46: 0x55f17491d887 - std::panicking::try::h629ae4a22cb5191d
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/std/src/panicking.rs:370:19
47: 0x55f1745774a1 - std::panic::catch_unwind::h0e1d859b2965093d
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/std/src/panic.rs:133:14
48: 0x55f1743cc0ad - std::thread::Builder::spawn_unchecked::{{closure}}::hbfd6028abfbdc549
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/std/src/thread/mod.rs:476:30
49: 0x55f17495c1bf - core::ops::function::FnOnce::call_once{{vtable.shim}}::hb59033300323e990
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/core/src/ops/function.rs:227:5
50: 0x55f174efa973 - <alloc::boxed::Box<F,A> as core::ops::function::FnOnce<Args>>::call_once::h49b6c7c5155a2296
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/alloc/src/boxed.rs:1854:9
51: 0x55f174efa973 - <alloc::boxed::Box<F,A> as core::ops::function::FnOnce<Args>>::call_once::ha8b5234bfeb15105
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/alloc/src/boxed.rs:1854:9
52: 0x55f174efa973 - std::sys::unix::thread::Thread::new::thread_start::h6f207dd842d64859
at /rustc/9d1b2106e23b1abd32fce1f17267604a5102f57a/library/std/src/sys/unix/thread.rs:108:17
53: 0x7f26d8e175c2 - start_thread
54: 0x7f26d8e9c584 - __clone
55: 0x0 - <unknown>
Thanks for posting an issue! We are getting to the point where we would like other folks to start testing and using the project.
Could you provide some info on your system, such as the OS, amount of memory, and number of cores?
I'm assuming this was on the master branch?
I was able to reproduce the panic (I had forgotten that release mode does not panic on integer overflow). Please check out the branch associated with this issue and give it a try!
I can confirm that the new branch fixes the issue on my end. Thanks for putting in the quick effort.