Segfault with two task pools
jmid opened this issue · 7 comments
The following program triggers a segfault on my machine.
dune:
(executable
 (name task_issue_crash)
 (modules task_issue_crash)
 (libraries domainslib))
task_issue_crash.ml:
open Domainslib

(* a simple work item, from ocaml/testsuite/tests/misc/takc.ml *)
let rec tak x y z =
  if x > y then tak (tak (x-1) y z) (tak (y-1) z x) (tak (z-1) x y)
  else z

let work () =
  for _ = 1 to 200 do
    assert (7 = tak 18 12 6);
  done
;;

begin
  let pool1 = Task.setup_pool ~num_additional_domains:2 () in
  let pool2 = Task.setup_pool ~num_additional_domains:1 () in
  let pool1_prom0 = Task.async pool1 work in
  let pool2_prom0 = Task.async pool2 work in
  let pool2_prom1 = Task.async pool2 work in
  Task.run pool1 (fun () -> List.iter (fun p -> Task.await pool1 p) [pool1_prom0]);
  Task.run pool2 (fun () -> List.iter (fun p -> Task.await pool2 p) [pool2_prom0; pool2_prom1]);
  Task.teardown_pool pool1;
  Task.teardown_pool pool2;
end
$ dune exec ./task_issue_crash.exe
Segmentation fault (core dumped)
I think that the Task.run usage abides by the specification:

This function should be used at the top level to enclose the calls to other
functions that may await on promises. This includes {!await},
{!parallel_for} and its variants. Otherwise, those functions will raise
[Unhandled] exception.
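For reference, a minimal single-pool variant of that pattern would look like the following (a sketch of my own, not code from the issue or from domainslib's tests); since the crash above involves two pools, I'd expect this single-pool form to follow the spec and run fine:

open Domainslib

(* Single-pool sketch of the documented pattern:
   the await is enclosed in Task.run. *)
let () =
  let pool = Task.setup_pool ~num_additional_domains:1 () in
  let p = Task.async pool (fun () -> 21 + 21) in
  assert (42 = Task.run pool (fun () -> Task.await pool p));
  Task.teardown_pool pool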
I get a segfault running it on both
- 4.12+domains (installed through opam, IIRC) and
- 4.14.0+domains+dev0, git hash '318b23950', installed manually yesterday (Thu, Dec. 9) from the latest git version,
in both cases with the latest domainslib installed through opam via opam source domainslib --dev --pin.
This is on an old Intel dual-core x86_64 ThinkPad (with 4 CPU threads) running Linux 5.4.0-91-generic.
I've dug a bit to investigate this issue. Running the program through gdb reveals:
(gdb) run
Starting program: [...] /domainslib-issue-58/_build/default/task_issue_crash.exe
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
[New Thread 0x7ffff7a35700 (LWP 369580)]
[New Thread 0x7ffff6a33700 (LWP 369582)]
[New Thread 0x7ffff7234700 (LWP 369581)]
[New Thread 0x7ffff6232700 (LWP 369583)]
[New Thread 0x7ffff516e700 (LWP 369585)]
[New Thread 0x7ffff596f700 (LWP 369584)]
[New Thread 0x7ffff48ab700 (LWP 369586)]
Thread 1 "Domain0" received signal SIGSEGV, Segmentation fault.
camlDomainslib__Ws_deque__push_315 () at lib/ws_deque.ml:128
128 let b = Atomic.get q.bottom in
With the following stack trace:
(gdb) bt
#0 camlDomainslib__Ws_deque__push_315 () at lib/ws_deque.ml:128
#1 0x00005555555a5de4 in camlDomainslib__Multi_channel__send_341 () at lib/multi_channel.ml:113
#2 0x00005555555a6adf in camlDomainslib__Task__async_331 () at lib/task.ml:54
#3 0x00005555555a3664 in camlDune__exe__Task_issue_crash__entry () at task_issue_crash.ml:19
#4 0x00005555555a0b7b in caml_program ()
#5 <signal handler called>
#6 0x0000555555600267 in caml_startup_common (argv=0x7fffffffdc58, pooling=<optimized out>, pooling@entry=0) at startup_nat.c:137
#7 0x000055555560029f in caml_startup_exn (argv=<optimized out>) at startup_nat.c:147
#8 caml_startup (argv=<optimized out>) at startup_nat.c:147
#9 0x00005555555a0852 in main (argc=<optimized out>, argv=<optimized out>) at main.c:37
Arising from these lines (at commit df30722, matching the stack trace above):
- domainslib/lib/ws_deque.ml, lines 126 to 128
- domainslib/lib/multi_channel.ml, lines 111 to 113
- domainslib/lib/task.ml, lines 51 to 54
and from this line of the repro program:
let pool2_prom0 = Task.async pool2 work in
Concretely, Array.unsafe_get in Multi_channel.send looks up index 2 of an array of size 2 (an out-of-bounds array access); Ws_deque.push then presumably dereferences the bogus value fetched, hence the crash at ws_deque.ml:128 above. The out-of-bounds access can be confirmed by using Array.get instead, which raises Invalid_argument rather than segfaulting. This seems to arise from the two Multi_channels having different sizes: Multi_channel.make (2+1) underlying pool1 allocates a channel array of size 4, whereas Multi_channel.make (1+1) underlying pool2 allocates a channel array of size 2.
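To illustrate the shape of the problem as I understand it, here is a distilled, self-contained sketch (not domainslib's actual code; the names id_key, get_id and send are hypothetical): if each domain's index into the channel array is cached in domain-local storage shared across all channels, an index obtained against pool1's size-4 array can later be used to index pool2's size-2 array:

(* Sketch: a per-domain index cached in *shared* domain-local
   storage overflows a smaller array. *)
let id_key : int option ref Domain.DLS.key =
  Domain.DLS.new_key (fun () -> ref None)

(* Pick an index once per domain, then reuse it for every array,
   regardless of that array's size. *)
let get_id size =
  let r = Domain.DLS.get id_key in
  match !r with
  | Some id -> id
  | None ->
    let id = size - 1 in  (* e.g. the last valid slot of the first array seen *)
    r := Some id;
    id

let send arr v =
  Array.set arr (get_id (Array.length arr)) v

let () =
  let big = Array.make 4 0 in    (* like pool1's channel array (size 4) *)
  let small = Array.make 2 0 in  (* like pool2's channel array (size 2) *)
  send big 1;                    (* caches index 3 for this domain *)
  send small 1                   (* index 3 into an array of size 2 *)

Here Array.set fails with Invalid_argument; with Array.unsafe_set the same out-of-bounds access can read or write arbitrary memory, matching the segfault above.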
I've also tried changing the example to be closer to the two-work-pool example in test/test_task.ml. The result also segfaults on my machine:
[...]
begin
  let pool1 = Task.setup_pool ~name:"pool1" ~num_additional_domains:2 () in
  let pool2 = Task.setup_pool ~name:"pool2" ~num_additional_domains:1 () in
  Task.run pool1 (fun () ->
    let p1 = Option.get @@ Task.lookup_pool "pool1" in
    let pool1_prom0 = Task.async p1 work in
    List.iter (fun p -> Task.await p1 p) [pool1_prom0]);
  Task.run pool2 (fun () ->
    let p2 = Option.get @@ Task.lookup_pool "pool2" in
    let pool2_prom0 = Task.async p2 work in
    let pool2_prom1 = Task.async p2 work in
    List.iter (fun p -> Task.await p2 p) [pool2_prom0; pool2_prom1]);
  Task.teardown_pool pool1;
  Task.teardown_pool pool2;
end
Out of curiosity, to try a setup with a different number of domains in each pool, I also changed test/test_task.ml:
$ diff -u test_task.ml{~,}
--- test_task.ml~ 2021-12-20 15:29:11.049259293 +0100
+++ test_task.ml 2021-12-20 15:30:31.916631932 +0100
@@ -38,7 +38,7 @@
 let () =
   let pool1 = Task.setup_pool ~num_additional_domains:2 ~name:"pool1" () in
-  let pool2 = Task.setup_pool ~num_additional_domains:2 ~name:"pool2" () in
+  let pool2 = Task.setup_pool ~num_additional_domains:1 ~name:"pool2" () in
   Task.run pool1 (fun _ ->
     let p1 = Option.get @@ Task.lookup_pool "pool1" in
     modify_arr pool1 0 ();
The resulting program also segfaults on my machine.
Ah yes, thanks! With this PR they all run smoothly. I found a minor remaining issue, but I'll report that separately.
I've resolved the conflicts from merging @edwintorok's PR against the latest master and added the crash test on top.
The result is available here: https://github.com/jmid/domainslib/tree/unique-key-updated
I can create a PR from that if you prefer - but I don't want to step on @edwintorok's toes... 🙂
@jmid don't worry, updating my PR is much appreciated, I've incorporated your branch into my PR now.