ocaml-multicore/domainslib

Segfault with two task pools

jmid opened this issue · 7 comments

jmid commented

The following program triggers a segfault on my machine.

dune:

(executable
 (name task_issue_crash)
 (modules task_issue_crash)
 (libraries domainslib))

task_issue_crash.ml:

open Domainslib

(* a simple work item, from ocaml/testsuite/tests/misc/takc.ml *)
let rec tak x y z =
  if x > y then tak (tak (x-1) y z) (tak (y-1) z x) (tak (z-1) x y)
           else z

let work () =
  for _ = 1 to 200 do
    assert (7 = tak 18 12 6);
  done
;;
begin
  let pool1 = Task.setup_pool ~num_additional_domains:2 () in
  let pool2 = Task.setup_pool ~num_additional_domains:1 () in

  let pool1_prom0 = Task.async pool1 work in

  let pool2_prom0 = Task.async pool2 work in
  let pool2_prom1 = Task.async pool2 work in

  Task.run pool1 (fun () -> List.iter (fun p -> Task.await pool1 p) [pool1_prom0]);
  Task.run pool2 (fun () -> List.iter (fun p -> Task.await pool2 p) [pool2_prom0; pool2_prom1]);

  Task.teardown_pool pool1;
  Task.teardown_pool pool2;
end

$ dune exec ./task_issue_crash.exe
Segmentation fault (core dumped)

I think this Task.run usage abides by the specification, since all of the Task.await calls are enclosed in a Task.run on the corresponding pool:

    This function should be used at the top level to enclose the calls to other
    functions that may await on promises. This includes {!await},
    {!parallel_for} and its variants. Otherwise, those functions will raise
    [Unhandled] exception. *)
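
For comparison, here is the single-pool shape of the same pattern, as a sketch reusing the work function from above. As far as I can tell this matches the documented usage, and the crash only appears with two pools as in the program above:

open Domainslib

(* [work] as defined in the original example above *)

let () =
  let pool = Task.setup_pool ~num_additional_domains:2 () in
  let prom = Task.async pool work in
  Task.run pool (fun () -> Task.await pool prom);
  Task.teardown_pool pool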

I get a segfault running the original program on both

  • 4.12+domains (installed through opam, IIRC) and
  • 4.14.0+domains+dev0, git hash '318b23950', installed manually yesterday (Thu, Dec 9) from the latest git version,

in both cases with the latest domainslib, installed via opam source domainslib --dev --pin.

This is on an old Intel dual-core x86_64 Thinkpad (w/4 CPU threads) running Linux 5.4.0-91-generic.

jmid commented

I've dug in a bit to investigate this issue. Running the program under gdb reveals:

(gdb) run
Starting program: [...] /domainslib-issue-58/_build/default/task_issue_crash.exe 
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
[New Thread 0x7ffff7a35700 (LWP 369580)]
[New Thread 0x7ffff6a33700 (LWP 369582)]
[New Thread 0x7ffff7234700 (LWP 369581)]
[New Thread 0x7ffff6232700 (LWP 369583)]
[New Thread 0x7ffff516e700 (LWP 369585)]
[New Thread 0x7ffff596f700 (LWP 369584)]
[New Thread 0x7ffff48ab700 (LWP 369586)]

Thread 1 "Domain0" received signal SIGSEGV, Segmentation fault.
camlDomainslib__Ws_deque__push_315 () at lib/ws_deque.ml:128
128	    let b = Atomic.get q.bottom in

With the following stack trace:

(gdb) bt
#0  camlDomainslib__Ws_deque__push_315 () at lib/ws_deque.ml:128
#1  0x00005555555a5de4 in camlDomainslib__Multi_channel__send_341 () at lib/multi_channel.ml:113
#2  0x00005555555a6adf in camlDomainslib__Task__async_331 () at lib/task.ml:54
#3  0x00005555555a3664 in camlDune__exe__Task_issue_crash__entry () at task_issue_crash.ml:19
#4  0x00005555555a0b7b in caml_program ()
#5  <signal handler called>
#6  0x0000555555600267 in caml_startup_common (argv=0x7fffffffdc58, pooling=<optimized out>, pooling@entry=0) at startup_nat.c:137
#7  0x000055555560029f in caml_startup_exn (argv=<optimized out>) at startup_nat.c:147
#8  caml_startup (argv=<optimized out>) at startup_nat.c:147
#9  0x00005555555a0852 in main (argc=<optimized out>, argv=<optimized out>) at main.c:37

Arising from these lines:

domainslib/lib/ws_deque.ml, lines 126 to 128 in df30722:

let push q v =
  let v' = ref v in
  let b = Atomic.get q.bottom in

domainslib/lib/multi_channel.ml, Multi_channel.send (the frame at line 113 in the backtrace):

let send mchan v =
  let id = (get_local_state mchan).id in
  Ws_deque.push (Array.unsafe_get mchan.channels id) v;

domainslib/lib/task.ml, lines 51 to 54 in df30722:

let async pool f =
  let pd = get_pool_data pool in
  let p = Atomic.make (Pending []) in
  Multi_channel.send pd.task_chan (Work (fun _ -> do_task f p));

and the call site in the example itself (task_issue_crash.ml:19):

  let pool2_prom0 = Task.async pool2 work in

Concretely, the Array.unsafe_get in Multi_channel.send looks up index 2 of an array of size 2, i.e., an out-of-bounds access. This can be confirmed by using Array.get instead.
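Sketched as a one-line patch against df30722 (hunk position approximate):

--- lib/multi_channel.ml~
+++ lib/multi_channel.ml
@@ ... @@
 let send mchan v =
   let id = (get_local_state mchan).id in
-  Ws_deque.push (Array.unsafe_get mchan.channels id) v;
+  Ws_deque.push (Array.get mchan.channels id) v;

With Array.get the program fails at the same spot with Invalid_argument "index out of bounds" instead of segfaulting.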

This seems to arise from the two pools' Multi_channels having different sizes (a standalone sketch of the suspected mechanism follows the list):

  • Multi_channel.make (2+1) underlying pool1 allocates a channel array of size 4 whereas
  • Multi_channel.make (1+1) underlying pool2 allocates a channel array of size 2.
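
Here is a standalone sketch, not domainslib's actual code, of the mechanism I suspect: a single domain-local key shared by all Multi_channels, so that an id cached while sending on pool1's size-4 channel array is blindly reused on pool2's size-2 array. The cached id is set by hand here for illustration:

let shared_id_key : int option Domain.DLS.key =
  Domain.DLS.new_key (fun () -> None)

(* Return the per-domain cached id, oblivious to which channel array
   it was originally assigned against. *)
let get_local_id () =
  match Domain.DLS.get shared_id_key with
  | Some id -> id
  | None -> 0  (* the real code assigns a fresh id here *)

let send channels v =
  let id = get_local_id () in
  (Array.get channels id) := v

let () =
  (* pretend this domain's id was cached as 2 while working with pool1 *)
  Domain.DLS.set shared_id_key (Some 2);
  let pool1_channels = Array.init 4 (fun _ -> ref 0) in
  let pool2_channels = Array.init 2 (fun _ -> ref 0) in
  send pool1_channels 1;  (* index 2 of a size-4 array: fine *)
  send pool2_channels 2   (* index 2 of a size-2 array: out of bounds *)

Here Array.get makes the second send raise Invalid_argument; with Array.unsafe_get, as in Multi_channel.send, the same bad index becomes a segfault.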
jmid commented

I've also tried changing the example to be closer to the two-work-pool example in test/test_task.ml.
This variant also segfaults on my machine:

[...]

begin
  let pool1 = Task.setup_pool ~name:"pool1" ~num_additional_domains:2 () in
  let pool2 = Task.setup_pool ~name:"pool2" ~num_additional_domains:1 () in

  Task.run pool1 (fun () ->
      let p1 = Option.get @@ Task.lookup_pool "pool1" in
      let pool1_prom0 = Task.async p1 work in
      List.iter (fun p -> Task.await p1 p) [pool1_prom0]);

  Task.run pool2 (fun () ->
      let p2 = Option.get @@ Task.lookup_pool "pool2" in
      let pool2_prom0 = Task.async p2 work in
      let pool2_prom1 = Task.async p2 work in
      List.iter (fun p -> Task.await p2 p) [pool2_prom0; pool2_prom1]);

  Task.teardown_pool pool1;
  Task.teardown_pool pool2;
end
jmid commented

Out of curiosity, I also changed test/test_task.ml itself so that the two pools have different domain counts:

$ diff -u test_task.ml{~,}
--- test_task.ml~	2021-12-20 15:29:11.049259293 +0100
+++ test_task.ml	2021-12-20 15:30:31.916631932 +0100
@@ -38,7 +38,7 @@
 
 let () =
   let pool1 = Task.setup_pool ~num_additional_domains:2 ~name:"pool1" () in
-  let pool2 = Task.setup_pool ~num_additional_domains:2 ~name:"pool2" () in
+  let pool2 = Task.setup_pool ~num_additional_domains:1 ~name:"pool2" () in
   Task.run pool1 (fun _ ->
     let p1 = Option.get @@ Task.lookup_pool "pool1" in
     modify_arr pool1 0 ();

The resulting program also segfaults on my machine.

This PR is still not merged: #50. Perhaps it explains the issue?

jmid commented

Ah yes, thanks! With this PR they all run smoothly. I found a minor remaining issue, but I'll report that separately.

I've resolved the conflicts from merging @edwintorok's PR against the latest master and added the crash test on top.
The result is available here: https://github.com/jmid/domainslib/tree/unique-key-updated
I can create a PR from that if you prefer, but I don't want to step on @edwintorok's toes... 🙂
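
For the record, my understanding of the fix (judging from the PR and the branch name; the type and field names below are mine, not the PR's actual code) is to give every Multi_channel its own domain-local key instead of one key shared by all channels, so that a cached id can never be applied to another pool's channel array:

(* hypothetical "unique key per channel" layout *)
type 'a chan = {
  channels : 'a array;                 (* stands in for the per-domain deques *)
  id_key : int option Domain.DLS.key;  (* fresh key created per channel *)
}

let make_chan n init = {
  channels = Array.init n (fun _ -> init ());
  id_key = Domain.DLS.new_key (fun () -> None);
}

let () =
  let c1 = make_chan 4 (fun () -> ref 0) in  (* pool1-sized *)
  let c2 = make_chan 2 (fun () -> ref 0) in  (* pool2-sized *)
  (* an id cached under c1.id_key is simply absent under c2.id_key *)
  ignore c1; ignore c2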

edwintorok commented

@jmid don't worry, updating my PR is much appreciated. I've incorporated your branch into my PR now.

jmid commented

Solved with the merge of #50.