aoh/radamsa

Radamsa hang on linux and macOS

yrp604 opened this issue · 4 comments

I get a hang sometimes when running radamsa as a child process on macOS and linux. I don't think I've done anything wrong here, but if so my apologies.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>

#include <sys/types.h>
#include <sys/wait.h>

/*
 * Feed buf_sz bytes of buf to a child `radamsa` process on its stdin, then
 * read up to buf_sz bytes of the child's stdout back into buf (overwriting
 * the input in place).
 *
 * None of the return values of pipe/fork/dup2/fdopen/fwrite/waitpid/fread
 * are checked here.
 */
void radamsa(uint8_t *buf, const size_t buf_sz) {
    /* child_in: parent writes [1] -> child reads [0] (child's stdin). */
    int child_in[2]  = { -1, -1 };
    /* child_out: child writes [1] -> parent reads [0] (child's stdout). */
    int child_out[2] = { -1, -1 };

    pipe (child_in);
    pipe (child_out);

    pid_t child = fork ();

    if (child == 0) {
        /* Child: rewire stdin/stdout onto the pipe ends it uses. */
        dup2 (child_in[0], STDIN_FILENO);
        dup2 (child_out[1], STDOUT_FILENO);

        /* The original pipe descriptors are no longer needed after dup2. */
        close (child_in[0]);
        close (child_in[1]);

        close (child_out[0]);
        close (child_out[1]);

        /* Look up "radamsa" via PATH and run it with no extra arguments. */
        execlp ("radamsa", "radamsa", NULL);

        /* Only reached if execlp failed. */
        abort ();
    }

    /* Parent: close the pipe ends that belong to the child. */
    close (child_in[0]);
    close (child_out[1]);

    FILE *in  = fdopen (child_in[1], "w");
    FILE *out = fdopen (child_out[0], "r");

    /* Send the input, then close the stream so the child sees EOF on stdin. */
    fwrite (buf, 1, buf_sz, in);
    fclose (in);

    /*
     * NOTE(review): the parent waits for the child to exit BEFORE reading
     * anything from child_out. If the child writes more output than the pipe
     * can hold, its write to stdout cannot complete while the parent is
     * blocked here, and neither process makes progress. Draining `out`
     * before calling waitpid avoids this.
     */
    int status;
    waitpid (child, &status, 0);

    /* Reads at most buf_sz bytes; any further child output is not read. */
    fread (buf, 1, buf_sz, out);

    fclose (out);
}

/*
 * Driver loop: forever build a 19-byte payload of 'A' (0x41) followed by a
 * NUL terminator, pass the payload through radamsa(), and print whatever
 * ends up in the buffer as a C string.
 */
int main() {
    while (1) {
        uint8_t buf[20];
        /* Last byte is reserved for the terminator and never handed out. */
        const size_t payload_len = sizeof(buf) - 1;

        memset (buf, 0x41, payload_len);
        buf[payload_len] = 0;

        radamsa (buf, payload_len);

        printf ("%s\n", buf);
    }
}

Linux backtrace:

(gdb) bt
#0  0x00007f96014f93a0 in __nanosleep_nocancel () at ../sysdeps/unix/syscall-template.S:81
#1  0x00007f9601522fd4 in usleep (useconds=<optimized out>) at ../sysdeps/unix/sysv/linux/usleep.c:32
#2  0x0000000000404361 in vm ()
#3  0x000000000049f580 in boot ()
#4  0x0000000000401009 in main ()

macOS:

(lldb) bt
* thread #1, queue = 'com.apple.main-thread', stop reason = signal SIGSTOP
  * frame #0: 0x00007fff9f8eaf46 libsystem_kernel.dylib`__semwait_signal + 10
    frame #1: 0x00007fff9f871b72 libsystem_c.dylib`nanosleep + 199
    frame #2: 0x00007fff9f871a66 libsystem_c.dylib`usleep + 54
    frame #3: 0x0000000101bcbaf3 radamsa`vm + 2941
    frame #4: 0x0000000101bcab48 radamsa`boot + 784
    frame #5: 0x0000000101bf28f9 radamsa`main + 9
    frame #6: 0x00007fff9f7bc235 libdyld.dylib`start + 1

My reproducer pretty reliably hangs within a few seconds. Tested on macOS 10.12.4 and Linux 4.3. Happy to provide any more info if it would be useful.

Oops, the Linux backtrace above was taken with an older version of radamsa. The current version still hangs, but has a different backtrace:

(gdb) bt
#0  0x00007ff804bc3873 in __select_nocancel () at ../sysdeps/unix/syscall-template.S:81
#1  0x0000000000402bc1 in do_poll (a=<optimized out>, b=<optimized out>, c=<optimized out>, r1=0x6c6178 <R.5254+24>, r2=0x6c6180 <R.5254+32>) at radamsa.c:3591
#2  0x00000000004040b1 in vm (ob=0x7ff7fcae6270, ob@entry=0x7ff7fcb23088, args=args@entry=0x7ff7fcadd020) at radamsa.c:3753
#3  0x00000000004ae8d7 in boot (nargs=nargs@entry=1, argv=argv@entry=0x7ffca075c5b8) at radamsa.c:3562
#4  0x0000000000400f91 in main (nargs=1, argv=0x7ffca075c5b8) at radamsa.c:6369
aoh commented

Syscalls seem ok from radamsa side.

...
open("/dev/urandom", O_RDONLY)          = 3
fstat(3, {st_mode=S_IFCHR|0666, st_rdev=makedev(1, 9), ...}) = 0
fcntl(3, F_GETFD)                       = 0
fcntl(3, F_SETFL, O_RDONLY|O_NONBLOCK)  = 0
read(3, "\242\341\\\26\357\326A\206w\0", 10) = 10
close(3)                                = 0
read(0, "AAAAAAAAAAAAAAAAAAA", 797)     = 19
read(0, "", 778)                        = 0
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 1066) = 1066
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = 2048
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 2048) = -1 EAGAIN (Resource temporarily unavailable)
select(2, [], [1], [1], NULL
aoh commented

Radamsa is waiting for its stdout to become writable, but the C code only reads from and closes its end of that pipe after waiting for radamsa to finish. Try moving waitpid (child, &status, 0); below fclose(out).

Ah, I see what's happening. Radamsa's stdout pipe is filling up, causing it to block on writes, while the parent is blocked in waitpid.

If I close the pipe before waiting, I'll be unable to read the radamsa output. Instead I need to read the data after closing radamsa's stdin, but before waiting on radamsa. Thanks for taking a look, and sorry for the false alarm.