servo/gaol

Handle file descriptor access control

Opened this issue · 1 comments

A sandbox would greatly benefit from being able to use only a set of file descriptors/handles instead of accessing explicit paths, with seccomp-bpf (e.g. write(2), fstat(2)…) and maybe later with Capsicum (e.g. openat(2)).

This could also allow efficient data sharing (i.e. memfd_create(2)/seal/mmap).

cc rust-lang/rust#21936
cc rust-lang/rfcs#941
cc #2

Can't this be done by chrooting or pivot_rooting into an unlinked directory that was created inside a mounted tmpfs? I'm thinking something like this.

#define _GNU_SOURCE

#include <assert.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <grp.h>
#include <mntent.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/capability.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#include <linux/sched.h>

/*
 * Fallback for C library headers that predate execveat(2). Syscall
 * numbers are architecture specific: 322 is the x86-64 number, so any
 * other target must get the value from <sys/syscall.h> instead of
 * silently invoking an unrelated syscall.
 */
#ifndef __NR_execveat
#if defined(__x86_64__) && !defined(__ILP32__)
#define __NR_execveat 322
#else
#error "__NR_execveat is not provided by <sys/syscall.h> for this architecture"
#endif
#endif

/* Program that is executed inside the sandbox. */
#define SHELL "/bin/busybox"

/* Name of the directory (relative to the working directory) that the
 * sandbox tmpfs is mounted on.
 */
#define RUNTIME_NAME "sandbox"

/* Hostname set for the sandboxed process hierarchy. */
#define HOSTNAME "sandbox"

/* Number of elements of the array A. A must be a real array, not a
 * pointer.
 */
#define ARRAY_SIZE(A) (sizeof(A) / sizeof((A)[0U]))

/* argv and envp handed to the shell; both are NULL terminated. */
static char * const shell_arguments[] = { (char *)SHELL, "sh", NULL };
static char * const shell_environment[] = { "container=init", NULL };

/*
 * Built-in mount table in fstab(5) layout: source, mount point
 * (relative to the sandbox root), file system type and options.
 *
 * Besides the usual mount flags the option field understands "mkdir"
 * and "touch", which create the mount point as a directory or as a
 * regular file before mounting. The table is only ever read, hence
 * const.
 */
static char const fstab_config[] =
    "# <file system>    <mount point>   <type>  <options>\n"
    "tmpfs  tmp tmpfs   mkdir,nodev,noexec,nosuid\n"
    "\n"
    "# Allow connecting to X11\n"
    "/tmp/.X11-unix tmp/.X11-unix   none    mkdir,ro,bind,noexec,nosuid\n"
    "\n"
    "tmpfs  dev tmpfs   mkdir,nosuid,noexec\n"
    "\n"
    "# 3D acceleration\n"
    "/dev/dri   dev/dri none    mkdir,ro,bind,noexec,nosuid\n"
    "\n"
    "/dev/null  dev/null    none    touch,bind\n"
    "/dev/full  dev/full    none    touch,bind\n"
    "/dev/zero  dev/zero    none    touch,bind\n"
    "/dev/urandom   dev/urandom none    touch,bind\n"
    "\n"
    "/dev/tty   dev/tty none    touch,bind\n"
    "\n"
    "devpts dev/pts devpts  mkdir,ptmxmode=0666,newinstance\n"
    "\n"
    "tmpfs  run tmpfs   mkdir,nosuid,noexec\n"
    "tmpfs  run/lock    tmpfs   mkdir,nosuid,nodev,noexec\n"
    "tmpfs  run/shm tmpfs   mkdir,nosuid,nodev\n"
    "tmpfs  var tmpfs   mkdir,nosuid,noexec\n"
    "\n"
    "proc   proc    proc    mkdir,ro,nodev,noexec,nosuid\n"
    "sysfs  sys sysfs   mkdir,ro,nodev,noexec,nosuid\n"
    "\n"
    "/lib   lib none    mkdir,ro,nodev,nosuid,bind\n"
    "/lib32 lib32   none    mkdir,ro,nodev,nosuid,bind\n"
    "/lib64 lib64   none    mkdir,ro,nodev,nosuid,bind\n"
    "\n"
    "/bin   bin none    mkdir,ro,nodev,nosuid,bind\n"
    "/sbin  sbin    none    mkdir,ro,nodev,nosuid,bind\n"
    "/usr   usr none    mkdir,ro,nodev,nosuid,bind\n"
    "\n"
    "/etc   etc none    mkdir,ro,nodev,nosuid,bind\n";

/* Forward declaration: called at the very start of main(), defined
 * after it.
 */
static int close_leaked_fds(void);

int main(void)
{
    int errnum;

    if (-1 == close_leaked_fds()) {
        perror("close_leaked_fds");
        return EXIT_FAILURE;
    }

    int sh_fd = open(SHELL, O_CLOEXEC | O_NONBLOCK | O_NOCTTY);
    if (-1 == sh_fd) {
        perror("open");
        return EXIT_FAILURE;
    }

    uid_t uid = getuid();
    gid_t gid = getgid();

    uid_t mapped_uid = uid;
    gid_t mapped_gid = gid;

    /* Needed to do the rest of the unsharing */
    if (-1 == unshare(CLONE_NEWUSER)) {
        perror("unshare");
        return EXIT_FAILURE;
    }

    /* Prevent signals, ptracing of other processes */
    if (-1 == unshare(CLONE_NEWPID)) {
        perror("unshare");
        return EXIT_FAILURE;
    }

    /* Fork to allow for multithreading and to make the shell less
     * buggy.
     */
    {
        pid_t child = fork();
        if (-1 == child) {
            perror("fork");
            return EXIT_FAILURE;
        }


        if (child != 0) {
            siginfo_t info;
            do {
                errnum = -1 == waitid(P_PID, child, &info, WEXITED) ? errno : 0;
            } while (EINTR == errnum);
            if (errnum != 0) {
                assert(errnum != EINVAL);
                assert(errnum != ECHILD);
                assert(false);
            }
            return info.si_status;
        }
    }

    {
    int set_groups = open("/proc/self/setgroups", O_CLOEXEC | O_WRONLY);
    if (-1 == set_groups) {
        perror("open");
        return EXIT_FAILURE;
    }

        if (-1 == dprintf(set_groups, "deny\n")) {
            perror("dprintf");
            return EXIT_FAILURE;
        }

    if (-1 == close(set_groups)) {
        perror("close");
        return EXIT_FAILURE;
    }
    }

    {
        int file = open("/proc/self/uid_map", O_CLOEXEC | O_WRONLY);
        if (-1 == file) {
            perror("open");
            return EXIT_FAILURE;
        }

        if (-1 == dprintf(file, "%i %i 1\n", mapped_uid, uid)) {
            perror("dprintf");
            return EXIT_FAILURE;
        }

        if (-1 == close(file)) {
            perror("close");
            return EXIT_FAILURE;
        }
    }

    {
        int file = open("/proc/self/gid_map", O_CLOEXEC | O_WRONLY);
        if (-1 == file) {
            perror("open");
            return EXIT_FAILURE;
        }

        if (-1 == dprintf(file, "%i %i 1\n", mapped_gid, gid)) {
            perror("dprintf");
            return EXIT_FAILURE;
        }

        if (-1 == close(file)) {
            perror("close");
            return EXIT_FAILURE;
        }
    }

    if (-1 == setresgid(mapped_gid, mapped_gid, mapped_gid)) {
         perror("setresgid");
         return EXIT_FAILURE;
    }

    if (-1 == setresuid(mapped_uid, mapped_uid, mapped_uid)) {
         perror("setresuid");
         return EXIT_FAILURE;
    }

    /* With chroot prevent messing with user files */
    if (-1 == unshare(CLONE_NEWNS)) {
            perror("unshare");
            return EXIT_FAILURE;
    }

    /* We have unshare the network namespace so we can mount /proc
     * because of /proc/net
     */
    if (-1 == unshare(CLONE_NEWNET)) {
        perror("unshare");
        return EXIT_FAILURE;
    }

    if (0) {
        FILE * tmp = tmpfile();
        if (NULL == tmp) {
            perror("tmpfile");
            return EXIT_FAILURE;
        }

        {
            size_t bytes_to_write = sizeof fstab_config - 1U;
            if (fwrite(fstab_config, 1U, bytes_to_write, tmp) != sizeof fstab_config - 1U) {
                perror("fwrite");
                return EXIT_FAILURE;
            }
        }

        char tmppath[] = "/proc/self/fd/XXXXXXXXXXX";
        sprintf(tmppath, "/proc/self/fd/%i", fileno(tmp));

        FILE * fstab = setmntent(tmppath, "r");
        if (NULL == fstab) {
            perror("setmtent");
            return EXIT_FAILURE;
        }

        if (EOF == fclose(tmp)) {
            perror("fclose");
            return EXIT_FAILURE;
        }

        if (-1 == mkdir(RUNTIME_NAME, S_IRWXU)) {
            errnum = errno;
            if (errnum != EEXIST) {
                perror("mkdir");
                return EXIT_FAILURE;
            }
        }

        if (-1 == mount("tmpfs", RUNTIME_NAME, "tmpfs", 0, NULL)) {
            perror("mount");
            return EXIT_FAILURE;
        }

        if (-1 == chdir(RUNTIME_NAME)) {
            perror("chdir");
            return EXIT_FAILURE;
        }

        for (;;) {
            errno = 0;
            struct mntent * entry = getmntent(fstab);
            if (NULL == entry) {
                errnum = errno;
                if (errnum != 0) {
                    perror("getmntent");
                    return EXIT_FAILURE;
                }

                break;
            }

            enum {
                MKDIR,
                TOUCH,
                BIND,
                RBIND,
                REMOUNT,
                RO,
                RW,
                SUID,
                NOSUID,
                DEV,
                NODEV,
                EXEC,
                NOEXEC,
                USER,
                NOUSER,
                KERNMOUNT,
                ACTIVE
            };
            char * const token[] = {
                [MKDIR] = "mkdir",
                [TOUCH] = "touch",
                [BIND] = "bind",
                [RBIND] = "rbind",
                [REMOUNT] = "remount",
                [RO] = MNTOPT_RO,
                [RW] = MNTOPT_RW,
                [SUID] = MNTOPT_SUID,
                [NOSUID] = MNTOPT_NOSUID,
                [DEV] = "dev",
                [NODEV] = "nodev",
                [EXEC] = "exec",
                [NOEXEC] = "noexec",
                [USER] = "user",
                [NOUSER] = "nouser",
                [KERNMOUNT] = "kernmount",
                [ACTIVE] = "active",
                NULL
            };
            bool mkdir_flag = false;
            bool touch_flag = false;
            bool bind = false;
            bool rec = false;
            bool remount = false;
            bool readonly = false;
            bool readwrite = false;
            bool suid = true;
            bool dev = true;
            bool exec = true;
            bool user = true;
            bool kernmount = false;
            bool active = false;

            char *leftovers = NULL;

            {
                char *mnt_opts = entry->mnt_opts;

                if (0 == strcmp("none", mnt_opts)) {
                    goto mount;
                }

                char *subopts_str = strdup(mnt_opts);
                if (NULL == subopts_str) {
                    perror("strdup");
                    return EXIT_FAILURE;
                }

                char * subopts = subopts_str;

                char *value = NULL;
                while (*subopts != '\0') {
                    switch (getsubopt(&subopts, token, &value)) {
                    case MKDIR:
                        mkdir_flag = true;
                        break;

                    case TOUCH:
                        touch_flag = true;
                        break;

                    case BIND:
                        bind = true;
                        break;

                    case RBIND:
                        bind = true;
                        rec = true;
                        break;

                    case REMOUNT:
                        remount = true;
                        break;

                    case RO:
                        readonly = true;
                        break;

                    case RW:
                        readwrite = true;
                        break;

                    case SUID:
                        suid = true;
                        break;

                    case NOSUID:
                        suid = false;
                        break;

                    case DEV:
                        dev = true;
                        break;

                    case NODEV:
                        dev = false;
                        break;

                    case EXEC:
                        exec = true;
                        break;

                    case NOEXEC:
                        exec = false;
                        break;

                    case USER:
                        user = true;
                        break;

                    case NOUSER:
                        user = false;
                        break;

                    case KERNMOUNT:
                        kernmount = true;
                        break;

                    case ACTIVE:
                        active = true;
                        break;

                    default:;
                        leftovers = strstr(mnt_opts, value);
                        goto free_subopts_str;
                    }
                }

            free_subopts_str:
                free(subopts_str);
            }
        mount:
            if (bind && rec && readonly) {
                fprintf(stderr,
                    "It's not possible to recursively bind readonly mounts\n");
                return EXIT_FAILURE;
            }

            if (readwrite && readonly) {
                fprintf(stderr, "Only one of '%s' and '%s' can be specified\n",
                    token[RO], token[RW]);
                return EXIT_FAILURE;
            }

            if (mkdir_flag && touch_flag) {
                fprintf(stderr, "Only one of '%s' and '%s' can be specified\n",
                    token[MKDIR], token[TOUCH]);
                return EXIT_FAILURE;
            }

            unsigned long mountflags = 0;

            if (bind) {
                mountflags |= MS_BIND;
            }

            if (rec) {
                mountflags |= MS_REC;
            }

            if (remount) {
                mountflags |= MS_REMOUNT;
            }

            if (readonly) {
                mountflags |= MS_RDONLY;
            }

            if (!suid) {
                mountflags |= MS_NOSUID;
            }

            if (!dev) {
                mountflags |= MS_NODEV;
            }

            if (!exec) {
                mountflags |= MS_NOEXEC;
            }

            if (!user) {
                mountflags |= MS_NOUSER;
            }

            if (kernmount) {
                mountflags |= MS_KERNMOUNT;
            }

            if (active) {
                mountflags |= MS_ACTIVE;
            }

            if (mkdir_flag) {
                if (-1 == mkdir(entry->mnt_dir, S_IRWXU)) {
                    perror("mkdir");
                    return EXIT_FAILURE;
                }
            } else if (touch_flag) {
                int fd = open(entry->mnt_dir, O_EXCL | O_CREAT | O_CLOEXEC, S_IRWXU);
                if (-1 == fd) {
                    perror("open");
                    return EXIT_FAILURE;
                }
                close(fd);
            }

            if (-1 == mount(0 == strcmp("none", entry->mnt_fsname) ? NULL : entry->mnt_fsname,
                    entry->mnt_dir,
                    entry->mnt_type, mountflags,
                    leftovers)) {
                perror("mount");
                return EXIT_FAILURE;
            }

            if (bind && readonly) {
                mountflags |= MS_REMOUNT;
                if (-1 == mount(0 == strcmp("none", entry->mnt_fsname) ? NULL : entry->mnt_fsname,
                        entry->mnt_dir,
                        entry->mnt_type, mountflags,
                        leftovers)) {
                    perror("mount");
                    return EXIT_FAILURE;
                }
            }
        }

        if (endmntent(fstab) != 1) {
            perror("endmntent");
            return EXIT_FAILURE;
        }

        int old_root = open("/", O_DIRECTORY);
        if (-1 == old_root) {
            perror("open");
            return EXIT_FAILURE;
        }

        if (-1 == syscall(__NR_pivot_root, ".", ".")) {
            perror("pivot_root");
            return EXIT_FAILURE;
        }

        if (-1 == fchdir(old_root)) {
            perror("fchdir");
            return EXIT_FAILURE;
        }

        if (-1 == umount2(".", MNT_DETACH)) {
            perror("umount");
            return EXIT_FAILURE;
        }

        if (-1 == close(old_root)) {
            perror("close");
            return EXIT_FAILURE;
        }

        if (-1 == chdir("/")) {
            perror("chdir");
            return EXIT_FAILURE;
        }
    } else {
        if (-1 == mkdir(RUNTIME_NAME, S_IRWXU)) {
            errnum = errno;
            if (errnum != EEXIST) {
                perror("mkdir");
                return EXIT_FAILURE;
            }
        }

        if (-1 == mount("tmpfs", RUNTIME_NAME, "tmpfs", 0, NULL)) {
            perror("mount");
            return EXIT_FAILURE;
        }

        if (-1 == chdir(RUNTIME_NAME)) {
            perror("chdir");
            return EXIT_FAILURE;
        }

        if (-1 == mkdir("sandbox", S_IRWXU)) {
            errnum = errno;
            if (errnum != EEXIST) {
                perror("mkdir");
                return EXIT_FAILURE;
            }
        }

        int old_root = open("/", O_DIRECTORY);
        if (-1 == old_root) {
            perror("open");
            return EXIT_FAILURE;
        }

        if (-1 == syscall(__NR_pivot_root, ".", ".")) {
            perror("pivot_root");
            return EXIT_FAILURE;
        }

        if (-1 == fchdir(old_root)) {
            perror("fchdir");
            return EXIT_FAILURE;
        }

        if (-1 == umount2(".", MNT_DETACH)) {
            perror("umount");
            return EXIT_FAILURE;
        }

        if (-1 == close(old_root)) {
            perror("close");
            return EXIT_FAILURE;
        }

        if (-1 == chdir("/")) {
            perror("chdir");
            return EXIT_FAILURE;
        }

        int sandbox_fd = open("sandbox", O_CLOEXEC | O_DIRECTORY);
        if (-1 == sandbox_fd) {
            perror("open");
            return EXIT_FAILURE;
        }

        if (-1 == fchdir(sandbox_fd)) {
            perror("fchdir");
            return EXIT_FAILURE;
        }

        if (-1 == rmdir("../sandbox")) {
            perror("rmdir");
            return EXIT_FAILURE;
        }

        if (-1 == chroot(".")) {
            perror("chroot");
            return EXIT_FAILURE;
        }

        if (-1 == chdir("/")) {
            perror("chdir");
            return EXIT_FAILURE;
        }

        close(sandbox_fd);
    }

    /* Sandbox the rest of the namespaces */

    /* We can't unshare the IPC namespace because we need to share it
     * to use X11's shared memory extensions. Not sure how to disable
     * shared memory extensions.
     */
    if (-1 == unshare(CLONE_NEWIPC | CLONE_NEWUTS)) {
        perror("unshare");
        return EXIT_FAILURE;
    }

    /* Favor other processes over this process hierarchy. Only
     * superuser may lower priorities so this is not stoppable. This
     * also makes the process hierarchy nicer for the OOM killer.
     */
    if (-1 == setpriority(PRIO_PROCESS, 0, getpriority(PRIO_PROCESS, 0) + 1)) {
        perror("setpriority");
        return EXIT_FAILURE;
    }

    if (-1 == sethostname(HOSTNAME, sizeof HOSTNAME - 1U)) {
        perror("sethostname");
        return EXIT_FAILURE;
    }

    if (0) {
        if (-1 == symlink("/proc/self/fd", "/dev/fd")) {
            perror("symlink");
            return EXIT_FAILURE;
        }

        if (-1 == symlink("/proc/self/fd/0", "/dev/stdin")) {
            perror("symlink");
            return EXIT_FAILURE;
        }

        if (-1 == symlink("/proc/self/fd/1", "/dev/stdout")) {
            perror("symlink");
            return EXIT_FAILURE;
        }

        if (-1 == symlink("/proc/self/fd/2", "/dev/stderr")) {
            perror("symlink");
            return EXIT_FAILURE;
        }

        if (-1 == symlink("/run/shm", "/dev/shm")) {
            perror("symlink");
            return EXIT_FAILURE;
        }

        if (-1 == symlink("/dev/pts/ptmx", "/dev/ptmx")) {
            perror("symlink");
            return EXIT_FAILURE;
        }
    }

    /* Keep init super privileged */
    {
        pid_t child = fork();
        if (-1 == child) {
            perror("fork");
            return EXIT_FAILURE;
        }


        if (child != 0) {
            siginfo_t info;
            do {
                errnum = -1 == waitid(P_PID, child, &info, WEXITED) ? errno : 0;
            } while (EINTR == errnum);
            if (errnum != 0) {
                assert(errnum != EINVAL);
                assert(errnum != ECHILD);
                assert(false);
            }
            return info.si_status;
        }
    }

    /* In the shell drop all privileges I might possibly have. */
    cap_t caps = cap_get_proc();
    if (NULL == caps) {
        perror("cap_get_proc");
        return EXIT_FAILURE;
    }

    if (-1 == cap_clear_flag(caps, CAP_PERMITTED)) {
        perror("cap_clear_flag");
        return EXIT_FAILURE;
    }
    if (-1 == cap_clear_flag(caps, CAP_EFFECTIVE)) {
        perror("cap_clear_flag");
        return EXIT_FAILURE;
    }

    if (-1 == cap_set_proc(caps)) {
        perror("cap_set_proc");
        return EXIT_FAILURE;
    }

    if (-1 == cap_free(caps)) {
        perror("cap_free");
        return EXIT_FAILURE;
    }

    syscall(__NR_execveat, sh_fd, "",
        (char *const *)shell_arguments, shell_environment,
        AT_EMPTY_PATH);
    perror("execveat");
    return EXIT_FAILURE;
}

/*
 * Close every open file descriptor except stdin, stdout and stderr.
 *
 * The descriptors are enumerated through /proc/self/fd. They are
 * collected in a single pass and only closed after the directory
 * stream has been released, so the directory is never changed while it
 * is being read and a listing that changes in size cannot lead to a
 * NULL entry being dereferenced.
 *
 * Returns 0 on success. On failure returns -1 with errno set to the
 * cause; in that case no descriptor has been closed. Errors of the
 * individual close calls are ignored.
 */
static int close_leaked_fds(void)
{
    int errnum = 0;
    size_t count = 0U;
    size_t capacity = 0U;
    int *fds = NULL;

    DIR *const fds_dir = opendir("/proc/self/fd");
    if (NULL == fds_dir) {
        /* errno has been set by opendir */
        return -1;
    }

    /* The stream's own descriptor is released by closedir below */
    int const dir_fd = dirfd(fds_dir);

    for (;;) {
        /* readdir returns NULL both at the end and on error; only
         * errno tells the two apart.
         */
        errno = 0;
        struct dirent const *const entry = readdir(fds_dir);
        if (NULL == entry) {
            errnum = errno;
            break;
        }

        /* Skips ".", ".." and anything else that is not a number */
        char const *const d_name = entry->d_name;
        char *end = NULL;
        errno = 0;
        long const number = strtol(d_name, &end, 10);
        if (end == d_name || *end != '\0' || errno != 0) {
            continue;
        }

        /* Reject values that do not survive the conversion to int */
        if (number < 0 || number != (long)(int)number) {
            continue;
        }

        int const fd = (int)number;
        if (fd == dir_fd
            || STDIN_FILENO == fd
            || STDOUT_FILENO == fd
            || STDERR_FILENO == fd) {
            continue;
        }

        if (count == capacity) {
            size_t const new_capacity = 0U == capacity ? 16U : 2U * capacity;
            int *const grown = realloc(fds, new_capacity * sizeof fds[0]);
            if (NULL == grown) {
                errnum = ENOMEM;
                break;
            }
            fds = grown;
            capacity = new_capacity;
        }

        fds[count] = fd;
        ++count;
    }

    if (-1 == closedir(fds_dir) && 0 == errnum) {
        errnum = errno;
    }

    if (0 == errnum) {
        for (size_t ii = 0U; ii < count; ++ii) {
            close(fds[ii]);
        }
    }

    free(fds);

    if (errnum != 0) {
        errno = errnum;
        return -1;
    }

    return 0;
}