Handle file descriptor access control
Opened this issue · 1 comments
l0kod commented
A sandbox would greatly benefit from being able to use only a set of file descriptors/handles instead of accessing explicit paths, with seccomp-bpf (e.g. `write(2)`, `fstat(2)`, …) and maybe later with Capsicum (e.g. `openat(2)`).
This could also allow efficient data sharing (i.e. `memfd_create(2)`/seal/`mmap`).
cc rust-lang/rust#21936
cc rust-lang/rfcs#941
cc #2
mstewartgallus commented
Can't this be done by `chroot`ing or `pivot_root`ing into an unlinked directory that was created inside a mounted tmpfs? I'm thinking of something like this:
#define _GNU_SOURCE
#include <assert.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <grp.h>
#include <mntent.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/capability.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <linux/sched.h>
#ifndef __NR_execveat
#define __NR_execveat 322
#endif
/* Program executed inside the sandbox; opened by path before the root changes. */
#define SHELL "/bin/busybox"
/* Directory (relative to the current directory) that the sandbox tmpfs is mounted on. */
#define RUNTIME_NAME "sandbox"
/* Hostname set after the UTS namespace has been unshared. */
#define HOSTNAME "sandbox"
/* Number of elements in a true array (not a pointer); argument fully parenthesized. */
#define ARRAY_SIZE(A) (sizeof(A) / sizeof((A)[0U]))

/* argv for the shell: BusyBox is invoked with the applet name "sh". */
static char * const shell_arguments[] = { (char *)SHELL, "sh", NULL };
/* Complete environment handed to the shell. */
static char * const shell_environment[] = { "container=init", NULL };

/*
 * fstab-style description of the sandbox filesystem, one mount per line:
 *   <file system> <mount point> <type> <options>
 * Mount points are relative to the sandbox root. In addition to ordinary
 * mount options, main() understands "mkdir" and "touch", which create the
 * mount point as a directory or a regular file before mounting.
 * The table is only ever read, hence const.
 */
static const char fstab_config[] =
"# <file system> <mount point> <type> <options>\n"
"tmpfs tmp tmpfs mkdir,nodev,noexec,nosuid\n"
"\n"
"# Allow connecting to X11\n"
"/tmp/.X11-unix tmp/.X11-unix none mkdir,ro,bind,noexec,nosuid\n"
"\n"
"tmpfs dev tmpfs mkdir,nosuid,noexec\n"
"\n"
"# 3D acceleration\n"
"/dev/dri dev/dri none mkdir,ro,bind,noexec,nosuid\n"
"\n"
"/dev/null dev/null none touch,bind\n"
"/dev/full dev/full none touch,bind\n"
"/dev/zero dev/zero none touch,bind\n"
"/dev/urandom dev/urandom none touch,bind\n"
"\n"
"/dev/tty dev/tty none touch,bind\n"
"\n"
"devpts dev/pts devpts mkdir,ptmxmode=0666,newinstance\n"
"\n"
"tmpfs run tmpfs mkdir,nosuid,noexec\n"
"tmpfs run/lock tmpfs mkdir,nosuid,nodev,noexec\n"
"tmpfs run/shm tmpfs mkdir,nosuid,nodev\n"
"tmpfs var tmpfs mkdir,nosuid,noexec\n"
"\n"
"proc proc proc mkdir,ro,nodev,noexec,nosuid\n"
"sysfs sys sysfs mkdir,ro,nodev,noexec,nosuid\n"
"\n"
"/lib lib none mkdir,ro,nodev,nosuid,bind\n"
"/lib32 lib32 none mkdir,ro,nodev,nosuid,bind\n"
"/lib64 lib64 none mkdir,ro,nodev,nosuid,bind\n"
"\n"
"/bin bin none mkdir,ro,nodev,nosuid,bind\n"
"/sbin sbin none mkdir,ro,nodev,nosuid,bind\n"
"/usr usr none mkdir,ro,nodev,nosuid,bind\n"
"\n"
"/etc etc none mkdir,ro,nodev,nosuid,bind\n";

/* Closes every open descriptor except stdin/stdout/stderr; defined below main(). */
static int close_leaked_fds(void);
/**
 * Build a throw-away sandbox and start a shell inside it.
 *
 * The statement order is significant; in outline:
 *  1. Close inherited descriptors (stdio excepted) and open SHELL so it can
 *     later be executed by descriptor once its path is no longer reachable.
 *  2. Unshare the user and PID namespaces, then fork; the parent only waits
 *     for the child and returns its status.
 *  3. In the child, write "deny" to /proc/self/setgroups and map the caller's
 *     uid/gid onto themselves through uid_map/gid_map.
 *  4. Unshare the mount and network namespaces.
 *  5. Mount a tmpfs on RUNTIME_NAME, pivot_root into it, detach the old
 *     root, then chroot into a subdirectory that has just been rmdir'ed, so
 *     the root is an unlinked, empty tmpfs directory. (An alternative branch
 *     that populates the root from fstab_config is compiled but disabled
 *     with `if (0)`.)
 *  6. Unshare the IPC and UTS namespaces, raise the nice value by one and
 *     set the hostname.
 *  7. Fork again; the child clears its permitted and effective capability
 *     sets and executes the shell with execveat() on the saved descriptor.
 *
 * Returns EXIT_FAILURE (after a perror() diagnostic) when a setup step
 * fails; a waiting parent returns the si_status reported for its child.
 */
int main(void)
{
    int errnum;

    /*
     * close_leaked_fds() reports failure as an errno value (0 on success),
     * not as -1, so test for non-zero and route the code through errno so
     * that perror() prints the right message.
     */
    errnum = close_leaked_fds();
    if (errnum != 0) {
        errno = errnum;
        perror("close_leaked_fds");
        return EXIT_FAILURE;
    }

    /* Grab the shell binary now; it is exec'ed by descriptor at the end. */
    int sh_fd = open(SHELL, O_CLOEXEC | O_NONBLOCK | O_NOCTTY);
    if (-1 == sh_fd) {
        perror("open");
        return EXIT_FAILURE;
    }

    /* Record the ids before entering the new user namespace. */
    uid_t uid = getuid();
    gid_t gid = getgid();
    uid_t mapped_uid = uid;
    gid_t mapped_gid = gid;

    /* Needed to do the rest of the unsharing */
    if (-1 == unshare(CLONE_NEWUSER)) {
        perror("unshare");
        return EXIT_FAILURE;
    }

    /* Prevent signals, ptracing of other processes */
    if (-1 == unshare(CLONE_NEWPID)) {
        perror("unshare");
        return EXIT_FAILURE;
    }

    /* Fork to allow for multithreading and to make the shell less
     * buggy.
     */
    {
        pid_t child = fork();
        if (-1 == child) {
            perror("fork");
            return EXIT_FAILURE;
        }
        if (child != 0) {
            /* Parent: wait (retrying on EINTR) and forward the child's status. */
            siginfo_t info;
            do {
                errnum = -1 == waitid(P_PID, child, &info, WEXITED) ? errno : 0;
            } while (EINTR == errnum);
            if (errnum != 0) {
                assert(errnum != EINVAL);
                assert(errnum != ECHILD);
                assert(false);
                /* With NDEBUG the asserts vanish; never read an unset `info`. */
                return EXIT_FAILURE;
            }
            return info.si_status;
        }
    }

    /* Write "deny" to setgroups before touching gid_map. */
    {
        int set_groups = open("/proc/self/setgroups", O_CLOEXEC | O_WRONLY);
        if (-1 == set_groups) {
            perror("open");
            return EXIT_FAILURE;
        }
        if (dprintf(set_groups, "deny\n") < 0) {
            perror("dprintf");
            return EXIT_FAILURE;
        }
        if (-1 == close(set_groups)) {
            perror("close");
            return EXIT_FAILURE;
        }
    }

    /* Map a single uid: "<id inside> <id outside> <count>". */
    {
        int file = open("/proc/self/uid_map", O_CLOEXEC | O_WRONLY);
        if (-1 == file) {
            perror("open");
            return EXIT_FAILURE;
        }
        /* The ids are printed as unsigned values, hence %u with a cast. */
        if (dprintf(file, "%u %u 1\n", (unsigned)mapped_uid, (unsigned)uid) < 0) {
            perror("dprintf");
            return EXIT_FAILURE;
        }
        if (-1 == close(file)) {
            perror("close");
            return EXIT_FAILURE;
        }
    }

    /* Same single-entry mapping for the gid. */
    {
        int file = open("/proc/self/gid_map", O_CLOEXEC | O_WRONLY);
        if (-1 == file) {
            perror("open");
            return EXIT_FAILURE;
        }
        if (dprintf(file, "%u %u 1\n", (unsigned)mapped_gid, (unsigned)gid) < 0) {
            perror("dprintf");
            return EXIT_FAILURE;
        }
        if (-1 == close(file)) {
            perror("close");
            return EXIT_FAILURE;
        }
    }

    if (-1 == setresgid(mapped_gid, mapped_gid, mapped_gid)) {
        perror("setresgid");
        return EXIT_FAILURE;
    }
    if (-1 == setresuid(mapped_uid, mapped_uid, mapped_uid)) {
        perror("setresuid");
        return EXIT_FAILURE;
    }

    /* With chroot prevent messing with user files */
    if (-1 == unshare(CLONE_NEWNS)) {
        perror("unshare");
        return EXIT_FAILURE;
    }

    /* We have unshare the network namespace so we can mount /proc
     * because of /proc/net
     */
    if (-1 == unshare(CLONE_NEWNET)) {
        perror("unshare");
        return EXIT_FAILURE;
    }

    if (0) {
        /*
         * Disabled variant: populate the new root from fstab_config.
         * The table is written to an anonymous temporary file so that it
         * can be parsed with the <mntent.h> API through /proc/self/fd.
         */
        FILE * tmp = tmpfile();
        if (NULL == tmp) {
            perror("tmpfile");
            return EXIT_FAILURE;
        }
        {
            size_t bytes_to_write = sizeof fstab_config - 1U;
            if (fwrite(fstab_config, 1U, bytes_to_write, tmp) != bytes_to_write) {
                perror("fwrite");
                return EXIT_FAILURE;
            }
        }

        /* The placeholder leaves room for the widest possible int. */
        char tmppath[] = "/proc/self/fd/XXXXXXXXXXX";
        sprintf(tmppath, "/proc/self/fd/%i", fileno(tmp));

        FILE * fstab = setmntent(tmppath, "r");
        if (NULL == fstab) {
            perror("setmntent");
            return EXIT_FAILURE;
        }
        if (EOF == fclose(tmp)) {
            perror("fclose");
            return EXIT_FAILURE;
        }

        if (-1 == mkdir(RUNTIME_NAME, S_IRWXU)) {
            errnum = errno;
            if (errnum != EEXIST) {
                perror("mkdir");
                return EXIT_FAILURE;
            }
        }
        if (-1 == mount("tmpfs", RUNTIME_NAME, "tmpfs", 0, NULL)) {
            perror("mount");
            return EXIT_FAILURE;
        }
        if (-1 == chdir(RUNTIME_NAME)) {
            perror("chdir");
            return EXIT_FAILURE;
        }

        /* One iteration per fstab entry. */
        for (;;) {
            errno = 0;
            struct mntent * entry = getmntent(fstab);
            if (NULL == entry) {
                errnum = errno;
                if (errnum != 0) {
                    perror("getmntent");
                    return EXIT_FAILURE;
                }
                break;
            }

            /* Indices into token[] below; the two must stay in step. */
            enum {
                MKDIR,
                TOUCH,
                BIND,
                RBIND,
                REMOUNT,
                RO,
                RW,
                SUID,
                NOSUID,
                DEV,
                NODEV,
                EXEC,
                NOEXEC,
                USER,
                NOUSER,
                KERNMOUNT,
                ACTIVE
            };
            char * const token[] = {
                [MKDIR] = "mkdir",
                [TOUCH] = "touch",
                [BIND] = "bind",
                [RBIND] = "rbind",
                [REMOUNT] = "remount",
                [RO] = MNTOPT_RO,
                [RW] = MNTOPT_RW,
                [SUID] = MNTOPT_SUID,
                [NOSUID] = MNTOPT_NOSUID,
                [DEV] = "dev",
                [NODEV] = "nodev",
                [EXEC] = "exec",
                [NOEXEC] = "noexec",
                [USER] = "user",
                [NOUSER] = "nouser",
                [KERNMOUNT] = "kernmount",
                [ACTIVE] = "active",
                NULL
            };

            bool mkdir_flag = false;
            bool touch_flag = false;
            bool bind = false;
            bool rec = false;
            bool remount = false;
            bool readonly = false;
            bool readwrite = false;
            bool suid = true;
            bool dev = true;
            bool exec = true;
            bool user = true;
            bool kernmount = false;
            bool active = false;
            /* Tail of the option string, starting at the first unknown option. */
            char *leftovers = NULL;

            {
                char *mnt_opts = entry->mnt_opts;
                if (0 == strcmp("none", mnt_opts)) {
                    goto mount;
                }

                /* getsubopt() writes into its input, so parse a copy. */
                char *subopts_str = strdup(mnt_opts);
                if (NULL == subopts_str) {
                    perror("strdup");
                    return EXIT_FAILURE;
                }
                char * subopts = subopts_str;
                char *value = NULL;
                while (*subopts != '\0') {
                    switch (getsubopt(&subopts, token, &value)) {
                    case MKDIR:
                        mkdir_flag = true;
                        break;
                    case TOUCH:
                        touch_flag = true;
                        break;
                    case BIND:
                        bind = true;
                        break;
                    case RBIND:
                        bind = true;
                        rec = true;
                        break;
                    case REMOUNT:
                        remount = true;
                        break;
                    case RO:
                        readonly = true;
                        break;
                    case RW:
                        readwrite = true;
                        break;
                    case SUID:
                        suid = true;
                        break;
                    case NOSUID:
                        suid = false;
                        break;
                    case DEV:
                        dev = true;
                        break;
                    case NODEV:
                        dev = false;
                        break;
                    case EXEC:
                        exec = true;
                        break;
                    case NOEXEC:
                        exec = false;
                        break;
                    case USER:
                        user = true;
                        break;
                    case NOUSER:
                        user = false;
                        break;
                    case KERNMOUNT:
                        kernmount = true;
                        break;
                    case ACTIVE:
                        active = true;
                        break;
                    default:;
                        /*
                         * First unrecognized option: locate it in the
                         * untouched original string and pass everything
                         * from there on to mount() as filesystem data.
                         */
                        leftovers = strstr(mnt_opts, value);
                        goto free_subopts_str;
                    }
                }
            free_subopts_str:
                free(subopts_str);
            }

        mount:
            if (bind && rec && readonly) {
                fprintf(stderr,
                        "It's not possible to recursively bind readonly mounts\n");
                return EXIT_FAILURE;
            }
            if (readwrite && readonly) {
                fprintf(stderr, "Only one of '%s' and '%s' can be specified\n",
                        token[RO], token[RW]);
                return EXIT_FAILURE;
            }
            if (mkdir_flag && touch_flag) {
                fprintf(stderr, "Only one of '%s' and '%s' can be specified\n",
                        token[MKDIR], token[TOUCH]);
                return EXIT_FAILURE;
            }

            /* Translate the parsed options into mount(2) flags. */
            unsigned long mountflags = 0;
            if (bind) {
                mountflags |= MS_BIND;
            }
            if (rec) {
                mountflags |= MS_REC;
            }
            if (remount) {
                mountflags |= MS_REMOUNT;
            }
            if (readonly) {
                mountflags |= MS_RDONLY;
            }
            if (!suid) {
                mountflags |= MS_NOSUID;
            }
            if (!dev) {
                mountflags |= MS_NODEV;
            }
            if (!exec) {
                mountflags |= MS_NOEXEC;
            }
            if (!user) {
                mountflags |= MS_NOUSER;
            }
            if (kernmount) {
                mountflags |= MS_KERNMOUNT;
            }
            if (active) {
                mountflags |= MS_ACTIVE;
            }

            /* Create the mount point: a directory or an empty file. */
            if (mkdir_flag) {
                if (-1 == mkdir(entry->mnt_dir, S_IRWXU)) {
                    perror("mkdir");
                    return EXIT_FAILURE;
                }
            } else if (touch_flag) {
                int fd = open(entry->mnt_dir, O_EXCL | O_CREAT | O_CLOEXEC, S_IRWXU);
                if (-1 == fd) {
                    perror("open");
                    return EXIT_FAILURE;
                }
                close(fd);
            }

            if (-1 == mount(0 == strcmp("none", entry->mnt_fsname) ? NULL : entry->mnt_fsname,
                            entry->mnt_dir,
                            entry->mnt_type, mountflags,
                            leftovers)) {
                perror("mount");
                return EXIT_FAILURE;
            }

            /* Read-only bind mounts are issued a second time with MS_REMOUNT. */
            if (bind && readonly) {
                mountflags |= MS_REMOUNT;
                if (-1 == mount(0 == strcmp("none", entry->mnt_fsname) ? NULL : entry->mnt_fsname,
                                entry->mnt_dir,
                                entry->mnt_type, mountflags,
                                leftovers)) {
                    perror("mount");
                    return EXIT_FAILURE;
                }
            }
        }

        if (endmntent(fstab) != 1) {
            perror("endmntent");
            return EXIT_FAILURE;
        }

        /* Swap roots: pivot onto the tmpfs, then detach the old root. */
        int old_root = open("/", O_DIRECTORY);
        if (-1 == old_root) {
            perror("open");
            return EXIT_FAILURE;
        }
        if (-1 == syscall(__NR_pivot_root, ".", ".")) {
            perror("pivot_root");
            return EXIT_FAILURE;
        }
        if (-1 == fchdir(old_root)) {
            perror("fchdir");
            return EXIT_FAILURE;
        }
        if (-1 == umount2(".", MNT_DETACH)) {
            perror("umount");
            return EXIT_FAILURE;
        }
        if (-1 == close(old_root)) {
            perror("close");
            return EXIT_FAILURE;
        }
        if (-1 == chdir("/")) {
            perror("chdir");
            return EXIT_FAILURE;
        }
    } else {
        /*
         * Active variant: the root becomes an empty directory on a fresh
         * tmpfs that is unlinked before chroot(), so the sandbox has no
         * path-reachable files at all.
         */
        if (-1 == mkdir(RUNTIME_NAME, S_IRWXU)) {
            errnum = errno;
            if (errnum != EEXIST) {
                perror("mkdir");
                return EXIT_FAILURE;
            }
        }
        if (-1 == mount("tmpfs", RUNTIME_NAME, "tmpfs", 0, NULL)) {
            perror("mount");
            return EXIT_FAILURE;
        }
        if (-1 == chdir(RUNTIME_NAME)) {
            perror("chdir");
            return EXIT_FAILURE;
        }
        /* Directory inside the tmpfs that will become the final root. */
        if (-1 == mkdir("sandbox", S_IRWXU)) {
            errnum = errno;
            if (errnum != EEXIST) {
                perror("mkdir");
                return EXIT_FAILURE;
            }
        }

        /* Swap roots: pivot onto the tmpfs, then detach the old root. */
        int old_root = open("/", O_DIRECTORY);
        if (-1 == old_root) {
            perror("open");
            return EXIT_FAILURE;
        }
        if (-1 == syscall(__NR_pivot_root, ".", ".")) {
            perror("pivot_root");
            return EXIT_FAILURE;
        }
        if (-1 == fchdir(old_root)) {
            perror("fchdir");
            return EXIT_FAILURE;
        }
        if (-1 == umount2(".", MNT_DETACH)) {
            perror("umount");
            return EXIT_FAILURE;
        }
        if (-1 == close(old_root)) {
            perror("close");
            return EXIT_FAILURE;
        }
        if (-1 == chdir("/")) {
            perror("chdir");
            return EXIT_FAILURE;
        }

        /* Enter the directory, unlink it from its parent, then chroot into it. */
        int sandbox_fd = open("sandbox", O_CLOEXEC | O_DIRECTORY);
        if (-1 == sandbox_fd) {
            perror("open");
            return EXIT_FAILURE;
        }
        if (-1 == fchdir(sandbox_fd)) {
            perror("fchdir");
            return EXIT_FAILURE;
        }
        if (-1 == rmdir("../sandbox")) {
            perror("rmdir");
            return EXIT_FAILURE;
        }
        if (-1 == chroot(".")) {
            perror("chroot");
            return EXIT_FAILURE;
        }
        if (-1 == chdir("/")) {
            perror("chdir");
            return EXIT_FAILURE;
        }
        close(sandbox_fd);
    }

    /* Sandbox the rest of the namespaces */
    /* We can't unshare the IPC namespace because we need to share it
     * to use X11's shared memory extensions. Not sure how to disable
     * shared memory extensions.
     */
    /* NOTE(review): the call below does pass CLONE_NEWIPC, which
     * contradicts the comment above; confirm which one is intended.
     */
    if (-1 == unshare(CLONE_NEWIPC | CLONE_NEWUTS)) {
        perror("unshare");
        return EXIT_FAILURE;
    }

    /* Favor other processes over this process hierarchy. Only
     * superuser may lower priorities so this is not stoppable. This
     * also makes the process hierarchy nicer for the OOM killer.
     */
    if (-1 == setpriority(PRIO_PROCESS, 0, getpriority(PRIO_PROCESS, 0) + 1)) {
        perror("setpriority");
        return EXIT_FAILURE;
    }

    if (-1 == sethostname(HOSTNAME, sizeof HOSTNAME - 1U)) {
        perror("sethostname");
        return EXIT_FAILURE;
    }

    if (0) {
        /* Disabled: conventional /dev symlinks for the fstab-based root. */
        if (-1 == symlink("/proc/self/fd", "/dev/fd")) {
            perror("symlink");
            return EXIT_FAILURE;
        }
        if (-1 == symlink("/proc/self/fd/0", "/dev/stdin")) {
            perror("symlink");
            return EXIT_FAILURE;
        }
        if (-1 == symlink("/proc/self/fd/1", "/dev/stdout")) {
            perror("symlink");
            return EXIT_FAILURE;
        }
        if (-1 == symlink("/proc/self/fd/2", "/dev/stderr")) {
            perror("symlink");
            return EXIT_FAILURE;
        }
        if (-1 == symlink("/run/shm", "/dev/shm")) {
            perror("symlink");
            return EXIT_FAILURE;
        }
        if (-1 == symlink("/dev/pts/ptmx", "/dev/ptmx")) {
            perror("symlink");
            return EXIT_FAILURE;
        }
    }

    /* Keep init super privileged */
    {
        pid_t child = fork();
        if (-1 == child) {
            perror("fork");
            return EXIT_FAILURE;
        }
        if (child != 0) {
            /* Parent: wait (retrying on EINTR) and forward the child's status. */
            siginfo_t info;
            do {
                errnum = -1 == waitid(P_PID, child, &info, WEXITED) ? errno : 0;
            } while (EINTR == errnum);
            if (errnum != 0) {
                assert(errnum != EINVAL);
                assert(errnum != ECHILD);
                assert(false);
                /* With NDEBUG the asserts vanish; never read an unset `info`. */
                return EXIT_FAILURE;
            }
            return info.si_status;
        }
    }

    /* In the shell drop all privileges I might possibly have. */
    /* NOTE(review): only the permitted and effective sets are cleared;
     * the inheritable set is left as it was. Confirm this is intended.
     */
    cap_t caps = cap_get_proc();
    if (NULL == caps) {
        perror("cap_get_proc");
        return EXIT_FAILURE;
    }
    if (-1 == cap_clear_flag(caps, CAP_PERMITTED)) {
        perror("cap_clear_flag");
        return EXIT_FAILURE;
    }
    if (-1 == cap_clear_flag(caps, CAP_EFFECTIVE)) {
        perror("cap_clear_flag");
        return EXIT_FAILURE;
    }
    if (-1 == cap_set_proc(caps)) {
        perror("cap_set_proc");
        return EXIT_FAILURE;
    }
    if (-1 == cap_free(caps)) {
        perror("cap_free");
        return EXIT_FAILURE;
    }

    /* Execute the shell through the descriptor opened at the very start. */
    syscall(__NR_execveat, sh_fd, "",
            (char *const *)shell_arguments, shell_environment,
            AT_EMPTY_PATH);
    /* Only reached when execveat failed. */
    perror("execveat");
    return EXIT_FAILURE;
}
/*
 * Parse a /proc/self/fd entry name as a non-negative descriptor number.
 * Returns false for anything that is not a plain decimal int, which also
 * filters out the "." and ".." entries.
 */
static bool parse_fd_name(char const *name, int *fd_out)
{
    char *end = NULL;

    errno = 0;
    long const value = strtol(name, &end, 10);
    if (end == name || *end != '\0' || errno != 0) {
        return false;
    }
    /* Reject values that do not survive a round trip through int. */
    if (value < 0 || (long)(int)value != value) {
        return false;
    }
    *fd_out = (int)value;
    return true;
}

/**
 * Close every open file descriptor of this process except stdin, stdout
 * and stderr.
 *
 * Descriptors are enumerated through /proc/self/fd in two passes (count,
 * then collect) and are only closed after the directory stream itself has
 * been closed, so the listing is never modified while it is being read.
 * Errors from the individual close() calls are deliberately ignored.
 *
 * Returns 0 on success, otherwise the errno value of the first failure
 * (opendir, readdir, calloc or closedir). Note that failure is reported as
 * a positive errno value, not as -1. When an error is returned no
 * descriptor has been closed.
 */
static int close_leaked_fds(void)
{
    int errnum = 0;
    size_t capacity = 0U;
    size_t count = 0U;
    int *fds = NULL;

    DIR *const fds_dir = opendir("/proc/self/fd");
    if (NULL == fds_dir) {
        errnum = errno;
        assert(errnum != 0);
        return errnum;
    }
    /* The stream's own descriptor is released by closedir(), not by us. */
    int const dir_fd = dirfd(fds_dir);

    /* Pass 1: count the descriptors that will have to be remembered. */
    for (;;) {
        errno = 0;
        struct dirent *const entry = readdir(fds_dir);
        if (NULL == entry) {
            /* NULL with errno still 0 means end of directory. */
            errnum = errno;
            if (errnum != 0) {
                goto close_fds_dir;
            }
            break;
        }
        int fd;
        if (!parse_fd_name(entry->d_name, &fd) || fd == dir_fd) {
            continue;
        }
        ++capacity;
    }

    rewinddir(fds_dir);

    fds = calloc(capacity, sizeof fds[0]);
    if (capacity != 0U && NULL == fds) {
        errnum = errno;
        assert(errnum != 0);
        goto close_fds_dir;
    }

    /* Pass 2: collect the numbers, never writing past what pass 1 counted. */
    while (count < capacity) {
        errno = 0;
        struct dirent *const entry = readdir(fds_dir);
        if (NULL == entry) {
            /*
             * Either a read error (errno set) or the directory turned out
             * shorter than on the first pass; in the latter case just work
             * with the entries collected so far.
             */
            errnum = errno;
            break;
        }
        int fd;
        if (!parse_fd_name(entry->d_name, &fd) || fd == dir_fd) {
            continue;
        }
        fds[count] = fd;
        ++count;
    }

close_fds_dir:
    if (-1 == closedir(fds_dir)) {
        int close_errnum = errno;
        assert(close_errnum != 0);
        assert(close_errnum != EBADF);
        /* Keep the earliest error if there already was one. */
        if (0 == errnum) {
            errnum = close_errnum;
        }
    }

    if (0 == errnum) {
        for (size_t ii = 0U; ii < count; ++ii) {
            int fd = fds[ii];
            switch (fd) {
            case STDIN_FILENO:
            case STDOUT_FILENO:
            case STDERR_FILENO:
                /* Standard streams stay open. */
                break;
            default:
                /* Best effort: a failing close() is not reported. */
                close(fd);
                break;
            }
        }
    }

    free(fds);
    return errnum;
}