From 9bf225d193fa007cb66becf4a1324f9932aaba20 Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Mon, 28 Jan 2019 11:26:57 +0900 Subject: [PATCH] mckernel overlay: replace mcoverlayfs with a soft userspace overlay mcoverlayfs has a high maintenance burden and does not work on rhel8's 4.18 kernel (while it works on vanilla 4.18...); instead of debugging this further time is better spent making it independent from overlayfs. Change-Id: I7454ae95b0fbb3373c256aa2fd83cdfec466c009 --- configure | 2 +- configure.ac | 2 +- executer/user/arch/arm64/Makefile.in | 7 +- executer/user/arch/arm64/arch_syscall.c | 7 - executer/user/arch/x86_64/Makefile.in | 7 +- executer/user/arch/x86_64/arch_syscall.c | 63 -- executer/user/mcexec.c | 1008 ++++++++++++++-------- lib/include/list.h | 2 + 8 files changed, 673 insertions(+), 425 deletions(-) delete mode 100644 executer/user/arch/arm64/arch_syscall.c delete mode 100644 executer/user/arch/x86_64/arch_syscall.c diff --git a/configure b/configure index a82b8b27..428a891d 100755 --- a/configure +++ b/configure @@ -3766,7 +3766,7 @@ fi if test "${enable_mcoverlayfs+set}" = set; then : enableval=$enable_mcoverlayfs; ENABLE_MCOVERLAYFS=$enableval else - ENABLE_MCOVERLAYFS=yes + ENABLE_MCOVERLAYFS=no fi diff --git a/configure.ac b/configure.ac index 36fb8065..d5cdb069 100644 --- a/configure.ac +++ b/configure.ac @@ -193,7 +193,7 @@ AC_ARG_ENABLE([mcoverlayfs], AC_HELP_STRING([--enable-mcoverlayfs], [enable mcoverlayfs implementation]), [ENABLE_MCOVERLAYFS=$enableval], - [ENABLE_MCOVERLAYFS=yes]) + [ENABLE_MCOVERLAYFS=no]) AC_ARG_ENABLE([rusage], AC_HELP_STRING([--enable-rusage], diff --git a/executer/user/arch/arm64/Makefile.in b/executer/user/arch/arm64/Makefile.in index 61143789..8e151194 100644 --- a/executer/user/arch/arm64/Makefile.in +++ b/executer/user/arch/arm64/Makefile.in @@ -10,15 +10,12 @@ LIBS=@LIBS@ all: $(TARGET) -../../libmcexec.a: archdep.o arch_syscall.o - $(AR) cr ../../libmcexec.a archdep.o arch_syscall.o 
+../../libmcexec.a: archdep.o + $(AR) cr ../../libmcexec.a archdep.o archdep.o: archdep.c archdep.S $(CC) -c -I${KDIR} $(CFLAGS) $(EXTRA_CFLAGS) -fPIE -pie -pthread $^ -arch_syscall.o: arch_syscall.c - $(CC) -c -I${KDIR} $(CFLAGS) $(EXTRA_CFLAGS) -fPIE -pie -pthread $< - clean: $(RM) $(TARGET) *.o diff --git a/executer/user/arch/arm64/arch_syscall.c b/executer/user/arch/arm64/arch_syscall.c deleted file mode 100644 index 8aefe8a5..00000000 --- a/executer/user/arch/arm64/arch_syscall.c +++ /dev/null @@ -1,7 +0,0 @@ -struct syscall_wait_desc; - -int -archdep_syscall(struct syscall_wait_desc *w, long *ret) -{ - return -1; -} diff --git a/executer/user/arch/x86_64/Makefile.in b/executer/user/arch/x86_64/Makefile.in index b08dff18..453bd5de 100644 --- a/executer/user/arch/x86_64/Makefile.in +++ b/executer/user/arch/x86_64/Makefile.in @@ -10,15 +10,12 @@ LIBS=@LIBS@ all: $(TARGET) -../../libmcexec.a: archdep.o arch_syscall.o - $(AR) cr ../../libmcexec.a archdep.o arch_syscall.o +../../libmcexec.a: archdep.o + $(AR) cr ../../libmcexec.a archdep.o archdep.o: archdep.S $(CC) -c -I${KDIR} $(CFLAGS) $(EXTRA_CFLAGS) -fPIE -pie -pthread $< -arch_syscall.o: arch_syscall.c - $(CC) -c -I${KDIR} $(CFLAGS) $(EXTRA_CFLAGS) -fPIE -pie -pthread $< - ../../libsyscall_intercept_arch.a: archdep_c.o $(AR) cr ../../libsyscall_intercept_arch.a archdep_c.o diff --git a/executer/user/arch/x86_64/arch_syscall.c b/executer/user/arch/x86_64/arch_syscall.c deleted file mode 100644 index bae50d17..00000000 --- a/executer/user/arch/x86_64/arch_syscall.c +++ /dev/null @@ -1,63 +0,0 @@ -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "../../../include/uprotocol.h" -#include "../../archdep.h" - -//#define DEBUG -#ifndef DEBUG -#define __dprint(msg, ...) -#define __dprintf(arg, ...) -#define __eprint(msg, ...) -#define __eprintf(format, ...) -#else -#define __dprint(msg, ...) 
{printf("%s: " msg, __FUNCTION__);fflush(stdout);} -#define __dprintf(format, ...) {printf("%s: " format, __FUNCTION__, \ - __VA_ARGS__);fflush(stdout);} -#define __eprint(msg, ...) {fprintf(stderr, "%s: " msg, __FUNCTION__);\ - fflush(stderr);} -#define __eprintf(format, ...) {fprintf(stderr, "%s: " format, __FUNCTION__, \ - __VA_ARGS__);fflush(stderr);} -#endif - -extern char *chgpath(char *, char *); -extern long do_strncpy_from_user(int, void *, void *, unsigned long); -extern int fd; - -#define SET_ERR(ret) if (ret == -1) ret = -errno - -int -archdep_syscall(struct syscall_wait_desc *w, long *ret) -{ - char *fn; - char pathbuf[PATH_MAX]; - char tmpbuf[PATH_MAX]; - - switch (w->sr.number) { - case __NR_open: - *ret = do_strncpy_from_user(fd, pathbuf, - (void *)w->sr.args[0], PATH_MAX); - if (*ret >= PATH_MAX) { - *ret = -ENAMETOOLONG; - } - if (*ret < 0) { - return 0; - } - __dprintf("open: %s\n", pathbuf); - - fn = chgpath(pathbuf, tmpbuf); - - *ret = open(fn, w->sr.args[1], w->sr.args[2]); - SET_ERR(*ret); - return 0; - } - return -1; -} diff --git a/executer/user/mcexec.c b/executer/user/mcexec.c index ae622350..9aa252a2 100644 --- a/executer/user/mcexec.c +++ b/executer/user/mcexec.c @@ -88,6 +88,8 @@ #include "../include/qlmpi.h" #include #include +#include +#include "../../lib/include/list.h" //#define DEBUG #define ADD_ENVS_OPTION @@ -1046,8 +1048,16 @@ struct thread_data_s { } *thread_data; int ncpu; +int nnodes; +void *numa_nodes; +size_t cpu_set_size; int n_threads; +static inline cpu_set_t *numa_node_set(int n) +{ + return (cpu_set_t *)(numa_nodes + n * cpu_set_size); +} + pid_t master_tid; pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; @@ -2040,6 +2050,8 @@ static int get_thp_disable(void) return ret; } +pthread_spinlock_t overlay_fd_lock; + int main(int argc, char **argv) { int ret = 0; @@ -2244,6 +2256,8 @@ int main(int argc, char **argv) } #endif + pthread_spin_init(&overlay_fd_lock, 0); + ld_preload_init(); #ifdef ADD_ENVS_OPTION @@ -2467,10 
+2481,36 @@ int main(int argc, char **argv) __dprintf("desc->rlimit[MCK_RLIMIT_STACK]=%ld,%ld\n", desc->rlimit[MCK_RLIMIT_STACK].rlim_cur, desc->rlimit[MCK_RLIMIT_STACK].rlim_max); ncpu = ioctl(fd, MCEXEC_UP_GET_CPU, 0); - if(ncpu == -1){ + if (ncpu <= 0) { fprintf(stderr, "No CPU found.\n"); return 1; } + nnodes = ioctl(fd, MCEXEC_UP_GET_NODES, 0); + if (nnodes <= 0) { + fprintf(stderr, "No numa node found.\n"); + return 1; + } + cpu_set_size = CPU_ALLOC_SIZE(ncpu); + numa_nodes = malloc(cpu_set_size * nnodes); + if (!numa_nodes) { + fprintf(stderr, "Error allocating nodes cpu sets\n"); + return 1; + } + for (i = 0; i < nnodes; i++) { + cpu_set_t *node = numa_node_set(i); + int j; + struct stat sb; + char buf[PATH_MAX]; + + CPU_ZERO_S(cpu_set_size, node); + for (j = 0; j < ncpu; j++) { + snprintf(buf, PATH_MAX, + "/sys/class/mcos/mcos0/sys/devices/system/node/node%d/cpu%d", + i, j); + if (stat(buf, &sb) == 0) + CPU_SET_S(j, cpu_set_size, node); + } + } if (nr_processes > ncpu) { fprintf(stderr, "error: nr_processes can't exceed nr. of CPUs\n"); @@ -2810,111 +2850,6 @@ do_generic_syscall( ret = -errno; } - /* Overlayfs /sys/X directory lseek() problem work around */ - if (w->sr.number == __NR_lseek && ret == -EINVAL) { - char proc_path[PATH_MAX]; - char path[PATH_MAX]; - struct stat sb; - int len; - - sprintf(proc_path, "/proc/self/fd/%d", (int)w->sr.args[0]); - - /* Get filename */ - if ((len = readlink(proc_path, path, sizeof(path))) < 0) { - fprintf(stderr, "%s: error: readlink() failed for %s\n", - __FUNCTION__, proc_path); - perror(": "); - goto out; - } - - path[len] = 0; - - /* Not in /sys? */ - if (strncmp(path, "/sys/", 5)) - goto out; - - /* Stat */ - if (stat(path, &sb) < 0) { - fprintf(stderr, "%s: error stat() failed for %s\n", - __FUNCTION__, path); - goto out; - } - - /* Not dir? 
*/ - if ((sb.st_mode & S_IFMT) != S_IFDIR) - goto out; - - ret = 0; - } - /* Fake that nodeX in /sys/devices/system/node do not exist, - * where X >= number of LWK NUMA nodes */ -#ifdef POSTK_DEBUG_ARCH_DEP_55 -# ifdef __aarch64__ -# define __nr_getdents __NR_getdents64 -# else -# define __nr_getdents __NR_getdents -# endif - else if (w->sr.number == __nr_getdents && ret > 0) { -#else /*POSTK_DEBUG_ARCH_DEP_55*/ - else if (w->sr.number == __NR_getdents && ret > 0) { -#endif /*POSTK_DEBUG_ARCH_DEP_55*/ - struct linux_dirent { - long d_ino; - off_t d_off; - unsigned short d_reclen; - char d_name[]; - }; - struct linux_dirent *d; - char *buf = (char *)w->sr.args[1]; - int bpos = 0; - int nodes,len; - char proc_path[PATH_MAX]; - char path[PATH_MAX]; - - sprintf(proc_path, "/proc/self/fd/%d", (int)w->sr.args[0]); - - /* Get filename */ - len = readlink(proc_path, path, sizeof(path)); - if (len < 0 || len >= sizeof(path)) { - fprintf(stderr, "%s: error: readlink() failed for %s\n", - __FUNCTION__, proc_path); - goto out; - } - path[len] = 0; - - /* Not /sys/devices/system/node ? 
*/ - if (strcmp(path, "/sys/devices/system/node")) - goto out; - - nodes = ioctl(fd, MCEXEC_UP_GET_NODES, 0); - if (nodes == -1) { - goto out; - } - - d = (struct linux_dirent *) (buf + bpos); - for (bpos = 0; bpos < ret; ) { - int nodeid, tmp_reclen; - d = (struct linux_dirent *) (buf + bpos); - - if (sscanf(d->d_name, "node%d", &nodeid) != 1) { - bpos += d->d_reclen; - continue; - } - - if (nodeid >= nodes) { - tmp_reclen = d->d_reclen; - memmove(buf + bpos, - buf + bpos + tmp_reclen, - ret - bpos - tmp_reclen); - ret -= tmp_reclen; - continue; - } - - bpos += d->d_reclen; - } - } - -out: __dprintf("do_generic_syscall(%ld):%ld (%#lx)\n", w->sr.number, ret, ret); return ret; } @@ -3132,96 +3067,501 @@ int close_cloexec_fds(int mcos_fd) return 0; } -void chgdevpath(char *in, char *buf) +struct overlay_fd { + int fd; /* associated fd, points to mckernel side */ + int linux_fd; /* linux fd, -1 if not opened */ + struct list_head link; + char path[PATH_MAX]; /* linux path */ + size_t pathlen; + void *dirents; /* copy of mckernel dirents to filter duplicates */ + size_t dirents_size; +}; +LIST_HEAD(overlay_fd_list); + +void overlay_addfd(int fd, const char *path) { - if(!strcmp(in, "/dev/xpmem")){ - sprintf(in, "/dev/null"); + struct overlay_fd *ofd; + int n; + char mcos[32], *real_path; + const char *prefix = ""; + + if (strncmp(path, "/proc/", 6) == 0) + prefix = "/proc"; + else if (strncmp(path, "/sys/", 5) != 0) + return; + + n = snprintf(mcos, 32, "mcos%d", mcosid); + real_path = strstr(path, mcos); + if (!real_path) + return; + + /* point to first character after mcos string */ + real_path += n; + + ofd = malloc(sizeof(*ofd)); + if (!ofd) { + fprintf(stderr, "%s: out of memory\n", __func__); + return; } + + ofd->fd = fd; + ofd->linux_fd = -1; + ofd->dirents = NULL; + ofd->dirents_size = 0; + ofd->pathlen = snprintf(ofd->path, PATH_MAX, "%s%s", prefix, real_path); + + pthread_spin_lock(&overlay_fd_lock); + list_add(&ofd->link, &overlay_fd_list); + 
pthread_spin_unlock(&overlay_fd_lock); } -char * -chgpath(char *in, char *buf) +void overlay_delfd(int fd) { - chgdevpath(in, buf); + struct overlay_fd *ofd; + + pthread_spin_lock(&overlay_fd_lock); + list_for_each_entry(ofd, &overlay_fd_list, link) { + if (ofd->fd == fd) { + list_del(&ofd->link); + if (ofd->linux_fd != -1) + close(ofd->linux_fd); + free(ofd->dirents); + free(ofd); + break; + } + } + pthread_spin_unlock(&overlay_fd_lock); +} + +/* List of blacklisted paths + * + * Since we abuse sscanf, there are a few constraints: + * - scanf cannot be used to differentiate strings with no pattern, + * so the last character has to be a pattern. If it is not a number, + * it is compared by hand. + * - always make previous patterns ignore patterns (%*..) + * - symlinks can be assumed to be resolved previously + */ + +struct overlay_blacklist_entry { + char *pattern; + int cpuid; + int nodeid; + char lastchar; +} overlay_blacklists[] = { + { "/sys/devices/system/cpu/cpu%d", 0, -1, -1 }, + { "/sys/devices/system/cpu/cpu%d/node%d", 0, 1, -1 }, + { "/sys/bus/cpu/devices/cpu%d", 0, -1, -1 }, + { "/sys/bus/cpu/drivers/processor/cpu%d", 0, -1, -1 }, + { "/sys/devices/system/node/node%d", -1, 0, -1 }, + { "/sys/devices/system/node/node%d/cpu%d", 1, 0, -1 }, + { "/sys/devices/system/node/node%d/memor%c", -1, -1, 'y' }, + { "/sys/bus/node/devices/node%d", -1, 0, -1 }, + { "/sys/devices/system/node/has%c", -1, -1, '_' }, + { "/sys/fs/cgrou%c", -1, -1, 'p' }, + { "/sys/devices/pci%*[^/]/%*[^/]/local_cpu%c", -1, -1, 's' }, + { NULL, 0, 0 }, +}; + +int overlay_blacklist(const char *path) +{ + int ids[3]; + struct overlay_blacklist_entry *entry; + int rc; + + if (strncmp(path, "/sys/", 5)) + return 0; + + for (entry = overlay_blacklists; entry->pattern; entry++) { + memset(ids, 0, sizeof(ids)); + rc = sscanf(path, entry->pattern, ids, ids + 1, ids + 2); + if (rc < (entry->cpuid != -1 ? 1 : 0) + + (entry->nodeid != -1 ? 1 : 0) + + (entry->lastchar != (char)-1 ? 
1 : 0)) + continue; + if (entry->lastchar != (char)-1 && ids[rc - 1] != entry->lastchar) + continue; + if (entry->cpuid == -1 && entry->nodeid == -1) + return -ENOENT; + if (entry->cpuid != -1 && ids[entry->cpuid] >= ncpu) + return -ENOENT; + if (entry->nodeid != -1 && ids[entry->nodeid] >= nnodes) + return -ENOENT; + if (entry->cpuid != -1 && entry->nodeid != -1 && + !CPU_ISSET_S(ids[entry->cpuid], cpu_set_size, + numa_node_set(ids[entry->nodeid]))) + return -ENOENT; + } + + return 0; +} + +/* Fixup paths that need to point to mckernel files + * dirfd/in are openat/fstatat/faccessat arguments, + * buf is a buffer we can dirty assumed to be PATH_MAX long + * returns path to use *with dirfd* if it was provided. + */ +const char * +overlay_path(int dirfd, const char *in, char *buf) +{ + const char *path = in; + char *linkpath, *tmppath; + char tmpbuf[PATH_MAX], tmpbuf2[PATH_MAX]; + + struct stat sb; + ssize_t n; + int rc; + + __dprintf("considering fd %d path %s\n", dirfd, in); + + if (dirfd != AT_FDCWD && in[0] != '/') { + snprintf(buf, PATH_MAX, "/proc/self/fd/%d", dirfd); + + n = readlink(buf, tmpbuf, PATH_MAX); + if (n == PATH_MAX || n < 0) { + if (n == PATH_MAX) + errno = ENAMETOOLONG; + fprintf(stderr, + "%s: readlink /proc/self/fd/%d failed: %d\n", + __func__, dirfd, errno); + return in; + } + tmpbuf[n] = 0; + + if (n > 0 && tmpbuf[n-1] == '/') + n--; + + n += snprintf(tmpbuf + n, PATH_MAX - n, "/%s", in); + if (n >= PATH_MAX) { + fprintf(stderr, "%s: %s truncated\n", + __func__, tmpbuf); + return in; + } + + path = tmpbuf; + } else if (in[0] != '/') { + path = getcwd(tmpbuf, PATH_MAX); + if (path == NULL) { + fprintf(stderr, "%s: could not getcwd(): %d\n", + __func__, errno); + return in; + } + + n = strlen(tmpbuf); + if (n > 0 && tmpbuf[n-1] == '/') + n--; + + n += snprintf(tmpbuf + n, PATH_MAX - n, "/%s", in); + if (n >= PATH_MAX) { + fprintf(stderr, "%s: %s truncated\n", + __func__, tmpbuf); + return in; + } + + path = tmpbuf; + } + + __dprintf("glued to 
%s\n", path); + + if (!strcmp(path, "/dev/xpmem")) + return "/dev/null"; + + if (!strncmp(path, "/proc/self", 10) && + (path[10] == '/' || path[10] == '\0')) { + n = snprintf(buf, PATH_MAX, "/proc/mcos%d/%d%s", + mcosid, getpid(), path + 10); + goto checkexist; + } + + if (!strncmp(path, "/proc", 5) && + (path[5] == '/' || path[5] == '\0')) { + n = snprintf(buf, PATH_MAX, "/proc/mcos%d%s", + mcosid, path + 5); + goto checkexist; + } + + if (!strncmp(path, "/sys", 4) && + (path[4] == '/' || path[4] == '\0')) { + goto checkexist_resolvelinks; + } -#ifdef ENABLE_MCOVERLAYFS return in; -#endif // ENABLE_MCOVERLAYFS - char *fn = in; - struct stat sb; - if (!strncmp(fn, "/proc/self/", 11)){ - sprintf(buf, "/proc/mcos%d/%d/%s", mcosid, getpid(), fn + 11); - fn = buf; +checkexist_resolvelinks: + /* now, for the fun part: since /sys is full of symlinks, we need + * to check every single component of that path for links + * (in the real path!) and consider the final destination + */ + if (path != tmpbuf) { + strcpy(tmpbuf, path); + path = tmpbuf; } - else if(!strncmp(fn, "/proc/", 6)){ - sprintf(buf, "/proc/mcos%d/%s", mcosid, fn + 6); - fn = buf; - } - else if(!strcmp(fn, "/sys/devices/system/cpu/online")){ - fn = "/admin/fs/attached/files/sys/devices/system/cpu/online"; - } - else - return in; + linkpath = tmpbuf; + while ((linkpath = strchr(linkpath + 1, '/'))) { + linkpath[0] = 0; + rc = lstat(tmpbuf, &sb); - if(stat(fn, &sb) == -1) + /* Could not exist on linux - no more links */ + if (rc == -1) { + linkpath[0] = '/'; + break; + } + + if (S_ISLNK(sb.st_mode)) { + n = readlink(tmpbuf, buf, PATH_MAX); + if (n >= PATH_MAX || n < 0) + return in; + buf[n] = 0; + + if (buf[0] == '/') { + /* cannot snprintf from same source and dest */ + n = snprintf(tmpbuf2, PATH_MAX, "%s/%s", buf, + linkpath); + if (n >= PATH_MAX) + return in; + strcpy(tmpbuf, tmpbuf2); + linkpath = tmpbuf; + } else { + strcpy(tmpbuf2, linkpath + 1); + + /* remove link component from path */ + linkpath = 
strrchr(tmpbuf, '/'); + if (linkpath != tmpbuf) + linkpath[0] = 0; + else + linkpath[1] = 0; + + /* go back as many / as there are .. + * otherwise kernel would need intermediate + * directories to exist on mckernel side */ + tmppath = buf; + while (!strncmp(tmppath, "../", 3)) { + linkpath = strrchr(tmpbuf, '/'); + if (!linkpath) // should never happen + return in; + if (linkpath != tmpbuf) + linkpath[0] = 0; + tmppath += 3; + } + n = linkpath - tmpbuf; + n += snprintf(linkpath, PATH_MAX - n, + "/%s/%s", tmppath, tmpbuf2); + if (n >= PATH_MAX) + return in; + } + } + linkpath[0] = '/'; + linkpath++; + } + + n = snprintf(buf, PATH_MAX, "/sys/devices/virtual/mcos/mcos%d", + mcosid); + tmppath = buf + n; + n += snprintf(buf + n, PATH_MAX - n, "/sys/%s", path + 5); + path = tmppath; + +checkexist: + if (n >= PATH_MAX) { + fprintf(stderr, "%s: %s truncated\n", __func__, buf); return in; - return fn; + } + + while ((tmppath = strstr(buf, "//"))) { + memmove(tmppath, tmppath + 1, PATH_MAX - (tmppath + 1 - buf)); + n--; + } + while (n > 0 && buf[n-1] == '/') { + buf[n-1] = 0; + n--; + } + + rc = stat(buf, &sb); + __dprintf("trying %s: %d\n", buf, rc == -1 ? 
errno : 0); + if (rc == -1 && errno == ENOENT) { + if (overlay_blacklist(path)) { + __dprintf("blacklisted %s\n", path); + return "/nonexisting"; + } + return in; + } + + return buf; } -#ifdef POSTK_DEBUG_ARCH_DEP_72 /* add __NR_newfstat */ -static int -syscall_pathname(int dirfd, char *pathname, size_t size) +struct linux_dirent { + unsigned long d_ino; /* Inode number */ + unsigned long d_off; /* Offset to next linux_dirent */ + unsigned short d_reclen; /* Length of this linux_dirent */ + char d_name[]; /* Filename (null-terminated) */ + /* length is actually (d_reclen - 2 - + * offsetof(struct linux_dirent, d_name)) */ +/* char pad; // Zero padding byte + * char d_type; // File type (since linux 2.6.4) at reclen-1 + */ +}; +struct linux_dirent64 { + ino64_t d_ino; /* 64-bit inode number */ + off64_t d_off; /* 64-bit offset to next structure */ + unsigned short d_reclen; /* Size of this dirent */ + unsigned char d_type; /* File type */ + char d_name[]; /* Filename (null-terminated) */ +}; + +static inline unsigned short dirent_reclen(int sysnum, void *_dirp) { - int ret = 0; - char *tempbuf = NULL; - size_t tempbuf_size; - if (pathname[0] == '/') { - goto out; - } +#ifdef __NR_getdents + if (sysnum == __NR_getdents) { + struct linux_dirent *dirp = _dirp; - if (dirfd != AT_FDCWD) { - int len; - char dfdpath[64]; - snprintf(dfdpath, sizeof(dfdpath), "/proc/self/fd/%d", dirfd); - - tempbuf_size = size; - tempbuf = malloc(tempbuf_size); - if (tempbuf == NULL) { - ret = -ENOMEM; - goto out; + return dirp->d_reclen; } +#endif + if (sysnum == __NR_getdents64) { + struct linux_dirent64 *dirp = _dirp; - ret = readlink(dfdpath, tempbuf, tempbuf_size); - if (ret == -1) { + return dirp->d_reclen; + } + fprintf(stderr, "%s: unexpected syscall number %d\n", + __func__, sysnum); + exit(-1); +} + +static inline char *dirent_name(int sysnum, void *_dirp) +{ + +#ifdef __NR_getdents + if (sysnum == __NR_getdents) { + struct linux_dirent *dirp = _dirp; + + return dirp->d_name; + } 
+#endif + if (sysnum == __NR_getdents64) { + struct linux_dirent64 *dirp = _dirp; + + return dirp->d_name; + } + fprintf(stderr, "%s: unexpected syscall number %d\n", + __func__, sysnum); + exit(-1); +} + +int overlay_getdents(int sysnum, int fd, void *_dirp, unsigned int count) +{ + void *dirp, *mcdirp; + int ret = 0, pos, linux_ret, mcpos; + unsigned short reclen; + struct overlay_fd *ofd = NULL, *ofd_iter; + + pthread_spin_lock(&overlay_fd_lock); + list_for_each_entry(ofd_iter, &overlay_fd_list, link) { + if (ofd_iter->fd == fd) { + ofd = ofd_iter; + break; + } + } + pthread_spin_unlock(&overlay_fd_lock); + + /* not a directory we overlay, or not there yet */ + if (ofd == NULL || ofd->linux_fd == -1) { + ret = syscall(sysnum, fd, _dirp, count); + if (ret == -1) ret = -errno; - goto out; - } - - len = strlen(pathname); - if (tempbuf_size <= ret + 1 + len + 1) { - ret = -ENAMETOOLONG; - goto out; - } - tempbuf[ret] = '/'; - strncpy(&tempbuf[ret+1], pathname, len+1); - - strcpy(pathname, tempbuf); } -out: - if (tempbuf) { - free(tempbuf); + if (ofd == NULL || ret < 0) + return ret; + + /* copy mckernel dirents to our buffer, in case of split getdents */ + if (ret > 0) { + void *newbuf = realloc(ofd->dirents, ofd->dirents_size + ret); + + if (!newbuf) { + fprintf(stderr, "%s: not enough memory (%zd)", + __func__, ofd->dirents_size + ret); + return ret; + } + ofd->dirents = newbuf; + memcpy(ofd->dirents + ofd->dirents_size, _dirp, ret); + ofd->dirents_size += ret; } + + /* return first directory result unless it is empty or there + * is obvious room for more elements. 
+ * The second check could have false positives depending on + * the fs, but should not be for filesystems we overlay + */ + if (ret > 0 && count - ret < 500) + return ret; + + if (ofd->linux_fd == -1) { + ofd->linux_fd = open(ofd->path, O_RDONLY|O_DIRECTORY); + if (ofd->linux_fd < 0) { + if (errno != ENOENT) { + fprintf(stderr, "%s: could not open %s: %d\n", + __func__, ofd->path, errno); + } + return ret; + } + } + +again: + linux_ret = syscall(sysnum, ofd->linux_fd, _dirp + ret, count - ret); + if (linux_ret < 0) { + fprintf(stderr, "%s: linux getdents failed: %d\n", + __func__, errno); + return ret; + } + if (linux_ret == 0) + return ret; + + for (pos = ret; pos < ret + linux_ret;) { + dirp = _dirp + pos; + reclen = dirent_reclen(sysnum, dirp); + snprintf(ofd->path + ofd->pathlen, PATH_MAX - ofd->pathlen, + "/%s", dirent_name(sysnum, dirp)); + /* remove blacklist */ + if (overlay_blacklist(ofd->path)) { + __dprintf("blacklisted %s\n", ofd->path); + memmove(_dirp + pos, + _dirp + pos + reclen, + ret + linux_ret - pos - reclen); + linux_ret -= reclen; + continue; + } + /* remove duplicates */ + for (mcpos = 0; mcpos < ofd->dirents_size;) { + mcdirp = ofd->dirents + mcpos; + if (!strcmp(dirent_name(sysnum, mcdirp), + dirent_name(sysnum, dirp))) { + memmove(_dirp + pos, + _dirp + pos + reclen, + ret + linux_ret - pos - reclen); + linux_ret -= reclen; + break; + } + mcpos += dirent_reclen(sysnum, mcdirp); + } + if (mcpos >= ofd->dirents_size) + pos += reclen; + } + + ret += linux_ret; + + /* It's possible we filtered everything out, but there is more + * available. Keep trying! 
+ */ + if (linux_ret == 0 || count - ret > 500) + goto again; + return ret; } -#endif /*POSTK_DEBUG_ARCH_DEP_72*/ int main_loop(struct thread_data_s *my_thread) { struct syscall_wait_desc w; long ret; - char *fn; + const char *fn; int sig; int term; struct timespec tv; @@ -3250,10 +3590,6 @@ int main_loop(struct thread_data_s *my_thread) switch (w.sr.number) { case __NR_openat: - /* initialize buffer */ - memset(tmpbuf, '\0', sizeof(tmpbuf)); - memset(pathbuf, '\0', sizeof(pathbuf)); - /* check argument 1 dirfd */ ret = do_strncpy_from_user(fd, pathbuf, (void *)w.sr.args[1], @@ -3266,52 +3602,18 @@ int main_loop(struct thread_data_s *my_thread) do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); break; } + pathbuf[ret] = 0; + __dprintf("openat: %d, %s,tid=%d\n", (int)w.sr.args[0], + pathbuf, my_thread->remote_tid); - if ((int)w.sr.args[0] != AT_FDCWD && - pathbuf[0] != '/') { - /* dirfd != AT_FDCWD */ - __dprintf("openat(dirfd != AT_FDCWD)\n"); - snprintf(tmpbuf, sizeof(tmpbuf), - "/proc/self/fd/%d", (int)w.sr.args[0]); - ret = readlink(tmpbuf, pathbuf, - sizeof(pathbuf) - 1); - if (ret == -1 && - (errno == ENOENT || - errno == EINVAL)) { - do_syscall_return(fd, cpu, -EBADF, 0, 0, - 0, 0); - break; - } - if (ret < 0) { - do_syscall_return(fd, cpu, -errno, 0, 0, - 0, 0); - break; - } - __dprintf(" %s -> %s\n", tmpbuf, pathbuf); - ret = do_strncpy_from_user(fd, tmpbuf, - (void *)w.sr.args[1], - PATH_MAX); - if (ret >= PATH_MAX) { - ret = -ENAMETOOLONG; - } - if (ret < 0) { - do_syscall_return(fd, cpu, ret, 0, 0, 0, - 0); - break; - } - strncat(pathbuf, "/", - sizeof(pathbuf) - strlen(pathbuf) - 1); - strncat(pathbuf, tmpbuf, - sizeof(pathbuf) - strlen(pathbuf) - 1); - } - else { - } - __dprintf("openat: %s,tid=%d\n", pathbuf, my_thread->remote_tid); + fn = overlay_path((int)w.sr.args[0], pathbuf, tmpbuf); - fn = chgpath(pathbuf, tmpbuf); - - ret = open(fn, w.sr.args[2], w.sr.args[3]); + ret = openat(w.sr.args[0], fn, w.sr.args[2], + w.sr.args[3]); SET_ERR(ret); + if (ret >= 
0 && fn == tmpbuf) + overlay_addfd(ret, fn); + do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); break; @@ -3904,63 +4206,38 @@ return_execve2: break; case __NR_close: - if(w.sr.args[0] == fd) + if (w.sr.args[0] == fd) ret = -EBADF; else ret = do_generic_syscall(&w); + overlay_delfd(w.sr.args[0]); do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); break; -#ifdef POSTK_DEBUG_ARCH_DEP_36 -#ifdef __aarch64__ + case __NR_readlinkat: - /* initialize buffer */ - memset(tmpbuf, '\0', sizeof(tmpbuf)); - memset(pathbuf, '\0', sizeof(pathbuf)); - /* check argument 1 dirfd */ - if ((int)w.sr.args[0] != AT_FDCWD) { - /* dirfd != AT_FDCWD */ - __dprintf("readlinkat(dirfd != AT_FDCWD)\n"); - snprintf(tmpbuf, sizeof(tmpbuf), "/proc/self/fd/%d", (int)w.sr.args[0]); - ret = readlink(tmpbuf, pathbuf, sizeof(pathbuf) - 1); - if (ret < 0) { - do_syscall_return(fd, cpu, -errno, 0, 0, 0, 0); - break; - } - __dprintf(" %s -> %s\n", tmpbuf, pathbuf); - ret = do_strncpy_from_user(fd, tmpbuf, (void *)w.sr.args[1], PATH_MAX); - if (ret >= PATH_MAX) { - ret = -ENAMETOOLONG; - } - if (ret < 0) { - do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); - break; - } - strncat(pathbuf, "/", 1); - strncat(pathbuf, tmpbuf, strlen(tmpbuf) + 1); - } else { - /* dirfd == AT_FDCWD */ - __dprintf("readlinkat(dirfd == AT_FDCWD)\n"); - ret = do_strncpy_from_user(fd, pathbuf, (void *)w.sr.args[1], PATH_MAX); - if (ret >= PATH_MAX) { - ret = -ENAMETOOLONG; - } - if (ret < 0) { - do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); - break; - } + ret = do_strncpy_from_user(fd, pathbuf, + (void *)w.sr.args[1], PATH_MAX); + if (ret >= PATH_MAX) { + ret = -ENAMETOOLONG; } - __dprintf("readlinkat: %s\n", pathbuf); + if (ret < 0) { + do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); + break; + } + pathbuf[ret] = 0; + __dprintf("readlinkat: %d, %s\n", (int)w.sr.args[0], pathbuf); - fn = chgpath(pathbuf, tmpbuf); + fn = overlay_path((int)w.sr.args[0], pathbuf, tmpbuf); - ret = readlink(fn, (char *)w.sr.args[2], w.sr.args[3]); - 
__dprintf("readlinkat: dirfd=%d, path=%s, buf=%s, ret=%ld\n", + ret = readlinkat(w.sr.args[0], fn, (char *)w.sr.args[2], + w.sr.args[3]); + SET_ERR(ret); + __dprintf("readlinkat: dirfd=%d, path=%s, buf=%s, ret=%ld\n", (int)w.sr.args[0], fn, (char *)w.sr.args[2], ret); - SET_ERR(ret); do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); break; -#else /* __aarch64__ */ +#ifdef __NR_readlink case __NR_readlink: ret = do_strncpy_from_user(fd, pathbuf, (void *)w.sr.args[0], PATH_MAX); if (ret >= PATH_MAX) { @@ -3971,42 +4248,17 @@ return_execve2: break; } - fn = chgpath(pathbuf, tmpbuf); + fn = overlay_path(AT_FDCWD, pathbuf, tmpbuf); ret = readlink(fn, (char *)w.sr.args[1], w.sr.args[2]); + SET_ERR(ret); __dprintf("readlink: path=%s, buf=%s, ret=%ld\n", fn, (char *)w.sr.args[1], ret); - SET_ERR(ret); do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); break; -#endif /* __aarch64__ */ -#else /* POSTK_DEBUG_ARCH_DEP_36 */ - case __NR_readlink: - ret = do_strncpy_from_user(fd, pathbuf, (void *)w.sr.args[0], PATH_MAX); - if (ret >= PATH_MAX) { - ret = -ENAMETOOLONG; - } - if (ret < 0) { - do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); - break; - } +#endif /* __NR_readlink */ - fn = chgpath(pathbuf, tmpbuf); - - ret = readlink(fn, (char *)w.sr.args[1], w.sr.args[2]); - __dprintf("readlink: path=%s, buf=%s, ret=%ld\n", - fn, (char *)w.sr.args[1], ret); - SET_ERR(ret); - do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); - break; -#endif /* POSTK_DEBUG_ARCH_DEP_36 */ - -#ifdef POSTK_DEBUG_ARCH_DEP_72 /* add __NR_newfstat */ case __NR_newfstatat: - /* initialize buffer */ - memset(tmpbuf, '\0', sizeof(tmpbuf)); - memset(pathbuf, '\0', sizeof(pathbuf)); - ret = do_strncpy_from_user(fd, pathbuf, (void *)w.sr.args[1], PATH_MAX); if (ret >= PATH_MAX) { ret = -ENAMETOOLONG; @@ -4015,53 +4267,19 @@ return_execve2: do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); break; } + pathbuf[ret] = 0; - if (pathbuf[0] == '\0') { - // empty string - if ((int)w.sr.args[3] & AT_EMPTY_PATH) { - if ((int)w.sr.args[0] == 
AT_FDCWD) { - if (NULL == getcwd(pathbuf, PATH_MAX)) { - do_syscall_return(fd, cpu, -errno, 0, 0, 0, 0); - break; - } - } else { - char dfdpath[64]; - snprintf(dfdpath, sizeof(dfdpath), "/proc/self/fd/%d", (int)w.sr.args[0]); - ret = readlink(dfdpath, pathbuf, PATH_MAX); - if (ret == -1) { - do_syscall_return(fd, cpu, -errno, 0, 0, 0, 0); - break; - } - pathbuf[ret] = '\0'; - } - } - } else if (pathbuf[0] != '/') { - // relative path - ret = syscall_pathname((int)w.sr.args[0], pathbuf, PATH_MAX); - if (ret < 0) { - do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); - break; - } - } - - fn = chgpath(pathbuf, tmpbuf); - if (fn[0] == '/') { - ret = fstatat((int)w.sr.args[0], - fn, - (struct stat*)w.sr.args[2], - (int)w.sr.args[3]); - __dprintf("fstatat: dirfd=%d, pathname=%s, buf=%p, flags=%x, ret=%ld\n", - (int)w.sr.args[0], fn, (void*)w.sr.args[2], (int)w.sr.args[3], ret); - } else { - ret = fstatat((int)w.sr.args[0], - (const char*)w.sr.args[1], - (struct stat*)w.sr.args[2], - (int)w.sr.args[3]); - __dprintf("fstatat: dirfd=%d, pathname=%s, buf=%p, flags=%x, ret=%ld\n", - (int)w.sr.args[0], (char*)w.sr.args[1], (void*)w.sr.args[2], (int)w.sr.args[3], ret); - } + fn = overlay_path((int)w.sr.args[0], pathbuf, tmpbuf); + ret = fstatat((int)w.sr.args[0], + fn, + (struct stat *)w.sr.args[2], + (int)w.sr.args[3]); SET_ERR(ret); + __dprintf("fstatat: dirfd=%d, pathname=%s, buf=%p, flags=%x, ret=%ld\n", + (int)w.sr.args[0], fn, (void *)w.sr.args[2], + (int)w.sr.args[3], ret); + do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); break; #ifdef __NR_stat @@ -4075,17 +4293,47 @@ return_execve2: break; } - fn = chgpath(pathbuf, tmpbuf); + fn = overlay_path(AT_FDCWD, pathbuf, tmpbuf); ret = stat(fn, (struct stat *)w.sr.args[1]); - __dprintf("stat: path=%s, ret=%ld\n", fn, ret); SET_ERR(ret); + __dprintf("stat: path=%s, ret=%ld\n", fn, ret); do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); break; #endif /* __NR_stat */ -#else /* POSTK_DEBUG_ARCH_DEP_72 */ - case __NR_stat: - ret = 
do_strncpy_from_user(fd, pathbuf, (void *)w.sr.args[0], PATH_MAX); + + case __NR_faccessat: + ret = do_strncpy_from_user(fd, pathbuf, + (void *)w.sr.args[1], PATH_MAX); + if (ret >= PATH_MAX) { + ret = -ENAMETOOLONG; + } + if (ret < 0) { + do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); + break; + } + pathbuf[ret] = 0; + + fn = overlay_path((int)w.sr.args[0], pathbuf, tmpbuf); + + /* the syscall doesn't take flags argument, link + * resolution happened first so don't do it again + */ + ret = faccessat((int)w.sr.args[0], fn, + (int)w.sr.args[2], + AT_SYMLINK_NOFOLLOW); + SET_ERR(ret); + __dprintf("faccessat: dirfd=%d, pathname=%s, mode=%d, ret=%ld\n", + (int)w.sr.args[0], fn, (int)w.sr.args[2], + ret); + + do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); + break; + +#ifdef __NR_access + case __NR_access: + ret = do_strncpy_from_user(fd, pathbuf, + (void *)w.sr.args[0], PATH_MAX); if (ret >= PATH_MAX) { ret = -ENAMETOOLONG; } @@ -4094,14 +4342,66 @@ return_execve2: break; } - fn = chgpath(pathbuf, tmpbuf); + fn = overlay_path(AT_FDCWD, pathbuf, tmpbuf); - ret = stat(fn, (struct stat *)w.sr.args[1]); - __dprintf("stat: path=%s, ret=%ld\n", fn, ret); + ret = access(fn, (int)w.sr.args[1]); SET_ERR(ret); + __dprintf("access: path=%s, ret=%ld\n", fn, ret); + do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); + break; +#endif /* __NR_access */ + case __NR_getxattr: + ret = do_strncpy_from_user(fd, pathbuf, + (void *)w.sr.args[0], PATH_MAX); + if (ret >= PATH_MAX) { + ret = -ENAMETOOLONG; + } + if (ret < 0) { + do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); + break; + } + + fn = overlay_path(AT_FDCWD, pathbuf, tmpbuf); + + ret = getxattr(fn, (char *)w.sr.args[1], + (void *)w.sr.args[2], + (size_t)w.sr.args[3]); + SET_ERR(ret); + __dprintf("getxattr: path=%s, name=%s, ret=%ld\n", fn, + (char *)w.sr.args[1], ret); + do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); + break; + case __NR_lgetxattr: + ret = do_strncpy_from_user(fd, pathbuf, + (void *)w.sr.args[0], PATH_MAX); + if (ret >= PATH_MAX) 
{ + ret = -ENAMETOOLONG; + } + if (ret < 0) { + do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); + break; + } + + fn = overlay_path(AT_FDCWD, pathbuf, tmpbuf); + + ret = lgetxattr(fn, (char *)w.sr.args[1], + (void *)w.sr.args[2], + (size_t)w.sr.args[3]); + SET_ERR(ret); + __dprintf("lgetxattr: path=%s, name=%s, ret=%ld\n", fn, + (char *)w.sr.args[1], ret); + do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); + break; +#ifdef __NR_getdents + case __NR_getdents: +#endif + case __NR_getdents64: + ret = overlay_getdents(w.sr.number, + (int)w.sr.args[0], + (struct linux_dirent *)w.sr.args[1], + (unsigned int)w.sr.args[2]); do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); break; -#endif /* POSTK_DEBUG_ARCH_DEP_72 */ case __NR_sched_setaffinity: if (w.sr.args[0] == 0) { @@ -4303,10 +4603,32 @@ return_linux_spawn: break; } - default: - if (archdep_syscall(&w, &ret)) { - ret = do_generic_syscall(&w); +#ifdef __NR_open + case __NR_open: + ret = do_strncpy_from_user(fd, pathbuf, + (void *)w.sr.args[0], PATH_MAX); + if (ret >= PATH_MAX) { + ret = -ENAMETOOLONG; } + if (ret < 0) { + do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); + break; + } + __dprintf("open: %s\n", pathbuf); + + fn = overlay_path(AT_FDCWD, pathbuf, tmpbuf); + + ret = open(fn, w.sr.args[1], w.sr.args[2]); + SET_ERR(ret); + if (ret >= 0 && fn == tmpbuf) + overlay_addfd(ret, fn); + + do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); + break; +#endif + + default: + ret = do_generic_syscall(&w); do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); break; diff --git a/lib/include/list.h b/lib/include/list.h index 39b832d0..152b5744 100644 --- a/lib/include/list.h +++ b/lib/include/list.h @@ -1,7 +1,9 @@ #ifndef _LINUX_LIST_H #define _LINUX_LIST_H +#ifndef offsetof #define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#endif /** * container_of - cast a member of a structure out to the containing structure * @ptr: the pointer to the member.