mcctrl: remove in-kernel calls to syscalls

Since Linux 4.17.0, in-kernel code can no longer call the sys_* syscall
entry points directly, because their calling convention can be different
on x86_64, as explained in this email:
https://lore.kernel.org/lkml/20180325162527.GA17492@light.dominikbrodowski.net

Use the ksys_* alternatives instead when possible; for readlink, use
do_readlinkat (and always go through the readlinkat variant, on every
kernel version, to simplify the ifdefs).

It might be possible to change some of these without ifdefs, but, for
example, ksys_unshare was only introduced in 4.17, so we need to keep
looking up and calling some sys_* functions directly on older kernels.

Change-Id: Ic47e184b29ef8b21731b2eae6193b0af2548b872
This commit is contained in:
Dominique Martinet
2018-11-19 17:27:26 +09:00
parent db4d19e419
commit 583cb94667
3 changed files with 25 additions and 9 deletions

View File

@ -29,6 +29,7 @@
#include <linux/device.h>
#include <linux/delay.h>
#include <linux/kallsyms.h>
#include <linux/version.h>
#include "mcctrl.h"
#include <ihk/ihk_host_user.h>
@ -222,7 +223,7 @@ long (*mcctrl_sched_setaffinity)(pid_t pid, const struct cpumask *in_mask);
int (*mcctrl_sched_setscheduler_nocheck)(struct task_struct *p, int policy,
const struct sched_param *param);
ssize_t (*mcctrl_sys_readlink)(const char *path, char *buf,
ssize_t (*mcctrl_sys_readlinkat)(int dfd, const char *path, char *buf,
size_t bufsiz);
void (*mcctrl_zap_page_range)(struct vm_area_struct *vma,
unsigned long start,
@ -234,29 +235,41 @@ struct inode_operations *mcctrl_hugetlbfs_inode_operations;
static int symbols_init(void)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,17,0)
mcctrl_sys_mount = (void *) kallsyms_lookup_name("ksys_mount");
#else
mcctrl_sys_mount = (void *) kallsyms_lookup_name("sys_mount");
#if defined(CONFIG_X86_64_SMP)
if (!mcctrl_sys_mount)
mcctrl_sys_mount =
(void *) kallsyms_lookup_name("__x64_sys_mount");
#endif
#endif
if (WARN_ON(!mcctrl_sys_mount))
return -EFAULT;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,17,0)
mcctrl_sys_umount = (void *) kallsyms_lookup_name("ksys_umount");
#else
mcctrl_sys_umount = (void *) kallsyms_lookup_name("sys_umount");
#if defined(CONFIG_X86_64_SMP)
if (!mcctrl_sys_umount)
mcctrl_sys_umount =
(void *) kallsyms_lookup_name("__x64_sys_umount");
#endif
#endif
if (WARN_ON(!mcctrl_sys_umount))
return -EFAULT;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,17,0)
mcctrl_sys_unshare = (void *) kallsyms_lookup_name("ksys_unshare");
#else
mcctrl_sys_unshare = (void *) kallsyms_lookup_name("sys_unshare");
#if defined(CONFIG_X86_64_SMP)
if (!mcctrl_sys_unshare)
mcctrl_sys_unshare =
(void *) kallsyms_lookup_name("__x64_sys_unshare");
#endif
#endif
if (WARN_ON(!mcctrl_sys_unshare))
return -EFAULT;
@ -271,14 +284,17 @@ static int symbols_init(void)
if (WARN_ON(!mcctrl_sched_setscheduler_nocheck))
return -EFAULT;
mcctrl_sys_readlink =
(void *) kallsyms_lookup_name("sys_readlink");
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,17,0)
mcctrl_sys_readlinkat = (void *)kallsyms_lookup_name("do_readlinkat");
#else
mcctrl_sys_readlinkat = (void *)kallsyms_lookup_name("sys_readlinkat");
#if defined(CONFIG_X86_64_SMP)
if (!mcctrl_sys_readlink)
mcctrl_sys_readlink =
(void *) kallsyms_lookup_name("__x64_sys_readlink");
if (!mcctrl_sys_readlinkat)
mcctrl_sys_readlinkat =
(void *) kallsyms_lookup_name("__x64_sys_readlinkat");
#endif
if (WARN_ON(!mcctrl_sys_readlink))
#endif
if (WARN_ON(!mcctrl_sys_readlinkat))
return -EFAULT;
mcctrl_zap_page_range =

View File

@ -422,7 +422,7 @@ extern long (*mcctrl_sched_setaffinity)(pid_t pid,
extern int (*mcctrl_sched_setscheduler_nocheck)(struct task_struct *p,
int policy,
const struct sched_param *param);
extern ssize_t (*mcctrl_sys_readlink)(const char *path, char *buf,
extern ssize_t (*mcctrl_sys_readlinkat)(int dfd, const char *path, char *buf,
size_t bufsiz);
extern void (*mcctrl_zap_page_range)(struct vm_area_struct *vma,
unsigned long start,

View File

@ -921,7 +921,7 @@ static int read_link(char *buf, size_t bufsize, char *fmt, ...)
old_fs = get_fs();
set_fs(KERNEL_DS);
ss = mcctrl_sys_readlink(filename, buf, bufsize);
ss = mcctrl_sys_readlinkat(AT_FDCWD, filename, buf, bufsize);
set_fs(old_fs);
if (ss < 0) {
error = ss;