From 9ae5bcf46e20d1b3b210781220a9689623bf0ff1 Mon Sep 17 00:00:00 2001
From: Balazs Gerofi
Date: Mon, 24 Aug 2015 23:41:31 +0200
Subject: [PATCH] gettimeofday(): an implementation based on CPU invariant TSC support

---
 arch/x86/kernel/cpu.c                  |  32 ++++++
 arch/x86/kernel/include/syscall_list.h |   4 +-
 kernel/ap.c                            |   9 +-
 kernel/include/cls.h                   |   5 +
 kernel/include/time.h                  |   2 +
 kernel/init.c                          |  26 +++++
 kernel/syscall.c                       | 170 ++++++++++++++++++++-----
 lib/include/ihk/cpu.h                  |   2 +
 8 files changed, 217 insertions(+), 33 deletions(-)

diff --git a/arch/x86/kernel/cpu.c b/arch/x86/kernel/cpu.c
index 3890217d..7aab0b1b 100644
--- a/arch/x86/kernel/cpu.c
+++ b/arch/x86/kernel/cpu.c
@@ -67,6 +67,7 @@ void assign_processor_id(void);
 void arch_delay(int);
 void x86_set_warm_reset(unsigned long ip, char *first_page_va);
 void x86_init_perfctr(void);
+int gettime_local_support = 0;
 
 extern int kprintf(const char *format, ...);
 
@@ -569,6 +570,29 @@ static void check_no_execute(void)
 	return;
 }
 
+void init_gettime_support(void)
+{
+	uint64_t op;
+	uint64_t eax;
+	uint64_t ebx;
+	uint64_t ecx;
+	uint64_t edx;
+
+	/* Check if Invariant TSC supported.
+	 * Processor’s support for invariant TSC is indicated by
+	 * CPUID.80000007H:EDX[8].
+	 * See page 2498 of the Intel64 and IA-32 Architectures Software
+	 * Developer’s Manual - combined */
+
+	op = 0x80000007;
+	asm volatile("cpuid" : "=a"(eax),"=b"(ebx),"=c"(ecx),"=d"(edx) : "a" (op));
+
+	if (edx & (1 << 8)) {
+		gettime_local_support = 1;
+		kprintf("Invariant TSC supported.\n");
+	}
+}
+
 void init_cpu(void)
 {
 	enable_page_protection_fault();
@@ -595,6 +619,8 @@ void setup_x86(void)
 
 	init_cpu();
 
+	init_gettime_support();
+
 	kprintf("setup_x86 done.\n");
 }
 
@@ -1316,3 +1342,9 @@ ihk_mc_user_context_t *lookup_user_context(struct process *proc)
 
 	return uctx;
 } /* lookup_user_context() */
+
+
+void zero_tsc(void)
+{
+	wrmsr(MSR_IA32_TIME_STAMP_COUNTER, 0);
+}
diff --git a/arch/x86/kernel/include/syscall_list.h b/arch/x86/kernel/include/syscall_list.h
index 5c356af1..943bac82 100644
--- a/arch/x86/kernel/include/syscall_list.h
+++ b/arch/x86/kernel/include/syscall_list.h
@@ -50,7 +50,7 @@ SYSCALL_HANDLED(29, shmget)
 SYSCALL_HANDLED(30, shmat)
 SYSCALL_HANDLED(31, shmctl)
 SYSCALL_HANDLED(34, pause)
-SYSCALL_DELEGATED(35, nanosleep)
+SYSCALL_HANDLED(35, nanosleep)
 SYSCALL_HANDLED(39, getpid)
 SYSCALL_HANDLED(56, clone)
 SYSCALL_DELEGATED(57, fork)
@@ -67,7 +67,7 @@ SYSCALL_DELEGATED(70, msgrcv)
 SYSCALL_DELEGATED(72, fcntl)
 SYSCALL_DELEGATED(79, getcwd)
 SYSCALL_DELEGATED(89, readlink)
-SYSCALL_DELEGATED(96, gettimeofday)
+SYSCALL_HANDLED(96, gettimeofday)
 SYSCALL_HANDLED(97, getrlimit)
 SYSCALL_HANDLED(101, ptrace)
 SYSCALL_HANDLED(102, getuid)
diff --git a/kernel/ap.c b/kernel/ap.c
index a5bb0d9f..4ec8d78e 100644
--- a/kernel/ap.c
+++ b/kernel/ap.c
@@ -24,18 +24,21 @@
 #include
 #include
 #include
+#include
 
 int num_processors = 1;
 static volatile int ap_stop = 1;
+extern void zero_tsc(void);
 
 static void ap_wait(void)
 {
-	wrmsr(MSR_IA32_TIME_STAMP_COUNTER, 0);
-
 	while (ap_stop) {
 		barrier();
 		cpu_pause();
 	}
+
+	zero_tsc();
+
 	kmalloc_init();
 	sched_init();
 
@@ -64,8 +67,6 @@ void ap_init(void)
 
 	ihk_mc_init_ap();
 
-	wrmsr(MSR_IA32_TIME_STAMP_COUNTER, 0);
-
 	cpu_info = ihk_mc_get_cpu_info();
 	bsp_hw_id = ihk_mc_get_hardware_processor_id();
 
diff --git a/kernel/include/cls.h b/kernel/include/cls.h
index 9a735d32..2ccaf187 100644
--- a/kernel/include/cls.h
+++ b/kernel/include/cls.h
@@ -71,6 +71,11 @@ struct cpu_local_var {
 	int in_interrupt;
 	int no_preempt;
 	int timer_enabled;
+
+	unsigned long tv_sec;
+	unsigned long tv_nsec;
+	unsigned long last_tsc;
+
 } __attribute__((aligned(64)));
 
 
diff --git a/kernel/include/time.h b/kernel/include/time.h
index b09e3f67..ce4b9a53 100644
--- a/kernel/include/time.h
+++ b/kernel/include/time.h
@@ -19,6 +19,8 @@
 #ifndef __TIME_H
 #define __TIME_H
 
+#define NS_PER_SEC 1000000000UL
+
 typedef long int __time_t;
 
 /* POSIX.1b structure for a time value. This is like a `struct timeval' but
diff --git a/kernel/init.c b/kernel/init.c
index a8d7fbf8..8a520447 100644
--- a/kernel/init.c
+++ b/kernel/init.c
@@ -200,6 +200,14 @@ static void pc_test(void)
 	        ed[1] - st[1], ed[2] - st[2], ed[3] - st[3]);
 }
 
+extern void ihk_mc_get_boot_time(unsigned long *tv_sec, unsigned long *tv_nsec);
+static void time_init(void)
+{
+	ihk_mc_get_boot_time(&cpu_local_var(tv_sec),
+			&cpu_local_var(tv_nsec));
+	cpu_local_var(last_tsc) = 0;
+}
+
 static void rest_init(void)
 {
 	handler_init();
@@ -212,6 +220,7 @@ static void rest_init(void)
 
 	ap_init();
 	cpu_local_var_init();
+	time_init();
 	kmalloc_init();
 
 	ikc_master_init();
@@ -220,9 +229,13 @@ static void rest_init(void)
 }
 
 int host_ikc_inited = 0;
+extern int num_processors;
+extern void zero_tsc(void);
+extern void update_cpu_local_time(void);
 
 static void post_init(void)
 {
+	int i;
 	cpu_enable_interrupt();
 
 	while (!host_ikc_inited) {
@@ -237,7 +250,20 @@ static void post_init(void)
 		init_host_syscall_channel2();
 		ihk_mc_spinlock_init(&syscall_lock);
 	}
+
+	/* Update time elapsed so far during boot, distribute the current
+	 * date to all cores and zero TSC.
+	 * All AP cores are wait spinning for ap_start() and they will zero
+	 * their TSC immediately. */
+	update_cpu_local_time();
+	cpu_local_var(last_tsc) = 0;
+	for (i = 0; i < num_processors; ++i) {
+		get_cpu_local_var(i)->tv_sec = cpu_local_var(tv_sec);
+		get_cpu_local_var(i)->tv_nsec = cpu_local_var(tv_nsec);
+	}
+	zero_tsc();
 	ap_start();
+
 	create_os_procfs_files();
 }
 #ifdef DCFA_RUN
diff --git a/kernel/syscall.c b/kernel/syscall.c
index 2a434797..7bd1674b 100644
--- a/kernel/syscall.c
+++ b/kernel/syscall.c
@@ -105,6 +105,7 @@ int patch_process_vm(struct process_vm *, void *, const void *, size_t);
 void do_setpgid(int, int);
 extern long alloc_debugreg(struct process *proc);
 extern int num_processors;
+extern unsigned long ihk_mc_get_ns_per_tsc(void);
 static int ptrace_detach(int pid, int data);
 
 int prepare_process_ranges_args_envs(struct process *proc,
@@ -3505,38 +3506,47 @@ SYSCALL_DECLARE(futex)
 			(unsigned long)uaddr, op, val, utime, uaddr2, val3, *uaddr);
 
 	if (utime && (op == FUTEX_WAIT_BITSET || op == FUTEX_WAIT)) {
-		struct syscall_request request IHK_DMA_ALIGN;
-		struct timeval tv_now;
-		request.number = n;
-		unsigned long __phys;
+		if (!gettime_local_support) {
+			struct syscall_request request IHK_DMA_ALIGN;
+			struct timeval tv_now;
+			request.number = n;
+			unsigned long __phys;
 
-		dkprintf("futex,utime and FUTEX_WAIT_*, uaddr=%lx, []=%x\n", (unsigned long)uaddr, *uaddr);
+			dkprintf("futex,utime and FUTEX_WAIT_*, uaddr=%lx, []=%x\n", (unsigned long)uaddr, *uaddr);
 
-		if (ihk_mc_pt_virt_to_phys(cpu_local_var(current)->vm->page_table,
-					(void *)&tv_now, &__phys)) {
-			return -EFAULT;
+			if (ihk_mc_pt_virt_to_phys(cpu_local_var(current)->vm->page_table,
+						(void *)&tv_now, &__phys)) {
+				return -EFAULT;
+			}
+
+			request.args[0] = __phys;
+
+			int r = do_syscall(&request, ihk_mc_get_processor_id(), 0);
+
+			if (r < 0) {
+				return -EFAULT;
+			}
+
+			dkprintf("futex, FUTEX_WAIT_*, arg3 != NULL, pc=%lx\n", (unsigned long)ihk_mc_syscall_pc(ctx));
+			dkprintf("now->tv_sec=%016ld,tv_nsec=%016ld\n", tv_now.tv_sec, tv_now.tv_usec * 1000);
+			dkprintf("utime->tv_sec=%016ld,tv_nsec=%016ld\n", utime->tv_sec, utime->tv_nsec);
+			unsigned long nsec_timeout = ((long)utime->tv_sec * 1000000000ULL) +
+				utime->tv_nsec;
+
+			long nsec_now = ((long)tv_now.tv_sec * 1000000000ULL) +
+				tv_now.tv_usec * 1000;
+			long diff_nsec = nsec_timeout - nsec_now;
+
+			timeout = (diff_nsec / 1000) * 1100; // (usec * 1.1GHz)
 		}
+		/* Compute timeout based on TSC/nanosec ratio */
+		else {
+			unsigned long nsec_timeout = ((long)utime->tv_sec * 1000000000ULL) +
+				utime->tv_nsec;
 
-		request.args[0] = __phys;
-
-		int r = do_syscall(&request, ihk_mc_get_processor_id(), 0);
-
-		if (r < 0) {
-			return -EFAULT;
+			timeout = nsec_timeout * 1000 / ihk_mc_get_ns_per_tsc();
+			dkprintf("futex timeout: %lu\n", timeout);
 		}
-
-		dkprintf("futex, FUTEX_WAIT_*, arg3 != NULL, pc=%lx\n", (unsigned long)ihk_mc_syscall_pc(ctx));
-		dkprintf("now->tv_sec=%016ld,tv_nsec=%016ld\n", tv_now.tv_sec, tv_now.tv_usec * 1000);
-		dkprintf("utime->tv_sec=%016ld,tv_nsec=%016ld\n", utime->tv_sec, utime->tv_nsec);
-
-		long nsec_now = ((long)tv_now.tv_sec * 1000000000ULL) +
-			tv_now.tv_usec * 1000;
-		long nsec_timeout = ((long)utime->tv_sec * 1000000000ULL) +
-			utime->tv_nsec * 1;
-		long diff_nsec = nsec_timeout - nsec_now;
-
-		timeout = (diff_nsec / 1000) * 1100; // (usec * 1.1GHz)
-		dkprintf("futex timeout: %lu\n", timeout);
 	}
 
 	/* Requeue parameter in 'utime' if op == FUTEX_CMP_REQUEUE.
@@ -4913,6 +4923,112 @@ SYSCALL_DECLARE(get_cpu_id)
 	return ihk_mc_get_processor_id();
 }
 
+/*
+ * Advance *tv_sec / *tv_nsec by the time corresponding to tsc_delta TSC
+ * ticks. The value of ihk_mc_get_ns_per_tsc() is divided by 1000 below,
+ * i.e., it is used as nanoseconds per 1000 ticks.
+ */
+void __update_time_from_tsc_delta(unsigned long *tv_sec,
+                                  unsigned long *tv_nsec,
+                                  unsigned long tsc_delta)
+{
+	unsigned long ns_delta = tsc_delta * ihk_mc_get_ns_per_tsc() / 1000;
+
+	*tv_sec += (ns_delta / NS_PER_SEC);
+	*tv_nsec += (ns_delta % NS_PER_SEC);
+	/* Carry into seconds; >= keeps tv_nsec strictly below one second */
+	if (*tv_nsec >= NS_PER_SEC) {
+		*tv_nsec -= NS_PER_SEC;
+		++*tv_sec;
+	}
+}
+
+/* Fold the TSC ticks elapsed since last_tsc into this CPU's local time */
+void update_cpu_local_time(void)
+{
+	unsigned long tsc = rdtsc();
+
+	__update_time_from_tsc_delta(
+			&cpu_local_var(tv_sec),
+			&cpu_local_var(tv_nsec),
+			tsc - cpu_local_var(last_tsc));
+
+	cpu_local_var(last_tsc) = tsc;
+}
+
+
+SYSCALL_DECLARE(gettimeofday)
+{
+	struct timeval *tv = (struct timeval *)ihk_mc_syscall_arg0(ctx);
+	struct syscall_request request IHK_DMA_ALIGN;
+
+	/* Do it locally if supported */
+	if (gettime_local_support) {
+		update_cpu_local_time();
+
+		/* gettimeofday() may be called with tv == NULL */
+		if (tv) {
+			tv->tv_sec = cpu_local_var(tv_sec);
+			tv->tv_usec = cpu_local_var(tv_nsec) / 1000;
+		}
+
+		dkprintf("gettimeofday(): \n");
+		return 0;
+	}
+
+	/* Otherwise offload; forward the timezone argument as well so that
+	 * no uninitialized stack content is passed in the request */
+	request.number = __NR_gettimeofday;
+	request.args[0] = (unsigned long)tv;
+	request.args[1] = ihk_mc_syscall_arg1(ctx);
+
+	return do_syscall(&request, ihk_mc_get_processor_id(), 0);
+}
+
+
+SYSCALL_DECLARE(nanosleep)
+{
+	struct timespec *tv = (struct timespec *)ihk_mc_syscall_arg0(ctx);
+	struct timespec *rem = (struct timespec *)ihk_mc_syscall_arg1(ctx);
+	struct syscall_request request IHK_DMA_ALIGN;
+
+	/* Do it locally if supported */
+	if (gettime_local_support) {
+		unsigned long nanosecs;
+		unsigned long tscs;
+		unsigned long ts;
+
+		/* Reject invalid intervals instead of spinning on a bogus value */
+		if (tv->tv_sec < 0 ||
+				(unsigned long)tv->tv_nsec >= NS_PER_SEC) {
+			return -EINVAL;
+		}
+
+		nanosecs = tv->tv_sec * NS_PER_SEC + tv->tv_nsec;
+		tscs = nanosecs * 1000 / ihk_mc_get_ns_per_tsc();
+		ts = rdtsc();
+
+		/* Spin wait */
+		while (rdtsc() - ts < tscs)
+			cpu_pause();
+
+		/* Slept for the whole interval: nothing remains */
+		if (rem) {
+			rem->tv_sec = 0;
+			rem->tv_nsec = 0;
+		}
+
+		return 0;
+	}
+
+	/* Otherwise offload */
+	request.number = __NR_nanosleep;
+	request.args[0] = (unsigned long)tv;
+	request.args[1] = (unsigned long)rem;
+
+	return do_syscall(&request, ihk_mc_get_processor_id(), 0);
+}
+
 SYSCALL_DECLARE(sched_yield)
 {
 	schedule();
diff --git a/lib/include/ihk/cpu.h b/lib/include/ihk/cpu.h
index 8567e520..ae167683 100644
--- a/lib/include/ihk/cpu.h
+++ b/lib/include/ihk/cpu.h
@@ -102,4 +102,6 @@ int ihk_mc_arch_get_special_register(enum ihk_asr_type, unsigned long *value);
 extern unsigned int ihk_ikc_irq;
 extern unsigned int ihk_ikc_irq_apicid;
 
+extern int gettime_local_support;
+
 #endif