Compare commits
413 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| bf5ac7afc8 | |||
| bc423255d9 | |||
| 6714161c25 | |||
| 992a292c08 | |||
| 64c2e437c6 | |||
| dd9675d65e | |||
| 51ed8dce06 | |||
| 01f5e46865 | |||
| 38961fca78 | |||
| 2d7890731e | |||
| 7d181fccd9 | |||
| bd75e80df2 | |||
| 035e7913d8 | |||
| 7d38c7c147 | |||
| a801bcc591 | |||
| d7b8e7f4f4 | |||
| 6afea4af48 | |||
| 6415dcfdcc | |||
| 0f58e9e77d | |||
| 72e3f5ee50 | |||
| 8d57ad9bc4 | |||
| 35b36c2d33 | |||
| 632611d78c | |||
| d48d44d365 | |||
| 4c0f401424 | |||
| 06f824c829 | |||
| 7a606baad4 | |||
| 4c6c66555e | |||
| 8426cf589a | |||
| da7421e8ee | |||
| 209748d913 | |||
| f81722c63b | |||
| 2189c55d99 | |||
| 201a7e2595 | |||
| 5cdd194856 | |||
| 0061adadfb | |||
| 67843151d3 | |||
| 083cf3fcc9 | |||
| 4236323661 | |||
| 5a9bee55c9 | |||
| 6e23b07b20 | |||
| e64bd49d9e | |||
| 72b8f99d3b | |||
| 090937a5a3 | |||
| 2082acdf0d | |||
| a8f11634e6 | |||
| 4f9865cc8f | |||
| 07efb3ab9a | |||
| 2afc9d37d1 | |||
| fa6f20a3c4 | |||
| 52bc052e1a | |||
| f84415c310 | |||
| 1a853e07d7 | |||
| 07b0954610 | |||
| 1f006b2381 | |||
| 4dfd806aa7 | |||
| c6e3185246 | |||
| d9e6ff235d | |||
| b03f69783a | |||
| ab915f3331 | |||
| 7773c4aef6 | |||
| 58e531eb58 | |||
| 9beef7d901 | |||
| 0733592eb5 | |||
| 4d0e0728f4 | |||
| 66fad4c7a4 | |||
| 5758dba7cf | |||
| 1ca16b9693 | |||
| d29922c820 | |||
| 46b48ac59b | |||
| 446ef0465b | |||
| 200fe9aec4 | |||
| fedba28a93 | |||
| b527503937 | |||
| 6bdafbd33b | |||
| 12e7ed644f | |||
| edf059888d | |||
| a66fb96cd9 | |||
| dd2ef89997 | |||
| ba7edf1981 | |||
| a669fc5125 | |||
| c0cabc2d83 | |||
| e306b1e838 | |||
| 0c3b705f98 | |||
| 9f55263528 | |||
| 74c5f61fd5 | |||
| cadb66e5c1 | |||
| 9b5ccb5a33 | |||
| c5079898c2 | |||
| 746b459e7f | |||
| 4c42086154 | |||
| 56ee0787c9 | |||
| e901d42fb6 | |||
| 29ab087fa2 | |||
| 105d373765 | |||
| 0dd2fad33b | |||
| e554f4e2f9 | |||
| a256280118 | |||
| d75be7228b | |||
| 923dc4aa11 | |||
| e3e0f6a174 | |||
| dd6f721e03 | |||
| 9c25d47d9b | |||
| 5a4148aaaf | |||
| 32c8f6192d | |||
| e2f424846c | |||
| 989af7e045 | |||
| 721cee05a2 | |||
| 86aa76e088 | |||
| ab113658f1 | |||
| 2d72042021 | |||
| 610463ff39 | |||
| dfb0a37305 | |||
| 26b9484bae | |||
| b4aecfd43c | |||
| bf036f19f7 | |||
| 182202523e | |||
| afb7cb3a1e | |||
| fdbdcbd0ee | |||
| a18fd1f45c | |||
| d8170e292c | |||
| fee5234c54 | |||
| 6309095fd2 | |||
| b005adc103 | |||
| 21373338cc | |||
| 39352cd364 | |||
| 84025cc9cb | |||
| 04cbfbb025 | |||
| ba58054c9d | |||
| 7fd55dc83f | |||
| d66af42f7b | |||
| 4b964b8e0d | |||
| 65dc3440cb | |||
| fbd9086ce5 | |||
| c2b1d8e3ef | |||
| e2d59e2cb9 | |||
| 3de0f5ea19 | |||
| 373e9ea63c | |||
| 8daffa939e | |||
| eaa4d35fab | |||
| a968c935b5 | |||
| e01f6dd6ea | |||
| a07d802cbe | |||
| 1e442cce10 | |||
| 3f870b69a6 | |||
| 0fef80cb19 | |||
| 9992fe0d72 | |||
| 2d19ed9391 | |||
| 2f2f04d5a1 | |||
| 1541b26086 | |||
| e6c4d7731d | |||
| 94b527e027 | |||
| 8c9b207557 | |||
| dacb05844b | |||
| c3ec5d20ca | |||
| 92a40f92dd | |||
| 45bddf3caa | |||
| b7671fedd3 | |||
| c38d536aaa | |||
| 4ee0c05e08 | |||
| f2ab0193e5 | |||
| ef910fdf0e | |||
| b97a8c5138 | |||
| 034d10b185 | |||
| 3fe2257929 | |||
| eca4018ecb | |||
| e936b2ebe1 | |||
| d8112f92f8 | |||
| 1076010de4 | |||
| da4a5ec44b | |||
| d35aa9b100 | |||
| ba8dbf1b19 | |||
| 6213f0e488 | |||
| 4ef82c2683 | |||
| e066a8798c | |||
| b702c9691e | |||
| addbe91e59 | |||
| b812848a0e | |||
| ad214c8206 | |||
| 1bc3218fc1 | |||
| 5cc420a6c3 | |||
| c7686fdf4e | |||
| c1dae4d8b0 | |||
| 2473025201 | |||
| fa5c1b23ca | |||
| f2f499aace | |||
| bd47b909bf | |||
| d646c2a4b9 | |||
| 865ada46bf | |||
| cdffc5e853 | |||
| 0e67e9266b | |||
| 1ff0afe6fb | |||
| d34884f9a4 | |||
| 7a0c204dc1 | |||
| 25f67c9ef8 | |||
| a776464a7e | |||
| c40e7105e6 | |||
| 5bac38ce8b | |||
| e3f0662130 | |||
| 21df56b233 | |||
| 393cec513c | |||
| 4437ecc69a | |||
| 40d75baca2 | |||
| 00f3fe0840 | |||
| 47a8b5bda5 | |||
| ec75095073 | |||
| 1794232989 | |||
| 40978d162e | |||
| 536ce9f927 | |||
| 4e5ec74ffe | |||
| a6d8125fd7 | |||
| 15d3a0361e | |||
| 6ad84a96a3 | |||
| 16e846e9b6 | |||
| 5bc7185f07 | |||
| 32462dfb2d | |||
| e3ef88c0cf | |||
| 829aae7b8d | |||
| b836b84825 | |||
| 3e1f154412 | |||
| e7af537452 | |||
| 3565959af7 | |||
| 4667136a4c | |||
| 972d14611a | |||
| e90eef8910 | |||
| f81927b85b | |||
| 701cdcdab1 | |||
| 9635a628a9 | |||
| 3e1b16f3fc | |||
| ff37ff9ccf | |||
| 5b7bcb7170 | |||
| 6a5fe90f98 | |||
| 91373337ba | |||
| 56ed726a88 | |||
| bce10e11e4 | |||
| 91cdb16158 | |||
| c58ab0f648 | |||
| f410af1cfc | |||
| aa15e5eea8 | |||
| df9f1f8f78 | |||
| 7ace35d737 | |||
| 551999ff6b | |||
| 052b3f44ca | |||
| fdcf766337 | |||
| 7d13bfb14e | |||
| 202bfd9955 | |||
| c99e36235b | |||
| 3cecafac59 | |||
| 61fc4c5e55 | |||
| fad73cacc1 | |||
| 8fced29978 | |||
| b0f4ae4890 | |||
| 7070094a31 | |||
| 011185e3f7 | |||
| 461881e46a | |||
| ddc33821cf | |||
| 0ab7d02994 | |||
| a8c4ab221b | |||
| 87d36a7752 | |||
| 998ded414c | |||
| f78d031e64 | |||
| 4ab37dd34a | |||
| 8129dec2f7 | |||
| a1035a1878 | |||
| db169c5f90 | |||
| bbb55ef261 | |||
| 1130cafe41 | |||
| a1cf27e232 | |||
| 5a1ce99d87 | |||
| c7db296e1b | |||
| f634a750c5 | |||
| d07a196c8e | |||
| 8c56c75d2c | |||
| e54895efde | |||
| 2f8cca2d6d | |||
| 64607152ee | |||
| 20383ad3d0 | |||
| 787d34f650 | |||
| ae618a0c68 | |||
| f480376153 | |||
| e4b3a88fc6 | |||
| 69a5c53074 | |||
| 259583e936 | |||
| 0f826290d0 | |||
| e46f027894 | |||
| 3e093f6a40 | |||
| 00996b551f | |||
| 24d8697cef | |||
| be4f6741f9 | |||
| 7a2f67f5f0 | |||
| bba0425267 | |||
| beaf96b375 | |||
| f1af1ffb8f | |||
| 059fab2cc0 | |||
| f284a80656 | |||
| 5f973ab51e | |||
| 60b6713957 | |||
| ebcf9a0d6d | |||
| 942b7f8b78 | |||
| 0b0aa6c0e0 | |||
| 9705a80c82 | |||
| 99a02e2941 | |||
| b88d75720f | |||
| d2b677b6da | |||
| 083645f203 | |||
| 994b9a19ac | |||
| faa929e717 | |||
| 3ee3a9df6d | |||
| 73e1a4f1f9 | |||
| b068fde9cd | |||
| 167ea67dee | |||
| f33d85a27a | |||
| 1e8239d72a | |||
| a51a0a6f13 | |||
| cc3f6e1a4f | |||
| 5db6c311f4 | |||
| f4df713846 | |||
| 7176bb2a47 | |||
| a6bd98cc02 | |||
| 0f7462ae1c | |||
| 0d8d915d82 | |||
| 8f4f68b877 | |||
| 8c0a5a5e61 | |||
| ffd3f53785 | |||
| f39fa54c39 | |||
| 11125b0d68 | |||
| 3ae69d1290 | |||
| 2929fbb803 | |||
| f4db8b96de | |||
| 8eb3bf3559 | |||
| 326a4fcee4 | |||
| 9b82f1a52c | |||
| f3da381752 | |||
| 8aa589a40c | |||
| e03f377326 | |||
| 8d21846562 | |||
| 3e1367caa1 | |||
| 02536b7724 | |||
| e28725884f | |||
| c2b3fb7236 | |||
| 2f95f7cda8 | |||
| e551aa17ed | |||
| e6d4c160cd | |||
| 9390fe5d2c | |||
| 419f5e495b | |||
| 673deadf37 | |||
| 20ea65b38c | |||
| 84665ff699 | |||
| bfbc94dfb0 | |||
| f74dcfc2a1 | |||
| 7c562d0539 | |||
| b5e4459a34 | |||
| 782122b681 | |||
| d550bced78 | |||
| a7ee3f531b | |||
| b9439947a7 | |||
| 3b60a95f13 | |||
| 82ae6d7458 | |||
| 7ebc34ddcc | |||
| bd6a2c2311 | |||
| 5fd68eae54 | |||
| f5857cfc9e | |||
| 1ce1b17a85 | |||
| a2456c3ed2 | |||
| 01d2ea1605 | |||
| 15783f09a0 | |||
| 9efd568e07 | |||
| 1a207e19c2 | |||
| 73cf93727b | |||
| 4410e702d9 | |||
| f584e2ec25 | |||
| 3aa06444f4 | |||
| c897a56c34 | |||
| 5e9957da0f | |||
| 6ff2d4abe7 | |||
| e4239f1885 | |||
| fbbaaf5b54 | |||
| 3fa3920bb3 | |||
| 45e51fcc07 | |||
| 0884e3d543 | |||
| e3c7c9b890 | |||
| f4155cc9e8 | |||
| a01ae91051 | |||
| daca522d25 | |||
| ec521feb15 | |||
| d7bc947a02 | |||
| fb84d4ef11 | |||
| 5fbeee953a | |||
| 4cefb4333f | |||
| 689da07ac6 | |||
| 76981bcc18 | |||
| 6aae35cb3d | |||
| dac6f2883e | |||
| c484f766fa | |||
| 57690479bd | |||
| d0539a9cac | |||
| 4c8f583c0c | |||
| 6118faffa9 | |||
| dad6470c60 | |||
| 46c37fc8f3 | |||
| f6908f21a8 | |||
| 01d9d9a5ba | |||
| c43d993a4d | |||
| 7d9bbecd7a | |||
| d135731398 | |||
| 5c190beb04 | |||
| fc66556f9f | |||
| 648bacc90f | |||
| dd37443fc7 | |||
| e34322702a | |||
| e12997e6a9 | |||
| fabaa806d3 | |||
| a83ad620c8 |
@ -1,5 +1,6 @@
|
||||
TARGET = @TARGET@
|
||||
SBINDIR = @SBINDIR@
|
||||
ETCDIR = @ETCDIR@
|
||||
MANDIR = @MANDIR@
|
||||
|
||||
all::
|
||||
@ -48,6 +49,10 @@ install::
|
||||
mkdir -p -m 755 $(SBINDIR); \
|
||||
install -m 755 arch/x86/tools/mcreboot-smp-x86.sh $(SBINDIR)/mcreboot.sh; \
|
||||
install -m 755 arch/x86/tools/mcstop+release-smp-x86.sh $(SBINDIR)/mcstop+release.sh; \
|
||||
install -m 755 arch/x86/tools/eclair-dump-backtrace.exp $(SBINDIR)/eclair-dump-backtrace.exp;\
|
||||
mkdir -p -m 755 $(ETCDIR); \
|
||||
install -m 644 arch/x86/tools/irqbalance_mck.service $(ETCDIR)/irqbalance_mck.service; \
|
||||
install -m 644 arch/x86/tools/irqbalance_mck.in $(ETCDIR)/irqbalance_mck.in; \
|
||||
mkdir -p -m 755 $(MANDIR)/man1; \
|
||||
install -m 644 arch/x86/tools/mcreboot.1 $(MANDIR)/man1/mcreboot.1; \
|
||||
;; \
|
||||
|
||||
@ -30,6 +30,7 @@
|
||||
#include <cls.h>
|
||||
#include <prctl.h>
|
||||
#include <page.h>
|
||||
#include <kmalloc.h>
|
||||
|
||||
#define LAPIC_ID 0x020
|
||||
#define LAPIC_TIMER 0x320
|
||||
@ -42,8 +43,6 @@
|
||||
#define LAPIC_ICR0 0x300
|
||||
#define LAPIC_ICR2 0x310
|
||||
#define LAPIC_ESR 0x280
|
||||
#define LOCAL_TIMER_VECTOR 0xef
|
||||
#define LOCAL_PERF_VECTOR 0xf0
|
||||
|
||||
#define APIC_INT_LEVELTRIG 0x08000
|
||||
#define APIC_INT_ASSERT 0x04000
|
||||
@ -80,6 +79,7 @@ static void (*lapic_icr_write)(unsigned int h, unsigned int l);
|
||||
static void (*lapic_wait_icr_idle)(void);
|
||||
void (*x86_issue_ipi)(unsigned int apicid, unsigned int low);
|
||||
int running_on_kvm(void);
|
||||
static void smp_func_call_handler(void);
|
||||
|
||||
void init_processors_local(int max_id);
|
||||
void assign_processor_id(void);
|
||||
@ -148,7 +148,7 @@ extern char page_fault[], general_protection_exception[];
|
||||
extern char debug_exception[], int3_exception[];
|
||||
|
||||
uint64_t boot_pat_state = 0;
|
||||
int no_turbo = 0; /* May be updated by early parsing of kargs */
|
||||
int no_turbo = 1; /* May be updated by early parsing of kargs */
|
||||
|
||||
extern int num_processors; /* kernel/ap.c */
|
||||
struct pvclock_vsyscall_time_info *pvti = NULL;
|
||||
@ -181,6 +181,8 @@ static void init_idt(void)
|
||||
}
|
||||
|
||||
static int xsave_available = 0;
|
||||
static int xsave_size = 0;
|
||||
static uint64_t xsave_mask = 0x0;
|
||||
|
||||
void init_fpu(void)
|
||||
{
|
||||
@ -224,6 +226,26 @@ void init_fpu(void)
|
||||
xsetbv(0, reg);
|
||||
dkprintf("init_fpu(): AVX init: XCR0 = 0x%016lX\n", reg);
|
||||
}
|
||||
if(xsave_available){
|
||||
unsigned long eax;
|
||||
unsigned long ebx;
|
||||
unsigned long ecx;
|
||||
unsigned long edx;
|
||||
asm volatile("cpuid" : "=a"(eax),"=b"(ebx),"=c"(ecx),"=d"(edx) : "a" (0x0d), "c" (0x00));
|
||||
xsave_size = ecx;
|
||||
dkprintf("init_fpu(): xsave_size = %d\n", xsave_size);
|
||||
|
||||
if ((eax & (1 << 5)) && (eax & (1 << 6)) && (eax & (1 << 7))) {
|
||||
/* Set xcr0[7:5] to enable avx-512 ops */
|
||||
reg = xgetbv(0);
|
||||
reg |= 0xe6;
|
||||
xsetbv(0, reg);
|
||||
dkprintf("init_fpu(): AVX-512 init: XCR0 = 0x%016lX\n", reg);
|
||||
}
|
||||
}
|
||||
|
||||
xsave_mask = xgetbv(0);
|
||||
dkprintf("init_fpu(): xsave_mask = 0x%016lX\n", xsave_mask);
|
||||
|
||||
/* TODO: set MSR_IA32_XSS to enable xsaves/xrstors */
|
||||
|
||||
@ -234,6 +256,17 @@ void init_fpu(void)
|
||||
asm volatile("finit");
|
||||
}
|
||||
|
||||
int
|
||||
get_xsave_size()
|
||||
{
|
||||
return xsave_size;
|
||||
}
|
||||
|
||||
uint64_t get_xsave_mask()
|
||||
{
|
||||
return xsave_mask;
|
||||
}
|
||||
|
||||
void reload_gdt(struct x86_desc_ptr *gdt_ptr)
|
||||
{
|
||||
asm volatile("pushq %1\n"
|
||||
@ -811,6 +844,25 @@ void set_signal(int sig, void *regs, struct siginfo *info);
|
||||
void check_signal(unsigned long, void *, int);
|
||||
extern void tlb_flush_handler(int vector);
|
||||
|
||||
void __show_stack(uintptr_t *sp) {
|
||||
while (((uintptr_t)sp >= 0xffff800000000000)
|
||||
&& ((uintptr_t)sp < 0xffffffff80000000)) {
|
||||
uintptr_t fp;
|
||||
uintptr_t ip;
|
||||
|
||||
fp = sp[0];
|
||||
ip = sp[1];
|
||||
kprintf("IP: %016lx, SP: %016lx, FP: %016lx\n", ip, (uintptr_t)sp, fp);
|
||||
sp = (void *)fp;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void show_context_stack(uintptr_t *rbp) {
|
||||
__show_stack(rbp);
|
||||
return;
|
||||
}
|
||||
|
||||
void handle_interrupt(int vector, struct x86_user_context *regs)
|
||||
{
|
||||
struct ihk_mc_interrupt_handler *h;
|
||||
@ -883,19 +935,48 @@ void handle_interrupt(int vector, struct x86_user_context *regs)
|
||||
dkprintf("timer[%lu]: CPU_FLAG_NEED_RESCHED \n", rdtsc());
|
||||
}
|
||||
else if (vector == LOCAL_PERF_VECTOR) {
|
||||
struct siginfo info;
|
||||
unsigned long value;
|
||||
struct thread *thread = cpu_local_var(current);
|
||||
struct process *proc = thread->proc;
|
||||
long irqstate;
|
||||
struct mckfd *fdp;
|
||||
|
||||
lapic_write(LAPIC_LVTPC, LOCAL_PERF_VECTOR);
|
||||
|
||||
value = rdmsr(MSR_PERF_GLOBAL_STATUS);
|
||||
wrmsr(MSR_PERF_GLOBAL_OVF_CTRL, value);
|
||||
wrmsr(MSR_PERF_GLOBAL_OVF_CTRL, 0);
|
||||
//TODO: counter overflow signal
|
||||
//set_signal(0x1d, regs, NULL); // SIGIO
|
||||
|
||||
irqstate = ihk_mc_spinlock_lock(&proc->mckfd_lock);
|
||||
for(fdp = proc->mckfd; fdp; fdp = fdp->next) {
|
||||
if(fdp->sig_no > 0)
|
||||
break;
|
||||
}
|
||||
ihk_mc_spinlock_unlock(&proc->mckfd_lock, irqstate);
|
||||
|
||||
if(fdp) {
|
||||
memset(&info, '\0', sizeof info);
|
||||
info.si_signo = fdp->sig_no;
|
||||
info._sifields._sigfault.si_addr = (void *)regs->gpr.rip;
|
||||
info._sifields._sigpoll.si_fd = fdp->fd;
|
||||
set_signal(fdp->sig_no, regs, &info);
|
||||
}
|
||||
else {
|
||||
set_signal(SIGIO, regs, NULL);
|
||||
}
|
||||
}
|
||||
else if (vector >= IHK_TLB_FLUSH_IRQ_VECTOR_START &&
|
||||
vector < IHK_TLB_FLUSH_IRQ_VECTOR_END) {
|
||||
|
||||
tlb_flush_handler(vector);
|
||||
}
|
||||
else if (vector == LOCAL_SMP_FUNC_CALL_VECTOR) {
|
||||
smp_func_call_handler();
|
||||
}
|
||||
else if (vector == 133) {
|
||||
show_context_stack((uintptr_t *)regs->gpr.rbp);
|
||||
}
|
||||
else {
|
||||
list_for_each_entry(h, &handlers[vector - 32], list) {
|
||||
if (h->func) {
|
||||
@ -998,9 +1079,8 @@ unhandled_page_fault(struct thread *thread, void *fault_addr, void *regs)
|
||||
unsigned long error = ((struct x86_user_context *)regs)->gpr.error;
|
||||
|
||||
irqflags = kprintf_lock();
|
||||
dkprintf("[%d] Page fault for 0x%lX\n",
|
||||
ihk_mc_get_processor_id(), address);
|
||||
dkprintf("%s for %s access in %s mode (reserved bit %s set), "
|
||||
__kprintf("Page fault for 0x%lx\n", address);
|
||||
__kprintf("%s for %s access in %s mode (reserved bit %s set), "
|
||||
"it %s an instruction fetch\n",
|
||||
(error & PF_PROT ? "protection fault" : "no page found"),
|
||||
(error & PF_WRITE ? "write" : "read"),
|
||||
@ -1012,14 +1092,14 @@ unhandled_page_fault(struct thread *thread, void *fault_addr, void *regs)
|
||||
list_for_each_entry(range, &vm->vm_range_list, list) {
|
||||
if (range->start <= address && range->end > address) {
|
||||
found = 1;
|
||||
dkprintf("address is in range, flag: 0x%X! \n",
|
||||
__kprintf("address is in range, flag: 0x%lx\n",
|
||||
range->flag);
|
||||
ihk_mc_pt_print_pte(vm->address_space->page_table, (void*)address);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
dkprintf("address is out of range! \n");
|
||||
__kprintf("address is out of range! \n");
|
||||
}
|
||||
|
||||
kprintf_unlock(irqflags);
|
||||
@ -1027,6 +1107,9 @@ unhandled_page_fault(struct thread *thread, void *fault_addr, void *regs)
|
||||
/* TODO */
|
||||
ihk_mc_debug_show_interrupt_context(regs);
|
||||
|
||||
if (!(error & PF_USER)) {
|
||||
panic("panic: kernel mode PF");
|
||||
}
|
||||
|
||||
//dkprintf("now dump a core file\n");
|
||||
//coredump(proc, regs);
|
||||
@ -1494,7 +1577,8 @@ release_fp_regs(struct thread *thread)
|
||||
if (thread && !thread->fp_regs)
|
||||
return;
|
||||
|
||||
pages = (sizeof(fp_regs_struct) + 4095) >> 12;
|
||||
pages = (xsave_size + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
|
||||
dkprintf("release_fp_regs: pages=%d\n", pages);
|
||||
ihk_mc_free_pages(thread->fp_regs, pages);
|
||||
thread->fp_regs = NULL;
|
||||
}
|
||||
@ -1508,7 +1592,8 @@ save_fp_regs(struct thread *thread)
|
||||
int pages;
|
||||
|
||||
if (!thread->fp_regs) {
|
||||
pages = (sizeof(fp_regs_struct) + 4095) >> 12;
|
||||
pages = (xsave_size + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
|
||||
dkprintf("save_fp_regs: pages=%d\n", pages);
|
||||
thread->fp_regs = ihk_mc_alloc_pages(pages, IHK_MC_AP_NOWAIT);
|
||||
|
||||
if (!thread->fp_regs) {
|
||||
@ -1517,14 +1602,15 @@ save_fp_regs(struct thread *thread)
|
||||
}
|
||||
|
||||
memset(thread->fp_regs, 0, sizeof(fp_regs_struct));
|
||||
memset(thread->fp_regs, 0, pages * PAGE_SIZE);
|
||||
}
|
||||
|
||||
if (xsave_available) {
|
||||
unsigned int low, high;
|
||||
|
||||
/* Request full save of x87, SSE and AVX states */
|
||||
low = 0x7;
|
||||
high = 0;
|
||||
/* Request full save of x87, SSE, AVX and AVX-512 states */
|
||||
low = (unsigned int)xsave_mask;
|
||||
high = (unsigned int)(xsave_mask >> 32);
|
||||
|
||||
asm volatile("xsave %0" : : "m" (*thread->fp_regs), "a" (low), "d" (high)
|
||||
: "memory");
|
||||
@ -1546,9 +1632,9 @@ restore_fp_regs(struct thread *thread)
|
||||
if (xsave_available) {
|
||||
unsigned int low, high;
|
||||
|
||||
/* Request full restore of x87, SSE and AVX states */
|
||||
low = 0x7;
|
||||
high = 0;
|
||||
/* Request full restore of x87, SSE, AVX and AVX-512 states */
|
||||
low = (unsigned int)xsave_mask;
|
||||
high = (unsigned int)(xsave_mask >> 32);
|
||||
|
||||
asm volatile("xrstor %0" : : "m" (*thread->fp_regs),
|
||||
"a" (low), "d" (high));
|
||||
@ -1678,7 +1764,7 @@ int arch_setup_pvclock(void)
|
||||
npages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
|
||||
pvti_npages = npages;
|
||||
|
||||
pvti = allocate_pages(npages, IHK_MC_AP_NOWAIT);
|
||||
pvti = ihk_mc_alloc_pages(npages, IHK_MC_AP_NOWAIT);
|
||||
if (!pvti) {
|
||||
ekprintf("arch_setup_pvclock: allocate_pages failed.\n");
|
||||
return -ENOMEM;
|
||||
@ -1708,44 +1794,6 @@ void arch_start_pvclock(void)
|
||||
return;
|
||||
} /* arch_start_pvclock() */
|
||||
|
||||
static struct cpu_mapping *cpu_mapping = NULL;
|
||||
|
||||
int arch_get_cpu_mapping(struct cpu_mapping **buf, int *nelemsp)
|
||||
{
|
||||
int error;
|
||||
size_t size;
|
||||
int npages;
|
||||
struct cpu_mapping *mapping;
|
||||
int cpu;
|
||||
struct x86_cpu_local_variables *v;
|
||||
|
||||
if (!cpu_mapping) {
|
||||
size = sizeof(*mapping) * num_processors;
|
||||
npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
||||
mapping = allocate_pages(npages, IHK_MC_AP_NOWAIT);
|
||||
if (!mapping) {
|
||||
error = -ENOMEM;
|
||||
ekprintf("arch_get_cpu_mapping:allocate_pages failed. %d\n", error);
|
||||
goto out;
|
||||
}
|
||||
|
||||
for (cpu = 0; cpu < num_processors; ++cpu) {
|
||||
v = get_x86_cpu_local_variable(cpu);
|
||||
mapping[cpu].cpu_number = cpu;
|
||||
mapping[cpu].hw_id = v->apic_id;
|
||||
}
|
||||
|
||||
cpu_mapping = mapping;
|
||||
}
|
||||
|
||||
error = 0;
|
||||
*buf = cpu_mapping;
|
||||
*nelemsp = num_processors;
|
||||
|
||||
out:
|
||||
return error;
|
||||
} /* arch_get_cpu_mapping() */
|
||||
|
||||
#define KVM_CPUID_SIGNATURE 0x40000000
|
||||
|
||||
int running_on_kvm(void) {
|
||||
@ -1767,4 +1815,178 @@ int running_on_kvm(void) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
void
|
||||
mod_nmi_ctx(void *nmi_ctx, void (*func)())
|
||||
{
|
||||
unsigned long *l = nmi_ctx;
|
||||
int i;
|
||||
unsigned long flags;
|
||||
|
||||
asm volatile("pushf; pop %0" : "=r"(flags) : : "memory", "cc");
|
||||
for (i = 0; i < 22; i++)
|
||||
l[i] = l[i + 5];
|
||||
l[i++] = (unsigned long)func; // return address
|
||||
l[i++] = 0x20; // KERNEL CS
|
||||
l[i++] = flags & ~RFLAGS_IF; // rflags (disable interrupt)
|
||||
l[i++] = (unsigned long)(l + 27); // ols rsp
|
||||
l[i++] = 0x28; // KERNEL DS
|
||||
}
|
||||
|
||||
int arch_cpu_read_write_register(
|
||||
struct ihk_os_cpu_register *desc,
|
||||
enum mcctrl_os_cpu_operation op)
|
||||
{
|
||||
if (op == MCCTRL_OS_CPU_READ_REGISTER) {
|
||||
desc->val = rdmsr(desc->addr);
|
||||
}
|
||||
else if (op == MCCTRL_OS_CPU_WRITE_REGISTER) {
|
||||
wrmsr(desc->addr, desc->val);
|
||||
}
|
||||
else {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Generic remote CPU function invocation facility.
|
||||
*/
|
||||
static void smp_func_call_handler(void)
|
||||
{
|
||||
int irq_flags;
|
||||
struct smp_func_call_request *req;
|
||||
int reqs_left;
|
||||
|
||||
reiterate:
|
||||
req = NULL;
|
||||
reqs_left = 0;
|
||||
|
||||
irq_flags = ihk_mc_spinlock_lock(
|
||||
&cpu_local_var(smp_func_req_lock));
|
||||
|
||||
/* Take requests one-by-one */
|
||||
if (!list_empty(&cpu_local_var(smp_func_req_list))) {
|
||||
req = list_first_entry(&cpu_local_var(smp_func_req_list),
|
||||
struct smp_func_call_request, list);
|
||||
list_del(&req->list);
|
||||
|
||||
reqs_left = !list_empty(&cpu_local_var(smp_func_req_list));
|
||||
}
|
||||
|
||||
ihk_mc_spinlock_unlock(&cpu_local_var(smp_func_req_lock),
|
||||
irq_flags);
|
||||
|
||||
if (req) {
|
||||
req->ret = req->sfcd->func(req->cpu_index,
|
||||
req->sfcd->nr_cpus, req->sfcd->arg);
|
||||
ihk_atomic_dec(&req->sfcd->cpus_left);
|
||||
}
|
||||
|
||||
if (reqs_left)
|
||||
goto reiterate;
|
||||
}
|
||||
|
||||
int smp_call_func(cpu_set_t *__cpu_set, smp_func_t __func, void *__arg)
|
||||
{
|
||||
int cpu, nr_cpus = 0;
|
||||
int cpu_index = 0;
|
||||
int this_cpu_index = 0;
|
||||
struct smp_func_call_data sfcd;
|
||||
struct smp_func_call_request *reqs;
|
||||
int ret = 0;
|
||||
int call_on_this_cpu = 0;
|
||||
cpu_set_t cpu_set;
|
||||
|
||||
/* Sanity checks */
|
||||
if (!__cpu_set || !__func) {
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* Make sure it won't change in between */
|
||||
cpu_set = *__cpu_set;
|
||||
|
||||
for_each_set_bit(cpu, (unsigned long *)&cpu_set,
|
||||
sizeof(cpu_set) * BITS_PER_BYTE) {
|
||||
|
||||
if (cpu == ihk_mc_get_processor_id()) {
|
||||
call_on_this_cpu = 1;
|
||||
}
|
||||
++nr_cpus;
|
||||
}
|
||||
|
||||
if (!nr_cpus) {
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
reqs = kmalloc(sizeof(*reqs) * nr_cpus, IHK_MC_AP_NOWAIT);
|
||||
if (!reqs) {
|
||||
ret = -ENOMEM;
|
||||
goto free_out;
|
||||
}
|
||||
|
||||
sfcd.nr_cpus = nr_cpus;
|
||||
sfcd.func = __func;
|
||||
sfcd.arg = __arg;
|
||||
ihk_atomic_set(&sfcd.cpus_left,
|
||||
call_on_this_cpu ? nr_cpus - 1 : nr_cpus);
|
||||
|
||||
/* Add requests and send IPIs */
|
||||
cpu_index = 0;
|
||||
for_each_set_bit(cpu, (unsigned long *)&cpu_set,
|
||||
sizeof(cpu_set) * BITS_PER_BYTE) {
|
||||
unsigned long irq_flags;
|
||||
|
||||
reqs[cpu_index].cpu_index = cpu_index;
|
||||
reqs[cpu_index].ret = 0;
|
||||
|
||||
if (cpu == ihk_mc_get_processor_id()) {
|
||||
this_cpu_index = cpu_index;
|
||||
++cpu_index;
|
||||
continue;
|
||||
}
|
||||
|
||||
reqs[cpu_index].sfcd = &sfcd;
|
||||
|
||||
irq_flags =
|
||||
ihk_mc_spinlock_lock(&get_cpu_local_var(cpu)->smp_func_req_lock);
|
||||
list_add_tail(&reqs[cpu_index].list,
|
||||
&get_cpu_local_var(cpu)->smp_func_req_list);
|
||||
ihk_mc_spinlock_unlock(&get_cpu_local_var(cpu)->smp_func_req_lock,
|
||||
irq_flags);
|
||||
|
||||
ihk_mc_interrupt_cpu(
|
||||
get_x86_cpu_local_variable(cpu)->apic_id,
|
||||
LOCAL_SMP_FUNC_CALL_VECTOR);
|
||||
|
||||
++cpu_index;
|
||||
}
|
||||
|
||||
/* Is this CPU involved? */
|
||||
if (call_on_this_cpu) {
|
||||
reqs[this_cpu_index].ret =
|
||||
__func(this_cpu_index, nr_cpus, __arg);
|
||||
}
|
||||
|
||||
/* Wait for the rest of the CPUs */
|
||||
while (ihk_atomic_read(&sfcd.cpus_left) > 0) {
|
||||
cpu_pause();
|
||||
}
|
||||
|
||||
/* Check return values, if error, report the first non-zero */
|
||||
for (cpu_index = 0; cpu_index < nr_cpus; ++cpu_index) {
|
||||
if (reqs[cpu_index].ret != 0) {
|
||||
ret = reqs[cpu_index].ret;
|
||||
goto free_out;
|
||||
}
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
|
||||
free_out:
|
||||
kfree(reqs);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*** end of file ***/
|
||||
|
||||
@ -182,7 +182,6 @@ void fill_prpsinfo(struct note *head, struct thread *thread, void *regs)
|
||||
/*
|
||||
We leave most of the fields unfilled.
|
||||
|
||||
char pr_state;
|
||||
char pr_sname;
|
||||
char pr_zomb;
|
||||
char pr_nice;
|
||||
|
||||
@ -13,6 +13,8 @@
|
||||
#ifndef HEADER_X86_COMMON_ARCH_BITOPS_H
|
||||
#define HEADER_X86_COMMON_ARCH_BITOPS_H
|
||||
|
||||
#define ARCH_HAS_FAST_MULTIPLIER 1
|
||||
|
||||
static inline int fls(int x)
|
||||
{
|
||||
int r;
|
||||
|
||||
@ -131,6 +131,7 @@ static void __ihk_mc_spinlock_unlock(ihk_spinlock_t *lock, unsigned long flags)
|
||||
typedef struct mcs_lock_node {
|
||||
unsigned long locked;
|
||||
struct mcs_lock_node *next;
|
||||
unsigned long irqsave;
|
||||
} __attribute__((aligned(64))) mcs_lock_node_t;
|
||||
|
||||
static void mcs_lock_init(struct mcs_lock_node *node)
|
||||
@ -139,7 +140,7 @@ static void mcs_lock_init(struct mcs_lock_node *node)
|
||||
node->next = NULL;
|
||||
}
|
||||
|
||||
static void mcs_lock_lock(struct mcs_lock_node *lock,
|
||||
static void __mcs_lock_lock(struct mcs_lock_node *lock,
|
||||
struct mcs_lock_node *node)
|
||||
{
|
||||
struct mcs_lock_node *pred;
|
||||
@ -158,7 +159,7 @@ static void mcs_lock_lock(struct mcs_lock_node *lock,
|
||||
}
|
||||
}
|
||||
|
||||
static void mcs_lock_unlock(struct mcs_lock_node *lock,
|
||||
static void __mcs_lock_unlock(struct mcs_lock_node *lock,
|
||||
struct mcs_lock_node *node)
|
||||
{
|
||||
if (node->next == NULL) {
|
||||
@ -178,6 +179,37 @@ static void mcs_lock_unlock(struct mcs_lock_node *lock,
|
||||
node->next->locked = 0;
|
||||
}
|
||||
|
||||
static void mcs_lock_lock_noirq(struct mcs_lock_node *lock,
|
||||
struct mcs_lock_node *node)
|
||||
{
|
||||
preempt_disable();
|
||||
__mcs_lock_lock(lock, node);
|
||||
}
|
||||
|
||||
static void mcs_lock_unlock_noirq(struct mcs_lock_node *lock,
|
||||
struct mcs_lock_node *node)
|
||||
{
|
||||
__mcs_lock_unlock(lock, node);
|
||||
preempt_enable();
|
||||
}
|
||||
|
||||
static void mcs_lock_lock(struct mcs_lock_node *lock,
|
||||
struct mcs_lock_node *node)
|
||||
{
|
||||
node->irqsave = cpu_disable_interrupt_save();
|
||||
mcs_lock_lock_noirq(lock, node);
|
||||
}
|
||||
|
||||
static void mcs_lock_unlock(struct mcs_lock_node *lock,
|
||||
struct mcs_lock_node *node)
|
||||
{
|
||||
mcs_lock_unlock_noirq(lock, node);
|
||||
cpu_restore_interrupt(node->irqsave);
|
||||
}
|
||||
|
||||
|
||||
#define SPINLOCK_IN_MCS_RWLOCK
|
||||
|
||||
// reader/writer lock
|
||||
typedef struct mcs_rwlock_node {
|
||||
ihk_atomic_t count; // num of readers (use only common reader)
|
||||
@ -194,21 +226,31 @@ typedef struct mcs_rwlock_node {
|
||||
} __attribute__((aligned(64))) mcs_rwlock_node_t;
|
||||
|
||||
typedef struct mcs_rwlock_node_irqsave {
|
||||
#ifndef SPINLOCK_IN_MCS_RWLOCK
|
||||
struct mcs_rwlock_node node;
|
||||
#endif
|
||||
unsigned long irqsave;
|
||||
} __attribute__((aligned(64))) mcs_rwlock_node_irqsave_t;
|
||||
|
||||
typedef struct mcs_rwlock_lock {
|
||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||
ihk_spinlock_t slock;
|
||||
#else
|
||||
struct mcs_rwlock_node reader; /* common reader lock */
|
||||
struct mcs_rwlock_node *node; /* base */
|
||||
#endif
|
||||
} __attribute__((aligned(64))) mcs_rwlock_lock_t;
|
||||
|
||||
static void
|
||||
mcs_rwlock_init(struct mcs_rwlock_lock *lock)
|
||||
{
|
||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||
ihk_mc_spinlock_init(&lock->slock);
|
||||
#else
|
||||
ihk_atomic_set(&lock->reader.count, 0);
|
||||
lock->reader.type = MCS_RWLOCK_TYPE_COMMON_READER;
|
||||
lock->node = NULL;
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef DEBUG_MCS_RWLOCK
|
||||
@ -223,6 +265,9 @@ __kprintf("[%d] ret mcs_rwlock_writer_lock_noirq\n", ihk_mc_get_processor_id());
|
||||
static void
|
||||
__mcs_rwlock_writer_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
|
||||
{
|
||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||
ihk_mc_spinlock_lock_noirq(&lock->slock);
|
||||
#else
|
||||
struct mcs_rwlock_node *pred;
|
||||
|
||||
preempt_disable();
|
||||
@ -240,8 +285,10 @@ __mcs_rwlock_writer_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_n
|
||||
cpu_pause();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifndef SPINLOCK_IN_MCS_RWLOCK
|
||||
static void
|
||||
mcs_rwlock_unlock_readers(struct mcs_rwlock_lock *lock)
|
||||
{
|
||||
@ -298,6 +345,7 @@ mcs_rwlock_unlock_readers(struct mcs_rwlock_lock *lock)
|
||||
|
||||
f->locked = MCS_RWLOCK_UNLOCKED;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG_MCS_RWLOCK
|
||||
#define mcs_rwlock_writer_unlock_noirq(l, n) { \
|
||||
@ -311,6 +359,9 @@ __kprintf("[%d] ret mcs_rwlock_writer_unlock_noirq\n", ihk_mc_get_processor_id()
|
||||
static void
|
||||
__mcs_rwlock_writer_unlock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
|
||||
{
|
||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||
ihk_mc_spinlock_unlock_noirq(&lock->slock);
|
||||
#else
|
||||
if (node->next == NULL) {
|
||||
struct mcs_rwlock_node *old = (struct mcs_rwlock_node *)
|
||||
atomic_cmpxchg8((unsigned long *)&lock->node,
|
||||
@ -335,6 +386,7 @@ __mcs_rwlock_writer_unlock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock
|
||||
|
||||
out:
|
||||
preempt_enable();
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef DEBUG_MCS_RWLOCK
|
||||
@ -367,6 +419,9 @@ atomic_inc_ifnot0(ihk_atomic_t *v)
|
||||
static void
|
||||
__mcs_rwlock_reader_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
|
||||
{
|
||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||
ihk_mc_spinlock_lock_noirq(&lock->slock);
|
||||
#else
|
||||
struct mcs_rwlock_node *pred;
|
||||
|
||||
preempt_disable();
|
||||
@ -415,6 +470,7 @@ __mcs_rwlock_reader_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_n
|
||||
}
|
||||
out:
|
||||
return;
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef DEBUG_MCS_RWLOCK
|
||||
@ -429,6 +485,9 @@ __kprintf("[%d] ret mcs_rwlock_reader_unlock_noirq\n", ihk_mc_get_processor_id()
|
||||
static void
|
||||
__mcs_rwlock_reader_unlock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
|
||||
{
|
||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||
ihk_mc_spinlock_unlock_noirq(&lock->slock);
|
||||
#else
|
||||
if(ihk_atomic_dec_return(&lock->reader.count))
|
||||
goto out;
|
||||
|
||||
@ -458,6 +517,7 @@ __mcs_rwlock_reader_unlock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock
|
||||
|
||||
out:
|
||||
preempt_enable();
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef DEBUG_MCS_RWLOCK
|
||||
@ -472,8 +532,12 @@ __kprintf("[%d] ret mcs_rwlock_writer_lock\n", ihk_mc_get_processor_id()); \
|
||||
static void
|
||||
__mcs_rwlock_writer_lock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
|
||||
{
|
||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||
node->irqsave = ihk_mc_spinlock_lock(&lock->slock);
|
||||
#else
|
||||
node->irqsave = cpu_disable_interrupt_save();
|
||||
__mcs_rwlock_writer_lock_noirq(lock, &node->node);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef DEBUG_MCS_RWLOCK
|
||||
@ -488,8 +552,12 @@ __kprintf("[%d] ret mcs_rwlock_writer_unlock\n", ihk_mc_get_processor_id()); \
|
||||
static void
|
||||
__mcs_rwlock_writer_unlock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
|
||||
{
|
||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||
ihk_mc_spinlock_unlock(&lock->slock, node->irqsave);
|
||||
#else
|
||||
__mcs_rwlock_writer_unlock_noirq(lock, &node->node);
|
||||
cpu_restore_interrupt(node->irqsave);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef DEBUG_MCS_RWLOCK
|
||||
@ -504,8 +572,12 @@ __kprintf("[%d] ret mcs_rwlock_reader_lock\n", ihk_mc_get_processor_id()); \
|
||||
static void
|
||||
__mcs_rwlock_reader_lock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
|
||||
{
|
||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||
node->irqsave = ihk_mc_spinlock_lock(&lock->slock);
|
||||
#else
|
||||
node->irqsave = cpu_disable_interrupt_save();
|
||||
__mcs_rwlock_reader_lock_noirq(lock, &node->node);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef DEBUG_MCS_RWLOCK
|
||||
@ -520,8 +592,12 @@ __kprintf("[%d] ret mcs_rwlock_reader_unlock\n", ihk_mc_get_processor_id()); \
|
||||
static void
|
||||
__mcs_rwlock_reader_unlock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
|
||||
{
|
||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||
ihk_mc_spinlock_unlock(&lock->slock, node->irqsave);
|
||||
#else
|
||||
__mcs_rwlock_reader_unlock_noirq(lock, &node->node);
|
||||
cpu_restore_interrupt(node->irqsave);
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@ -204,6 +204,11 @@ static inline int pte_is_fileoff(pte_t *ptep, size_t pgsize)
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Replace the physical-address field of *ptep with the one taken from
 * 'phys'. Bits outside PT_PHYSMASK are kept from the existing entry;
 * bits of 'phys' outside PT_PHYSMASK are ignored.
 */
static inline void pte_update_phys(pte_t *ptep, unsigned long phys)
{
	const pte_t kept_bits = *ptep & ~PT_PHYSMASK;
	const pte_t frame_bits = phys & PT_PHYSMASK;

	*ptep = kept_bits | frame_bits;
}
|
||||
|
||||
static inline uintptr_t pte_get_phys(pte_t *ptep)
|
||||
{
|
||||
return (*ptep & PT_PHYSMASK);
|
||||
@ -306,7 +311,7 @@ struct page_table;
|
||||
void set_pte(pte_t *ppte, unsigned long phys, enum ihk_mc_pt_attribute attr);
|
||||
pte_t *get_pte(struct page_table *pt, void *virt, enum ihk_mc_pt_attribute attr);
|
||||
|
||||
void *early_alloc_page(void);
|
||||
void *early_alloc_pages(int nr_pages);
|
||||
void *get_last_early_heap(void);
|
||||
void flush_tlb(void);
|
||||
void flush_tlb_single(unsigned long addr);
|
||||
@ -318,5 +323,5 @@ extern unsigned long ap_trampoline;
|
||||
#define AP_TRAMPOLINE_SIZE 0x2000
|
||||
|
||||
/* Local is cachable */
|
||||
#define IHK_IKC_QUEUE_PT_ATTR (PTATTR_NO_EXECUTE | PTATTR_WRITABLE | PTATTR_UNCACHABLE)
|
||||
#define IHK_IKC_QUEUE_PT_ATTR (PTATTR_NO_EXECUTE | PTATTR_WRITABLE)
|
||||
#endif
|
||||
|
||||
42
arch/x86/kernel/include/arch-string.h
Normal file
42
arch/x86/kernel/include/arch-string.h
Normal file
@ -0,0 +1,42 @@
|
||||
#ifndef _ASM_X86_STRING_H
|
||||
#define _ASM_X86_STRING_H
|
||||
|
||||
#define ARCH_FAST_MEMCPY
|
||||
|
||||
/*
 * Copy n bytes from 'from' to 'to' using x86 string instructions and
 * return 'to'.
 *
 * "rep movsl" moves n / 4 doublewords; bit 1 of n then selects one
 * trailing movsw and bit 0 one trailing movsb, which covers the
 * remaining 0-3 bytes.
 */
static inline void *__inline_memcpy(void *to, const void *from, size_t n)
{
	/* Dummy outputs: the c, D and S registers are modified by the copy. */
	unsigned long cnt_out, dst_out, src_out;

	__asm__ __volatile__("rep ; movsl\n\t"
			     "testb $2,%b[len]\n\t"
			     "je 1f\n\t"
			     "movsw\n"
			     "1:\ttestb $1,%b[len]\n\t"
			     "je 2f\n\t"
			     "movsb\n"
			     "2:"
			     : "=&c" (cnt_out), "=&D" (dst_out), "=&S" (src_out)
			     : "0" (n / 4), [len] "q" (n),
			       "1" ((long)to), "2" ((long)from)
			     : "memory");

	return to;
}
|
||||
|
||||
#define ARCH_FAST_MEMSET
|
||||
|
||||
/*
 * Fill 'count' bytes at 's' using x86 string-store instructions and
 * return 's'.
 *
 * "rep stosl" stores count / 4 doublewords; bit 1 of count then selects
 * one trailing stosw and bit 0 one trailing stosb.
 *
 * The stores take their data from the 'a' register, which is loaded with
 * 'c' unchanged: stosl writes its low 32 bits, stosw its low 16 bits and
 * stosb its low 8 bits. For a byte-wise fill the caller therefore has to
 * pass the fill byte already replicated across the low 32 bits of 'c'.
 * NOTE(review): confirm that every caller does this.
 */
static inline void *__inline_memset(void *s, unsigned long c, size_t count)
{
	/*
	 * Dummy outputs for the c and D registers. They are tied to 64-bit
	 * inputs (count / 4 and the destination address) through matching
	 * constraints, so they must be 64 bits wide as well; with 'int' the
	 * operand widths disagree.
	 */
	unsigned long d0, d1;

	__asm__ __volatile__("rep ; stosl\n\t"
			     "testb $2,%b3\n\t"
			     "je 1f\n\t"
			     "stosw\n"
			     "1:\ttestb $1,%b3\n\t"
			     "je 2f\n\t"
			     "stosb\n"
			     "2:"
			     : "=&c" (d0), "=&D" (d1)
			     : "a" (c), "q" (count), "0" (count/4), "1" ((long)s)
			     : "memory");
	return s;
}
|
||||
|
||||
#endif
|
||||
@ -215,4 +215,25 @@ static inline unsigned long atomic_cmpxchg4(unsigned int *addr,
|
||||
return oldval;
|
||||
}
|
||||
|
||||
/*
 * Atomically add 'i' to the long at 'v' with a locked 64-bit add.
 * Nothing is returned.
 *
 * The addend uses the "er" constraint: addq only encodes a sign-extended
 * 32-bit immediate, so a constant is used as an immediate only when it
 * fits that range and is placed in a register otherwise. A plain "i"
 * constraint would accept any 64-bit constant and emit an instruction
 * the assembler cannot encode.
 */
static inline void ihk_atomic_add_long(long i, long *v) {
	__asm__ __volatile__("lock addq %1,%0"
			     : "+m" (*v)
			     : "er" (i));
}
|
||||
/*
 * Atomically add the signed value 'i' to the unsigned long at 'v' with a
 * locked 64-bit add. Nothing is returned.
 *
 * The addend uses the "er" constraint: addq only encodes a sign-extended
 * 32-bit immediate, so a constant is used as an immediate only when it
 * fits that range and is placed in a register otherwise. A plain "i"
 * constraint would accept any 64-bit constant and emit an instruction
 * the assembler cannot encode.
 */
static inline void ihk_atomic_add_ulong(long i, unsigned long *v) {
	__asm__ __volatile__("lock addq %1,%0"
			     : "+m" (*v)
			     : "er" (i));
}
|
||||
|
||||
/*
 * Atomically add 'i' to the long at 'v' and return the resulting value
 * (converted to unsigned long by the return type).
 *
 * The locked xaddq stores the sum into *v and leaves the previous
 * contents of *v in 'i', so the new value is rebuilt as
 * addend + previous.
 */
static inline unsigned long ihk_atomic_add_long_return(long i, long *v) {
	const long addend = i;

	__asm__ __volatile__("lock xaddq %0, %1"
			     : "+r" (i), "+m" (*v)
			     : : "memory");

	/* 'i' now holds the value *v had before the add. */
	return addend + i;
}
|
||||
|
||||
#endif
|
||||
|
||||
@ -15,6 +15,9 @@
|
||||
|
||||
#include <ikc/ihk.h>
|
||||
|
||||
#define IKC_PORT_IKC2MCKERNEL 501
|
||||
#define IKC_PORT_IKC2LINUX 503
|
||||
|
||||
/* manycore side */
|
||||
int ihk_mc_ikc_init_first(struct ihk_ikc_channel_desc *,
|
||||
ihk_ikc_ph_t handler);
|
||||
|
||||
@ -215,6 +215,7 @@ struct x86_sregs {
|
||||
* bit 4 == 1: fault was an instruction fetch
|
||||
*
|
||||
* internal use:
|
||||
* bit 29 == 1: Make PF map text modified by ptrace_poketext()
|
||||
* bit 30 == 1: don't use COW page to resolve page fault.
|
||||
*/
|
||||
enum x86_pf_error_code {
|
||||
|
||||
@ -22,7 +22,7 @@
|
||||
|
||||
SYSCALL_HANDLED(0, read)
|
||||
SYSCALL_DELEGATED(1, write)
|
||||
SYSCALL_DELEGATED(2, open)
|
||||
SYSCALL_HANDLED(2, open)
|
||||
SYSCALL_HANDLED(3, close)
|
||||
SYSCALL_DELEGATED(4, stat)
|
||||
SYSCALL_DELEGATED(5, fstat)
|
||||
@ -66,7 +66,7 @@ SYSCALL_DELEGATED(65, semop)
|
||||
SYSCALL_HANDLED(67, shmdt)
|
||||
SYSCALL_DELEGATED(69, msgsnd)
|
||||
SYSCALL_DELEGATED(70, msgrcv)
|
||||
SYSCALL_DELEGATED(72, fcntl)
|
||||
SYSCALL_HANDLED(72, fcntl)
|
||||
SYSCALL_DELEGATED(79, getcwd)
|
||||
SYSCALL_DELEGATED(89, readlink)
|
||||
SYSCALL_HANDLED(96, gettimeofday)
|
||||
@ -150,5 +150,11 @@ SYSCALL_HANDLED(602, pmc_start)
|
||||
SYSCALL_HANDLED(603, pmc_stop)
|
||||
SYSCALL_HANDLED(604, pmc_reset)
|
||||
SYSCALL_HANDLED(700, get_cpu_id)
|
||||
#ifdef PROFILE_ENABLE
|
||||
SYSCALL_HANDLED(__NR_profile, profile)
|
||||
#endif // PROFILE_ENABLE
|
||||
SYSCALL_HANDLED(730, util_migrate_inter_kernel)
|
||||
SYSCALL_HANDLED(731, util_indicate_clone)
|
||||
SYSCALL_HANDLED(732, get_system)
|
||||
|
||||
/**** End of File ****/
|
||||
|
||||
@ -130,11 +130,40 @@ general_protection_exception:
|
||||
addq $8, %rsp
|
||||
iretq
|
||||
|
||||
.global __freeze
|
||||
__freeze:
|
||||
PUSH_ALL_REGS
|
||||
callq freeze
|
||||
POP_ALL_REGS
|
||||
iretq
|
||||
|
||||
.globl nmi
|
||||
nmi:
|
||||
#define PANICED 232
|
||||
#define PANIC_REGS 240
|
||||
movq %rax,%gs:PANIC_REGS+0x00
|
||||
movq %rsp,%gs:PANIC_REGS+0x08
|
||||
|
||||
movl nmi_mode(%rip),%eax
|
||||
cmp $1,%rax
|
||||
je 1f
|
||||
cmp $2,%rax
|
||||
jne 3f
|
||||
1:
|
||||
cld
|
||||
movq %gs:PANIC_REGS+0x00,%rax
|
||||
PUSH_ALL_REGS
|
||||
subq $40, %rsp
|
||||
movq %rsp,%gs:PANIC_REGS+0x10
|
||||
movq %rsp, %rdi
|
||||
call freeze_thaw
|
||||
cmpq $0, %rax
|
||||
jnz 2f
|
||||
addq $40, %rsp
|
||||
2:
|
||||
POP_ALL_REGS
|
||||
iretq
|
||||
3:
|
||||
movq %rbx,%gs:PANIC_REGS+0x08
|
||||
movq %rcx,%gs:PANIC_REGS+0x10
|
||||
movq %rdx,%gs:PANIC_REGS+0x18
|
||||
@ -210,6 +239,7 @@ enter_user_mode:
|
||||
movq $0, %rdi
|
||||
movq %rsp, %rsi
|
||||
call check_signal
|
||||
call utilthr_migrate
|
||||
movq $0, %rdi
|
||||
call set_cputime
|
||||
POP_ALL_REGS
|
||||
|
||||
@ -23,6 +23,7 @@
|
||||
#include <process.h>
|
||||
#include <page.h>
|
||||
#include <cls.h>
|
||||
#include <kmalloc.h>
|
||||
|
||||
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
|
||||
#define ekprintf(...) kprintf(__VA_ARGS__)
|
||||
@ -30,11 +31,10 @@
|
||||
static char *last_page;
|
||||
extern char _head[], _end[];
|
||||
|
||||
static struct ihk_mc_pa_ops *pa_ops;
|
||||
|
||||
extern unsigned long x86_kernel_phys_base;
|
||||
|
||||
void *early_alloc_page(void)
|
||||
/* Arch specific early allocation routine */
|
||||
void *early_alloc_pages(int nr_pages)
|
||||
{
|
||||
void *p;
|
||||
|
||||
@ -45,59 +45,38 @@ void *early_alloc_page(void)
|
||||
last_page = phys_to_virt(virt_to_phys(last_page));
|
||||
} else if (last_page == (void *)-1) {
|
||||
panic("Early allocator is already finalized. Do not use it.\n");
|
||||
}
|
||||
} else {
|
||||
if(virt_to_phys(last_page) >= bootstrap_mem_end) {
|
||||
panic("Early allocator: Out of memory\n");
|
||||
}
|
||||
}
|
||||
p = last_page;
|
||||
last_page += PAGE_SIZE;
|
||||
last_page += (nr_pages * PAGE_SIZE);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
void *arch_alloc_page(enum ihk_mc_ap_flag flag)
|
||||
void early_alloc_invalidate(void)
|
||||
{
|
||||
if (pa_ops)
|
||||
return pa_ops->alloc_page(1, PAGE_P2ALIGN, flag);
|
||||
else
|
||||
return early_alloc_page();
|
||||
}
|
||||
void arch_free_page(void *ptr)
|
||||
{
|
||||
if (pa_ops)
|
||||
pa_ops->free_page(ptr, 1);
|
||||
last_page = (void *)-1;
|
||||
}
|
||||
|
||||
void *ihk_mc_alloc_aligned_pages(int npages, int p2align, enum ihk_mc_ap_flag flag)
|
||||
void *ihk_mc_allocate(int size, int flag)
|
||||
{
|
||||
if (pa_ops)
|
||||
return pa_ops->alloc_page(npages, p2align, flag);
|
||||
else
|
||||
if (!cpu_local_var(kmalloc_initialized)) {
|
||||
kprintf("%s: error, kmalloc not yet initialized\n", __FUNCTION__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void *ihk_mc_alloc_pages(int npages, enum ihk_mc_ap_flag flag)
|
||||
{
|
||||
return ihk_mc_alloc_aligned_pages(npages, PAGE_P2ALIGN, flag);
|
||||
}
|
||||
|
||||
void ihk_mc_free_pages(void *p, int npages)
|
||||
{
|
||||
if (pa_ops)
|
||||
pa_ops->free_page(p, npages);
|
||||
}
|
||||
|
||||
void *ihk_mc_allocate(int size, enum ihk_mc_ap_flag flag)
|
||||
{
|
||||
if (pa_ops && pa_ops->alloc)
|
||||
return pa_ops->alloc(size, flag);
|
||||
else
|
||||
return ihk_mc_alloc_pages(1, flag);
|
||||
}
|
||||
return kmalloc(size, IHK_MC_AP_NOWAIT);
|
||||
}
|
||||
|
||||
void ihk_mc_free(void *p)
|
||||
{
|
||||
if (pa_ops && pa_ops->free)
|
||||
return pa_ops->free(p);
|
||||
else
|
||||
return ihk_mc_free_pages(p, 1);
|
||||
if (!cpu_local_var(kmalloc_initialized)) {
|
||||
kprintf("%s: error, kmalloc not yet initialized\n", __FUNCTION__);
|
||||
return;
|
||||
}
|
||||
kfree(p);
|
||||
}
|
||||
|
||||
void *get_last_early_heap(void)
|
||||
@ -172,7 +151,7 @@ static unsigned long setup_l3(struct page_table *pt,
|
||||
pt->entry[i] = 0;
|
||||
continue;
|
||||
}
|
||||
pt_phys = setup_l2(arch_alloc_page(IHK_MC_AP_CRITICAL), phys, start, end);
|
||||
pt_phys = setup_l2(ihk_mc_alloc_pages(1, IHK_MC_AP_CRITICAL), phys, start, end);
|
||||
|
||||
pt->entry[i] = pt_phys | PFL3_PDIR_ATTR;
|
||||
}
|
||||
@ -196,7 +175,7 @@ static void init_normal_area(struct page_table *pt)
|
||||
|
||||
for (phys = (map_start & ~(PTL4_SIZE - 1)); phys < map_end;
|
||||
phys += PTL4_SIZE) {
|
||||
pt_phys = setup_l3(arch_alloc_page(IHK_MC_AP_CRITICAL), phys,
|
||||
pt_phys = setup_l3(ihk_mc_alloc_pages(1, IHK_MC_AP_CRITICAL), phys,
|
||||
map_start, map_end);
|
||||
|
||||
pt->entry[ident_index++] = pt_phys | PFL4_PDIR_ATTR;
|
||||
@ -204,9 +183,9 @@ static void init_normal_area(struct page_table *pt)
|
||||
}
|
||||
}
|
||||
|
||||
static struct page_table *__alloc_new_pt(enum ihk_mc_ap_flag ap_flag)
|
||||
static struct page_table *__alloc_new_pt(ihk_mc_ap_flag ap_flag)
|
||||
{
|
||||
struct page_table *newpt = arch_alloc_page(ap_flag);
|
||||
struct page_table *newpt = ihk_mc_alloc_pages(1, ap_flag);
|
||||
|
||||
if(newpt)
|
||||
memset(newpt, 0, sizeof(struct page_table));
|
||||
@ -303,7 +282,7 @@ void set_pte(pte_t *ppte, unsigned long phys, enum ihk_mc_pt_attribute attr)
|
||||
* and returns a pointer to the PTE corresponding to the
|
||||
* virtual address.
|
||||
*/
|
||||
pte_t *get_pte(struct page_table *pt, void *virt, enum ihk_mc_pt_attribute attr, enum ihk_mc_ap_flag ap_flag)
|
||||
pte_t *get_pte(struct page_table *pt, void *virt, enum ihk_mc_pt_attribute attr, ihk_mc_ap_flag ap_flag)
|
||||
{
|
||||
int l4idx, l3idx, l2idx, l1idx;
|
||||
unsigned long v = (unsigned long)virt;
|
||||
@ -364,7 +343,7 @@ static int __set_pt_page(struct page_table *pt, void *virt, unsigned long phys,
|
||||
int l4idx, l3idx, l2idx, l1idx;
|
||||
unsigned long v = (unsigned long)virt;
|
||||
struct page_table *newpt;
|
||||
enum ihk_mc_ap_flag ap_flag;
|
||||
ihk_mc_ap_flag ap_flag;
|
||||
int in_kernel =
|
||||
(((unsigned long long)virt) >= 0xffff000000000000ULL);
|
||||
unsigned long init_pt_lock_flags;
|
||||
@ -515,8 +494,10 @@ uint64_t ihk_mc_pt_virt_to_pagemap(struct page_table *pt, unsigned long virt)
|
||||
return pagemap;
|
||||
}
|
||||
|
||||
int ihk_mc_pt_virt_to_phys(struct page_table *pt,
|
||||
const void *virt, unsigned long *phys)
|
||||
int ihk_mc_pt_virt_to_phys_size(struct page_table *pt,
|
||||
const void *virt,
|
||||
unsigned long *phys,
|
||||
unsigned long *size)
|
||||
{
|
||||
int l4idx, l3idx, l2idx, l1idx;
|
||||
unsigned long v = (unsigned long)virt;
|
||||
@ -538,6 +519,7 @@ int ihk_mc_pt_virt_to_phys(struct page_table *pt,
|
||||
if ((pt->entry[l3idx] & PFL3_SIZE)) {
|
||||
*phys = pte_get_phys(&pt->entry[l3idx])
|
||||
| (v & (PTL3_SIZE - 1));
|
||||
if (size) *size = PTL3_SIZE;
|
||||
return 0;
|
||||
}
|
||||
pt = phys_to_virt(pte_get_phys(&pt->entry[l3idx]));
|
||||
@ -548,6 +530,7 @@ int ihk_mc_pt_virt_to_phys(struct page_table *pt,
|
||||
if ((pt->entry[l2idx] & PFL2_SIZE)) {
|
||||
*phys = pte_get_phys(&pt->entry[l2idx])
|
||||
| (v & (PTL2_SIZE - 1));
|
||||
if (size) *size = PTL2_SIZE;
|
||||
return 0;
|
||||
}
|
||||
pt = phys_to_virt(pte_get_phys(&pt->entry[l2idx]));
|
||||
@ -557,9 +540,17 @@ int ihk_mc_pt_virt_to_phys(struct page_table *pt,
|
||||
}
|
||||
|
||||
*phys = pte_get_phys(&pt->entry[l1idx]) | (v & (PTL1_SIZE - 1));
|
||||
if (size) *size = PTL1_SIZE;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Translate 'virt' through page table 'pt' and store the physical
 * address in *phys.
 *
 * Thin wrapper around ihk_mc_pt_virt_to_phys_size() for callers that do
 * not need the size of the mapping; its return value is passed through
 * unchanged.
 */
int ihk_mc_pt_virt_to_phys(struct page_table *pt,
			   const void *virt, unsigned long *phys)
{
	unsigned long *const no_size = NULL;	/* mapping size not wanted */

	return ihk_mc_pt_virt_to_phys_size(pt, virt, phys, no_size);
}
|
||||
|
||||
|
||||
int ihk_mc_pt_print_pte(struct page_table *pt, void *virt)
|
||||
{
|
||||
int l4idx, l3idx, l2idx, l1idx;
|
||||
@ -571,28 +562,34 @@ int ihk_mc_pt_print_pte(struct page_table *pt, void *virt)
|
||||
|
||||
GET_VIRT_INDICES(v, l4idx, l3idx, l2idx, l1idx);
|
||||
|
||||
__kprintf("l4 table: 0x%lX l4idx: %d \n", virt_to_phys(pt), l4idx);
|
||||
if (!(pt->entry[l4idx] & PFL4_PRESENT)) {
|
||||
__kprintf("0x%lX l4idx not present! \n", (unsigned long)virt);
|
||||
__kprintf("l4 entry: 0x%lX\n", pt->entry[l4idx]);
|
||||
return -EFAULT;
|
||||
}
|
||||
__kprintf("l4 entry: 0x%lX\n", pt->entry[l4idx]);
|
||||
pt = phys_to_virt(pt->entry[l4idx] & PAGE_MASK);
|
||||
|
||||
__kprintf("l3 table: 0x%lX l3idx: %d \n", virt_to_phys(pt), l3idx);
|
||||
if (!(pt->entry[l3idx] & PFL3_PRESENT)) {
|
||||
__kprintf("0x%lX l3idx not present! \n", (unsigned long)virt);
|
||||
__kprintf("l3 entry: 0x%lX\n", pt->entry[l3idx]);
|
||||
return -EFAULT;
|
||||
}
|
||||
__kprintf("l3 entry: 0x%lX\n", pt->entry[l3idx]);
|
||||
if ((pt->entry[l3idx] & PFL3_SIZE)) {
|
||||
__kprintf("l3 entry is 1G page\n");
|
||||
return 0;
|
||||
}
|
||||
pt = phys_to_virt(pt->entry[l3idx] & PAGE_MASK);
|
||||
|
||||
__kprintf("l2 table: 0x%lX l2idx: %d \n", virt_to_phys(pt), l2idx);
|
||||
if (!(pt->entry[l2idx] & PFL2_PRESENT)) {
|
||||
__kprintf("0x%lX l2idx not present! \n", (unsigned long)virt);
|
||||
__kprintf("l2 entry: 0x%lX\n", pt->entry[l2idx]);
|
||||
return -EFAULT;
|
||||
}
|
||||
__kprintf("l2 entry: 0x%lX\n", pt->entry[l2idx]);
|
||||
if ((pt->entry[l2idx] & PFL2_SIZE)) {
|
||||
__kprintf("l2 entry is 2M page\n");
|
||||
return 0;
|
||||
}
|
||||
pt = phys_to_virt(pt->entry[l2idx] & PAGE_MASK);
|
||||
@ -671,7 +668,7 @@ int ihk_mc_pt_prepare_map(page_table_t p, void *virt, unsigned long size,
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct page_table *ihk_mc_pt_create(enum ihk_mc_ap_flag ap_flag)
|
||||
struct page_table *ihk_mc_pt_create(ihk_mc_ap_flag ap_flag)
|
||||
{
|
||||
struct page_table *pt = ihk_mc_alloc_pages(1, ap_flag);
|
||||
|
||||
@ -715,7 +712,7 @@ static void destroy_page_table(int level, struct page_table *pt)
|
||||
}
|
||||
}
|
||||
|
||||
arch_free_page(pt);
|
||||
ihk_mc_free_pages(pt, 1);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -910,11 +907,17 @@ static int split_large_page(pte_t *ptep, size_t pgsize)
|
||||
|
||||
*ptep = (virt_to_phys(pt) & PT_PHYSMASK) | PFL2_PDIR_ATTR;
|
||||
|
||||
if (phys_base != NOPHYS) {
|
||||
page = phys_to_page(phys_base);
|
||||
if (page && page_unmap(page)) {
|
||||
kprintf("split_large_page:page_unmap:%p\n", page);
|
||||
panic("split_large_page:page_unmap\n");
|
||||
/* Do not do this check for large pages as they don't come from the zeroobj
|
||||
* and are not actually mapped.
|
||||
* TODO: clean up zeroobj as we don't really need it, anonymous mappings
|
||||
* should be allocated for real */
|
||||
if (pgsize != PTL2_SIZE) {
|
||||
if (phys_base != NOPHYS) {
|
||||
page = phys_to_page(phys_base);
|
||||
if (pgsize != PTL2_SIZE && page && page_unmap(page)) {
|
||||
kprintf("split_large_page:page_unmap:%p\n", page);
|
||||
panic("split_large_page:page_unmap\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
@ -1072,11 +1075,29 @@ int visit_pte_range(page_table_t pt, void *start0, void *end0, int pgshift,
|
||||
|
||||
struct clear_range_args {
|
||||
int free_physical;
|
||||
uint8_t padding[4];
|
||||
struct memobj *memobj;
|
||||
struct process_vm *vm;
|
||||
unsigned long *addr;
|
||||
int nr_addr;
|
||||
int max_nr_addr;
|
||||
};
|
||||
|
||||
static void remote_flush_tlb_add_addr(struct clear_range_args *args,
|
||||
unsigned long addr)
|
||||
{
|
||||
if (args->nr_addr < args->max_nr_addr) {
|
||||
args->addr[args->nr_addr] = addr;
|
||||
++args->nr_addr;
|
||||
return;
|
||||
}
|
||||
|
||||
remote_flush_tlb_array_cpumask(args->vm, args->addr, args->nr_addr,
|
||||
ihk_mc_get_processor_id());
|
||||
|
||||
args->addr[0] = addr;
|
||||
args->nr_addr = 1;
|
||||
}
|
||||
|
||||
static int clear_range_l1(void *args0, pte_t *ptep, uint64_t base,
|
||||
uint64_t start, uint64_t end)
|
||||
{
|
||||
@ -1090,7 +1111,7 @@ static int clear_range_l1(void *args0, pte_t *ptep, uint64_t base,
|
||||
}
|
||||
|
||||
old = xchg(ptep, PTE_NULL);
|
||||
remote_flush_tlb_cpumask(args->vm, base, ihk_mc_get_processor_id());
|
||||
remote_flush_tlb_add_addr(args, base);
|
||||
|
||||
page = NULL;
|
||||
if (!pte_is_fileoff(&old, PTL1_SIZE)) {
|
||||
@ -1098,13 +1119,15 @@ static int clear_range_l1(void *args0, pte_t *ptep, uint64_t base,
|
||||
page = phys_to_page(phys);
|
||||
}
|
||||
|
||||
if (page && page_is_in_memobj(page) && (old & PFL1_DIRTY)) {
|
||||
if (page && page_is_in_memobj(page) && (old & PFL1_DIRTY) && (args->memobj) &&
|
||||
!(args->memobj->flags & MF_ZEROFILL)) {
|
||||
memobj_flush_page(args->memobj, phys, PTL1_SIZE);
|
||||
}
|
||||
|
||||
if (!(old & PFL1_FILEOFF) && args->free_physical) {
|
||||
if (page && page_unmap(page)) {
|
||||
ihk_mc_free_pages(phys_to_virt(phys), 1);
|
||||
if (!page || (page && page_unmap(page))) {
|
||||
ihk_mc_free_pages_user(phys_to_virt(phys), 1);
|
||||
dkprintf("%s: freeing regular page at 0x%lx\n", __FUNCTION__, base);
|
||||
}
|
||||
args->vm->currss -= PTL1_SIZE;
|
||||
}
|
||||
@ -1137,8 +1160,7 @@ static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base,
|
||||
|
||||
if (*ptep & PFL2_SIZE) {
|
||||
old = xchg(ptep, PTE_NULL);
|
||||
remote_flush_tlb_cpumask(args->vm, base,
|
||||
ihk_mc_get_processor_id());
|
||||
remote_flush_tlb_add_addr(args, base);
|
||||
|
||||
page = NULL;
|
||||
if (!pte_is_fileoff(&old, PTL2_SIZE)) {
|
||||
@ -1151,8 +1173,10 @@ static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base,
|
||||
}
|
||||
|
||||
if (!(old & PFL2_FILEOFF) && args->free_physical) {
|
||||
if (page && page_unmap(page)) {
|
||||
ihk_mc_free_pages(phys_to_virt(phys), PTL2_SIZE/PTL1_SIZE);
|
||||
if (!page || (page && page_unmap(page))) {
|
||||
ihk_mc_free_pages_user(phys_to_virt(phys),
|
||||
PTL2_SIZE/PTL1_SIZE);
|
||||
dkprintf("%s: freeing large page at 0x%lx\n", __FUNCTION__, base);
|
||||
}
|
||||
args->vm->currss -= PTL2_SIZE;
|
||||
}
|
||||
@ -1168,9 +1192,8 @@ static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base,
|
||||
|
||||
if ((start <= base) && ((base + PTL2_SIZE) <= end)) {
|
||||
*ptep = PTE_NULL;
|
||||
remote_flush_tlb_cpumask(args->vm, base,
|
||||
ihk_mc_get_processor_id());
|
||||
arch_free_page(pt);
|
||||
remote_flush_tlb_add_addr(args, base);
|
||||
ihk_mc_free_pages(pt, 1);
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -1201,8 +1224,7 @@ static int clear_range_l3(void *args0, pte_t *ptep, uint64_t base,
|
||||
|
||||
if (*ptep & PFL3_SIZE) {
|
||||
old = xchg(ptep, PTE_NULL);
|
||||
remote_flush_tlb_cpumask(args->vm, base,
|
||||
ihk_mc_get_processor_id());
|
||||
remote_flush_tlb_add_addr(args, base);
|
||||
|
||||
page = NULL;
|
||||
if (!pte_is_fileoff(&old, PTL3_SIZE)) {
|
||||
@ -1215,8 +1237,9 @@ static int clear_range_l3(void *args0, pte_t *ptep, uint64_t base,
|
||||
}
|
||||
|
||||
if (!(old & PFL3_FILEOFF) && args->free_physical) {
|
||||
if (page && page_unmap(page)) {
|
||||
ihk_mc_free_pages(phys_to_virt(phys), PTL3_SIZE/PTL1_SIZE);
|
||||
if (!page || (page && page_unmap(page))) {
|
||||
ihk_mc_free_pages_user(phys_to_virt(phys),
|
||||
PTL3_SIZE/PTL1_SIZE);
|
||||
}
|
||||
args->vm->currss -= PTL3_SIZE;
|
||||
}
|
||||
@ -1232,9 +1255,8 @@ static int clear_range_l3(void *args0, pte_t *ptep, uint64_t base,
|
||||
|
||||
if (use_1gb_page && (start <= base) && ((base + PTL3_SIZE) <= end)) {
|
||||
*ptep = PTE_NULL;
|
||||
remote_flush_tlb_cpumask(args->vm, base,
|
||||
ihk_mc_get_processor_id());
|
||||
arch_free_page(pt);
|
||||
remote_flush_tlb_add_addr(args, base);
|
||||
ihk_mc_free_pages(pt, 1);
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -1253,8 +1275,10 @@ static int clear_range_l4(void *args0, pte_t *ptep, uint64_t base,
|
||||
return walk_pte_l3(pt, base, start, end, &clear_range_l3, args0);
|
||||
}
|
||||
|
||||
static int clear_range(struct page_table *pt, struct process_vm *vm,
|
||||
uintptr_t start, uintptr_t end, int free_physical,
|
||||
#define TLB_INVALID_ARRAY_PAGES (4)
|
||||
|
||||
static int clear_range(struct page_table *pt, struct process_vm *vm,
|
||||
uintptr_t start, uintptr_t end, int free_physical,
|
||||
struct memobj *memobj)
|
||||
{
|
||||
int error;
|
||||
@ -1269,11 +1293,35 @@ static int clear_range(struct page_table *pt, struct process_vm *vm,
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* TODO: embed this in tlb_flush_entry? */
|
||||
args.addr = (unsigned long *)ihk_mc_alloc_pages(
|
||||
TLB_INVALID_ARRAY_PAGES, IHK_MC_AP_CRITICAL);
|
||||
if (!args.addr) {
|
||||
ekprintf("%s: error: allocating address array\n", __FUNCTION__);
|
||||
return -ENOMEM;
|
||||
}
|
||||
args.nr_addr = 0;
|
||||
args.max_nr_addr = (TLB_INVALID_ARRAY_PAGES * PAGE_SIZE /
|
||||
sizeof(uint64_t));
|
||||
|
||||
args.free_physical = free_physical;
|
||||
if (memobj && (memobj->flags & MF_DEV_FILE)) {
|
||||
args.free_physical = 0;
|
||||
}
|
||||
if (memobj && ((memobj->flags & MF_PREMAP))) {
|
||||
args.free_physical = 0;
|
||||
}
|
||||
args.memobj = memobj;
|
||||
args.vm = vm;
|
||||
|
||||
error = walk_pte_l4(pt, 0, start, end, &clear_range_l4, &args);
|
||||
if (args.nr_addr) {
|
||||
remote_flush_tlb_array_cpumask(vm, args.addr, args.nr_addr,
|
||||
ihk_mc_get_processor_id());
|
||||
}
|
||||
|
||||
ihk_mc_free_pages(args.addr, TLB_INVALID_ARRAY_PAGES);
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
@ -1585,7 +1633,7 @@ retry:
|
||||
error = 0;
|
||||
out:
|
||||
if (newpt) {
|
||||
arch_free_page(newpt);
|
||||
ihk_mc_free_pages(newpt, 1);
|
||||
}
|
||||
dkprintf("set_range_l2(%lx,%lx,%lx): %d %lx\n",
|
||||
base, start, end, error, *ptep);
|
||||
@ -1668,7 +1716,7 @@ retry:
|
||||
error = 0;
|
||||
out:
|
||||
if (newpt) {
|
||||
arch_free_page(newpt);
|
||||
ihk_mc_free_pages(newpt, 1);
|
||||
}
|
||||
dkprintf("set_range_l3(%lx,%lx,%lx): %d\n",
|
||||
base, start, end, error, *ptep);
|
||||
@ -1726,7 +1774,7 @@ retry:
|
||||
error = 0;
|
||||
out:
|
||||
if (newpt) {
|
||||
arch_free_page(newpt);
|
||||
ihk_mc_free_pages(newpt, 1);
|
||||
}
|
||||
dkprintf("set_range_l4(%lx,%lx,%lx): %d %lx\n",
|
||||
base, start, end, error, *ptep);
|
||||
@ -1778,9 +1826,19 @@ int ihk_mc_pt_set_pte(page_table_t pt, pte_t *ptep, size_t pgsize,
|
||||
*ptep = phys | attr_to_l1attr(attr);
|
||||
}
|
||||
else if (pgsize == PTL2_SIZE) {
|
||||
if (phys & (PTL2_SIZE - 1)) {
|
||||
kprintf("%s: error: phys needs to be PTL2_SIZE aligned\n", __FUNCTION__);
|
||||
error = -1;
|
||||
goto out;
|
||||
}
|
||||
*ptep = phys | attr_to_l2attr(attr | PTATTR_LARGEPAGE);
|
||||
}
|
||||
else if ((pgsize == PTL3_SIZE) && (use_1gb_page)) {
|
||||
if (phys & (PTL3_SIZE - 1)) {
|
||||
kprintf("%s: error: phys needs to be PTL3_SIZE aligned\n", __FUNCTION__);
|
||||
error = -1;
|
||||
goto out;
|
||||
}
|
||||
*ptep = phys | attr_to_l3attr(attr | PTATTR_LARGEPAGE);
|
||||
}
|
||||
else {
|
||||
@ -2044,7 +2102,8 @@ void *map_fixed_area(unsigned long phys, unsigned long size, int uncachable)
|
||||
attr |= PTATTR_UNCACHABLE;
|
||||
}
|
||||
|
||||
kprintf("map_fixed: %lx => %p (%d pages)\n", paligned, v, npages);
|
||||
kprintf("map_fixed: phys: 0x%lx => 0x%lx (%d pages)\n",
|
||||
paligned, v, npages);
|
||||
|
||||
for (i = 0; i < npages; i++) {
|
||||
if(__set_pt_page(init_pt, (void *)fixed_virt, paligned, attr)){
|
||||
@ -2083,7 +2142,7 @@ static void init_vsyscall_area(struct page_table *pt)
|
||||
void init_page_table(void)
|
||||
{
|
||||
check_available_page_size();
|
||||
init_pt = arch_alloc_page(IHK_MC_AP_CRITICAL);
|
||||
init_pt = ihk_mc_alloc_pages(1, IHK_MC_AP_CRITICAL);
|
||||
ihk_mc_spinlock_init(&init_pt_lock);
|
||||
|
||||
memset(init_pt, 0, sizeof(PAGE_SIZE));
|
||||
@ -2100,27 +2159,27 @@ void init_page_table(void)
|
||||
}
|
||||
|
||||
extern void __reserve_arch_pages(unsigned long, unsigned long,
|
||||
void (*)(unsigned long, unsigned long, int));
|
||||
void (*)(struct ihk_page_allocator_desc *,
|
||||
unsigned long, unsigned long, int));
|
||||
|
||||
void ihk_mc_reserve_arch_pages(unsigned long start, unsigned long end,
|
||||
void (*cb)(unsigned long, unsigned long, int))
|
||||
void ihk_mc_reserve_arch_pages(struct ihk_page_allocator_desc *pa_allocator,
|
||||
unsigned long start, unsigned long end,
|
||||
void (*cb)(struct ihk_page_allocator_desc *,
|
||||
unsigned long, unsigned long, int))
|
||||
{
|
||||
/* Reserve Text + temporary heap */
|
||||
cb(virt_to_phys(_head), virt_to_phys(get_last_early_heap()), 0);
|
||||
cb(pa_allocator, virt_to_phys(_head), virt_to_phys(get_last_early_heap()), 0);
|
||||
/* Reserve trampoline area to boot the second ap */
|
||||
cb(ap_trampoline, ap_trampoline + AP_TRAMPOLINE_SIZE, 0);
|
||||
cb(pa_allocator, ap_trampoline, ap_trampoline + AP_TRAMPOLINE_SIZE, 0);
|
||||
/* Reserve the null page */
|
||||
cb(0, PAGE_SIZE, 0);
|
||||
/* Micro-arch specific */
|
||||
cb(pa_allocator, 0, PAGE_SIZE, 0);
|
||||
/*
|
||||
* Micro-arch specific
|
||||
* TODO: this does nothing in SMP mode, update it for KNC if necessary
|
||||
*/
|
||||
__reserve_arch_pages(start, end, cb);
|
||||
}
|
||||
|
||||
void ihk_mc_set_page_allocator(struct ihk_mc_pa_ops *ops)
|
||||
{
|
||||
last_page = (void *)-1;
|
||||
pa_ops = ops;
|
||||
}
|
||||
|
||||
unsigned long virt_to_phys(void *v)
|
||||
{
|
||||
unsigned long va = (unsigned long)v;
|
||||
@ -2147,26 +2206,18 @@ int copy_from_user(void *dst, const void *src, size_t siz)
|
||||
int strlen_user(const char *s)
|
||||
{
|
||||
struct process_vm *vm = cpu_local_var(current)->vm;
|
||||
struct vm_range *range;
|
||||
unsigned long pgstart;
|
||||
int maxlen;
|
||||
const char *head = s;
|
||||
int err;
|
||||
|
||||
maxlen = 4096 - (((unsigned long)s) & 0x0000000000000fffUL);
|
||||
pgstart = ((unsigned long)s) & 0xfffffffffffff000UL;
|
||||
if(!pgstart || pgstart >= MAP_KERNEL_START)
|
||||
return -EFAULT;
|
||||
ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock);
|
||||
for(;;){
|
||||
range = lookup_process_memory_range(vm, pgstart, pgstart+1);
|
||||
if(range == NULL){
|
||||
ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
|
||||
return -EFAULT;
|
||||
}
|
||||
if((range->flag & VR_PROT_MASK) == VR_PROT_NONE){
|
||||
ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
|
||||
return -EFAULT;
|
||||
}
|
||||
if ((err = verify_process_vm(vm, s, 1)))
|
||||
return err;
|
||||
while(*s && maxlen > 0){
|
||||
s++;
|
||||
maxlen--;
|
||||
@ -2176,14 +2227,12 @@ int strlen_user(const char *s)
|
||||
maxlen = 4096;
|
||||
pgstart += 4096;
|
||||
}
|
||||
ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
|
||||
return s - head;
|
||||
}
|
||||
|
||||
int strcpy_from_user(char *dst, const char *src)
|
||||
{
|
||||
struct process_vm *vm = cpu_local_var(current)->vm;
|
||||
struct vm_range *range;
|
||||
unsigned long pgstart;
|
||||
int maxlen;
|
||||
int err = 0;
|
||||
@ -2192,17 +2241,9 @@ int strcpy_from_user(char *dst, const char *src)
|
||||
pgstart = ((unsigned long)src) & 0xfffffffffffff000UL;
|
||||
if(!pgstart || pgstart >= MAP_KERNEL_START)
|
||||
return -EFAULT;
|
||||
ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock);
|
||||
for(;;){
|
||||
range = lookup_process_memory_range(vm, pgstart, pgstart + 1);
|
||||
if(range == NULL){
|
||||
err = -EFAULT;
|
||||
break;
|
||||
}
|
||||
if((range->flag & VR_PROT_MASK) == VR_PROT_NONE){
|
||||
err = -EFAULT;
|
||||
break;
|
||||
}
|
||||
if ((err = verify_process_vm(vm, src, 1)))
|
||||
return err;
|
||||
while(*src && maxlen > 0){
|
||||
*(dst++) = *(src++);
|
||||
maxlen--;
|
||||
@ -2214,34 +2255,62 @@ int strcpy_from_user(char *dst, const char *src)
|
||||
maxlen = 4096;
|
||||
pgstart += 4096;
|
||||
}
|
||||
ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
|
||||
return err;
|
||||
}
|
||||
|
||||
long getlong_user(const long *p)
|
||||
long getlong_user(long *dest, const long *p)
|
||||
{
|
||||
int error;
|
||||
long l;
|
||||
|
||||
error = copy_from_user(&l, p, sizeof(l));
|
||||
error = copy_from_user(dest, p, sizeof(long));
|
||||
if (error) {
|
||||
return error;
|
||||
}
|
||||
|
||||
return l;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int getint_user(const int *p)
|
||||
int getint_user(int *dest, const int *p)
|
||||
{
|
||||
int error;
|
||||
int i;
|
||||
|
||||
error = copy_from_user(&i, p, sizeof(i));
|
||||
error = copy_from_user(dest, p, sizeof(int));
|
||||
if (error) {
|
||||
return error;
|
||||
}
|
||||
|
||||
return i;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int verify_process_vm(struct process_vm *vm,
|
||||
const void *usrc, size_t size)
|
||||
{
|
||||
const uintptr_t ustart = (uintptr_t)usrc;
|
||||
const uintptr_t uend = ustart + size;
|
||||
uint64_t reason;
|
||||
uintptr_t addr;
|
||||
int error = 0;
|
||||
|
||||
if ((ustart < vm->region.user_start)
|
||||
|| (vm->region.user_end <= ustart)
|
||||
|| ((vm->region.user_end - ustart) < size)) {
|
||||
kprintf("%s: error: out of user range\n", __FUNCTION__);
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
reason = PF_USER; /* page not present */
|
||||
for (addr = ustart & PAGE_MASK; addr < uend; addr += PAGE_SIZE) {
|
||||
if (!addr)
|
||||
return -EINVAL;
|
||||
|
||||
error = page_fault_process_vm(vm, (void *)addr, reason);
|
||||
if (error) {
|
||||
kprintf("%s: error: PF for %p failed\n", __FUNCTION__, addr);
|
||||
return error;
|
||||
}
|
||||
}
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
int read_process_vm(struct process_vm *vm, void *kdst, const void *usrc, size_t siz)
|
||||
@ -2261,13 +2330,18 @@ int read_process_vm(struct process_vm *vm, void *kdst, const void *usrc, size_t
|
||||
if ((ustart < vm->region.user_start)
|
||||
|| (vm->region.user_end <= ustart)
|
||||
|| ((vm->region.user_end - ustart) < siz)) {
|
||||
kprintf("%s: error: out of user range\n", __FUNCTION__);
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
reason = PF_USER; /* page not present */
|
||||
for (addr = ustart & PAGE_MASK; addr < uend; addr += PAGE_SIZE) {
|
||||
if (!addr)
|
||||
return -EINVAL;
|
||||
|
||||
error = page_fault_process_vm(vm, (void *)addr, reason);
|
||||
if (error) {
|
||||
kprintf("%s: error: PF for %p failed\n", __FUNCTION__, addr);
|
||||
return error;
|
||||
}
|
||||
}
|
||||
@ -2283,11 +2357,22 @@ int read_process_vm(struct process_vm *vm, void *kdst, const void *usrc, size_t
|
||||
|
||||
error = ihk_mc_pt_virt_to_phys(vm->address_space->page_table, from, &pa);
|
||||
if (error) {
|
||||
kprintf("%s: error: resolving physical address or %p\n", __FUNCTION__, from);
|
||||
return error;
|
||||
}
|
||||
|
||||
va = phys_to_virt(pa);
|
||||
memcpy(to, va, cpsize);
|
||||
if (pa < ihk_mc_get_memory_address(IHK_MC_GMA_MAP_START, 0) ||
|
||||
pa >= ihk_mc_get_memory_address(IHK_MC_GMA_MAP_END, 0)) {
|
||||
dkprintf("%s: pa is outside of LWK memory, to: %p, pa: %p,"
|
||||
"cpsize: %d\n", __FUNCTION__, to, pa, cpsize);
|
||||
va = ihk_mc_map_virtual(pa, 1, PTATTR_ACTIVE);
|
||||
memcpy(to, va, cpsize);
|
||||
ihk_mc_unmap_virtual(va, 1, 1);
|
||||
}
|
||||
else {
|
||||
va = phys_to_virt(pa);
|
||||
memcpy(to, va, cpsize);
|
||||
}
|
||||
|
||||
from += cpsize;
|
||||
to += cpsize;
|
||||
@ -2356,8 +2441,18 @@ int write_process_vm(struct process_vm *vm, void *udst, const void *ksrc, size_t
|
||||
return error;
|
||||
}
|
||||
|
||||
va = phys_to_virt(pa);
|
||||
memcpy(va, from, cpsize);
|
||||
if (pa < ihk_mc_get_memory_address(IHK_MC_GMA_MAP_START, 0) ||
|
||||
pa >= ihk_mc_get_memory_address(IHK_MC_GMA_MAP_END, 0)) {
|
||||
dkprintf("%s: pa is outside of LWK memory, from: %p,"
|
||||
"pa: %p, cpsize: %d\n", __FUNCTION__, from, pa, cpsize);
|
||||
va = ihk_mc_map_virtual(pa, 1, PTATTR_ACTIVE);
|
||||
memcpy(va, from, cpsize);
|
||||
ihk_mc_unmap_virtual(va, 1, 1);
|
||||
}
|
||||
else {
|
||||
va = phys_to_virt(pa);
|
||||
memcpy(va, from, cpsize);
|
||||
}
|
||||
|
||||
from += cpsize;
|
||||
to += cpsize;
|
||||
@ -2381,7 +2476,7 @@ int patch_process_vm(struct process_vm *vm, void *udst, const void *ksrc, size_t
|
||||
unsigned long pa;
|
||||
void *va;
|
||||
|
||||
kprintf("patch_process_vm(%p,%p,%p,%lx)\n", vm, udst, ksrc, siz);
|
||||
dkprintf("patch_process_vm(%p,%p,%p,%lx)\n", vm, udst, ksrc, siz);
|
||||
if ((ustart < vm->region.user_start)
|
||||
|| (vm->region.user_end <= ustart)
|
||||
|| ((vm->region.user_end - ustart) < siz)) {
|
||||
@ -2413,14 +2508,24 @@ int patch_process_vm(struct process_vm *vm, void *udst, const void *ksrc, size_t
|
||||
return error;
|
||||
}
|
||||
|
||||
va = phys_to_virt(pa);
|
||||
memcpy(va, from, cpsize);
|
||||
if (pa < ihk_mc_get_memory_address(IHK_MC_GMA_MAP_START, 0) ||
|
||||
pa >= ihk_mc_get_memory_address(IHK_MC_GMA_MAP_END, 0)) {
|
||||
dkprintf("%s: pa is outside of LWK memory, from: %p,"
|
||||
"pa: %p, cpsize: %d\n", __FUNCTION__, from, pa, cpsize);
|
||||
va = ihk_mc_map_virtual(pa, 1, PTATTR_ACTIVE);
|
||||
memcpy(va, from, cpsize);
|
||||
ihk_mc_unmap_virtual(va, 1, 1);
|
||||
}
|
||||
else {
|
||||
va = phys_to_virt(pa);
|
||||
memcpy(va, from, cpsize);
|
||||
}
|
||||
|
||||
from += cpsize;
|
||||
to += cpsize;
|
||||
remain -= cpsize;
|
||||
}
|
||||
|
||||
kprintf("patch_process_vm(%p,%p,%p,%lx):%d\n", vm, udst, ksrc, siz, 0);
|
||||
dkprintf("patch_process_vm(%p,%p,%p,%lx):%d\n", vm, udst, ksrc, siz, 0);
|
||||
return 0;
|
||||
} /* patch_process_vm() */
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
#include <memory.h>
|
||||
#include <string.h>
|
||||
|
||||
extern int num_processors;
|
||||
extern void arch_set_mikc_queue(void *r, void *w);
|
||||
ihk_ikc_ph_t arch_master_channel_packet_handler;
|
||||
|
||||
@ -23,22 +24,28 @@ int ihk_mc_ikc_init_first_local(struct ihk_ikc_channel_desc *channel,
|
||||
ihk_ikc_ph_t packet_handler)
|
||||
{
|
||||
struct ihk_ikc_queue_head *rq, *wq;
|
||||
size_t mikc_queue_pages;
|
||||
|
||||
ihk_ikc_system_init(NULL);
|
||||
|
||||
memset(channel, 0, sizeof(struct ihk_ikc_channel_desc));
|
||||
|
||||
/* Place both sides in this side */
|
||||
rq = arch_alloc_page(IHK_MC_AP_CRITICAL);
|
||||
wq = arch_alloc_page(IHK_MC_AP_CRITICAL);
|
||||
mikc_queue_pages = ((2 * num_processors * MASTER_IKCQ_PKTSIZE)
|
||||
+ (PAGE_SIZE - 1)) / PAGE_SIZE;
|
||||
|
||||
ihk_ikc_init_queue(rq, 0, 0, PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
|
||||
ihk_ikc_init_queue(wq, 0, 0, PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
|
||||
/* Place both sides in this side */
|
||||
rq = ihk_mc_alloc_pages(mikc_queue_pages, IHK_MC_AP_CRITICAL);
|
||||
wq = ihk_mc_alloc_pages(mikc_queue_pages, IHK_MC_AP_CRITICAL);
|
||||
|
||||
ihk_ikc_init_queue(rq, 0, 0,
|
||||
mikc_queue_pages * PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
|
||||
ihk_ikc_init_queue(wq, 0, 0,
|
||||
mikc_queue_pages * PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
|
||||
|
||||
arch_master_channel_packet_handler = packet_handler;
|
||||
|
||||
ihk_ikc_init_desc(channel, IKC_OS_HOST, 0, rq, wq,
|
||||
ihk_ikc_master_channel_packet_handler);
|
||||
ihk_ikc_master_channel_packet_handler, channel);
|
||||
ihk_ikc_enable_channel(channel);
|
||||
|
||||
/* Set boot parameter */
|
||||
|
||||
@ -12,16 +12,47 @@
|
||||
#include <errno.h>
|
||||
#include <ihk/debug.h>
|
||||
#include <registers.h>
|
||||
#include <mc_perf_event.h>
|
||||
|
||||
extern unsigned int *x86_march_perfmap;
|
||||
extern int running_on_kvm(void);
|
||||
|
||||
//#define PERFCTR_DEBUG
|
||||
#ifdef PERFCTR_DEBUG
|
||||
#define dkprintf(...) do { kprintf(__VA_ARGS__); } while (0)
|
||||
#define ekprintf(...) do { kprintf(__VA_ARGS__); } while (0)
|
||||
#else
|
||||
#define dkprintf(...) do { } while (0)
|
||||
#define ekprintf(...) do { kprintf(__VA_ARGS__); } while (0)
|
||||
#endif
|
||||
|
||||
#define X86_CR4_PCE 0x00000100
|
||||
|
||||
#define PERFCTR_CHKANDJUMP(cond, msg, err) \
|
||||
do { \
|
||||
if(cond) { \
|
||||
ekprintf("%s,"msg"\n", __FUNCTION__); \
|
||||
ret = err; \
|
||||
goto fn_fail; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
int perf_counters_discovered = 0;
|
||||
int X86_IA32_NUM_PERF_COUNTERS = 0;
|
||||
unsigned long X86_IA32_PERF_COUNTERS_MASK = 0;
|
||||
int X86_IA32_NUM_FIXED_PERF_COUNTERS = 0;
|
||||
unsigned long X86_IA32_FIXED_PERF_COUNTERS_MASK = 0;
|
||||
|
||||
void x86_init_perfctr(void)
|
||||
{
|
||||
int i = 0;
|
||||
unsigned long reg;
|
||||
unsigned long value = 0;
|
||||
uint64_t op;
|
||||
uint64_t eax;
|
||||
uint64_t ebx;
|
||||
uint64_t ecx;
|
||||
uint64_t edx;
|
||||
|
||||
/* Do not do it on KVM */
|
||||
if (running_on_kvm()) return;
|
||||
@ -30,12 +61,41 @@ void x86_init_perfctr(void)
|
||||
asm volatile("movq %%cr4, %0" : "=r"(reg));
|
||||
reg |= X86_CR4_PCE;
|
||||
asm volatile("movq %0, %%cr4" : : "r"(reg));
|
||||
|
||||
/* Detect number of supported performance counters */
|
||||
if (!perf_counters_discovered) {
|
||||
/* See Table 35.2 - Architectural MSRs in Vol 3C */
|
||||
op = 0x0a;
|
||||
asm volatile("cpuid" : "=a"(eax),"=b"(ebx),"=c"(ecx),"=d"(edx):"a"(op));
|
||||
|
||||
X86_IA32_NUM_PERF_COUNTERS = ((eax & 0xFF00) >> 8);
|
||||
X86_IA32_PERF_COUNTERS_MASK = (1 << X86_IA32_NUM_PERF_COUNTERS) - 1;
|
||||
|
||||
X86_IA32_NUM_FIXED_PERF_COUNTERS = (edx & 0x0F);
|
||||
X86_IA32_FIXED_PERF_COUNTERS_MASK =
|
||||
((1UL << X86_IA32_NUM_FIXED_PERF_COUNTERS) - 1) <<
|
||||
X86_IA32_BASE_FIXED_PERF_COUNTERS;
|
||||
|
||||
perf_counters_discovered = 1;
|
||||
kprintf("X86_IA32_NUM_PERF_COUNTERS: %d, X86_IA32_NUM_FIXED_PERF_COUNTERS: %d\n",
|
||||
X86_IA32_NUM_PERF_COUNTERS, X86_IA32_NUM_FIXED_PERF_COUNTERS);
|
||||
}
|
||||
|
||||
/* Clear Fixed Counter Control */
|
||||
value = rdmsr(MSR_PERF_FIXED_CTRL);
|
||||
value &= 0xfffffffffffff000L;
|
||||
wrmsr(MSR_PERF_FIXED_CTRL, value);
|
||||
|
||||
/* Clear Generic Counter Control */
|
||||
for(i = 0; i < X86_IA32_NUM_PERF_COUNTERS; i++) {
|
||||
wrmsr(MSR_IA32_PERFEVTSEL0 + i, 0);
|
||||
}
|
||||
|
||||
/* Enable PMC Control */
|
||||
value = rdmsr(MSR_PERF_GLOBAL_CTRL);
|
||||
value |= X86_IA32_PERF_COUNTERS_MASK;
|
||||
value |= X86_IA32_FIXED_PERF_COUNTERS_MASK;
|
||||
wrmsr(MSR_PERF_GLOBAL_CTRL, value);
|
||||
value = rdmsr(MSR_PERF_GLOBAL_CTRL);
|
||||
value |= X86_IA32_PERF_COUNTERS_MASK;
|
||||
value |= X86_IA32_FIXED_PERF_COUNTERS_MASK;
|
||||
wrmsr(MSR_PERF_GLOBAL_CTRL, value);
|
||||
}
|
||||
|
||||
static int set_perfctr_x86_direct(int counter, int mode, unsigned int value)
|
||||
@ -63,12 +123,12 @@ static int set_perfctr_x86_direct(int counter, int mode, unsigned int value)
|
||||
wrmsr(MSR_IA32_PERFEVTSEL0 + counter, value);
|
||||
|
||||
//kprintf("wrmsr: %d <= %x\n", MSR_PERF_GLOBAL_CTRL, 0);
|
||||
kprintf("wrmsr: %d <= %x\n", MSR_IA32_PERFEVTSEL0 + counter, value);
|
||||
//kprintf("wrmsr: %d <= %x\n", MSR_IA32_PERFEVTSEL0 + counter, value);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int set_pmc_x86_direct(int counter, unsigned long val)
|
||||
static int set_pmc_x86_direct(int counter, long val)
|
||||
{
|
||||
unsigned long cnt_bit = 0;
|
||||
|
||||
@ -76,6 +136,8 @@ static int set_pmc_x86_direct(int counter, unsigned long val)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
val &= 0x000000ffffffffff; // 40bit Mask
|
||||
|
||||
cnt_bit = 1UL << counter;
|
||||
if ( cnt_bit & X86_IA32_PERF_COUNTERS_MASK ) {
|
||||
// set generic pmc
|
||||
@ -102,7 +164,7 @@ static int set_perfctr_x86(int counter, int event, int mask, int inv, int count,
|
||||
static int set_fixed_counter(int counter, int mode)
|
||||
{
|
||||
unsigned long value = 0;
|
||||
unsigned int ctr_mask = 0x7;
|
||||
unsigned int ctr_mask = 0xf;
|
||||
int counter_idx = counter - X86_IA32_BASE_FIXED_PERF_COUNTERS ;
|
||||
unsigned int set_val = 0;
|
||||
|
||||
@ -159,9 +221,12 @@ extern void x86_march_perfctr_start(unsigned long counter_mask);
|
||||
|
||||
int ihk_mc_perfctr_start(unsigned long counter_mask)
|
||||
{
|
||||
int ret = 0;
|
||||
unsigned long value = 0;
|
||||
unsigned long mask = X86_IA32_PERF_COUNTERS_MASK | X86_IA32_FIXED_PERF_COUNTERS_MASK;
|
||||
|
||||
PERFCTR_CHKANDJUMP(counter_mask & ~mask, "counter_mask out of range", -EINVAL);
|
||||
|
||||
#ifdef HAVE_MARCH_PERFCTR_START
|
||||
x86_march_perfctr_start(counter_mask);
|
||||
#endif
|
||||
@ -169,28 +234,53 @@ int ihk_mc_perfctr_start(unsigned long counter_mask)
|
||||
value = rdmsr(MSR_PERF_GLOBAL_CTRL);
|
||||
value |= counter_mask;
|
||||
wrmsr(MSR_PERF_GLOBAL_CTRL, value);
|
||||
|
||||
return 0;
|
||||
fn_exit:
|
||||
return ret;
|
||||
fn_fail:
|
||||
goto fn_exit;
|
||||
}
|
||||
|
||||
int ihk_mc_perfctr_stop(unsigned long counter_mask)
|
||||
{
|
||||
int ret = 0;
|
||||
unsigned long value;
|
||||
unsigned long mask = X86_IA32_PERF_COUNTERS_MASK | X86_IA32_FIXED_PERF_COUNTERS_MASK;
|
||||
|
||||
PERFCTR_CHKANDJUMP(counter_mask & ~mask, "counter_mask out of range", -EINVAL);
|
||||
|
||||
counter_mask &= mask;
|
||||
value = rdmsr(MSR_PERF_GLOBAL_CTRL);
|
||||
value &= ~counter_mask;
|
||||
wrmsr(MSR_PERF_GLOBAL_CTRL, value);
|
||||
|
||||
return 0;
|
||||
if(counter_mask >> 32 & 0x1) {
|
||||
value = rdmsr(MSR_PERF_FIXED_CTRL);
|
||||
value &= ~(0xf);
|
||||
wrmsr(MSR_PERF_FIXED_CTRL, value);
|
||||
}
|
||||
|
||||
if(counter_mask >> 32 & 0x2) {
|
||||
value = rdmsr(MSR_PERF_FIXED_CTRL);
|
||||
value &= ~(0xf << 4);
|
||||
wrmsr(MSR_PERF_FIXED_CTRL, value);
|
||||
}
|
||||
|
||||
if(counter_mask >> 32 & 0x4) {
|
||||
value = rdmsr(MSR_PERF_FIXED_CTRL);
|
||||
value &= ~(0xf << 8);
|
||||
wrmsr(MSR_PERF_FIXED_CTRL, value);
|
||||
}
|
||||
fn_exit:
|
||||
return ret;
|
||||
fn_fail:
|
||||
goto fn_exit;
|
||||
}
|
||||
|
||||
// init for fixed counter
|
||||
int ihk_mc_perfctr_fixed_init(int counter, int mode)
|
||||
{
|
||||
unsigned long value = 0;
|
||||
unsigned int ctr_mask = 0x7;
|
||||
unsigned int ctr_mask = 0xf;
|
||||
int counter_idx = counter - X86_IA32_BASE_FIXED_PERF_COUNTERS ;
|
||||
unsigned int set_val = 0;
|
||||
|
||||
@ -210,6 +300,9 @@ int ihk_mc_perfctr_fixed_init(int counter, int mode)
|
||||
set_val |= 1;
|
||||
}
|
||||
|
||||
// enable PMI on overflow
|
||||
set_val |= 1 << 3;
|
||||
|
||||
set_val <<= counter_idx * 4;
|
||||
value |= set_val;
|
||||
|
||||
@ -223,7 +316,7 @@ int ihk_mc_perfctr_reset(int counter)
|
||||
return set_pmc_x86_direct(counter, 0);
|
||||
}
|
||||
|
||||
int ihk_mc_perfctr_set(int counter, unsigned long val)
|
||||
int ihk_mc_perfctr_set(int counter, long val)
|
||||
{
|
||||
return set_pmc_x86_direct(counter, val);
|
||||
}
|
||||
@ -297,23 +390,33 @@ unsigned long ihk_mc_perfctr_read_msr(int counter)
|
||||
return retval;
|
||||
}
|
||||
|
||||
int ihk_mc_perfctr_alloc_counter(unsigned long pmc_status)
|
||||
int ihk_mc_perfctr_alloc_counter(unsigned int *type, unsigned long *config, unsigned long pmc_status)
|
||||
{
|
||||
int ret = -1;
|
||||
int i = 0;
|
||||
int ret = -1;
|
||||
|
||||
// find avail generic counter
|
||||
for(i = 0; i < X86_IA32_NUM_PERF_COUNTERS; i++) {
|
||||
if(*type == PERF_TYPE_HARDWARE) {
|
||||
switch(*config){
|
||||
case PERF_COUNT_HW_INSTRUCTIONS :
|
||||
*type = PERF_TYPE_RAW;
|
||||
*config = 0x5300c0;
|
||||
break;
|
||||
default :
|
||||
// Unexpected config
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
else if(*type != PERF_TYPE_RAW) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// find avail generic counter
|
||||
for(i = 0; i < X86_IA32_NUM_PERF_COUNTERS; i++) {
|
||||
if(!(pmc_status & (1 << i))) {
|
||||
ret = i;
|
||||
pmc_status |= (1 << i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if(ret < 0){
|
||||
return ret;
|
||||
}
|
||||
|
||||
return ret;
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -38,6 +38,8 @@ void set_signal(int sig, void *regs0, siginfo_t *info);
|
||||
void check_signal(unsigned long rc, void *regs0, int num);
|
||||
extern unsigned long do_fork(int, unsigned long, unsigned long, unsigned long,
|
||||
unsigned long, unsigned long, unsigned long);
|
||||
extern int get_xsave_size();
|
||||
extern uint64_t get_xsave_mask();
|
||||
|
||||
//#define DEBUG_PRINT_SC
|
||||
|
||||
@ -54,6 +56,7 @@ uintptr_t debug_constants[] = {
|
||||
offsetof(struct cpu_local_var, current),
|
||||
offsetof(struct cpu_local_var, runq),
|
||||
offsetof(struct cpu_local_var, status),
|
||||
offsetof(struct cpu_local_var, idle),
|
||||
offsetof(struct thread, ctx),
|
||||
offsetof(struct thread, sched_list),
|
||||
offsetof(struct thread, proc),
|
||||
@ -67,71 +70,37 @@ static struct vdso vdso;
|
||||
static size_t container_size = 0;
|
||||
static ptrdiff_t vdso_offset;
|
||||
|
||||
/*
|
||||
See dkprintf("BSP HW ID = %d, ", bsp_hw_id); (in ./mcos/kernel/ap.c)
|
||||
extern int num_processors;
|
||||
|
||||
Core with BSP HW ID 224 is 1st logical core of last physical core.
|
||||
It boots first and is given SW-ID of 0
|
||||
int obtain_clone_cpuid(cpu_set_t *cpu_set) {
|
||||
int min_queue_len = -1;
|
||||
int cpu, min_cpu = -1;
|
||||
|
||||
Core with BSP HW ID 0 is 1st logical core of 1st physical core.
|
||||
It boots next and is given SW-ID of 1.
|
||||
Core with BSP HW ID 1 boots next and is given SW-ID of 2.
|
||||
Core with BSP HW ID 2 boots next and is given SW-ID of 3.
|
||||
Core with BSP HW ID 3 boots next and is given SW-ID of 4.
|
||||
...
|
||||
Core with BSP HW ID 220 is 1st logical core of 56-th physical core.
|
||||
It boots next and is given SW-ID of 221.
|
||||
Core with BSP HW ID 221 boots next and is given SW-ID of 222.
|
||||
Core with BSP HW ID 222 boots next and is given SW-ID of 223.
|
||||
Core with BSP HW ID 223 boots next and is given SW-ID of 224.
|
||||
/* Find the first allowed core with the shortest run queue */
|
||||
for (cpu = 0; cpu < num_processors; ++cpu) {
|
||||
struct cpu_local_var *v;
|
||||
unsigned long irqstate;
|
||||
|
||||
Core with BSP HW ID 225 is 2nd logical core of last physical core.
|
||||
It boots next and is given SW-ID of 225.
|
||||
Core with BSP HW ID 226 boots next and is given SW-ID of 226.
|
||||
Core with BSP HW ID 227 boots next and is given SW-ID of 227.
|
||||
*/
|
||||
ihk_spinlock_t cpuid_head_lock = 0;
|
||||
static int cpuid_head = 0;
|
||||
if (!CPU_ISSET(cpu, cpu_set)) continue;
|
||||
|
||||
/* archtecture-depended syscall handlers */
|
||||
int obtain_clone_cpuid() {
|
||||
/* see above on BSP HW ID */
|
||||
struct ihk_mc_cpu_info *cpu_info = ihk_mc_get_cpu_info();
|
||||
int cpuid, nretry = 0;
|
||||
ihk_mc_spinlock_lock_noirq(&cpuid_head_lock);
|
||||
|
||||
/* Always start from 0 to fill in LWK cores linearily */
|
||||
cpuid_head = 0;
|
||||
retry:
|
||||
/* Try to obtain next physical core */
|
||||
cpuid = cpuid_head;
|
||||
v = get_cpu_local_var(cpu);
|
||||
irqstate = ihk_mc_spinlock_lock(&v->runq_lock);
|
||||
if (min_queue_len == -1 || v->runq_len < min_queue_len) {
|
||||
min_queue_len = v->runq_len;
|
||||
min_cpu = cpu;
|
||||
}
|
||||
ihk_mc_spinlock_unlock(&v->runq_lock, irqstate);
|
||||
|
||||
/* A hyper-threading core on the same physical core as
|
||||
the parent process might be chosen. Use sched_setaffinity
|
||||
if you want to skip that kind of busy physical core for
|
||||
performance reason. */
|
||||
cpuid_head += 1;
|
||||
if(cpuid_head >= cpu_info->ncpus) {
|
||||
cpuid_head = 0;
|
||||
}
|
||||
if (min_queue_len == 0)
|
||||
break;
|
||||
}
|
||||
|
||||
/* A hyper-threading core whose parent physical core has a
|
||||
process on one of its hyper-threading core might
|
||||
be chosen. Use sched_setaffinity if you want to skip that
|
||||
kind of busy physical core for performance reason. */
|
||||
if(get_cpu_local_var(cpuid)->status != CPU_STATUS_IDLE) {
|
||||
nretry++;
|
||||
if(nretry >= cpu_info->ncpus) {
|
||||
cpuid = -1;
|
||||
ihk_mc_spinlock_unlock_noirq(&cpuid_head_lock);
|
||||
goto out;
|
||||
}
|
||||
goto retry;
|
||||
}
|
||||
get_cpu_local_var(cpuid)->status = CPU_STATUS_RESERVED;
|
||||
ihk_mc_spinlock_unlock_noirq(&cpuid_head_lock);
|
||||
out:
|
||||
return cpuid;
|
||||
if (min_cpu != -1) {
|
||||
if (get_cpu_local_var(min_cpu)->status != CPU_STATUS_RESERVED)
|
||||
get_cpu_local_var(min_cpu)->status = CPU_STATUS_RESERVED;
|
||||
}
|
||||
|
||||
return min_cpu;
|
||||
}
|
||||
|
||||
int
|
||||
@ -219,6 +188,7 @@ SYSCALL_DECLARE(rt_sigreturn)
|
||||
struct x86_user_context *regs;
|
||||
struct sigsp ksigsp;
|
||||
struct sigsp *sigsp;
|
||||
int xsavesize = get_xsave_size();
|
||||
|
||||
asm ("movq %%gs:(%1),%0"
|
||||
: "=r"(regs)
|
||||
@ -265,12 +235,31 @@ SYSCALL_DECLARE(rt_sigreturn)
|
||||
check_signal(0, regs, 0);
|
||||
check_need_resched();
|
||||
}
|
||||
|
||||
if(ksigsp.fpregs && xsavesize){
|
||||
void *fpregs = kmalloc(xsavesize + 64, IHK_MC_AP_NOWAIT);
|
||||
|
||||
if(fpregs){
|
||||
uint64_t xsave_mask = get_xsave_mask();
|
||||
unsigned int low = (unsigned int)xsave_mask;
|
||||
unsigned int high = (unsigned int)(xsave_mask >> 32);
|
||||
struct xsave_struct *kfpregs;
|
||||
|
||||
kfpregs = (void *)((((unsigned long)fpregs) + 63) & ~63);
|
||||
|
||||
if(copy_from_user(kfpregs, ksigsp.fpregs, xsavesize))
|
||||
return -EFAULT;
|
||||
asm volatile("xrstor %0" : : "m"(*kfpregs), "a"(low), "d"(high) : "memory");
|
||||
kfree(fpregs);
|
||||
}
|
||||
}
|
||||
|
||||
return sigsp->sigrc;
|
||||
}
|
||||
|
||||
extern struct cpu_local_var *clv;
|
||||
extern unsigned long do_kill(struct thread *thread, int pid, int tid, int sig, struct siginfo *info, int ptracecont);
|
||||
extern void interrupt_syscall(int all, int pid);
|
||||
extern void interrupt_syscall(struct thread *, int sig);
|
||||
extern int num_processors;
|
||||
|
||||
#define RFLAGS_MASK (RFLAGS_CF | RFLAGS_PF | RFLAGS_AF | RFLAGS_ZF | \
|
||||
@ -521,14 +510,14 @@ void ptrace_report_signal(struct thread *thread, int sig)
|
||||
int parent_pid;
|
||||
struct siginfo info;
|
||||
|
||||
dkprintf("ptrace_report_signal,pid=%d\n", thread->proc->pid);
|
||||
dkprintf("ptrace_report_signal, tid=%d, pid=%d\n", thread->tid, thread->proc->pid);
|
||||
|
||||
mcs_rwlock_writer_lock(&proc->update_lock, &lock);
|
||||
if(!(proc->ptrace & PT_TRACED)){
|
||||
mcs_rwlock_writer_unlock(&proc->update_lock, &lock);
|
||||
return;
|
||||
}
|
||||
proc->exit_status = sig;
|
||||
thread->exit_status = sig;
|
||||
/* Transition thread state */
|
||||
proc->status = PS_TRACED;
|
||||
thread->status = PS_TRACED;
|
||||
@ -546,8 +535,8 @@ void ptrace_report_signal(struct thread *thread, int sig)
|
||||
memset(&info, '\0', sizeof info);
|
||||
info.si_signo = SIGCHLD;
|
||||
info.si_code = CLD_TRAPPED;
|
||||
info._sifields._sigchld.si_pid = thread->proc->pid;
|
||||
info._sifields._sigchld.si_status = thread->proc->exit_status;
|
||||
info._sifields._sigchld.si_pid = thread->tid;
|
||||
info._sifields._sigchld.si_status = thread->exit_status;
|
||||
do_kill(cpu_local_var(current), parent_pid, -1, SIGCHLD, &info, 0);
|
||||
/* Wake parent (if sleeping in wait4()) */
|
||||
waitq_wakeup(&proc->parent->waitpid_q);
|
||||
@ -672,10 +661,10 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
|
||||
int orgsig;
|
||||
int ptraceflag = 0;
|
||||
struct mcs_rwlock_node_irqsave lock;
|
||||
unsigned long irqstate;
|
||||
struct mcs_rwlock_node_irqsave mcs_rw_node;
|
||||
|
||||
for(w = pending->sigmask.__val[0], sig = 0; w; sig++, w >>= 1);
|
||||
dkprintf("do_signal,pid=%d,sig=%d\n", proc->pid, sig);
|
||||
dkprintf("do_signal(): tid=%d, pid=%d, sig=%d\n", thread->tid, proc->pid, sig);
|
||||
orgsig = sig;
|
||||
|
||||
if((proc->ptrace & PT_TRACED) &&
|
||||
@ -695,18 +684,20 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
|
||||
rc = regs->gpr.rax;
|
||||
}
|
||||
|
||||
irqstate = ihk_mc_spinlock_lock(&thread->sigcommon->lock);
|
||||
mcs_rwlock_writer_lock(&thread->sigcommon->lock, &mcs_rw_node);
|
||||
k = thread->sigcommon->action + sig - 1;
|
||||
|
||||
if(k->sa.sa_handler == SIG_IGN){
|
||||
kfree(pending);
|
||||
ihk_mc_spinlock_unlock(&thread->sigcommon->lock, irqstate);
|
||||
mcs_rwlock_writer_unlock(&thread->sigcommon->lock, &mcs_rw_node);
|
||||
return;
|
||||
}
|
||||
else if(k->sa.sa_handler){
|
||||
unsigned long *usp; /* user stack */
|
||||
struct sigsp ksigsp;
|
||||
struct sigsp *sigsp;
|
||||
int xsavesize = get_xsave_size();
|
||||
unsigned long fpregs;
|
||||
|
||||
if((k->sa.sa_flags & SA_ONSTACK) &&
|
||||
!(thread->sigstack.ss_flags & SS_DISABLE) &&
|
||||
@ -719,7 +710,8 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
|
||||
else{
|
||||
usp = (unsigned long *)regs->gpr.rsp;
|
||||
}
|
||||
sigsp = ((struct sigsp *)usp) - 1;
|
||||
fpregs = (unsigned long)usp - xsavesize;
|
||||
sigsp = ((struct sigsp *)fpregs) - 1;
|
||||
sigsp = (struct sigsp *)((unsigned long)sigsp & 0xfffffffffffffff0UL);
|
||||
memset(&ksigsp, '\0', sizeof ksigsp);
|
||||
|
||||
@ -751,19 +743,43 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
|
||||
ksigsp.restart = isrestart(num, rc, sig, k->sa.sa_flags & SA_RESTART);
|
||||
if(num != 0 && rc == -EINTR && sig == SIGCHLD)
|
||||
ksigsp.restart = 1;
|
||||
if(xsavesize){
|
||||
uint64_t xsave_mask = get_xsave_mask();
|
||||
unsigned int low = (unsigned int)xsave_mask;
|
||||
unsigned int high = (unsigned int)(xsave_mask >> 32);
|
||||
void *_kfpregs = kmalloc(xsavesize + 64, IHK_MC_AP_NOWAIT);
|
||||
struct xsave_struct *kfpregs;
|
||||
|
||||
if(!_kfpregs){
|
||||
kfree(pending);
|
||||
kfree(_kfpregs);
|
||||
kprintf("do_signal,no space available\n");
|
||||
terminate(0, sig);
|
||||
return;
|
||||
}
|
||||
kfpregs = (void *)((((unsigned long)_kfpregs) + 63) & ~63);
|
||||
memset(kfpregs, '\0', xsavesize);
|
||||
asm volatile("xsave %0" : : "m"(*kfpregs), "a"(low), "d"(high) : "memory");
|
||||
if(copy_to_user((void *)fpregs, kfpregs, xsavesize)){
|
||||
kfree(pending);
|
||||
kfree(_kfpregs);
|
||||
kprintf("do_signal,write_process_vm failed\n");
|
||||
terminate(0, sig);
|
||||
return;
|
||||
}
|
||||
ksigsp.fpregs = (void *)fpregs;
|
||||
kfree(_kfpregs);
|
||||
}
|
||||
memcpy(&ksigsp.info, &pending->info, sizeof(siginfo_t));
|
||||
|
||||
if(copy_to_user(sigsp, &ksigsp, sizeof ksigsp)){
|
||||
kfree(pending);
|
||||
ihk_mc_spinlock_unlock(&thread->sigcommon->lock, irqstate);
|
||||
mcs_rwlock_writer_unlock(&thread->sigcommon->lock, &mcs_rw_node);
|
||||
kprintf("do_signal,write_process_vm failed\n");
|
||||
terminate(0, sig);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
usp = (unsigned long *)sigsp;
|
||||
usp--;
|
||||
*usp = (unsigned long)k->sa.sa_restorer;
|
||||
@ -777,7 +793,7 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
|
||||
if(!(k->sa.sa_flags & SA_NODEFER))
|
||||
thread->sigmask.__val[0] |= pending->sigmask.__val[0];
|
||||
kfree(pending);
|
||||
ihk_mc_spinlock_unlock(&thread->sigcommon->lock, irqstate);
|
||||
mcs_rwlock_writer_unlock(&thread->sigcommon->lock, &mcs_rw_node);
|
||||
if(regs->gpr.rflags & RFLAGS_TF){
|
||||
struct siginfo info;
|
||||
|
||||
@ -803,7 +819,7 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
|
||||
}
|
||||
else
|
||||
kfree(pending);
|
||||
ihk_mc_spinlock_unlock(&thread->sigcommon->lock, irqstate);
|
||||
mcs_rwlock_writer_unlock(&thread->sigcommon->lock, &mcs_rw_node);
|
||||
switch (sig) {
|
||||
case SIGSTOP:
|
||||
case SIGTSTP:
|
||||
@ -835,7 +851,8 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
|
||||
/* Wake up the parent who tried wait4 and sleeping */
|
||||
waitq_wakeup(&proc->parent->waitpid_q);
|
||||
|
||||
dkprintf("do_signal,SIGSTOP,sleeping\n");
|
||||
dkprintf("do_signal(): pid: %d, tid: %d SIGSTOP, sleeping\n",
|
||||
proc->pid, thread->tid);
|
||||
/* Sleep */
|
||||
schedule();
|
||||
dkprintf("SIGSTOP(): woken up\n");
|
||||
@ -849,7 +866,7 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
|
||||
|
||||
/* Update thread state in fork tree */
|
||||
mcs_rwlock_writer_lock(&proc->update_lock, &lock);
|
||||
proc->exit_status = SIGTRAP;
|
||||
thread->exit_status = SIGTRAP;
|
||||
proc->status = PS_TRACED;
|
||||
thread->status = PS_TRACED;
|
||||
mcs_rwlock_writer_unlock(&proc->update_lock, &lock);
|
||||
@ -903,11 +920,11 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
|
||||
static struct sig_pending *
|
||||
getsigpending(struct thread *thread, int delflag){
|
||||
struct list_head *head;
|
||||
ihk_spinlock_t *lock;
|
||||
mcs_rwlock_lock_t *lock;
|
||||
struct mcs_rwlock_node_irqsave mcs_rw_node;
|
||||
struct sig_pending *next;
|
||||
struct sig_pending *pending;
|
||||
__sigset_t w;
|
||||
int irqstate;
|
||||
__sigset_t x;
|
||||
int sig;
|
||||
struct k_sigaction *k;
|
||||
@ -916,8 +933,12 @@ getsigpending(struct thread *thread, int delflag){
|
||||
|
||||
lock = &thread->sigcommon->lock;
|
||||
head = &thread->sigcommon->sigpending;
|
||||
for(;;){
|
||||
irqstate = ihk_mc_spinlock_lock(lock);
|
||||
for(;;) {
|
||||
if (delflag)
|
||||
mcs_rwlock_writer_lock(lock, &mcs_rw_node);
|
||||
else
|
||||
mcs_rwlock_reader_lock(lock, &mcs_rw_node);
|
||||
|
||||
list_for_each_entry_safe(pending, next, head, list){
|
||||
for(x = pending->sigmask.__val[0], sig = 0; x; sig++, x >>= 1);
|
||||
k = thread->sigcommon->action + sig - 1;
|
||||
@ -926,17 +947,26 @@ getsigpending(struct thread *thread, int delflag){
|
||||
(k->sa.sa_handler != (void *)1 &&
|
||||
k->sa.sa_handler != NULL)){
|
||||
if(!(pending->sigmask.__val[0] & w)){
|
||||
if(delflag)
|
||||
if(delflag)
|
||||
list_del(&pending->list);
|
||||
ihk_mc_spinlock_unlock(lock, irqstate);
|
||||
|
||||
if (delflag)
|
||||
mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
|
||||
else
|
||||
mcs_rwlock_reader_unlock(lock, &mcs_rw_node);
|
||||
return pending;
|
||||
}
|
||||
}
|
||||
}
|
||||
ihk_mc_spinlock_unlock(lock, irqstate);
|
||||
|
||||
if (delflag)
|
||||
mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
|
||||
else
|
||||
mcs_rwlock_reader_unlock(lock, &mcs_rw_node);
|
||||
|
||||
if(lock == &thread->sigpendinglock)
|
||||
return NULL;
|
||||
|
||||
lock = &thread->sigpendinglock;
|
||||
head = &thread->sigpending;
|
||||
}
|
||||
@ -984,22 +1014,25 @@ check_signal(unsigned long rc, void *regs0, int num)
|
||||
}
|
||||
}
|
||||
ihk_mc_spinlock_unlock(&(cpu_local_var(runq_lock)), irqstate);
|
||||
return;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if(regs != NULL && !interrupt_from_user(regs)) {
|
||||
return;
|
||||
goto out;
|
||||
}
|
||||
|
||||
for(;;){
|
||||
pending = getsigpending(thread, 1);
|
||||
if(!pending) {
|
||||
dkprintf("check_signal,queue is empty\n");
|
||||
return;
|
||||
goto out;
|
||||
}
|
||||
|
||||
do_signal(rc, regs, thread, pending, num);
|
||||
}
|
||||
|
||||
out:
|
||||
return;
|
||||
}
|
||||
|
||||
unsigned long
|
||||
@ -1013,7 +1046,8 @@ do_kill(struct thread *thread, int pid, int tid, int sig, siginfo_t *info,
|
||||
struct thread *tthread = NULL;
|
||||
int i;
|
||||
__sigset_t mask;
|
||||
ihk_spinlock_t *savelock = NULL;
|
||||
mcs_rwlock_lock_t *savelock = NULL;
|
||||
struct mcs_rwlock_node mcs_rw_node;
|
||||
struct list_head *head = NULL;
|
||||
int rc;
|
||||
unsigned long irqstate = 0;
|
||||
@ -1145,7 +1179,8 @@ done:
|
||||
if(pid != -1 && tthread->proc->pid != pid){
|
||||
continue;
|
||||
}
|
||||
if(tthread->tid == tid){
|
||||
if (tthread->tid == tid &&
|
||||
tthread->status != PS_EXITED) {
|
||||
found = 1;
|
||||
break;
|
||||
}
|
||||
@ -1195,9 +1230,15 @@ done:
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (tthread->thread_offloaded) {
|
||||
interrupt_syscall(tthread, sig);
|
||||
release_thread(tthread);
|
||||
return 0;
|
||||
}
|
||||
|
||||
doint = 0;
|
||||
|
||||
ihk_mc_spinlock_lock_noirq(savelock);
|
||||
mcs_rwlock_writer_lock_noirq(savelock, &mcs_rw_node);
|
||||
|
||||
/* Put signal event even when handler is SIG_IGN or SIG_DFL
|
||||
because target ptraced thread must call ptrace_report_signal
|
||||
@ -1236,12 +1277,10 @@ done:
|
||||
}
|
||||
}
|
||||
}
|
||||
ihk_mc_spinlock_unlock_noirq(savelock);
|
||||
mcs_rwlock_writer_unlock_noirq(savelock, &mcs_rw_node);
|
||||
cpu_restore_interrupt(irqstate);
|
||||
|
||||
if (doint && !(mask & tthread->sigmask.__val[0])) {
|
||||
int cpuid = tthread->cpu_id;
|
||||
int pid = tproc->pid;
|
||||
int status = tthread->status;
|
||||
|
||||
if (thread != tthread) {
|
||||
@ -1251,18 +1290,21 @@ done:
|
||||
}
|
||||
|
||||
if(!tthread->proc->nohost)
|
||||
interrupt_syscall(pid, cpuid);
|
||||
interrupt_syscall(tthread, 0);
|
||||
|
||||
if (status != PS_RUNNING) {
|
||||
if(sig == SIGKILL){
|
||||
/* Wake up the target only when stopped by ptrace-reporting */
|
||||
sched_wakeup_thread(tthread, PS_TRACED | PS_STOPPED);
|
||||
sched_wakeup_thread(tthread, PS_TRACED | PS_STOPPED | PS_INTERRUPTIBLE);
|
||||
}
|
||||
else if(sig == SIGCONT || ptracecont == 1){
|
||||
/* Wake up the target only when stopped by SIGSTOP */
|
||||
sched_wakeup_thread(tthread, PS_STOPPED);
|
||||
tthread->proc->status = PS_RUNNING;
|
||||
}
|
||||
else {
|
||||
sched_wakeup_thread(tthread, PS_INTERRUPTIBLE);
|
||||
}
|
||||
}
|
||||
}
|
||||
release_thread(tthread);
|
||||
@ -1387,9 +1429,8 @@ SYSCALL_DECLARE(mmap)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if ((addr < region->user_start)
|
||||
|| (region->user_end <= addr)
|
||||
|| ((region->user_end - addr) < len)) {
|
||||
if ((flags & MAP_FIXED) && ((addr < region->user_start)
|
||||
|| (region->user_end <= addr))) {
|
||||
ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):ENOMEM\n",
|
||||
addr0, len0, prot, flags0, fd, off0);
|
||||
error = -ENOMEM;
|
||||
@ -1510,9 +1551,10 @@ static int vdso_get_vdso_info(void)
|
||||
{
|
||||
int error;
|
||||
struct ikc_scd_packet packet;
|
||||
struct ihk_ikc_channel_desc *ch = cpu_local_var(syscall_channel);
|
||||
struct ihk_ikc_channel_desc *ch = cpu_local_var(ikc2linux);
|
||||
|
||||
dkprintf("vdso_get_vdso_info()\n");
|
||||
memset(&vdso, '\0', sizeof vdso);
|
||||
vdso.busy = 1;
|
||||
vdso.vdso_npages = 0;
|
||||
|
||||
@ -1707,7 +1749,8 @@ int arch_map_vdso(struct process_vm *vm)
|
||||
vrflags = VR_REMOTE;
|
||||
vrflags |= VR_PROT_READ | VR_PROT_EXEC;
|
||||
vrflags |= VRFLAG_PROT_TO_MAXPROT(vrflags);
|
||||
error = add_process_memory_range(vm, (intptr_t)s, (intptr_t)e, NOPHYS, vrflags, NULL, 0, PAGE_SHIFT);
|
||||
error = add_process_memory_range(vm, (intptr_t)s, (intptr_t)e,
|
||||
NOPHYS, vrflags, NULL, 0, PAGE_SHIFT, NULL);
|
||||
if (error) {
|
||||
ekprintf("ERROR: adding memory range for vdso. %d\n", error);
|
||||
goto out;
|
||||
@ -1738,7 +1781,8 @@ int arch_map_vdso(struct process_vm *vm)
|
||||
vrflags = VR_REMOTE;
|
||||
vrflags |= VR_PROT_READ;
|
||||
vrflags |= VRFLAG_PROT_TO_MAXPROT(vrflags);
|
||||
error = add_process_memory_range(vm, (intptr_t)s, (intptr_t)e, NOPHYS, vrflags, NULL, 0, PAGE_SHIFT);
|
||||
error = add_process_memory_range(vm, (intptr_t)s, (intptr_t)e,
|
||||
NOPHYS, vrflags, NULL, 0, PAGE_SHIFT, NULL);
|
||||
if (error) {
|
||||
ekprintf("ERROR: adding memory range for vvar. %d\n", error);
|
||||
goto out;
|
||||
@ -1786,4 +1830,61 @@ out:
|
||||
return error;
|
||||
} /* arch_map_vdso() */
|
||||
|
||||
void
|
||||
save_uctx(void *uctx, struct x86_user_context *regs)
|
||||
{
|
||||
struct trans_uctx {
|
||||
volatile int cond;
|
||||
int fregsize;
|
||||
|
||||
unsigned long rax;
|
||||
unsigned long rbx;
|
||||
unsigned long rcx;
|
||||
unsigned long rdx;
|
||||
unsigned long rsi;
|
||||
unsigned long rdi;
|
||||
unsigned long rbp;
|
||||
unsigned long r8;
|
||||
unsigned long r9;
|
||||
unsigned long r10;
|
||||
unsigned long r11;
|
||||
unsigned long r12;
|
||||
unsigned long r13;
|
||||
unsigned long r14;
|
||||
unsigned long r15;
|
||||
unsigned long rflags;
|
||||
unsigned long rip;
|
||||
unsigned long rsp;
|
||||
unsigned long fs;
|
||||
} *ctx = uctx;
|
||||
|
||||
if (!regs) {
|
||||
asm ("movq %%gs:(%1),%0" : "=r"(regs) :
|
||||
"r"(offsetof(struct x86_cpu_local_variables, tss.rsp0)));
|
||||
regs--;
|
||||
}
|
||||
|
||||
ctx->cond = 0;
|
||||
ctx->rax = regs->gpr.rax;
|
||||
ctx->rbx = regs->gpr.rbx;
|
||||
ctx->rcx = regs->gpr.rcx;
|
||||
ctx->rdx = regs->gpr.rdx;
|
||||
ctx->rsi = regs->gpr.rsi;
|
||||
ctx->rdi = regs->gpr.rdi;
|
||||
ctx->rbp = regs->gpr.rbp;
|
||||
ctx->r8 = regs->gpr.r8;
|
||||
ctx->r9 = regs->gpr.r9;
|
||||
ctx->r10 = regs->gpr.r10;
|
||||
ctx->r11 = regs->gpr.r11;
|
||||
ctx->r12 = regs->gpr.r12;
|
||||
ctx->r13 = regs->gpr.r13;
|
||||
ctx->r14 = regs->gpr.r14;
|
||||
ctx->r15 = regs->gpr.r15;
|
||||
ctx->rflags = regs->gpr.rflags;
|
||||
ctx->rsp = regs->gpr.rsp;
|
||||
ctx->rip = regs->gpr.rip;
|
||||
ihk_mc_arch_get_special_register(IHK_ASR_X86_FS, &ctx->fs);
|
||||
ctx->fregsize = 0;
|
||||
}
|
||||
|
||||
/*** End of File ***/
|
||||
|
||||
@ -17,6 +17,7 @@
|
||||
* make sure that these are position-independent codes.
|
||||
*/
|
||||
|
||||
#include <cls.h>
|
||||
#include <syscall.h>
|
||||
#include <ihk/atomic.h>
|
||||
#include <arch/cpu.h>
|
||||
|
||||
67
arch/x86/tools/eclair-dump-backtrace.exp.in
Executable file
67
arch/x86/tools/eclair-dump-backtrace.exp.in
Executable file
@ -0,0 +1,67 @@
|
||||
#!/usr/bin/expect
|
||||
|
||||
set INST_DIR "@prefix@"
|
||||
|
||||
spawn $INST_DIR/bin/eclair -d /tmp/mckernel.dump -k $INST_DIR/smp-x86/kernel/mckernel.img -i
|
||||
|
||||
set state "init"
|
||||
set thread_id 0
|
||||
|
||||
expect {
|
||||
"in ?? ()" {
|
||||
switch -- $state {
|
||||
"thread_chosen" {
|
||||
set state "thread_skip"
|
||||
}
|
||||
"thread_bt" {
|
||||
set state "thread_skip"
|
||||
}
|
||||
}
|
||||
|
||||
exp_continue
|
||||
}
|
||||
"(eclair) " {
|
||||
switch -- $state {
|
||||
"init" {
|
||||
set state "threads_list"
|
||||
send "info threads\r"
|
||||
}
|
||||
"threads_list" {
|
||||
incr thread_id
|
||||
set state "thread_chosen"
|
||||
send "thread $thread_id\r"
|
||||
}
|
||||
"thread_skip" {
|
||||
incr thread_id
|
||||
set state "thread_chosen"
|
||||
send "thread $thread_id\r"
|
||||
}
|
||||
"thread_chosen" {
|
||||
set state "thread_bt"
|
||||
send "bt\r"
|
||||
}
|
||||
}
|
||||
|
||||
exp_continue
|
||||
}
|
||||
"Type <return> to continue, or q <return> to quit" {
|
||||
switch -- $state {
|
||||
"threads_list" {
|
||||
send "\r"
|
||||
}
|
||||
"thread_bt" {
|
||||
send "\r"
|
||||
}
|
||||
"thread_skip" {
|
||||
send "q\r"
|
||||
}
|
||||
}
|
||||
exp_continue
|
||||
}
|
||||
" not known." {
|
||||
expect "(eclair) " { send "quit\r" }
|
||||
expect "Quit anyway? (y or n) " { send "y\r" }
|
||||
exit 0
|
||||
}
|
||||
}
|
||||
|
||||
28
arch/x86/tools/irqbalance_mck.in.in
Normal file
28
arch/x86/tools/irqbalance_mck.in.in
Normal file
@ -0,0 +1,28 @@
|
||||
# irqbalance is a daemon process that distributes interrupts across
|
||||
# CPUS on SMP systems. The default is to rebalance once every 10
|
||||
# seconds. This is the environment file that is specified to systemd via the
|
||||
# EnvironmentFile key in the service unit file (or via whatever method the init
|
||||
# system you're using has).
|
||||
#
|
||||
# ONESHOT=yes
|
||||
# after starting, wait for a minute, then look at the interrupt
|
||||
# load and balance it once; after balancing exit and do not change
|
||||
# it again.
|
||||
#IRQBALANCE_ONESHOT=
|
||||
|
||||
#
|
||||
# IRQBALANCE_BANNED_CPUS
|
||||
# 64 bit bitmask which allows you to indicate which cpu's should
|
||||
# be skipped when rebalancing irqs. Cpu numbers which have their
|
||||
# corresponding bits set to one in this mask will not have any
|
||||
# irq's assigned to them on rebalance
|
||||
#
|
||||
IRQBALANCE_BANNED_CPUS=%mask%
|
||||
|
||||
#
|
||||
# IRQBALANCE_ARGS
|
||||
# append any args here to the irqbalance daemon as documented in the man page
|
||||
#
|
||||
IRQBALANCE_ARGS=--banirq=%banirq%
|
||||
|
||||
|
||||
10
arch/x86/tools/irqbalance_mck.service.in
Normal file
10
arch/x86/tools/irqbalance_mck.service.in
Normal file
@ -0,0 +1,10 @@
|
||||
[Unit]
|
||||
Description=irqbalance daemon
|
||||
After=syslog.target
|
||||
|
||||
[Service]
|
||||
EnvironmentFile=/tmp/irqbalance_mck
|
||||
ExecStart=/usr/sbin/irqbalance --foreground $IRQBALANCE_ARGS
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
@ -3,37 +3,63 @@
|
||||
# IHK SMP-x86 example boot script.
|
||||
# author: Balazs Gerofi <bgerofi@riken.jp>
|
||||
# Copyright (C) 2014 RIKEN AICS
|
||||
#
|
||||
# This is an example script for loading IHK, configuring a partition and
|
||||
# booting McKernel on it.
|
||||
# The script reserves half of the CPU cores and 512MB of RAM from NUMA node 0
|
||||
# when IHK is loaded for the first time, otherwise it destroys the current
|
||||
# McKernel instance and reboots it using the same set of resources as it used
|
||||
# previously.
|
||||
#
|
||||
# This is an example script for loading IHK, configuring a partition and
|
||||
# booting McKernel on it. Unless specific CPUs and memory are requested,
|
||||
# the script reserves half of the CPU cores and 512MB of RAM from
|
||||
# NUMA node 0 when IHK is loaded for the first time.
|
||||
# Otherwise, it destroys the current McKernel instance and reboots it using
|
||||
# the same set of resources as it used previously.
|
||||
# Note that the script does not output anything unless an error occurs.
|
||||
|
||||
prefix="@prefix@"
|
||||
BINDIR="@BINDIR@"
|
||||
SBINDIR="@SBINDIR@"
|
||||
KMODDIR="@KMODDIR@"
|
||||
KERNDIR="@KERNDIR@"
|
||||
BINDIR="${prefix}/bin"
|
||||
SBINDIR="${prefix}/sbin"
|
||||
ETCDIR=@ETCDIR@
|
||||
KMODDIR="${prefix}/kmod"
|
||||
KERNDIR="${prefix}/@TARGET@/kernel"
|
||||
ENABLE_MCOVERLAYFS="@ENABLE_MCOVERLAYFS@"
|
||||
|
||||
mem="512M@0"
|
||||
cpus=""
|
||||
ikc_map=""
|
||||
|
||||
if [ "${BASH_VERSINFO[0]}" -lt 4 ]; then
|
||||
echo "You need at least bash-4.0 to run this script." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
INTERVAL=1
|
||||
LOGMODE=0
|
||||
while getopts :i:k: OPT
|
||||
facility="LOG_LOCAL6"
|
||||
chown_option=`logname 2> /dev/null`
|
||||
|
||||
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" -o "`systemctl status irqbalance.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
|
||||
irqbalance_used="yes"
|
||||
else
|
||||
irqbalance_used="no"
|
||||
fi
|
||||
|
||||
turbo=""
|
||||
ihk_irq=""
|
||||
|
||||
while getopts :ti:k:c:m:o:f:r:q: OPT
|
||||
do
|
||||
case ${OPT} in
|
||||
f) facility=${OPTARG}
|
||||
;;
|
||||
o) chown_option=${OPTARG}
|
||||
;;
|
||||
i) INTERVAL=${OPTARG}
|
||||
expr "${INTERVAL}" + 1 > /dev/null 2>&1
|
||||
if [ $? -ge 2 ]
|
||||
then
|
||||
echo "invalid -i value"
|
||||
echo "invalid -i value" >&2
|
||||
exit 1
|
||||
fi
|
||||
if [ ${INTERVAL} -le 0 ]
|
||||
then
|
||||
echo "invalid -i value"
|
||||
echo "invalid -i value" >&2
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
@ -41,22 +67,127 @@ do
|
||||
expr "${LOGMODE}" + 1 > /dev/null 2>&1
|
||||
if [ $? -ge 2 ]
|
||||
then
|
||||
echo "invalid -k value"
|
||||
echo "invalid -k value" >&2
|
||||
exit 1
|
||||
fi
|
||||
if [ ${LOGMODE} -lt 0 -o ${LOGMODE} -gt 2 ]
|
||||
then
|
||||
echo "invalid -k value"
|
||||
echo "invalid -k value" >&2
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
*) echo "invalid option -${OPT}"
|
||||
c) cpus=${OPTARG}
|
||||
;;
|
||||
m) mem=${OPTARG}
|
||||
;;
|
||||
r) ikc_map=${OPTARG}
|
||||
;;
|
||||
q) ihk_irq=${OPTARG}
|
||||
;;
|
||||
t) turbo="turbo"
|
||||
;;
|
||||
*) echo "invalid option -${OPT}" >&2
|
||||
exit 1
|
||||
esac
|
||||
done
|
||||
|
||||
mem="512M@0"
|
||||
cpus=""
|
||||
#
|
||||
# Revert any state that has been initialized before the error occurred.
|
||||
#
|
||||
error_exit() {
|
||||
local status=$1
|
||||
|
||||
case $status in
|
||||
mcos_sys_mounted)
|
||||
if [ "$enable_mcoverlay" == "yes" ]; then
|
||||
umount /tmp/mcos/mcos0_sys
|
||||
fi
|
||||
;&
|
||||
mcos_proc_mounted)
|
||||
if [ "$enable_mcoverlay" == "yes" ]; then
|
||||
umount /tmp/mcos/mcos0_proc
|
||||
fi
|
||||
;&
|
||||
mcoverlayfs_loaded)
|
||||
if [ "$enable_mcoverlay" == "yes" ]; then
|
||||
rmmod mcoverlay 2>/dev/null
|
||||
fi
|
||||
;&
|
||||
linux_proc_bind_mounted)
|
||||
if [ "$enable_mcoverlay" == "yes" ]; then
|
||||
umount /tmp/mcos/linux_proc
|
||||
fi
|
||||
;&
|
||||
tmp_mcos_mounted)
|
||||
if [ "$enable_mcoverlay" == "yes" ]; then
|
||||
umount /tmp/mcos
|
||||
fi
|
||||
;&
|
||||
tmp_mcos_created)
|
||||
if [ "$enable_mcoverlay" == "yes" ]; then
|
||||
rm -rf /tmp/mcos
|
||||
fi
|
||||
;&
|
||||
os_created)
|
||||
# Destroy all LWK instances
|
||||
if ls /dev/mcos* 1>/dev/null 2>&1; then
|
||||
for i in /dev/mcos*; do
|
||||
ind=`echo $i|cut -c10-`;
|
||||
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then
|
||||
echo "warning: failed to destroy LWK instance $ind" >&2
|
||||
fi
|
||||
done
|
||||
fi
|
||||
;&
|
||||
mcctrl_loaded)
|
||||
rmmod mcctrl 2>/dev/null || echo "warning: failed to remove mcctrl" >&2
|
||||
;&
|
||||
cpus_reserved)
|
||||
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
|
||||
if [ "${cpus}" != "" ]; then
|
||||
if ! ${SBINDIR}/ihkconfig 0 release cpu $cpus > /dev/null; then
|
||||
echo "warning: failed to release CPUs" >&2
|
||||
fi
|
||||
fi
|
||||
;&
|
||||
mem_reserved)
|
||||
mem=`${SBINDIR}/ihkconfig 0 query mem`
|
||||
if [ "${mem}" != "" ]; then
|
||||
if ! ${SBINDIR}/ihkconfig 0 release mem $mem > /dev/null; then
|
||||
echo "warning: failed to release memory" >&2
|
||||
fi
|
||||
fi
|
||||
;&
|
||||
ihk_smp_loaded)
|
||||
rmmod ihk_smp_x86 2>/dev/null || echo "warning: failed to remove ihk_smp_x86" >&2
|
||||
;&
|
||||
ihk_loaded)
|
||||
rmmod ihk 2>/dev/null || echo "warning: failed to remove ihk" >&2
|
||||
;&
|
||||
irqbalance_stopped)
|
||||
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
|
||||
if ! systemctl stop irqbalance_mck.service 2>/dev/null; then
|
||||
echo "warning: failed to stop irqbalance_mck" >&2
|
||||
fi
|
||||
if ! systemctl disable irqbalance_mck.service >/dev/null 2>/dev/null; then
|
||||
echo "warning: failed to disable irqbalance_mck" >&2
|
||||
fi
|
||||
if ! etcdir=@ETCDIR@ perl -e '$etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "$etcdir/proc/irq/*/smp_affinity"; foreach $file (@files) { $dest = substr($file, length($etcdir)); if(0) {print "cp $file $dest\n";} system("cp $file $dest 2>/dev/null"); }'; then
|
||||
echo "warning: failed to restore /proc/irq/*/smp_affinity" >&2
|
||||
fi
|
||||
if ! systemctl start irqbalance.service; then
|
||||
echo "warning: failed to start irqbalance" >&2;
|
||||
fi
|
||||
fi
|
||||
;&
|
||||
initial)
|
||||
# Nothing more to revert
|
||||
;;
|
||||
esac
|
||||
|
||||
exit 1
|
||||
}
|
||||
|
||||
ihk_ikc_irq_core=0
|
||||
|
||||
release=`uname -r`
|
||||
@ -65,13 +196,28 @@ minor=`echo ${release} | sed -e 's/^[0-9]*.\([0-9]*\).*/\1/'`
|
||||
patch=`echo ${release} | sed -e 's/^[0-9]*.[0-9]*.\([0-9]*\).*/\1/'`
|
||||
linux_version_code=`expr \( ${major} \* 65536 \) + \( ${minor} \* 256 \) + ${patch}`
|
||||
rhel_release=`echo ${release} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/'`
|
||||
if [ "${release}" == "${rhel_release}" ]; then rhel_release=""; fi
|
||||
if [ "${ENABLE_MCOVERLAYFS}" == "yes" ]; then
|
||||
enable_mcoverlay=`if ( [ ${linux_version_code} -ge 262144 ] && [ ${linux_version_code} -lt 262400 ] ); then echo "yes"; else echo "no"; fi`
|
||||
else
|
||||
enable_mcoverlay=no
|
||||
if [ "${release}" == "${rhel_release}" ]; then
|
||||
rhel_release="";
|
||||
fi
|
||||
|
||||
enable_mcoverlay="no"
|
||||
|
||||
if [ "${ENABLE_MCOVERLAYFS}" == "yes" ]; then
|
||||
if [ "${rhel_release}" == "" ]; then
|
||||
if [ ${linux_version_code} -ge 262144 -a ${linux_version_code} -lt 262400 ]; then
|
||||
enable_mcoverlay="yes"
|
||||
fi
|
||||
if [ ${linux_version_code} -ge 263680 -a ${linux_version_code} -lt 263936 ]; then
|
||||
enable_mcoverlay="yes"
|
||||
fi
|
||||
else
|
||||
if [ ${linux_version_code} -eq 199168 -a ${rhel_release} -ge 327 ]; then
|
||||
enable_mcoverlay="yes"
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
# Figure out CPUs if not requested by user
|
||||
if [ "$cpus" == "" ]; then
|
||||
# Get the number of CPUs on NUMA node 0
|
||||
nr_cpus=`lscpu --parse | awk -F"," '{if ($4 == 0) print $4}' | wc -l`
|
||||
@ -79,121 +225,318 @@ if [ "$cpus" == "" ]; then
|
||||
# Use the second half of the cores
|
||||
let nr_cpus="$nr_cpus / 2"
|
||||
cpus=`lscpu --parse | awk -F"," '{if ($4 == 0) print $1}' | tail -n $nr_cpus | xargs echo -n | sed 's/ /,/g'`
|
||||
if [ "$cpus" == "" ]; then echo "error: no available CPUs on NUMA node 0?"; exit; fi
|
||||
fi
|
||||
|
||||
# Remove delegator if loaded
|
||||
if [ "`lsmod | grep mcctrl`" != "" ]; then
|
||||
if ! rmmod mcctrl; then echo "error: removing mcctrl"; exit; fi
|
||||
if [ "$cpus" == "" ]; then
|
||||
echo "error: no available CPUs on NUMA node 0?" >&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
# Remove mcoverlay if loaded
|
||||
if [ "$enable_mcoverlay" == "yes" ]; then
|
||||
if [ "`lsmod | grep mcoverlay`" != "" ]; then
|
||||
if grep mcoverlay /proc/modules &>/dev/null; then
|
||||
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_sys`" != "" ]; then umount -l /tmp/mcos/mcos0_sys; fi
|
||||
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_proc`" != "" ]; then umount -l /tmp/mcos/mcos0_proc; fi
|
||||
if [ "`cat /proc/mounts | grep /tmp/mcos/linux_proc`" != "" ]; then umount -l /tmp/mcos/linux_proc; fi
|
||||
if [ "`cat /proc/mounts | grep /tmp/mcos`" != "" ]; then umount -l /tmp/mcos; fi
|
||||
if [ -e /tmp/mcos ]; then rm -rf /tmp/mcos; fi
|
||||
if ! rmmod mcoverlay; then echo "error: removing mcoverlay"; exit; fi
|
||||
if ! rmmod mcoverlay 2>/dev/null; then
|
||||
echo "error: removing mcoverlay" >&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
# Load IHK if not loaded
|
||||
if [ "`lsmod | grep ihk`" == "" ]; then
|
||||
if ! insmod ${KMODDIR}/ihk.ko; then echo "error: loading ihk"; exit; fi;
|
||||
# Stop irqbalance
|
||||
if [ "${irqbalance_used}" == "yes" ]; then
|
||||
systemctl stop irqbalance_mck.service 2>/dev/null
|
||||
if ! systemctl stop irqbalance.service 2>/dev/null ; then
|
||||
echo "error: stopping irqbalance" >&2
|
||||
exit 1
|
||||
fi;
|
||||
|
||||
if ! etcdir=@ETCDIR@ perl -e 'use File::Copy qw(copy); $etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "/proc/irq/*/smp_affinity"; foreach $file (@files) { $rel = substr($file, 1); $dir=substr($rel, 0, length($rel)-length("/smp_affinity")); if(0) { print "cp $file $etcdir/$rel\n";} if(system("mkdir -p $etcdir/$dir")){ exit 1;} if(!copy($file,"$etcdir/$rel")){ exit 1;} }'; then
|
||||
echo "error: saving /proc/irq/*/smp_affinity" >&2
|
||||
error_exit "mcos_sys_mounted"
|
||||
fi;
|
||||
|
||||
# Prevent /proc/irq/*/smp_affinity from getting zero after offlining
|
||||
# McKernel CPUs by using the following algorithm.
|
||||
# if (smp_affinity & mck_cores) {
|
||||
# smp_affinity = (mck_cores ^ -1);
|
||||
# }
|
||||
ncpus=`lscpu | grep -E '^CPU\(s\):' | awk '{print $2}'`
|
||||
smp_affinity_mask=`echo $cpus | ncpus=$ncpus perl -e 'while(<>){@tokens = split /,/;foreach $token (@tokens) {@nums = split /-/,$token; for($num = $nums[0]; $num <= $nums[$#nums]; $num++) {$ndx=int($num/32); $mask[$ndx] |= (1<<($num % 32))}}} $nint32s = int(($ENV{'ncpus'}+31)/32); for($j = $nint32s - 1; $j >= 0; $j--) { if($j != $nint32s - 1){print ",";} $nblks = ($j != $nint32s - 1) ? 8 : ($ENV{'ncpus'} % 32 != 0) ? int((($ENV{'ncpus'} + 3) % 32) / 4) : 8; for($i = $nblks - 1;$i >= 0;$i--){ printf("%01x",($mask[$j] >> ($i*4)) & 0xf);}}'`
|
||||
# echo cpus=$cpus ncpus=$ncpus smp_affinity_mask=$smp_affinity_mask
|
||||
|
||||
if ! ncpus=$ncpus smp_affinity_mask=$smp_affinity_mask perl -e '@dirs = grep { -d } glob "/proc/irq/*"; foreach $dir (@dirs) { $hit = 0; $affinity_str = `cat $dir/smp_affinity`; chomp $affinity_str; @int32strs = split /,/, $affinity_str; @int32strs_mask=split /,/, $ENV{'smp_affinity_mask'}; for($i=0;$i <= $#int32strs_mask; $i++) { $int32strs_inv[$i] = sprintf("%08x",hex($int32strs_mask[$i])^0xffffffff); if($i == 0) { $len = int((($ENV{'ncpus'}%32)+3)/4); if($len != 0) { $int32strs_inv[$i] = substr($int32strs_inv[$i], -$len, $len); } } } $inv = join(",", @int32strs_inv); $nint32s = int(($ENV{'ncpus'}+31)/32); for($j = $nint32s - 1; $j >= 0; $j--) { if(hex($int32strs[$nint32s - 1 - $j]) & hex($int32strs_mask[$nint32s - 1 - $j])) { $hit = 1; }} if($hit == 1) { $cmd = "echo $inv > $dir/smp_affinity 2>/dev/null"; system $cmd;}}'; then
|
||||
echo "error: modifying /proc/irq/*/smp_affinity" >&2
|
||||
error_exit "mcos_sys_mounted"
|
||||
fi
|
||||
|
||||
fi
|
||||
|
||||
# Load IHK if not loaded
|
||||
if ! grep -E 'ihk\s' /proc/modules &>/dev/null; then
|
||||
if ! taskset -c 0 insmod ${KMODDIR}/ihk.ko 2>/dev/null; then
|
||||
echo "error: loading ihk" >&2
|
||||
error_exit "irqbalance_stopped"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Increase swappiness so that we have better chance to allocate memory for IHK
|
||||
echo 100 > /proc/sys/vm/swappiness
|
||||
|
||||
# Drop Linux caches to free memory
|
||||
sync && echo 3 > /proc/sys/vm/drop_caches
|
||||
|
||||
# Merge free memory areas into large, physically contiguous ones
|
||||
echo 1 > /proc/sys/vm/compact_memory 2>/dev/null
|
||||
|
||||
sync
|
||||
|
||||
# Load IHK-SMP if not loaded and reserve CPUs and memory
|
||||
if [ "`lsmod | grep ihk_smp_x86`" == "" ]; then
|
||||
ihk_irq=""
|
||||
for i in `seq 64 255`; do
|
||||
if [ ! -d /proc/irq/$i ] && [ "`cat /proc/interrupts | grep ":" | awk '{print $1}' | grep -o '[0-9]*' | grep -e '^$i$'`" == "" ]; then
|
||||
ihk_irq=$i
|
||||
break
|
||||
fi
|
||||
done
|
||||
if [ "$ihk_irq" == "" ]; then echo "error: no IRQ available"; exit; fi
|
||||
if ! insmod ${KMODDIR}/ihk-smp-x86.ko ihk_start_irq=$ihk_irq ihk_ikc_irq_core=$ihk_ikc_irq_core; then echo "error: loading ihk-smp-x86"; exit; fi;
|
||||
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then echo "error: reserving CPUs"; exit; fi
|
||||
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then echo "error: reserving memory"; exit; fi
|
||||
# If loaded, but no resources allocated, get CPUs and memory
|
||||
else
|
||||
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus"; exit; fi
|
||||
cpus_allocated=`${SBINDIR}/ihkosctl 0 query cpu`
|
||||
if [ "$cpus_allocated" == "" ]; then
|
||||
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then echo "error: reserving CPUs"; exit; fi
|
||||
if ! grep ihk_smp_x86 /proc/modules &>/dev/null; then
|
||||
if [ "$ihk_irq" == "" ]; then
|
||||
for i in `seq 64 255`; do
|
||||
if [ ! -d /proc/irq/$i ] && [ "`cat /proc/interrupts | grep ":" | awk '{print $1}' | grep -o '[0-9]*' | grep -e '^$i$'`" == "" ]; then
|
||||
ihk_irq=$i
|
||||
break
|
||||
fi
|
||||
done
|
||||
if [ "$ihk_irq" == "" ]; then
|
||||
echo "error: no IRQ available" >&2
|
||||
error_exit "ihk_loaded"
|
||||
fi
|
||||
fi
|
||||
if ! taskset -c 0 insmod ${KMODDIR}/ihk-smp-x86.ko ihk_start_irq=$ihk_irq ihk_ikc_irq_core=$ihk_ikc_irq_core 2>/dev/null; then
|
||||
echo "error: loading ihk-smp-x86" >&2
|
||||
error_exit "ihk_loaded"
|
||||
fi
|
||||
|
||||
if ! ${SBINDIR}/ihkosctl 0 query mem > /dev/null; then echo "error: querying memory"; exit; fi
|
||||
mem_allocated=`${SBINDIR}/ihkosctl 0 query mem`
|
||||
if [ "$mem_allocated" == "" ]; then
|
||||
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then echo "error: reserving memory"; exit; fi
|
||||
fi
|
||||
# Offline-reonline RAM (special case for OFP SNC-4 mode)
|
||||
if [ "`hostname | grep "c[0-9][0-9][0-9][0-9].ofp"`" != "" ] && [ "`cat /sys/devices/system/node/online`" == "0-7" ]; then
|
||||
for i in 0 1 2 3; do
|
||||
find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
|
||||
echo 0 > $f 2>&1 > /dev/null;
|
||||
done
|
||||
find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
|
||||
echo 1 > $f 2>&1 > /dev/null;
|
||||
done
|
||||
done
|
||||
for i in 4 5 6 7; do
|
||||
find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
|
||||
echo 0 > $f 2>&1 > /dev/null;
|
||||
done
|
||||
find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
|
||||
echo 1 > $f 2>&1 > /dev/null;
|
||||
done
|
||||
done
|
||||
fi
|
||||
|
||||
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then
|
||||
echo "error: reserving memory" >&2
|
||||
error_exit "ihk_smp_loaded"
|
||||
fi
|
||||
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then
|
||||
echo "error: reserving CPUs" >&2;
|
||||
error_exit "mem_reserved"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Check for existing OS instance and destroy
|
||||
if [ -c /dev/mcos0 ]; then
|
||||
# Query CPU cores and memory of OS instance so that the same values are used as previously
|
||||
if ! ${SBINDIR}/ihkosctl 0 query cpu > /dev/null; then echo "error: querying cpus"; exit; fi
|
||||
cpus=`${SBINDIR}/ihkosctl 0 query cpu`
|
||||
if ! ${SBINDIR}/ihkosctl 0 query mem > /dev/null; then echo "error: querying memory"; exit; fi
|
||||
mem=`${SBINDIR}/ihkosctl 0 query mem`
|
||||
|
||||
if ! ${SBINDIR}/ihkconfig 0 destroy 0; then echo "warning: destroy failed"; fi
|
||||
else
|
||||
# Otherwise query IHK-SMP for resources
|
||||
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus"; exit; fi
|
||||
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
|
||||
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then echo "error: querying memory"; exit; fi
|
||||
mem=`${SBINDIR}/ihkconfig 0 query mem`
|
||||
# Load mcctrl if not loaded
|
||||
if ! grep mcctrl /proc/modules &>/dev/null; then
|
||||
if ! taskset -c 0 insmod ${KMODDIR}/mcctrl.ko 2>/dev/null; then
|
||||
echo "error: inserting mcctrl.ko" >&2
|
||||
error_exit "cpus_reserved"
|
||||
fi
|
||||
fi
|
||||
|
||||
if ! ${SBINDIR}/ihkconfig 0 create; then echo "error: create"; exit; fi
|
||||
if ! ${SBINDIR}/ihkosctl 0 assign cpu ${cpus}; then echo "error: assign CPUs"; exit; fi
|
||||
if ! ${SBINDIR}/ihkosctl 0 assign mem ${mem}; then echo "error: assign memory"; exit; fi
|
||||
if ! ${SBINDIR}/ihkosctl 0 load ${KERNDIR}/mckernel.img; then echo "error: loading kernel image"; exit; fi
|
||||
if ! ${SBINDIR}/ihkosctl 0 kargs "hidos ksyslogd=${LOGMODE}"; then echo "error: setting kernel arguments"; exit; fi
|
||||
if ! ${SBINDIR}/ihkosctl 0 boot; then echo "error: booting"; exit; fi
|
||||
if ! insmod ${KMODDIR}/mcctrl.ko; then echo "error: inserting mcctrl.ko"; exit; fi
|
||||
if ! chown `logname` /dev/mcd* /dev/mcos*; then echo "error: chowning device files"; exit; fi
|
||||
# Destroy all LWK instances
|
||||
if ls /dev/mcos* 1>/dev/null 2>&1; then
|
||||
for i in /dev/mcos*; do
|
||||
ind=`echo $i|cut -c10-`;
|
||||
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then
|
||||
echo "error: destroying LWK instance $ind failed" >&2
|
||||
error_exit "mcctrl_loaded"
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
# Create OS instance
|
||||
if ! ${SBINDIR}/ihkconfig 0 create; then
|
||||
echo "error: creating OS instance" >&2
|
||||
error_exit "mcctrl_loaded"
|
||||
fi
|
||||
|
||||
# Assign CPUs
|
||||
if ! ${SBINDIR}/ihkosctl 0 assign cpu ${cpus}; then
|
||||
echo "error: assign CPUs" >&2
|
||||
error_exit "os_created"
|
||||
fi
|
||||
|
||||
if [ "$ikc_map" != "" ]; then
|
||||
# Specify IKC map
|
||||
if ! ${SBINDIR}/ihkosctl 0 ikc_map ${ikc_map}; then
|
||||
echo "error: assign CPUs" >&2
|
||||
error_exit "os_created"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Assign memory
|
||||
if ! ${SBINDIR}/ihkosctl 0 assign mem ${mem}; then
|
||||
echo "error: assign memory" >&2
|
||||
error_exit "os_created"
|
||||
fi
|
||||
|
||||
# Load kernel image
|
||||
if ! ${SBINDIR}/ihkosctl 0 load ${KERNDIR}/mckernel.img; then
|
||||
echo "error: loading kernel image: ${KERNDIR}/mckernel.img" >&2
|
||||
error_exit "os_created"
|
||||
fi
|
||||
|
||||
# Set kernel arguments
|
||||
if ! ${SBINDIR}/ihkosctl 0 kargs "hidos ksyslogd=${LOGMODE} $turbo"; then
|
||||
echo "error: setting kernel arguments" >&2
|
||||
error_exit "os_created"
|
||||
fi
|
||||
|
||||
# Boot OS instance
|
||||
if ! ${SBINDIR}/ihkosctl 0 boot; then
|
||||
echo "error: booting" >&2
|
||||
error_exit "os_created"
|
||||
fi
|
||||
|
||||
# Set device file ownership
|
||||
if ! chown ${chown_option} /dev/mcd* /dev/mcos*; then
|
||||
echo "warning: failed to chown device files" >&2
|
||||
fi
|
||||
|
||||
# Overlay /proc, /sys with McKernel specific contents
|
||||
if [ "$enable_mcoverlay" == "yes" ]; then
|
||||
if [ ! -e /tmp/mcos ]; then mkdir -p /tmp/mcos; fi
|
||||
if ! mount -t tmpfs tmpfs /tmp/mcos; then echo "error: mount /tmp/mcos"; exit; fi
|
||||
if [ ! -e /tmp/mcos/linux_proc ]; then mkdir -p /tmp/mcos/linux_proc; fi
|
||||
if ! mount --bind /proc /tmp/mcos/linux_proc; then echo "error: mount /tmp/mcos/linux_proc"; exit; fi
|
||||
if ! insmod ${KMODDIR}/mcoverlay.ko; then echo "error: inserting mcoverlay.ko"; exit; fi
|
||||
if [ ! -e /tmp/mcos ]; then
|
||||
mkdir -p /tmp/mcos;
|
||||
fi
|
||||
if ! mount -t tmpfs tmpfs /tmp/mcos; then
|
||||
echo "error: mount /tmp/mcos" >&2
|
||||
error_exit "tmp_mcos_created"
|
||||
fi
|
||||
if [ ! -e /tmp/mcos/linux_proc ]; then
|
||||
mkdir -p /tmp/mcos/linux_proc;
|
||||
fi
|
||||
if ! mount --bind /proc /tmp/mcos/linux_proc; then
|
||||
echo "error: mount /tmp/mcos/linux_proc" >&2
|
||||
error_exit "tmp_mcos_mounted"
|
||||
fi
|
||||
if ! taskset -c 0 insmod ${KMODDIR}/mcoverlay.ko 2>/dev/null; then
|
||||
echo "error: inserting mcoverlay.ko" >&2
|
||||
error_exit "linux_proc_bind_mounted"
|
||||
fi
|
||||
while [ ! -e /proc/mcos0 ]
|
||||
do
|
||||
sleep 1
|
||||
sleep 0.1
|
||||
done
|
||||
if [ ! -e /tmp/mcos/mcos0_proc ]; then mkdir -p /tmp/mcos/mcos0_proc; fi
|
||||
if [ ! -e /tmp/mcos/mcos0_proc_upper ]; then mkdir -p /tmp/mcos/mcos0_proc_upper; fi
|
||||
if [ ! -e /tmp/mcos/mcos0_proc_work ]; then mkdir -p /tmp/mcos/mcos0_proc_work; fi
|
||||
if ! mount -t mcoverlay mcoverlay -o lowerdir=/proc/mcos0:/proc,upperdir=/tmp/mcos/mcos0_proc_upper,workdir=/tmp/mcos/mcos0_proc_work,nocopyupw,nofscheck /tmp/mcos/mcos0_proc; then echo "error: mount /tmp/mcos/mcos0_proc"; exit; fi
|
||||
if [ ! -e /tmp/mcos/mcos0_proc ]; then
|
||||
mkdir -p /tmp/mcos/mcos0_proc;
|
||||
fi
|
||||
if [ ! -e /tmp/mcos/mcos0_proc_upper ]; then
|
||||
mkdir -p /tmp/mcos/mcos0_proc_upper;
|
||||
fi
|
||||
if [ ! -e /tmp/mcos/mcos0_proc_work ]; then
|
||||
mkdir -p /tmp/mcos/mcos0_proc_work;
|
||||
fi
|
||||
if ! mount -t mcoverlay mcoverlay -o lowerdir=/proc/mcos0:/proc,upperdir=/tmp/mcos/mcos0_proc_upper,workdir=/tmp/mcos/mcos0_proc_work,nocopyupw,nofscheck /tmp/mcos/mcos0_proc; then
|
||||
echo "error: mounting /tmp/mcos/mcos0_proc" >&2
|
||||
error_exit "mcoverlayfs_loaded"
|
||||
fi
|
||||
# TODO: How do we revert this in case of failure??
|
||||
mount --make-rprivate /proc
|
||||
while [ ! -e /sys/devices/virtual/mcos/mcos0/sys ]
|
||||
|
||||
while [ ! -e /sys/devices/virtual/mcos/mcos0/sys/setup_complete ]
|
||||
do
|
||||
sleep 1
|
||||
sleep 0.1
|
||||
done
|
||||
if [ ! -e /tmp/mcos/mcos0_sys ]; then mkdir -p /tmp/mcos/mcos0_sys; fi
|
||||
if [ ! -e /tmp/mcos/mcos0_sys_upper ]; then mkdir -p /tmp/mcos/mcos0_sys_upper; fi
|
||||
if [ ! -e /tmp/mcos/mcos0_sys_work ]; then mkdir -p /tmp/mcos/mcos0_sys_work; fi
|
||||
if ! mount -t mcoverlay mcoverlay -o lowerdir=/sys/devices/virtual/mcos/mcos0/sys:/sys,upperdir=/tmp/mcos/mcos0_sys_upper,workdir=/tmp/mcos/mcos0_sys_work,nocopyupw,nofscheck /tmp/mcos/mcos0_sys; then echo "error: mount /tmp/mcos/mcos0_sys"; exit; fi
|
||||
if [ ! -e /tmp/mcos/mcos0_sys ]; then
|
||||
mkdir -p /tmp/mcos/mcos0_sys;
|
||||
fi
|
||||
if [ ! -e /tmp/mcos/mcos0_sys_upper ]; then
|
||||
mkdir -p /tmp/mcos/mcos0_sys_upper;
|
||||
fi
|
||||
if [ ! -e /tmp/mcos/mcos0_sys_work ]; then
|
||||
mkdir -p /tmp/mcos/mcos0_sys_work;
|
||||
fi
|
||||
if ! mount -t mcoverlay mcoverlay -o lowerdir=/sys/devices/virtual/mcos/mcos0/sys:/sys,upperdir=/tmp/mcos/mcos0_sys_upper,workdir=/tmp/mcos/mcos0_sys_work,nocopyupw,nofscheck /tmp/mcos/mcos0_sys; then
|
||||
echo "error: mount /tmp/mcos/mcos0_sys" >&2
|
||||
error_exit "mcos_proc_mounted"
|
||||
fi
|
||||
# TODO: How do we revert this in case of failure??
|
||||
mount --make-rprivate /sys
|
||||
|
||||
touch /tmp/mcos/mcos0_proc/mckernel
|
||||
|
||||
rm -rf /tmp/mcos/mcos0_sys/setup_complete
|
||||
|
||||
# Hide NUMA related files which are outside the LWK partition
|
||||
for cpuid in `find /sys/devices/system/cpu/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
|
||||
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid" ]; then
|
||||
rm -rf /tmp/mcos/mcos0_sys/devices/system/cpu/$cpuid
|
||||
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/devices/$cpuid
|
||||
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/drivers/processor/$cpuid
|
||||
else
|
||||
for nodeid in `find /sys/devices/system/cpu/$cpuid/* -maxdepth 0 -name "node[0123456789]*" -printf "%f "`; do
|
||||
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid/$nodeid" ]; then
|
||||
rm -f /tmp/mcos/mcos0_sys/devices/system/cpu/$cpuid/$nodeid
|
||||
fi
|
||||
done
|
||||
fi
|
||||
done
|
||||
for nodeid in `find /sys/devices/system/node/* -maxdepth 0 -name "node[0123456789]*" -printf "%f "`; do
|
||||
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/node/$nodeid" ]; then
|
||||
rm -rf /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/*
|
||||
rm -rf /tmp/mcos/mcos0_sys/bus/node/devices/$nodeid
|
||||
else
|
||||
# Delete non-existent symlinks
|
||||
for cpuid in `find /sys/devices/system/node/$nodeid/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
|
||||
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/node/$nodeid/$cpuid" ]; then
|
||||
rm -f /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/$cpuid
|
||||
fi
|
||||
done
|
||||
|
||||
rm -f /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/memory*
|
||||
fi
|
||||
done
|
||||
rm -f /tmp/mcos/mcos0_sys/devices/system/node/has_*
|
||||
for cpuid in `find /sys/bus/cpu/devices/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
|
||||
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/bus/cpu/devices/$cpuid" ]; then
|
||||
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/devices/$cpuid
|
||||
fi
|
||||
done
|
||||
fi
|
||||
if [ ${LOGMODE} -ne 0 ]
|
||||
then
|
||||
SBINDIR=${SBINDIR} ${SBINDIR}/mcklogd -i ${INTERVAL}
|
||||
|
||||
# Start irqbalance with CPUs and IRQ for McKernel banned
|
||||
if [ "${irqbalance_used}" == "yes" ]; then
|
||||
banirq=`cat /proc/interrupts| perl -e 'while(<>) { if(/^\s*(\d+).*IHK\-SMP\s*$/) {print $1;}}'`
|
||||
|
||||
sed "s/%mask%/$smp_affinity_mask/g" $ETCDIR/irqbalance_mck.in | sed "s/%banirq%/$banirq/g" > /tmp/irqbalance_mck
|
||||
systemctl disable irqbalance_mck.service >/dev/null 2>/dev/null
|
||||
if ! systemctl link $ETCDIR/irqbalance_mck.service >/dev/null 2>/dev/null; then
|
||||
echo "error: linking irqbalance_mck" >&2
|
||||
error_exit "mcos_sys_mounted"
|
||||
fi
|
||||
|
||||
if ! systemctl start irqbalance_mck.service 2>/dev/null ; then
|
||||
echo "error: starting irqbalance_mck" >&2
|
||||
error_exit "mcos_sys_mounted"
|
||||
fi
|
||||
# echo cpus=$cpus ncpus=$ncpus banirq=$banirq
|
||||
fi
|
||||
|
||||
# Start mcklogd. Note that McKernel blocks when kmsg buffer is full
|
||||
# with '-k 1' until mcklogd unblocks it so starting mcklogd must precede
|
||||
# booting McKernel
|
||||
if [ ${LOGMODE} -ne 0 ]; then
|
||||
# Stop mcklogd which has survived McKernel shutdown because
|
||||
# mcstop+release.sh is not used
|
||||
pkill mcklogd
|
||||
SBINDIR=${SBINDIR} ${SBINDIR}/mcklogd -i ${INTERVAL} -f ${facility}
|
||||
fi
|
||||
|
||||
|
||||
@ -10,38 +10,116 @@
|
||||
prefix="@prefix@"
|
||||
BINDIR="@BINDIR@"
|
||||
SBINDIR="@SBINDIR@"
|
||||
ETCDIR=@ETCDIR@
|
||||
KMODDIR="@KMODDIR@"
|
||||
KERNDIR="@KERNDIR@"
|
||||
|
||||
mem=""
|
||||
cpus=""
|
||||
irqbalance_used=""
|
||||
|
||||
# No SMP module? Exit.
|
||||
if [ "`lsmod | grep ihk_smp_x86`" == "" ]; then exit; fi
|
||||
if ! grep ihk_smp_x86 /proc/modules &>/dev/null; then exit 0; fi
|
||||
|
||||
# Remove delegator if loaded
|
||||
if [ "`lsmod | grep mcctrl`" != "" ]; then
|
||||
if ! rmmod mcctrl; then echo "error: removing mcctrl"; exit; fi
|
||||
# Stop mcklogd
|
||||
while pgrep "mcklogd" > /dev/null 2>&1;
|
||||
do
|
||||
pkill -9 mcklogd
|
||||
done
|
||||
|
||||
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
|
||||
irqbalance_used="yes"
|
||||
if ! systemctl stop irqbalance_mck.service 2>/dev/null; then
|
||||
echo "warning: failed to stop irqbalance_mck" >&2
|
||||
fi
|
||||
if ! systemctl disable irqbalance_mck.service >/dev/null 2>/dev/null; then
|
||||
echo "warning: failed to disable irqbalance_mck" >&2
|
||||
fi
|
||||
fi
|
||||
|
||||
# Destroy all LWK instances
|
||||
for i in /dev/mcos*; do
|
||||
ind=`echo $i|cut -c10-`;
|
||||
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then echo "error: destroying LWK instance $ind failed"; exit; fi
|
||||
done
|
||||
|
||||
# Query IHK-SMP resources and release them
|
||||
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus"; exit; fi
|
||||
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
|
||||
if ! ${SBINDIR}/ihkconfig 0 release cpu $cpus > /dev/null; then echo "error: releasing CPUs"; exit; fi
|
||||
|
||||
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then echo "error: querying memory"; exit; fi
|
||||
mem=`${SBINDIR}/ihkconfig 0 query mem`
|
||||
if ! ${SBINDIR}/ihkconfig 0 release mem $mem > /dev/null; then echo "error: releasing memory"; exit; fi
|
||||
|
||||
# Remove SMP module
|
||||
if [ "`lsmod | grep ihk_smp_x86`" != "" ]; then
|
||||
if ! rmmod ihk_smp_x86; then echo "error: removing ihk_smp_x86"; exit; fi
|
||||
if ls /dev/mcos* 1>/dev/null 2>&1; then
|
||||
for i in /dev/mcos*; do
|
||||
ind=`echo $i|cut -c10-`;
|
||||
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then
|
||||
echo "error: destroying LWK instance $ind failed" >&2
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
# Query IHK-SMP resources and release them
|
||||
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then
|
||||
echo "error: querying cpus" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
|
||||
if [ "${cpus}" != "" ]; then
|
||||
if ! ${SBINDIR}/ihkconfig 0 release cpu $cpus > /dev/null; then
|
||||
echo "error: releasing CPUs" >&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then
|
||||
echo "error: querying memory" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
mem=`${SBINDIR}/ihkconfig 0 query mem`
|
||||
if [ "${mem}" != "" ]; then
|
||||
if ! ${SBINDIR}/ihkconfig 0 release mem $mem > /dev/null; then
|
||||
echo "error: releasing memory" >&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
# Remove delegator if loaded
|
||||
if grep mcctrl /proc/modules &>/dev/null; then
|
||||
if ! rmmod mcctrl 2>/dev/null; then
|
||||
echo "error: removing mcctrl" >&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
# Remove mcoverlay if loaded
|
||||
if grep mcoverlay /proc/modules &>/dev/null; then
|
||||
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_sys`" != "" ]; then umount -l /tmp/mcos/mcos0_sys; fi
|
||||
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_proc`" != "" ]; then umount -l /tmp/mcos/mcos0_proc; fi
|
||||
if [ "`cat /proc/mounts | grep /tmp/mcos/linux_proc`" != "" ]; then umount -l /tmp/mcos/linux_proc; fi
|
||||
if [ "`cat /proc/mounts | grep /tmp/mcos`" != "" ]; then umount -l /tmp/mcos; fi
|
||||
if [ -e /tmp/mcos ]; then rm -rf /tmp/mcos; fi
|
||||
if ! rmmod mcoverlay 2>/dev/null; then
|
||||
echo "warning: failed to remove mcoverlay" >&2
|
||||
fi
|
||||
fi
|
||||
|
||||
# Remove SMP module
|
||||
if grep ihk_smp_x86 /proc/modules &>/dev/null; then
|
||||
if ! rmmod ihk_smp_x86 2>/dev/null; then
|
||||
echo "error: removing ihk_smp_x86" >&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
# Remove core module
|
||||
if grep -E 'ihk\s' /proc/modules &>/dev/null; then
|
||||
if ! rmmod ihk 2>/dev/null; then
|
||||
echo "error: removing ihk" >&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
# Start irqbalance with the original settings
|
||||
if [ "${irqbalance_used}" != "" ]; then
|
||||
if ! etcdir=@ETCDIR@ perl -e '$etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "$etcdir/proc/irq/*/smp_affinity"; foreach $file (@files) { $dest = substr($file, length($etcdir)); if(0) {print "cp $file $dest\n";} system("cp $file $dest 2>/dev/null"); }'; then
|
||||
echo "warning: failed to restore /proc/irq/*/smp_affinity" >&2
|
||||
fi
|
||||
if ! systemctl start irqbalance.service; then
|
||||
echo "warning: failed to start irqbalance" >&2;
|
||||
fi
|
||||
fi
|
||||
|
||||
# Set back default swappiness
|
||||
echo 60 > /proc/sys/vm/swappiness
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
/* executer/config.h.in. Generated from configure.ac by autoheader. */
|
||||
/* config.h.in. Generated from configure.ac by autoheader. */
|
||||
|
||||
/* whether mcoverlayfs is enabled */
|
||||
#undef ENABLE_MCOVERLAYFS
|
||||
@ -6,6 +6,9 @@
|
||||
/* whether memdump feature is enabled */
|
||||
#undef ENABLE_MEMDUMP
|
||||
|
||||
/* whether rusage is enabled */
|
||||
#undef ENABLE_RUSAGE
|
||||
|
||||
/* Define to 1 if you have the <inttypes.h> header file. */
|
||||
#undef HAVE_INTTYPES_H
|
||||
|
||||
@ -51,6 +54,9 @@
|
||||
/* Define to address of kernel symbol sys_readlink, or 0 if exported */
|
||||
#undef MCCTRL_KSYM_sys_readlink
|
||||
|
||||
/* Define to address of kernel symbol sys_umount, or 0 if exported */
|
||||
#undef MCCTRL_KSYM_sys_umount
|
||||
|
||||
/* Define to address of kernel symbol sys_unshare, or 0 if exported */
|
||||
#undef MCCTRL_KSYM_sys_unshare
|
||||
|
||||
@ -69,6 +75,9 @@
|
||||
/* Define to address of kernel symbol zap_page_range, or 0 if exported */
|
||||
#undef MCCTRL_KSYM_zap_page_range
|
||||
|
||||
/* McKernel specific libraries */
|
||||
#undef MCKERNEL_LIBDIR
|
||||
|
||||
/* Define to the address where bug reports for this package should be sent. */
|
||||
#undef PACKAGE_BUGREPORT
|
||||
|
||||
@ -87,5 +96,8 @@
|
||||
/* Define to the version of this package. */
|
||||
#undef PACKAGE_VERSION
|
||||
|
||||
/* Path of bind-mount source directory */
|
||||
#undef ROOTFSDIR
|
||||
|
||||
/* Define to 1 if you have the ANSI C header files. */
|
||||
#undef STDC_HEADERS
|
||||
70
configure.ac
70
configure.ac
@ -17,6 +17,13 @@ DCFA_RELEASE_DATE=DCFA_RELEASE_DATE_m4
|
||||
|
||||
AC_PREFIX_DEFAULT([/opt/ppos])
|
||||
|
||||
AC_CHECK_HEADER([numa.h],[numa_header_found=yes])
|
||||
AS_IF([test "x$numa_header_found" != "xyes"],
|
||||
[AC_MSG_ERROR([Unable to find numa.h header file, missing numactl-devel?])])
|
||||
AC_CHECK_LIB([numa],[numa_run_on_node],[numa_lib_found=yes])
|
||||
AS_IF([test "x$numa_lib_found" != "xyes"],
|
||||
[AC_MSG_ERROR([Unable to find NUMA library, missing numactl-devel?])])
|
||||
|
||||
AC_ARG_WITH([kernelsrc],
|
||||
AC_HELP_STRING(
|
||||
[--with-kernelsrc=path],[Path to 'kernel src', default is /lib/modules/uname_r/build]),
|
||||
@ -48,6 +55,23 @@ AC_ARG_ENABLE([mcoverlayfs],
|
||||
[ENABLE_MCOVERLAYFS=$enableval],
|
||||
[ENABLE_MCOVERLAYFS=yes])
|
||||
|
||||
AC_ARG_ENABLE([rusage],
|
||||
AC_HELP_STRING([--enable-rusage],
|
||||
[enable rusage implementation]),
|
||||
[ENABLE_RUSAGE=$enableval],
|
||||
[ENABLE_RUSAGE=yes])
|
||||
|
||||
AC_ARG_WITH([uname_r],
|
||||
AC_HELP_STRING(
|
||||
[--with-uname_r=uname_r],[Value of '`uname -r`' on the target platform, default is local value]),
|
||||
[WITH_UNAME_R=$withval],[WITH_UNAME_R=yes])
|
||||
|
||||
case "X$WITH_UNAME_R" in
|
||||
Xyes | Xno | X)
|
||||
WITH_UNAME_R='`uname -r`'
|
||||
;;
|
||||
esac
|
||||
|
||||
case "X$WITH_KERNELSRC" in
|
||||
Xyes | Xno | X)
|
||||
WITH_KERNELSRC='/lib/modules/`uname -r`/build'
|
||||
@ -64,12 +88,14 @@ if test "X$WITH_TARGET" = Xyes -o "X$WITH_TARGET" = Xno; then
|
||||
fi
|
||||
|
||||
test "x$prefix" = xNONE && prefix="$ac_default_prefix"
|
||||
AC_DEFINE_UNQUOTED(ROOTFSDIR,"$prefix/rootfs",[Path of bind-mount source directory])
|
||||
|
||||
case $WITH_TARGET in
|
||||
attached-mic|builtin-x86|smp-x86)
|
||||
ARCH=`uname -m`
|
||||
AC_PROG_CC
|
||||
XCC=$CC
|
||||
CFLAGS="$CFLAGS -ffreestanding -fno-tree-loop-distribute-patterns"
|
||||
;;
|
||||
builtin-mic)
|
||||
ARCH=k1om
|
||||
@ -146,6 +172,12 @@ case $WITH_TARGET in
|
||||
if test "X$SBINDIR" = X; then
|
||||
SBINDIR="$prefix/sbin"
|
||||
fi
|
||||
if test "X$MCKERNEL_LIBDIR" = X; then
|
||||
MCKERNEL_LIBDIR="$prefix/lib"
|
||||
fi
|
||||
if test "X$ETCDIR" = X; then
|
||||
ETCDIR="$prefix/etc"
|
||||
fi
|
||||
if test "X$KMODDIR" = X; then
|
||||
KMODDIR="$prefix/kmod"
|
||||
fi
|
||||
@ -159,6 +191,7 @@ case $WITH_TARGET in
|
||||
esac
|
||||
|
||||
KDIR="$WITH_KERNELSRC"
|
||||
UNAME_R="$WITH_UNAME_R"
|
||||
TARGET="$WITH_TARGET"
|
||||
|
||||
MCCTRL_LINUX_SYMTAB=""
|
||||
@ -218,6 +251,7 @@ AC_DEFUN([MCCTRL_FIND_KSYM],[
|
||||
])
|
||||
|
||||
MCCTRL_FIND_KSYM([sys_mount])
|
||||
MCCTRL_FIND_KSYM([sys_umount])
|
||||
MCCTRL_FIND_KSYM([sys_unshare])
|
||||
MCCTRL_FIND_KSYM([zap_page_range])
|
||||
MCCTRL_FIND_KSYM([vdso_image_64])
|
||||
@ -271,17 +305,44 @@ else
|
||||
AC_MSG_NOTICE([mcoverlayfs is disabled])
|
||||
fi
|
||||
|
||||
case $ENABLE_RUSAGE in
|
||||
yes|no)
|
||||
;;
|
||||
default)
|
||||
ENABLE_RUSAGE=yes
|
||||
;;
|
||||
*)
|
||||
AC_MSG_ERROR([unknown rusage argument: $ENABLE_RUSAGE])
|
||||
;;
|
||||
esac
|
||||
|
||||
if test "x$ENABLE_RUSAGE" = "xyes" ; then
|
||||
AC_MSG_NOTICE([rusage is enabled])
|
||||
AC_DEFINE([ENABLE_RUSAGE],[1],[whether rusage is enabled])
|
||||
else
|
||||
AC_MSG_NOTICE([rusage is disabled])
|
||||
fi
|
||||
|
||||
if test "x$MCKERNEL_LIBDIR" != "x" ; then
|
||||
AC_DEFINE_UNQUOTED(MCKERNEL_LIBDIR,"$MCKERNEL_LIBDIR",[McKernel specific libraries])
|
||||
fi
|
||||
|
||||
AC_SUBST(CC)
|
||||
AC_SUBST(XCC)
|
||||
AC_SUBST(ARCH)
|
||||
AC_SUBST(KDIR)
|
||||
AC_SUBST(UNAME_R)
|
||||
AC_SUBST(TARGET)
|
||||
AC_SUBST(BINDIR)
|
||||
AC_SUBST(SBINDIR)
|
||||
AC_SUBST(MCKERNEL_LIBDIR)
|
||||
AC_SUBST(ETCDIR)
|
||||
AC_SUBST(KMODDIR)
|
||||
AC_SUBST(KERNDIR)
|
||||
AC_SUBST(MANDIR)
|
||||
AC_SUBST(CFLAGS)
|
||||
AC_SUBST(ENABLE_MCOVERLAYFS)
|
||||
AC_SUBST(ENABLE_RUSAGE)
|
||||
|
||||
AC_SUBST(IHK_VERSION)
|
||||
AC_SUBST(MCKERNEL_VERSION)
|
||||
@ -291,13 +352,17 @@ AC_SUBST(MCKERNEL_RELEASE_DATE)
|
||||
AC_SUBST(DCFA_RESEASE_DATE)
|
||||
AC_SUBST(uncomment_if_ENABLE_MEMDUMP)
|
||||
|
||||
AC_CONFIG_HEADERS([executer/config.h])
|
||||
AC_CONFIG_HEADERS([config.h])
|
||||
AC_CONFIG_FILES([
|
||||
Makefile
|
||||
executer/user/Makefile
|
||||
executer/user/arch/x86_64/Makefile
|
||||
executer/kernel/mcctrl/Makefile
|
||||
executer/kernel/mcctrl/arch/x86_64/Makefile
|
||||
executer/kernel/mcoverlayfs/Makefile
|
||||
executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile
|
||||
executer/kernel/mcoverlayfs/linux-4.0.9/Makefile
|
||||
executer/kernel/mcoverlayfs/linux-4.6.7/Makefile
|
||||
kernel/Makefile
|
||||
kernel/Makefile.build
|
||||
arch/x86/tools/mcreboot-attached-mic.sh
|
||||
@ -305,8 +370,11 @@ AC_CONFIG_FILES([
|
||||
arch/x86/tools/mcreboot-builtin-x86.sh
|
||||
arch/x86/tools/mcreboot-smp-x86.sh
|
||||
arch/x86/tools/mcstop+release-smp-x86.sh
|
||||
arch/x86/tools/eclair-dump-backtrace.exp
|
||||
arch/x86/tools/mcshutdown-builtin-x86.sh
|
||||
arch/x86/tools/mcreboot.1:arch/x86/tools/mcreboot.1in
|
||||
arch/x86/tools/irqbalance_mck.service
|
||||
arch/x86/tools/irqbalance_mck.in
|
||||
])
|
||||
|
||||
AS_IF([test "x$enable_dcfa" = xyes], [
|
||||
|
||||
@ -41,6 +41,9 @@
|
||||
#define MCEXEC_UP_NEW_PROCESS 0x30a02909
|
||||
#define MCEXEC_UP_GET_CRED 0x30a0290a
|
||||
#define MCEXEC_UP_GET_CREDV 0x30a0290b
|
||||
#define MCEXEC_UP_GET_NODES 0x30a0290c
|
||||
#define MCEXEC_UP_GET_CPUSET 0x30a0290d
|
||||
#define MCEXEC_UP_CREATE_PPD 0x30a0290e
|
||||
|
||||
#define MCEXEC_UP_PREPARE_DMA 0x30a02910
|
||||
#define MCEXEC_UP_FREE_DMA 0x30a02911
|
||||
@ -49,7 +52,18 @@
|
||||
#define MCEXEC_UP_CLOSE_EXEC 0x30a02913
|
||||
|
||||
#define MCEXEC_UP_SYS_MOUNT 0x30a02914
|
||||
#define MCEXEC_UP_SYS_UNSHARE 0x30a02915
|
||||
#define MCEXEC_UP_SYS_UMOUNT 0x30a02915
|
||||
#define MCEXEC_UP_SYS_UNSHARE 0x30a02916
|
||||
|
||||
#define MCEXEC_UP_UTIL_THREAD1 0x30a02920
|
||||
#define MCEXEC_UP_UTIL_THREAD2 0x30a02921
|
||||
#define MCEXEC_UP_SIG_THREAD 0x30a02922
|
||||
#define MCEXEC_UP_SYSCALL_THREAD 0x30a02924
|
||||
#define MCEXEC_UP_TERMINATE_THREAD 0x30a02925
|
||||
#define MCEXEC_UP_GET_NUM_POOL_THREADS 0x30a02926
|
||||
|
||||
#define MCEXEC_UP_COPY_FROM_MCK 0x30a03000
|
||||
#define MCEXEC_UP_COPY_TO_MCK 0x30a03001
|
||||
|
||||
#define MCEXEC_UP_DEBUG_LOG 0x40000000
|
||||
|
||||
@ -77,6 +91,26 @@ struct program_image_section {
|
||||
#define SHELL_PATH_MAX_LEN 1024
|
||||
#define MCK_RLIM_MAX 20
|
||||
|
||||
struct get_cpu_set_arg {
|
||||
int nr_processes;
|
||||
void *cpu_set;
|
||||
size_t cpu_set_size; // Size in bytes
|
||||
int *target_core;
|
||||
int *mcexec_linux_numa; // NUMA domain to bind mcexec to
|
||||
void *mcexec_cpu_set;
|
||||
size_t mcexec_cpu_set_size; // Size in bytes
|
||||
int *ikc_mapped;
|
||||
};
|
||||
|
||||
#define PLD_CPU_SET_MAX_CPUS 1024
|
||||
typedef unsigned long __cpu_set_unit;
|
||||
#define PLD_CPU_SET_SIZE (PLD_CPU_SET_MAX_CPUS / (8 * sizeof(__cpu_set_unit)))
|
||||
|
||||
#define MPOL_NO_HEAP 0x01
|
||||
#define MPOL_NO_STACK 0x02
|
||||
#define MPOL_NO_BSS 0x04
|
||||
#define MPOL_SHM_PREMAP 0x08
|
||||
|
||||
struct program_load_desc {
|
||||
int num_sections;
|
||||
int status;
|
||||
@ -105,11 +139,24 @@ struct program_load_desc {
|
||||
unsigned long envs_len;
|
||||
struct rlimit rlimit[MCK_RLIM_MAX];
|
||||
unsigned long interp_align;
|
||||
unsigned long mpol_flags;
|
||||
unsigned long mpol_threshold;
|
||||
unsigned long heap_extension;
|
||||
int nr_processes;
|
||||
char shell_path[SHELL_PATH_MAX_LEN];
|
||||
__cpu_set_unit cpu_set[PLD_CPU_SET_SIZE];
|
||||
int profile;
|
||||
struct program_image_section sections[0];
|
||||
};
|
||||
|
||||
struct syscall_request {
|
||||
/* TID of requesting thread */
|
||||
int rtid;
|
||||
/*
|
||||
* TID of target thread. Remote page fault response needs to designate the
|
||||
* thread that must serve the request, 0 indicates any thread from the pool
|
||||
*/
|
||||
int ttid;
|
||||
unsigned long valid;
|
||||
unsigned long number;
|
||||
unsigned long args[6];
|
||||
@ -128,8 +175,17 @@ struct syscall_load_desc {
|
||||
unsigned long size;
|
||||
};
|
||||
|
||||
#define IHK_SCD_REQ_THREAD_SPINNING 0
|
||||
#define IHK_SCD_REQ_THREAD_TO_BE_WOKEN 1
|
||||
#define IHK_SCD_REQ_THREAD_DESCHEDULED 2
|
||||
|
||||
struct syscall_response {
|
||||
/* TID of the thread that requested the service */
|
||||
int ttid;
|
||||
/* TID of the mcexec thread that is serving or has served the request */
|
||||
int stid;
|
||||
unsigned long status;
|
||||
unsigned long req_thread_status;
|
||||
long ret;
|
||||
unsigned long fault_address;
|
||||
unsigned long fault_reason;
|
||||
@ -180,8 +236,42 @@ struct sys_mount_desc {
|
||||
void *data;
|
||||
};
|
||||
|
||||
struct sys_umount_desc {
|
||||
char *dir_name;
|
||||
};
|
||||
|
||||
struct sys_unshare_desc {
|
||||
unsigned long unshare_flags;
|
||||
};
|
||||
|
||||
enum perf_ctrl_type {
|
||||
PERF_CTRL_SET,
|
||||
PERF_CTRL_GET,
|
||||
PERF_CTRL_ENABLE,
|
||||
PERF_CTRL_DISABLE,
|
||||
};
|
||||
|
||||
struct perf_ctrl_desc {
|
||||
enum perf_ctrl_type ctrl_type;
|
||||
int status;
|
||||
union {
|
||||
/* for SET, GET */
|
||||
struct {
|
||||
unsigned int target_cntr;
|
||||
unsigned long config;
|
||||
unsigned long read_value;
|
||||
unsigned disabled :1,
|
||||
pinned :1,
|
||||
exclude_user :1,
|
||||
exclude_kernel :1,
|
||||
exclude_hv :1,
|
||||
exclude_idle :1;
|
||||
};
|
||||
|
||||
/* for ENABLE, DISABLE */
|
||||
struct {
|
||||
unsigned long target_cntr_mask;
|
||||
};
|
||||
};
|
||||
};
|
||||
#endif
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#include <linux/version.h>
|
||||
#include "../../../../config.h"
|
||||
#include "../../../config.h"
|
||||
#include "../../mcctrl.h"
|
||||
|
||||
#ifdef MCCTRL_KSYM_vdso_image_64
|
||||
@ -64,6 +64,10 @@ reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp, unsign
|
||||
unsigned long start = 0L;
|
||||
unsigned long end;
|
||||
|
||||
if (mutex_lock_killable(&usrdata->reserve_lock) < 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
#define DESIRED_USER_END 0x800000000000
|
||||
#define GAP_FOR_MCEXEC 0x008000000000UL
|
||||
end = DESIRED_USER_END;
|
||||
@ -81,6 +85,8 @@ reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp, unsign
|
||||
up_write(¤t->mm->mmap_sem);
|
||||
#endif
|
||||
|
||||
mutex_unlock(&usrdata->reserve_lock);
|
||||
|
||||
if (IS_ERR_VALUE(start)) {
|
||||
return start;
|
||||
}
|
||||
@ -100,8 +106,6 @@ void get_vdso_info(ihk_os_t os, long vdso_rpa)
|
||||
vdso_pa = ihk_device_map_memory(dev, vdso_rpa, sizeof(*vdso));
|
||||
vdso = ihk_device_map_virtual(dev, vdso_pa, sizeof(*vdso), NULL, 0);
|
||||
|
||||
memset(vdso, 0, sizeof(*vdso));
|
||||
|
||||
/* VDSO pages */
|
||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
|
||||
size = vdso_image->size;
|
||||
@ -192,3 +196,65 @@ out:
|
||||
ihk_device_unmap_memory(dev, vdso_pa, sizeof(*vdso));
|
||||
return;
|
||||
} /* get_vdso_info() */
|
||||
|
||||
void *
|
||||
get_user_sp(void)
|
||||
{
|
||||
unsigned long usp;
|
||||
|
||||
asm volatile("movq %%gs:0xaf80, %0" : "=r" (usp));
|
||||
return (void *)usp;
|
||||
}
|
||||
|
||||
void
|
||||
set_user_sp(void *usp)
|
||||
{
|
||||
asm volatile("movq %0, %%gs:0xaf80" :: "r" (usp));
|
||||
}
|
||||
|
||||
struct trans_uctx {
|
||||
volatile int cond;
|
||||
int fregsize;
|
||||
|
||||
unsigned long rax;
|
||||
unsigned long rbx;
|
||||
unsigned long rcx;
|
||||
unsigned long rdx;
|
||||
unsigned long rsi;
|
||||
unsigned long rdi;
|
||||
unsigned long rbp;
|
||||
unsigned long r8;
|
||||
unsigned long r9;
|
||||
unsigned long r10;
|
||||
unsigned long r11;
|
||||
unsigned long r12;
|
||||
unsigned long r13;
|
||||
unsigned long r14;
|
||||
unsigned long r15;
|
||||
unsigned long rflags;
|
||||
unsigned long rip;
|
||||
unsigned long rsp;
|
||||
unsigned long fs;
|
||||
};
|
||||
|
||||
void
|
||||
restore_fs(unsigned long fs)
|
||||
{
|
||||
wrmsrl(MSR_FS_BASE, fs);
|
||||
}
|
||||
|
||||
void
|
||||
save_fs_ctx(void *ctx)
|
||||
{
|
||||
struct trans_uctx *tctx = ctx;
|
||||
|
||||
rdmsrl(MSR_FS_BASE, tctx->fs);
|
||||
}
|
||||
|
||||
unsigned long
|
||||
get_fs_ctx(void *ctx)
|
||||
{
|
||||
struct trans_uctx *tctx = ctx;
|
||||
|
||||
return tctx->fs;
|
||||
}
|
||||
|
||||
@ -75,7 +75,7 @@ static int load_elf(struct linux_binprm *bprm
|
||||
char buf[32];
|
||||
int l;
|
||||
int pass;
|
||||
char pbuf[1024];
|
||||
char *pbuf;
|
||||
const char *path;
|
||||
|
||||
if(bprm->envc == 0)
|
||||
@ -88,6 +88,11 @@ static int load_elf(struct linux_binprm *bprm
|
||||
if(elf_ex->e_ident[EI_CLASS] != ELFCLASS64)
|
||||
return -ENOEXEC;
|
||||
|
||||
pbuf = kmalloc(1024, GFP_ATOMIC);
|
||||
if (!pbuf) {
|
||||
printk("%s: error: allocating pbuf\n", __FUNCTION__);
|
||||
return -ENOMEM;
|
||||
}
|
||||
path = d_path(&bprm->file->f_path, pbuf, 1024);
|
||||
if(!path || IS_ERR(path))
|
||||
path = bprm->interp;
|
||||
@ -96,8 +101,10 @@ static int load_elf(struct linux_binprm *bprm
|
||||
if(!cp ||
|
||||
!strcmp(cp, "/mcexec") ||
|
||||
!strcmp(cp, "/ihkosctl") ||
|
||||
!strcmp(cp, "/ihkconfig"))
|
||||
!strcmp(cp, "/ihkconfig")) {
|
||||
kfree(pbuf);
|
||||
return -ENOEXEC;
|
||||
}
|
||||
|
||||
cnt[0] = bprm->argc;
|
||||
cnt[1] = bprm->envc;
|
||||
@ -124,8 +131,10 @@ static int load_elf(struct linux_binprm *bprm
|
||||
bprm->p, 1, 0, 1,
|
||||
&page, NULL);
|
||||
#endif
|
||||
if(rc <= 0)
|
||||
if(rc <= 0) {
|
||||
kfree(pbuf);
|
||||
return -EFAULT;
|
||||
}
|
||||
addr = kmap_atomic(page
|
||||
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0)
|
||||
, KM_USER0
|
||||
@ -199,21 +208,27 @@ static int load_elf(struct linux_binprm *bprm
|
||||
for(ep = env; ep->name; ep++)
|
||||
if(ep->val)
|
||||
kfree(ep->val);
|
||||
if(rc)
|
||||
if(rc) {
|
||||
kfree(pbuf);
|
||||
return -ENOEXEC;
|
||||
}
|
||||
|
||||
file = open_exec(MCEXEC_PATH);
|
||||
if (IS_ERR(file))
|
||||
if (IS_ERR(file)) {
|
||||
kfree(pbuf);
|
||||
return -ENOEXEC;
|
||||
}
|
||||
|
||||
rc = remove_arg_zero(bprm);
|
||||
if (rc){
|
||||
fput(file);
|
||||
kfree(pbuf);
|
||||
return rc;
|
||||
}
|
||||
rc = copy_strings_kernel(1, &bprm->interp, bprm);
|
||||
if (rc < 0){
|
||||
fput(file);
|
||||
kfree(pbuf);
|
||||
return rc;
|
||||
}
|
||||
bprm->argc++;
|
||||
@ -221,12 +236,14 @@ static int load_elf(struct linux_binprm *bprm
|
||||
rc = copy_strings_kernel(1, &wp, bprm);
|
||||
if (rc){
|
||||
fput(file);
|
||||
kfree(pbuf);
|
||||
return rc;
|
||||
}
|
||||
bprm->argc++;
|
||||
rc = bprm_change_interp(MCEXEC_PATH, bprm);
|
||||
if (rc < 0){
|
||||
fput(file);
|
||||
kfree(pbuf);
|
||||
return rc;
|
||||
}
|
||||
|
||||
@ -236,8 +253,12 @@ static int load_elf(struct linux_binprm *bprm
|
||||
|
||||
rc = prepare_binprm(bprm);
|
||||
if (rc < 0){
|
||||
kfree(pbuf);
|
||||
return rc;
|
||||
}
|
||||
|
||||
kfree(pbuf);
|
||||
|
||||
return search_binary_handler(bprm
|
||||
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0)
|
||||
, regs
|
||||
@ -255,7 +276,7 @@ void __init binfmt_mcexec_init(void)
|
||||
insert_binfmt(&mcexec_format);
|
||||
}
|
||||
|
||||
void __exit binfmt_mcexec_exit(void)
|
||||
void binfmt_mcexec_exit(void)
|
||||
{
|
||||
unregister_binfmt(&mcexec_format);
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -27,6 +27,7 @@
|
||||
#include <linux/slab.h>
|
||||
#include <linux/device.h>
|
||||
#include "mcctrl.h"
|
||||
#include <ihk/ihk_host_user.h>
|
||||
|
||||
#define OS_MAX_MINOR 64
|
||||
|
||||
@ -45,6 +46,12 @@ extern void rus_page_hash_put_pages(void);
|
||||
extern void binfmt_mcexec_init(void);
|
||||
extern void binfmt_mcexec_exit(void);
|
||||
|
||||
extern int mcctrl_os_read_cpu_register(ihk_os_t os, int cpu,
|
||||
struct ihk_os_cpu_register *desc);
|
||||
extern int mcctrl_os_write_cpu_register(ihk_os_t os, int cpu,
|
||||
struct ihk_os_cpu_register *desc);
|
||||
extern int mcctrl_get_request_os_cpu(ihk_os_t os, int *cpu);
|
||||
|
||||
static long mcctrl_ioctl(ihk_os_t os, unsigned int request, void *priv,
|
||||
unsigned long arg, struct file *file)
|
||||
{
|
||||
@ -60,6 +67,9 @@ static struct ihk_os_user_call_handler mcctrl_uchs[] = {
|
||||
{ .request = MCEXEC_UP_LOAD_SYSCALL, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_SEND_SIGNAL, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_GET_CPU, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_GET_NODES, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_GET_CPUSET, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_CREATE_PPD, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_STRNCPY_FROM_USER, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_NEW_PROCESS, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_PREPARE_DMA, .func = mcctrl_ioctl },
|
||||
@ -69,8 +79,29 @@ static struct ihk_os_user_call_handler mcctrl_uchs[] = {
|
||||
{ .request = MCEXEC_UP_GET_CRED, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_GET_CREDV, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_SYS_MOUNT, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_SYS_UMOUNT, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_SYS_UNSHARE, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_UTIL_THREAD1, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_UTIL_THREAD2, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_SIG_THREAD, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_SYSCALL_THREAD, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_TERMINATE_THREAD, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_GET_NUM_POOL_THREADS, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_DEBUG_LOG, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_COPY_FROM_MCK, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_COPY_TO_MCK, .func = mcctrl_ioctl },
|
||||
{ .request = IHK_OS_AUX_PERF_NUM, .func = mcctrl_ioctl },
|
||||
{ .request = IHK_OS_AUX_PERF_SET, .func = mcctrl_ioctl },
|
||||
{ .request = IHK_OS_AUX_PERF_GET, .func = mcctrl_ioctl },
|
||||
{ .request = IHK_OS_AUX_PERF_ENABLE, .func = mcctrl_ioctl },
|
||||
{ .request = IHK_OS_AUX_PERF_DISABLE, .func = mcctrl_ioctl },
|
||||
{ .request = IHK_OS_AUX_PERF_DESTROY, .func = mcctrl_ioctl },
|
||||
};
|
||||
|
||||
static struct ihk_os_kernel_call_handler mcctrl_kernel_handlers = {
|
||||
.get_request_cpu = mcctrl_get_request_os_cpu,
|
||||
.read_cpu_register = mcctrl_os_read_cpu_register,
|
||||
.write_cpu_register = mcctrl_os_write_cpu_register,
|
||||
};
|
||||
|
||||
static struct ihk_os_user_call mcctrl_uc_proto = {
|
||||
@ -82,79 +113,125 @@ static struct ihk_os_user_call mcctrl_uc[OS_MAX_MINOR];
|
||||
|
||||
static ihk_os_t os[OS_MAX_MINOR];
|
||||
|
||||
ihk_os_t
|
||||
osnum_to_os(int n)
|
||||
ihk_os_t osnum_to_os(int n)
|
||||
{
|
||||
return os[n];
|
||||
}
|
||||
|
||||
static int __init mcctrl_init(void)
|
||||
/* OS event notifier implementation */
|
||||
int mcctrl_os_boot_notifier(int os_index)
|
||||
{
|
||||
int i;
|
||||
int rc;
|
||||
|
||||
rc = -ENOENT;
|
||||
for(i = 0; i < OS_MAX_MINOR; i++){
|
||||
os[i] = ihk_host_find_os(i, NULL);
|
||||
if (os[i]) {
|
||||
printk("OS #%d found.\n", i);
|
||||
rc = 0;
|
||||
}
|
||||
}
|
||||
if(rc){
|
||||
printk("OS not found.\n");
|
||||
return rc;
|
||||
os[os_index] = ihk_host_find_os(os_index, NULL);
|
||||
if (!os[os_index]) {
|
||||
printk("mcctrl: error: OS ID %d couldn't be found\n", os_index);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
for(i = 0; i < OS_MAX_MINOR; i++){
|
||||
if (os[i]) {
|
||||
if (prepare_ikc_channels(os[i]) != 0) {
|
||||
printk("Preparing syscall channels failed.\n");
|
||||
os[i] = NULL;
|
||||
}
|
||||
}
|
||||
if (prepare_ikc_channels(os[os_index]) != 0) {
|
||||
printk("mcctrl: error: preparing IKC channels for OS %d\n", os_index);
|
||||
|
||||
os[os_index] = NULL;
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
memcpy(mcctrl_uc + os_index, &mcctrl_uc_proto, sizeof mcctrl_uc_proto);
|
||||
|
||||
rc = ihk_os_set_kernel_call_handlers(os[os_index], &mcctrl_kernel_handlers);
|
||||
if (rc < 0) {
|
||||
printk("mcctrl: error: setting kernel callbacks for OS %d\n", os_index);
|
||||
goto error_cleanup_channels;
|
||||
}
|
||||
|
||||
rc = ihk_os_register_user_call_handlers(os[os_index], mcctrl_uc + os_index);
|
||||
if (rc < 0) {
|
||||
printk("mcctrl: error: registering callbacks for OS %d\n", os_index);
|
||||
goto error_clear_kernel_handlers;
|
||||
}
|
||||
|
||||
procfs_init(os_index);
|
||||
printk("mcctrl: OS ID %d boot event handled\n", os_index);
|
||||
|
||||
return 0;
|
||||
|
||||
error_clear_kernel_handlers:
|
||||
ihk_os_clear_kernel_call_handlers(os[os_index]);
|
||||
error_cleanup_channels:
|
||||
destroy_ikc_channels(os[os_index]);
|
||||
|
||||
os[os_index] = NULL;
|
||||
return rc;
|
||||
}
|
||||
|
||||
int mcctrl_os_shutdown_notifier(int os_index)
|
||||
{
|
||||
if (os[os_index]) {
|
||||
sysfsm_cleanup(os[os_index]);
|
||||
free_topology_info(os[os_index]);
|
||||
ihk_os_unregister_user_call_handlers(os[os_index], mcctrl_uc + os_index);
|
||||
ihk_os_clear_kernel_call_handlers(os[os_index]);
|
||||
destroy_ikc_channels(os[os_index]);
|
||||
procfs_exit(os_index);
|
||||
}
|
||||
|
||||
os[os_index] = NULL;
|
||||
|
||||
printk("mcctrl: OS ID %d shutdown event handled\n", os_index);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct ihk_os_notifier_ops mcctrl_os_notifier_ops = {
|
||||
.boot = mcctrl_os_boot_notifier,
|
||||
.shutdown = mcctrl_os_shutdown_notifier,
|
||||
};
|
||||
|
||||
static struct ihk_os_notifier mcctrl_os_notifier = {
|
||||
.ops = &mcctrl_os_notifier_ops,
|
||||
};
|
||||
|
||||
static int __init mcctrl_init(void)
|
||||
{
|
||||
int ret = 0;
|
||||
int i;
|
||||
|
||||
#ifndef DO_USER_MODE
|
||||
mcctrl_syscall_init();
|
||||
#endif
|
||||
|
||||
rus_page_hash_init();
|
||||
|
||||
for(i = 0; i < OS_MAX_MINOR; i++){
|
||||
if (os[i]) {
|
||||
memcpy(mcctrl_uc + i, &mcctrl_uc_proto, sizeof mcctrl_uc_proto);
|
||||
rc = ihk_os_register_user_call_handlers(os[i], mcctrl_uc + i);
|
||||
if(rc < 0){
|
||||
destroy_ikc_channels(os[i]);
|
||||
os[i] = NULL;
|
||||
}
|
||||
procfs_init(i);
|
||||
}
|
||||
for (i = 0; i < OS_MAX_MINOR; ++i) {
|
||||
os[i] = NULL;
|
||||
}
|
||||
|
||||
rus_page_hash_init();
|
||||
|
||||
binfmt_mcexec_init();
|
||||
|
||||
return 0;
|
||||
if ((ret = ihk_host_register_os_notifier(&mcctrl_os_notifier)) != 0) {
|
||||
printk("mcctrl: error: registering OS notifier\n");
|
||||
goto error;
|
||||
}
|
||||
|
||||
printk("mcctrl: initialized successfully.\n");
|
||||
return ret;
|
||||
|
||||
error:
|
||||
binfmt_mcexec_exit();
|
||||
rus_page_hash_put_pages();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void __exit mcctrl_exit(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
binfmt_mcexec_exit();
|
||||
printk("mcctrl: unregistered.\n");
|
||||
for(i = 0; i < OS_MAX_MINOR; i++){
|
||||
if(os[i]){
|
||||
sysfsm_cleanup(os[i]);
|
||||
free_topology_info(os[i]);
|
||||
ihk_os_unregister_user_call_handlers(os[i], mcctrl_uc + i);
|
||||
destroy_ikc_channels(os[i]);
|
||||
procfs_exit(i);
|
||||
}
|
||||
if (ihk_host_deregister_os_notifier(&mcctrl_os_notifier) != 0) {
|
||||
printk("mcctrl: warning: failed to deregister OS notifier??\n");
|
||||
}
|
||||
|
||||
binfmt_mcexec_exit();
|
||||
rus_page_hash_put_pages();
|
||||
|
||||
printk("mcctrl: unregistered.\n");
|
||||
}
|
||||
|
||||
MODULE_LICENSE("GPL v2");
|
||||
|
||||
@ -27,6 +27,7 @@
|
||||
#include <linux/miscdevice.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include "mcctrl.h"
|
||||
#ifdef ATTACHED_MIC
|
||||
#include <sysdeps/mic/mic/micconst.h>
|
||||
@ -34,22 +35,38 @@
|
||||
|
||||
#define REQUEST_SHIFT 16
|
||||
|
||||
//#define DEBUG_IKC
|
||||
|
||||
#ifdef DEBUG_IKC
|
||||
#define dkprintf(...) kprintf(__VA_ARGS__)
|
||||
#define ekprintf(...) kprintf(__VA_ARGS__)
|
||||
#else
|
||||
#define dkprintf(...) do { if (0) printk(__VA_ARGS__); } while (0)
|
||||
#define ekprintf(...) printk(__VA_ARGS__)
|
||||
#endif
|
||||
|
||||
//int num_channels;
|
||||
|
||||
//struct mcctrl_channel *channels;
|
||||
|
||||
void mcexec_prepare_ack(ihk_os_t os, unsigned long arg, int err);
|
||||
static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ihk_ikc_channel_desc *c);
|
||||
int mcexec_syscall(struct mcctrl_channel *c, int pid, unsigned long arg);
|
||||
int mcexec_syscall(struct mcctrl_usrdata *ud, struct ikc_scd_packet *packet);
|
||||
void sig_done(unsigned long arg, int err);
|
||||
void mcctrl_perf_ack(ihk_os_t os, struct ikc_scd_packet *packet);
|
||||
void mcctrl_os_read_write_cpu_response(ihk_os_t os,
|
||||
struct ikc_scd_packet *pisp);
|
||||
void mcctrl_eventfd(ihk_os_t os, struct ikc_scd_packet *pisp);
|
||||
|
||||
/* XXX: this runs in atomic context! */
|
||||
static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
|
||||
void *__packet, void *__os)
|
||||
{
|
||||
struct ikc_scd_packet *pisp = __packet;
|
||||
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(__os);
|
||||
int msg = pisp->msg;
|
||||
|
||||
switch (pisp->msg) {
|
||||
switch (msg) {
|
||||
case SCD_MSG_INIT_CHANNEL:
|
||||
mcctrl_ikc_init(__os, pisp->ref, pisp->arg, c);
|
||||
break;
|
||||
@ -63,11 +80,11 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
|
||||
break;
|
||||
|
||||
case SCD_MSG_SYSCALL_ONESIDE:
|
||||
mcexec_syscall(usrdata->channels + pisp->ref, pisp->pid, pisp->arg);
|
||||
mcexec_syscall(usrdata, pisp);
|
||||
break;
|
||||
|
||||
case SCD_MSG_PROCFS_ANSWER:
|
||||
procfs_answer(pisp->arg, pisp->err);
|
||||
procfs_answer(usrdata, pisp->pid);
|
||||
break;
|
||||
|
||||
case SCD_MSG_SEND_SIGNAL:
|
||||
@ -88,19 +105,24 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
|
||||
break;
|
||||
|
||||
case SCD_MSG_PROCFS_TID_CREATE:
|
||||
add_tid_entry(ihk_host_os_get_index(__os), pisp->pid, pisp->arg);
|
||||
break;
|
||||
|
||||
case SCD_MSG_PROCFS_TID_DELETE:
|
||||
delete_tid_entry(ihk_host_os_get_index(__os), pisp->pid, pisp->arg);
|
||||
procfsm_packet_handler(__os, pisp->msg, pisp->pid, pisp->arg);
|
||||
break;
|
||||
|
||||
case SCD_MSG_GET_VDSO_INFO:
|
||||
get_vdso_info(__os, pisp->arg);
|
||||
break;
|
||||
|
||||
case SCD_MSG_REPLY_GET_CPU_MAPPING:
|
||||
reply_get_cpu_mapping(pisp->arg);
|
||||
case SCD_MSG_PERF_ACK:
|
||||
mcctrl_perf_ack(__os, pisp);
|
||||
break;
|
||||
|
||||
case SCD_MSG_CPU_RW_REG_RESP:
|
||||
mcctrl_os_read_write_cpu_response(__os, pisp);
|
||||
break;
|
||||
|
||||
case SCD_MSG_EVENTFD:
|
||||
mcctrl_eventfd(__os, pisp);
|
||||
break;
|
||||
|
||||
default:
|
||||
@ -110,6 +132,25 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
|
||||
pisp->err, pisp->arg);
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* SCD_MSG_SYSCALL_ONESIDE holds the packet and frees is it
|
||||
* mcexec_ret_syscall(), for the rest, free it here.
|
||||
*/
|
||||
if (msg != SCD_MSG_SYSCALL_ONESIDE) {
|
||||
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)__packet,
|
||||
(usrdata->ikc2linux[smp_processor_id()] ?
|
||||
usrdata->ikc2linux[smp_processor_id()] :
|
||||
usrdata->ikc2linux[0]));
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int dummy_packet_handler(struct ihk_ikc_channel_desc *c,
|
||||
void *__packet, void *__os)
|
||||
{
|
||||
kprintf("%s: WARNING: packet received\n", __FUNCTION__);
|
||||
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)__packet, c);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -146,8 +187,6 @@ int mcctrl_ikc_set_recv_cpu(ihk_os_t os, int cpu)
|
||||
|
||||
ihk_ikc_channel_set_cpu(usrdata->channels[cpu].c,
|
||||
ihk_ikc_get_processor_id());
|
||||
kprintf("Setting the target to %d\n",
|
||||
ihk_ikc_get_processor_id());
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -162,221 +201,174 @@ int mcctrl_ikc_is_valid_thread(ihk_os_t os, int cpu)
|
||||
}
|
||||
}
|
||||
|
||||
//unsigned long *mcctrl_doorbell_va;
|
||||
//unsigned long mcctrl_doorbell_pa;
|
||||
|
||||
static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ihk_ikc_channel_desc *c)
|
||||
{
|
||||
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
|
||||
struct ikc_scd_packet packet;
|
||||
struct mcctrl_channel *pmc = usrdata->channels + cpu;
|
||||
unsigned long phys;
|
||||
struct ikc_scd_init_param *rpm;
|
||||
|
||||
if(c->port == 502)
|
||||
if (c->port == 502) {
|
||||
pmc = usrdata->channels + usrdata->num_channels - 1;
|
||||
|
||||
if (!pmc) {
|
||||
return;
|
||||
}
|
||||
|
||||
printk("IKC init: cpu=%d port=%d\n", cpu, c->port);
|
||||
|
||||
phys = ihk_device_map_memory(ihk_os_to_dev(os), rphys,
|
||||
sizeof(struct ikc_scd_init_param));
|
||||
#ifdef CONFIG_MIC
|
||||
rpm = ioremap_wc(phys, sizeof(struct ikc_scd_init_param));
|
||||
#else
|
||||
rpm = ihk_device_map_virtual(ihk_os_to_dev(os), phys,
|
||||
sizeof(struct ikc_scd_init_param),
|
||||
NULL, 0);
|
||||
#endif
|
||||
|
||||
pmc->param.request_va =
|
||||
(void *)__get_free_pages(GFP_KERNEL,
|
||||
REQUEST_SHIFT - PAGE_SHIFT);
|
||||
pmc->param.request_pa = virt_to_phys(pmc->param.request_va);
|
||||
pmc->param.doorbell_va = usrdata->mcctrl_doorbell_va;
|
||||
pmc->param.doorbell_pa = usrdata->mcctrl_doorbell_pa;
|
||||
pmc->param.post_va = (void *)__get_free_page(GFP_KERNEL);
|
||||
pmc->param.post_pa = virt_to_phys(pmc->param.post_va);
|
||||
memset(pmc->param.doorbell_va, 0, PAGE_SIZE);
|
||||
memset(pmc->param.request_va, 0, PAGE_SIZE);
|
||||
memset(pmc->param.post_va, 0, PAGE_SIZE);
|
||||
|
||||
pmc->param.response_rpa = rpm->response_page;
|
||||
pmc->param.response_pa
|
||||
= ihk_device_map_memory(ihk_os_to_dev(os),
|
||||
pmc->param.response_rpa,
|
||||
PAGE_SIZE);
|
||||
#ifdef CONFIG_MIC
|
||||
pmc->param.response_va = ioremap_cache(pmc->param.response_pa,
|
||||
PAGE_SIZE);
|
||||
#else
|
||||
pmc->param.response_va = ihk_device_map_virtual(ihk_os_to_dev(os),
|
||||
pmc->param.response_pa,
|
||||
PAGE_SIZE, NULL, 0);
|
||||
#endif
|
||||
|
||||
pmc->dma_buf = (void *)__get_free_pages(GFP_KERNEL,
|
||||
DMA_PIN_SHIFT - PAGE_SHIFT);
|
||||
|
||||
rpm->request_page = pmc->param.request_pa;
|
||||
rpm->doorbell_page = pmc->param.doorbell_pa;
|
||||
rpm->post_page = pmc->param.post_pa;
|
||||
if (!pmc) {
|
||||
kprintf("%s: error: no channel found?\n", __FUNCTION__);
|
||||
return;
|
||||
}
|
||||
|
||||
packet.msg = SCD_MSG_INIT_CHANNEL_ACKED;
|
||||
packet.ref = cpu;
|
||||
packet.arg = rphys;
|
||||
|
||||
printk("Request: %lx, Response: %lx, Doorbell: %lx\n",
|
||||
pmc->param.request_pa, pmc->param.response_rpa,
|
||||
pmc->param.doorbell_pa);
|
||||
printk("Request: %p, Response: %p, Doorbell: %p\n",
|
||||
pmc->param.request_va, pmc->param.response_va,
|
||||
pmc->param.doorbell_va);
|
||||
|
||||
ihk_ikc_send(pmc->c, &packet, 0);
|
||||
|
||||
#ifdef CONFIG_MIC
|
||||
iounmap(rpm);
|
||||
#else
|
||||
ihk_device_unmap_virtual(ihk_os_to_dev(os), rpm,
|
||||
sizeof(struct ikc_scd_init_param));
|
||||
#endif
|
||||
|
||||
ihk_device_unmap_memory(ihk_os_to_dev(os), phys,
|
||||
sizeof(struct ikc_scd_init_param));
|
||||
}
|
||||
|
||||
static int connect_handler(struct ihk_ikc_channel_info *param)
|
||||
static int connect_handler_ikc2linux(struct ihk_ikc_channel_info *param)
|
||||
{
|
||||
struct ihk_ikc_channel_desc *c;
|
||||
int cpu;
|
||||
ihk_os_t os = param->channel->remote_os;
|
||||
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
|
||||
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
|
||||
int linux_cpu;
|
||||
|
||||
c = param->channel;
|
||||
cpu = c->send.queue->read_cpu;
|
||||
linux_cpu = c->send.queue->write_cpu;
|
||||
if (linux_cpu > nr_cpu_ids) {
|
||||
kprintf("%s: invalid Linux CPU id %d\n",
|
||||
__FUNCTION__, linux_cpu);
|
||||
return -1;
|
||||
}
|
||||
dkprintf("%s: Linux CPU: %d\n", __FUNCTION__, linux_cpu);
|
||||
|
||||
if (cpu < 0 || cpu >= usrdata->num_channels) {
|
||||
kprintf("Invalid connect source processor: %d\n", cpu);
|
||||
param->packet_handler = syscall_packet_handler;
|
||||
usrdata->ikc2linux[linux_cpu] = c;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int connect_handler_ikc2mckernel(struct ihk_ikc_channel_info *param)
|
||||
{
|
||||
struct ihk_ikc_channel_desc *c;
|
||||
int mck_cpu;
|
||||
ihk_os_t os = param->channel->remote_os;
|
||||
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
|
||||
|
||||
c = param->channel;
|
||||
mck_cpu = c->send.queue->read_cpu;
|
||||
|
||||
if (mck_cpu < 0 || mck_cpu >= usrdata->num_channels) {
|
||||
kprintf("Invalid connect source processor: %d\n", mck_cpu);
|
||||
return 1;
|
||||
}
|
||||
param->packet_handler = syscall_packet_handler;
|
||||
|
||||
INIT_LIST_HEAD(&usrdata->channels[cpu].wq_list);
|
||||
spin_lock_init(&usrdata->channels[cpu].wq_list_lock);
|
||||
param->packet_handler = dummy_packet_handler;
|
||||
|
||||
usrdata->channels[cpu].c = c;
|
||||
kprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
|
||||
usrdata->channels[mck_cpu].c = c;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int connect_handler2(struct ihk_ikc_channel_info *param)
|
||||
{
|
||||
struct ihk_ikc_channel_desc *c;
|
||||
int cpu;
|
||||
ihk_os_t os = param->channel->remote_os;
|
||||
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
|
||||
|
||||
c = param->channel;
|
||||
cpu = usrdata->num_channels - 1;
|
||||
|
||||
param->packet_handler = syscall_packet_handler;
|
||||
|
||||
INIT_LIST_HEAD(&usrdata->channels[cpu].wq_list);
|
||||
spin_lock_init(&usrdata->channels[cpu].wq_list_lock);
|
||||
|
||||
usrdata->channels[cpu].c = c;
|
||||
kprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct ihk_ikc_listen_param listen_param = {
|
||||
.port = 501,
|
||||
.handler = connect_handler,
|
||||
static struct ihk_ikc_listen_param lp_ikc2linux = {
|
||||
.port = 503,
|
||||
.ikc_direction = IHK_IKC_DIRECTION_RECV,
|
||||
.handler = connect_handler_ikc2linux,
|
||||
.pkt_size = sizeof(struct ikc_scd_packet),
|
||||
.queue_size = PAGE_SIZE,
|
||||
.queue_size = PAGE_SIZE * 4,
|
||||
.magic = 0x1129,
|
||||
};
|
||||
|
||||
static struct ihk_ikc_listen_param listen_param2 = {
|
||||
.port = 502,
|
||||
.handler = connect_handler2,
|
||||
static struct ihk_ikc_listen_param lp_ikc2mckernel = {
|
||||
.port = 501,
|
||||
.ikc_direction = IHK_IKC_DIRECTION_SEND,
|
||||
.handler = connect_handler_ikc2mckernel,
|
||||
.pkt_size = sizeof(struct ikc_scd_packet),
|
||||
.queue_size = PAGE_SIZE,
|
||||
.queue_size = PAGE_SIZE * 4,
|
||||
.magic = 0x1329,
|
||||
};
|
||||
|
||||
int prepare_ikc_channels(ihk_os_t os)
|
||||
{
|
||||
struct ihk_cpu_info *info;
|
||||
struct mcctrl_usrdata *usrdata;
|
||||
int error;
|
||||
struct mcctrl_usrdata *usrdata;
|
||||
int i;
|
||||
int ret = 0;
|
||||
|
||||
usrdata = kzalloc(sizeof(struct mcctrl_usrdata), GFP_KERNEL);
|
||||
usrdata->mcctrl_doorbell_va = (void *)__get_free_page(GFP_KERNEL);
|
||||
usrdata->mcctrl_doorbell_pa = virt_to_phys(usrdata->mcctrl_doorbell_va);
|
||||
|
||||
info = ihk_os_get_cpu_info(os);
|
||||
if (!info) {
|
||||
printk("Error: cannot retrieve CPU info.\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
if (info->n_cpus < 1) {
|
||||
printk("Error: # of cpu is invalid.\n");
|
||||
return -EINVAL;
|
||||
if (!usrdata) {
|
||||
printk("%s: error: allocating mcctrl_usrdata\n", __FUNCTION__);
|
||||
ret = -ENOMEM;
|
||||
goto error;
|
||||
}
|
||||
|
||||
usrdata->num_channels = info->n_cpus + 1;
|
||||
usrdata->channels = kzalloc(sizeof(struct mcctrl_channel) * usrdata->num_channels,
|
||||
GFP_KERNEL);
|
||||
usrdata->cpu_info = ihk_os_get_cpu_info(os);
|
||||
usrdata->mem_info = ihk_os_get_memory_info(os);
|
||||
|
||||
if (!usrdata->cpu_info || !usrdata->mem_info) {
|
||||
printk("%s: cannot obtain OS CPU and memory information.\n",
|
||||
__FUNCTION__);
|
||||
ret = -EINVAL;
|
||||
goto error;
|
||||
}
|
||||
|
||||
if (usrdata->cpu_info->n_cpus < 1) {
|
||||
printk("%s: Error: # of cpu is invalid.\n", __FUNCTION__);
|
||||
ret = -EINVAL;
|
||||
goto error;
|
||||
}
|
||||
|
||||
usrdata->num_channels = usrdata->cpu_info->n_cpus;
|
||||
usrdata->channels = kzalloc(sizeof(struct mcctrl_channel) *
|
||||
usrdata->num_channels,
|
||||
GFP_KERNEL);
|
||||
|
||||
if (!usrdata->channels) {
|
||||
printk("Error: cannot allocate channels.\n");
|
||||
return -ENOMEM;
|
||||
ret = -ENOMEM;
|
||||
goto error;
|
||||
}
|
||||
|
||||
usrdata->ikc2linux = kzalloc(sizeof(struct ihk_ikc_channel_desc *) *
|
||||
nr_cpu_ids, GFP_KERNEL);
|
||||
|
||||
if (!usrdata->ikc2linux) {
|
||||
printk("Error: cannot allocate ikc2linux channels.\n");
|
||||
ret = -ENOMEM;
|
||||
goto error;
|
||||
}
|
||||
|
||||
usrdata->os = os;
|
||||
init_waitqueue_head(&usrdata->wq_prepare);
|
||||
ihk_host_os_set_usrdata(os, usrdata);
|
||||
memcpy(&usrdata->listen_param, &listen_param, sizeof listen_param);
|
||||
ihk_ikc_listen_port(os, &usrdata->listen_param);
|
||||
memcpy(&usrdata->listen_param2, &listen_param2, sizeof listen_param2);
|
||||
ihk_ikc_listen_port(os, &usrdata->listen_param2);
|
||||
|
||||
INIT_LIST_HEAD(&usrdata->per_proc_list);
|
||||
spin_lock_init(&usrdata->per_proc_list_lock);
|
||||
ihk_ikc_listen_port(os, &lp_ikc2linux);
|
||||
ihk_ikc_listen_port(os, &lp_ikc2mckernel);
|
||||
|
||||
init_waitqueue_head(&usrdata->wq_procfs);
|
||||
mutex_init(&usrdata->reserve_lock);
|
||||
|
||||
for (i = 0; i < MCCTRL_PER_PROC_DATA_HASH_SIZE; ++i) {
|
||||
INIT_LIST_HEAD(&usrdata->per_proc_data_hash[i]);
|
||||
rwlock_init(&usrdata->per_proc_data_hash_lock[i]);
|
||||
}
|
||||
|
||||
INIT_LIST_HEAD(&usrdata->cpu_topology_list);
|
||||
INIT_LIST_HEAD(&usrdata->node_topology_list);
|
||||
|
||||
error = init_peer_channel_registry(usrdata);
|
||||
if (error) {
|
||||
return error;
|
||||
}
|
||||
mutex_init(&usrdata->part_exec.lock);
|
||||
INIT_LIST_HEAD(&usrdata->part_exec.pli_list);
|
||||
usrdata->part_exec.nr_processes = -1;
|
||||
|
||||
return 0;
|
||||
|
||||
error:
|
||||
if (usrdata) {
|
||||
if (usrdata->channels) kfree(usrdata->channels);
|
||||
if (usrdata->ikc2linux) kfree(usrdata->ikc2linux);
|
||||
kfree(usrdata);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void __destroy_ikc_channel(ihk_os_t os, struct mcctrl_channel *pmc)
|
||||
{
|
||||
free_pages((unsigned long)pmc->param.request_va,
|
||||
REQUEST_SHIFT - PAGE_SHIFT);
|
||||
free_page((unsigned long)pmc->param.post_va);
|
||||
|
||||
#ifdef CONFIG_MIC
|
||||
iounmap(pmc->param.response_va);
|
||||
#else
|
||||
ihk_device_unmap_virtual(ihk_os_to_dev(os), pmc->param.response_va,
|
||||
PAGE_SIZE);
|
||||
#endif
|
||||
ihk_device_unmap_memory(ihk_os_to_dev(os),
|
||||
pmc->param.response_pa, PAGE_SIZE);
|
||||
free_pages((unsigned long)pmc->dma_buf,
|
||||
DMA_PIN_SHIFT - PAGE_SHIFT);
|
||||
return;
|
||||
}
|
||||
|
||||
void destroy_ikc_channels(ihk_os_t os)
|
||||
@ -384,19 +376,32 @@ void destroy_ikc_channels(ihk_os_t os)
|
||||
int i;
|
||||
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
|
||||
|
||||
if (!usrdata) {
|
||||
printk("%s: WARNING: no mcctrl_usrdata found\n", __FUNCTION__);
|
||||
return;
|
||||
}
|
||||
|
||||
ihk_host_os_set_usrdata(os, NULL);
|
||||
|
||||
for (i = 0; i < usrdata->num_channels; i++) {
|
||||
if (usrdata->channels[i].c) {
|
||||
// ihk_ikc_disconnect(usrdata->channels[i].c);
|
||||
ihk_ikc_free_channel(usrdata->channels[i].c);
|
||||
__destroy_ikc_channel(os, usrdata->channels + i);
|
||||
printk("Channel #%d freed.\n", i);
|
||||
ihk_ikc_destroy_channel(usrdata->channels[i].c);
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < nr_cpu_ids; i++) {
|
||||
if (usrdata->ikc2linux[i]) {
|
||||
ihk_ikc_destroy_channel(usrdata->ikc2linux[i]);
|
||||
}
|
||||
}
|
||||
free_page((unsigned long)usrdata->mcctrl_doorbell_va);
|
||||
|
||||
destroy_peer_channel_registry(usrdata);
|
||||
kfree(usrdata->channels);
|
||||
kfree(usrdata->ikc2linux);
|
||||
kfree(usrdata);
|
||||
}
|
||||
|
||||
void
|
||||
mcctrl_eventfd(ihk_os_t os, struct ikc_scd_packet *pisp)
|
||||
{
|
||||
ihk_os_eventfd(os, 0);
|
||||
}
|
||||
|
||||
@ -41,6 +41,7 @@
|
||||
#include <ikc/master.h>
|
||||
#include <ihk/msr.h>
|
||||
#include <linux/semaphore.h>
|
||||
#include <linux/rwlock.h>
|
||||
#include <linux/threads.h>
|
||||
#include "sysfs.h"
|
||||
|
||||
@ -48,6 +49,7 @@
|
||||
#define SCD_MSG_PREPARE_PROCESS_ACKED 0x2
|
||||
#define SCD_MSG_PREPARE_PROCESS_NACKED 0x7
|
||||
#define SCD_MSG_SCHEDULE_PROCESS 0x3
|
||||
#define SCD_MSG_WAKE_UP_SYSCALL_THREAD 0x14
|
||||
|
||||
#define SCD_MSG_INIT_CHANNEL 0x5
|
||||
#define SCD_MSG_INIT_CHANNEL_ACKED 0x6
|
||||
@ -57,8 +59,8 @@
|
||||
#define SCD_MSG_CLEANUP_PROCESS 0x9
|
||||
#define SCD_MSG_GET_VDSO_INFO 0xa
|
||||
|
||||
#define SCD_MSG_GET_CPU_MAPPING 0xc
|
||||
#define SCD_MSG_REPLY_GET_CPU_MAPPING 0xd
|
||||
//#define SCD_MSG_GET_CPU_MAPPING 0xc
|
||||
//#define SCD_MSG_REPLY_GET_CPU_MAPPING 0xd
|
||||
|
||||
#define SCD_MSG_PROCFS_CREATE 0x10
|
||||
#define SCD_MSG_PROCFS_DELETE 0x11
|
||||
@ -90,6 +92,14 @@
|
||||
#define SCD_MSG_PROCFS_TID_CREATE 0x44
|
||||
#define SCD_MSG_PROCFS_TID_DELETE 0x45
|
||||
|
||||
#define SCD_MSG_EVENTFD 0x46
|
||||
|
||||
#define SCD_MSG_PERF_CTRL 0x50
|
||||
#define SCD_MSG_PERF_ACK 0x51
|
||||
|
||||
#define SCD_MSG_CPU_RW_REG 0x52
|
||||
#define SCD_MSG_CPU_RW_REG_RESP 0x53
|
||||
|
||||
#define DMA_PIN_SHIFT 21
|
||||
|
||||
#define DO_USER_MODE
|
||||
@ -101,6 +111,12 @@ struct coretable {
|
||||
unsigned long addr;
|
||||
};
|
||||
|
||||
enum mcctrl_os_cpu_operation {
|
||||
MCCTRL_OS_CPU_READ_REGISTER,
|
||||
MCCTRL_OS_CPU_WRITE_REGISTER,
|
||||
MCCTRL_OS_CPU_MAX_OP
|
||||
};
|
||||
|
||||
struct ikc_scd_packet {
|
||||
int msg;
|
||||
int err;
|
||||
@ -110,8 +126,9 @@ struct ikc_scd_packet {
|
||||
int ref;
|
||||
int osnum;
|
||||
int pid;
|
||||
int padding;
|
||||
unsigned long arg;
|
||||
struct syscall_request req;
|
||||
unsigned long resp_pa;
|
||||
};
|
||||
|
||||
/* for SCD_MSG_SYSFS_* */
|
||||
@ -120,7 +137,20 @@ struct ikc_scd_packet {
|
||||
long sysfs_arg2;
|
||||
long sysfs_arg3;
|
||||
};
|
||||
|
||||
/* SCD_MSG_SCHEDULE_THREAD */
|
||||
struct {
|
||||
int ttid;
|
||||
};
|
||||
|
||||
/* SCD_MSG_CPU_RW_REG */
|
||||
struct {
|
||||
struct ihk_os_cpu_register desc;
|
||||
enum mcctrl_os_cpu_operation op;
|
||||
void *resp;
|
||||
};
|
||||
};
|
||||
char padding[12];
|
||||
};
|
||||
|
||||
struct mcctrl_priv {
|
||||
@ -154,24 +184,51 @@ struct syscall_params {
|
||||
struct wait_queue_head_list_node {
|
||||
struct list_head list;
|
||||
wait_queue_head_t wq_syscall;
|
||||
int pid;
|
||||
struct task_struct *task;
|
||||
/* Denotes an exclusive wait for requester TID rtid */
|
||||
int rtid;
|
||||
int req;
|
||||
struct ikc_scd_packet *packet;
|
||||
};
|
||||
|
||||
struct mcctrl_channel {
|
||||
struct ihk_ikc_channel_desc *c;
|
||||
struct syscall_params param;
|
||||
struct ikc_scd_init_param init;
|
||||
void *dma_buf;
|
||||
|
||||
struct list_head wq_list;
|
||||
ihk_spinlock_t wq_list_lock;
|
||||
};
|
||||
|
||||
struct mcctrl_per_thread_data {
|
||||
struct list_head hash;
|
||||
struct task_struct *task;
|
||||
void *data;
|
||||
};
|
||||
|
||||
#define MCCTRL_PER_THREAD_DATA_HASH_SHIFT 8
|
||||
#define MCCTRL_PER_THREAD_DATA_HASH_SIZE (1 << MCCTRL_PER_THREAD_DATA_HASH_SHIFT)
|
||||
#define MCCTRL_PER_THREAD_DATA_HASH_MASK (MCCTRL_PER_THREAD_DATA_HASH_SIZE - 1)
|
||||
|
||||
struct mcctrl_per_proc_data {
|
||||
struct list_head list;
|
||||
struct mcctrl_usrdata *ud;
|
||||
struct list_head hash;
|
||||
int pid;
|
||||
unsigned long rpgtable; /* per process, not per OS */
|
||||
|
||||
struct list_head wq_list; /* All these requests come from mcexec */
|
||||
struct list_head wq_req_list; /* These requests come from IKC IRQ handler (can be processed by any threads) */
|
||||
struct list_head wq_list_exact; /* These requests come from IKC IRQ handler targeting a particular thread */
|
||||
|
||||
ihk_spinlock_t wq_list_lock;
|
||||
wait_queue_head_t wq_prepare;
|
||||
wait_queue_head_t wq_procfs;
|
||||
|
||||
struct list_head per_thread_data_hash[MCCTRL_PER_THREAD_DATA_HASH_SIZE];
|
||||
rwlock_t per_thread_data_hash_lock[MCCTRL_PER_THREAD_DATA_HASH_SIZE];
|
||||
cpumask_t cpu_set;
|
||||
int ikc_target_cpu;
|
||||
atomic_t refcount;
|
||||
|
||||
struct list_head devobj_pager_list;
|
||||
struct semaphore devobj_pager_lock;
|
||||
};
|
||||
|
||||
struct sysfsm_req {
|
||||
@ -199,11 +256,6 @@ static inline int sysfs_inited(struct sysfsm_data *sdp)
|
||||
return !!(sdp->sysfs_buf);
|
||||
} /* sysfs_inited() */
|
||||
|
||||
struct cpu_mapping {
|
||||
int cpu_number;
|
||||
int hw_id;
|
||||
};
|
||||
|
||||
struct cache_topology {
|
||||
struct ihk_cache_topology *saved;
|
||||
cpumask_t shared_cpu_map;
|
||||
@ -212,8 +264,9 @@ struct cache_topology {
|
||||
};
|
||||
|
||||
struct cpu_topology {
|
||||
struct cpu_mapping *cpu_mapping;
|
||||
//struct mcctrl_usrdata *udp;
|
||||
struct ihk_cpu_topology *saved;
|
||||
int mckernel_cpu_id;
|
||||
cpumask_t core_siblings;
|
||||
cpumask_t thread_siblings;
|
||||
|
||||
@ -221,41 +274,67 @@ struct cpu_topology {
|
||||
struct list_head cache_list;
|
||||
};
|
||||
|
||||
#define NODE_DISTANCE_S_SIZE 1024
|
||||
|
||||
struct node_topology {
|
||||
struct ihk_node_topology *saved;
|
||||
int mckernel_numa_id;
|
||||
char mckernel_numa_distance_s[NODE_DISTANCE_S_SIZE];
|
||||
cpumask_t cpumap;
|
||||
|
||||
struct list_head chain;
|
||||
};
|
||||
|
||||
struct process_list_item {
|
||||
int ready;
|
||||
struct task_struct *task;
|
||||
struct list_head list;
|
||||
wait_queue_head_t pli_wq;
|
||||
};
|
||||
|
||||
struct mcctrl_part_exec {
|
||||
struct mutex lock;
|
||||
int nr_processes;
|
||||
int nr_processes_left;
|
||||
cpumask_t cpus_used;
|
||||
struct list_head pli_list;
|
||||
};
|
||||
|
||||
#define CPU_LONGS (((NR_CPUS) + (BITS_PER_LONG) - 1) / (BITS_PER_LONG))
|
||||
|
||||
#define MCCTRL_PER_PROC_DATA_HASH_SHIFT 7
|
||||
#define MCCTRL_PER_PROC_DATA_HASH_SIZE (1 << MCCTRL_PER_PROC_DATA_HASH_SHIFT)
|
||||
#define MCCTRL_PER_PROC_DATA_HASH_MASK (MCCTRL_PER_PROC_DATA_HASH_SIZE - 1)
|
||||
|
||||
struct mcctrl_usrdata {
|
||||
struct ihk_ikc_listen_param listen_param;
|
||||
struct ihk_ikc_listen_param listen_param2;
|
||||
ihk_os_t os;
|
||||
int num_channels;
|
||||
struct mcctrl_channel *channels;
|
||||
unsigned long *mcctrl_doorbell_va;
|
||||
unsigned long mcctrl_doorbell_pa;
|
||||
/* Channels used for sending messages to LWK */
|
||||
struct mcctrl_channel *channels;
|
||||
/* Channels used for receiving messages from LWK */
|
||||
struct ihk_ikc_channel_desc **ikc2linux;
|
||||
int remaining_job;
|
||||
int base_cpu;
|
||||
int job_pos;
|
||||
int mcctrl_dma_abort;
|
||||
struct mutex reserve_lock;
|
||||
unsigned long last_thread_exec;
|
||||
wait_queue_head_t wq_prepare;
|
||||
|
||||
struct list_head per_proc_list;
|
||||
ihk_spinlock_t per_proc_list_lock;
|
||||
wait_queue_head_t wq_procfs;
|
||||
struct list_head per_proc_data_hash[MCCTRL_PER_PROC_DATA_HASH_SIZE];
|
||||
rwlock_t per_proc_data_hash_lock[MCCTRL_PER_PROC_DATA_HASH_SIZE];
|
||||
|
||||
void **keys;
|
||||
struct sysfsm_data sysfsm_data;
|
||||
unsigned long cpu_online[CPU_LONGS];
|
||||
int cpu_mapping_elems;
|
||||
int padding;
|
||||
struct cpu_mapping *cpu_mapping;
|
||||
long cpu_mapping_pa;
|
||||
struct ihk_cpu_info *cpu_info;
|
||||
struct ihk_mem_info *mem_info;
|
||||
nodemask_t numa_online;
|
||||
struct list_head cpu_topology_list;
|
||||
struct list_head node_topology_list;
|
||||
struct mcctrl_part_exec part_exec;
|
||||
int perf_event_num;
|
||||
};
|
||||
|
||||
struct mcctrl_signal {
|
||||
@ -273,14 +352,28 @@ int mcctrl_ikc_is_valid_thread(ihk_os_t os, int cpu);
|
||||
ihk_os_t osnum_to_os(int n);
|
||||
|
||||
/* syscall.c */
|
||||
int init_peer_channel_registry(struct mcctrl_usrdata *ud);
|
||||
void destroy_peer_channel_registry(struct mcctrl_usrdata *ud);
|
||||
int register_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch);
|
||||
int deregister_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch);
|
||||
struct mcctrl_channel *get_peer_channel(struct mcctrl_usrdata *ud, void *key);
|
||||
int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall_request *sc);
|
||||
void pager_add_process(void);
|
||||
void pager_remove_process(struct mcctrl_per_proc_data *ppd);
|
||||
|
||||
#define PROCFS_NAME_MAX 1000
|
||||
int __do_in_kernel_syscall(ihk_os_t os, struct ikc_scd_packet *packet);
|
||||
int mcctrl_add_per_proc_data(struct mcctrl_usrdata *ud, int pid,
|
||||
struct mcctrl_per_proc_data *ppd);
|
||||
int mcctrl_delete_per_proc_data(struct mcctrl_usrdata *ud, int pid);
|
||||
struct mcctrl_per_proc_data *mcctrl_get_per_proc_data(
|
||||
struct mcctrl_usrdata *ud, int pid);
|
||||
void mcctrl_put_per_proc_data(struct mcctrl_per_proc_data *ppd);
|
||||
|
||||
int mcctrl_add_per_thread_data(struct mcctrl_per_proc_data* ppd,
|
||||
struct task_struct *task, void *data);
|
||||
int mcctrl_delete_per_thread_data(struct mcctrl_per_proc_data* ppd,
|
||||
struct task_struct *task);
|
||||
inline struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(
|
||||
struct mcctrl_per_proc_data *ppd, struct task_struct *task);
|
||||
|
||||
void __return_syscall(ihk_os_t os, struct ikc_scd_packet *packet,
|
||||
long ret, int stid);
|
||||
|
||||
#define PROCFS_NAME_MAX 768
|
||||
|
||||
struct procfs_read {
|
||||
unsigned long pbuf; /* physical address of the host buffer (request) */
|
||||
@ -300,7 +393,8 @@ struct procfs_file {
|
||||
char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */
|
||||
};
|
||||
|
||||
void procfs_answer(unsigned int arg, int err);
|
||||
void procfs_answer(struct mcctrl_usrdata *ud, int pid);
|
||||
int procfsm_packet_handler(void *os, int msg, int pid, unsigned long arg);
|
||||
void add_tid_entry(int osnum, int pid, int tid);
|
||||
void add_pid_entry(int osnum, int pid);
|
||||
void delete_tid_entry(int osnum, int pid, int tid);
|
||||
@ -347,4 +441,14 @@ struct get_cpu_mapping_req {
|
||||
wait_queue_head_t wq;
|
||||
};
|
||||
|
||||
struct ihk_perf_event_attr{
|
||||
unsigned long config;
|
||||
unsigned disabled:1;
|
||||
unsigned pinned:1;
|
||||
unsigned exclude_user:1;
|
||||
unsigned exclude_kernel:1;
|
||||
unsigned exclude_hv:1;
|
||||
unsigned exclude_idle:1;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@ -17,8 +17,10 @@
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/resource.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include "mcctrl.h"
|
||||
#include <linux/version.h>
|
||||
#include <linux/semaphore.h>
|
||||
|
||||
//#define PROCFS_DEBUG
|
||||
|
||||
@ -57,7 +59,6 @@ static const struct procfs_entry base_entry_stuff[];
|
||||
static const struct file_operations mckernel_forward_ro;
|
||||
static const struct file_operations mckernel_forward;
|
||||
|
||||
static DECLARE_WAIT_QUEUE_HEAD(procfsq);
|
||||
static ssize_t mckernel_procfs_read(struct file *file, char __user *buf,
|
||||
size_t nbytes, loff_t *ppos);
|
||||
|
||||
@ -81,7 +82,7 @@ struct procfs_list_entry {
|
||||
* file.
|
||||
*/
|
||||
LIST_HEAD(procfs_file_list);
|
||||
static ihk_spinlock_t procfs_file_list_lock;
|
||||
DEFINE_SEMAPHORE(procfs_file_list_lock);
|
||||
|
||||
static char *
|
||||
getpath(struct procfs_list_entry *e, char *buf, int bufsize)
|
||||
@ -104,14 +105,28 @@ getpath(struct procfs_list_entry *e, char *buf, int bufsize)
|
||||
/**
|
||||
* \brief Process SCD_MSG_PROCFS_ANSWER message.
|
||||
*
|
||||
* \param arg sent argument
|
||||
* \param err error info (redundant)
|
||||
* \param ud mcctrl_usrdata pointer
|
||||
* \param pid PID of the requesting process
|
||||
*/
|
||||
void
|
||||
procfs_answer(unsigned int arg, int err)
|
||||
void procfs_answer(struct mcctrl_usrdata *ud, int pid)
|
||||
{
|
||||
dprintk("procfs: received SCD_MSG_PROCFS_ANSWER message(err = %d).\n", err);
|
||||
wake_up_interruptible(&procfsq);
|
||||
struct mcctrl_per_proc_data *ppd = NULL;
|
||||
|
||||
if (pid > 0) {
|
||||
ppd = mcctrl_get_per_proc_data(ud, pid);
|
||||
|
||||
if (unlikely(!ppd)) {
|
||||
kprintf("%s: ERROR: no per-process structure for PID %d\n",
|
||||
__FUNCTION__, pid);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
wake_up_all(pid > 0 ? &ppd->wq_procfs : &ud->wq_procfs);
|
||||
|
||||
if (pid > 0) {
|
||||
mcctrl_put_per_proc_data(ppd);
|
||||
}
|
||||
}
|
||||
|
||||
static struct procfs_list_entry *
|
||||
@ -246,9 +261,11 @@ get_pid_cred(int pid)
|
||||
{
|
||||
struct task_struct *task = NULL;
|
||||
|
||||
if(pid > 0){
|
||||
if (pid > 0) {
|
||||
rcu_read_lock();
|
||||
task = pid_task(find_vpid(pid), PIDTYPE_PID);
|
||||
if(task){
|
||||
rcu_read_unlock();
|
||||
if (task) {
|
||||
return __task_cred(task);
|
||||
}
|
||||
}
|
||||
@ -375,67 +392,62 @@ _add_tid_entry(int osnum, int pid, int tid, const struct cred *cred)
|
||||
void
|
||||
add_tid_entry(int osnum, int pid, int tid)
|
||||
{
|
||||
unsigned long irqflag;
|
||||
const struct cred *cred = get_pid_cred(pid);
|
||||
|
||||
if(!cred)
|
||||
return;
|
||||
irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
|
||||
down(&procfs_file_list_lock);
|
||||
_add_tid_entry(osnum, pid, tid, cred);
|
||||
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
|
||||
up(&procfs_file_list_lock);
|
||||
}
|
||||
|
||||
void
|
||||
add_pid_entry(int osnum, int pid)
|
||||
{
|
||||
struct procfs_list_entry *parent;
|
||||
unsigned long irqflag;
|
||||
const struct cred *cred = get_pid_cred(pid);
|
||||
|
||||
if(!cred)
|
||||
return;
|
||||
irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
|
||||
down(&procfs_file_list_lock);
|
||||
parent = get_pid_entry(osnum, pid);
|
||||
add_procfs_entries(parent, pid_entry_stuff, cred->uid, cred->gid);
|
||||
_add_tid_entry(osnum, pid, pid, cred);
|
||||
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
|
||||
up(&procfs_file_list_lock);
|
||||
}
|
||||
|
||||
void
|
||||
delete_tid_entry(int osnum, int pid, int tid)
|
||||
{
|
||||
unsigned long irqflag;
|
||||
struct procfs_list_entry *e;
|
||||
|
||||
irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
|
||||
down(&procfs_file_list_lock);
|
||||
e = find_tid_entry(osnum, pid, tid);
|
||||
if(e)
|
||||
delete_procfs_entries(e);
|
||||
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
|
||||
up(&procfs_file_list_lock);
|
||||
}
|
||||
|
||||
void
|
||||
delete_pid_entry(int osnum, int pid)
|
||||
{
|
||||
unsigned long irqflag;
|
||||
struct procfs_list_entry *e;
|
||||
|
||||
irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
|
||||
down(&procfs_file_list_lock);
|
||||
e = find_pid_entry(osnum, pid);
|
||||
if(e)
|
||||
delete_procfs_entries(e);
|
||||
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
|
||||
up(&procfs_file_list_lock);
|
||||
}
|
||||
|
||||
void
|
||||
proc_exe_link(int osnum, int pid, const char *path)
|
||||
{
|
||||
struct procfs_list_entry *parent;
|
||||
unsigned long irqflag;
|
||||
kuid_t uid = KUIDT_INIT(0);
|
||||
kgid_t gid = KGIDT_INIT(0);
|
||||
|
||||
irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
|
||||
down(&procfs_file_list_lock);
|
||||
parent = find_pid_entry(osnum, pid);
|
||||
if(parent){
|
||||
struct procfs_list_entry *task;
|
||||
@ -451,7 +463,7 @@ proc_exe_link(int osnum, int pid, const char *path)
|
||||
uid, gid, path);
|
||||
}
|
||||
}
|
||||
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
|
||||
up(&procfs_file_list_lock);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -463,14 +475,13 @@ void
|
||||
procfs_init(int osnum)
|
||||
{
|
||||
struct procfs_list_entry *parent;
|
||||
unsigned long irqflag;
|
||||
kuid_t uid = KUIDT_INIT(0);
|
||||
kgid_t gid = KGIDT_INIT(0);
|
||||
|
||||
irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
|
||||
down(&procfs_file_list_lock);
|
||||
parent = get_base_entry(osnum);
|
||||
add_procfs_entries(parent, base_entry_stuff, uid, gid);
|
||||
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
|
||||
up(&procfs_file_list_lock);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -481,14 +492,14 @@ procfs_init(int osnum)
|
||||
void
|
||||
procfs_exit(int osnum)
|
||||
{
|
||||
unsigned long irqflag;
|
||||
struct procfs_list_entry *e;
|
||||
|
||||
irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
|
||||
down(&procfs_file_list_lock);
|
||||
e = find_base_entry(osnum);
|
||||
if(e)
|
||||
if (e) {
|
||||
delete_procfs_entries(e);
|
||||
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
|
||||
}
|
||||
up(&procfs_file_list_lock);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -497,36 +508,84 @@ procfs_exit(int osnum)
|
||||
* This function conforms to the 2) way of fs/proc/generic.c
|
||||
* from linux-2.6.39.4.
|
||||
*/
|
||||
static ssize_t
|
||||
mckernel_procfs_read(struct file *file, char __user *buf, size_t nbytes,
|
||||
loff_t *ppos)
|
||||
static ssize_t __mckernel_procfs_read_write(
|
||||
struct file *file,
|
||||
char __user *buf, size_t nbytes,
|
||||
loff_t *ppos, int read_write)
|
||||
{
|
||||
struct inode * inode = file->f_path.dentry->d_inode;
|
||||
struct inode * inode = file->f_inode;
|
||||
char *kern_buffer = NULL;
|
||||
int order = 0;
|
||||
volatile struct procfs_read *r = NULL;
|
||||
struct ikc_scd_packet isp;
|
||||
int ret;
|
||||
int ret, osnum, pid, retw;
|
||||
unsigned long pbuf;
|
||||
unsigned long count = nbytes;
|
||||
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
|
||||
struct proc_dir_entry *dp = PDE(inode);
|
||||
struct procfs_list_entry *e = dp->data;
|
||||
#else
|
||||
#else
|
||||
struct procfs_list_entry *e = PDE_DATA(inode);
|
||||
#endif
|
||||
#endif
|
||||
loff_t offset = *ppos;
|
||||
char pathbuf[PROCFS_NAME_MAX];
|
||||
char *path;
|
||||
char *path, *p;
|
||||
ihk_os_t os = NULL;
|
||||
struct mcctrl_usrdata *udp = NULL;
|
||||
struct mcctrl_per_proc_data *ppd = NULL;
|
||||
|
||||
path = getpath(e, pathbuf, 256);
|
||||
dprintk("mckernel_procfs_read: invoked for %s, offset: %lu, count: %d\n",
|
||||
path, offset, count);
|
||||
|
||||
if (count <= 0 || offset < 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
path = getpath(e, pathbuf, PROCFS_NAME_MAX);
|
||||
dprintk("%s: invoked for %s, offset: %lu, count: %lu\n",
|
||||
__FUNCTION__, path,
|
||||
(unsigned long)offset, count);
|
||||
|
||||
/* Verify OS number */
|
||||
ret = sscanf(path, "mcos%d/", &osnum);
|
||||
if (ret != 1) {
|
||||
printk("%s: error: couldn't determine OS number\n", __FUNCTION__);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (osnum != e->osnum) {
|
||||
printk("%s: error: OS numbers don't match\n", __FUNCTION__);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* Is this request for a specific process? */
|
||||
p = strchr(path, '/') + 1;
|
||||
ret = sscanf(p, "%d/", &pid);
|
||||
if (ret != 1) {
|
||||
pid = -1;
|
||||
}
|
||||
|
||||
os = osnum_to_os(osnum);
|
||||
if (!os) {
|
||||
printk("%s: error: no IHK OS data found for OS %d\n",
|
||||
__FUNCTION__, osnum);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
udp = ihk_host_os_get_usrdata(os);
|
||||
if (!udp) {
|
||||
printk("%s: error: no MCCTRL data found for OS %d\n",
|
||||
__FUNCTION__, osnum);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (pid > 0) {
|
||||
ppd = mcctrl_get_per_proc_data(udp, pid);
|
||||
|
||||
if (unlikely(!ppd)) {
|
||||
printk("%s: error: no per-process structure for PID %d",
|
||||
__FUNCTION__, pid);
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
while ((1 << order) < count) ++order;
|
||||
if (order > 12) {
|
||||
order -= 12;
|
||||
@ -538,10 +597,11 @@ mckernel_procfs_read(struct file *file, char __user *buf, size_t nbytes,
|
||||
/* NOTE: we need physically contigous memory to pass through IKC */
|
||||
kern_buffer = (char *)__get_free_pages(GFP_KERNEL, order);
|
||||
if (!kern_buffer) {
|
||||
printk("mckernel_procfs_read(): ERROR: allocating kernel buffer\n");
|
||||
return -ENOMEM;
|
||||
printk("%s: ERROR: allocating kernel buffer\n", __FUNCTION__);
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
|
||||
pbuf = virt_to_phys(kern_buffer);
|
||||
|
||||
r = kmalloc(sizeof(struct procfs_read), GFP_KERNEL);
|
||||
@ -555,152 +615,96 @@ mckernel_procfs_read(struct file *file, char __user *buf, size_t nbytes,
|
||||
r->status = 0;
|
||||
r->offset = offset;
|
||||
r->count = count;
|
||||
r->readwrite = 0;
|
||||
r->readwrite = read_write;
|
||||
strncpy((char *)r->fname, path, PROCFS_NAME_MAX);
|
||||
isp.msg = SCD_MSG_PROCFS_REQUEST;
|
||||
isp.ref = 0;
|
||||
isp.arg = virt_to_phys(r);
|
||||
|
||||
ret = mcctrl_ikc_send(osnum_to_os(e->osnum), 0, &isp);
|
||||
|
||||
isp.pid = pid;
|
||||
|
||||
ret = mcctrl_ikc_send(osnum_to_os(e->osnum),
|
||||
(pid > 0) ? ppd->ikc_target_cpu : 0, &isp);
|
||||
|
||||
if (ret < 0) {
|
||||
goto out; /* error */
|
||||
}
|
||||
|
||||
|
||||
/* Wait for a reply. */
|
||||
ret = -EIO; /* default exit code */
|
||||
dprintk("now wait for a relpy\n");
|
||||
|
||||
/* Wait for the status field of the procfs_read structure set ready. */
|
||||
if (wait_event_interruptible_timeout(procfsq, r->status != 0, HZ) == 0) {
|
||||
kprintf("ERROR: mckernel_procfs_read: timeout (1 sec).\n");
|
||||
dprintk("%s: waiting for reply\n", __FUNCTION__);
|
||||
|
||||
retry_wait:
|
||||
/* Wait for the status field of the procfs_read structure,
|
||||
* wait on per-process or OS specific data depending on
|
||||
* who the request is for.
|
||||
*/
|
||||
if (pid > 0) {
|
||||
retw = wait_event_interruptible_timeout(ppd->wq_procfs,
|
||||
r->status != 0, 5 * HZ);
|
||||
}
|
||||
else {
|
||||
retw = wait_event_interruptible_timeout(udp->wq_procfs,
|
||||
r->status != 0, 5 * HZ);
|
||||
}
|
||||
|
||||
/* Timeout? */
|
||||
if (retw == 0 && r->status == 0) {
|
||||
printk("%s: error: timeout (1 sec)\n", __FUNCTION__);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Wake up and check the result. */
|
||||
dprintk("mckernel_procfs_read: woke up. ret: %d, eof: %d\n", r->ret, r->eof);
|
||||
|
||||
if (r->ret > 0) {
|
||||
if (copy_to_user(buf, kern_buffer, r->ret)) {
|
||||
kprintf("ERROR: mckernel_procfs_read: copy_to_user failed.\n");
|
||||
ret = -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
/* Interrupted? */
|
||||
else if (retw == -ERESTARTSYS) {
|
||||
ret = -ERESTART;
|
||||
goto out;
|
||||
}
|
||||
/* Were we woken up by a reply to another procfs request? */
|
||||
else if (r->status == 0) {
|
||||
/* TODO: r->status is not set atomically, we could be woken
|
||||
* up with status == 0 and it could change to 1 while in this
|
||||
* code, we could potentially miss the wake_up()...
|
||||
*/
|
||||
printk("%s: stale wake-up, retrying\n", __FUNCTION__);
|
||||
goto retry_wait;
|
||||
}
|
||||
|
||||
/* Wake up and check the result. */
|
||||
dprintk("%s: woke up. ret: %d, eof: %d\n",
|
||||
__FUNCTION__, r->ret, r->eof);
|
||||
|
||||
if (r->ret > 0) {
|
||||
if (read_write == 0) {
|
||||
if (copy_to_user(buf, kern_buffer, r->ret)) {
|
||||
printk("%s: ERROR: copy_to_user failed.\n", __FUNCTION__);
|
||||
ret = -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
*ppos += r->ret;
|
||||
}
|
||||
ret = r->ret;
|
||||
|
||||
out:
|
||||
if(kern_buffer)
|
||||
if (ppd)
|
||||
mcctrl_put_per_proc_data(ppd);
|
||||
if (kern_buffer)
|
||||
free_pages((uintptr_t)kern_buffer, order);
|
||||
if(r)
|
||||
if (r)
|
||||
kfree((void *)r);
|
||||
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static ssize_t
|
||||
mckernel_procfs_write(struct file *file, const char __user *buf, size_t nbytes,
|
||||
loff_t *ppos)
|
||||
static ssize_t mckernel_procfs_read(struct file *file,
|
||||
char __user *buf, size_t nbytes, loff_t *ppos)
|
||||
{
|
||||
struct inode * inode = file->f_path.dentry->d_inode;
|
||||
char *kern_buffer = NULL;
|
||||
int order = 0;
|
||||
volatile struct procfs_read *r = NULL;
|
||||
struct ikc_scd_packet isp;
|
||||
int ret;
|
||||
unsigned long pbuf;
|
||||
unsigned long count = nbytes;
|
||||
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
|
||||
struct proc_dir_entry *dp = PDE(inode);
|
||||
struct procfs_list_entry *e = dp->data;
|
||||
#else
|
||||
struct procfs_list_entry *e = PDE_DATA(inode);
|
||||
#endif
|
||||
loff_t offset = *ppos;
|
||||
char pathbuf[PROCFS_NAME_MAX];
|
||||
char *path;
|
||||
return __mckernel_procfs_read_write(file, buf, nbytes, ppos, 0);
|
||||
}
|
||||
|
||||
path = getpath(e, pathbuf, 256);
|
||||
dprintk("mckernel_procfs_read: invoked for %s, offset: %lu, count: %d\n",
|
||||
path, offset, count);
|
||||
|
||||
if (count <= 0 || offset < 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
while ((1 << order) < count) ++order;
|
||||
if (order > 12) {
|
||||
order -= 12;
|
||||
}
|
||||
else {
|
||||
order = 1;
|
||||
}
|
||||
|
||||
/* NOTE: we need physically contigous memory to pass through IKC */
|
||||
kern_buffer = (char *)__get_free_pages(GFP_KERNEL, order);
|
||||
if (!kern_buffer) {
|
||||
printk("mckernel_procfs_read(): ERROR: allocating kernel buffer\n");
|
||||
return -ENOMEM;
|
||||
}
|
||||
if (copy_from_user(kern_buffer, buf, nbytes)) {
|
||||
ret = -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
pbuf = virt_to_phys(kern_buffer);
|
||||
|
||||
r = kmalloc(sizeof(struct procfs_read), GFP_KERNEL);
|
||||
if (r == NULL) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
dprintk("offset: %lx, count: %d, cpu: %d\n", offset, count, e->cpu);
|
||||
|
||||
r->pbuf = pbuf;
|
||||
r->eof = 0;
|
||||
r->ret = -EIO; /* default */
|
||||
r->status = 0;
|
||||
r->offset = offset;
|
||||
r->count = count;
|
||||
r->readwrite = 1;
|
||||
strncpy((char *)r->fname, path, PROCFS_NAME_MAX);
|
||||
isp.msg = SCD_MSG_PROCFS_REQUEST;
|
||||
isp.ref = 0;
|
||||
isp.arg = virt_to_phys(r);
|
||||
|
||||
ret = mcctrl_ikc_send(osnum_to_os(e->osnum), 0, &isp);
|
||||
|
||||
if (ret < 0) {
|
||||
goto out; /* error */
|
||||
}
|
||||
|
||||
/* Wait for a reply. */
|
||||
ret = -EIO; /* default exit code */
|
||||
dprintk("now wait for a relpy\n");
|
||||
|
||||
/* Wait for the status field of the procfs_read structure set ready. */
|
||||
if (wait_event_interruptible_timeout(procfsq, r->status != 0, HZ) == 0) {
|
||||
kprintf("ERROR: mckernel_procfs_read: timeout (1 sec).\n");
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Wake up and check the result. */
|
||||
dprintk("mckernel_procfs_read: woke up. ret: %d, eof: %d\n", r->ret, r->eof);
|
||||
|
||||
if (r->ret > 0) {
|
||||
*ppos += r->ret;
|
||||
}
|
||||
ret = r->ret;
|
||||
|
||||
out:
|
||||
if(kern_buffer)
|
||||
free_pages((uintptr_t)kern_buffer, order);
|
||||
if(r)
|
||||
kfree((void *)r);
|
||||
|
||||
return ret;
|
||||
static ssize_t mckernel_procfs_write(struct file *file,
|
||||
const char __user *buf, size_t nbytes, loff_t *ppos)
|
||||
{
|
||||
return __mckernel_procfs_read_write(file,
|
||||
(char __user *)buf, nbytes, ppos, 1);
|
||||
}
|
||||
|
||||
static loff_t
|
||||
@ -719,6 +723,57 @@ mckernel_procfs_lseek(struct file *file, loff_t offset, int orig)
|
||||
return file->f_pos;
|
||||
}
|
||||
|
||||
struct procfs_work {
|
||||
void *os;
|
||||
int msg;
|
||||
int pid;
|
||||
unsigned long arg;
|
||||
struct work_struct work;
|
||||
};
|
||||
|
||||
static void procfsm_work_main(struct work_struct *work0)
|
||||
{
|
||||
struct procfs_work *work = container_of(work0, struct procfs_work, work);
|
||||
|
||||
switch (work->msg) {
|
||||
case SCD_MSG_PROCFS_TID_CREATE:
|
||||
add_tid_entry(ihk_host_os_get_index(work->os), work->pid, work->arg);
|
||||
break;
|
||||
|
||||
case SCD_MSG_PROCFS_TID_DELETE:
|
||||
delete_tid_entry(ihk_host_os_get_index(work->os), work->pid, work->arg);
|
||||
break;
|
||||
|
||||
default:
|
||||
printk("%s: unknown work: msg: %d, pid: %d, arg: %lu)\n",
|
||||
__FUNCTION__, work->msg, work->pid, work->arg);
|
||||
break;
|
||||
}
|
||||
|
||||
kfree(work);
|
||||
return;
|
||||
}
|
||||
|
||||
int procfsm_packet_handler(void *os, int msg, int pid, unsigned long arg)
|
||||
{
|
||||
struct procfs_work *work = NULL;
|
||||
|
||||
work = kzalloc(sizeof(*work), GFP_ATOMIC);
|
||||
if (!work) {
|
||||
printk("%s: kzalloc failed\n", __FUNCTION__);
|
||||
return -1;
|
||||
}
|
||||
|
||||
work->os = os;
|
||||
work->msg = msg;
|
||||
work->pid = pid;
|
||||
work->arg = arg;
|
||||
INIT_WORK(&work->work, &procfsm_work_main);
|
||||
|
||||
schedule_work(&work->work);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct file_operations mckernel_forward_ro = {
|
||||
.llseek = mckernel_procfs_lseek,
|
||||
.read = mckernel_procfs_read,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -14,10 +14,11 @@
|
||||
#include <linux/slab.h>
|
||||
#include <linux/device.h>
|
||||
#include <linux/version.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include "mcctrl.h"
|
||||
#include "sysfs_msg.h"
|
||||
|
||||
#define dprintk(...) do { if (0) printk(KERN_DEBUG __VA_ARGS__); } while (0)
|
||||
#define dprintk(...) do { if (0) printk(__VA_ARGS__); } while (0)
|
||||
#define wprintk(...) do { if (1) printk(KERN_WARNING __VA_ARGS__); } while (0)
|
||||
#define eprintk(...) do { if (1) printk(KERN_ERR __VA_ARGS__); } while (0)
|
||||
|
||||
@ -277,8 +278,10 @@ release_i(struct sysfsm_node *np)
|
||||
|
||||
sdp = np->sdp;
|
||||
|
||||
if (np->server_ops && np->server_ops->release) {
|
||||
(*np->server_ops->release)(np->server_ops, np);
|
||||
if (np->type != SNT_DIR) {
|
||||
if (np->server_ops && np->server_ops->release) {
|
||||
(*np->server_ops->release)(np->server_ops, np);
|
||||
}
|
||||
}
|
||||
kfree(np->name);
|
||||
kfree(np);
|
||||
@ -718,8 +721,6 @@ unlink_i(struct sysfsm_node *np)
|
||||
else if (np->type == SNT_DIR) {
|
||||
if (np->parent != np) {
|
||||
kobject_del(&np->kobj);
|
||||
error = 0;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
else if (np->type == SNT_LINK) {
|
||||
@ -1231,9 +1232,16 @@ sysfsm_cleanup(ihk_os_t os)
|
||||
int error;
|
||||
ihk_device_t dev = ihk_os_to_dev(os);
|
||||
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
|
||||
struct sysfsm_data *sdp = &udp->sysfsm_data;
|
||||
struct sysfsm_data *sdp;
|
||||
struct sysfsm_node *np;
|
||||
|
||||
if (!udp) {
|
||||
printk("%s: WARNING: no mcctrl_usrdata found\n", __FUNCTION__);
|
||||
return;
|
||||
}
|
||||
|
||||
sdp = &udp->sysfsm_data;
|
||||
|
||||
dprintk("mcctrl:sysfsm_cleanup(%p)\n", os);
|
||||
|
||||
if (sdp->sysfs_buf) {
|
||||
@ -2094,9 +2102,16 @@ struct sysfsm_ops snooping_local_ops_s = {
|
||||
/**** local list ****/
|
||||
static ssize_t snooping_local_show_pbl(struct sysfsm_ops *ops, void *instance, void *buf, size_t bufsize)
|
||||
{
|
||||
size_t ret;
|
||||
const struct sysfsm_bitmap_param *p = instance;
|
||||
|
||||
return bitmap_scnlistprintf(buf, bufsize, p->ptr, p->nbits);
|
||||
ret = bitmap_scnlistprintf(buf, bufsize, p->ptr, p->nbits);
|
||||
if (ret < bufsize - 1) {
|
||||
sprintf(buf + ret, "\n");
|
||||
return ret + 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
} /* snooping_local_show_pbl() */
|
||||
|
||||
struct sysfsm_ops snooping_local_ops_pbl = {
|
||||
@ -2107,9 +2122,16 @@ struct sysfsm_ops snooping_local_ops_pbl = {
|
||||
/**** local map ****/
|
||||
static ssize_t snooping_local_show_pb(struct sysfsm_ops *ops, void *instance, void *buf, size_t bufsize)
|
||||
{
|
||||
size_t ret;
|
||||
const struct sysfsm_bitmap_param *p = instance;
|
||||
|
||||
return bitmap_scnprintf(buf, bufsize, p->ptr, p->nbits);
|
||||
ret = bitmap_scnprintf(buf, bufsize, p->ptr, p->nbits);
|
||||
if (ret < bufsize - 1) {
|
||||
sprintf(buf + ret, "\n");
|
||||
return ret + 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
} /* snooping_local_show_pb() */
|
||||
|
||||
struct sysfsm_ops snooping_local_ops_pb = {
|
||||
|
||||
@ -14,11 +14,11 @@
|
||||
#include <linux/slab.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/version.h>
|
||||
#include "../../config.h"
|
||||
#include "../../../config.h"
|
||||
#include "mcctrl.h"
|
||||
#include "sysfs_msg.h"
|
||||
|
||||
#define dprintk(...) do { if (0) printk(KERN_DEBUG __VA_ARGS__); } while (0)
|
||||
#define dprintk(...) do { if (0) printk(__VA_ARGS__); } while (0)
|
||||
#define wprintk(...) do { if (1) printk(KERN_WARNING __VA_ARGS__); } while (0)
|
||||
#define eprintk(...) do { if (1) printk(KERN_ERR __VA_ARGS__); } while (0)
|
||||
|
||||
@ -92,27 +92,19 @@ void setup_local_snooping_samples(ihk_os_t os)
|
||||
|
||||
void setup_local_snooping_files(ihk_os_t os)
|
||||
{
|
||||
struct ihk_cpu_info *info;
|
||||
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
|
||||
struct sysfsm_bitmap_param param;
|
||||
static unsigned long cpu_offline = 0x0;
|
||||
int i;
|
||||
int error;
|
||||
|
||||
info = ihk_os_get_cpu_info(os);
|
||||
if (!info) {
|
||||
eprintk("mcctrl:ihk_os_get_cpu_info failed.\n");
|
||||
return;
|
||||
}
|
||||
|
||||
memset(udp->cpu_online, 0, sizeof(udp->cpu_online));
|
||||
for (i = 0; i < info->n_cpus; i++) {
|
||||
udp->cpu_online[i / BITS_PER_LONG] =
|
||||
udp->cpu_online[i / BITS_PER_LONG] | (1 << (i % BITS_PER_LONG));
|
||||
for (i = 0; i < udp->cpu_info->n_cpus; i++) {
|
||||
set_bit(i, udp->cpu_online);
|
||||
}
|
||||
|
||||
param.nbits = CPU_LONGS * BITS_PER_LONG;
|
||||
param.ptr = udp->cpu_online;
|
||||
param.ptr = &udp->cpu_online;
|
||||
dprintk("mcctrl:setup_local_snooping_files: CPU_LONGS=%d, BITS_PER_LONG=%d\n",
|
||||
CPU_LONGS, BITS_PER_LONG);
|
||||
|
||||
@ -187,141 +179,122 @@ static void free_cpu_topology(struct mcctrl_usrdata *udp)
|
||||
return;
|
||||
} /* free_cpu_topology() */
|
||||
|
||||
static void free_cpu_mapping(struct mcctrl_usrdata *udp)
|
||||
{
|
||||
ihk_device_t dev = ihk_os_to_dev(udp->os);
|
||||
size_t size;
|
||||
|
||||
size = udp->cpu_mapping_elems * sizeof(struct cpu_mapping);
|
||||
ihk_device_unmap_virtual(dev, udp->cpu_mapping, size);
|
||||
ihk_device_unmap_memory(dev, udp->cpu_mapping_pa, size);
|
||||
|
||||
return;
|
||||
} /* free_cpu_mapping() */
|
||||
|
||||
void free_topology_info(ihk_os_t os)
|
||||
{
|
||||
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
|
||||
|
||||
if (!udp) {
|
||||
printk("%s: WARNING: no mcctrl_usrdata found\n", __FUNCTION__);
|
||||
return;
|
||||
}
|
||||
|
||||
free_node_topology(udp);
|
||||
free_cpu_topology(udp);
|
||||
free_cpu_mapping(udp);
|
||||
|
||||
return;
|
||||
} /* free_topology_info() */
|
||||
|
||||
void reply_get_cpu_mapping(long req_pa)
|
||||
/*
|
||||
* CPU and NUMA node mapping conversion functions.
|
||||
*/
|
||||
int mckernel_cpu_2_linux_cpu(struct mcctrl_usrdata *udp, int cpu_id)
|
||||
{
|
||||
struct get_cpu_mapping_req *req = phys_to_virt(req_pa);
|
||||
return (cpu_id < udp->cpu_info->n_cpus) ?
|
||||
udp->cpu_info->mapping[cpu_id] : -1;
|
||||
}
|
||||
|
||||
req->busy = 0;
|
||||
wake_up(&req->wq);
|
||||
|
||||
return;
|
||||
} /* reply_get_cpu_mapping() */
|
||||
|
||||
static int get_cpu_mapping(struct mcctrl_usrdata *udp)
|
||||
int mckernel_cpu_2_hw_id(struct mcctrl_usrdata *udp, int cpu_id)
|
||||
{
|
||||
int error;
|
||||
ihk_device_t dev = ihk_os_to_dev(udp->os);
|
||||
struct get_cpu_mapping_req *req = NULL;
|
||||
struct ikc_scd_packet packet;
|
||||
size_t size;
|
||||
return (cpu_id < udp->cpu_info->n_cpus) ?
|
||||
udp->cpu_info->hw_ids[cpu_id] : -1;
|
||||
}
|
||||
|
||||
dprintk("get_cpu_mapping(%p)\n", udp);
|
||||
|
||||
req = kmalloc(sizeof(*req), GFP_KERNEL);
|
||||
if (!req) {
|
||||
error = -ENOMEM;
|
||||
eprintk("mcctrl:get_cpu_mapping:kmalloc failed. %d\n", error);
|
||||
goto out;
|
||||
}
|
||||
|
||||
req->busy = 1;
|
||||
req->error = -1;
|
||||
init_waitqueue_head(&req->wq);
|
||||
|
||||
packet.msg = SCD_MSG_GET_CPU_MAPPING;
|
||||
packet.arg = virt_to_phys(req);
|
||||
|
||||
#define GET_CPU_MAPPING_CPU 0
|
||||
error = mcctrl_ikc_send(udp->os, GET_CPU_MAPPING_CPU, &packet);
|
||||
if (error) {
|
||||
eprintk("mcctrl:get_cpu_mapping:"
|
||||
"mcctrl_ikc_send failed. %d\n", error);
|
||||
goto out;
|
||||
}
|
||||
|
||||
error = wait_event_interruptible(req->wq, !req->busy);
|
||||
if (error) {
|
||||
eprintk("mcctrl:get_cpu_mapping:"
|
||||
"wait_event_interruptible failed. %d\n", error);
|
||||
req = NULL; /* XXX */
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (req->error) {
|
||||
error = req->error;
|
||||
eprintk("mcctrl:get_cpu_mapping:"
|
||||
"SCD_MSG_GET_CPU_MAPPING failed. %d\n", error);
|
||||
goto out;
|
||||
}
|
||||
|
||||
size = req->buf_elems * sizeof(struct cpu_mapping);
|
||||
udp->cpu_mapping_elems = req->buf_elems;
|
||||
udp->cpu_mapping_pa = ihk_device_map_memory(dev, req->buf_rpa, size);
|
||||
udp->cpu_mapping = ihk_device_map_virtual(
|
||||
dev, udp->cpu_mapping_pa, size, NULL, 0);
|
||||
|
||||
error = 0;
|
||||
out:
|
||||
dprintk("get_cpu_mapping(%p): %d\n", udp, error);
|
||||
kfree(req);
|
||||
return error;
|
||||
} /* get_cpu_mapping() */
|
||||
|
||||
static int hwid_to_cpu(struct mcctrl_usrdata *udp, int hw_id)
|
||||
int linux_cpu_2_mckernel_cpu(struct mcctrl_usrdata *udp, int cpu_id)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < udp->cpu_mapping_elems; ++i) {
|
||||
if (udp->cpu_mapping[i].hw_id == hw_id) {
|
||||
return udp->cpu_mapping[i].cpu_number;
|
||||
for (i = 0; i < udp->cpu_info->n_cpus; ++i) {
|
||||
if (udp->cpu_info->mapping[i] == cpu_id)
|
||||
return i;
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
#if 0
|
||||
int hw_id_2_mckernel_cpu(struct mcctrl_usrdata *udp, int hw_id)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < udp->cpu_info->n_cpus; ++i) {
|
||||
if (udp->cpu_info->hw_ids[i] == hw_id) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
int hw_id_2_linux_cpu(struct mcctrl_usrdata *udp, int hw_id)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < udp->cpu_info->n_cpus; ++i) {
|
||||
if (udp->cpu_info->hw_ids[i] == hw_id) {
|
||||
return mckernel_cpu_2_linux_cpu(udp, i);
|
||||
}
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
int linux_cpu_2_hw_id(struct mcctrl_usrdata *udp, int cpu)
|
||||
{
|
||||
int mckernel_cpu = linux_cpu_2_mckernel_cpu(udp, cpu);
|
||||
|
||||
return (mckernel_cpu >= 0 && mckernel_cpu < udp->cpu_info->n_cpus) ?
|
||||
udp->cpu_info->hw_ids[mckernel_cpu] : -1;
|
||||
}
|
||||
#endif
|
||||
|
||||
int mckernel_numa_2_linux_numa(struct mcctrl_usrdata *udp, int numa_id)
|
||||
{
|
||||
return (numa_id < udp->mem_info->n_numa_nodes) ?
|
||||
udp->mem_info->numa_mapping[numa_id] : -1;
|
||||
}
|
||||
|
||||
int linux_numa_2_mckernel_numa(struct mcctrl_usrdata *udp, int numa_id)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < udp->mem_info->n_numa_nodes; ++i) {
|
||||
if (udp->mem_info->numa_mapping[i] == numa_id)
|
||||
return i;
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
|
||||
static int translate_cpumap(struct mcctrl_usrdata *udp,
|
||||
cpumask_t *linmap, cpumask_t *mckmap)
|
||||
{
|
||||
int error;
|
||||
ihk_device_t dev = ihk_os_to_dev(udp->os);
|
||||
int lincpu;
|
||||
int hw_id;
|
||||
int mckcpu;
|
||||
|
||||
dprintk("translate_cpumap(%p,%p,%p)\n", udp, linmap, mckmap);
|
||||
cpumask_clear(mckmap);
|
||||
for_each_cpu(lincpu, linmap) {
|
||||
hw_id = ihk_device_linux_cpu_to_hw_id(dev, lincpu);
|
||||
if (hw_id < 0) {
|
||||
error = hw_id;
|
||||
eprintk("mcctrl:translate_cpumap:"
|
||||
"ihk_device_linux_cpu_to_hw_id failed."
|
||||
" %d\n", error);
|
||||
goto out;
|
||||
}
|
||||
mckcpu = linux_cpu_2_mckernel_cpu(udp, lincpu);
|
||||
|
||||
mckcpu = hwid_to_cpu(udp, hw_id);
|
||||
if (mckcpu >= 0) {
|
||||
cpumask_set_cpu(mckcpu, mckmap);
|
||||
}
|
||||
}
|
||||
|
||||
error = 0;
|
||||
out:
|
||||
dprintk("translate_cpumap(%p,%p,%p): %d\n", udp, linmap, mckmap, error);
|
||||
return error;
|
||||
} /* translate_cpumap() */
|
||||
@ -361,7 +334,7 @@ out:
|
||||
return (error)? ERR_PTR(error): topo;
|
||||
} /* get_cache_topology() */
|
||||
|
||||
static struct cpu_topology *get_cpu_topology_one(struct mcctrl_usrdata *udp,
|
||||
static struct cpu_topology *get_one_cpu_topology(struct mcctrl_usrdata *udp,
|
||||
int index)
|
||||
{
|
||||
int error;
|
||||
@ -370,41 +343,43 @@ static struct cpu_topology *get_cpu_topology_one(struct mcctrl_usrdata *udp,
|
||||
struct cache_topology *cache;
|
||||
struct ihk_cache_topology *saved_cache;
|
||||
|
||||
dprintk("get_cpu_topology_one(%p,%d)\n", udp, index);
|
||||
dprintk("get_one_cpu_topology(%p,%d)\n", udp, index);
|
||||
topology = kmalloc(sizeof(*topology), GFP_KERNEL);
|
||||
if (!topology) {
|
||||
error = -ENOMEM;
|
||||
eprintk("mcctrl:get_cpu_topology_one:"
|
||||
eprintk("mcctrl:get_one_cpu_topology:"
|
||||
"kmalloc failed. %d\n", error);
|
||||
goto out;
|
||||
}
|
||||
|
||||
INIT_LIST_HEAD(&topology->cache_list);
|
||||
topology->cpu_mapping = &udp->cpu_mapping[index];
|
||||
topology->mckernel_cpu_id = index;
|
||||
topology->saved = ihk_device_get_cpu_topology(dev,
|
||||
mckernel_cpu_2_hw_id(udp, index));
|
||||
|
||||
topology->saved = ihk_device_get_cpu_topology(
|
||||
dev, topology->cpu_mapping->hw_id);
|
||||
if (IS_ERR(topology->saved)) {
|
||||
error = PTR_ERR(topology->saved);
|
||||
eprintk("mcctrl:get_cpu_topology_one:"
|
||||
eprintk("mcctrl:get_one_cpu_topology:"
|
||||
"ihk_device_get_cpu_topology failed. %d\n",
|
||||
error);
|
||||
goto out;
|
||||
}
|
||||
|
||||
error = translate_cpumap(udp, &topology->saved->core_siblings,
|
||||
error = translate_cpumap(udp,
|
||||
&topology->saved->core_siblings,
|
||||
&topology->core_siblings);
|
||||
if (error) {
|
||||
eprintk("mcctrl:get_cpu_topology_one:"
|
||||
eprintk("mcctrl:get_one_cpu_topology:"
|
||||
"translate_cpumap(core_siblings) failed."
|
||||
" %d\n", error);
|
||||
goto out;
|
||||
}
|
||||
|
||||
error = translate_cpumap(udp, &topology->saved->thread_siblings,
|
||||
error = translate_cpumap(udp,
|
||||
&topology->saved->thread_siblings,
|
||||
&topology->thread_siblings);
|
||||
if (error) {
|
||||
eprintk("mcctrl:get_cpu_topology_one:"
|
||||
eprintk("mcctrl:get_one_cpu_topology:"
|
||||
"translate_cpumap(thread_siblings) failed."
|
||||
" %d\n", error);
|
||||
goto out;
|
||||
@ -415,7 +390,7 @@ static struct cpu_topology *get_cpu_topology_one(struct mcctrl_usrdata *udp,
|
||||
cache = get_cache_topology(udp, topology, saved_cache);
|
||||
if (IS_ERR(cache)) {
|
||||
error = PTR_ERR(cache);
|
||||
eprintk("mcctrl:get_cpu_topology_one:"
|
||||
eprintk("mcctrl:get_one_cpu_topology:"
|
||||
"get_cache_topology failed. %d\n",
|
||||
error);
|
||||
goto out;
|
||||
@ -429,10 +404,10 @@ out:
|
||||
if (error && !IS_ERR_OR_NULL(topology)) {
|
||||
free_cpu_topology_one(udp, topology);
|
||||
}
|
||||
dprintk("get_cpu_topology_one(%p,%d): %d %p\n",
|
||||
dprintk("get_one_cpu_topology(%p,%d): %d %p\n",
|
||||
udp, index, error, topology);
|
||||
return (error)? ERR_PTR(error): topology;
|
||||
} /* get_cpu_topology_one() */
|
||||
} /* get_one_cpu_topology() */
|
||||
|
||||
static int get_cpu_topology(struct mcctrl_usrdata *udp)
|
||||
{
|
||||
@ -441,12 +416,12 @@ static int get_cpu_topology(struct mcctrl_usrdata *udp)
|
||||
struct cpu_topology *topology;
|
||||
|
||||
dprintk("get_cpu_topology(%p)\n", udp);
|
||||
for (index = 0; index < udp->cpu_mapping_elems; ++index) {
|
||||
topology = get_cpu_topology_one(udp, index);
|
||||
for (index = 0; index < udp->cpu_info->n_cpus; ++index) {
|
||||
topology = get_one_cpu_topology(udp, index);
|
||||
if (IS_ERR(topology)) {
|
||||
error = PTR_ERR(topology);
|
||||
eprintk("mcctrl:get_cpu_topology:"
|
||||
"get_cpu_topology_one failed. %d\n",
|
||||
eprintk("mcctrl:get_cpu_topology: "
|
||||
"get_one_cpu_topology failed. %d\n",
|
||||
error);
|
||||
goto out;
|
||||
}
|
||||
@ -460,15 +435,15 @@ out:
|
||||
return error;
|
||||
} /* get_cpu_topology() */
|
||||
|
||||
static void setup_one_cache_files(struct mcctrl_usrdata *udp,
|
||||
static void setup_cpu_sysfs_cache_files(struct mcctrl_usrdata *udp,
|
||||
struct cpu_topology *cpu, struct cache_topology *cache)
|
||||
{
|
||||
char *prefix = "/sys/devices/system/cpu";
|
||||
int cpu_number = cpu->cpu_mapping->cpu_number;
|
||||
int cpu_number = cpu->mckernel_cpu_id;
|
||||
int index = cache->saved->index;
|
||||
struct sysfsm_bitmap_param param;
|
||||
|
||||
dprintk("setup_one_cache_files(%p,%p,%p)\n", udp, cpu, cache);
|
||||
dprintk("setup_cpu_sysfs_cache_files(%p,%p,%p)\n", udp, cpu, cache);
|
||||
|
||||
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_d64,
|
||||
&cache->saved->level, 0444,
|
||||
@ -509,19 +484,19 @@ static void setup_one_cache_files(struct mcctrl_usrdata *udp,
|
||||
"%s/cpu%d/cache/index%d/shared_cpu_list",
|
||||
prefix, cpu_number, index);
|
||||
|
||||
dprintk("setup_one_cache_files(%p,%p,%p):\n", udp, cpu, cache);
|
||||
dprintk("setup_cpu_sysfs_cache_files(%p,%p,%p):\n", udp, cpu, cache);
|
||||
return;
|
||||
} /* setup_one_cache_files() */
|
||||
} /* setup_cpu_sysfs_cache_files() */
|
||||
|
||||
static void setup_one_cpu_files(struct mcctrl_usrdata *udp,
|
||||
static void setup_cpu_sysfs_files(struct mcctrl_usrdata *udp,
|
||||
struct cpu_topology *cpu)
|
||||
{
|
||||
char *prefix = "/sys/devices/system/cpu";
|
||||
int cpu_number = cpu->cpu_mapping->cpu_number;
|
||||
int cpu_number = cpu->mckernel_cpu_id;
|
||||
struct sysfsm_bitmap_param param;
|
||||
struct cache_topology *cache;
|
||||
|
||||
dprintk("setup_one_cpu_files(%p,%p)\n", udp, cpu);
|
||||
dprintk("setup_cpu_sysfs_files(%p,%p)\n", udp, cpu);
|
||||
|
||||
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_d32,
|
||||
&cpu->saved->physical_package_id, 0444,
|
||||
@ -553,41 +528,61 @@ static void setup_one_cpu_files(struct mcctrl_usrdata *udp,
|
||||
prefix, cpu_number);
|
||||
|
||||
list_for_each_entry(cache, &cpu->cache_list, chain) {
|
||||
setup_one_cache_files(udp, cpu, cache);
|
||||
setup_cpu_sysfs_cache_files(udp, cpu, cache);
|
||||
}
|
||||
|
||||
dprintk("setup_one_cpu_files(%p,%p):\n", udp, cpu);
|
||||
dprintk("setup_cpu_sysfs_files(%p,%p):\n", udp, cpu);
|
||||
return;
|
||||
} /* setup_one_cpu_files() */
|
||||
} /* setup_cpu_sysfs_files() */
|
||||
|
||||
static void setup_cpu_files(struct mcctrl_usrdata *udp)
|
||||
static void setup_cpus_sysfs_files_node_link(struct mcctrl_usrdata *udp)
|
||||
{
|
||||
int error;
|
||||
int cpu;
|
||||
struct sysfs_handle handle;
|
||||
|
||||
for (cpu = 0; cpu < udp->cpu_info->n_cpus; ++cpu) {
|
||||
int node = linux_numa_2_mckernel_numa(udp,
|
||||
cpu_to_node(mckernel_cpu_2_linux_cpu(udp, cpu)));
|
||||
|
||||
error = sysfsm_lookupf(udp->os, &handle,
|
||||
"/sys/devices/system/node/node%d", node);
|
||||
if (error) {
|
||||
panic("sysfsm_lookupf: node for CPU");
|
||||
}
|
||||
|
||||
error = sysfsm_symlinkf(udp->os, handle,
|
||||
"/sys/devices/system/cpu/cpu%d/node%d",
|
||||
cpu, node);
|
||||
if (error) {
|
||||
panic("sysfsm_symlinkf(CPU in node)");
|
||||
}
|
||||
}
|
||||
|
||||
error = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
static void setup_cpus_sysfs_files(struct mcctrl_usrdata *udp)
|
||||
{
|
||||
int error;
|
||||
struct cpu_topology *cpu;
|
||||
|
||||
dprintk("setup_cpu_file(%p)\n", udp);
|
||||
error = get_cpu_mapping(udp);
|
||||
if (error) {
|
||||
eprintk("mcctrl:setup_cpu_files:"
|
||||
"get_cpu_mapping failed. %d\n", error);
|
||||
goto out;
|
||||
}
|
||||
|
||||
error = get_cpu_topology(udp);
|
||||
if (error) {
|
||||
eprintk("mcctrl:setup_cpu_files:"
|
||||
eprintk("mcctrl:setup_cpus_sysfs_files:"
|
||||
"get_cpu_topology failed. %d\n", error);
|
||||
goto out;
|
||||
}
|
||||
|
||||
list_for_each_entry(cpu, &udp->cpu_topology_list, chain) {
|
||||
setup_one_cpu_files(udp, cpu);
|
||||
setup_cpu_sysfs_files(udp, cpu);
|
||||
}
|
||||
error = 0;
|
||||
out:
|
||||
dprintk("setup_cpu_file(%p):\n", udp);
|
||||
return;
|
||||
} /* setup_cpu_files() */
|
||||
} /* setup_cpus_sysfs_files() */
|
||||
|
||||
static struct node_topology *get_one_node_topology(struct mcctrl_usrdata *udp,
|
||||
struct ihk_node_topology *saved)
|
||||
@ -629,8 +624,10 @@ static int get_node_topology(struct mcctrl_usrdata *udp)
|
||||
struct node_topology *topology;
|
||||
|
||||
dprintk("get_node_topology(%p)\n", udp);
|
||||
for (node = 0; ; ++node) {
|
||||
saved = ihk_device_get_node_topology(dev, node);
|
||||
for (node = 0; node < udp->mem_info->n_numa_nodes; ++node) {
|
||||
saved = ihk_device_get_node_topology(dev,
|
||||
mckernel_numa_2_linux_numa(udp, node));
|
||||
|
||||
if (IS_ERR(saved)) {
|
||||
break;
|
||||
}
|
||||
@ -647,6 +644,8 @@ static int get_node_topology(struct mcctrl_usrdata *udp)
|
||||
goto out;
|
||||
}
|
||||
|
||||
topology->mckernel_numa_id = node;
|
||||
|
||||
list_add(&topology->chain, &udp->node_topology_list);
|
||||
}
|
||||
|
||||
@ -659,6 +658,7 @@ out:
|
||||
static int setup_node_files(struct mcctrl_usrdata *udp)
|
||||
{
|
||||
int error;
|
||||
int node;
|
||||
struct node_topology *p;
|
||||
struct sysfsm_bitmap_param param;
|
||||
|
||||
@ -670,16 +670,71 @@ static int setup_node_files(struct mcctrl_usrdata *udp)
|
||||
goto out;
|
||||
}
|
||||
|
||||
memset(&udp->numa_online, 0, sizeof(udp->numa_online));
|
||||
for (node = 0; node < udp->mem_info->n_numa_nodes; ++node) {
|
||||
node_set(node, udp->numa_online);
|
||||
}
|
||||
|
||||
param.nbits = MAX_NUMNODES;
|
||||
param.ptr = &udp->numa_online;
|
||||
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_pbl, ¶m, 0444,
|
||||
"/sys/devices/system/node/online");
|
||||
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_pbl, ¶m, 0444,
|
||||
"/sys/devices/system/node/possible");
|
||||
|
||||
list_for_each_entry(p, &udp->node_topology_list, chain) {
|
||||
struct sysfs_handle handle;
|
||||
int cpu;
|
||||
size_t offset = 0;
|
||||
param.nbits = nr_cpumask_bits;
|
||||
param.ptr = &p->cpumap;
|
||||
|
||||
for (node = 0; node < udp->mem_info->n_numa_nodes; ++node) {
|
||||
if (node > 0) {
|
||||
offset += snprintf(&p->mckernel_numa_distance_s[offset],
|
||||
NODE_DISTANCE_S_SIZE - offset, "%s", " ");
|
||||
}
|
||||
offset += snprintf(&p->mckernel_numa_distance_s[offset],
|
||||
NODE_DISTANCE_S_SIZE - offset, "%d",
|
||||
node_distance(
|
||||
mckernel_numa_2_linux_numa(udp, p->mckernel_numa_id),
|
||||
mckernel_numa_2_linux_numa(udp, node)
|
||||
));
|
||||
}
|
||||
|
||||
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_s,
|
||||
p->mckernel_numa_distance_s, 0444,
|
||||
"/sys/devices/system/node/node%d/distance",
|
||||
p->mckernel_numa_id);
|
||||
|
||||
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_pb, ¶m, 0444,
|
||||
"/sys/devices/system/node/node%d/cpumap",
|
||||
p->saved->node_number);
|
||||
p->mckernel_numa_id);
|
||||
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_pbl, ¶m, 0444,
|
||||
"/sys/devices/system/node/node%d/cpulist",
|
||||
p->saved->node_number);
|
||||
p->mckernel_numa_id);
|
||||
|
||||
/* Add CPU symlinks for this node */
|
||||
for (cpu = 0; cpu < udp->cpu_info->n_cpus; ++cpu) {
|
||||
if (linux_numa_2_mckernel_numa(udp,
|
||||
cpu_to_node(mckernel_cpu_2_linux_cpu(udp, cpu)))
|
||||
!= p->mckernel_numa_id) {
|
||||
continue;
|
||||
}
|
||||
|
||||
error = sysfsm_lookupf(udp->os, &handle,
|
||||
"/sys/devices/system/cpu/cpu%d", cpu);
|
||||
if (error) {
|
||||
panic("sysfsm_lookupf(CPU in node)");
|
||||
}
|
||||
|
||||
error = sysfsm_symlinkf(udp->os, handle,
|
||||
"/sys/devices/system/node/node%d/cpu%d",
|
||||
p->mckernel_numa_id, cpu);
|
||||
if (error) {
|
||||
panic("sysfsm_symlinkf(CPU in node)");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
error = 0;
|
||||
@ -1026,11 +1081,18 @@ void setup_sysfs_files(ihk_os_t os)
|
||||
panic("sysfsm_unlinkf");
|
||||
}
|
||||
|
||||
setup_local_snooping_samples(os);
|
||||
//setup_local_snooping_samples(os);
|
||||
setup_local_snooping_files(os);
|
||||
setup_cpu_files(udp);
|
||||
setup_cpus_sysfs_files(udp);
|
||||
setup_node_files(udp);
|
||||
setup_pci_files(udp);
|
||||
setup_cpus_sysfs_files_node_link(udp);
|
||||
//setup_pci_files(udp);
|
||||
|
||||
/* Indicate sysfs files setup completion for boot script */
|
||||
error = sysfsm_mkdirf(os, NULL, "/sys/setup_complete");
|
||||
if (error) {
|
||||
panic("sysfsm_mkdir(complete)");
|
||||
}
|
||||
|
||||
return;
|
||||
} /* setup_files() */
|
||||
|
||||
@ -1,39 +1,45 @@
|
||||
KDIR ?= @KDIR@
|
||||
ARCH ?= @ARCH@
|
||||
KMODDIR=@KMODDIR@
|
||||
src = @abs_srcdir@
|
||||
ENABLE_MCOVERLAYFS=@ENABLE_MCOVERLAYFS@
|
||||
RELEASE=@UNAME_R@
|
||||
|
||||
RELEASE=$(shell uname -r)
|
||||
MAJOR=$(shell echo ${RELEASE} | sed -e 's/^\([0-9]*\).*/\1/')
|
||||
MINOR=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.\([0-9]*\).*/\1/')
|
||||
PATCH=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.[0-9]*.\([0-9]*\).*/\1/')
|
||||
LINUX_VERSION_CODE=$(shell expr \( ${MAJOR} \* 65536 \) + \( ${MINOR} \* 256 \) + ${PATCH})
|
||||
RHEL_RELEASE=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/')
|
||||
RHEL_RELEASE=$(shell if [ "${RELEASE}" == "${RHEL_RELEASE}" ]; then echo ""; else echo ${RHEL_RELEASE}; fi)
|
||||
RHEL_RELEASE_TMP=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/')
|
||||
RHEL_RELEASE=$(shell if [ "${RELEASE}" == "${RHEL_RELEASE_TMP}" ]; then echo ""; else echo ${RHEL_RELEASE_TMP}; fi)
|
||||
BUILD_MODULE_TMP=$(shell if [ "${RHEL_RELEASE}" == "" ]; then echo "org"; else echo "rhel"; fi)
|
||||
BUILD_MODULE=none
|
||||
#$(info "LINUX_VERSION_CODE: ${LINUX_VERSION_CODE}, RHEL_RELEASE: ${RHEL_RELEASE}")
|
||||
ifeq ($(ENABLE_MCOVERLAYFS),yes)
|
||||
ENABLE_BUILD=$(shell if ( [ ${LINUX_VERSION_CODE} -ge 262144 ] && [ ${LINUX_VERSION_CODE} -lt 262400 ] ); then echo "yes"; else echo "no"; fi)
|
||||
else
|
||||
ENABLE_BUILD=no
|
||||
ifeq ($(BUILD_MODULE_TMP),org)
|
||||
ifeq ($(BUILD_MODULE),none)
|
||||
BUILD_MODULE=$(shell if [ ${LINUX_VERSION_CODE} -ge 262144 -a ${LINUX_VERSION_CODE} -lt 262400 ]; then echo "linux-4.0.9"; else echo "none"; fi)
|
||||
endif
|
||||
ifeq ($(BUILD_MODULE),none)
|
||||
BUILD_MODULE=$(shell if [ ${LINUX_VERSION_CODE} -ge 243680 -a ${LINUX_VERSION_CODE} -lt 263936 ]; then echo "linux-4.6.7"; else echo "none"; fi)
|
||||
endif
|
||||
endif
|
||||
ifeq ($(BUILD_MODULE_TMP),rhel)
|
||||
ifeq ($(BUILD_MODULE),none)
|
||||
BUILD_MODULE=$(shell if [ ${LINUX_VERSION_CODE} -eq 199168 -a ${RHEL_RELEASE} -ge 327 -a ${RHEL_RELEASE} -le 514 ]; then echo "linux-3.10.0-327.36.1.el7"; else echo "none"; fi)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
obj-m += mcoverlay.o
|
||||
|
||||
mcoverlay-y := copy_up.o dir.o inode.o readdir.o super.o
|
||||
|
||||
.PHONY: clean install modules
|
||||
|
||||
modules:
|
||||
ifeq ($(ENABLE_BUILD),yes)
|
||||
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
|
||||
ifneq ($(BUILD_MODULE),none)
|
||||
@(cd $(BUILD_MODULE); make modules)
|
||||
endif
|
||||
|
||||
clean:
|
||||
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
|
||||
@(cd linux-3.10.0-327.36.1.el7; make clean)
|
||||
@(cd linux-4.0.9; make clean)
|
||||
@(cd linux-4.6.7; make clean)
|
||||
|
||||
install:
|
||||
ifeq ($(ENABLE_BUILD),yes)
|
||||
mkdir -p -m 755 $(KMODDIR)
|
||||
install -m 644 mcoverlay.ko $(KMODDIR)
|
||||
ifneq ($(BUILD_MODULE),none)
|
||||
@(cd $(BUILD_MODULE); make install)
|
||||
endif
|
||||
|
||||
|
||||
@ -0,0 +1,21 @@
|
||||
KDIR ?= @KDIR@
|
||||
ARCH ?= @ARCH@
|
||||
KMODDIR = @KMODDIR@
|
||||
src = @abs_srcdir@
|
||||
|
||||
obj-m += mcoverlay.o
|
||||
|
||||
mcoverlay-y := copy_up.o dir.o inode.o readdir.o super.o
|
||||
|
||||
.PHONY: clean install modules
|
||||
|
||||
modules:
|
||||
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
|
||||
|
||||
clean:
|
||||
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
|
||||
|
||||
install:
|
||||
mkdir -p -m 755 $(KMODDIR)
|
||||
install -m 644 mcoverlay.ko $(KMODDIR)
|
||||
|
||||
461
executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/copy_up.c
Normal file
461
executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/copy_up.c
Normal file
@ -0,0 +1,461 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/splice.h>
|
||||
#include <linux/xattr.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/fdtable.h>
|
||||
#include <linux/ratelimit.h>
|
||||
#include "overlayfs.h"
|
||||
|
||||
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
|
||||
|
||||
static unsigned ovl_check_copy_up = 1;
|
||||
module_param_named(check_copy_up, ovl_check_copy_up, uint,
|
||||
S_IWUSR | S_IRUGO);
|
||||
MODULE_PARM_DESC(ovl_check_copy_up,
|
||||
"Warn on copy-up when causing process also has a R/O fd open");
|
||||
|
||||
static int ovl_check_fd(const void *data, struct file *f, unsigned fd)
|
||||
{
|
||||
const struct dentry *dentry = data;
|
||||
|
||||
if (f->f_path.dentry == dentry)
|
||||
pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n",
|
||||
f, fd, current->pid, current->comm);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check the fds open by this process and warn if something like the following
|
||||
* scenario is about to occur:
|
||||
*
|
||||
* fd1 = open("foo", O_RDONLY);
|
||||
* fd2 = open("foo", O_RDWR);
|
||||
*/
|
||||
static void ovl_do_check_copy_up(struct dentry *dentry)
|
||||
{
|
||||
if (ovl_check_copy_up)
|
||||
iterate_fd(current->files, 0, ovl_check_fd, dentry);
|
||||
}
|
||||
|
||||
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
|
||||
{
|
||||
ssize_t list_size, size, value_size = 0;
|
||||
char *buf, *name, *value = NULL;
|
||||
int uninitialized_var(error);
|
||||
|
||||
if (!old->d_inode->i_op->getxattr ||
|
||||
!new->d_inode->i_op->getxattr)
|
||||
return 0;
|
||||
|
||||
list_size = vfs_listxattr(old, NULL, 0);
|
||||
if (list_size <= 0) {
|
||||
if (list_size == -EOPNOTSUPP)
|
||||
return 0;
|
||||
return list_size;
|
||||
}
|
||||
|
||||
buf = kzalloc(list_size, GFP_KERNEL);
|
||||
if (!buf)
|
||||
return -ENOMEM;
|
||||
|
||||
list_size = vfs_listxattr(old, buf, list_size);
|
||||
if (list_size <= 0) {
|
||||
error = list_size;
|
||||
goto out;
|
||||
}
|
||||
|
||||
for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
|
||||
retry:
|
||||
size = vfs_getxattr(old, name, value, value_size);
|
||||
if (size == -ERANGE)
|
||||
size = vfs_getxattr(old, name, NULL, 0);
|
||||
|
||||
if (size < 0) {
|
||||
error = size;
|
||||
break;
|
||||
}
|
||||
|
||||
if (size > value_size) {
|
||||
void *new;
|
||||
|
||||
new = krealloc(value, size, GFP_KERNEL);
|
||||
if (!new) {
|
||||
error = -ENOMEM;
|
||||
break;
|
||||
}
|
||||
value = new;
|
||||
value_size = size;
|
||||
goto retry;
|
||||
}
|
||||
|
||||
error = vfs_setxattr(new, name, value, size, 0);
|
||||
if (error)
|
||||
break;
|
||||
}
|
||||
kfree(value);
|
||||
out:
|
||||
kfree(buf);
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
 * Copy @len bytes of data from the file at @old to the file at @new, at most
 * OVL_COPY_UP_CHUNK_SIZE bytes per do_splice_direct() call.
 *
 * Returns 0 when @len is 0 or the loop finishes, -EINTR when
 * signal_pending_state(TASK_KILLABLE, current) is true, the error from
 * opening either file, or the non-positive result of do_splice_direct()
 * (a result of 0 ends the copy and is returned as 0).
 */
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
{
	struct file *src;
	struct file *dst;
	loff_t src_pos = 0;
	loff_t dst_pos = 0;
	int error = 0;

	if (!len)
		return 0;

	src = ovl_path_open(old, O_RDONLY);
	if (IS_ERR(src))
		return PTR_ERR(src);

	dst = ovl_path_open(new, O_WRONLY);
	if (IS_ERR(dst)) {
		error = PTR_ERR(dst);
		goto out_put_src;
	}

	/* FIXME: copy up sparse files efficiently */
	while (len) {
		size_t chunk = OVL_COPY_UP_CHUNK_SIZE;
		long copied;

		if (len < chunk)
			chunk = len;

		if (signal_pending_state(TASK_KILLABLE, current)) {
			error = -EINTR;
			break;
		}

		copied = do_splice_direct(src, &src_pos,
					  dst, &dst_pos,
					  chunk, SPLICE_F_MOVE);
		if (copied <= 0) {
			error = copied;
			break;
		}
		WARN_ON(src_pos != dst_pos);

		len -= copied;
	}

	fput(dst);
out_put_src:
	fput(src);
	return error;
}
|
||||
|
||||
static char *ovl_read_symlink(struct dentry *realdentry)
|
||||
{
|
||||
int res;
|
||||
char *buf;
|
||||
struct inode *inode = realdentry->d_inode;
|
||||
mm_segment_t old_fs;
|
||||
|
||||
res = -EINVAL;
|
||||
if (!inode->i_op->readlink)
|
||||
goto err;
|
||||
|
||||
res = -ENOMEM;
|
||||
buf = (char *) __get_free_page(GFP_KERNEL);
|
||||
if (!buf)
|
||||
goto err;
|
||||
|
||||
old_fs = get_fs();
|
||||
set_fs(get_ds());
|
||||
/* The cast to a user pointer is valid due to the set_fs() */
|
||||
res = inode->i_op->readlink(realdentry,
|
||||
(char __user *)buf, PAGE_SIZE - 1);
|
||||
set_fs(old_fs);
|
||||
if (res < 0) {
|
||||
free_page((unsigned long) buf);
|
||||
goto err;
|
||||
}
|
||||
buf[res] = '\0';
|
||||
|
||||
return buf;
|
||||
|
||||
err:
|
||||
return ERR_PTR(res);
|
||||
}
|
||||
|
||||
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
|
||||
{
|
||||
struct iattr attr = {
|
||||
.ia_valid =
|
||||
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
|
||||
.ia_atime = stat->atime,
|
||||
.ia_mtime = stat->mtime,
|
||||
};
|
||||
|
||||
return notify_change(upperdentry, &attr, NULL);
|
||||
}
|
||||
|
||||
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
|
||||
{
|
||||
int err = 0;
|
||||
|
||||
if (!S_ISLNK(stat->mode)) {
|
||||
struct iattr attr = {
|
||||
.ia_valid = ATTR_MODE,
|
||||
.ia_mode = stat->mode,
|
||||
};
|
||||
err = notify_change(upperdentry, &attr, NULL);
|
||||
}
|
||||
if (!err) {
|
||||
struct iattr attr = {
|
||||
.ia_valid = ATTR_UID | ATTR_GID,
|
||||
.ia_uid = stat->uid,
|
||||
.ia_gid = stat->gid,
|
||||
};
|
||||
err = notify_change(upperdentry, &attr, NULL);
|
||||
}
|
||||
if (!err)
|
||||
ovl_set_timestamps(upperdentry, stat);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
|
||||
struct dentry *dentry, struct path *lowerpath,
|
||||
struct kstat *stat, struct iattr *attr,
|
||||
const char *link)
|
||||
{
|
||||
struct inode *wdir = workdir->d_inode;
|
||||
struct inode *udir = upperdir->d_inode;
|
||||
struct dentry *newdentry = NULL;
|
||||
struct dentry *upper = NULL;
|
||||
umode_t mode = stat->mode;
|
||||
int err;
|
||||
|
||||
newdentry = ovl_lookup_temp(workdir, dentry);
|
||||
err = PTR_ERR(newdentry);
|
||||
if (IS_ERR(newdentry))
|
||||
goto out;
|
||||
|
||||
upper = lookup_one_len(dentry->d_name.name, upperdir,
|
||||
dentry->d_name.len);
|
||||
err = PTR_ERR(upper);
|
||||
if (IS_ERR(upper))
|
||||
goto out1;
|
||||
|
||||
/* Can't properly set mode on creation because of the umask */
|
||||
stat->mode &= S_IFMT;
|
||||
err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
|
||||
stat->mode = mode;
|
||||
if (err)
|
||||
goto out2;
|
||||
|
||||
if (S_ISREG(stat->mode)) {
|
||||
struct path upperpath;
|
||||
ovl_path_upper(dentry, &upperpath);
|
||||
BUG_ON(upperpath.dentry != NULL);
|
||||
upperpath.dentry = newdentry;
|
||||
|
||||
err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
}
|
||||
|
||||
err = ovl_copy_xattr(lowerpath->dentry, newdentry);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
mutex_lock(&newdentry->d_inode->i_mutex);
|
||||
err = ovl_set_attr(newdentry, stat);
|
||||
if (!err && attr)
|
||||
err = notify_change(newdentry, attr, NULL);
|
||||
mutex_unlock(&newdentry->d_inode->i_mutex);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
ovl_dentry_update(dentry, newdentry);
|
||||
newdentry = NULL;
|
||||
|
||||
/*
|
||||
* Non-directores become opaque when copied up.
|
||||
*/
|
||||
if (!S_ISDIR(stat->mode))
|
||||
ovl_dentry_set_opaque(dentry, true);
|
||||
out2:
|
||||
dput(upper);
|
||||
out1:
|
||||
dput(newdentry);
|
||||
out:
|
||||
return err;
|
||||
|
||||
out_cleanup:
|
||||
ovl_cleanup(wdir, newdentry);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy up a single dentry
|
||||
*
|
||||
* Directory renames only allowed on "pure upper" (already created on
|
||||
* upper filesystem, never copied up). Directories which are on lower or
|
||||
* are merged may not be renamed. For these -EXDEV is returned and
|
||||
* userspace has to deal with it. This means, when copying up a
|
||||
* directory we can rely on it and ancestors being stable.
|
||||
*
|
||||
* Non-directory renames start with copy up of source if necessary. The
|
||||
* actual rename will only proceed once the copy up was successful. Copy
|
||||
* up uses upper parent i_mutex for exclusion. Since rename can change
|
||||
* d_parent it is possible that the copy up will lock the old parent. At
|
||||
* that point the file will have already been copied up anyway.
|
||||
*/
|
||||
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
|
||||
struct path *lowerpath, struct kstat *stat,
|
||||
struct iattr *attr)
|
||||
{
|
||||
struct dentry *workdir = ovl_workdir(dentry);
|
||||
int err;
|
||||
struct kstat pstat;
|
||||
struct path parentpath;
|
||||
struct dentry *upperdir;
|
||||
struct dentry *upperdentry;
|
||||
const struct cred *old_cred;
|
||||
struct cred *override_cred;
|
||||
char *link = NULL;
|
||||
|
||||
if (WARN_ON(!workdir))
|
||||
return -EROFS;
|
||||
|
||||
ovl_do_check_copy_up(lowerpath->dentry);
|
||||
|
||||
ovl_path_upper(parent, &parentpath);
|
||||
upperdir = parentpath.dentry;
|
||||
|
||||
err = vfs_getattr(&parentpath, &pstat);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
if (S_ISLNK(stat->mode)) {
|
||||
link = ovl_read_symlink(lowerpath->dentry);
|
||||
if (IS_ERR(link))
|
||||
return PTR_ERR(link);
|
||||
}
|
||||
|
||||
err = -ENOMEM;
|
||||
override_cred = prepare_creds();
|
||||
if (!override_cred)
|
||||
goto out_free_link;
|
||||
|
||||
override_cred->fsuid = stat->uid;
|
||||
override_cred->fsgid = stat->gid;
|
||||
/*
|
||||
* CAP_SYS_ADMIN for copying up extended attributes
|
||||
* CAP_DAC_OVERRIDE for create
|
||||
* CAP_FOWNER for chmod, timestamp update
|
||||
* CAP_FSETID for chmod
|
||||
* CAP_CHOWN for chown
|
||||
* CAP_MKNOD for mknod
|
||||
*/
|
||||
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
|
||||
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
|
||||
cap_raise(override_cred->cap_effective, CAP_FOWNER);
|
||||
cap_raise(override_cred->cap_effective, CAP_FSETID);
|
||||
cap_raise(override_cred->cap_effective, CAP_CHOWN);
|
||||
cap_raise(override_cred->cap_effective, CAP_MKNOD);
|
||||
old_cred = override_creds(override_cred);
|
||||
|
||||
err = -EIO;
|
||||
if (lock_rename(workdir, upperdir) != NULL) {
|
||||
pr_err("overlayfs: failed to lock workdir+upperdir\n");
|
||||
goto out_unlock;
|
||||
}
|
||||
upperdentry = ovl_dentry_upper(dentry);
|
||||
if (upperdentry) {
|
||||
unlock_rename(workdir, upperdir);
|
||||
err = 0;
|
||||
/* Raced with another copy-up? Do the setattr here */
|
||||
if (attr) {
|
||||
mutex_lock(&upperdentry->d_inode->i_mutex);
|
||||
err = notify_change(upperdentry, attr, NULL);
|
||||
mutex_unlock(&upperdentry->d_inode->i_mutex);
|
||||
}
|
||||
goto out_put_cred;
|
||||
}
|
||||
|
||||
err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
|
||||
stat, attr, link);
|
||||
if (!err) {
|
||||
/* Restore timestamps on parent (best effort) */
|
||||
ovl_set_timestamps(upperdir, &pstat);
|
||||
}
|
||||
out_unlock:
|
||||
unlock_rename(workdir, upperdir);
|
||||
out_put_cred:
|
||||
revert_creds(old_cred);
|
||||
put_cred(override_cred);
|
||||
|
||||
out_free_link:
|
||||
if (link)
|
||||
free_page((unsigned long) link);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
int ovl_copy_up(struct dentry *dentry)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = 0;
|
||||
while (!err) {
|
||||
struct dentry *next;
|
||||
struct dentry *parent;
|
||||
struct path lowerpath;
|
||||
struct kstat stat;
|
||||
enum ovl_path_type type = ovl_path_type(dentry);
|
||||
|
||||
if (OVL_TYPE_UPPER(type))
|
||||
break;
|
||||
|
||||
next = dget(dentry);
|
||||
/* find the topmost dentry not yet copied up */
|
||||
for (;;) {
|
||||
parent = dget_parent(next);
|
||||
|
||||
type = ovl_path_type(parent);
|
||||
if (OVL_TYPE_UPPER(type))
|
||||
break;
|
||||
|
||||
dput(next);
|
||||
next = parent;
|
||||
}
|
||||
|
||||
ovl_path_lower(next, &lowerpath);
|
||||
err = vfs_getattr(&lowerpath, &stat);
|
||||
if (!err)
|
||||
err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);
|
||||
|
||||
dput(parent);
|
||||
dput(next);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
972
executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/dir.c
Normal file
972
executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/dir.c
Normal file
@ -0,0 +1,972 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/xattr.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/cred.h>
|
||||
#include "overlayfs.h"
|
||||
|
||||
void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
|
||||
{
|
||||
int err;
|
||||
|
||||
dget(wdentry);
|
||||
if (S_ISDIR(wdentry->d_inode->i_mode))
|
||||
err = ovl_do_rmdir(wdir, wdentry);
|
||||
else
|
||||
err = ovl_do_unlink(wdir, wdentry);
|
||||
dput(wdentry);
|
||||
|
||||
if (err) {
|
||||
pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
|
||||
wdentry, err);
|
||||
}
|
||||
}
|
||||
|
||||
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
|
||||
{
|
||||
struct dentry *temp;
|
||||
char name[20];
|
||||
|
||||
snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);
|
||||
|
||||
temp = lookup_one_len(name, workdir, strlen(name));
|
||||
if (!IS_ERR(temp) && temp->d_inode) {
|
||||
pr_err("overlayfs: workdir/%s already exists\n", name);
|
||||
dput(temp);
|
||||
temp = ERR_PTR(-EIO);
|
||||
}
|
||||
|
||||
return temp;
|
||||
}
|
||||
|
||||
/* caller holds i_mutex on workdir */
|
||||
static struct dentry *ovl_whiteout(struct dentry *workdir,
|
||||
struct dentry *dentry)
|
||||
{
|
||||
int err;
|
||||
struct dentry *whiteout;
|
||||
struct inode *wdir = workdir->d_inode;
|
||||
|
||||
whiteout = ovl_lookup_temp(workdir, dentry);
|
||||
if (IS_ERR(whiteout))
|
||||
return whiteout;
|
||||
|
||||
err = ovl_do_whiteout(wdir, whiteout);
|
||||
if (err) {
|
||||
dput(whiteout);
|
||||
whiteout = ERR_PTR(err);
|
||||
}
|
||||
|
||||
return whiteout;
|
||||
}
|
||||
|
||||
/*
 * Create the object described by @stat at @newdentry inside @dir.
 *
 * A non-NULL @hardlink makes this a hard link to that dentry; otherwise the
 * file type in @stat->mode selects create, mkdir, mknod or symlink (with
 * @link as the symlink target).  @debug is passed through to the ovl_do_*()
 * helpers.
 *
 * Returns -ESTALE if @newdentry already has an inode, -EPERM for an
 * unrecognised file type, -ENOENT if the operation reported success but left
 * @newdentry without an inode, or the helper's result.
 */
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
		    struct kstat *stat, const char *link,
		    struct dentry *hardlink, bool debug)
{
	int err;

	if (newdentry->d_inode)
		return -ESTALE;

	if (hardlink)
		err = ovl_do_link(hardlink, dir, newdentry, debug);
	else if (S_ISREG(stat->mode))
		err = ovl_do_create(dir, newdentry, stat->mode, debug);
	else if (S_ISDIR(stat->mode))
		err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
	else if (S_ISCHR(stat->mode) || S_ISBLK(stat->mode) ||
		 S_ISFIFO(stat->mode) || S_ISSOCK(stat->mode))
		err = ovl_do_mknod(dir, newdentry,
				   stat->mode, stat->rdev, debug);
	else if (S_ISLNK(stat->mode))
		err = ovl_do_symlink(dir, newdentry, link, debug);
	else
		err = -EPERM;

	if (!err && WARN_ON(!newdentry->d_inode)) {
		/*
		 * Not quite sure if non-instantiated dentry is legal or not.
		 * VFS doesn't seem to care so check and warn here.
		 */
		err = -ENOENT;
	}
	return err;
}
|
||||
|
||||
static int ovl_set_opaque(struct dentry *upperdentry)
|
||||
{
|
||||
return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
|
||||
}
|
||||
|
||||
static void ovl_remove_opaque(struct dentry *upperdentry)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);
|
||||
if (err) {
|
||||
pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
|
||||
upperdentry->d_name.name, err);
|
||||
}
|
||||
}
|
||||
|
||||
static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
|
||||
struct kstat *stat)
|
||||
{
|
||||
int err;
|
||||
enum ovl_path_type type;
|
||||
struct path realpath;
|
||||
|
||||
type = ovl_path_real(dentry, &realpath);
|
||||
err = vfs_getattr(&realpath, stat);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
stat->dev = dentry->d_sb->s_dev;
|
||||
stat->ino = dentry->d_inode->i_ino;
|
||||
|
||||
/*
|
||||
* It's probably not worth it to count subdirs to get the
|
||||
* correct link count. nlink=1 seems to pacify 'find' and
|
||||
* other utilities.
|
||||
*/
|
||||
if (OVL_TYPE_MERGE(type))
|
||||
stat->nlink = 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
|
||||
struct kstat *stat, const char *link,
|
||||
struct dentry *hardlink)
|
||||
{
|
||||
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
|
||||
struct inode *udir = upperdir->d_inode;
|
||||
struct dentry *newdentry;
|
||||
int err;
|
||||
|
||||
mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
|
||||
newdentry = lookup_one_len(dentry->d_name.name, upperdir,
|
||||
dentry->d_name.len);
|
||||
err = PTR_ERR(newdentry);
|
||||
if (IS_ERR(newdentry))
|
||||
goto out_unlock;
|
||||
err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
|
||||
if (err)
|
||||
goto out_dput;
|
||||
|
||||
ovl_dentry_version_inc(dentry->d_parent);
|
||||
ovl_dentry_update(dentry, newdentry);
|
||||
ovl_copyattr(newdentry->d_inode, inode);
|
||||
d_instantiate(dentry, inode);
|
||||
newdentry = NULL;
|
||||
out_dput:
|
||||
dput(newdentry);
|
||||
out_unlock:
|
||||
mutex_unlock(&udir->i_mutex);
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_lock_rename_workdir(struct dentry *workdir,
|
||||
struct dentry *upperdir)
|
||||
{
|
||||
/* Workdir should not be the same as upperdir */
|
||||
if (workdir == upperdir)
|
||||
goto err;
|
||||
|
||||
/* Workdir should not be subdir of upperdir and vice versa */
|
||||
if (lock_rename(workdir, upperdir) != NULL)
|
||||
goto err_unlock;
|
||||
|
||||
return 0;
|
||||
|
||||
err_unlock:
|
||||
unlock_rename(workdir, upperdir);
|
||||
err:
|
||||
pr_err("overlayfs: failed to lock workdir+upperdir\n");
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
static struct dentry *ovl_clear_empty(struct dentry *dentry,
|
||||
struct list_head *list)
|
||||
{
|
||||
struct dentry *workdir = ovl_workdir(dentry);
|
||||
struct inode *wdir = workdir->d_inode;
|
||||
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
|
||||
struct inode *udir = upperdir->d_inode;
|
||||
struct path upperpath;
|
||||
struct dentry *upper;
|
||||
struct dentry *opaquedir;
|
||||
struct kstat stat;
|
||||
int err;
|
||||
|
||||
if (WARN_ON(!workdir))
|
||||
return ERR_PTR(-EROFS);
|
||||
|
||||
err = ovl_lock_rename_workdir(workdir, upperdir);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
ovl_path_upper(dentry, &upperpath);
|
||||
err = vfs_getattr(&upperpath, &stat);
|
||||
if (err)
|
||||
goto out_unlock;
|
||||
|
||||
err = -ESTALE;
|
||||
if (!S_ISDIR(stat.mode))
|
||||
goto out_unlock;
|
||||
upper = upperpath.dentry;
|
||||
if (upper->d_parent->d_inode != udir)
|
||||
goto out_unlock;
|
||||
|
||||
opaquedir = ovl_lookup_temp(workdir, dentry);
|
||||
err = PTR_ERR(opaquedir);
|
||||
if (IS_ERR(opaquedir))
|
||||
goto out_unlock;
|
||||
|
||||
err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
|
||||
if (err)
|
||||
goto out_dput;
|
||||
|
||||
err = ovl_copy_xattr(upper, opaquedir);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
err = ovl_set_opaque(opaquedir);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
mutex_lock(&opaquedir->d_inode->i_mutex);
|
||||
err = ovl_set_attr(opaquedir, &stat);
|
||||
mutex_unlock(&opaquedir->d_inode->i_mutex);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
ovl_cleanup_whiteouts(upper, list);
|
||||
ovl_cleanup(wdir, upper);
|
||||
unlock_rename(workdir, upperdir);
|
||||
|
||||
/* dentry's upper doesn't match now, get rid of it */
|
||||
d_drop(dentry);
|
||||
|
||||
return opaquedir;
|
||||
|
||||
out_cleanup:
|
||||
ovl_cleanup(wdir, opaquedir);
|
||||
out_dput:
|
||||
dput(opaquedir);
|
||||
out_unlock:
|
||||
unlock_rename(workdir, upperdir);
|
||||
out:
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
|
||||
static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
|
||||
{
|
||||
int err;
|
||||
struct dentry *ret = NULL;
|
||||
LIST_HEAD(list);
|
||||
|
||||
err = ovl_check_empty_dir(dentry, &list);
|
||||
if (err)
|
||||
ret = ERR_PTR(err);
|
||||
else {
|
||||
/*
|
||||
* If no upperdentry then skip clearing whiteouts.
|
||||
*
|
||||
* Can race with copy-up, since we don't hold the upperdir
|
||||
* mutex. Doesn't matter, since copy-up can't create a
|
||||
* non-empty directory from an empty one.
|
||||
*/
|
||||
if (ovl_dentry_upper(dentry))
|
||||
ret = ovl_clear_empty(dentry, &list);
|
||||
}
|
||||
|
||||
ovl_cache_free(&list);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
|
||||
struct kstat *stat, const char *link,
|
||||
struct dentry *hardlink)
|
||||
{
|
||||
struct dentry *workdir = ovl_workdir(dentry);
|
||||
struct inode *wdir = workdir->d_inode;
|
||||
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
|
||||
struct inode *udir = upperdir->d_inode;
|
||||
struct dentry *upper;
|
||||
struct dentry *newdentry;
|
||||
int err;
|
||||
|
||||
if (WARN_ON(!workdir))
|
||||
return -EROFS;
|
||||
|
||||
err = ovl_lock_rename_workdir(workdir, upperdir);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
newdentry = ovl_lookup_temp(workdir, dentry);
|
||||
err = PTR_ERR(newdentry);
|
||||
if (IS_ERR(newdentry))
|
||||
goto out_unlock;
|
||||
|
||||
upper = lookup_one_len(dentry->d_name.name, upperdir,
|
||||
dentry->d_name.len);
|
||||
err = PTR_ERR(upper);
|
||||
if (IS_ERR(upper))
|
||||
goto out_dput;
|
||||
|
||||
err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
|
||||
if (err)
|
||||
goto out_dput2;
|
||||
|
||||
if (S_ISDIR(stat->mode)) {
|
||||
err = ovl_set_opaque(newdentry);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
err = ovl_do_rename(wdir, newdentry, udir, upper,
|
||||
RENAME_EXCHANGE);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
ovl_cleanup(wdir, upper);
|
||||
} else {
|
||||
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
}
|
||||
ovl_dentry_version_inc(dentry->d_parent);
|
||||
ovl_dentry_update(dentry, newdentry);
|
||||
ovl_copyattr(newdentry->d_inode, inode);
|
||||
d_instantiate(dentry, inode);
|
||||
newdentry = NULL;
|
||||
out_dput2:
|
||||
dput(upper);
|
||||
out_dput:
|
||||
dput(newdentry);
|
||||
out_unlock:
|
||||
unlock_rename(workdir, upperdir);
|
||||
out:
|
||||
return err;
|
||||
|
||||
out_cleanup:
|
||||
ovl_cleanup(wdir, newdentry);
|
||||
goto out_dput2;
|
||||
}
|
||||
|
||||
/*
 * Common backend for create, mkdir, mknod, symlink and link on the overlay.
 *
 * Allocates the overlay inode, copies up the parent directory, then creates
 * the object either directly in the upper directory or, when @dentry is
 * opaque, over the existing entry with temporarily raised capabilities.
 *
 * Returns 0 on success, -ENOMEM when the inode or credentials cannot be
 * allocated, or the error from the copy-up or the create helper.  On failure
 * the new inode is released with iput().
 */
static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
			      const char *link, struct dentry *hardlink)
{
	struct kstat stat = {
		.mode = mode,
		.rdev = rdev,
	};
	struct inode *inode;
	int err;

	inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
	if (!inode)
		return -ENOMEM;

	err = ovl_copy_up(dentry->d_parent);
	if (err)
		goto out_iput;

	if (ovl_dentry_is_opaque(dentry)) {
		const struct cred *old_cred;
		struct cred *override_cred = prepare_creds();

		if (!override_cred) {
			err = -ENOMEM;
			goto out_iput;
		}

		/*
		 * CAP_SYS_ADMIN for setting opaque xattr
		 * CAP_DAC_OVERRIDE for create in workdir, rename
		 * CAP_FOWNER for removing whiteout from sticky dir
		 */
		cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
		cap_raise(override_cred->cap_effective, CAP_FOWNER);
		old_cred = override_creds(override_cred);

		err = ovl_create_over_whiteout(dentry, inode, &stat, link,
					       hardlink);

		revert_creds(old_cred);
		put_cred(override_cred);
	} else {
		err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
	}

	/* On success the inode was instantiated; don't release it below. */
	if (!err)
		inode = NULL;
out_iput:
	iput(inode);
	return err;
}
|
||||
|
||||
/*
 * Create a new (non-hardlink) object for @dentry, bracketed by
 * ovl_want_write() / ovl_drop_write().
 *
 * Returns the error from ovl_want_write() if it fails, otherwise the result
 * of ovl_create_or_link().
 */
static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
			     const char *link)
{
	int err = ovl_want_write(dentry);

	if (err)
		return err;

	err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
	ovl_drop_write(dentry);

	return err;
}
|
||||
|
||||
static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
|
||||
bool excl)
|
||||
{
|
||||
return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
|
||||
}
|
||||
|
||||
/*
 * ->mkdir: make a directory.  Only the low 07777 bits of @mode are kept
 * and S_IFDIR is forced as the file type; @dir is unused.
 */
static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	int dir_mode = (mode & 07777) | S_IFDIR;

	return ovl_create_object(dentry, dir_mode, 0, NULL);
}
|
||||
|
||||
static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
|
||||
dev_t rdev)
|
||||
{
|
||||
/* Don't allow creation of "whiteout" on overlay */
|
||||
if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
|
||||
return -EPERM;
|
||||
|
||||
return ovl_create_object(dentry, mode, rdev, NULL);
|
||||
}
|
||||
|
||||
static int ovl_symlink(struct inode *dir, struct dentry *dentry,
|
||||
const char *link)
|
||||
{
|
||||
return ovl_create_object(dentry, S_IFLNK, 0, link);
|
||||
}
|
||||
|
||||
static int ovl_link(struct dentry *old, struct inode *newdir,
|
||||
struct dentry *new)
|
||||
{
|
||||
int err;
|
||||
struct dentry *upper;
|
||||
|
||||
err = ovl_want_write(old);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = ovl_copy_up(old);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
|
||||
upper = ovl_dentry_upper(old);
|
||||
err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);
|
||||
|
||||
out_drop_write:
|
||||
ovl_drop_write(old);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
 * Remove @dentry by moving a whiteout over its entry in the upper directory.
 *
 * For a directory (@is_dir) emptiness is verified first: merged or lower
 * directories go through ovl_check_empty_and_clear(), whose result is the
 * dentry that must later be found in the upper dir; other directories are
 * only checked with ovl_check_empty_dir().
 *
 * With workdir and upperdir locked for rename, the upper entry is looked up
 * and compared against the expected dentry (-ESTALE on mismatch).  A whiteout
 * obtained from ovl_whiteout() is then renamed over it.  When the upper entry
 * is a directory the rename is an exchange, and the entry that ends up in
 * workdir is cleaned up afterwards.
 *
 * Returns 0 on success, -EROFS when there is no workdir, or another
 * negative errno.
 */
static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *whiteout;
	struct dentry *upper;
	struct dentry *opaquedir = NULL;
	int err;
	int flags = 0;

	if (WARN_ON(!workdir))
		return -EROFS;

	/*
	 * Only dereference workdir once it is known to be non-NULL;
	 * doing so in the declarations would defeat the check above.
	 */
	wdir = workdir->d_inode;

	if (is_dir) {
		if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
			opaquedir = ovl_check_empty_and_clear(dentry);
			err = PTR_ERR(opaquedir);
			if (IS_ERR(opaquedir))
				goto out;
		} else {
			LIST_HEAD(list);

			/*
			 * When removing an empty opaque directory, then it
			 * makes no sense to replace it with an exact replica of
			 * itself.  But emptiness still needs to be checked.
			 */
			err = ovl_check_empty_dir(dentry, &list);
			ovl_cache_free(&list);
			if (err)
				goto out;
		}
	}

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out_dput;

	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	err = PTR_ERR(upper);
	if (IS_ERR(upper))
		goto out_unlock;

	/* The entry found in upperdir must be the one we expect. */
	err = -ESTALE;
	if ((opaquedir && upper != opaquedir) ||
	    (!opaquedir && ovl_dentry_upper(dentry) &&
	     upper != ovl_dentry_upper(dentry))) {
		goto out_dput_upper;
	}

	whiteout = ovl_whiteout(workdir, dentry);
	err = PTR_ERR(whiteout);
	if (IS_ERR(whiteout))
		goto out_dput_upper;

	/* A directory is swapped with the whiteout rather than replaced. */
	if (d_is_dir(upper))
		flags = RENAME_EXCHANGE;

	err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
	if (err)
		goto kill_whiteout;
	if (flags)
		ovl_cleanup(wdir, upper);

	ovl_dentry_version_inc(dentry->d_parent);
out_d_drop:
	d_drop(dentry);
	dput(whiteout);
out_dput_upper:
	dput(upper);
out_unlock:
	unlock_rename(workdir, upperdir);
out_dput:
	dput(opaquedir);
out:
	return err;

kill_whiteout:
	/* Rename failed: remove the unused whiteout from workdir. */
	ovl_cleanup(wdir, whiteout);
	goto out_d_drop;
}
|
||||
|
||||
/*
 * Remove @dentry when it only needs its upper entry unlinked.
 *
 * Under the upper parent's i_mutex the name is looked up again; if the
 * result is not the dentry's recorded upper dentry, -ESTALE is returned.
 * Otherwise the entry is removed with vfs_rmdir() or vfs_unlink() depending
 * on @is_dir and the parent's version is bumped.  On success the overlay
 * dentry is unhashed.
 */
static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
{
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *dir = upperdir->d_inode;
	struct dentry *upper;
	int err;

	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);

	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	if (IS_ERR(upper)) {
		err = PTR_ERR(upper);
		goto out_unlock;
	}

	if (upper != ovl_dentry_upper(dentry)) {
		err = -ESTALE;
	} else {
		err = is_dir ? vfs_rmdir(dir, upper)
			     : vfs_unlink(dir, upper, NULL);
		ovl_dentry_version_inc(dentry->d_parent);
	}
	dput(upper);

	/*
	 * Keeping this dentry hashed would mean having to release
	 * upperpath/lowerpath, which could only be done if we are the
	 * sole user of this dentry.  Too tricky...  Just unhash for
	 * now.
	 */
	if (!err)
		d_drop(dentry);

out_unlock:
	mutex_unlock(&dir->i_mutex);

	return err;
}
|
||||
|
||||
static inline int ovl_check_sticky(struct dentry *dentry)
|
||||
{
|
||||
struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
|
||||
struct inode *inode = ovl_dentry_real(dentry)->d_inode;
|
||||
|
||||
if (check_sticky(dir, inode))
|
||||
return -EPERM;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Shared implementation of unlink and rmdir.
 *
 * After the sticky check, write access is taken and the parent is copied
 * up.  A pure-upper dentry is removed directly with ovl_remove_upper();
 * anything else is replaced by a whiteout via ovl_remove_and_whiteout(),
 * which runs with temporarily raised capabilities.
 *
 * Returns 0 or a negative errno.
 */
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
	const struct cred *old_cred;
	struct cred *override_cred;
	int err;

	err = ovl_check_sticky(dentry);
	if (err)
		return err;

	err = ovl_want_write(dentry);
	if (err)
		return err;

	err = ovl_copy_up(dentry->d_parent);
	if (err)
		goto out_drop_write;

	if (OVL_TYPE_PURE_UPPER(ovl_path_type(dentry))) {
		err = ovl_remove_upper(dentry, is_dir);
		goto out_drop_write;
	}

	override_cred = prepare_creds();
	if (!override_cred) {
		err = -ENOMEM;
		goto out_drop_write;
	}

	/*
	 * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
	 * CAP_DAC_OVERRIDE for create in workdir, rename
	 * CAP_FOWNER for removing whiteout from sticky dir
	 * CAP_FSETID for chmod of opaque dir
	 * CAP_CHOWN for chown of opaque dir
	 */
	cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
	cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
	cap_raise(override_cred->cap_effective, CAP_FOWNER);
	cap_raise(override_cred->cap_effective, CAP_FSETID);
	cap_raise(override_cred->cap_effective, CAP_CHOWN);
	old_cred = override_creds(override_cred);

	err = ovl_remove_and_whiteout(dentry, is_dir);

	revert_creds(old_cred);
	put_cred(override_cred);

out_drop_write:
	ovl_drop_write(dentry);
	return err;
}
|
||||
|
||||
static int ovl_unlink(struct inode *dir, struct dentry *dentry)
|
||||
{
|
||||
return ovl_do_remove(dentry, false);
|
||||
}
|
||||
|
||||
static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
|
||||
{
|
||||
return ovl_do_remove(dentry, true);
|
||||
}
|
||||
|
||||
static int ovl_rename2(struct inode *olddir, struct dentry *old,
|
||||
struct inode *newdir, struct dentry *new,
|
||||
unsigned int flags)
|
||||
{
|
||||
int err;
|
||||
enum ovl_path_type old_type;
|
||||
enum ovl_path_type new_type;
|
||||
struct dentry *old_upperdir;
|
||||
struct dentry *new_upperdir;
|
||||
struct dentry *olddentry;
|
||||
struct dentry *newdentry;
|
||||
struct dentry *trap;
|
||||
bool old_opaque;
|
||||
bool new_opaque;
|
||||
bool new_create = false;
|
||||
bool cleanup_whiteout = false;
|
||||
bool overwrite = !(flags & RENAME_EXCHANGE);
|
||||
bool is_dir = S_ISDIR(old->d_inode->i_mode);
|
||||
bool new_is_dir = false;
|
||||
struct dentry *opaquedir = NULL;
|
||||
const struct cred *old_cred = NULL;
|
||||
struct cred *override_cred = NULL;
|
||||
|
||||
err = -EINVAL;
|
||||
if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
|
||||
goto out;
|
||||
|
||||
flags &= ~RENAME_NOREPLACE;
|
||||
|
||||
err = ovl_check_sticky(old);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
/* Don't copy up directory trees */
|
||||
old_type = ovl_path_type(old);
|
||||
err = -EXDEV;
|
||||
if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
|
||||
goto out;
|
||||
|
||||
if (new->d_inode) {
|
||||
err = ovl_check_sticky(new);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
if (S_ISDIR(new->d_inode->i_mode))
|
||||
new_is_dir = true;
|
||||
|
||||
new_type = ovl_path_type(new);
|
||||
err = -EXDEV;
|
||||
if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
|
||||
goto out;
|
||||
|
||||
err = 0;
|
||||
if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
|
||||
if (ovl_dentry_lower(old)->d_inode ==
|
||||
ovl_dentry_lower(new)->d_inode)
|
||||
goto out;
|
||||
}
|
||||
if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
|
||||
if (ovl_dentry_upper(old)->d_inode ==
|
||||
ovl_dentry_upper(new)->d_inode)
|
||||
goto out;
|
||||
}
|
||||
} else {
|
||||
if (ovl_dentry_is_opaque(new))
|
||||
new_type = __OVL_PATH_UPPER;
|
||||
else
|
||||
new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
|
||||
}
|
||||
|
||||
err = ovl_want_write(old);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = ovl_copy_up(old);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
|
||||
err = ovl_copy_up(new->d_parent);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
if (!overwrite) {
|
||||
err = ovl_copy_up(new);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
}
|
||||
|
||||
old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
|
||||
new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
|
||||
|
||||
if (old_opaque || new_opaque) {
|
||||
err = -ENOMEM;
|
||||
override_cred = prepare_creds();
|
||||
if (!override_cred)
|
||||
goto out_drop_write;
|
||||
|
||||
/*
|
||||
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
|
||||
* CAP_DAC_OVERRIDE for create in workdir
|
||||
* CAP_FOWNER for removing whiteout from sticky dir
|
||||
* CAP_FSETID for chmod of opaque dir
|
||||
* CAP_CHOWN for chown of opaque dir
|
||||
*/
|
||||
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
|
||||
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
|
||||
cap_raise(override_cred->cap_effective, CAP_FOWNER);
|
||||
cap_raise(override_cred->cap_effective, CAP_FSETID);
|
||||
cap_raise(override_cred->cap_effective, CAP_CHOWN);
|
||||
old_cred = override_creds(override_cred);
|
||||
}
|
||||
|
||||
if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
|
||||
opaquedir = ovl_check_empty_and_clear(new);
|
||||
err = PTR_ERR(opaquedir);
|
||||
if (IS_ERR(opaquedir)) {
|
||||
opaquedir = NULL;
|
||||
goto out_revert_creds;
|
||||
}
|
||||
}
|
||||
|
||||
if (overwrite) {
|
||||
if (old_opaque) {
|
||||
if (new->d_inode || !new_opaque) {
|
||||
/* Whiteout source */
|
||||
flags |= RENAME_WHITEOUT;
|
||||
} else {
|
||||
/* Switch whiteouts */
|
||||
flags |= RENAME_EXCHANGE;
|
||||
}
|
||||
} else if (is_dir && !new->d_inode && new_opaque) {
|
||||
flags |= RENAME_EXCHANGE;
|
||||
cleanup_whiteout = true;
|
||||
}
|
||||
}
|
||||
|
||||
old_upperdir = ovl_dentry_upper(old->d_parent);
|
||||
new_upperdir = ovl_dentry_upper(new->d_parent);
|
||||
|
||||
trap = lock_rename(new_upperdir, old_upperdir);
|
||||
|
||||
|
||||
olddentry = lookup_one_len(old->d_name.name, old_upperdir,
|
||||
old->d_name.len);
|
||||
err = PTR_ERR(olddentry);
|
||||
if (IS_ERR(olddentry))
|
||||
goto out_unlock;
|
||||
|
||||
err = -ESTALE;
|
||||
if (olddentry != ovl_dentry_upper(old))
|
||||
goto out_dput_old;
|
||||
|
||||
newdentry = lookup_one_len(new->d_name.name, new_upperdir,
|
||||
new->d_name.len);
|
||||
err = PTR_ERR(newdentry);
|
||||
if (IS_ERR(newdentry))
|
||||
goto out_dput_old;
|
||||
|
||||
err = -ESTALE;
|
||||
if (ovl_dentry_upper(new)) {
|
||||
if (opaquedir) {
|
||||
if (newdentry != opaquedir)
|
||||
goto out_dput;
|
||||
} else {
|
||||
if (newdentry != ovl_dentry_upper(new))
|
||||
goto out_dput;
|
||||
}
|
||||
} else {
|
||||
new_create = true;
|
||||
if (!d_is_negative(newdentry) &&
|
||||
(!new_opaque || !ovl_is_whiteout(newdentry)))
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
if (olddentry == trap)
|
||||
goto out_dput;
|
||||
if (newdentry == trap)
|
||||
goto out_dput;
|
||||
|
||||
if (is_dir && !old_opaque && new_opaque) {
|
||||
err = ovl_set_opaque(olddentry);
|
||||
if (err)
|
||||
goto out_dput;
|
||||
}
|
||||
if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
|
||||
err = ovl_set_opaque(newdentry);
|
||||
if (err)
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
if (old_opaque || new_opaque) {
|
||||
err = ovl_do_rename(old_upperdir->d_inode, olddentry,
|
||||
new_upperdir->d_inode, newdentry,
|
||||
flags);
|
||||
} else {
|
||||
/* No debug for the plain case */
|
||||
BUG_ON(flags & ~RENAME_EXCHANGE);
|
||||
err = vfs_rename(old_upperdir->d_inode, olddentry,
|
||||
new_upperdir->d_inode, newdentry,
|
||||
NULL, flags);
|
||||
}
|
||||
|
||||
if (err) {
|
||||
if (is_dir && !old_opaque && new_opaque)
|
||||
ovl_remove_opaque(olddentry);
|
||||
if (!overwrite && new_is_dir && old_opaque && !new_opaque)
|
||||
ovl_remove_opaque(newdentry);
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
if (is_dir && old_opaque && !new_opaque)
|
||||
ovl_remove_opaque(olddentry);
|
||||
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
|
||||
ovl_remove_opaque(newdentry);
|
||||
|
||||
if (old_opaque != new_opaque) {
|
||||
ovl_dentry_set_opaque(old, new_opaque);
|
||||
if (!overwrite)
|
||||
ovl_dentry_set_opaque(new, old_opaque);
|
||||
}
|
||||
|
||||
if (cleanup_whiteout)
|
||||
ovl_cleanup(old_upperdir->d_inode, newdentry);
|
||||
|
||||
ovl_dentry_version_inc(old->d_parent);
|
||||
ovl_dentry_version_inc(new->d_parent);
|
||||
|
||||
out_dput:
|
||||
dput(newdentry);
|
||||
out_dput_old:
|
||||
dput(olddentry);
|
||||
out_unlock:
|
||||
unlock_rename(new_upperdir, old_upperdir);
|
||||
out_revert_creds:
|
||||
if (old_opaque || new_opaque) {
|
||||
revert_creds(old_cred);
|
||||
put_cred(override_cred);
|
||||
}
|
||||
out_drop_write:
|
||||
ovl_drop_write(old);
|
||||
out:
|
||||
dput(opaquedir);
|
||||
return err;
|
||||
}
|
||||
|
||||
/* ->rename: plain rename, i.e. ovl_rename2() with no flags set. */
static int ovl_rename(struct inode *olddir, struct dentry *old,
		      struct inode *newdir, struct dentry *new)
{
	const unsigned int no_flags = 0;

	return ovl_rename2(olddir, old, newdir, new, no_flags);
}
|
||||
|
||||
const struct inode_operations_wrapper ovl_dir_inode_operations = {
|
||||
.ops = {
|
||||
.lookup = ovl_lookup,
|
||||
.mkdir = ovl_mkdir,
|
||||
.symlink = ovl_symlink,
|
||||
.unlink = ovl_unlink,
|
||||
.rmdir = ovl_rmdir,
|
||||
.rename = ovl_rename,
|
||||
.link = ovl_link,
|
||||
.setattr = ovl_setattr,
|
||||
.create = ovl_create,
|
||||
.mknod = ovl_mknod,
|
||||
.permission = ovl_permission,
|
||||
.getattr = ovl_dir_getattr,
|
||||
.setxattr = ovl_setxattr,
|
||||
.getxattr = ovl_getxattr,
|
||||
.listxattr = ovl_listxattr,
|
||||
.removexattr = ovl_removexattr,
|
||||
},
|
||||
.rename2 = ovl_rename2,
|
||||
};
|
||||
442
executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/inode.c
Normal file
442
executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/inode.c
Normal file
@ -0,0 +1,442 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/xattr.h>
|
||||
#include "overlayfs.h"
|
||||
|
||||
static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
|
||||
bool no_data)
|
||||
{
|
||||
int err;
|
||||
struct dentry *parent;
|
||||
struct kstat stat;
|
||||
struct path lowerpath;
|
||||
|
||||
parent = dget_parent(dentry);
|
||||
err = ovl_copy_up(parent);
|
||||
if (err)
|
||||
goto out_dput_parent;
|
||||
|
||||
ovl_path_lower(dentry, &lowerpath);
|
||||
err = vfs_getattr(&lowerpath, &stat);
|
||||
if (err)
|
||||
goto out_dput_parent;
|
||||
|
||||
if (no_data)
|
||||
stat.size = 0;
|
||||
|
||||
err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);
|
||||
|
||||
out_dput_parent:
|
||||
dput(parent);
|
||||
return err;
|
||||
}
|
||||
|
||||
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
|
||||
{
|
||||
int err;
|
||||
struct dentry *upperdentry;
|
||||
|
||||
err = ovl_want_write(dentry);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = ovl_copy_up(dentry);
|
||||
if (!err) {
|
||||
upperdentry = ovl_dentry_upper(dentry);
|
||||
|
||||
mutex_lock(&upperdentry->d_inode->i_mutex);
|
||||
err = notify_change(upperdentry, attr, NULL);
|
||||
mutex_unlock(&upperdentry->d_inode->i_mutex);
|
||||
}
|
||||
ovl_drop_write(dentry);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
|
||||
struct kstat *stat)
|
||||
{
|
||||
struct path realpath;
|
||||
|
||||
ovl_path_real(dentry, &realpath);
|
||||
return vfs_getattr(&realpath, stat);
|
||||
}
|
||||
|
||||
int ovl_permission(struct inode *inode, int mask)
|
||||
{
|
||||
struct ovl_entry *oe;
|
||||
struct dentry *alias = NULL;
|
||||
struct inode *realinode;
|
||||
struct dentry *realdentry;
|
||||
bool is_upper;
|
||||
int err;
|
||||
|
||||
if (S_ISDIR(inode->i_mode)) {
|
||||
oe = inode->i_private;
|
||||
} else if (mask & MAY_NOT_BLOCK) {
|
||||
return -ECHILD;
|
||||
} else {
|
||||
/*
|
||||
* For non-directories find an alias and get the info
|
||||
* from there.
|
||||
*/
|
||||
alias = d_find_any_alias(inode);
|
||||
if (WARN_ON(!alias))
|
||||
return -ENOENT;
|
||||
|
||||
oe = alias->d_fsdata;
|
||||
}
|
||||
|
||||
realdentry = ovl_entry_real(oe, &is_upper);
|
||||
|
||||
/* Careful in RCU walk mode */
|
||||
realinode = ACCESS_ONCE(realdentry->d_inode);
|
||||
if (!realinode) {
|
||||
WARN_ON(!(mask & MAY_NOT_BLOCK));
|
||||
err = -ENOENT;
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
if (mask & MAY_WRITE) {
|
||||
umode_t mode = realinode->i_mode;
|
||||
|
||||
/*
|
||||
* Writes will always be redirected to upper layer, so
|
||||
* ignore lower layer being read-only.
|
||||
*
|
||||
* If the overlay itself is read-only then proceed
|
||||
* with the permission check, don't return EROFS.
|
||||
* This will only happen if this is the lower layer of
|
||||
* another overlayfs.
|
||||
*
|
||||
* If upper fs becomes read-only after the overlay was
|
||||
* constructed return EROFS to prevent modification of
|
||||
* upper layer.
|
||||
*/
|
||||
err = -EROFS;
|
||||
if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
|
||||
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
err = __inode_permission(realinode, mask);
|
||||
out_dput:
|
||||
dput(alias);
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
struct ovl_link_data {
|
||||
struct dentry *realdentry;
|
||||
void *cookie;
|
||||
};
|
||||
|
||||
static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
|
||||
{
|
||||
void *ret;
|
||||
struct dentry *realdentry;
|
||||
struct inode *realinode;
|
||||
struct ovl_link_data *data = NULL;
|
||||
|
||||
realdentry = ovl_dentry_real(dentry);
|
||||
realinode = realdentry->d_inode;
|
||||
|
||||
if (WARN_ON(!realinode->i_op->follow_link))
|
||||
return ERR_PTR(-EPERM);
|
||||
|
||||
if (realinode->i_op->put_link) {
|
||||
data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
|
||||
if (!data)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
data->realdentry = realdentry;
|
||||
}
|
||||
|
||||
ret = realinode->i_op->follow_link(realdentry, nd);
|
||||
if (IS_ERR(ret)) {
|
||||
kfree(data);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (data)
|
||||
data->cookie = ret;
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
|
||||
{
|
||||
struct inode *realinode;
|
||||
struct ovl_link_data *data = c;
|
||||
|
||||
if (!data)
|
||||
return;
|
||||
|
||||
realinode = data->realdentry->d_inode;
|
||||
realinode->i_op->put_link(data->realdentry, nd, data->cookie);
|
||||
kfree(data);
|
||||
}
|
||||
|
||||
/*
 * ->readlink: forward to the real inode's readlink after touching the
 * access time of the real path.  Returns -EINVAL when the real inode has no
 * readlink operation.
 */
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
	struct inode *realinode;
	struct path realpath;

	ovl_path_real(dentry, &realpath);
	realinode = realpath.dentry->d_inode;

	if (realinode->i_op->readlink) {
		touch_atime(&realpath);
		return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
	}

	return -EINVAL;
}
|
||||
|
||||
|
||||
static bool ovl_is_private_xattr(const char *name)
|
||||
{
|
||||
return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
|
||||
}
|
||||
|
||||
/*
 * ->setxattr: set @name on the upper dentry after copy-up.
 *
 * Names with the private overlay prefix are refused with -EPERM.  Write
 * access is held across the private check, the copy-up and the
 * vfs_setxattr() call.
 *
 * Returns 0 or a negative errno.
 */
int ovl_setxattr(struct dentry *dentry, const char *name,
		 const void *value, size_t size, int flags)
{
	int err;

	err = ovl_want_write(dentry);
	if (err)
		return err;

	if (ovl_is_private_xattr(name)) {
		err = -EPERM;
		goto out_drop_write;
	}

	err = ovl_copy_up(dentry);
	if (!err) {
		struct dentry *upperdentry = ovl_dentry_upper(dentry);

		err = vfs_setxattr(upperdentry, name, value, size, flags);
	}

out_drop_write:
	ovl_drop_write(dentry);
	return err;
}
|
||||
|
||||
static bool ovl_need_xattr_filter(struct dentry *dentry,
|
||||
enum ovl_path_type type)
|
||||
{
|
||||
if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER)
|
||||
return S_ISDIR(dentry->d_inode->i_mode);
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
 * ->getxattr: read @name from the real dentry.  Where filtering applies
 * (see ovl_need_xattr_filter()), private overlay xattrs are reported as
 * absent with -ENODATA.
 */
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
		     void *value, size_t size)
{
	struct path realpath;
	enum ovl_path_type type;

	type = ovl_path_real(dentry, &realpath);

	if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
		return -ENODATA;

	return vfs_getxattr(realpath.dentry, name, value, size);
}
|
||||
|
||||
/*
 * ->listxattr: list the xattr names of the real dentry into @list.
 *
 * Where filtering applies, names with the private overlay prefix are
 * removed from the NUL-separated list in place and the returned length is
 * reduced accordingly.  Errors, empty results and size queries (@size == 0)
 * are passed through untouched.
 */
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
	struct path realpath;
	enum ovl_path_type type = ovl_path_real(dentry, &realpath);
	ssize_t res;
	int off = 0;

	res = vfs_listxattr(realpath.dentry, list, size);
	if (res <= 0 || size == 0 || !ovl_need_xattr_filter(dentry, type))
		return res;

	/* filter out private xattrs */
	while (off < res) {
		char *entry = list + off;
		size_t entry_len = strlen(entry) + 1;

		BUG_ON(off + entry_len > res);

		if (!ovl_is_private_xattr(entry)) {
			off += entry_len;
			continue;
		}

		/* Close the gap by shifting the remainder of the list down. */
		res -= entry_len;
		memmove(entry, entry + entry_len, res - off);
	}

	return res;
}
|
||||
|
||||
int ovl_removexattr(struct dentry *dentry, const char *name)
|
||||
{
|
||||
int err;
|
||||
struct path realpath;
|
||||
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
|
||||
|
||||
err = ovl_want_write(dentry);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = -ENODATA;
|
||||
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
|
||||
goto out_drop_write;
|
||||
|
||||
if (!OVL_TYPE_UPPER(type)) {
|
||||
err = vfs_getxattr(realpath.dentry, name, NULL, 0);
|
||||
if (err < 0)
|
||||
goto out_drop_write;
|
||||
|
||||
err = ovl_copy_up(dentry);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
|
||||
ovl_path_upper(dentry, &realpath);
|
||||
}
|
||||
|
||||
err = vfs_removexattr(realpath.dentry, name);
|
||||
out_drop_write:
|
||||
ovl_drop_write(dentry);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
|
||||
struct dentry *realdentry)
|
||||
{
|
||||
if (OVL_TYPE_UPPER(type))
|
||||
return false;
|
||||
|
||||
if (special_file(realdentry->d_inode->i_mode))
|
||||
return false;
|
||||
|
||||
if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static int ovl_dentry_open(struct dentry *dentry, struct file *file,
|
||||
const struct cred *cred)
|
||||
{
|
||||
int err;
|
||||
struct path realpath;
|
||||
enum ovl_path_type type;
|
||||
bool want_write = false;
|
||||
|
||||
type = ovl_path_real(dentry, &realpath);
|
||||
if (!ovl_is_nocopyupw(dentry)) {
|
||||
if (ovl_open_need_copy_up(file->f_flags, type,
|
||||
realpath.dentry)) {
|
||||
want_write = true;
|
||||
err = ovl_want_write(dentry);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
if (file->f_flags & O_TRUNC)
|
||||
err = ovl_copy_up_last(dentry, NULL, true);
|
||||
else
|
||||
err = ovl_copy_up(dentry);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
|
||||
ovl_path_upper(dentry, &realpath);
|
||||
}
|
||||
}
|
||||
|
||||
err = vfs_open(&realpath, file, cred);
|
||||
out_drop_write:
|
||||
if (want_write)
|
||||
ovl_drop_write(dentry);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static const struct inode_operations_wrapper ovl_file_inode_operations = {
|
||||
.ops = {
|
||||
.setattr = ovl_setattr,
|
||||
.permission = ovl_permission,
|
||||
.getattr = ovl_getattr,
|
||||
.setxattr = ovl_setxattr,
|
||||
.getxattr = ovl_getxattr,
|
||||
.listxattr = ovl_listxattr,
|
||||
.removexattr = ovl_removexattr,
|
||||
},
|
||||
.dentry_open = ovl_dentry_open,
|
||||
};
|
||||
|
||||
static const struct inode_operations ovl_symlink_inode_operations = {
|
||||
.setattr = ovl_setattr,
|
||||
.follow_link = ovl_follow_link,
|
||||
.put_link = ovl_put_link,
|
||||
.readlink = ovl_readlink,
|
||||
.getattr = ovl_getattr,
|
||||
.setxattr = ovl_setxattr,
|
||||
.getxattr = ovl_getxattr,
|
||||
.listxattr = ovl_listxattr,
|
||||
.removexattr = ovl_removexattr,
|
||||
};
|
||||
|
||||
/*
 * Allocate an overlay inode on @sb for the file type found in @mode.
 *
 * Only the S_IFMT bits of @mode are stored.  The inode gets a fresh inode
 * number, the S_NOATIME and S_NOCMTIME flags, and the operation table that
 * matches its type.  Directories additionally record @oe in i_private.
 *
 * Returns the new inode, or NULL if allocation fails or the file type is
 * not one of the handled ones (a warning is emitted in that case).
 */
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
			    struct ovl_entry *oe)
{
	struct inode *inode = new_inode(sb);

	if (!inode)
		return NULL;

	mode &= S_IFMT;

	inode->i_ino = get_next_ino();
	inode->i_mode = mode;
	inode->i_flags |= S_NOATIME | S_NOCMTIME;

	if (mode == S_IFDIR) {
		inode->i_private = oe;
		inode->i_op = &ovl_dir_inode_operations.ops;
		inode->i_fop = &ovl_dir_operations;
		inode->i_flags |= S_IOPS_WRAPPER;
	} else if (mode == S_IFLNK) {
		inode->i_op = &ovl_symlink_inode_operations;
	} else if (mode == S_IFREG || mode == S_IFSOCK || mode == S_IFBLK ||
		   mode == S_IFCHR || mode == S_IFIFO) {
		inode->i_op = &ovl_file_inode_operations.ops;
		inode->i_flags |= S_IOPS_WRAPPER;
	} else {
		WARN(1, "illegal file type: %i\n", mode);
		iput(inode);
		inode = NULL;
	}

	return inode;
}
|
||||
@ -0,0 +1,200 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/kernel.h>
|
||||
|
||||
struct ovl_entry;
|
||||
|
||||
/*
 * Bit flags describing an overlay path.  Values are OR-ed together and
 * tested with the OVL_TYPE_*() macros below.
 */
enum ovl_path_type {
	__OVL_PATH_PURE		= (1 << 0),
	__OVL_PATH_UPPER	= (1 << 1),
	__OVL_PATH_MERGE	= (1 << 2),
};

/* Each macro yields non-zero when the named property holds for @type. */
#define OVL_TYPE_UPPER(type)	((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type)	((type) & __OVL_PATH_MERGE)
#define OVL_TYPE_PURE_UPPER(type)	((type) & __OVL_PATH_PURE)
/* Merge bit set, or upper bit clear. */
#define OVL_TYPE_MERGE_OR_LOWER(type) \
	(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))

/* Prefix shared by all xattrs private to the overlay. */
#define OVL_XATTR_PRE_NAME "trusted.overlay."
/*
 * Length of the prefix without its terminating NUL.  Derived from the
 * string so the two cannot drift apart; the value is still the int 16.
 */
#define OVL_XATTR_PRE_LEN  ((int)sizeof(OVL_XATTR_PRE_NAME) - 1)
#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME "opaque"
|
||||
|
||||
/* vfs_rmdir() wrapper that logs the call and its result via pr_debug(). */
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
	int ret;

	ret = vfs_rmdir(dir, dentry);
	pr_debug("rmdir(%pd2) = %i\n", dentry, ret);

	return ret;
}
|
||||
|
||||
static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
|
||||
{
|
||||
int err = vfs_unlink(dir, dentry, NULL);
|
||||
pr_debug("unlink(%pd2) = %i\n", dentry, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
/* vfs_link() wrapper; the result is logged only when @debug is set. */
static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
			      struct dentry *new_dentry, bool debug)
{
	int ret = vfs_link(old_dentry, dir, new_dentry, NULL);

	if (!debug)
		return ret;

	pr_debug("link(%pd2, %pd2) = %i\n",
		 old_dentry, new_dentry, ret);
	return ret;
}
|
||||
|
||||
static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
|
||||
umode_t mode, bool debug)
|
||||
{
|
||||
int err = vfs_create(dir, dentry, mode, true);
|
||||
if (debug)
|
||||
pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
|
||||
umode_t mode, bool debug)
|
||||
{
|
||||
int err = vfs_mkdir(dir, dentry, mode);
|
||||
if (debug)
|
||||
pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
|
||||
umode_t mode, dev_t dev, bool debug)
|
||||
{
|
||||
int err = vfs_mknod(dir, dentry, mode, dev);
|
||||
if (debug) {
|
||||
pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
|
||||
dentry, mode, dev, err);
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
/* vfs_symlink() wrapper; the result is logged only when @debug is set. */
static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
				 const char *oldname, bool debug)
{
	int ret = vfs_symlink(dir, dentry, oldname);

	if (!debug)
		return ret;

	pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, ret);
	return ret;
}
|
||||
|
||||
/*
 * vfs_setxattr() wrapper that logs the call and its result via pr_debug().
 *
 * @value is a buffer of @size bytes and need not be NUL-terminated, so it
 * is printed with a precision ("%.*s") rather than a field width ("%*s").
 * A field width would only pad the output and let the formatter read past
 * the end of the buffer while looking for a terminator.
 *
 * Returns the result of vfs_setxattr().
 */
static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
				  const void *value, size_t size, int flags)
{
	int err = vfs_setxattr(dentry, name, value, size, flags);

	pr_debug("setxattr(%pd2, \"%s\", \"%.*s\", 0x%x) = %i\n",
		 dentry, name, (int) size, (char *) value, flags, err);
	return err;
}
|
||||
|
||||
/* vfs_removexattr() wrapper that logs the call and its result. */
static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
{
	int ret;

	ret = vfs_removexattr(dentry, name);
	pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, ret);

	return ret;
}
|
||||
|
||||
static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
|
||||
struct inode *newdir, struct dentry *newdentry,
|
||||
unsigned int flags)
|
||||
{
|
||||
int err;
|
||||
|
||||
pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
|
||||
olddentry, newdentry, flags);
|
||||
|
||||
err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
|
||||
|
||||
if (err) {
|
||||
pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
|
||||
olddentry, newdentry, err);
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
/* vfs_whiteout() wrapper that logs the call and its result via pr_debug(). */
static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
{
	int ret;

	ret = vfs_whiteout(dir, dentry);
	pr_debug("whiteout(%pd2) = %i\n", dentry, ret);

	return ret;
}
|
||||
|
||||
bool ovl_is_nocopyupw(struct dentry *dentry);
|
||||
enum ovl_path_type ovl_path_type(struct dentry *dentry);
|
||||
u64 ovl_dentry_version_get(struct dentry *dentry);
|
||||
void ovl_dentry_version_inc(struct dentry *dentry);
|
||||
void ovl_path_upper(struct dentry *dentry, struct path *path);
|
||||
void ovl_path_lower(struct dentry *dentry, struct path *path);
|
||||
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
|
||||
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
|
||||
struct dentry *ovl_dentry_upper(struct dentry *dentry);
|
||||
struct dentry *ovl_dentry_lower(struct dentry *dentry);
|
||||
struct dentry *ovl_dentry_real(struct dentry *dentry);
|
||||
struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
|
||||
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
|
||||
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
|
||||
struct dentry *ovl_workdir(struct dentry *dentry);
|
||||
int ovl_want_write(struct dentry *dentry);
|
||||
void ovl_drop_write(struct dentry *dentry);
|
||||
bool ovl_dentry_is_opaque(struct dentry *dentry);
|
||||
void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
|
||||
bool ovl_is_whiteout(struct dentry *dentry);
|
||||
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
|
||||
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
|
||||
unsigned int flags);
|
||||
struct file *ovl_path_open(struct path *path, int flags);
|
||||
|
||||
struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
|
||||
struct kstat *stat, const char *link);
|
||||
|
||||
/* readdir.c: directory file operations and merged-listing helpers */
extern const struct file_operations ovl_dir_operations;
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
void ovl_cache_free(struct list_head *list);
|
||||
|
||||
/* inode.c */
|
||||
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
|
||||
int ovl_permission(struct inode *inode, int mask);
|
||||
int ovl_setxattr(struct dentry *dentry, const char *name,
|
||||
const void *value, size_t size, int flags);
|
||||
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
|
||||
void *value, size_t size);
|
||||
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
|
||||
int ovl_removexattr(struct dentry *dentry, const char *name);
|
||||
|
||||
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
|
||||
struct ovl_entry *oe);
|
||||
static inline void ovl_copyattr(struct inode *from, struct inode *to)
|
||||
{
|
||||
to->i_uid = from->i_uid;
|
||||
to->i_gid = from->i_gid;
|
||||
}
|
||||
|
||||
/* dir.c */
|
||||
extern const struct inode_operations_wrapper ovl_dir_inode_operations;
|
||||
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
|
||||
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
|
||||
struct kstat *stat, const char *link,
|
||||
struct dentry *hardlink, bool debug);
|
||||
void ovl_cleanup(struct inode *dir, struct dentry *dentry);
|
||||
|
||||
/* copy_up.c: copying lower objects to the upper layer */
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
		    struct path *lowerpath, struct kstat *stat,
		    struct iattr *attr);
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);
|
||||
588
executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/readdir.c
Normal file
588
executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/readdir.c
Normal file
@ -0,0 +1,588 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/xattr.h>
|
||||
#include <linux/rbtree.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/cred.h>
|
||||
#include "overlayfs.h"
|
||||
|
||||
struct ovl_cache_entry {
|
||||
unsigned int len;
|
||||
unsigned int type;
|
||||
u64 ino;
|
||||
struct list_head l_node;
|
||||
struct rb_node node;
|
||||
struct ovl_cache_entry *next_maybe_whiteout;
|
||||
bool is_whiteout;
|
||||
char name[];
|
||||
};
|
||||
|
||||
struct ovl_dir_cache {
|
||||
long refcount;
|
||||
u64 version;
|
||||
struct list_head entries;
|
||||
};
|
||||
|
||||
struct dir_context {
|
||||
const filldir_t actor;
|
||||
//loff_t pos;
|
||||
};
|
||||
|
||||
struct ovl_readdir_data {
|
||||
struct dir_context ctx;
|
||||
bool is_merge;
|
||||
struct rb_root root;
|
||||
struct list_head *list;
|
||||
struct list_head middle;
|
||||
struct ovl_cache_entry *first_maybe_whiteout;
|
||||
int count;
|
||||
int err;
|
||||
};
|
||||
|
||||
struct ovl_dir_file {
|
||||
bool is_real;
|
||||
bool is_upper;
|
||||
struct ovl_dir_cache *cache;
|
||||
struct list_head *cursor;
|
||||
struct file *realfile;
|
||||
struct file *upperfile;
|
||||
};
|
||||
|
||||
static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
|
||||
{
|
||||
return container_of(n, struct ovl_cache_entry, node);
|
||||
}
|
||||
|
||||
static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
|
||||
const char *name, int len)
|
||||
{
|
||||
struct rb_node *node = root->rb_node;
|
||||
int cmp;
|
||||
|
||||
while (node) {
|
||||
struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
|
||||
|
||||
cmp = strncmp(name, p->name, len);
|
||||
if (cmp > 0)
|
||||
node = p->node.rb_right;
|
||||
else if (cmp < 0 || len < p->len)
|
||||
node = p->node.rb_left;
|
||||
else
|
||||
return p;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
|
||||
const char *name, int len,
|
||||
u64 ino, unsigned int d_type)
|
||||
{
|
||||
struct ovl_cache_entry *p;
|
||||
size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
|
||||
|
||||
p = kmalloc(size, GFP_KERNEL);
|
||||
if (!p)
|
||||
return NULL;
|
||||
|
||||
memcpy(p->name, name, len);
|
||||
p->name[len] = '\0';
|
||||
p->len = len;
|
||||
p->type = d_type;
|
||||
p->ino = ino;
|
||||
p->is_whiteout = false;
|
||||
|
||||
if (d_type == DT_CHR) {
|
||||
p->next_maybe_whiteout = rdd->first_maybe_whiteout;
|
||||
rdd->first_maybe_whiteout = p;
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
/*
 * Add @name to both the lookup tree and the tail of rdd->list unless an
 * entry with that name is already in the tree.
 * Returns 0 on success or when the name was already present, -ENOMEM when
 * the new entry cannot be allocated.
 */
static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
				  const char *name, int len, u64 ino,
				  unsigned int d_type)
{
	struct rb_node **link = &rdd->root.rb_node;
	struct rb_node *parent = NULL;
	struct ovl_cache_entry *entry;

	/* descend to the insertion slot, bailing out on a duplicate */
	while (*link) {
		struct ovl_cache_entry *cur = ovl_cache_entry_from_node(*link);
		int order = strncmp(name, cur->name, len);

		parent = *link;
		if (!order && len >= cur->len)
			return 0;

		link = order > 0 ? &cur->node.rb_right : &cur->node.rb_left;
	}

	entry = ovl_cache_entry_new(rdd, name, len, ino, d_type);
	if (!entry)
		return -ENOMEM;

	list_add_tail(&entry->l_node, rdd->list);
	rb_link_node(&entry->node, parent, link);
	rb_insert_color(&entry->node, &rdd->root);

	return 0;
}
|
||||
|
||||
static int ovl_fill_lower(struct ovl_readdir_data *rdd,
|
||||
const char *name, int namelen,
|
||||
loff_t offset, u64 ino, unsigned int d_type)
|
||||
{
|
||||
struct ovl_cache_entry *p;
|
||||
|
||||
p = ovl_cache_entry_find(&rdd->root, name, namelen);
|
||||
if (p) {
|
||||
list_move_tail(&p->l_node, &rdd->middle);
|
||||
} else {
|
||||
p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
|
||||
if (p == NULL)
|
||||
rdd->err = -ENOMEM;
|
||||
else
|
||||
list_add_tail(&p->l_node, &rdd->middle);
|
||||
}
|
||||
|
||||
return rdd->err;
|
||||
}
|
||||
|
||||
void ovl_cache_free(struct list_head *list)
|
||||
{
|
||||
struct ovl_cache_entry *p;
|
||||
struct ovl_cache_entry *n;
|
||||
|
||||
list_for_each_entry_safe(p, n, list, l_node)
|
||||
kfree(p);
|
||||
|
||||
INIT_LIST_HEAD(list);
|
||||
}
|
||||
|
||||
static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
|
||||
{
|
||||
struct ovl_dir_cache *cache = od->cache;
|
||||
|
||||
WARN_ON(cache->refcount <= 0);
|
||||
cache->refcount--;
|
||||
if (!cache->refcount) {
|
||||
if (ovl_dir_cache(dentry) == cache)
|
||||
ovl_set_dir_cache(dentry, NULL);
|
||||
|
||||
ovl_cache_free(&cache->entries);
|
||||
kfree(cache);
|
||||
}
|
||||
}
|
||||
|
||||
static int ovl_fill_merge(void *buf, const char *name, int namelen,
|
||||
loff_t offset, u64 ino, unsigned int d_type)
|
||||
{
|
||||
struct dir_context *ctx = buf;
|
||||
struct ovl_readdir_data *rdd =
|
||||
container_of(ctx, struct ovl_readdir_data, ctx);
|
||||
|
||||
rdd->count++;
|
||||
if (!rdd->is_merge)
|
||||
return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
|
||||
else
|
||||
return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);
|
||||
}
|
||||
|
||||
static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
|
||||
{
|
||||
int err;
|
||||
struct ovl_cache_entry *p;
|
||||
struct dentry *dentry;
|
||||
const struct cred *old_cred;
|
||||
struct cred *override_cred;
|
||||
|
||||
override_cred = prepare_creds();
|
||||
if (!override_cred)
|
||||
return -ENOMEM;
|
||||
|
||||
/*
|
||||
* CAP_DAC_OVERRIDE for lookup
|
||||
*/
|
||||
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
|
||||
old_cred = override_creds(override_cred);
|
||||
|
||||
err = mutex_lock_killable(&dir->d_inode->i_mutex);
|
||||
if (!err) {
|
||||
while (rdd->first_maybe_whiteout) {
|
||||
p = rdd->first_maybe_whiteout;
|
||||
rdd->first_maybe_whiteout = p->next_maybe_whiteout;
|
||||
dentry = lookup_one_len(p->name, dir, p->len);
|
||||
if (!IS_ERR(dentry)) {
|
||||
p->is_whiteout = ovl_is_whiteout(dentry);
|
||||
dput(dentry);
|
||||
}
|
||||
}
|
||||
mutex_unlock(&dir->d_inode->i_mutex);
|
||||
}
|
||||
revert_creds(old_cred);
|
||||
put_cred(override_cred);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_dir_read(struct path *realpath,
|
||||
struct ovl_readdir_data *rdd)
|
||||
{
|
||||
struct file *realfile;
|
||||
int err;
|
||||
|
||||
realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
|
||||
if (IS_ERR(realfile))
|
||||
return PTR_ERR(realfile);
|
||||
|
||||
rdd->first_maybe_whiteout = NULL;
|
||||
//rdd->ctx.pos = 0;
|
||||
do {
|
||||
rdd->count = 0;
|
||||
rdd->err = 0;
|
||||
err = vfs_readdir(realfile, rdd->ctx.actor, rdd);
|
||||
if (err >= 0)
|
||||
err = rdd->err;
|
||||
} while (!err && rdd->count);
|
||||
|
||||
if (!err && rdd->first_maybe_whiteout)
|
||||
err = ovl_check_whiteouts(realpath->dentry, rdd);
|
||||
|
||||
fput(realfile);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static void ovl_dir_reset(struct file *file)
|
||||
{
|
||||
struct ovl_dir_file *od = file->private_data;
|
||||
struct ovl_dir_cache *cache = od->cache;
|
||||
struct dentry *dentry = file->f_path.dentry;
|
||||
enum ovl_path_type type = ovl_path_type(dentry);
|
||||
|
||||
if (cache && ovl_dentry_version_get(dentry) != cache->version) {
|
||||
ovl_cache_put(od, dentry);
|
||||
od->cache = NULL;
|
||||
od->cursor = NULL;
|
||||
}
|
||||
WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
|
||||
if (od->is_real && OVL_TYPE_MERGE(type))
|
||||
od->is_real = false;
|
||||
}
|
||||
|
||||
static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
|
||||
{
|
||||
int err;
|
||||
struct path realpath;
|
||||
struct ovl_readdir_data rdd = {
|
||||
.ctx.actor = ovl_fill_merge,
|
||||
.list = list,
|
||||
.root = RB_ROOT,
|
||||
.is_merge = false,
|
||||
};
|
||||
int idx, next;
|
||||
|
||||
for (idx = 0; idx != -1; idx = next) {
|
||||
next = ovl_path_next(idx, dentry, &realpath);
|
||||
|
||||
if (next != -1) {
|
||||
err = ovl_dir_read(&realpath, &rdd);
|
||||
if (err)
|
||||
break;
|
||||
} else {
|
||||
/*
|
||||
* Insert lowest layer entries before upper ones, this
|
||||
* allows offsets to be reasonably constant
|
||||
*/
|
||||
list_add(&rdd.middle, rdd.list);
|
||||
rdd.is_merge = true;
|
||||
err = ovl_dir_read(&realpath, &rdd);
|
||||
list_del(&rdd.middle);
|
||||
}
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
 * Point od->cursor at the @pos-th node of od->cache->entries, or at the
 * list head itself when the list has fewer than @pos nodes.
 */
static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
	struct list_head *head = &od->cache->entries;
	struct list_head *node = head->next;
	loff_t skipped = 0;

	while (node != head && skipped < pos) {
		node = node->next;
		skipped++;
	}

	/* Cursor is safe since the cache is stable */
	od->cursor = node;
}
|
||||
|
||||
static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
|
||||
{
|
||||
int res;
|
||||
struct ovl_dir_cache *cache;
|
||||
|
||||
cache = ovl_dir_cache(dentry);
|
||||
if (cache && ovl_dentry_version_get(dentry) == cache->version) {
|
||||
cache->refcount++;
|
||||
return cache;
|
||||
}
|
||||
ovl_set_dir_cache(dentry, NULL);
|
||||
|
||||
cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
|
||||
if (!cache)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
cache->refcount = 1;
|
||||
INIT_LIST_HEAD(&cache->entries);
|
||||
|
||||
res = ovl_dir_read_merged(dentry, &cache->entries);
|
||||
if (res) {
|
||||
ovl_cache_free(&cache->entries);
|
||||
kfree(cache);
|
||||
return ERR_PTR(res);
|
||||
}
|
||||
|
||||
cache->version = ovl_dentry_version_get(dentry);
|
||||
ovl_set_dir_cache(dentry, cache);
|
||||
|
||||
return cache;
|
||||
}
|
||||
|
||||
/*
 * ->readdir for overlay directories.  "Real" directories are forwarded to
 * the underlying file; merged ones are served from the cached listing,
 * skipping whiteouts, with f_pos counting cache entries.
 * Returns the vfs_readdir() result for real directories, otherwise 0 or
 * the error from ovl_cache_get().
 */
static int ovl_readdir(struct file *file, void *buf, filldir_t filler)
{
	struct ovl_dir_file *od = file->private_data;
	struct dentry *dentry = file->f_path.dentry;

	if (!file->f_pos)
		ovl_dir_reset(file);

	if (od->is_real) {
		int ret = vfs_readdir(od->realfile, filler, buf);

		file->f_pos = od->realfile->f_pos;
		return ret;
	}

	if (!od->cache) {
		struct ovl_dir_cache *cache = ovl_cache_get(dentry);

		if (IS_ERR(cache))
			return PTR_ERR(cache);

		od->cache = cache;
		ovl_seek_cursor(od, file->f_pos);
	}

	while (od->cursor != &od->cache->entries) {
		struct ovl_cache_entry *entry;

		entry = list_entry(od->cursor, struct ovl_cache_entry, l_node);
		/* stop, without advancing, once the caller's buffer is full */
		if (!entry->is_whiteout &&
		    filler(buf, entry->name, entry->len, file->f_pos,
			   entry->ino, entry->type))
			break;

		od->cursor = entry->l_node.next;
		file->f_pos++;
	}
	return 0;
}
|
||||
|
||||
/*
 * ->llseek for overlay directories, serialized by the inode's i_mutex.
 * Real directories are forwarded to vfs_llseek().  Merged directories
 * accept only SEEK_SET and SEEK_CUR with a non-negative result, and the
 * cache cursor (if any) follows the new position.
 * Returns the new offset, or -EINVAL for an unsupported request.
 */
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
	struct ovl_dir_file *od = file->private_data;
	loff_t res;

	mutex_lock(&file_inode(file)->i_mutex);
	if (!file->f_pos)
		ovl_dir_reset(file);

	if (od->is_real) {
		res = vfs_llseek(od->realfile, offset, origin);
		file->f_pos = od->realfile->f_pos;
		goto out_unlock;
	}

	res = -EINVAL;
	if (origin == SEEK_CUR)
		offset += file->f_pos;
	else if (origin != SEEK_SET)
		goto out_unlock;

	if (offset < 0)
		goto out_unlock;

	if (offset != file->f_pos) {
		file->f_pos = offset;
		if (od->cache)
			ovl_seek_cursor(od, offset);
	}
	res = offset;

out_unlock:
	mutex_unlock(&file_inode(file)->i_mutex);

	return res;
}
|
||||
|
||||
/*
 * ->fsync for overlay directories.  Normally syncs od->realfile.  If the
 * file was not opened as upper but the dentry's path type is upper now,
 * the upper directory is opened (once, cached in od->upperfile under the
 * inode's i_mutex) and synced instead.
 * Returns the vfs_fsync_range() result or the error from opening the
 * upper directory.
 */
static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
			 int datasync)
{
	struct ovl_dir_file *od = file->private_data;
	struct dentry *dentry = file->f_path.dentry;
	struct file *realfile = od->realfile;
	struct inode *inode;
	struct path upperpath;

	/*
	 * Need to check if we started out being a lower dir, but got copied up
	 */
	if (od->is_upper || !OVL_TYPE_UPPER(ovl_path_type(dentry)))
		goto sync;

	inode = file_inode(file);
	realfile = lockless_dereference(od->upperfile);
	if (realfile)
		goto sync;

	ovl_path_upper(dentry, &upperpath);
	realfile = ovl_path_open(&upperpath, O_RDONLY);
	smp_mb__before_spinlock();
	mutex_lock(&inode->i_mutex);
	if (od->upperfile) {
		/* somebody has beaten us to it */
		if (!IS_ERR(realfile))
			fput(realfile);
		realfile = od->upperfile;
	} else if (IS_ERR(realfile)) {
		mutex_unlock(&inode->i_mutex);
		return PTR_ERR(realfile);
	} else {
		od->upperfile = realfile;
	}
	mutex_unlock(&inode->i_mutex);

sync:
	return vfs_fsync_range(realfile, start, end, datasync);
}
|
||||
|
||||
static int ovl_dir_release(struct inode *inode, struct file *file)
|
||||
{
|
||||
struct ovl_dir_file *od = file->private_data;
|
||||
|
||||
if (od->cache) {
|
||||
mutex_lock(&inode->i_mutex);
|
||||
ovl_cache_put(od, file->f_path.dentry);
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
}
|
||||
fput(od->realfile);
|
||||
if (od->upperfile)
|
||||
fput(od->upperfile);
|
||||
kfree(od);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ovl_dir_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
struct path realpath;
|
||||
struct file *realfile;
|
||||
struct ovl_dir_file *od;
|
||||
enum ovl_path_type type;
|
||||
|
||||
od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
|
||||
if (!od)
|
||||
return -ENOMEM;
|
||||
|
||||
type = ovl_path_real(file->f_path.dentry, &realpath);
|
||||
realfile = ovl_path_open(&realpath, file->f_flags);
|
||||
if (IS_ERR(realfile)) {
|
||||
kfree(od);
|
||||
return PTR_ERR(realfile);
|
||||
}
|
||||
od->realfile = realfile;
|
||||
od->is_real = !OVL_TYPE_MERGE(type);
|
||||
od->is_upper = OVL_TYPE_UPPER(type);
|
||||
file->private_data = od;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
const struct file_operations ovl_dir_operations = {
|
||||
.read = generic_read_dir,
|
||||
.open = ovl_dir_open,
|
||||
.readdir = ovl_readdir,
|
||||
.llseek = ovl_dir_llseek,
|
||||
.fsync = ovl_dir_fsync,
|
||||
.release = ovl_dir_release,
|
||||
};
|
||||
|
||||
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
|
||||
{
|
||||
int err;
|
||||
struct ovl_cache_entry *p;
|
||||
|
||||
err = ovl_dir_read_merged(dentry, list);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
err = 0;
|
||||
|
||||
list_for_each_entry(p, list, l_node) {
|
||||
if (p->is_whiteout)
|
||||
continue;
|
||||
|
||||
if (p->name[0] == '.') {
|
||||
if (p->len == 1)
|
||||
continue;
|
||||
if (p->len == 2 && p->name[1] == '.')
|
||||
continue;
|
||||
}
|
||||
err = -ENOTEMPTY;
|
||||
break;
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
|
||||
{
|
||||
struct ovl_cache_entry *p;
|
||||
|
||||
mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
|
||||
list_for_each_entry(p, list, l_node) {
|
||||
struct dentry *dentry;
|
||||
|
||||
if (!p->is_whiteout)
|
||||
continue;
|
||||
|
||||
dentry = lookup_one_len(p->name, upper, p->len);
|
||||
if (IS_ERR(dentry)) {
|
||||
pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
|
||||
upper->d_name.name, p->len, p->name,
|
||||
(int) PTR_ERR(dentry));
|
||||
continue;
|
||||
}
|
||||
ovl_cleanup(upper->d_inode, dentry);
|
||||
dput(dentry);
|
||||
}
|
||||
mutex_unlock(&upper->d_inode->i_mutex);
|
||||
}
|
||||
1203
executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/super.c
Normal file
1203
executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/super.c
Normal file
File diff suppressed because it is too large
Load Diff
21
executer/kernel/mcoverlayfs/linux-4.0.9/Makefile.in
Normal file
21
executer/kernel/mcoverlayfs/linux-4.0.9/Makefile.in
Normal file
@ -0,0 +1,21 @@
|
||||
# Values substituted at configure time; KDIR and ARCH may be overridden.
KDIR ?= @KDIR@
ARCH ?= @ARCH@
KMODDIR = @KMODDIR@
src = @abs_srcdir@

# Module to build and the objects it is linked from.
obj-m += mcoverlay.o

mcoverlay-y := copy_up.o dir.o inode.o readdir.o super.o

.PHONY: clean install modules

# Build the module through the kernel build system rooted at KDIR.
modules:
	$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules

# Remove build products.
clean:
	$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*

# Copy the built module into KMODDIR.
install:
	mkdir -p -m 755 $(KMODDIR)
	install -m 644 mcoverlay.ko $(KMODDIR)
|
||||
|
||||
21
executer/kernel/mcoverlayfs/linux-4.6.7/Makefile.in
Normal file
21
executer/kernel/mcoverlayfs/linux-4.6.7/Makefile.in
Normal file
@ -0,0 +1,21 @@
|
||||
# Values substituted at configure time; KDIR and ARCH may be overridden.
KDIR ?= @KDIR@
ARCH ?= @ARCH@
KMODDIR = @KMODDIR@
src = @abs_srcdir@

# Module to build and the objects it is linked from.
obj-m += mcoverlay.o

mcoverlay-y := copy_up.o dir.o inode.o readdir.o super.o

.PHONY: clean install modules

# Build the module through the kernel build system rooted at KDIR.
modules:
	$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules

# Remove build products.
clean:
	$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*

# Copy the built module into KMODDIR.
install:
	mkdir -p -m 755 $(KMODDIR)
	install -m 644 mcoverlay.ko $(KMODDIR)
|
||||
|
||||
460
executer/kernel/mcoverlayfs/linux-4.6.7/copy_up.c
Normal file
460
executer/kernel/mcoverlayfs/linux-4.6.7/copy_up.c
Normal file
@ -0,0 +1,460 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/splice.h>
|
||||
#include <linux/xattr.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/fdtable.h>
|
||||
#include <linux/ratelimit.h>
|
||||
#include "overlayfs.h"
|
||||
|
||||
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
|
||||
|
||||
static bool __read_mostly ovl_check_copy_up;
|
||||
module_param_named(check_copy_up, ovl_check_copy_up, bool,
|
||||
S_IWUSR | S_IRUGO);
|
||||
MODULE_PARM_DESC(ovl_check_copy_up,
|
||||
"Warn on copy-up when causing process also has a R/O fd open");
|
||||
|
||||
static int ovl_check_fd(const void *data, struct file *f, unsigned int fd)
|
||||
{
|
||||
const struct dentry *dentry = data;
|
||||
|
||||
if (f->f_inode == d_inode(dentry))
|
||||
pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n",
|
||||
f, fd, current->pid, current->comm);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check the fds open by this process and warn if something like the following
|
||||
* scenario is about to occur:
|
||||
*
|
||||
* fd1 = open("foo", O_RDONLY);
|
||||
* fd2 = open("foo", O_RDWR);
|
||||
*/
|
||||
static void ovl_do_check_copy_up(struct dentry *dentry)
|
||||
{
|
||||
if (ovl_check_copy_up)
|
||||
iterate_fd(current->files, 0, ovl_check_fd, dentry);
|
||||
}
|
||||
|
||||
int ovl_copy_xattr(struct dentry *old, struct dentry *new, unsigned opt)
|
||||
{
|
||||
ssize_t list_size, size, value_size = 0;
|
||||
char *buf, *name, *value = NULL;
|
||||
int uninitialized_var(error);
|
||||
|
||||
if (!old->d_inode->i_op->getxattr ||
|
||||
!new->d_inode->i_op->getxattr)
|
||||
return 0;
|
||||
|
||||
list_size = vfs_listxattr(old, NULL, 0);
|
||||
if (list_size <= 0) {
|
||||
if (list_size == -EOPNOTSUPP)
|
||||
return 0;
|
||||
return list_size;
|
||||
}
|
||||
|
||||
buf = kzalloc(list_size, GFP_KERNEL);
|
||||
if (!buf)
|
||||
return -ENOMEM;
|
||||
|
||||
list_size = vfs_listxattr(old, buf, list_size);
|
||||
if (list_size <= 0) {
|
||||
error = list_size;
|
||||
goto out;
|
||||
}
|
||||
|
||||
for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
|
||||
retry:
|
||||
size = vfs_getxattr(old, name, value, value_size);
|
||||
if (size == -ERANGE)
|
||||
size = vfs_getxattr(old, name, NULL, 0);
|
||||
|
||||
if (size < 0) {
|
||||
if (OVL_OPT_NOFSCHECK(opt)) {
|
||||
OVL_DEBUG("fail: old=%pd4, i_ino=%lu, name=%s\n",
|
||||
old, old->d_inode->i_ino, name);
|
||||
continue;
|
||||
} else {
|
||||
error = size;
|
||||
break;
|
||||
}
|
||||
}
|
||||
OVL_DEBUG("success: old=%pd4, i_ino=%lu, name=%s\n",
|
||||
old, old->d_inode->i_ino, name);
|
||||
|
||||
if (size > value_size) {
|
||||
void *new;
|
||||
|
||||
new = krealloc(value, size, GFP_KERNEL);
|
||||
if (!new) {
|
||||
error = -ENOMEM;
|
||||
break;
|
||||
}
|
||||
value = new;
|
||||
value_size = size;
|
||||
goto retry;
|
||||
}
|
||||
|
||||
error = vfs_setxattr(new, name, value, size, 0);
|
||||
if (error)
|
||||
break;
|
||||
}
|
||||
kfree(value);
|
||||
out:
|
||||
kfree(buf);
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
 * Copy @len bytes of file data from @old to @new in chunks of at most
 * OVL_COPY_UP_CHUNK_SIZE using do_splice_direct().  The copy is aborted
 * with -EINTR when a fatal signal is pending.
 * Returns 0 on success (or when the splice reports 0 bytes), otherwise a
 * negative errno.
 */
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
{
	struct file *src;
	struct file *dst;
	loff_t src_pos = 0;
	loff_t dst_pos = 0;
	int error = 0;

	if (len == 0)
		return 0;

	src = ovl_path_open(old, O_LARGEFILE | O_RDONLY);
	if (IS_ERR(src))
		return PTR_ERR(src);

	dst = ovl_path_open(new, O_LARGEFILE | O_WRONLY);
	if (IS_ERR(dst)) {
		error = PTR_ERR(dst);
		goto out_put_src;
	}

	/* FIXME: copy up sparse files efficiently */
	while (len) {
		size_t chunk = OVL_COPY_UP_CHUNK_SIZE;
		long copied;

		if (len < chunk)
			chunk = len;

		if (signal_pending_state(TASK_KILLABLE, current)) {
			error = -EINTR;
			break;
		}

		copied = do_splice_direct(src, &src_pos, dst, &dst_pos,
					  chunk, SPLICE_F_MOVE);
		if (copied <= 0) {
			error = copied;
			break;
		}
		WARN_ON(src_pos != dst_pos);

		len -= copied;
	}

	fput(dst);
out_put_src:
	fput(src);
	return error;
}
|
||||
|
||||
static char *ovl_read_symlink(struct dentry *realdentry)
|
||||
{
|
||||
int res;
|
||||
char *buf;
|
||||
struct inode *inode = realdentry->d_inode;
|
||||
mm_segment_t old_fs;
|
||||
|
||||
res = -EINVAL;
|
||||
if (!inode->i_op->readlink)
|
||||
goto err;
|
||||
|
||||
res = -ENOMEM;
|
||||
buf = (char *) __get_free_page(GFP_KERNEL);
|
||||
if (!buf)
|
||||
goto err;
|
||||
|
||||
old_fs = get_fs();
|
||||
set_fs(get_ds());
|
||||
/* The cast to a user pointer is valid due to the set_fs() */
|
||||
res = inode->i_op->readlink(realdentry,
|
||||
(char __user *)buf, PAGE_SIZE - 1);
|
||||
set_fs(old_fs);
|
||||
if (res < 0) {
|
||||
free_page((unsigned long) buf);
|
||||
goto err;
|
||||
}
|
||||
buf[res] = '\0';
|
||||
|
||||
return buf;
|
||||
|
||||
err:
|
||||
return ERR_PTR(res);
|
||||
}
|
||||
|
||||
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
|
||||
{
|
||||
struct iattr attr = {
|
||||
.ia_valid =
|
||||
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
|
||||
.ia_atime = stat->atime,
|
||||
.ia_mtime = stat->mtime,
|
||||
};
|
||||
|
||||
return notify_change(upperdentry, &attr, NULL);
|
||||
}
|
||||
|
||||
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
|
||||
{
|
||||
int err = 0;
|
||||
|
||||
if (!S_ISLNK(stat->mode)) {
|
||||
struct iattr attr = {
|
||||
.ia_valid = ATTR_MODE,
|
||||
.ia_mode = stat->mode,
|
||||
};
|
||||
err = notify_change(upperdentry, &attr, NULL);
|
||||
}
|
||||
if (!err) {
|
||||
struct iattr attr = {
|
||||
.ia_valid = ATTR_UID | ATTR_GID,
|
||||
.ia_uid = stat->uid,
|
||||
.ia_gid = stat->gid,
|
||||
};
|
||||
err = notify_change(upperdentry, &attr, NULL);
|
||||
}
|
||||
if (!err)
|
||||
ovl_set_timestamps(upperdentry, stat);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
|
||||
struct dentry *dentry, struct path *lowerpath,
|
||||
struct kstat *stat, const char *link)
|
||||
{
|
||||
struct inode *wdir = workdir->d_inode;
|
||||
struct inode *udir = upperdir->d_inode;
|
||||
struct dentry *newdentry = NULL;
|
||||
struct dentry *upper = NULL;
|
||||
umode_t mode = stat->mode;
|
||||
unsigned opt = ovl_get_config_opt(dentry);
|
||||
int err;
|
||||
|
||||
newdentry = ovl_lookup_temp(workdir, dentry);
|
||||
err = PTR_ERR(newdentry);
|
||||
if (IS_ERR(newdentry))
|
||||
goto out;
|
||||
|
||||
upper = lookup_one_len(dentry->d_name.name, upperdir,
|
||||
dentry->d_name.len);
|
||||
err = PTR_ERR(upper);
|
||||
if (IS_ERR(upper))
|
||||
goto out1;
|
||||
|
||||
/* Can't properly set mode on creation because of the umask */
|
||||
stat->mode &= S_IFMT;
|
||||
err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
|
||||
stat->mode = mode;
|
||||
if (err)
|
||||
goto out2;
|
||||
|
||||
if (S_ISREG(stat->mode)) {
|
||||
struct path upperpath;
|
||||
|
||||
ovl_path_upper(dentry, &upperpath);
|
||||
BUG_ON(upperpath.dentry != NULL);
|
||||
upperpath.dentry = newdentry;
|
||||
|
||||
err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
}
|
||||
|
||||
err = ovl_copy_xattr(lowerpath->dentry, newdentry, opt);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
inode_lock(newdentry->d_inode);
|
||||
err = ovl_set_attr(newdentry, stat);
|
||||
inode_unlock(newdentry->d_inode);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
ovl_dentry_update(dentry, newdentry);
|
||||
newdentry = NULL;
|
||||
|
||||
/*
|
||||
* Non-directores become opaque when copied up.
|
||||
*/
|
||||
if (!S_ISDIR(stat->mode))
|
||||
ovl_dentry_set_opaque(dentry, true);
|
||||
out2:
|
||||
dput(upper);
|
||||
out1:
|
||||
dput(newdentry);
|
||||
out:
|
||||
return err;
|
||||
|
||||
out_cleanup:
|
||||
ovl_cleanup(wdir, newdentry);
|
||||
goto out2;
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy up a single dentry
|
||||
*
|
||||
* Directory renames only allowed on "pure upper" (already created on
|
||||
* upper filesystem, never copied up). Directories which are on lower or
|
||||
* are merged may not be renamed. For these -EXDEV is returned and
|
||||
* userspace has to deal with it. This means, when copying up a
|
||||
* directory we can rely on it and ancestors being stable.
|
||||
*
|
||||
* Non-directory renames start with copy up of source if necessary. The
|
||||
* actual rename will only proceed once the copy up was successful. Copy
|
||||
* up uses upper parent i_mutex for exclusion. Since rename can change
|
||||
* d_parent it is possible that the copy up will lock the old parent. At
|
||||
* that point the file will have already been copied up anyway.
|
||||
*/
|
||||
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
|
||||
struct path *lowerpath, struct kstat *stat)
|
||||
{
|
||||
struct dentry *workdir = ovl_workdir(dentry);
|
||||
int err;
|
||||
struct kstat pstat;
|
||||
struct path parentpath;
|
||||
struct dentry *upperdir;
|
||||
struct dentry *upperdentry;
|
||||
const struct cred *old_cred;
|
||||
struct cred *override_cred;
|
||||
char *link = NULL;
|
||||
|
||||
if (WARN_ON(!workdir))
|
||||
return -EROFS;
|
||||
|
||||
ovl_do_check_copy_up(lowerpath->dentry);
|
||||
|
||||
ovl_path_upper(parent, &parentpath);
|
||||
upperdir = parentpath.dentry;
|
||||
|
||||
err = vfs_getattr(&parentpath, &pstat);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
if (S_ISLNK(stat->mode)) {
|
||||
link = ovl_read_symlink(lowerpath->dentry);
|
||||
if (IS_ERR(link))
|
||||
return PTR_ERR(link);
|
||||
}
|
||||
|
||||
err = -ENOMEM;
|
||||
override_cred = prepare_creds();
|
||||
if (!override_cred)
|
||||
goto out_free_link;
|
||||
|
||||
override_cred->fsuid = stat->uid;
|
||||
override_cred->fsgid = stat->gid;
|
||||
/*
|
||||
* CAP_SYS_ADMIN for copying up extended attributes
|
||||
* CAP_DAC_OVERRIDE for create
|
||||
* CAP_FOWNER for chmod, timestamp update
|
||||
* CAP_FSETID for chmod
|
||||
* CAP_CHOWN for chown
|
||||
* CAP_MKNOD for mknod
|
||||
*/
|
||||
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
|
||||
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
|
||||
cap_raise(override_cred->cap_effective, CAP_FOWNER);
|
||||
cap_raise(override_cred->cap_effective, CAP_FSETID);
|
||||
cap_raise(override_cred->cap_effective, CAP_CHOWN);
|
||||
cap_raise(override_cred->cap_effective, CAP_MKNOD);
|
||||
old_cred = override_creds(override_cred);
|
||||
|
||||
err = -EIO;
|
||||
if (lock_rename(workdir, upperdir) != NULL) {
|
||||
pr_err("overlayfs: failed to lock workdir+upperdir\n");
|
||||
goto out_unlock;
|
||||
}
|
||||
upperdentry = ovl_dentry_upper(dentry);
|
||||
if (upperdentry) {
|
||||
/* Raced with another copy-up? Nothing to do, then... */
|
||||
err = 0;
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
|
||||
stat, link);
|
||||
if (!err) {
|
||||
/* Restore timestamps on parent (best effort) */
|
||||
ovl_set_timestamps(upperdir, &pstat);
|
||||
}
|
||||
out_unlock:
|
||||
unlock_rename(workdir, upperdir);
|
||||
revert_creds(old_cred);
|
||||
put_cred(override_cred);
|
||||
|
||||
out_free_link:
|
||||
if (link)
|
||||
free_page((unsigned long) link);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
int ovl_copy_up(struct dentry *dentry)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = 0;
|
||||
while (!err) {
|
||||
struct dentry *next;
|
||||
struct dentry *parent;
|
||||
struct path lowerpath;
|
||||
struct kstat stat;
|
||||
enum ovl_path_type type = ovl_path_type(dentry);
|
||||
|
||||
if (OVL_TYPE_UPPER(type))
|
||||
break;
|
||||
|
||||
next = dget(dentry);
|
||||
/* find the topmost dentry not yet copied up */
|
||||
for (;;) {
|
||||
parent = dget_parent(next);
|
||||
|
||||
type = ovl_path_type(parent);
|
||||
if (OVL_TYPE_UPPER(type))
|
||||
break;
|
||||
|
||||
dput(next);
|
||||
next = parent;
|
||||
}
|
||||
|
||||
ovl_path_lower(next, &lowerpath);
|
||||
err = vfs_getattr(&lowerpath, &stat);
|
||||
if (!err)
|
||||
err = ovl_copy_up_one(parent, next, &lowerpath, &stat);
|
||||
|
||||
dput(parent);
|
||||
dput(next);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
969
executer/kernel/mcoverlayfs/linux-4.6.7/dir.c
Normal file
969
executer/kernel/mcoverlayfs/linux-4.6.7/dir.c
Normal file
@ -0,0 +1,969 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/xattr.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/cred.h>
|
||||
#include "overlayfs.h"
|
||||
|
||||
/*
 * Remove @wdentry from the directory inode @wdir: rmdir for a directory,
 * unlink for anything else.  A failure is only logged, not returned.
 */
void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
{
	int err;

	/* Hold an extra reference across the removal */
	dget(wdentry);
	err = d_is_dir(wdentry) ? ovl_do_rmdir(wdir, wdentry)
				: ovl_do_unlink(wdir, wdentry);
	dput(wdentry);

	if (!err)
		return;

	pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
	       wdentry, err);
}
|
||||
|
||||
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
|
||||
{
|
||||
struct dentry *temp;
|
||||
char name[20];
|
||||
|
||||
snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);
|
||||
|
||||
temp = lookup_one_len(name, workdir, strlen(name));
|
||||
if (!IS_ERR(temp) && temp->d_inode) {
|
||||
pr_err("overlayfs: workdir/%s already exists\n", name);
|
||||
dput(temp);
|
||||
temp = ERR_PTR(-EIO);
|
||||
}
|
||||
|
||||
return temp;
|
||||
}
|
||||
|
||||
/* caller holds i_mutex on workdir */
|
||||
static struct dentry *ovl_whiteout(struct dentry *workdir,
|
||||
struct dentry *dentry)
|
||||
{
|
||||
int err;
|
||||
struct dentry *whiteout;
|
||||
struct inode *wdir = workdir->d_inode;
|
||||
|
||||
whiteout = ovl_lookup_temp(workdir, dentry);
|
||||
if (IS_ERR(whiteout))
|
||||
return whiteout;
|
||||
|
||||
err = ovl_do_whiteout(wdir, whiteout);
|
||||
if (err) {
|
||||
dput(whiteout);
|
||||
whiteout = ERR_PTR(err);
|
||||
}
|
||||
|
||||
return whiteout;
|
||||
}
|
||||
|
||||
/*
 * Create the real object for @newdentry in directory inode @dir.
 *
 * With @hardlink set, a link to @hardlink is made.  Otherwise the file type
 * bits of @stat->mode select create, mkdir, mknod or symlink (@link is the
 * symlink target).  @debug is passed through to the ovl_do_*() helpers.
 *
 * Returns 0 on success, -ESTALE if @newdentry is already positive, -EPERM
 * for an unknown file type, -ENOENT if the operation reported success but
 * left @newdentry negative, or the error of the underlying operation.
 */
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
		    struct kstat *stat, const char *link,
		    struct dentry *hardlink, bool debug)
{
	int err;

	if (newdentry->d_inode)
		return -ESTALE;

	if (hardlink)
		err = ovl_do_link(hardlink, dir, newdentry, debug);
	else if (S_ISREG(stat->mode))
		err = ovl_do_create(dir, newdentry, stat->mode, debug);
	else if (S_ISDIR(stat->mode))
		err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
	else if (S_ISCHR(stat->mode) || S_ISBLK(stat->mode) ||
		 S_ISFIFO(stat->mode) || S_ISSOCK(stat->mode))
		err = ovl_do_mknod(dir, newdentry,
				   stat->mode, stat->rdev, debug);
	else if (S_ISLNK(stat->mode))
		err = ovl_do_symlink(dir, newdentry, link, debug);
	else
		err = -EPERM;

	/*
	 * Not quite sure if non-instantiated dentry is legal or not.
	 * VFS doesn't seem to care so check and warn here.
	 */
	if (!err && WARN_ON(!newdentry->d_inode))
		err = -ENOENT;

	return err;
}
|
||||
|
||||
static int ovl_set_opaque(struct dentry *upperdentry)
|
||||
{
|
||||
return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
|
||||
}
|
||||
|
||||
static void ovl_remove_opaque(struct dentry *upperdentry)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);
|
||||
if (err) {
|
||||
pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
|
||||
upperdentry->d_name.name, err);
|
||||
}
|
||||
}
|
||||
|
||||
static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
|
||||
struct kstat *stat)
|
||||
{
|
||||
int err;
|
||||
enum ovl_path_type type;
|
||||
struct path realpath;
|
||||
|
||||
type = ovl_path_real(dentry, &realpath);
|
||||
err = vfs_getattr(&realpath, stat);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
stat->dev = dentry->d_sb->s_dev;
|
||||
stat->ino = dentry->d_inode->i_ino;
|
||||
|
||||
/*
|
||||
* It's probably not worth it to count subdirs to get the
|
||||
* correct link count. nlink=1 seems to pacify 'find' and
|
||||
* other utilities.
|
||||
*/
|
||||
if (OVL_TYPE_MERGE(type))
|
||||
stat->nlink = 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
|
||||
struct kstat *stat, const char *link,
|
||||
struct dentry *hardlink)
|
||||
{
|
||||
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
|
||||
struct inode *udir = upperdir->d_inode;
|
||||
struct dentry *newdentry;
|
||||
int err;
|
||||
|
||||
inode_lock_nested(udir, I_MUTEX_PARENT);
|
||||
newdentry = lookup_one_len(dentry->d_name.name, upperdir,
|
||||
dentry->d_name.len);
|
||||
err = PTR_ERR(newdentry);
|
||||
if (IS_ERR(newdentry))
|
||||
goto out_unlock;
|
||||
err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
|
||||
if (err)
|
||||
goto out_dput;
|
||||
|
||||
ovl_dentry_version_inc(dentry->d_parent);
|
||||
ovl_dentry_update(dentry, newdentry);
|
||||
ovl_copyattr(newdentry->d_inode, inode);
|
||||
d_instantiate(dentry, inode);
|
||||
newdentry = NULL;
|
||||
out_dput:
|
||||
dput(newdentry);
|
||||
out_unlock:
|
||||
inode_unlock(udir);
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_lock_rename_workdir(struct dentry *workdir,
|
||||
struct dentry *upperdir)
|
||||
{
|
||||
/* Workdir should not be the same as upperdir */
|
||||
if (workdir == upperdir)
|
||||
goto err;
|
||||
|
||||
/* Workdir should not be subdir of upperdir and vice versa */
|
||||
if (lock_rename(workdir, upperdir) != NULL)
|
||||
goto err_unlock;
|
||||
|
||||
return 0;
|
||||
|
||||
err_unlock:
|
||||
unlock_rename(workdir, upperdir);
|
||||
err:
|
||||
pr_err("overlayfs: failed to lock workdir+upperdir\n");
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
static struct dentry *ovl_clear_empty(struct dentry *dentry,
|
||||
struct list_head *list)
|
||||
{
|
||||
struct dentry *workdir = ovl_workdir(dentry);
|
||||
struct inode *wdir = workdir->d_inode;
|
||||
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
|
||||
struct inode *udir = upperdir->d_inode;
|
||||
struct path upperpath;
|
||||
struct dentry *upper;
|
||||
struct dentry *opaquedir;
|
||||
struct kstat stat;
|
||||
unsigned opt = ovl_get_config_opt(dentry);
|
||||
int err;
|
||||
|
||||
if (WARN_ON(!workdir))
|
||||
return ERR_PTR(-EROFS);
|
||||
|
||||
err = ovl_lock_rename_workdir(workdir, upperdir);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
ovl_path_upper(dentry, &upperpath);
|
||||
err = vfs_getattr(&upperpath, &stat);
|
||||
if (err)
|
||||
goto out_unlock;
|
||||
|
||||
err = -ESTALE;
|
||||
if (!S_ISDIR(stat.mode))
|
||||
goto out_unlock;
|
||||
upper = upperpath.dentry;
|
||||
if (upper->d_parent->d_inode != udir)
|
||||
goto out_unlock;
|
||||
|
||||
opaquedir = ovl_lookup_temp(workdir, dentry);
|
||||
err = PTR_ERR(opaquedir);
|
||||
if (IS_ERR(opaquedir))
|
||||
goto out_unlock;
|
||||
|
||||
err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
|
||||
if (err)
|
||||
goto out_dput;
|
||||
|
||||
err = ovl_copy_xattr(upper, opaquedir, opt);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
err = ovl_set_opaque(opaquedir);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
inode_lock(opaquedir->d_inode);
|
||||
err = ovl_set_attr(opaquedir, &stat);
|
||||
inode_unlock(opaquedir->d_inode);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
ovl_cleanup_whiteouts(upper, list);
|
||||
ovl_cleanup(wdir, upper);
|
||||
unlock_rename(workdir, upperdir);
|
||||
|
||||
/* dentry's upper doesn't match now, get rid of it */
|
||||
d_drop(dentry);
|
||||
|
||||
return opaquedir;
|
||||
|
||||
out_cleanup:
|
||||
ovl_cleanup(wdir, opaquedir);
|
||||
out_dput:
|
||||
dput(opaquedir);
|
||||
out_unlock:
|
||||
unlock_rename(workdir, upperdir);
|
||||
out:
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
|
||||
static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
|
||||
{
|
||||
int err;
|
||||
struct dentry *ret = NULL;
|
||||
LIST_HEAD(list);
|
||||
|
||||
err = ovl_check_empty_dir(dentry, &list);
|
||||
if (err)
|
||||
ret = ERR_PTR(err);
|
||||
else {
|
||||
/*
|
||||
* If no upperdentry then skip clearing whiteouts.
|
||||
*
|
||||
* Can race with copy-up, since we don't hold the upperdir
|
||||
* mutex. Doesn't matter, since copy-up can't create a
|
||||
* non-empty directory from an empty one.
|
||||
*/
|
||||
if (ovl_dentry_upper(dentry))
|
||||
ret = ovl_clear_empty(dentry, &list);
|
||||
}
|
||||
|
||||
ovl_cache_free(&list);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
|
||||
struct kstat *stat, const char *link,
|
||||
struct dentry *hardlink)
|
||||
{
|
||||
struct dentry *workdir = ovl_workdir(dentry);
|
||||
struct inode *wdir = workdir->d_inode;
|
||||
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
|
||||
struct inode *udir = upperdir->d_inode;
|
||||
struct dentry *upper;
|
||||
struct dentry *newdentry;
|
||||
int err;
|
||||
|
||||
if (WARN_ON(!workdir))
|
||||
return -EROFS;
|
||||
|
||||
err = ovl_lock_rename_workdir(workdir, upperdir);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
newdentry = ovl_lookup_temp(workdir, dentry);
|
||||
err = PTR_ERR(newdentry);
|
||||
if (IS_ERR(newdentry))
|
||||
goto out_unlock;
|
||||
|
||||
upper = lookup_one_len(dentry->d_name.name, upperdir,
|
||||
dentry->d_name.len);
|
||||
err = PTR_ERR(upper);
|
||||
if (IS_ERR(upper))
|
||||
goto out_dput;
|
||||
|
||||
err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
|
||||
if (err)
|
||||
goto out_dput2;
|
||||
|
||||
if (S_ISDIR(stat->mode)) {
|
||||
err = ovl_set_opaque(newdentry);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
err = ovl_do_rename(wdir, newdentry, udir, upper,
|
||||
RENAME_EXCHANGE);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
ovl_cleanup(wdir, upper);
|
||||
} else {
|
||||
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
}
|
||||
ovl_dentry_version_inc(dentry->d_parent);
|
||||
ovl_dentry_update(dentry, newdentry);
|
||||
ovl_copyattr(newdentry->d_inode, inode);
|
||||
d_instantiate(dentry, inode);
|
||||
newdentry = NULL;
|
||||
out_dput2:
|
||||
dput(upper);
|
||||
out_dput:
|
||||
dput(newdentry);
|
||||
out_unlock:
|
||||
unlock_rename(workdir, upperdir);
|
||||
out:
|
||||
return err;
|
||||
|
||||
out_cleanup:
|
||||
ovl_cleanup(wdir, newdentry);
|
||||
goto out_dput2;
|
||||
}
|
||||
|
||||
/*
 * Common worker for create, mkdir, mknod, symlink and link.
 *
 * Allocates the overlay inode, copies up the parent directory and then
 * creates the object either directly in the upper directory or, when
 * @dentry is marked opaque, through the workdir with raised capabilities.
 *
 * Returns 0 on success or a negative errno.  The new inode is released on
 * failure.
 */
static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
			      const char *link, struct dentry *hardlink)
{
	struct kstat stat = {
		.mode = mode,
		.rdev = rdev,
	};
	struct inode *inode;
	int err;

	inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
	if (!inode)
		return -ENOMEM;

	err = ovl_copy_up(dentry->d_parent);
	if (err)
		goto out_iput;

	if (ovl_dentry_is_opaque(dentry)) {
		const struct cred *old_cred;
		struct cred *override_cred = prepare_creds();

		if (!override_cred) {
			err = -ENOMEM;
			goto out_iput;
		}

		/*
		 * CAP_SYS_ADMIN for setting opaque xattr
		 * CAP_DAC_OVERRIDE for create in workdir, rename
		 * CAP_FOWNER for removing whiteout from sticky dir
		 */
		cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
		cap_raise(override_cred->cap_effective, CAP_FOWNER);
		old_cred = override_creds(override_cred);

		err = ovl_create_over_whiteout(dentry, inode, &stat, link,
					       hardlink);

		revert_creds(old_cred);
		put_cred(override_cred);
	} else {
		err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
	}

	/* On success the inode now belongs to the dentry */
	if (!err)
		inode = NULL;
out_iput:
	iput(inode);
	return err;
}
|
||||
|
||||
/*
 * Create a new (non-hardlink) object for @dentry, bracketed by
 * ovl_want_write()/ovl_drop_write().  Returns 0 or a negative errno.
 */
static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
			     const char *link)
{
	int err = ovl_want_write(dentry);

	if (err)
		return err;

	err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
	ovl_drop_write(dentry);

	return err;
}
|
||||
|
||||
static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
|
||||
bool excl)
|
||||
{
|
||||
return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
|
||||
}
|
||||
|
||||
/* ->mkdir: make a directory; only the low 07777 bits of @mode are kept. */
static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	int dir_mode = (mode & 07777) | S_IFDIR;

	return ovl_create_object(dentry, dir_mode, 0, NULL);
}
|
||||
|
||||
static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
|
||||
dev_t rdev)
|
||||
{
|
||||
/* Don't allow creation of "whiteout" on overlay */
|
||||
if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
|
||||
return -EPERM;
|
||||
|
||||
return ovl_create_object(dentry, mode, rdev, NULL);
|
||||
}
|
||||
|
||||
static int ovl_symlink(struct inode *dir, struct dentry *dentry,
|
||||
const char *link)
|
||||
{
|
||||
return ovl_create_object(dentry, S_IFLNK, 0, link);
|
||||
}
|
||||
|
||||
static int ovl_link(struct dentry *old, struct inode *newdir,
|
||||
struct dentry *new)
|
||||
{
|
||||
int err;
|
||||
struct dentry *upper;
|
||||
|
||||
err = ovl_want_write(old);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = ovl_copy_up(old);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
|
||||
upper = ovl_dentry_upper(old);
|
||||
err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);
|
||||
|
||||
out_drop_write:
|
||||
ovl_drop_write(old);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
 * Remove @dentry by putting a whiteout in its place in the upper directory.
 *
 * A directory is first checked for emptiness; a merged or lower directory
 * additionally has its upper directory replaced by a clean opaque one.  A
 * whiteout is then created in the workdir and renamed over the upper entry,
 * using RENAME_EXCHANGE when that entry is a directory, in which case the
 * exchanged-out directory is removed from the workdir afterwards.
 *
 * Returns 0 on success or a negative errno (-ESTALE if the upper entry
 * found under the lock is not the expected one).
 */
static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *whiteout;
	struct dentry *upper;
	struct dentry *opaquedir = NULL;
	int err;
	int flags = 0;

	if (WARN_ON(!workdir))
		return -EROFS;

	/* Only dereference workdir once it is known to be non-NULL */
	wdir = workdir->d_inode;

	if (is_dir) {
		if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
			opaquedir = ovl_check_empty_and_clear(dentry);
			err = PTR_ERR(opaquedir);
			/* Skip out_dput: opaquedir is an ERR_PTR here */
			if (IS_ERR(opaquedir))
				goto out;
		} else {
			LIST_HEAD(list);

			/*
			 * When removing an empty opaque directory, then it
			 * makes no sense to replace it with an exact replica of
			 * itself.  But emptiness still needs to be checked.
			 */
			err = ovl_check_empty_dir(dentry, &list);
			ovl_cache_free(&list);
			if (err)
				goto out;
		}
	}

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out_dput;

	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	err = PTR_ERR(upper);
	if (IS_ERR(upper))
		goto out_unlock;

	/* Verify that the entry found under the lock is the expected one */
	err = -ESTALE;
	if ((opaquedir && upper != opaquedir) ||
	    (!opaquedir && ovl_dentry_upper(dentry) &&
	     upper != ovl_dentry_upper(dentry))) {
		goto out_dput_upper;
	}

	whiteout = ovl_whiteout(workdir, dentry);
	err = PTR_ERR(whiteout);
	if (IS_ERR(whiteout))
		goto out_dput_upper;

	if (d_is_dir(upper))
		flags = RENAME_EXCHANGE;

	err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
	if (err)
		goto kill_whiteout;
	/* After an exchange the old directory sits in the workdir */
	if (flags)
		ovl_cleanup(wdir, upper);

	ovl_dentry_version_inc(dentry->d_parent);
out_d_drop:
	d_drop(dentry);
	dput(whiteout);
out_dput_upper:
	dput(upper);
out_unlock:
	unlock_rename(workdir, upperdir);
out_dput:
	dput(opaquedir);
out:
	return err;

kill_whiteout:
	ovl_cleanup(wdir, whiteout);
	goto out_d_drop;
}
|
||||
|
||||
/*
 * Remove @dentry directly from the upper directory (rmdir or unlink).
 *
 * The upper parent's inode lock is held for the operation.  Returns -ESTALE
 * if the entry found under the lock is not @dentry's upper dentry, otherwise
 * the result of the removal.
 */
static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
{
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *dir = upperdir->d_inode;
	struct dentry *upper;
	int err;

	inode_lock_nested(dir, I_MUTEX_PARENT);

	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	if (IS_ERR(upper)) {
		err = PTR_ERR(upper);
		goto out_unlock;
	}

	if (upper == ovl_dentry_upper(dentry)) {
		err = is_dir ? vfs_rmdir(dir, upper)
			     : vfs_unlink(dir, upper, NULL);
		ovl_dentry_version_inc(dentry->d_parent);
	} else {
		err = -ESTALE;
	}
	dput(upper);

	/*
	 * Keeping this dentry hashed would mean having to release
	 * upperpath/lowerpath, which could only be done if we are the
	 * sole user of this dentry.  Too tricky...  Just unhash for
	 * now.
	 */
	if (!err)
		d_drop(dentry);

out_unlock:
	inode_unlock(dir);

	return err;
}
|
||||
|
||||
static inline int ovl_check_sticky(struct dentry *dentry)
|
||||
{
|
||||
struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
|
||||
struct inode *inode = ovl_dentry_real(dentry)->d_inode;
|
||||
|
||||
if (check_sticky(dir, inode))
|
||||
return -EPERM;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Common worker for unlink and rmdir.
 *
 * After the sticky check and copy up of the parent, a pure upper entry is
 * removed directly; anything else is replaced by a whiteout with raised
 * capabilities.  Returns 0 or a negative errno.
 */
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
	enum ovl_path_type type;
	int err;

	err = ovl_check_sticky(dentry);
	if (err)
		return err;

	err = ovl_want_write(dentry);
	if (err)
		return err;

	err = ovl_copy_up(dentry->d_parent);
	if (err)
		goto out_drop_write;

	type = ovl_path_type(dentry);
	if (!OVL_TYPE_PURE_UPPER(type)) {
		const struct cred *old_cred;
		struct cred *override_cred = prepare_creds();

		if (!override_cred) {
			err = -ENOMEM;
			goto out_drop_write;
		}

		/*
		 * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
		 * CAP_DAC_OVERRIDE for create in workdir, rename
		 * CAP_FOWNER for removing whiteout from sticky dir
		 * CAP_FSETID for chmod of opaque dir
		 * CAP_CHOWN for chown of opaque dir
		 */
		cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
		cap_raise(override_cred->cap_effective, CAP_FOWNER);
		cap_raise(override_cred->cap_effective, CAP_FSETID);
		cap_raise(override_cred->cap_effective, CAP_CHOWN);
		old_cred = override_creds(override_cred);

		err = ovl_remove_and_whiteout(dentry, is_dir);

		revert_creds(old_cred);
		put_cred(override_cred);
	} else {
		err = ovl_remove_upper(dentry, is_dir);
	}

out_drop_write:
	ovl_drop_write(dentry);
	return err;
}
|
||||
|
||||
static int ovl_unlink(struct inode *dir, struct dentry *dentry)
|
||||
{
|
||||
return ovl_do_remove(dentry, false);
|
||||
}
|
||||
|
||||
static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
|
||||
{
|
||||
return ovl_do_remove(dentry, true);
|
||||
}
|
||||
|
||||
static int ovl_rename2(struct inode *olddir, struct dentry *old,
|
||||
struct inode *newdir, struct dentry *new,
|
||||
unsigned int flags)
|
||||
{
|
||||
int err;
|
||||
enum ovl_path_type old_type;
|
||||
enum ovl_path_type new_type;
|
||||
struct dentry *old_upperdir;
|
||||
struct dentry *new_upperdir;
|
||||
struct dentry *olddentry;
|
||||
struct dentry *newdentry;
|
||||
struct dentry *trap;
|
||||
bool old_opaque;
|
||||
bool new_opaque;
|
||||
bool cleanup_whiteout = false;
|
||||
bool overwrite = !(flags & RENAME_EXCHANGE);
|
||||
bool is_dir = d_is_dir(old);
|
||||
bool new_is_dir = false;
|
||||
struct dentry *opaquedir = NULL;
|
||||
const struct cred *old_cred = NULL;
|
||||
struct cred *override_cred = NULL;
|
||||
|
||||
err = -EINVAL;
|
||||
if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
|
||||
goto out;
|
||||
|
||||
flags &= ~RENAME_NOREPLACE;
|
||||
|
||||
err = ovl_check_sticky(old);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
/* Don't copy up directory trees */
|
||||
old_type = ovl_path_type(old);
|
||||
err = -EXDEV;
|
||||
if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
|
||||
goto out;
|
||||
|
||||
if (new->d_inode) {
|
||||
err = ovl_check_sticky(new);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
if (d_is_dir(new))
|
||||
new_is_dir = true;
|
||||
|
||||
new_type = ovl_path_type(new);
|
||||
err = -EXDEV;
|
||||
if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
|
||||
goto out;
|
||||
|
||||
err = 0;
|
||||
if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
|
||||
if (ovl_dentry_lower(old)->d_inode ==
|
||||
ovl_dentry_lower(new)->d_inode)
|
||||
goto out;
|
||||
}
|
||||
if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
|
||||
if (ovl_dentry_upper(old)->d_inode ==
|
||||
ovl_dentry_upper(new)->d_inode)
|
||||
goto out;
|
||||
}
|
||||
} else {
|
||||
if (ovl_dentry_is_opaque(new))
|
||||
new_type = __OVL_PATH_UPPER;
|
||||
else
|
||||
new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
|
||||
}
|
||||
|
||||
err = ovl_want_write(old);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = ovl_copy_up(old);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
|
||||
err = ovl_copy_up(new->d_parent);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
if (!overwrite) {
|
||||
err = ovl_copy_up(new);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
}
|
||||
|
||||
old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
|
||||
new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
|
||||
|
||||
if (old_opaque || new_opaque) {
|
||||
err = -ENOMEM;
|
||||
override_cred = prepare_creds();
|
||||
if (!override_cred)
|
||||
goto out_drop_write;
|
||||
|
||||
/*
|
||||
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
|
||||
* CAP_DAC_OVERRIDE for create in workdir
|
||||
* CAP_FOWNER for removing whiteout from sticky dir
|
||||
* CAP_FSETID for chmod of opaque dir
|
||||
* CAP_CHOWN for chown of opaque dir
|
||||
*/
|
||||
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
|
||||
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
|
||||
cap_raise(override_cred->cap_effective, CAP_FOWNER);
|
||||
cap_raise(override_cred->cap_effective, CAP_FSETID);
|
||||
cap_raise(override_cred->cap_effective, CAP_CHOWN);
|
||||
old_cred = override_creds(override_cred);
|
||||
}
|
||||
|
||||
if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
|
||||
opaquedir = ovl_check_empty_and_clear(new);
|
||||
err = PTR_ERR(opaquedir);
|
||||
if (IS_ERR(opaquedir)) {
|
||||
opaquedir = NULL;
|
||||
goto out_revert_creds;
|
||||
}
|
||||
}
|
||||
|
||||
if (overwrite) {
|
||||
if (old_opaque) {
|
||||
if (new->d_inode || !new_opaque) {
|
||||
/* Whiteout source */
|
||||
flags |= RENAME_WHITEOUT;
|
||||
} else {
|
||||
/* Switch whiteouts */
|
||||
flags |= RENAME_EXCHANGE;
|
||||
}
|
||||
} else if (is_dir && !new->d_inode && new_opaque) {
|
||||
flags |= RENAME_EXCHANGE;
|
||||
cleanup_whiteout = true;
|
||||
}
|
||||
}
|
||||
|
||||
old_upperdir = ovl_dentry_upper(old->d_parent);
|
||||
new_upperdir = ovl_dentry_upper(new->d_parent);
|
||||
|
||||
trap = lock_rename(new_upperdir, old_upperdir);
|
||||
|
||||
|
||||
olddentry = lookup_one_len(old->d_name.name, old_upperdir,
|
||||
old->d_name.len);
|
||||
err = PTR_ERR(olddentry);
|
||||
if (IS_ERR(olddentry))
|
||||
goto out_unlock;
|
||||
|
||||
err = -ESTALE;
|
||||
if (olddentry != ovl_dentry_upper(old))
|
||||
goto out_dput_old;
|
||||
|
||||
newdentry = lookup_one_len(new->d_name.name, new_upperdir,
|
||||
new->d_name.len);
|
||||
err = PTR_ERR(newdentry);
|
||||
if (IS_ERR(newdentry))
|
||||
goto out_dput_old;
|
||||
|
||||
err = -ESTALE;
|
||||
if (ovl_dentry_upper(new)) {
|
||||
if (opaquedir) {
|
||||
if (newdentry != opaquedir)
|
||||
goto out_dput;
|
||||
} else {
|
||||
if (newdentry != ovl_dentry_upper(new))
|
||||
goto out_dput;
|
||||
}
|
||||
} else {
|
||||
if (!d_is_negative(newdentry) &&
|
||||
(!new_opaque || !ovl_is_whiteout(newdentry)))
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
if (olddentry == trap)
|
||||
goto out_dput;
|
||||
if (newdentry == trap)
|
||||
goto out_dput;
|
||||
|
||||
if (is_dir && !old_opaque && new_opaque) {
|
||||
err = ovl_set_opaque(olddentry);
|
||||
if (err)
|
||||
goto out_dput;
|
||||
}
|
||||
if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
|
||||
err = ovl_set_opaque(newdentry);
|
||||
if (err)
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
if (old_opaque || new_opaque) {
|
||||
err = ovl_do_rename(old_upperdir->d_inode, olddentry,
|
||||
new_upperdir->d_inode, newdentry,
|
||||
flags);
|
||||
} else {
|
||||
/* No debug for the plain case */
|
||||
BUG_ON(flags & ~RENAME_EXCHANGE);
|
||||
err = vfs_rename(old_upperdir->d_inode, olddentry,
|
||||
new_upperdir->d_inode, newdentry,
|
||||
NULL, flags);
|
||||
}
|
||||
|
||||
if (err) {
|
||||
if (is_dir && !old_opaque && new_opaque)
|
||||
ovl_remove_opaque(olddentry);
|
||||
if (!overwrite && new_is_dir && old_opaque && !new_opaque)
|
||||
ovl_remove_opaque(newdentry);
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
if (is_dir && old_opaque && !new_opaque)
|
||||
ovl_remove_opaque(olddentry);
|
||||
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
|
||||
ovl_remove_opaque(newdentry);
|
||||
|
||||
/*
|
||||
* Old dentry now lives in different location. Dentries in
|
||||
* lowerstack are stale. We cannot drop them here because
|
||||
* access to them is lockless. This could be only pure upper
|
||||
* or opaque directory - numlower is zero. Or upper non-dir
|
||||
* entry - its pureness is tracked by flag opaque.
|
||||
*/
|
||||
if (old_opaque != new_opaque) {
|
||||
ovl_dentry_set_opaque(old, new_opaque);
|
||||
if (!overwrite)
|
||||
ovl_dentry_set_opaque(new, old_opaque);
|
||||
}
|
||||
|
||||
if (cleanup_whiteout)
|
||||
ovl_cleanup(old_upperdir->d_inode, newdentry);
|
||||
|
||||
ovl_dentry_version_inc(old->d_parent);
|
||||
ovl_dentry_version_inc(new->d_parent);
|
||||
|
||||
out_dput:
|
||||
dput(newdentry);
|
||||
out_dput_old:
|
||||
dput(olddentry);
|
||||
out_unlock:
|
||||
unlock_rename(new_upperdir, old_upperdir);
|
||||
out_revert_creds:
|
||||
if (old_opaque || new_opaque) {
|
||||
revert_creds(old_cred);
|
||||
put_cred(override_cred);
|
||||
}
|
||||
out_drop_write:
|
||||
ovl_drop_write(old);
|
||||
out:
|
||||
dput(opaquedir);
|
||||
return err;
|
||||
}
|
||||
|
||||
const struct inode_operations ovl_dir_inode_operations = {
|
||||
.lookup = ovl_lookup,
|
||||
.mkdir = ovl_mkdir,
|
||||
.symlink = ovl_symlink,
|
||||
.unlink = ovl_unlink,
|
||||
.rmdir = ovl_rmdir,
|
||||
.rename2 = ovl_rename2,
|
||||
.link = ovl_link,
|
||||
.setattr = ovl_setattr,
|
||||
.create = ovl_create,
|
||||
.mknod = ovl_mknod,
|
||||
.permission = ovl_permission,
|
||||
.getattr = ovl_dir_getattr,
|
||||
.setxattr = ovl_setxattr,
|
||||
.getxattr = ovl_getxattr,
|
||||
.listxattr = ovl_listxattr,
|
||||
.removexattr = ovl_removexattr,
|
||||
};
|
||||
494
executer/kernel/mcoverlayfs/linux-4.6.7/inode.c
Normal file
494
executer/kernel/mcoverlayfs/linux-4.6.7/inode.c
Normal file
@ -0,0 +1,494 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/xattr.h>
|
||||
#include "overlayfs.h"
|
||||
|
||||
static int ovl_copy_up_truncate(struct dentry *dentry)
|
||||
{
|
||||
int err;
|
||||
struct dentry *parent;
|
||||
struct kstat stat;
|
||||
struct path lowerpath;
|
||||
|
||||
parent = dget_parent(dentry);
|
||||
err = ovl_copy_up(parent);
|
||||
if (err)
|
||||
goto out_dput_parent;
|
||||
|
||||
ovl_path_lower(dentry, &lowerpath);
|
||||
err = vfs_getattr(&lowerpath, &stat);
|
||||
if (err)
|
||||
goto out_dput_parent;
|
||||
|
||||
stat.size = 0;
|
||||
err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat);
|
||||
|
||||
out_dput_parent:
|
||||
dput(parent);
|
||||
return err;
|
||||
}
|
||||
|
||||
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
|
||||
{
|
||||
int err;
|
||||
struct dentry *upperdentry;
|
||||
unsigned opt = ovl_get_config_opt(dentry);
|
||||
|
||||
if (OVL_OPT_NOCOPYUPW(opt)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check for permissions before trying to copy-up. This is redundant
|
||||
* since it will be rechecked later by ->setattr() on upper dentry. But
|
||||
* without this, copy-up can be triggered by just about anybody.
|
||||
*
|
||||
* We don't initialize inode->size, which just means that
|
||||
* inode_newsize_ok() will always check against MAX_LFS_FILESIZE and not
|
||||
* check for a swapfile (which this won't be anyway).
|
||||
*/
|
||||
err = inode_change_ok(dentry->d_inode, attr);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
err = ovl_want_write(dentry);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
if (attr->ia_valid & ATTR_SIZE) {
|
||||
struct inode *realinode = d_inode(ovl_dentry_real(dentry));
|
||||
|
||||
err = -ETXTBSY;
|
||||
if (atomic_read(&realinode->i_writecount) < 0)
|
||||
goto out_drop_write;
|
||||
}
|
||||
|
||||
err = ovl_copy_up(dentry);
|
||||
if (!err) {
|
||||
struct inode *winode = NULL;
|
||||
|
||||
upperdentry = ovl_dentry_upper(dentry);
|
||||
|
||||
if (attr->ia_valid & ATTR_SIZE) {
|
||||
winode = d_inode(upperdentry);
|
||||
err = get_write_access(winode);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
}
|
||||
|
||||
if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
|
||||
attr->ia_valid &= ~ATTR_MODE;
|
||||
|
||||
inode_lock(upperdentry->d_inode);
|
||||
err = notify_change(upperdentry, attr, NULL);
|
||||
if (!err)
|
||||
ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
|
||||
inode_unlock(upperdentry->d_inode);
|
||||
|
||||
if (winode)
|
||||
put_write_access(winode);
|
||||
}
|
||||
out_drop_write:
|
||||
ovl_drop_write(dentry);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
|
||||
struct kstat *stat)
|
||||
{
|
||||
struct path realpath;
|
||||
|
||||
ovl_path_real(dentry, &realpath);
|
||||
return vfs_getattr(&realpath, stat);
|
||||
}
|
||||
|
||||
int ovl_permission(struct inode *inode, int mask)
|
||||
{
|
||||
struct ovl_entry *oe;
|
||||
struct dentry *alias = NULL;
|
||||
struct inode *realinode;
|
||||
struct dentry *realdentry;
|
||||
bool is_upper;
|
||||
int err;
|
||||
|
||||
if (S_ISDIR(inode->i_mode)) {
|
||||
oe = inode->i_private;
|
||||
} else if (mask & MAY_NOT_BLOCK) {
|
||||
return -ECHILD;
|
||||
} else {
|
||||
/*
|
||||
* For non-directories find an alias and get the info
|
||||
* from there.
|
||||
*/
|
||||
alias = d_find_any_alias(inode);
|
||||
if (WARN_ON(!alias))
|
||||
return -ENOENT;
|
||||
|
||||
oe = alias->d_fsdata;
|
||||
|
||||
ovl_reset_ovl_entry(&oe, alias);
|
||||
}
|
||||
|
||||
realdentry = ovl_entry_real(oe, &is_upper);
|
||||
|
||||
if (ovl_is_default_permissions(inode)) {
|
||||
struct kstat stat;
|
||||
struct path realpath = { .dentry = realdentry };
|
||||
|
||||
if (mask & MAY_NOT_BLOCK)
|
||||
return -ECHILD;
|
||||
|
||||
realpath.mnt = ovl_entry_mnt_real(oe, inode, is_upper);
|
||||
|
||||
err = vfs_getattr(&realpath, &stat);
|
||||
if (err)
|
||||
goto out_dput;
|
||||
|
||||
err = -ESTALE;
|
||||
if ((stat.mode ^ inode->i_mode) & S_IFMT)
|
||||
goto out_dput;
|
||||
|
||||
inode->i_mode = stat.mode;
|
||||
inode->i_uid = stat.uid;
|
||||
inode->i_gid = stat.gid;
|
||||
|
||||
err = generic_permission(inode, mask);
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
/* Careful in RCU walk mode */
|
||||
realinode = ACCESS_ONCE(realdentry->d_inode);
|
||||
if (!realinode) {
|
||||
WARN_ON(!(mask & MAY_NOT_BLOCK));
|
||||
err = -ENOENT;
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
if (mask & MAY_WRITE) {
|
||||
umode_t mode = realinode->i_mode;
|
||||
|
||||
/*
|
||||
* Writes will always be redirected to upper layer, so
|
||||
* ignore lower layer being read-only.
|
||||
*
|
||||
* If the overlay itself is read-only then proceed
|
||||
* with the permission check, don't return EROFS.
|
||||
* This will only happen if this is the lower layer of
|
||||
* another overlayfs.
|
||||
*
|
||||
* If upper fs becomes read-only after the overlay was
|
||||
* constructed return EROFS to prevent modification of
|
||||
* upper layer.
|
||||
*/
|
||||
err = -EROFS;
|
||||
if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
|
||||
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
err = __inode_permission(realinode, mask);
|
||||
out_dput:
|
||||
dput(alias);
|
||||
return err;
|
||||
}
|
||||
|
||||
static const char *ovl_get_link(struct dentry *dentry,
|
||||
struct inode *inode,
|
||||
struct delayed_call *done)
|
||||
{
|
||||
struct dentry *realdentry;
|
||||
struct inode *realinode;
|
||||
|
||||
if (!dentry)
|
||||
return ERR_PTR(-ECHILD);
|
||||
|
||||
realdentry = ovl_dentry_real(dentry);
|
||||
realinode = realdentry->d_inode;
|
||||
|
||||
if (WARN_ON(!realinode->i_op->get_link))
|
||||
return ERR_PTR(-EPERM);
|
||||
|
||||
return realinode->i_op->get_link(realdentry, realinode, done);
|
||||
}
|
||||
|
||||
/*
 * ->readlink for overlay symlinks: update atime on the real path and
 * forward to the real inode's ->readlink.
 *
 * Returns whatever the real ->readlink returns, or -EINVAL when the
 * real inode has no ->readlink.
 */
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
	struct path real;
	struct inode *target;

	ovl_path_real(dentry, &real);
	target = real.dentry->d_inode;

	if (target->i_op->readlink) {
		touch_atime(&real);
		return target->i_op->readlink(real.dentry, buf, bufsiz);
	}

	return -EINVAL;
}
|
||||
|
||||
|
||||
static bool ovl_is_private_xattr(const char *name)
|
||||
{
|
||||
return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
|
||||
}
|
||||
|
||||
/*
 * ->setxattr for overlay inodes.
 *
 * With "nocopyupw" set the call succeeds without doing anything.
 * Overlay-private names are refused with -EPERM.  Otherwise the dentry
 * is copied up and the attribute is set on the upper dentry.
 *
 * Returns 0 on success or a negative errno.
 */
int ovl_setxattr(struct dentry *dentry, const char *name,
		 const void *value, size_t size, int flags)
{
	unsigned opt = ovl_get_config_opt(dentry);
	int err;

	if (OVL_OPT_NOCOPYUPW(opt))
		return 0;

	err = ovl_want_write(dentry);
	if (err)
		return err;

	if (ovl_is_private_xattr(name)) {
		err = -EPERM;
		goto out_drop_write;
	}

	err = ovl_copy_up(dentry);
	if (!err)
		err = vfs_setxattr(ovl_dentry_upper(dentry), name, value,
				   size, flags);

out_drop_write:
	ovl_drop_write(dentry);
	return err;
}
|
||||
|
||||
static bool ovl_need_xattr_filter(struct dentry *dentry,
|
||||
enum ovl_path_type type)
|
||||
{
|
||||
if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER)
|
||||
return S_ISDIR(dentry->d_inode->i_mode);
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
 * ->getxattr for overlay inodes: read @name from the real dentry.
 *
 * Overlay-private attributes are reported as absent (-ENODATA) when
 * ovl_need_xattr_filter() says they must be hidden.
 */
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
		     void *value, size_t size)
{
	struct path real;
	enum ovl_path_type type = ovl_path_real(dentry, &real);
	bool hidden = ovl_need_xattr_filter(dentry, type) &&
		      ovl_is_private_xattr(name);

	if (hidden)
		return -ENODATA;

	return vfs_getxattr(real.dentry, name, value, size);
}
|
||||
|
||||
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
|
||||
{
|
||||
struct path realpath;
|
||||
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
|
||||
ssize_t res;
|
||||
int off;
|
||||
|
||||
res = vfs_listxattr(realpath.dentry, list, size);
|
||||
if (res <= 0 || size == 0)
|
||||
return res;
|
||||
|
||||
if (!ovl_need_xattr_filter(dentry, type))
|
||||
return res;
|
||||
|
||||
/* filter out private xattrs */
|
||||
for (off = 0; off < res;) {
|
||||
char *s = list + off;
|
||||
size_t slen = strlen(s) + 1;
|
||||
|
||||
BUG_ON(off + slen > res);
|
||||
|
||||
if (ovl_is_private_xattr(s)) {
|
||||
res -= slen;
|
||||
memmove(s, s + slen, res - off);
|
||||
} else {
|
||||
off += slen;
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
int ovl_removexattr(struct dentry *dentry, const char *name)
|
||||
{
|
||||
int err;
|
||||
struct path realpath;
|
||||
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
|
||||
unsigned opt = ovl_get_config_opt(dentry);
|
||||
|
||||
if (OVL_OPT_NOCOPYUPW(opt)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
err = ovl_want_write(dentry);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = -ENODATA;
|
||||
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
|
||||
goto out_drop_write;
|
||||
|
||||
if (!OVL_TYPE_UPPER(type)) {
|
||||
err = vfs_getxattr(realpath.dentry, name, NULL, 0);
|
||||
if (err < 0)
|
||||
goto out_drop_write;
|
||||
|
||||
err = ovl_copy_up(dentry);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
|
||||
ovl_path_upper(dentry, &realpath);
|
||||
}
|
||||
|
||||
err = vfs_removexattr(realpath.dentry, name);
|
||||
out_drop_write:
|
||||
ovl_drop_write(dentry);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
|
||||
struct dentry *realdentry)
|
||||
{
|
||||
if (OVL_TYPE_UPPER(type))
|
||||
return false;
|
||||
|
||||
if (special_file(realdentry->d_inode->i_mode))
|
||||
return false;
|
||||
|
||||
if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags)
|
||||
{
|
||||
int err;
|
||||
struct path realpath;
|
||||
enum ovl_path_type type;
|
||||
unsigned opt = ovl_get_config_opt(dentry);
|
||||
|
||||
if (d_is_dir(dentry))
|
||||
return d_backing_inode(dentry);
|
||||
|
||||
type = ovl_path_real(dentry, &realpath);
|
||||
if (!OVL_OPT_NOCOPYUPW(opt) &&
|
||||
ovl_open_need_copy_up(file_flags, type, realpath.dentry)) {
|
||||
OVL_DEBUG("copyup: realpath.dentry=%pd4, i_ino=%lu\n",
|
||||
realpath.dentry, realpath.dentry->d_inode->i_ino);
|
||||
err = ovl_want_write(dentry);
|
||||
if (err)
|
||||
return ERR_PTR(err);
|
||||
|
||||
if (file_flags & O_TRUNC)
|
||||
err = ovl_copy_up_truncate(dentry);
|
||||
else
|
||||
err = ovl_copy_up(dentry);
|
||||
ovl_drop_write(dentry);
|
||||
if (err)
|
||||
return ERR_PTR(err);
|
||||
|
||||
ovl_path_upper(dentry, &realpath);
|
||||
}
|
||||
|
||||
if (realpath.dentry->d_flags & DCACHE_OP_SELECT_INODE)
|
||||
return realpath.dentry->d_op->d_select_inode(realpath.dentry, file_flags);
|
||||
|
||||
if (OVL_OPT_NOFSCHECK(opt)) {
|
||||
if (realpath.dentry->d_inode->i_sb->s_magic == SYSFS_MAGIC) {
|
||||
OVL_DEBUG("sysfs: dentry=%pd4, i_ino=%lu\n",
|
||||
dentry, dentry->d_inode->i_ino);
|
||||
OVL_DEBUG("sysfs: realpath.dentry=%pd4, i_ino=%lu\n",
|
||||
realpath.dentry, realpath.dentry->d_inode->i_ino);
|
||||
if (!ovl_find_d_fsdata(dentry)) {
|
||||
ovl_add_d_fsdata(dentry);
|
||||
dentry->d_fsdata = realpath.dentry->d_fsdata;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return d_backing_inode(realpath.dentry);
|
||||
}
|
||||
|
||||
static const struct inode_operations ovl_file_inode_operations = {
|
||||
.setattr = ovl_setattr,
|
||||
.permission = ovl_permission,
|
||||
.getattr = ovl_getattr,
|
||||
.setxattr = ovl_setxattr,
|
||||
.getxattr = ovl_getxattr,
|
||||
.listxattr = ovl_listxattr,
|
||||
.removexattr = ovl_removexattr,
|
||||
};
|
||||
|
||||
static const struct inode_operations ovl_symlink_inode_operations = {
|
||||
.setattr = ovl_setattr,
|
||||
.get_link = ovl_get_link,
|
||||
.readlink = ovl_readlink,
|
||||
.getattr = ovl_getattr,
|
||||
.setxattr = ovl_setxattr,
|
||||
.getxattr = ovl_getxattr,
|
||||
.listxattr = ovl_listxattr,
|
||||
.removexattr = ovl_removexattr,
|
||||
};
|
||||
|
||||
/*
 * Allocate and initialise an overlay inode on @sb.
 *
 * The inode gets a fresh inode number, the full @mode, and the
 * S_NOATIME | S_NOCMTIME flags.  The operation tables are chosen by
 * file type; directories additionally remember @oe in i_private.
 *
 * Returns the new inode, or NULL if allocation failed or the file type
 * in @mode is not one of the known ones (a warning is logged and the
 * inode is released in that case).
 */
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
			    struct ovl_entry *oe)
{
	struct inode *inode = new_inode(sb);

	if (!inode)
		return NULL;

	inode->i_ino = get_next_ino();
	inode->i_mode = mode;
	inode->i_flags |= S_NOATIME | S_NOCMTIME;

	/* from here on only the file-type bits matter */
	mode &= S_IFMT;

	if (mode == S_IFDIR) {
		inode->i_private = oe;
		inode->i_op = &ovl_dir_inode_operations;
		inode->i_fop = &ovl_dir_operations;
	} else if (mode == S_IFLNK) {
		inode->i_op = &ovl_symlink_inode_operations;
	} else if (mode == S_IFREG || mode == S_IFSOCK || mode == S_IFBLK ||
		   mode == S_IFCHR || mode == S_IFIFO) {
		inode->i_op = &ovl_file_inode_operations;
	} else {
		WARN(1, "illegal file type: %i\n", mode);
		iput(inode);
		inode = NULL;
	}

	return inode;
}
|
||||
230
executer/kernel/mcoverlayfs/linux-4.6.7/overlayfs.h
Normal file
230
executer/kernel/mcoverlayfs/linux-4.6.7/overlayfs.h
Normal file
@ -0,0 +1,230 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/kernel.h>
|
||||
|
||||
//#define DEBUG
|
||||
/*
 * OVL_DEBUG() - debug trace, active only when DEBUG is defined.
 *
 * With DEBUG defined the message is emitted through pr_err(), prefixed
 * with "[DEBUG] <function>(): ".  Without it the macro expands to an
 * empty do-while(0) statement, so that "OVL_DEBUG(...);" is a single
 * statement in both configurations and stays valid as the body of an
 * un-braced if/else.
 */
#ifdef DEBUG
#define OVL_DEBUG(format, ...) \
	pr_err("[DEBUG] %s(): " format, __func__, ##__VA_ARGS__)
#else
#define OVL_DEBUG(format, ...) do { } while (0)
#endif
|
||||
|
||||
struct ovl_entry;
|
||||
|
||||
/*
 * Bit flags describing a resolved overlay path, as returned by
 * ovl_path_type() and ovl_path_real().  Values are combined as a mask.
 */
enum ovl_path_type {
|
||||
__OVL_PATH_PURE = (1 << 0),
|
||||
__OVL_PATH_UPPER = (1 << 1),
|
||||
__OVL_PATH_MERGE = (1 << 2),
|
||||
};
|
||||
|
||||
/*
 * Predicates over an ovl_path_type mask.  Each yields the masked bit
 * (non-zero when set), not a normalised 0/1, except
 * OVL_TYPE_MERGE_OR_LOWER which is a logical expression.
 */
#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
|
||||
#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
|
||||
#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
|
||||
#define OVL_TYPE_MERGE_OR_LOWER(type) \
|
||||
(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
|
||||
|
||||
/*
 * Prefix shared by all overlay-private xattrs.  OVL_XATTR_PRE_LEN must
 * stay equal to the length of OVL_XATTR_PRE_NAME without its NUL;
 * ovl_is_private_xattr() compares exactly that many bytes.
 */
#define OVL_XATTR_PRE_NAME "trusted.overlay."
|
||||
#define OVL_XATTR_PRE_LEN 16
|
||||
#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque"
|
||||
|
||||
/*
 * Bits of the per-mount option mask returned by ovl_get_config_opt().
 * In inode.c, NOCOPYUPW makes setattr/setxattr/removexattr succeed
 * without doing anything and suppresses copy-up on open; NOFSCHECK
 * enables the sysfs special case in ovl_d_select_inode().
 */
enum ovl_opt_bit {
|
||||
__OVL_OPT_DEFAULT = 0,
|
||||
__OVL_OPT_NOCOPYUPW = (1 << 0),
|
||||
__OVL_OPT_NOFSCHECK = (1 << 1),
|
||||
};
|
||||
|
||||
/* Test a single option bit; yields the masked bit, non-zero when set. */
#define OVL_OPT_NOCOPYUPW(opt) ((opt) & __OVL_OPT_NOCOPYUPW)
|
||||
#define OVL_OPT_NOFSCHECK(opt) ((opt) & __OVL_OPT_NOFSCHECK)
|
||||
|
||||
/*
 * List node associating a dentry with an ovl_entry.
 * NOTE(review): the users (ovl_find_d_fsdata(), ovl_add_d_fsdata(),
 * ovl_reset_ovl_entry()) are defined elsewhere; presumably this records
 * the original ovl_entry of a dentry whose d_fsdata gets overridden in
 * ovl_d_select_inode() - confirm against their definitions.
 */
struct ovl_d_fsdata {
|
||||
struct list_head list;
|
||||
struct dentry *d;
|
||||
struct ovl_entry *oe;
|
||||
};
|
||||
|
||||
/* vfs_rmdir() wrapper that logs the call and its result via pr_debug(). */
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
	int ret;

	ret = vfs_rmdir(dir, dentry);
	pr_debug("rmdir(%pd2) = %i\n", dentry, ret);

	return ret;
}
|
||||
|
||||
static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
|
||||
{
|
||||
int err = vfs_unlink(dir, dentry, NULL);
|
||||
pr_debug("unlink(%pd2) = %i\n", dentry, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
|
||||
struct dentry *new_dentry, bool debug)
|
||||
{
|
||||
int err = vfs_link(old_dentry, dir, new_dentry, NULL);
|
||||
if (debug) {
|
||||
pr_debug("link(%pd2, %pd2) = %i\n",
|
||||
old_dentry, new_dentry, err);
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
|
||||
umode_t mode, bool debug)
|
||||
{
|
||||
int err = vfs_create(dir, dentry, mode, true);
|
||||
if (debug)
|
||||
pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
|
||||
umode_t mode, bool debug)
|
||||
{
|
||||
int err = vfs_mkdir(dir, dentry, mode);
|
||||
if (debug)
|
||||
pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
|
||||
umode_t mode, dev_t dev, bool debug)
|
||||
{
|
||||
int err = vfs_mknod(dir, dentry, mode, dev);
|
||||
if (debug) {
|
||||
pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
|
||||
dentry, mode, dev, err);
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
 * vfs_symlink() wrapper.  The result is logged via pr_debug() only when
 * @debug is true.
 */
static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
				 const char *oldname, bool debug)
{
	int ret;

	ret = vfs_symlink(dir, dentry, oldname);
	if (debug) {
		pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, ret);
	}

	return ret;
}
|
||||
|
||||
/*
 * vfs_setxattr() wrapper that logs the call and its result via
 * pr_debug().
 *
 * @value is an arbitrary buffer of @size bytes and is not guaranteed to
 * be NUL-terminated, so it is printed with a precision ("%.*s") that
 * bounds the read to @size bytes.  A plain field width ("%*s") would
 * only pad the output and let the formatter run past the buffer.
 *
 * Returns the vfs_setxattr() result unchanged.
 */
static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
				  const void *value, size_t size, int flags)
{
	int err = vfs_setxattr(dentry, name, value, size, flags);

	pr_debug("setxattr(%pd2, \"%s\", \"%.*s\", 0x%x) = %i\n",
		 dentry, name, (int) size, (char *) value, flags, err);
	return err;
}
|
||||
|
||||
/* vfs_removexattr() wrapper that logs the call and its result. */
static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
{
	int ret;

	ret = vfs_removexattr(dentry, name);
	pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, ret);

	return ret;
}
|
||||
|
||||
static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
|
||||
struct inode *newdir, struct dentry *newdentry,
|
||||
unsigned int flags)
|
||||
{
|
||||
int err;
|
||||
|
||||
pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
|
||||
olddentry, newdentry, flags);
|
||||
|
||||
err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
|
||||
|
||||
if (err) {
|
||||
pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
|
||||
olddentry, newdentry, err);
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
/* vfs_whiteout() wrapper that logs the call and its result. */
static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
{
	int ret;

	ret = vfs_whiteout(dir, dentry);
	pr_debug("whiteout(%pd2) = %i\n", dentry, ret);

	return ret;
}
|
||||
|
||||
unsigned ovl_get_config_opt(struct dentry *dentry);
|
||||
void ovl_reset_ovl_entry(struct ovl_entry **oe, struct dentry *dentry);
|
||||
struct ovl_entry *ovl_find_d_fsdata(struct dentry *dentry);
|
||||
int ovl_add_d_fsdata(struct dentry *dentry);
|
||||
enum ovl_path_type ovl_path_type(struct dentry *dentry);
|
||||
u64 ovl_dentry_version_get(struct dentry *dentry);
|
||||
void ovl_dentry_version_inc(struct dentry *dentry);
|
||||
void ovl_path_upper(struct dentry *dentry, struct path *path);
|
||||
void ovl_path_lower(struct dentry *dentry, struct path *path);
|
||||
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
|
||||
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
|
||||
struct dentry *ovl_dentry_upper(struct dentry *dentry);
|
||||
struct dentry *ovl_dentry_lower(struct dentry *dentry);
|
||||
struct dentry *ovl_dentry_real(struct dentry *dentry);
|
||||
struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
|
||||
struct vfsmount *ovl_entry_mnt_real(struct ovl_entry *oe, struct inode *inode,
|
||||
bool is_upper);
|
||||
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
|
||||
bool ovl_is_default_permissions(struct inode *inode);
|
||||
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
|
||||
struct dentry *ovl_workdir(struct dentry *dentry);
|
||||
int ovl_want_write(struct dentry *dentry);
|
||||
void ovl_drop_write(struct dentry *dentry);
|
||||
bool ovl_dentry_is_opaque(struct dentry *dentry);
|
||||
void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
|
||||
bool ovl_is_whiteout(struct dentry *dentry);
|
||||
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
|
||||
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
|
||||
unsigned int flags);
|
||||
struct file *ovl_path_open(struct path *path, int flags);
|
||||
|
||||
struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
|
||||
struct kstat *stat, const char *link);
|
||||
|
||||
/* readdir.c */
|
||||
extern const struct file_operations ovl_dir_operations;
|
||||
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
|
||||
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
|
||||
void ovl_cache_free(struct list_head *list);
|
||||
int ovl_check_d_type_supported(struct path *realpath);
|
||||
|
||||
/* inode.c */
|
||||
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
|
||||
int ovl_permission(struct inode *inode, int mask);
|
||||
int ovl_setxattr(struct dentry *dentry, const char *name,
|
||||
const void *value, size_t size, int flags);
|
||||
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
|
||||
void *value, size_t size);
|
||||
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
|
||||
int ovl_removexattr(struct dentry *dentry, const char *name);
|
||||
struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags);
|
||||
|
||||
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
|
||||
struct ovl_entry *oe);
|
||||
static inline void ovl_copyattr(struct inode *from, struct inode *to)
|
||||
{
|
||||
to->i_uid = from->i_uid;
|
||||
to->i_gid = from->i_gid;
|
||||
to->i_mode = from->i_mode;
|
||||
}
|
||||
|
||||
/* dir.c */
|
||||
extern const struct inode_operations ovl_dir_inode_operations;
|
||||
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
|
||||
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
|
||||
struct kstat *stat, const char *link,
|
||||
struct dentry *hardlink, bool debug);
|
||||
void ovl_cleanup(struct inode *dir, struct dentry *dentry);
|
||||
|
||||
/* copy_up.c */
|
||||
int ovl_copy_up(struct dentry *dentry);
|
||||
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
|
||||
struct path *lowerpath, struct kstat *stat);
|
||||
int ovl_copy_xattr(struct dentry *old, struct dentry *new, unsigned opt);
|
||||
int ovl_set_attr(struct dentry *upper, struct kstat *stat);
|
||||
616
executer/kernel/mcoverlayfs/linux-4.6.7/readdir.c
Normal file
616
executer/kernel/mcoverlayfs/linux-4.6.7/readdir.c
Normal file
@ -0,0 +1,616 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/xattr.h>
|
||||
#include <linux/rbtree.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/cred.h>
|
||||
#include "overlayfs.h"
|
||||
|
||||
/*
 * One directory entry held in a merged-directory cache.  Allocated by
 * ovl_cache_entry_new() with room for the name and its NUL.
 */
struct ovl_cache_entry {
|
||||
/* length of name[], not counting the terminating NUL */
unsigned int len;
|
||||
/* d_type as reported by the underlying readdir */
unsigned int type;
|
||||
u64 ino;
|
||||
/* link in the ordered entry list (cache->entries / rdd->middle) */
struct list_head l_node;
|
||||
/* link in the name-keyed rb-tree rooted at ovl_readdir_data.root */
struct rb_node node;
|
||||
/* chain of DT_CHR entries to be checked by ovl_check_whiteouts(); set only for DT_CHR */
struct ovl_cache_entry *next_maybe_whiteout;
|
||||
/* true once ovl_check_whiteouts() found the entry to be a whiteout */
bool is_whiteout;
|
||||
/* NUL-terminated entry name */
char name[];
|
||||
};
|
||||
|
||||
/*
 * Cached merged listing of one overlay directory, shared between open
 * files through a plain reference count (see ovl_cache_get() and
 * ovl_cache_put()).
 */
struct ovl_dir_cache {
|
||||
/* number of holders; the cache is freed when it drops to zero */
long refcount;
|
||||
/* ovl_dentry_version_get() value the listing was built for */
u64 version;
|
||||
/* list of struct ovl_cache_entry, linked through l_node */
struct list_head entries;
|
||||
};
|
||||
|
||||
/*
 * State carried through iterate_dir() while building a merged listing.
 * The actor recovers this structure from @ctx with container_of().
 */
struct ovl_readdir_data {
|
||||
struct dir_context ctx;
|
||||
/* true while reading the last layer: entries go to @middle instead of the rb-tree */
bool is_lowest;
|
||||
/* entries seen so far, keyed by name, used to detect duplicates */
struct rb_root root;
|
||||
/* destination list for new entries */
struct list_head *list;
|
||||
/* temporary list head collecting the lowest layer's entries */
struct list_head middle;
|
||||
/* head of the DT_CHR chain built by ovl_cache_entry_new() */
struct ovl_cache_entry *first_maybe_whiteout;
|
||||
/* entries delivered by the current iterate_dir() call */
int count;
|
||||
/* first error raised inside the actor */
int err;
|
||||
/* NOTE(review): not referenced in the visible part of readdir.c */
bool d_type_supported;
|
||||
};
|
||||
|
||||
/*
 * Per-open-file state of an overlay directory (file->private_data).
 */
struct ovl_dir_file {
|
||||
/* true: readdir/llseek are passed straight through to @realfile */
bool is_real;
|
||||
/* NOTE(review): not referenced in the visible part of readdir.c */
bool is_upper;
|
||||
/* merged listing in use, or NULL until the first ovl_iterate() */
struct ovl_dir_cache *cache;
|
||||
/* current position inside cache->entries */
struct list_head *cursor;
|
||||
struct file *realfile;
|
||||
/* NOTE(review): not referenced in the visible part of readdir.c */
struct file *upperfile;
|
||||
};
|
||||
|
||||
static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
|
||||
{
|
||||
return container_of(n, struct ovl_cache_entry, node);
|
||||
}
|
||||
|
||||
static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
|
||||
const char *name, int len)
|
||||
{
|
||||
struct rb_node *node = root->rb_node;
|
||||
int cmp;
|
||||
|
||||
while (node) {
|
||||
struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
|
||||
|
||||
cmp = strncmp(name, p->name, len);
|
||||
if (cmp > 0)
|
||||
node = p->node.rb_right;
|
||||
else if (cmp < 0 || len < p->len)
|
||||
node = p->node.rb_left;
|
||||
else
|
||||
return p;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
|
||||
const char *name, int len,
|
||||
u64 ino, unsigned int d_type)
|
||||
{
|
||||
struct ovl_cache_entry *p;
|
||||
size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
|
||||
|
||||
p = kmalloc(size, GFP_KERNEL);
|
||||
if (!p)
|
||||
return NULL;
|
||||
|
||||
memcpy(p->name, name, len);
|
||||
p->name[len] = '\0';
|
||||
p->len = len;
|
||||
p->type = d_type;
|
||||
p->ino = ino;
|
||||
p->is_whiteout = false;
|
||||
|
||||
if (d_type == DT_CHR) {
|
||||
p->next_maybe_whiteout = rdd->first_maybe_whiteout;
|
||||
rdd->first_maybe_whiteout = p;
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
/*
 * Insert @name into rdd's rb-tree and append it to rdd->list, unless an
 * entry with that name is already present (in which case nothing is
 * done).
 *
 * Returns 0 on success or when the name already exists, -ENOMEM if the
 * new entry could not be allocated.
 */
static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
				  const char *name, int len, u64 ino,
				  unsigned int d_type)
{
	struct rb_node **link = &rdd->root.rb_node;
	struct rb_node *parent = NULL;
	struct ovl_cache_entry *ent;

	/* descend to the insertion point, bailing out on a duplicate */
	while (*link) {
		struct ovl_cache_entry *cur = ovl_cache_entry_from_node(*link);
		int cmp = strncmp(name, cur->name, len);

		parent = *link;
		if (cmp == 0 && len >= cur->len)
			return 0;

		link = (cmp > 0) ? &cur->node.rb_right : &cur->node.rb_left;
	}

	ent = ovl_cache_entry_new(rdd, name, len, ino, d_type);
	if (!ent)
		return -ENOMEM;

	list_add_tail(&ent->l_node, rdd->list);
	rb_link_node(&ent->node, parent, link);
	rb_insert_color(&ent->node, &rdd->root);

	return 0;
}
|
||||
|
||||
static int ovl_fill_lowest(struct ovl_readdir_data *rdd,
|
||||
const char *name, int namelen,
|
||||
loff_t offset, u64 ino, unsigned int d_type)
|
||||
{
|
||||
struct ovl_cache_entry *p;
|
||||
|
||||
p = ovl_cache_entry_find(&rdd->root, name, namelen);
|
||||
if (p) {
|
||||
list_move_tail(&p->l_node, &rdd->middle);
|
||||
} else {
|
||||
p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
|
||||
if (p == NULL)
|
||||
rdd->err = -ENOMEM;
|
||||
else
|
||||
list_add_tail(&p->l_node, &rdd->middle);
|
||||
}
|
||||
|
||||
return rdd->err;
|
||||
}
|
||||
|
||||
void ovl_cache_free(struct list_head *list)
|
||||
{
|
||||
struct ovl_cache_entry *p;
|
||||
struct ovl_cache_entry *n;
|
||||
|
||||
list_for_each_entry_safe(p, n, list, l_node)
|
||||
kfree(p);
|
||||
|
||||
INIT_LIST_HEAD(list);
|
||||
}
|
||||
|
||||
static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
|
||||
{
|
||||
struct ovl_dir_cache *cache = od->cache;
|
||||
|
||||
WARN_ON(cache->refcount <= 0);
|
||||
cache->refcount--;
|
||||
if (!cache->refcount) {
|
||||
if (ovl_dir_cache(dentry) == cache)
|
||||
ovl_set_dir_cache(dentry, NULL);
|
||||
|
||||
ovl_cache_free(&cache->entries);
|
||||
kfree(cache);
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * dir_context actor used while merging layers.  Counts the entry, then
 * hands it to ovl_fill_lowest() for the lowest layer or to
 * ovl_cache_entry_add_rb() for every other layer.
 */
static int ovl_fill_merge(struct dir_context *ctx, const char *name,
			  int namelen, loff_t offset, u64 ino,
			  unsigned int d_type)
{
	struct ovl_readdir_data *rdd =
		container_of(ctx, struct ovl_readdir_data, ctx);

	rdd->count++;

	if (rdd->is_lowest)
		return ovl_fill_lowest(rdd, name, namelen, offset, ino,
				       d_type);

	return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
}
|
||||
|
||||
static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
|
||||
{
|
||||
int err;
|
||||
struct ovl_cache_entry *p;
|
||||
struct dentry *dentry;
|
||||
const struct cred *old_cred;
|
||||
struct cred *override_cred;
|
||||
|
||||
override_cred = prepare_creds();
|
||||
if (!override_cred)
|
||||
return -ENOMEM;
|
||||
|
||||
/*
|
||||
* CAP_DAC_OVERRIDE for lookup
|
||||
*/
|
||||
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
|
||||
old_cred = override_creds(override_cred);
|
||||
|
||||
err = mutex_lock_killable(&dir->d_inode->i_mutex);
|
||||
if (!err) {
|
||||
while (rdd->first_maybe_whiteout) {
|
||||
p = rdd->first_maybe_whiteout;
|
||||
rdd->first_maybe_whiteout = p->next_maybe_whiteout;
|
||||
dentry = lookup_one_len(p->name, dir, p->len);
|
||||
if (!IS_ERR(dentry)) {
|
||||
p->is_whiteout = ovl_is_whiteout(dentry);
|
||||
dput(dentry);
|
||||
}
|
||||
}
|
||||
inode_unlock(dir->d_inode);
|
||||
}
|
||||
revert_creds(old_cred);
|
||||
put_cred(override_cred);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_dir_read(struct path *realpath,
|
||||
struct ovl_readdir_data *rdd)
|
||||
{
|
||||
struct file *realfile;
|
||||
int err;
|
||||
|
||||
realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
|
||||
if (IS_ERR(realfile))
|
||||
return PTR_ERR(realfile);
|
||||
|
||||
rdd->first_maybe_whiteout = NULL;
|
||||
rdd->ctx.pos = 0;
|
||||
do {
|
||||
rdd->count = 0;
|
||||
rdd->err = 0;
|
||||
err = iterate_dir(realfile, &rdd->ctx);
|
||||
if (err >= 0)
|
||||
err = rdd->err;
|
||||
} while (!err && rdd->count);
|
||||
|
||||
if (!err && rdd->first_maybe_whiteout)
|
||||
err = ovl_check_whiteouts(realpath->dentry, rdd);
|
||||
|
||||
fput(realfile);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static void ovl_dir_reset(struct file *file)
|
||||
{
|
||||
struct ovl_dir_file *od = file->private_data;
|
||||
struct ovl_dir_cache *cache = od->cache;
|
||||
struct dentry *dentry = file->f_path.dentry;
|
||||
enum ovl_path_type type = ovl_path_type(dentry);
|
||||
|
||||
if (cache && ovl_dentry_version_get(dentry) != cache->version) {
|
||||
ovl_cache_put(od, dentry);
|
||||
od->cache = NULL;
|
||||
od->cursor = NULL;
|
||||
}
|
||||
WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
|
||||
if (od->is_real && OVL_TYPE_MERGE(type))
|
||||
od->is_real = false;
|
||||
}
|
||||
|
||||
static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
|
||||
{
|
||||
int err;
|
||||
struct path realpath;
|
||||
struct ovl_readdir_data rdd = {
|
||||
.ctx.actor = ovl_fill_merge,
|
||||
.list = list,
|
||||
.root = RB_ROOT,
|
||||
.is_lowest = false,
|
||||
};
|
||||
int idx, next;
|
||||
|
||||
for (idx = 0; idx != -1; idx = next) {
|
||||
next = ovl_path_next(idx, dentry, &realpath);
|
||||
|
||||
if (next != -1) {
|
||||
err = ovl_dir_read(&realpath, &rdd);
|
||||
if (err)
|
||||
break;
|
||||
} else {
|
||||
/*
|
||||
* Insert lowest layer entries before upper ones, this
|
||||
* allows offsets to be reasonably constant
|
||||
*/
|
||||
list_add(&rdd.middle, rdd.list);
|
||||
rdd.is_lowest = true;
|
||||
err = ovl_dir_read(&realpath, &rdd);
|
||||
list_del(&rdd.middle);
|
||||
}
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
 * Position od->cursor on the @pos-th element of the cached entry list
 * (counting from 0), or on the list head itself when the list has no
 * more than @pos entries.
 */
static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
	struct list_head *head = &od->cache->entries;
	struct list_head *cur;
	loff_t idx = 0;

	for (cur = head->next; cur != head && idx < pos; cur = cur->next)
		idx++;

	/* Cursor is safe since the cache is stable */
	od->cursor = cur;
}
|
||||
|
||||
static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
|
||||
{
|
||||
int res;
|
||||
struct ovl_dir_cache *cache;
|
||||
|
||||
cache = ovl_dir_cache(dentry);
|
||||
if (cache && ovl_dentry_version_get(dentry) == cache->version) {
|
||||
cache->refcount++;
|
||||
return cache;
|
||||
}
|
||||
ovl_set_dir_cache(dentry, NULL);
|
||||
|
||||
cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
|
||||
if (!cache)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
cache->refcount = 1;
|
||||
INIT_LIST_HEAD(&cache->entries);
|
||||
|
||||
res = ovl_dir_read_merged(dentry, &cache->entries);
|
||||
if (res) {
|
||||
ovl_cache_free(&cache->entries);
|
||||
kfree(cache);
|
||||
return ERR_PTR(res);
|
||||
}
|
||||
|
||||
cache->version = ovl_dentry_version_get(dentry);
|
||||
ovl_set_dir_cache(dentry, cache);
|
||||
|
||||
return cache;
|
||||
}
|
||||
|
||||
static int ovl_iterate(struct file *file, struct dir_context *ctx)
|
||||
{
|
||||
struct ovl_dir_file *od = file->private_data;
|
||||
struct dentry *dentry = file->f_path.dentry;
|
||||
struct ovl_cache_entry *p;
|
||||
|
||||
if (!ctx->pos)
|
||||
ovl_dir_reset(file);
|
||||
|
||||
if (od->is_real)
|
||||
return iterate_dir(od->realfile, ctx);
|
||||
|
||||
if (!od->cache) {
|
||||
struct ovl_dir_cache *cache;
|
||||
|
||||
cache = ovl_cache_get(dentry);
|
||||
if (IS_ERR(cache))
|
||||
return PTR_ERR(cache);
|
||||
|
||||
od->cache = cache;
|
||||
ovl_seek_cursor(od, ctx->pos);
|
||||
}
|
||||
|
||||
while (od->cursor != &od->cache->entries) {
|
||||
p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
|
||||
if (!p->is_whiteout)
|
||||
if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
|
||||
break;
|
||||
od->cursor = p->l_node.next;
|
||||
ctx->pos++;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * ->llseek() for overlay directories; runs under the inode lock.
 *
 * If the file position is 0 the per-open state is reset first.  For a real
 * directory the seek is forwarded to the underlying file and its resulting
 * position is mirrored into @file.  For a merged directory only SEEK_SET and
 * SEEK_CUR are accepted; the new position is stored and, when a cache is
 * attached, the cursor is moved to match.
 *
 * Returns the new offset, the result of vfs_llseek(), or -EINVAL for an
 * unsupported @origin or a negative resulting offset.
 */
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
	struct ovl_dir_file *od = file->private_data;
	struct inode *inode = file_inode(file);
	loff_t ret = -EINVAL;

	inode_lock(inode);

	if (!file->f_pos)
		ovl_dir_reset(file);

	if (od->is_real) {
		ret = vfs_llseek(od->realfile, offset, origin);
		file->f_pos = od->realfile->f_pos;
		goto unlock;
	}

	/* Merged directory: translate @origin into an absolute offset */
	if (origin == SEEK_CUR)
		offset += file->f_pos;
	else if (origin != SEEK_SET)
		goto unlock;

	if (offset < 0)
		goto unlock;

	if (offset != file->f_pos) {
		file->f_pos = offset;
		/* Without a cache the cursor is positioned on the next read */
		if (od->cache)
			ovl_seek_cursor(od, offset);
	}
	ret = offset;

unlock:
	inode_unlock(inode);

	return ret;
}
|
||||
|
||||
/*
 * ->fsync() for overlay directories.
 *
 * Normally syncs the file that was opened in ovl_dir_open().  If this open
 * did not start on the upper layer but the dentry now has an upper type, an
 * upper file is used instead: the one already published in od->upperfile,
 * or a freshly opened one that is published under the inode lock.
 *
 * Returns the result of vfs_fsync_range(), or the error from opening the
 * upper directory when no upper file could be obtained.
 */
static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
			 int datasync)
{
	struct ovl_dir_file *od = file->private_data;
	struct dentry *dentry = file->f_path.dentry;
	struct file *target = od->realfile;
	struct file *opened;
	struct inode *inode;
	struct path upperpath;

	/* Only an open that began on a lower dir may need to switch files */
	if (od->is_upper || !OVL_TYPE_UPPER(ovl_path_type(dentry)))
		goto sync;

	target = lockless_dereference(od->upperfile);
	if (target)
		goto sync;

	inode = file_inode(file);

	/* Open outside the lock; decide under the lock whose file wins */
	ovl_path_upper(dentry, &upperpath);
	opened = ovl_path_open(&upperpath, O_RDONLY);
	smp_mb__before_spinlock();
	inode_lock(inode);
	if (od->upperfile) {
		/* Somebody has beaten us to it: discard ours, use theirs */
		if (!IS_ERR(opened))
			fput(opened);
		target = od->upperfile;
	} else if (IS_ERR(opened)) {
		inode_unlock(inode);
		return PTR_ERR(opened);
	} else {
		od->upperfile = opened;
		target = opened;
	}
	inode_unlock(inode);

sync:
	return vfs_fsync_range(target, start, end, datasync);
}
|
||||
|
||||
static int ovl_dir_release(struct inode *inode, struct file *file)
|
||||
{
|
||||
struct ovl_dir_file *od = file->private_data;
|
||||
|
||||
if (od->cache) {
|
||||
inode_lock(inode);
|
||||
ovl_cache_put(od, file->f_path.dentry);
|
||||
inode_unlock(inode);
|
||||
}
|
||||
fput(od->realfile);
|
||||
if (od->upperfile)
|
||||
fput(od->upperfile);
|
||||
kfree(od);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ovl_dir_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
struct path realpath;
|
||||
struct file *realfile;
|
||||
struct ovl_dir_file *od;
|
||||
enum ovl_path_type type;
|
||||
|
||||
od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
|
||||
if (!od)
|
||||
return -ENOMEM;
|
||||
|
||||
type = ovl_path_real(file->f_path.dentry, &realpath);
|
||||
realfile = ovl_path_open(&realpath, file->f_flags);
|
||||
if (IS_ERR(realfile)) {
|
||||
kfree(od);
|
||||
return PTR_ERR(realfile);
|
||||
}
|
||||
od->realfile = realfile;
|
||||
od->is_real = !OVL_TYPE_MERGE(type);
|
||||
od->is_upper = OVL_TYPE_UPPER(type);
|
||||
file->private_data = od;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
const struct file_operations ovl_dir_operations = {
|
||||
.read = generic_read_dir,
|
||||
.open = ovl_dir_open,
|
||||
.iterate = ovl_iterate,
|
||||
.llseek = ovl_dir_llseek,
|
||||
.fsync = ovl_dir_fsync,
|
||||
.release = ovl_dir_release,
|
||||
};
|
||||
|
||||
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
|
||||
{
|
||||
int err;
|
||||
struct ovl_cache_entry *p;
|
||||
|
||||
err = ovl_dir_read_merged(dentry, list);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
err = 0;
|
||||
|
||||
list_for_each_entry(p, list, l_node) {
|
||||
if (p->is_whiteout)
|
||||
continue;
|
||||
|
||||
if (p->name[0] == '.') {
|
||||
if (p->len == 1)
|
||||
continue;
|
||||
if (p->len == 2 && p->name[1] == '.')
|
||||
continue;
|
||||
}
|
||||
err = -ENOTEMPTY;
|
||||
break;
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
|
||||
{
|
||||
struct ovl_cache_entry *p;
|
||||
|
||||
inode_lock_nested(upper->d_inode, I_MUTEX_CHILD);
|
||||
list_for_each_entry(p, list, l_node) {
|
||||
struct dentry *dentry;
|
||||
|
||||
if (!p->is_whiteout)
|
||||
continue;
|
||||
|
||||
dentry = lookup_one_len(p->name, upper, p->len);
|
||||
if (IS_ERR(dentry)) {
|
||||
pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
|
||||
upper->d_name.name, p->len, p->name,
|
||||
(int) PTR_ERR(dentry));
|
||||
continue;
|
||||
}
|
||||
if (dentry->d_inode)
|
||||
ovl_cleanup(upper->d_inode, dentry);
|
||||
dput(dentry);
|
||||
}
|
||||
inode_unlock(upper->d_inode);
|
||||
}
|
||||
|
||||
/*
 * dir_context actor used by ovl_check_d_type_supported().
 *
 * Marks d_type as supported in the enclosing ovl_readdir_data as soon as
 * one entry other than "." or ".." carries a d_type different from
 * DT_UNKNOWN.  Always returns 0.
 */
static int ovl_check_d_type(struct dir_context *ctx, const char *name,
			    int namelen, loff_t offset, u64 ino,
			    unsigned int d_type)
{
	struct ovl_readdir_data *rdd;

	/* Even if d_type is not supported, DT_DIR is returned for . and .. */
	if (strncmp(name, ".", namelen) == 0 ||
	    strncmp(name, "..", namelen) == 0)
		return 0;

	if (d_type == DT_UNKNOWN)
		return 0;

	rdd = container_of(ctx, struct ovl_readdir_data, ctx);
	rdd->d_type_supported = true;

	return 0;
}
|
||||
|
||||
/*
|
||||
* Returns 1 if d_type is supported, 0 not supported/unknown. Negative values
|
||||
* if error is encountered.
|
||||
*/
|
||||
int ovl_check_d_type_supported(struct path *realpath)
|
||||
{
|
||||
int err;
|
||||
struct ovl_readdir_data rdd = {
|
||||
.ctx.actor = ovl_check_d_type,
|
||||
.d_type_supported = false,
|
||||
};
|
||||
|
||||
err = ovl_dir_read(realpath, &rdd);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
return rdd.d_type_supported;
|
||||
}
|
||||
1363
executer/kernel/mcoverlayfs/linux-4.6.7/super.c
Normal file
1363
executer/kernel/mcoverlayfs/linux-4.6.7/super.c
Normal file
File diff suppressed because it is too large
Load Diff
@ -1,27 +1,43 @@
|
||||
CC=@CC@
|
||||
BINDIR=@BINDIR@
|
||||
prefix=@prefix@
|
||||
exec_prefix=@exec_prefix@
|
||||
LIBDIR=@libdir@
|
||||
MCKERNEL_LIBDIR=@MCKERNEL_LIBDIR@
|
||||
KDIR ?= @KDIR@
|
||||
CFLAGS=-Wall -O -I.
|
||||
CFLAGS=-Wall -O -I. -I$(VPATH)/arch/${ARCH}
|
||||
VPATH=@abs_srcdir@
|
||||
TARGET=mcexec
|
||||
TARGET=mcexec libsched_yield
|
||||
@uncomment_if_ENABLE_MEMDUMP@TARGET+=eclair
|
||||
LIBS=@LIBS@
|
||||
ARCH=@ARCH@
|
||||
IHKDIR ?= $(VPATH)/../../../ihk/linux/include/
|
||||
|
||||
all: $(TARGET)
|
||||
|
||||
mcexec: mcexec.c
|
||||
$(CC) -I${KDIR} $(CFLAGS) $(EXTRA_CFLAGS) -fPIE -pie -lrt -pthread -o $@ $^ $(EXTRA_OBJS)
|
||||
mcexec: mcexec.c libmcexec.a
|
||||
$(CC) -I${KDIR} $(CFLAGS) $(EXTRA_CFLAGS) -DLIBDIR=\"$(LIBDIR)\" -fPIE -pie -L. -lmcexec -lrt -lnuma -pthread -o $@ $^ $(EXTRA_OBJS)
|
||||
|
||||
eclair: eclair.c
|
||||
$(CC) $(CFLAGS) -o $@ $^ $(LIBS)
|
||||
$(CC) $(CFLAGS) -I${IHKDIR} -o $@ $^ $(LIBS)
|
||||
|
||||
clean:
|
||||
libsched_yield: libsched_yield.c
|
||||
$(CC) -shared -fPIC -Wl,-soname,sched_yield.so.1 -o libsched_yield.so.1.0.0 $^ -lc -ldl
|
||||
|
||||
libmcexec.a::
|
||||
(cd arch/${ARCH}; make)
|
||||
|
||||
clean::
|
||||
(cd arch/${ARCH}; make clean)
|
||||
$(RM) $(TARGET) *.o
|
||||
|
||||
.PHONY: all clean install
|
||||
|
||||
install:
|
||||
install::
|
||||
(cd arch/${ARCH}; make install)
|
||||
mkdir -p -m 755 $(BINDIR)
|
||||
install -m 755 mcexec $(BINDIR)
|
||||
mkdir -p -m 755 $(MCKERNEL_LIBDIR)
|
||||
install -m 755 libsched_yield.so.1.0.0 $(MCKERNEL_LIBDIR)
|
||||
@uncomment_if_ENABLE_MEMDUMP@install -m 755 eclair $(BINDIR)
|
||||
|
||||
|
||||
23
executer/user/arch/x86_64/Makefile.in
Normal file
23
executer/user/arch/x86_64/Makefile.in
Normal file
@ -0,0 +1,23 @@
|
||||
# Builds ../../libmcexec.a from the architecture-dependent assembly source.
CC=@CC@
AR=ar
BINDIR=@BINDIR@
KDIR ?= @KDIR@
CFLAGS=-Wall -O -I.
VPATH=@abs_srcdir@
TARGET=../../libmcexec.a
LIBS=@LIBS@

.PHONY: all clean install

all: $(TARGET)

# The archive holds only the arch-dependent object.
$(TARGET): archdep.o
	$(AR) cr $@ archdep.o

# archdep.S is located through VPATH, hence $< rather than a literal name.
archdep.o: archdep.S
	$(CC) -c -I${KDIR} $(CFLAGS) $(EXTRA_CFLAGS) -fPIE -pie -pthread $<

clean:
	$(RM) $(TARGET) *.o

# Nothing is installed from this directory; the target exists so that
# "make install" can be run here without error.
install:
|
||||
113
executer/user/arch/x86_64/arch_args.h
Normal file
113
executer/user/arch/x86_64/arch_args.h
Normal file
@ -0,0 +1,113 @@
|
||||
#ifndef ARCH_ARGS_H
#define ARCH_ARGS_H

/*
 * x86_64 accessors for the system call state of a traced process.
 *
 * The register set is fetched and stored with ptrace(PTRACE_GETREGS /
 * PTRACE_SETREGS); the helpers below name the individual registers by their
 * role in a system call:
 *
 *   number: orig_rax   return: rax
 *   arg1..arg6: rdi, rsi, rdx, r10, r8, r9
 *
 * The including file must already provide ptrace(), NULL and
 * struct user_regs_struct.
 */
typedef struct user_regs_struct syscall_args;

/* Read the registers of @pid into @args; returns the ptrace() result. */
static inline int
get_syscall_args(int pid, syscall_args *args)
{
	return ptrace(PTRACE_GETREGS, pid, NULL, args);
}

/* Write @args back as the registers of @pid; returns the ptrace() result. */
static inline int
set_syscall_args(int pid, syscall_args *args)
{
	return ptrace(PTRACE_SETREGS, pid, NULL, args);
}

/* Getters: none of them modifies @args, hence the const qualifier. */

static inline unsigned long
get_syscall_number(const syscall_args *args)
{
	return args->orig_rax;
}

static inline unsigned long
get_syscall_return(const syscall_args *args)
{
	return args->rax;
}

static inline unsigned long
get_syscall_arg1(const syscall_args *args)
{
	return args->rdi;
}

static inline unsigned long
get_syscall_arg2(const syscall_args *args)
{
	return args->rsi;
}

static inline unsigned long
get_syscall_arg3(const syscall_args *args)
{
	return args->rdx;
}

static inline unsigned long
get_syscall_arg4(const syscall_args *args)
{
	return args->r10;
}

static inline unsigned long
get_syscall_arg5(const syscall_args *args)
{
	return args->r8;
}

static inline unsigned long
get_syscall_arg6(const syscall_args *args)
{
	return args->r9;
}

/* Setters: update the in-memory copy only; see set_syscall_args(). */

static inline void
set_syscall_number(syscall_args *args, unsigned long value)
{
	args->orig_rax = value;
}

static inline void
set_syscall_return(syscall_args *args, unsigned long value)
{
	args->rax = value;
}

static inline void
set_syscall_arg1(syscall_args *args, unsigned long value)
{
	args->rdi = value;
}

static inline void
set_syscall_arg2(syscall_args *args, unsigned long value)
{
	args->rsi = value;
}

static inline void
set_syscall_arg3(syscall_args *args, unsigned long value)
{
	args->rdx = value;
}

static inline void
set_syscall_arg4(syscall_args *args, unsigned long value)
{
	args->r10 = value;
}

static inline void
set_syscall_arg5(syscall_args *args, unsigned long value)
{
	args->r8 = value;
}

static inline void
set_syscall_arg6(syscall_args *args, unsigned long value)
{
	args->r9 = value;
}
#endif
|
||||
149
executer/user/arch/x86_64/archdep.S
Normal file
149
executer/user/arch/x86_64/archdep.S
Normal file
@ -0,0 +1,149 @@
|
||||
/*
arg: rdi, rsi, rdx, rcx, r8, r9
ret: rax

rax syscall number
syscall: (rax:num) rdi rsi rdx r10 r8 r9 (rcx:ret addr)
fd, cmd, param
rdi: fd
rsi: cmd
rdx: param
rcx: save area
r8: new thread context

Context layout used below (byte offsets into a save area / context):
  0x00 written as 0 on save	0x08 rax	0x10 rbx	0x18 rcx
  0x20 rdx	0x28 rsi	0x30 rdi	0x38 rbp
  0x40 r8	0x48 r9		0x50..0x78 r10..r15
  0x80 rflags	0x88 return address	0x90 rsp
  0x98 value handed to arch_prctl(ARCH_SET_FS); only read here

Flow:
  1. Save the caller's registers into the save area (rcx).
  2. Issue ioctl(fd, cmd, param).
  3. ioctl returned 0: load the context at r8 (including rsp and FS base),
     set rax to 0 and jump to the address held in that context's 0x18 slot.
  4. ioctl returned non-zero: reload the caller's registers from the save
     area and return to the caller with rax = ioctl result (-1 if the raw
     result was in the error range at or above 0x...f001).
*/

	.global switch_ctx
switch_ctx:
	/* 1. save the current register state into the save area */
	movq	$0,0x00(%rcx)
	movq	%rax,0x8(%rcx)
	movq	%rbx,0x10(%rcx)
	movq	%rcx,0x18(%rcx)
	movq	%rdx,0x20(%rcx)
	movq	%rsi,0x28(%rcx)
	movq	%rdi,0x30(%rcx)
	movq	%rbp,0x38(%rcx)
	movq	%r8,0x40(%rcx)
	movq	%r9,0x48(%rcx)
	movq	%r10,0x50(%rcx)
	movq	%r11,0x58(%rcx)
	movq	%r12,0x60(%rcx)
	movq	%r13,0x68(%rcx)
	movq	%r14,0x70(%rcx)
	movq	%r15,0x78(%rcx)
	pushfq
	popq	%rax
	movq	%rax,0x80(%rcx)
	movq	0x00(%rsp),%rax		/* return address of this call */
	movq	%rax,0x88(%rcx)
	movq	%rsp,0x90(%rcx)
	movq	%rcx,%r10		/* syscall overwrites rcx; keep a copy */

	pushq	%rcx
	pushq	%r8
	pushq	%rax

	/* 2. ioctl(fd, cmd, param) with rdi/rsi/rdx as passed in */
	mov	$0x10,%eax /* ioctl */
	syscall
3:

	popq	%r8			/* discard the pushed rax */
	popq	%r8
	popq	%rcx

	movq	%r10,%rcx
	cmp	$0xfffffffffffff001,%eax
	jae	1f

	test	%eax,%eax
	jnz	2f

	/* 3. ioctl returned 0: switch to the context at r8 */
	pushq	%rax
	movq	$158,%rax /* arch_prctl */
	movq	$0x1002,%rdi /* ARCH_SET_FS */
	movq	0x98(%r8),%rsi
	syscall
	popq	%rax

	movq	0x10(%r8),%rbx
	movq	0x18(%r8),%rcx
	movq	0x20(%r8),%rdx
	movq	0x28(%r8),%rsi
	movq	0x30(%r8),%rdi
	movq	0x38(%r8),%rbp
	movq	0x48(%r8),%r9
	movq	0x50(%r8),%r10
	movq	0x58(%r8),%r11
	movq	0x60(%r8),%r12
	movq	0x68(%r8),%r13
	movq	0x70(%r8),%r14
	movq	0x78(%r8),%r15
	movq	0x80(%r8),%rax
	pushq	%rax
	popfq
	movq	0x90(%r8),%rsp
//	movq	0x8(%r8),%rax /* for interrupts */
	movq	0x40(%r8),%r8		/* r8 last: it is the base register */

	movq	$0,%rax /* ioctl return */

	/* continue at the address loaded from the context's 0x18 slot */
	pushq	%rcx
	retq

1:
	mov	$0xffffffffffffffff,%eax
2:
	/* 4. no switch: restore the caller's state from the save area */
	pushq	%rax
	/*
	 * The syscall instruction overwrites rcx (and r11).  rcx is the base
	 * of the save area for every load below, so it must survive the
	 * arch_prctl call.
	 */
	pushq	%rcx
	movq	$158,%rax /* arch_prctl */
	movq	$0x1002,%rdi /* ARCH_SET_FS */
	movq	0x98(%rcx),%rsi
	syscall
	popq	%rcx
	popq	%rax

	movq	0x10(%rcx),%rbx
	movq	0x28(%rcx),%rsi
	movq	0x30(%rcx),%rdi
	movq	0x38(%rcx),%rbp
	movq	0x40(%rcx),%r8
	movq	0x48(%rcx),%r9
	movq	0x50(%rcx),%r10
	movq	0x58(%rcx),%r11
	movq	0x60(%rcx),%r12
	movq	0x68(%rcx),%r13
	movq	0x70(%rcx),%r14
	movq	0x78(%rcx),%r15
	movq	0x80(%rcx),%rdx
	pushq	%rdx
	popfq
	movq	0x20(%rcx),%rdx
	movq	0x18(%rcx),%rcx		/* rcx last: it is the base register */
	retq
|
||||
|
||||
/*
 * unsigned long
 * compare_and_swap(unsigned long *addr, unsigned long old, unsigned long new);
 *
 * arg: rdi, rsi, rdx, rcx, r8, r9
 * ret: rax
 *
 *   rdi: addr
 *   rsi: old
 *   rdx: new
 *
 * Atomically stores new to *addr if *addr equals old.
 * RET: old value (the contents of *addr before the operation)
 */
	.global compare_and_swap
compare_and_swap:
	movq	%rsi, %rax		/* cmpxchg compares against rax */
	lock cmpxchgq %rdx, (%rdi)	/* on mismatch rax receives *addr */
	retq
|
||||
|
||||
/*
 * unsigned int
 * compare_and_swap_int(unsigned int *addr, unsigned int old, unsigned int new);
 *
 * 32-bit variant: atomically stores new to *addr if *addr equals old.
 * ret: old value (the contents of *addr before the operation)
 */
	.global compare_and_swap_int
compare_and_swap_int:
	movl	%esi, %eax		/* cmpxchg compares against eax */
	lock cmpxchgl %edx, (%rdi)	/* on mismatch eax receives *addr */
	retq
|
||||
|
||||
3
executer/user/archdep.h
Normal file
3
executer/user/archdep.h
Normal file
@ -0,0 +1,3 @@
|
||||
/* Prototypes of the routines provided by the architecture-specific archdep.S. */

/* Save the current context in lctx, issue ioctl(fd, cmd, param), resume rctx when it returns 0. */
int switch_ctx(int fd, unsigned long cmd, void **param, void *lctx, void *rctx);

/* Atomic compare-and-swap on *addr; both return the value found in *addr. */
unsigned long compare_and_swap(unsigned long *addr, unsigned long old, unsigned long new);
unsigned int compare_and_swap_int(unsigned int *addr, unsigned int old, unsigned int new);
|
||||
@ -16,6 +16,8 @@
|
||||
#include <unistd.h>
|
||||
#include <sys/socket.h>
|
||||
#include <arpa/inet.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <ihk/ihk_host_user.h>
|
||||
|
||||
#define CPU_TID_BASE 1000000
|
||||
|
||||
@ -25,6 +27,10 @@ struct options {
|
||||
char *kernel_path;
|
||||
char *dump_path;
|
||||
char *log_path;
|
||||
int interactive;
|
||||
int os_id;
|
||||
int mcos_fd;
|
||||
int print_idle;
|
||||
}; /* struct options */
|
||||
|
||||
struct thread_info {
|
||||
@ -42,7 +48,7 @@ struct thread_info {
|
||||
int tid;
|
||||
int cpu;
|
||||
int lcpu;
|
||||
int padding;
|
||||
int idle;
|
||||
uintptr_t process;
|
||||
uintptr_t clv;
|
||||
uintptr_t x86_clv;
|
||||
@ -53,6 +59,7 @@ static volatile int f_done = 0;
|
||||
static bfd *symbfd = NULL;
|
||||
static bfd *dumpbfd = NULL;
|
||||
static asection *dumpscn = NULL;
|
||||
static dump_mem_chunks_t *mem_chunks;
|
||||
static int num_processors = -1;
|
||||
static asymbol **symtab = NULL;
|
||||
static ssize_t nsyms;
|
||||
@ -91,25 +98,35 @@ static uintptr_t virt_to_phys(uintptr_t va) {
|
||||
static int read_physmem(uintptr_t pa, void *buf, size_t size) {
|
||||
off_t off;
|
||||
bfd_boolean ok;
|
||||
int i;
|
||||
|
||||
if (pa < dumpscn->vma) {
|
||||
printf("read_physmem(%lx,%p,%lx):too small pa. vma %lx\n", pa, buf, size, dumpscn->vma);
|
||||
return 1;
|
||||
}
|
||||
off = pa - dumpscn->vma;
|
||||
if (off >= dumpscn->size) {
|
||||
printf("read_physmem(%lx,%p,%lx):too large pa. vma %lx size %lx\n", pa, buf, size, dumpscn->vma, dumpscn->size);
|
||||
return 1;
|
||||
}
|
||||
if ((dumpscn->size - off) < size) {
|
||||
printf("read_physmem(%lx,%p,%lx):too large size. vma %lx size %lx\n", pa, buf, size, dumpscn->vma, dumpscn->size);
|
||||
off = 0;
|
||||
/* Check if pa is valid in any chunks and figure
|
||||
* out the global offset in dump section */
|
||||
for (i = 0; i < mem_chunks->nr_chunks; ++i) {
|
||||
|
||||
if (mem_chunks->chunks[i].addr <= pa &&
|
||||
((pa + size) <= (mem_chunks->chunks[i].addr +
|
||||
mem_chunks->chunks[i].size))) {
|
||||
|
||||
off += (pa - mem_chunks->chunks[i].addr);
|
||||
break;
|
||||
}
|
||||
|
||||
off += mem_chunks->chunks[i].size;
|
||||
}
|
||||
|
||||
if (i == mem_chunks->nr_chunks) {
|
||||
printf("read_physmem: invalid addr 0x%lx\n", pa);
|
||||
return 1;
|
||||
}
|
||||
|
||||
ok = bfd_get_section_contents(dumpbfd, dumpscn, buf, off, size);
|
||||
if (!ok) {
|
||||
bfd_perror("read_physmem:bfd_get_section_contents");
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
} /* read_physmem() */
|
||||
|
||||
@ -125,7 +142,21 @@ static int read_mem(uintptr_t va, void *buf, size_t size) {
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
error = read_physmem(pa, buf, size);
|
||||
|
||||
if (opt.interactive) {
|
||||
dumpargs_t args;
|
||||
|
||||
args.cmd = DUMP_READ;
|
||||
args.start = pa;
|
||||
args.size = size;
|
||||
args.buf = buf;
|
||||
|
||||
error = ioctl(opt.mcos_fd, IHK_OS_DUMP, &args);
|
||||
}
|
||||
else {
|
||||
error = read_physmem(pa, buf, size);
|
||||
}
|
||||
|
||||
if (error) {
|
||||
perror("read_mem:read_physmem");
|
||||
return 1;
|
||||
@ -167,6 +198,7 @@ enum {
|
||||
CURRENT_OFFSET,
|
||||
RUNQ_OFFSET,
|
||||
CPU_STATUS_OFFSET,
|
||||
IDLE_THREAD_OFFSET,
|
||||
|
||||
/* process */
|
||||
CTX_OFFSET,
|
||||
@ -204,6 +236,7 @@ static int setup_constants(void) {
|
||||
printf("CURRENT_OFFSET: %ld\n", K(CURRENT_OFFSET));
|
||||
printf("RUNQ_OFFSET: %ld\n", K(RUNQ_OFFSET));
|
||||
printf("CPU_STATUS_OFFSET: %ld\n", K(CPU_STATUS_OFFSET));
|
||||
printf("IDLE_THREAD_OFFSET: %ld\n", K(IDLE_THREAD_OFFSET));
|
||||
printf("CTX_OFFSET: %ld\n", K(CTX_OFFSET));
|
||||
printf("SCHED_LIST_OFFSET: %ld\n", K(SCHED_LIST_OFFSET));
|
||||
printf("PROC_OFFSET: %ld\n", K(PROC_OFFSET));
|
||||
@ -229,6 +262,7 @@ static int setup_threads(void) {
|
||||
perror("num_processors");
|
||||
return 1;
|
||||
}
|
||||
printf("%s: num_processors: %d\n", __FUNCTION__, num_processors);
|
||||
|
||||
error = read_symbol_64("locals", &locals);
|
||||
if (error) {
|
||||
@ -315,15 +349,19 @@ static int setup_threads(void) {
|
||||
ti->status = status;
|
||||
ti->pid = pid;
|
||||
ti->tid = tid;
|
||||
ti->cpu = (thread == current)? cpu: -1;
|
||||
ti->cpu = (thread == current) ? cpu : -1;
|
||||
ti->lcpu = cpu;
|
||||
ti->process = thread;
|
||||
ti->idle = 0;
|
||||
ti->clv = v;
|
||||
ti->x86_clv = locals + locals_span*cpu;
|
||||
|
||||
*titailp = ti;
|
||||
titailp = &ti->next;
|
||||
|
||||
if (!curr_thread)
|
||||
curr_thread = ti;
|
||||
|
||||
error = read_64(entry, &entry);
|
||||
if (error) {
|
||||
perror("process2");
|
||||
@ -332,8 +370,78 @@ static int setup_threads(void) {
|
||||
}
|
||||
}
|
||||
|
||||
/* Set up idle threads */
|
||||
if (opt.print_idle) {
|
||||
for (cpu = 0; cpu < num_processors; ++cpu) {
|
||||
uintptr_t v;
|
||||
uintptr_t thread;
|
||||
uintptr_t proc;
|
||||
int pid;
|
||||
int tid;
|
||||
struct thread_info *ti;
|
||||
int status;
|
||||
|
||||
v = clv + (cpu * K(CPU_LOCAL_VAR_SIZE));
|
||||
|
||||
error = read_64(v+K(CURRENT_OFFSET), ¤t);
|
||||
if (error) {
|
||||
perror("current");
|
||||
return 1;
|
||||
}
|
||||
|
||||
ti = malloc(sizeof(*ti));
|
||||
if (!ti) {
|
||||
perror("malloc");
|
||||
return 1;
|
||||
}
|
||||
|
||||
thread = v+K(IDLE_THREAD_OFFSET);
|
||||
|
||||
error = read_64(thread+K(PROC_OFFSET), &proc);
|
||||
if (error) {
|
||||
perror("proc");
|
||||
return 1;
|
||||
}
|
||||
|
||||
error = read_32(thread+K(STATUS_OFFSET), &status);
|
||||
if (error) {
|
||||
perror("status");
|
||||
return 1;
|
||||
}
|
||||
|
||||
error = read_32(proc+K(PID_OFFSET), &pid);
|
||||
if (error) {
|
||||
perror("pid");
|
||||
return 1;
|
||||
}
|
||||
|
||||
error = read_32(thread+K(TID_OFFSET), &tid);
|
||||
if (error) {
|
||||
perror("tid");
|
||||
return 1;
|
||||
}
|
||||
|
||||
ti->next = NULL;
|
||||
ti->status = status;
|
||||
ti->pid = 1;
|
||||
ti->tid = 2000000000 + tid;
|
||||
ti->cpu = (thread == current) ? cpu : -1;
|
||||
ti->lcpu = cpu;
|
||||
ti->process = thread;
|
||||
ti->idle = 1;
|
||||
ti->clv = v;
|
||||
ti->x86_clv = locals + locals_span*cpu;
|
||||
|
||||
*titailp = ti;
|
||||
titailp = &ti->next;
|
||||
|
||||
if (!curr_thread)
|
||||
curr_thread = ti;
|
||||
}
|
||||
}
|
||||
|
||||
if (!tihead) {
|
||||
printf("thread not found. cpu mode forcibly\n");
|
||||
printf("No threads found, forcing CPU mode.\n");
|
||||
opt.cpu = 1;
|
||||
}
|
||||
|
||||
@ -374,6 +482,7 @@ static int setup_threads(void) {
|
||||
ti->tid = CPU_TID_BASE + cpu;
|
||||
ti->cpu = cpu;
|
||||
ti->process = current;
|
||||
ti->idle = 1;
|
||||
ti->clv = v;
|
||||
ti->x86_clv = locals + locals_span*cpu;
|
||||
|
||||
@ -386,7 +495,9 @@ static int setup_threads(void) {
|
||||
printf("thread not found\n");
|
||||
return 1;
|
||||
}
|
||||
curr_thread = tihead;
|
||||
|
||||
if (!curr_thread)
|
||||
curr_thread = tihead;
|
||||
|
||||
return 0;
|
||||
} /* setup_threads() */
|
||||
@ -448,13 +559,32 @@ static int setup_dump(char *fname) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
dumpscn = bfd_get_section_by_name(dumpbfd, "physmem");
|
||||
mem_chunks = malloc(PHYS_CHUNKS_DESC_SIZE);
|
||||
if (!mem_chunks) {
|
||||
perror("allocating mem chunks descriptor: ");
|
||||
return 1;
|
||||
}
|
||||
|
||||
dumpscn = bfd_get_section_by_name(dumpbfd, "physchunks");
|
||||
if (!dumpscn) {
|
||||
bfd_perror("bfd_get_section_by_name");
|
||||
return 1;
|
||||
}
|
||||
|
||||
kernel_base = dumpscn->vma + 0x200000;
|
||||
ok = bfd_get_section_contents(dumpbfd, dumpscn, mem_chunks,
|
||||
0, PHYS_CHUNKS_DESC_SIZE);
|
||||
if (!ok) {
|
||||
bfd_perror("read_physmem:bfd_get_section_contents");
|
||||
return 1;
|
||||
}
|
||||
|
||||
kernel_base = mem_chunks->kernel_base;
|
||||
|
||||
dumpscn = bfd_get_section_by_name(dumpbfd, "physmem");
|
||||
if (!dumpscn) {
|
||||
bfd_perror("bfd_get_section_by_name");
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
} /* setup_dump() */
|
||||
@ -609,18 +739,21 @@ static void command(char *cmd, char *res) {
|
||||
break;
|
||||
}
|
||||
|
||||
//if (regs[17] > MAP_KERNEL) {}
|
||||
pu8 = (void *)®s;
|
||||
for (i = 0; i < sizeof(regs)-4; ++i) {
|
||||
rbp += sprintf(rbp, "%02x", pu8[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
/*
|
||||
else if (!strcmp(p, "mffffffff80018a82,1")) {
|
||||
rbp += sprintf(rbp, "b8");
|
||||
}
|
||||
else if (!strcmp(p, "mffffffff80018a82,9")) {
|
||||
rbp += sprintf(rbp, "b8f2ffffff41564155");
|
||||
}
|
||||
*/
|
||||
else if (!strncmp(p, "m", 1)) {
|
||||
int n;
|
||||
uintptr_t start;
|
||||
@ -716,33 +849,35 @@ static void command(char *cmd, char *res) {
|
||||
break;
|
||||
}
|
||||
q = buf;
|
||||
q += sprintf(q, "PID %d, ", ti->pid);
|
||||
if (ti->status & PS_RUNNING) {
|
||||
q += sprintf(q, "running on cpu%d", ti->cpu);
|
||||
q += sprintf(q, "%srunning on cpu %d",
|
||||
ti->idle ? "idle " : "", ti->lcpu);
|
||||
}
|
||||
else if (ti->status & (PS_INTERRUPTIBLE | PS_UNINTERRUPTIBLE)) {
|
||||
q += sprintf(q, "waiting on cpu%d", ti->lcpu);
|
||||
q += sprintf(q, "%swaiting on cpu %d",
|
||||
ti->idle ? "idle " : "", ti->lcpu);
|
||||
}
|
||||
else if (ti->status & PS_STOPPED) {
|
||||
q += sprintf(q, "stopped on cpu%d", ti->lcpu);
|
||||
q += sprintf(q, "%sstopped on cpu %d",
|
||||
ti->idle ? "idle " : "", ti->lcpu);
|
||||
}
|
||||
else if (ti->status & PS_TRACED) {
|
||||
q += sprintf(q, "traced on cpu%d", ti->lcpu);
|
||||
q += sprintf(q, "%straced on cpu %d",
|
||||
ti->idle ? "idle " : "", ti->lcpu);
|
||||
}
|
||||
else if (ti->status == CS_IDLE) {
|
||||
q += sprintf(q, "cpu%d idle", ti->cpu);
|
||||
q += sprintf(q, "cpu %d idle", ti->cpu);
|
||||
}
|
||||
else if (ti->status == CS_RUNNING) {
|
||||
q += sprintf(q, "cpu%d running", ti->cpu);
|
||||
q += sprintf(q, "cpu %d running", ti->cpu);
|
||||
}
|
||||
else if (ti->status == CS_RESERVED) {
|
||||
q += sprintf(q, "cpu%d reserved", ti->cpu);
|
||||
q += sprintf(q, "cpu %d reserved", ti->cpu);
|
||||
}
|
||||
else {
|
||||
q += sprintf(q, "status=%#x", ti->status);
|
||||
}
|
||||
if (ti->tid != ti->pid) {
|
||||
q += sprintf(q, ",pid=%d", ti->pid);
|
||||
}
|
||||
rbp += print_hex(rbp, buf);
|
||||
}
|
||||
} while (0);
|
||||
@ -755,11 +890,12 @@ static void options(int argc, char *argv[]) {
|
||||
memset(&opt, 0, sizeof(opt));
|
||||
opt.kernel_path = "./mckernel.img";
|
||||
opt.dump_path = "./mcdump";
|
||||
opt.mcos_fd = -1;
|
||||
|
||||
for (;;) {
|
||||
int c;
|
||||
|
||||
c = getopt(argc, argv, "cd:hk:");
|
||||
c = getopt(argc, argv, "ilcd:hk:o:");
|
||||
if (c < 0) {
|
||||
break;
|
||||
}
|
||||
@ -777,12 +913,32 @@ static void options(int argc, char *argv[]) {
|
||||
case 'd':
|
||||
opt.dump_path = optarg;
|
||||
break;
|
||||
case 'i':
|
||||
opt.interactive = 1;
|
||||
break;
|
||||
case 'o':
|
||||
opt.os_id = atoi(optarg);
|
||||
break;
|
||||
case 'l':
|
||||
opt.print_idle = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (optind < argc) {
|
||||
opt.help = 1;
|
||||
}
|
||||
|
||||
if (opt.interactive) {
|
||||
char fn[128];
|
||||
sprintf(fn, "/dev/mcos%d", opt.os_id);
|
||||
|
||||
opt.mcos_fd = open(fn, O_RDONLY);
|
||||
if (opt.mcos_fd < 0) {
|
||||
perror("open");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
} /* options() */
|
||||
|
||||
@ -865,7 +1021,7 @@ int main(int argc, char *argv[]) {
|
||||
uint8_t sum;
|
||||
uint8_t check;
|
||||
static char lbuf[1024];
|
||||
static char rbuf[1024];
|
||||
static char rbuf[8192];
|
||||
static char cbuf[3];
|
||||
char *lbp;
|
||||
char *p;
|
||||
|
||||
27
executer/user/libsched_yield.c
Normal file
27
executer/user/libsched_yield.c
Normal file
@ -0,0 +1,27 @@
|
||||
#define _GNU_SOURCE
|
||||
#include <dlfcn.h>
|
||||
#include <sys/time.h>
|
||||
#include <sched.h>
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#undef sched_yield
|
||||
|
||||
typedef int (*int_void_fn)(void);
|
||||
|
||||
static int_void_fn orig_sched_yield = 0;
|
||||
|
||||
/*
 * Replacement for sched_yield() that does not yield: it returns 0
 * (success) immediately.
 *
 * The code under "#if 0" is compiled out; when enabled it resolves the next
 * sched_yield definition with dlsym(RTLD_NEXT, ...) and prints a trace line.
 */
int sched_yield(void)
{
#if 0
	if (orig_sched_yield == 0)
		orig_sched_yield = (int_void_fn)dlsym(RTLD_NEXT, "sched_yield");

	printf("sched_yield() called\n");
#endif

	return 0;
}
|
||||
File diff suppressed because it is too large
Load Diff
@ -3,15 +3,16 @@ SRC=$(VPATH)
|
||||
IHKDIR=$(IHKBASE)/$(TARGETDIR)
|
||||
OBJS = init.o mem.o debug.o mikc.o listeners.o ap.o syscall.o cls.o host.o
|
||||
OBJS += process.o copy.o waitq.o futex.o timer.o plist.o fileobj.o shmobj.o
|
||||
OBJS += zeroobj.o procfs.o devobj.o sysfs.o
|
||||
OBJS += zeroobj.o procfs.o devobj.o sysfs.o xpmem.o profile.o freeze.o
|
||||
OBJS += rbtree.o
|
||||
DEPSRCS=$(wildcard $(SRC)/*.c)
|
||||
|
||||
CFLAGS += -I$(SRC)/include -D__KERNEL__ -g
|
||||
CFLAGS += -I$(SRC)/include -I@abs_builddir@/../ -I@abs_builddir@/include -D__KERNEL__ -g -fno-omit-frame-pointer -fno-inline -fno-inline-small-functions
|
||||
LDFLAGS += -e arch_start
|
||||
IHKOBJ = ihk/ihk.o
|
||||
|
||||
include $(SRC)/config/config.$(TARGET)
|
||||
include $(IHKBASE)/Makefile.common
|
||||
include @abs_builddir@/../../ihk/cokernel/Makefile.common
|
||||
|
||||
# CFLAGS += -I$(SRC)/../arch/$(IHKARCH)/kernel/include -I$(SRC)/../lib/include
|
||||
|
||||
|
||||
@ -9,7 +9,7 @@ V ?= $(VERBOSE)
|
||||
KERNEL = kernel.img
|
||||
KERNELS = $(addsuffix /$(KERNEL),$(addprefix $(O)/,$(BUILD_TARGET)))
|
||||
|
||||
SUBCMD_OPTS = V='$(V)'
|
||||
SUBCMD_OPTS = V='$(V)' BUILD_IHK_COKERNEL=@abs_builddir@/../../ihk/cokernel
|
||||
|
||||
$(if $(O),,$(error Specify the compilation target directory))
|
||||
#$(if $(shell ls $(IHKBASE)/Makefile),,\
|
||||
|
||||
46
kernel/ap.c
46
kernel/ap.c
@ -25,10 +25,25 @@
|
||||
#include <init.h>
|
||||
#include <march.h>
|
||||
#include <cls.h>
|
||||
#include <time.h>
|
||||
#include <syscall.h>
|
||||
#include <rusage.h>
|
||||
|
||||
//#define DEBUG_PRINT_AP
|
||||
|
||||
#ifdef DEBUG_PRINT_AP
|
||||
#define dkprintf(...) do { kprintf(__VA_ARGS__); } while (0)
|
||||
#define ekprintf(...) do { kprintf(__VA_ARGS__); } while (0)
|
||||
#else
|
||||
#define dkprintf(...) do { } while (0)
|
||||
#define ekprintf(...) do { kprintf(__VA_ARGS__); } while (0)
|
||||
#endif
|
||||
|
||||
int num_processors = 1;
|
||||
static volatile int ap_stop = 1;
|
||||
|
||||
mcs_lock_node_t ap_syscall_semaphore;
|
||||
|
||||
static void ap_wait(void)
|
||||
{
|
||||
init_tick();
|
||||
@ -43,7 +58,15 @@ static void ap_wait(void)
|
||||
arch_start_pvclock();
|
||||
|
||||
if (find_command_line("hidos")) {
|
||||
init_host_syscall_channel();
|
||||
mcs_lock_node_t mcs_node;
|
||||
int ikc_cpu = ihk_mc_get_ikc_cpu(ihk_mc_get_processor_id());
|
||||
if(ikc_cpu < 0) {
|
||||
ekprintf("%s,ihk_mc_get_ikc_cpu failed\n", __FUNCTION__);
|
||||
}
|
||||
mcs_lock_lock_noirq(&ap_syscall_semaphore, &mcs_node);
|
||||
init_host_ikc2mckernel();
|
||||
init_host_ikc2linux(ikc_cpu);
|
||||
mcs_lock_unlock_noirq(&ap_syscall_semaphore, &mcs_node);
|
||||
}
|
||||
|
||||
pc_ap_init();
|
||||
@ -57,6 +80,7 @@ static void ap_wait(void)
|
||||
void ap_start(void)
|
||||
{
|
||||
init_tick();
|
||||
mcs_lock_init(&ap_syscall_semaphore);
|
||||
ap_stop = 0;
|
||||
sync_tick();
|
||||
}
|
||||
@ -65,7 +89,7 @@ void ap_init(void)
|
||||
{
|
||||
struct ihk_mc_cpu_info *cpu_info;
|
||||
int i;
|
||||
int bsp_hw_id;
|
||||
int bsp_hw_id, bsp_cpu_id;
|
||||
|
||||
ihk_mc_init_ap();
|
||||
init_delay();
|
||||
@ -78,18 +102,28 @@ void ap_init(void)
|
||||
return;
|
||||
}
|
||||
|
||||
kprintf("BSP HW ID = %d\n", bsp_hw_id);
|
||||
bsp_cpu_id = 0;
|
||||
for (i = 0; i < cpu_info->ncpus; ++i) {
|
||||
if (cpu_info->hw_ids[i] == bsp_hw_id) {
|
||||
bsp_cpu_id = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
kprintf("BSP: %d (HW ID: %d @ NUMA %d)\n", bsp_cpu_id,
|
||||
bsp_hw_id, cpu_info->nodes[0]);
|
||||
|
||||
for (i = 0; i < cpu_info->ncpus; i++) {
|
||||
if (cpu_info->hw_ids[i] == bsp_hw_id) {
|
||||
continue;
|
||||
}
|
||||
kprintf("AP Booting: %d (HW ID: %d)\n", i, cpu_info->hw_ids[i]);
|
||||
dkprintf("AP Booting: %d (HW ID: %d @ NUMA %d)\n", i,
|
||||
cpu_info->hw_ids[i], cpu_info->nodes[i]);
|
||||
ihk_mc_boot_cpu(cpu_info->hw_ids[i], (unsigned long)ap_wait);
|
||||
|
||||
num_processors++;
|
||||
}
|
||||
kprintf("AP Booting: Done\n");
|
||||
kprintf("BSP: booted %d AP CPUs\n", cpu_info->ncpus - 1);
|
||||
}
|
||||
|
||||
#include <sysfs.h>
|
||||
@ -199,7 +233,7 @@ cpu_sysfs_setup(void)
|
||||
/* setup table */
|
||||
info = kmalloc(sizeof(*info) * num_processors, IHK_MC_AP_CRITICAL);
|
||||
for (cpu = 0; cpu < num_processors; ++cpu) {
|
||||
info[cpu].online = 10+cpu;
|
||||
info[cpu].online = 1;
|
||||
}
|
||||
fake_cpu_infos = info;
|
||||
|
||||
|
||||
12
kernel/cls.c
12
kernel/cls.c
@ -19,21 +19,29 @@
|
||||
#include <ihk/page_alloc.h>
|
||||
#include <cls.h>
|
||||
#include <page.h>
|
||||
#include <rusage.h>
|
||||
|
||||
extern int num_processors;
|
||||
|
||||
struct cpu_local_var *clv;
|
||||
static int cpu_local_var_initialized = 0;
|
||||
int cpu_local_var_initialized = 0;
|
||||
|
||||
void cpu_local_var_init(void)
|
||||
{
|
||||
int z;
|
||||
int i;
|
||||
|
||||
z = sizeof(struct cpu_local_var) * num_processors;
|
||||
z = (z + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
||||
|
||||
clv = allocate_pages(z, IHK_MC_AP_CRITICAL);
|
||||
clv = ihk_mc_alloc_pages(z, IHK_MC_AP_CRITICAL);
|
||||
memset(clv, 0, z * PAGE_SIZE);
|
||||
|
||||
for (i = 0; i < num_processors; i++) {
|
||||
clv[i].monitor = monitor->cpu + i;
|
||||
INIT_LIST_HEAD(&clv[i].smp_func_req_list);
|
||||
}
|
||||
|
||||
cpu_local_var_initialized = 1;
|
||||
}
|
||||
|
||||
|
||||
@ -37,6 +37,8 @@ static void kprintf_wait(int len, unsigned long *flags_head, int *slide) {
|
||||
if (head < tail) head += buf_len;
|
||||
if (tail + len > buf_len) adj = buf_len - tail;
|
||||
if (head > tail && head <= tail + len + adj) {
|
||||
/* When proceeding tail (producer pointer) by len would
|
||||
cross head (consumer pointer) in ring-buffer */
|
||||
if (mode != 1) {
|
||||
*slide = 1;
|
||||
break;
|
||||
@ -70,6 +72,9 @@ void kputs(char *buf)
|
||||
|
||||
memcpy(kmsg_buf.str + kmsg_buf.tail, buf, len);
|
||||
kmsg_buf.tail += len;
|
||||
/* When proceeding tail (producer pointer) by len would
|
||||
cross head (consumer pointer) in ring-buffer, give up
|
||||
[head, tail] because the range is overwritten */
|
||||
if (slide == 1) {
|
||||
kmsg_buf.head = kmsg_buf.tail + 1;
|
||||
if (kmsg_buf.head >= kmsg_buf.len) kmsg_buf.head = 0;
|
||||
@ -110,6 +115,7 @@ int __kprintf(const char *format, ...)
|
||||
char buf[KPRINTF_LOCAL_BUF_LEN];
|
||||
|
||||
/* Copy into the local buf */
|
||||
len = sprintf(buf, "[%3d]: ", ihk_mc_get_processor_id());
|
||||
va_start(va, format);
|
||||
len += vsnprintf(buf + len, KPRINTF_LOCAL_BUF_LEN - len - 2, format, va);
|
||||
va_end(va);
|
||||
@ -169,6 +175,17 @@ int kprintf(const char *format, ...)
|
||||
return len;
|
||||
}
|
||||
|
||||
/* mode:
|
||||
0: mcklogd is not running.
|
||||
When kmsg buffer is full, writer doesn't block
|
||||
and overwrites the buffer.
|
||||
1: mcklogd periodically retrieves kmsg.
|
||||
When kmsg buffer is full, writer blocks until
|
||||
someone retrieves kmsg.
|
||||
2: mcklogd periodically retrieves kmsg.
|
||||
When kmsg buffer is full, writer doesn't block
|
||||
and overwrites the buffer.
|
||||
*/
|
||||
void kmsg_init(int mode)
|
||||
{
|
||||
ihk_mc_spinlock_init(&kmsg_lock);
|
||||
|
||||
@ -78,54 +78,56 @@ static struct memobj *to_memobj(struct devobj *devobj)
|
||||
/***********************************************************************
|
||||
* devobj
|
||||
*/
|
||||
int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxprotp)
|
||||
int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxprotp,
|
||||
int prot, int populate_flags)
|
||||
{
|
||||
ihk_mc_user_context_t ctx;
|
||||
struct pager_map_result result; // XXX: assumes contiguous physical
|
||||
int error;
|
||||
struct devobj *obj = NULL;
|
||||
const size_t npages = (len + PAGE_SIZE - 1) / PAGE_SIZE;
|
||||
const size_t pfn_npages = (npages / (PAGE_SIZE / sizeof(uintptr_t))) + 1;
|
||||
|
||||
dkprintf("devobj_create(%d,%lx,%lx)\n", fd, len, off);
|
||||
#define MAX_PAGES_IN_DEVOBJ (PAGE_SIZE / sizeof(uintptr_t))
|
||||
if (npages > MAX_PAGES_IN_DEVOBJ) {
|
||||
error = -EFBIG;
|
||||
kprintf("devobj_create(%d,%lx,%lx):too large len. %d\n", fd, len, off, error);
|
||||
goto out;
|
||||
}
|
||||
dkprintf("%s: fd: %d, len: %lu, off: %lu \n", __FUNCTION__, fd, len, off);
|
||||
|
||||
obj = kmalloc(sizeof(*obj), IHK_MC_AP_NOWAIT);
|
||||
if (!obj) {
|
||||
error = -ENOMEM;
|
||||
kprintf("devobj_create(%d,%lx,%lx):kmalloc failed. %d\n", fd, len, off, error);
|
||||
kprintf("%s: error: fd: %d, len: %lu, off: %lu kmalloc failed.\n",
|
||||
__FUNCTION__, fd, len, off);
|
||||
goto out;
|
||||
}
|
||||
memset(obj, 0, sizeof(*obj));
|
||||
|
||||
obj->pfn_table = allocate_pages(1, IHK_MC_AP_NOWAIT);
|
||||
obj->pfn_table = ihk_mc_alloc_pages(pfn_npages, IHK_MC_AP_NOWAIT);
|
||||
if (!obj->pfn_table) {
|
||||
error = -ENOMEM;
|
||||
kprintf("devobj_create(%d,%lx,%lx):allocate_pages failed. %d\n", fd, len, off, error);
|
||||
kprintf("%s: error: fd: %d, len: %lu, off: %lu allocating PFN failed.\n",
|
||||
__FUNCTION__, fd, len, off);
|
||||
goto out;
|
||||
}
|
||||
memset(obj->pfn_table, 0, 1*PAGE_SIZE);
|
||||
memset(obj->pfn_table, 0, pfn_npages * PAGE_SIZE);
|
||||
|
||||
ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_MAP;
|
||||
ihk_mc_syscall_arg1(&ctx) = fd;
|
||||
ihk_mc_syscall_arg2(&ctx) = len;
|
||||
ihk_mc_syscall_arg3(&ctx) = off;
|
||||
ihk_mc_syscall_arg4(&ctx) = virt_to_phys(&result);
|
||||
ihk_mc_syscall_arg5(&ctx) = prot | populate_flags;
|
||||
|
||||
error = syscall_generic_forwarding(__NR_mmap, &ctx);
|
||||
if (error) {
|
||||
kprintf("devobj_create(%d,%lx,%lx):map failed. %d\n", fd, len, off, error);
|
||||
kprintf("%s: error: fd: %d, len: %lu, off: %lu map failed.\n",
|
||||
__FUNCTION__, fd, len, off);
|
||||
goto out;
|
||||
}
|
||||
dkprintf("devobj_create:handle: %lx\n", result.handle);
|
||||
dkprintf("devobj_create:maxprot: %x\n", result.maxprot);
|
||||
|
||||
dkprintf("%s: fd: %d, len: %lu, off: %lu, handle: %p, maxprot: %x\n",
|
||||
__FUNCTION__, fd, len, off, result.handle, result.maxprot);
|
||||
|
||||
obj->memobj.ops = &devobj_ops;
|
||||
obj->memobj.flags = MF_HAS_PAGER;
|
||||
obj->memobj.flags = MF_HAS_PAGER | MF_DEV_FILE;
|
||||
obj->memobj.size = len;
|
||||
obj->handle = result.handle;
|
||||
obj->ref = 1;
|
||||
obj->pfn_pgoff = off / PAGE_SIZE;
|
||||
@ -140,11 +142,12 @@ int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxp
|
||||
out:
|
||||
if (obj) {
|
||||
if (obj->pfn_table) {
|
||||
free_pages(obj->pfn_table, 1);
|
||||
ihk_mc_free_pages(obj->pfn_table, pfn_npages);
|
||||
}
|
||||
kfree(obj);
|
||||
}
|
||||
dkprintf("devobj_create(%d,%lx,%lx): %d %p %x%d\n", fd, len, off, error, *objp, *maxprotp);
|
||||
dkprintf("%s: ret: %d, fd: %d, len: %lu, off: %lu, handle: %p, maxprot: %x \n",
|
||||
__FUNCTION__, error, fd, len, off, result.handle, result.maxprot);
|
||||
return error;
|
||||
}
|
||||
|
||||
@ -164,6 +167,8 @@ static void devobj_release(struct memobj *memobj)
|
||||
struct devobj *obj = to_devobj(memobj);
|
||||
struct devobj *free_obj = NULL;
|
||||
uintptr_t handle;
|
||||
const size_t pfn_npages =
|
||||
(obj->npages / (PAGE_SIZE / sizeof(uintptr_t))) + 1;
|
||||
|
||||
dkprintf("devobj_release(%p %lx)\n", obj, obj->handle);
|
||||
|
||||
@ -176,23 +181,25 @@ static void devobj_release(struct memobj *memobj)
|
||||
memobj_unlock(&obj->memobj);
|
||||
|
||||
if (free_obj) {
|
||||
int error;
|
||||
ihk_mc_user_context_t ctx;
|
||||
if (!(free_obj->memobj.flags & MF_HOST_RELEASED)) {
|
||||
int error;
|
||||
ihk_mc_user_context_t ctx;
|
||||
|
||||
ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_UNMAP;
|
||||
ihk_mc_syscall_arg1(&ctx) = handle;
|
||||
ihk_mc_syscall_arg2(&ctx) = 1;
|
||||
ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_UNMAP;
|
||||
ihk_mc_syscall_arg1(&ctx) = handle;
|
||||
ihk_mc_syscall_arg2(&ctx) = 1;
|
||||
|
||||
error = syscall_generic_forwarding(__NR_mmap, &ctx);
|
||||
if (error) {
|
||||
kprintf("devobj_release(%p %lx):"
|
||||
"release failed. %d\n",
|
||||
free_obj, handle, error);
|
||||
/* through */
|
||||
error = syscall_generic_forwarding(__NR_mmap, &ctx);
|
||||
if (error) {
|
||||
kprintf("devobj_release(%p %lx):"
|
||||
"release failed. %d\n",
|
||||
free_obj, handle, error);
|
||||
/* through */
|
||||
}
|
||||
}
|
||||
|
||||
if (obj->pfn_table) {
|
||||
free_pages(obj->pfn_table, 1);
|
||||
ihk_mc_free_pages(obj->pfn_table, pfn_npages);
|
||||
}
|
||||
kfree(free_obj);
|
||||
}
|
||||
@ -204,7 +211,7 @@ static void devobj_release(struct memobj *memobj)
|
||||
|
||||
static int devobj_get_page(struct memobj *memobj, off_t off, int p2align, uintptr_t *physp, unsigned long *flag)
|
||||
{
|
||||
const off_t pgoff = off >> PAGE_SHIFT;
|
||||
const off_t pgoff = off / PAGE_SIZE;
|
||||
struct devobj *obj = to_devobj(memobj);
|
||||
int error;
|
||||
uintptr_t pfn;
|
||||
@ -216,7 +223,7 @@ static int devobj_get_page(struct memobj *memobj, off_t off, int p2align, uintpt
|
||||
|
||||
if ((pgoff < obj->pfn_pgoff) || ((obj->pfn_pgoff + obj->npages) <= pgoff)) {
|
||||
error = -EFBIG;
|
||||
kprintf("devobj_get_page(%p %lx,%lx,%d): out of range. %d\n", memobj, obj->handle, off, p2align, error);
|
||||
kprintf("%s: error: out of range: off: %lu, page off: %lu obj->npages: %d\n", __FUNCTION__, off, pgoff, obj->npages);
|
||||
goto out;
|
||||
}
|
||||
ix = pgoff - obj->pfn_pgoff;
|
||||
@ -224,6 +231,9 @@ static int devobj_get_page(struct memobj *memobj, off_t off, int p2align, uintpt
|
||||
|
||||
memobj_lock(&obj->memobj);
|
||||
pfn = obj->pfn_table[ix];
|
||||
#ifdef PROFILE_ENABLE
|
||||
profile_event_add(PROFILE_page_fault_dev_file, PAGE_SIZE);
|
||||
#endif // PROFILE_ENABLE
|
||||
if (!(pfn & PFN_VALID)) {
|
||||
memobj_unlock(&obj->memobj);
|
||||
|
||||
|
||||
496
kernel/fileobj.c
496
kernel/fileobj.c
@ -29,22 +29,26 @@
|
||||
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
|
||||
#define ekprintf(...) kprintf(__VA_ARGS__)
|
||||
|
||||
static ihk_spinlock_t fileobj_list_lock = SPIN_LOCK_UNLOCKED;
|
||||
mcs_rwlock_lock_t fileobj_list_lock;
|
||||
static LIST_HEAD(fileobj_list);
|
||||
|
||||
#define FILEOBJ_PAGE_HASH_SHIFT 9
|
||||
#define FILEOBJ_PAGE_HASH_SIZE (1 << FILEOBJ_PAGE_HASH_SHIFT)
|
||||
#define FILEOBJ_PAGE_HASH_MASK (FILEOBJ_PAGE_HASH_SIZE - 1)
|
||||
|
||||
struct fileobj {
|
||||
struct memobj memobj; /* must be first */
|
||||
long sref;
|
||||
long cref;
|
||||
uintptr_t handle;
|
||||
struct list_head page_list;
|
||||
struct list_head list;
|
||||
struct memobj memobj; /* must be first */
|
||||
long sref;
|
||||
long cref;
|
||||
uintptr_t handle;
|
||||
struct list_head list;
|
||||
struct list_head page_hash[FILEOBJ_PAGE_HASH_SIZE];
|
||||
mcs_rwlock_lock_t page_hash_locks[FILEOBJ_PAGE_HASH_SIZE];
|
||||
};
|
||||
|
||||
static memobj_release_func_t fileobj_release;
|
||||
static memobj_ref_func_t fileobj_ref;
|
||||
static memobj_get_page_func_t fileobj_get_page;
|
||||
static memobj_copy_page_func_t fileobj_copy_page;
|
||||
static memobj_flush_page_func_t fileobj_flush_page;
|
||||
static memobj_invalidate_page_func_t fileobj_invalidate_page;
|
||||
static memobj_lookup_page_func_t fileobj_lookup_page;
|
||||
@ -53,7 +57,7 @@ static struct memobj_ops fileobj_ops = {
|
||||
.release = &fileobj_release,
|
||||
.ref = &fileobj_ref,
|
||||
.get_page = &fileobj_get_page,
|
||||
.copy_page = &fileobj_copy_page,
|
||||
.copy_page = NULL,
|
||||
.flush_page = &fileobj_flush_page,
|
||||
.invalidate_page = &fileobj_invalidate_page,
|
||||
.lookup_page = &fileobj_lookup_page,
|
||||
@ -72,28 +76,36 @@ static struct memobj *to_memobj(struct fileobj *fileobj)
|
||||
/***********************************************************************
|
||||
* page_list
|
||||
*/
|
||||
static void page_list_init(struct fileobj *obj)
|
||||
static void fileobj_page_hash_init(struct fileobj *obj)
|
||||
{
|
||||
INIT_LIST_HEAD(&obj->page_list);
|
||||
int i;
|
||||
for (i = 0; i < FILEOBJ_PAGE_HASH_SIZE; ++i) {
|
||||
mcs_rwlock_init(&obj->page_hash_locks[i]);
|
||||
INIT_LIST_HEAD(&obj->page_hash[i]);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
static void page_list_insert(struct fileobj *obj, struct page *page)
|
||||
/* NOTE: caller must hold page_hash_locks[hash] */
|
||||
static void __fileobj_page_hash_insert(struct fileobj *obj,
|
||||
struct page *page, int hash)
|
||||
{
|
||||
list_add(&page->list, &obj->page_list);
|
||||
return;
|
||||
list_add(&page->list, &obj->page_hash[hash]);
|
||||
}
|
||||
|
||||
static void page_list_remove(struct fileobj *obj, struct page *page)
|
||||
/* NOTE: caller must hold page_hash_locks[hash] */
|
||||
static void __fileobj_page_hash_remove(struct page *page)
|
||||
{
|
||||
list_del(&page->list);
|
||||
}
|
||||
|
||||
static struct page *page_list_lookup(struct fileobj *obj, off_t off)
|
||||
/* NOTE: caller must hold page_hash_locks[hash] */
|
||||
static struct page *__fileobj_page_hash_lookup(struct fileobj *obj,
|
||||
int hash, off_t off)
|
||||
{
|
||||
struct page *page;
|
||||
|
||||
list_for_each_entry(page, &obj->page_list, list) {
|
||||
list_for_each_entry(page, &obj->page_hash[hash], list) {
|
||||
if ((page->mode != PM_WILL_PAGEIO)
|
||||
&& (page->mode != PM_PAGEIO)
|
||||
&& (page->mode != PM_DONE_PAGEIO)
|
||||
@ -104,6 +116,7 @@ static struct page *page_list_lookup(struct fileobj *obj, off_t off)
|
||||
obj, off, page->mode);
|
||||
panic("page_list_lookup:invalid obj page");
|
||||
}
|
||||
|
||||
if (page->offset == off) {
|
||||
goto out;
|
||||
}
|
||||
@ -114,13 +127,22 @@ out:
|
||||
return page;
|
||||
}
|
||||
|
||||
static struct page *page_list_first(struct fileobj *obj)
|
||||
static struct page *fileobj_page_hash_first(struct fileobj *obj)
|
||||
{
|
||||
if (list_empty(&obj->page_list)) {
|
||||
return NULL;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < FILEOBJ_PAGE_HASH_SIZE; ++i) {
|
||||
if (!list_empty(&obj->page_hash[i])) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return list_first_entry(&obj->page_list, struct page, list);
|
||||
if (i != FILEOBJ_PAGE_HASH_SIZE) {
|
||||
return list_first_entry(&obj->page_hash[i], struct page, list);
|
||||
}
|
||||
else {
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/***********************************************************************
|
||||
@ -163,10 +185,11 @@ static struct fileobj *obj_list_lookup(uintptr_t handle)
|
||||
int fileobj_create(int fd, struct memobj **objp, int *maxprotp)
|
||||
{
|
||||
ihk_mc_user_context_t ctx;
|
||||
struct pager_create_result result; // XXX: assumes contiguous physical
|
||||
struct pager_create_result result __attribute__((aligned(64)));
|
||||
int error;
|
||||
struct fileobj *newobj = NULL;
|
||||
struct fileobj *obj;
|
||||
struct mcs_rwlock_node node;
|
||||
|
||||
dkprintf("fileobj_create(%d)\n", fd);
|
||||
newobj = kmalloc(sizeof(*newobj), IHK_MC_AP_NOWAIT);
|
||||
@ -179,36 +202,101 @@ int fileobj_create(int fd, struct memobj **objp, int *maxprotp)
|
||||
ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_CREATE;
|
||||
ihk_mc_syscall_arg1(&ctx) = fd;
|
||||
ihk_mc_syscall_arg2(&ctx) = virt_to_phys(&result);
|
||||
memset(&result, 0, sizeof(result));
|
||||
|
||||
error = syscall_generic_forwarding(__NR_mmap, &ctx);
|
||||
if (error) {
|
||||
kprintf("fileobj_create(%d):create failed. %d\n", fd, error);
|
||||
dkprintf("fileobj_create(%d):create failed. %d\n", fd, error);
|
||||
goto out;
|
||||
}
|
||||
|
||||
memset(newobj, 0, sizeof(*newobj));
|
||||
newobj->memobj.ops = &fileobj_ops;
|
||||
newobj->memobj.flags = MF_HAS_PAGER;
|
||||
newobj->memobj.flags = MF_HAS_PAGER | MF_REG_FILE;
|
||||
newobj->handle = result.handle;
|
||||
newobj->sref = 1;
|
||||
newobj->cref = 1;
|
||||
page_list_init(newobj);
|
||||
fileobj_page_hash_init(newobj);
|
||||
ihk_mc_spinlock_init(&newobj->memobj.lock);
|
||||
|
||||
ihk_mc_spinlock_lock_noirq(&fileobj_list_lock);
|
||||
mcs_rwlock_writer_lock_noirq(&fileobj_list_lock, &node);
|
||||
obj = obj_list_lookup(result.handle);
|
||||
if (!obj) {
|
||||
obj_list_insert(newobj);
|
||||
obj = newobj;
|
||||
to_memobj(obj)->size = result.size;
|
||||
to_memobj(obj)->flags |= result.flags;
|
||||
to_memobj(obj)->status = MEMOBJ_READY;
|
||||
if (to_memobj(obj)->flags & MF_PREFETCH) {
|
||||
to_memobj(obj)->status = MEMOBJ_TO_BE_PREFETCHED;
|
||||
}
|
||||
|
||||
/* XXX: KNL specific optimization for OFP runs */
|
||||
if ((to_memobj(obj)->flags & MF_PREMAP) &&
|
||||
(to_memobj(obj)->flags & MF_ZEROFILL)) {
|
||||
struct memobj *mo = to_memobj(obj);
|
||||
int nr_pages = (result.size + (PAGE_SIZE - 1))
|
||||
>> PAGE_SHIFT;
|
||||
int j = 0;
|
||||
int node = ihk_mc_get_nr_numa_nodes() / 2;
|
||||
dkprintf("%s: MF_PREMAP, start node: %d\n",
|
||||
__FUNCTION__, node);
|
||||
|
||||
mo->pages = kmalloc(nr_pages * sizeof(void *), IHK_MC_AP_NOWAIT);
|
||||
if (!mo->pages) {
|
||||
kprintf("%s: WARNING: failed to allocate pages\n",
|
||||
__FUNCTION__);
|
||||
goto error_cleanup;
|
||||
}
|
||||
|
||||
mo->nr_pages = nr_pages;
|
||||
memset(mo->pages, 0, nr_pages * sizeof(*mo->pages));
|
||||
|
||||
if (cpu_local_var(current)->proc->mpol_flags & MPOL_SHM_PREMAP) {
|
||||
/* Get the actual pages NUMA interleaved */
|
||||
for (j = 0; j < nr_pages; ++j) {
|
||||
mo->pages[j] = ihk_mc_alloc_aligned_pages_node_user(1,
|
||||
PAGE_P2ALIGN, IHK_MC_AP_NOWAIT, node);
|
||||
if (!mo->pages[j]) {
|
||||
kprintf("%s: ERROR: allocating pages[%d]\n",
|
||||
__FUNCTION__, j);
|
||||
goto error_cleanup;
|
||||
}
|
||||
|
||||
memset(mo->pages[j], 0, PAGE_SIZE);
|
||||
|
||||
++node;
|
||||
if (node == ihk_mc_get_nr_numa_nodes()) {
|
||||
node = ihk_mc_get_nr_numa_nodes() / 2;
|
||||
}
|
||||
}
|
||||
dkprintf("%s: allocated %d pages interleaved\n",
|
||||
__FUNCTION__, nr_pages);
|
||||
}
|
||||
error_cleanup:
|
||||
/* TODO: cleanup allocated portion */
|
||||
;
|
||||
}
|
||||
|
||||
newobj = NULL;
|
||||
dkprintf("%s: new obj 0x%lx cref: %d, %s\n",
|
||||
__FUNCTION__,
|
||||
obj,
|
||||
obj->cref,
|
||||
to_memobj(obj)->flags & MF_ZEROFILL ? "zerofill" : "");
|
||||
}
|
||||
else {
|
||||
++obj->sref;
|
||||
++obj->cref;
|
||||
memobj_unlock(&obj->memobj); /* locked by obj_list_lookup() */
|
||||
dkprintf("%s: existing obj 0x%lx cref: %d, %s\n",
|
||||
__FUNCTION__,
|
||||
obj,
|
||||
obj->cref,
|
||||
to_memobj(obj)->flags & MF_ZEROFILL ? "zerofill" : "");
|
||||
}
|
||||
|
||||
ihk_mc_spinlock_unlock_noirq(&fileobj_list_lock);
|
||||
mcs_rwlock_writer_unlock_noirq(&fileobj_list_lock, &node);
|
||||
|
||||
error = 0;
|
||||
*objp = to_memobj(obj);
|
||||
@ -239,6 +327,7 @@ static void fileobj_release(struct memobj *memobj)
|
||||
long free_sref = 0;
|
||||
uintptr_t free_handle;
|
||||
struct fileobj *free_obj = NULL;
|
||||
struct mcs_rwlock_node node;
|
||||
|
||||
dkprintf("fileobj_release(%p %lx)\n", obj, obj->handle);
|
||||
|
||||
@ -252,19 +341,41 @@ static void fileobj_release(struct memobj *memobj)
|
||||
obj->sref -= free_sref;
|
||||
free_handle = obj->handle;
|
||||
memobj_unlock(&obj->memobj);
|
||||
if (obj->memobj.flags & MF_HOST_RELEASED) {
|
||||
free_sref = 0; // don't call syscall_generic_forwarding
|
||||
}
|
||||
|
||||
if (free_obj) {
|
||||
ihk_mc_spinlock_lock_noirq(&fileobj_list_lock);
|
||||
dkprintf("%s: release obj 0x%lx cref: %d, free_obj: 0x%lx, %s\n",
|
||||
__FUNCTION__,
|
||||
obj,
|
||||
obj->cref,
|
||||
free_obj,
|
||||
to_memobj(obj)->flags & MF_ZEROFILL ? "zerofill" : "");
|
||||
mcs_rwlock_writer_lock_noirq(&fileobj_list_lock, &node);
|
||||
/* zap page_list */
|
||||
for (;;) {
|
||||
struct page *page;
|
||||
int count;
|
||||
void *page_va;
|
||||
|
||||
page = page_list_first(obj);
|
||||
page = fileobj_page_hash_first(obj);
|
||||
if (!page) {
|
||||
break;
|
||||
}
|
||||
page_list_remove(obj, page);
|
||||
__fileobj_page_hash_remove(page);
|
||||
page_va = phys_to_virt(page_to_phys(page));
|
||||
|
||||
if (ihk_atomic_read(&page->count) != 1) {
|
||||
kprintf("%s: WARNING: page count %d for phys 0x%lx is invalid, flags: 0x%lx\n",
|
||||
__FUNCTION__,
|
||||
ihk_atomic_read(&page->count),
|
||||
page->phys,
|
||||
to_memobj(free_obj)->flags);
|
||||
}
|
||||
else if (page_unmap(page)) {
|
||||
ihk_mc_free_pages_user(page_va, 1);
|
||||
}
|
||||
#if 0
|
||||
count = ihk_atomic_sub_return(1, &page->count);
|
||||
|
||||
if (!((page->mode == PM_WILL_PAGEIO)
|
||||
@ -281,10 +392,23 @@ static void fileobj_release(struct memobj *memobj)
|
||||
}
|
||||
|
||||
page->mode = PM_NONE;
|
||||
free_pages(phys_to_virt(page_to_phys(page)), 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Pre-mapped? */
|
||||
if (to_memobj(free_obj)->flags & MF_PREMAP) {
|
||||
int i;
|
||||
|
||||
for (i = 0; i < to_memobj(free_obj)->nr_pages; ++i) {
|
||||
if (to_memobj(free_obj)->pages[i])
|
||||
ihk_mc_free_pages_user(to_memobj(free_obj)->pages[i], 1);
|
||||
}
|
||||
|
||||
kfree(to_memobj(free_obj)->pages);
|
||||
}
|
||||
|
||||
obj_list_remove(free_obj);
|
||||
ihk_mc_spinlock_unlock_noirq(&fileobj_list_lock);
|
||||
mcs_rwlock_writer_unlock_noirq(&fileobj_list_lock, &node);
|
||||
kfree(free_obj);
|
||||
}
|
||||
|
||||
@ -330,83 +454,144 @@ static void fileobj_do_pageio(void *args0)
|
||||
struct page *page;
|
||||
ihk_mc_user_context_t ctx;
|
||||
ssize_t ss;
|
||||
struct mcs_rwlock_node mcs_node;
|
||||
int hash = (off >> PAGE_SHIFT) & FILEOBJ_PAGE_HASH_MASK;
|
||||
|
||||
memobj_lock(&obj->memobj);
|
||||
page = page_list_lookup(obj, off);
|
||||
mcs_rwlock_writer_lock_noirq(&obj->page_hash_locks[hash],
|
||||
&mcs_node);
|
||||
page = __fileobj_page_hash_lookup(obj, hash, off);
|
||||
if (!page) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
while (page->mode == PM_PAGEIO) {
|
||||
memobj_unlock(&obj->memobj);
|
||||
mcs_rwlock_writer_unlock_noirq(&obj->page_hash_locks[hash],
|
||||
&mcs_node);
|
||||
cpu_pause();
|
||||
memobj_lock(&obj->memobj);
|
||||
mcs_rwlock_writer_lock_noirq(&obj->page_hash_locks[hash],
|
||||
&mcs_node);
|
||||
}
|
||||
|
||||
if (page->mode == PM_WILL_PAGEIO) {
|
||||
page->mode = PM_PAGEIO;
|
||||
memobj_unlock(&obj->memobj);
|
||||
|
||||
ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_READ;
|
||||
ihk_mc_syscall_arg1(&ctx) = obj->handle;
|
||||
ihk_mc_syscall_arg2(&ctx) = off;
|
||||
ihk_mc_syscall_arg3(&ctx) = pgsize;
|
||||
ihk_mc_syscall_arg4(&ctx) = page_to_phys(page);
|
||||
|
||||
ss = syscall_generic_forwarding(__NR_mmap, &ctx);
|
||||
|
||||
memobj_lock(&obj->memobj);
|
||||
if (page->mode != PM_PAGEIO) {
|
||||
kprintf("fileobj_do_pageio(%p,%lx,%lx):"
|
||||
"invalid mode %x\n",
|
||||
obj, off, pgsize, page->mode);
|
||||
panic("fileobj_do_pageio:invalid page mode");
|
||||
if (to_memobj(obj)->flags & MF_ZEROFILL) {
|
||||
void *virt = phys_to_virt(page_to_phys(page));
|
||||
memset(virt, 0, PAGE_SIZE);
|
||||
#ifdef PROFILE_ENABLE
|
||||
profile_event_add(PROFILE_page_fault_file_clr, PAGE_SIZE);
|
||||
#endif // PROFILE_ENABLE
|
||||
}
|
||||
else {
|
||||
page->mode = PM_PAGEIO;
|
||||
mcs_rwlock_writer_unlock_noirq(&obj->page_hash_locks[hash],
|
||||
&mcs_node);
|
||||
|
||||
if (ss == 0) {
|
||||
dkprintf("fileobj_do_pageio(%p,%lx,%lx):EOF? %ld\n",
|
||||
obj, off, pgsize, ss);
|
||||
page->mode = PM_PAGEIO_EOF;
|
||||
goto out;
|
||||
}
|
||||
else if (ss != pgsize) {
|
||||
kprintf("fileobj_do_pageio(%p,%lx,%lx):"
|
||||
"read failed. %ld\n",
|
||||
obj, off, pgsize, ss);
|
||||
page->mode = PM_PAGEIO_ERROR;
|
||||
goto out;
|
||||
ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_READ;
|
||||
ihk_mc_syscall_arg1(&ctx) = obj->handle;
|
||||
ihk_mc_syscall_arg2(&ctx) = off;
|
||||
ihk_mc_syscall_arg3(&ctx) = pgsize;
|
||||
ihk_mc_syscall_arg4(&ctx) = page_to_phys(page);
|
||||
|
||||
dkprintf("%s: __NR_mmap for handle 0x%lx\n",
|
||||
__FUNCTION__, obj->handle);
|
||||
ss = syscall_generic_forwarding(__NR_mmap, &ctx);
|
||||
|
||||
mcs_rwlock_writer_lock_noirq(&obj->page_hash_locks[hash],
|
||||
&mcs_node);
|
||||
if (page->mode != PM_PAGEIO) {
|
||||
kprintf("fileobj_do_pageio(%p,%lx,%lx):"
|
||||
"invalid mode %x\n",
|
||||
obj, off, pgsize, page->mode);
|
||||
panic("fileobj_do_pageio:invalid page mode");
|
||||
}
|
||||
|
||||
if (ss == 0) {
|
||||
dkprintf("fileobj_do_pageio(%p,%lx,%lx):EOF? %ld\n",
|
||||
obj, off, pgsize, ss);
|
||||
page->mode = PM_PAGEIO_EOF;
|
||||
goto out;
|
||||
}
|
||||
else if (ss != pgsize) {
|
||||
kprintf("fileobj_do_pageio(%p,%lx,%lx):"
|
||||
"read failed. %ld\n",
|
||||
obj, off, pgsize, ss);
|
||||
page->mode = PM_PAGEIO_ERROR;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
page->mode = PM_DONE_PAGEIO;
|
||||
}
|
||||
out:
|
||||
memobj_unlock(&obj->memobj);
|
||||
mcs_rwlock_writer_unlock_noirq(&obj->page_hash_locks[hash],
|
||||
&mcs_node);
|
||||
fileobj_release(&obj->memobj); /* got fileobj_get_page() */
|
||||
kfree(args0);
|
||||
dkprintf("fileobj_do_pageio(%p,%lx,%lx):\n", obj, off, pgsize);
|
||||
return;
|
||||
}
|
||||
|
||||
static int fileobj_get_page(struct memobj *memobj, off_t off, int p2align, uintptr_t *physp, unsigned long *pflag)
|
||||
static int fileobj_get_page(struct memobj *memobj, off_t off,
|
||||
int p2align, uintptr_t *physp, unsigned long *pflag)
|
||||
{
|
||||
struct thread *proc = cpu_local_var(current);
|
||||
struct fileobj *obj = to_fileobj(memobj);
|
||||
int error;
|
||||
int error = -1;
|
||||
void *virt = NULL;
|
||||
int npages;
|
||||
uintptr_t phys = -1;
|
||||
struct page *page;
|
||||
struct pageio_args *args = NULL;
|
||||
struct mcs_rwlock_node mcs_node;
|
||||
int hash = (off >> PAGE_SHIFT) & FILEOBJ_PAGE_HASH_MASK;
|
||||
|
||||
dkprintf("fileobj_get_page(%p,%lx,%x,%p)\n", obj, off, p2align, physp);
|
||||
|
||||
memobj_lock(&obj->memobj);
|
||||
if (p2align != PAGE_P2ALIGN) {
|
||||
error = -ENOMEM;
|
||||
goto out;
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
page = page_list_lookup(obj, off);
|
||||
#ifdef PROFILE_ENABLE
|
||||
profile_event_add(PROFILE_page_fault_file, PAGE_SIZE);
|
||||
#endif // PROFILE_ENABLE
|
||||
|
||||
if (memobj->flags & MF_PREMAP) {
|
||||
int page_ind = off >> PAGE_SHIFT;
|
||||
|
||||
if (!memobj->pages[page_ind]) {
|
||||
virt = ihk_mc_alloc_pages_user(1, IHK_MC_AP_NOWAIT | IHK_MC_AP_USER);
|
||||
|
||||
if (!virt) {
|
||||
error = -ENOMEM;
|
||||
kprintf("fileobj_get_page(%p,%lx,%x,%p):"
|
||||
"alloc failed. %d\n",
|
||||
obj, off, p2align, physp,
|
||||
error);
|
||||
goto out_nolock;
|
||||
}
|
||||
|
||||
/* Update the array but see if someone did it already and use
|
||||
* that if so */
|
||||
if (!__sync_bool_compare_and_swap(&memobj->pages[page_ind],
|
||||
NULL, virt)) {
|
||||
ihk_mc_free_pages_user(virt, 1);
|
||||
}
|
||||
else {
|
||||
dkprintf("%s: MF_ZEROFILL: off: %lu -> 0x%lx allocated\n",
|
||||
__FUNCTION__, off, virt_to_phys(virt));
|
||||
}
|
||||
}
|
||||
|
||||
virt = memobj->pages[page_ind];
|
||||
error = 0;
|
||||
*physp = virt_to_phys(virt);
|
||||
dkprintf("%s: MF_ZEROFILL: off: %lu -> 0x%lx resolved\n",
|
||||
__FUNCTION__, off, virt_to_phys(virt));
|
||||
virt = NULL;
|
||||
goto out_nolock;
|
||||
}
|
||||
|
||||
mcs_rwlock_writer_lock_noirq(&obj->page_hash_locks[hash],
|
||||
&mcs_node);
|
||||
page = __fileobj_page_hash_lookup(obj, hash, off);
|
||||
if (!page || (page->mode == PM_WILL_PAGEIO)
|
||||
|| (page->mode == PM_PAGEIO)) {
|
||||
args = kmalloc(sizeof(*args), IHK_MC_AP_NOWAIT);
|
||||
@ -420,7 +605,10 @@ static int fileobj_get_page(struct memobj *memobj, off_t off, int p2align, uintp
|
||||
|
||||
if (!page) {
|
||||
npages = 1 << p2align;
|
||||
virt = ihk_mc_alloc_pages(npages, IHK_MC_AP_NOWAIT);
|
||||
|
||||
virt = ihk_mc_alloc_pages_user(npages, IHK_MC_AP_NOWAIT |
|
||||
(to_memobj(obj)->flags & MF_ZEROFILL) ? IHK_MC_AP_USER : 0);
|
||||
|
||||
if (!virt) {
|
||||
error = -ENOMEM;
|
||||
kprintf("fileobj_get_page(%p,%lx,%x,%p):"
|
||||
@ -430,17 +618,19 @@ static int fileobj_get_page(struct memobj *memobj, off_t off, int p2align, uintp
|
||||
goto out;
|
||||
}
|
||||
phys = virt_to_phys(virt);
|
||||
page = phys_to_page(phys);
|
||||
page = phys_to_page_insert_hash(phys);
|
||||
if (page->mode != PM_NONE) {
|
||||
panic("fileobj_get_page:invalid new page");
|
||||
}
|
||||
page->mode = PM_WILL_PAGEIO;
|
||||
page->offset = off;
|
||||
ihk_atomic_set(&page->count, 1);
|
||||
page_list_insert(obj, page);
|
||||
__fileobj_page_hash_insert(obj, page, hash);
|
||||
page->mode = PM_WILL_PAGEIO;
|
||||
}
|
||||
|
||||
memobj_lock(&obj->memobj);
|
||||
++obj->cref; /* for fileobj_do_pageio() */
|
||||
memobj_unlock(&obj->memobj);
|
||||
|
||||
args->fileobj = obj;
|
||||
args->objoff = off;
|
||||
@ -472,9 +662,11 @@ static int fileobj_get_page(struct memobj *memobj, off_t off, int p2align, uintp
|
||||
*physp = page_to_phys(page);
|
||||
virt = NULL;
|
||||
out:
|
||||
memobj_unlock(&obj->memobj);
|
||||
mcs_rwlock_writer_unlock_noirq(&obj->page_hash_locks[hash],
|
||||
&mcs_node);
|
||||
out_nolock:
|
||||
if (virt) {
|
||||
ihk_mc_free_pages(virt, npages);
|
||||
ihk_mc_free_pages_user(virt, npages);
|
||||
}
|
||||
if (args) {
|
||||
kfree(args);
|
||||
@ -484,76 +676,6 @@ out:
|
||||
return error;
|
||||
}
|
||||
|
||||
static uintptr_t fileobj_copy_page(
|
||||
struct memobj *memobj, uintptr_t orgpa, int p2align)
|
||||
{
|
||||
struct page *orgpage = phys_to_page(orgpa);
|
||||
size_t pgsize = PAGE_SIZE << p2align;
|
||||
int npages = 1 << p2align;
|
||||
void *newkva = NULL;
|
||||
uintptr_t newpa = -1;
|
||||
void *orgkva;
|
||||
int count;
|
||||
|
||||
dkprintf("fileobj_copy_page(%p,%lx,%d)\n", memobj, orgpa, p2align);
|
||||
if (p2align != PAGE_P2ALIGN) {
|
||||
panic("p2align");
|
||||
}
|
||||
|
||||
memobj_lock(memobj);
|
||||
for (;;) {
|
||||
if (orgpage->mode != PM_MAPPED) {
|
||||
kprintf("fileobj_copy_page(%p,%lx,%d):"
|
||||
"invalid cow page. %x\n",
|
||||
memobj, orgpa, p2align, orgpage->mode);
|
||||
panic("fileobj_copy_page:invalid cow page");
|
||||
}
|
||||
count = ihk_atomic_read(&orgpage->count);
|
||||
if (count == 2) { // XXX: private only
|
||||
list_del(&orgpage->list);
|
||||
ihk_atomic_dec(&orgpage->count);
|
||||
orgpage->mode = PM_NONE;
|
||||
newpa = orgpa;
|
||||
break;
|
||||
}
|
||||
if (count <= 0) {
|
||||
kprintf("fileobj_copy_page(%p,%lx,%d):"
|
||||
"orgpage count corrupted. %x\n",
|
||||
memobj, orgpa, p2align, count);
|
||||
panic("fileobj_copy_page:orgpage count corrupted");
|
||||
}
|
||||
if (newkva) {
|
||||
orgkva = phys_to_virt(orgpa);
|
||||
memcpy(newkva, orgkva, pgsize);
|
||||
ihk_atomic_dec(&orgpage->count);
|
||||
newpa = virt_to_phys(newkva);
|
||||
page_map(phys_to_page(newpa));
|
||||
newkva = NULL; /* avoid ihk_mc_free_pages() */
|
||||
break;
|
||||
}
|
||||
|
||||
memobj_unlock(memobj);
|
||||
newkva = ihk_mc_alloc_aligned_pages(npages, p2align,
|
||||
IHK_MC_AP_NOWAIT);
|
||||
if (!newkva) {
|
||||
kprintf("fileobj_copy_page(%p,%lx,%d):"
|
||||
"alloc page failed\n",
|
||||
memobj, orgpa, p2align);
|
||||
goto out;
|
||||
}
|
||||
memobj_lock(memobj);
|
||||
}
|
||||
memobj_unlock(memobj);
|
||||
|
||||
out:
|
||||
if (newkva) {
|
||||
ihk_mc_free_pages(newkva, npages);
|
||||
}
|
||||
dkprintf("fileobj_copy_page(%p,%lx,%d): %lx\n",
|
||||
memobj, orgpa, p2align, newpa);
|
||||
return newpa;
|
||||
}
|
||||
|
||||
static int fileobj_flush_page(struct memobj *memobj, uintptr_t phys,
|
||||
size_t pgsize)
|
||||
{
|
||||
@ -562,7 +684,20 @@ static int fileobj_flush_page(struct memobj *memobj, uintptr_t phys,
|
||||
ihk_mc_user_context_t ctx;
|
||||
ssize_t ss;
|
||||
|
||||
if (to_memobj(obj)->flags & MF_ZEROFILL) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (memobj->flags |= MF_HOST_RELEASED) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
page = phys_to_page(phys);
|
||||
if (!page) {
|
||||
kprintf("%s: warning: tried to flush non-existing page for phys addr: 0x%lx\n",
|
||||
__FUNCTION__, phys);
|
||||
return 0;
|
||||
}
|
||||
memobj_unlock(&obj->memobj);
|
||||
|
||||
ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_WRITE;
|
||||
@ -585,63 +720,48 @@ static int fileobj_flush_page(struct memobj *memobj, uintptr_t phys,
|
||||
static int fileobj_invalidate_page(struct memobj *memobj, uintptr_t phys,
|
||||
size_t pgsize)
|
||||
{
|
||||
struct fileobj *obj = to_fileobj(memobj);
|
||||
int error;
|
||||
struct page *page;
|
||||
|
||||
dkprintf("fileobj_invalidate_page(%p,%#lx,%#lx)\n",
|
||||
memobj, phys, pgsize);
|
||||
|
||||
if (!(page = phys_to_page(phys))
|
||||
|| !(page = page_list_lookup(obj, page->offset))) {
|
||||
error = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (ihk_atomic_read(&page->count) == 1) {
|
||||
if (page_unmap(page)) {
|
||||
ihk_mc_free_pages(phys_to_virt(phys),
|
||||
pgsize/PAGE_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
error = 0;
|
||||
out:
|
||||
dkprintf("fileobj_invalidate_page(%p,%#lx,%#lx):%d\n",
|
||||
memobj, phys, pgsize, error);
|
||||
return error;
|
||||
/* TODO: keep track of reverse mappings so that invalidation
|
||||
* can be performed */
|
||||
kprintf("%s: WARNING: file mapping invalidation not supported\n",
|
||||
__FUNCTION__);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int fileobj_lookup_page(struct memobj *memobj, off_t off, int p2align, uintptr_t *physp, unsigned long *pflag)
|
||||
static int fileobj_lookup_page(struct memobj *memobj, off_t off,
|
||||
int p2align, uintptr_t *physp, unsigned long *pflag)
|
||||
{
|
||||
struct fileobj *obj = to_fileobj(memobj);
|
||||
int error;
|
||||
uintptr_t phys = -1;
|
||||
int error = -1;
|
||||
struct page *page;
|
||||
struct mcs_rwlock_node mcs_node;
|
||||
int hash = (off >> PAGE_SHIFT) & FILEOBJ_PAGE_HASH_MASK;
|
||||
|
||||
dkprintf("fileobj_lookup_page(%p,%lx,%x,%p)\n", obj, off, p2align, physp);
|
||||
|
||||
memobj_lock(&obj->memobj);
|
||||
if (p2align != PAGE_P2ALIGN) {
|
||||
error = -ENOMEM;
|
||||
goto out;
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
page = page_list_lookup(obj, off);
|
||||
mcs_rwlock_reader_lock_noirq(&obj->page_hash_locks[hash],
|
||||
&mcs_node);
|
||||
|
||||
page = __fileobj_page_hash_lookup(obj, hash, off);
|
||||
if (!page) {
|
||||
error = -ENOENT;
|
||||
dkprintf("fileobj_lookup_page(%p,%lx,%x,%p): page not found. %d\n", obj, off, p2align, physp, error);
|
||||
goto out;
|
||||
}
|
||||
phys = page_to_phys(page);
|
||||
|
||||
*physp = page_to_phys(page);
|
||||
error = 0;
|
||||
if (physp) {
|
||||
*physp = phys;
|
||||
}
|
||||
|
||||
out:
|
||||
memobj_unlock(&obj->memobj);
|
||||
dkprintf("fileobj_lookup_page(%p,%lx,%x,%p): %d %lx\n",
|
||||
obj, off, p2align, physp, error, phys);
|
||||
mcs_rwlock_reader_unlock_noirq(&obj->page_hash_locks[hash],
|
||||
&mcs_node);
|
||||
|
||||
dkprintf("fileobj_lookup_page(%p,%lx,%x,%p): %d \n",
|
||||
obj, off, p2align, physp, error);
|
||||
return error;
|
||||
}
|
||||
|
||||
|
||||
55
kernel/freeze.c
Normal file
55
kernel/freeze.c
Normal file
@ -0,0 +1,55 @@
|
||||
#include <kmsg.h>
|
||||
#include <string.h>
|
||||
#include <ihk/cpu.h>
|
||||
#include <ihk/debug.h>
|
||||
#include <cls.h>
|
||||
#include <rusage.h>
|
||||
|
||||
extern int nmi_mode;
|
||||
extern void mod_nmi_ctx(void *, void(*)());
|
||||
extern void lapic_ack();
|
||||
extern void __freeze();
|
||||
|
||||
void
|
||||
freeze()
|
||||
{
|
||||
struct ihk_os_cpu_monitor *monitor = cpu_local_var(monitor);
|
||||
|
||||
monitor->status_bak = monitor->status;
|
||||
monitor->status = IHK_OS_MONITOR_KERNEL_FROZEN;
|
||||
while (monitor->status == IHK_OS_MONITOR_KERNEL_FROZEN)
|
||||
cpu_halt();
|
||||
monitor->status = monitor->status_bak;
|
||||
}
|
||||
|
||||
long
|
||||
freeze_thaw(void *nmi_ctx)
|
||||
{
|
||||
struct ihk_os_cpu_monitor *monitor = cpu_local_var(monitor);
|
||||
|
||||
if (nmi_mode == 1) {
|
||||
if (monitor->status != IHK_OS_MONITOR_KERNEL_FROZEN) {
|
||||
#if 1
|
||||
mod_nmi_ctx(nmi_ctx, __freeze);
|
||||
return 1;
|
||||
#else
|
||||
unsigned long flags;
|
||||
|
||||
flags = cpu_disable_interrupt_save();
|
||||
monitor->status_bak = monitor->status;
|
||||
monitor->status = IHK_OS_MONITOR_KERNEL_FROZEN;
|
||||
lapic_ack();
|
||||
while (monitor->status == IHK_OS_MONITOR_KERNEL_FROZEN)
|
||||
cpu_halt();
|
||||
monitor->status = monitor->status_bak;
|
||||
cpu_restore_interrupt(flags);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
else if(nmi_mode == 2) {
|
||||
if (monitor->status == IHK_OS_MONITOR_KERNEL_FROZEN) {
|
||||
monitor->status = IHK_OS_MONITOR_KERNEL_THAW;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@ -79,8 +79,6 @@
|
||||
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
|
||||
#endif
|
||||
|
||||
extern struct sigpending *hassigpending(struct thread *thread);
|
||||
|
||||
int futex_cmpxchg_enabled;
|
||||
|
||||
/**
|
||||
@ -250,9 +248,13 @@ static int cmpxchg_futex_value_locked(uint32_t __user *uaddr, uint32_t uval, uin
|
||||
|
||||
static int get_futex_value_locked(uint32_t *dest, uint32_t *from)
|
||||
{
|
||||
/* RIKEN: futexes are always on non-swappable pages */
|
||||
*dest = getint_user((int *)from);
|
||||
|
||||
/*
|
||||
* Officially we should call:
|
||||
* return getint_user((int *)dest, (int *)from);
|
||||
*
|
||||
* but McKernel on x86 can just access user-space.
|
||||
*/
|
||||
*dest = *(volatile uint32_t *)from;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -672,25 +674,32 @@ static uint64_t futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q
|
||||
uint64_t timeout)
|
||||
{
|
||||
uint64_t time_remain = 0;
|
||||
unsigned long irqstate;
|
||||
struct thread *thread = cpu_local_var(current);
|
||||
/*
|
||||
* The task state is guaranteed to be set before another task can
|
||||
* wake it. set_current_state() is implemented using set_mb() and
|
||||
* queue_me() calls spin_unlock() upon completion, both serializing
|
||||
* access to the hash list and forcing another memory barrier.
|
||||
* wake it.
|
||||
* queue_me() calls spin_unlock() upon completion, serializing
|
||||
* access to the hash list and forcing a memory barrier.
|
||||
*/
|
||||
xchg4(&(cpu_local_var(current)->status), PS_INTERRUPTIBLE);
|
||||
|
||||
/* Indicate spin sleep */
|
||||
irqstate = ihk_mc_spinlock_lock(&thread->spin_sleep_lock);
|
||||
thread->spin_sleep = 1;
|
||||
ihk_mc_spinlock_unlock(&thread->spin_sleep_lock, irqstate);
|
||||
|
||||
queue_me(q, hb);
|
||||
|
||||
if (!plist_node_empty(&q->list)) {
|
||||
|
||||
/* RIKEN: use mcos timers */
|
||||
if (timeout) {
|
||||
dkprintf("futex_wait_queue_me(): tid: %d schedule_timeout()\n", cpu_local_var(current)->tid);
|
||||
time_remain = schedule_timeout(timeout);
|
||||
}
|
||||
else {
|
||||
dkprintf("futex_wait_queue_me(): tid: %d schedule()\n", cpu_local_var(current)->tid);
|
||||
schedule();
|
||||
spin_sleep_or_schedule();
|
||||
time_remain = 0;
|
||||
}
|
||||
|
||||
@ -699,6 +708,7 @@ static uint64_t futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q
|
||||
|
||||
/* This does not need to be serialized */
|
||||
cpu_local_var(current)->status = PS_RUNNING;
|
||||
thread->spin_sleep = 0;
|
||||
|
||||
return time_remain;
|
||||
}
|
||||
@ -745,14 +755,17 @@ static int futex_wait_setup(uint32_t __user *uaddr, uint32_t val, int fshared,
|
||||
*/
|
||||
q->key = FUTEX_KEY_INIT;
|
||||
ret = get_futex_key(uaddr, fshared, &q->key);
|
||||
if ((ret != 0))
|
||||
if (ret != 0)
|
||||
return ret;
|
||||
|
||||
*hb = queue_lock(q);
|
||||
|
||||
ret = get_futex_value_locked(&uval, uaddr);
|
||||
|
||||
/* RIKEN: get_futex_value_locked() always returns 0 on mckernel */
|
||||
if (ret) {
|
||||
queue_unlock(q, *hb);
|
||||
put_futex_key(fshared, &q->key);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (uval != val) {
|
||||
queue_unlock(q, *hb);
|
||||
@ -775,11 +788,18 @@ static int futex_wait(uint32_t __user *uaddr, int fshared,
|
||||
if (!bitset)
|
||||
return -EINVAL;
|
||||
|
||||
#ifdef PROFILE_ENABLE
|
||||
if (cpu_local_var(current)->profile &&
|
||||
cpu_local_var(current)->profile_start_ts) {
|
||||
cpu_local_var(current)->profile_elapsed_ts +=
|
||||
(rdtsc() - cpu_local_var(current)->profile_start_ts);
|
||||
cpu_local_var(current)->profile_start_ts = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
q.bitset = bitset;
|
||||
q.requeue_pi_key = NULL;
|
||||
|
||||
/* RIKEN: futex_wait_queue_me() calls schedule_timeout() if timer is set */
|
||||
|
||||
retry:
|
||||
/* Prepare to wait on uaddr. */
|
||||
ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
|
||||
@ -811,6 +831,11 @@ retry:
|
||||
out_put_key:
|
||||
put_futex_key(fshared, &q.key);
|
||||
out:
|
||||
#ifdef PROFILE_ENABLE
|
||||
if (cpu_local_var(current)->profile) {
|
||||
cpu_local_var(current)->profile_start_ts = rdtsc();
|
||||
}
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
487
kernel/host.c
487
kernel/host.c
@ -23,14 +23,15 @@
|
||||
#include <ihk/debug.h>
|
||||
#include <ihk/ikc.h>
|
||||
#include <ikc/master.h>
|
||||
#include <syscall.h>
|
||||
#include <cls.h>
|
||||
#include <syscall.h>
|
||||
#include <process.h>
|
||||
#include <page.h>
|
||||
#include <mman.h>
|
||||
#include <init.h>
|
||||
#include <kmalloc.h>
|
||||
#include <sysfs.h>
|
||||
#include <ihk/perfctr.h>
|
||||
|
||||
//#define DEBUG_PRINT_HOST
|
||||
|
||||
@ -40,6 +41,9 @@
|
||||
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
|
||||
#endif
|
||||
|
||||
/* Linux channel table, indexed by Linux CPU id */
|
||||
static struct ihk_ikc_channel_desc **ikc2linuxs = NULL;
|
||||
|
||||
void check_mapping_for_proc(struct thread *thread, unsigned long addr)
|
||||
{
|
||||
unsigned long __phys;
|
||||
@ -87,11 +91,15 @@ int prepare_process_ranges_args_envs(struct thread *thread,
|
||||
struct address_space *as = vm->address_space;
|
||||
long aout_base;
|
||||
int error;
|
||||
struct vm_range *range;
|
||||
unsigned long ap_flags;
|
||||
enum ihk_mc_pt_attribute ptattr;
|
||||
|
||||
n = p->num_sections;
|
||||
|
||||
aout_base = (pn->reloc)? vm->region.map_end: 0;
|
||||
for (i = 0; i < n; i++) {
|
||||
ap_flags = 0;
|
||||
if (pn->sections[i].interp && (interp_nbase == (uintptr_t)-1)) {
|
||||
interp_obase = pn->sections[i].vaddr;
|
||||
interp_obase -= (interp_obase % pn->interp_align);
|
||||
@ -112,48 +120,51 @@ int prepare_process_ranges_args_envs(struct thread *thread,
|
||||
s = (pn->sections[i].vaddr) & PAGE_MASK;
|
||||
e = (pn->sections[i].vaddr + pn->sections[i].len
|
||||
+ PAGE_SIZE - 1) & PAGE_MASK;
|
||||
range_npages = (e - s) >> PAGE_SHIFT;
|
||||
range_npages = ((pn->sections[i].vaddr - s) +
|
||||
pn->sections[i].filesz + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
||||
flags = VR_NONE;
|
||||
flags |= PROT_TO_VR_FLAG(pn->sections[i].prot);
|
||||
flags |= VRFLAG_PROT_TO_MAXPROT(flags);
|
||||
flags |= VR_DEMAND_PAGING;
|
||||
|
||||
if ((up_v = ihk_mc_alloc_pages(range_npages, IHK_MC_AP_NOWAIT))
|
||||
== NULL) {
|
||||
kprintf("ERROR: alloc pages for ELF section %i\n", i);
|
||||
goto err;
|
||||
}
|
||||
|
||||
up = virt_to_phys(up_v);
|
||||
if (add_process_memory_range(vm, s, e, up, flags, NULL, 0,
|
||||
PAGE_SHIFT) != 0) {
|
||||
ihk_mc_free_pages(up_v, range_npages);
|
||||
/* Non-TEXT sections that are large respect user allocation policy
|
||||
* unless user explicitly requests otherwise */
|
||||
if (i >= 1 && pn->sections[i].len >= pn->mpol_threshold &&
|
||||
!(pn->mpol_flags & MPOL_NO_BSS)) {
|
||||
dkprintf("%s: section: %d size: %d pages -> IHK_MC_AP_USER\n",
|
||||
__FUNCTION__, i, range_npages);
|
||||
ap_flags = IHK_MC_AP_USER;
|
||||
flags |= VR_AP_USER;
|
||||
}
|
||||
|
||||
if (add_process_memory_range(vm, s, e, NOPHYS, flags, NULL, 0,
|
||||
pn->sections[i].len > LARGE_PAGE_SIZE ?
|
||||
LARGE_PAGE_SHIFT : PAGE_SHIFT,
|
||||
&range) != 0) {
|
||||
kprintf("ERROR: adding memory range for ELF section %i\n", i);
|
||||
goto err;
|
||||
}
|
||||
|
||||
{
|
||||
void *_virt = (void *)s;
|
||||
unsigned long _phys;
|
||||
if (ihk_mc_pt_virt_to_phys(as->page_table,
|
||||
_virt, &_phys)) {
|
||||
kprintf("ERROR: no mapping for 0x%lX\n", _virt);
|
||||
}
|
||||
for (_virt = (void *)s + PAGE_SIZE;
|
||||
(unsigned long)_virt < e; _virt += PAGE_SIZE) {
|
||||
unsigned long __phys;
|
||||
if (ihk_mc_pt_virt_to_phys(as->page_table,
|
||||
_virt, &__phys)) {
|
||||
kprintf("ERROR: no mapping for 0x%lX\n", _virt);
|
||||
panic("mapping");
|
||||
}
|
||||
if (__phys != _phys + PAGE_SIZE) {
|
||||
kprintf("0x%lX + PAGE_SIZE is not physically contigous, from 0x%lX to 0x%lX\n", _virt - PAGE_SIZE, _phys, __phys);
|
||||
panic("mondai");
|
||||
}
|
||||
if ((up_v = ihk_mc_alloc_pages_user(range_npages,
|
||||
IHK_MC_AP_NOWAIT | ap_flags)) == NULL) {
|
||||
kprintf("ERROR: alloc pages for ELF section %i\n", i);
|
||||
goto err;
|
||||
}
|
||||
|
||||
_phys = __phys;
|
||||
}
|
||||
dkprintf("0x%lX -> 0x%lX is physically contigous\n", s, e);
|
||||
up = virt_to_phys(up_v);
|
||||
|
||||
ptattr = arch_vrflag_to_ptattr(range->flag, PF_POPULATE, NULL);
|
||||
error = ihk_mc_pt_set_range(vm->address_space->page_table, vm,
|
||||
(void *)range->start,
|
||||
(void *)range->start + (range_npages * PAGE_SIZE),
|
||||
up, ptattr,
|
||||
range->pgshift);
|
||||
|
||||
if (error) {
|
||||
kprintf("%s: ihk_mc_pt_set_range failed. %d\n",
|
||||
__FUNCTION__, error);
|
||||
ihk_mc_free_pages_user(up_v, range_npages);
|
||||
goto err;
|
||||
}
|
||||
|
||||
p->sections[i].remote_pa = up;
|
||||
@ -198,7 +209,43 @@ int prepare_process_ranges_args_envs(struct thread *thread,
|
||||
pn->at_entry += aout_base;
|
||||
}
|
||||
|
||||
vm->region.brk_start = vm->region.brk_end = vm->region.data_end;
|
||||
vm->region.brk_start = vm->region.brk_end =
|
||||
(vm->region.data_end + LARGE_PAGE_SIZE - 1) & LARGE_PAGE_MASK;
|
||||
|
||||
#if 0
|
||||
{
|
||||
void *heap;
|
||||
|
||||
dkprintf("%s: requested heap size: %lu\n",
|
||||
__FUNCTION__, proc->heap_extension);
|
||||
heap = ihk_mc_alloc_aligned_pages(proc->heap_extension >> PAGE_SHIFT,
|
||||
LARGE_PAGE_P2ALIGN, IHK_MC_AP_NOWAIT |
|
||||
(!(proc->mpol_flags & MPOL_NO_HEAP) ? IHK_MC_AP_USER : 0));
|
||||
|
||||
if (!heap) {
|
||||
kprintf("%s: error: allocating heap\n", __FUNCTION__);
|
||||
goto err;
|
||||
}
|
||||
|
||||
flags = VR_PROT_READ | VR_PROT_WRITE;
|
||||
flags |= VRFLAG_PROT_TO_MAXPROT(flags);
|
||||
if (add_process_memory_range(vm, vm->region.brk_start,
|
||||
vm->region.brk_start + proc->heap_extension,
|
||||
virt_to_phys(heap),
|
||||
flags, NULL, 0, LARGE_PAGE_P2ALIGN, NULL) != 0) {
|
||||
ihk_mc_free_pages(heap, proc->heap_extension >> PAGE_SHIFT);
|
||||
kprintf("%s: error: adding memory range for heap\n", __FUNCTION__);
|
||||
goto err;
|
||||
}
|
||||
|
||||
vm->region.brk_end_allocated = vm->region.brk_end +
|
||||
proc->heap_extension;
|
||||
dkprintf("%s: heap @ 0x%lx:%lu\n",
|
||||
__FUNCTION__, vm->region.brk_start, proc->heap_extension);
|
||||
}
|
||||
#else
|
||||
vm->region.brk_end_allocated = vm->region.brk_end;
|
||||
#endif
|
||||
|
||||
/* Map, copy and update args and envs */
|
||||
flags = VR_PROT_READ | VR_PROT_WRITE;
|
||||
@ -206,15 +253,16 @@ int prepare_process_ranges_args_envs(struct thread *thread,
|
||||
addr = vm->region.map_start - PAGE_SIZE * SCD_RESERVED_COUNT;
|
||||
e = addr + PAGE_SIZE * ARGENV_PAGE_COUNT;
|
||||
|
||||
if((args_envs = ihk_mc_alloc_pages(ARGENV_PAGE_COUNT, IHK_MC_AP_NOWAIT)) == NULL){
|
||||
if((args_envs = ihk_mc_alloc_pages_user(ARGENV_PAGE_COUNT,
|
||||
IHK_MC_AP_NOWAIT)) == NULL){
|
||||
kprintf("ERROR: allocating pages for args/envs\n");
|
||||
goto err;
|
||||
}
|
||||
args_envs_p = virt_to_phys(args_envs);
|
||||
|
||||
if(add_process_memory_range(vm, addr, e, args_envs_p,
|
||||
flags, NULL, 0, PAGE_SHIFT) != 0){
|
||||
ihk_mc_free_pages(args_envs, ARGENV_PAGE_COUNT);
|
||||
flags, NULL, 0, PAGE_SHIFT, NULL) != 0){
|
||||
ihk_mc_free_pages_user(args_envs, ARGENV_PAGE_COUNT);
|
||||
kprintf("ERROR: adding memory range for args/envs\n");
|
||||
goto err;
|
||||
}
|
||||
@ -332,6 +380,9 @@ int prepare_process_ranges_args_envs(struct thread *thread,
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
else {
|
||||
vm->vdso_addr = NULL;
|
||||
}
|
||||
|
||||
p->rprocess = (unsigned long)thread;
|
||||
p->rpgtable = virt_to_phys(as->page_table);
|
||||
@ -373,10 +424,16 @@ static int process_msg_prepare_process(unsigned long rphys)
|
||||
}
|
||||
|
||||
n = p->num_sections;
|
||||
if (n > 16) {
|
||||
kprintf("%s: ERROR: more ELF sections than 16??\n",
|
||||
__FUNCTION__);
|
||||
return -ENOMEM;
|
||||
}
|
||||
dkprintf("# of sections: %d\n", n);
|
||||
|
||||
if((pn = ihk_mc_allocate(sizeof(struct program_load_desc)
|
||||
+ sizeof(struct program_image_section) * n, IHK_MC_AP_NOWAIT)) == NULL){
|
||||
if((pn = kmalloc(sizeof(struct program_load_desc)
|
||||
+ sizeof(struct program_image_section) * n,
|
||||
IHK_MC_AP_NOWAIT)) == NULL){
|
||||
ihk_mc_unmap_virtual(p, npages, 0);
|
||||
ihk_mc_unmap_memory(NULL, phys, sz);
|
||||
return -ENOMEM;
|
||||
@ -384,8 +441,10 @@ static int process_msg_prepare_process(unsigned long rphys)
|
||||
memcpy_long(pn, p, sizeof(struct program_load_desc)
|
||||
+ sizeof(struct program_image_section) * n);
|
||||
|
||||
if((thread = create_thread(p->entry)) == NULL){
|
||||
ihk_mc_free(pn);
|
||||
if ((thread = create_thread(p->entry,
|
||||
(unsigned long *)&p->cpu_set,
|
||||
sizeof(p->cpu_set))) == NULL) {
|
||||
kfree(pn);
|
||||
ihk_mc_unmap_virtual(p, npages, 1);
|
||||
ihk_mc_unmap_memory(NULL, phys, sz);
|
||||
return -ENOMEM;
|
||||
@ -405,6 +464,14 @@ static int process_msg_prepare_process(unsigned long rphys)
|
||||
proc->sgid = pn->cred[6];
|
||||
proc->fsgid = pn->cred[7];
|
||||
proc->termsig = SIGCHLD;
|
||||
proc->mpol_flags = pn->mpol_flags;
|
||||
proc->mpol_threshold = pn->mpol_threshold;
|
||||
proc->nr_processes = pn->nr_processes;
|
||||
proc->heap_extension = pn->heap_extension;
|
||||
#ifdef PROFILE_ENABLE
|
||||
proc->profile = pn->profile;
|
||||
thread->profile = pn->profile;
|
||||
#endif
|
||||
|
||||
vm->region.user_start = pn->user_start;
|
||||
vm->region.user_end = pn->user_end;
|
||||
@ -423,9 +490,6 @@ static int process_msg_prepare_process(unsigned long rphys)
|
||||
vm->region.map_end = vm->region.map_start;
|
||||
memcpy(proc->rlimit, pn->rlimit, sizeof(struct rlimit) * MCK_RLIM_MAX);
|
||||
|
||||
/* TODO: Clear it at the proper timing */
|
||||
cpu_local_var(scp).post_idx = 0;
|
||||
|
||||
if (prepare_process_ranges_args_envs(thread, pn, p, attr,
|
||||
NULL, 0, NULL, 0) != 0) {
|
||||
kprintf("error: preparing process ranges, args, envs, stack\n");
|
||||
@ -435,7 +499,7 @@ static int process_msg_prepare_process(unsigned long rphys)
|
||||
dkprintf("new process : %p [%d] / table : %p\n", proc, proc->pid,
|
||||
vm->address_space->page_table);
|
||||
|
||||
ihk_mc_free(pn);
|
||||
kfree(pn);
|
||||
|
||||
ihk_mc_unmap_virtual(p, npages, 1);
|
||||
ihk_mc_unmap_memory(NULL, phys, sz);
|
||||
@ -443,77 +507,13 @@ static int process_msg_prepare_process(unsigned long rphys)
|
||||
|
||||
return 0;
|
||||
err:
|
||||
ihk_mc_free(pn);
|
||||
kfree(pn);
|
||||
ihk_mc_unmap_virtual(p, npages, 1);
|
||||
ihk_mc_unmap_memory(NULL, phys, sz);
|
||||
destroy_thread(thread);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
static void process_msg_init(struct ikc_scd_init_param *pcp, struct syscall_params *lparam)
|
||||
{
|
||||
lparam->response_va = allocate_pages(RESPONSE_PAGE_COUNT, 0);
|
||||
lparam->response_pa = virt_to_phys(lparam->response_va);
|
||||
|
||||
pcp->request_page = 0;
|
||||
pcp->doorbell_page = 0;
|
||||
pcp->response_page = lparam->response_pa;
|
||||
}
|
||||
|
||||
static void process_msg_init_acked(struct ihk_ikc_channel_desc *c, unsigned long pphys)
|
||||
{
|
||||
struct ikc_scd_init_param *param = phys_to_virt(pphys);
|
||||
struct syscall_params *lparam;
|
||||
enum ihk_mc_pt_attribute attr;
|
||||
|
||||
attr = PTATTR_NO_EXECUTE | PTATTR_WRITABLE | PTATTR_FOR_USER;
|
||||
|
||||
lparam = &cpu_local_var(scp);
|
||||
if(cpu_local_var(syscall_channel2) == c)
|
||||
lparam = &cpu_local_var(scp2);
|
||||
lparam->request_rpa = param->request_page;
|
||||
lparam->request_pa = ihk_mc_map_memory(NULL, param->request_page,
|
||||
REQUEST_PAGE_COUNT * PAGE_SIZE);
|
||||
if((lparam->request_va = ihk_mc_map_virtual(lparam->request_pa,
|
||||
REQUEST_PAGE_COUNT,
|
||||
attr)) == NULL){
|
||||
// TODO:
|
||||
panic("ENOMEM");
|
||||
}
|
||||
|
||||
lparam->doorbell_rpa = param->doorbell_page;
|
||||
lparam->doorbell_pa = ihk_mc_map_memory(NULL, param->doorbell_page,
|
||||
DOORBELL_PAGE_COUNT *
|
||||
PAGE_SIZE);
|
||||
if((lparam->doorbell_va = ihk_mc_map_virtual(lparam->doorbell_pa,
|
||||
DOORBELL_PAGE_COUNT,
|
||||
attr)) == NULL){
|
||||
// TODO:
|
||||
panic("ENOMEM");
|
||||
}
|
||||
|
||||
lparam->post_rpa = param->post_page;
|
||||
lparam->post_pa = ihk_mc_map_memory(NULL, param->post_page,
|
||||
PAGE_SIZE);
|
||||
if((lparam->post_va = ihk_mc_map_virtual(lparam->post_pa, 1,
|
||||
attr)) == NULL){
|
||||
// TODO:
|
||||
panic("ENOMEM");
|
||||
}
|
||||
|
||||
lparam->post_fin = 1;
|
||||
|
||||
dkprintf("Syscall parameters: (%d)\n", ihk_mc_get_processor_id());
|
||||
dkprintf(" Response: %lx, %p\n",
|
||||
lparam->response_pa, lparam->response_va);
|
||||
dkprintf(" Request : %lx, %lx, %p\n",
|
||||
lparam->request_pa, lparam->request_rpa, lparam->request_va);
|
||||
dkprintf(" Doorbell: %lx, %lx, %p\n",
|
||||
lparam->doorbell_pa, lparam->doorbell_rpa, lparam->doorbell_va);
|
||||
dkprintf(" Post: %lx, %lx, %p\n",
|
||||
lparam->post_pa, lparam->post_rpa, lparam->post_va);
|
||||
}
|
||||
|
||||
static void syscall_channel_send(struct ihk_ikc_channel_desc *c,
|
||||
struct ikc_scd_packet *packet)
|
||||
{
|
||||
@ -521,46 +521,18 @@ static void syscall_channel_send(struct ihk_ikc_channel_desc *c,
|
||||
}
|
||||
|
||||
extern unsigned long do_kill(struct thread *, int, int, int, struct siginfo *, int ptracecont);
|
||||
extern void settid(struct thread *proc, int mode, int newcpuid, int oldcpuid);
|
||||
|
||||
extern void process_procfs_request(unsigned long rarg);
|
||||
extern int memcheckall();
|
||||
extern int freecheck(int runcount);
|
||||
extern int runcount;
|
||||
extern void process_procfs_request(struct ikc_scd_packet *rpacket);
|
||||
extern void terminate_host(int pid);
|
||||
extern void debug_log(long);
|
||||
|
||||
static void req_get_cpu_mapping(long req_rpa)
|
||||
{
|
||||
size_t mapsize;
|
||||
size_t size;
|
||||
int npages;
|
||||
long phys;
|
||||
struct get_cpu_mapping_req *req;
|
||||
struct cpu_mapping *buf;
|
||||
|
||||
size = sizeof(*req);
|
||||
mapsize = size + (req_rpa & (PAGE_SIZE - 1));
|
||||
npages = (mapsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
||||
phys = ihk_mc_map_memory(NULL, req_rpa, size);
|
||||
req = ihk_mc_map_virtual(phys, npages, PTATTR_WRITABLE);
|
||||
|
||||
req->error = arch_get_cpu_mapping(&buf, &req->buf_elems);
|
||||
if (!req->error) {
|
||||
req->buf_rpa = virt_to_phys(buf);
|
||||
}
|
||||
|
||||
ihk_mc_unmap_virtual(req, npages, 0);
|
||||
ihk_mc_unmap_memory(NULL, phys, size);
|
||||
return;
|
||||
} /* req_get_cpu_mapping() */
|
||||
|
||||
static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
|
||||
void *__packet, void *ihk_os)
|
||||
{
|
||||
struct ikc_scd_packet *packet = __packet;
|
||||
struct ikc_scd_packet pckt;
|
||||
struct ihk_ikc_channel_desc *resp_channel = cpu_local_var(ikc2linux);
|
||||
int rc;
|
||||
struct mcs_rwlock_node_irqsave lock;
|
||||
struct thread *thread;
|
||||
struct process *proc;
|
||||
struct mcctrl_signal {
|
||||
@ -572,22 +544,18 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
|
||||
} *sp, info;
|
||||
unsigned long pp;
|
||||
int cpuid;
|
||||
int ret = 0;
|
||||
struct perf_ctrl_desc *pcd;
|
||||
unsigned int mode = 0;
|
||||
|
||||
switch (packet->msg) {
|
||||
case SCD_MSG_INIT_CHANNEL_ACKED:
|
||||
dkprintf("SCD_MSG_INIT_CHANNEL_ACKED\n");
|
||||
process_msg_init_acked(c, packet->arg);
|
||||
return 0;
|
||||
ret = 0;
|
||||
break;
|
||||
|
||||
case SCD_MSG_PREPARE_PROCESS:
|
||||
|
||||
if (find_command_line("memdebug")) {
|
||||
memcheckall();
|
||||
if (runcount)
|
||||
freecheck(runcount);
|
||||
runcount++;
|
||||
}
|
||||
|
||||
if((rc = process_msg_prepare_process(packet->arg)) == 0){
|
||||
pckt.msg = SCD_MSG_PREPARE_PROCESS_ACKED;
|
||||
pckt.err = 0;
|
||||
@ -598,29 +566,53 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
|
||||
}
|
||||
pckt.ref = packet->ref;
|
||||
pckt.arg = packet->arg;
|
||||
syscall_channel_send(c, &pckt);
|
||||
syscall_channel_send(resp_channel, &pckt);
|
||||
|
||||
return 0;
|
||||
ret = 0;
|
||||
break;
|
||||
|
||||
case SCD_MSG_SCHEDULE_PROCESS:
|
||||
cpuid = obtain_clone_cpuid();
|
||||
if(cpuid == -1){
|
||||
kprintf("No CPU available\n");
|
||||
return -1;
|
||||
}
|
||||
dkprintf("SCD_MSG_SCHEDULE_PROCESS: %lx\n", packet->arg);
|
||||
thread = (struct thread *)packet->arg;
|
||||
proc = thread->proc;
|
||||
|
||||
settid(thread, 0, cpuid, -1);
|
||||
cpuid = obtain_clone_cpuid(&thread->cpu_set);
|
||||
if (cpuid == -1) {
|
||||
kprintf("No CPU available\n");
|
||||
ret = -1;
|
||||
break;
|
||||
}
|
||||
|
||||
dkprintf("SCD_MSG_SCHEDULE_PROCESS: %lx\n", packet->arg);
|
||||
proc = thread->proc;
|
||||
thread->tid = proc->pid;
|
||||
proc->status = PS_RUNNING;
|
||||
thread->status = PS_RUNNING;
|
||||
chain_thread(thread);
|
||||
chain_process(proc);
|
||||
runq_add_thread(thread, cpuid);
|
||||
|
||||
//cpu_local_var(next) = (struct thread *)packet->arg;
|
||||
return 0;
|
||||
|
||||
ret = 0;
|
||||
break;
|
||||
|
||||
/*
|
||||
* Used for syscall offload reply message to explicitly schedule in
|
||||
* the waiting thread
|
||||
*/
|
||||
case SCD_MSG_WAKE_UP_SYSCALL_THREAD:
|
||||
thread = find_thread(0, packet->ttid, &lock);
|
||||
if (!thread) {
|
||||
kprintf("%s: WARNING: no thread for SCD reply? TID: %d\n",
|
||||
__FUNCTION__, packet->ttid);
|
||||
ret = -EINVAL;
|
||||
break;
|
||||
}
|
||||
thread_unlock(thread, &lock);
|
||||
|
||||
dkprintf("%s: SCD_MSG_WAKE_UP_SYSCALL_THREAD: waking up tid %d\n",
|
||||
__FUNCTION__, packet->ttid);
|
||||
waitq_wakeup(&thread->scd_wq);
|
||||
ret = 0;
|
||||
break;
|
||||
|
||||
case SCD_MSG_SEND_SIGNAL:
|
||||
pp = ihk_mc_map_memory(NULL, packet->arg, sizeof(struct mcctrl_signal));
|
||||
sp = (struct mcctrl_signal *)ihk_mc_map_virtual(pp, 1, PTATTR_WRITABLE | PTATTR_ACTIVE);
|
||||
@ -631,22 +623,29 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
|
||||
pckt.err = 0;
|
||||
pckt.ref = packet->ref;
|
||||
pckt.arg = packet->arg;
|
||||
syscall_channel_send(c, &pckt);
|
||||
syscall_channel_send(resp_channel, &pckt);
|
||||
|
||||
rc = do_kill(NULL, info.pid, info.tid, info.sig, &info.info, 0);
|
||||
kprintf("SCD_MSG_SEND_SIGNAL: do_kill(pid=%d, tid=%d, sig=%d)=%d\n", info.pid, info.tid, info.sig, rc);
|
||||
return 0;
|
||||
dkprintf("SCD_MSG_SEND_SIGNAL: do_kill(pid=%d, tid=%d, sig=%d)=%d\n", info.pid, info.tid, info.sig, rc);
|
||||
ret = 0;
|
||||
break;
|
||||
|
||||
case SCD_MSG_PROCFS_REQUEST:
|
||||
process_procfs_request(packet->arg);
|
||||
return 0;
|
||||
process_procfs_request(packet);
|
||||
ret = 0;
|
||||
break;
|
||||
|
||||
case SCD_MSG_CLEANUP_PROCESS:
|
||||
dkprintf("SCD_MSG_CLEANUP_PROCESS pid=%d\n", packet->pid);
|
||||
terminate_host(packet->pid);
|
||||
return 0;
|
||||
ret = 0;
|
||||
break;
|
||||
|
||||
case SCD_MSG_DEBUG_LOG:
|
||||
dkprintf("SCD_MSG_DEBUG_LOG code=%lx\n", packet->arg);
|
||||
debug_log(packet->arg);
|
||||
return 0;
|
||||
ret = 0;
|
||||
break;
|
||||
|
||||
case SCD_MSG_SYSFS_REQ_SHOW:
|
||||
case SCD_MSG_SYSFS_REQ_STORE:
|
||||
@ -654,77 +653,149 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
|
||||
sysfss_packet_handler(c, packet->msg, packet->err,
|
||||
packet->sysfs_arg1, packet->sysfs_arg2,
|
||||
packet->sysfs_arg3);
|
||||
return 0;
|
||||
ret = 0;
|
||||
break;
|
||||
|
||||
case SCD_MSG_GET_CPU_MAPPING:
|
||||
req_get_cpu_mapping(packet->arg);
|
||||
case SCD_MSG_PERF_CTRL:
|
||||
pp = ihk_mc_map_memory(NULL, packet->arg, sizeof(struct perf_ctrl_desc));
|
||||
pcd = (struct perf_ctrl_desc *)ihk_mc_map_virtual(pp, 1, PTATTR_WRITABLE | PTATTR_ACTIVE);
|
||||
|
||||
pckt.msg = SCD_MSG_REPLY_GET_CPU_MAPPING;
|
||||
switch (pcd->ctrl_type) {
|
||||
case PERF_CTRL_SET:
|
||||
if (!pcd->exclude_kernel) {
|
||||
mode |= PERFCTR_KERNEL_MODE;
|
||||
}
|
||||
if (!pcd->exclude_user) {
|
||||
mode |= PERFCTR_USER_MODE;
|
||||
}
|
||||
ihk_mc_perfctr_init_raw(pcd->target_cntr, pcd->config, mode);
|
||||
ihk_mc_perfctr_stop(1 << pcd->target_cntr);
|
||||
ihk_mc_perfctr_reset(pcd->target_cntr);
|
||||
break;
|
||||
|
||||
case PERF_CTRL_ENABLE:
|
||||
ihk_mc_perfctr_start(pcd->target_cntr_mask);
|
||||
break;
|
||||
|
||||
case PERF_CTRL_DISABLE:
|
||||
ihk_mc_perfctr_stop(pcd->target_cntr_mask);
|
||||
break;
|
||||
|
||||
case PERF_CTRL_GET:
|
||||
pcd->read_value = ihk_mc_perfctr_read(pcd->target_cntr);
|
||||
break;
|
||||
|
||||
default:
|
||||
kprintf("%s: SCD_MSG_PERF_CTRL unexpected ctrl_type\n", __FUNCTION__);
|
||||
}
|
||||
|
||||
ihk_mc_unmap_virtual(pcd, 1, 0);
|
||||
ihk_mc_unmap_memory(NULL, pp, sizeof(struct perf_ctrl_desc));
|
||||
|
||||
pckt.msg = SCD_MSG_PERF_ACK;
|
||||
pckt.err = 0;
|
||||
pckt.arg = packet->arg;
|
||||
syscall_channel_send(c, &pckt);
|
||||
return 0;
|
||||
ihk_ikc_send(resp_channel, &pckt, 0);
|
||||
|
||||
ret = 0;
|
||||
break;
|
||||
|
||||
case SCD_MSG_CPU_RW_REG:
|
||||
|
||||
pckt.msg = SCD_MSG_CPU_RW_REG_RESP;
|
||||
memcpy(&pckt.desc, &packet->desc,
|
||||
sizeof(struct ihk_os_cpu_register));
|
||||
pckt.resp = packet->resp;
|
||||
pckt.err = arch_cpu_read_write_register(&pckt.desc, packet->op);
|
||||
|
||||
ihk_ikc_send(resp_channel, &pckt, 0);
|
||||
break;
|
||||
|
||||
default:
|
||||
kprintf("syscall_pakcet_handler:unknown message "
|
||||
"(%d.%d.%d.%d.%d.%#lx)\n",
|
||||
packet->msg, packet->ref, packet->osnum,
|
||||
packet->pid, packet->err, packet->arg);
|
||||
return 0;
|
||||
ret = 0;
|
||||
break;
|
||||
|
||||
}
|
||||
|
||||
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet, c);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Packet handler that performs no processing: the received packet is
 * released back to channel c and 0 is returned.  __os is unused.
 */
static int dummy_packet_handler(struct ihk_ikc_channel_desc *c,
		void *__packet, void *__os)
{
	struct ikc_scd_packet *pkt = __packet;

	ihk_ikc_release_packet((struct ihk_ikc_free_packet *)pkt, c);

	return 0;
}
|
||||
|
||||
void init_host_syscall_channel(void)
|
||||
void init_host_ikc2linux(int linux_cpu)
|
||||
{
|
||||
struct ihk_ikc_connect_param param;
|
||||
struct ikc_scd_packet pckt;
|
||||
struct ihk_ikc_channel_desc *c;
|
||||
|
||||
param.port = 501;
|
||||
param.pkt_size = sizeof(struct ikc_scd_packet);
|
||||
param.queue_size = PAGE_SIZE;
|
||||
param.magic = 0x1129;
|
||||
param.handler = syscall_packet_handler;
|
||||
/* Main thread allocates channel pointer table */
|
||||
if (!ikc2linuxs) {
|
||||
ikc2linuxs = kmalloc(sizeof(*ikc2linuxs) *
|
||||
ihk_mc_get_nr_linux_cores(), IHK_MC_AP_NOWAIT);
|
||||
if (!ikc2linuxs) {
|
||||
kprintf("%s: error: allocating Linux channels\n", __FUNCTION__);
|
||||
panic("");
|
||||
}
|
||||
|
||||
dkprintf("(syscall) Trying to connect host ...");
|
||||
while (ihk_ikc_connect(NULL, &param) != 0) {
|
||||
dkprintf(".");
|
||||
ihk_mc_delay_us(1000 * 1000);
|
||||
memset(ikc2linuxs, 0, sizeof(*ikc2linuxs) *
|
||||
ihk_mc_get_nr_linux_cores());
|
||||
}
|
||||
dkprintf("connected.\n");
|
||||
|
||||
get_this_cpu_local_var()->syscall_channel = param.channel;
|
||||
c = ikc2linuxs[linux_cpu];
|
||||
|
||||
process_msg_init(&cpu_local_var(iip), &cpu_local_var(scp));
|
||||
pckt.msg = SCD_MSG_INIT_CHANNEL;
|
||||
pckt.ref = ihk_mc_get_processor_id();
|
||||
pckt.arg = virt_to_phys(&cpu_local_var(iip));
|
||||
syscall_channel_send(param.channel, &pckt);
|
||||
if (!c) {
|
||||
param.port = 503;
|
||||
param.intr_cpu = linux_cpu;
|
||||
param.pkt_size = sizeof(struct ikc_scd_packet);
|
||||
param.queue_size = 2 * num_processors * sizeof(struct ikc_scd_packet);
|
||||
if (param.queue_size < PAGE_SIZE * 4) {
|
||||
param.queue_size = PAGE_SIZE * 4;
|
||||
}
|
||||
param.magic = 0x1129;
|
||||
param.handler = dummy_packet_handler;
|
||||
|
||||
dkprintf("(ikc2linux) Trying to connect host ...");
|
||||
while (ihk_ikc_connect(NULL, &param) != 0) {
|
||||
dkprintf(".");
|
||||
ihk_mc_delay_us(1000 * 1000);
|
||||
}
|
||||
dkprintf("connected.\n");
|
||||
|
||||
ikc2linuxs[linux_cpu] = param.channel;
|
||||
c = param.channel;
|
||||
}
|
||||
|
||||
get_this_cpu_local_var()->ikc2linux = c;
|
||||
}
|
||||
|
||||
void init_host_syscall_channel2(void)
|
||||
void init_host_ikc2mckernel(void)
|
||||
{
|
||||
struct ihk_ikc_connect_param param;
|
||||
struct ikc_scd_packet pckt;
|
||||
|
||||
param.port = 502;
|
||||
param.port = 501;
|
||||
param.intr_cpu = -1;
|
||||
param.pkt_size = sizeof(struct ikc_scd_packet);
|
||||
param.queue_size = PAGE_SIZE;
|
||||
param.queue_size = PAGE_SIZE * 4;
|
||||
param.magic = 0x1329;
|
||||
param.handler = syscall_packet_handler;
|
||||
|
||||
dkprintf("(syscall) Trying to connect host ...");
|
||||
dkprintf("(ikc2mckernel) Trying to connect host ...");
|
||||
while (ihk_ikc_connect(NULL, &param) != 0) {
|
||||
dkprintf(".");
|
||||
ihk_mc_delay_us(1000 * 1000);
|
||||
}
|
||||
dkprintf("connected.\n");
|
||||
|
||||
get_this_cpu_local_var()->syscall_channel2 = param.channel;
|
||||
|
||||
process_msg_init(&cpu_local_var(iip2), &cpu_local_var(scp2));
|
||||
pckt.msg = SCD_MSG_INIT_CHANNEL;
|
||||
pckt.ref = ihk_mc_get_processor_id();
|
||||
pckt.arg = virt_to_phys(&cpu_local_var(iip2));
|
||||
syscall_channel_send(param.channel, &pckt);
|
||||
ihk_ikc_set_regular_channel(NULL, param.channel, ihk_ikc_get_processor_id());
|
||||
}
|
||||
|
||||
|
||||
@ -19,11 +19,13 @@
|
||||
* CPU Local Storage (cls)
|
||||
*/
|
||||
|
||||
struct malloc_header {
|
||||
unsigned int check;
|
||||
struct kmalloc_header {
|
||||
unsigned int front_magic;
|
||||
unsigned int cpu_id;
|
||||
struct malloc_header *next;
|
||||
unsigned long size;
|
||||
struct list_head list;
|
||||
int size; /* The size of this chunk without the header */
|
||||
unsigned int end_magic;
|
||||
/* 32 bytes */
|
||||
};
|
||||
|
||||
#include <ihk/lock.h>
|
||||
@ -36,10 +38,31 @@ extern ihk_spinlock_t cpu_status_lock;
|
||||
#define CPU_FLAG_NEED_RESCHED 0x1U
|
||||
#define CPU_FLAG_NEED_MIGRATE 0x2U
|
||||
|
||||
typedef int (*smp_func_t)(int cpu_index, int nr_cpus, void *arg);
|
||||
int smp_call_func(cpu_set_t *__cpu_set, smp_func_t __func, void *__arg);
|
||||
|
||||
struct smp_func_call_data {
|
||||
/* XXX: Sync MCS lock to avoid contention on counter */
|
||||
// mcs_lock_node_t lock;
|
||||
int nr_cpus;
|
||||
ihk_atomic_t cpus_left;
|
||||
|
||||
smp_func_t func;
|
||||
void *arg;
|
||||
};
|
||||
|
||||
struct smp_func_call_request {
|
||||
struct smp_func_call_data *sfcd;
|
||||
int cpu_index;
|
||||
int ret;
|
||||
struct list_head list;
|
||||
};
|
||||
|
||||
struct cpu_local_var {
|
||||
/* malloc */
|
||||
struct malloc_header free_list;
|
||||
struct malloc_header *remote_free_list;
|
||||
struct list_head free_list;
|
||||
struct list_head remote_free_list;
|
||||
ihk_spinlock_t remote_free_list_lock;
|
||||
|
||||
struct thread idle;
|
||||
struct process idle_proc;
|
||||
@ -52,13 +75,8 @@ struct cpu_local_var {
|
||||
struct list_head runq;
|
||||
size_t runq_len;
|
||||
|
||||
struct ihk_ikc_channel_desc *syscall_channel;
|
||||
struct syscall_params scp;
|
||||
struct ikc_scd_init_param iip;
|
||||
struct ihk_ikc_channel_desc *ikc2linux;
|
||||
|
||||
struct ihk_ikc_channel_desc *syscall_channel2;
|
||||
struct syscall_params scp2;
|
||||
struct ikc_scd_init_param iip2;
|
||||
struct resource_set *resource_set;
|
||||
|
||||
int status;
|
||||
@ -73,6 +91,11 @@ struct cpu_local_var {
|
||||
int in_interrupt;
|
||||
int no_preempt;
|
||||
int timer_enabled;
|
||||
int kmalloc_initialized;
|
||||
struct ihk_os_cpu_monitor *monitor;
|
||||
|
||||
ihk_spinlock_t smp_func_req_lock;
|
||||
struct list_head smp_func_req_list;
|
||||
} __attribute__((aligned(64)));
|
||||
|
||||
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
extern void arch_init(void);
|
||||
extern void kmsg_init(int);
|
||||
extern void mem_init(void);
|
||||
extern void ikc_master_init(void);
|
||||
extern void ihk_ikc_master_init(void);
|
||||
extern void ap_init(void);
|
||||
extern void arch_ready(void);
|
||||
extern void mc_ikc_test_init(void);
|
||||
@ -24,12 +24,18 @@ extern void cpu_local_var_init(void);
|
||||
extern void kmalloc_init(void);
|
||||
extern void ap_start(void);
|
||||
extern void ihk_mc_dma_init(void);
|
||||
extern void init_host_syscall_channel(void);
|
||||
extern void init_host_syscall_channel2(void);
|
||||
extern void init_host_ikc2linux(int linux_cpu);
|
||||
extern void init_host_ikc2mckernel(void);
|
||||
//extern void set_ikc2linux_to_local(int linux_cpu);
|
||||
extern void sched_init(void);
|
||||
extern void pc_ap_init(void);
|
||||
extern void cpu_sysfs_setup(void);
|
||||
extern void numa_sysfs_setup(void);
|
||||
extern void rusage_sysfs_setup(void);
|
||||
extern void status_sysfs_setup(void);
|
||||
|
||||
extern char *find_command_line(char *name);
|
||||
|
||||
extern int num_processors;
|
||||
|
||||
#endif
|
||||
|
||||
@ -28,15 +28,14 @@ r;\
|
||||
})
|
||||
#define kfree(ptr) _kfree(ptr, __FILE__, __LINE__)
|
||||
#define memcheck(ptr, msg) _memcheck(ptr, msg, __FILE__, __LINE__, 0)
|
||||
void *_kmalloc(int size, enum ihk_mc_ap_flag flag, char *file, int line);
|
||||
void *_kmalloc(int size, ihk_mc_ap_flag flag, char *file, int line);
|
||||
void _kfree(void *ptr, char *file, int line);
|
||||
void *__kmalloc(int size, enum ihk_mc_ap_flag flag);
|
||||
void *__kmalloc(int size, ihk_mc_ap_flag flag);
|
||||
void __kfree(void *ptr);
|
||||
void *___kmalloc(int size, enum ihk_mc_ap_flag flag);
|
||||
void ___kfree(void *ptr);
|
||||
|
||||
int _memcheck(void *ptr, char *msg, char *file, int line, int free);
|
||||
int memcheckall();
|
||||
int freecheck(int runcount);
|
||||
void kmalloc_consolidate_free_list(void);
|
||||
|
||||
#endif
|
||||
|
||||
@ -32,13 +32,28 @@ enum {
|
||||
MF_HAS_PAGER = 0x0001,
|
||||
MF_SHMDT_OK = 0x0002,
|
||||
MF_IS_REMOVABLE = 0x0004,
|
||||
MF_PREFETCH = 0x0008,
|
||||
MF_ZEROFILL = 0x0010,
|
||||
MF_REG_FILE = 0x1000,
|
||||
MF_DEV_FILE = 0x2000,
|
||||
MF_PREMAP = 0x8000,
|
||||
MF_HOST_RELEASED = 0x80000000,
|
||||
MF_END
|
||||
};
|
||||
|
||||
#define MEMOBJ_READY 0
|
||||
#define MEMOBJ_TO_BE_PREFETCHED 1
|
||||
|
||||
struct memobj {
|
||||
struct memobj_ops * ops;
|
||||
uint32_t flags;
|
||||
int8_t padding[4];
|
||||
ihk_spinlock_t lock;
|
||||
struct memobj_ops *ops;
|
||||
uint32_t flags;
|
||||
uint32_t status;
|
||||
size_t size;
|
||||
ihk_spinlock_t lock;
|
||||
|
||||
/* For pre-mapped memobjects */
|
||||
void **pages;
|
||||
int nr_pages;
|
||||
};
|
||||
|
||||
typedef void memobj_release_func_t(struct memobj *obj);
|
||||
@ -141,6 +156,7 @@ int fileobj_create(int fd, struct memobj **objp, int *maxprotp);
|
||||
struct shmid_ds;
|
||||
int shmobj_create(struct shmid_ds *ds, struct memobj **objp);
|
||||
int zeroobj_create(struct memobj **objp);
|
||||
int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxprotp);
|
||||
int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxprotp,
|
||||
int prot, int populate_flags);
|
||||
|
||||
#endif /* HEADER_MEMOBJ_H */
|
||||
|
||||
@ -17,8 +17,9 @@
|
||||
|
||||
struct page {
|
||||
struct list_head list;
|
||||
struct list_head hash;
|
||||
uint8_t mode;
|
||||
uint8_t padding[3];
|
||||
uint64_t phys;
|
||||
ihk_atomic_t count;
|
||||
off_t offset;
|
||||
};
|
||||
@ -38,9 +39,8 @@ enum page_mode {
|
||||
struct page *phys_to_page(uintptr_t phys);
|
||||
uintptr_t page_to_phys(struct page *page);
|
||||
int page_unmap(struct page *page);
|
||||
struct page *phys_to_page_insert_hash(uint64_t phys);
|
||||
|
||||
void *allocate_pages(int npages, enum ihk_mc_ap_flag flag);
|
||||
void free_pages(void *va, int npages);
|
||||
void begin_free_pages_pending(void);
|
||||
void finish_free_pages_pending(void);
|
||||
|
||||
|
||||
@ -30,7 +30,8 @@ enum pager_op {
|
||||
struct pager_create_result {
|
||||
uintptr_t handle;
|
||||
int maxprot;
|
||||
int8_t padding[4];
|
||||
uint32_t flags;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
/*
|
||||
|
||||
@ -22,13 +22,17 @@
|
||||
#include <memobj.h>
|
||||
#include <affinity.h>
|
||||
#include <syscall.h>
|
||||
#include <bitops.h>
|
||||
#include <profile.h>
|
||||
|
||||
#define VR_NONE 0x0
|
||||
#define VR_STACK 0x1
|
||||
#define VR_RESERVED 0x2
|
||||
#define VR_AP_USER 0x4
|
||||
#define VR_IO_NOCACHE 0x100
|
||||
#define VR_REMOTE 0x200
|
||||
#define VR_WRITE_COMBINED 0x400
|
||||
#define VR_DONTFORK 0x800
|
||||
#define VR_DEMAND_PAGING 0x1000
|
||||
#define VR_PRIVATE 0x2000
|
||||
#define VR_LOCKED 0x4000
|
||||
@ -160,10 +164,77 @@
|
||||
#endif
|
||||
|
||||
#define USER_STACK_NR_PAGES 8192
|
||||
#define KERNEL_STACK_NR_PAGES 25
|
||||
#define KERNEL_STACK_NR_PAGES 32
|
||||
|
||||
#define NOPHYS ((uintptr_t)-1)
|
||||
|
||||
#define PROCESS_NUMA_MASK_BITS 256
|
||||
|
||||
/*
|
||||
* Both the MPOL_* mempolicy mode and the MPOL_F_* optional mode flags are
|
||||
* passed by the user to either set_mempolicy() or mbind() in an 'int' actual.
|
||||
* The MPOL_MODE_FLAGS macro determines the legal set of optional mode flags.
|
||||
*/
|
||||
|
||||
/* Policies */
|
||||
enum {
|
||||
MPOL_DEFAULT,
|
||||
MPOL_PREFERRED,
|
||||
MPOL_BIND,
|
||||
MPOL_INTERLEAVE,
|
||||
MPOL_LOCAL,
|
||||
MPOL_MAX, /* always last member of enum */
|
||||
};
|
||||
|
||||
enum mpol_rebind_step {
|
||||
MPOL_REBIND_ONCE, /* do rebind work at once(not by two step) */
|
||||
MPOL_REBIND_STEP1, /* first step(set all the newly nodes) */
|
||||
MPOL_REBIND_STEP2, /* second step(clean all the disallowed nodes)*/
|
||||
MPOL_REBIND_NSTEP,
|
||||
};
|
||||
|
||||
/* Flags for set_mempolicy */
|
||||
#define MPOL_F_STATIC_NODES (1 << 15)
|
||||
#define MPOL_F_RELATIVE_NODES (1 << 14)
|
||||
|
||||
/*
|
||||
* MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
|
||||
* either set_mempolicy() or mbind().
|
||||
*/
|
||||
#define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)
|
||||
|
||||
/* Flags for get_mempolicy */
|
||||
#define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
|
||||
#define MPOL_F_ADDR (1<<1) /* look up vma using address */
|
||||
#define MPOL_F_MEMS_ALLOWED (1<<2) /* return allowed memories */
|
||||
|
||||
/* Flags for mbind */
|
||||
#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */
|
||||
#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform
|
||||
to policy */
|
||||
#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */
|
||||
#define MPOL_MF_LAZY (1<<3) /* Modifies '_MOVE: lazy migrate on fault */
|
||||
#define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */
|
||||
|
||||
#define MPOL_MF_VALID (MPOL_MF_STRICT | \
|
||||
MPOL_MF_MOVE | \
|
||||
MPOL_MF_MOVE_ALL)
|
||||
|
||||
/*
|
||||
* Internal flags that share the struct mempolicy flags word with
|
||||
* "mode flags". These flags are allocated from bit 0 up, as they
|
||||
* are never OR'ed into the mode in mempolicy API arguments.
|
||||
*/
|
||||
#define MPOL_F_SHARED (1 << 0) /* identify shared policies */
|
||||
#define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */
|
||||
#define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */
|
||||
#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */
|
||||
#define MPOL_F_MORON (1 << 4) /* Migrate On pte_numa Reference On Node */
|
||||
|
||||
#define SPAWN_TO_LOCAL 0
|
||||
#define SPAWN_TO_REMOTE 1
|
||||
#define SPAWNING_TO_REMOTE 1001
|
||||
|
||||
#include <waitq.h>
|
||||
#include <futex.h>
|
||||
|
||||
@ -177,6 +248,7 @@ struct process_vm;
|
||||
struct vm_regions;
|
||||
struct vm_range;
|
||||
|
||||
|
||||
#define HASH_SIZE 73
|
||||
|
||||
struct resource_set {
|
||||
@ -302,13 +374,21 @@ struct vm_range {
|
||||
off_t objoff;
|
||||
int pgshift; /* page size. 0 means THP */
|
||||
int padding;
|
||||
void *private_data;
|
||||
};
|
||||
|
||||
struct vm_range_numa_policy {
|
||||
struct list_head list;
|
||||
unsigned long start, end;
|
||||
DECLARE_BITMAP(numa_mask, PROCESS_NUMA_MASK_BITS);
|
||||
int numa_mem_policy;
|
||||
};
|
||||
|
||||
struct vm_regions {
|
||||
unsigned long vm_start, vm_end;
|
||||
unsigned long text_start, text_end;
|
||||
unsigned long data_start, data_end;
|
||||
unsigned long brk_start, brk_end;
|
||||
unsigned long brk_start, brk_end, brk_end_allocated;
|
||||
unsigned long map_start, map_end;
|
||||
unsigned long stack_start, stack_end;
|
||||
unsigned long user_start, user_end;
|
||||
@ -319,19 +399,21 @@ struct process_vm;
|
||||
struct mckfd {
|
||||
struct mckfd *next;
|
||||
int fd;
|
||||
int sig_no;
|
||||
long data;
|
||||
void *opt;
|
||||
long (*read_cb)(struct mckfd *, ihk_mc_user_context_t *);
|
||||
int (*ioctl_cb)(struct mckfd *, ihk_mc_user_context_t *);
|
||||
long (*mmap_cb)(struct mckfd *, ihk_mc_user_context_t *);
|
||||
int (*close_cb)(struct mckfd *, ihk_mc_user_context_t *);
|
||||
int (*fcntl_cb)(struct mckfd *, ihk_mc_user_context_t *);
|
||||
};
|
||||
|
||||
#define SFD_CLOEXEC 02000000
|
||||
#define SFD_NONBLOCK 04000
|
||||
|
||||
struct sig_common {
|
||||
ihk_spinlock_t lock;
|
||||
mcs_rwlock_lock_t lock;
|
||||
ihk_atomic_t use;
|
||||
struct k_sigaction action[_NSIG];
|
||||
struct list_head sigpending;
|
||||
@ -346,6 +428,11 @@ struct sig_pending {
|
||||
|
||||
typedef void pgio_func_t(void *arg);
|
||||
|
||||
struct mcexec_tid {
|
||||
int tid;
|
||||
struct thread *thread;
|
||||
};
|
||||
|
||||
/* Represents a node in the process fork tree, it may exist even after the
|
||||
* corresponding process exited due to references from the parent and/or
|
||||
* children and is used for implementing wait/waitpid without having a
|
||||
@ -360,6 +447,9 @@ struct process {
|
||||
// threads and children
|
||||
struct list_head threads_list;
|
||||
mcs_rwlock_lock_t threads_lock; // lock for threads_list
|
||||
/* TID set of proxy process */
|
||||
struct mcexec_tid *tids;
|
||||
int nr_tids;
|
||||
|
||||
/* The ptracing process behaves as the parent of the ptraced process
|
||||
after using PTRACE_ATTACH except getppid. So we save it here. */
|
||||
@ -384,7 +474,7 @@ struct process {
|
||||
// V +---- |
|
||||
// PS_STOPPED -----+
|
||||
// (PS_TRACED)
|
||||
int exit_status;
|
||||
int exit_status; // only for zombie
|
||||
|
||||
/* Store exit_status for a group of threads when stopped by SIGSTOP.
|
||||
exit_status can't be used because values of exit_status of threads
|
||||
@ -414,6 +504,7 @@ struct process {
|
||||
unsigned long saved_auxv[AUXV_LEN];
|
||||
char *saved_cmdline;
|
||||
long saved_cmdline_len;
|
||||
cpu_set_t cpu_set;
|
||||
|
||||
/* Store ptrace flags.
|
||||
* The lower 8 bits are PTRACE_O_xxx of the PTRACE_SETOPTIONS request.
|
||||
@ -447,6 +538,10 @@ struct process {
|
||||
|
||||
long maxrss;
|
||||
long maxrss_children;
|
||||
/* Memory policy flags and memory specific options */
|
||||
unsigned long mpol_flags;
|
||||
size_t mpol_threshold;
|
||||
unsigned long heap_extension;
|
||||
|
||||
// perf_event
|
||||
int perf_status;
|
||||
@ -455,6 +550,13 @@ struct process {
|
||||
#define PP_COUNT 2
|
||||
#define PP_STOP 3
|
||||
struct mc_perf_event *monitoring_event;
|
||||
#ifdef PROFILE_ENABLE
|
||||
int profile;
|
||||
mcs_lock_node_t profile_lock;
|
||||
struct profile_event *profile_events;
|
||||
unsigned long profile_elapsed_ts;
|
||||
#endif // PROFILE_ENABLE
|
||||
int nr_processes; /* For partitioned execution */
|
||||
};
|
||||
|
||||
void hold_thread(struct thread *ftn);
|
||||
@ -496,6 +598,7 @@ struct thread {
|
||||
// PS_TRACED
|
||||
// PS_INTERRUPTIBLE
|
||||
// PS_UNINTERRUPTIBLE
|
||||
int exit_status;
|
||||
|
||||
// process vm
|
||||
struct process_vm *vm;
|
||||
@ -526,12 +629,19 @@ struct thread {
|
||||
fp_regs_struct *fp_regs;
|
||||
int in_syscall_offload;
|
||||
|
||||
#ifdef PROFILE_ENABLE
|
||||
int profile;
|
||||
struct profile_event *profile_events;
|
||||
unsigned long profile_start_ts;
|
||||
unsigned long profile_elapsed_ts;
|
||||
#endif // PROFILE_ENABLE
|
||||
|
||||
// signal
|
||||
struct sig_common *sigcommon;
|
||||
sigset_t sigmask;
|
||||
stack_t sigstack;
|
||||
struct list_head sigpending;
|
||||
ihk_spinlock_t sigpendinglock;
|
||||
mcs_rwlock_lock_t sigpendinglock;
|
||||
volatile int sigevent;
|
||||
|
||||
// gpio
|
||||
@ -544,9 +654,14 @@ struct thread {
|
||||
struct sig_pending *ptrace_sendsig;
|
||||
|
||||
// cpu time
|
||||
/*
|
||||
struct timespec stime;
|
||||
struct timespec utime;
|
||||
struct timespec btime;
|
||||
*/
|
||||
unsigned long system_tsc;
|
||||
unsigned long user_tsc;
|
||||
unsigned long base_tsc;
|
||||
int times_update;
|
||||
int in_kernel;
|
||||
|
||||
@ -556,8 +671,18 @@ struct thread {
|
||||
struct itimerval itimer_prof;
|
||||
struct timespec itimer_virtual_value;
|
||||
struct timespec itimer_prof_value;
|
||||
|
||||
/* Syscall offload wait queue head */
|
||||
struct waitq scd_wq;
|
||||
|
||||
int thread_offloaded;
|
||||
int mod_clone;
|
||||
struct uti_attr *mod_clone_arg;
|
||||
int parent_cpuid;
|
||||
};
|
||||
|
||||
#define VM_RANGE_CACHE_SIZE 4
|
||||
|
||||
struct process_vm {
|
||||
struct address_space *address_space;
|
||||
struct list_head vm_range_list;
|
||||
@ -580,6 +705,12 @@ struct process_vm {
|
||||
int exiting;
|
||||
|
||||
long currss;
|
||||
DECLARE_BITMAP(numa_mask, PROCESS_NUMA_MASK_BITS);
|
||||
int numa_mem_policy;
|
||||
/* Protected by memory_range_lock */
|
||||
struct list_head vm_range_numa_policy_list;
|
||||
struct vm_range *range_cache[VM_RANGE_CACHE_SIZE];
|
||||
int range_cache_ind;
|
||||
};
|
||||
|
||||
static inline int has_cap_ipc_lock(struct thread *th)
|
||||
@ -596,7 +727,8 @@ static inline int has_cap_sys_admin(struct thread *th)
|
||||
|
||||
void hold_address_space(struct address_space *);
|
||||
void release_address_space(struct address_space *);
|
||||
struct thread *create_thread(unsigned long user_pc);
|
||||
struct thread *create_thread(unsigned long user_pc,
|
||||
unsigned long *__cpu_set, size_t cpu_set_size);
|
||||
struct thread *clone_thread(struct thread *org, unsigned long pc,
|
||||
unsigned long sp, int clone_flags);
|
||||
void destroy_thread(struct thread *thread);
|
||||
@ -611,9 +743,10 @@ void free_process_memory_ranges(struct process_vm *vm);
|
||||
int populate_process_memory(struct process_vm *vm, void *start, size_t len);
|
||||
|
||||
int add_process_memory_range(struct process_vm *vm,
|
||||
unsigned long start, unsigned long end,
|
||||
unsigned long phys, unsigned long flag,
|
||||
struct memobj *memobj, off_t objoff, int pgshift);
|
||||
unsigned long start, unsigned long end,
|
||||
unsigned long phys, unsigned long flag,
|
||||
struct memobj *memobj, off_t offset,
|
||||
int pgshift, struct vm_range **rp);
|
||||
int remove_process_memory_range(struct process_vm *vm, unsigned long start,
|
||||
unsigned long end, int *ro_freedp);
|
||||
int split_process_memory_range(struct process_vm *vm,
|
||||
@ -647,15 +780,17 @@ int init_process_stack(struct thread *thread, struct program_load_desc *pn,
|
||||
int argc, char **argv,
|
||||
int envc, char **env);
|
||||
unsigned long extend_process_region(struct process_vm *vm,
|
||||
unsigned long start, unsigned long end,
|
||||
unsigned long address, unsigned long flag);
|
||||
unsigned long end_allocated,
|
||||
unsigned long address, unsigned long flag);
|
||||
extern enum ihk_mc_pt_attribute arch_vrflag_to_ptattr(unsigned long flag, uint64_t fault, pte_t *ptep);
|
||||
enum ihk_mc_pt_attribute common_vrflag_to_ptattr(unsigned long flag, uint64_t fault, pte_t *ptep);
|
||||
|
||||
void schedule(void);
|
||||
void spin_sleep_or_schedule(void);
|
||||
void runq_add_thread(struct thread *thread, int cpu_id);
|
||||
void runq_del_thread(struct thread *thread, int cpu_id);
|
||||
int sched_wakeup_thread(struct thread *thread, int valid_states);
|
||||
int sched_wakeup_thread_locked(struct thread *thread, int valid_states);
|
||||
|
||||
void sched_request_migrate(int cpu_id, struct thread *thread);
|
||||
void check_need_resched(void);
|
||||
@ -675,5 +810,6 @@ void chain_process(struct process *);
|
||||
void chain_thread(struct thread *);
|
||||
void proc_init();
|
||||
void set_timer();
|
||||
struct sig_pending *hassigpending(struct thread *thread);
|
||||
|
||||
#endif
|
||||
|
||||
66
kernel/include/profile.h
Normal file
66
kernel/include/profile.h
Normal file
@ -0,0 +1,66 @@
|
||||
#ifndef __PROCESS_PROFILE_H_
|
||||
#define __PROCESS_PROFILE_H_
|
||||
|
||||
/* Profiling is enabled by the define below; comment it out to disable profiling */
|
||||
#define PROFILE_ENABLE
|
||||
|
||||
#ifdef PROFILE_ENABLE
|
||||
#define PROFILE_SYSCALL_MAX 300
|
||||
#define PROFILE_OFFLOAD_MAX (PROFILE_SYSCALL_MAX << 1)
|
||||
#define PROFILE_EVENT_MIN PROFILE_OFFLOAD_MAX
|
||||
#define __NR_profile 701
|
||||
|
||||
#define PROF_JOB 0x40000000
|
||||
#define PROF_PROC 0x80000000
|
||||
#define PROF_CLEAR 0x01
|
||||
#define PROF_ON 0x02
|
||||
#define PROF_OFF 0x04
|
||||
#define PROF_PRINT 0x08
|
||||
|
||||
struct profile_event {
|
||||
uint32_t cnt;
|
||||
uint64_t tsc;
|
||||
};
|
||||
|
||||
/*
|
||||
* The layout of profile events is as follows:
|
||||
* [0,PROFILE_SYSCALL_MAX) - syscalls
|
||||
* [PROFILE_SYSCALL_MAX,PROFILE_OFFLOAD_MAX) - syscall offloads
|
||||
* [PROFILE_OFFLOAD_MAX,PROFILE_EVENT_MAX) - general events
|
||||
*
|
||||
* XXX: Make sure to fill in prof_event_names in profile.c
|
||||
* for each added profiled event.
|
||||
*/
|
||||
enum profile_event_type {
|
||||
PROFILE_tlb_invalidate = PROFILE_EVENT_MIN,
|
||||
PROFILE_page_fault,
|
||||
PROFILE_page_fault_anon_clr,
|
||||
PROFILE_page_fault_file,
|
||||
PROFILE_page_fault_dev_file,
|
||||
PROFILE_page_fault_file_clr,
|
||||
PROFILE_mpol_alloc_missed,
|
||||
PROFILE_mmap_anon_contig_phys,
|
||||
PROFILE_mmap_anon_no_contig_phys,
|
||||
PROFILE_mmap_regular_file,
|
||||
PROFILE_mmap_device_file,
|
||||
PROFILE_EVENT_MAX /* Should be the last event type */
|
||||
};
|
||||
|
||||
struct thread;
|
||||
struct process;
|
||||
|
||||
enum profile_event_type profile_syscall2offload(enum profile_event_type sc);
|
||||
void profile_event_add(enum profile_event_type type, uint64_t tsc);
|
||||
void profile_print_thread_stats(struct thread *thread);
|
||||
void profile_print_proc_stats(struct process *proc);
|
||||
void profile_print_job_stats(struct process *proc);
|
||||
void profile_accumulate_events(struct thread *thread, struct process *proc);
|
||||
int profile_accumulate_and_print_job_events(struct process *proc);
|
||||
int profile_alloc_events(struct thread *thread);
|
||||
void profile_dealloc_thread_events(struct thread *thread);
|
||||
void profile_dealloc_proc_events(struct process *proc);
|
||||
#endif // PROFILE_ENABLE
|
||||
|
||||
|
||||
|
||||
#endif // __PROCESS_PROFILE_H_
|
||||
109
kernel/include/rbtree.h
Normal file
109
kernel/include/rbtree.h
Normal file
@ -0,0 +1,109 @@
|
||||
/*
|
||||
Red Black Trees
|
||||
(C) 1999 Andrea Arcangeli <andrea@suse.de>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
linux/include/linux/rbtree.h
|
||||
|
||||
To use rbtrees you'll have to implement your own insert and search cores.
|
||||
This lets us avoid callbacks, which would reduce performance dramatically.
|
||||
I know it's not the cleanest way, but in C (not in C++) this is how to get
|
||||
performance and genericity...
|
||||
|
||||
See Documentation/rbtree.txt for documentation and samples.
|
||||
*/
|
||||
|
||||
#ifndef _LINUX_RBTREE_H
|
||||
#define _LINUX_RBTREE_H
|
||||
|
||||
#include <ihk/types.h>
|
||||
#include <lwk/compiler.h>
|
||||
#include <lwk/stddef.h>
|
||||
|
||||
struct rb_node {
|
||||
unsigned long __rb_parent_color;
|
||||
struct rb_node *rb_right;
|
||||
struct rb_node *rb_left;
|
||||
} __attribute__((aligned(sizeof(long))));
|
||||
/* The alignment might seem pointless, but allegedly CRIS needs it */
|
||||
|
||||
struct rb_root {
|
||||
struct rb_node *rb_node;
|
||||
};
|
||||
|
||||
|
||||
#define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3))
|
||||
|
||||
#define RB_ROOT (struct rb_root) { NULL, }
|
||||
#define rb_entry(ptr, type, member) container_of(ptr, type, member)
|
||||
|
||||
#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL)
|
||||
|
||||
/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */
|
||||
#define RB_EMPTY_NODE(node) \
|
||||
((node)->__rb_parent_color == (unsigned long)(node))
|
||||
#define RB_CLEAR_NODE(node) \
|
||||
((node)->__rb_parent_color = (unsigned long)(node))
|
||||
|
||||
|
||||
extern void rb_insert_color(struct rb_node *, struct rb_root *);
|
||||
extern void rb_erase(struct rb_node *, struct rb_root *);
|
||||
|
||||
|
||||
/* Find logical next and previous nodes in a tree */
|
||||
extern struct rb_node *rb_next(const struct rb_node *);
|
||||
extern struct rb_node *rb_prev(const struct rb_node *);
|
||||
extern struct rb_node *rb_first(const struct rb_root *);
|
||||
extern struct rb_node *rb_last(const struct rb_root *);
|
||||
|
||||
/* Postorder iteration - always visit the parent after its children */
|
||||
extern struct rb_node *rb_first_postorder(const struct rb_root *);
|
||||
extern struct rb_node *rb_next_postorder(const struct rb_node *);
|
||||
|
||||
/* Fast replacement of a single node without remove/rebalance/add/rebalance */
|
||||
extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
|
||||
struct rb_root *root);
|
||||
|
||||
static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
|
||||
struct rb_node ** rb_link)
|
||||
{
|
||||
node->__rb_parent_color = (unsigned long)parent;
|
||||
node->rb_left = node->rb_right = NULL;
|
||||
|
||||
*rb_link = node;
|
||||
}
|
||||
|
||||
#define rb_entry_safe(ptr, type, member) \
|
||||
({ typeof(ptr) ____ptr = (ptr); \
|
||||
____ptr ? rb_entry(____ptr, type, member) : NULL; \
|
||||
})
|
||||
|
||||
/**
|
||||
* rbtree_postorder_for_each_entry_safe - iterate over rb_root in post order of
|
||||
* given type safe against removal of rb_node entry
|
||||
*
|
||||
* @pos: the 'type *' to use as a loop cursor.
|
||||
* @n: another 'type *' to use as temporary storage
|
||||
* @root: 'rb_root *' of the rbtree.
|
||||
* @field: the name of the rb_node field within 'type'.
|
||||
*/
|
||||
#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \
|
||||
for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \
|
||||
pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \
|
||||
typeof(*pos), field); 1; }); \
|
||||
pos = n)
|
||||
|
||||
#endif /* _LINUX_RBTREE_H */
|
||||
231
kernel/include/rbtree_augmented.h
Normal file
231
kernel/include/rbtree_augmented.h
Normal file
@ -0,0 +1,231 @@
|
||||
/*
|
||||
Red Black Trees
|
||||
(C) 1999 Andrea Arcangeli <andrea@suse.de>
|
||||
(C) 2002 David Woodhouse <dwmw2@infradead.org>
|
||||
(C) 2012 Michel Lespinasse <walken@google.com>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
linux/include/linux/rbtree_augmented.h
|
||||
*/
|
||||
|
||||
#ifndef _LINUX_RBTREE_AUGMENTED_H
|
||||
#define _LINUX_RBTREE_AUGMENTED_H
|
||||
|
||||
#include <rbtree.h>
|
||||
|
||||
/*
|
||||
* Please note - only struct rb_augment_callbacks and the prototypes for
|
||||
* rb_insert_augmented() and rb_erase_augmented() are intended to be public.
|
||||
* The rest are implementation details you are not expected to depend on.
|
||||
*
|
||||
* See Documentation/rbtree.txt for documentation and samples.
|
||||
*/
|
||||
|
||||
struct rb_augment_callbacks {
|
||||
void (*propagate)(struct rb_node *node, struct rb_node *stop);
|
||||
void (*copy)(struct rb_node *old, struct rb_node *new);
|
||||
void (*rotate)(struct rb_node *old, struct rb_node *new);
|
||||
};
|
||||
|
||||
extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
|
||||
void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
|
||||
static inline void
|
||||
rb_insert_augmented(struct rb_node *node, struct rb_root *root,
|
||||
const struct rb_augment_callbacks *augment)
|
||||
{
|
||||
__rb_insert_augmented(node, root, augment->rotate);
|
||||
}
|
||||
|
||||
#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \
|
||||
rbtype, rbaugmented, rbcompute) \
|
||||
static inline void \
|
||||
rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \
|
||||
{ \
|
||||
while (rb != stop) { \
|
||||
rbstruct *node = rb_entry(rb, rbstruct, rbfield); \
|
||||
rbtype augmented = rbcompute(node); \
|
||||
if (node->rbaugmented == augmented) \
|
||||
break; \
|
||||
node->rbaugmented = augmented; \
|
||||
rb = rb_parent(&node->rbfield); \
|
||||
} \
|
||||
} \
|
||||
static inline void \
|
||||
rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \
|
||||
{ \
|
||||
rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \
|
||||
rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \
|
||||
new->rbaugmented = old->rbaugmented; \
|
||||
} \
|
||||
static void \
|
||||
rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \
|
||||
{ \
|
||||
rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \
|
||||
rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \
|
||||
new->rbaugmented = old->rbaugmented; \
|
||||
old->rbaugmented = rbcompute(old); \
|
||||
} \
|
||||
rbstatic const struct rb_augment_callbacks rbname = { \
|
||||
rbname ## _propagate, rbname ## _copy, rbname ## _rotate \
|
||||
};
|
||||
|
||||
|
||||
#define RB_RED 0
|
||||
#define RB_BLACK 1
|
||||
|
||||
/*
 * Extract the parent pointer from a packed parent/colour word by masking
 * off the two low bits (bit 0 carries the colour, see __rb_color()).
 * The argument is parenthesised so that compound expressions, e.g. a
 * conditional expression, are masked as a whole rather than being split
 * by operator precedence.
 */
#define __rb_parent(pc)    ((struct rb_node *)((pc) & ~3))
|
||||
|
||||
#define __rb_color(pc) ((pc) & 1)
|
||||
#define __rb_is_black(pc) __rb_color(pc)
|
||||
#define __rb_is_red(pc) (!__rb_color(pc))
|
||||
#define rb_color(rb) __rb_color((rb)->__rb_parent_color)
|
||||
#define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color)
|
||||
#define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color)
|
||||
|
||||
static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
|
||||
{
|
||||
rb->__rb_parent_color = rb_color(rb) | (unsigned long)p;
|
||||
}
|
||||
|
||||
static inline void rb_set_parent_color(struct rb_node *rb,
|
||||
struct rb_node *p, int color)
|
||||
{
|
||||
rb->__rb_parent_color = (unsigned long)p | color;
|
||||
}
|
||||
|
||||
static inline void
|
||||
__rb_change_child(struct rb_node *old, struct rb_node *new,
|
||||
struct rb_node *parent, struct rb_root *root)
|
||||
{
|
||||
if (parent) {
|
||||
if (parent->rb_left == old)
|
||||
parent->rb_left = new;
|
||||
else
|
||||
parent->rb_right = new;
|
||||
} else
|
||||
root->rb_node = new;
|
||||
}
|
||||
|
||||
extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
|
||||
void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
|
||||
|
||||
static __always_inline struct rb_node *
|
||||
__rb_erase_augmented(struct rb_node *node, struct rb_root *root,
|
||||
const struct rb_augment_callbacks *augment)
|
||||
{
|
||||
struct rb_node *child = node->rb_right, *tmp = node->rb_left;
|
||||
struct rb_node *parent, *rebalance;
|
||||
unsigned long pc;
|
||||
|
||||
if (!tmp) {
|
||||
/*
|
||||
* Case 1: node to erase has no more than 1 child (easy!)
|
||||
*
|
||||
* Note that if there is one child it must be red due to 5)
|
||||
* and node must be black due to 4). We adjust colors locally
|
||||
* so as to bypass __rb_erase_color() later on.
|
||||
*/
|
||||
pc = node->__rb_parent_color;
|
||||
parent = __rb_parent(pc);
|
||||
__rb_change_child(node, child, parent, root);
|
||||
if (child) {
|
||||
child->__rb_parent_color = pc;
|
||||
rebalance = NULL;
|
||||
} else
|
||||
rebalance = __rb_is_black(pc) ? parent : NULL;
|
||||
tmp = parent;
|
||||
} else if (!child) {
|
||||
/* Still case 1, but this time the child is node->rb_left */
|
||||
tmp->__rb_parent_color = pc = node->__rb_parent_color;
|
||||
parent = __rb_parent(pc);
|
||||
__rb_change_child(node, tmp, parent, root);
|
||||
rebalance = NULL;
|
||||
tmp = parent;
|
||||
} else {
|
||||
struct rb_node *successor = child, *child2;
|
||||
tmp = child->rb_left;
|
||||
if (!tmp) {
|
||||
/*
|
||||
* Case 2: node's successor is its right child
|
||||
*
|
||||
* (n) (s)
|
||||
* / \ / \
|
||||
* (x) (s) -> (x) (c)
|
||||
* \
|
||||
* (c)
|
||||
*/
|
||||
parent = successor;
|
||||
child2 = successor->rb_right;
|
||||
augment->copy(node, successor);
|
||||
} else {
|
||||
/*
|
||||
* Case 3: node's successor is leftmost under
|
||||
* node's right child subtree
|
||||
*
|
||||
* (n) (s)
|
||||
* / \ / \
|
||||
* (x) (y) -> (x) (y)
|
||||
* / /
|
||||
* (p) (p)
|
||||
* / /
|
||||
* (s) (c)
|
||||
* \
|
||||
* (c)
|
||||
*/
|
||||
do {
|
||||
parent = successor;
|
||||
successor = tmp;
|
||||
tmp = tmp->rb_left;
|
||||
} while (tmp);
|
||||
parent->rb_left = child2 = successor->rb_right;
|
||||
successor->rb_right = child;
|
||||
rb_set_parent(child, successor);
|
||||
augment->copy(node, successor);
|
||||
augment->propagate(parent, successor);
|
||||
}
|
||||
|
||||
successor->rb_left = tmp = node->rb_left;
|
||||
rb_set_parent(tmp, successor);
|
||||
|
||||
pc = node->__rb_parent_color;
|
||||
tmp = __rb_parent(pc);
|
||||
__rb_change_child(node, successor, tmp, root);
|
||||
if (child2) {
|
||||
successor->__rb_parent_color = pc;
|
||||
rb_set_parent_color(child2, parent, RB_BLACK);
|
||||
rebalance = NULL;
|
||||
} else {
|
||||
unsigned long pc2 = successor->__rb_parent_color;
|
||||
successor->__rb_parent_color = pc;
|
||||
rebalance = __rb_is_black(pc2) ? parent : NULL;
|
||||
}
|
||||
tmp = successor;
|
||||
}
|
||||
|
||||
augment->propagate(tmp, NULL);
|
||||
return rebalance;
|
||||
}
|
||||
|
||||
static __always_inline void
|
||||
rb_erase_augmented(struct rb_node *node, struct rb_root *root,
|
||||
const struct rb_augment_callbacks *augment)
|
||||
{
|
||||
struct rb_node *rebalance = __rb_erase_augmented(node, root, augment);
|
||||
if (rebalance)
|
||||
__rb_erase_color(rebalance, root, augment->rotate);
|
||||
}
|
||||
|
||||
#endif /* _LINUX_RBTREE_AUGMENTED_H */
|
||||
197
kernel/include/rusage.h
Normal file
197
kernel/include/rusage.h
Normal file
@ -0,0 +1,197 @@
|
||||
#ifndef __RUSAGE_H
|
||||
#define __RUSAGE_H
|
||||
|
||||
#include <config.h>
|
||||
#include <ihk/rusage.h>
|
||||
|
||||
#ifdef ENABLE_RUSAGE
|
||||
#define RUSAGE_MEM_LIMIT (2 * 1024 * 1024) // 2MB
|
||||
|
||||
extern void eventfd();
|
||||
|
||||
static inline void
|
||||
rusage_total_memory_add(unsigned long size)
|
||||
{
|
||||
monitor->rusage_total_memory += size;
|
||||
}
|
||||
|
||||
static inline void
|
||||
rusage_rss_add(unsigned long size)
|
||||
{
|
||||
unsigned long newval;
|
||||
unsigned long oldval;
|
||||
unsigned long retval;
|
||||
|
||||
newval = __sync_add_and_fetch(&monitor->rusage_rss_current, size);
|
||||
oldval = monitor->rusage_rss_max;
|
||||
while (newval > oldval) {
|
||||
retval = __sync_val_compare_and_swap(&monitor->rusage_rss_max,
|
||||
oldval, newval);
|
||||
if (retval == oldval) {
|
||||
break;
|
||||
}
|
||||
oldval = retval;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void
|
||||
rusage_rss_sub(unsigned long size)
|
||||
{
|
||||
__sync_sub_and_fetch(&monitor->rusage_rss_current, size);
|
||||
}
|
||||
|
||||
static inline void
|
||||
rusage_kmem_add(unsigned long size)
|
||||
{
|
||||
unsigned long newval;
|
||||
unsigned long oldval;
|
||||
unsigned long retval;
|
||||
|
||||
newval = __sync_add_and_fetch(&monitor->rusage_kmem_usage, size);
|
||||
oldval = monitor->rusage_kmem_max_usage;
|
||||
while (newval > oldval) {
|
||||
retval = __sync_val_compare_and_swap(
|
||||
&monitor->rusage_kmem_max_usage,
|
||||
oldval, newval);
|
||||
if (retval == oldval) {
|
||||
break;
|
||||
}
|
||||
oldval = retval;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void
|
||||
rusage_kmem_sub(unsigned long size)
|
||||
{
|
||||
__sync_sub_and_fetch(&monitor->rusage_kmem_usage, size);
|
||||
}
|
||||
|
||||
static inline void
|
||||
rusage_numa_add(int numa_id, unsigned long size)
|
||||
{
|
||||
__sync_add_and_fetch(monitor->rusage_numa_stat + numa_id, size);
|
||||
rusage_rss_add(size);
|
||||
}
|
||||
|
||||
static inline void
|
||||
rusage_numa_sub(int numa_id, unsigned long size)
|
||||
{
|
||||
rusage_rss_sub(size);
|
||||
__sync_sub_and_fetch(monitor->rusage_numa_stat + numa_id, size);
|
||||
}
|
||||
|
||||
static inline void
|
||||
rusage_page_add(int numa_id, unsigned long pages, int is_user)
|
||||
{
|
||||
unsigned long size = pages * PAGE_SIZE;
|
||||
unsigned long newval;
|
||||
unsigned long oldval;
|
||||
unsigned long retval;
|
||||
|
||||
if (is_user)
|
||||
rusage_numa_add(numa_id, size);
|
||||
else
|
||||
rusage_kmem_add(size);
|
||||
|
||||
newval = __sync_add_and_fetch(&monitor->rusage_total_memory_usage, size);
|
||||
oldval = monitor->rusage_total_memory_max_usage;
|
||||
while (newval > oldval) {
|
||||
retval = __sync_val_compare_and_swap(&monitor->rusage_total_memory_max_usage,
|
||||
oldval, newval);
|
||||
if (retval == oldval) {
|
||||
if (monitor->rusage_total_memory - newval <
|
||||
RUSAGE_MEM_LIMIT) {
|
||||
eventfd();
|
||||
}
|
||||
break;
|
||||
}
|
||||
oldval = retval;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void
|
||||
rusage_page_sub(int numa_id, unsigned long pages, int is_user)
|
||||
{
|
||||
unsigned long size = pages * PAGE_SIZE;
|
||||
|
||||
__sync_sub_and_fetch(&monitor->rusage_total_memory_usage, size);
|
||||
|
||||
if (is_user)
|
||||
rusage_numa_sub(numa_id, size);
|
||||
else
|
||||
rusage_kmem_sub(size);
|
||||
}
|
||||
|
||||
static inline void
|
||||
rusage_num_threads_inc()
|
||||
{
|
||||
unsigned long newval;
|
||||
unsigned long oldval;
|
||||
unsigned long retval;
|
||||
|
||||
newval = __sync_add_and_fetch(&monitor->rusage_num_threads, 1);
|
||||
oldval = monitor->rusage_max_num_threads;
|
||||
while (newval > oldval) {
|
||||
retval = __sync_val_compare_and_swap(&monitor->
|
||||
rusage_max_num_threads,
|
||||
oldval, newval);
|
||||
if (retval == oldval) {
|
||||
break;
|
||||
}
|
||||
oldval = retval;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void
|
||||
rusage_num_threads_dec()
|
||||
{
|
||||
__sync_sub_and_fetch(&monitor->rusage_num_threads, 1);
|
||||
}
|
||||
#else
|
||||
static inline void
|
||||
rusage_total_memory_add(unsigned long size)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void
|
||||
rusage_rss_add(unsigned long size)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void
|
||||
rusage_rss_sub(unsigned long size)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void
|
||||
rusage_numa_add(int numa_id, unsigned long size)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void
|
||||
rusage_numa_sub(int numa_id, unsigned long size)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void
|
||||
rusage_page_add(int numa_id, unsigned long size, int is_user)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void
|
||||
rusage_page_sub(int numa_id, unsigned long size, int is_user)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void
|
||||
rusage_num_threads_inc()
|
||||
{
|
||||
}
|
||||
|
||||
static inline void
|
||||
rusage_num_threads_dec()
|
||||
{
|
||||
}
|
||||
#endif // ENABLE_RUSAGE
|
||||
|
||||
#endif
|
||||
@ -31,6 +31,7 @@
|
||||
#define SCD_MSG_PREPARE_PROCESS_ACKED 0x2
|
||||
#define SCD_MSG_PREPARE_PROCESS_NACKED 0x7
|
||||
#define SCD_MSG_SCHEDULE_PROCESS 0x3
|
||||
#define SCD_MSG_WAKE_UP_SYSCALL_THREAD 0x14
|
||||
|
||||
#define SCD_MSG_INIT_CHANNEL 0x5
|
||||
#define SCD_MSG_INIT_CHANNEL_ACKED 0x6
|
||||
@ -72,6 +73,13 @@
|
||||
/* #define SCD_MSG_SYSFS_RESP_CLEANUP 0x43 */
|
||||
#define SCD_MSG_PROCFS_TID_CREATE 0x44
|
||||
#define SCD_MSG_PROCFS_TID_DELETE 0x45
|
||||
#define SCD_MSG_EVENTFD 0x46
|
||||
|
||||
#define SCD_MSG_PERF_CTRL 0x50
|
||||
#define SCD_MSG_PERF_ACK 0x51
|
||||
|
||||
#define SCD_MSG_CPU_RW_REG 0x52
|
||||
#define SCD_MSG_CPU_RW_REG_RESP 0x53
|
||||
|
||||
/* Cloning flags. */
|
||||
# define CSIGNAL 0x000000ff /* Signal mask to be sent at exit. */
|
||||
@ -117,28 +125,6 @@ struct user_desc {
|
||||
unsigned int lm:1;
|
||||
};
|
||||
|
||||
struct ikc_scd_packet {
|
||||
int msg;
|
||||
int err;
|
||||
union {
|
||||
/* for traditional SCD_MSG_* */
|
||||
struct {
|
||||
int ref;
|
||||
int osnum;
|
||||
int pid;
|
||||
int padding;
|
||||
unsigned long arg;
|
||||
};
|
||||
|
||||
/* for SCD_MSG_SYSFS_* */
|
||||
struct {
|
||||
long sysfs_arg1;
|
||||
long sysfs_arg2;
|
||||
long sysfs_arg3;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
struct program_image_section {
|
||||
unsigned long vaddr;
|
||||
unsigned long len;
|
||||
@ -170,6 +156,15 @@ struct program_image_section {
|
||||
#define MCK_RLIMIT_SIGPENDING 14
|
||||
#define MCK_RLIMIT_STACK 15
|
||||
|
||||
#define PLD_CPU_SET_MAX_CPUS 1024
|
||||
typedef unsigned long __cpu_set_unit;
|
||||
#define PLD_CPU_SET_SIZE (PLD_CPU_SET_MAX_CPUS / (8 * sizeof(__cpu_set_unit)))
|
||||
|
||||
#define MPOL_NO_HEAP 0x01
|
||||
#define MPOL_NO_STACK 0x02
|
||||
#define MPOL_NO_BSS 0x04
|
||||
#define MPOL_SHM_PREMAP 0x08
|
||||
|
||||
struct program_load_desc {
|
||||
int num_sections;
|
||||
int status;
|
||||
@ -198,7 +193,13 @@ struct program_load_desc {
|
||||
unsigned long envs_len;
|
||||
struct rlimit rlimit[MCK_RLIM_MAX];
|
||||
unsigned long interp_align;
|
||||
unsigned long mpol_flags;
|
||||
unsigned long mpol_threshold;
|
||||
unsigned long heap_extension;
|
||||
int nr_processes;
|
||||
char shell_path[SHELL_PATH_MAX_LEN];
|
||||
__cpu_set_unit cpu_set[PLD_CPU_SET_SIZE];
|
||||
int profile;
|
||||
struct program_image_section sections[0];
|
||||
};
|
||||
|
||||
@ -210,13 +211,77 @@ struct ikc_scd_init_param {
|
||||
};
|
||||
|
||||
struct syscall_request {
|
||||
/* TID of requesting thread */
|
||||
int rtid;
|
||||
/*
|
||||
* TID of target thread. Remote page fault response needs to designate the
|
||||
* thread that must serve the request, 0 indicates any thread from the pool
|
||||
*/
|
||||
int ttid;
|
||||
unsigned long valid;
|
||||
unsigned long number;
|
||||
unsigned long args[6];
|
||||
};
|
||||
|
||||
struct ihk_os_cpu_register {
|
||||
unsigned long addr;
|
||||
unsigned long val;
|
||||
unsigned long addr_ext;
|
||||
};
|
||||
|
||||
enum mcctrl_os_cpu_operation {
|
||||
MCCTRL_OS_CPU_READ_REGISTER,
|
||||
MCCTRL_OS_CPU_WRITE_REGISTER,
|
||||
MCCTRL_OS_CPU_MAX_OP
|
||||
};
|
||||
|
||||
struct ikc_scd_packet {
|
||||
int msg;
|
||||
int err;
|
||||
union {
|
||||
/* for traditional SCD_MSG_* */
|
||||
struct {
|
||||
int ref;
|
||||
int osnum;
|
||||
int pid;
|
||||
unsigned long arg;
|
||||
struct syscall_request req;
|
||||
unsigned long resp_pa;
|
||||
};
|
||||
|
||||
/* for SCD_MSG_SYSFS_* */
|
||||
struct {
|
||||
long sysfs_arg1;
|
||||
long sysfs_arg2;
|
||||
long sysfs_arg3;
|
||||
};
|
||||
|
||||
/* SCD_MSG_SCHEDULE_THREAD */
|
||||
struct {
|
||||
int ttid;
|
||||
};
|
||||
|
||||
/* SCD_MSG_CPU_RW_REG */
|
||||
struct {
|
||||
struct ihk_os_cpu_register desc;
|
||||
enum mcctrl_os_cpu_operation op;
|
||||
void *resp;
|
||||
};
|
||||
};
|
||||
char padding[12];
|
||||
};
|
||||
|
||||
#define IHK_SCD_REQ_THREAD_SPINNING 0
|
||||
#define IHK_SCD_REQ_THREAD_TO_BE_WOKEN 1
|
||||
#define IHK_SCD_REQ_THREAD_DESCHEDULED 2
|
||||
|
||||
struct syscall_response {
|
||||
/* TID of the thread that requested the service */
|
||||
int ttid;
|
||||
/* TID of the mcexec thread that is serving the request */
|
||||
int stid;
|
||||
unsigned long status;
|
||||
unsigned long req_thread_status;
|
||||
long ret;
|
||||
unsigned long fault_address;
|
||||
unsigned long fault_reason;
|
||||
@ -226,22 +291,6 @@ struct syscall_post {
|
||||
unsigned long v[8];
|
||||
};
|
||||
|
||||
struct syscall_params {
|
||||
unsigned long request_rpa, request_pa;
|
||||
struct syscall_request *request_va;
|
||||
unsigned long response_pa;
|
||||
struct syscall_response *response_va;
|
||||
|
||||
unsigned long doorbell_rpa, doorbell_pa;
|
||||
unsigned long *doorbell_va;
|
||||
|
||||
unsigned int post_idx;
|
||||
unsigned long post_rpa, post_pa;
|
||||
struct syscall_post *post_va;
|
||||
unsigned long post_fin;
|
||||
struct syscall_post post_buf IHK_DMA_ALIGN;
|
||||
};
|
||||
|
||||
#define SYSCALL_DECLARE(name) long sys_##name(int n, ihk_mc_user_context_t *ctx)
|
||||
#define SYSCALL_HEADER struct syscall_request request IHK_DMA_ALIGN; \
|
||||
request.number = n
|
||||
@ -307,7 +356,7 @@ void delete_proc_procfs_files(int pid);
|
||||
void create_os_procfs_files(void);
|
||||
void delete_os_procfs_files(void);
|
||||
|
||||
#define PROCFS_NAME_MAX 1000
|
||||
#define PROCFS_NAME_MAX 768
|
||||
|
||||
struct procfs_read {
|
||||
unsigned long pbuf; /* physical address of the host buffer (request) */
|
||||
@ -361,8 +410,37 @@ struct tod_data_s {
|
||||
};
|
||||
extern struct tod_data_s tod_data; /* residing in arch-dependent file */
|
||||
|
||||
static inline void tsc_to_ts(unsigned long tsc, struct timespec *ts)
|
||||
{
|
||||
time_t sec_delta;
|
||||
long ns_delta;
|
||||
|
||||
sec_delta = tsc / tod_data.clocks_per_sec;
|
||||
ns_delta = NS_PER_SEC * (tsc % tod_data.clocks_per_sec)
|
||||
/ tod_data.clocks_per_sec;
|
||||
/* calc. of ns_delta overflows if clocks_per_sec exceeds 18.44 GHz */
|
||||
|
||||
ts->tv_sec = sec_delta;
|
||||
ts->tv_nsec = ns_delta;
|
||||
if (ts->tv_nsec >= NS_PER_SEC) {
|
||||
ts->tv_nsec -= NS_PER_SEC;
|
||||
++ts->tv_sec;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Convert @ats to a tick count: 100 ticks per second plus one tick per
 * 10000 microseconds (the remainder is truncated).
 */
static inline unsigned long timeval_to_jiffy(const struct timeval *ats)
{
	unsigned long ticks = ats->tv_sec * 100;

	ticks += ats->tv_usec / 10000;
	return ticks;
}
|
||||
|
||||
/*
 * Convert @ats to a tick count: 100 ticks per second plus one tick per
 * 10000000 nanoseconds (the remainder is truncated).
 */
static inline unsigned long timespec_to_jiffy(const struct timespec *ats)
{
	unsigned long ticks = ats->tv_sec * 100;

	ticks += ats->tv_nsec / 10000000;
	return ticks;
}
|
||||
|
||||
void reset_cputime();
|
||||
void set_cputime(int mode);
|
||||
int do_munmap(void *addr, size_t len);
|
||||
intptr_t do_mmap(intptr_t addr0, size_t len0, int prot, int flags, int fd,
|
||||
off_t off0);
|
||||
void clear_host_pte(uintptr_t addr, size_t len);
|
||||
@ -371,6 +449,8 @@ int do_shmget(key_t key, size_t size, int shmflg);
|
||||
struct process_vm;
|
||||
int arch_map_vdso(struct process_vm *vm); /* arch dependent */
|
||||
int arch_setup_vdso(void);
|
||||
int arch_cpu_read_write_register(struct ihk_os_cpu_register *desc,
|
||||
enum mcctrl_os_cpu_operation op);
|
||||
|
||||
#define VDSO_MAXPAGES 2
|
||||
struct vdso {
|
||||
@ -407,4 +487,64 @@ struct get_cpu_mapping_req {
|
||||
#endif
|
||||
};
|
||||
|
||||
enum perf_ctrl_type {
|
||||
PERF_CTRL_SET,
|
||||
PERF_CTRL_GET,
|
||||
PERF_CTRL_ENABLE,
|
||||
PERF_CTRL_DISABLE,
|
||||
};
|
||||
|
||||
struct perf_ctrl_desc {
|
||||
enum perf_ctrl_type ctrl_type;
|
||||
int status;
|
||||
union {
|
||||
/* for SET, GET */
|
||||
struct {
|
||||
unsigned int target_cntr;
|
||||
unsigned long config;
|
||||
unsigned long read_value;
|
||||
unsigned disabled :1,
|
||||
pinned :1,
|
||||
exclude_user :1,
|
||||
exclude_kernel :1,
|
||||
exclude_hv :1,
|
||||
exclude_idle :1;
|
||||
};
|
||||
|
||||
/* for ENABLE, DISABLE */
|
||||
struct {
|
||||
unsigned long target_cntr_mask;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
#define UTI_FLAG_NUMA_SET (1ULL<<1) /* Indicates NUMA_SET is specified */
|
||||
|
||||
#define UTI_FLAG_SAME_NUMA_DOMAIN (1ULL<<2)
|
||||
#define UTI_FLAG_DIFFERENT_NUMA_DOMAIN (1ULL<<3)
|
||||
|
||||
#define UTI_FLAG_SAME_L1 (1ULL<<4)
|
||||
#define UTI_FLAG_SAME_L2 (1ULL<<5)
|
||||
#define UTI_FLAG_SAME_L3 (1ULL<<6)
|
||||
|
||||
#define UTI_FLAG_DIFFERENT_L1 (1ULL<<7)
|
||||
#define UTI_FLAG_DIFFERENT_L2 (1ULL<<8)
|
||||
#define UTI_FLAG_DIFFERENT_L3 (1ULL<<9)
|
||||
|
||||
#define UTI_FLAG_EXCLUSIVE_CPU (1ULL<<10)
|
||||
#define UTI_FLAG_CPU_INTENSIVE (1ULL<<11)
|
||||
#define UTI_FLAG_HIGH_PRIORITY (1ULL<<12)
|
||||
#define UTI_FLAG_NON_COOPERATIVE (1ULL<<13)
|
||||
|
||||
/* Linux default value is used */
|
||||
#define UTI_MAX_NUMA_DOMAINS (1024)
|
||||
|
||||
typedef struct uti_attr {
|
||||
/* The UTI_CPU_SET environment variable is used to denote the preferred
|
||||
location of utility thread */
|
||||
uint64_t numa_set[(UTI_MAX_NUMA_DOMAINS + sizeof(uint64_t) * 8 - 1) /
|
||||
(sizeof(uint64_t) * 8)];
|
||||
uint64_t flags; /* Representing location and behavior hints by bitmap */
|
||||
} uti_attr_t;
|
||||
|
||||
#endif
|
||||
|
||||
@ -27,6 +27,8 @@ typedef int (*waitq_func_t)(struct waitq_entry *wait, unsigned mode,
|
||||
|
||||
int default_wake_function(struct waitq_entry *wait, unsigned mode, int flags,
|
||||
void *key);
|
||||
int locked_wake_function(struct waitq_entry *wait, unsigned mode, int flags,
|
||||
void *key);
|
||||
|
||||
typedef struct waitq {
|
||||
ihk_spinlock_t lock;
|
||||
@ -57,6 +59,13 @@ typedef struct waitq_entry {
|
||||
.link = { &(name).link, &(name).link } \
|
||||
}
|
||||
|
||||
#define DECLARE_WAITQ_ENTRY_LOCKED(name, tsk) \
|
||||
waitq_entry_t name = { \
|
||||
.private = tsk, \
|
||||
.func = locked_wake_function, \
|
||||
.link = { &(name).link, &(name).link } \
|
||||
}
|
||||
|
||||
extern void waitq_init(waitq_t *waitq);
|
||||
extern void waitq_init_entry(waitq_entry_t *entry, struct thread *proc);
|
||||
extern int waitq_active(waitq_t *waitq);
|
||||
|
||||
26
kernel/include/xpmem.h
Normal file
26
kernel/include/xpmem.h
Normal file
@ -0,0 +1,26 @@
|
||||
/**
|
||||
* \file xpmem.h
|
||||
* License details are found in the file LICENSE.
|
||||
* \brief
|
||||
* Structures and functions of xpmem
|
||||
*/
|
||||
/*
|
||||
* HISTORY
|
||||
*/
|
||||
|
||||
#ifndef _XPMEM_H
|
||||
#define _XPMEM_H
|
||||
|
||||
#include <process.h>
|
||||
#include <ihk/context.h>
|
||||
|
||||
#define XPMEM_DEV_PATH "/dev/xpmem"
|
||||
|
||||
extern int xpmem_open(ihk_mc_user_context_t *ctx);
|
||||
extern int xpmem_remove_process_memory_range(struct process_vm *vm,
|
||||
struct vm_range *vmr);
|
||||
extern int xpmem_fault_process_memory_range(struct process_vm *vm,
|
||||
struct vm_range *vmr, unsigned long vaddr, uint64_t reason);
|
||||
|
||||
#endif /* _XPMEM_H */
|
||||
|
||||
490
kernel/include/xpmem_private.h
Normal file
490
kernel/include/xpmem_private.h
Normal file
@ -0,0 +1,490 @@
|
||||
/**
|
||||
* \file xpmem_private.h
|
||||
* License details are found in the file LICENSE.
|
||||
* \brief
|
||||
* Private Cross Partition Memory (XPMEM) structures and macros.
|
||||
*/
|
||||
/*
|
||||
* This file is subject to the terms and conditions of the GNU General Public
|
||||
* License. See the file "COPYING" in the main directory of this archive
|
||||
* for more details.
|
||||
*
|
||||
* Copyright (c) 2004-2007 Silicon Graphics, Inc. All Rights Reserved.
|
||||
* Copyright 2009, 2010, 2014 Cray Inc. All Rights Reserved
|
||||
* Copyright (c) 2014-2016 Los Alamos National Security, LCC. All rights
|
||||
* reserved.
|
||||
*/
|
||||
/*
|
||||
* HISTORY
|
||||
*/
|
||||
|
||||
#ifndef _XPMEM_PRIVATE_H
|
||||
#define _XPMEM_PRIVATE_H
|
||||
|
||||
#include <mc_xpmem.h>
|
||||
#include <xpmem.h>
|
||||
|
||||
#define XPMEM_CURRENT_VERSION 0x00026003
|
||||
|
||||
//#define DEBUG_PRINT_XPMEM
|
||||
|
||||
#ifdef DEBUG_PRINT_XPMEM
|
||||
#define dkprintf(...) kprintf(__VA_ARGS__)
|
||||
#define ekprintf(...) kprintf(__VA_ARGS__)
|
||||
#define XPMEM_DEBUG(format, a...) kprintf("[%d] %s: "format"\n", cpu_local_var(current)->proc->rgid, __func__, ##a)
|
||||
#else
|
||||
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
|
||||
#define ekprintf(...) kprintf(__VA_ARGS__)
|
||||
#define XPMEM_DEBUG(format, a...) do { if (0) kprintf("\n"); } while (0)
|
||||
#endif
|
||||
|
||||
//#define USE_DBUG_ON
|
||||
|
||||
#ifdef USE_DBUG_ON
|
||||
#define DBUG_ON(condition) do { if (condition) kprintf("[%d] BUG: func=%s\n", cpu_local_var(current)->proc->rgid, __func__); } while (0)
|
||||
#else
|
||||
#define DBUG_ON(condition)
|
||||
#endif
|
||||
|
||||
#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK)
|
||||
|
||||
#define min(x, y) ({ \
|
||||
__typeof__(x) _min1 = (x); \
|
||||
__typeof__(y) _min2 = (y); \
|
||||
(void) (&_min1 == &_min2); \
|
||||
_min1 < _min2 ? _min1 : _min2;})
|
||||
|
||||
#define max(x, y) ({ \
|
||||
__typeof__(x) _max1 = (x); \
|
||||
__typeof__(y) _max2 = (y); \
|
||||
(void) (&_max1 == &_max2); \
|
||||
_max1 > _max2 ? _max1 : _max2;})
|
||||
|
||||
/* Largest errno magnitude that is encoded in a pointer value. */
#define MAX_ERRNO	4095

/* True when @x lies within the top MAX_ERRNO values of unsigned long. */
#define IS_ERR_VALUE(x) ((x) >= (unsigned long)-MAX_ERRNO)

/* Encode the (negative) error code @error as a pointer value. */
static inline void * ERR_PTR(long error)
{
	void *encoded = (void *)error;

	return encoded;
}

/* Read the error code back out of a pointer produced by ERR_PTR(). */
static inline long PTR_ERR(const void *ptr)
{
	long code = (long)ptr;

	return code;
}

/* Non-zero when @ptr holds an encoded error code rather than an address. */
static inline long IS_ERR(const void *ptr)
{
	unsigned long raw = (unsigned long)ptr;

	return IS_ERR_VALUE(raw);
}

/* Non-zero when @ptr is NULL or holds an encoded error code. */
static inline long IS_ERR_OR_NULL(const void *ptr)
{
	if (!ptr)
		return 1;

	return IS_ERR_VALUE((unsigned long)ptr);
}
|
||||
|
||||
/*
|
||||
* Both the xpmem_segid_t and xpmem_apid_t are of type __s64 and designed
|
||||
* to be opaque to the user. Both consist of the same underlying fields.
|
||||
*
|
||||
* The 'uniq' field is designed to give each segid or apid a unique value.
|
||||
* Each type is only unique with respect to itself.
|
||||
*
|
||||
* An ID is never less than or equal to zero.
|
||||
*/
|
||||
struct xpmem_id {
|
||||
pid_t tgid; /* thread group that owns ID */
|
||||
unsigned int uniq; /* this value makes the ID unique */
|
||||
};
|
||||
|
||||
typedef union {
|
||||
struct xpmem_id xpmem_id;
|
||||
xpmem_segid_t segid;
|
||||
xpmem_apid_t apid;
|
||||
} xpmem_id_t;
|
||||
|
||||
/* Shift INT_MAX by one so we can tell when we overflow. */
|
||||
#define XPMEM_MAX_UNIQ_ID (INT_MAX >> 1)
|
||||
|
||||
static inline pid_t xpmem_segid_to_tgid(xpmem_segid_t segid)
|
||||
{
|
||||
DBUG_ON(segid <= 0);
|
||||
return ((xpmem_id_t *)&segid)->xpmem_id.tgid;
|
||||
}
|
||||
|
||||
static inline pid_t xpmem_apid_to_tgid(xpmem_apid_t apid)
|
||||
{
|
||||
DBUG_ON(apid <= 0);
|
||||
return ((xpmem_id_t *)&apid)->xpmem_id.tgid;
|
||||
}
|
||||
|
||||
/*
|
||||
* Hash Tables
|
||||
*
|
||||
* XPMEM utilizes hash tables to enable faster lookups of list entries.
|
||||
* These hash tables are implemented as arrays. A simple modulus of the hash
|
||||
* key yields the appropriate array index. A hash table's array element (i.e.,
|
||||
* hash table bucket) consists of a hash list and the lock that protects it.
|
||||
*
|
||||
* XPMEM has the following two hash tables:
|
||||
*
|
||||
* table bucket key
|
||||
* part->tg_hashtable list of struct xpmem_thread_group tgid
|
||||
* tg->ap_hashtable list of struct xpmem_access_permit apid.uniq
|
||||
*/
|
||||
struct xpmem_hashlist {
|
||||
mcs_rwlock_lock_t lock; /* lock for hash list */
|
||||
struct list_head list; /* hash list */
|
||||
};
|
||||
|
||||
#define XPMEM_TG_HASHTABLE_SIZE 8
|
||||
#define XPMEM_AP_HASHTABLE_SIZE 8
|
||||
|
||||
/*
 * Map a thread-group ID onto its bucket in the tg hash table: the tgid,
 * taken as unsigned, modulo XPMEM_TG_HASHTABLE_SIZE.
 */
static inline int xpmem_tg_hashtable_index(pid_t tgid)
{
	int index;

	index = (unsigned int)tgid % XPMEM_TG_HASHTABLE_SIZE;

	/*
	 * tgid is a pid_t, so it is printed with %d, matching the other tgid
	 * debug messages in this header (%lu mismatched the argument type).
	 */
	XPMEM_DEBUG("return: tgid=%d, index=%d", tgid, index);

	return index;
}
|
||||
|
||||
/*
 * Map an access-permit ID onto its bucket in the ap hash table: the 'uniq'
 * field of the ID modulo XPMEM_AP_HASHTABLE_SIZE.
 */
static inline int xpmem_ap_hashtable_index(xpmem_apid_t apid)
{
	unsigned int uniq;
	int index;

	DBUG_ON(apid <= 0);

	/* View the opaque apid through the xpmem_id_t union to reach 'uniq'. */
	uniq = ((xpmem_id_t *)&apid)->xpmem_id.uniq;
	index = uniq % XPMEM_AP_HASHTABLE_SIZE;

	XPMEM_DEBUG("return: apid=0x%lx, index=%d", apid, index);

	return index;
}
|
||||
|
||||
/*
|
||||
* general internal driver structures
|
||||
*/
|
||||
struct xpmem_thread_group {
|
||||
ihk_spinlock_t lock; /* tg lock */
|
||||
pid_t tgid; /* tg's tgid */
|
||||
uid_t uid; /* tg's uid */
|
||||
gid_t gid; /* tg's gid */
|
||||
volatile int flags; /* tg attributes and state */
|
||||
ihk_atomic_t uniq_segid; /* segid uniq */
|
||||
ihk_atomic_t uniq_apid; /* apid uniq */
|
||||
mcs_rwlock_lock_t seg_list_lock; /* tg's list of segs lock */
|
||||
struct list_head seg_list; /* tg's list of segs */
|
||||
ihk_atomic_t refcnt; /* references to tg */
|
||||
ihk_atomic_t n_pinned; /* #of pages pinned by this tg */
|
||||
struct list_head tg_hashlist; /* tg hash list */
|
||||
struct thread *group_leader; /* thread group leader */
|
||||
struct process_vm *vm; /* tg's process_vm */
|
||||
struct xpmem_hashlist ap_hashtable[]; /* locks + ap hash lists */
|
||||
};
|
||||
|
||||
struct xpmem_segment {
|
||||
ihk_spinlock_t lock; /* seg lock */
|
||||
xpmem_segid_t segid; /* unique segid */
|
||||
unsigned long vaddr; /* starting address */
|
||||
size_t size; /* size of seg */
|
||||
int permit_type; /* permission scheme */
|
||||
void *permit_value; /* permission data */
|
||||
volatile int flags; /* seg attributes and state */
|
||||
ihk_atomic_t refcnt; /* references to seg */
|
||||
struct xpmem_thread_group *tg; /* creator tg */
|
||||
struct list_head ap_list; /* local access permits of seg */
|
||||
struct list_head seg_list; /* tg's list of segs */
|
||||
};
|
||||
|
||||
struct xpmem_access_permit {
|
||||
ihk_spinlock_t lock; /* access permit lock */
|
||||
xpmem_apid_t apid; /* unique apid */
|
||||
int mode; /* read/write mode */
|
||||
volatile int flags; /* access permit attributes and state */
|
||||
ihk_atomic_t refcnt; /* references to access permit */
|
||||
struct xpmem_segment *seg; /* seg permitted to be accessed */
|
||||
struct xpmem_thread_group *tg; /* access permit's tg */
|
||||
struct list_head att_list; /* atts of this access permit's seg */
|
||||
struct list_head ap_list; /* access permits linked to seg */
|
||||
struct list_head ap_hashlist; /* access permit hash list */
|
||||
};
|
||||
|
||||
struct xpmem_attachment {
|
||||
mcs_rwlock_lock_t at_lock; /* att lock */
|
||||
unsigned long vaddr; /* starting address of seg attached */
|
||||
unsigned long at_vaddr; /* address where seg is attached */
|
||||
size_t at_size; /* size of seg attachment */
|
||||
struct vm_range *at_vmr; /* vm_range where seg is attached */
|
||||
volatile int flags; /* att attributes and state */
|
||||
ihk_atomic_t refcnt; /* references to att */
|
||||
struct xpmem_access_permit *ap; /* associated access permit */
|
||||
struct list_head att_list; /* atts linked to access permit */
|
||||
struct process_vm *vm; /* process_vm attached to */
|
||||
};
|
||||
|
||||
struct xpmem_partition {
|
||||
ihk_atomic_t n_opened; /* # of /dev/xpmem opened */
|
||||
struct xpmem_hashlist tg_hashtable[]; /* locks + tg hash lists */
|
||||
};
|
||||
|
||||
#define XPMEM_FLAG_DESTROYING 0x00040 /* being destroyed */
|
||||
#define XPMEM_FLAG_DESTROYED 0x00080 /* 'being destroyed' finished */
|
||||
|
||||
#define XPMEM_FLAG_VALIDPTEs 0x00200 /* valid PTEs exist */
|
||||
|
||||
struct xpmem_perm {
|
||||
uid_t uid;
|
||||
gid_t gid;
|
||||
unsigned long mode;
|
||||
};
|
||||
|
||||
#define XPMEM_PERM_IRUSR 00400
|
||||
#define XPMEM_PERM_IWUSR 00200
|
||||
|
||||
extern struct xpmem_partition *xpmem_my_part;
|
||||
|
||||
static int xpmem_ioctl(struct mckfd *mckfd, ihk_mc_user_context_t *ctx);
|
||||
static int xpmem_close(struct mckfd *mckfd, ihk_mc_user_context_t *ctx);
|
||||
|
||||
static int xpmem_init(void);
|
||||
static void xpmem_exit(void);
|
||||
static int __xpmem_open(void);
|
||||
static void xpmem_destroy_tg(struct xpmem_thread_group *);
|
||||
|
||||
static int xpmem_make(unsigned long, size_t, int, void *, xpmem_segid_t *);
|
||||
static xpmem_segid_t xpmem_make_segid(struct xpmem_thread_group *);
|
||||
|
||||
static int xpmem_remove(xpmem_segid_t);
|
||||
static void xpmem_remove_seg(struct xpmem_thread_group *,
|
||||
struct xpmem_segment *);
|
||||
static void xpmem_remove_segs_of_tg(struct xpmem_thread_group *seg_tg);
|
||||
|
||||
static int xpmem_get(xpmem_segid_t, int, int, void *, xpmem_apid_t *);
|
||||
static int xpmem_check_permit_mode(int, struct xpmem_segment *);
|
||||
static int xpmem_perms(struct xpmem_perm *, short);
|
||||
static xpmem_apid_t xpmem_make_apid(struct xpmem_thread_group *);
|
||||
|
||||
static int xpmem_release(xpmem_apid_t);
|
||||
static void xpmem_release_ap(struct xpmem_thread_group *,
|
||||
struct xpmem_access_permit *);
|
||||
static void xpmem_release_aps_of_tg(struct xpmem_thread_group *ap_tg);
|
||||
|
||||
static int xpmem_attach(struct mckfd *, xpmem_apid_t, off_t, size_t,
|
||||
unsigned long, int, int, unsigned long *);
|
||||
|
||||
static int xpmem_detach(unsigned long);
|
||||
static int xpmem_vm_munmap(struct process_vm *vm, void *addr, size_t len);
|
||||
static int xpmem_remove_process_range(struct process_vm *vm,
|
||||
unsigned long start, unsigned long end, int *ro_freedp);
|
||||
static int xpmem_free_process_memory_range(struct process_vm *vm,
|
||||
struct vm_range *range);
|
||||
static void xpmem_detach_att(struct xpmem_access_permit *,
|
||||
struct xpmem_attachment *);
|
||||
static void xpmem_clear_PTEs(struct xpmem_segment *);
|
||||
static void xpmem_clear_PTEs_range(struct xpmem_segment *, unsigned long,
|
||||
unsigned long);
|
||||
static void xpmem_clear_PTEs_of_ap(struct xpmem_access_permit *, unsigned long,
|
||||
unsigned long);
|
||||
static void xpmem_clear_PTEs_of_att(struct xpmem_attachment *, unsigned long,
|
||||
unsigned long);
|
||||
|
||||
static int xpmem_remap_pte(struct process_vm *, struct vm_range *,
|
||||
unsigned long, uint64_t, struct xpmem_segment *, unsigned long);
|
||||
|
||||
static int xpmem_ensure_valid_page(struct xpmem_segment *, unsigned long);
|
||||
static pte_t * xpmem_vaddr_to_pte(struct process_vm *, unsigned long,
|
||||
size_t *pgsize);
|
||||
static int xpmem_pin_page(struct xpmem_thread_group *, struct thread *,
|
||||
struct process_vm *, unsigned long);
|
||||
static void xpmem_unpin_pages(struct xpmem_segment *, struct process_vm *,
|
||||
unsigned long, size_t);
|
||||
|
||||
static struct xpmem_thread_group * __xpmem_tg_ref_by_tgid_nolock_internal(
|
||||
pid_t, int, int);
|
||||
|
||||
static inline struct xpmem_thread_group *__xpmem_tg_ref_by_tgid(
|
||||
pid_t tgid,
|
||||
int return_destroying)
|
||||
{
|
||||
struct xpmem_thread_group *tg;
|
||||
int index;
|
||||
struct mcs_rwlock_node_irqsave lock;
|
||||
|
||||
XPMEM_DEBUG("call: tgid=%d, return_destroying=%d",
|
||||
tgid, return_destroying);
|
||||
|
||||
index = xpmem_tg_hashtable_index(tgid);
|
||||
mcs_rwlock_reader_lock(&xpmem_my_part->tg_hashtable[index].lock, &lock);
|
||||
tg = __xpmem_tg_ref_by_tgid_nolock_internal(tgid, index,
|
||||
return_destroying);
|
||||
mcs_rwlock_reader_unlock(&xpmem_my_part->tg_hashtable[index].lock,
|
||||
&lock);
|
||||
|
||||
XPMEM_DEBUG("return: tg=0x%p", tg);
|
||||
|
||||
return tg;
|
||||
}
|
||||
|
||||
static inline struct xpmem_thread_group *__xpmem_tg_ref_by_tgid_nolock(
|
||||
pid_t tgid,
|
||||
int return_destroying)
|
||||
{
|
||||
struct xpmem_thread_group *tg;
|
||||
|
||||
XPMEM_DEBUG("call: tgid=%d, return_destroying=%d",
|
||||
tgid, return_destroying);
|
||||
|
||||
tg = __xpmem_tg_ref_by_tgid_nolock_internal(tgid,
|
||||
xpmem_tg_hashtable_index(tgid), return_destroying);
|
||||
|
||||
XPMEM_DEBUG("return: tg=0x%p", tg);
|
||||
|
||||
return tg;
|
||||
}
|
||||
|
||||
/*
 * Shorthands for the two lookup helpers above.  The "_all" variants pass
 * return_destroying = 1, the plain variants pass 0; the "_nolock"
 * variants map to the helper that does not take the bucket lock.
 */
#define xpmem_tg_ref_by_tgid(t) \
	__xpmem_tg_ref_by_tgid((t), 0)
#define xpmem_tg_ref_by_tgid_all(t) \
	__xpmem_tg_ref_by_tgid((t), 1)
#define xpmem_tg_ref_by_tgid_nolock(t) \
	__xpmem_tg_ref_by_tgid_nolock((t), 0)
#define xpmem_tg_ref_by_tgid_all_nolock(t) \
	__xpmem_tg_ref_by_tgid_nolock((t), 1)
|
||||
|
||||
static struct xpmem_thread_group * xpmem_tg_ref_by_segid(xpmem_segid_t);
|
||||
static struct xpmem_thread_group * xpmem_tg_ref_by_apid(xpmem_apid_t);
|
||||
static void xpmem_tg_deref(struct xpmem_thread_group *);
|
||||
static struct xpmem_segment *xpmem_seg_ref_by_segid(struct xpmem_thread_group *,
|
||||
xpmem_segid_t);
|
||||
static void xpmem_seg_deref(struct xpmem_segment *);
|
||||
static struct xpmem_access_permit * xpmem_ap_ref_by_apid(
|
||||
struct xpmem_thread_group *, xpmem_apid_t);
|
||||
static void xpmem_ap_deref(struct xpmem_access_permit *);
|
||||
static void xpmem_att_deref(struct xpmem_attachment *);
|
||||
static int xpmem_validate_access(struct xpmem_access_permit *, off_t, size_t,
|
||||
int, unsigned long *);
|
||||
|
||||
/*
|
||||
* Inlines that mark an internal driver structure as being destroyable or not.
|
||||
* The idea is to set the refcnt to 1 at structure creation time and then
|
||||
* drop that reference at the time the structure is to be destroyed.
|
||||
*/
|
||||
static inline void xpmem_tg_not_destroyable(
|
||||
struct xpmem_thread_group *tg)
|
||||
{
|
||||
ihk_atomic_set(&tg->refcnt, 1);
|
||||
|
||||
XPMEM_DEBUG("return: tg->refcnt=%d", tg->refcnt);
|
||||
}
|
||||
|
||||
/*
 * Mark a thread group as destroyable: drops one reference through
 * xpmem_tg_deref().
 *
 * @tg: thread group to release one reference on
 */
static inline void xpmem_tg_destroyable(struct xpmem_thread_group *tg)
{
	XPMEM_DEBUG("call: ");
	xpmem_tg_deref(tg);
	XPMEM_DEBUG("return: ");
}
|
||||
|
||||
static inline void xpmem_seg_not_destroyable(
|
||||
struct xpmem_segment *seg)
|
||||
{
|
||||
ihk_atomic_set(&seg->refcnt, 1);
|
||||
|
||||
XPMEM_DEBUG("return: seg->refcnt=%d", seg->refcnt);
|
||||
}
|
||||
|
||||
/*
 * Mark a segment as destroyable: drops one reference through
 * xpmem_seg_deref().
 *
 * @seg: segment to release one reference on
 */
static inline void xpmem_seg_destroyable(struct xpmem_segment *seg)
{
	XPMEM_DEBUG("call: ");
	xpmem_seg_deref(seg);
	XPMEM_DEBUG("return: ");
}
|
||||
|
||||
static inline void xpmem_ap_not_destroyable(
|
||||
struct xpmem_access_permit *ap)
|
||||
{
|
||||
ihk_atomic_set(&ap->refcnt, 1);
|
||||
|
||||
XPMEM_DEBUG("return: ap->refcnt=%d", ap->refcnt);
|
||||
}
|
||||
|
||||
/*
 * Mark an access permit as destroyable: drops one reference through
 * xpmem_ap_deref().
 *
 * @ap: access permit to release one reference on
 */
static inline void xpmem_ap_destroyable(struct xpmem_access_permit *ap)
{
	XPMEM_DEBUG("call: ");
	xpmem_ap_deref(ap);
	XPMEM_DEBUG("return: ");
}
|
||||
|
||||
static inline void xpmem_att_not_destroyable(
|
||||
struct xpmem_attachment *att)
|
||||
{
|
||||
ihk_atomic_set(&att->refcnt, 1);
|
||||
|
||||
XPMEM_DEBUG("return: att->refcnt=%d", att->refcnt);
|
||||
}
|
||||
|
||||
/*
 * Mark an attachment as destroyable: drops one reference through
 * xpmem_att_deref().
 *
 * @att: attachment to release one reference on
 */
static inline void xpmem_att_destroyable(struct xpmem_attachment *att)
{
	XPMEM_DEBUG("call: ");
	xpmem_att_deref(att);
	XPMEM_DEBUG("return: ");
}
|
||||
|
||||
/*
|
||||
* Inlines that increment the refcnt for the specified structure.
|
||||
*/
|
||||
static inline void xpmem_tg_ref(
|
||||
struct xpmem_thread_group *tg)
|
||||
{
|
||||
DBUG_ON(ihk_atomic_read(&tg->refcnt) <= 0);
|
||||
ihk_atomic_inc(&tg->refcnt);
|
||||
|
||||
XPMEM_DEBUG("return: tg->refcnt=%d", tg->refcnt);
|
||||
}
|
||||
|
||||
static inline void xpmem_seg_ref(
|
||||
struct xpmem_segment *seg)
|
||||
{
|
||||
DBUG_ON(ihk_atomic_read(&seg->refcnt) <= 0);
|
||||
ihk_atomic_inc(&seg->refcnt);
|
||||
|
||||
XPMEM_DEBUG("return: seg->refcnt=%d", seg->refcnt);
|
||||
}
|
||||
|
||||
static inline void xpmem_ap_ref(
|
||||
struct xpmem_access_permit *ap)
|
||||
{
|
||||
DBUG_ON(ihk_atomic_read(&ap->refcnt) <= 0);
|
||||
ihk_atomic_inc(&ap->refcnt);
|
||||
|
||||
XPMEM_DEBUG("return: ap->refcnt=%d", ap->refcnt);
|
||||
}
|
||||
|
||||
static inline void xpmem_att_ref(
|
||||
struct xpmem_attachment *att)
|
||||
{
|
||||
DBUG_ON(ihk_atomic_read(&att->refcnt) <= 0);
|
||||
ihk_atomic_inc(&att->refcnt);
|
||||
|
||||
XPMEM_DEBUG("return: att->refcnt=%d", att->refcnt);
|
||||
}
|
||||
|
||||
static inline int xpmem_is_private_data(
|
||||
struct vm_range *vmr)
|
||||
{
|
||||
return (vmr->private_data != NULL);
|
||||
}
|
||||
|
||||
#endif /* _XPMEM_PRIVATE_H */
|
||||
|
||||
@ -31,6 +31,7 @@
|
||||
#include <cls.h>
|
||||
#include <syscall.h>
|
||||
#include <sysfs.h>
|
||||
#include <rusage.h>
|
||||
|
||||
//#define IOCTL_FUNC_EXTENSION
|
||||
#ifdef IOCTL_FUNC_EXTENSION
|
||||
@ -40,17 +41,21 @@
|
||||
//#define DEBUG_PRINT_INIT
|
||||
|
||||
#ifdef DEBUG_PRINT_INIT
|
||||
#define dkprintf kprintf
|
||||
#define dkprintf(...) do { kprintf(__VA_ARGS__); } while (0)
|
||||
#define ekprintf(...) do { kprintf(__VA_ARGS__); } while (0)
|
||||
#else
|
||||
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
|
||||
#define dkprintf(...) do { } while (0)
|
||||
#define ekprintf(...) do { kprintf(__VA_ARGS__); } while (0)
|
||||
#endif
|
||||
|
||||
int osnum = 0;
|
||||
|
||||
extern struct ihk_kmsg_buf kmsg_buf;
|
||||
|
||||
extern unsigned long ihk_mc_get_ns_per_tsc(void);
|
||||
extern long syscall(int, ihk_mc_user_context_t *);
|
||||
|
||||
struct ihk_os_monitor *monitor;
|
||||
|
||||
static void handler_init(void)
|
||||
{
|
||||
ihk_mc_set_syscall_handler(syscall);
|
||||
@ -108,11 +113,11 @@ static void dma_test(void)
|
||||
}
|
||||
#endif
|
||||
|
||||
extern char *ihk_mc_get_kernel_args(void);
|
||||
extern char *ihk_get_kargs(void);
|
||||
|
||||
char *find_command_line(char *name)
|
||||
{
|
||||
char *cmdline = ihk_mc_get_kernel_args();
|
||||
char *cmdline = ihk_get_kargs();
|
||||
|
||||
if (!cmdline) {
|
||||
return NULL;
|
||||
@ -122,7 +127,7 @@ char *find_command_line(char *name)
|
||||
|
||||
static void parse_kargs(void)
|
||||
{
|
||||
kprintf("KCommand Line: %s\n", ihk_mc_get_kernel_args());
|
||||
kprintf("KCommand Line: %s\n", ihk_get_kargs());
|
||||
|
||||
if (1) {
|
||||
char *key = "osnum=";
|
||||
@ -239,6 +244,34 @@ static void time_init(void)
|
||||
return;
|
||||
}
|
||||
|
||||
static void monitor_init()
|
||||
{
|
||||
int z;
|
||||
unsigned long phys;
|
||||
|
||||
z = sizeof(struct ihk_os_monitor) +
|
||||
sizeof(struct ihk_os_cpu_monitor) * num_processors;
|
||||
z = (z + PAGE_SIZE -1) >> PAGE_SHIFT;
|
||||
monitor = ihk_mc_alloc_pages(z, IHK_MC_AP_CRITICAL);
|
||||
memset(monitor, 0, z * PAGE_SIZE);
|
||||
monitor->num_processors = num_processors;
|
||||
monitor->num_numa_nodes = ihk_mc_get_nr_numa_nodes();
|
||||
monitor->ns_per_tsc = ihk_mc_get_ns_per_tsc();
|
||||
phys = virt_to_phys(monitor);
|
||||
ihk_set_monitor(phys, sizeof(struct ihk_os_monitor) +
|
||||
sizeof(struct ihk_os_cpu_monitor) * num_processors);
|
||||
}
|
||||
|
||||
int nmi_mode;
|
||||
|
||||
static void nmi_init()
|
||||
{
|
||||
unsigned long phys;
|
||||
|
||||
phys = virt_to_phys(&nmi_mode);
|
||||
ihk_set_nmi_mode_addr(phys);
|
||||
}
|
||||
|
||||
static void rest_init(void)
|
||||
{
|
||||
handler_init();
|
||||
@ -250,11 +283,13 @@ static void rest_init(void)
|
||||
//pc_test();
|
||||
|
||||
ap_init();
|
||||
monitor_init();
|
||||
cpu_local_var_init();
|
||||
nmi_init();
|
||||
time_init();
|
||||
kmalloc_init();
|
||||
|
||||
ikc_master_init();
|
||||
ihk_ikc_master_init();
|
||||
|
||||
proc_init();
|
||||
|
||||
@ -320,7 +355,8 @@ static void setup_remote_snooping_samples(void)
|
||||
static void populate_sysfs(void)
|
||||
{
|
||||
cpu_sysfs_setup();
|
||||
setup_remote_snooping_samples();
|
||||
numa_sysfs_setup();
|
||||
//setup_remote_snooping_samples();
|
||||
} /* populate_sysfs() */
|
||||
|
||||
int host_ikc_inited = 0;
|
||||
@ -336,11 +372,12 @@ static void post_init(void)
|
||||
}
|
||||
|
||||
if (find_command_line("hidos")) {
|
||||
extern ihk_spinlock_t syscall_lock;
|
||||
|
||||
init_host_syscall_channel();
|
||||
init_host_syscall_channel2();
|
||||
ihk_mc_spinlock_init(&syscall_lock);
|
||||
int ikc_cpu = ihk_mc_get_ikc_cpu(ihk_mc_get_processor_id());
|
||||
if(ikc_cpu < 0) {
|
||||
ekprintf("%s,ihk_mc_get_ikc_cpu failed\n", __FUNCTION__);
|
||||
}
|
||||
init_host_ikc2mckernel();
|
||||
init_host_ikc2linux(ikc_cpu);
|
||||
}
|
||||
|
||||
arch_setup_vdso();
|
||||
@ -371,8 +408,8 @@ int main(void)
|
||||
}
|
||||
kmsg_init(mode);
|
||||
|
||||
kputs("MCK started.\n");
|
||||
|
||||
kputs("IHK/McKernel started.\n");
|
||||
ihk_set_kmsg(virt_to_phys(&kmsg_buf), IHK_KMSG_SIZE);
|
||||
arch_init();
|
||||
|
||||
/*
|
||||
@ -393,7 +430,7 @@ int main(void)
|
||||
|
||||
futex_init();
|
||||
|
||||
kputs("MCK/IHK booted.\n");
|
||||
kputs("IHK/McKernel booted.\n");
|
||||
|
||||
#ifdef DCFA_KMOD
|
||||
mc_cmd_client_init();
|
||||
|
||||
2021
kernel/mem.c
2021
kernel/mem.c
File diff suppressed because it is too large
Load Diff
@ -21,7 +21,7 @@ static struct ihk_ikc_channel_desc *mchannel;
|
||||
static int arch_master_channel_packet_handler(struct ihk_ikc_channel_desc *,
|
||||
void *__packet, void *arg);
|
||||
|
||||
void ikc_master_init(void)
|
||||
void ihk_ikc_master_init(void)
|
||||
{
|
||||
mchannel = kmalloc(sizeof(struct ihk_ikc_channel_desc) +
|
||||
sizeof(struct ihk_ikc_master_packet),
|
||||
|
||||
882
kernel/process.c
882
kernel/process.c
File diff suppressed because it is too large
Load Diff
@ -17,12 +17,14 @@
|
||||
#include <ihk/debug.h>
|
||||
#include <ihk/ikc.h>
|
||||
#include <ikc/master.h>
|
||||
#include <syscall.h>
|
||||
#include <cls.h>
|
||||
#include <syscall.h>
|
||||
#include <kmalloc.h>
|
||||
#include <process.h>
|
||||
#include <page.h>
|
||||
#include <mman.h>
|
||||
#include <bitmap.h>
|
||||
#include <init.h>
|
||||
|
||||
//#define DEBUG_PRINT_PROCFS
|
||||
|
||||
@ -35,6 +37,7 @@
|
||||
extern int snprintf(char * buf, size_t size, const char *fmt, ...);
|
||||
extern int sprintf(char * buf, const char *fmt, ...);
|
||||
extern int sscanf(const char * buf, const char * fmt, ...);
|
||||
extern int scnprintf(char * buf, size_t size, const char *fmt, ...);
|
||||
|
||||
extern int osnum;
|
||||
|
||||
@ -44,7 +47,7 @@ procfs_thread_ctl(struct thread *thread, int msg)
|
||||
struct ihk_ikc_channel_desc *syscall_channel;
|
||||
struct ikc_scd_packet packet;
|
||||
|
||||
syscall_channel = cpu_local_var(syscall_channel);
|
||||
syscall_channel = cpu_local_var(ikc2linux);
|
||||
memset(&packet, '\0', sizeof packet);
|
||||
packet.arg = thread->tid;
|
||||
packet.msg = msg;
|
||||
@ -73,11 +76,11 @@ procfs_delete_thread(struct thread *thread)
|
||||
*
|
||||
* \param rarg returned argument
|
||||
*/
|
||||
void
|
||||
process_procfs_request(unsigned long rarg)
|
||||
void process_procfs_request(struct ikc_scd_packet *rpacket)
|
||||
{
|
||||
unsigned long rarg = rpacket->arg;
|
||||
unsigned long parg, pbuf;
|
||||
struct thread *thread = NULL;
|
||||
struct thread *thread = NULL;
|
||||
struct process *proc = NULL;
|
||||
struct process_vm *vm = NULL;
|
||||
struct procfs_read *r;
|
||||
@ -93,7 +96,7 @@ process_procfs_request(unsigned long rarg)
|
||||
|
||||
dprintf("process_procfs_request: invoked.\n");
|
||||
|
||||
syscall_channel = get_cpu_local_var(0)->syscall_channel;
|
||||
syscall_channel = get_cpu_local_var(0)->ikc2linux;
|
||||
|
||||
dprintf("rarg: %x\n", rarg);
|
||||
parg = ihk_mc_map_memory(NULL, rarg, sizeof(struct procfs_read));
|
||||
@ -158,7 +161,7 @@ process_procfs_request(unsigned long rarg)
|
||||
*/
|
||||
ret = sscanf(p, "%d/", &pid);
|
||||
if (ret == 1) {
|
||||
struct mcs_rwlock_node tlock;
|
||||
struct mcs_rwlock_node_irqsave tlock;
|
||||
int tids;
|
||||
struct thread *thread1 = NULL;
|
||||
|
||||
@ -175,7 +178,7 @@ process_procfs_request(unsigned long rarg)
|
||||
else
|
||||
tid = pid;
|
||||
|
||||
mcs_rwlock_reader_lock_noirq(&proc->threads_lock, &tlock);
|
||||
mcs_rwlock_reader_lock(&proc->threads_lock, &tlock);
|
||||
list_for_each_entry(thread, &proc->threads_list, siblings_list){
|
||||
if(thread->tid == tid)
|
||||
break;
|
||||
@ -185,15 +188,15 @@ process_procfs_request(unsigned long rarg)
|
||||
if(thread == NULL){
|
||||
kprintf("process_procfs_request: no such tid %d-%d\n", pid, tid);
|
||||
if(tids){
|
||||
mcs_rwlock_reader_unlock(&proc->threads_lock, &tlock);
|
||||
process_unlock(proc, &lock);
|
||||
mcs_rwlock_reader_unlock_noirq(&proc->threads_lock, &tlock);
|
||||
goto end;
|
||||
}
|
||||
thread = thread1;
|
||||
}
|
||||
if(thread)
|
||||
hold_thread(thread);
|
||||
mcs_rwlock_reader_unlock_noirq(&proc->threads_lock, &tlock);
|
||||
mcs_rwlock_reader_unlock(&proc->threads_lock, &tlock);
|
||||
hold_process(proc);
|
||||
vm = proc->vm;
|
||||
if(vm)
|
||||
@ -281,6 +284,13 @@ process_procfs_request(unsigned long rarg)
|
||||
ans = -EIO;
|
||||
goto end;
|
||||
}
|
||||
|
||||
if (pa < ihk_mc_get_memory_address(IHK_MC_GMA_MAP_START, 0) ||
|
||||
pa >= ihk_mc_get_memory_address(IHK_MC_GMA_MAP_END, 0)) {
|
||||
ans = -EIO;
|
||||
goto end;
|
||||
}
|
||||
|
||||
va = phys_to_virt(pa);
|
||||
if(readwrite)
|
||||
memcpy(va, buf + ans, size);
|
||||
@ -397,12 +407,34 @@ process_procfs_request(unsigned long rarg)
|
||||
/*
|
||||
* mcos%d/PID/status
|
||||
*/
|
||||
#define BITMASKS_BUF_SIZE 2048
|
||||
if (strcmp(p, "status") == 0) {
|
||||
extern int num_processors; /* kernel/ap.c */
|
||||
struct vm_range *range;
|
||||
unsigned long lockedsize = 0;
|
||||
char tmp[1024];
|
||||
char *tmp;
|
||||
char *bitmasks;
|
||||
int bitmasks_offset = 0;
|
||||
char *cpu_bitmask, *cpu_list, *numa_bitmask, *numa_list;
|
||||
int len;
|
||||
|
||||
tmp = kmalloc(8192, IHK_MC_AP_CRITICAL);
|
||||
if (!tmp) {
|
||||
kprintf("%s: error allocating /proc/self/status buffer\n",
|
||||
__FUNCTION__);
|
||||
ans = 0;
|
||||
goto end;
|
||||
}
|
||||
|
||||
bitmasks = kmalloc(BITMASKS_BUF_SIZE, IHK_MC_AP_CRITICAL);
|
||||
if (!tmp) {
|
||||
kprintf("%s: error allocating /proc/self/status bitmaks buffer\n",
|
||||
__FUNCTION__);
|
||||
kfree(tmp);
|
||||
ans = 0;
|
||||
goto end;
|
||||
}
|
||||
|
||||
ihk_mc_spinlock_lock_noirq(&proc->vm->memory_range_lock);
|
||||
list_for_each_entry(range, &proc->vm->vm_range_list, list) {
|
||||
if(range->flag & VR_LOCKED)
|
||||
@ -410,13 +442,42 @@ process_procfs_request(unsigned long rarg)
|
||||
}
|
||||
ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock);
|
||||
|
||||
cpu_bitmask = &bitmasks[bitmasks_offset];
|
||||
bitmasks_offset += bitmap_scnprintf(cpu_bitmask,
|
||||
BITMASKS_BUF_SIZE - bitmasks_offset,
|
||||
thread->cpu_set.__bits, num_processors);
|
||||
bitmasks_offset++;
|
||||
|
||||
cpu_list = &bitmasks[bitmasks_offset];
|
||||
bitmasks_offset += bitmap_scnlistprintf(cpu_list,
|
||||
BITMASKS_BUF_SIZE - bitmasks_offset,
|
||||
thread->cpu_set.__bits, __CPU_SETSIZE);
|
||||
bitmasks_offset++;
|
||||
|
||||
numa_bitmask = &bitmasks[bitmasks_offset];
|
||||
bitmasks_offset += bitmap_scnprintf(numa_bitmask,
|
||||
BITMASKS_BUF_SIZE - bitmasks_offset,
|
||||
proc->vm->numa_mask, PROCESS_NUMA_MASK_BITS);
|
||||
bitmasks_offset++;
|
||||
|
||||
numa_list = &bitmasks[bitmasks_offset];
|
||||
bitmasks_offset += bitmap_scnlistprintf(numa_list,
|
||||
BITMASKS_BUF_SIZE - bitmasks_offset,
|
||||
proc->vm->numa_mask, PROCESS_NUMA_MASK_BITS);
|
||||
bitmasks_offset++;
|
||||
|
||||
sprintf(tmp,
|
||||
"Uid:\t%d\t%d\t%d\t%d\n"
|
||||
"Gid:\t%d\t%d\t%d\t%d\n"
|
||||
"VmLck:\t%9lu kB\n",
|
||||
"VmLck:\t%9lu kB\n"
|
||||
"Cpus_allowed:\t%s\n"
|
||||
"Cpus_allowed_list:\t%s\n"
|
||||
"Mems_allowed:\t%s\n"
|
||||
"Mems_allowed_list:\t%s\n",
|
||||
proc->ruid, proc->euid, proc->suid, proc->fsuid,
|
||||
proc->rgid, proc->egid, proc->sgid, proc->fsgid,
|
||||
(lockedsize + 1023) >> 10);
|
||||
(lockedsize + 1023) >> 10,
|
||||
cpu_bitmask, cpu_list, numa_bitmask, numa_list);
|
||||
len = strlen(tmp);
|
||||
if (r->offset < len) {
|
||||
if (r->offset + r->count < len) {
|
||||
@ -430,6 +491,8 @@ process_procfs_request(unsigned long rarg)
|
||||
ans = 0;
|
||||
eof = 1;
|
||||
}
|
||||
kfree(tmp);
|
||||
kfree(bitmasks);
|
||||
goto end;
|
||||
}
|
||||
|
||||
@ -570,6 +633,7 @@ dataunavail:
|
||||
|
||||
packet.msg = SCD_MSG_PROCFS_ANSWER;
|
||||
packet.arg = rarg;
|
||||
packet.pid = rpacket->pid;
|
||||
|
||||
ret = ihk_ikc_send(syscall_channel, &packet, 0);
|
||||
if (ret < 0) {
|
||||
|
||||
589
kernel/profile.c
Normal file
589
kernel/profile.c
Normal file
@ -0,0 +1,589 @@
|
||||
/**
|
||||
* \file profile.c
|
||||
* License details are found in the file LICENSE.
|
||||
*
|
||||
* \brief
|
||||
* Profiler code for various process statistics
|
||||
* \author Balazs Gerofi <bgerofi@riken.jp>
|
||||
* Copyright (C) 2017 RIKEN AICS
|
||||
*/
|
||||
|
||||
/*
|
||||
* HISTORY:
|
||||
*/
|
||||
|
||||
#include <types.h>
|
||||
#include <kmsg.h>
|
||||
#include <ihk/cpu.h>
|
||||
#include <cpulocal.h>
|
||||
#include <ihk/mm.h>
|
||||
#include <ihk/debug.h>
|
||||
#include <ihk/ikc.h>
|
||||
#include <errno.h>
|
||||
#include <cls.h>
|
||||
#include <syscall.h>
|
||||
#include <page.h>
|
||||
#include <ihk/lock.h>
|
||||
#include <ctype.h>
|
||||
#include <waitq.h>
|
||||
#include <rlimit.h>
|
||||
#include <affinity.h>
|
||||
#include <time.h>
|
||||
#include <ihk/perfctr.h>
|
||||
#include <mman.h>
|
||||
#include <kmalloc.h>
|
||||
#include <memobj.h>
|
||||
#include <shm.h>
|
||||
#include <prio.h>
|
||||
#include <arch/cpu.h>
|
||||
#include <limits.h>
|
||||
#include <march.h>
|
||||
#include <process.h>
|
||||
|
||||
extern char *syscall_name[];
|
||||
|
||||
#ifdef PROFILE_ENABLE
|
||||
|
||||
//#define DEBUG_PRINT_PROFILE
|
||||
|
||||
#ifdef DEBUG_PRINT_PROFILE
|
||||
#define dkprintf(...) kprintf(__VA_ARGS__)
|
||||
#define ekprintf(...) kprintf(__VA_ARGS__)
|
||||
#else
|
||||
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
|
||||
#define ekprintf(...) kprintf(__VA_ARGS__)
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Printable names of the non-syscall profile events.  The print routines
 * index this table with (event type - PROFILE_EVENT_MIN).  The last
 * entry is an empty string.
 */
char *profile_event_names[] = {
	"remote_tlb_invalidate",
	"page_fault",
	"page_fault_anon_clr_mem",
	"page_fault_file",
	"page_fault_dev_file",
	"page_fault_file_clr_mem",
	"mpol_alloc_missed",
	"mmap_anon_contig_phys",
	"mmap_anon_no_contig_phys",
	"mmap_regular_file",
	"mmap_device_file",
	""
};
|
||||
|
||||
/* Job-level profiling state, guarded by job_profile_lock */
mcs_lock_node_t job_profile_lock = {0, NULL};

/* Job-wide event counters; allocated on first use */
struct profile_event *job_profile_events = NULL;

/* Number of processes in the job; -1 while no job is being accumulated */
int job_nr_processes = -1;

/* Processes of the job that have not reported yet; -1 while idle */
int job_nr_processes_left = -1;

/* Elapsed cycles summed over all processes of the job */
unsigned long job_elapsed_ts;
|
||||
|
||||
|
||||
|
||||
enum profile_event_type profile_syscall2offload(enum profile_event_type sc)
|
||||
{
|
||||
return (PROFILE_SYSCALL_MAX + sc);
|
||||
}
|
||||
|
||||
void profile_event_add(enum profile_event_type type, uint64_t tsc)
|
||||
{
|
||||
struct profile_event *event = NULL;
|
||||
if (!cpu_local_var(current)->profile)
|
||||
return;
|
||||
|
||||
if (!cpu_local_var(current)->profile_events) {
|
||||
if (profile_alloc_events(cpu_local_var(current)) < 0)
|
||||
return;
|
||||
}
|
||||
|
||||
if (type < PROFILE_EVENT_MAX) {
|
||||
event = &cpu_local_var(current)->profile_events[type];
|
||||
}
|
||||
else {
|
||||
kprintf("%s: WARNING: unknown event type %d\n",
|
||||
__FUNCTION__, type);
|
||||
return;
|
||||
}
|
||||
|
||||
++event->cnt;
|
||||
event->tsc += tsc;
|
||||
}
|
||||
|
||||
void profile_print_thread_stats(struct thread *thread)
|
||||
{
|
||||
int i;
|
||||
unsigned long flags;
|
||||
|
||||
if (!thread->profile_events)
|
||||
return;
|
||||
|
||||
/* Not yet accumulated period? */
|
||||
if (thread->profile_start_ts) {
|
||||
thread->profile_elapsed_ts += (rdtsc() - thread->profile_start_ts);
|
||||
}
|
||||
|
||||
flags = kprintf_lock();
|
||||
|
||||
__kprintf("TID: %4d elapsed cycles (excluding idle): %luk\n",
|
||||
thread->tid,
|
||||
thread->profile_elapsed_ts / 1000);
|
||||
|
||||
for (i = 0; i < PROFILE_SYSCALL_MAX; ++i) {
|
||||
if (!thread->profile_events[i].cnt &&
|
||||
!thread->profile_events[i + PROFILE_SYSCALL_MAX].cnt)
|
||||
continue;
|
||||
|
||||
__kprintf("TID: %4d (%3d,%20s): %6u %6luk offl: %6u %6luk (%2d.%2d%%)\n",
|
||||
thread->tid,
|
||||
i,
|
||||
syscall_name[i],
|
||||
thread->profile_events[i].cnt,
|
||||
(thread->profile_events[i].tsc /
|
||||
(thread->profile_events[i].cnt ?
|
||||
thread->profile_events[i].cnt : 1))
|
||||
/ 1000,
|
||||
thread->profile_events[i + PROFILE_SYSCALL_MAX].cnt,
|
||||
(thread->profile_events[i + PROFILE_SYSCALL_MAX].tsc /
|
||||
(thread->profile_events[i + PROFILE_SYSCALL_MAX].cnt ?
|
||||
thread->profile_events[i + PROFILE_SYSCALL_MAX].cnt : 1))
|
||||
/ 1000,
|
||||
(thread->profile_events[i].tsc ?
|
||||
thread->profile_events[i].tsc * 100
|
||||
/ thread->profile_elapsed_ts : 0),
|
||||
(thread->profile_events[i].tsc ?
|
||||
(thread->profile_events[i].tsc * 10000
|
||||
/ thread->profile_elapsed_ts) % 100 : 0)
|
||||
);
|
||||
}
|
||||
|
||||
for (i = PROFILE_EVENT_MIN; i < PROFILE_EVENT_MAX; ++i) {
|
||||
|
||||
if (!thread->profile_events[i].cnt)
|
||||
continue;
|
||||
|
||||
__kprintf("TID: %4d (%24s): %6u %6luk \n",
|
||||
thread->tid,
|
||||
profile_event_names[i - PROFILE_EVENT_MIN],
|
||||
thread->profile_events[i].cnt,
|
||||
(thread->profile_events[i].tsc /
|
||||
(thread->profile_events[i].cnt ?
|
||||
thread->profile_events[i].cnt : 1))
|
||||
/ 1000,
|
||||
(thread->profile_events[i].tsc ?
|
||||
thread->profile_events[i].tsc * 100
|
||||
/ thread->profile_elapsed_ts : 0),
|
||||
(thread->profile_events[i].tsc ?
|
||||
(thread->profile_events[i].tsc * 10000
|
||||
/ thread->profile_elapsed_ts) % 100 : 0)
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
kprintf_unlock(flags);
|
||||
}
|
||||
|
||||
void profile_print_proc_stats(struct process *proc)
|
||||
{
|
||||
int i;
|
||||
unsigned long flags;
|
||||
|
||||
if (!proc->profile_events || !proc->profile_elapsed_ts)
|
||||
return;
|
||||
|
||||
flags = kprintf_lock();
|
||||
__kprintf("PID: %4d elapsed cycles for all threads (excluding idle): %luk\n",
|
||||
proc->pid,
|
||||
proc->profile_elapsed_ts / 1000);
|
||||
|
||||
for (i = 0; i < PROFILE_SYSCALL_MAX; ++i) {
|
||||
if (!proc->profile_events[i].cnt &&
|
||||
!proc->profile_events[i + PROFILE_SYSCALL_MAX].cnt)
|
||||
continue;
|
||||
|
||||
__kprintf("PID: %4d (%3d,%20s): %6u %6luk offl: %6u %6luk (%2d.%2d%%)\n",
|
||||
proc->pid,
|
||||
i,
|
||||
syscall_name[i],
|
||||
proc->profile_events[i].cnt,
|
||||
(proc->profile_events[i].tsc /
|
||||
(proc->profile_events[i].cnt ?
|
||||
proc->profile_events[i].cnt : 1))
|
||||
/ 1000,
|
||||
proc->profile_events[i + PROFILE_SYSCALL_MAX].cnt,
|
||||
(proc->profile_events[i + PROFILE_SYSCALL_MAX].tsc /
|
||||
(proc->profile_events[i + PROFILE_SYSCALL_MAX].cnt ?
|
||||
proc->profile_events[i + PROFILE_SYSCALL_MAX].cnt : 1))
|
||||
/ 1000,
|
||||
(proc->profile_events[i].tsc ?
|
||||
proc->profile_events[i].tsc * 100
|
||||
/ proc->profile_elapsed_ts : 0),
|
||||
(proc->profile_events[i].tsc ?
|
||||
(proc->profile_events[i].tsc * 10000
|
||||
/ proc->profile_elapsed_ts) % 100 : 0)
|
||||
);
|
||||
}
|
||||
|
||||
for (i = PROFILE_EVENT_MIN; i < PROFILE_EVENT_MAX; ++i) {
|
||||
|
||||
if (!proc->profile_events[i].cnt)
|
||||
continue;
|
||||
|
||||
__kprintf("PID: %4d (%24s): %6u %6luk \n",
|
||||
proc->pid,
|
||||
profile_event_names[i - PROFILE_EVENT_MIN],
|
||||
proc->profile_events[i].cnt,
|
||||
(proc->profile_events[i].tsc /
|
||||
(proc->profile_events[i].cnt ?
|
||||
proc->profile_events[i].cnt : 1))
|
||||
/ 1000,
|
||||
(proc->profile_events[i].tsc &&
|
||||
proc->profile_elapsed_ts ?
|
||||
proc->profile_events[i].tsc * 100
|
||||
/ proc->profile_elapsed_ts : 0),
|
||||
(proc->profile_events[i].tsc &&
|
||||
proc->profile_elapsed_ts ?
|
||||
(proc->profile_events[i].tsc * 10000
|
||||
/ proc->profile_elapsed_ts) % 100 : 0)
|
||||
);
|
||||
}
|
||||
|
||||
kprintf_unlock(flags);
|
||||
}
|
||||
|
||||
int profile_accumulate_and_print_job_events(struct process *proc)
|
||||
{
|
||||
int i;
|
||||
unsigned long flags;
|
||||
struct mcs_lock_node mcs_node;
|
||||
|
||||
mcs_lock_lock(&job_profile_lock, &mcs_node);
|
||||
|
||||
/* First process? */
|
||||
if (job_nr_processes == -1) {
|
||||
job_nr_processes = proc->nr_processes;
|
||||
job_nr_processes_left = proc->nr_processes;
|
||||
job_elapsed_ts = 0;
|
||||
}
|
||||
|
||||
--job_nr_processes_left;
|
||||
|
||||
/* Allocate event counters */
|
||||
if (!job_profile_events) {
|
||||
|
||||
job_profile_events = kmalloc(sizeof(*job_profile_events) *
|
||||
PROFILE_EVENT_MAX, IHK_MC_AP_NOWAIT);
|
||||
|
||||
if (!job_profile_events) {
|
||||
kprintf("%s: ERROR: allocating job profile counters\n",
|
||||
__FUNCTION__);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
memset(job_profile_events, 0,
|
||||
sizeof(*job_profile_events) * PROFILE_EVENT_MAX);
|
||||
}
|
||||
|
||||
/* Accumulate process */
|
||||
for (i = 0; i < PROFILE_EVENT_MAX; ++i) {
|
||||
if (!proc->profile_events[i].tsc)
|
||||
continue;
|
||||
|
||||
job_profile_events[i].tsc += proc->profile_events[i].tsc;
|
||||
job_profile_events[i].cnt += proc->profile_events[i].cnt;
|
||||
proc->profile_events[i].tsc = 0;
|
||||
proc->profile_events[i].cnt = 0;
|
||||
}
|
||||
|
||||
job_elapsed_ts += proc->profile_elapsed_ts;
|
||||
|
||||
/* Last process? */
|
||||
if (job_nr_processes_left == 0) {
|
||||
flags = kprintf_lock();
|
||||
__kprintf("JOB: (%2d) elapsed cycles for all threads (excluding idle): %luk\n",
|
||||
job_nr_processes,
|
||||
job_elapsed_ts / 1000);
|
||||
|
||||
for (i = 0; i < PROFILE_SYSCALL_MAX; ++i) {
|
||||
if (!job_profile_events[i].cnt &&
|
||||
!job_profile_events[i + PROFILE_SYSCALL_MAX].cnt)
|
||||
continue;
|
||||
|
||||
__kprintf("JOB: (%2d) (%3d,%20s): %6u %6luk offl: %6u %6luk (%2d.%2d%%)\n",
|
||||
job_nr_processes,
|
||||
i,
|
||||
syscall_name[i],
|
||||
job_profile_events[i].cnt,
|
||||
(job_profile_events[i].tsc /
|
||||
(job_profile_events[i].cnt ?
|
||||
job_profile_events[i].cnt : 1))
|
||||
/ 1000,
|
||||
job_profile_events[i + PROFILE_SYSCALL_MAX].cnt,
|
||||
(job_profile_events[i + PROFILE_SYSCALL_MAX].tsc /
|
||||
(job_profile_events[i + PROFILE_SYSCALL_MAX].cnt ?
|
||||
job_profile_events[i + PROFILE_SYSCALL_MAX].cnt : 1))
|
||||
/ 1000,
|
||||
(job_profile_events[i].tsc ?
|
||||
job_profile_events[i].tsc * 100
|
||||
/ job_elapsed_ts : 0),
|
||||
(job_profile_events[i].tsc ?
|
||||
(job_profile_events[i].tsc * 10000
|
||||
/ job_elapsed_ts) % 100 : 0)
|
||||
);
|
||||
|
||||
job_profile_events[i].tsc = 0;
|
||||
job_profile_events[i].cnt = 0;
|
||||
job_profile_events[i + PROFILE_SYSCALL_MAX].tsc = 0;
|
||||
job_profile_events[i + PROFILE_SYSCALL_MAX].cnt = 0;
|
||||
}
|
||||
|
||||
for (i = PROFILE_EVENT_MIN; i < PROFILE_EVENT_MAX; ++i) {
|
||||
|
||||
if (!job_profile_events[i].cnt)
|
||||
continue;
|
||||
|
||||
__kprintf("JOB: (%2d) (%24s): %6u %6luk \n",
|
||||
job_nr_processes,
|
||||
profile_event_names[i - PROFILE_EVENT_MIN],
|
||||
job_profile_events[i].cnt,
|
||||
(job_profile_events[i].tsc /
|
||||
(job_profile_events[i].cnt ?
|
||||
job_profile_events[i].cnt : 1))
|
||||
/ 1000);
|
||||
|
||||
job_profile_events[i].tsc = 0;
|
||||
job_profile_events[i].cnt = 0;
|
||||
}
|
||||
|
||||
kprintf_unlock(flags);
|
||||
|
||||
/* Reset job process indicators */
|
||||
job_nr_processes = -1;
|
||||
job_nr_processes_left = -1;
|
||||
job_elapsed_ts = 0;
|
||||
}
|
||||
|
||||
mcs_lock_unlock(&job_profile_lock, &mcs_node);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void profile_accumulate_events(struct thread *thread,
|
||||
struct process *proc)
|
||||
{
|
||||
int i;
|
||||
struct mcs_lock_node mcs_node;
|
||||
|
||||
if (!thread->profile_events || !proc->profile_events) return;
|
||||
|
||||
mcs_lock_lock(&proc->profile_lock, &mcs_node);
|
||||
|
||||
for (i = 0; i < PROFILE_EVENT_MAX; ++i) {
|
||||
proc->profile_events[i].tsc += thread->profile_events[i].tsc;
|
||||
proc->profile_events[i].cnt += thread->profile_events[i].cnt;
|
||||
thread->profile_events[i].tsc = 0;
|
||||
thread->profile_events[i].cnt = 0;
|
||||
}
|
||||
|
||||
proc->profile_elapsed_ts += thread->profile_elapsed_ts;
|
||||
if (thread->profile_start_ts) {
|
||||
proc->profile_elapsed_ts +=
|
||||
(rdtsc() - thread->profile_start_ts);
|
||||
}
|
||||
|
||||
mcs_lock_unlock(&proc->profile_lock, &mcs_node);
|
||||
}
|
||||
|
||||
int profile_alloc_events(struct thread *thread)
|
||||
{
|
||||
struct process *proc = thread->proc;
|
||||
struct mcs_lock_node mcs_node;
|
||||
|
||||
if (!thread->profile_events) {
|
||||
thread->profile_events = kmalloc(sizeof(*thread->profile_events) *
|
||||
PROFILE_EVENT_MAX, IHK_MC_AP_NOWAIT);
|
||||
|
||||
if (!thread->profile_events) {
|
||||
kprintf("%s: ERROR: allocating thread private profile counters\n",
|
||||
__FUNCTION__);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
memset(thread->profile_events, 0,
|
||||
sizeof(*thread->profile_events) * PROFILE_EVENT_MAX);
|
||||
}
|
||||
|
||||
mcs_lock_lock(&proc->profile_lock, &mcs_node);
|
||||
if (!proc->profile_events) {
|
||||
proc->profile_events = kmalloc(sizeof(*proc->profile_events) *
|
||||
PROFILE_EVENT_MAX, IHK_MC_AP_NOWAIT);
|
||||
|
||||
if (!proc->profile_events) {
|
||||
kprintf("%s: ERROR: allocating proc private profile counters\n",
|
||||
__FUNCTION__);
|
||||
mcs_lock_unlock(&proc->profile_lock, &mcs_node);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
memset(proc->profile_events, 0,
|
||||
sizeof(*thread->profile_events) * PROFILE_EVENT_MAX);
|
||||
|
||||
}
|
||||
mcs_lock_unlock(&proc->profile_lock, &mcs_node);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void profile_dealloc_thread_events(struct thread *thread)
|
||||
{
|
||||
kfree(thread->profile_events);
|
||||
}
|
||||
|
||||
void profile_dealloc_proc_events(struct process *proc)
|
||||
{
|
||||
kfree(proc->profile_events);
|
||||
}
|
||||
|
||||
void static profile_clear_process(struct process *proc)
|
||||
{
|
||||
proc->profile_elapsed_ts = 0;
|
||||
if (!proc->profile_events) return;
|
||||
|
||||
memset(proc->profile_events, 0,
|
||||
sizeof(*proc->profile_events) * PROFILE_EVENT_MAX);
|
||||
}
|
||||
|
||||
void static profile_clear_thread(struct thread *thread)
|
||||
{
|
||||
thread->profile_start_ts = 0;
|
||||
thread->profile_elapsed_ts = 0;
|
||||
if (!thread->profile_events) return;
|
||||
|
||||
memset(thread->profile_events, 0,
|
||||
sizeof(*thread->profile_events) * PROFILE_EVENT_MAX);
|
||||
}
|
||||
|
||||
int do_profile(int flag)
|
||||
{
|
||||
struct thread *thread = cpu_local_var(current);
|
||||
struct process *proc = thread->proc;
|
||||
unsigned long now_ts = rdtsc();
|
||||
|
||||
/* Job level? */
|
||||
if (flag & PROF_JOB) {
|
||||
dkprintf("%s: JOB %d, flag: 0x%lx\n",
|
||||
__FUNCTION__, proc->nr_processes, flag);
|
||||
if (flag & PROF_PRINT) {
|
||||
struct mcs_rwlock_node lock;
|
||||
struct thread *_thread;
|
||||
|
||||
/* Accumulate events from all threads to process level */
|
||||
mcs_rwlock_reader_lock_noirq(&proc->threads_lock, &lock);
|
||||
list_for_each_entry(_thread, &proc->threads_list,
|
||||
siblings_list) {
|
||||
profile_accumulate_events(_thread, proc);
|
||||
}
|
||||
mcs_rwlock_reader_unlock_noirq(&proc->threads_lock, &lock);
|
||||
|
||||
/* Accumulate events to job level */
|
||||
return profile_accumulate_and_print_job_events(proc);
|
||||
}
|
||||
}
|
||||
/* Process level? */
|
||||
else if (flag & PROF_PROC) {
|
||||
struct mcs_rwlock_node lock;
|
||||
struct thread *_thread;
|
||||
|
||||
dkprintf("%s: PID %d, flag: 0x%lx\n",
|
||||
__FUNCTION__, proc->pid, flag);
|
||||
/* Accumulate events from all threads */
|
||||
mcs_rwlock_reader_lock_noirq(&proc->threads_lock, &lock);
|
||||
|
||||
list_for_each_entry(_thread, &proc->threads_list,
|
||||
siblings_list) {
|
||||
if (flag & PROF_PRINT) {
|
||||
profile_accumulate_events(_thread, proc);
|
||||
}
|
||||
|
||||
if (flag & PROF_CLEAR) {
|
||||
profile_clear_thread(_thread);
|
||||
}
|
||||
|
||||
if (flag & PROF_ON) {
|
||||
_thread->profile = 1;
|
||||
}
|
||||
else if (flag & PROF_OFF) {
|
||||
if (_thread->profile) {
|
||||
_thread->profile = 0;
|
||||
if (_thread->profile_start_ts) {
|
||||
_thread->profile_elapsed_ts +=
|
||||
(now_ts - _thread->profile_start_ts);
|
||||
}
|
||||
_thread->profile_start_ts = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mcs_rwlock_reader_unlock_noirq(&proc->threads_lock, &lock);
|
||||
|
||||
if (flag & PROF_PRINT) {
|
||||
profile_print_proc_stats(proc);
|
||||
}
|
||||
|
||||
if (flag & PROF_CLEAR) {
|
||||
profile_clear_process(proc);
|
||||
}
|
||||
|
||||
/* Make sure future threads profile as well */
|
||||
if (flag & PROF_ON) {
|
||||
if (!proc->profile) {
|
||||
proc->profile = 1;
|
||||
}
|
||||
}
|
||||
else if (flag & PROF_OFF) {
|
||||
proc->profile = 0;
|
||||
}
|
||||
}
|
||||
/* Thread level */
|
||||
else {
|
||||
dkprintf("%s: TID %d, flag: 0x%lx\n",
|
||||
__FUNCTION__, thread->tid, flag);
|
||||
if (flag & PROF_PRINT) {
|
||||
profile_print_thread_stats(thread);
|
||||
}
|
||||
|
||||
if (flag & PROF_CLEAR) {
|
||||
profile_clear_thread(thread);
|
||||
/* If profiling, reset start and elapsed */
|
||||
if (thread->profile) {
|
||||
thread->profile_start_ts = 0;
|
||||
thread->profile_elapsed_ts = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (flag & PROF_ON) {
|
||||
if (!thread->profile) {
|
||||
thread->profile = 1;
|
||||
thread->profile_start_ts = 0;
|
||||
}
|
||||
}
|
||||
else if (flag & PROF_OFF) {
|
||||
if (thread->profile) {
|
||||
thread->profile = 0;
|
||||
if (thread->profile_start_ts) {
|
||||
thread->profile_elapsed_ts +=
|
||||
(now_ts - thread->profile_start_ts);
|
||||
}
|
||||
thread->profile_start_ts = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
SYSCALL_DECLARE(profile)
|
||||
{
|
||||
int flag = (int)ihk_mc_syscall_arg0(ctx);
|
||||
return do_profile(flag);
|
||||
}
|
||||
|
||||
#endif // PROFILE_ENABLE
|
||||
561
kernel/rbtree.c
Normal file
561
kernel/rbtree.c
Normal file
@ -0,0 +1,561 @@
|
||||
/*
|
||||
Red Black Trees
|
||||
(C) 1999 Andrea Arcangeli <andrea@suse.de>
|
||||
(C) 2002 David Woodhouse <dwmw2@infradead.org>
|
||||
(C) 2012 Michel Lespinasse <walken@google.com>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
linux/lib/rbtree.c
|
||||
*/
|
||||
|
||||
#include <rbtree_augmented.h>
|
||||
|
||||
#define EXPORT_SYMBOL(x)
|
||||
|
||||
/*
|
||||
* red-black trees properties: http://en.wikipedia.org/wiki/Rbtree
|
||||
*
|
||||
* 1) A node is either red or black
|
||||
* 2) The root is black
|
||||
* 3) All leaves (NULL) are black
|
||||
* 4) Both children of every red node are black
|
||||
* 5) Every simple path from root to leaves contains the same number
|
||||
* of black nodes.
|
||||
*
|
||||
* 4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two
|
||||
* consecutive red nodes in a path and every red node is therefore followed by
|
||||
* a black. So if B is the number of black nodes on every simple path (as per
|
||||
* 5), then the longest possible path due to 4 is 2B.
|
||||
*
|
||||
* We shall indicate color with case, where black nodes are uppercase and red
|
||||
* nodes will be lowercase. Unknown color nodes shall be drawn as red within
|
||||
* parentheses and have some accompanying text comment.
|
||||
*/
|
||||
|
||||
static inline void rb_set_black(struct rb_node *rb)
|
||||
{
|
||||
rb->__rb_parent_color |= RB_BLACK;
|
||||
}
|
||||
|
||||
static inline struct rb_node *rb_red_parent(struct rb_node *red)
|
||||
{
|
||||
return (struct rb_node *)red->__rb_parent_color;
|
||||
}
|
||||
|
||||
/*
|
||||
* Helper function for rotations:
|
||||
* - old's parent and color get assigned to new
|
||||
* - old gets assigned new as a parent and 'color' as a color.
|
||||
*/
|
||||
static inline void
|
||||
__rb_rotate_set_parents(struct rb_node *old, struct rb_node *new,
|
||||
struct rb_root *root, int color)
|
||||
{
|
||||
struct rb_node *parent = rb_parent(old);
|
||||
new->__rb_parent_color = old->__rb_parent_color;
|
||||
rb_set_parent_color(old, new, color);
|
||||
__rb_change_child(old, new, parent, root);
|
||||
}
|
||||
|
||||
static __always_inline void
|
||||
__rb_insert(struct rb_node *node, struct rb_root *root,
|
||||
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
|
||||
{
|
||||
struct rb_node *parent = rb_red_parent(node), *gparent, *tmp;
|
||||
|
||||
while (true) {
|
||||
/*
|
||||
* Loop invariant: node is red
|
||||
*
|
||||
* If there is a black parent, we are done.
|
||||
* Otherwise, take some corrective action as we don't
|
||||
* want a red root or two consecutive red nodes.
|
||||
*/
|
||||
if (!parent) {
|
||||
rb_set_parent_color(node, NULL, RB_BLACK);
|
||||
break;
|
||||
} else if (rb_is_black(parent))
|
||||
break;
|
||||
|
||||
gparent = rb_red_parent(parent);
|
||||
|
||||
tmp = gparent->rb_right;
|
||||
if (parent != tmp) { /* parent == gparent->rb_left */
|
||||
if (tmp && rb_is_red(tmp)) {
|
||||
/*
|
||||
* Case 1 - color flips
|
||||
*
|
||||
* G g
|
||||
* / \ / \
|
||||
* p u --> P U
|
||||
* / /
|
||||
* n N
|
||||
*
|
||||
* However, since g's parent might be red, and
|
||||
* 4) does not allow this, we need to recurse
|
||||
* at g.
|
||||
*/
|
||||
rb_set_parent_color(tmp, gparent, RB_BLACK);
|
||||
rb_set_parent_color(parent, gparent, RB_BLACK);
|
||||
node = gparent;
|
||||
parent = rb_parent(node);
|
||||
rb_set_parent_color(node, parent, RB_RED);
|
||||
continue;
|
||||
}
|
||||
|
||||
tmp = parent->rb_right;
|
||||
if (node == tmp) {
|
||||
/*
|
||||
* Case 2 - left rotate at parent
|
||||
*
|
||||
* G G
|
||||
* / \ / \
|
||||
* p U --> n U
|
||||
* \ /
|
||||
* n p
|
||||
*
|
||||
* This still leaves us in violation of 4), the
|
||||
* continuation into Case 3 will fix that.
|
||||
*/
|
||||
parent->rb_right = tmp = node->rb_left;
|
||||
node->rb_left = parent;
|
||||
if (tmp)
|
||||
rb_set_parent_color(tmp, parent,
|
||||
RB_BLACK);
|
||||
rb_set_parent_color(parent, node, RB_RED);
|
||||
augment_rotate(parent, node);
|
||||
parent = node;
|
||||
tmp = node->rb_right;
|
||||
}
|
||||
|
||||
/*
|
||||
* Case 3 - right rotate at gparent
|
||||
*
|
||||
* G P
|
||||
* / \ / \
|
||||
* p U --> n g
|
||||
* / \
|
||||
* n U
|
||||
*/
|
||||
gparent->rb_left = tmp; /* == parent->rb_right */
|
||||
parent->rb_right = gparent;
|
||||
if (tmp)
|
||||
rb_set_parent_color(tmp, gparent, RB_BLACK);
|
||||
__rb_rotate_set_parents(gparent, parent, root, RB_RED);
|
||||
augment_rotate(gparent, parent);
|
||||
break;
|
||||
} else {
|
||||
tmp = gparent->rb_left;
|
||||
if (tmp && rb_is_red(tmp)) {
|
||||
/* Case 1 - color flips */
|
||||
rb_set_parent_color(tmp, gparent, RB_BLACK);
|
||||
rb_set_parent_color(parent, gparent, RB_BLACK);
|
||||
node = gparent;
|
||||
parent = rb_parent(node);
|
||||
rb_set_parent_color(node, parent, RB_RED);
|
||||
continue;
|
||||
}
|
||||
|
||||
tmp = parent->rb_left;
|
||||
if (node == tmp) {
|
||||
/* Case 2 - right rotate at parent */
|
||||
parent->rb_left = tmp = node->rb_right;
|
||||
node->rb_right = parent;
|
||||
if (tmp)
|
||||
rb_set_parent_color(tmp, parent,
|
||||
RB_BLACK);
|
||||
rb_set_parent_color(parent, node, RB_RED);
|
||||
augment_rotate(parent, node);
|
||||
parent = node;
|
||||
tmp = node->rb_left;
|
||||
}
|
||||
|
||||
/* Case 3 - left rotate at gparent */
|
||||
gparent->rb_right = tmp; /* == parent->rb_left */
|
||||
parent->rb_left = gparent;
|
||||
if (tmp)
|
||||
rb_set_parent_color(tmp, gparent, RB_BLACK);
|
||||
__rb_rotate_set_parents(gparent, parent, root, RB_RED);
|
||||
augment_rotate(gparent, parent);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Inline version for rb_erase() use - we want to be able to inline
|
||||
* and eliminate the dummy_rotate callback there
|
||||
*/
|
||||
static __always_inline void
|
||||
____rb_erase_color(struct rb_node *parent, struct rb_root *root,
|
||||
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
|
||||
{
|
||||
struct rb_node *node = NULL, *sibling, *tmp1, *tmp2;
|
||||
|
||||
while (true) {
|
||||
/*
|
||||
* Loop invariants:
|
||||
* - node is black (or NULL on first iteration)
|
||||
* - node is not the root (parent is not NULL)
|
||||
* - All leaf paths going through parent and node have a
|
||||
* black node count that is 1 lower than other leaf paths.
|
||||
*/
|
||||
sibling = parent->rb_right;
|
||||
if (node != sibling) { /* node == parent->rb_left */
|
||||
if (rb_is_red(sibling)) {
|
||||
/*
|
||||
* Case 1 - left rotate at parent
|
||||
*
|
||||
* P S
|
||||
* / \ / \
|
||||
* N s --> p Sr
|
||||
* / \ / \
|
||||
* Sl Sr N Sl
|
||||
*/
|
||||
parent->rb_right = tmp1 = sibling->rb_left;
|
||||
sibling->rb_left = parent;
|
||||
rb_set_parent_color(tmp1, parent, RB_BLACK);
|
||||
__rb_rotate_set_parents(parent, sibling, root,
|
||||
RB_RED);
|
||||
augment_rotate(parent, sibling);
|
||||
sibling = tmp1;
|
||||
}
|
||||
tmp1 = sibling->rb_right;
|
||||
if (!tmp1 || rb_is_black(tmp1)) {
|
||||
tmp2 = sibling->rb_left;
|
||||
if (!tmp2 || rb_is_black(tmp2)) {
|
||||
/*
|
||||
* Case 2 - sibling color flip
|
||||
* (p could be either color here)
|
||||
*
|
||||
* (p) (p)
|
||||
* / \ / \
|
||||
* N S --> N s
|
||||
* / \ / \
|
||||
* Sl Sr Sl Sr
|
||||
*
|
||||
* This leaves us violating 5) which
|
||||
* can be fixed by flipping p to black
|
||||
* if it was red, or by recursing at p.
|
||||
* p is red when coming from Case 1.
|
||||
*/
|
||||
rb_set_parent_color(sibling, parent,
|
||||
RB_RED);
|
||||
if (rb_is_red(parent))
|
||||
rb_set_black(parent);
|
||||
else {
|
||||
node = parent;
|
||||
parent = rb_parent(node);
|
||||
if (parent)
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
/*
|
||||
* Case 3 - right rotate at sibling
|
||||
* (p could be either color here)
|
||||
*
|
||||
* (p) (p)
|
||||
* / \ / \
|
||||
* N S --> N Sl
|
||||
* / \ \
|
||||
* sl Sr s
|
||||
* \
|
||||
* Sr
|
||||
*/
|
||||
sibling->rb_left = tmp1 = tmp2->rb_right;
|
||||
tmp2->rb_right = sibling;
|
||||
parent->rb_right = tmp2;
|
||||
if (tmp1)
|
||||
rb_set_parent_color(tmp1, sibling,
|
||||
RB_BLACK);
|
||||
augment_rotate(sibling, tmp2);
|
||||
tmp1 = sibling;
|
||||
sibling = tmp2;
|
||||
}
|
||||
/*
|
||||
* Case 4 - left rotate at parent + color flips
|
||||
* (p and sl could be either color here.
|
||||
* After rotation, p becomes black, s acquires
|
||||
* p's color, and sl keeps its color)
|
||||
*
|
||||
* (p) (s)
|
||||
* / \ / \
|
||||
* N S --> P Sr
|
||||
* / \ / \
|
||||
* (sl) sr N (sl)
|
||||
*/
|
||||
parent->rb_right = tmp2 = sibling->rb_left;
|
||||
sibling->rb_left = parent;
|
||||
rb_set_parent_color(tmp1, sibling, RB_BLACK);
|
||||
if (tmp2)
|
||||
rb_set_parent(tmp2, parent);
|
||||
__rb_rotate_set_parents(parent, sibling, root,
|
||||
RB_BLACK);
|
||||
augment_rotate(parent, sibling);
|
||||
break;
|
||||
} else {
|
||||
sibling = parent->rb_left;
|
||||
if (rb_is_red(sibling)) {
|
||||
/* Case 1 - right rotate at parent */
|
||||
parent->rb_left = tmp1 = sibling->rb_right;
|
||||
sibling->rb_right = parent;
|
||||
rb_set_parent_color(tmp1, parent, RB_BLACK);
|
||||
__rb_rotate_set_parents(parent, sibling, root,
|
||||
RB_RED);
|
||||
augment_rotate(parent, sibling);
|
||||
sibling = tmp1;
|
||||
}
|
||||
tmp1 = sibling->rb_left;
|
||||
if (!tmp1 || rb_is_black(tmp1)) {
|
||||
tmp2 = sibling->rb_right;
|
||||
if (!tmp2 || rb_is_black(tmp2)) {
|
||||
/* Case 2 - sibling color flip */
|
||||
rb_set_parent_color(sibling, parent,
|
||||
RB_RED);
|
||||
if (rb_is_red(parent))
|
||||
rb_set_black(parent);
|
||||
else {
|
||||
node = parent;
|
||||
parent = rb_parent(node);
|
||||
if (parent)
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
/* Case 3 - right rotate at sibling */
|
||||
sibling->rb_right = tmp1 = tmp2->rb_left;
|
||||
tmp2->rb_left = sibling;
|
||||
parent->rb_left = tmp2;
|
||||
if (tmp1)
|
||||
rb_set_parent_color(tmp1, sibling,
|
||||
RB_BLACK);
|
||||
augment_rotate(sibling, tmp2);
|
||||
tmp1 = sibling;
|
||||
sibling = tmp2;
|
||||
}
|
||||
/* Case 4 - left rotate at parent + color flips */
|
||||
parent->rb_left = tmp2 = sibling->rb_right;
|
||||
sibling->rb_right = parent;
|
||||
rb_set_parent_color(tmp1, sibling, RB_BLACK);
|
||||
if (tmp2)
|
||||
rb_set_parent(tmp2, parent);
|
||||
__rb_rotate_set_parents(parent, sibling, root,
|
||||
RB_BLACK);
|
||||
augment_rotate(parent, sibling);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Out-of-line entry point for rb_erase_augmented(): forwards its
 * arguments unchanged to the always-inline ____rb_erase_color().
 */
void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
		      void (*augment_rotate)(struct rb_node *old,
					     struct rb_node *new))
{
	____rb_erase_color(parent, root, augment_rotate);
}
|
||||
EXPORT_SYMBOL(__rb_erase_color);
|
||||
|
||||
/*
|
||||
* Non-augmented rbtree manipulation functions.
|
||||
*
|
||||
* We use dummy augmented callbacks here, and have the compiler optimize them
|
||||
* out of the rb_insert_color() and rb_erase() function definitions.
|
||||
*/
|
||||
|
||||
static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {}
|
||||
static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {}
|
||||
static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {}
|
||||
|
||||
static const struct rb_augment_callbacks dummy_callbacks = {
|
||||
dummy_propagate, dummy_copy, dummy_rotate
|
||||
};
|
||||
|
||||
void rb_insert_color(struct rb_node *node, struct rb_root *root)
|
||||
{
|
||||
__rb_insert(node, root, dummy_rotate);
|
||||
}
|
||||
EXPORT_SYMBOL(rb_insert_color);
|
||||
|
||||
void rb_erase(struct rb_node *node, struct rb_root *root)
|
||||
{
|
||||
struct rb_node *rebalance;
|
||||
rebalance = __rb_erase_augmented(node, root, &dummy_callbacks);
|
||||
if (rebalance)
|
||||
____rb_erase_color(rebalance, root, dummy_rotate);
|
||||
}
|
||||
EXPORT_SYMBOL(rb_erase);
|
||||
|
||||
/*
|
||||
* Augmented rbtree manipulation functions.
|
||||
*
|
||||
* This instantiates the same __always_inline functions as in the non-augmented
|
||||
* case, but this time with user-defined callbacks.
|
||||
*/
|
||||
|
||||
/*
 * Augmented counterpart of rb_insert_color(): rebalances after @node
 * was linked in, calling the user supplied @augment_rotate on every
 * rotation.
 */
void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
			   void (*augment_rotate)(struct rb_node *old,
						  struct rb_node *new))
{
	__rb_insert(node, root, augment_rotate);
}
|
||||
EXPORT_SYMBOL(__rb_insert_augmented);
|
||||
|
||||
/*
|
||||
* This function returns the first node (in sort order) of the tree.
|
||||
*/
|
||||
struct rb_node *rb_first(const struct rb_root *root)
|
||||
{
|
||||
struct rb_node *n;
|
||||
|
||||
n = root->rb_node;
|
||||
if (!n)
|
||||
return NULL;
|
||||
while (n->rb_left)
|
||||
n = n->rb_left;
|
||||
return n;
|
||||
}
|
||||
EXPORT_SYMBOL(rb_first);
|
||||
|
||||
struct rb_node *rb_last(const struct rb_root *root)
|
||||
{
|
||||
struct rb_node *n;
|
||||
|
||||
n = root->rb_node;
|
||||
if (!n)
|
||||
return NULL;
|
||||
while (n->rb_right)
|
||||
n = n->rb_right;
|
||||
return n;
|
||||
}
|
||||
EXPORT_SYMBOL(rb_last);
|
||||
|
||||
struct rb_node *rb_next(const struct rb_node *node)
|
||||
{
|
||||
struct rb_node *parent;
|
||||
|
||||
if (RB_EMPTY_NODE(node))
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* If we have a right-hand child, go down and then left as far
|
||||
* as we can.
|
||||
*/
|
||||
if (node->rb_right) {
|
||||
node = node->rb_right;
|
||||
while (node->rb_left)
|
||||
node=node->rb_left;
|
||||
return (struct rb_node *)node;
|
||||
}
|
||||
|
||||
/*
|
||||
* No right-hand children. Everything down and left is smaller than us,
|
||||
* so any 'next' node must be in the general direction of our parent.
|
||||
* Go up the tree; any time the ancestor is a right-hand child of its
|
||||
* parent, keep going up. First time it's a left-hand child of its
|
||||
* parent, said parent is our 'next' node.
|
||||
*/
|
||||
while ((parent = rb_parent(node)) && node == parent->rb_right)
|
||||
node = parent;
|
||||
|
||||
return parent;
|
||||
}
|
||||
EXPORT_SYMBOL(rb_next);
|
||||
|
||||
struct rb_node *rb_prev(const struct rb_node *node)
|
||||
{
|
||||
struct rb_node *parent;
|
||||
|
||||
if (RB_EMPTY_NODE(node))
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* If we have a left-hand child, go down and then right as far
|
||||
* as we can.
|
||||
*/
|
||||
if (node->rb_left) {
|
||||
node = node->rb_left;
|
||||
while (node->rb_right)
|
||||
node=node->rb_right;
|
||||
return (struct rb_node *)node;
|
||||
}
|
||||
|
||||
/*
|
||||
* No left-hand children. Go up till we find an ancestor which
|
||||
* is a right-hand child of its parent.
|
||||
*/
|
||||
while ((parent = rb_parent(node)) && node == parent->rb_left)
|
||||
node = parent;
|
||||
|
||||
return parent;
|
||||
}
|
||||
EXPORT_SYMBOL(rb_prev);
|
||||
|
||||
void rb_replace_node(struct rb_node *victim, struct rb_node *new,
|
||||
struct rb_root *root)
|
||||
{
|
||||
struct rb_node *parent = rb_parent(victim);
|
||||
|
||||
/* Set the surrounding nodes to point to the replacement */
|
||||
__rb_change_child(victim, new, parent, root);
|
||||
if (victim->rb_left)
|
||||
rb_set_parent(victim->rb_left, new);
|
||||
if (victim->rb_right)
|
||||
rb_set_parent(victim->rb_right, new);
|
||||
|
||||
/* Copy the pointers/colour from the victim to the replacement */
|
||||
*new = *victim;
|
||||
}
|
||||
EXPORT_SYMBOL(rb_replace_node);
|
||||
|
||||
static struct rb_node *rb_left_deepest_node(const struct rb_node *node)
|
||||
{
|
||||
for (;;) {
|
||||
if (node->rb_left)
|
||||
node = node->rb_left;
|
||||
else if (node->rb_right)
|
||||
node = node->rb_right;
|
||||
else
|
||||
return (struct rb_node *)node;
|
||||
}
|
||||
}
|
||||
|
||||
struct rb_node *rb_next_postorder(const struct rb_node *node)
|
||||
{
|
||||
const struct rb_node *parent;
|
||||
if (!node)
|
||||
return NULL;
|
||||
parent = rb_parent(node);
|
||||
|
||||
/* If we're sitting on node, we've already seen our children */
|
||||
if (parent && node == parent->rb_left && parent->rb_right) {
|
||||
/* If we are the parent's left node, go to the parent's right
|
||||
* node then all the way down to the left */
|
||||
return rb_left_deepest_node(parent->rb_right);
|
||||
} else
|
||||
/* Otherwise we are the parent's right node, and the parent
|
||||
* should be next */
|
||||
return (struct rb_node *)parent;
|
||||
}
|
||||
EXPORT_SYMBOL(rb_next_postorder);
|
||||
|
||||
struct rb_node *rb_first_postorder(const struct rb_root *root)
|
||||
{
|
||||
if (!root->rb_node)
|
||||
return NULL;
|
||||
|
||||
return rb_left_deepest_node(root->rb_node);
|
||||
}
|
||||
EXPORT_SYMBOL(rb_first_postorder);
|
||||
@ -179,6 +179,7 @@ int shmobj_create(struct shmid_ds *ds, struct memobj **objp)
|
||||
|
||||
memset(obj, 0, sizeof(*obj));
|
||||
obj->memobj.ops = &shmobj_ops;
|
||||
obj->memobj.size = ds->shm_segsz;
|
||||
obj->ds = *ds;
|
||||
obj->ds.shm_perm.seq = the_seq++;
|
||||
obj->ds.shm_nattch = 1;
|
||||
@ -240,14 +241,24 @@ void shmobj_destroy(struct shmobj *obj)
|
||||
npages = (size_t)1 << (obj->pgshift - PAGE_SHIFT);
|
||||
for (;;) {
|
||||
struct page *page;
|
||||
int count;
|
||||
void *page_va;
|
||||
|
||||
page = page_list_first(obj);
|
||||
if (!page) {
|
||||
break;
|
||||
}
|
||||
page_list_remove(obj, page);
|
||||
page_va = phys_to_virt(page_to_phys(page));
|
||||
|
||||
if (ihk_atomic_read(&page->count) != 1) {
|
||||
kprintf("%s: WARNING: page count for phys 0x%lx is invalid\n",
|
||||
__FUNCTION__, page->phys);
|
||||
}
|
||||
|
||||
if (page_unmap(page)) {
|
||||
ihk_mc_free_pages_user(page_va, npages);
|
||||
}
|
||||
#if 0
|
||||
dkprintf("shmobj_destroy(%p):"
|
||||
"release page. %p %#lx %d %d",
|
||||
obj, page, page_to_phys(page),
|
||||
@ -265,7 +276,8 @@ void shmobj_destroy(struct shmobj *obj)
|
||||
}
|
||||
|
||||
page->mode = PM_NONE;
|
||||
free_pages(phys_to_virt(page_to_phys(page)), npages);
|
||||
ihk_mc_free_pages(phys_to_virt(page_to_phys(page)), npages);
|
||||
#endif
|
||||
}
|
||||
if (obj->index < 0) {
|
||||
kfree(obj);
|
||||
@ -394,7 +406,7 @@ static int shmobj_get_page(struct memobj *memobj, off_t off, int p2align,
|
||||
page = page_list_lookup(obj, off);
|
||||
if (!page) {
|
||||
npages = 1 << p2align;
|
||||
virt = ihk_mc_alloc_aligned_pages(npages, p2align,
|
||||
virt = ihk_mc_alloc_aligned_pages_user(npages, p2align,
|
||||
IHK_MC_AP_NOWAIT);
|
||||
if (!virt) {
|
||||
error = -ENOMEM;
|
||||
@ -404,7 +416,7 @@ static int shmobj_get_page(struct memobj *memobj, off_t off, int p2align,
|
||||
goto out;
|
||||
}
|
||||
phys = virt_to_phys(virt);
|
||||
page = phys_to_page(phys);
|
||||
page = phys_to_page_insert_hash(phys);
|
||||
if (page->mode != PM_NONE) {
|
||||
fkprintf("shmobj_get_page(%p,%#lx,%d,%p):"
|
||||
"page %p %#lx %d %d %#lx\n",
|
||||
@ -431,7 +443,7 @@ static int shmobj_get_page(struct memobj *memobj, off_t off, int p2align,
|
||||
out:
|
||||
memobj_unlock(&obj->memobj);
|
||||
if (virt) {
|
||||
ihk_mc_free_pages(virt, npages);
|
||||
ihk_mc_free_pages_user(virt, npages);
|
||||
}
|
||||
dkprintf("shmobj_get_page(%p,%#lx,%d,%p):%d\n",
|
||||
memobj, off, p2align, physp, error);
|
||||
@ -455,7 +467,8 @@ static int shmobj_invalidate_page(struct memobj *memobj, uintptr_t phys,
|
||||
|
||||
if (ihk_atomic_read(&page->count) == 1) {
|
||||
if (page_unmap(page)) {
|
||||
ihk_mc_free_pages(phys_to_virt(phys), pgsize/PAGE_SIZE);
|
||||
ihk_mc_free_pages_user(phys_to_virt(phys),
|
||||
pgsize/PAGE_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
2879
kernel/syscall.c
2879
kernel/syscall.c
File diff suppressed because it is too large
Load Diff
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user