diff --git a/kernel/mem.c b/kernel/mem.c
index e8facbf4..c16c7066 100644
--- a/kernel/mem.c
+++ b/kernel/mem.c
@@ -792,6 +792,27 @@ order_based:
 	return NULL;
 }
 
+/*
+ * Get the NUMA node structure that is the i-th closest to the current CPU
+ */
+struct ihk_mc_numa_node *ihk_mc_get_numa_node_by_distance(int i)
+{
+	int numa_id;
+
+	if (!cpu_local_var_initialized)
+		return NULL;
+
+	if (i < 0 || i >= ihk_mc_get_nr_numa_nodes()) {
+		return NULL;
+	}
+
+	numa_id = ihk_mc_get_numa_id();
+	if (!memory_nodes[numa_id].nodes_by_distance)
+		return NULL;
+
+	return &memory_nodes[memory_nodes[numa_id].nodes_by_distance[i].id];
+}
+
 static void __mckernel_free_pages_in_allocator(void *va, int npages,
 		int is_user)
 {
@@ -1465,11 +1486,13 @@ static void numa_init(void)
 		INIT_LIST_HEAD(&memory_nodes[i].allocators);
 		memory_nodes[i].nodes_by_distance = 0;
 #ifdef IHK_RBTREE_ALLOCATOR
+		memory_nodes[i].zeroed_chunks.rb_node = 0;
 		memory_nodes[i].free_chunks.rb_node = 0;
 		mcs_lock_init(&memory_nodes[i].lock);
 		memory_nodes[i].min_addr = 0xFFFFFFFFFFFFFFFF;
 		memory_nodes[i].max_addr = 0;
 		memory_nodes[i].nr_pages = 0;
+		memory_nodes[i].nr_zeroed_pages = 0;
 		memory_nodes[i].nr_free_pages = 0;
 #endif
 	}
diff --git a/kernel/process.c b/kernel/process.c
index 36435ab9..9f8f5947 100644
--- a/kernel/process.c
+++ b/kernel/process.c
@@ -3122,6 +3122,7 @@ static void idle(void)
 				v->status == CPU_STATUS_RESERVED) {
 			/* No work to do? Consolidate the kmalloc free list */
 			kmalloc_consolidate_free_list();
+			ihk_numa_zero_free_pages(ihk_mc_get_numa_node_by_distance(0));
 			monitor->status = IHK_OS_MONITOR_IDLE;
 			cpu_local_var(current)->status = PS_INTERRUPTIBLE;
 			cpu_safe_halt();
@@ -3477,6 +3478,7 @@ void spin_sleep_or_schedule(void)
 			break;
 		}
 
+		ihk_numa_zero_free_pages(ihk_mc_get_numa_node_by_distance(0));
 		cpu_pause();
 	}
 
diff --git a/kernel/syscall.c b/kernel/syscall.c
index 236786e4..fdc6b185 100644
--- a/kernel/syscall.c
+++ b/kernel/syscall.c
@@ -266,6 +266,7 @@ long do_syscall(struct syscall_request *req, int cpu)
 		cpu_restore_interrupt(runq_irqstate);
 
 		if (!do_schedule) {
+			ihk_numa_zero_free_pages(ihk_mc_get_numa_node_by_distance(0));
 			continue;
 		}
 
diff --git a/kernel/timer.c b/kernel/timer.c
index 6dd727c2..49fd305c 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -102,6 +102,7 @@ uint64_t schedule_timeout(uint64_t timeout)
 
 		/* Spin wait */
 		while ((rdtsc() - t_s) < LOOP_TIMEOUT) {
+			ihk_numa_zero_free_pages(ihk_mc_get_numa_node_by_distance(0));
 			cpu_pause();
 		}
 
diff --git a/lib/include/ihk/mm.h b/lib/include/ihk/mm.h
index e5b94c2f..93c89d3a 100644
--- a/lib/include/ihk/mm.h
+++ b/lib/include/ihk/mm.h
@@ -208,6 +208,10 @@ int ihk_mc_pt_virt_to_phys(struct page_table *pt,
 
 uint64_t ihk_mc_pt_virt_to_pagemap(struct page_table *pt, unsigned long virt);
 int ihk_mc_get_nr_numa_nodes(void);
+struct ihk_mc_numa_node *ihk_mc_get_numa_node_by_distance(int i);
+void ihk_numa_zero_free_pages(struct ihk_mc_numa_node *__node);
+extern int zero_at_free;
+
 struct smp_coreset;
 int ihk_mc_get_numa_node(int id, int *linux_numa_id, int *type);
 int ihk_mc_get_numa_distance(int i, int j);
diff --git a/lib/include/ihk/page_alloc.h b/lib/include/ihk/page_alloc.h
index d11ebd9c..71a1f185 100644
--- a/lib/include/ihk/page_alloc.h
+++ b/lib/include/ihk/page_alloc.h
@@ -41,10 +41,17 @@ struct ihk_mc_numa_node {
 	struct list_head allocators;
 	struct node_distance *nodes_by_distance;
 #ifdef IHK_RBTREE_ALLOCATOR
+	struct rb_root zeroed_chunks;
 	struct rb_root free_chunks;
 	mcs_lock_node_t lock;
 	unsigned long nr_pages;
 
+	/*
+	 * nr_free_pages: all freed pages
+	 * nr_zeroed_pages: zeroed free pages
+	 * Invariant: nr_zeroed_pages <= nr_free_pages
+	 */
+	unsigned long nr_zeroed_pages;
 	unsigned long nr_free_pages;
 	unsigned long min_addr;
 	unsigned long max_addr;
diff --git a/lib/page_alloc.c b/lib/page_alloc.c
index 79f2360f..bff2bf3f 100644
--- a/lib/page_alloc.c
+++ b/lib/page_alloc.c
@@ -319,6 +319,9 @@ kprintf("\nzeroing done\n");
 
 #ifdef IHK_RBTREE_ALLOCATOR
 
+int zero_at_free = 1;
+int deferred_zero_at_free = 1;
+
 /*
  * Simple red-black tree based physical memory management routines.
 *
@@ -356,6 +359,7 @@ static int __page_alloc_rbtree_free_range(struct rb_root *root,
 		/* Is ichunk contigous from the left? */
 		if (ichunk->addr + ichunk->size == addr) {
 			struct rb_node *right;
+
 			/* Extend it to the right */
 			ichunk->size += size;
 			dkprintf("%s: chunk extended to right: 0x%lx:%lu\n",
@@ -370,6 +374,10 @@ static int __page_alloc_rbtree_free_range(struct rb_root *root,
 			if (ichunk->addr + ichunk->size == right_chunk->addr) {
 				ichunk->size += right_chunk->size;
 				rb_erase(right, root);
+
+				/* Clear old structure */
+				memset(right_chunk, 0, sizeof(*right_chunk));
+
 				dkprintf("%s: chunk merged to right: 0x%lx:%lu\n",
 					__FUNCTION__, ichunk->addr, ichunk->size);
 			}
@@ -381,6 +389,7 @@ static int __page_alloc_rbtree_free_range(struct rb_root *root,
 		/* Is ichunk contigous from the right? */
 		if (addr + size == ichunk->addr) {
 			struct rb_node *left;
+
 			/* Extend it to the left */
 			ichunk->addr -= size;
 			ichunk->size += size;
@@ -397,6 +406,10 @@ static int __page_alloc_rbtree_free_range(struct rb_root *root,
 				ichunk->addr -= left_chunk->size;
 				ichunk->size += left_chunk->size;
 				rb_erase(left, root);
+
+				/* Clear old structure */
+				memset(left_chunk, 0, sizeof(*left_chunk));
+
 				dkprintf("%s: chunk merged to left: 0x%lx:%lu\n",
 					__FUNCTION__, ichunk->addr, ichunk->size);
 			}
@@ -406,6 +419,10 @@ static int __page_alloc_rbtree_free_range(struct rb_root *root,
 		new_chunk = (struct free_chunk *)phys_to_virt(ichunk->addr);
 		*new_chunk = *ichunk;
 		rb_replace_node(&ichunk->node, &new_chunk->node, root);
+
+		/* Clear old structure */
+		memset(ichunk, 0, sizeof(*ichunk));
+
 		dkprintf("%s: chunk moved to front: 0x%lx:%lu\n",
 			__FUNCTION__,
 			new_chunk->addr, new_chunk->size);
@@ -530,6 +547,11 @@ static unsigned long __page_alloc_rbtree_alloc_pages(struct rb_root *root,
 		return 0;
 	}
 
+	if (zero_at_free) {
+		memset(phys_to_virt(aligned_addr),
+			0, sizeof(struct free_chunk));
+	}
+
 	return aligned_addr;
 }
 
@@ -576,6 +598,17 @@ static unsigned long __page_alloc_rbtree_reserve_pages(struct rb_root *root,
 	return aligned_addr;
 }
 
+static struct free_chunk *__page_alloc_rbtree_get_root_chunk(
+		struct rb_root *root)
+{
+	struct rb_node *node = root->rb_node;
+	if (!node) {
+		return NULL;
+	}
+
+	rb_erase(node, root);
+	return container_of(node, struct free_chunk, node);
+}
 
 /*
  * External routines.
@@ -583,10 +616,23 @@ static unsigned long __page_alloc_rbtree_reserve_pages(struct rb_root *root,
 int ihk_numa_add_free_pages(struct ihk_mc_numa_node *node,
 		unsigned long addr, unsigned long size)
 {
-	if (__page_alloc_rbtree_free_range(&node->free_chunks, addr, size)) {
-		kprintf("%s: ERROR: adding 0x%lx:%lu\n",
-			__FUNCTION__, addr, size);
-		return EINVAL;
+	if (zero_at_free) {
+		/* Zero chunk */
+		memset(phys_to_virt(addr), 0, size);
+
+		if (__page_alloc_rbtree_free_range(&node->zeroed_chunks, addr, size)) {
+			kprintf("%s: ERROR: adding 0x%lx:%lu\n",
+				__FUNCTION__, addr, size);
+			return EINVAL;
+		}
+	}
+	/* Default behavior */
+	else {
+		if (__page_alloc_rbtree_free_range(&node->free_chunks, addr, size)) {
+			kprintf("%s: ERROR: adding 0x%lx:%lu\n",
+				__FUNCTION__, addr, size);
+			return EINVAL;
+		}
 	}
 
 	if (addr < node->min_addr)
@@ -596,12 +642,81 @@ int ihk_numa_add_free_pages(struct ihk_mc_numa_node *node,
 		node->max_addr = addr + size;
 
 	node->nr_pages += (size >> PAGE_SHIFT);
+	if (zero_at_free) {
+		node->nr_zeroed_pages += (size >> PAGE_SHIFT);
+	}
 	node->nr_free_pages += (size >> PAGE_SHIFT);
 
 	dkprintf("%s: added free pages 0x%lx:%lu\n",
 		__FUNCTION__, addr, size);
 	return 0;
 }
 
+void ihk_numa_zero_free_pages(struct ihk_mc_numa_node *__node)
+{
+	mcs_lock_node_t mcs_node;
+	unsigned long irqflags;
+	int i, max_i;
+
+	if (!zero_at_free)
+		return;
+
+	/* If explicitly specified, zero only in __node */
+	max_i = __node ? 1 : ihk_mc_get_nr_numa_nodes();
+
+	irqflags = cpu_disable_interrupt_save();
+
+	/* Look at NUMA nodes in the order of distance */
+	for (i = 0; i < max_i; ++i) {
+		struct ihk_mc_numa_node *node;
+
+		node = __node ? __node : ihk_mc_get_numa_node_by_distance(i);
+		if (!node) {
+			break;
+		}
+
+		/* Iterate free chunks */
+		for (;;) {
+			struct free_chunk *chunk;
+			unsigned long addr, size;
+
+			mcs_lock_lock_noirq(&node->lock, &mcs_node);
+			chunk = __page_alloc_rbtree_get_root_chunk(&node->free_chunks);
+			/*
+			 * Release the lock to let other CPUs potentially proceed
+			 * in parallel with other chunks
+			 */
+			mcs_lock_unlock_noirq(&node->lock, &mcs_node);
+
+			if (!chunk) {
+				break;
+			}
+
+			/*
+			 * Zero chunk
+			 * NOTE: we cannot refer to the chunk structure any more after zeroing
+			 */
+			addr = chunk->addr;
+			size = chunk->size;
+			memset(phys_to_virt(addr), 0, chunk->size);
+
+			mcs_lock_lock_noirq(&node->lock, &mcs_node);
+			if (__page_alloc_rbtree_free_range(&node->zeroed_chunks, addr, size)) {
+				kprintf("%s: ERROR: freeing 0x%lx:%lu\n",
+					__FUNCTION__, addr, size);
+				goto unlock;
+			}
+
+			node->nr_zeroed_pages += (size >> PAGE_SHIFT);
+			if (cpu_local_var(current)->profile)
+				kprintf("%s: zeroed %lu pages @ NUMA %d\n",
+					__func__, size >> PAGE_SHIFT, node->id);
+unlock:
+			mcs_lock_unlock_noirq(&node->lock, &mcs_node);
+		}
+	}
+
+	cpu_restore_interrupt(irqflags);
+}
 
 unsigned long ihk_numa_alloc_pages(struct ihk_mc_numa_node *node,
 		int npages, int p2align)
@@ -633,14 +748,61 @@ unsigned long ihk_numa_alloc_pages(struct ihk_mc_numa_node *node,
 		goto unlock_out;
 	}
 
-	addr = __page_alloc_rbtree_alloc_pages(&node->free_chunks,
-			npages, p2align);
+	if (zero_at_free) {
+		/* Do we need to zero pages? */
+		if (node->nr_zeroed_pages < npages) {
+			mcs_lock_unlock(&node->lock, &mcs_node);
+			ihk_numa_zero_free_pages(node);
+			mcs_lock_lock(&node->lock, &mcs_node);
+		}
 
-	/* Does not necessarily succeed due to alignment */
-	if (addr) {
-		node->nr_free_pages -= npages;
-		dkprintf("%s: allocated pages 0x%lx:%lu\n",
-			__FUNCTION__, addr, npages << PAGE_SHIFT);
+		/* Still not enough? Give up.. */
+		if (node->nr_zeroed_pages < npages) {
+			goto unlock_out;
+		}
+
+		addr = __page_alloc_rbtree_alloc_pages(&node->zeroed_chunks,
+				npages, p2align);
+
+		/* Does not necessarily succeed due to alignment */
+		if (addr) {
+			node->nr_free_pages -= npages;
+			node->nr_zeroed_pages -= npages;
+#if 0
+			{
+				size_t free_bytes = __count_free_bytes(&node->free_chunks);
+				if (free_bytes != node->nr_free_pages * PAGE_SIZE) {
+					kprintf("%s: inconsistent free count? node: %lu vs. cnt: %lu\n",
+						__func__, node->nr_free_pages * PAGE_SIZE, free_bytes);
+					panic("");
+				}
+			}
+#endif
+			dkprintf("%s: allocated pages 0x%lx:%lu\n",
+				__FUNCTION__, addr, npages << PAGE_SHIFT);
+		}
+	}
+	/* Default behavior */
+	else {
+		addr = __page_alloc_rbtree_alloc_pages(&node->free_chunks,
+				npages, p2align);
+
+		/* Does not necessarily succeed due to alignment */
+		if (addr) {
+			node->nr_free_pages -= npages;
+#if 0
+			{
+				size_t free_bytes = __count_free_bytes(&node->free_chunks);
+				if (free_bytes != node->nr_free_pages * PAGE_SIZE) {
+					kprintf("%s: inconsistent free count? node: %lu vs. cnt: %lu\n",
+						__func__, node->nr_free_pages * PAGE_SIZE, free_bytes);
+					panic("");
+				}
+			}
+#endif
+			dkprintf("%s: allocated pages 0x%lx:%lu\n",
+				__FUNCTION__, addr, npages << PAGE_SHIFT);
+		}
 	}
 
 unlock_out:
@@ -685,15 +847,60 @@ void ihk_numa_free_pages(struct ihk_mc_numa_node *node,
 	}
 
 	mcs_lock_lock(&node->lock, &mcs_node);
-	if (__page_alloc_rbtree_free_range(&node->free_chunks, addr,
-			npages << PAGE_SHIFT)) {
-		kprintf("%s: ERROR: freeing 0x%lx:%lu\n",
-			__FUNCTION__, addr, npages << PAGE_SHIFT);
+	if (!zero_at_free ||
+			(zero_at_free && deferred_zero_at_free)) {
+		/*
+		 * Free to free_chunks first; if zeroing at free is enabled,
+		 * the pages are moved to zeroed_chunks later, asynchronously
+		 */
+		if (__page_alloc_rbtree_free_range(&node->free_chunks, addr,
+				npages << PAGE_SHIFT)) {
+			kprintf("%s: ERROR: freeing 0x%lx:%lu\n",
+				__FUNCTION__, addr, npages << PAGE_SHIFT);
+		}
+		else {
+			node->nr_free_pages += npages;
+#if 0
+			{
+				size_t free_bytes = __count_free_bytes(&node->free_chunks);
+				if (free_bytes != node->nr_free_pages * PAGE_SIZE) {
+					kprintf("%s: inconsistent free count? node: %lu vs. cnt: %lu\n",
+						__func__, node->nr_free_pages * PAGE_SIZE, free_bytes);
+					panic("");
+				}
+			}
+#endif
+			dkprintf("%s: freed pages 0x%lx:%lu\n",
+				__FUNCTION__, addr, npages << PAGE_SHIFT);
+		}
 	}
 	else {
-		node->nr_free_pages += npages;
-		dkprintf("%s: freed pages 0x%lx:%lu\n",
-			__FUNCTION__, addr, npages << PAGE_SHIFT);
+		/*
+		 * Free and zero chunk right here
+		 */
+		memset(phys_to_virt(addr), 0, npages << PAGE_SHIFT);
+
+		if (__page_alloc_rbtree_free_range(&node->zeroed_chunks, addr,
+				npages << PAGE_SHIFT)) {
+			kprintf("%s: ERROR: freeing 0x%lx:%lu\n",
+				__FUNCTION__, addr, npages << PAGE_SHIFT);
+		}
+		else {
+			node->nr_free_pages += npages;
+			node->nr_zeroed_pages += npages;
+#if 0
+			{
+				size_t free_bytes = __count_free_bytes(&node->free_chunks);
+				if (free_bytes != node->nr_free_pages * PAGE_SIZE) {
+					kprintf("%s: inconsistent free count? node: %lu vs. cnt: %lu\n",
+						__func__, node->nr_free_pages * PAGE_SIZE, free_bytes);
+					panic("");
+				}
+			}
+#endif
+			dkprintf("%s: freed+zeroed pages 0x%lx:%lu\n",
+				__FUNCTION__, addr, npages << PAGE_SHIFT);
+		}
 	}
 	mcs_lock_unlock(&node->lock, &mcs_node);
 }
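Note for reviewers: ihk_mc_get_numa_node_by_distance() indexes NUMA nodes in increasing distance from the calling CPU's node, so index 0 is always the local node; that is why the idle, spin-wait, timer and syscall paths above pass 0 and only zero local memory. The standalone sketch below illustrates that ordering only; the 2-node distance matrix, the qsort()-based setup and the "distance" field name are assumptions made up for the example and are not McKernel's initialization code.

/*
 * Illustrative, user-space sketch (not McKernel code): build a per-node
 * table of node IDs sorted by distance, so entry 0 is the local node.
 */
#include <stdio.h>
#include <stdlib.h>

#define NR_NODES 2

struct node_distance {
	int id;
	int distance;	/* hypothetical field name for this example */
};

/* SLIT-style distances: 10 to self, 20 to the remote node (assumed values) */
static const int distance[NR_NODES][NR_NODES] = {
	{ 10, 20 },
	{ 20, 10 },
};

static int cmp_distance(const void *a, const void *b)
{
	const struct node_distance *x = a, *y = b;

	return x->distance - y->distance;
}

int main(void)
{
	struct node_distance by_distance[NR_NODES];
	int local = 1;	/* pretend the calling CPU lives on node 1 */
	int i;

	for (i = 0; i < NR_NODES; i++) {
		by_distance[i].id = i;
		by_distance[i].distance = distance[local][i];
	}
	qsort(by_distance, NR_NODES, sizeof(by_distance[0]), cmp_distance);

	/* by_distance[0].id == local, i.e. index 0 selects the local node */
	for (i = 0; i < NR_NODES; i++)
		printf("rank %d: node %d (distance %d)\n",
		       i, by_distance[i].id, by_distance[i].distance);

	return 0;
}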
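The core of the patch is the split between free_chunks and zeroed_chunks: ihk_numa_free_pages() only accounts pages as free (when deferred_zero_at_free is set), ihk_numa_zero_free_pages() zeroes them from the idle/spin-wait paths and moves them over, and ihk_numa_alloc_pages() hands out memory exclusively from the zeroed pool. The user-space model below is a rough sketch of that flow under simplified assumptions: plain counters and a scratch page stand in for the rb-tree chunks, and free_pages(), zero_pass() and alloc_zeroed() are hypothetical names, not McKernel APIs. It only demonstrates the ordering of operations and the nr_zeroed_pages <= nr_free_pages invariant.

/*
 * Simplified model of the deferred zero-at-free flow (not McKernel code).
 */
#include <stdio.h>
#include <string.h>
#include <assert.h>

#define MODEL_PAGE_SIZE 4096UL

/* Stand-in for the per-NUMA-node counters added by the patch */
struct node_model {
	unsigned long nr_free_pages;	/* all freed pages */
	unsigned long nr_zeroed_pages;	/* zeroed subset; <= nr_free_pages */
};

/* One scratch page models the memory that actually gets zeroed */
static unsigned char scratch[MODEL_PAGE_SIZE];

/* Deferred free: account the pages, do not zero them yet */
static void free_pages(struct node_model *n, unsigned long npages)
{
	n->nr_free_pages += npages;
}

/* Idle-time pass: zero every not-yet-zeroed free page and move it over */
static void zero_pass(struct node_model *n)
{
	while (n->nr_zeroed_pages < n->nr_free_pages) {
		memset(scratch, 0, sizeof(scratch));	/* per-page zeroing cost */
		n->nr_zeroed_pages++;
	}
	assert(n->nr_zeroed_pages <= n->nr_free_pages);
}

/* Allocation is served from the zeroed pool only */
static int alloc_zeroed(struct node_model *n, unsigned long npages)
{
	if (n->nr_zeroed_pages < npages) {
		zero_pass(n);	/* same fallback as ihk_numa_alloc_pages() */
		if (n->nr_zeroed_pages < npages)
			return -1;	/* still not enough: give up */
	}
	n->nr_zeroed_pages -= npages;
	n->nr_free_pages -= npages;
	return 0;
}

int main(void)
{
	struct node_model n = { 0, 0 };

	free_pages(&n, 8);	/* freed, but not zeroed yet */
	printf("after free:  free=%lu zeroed=%lu\n", n.nr_free_pages, n.nr_zeroed_pages);

	zero_pass(&n);		/* what the idle loop does */
	printf("after zero:  free=%lu zeroed=%lu\n", n.nr_free_pages, n.nr_zeroed_pages);

	printf("alloc 4:     %s\n", alloc_zeroed(&n, 4) ? "failed" : "ok");
	printf("after alloc: free=%lu zeroed=%lu\n", n.nr_free_pages, n.nr_zeroed_pages);
	return 0;
}

Deferring the memset to idle time keeps the zeroing cost off the allocation and free fast paths while still guaranteeing that every page handed out has already been zeroed.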