Compare commits


55 Commits
1.1.2 ... 1.2.0

Author SHA1 Message Date
e4b3a88fc6 mcexec_sys_umount(): remove debug print 2016-11-10 15:05:45 +09:00
69a5c53074 NUMA: hide non-existing nodes from /sys/devices/system/node listing 2016-11-05 16:12:08 +09:00
259583e936 mcreboot-smp-x86.sh: more white out of invalid NUMA info 2016-11-05 13:35:53 +09:00
0f826290d0 NUMA: get_mempolicy(), set_mempolicy() and mbind() implementation 2016-11-05 13:32:02 +09:00
e46f027894 mcexec/mcctrl: unmount cgroups (privately) which expose invalid NUMA info 2016-11-04 17:02:48 +09:00
3e093f6a40 sysfs: fix /sys/devices/system/node/online value 2016-11-03 16:10:29 +09:00
00996b551f mcreboot: white out non-existing NUMA information 2016-11-03 16:09:27 +09:00
24d8697cef mcexec: workaround for overlayed /sys FS directory lseek() bug
lseek() on directories under the /sys filesystem that are part of an
overlayed filesystem behaves differently than in the original /sys.
This causes a segfault in libnuma when discovering topology
information. The patch fakes the return value as it is supposed to be,
which also fixes the Intel MPI 2017 MPI_Init() crash.
2016-11-03 13:41:25 +09:00
be4f6741f9 sysfs: fix /sys/devices/system/cpu/cpuXX/online value 2016-11-03 13:39:21 +09:00
7a2f67f5f0 sysfs: eliminate unnecessary new line from /sys/devices/system/node/nodeX/distance 2016-11-03 13:37:53 +09:00
bba0425267 sysfs: fix /sys/devices/system/cpu/online value 2016-11-03 13:36:29 +09:00
beaf96b375 mcreboot/mcstop: proper error handling (revert previous state) 2016-10-28 14:29:10 +09:00
f1af1ffb8f NUMA: expose correct NUMA distances in sysfs 2016-10-27 14:29:15 +09:00
059fab2cc0 mcctrl: fix NULL pointer dereference for unbooted OS instance shutdown 2016-10-26 14:50:07 +09:00
f284a80656 Defrag memory in mcreboot.sh
Merge free physical pages to create large, physically contiguous
blocks with the following command.

    echo 1 > /proc/sys/vm/compact_memory
2016-10-25 16:35:43 +09:00
5f973ab51e IKC2: adjust master channel message queue size dynamically
Determine master channel's message queue size based on the number of
LWK CPUs so that all cores can communicate simultaneously during
syscall channel initialization.
2016-10-24 20:49:00 +09:00
60b6713957 IKC2: eliminate unused structures/fields of old IKC code 2016-10-24 15:41:27 +09:00
ebcf9a0d6d mcctrl: fix a bunch of -Wframe-larger-than warnings 2016-10-21 04:54:38 -04:00
942b7f8b78 mcreboot-smp-x86: eliminate unnecessary resource queries 2016-10-21 03:38:21 -04:00
0b0aa6c0e0 Start mcklogd before McKernel to avoid deadlock
McKernel blocks forever waiting for mcklogd to retrieve kmsg when the
kmsg buffer is full with boot log and mcklogd isn't running.
2016-10-19 16:40:32 +09:00
9705a80c82 get/set_mempolicy(): support for query/set process level policy 2016-10-16 14:01:14 +09:00
99a02e2941 get_mempolicy(): store policy in per-process VM structure 2016-10-16 09:10:36 +09:00
b88d75720f __NR_gettid: use regular offloading channel (fixes unknown PID bug) 2016-10-15 11:46:01 +09:00
d2b677b6da get_mempolicy(): initial implementation 2016-10-14 21:34:32 +09:00
083645f203 mcreboot: purge Linux caches before reserving IHK resources 2016-10-14 21:34:32 +09:00
994b9a19ac NUMA: expose CPU and memory info in /proc/self/status 2016-10-14 21:34:32 +09:00
faa929e717 NUMA: add NUMA mask to process VM structure 2016-10-14 21:34:31 +09:00
3ee3a9df6d sysfs: fix bitmask and bitmask list-view display bug 2016-10-14 21:34:31 +09:00
73e1a4f1f9 NUMA: fill in /sys/devices/system/cpu/nodeX properly and sync with boot script 2016-10-14 21:34:31 +09:00
b068fde9cd NUMA: use IHK CPU and NUMA mappings for sysfs entries 2016-10-14 21:34:31 +09:00
167ea67dee NUMA: receive CPU info in array format 2016-10-14 21:34:31 +09:00
f33d85a27a eclair: support for multiple physical memory chunks 2016-10-14 21:34:31 +09:00
1e8239d72a kmalloc/pagealloc tracker: fix race condition bug 2016-10-14 21:34:31 +09:00
a51a0a6f13 page allocation tracker: support tracking partial deallocations 2016-10-14 21:34:31 +09:00
cc3f6e1a4f page_fault_process_memory_range(): fix double allocation leak 2016-10-14 21:34:31 +09:00
5db6c311f4 page alloc tracker: count freed pages in addr tracker objects 2016-10-14 21:34:31 +09:00
f4df713846 munmap(): fix memory leak in non page backed mappings 2016-10-14 21:34:31 +09:00
7176bb2a47 allow partial deallocation in page level allocation tracker 2016-10-14 21:34:30 +09:00
a6bd98cc02 MM: memory leak tracker for page level allocator 2016-10-14 21:34:30 +09:00
0f7462ae1c mm.h: eliminate global pa_allocator 2016-10-14 21:34:30 +09:00
0d8d915d82 fix KMALLOC_MIN_SIZE macro 2016-10-14 21:34:30 +09:00
8f4f68b877 eliminate arch_alloc_page() and move ihk_mc_alloc_pages() to arch independent code 2016-10-14 21:34:30 +09:00
8c0a5a5e61 page_hash_count_pages(): report page hash size in memory stat 2016-10-14 21:34:30 +09:00
ffd3f53785 page_unmap(): proper locking of hash table 2016-10-14 21:34:30 +09:00
f39fa54c39 NUMA: default policy: allocate from CPU's NUMA node 2016-10-14 21:34:30 +09:00
11125b0d68 fileobj and shmemobj: delete unused variables 2016-10-14 21:34:30 +09:00
3ae69d1290 NUMA: process CPU NUMA information 2016-10-14 21:34:30 +09:00
2929fbb803 NUMA: support multiple physical allocators 2016-10-14 21:34:30 +09:00
f4db8b96de fileobj/shmobj: release pages correctly according to dynamic page frame management 2016-10-14 21:34:30 +09:00
8eb3bf3559 physical page management: eliminate static page frame array and
maintain page structures dynamically covering only file mappings.
use hash table for address <-> page structure conversion.
2016-10-14 21:34:29 +09:00
326a4fcee4 mem_init(): parse NUMA information 2016-10-14 21:34:29 +09:00
9b82f1a52c use ihk_mc_alloc/free_pages() and eliminate direct calls to low level routines 2016-10-14 21:34:29 +09:00
f3da381752 ihk_mc_unmap_virtual: add flush_tlb_single
refs #778
2016-10-11 14:44:23 +09:00
8aa589a40c A signal may sometimes fail to arrive at a thread. 2016-10-04 14:35:25 +09:00
e03f377326 interrupt_syscall: interrupt valid thread 2016-10-03 00:49:56 +09:00
45 changed files with 4247 additions and 806 deletions
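The headline user-visible feature in this range is NUMA memory-policy support: get_mempolicy(), set_mempolicy() and mbind() (commit 0f826290d0 and its follow-ups above). As a minimal sketch of what the new calls enable, assuming an ordinary libnuma-style test program (built with gcc test.c -lnuma) rather than anything in this tree:

    /* Hypothetical test: bind a mapping to NUMA node 0, then query the
     * process-level policy. Both paths are newly served by McKernel. */
    #include <numaif.h>     /* mbind(), get_mempolicy(), MPOL_BIND */
    #include <sys/mman.h>
    #include <stdio.h>

    int main(void)
    {
        size_t len = 16 * 4096;
        void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        unsigned long nodemask = 1UL;   /* bit 0 set: node 0 only */
        int mode = -1;

        if (buf == MAP_FAILED)
            return 1;
        if (mbind(buf, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8, 0))
            perror("mbind");
        if (get_mempolicy(&mode, NULL, 0, NULL, 0))
            perror("get_mempolicy");
        printf("process policy: %d\n", mode);
        return 0;
    }

MPOL_BIND with a nodemask of 1 restricts the mapping to node 0; the maxnode argument counts the bits in the mask.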

View File

@@ -1736,7 +1736,7 @@ int arch_setup_pvclock(void)
npages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
pvti_npages = npages;
pvti = allocate_pages(npages, IHK_MC_AP_NOWAIT);
pvti = ihk_mc_alloc_pages(npages, IHK_MC_AP_NOWAIT);
if (!pvti) {
ekprintf("arch_setup_pvclock: allocate_pages failed.\n");
return -ENOMEM;
@@ -1766,44 +1766,6 @@ void arch_start_pvclock(void)
return;
} /* arch_start_pvclock() */
static struct cpu_mapping *cpu_mapping = NULL;
int arch_get_cpu_mapping(struct cpu_mapping **buf, int *nelemsp)
{
int error;
size_t size;
int npages;
struct cpu_mapping *mapping;
int cpu;
struct x86_cpu_local_variables *v;
if (!cpu_mapping) {
size = sizeof(*mapping) * num_processors;
npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
mapping = allocate_pages(npages, IHK_MC_AP_NOWAIT);
if (!mapping) {
error = -ENOMEM;
ekprintf("arch_get_cpu_mapping:allocate_pages failed. %d\n", error);
goto out;
}
for (cpu = 0; cpu < num_processors; ++cpu) {
v = get_x86_cpu_local_variable(cpu);
mapping[cpu].cpu_number = cpu;
mapping[cpu].hw_id = v->apic_id;
}
cpu_mapping = mapping;
}
error = 0;
*buf = cpu_mapping;
*nelemsp = num_processors;
out:
return error;
} /* arch_get_cpu_mapping() */
#define KVM_CPUID_SIGNATURE 0x40000000
int running_on_kvm(void) {
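The hunk above is representative of a change repeated throughout this series: the arch-specific allocate_pages()/arch_alloc_page() entry points give way to the arch-independent ihk_mc_alloc_pages()/ihk_mc_free_pages() pair (commit 8f4f68b877). A sketch of the resulting idiom, using only names visible in these diffs; struct foo is a placeholder:

    static int alloc_example(int nelems)
    {
        size_t size = nelems * sizeof(struct foo);        /* foo: placeholder */
        int npages = (size + PAGE_SIZE - 1) / PAGE_SIZE;  /* round up to pages */
        struct foo *p = ihk_mc_alloc_pages(npages, IHK_MC_AP_NOWAIT);

        if (!p)
            return -ENOMEM;
        /* ... use p ... */
        ihk_mc_free_pages(p, npages);   /* the caller tracks npages itself */
        return 0;
    }

IHK_MC_AP_NOWAIT fails fast and may return NULL, while IHK_MC_AP_CRITICAL is reserved for boot-time paths that must succeed.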

View File

@@ -13,6 +13,8 @@
#ifndef HEADER_X86_COMMON_ARCH_BITOPS_H
#define HEADER_X86_COMMON_ARCH_BITOPS_H
#define ARCH_HAS_FAST_MULTIPLIER 1
static inline int fls(int x)
{
int r;

View File

@@ -306,7 +306,7 @@ struct page_table;
void set_pte(pte_t *ppte, unsigned long phys, enum ihk_mc_pt_attribute attr);
pte_t *get_pte(struct page_table *pt, void *virt, enum ihk_mc_pt_attribute attr);
void *early_alloc_page(void);
void *early_alloc_pages(int nr_pages);
void *get_last_early_heap(void);
void flush_tlb(void);
void flush_tlb_single(unsigned long addr);

View File

@@ -31,11 +31,10 @@
static char *last_page;
extern char _head[], _end[];
static struct ihk_mc_pa_ops *pa_ops;
extern unsigned long x86_kernel_phys_base;
void *early_alloc_page(void)
/* Arch specific early allocation routine */
void *early_alloc_pages(int nr_pages)
{
void *p;
@@ -48,41 +47,14 @@ void *early_alloc_page(void)
panic("Early allocator is already finalized. Do not use it.\n");
}
p = last_page;
last_page += PAGE_SIZE;
last_page += (nr_pages * PAGE_SIZE);
return p;
}
void *arch_alloc_page(enum ihk_mc_ap_flag flag)
void early_alloc_invalidate(void)
{
if (pa_ops)
return pa_ops->alloc_page(1, PAGE_P2ALIGN, flag);
else
return early_alloc_page();
}
void arch_free_page(void *ptr)
{
if (pa_ops)
pa_ops->free_page(ptr, 1);
}
void *ihk_mc_alloc_aligned_pages(int npages, int p2align, enum ihk_mc_ap_flag flag)
{
if (pa_ops)
return pa_ops->alloc_page(npages, p2align, flag);
else
return NULL;
}
void *ihk_mc_alloc_pages(int npages, enum ihk_mc_ap_flag flag)
{
return ihk_mc_alloc_aligned_pages(npages, PAGE_P2ALIGN, flag);
}
void ihk_mc_free_pages(void *p, int npages)
{
if (pa_ops)
pa_ops->free_page(p, npages);
last_page = (void *)-1;
}
void *ihk_mc_allocate(int size, int flag)
@@ -175,7 +147,7 @@ static unsigned long setup_l3(struct page_table *pt,
pt->entry[i] = 0;
continue;
}
pt_phys = setup_l2(arch_alloc_page(IHK_MC_AP_CRITICAL), phys, start, end);
pt_phys = setup_l2(ihk_mc_alloc_pages(1, IHK_MC_AP_CRITICAL), phys, start, end);
pt->entry[i] = pt_phys | PFL3_PDIR_ATTR;
}
@@ -199,7 +171,7 @@ static void init_normal_area(struct page_table *pt)
for (phys = (map_start & ~(PTL4_SIZE - 1)); phys < map_end;
phys += PTL4_SIZE) {
pt_phys = setup_l3(arch_alloc_page(IHK_MC_AP_CRITICAL), phys,
pt_phys = setup_l3(ihk_mc_alloc_pages(1, IHK_MC_AP_CRITICAL), phys,
map_start, map_end);
pt->entry[ident_index++] = pt_phys | PFL4_PDIR_ATTR;
@@ -209,7 +181,7 @@ static void init_normal_area(struct page_table *pt)
static struct page_table *__alloc_new_pt(enum ihk_mc_ap_flag ap_flag)
{
struct page_table *newpt = arch_alloc_page(ap_flag);
struct page_table *newpt = ihk_mc_alloc_pages(1, ap_flag);
if(newpt)
memset(newpt, 0, sizeof(struct page_table));
@@ -718,7 +690,7 @@ static void destroy_page_table(int level, struct page_table *pt)
}
}
arch_free_page(pt);
ihk_mc_free_pages(pt, 1);
return;
}
@@ -1112,7 +1084,7 @@ static int clear_range_l1(void *args0, pte_t *ptep, uint64_t base,
}
if (!(old & PFL1_FILEOFF) && args->free_physical) {
if (page && page_unmap(page)) {
if (!page || (page && page_unmap(page))) {
ihk_mc_free_pages(phys_to_virt(phys), 1);
dkprintf("%s: freeing regular page at 0x%lx\n", __FUNCTION__, base);
}
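This condition change recurs in clear_range_l1/l2/l3 below: with page structures now maintained dynamically, an anonymous page may have no tracking struct page at all (page == NULL), and the old test then skipped the ihk_mc_free_pages() call entirely, leaking the physical memory. The new test frees it in that case too (compare commit f4df713846, "munmap(): fix memory leak in non page backed mappings").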
@@ -1161,7 +1133,7 @@ static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base,
}
if (!(old & PFL2_FILEOFF) && args->free_physical) {
if (page && page_unmap(page)) {
if (!page || (page && page_unmap(page))) {
ihk_mc_free_pages(phys_to_virt(phys), PTL2_SIZE/PTL1_SIZE);
dkprintf("%s: freeing large page at 0x%lx\n", __FUNCTION__, base);
}
@@ -1181,7 +1153,7 @@ static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base,
*ptep = PTE_NULL;
remote_flush_tlb_cpumask(args->vm, base,
ihk_mc_get_processor_id());
arch_free_page(pt);
ihk_mc_free_pages(pt, 1);
}
return 0;
@@ -1226,7 +1198,7 @@ static int clear_range_l3(void *args0, pte_t *ptep, uint64_t base,
}
if (!(old & PFL3_FILEOFF) && args->free_physical) {
if (page && page_unmap(page)) {
if (!page || (page && page_unmap(page))) {
ihk_mc_free_pages(phys_to_virt(phys), PTL3_SIZE/PTL1_SIZE);
}
args->vm->currss -= PTL3_SIZE;
@@ -1245,7 +1217,7 @@ static int clear_range_l3(void *args0, pte_t *ptep, uint64_t base,
*ptep = PTE_NULL;
remote_flush_tlb_cpumask(args->vm, base,
ihk_mc_get_processor_id());
arch_free_page(pt);
ihk_mc_free_pages(pt, 1);
}
return 0;
@@ -1596,7 +1568,7 @@ retry:
error = 0;
out:
if (newpt) {
arch_free_page(newpt);
ihk_mc_free_pages(newpt, 1);
}
dkprintf("set_range_l2(%lx,%lx,%lx): %d %lx\n",
base, start, end, error, *ptep);
@@ -1679,7 +1651,7 @@ retry:
error = 0;
out:
if (newpt) {
arch_free_page(newpt);
ihk_mc_free_pages(newpt, 1);
}
dkprintf("set_range_l3(%lx,%lx,%lx): %d\n",
base, start, end, error, *ptep);
@@ -1737,7 +1709,7 @@ retry:
error = 0;
out:
if (newpt) {
arch_free_page(newpt);
ihk_mc_free_pages(newpt, 1);
}
dkprintf("set_range_l4(%lx,%lx,%lx): %d %lx\n",
base, start, end, error, *ptep);
@@ -2094,7 +2066,7 @@ static void init_vsyscall_area(struct page_table *pt)
void init_page_table(void)
{
check_available_page_size();
init_pt = arch_alloc_page(IHK_MC_AP_CRITICAL);
init_pt = ihk_mc_alloc_pages(1, IHK_MC_AP_CRITICAL);
ihk_mc_spinlock_init(&init_pt_lock);
memset(init_pt, 0, sizeof(PAGE_SIZE));
@@ -2111,27 +2083,27 @@ void init_page_table(void)
}
extern void __reserve_arch_pages(unsigned long, unsigned long,
void (*)(unsigned long, unsigned long, int));
void (*)(struct ihk_page_allocator_desc *,
unsigned long, unsigned long, int));
void ihk_mc_reserve_arch_pages(unsigned long start, unsigned long end,
void (*cb)(unsigned long, unsigned long, int))
void ihk_mc_reserve_arch_pages(struct ihk_page_allocator_desc *pa_allocator,
unsigned long start, unsigned long end,
void (*cb)(struct ihk_page_allocator_desc *,
unsigned long, unsigned long, int))
{
/* Reserve Text + temporal heap */
cb(virt_to_phys(_head), virt_to_phys(get_last_early_heap()), 0);
cb(pa_allocator, virt_to_phys(_head), virt_to_phys(get_last_early_heap()), 0);
/* Reserve trampoline area to boot the second ap */
cb(ap_trampoline, ap_trampoline + AP_TRAMPOLINE_SIZE, 0);
cb(pa_allocator, ap_trampoline, ap_trampoline + AP_TRAMPOLINE_SIZE, 0);
/* Reserve the null page */
cb(0, PAGE_SIZE, 0);
/* Micro-arch specific */
cb(pa_allocator, 0, PAGE_SIZE, 0);
/*
* Micro-arch specific
* TODO: this does nothing in SMP mode, update it for KNC if necessary
*/
__reserve_arch_pages(start, end, cb);
}
void ihk_mc_set_page_allocator(struct ihk_mc_pa_ops *ops)
{
last_page = (void *)-1;
pa_ops = ops;
}
unsigned long virt_to_phys(void *v)
{
unsigned long va = (unsigned long)v;

View File

@@ -16,6 +16,7 @@
#include <memory.h>
#include <string.h>
extern int num_processors;
extern void arch_set_mikc_queue(void *r, void *w);
ihk_ikc_ph_t arch_master_channel_packet_handler;
@@ -23,17 +24,23 @@ int ihk_mc_ikc_init_first_local(struct ihk_ikc_channel_desc *channel,
ihk_ikc_ph_t packet_handler)
{
struct ihk_ikc_queue_head *rq, *wq;
size_t mikc_queue_pages;
ihk_ikc_system_init(NULL);
memset(channel, 0, sizeof(struct ihk_ikc_channel_desc));
/* Place both sides in this side */
rq = arch_alloc_page(IHK_MC_AP_CRITICAL);
wq = arch_alloc_page(IHK_MC_AP_CRITICAL);
mikc_queue_pages = ((num_processors * MASTER_IKCQ_PKTSIZE)
+ (PAGE_SIZE - 1)) / PAGE_SIZE;
ihk_ikc_init_queue(rq, 0, 0, PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
ihk_ikc_init_queue(wq, 0, 0, PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
/* Place both sides in this side */
rq = ihk_mc_alloc_pages(mikc_queue_pages, IHK_MC_AP_CRITICAL);
wq = ihk_mc_alloc_pages(mikc_queue_pages, IHK_MC_AP_CRITICAL);
ihk_ikc_init_queue(rq, 0, 0,
mikc_queue_pages * PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
ihk_ikc_init_queue(wq, 0, 0,
mikc_queue_pages * PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
arch_master_channel_packet_handler = packet_handler;
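Rough sizing arithmetic for the hunk above: the master IKC queue grows from a fixed single page to num_processors * MASTER_IKCQ_PKTSIZE bytes, rounded up to whole pages. Assuming for illustration 4 KiB pages and a 128-byte MASTER_IKCQ_PKTSIZE (the real value is defined elsewhere in the tree), 64 LWK CPUs need 8192 bytes, i.e. two pages, so every core can have a packet in flight during syscall channel initialization (commit 5f973ab51e).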

View File

@ -3,13 +3,13 @@
# IHK SMP-x86 example boot script.
# author: Balazs Gerofi <bgerofi@riken.jp>
# Copyright (C) 2014 RIKEN AICS
#
# This is an example script for loading IHK, configuring a partition and
# booting McKernel on it.
# The script reserves half of the CPU cores and 512MB of RAM from NUMA node 0
# when IHK is loaded for the first time, otherwise it destroys the current
# McKernel instance and reboots it using the same set of resources as it used
# previously.
#
# This is an example script for loading IHK, configuring a partition and
# booting McKernel on it. Unless specific CPUs and memory are requested,
# the script reserves half of the CPU cores and 512MB of RAM from
# NUMA node 0 when IHK is loaded for the first time.
# Otherwise, it destroys the current McKernel instance and reboots it using
# the same set of resources as it used previously.
# Note that the script does not output anything unless an error occurs.
prefix="@prefix@"
@@ -23,15 +23,20 @@ ENABLE_MCOVERLAYFS="@ENABLE_MCOVERLAYFS@"
mem="512M@0"
cpus=""
if [ "${BASH_VERSINFO[0]}" -lt 4 ]; then
echo "You need at least bash-4.0 to run this script." >&2
exit 1
fi
INTERVAL=1
LOGMODE=0
facility="LOG_LOCAL6"
chown_option=`logname 2> /dev/null`
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" -o "`systemctl status irqbalance.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
irqbalance_used="yes"
irqbalance_used="yes"
else
irqbalance_used="no"
irqbalance_used="no"
fi
while getopts :i:k:c:m:o:f: OPT
@@ -76,6 +81,103 @@ do
esac
done
#
# Revert any state that has been initialized before the error occurred.
#
error_exit() {
local status=$1
case $status in
mcos_sys_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos/mcos0_sys
fi
;&
mcos_proc_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos/mcos0_proc
fi
;&
mcoverlayfs_loaded)
if [ "$enable_mcoverlay" == "yes" ]; then
rmmod mcoverlay
fi
;&
linux_proc_bind_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos/linux_proc
fi
;&
tmp_mcos_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos
fi
;&
tmp_mcos_created)
if [ "$enable_mcoverlay" == "yes" ]; then
rm -rf /tmp/mcos
fi
;&
os_created)
# Destroy all LWK instances
if ls /dev/mcos* 1>/dev/null 2>&1; then
for i in /dev/mcos*; do
ind=`echo $i|cut -c10-`;
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then
echo "warning: failed to destroy LWK instance $ind" >&2
fi
done
fi
;&
mcctrl_loaded)
rmmod mcctrl || echo "warning: failed to remove mcctrl" >&2
;&
mem_reserved)
mem=`${SBINDIR}/ihkconfig 0 query mem`
if [ "${mem}" != "" ]; then
if ! ${SBINDIR}/ihkconfig 0 release mem $mem > /dev/null; then
echo "warning: failed to release memory" >&2
fi
fi
;&
cpus_reserved)
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if [ "${cpus}" != "" ]; then
if ! ${SBINDIR}/ihkconfig 0 release cpu $cpus > /dev/null; then
echo "warning: failed to release CPUs" >&2
fi
fi
;&
ihk_smp_loaded)
rmmod ihk_smp_x86 || echo "warning: failed to remove ihk_smp_x86" >&2
;&
ihk_loaded)
rmmod ihk || echo "warning: failed to remove ihk" >&2
;&
irqbalance_stopped)
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
if ! systemctl stop irqbalance_mck.service 2>/dev/null; then
echo "warning: failed to stop irqbalance_mck" >&2
fi
if ! systemctl disable irqbalance_mck.service >/dev/null 2>/dev/null; then
echo "warning: failed to disable irqbalance_mck" >&2
fi
if ! etcdir=@ETCDIR@ perl -e '$etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "$etcdir/proc/irq/*/smp_affinity"; foreach $file (@files) { $dest = substr($file, length($etcdir)); if(0) {print "cp $file $dest\n";} system("cp $file $dest 2>/dev/null"); }'; then
echo "warning: failed to restore /proc/irq/*/smp_affinity" >&2
fi
if ! systemctl start irqbalance.service; then
echo "warning: failed to start irqbalance" >&2;
fi
fi
;&
initial)
# Nothing more to revert
;;
esac
exit 1
}
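Note that error_exit() depends on bash 4 semantics: the ;& terminator falls through into the next pattern unconditionally, so entering the case at the current initialization state also runs every cleanup step listed after it, unwinding in reverse order of setup. This is exactly why the script now refuses to run under bash older than 4.0 (the BASH_VERSINFO check above).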
ihk_ikc_irq_core=0
release=`uname -r`
@@ -84,8 +186,12 @@ minor=`echo ${release} | sed -e 's/^[0-9]*.\([0-9]*\).*/\1/'`
patch=`echo ${release} | sed -e 's/^[0-9]*.[0-9]*.\([0-9]*\).*/\1/'`
linux_version_code=`expr \( ${major} \* 65536 \) + \( ${minor} \* 256 \) + ${patch}`
rhel_release=`echo ${release} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/'`
if [ "${release}" == "${rhel_release}" ]; then rhel_release=""; fi
if [ "${release}" == "${rhel_release}" ]; then
rhel_release="";
fi
enable_mcoverlay="no"
if [ "${ENABLE_MCOVERLAYFS}" == "yes" ]; then
if [ "${rhel_release}" == "" ]; then
if [ ${linux_version_code} -ge 262144 -a ${linux_version_code} -lt 262400 ]; then
@@ -98,6 +204,7 @@ if [ "${ENABLE_MCOVERLAYFS}" == "yes" ]; then
fi
fi
# Figure out CPUs if not requested by user
if [ "$cpus" == "" ]; then
# Get the number of CPUs on NUMA node 0
nr_cpus=`lscpu --parse | awk -F"," '{if ($4 == 0) print $4}' | wc -l`
@@ -105,7 +212,10 @@ if [ "$cpus" == "" ]; then
# Use the second half of the cores
let nr_cpus="$nr_cpus / 2"
cpus=`lscpu --parse | awk -F"," '{if ($4 == 0) print $1}' | tail -n $nr_cpus | xargs echo -n | sed 's/ /,/g'`
if [ "$cpus" == "" ]; then echo "error: no available CPUs on NUMA node 0?" >&2; exit 1; fi
if [ "$cpus" == "" ]; then
echo "error: no available CPUs on NUMA node 0?" >&2
exit 1
fi
fi
# Remove mcoverlay if loaded
@@ -116,85 +226,149 @@ if [ "$enable_mcoverlay" == "yes" ]; then
if [ "`cat /proc/mounts | grep /tmp/mcos/linux_proc`" != "" ]; then umount -l /tmp/mcos/linux_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos`" != "" ]; then umount -l /tmp/mcos; fi
if [ -e /tmp/mcos ]; then rm -rf /tmp/mcos; fi
if ! rmmod mcoverlay; then echo "error: removing mcoverlay" >&2; exit 1; fi
if ! rmmod mcoverlay; then
echo "error: removing mcoverlay" >&2
error_exit "initial"
fi
fi
fi
# Stop irqbalance
if [ "${irqbalance_used}" == "yes" ]; then
systemctl stop irqbalance_mck.service 2>/dev/null
if ! systemctl stop irqbalance.service 2>/dev/null ; then echo "error: stopping irqbalance" >&2; exit 1; fi;
if ! systemctl stop irqbalance.service 2>/dev/null ; then
echo "error: stopping irqbalance" >&2
error_exit "initial"
fi;
fi
# Start mcklogd. Note that with '-k 1' McKernel blocks when the kmsg
# buffer is full until mcklogd unblocks it, so starting mcklogd must
# precede booting McKernel
if [ ${LOGMODE} -ne 0 ]; then
# Stop mcklogd which has survived McKernel shutdown because
# mcstop+release.sh is not used
pkill mcklogd
SBINDIR=${SBINDIR} ${SBINDIR}/mcklogd -i ${INTERVAL} -f ${facility}
fi
# Load IHK if not loaded
if [ "`lsmod | grep ihk`" == "" ]; then
if ! insmod ${KMODDIR}/ihk.ko; then echo "error: loading ihk" >&2; exit 1; fi;
if ! insmod ${KMODDIR}/ihk.ko; then
echo "error: loading ihk" >&2
error_exit "irqbalance_stopped"
fi
fi
# Drop Linux caches to free memory
sync && echo 3 > /proc/sys/vm/drop_caches
# Merge free memory areas into large, physically contiguous ones
echo 1 > /proc/sys/vm/compact_memory 2>/dev/null
# Load IHK-SMP if not loaded and reserve CPUs and memory
if [ "`lsmod | grep ihk_smp_x86`" == "" ]; then
ihk_irq=""
for i in `seq 64 255`; do
if [ ! -d /proc/irq/$i ] && [ "`cat /proc/interrupts | grep ":" | awk '{print $1}' | grep -o '[0-9]*' | grep -e '^$i$'`" == "" ]; then
ihk_irq=$i
break
fi
done
if [ "$ihk_irq" == "" ]; then echo "error: no IRQ available" >&2; exit 1; fi
if ! insmod ${KMODDIR}/ihk-smp-x86.ko ihk_start_irq=$ihk_irq ihk_ikc_irq_core=$ihk_ikc_irq_core; then echo "error: loading ihk-smp-x86" >&2; exit 1; fi;
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then echo "error: reserving CPUs" >&2; exit 1; fi
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then echo "error: reserving memory" >&2; exit 1; fi
# If loaded, but no resources allocated, get CPUs and memory
else
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus" >&2; exit 1; fi
cpus_allocated=`${SBINDIR}/ihkosctl 0 query cpu`
if [ "$cpus_allocated" == "" ]; then
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then echo "error: reserving CPUs" >&2; exit 1; fi
fi
if ! ${SBINDIR}/ihkosctl 0 query mem > /dev/null; then echo "error: querying memory" >&2; exit 1; fi
mem_allocated=`${SBINDIR}/ihkosctl 0 query mem`
if [ "$mem_allocated" == "" ]; then
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then echo "error: reserving memory" >&2; exit 1; fi
ihk_irq=""
for i in `seq 64 255`; do
if [ ! -d /proc/irq/$i ] && [ "`cat /proc/interrupts | grep ":" | awk '{print $1}' | grep -o '[0-9]*' | grep -e '^$i$'`" == "" ]; then
ihk_irq=$i
break
fi
done
if [ "$ihk_irq" == "" ]; then
echo "error: no IRQ available" >&2
error_exit "ihk_loaded"
fi
if ! insmod ${KMODDIR}/ihk-smp-x86.ko ihk_start_irq=$ihk_irq ihk_ikc_irq_core=$ihk_ikc_irq_core; then
echo "error: loading ihk-smp-x86" >&2
error_exit "ihk_loaded"
fi
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then
echo "error: reserving CPUs" >&2;
error_exit "ihk_smp_loaded"
fi
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then
echo "error: reserving memory" >&2
error_exit "cpus_reserved"
fi
fi
# Load mcctrl if not loaded
if [ "`lsmod | grep mcctrl`" == "" ]; then
if ! insmod ${KMODDIR}/mcctrl.ko; then echo "error: inserting mcctrl.ko" >&2; exit 1; fi
if ! insmod ${KMODDIR}/mcctrl.ko; then
echo "error: inserting mcctrl.ko" >&2
error_exit "mem_reserved"
fi
fi
# Check for existing OS instance and destroy
if [ -c /dev/mcos0 ]; then
# Query CPU cores and memory of OS instance so that the same values are used as previously
if ! ${SBINDIR}/ihkosctl 0 query cpu > /dev/null; then echo "error: querying cpus" >&2; exit 1; fi
cpus=`${SBINDIR}/ihkosctl 0 query cpu`
if ! ${SBINDIR}/ihkosctl 0 query mem > /dev/null; then echo "error: querying memory" >&2; exit 1; fi
mem=`${SBINDIR}/ihkosctl 0 query mem`
if ! ${SBINDIR}/ihkconfig 0 destroy 0; then echo "warning: destroy failed" >&2; fi
else
# Otherwise query IHK-SMP for resources
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus" >&2; exit 1; fi
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then echo "error: querying memory" >&2; exit 1; fi
mem=`${SBINDIR}/ihkconfig 0 query mem`
# Destroy all LWK instances
if ls /dev/mcos* 1>/dev/null 2>&1; then
for i in /dev/mcos*; do
ind=`echo $i|cut -c10-`;
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then
echo "error: destroying LWK instance $ind failed" >&2
error_exit "mcctrl_loaded"
fi
done
fi
if ! ${SBINDIR}/ihkconfig 0 create; then echo "error: create" >&2; exit; fi
if ! ${SBINDIR}/ihkosctl 0 assign cpu ${cpus}; then echo "error: assign CPUs" >&2; exit 1; fi
if ! ${SBINDIR}/ihkosctl 0 assign mem ${mem}; then echo "error: assign memory" >&2; exit 1; fi
if ! ${SBINDIR}/ihkosctl 0 load ${KERNDIR}/mckernel.img; then echo "error: loading kernel image" >&2; exit 1; fi
if ! ${SBINDIR}/ihkosctl 0 kargs "hidos ksyslogd=${LOGMODE}"; then echo "error: setting kernel arguments" >&2; exit 1; fi
if ! ${SBINDIR}/ihkosctl 0 boot; then echo "error: booting" >&2; exit 1; fi
if ! chown ${chown_option} /dev/mcd* /dev/mcos*; then echo "error: chowning device files" >&2; exit 1; fi
# Create OS instance
if ! ${SBINDIR}/ihkconfig 0 create; then
echo "error: creating OS instance" >&2
error_exit "mcctrl_loaded"
fi
# Assign CPUs
if ! ${SBINDIR}/ihkosctl 0 assign cpu ${cpus}; then
echo "error: assign CPUs" >&2
error_exit "os_created"
fi
# Assign memory
if ! ${SBINDIR}/ihkosctl 0 assign mem ${mem}; then
echo "error: assign memory" >&2
error_exit "os_created"
fi
# Load kernel image
if ! ${SBINDIR}/ihkosctl 0 load ${KERNDIR}/mckernel.img; then
echo "error: loading kernel image: ${KERNDIR}/mckernel.img" >&2
error_exit "os_created"
fi
# Set kernel arguments
if ! ${SBINDIR}/ihkosctl 0 kargs "hidos ksyslogd=${LOGMODE}"; then
echo "error: setting kernel arguments" >&2
error_exit "os_created"
fi
# Boot OS instance
if ! ${SBINDIR}/ihkosctl 0 boot; then
echo "error: booting" >&2
error_exit "os_created"
fi
# Set device file ownership
if ! chown ${chown_option} /dev/mcd* /dev/mcos*; then
echo "warning: failed to chown device files" >&2
fi
# Overlay /proc, /sys with McKernel specific contents
if [ "$enable_mcoverlay" == "yes" ]; then
if [ ! -e /tmp/mcos ]; then mkdir -p /tmp/mcos; fi
if ! mount -t tmpfs tmpfs /tmp/mcos; then echo "error: mount /tmp/mcos" >&2; exit 1; fi
if ! mount -t tmpfs tmpfs /tmp/mcos; then
echo "error: mount /tmp/mcos" >&2
error_exit "tmp_mcos_created"
fi
if [ ! -e /tmp/mcos/linux_proc ]; then mkdir -p /tmp/mcos/linux_proc; fi
if ! mount --bind /proc /tmp/mcos/linux_proc; then echo "error: mount /tmp/mcos/linux_proc" >&2; exit 1; fi
if ! insmod ${KMODDIR}/mcoverlay.ko; then echo "error: inserting mcoverlay.ko" >&2; exit 1; fi
if ! mount --bind /proc /tmp/mcos/linux_proc; then
echo "error: mount /tmp/mcos/linux_proc" >&2
error_exit "tmp_mcos_mounted"
fi
if ! insmod ${KMODDIR}/mcoverlay.ko; then
echo "error: inserting mcoverlay.ko" >&2
error_exit "linux_proc_bind_mounted"
fi
while [ ! -e /proc/mcos0 ]
do
sleep 1
@@ -202,49 +376,93 @@ if [ "$enable_mcoverlay" == "yes" ]; then
if [ ! -e /tmp/mcos/mcos0_proc ]; then mkdir -p /tmp/mcos/mcos0_proc; fi
if [ ! -e /tmp/mcos/mcos0_proc_upper ]; then mkdir -p /tmp/mcos/mcos0_proc_upper; fi
if [ ! -e /tmp/mcos/mcos0_proc_work ]; then mkdir -p /tmp/mcos/mcos0_proc_work; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/proc/mcos0:/proc,upperdir=/tmp/mcos/mcos0_proc_upper,workdir=/tmp/mcos/mcos0_proc_work,nocopyupw,nofscheck /tmp/mcos/mcos0_proc; then echo "error: mount /tmp/mcos/mcos0_proc" >&2; exit 1; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/proc/mcos0:/proc,upperdir=/tmp/mcos/mcos0_proc_upper,workdir=/tmp/mcos/mcos0_proc_work,nocopyupw,nofscheck /tmp/mcos/mcos0_proc; then
echo "error: mounting /tmp/mcos/mcos0_proc" >&2
error_exit "mcoverlayfs_loaded"
fi
# TODO: How do we revert this in case of failure?
mount --make-rprivate /proc
while [ ! -e /sys/devices/virtual/mcos/mcos0/sys ]
while [ ! -e /sys/devices/virtual/mcos/mcos0/sys/setup_complete ]
do
sleep 1
sleep 0.1
done
if [ ! -e /tmp/mcos/mcos0_sys ]; then mkdir -p /tmp/mcos/mcos0_sys; fi
if [ ! -e /tmp/mcos/mcos0_sys_upper ]; then mkdir -p /tmp/mcos/mcos0_sys_upper; fi
if [ ! -e /tmp/mcos/mcos0_sys_work ]; then mkdir -p /tmp/mcos/mcos0_sys_work; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/sys/devices/virtual/mcos/mcos0/sys:/sys,upperdir=/tmp/mcos/mcos0_sys_upper,workdir=/tmp/mcos/mcos0_sys_work,nocopyupw,nofscheck /tmp/mcos/mcos0_sys; then echo "error: mount /tmp/mcos/mcos0_sys" >&2; exit 1; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/sys/devices/virtual/mcos/mcos0/sys:/sys,upperdir=/tmp/mcos/mcos0_sys_upper,workdir=/tmp/mcos/mcos0_sys_work,nocopyupw,nofscheck /tmp/mcos/mcos0_sys; then
echo "error: mount /tmp/mcos/mcos0_sys" >&2
error_exit "mcos_proc_mounted"
fi
# TODO: How do we revert this in case of failure?
mount --make-rprivate /sys
rm -rf /tmp/mcos/mcos0_sys/setup_complete
# Hide NUMA related files which are outside the LWK partition
for cpuid in `find /sys/devices/system/cpu/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid" ]; then
rm -rf /tmp/mcos/mcos0_sys/devices/system/cpu/$cpuid
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/devices/$cpuid
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/drivers/processor/$cpuid
else
for nodeid in `find /sys/devices/system/cpu/$cpuid/* -maxdepth 0 -name "node[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid/$nodeid" ]; then
rm -f /tmp/mcos/mcos0_sys/devices/system/cpu/$cpuid/$nodeid
fi
done
fi
done
for nodeid in `find /sys/devices/system/node/* -maxdepth 0 -name "node[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/node/$nodeid" ]; then
rm -rf /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/*
rm -rf /tmp/mcos/mcos0_sys/bus/node/devices/$nodeid
else
# Delete non-existent symlinks
for cpuid in `find /sys/devices/system/node/$nodeid/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/node/$nodeid/$cpuid" ]; then
rm -f /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/$cpuid
fi
done
rm -f /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/memory*
fi
done
rm -f /tmp/mcos/mcos0_sys/devices/system/node/has_*
for cpuid in `find /sys/bus/cpu/devices/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/bus/cpu/devices/$cpuid" ]; then
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/devices/$cpuid
fi
done
fi
if [ ${LOGMODE} -ne 0 ]
then
# mcklogd survives when McKernel isn't shut down by mcstop+release.sh
pkill mcklogd
SBINDIR=${SBINDIR} ${SBINDIR}/mcklogd -i ${INTERVAL} -f ${facility}
fi
# Start irqbalance with CPUs and IRQ for McKernel banned
if [ "${irqbalance_used}" == "yes" ]; then
if ! etcdir=@ETCDIR@ perl -e 'use File::Copy qw(copy); $etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "/proc/irq/*/smp_affinity"; foreach $file (@files) { $rel = substr($file, 1); $dir=substr($rel, 0, length($rel)-length("/smp_affinity")); if(0) { print "cp $file $etcdir/$rel\n";} if(system("mkdir -p $etcdir/$dir")){ exit 1;} if(!copy($file,"$etcdir/$rel")){ exit 1;} }' ; then echo "error: saving /proc/irq/*/smp_affinity" >&2; exit 1; fi;
if ! etcdir=@ETCDIR@ perl -e 'use File::Copy qw(copy); $etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "/proc/irq/*/smp_affinity"; foreach $file (@files) { $rel = substr($file, 1); $dir=substr($rel, 0, length($rel)-length("/smp_affinity")); if(0) { print "cp $file $etcdir/$rel\n";} if(system("mkdir -p $etcdir/$dir")){ exit 1;} if(!copy($file,"$etcdir/$rel")){ exit 1;} }'; then
echo "error: saving /proc/irq/*/smp_affinity" >&2
error_exit "mcos_sys_mounted"
fi;
ncpus=`lscpu | grep -E '^CPU\(s\):' | awk '{print $2}'`
smp_affinity_mask=`echo $cpus | ncpus=$ncpus perl -e 'while(<>){@tokens = split /,/;foreach $token (@tokens) {@nums = split /-/,$token; for($num = $nums[0]; $num <= $nums[$#nums]; $num++) {$ndx=int($num/32); $mask[$ndx] |= (1<<($num % 32))}}} $nint32s = int(($ENV{'ncpus'}+31)/32); for($j = $nint32s - 1; $j >= 0; $j--) { if($j != $nint32s - 1){print ",";} $nblks = $j == $nint32s - 1 ? int(($ENV{'ncpus'} % 32)/4) : 8; for($i = $nblks - 1;$i >= 0;$i--){ printf("%01x",($mask[$j] >> ($i*4)) & 0xf);}}'`
if ! ncpus=$ncpus smp_affinity_mask=$smp_affinity_mask perl -e '@dirs = grep { -d } glob "/proc/irq/*"; foreach $dir (@dirs) { $hit = 0; $affinity_str = `cat $dir/smp_affinity`; chomp $affinity_str; @int32strs = split /,/, $affinity_str; @int32strs_mask=split /,/, $ENV{'smp_affinity_mask'}; for($i=0;$i <= $#int32strs_mask; $i++) { $int32strs_inv[$i] = sprintf("%08x",hex($int32strs_mask[$i])^0xffffffff); if($i == 0) { $len = int((($ENV{'ncpus'}%32)+3)/4); $int32strs_inv[$i] = substr($int32strs_inv[$i], -$len, $len); } } $inv = join(",", @int32strs_inv); $nint32s = int(($ENV{'ncpus'}+31)/32); for($j = $nint32s - 1; $j >= 0; $j--) { if(hex($int32strs[$nint32s - 1 - $j]) & hex($int32strs_mask[$nint32s - 1 - $j])) { $hit = 1; }} if($hit == 1) { $cmd = "echo $inv > $dir/smp_affinity 2>/dev/null"; system $cmd;}}'; then echo "error: modifying /proc/irq/*/smp_affinity" >&2; exit 1; fi;
if ! ncpus=$ncpus smp_affinity_mask=$smp_affinity_mask perl -e '@dirs = grep { -d } glob "/proc/irq/*"; foreach $dir (@dirs) { $hit = 0; $affinity_str = `cat $dir/smp_affinity`; chomp $affinity_str; @int32strs = split /,/, $affinity_str; @int32strs_mask=split /,/, $ENV{'smp_affinity_mask'}; for($i=0;$i <= $#int32strs_mask; $i++) { $int32strs_inv[$i] = sprintf("%08x",hex($int32strs_mask[$i])^0xffffffff); if($i == 0) { $len = int((($ENV{'ncpus'}%32)+3)/4); $int32strs_inv[$i] = substr($int32strs_inv[$i], -$len, $len); } } $inv = join(",", @int32strs_inv); $nint32s = int(($ENV{'ncpus'}+31)/32); for($j = $nint32s - 1; $j >= 0; $j--) { if(hex($int32strs[$nint32s - 1 - $j]) & hex($int32strs_mask[$nint32s - 1 - $j])) { $hit = 1; }} if($hit == 1) { $cmd = "echo $inv > $dir/smp_affinity 2>/dev/null"; system $cmd;}}'; then
echo "error: modifying /proc/irq/*/smp_affinity" >&2
error_exit "mcos_sys_mounted"
fi
banirq=`cat /proc/interrupts| perl -e 'while(<>) { if(/^\s*(\d+).*IHK\-SMP\s*$/) {print $1;}}'`
sed "s/%mask%/$smp_affinity_mask/g" $ETCDIR/irqbalance_mck.in | sed "s/%banirq%/$banirq/g" > $ETCDIR/irqbalance_mck
if ! systemctl link $ETCDIR/irqbalance_mck.service >/dev/null 2>/dev/null; then echo "error: linking irqbalance_mck" >&2; exit 1; fi;
if ! systemctl start irqbalance_mck.service 2>/dev/null ; then echo "error: starting irqbalance_mck" >&2; exit 1; fi;
if ! systemctl link $ETCDIR/irqbalance_mck.service >/dev/null 2>/dev/null; then
echo "error: linking irqbalance_mck" >&2
error_exit "mcos_sys_mounted"
fi
if ! systemctl start irqbalance_mck.service 2>/dev/null ; then
echo "error: starting irqbalance_mck" >&2
error_exit "mcos_sys_mounted"
fi
# echo cpus=$cpus mask=$smp_affinity_mask banirq=$banirq
fi
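A worked example of the mask plumbing above, assuming 8 CPUs with cpus=4-7 reserved for McKernel: the first perl one-liner produces smp_affinity_mask=f0; the second then rewrites each /proc/irq/*/smp_affinity whose current value overlaps that mask to the inverted value 0f, steering those IRQs onto the Linux-owned CPUs. banirq picks up the IHK-SMP interrupt number from /proc/interrupts, and both values are substituted into irqbalance_mck.in so the relaunched irqbalance keeps McKernel's CPUs and IRQ banned.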

View File

@@ -22,34 +22,76 @@ if [ "`lsmod | grep ihk_smp_x86`" == "" ]; then exit 0; fi
# Destroy all LWK instances
if ls /dev/mcos* 1>/dev/null 2>&1; then
for i in /dev/mcos*; do
ind=`echo $i|cut -c10-`;
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then echo "error: destroying LWK instance $ind failed" >&2; exit 1; fi
done
for i in /dev/mcos*; do
ind=`echo $i|cut -c10-`;
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then
echo "error: destroying LWK instance $ind failed" >&2
exit 1
fi
done
fi
# Query IHK-SMP resources and release them
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus" >&2; exit 1; fi
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if ! ${SBINDIR}/ihkconfig 0 release cpu $cpus > /dev/null; then echo "error: releasing CPUs" >&2; exit 1; fi
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then
echo "error: querying cpus" >&2
exit 1
fi
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then echo "error: querying memory" >&2; exit 1; fi
mem=`${SBINDIR}/ihkconfig 0 query mem`
if ! ${SBINDIR}/ihkconfig 0 release mem $mem > /dev/null; then echo "error: releasing memory" >&2; exit 1; fi
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if [ "${cpus}" != "" ]; then
if ! ${SBINDIR}/ihkconfig 0 release cpu $cpus > /dev/null; then
echo "error: releasing CPUs" >&2
exit 1
fi
fi
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then
echo "error: querying memory" >&2
exit 1
fi
mem=`${SBINDIR}/ihkconfig 0 query mem`
if [ "${mem}" != "" ]; then
if ! ${SBINDIR}/ihkconfig 0 release mem $mem > /dev/null; then
echo "error: releasing memory" >&2
exit 1
fi
fi
# Remove delegator if loaded
if [ "`lsmod | grep mcctrl`" != "" ]; then
if ! rmmod mcctrl; then echo "error: removing mcctrl" >&2; exit 1; fi
if ! rmmod mcctrl; then
echo "error: removing mcctrl" >&2
exit 1
fi
fi
# Remove mcoverlay if loaded
if [ "`lsmod | grep mcoverlay`" != "" ]; then
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_sys`" != "" ]; then umount -l /tmp/mcos/mcos0_sys; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_proc`" != "" ]; then umount -l /tmp/mcos/mcos0_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/linux_proc`" != "" ]; then umount -l /tmp/mcos/linux_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos`" != "" ]; then umount -l /tmp/mcos; fi
if [ -e /tmp/mcos ]; then rm -rf /tmp/mcos; fi
if ! rmmod mcoverlay; then
echo "warning: failed to remove mcoverlay" >&2
fi
fi
# Remove SMP module
if [ "`lsmod | grep ihk_smp_x86`" != "" ]; then
if ! rmmod ihk_smp_x86; then echo "error: removing ihk_smp_x86" >&2; exit 1; fi
if ! rmmod ihk_smp_x86; then
echo "error: removing ihk_smp_x86" >&2
exit 1
fi
fi
# Remove core module
if [ "`lsmod | grep -E 'ihk\s' | awk '{print $1}'`" != "" ]; then
if ! rmmod ihk; then echo "error: removing ihk" >&2; exit 1; fi
if ! rmmod ihk; then
echo "error: removing ihk" >&2
exit 1
fi
fi
# Stop mcklogd
@@ -57,8 +99,17 @@ pkill mcklogd
# Start irqbalance with the original settings
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
if ! systemctl stop irqbalance_mck.service 2>/dev/null ; then echo "error: stopping irqbalance_mck" >&2; exit 1; fi;
if ! systemctl disable irqbalance_mck.service >/dev/null 2>/dev/null; then echo "error: disabling irqbalance_mck" >&2; exit 1; fi;
if ! etcdir=@ETCDIR@ perl -e '$etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "$etcdir/proc/irq/*/smp_affinity"; foreach $file (@files) { $dest = substr($file, length($etcdir)); if(0) {print "cp $file $dest\n";} system("cp $file $dest 2>/dev/null"); }' ; then echo "error: restoring /proc/irq/*/smp_affinity" >&2; exit 1; fi;
if ! systemctl start irqbalance.service; then echo "error: starting irqbalance" >&2; exit 1; fi;
if ! systemctl stop irqbalance_mck.service 2>/dev/null; then
echo "warning: failed to stop irqbalance_mck" >&2
fi
if ! systemctl disable irqbalance_mck.service >/dev/null 2>/dev/null; then
echo "warning: failed to disable irqbalance_mck" >&2
fi
if ! etcdir=@ETCDIR@ perl -e '$etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "$etcdir/proc/irq/*/smp_affinity"; foreach $file (@files) { $dest = substr($file, length($etcdir)); if(0) {print "cp $file $dest\n";} system("cp $file $dest 2>/dev/null"); }'; then
echo "warning: failed to restore /proc/irq/*/smp_affinity" >&2
fi
if ! systemctl start irqbalance.service; then
echo "warning: failed to start irqbalance" >&2;
fi
fi

configure (vendored, 25 lines changed)
View File

@@ -3117,6 +3117,31 @@ _ACEOF
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking System.map for symbol sys_umount" >&5
$as_echo_n "checking System.map for symbol sys_umount... " >&6; }
mcctrl_addr=`eval $MCCTRL_LINUX_SYMTAB_CMD | grep " sys_umount\$" | cut -d\ -f1`
if test -z $mcctrl_addr; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: not found" >&5
$as_echo "not found" >&6; }
else
mcctrl_result=$mcctrl_addr
mcctrl_addr="0x$mcctrl_addr"
if `eval $MCCTRL_LINUX_SYMTAB_CMD | grep " __ksymtab_sys_umount\$" >/dev/null`; then
mcctrl_result="exported"
mcctrl_addr="0"
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $mcctrl_result" >&5
$as_echo "$mcctrl_result" >&6; }
cat >>confdefs.h <<_ACEOF
#define MCCTRL_KSYM_sys_umount $mcctrl_addr
_ACEOF
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking System.map for symbol sys_unshare" >&5
$as_echo_n "checking System.map for symbol sys_unshare... " >&6; }
mcctrl_addr=`eval $MCCTRL_LINUX_SYMTAB_CMD | grep " sys_unshare\$" | cut -d\ -f1`
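The generated check follows the existing MCCTRL_FIND_KSYM pattern: if sys_umount appears in System.map but is not exported (no __ksymtab_sys_umount entry), its raw address is recorded as MCCTRL_KSYM_sys_umount so mcctrl can invoke it through a cast function pointer; if it is exported, the macro is defined to 0 and the symbol is called directly (see the mcctrl_sys_umount definition further down).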

View File

@@ -221,6 +221,7 @@ AC_DEFUN([MCCTRL_FIND_KSYM],[
])
MCCTRL_FIND_KSYM([sys_mount])
MCCTRL_FIND_KSYM([sys_umount])
MCCTRL_FIND_KSYM([sys_unshare])
MCCTRL_FIND_KSYM([zap_page_range])
MCCTRL_FIND_KSYM([vdso_image_64])

View File

@@ -51,6 +51,9 @@
/* Define to address of kernel symbol sys_readlink, or 0 if exported */
#undef MCCTRL_KSYM_sys_readlink
/* Define to address of kernel symbol sys_umount, or 0 if exported */
#undef MCCTRL_KSYM_sys_umount
/* Define to address of kernel symbol sys_unshare, or 0 if exported */
#undef MCCTRL_KSYM_sys_unshare

View File

@@ -41,6 +41,7 @@
#define MCEXEC_UP_NEW_PROCESS 0x30a02909
#define MCEXEC_UP_GET_CRED 0x30a0290a
#define MCEXEC_UP_GET_CREDV 0x30a0290b
#define MCEXEC_UP_GET_NODES 0x30a0290c
#define MCEXEC_UP_PREPARE_DMA 0x30a02910
#define MCEXEC_UP_FREE_DMA 0x30a02911
@@ -49,7 +50,8 @@
#define MCEXEC_UP_CLOSE_EXEC 0x30a02913
#define MCEXEC_UP_SYS_MOUNT 0x30a02914
#define MCEXEC_UP_SYS_UNSHARE 0x30a02915
#define MCEXEC_UP_SYS_UMOUNT 0x30a02915
#define MCEXEC_UP_SYS_UNSHARE 0x30a02916
#define MCEXEC_UP_DEBUG_LOG 0x40000000
@@ -196,6 +198,10 @@ struct sys_mount_desc {
void *data;
};
struct sys_umount_desc {
char *dir_name;
};
struct sys_unshare_desc {
unsigned long unshare_flags;
};

View File

@@ -75,7 +75,7 @@ static int load_elf(struct linux_binprm *bprm
char buf[32];
int l;
int pass;
char pbuf[1024];
char *pbuf;
const char *path;
if(bprm->envc == 0)
@@ -88,6 +88,11 @@ static int load_elf(struct linux_binprm *bprm
if(elf_ex->e_ident[EI_CLASS] != ELFCLASS64)
return -ENOEXEC;
pbuf = kmalloc(1024, GFP_ATOMIC);
if (!pbuf) {
printk("%s: error: allocating pbuf\n", __FUNCTION__);
return -ENOMEM;
}
path = d_path(&bprm->file->f_path, pbuf, 1024);
if(!path || IS_ERR(path))
path = bprm->interp;
@@ -96,8 +101,10 @@ static int load_elf(struct linux_binprm *bprm
if(!cp ||
!strcmp(cp, "/mcexec") ||
!strcmp(cp, "/ihkosctl") ||
!strcmp(cp, "/ihkconfig"))
!strcmp(cp, "/ihkconfig")) {
kfree(pbuf);
return -ENOEXEC;
}
cnt[0] = bprm->argc;
cnt[1] = bprm->envc;
@@ -124,8 +131,10 @@ static int load_elf(struct linux_binprm *bprm
bprm->p, 1, 0, 1,
&page, NULL);
#endif
if(rc <= 0)
if(rc <= 0) {
kfree(pbuf);
return -EFAULT;
}
addr = kmap_atomic(page
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0)
, KM_USER0
@@ -199,21 +208,27 @@ static int load_elf(struct linux_binprm *bprm
for(ep = env; ep->name; ep++)
if(ep->val)
kfree(ep->val);
if(rc)
if(rc) {
kfree(pbuf);
return -ENOEXEC;
}
file = open_exec(MCEXEC_PATH);
if (IS_ERR(file))
if (IS_ERR(file)) {
kfree(pbuf);
return -ENOEXEC;
}
rc = remove_arg_zero(bprm);
if (rc){
fput(file);
kfree(pbuf);
return rc;
}
rc = copy_strings_kernel(1, &bprm->interp, bprm);
if (rc < 0){
fput(file);
kfree(pbuf);
return rc;
}
bprm->argc++;
@@ -221,12 +236,14 @@ static int load_elf(struct linux_binprm *bprm
rc = copy_strings_kernel(1, &wp, bprm);
if (rc){
fput(file);
kfree(pbuf);
return rc;
}
bprm->argc++;
rc = bprm_change_interp(MCEXEC_PATH, bprm);
if (rc < 0){
fput(file);
kfree(pbuf);
return rc;
}
@@ -236,8 +253,12 @@ static int load_elf(struct linux_binprm *bprm
rc = prepare_binprm(bprm);
if (rc < 0){
kfree(pbuf);
return rc;
}
kfree(pbuf);
return search_binary_handler(bprm
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0)
, regs

View File

@@ -66,7 +66,18 @@ int (*mcctrl_sys_mount)(char *dev_name,char *dir_name, char *type, unsigned long
(int_star_fn_char_char_char_ulong_void_t)
MCCTRL_KSYM_sys_mount;
#else // exported
int (*mcctrl_sys_mount)(char *dev_name,char *dir_name, char *type, unsigned long flags, void *data) = NULL;
int (*mcctrl_sys_mount)(char *dev_name,char *dir_name, char *type, unsigned long flags, void *data) = sys_mount;
#endif
#endif
#ifdef MCCTRL_KSYM_sys_umount
#if MCCTRL_KSYM_sys_umount
typedef int (*int_fn_char_star_int_t)(char *, int);
int (*mcctrl_sys_umount)(char *dir_name, int flags) =
(int_fn_char_star_int_t)
MCCTRL_KSYM_sys_umount;
#else // exported
int (*mcctrl_sys_umount)(char *dir_name, int flags) = sys_umount;
#endif
#endif
@@ -77,32 +88,49 @@ int mcctrl_ikc_set_recv_cpu(ihk_os_t os, int cpu);
static long mcexec_prepare_image(ihk_os_t os,
struct program_load_desc * __user udesc)
{
struct program_load_desc desc, *pdesc;
struct program_load_desc *desc, *pdesc;
struct ikc_scd_packet isp;
void *args, *envs;
long ret = 0;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct mcctrl_per_proc_data *ppd = NULL;
int num_sections;
if (copy_from_user(&desc, udesc,
desc = kmalloc(sizeof(*desc), GFP_KERNEL);
if (!desc) {
printk("%s: error: allocating program_load_desc\n",
__FUNCTION__);
return -ENOMEM;
}
if (copy_from_user(desc, udesc,
sizeof(struct program_load_desc))) {
printk("%s: error: copying program_load_desc\n",
__FUNCTION__);
kfree(desc);
return -EFAULT;
}
if (desc.num_sections <= 0 || desc.num_sections > 16) {
printk("# of sections: %d\n", desc.num_sections);
num_sections = desc->num_sections;
if (num_sections <= 0 || num_sections > 16) {
printk("# of sections: %d\n", num_sections);
return -EINVAL;
}
pdesc = kmalloc(sizeof(struct program_load_desc) +
sizeof(struct program_image_section)
* desc.num_sections, GFP_KERNEL);
memcpy(pdesc, &desc, sizeof(struct program_load_desc));
* num_sections, GFP_KERNEL);
memcpy(pdesc, desc, sizeof(struct program_load_desc));
if (copy_from_user(pdesc->sections, udesc->sections,
sizeof(struct program_image_section)
* desc.num_sections)) {
* num_sections)) {
kfree(desc);
kfree(pdesc);
return -EFAULT;
}
kfree(desc);
pdesc->pid = task_tgid_vnr(current);
if (reserve_user_space(usrdata, &pdesc->user_start, &pdesc->user_end)) {
@@ -158,7 +186,7 @@ static long mcexec_prepare_image(ihk_os_t os,
ppd->rpgtable = pdesc->rpgtable;
if (copy_to_user(udesc, pdesc, sizeof(struct program_load_desc) +
sizeof(struct program_image_section) * desc.num_sections)) {
sizeof(struct program_image_section) * num_sections)) {
ret = -EFAULT;
goto free_out;
}
@@ -315,33 +343,42 @@ static long mcexec_start_image(ihk_os_t os,
struct program_load_desc * __user udesc,
struct file *file)
{
struct program_load_desc desc;
struct program_load_desc *desc;
struct ikc_scd_packet isp;
struct mcctrl_channel *c;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct handlerinfo *info;
if (copy_from_user(&desc, udesc,
desc = kmalloc(sizeof(*desc), GFP_KERNEL);
if (!desc) {
printk("%s: error: allocating program_load_desc\n",
__FUNCTION__);
return -ENOMEM;
}
if (copy_from_user(desc, udesc,
sizeof(struct program_load_desc))) {
kfree(desc);
return -EFAULT;
}
info = kmalloc(sizeof(struct handlerinfo), GFP_KERNEL);
info->pid = desc.pid;
info->pid = desc->pid;
ihk_os_register_release_handler(file, release_handler, info);
c = usrdata->channels + desc.cpu;
c = usrdata->channels + desc->cpu;
mcctrl_ikc_set_recv_cpu(os, desc.cpu);
mcctrl_ikc_set_recv_cpu(os, desc->cpu);
usrdata->last_thread_exec = desc.cpu;
usrdata->last_thread_exec = desc->cpu;
isp.msg = SCD_MSG_SCHEDULE_PROCESS;
isp.ref = desc.cpu;
isp.arg = desc.rprocess;
isp.ref = desc->cpu;
isp.arg = desc->rprocess;
mcctrl_ikc_send(os, desc.cpu, &isp);
mcctrl_ikc_send(os, desc->cpu, &isp);
kfree(desc);
return 0;
}
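In both mcexec_prepare_image() and mcexec_start_image() the struct program_load_desc local previously lived on the kernel stack; moving it to kmalloc()/kfree() is presumably part of the -Wframe-larger-than cleanup from commit ebcf9a0d6d, at the cost of one extra allocation per exec.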
@@ -413,6 +450,16 @@ static long mcexec_get_cpu(ihk_os_t os)
return info->n_cpus;
}
static long mcexec_get_nodes(ihk_os_t os)
{
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
if (!usrdata || !usrdata->mem_info)
return -EINVAL;
return usrdata->mem_info->n_numa_nodes;
}
int mcctrl_add_per_proc_data(struct mcctrl_usrdata *ud, int pid,
struct mcctrl_per_proc_data *ppd)
{
@@ -501,9 +548,10 @@ int mcexec_syscall(struct mcctrl_usrdata *ud, struct ikc_scd_packet *packet)
ppd = mcctrl_get_per_proc_data(ud, pid);
if (unlikely(!ppd)) {
kprintf("%s: ERROR: no per-process structure for PID %d??\n",
__FUNCTION__, task_tgid_vnr(current));
return 0;
kprintf("%s: ERROR: no per-process structure for PID %d, "
"syscall nr: %lu\n",
__FUNCTION__, pid, packet->req.number);
return -1;
}
dprintk("%s: (packet_handler) rtid: %d, ttid: %d, sys nr: %d\n",
@@ -1127,7 +1175,7 @@ long mcexec_sys_mount(struct sys_mount_desc *__user arg)
cap_raise(promoted->cap_effective, CAP_SYS_ADMIN);
original = override_creds(promoted);
#if MCCTRL_KSYM_sys_mount
#ifdef MCCTRL_KSYM_sys_mount
ret = mcctrl_sys_mount(desc.dev_name, desc.dir_name, desc.type,
desc.flags, desc.data);
#else
@@ -1140,6 +1188,36 @@ long mcexec_sys_mount(struct sys_mount_desc *__user arg)
return ret;
}
long mcexec_sys_umount(struct sys_mount_desc *__user arg)
{
struct sys_umount_desc desc;
struct cred *promoted;
const struct cred *original;
int ret;
if (copy_from_user(&desc, arg, sizeof(desc))) {
return -EFAULT;
}
promoted = prepare_creds();
if (!promoted) {
return -ENOMEM;
}
cap_raise(promoted->cap_effective, CAP_SYS_ADMIN);
original = override_creds(promoted);
#ifdef MCCTRL_KSYM_sys_umount
ret = mcctrl_sys_umount(desc.dir_name, MNT_FORCE);
#else
ret = -EFAULT;
#endif
revert_creds(original);
put_cred(promoted);
return ret;
}
long mcexec_sys_unshare(struct sys_unshare_desc *__user arg)
{
struct sys_unshare_desc desc;
@@ -1198,6 +1276,9 @@ long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
case MCEXEC_UP_GET_CPU:
return mcexec_get_cpu(os);
case MCEXEC_UP_GET_NODES:
return mcexec_get_nodes(os);
case MCEXEC_UP_STRNCPY_FROM_USER:
return mcexec_strncpy_from_user(os,
(struct strncpy_from_user_desc *)arg);
@@ -1227,6 +1308,9 @@ long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
case MCEXEC_UP_SYS_MOUNT:
return mcexec_sys_mount((struct sys_mount_desc *)arg);
case MCEXEC_UP_SYS_UMOUNT:
return mcexec_sys_umount((struct sys_mount_desc *)arg);
case MCEXEC_UP_SYS_UNSHARE:
return mcexec_sys_unshare((struct sys_unshare_desc *)arg);
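For reference, these MCEXEC_UP_* requests are plain ioctls on the LWK OS device file. A hypothetical user-space sketch of the new MCEXEC_UP_GET_NODES query (the constant comes from the protocol header hunk above; mcexec itself is the real caller):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>

    #define MCEXEC_UP_GET_NODES 0x30a0290c   /* from the header diff */

    int main(void)
    {
        int fd = open("/dev/mcos0", O_RDWR);   /* LWK instance 0 */
        long nodes;

        if (fd < 0) {
            perror("open /dev/mcos0");
            return 1;
        }
        nodes = ioctl(fd, MCEXEC_UP_GET_NODES, 0);
        if (nodes < 0)
            perror("ioctl");
        else
            printf("McKernel NUMA nodes: %ld\n", nodes);
        return 0;
    }

The handler simply returns usrdata->mem_info->n_numa_nodes, so the node count arrives as the ioctl return value.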

View File

@@ -60,6 +60,7 @@ static struct ihk_os_user_call_handler mcctrl_uchs[] = {
{ .request = MCEXEC_UP_LOAD_SYSCALL, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SEND_SIGNAL, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_CPU, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_NODES, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_STRNCPY_FROM_USER, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_NEW_PROCESS, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_PREPARE_DMA, .func = mcctrl_ioctl },
@@ -69,6 +70,7 @@ static struct ihk_os_user_call_handler mcctrl_uchs[] = {
{ .request = MCEXEC_UP_GET_CRED, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_CREDV, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYS_MOUNT, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYS_UMOUNT, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYS_UNSHARE, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_DEBUG_LOG, .func = mcctrl_ioctl },
};
@@ -129,11 +131,15 @@ error_cleanup_channels:
int mcctrl_os_shutdown_notifier(int os_index)
{
sysfsm_cleanup(os[os_index]);
free_topology_info(os[os_index]);
ihk_os_unregister_user_call_handlers(os[os_index], mcctrl_uc + os_index);
destroy_ikc_channels(os[os_index]);
procfs_exit(os_index);
if (os[os_index]) {
sysfsm_cleanup(os[os_index]);
free_topology_info(os[os_index]);
ihk_os_unregister_user_call_handlers(os[os_index], mcctrl_uc + os_index);
destroy_ikc_channels(os[os_index]);
procfs_exit(os_index);
}
os[os_index] = NULL;
printk("mcctrl: OS ID %d shutdown event handled\n", os_index);
return 0;
@@ -151,11 +157,16 @@ static struct ihk_os_notifier mcctrl_os_notifier = {
static int __init mcctrl_init(void)
{
int ret = 0;
int i;
#ifndef DO_USER_MODE
mcctrl_syscall_init();
#endif
for (i = 0; i < OS_MAX_MINOR; ++i) {
os[i] = NULL;
}
rus_page_hash_init();
binfmt_mcexec_init();

View File

@@ -35,6 +35,16 @@
#define REQUEST_SHIFT 16
//#define DEBUG_IKC
#ifdef DEBUG_IKC
#define dkprintf(...) kprintf(__VA_ARGS__)
#define ekprintf(...) kprintf(__VA_ARGS__)
#else
#define dkprintf(...) do { if (0) printk(__VA_ARGS__); } while (0)
#define ekprintf(...) printk(__VA_ARGS__)
#endif
//int num_channels;
//struct mcctrl_channel *channels;
@@ -99,10 +109,6 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
get_vdso_info(__os, pisp->arg);
break;
case SCD_MSG_REPLY_GET_CPU_MAPPING:
reply_get_cpu_mapping(pisp->arg);
break;
default:
printk(KERN_ERR "mcctrl:syscall_packet_handler:"
"unknown message (%d.%d.%d.%d.%d.%#lx)\n",
@@ -168,93 +174,26 @@ int mcctrl_ikc_is_valid_thread(ihk_os_t os, int cpu)
}
}
//unsigned long *mcctrl_doorbell_va;
//unsigned long mcctrl_doorbell_pa;
static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ihk_ikc_channel_desc *c)
{
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct ikc_scd_packet packet;
struct mcctrl_channel *pmc = usrdata->channels + cpu;
unsigned long phys;
struct ikc_scd_init_param *rpm;
if(c->port == 502)
if (c->port == 502) {
pmc = usrdata->channels + usrdata->num_channels - 1;
if (!pmc) {
return;
}
printk("IKC init: cpu=%d port=%d\n", cpu, c->port);
phys = ihk_device_map_memory(ihk_os_to_dev(os), rphys,
sizeof(struct ikc_scd_init_param));
#ifdef CONFIG_MIC
rpm = ioremap_wc(phys, sizeof(struct ikc_scd_init_param));
#else
rpm = ihk_device_map_virtual(ihk_os_to_dev(os), phys,
sizeof(struct ikc_scd_init_param),
NULL, 0);
#endif
pmc->param.request_va =
(void *)__get_free_pages(in_interrupt() ? GFP_ATOMIC : GFP_KERNEL,
REQUEST_SHIFT - PAGE_SHIFT);
pmc->param.request_pa = virt_to_phys(pmc->param.request_va);
pmc->param.doorbell_va = usrdata->mcctrl_doorbell_va;
pmc->param.doorbell_pa = usrdata->mcctrl_doorbell_pa;
pmc->param.post_va = (void *)__get_free_page(in_interrupt() ?
GFP_ATOMIC : GFP_KERNEL);
pmc->param.post_pa = virt_to_phys(pmc->param.post_va);
memset(pmc->param.doorbell_va, 0, PAGE_SIZE);
memset(pmc->param.request_va, 0, PAGE_SIZE);
memset(pmc->param.post_va, 0, PAGE_SIZE);
pmc->param.response_rpa = rpm->response_page;
pmc->param.response_pa
= ihk_device_map_memory(ihk_os_to_dev(os),
pmc->param.response_rpa,
PAGE_SIZE);
#ifdef CONFIG_MIC
pmc->param.response_va = ioremap_cache(pmc->param.response_pa,
PAGE_SIZE);
#else
pmc->param.response_va = ihk_device_map_virtual(ihk_os_to_dev(os),
pmc->param.response_pa,
PAGE_SIZE, NULL, 0);
#endif
pmc->dma_buf = (void *)__get_free_pages(in_interrupt() ?
GFP_ATOMIC : GFP_KERNEL,
DMA_PIN_SHIFT - PAGE_SHIFT);
rpm->request_page = pmc->param.request_pa;
rpm->doorbell_page = pmc->param.doorbell_pa;
rpm->post_page = pmc->param.post_pa;
if (!pmc) {
kprintf("%s: error: no channel found?\n", __FUNCTION__);
return;
}
packet.msg = SCD_MSG_INIT_CHANNEL_ACKED;
packet.ref = cpu;
packet.arg = rphys;
printk("Request: %lx, Response: %lx, Doorbell: %lx\n",
pmc->param.request_pa, pmc->param.response_rpa,
pmc->param.doorbell_pa);
printk("Request: %p, Response: %p, Doorbell: %p\n",
pmc->param.request_va, pmc->param.response_va,
pmc->param.doorbell_va);
ihk_ikc_send(pmc->c, &packet, 0);
#ifdef CONFIG_MIC
iounmap(rpm);
#else
ihk_device_unmap_virtual(ihk_os_to_dev(os), rpm,
sizeof(struct ikc_scd_init_param));
#endif
ihk_device_unmap_memory(ihk_os_to_dev(os), phys,
sizeof(struct ikc_scd_init_param));
}
static int connect_handler(struct ihk_ikc_channel_info *param)
@@ -274,7 +213,7 @@ static int connect_handler(struct ihk_ikc_channel_info *param)
param->packet_handler = syscall_packet_handler;
usrdata->channels[cpu].c = c;
kprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
dkprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
return 0;
}
@@ -292,7 +231,7 @@ static int connect_handler2(struct ihk_ikc_channel_info *param)
param->packet_handler = syscall_packet_handler;
usrdata->channels[cpu].c = c;
kprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
dkprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
return 0;
}
@@ -315,27 +254,29 @@ static struct ihk_ikc_listen_param listen_param2 = {
int prepare_ikc_channels(ihk_os_t os)
{
struct ihk_cpu_info *info;
struct mcctrl_usrdata *usrdata;
struct mcctrl_usrdata *usrdata;
int i;
usrdata = kzalloc(sizeof(struct mcctrl_usrdata), GFP_KERNEL);
usrdata->mcctrl_doorbell_va = (void *)__get_free_page(GFP_KERNEL);
usrdata->mcctrl_doorbell_pa = virt_to_phys(usrdata->mcctrl_doorbell_va);
info = ihk_os_get_cpu_info(os);
if (!info) {
printk("Error: cannot retrieve CPU info.\n");
usrdata->cpu_info = ihk_os_get_cpu_info(os);
usrdata->mem_info = ihk_os_get_memory_info(os);
if (!usrdata->cpu_info || !usrdata->mem_info) {
printk("Error: cannot obtain OS CPU and memory information.\n");
return -EINVAL;
}
if (info->n_cpus < 1) {
if (usrdata->cpu_info->n_cpus < 1) {
printk("Error: # of cpu is invalid.\n");
return -EINVAL;
}
usrdata->num_channels = info->n_cpus + 1;
usrdata->channels = kzalloc(sizeof(struct mcctrl_channel) * usrdata->num_channels,
GFP_KERNEL);
usrdata->num_channels = usrdata->cpu_info->n_cpus + 1;
usrdata->channels = kzalloc(sizeof(struct mcctrl_channel) *
usrdata->num_channels,
GFP_KERNEL);
if (!usrdata->channels) {
printk("Error: cannot allocate channels.\n");
return -ENOMEM;
@@ -362,20 +303,7 @@ int prepare_ikc_channels(ihk_os_t os)
void __destroy_ikc_channel(ihk_os_t os, struct mcctrl_channel *pmc)
{
free_pages((unsigned long)pmc->param.request_va,
REQUEST_SHIFT - PAGE_SHIFT);
free_page((unsigned long)pmc->param.post_va);
#ifdef CONFIG_MIC
iounmap(pmc->param.response_va);
#else
ihk_device_unmap_virtual(ihk_os_to_dev(os), pmc->param.response_va,
PAGE_SIZE);
#endif
ihk_device_unmap_memory(ihk_os_to_dev(os),
pmc->param.response_pa, PAGE_SIZE);
free_pages((unsigned long)pmc->dma_buf,
DMA_PIN_SHIFT - PAGE_SHIFT);
return;
}
void destroy_ikc_channels(ihk_os_t os)
@@ -383,6 +311,11 @@ void destroy_ikc_channels(ihk_os_t os)
int i;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
if (!usrdata) {
printk("%s: WARNING: no mcctrl_usrdata found\n", __FUNCTION__);
return;
}
ihk_host_os_set_usrdata(os, NULL);
for (i = 0; i < usrdata->num_channels; i++) {
@@ -393,7 +326,6 @@ void destroy_ikc_channels(ihk_os_t os)
printk("Channel #%d freed.\n", i);
}
}
free_page((unsigned long)usrdata->mcctrl_doorbell_va);
kfree(usrdata->channels);
kfree(usrdata);

View File

@@ -59,8 +59,8 @@
#define SCD_MSG_CLEANUP_PROCESS 0x9
#define SCD_MSG_GET_VDSO_INFO 0xa
#define SCD_MSG_GET_CPU_MAPPING 0xc
#define SCD_MSG_REPLY_GET_CPU_MAPPING 0xd
//#define SCD_MSG_GET_CPU_MAPPING 0xc
//#define SCD_MSG_REPLY_GET_CPU_MAPPING 0xd
#define SCD_MSG_PROCFS_CREATE 0x10
#define SCD_MSG_PROCFS_DELETE 0x11
@@ -172,7 +172,6 @@ struct wait_queue_head_list_node {
struct mcctrl_channel {
struct ihk_ikc_channel_desc *c;
struct syscall_params param;
struct ikc_scd_init_param init;
void *dma_buf;
};
@@ -226,11 +225,6 @@ static inline int sysfs_inited(struct sysfsm_data *sdp)
return !!(sdp->sysfs_buf);
} /* sysfs_inited() */
struct cpu_mapping {
int cpu_number;
int hw_id;
};
struct cache_topology {
struct ihk_cache_topology *saved;
cpumask_t shared_cpu_map;
@@ -239,8 +233,9 @@ struct cache_topology {
};
struct cpu_topology {
struct cpu_mapping *cpu_mapping;
//struct mcctrl_usrdata *udp;
struct ihk_cpu_topology *saved;
int mckernel_cpu_id;
cpumask_t core_siblings;
cpumask_t thread_siblings;
@@ -248,8 +243,12 @@ struct cpu_topology {
struct list_head cache_list;
};
#define NODE_DISTANCE_S_SIZE 1024
struct node_topology {
struct ihk_node_topology *saved;
int mckernel_numa_id;
char mckernel_numa_distance_s[NODE_DISTANCE_S_SIZE];
cpumask_t cpumap;
struct list_head chain;
@@ -266,9 +265,7 @@ struct mcctrl_usrdata {
struct ihk_ikc_listen_param listen_param2;
ihk_os_t os;
int num_channels;
struct mcctrl_channel *channels;
unsigned long *mcctrl_doorbell_va;
unsigned long mcctrl_doorbell_pa;
struct mcctrl_channel *channels;
int remaining_job;
int base_cpu;
int job_pos;
@@ -282,10 +279,9 @@ struct mcctrl_usrdata {
void **keys;
struct sysfsm_data sysfsm_data;
unsigned long cpu_online[CPU_LONGS];
int cpu_mapping_elems;
int padding;
struct cpu_mapping *cpu_mapping;
long cpu_mapping_pa;
struct ihk_cpu_info *cpu_info;
struct ihk_mem_info *mem_info;
nodemask_t numa_online;
struct list_head cpu_topology_list;
struct list_head node_topology_list;
};
@@ -322,7 +318,7 @@ inline struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(
void __return_syscall(ihk_os_t os, struct ikc_scd_packet *packet,
long ret, int stid);
#define PROCFS_NAME_MAX 1000
#define PROCFS_NAME_MAX 768
struct procfs_read {
unsigned long pbuf; /* physical address of the host buffer (request) */

View File

@@ -481,8 +481,9 @@ procfs_exit(int osnum)
down(&procfs_file_list_lock);
e = find_base_entry(osnum);
if(e)
if (e) {
delete_procfs_entries(e);
}
up(&procfs_file_list_lock);
}

View File

@@ -18,7 +18,7 @@
#include "mcctrl.h"
#include "sysfs_msg.h"
#define dprintk(...) do { if (0) printk(KERN_DEBUG __VA_ARGS__); } while (0)
#define dprintk(...) do { if (0) printk(__VA_ARGS__); } while (0)
#define wprintk(...) do { if (1) printk(KERN_WARNING __VA_ARGS__); } while (0)
#define eprintk(...) do { if (1) printk(KERN_ERR __VA_ARGS__); } while (0)
@@ -1232,9 +1232,16 @@ sysfsm_cleanup(ihk_os_t os)
int error;
ihk_device_t dev = ihk_os_to_dev(os);
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
struct sysfsm_data *sdp = &udp->sysfsm_data;
struct sysfsm_data *sdp;
struct sysfsm_node *np;
if (!udp) {
printk("%s: WARNING: no mcctrl_usrdata found\n", __FUNCTION__);
return;
}
sdp = &udp->sysfsm_data;
dprintk("mcctrl:sysfsm_cleanup(%p)\n", os);
if (sdp->sysfs_buf) {
@@ -2095,9 +2102,16 @@ struct sysfsm_ops snooping_local_ops_s = {
/**** local list ****/
static ssize_t snooping_local_show_pbl(struct sysfsm_ops *ops, void *instance, void *buf, size_t bufsize)
{
size_t ret;
const struct sysfsm_bitmap_param *p = instance;
return bitmap_scnlistprintf(buf, bufsize, p->ptr, p->nbits);
ret = bitmap_scnlistprintf(buf, bufsize, p->ptr, p->nbits);
if (ret < bufsize - 1) {
sprintf(buf + ret, "\n");
return ret + 1;
}
return 0;
} /* snooping_local_show_pbl() */
struct sysfsm_ops snooping_local_ops_pbl = {
@@ -2108,9 +2122,16 @@ struct sysfsm_ops snooping_local_ops_pbl = {
/**** local map ****/
static ssize_t snooping_local_show_pb(struct sysfsm_ops *ops, void *instance, void *buf, size_t bufsize)
{
size_t ret;
const struct sysfsm_bitmap_param *p = instance;
return bitmap_scnprintf(buf, bufsize, p->ptr, p->nbits);
ret = bitmap_scnprintf(buf, bufsize, p->ptr, p->nbits);
if (ret < bufsize - 1) {
sprintf(buf + ret, "\n");
return ret + 1;
}
return 0;
} /* snooping_local_show_pb() */
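/* Note: the trailing newline appended by the two show functions above
 * matches how native /sys bitmap/list files are terminated, which
 * userspace parsers of these files generally expect. */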
struct sysfsm_ops snooping_local_ops_pb = {

View File

@@ -18,7 +18,7 @@
#include "mcctrl.h"
#include "sysfs_msg.h"
#define dprintk(...) do { if (0) printk(KERN_DEBUG __VA_ARGS__); } while (0)
#define dprintk(...) do { if (0) printk(__VA_ARGS__); } while (0)
#define wprintk(...) do { if (1) printk(KERN_WARNING __VA_ARGS__); } while (0)
#define eprintk(...) do { if (1) printk(KERN_ERR __VA_ARGS__); } while (0)
@@ -92,27 +92,19 @@ void setup_local_snooping_samples(ihk_os_t os)
void setup_local_snooping_files(ihk_os_t os)
{
struct ihk_cpu_info *info;
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
struct sysfsm_bitmap_param param;
static unsigned long cpu_offline = 0x0;
int i;
int error;
info = ihk_os_get_cpu_info(os);
if (!info) {
eprintk("mcctrl:ihk_os_get_cpu_info failed.\n");
return;
}
memset(udp->cpu_online, 0, sizeof(udp->cpu_online));
for (i = 0; i < info->n_cpus; i++) {
udp->cpu_online[i / BITS_PER_LONG] =
udp->cpu_online[i / BITS_PER_LONG] | (1 << (i % BITS_PER_LONG));
for (i = 0; i < udp->cpu_info->n_cpus; i++) {
set_bit(i, udp->cpu_online);
}
param.nbits = CPU_LONGS * BITS_PER_LONG;
param.ptr = udp->cpu_online;
param.ptr = &udp->cpu_online;
dprintk("mcctrl:setup_local_snooping_files: CPU_LONGS=%d, BITS_PER_LONG=%d\n",
CPU_LONGS, BITS_PER_LONG);
@@ -187,141 +179,122 @@ static void free_cpu_topology(struct mcctrl_usrdata *udp)
return;
} /* free_cpu_topology() */
static void free_cpu_mapping(struct mcctrl_usrdata *udp)
{
ihk_device_t dev = ihk_os_to_dev(udp->os);
size_t size;
size = udp->cpu_mapping_elems * sizeof(struct cpu_mapping);
ihk_device_unmap_virtual(dev, udp->cpu_mapping, size);
ihk_device_unmap_memory(dev, udp->cpu_mapping_pa, size);
return;
} /* free_cpu_mapping() */
void free_topology_info(ihk_os_t os)
{
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
if (!udp) {
printk("%s: WARNING: no mcctrl_usrdata found\n", __FUNCTION__);
return;
}
free_node_topology(udp);
free_cpu_topology(udp);
free_cpu_mapping(udp);
return;
} /* free_topology_info() */
void reply_get_cpu_mapping(long req_pa)
/*
* CPU and NUMA node mapping conversion functions.
*/
static int mckernel_cpu_2_linux_cpu(struct mcctrl_usrdata *udp, int cpu_id)
{
struct get_cpu_mapping_req *req = phys_to_virt(req_pa);
return (cpu_id < udp->cpu_info->n_cpus) ?
udp->cpu_info->mapping[cpu_id] : -1;
}
req->busy = 0;
wake_up(&req->wq);
return;
} /* reply_get_cpu_mapping() */
static int get_cpu_mapping(struct mcctrl_usrdata *udp)
static int mckernel_cpu_2_hw_id(struct mcctrl_usrdata *udp, int cpu_id)
{
int error;
ihk_device_t dev = ihk_os_to_dev(udp->os);
struct get_cpu_mapping_req *req = NULL;
struct ikc_scd_packet packet;
size_t size;
return (cpu_id < udp->cpu_info->n_cpus) ?
udp->cpu_info->hw_ids[cpu_id] : -1;
}
dprintk("get_cpu_mapping(%p)\n", udp);
req = kmalloc(sizeof(*req), GFP_KERNEL);
if (!req) {
error = -ENOMEM;
eprintk("mcctrl:get_cpu_mapping:kmalloc failed. %d\n", error);
goto out;
}
req->busy = 1;
req->error = -1;
init_waitqueue_head(&req->wq);
packet.msg = SCD_MSG_GET_CPU_MAPPING;
packet.arg = virt_to_phys(req);
#define GET_CPU_MAPPING_CPU 0
error = mcctrl_ikc_send(udp->os, GET_CPU_MAPPING_CPU, &packet);
if (error) {
eprintk("mcctrl:get_cpu_mapping:"
"mcctrl_ikc_send failed. %d\n", error);
goto out;
}
error = wait_event_interruptible(req->wq, !req->busy);
if (error) {
eprintk("mcctrl:get_cpu_mapping:"
"wait_event_interruptible failed. %d\n", error);
req = NULL; /* XXX */
goto out;
}
if (req->error) {
error = req->error;
eprintk("mcctrl:get_cpu_mapping:"
"SCD_MSG_GET_CPU_MAPPING failed. %d\n", error);
goto out;
}
size = req->buf_elems * sizeof(struct cpu_mapping);
udp->cpu_mapping_elems = req->buf_elems;
udp->cpu_mapping_pa = ihk_device_map_memory(dev, req->buf_rpa, size);
udp->cpu_mapping = ihk_device_map_virtual(
dev, udp->cpu_mapping_pa, size, NULL, 0);
error = 0;
out:
dprintk("get_cpu_mapping(%p): %d\n", udp, error);
kfree(req);
return error;
} /* get_cpu_mapping() */
static int hwid_to_cpu(struct mcctrl_usrdata *udp, int hw_id)
static int linux_cpu_2_mckernel_cpu(struct mcctrl_usrdata *udp, int cpu_id)
{
int i;
for (i = 0; i < udp->cpu_mapping_elems; ++i) {
if (udp->cpu_mapping[i].hw_id == hw_id) {
return udp->cpu_mapping[i].cpu_number;
for (i = 0; i < udp->cpu_info->n_cpus; ++i) {
if (udp->cpu_info->mapping[i] == cpu_id)
return i;
}
return -1;
}
#if 0
static int hw_id_2_mckernel_cpu(struct mcctrl_usrdata *udp, int hw_id)
{
int i;
for (i = 0; i < udp->cpu_info->n_cpus; ++i) {
if (udp->cpu_info->hw_ids[i] == hw_id) {
return i;
}
}
return -1;
}
static int hw_id_2_linux_cpu(struct mcctrl_usrdata *udp, int hw_id)
{
int i;
for (i = 0; i < udp->cpu_info->n_cpus; ++i) {
if (udp->cpu_info->hw_ids[i] == hw_id) {
return mckernel_cpu_2_linux_cpu(udp, i);
}
}
return -1;
}
static int linux_cpu_2_hw_id(struct mcctrl_usrdata *udp, int cpu)
{
int mckernel_cpu = linux_cpu_2_mckernel_cpu(udp, cpu);
return (mckernel_cpu >= 0 && mckernel_cpu < udp->cpu_info->n_cpus) ?
udp->cpu_info->hw_ids[mckernel_cpu] : -1;
}
#endif
static int mckernel_numa_2_linux_numa(struct mcctrl_usrdata *udp, int numa_id)
{
return (numa_id < udp->mem_info->n_numa_nodes) ?
udp->mem_info->numa_mapping[numa_id] : -1;
}
static int linux_numa_2_mckernel_numa(struct mcctrl_usrdata *udp, int numa_id)
{
int i;
for (i = 0; i < udp->mem_info->n_numa_nodes; ++i) {
if (udp->mem_info->numa_mapping[i] == numa_id)
return i;
}
return -1;
}
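/* Minimal sketch (not part of this patch): the two CPU conversions
 * above are intended to be inverses of each other for every valid
 * McKernel CPU id. check_cpu_mapping() is a hypothetical helper
 * shown only for illustration. */
static void check_cpu_mapping(struct mcctrl_usrdata *udp)
{
	int mck, lin;
	for (mck = 0; mck < udp->cpu_info->n_cpus; ++mck) {
		lin = mckernel_cpu_2_linux_cpu(udp, mck);
		if (lin < 0 || linux_cpu_2_mckernel_cpu(udp, lin) != mck)
			printk("CPU mapping mismatch: mck %d <-> linux %d\n",
				mck, lin);
	}
}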
static int translate_cpumap(struct mcctrl_usrdata *udp,
cpumask_t *linmap, cpumask_t *mckmap)
{
int error;
ihk_device_t dev = ihk_os_to_dev(udp->os);
int lincpu;
int hw_id;
int mckcpu;
dprintk("translate_cpumap(%p,%p,%p)\n", udp, linmap, mckmap);
cpumask_clear(mckmap);
for_each_cpu(lincpu, linmap) {
hw_id = ihk_device_linux_cpu_to_hw_id(dev, lincpu);
if (hw_id < 0) {
error = hw_id;
eprintk("mcctrl:translate_cpumap:"
"ihk_device_linux_cpu_to_hw_id failed."
" %d\n", error);
goto out;
}
mckcpu = linux_cpu_2_mckernel_cpu(udp, lincpu);
mckcpu = hwid_to_cpu(udp, hw_id);
if (mckcpu >= 0) {
cpumask_set_cpu(mckcpu, mckmap);
}
}
error = 0;
out:
dprintk("translate_cpumap(%p,%p,%p): %d\n", udp, linmap, mckmap, error);
return error;
} /* translate_cpumap() */
@@ -361,7 +334,7 @@ out:
return (error)? ERR_PTR(error): topo;
} /* get_cache_topology() */
static struct cpu_topology *get_cpu_topology_one(struct mcctrl_usrdata *udp,
static struct cpu_topology *get_one_cpu_topology(struct mcctrl_usrdata *udp,
int index)
{
int error;
@@ -370,41 +343,43 @@ static struct cpu_topology *get_cpu_topology_one(struct mcctrl_usrdata *udp,
struct cache_topology *cache;
struct ihk_cache_topology *saved_cache;
dprintk("get_cpu_topology_one(%p,%d)\n", udp, index);
dprintk("get_one_cpu_topology(%p,%d)\n", udp, index);
topology = kmalloc(sizeof(*topology), GFP_KERNEL);
if (!topology) {
error = -ENOMEM;
eprintk("mcctrl:get_cpu_topology_one:"
eprintk("mcctrl:get_one_cpu_topology:"
"kmalloc failed. %d\n", error);
goto out;
}
INIT_LIST_HEAD(&topology->cache_list);
topology->cpu_mapping = &udp->cpu_mapping[index];
topology->mckernel_cpu_id = index;
topology->saved = ihk_device_get_cpu_topology(dev,
mckernel_cpu_2_hw_id(udp, index));
topology->saved = ihk_device_get_cpu_topology(
dev, topology->cpu_mapping->hw_id);
if (IS_ERR(topology->saved)) {
error = PTR_ERR(topology->saved);
eprintk("mcctrl:get_cpu_topology_one:"
eprintk("mcctrl:get_one_cpu_topology:"
"ihk_device_get_cpu_topology failed. %d\n",
error);
goto out;
}
error = translate_cpumap(udp, &topology->saved->core_siblings,
error = translate_cpumap(udp,
&topology->saved->core_siblings,
&topology->core_siblings);
if (error) {
eprintk("mcctrl:get_cpu_topology_one:"
eprintk("mcctrl:get_one_cpu_topology:"
"translate_cpumap(core_siblings) failed."
" %d\n", error);
goto out;
}
error = translate_cpumap(udp, &topology->saved->thread_siblings,
error = translate_cpumap(udp,
&topology->saved->thread_siblings,
&topology->thread_siblings);
if (error) {
eprintk("mcctrl:get_cpu_topology_one:"
eprintk("mcctrl:get_one_cpu_topology:"
"translate_cpumap(thread_siblings) failed."
" %d\n", error);
goto out;
@@ -415,7 +390,7 @@ static struct cpu_topology *get_cpu_topology_one(struct mcctrl_usrdata *udp,
cache = get_cache_topology(udp, topology, saved_cache);
if (IS_ERR(cache)) {
error = PTR_ERR(cache);
eprintk("mcctrl:get_cpu_topology_one:"
eprintk("mcctrl:get_one_cpu_topology:"
"get_cache_topology failed. %d\n",
error);
goto out;
@@ -429,10 +404,10 @@ out:
if (error && !IS_ERR_OR_NULL(topology)) {
free_cpu_topology_one(udp, topology);
}
dprintk("get_cpu_topology_one(%p,%d): %d %p\n",
dprintk("get_one_cpu_topology(%p,%d): %d %p\n",
udp, index, error, topology);
return (error)? ERR_PTR(error): topology;
} /* get_cpu_topology_one() */
} /* get_one_cpu_topology() */
static int get_cpu_topology(struct mcctrl_usrdata *udp)
{
@@ -441,12 +416,12 @@ static int get_cpu_topology(struct mcctrl_usrdata *udp)
struct cpu_topology *topology;
dprintk("get_cpu_topology(%p)\n", udp);
for (index = 0; index < udp->cpu_mapping_elems; ++index) {
topology = get_cpu_topology_one(udp, index);
for (index = 0; index < udp->cpu_info->n_cpus; ++index) {
topology = get_one_cpu_topology(udp, index);
if (IS_ERR(topology)) {
error = PTR_ERR(topology);
eprintk("mcctrl:get_cpu_topology:"
"get_cpu_topology_one failed. %d\n",
eprintk("mcctrl:get_cpu_topology: "
"get_one_cpu_topology failed. %d\n",
error);
goto out;
}
@@ -460,15 +435,15 @@ out:
return error;
} /* get_cpu_topology() */
static void setup_one_cache_files(struct mcctrl_usrdata *udp,
static void setup_cpu_sysfs_cache_files(struct mcctrl_usrdata *udp,
struct cpu_topology *cpu, struct cache_topology *cache)
{
char *prefix = "/sys/devices/system/cpu";
int cpu_number = cpu->cpu_mapping->cpu_number;
int cpu_number = cpu->mckernel_cpu_id;
int index = cache->saved->index;
struct sysfsm_bitmap_param param;
dprintk("setup_one_cache_files(%p,%p,%p)\n", udp, cpu, cache);
dprintk("setup_cpu_sysfs_cache_files(%p,%p,%p)\n", udp, cpu, cache);
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_d64,
&cache->saved->level, 0444,
@@ -509,19 +484,19 @@ static void setup_one_cache_files(struct mcctrl_usrdata *udp,
"%s/cpu%d/cache/index%d/shared_cpu_list",
prefix, cpu_number, index);
dprintk("setup_one_cache_files(%p,%p,%p):\n", udp, cpu, cache);
dprintk("setup_cpu_sysfs_cache_files(%p,%p,%p):\n", udp, cpu, cache);
return;
} /* setup_one_cache_files() */
} /* setup_cpu_sysfs_cache_files() */
static void setup_one_cpu_files(struct mcctrl_usrdata *udp,
static void setup_cpu_sysfs_files(struct mcctrl_usrdata *udp,
struct cpu_topology *cpu)
{
char *prefix = "/sys/devices/system/cpu";
int cpu_number = cpu->cpu_mapping->cpu_number;
int cpu_number = cpu->mckernel_cpu_id;
struct sysfsm_bitmap_param param;
struct cache_topology *cache;
dprintk("setup_one_cpu_files(%p,%p)\n", udp, cpu);
dprintk("setup_cpu_sysfs_files(%p,%p)\n", udp, cpu);
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_d32,
&cpu->saved->physical_package_id, 0444,
@@ -553,41 +528,61 @@ static void setup_one_cpu_files(struct mcctrl_usrdata *udp,
prefix, cpu_number);
list_for_each_entry(cache, &cpu->cache_list, chain) {
setup_one_cache_files(udp, cpu, cache);
setup_cpu_sysfs_cache_files(udp, cpu, cache);
}
dprintk("setup_one_cpu_files(%p,%p):\n", udp, cpu);
dprintk("setup_cpu_sysfs_files(%p,%p):\n", udp, cpu);
return;
} /* setup_one_cpu_files() */
} /* setup_cpu_sysfs_files() */
static void setup_cpu_files(struct mcctrl_usrdata *udp)
static void setup_cpus_sysfs_files_node_link(struct mcctrl_usrdata *udp)
{
int error;
int cpu;
struct sysfs_handle handle;
for (cpu = 0; cpu < udp->cpu_info->n_cpus; ++cpu) {
int node = linux_numa_2_mckernel_numa(udp,
cpu_to_node(mckernel_cpu_2_linux_cpu(udp, cpu)));
error = sysfsm_lookupf(udp->os, &handle,
"/sys/devices/system/node/node%d", node);
if (error) {
panic("sysfsm_lookupf: node for CPU");
}
error = sysfsm_symlinkf(udp->os, handle,
"/sys/devices/system/cpu/cpu%d/node%d",
cpu, node);
if (error) {
panic("sysfsm_symlinkf(CPU in node)");
}
}
error = 0;
return;
}
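/* Example of the resulting layout: if McKernel CPU 3 maps to a Linux
 * CPU on a node that corresponds to McKernel NUMA node 0, the loop
 * above creates the symlink
 *	/sys/devices/system/cpu/cpu3/node0 -> /sys/devices/system/node/node0
 * in the LWK-private sysfs tree. */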
static void setup_cpus_sysfs_files(struct mcctrl_usrdata *udp)
{
int error;
struct cpu_topology *cpu;
dprintk("setup_cpu_file(%p)\n", udp);
error = get_cpu_mapping(udp);
if (error) {
eprintk("mcctrl:setup_cpu_files:"
"get_cpu_mapping failed. %d\n", error);
goto out;
}
error = get_cpu_topology(udp);
if (error) {
eprintk("mcctrl:setup_cpu_files:"
eprintk("mcctrl:setup_cpus_sysfs_files:"
"get_cpu_topology failed. %d\n", error);
goto out;
}
list_for_each_entry(cpu, &udp->cpu_topology_list, chain) {
setup_one_cpu_files(udp, cpu);
setup_cpu_sysfs_files(udp, cpu);
}
error = 0;
out:
dprintk("setup_cpu_file(%p):\n", udp);
return;
} /* setup_cpu_files() */
} /* setup_cpus_sysfs_files() */
static struct node_topology *get_one_node_topology(struct mcctrl_usrdata *udp,
struct ihk_node_topology *saved)
@@ -629,8 +624,10 @@ static int get_node_topology(struct mcctrl_usrdata *udp)
struct node_topology *topology;
dprintk("get_node_topology(%p)\n", udp);
for (node = 0; ; ++node) {
saved = ihk_device_get_node_topology(dev, node);
for (node = 0; node < udp->mem_info->n_numa_nodes; ++node) {
saved = ihk_device_get_node_topology(dev,
mckernel_numa_2_linux_numa(udp, node));
if (IS_ERR(saved)) {
break;
}
@@ -647,6 +644,8 @@ static int get_node_topology(struct mcctrl_usrdata *udp)
goto out;
}
topology->mckernel_numa_id = node;
list_add(&topology->chain, &udp->node_topology_list);
}
@@ -659,6 +658,7 @@ out:
static int setup_node_files(struct mcctrl_usrdata *udp)
{
int error;
int node;
struct node_topology *p;
struct sysfsm_bitmap_param param;
@@ -670,16 +670,71 @@ static int setup_node_files(struct mcctrl_usrdata *udp)
goto out;
}
memset(&udp->numa_online, 0, sizeof(udp->numa_online));
for (node = 0; node < udp->mem_info->n_numa_nodes; ++node) {
node_set(node, udp->numa_online);
}
param.nbits = MAX_NUMNODES;
param.ptr = &udp->numa_online;
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_pbl, &param, 0444,
"/sys/devices/system/node/online");
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_pbl, &param, 0444,
"/sys/devices/system/node/possible");
list_for_each_entry(p, &udp->node_topology_list, chain) {
struct sysfs_handle handle;
int cpu;
size_t offset = 0;
param.nbits = nr_cpumask_bits;
param.ptr = &p->cpumap;
for (node = 0; node < udp->mem_info->n_numa_nodes; ++node) {
if (node > 0) {
offset += snprintf(&p->mckernel_numa_distance_s[offset],
NODE_DISTANCE_S_SIZE - offset, "%s", " ");
}
offset += snprintf(&p->mckernel_numa_distance_s[offset],
NODE_DISTANCE_S_SIZE - offset, "%d",
node_distance(
mckernel_numa_2_linux_numa(udp, p->mckernel_numa_id),
mckernel_numa_2_linux_numa(udp, node)
));
}
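/* For two McKernel NUMA nodes, for example, node0's string would read
 * something like "10 21": the node_distance() values of the
 * corresponding Linux nodes, space-separated in McKernel node order. */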
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_s,
p->mckernel_numa_distance_s, 0444,
"/sys/devices/system/node/node%d/distance",
p->mckernel_numa_id);
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_pb, &param, 0444,
"/sys/devices/system/node/node%d/cpumap",
p->saved->node_number);
p->mckernel_numa_id);
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_pbl, &param, 0444,
"/sys/devices/system/node/node%d/cpulist",
p->saved->node_number);
p->mckernel_numa_id);
/* Add CPU symlinks for this node */
for (cpu = 0; cpu < udp->cpu_info->n_cpus; ++cpu) {
if (linux_numa_2_mckernel_numa(udp,
cpu_to_node(mckernel_cpu_2_linux_cpu(udp, cpu)))
!= p->mckernel_numa_id) {
continue;
}
error = sysfsm_lookupf(udp->os, &handle,
"/sys/devices/system/cpu/cpu%d", cpu);
if (error) {
panic("sysfsm_lookupf(CPU in node)");
}
error = sysfsm_symlinkf(udp->os, handle,
"/sys/devices/system/node/node%d/cpu%d",
p->mckernel_numa_id, cpu);
if (error) {
panic("sysfsm_symlinkf(CPU in node)");
}
}
}
error = 0;
@@ -1026,11 +1081,18 @@ void setup_sysfs_files(ihk_os_t os)
panic("sysfsm_unlinkf");
}
setup_local_snooping_samples(os);
//setup_local_snooping_samples(os);
setup_local_snooping_files(os);
setup_cpu_files(udp);
setup_cpus_sysfs_files(udp);
setup_node_files(udp);
setup_pci_files(udp);
setup_cpus_sysfs_files_node_link(udp);
//setup_pci_files(udp);
/* Indicate sysfs files setup completion for boot script */
error = sysfsm_mkdirf(os, NULL, "/sys/setup_complete");
if (error) {
panic("sysfsm_mkdir(complete)");
}
return;
} /* setup_files() */

View File

@@ -17,6 +17,20 @@
#include <sys/socket.h>
#include <arpa/inet.h>
/* From ihk/linux/include/ihk/ihk_host_user.h */
#define PHYS_CHUNKS_DESC_SIZE 8192
struct dump_mem_chunk {
unsigned long addr;
unsigned long size;
};
typedef struct dump_mem_chunks_s {
int nr_chunks;
struct dump_mem_chunk chunks[];
} dump_mem_chunks_t;
/* ---------- */
#define CPU_TID_BASE 1000000
struct options {
@@ -53,6 +67,7 @@ static volatile int f_done = 0;
static bfd *symbfd = NULL;
static bfd *dumpbfd = NULL;
static asection *dumpscn = NULL;
static dump_mem_chunks_t *mem_chunks;
static int num_processors = -1;
static asymbol **symtab = NULL;
static ssize_t nsyms;
@@ -91,25 +106,35 @@ static uintptr_t virt_to_phys(uintptr_t va) {
static int read_physmem(uintptr_t pa, void *buf, size_t size) {
off_t off;
bfd_boolean ok;
int i;
if (pa < dumpscn->vma) {
printf("read_physmem(%lx,%p,%lx):too small pa. vma %lx\n", pa, buf, size, dumpscn->vma);
return 1;
}
off = pa - dumpscn->vma;
if (off >= dumpscn->size) {
printf("read_physmem(%lx,%p,%lx):too large pa. vma %lx size %lx\n", pa, buf, size, dumpscn->vma, dumpscn->size);
return 1;
}
if ((dumpscn->size - off) < size) {
printf("read_physmem(%lx,%p,%lx):too large size. vma %lx size %lx\n", pa, buf, size, dumpscn->vma, dumpscn->size);
off = 0;
/* Check whether pa falls within any chunk and figure
 * out the global offset in the dump section */
for (i = 0; i < mem_chunks->nr_chunks; ++i) {
if (mem_chunks->chunks[i].addr <= pa &&
((pa + size) <= (mem_chunks->chunks[i].addr +
mem_chunks->chunks[i].size))) {
off += (pa - mem_chunks->chunks[i].addr);
break;
}
off += mem_chunks->chunks[i].size;
}
if (i == mem_chunks->nr_chunks) {
printf("read_physmem: invalid addr 0x%lx\n", pa);
return 1;
}
ok = bfd_get_section_contents(dumpbfd, dumpscn, buf, off, size);
if (!ok) {
bfd_perror("read_physmem:bfd_get_section_contents");
return 1;
}
return 0;
} /* read_physmem() */
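/* Worked example of the offset computation above: with chunks
 * {addr 0x100000, size 0x200000} and {addr 0x1000000, size 0x100000},
 * pa 0x1000040 misses chunk 0 (off += 0x200000) and falls in chunk 1,
 * so off ends up 0x200000 + 0x40 = 0x200040 into the dump section. */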
@@ -508,6 +533,25 @@ static int setup_dump(char *fname) {
return 1;
}
mem_chunks = malloc(PHYS_CHUNKS_DESC_SIZE);
if (!mem_chunks) {
perror("allocating mem chunks descriptor: ");
return 1;
}
dumpscn = bfd_get_section_by_name(dumpbfd, "physchunks");
if (!dumpscn) {
bfd_perror("bfd_get_section_by_name");
return 1;
}
ok = bfd_get_section_contents(dumpbfd, dumpscn, mem_chunks,
0, PHYS_CHUNKS_DESC_SIZE);
if (!ok) {
bfd_perror("read_physmem:bfd_get_section_contents");
return 1;
}
dumpscn = bfd_get_section_by_name(dumpbfd, "physmem");
if (!dumpscn) {
bfd_perror("bfd_get_section_by_name");

View File

@@ -41,6 +41,7 @@
#include <sys/mman.h>
#include <asm/unistd.h>
#include <sched.h>
#include <dirent.h>
#include <termios.h>
#include <sys/ioctl.h>
@@ -1148,75 +1149,41 @@ void init_worker_threads(int fd)
#ifdef ENABLE_MCOVERLAYFS
#define READ_BUFSIZE 1024
static int isunshare(void)
static int find_mount_prefix(char *prefix)
{
int err = 0;
int ret;
int fd;
FILE *fp;
char *line = NULL;
size_t len = 0;
ssize_t read;
char proc_path[PATH_MAX];
ssize_t len_read;
char buf_read[READ_BUFSIZE + 1];
char *buf_read_off;
char *buf_find;
char buf_cmp[READ_BUFSIZE + 1];
char *buf_cmp_off;
ssize_t len_copy;
int ret = 0;
snprintf(proc_path, sizeof(proc_path), "/proc/%d/mounts", getpid());
fd = open(proc_path, O_RDONLY);
if (fd < 0) {
fprintf(stderr, "Error: Failed to open %s.\n", proc_path);
fp = fopen(proc_path, "r");
if (fp == NULL) {
return -1;
}
buf_cmp_off = buf_cmp;
while (1) {
len_read = read(fd, buf_read, READ_BUFSIZE);
if (len_read == -1) {
fprintf(stderr, "Error: Failed to read.\n");
err = -1;
break;
}
while ((read = getline(&line, &len, fp)) != -1) {
if (strlen(line) < strlen(prefix))
continue;
buf_read_off = buf_read;
while (1) {
if ((len_read - (buf_read_off - buf_read)) <= 0) {
break;
}
buf_find = memchr(buf_read_off, '\n',
len_read - (buf_read_off - buf_read));
if (buf_find) {
len_copy = buf_find - buf_read_off;
} else {
len_copy = len_read - (buf_read_off - buf_read);
}
memcpy(buf_cmp_off, buf_read_off, len_copy);
*(buf_cmp_off + len_copy) = '\0';
if (buf_find) {
buf_read_off = buf_read_off + len_copy + 1;
buf_cmp_off = buf_cmp;
ret = strncmp(buf_cmp, "mcoverlay /proc ", 16);
if (!ret) {
err = 1;
break;
}
} else {
buf_read_off = buf_read_off + len_copy;
buf_cmp_off = buf_cmp_off + len_copy;
break;
}
}
if (err == 1 || len_read == 0) {
if (!strncmp(line, prefix, strlen(prefix))) {
ret = 1;
break;
}
}
close(fd);
if (line)
free(line);
__dprintf("err=%d\n", err);
return err;
return ret;
}
static int isunshare(void)
{
return find_mount_prefix("mcoverlay /proc ");
}
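/* find_mount_prefix() matches against raw /proc/<pid>/mounts lines,
 * e.g. "mcoverlay /proc mcoverlay rw,relatime 0 0" (device, mount
 * point, fstype, options); it returns 1 on a prefix match, 0 when no
 * line matches and -1 if the file cannot be opened. */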
#endif // ENABLE_MCOVERLAYFS
@@ -1415,6 +1382,7 @@ int main(int argc, char **argv)
if (error == 0) {
struct sys_unshare_desc unshare_desc;
struct sys_mount_desc mount_desc;
struct sys_umount_desc umount_desc;
memset(&unshare_desc, '\0', sizeof unshare_desc);
memset(&mount_desc, '\0', sizeof mount_desc);
@@ -1426,6 +1394,53 @@ int main(int argc, char **argv)
return 1;
}
/*
* Umount cgroup filesystems that may expose invalid NUMA
* information
*/
if (find_mount_prefix("cgroup /sys/fs/cgroup/cpu,cpuacct")) {
umount_desc.dir_name = "/sys/fs/cgroup/cpu,cpuacct";
if (ioctl(fd, MCEXEC_UP_SYS_UMOUNT,
(unsigned long)&umount_desc) != 0) {
fprintf(stderr,
"WARNING: Failed to umount cgroup/cpu,cpuacct. (%s)\n",
strerror(errno));
}
}
else if (find_mount_prefix("cgroup /sys/fs/cgroup/cpu")) {
umount_desc.dir_name = "/sys/fs/cgroup/cpu";
if (ioctl(fd, MCEXEC_UP_SYS_UMOUNT,
(unsigned long)&umount_desc) != 0) {
fprintf(stderr,
"WARNING: Failed to umount cgroup/cpu. (%s)\n",
strerror(errno));
}
}
if (find_mount_prefix("cgroup /sys/fs/cgroup/cpuset")) {
umount_desc.dir_name = "/sys/fs/cgroup/cpuset";
if (ioctl(fd, MCEXEC_UP_SYS_UMOUNT,
(unsigned long)&umount_desc) != 0) {
fprintf(stderr,
"WARNING: Failed to umount cgroup/cpuset. (%s)\n",
strerror(errno));
}
}
if (find_mount_prefix("cgroup /sys/fs/cgroup/memory")) {
umount_desc.dir_name = "/sys/fs/cgroup/memory/";
if (ioctl(fd, MCEXEC_UP_SYS_UMOUNT,
(unsigned long)&umount_desc) != 0) {
fprintf(stderr,
"WARNING: Failed to umount cgroup/memory. (%s)\n",
strerror(errno));
}
}
sprintf(mcos_procdir, "/tmp/mcos/mcos%d_proc", mcosid);
mount_desc.dev_name = mcos_procdir;
mount_desc.dir_name = "/proc";
@@ -1686,6 +1701,97 @@ do_generic_syscall(
ret = -errno;
}
/* Overlayfs /sys/X directory lseek() problem workaround */
if (w->sr.number == __NR_lseek && ret == -EINVAL) {
char proc_path[512];
char path[512];
ssize_t len;
struct stat sb;
sprintf(proc_path, "/proc/self/fd/%d", (int)w->sr.args[0]);
/* Get filename (readlink() does not null-terminate) */
if ((len = readlink(proc_path, path, sizeof(path) - 1)) < 0) {
fprintf(stderr, "%s: error: readlink() failed for %s\n",
__FUNCTION__, proc_path);
goto out;
}
path[len] = 0;
/* Not in /sys? */
if (strncmp(path, "/sys/", 5))
goto out;
/* Stat */
if (stat(path, &sb) < 0) {
fprintf(stderr, "%s: error stat() failed for %s\n",
__FUNCTION__, path);
goto out;
}
/* Not dir? */
if ((sb.st_mode & S_IFMT) != S_IFDIR)
goto out;
ret = 0;
}
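/* In other words, an lseek() on a /sys directory that the overlay
 * rejected with -EINVAL is reported back to McKernel as success (0),
 * mirroring the native /sys behavior. */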
/* Pretend that nodeX entries in /sys/devices/system/node do not
 * exist for X >= number of LWK NUMA nodes */
else if (w->sr.number == __NR_getdents && ret > 0) {
struct linux_dirent {
long d_ino;
off_t d_off;
unsigned short d_reclen;
char d_name[];
};
struct linux_dirent *d;
char *buf = (char *)w->sr.args[1];
int bpos = 0;
int nodes, len;
char proc_path[PATH_MAX];
char path[PATH_MAX];
sprintf(proc_path, "/proc/self/fd/%d", (int)w->sr.args[0]);
/* Get filename */
if ((len = readlink(proc_path, path, sizeof(path))) < 0) {
fprintf(stderr, "%s: error: readlink() failed for %s\n",
__FUNCTION__, proc_path);
goto out;
}
path[len] = 0;
/* Not /sys/devices/system/node ? */
if (strcmp(path, "/sys/devices/system/node"))
goto out;
nodes = ioctl(fd, MCEXEC_UP_GET_NODES, 0);
if (nodes == -1) {
goto out;
}
d = (struct linux_dirent *) (buf + bpos);
for (bpos = 0; bpos < ret; ) {
int nodeid, tmp_reclen;
d = (struct linux_dirent *) (buf + bpos);
if (sscanf(d->d_name, "node%d", &nodeid) != 1) {
bpos += d->d_reclen;
continue;
}
if (nodeid >= nodes) {
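/* Slide the remaining entries over this one; bpos deliberately
 * stays put because the next entry now starts at buf + bpos */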
tmp_reclen = d->d_reclen;
memmove(buf + bpos,
buf + bpos + tmp_reclen,
ret - bpos - tmp_reclen);
ret -= tmp_reclen;
continue;
}
bpos += d->d_reclen;
}
}
out:
__dprintf("do_generic_syscall(%ld):%ld (%#lx)\n", w->sr.number, ret, ret);
return ret;
}
@@ -1695,7 +1801,7 @@ kill_thread(unsigned long tid)
{
int i;
for (i = 0; i < n_threads; ++i) {
for (i = 0; i <= n_threads; ++i) {
if(thread_data[i].remote_tid == tid){
pthread_kill(thread_data[i].thread_id, LOCALSIG);
break;
@@ -1954,21 +2060,6 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
#endif
case __NR_gettid:{
int mode = w.sr.args[0];
int remote_pid = w.sr.args[1];
int newcpuid = w.sr.args[2];
int oldcpuid = w.sr.args[3];
int wtid = thread_data[newcpuid].remote_tid;
if(mode == 0){
thread_data[ncpu].remote_tid = wtid;
thread_data[newcpuid].remote_tid = remote_pid;
}
else if(mode == 2){
thread_data[newcpuid].remote_tid = thread_data[oldcpuid].remote_tid;
thread_data[oldcpuid].remote_tid = wtid;
}
/*
* Number of TIDs and the remote physical address where TIDs are
* expected are passed in arg 4 and 5, respectively.
@@ -2002,7 +2093,7 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
free(tids);
}
gettid_out:
do_syscall_return(fd, cpu, thread_data[newcpuid].remote_tid, 0, 0, 0, 0);
do_syscall_return(fd, cpu, 0, 0, 0, 0, 0);
break;
}

View File

@@ -65,7 +65,7 @@ void ap_init(void)
{
struct ihk_mc_cpu_info *cpu_info;
int i;
int bsp_hw_id;
int bsp_hw_id, bsp_cpu_id;
ihk_mc_init_ap();
init_delay();
@@ -78,13 +78,23 @@ void ap_init(void)
return;
}
kprintf("BSP HW ID = %d\n", bsp_hw_id);
bsp_cpu_id = 0;
for (i = 0; i < cpu_info->ncpus; ++i) {
if (cpu_info->hw_ids[i] == bsp_hw_id) {
bsp_cpu_id = i;
break;
}
}
kprintf("BSP: %d (HW ID: %d @ NUMA %d)\n", bsp_cpu_id,
bsp_hw_id, cpu_info->nodes[0]);
for (i = 0; i < cpu_info->ncpus; i++) {
if (cpu_info->hw_ids[i] == bsp_hw_id) {
continue;
}
kprintf("AP Booting: %d (HW ID: %d)\n", i, cpu_info->hw_ids[i]);
kprintf("AP Booting: %d (HW ID: %d @ NUMA %d)\n", i,
cpu_info->hw_ids[i], cpu_info->nodes[i]);
ihk_mc_boot_cpu(cpu_info->hw_ids[i], (unsigned long)ap_wait);
num_processors++;
@@ -199,7 +209,7 @@ cpu_sysfs_setup(void)
/* setup table */
info = kmalloc(sizeof(*info) * num_processors, IHK_MC_AP_CRITICAL);
for (cpu = 0; cpu < num_processors; ++cpu) {
info[cpu].online = 10+cpu;
info[cpu].online = 1;
}
fake_cpu_infos = info;

View File

@@ -32,7 +32,7 @@ void cpu_local_var_init(void)
z = sizeof(struct cpu_local_var) * num_processors;
z = (z + PAGE_SIZE - 1) >> PAGE_SHIFT;
clv = allocate_pages(z, IHK_MC_AP_CRITICAL);
clv = ihk_mc_alloc_pages(z, IHK_MC_AP_CRITICAL);
memset(clv, 0, z * PAGE_SIZE);
cpu_local_var_initialized = 1;
}

View File

@@ -37,6 +37,8 @@ static void kprintf_wait(int len, unsigned long *flags_head, int *slide) {
if (head < tail) head += buf_len;
if (tail + len > buf_len) adj = buf_len - tail;
if (head > tail && head <= tail + len + adj) {
/* Advancing tail (the producer pointer) by len would cross
   head (the consumer pointer) in the ring buffer */
if (mode != 1) {
*slide = 1;
break;
@@ -70,6 +72,9 @@ void kputs(char *buf)
memcpy(kmsg_buf.str + kmsg_buf.tail, buf, len);
kmsg_buf.tail += len;
/* When advancing tail (the producer pointer) by len would cross
   head (the consumer pointer) in the ring buffer, give up the
   [head, tail] range because it is about to be overwritten */
if (slide == 1) {
kmsg_buf.head = kmsg_buf.tail + 1;
if (kmsg_buf.head >= kmsg_buf.len) kmsg_buf.head = 0;
@@ -170,6 +175,17 @@ int kprintf(const char *format, ...)
return len;
}
/* mode:
   0: mcklogd is not running.
      When the kmsg buffer is full, the writer doesn't block
      and overwrites the buffer.
   1: mcklogd periodically retrieves kmsg.
      When the kmsg buffer is full, the writer blocks until
      someone retrieves kmsg.
   2: mcklogd periodically retrieves kmsg.
      When the kmsg buffer is full, the writer doesn't block
      and overwrites the buffer.
*/
void kmsg_init(int mode)
{
ihk_mc_spinlock_init(&kmsg_lock);

View File

@@ -258,13 +258,24 @@ static void fileobj_release(struct memobj *memobj)
/* zap page_list */
for (;;) {
struct page *page;
int count;
void *page_va;
page = page_list_first(obj);
if (!page) {
break;
}
page_list_remove(obj, page);
page_va = phys_to_virt(page_to_phys(page));
if (ihk_atomic_read(&page->count) != 1) {
kprintf("%s: WARNING: page count for phys 0x%lx is invalid\n",
__FUNCTION__, page->phys);
}
if (page_unmap(page)) {
ihk_mc_free_pages(page_va, 1);
}
#if 0
count = ihk_atomic_sub_return(1, &page->count);
if (!((page->mode == PM_WILL_PAGEIO)
@@ -281,7 +292,7 @@ static void fileobj_release(struct memobj *memobj)
}
page->mode = PM_NONE;
free_pages(phys_to_virt(page_to_phys(page)), 1);
#endif
}
obj_list_remove(free_obj);
ihk_mc_spinlock_unlock_noirq(&fileobj_list_lock);
@@ -430,7 +441,7 @@ static int fileobj_get_page(struct memobj *memobj, off_t off, int p2align, uintp
goto out;
}
phys = virt_to_phys(virt);
page = phys_to_page(phys);
page = phys_to_page_insert_hash(phys);
if (page->mode != PM_NONE) {
panic("fileobj_get_page:invalid new page");
}
@@ -502,10 +513,10 @@ static uintptr_t fileobj_copy_page(
memobj_lock(memobj);
for (;;) {
if (orgpage->mode != PM_MAPPED) {
if (!orgpage || orgpage->mode != PM_MAPPED) {
kprintf("fileobj_copy_page(%p,%lx,%d):"
"invalid cow page. %x\n",
memobj, orgpa, p2align, orgpage->mode);
memobj, orgpa, p2align, orgpage ? orgpage->mode : 0);
panic("fileobj_copy_page:invalid cow page");
}
count = ihk_atomic_read(&orgpage->count);
@@ -527,7 +538,9 @@ static uintptr_t fileobj_copy_page(
memcpy(newkva, orgkva, pgsize);
ihk_atomic_dec(&orgpage->count);
newpa = virt_to_phys(newkva);
page_map(phys_to_page(newpa));
if (phys_to_page(newpa)) {
page_map(phys_to_page(newpa));
}
newkva = NULL; /* avoid ihk_mc_free_pages() */
break;
}
@@ -563,6 +576,11 @@ static int fileobj_flush_page(struct memobj *memobj, uintptr_t phys,
ssize_t ss;
page = phys_to_page(phys);
if (!page) {
kprintf("%s: warning: tried to flush non-existing page for phys addr: 0x%lx\n",
__FUNCTION__, phys);
return 0;
}
memobj_unlock(&obj->memobj);
ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_WRITE;

View File

@@ -534,31 +534,6 @@ extern void process_procfs_request(unsigned long rarg);
extern void terminate_host(int pid);
extern void debug_log(long);
static void req_get_cpu_mapping(long req_rpa)
{
size_t mapsize;
size_t size;
int npages;
long phys;
struct get_cpu_mapping_req *req;
struct cpu_mapping *buf;
size = sizeof(*req);
mapsize = size + (req_rpa & (PAGE_SIZE - 1));
npages = (mapsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
phys = ihk_mc_map_memory(NULL, req_rpa, size);
req = ihk_mc_map_virtual(phys, npages, PTATTR_WRITABLE);
req->error = arch_get_cpu_mapping(&buf, &req->buf_elems);
if (!req->error) {
req->buf_rpa = virt_to_phys(buf);
}
ihk_mc_unmap_virtual(req, npages, 0);
ihk_mc_unmap_memory(NULL, phys, size);
return;
} /* req_get_cpu_mapping() */
static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
void *__packet, void *ihk_os)
{
@@ -613,8 +588,7 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
dkprintf("SCD_MSG_SCHEDULE_PROCESS: %lx\n", packet->arg);
thread = (struct thread *)packet->arg;
proc = thread->proc;
settid(thread, 0, cpuid, -1, 0, NULL);
thread->tid = proc->pid;
proc->status = PS_RUNNING;
thread->status = PS_RUNNING;
chain_thread(thread);
@@ -688,15 +662,6 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
ret = 0;
break;
case SCD_MSG_GET_CPU_MAPPING:
req_get_cpu_mapping(packet->arg);
pckt.msg = SCD_MSG_REPLY_GET_CPU_MAPPING;
pckt.arg = packet->arg;
syscall_channel_send(c, &pckt);
ret = 0;
break;
default:
kprintf("syscall_pakcet_handler:unknown message "
"(%d.%d.%d.%d.%d.%#lx)\n",

View File

@@ -17,8 +17,9 @@
struct page {
struct list_head list;
struct list_head hash;
uint8_t mode;
uint8_t padding[3];
uint64_t phys;
ihk_atomic_t count;
off_t offset;
};
@@ -38,9 +39,8 @@ enum page_mode {
struct page *phys_to_page(uintptr_t phys);
uintptr_t page_to_phys(struct page *page);
int page_unmap(struct page *page);
struct page *phys_to_page_insert_hash(uint64_t phys);
void *allocate_pages(int npages, enum ihk_mc_ap_flag flag);
void free_pages(void *va, int npages);
void begin_free_pages_pending(void);
void finish_free_pages_pending(void);

View File

@@ -22,6 +22,7 @@
#include <memobj.h>
#include <affinity.h>
#include <syscall.h>
#include <bitops.h>
#define VR_NONE 0x0
#define VR_STACK 0x1
@@ -165,6 +166,69 @@
#define NOPHYS ((uintptr_t)-1)
#define PROCESS_NUMA_MASK_BITS 64
/*
* Both the MPOL_* mempolicy mode and the MPOL_F_* optional mode flags are
* passed by the user to either set_mempolicy() or mbind() in an 'int' actual.
* The MPOL_MODE_FLAGS macro determines the legal set of optional mode flags.
*/
/* Policies */
enum {
MPOL_DEFAULT,
MPOL_PREFERRED,
MPOL_BIND,
MPOL_INTERLEAVE,
MPOL_LOCAL,
MPOL_MAX, /* always last member of enum */
};
enum mpol_rebind_step {
MPOL_REBIND_ONCE, /* do rebind work at once(not by two step) */
MPOL_REBIND_STEP1, /* first step(set all the newly nodes) */
MPOL_REBIND_STEP2, /* second step(clean all the disallowed nodes)*/
MPOL_REBIND_NSTEP,
};
/* Flags for set_mempolicy */
#define MPOL_F_STATIC_NODES (1 << 15)
#define MPOL_F_RELATIVE_NODES (1 << 14)
/*
* MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
* either set_mempolicy() or mbind().
*/
#define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)
/* Flags for get_mempolicy */
#define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
#define MPOL_F_ADDR (1<<1) /* look up vma using address */
#define MPOL_F_MEMS_ALLOWED (1<<2) /* return allowed memories */
/* Flags for mbind */
#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */
#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform
to policy */
#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */
#define MPOL_MF_LAZY (1<<3) /* Modifies '_MOVE: lazy migrate on fault */
#define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */
#define MPOL_MF_VALID (MPOL_MF_STRICT | \
MPOL_MF_MOVE | \
MPOL_MF_MOVE_ALL)
/*
* Internal flags that share the struct mempolicy flags word with
* "mode flags". These flags are allocated from bit 0 up, as they
* are never OR'ed into the mode in mempolicy API arguments.
*/
#define MPOL_F_SHARED (1 << 0) /* identify shared policies */
#define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */
#define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */
#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */
#define MPOL_F_MORON (1 << 4) /* Migrate On pte_numa Reference On Node */
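/*
 * These constants mirror the Linux mempolicy UAPI, so userspace reaches
 * them through the standard syscalls. A minimal userspace sketch (not
 * part of this header; assumes libnuma's <numaif.h> and a valid addr):
 *
 *	#include <numaif.h>
 *
 *	int mode;
 *	unsigned long nmask = 0;
 *
 *	// Query the policy governing addr (MPOL_F_ADDR looks up the VMA)
 *	if (get_mempolicy(&mode, &nmask, PROCESS_NUMA_MASK_BITS,
 *			addr, MPOL_F_ADDR) == 0)
 *		printf("mode=%d nodemask=%#lx\n", mode, nmask);
 */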
#include <waitq.h>
#include <futex.h>
@@ -305,6 +369,13 @@ struct vm_range {
int padding;
};
struct vm_range_numa_policy {
struct list_head list;
unsigned long start, end;
DECLARE_BITMAP(numa_mask, PROCESS_NUMA_MASK_BITS);
int numa_mem_policy;
};
struct vm_regions {
unsigned long vm_start, vm_end;
unsigned long text_start, text_end;
@@ -594,6 +665,10 @@ struct process_vm {
int exiting;
long currss;
DECLARE_BITMAP(numa_mask, PROCESS_NUMA_MASK_BITS);
int numa_mem_policy;
/* Protected by memory_range_lock */
struct list_head vm_range_numa_policy_list;
};
static inline int has_cap_ipc_lock(struct thread *th)
@@ -690,7 +765,5 @@ void chain_thread(struct thread *);
void proc_init();
void set_timer();
struct sig_pending *hassigpending(struct thread *thread);
void settid(struct thread *thread, int mode, int newcpuid, int oldcpuid,
int nr_tids, int *tids);
#endif

View File

@@ -331,7 +331,7 @@ void delete_proc_procfs_files(int pid);
void create_os_procfs_files(void);
void delete_os_procfs_files(void);
#define PROCFS_NAME_MAX 1000
#define PROCFS_NAME_MAX 768
struct procfs_read {
unsigned long pbuf; /* physical address of the host buffer (request) */

View File

@@ -320,7 +320,7 @@ static void setup_remote_snooping_samples(void)
static void populate_sysfs(void)
{
cpu_sysfs_setup();
setup_remote_snooping_samples();
//setup_remote_snooping_samples();
} /* populate_sysfs() */
int host_ikc_inited = 0;

View File

@@ -48,9 +48,8 @@
#define ekprintf(...) kprintf(__VA_ARGS__)
#endif
static struct ihk_page_allocator_desc *pa_allocator;
static unsigned long pa_start, pa_end;
static struct page *pa_pages;
static struct ihk_mc_numa_node *memory_nodes = NULL;
extern void unhandled_page_fault(struct thread *, void *, void *);
extern int interrupt_from_user(void *);
@@ -59,13 +58,433 @@ struct tlb_flush_entry tlb_flush_vector[IHK_TLB_FLUSH_IRQ_VECTOR_SIZE];
int anon_on_demand = 0;
static void reserve_pages(unsigned long start, unsigned long end, int type)
static struct ihk_mc_pa_ops *pa_ops;
extern void *early_alloc_pages(int nr_pages);
extern void early_alloc_invalidate(void);
static char *memdebug = NULL;
static void *___kmalloc(int size, enum ihk_mc_ap_flag flag);
static void ___kfree(void *ptr);
static void *___ihk_mc_alloc_aligned_pages(int npages,
int p2align, enum ihk_mc_ap_flag flag);
static void *___ihk_mc_alloc_pages(int npages, enum ihk_mc_ap_flag flag);
static void ___ihk_mc_free_pages(void *p, int npages);
/*
* Page allocator tracking routines
*/
#define PAGEALLOC_TRACK_HASH_SHIFT (8)
#define PAGEALLOC_TRACK_HASH_SIZE (1 << PAGEALLOC_TRACK_HASH_SHIFT)
#define PAGEALLOC_TRACK_HASH_MASK (PAGEALLOC_TRACK_HASH_SIZE - 1)
struct list_head pagealloc_track_hash[PAGEALLOC_TRACK_HASH_SIZE];
ihk_spinlock_t pagealloc_track_hash_locks[PAGEALLOC_TRACK_HASH_SIZE];
struct list_head pagealloc_addr_hash[PAGEALLOC_TRACK_HASH_SIZE];
ihk_spinlock_t pagealloc_addr_hash_locks[PAGEALLOC_TRACK_HASH_SIZE];
int pagealloc_track_initialized = 0;
int pagealloc_runcount = 0;
struct pagealloc_track_addr_entry {
void *addr;
int runcount;
struct list_head list; /* track_entry's list */
struct pagealloc_track_entry *entry;
struct list_head hash; /* address hash */
int npages;
};
struct pagealloc_track_entry {
char *file;
int line;
ihk_atomic_t alloc_count;
struct list_head hash;
struct list_head addr_list;
ihk_spinlock_t addr_list_lock;
};
void pagealloc_track_init(void)
{
if (start < pa_start) {
if (!pagealloc_track_initialized) {
int i;
pagealloc_track_initialized = 1;
for (i = 0; i < PAGEALLOC_TRACK_HASH_SIZE; ++i) {
ihk_mc_spinlock_init(&pagealloc_track_hash_locks[i]);
INIT_LIST_HEAD(&pagealloc_track_hash[i]);
ihk_mc_spinlock_init(&pagealloc_addr_hash_locks[i]);
INIT_LIST_HEAD(&pagealloc_addr_hash[i]);
}
}
}
/* NOTE: Hash lock must be held */
struct pagealloc_track_entry *__pagealloc_track_find_entry(
char *file, int line)
{
struct pagealloc_track_entry *entry_iter, *entry = NULL;
int hash = (strlen(file) + line) & PAGEALLOC_TRACK_HASH_MASK;
list_for_each_entry(entry_iter, &pagealloc_track_hash[hash], hash) {
if (!strcmp(entry_iter->file, file) &&
entry_iter->line == line) {
entry = entry_iter;
break;
}
}
if (entry) {
dkprintf("%s found entry %s:%d\n", __FUNCTION__,
file, line);
}
else {
dkprintf("%s couldn't find entry %s:%d\n", __FUNCTION__,
file, line);
}
return entry;
}
/* Top level routines called from macros */
void *_ihk_mc_alloc_aligned_pages(int npages, int p2align,
enum ihk_mc_ap_flag flag, char *file, int line)
{
unsigned long irqflags;
struct pagealloc_track_entry *entry;
struct pagealloc_track_addr_entry *addr_entry;
int hash, addr_hash;
void *r = ___ihk_mc_alloc_aligned_pages(npages, p2align, flag);
if (!memdebug || !pagealloc_track_initialized)
return r;
if (!r)
return r;
hash = (strlen(file) + line) & PAGEALLOC_TRACK_HASH_MASK;
irqflags = ihk_mc_spinlock_lock(&pagealloc_track_hash_locks[hash]);
entry = __pagealloc_track_find_entry(file, line);
if (!entry) {
entry = ___kmalloc(sizeof(*entry), IHK_MC_AP_NOWAIT);
if (!entry) {
kprintf("%s: ERROR: allocating tracking entry\n", __FUNCTION__);
/* Drop the hash lock before bailing out */
ihk_mc_spinlock_unlock(&pagealloc_track_hash_locks[hash], irqflags);
goto out;
}
entry->line = line;
ihk_atomic_set(&entry->alloc_count, 1);
ihk_mc_spinlock_init(&entry->addr_list_lock);
INIT_LIST_HEAD(&entry->addr_list);
entry->file = ___kmalloc(strlen(file) + 1, IHK_MC_AP_NOWAIT);
if (!entry->file) {
kprintf("%s: ERROR: allocating file string\n");
___kfree(entry);
ihk_mc_spinlock_unlock(&pagealloc_track_hash_locks[hash], irqflags);
goto out;
}
strcpy(entry->file, file);
entry->file[strlen(file)] = 0;
list_add(&entry->hash, &pagealloc_track_hash[hash]);
dkprintf("%s entry %s:%d npages: %d added\n", __FUNCTION__,
file, line, npages);
}
else {
ihk_atomic_inc(&entry->alloc_count);
}
ihk_mc_spinlock_unlock(&pagealloc_track_hash_locks[hash], irqflags);
/* Add new addr entry for this allocation entry */
addr_entry = ___kmalloc(sizeof(*addr_entry), IHK_MC_AP_NOWAIT);
if (!addr_entry) {
kprintf("%s: ERROR: allocating addr entry\n");
goto out;
}
addr_entry->addr = r;
addr_entry->runcount = pagealloc_runcount;
addr_entry->entry = entry;
addr_entry->npages = npages;
irqflags = ihk_mc_spinlock_lock(&entry->addr_list_lock);
list_add(&addr_entry->list, &entry->addr_list);
ihk_mc_spinlock_unlock(&entry->addr_list_lock, irqflags);
/* Add addr entry to address hash */
addr_hash = ((unsigned long)r >> 5) & PAGEALLOC_TRACK_HASH_MASK;
irqflags = ihk_mc_spinlock_lock(&pagealloc_addr_hash_locks[addr_hash]);
list_add(&addr_entry->hash, &pagealloc_addr_hash[addr_hash]);
ihk_mc_spinlock_unlock(&pagealloc_addr_hash_locks[addr_hash], irqflags);
dkprintf("%s addr_entry %p added\n", __FUNCTION__, r);
out:
return r;
}
void *_ihk_mc_alloc_pages(int npages, enum ihk_mc_ap_flag flag,
char *file, int line)
{
return _ihk_mc_alloc_aligned_pages(npages, PAGE_P2ALIGN, flag, file, line);
}
void _ihk_mc_free_pages(void *ptr, int npages, char *file, int line)
{
unsigned long irqflags;
struct pagealloc_track_entry *entry;
struct pagealloc_track_addr_entry *addr_entry_iter, *addr_entry = NULL;
struct pagealloc_track_addr_entry *addr_entry_next = NULL;
int hash;
int rehash_addr_entry = 0;
if (!memdebug || !pagealloc_track_initialized) {
goto out;
}
hash = ((unsigned long)ptr >> 5) & PAGEALLOC_TRACK_HASH_MASK;
irqflags = ihk_mc_spinlock_lock(&pagealloc_addr_hash_locks[hash]);
list_for_each_entry(addr_entry_iter,
&pagealloc_addr_hash[hash], hash) {
if (addr_entry_iter->addr == ptr) {
addr_entry = addr_entry_iter;
break;
}
}
if (addr_entry) {
if (addr_entry->npages > npages) {
addr_entry->addr += (npages * PAGE_SIZE);
addr_entry->npages -= npages;
/* Only rehash if haven't freed all pages yet */
if (addr_entry->npages) {
rehash_addr_entry = 1;
}
}
list_del(&addr_entry->hash);
}
ihk_mc_spinlock_unlock(&pagealloc_addr_hash_locks[hash], irqflags);
if (!addr_entry) {
/*
 * Deallocations that don't start at the allocated address are
 * valid but can't be found in the addr hash; scan the entire
 * table and split the matching entry.
 */
for (hash = 0; hash < PAGEALLOC_TRACK_HASH_SIZE; ++hash) {
irqflags = ihk_mc_spinlock_lock(&pagealloc_addr_hash_locks[hash]);
list_for_each_entry(addr_entry_iter,
&pagealloc_addr_hash[hash], hash) {
if (addr_entry_iter->addr < ptr &&
(addr_entry_iter->addr + addr_entry_iter->npages * PAGE_SIZE)
>= ptr + (npages * PAGE_SIZE)) {
addr_entry = addr_entry_iter;
break;
}
}
if (addr_entry) {
list_del(&addr_entry->hash);
}
ihk_mc_spinlock_unlock(&pagealloc_addr_hash_locks[hash], irqflags);
if (addr_entry) break;
}
/* Still not? Invalid deallocation */
if (!addr_entry) {
kprintf("%s: ERROR: invalid deallocation @ %s:%d\n",
__FUNCTION__, file, line);
panic("invalid deallocation");
}
dkprintf("%s: found covering addr_entry: 0x%lx:%d\n", __FUNCTION__,
addr_entry->addr, addr_entry->npages);
entry = addr_entry->entry;
/*
* Now split, allocate new entry and rehash.
* Is there a remaining piece after the deallocation?
*/
if ((ptr + (npages * PAGE_SIZE)) <
(addr_entry->addr + (addr_entry->npages * PAGE_SIZE))) {
int addr_hash;
addr_entry_next =
___kmalloc(sizeof(*addr_entry_next), IHK_MC_AP_NOWAIT);
if (!addr_entry_next) {
kprintf("%s: ERROR: allocating addr entry prev\n", __FUNCTION__);
goto out;
}
addr_entry_next->addr = ptr + (npages * PAGE_SIZE);
addr_entry_next->npages = ((addr_entry->addr +
(addr_entry->npages * PAGE_SIZE)) -
(ptr + npages * PAGE_SIZE)) / PAGE_SIZE;
addr_entry_next->runcount = addr_entry->runcount;
addr_hash = ((unsigned long)addr_entry_next->addr >> 5) &
PAGEALLOC_TRACK_HASH_MASK;
irqflags = ihk_mc_spinlock_lock(&pagealloc_addr_hash_locks[addr_hash]);
list_add(&addr_entry_next->hash, &pagealloc_addr_hash[addr_hash]);
ihk_mc_spinlock_unlock(&pagealloc_addr_hash_locks[addr_hash], irqflags);
/* Add to allocation entry */
addr_entry_next->entry = entry;
ihk_atomic_inc(&entry->alloc_count);
ihk_mc_spinlock_lock_noirq(&entry->addr_list_lock);
list_add(&addr_entry_next->list, &entry->addr_list);
ihk_mc_spinlock_unlock_noirq(&entry->addr_list_lock);
dkprintf("%s: addr_entry_next: 0x%lx:%d\n", __FUNCTION__,
addr_entry_next->addr, addr_entry_next->npages);
}
/*
 * We know that addr_entry->addr != ptr here, so addr_entry will
 * cover the region before the deallocation.
 */
addr_entry->npages = (ptr - addr_entry->addr) / PAGE_SIZE;
rehash_addr_entry = 1;
dkprintf("%s: modified addr_entry: 0x%lx:%d\n", __FUNCTION__,
addr_entry->addr, addr_entry->npages);
}
entry = addr_entry->entry;
if (rehash_addr_entry) {
int addr_hash = ((unsigned long)addr_entry->addr >> 5) &
PAGEALLOC_TRACK_HASH_MASK;
irqflags = ihk_mc_spinlock_lock(&pagealloc_addr_hash_locks[addr_hash]);
list_add(&addr_entry->hash, &pagealloc_addr_hash[addr_hash]);
ihk_mc_spinlock_unlock(&pagealloc_addr_hash_locks[addr_hash], irqflags);
goto out;
}
irqflags = ihk_mc_spinlock_lock(&entry->addr_list_lock);
list_del(&addr_entry->list);
ihk_mc_spinlock_unlock(&entry->addr_list_lock, irqflags);
dkprintf("%s addr_entry %p removed\n", __FUNCTION__, addr_entry->addr);
___kfree(addr_entry);
/* Do we need to remove tracking entry as well? */
hash = (strlen(entry->file) + entry->line) &
PAGEALLOC_TRACK_HASH_MASK;
irqflags = ihk_mc_spinlock_lock(&pagealloc_track_hash_locks[hash]);
if (!ihk_atomic_dec_and_test(&entry->alloc_count)) {
ihk_mc_spinlock_unlock(&pagealloc_track_hash_locks[hash], irqflags);
goto out;
}
list_del(&entry->hash);
ihk_mc_spinlock_unlock(&pagealloc_track_hash_locks[hash], irqflags);
dkprintf("%s entry %s:%d removed\n", __FUNCTION__,
entry->file, entry->line);
___kfree(entry->file);
___kfree(entry);
out:
___ihk_mc_free_pages(ptr, npages);
}
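/*
 * The _ihk_mc_*() entry points above carry call-site file/line
 * arguments, so they are presumably reached through wrapper macros
 * along these lines (an assumed sketch; the real definitions live in
 * the ihk headers, not in this diff):
 *
 *	#define ihk_mc_alloc_pages(npages, flag) \
 *		_ihk_mc_alloc_pages(npages, flag, __FILE__, __LINE__)
 *	#define ihk_mc_free_pages(ptr, npages) \
 *		_ihk_mc_free_pages(ptr, npages, __FILE__, __LINE__)
 */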
void pagealloc_memcheck(void)
{
int i;
unsigned long irqflags;
struct pagealloc_track_entry *entry = NULL;
for (i = 0; i < PAGEALLOC_TRACK_HASH_SIZE; ++i) {
irqflags = ihk_mc_spinlock_lock(&pagealloc_track_hash_locks[i]);
list_for_each_entry(entry, &pagealloc_track_hash[i], hash) {
struct pagealloc_track_addr_entry *addr_entry = NULL;
int cnt = 0;
ihk_mc_spinlock_lock_noirq(&entry->addr_list_lock);
list_for_each_entry(addr_entry, &entry->addr_list, list) {
dkprintf("%s memory leak: %p @ %s:%d runcount: %d\n",
__FUNCTION__,
addr_entry->addr,
entry->file,
entry->line,
addr_entry->runcount);
if (pagealloc_runcount != addr_entry->runcount)
continue;
cnt++;
}
ihk_mc_spinlock_unlock_noirq(&entry->addr_list_lock);
if (!cnt)
continue;
kprintf("%s memory leak: %s:%d cnt: %d, runcount: %d\n",
__FUNCTION__,
entry->file,
entry->line,
cnt,
pagealloc_runcount);
}
ihk_mc_spinlock_unlock(&pagealloc_track_hash_locks[i], irqflags);
}
++pagealloc_runcount;
}
/* Actual allocation routines */
static void *___ihk_mc_alloc_aligned_pages(int npages, int p2align,
enum ihk_mc_ap_flag flag)
{
if (pa_ops)
return pa_ops->alloc_page(npages, p2align, flag);
else
return early_alloc_pages(npages);
}
static void *___ihk_mc_alloc_pages(int npages, enum ihk_mc_ap_flag flag)
{
return ___ihk_mc_alloc_aligned_pages(npages, PAGE_P2ALIGN, flag);
}
static void ___ihk_mc_free_pages(void *p, int npages)
{
if (pa_ops)
pa_ops->free_page(p, npages);
}
void ihk_mc_set_page_allocator(struct ihk_mc_pa_ops *ops)
{
pagealloc_track_init();
early_alloc_invalidate();
pa_ops = ops;
}
/* Internal allocation routines */
static void reserve_pages(struct ihk_page_allocator_desc *pa_allocator,
unsigned long start, unsigned long end, int type)
{
if (start < pa_allocator->start) {
start = pa_allocator->start;
}
if (end > pa_end) {
end = pa_allocator->last;
if (end > pa_allocator->end) {
end = pa_allocator->end;
}
if (start >= end) {
return;
@@ -75,43 +494,83 @@ static void reserve_pages(unsigned long start, unsigned long end, int type)
ihk_pagealloc_reserve(pa_allocator, start, end);
}
void *allocate_aligned_pages(int npages, int p2align, enum ihk_mc_ap_flag flag)
static void *allocate_aligned_pages(int npages, int p2align,
enum ihk_mc_ap_flag flag)
{
unsigned long pa = ihk_pagealloc_alloc(pa_allocator, npages, p2align);
/* ihk_pagealloc_alloc() returns zero when an error occurs and the
   caller (in mcos/kernel/process.c) checks for that, so propagate it */
if(pa)
unsigned long pa;
int i;
/* TODO: match NUMA id and distance matrix with allocating core */
for (i = 0; i < ihk_mc_get_nr_numa_nodes(); ++i) {
struct ihk_page_allocator_desc *pa_allocator;
list_for_each_entry(pa_allocator,
&memory_nodes[(ihk_mc_get_numa_id() + i) %
ihk_mc_get_nr_numa_nodes()].allocators, list) {
pa = ihk_pagealloc_alloc(pa_allocator, npages, p2align);
if (pa) break;
}
if (pa) break;
}
if (pa)
return phys_to_virt(pa);
/*
if(flag != IHK_MC_AP_NOWAIT)
panic("Not enough space\n");
*/
return NULL;
}
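/* The scan above starts at the allocating core's own NUMA node
 * (ihk_mc_get_numa_id()) and wraps around modulo the node count, so
 * local memory is preferred and remote nodes only serve as fallback. */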
void *allocate_pages(int npages, enum ihk_mc_ap_flag flag)
static void *allocate_pages(int npages, enum ihk_mc_ap_flag flag)
{
return allocate_aligned_pages(npages, PAGE_P2ALIGN, flag);
}
void free_pages(void *va, int npages)
static void __free_pages_in_allocator(void *va, int npages)
{
int i;
unsigned long pa_start = virt_to_phys(va);
unsigned long pa_end = pa_start + (npages * PAGE_SIZE);
/* Find corresponding memory allocator */
for (i = 0; i < ihk_mc_get_nr_numa_nodes(); ++i) {
struct ihk_page_allocator_desc *pa_allocator;
list_for_each_entry(pa_allocator,
&memory_nodes[i].allocators, list) {
if (pa_start >= pa_allocator->start &&
pa_end <= pa_allocator->end) {
ihk_pagealloc_free(pa_allocator, pa_start, npages);
return;
}
}
}
}
static void free_pages(void *va, int npages)
{
struct list_head *pendings = &cpu_local_var(pending_free_pages);
struct page *page;
page = phys_to_page(virt_to_phys(va));
if (!page) {
panic("free_pages:struct page not found");
}
if (page->mode != PM_NONE) {
panic("free_pages:not PM_NONE");
}
if (pendings->next != NULL) {
page->mode = PM_PENDING_FREE;
page->offset = npages;
list_add_tail(&page->list, pendings);
return;
if (page) {
if (page->mode != PM_NONE) {
panic("free_pages:not PM_NONE");
}
if (pendings->next != NULL) {
page->mode = PM_PENDING_FREE;
page->offset = npages;
list_add_tail(&page->list, pendings);
return;
}
}
ihk_pagealloc_free(pa_allocator, virt_to_phys(va), npages);
__free_pages_in_allocator(va, npages);
}
void begin_free_pages_pending(void) {
@@ -140,7 +599,8 @@ void finish_free_pages_pending(void)
}
page->mode = PM_NONE;
list_del(&page->list);
ihk_pagealloc_free(pa_allocator, page_to_phys(page), page->offset);
__free_pages_in_allocator(phys_to_virt(page_to_phys(page)),
page->offset);
}
pendings->next = pendings->prev = NULL;
@@ -154,18 +614,35 @@ static struct ihk_mc_pa_ops allocator = {
void sbox_write(int offset, unsigned int value);
static int page_hash_count_pages(void);
static void query_free_mem_interrupt_handler(void *priv)
{
int pages = ihk_pagealloc_query_free(pa_allocator);
kprintf("McKernel free pages: %d\n", pages);
int i, pages = 0;
/* Iterate memory allocators */
for (i = 0; i < ihk_mc_get_nr_numa_nodes(); ++i) {
struct ihk_page_allocator_desc *pa_allocator;
list_for_each_entry(pa_allocator,
&memory_nodes[i].allocators, list) {
int __pages = ihk_pagealloc_query_free(pa_allocator);
kprintf("McKernel free pages in (0x%lx - 0x%lx): %d\n",
pa_allocator->start, pa_allocator->end, __pages);
pages += __pages;
}
}
kprintf("McKernel free pages in total: %d\n", pages);
if (find_command_line("memdebug")) {
extern void kmalloc_memcheck(void);
kmalloc_memcheck();
pagealloc_memcheck();
}
kprintf("Page hash: %d pages active\n", page_hash_count_pages());
#ifdef ATTACHED_MIC
sbox_write(SBOX_SCRATCH0, pages);
sbox_write(SBOX_SCRATCH1, 1);
@@ -391,136 +868,230 @@ out:
return;
}
static void page_allocator_init(void)
static struct ihk_page_allocator_desc *page_allocator_init(uint64_t start,
uint64_t end, int initial)
{
struct ihk_page_allocator_desc *pa_allocator;
unsigned long page_map_pa, pages;
void *page_map;
unsigned int i;
uint64_t start;
uint64_t end;
start = ihk_mc_get_memory_address(IHK_MC_GMA_AVAIL_START, 0);
end = ihk_mc_get_memory_address(IHK_MC_GMA_AVAIL_END, 0);
start &= PAGE_MASK;
pa_start = start & LARGE_PAGE_MASK;
pa_end = (end + PAGE_SIZE - 1) & PAGE_MASK;
pa_start = (start + PAGE_SIZE - 1) & PAGE_MASK;
pa_end = end & PAGE_MASK;
#ifndef ATTACHED_MIC
page_map_pa = ihk_mc_get_memory_address(IHK_MC_GMA_HEAP_START, 0);
#else
#ifdef ATTACHED_MIC
/*
* Can't allocate in reserved area
* TODO: figure this out automatically!
*/
page_map_pa = 0x100000;
#else
page_map_pa = initial ? virt_to_phys(get_last_early_heap()) : pa_start;
#endif
page_map = phys_to_virt(page_map_pa);
pa_allocator = __ihk_pagealloc_init(pa_start, pa_end - pa_start,
PAGE_SIZE, page_map, &pages);
reserve_pages(page_map_pa, page_map_pa + pages * PAGE_SIZE, 0);
reserve_pages(pa_allocator, page_map_pa,
page_map_pa + pages * PAGE_SIZE, 0);
if (pa_start < start) {
reserve_pages(pa_start, start, 0);
reserve_pages(pa_allocator, pa_start, start, 0);
}
/* BIOS reserved ranges */
for (i = 1; i <= ihk_mc_get_memory_address(IHK_MC_NR_RESERVED_AREAS, 0);
++i) {
reserve_pages(ihk_mc_get_memory_address(IHK_MC_RESERVED_AREA_START, i),
ihk_mc_get_memory_address(IHK_MC_RESERVED_AREA_END, i), 0);
reserve_pages(pa_allocator,
ihk_mc_get_memory_address(IHK_MC_RESERVED_AREA_START, i),
ihk_mc_get_memory_address(IHK_MC_RESERVED_AREA_END, i), 0);
}
ihk_mc_reserve_arch_pages(pa_start, pa_end, reserve_pages);
ihk_mc_reserve_arch_pages(pa_allocator, pa_start, pa_end, reserve_pages);
kprintf("Available memory: %ld bytes in %ld pages\n",
(ihk_pagealloc_count(pa_allocator) * PAGE_SIZE),
ihk_pagealloc_count(pa_allocator));
return pa_allocator;
}
/* Notify the ihk to use my page allocator */
ihk_mc_set_page_allocator(&allocator);
static void numa_init(void)
{
int i, j;
memory_nodes = early_alloc_pages((sizeof(*memory_nodes) *
ihk_mc_get_nr_numa_nodes() + PAGE_SIZE - 1)
>> PAGE_SHIFT);
/* And prepare some exception handlers */
ihk_mc_set_page_fault_handler(page_fault_handler);
for (i = 0; i < ihk_mc_get_nr_numa_nodes(); ++i) {
int linux_numa_id, type;
/* Register query free mem handler */
ihk_mc_register_interrupt_handler(ihk_mc_get_vector(IHK_GV_QUERY_FREE_MEM),
&query_free_mem_handler);
ihk_mc_get_numa_node(i, &linux_numa_id, &type);
memory_nodes[i].id = i;
memory_nodes[i].linux_numa_id = linux_numa_id;
memory_nodes[i].type = type;
INIT_LIST_HEAD(&memory_nodes[i].allocators);
kprintf("NUMA: %d, Linux NUMA: %d, type: %d\n",
i, linux_numa_id, type);
}
for (j = 0; j < ihk_mc_get_nr_memory_chunks(); ++j) {
unsigned long start, end;
int numa_id;
struct ihk_page_allocator_desc *allocator;
ihk_mc_get_memory_chunk(j, &start, &end, &numa_id);
allocator = page_allocator_init(start, end, (j == 0));
list_add_tail(&allocator->list, &memory_nodes[numa_id].allocators);
kprintf("Physical memory: 0x%lx - 0x%lx, %lu bytes, %d pages available @ NUMA: %d\n",
start, end,
ihk_pagealloc_count(allocator) * PAGE_SIZE,
ihk_pagealloc_count(allocator),
numa_id);
}
}
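/*
 * Illustrative boot output of numa_init() (IDs, types and addresses are
 * hypothetical):
 *
 *   NUMA: 0, Linux NUMA: 0, type: 1
 *   NUMA: 1, Linux NUMA: 1, type: 1
 *   Physical memory: 0x100000000 - 0x140000000, 1073741824 bytes, 262144 pages available @ NUMA: 0
 */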
#define PHYS_PAGE_HASH_SHIFT (10)
#define PHYS_PAGE_HASH_SIZE (1 << PHYS_PAGE_HASH_SHIFT)
#define PHYS_PAGE_HASH_MASK (PHYS_PAGE_HASH_SIZE - 1)
/*
* Page hash only tracks pages that are mapped in non-anonymous mappings
* and thus it is initially empty.
*/
struct list_head page_hash[PHYS_PAGE_HASH_SIZE];
ihk_spinlock_t page_hash_locks[PHYS_PAGE_HASH_SIZE];
static void page_init(void)
{
int i;
for (i = 0; i < PHYS_PAGE_HASH_SIZE; ++i) {
ihk_mc_spinlock_init(&page_hash_locks[i]);
INIT_LIST_HEAD(&page_hash[i]);
}
return;
}
static int page_hash_count_pages(void)
{
int i;
int cnt = 0;
for (i = 0; i < PHYS_PAGE_HASH_SIZE; ++i) {
unsigned long irqflags;
struct page *page_iter;
irqflags = ihk_mc_spinlock_lock(&page_hash_locks[i]);
list_for_each_entry(page_iter, &page_hash[i], hash) {
++cnt;
}
ihk_mc_spinlock_unlock(&page_hash_locks[i], irqflags);
}
return cnt;
}
/* XXX: page_hash_lock must be held */
static struct page *__phys_to_page(uintptr_t phys)
{
int hash = (phys >> PAGE_SHIFT) & PHYS_PAGE_HASH_MASK;
struct page *page_iter, *page = NULL;
list_for_each_entry(page_iter, &page_hash[hash], hash) {
if (page_iter->phys == phys) {
page = page_iter;
break;
}
}
return page;
}
struct page *phys_to_page(uintptr_t phys)
{
int64_t ix;
int hash = (phys >> PAGE_SHIFT) & PHYS_PAGE_HASH_MASK;
struct page *page = NULL;
unsigned long irqflags;
if ((phys < pa_start) || (pa_end <= phys)) {
return NULL;
}
irqflags = ihk_mc_spinlock_lock(&page_hash_locks[hash]);
page = __phys_to_page(phys);
ihk_mc_spinlock_unlock(&page_hash_locks[hash], irqflags);
ix = (phys - pa_start) >> PAGE_SHIFT;
return &pa_pages[ix];
return page;
}
uintptr_t page_to_phys(struct page *page)
{
int64_t ix;
uintptr_t phys;
return page ? page->phys : 0;
}
ix = page - pa_pages;
phys = pa_start + (ix << PAGE_SHIFT);
if ((phys < pa_start) || (pa_end <= phys)) {
ekprintf("page_to_phys(%p):not a pa_pages[]:%p %lx-%lx\n",
page, pa_pages, pa_start, pa_end);
panic("page_to_phys");
/*
* Allocate page and add to hash if it doesn't exist yet.
* NOTE: page->count is zero for new pages and the caller
* is responsible for increasing it.
*/
struct page *phys_to_page_insert_hash(uint64_t phys)
{
int hash = (phys >> PAGE_SHIFT) & PHYS_PAGE_HASH_MASK;
struct page *page = NULL;
unsigned long irqflags;
irqflags = ihk_mc_spinlock_lock(&page_hash_locks[hash]);
page = __phys_to_page(phys);
if (!page) {
int hash = (phys >> PAGE_SHIFT) & PHYS_PAGE_HASH_MASK;
page = kmalloc(sizeof(*page), IHK_MC_AP_CRITICAL);
if (!page) {
kprintf("%s: error allocating page\n", __FUNCTION__);
goto out;
}
list_add(&page->hash, &page_hash[hash]);
page->phys = phys;
page->mode = PM_NONE;
INIT_LIST_HEAD(&page->list);
ihk_atomic_set(&page->count, 0);
}
return phys;
out:
ihk_mc_spinlock_unlock(&page_hash_locks[hash], irqflags);
return page;
}
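/*
 * Minimal usage sketch (illustrative only; page_map() is the helper used
 * in process.c): register a freshly allocated page in the hash, then take
 * the first reference, since new pages start with count == 0.
 */
#if 0	/* example, not compiled */
static void page_hash_example(void)
{
	void *va = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT);

	if (va) {
		struct page *page = phys_to_page_insert_hash(virt_to_phys(va));

		if (page)
			page_map(page);	/* page->count: 0 -> 1 */
	}
}
#endif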
int page_unmap(struct page *page)
{
int hash = (page->phys >> PAGE_SHIFT) & PHYS_PAGE_HASH_MASK;
unsigned long irqflags;
irqflags = ihk_mc_spinlock_lock(&page_hash_locks[hash]);
dkprintf("page_unmap(%p %x %d)\n", page, page->mode, page->count);
if (ihk_atomic_sub_return(1, &page->count) > 0) {
/* other mapping exist */
dkprintf("page_unmap(%p %x %d): 0\n",
page, page->mode, page->count);
ihk_mc_spinlock_unlock(&page_hash_locks[hash], irqflags);
return 0;
}
/* no mapping exists */
/* no mapping exists. TODO: why is this check needed??
if (page->mode != PM_MAPPED) {
return 1;
}
*/
list_del(&page->list);
page->mode = PM_NONE;
dkprintf("page_unmap(%p %x %d): 1\n", page, page->mode, page->count);
list_del(&page->hash);
kfree(page);
ihk_mc_spinlock_unlock(&page_hash_locks[hash], irqflags);
return 1;
}
static void page_init(void)
{
size_t npages;
size_t allocsize;
size_t allocpages;
if (sizeof(ihk_atomic_t) != sizeof(uint32_t)) {
panic("sizeof(ihk_atomic_t) is not 32 bit");
}
npages = (pa_end - pa_start) >> PAGE_SHIFT;
allocsize = sizeof(struct page) * npages;
allocpages = (allocsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
pa_pages = allocate_pages(allocpages, IHK_MC_AP_CRITICAL);
memset(pa_pages, 0, allocsize);
return;
}
static char *memdebug = NULL;
static void *___kmalloc(int size, enum ihk_mc_ap_flag flag);
static void ___kfree(void *ptr);
void register_kmalloc(void)
{
if(memdebug){
@@ -580,8 +1151,10 @@ void ihk_mc_unmap_virtual(void *va, int npages, int free_physical)
ihk_mc_pt_clear_page(NULL, (char *)va + (i << PAGE_SHIFT));
}
if (free_physical)
if (free_physical) {
ihk_pagealloc_free(vmap_allocator, (unsigned long)va, npages);
flush_tlb_single((unsigned long)va);
}
}
#ifdef ATTACHED_MIC
@@ -638,7 +1211,20 @@ void ihk_mc_clean_micpa(void){
void mem_init(void)
{
page_allocator_init();
/* Initialize NUMA information and memory allocator bitmaps */
numa_init();
/* Notify the ihk to use my page allocator */
ihk_mc_set_page_allocator(&allocator);
/* And prepare some exception handlers */
ihk_mc_set_page_fault_handler(page_fault_handler);
/* Register query free mem handler */
ihk_mc_register_interrupt_handler(ihk_mc_get_vector(IHK_GV_QUERY_FREE_MEM),
&query_free_mem_handler);
/* Init page frame hash */
page_init();
/* Prepare the kernel virtual map space */
@@ -759,13 +1345,14 @@ void *_kmalloc(int size, enum ihk_mc_ap_flag flag, char *file, int line)
if (!entry) {
entry = ___kmalloc(sizeof(*entry), IHK_MC_AP_NOWAIT);
if (!entry) {
ihk_mc_spinlock_unlock(&kmalloc_track_hash_locks[hash], irqflags);
kprintf("%s: ERROR: allocating tracking entry\n");
goto out;
}
entry->line = line;
entry->size = size;
ihk_atomic_set(&entry->alloc_count, 0);
ihk_atomic_set(&entry->alloc_count, 1);
ihk_mc_spinlock_init(&entry->addr_list_lock);
INIT_LIST_HEAD(&entry->addr_list);
@@ -779,14 +1366,16 @@ void *_kmalloc(int size, enum ihk_mc_ap_flag flag, char *file, int line)
strcpy(entry->file, file);
entry->file[strlen(file)] = 0;
INIT_LIST_HEAD(&entry->hash);
list_add(&entry->hash, &kmalloc_track_hash[hash]);
dkprintf("%s entry %s:%d size: %d added\n", __FUNCTION__,
file, line, size);
}
else {
ihk_atomic_inc(&entry->alloc_count);
}
ihk_mc_spinlock_unlock(&kmalloc_track_hash_locks[hash], irqflags);
ihk_atomic_inc(&entry->alloc_count);
/* Add new addr entry for this allocation entry */
addr_entry = ___kmalloc(sizeof(*addr_entry), IHK_MC_AP_NOWAIT);
if (!addr_entry) {
@@ -855,13 +1444,15 @@ void _kfree(void *ptr, char *file, int line)
___kfree(addr_entry);
/* Do we need to remove tracking entry as well? */
if (!ihk_atomic_dec_and_test(&entry->alloc_count)) {
goto out;
}
hash = (strlen(entry->file) + entry->line + entry->size) &
KMALLOC_TRACK_HASH_MASK;
irqflags = ihk_mc_spinlock_lock(&kmalloc_track_hash_locks[hash]);
if (!ihk_atomic_dec_and_test(&entry->alloc_count)) {
ihk_mc_spinlock_unlock(&kmalloc_track_hash_locks[hash], irqflags);
goto out;
}
list_del(&entry->hash);
ihk_mc_spinlock_unlock(&kmalloc_track_hash_locks[hash], irqflags);
@@ -1016,7 +1607,7 @@ void kmalloc_consolidate_free_list(void)
}
#define KMALLOC_MIN_SHIFT (5)
#define KMALLOC_MIN_SIZE (1 << KMALLOC_TRACK_HASH_SHIFT)
#define KMALLOC_MIN_SIZE (1 << KMALLOC_MIN_SHIFT)
#define KMALLOC_MIN_MASK (KMALLOC_MIN_SIZE - 1)
/* Actual low-level allocation routines */
@@ -1066,7 +1657,8 @@ split_and_return:
/* Allocate new memory and add it to free list */
npages = (size + sizeof(struct kmalloc_header) + (PAGE_SIZE - 1))
>> PAGE_SHIFT;
chunk = ihk_mc_alloc_pages(npages, flag);
/* Use low-level page allocator to avoid tracking */
chunk = ___ihk_mc_alloc_pages(npages, flag);
if (!chunk) {
cpu_restore_interrupt(kmalloc_irq_flags);


@@ -204,15 +204,28 @@ detach_address_space(struct address_space *asp, int pid)
static int
init_process_vm(struct process *owner, struct address_space *asp, struct process_vm *vm)
{
int i;
ihk_mc_spinlock_init(&vm->memory_range_lock);
ihk_mc_spinlock_init(&vm->page_table_lock);
ihk_atomic_set(&vm->refcount, 1);
INIT_LIST_HEAD(&vm->vm_range_list);
INIT_LIST_HEAD(&vm->vm_range_numa_policy_list);
vm->address_space = asp;
vm->proc = owner;
vm->exiting = 0;
memset(&vm->numa_mask, 0, sizeof(vm->numa_mask));
for (i = 0; i < ihk_mc_get_nr_numa_nodes(); ++i) {
if (i >= PROCESS_NUMA_MASK_BITS) {
kprintf("%s: error: NUMA id is larger than mask size!\n",
__FUNCTION__);
break;
}
set_bit(i, &vm->numa_mask[0]);
}
vm->numa_mem_policy = MPOL_DEFAULT;
return 0;
}
@@ -371,6 +384,11 @@ clone_thread(struct thread *org, unsigned long pc, unsigned long sp,
kfree(proc);
goto err_free_proc;
}
memcpy(&proc->vm->numa_mask, &org->vm->numa_mask,
sizeof(proc->vm->numa_mask));
proc->vm->numa_mem_policy =
org->vm->numa_mem_policy;
thread->proc = proc;
thread->vm = proc->vm;
@@ -1517,7 +1535,7 @@ static int page_fault_process_memory_range(struct process_vm *vm, struct vm_rang
}
}
if (phys == NOPHYS) {
void *virt;
void *virt = NULL;
size_t npages;
retry:
@@ -1542,7 +1560,9 @@ retry:
__FUNCTION__, pgaddr, pgsize);
memset(virt, 0, pgsize);
phys = virt_to_phys(virt);
page_map(phys_to_page(phys));
if (phys_to_page(phys)) {
page_map(phys_to_page(phys));
}
}
}
else {
@@ -1554,10 +1574,12 @@ retry:
attr = arch_vrflag_to_ptattr(range->flag | memobj_flag, reason, ptep);
/*****/
if (((range->flag & VR_PRIVATE)
|| ((reason & PF_PATCH)
&& !(range->flag & VR_PROT_WRITE)))
&& (!page || page_is_in_memobj(page) || page_is_multi_mapped(page))) {
if (((range->flag & VR_PRIVATE) ||
((reason & PF_PATCH) && !(range->flag & VR_PROT_WRITE)))
&& ((!page && phys == NOPHYS) || (page &&
(page_is_in_memobj(page) ||
page_is_multi_mapped(page))))) {
if (!(attr & PTATTR_DIRTY)) {
attr &= ~PTATTR_WRITABLE;
}
@@ -2462,6 +2484,7 @@ void sched_init(void)
ihk_mc_init_context(&idle_thread->ctx, NULL, idle);
ihk_mc_spinlock_init(&idle_thread->vm->memory_range_lock);
INIT_LIST_HEAD(&idle_thread->vm->vm_range_list);
INIT_LIST_HEAD(&idle_thread->vm->vm_range_numa_policy_list);
idle_thread->proc->pid = 0;
idle_thread->tid = ihk_mc_get_processor_id();
@@ -2551,7 +2574,6 @@ static void do_migrate(void)
v->flags |= CPU_FLAG_NEED_RESCHED;
ihk_mc_interrupt_cpu(get_x86_cpu_local_variable(cpu_id)->apic_id, 0xd1);
double_rq_unlock(cur_v, v, irqstate);
//settid(req->thread, 2, cpu_id, old_cpu_id, 0, NULL);
ack:
waitq_wakeup(&req->wq);


@@ -23,6 +23,7 @@
#include <process.h>
#include <page.h>
#include <mman.h>
#include <bitmap.h>
//#define DEBUG_PRINT_PROCFS
@@ -35,6 +36,7 @@
extern int snprintf(char * buf, size_t size, const char *fmt, ...);
extern int sprintf(char * buf, const char *fmt, ...);
extern int sscanf(const char * buf, const char * fmt, ...);
extern int scnprintf(char * buf, size_t size, const char *fmt, ...);
extern int osnum;
@@ -404,12 +406,33 @@ process_procfs_request(unsigned long rarg)
/*
* mcos%d/PID/status
*/
#define BITMASKS_BUF_SIZE 2048
if (strcmp(p, "status") == 0) {
struct vm_range *range;
unsigned long lockedsize = 0;
char tmp[1024];
char *tmp;
char *bitmasks;
int bitmasks_offset = 0;
char *cpu_bitmask, *cpu_list, *numa_bitmask, *numa_list;
int len;
tmp = kmalloc(8192, IHK_MC_AP_CRITICAL);
if (!tmp) {
kprintf("%s: error allocating /proc/self/status buffer\n",
__FUNCTION__);
ans = 0;
goto end;
}
bitmasks = kmalloc(BITMASKS_BUF_SIZE, IHK_MC_AP_CRITICAL);
if (!bitmasks) {
kprintf("%s: error allocating /proc/self/status bitmasks buffer\n",
__FUNCTION__);
kfree(tmp);
ans = 0;
goto end;
}
ihk_mc_spinlock_lock_noirq(&proc->vm->memory_range_lock);
list_for_each_entry(range, &proc->vm->vm_range_list, list) {
if(range->flag & VR_LOCKED)
@@ -417,13 +440,42 @@ process_procfs_request(unsigned long rarg)
}
ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock);
cpu_bitmask = &bitmasks[bitmasks_offset];
bitmasks_offset += bitmap_scnprintf(cpu_bitmask,
BITMASKS_BUF_SIZE - bitmasks_offset,
thread->cpu_set.__bits, __CPU_SETSIZE);
bitmasks_offset++;
cpu_list = &bitmasks[bitmasks_offset];
bitmasks_offset += bitmap_scnlistprintf(cpu_list,
BITMASKS_BUF_SIZE - bitmasks_offset,
thread->cpu_set.__bits, __CPU_SETSIZE);
bitmasks_offset++;
numa_bitmask = &bitmasks[bitmasks_offset];
bitmasks_offset += bitmap_scnprintf(numa_bitmask,
BITMASKS_BUF_SIZE - bitmasks_offset,
proc->vm->numa_mask, PROCESS_NUMA_MASK_BITS);
bitmasks_offset++;
numa_list = &bitmasks[bitmasks_offset];
bitmasks_offset += bitmap_scnlistprintf(numa_list,
BITMASKS_BUF_SIZE - bitmasks_offset,
proc->vm->numa_mask, PROCESS_NUMA_MASK_BITS);
bitmasks_offset++;
sprintf(tmp,
"Uid:\t%d\t%d\t%d\t%d\n"
"Gid:\t%d\t%d\t%d\t%d\n"
"VmLck:\t%9lu kB\n",
"VmLck:\t%9lu kB\n"
"Cpus_allowed:\t%s\n"
"Cpus_allowed_list:\t%s\n"
"Mems_allowed:\t%s\n"
"Mems_allowed_list:\t%s\n",
proc->ruid, proc->euid, proc->suid, proc->fsuid,
proc->rgid, proc->egid, proc->sgid, proc->fsgid,
(lockedsize + 1023) >> 10);
(lockedsize + 1023) >> 10,
cpu_bitmask, cpu_list, numa_bitmask, numa_list);
len = strlen(tmp);
if (r->offset < len) {
if (r->offset + r->count < len) {
@@ -437,6 +489,8 @@ process_procfs_request(unsigned long rarg)
ans = 0;
eof = 1;
}
kfree(tmp);
kfree(bitmasks);
goto end;
}
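/*
 * Illustrative /proc/<PID>/status fragment produced by the code above
 * (values are hypothetical, for an 8-CPU, 2-node instance):
 *
 *   VmLck:           0 kB
 *   Cpus_allowed:    ff
 *   Cpus_allowed_list:       0-7
 *   Mems_allowed:    3
 *   Mems_allowed_list:       0-1
 */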


@@ -240,14 +240,24 @@ void shmobj_destroy(struct shmobj *obj)
npages = (size_t)1 << (obj->pgshift - PAGE_SHIFT);
for (;;) {
struct page *page;
int count;
void *page_va;
page = page_list_first(obj);
if (!page) {
break;
}
page_list_remove(obj, page);
page_va = phys_to_virt(page_to_phys(page));
if (ihk_atomic_read(&page->count) != 1) {
kprintf("%s: WARNING: page count for phys 0x%lx is invalid\n",
__FUNCTION__, page->phys);
}
if (page_unmap(page)) {
ihk_mc_free_pages(page_va, npages);
}
#if 0
dkprintf("shmobj_destroy(%p):"
"release page. %p %#lx %d %d",
obj, page, page_to_phys(page),
@@ -265,7 +275,8 @@ void shmobj_destroy(struct shmobj *obj)
}
page->mode = PM_NONE;
free_pages(phys_to_virt(page_to_phys(page)), npages);
ihk_mc_free_pages(phys_to_virt(page_to_phys(page)), npages);
#endif
}
if (obj->index < 0) {
kfree(obj);
@@ -404,7 +415,7 @@ static int shmobj_get_page(struct memobj *memobj, off_t off, int p2align,
goto out;
}
phys = virt_to_phys(virt);
page = phys_to_page(phys);
page = phys_to_page_insert_hash(phys);
if (page->mode != PM_NONE) {
fkprintf("shmobj_get_page(%p,%#lx,%d,%p):"
"page %p %#lx %d %d %#lx\n",


@@ -52,6 +52,8 @@
#include <mc_perf_event.h>
#include <march.h>
#include <process.h>
#include <bitops.h>
#include <bitmap.h>
/* Headers taken from kitten LWK */
#include <lwk/stddef.h>
@@ -134,7 +136,6 @@ static void send_syscall(struct syscall_request *req, int cpu, int pid, struct s
int ret;
if(req->number == __NR_exit_group ||
req->number == __NR_gettid ||
req->number == __NR_kill){ // interrupt syscall
extern int num_processors;
@@ -143,9 +144,11 @@ static void send_syscall(struct syscall_request *req, int cpu, int pid, struct s
/* XXX: is this really going to work if multiple processes
* exit/receive signals at the same time?? */
cpu = num_processors;
if(req->number == __NR_kill)
if (req->number == __NR_kill) {
req->rtid = -1;
pid = req->args[0];
if(req->number == __NR_gettid)
}
if (req->number == __NR_gettid)
pid = req->args[1];
}
else{
@@ -203,7 +206,6 @@ long do_syscall(struct syscall_request *req, int cpu, int pid)
}
if(req->number == __NR_exit_group ||
req->number == __NR_gettid ||
req->number == __NR_kill){ // interrupt syscall
islock = 1;
irqstate = ihk_mc_spinlock_lock(&syscall_lock);
@@ -1487,26 +1489,22 @@ SYSCALL_DECLARE(getppid)
return thread->proc->ppid_parent->pid;
}
void settid(struct thread *thread, int mode, int newcpuid, int oldcpuid,
int nr_tids, int *tids)
static void settid(struct thread *thread, int nr_tids, int *tids)
{
int ret;
struct syscall_request request IHK_DMA_ALIGN;
unsigned long rc;
request.number = __NR_gettid;
request.args[0] = mode;
request.args[1] = thread->proc->pid;
request.args[2] = newcpuid;
request.args[3] = oldcpuid;
/*
* If nr_tids is non-zero, tids should point to an array of ints
* where the thread ids of the mcexec process are expected.
*/
request.args[4] = nr_tids;
request.args[5] = virt_to_phys(tids);
rc = do_syscall(&request, ihk_mc_get_processor_id(), thread->proc->pid);
if (mode != 2) {
thread->tid = rc;
if ((ret = do_syscall(&request, ihk_mc_get_processor_id(),
thread->proc->pid)) < 0) {
kprintf("%s: WARNING: do_syscall returns %d\n",
__FUNCTION__, ret);
}
}
@@ -1930,7 +1928,7 @@ unsigned long do_fork(int clone_flags, unsigned long newsp,
return -ENOMEM;
}
settid(new, 1, cpuid, -1, num_processors, tids);
settid(new, num_processors, tids);
for (i = 0; (i < num_processors) && tids[i]; ++i) {
dkprintf("%s: tid[%d]: %d\n", __FUNCTION__, i, tids[i]);
@@ -1943,6 +1941,7 @@ unsigned long do_fork(int clone_flags, unsigned long newsp,
}
/* Find an unused TID */
new->tid = 0;
retry_tid:
for (i = 0; i < newproc->nr_tids; ++i) {
if (!newproc->tids[i].thread) {
@@ -7082,20 +7081,581 @@ out:
SYSCALL_DECLARE(mbind)
{
dkprintf("sys_mbind\n");
return -ENOSYS;
unsigned long addr = ihk_mc_syscall_arg0(ctx);
unsigned long len = ihk_mc_syscall_arg1(ctx);
int mode = ihk_mc_syscall_arg2(ctx);
unsigned long *nodemask =
(unsigned long *)ihk_mc_syscall_arg3(ctx);
unsigned long maxnode = ihk_mc_syscall_arg4(ctx);
unsigned flags = ihk_mc_syscall_arg5(ctx);
struct process_vm *vm = cpu_local_var(current)->vm;
unsigned long nodemask_bits = 0;
int mode_flags = 0;
int error = 0;
int bit;
struct vm_range *range;
struct vm_range_numa_policy *range_policy, *range_policy_iter;
struct vm_range_numa_policy *range_policy_next = NULL;
DECLARE_BITMAP(numa_mask, PROCESS_NUMA_MASK_BITS);
/* Validate arguments */
if (addr & ~PAGE_MASK) {
return -EINVAL;
}
len = (len + PAGE_SIZE - 1) & PAGE_MASK;
if (addr + len < addr || addr == (addr + len)) {
return -EINVAL;
}
memset(numa_mask, 0, sizeof(numa_mask));
if (maxnode) {
nodemask_bits = ALIGN(maxnode, 8);
if (maxnode > (PAGE_SIZE << 3)) {
dkprintf("%s: ERROR: nodemask_bits bigger than PAGE_SIZE bits\n",
__FUNCTION__);
error = -EINVAL;
goto out;
}
if (nodemask_bits > PROCESS_NUMA_MASK_BITS) {
dkprintf("%s: WARNING: process NUMA mask bits is insufficient\n",
__FUNCTION__);
nodemask_bits = PROCESS_NUMA_MASK_BITS;
}
}
if ((mode & MPOL_F_STATIC_NODES) && (mode & MPOL_F_RELATIVE_NODES)) {
dkprintf("%s: error: MPOL_F_STATIC_NODES & MPOL_F_RELATIVE_NODES\n",
__FUNCTION__);
error = -EINVAL;
goto out;
}
if ((flags & MPOL_MF_STRICT) && (flags & MPOL_MF_MOVE)) {
dkprintf("%s: error: MPOL_MF_STRICT & MPOL_MF_MOVE\n",
__FUNCTION__);
/*
* XXX: man page claims the correct error code is EIO,
* but LTP tests for EINVAL.
*/
error = -EINVAL;
goto out;
}
mode_flags = (mode & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES));
mode &= ~(MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
if (mode_flags & MPOL_F_RELATIVE_NODES) {
/* Not supported.. */
dkprintf("%s: error: MPOL_F_RELATIVE_NODES not supported\n",
__FUNCTION__);
error = -EINVAL;
goto out;
}
switch (mode) {
case MPOL_DEFAULT:
if (nodemask && nodemask_bits) {
error = copy_from_user(numa_mask, nodemask,
(nodemask_bits >> 3));
if (error) {
dkprintf("%s: error: copy_from_user numa_mask\n",
__FUNCTION__);
error = -EFAULT;
goto out;
}
if (!bitmap_empty(numa_mask, nodemask_bits)) {
dkprintf("%s: ERROR: nodemask not empty for MPOL_DEFAULT\n",
__FUNCTION__);
error = -EINVAL;
goto out;
}
}
break;
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
/* Special case for MPOL_PREFERRED with empty nodemask */
if (mode == MPOL_PREFERRED && !nodemask) {
error = 0;
break;
}
if (flags & MPOL_MF_STRICT) {
error = -EIO;
goto out;
}
error = copy_from_user(numa_mask, nodemask,
(nodemask_bits >> 3));
if (error) {
error = -EFAULT;
goto out;
}
if (!nodemask || bitmap_empty(numa_mask, nodemask_bits)) {
dkprintf("%s: ERROR: nodemask not specified\n",
__FUNCTION__);
error = -EINVAL;
goto out;
}
/* Verify NUMA mask */
for_each_set_bit(bit, numa_mask, maxnode) {
if (bit >= ihk_mc_get_nr_numa_nodes()) {
dkprintf("%s: %d is bigger than # of NUMA nodes\n",
__FUNCTION__, bit);
error = -EINVAL;
goto out;
}
}
break;
default:
error = -EINVAL;
goto out;
}
/* Validate address range */
ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock);
range = lookup_process_memory_range(vm, addr, addr + len);
if (!range) {
dkprintf("%s: ERROR: range is invalid\n", __FUNCTION__);
error = -EFAULT;
goto unlock_out;
}
/* Do the actual policy setting */
switch (mode) {
/*
* Man page claims MPOL_DEFAULT should remove any range specific
* policies so that process wise policy will be used. LTP on the
* other hand seems to test if MPOL_DEFAULT is set as a range policy.
* MPOL_DEFAULT thus behaves the same as the rest of the policies
* for now.
*/
#if 0
case MPOL_DEFAULT:
/* Delete or adjust any overlapping range settings */
list_for_each_entry_safe(range_policy_iter, range_policy_next,
&vm->vm_range_numa_policy_list, list) {
int keep = 0;
unsigned long orig_end = range_policy_iter->end;
if (range_policy_iter->end < addr ||
range_policy_iter->start > addr + len) {
continue;
}
/* Do we need to keep the front? */
if (range_policy_iter->start < addr) {
range_policy_iter->end = addr;
keep = 1;
}
/* Do we need to keep the end? */
if (orig_end > addr + len) {
/* Are we keeping front already? */
if (keep) {
/* Add a new entry after */
range_policy = kmalloc(sizeof(*range_policy),
IHK_MC_AP_NOWAIT);
if (!range_policy) {
kprintf("%s: error allocating range_policy\n",
__FUNCTION__);
error = -ENOMEM;
goto unlock_out;
}
memcpy(range_policy, range_policy_iter,
sizeof(*range_policy));
range_policy->start = addr + len;
range_policy->end = orig_end;
list_add(&range_policy->list,
&range_policy_iter->list);
}
else {
range_policy_iter->start = addr + len;
keep = 1;
}
}
if (!keep) {
list_del(&range_policy_iter->list);
kfree(range_policy_iter);
}
}
break;
#endif
case MPOL_DEFAULT:
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
/* Adjust any overlapping range settings and add new one */
range_policy_next = NULL;
list_for_each_entry(range_policy_iter,
&vm->vm_range_numa_policy_list, list) {
int adjusted = 0;
unsigned long orig_end = range_policy_iter->end;
if (range_policy_iter->end < addr)
continue;
/* Special case of entirely overlapping */
if (range_policy_iter->start == addr &&
range_policy_iter->end == addr + len) {
range_policy = range_policy_iter;
goto mbind_update_only;
}
/* Overlapping partially? */
if (range_policy_iter->start < addr) {
orig_end = range_policy_iter->end;
range_policy_iter->end = addr;
adjusted = 1;
}
/* Do we need to keep the end? */
if (orig_end > addr + len) {
if (adjusted) {
/* Add a new entry after */
range_policy = kmalloc(sizeof(*range_policy),
IHK_MC_AP_NOWAIT);
if (!range_policy) {
dkprintf("%s: error allocating range_policy\n",
__FUNCTION__);
error = -ENOMEM;
goto unlock_out;
}
memcpy(range_policy, range_policy_iter,
sizeof(*range_policy));
range_policy->start = addr + len;
range_policy->end = orig_end;
list_add(&range_policy->list,
&range_policy_iter->list);
range_policy_next = range_policy;
break;
}
else {
range_policy_iter->start = addr + len;
range_policy_next = range_policy_iter;
break;
}
}
/* Next one in ascending address order? */
if (range_policy_iter->start >= addr + len) {
range_policy_next = range_policy_iter;
break;
}
}
/* Add a new entry */
range_policy = kmalloc(sizeof(*range_policy),
IHK_MC_AP_NOWAIT);
if (!range_policy) {
dkprintf("%s: error allocating range_policy\n",
__FUNCTION__);
error = -ENOMEM;
goto unlock_out;
}
memset(range_policy, 0, sizeof(*range_policy));
range_policy->start = addr;
range_policy->end = addr + len;
if (range_policy_next) {
list_add_tail(&range_policy->list,
&range_policy_next->list);
}
else {
list_add_tail(&range_policy->list,
&vm->vm_range_numa_policy_list);
}
mbind_update_only:
if (mode == MPOL_DEFAULT) {
memset(range_policy->numa_mask, 0, sizeof(numa_mask));
for (bit = 0; bit < ihk_mc_get_nr_numa_nodes(); ++bit) {
set_bit(bit, range_policy->numa_mask);
}
}
else {
memcpy(range_policy->numa_mask, &numa_mask,
sizeof(numa_mask));
}
range_policy->numa_mem_policy = mode;
break;
default:
error = -EINVAL;
goto out;
}
error = 0;
unlock_out:
ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
out:
return error;
} /* sys_mbind() */
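/*
 * Minimal user-space sketch (illustrative only) of the call path that
 * exercises sys_mbind() above: bind an anonymous mapping to NUMA node 0
 * with MPOL_BIND. Note that maxnode counts bits, not bytes.
 */
#if 0	/* example, user-space code, not compiled into the kernel */
#include <numaif.h>	/* mbind(), MPOL_BIND */
#include <sys/mman.h>

static int bind_to_node0(size_t len)
{
	unsigned long nodemask = 1UL << 0;	/* node 0 only */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return -1;
	return mbind(buf, len, MPOL_BIND, &nodemask,
		     sizeof(nodemask) * 8, 0);
}
#endif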
SYSCALL_DECLARE(set_mempolicy)
{
dkprintf("sys_set_mempolicy\n");
return -ENOSYS;
int mode = ihk_mc_syscall_arg0(ctx);
unsigned long *nodemask =
(unsigned long *)ihk_mc_syscall_arg1(ctx);
unsigned long maxnode = ihk_mc_syscall_arg2(ctx);
unsigned long nodemask_bits = 0;
struct process_vm *vm = cpu_local_var(current)->vm;
int error = 0;
int bit, valid_mask;
struct vm_range_numa_policy *range_policy_iter;
struct vm_range_numa_policy *range_policy_next = NULL;
DECLARE_BITMAP(numa_mask, PROCESS_NUMA_MASK_BITS);
memset(numa_mask, 0, sizeof(numa_mask));
if (maxnode) {
nodemask_bits = ALIGN(maxnode, 8);
if (maxnode > (PAGE_SIZE << 3)) {
dkprintf("%s: ERROR: nodemask_bits bigger than PAGE_SIZE bits\n",
__FUNCTION__);
error = -EINVAL;
goto out;
}
if (nodemask_bits > PROCESS_NUMA_MASK_BITS) {
dkprintf("%s: WARNING: process NUMA mask bits is insufficient\n",
__FUNCTION__);
nodemask_bits = PROCESS_NUMA_MASK_BITS;
}
}
switch (mode) {
case MPOL_DEFAULT:
if (nodemask && nodemask_bits) {
error = copy_from_user(numa_mask, nodemask,
(nodemask_bits >> 3));
if (error) {
error = -EFAULT;
goto out;
}
if (!bitmap_empty(numa_mask, nodemask_bits)) {
dkprintf("%s: ERROR: nodemask not empty for MPOL_DEFAULT\n",
__FUNCTION__);
error = -EINVAL;
goto out;
}
}
memset(vm->numa_mask, 0, sizeof(numa_mask));
for (bit = 0; bit < ihk_mc_get_nr_numa_nodes(); ++bit) {
set_bit(bit, vm->numa_mask);
}
/* Delete all range settings */
ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock);
list_for_each_entry_safe(range_policy_iter, range_policy_next,
&vm->vm_range_numa_policy_list, list) {
list_del(&range_policy_iter->list);
kfree(range_policy_iter);
}
ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
vm->numa_mem_policy = mode;
error = 0;
break;
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
/* Special case for MPOL_PREFERRED with empty nodemask */
if (mode == MPOL_PREFERRED && !nodemask) {
memset(vm->numa_mask, 0, sizeof(numa_mask));
for (bit = 0; bit < ihk_mc_get_nr_numa_nodes(); ++bit) {
set_bit(bit, vm->numa_mask);
}
vm->numa_mem_policy = mode;
error = 0;
break;
}
if (!nodemask) {
dkprintf("%s: ERROR: nodemask not specified\n",
__FUNCTION__);
error = -EINVAL;
goto out;
}
error = copy_from_user(numa_mask, nodemask,
(nodemask_bits >> 3));
if (error) {
error = -EFAULT;
goto out;
}
/* Verify NUMA mask */
valid_mask = 0;
for_each_set_bit(bit, numa_mask, maxnode) {
if (bit >= ihk_mc_get_nr_numa_nodes()) {
dkprintf("%s: %d is bigger than # of NUMA nodes\n",
__FUNCTION__, bit);
error = -EINVAL;
goto out;
}
/* Is there at least one node which is allowed
* in current mask? */
if (test_bit(bit, vm->numa_mask)) {
valid_mask = 1;
}
}
if (!valid_mask) {
dkprintf("%s: ERROR: invalid nodemask\n", __FUNCTION__);
error = -EINVAL;
goto out;
}
/* Update current mask by clearing non-requested nodes */
for_each_set_bit(bit, vm->numa_mask, maxnode) {
if (!test_bit(bit, numa_mask)) {
clear_bit(bit, vm->numa_mask);
}
}
vm->numa_mem_policy = mode;
error = 0;
break;
default:
error = -EINVAL;
}
out:
return error;
} /* sys_set_mempolicy() */
SYSCALL_DECLARE(get_mempolicy)
{
dkprintf("sys_get_mempolicy\n");
return -ENOSYS;
int *mode = (int *)ihk_mc_syscall_arg0(ctx);
unsigned long *nodemask =
(unsigned long *)ihk_mc_syscall_arg1(ctx);
unsigned long nodemask_bits = 0;
unsigned long maxnode = ihk_mc_syscall_arg2(ctx);
unsigned long addr = ihk_mc_syscall_arg3(ctx);
unsigned long flags = ihk_mc_syscall_arg4(ctx);
struct process_vm *vm = cpu_local_var(current)->vm;
struct vm_range_numa_policy *range_policy = NULL;
int error = 0;
int policy;
if ((!(flags & MPOL_F_ADDR) && addr) ||
(flags & ~(MPOL_F_ADDR | MPOL_F_NODE | MPOL_F_MEMS_ALLOWED)) ||
((flags & MPOL_F_NODE) && !(flags & MPOL_F_ADDR) &&
vm->numa_mem_policy == MPOL_INTERLEAVE)) {
return -EINVAL;
}
/*
* XXX: man page claims the correct error code is EINVAL,
* but LTP tests for EFAULT.
*/
if ((flags & MPOL_F_ADDR) && !addr) {
return -EFAULT;
}
if (maxnode) {
if (maxnode < ihk_mc_get_nr_numa_nodes()) {
return -EINVAL;
}
nodemask_bits = ALIGN(maxnode, 8);
if (nodemask_bits > PROCESS_NUMA_MASK_BITS) {
dkprintf("%s: WARNING: process NUMA mask bits is insufficient\n",
__FUNCTION__);
nodemask_bits = PROCESS_NUMA_MASK_BITS;
}
}
/* Special case of MPOL_F_MEMS_ALLOWED */
if (flags == MPOL_F_MEMS_ALLOWED) {
if (nodemask) {
error = copy_to_user(nodemask,
cpu_local_var(current)->vm->numa_mask,
(nodemask_bits >> 3));
if (error) {
error = -EFAULT;
}
}
goto out;
}
/* Address range specific? */
if (flags & MPOL_F_ADDR) {
struct vm_range_numa_policy *range_policy_iter;
struct vm_range *range;
ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock);
range = lookup_process_memory_range(vm, addr, addr + 1);
if (!range) {
dkprintf("%s: ERROR: range is invalid\n", __FUNCTION__);
error = -EFAULT;
ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
goto out;
}
list_for_each_entry(range_policy_iter,
&vm->vm_range_numa_policy_list, list) {
if (range_policy_iter->start > addr ||
range_policy_iter->end <= addr) {
continue;
}
range_policy = range_policy_iter;
break;
}
ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
}
/* Return policy */
policy = range_policy ? range_policy->numa_mem_policy :
vm->numa_mem_policy;
if (mode) {
error = copy_to_user(mode, &policy, sizeof(int));
if (error) {
error = -EFAULT;
goto out;
}
}
if (nodemask && (policy != MPOL_DEFAULT)) {
error = copy_to_user(nodemask,
range_policy ? range_policy->numa_mask :
cpu_local_var(current)->vm->numa_mask,
(nodemask_bits >> 3));
if (error) {
error = -EFAULT;
goto out;
}
}
out:
return error;
} /* sys_get_mempolicy() */
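/*
 * Companion user-space sketch (illustrative only): set a process-wide
 * MPOL_INTERLEAVE policy over nodes 0 and 1, then read it back through
 * sys_get_mempolicy() above.
 */
#if 0	/* example, user-space code, not compiled into the kernel */
#include <numaif.h>	/* set_mempolicy(), get_mempolicy() */

static int interleave_nodes_0_and_1(void)
{
	unsigned long mask = (1UL << 0) | (1UL << 1);
	int mode;

	if (set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8))
		return -1;
	mask = 0;
	if (get_mempolicy(&mode, &mask, sizeof(mask) * 8, NULL, 0))
		return -1;
	/* expected: mode == MPOL_INTERLEAVE, bits 0 and 1 set in mask */
	return 0;
}
#endif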
SYSCALL_DECLARE(migrate_pages)


@@ -112,7 +112,7 @@ static int alloc_zeroobj(void)
goto out;
}
phys = virt_to_phys(virt);
page = phys_to_page(phys);
page = phys_to_page_insert_hash(phys);
if (page->mode != PM_NONE) {
fkprintf("alloc_zeroobj():"

lib/bitmap.c (new file, 1179 lines): file diff suppressed because it is too large.


@@ -140,3 +140,58 @@ found:
return result + ffz(tmp);
}
/**
* hweightN - returns the hamming weight of a N-bit word
* @x: the word to weigh
*
* The Hamming Weight of a number is the total number of bits set in it.
*/
unsigned int __sw_hweight32(unsigned int w)
{
#ifdef ARCH_HAS_FAST_MULTIPLIER
w -= (w >> 1) & 0x55555555;
w = (w & 0x33333333) + ((w >> 2) & 0x33333333);
w = (w + (w >> 4)) & 0x0f0f0f0f;
return (w * 0x01010101) >> 24;
#else
unsigned int res = w - ((w >> 1) & 0x55555555);
res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
res = (res + (res >> 4)) & 0x0F0F0F0F;
res = res + (res >> 8);
return (res + (res >> 16)) & 0x000000FF;
#endif
}
unsigned int __sw_hweight16(unsigned int w)
{
unsigned int res = w - ((w >> 1) & 0x5555);
res = (res & 0x3333) + ((res >> 2) & 0x3333);
res = (res + (res >> 4)) & 0x0F0F;
return (res + (res >> 8)) & 0x00FF;
}
unsigned int __sw_hweight8(unsigned int w)
{
unsigned int res = w - ((w >> 1) & 0x55);
res = (res & 0x33) + ((res >> 2) & 0x33);
return (res + (res >> 4)) & 0x0F;
}
unsigned long __sw_hweight64(uint64_t w)
{
#ifdef ARCH_HAS_FAST_MULTIPLIER
w -= (w >> 1) & 0x5555555555555555ul;
w = (w & 0x3333333333333333ul) + ((w >> 2) & 0x3333333333333333ul);
w = (w + (w >> 4)) & 0x0f0f0f0f0f0f0f0ful;
return (w * 0x0101010101010101ul) >> 56;
#else
uint64_t res = w - ((w >> 1) & 0x5555555555555555ul);
res = (res & 0x3333333333333333ul) + ((res >> 2) & 0x3333333333333333ul);
res = (res + (res >> 4)) & 0x0F0F0F0F0F0F0F0Ful;
res = res + (res >> 8);
res = res + (res >> 16);
return (res + (res >> 32)) & 0x00000000000000FFul;
#endif
}
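/*
 * Worked example: __sw_hweight32(0xF0F0F0F0) == 16, since four bits are
 * set in each of the four bytes. In the fast-multiplier variants the
 * final multiply-and-shift step sums the per-byte counts into the top
 * byte of the product.
 */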

lib/include/bitmap.h (new file, 307 lines):

@@ -0,0 +1,307 @@
#ifndef __LINUX_BITMAP_H
#define __LINUX_BITMAP_H
#include <types.h>
#include <bitops.h>
#include <string.h>
/*
* bitmaps provide bit arrays that consume one or more unsigned
* longs. The bitmap interface and available operations are listed
* here, in bitmap.h
*
* Function implementations generic to all architectures are in
* lib/bitmap.c. Function implementations that are architecture
* specific are in various include/asm-<arch>/bitops.h headers
* and other arch/<arch> specific files.
*
* See lib/bitmap.c for more details.
*/
/*
* The available bitmap operations and their rough meaning in the
* case that the bitmap is a single unsigned long are thus:
*
* Note that nbits should always be a compile-time evaluable constant.
* Otherwise many inlines will generate horrible code.
*
* bitmap_zero(dst, nbits) *dst = 0UL
* bitmap_fill(dst, nbits) *dst = ~0UL
* bitmap_copy(dst, src, nbits) *dst = *src
* bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2
* bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2
* bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2
* bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2)
* bitmap_complement(dst, src, nbits) *dst = ~(*src)
* bitmap_equal(src1, src2, nbits) Are *src1 and *src2 equal?
* bitmap_intersects(src1, src2, nbits) Do *src1 and *src2 overlap?
* bitmap_subset(src1, src2, nbits) Is *src1 a subset of *src2?
* bitmap_empty(src, nbits) Are all bits zero in *src?
* bitmap_full(src, nbits) Are all bits set in *src?
* bitmap_weight(src, nbits) Hamming Weight: number set bits
* bitmap_set(dst, pos, nbits) Set specified bit area
* bitmap_clear(dst, pos, nbits) Clear specified bit area
* bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area
* bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n
* bitmap_shift_left(dst, src, n, nbits) *dst = *src << n
* bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src)
* bitmap_bitremap(oldbit, old, new, nbits) newbit = map(old, new)(oldbit)
* bitmap_onto(dst, orig, relmap, nbits) *dst = orig relative to relmap
* bitmap_fold(dst, orig, sz, nbits) dst bits = orig bits mod sz
* bitmap_scnprintf(buf, len, src, nbits) Print bitmap src to buf
* bitmap_parse(buf, buflen, dst, nbits) Parse bitmap dst from kernel buf
* bitmap_parse_user(ubuf, ulen, dst, nbits) Parse bitmap dst from user buf
* bitmap_scnlistprintf(buf, len, src, nbits) Print bitmap src as list to buf
* bitmap_parselist(buf, dst, nbits) Parse bitmap dst from kernel buf
* bitmap_parselist_user(buf, dst, nbits) Parse bitmap dst from user buf
* bitmap_find_free_region(bitmap, bits, order) Find and allocate bit region
* bitmap_release_region(bitmap, pos, order) Free specified bit region
* bitmap_allocate_region(bitmap, pos, order) Allocate specified bit region
*/
/*
* Also the following operations in asm/bitops.h apply to bitmaps.
*
* set_bit(bit, addr) *addr |= bit
* clear_bit(bit, addr) *addr &= ~bit
* change_bit(bit, addr) *addr ^= bit
* test_bit(bit, addr) Is bit set in *addr?
* test_and_set_bit(bit, addr) Set bit and return old value
* test_and_clear_bit(bit, addr) Clear bit and return old value
* test_and_change_bit(bit, addr) Change bit and return old value
* find_first_zero_bit(addr, nbits) Position first zero bit in *addr
* find_first_bit(addr, nbits) Position first set bit in *addr
* find_next_zero_bit(addr, nbits, bit) Position next zero bit in *addr >= bit
* find_next_bit(addr, nbits, bit) Position next set bit in *addr >= bit
*/
/*
* The DECLARE_BITMAP(name,bits) macro, in linux/types.h, can be used
* to declare an array named 'name' of just enough unsigned longs to
* contain all bit positions from 0 to 'bits' - 1.
*/
/*
* lib/bitmap.c provides these functions:
*/
#define __user
#define __force
#define u32 uint32_t
extern int __bitmap_empty(const unsigned long *bitmap, int bits);
extern int __bitmap_full(const unsigned long *bitmap, int bits);
extern int __bitmap_equal(const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern void __bitmap_complement(unsigned long *dst, const unsigned long *src,
int bits);
extern void __bitmap_shift_right(unsigned long *dst,
const unsigned long *src, int shift, int bits);
extern void __bitmap_shift_left(unsigned long *dst,
const unsigned long *src, int shift, int bits);
extern int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern int __bitmap_intersects(const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern int __bitmap_subset(const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern int __bitmap_weight(const unsigned long *bitmap, int bits);
extern void bitmap_set(unsigned long *map, int i, int len);
extern void bitmap_clear(unsigned long *map, int start, int nr);
extern unsigned long bitmap_find_next_zero_area(unsigned long *map,
unsigned long size,
unsigned long start,
unsigned int nr,
unsigned long align_mask);
extern int bitmap_scnprintf(char *buf, unsigned int len,
const unsigned long *src, int nbits);
extern int __bitmap_parse(const char *buf, unsigned int buflen, int is_user,
unsigned long *dst, int nbits);
extern int bitmap_parse_user(const char __user *ubuf, unsigned int ulen,
unsigned long *dst, int nbits);
extern int bitmap_scnlistprintf(char *buf, unsigned int len,
const unsigned long *src, int nbits);
extern int bitmap_parselist(const char *buf, unsigned long *maskp,
int nmaskbits);
extern int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen,
unsigned long *dst, int nbits);
extern void bitmap_remap(unsigned long *dst, const unsigned long *src,
const unsigned long *old, const unsigned long *new, int bits);
extern int bitmap_bitremap(int oldbit,
const unsigned long *old, const unsigned long *new, int bits);
extern void bitmap_onto(unsigned long *dst, const unsigned long *orig,
const unsigned long *relmap, int bits);
extern void bitmap_fold(unsigned long *dst, const unsigned long *orig,
int sz, int bits);
extern int bitmap_find_free_region(unsigned long *bitmap, int bits, int order);
extern void bitmap_release_region(unsigned long *bitmap, int pos, int order);
extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order);
extern int bitmap_ord_to_pos(const unsigned long *bitmap, int n, int bits);
#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) % BITS_PER_LONG))
#define BITMAP_LAST_WORD_MASK(nbits) \
( \
((nbits) % BITS_PER_LONG) ? \
(1UL<<((nbits) % BITS_PER_LONG))-1 : ~0UL \
)
#define small_const_nbits(nbits) \
(__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG)
static inline void bitmap_zero(unsigned long *dst, int nbits)
{
if (small_const_nbits(nbits))
*dst = 0UL;
else {
int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
memset(dst, 0, len);
}
}
static inline void bitmap_fill(unsigned long *dst, int nbits)
{
size_t nlongs = BITS_TO_LONGS(nbits);
if (!small_const_nbits(nbits)) {
int len = (nlongs - 1) * sizeof(unsigned long);
memset(dst, 0xff, len);
}
dst[nlongs - 1] = BITMAP_LAST_WORD_MASK(nbits);
}
static inline void bitmap_copy(unsigned long *dst, const unsigned long *src,
int nbits)
{
if (small_const_nbits(nbits))
*dst = *src;
else {
int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
memcpy(dst, src, len);
}
}
static inline int bitmap_and(unsigned long *dst, const unsigned long *src1,
const unsigned long *src2, int nbits)
{
if (small_const_nbits(nbits))
return (*dst = *src1 & *src2) != 0;
return __bitmap_and(dst, src1, src2, nbits);
}
static inline void bitmap_or(unsigned long *dst, const unsigned long *src1,
const unsigned long *src2, int nbits)
{
if (small_const_nbits(nbits))
*dst = *src1 | *src2;
else
__bitmap_or(dst, src1, src2, nbits);
}
static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1,
const unsigned long *src2, int nbits)
{
if (small_const_nbits(nbits))
*dst = *src1 ^ *src2;
else
__bitmap_xor(dst, src1, src2, nbits);
}
static inline int bitmap_andnot(unsigned long *dst, const unsigned long *src1,
const unsigned long *src2, int nbits)
{
if (small_const_nbits(nbits))
return (*dst = *src1 & ~(*src2)) != 0;
return __bitmap_andnot(dst, src1, src2, nbits);
}
static inline void bitmap_complement(unsigned long *dst, const unsigned long *src,
int nbits)
{
if (small_const_nbits(nbits))
*dst = ~(*src) & BITMAP_LAST_WORD_MASK(nbits);
else
__bitmap_complement(dst, src, nbits);
}
static inline int bitmap_equal(const unsigned long *src1,
const unsigned long *src2, int nbits)
{
if (small_const_nbits(nbits))
return ! ((*src1 ^ *src2) & BITMAP_LAST_WORD_MASK(nbits));
else
return __bitmap_equal(src1, src2, nbits);
}
static inline int bitmap_intersects(const unsigned long *src1,
const unsigned long *src2, int nbits)
{
if (small_const_nbits(nbits))
return ((*src1 & *src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0;
else
return __bitmap_intersects(src1, src2, nbits);
}
static inline int bitmap_subset(const unsigned long *src1,
const unsigned long *src2, int nbits)
{
if (small_const_nbits(nbits))
return ! ((*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits));
else
return __bitmap_subset(src1, src2, nbits);
}
static inline int bitmap_empty(const unsigned long *src, int nbits)
{
if (small_const_nbits(nbits))
return ! (*src & BITMAP_LAST_WORD_MASK(nbits));
else
return __bitmap_empty(src, nbits);
}
static inline int bitmap_full(const unsigned long *src, int nbits)
{
if (small_const_nbits(nbits))
return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits));
else
return __bitmap_full(src, nbits);
}
static inline int bitmap_weight(const unsigned long *src, int nbits)
{
if (small_const_nbits(nbits))
return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
return __bitmap_weight(src, nbits);
}
static inline void bitmap_shift_right(unsigned long *dst,
const unsigned long *src, int n, int nbits)
{
if (small_const_nbits(nbits))
*dst = *src >> n;
else
__bitmap_shift_right(dst, src, n, nbits);
}
static inline void bitmap_shift_left(unsigned long *dst,
const unsigned long *src, int n, int nbits)
{
if (small_const_nbits(nbits))
*dst = (*src << n) & BITMAP_LAST_WORD_MASK(nbits);
else
__bitmap_shift_left(dst, src, n, nbits);
}
static inline int bitmap_parse(const char *buf, unsigned int buflen,
unsigned long *maskp, int nmaskbits)
{
return __bitmap_parse(buf, buflen, 0, maskp, nmaskbits);
}
#endif /* __LINUX_BITMAP_H */
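/*
 * Usage sketch (illustrative): the NUMA mask handling in this change is
 * built on these primitives together with set_bit()/test_bit() from
 * bitops.h.
 */
#if 0	/* example, not compiled */
static void bitmap_example(void)
{
	DECLARE_BITMAP(mask, 64);

	bitmap_zero(mask, 64);
	set_bit(3, mask);
	set_bit(5, mask);
	/* now: bitmap_weight(mask, 64) == 2, bitmap_empty(mask, 64) == 0 */
}
#endif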


@@ -27,6 +27,31 @@ unsigned long find_first_bit(const unsigned long *addr,
unsigned long find_first_zero_bit(const unsigned long *addr,
unsigned long size);
static inline int test_bit(int nr, const void *addr)
{
const uint32_t *p = (const uint32_t *)addr;
return ((1UL << (nr & 31)) & (p[nr >> 5])) != 0;
}
extern unsigned int __sw_hweight32(unsigned int w);
extern unsigned int __sw_hweight16(unsigned int w);
extern unsigned int __sw_hweight8(unsigned int w);
extern unsigned long __sw_hweight64(uint64_t w);
static inline unsigned long hweight_long(unsigned long w)
{
return sizeof(w) == 4 ? __sw_hweight32(w) : __sw_hweight64(w);
}
#define BIT(nr) (1UL << (nr))
#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
#define BITS_PER_BYTE 8
#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask))
#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1)
#define ALIGN(x, a) __ALIGN_KERNEL((x), (a))
#endif /*__ASSEMBLY__*/
#include <arch-bitops.h>


@@ -55,6 +55,7 @@ struct ihk_mc_cpu_info *ihk_mc_get_cpu_info(void);
void ihk_mc_boot_cpu(int cpuid, unsigned long pc);
int ihk_mc_get_processor_id(void);
int ihk_mc_get_hardware_processor_id(void);
int ihk_mc_get_numa_id(void);
void ihk_mc_delay_us(int us);
void ihk_mc_set_syscall_handler(long (*handler)(int, ihk_mc_user_context_t *));


@@ -72,8 +72,11 @@ struct ihk_mc_memory_node {
unsigned long ihk_mc_get_memory_address(enum ihk_mc_gma_type, int);
void ihk_mc_reserve_arch_pages(unsigned long start, unsigned long end,
void (*cb)(unsigned long, unsigned long, int));
struct ihk_page_allocator_desc;
void ihk_mc_reserve_arch_pages(struct ihk_page_allocator_desc *pa_allocator,
unsigned long start, unsigned long end,
void (*cb)(struct ihk_page_allocator_desc *,
unsigned long, unsigned long, int));
struct ihk_mc_pa_ops {
void *(*alloc_page)(int, int, enum ihk_mc_ap_flag);
@@ -100,14 +103,28 @@ void ihk_mc_map_micpa(unsigned long host_pa, unsigned long* mic_pa);
int ihk_mc_free_micpa(unsigned long mic_pa);
void ihk_mc_clean_micpa(void);
void *ihk_mc_alloc_aligned_pages(int npages, int p2align, enum ihk_mc_ap_flag flag);
void *ihk_mc_alloc_pages(int npages, enum ihk_mc_ap_flag flag);
void ihk_mc_free_pages(void *p, int npages);
void *_ihk_mc_alloc_aligned_pages(int npages, int p2align,
enum ihk_mc_ap_flag flag, char *file, int line);
#define ihk_mc_alloc_aligned_pages(npages, p2align, flag) ({\
void *r = _ihk_mc_alloc_aligned_pages(npages, p2align, flag, __FILE__, __LINE__);\
r;\
})
void *_ihk_mc_alloc_pages(int npages, enum ihk_mc_ap_flag flag,
char *file, int line);
#define ihk_mc_alloc_pages(npages, flag) ({\
void *r = _ihk_mc_alloc_pages(npages, flag, __FILE__, __LINE__);\
r;\
})
void _ihk_mc_free_pages(void *ptr, int npages, char *file, int line);
#define ihk_mc_free_pages(p, npages) ({\
_ihk_mc_free_pages(p, npages, __FILE__, __LINE__);\
})
void *ihk_mc_allocate(int size, int flag);
void ihk_mc_free(void *p);
void *arch_alloc_page(enum ihk_mc_ap_flag flag);
void arch_free_page(void *ptr);
int arch_get_smaller_page_size(void *args, size_t origsize, size_t *sizep, int *p2alignp);
typedef void *page_table_t;
@@ -151,6 +168,16 @@ int ihk_mc_pt_virt_to_phys(struct page_table *pt,
const void *virt, unsigned long *phys);
uint64_t ihk_mc_pt_virt_to_pagemap(struct page_table *pt, unsigned long virt);
int ihk_mc_get_nr_numa_nodes(void);
struct smp_coreset;
int ihk_mc_get_numa_node(int id, int *linux_numa_id, int *type);
int ihk_mc_get_numa_distance(int i, int j);
int ihk_mc_get_nr_memory_chunks(void);
int ihk_mc_get_memory_chunk(int id,
unsigned long *start,
unsigned long *end,
int *numa_id);
void remote_flush_tlb_cpumask(struct process_vm *vm,
unsigned long addr, int cpu_id);


@@ -14,14 +14,24 @@
#ifndef __HEADER_GENERIC_IHK_PAGE_ALLOC
#define __HEADER_GENERIC_IHK_PAGE_ALLOC
#include <list.h>
/* XXX: Physical memory management shouldn't be part of IHK */
struct ihk_mc_numa_node {
int id;
int linux_numa_id;
int type;
struct list_head allocators;
};
struct ihk_page_allocator_desc {
unsigned long start;
unsigned long start, end;
unsigned int last;
unsigned int count;
unsigned int flag;
unsigned int shift;
ihk_spinlock_t lock;
unsigned int pad;
struct list_head list;
unsigned long map[0];
};


@@ -29,6 +29,11 @@ void *memcpy_long(void *dest, const void *src, size_t n);
int memcmp(const void *s1, const void *s2, size_t n);
void *memset(void *s, int n, size_t l);
extern int snprintf(char * buf, size_t size, const char *fmt, ...);
extern int sprintf(char * buf, const char *fmt, ...);
extern int sscanf(const char * buf, const char * fmt, ...);
extern int scnprintf(char * buf, size_t size, const char *fmt, ...);
unsigned long strtol(const char *cp, char **endp, unsigned int base);
int flatten_strings(int nr_strings, char *first, char **strings, char **flat);
int flatten_strings_from_user(int nr_strings, char *first, char **strings, char **flat);


@@ -52,7 +52,7 @@ void *__ihk_pagealloc_init(unsigned long start, unsigned long size,
desc = initial;
*pdescsize = descsize;
} else {
desc = (void *)allocate_pages(descsize, IHK_MC_AP_CRITICAL);
desc = (void *)ihk_mc_alloc_pages(descsize, IHK_MC_AP_CRITICAL);
}
if (!desc) {
kprintf("IHK: failed to allocate page-allocator-desc "\
@@ -64,13 +64,14 @@ void *__ihk_pagealloc_init(unsigned long start, unsigned long size,
memset(desc, 0, descsize * PAGE_SIZE);
desc->start = start;
desc->end = start + size;
desc->last = 0;
desc->count = mapaligned >> 3;
desc->shift = page_shift;
desc->flag = flag;
kprintf("Page allocator: %lx - %lx (%d)\n", start, start + size,
page_shift);
//kprintf("page allocator @ %lx - %lx (%d)\n", start, start + size,
// page_shift);
ihk_mc_spinlock_init(&desc->lock);
@@ -92,7 +93,7 @@ void ihk_pagealloc_destroy(void *__desc)
{
struct ihk_page_allocator_desc *desc = __desc;
free_pages(desc, desc->flag);
ihk_mc_free_pages(desc, desc->flag);
}
static unsigned long __ihk_pagealloc_large(struct ihk_page_allocator_desc *desc,