Compare commits

..

35 Commits

Author SHA1 Message Date
2fa1c053d7 spec: prerelease 0.3 for testing ihk_reserve_mem and memory policy
Change-Id: I4fbcfa1f93522fd01af42d1ef13d0be075086773
2020-12-24 15:11:01 +09:00
530110e3a9 Tofu: fix ENABLE_TOFU switching
Change-Id: Ib33323d4b59ea8fb4f5f40dff7ea25a36773d5e2
2020-12-24 15:00:14 +09:00
f6ed44aeec spec: prerelease 0.2 for testing ihk_reserve_mem and memory policy
Change-Id: I9ff171c5d65b5f465ce7a2767be1a710de0a0400
2020-12-24 11:23:17 +09:00
33dd2e60b1 mcexec: memory policy control by environmental variable
Refs: #1470
Change-Id: I3d556cae90d31d81572b1c4e5c680e826577d428
2020-12-24 11:18:01 +09:00
ed670c03af spec: prerelease 0.1 for testing ihk_create_os_str
Change-Id: I3c9bbc6f3c9e8951c0ad700b9c02fcdec65018ff
2020-12-23 11:33:31 +09:00
e5f4a4e87d Tofu: proper cleanup of device files when mcexec gets killed
Change-Id: I6cb0290f72d96682700f945b29585e132e525ac1
2020-12-09 13:05:54 +09:00
1918df7765 Tofu: support for barrier gate, kmalloc cache
Change-Id: I6f4cfec2ec404efd03b332fc3f449a775816230e
2020-12-09 13:05:54 +09:00
5d784f3ea4 kernel: increase stack size
Change-Id: I27698149e9206138402dcc65db0078d5dbf548cb
2020-12-09 13:05:53 +09:00
10c09aa10e MM: generic lockless kmalloc and page cache
Change-Id: I71ad498fdd10136d9c72ffe2b16b9122d1bc9673
2020-12-09 13:05:53 +09:00
41f5c0bdde MM: deferred zero cleaning on Linux CPUs
Change-Id: Icdb8ac807688533be7a95b7101edfd904250cd02
2020-12-09 13:05:53 +09:00
e7b8aeb4f7 Tofu: per-fd path memory leak fix
Change-Id: I451472365806333adfac6dae32746195e3c30694
2020-12-09 13:05:53 +09:00
1b3dd45dbc MM: straight mapping memory leak fix
Change-Id: I7d841fbedb1db498b5994eb69b0350df7a5cefb0
2020-12-09 13:05:53 +09:00
623d6f8bc3 arm64: record register state at kernel mode page fault (for eclair)
Change-Id: I066bceecc0377110faaca0b21d45a476d000e684
2020-12-09 13:05:53 +09:00
92902d36fc Tofu: initial version
Change-Id: I9c464d5af883c18715a97ca9e9981cf73b260f90
2020-12-09 13:03:01 +09:00
fe83deb3db profile: make header user-space includable
Change-Id: I4a88d9be7c169f29ef6f6328e8576a3fe3b6e34f
2020-12-08 12:32:10 +09:00
e056cb799f memclear: non-temporal memory clean (arm64)
Change-Id: I8f80ff20e98bc01088450282e1790c27c67c16eb
2020-12-08 12:32:10 +09:00
201f5ce500 MM: straight mapping
Change-Id: I70871f8c382fb00aa719ed501cc5de436d916d7f
2020-12-08 12:32:10 +09:00
100bbe6231 MM: zero memory at free and deferred zero
Change-Id: Ib0055d6f2bdd10d05d749dcd1f3d5c3d318f22f3
2020-12-08 12:32:10 +09:00
fbd121d28c mmap: return -EINVAL for non-anonymous, MAP_HUGETLB map
Change-Id: I2bcbbf0ee9c0f47160eabac4a8d09991c71fe852
2020-12-07 15:23:38 +09:00
d1d93d90cc mcexec: detect mismatch of mcexec -n and mpirun -ppn
Change-Id: I0ce1b2d48cda10713920cb88692e107b8c4d3bab
Refs: #929
2020-12-07 15:23:34 +09:00
45bc6a617a __return_syscall: check input & fix unmap memory in error cases
Change-Id: I5de3ab3acd46770518b79bdc6f1c2e00c1cd5096
2020-11-25 01:58:47 +00:00
924ba7fd65 mcctrl_ikc_send_wait: free desc only if we allocated it internally
Change-Id: I4710ea6bb31f098451347c53ac0ff0be422aec06
2020-11-25 01:58:47 +00:00
2814f7cac4 mcctrl_get_request_os_cpu: check os instance & ret_cpu
Change-Id: I4d3f6fd93eaa183d560c874ba33add83c4308c5a
2020-11-25 01:58:47 +00:00
b510de7bd5 mcctrl_perf_get: check os instance & cpu info
Change-Id: Ic4f9d818b7d58f8ae651e43175fb1c478baec9c1
2020-11-25 01:58:47 +00:00
3e927f61dc mcctrl_perf_disable: check os instance & cpu info
Change-Id: I7195272a65b31db72158f5e5bbfc490bac547b91
2020-11-25 01:58:47 +00:00
64579830dd mcctrl_perf_enable: check os instance & cpu info
Change-Id: I31ab829d63833f924af17445fd9b8488d6eb454f
2020-11-25 01:58:47 +00:00
3cc98883f5 delete_procfs_entries: fix possible crash if top entry has no children
Change-Id: I209842699615f9bb58c12ccd262ae4b17f8f558c
2020-11-25 01:58:47 +00:00
442045a320 mcctrl_ikc_send: validate os and check input packet
Change-Id: I1f8c2228043841685617b665eeeaf2ce15a08703
2020-11-25 01:58:47 +00:00
fe5d8fc71f mcctrl_getrusage: validate os input
Change-Id: I97908069f8bc4703b99f9ffca94f3dd33eb64cc4
2020-11-25 01:58:47 +00:00
550c6cc5fb mcctrl_perf_set : validate os input & check cpu info
Change-Id: If308013746ff6dce03fa8e0eb1ebaca1cb2a4a64
2020-11-25 01:58:47 +00:00
8c0b2ab6ce mcctrl_perf_num: check "os" argument
Change-Id: I13c8b0c337cac9bbb240667808e871defce34aab
2020-11-25 01:58:47 +00:00
239b1b265f release 1.7.0
Change-Id: I8413aa2d051c6164235816bae2823187870efe49
2020-11-25 10:51:40 +09:00
f646fd141b prerelease 0.96: ihk_reserve_mem: balanced, capped best effort
Change-Id: Ia98c87e651d8dd34dfd36bc0c45f1d23e245330d
2020-11-24 03:40:01 +00:00
734d1cc056 ihk submodule update: ihklib: ihk_create_os_str: add ihk_reserve_mem_conf equivalent
Change-Id: Iede1a043b0316d6541656e86091f2288fd299383
2020-11-24 03:40:01 +00:00
040a9c0c7f cmake: set QEMU_LD_PREFIX when cross-compiling
Change-Id: Ie7b86ddba344e02d6f739225e44f3ad4927f5a2f
2020-11-20 07:59:55 +00:00
90 changed files with 8067 additions and 121 deletions

View File

@ -7,10 +7,10 @@ endif (NOT CMAKE_BUILD_TYPE)
enable_language(C ASM)
project(mckernel C ASM)
set(MCKERNEL_VERSION "1.7.0")
set(MCKERNEL_VERSION "1.7.1")
# See "Fedora Packaging Guidlines -- Versioning"
set(MCKERNEL_RELEASE "0.95")
# See "Fedora Packaging Guidelines -- Versioning"
set(MCKERNEL_RELEASE "0.3")
set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/modules)
# for rpmbuild
@ -50,6 +50,20 @@ if (ENABLE_WERROR)
add_compile_options("-Werror")
endif(ENABLE_WERROR)
execute_process(COMMAND bash -c "ls -ld /proc/tofu/ 2>/dev/null | wc -l"
OUTPUT_VARIABLE PROC_TOFU OUTPUT_STRIP_TRAILING_WHITESPACE)
if(PROC_TOFU STREQUAL "1")
option(ENABLE_TOFU "Built-in tofu driver support" ON)
else()
option(ENABLE_TOFU "Built-in tofu driver support" OFF)
endif()
if(ENABLE_TOFU)
add_definitions(-DENABLE_TOFU)
set(KBUILD_C_FLAGS "${KBUILD_C_FLAGS} -DENABLE_TOFU")
endif()
option(ENABLE_LINUX_WORK_IRQ_FOR_IKC "Use Linux work IRQ for IKC IPI" ON)
if (ENABLE_LINUX_WORK_IRQ_FOR_IKC)
set(KBUILD_C_FLAGS "${KBUILD_C_FLAGS} -DIHK_IKC_USE_LINUX_WORK_IRQ")
@ -252,6 +266,7 @@ message("KBUILD_C_FLAGS: ${KBUILD_C_FLAGS}")
message("MAP_KERNEL_START: ${MAP_KERNEL_START}")
message("ENABLE_MEMDUMP: ${ENABLE_MEMDUMP}")
message("ENABLE_PERF: ${ENABLE_PERF}")
message("ENABLE_TOFU: ${ENABLE_TOFU}")
message("ENABLE_RUSAGE: ${ENABLE_RUSAGE}")
message("ENABLE_QLMPI: ${ENABLE_QLMPI}")
message("ENABLE_UTI: ${ENABLE_UTI}")

View File

@ -143,6 +143,11 @@ void arch_save_panic_regs(void *irq_regs)
clv = get_arm64_this_cpu_local();
/* If kernel mode PF occurred, unroll the causing call stack */
if (cpu_local_var(kernel_mode_pf_regs)) {
regs = cpu_local_var(kernel_mode_pf_regs);
}
/* For user-space, use saved kernel context */
if (regs->pc < USER_END) {
memset(clv->arm64_cpu_local_thread.panic_regs,

View File

@ -223,7 +223,12 @@ static int do_translation_fault(unsigned long addr,
unsigned int esr,
struct pt_regs *regs)
{
#ifdef ENABLE_TOFU
// XXX: Handle kernel space page faults for Tofu driver
//if (addr < USER_END)
#else
if (addr < USER_END)
#endif
return do_page_fault(addr, esr, regs);
do_bad_area(addr, esr, regs);

View File

@ -94,7 +94,11 @@ extern char _end[];
# define LD_TASK_UNMAPPED_BASE UL(0x0000080000000000)
# define TASK_UNMAPPED_BASE UL(0x0000100000000000)
# define USER_END UL(0x0000400000000000)
#ifdef ENABLE_TOFU
# define MAP_VMAP_START UL(0xffff7bdfffff0000)
#else
# define MAP_VMAP_START UL(0xffff780000000000)
#endif
# define MAP_VMAP_SIZE UL(0x0000000100000000)
# define MAP_FIXED_START UL(0xffff7ffffbdd0000)
# define MAP_ST_START UL(0xffff800000000000)
@ -142,6 +146,7 @@ extern char _end[];
# define __PTL1_SHIFT 16
# define PTL4_INDEX_MASK 0
# define PTL3_INDEX_MASK ((UL(1) << 6) - 1)
# define PTL3_INDEX_MASK_LINUX ((UL(1) << 10) - 1)
# define PTL2_INDEX_MASK ((UL(1) << 13) - 1)
# define PTL1_INDEX_MASK PTL2_INDEX_MASK
# define __PTL4_CONT_SHIFT (__PTL4_SHIFT + 0)
@ -829,7 +834,13 @@ static inline int pte_is_head(pte_t *ptep, pte_t *old, size_t cont_size)
return page_is_contiguous_head(ptep, cont_size);
}
struct page_table;
typedef pte_t translation_table_t;
struct page_table {
translation_table_t* tt;
translation_table_t* tt_pa;
int asid;
};
void arch_adjust_allocate_page_size(struct page_table *pt,
uintptr_t fault_addr,
pte_t *ptep,
@ -849,7 +860,6 @@ void *map_fixed_area(unsigned long phys, unsigned long size, int uncachable);
void set_address_space_id(struct page_table *pt, int asid);
int get_address_space_id(const struct page_table *pt);
typedef pte_t translation_table_t;
void set_translation_table(struct page_table *pt, translation_table_t* tt);
translation_table_t* get_translation_table(const struct page_table *pt);
translation_table_t* get_translation_table_as_paddr(const struct page_table *pt);

View File

@ -10,4 +10,13 @@ extern void *__inline_memcpy(void *to, const void *from, size_t t);
extern void *__inline_memset(void *s, unsigned long c, size_t count);
#define ARCH_MEMCLEAR
extern void __memclear(void *addr, unsigned long len, void *tmp);
inline static void memclear(void *addr, unsigned long len)
{
uint64_t q0q1[4];
__memclear(addr, len, (void *)&q0q1);
}
#endif /* __HEADER_ARM64_COMMON_ARCH_TIMER_H */

View File

@ -80,6 +80,10 @@ static inline uint64_t __raw_readq(const volatile void *addr)
return val;
}
/* IO barriers */
#define __iormb() rmb()
#define __iowmb() wmb()
/*
* Relaxed I/O memory access primitives. These follow the Device memory
* ordering rules but do not guarantee any ordering relative to Normal memory
@ -95,5 +99,20 @@ static inline uint64_t __raw_readq(const volatile void *addr)
#define writel_relaxed(v,c) ((void)__raw_writel((uint32_t)(v),(c)))
#define writeq_relaxed(v,c) ((void)__raw_writeq((uint64_t)(v),(c)))
/*
* I/O memory access primitives. Reads are ordered relative to any
* following Normal memory access. Writes are ordered relative to any prior
* Normal memory access.
*/
#define readb(c) ({ uint8_t __v = readb_relaxed(c); __iormb(); __v; })
#define readw(c) ({ uint16_t __v = readw_relaxed(c); __iormb(); __v; })
#define readl(c) ({ uint32_t __v = readl_relaxed(c); __iormb(); __v; })
#define readq(c) ({ uint64_t __v = readq_relaxed(c); __iormb(); __v; })
#define writeb(v,c) ({ __iowmb(); writeb_relaxed((v),(c)); })
#define writew(v,c) ({ __iowmb(); writew_relaxed((v),(c)); })
#define writel(v,c) ({ __iowmb(); writel_relaxed((v),(c)); })
#define writeq(v,c) ({ __iowmb(); writeq_relaxed((v),(c)); })
#endif /* __KERNEL__ */
#endif /* __ASM_IO_H */

View File

@ -2,7 +2,7 @@
#ifndef __HEADER_ARM64_COMMON_THREAD_INFO_H
#define __HEADER_ARM64_COMMON_THREAD_INFO_H
#define MIN_KERNEL_STACK_SHIFT 15
#define MIN_KERNEL_STACK_SHIFT 18
#include <arch-memory.h>

View File

@ -150,12 +150,6 @@ void flush_tlb_single(unsigned long addr)
arch_flush_tlb_single(asid, addr);
}
struct page_table {
translation_table_t* tt;
translation_table_t* tt_pa;
int asid;
};
extern struct page_table swapper_page_table;
static struct page_table *init_pt = &swapper_page_table;
static ihk_spinlock_t init_pt_lock;
@ -223,6 +217,13 @@ static inline int ptl4_index(unsigned long addr)
int idx = (addr >> PTL4_SHIFT) & PTL4_INDEX_MASK;
return idx;
}
#ifdef ENABLE_TOFU
static inline int ptl3_index_linux(unsigned long addr)
{
int idx = (addr >> PTL3_SHIFT) & PTL3_INDEX_MASK_LINUX;
return idx;
}
#endif
static inline int ptl3_index(unsigned long addr)
{
int idx = (addr >> PTL3_SHIFT) & PTL3_INDEX_MASK;
@ -281,6 +282,40 @@ static inline pte_t* ptl4_offset(const translation_table_t* ptl4, unsigned long
}
return ptep;
}
#ifdef ENABLE_TOFU
static inline pte_t* ptl3_offset_linux(const pte_t* l4p, unsigned long addr)
{
pte_t* ptep = NULL;
pte_t pte = 0;
unsigned long phys = 0;
translation_table_t* ptl3 = NULL;
int idx = 0;
switch (CONFIG_ARM64_PGTABLE_LEVELS)
{
case 4:
pte = ptl4_val(l4p);
phys = pte & PT_PHYSMASK;
ptl3 = phys_to_virt(phys);
idx = ptl3_index_linux(addr);
ptep = (pte_t*)ptl3 + idx;
break;
case 3:
ptl3 = (translation_table_t*)l4p;
idx = ptl3_index_linux(addr);
ptep = (pte_t*)ptl3 + idx;
break;
case 2:
case 1:
/* PTL3が無いときにはエントリではなくページテーブルのアドレスを引渡していく。*/
ptep = (pte_t*)l4p;
break;
}
return ptep;
}
#endif
static inline pte_t* ptl3_offset(const pte_t* l4p, unsigned long addr)
{
pte_t* ptep = NULL;
@ -959,7 +994,14 @@ static void init_normal_area(struct page_table *pt)
int i;
tt = get_translation_table(pt);
#ifdef ENABLE_TOFU
setup(tt,
arm64_st_phys_base,
arm64_st_phys_base + (1UL << 40));
return;
#endif
for (i = 0; i < ihk_mc_get_nr_memory_chunks(); i++) {
unsigned long map_start, map_end;
int numa_id;
@ -1287,6 +1329,58 @@ out:
return ret;
}
#ifdef ENABLE_TOFU
int ihk_mc_linux_pt_virt_to_phys_size(struct page_table *pt,
const void *virt,
unsigned long *phys,
unsigned long *size)
{
unsigned long v = (unsigned long)virt;
pte_t* ptep;
translation_table_t* tt;
unsigned long paddr;
unsigned long lsize;
tt = get_translation_table(pt);
ptep = ptl4_offset(tt, v);
if (!ptl4_present(ptep)) {
return -EFAULT;
}
ptep = ptl3_offset_linux(ptep, v);
if (!ptl3_present(ptep)) {
return -EFAULT;
}
if (ptl3_type_block(ptep)) {
paddr = ptl3_phys(ptep);
lsize = PTL3_SIZE;
goto out;
}
ptep = ptl2_offset(ptep, v);
if (!ptl2_present(ptep)) {
return -EFAULT;
}
if (ptl2_type_block(ptep)) {
paddr = ptl2_phys(ptep);
lsize = PTL2_SIZE;
goto out;
}
ptep = ptl1_offset(ptep, v);
if (!ptl1_present(ptep)) {
return -EFAULT;
}
paddr = ptl1_phys(ptep);
lsize = PTL1_SIZE;
out:
*phys = paddr | (v & (lsize - 1));
if(size) *size = lsize;
return 0;
}
#endif
int ihk_mc_pt_virt_to_phys_size(struct page_table *pt,
const void *virt,
@ -1348,7 +1442,6 @@ int ihk_mc_pt_virt_to_phys(struct page_table *pt,
return ihk_mc_pt_virt_to_phys_size(pt, virt, phys, NULL);
}
int ihk_mc_pt_print_pte(struct page_table *pt, void *virt)
{
const unsigned long v = (unsigned long)virt;
@ -1360,6 +1453,15 @@ int ihk_mc_pt_print_pte(struct page_table *pt, void *virt)
}
tt = get_translation_table(pt);
__kprintf("%s: 0x%lx, CONFIG_ARM64_PGTABLE_LEVELS: %d, ptl4_index: %ld, ptl3_index: %ld, ptl2_index: %ld, ptl1_index: %ld\n",
__func__,
v,
CONFIG_ARM64_PGTABLE_LEVELS,
ptl4_index(v),
ptl3_index(v),
ptl2_index(v),
ptl1_index(v));
ptep = ptl4_offset(tt, v);
__kprintf("l4 table: 0x%lX l4idx: %d\n", virt_to_phys(tt), ptl4_index(v));
if (!(ptl4_present(ptep))) {
@ -2147,6 +2249,198 @@ static void unmap_free_stat(struct page *page, unsigned long phys,
}
}
/*
* Kernel space page table clearing functions.
*/
struct clear_kernel_range_args {
int free_physical;
};
static int clear_kernel_range_middle(void *args0, pte_t *ptep, uint64_t base,
uint64_t start, uint64_t end, int level);
static int clear_kernel_range_l1(void *args0, pte_t *ptep, uint64_t base,
uint64_t start, uint64_t end)
{
const struct table {
unsigned long pgsize;
unsigned long cont_pgsize;
} tbl = {
.pgsize = PTL1_SIZE,
.cont_pgsize = PTL1_CONT_SIZE
};
struct clear_kernel_range_args *args = args0;
uint64_t phys = 0;
pte_t old;
size_t clear_size;
if (ptl1_null(ptep)) {
return -ENOENT;
}
old = xchg(ptep, PTE_NULL);
if (!pte_is_present(&old))
return 0;
arch_flush_tlb_single(0, base);
clear_size = pte_is_contiguous(&old) ?
tbl.cont_pgsize : tbl.pgsize;
dkprintf("%s: 0x%lx:%lu unmapped\n",
__func__, base, clear_size);
if (args->free_physical) {
phys = ptl1_phys(&old);
ihk_mc_free_pages(phys_to_virt(phys), clear_size >> PAGE_SHIFT);
}
return 0;
}
static int clear_kernel_range_l2(void *args0, pte_t *ptep, uint64_t base,
uint64_t start, uint64_t end)
{
return clear_kernel_range_middle(args0, ptep, base, start, end, 2);
}
static int clear_kernel_range_l3(void *args0, pte_t *ptep, uint64_t base,
uint64_t start, uint64_t end)
{
return clear_kernel_range_middle(args0, ptep, base, start, end, 3);
}
static int clear_kernel_range_l4(void *args0, pte_t *ptep, uint64_t base,
uint64_t start, uint64_t end)
{
return clear_kernel_range_middle(args0, ptep, base, start, end, 4);
}
static int clear_kernel_range_middle(void *args0, pte_t *ptep, uint64_t base,
uint64_t start, uint64_t end, int level)
{
const struct table {
walk_pte_t* walk;
walk_pte_fn_t* callback;
unsigned long pgsize;
unsigned long cont_pgsize;
} table[] = {
{walk_pte_l1, clear_kernel_range_l1, PTL2_SIZE, PTL2_CONT_SIZE}, /*PTL2*/
{walk_pte_l2, clear_kernel_range_l2, PTL3_SIZE, PTL3_CONT_SIZE}, /*PTL3*/
{walk_pte_l3, clear_kernel_range_l3, PTL4_SIZE, PTL4_CONT_SIZE}, /*PTL4*/
};
const struct table tbl = table[level-2];
struct clear_kernel_range_args *args = args0;
uint64_t phys = 0;
translation_table_t *tt;
int error;
pte_t old;
size_t clear_size;
if (ptl_null(ptep, level)) {
return -ENOENT;
}
dkprintf("%s(level: %d): 0x%lx in 0x%lx-0x%lx\n",
__func__, level, base, start, end);
if (ptl_type_page(ptep, level)
&& ((base < start) || (end < (base + tbl.pgsize)))) {
error = -EINVAL;
ekprintf("clear_range_middle(%p,%p,%lx,%lx,%lx,%d):"
"split page. %d\n",
args0, ptep, base, start, end, level, error);
return error;
}
if (ptl_type_page(ptep, level)) {
old = xchg(ptep, PTE_NULL);
if (!ptl_present(&old, level)) {
return 0;
}
arch_flush_tlb_single(0, base);
clear_size = pte_is_contiguous(&old) ?
tbl.cont_pgsize : tbl.pgsize;
dkprintf("%s(level: %d): 0x%lx:%lu unmapped\n",
__func__, level, base, clear_size);
if (args->free_physical) {
phys = ptl_phys(&old, level);
ihk_mc_free_pages(phys_to_virt(phys), clear_size >> PAGE_SHIFT);
}
return 0;
}
tt = (translation_table_t*)phys_to_virt(ptl_phys(ptep, level));
error = tbl.walk(tt, base, start, end, tbl.callback, args0);
if (error && (error != -ENOENT)) {
return error;
}
if (args->free_physical) {
if ((start <= base) && ((base + tbl.pgsize) <= end)) {
ptl_clear(ptep, level);
arch_flush_tlb_single(0, base);
ihk_mc_free_pages(tt, 1);
}
}
return 0;
}
static int clear_kernel_range(uintptr_t start, uintptr_t end, int free_physical)
{
const struct table {
walk_pte_t* walk;
walk_pte_fn_t* callback;
} tables[] = {
{walk_pte_l2, clear_kernel_range_l2}, /*second*/
{walk_pte_l3, clear_kernel_range_l3}, /*first*/
{walk_pte_l4, clear_kernel_range_l4}, /*zero*/
};
const struct table initial_lookup = tables[CONFIG_ARM64_PGTABLE_LEVELS - 2];
int error;
struct clear_kernel_range_args args;
translation_table_t* tt;
unsigned long irqflags;
dkprintf("%s: start: 0x%lx, end: 0x%lx, free phys: %d\n",
__func__, start, end, free_physical);
if (start <= USER_END)
return -EINVAL;
args.free_physical = free_physical;
irqflags = ihk_mc_spinlock_lock(&init_pt_lock);
tt = get_translation_table(get_init_page_table());
error = initial_lookup.walk(tt, 0,
(start & ~(0xffff000000000000)),
(end & ~(0xffff000000000000)),
initial_lookup.callback, &args);
dkprintf("%s: start: 0x%lx, end: 0x%lx, free phys: %d, ret: %d\n",
__func__, start, end, free_physical, error);
ihk_mc_spinlock_unlock(&init_pt_lock, irqflags);
return error;
}
int ihk_mc_clear_kernel_range(void *start, void *end)
{
#define KEEP_PHYSICAL 0
return clear_kernel_range((uintptr_t)start, (uintptr_t)end, KEEP_PHYSICAL);
}
/*
* User space page table clearing functions.
*/
struct clear_range_args {
int free_physical;
struct memobj *memobj;
@ -2344,6 +2638,14 @@ static int clear_range(struct page_table *pt, struct process_vm *vm,
if (memobj && ((memobj->flags & MF_PREMAP))) {
args.free_physical = 0;
}
if (vm->proc->straight_va &&
(void *)start == vm->proc->straight_va &&
(void *)end == (vm->proc->straight_va +
vm->proc->straight_len)) {
args.free_physical = 0;
}
args.memobj = memobj;
args.vm = vm;

View File

@ -218,3 +218,41 @@ ENTRY(__inline_memset)
ret
ENDPIPROC(__inline_memset)
ENDPROC(____inline_memset)
/*
* Non-temporal vector memory clear
*
* Parameters:
* x0 - buf (assumed to be aligned to page size)
* x1 - n (assumed to be at least page size)
*/
ENTRY(__memclear)
stp q0, q1, [x2] /* Preserve two 128 bit vector regs */
eor v0.16B, v0.16B, v0.16B
eor v1.16B, v1.16B, v1.16B
1:
stnp q0, q1, [x0, #32 * 0]
stnp q0, q1, [x0, #32 * 1]
stnp q0, q1, [x0, #32 * 2]
stnp q0, q1, [x0, #32 * 3]
stnp q0, q1, [x0, #32 * 4]
stnp q0, q1, [x0, #32 * 5]
stnp q0, q1, [x0, #32 * 6]
stnp q0, q1, [x0, #32 * 7]
stnp q0, q1, [x0, #32 * 8]
stnp q0, q1, [x0, #32 * 9]
stnp q0, q1, [x0, #32 * 10]
stnp q0, q1, [x0, #32 * 11]
stnp q0, q1, [x0, #32 * 12]
stnp q0, q1, [x0, #32 * 13]
stnp q0, q1, [x0, #32 * 14]
stnp q0, q1, [x0, #32 * 15]
add x0, x0, #512
subs x1, x1, #512
cmp x1, #0
b.ne 1b
ldp q0, q1, [x2] /* Restore vector regs */
ret
ENDPROC(__memclear)

View File

@ -1726,6 +1726,14 @@ SYSCALL_DECLARE(mmap)
if (flags & MAP_HUGETLB) {
int hugeshift = flags & (0x3F << MAP_HUGE_SHIFT);
/* OpenMPI expects -EINVAL when trying to map
* /dev/shm/ file with MAP_SHARED | MAP_HUGETLB
*/
if (!(flags & MAP_ANONYMOUS)) {
error = -EINVAL;
goto out;
}
if (hugeshift == 0) {
/* default hugepage size */
flags |= ihk_mc_get_linux_default_huge_page_shift() <<

View File

@ -174,9 +174,14 @@ void bad_mode(struct pt_regs *regs, int reason, unsigned int esr)
arch_show_interrupt_context(regs);
#ifdef ENABLE_TOFU
info.si_signo = SIGSTOP;
info.si_errno = 0;
#else
info.si_signo = SIGILL;
info.si_errno = 0;
info.si_code = ILL_ILLOPC;
#endif
info._sifields._sigfault.si_addr = (void*)regs->pc;
arm64_notify_die("Oops - bad mode", regs, &info, 0);

View File

@ -1651,6 +1651,14 @@ static int clear_range(struct page_table *pt, struct process_vm *vm,
if (memobj && ((memobj->flags & MF_PREMAP))) {
args.free_physical = 0;
}
if (vm->proc->straight_va &&
(void *)start == vm->proc->straight_va &&
(void *)end == (vm->proc->straight_va +
vm->proc->straight_len)) {
args.free_physical = 0;
}
args.memobj = memobj;
args.vm = vm;

View File

@ -1430,6 +1430,14 @@ SYSCALL_DECLARE(mmap)
/* check arguments */
pgsize = PAGE_SIZE;
if (flags & MAP_HUGETLB) {
/* OpenMPI expects -EINVAL when trying to map
* /dev/shm/ file with MAP_SHARED | MAP_HUGETLB
*/
if (!(flags & MAP_ANONYMOUS)) {
error = -EINVAL;
goto out;
}
switch (flags & (0x3F << MAP_HUGE_SHIFT)) {
case 0:
/* default hugepage size */

View File

@ -30,6 +30,9 @@ endif ()
if (NOT "${LINUX_ARCH}" STREQUAL "${CMAKE_HOST_SYSTEM_PROCESSOR}")
string(REGEX REPLACE "ld$" "" CROSS_COMPILE "${CMAKE_LINKER}")
if (CMAKE_CROSSCOMPILING)
list(APPEND KBUILD_MAKE_FLAGS "QEMU_LD_PREFIX=${CMAKE_FIND_ROOT_PATH}")
endif()
list(APPEND KBUILD_MAKE_FLAGS "ARCH=${ARCH}")
list(APPEND KBUILD_MAKE_FLAGS "CROSS_COMPILE=${CROSS_COMPILE}")
endif()

View File

@ -1,11 +1,12 @@
=============================================
Version 1.7.0-0.93 (Aug 1, 2020)
Version 1.7.0 (Nov 25, 2020)
=============================================
----------------------
IHK major updates
----------------------
#. ihklib: add ihk_create_os_str
#. ihklib: ihk_reserve_mem: add capped best effort to balanced
------------------------
IHK major bug fixes

View File

@ -4671,7 +4671,7 @@ void cmd_ipcs(void); /* ipcs.c */
/*
* main.c
*/
void main_loop(void);
//void main_loop(void);
void exec_command(void);
struct command_table_entry *get_command_table_entry(char *);
void program_usage(int);

View File

@ -94,6 +94,7 @@ struct get_cpu_set_arg {
char *req_cpu_list; // Requested by user-space
int req_cpu_list_len; // Length of request string
int *process_rank;
pid_t ppid;
void *cpu_set;
size_t cpu_set_size; // Size in bytes
int *target_core;
@ -112,6 +113,18 @@ typedef unsigned long __cpu_set_unit;
#define MPOL_NO_BSS 0x04
#define MPOL_SHM_PREMAP 0x08
/* should be the same as process.h */
#define PLD_PROCESS_NUMA_MASK_BITS 256
enum {
PLD_MPOL_DEFAULT,
PLD_MPOL_PREFERRED,
PLD_MPOL_BIND,
PLD_MPOL_INTERLEAVE,
PLD_MPOL_LOCAL,
PLD_MPOL_MAX, /* always last member of enum */
};
#define PLD_MAGIC 0xcafecafe44332211UL
struct program_load_desc {
@ -146,9 +159,18 @@ struct program_load_desc {
unsigned long heap_extension;
long stack_premap;
unsigned long mpol_bind_mask;
int mpol_mode;
unsigned long mpol_nodemask[PLD_PROCESS_NUMA_MASK_BITS /
(sizeof(unsigned long) * 8)];
int thp_disable;
int uti_thread_rank; /* N-th clone() spawns a thread on Linux CPU */
int uti_use_last_cpu; /* Work-around not to share CPU with OpenMP thread */
int straight_map;
size_t straight_map_threshold;
#ifdef ENABLE_TOFU
int enable_tofu;
#endif
int nr_processes;
int process_rank;
__cpu_set_unit cpu_set[PLD_CPU_SET_SIZE];
@ -195,6 +217,9 @@ struct syscall_response {
unsigned long req_thread_status;
long ret;
unsigned long fault_address;
#ifdef ENABLE_TOFU
void *pde_data;
#endif
};
struct syscall_ret_desc {

View File

@ -2,6 +2,7 @@
#include <linux/version.h>
#include <linux/mm_types.h>
#include <linux/kallsyms.h>
#include <linux/delay.h>
#if KERNEL_VERSION(4, 11, 0) <= LINUX_VERSION_CODE
#include <linux/sched/task_stack.h>
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) */
@ -27,6 +28,16 @@ void *vdso_end;
static struct vm_special_mapping (*vdso_spec)[2];
#endif
#ifdef ENABLE_TOFU
/* Tofu CQ and barrier gate release functions */
struct file_operations *mcctrl_tof_utofu_procfs_ops_cq;
int (*mcctrl_tof_utofu_release_cq)(struct inode *inode,
struct file *filp);
struct file_operations *mcctrl_tof_utofu_procfs_ops_bch;
int (*mcctrl_tof_utofu_release_bch)(struct inode *inode,
struct file *filp);
#endif
int arch_symbols_init(void)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0)
@ -43,6 +54,28 @@ int arch_symbols_init(void)
return -EFAULT;
#endif
#ifdef ENABLE_TOFU
mcctrl_tof_utofu_procfs_ops_cq =
(void *)kallsyms_lookup_name("tof_utofu_procfs_ops_cq");
if (WARN_ON(!mcctrl_tof_utofu_procfs_ops_cq))
return -EFAULT;
mcctrl_tof_utofu_procfs_ops_bch =
(void *)kallsyms_lookup_name("tof_utofu_procfs_ops_bch");
if (WARN_ON(!mcctrl_tof_utofu_procfs_ops_bch))
return -EFAULT;
mcctrl_tof_utofu_release_cq =
(void *)kallsyms_lookup_name("tof_utofu_release_cq");
if (WARN_ON(!mcctrl_tof_utofu_release_cq))
return -EFAULT;
mcctrl_tof_utofu_release_bch =
(void *)kallsyms_lookup_name("tof_utofu_release_bch");
if (WARN_ON(!mcctrl_tof_utofu_release_bch))
return -EFAULT;
#endif
return 0;
}
@ -417,3 +450,108 @@ long arch_switch_ctx(struct uti_switch_ctx_desc *desc)
out:
return rc;
}
#ifdef ENABLE_TOFU
/*
* Tofu CQ and BCH release handlers
*/
int __mcctrl_tof_utofu_release_cq(struct inode *inode, struct file *filp);
int __mcctrl_tof_utofu_release_bch(struct inode *inode, struct file *filp);
void mcctrl_tofu_hijack_release_handlers(void)
{
mcctrl_tof_utofu_procfs_ops_cq->release =
__mcctrl_tof_utofu_release_cq;
mcctrl_tof_utofu_procfs_ops_bch->release =
__mcctrl_tof_utofu_release_bch;
}
void mcctrl_tofu_restore_release_handlers(void)
{
mcctrl_tof_utofu_procfs_ops_cq->release =
mcctrl_tof_utofu_release_cq;
mcctrl_tof_utofu_procfs_ops_bch->release =
mcctrl_tof_utofu_release_bch;
}
int __mcctrl_tof_utofu_release_handler(struct inode *inode, struct file *filp,
int (*__release_func)(struct inode *inode, struct file *filp))
{
struct mcctrl_usrdata *usrdata;
struct mcctrl_file_to_pidfd *f2pfd;
struct mcctrl_per_proc_data *ppd;
struct ikc_scd_packet isp;
int ret;
dprintk("%s: current PID: %d, comm: %s \n",
__func__, task_tgid_vnr(current), current->comm);
f2pfd = mcctrl_file_to_pidfd_hash_lookup(filp, current->group_leader);
if (!f2pfd) {
goto out;
}
dprintk("%s: current PID: %d, PID: %d, fd: %d ...\n",
__func__, task_tgid_vnr(current), f2pfd->pid, f2pfd->fd);
usrdata = ihk_host_os_get_usrdata(f2pfd->os);
/* Look up per-process structure */
ppd = mcctrl_get_per_proc_data(usrdata, f2pfd->pid);
if (!ppd) {
pr_err("%s: PID: %d, fd: %d no PPD\n",
__func__, f2pfd->pid, f2pfd->fd);
goto out;
}
dprintk("%s: PID: %d, fd: %d PPD OK\n",
__func__, f2pfd->pid, f2pfd->fd);
/*
* We are in release() due to the process being killed,
* or because the application didn't close the file properly.
* Ask McKernel to clean up this fd.
*/
isp.msg = SCD_MSG_CLEANUP_FD;
isp.pid = f2pfd->pid;
isp.arg = f2pfd->fd;
ret = mcctrl_ikc_send_wait(f2pfd->os, ppd->ikc_target_cpu,
&isp, -20, NULL, NULL, 0);
if (ret != 0) {
dprintk("%s: WARNING: failed to send IKC msg: %d\n",
__func__, ret);
}
mcctrl_file_to_pidfd_hash_remove(filp, f2pfd->os,
current->group_leader, f2pfd->fd);
mcctrl_put_per_proc_data(ppd);
/* Do not call into Linux driver if timed out in SIGKILL.. */
if (ret == -ETIME && __fatal_signal_pending(current)) {
pr_err("%s: WARNING: failed to send IKC msg in SIGKILL: %d\n",
__func__, ret);
goto out_no_release;
}
out:
dprintk("%s: current PID: %d, comm: %s -> calling release\n",
__func__, task_tgid_vnr(current), current->comm);
return __release_func(inode, filp);
out_no_release:
return ret;
}
int __mcctrl_tof_utofu_release_cq(struct inode *inode, struct file *filp)
{
return __mcctrl_tof_utofu_release_handler(inode, filp,
mcctrl_tof_utofu_release_cq);
}
int __mcctrl_tof_utofu_release_bch(struct inode *inode, struct file *filp)
{
return __mcctrl_tof_utofu_release_handler(inode, filp,
mcctrl_tof_utofu_release_bch);
}
#endif

View File

@ -36,6 +36,7 @@
#include <linux/semaphore.h>
#include <linux/interrupt.h>
#include <linux/cpumask.h>
#include <linux/delay.h>
#include <asm/uaccess.h>
#include <asm/delay.h>
#include <asm/io.h>
@ -378,6 +379,7 @@ static void release_handler(ihk_os_t os, void *param)
int os_ind = ihk_host_os_get_index(os);
unsigned long flags;
struct host_thread *thread;
int ret;
/* Finalize FS switch for uti threads */
write_lock_irqsave(&host_thread_lock, flags);
@ -399,7 +401,13 @@ static void release_handler(ihk_os_t os, void *param)
dprintk("%s: SCD_MSG_CLEANUP_PROCESS, info: %p, cpu: %d\n",
__FUNCTION__, info, info->cpu);
mcctrl_ikc_send(os, info->cpu, &isp);
ret = mcctrl_ikc_send_wait(os, info->cpu,
&isp, -20, NULL, NULL, 0);
if (ret != 0) {
printk("%s: WARNING: failed to send IKC msg: %d\n",
__func__, ret);
}
if (os_ind >= 0) {
delete_pid_entry(os_ind, info->pid);
}
@ -587,13 +595,14 @@ extern int mckernel_cpu_2_linux_cpu(struct mcctrl_usrdata *udp, int cpu_id);
static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
{
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
struct mcctrl_part_exec *pe;
struct mcctrl_part_exec *pe = NULL, *pe_itr;
struct get_cpu_set_arg req;
struct mcctrl_cpu_topology *cpu_top, *cpu_top_i;
struct cache_topology *cache_top;
int cpu, cpus_assigned, cpus_to_assign, cpu_prev;
int ret = 0;
int mcexec_linux_numa;
int pe_list_len = 0;
cpumask_t *mcexec_cpu_set = NULL;
cpumask_t *cpus_used = NULL;
cpumask_t *cpus_to_use = NULL;
@ -614,7 +623,7 @@ static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
}
if (copy_from_user(&req, (void *)arg, sizeof(req))) {
printk("%s: error copying user request\n", __FUNCTION__);
pr_err("%s: error copying user request\n", __func__);
ret = -EINVAL;
goto put_out;
}
@ -691,18 +700,48 @@ static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
goto put_out;
}
pe = &udp->part_exec;
mutex_lock(&udp->part_exec_lock);
/* Find part_exec having same node_proxy */
list_for_each_entry_reverse(pe_itr, &udp->part_exec_list, chain) {
pe_list_len++;
if (pe_itr->node_proxy_pid == req.ppid) {
pe = pe_itr;
break;
}
}
mutex_lock(&pe->lock);
if (!pe) {
/* First process to enter CPU partitioning */
pr_debug("%s: pe_list_len:%d\n", __func__, pe_list_len);
if (pe_list_len >= PE_LIST_MAXLEN) {
/* delete head entry of pe_list */
pe_itr = list_first_entry(&udp->part_exec_list,
struct mcctrl_part_exec, chain);
list_del(&pe_itr->chain);
kfree(pe_itr);
}
/* First process to enter CPU partitioning */
if (pe->nr_processes == -1) {
pe = kzalloc(sizeof(struct mcctrl_part_exec), GFP_KERNEL);
if (!pe) {
mutex_unlock(&udp->part_exec_lock);
ret = -ENOMEM;
goto put_out;
}
/* Init part_exec */
mutex_init(&pe->lock);
INIT_LIST_HEAD(&pe->pli_list);
pe->nr_processes = req.nr_processes;
pe->nr_processes_left = req.nr_processes;
pe->nr_processes_joined = 0;
pe->node_proxy_pid = req.ppid;
list_add_tail(&pe->chain, &udp->part_exec_list);
dprintk("%s: nr_processes: %d (partitioned exec starts)\n",
__FUNCTION__,
pe->nr_processes);
__func__, pe->nr_processes);
}
mutex_unlock(&udp->part_exec_lock);
mutex_lock(&pe->lock);
if (pe->nr_processes != req.nr_processes) {
printk("%s: error: requested number of processes"
@ -712,7 +751,15 @@ static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
goto put_and_unlock_out;
}
if (pe->nr_processes_joined >= pe->nr_processes) {
printk("%s: too many processes have joined to the group of %d\n",
__func__, req.ppid);
ret = -EINVAL;
goto put_and_unlock_out;
}
--pe->nr_processes_left;
++pe->nr_processes_joined;
dprintk("%s: nr_processes: %d, nr_processes_left: %d\n",
__FUNCTION__,
pe->nr_processes,
@ -798,8 +845,6 @@ static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
wake_up_interruptible(&pli_next->pli_wq);
}
/* Reset process counter to start state */
pe->nr_processes = -1;
ret = -ETIMEDOUT;
goto put_and_unlock_out;
}
@ -1047,16 +1092,8 @@ next_cpu:
/* Commit used cores to OS structure */
memcpy(&pe->cpus_used, cpus_used, sizeof(*cpus_used));
/* Reset if last process */
if (pe->nr_processes_left == 0) {
dprintk("%s: nr_processes: %d (partitioned exec ends)\n",
__FUNCTION__,
pe->nr_processes);
pe->nr_processes = -1;
memset(&pe->cpus_used, 0, sizeof(pe->cpus_used));
}
/* Otherwise wake up next process in list */
else {
/* If not last process, wake up next process in list */
if (pe->nr_processes_left != 0) {
++pe->process_rank;
pli_next = list_first_entry(&pe->pli_list,
struct process_list_item, list);
@ -2172,7 +2209,13 @@ static DECLARE_WAIT_QUEUE_HEAD(perfctrlq);
long mcctrl_perf_num(ihk_os_t os, unsigned long arg)
{
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct mcctrl_usrdata *usrdata;
if (!os || ihk_host_validate_os(os)) {
return -EINVAL;
}
usrdata = ihk_host_os_get_usrdata(os);
if (!usrdata) {
pr_err("%s: error: mcctrl_usrdata not found\n", __func__);
@ -2197,22 +2240,34 @@ struct mcctrl_perf_ctrl_desc {
*/
long mcctrl_perf_set(ihk_os_t os, struct ihk_perf_event_attr *__user arg)
{
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct mcctrl_usrdata *usrdata = NULL;
struct ikc_scd_packet isp;
struct perf_ctrl_desc *perf_desc;
struct ihk_perf_event_attr attr;
struct ihk_cpu_info *info = ihk_os_get_cpu_info(os);
struct ihk_cpu_info *info = NULL;
int ret = 0;
int i = 0, j = 0;
int need_free;
int num_registered = 0;
int err = 0;
if (!os || ihk_host_validate_os(os)) {
return -EINVAL;
}
usrdata = ihk_host_os_get_usrdata(os);
if (!usrdata) {
pr_err("%s: error: mcctrl_usrdata not found\n", __func__);
return -EINVAL;
}
info = ihk_os_get_cpu_info(os);
if (!info) {
pr_err("%s: error: cannot get cpu info\n", __func__);
return -EINVAL;
}
for (i = 0; i < usrdata->perf_event_num; i++) {
ret = copy_from_user(&attr, &arg[i],
sizeof(struct ihk_perf_event_attr));
@ -2272,20 +2327,30 @@ long mcctrl_perf_set(ihk_os_t os, struct ihk_perf_event_attr *__user arg)
long mcctrl_perf_get(ihk_os_t os, unsigned long *__user arg)
{
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct mcctrl_usrdata *usrdata = NULL;
struct ikc_scd_packet isp;
struct perf_ctrl_desc *perf_desc;
struct ihk_cpu_info *info = ihk_os_get_cpu_info(os);
struct ihk_cpu_info *info = NULL;
unsigned long value_sum = 0;
int ret = 0;
int i = 0, j = 0;
int need_free;
if (!os || ihk_host_validate_os(os)) {
return -EINVAL;
}
usrdata = ihk_host_os_get_usrdata(os);
if (!usrdata) {
pr_err("%s: error: mcctrl_usrdata not found\n", __func__);
return -EINVAL;
}
info = ihk_os_get_cpu_info(os);
if (!info || info->n_cpus < 1) {
return -EINVAL;
}
for (i = 0; i < usrdata->perf_event_num; i++) {
perf_desc = kmalloc(sizeof(struct mcctrl_perf_ctrl_desc),
GFP_KERNEL);
@ -2333,15 +2398,20 @@ long mcctrl_perf_get(ihk_os_t os, unsigned long *__user arg)
long mcctrl_perf_enable(ihk_os_t os)
{
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct mcctrl_usrdata *usrdata = NULL;
struct ikc_scd_packet isp;
struct perf_ctrl_desc *perf_desc;
struct ihk_cpu_info *info = ihk_os_get_cpu_info(os);
struct ihk_cpu_info *info = NULL;
unsigned long cntr_mask = 0;
int ret = 0;
int i = 0, j = 0;
int need_free;
if (!os || ihk_host_validate_os(os)) {
return -EINVAL;
}
usrdata = ihk_host_os_get_usrdata(os);
if (!usrdata) {
pr_err("%s: error: mcctrl_usrdata not found\n", __func__);
return -EINVAL;
@ -2364,6 +2434,11 @@ long mcctrl_perf_enable(ihk_os_t os)
isp.msg = SCD_MSG_PERF_CTRL;
isp.arg = virt_to_phys(perf_desc);
info = ihk_os_get_cpu_info(os);
if (!info || info->n_cpus < 1) {
kfree(perf_desc);
return -EINVAL;
}
for (j = 0; j < info->n_cpus; j++) {
ret = mcctrl_ikc_send_wait(os, j, &isp, 0,
wakeup_desc_of_perf_desc(perf_desc),
@ -2391,15 +2466,20 @@ long mcctrl_perf_enable(ihk_os_t os)
long mcctrl_perf_disable(ihk_os_t os)
{
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct mcctrl_usrdata *usrdata = NULL;
struct ikc_scd_packet isp;
struct perf_ctrl_desc *perf_desc;
struct ihk_cpu_info *info = ihk_os_get_cpu_info(os);
struct ihk_cpu_info *info = NULL;
unsigned long cntr_mask = 0;
int ret = 0;
int i = 0, j = 0;
int need_free;
if (!os || ihk_host_validate_os(os)) {
return -EINVAL;
}
usrdata = ihk_host_os_get_usrdata(os);
if (!usrdata) {
pr_err("%s: error: mcctrl_usrdata not found\n", __func__);
return -EINVAL;
@ -2422,6 +2502,11 @@ long mcctrl_perf_disable(ihk_os_t os)
isp.msg = SCD_MSG_PERF_CTRL;
isp.arg = virt_to_phys(perf_desc);
info = ihk_os_get_cpu_info(os);
if (!info || info->n_cpus < 1) {
kfree(perf_desc);
return -EINVAL;
}
for (j = 0; j < info->n_cpus; j++) {
ret = mcctrl_ikc_send_wait(os, j, &isp, 0,
wakeup_desc_of_perf_desc(perf_desc),
@ -2463,6 +2548,10 @@ long mcctrl_getrusage(ihk_os_t ihk_os, struct mcctrl_ioctl_getrusage_desc *__use
unsigned long ut;
unsigned long st;
if (!ihk_os || ihk_host_validate_os(ihk_os)) {
return -EINVAL;
}
ret = copy_from_user(&desc, _desc, sizeof(struct mcctrl_ioctl_getrusage_desc));
if (ret != 0) {
printk("%s: copy_from_user failed\n", __FUNCTION__);
@ -3451,7 +3540,7 @@ int mcctrl_get_request_os_cpu(ihk_os_t os, int *ret_cpu)
struct ihk_ikc_channel_desc *ch;
int ret = 0;
if (!os) {
if (!os || ihk_host_validate_os(os) || !ret_cpu) {
return -EINVAL;
}

View File

@ -50,6 +50,9 @@ extern void procfs_exit(int);
extern void uti_attr_finalize(void);
extern void binfmt_mcexec_init(void);
extern void binfmt_mcexec_exit(void);
#ifdef ENABLE_TOFU
extern void mcctrl_file_to_pidfd_hash_init(void);
#endif
extern int mcctrl_os_read_cpu_register(ihk_os_t os, int cpu,
struct ihk_os_cpu_register *desc);
@ -57,6 +60,11 @@ extern int mcctrl_os_write_cpu_register(ihk_os_t os, int cpu,
struct ihk_os_cpu_register *desc);
extern int mcctrl_get_request_os_cpu(ihk_os_t os, int *cpu);
#ifdef ENABLE_TOFU
extern void mcctrl_tofu_hijack_release_handlers(void);
extern void mcctrl_tofu_restore_release_handlers(void);
#endif
static long mcctrl_ioctl(ihk_os_t os, unsigned int request, void *priv,
unsigned long arg, struct file *file)
{
@ -319,10 +327,17 @@ static int __init mcctrl_init(void)
}
binfmt_mcexec_init();
#ifdef ENABLE_TOFU
mcctrl_file_to_pidfd_hash_init();
#endif
if ((ret = symbols_init()))
goto error;
#ifdef ENABLE_TOFU
mcctrl_tofu_hijack_release_handlers();
#endif
if ((ret = ihk_host_register_os_notifier(&mcctrl_os_notifier)) != 0) {
printk("mcctrl: error: registering OS notifier\n");
goto error;
@ -345,6 +360,9 @@ static void __exit mcctrl_exit(void)
binfmt_mcexec_exit();
uti_attr_finalize();
#ifdef ENABLE_TOFU
mcctrl_tofu_restore_release_handlers();
#endif
printk("mcctrl: unregistered.\n");
}

View File

@ -142,13 +142,35 @@ int mcctrl_ikc_send_wait(ihk_os_t os, int cpu, struct ikc_scd_packet *pisp,
ret = mcctrl_ikc_send(os, cpu, pisp);
if (ret < 0) {
pr_warn("%s: mcctrl_ikc_send failed: %d\n", __func__, ret);
kfree(desc);
if (alloc_desc)
kfree(desc);
return ret;
}
if (timeout) {
ret = wait_event_interruptible_timeout(desc->wq,
desc->status, timeout);
/*
* Negative timeout indicates busy waiting, which can be used
* in situations where wait_event_interruptible_XXX() would
* fail, e.g., in a signal handler, at the time the process
* is being killed, etc.
*/
if (timeout < 0) {
unsigned long timeout_jiffies =
jiffies + msecs_to_jiffies(timeout * -1);
ret = -ETIME;
while (time_before(jiffies, timeout_jiffies)) {
schedule();
if (READ_ONCE(desc->status)) {
ret = 0;
break;
}
}
}
else {
ret = wait_event_interruptible_timeout(desc->wq,
desc->status, msecs_to_jiffies(timeout));
}
} else {
ret = wait_event_interruptible(desc->wq, desc->status);
}
@ -210,6 +232,8 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
case SCD_MSG_PROCFS_ANSWER:
case SCD_MSG_REMOTE_PAGE_FAULT_ANSWER:
case SCD_MSG_CPU_RW_REG_RESP:
case SCD_MSG_CLEANUP_PROCESS_RESP:
case SCD_MSG_CLEANUP_FD_RESP:
mcctrl_wakeup_cb(__os, pisp);
break;
@ -280,7 +304,11 @@ int mcctrl_ikc_send(ihk_os_t os, int cpu, struct ikc_scd_packet *pisp)
{
struct mcctrl_usrdata *usrdata;
if (!os || cpu < 0) {
if (!os || ihk_host_validate_os(os) || !pisp) {
return -EINVAL;
}
if (cpu < 0) {
return -EINVAL;
}
@ -513,6 +541,7 @@ int prepare_ikc_channels(ihk_os_t os)
init_waitqueue_head(&usrdata->wq_procfs);
mutex_init(&usrdata->reserve_lock);
mutex_init(&usrdata->part_exec_lock);
for (i = 0; i < MCCTRL_PER_PROC_DATA_HASH_SIZE; ++i) {
INIT_LIST_HEAD(&usrdata->per_proc_data_hash[i]);
@ -521,10 +550,8 @@ int prepare_ikc_channels(ihk_os_t os)
INIT_LIST_HEAD(&usrdata->cpu_topology_list);
INIT_LIST_HEAD(&usrdata->node_topology_list);
INIT_LIST_HEAD(&usrdata->part_exec_list);
mutex_init(&usrdata->part_exec.lock);
INIT_LIST_HEAD(&usrdata->part_exec.pli_list);
usrdata->part_exec.nr_processes = -1;
INIT_LIST_HEAD(&usrdata->wakeup_descs_list);
spin_lock_init(&usrdata->wakeup_descs_lock);
@ -580,6 +607,18 @@ void destroy_ikc_channels(ihk_os_t os)
kfree(usrdata->channels);
kfree(usrdata->ikc2linux);
mutex_lock(&usrdata->part_exec_lock);
while (!list_empty(&usrdata->part_exec_list)) {
struct mcctrl_part_exec *pe;
pe = list_first_entry(&usrdata->part_exec_list,
struct mcctrl_part_exec, chain);
list_del(&pe->chain);
kfree(pe);
}
mutex_unlock(&usrdata->part_exec_lock);
kfree(usrdata);
}

View File

@ -58,7 +58,8 @@
#define SCD_MSG_SEND_SIGNAL 0x7
#define SCD_MSG_SEND_SIGNAL_ACK 0x8
#define SCD_MSG_CLEANUP_PROCESS 0x9
#define SCD_MSG_GET_VDSO_INFO 0xa
#define SCD_MSG_CLEANUP_PROCESS_RESP 0xa
#define SCD_MSG_GET_VDSO_INFO 0xb
//#define SCD_MSG_GET_CPU_MAPPING 0xc
//#define SCD_MSG_REPLY_GET_CPU_MAPPING 0xd
@ -104,6 +105,8 @@
#define SCD_MSG_CPU_RW_REG 0x52
#define SCD_MSG_CPU_RW_REG_RESP 0x53
#define SCD_MSG_CLEANUP_FD 0x54
#define SCD_MSG_CLEANUP_FD_RESP 0x55
#define SCD_MSG_FUTEX_WAKE 0x60
@ -324,13 +327,20 @@ struct process_list_item {
wait_queue_head_t pli_wq;
};
#define PE_LIST_MAXLEN 5
struct mcctrl_part_exec {
struct mutex lock;
int nr_processes;
/* number of processes to let in / out the synchronization point */
int nr_processes_left;
/* number of processes which have joined the partition */
int nr_processes_joined;
int process_rank;
pid_t node_proxy_pid;
cpumask_t cpus_used;
struct list_head pli_list;
struct list_head chain;
};
#define CPU_LONGS (((NR_CPUS) + (BITS_PER_LONG) - 1) / (BITS_PER_LONG))
@ -353,6 +363,7 @@ struct mcctrl_usrdata {
int job_pos;
int mcctrl_dma_abort;
struct mutex reserve_lock;
struct mutex part_exec_lock;
unsigned long last_thread_exec;
wait_queue_head_t wq_procfs;
struct list_head per_proc_data_hash[MCCTRL_PER_PROC_DATA_HASH_SIZE];
@ -368,7 +379,7 @@ struct mcctrl_usrdata {
nodemask_t numa_online;
struct list_head cpu_topology_list;
struct list_head node_topology_list;
struct mcctrl_part_exec part_exec;
struct list_head part_exec_list;
int perf_event_num;
};
@ -548,4 +559,31 @@ struct uti_futex_resp {
int done;
wait_queue_head_t wq;
};
#ifdef ENABLE_TOFU
/*
* Hash table to keep track of files and related processes
* and file descriptors.
* NOTE: Used for Tofu driver release handlers.
*/
#define MCCTRL_FILE_2_PIDFD_HASH_SHIFT 4
#define MCCTRL_FILE_2_PIDFD_HASH_SIZE (1 << MCCTRL_FILE_2_PIDFD_HASH_SHIFT)
#define MCCTRL_FILE_2_PIDFD_HASH_MASK (MCCTRL_FILE_2_PIDFD_HASH_SIZE - 1)
struct mcctrl_file_to_pidfd {
struct file *filp;
ihk_os_t os;
struct task_struct *group_leader;
int pid;
int fd;
struct list_head hash;
};
int mcctrl_file_to_pidfd_hash_insert(struct file *filp,
ihk_os_t os, int pid, struct task_struct *group_leader, int fd);
struct mcctrl_file_to_pidfd *mcctrl_file_to_pidfd_hash_lookup(
struct file *filp, struct task_struct *group_leader);
int mcctrl_file_to_pidfd_hash_remove(struct file *filp,
ihk_os_t os, struct task_struct *group_leader, int fd);
#endif
#endif

View File

@ -126,7 +126,7 @@ find_procfs_entry(struct procfs_list_entry *parent, const char *name)
static void
delete_procfs_entries(struct procfs_list_entry *top)
{
struct procfs_list_entry *e;
struct procfs_list_entry *e = NULL;
struct procfs_list_entry *n;
list_del(&top->list);
@ -136,8 +136,10 @@ delete_procfs_entries(struct procfs_list_entry *top)
}
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
e->entry->read_proc = NULL;
e->entry->data = NULL;
if (e) {
e->entry->read_proc = NULL;
e->entry->data = NULL;
}
#endif
remove_proc_entry(top->name, top->parent? top->parent->entry: NULL);
if(top->data)

View File

@ -45,6 +45,9 @@
#include <linux/mount.h>
#include <linux/kdev_t.h>
#include <linux/hugetlb.h>
#include <linux/proc_fs.h>
#include <linux/rbtree.h>
#include <linux/llist.h>
#include <asm/uaccess.h>
#include <asm/delay.h>
#include <asm/io.h>
@ -52,6 +55,7 @@
#include "mcctrl.h"
#include <linux/version.h>
#include <archdeps.h>
#include <asm/pgtable.h>
#define ALIGN_WAIT_BUF(z) (((z + 63) >> 6) << 6)
@ -655,6 +659,9 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
goto put_and_out;
}
// Force regular page size
pgsize = PAGE_SIZE;
rva = (unsigned long)addr & ~(pgsize - 1);
rpa = rpa & ~(pgsize - 1);
@ -666,7 +673,8 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
/* LWK may hold large page based mappings that align rva outside
* Linux' VMA, make sure we don't try to map to those pages */
if (rva + (pix * PAGE_SIZE) < vma->vm_start) {
if (rva + (pix * PAGE_SIZE) < vma->vm_start ||
rva + (pix * PAGE_SIZE) > vma->vm_end) {
continue;
}
@ -677,11 +685,11 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
if (error) {
pr_err("%s: error inserting mapping for 0x%#lx "
"(req: TID: %d, syscall: %lu) error: %d,"
" vm_start: 0x%lx, vm_end: 0x%lx\n",
" vm_start: 0x%lx, vm_end: 0x%lx, pgsize: %lu, ind: %lu\n",
__func__,
(unsigned long)addr, packet.fault_tid,
rsysnum, error,
vma->vm_start, vma->vm_end);
vma->vm_start, vma->vm_end, pgsize, pix);
}
}
else
@ -1835,20 +1843,160 @@ static long pager_call(ihk_os_t os, struct syscall_request *req)
return ret;
}
#ifdef ENABLE_TOFU
struct list_head mcctrl_file_to_pidfd_hash[MCCTRL_FILE_2_PIDFD_HASH_SIZE];
spinlock_t mcctrl_file_to_pidfd_hash_lock;
void mcctrl_file_to_pidfd_hash_init(void)
{
int hash;
spin_lock_init(&mcctrl_file_to_pidfd_hash_lock);
for (hash = 0; hash < MCCTRL_FILE_2_PIDFD_HASH_SIZE; ++hash) {
INIT_LIST_HEAD(&mcctrl_file_to_pidfd_hash[hash]);
}
}
int mcctrl_file_to_pidfd_hash_insert(struct file *filp,
ihk_os_t os, int pid, struct task_struct *group_leader, int fd)
{
unsigned long irqflags;
struct mcctrl_file_to_pidfd *file2pidfd_iter;
struct mcctrl_file_to_pidfd *file2pidfd;
int hash = (int)((unsigned long)filp &
(unsigned long)MCCTRL_FILE_2_PIDFD_HASH_MASK);
int ret = 0;
file2pidfd = kmalloc(sizeof(*file2pidfd), GFP_ATOMIC);
if (!file2pidfd)
return -ENOMEM;
file2pidfd->filp = filp;
file2pidfd->os = os;
file2pidfd->pid = pid;
file2pidfd->group_leader = group_leader;
file2pidfd->fd = fd;
spin_lock_irqsave(&mcctrl_file_to_pidfd_hash_lock, irqflags);
list_for_each_entry(file2pidfd_iter,
&mcctrl_file_to_pidfd_hash[hash], hash) {
if (file2pidfd_iter->filp == filp) {
printk("%s: WARNING: filp: %p, pid: %d, fd: %d exists\n",
__func__, filp, pid, fd);
ret = -EBUSY;
goto free_out;
}
}
list_add_tail(&file2pidfd->hash,
&mcctrl_file_to_pidfd_hash[hash]);
dprintk("%s: filp: %p, pid: %d, fd: %d added\n",
__func__, filp, pid, fd);
spin_unlock_irqrestore(&mcctrl_file_to_pidfd_hash_lock, irqflags);
return ret;
free_out:
kfree(file2pidfd);
spin_unlock_irqrestore(&mcctrl_file_to_pidfd_hash_lock, irqflags);
return ret;
}
/*
* XXX: lookup relies on group_leader to identify the process
* because PIDs might be different across name spaces (e.g.,
* when using Docker)
*/
struct mcctrl_file_to_pidfd *mcctrl_file_to_pidfd_hash_lookup(
struct file *filp, struct task_struct *group_leader)
{
unsigned long irqflags;
struct mcctrl_file_to_pidfd *file2pidfd_iter;
struct mcctrl_file_to_pidfd *file2pidfd = NULL;
int hash = (int)((unsigned long)filp &
(unsigned long)MCCTRL_FILE_2_PIDFD_HASH_MASK);
spin_lock_irqsave(&mcctrl_file_to_pidfd_hash_lock, irqflags);
list_for_each_entry(file2pidfd_iter,
&mcctrl_file_to_pidfd_hash[hash], hash) {
if (file2pidfd_iter->filp == filp &&
file2pidfd_iter->group_leader == group_leader) {
file2pidfd = file2pidfd_iter;
dprintk("%s: filp: %p, pid: %d, fd: %d found\n",
__func__, filp, file2pidfd->pid, file2pidfd->fd);
break;
}
}
spin_unlock_irqrestore(&mcctrl_file_to_pidfd_hash_lock, irqflags);
return file2pidfd;
}
int mcctrl_file_to_pidfd_hash_remove(struct file *filp,
ihk_os_t os, struct task_struct *group_leader, int fd)
{
unsigned long irqflags;
struct mcctrl_file_to_pidfd *file2pidfd_iter;
int hash = (int)((unsigned long)filp &
(unsigned long)MCCTRL_FILE_2_PIDFD_HASH_MASK);
int ret = 0;
spin_lock_irqsave(&mcctrl_file_to_pidfd_hash_lock, irqflags);
list_for_each_entry(file2pidfd_iter,
&mcctrl_file_to_pidfd_hash[hash], hash) {
if (file2pidfd_iter->filp != filp)
continue;
if (file2pidfd_iter->os != os)
continue;
if (file2pidfd_iter->group_leader != group_leader)
continue;
if (file2pidfd_iter->fd != fd)
continue;
list_del(&file2pidfd_iter->hash);
dprintk("%s: filp: %p, pid: %d, fd: %d removed\n",
__func__, filp, file2pidfd_iter->pid, fd);
kfree(file2pidfd_iter);
goto unlock_out;
}
dprintk("%s: filp: %p, pid: %d, fd: %d couldn't be found\n",
__func__, filp, pid, fd);
ret = -ENOENT;
unlock_out:
spin_unlock_irqrestore(&mcctrl_file_to_pidfd_hash_lock, irqflags);
return ret;
}
#endif
void __return_syscall(ihk_os_t os, struct ikc_scd_packet *packet,
long ret, int stid)
{
unsigned long phys;
struct syscall_response *res;
if (!os || ihk_host_validate_os(os) || !packet) {
return;
}
phys = ihk_device_map_memory(ihk_os_to_dev(os),
packet->resp_pa, sizeof(*res));
if (!phys) {
return;
}
res = ihk_device_map_virtual(ihk_os_to_dev(os),
phys, sizeof(*res), NULL, 0);
if (!res) {
printk("%s: ERROR: invalid response structure address\n",
__FUNCTION__);
ihk_device_unmap_memory(ihk_os_to_dev(os), phys, sizeof(*res));
return;
}
@ -1856,6 +2004,59 @@ void __return_syscall(ihk_os_t os, struct ikc_scd_packet *packet,
res->ret = ret;
res->stid = stid;
#ifdef ENABLE_TOFU
/* Record PDE_DATA after open() calls for Tofu driver */
if (packet->req.number == __NR_openat && ret > 1) {
char *pathbuf, *fullpath;
struct fd f;
int fd;
fd = ret;
f = fdget(fd);
if (!f.file) {
goto out_notify;
}
pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC);
if (!pathbuf) {
goto out_fdput;
}
fullpath = d_path(&f.file->f_path, pathbuf, PATH_MAX);
if (IS_ERR(fullpath)) {
goto out_free;
}
if (!strncmp("/proc/tofu/dev/", fullpath, 15)) {
res->pde_data = PDE_DATA(file_inode(f.file));
dprintk("%s: fd: %d, path: %s, PDE_DATA: 0x%lx\n",
__func__,
fd,
fullpath,
(unsigned long)res->pde_data);
dprintk("%s: pgd_index: %ld, pmd_index: %ld, pte_index: %ld\n",
__func__,
pgd_index((unsigned long)res->pde_data),
pmd_index((unsigned long)res->pde_data),
pte_index((unsigned long)res->pde_data));
#ifdef CONFIG_ARM64
dprintk("CONFIG_ARM64_VA_BITS: %d, PGDIR_SHIFT: %d\n",
CONFIG_ARM64_VA_BITS, PGDIR_SHIFT);
#endif
mcctrl_file_to_pidfd_hash_insert(f.file, os,
task_tgid_vnr(current),
current->group_leader, fd);
}
out_free:
kfree(pathbuf);
out_fdput:
fdput(f);
}
out_notify:
#endif
if (__notify_syscall_requester(os, packet, res) < 0) {
printk("%s: WARNING: failed to notify PID %d\n",
__FUNCTION__, packet->pid);
@ -2163,6 +2364,93 @@ int __do_in_kernel_irq_syscall(ihk_os_t os, struct ikc_scd_packet *packet)
return 0;
}
/*
* Memory clearing helpers.
*/
struct node_distance;
#define IHK_RBTREE_ALLOCATOR
#ifdef IHK_RBTREE_ALLOCATOR
struct free_chunk {
unsigned long addr, size;
struct rb_node node;
struct llist_node list;
};
#endif
typedef struct mcs_lock_node {
#ifndef SPIN_LOCK_IN_MCS
unsigned long locked;
struct mcs_lock_node *next;
#endif
unsigned long irqsave;
#ifdef SPIN_LOCK_IN_MCS
ihk_spinlock_t spinlock;
#endif
#ifndef ENABLE_UBSAN
} __aligned(64) mcs_lock_node_t;
#else
} mcs_lock_node_t;
#endif
struct ihk_mc_numa_node {
int id;
int linux_numa_id;
int type;
struct list_head allocators;
struct node_distance *nodes_by_distance;
#ifdef IHK_RBTREE_ALLOCATOR
atomic_t zeroing_workers;
atomic_t nr_to_zero_pages;
struct llist_head zeroed_list;
struct llist_head to_zero_list;
struct rb_root free_chunks;
mcs_lock_node_t lock;
unsigned long nr_pages;
/*
* nr_free_pages: all freed pages, zeroed if zero_at_free
*/
unsigned long nr_free_pages;
unsigned long min_addr;
unsigned long max_addr;
#endif
};
void mcctrl_zero_mckernel_pages(unsigned long arg)
{
struct llist_node *llnode;
struct ihk_mc_numa_node *node =
(struct ihk_mc_numa_node *)arg;
/* Iterate free chunks */
while ((llnode = llist_del_first(&node->to_zero_list))) {
unsigned long addr;
unsigned long size;
struct free_chunk *chunk =
container_of(llnode, struct free_chunk, list);
addr = chunk->addr;
size = chunk->size;
memset(phys_to_virt(addr) + sizeof(*chunk), 0,
chunk->size - sizeof(*chunk));
llist_add(&chunk->list, &node->zeroed_list);
dprintk("%s: zeroed %lu pages @ McKernel NUMA %d (chunk: 0x%lx:%lu)\n",
__func__,
size >> PAGE_SHIFT,
node->id,
addr, size);
barrier();
atomic_sub((int)(size >> PAGE_SHIFT), &node->nr_to_zero_pages);
}
atomic_dec(&node->zeroing_workers);
}
int __do_in_kernel_syscall(ihk_os_t os, struct ikc_scd_packet *packet)
{
struct syscall_request *sc = &packet->req;
@ -2171,6 +2459,28 @@ int __do_in_kernel_syscall(ihk_os_t os, struct ikc_scd_packet *packet)
dprintk("%s: system call: %lx\n", __FUNCTION__, sc->args[0]);
switch (sc->number) {
#ifdef ENABLE_TOFU
case __NR_close: {
struct fd f;
int fd;
fd = (int)sc->args[0];
if (fd > 2) {
f = fdget(fd);
if (f.file) {
mcctrl_file_to_pidfd_hash_remove(f.file, os,
current->group_leader, fd);
fdput(f);
}
}
error = -ENOSYS;
goto out;
break;
}
#endif
case __NR_mmap:
ret = pager_call(os, sc);
break;
@ -2183,6 +2493,14 @@ int __do_in_kernel_syscall(ihk_os_t os, struct ikc_scd_packet *packet)
ret = remap_user_space(sc->args[0], sc->args[1], sc->args[2]);
break;
case __NR_move_pages:
/*
* move pages is used for zeroing McKernel side memory,
* this call is NOT offloaded by applications.
*/
mcctrl_zero_mckernel_pages(sc->args[0]);
goto out_no_syscall_return;
case __NR_exit_group: {
/* Make sure the user space handler will be called as well */
@ -2267,6 +2585,8 @@ sched_setparam_out:
}
__return_syscall(os, packet, ret, 0);
out_no_syscall_return:
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet);
error = 0;

View File

@ -68,13 +68,13 @@
#include <sys/user.h>
#endif /* !__aarch64__ */
#include <sys/prctl.h>
#include "../../config.h"
#include "../include/uprotocol.h"
#include <ihk/ihk_host_user.h>
#include "../include/uti.h"
#include <getopt.h>
#include "archdep.h"
#include "arch_args.h"
#include "../../config.h"
#include <numa.h>
#include <numaif.h>
#include <spawn.h>
@ -84,7 +84,11 @@
#include "../include/pmi.h"
#include "../include/qlmpi.h"
#include <sys/xattr.h>
#include "../include/defs.h"
#include "../../lib/include/list.h"
#include "../../lib/include/bitops-set_bit.h"
#include "../../lib/include/bitops-clear_bit.h"
#include "../../lib/include/bitops-test_bit.h"
//#define DEBUG
#define ADD_ENVS_OPTION
@ -187,6 +191,8 @@ static int mpol_no_stack = 0;
static int mpol_no_bss = 0;
static int mpol_shm_premap = 0;
static int no_bind_ikc_map = 0;
static int straight_map = 0;
static unsigned long straight_map_threshold = (1024*1024);
static unsigned long mpol_threshold = 0;
static unsigned long heap_extension = -1;
static int profile = 0;
@ -198,6 +204,9 @@ static char *mpol_bind_nodes = NULL;
static int uti_thread_rank = 0;
static int uti_use_last_cpu = 0;
static int enable_uti = 0;
#ifdef ENABLE_TOFU
static int enable_tofu = 0;
#endif
/* Partitioned execution (e.g., for MPI) */
static int nr_processes = 0;
@ -1053,6 +1062,64 @@ static inline cpu_set_t *numa_node_set(int n)
return (cpu_set_t *)(numa_nodes + n * cpu_set_size);
}
static inline void _numa_local(__cpu_set_unit *localset,
unsigned long *nodemask, int nonlocal)
{
int i;
memset(nodemask, 0, PLD_PROCESS_NUMA_MASK_BITS / 8);
for (i = 0; i < nnodes; i++) {
cpu_set_t *nodeset = numa_node_set(i);
int j;
if (nonlocal) {
set_bit(i, nodemask);
}
for (j = 0; j < ncpu; j++) {
if (test_bit(j, localset)) {
__dprintf("%d belongs to local set\n", j);
}
if (CPU_ISSET_S(j, cpu_set_size, nodeset)) {
__dprintf("%d belongs to node %d\n", j, i);
}
if (test_bit(j, localset) &&
CPU_ISSET_S(j, cpu_set_size, nodeset)) {
if (nonlocal) {
clear_bit(i, nodemask);
} else {
set_bit(i, nodemask);
}
}
}
}
}
static inline void numa_local(__cpu_set_unit *localset, unsigned long *nodemask)
{
_numa_local(localset, nodemask, 0);
}
static inline void numa_nonlocal(__cpu_set_unit *localset,
unsigned long *nodemask)
{
_numa_local(localset, nodemask, 1);
}
static inline void numa_all(unsigned long *nodemask)
{
int i;
memset(nodemask, 0, PLD_PROCESS_NUMA_MASK_BITS / 8);
for (i = 0; i < nnodes; i++) {
set_bit(i, nodemask);
}
}
pid_t master_tid;
pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
@ -1674,6 +1741,18 @@ static struct option mcexec_options[] = {
.flag = NULL,
.val = 'M',
},
{
.name = "enable-straight-map",
.has_arg = no_argument,
.flag = &straight_map,
.val = 1,
},
{
.name = "straight-map-threshold",
.has_arg = required_argument,
.flag = NULL,
.val = 'S',
},
{
.name = "disable-sched-yield",
.has_arg = no_argument,
@ -1710,6 +1789,14 @@ static struct option mcexec_options[] = {
.flag = &enable_uti,
.val = 1,
},
#ifdef ENABLE_TOFU
{
.name = "enable-tofu",
.has_arg = no_argument,
.flag = &enable_tofu,
.val = 1,
},
#endif
{
.name = "debug-mcexec",
.has_arg = no_argument,
@ -2095,10 +2182,10 @@ int main(int argc, char **argv)
/* Parse options ("+" denotes stop at the first non-option) */
#ifdef ADD_ENVS_OPTION
while ((opt = getopt_long(argc, argv, "+c:n:t:M:h:e:s:m:u:",
while ((opt = getopt_long(argc, argv, "+c:n:t:M:h:e:s:m:u:S:",
mcexec_options, NULL)) != -1) {
#else /* ADD_ENVS_OPTION */
while ((opt = getopt_long(argc, argv, "+c:n:t:M:h:s:m:u:",
while ((opt = getopt_long(argc, argv, "+c:n:t:M:h:s:m:u:S:",
mcexec_options, NULL)) != -1) {
#endif /* ADD_ENVS_OPTION */
switch (opt) {
@ -2140,6 +2227,10 @@ int main(int argc, char **argv)
heap_extension = atobytes(optarg);
break;
case 'S':
straight_map_threshold = atobytes(optarg);
break;
#ifdef ADD_ENVS_OPTION
case 'e':
add_env_list(&extra_env, optarg);
@ -2554,6 +2645,7 @@ int main(int argc, char **argv)
cpu_set_arg.cpu_set = (void *)&desc->cpu_set;
cpu_set_arg.cpu_set_size = sizeof(desc->cpu_set);
cpu_set_arg.nr_processes = nr_processes;
cpu_set_arg.ppid = getppid();
cpu_set_arg.target_core = &target_core;
cpu_set_arg.process_rank = &process_rank;
cpu_set_arg.mcexec_linux_numa = &mcexec_linux_numa;
@ -2659,6 +2751,7 @@ int main(int argc, char **argv)
desc->heap_extension = heap_extension;
desc->mpol_bind_mask = 0;
desc->mpol_mode = PLD_MPOL_MAX; /* not specified */
if (mpol_bind_nodes) {
struct bitmask *bind_mask;
bind_mask = numa_parse_nodestring_all(mpol_bind_nodes);
@ -2672,11 +2765,65 @@ int main(int argc, char **argv)
}
}
}
/* Fujitsu TCS specific: mempolicy */
else if (getenv("OMPI_MCA_plm_ple_memory_allocation_policy")) {
char *mpol =
getenv("OMPI_MCA_plm_ple_memory_allocation_policy");
__dprintf("OMPI_MCA_plm_ple_memory_allocation_policy: %s\n",
mpol);
if (!strncmp(mpol, "localalloc", 10)) {
/* MPOL_DEFAULT has the same effect as MPOL_LOCAL */
desc->mpol_mode = MPOL_DEFAULT;
}
else if (!strncmp(mpol, "interleave_local", 16)) {
desc->mpol_mode = MPOL_INTERLEAVE;
numa_local(desc->cpu_set, desc->mpol_nodemask);
}
else if (!strncmp(mpol, "interleave_nonlocal", 19)) {
desc->mpol_mode = MPOL_INTERLEAVE;
numa_nonlocal(desc->cpu_set, desc->mpol_nodemask);
}
else if (!strncmp(mpol, "interleave_all", 14)) {
desc->mpol_mode = MPOL_INTERLEAVE;
numa_all(desc->mpol_nodemask);
}
else if (!strncmp(mpol, "bind_local", 10)) {
desc->mpol_mode = MPOL_BIND;
numa_local(desc->cpu_set, desc->mpol_nodemask);
}
else if (!strncmp(mpol, "bind_nonlocal", 13)) {
desc->mpol_mode = MPOL_BIND;
numa_nonlocal(desc->cpu_set, desc->mpol_nodemask);
}
else if (!strncmp(mpol, "bind_all", 8)) {
desc->mpol_mode = MPOL_BIND;
numa_all(desc->mpol_nodemask);
}
else if (!strncmp(mpol, "prefer_local", 12)) {
desc->mpol_mode = MPOL_PREFERRED;
numa_local(desc->cpu_set, desc->mpol_nodemask);
}
else if (!strncmp(mpol, "prefer_nonlocal", 15)) {
desc->mpol_mode = MPOL_PREFERRED;
numa_nonlocal(desc->cpu_set, desc->mpol_nodemask);
}
__dprintf("mpol_mode: %d, mpol_nodemask: %ld\n",
desc->mpol_mode, desc->mpol_nodemask[0]);
}
desc->uti_thread_rank = uti_thread_rank;
desc->uti_use_last_cpu = uti_use_last_cpu;
desc->thp_disable = get_thp_disable();
desc->straight_map = straight_map;
desc->straight_map_threshold = straight_map_threshold;
#ifdef ENABLE_TOFU
desc->enable_tofu = enable_tofu;
#endif
/* user_start and user_end are set by this call */
if (ioctl(fd, MCEXEC_UP_PREPARE_IMAGE, (unsigned long)desc) != 0) {
perror("prepare");

2
ihk

Submodule ihk updated: 97a2723e48...30e8b79b7c

View File

@ -51,6 +51,12 @@ set(MCKERNEL_SRCS
${IHK_FULL_SOURCE_DIR}/cokernel/smp/${ARCH}/setup.c
)
if (ENABLE_TOFU)
list(APPEND MCKERNEL_SRCS
tofu/tof_utofu_main.c
)
endif()
if (ENABLE_UBSAN)
add_compile_options(-fsanitize=undefined)
list(APPEND MCKERNEL_SRCS ubsan.c)

View File

@ -542,10 +542,42 @@ static int process_msg_prepare_process(unsigned long rphys)
}
vm->numa_mem_policy = MPOL_BIND;
}
else if (pn->mpol_mode != MPOL_MAX) {
int bit;
vm->numa_mem_policy = pn->mpol_mode;
memset(&vm->numa_mask, 0, sizeof(vm->numa_mask));
for_each_set_bit(bit, pn->mpol_nodemask,
PLD_PROCESS_NUMA_MASK_BITS) {
if (bit >= ihk_mc_get_nr_numa_nodes()) {
kprintf("%s: error: NUMA id %d is larger than mask size!\n",
__func__, bit);
return -EINVAL;
}
set_bit(bit, &vm->numa_mask[0]);
}
dkprintf("%s: numa_mem_policy: %d, numa_mask: %ld\n",
__func__, vm->numa_mem_policy, vm->numa_mask[0]);
}
proc->uti_thread_rank = pn->uti_thread_rank;
proc->uti_use_last_cpu = pn->uti_use_last_cpu;
proc->straight_map = pn->straight_map;
proc->straight_map_threshold = pn->straight_map_threshold;
#ifdef ENABLE_TOFU
proc->enable_tofu = pn->enable_tofu;
if (proc->enable_tofu) {
extern void tof_utofu_finalize(void);
tof_utofu_finalize();
}
#endif
#ifdef PROFILE_ENABLE
proc->profile = pn->profile;
thread->profile = pn->profile;
@ -766,12 +798,36 @@ out_remote_pf:
ret = 0;
break;
case SCD_MSG_CLEANUP_PROCESS:
case SCD_MSG_CLEANUP_PROCESS: {
extern int process_cleanup_before_terminate(int pid);
dkprintf("SCD_MSG_CLEANUP_PROCESS pid=%d, thread=0x%llx\n",
packet->pid, packet->arg);
pckt.msg = SCD_MSG_CLEANUP_PROCESS_RESP;
pckt.err = process_cleanup_before_terminate(packet->pid);
pckt.ref = packet->ref;
pckt.arg = packet->arg;
pckt.reply = packet->reply;
syscall_channel_send(resp_channel, &pckt);
terminate_host(packet->pid, (struct thread *)packet->arg);
ret = 0;
break;
}
case SCD_MSG_CLEANUP_FD: {
extern int process_cleanup_fd(int pid, int fd);
pckt.msg = SCD_MSG_CLEANUP_FD_RESP;
pckt.err = process_cleanup_fd(packet->pid, packet->arg);
dkprintf("SCD_MSG_CLEANUP_FD pid=%d, fd=%d -> err: %d\n",
packet->pid, packet->arg, pckt.err);
pckt.ref = packet->ref;
pckt.arg = packet->arg;
pckt.reply = packet->reply;
syscall_channel_send(resp_channel, &pckt);
ret = 0;
break;
}
case SCD_MSG_DEBUG_LOG:
dkprintf("SCD_MSG_DEBUG_LOG code=%lx\n", packet->arg);

View File

@ -20,10 +20,17 @@
* CPU Local Storage (cls)
*/
struct kmalloc_cache_header {
struct kmalloc_cache_header *next;
};
struct kmalloc_header {
unsigned int front_magic;
int cpu_id;
struct list_head list;
union {
struct list_head list;
struct kmalloc_cache_header *cache;
};
int size; /* The size of this chunk without the header */
unsigned int end_magic;
/* 32 bytes */
@ -79,6 +86,7 @@ struct cpu_local_var {
ihk_spinlock_t runq_lock;
unsigned long runq_irqstate;
struct thread *current;
void *kernel_mode_pf_regs;
int prevpid;
struct list_head runq;
size_t runq_len;

View File

@ -36,4 +36,98 @@ int memcheckall(void);
int freecheck(int runcount);
void kmalloc_consolidate_free_list(void);
#ifndef unlikely
/* Branch-prediction hint: the expression is expected to evaluate false */
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif
/*
* Generic lockless kmalloc cache.
*/
/*
 * kmalloc_cache_free - return a chunk to its lockless cache free list.
 *
 * The owning cache is recorded in the chunk's kmalloc_header (set by
 * kmalloc_cache_prealloc); a chunk without a recorded cache is rejected
 * with a warning rather than pushed onto an unknown list.
 * NULL is tolerated and ignored.
 */
static inline void kmalloc_cache_free(void *elem)
{
	struct kmalloc_header *hdr;
	struct kmalloc_cache_header *node;
	struct kmalloc_cache_header *head;
	struct kmalloc_cache_header *list;

	if (unlikely(!elem))
		return;

	/* The kmalloc header sits immediately in front of the payload */
	hdr = (struct kmalloc_header *)((void *)elem -
			sizeof(struct kmalloc_header));

	if (unlikely(!hdr->cache)) {
		kprintf("%s: WARNING: no cache for 0x%lx\n",
			__func__, elem);
		return;
	}

	list = hdr->cache;
	node = (struct kmalloc_cache_header *)elem;

	/* Lockless push: link to the current head, then CAS it in */
	do {
		head = list->next;
		node->next = head;
	} while (!__sync_bool_compare_and_swap(&list->next, head, node));
}
/*
 * kmalloc_cache_prealloc - populate an empty cache with nr_elem chunks
 * of the given size.
 *
 * Each chunk comes from the regular kmalloc allocator, is tagged with
 * its owning cache in the kmalloc_header, and is then pushed onto the
 * cache's free list via kmalloc_cache_free(). A cache that already
 * holds elements is left untouched. Individual allocation failures are
 * logged and skipped (best effort).
 */
static inline void kmalloc_cache_prealloc(struct kmalloc_cache_header *cache,
		size_t size, int nr_elem)
{
	int cnt;

	/* Already populated? Nothing to do. */
	if (unlikely(cache->next))
		return;

	for (cnt = 0; cnt < nr_elem; ++cnt) {
		struct kmalloc_header *hdr;
		void *chunk = kmalloc(size, IHK_MC_AP_NOWAIT);

		if (!chunk) {
			kprintf("%s: ERROR: allocating cache element\n", __func__);
			continue;
		}

		/* Record the owning cache so free() can find the list... */
		hdr = (struct kmalloc_header *)(chunk -
				sizeof(struct kmalloc_header));
		hdr->cache = cache;

		/* ...and hand the chunk over to the free list */
		kmalloc_cache_free(chunk);
	}
}
/*
 * kmalloc_cache_alloc - pop an element from the lockless cache free list.
 *
 * On an empty list the cache is (re)populated via kmalloc_cache_prealloc()
 * and the pop is retried.
 *
 * Returns the element, or NULL if the list is empty and pre-allocation
 * could not produce a single element (out of memory). The original code
 * looped forever in that case; callers must treat NULL as allocation
 * failure, matching kmalloc() semantics.
 */
static inline void *kmalloc_cache_alloc(struct kmalloc_cache_header *cache,
		size_t size)
{
	register struct kmalloc_cache_header *first, *next;

retry:
	next = NULL;
	first = cache->next;

	if (first) {
		next = first->next;
		/* Lockless pop: swing the head to the second element */
		if (!__sync_bool_compare_and_swap(&cache->next,
					first, next)) {
			goto retry;
		}
	}
	else {
		kprintf("%s: calling pre-alloc for 0x%lx...\n", __func__, cache);
		kmalloc_cache_prealloc(cache, size, 384);
		/*
		 * Bail out if pre-allocation obtained nothing (OOM) instead
		 * of spinning forever.
		 * NOTE(review): under contention concurrent allocators may
		 * drain the list between prealloc and this check, yielding a
		 * spurious NULL; acceptable since NULL means "try kmalloc /
		 * fail" either way.
		 */
		if (unlikely(!cache->next))
			return NULL;
		goto retry;
	}

	return (void *)first;
}
#endif

84
kernel/include/kref.h Normal file
View File

@ -0,0 +1,84 @@
/*
* kref.h - library routines for handling generic reference counted objects
* (based on Linux implementation)
*
* This file is released under the GPLv2.
*
*/
#ifndef _KREF_H_
#define _KREF_H_
#include <ihk/atomic.h>
#include <ihk/lock.h>
/*
* Bit 30 marks a kref as McKernel internal.
* This can be used to distinguish krefs from Linux and
* it also ensures that a non deallocated kref will not
* crash the Linux allocator.
*/
#define MCKERNEL_KREF_MARK (1U << 30)
struct kref {
ihk_atomic_t refcount;
};
#define KREF_INIT(n) { .refcount = IHK_ATOMIC_INIT(MCKERNEL_KREF_MARK + n), }
/**
* kref_init - initialize object.
* @kref: object in question.
*/
static inline void kref_init(struct kref *kref)
{
	/* Start at refcount 1 with the McKernel marker bit (bit 30) set;
	 * the marker stays set for the kref's entire lifetime */
	ihk_atomic_set(&kref->refcount, MCKERNEL_KREF_MARK + 1);
}
/* kref_read - current reference count with the McKernel marker bit stripped */
static inline unsigned int kref_read(const struct kref *kref)
{
	unsigned int raw = ihk_atomic_read(&kref->refcount);

	return raw & ~MCKERNEL_KREF_MARK;
}
/* kref_is_mckernel - nonzero iff this kref carries the McKernel marker bit */
static inline unsigned int kref_is_mckernel(const struct kref *kref)
{
	unsigned int raw = ihk_atomic_read(&kref->refcount);

	return raw & MCKERNEL_KREF_MARK;
}
/**
* kref_get - increment refcount for object.
* @kref: object.
*/
static inline void kref_get(struct kref *kref)
{
	/* Plain atomic increment; the marker bit in the count is unaffected */
	ihk_atomic_inc(&kref->refcount);
}
/**
* kref_put - decrement refcount for object.
* @kref: object.
* @release: pointer to the function that will clean up the object when the
* last reference to the object is released.
* This pointer is required, and it is not acceptable to pass kfree
* in as this function. If the caller does pass kfree to this
* function, you will be publicly mocked mercilessly by the kref
* maintainer, and anyone else who happens to notice it. You have
* been warned.
*
* Decrement the refcount, and if 0, call release().
* Return 1 if the object was removed, otherwise return 0. Beware, if this
* function returns 0, you still can not count on the kref from remaining in
* memory. Only use the return value if you want to see if the kref is now
* gone, not present.
*/
static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref))
{
	/*
	 * Unlike Linux's ihk_atomic_dec_and_test()-style kref_put, the
	 * count bottoms out at MCKERNEL_KREF_MARK (bit 30), not 0: the
	 * marker bit stays set for the kref's whole lifetime so Linux
	 * code can distinguish McKernel krefs (see MCKERNEL_KREF_MARK).
	 */
	if (ihk_atomic_sub_return(1, &kref->refcount) == MCKERNEL_KREF_MARK) {
		release(kref);
		return 1;
	}
	return 0;
}
#endif /* _KREF_H_ */

View File

@ -79,4 +79,14 @@
extern int sysctl_overcommit_memory;
/*
* This looks more complex than it should be. But we need to
* get the type for the ~ right in round_down (it needs to be
* as wide as the result!), and we want to evaluate the macro
* arguments just once each.
*/
#define __round_mask(x, y) ((__typeof__(x))((y)-1))
#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
#define round_down(x, y) ((x) & ~__round_mask(x, y))
#endif /* HEADER_MMAN_H */

View File

@ -390,6 +390,7 @@ struct vm_range {
struct rb_node vm_rb_node;
unsigned long start, end;
unsigned long flag;
unsigned long straight_start;
struct memobj *memobj;
off_t objoff;
int pgshift; /* page size. 0 means THP */
@ -558,11 +559,19 @@ struct process {
size_t mpol_threshold;
unsigned long heap_extension;
unsigned long mpol_bind_mask;
int mpol_mode;
int uti_thread_rank; /* Spawn on Linux CPU when clone_count reaches this */
int uti_use_last_cpu; /* Work-around not to share CPU with OpenMP thread */
int clone_count;
int thp_disable;
int straight_map;
#ifdef ENABLE_TOFU
int enable_tofu;
#endif
size_t straight_map_threshold;
// perf_event
int perf_status;
#define PP_NONE 0
@ -578,8 +587,18 @@ struct process {
#endif // PROFILE_ENABLE
int nr_processes; /* For partitioned execution */
int process_rank; /* Rank in partition */
void *straight_va;
size_t straight_len;
unsigned long straight_pa;
int coredump_barrier_count, coredump_barrier_count2;
mcs_rwlock_lock_t coredump_lock; // lock for coredump
#ifdef ENABLE_TOFU
#define MAX_FD_PDE 1024
void *fd_pde_data[MAX_FD_PDE];
char *fd_path[MAX_FD_PDE];
#endif
};
/*
@ -737,6 +756,11 @@ struct thread {
void *coredump_regs;
struct waitq coredump_wq;
int coredump_status;
#ifdef ENABLE_TOFU
/* Path of file being opened */
char *fd_path_in_open;
#endif
};
#define VM_RANGE_CACHE_SIZE 4

View File

@ -40,14 +40,27 @@ enum profile_event_type {
PROFILE_remote_page_fault,
PROFILE_mpol_alloc_missed,
PROFILE_mmap_anon_contig_phys,
PROFILE_mmap_anon_straight,
PROFILE_mmap_anon_not_straight,
PROFILE_mmap_anon_no_contig_phys,
PROFILE_mmap_regular_file,
PROFILE_mmap_device_file,
PROFILE_tofu_stag_alloc,
PROFILE_tofu_stag_alloc_new_steering,
PROFILE_tofu_stag_alloc_new_steering_alloc_mbpt,
PROFILE_tofu_stag_alloc_new_steering_update_mbpt,
PROFILE_tofu_stag_free_stags,
PROFILE_tofu_stag_free_stag,
PROFILE_tofu_stag_free_stag_pre,
PROFILE_tofu_stag_free_stag_cqflush,
PROFILE_tofu_stag_free_stag_dealloc,
PROFILE_tofu_stag_free_stag_dealloc_free_pages,
PROFILE_EVENT_MAX /* Should be the last event type */
};
#define __NR_profile PROFILE_EVENT_MAX
#ifdef __KERNEL__
struct thread;
struct process;
@ -61,6 +74,75 @@ int profile_accumulate_and_print_job_events(struct process *proc);
int profile_alloc_events(struct thread *thread);
void profile_dealloc_thread_events(struct thread *thread);
void profile_dealloc_proc_events(struct process *proc);
#else // User space interface
#include <unistd.h>
#include <sys/syscall.h>
/* Per-thread */
static inline void mckernel_profile_thread_on(void)
{
syscall(__NR_profile, PROF_ON);
}
static inline void mckernel_profile_thread_off(void)
{
syscall(__NR_profile, PROF_OFF);
}
static inline void mckernel_profile_thread_print(void)
{
syscall(__NR_profile, PROF_PRINT);
}
static inline void mckernel_profile_thread_print_off(void)
{
syscall(__NR_profile, PROF_OFF | PROF_PRINT);
}
/* Per-process */
static inline void mckernel_profile_process_on(void)
{
syscall(__NR_profile, PROF_PROC | PROF_ON);
}
static inline void mckernel_profile_process_off(void)
{
syscall(__NR_profile, PROF_PROC | PROF_OFF);
}
static inline void mckernel_profile_process_print(void)
{
syscall(__NR_profile, PROF_PROC | PROF_PRINT);
}
static inline void mckernel_profile_process_print_off(void)
{
syscall(__NR_profile, PROF_PROC | PROF_OFF | PROF_PRINT);
}
/* Per-job */
static inline void mckernel_profile_job_on(void)
{
syscall(__NR_profile, PROF_JOB | PROF_ON);
}
static inline void mckernel_profile_job_off(void)
{
syscall(__NR_profile, PROF_JOB | PROF_OFF);
}
static inline void mckernel_profile_job_print(void)
{
syscall(__NR_profile, PROF_JOB | PROF_PRINT);
}
static inline void mckernel_profile_job_print_off(void)
{
syscall(__NR_profile, PROF_JOB | PROF_OFF | PROF_PRINT);
}
#endif // __KERNEL__
#endif // PROFILE_ENABLE

View File

@ -108,4 +108,6 @@ static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
typeof(*pos), field); 1; }); \
pos = n)
struct rb_node *rb_preorder_dfs_search(const struct rb_root *root,
bool (*__cond)(struct rb_node *, void *arg), void *__cond_arg);
#endif /* _LINUX_RBTREE_H */

View File

@ -39,7 +39,8 @@
#define SCD_MSG_SEND_SIGNAL 0x7
#define SCD_MSG_SEND_SIGNAL_ACK 0x8
#define SCD_MSG_CLEANUP_PROCESS 0x9
#define SCD_MSG_GET_VDSO_INFO 0xa
#define SCD_MSG_CLEANUP_PROCESS_RESP 0xa
#define SCD_MSG_GET_VDSO_INFO 0xb
#define SCD_MSG_GET_CPU_MAPPING 0xc
#define SCD_MSG_REPLY_GET_CPU_MAPPING 0xd
@ -84,6 +85,8 @@
#define SCD_MSG_CPU_RW_REG 0x52
#define SCD_MSG_CPU_RW_REG_RESP 0x53
#define SCD_MSG_CLEANUP_FD 0x54
#define SCD_MSG_CLEANUP_FD_RESP 0x55
#define SCD_MSG_FUTEX_WAKE 0x60
@ -180,6 +183,18 @@ typedef unsigned long __cpu_set_unit;
#define MPOL_NO_BSS 0x04
#define MPOL_SHM_PREMAP 0x08
/* should be the same as process.h */
#define PLD_PROCESS_NUMA_MASK_BITS 256
enum {
PLD_MPOL_DEFAULT,
PLD_MPOL_PREFERRED,
PLD_MPOL_BIND,
PLD_MPOL_INTERLEAVE,
PLD_MPOL_LOCAL,
PLD_MPOL_MAX, /* always last member of enum */
};
#define PLD_MAGIC 0xcafecafe44332211UL
struct program_load_desc {
@ -214,9 +229,18 @@ struct program_load_desc {
unsigned long heap_extension;
long stack_premap;
unsigned long mpol_bind_mask;
int mpol_mode;
unsigned long mpol_nodemask[PLD_PROCESS_NUMA_MASK_BITS /
(sizeof(unsigned long) * 8)];
int thp_disable;
int uti_thread_rank; /* N-th clone() spawns a thread on Linux CPU */
int uti_use_last_cpu; /* Work-around not to share CPU with OpenMP thread */
int straight_map;
size_t straight_map_threshold;
#ifdef ENABLE_TOFU
int enable_tofu;
#endif
int nr_processes;
int process_rank;
__cpu_set_unit cpu_set[PLD_CPU_SET_SIZE];
@ -327,6 +351,7 @@ struct syscall_response {
unsigned long req_thread_status;
long ret;
unsigned long fault_address;
void *pde_data;
};
struct syscall_post {

View File

@ -0,0 +1,36 @@
#!/bin/bash
#
# Generate the tofu_generated-*.h structure-layout headers for McKernel
# by running dwarf-extract-struct on the Tofu kernel modules that carry
# debug symbols.
#
# DWARF_TOOL may be overridden from the environment; it defaults to the
# historical in-tree build location.

SCRIPT="$(readlink -f "${BASH_SOURCE[0]:-}")"
SCRIPT_DIR="$(dirname "${SCRIPT}")"
CURRENT_DIR="$(pwd)"
cd "${SCRIPT_DIR}" || exit 1

DWARF_TOOL="${DWARF_TOOL:-$HOME/src/mckernel-apollo+a64fx/mckernel/tools/dwarf-extract-struct/dwarf-extract-struct}"
TARBALL="/lib/modules/$(uname -r)+debug/extra/tof_module.tar.gz"

# unpack_module <module.ko> - extract one module from the debug tarball;
# bails out of the whole script on failure.
unpack_module()
{
	# NOTE(review): the original used "2>&1 > /dev/null", which leaves
	# stderr on the terminal; "> /dev/null 2>&1" silences both streams.
	if ! tar zxvf "${TARBALL}" "$1" > /dev/null 2>&1; then
		echo "error: uncompressing kernel module with debug symbols"
		cd "${CURRENT_DIR}"
		exit 1
	fi
}

KMODULE=tof_utofu.ko
unpack_module "${KMODULE}"
"${DWARF_TOOL}" ${KMODULE} tof_utofu_device enabled subnet gpid > tofu_generated-tof_utofu_device.h
"${DWARF_TOOL}" ${KMODULE} tof_utofu_cq common tni cqid trans steering mb num_stag | sed "s/struct FILL_IN_MANUALLY trans;/#include \"tof_utofu_cq_trans.h\"/g" > tofu_generated-tof_utofu_cq.h
"${DWARF_TOOL}" ${KMODULE} tof_utofu_mbpt ucq iova sg nsgents mbptstart pgsz kref > tofu_generated-tof_utofu_mbpt.h
"${DWARF_TOOL}" ${KMODULE} tof_utofu_bg common tni bgid bch | sed "s/struct FILL_IN_MANUALLY bch;/#include \"tof_utofu_bg_bch.h\"/g" > tofu_generated-tof_utofu_bg.h
rm ${KMODULE}

KMODULE=tof_core.ko
unpack_module "${KMODULE}"
"${DWARF_TOOL}" ${KMODULE} tof_core_cq reg | sed "s/struct FILL_IN_MANUALLY reg;/#include \"tof_core_cq_reg.h\"/g" > tofu_generated-tof_core_cq.h
"${DWARF_TOOL}" ${KMODULE} tof_core_bg lock reg irq subnet gpid sighandler | sed "s/struct FILL_IN_MANUALLY reg;/#include \"tof_core_bg_reg.h\"/g" > tofu_generated-tof_core_bg.h
rm ${KMODULE}

cd "${CURRENT_DIR}"

View File

@ -0,0 +1,4 @@
struct {
void *bgs;
void *bch;
} reg;

View File

@ -0,0 +1,4 @@
struct {
void *cq;
void *cqs;
} reg;

View File

@ -0,0 +1,836 @@
#ifndef _TOF_ICC_H_
#define _TOF_ICC_H_
#include <types.h>
#include <bitops.h>
typedef uint64_t phys_addr_t;
/* @ref.impl include/linux/bitops.h */
/*
* Create a contiguous bitmask starting at bit position @l and ending at
* position @h. For example
* GENMASK(39, 21) gives us the 64bit vector 0x000000ffffe00000.
*/
#define GENMASK(h, l) \
(((~0UL) << (l)) & (~0UL >> (BITS_PER_LONG - 1 - (h))))
/* constants related to the Tofu Interconnect D */
#define TOF_ICC_NTNIS 6
#define TOF_ICC_NCQS 12
#define TOF_ICC_NBGS 48
#define TOF_ICC_NBCHS 16
#define TOF_ICC_NPORTS 10
#define TOF_ICC_NVMSIDS 16
#define TOF_ICC_RH_LEN 8
#define TOF_ICC_ECRC_LEN 4
#define TOF_ICC_FRAME_ALIGN 32
#define TOF_ICC_TLP_LEN(len) (((len) + 1) * TOF_ICC_FRAME_ALIGN)
#define TOF_ICC_TLP_PAYLOAD_MAX (TOF_ICC_TLP_LEN(61) - TOF_ICC_ECRC_LEN)
#define TOF_ICC_FRAME_LEN(len) (TOF_ICC_RH_LEN + TOF_ICC_TLP_LEN(len))
#define TOF_ICC_FRAME_LEN_MIN TOF_ICC_FRAME_LEN(2)
#define TOF_ICC_FRAME_LEN_MAX TOF_ICC_FRAME_LEN(61)
#define TOF_ICC_FRAME_BUF_SIZE_BITS 11
#define TOF_ICC_FRAME_BUF_SIZE (1 << TOF_ICC_FRAME_BUF_SIZE_BITS)
#define TOF_ICC_FRAME_BUF_ALIGN_BITS 8
#define TOF_ICC_FRAME_BUF_ALIGN (1 << TOF_ICC_FRAME_BUF_ALIGN_BITS)
#define TOF_ICC_PB_SIZE_BITS 11
#define TOF_ICC_PB_SIZE (1 << TOF_ICC_PB_SIZE_BITS)
#define TOF_ICC_PB_ALIGN_BITS 11
#define TOF_ICC_PB_ALIGN (1 << TOF_ICC_PB_ALIGN_BITS)
#define TOF_ICC_ST_ALIGN_BITS 8
#define TOF_ICC_ST_ALIGN (1 << TOF_ICC_ST_ALIGN_BITS)
#define TOF_ICC_MBT_ALIGN_BITS 8
#define TOF_ICC_MBT_ALIGN (1 << TOF_ICC_MBT_ALIGN_BITS)
#define TOF_ICC_MBPT_ALIGN_BITS 8
#define TOF_ICC_MBPT_ALIGN (1 << TOF_ICC_MBPT_ALIGN_BITS)
#define TOF_ICC_BG_BSEQ_SIZE_BITS 24
#define TOF_ICC_BG_BSEQ_SIZE (1 << TOF_ICC_BG_BSEQ_SIZE_BITS)
#define TOF_ICC_BCH_DMA_ALIGN_BITS 8
#define TOF_ICC_BCH_DMA_ALIGN (1 << TOF_ICC_BCH_DMA_ALIGN_BITS)
/* this is a CPU-specific constant, but referred in the ICC spec. */
#define TOF_ICC_CACHE_LINE_SIZE_BITS 8
#define TOF_ICC_CACHE_LINE_SIZE (1 << TOF_ICC_CACHE_LINE_SIZE_BITS)
#define TOF_ICC_TOQ_DESC_SIZE_BITS 5
#define TOF_ICC_TOQ_DESC_SIZE (1 << TOF_ICC_TOQ_DESC_SIZE_BITS)
#define TOF_ICC_TCQ_DESC_SIZE_BITS 3
#define TOF_ICC_TCQ_DESC_SIZE (1 << TOF_ICC_TCQ_DESC_SIZE_BITS)
#define TOF_ICC_TCQ_NLINE_BITS (TOF_ICC_CACHE_LINE_SIZE_BITS - TOF_ICC_TCQ_DESC_SIZE_BITS)
#define TOF_ICC_MRQ_DESC_SIZE_BITS 5
#define TOF_ICC_MRQ_DESC_SIZE (1 << TOF_ICC_MRQ_DESC_SIZE_BITS)
#define TOF_ICC_PBQ_DESC_SIZE_BITS 3
#define TOF_ICC_PBQ_DESC_SIZE (1 << TOF_ICC_PBQ_DESC_SIZE_BITS)
#define TOF_ICC_PRQ_DESC_SIZE_BITS 3
#define TOF_ICC_PRQ_DESC_SIZE (1 << TOF_ICC_PRQ_DESC_SIZE_BITS)
#define TOF_ICC_PRQ_NLINE_BITS (TOF_ICC_CACHE_LINE_SIZE_BITS - TOF_ICC_PBQ_DESC_SIZE_BITS)
#define TOF_ICC_TOQ_SIZE_NTYPES 6
#define TOF_ICC_TOQ_SIZE_BITS(size) ((size) * 2 + 11)
#define TOF_ICC_TOQ_SIZE(size) (1 << TOF_ICC_TOQ_SIZE_BITS(size))
#define TOF_ICC_TOQ_LEN(size) (TOF_ICC_TOQ_SIZE(size) * TOF_ICC_TOQ_DESC_SIZE)
#define TOF_ICC_TCQ_LEN(size) (TOF_ICC_TOQ_SIZE(size) * TOF_ICC_TCQ_DESC_SIZE)
#define TOF_ICC_MRQ_SIZE_NTYPES 6
#define TOF_ICC_MRQ_SIZE_BITS(size) ((size) * 2 + 11)
#define TOF_ICC_MRQ_SIZE(size) (1 << TOF_ICC_MRQ_SIZE_BITS(size))
#define TOF_ICC_MRQ_LEN(size) (TOF_ICC_MRQ_SIZE(size) * TOF_ICC_MRQ_DESC_SIZE)
#define TOF_ICC_PBQ_SIZE_NTYPES 6
#define TOF_ICC_PBQ_SIZE_BITS(size) ((size) * 2 + 11)
#define TOF_ICC_PBQ_SIZE(size) (1 << TOF_ICC_PBQ_SIZE_BITS(size))
#define TOF_ICC_PBQ_LEN(size) (TOF_ICC_PBQ_SIZE(size) * TOF_ICC_PBQ_DESC_SIZE)
#define TOF_ICC_PRQ_SIZE_NTYPES 6
#define TOF_ICC_PRQ_SIZE_BITS(size) ((size) * 2 + 11)
#define TOF_ICC_PRQ_SIZE(size) (1 << TOF_ICC_PRQ_SIZE_BITS(size))
#define TOF_ICC_PRQ_LEN(size) (TOF_ICC_PRQ_SIZE(size) * TOF_ICC_PRQ_DESC_SIZE)
#define TOF_ICC_STEERING_TABLE_ALIGN_BITS 8
#define TOF_ICC_STEERING_TABLE_ALIGN (1 << TOF_ICC_STEERING_TABLE_ALIGN_BITS)
#define TOF_ICC_STEERING_SIZE_BITS 4
#define TOF_ICC_STEERING_SIZE (1 << TOF_ICC_STEERING_SIZE_BITS)
#define TOF_ICC_MB_TABLE_ALIGN_BITS 8
#define TOF_ICC_MB_TABLE_ALIGN (1 << TOF_ICC_MB_TABLE_ALIGN_BITS)
#define TOF_ICC_MB_SIZE_BITS 4
#define TOF_ICC_MB_SIZE (1 << TOF_ICC_MB_SIZE_BITS)
#define TOF_ICC_MB_PS_ENCODE(bits) ((bits) % 9 == 3 ? (bits) / 9 - 1 : (bits) / 13 + 3)
#define TOF_ICC_MBPT_ALIGN_BITS 8
#define TOF_ICC_MBPT_ALIGN (1 << TOF_ICC_MBPT_ALIGN_BITS)
#define TOF_ICC_MBPT_SIZE_BITS 3
#define TOF_ICC_MBPT_SIZE (1 << TOF_ICC_MBPT_SIZE_BITS)
#define TOF_ICC_X_BITS 5
#define TOF_ICC_Y_BITS 5
#define TOF_ICC_Z_BITS 5
#define TOF_ICC_A_BITS 1
#define TOF_ICC_B_BITS 2
#define TOF_ICC_C_BITS 1
#define TOF_ICC_MAX_X_SIZE (1 << TOF_ICC_X_BITS)
#define TOF_ICC_MAX_Y_SIZE (1 << TOF_ICC_Y_BITS)
#define TOF_ICC_MAX_Z_SIZE (1 << TOF_ICC_Z_BITS)
#define TOF_ICC_A_SIZE 2
#define TOF_ICC_B_SIZE 3
#define TOF_ICC_C_SIZE 2
#define TOF_ICC_X_MASK ((1 << TOF_ICC_X_BITS) - 1)
#define TOF_ICC_Y_MASK ((1 << TOF_ICC_Y_BITS) - 1)
#define TOF_ICC_Z_MASK ((1 << TOF_ICC_Z_BITS) - 1)
#define TOF_ICC_A_MASK ((1 << TOF_ICC_A_BITS) - 1)
#define TOF_ICC_B_MASK ((1 << TOF_ICC_B_BITS) - 1)
#define TOF_ICC_C_MASK ((1 << TOF_ICC_C_BITS) - 1)
#define TOF_ICC_ABC_SIZE (TOF_ICC_A_SIZE * TOF_ICC_B_SIZE * TOF_ICC_C_SIZE)
/*
 * tof_icc_get_framelen - on-wire frame length for a payload of len bytes:
 * routing header plus the TLP (payload + ECRC) rounded up to the frame
 * alignment, clamped from below to the minimum frame length.
 */
static inline int tof_icc_get_framelen(int len)
{
	int framelen = TOF_ICC_RH_LEN +
		round_up(len + TOF_ICC_ECRC_LEN, TOF_ICC_FRAME_ALIGN);

	return (framelen < TOF_ICC_FRAME_LEN_MIN) ?
		TOF_ICC_FRAME_LEN_MIN : framelen;
}
/** Descriptors **/
/** commands and rcodes **/
enum {
TOF_ICC_TOQ_NOP,
TOF_ICC_TOQ_PUT,
TOF_ICC_TOQ_WRITE_PIGGYBACK_BUFFER,
TOF_ICC_TOQ_PUT_PIGGYBACK,
TOF_ICC_TOQ_GET,
TOF_ICC_TOQ_GETL,
TOF_ICC_TOQ_ATOMIC_READ_MODIFY_WRITE = 0xe,
TOF_ICC_TOQ_TRANSMIT_RAW_PACKET1 = 0x10,
TOF_ICC_TOQ_TRANSMIT_RAW_PACKET2,
TOF_ICC_TOQ_TRANSMIT_SYSTEM_PACKET1,
TOF_ICC_TOQ_TRANSMIT_SYSTEM_PACKET2,
TOF_ICC_TOQ_NCOMMANDS,
};
enum {
TOF_ICC_MRQ_ATOMIC_READ_MODIFY_WRITE_HALFWAY_NOTICE = 0x1,
TOF_ICC_MRQ_ATOMIC_READ_MODIFY_WRITE_NOTICE,
TOF_ICC_MRQ_ATOMIC_READ_MODIFY_WRITE_REMOTE_ERROR,
TOF_ICC_MRQ_PUT_HALFWAY_NOTICE,
TOF_ICC_MRQ_PUT_LAST_HALFWAY_NOTICE,
TOF_ICC_MRQ_GET_HALFWAY_NOTICE,
TOF_ICC_MRQ_GET_LAST_HALFWAY_NOTICE,
TOF_ICC_MRQ_PUT_NOTICE,
TOF_ICC_MRQ_PUT_LAST_NOTICE,
TOF_ICC_MRQ_GET_NOTICE,
TOF_ICC_MRQ_GET_LAST_NOTICE,
TOF_ICC_MRQ_PUT_REMOTE_ERROR,
TOF_ICC_MRQ_PUT_LAST_REMOTE_ERROR,
TOF_ICC_MRQ_GET_REMOTE_ERROR,
TOF_ICC_MRQ_GET_LAST_REMOTE_ERROR,
TOF_ICC_MRQ_NCOMMANDS,
};
enum {
TOF_ICC_PRQ_UNKNOWN_TLP,
TOF_ICC_PRQ_SYSTEM_TLP,
TOF_ICC_PRQ_ADDRESS_RANGE_EXCEPTION = 0x6,
TOF_ICC_PRQ_CQ_EXCEPTION = 0x8,
TOF_ICC_PRQ_ILLEGAL_TLP_FLAGS,
TOF_ICC_PRQ_ILLEGAL_TLP_LENGTH,
TOF_ICC_PRQ_CQ_ERROR = 0xc,
};
/** structures **/
/*
 * Steering-table entry. Bitfields total exactly 64 bits; the trailing
 * "length" word is software-only (marked "for optimization").
 * NOTE(review): field semantics inferred from names (mbva/mbid) — verify
 * against the Tofu ICC specification.
 */
struct tof_icc_steering_entry {
uint64_t res1:6;
uint64_t readonly:1;
uint64_t enable:1;
uint64_t mbva:32;
uint64_t res2:8;
uint64_t mbid:16;
uint64_t length; /* for optimization */
};
/*
 * Memory-block table entry. Bitfields total exactly 64 bits; "npage" is
 * a software-only extension (marked "for optimization").
 */
struct tof_icc_mb_entry {
uint64_t ps:3;
uint64_t res1:4;
uint64_t enable:1;
uint64_t ipa:32;
uint64_t res2:24;
uint64_t npage; /* for optimization */
};
/* Memory-block page table entry; bitfields total exactly 64 bits. */
struct tof_icc_mbpt_entry {
uint64_t res1:7;
uint64_t enable:1;
uint64_t res2:4;
uint64_t ipa:28;
uint64_t res3:24;
};
/* Packed (CQ id, STag, offset) triple; 40 + 18 + 6 = 64 bits. */
struct tof_icc_cq_stag_offset {
uint64_t offset:40;
uint64_t stag:18;
uint64_t cqid:6;
};
struct tof_icc_toq_common_header1 {
uint8_t interrupt:1;
uint8_t res1:4;
uint8_t source_type:2;
uint8_t flip:1;
uint8_t command;
union {
uint8_t mtu;
struct {
uint8_t res:4;
uint8_t op:4;
} armw;
} mtuop;
uint8_t sps:4;
uint8_t pa:1;
uint8_t pb:2;
uint8_t pc:1;
uint8_t rx;
uint8_t ry;
uint8_t rz;
uint8_t ra:1;
uint8_t rb:2;
uint8_t rc:1;
uint8_t res3:1;
uint8_t ri:3;
};
struct tof_icc_toq_common_header2 {
uint8_t gap;
uint8_t s:1;
uint8_t r:1;
uint8_t q:1;
uint8_t p:1;
uint8_t res1:1;
uint8_t j:1;
uint8_t res2:2;
uint16_t edata;
union{
struct {
uint32_t length:24;
uint32_t res:8;
} normal;
struct {
uint32_t length:6;
uint32_t res:26;
} piggyback;
} len;
};
struct tof_icc_toq_descriptor {
struct tof_icc_toq_common_header1 head1;
uint64_t res[3];
};
struct tof_icc_toq_nop {
struct tof_icc_toq_common_header1 head1;
uint64_t res[3];
};
struct tof_icc_toq_put {
struct tof_icc_toq_common_header1 head1;
struct tof_icc_toq_common_header2 head2;
struct tof_icc_cq_stag_offset remote;
struct tof_icc_cq_stag_offset local;
};
struct tof_icc_toq_write_piggyback_buffer {
struct tof_icc_toq_common_header1 head1;
uint64_t data[3];
};
struct tof_icc_toq_put_piggyback {
struct tof_icc_toq_common_header1 head1;
struct tof_icc_toq_common_header2 head2;
struct tof_icc_cq_stag_offset remote;
uint64_t data;
};
struct tof_icc_toq_get {
struct tof_icc_toq_common_header1 head1;
struct tof_icc_toq_common_header2 head2;
struct tof_icc_cq_stag_offset remote;
struct tof_icc_cq_stag_offset local;
};
struct tof_icc_toq_atomic_read_modify_write {
struct tof_icc_toq_common_header1 head1;
struct tof_icc_toq_common_header2 head2;
struct tof_icc_cq_stag_offset remote;
uint64_t data;
};
struct tof_icc_toq_transmit_raw_packet1 {
struct tof_icc_toq_common_header1 head1;
uint8_t gap;
uint8_t res4[3];
uint32_t length:12;
uint32_t res5:20;
uint64_t res6;
uint64_t pa:48; /* for optimization */
uint64_t res7:16;
};
struct tof_icc_toq_transmit_raw_packet2 {
uint8_t interrupt:1;
uint8_t res1:4;
uint8_t source_type:2;
uint8_t flip:1;
uint8_t command;
uint8_t res2:7;
uint8_t e:1;
uint8_t res3[4];
uint8_t port:5;
uint8_t res4:1;
uint8_t vc:2;
uint8_t gap;
uint8_t res5[3];
uint32_t length:12;
uint32_t res6:20;
uint64_t res7;
uint64_t pa:48; /* for optimization */
uint64_t res8:16;
};
struct tof_icc_toq_transmit_system_packet {
struct tof_icc_toq_common_header1 head1; /* rx, ry, rz should be rdx, rdy, rdz */
uint8_t gap;
uint8_t res4[3];
uint32_t length:12;
uint32_t res5:20;
uint64_t res6;
uint64_t pa:48; /* for optimization */
uint64_t res7:16;
};
struct tof_icc_tcq_descriptor {
uint8_t res1:5;
uint8_t counter_unmatch:1;
uint8_t res2:1;
uint8_t flip:1;
uint8_t rcode;
uint8_t res3[2];
union{
struct {
uint32_t length:24;
uint32_t res:8;
} normal;
struct {
uint32_t length:6;
uint32_t res:26;
} piggyback;
} len;
};
struct tof_icc_mrq_common_header1 {
uint8_t res1:7;
uint8_t flip:1;
uint8_t id;
uint8_t rcode;
uint8_t res2:4;
uint8_t pa:1;
uint8_t pb:2;
uint8_t pc:1;
uint8_t x;
uint8_t y;
uint8_t z;
uint8_t a:1;
uint8_t b:2;
uint8_t c:1;
uint8_t res3:1;
uint8_t i:3;
};
struct tof_icc_mrq_common_header2 {
uint8_t res1;
uint8_t res2:4;
uint8_t initial:1;
uint8_t res3:3;
uint16_t edata;
union {
struct {
uint32_t length:11;
uint32_t res:21;
} normal;
struct {
uint32_t op:4;
uint32_t res:28;
} armw;
} lenop;
};
struct tof_icc_mrq_atomic_read_modify_write_halfway_notice {
struct tof_icc_mrq_common_header1 head1;
struct tof_icc_mrq_common_header2 head2;
struct tof_icc_cq_stag_offset local;
struct tof_icc_cq_stag_offset remote;
};
struct tof_icc_mrq_descriptor {
struct tof_icc_mrq_common_header1 head1;
struct tof_icc_mrq_common_header2 head2;
struct tof_icc_cq_stag_offset cso1;
struct tof_icc_cq_stag_offset cso2;
};
struct tof_icc_pbq_descriptor {
uint64_t res1:7;
uint64_t f:1;
uint64_t res2:3;
uint64_t pa:29;
uint64_t res3:24;
};
struct tof_icc_prq_descriptor {
uint64_t rcode:7;
uint64_t f:1;
uint64_t res1:3;
uint64_t pa:29;
uint64_t res2:8;
uint64_t w:1;
uint64_t res3:5;
uint64_t l:1;
uint64_t e:1;
uint64_t res4:8;
};
/** Registers **/
/* useful packed structures */
/*
 * Packed subnet register layout: an (l, s, n) triple of 6-bit fields for
 * each of the z, y, x axes plus 10 reserved bits (9 * 6 + 10 = 64 bits).
 */
struct tof_icc_reg_subnet {
uint64_t lz:6;
uint64_t sz:6;
uint64_t nz:6;
uint64_t ly:6;
uint64_t sy:6;
uint64_t ny:6;
uint64_t lx:6;
uint64_t sx:6;
uint64_t nx:6;
uint64_t res:10;
};
/*
 * Packed barrier-gate address: BG id, TNI and the 6D Tofu coordinate
 * (x, y, z, a, b, c) with p* port bits; fields total exactly 32 bits.
 */
struct tof_icc_reg_bg_address {
uint32_t bgid:6;
uint32_t tni:3;
uint32_t c:1;
uint32_t b:2;
uint32_t a:1;
uint32_t z:5;
uint32_t y:5;
uint32_t x:5;
uint32_t pc:1;
uint32_t pb:2;
uint32_t pa:1;
};
/* relative offset of interrupt controller registers */
#define TOF_ICC_IRQREG_IRR 0x0
#define TOF_ICC_IRQREG_IMR 0x8
#define TOF_ICC_IRQREG_IRC 0x10
#define TOF_ICC_IRQREG_IMC 0x18
#define TOF_ICC_IRQREG_ICL 0x20
/* TOFU REGISTERS */
#define tof_icc_reg_pa 0x40000000
/* CQ */
#define TOF_ICC_REG_CQ_PA(tni, cqid) (tof_icc_reg_pa + 0 + (tni) * 0x1000000 + (cqid) * 0x10000)
#define TOF_ICC_REG_CQ_TOQ_DIRECT_DESCRIPTOR 0x0
#define TOF_ICC_REG_CQ_TOQ_FETCH_START 0x40
#define TOF_ICC_REG_CQ_MRQ_FULL_POINTER 0x48
#define TOF_ICC_REG_CQ_TOQ_PIGGYBACK_BUFFER0 0x50
#define TOF_ICC_REG_CQ_TOQ_PIGGYBACK_BUFFER1 0x58
#define TOF_ICC_REG_CQ_TOQ_PIGGYBACK_BUFFER2 0x60
#define TOF_ICC_REG_CQ_TCQ_NUM_NOTICE 0x68
#define TOF_ICC_REG_CQ_MRQ_NUM_NOTICE 0x70
#define TOF_ICC_REG_CQ_TX_PAYLOAD_BYTE 0x78
#define TOF_ICC_REG_CQ_RX_PAYLOAD_BYTE 0x80
#define TOF_ICC_REG_CQ_DUMP_START 0x0
#define TOF_ICC_REG_CQ_DUMP_END 0x88
/* BCH */
#define TOF_ICC_REG_BCH_PA(tni, bgid) (tof_icc_reg_pa + 0x0000e00000 + (tni) * 0x1000000 + (bgid) * 0x10000)
#define TOF_ICC_REG_BCH_IDATA 0x800
#define TOF_ICC_REG_BCH_READY 0x840
#define TOF_ICC_REG_BCH_READY_STATE BIT(63)
#define TOF_ICC_REG_BCH_IGNORED_SIGNAL_COUNT 0x848
#define TOF_ICC_REG_BCH_DUMP_START 0x800
#define TOF_ICC_REG_BCH_DUMP_END 0x850
/* CQS */
#define TOF_ICC_REG_CQS_PA(tni, cqid) (tof_icc_reg_pa + 0x0000400000 + (tni) * 0x1000000 + (cqid) * 0x10000)
#define TOF_ICC_REG_CQS_STATUS 0x0
#define TOF_ICC_REG_CQS_STATUS_DESCRIPTOR_PROCESS_STOP BIT(63)
#define TOF_ICC_REG_CQS_STATUS_DESCRIPTOR_FETCH_STOP BIT(62)
#define TOF_ICC_REG_CQS_STATUS_BLANK_ENTRY_FLIP_BIT BIT(61)
#define TOF_ICC_REG_CQS_STATUS_CACHE_FLUSH_BUSY BIT(60)
#define TOF_ICC_REG_CQS_STATUS_CQ_ENABLE BIT(59)
#define TOF_ICC_REG_CQS_STATUS_SESSION_DEAD BIT(58)
#define TOF_ICC_REG_CQS_STATUS_SESSION_OFFSET_OVERFLOW BIT(57)
#define TOF_ICC_REG_CQS_STATUS_SESSION_OFFSET GENMASK(56, 32)
#define TOF_ICC_REG_CQS_STATUS_NEXT_DESCRIPTOR_OFFSET GENMASK(29, 5)
#define TOF_ICC_REG_CQS_ENABLE 0x8
#define TOF_ICC_REG_CQS_CACHE_FLUSH 0x10
#define TOF_ICC_REG_CQS_FETCH_STOP 0x18
#define TOF_ICC_REG_CQS_MODE 0x20
#define TOF_ICC_REG_CQS_MODE_SYSTEM BIT(63)
#define TOF_ICC_REG_CQS_MODE_TRP2_ENABLE BIT(62)
#define TOF_ICC_REG_CQS_MODE_TRP1_ENABLE BIT(61)
#define TOF_ICC_REG_CQS_MODE_SESSION BIT(60)
#define TOF_ICC_REG_CQS_MODE_SUBNET_NX GENMASK(53, 48)
#define TOF_ICC_REG_CQS_MODE_SUBNET_SX GENMASK(47, 42)
#define TOF_ICC_REG_CQS_MODE_SUBNET_LX GENMASK(41, 36)
#define TOF_ICC_REG_CQS_MODE_SUBNET_NY GENMASK(35, 30)
#define TOF_ICC_REG_CQS_MODE_SUBNET_SY GENMASK(29, 24)
#define TOF_ICC_REG_CQS_MODE_SUBNET_LY GENMASK(23, 18)
#define TOF_ICC_REG_CQS_MODE_SUBNET_NZ GENMASK(17, 12)
#define TOF_ICC_REG_CQS_MODE_SUBNET_SZ GENMASK(11, 6)
#define TOF_ICC_REG_CQS_MODE_SUBNET_LZ GENMASK(5, 0)
#define TOF_ICC_REG_CQS_GPID 0x28
#define TOF_ICC_REG_CQS_TOQ_IPA 0x30
#define TOF_ICC_REG_CQS_TOQ_SIZE 0x38
#define TOF_ICC_REG_CQS_TCQ_IPA 0x40
#define TOF_ICC_REG_CQS_TCQ_IPA_CACHE_INJECTION BIT(63)
#define TOF_ICC_REG_CQS_MRQ_IPA 0x48
#define TOF_ICC_REG_CQS_MRQ_IPA_CACHE_INJECTION BIT(63)
#define TOF_ICC_REG_CQS_MRQ_SIZE 0x50
#define TOF_ICC_REG_CQS_MRQ_MASK 0x58
#define TOF_ICC_REG_CQS_TCQ_DESCRIPTOR_COALESCING_TIMER 0x60
#define TOF_ICC_REG_CQS_MRQ_DESCRIPTOR_COALESCING_TIMER 0x68
#define TOF_ICC_REG_CQS_MRQ_INTERRUPT_COALESCING_TIMER 0x70
#define TOF_ICC_REG_CQS_MRQ_INTERRUPT_COALESCING_COUNT 0x78
#define TOF_ICC_REG_CQS_TOQ_DIRECT_SOURCE_COUNT 0x80
#define TOF_ICC_REG_CQS_TOQ_DIRECT_DESCRIPTOR_COUNT 0x88
#define TOF_ICC_REG_CQS_MEMORY_BLOCK_TABLE_ENABLE 0x90
#define TOF_ICC_REG_CQS_MEMORY_BLOCK_TABLE_IPA 0x98
#define TOF_ICC_REG_CQS_MEMORY_BLOCK_TABLE_SIZE 0xa0
#define TOF_ICC_REG_CQS_STEERING_TABLE_ENABLE 0xa8
#define TOF_ICC_REG_CQS_STEERING_TABLE_IPA 0xb0
#define TOF_ICC_REG_CQS_STEERING_TABLE_SIZE 0xb8
#define TOF_ICC_REG_CQS_MRQ_INTERRUPT_MASK 0xc0
#define TOF_ICC_REG_CQS_IRR 0xc8
#define TOF_ICC_REG_CQS_IMR 0xd0
#define TOF_ICC_REG_CQS_IRC 0xd8
#define TOF_ICC_REG_CQS_IMC 0xe0
#define TOF_ICC_REG_CQS_ICL 0xe8
#define TOF_ICC_REG_CQS_DUMP_START 0x0
#define TOF_ICC_REG_CQS_DUMP_END 0xf0
/* BGS */
#define TOF_ICC_REG_BGS_PA(tni, bgid) (tof_icc_reg_pa + 0x0000800000 + (tni) * 0x1000000 + (bgid) * 0x10000)
#define TOF_ICC_REG_BGS_ENABLE 0x0
#define TOF_ICC_REG_BGS_IRR 0x8
#define TOF_ICC_REG_BGS_IMR 0x10
#define TOF_ICC_REG_BGS_IRC 0x18
#define TOF_ICC_REG_BGS_IMC 0x20
#define TOF_ICC_REG_BGS_ICL 0x28
#define TOF_ICC_REG_BGS_STATE 0x30
#define TOF_ICC_REG_BGS_STATE_ENABLE BIT(0)
#define TOF_ICC_REG_BGS_EXCEPTION_INFO_GPID_UNMATCH 0x38
#define TOF_ICC_REG_BGS_EXCEPTION_INFO_GPID_UNMATCH_BG_ADDRESS GENMASK(27, 0)
#define TOF_ICC_REG_BGS_EXCEPTION_INFO_ADDRESS_UNMATCH 0x40
#define TOF_ICC_REG_BGS_EXCEPTION_INFO_ADDRESS_UNMATCH_BG_ADDRESS GENMASK(27, 0)
#define TOF_ICC_REG_BGS_SIGNAL_A 0x48
#define TOF_ICC_REG_BGS_SIGNAL_A_SIG_RECV BIT(63)
#define TOF_ICC_REG_BGS_SIGNAL_A_TLP_RECV BIT(62)
#define TOF_ICC_REG_BGS_SIGNAL_A_SIG_SEND BIT(61)
#define TOF_ICC_REG_BGS_SIGNAL_A_OP_TYPE GENMASK(3, 0)
#define TOF_ICC_REG_BGS_SIGNAL_B 0x50
#define TOF_ICC_REG_BGS_SIGNAL_B_SIG_RECV BIT(63)
#define TOF_ICC_REG_BGS_SIGNAL_B_TLP_RECV BIT(62)
#define TOF_ICC_REG_BGS_SIGNAL_B_SIG_SEND BIT(61)
#define TOF_ICC_REG_BGS_SIGNAL_B_OP_TYPE GENMASK(3, 0)
#define TOF_ICC_REG_BGS_SIGNAL_MASK 0x58
#define TOF_ICC_REG_BGS_SIGNAL_MASK_SIG_RECV BIT(63)
#define TOF_ICC_REG_BGS_SIGNAL_MASK_TLP_RECV BIT(62)
#define TOF_ICC_REG_BGS_SIGNAL_MASK_SIG_SEND BIT(61)
#define TOF_ICC_REG_BGS_SIGNAL_MASK_TLP_SEND BIT(60)
#define TOF_ICC_REG_BGS_LOCAL_LINK 0x60
#define TOF_ICC_REG_BGS_LOCAL_LINK_BGID_RECV GENMASK(37, 32)
#define TOF_ICC_REG_BGS_LOCAL_LINK_BGID_SEND GENMASK(5, 0)
#define TOF_ICC_REG_BGS_REMOTE_LINK 0x68
#define TOF_ICC_REG_BGS_REMOTE_LINK_BG_ADDRESS_RECV GENMASK(59, 32)
#define TOF_ICC_REG_BGS_REMOTE_LINK_BG_ADDRESS_SEND GENMASK(31, 0)
#define TOF_ICC_REG_BGS_SUBNET_SIZE 0x70
#define TOF_ICC_REG_BGS_GPID_BSEQ 0x78
#define TOF_ICC_REG_BGS_DATA_A0 0x108
#define TOF_ICC_REG_BGS_DATA_AE 0x178
#define TOF_ICC_REG_BGS_DATA_B0 0x188
#define TOF_ICC_REG_BGS_DATA_BE 0x1f8
#define TOF_ICC_REG_BGS_BCH_MASK 0x800
#define TOF_ICC_REG_BGS_BCH_MASK_MASK BIT(63)
#define TOF_ICC_REG_BGS_BCH_MASK_STATUS 0x808
#define TOF_ICC_REG_BGS_BCH_MASK_STATUS_RUN BIT(63)
#define TOF_ICC_REG_BGS_BCH_NOTICE_IPA 0x810
#define TOF_ICC_REG_BGS_DUMP_START 0x0
#define TOF_ICC_REG_BGS_DUMP_END 0x818
/* TNI */
#define TOF_ICC_REG_TNI_PA(tni) (tof_icc_reg_pa + 0x0000c00000 + (tni) * 0x1000000)
#define TOF_ICC_REG_TNI_IRR 0x8
#define TOF_ICC_REG_TNI_IMR 0x10
#define TOF_ICC_REG_TNI_IRC 0x18
#define TOF_ICC_REG_TNI_IMC 0x20
#define TOF_ICC_REG_TNI_ICL 0x28
#define TOF_ICC_REG_TNI_STATE 0x30
#define TOF_ICC_REG_TNI_STATE_MASK GENMASK(1, 0)
#define TOF_ICC_REG_TNI_STATE_DISABLE 0
#define TOF_ICC_REG_TNI_STATE_NORMAL 2
#define TOF_ICC_REG_TNI_STATE_ERROR 3
#define TOF_ICC_REG_TNI_ENABLE 0x38
#define TOF_ICC_REG_TNI_CQ_PRESENT 0x40
#define TOF_ICC_REG_TNI_EXCEPTION_INFO_INACTIVE_BG 0x48
#define TOF_ICC_REG_TNI_EXCEPTION_INFO_INACTIVE_BG_DEST_BG GENMASK(37, 32)
#define TOF_ICC_REG_TNI_EXCEPTION_INFO_INACTIVE_BG_SOURCE_BG_ADDRESS GENMASK(27, 0)
#define TOF_ICC_REG_TNI_PRQ_FULL_POINTER 0x100
#define TOF_ICC_REG_TNI_PBQ_PA 0x108
#define TOF_ICC_REG_TNI_PBQ_SIZE 0x110
#define TOF_ICC_REG_TNI_PRQ_PA 0x118
#define TOF_ICC_REG_TNI_PRQ_PA_CACHE_INJECTION BIT(63)
#define TOF_ICC_REG_TNI_PRQ_SIZE 0x120
#define TOF_ICC_REG_TNI_PRQ_MASK 0x128
#define TOF_ICC_REG_TNI_PRQ_ENTRY_COALESCING_TIMER 0x130
#define TOF_ICC_REG_TNI_PRQ_INTERRUPT_COALESCING_TIMER 0x138
#define TOF_ICC_REG_TNI_PRQ_INTERRUPT_COALESCING_COUNT 0x140
#define TOF_ICC_REG_TNI_SEND_COUNT 0x148
#define TOF_ICC_REG_TNI_NO_SEND_COUNT 0x150
#define TOF_ICC_REG_TNI_BLOCK_SEND_COUNT 0x158
#define TOF_ICC_REG_TNI_RECEIVE_COUNT 0x160
#define TOF_ICC_REG_TNI_NO_RECEIVE_COUNT 0x168
#define TOF_ICC_REG_TNI_NUM_SEND_TLP 0x170
#define TOF_ICC_REG_TNI_BYTE_SEND_TLP 0x178
#define TOF_ICC_REG_TNI_NUM_SEND_SYSTEM_TLP 0x180
#define TOF_ICC_REG_TNI_NUM_RECEIVE_TLP 0x188
#define TOF_ICC_REG_TNI_BYTE_RECEIVE_TLP 0x190
#define TOF_ICC_REG_TNI_NUM_RECEIVE_NULLIFIED_TLP 0x198
#define TOF_ICC_REG_TNI_RX_NUM_UNKNOWN_TLP 0x1a0
#define TOF_ICC_REG_TNI_RX_NUM_SYSTEM_TLP 0x1a8
#define TOF_ICC_REG_TNI_RX_NUM_EXCEPTION_TLP 0x1b0
#define TOF_ICC_REG_TNI_RX_NUM_DISCARD_UNKNOWN_TLP 0x1b8
#define TOF_ICC_REG_TNI_RX_NUM_DISCARD_SYSTEM_TLP 0x1c0
#define TOF_ICC_REG_TNI_RX_NUM_DISCARD_EXCEPTION_TLP 0x1c8
#define TOF_ICC_REG_TNI_DUMP_START 0x8
#define TOF_ICC_REG_TNI_DUMP_END 0x1d0
/* Port */
#define TOF_ICC_REG_PORT_PA(port) (tof_icc_reg_pa + 0x0006000000 + (port) * 0x1000)
#define TOF_ICC_REG_PORT_TX_VC0_ZERO_CREDIT_COUNT 0x0
#define TOF_ICC_REG_PORT_TX_VC1_ZERO_CREDIT_COUNT 0x8
#define TOF_ICC_REG_PORT_TX_VC2_ZERO_CREDIT_COUNT 0x10
#define TOF_ICC_REG_PORT_TX_VC3_ZERO_CREDIT_COUNT 0x18
#define TOF_ICC_REG_PORT_FREE_RUN_COUNT 0x80
#define TOF_ICC_REG_PORT_NUM_SEND_DLLP 0xc0
#define TOF_ICC_REG_PORT_NUM_SEND_TLP 0xc8
#define TOF_ICC_REG_PORT_BYTE_SEND_TLP 0xd0
#define TOF_ICC_REG_PORT_NUM_SEND_SYSTEM_TLP 0xd8
#define TOF_ICC_REG_PORT_NUM_SEND_NULLIFIED_TLP 0xe0
#define TOF_ICC_REG_PORT_NUM_TX_DISCARD_SYSTEM_TLP 0xe8
#define TOF_ICC_REG_PORT_NUM_TX_DISCARD_NORMAL_TLP 0xf0
#define TOF_ICC_REG_PORT_NUM_TX_FILTERED_NORMAL_TLP 0xf8
#define TOF_ICC_REG_PORT_NUM_VIRTUAL_CUT_THROUGH_TLP 0x100
#define TOF_ICC_REG_PORT_NUM_GENERATE_NULLIFIED_TLP 0x108
#define TOF_ICC_REG_PORT_NUM_RECEIVE_DLLP 0x110
#define TOF_ICC_REG_PORT_NUM_RECEIVE_TLP 0x118
#define TOF_ICC_REG_PORT_BYTE_RECEIVE_TLP 0x120
#define TOF_ICC_REG_PORT_NUM_RECEIVE_SYSTEM_TLP 0x128
#define TOF_ICC_REG_PORT_NUM_RECEIVE_NULLIFIED_TLP 0x130
#define TOF_ICC_REG_PORT_NUM_RX_DISCARD_SYSTEM_TLP 0x138
#define TOF_ICC_REG_PORT_NUM_RX_DISCARD_NORMAL_TLP 0x140
#define TOF_ICC_REG_PORT_NUM_RX_FILTERED_NORMAL_TLP 0x158
#define TOF_ICC_REG_PORT_NUM_RX_DISCARD_NULLIFIED_TLP 0x160
#define TOF_ICC_REG_PORT_FRAME_LCRC_ERROR_COUNT 0x170
#define TOF_ICC_REG_PORT_TX_RETRY_BUFFER_CE_COUNT 0x180
#define TOF_ICC_REG_PORT_RX_VC_BUFFER_CE_COUNT 0x188
#define TOF_ICC_REG_PORT_XB_CE_COUNT 0x190
#define TOF_ICC_REG_PORT_ACK_NACK_TIME_OUT_COUNT 0x198
#define TOF_ICC_REG_PORT_SLICE0_FCS_ERROR_COUNT 0x1a0
#define TOF_ICC_REG_PORT_SLICE1_FCS_ERROR_COUNT 0x1a8
#define TOF_ICC_REG_PORT_DUMP_START 0x0
#define TOF_ICC_REG_PORT_DUMP_END 0x1b0
/* XB */
#define TOF_ICC_REG_XB_PA (tof_icc_reg_pa + 0x000600f000)
#define TOF_ICC_REG_XB_STQ_ENABLE 0x0
#define TOF_ICC_REG_XB_STQ_UPDATE_INTERVAL 0x8
#define TOF_ICC_REG_XB_STQ_PA 0x10
#define TOF_ICC_REG_XB_STQ_SIZE 0x18
#define TOF_ICC_REG_XB_STQ_NEXT_OFFSET 0x20
#define TOF_ICC_REG_XB_DUMP_START 0x0
#define TOF_ICC_REG_XB_DUMP_END 0x28
#define TOF_ICC_XB_TC_DATA_CYCLE_COUNT(tni) ((tni) * 0x10 + 0x0)
#define TOF_ICC_XB_TC_WAIT_CYCLE_COUNT(tni) ((tni) * 0x10 + 0x8)
#define TOF_ICC_XB_TD_DATA_CYCLE_COUNT(tnr) ((tnr) * 0x10 + 0x60)
#define TOF_ICC_XB_TD_WAIT_CYCLE_COUNT(tnr) ((tnr) * 0x10 + 0x68)
/* Tofu */
#define TOF_ICC_REG_TOFU_PA (tof_icc_reg_pa + 0x0007000000)
#define TOF_ICC_REG_TOFU_NODE_ADDRESS 0x0
#define TOF_ICC_REG_TOFU_NODE_ADDRESS_X GENMASK(22, 18)
#define TOF_ICC_REG_TOFU_NODE_ADDRESS_Y GENMASK(17, 13)
#define TOF_ICC_REG_TOFU_NODE_ADDRESS_Z GENMASK(12, 8)
#define TOF_ICC_REG_TOFU_NODE_ADDRESS_A BIT(7)
#define TOF_ICC_REG_TOFU_NODE_ADDRESS_B GENMASK(6, 5)
#define TOF_ICC_REG_TOFU_NODE_ADDRESS_C BIT(4)
#define TOF_ICC_REG_TOFU_PORT_SETTING 0x8
#define TOF_ICC_REG_TOFU_TD_TLP_FILTER(tnr) ((tnr) * 0x10 + 0x10)
#define TOF_ICC_REG_TOFU_TD_SETTINGS(tnr) ((tnr) * 0x10 + 0x18)
#define TOF_ICC_REG_TOFU_TNR_MSI_BASE 0xc0
#define TOF_ICC_REG_TOFU_TNR_IRR 0xc8
#define TOF_ICC_REG_TOFU_TNR_IMR 0xd0
#define TOF_ICC_REG_TOFU_TNR_IRC 0xd8
#define TOF_ICC_REG_TOFU_TNR_IMC 0xe0
#define TOF_ICC_REG_TOFU_TNR_ICL 0xe8
#define TOF_ICC_REG_TOFU_TNI_VMS(tni, vmsid) ((tni) * 0x100 + (vmsid) * 0x8 + 0x100)
#define TOF_ICC_REG_TOFU_TNI_VMS_CQ00(tni) ((tni) * 0x100 + 0x180)
#define TOF_ICC_REG_TOFU_TNI_VMS_BG00(tni) ((tni) * 0x100 + 0x1a0)
#define TOF_ICC_REG_TOFU_TNI_VMS_BG16(tni) ((tni) * 0x100 + 0x1a8)
#define TOF_ICC_REG_TOFU_TNI_VMS_BG32(tni) ((tni) * 0x100 + 0x1b0)
#define TOF_ICC_REG_TOFU_TNI_MSI_BASE(tni) ((tni) * 0x100 + 0x1c0)
#define TOF_ICC_REG_TOFU_DUMP_START 0x0
#define TOF_ICC_REG_TOFU_DUMP_END 0x6c8
/** Interrupts **/
#define TOF_ICC_IRQ_CQS_TOQ_READ_EXCEPTION BIT(0)
#define TOF_ICC_IRQ_CQS_TOQ_DIRECT_DESCRIPTOR_EXCEPTION BIT(1)
#define TOF_ICC_IRQ_CQS_TOQ_MARKED_UE BIT(2)
#define TOF_ICC_IRQ_CQS_TCQ_WRITE_EXCEPTION BIT(3)
#define TOF_ICC_IRQ_CQS_TOQ_SOURCE_TYPE_EXCEPTION BIT(4)
#define TOF_ICC_IRQ_CQS_TCQ_WRITE_ACKNOWLEDGE BIT(5)
#define TOF_ICC_IRQ_CQS_MRQ_WRITE_ACKNOWLEDGE BIT(7)
#define TOF_ICC_IRQ_CQS_MRQ_WRITE_EXCEPTION BIT(8)
#define TOF_ICC_IRQ_CQS_MRQ_OVERFLOW BIT(9)
#define TOF_ICC_IRQ_CQS_STEERING_READ_EXCEPTION BIT(36)
#define TOF_ICC_IRQ_CQS_MB_READ_EXCEPTION BIT(38)
#define TOF_ICC_IRQ_CQS_PAYLOAD_READ_EXCEPTION BIT(39)
#define TOF_ICC_IRQ_CQS_PAYLOAD_WRITE_EXCEPTION BIT(40)
/* Just for convenience of IRR values; no CQS CACHEFLUSH_TIMEOUT interrupt actually exists */
#define TOF_ICC_DUMMY_IRQ_CQS_CACHEFLUSH_TIMEOUT BIT(63)
#define TOF_ICC_IRQ_BGS_NODE_ADDRESS_UNMATCH BIT(0)
#define TOF_ICC_IRQ_BGS_BG_RECV_ADDRESS_EXCEPTION BIT(1)
#define TOF_ICC_IRQ_BGS_BG_SEND_ADDRESS_EXCEPTION BIT(2)
#define TOF_ICC_IRQ_BGS_GPID_UNMATCH BIT(3)
#define TOF_ICC_IRQ_BGS_BSEQ_UNMATCH BIT(4)
#define TOF_ICC_IRQ_BGS_SIGNAL_STATE_ERROR BIT(5)
#define TOF_ICC_IRQ_BGS_SYNCHRONIZATION_ACKNOWLEDGE BIT(24)
#define TOF_ICC_IRQ_BGS_ERROR_SYNCHRONIZATION_ACKNOWLEDGE BIT(25)
#define TOF_ICC_IRQ_BGS_DMA_COMPLETION_EXCEPTION BIT(26)
#define TOF_ICC_IRQ_TNI_PBQ_READ_EXCEPTION BIT(0)
#define TOF_ICC_IRQ_TNI_PBQ_MARKED_UE BIT(1)
#define TOF_ICC_IRQ_TNI_PBQ_UNDERFLOW BIT(2)
#define TOF_ICC_IRQ_TNI_PRQ_PACKET_DISCARD BIT(3)
#define TOF_ICC_IRQ_TNI_PRQ_WRITE_ACKNOWLEDGE BIT(4)
#define TOF_ICC_IRQ_TNI_PRQ_WRITE_EXCEPTION BIT(5)
#define TOF_ICC_IRQ_TNI_PRQ_OVERFLOW BIT(6)
#define TOF_ICC_IRQ_TNI_INACTIVE_BG BIT(16)
#define TOF_ICC_IRQ_TNI_STAGE2_TRANSLATION_FAULT BIT(32)
#define TOF_ICC_IRQ_TNR_TNR0_RX_FILTER_OUT BIT(0)
#define TOF_ICC_IRQ_TNR_TNR0_TX_FILTER_OUT BIT(1)
#define TOF_ICC_IRQ_TNR_TNR0_PORT_ERROR BIT(2)
#define TOF_ICC_IRQ_TNR_TNR0_DATELINE_ERROR BIT(3)
#define TOF_ICC_IRQ_TNR_TNR0_ROUTING_ERROR BIT(4)
#define TOF_ICC_IRQ_TNR_TNR1_RX_FILTER_OUT BIT(6)
#define TOF_ICC_IRQ_TNR_TNR1_TX_FILTER_OUT BIT(7)
#define TOF_ICC_IRQ_TNR_TNR1_PORT_ERROR BIT(8)
#define TOF_ICC_IRQ_TNR_TNR1_DATELINE_ERROR BIT(9)
#define TOF_ICC_IRQ_TNR_TNR1_ROUTING_ERROR BIT(10)
#define TOF_ICC_IRQ_TNR_TNR2_RX_FILTER_OUT BIT(12)
#define TOF_ICC_IRQ_TNR_TNR2_TX_FILTER_OUT BIT(13)
#define TOF_ICC_IRQ_TNR_TNR2_PORT_ERROR BIT(14)
#define TOF_ICC_IRQ_TNR_TNR2_DATELINE_ERROR BIT(15)
#define TOF_ICC_IRQ_TNR_TNR2_ROUTING_ERROR BIT(16)
#define TOF_ICC_IRQ_TNR_TNR3_RX_FILTER_OUT BIT(18)
#define TOF_ICC_IRQ_TNR_TNR3_TX_FILTER_OUT BIT(19)
#define TOF_ICC_IRQ_TNR_TNR3_PORT_ERROR BIT(20)
#define TOF_ICC_IRQ_TNR_TNR3_DATELINE_ERROR BIT(21)
#define TOF_ICC_IRQ_TNR_TNR3_ROUTING_ERROR BIT(22)
#define TOF_ICC_IRQ_TNR_TNR4_RX_FILTER_OUT BIT(24)
#define TOF_ICC_IRQ_TNR_TNR4_TX_FILTER_OUT BIT(25)
#define TOF_ICC_IRQ_TNR_TNR4_PORT_ERROR BIT(26)
#define TOF_ICC_IRQ_TNR_TNR4_DATELINE_ERROR BIT(27)
#define TOF_ICC_IRQ_TNR_TNR4_ROUTING_ERROR BIT(28)
#define TOF_ICC_IRQ_TNR_TNR5_RX_FILTER_OUT BIT(30)
#define TOF_ICC_IRQ_TNR_TNR5_TX_FILTER_OUT BIT(31)
#define TOF_ICC_IRQ_TNR_TNR5_PORT_ERROR BIT(32)
#define TOF_ICC_IRQ_TNR_TNR5_DATELINE_ERROR BIT(33)
#define TOF_ICC_IRQ_TNR_TNR5_ROUTING_ERROR BIT(34)
#define TOF_ICC_IRQ_TNR_TNR6_RX_FILTER_OUT BIT(36)
#define TOF_ICC_IRQ_TNR_TNR6_TX_FILTER_OUT BIT(37)
#define TOF_ICC_IRQ_TNR_TNR6_PORT_ERROR BIT(38)
#define TOF_ICC_IRQ_TNR_TNR6_DATELINE_ERROR BIT(39)
#define TOF_ICC_IRQ_TNR_TNR6_ROUTING_ERROR BIT(40)
#define TOF_ICC_IRQ_TNR_TNR7_RX_FILTER_OUT BIT(42)
#define TOF_ICC_IRQ_TNR_TNR7_TX_FILTER_OUT BIT(43)
#define TOF_ICC_IRQ_TNR_TNR7_PORT_ERROR BIT(44)
#define TOF_ICC_IRQ_TNR_TNR7_DATELINE_ERROR BIT(45)
#define TOF_ICC_IRQ_TNR_TNR7_ROUTING_ERROR BIT(46)
#define TOF_ICC_IRQ_TNR_TNR8_RX_FILTER_OUT BIT(48)
#define TOF_ICC_IRQ_TNR_TNR8_TX_FILTER_OUT BIT(49)
#define TOF_ICC_IRQ_TNR_TNR8_PORT_ERROR BIT(50)
#define TOF_ICC_IRQ_TNR_TNR8_DATELINE_ERROR BIT(51)
#define TOF_ICC_IRQ_TNR_TNR8_ROUTING_ERROR BIT(52)
#define TOF_ICC_IRQ_TNR_TNR9_RX_FILTER_OUT BIT(54)
#define TOF_ICC_IRQ_TNR_TNR9_TX_FILTER_OUT BIT(55)
#define TOF_ICC_IRQ_TNR_TNR9_PORT_ERROR BIT(56)
#define TOF_ICC_IRQ_TNR_TNR9_DATELINE_ERROR BIT(57)
#define TOF_ICC_IRQ_TNR_TNR9_ROUTING_ERROR BIT(58)
#endif
/* vim: set noet ts=8 sw=8 sts=0 tw=0 : */

View File

@ -0,0 +1,319 @@
#ifndef _TOF_UAPI_H_
#define _TOF_UAPI_H_
#include <ihk/types.h>
#include <arch-memory.h>
/*
 * Error codes delivered with a CQ (command queue) error signal.
 * Names correspond to TOF_ICC_IRQ_CQS_* interrupt causes, including the
 * software-only TOF_ICC_DUMMY_IRQ_CQS_CACHEFLUSH_TIMEOUT pseudo interrupt.
 */
enum tof_sig_errno_cq {
TOF_TOQ_DIRECT_DESCRIPTOR_EXCEPTION,
TOF_TOQ_SOURCE_TYPE_EXCEPTION,
TOF_MRQ_OVERFLOW,
TOF_CQS_CACHEFLUSH_TIMEOUT,
};
/*
 * Error codes delivered with a BG (barrier gate) error signal;
 * names correspond to TOF_ICC_IRQ_BGS_* interrupt causes.
 */
enum tof_sig_errno_bg {
TOF_NODE_ADDRESS_UNMATCH,
TOF_BSEQ_UNMATCH,
TOF_SIGNAL_STATE_ERROR,
TOF_ERROR_SYNCHRONIZATION_ACKNOWLEDGE,
};
#define TOF_UAPI_VERSION 0x2a00
/*
 * Argument block for TOF_IOCTL_INIT_CQ: initializes one command queue.
 * version carries TOF_UAPI_VERSION so user space and the driver agree on
 * the ABI (NOTE(review): presumably validated by the driver -- confirm).
 */
struct tof_init_cq {
uint16_t version; /* caller's TOF_UAPI_VERSION */
uint8_t session_mode;
uint8_t toq_size; /* TOQ size (encoding/units not visible here) */
uint8_t mrq_size; /* MRQ size (encoding/units not visible here) */
uint8_t num_stag; /* number of steering tags to provision */
uint8_t tcq_cinj; /* NOTE(review): likely TCQ cache-injection enable -- confirm */
uint8_t mrq_cinj; /* NOTE(review): likely MRQ cache-injection enable -- confirm */
void *toq_mem; /* memory backing the TOQ */
void *tcq_mem; /* memory backing the TCQ */
void *mrq_mem; /* memory backing the MRQ */
};
/*
 * Argument for TOF_IOCTL_ALLOC_STAG: presumably registers [va, va+len)
 * and reports the assigned steering tag in stag -- confirm in/out roles
 * against the driver.
 */
struct tof_alloc_stag {
uint32_t flags; /* TOF_ST_* flags */
int stag; /* steering tag (likely an output field) */
uint64_t offset;
void *va; /* start of the memory region */
uint64_t len; /* region length in bytes */
};
/* Argument for TOF_IOCTL_FREE_STAGS: releases the num tags listed in stags[]. */
struct tof_free_stags {
uint16_t num; /* number of entries in stags[] */
int *stags; /* array of steering tag values to free */
};
/*
 * Tofu node coordinate. x/y/z/a/b/c match the
 * TOF_ICC_REG_TOFU_NODE_ADDRESS_{X,Y,Z,A,B,C} register fields.
 * NOTE(review): pa/pb/pc are presumably per-axis port coordinates -- confirm.
 */
struct tof_addr {
uint8_t pa;
uint8_t pb;
uint8_t pc;
uint8_t x;
uint8_t y;
uint8_t z;
uint8_t a;
uint8_t b;
uint8_t c;
};
/*
 * One barrier-gate (BG) configuration entry: the local gate (tni, gate)
 * is wired to a source and a destination, each identified by remote node
 * address (raddr), remote TNI and remote gate id plus a local gate id.
 */
struct tof_set_bg {
int tni; /* local TNI index */
int gate; /* local barrier gate id */
int source_lgate;
struct tof_addr source_raddr; /* node address of the source side */
int source_rtni;
int source_rgate;
int dest_lgate;
struct tof_addr dest_raddr; /* node address of the destination side */
int dest_rtni;
int dest_rgate;
};
/*
 * Argument for TOF_IOCTL_ENABLE_BCH: enables a barrier channel built
 * from the num barrier-gate entries in bgs[].
 */
struct tof_enable_bch {
void *addr; /* NOTE(review): presumably the user-visible BCH address -- confirm */
int bseq; /* barrier sequence number (cf. TOF_ICC_IRQ_BGS_BSEQ_UNMATCH) */
int num; /* number of entries in bgs[] */
struct tof_set_bg *bgs; /* gate configuration array */
};
/*
 * Argument for TOF_IOCTL_SET_SUBNET. Three per-axis triples for x/y/z.
 * NOTE(review): n/s/l presumably encode the subnet's size/start/length
 * along each axis -- confirm against the driver.
 */
struct tof_set_subnet {
int res0; /* reserved (by name) */
int res1; /* reserved (by name) */
uint8_t nx;
uint8_t sx;
uint8_t lx;
uint8_t ny;
uint8_t sy;
uint8_t ly;
uint8_t nz;
uint8_t sz;
uint8_t lz;
};
/*
 * Argument for TOF_IOCTL_REG_USER: registers a user (uid, gpid) together
 * with its subnet and the CQ/BG resources it may use.
 */
struct tof_reg_user {
uid_t uid;
uint32_t gpid; /* global process id (cf. TOF_ICC_REG_BGS_GPID_BSEQ) */
struct tof_set_subnet subnet;
uint64_t *cqmask; /* permitted CQs bitmask (per-TNI word layout -- confirm) */
uint64_t *bgmask; /* permitted BGs bitmask (per-TNI word layout -- confirm) */
};
/*
 * Argument for TOF_IOCTL_NOTIFY_LINKDOWN: num entries, each naming a
 * node by its (x,y,z,a,b,c) coordinate plus a port set.
 */
struct tof_notify_linkdown {
int num; /* number of entries in items[] */
struct {
uint8_t x;
uint8_t y;
uint8_t z;
uint8_t a;
uint8_t b;
uint8_t c;
uint16_t ports; /* NOTE(review): presumably a bitmask of affected ports -- confirm */
} *items;
};
/*
 * Argument for TOF_IOCTL_GET_PORT_STAT: reads counters of one port.
 * NOTE(review): mask presumably selects which of the 31 pa[] slots are
 * filled (31 matches the TOF_ICC_REG_PORT_* counter list) -- confirm.
 */
struct tof_get_port_stat {
int port_no;
uint64_t mask;
uint64_t pa[31];
};
/* Argument for TOF_IOCTL_GET_CQ_STAT: byte counters of CQ (tni, cqid). */
struct tof_get_cq_stat {
int tni;
int cqid;
uint64_t txbyte; /* transmitted bytes */
uint64_t rxbyte; /* received bytes */
};
/*
 * Argument for TOF_IOCTL_LOAD_REGISTER: copies len bytes of register
 * space starting at physical address pa into buf.
 */
struct tof_load_register {
uint64_t pa; /* physical address to read */
uint64_t len; /* byte count */
void *buf; /* destination buffer */
};
/*
 * Argument for TOF_IOCTL_LOAD_RESOURCE: reads len bytes at offset from
 * the resource identified by rsc_id (a TOF_RSC_* value) into buf.
 */
struct tof_load_resource {
uint64_t rsc_id; /* TOF_RSC_* resource id */
uint64_t offset; /* byte offset within the resource */
uint64_t len; /* byte count */
void *buf; /* destination buffer */
};
/*
 * One 64-bit address-translation descriptor, accessible either as bit
 * fields or as a single word (the member named atomic) so it can be
 * replaced with one store.
 */
union tof_trans_table_bitfield {
struct {
uint64_t start:36; /* NOTE(review): presumably a page-granular base -- confirm units */
uint64_t len:27;
uint64_t ps_code:1; /* page-size code: TOF_STAG_TRANS_PS_CODE_64KB or _2MB */
} bits;
uint64_t atomic; /* whole-word view */
};
/* Translation entry pair: steering-table entry plus MB page-table (mbpt) entry. */
struct tof_trans_table {
union tof_trans_table_bitfield steering;
union tof_trans_table_bitfield mbpt;
};
/* Hooks for registering/unregistering a linkdown notification callback. */
void tof_utofu_set_linkdown_callback(void (*callback)(int, const void *));
void tof_utofu_unset_linkdown_callback(void);
/* mmap() offsets for the per-resource device mappings */
#define TOF_MMAP_CQ_REGISTER 0
#define TOF_MMAP_CQ_TRANSTABLE (PAGE_SIZE)
#define TOF_MMAP_BCH_REGISTER 0
#define TOF_MMAP_XB_STQ 0
/* Steering-tag registration flags (tof_alloc_stag.flags) */
#define TOF_ST_RDWR 0x0
#define TOF_ST_RDONLY 0x1
#define TOF_ST_LPG 0x2
/* Page-size codes for tof_trans_table_bitfield.bits.ps_code */
#define TOF_STAG_TRANS_PS_CODE_64KB 0
#define TOF_STAG_TRANS_PS_CODE_2MB 1
/* ioctl command numbers; argument structs are defined above */
#define TOF_IOC_MAGIC 'd'
#define TOF_IOCTL_INIT_CQ _IOWR(TOF_IOC_MAGIC, 0, long)
#define TOF_IOCTL_ALLOC_STAG _IOWR(TOF_IOC_MAGIC, 1, long)
#define TOF_IOCTL_FREE_STAGS _IOWR(TOF_IOC_MAGIC, 2, long)
#define TOF_IOCTL_ENABLE_BCH _IOWR(TOF_IOC_MAGIC, 3, long)
#define TOF_IOCTL_DISABLE_BCH _IOWR(TOF_IOC_MAGIC, 4, long)
#define TOF_IOCTL_SET_RT_SIGNAL _IOWR(TOF_IOC_MAGIC, 5, long)
#define TOF_IOCTL_SET_SUBNET _IOWR(TOF_IOC_MAGIC, 6, long)
#define TOF_IOCTL_REG_USER _IOWR(TOF_IOC_MAGIC, 7, long)
#define TOF_IOCTL_NOTIFY_LINKDOWN _IOWR(TOF_IOC_MAGIC, 8, long)
#define TOF_IOCTL_GET_PORT_STAT _IOWR(TOF_IOC_MAGIC, 9, long)
#define TOF_IOCTL_GET_CQ_STAT _IOWR(TOF_IOC_MAGIC, 10, long)
#define TOF_IOCTL_LOAD_REGISTER _IOWR(TOF_IOC_MAGIC, 11, long)
#define TOF_IOCTL_LOAD_RESOURCE _IOWR(TOF_IOC_MAGIC, 12, long)
/*
 * Tofu resource id space (tof_load_resource.rsc_id): 6 TNIs, 12 CQs per
 * TNI.  The blocks are laid out consecutively, so the TOF_RSC_* helper
 * macros below compute an id as block_base + tni * 12 + cqid.
 *
 * Fixes relative to the previous revision:
 *  - TOF_RSC_TNI5_STEERINGTABLE0/1/2 were missing, which shifted every
 *    MBTABLE id by 3, made the documented ranges (228-299, 300-371) and
 *    TOF_RSC_NUM (372) wrong, and broke TOF_RSC_STT()/TOF_RSC_MBT().
 *  - The second section was mislabeled "TOQ"; it is the TCQ block.
 *  - Macro parameters are now parenthesized so expression arguments
 *    (e.g. TOF_RSC_TOQ(i + 1, j)) expand correctly.
 */
enum {
/* TOQ (0 - 71) */
TOF_RSC_TNI0_TOQ0 = 0, TOF_RSC_TNI0_TOQ1, TOF_RSC_TNI0_TOQ2, TOF_RSC_TNI0_TOQ3,
TOF_RSC_TNI0_TOQ4, TOF_RSC_TNI0_TOQ5, TOF_RSC_TNI0_TOQ6, TOF_RSC_TNI0_TOQ7,
TOF_RSC_TNI0_TOQ8, TOF_RSC_TNI0_TOQ9, TOF_RSC_TNI0_TOQ10, TOF_RSC_TNI0_TOQ11,
TOF_RSC_TNI1_TOQ0, TOF_RSC_TNI1_TOQ1, TOF_RSC_TNI1_TOQ2, TOF_RSC_TNI1_TOQ3,
TOF_RSC_TNI1_TOQ4, TOF_RSC_TNI1_TOQ5, TOF_RSC_TNI1_TOQ6, TOF_RSC_TNI1_TOQ7,
TOF_RSC_TNI1_TOQ8, TOF_RSC_TNI1_TOQ9, TOF_RSC_TNI1_TOQ10, TOF_RSC_TNI1_TOQ11,
TOF_RSC_TNI2_TOQ0, TOF_RSC_TNI2_TOQ1, TOF_RSC_TNI2_TOQ2, TOF_RSC_TNI2_TOQ3,
TOF_RSC_TNI2_TOQ4, TOF_RSC_TNI2_TOQ5, TOF_RSC_TNI2_TOQ6, TOF_RSC_TNI2_TOQ7,
TOF_RSC_TNI2_TOQ8, TOF_RSC_TNI2_TOQ9, TOF_RSC_TNI2_TOQ10, TOF_RSC_TNI2_TOQ11,
TOF_RSC_TNI3_TOQ0, TOF_RSC_TNI3_TOQ1, TOF_RSC_TNI3_TOQ2, TOF_RSC_TNI3_TOQ3,
TOF_RSC_TNI3_TOQ4, TOF_RSC_TNI3_TOQ5, TOF_RSC_TNI3_TOQ6, TOF_RSC_TNI3_TOQ7,
TOF_RSC_TNI3_TOQ8, TOF_RSC_TNI3_TOQ9, TOF_RSC_TNI3_TOQ10, TOF_RSC_TNI3_TOQ11,
TOF_RSC_TNI4_TOQ0, TOF_RSC_TNI4_TOQ1, TOF_RSC_TNI4_TOQ2, TOF_RSC_TNI4_TOQ3,
TOF_RSC_TNI4_TOQ4, TOF_RSC_TNI4_TOQ5, TOF_RSC_TNI4_TOQ6, TOF_RSC_TNI4_TOQ7,
TOF_RSC_TNI4_TOQ8, TOF_RSC_TNI4_TOQ9, TOF_RSC_TNI4_TOQ10, TOF_RSC_TNI4_TOQ11,
TOF_RSC_TNI5_TOQ0, TOF_RSC_TNI5_TOQ1, TOF_RSC_TNI5_TOQ2, TOF_RSC_TNI5_TOQ3,
TOF_RSC_TNI5_TOQ4, TOF_RSC_TNI5_TOQ5, TOF_RSC_TNI5_TOQ6, TOF_RSC_TNI5_TOQ7,
TOF_RSC_TNI5_TOQ8, TOF_RSC_TNI5_TOQ9, TOF_RSC_TNI5_TOQ10, TOF_RSC_TNI5_TOQ11,
/* TCQ (72 - 143) */
TOF_RSC_TNI0_TCQ0, TOF_RSC_TNI0_TCQ1, TOF_RSC_TNI0_TCQ2, TOF_RSC_TNI0_TCQ3,
TOF_RSC_TNI0_TCQ4, TOF_RSC_TNI0_TCQ5, TOF_RSC_TNI0_TCQ6, TOF_RSC_TNI0_TCQ7,
TOF_RSC_TNI0_TCQ8, TOF_RSC_TNI0_TCQ9, TOF_RSC_TNI0_TCQ10, TOF_RSC_TNI0_TCQ11,
TOF_RSC_TNI1_TCQ0, TOF_RSC_TNI1_TCQ1, TOF_RSC_TNI1_TCQ2, TOF_RSC_TNI1_TCQ3,
TOF_RSC_TNI1_TCQ4, TOF_RSC_TNI1_TCQ5, TOF_RSC_TNI1_TCQ6, TOF_RSC_TNI1_TCQ7,
TOF_RSC_TNI1_TCQ8, TOF_RSC_TNI1_TCQ9, TOF_RSC_TNI1_TCQ10, TOF_RSC_TNI1_TCQ11,
TOF_RSC_TNI2_TCQ0, TOF_RSC_TNI2_TCQ1, TOF_RSC_TNI2_TCQ2, TOF_RSC_TNI2_TCQ3,
TOF_RSC_TNI2_TCQ4, TOF_RSC_TNI2_TCQ5, TOF_RSC_TNI2_TCQ6, TOF_RSC_TNI2_TCQ7,
TOF_RSC_TNI2_TCQ8, TOF_RSC_TNI2_TCQ9, TOF_RSC_TNI2_TCQ10, TOF_RSC_TNI2_TCQ11,
TOF_RSC_TNI3_TCQ0, TOF_RSC_TNI3_TCQ1, TOF_RSC_TNI3_TCQ2, TOF_RSC_TNI3_TCQ3,
TOF_RSC_TNI3_TCQ4, TOF_RSC_TNI3_TCQ5, TOF_RSC_TNI3_TCQ6, TOF_RSC_TNI3_TCQ7,
TOF_RSC_TNI3_TCQ8, TOF_RSC_TNI3_TCQ9, TOF_RSC_TNI3_TCQ10, TOF_RSC_TNI3_TCQ11,
TOF_RSC_TNI4_TCQ0, TOF_RSC_TNI4_TCQ1, TOF_RSC_TNI4_TCQ2, TOF_RSC_TNI4_TCQ3,
TOF_RSC_TNI4_TCQ4, TOF_RSC_TNI4_TCQ5, TOF_RSC_TNI4_TCQ6, TOF_RSC_TNI4_TCQ7,
TOF_RSC_TNI4_TCQ8, TOF_RSC_TNI4_TCQ9, TOF_RSC_TNI4_TCQ10, TOF_RSC_TNI4_TCQ11,
TOF_RSC_TNI5_TCQ0, TOF_RSC_TNI5_TCQ1, TOF_RSC_TNI5_TCQ2, TOF_RSC_TNI5_TCQ3,
TOF_RSC_TNI5_TCQ4, TOF_RSC_TNI5_TCQ5, TOF_RSC_TNI5_TCQ6, TOF_RSC_TNI5_TCQ7,
TOF_RSC_TNI5_TCQ8, TOF_RSC_TNI5_TCQ9, TOF_RSC_TNI5_TCQ10, TOF_RSC_TNI5_TCQ11,
/* MRQ (144 - 215) */
TOF_RSC_TNI0_MRQ0, TOF_RSC_TNI0_MRQ1, TOF_RSC_TNI0_MRQ2, TOF_RSC_TNI0_MRQ3,
TOF_RSC_TNI0_MRQ4, TOF_RSC_TNI0_MRQ5, TOF_RSC_TNI0_MRQ6, TOF_RSC_TNI0_MRQ7,
TOF_RSC_TNI0_MRQ8, TOF_RSC_TNI0_MRQ9, TOF_RSC_TNI0_MRQ10, TOF_RSC_TNI0_MRQ11,
TOF_RSC_TNI1_MRQ0, TOF_RSC_TNI1_MRQ1, TOF_RSC_TNI1_MRQ2, TOF_RSC_TNI1_MRQ3,
TOF_RSC_TNI1_MRQ4, TOF_RSC_TNI1_MRQ5, TOF_RSC_TNI1_MRQ6, TOF_RSC_TNI1_MRQ7,
TOF_RSC_TNI1_MRQ8, TOF_RSC_TNI1_MRQ9, TOF_RSC_TNI1_MRQ10, TOF_RSC_TNI1_MRQ11,
TOF_RSC_TNI2_MRQ0, TOF_RSC_TNI2_MRQ1, TOF_RSC_TNI2_MRQ2, TOF_RSC_TNI2_MRQ3,
TOF_RSC_TNI2_MRQ4, TOF_RSC_TNI2_MRQ5, TOF_RSC_TNI2_MRQ6, TOF_RSC_TNI2_MRQ7,
TOF_RSC_TNI2_MRQ8, TOF_RSC_TNI2_MRQ9, TOF_RSC_TNI2_MRQ10, TOF_RSC_TNI2_MRQ11,
TOF_RSC_TNI3_MRQ0, TOF_RSC_TNI3_MRQ1, TOF_RSC_TNI3_MRQ2, TOF_RSC_TNI3_MRQ3,
TOF_RSC_TNI3_MRQ4, TOF_RSC_TNI3_MRQ5, TOF_RSC_TNI3_MRQ6, TOF_RSC_TNI3_MRQ7,
TOF_RSC_TNI3_MRQ8, TOF_RSC_TNI3_MRQ9, TOF_RSC_TNI3_MRQ10, TOF_RSC_TNI3_MRQ11,
TOF_RSC_TNI4_MRQ0, TOF_RSC_TNI4_MRQ1, TOF_RSC_TNI4_MRQ2, TOF_RSC_TNI4_MRQ3,
TOF_RSC_TNI4_MRQ4, TOF_RSC_TNI4_MRQ5, TOF_RSC_TNI4_MRQ6, TOF_RSC_TNI4_MRQ7,
TOF_RSC_TNI4_MRQ8, TOF_RSC_TNI4_MRQ9, TOF_RSC_TNI4_MRQ10, TOF_RSC_TNI4_MRQ11,
TOF_RSC_TNI5_MRQ0, TOF_RSC_TNI5_MRQ1, TOF_RSC_TNI5_MRQ2, TOF_RSC_TNI5_MRQ3,
TOF_RSC_TNI5_MRQ4, TOF_RSC_TNI5_MRQ5, TOF_RSC_TNI5_MRQ6, TOF_RSC_TNI5_MRQ7,
TOF_RSC_TNI5_MRQ8, TOF_RSC_TNI5_MRQ9, TOF_RSC_TNI5_MRQ10, TOF_RSC_TNI5_MRQ11,
/* PBQ (216 - 221) */
TOF_RSC_TNI0_PBQ, TOF_RSC_TNI1_PBQ, TOF_RSC_TNI2_PBQ, TOF_RSC_TNI3_PBQ,
TOF_RSC_TNI4_PBQ, TOF_RSC_TNI5_PBQ,
/* PRQ (222 - 227) */
TOF_RSC_TNI0_PRQ, TOF_RSC_TNI1_PRQ, TOF_RSC_TNI2_PRQ, TOF_RSC_TNI3_PRQ,
TOF_RSC_TNI4_PRQ, TOF_RSC_TNI5_PRQ,
/* STEERINGTABLE (228 - 299) */
TOF_RSC_TNI0_STEERINGTABLE0, TOF_RSC_TNI0_STEERINGTABLE1, TOF_RSC_TNI0_STEERINGTABLE2,
TOF_RSC_TNI0_STEERINGTABLE3, TOF_RSC_TNI0_STEERINGTABLE4, TOF_RSC_TNI0_STEERINGTABLE5,
TOF_RSC_TNI0_STEERINGTABLE6, TOF_RSC_TNI0_STEERINGTABLE7, TOF_RSC_TNI0_STEERINGTABLE8,
TOF_RSC_TNI0_STEERINGTABLE9, TOF_RSC_TNI0_STEERINGTABLE10, TOF_RSC_TNI0_STEERINGTABLE11,
TOF_RSC_TNI1_STEERINGTABLE0, TOF_RSC_TNI1_STEERINGTABLE1, TOF_RSC_TNI1_STEERINGTABLE2,
TOF_RSC_TNI1_STEERINGTABLE3, TOF_RSC_TNI1_STEERINGTABLE4, TOF_RSC_TNI1_STEERINGTABLE5,
TOF_RSC_TNI1_STEERINGTABLE6, TOF_RSC_TNI1_STEERINGTABLE7, TOF_RSC_TNI1_STEERINGTABLE8,
TOF_RSC_TNI1_STEERINGTABLE9, TOF_RSC_TNI1_STEERINGTABLE10, TOF_RSC_TNI1_STEERINGTABLE11,
TOF_RSC_TNI2_STEERINGTABLE0, TOF_RSC_TNI2_STEERINGTABLE1, TOF_RSC_TNI2_STEERINGTABLE2,
TOF_RSC_TNI2_STEERINGTABLE3, TOF_RSC_TNI2_STEERINGTABLE4, TOF_RSC_TNI2_STEERINGTABLE5,
TOF_RSC_TNI2_STEERINGTABLE6, TOF_RSC_TNI2_STEERINGTABLE7, TOF_RSC_TNI2_STEERINGTABLE8,
TOF_RSC_TNI2_STEERINGTABLE9, TOF_RSC_TNI2_STEERINGTABLE10, TOF_RSC_TNI2_STEERINGTABLE11,
TOF_RSC_TNI3_STEERINGTABLE0, TOF_RSC_TNI3_STEERINGTABLE1, TOF_RSC_TNI3_STEERINGTABLE2,
TOF_RSC_TNI3_STEERINGTABLE3, TOF_RSC_TNI3_STEERINGTABLE4, TOF_RSC_TNI3_STEERINGTABLE5,
TOF_RSC_TNI3_STEERINGTABLE6, TOF_RSC_TNI3_STEERINGTABLE7, TOF_RSC_TNI3_STEERINGTABLE8,
TOF_RSC_TNI3_STEERINGTABLE9, TOF_RSC_TNI3_STEERINGTABLE10, TOF_RSC_TNI3_STEERINGTABLE11,
TOF_RSC_TNI4_STEERINGTABLE0, TOF_RSC_TNI4_STEERINGTABLE1, TOF_RSC_TNI4_STEERINGTABLE2,
TOF_RSC_TNI4_STEERINGTABLE3, TOF_RSC_TNI4_STEERINGTABLE4, TOF_RSC_TNI4_STEERINGTABLE5,
TOF_RSC_TNI4_STEERINGTABLE6, TOF_RSC_TNI4_STEERINGTABLE7, TOF_RSC_TNI4_STEERINGTABLE8,
TOF_RSC_TNI4_STEERINGTABLE9, TOF_RSC_TNI4_STEERINGTABLE10, TOF_RSC_TNI4_STEERINGTABLE11,
/* BUGFIX: TNI5 entries 0-2 were missing, shifting all later ids by 3 */
TOF_RSC_TNI5_STEERINGTABLE0, TOF_RSC_TNI5_STEERINGTABLE1, TOF_RSC_TNI5_STEERINGTABLE2,
TOF_RSC_TNI5_STEERINGTABLE3, TOF_RSC_TNI5_STEERINGTABLE4, TOF_RSC_TNI5_STEERINGTABLE5,
TOF_RSC_TNI5_STEERINGTABLE6, TOF_RSC_TNI5_STEERINGTABLE7, TOF_RSC_TNI5_STEERINGTABLE8,
TOF_RSC_TNI5_STEERINGTABLE9, TOF_RSC_TNI5_STEERINGTABLE10, TOF_RSC_TNI5_STEERINGTABLE11,
/* MBTABLE (300 - 371) */
TOF_RSC_TNI0_MBTABLE0, TOF_RSC_TNI0_MBTABLE1, TOF_RSC_TNI0_MBTABLE2,
TOF_RSC_TNI0_MBTABLE3, TOF_RSC_TNI0_MBTABLE4, TOF_RSC_TNI0_MBTABLE5,
TOF_RSC_TNI0_MBTABLE6, TOF_RSC_TNI0_MBTABLE7, TOF_RSC_TNI0_MBTABLE8,
TOF_RSC_TNI0_MBTABLE9, TOF_RSC_TNI0_MBTABLE10, TOF_RSC_TNI0_MBTABLE11,
TOF_RSC_TNI1_MBTABLE0, TOF_RSC_TNI1_MBTABLE1, TOF_RSC_TNI1_MBTABLE2,
TOF_RSC_TNI1_MBTABLE3, TOF_RSC_TNI1_MBTABLE4, TOF_RSC_TNI1_MBTABLE5,
TOF_RSC_TNI1_MBTABLE6, TOF_RSC_TNI1_MBTABLE7, TOF_RSC_TNI1_MBTABLE8,
TOF_RSC_TNI1_MBTABLE9, TOF_RSC_TNI1_MBTABLE10, TOF_RSC_TNI1_MBTABLE11,
TOF_RSC_TNI2_MBTABLE0, TOF_RSC_TNI2_MBTABLE1, TOF_RSC_TNI2_MBTABLE2,
TOF_RSC_TNI2_MBTABLE3, TOF_RSC_TNI2_MBTABLE4, TOF_RSC_TNI2_MBTABLE5,
TOF_RSC_TNI2_MBTABLE6, TOF_RSC_TNI2_MBTABLE7, TOF_RSC_TNI2_MBTABLE8,
TOF_RSC_TNI2_MBTABLE9, TOF_RSC_TNI2_MBTABLE10, TOF_RSC_TNI2_MBTABLE11,
TOF_RSC_TNI3_MBTABLE0, TOF_RSC_TNI3_MBTABLE1, TOF_RSC_TNI3_MBTABLE2,
TOF_RSC_TNI3_MBTABLE3, TOF_RSC_TNI3_MBTABLE4, TOF_RSC_TNI3_MBTABLE5,
TOF_RSC_TNI3_MBTABLE6, TOF_RSC_TNI3_MBTABLE7, TOF_RSC_TNI3_MBTABLE8,
TOF_RSC_TNI3_MBTABLE9, TOF_RSC_TNI3_MBTABLE10, TOF_RSC_TNI3_MBTABLE11,
TOF_RSC_TNI4_MBTABLE0, TOF_RSC_TNI4_MBTABLE1, TOF_RSC_TNI4_MBTABLE2,
TOF_RSC_TNI4_MBTABLE3, TOF_RSC_TNI4_MBTABLE4, TOF_RSC_TNI4_MBTABLE5,
TOF_RSC_TNI4_MBTABLE6, TOF_RSC_TNI4_MBTABLE7, TOF_RSC_TNI4_MBTABLE8,
TOF_RSC_TNI4_MBTABLE9, TOF_RSC_TNI4_MBTABLE10, TOF_RSC_TNI4_MBTABLE11,
TOF_RSC_TNI5_MBTABLE0, TOF_RSC_TNI5_MBTABLE1, TOF_RSC_TNI5_MBTABLE2,
TOF_RSC_TNI5_MBTABLE3, TOF_RSC_TNI5_MBTABLE4, TOF_RSC_TNI5_MBTABLE5,
TOF_RSC_TNI5_MBTABLE6, TOF_RSC_TNI5_MBTABLE7, TOF_RSC_TNI5_MBTABLE8,
TOF_RSC_TNI5_MBTABLE9, TOF_RSC_TNI5_MBTABLE10, TOF_RSC_TNI5_MBTABLE11,
TOF_RSC_NUM /* 372 */
};
/* Resource-id helpers; arguments are parenthesized for macro hygiene. */
#define TOF_RSC_TOQ(TNI, CQID) (TOF_RSC_TNI0_TOQ0 + ((TNI) * 12) + (CQID))
#define TOF_RSC_TCQ(TNI, CQID) (TOF_RSC_TNI0_TCQ0 + ((TNI) * 12) + (CQID))
#define TOF_RSC_MRQ(TNI, CQID) (TOF_RSC_TNI0_MRQ0 + ((TNI) * 12) + (CQID))
#define TOF_RSC_PBQ(TNI) (TOF_RSC_TNI0_PBQ + (TNI))
#define TOF_RSC_PRQ(TNI) (TOF_RSC_TNI0_PRQ + (TNI))
#define TOF_RSC_STT(TNI, CQID) (TOF_RSC_TNI0_STEERINGTABLE0 + ((TNI) * 12) + (CQID))
#define TOF_RSC_MBT(TNI, CQID) (TOF_RSC_TNI0_MBTABLE0 + ((TNI) * 12) + (CQID))
#endif
/* vim: set noet ts=8 sw=8 sts=0 tw=0 : */

View File

@ -0,0 +1,6 @@
/*
 * Barrier channel (BCH) state. This fragment is #included at byte
 * offset 88 of struct tof_utofu_bg (tof_utofu_bg_bch.h).
 */
struct {
bool enabled; /* channel currently enabled */
uint64_t bgmask[TOF_ICC_NTNIS]; /* per-TNI bitmask of barrier gates in the channel */
uintptr_t iova; /* NOTE(review): presumably the I/O virtual address of the BCH window -- confirm */
void *kaddr; /* NOTE(review): presumably the kernel mapping of the same memory -- confirm */
} bch;

View File

@ -0,0 +1,6 @@
/*
 * Per-CQ address-translation state. This fragment is #included at byte
 * offset 104 of struct tof_utofu_cq (tof_utofu_cq_trans.h).
 */
struct {
struct tof_utofu_trans_list *mru; /* most-recently-used translation list */
struct tof_trans_table *table; /* steering/MBPT descriptor pairs */
int mruhead; /* head index into the MRU list */
ihk_spinlock_t mru_lock; /* presumably serializes mru/mruhead updates -- confirm */
} trans;

View File

@ -0,0 +1,29 @@
/*
 * Layout overlay of the driver-side barrier-gate (BG) object: each
 * anonymous struct pins one member at a fixed byte offset inside the
 * 120-byte blob via a leading padding array, so members can be accessed
 * without including the original definition.
 * NOTE(review): offsets (0 lock, 8 regs, 24 irq, 88 sighandler,
 * 104 subnet, 112 gpid) must stay in sync with the Linux Tofu driver.
 */
struct tof_core_bg {
union {
char whole_struct[120]; /* total mirrored size */
struct {
char padding0[0];
spinlock_t lock; /* offset 0 */
};
struct {
char padding1[8];
/* register block at offset 8 */
#include "tof_core_bg_reg.h"
};
struct {
char padding2[24];
struct tof_core_irq irq; /* offset 24 */
};
struct {
char padding3[88];
tof_core_signal_handler sighandler; /* offset 88 */
};
struct {
char padding4[104];
uint64_t subnet; /* offset 104 */
};
struct {
char padding5[112];
uint32_t gpid; /* offset 112 */
};
};
};

View File

@ -0,0 +1,9 @@
/*
 * Layout overlay of the driver-side CQ object: 264 bytes total, with
 * the register block (tof_core_cq_reg.h) pinned at byte offset 56.
 * NOTE(review): must stay in sync with the Linux Tofu driver's layout.
 */
struct tof_core_cq {
union {
char whole_struct[264]; /* total mirrored size */
struct {
char padding0[56];
/* register block at offset 56 */
#include "tof_core_cq_reg.h"
};
};
};

View File

@ -0,0 +1,21 @@
/*
 * Layout overlay of the driver-side utofu BG object (160 bytes):
 * common device header at offset 0, tni at 80, bgid at 81, and the
 * BCH state (tof_utofu_bg_bch.h) at offset 88.
 * NOTE(review): offsets must stay in sync with the Linux Tofu driver.
 */
struct tof_utofu_bg {
union {
char whole_struct[160]; /* total mirrored size */
struct {
char padding0[0];
struct tof_utofu_device common; /* offset 0 */
};
struct {
char padding1[80];
uint8_t tni; /* offset 80: owning TNI index */
};
struct {
char padding2[81];
uint8_t bgid; /* offset 81: barrier gate id */
};
struct {
char padding3[88];
/* BCH state at offset 88 */
#include "tof_utofu_bg_bch.h"
};
};
};

View File

@ -0,0 +1,33 @@
/*
 * Layout overlay of the driver-side utofu CQ object (384 bytes):
 * common device header at offset 0, tni at 80, cqid at 81, translation
 * state (tof_utofu_cq_trans.h) at 104, steering table pointer at 128,
 * MB table pointer at 136, num_stag at 186.
 * NOTE(review): offsets must stay in sync with the Linux Tofu driver.
 */
struct tof_utofu_cq {
union {
char whole_struct[384]; /* total mirrored size */
struct {
char padding0[0];
struct tof_utofu_device common; /* offset 0 */
};
struct {
char padding1[80];
uint8_t tni; /* offset 80: owning TNI index */
};
struct {
char padding2[81];
uint8_t cqid; /* offset 81: command queue id */
};
struct {
char padding3[104];
/* translation state at offset 104 */
#include "tof_utofu_cq_trans.h"
};
struct {
char padding4[128];
struct tof_icc_steering_entry *steering; /* offset 128 */
};
struct {
char padding5[136];
struct tof_icc_mb_entry *mb; /* offset 136 */
};
struct {
char padding6[186];
uint8_t num_stag; /* offset 186: number of steering tags */
};
};
};

View File

@ -0,0 +1,17 @@
/*
 * Layout overlay of the common driver-side device header (80 bytes)
 * shared by the utofu CQ/BG objects: enabled at offset 0, gpid at 12,
 * subnet at 24.
 * NOTE(review): offsets must stay in sync with the Linux Tofu driver.
 */
struct tof_utofu_device {
union {
char whole_struct[80]; /* total mirrored size */
struct {
char padding0[0];
bool enabled; /* offset 0 */
};
struct {
char padding1[12];
uint32_t gpid; /* offset 12: global process id */
};
struct {
char padding2[24];
uint64_t subnet; /* offset 24 */
};
};
};

View File

@ -0,0 +1,33 @@
/*
 * Layout overlay of the driver-side MBPT (MB page table) object
 * (56 bytes): kref at offset 0, owning CQ pointer at 8, iova at 16,
 * scatterlist at 24, nsgents at 32, mbptstart at 40, pgsz at 48.
 * NOTE(review): offsets must stay in sync with the Linux Tofu driver.
 */
struct tof_utofu_mbpt {
union {
char whole_struct[56]; /* total mirrored size */
struct {
char padding0[0];
struct kref kref; /* offset 0: reference count */
};
struct {
char padding1[8];
struct tof_utofu_cq *ucq; /* offset 8: owning CQ */
};
struct {
char padding2[16];
uintptr_t iova; /* offset 16: I/O virtual address */
};
struct {
char padding3[24];
struct scatterlist *sg; /* offset 24 */
};
struct {
char padding4[32];
size_t nsgents; /* offset 32: entries in sg */
};
struct {
char padding5[40];
uintptr_t mbptstart; /* offset 40 */
};
struct {
char padding6[48];
size_t pgsz; /* offset 48: page size used by the table */
};
};
};

View File

@ -0,0 +1,15 @@
#ifndef __TOFU_STAG_RANGE_HEADER__
#define __TOFU_STAG_RANGE_HEADER__
/* Forward declaration only; the full definition lives in the Tofu driver code. */
struct tof_utofu_cq;
/*
 * One registered steering-tag address range [start, end) owned by a CQ;
 * linked into both a per-vm_range list and a per-process stag hash.
 */
struct tofu_stag_range {
uintptr_t start, end; /* virtual address range covered by the stag */
int stag; /* steering tag number */
struct tof_utofu_cq *ucq; /* CQ that owns this registration */
struct list_head list; // per-vm_range list
struct list_head hash; // per-process stag hash
};
#endif // __TOFU_STAG_RANGE_HEADER__

View File

@ -345,6 +345,9 @@ static void populate_sysfs(void)
int host_ikc_inited = 0;
extern int num_processors;
#ifdef ENABLE_TOFU
extern void tof_utofu_init_globals(void);
#endif
static void post_init(void)
{
@ -370,6 +373,9 @@ static void post_init(void)
sysfs_init();
populate_sysfs();
#ifdef ENABLE_TOFU
tof_utofu_init_globals();
#endif
}
#ifdef DCFA_RUN
extern void user_main();

View File

@ -44,6 +44,7 @@
#include <limits.h>
#include <sysfs.h>
#include <ihk/debug.h>
#include <llist.h>
#include <bootparam.h>
//#define DEBUG_PRINT_MEM
@ -741,6 +742,15 @@ distance_based:
memory_nodes[node].nodes_by_distance[i].id);
break;
}
else {
if (i == 0)
kprintf("%s: distance: CPU @ node %d failed to allocate "
"%d pages from node %d\n",
__FUNCTION__,
ihk_mc_get_numa_id(),
npages,
memory_nodes[node].nodes_by_distance[i].id);
}
}
if (pa) break;
@ -792,6 +802,27 @@ order_based:
return NULL;
}
/*
* Get NUMA node structure offsetted by index in the order of distance
*/
struct ihk_mc_numa_node *ihk_mc_get_numa_node_by_distance(int i)
{
int numa_id;
if (!cpu_local_var_initialized)
return NULL;
if (i < 0 || i > ihk_mc_get_nr_numa_nodes()) {
return NULL;
}
numa_id = ihk_mc_get_numa_id();
if (!memory_nodes[numa_id].nodes_by_distance)
return NULL;
return &memory_nodes[memory_nodes[numa_id].nodes_by_distance[i].id];
}
static void __mckernel_free_pages_in_allocator(void *va, int npages,
int is_user)
{
@ -1292,6 +1323,7 @@ static void unhandled_page_fault(struct thread *thread, void *fault_addr,
ihk_mc_debug_show_interrupt_context(regs);
if (!(reason & PF_USER)) {
cpu_local_var(kernel_mode_pf_regs) = regs;
panic("panic: kernel mode PF");
}
@ -1312,6 +1344,9 @@ static void unhandled_page_fault(struct thread *thread, void *fault_addr,
static void page_fault_handler(void *fault_addr, uint64_t reason, void *regs)
{
struct thread *thread = cpu_local_var(current);
#ifdef ENABLE_TOFU
unsigned long addr = (unsigned long)fault_addr;
#endif
int error;
#ifdef PROFILE_ENABLE
uint64_t t_s = 0;
@ -1328,6 +1363,49 @@ static void page_fault_handler(void *fault_addr, uint64_t reason, void *regs)
cpu_enable_interrupt();
#ifdef ENABLE_TOFU
if (!(reason & PF_USER) &&
(addr > 0xffff000000000000 &&
addr < 0xffff800000000000)) {
int error;
int ihk_mc_linux_pt_virt_to_phys_size(struct page_table *pt,
const void *virt,
unsigned long *phys,
unsigned long *size);
unsigned long phys, size;
enum ihk_mc_pt_attribute attr = PTATTR_WRITABLE | PTATTR_ACTIVE;
if (ihk_mc_linux_pt_virt_to_phys_size(ihk_mc_get_linux_kernel_pgt(),
fault_addr, &phys, &size) < 0) {
kprintf("%s: failed to resolve 0x%lx from Linux PT..\n",
__func__, addr);
goto out_linux;
}
retry_linux:
if ((error = ihk_mc_pt_set_page(NULL, fault_addr, phys, attr)) < 0) {
if (error == -EBUSY) {
kprintf("%s: WARNING: updating 0x%lx -> 0x%lx"
" to reflect Linux kernel mapping..\n",
__func__, addr, phys);
ihk_mc_clear_kernel_range(fault_addr, fault_addr + PAGE_SIZE);
goto retry_linux;
}
else {
kprintf("%s: failed to set up 0x%lx -> 0x%lx Linux kernel mapping..\n",
__func__, addr, phys);
goto out_linux;
}
}
dkprintf("%s: Linux kernel mapping 0x%lx -> 0x%lx set\n",
__func__, addr, phys);
goto out_ok;
}
out_linux:
#endif
if ((uintptr_t)fault_addr < PAGE_SIZE || !thread) {
error = -EINVAL;
} else {
@ -1373,6 +1451,9 @@ static void page_fault_handler(void *fault_addr, uint64_t reason, void *regs)
goto out;
}
#ifdef ENABLE_TOFU
out_ok:
#endif
error = 0;
preempt_enable();
out:
@ -1465,7 +1546,11 @@ static void numa_init(void)
INIT_LIST_HEAD(&memory_nodes[i].allocators);
memory_nodes[i].nodes_by_distance = 0;
#ifdef IHK_RBTREE_ALLOCATOR
ihk_atomic_set(&memory_nodes[i].zeroing_workers, 0);
ihk_atomic_set(&memory_nodes[i].nr_to_zero_pages, 0);
memory_nodes[i].free_chunks.rb_node = 0;
init_llist_head(&memory_nodes[i].zeroed_list);
init_llist_head(&memory_nodes[i].to_zero_list);
mcs_lock_init(&memory_nodes[i].lock);
memory_nodes[i].min_addr = 0xFFFFFFFFFFFFFFFF;
memory_nodes[i].max_addr = 0;
@ -2712,3 +2797,25 @@ int ihk_mc_get_mem_user_page(void *arg0, page_table_t pt, pte_t *ptep, void *pga
return 0;
}
pte_t *ihk_mc_pt_lookup_fault_pte(struct process_vm *vm, void *virt,
int pgshift, void **basep, size_t *sizep, int *p2alignp)
{
int faulted = 0;
pte_t *ptep;
retry:
ptep = ihk_mc_pt_lookup_pte(vm->address_space->page_table,
virt, pgshift, basep, sizep, p2alignp);
if (!faulted && (!ptep || !pte_is_present(ptep))) {
page_fault_process_vm(vm, virt, PF_POPULATE | PF_USER);
faulted = 1;
goto retry;
}
if (faulted && ptep && pte_is_present(ptep)) {
kprintf("%s: successfully faulted 0x%lx\n", __FUNCTION__, virt);
}
return ptep;
}

View File

@ -945,6 +945,11 @@ int split_process_memory_range(struct process_vm *vm, struct vm_range *range,
}
newrange->start = addr;
newrange->straight_start = 0;
if (range->straight_start) {
newrange->straight_start =
range->straight_start + (addr - range->start);
}
newrange->end = range->end;
newrange->flag = range->flag;
newrange->pgshift = range->pgshift;
@ -1045,6 +1050,11 @@ static int free_process_memory_range(struct process_vm *vm,
start = range->start;
end = range->end;
/* No regular page table manipulation for straight mappings */
if (range->straight_start || ((void *)start == vm->proc->straight_va))
goto straight_out;
if (!(range->flag & (VR_REMOTE | VR_IO_NOCACHE | VR_RESERVED))) {
neighbor = previous_process_memory_range(vm, range);
pgsize = -1;
@ -1126,11 +1136,39 @@ static int free_process_memory_range(struct process_vm *vm,
memobj_unref(range->memobj);
}
straight_out:
rb_erase(&range->vm_rb_node, &vm->vm_range_tree);
for (i = 0; i < VM_RANGE_CACHE_SIZE; ++i) {
if (vm->range_cache[i] == range)
vm->range_cache[i] = NULL;
}
/* For straight ranges just free physical memory */
if (range->straight_start) {
ihk_mc_free_pages(phys_to_virt(vm->proc->straight_pa +
(range->straight_start - (unsigned long)vm->proc->straight_va)),
(range->end - range->start) >> PAGE_SHIFT);
dkprintf("%s: straight range 0x%lx @ straight 0x%lx (phys: 0x%lx)"
" physical memory freed\n",
__FUNCTION__, range->start, range->straight_start,
vm->proc->straight_pa +
(range->straight_start - (unsigned long)vm->proc->straight_va));
}
/* For the main straight mapping, free page tables */
else if (range->start == (unsigned long)vm->proc->straight_va &&
range->end == ((unsigned long)vm->proc->straight_va +
vm->proc->straight_len)) {
ihk_mc_spinlock_lock_noirq(&vm->page_table_lock);
error = ihk_mc_pt_clear_range(vm->address_space->page_table, vm,
(void *)start, (void *)end);
ihk_mc_spinlock_unlock_noirq(&vm->page_table_lock);
dkprintf("%s: main straight mapping 0x%lx unmapped\n",
__FUNCTION__, vm->proc->straight_va);
vm->proc->straight_len = 0;
}
kfree(range);
dkprintf("free_process_memory_range(%p,%lx-%lx): 0\n",
@ -1148,6 +1186,50 @@ int remove_process_memory_range(struct process_vm *vm,
dkprintf("remove_process_memory_range(%p,%lx,%lx)\n",
vm, start, end);
/*
* Convert to real virtual address for straight ranges,
* but not for the main straight mapping
*/
if (vm->proc->straight_va &&
start >= (unsigned long)vm->proc->straight_va &&
end <= ((unsigned long)vm->proc->straight_va +
vm->proc->straight_len) &&
!(start == (unsigned long)vm->proc->straight_va &&
end == ((unsigned long)vm->proc->straight_va +
vm->proc->straight_len))) {
struct vm_range *range_iter;
struct vm_range *range = NULL;
unsigned long len = end - start;
range_iter = lookup_process_memory_range(vm, 0, -1);
while (range_iter) {
if (range_iter->straight_start &&
start >= range_iter->straight_start &&
start < (range_iter->straight_start +
(range_iter->end - range_iter->start))) {
range = range_iter;
break;
}
range_iter = next_process_memory_range(vm, range_iter);
}
if (!range) {
kprintf("%s: WARNING: no straight mapping range found for 0x%lx\n",
__FUNCTION__, start);
return 0;
}
dkprintf("%s: straight range converted from 0x%lx:%lu -> 0x%lx:%lu\n",
__FUNCTION__,
start, len,
range->start + (start - range->straight_start), len);
start = range->start + (start - range->straight_start);
end = start + len;
}
next = lookup_process_memory_range(vm, start, end);
while ((range = next) && range->start < end) {
next = next_process_memory_range(vm, range);
@ -1345,6 +1427,7 @@ int add_process_memory_range(struct process_vm *vm,
range->objoff = offset;
range->pgshift = pgshift;
range->private_data = NULL;
range->straight_start = 0;
rc = 0;
if (phys == NOPHYS) {
@ -1385,17 +1468,14 @@ int add_process_memory_range(struct process_vm *vm,
/* Clear content! */
if (phys != NOPHYS && !(flag & (VR_REMOTE | VR_DEMAND_PAGING))
&& ((flag & VR_PROT_MASK) != VR_PROT_NONE)) {
#if 1
memset((void *)phys_to_virt(phys), 0, end - start);
if (!zero_at_free) {
#ifdef ARCH_MEMCLEAR
memclear((void *)phys_to_virt(phys), end - start);
#else
if (end - start < (1024*1024)) {
memset((void*)phys_to_virt(phys), 0, end - start);
}
else {
memset_smp(&cpu_local_var(current)->cpu_set,
(void *)phys_to_virt(phys), 0, end - start);
}
memset((void *)phys_to_virt(phys), 0, end - start);
#endif
}
}
/* Return range object if requested */
@ -1935,6 +2015,14 @@ static int page_fault_process_memory_range(struct process_vm *vm, struct vm_rang
int private_range, patching_to_rdonly;
int devfile_or_hugetlbfs_or_premap, regfile_or_shm;
if (cpu_local_var(current)->profile) {
dkprintf("%s: 0x%lx @ %s\n",
__func__, fault_addr,
range->memobj && range->memobj->path ?
range->memobj->path :
range->private_data ? "XPMEM" : "<unknown>");
}
dkprintf("page_fault_process_memory_range(%p,%lx-%lx %lx,%lx,%lx)\n", vm, range->start, range->end, range->flag, fault_addr, reason);
ihk_mc_spinlock_lock_noirq(&vm->page_table_lock);
/*****/
@ -2774,6 +2862,11 @@ release_process(struct process *proc)
/* no process left */
mcs_rwlock_reader_lock(&rset->pid1->children_lock, &lock);
if (list_empty(&rset->pid1->children_list)) {
#ifdef ENABLE_TOFU
extern void tof_utofu_finalize(void);
tof_utofu_finalize();
#endif
hugefileobj_cleanup();
}
mcs_rwlock_reader_unlock(&rset->pid1->children_lock, &lock);
@ -3122,6 +3215,7 @@ static void idle(void)
v->status == CPU_STATUS_RESERVED) {
/* No work to do? Consolidate the kmalloc free list */
kmalloc_consolidate_free_list();
ihk_numa_zero_free_pages(ihk_mc_get_numa_node_by_distance(0));
monitor->status = IHK_OS_MONITOR_IDLE;
cpu_local_var(current)->status = PS_INTERRUPTIBLE;
cpu_safe_halt();
@ -3477,6 +3571,7 @@ void spin_sleep_or_schedule(void)
break;
}
ihk_numa_zero_free_pages(ihk_mc_get_numa_node_by_distance(0));
cpu_pause();
}

View File

@ -63,10 +63,21 @@ char *profile_event_names[] =
"remote_page_fault",
"mpol_alloc_missed",
"mmap_anon_contig_phys",
"|-------mmap_straight",
"|---mmap_not_straight",
"mmap_anon_no_contig_phys",
"mmap_regular_file",
"mmap_device_file",
""
"tofu_stag_alloc ",
"|--new_steering ",
" |-alloc_mbpt ",
" |-update_mbpt",
"tofu_stag_free_stags",
"tofu_stag_free_stag",
" |--------pre",
" |----cqflush",
" |----dealloc",
" |---free_pages",
};
mcs_lock_node_t job_profile_lock = { 0 };
@ -471,7 +482,7 @@ int do_profile(int flag)
if (flag & PROF_ON) {
if (!thread->profile) {
thread->profile = 1;
thread->profile_start_ts = 0;
thread->profile_start_ts = now_ts;
}
}
else if (flag & PROF_OFF) {

View File

@ -432,6 +432,43 @@ struct rb_node *rb_first(const struct rb_root *root)
}
EXPORT_SYMBOL(rb_first);
/*
 * Pre-order depth first search of the subtree rooted at @n.
 * Returns the first node for which @__cond evaluates to true,
 * or NULL if no node in the subtree matches.
 */
static struct rb_node *__rb_preorder_dfs(struct rb_node *n,
	bool (*__cond)(struct rb_node *, void *arg), void *__cond_arg)
{
	/* Pre-order: test the node itself first. */
	if (__cond(n, __cond_arg))
		return n;

	/* Then descend into the left subtree... */
	if (n->rb_left) {
		struct rb_node *match;

		match = __rb_preorder_dfs(n->rb_left, __cond, __cond_arg);
		if (match)
			return match;
	}

	/* ...and finally into the right subtree. */
	return n->rb_right ?
		__rb_preorder_dfs(n->rb_right, __cond, __cond_arg) : NULL;
}
/*
 * Pre-order DFS over the whole tree under @root.
 * Returns the first node satisfying @__cond, or NULL for an empty tree.
 */
struct rb_node *rb_preorder_dfs_search(const struct rb_root *root,
	bool (*__cond)(struct rb_node *, void *arg), void *__cond_arg)
{
	return root->rb_node ?
		__rb_preorder_dfs(root->rb_node, __cond, __cond_arg) : NULL;
}
struct rb_node *rb_first_safe(const struct rb_root *root)
{
struct rb_node *n;

View File

@ -220,6 +220,9 @@ long do_syscall(struct syscall_request *req, int cpu)
req->ttid = 0;
}
res.req_thread_status = IHK_SCD_REQ_THREAD_SPINNING;
#ifdef ENABLE_TOFU
res.pde_data = NULL;
#endif
send_syscall(req, cpu, &res);
if (req->rtid == -1) {
@ -266,6 +269,7 @@ long do_syscall(struct syscall_request *req, int cpu)
cpu_restore_interrupt(runq_irqstate);
if (!do_schedule) {
ihk_numa_zero_free_pages(ihk_mc_get_numa_node_by_distance(0));
continue;
}
@ -379,6 +383,35 @@ long do_syscall(struct syscall_request *req, int cpu)
rc = res.ret;
#ifdef ENABLE_TOFU
if ((req->number == __NR_ioctl && rc == 0) ||
(req->number == __NR_openat && rc > 0)) {
int fd = req->number == __NR_ioctl ? req->args[0] : rc;
char *path = req->number == __NR_ioctl ?
thread->proc->fd_path[fd] : thread->fd_path_in_open;
if (cpu_local_var(current)->proc->enable_tofu &&
res.pde_data &&
fd < MAX_FD_PDE &&
!thread->proc->fd_pde_data[fd] &&
!strncmp(path, "/proc/tofu/dev/", 15)) {
unsigned long irqstate;
irqstate = ihk_mc_spinlock_lock(&thread->proc->mckfd_lock);
thread->proc->fd_pde_data[fd] = res.pde_data;
ihk_mc_spinlock_unlock(&thread->proc->mckfd_lock, irqstate);
dkprintf("%s: PID: %d, ioctl fd: %d, filename: "
"%s, pde_data: 0x%lx\n",
__FUNCTION__,
thread->proc->pid,
fd,
path,
res.pde_data);
}
}
#endif
if(req->number != __NR_exit_group){
--thread->in_syscall_offload;
}
@ -1264,6 +1297,30 @@ void terminate(int rc, int sig)
mcs_rwlock_writer_unlock(&proc->threads_lock, &lock);
mcs_rwlock_writer_unlock_noirq(&proc->update_lock, &updatelock);
#ifdef ENABLE_TOFU
/* Tofu: cleanup, must be done before mcexec is gone */
if (proc->enable_tofu) {
int fd;
for (fd = 0; fd < MAX_FD_PDE; ++fd) {
/* Tofu? */
if (proc->enable_tofu && proc->fd_pde_data[fd]) {
extern void tof_utofu_release_fd(struct process *proc, int fd);
dkprintf("%s: -> tof_utofu_release_fd() @ fd: %d (%s)\n",
__func__, fd, proc->fd_path[fd]);
tof_utofu_release_fd(proc, fd);
proc->fd_pde_data[fd] = NULL;
}
if (proc->fd_path[fd]) {
kfree(proc->fd_path[fd]);
proc->fd_path[fd] = NULL;
}
}
}
#endif
terminate_mcexec(rc, sig);
mcs_rwlock_writer_lock(&proc->threads_lock, &lock);
@ -1418,7 +1475,6 @@ void terminate(int rc, int sig)
#endif
// clean up memory
finalize_process(proc);
preempt_disable();
@ -1433,6 +1489,71 @@ void terminate(int rc, int sig)
panic("panic");
}
/*
 * Release per-fd Tofu state (PDE data and cached path string) of @fd
 * in @proc. A no-op unless ENABLE_TOFU is built and the process has
 * Tofu enabled. Always returns 0.
 */
int __process_cleanup_fd(struct process *proc, int fd)
{
#ifdef ENABLE_TOFU
	extern void tof_utofu_release_fd(struct process *proc, int fd);

	/* Only Tofu-enabled processes carry per-fd PDE state. */
	if (!proc->enable_tofu)
		return 0;

	dkprintf("%s: -> tof_utofu_release_fd() @ fd: %d (%s)\n",
		__func__, fd, proc->fd_path[fd]);
	tof_utofu_release_fd(proc, fd);
	proc->fd_pde_data[fd] = NULL;

	if (proc->fd_path[fd]) {
		kfree(proc->fd_path[fd]);
		proc->fd_path[fd] = NULL;
	}
#endif
	return 0;
}
/*
 * Look up @pid and release any per-fd Tofu state for @fd.
 * A missing PID is not an error (the process may already be gone).
 * Always returns 0.
 */
int process_cleanup_fd(int pid, int fd)
{
	struct mcs_rwlock_node_irqsave lock;
	struct process *proc = find_process(pid, &lock);

	if (!proc) {
		/* This is normal behavior */
		dkprintf("%s: PID %d couldn't be found\n", __func__, pid);
		return 0;
	}

	__process_cleanup_fd(proc, fd);
	process_unlock(proc, &lock);

	return 0;
}
/*
 * Release per-fd Tofu PDE resources of @pid before termination.
 * Iterates fds 2..MAX_FD_PDE-1 (0 and 1 are skipped). A missing PID
 * is not an error. Always returns 0.
 */
int process_cleanup_before_terminate(int pid)
{
	struct mcs_rwlock_node_irqsave lock;
	struct process *proc;

	proc = find_process(pid, &lock);
	if (!proc) {
		/* This is normal behavior */
		return 0;
	}

#ifdef ENABLE_TOFU
	{
		/* Clean up PDE file descriptors */
		int fd;

		for (fd = 2; fd < MAX_FD_PDE; ++fd)
			__process_cleanup_fd(proc, fd);
	}
#endif

	process_unlock(proc, &lock);
	return 0;
}
void
terminate_host(int pid, struct thread *thread)
{
@ -1581,10 +1702,20 @@ int do_munmap(void *addr, size_t len, int holding_memory_range_lock)
{
int error;
int ro_freed;
struct thread *thread = cpu_local_var(current);
begin_free_pages_pending();
error = remove_process_memory_range(cpu_local_var(current)->vm,
(intptr_t)addr, (intptr_t)addr+len, &ro_freed);
/* No host involvement for straight mapping ranges */
if (thread->proc->straight_va &&
addr >= thread->proc->straight_va &&
(addr + len) <=
(thread->proc->straight_va + thread->proc->straight_len)) {
goto out;
}
if (error || !ro_freed) {
clear_host_pte((uintptr_t)addr, len, holding_memory_range_lock);
}
@ -1595,6 +1726,8 @@ int do_munmap(void *addr, size_t len, int holding_memory_range_lock)
/* through */
}
}
out:
finish_free_pages_pending();
dkprintf("%s: 0x%lx:%lu, error: %ld\n",
@ -1666,6 +1799,7 @@ do_mmap(const uintptr_t addr0, const size_t len0, const int prot,
void *p = NULL;
int vrflags;
uintptr_t phys;
intptr_t straight_phys;
struct memobj *memobj = NULL;
int maxprot;
int denied;
@ -1707,6 +1841,124 @@ do_mmap(const uintptr_t addr0, const size_t len0, const int prot,
flush_nfo_tlb();
/* Initialize straight large memory mapping */
if (proc->straight_map && !proc->straight_va) {
unsigned long straight_pa_start = 0xFFFFFFFFFFFFFFFF;
unsigned long straight_pa_end = 0;
int i;
int p2align = PAGE_P2ALIGN;
size_t psize = PAGE_SIZE;
unsigned long vrflags;
enum ihk_mc_pt_attribute ptattr;
struct vm_range *range;
vrflags = PROT_TO_VR_FLAG(PROT_READ | PROT_WRITE);
vrflags |= VRFLAG_PROT_TO_MAXPROT(vrflags);
vrflags |= VR_DEMAND_PAGING;
for (i = 0; i < ihk_mc_get_nr_memory_chunks(); ++i) {
unsigned long start, end;
ihk_mc_get_memory_chunk(i, &start, &end, NULL);
if (straight_pa_start > start) {
straight_pa_start = start;
}
if (straight_pa_end < end) {
straight_pa_end = end;
}
}
kprintf("%s: straight_pa_start: 0x%lx, straight_pa_end: 0x%lx\n",
__FUNCTION__, straight_pa_start, straight_pa_end);
error = arch_get_smaller_page_size(NULL,
straight_pa_end - straight_pa_start,
&psize, &p2align);
if (error) {
kprintf("%s: arch_get_smaller_page_size failed: %d\n",
__FUNCTION__, error);
goto straight_out;
}
//psize = PTL2_SIZE;
//p2align = PTL2_SHIFT - PTL1_SHIFT;
// Force 512G page
//psize = (1UL << 39);
//p2align = 39 - PAGE_SHIFT;
// Force 512MB page
psize = (1UL << 29);
p2align = 29 - PAGE_SHIFT;
kprintf("%s: using page shift: %d, psize: %lu\n",
__FUNCTION__, p2align + PAGE_SHIFT, psize);
straight_pa_start &= ~(psize - 1);
straight_pa_end = (straight_pa_end + psize - 1) & ~(psize - 1);
kprintf("%s: aligned straight_pa_start: 0x%lx, straight_pa_end: 0x%lx\n",
__FUNCTION__, straight_pa_start, straight_pa_end);
proc->straight_len = straight_pa_end - straight_pa_start;
error = search_free_space(proc->straight_len,
PAGE_SHIFT + p2align, (uintptr_t *)&proc->straight_va);
if (error) {
kprintf("%s: search_free_space() failed: %d\n",
__FUNCTION__, error);
proc->straight_va = 0;
goto straight_out;
}
dkprintf("%s: straight_va: 0x%lx to be used\n",
__FUNCTION__, proc->straight_va);
if (add_process_memory_range(proc->vm, (unsigned long)proc->straight_va,
(unsigned long)proc->straight_va + proc->straight_len,
NOPHYS, vrflags, NULL, 0,
PAGE_SHIFT + p2align, &range) != 0) {
kprintf("%s: error: adding straight memory range \n",
__FUNCTION__);
proc->straight_va = 0;
goto straight_out;
}
kprintf("%s: straight_va: 0x%lx, range->pgshift: %d, range OK\n",
__FUNCTION__, proc->straight_va, range->pgshift);
ptattr = arch_vrflag_to_ptattr(range->flag, PF_POPULATE, NULL);
error = ihk_mc_pt_set_range(proc->vm->address_space->page_table,
proc->vm,
(void *)range->start,
(void *)range->end,
straight_pa_start, ptattr,
range->pgshift,
range, 0);
if (error) {
kprintf("%s: ihk_mc_pt_set_range() failed: %d\n",
__FUNCTION__, error);
proc->straight_va = 0;
goto straight_out;
}
//ihk_mc_pt_print_pte(proc->vm->address_space->page_table, range->start);
region->map_end = (unsigned long)proc->straight_va + proc->straight_len;
proc->straight_pa = straight_pa_start;
kprintf("%s: straight mapping: 0x%lx:%lu @ 0x%lx, "
"psize: %lu, straight_map_threshold: %lu\n",
__FUNCTION__,
proc->straight_va,
proc->straight_len,
proc->straight_pa,
psize,
proc->straight_map_threshold);
}
straight_out:
if (flags & MAP_HUGETLB) {
pgshift = (flags >> MAP_HUGE_SHIFT) & 0x3F;
if (!pgshift) {
@ -1734,6 +1986,15 @@ do_mmap(const uintptr_t addr0, const size_t len0, const int prot,
ihk_rwspinlock_write_lock_noirq(&thread->vm->memory_range_lock);
if ((flags & MAP_FIXED) && proc->straight_va &&
((void *)addr >= proc->straight_va) &&
((void *)addr + len) <= (proc->straight_va + proc->straight_len)) {
kprintf("%s: can't map MAP_FIXED into straight mapping\n",
__FUNCTION__);
error = -EINVAL;
goto out;
}
if (flags & MAP_FIXED) {
/* clear specified address range */
error = do_munmap((void *)addr, len, 1/* holding memory_range_lock */);
@ -1766,6 +2027,10 @@ do_mmap(const uintptr_t addr0, const size_t len0, const int prot,
}
if (flags & (MAP_POPULATE | MAP_LOCKED)) {
dkprintf("%s: 0x%lx:%lu %s%s|\n",
__func__, addr, len,
flags & MAP_POPULATE ? "|MAP_POPULATE" : "",
flags & MAP_LOCKED ? "|MAP_LOCKED" : "");
populated_mapping = 1;
}
@ -1790,6 +2055,7 @@ do_mmap(const uintptr_t addr0, const size_t len0, const int prot,
}
phys = 0;
straight_phys = 0;
off = 0;
maxprot = PROT_READ | PROT_WRITE | PROT_EXEC;
if (!(flags & MAP_ANONYMOUS)) {
@ -1977,6 +2243,31 @@ do_mmap(const uintptr_t addr0, const size_t len0, const int prot,
}
vrflags |= VRFLAG_PROT_TO_MAXPROT(PROT_TO_VR_FLAG(maxprot));
/*
* Large anonymous non-fix allocations are in straight mapping,
* pretend demand paging to avoid filling in PTEs
*/
if ((flags & MAP_ANONYMOUS) && proc->straight_map &&
!(flags & MAP_FIXED) && phys) {
if (len >= proc->straight_map_threshold) {
dkprintf("%s: range 0x%lx:%lu will be straight, addding VR_DEMAND\n",
__FUNCTION__, addr, len);
vrflags |= VR_DEMAND_PAGING;
straight_phys = phys;
phys = 0;
#ifdef PROFILE_ENABLE
profile_event_add(PROFILE_mmap_anon_straight, len);
#endif // PROFILE_ENABLE
}
else {
#ifdef PROFILE_ENABLE
if (cpu_local_var(current)->profile)
kprintf("%s: contiguous but not straight? len: %lu\n", __func__, len);
profile_event_add(PROFILE_mmap_anon_not_straight, len);
#endif // PROFILE_ENABLE
}
}
error = add_process_memory_range(thread->vm, addr, addr+len, phys,
vrflags, memobj, off, pgshift, &range);
if (error) {
@ -1987,6 +2278,19 @@ do_mmap(const uintptr_t addr0, const size_t len0, const int prot,
goto out;
}
/* Update straight mapping start address */
if (straight_phys) {
extern int zero_at_free;
range->straight_start =
(unsigned long)proc->straight_va +
(straight_phys - proc->straight_pa);
dkprintf("%s: range 0x%lx:%lu is straight starting at 0x%lx\n",
__FUNCTION__, addr, len, range->straight_start);
if (!zero_at_free) {
memset((void *)phys_to_virt(straight_phys), 0, len);
}
}
/* Determine pre-populated size */
populate_len = memobj ? min(len, memobj->size) : len;
@ -2041,12 +2345,13 @@ do_mmap(const uintptr_t addr0, const size_t len0, const int prot,
ro_vma_mapped = 0;
out:
if (ro_vma_mapped) {
if (ro_vma_mapped && !range->straight_start) {
(void)set_host_vma(addr, len, PROT_READ | PROT_WRITE | PROT_EXEC, 1/* holding memory_range_lock */);
}
ihk_rwspinlock_write_unlock_noirq(&thread->vm->memory_range_lock);
if (!error && populated_mapping && !((vrflags & VR_PROT_MASK) == VR_PROT_NONE)) {
if (!error && populated_mapping &&
!((vrflags & VR_PROT_MASK) == VR_PROT_NONE) && !range->straight_start) {
error = populate_process_memory(thread->vm,
(void *)addr, populate_len);
@ -2086,7 +2391,9 @@ out:
addr, len, addr0, len0, prot, flags,
fd, off0, error, addr);
return (!error)? addr: error;
return !error ?
(range->straight_start ? range->straight_start : addr) :
error;
}
SYSCALL_DECLARE(munmap)
@ -2166,6 +2473,16 @@ SYSCALL_DECLARE(mprotect)
return 0;
}
if (thread->proc->straight_va &&
((void *)start >= thread->proc->straight_va) &&
(void *)end <= (thread->proc->straight_va +
thread->proc->straight_len)) {
kprintf("%s: ignored for straight mapping 0x%lx\n",
__FUNCTION__, start);
error = 0;
goto out_straight;
}
flush_nfo_tlb();
ihk_rwspinlock_write_lock_noirq(&thread->vm->memory_range_lock);
@ -2259,6 +2576,8 @@ out:
}
}
ihk_rwspinlock_write_unlock_noirq(&thread->vm->memory_range_lock);
out_straight:
dkprintf("[%d]sys_mprotect(%lx,%lx,%x): %d\n",
ihk_mc_get_processor_id(), start, len0, prot, error);
return error;
@ -3615,6 +3934,23 @@ SYSCALL_DECLARE(ioctl)
break;
ihk_mc_spinlock_unlock(&proc->mckfd_lock, irqstate);
#ifdef ENABLE_TOFU
/* Tofu? */
if (proc->enable_tofu &&
fd < MAX_FD_PDE && thread->proc->fd_pde_data[fd]) {
extern long tof_utofu_unlocked_ioctl(int fd,
unsigned int cmd, unsigned long arg);
rc = tof_utofu_unlocked_ioctl(fd,
ihk_mc_syscall_arg1(ctx),
ihk_mc_syscall_arg2(ctx));
/* Do we need to offload? */
if (rc != -ENOTSUPP)
return rc;
}
#endif
if(fdp && fdp->ioctl_cb){
//kprintf("ioctl: found system fd %d\n", fd);
rc = fdp->ioctl_cb(fdp, ctx);
@ -3622,6 +3958,7 @@ SYSCALL_DECLARE(ioctl)
else{
rc = syscall_generic_forwarding(__NR_ioctl, ctx);
}
return rc;
}
@ -3649,6 +3986,10 @@ SYSCALL_DECLARE(open)
goto out;
}
#ifdef ENABLE_TOFU
cpu_local_var(current)->fd_path_in_open = pathname;
#endif
dkprintf("open(): pathname=%s\n", pathname);
if (!strncmp(pathname, XPMEM_DEV_PATH, len)) {
rc = xpmem_open(pathname, flags, ctx);
@ -3656,8 +3997,21 @@ SYSCALL_DECLARE(open)
rc = syscall_generic_forwarding(__NR_open, ctx);
}
#ifdef ENABLE_TOFU
cpu_local_var(current)->fd_path_in_open = NULL;
#endif
out:
#ifdef ENABLE_TOFU
if (rc > 0 && rc < MAX_FD_PDE) {
cpu_local_var(current)->proc->fd_path[rc] = pathname;
}
else {
kfree(pathname);
}
#else
kfree(pathname);
#endif
return rc;
}
@ -3685,6 +4039,10 @@ SYSCALL_DECLARE(openat)
goto out;
}
#ifdef ENABLE_TOFU
cpu_local_var(current)->fd_path_in_open = pathname;
#endif
dkprintf("openat(): pathname=%s\n", pathname);
if (!strncmp(pathname, XPMEM_DEV_PATH, len)) {
rc = xpmem_openat(pathname, flags, ctx);
@ -3692,8 +4050,21 @@ SYSCALL_DECLARE(openat)
rc = syscall_generic_forwarding(__NR_openat, ctx);
}
#ifdef ENABLE_TOFU
cpu_local_var(current)->fd_path_in_open = NULL;
#endif
out:
#ifdef ENABLE_TOFU
if (rc > 0 && rc < MAX_FD_PDE) {
cpu_local_var(current)->proc->fd_path[rc] = pathname;
}
else {
kfree(pathname);
}
#else
kfree(pathname);
#endif
return rc;
}
@ -3741,6 +4112,29 @@ SYSCALL_DECLARE(close)
long irqstate;
irqstate = ihk_mc_spinlock_lock(&proc->mckfd_lock);
#ifdef ENABLE_TOFU
/* Clear path and PDE data */
if (thread->proc->enable_tofu &&
fd >= 0 && fd < MAX_FD_PDE) {
/* Tofu? */
if (thread->proc->fd_pde_data[fd]) {
extern void tof_utofu_release_fd(struct process *proc, int fd);
dkprintf("%s: -> tof_utofu_release_fd() @ fd: %d (%s)\n",
__func__, fd, thread->proc->fd_path[fd]);
tof_utofu_release_fd(thread->proc, fd);
thread->proc->fd_pde_data[fd] = NULL;
}
if (thread->proc->fd_path[fd]) {
dkprintf("%s: %d -> %s\n", __func__, fd, thread->proc->fd_path[fd]);
kfree(thread->proc->fd_path[fd]);
thread->proc->fd_path[fd] = NULL;
}
}
#endif
for(fdp = proc->mckfd, fdq = NULL; fdp; fdq = fdp, fdp = fdp->next)
if(fdp->fd == fd)
break;
@ -8729,6 +9123,15 @@ SYSCALL_DECLARE(mremap)
dkprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx)\n",
oldaddr, oldsize0, newsize0, flags, newaddr);
if (vm->proc->straight_va &&
(void *)oldaddr >= vm->proc->straight_va &&
(void *)oldaddr < vm->proc->straight_va + vm->proc->straight_len) {
kprintf("%s: reject for straight range 0x%lx\n",
__FUNCTION__, oldaddr);
return -EINVAL;
}
ihk_rwspinlock_write_lock_noirq(&vm->memory_range_lock);
/* check arguments */
@ -9132,6 +9535,11 @@ SYSCALL_DECLARE(mbind)
__FUNCTION__,
addr, len, mode, nodemask, flags);
/* No bind support for straight mapped processes */
if (cpu_local_var(current)->proc->straight_va) {
return 0;
}
/* Validate arguments */
if (addr & ~PAGE_MASK) {
return -EINVAL;
@ -10473,6 +10881,7 @@ long syscall(int num, ihk_mc_user_context_t *ctx)
*/
if (num < PROFILE_SYSCALL_MAX) {
profile_event_add(num, (ts - thread->profile_start_ts));
thread->profile_start_ts = rdtsc();
}
else {
if (num != __NR_profile) {

View File

@ -102,6 +102,7 @@ uint64_t schedule_timeout(uint64_t timeout)
/* Spin wait */
while ((rdtsc() - t_s) < LOOP_TIMEOUT) {
ihk_numa_zero_free_pages(ihk_mc_get_numa_node_by_distance(0));
cpu_pause();
}

2126
kernel/tofu/tof_utofu_main.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1919,21 +1919,35 @@ static int xpmem_remap_pte(
goto out;
}
seg_pte = ihk_mc_pt_lookup_pte(seg_tg->vm->address_space->page_table,
(void *)seg_vaddr, seg_vmr->pgshift, &seg_pgaddr, &seg_pgsize,
&seg_p2align);
if (!seg_pte) {
ret = -EFAULT;
ekprintf("%s: ERROR: ihk_mc_pt_lookup_pte() failed\n",
__FUNCTION__);
goto out;
if (seg_tg->vm->proc->straight_va &&
seg_vaddr >= (unsigned long)seg_tg->vm->proc->straight_va &&
seg_vaddr < ((unsigned long)seg_tg->vm->proc->straight_va +
seg_tg->vm->proc->straight_len)) {
seg_phys = (((unsigned long)seg_vaddr & PAGE_MASK) -
(unsigned long)seg_tg->vm->proc->straight_va) +
seg_tg->vm->proc->straight_pa;
dkprintf("%s: 0x%lx in PID %d is straight -> phys: 0x%lx\n",
__func__, (unsigned long)seg_vaddr & PAGE_MASK,
seg_tg->tgid, seg_phys);
}
XPMEM_DEBUG("seg_pte=0x%016lx, seg_pgaddr=0x%p, seg_pgsize=%lu, "
"seg_p2align=%d",
*seg_pte, seg_pgaddr, seg_pgsize, seg_p2align);
else {
seg_phys = pte_get_phys(seg_pte);
XPMEM_DEBUG("seg_phys=0x%lx", seg_phys);
seg_pte = ihk_mc_pt_lookup_pte(seg_tg->vm->address_space->page_table,
(void *)seg_vaddr, seg_vmr->pgshift, &seg_pgaddr, &seg_pgsize,
&seg_p2align);
if (!seg_pte) {
ret = -EFAULT;
ekprintf("%s: ERROR: ihk_mc_pt_lookup_pte() failed\n",
__FUNCTION__);
goto out;
}
XPMEM_DEBUG("seg_pte=0x%016lx, seg_pgaddr=0x%p, seg_pgsize=%lu, "
"seg_p2align=%d",
*seg_pte, seg_pgaddr, seg_pgsize, seg_p2align);
seg_phys = pte_get_phys(seg_pte);
XPMEM_DEBUG("seg_phys=0x%lx", seg_phys);
}
att_pte = ihk_mc_pt_lookup_pte(vm->address_space->page_table,
(void *)vaddr, vmr->pgshift, &att_pgaddr, &att_pgsize,

View File

@ -0,0 +1,11 @@
#ifndef INCLUDE_BITOPS_TEST_BIT_H
#define INCLUDE_BITOPS_TEST_BIT_H
/*
 * Return nonzero iff bit @nr is set in the bitmap at @addr,
 * treating the bitmap as an array of 32-bit words.
 */
static inline int test_bit(int nr, const void *addr)
{
	const uint32_t *words = (const uint32_t *)addr;

	return (words[nr >> 5] >> (nr & 31)) & 1;
}
#endif

View File

@ -27,11 +27,7 @@ unsigned long find_first_bit(const unsigned long *addr,
unsigned long find_first_zero_bit(const unsigned long *addr,
unsigned long size);
static inline int test_bit(int nr, const void *addr)
{
const uint32_t *p = (const uint32_t *)addr;
return ((1UL << (nr & 31)) & (p[nr >> 5])) != 0;
}
#include <bitops-test_bit.h>
extern unsigned int __sw_hweight32(unsigned int w);
extern unsigned int __sw_hweight16(unsigned int w);

View File

@ -292,5 +292,38 @@ void ihk_mc_spinlock_lock(ihk_spinlock_t *, unsigned long *);
void ihk_mc_spinlock_unlock(ihk_spinlock_t *, unsigned long *);
#endif
/*
 * Linux queued_spin_lock compatible spin_lock, without the queue.
 */

/* Bit 0 of the lock word marks the lock as held. */
#define _Q_LOCKED_OFFSET 0
#define _Q_LOCKED_VAL (1U << _Q_LOCKED_OFFSET)

/*
 * Acquire: spin with compare-and-swap until the lock word goes
 * 0 -> _Q_LOCKED_VAL; cpu_pause() is issued between attempts.
 */
#define linux_spin_lock(lock) \
do { \
while (!__sync_bool_compare_and_swap( \
(unsigned int *)lock, 0, \
_Q_LOCKED_VAL)) { \
cpu_pause(); \
} \
} while (0)

/* Release: store 0 into the lock word via smp_store_release(). */
#define linux_spin_unlock(lock) \
do { \
smp_store_release(lock, 0); \
} while (0)

/* Disable local interrupts (saving state into flags), then acquire. */
#define linux_spin_lock_irqsave(lock, flags) \
do { \
flags = cpu_disable_interrupt_save(); \
linux_spin_lock(lock); \
} while (0)

/* Release the lock, then restore the saved interrupt state. */
#define linux_spin_unlock_irqrestore(lock, flags) \
do { \
linux_spin_unlock(lock); \
cpu_restore_interrupt(flags); \
} while (0)
#endif

View File

@ -20,6 +20,7 @@
#include <ihk/lock.h>
#include <ihk/atomic.h>
#include <arch/mm.h>
#include <ihk/debug.h>
struct memobj;
struct process_vm;
@ -55,6 +56,8 @@ typedef unsigned long ihk_mc_ap_flag;
#define IHK_MC_AP_BANDWIDTH 0x010000
#define IHK_MC_AP_LATENCY 0x020000
/* Only allocate from the closest NUMA node */
#define IHK_MC_AP_NUMA_STRICT 0x040000
#define IHK_MC_PG_KERNEL 0
#define IHK_MC_PG_USER 1
@ -170,6 +173,7 @@ int ihk_mc_pt_change_page(page_table_t pt, void *virt,
enum ihk_mc_pt_attribute);
int ihk_mc_pt_clear_page(page_table_t pt, void *virt);
int ihk_mc_pt_clear_large_page(page_table_t pt, void *virt);
int ihk_mc_clear_kernel_range(void *start, void *end);
int ihk_mc_pt_clear_range(page_table_t pt, struct process_vm *vm,
void *start, void *end);
int ihk_mc_pt_free_range(page_table_t pt, struct process_vm *vm,
@ -178,6 +182,8 @@ int ihk_mc_pt_change_attr_range(page_table_t pt, void *start, void *end,
enum ihk_mc_pt_attribute clrattr,
enum ihk_mc_pt_attribute setattr);
pte_t *ihk_mc_pt_lookup_pte(page_table_t pt, void *virt, int pgshift, void **pgbasep, size_t *pgsizep, int *p2alignp);
pte_t *ihk_mc_pt_lookup_fault_pte(struct process_vm *vm, void *virt,
int pgshift, void **basep, size_t *sizep, int *p2alignp);
int ihk_mc_pt_set_range(page_table_t pt, struct process_vm *vm, void *start,
void *end, uintptr_t phys, enum ihk_mc_pt_attribute attr,
int pgshift, struct vm_range *range, int overwrite);
@ -208,6 +214,10 @@ int ihk_mc_pt_virt_to_phys(struct page_table *pt,
uint64_t ihk_mc_pt_virt_to_pagemap(struct page_table *pt, unsigned long virt);
int ihk_mc_get_nr_numa_nodes(void);
struct ihk_mc_numa_node *ihk_mc_get_numa_node_by_distance(int i);
void ihk_numa_zero_free_pages(struct ihk_mc_numa_node *__node);
extern int zero_at_free;
struct smp_coreset;
int ihk_mc_get_numa_node(int id, int *linux_numa_id, int *type);
int ihk_mc_get_numa_distance(int i, int j);
@ -217,6 +227,11 @@ int ihk_mc_get_memory_chunk(int id,
unsigned long *start,
unsigned long *end,
int *numa_id);
#ifdef ENABLE_TOFU
int ihk_mc_get_memory_chunk_dma_addr(int id,
int tni, int cqid,
uintptr_t *dma_addr);
#endif
void remote_flush_tlb_cpumask(struct process_vm *vm,
unsigned long addr, int cpu_id);
@ -257,4 +272,92 @@ void ihk_mc_query_mem_free_page(void *dump_page_set);
int ihk_mc_chk_page_address(pte_t mem_addr);
int ihk_mc_get_mem_user_page(void *arg0, page_table_t pt, pte_t *ptep, void *pgaddr, int pgshift);
#ifndef unlikely
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif
extern int zero_at_free;
/*
 * Generic lockless page cache.
 * Singly-linked LIFO of free page blocks; the list head is only ever
 * manipulated with compare-and-swap, so no lock is required.
 * TODO: Store nr of pages in header and double-check at alloc time..
 */
struct ihk_mc_page_cache_header;
struct ihk_mc_page_cache_header {
	/* Next free element; the header is stored in the free page itself. */
	struct ihk_mc_page_cache_header *next;
};
/*
 * Push @page back onto the lockless cache @cache.
 * The page itself becomes the new list element; it is linked in with a
 * compare-and-swap on the head pointer, retried until the head does not
 * change underneath us. NULL pages are ignored.
 */
static inline void ihk_mc_page_cache_free(
	struct ihk_mc_page_cache_header *cache, void *page)
{
	struct ihk_mc_page_cache_header *elem =
		(struct ihk_mc_page_cache_header *)page;
	struct ihk_mc_page_cache_header *head;

	if (unlikely(!page))
		return;

	do {
		head = cache->next;
		elem->next = head;
	} while (!__sync_bool_compare_and_swap(&cache->next, head, elem));
}
/*
 * Populate @cache with @nr_elem allocations of @nr_pages pages each.
 * Returns immediately if the cache already holds elements; individual
 * allocation failures are logged and skipped.
 */
static inline void ihk_mc_page_cache_prealloc(
	struct ihk_mc_page_cache_header *cache,
	int nr_pages,
	int nr_elem)
{
	int elem;

	/* Already populated? Nothing to do. */
	if (unlikely(cache->next))
		return;

	for (elem = 0; elem < nr_elem; ++elem) {
		void *alloc = ihk_mc_alloc_pages(nr_pages, IHK_MC_AP_NOWAIT);

		if (!alloc) {
			kprintf("%s: ERROR: allocating pages..\n", __func__);
			continue;
		}

		ihk_mc_page_cache_free(cache, alloc);
	}
}
/*
 * Pop one element (a block of @nr_pages pages) from the lockless cache
 * @cache. If the cache is empty, it is refilled via
 * ihk_mc_page_cache_prealloc().
 *
 * Returns the element, or NULL if the cache is empty and preallocation
 * could not obtain any memory.
 *
 * Fix: the previous version looped back unconditionally after a failed
 * preallocation, spinning forever under OOM; we now bail out with NULL
 * when the cache is still empty after the refill attempt.
 */
static inline void *ihk_mc_page_cache_alloc(
	struct ihk_mc_page_cache_header *cache,
	int nr_pages)
{
	struct ihk_mc_page_cache_header *first, *next;

retry:
	first = cache->next;
	if (!first) {
		kprintf("%s: calling pre-alloc for 0x%lx...\n", __func__, cache);
		ihk_mc_page_cache_prealloc(cache, nr_pages, 256);

		/* Still empty? Out of memory: give up instead of spinning. */
		if (!cache->next)
			return NULL;

		goto retry;
	}

	/* Unlink the head with CAS; retry if another CPU raced us. */
	next = first->next;
	if (!__sync_bool_compare_and_swap(&cache->next, first, next))
		goto retry;

	return (void *)first;
}
#endif

View File

@ -17,6 +17,7 @@
#define __HEADER_GENERIC_IHK_PAGE_ALLOC
#include <list.h>
#include <llist.h>
#include <rbtree.h>
/* XXX: Physical memory management shouldn't be part of IHK */
@ -31,9 +32,11 @@ struct node_distance {
struct free_chunk {
unsigned long addr, size;
struct rb_node node;
struct llist_node list;
};
#endif
struct ihk_mc_numa_node {
int id;
int linux_numa_id;
@ -41,10 +44,17 @@ struct ihk_mc_numa_node {
struct list_head allocators;
struct node_distance *nodes_by_distance;
#ifdef IHK_RBTREE_ALLOCATOR
ihk_atomic_t zeroing_workers;
ihk_atomic_t nr_to_zero_pages;
struct llist_head zeroed_list;
struct llist_head to_zero_list;
struct rb_root free_chunks;
mcs_lock_node_t lock;
unsigned long nr_pages;
/*
* nr_free_pages: all freed pages, zeroed if zero_at_free
*/
unsigned long nr_free_pages;
unsigned long min_addr;
unsigned long max_addr;

View File

@ -13,12 +13,35 @@
#ifndef __HEADER_LIMITS
#define __HEADER_LIMITS
#define INT_MAX 0x7fffffff
#define INT_MIN -0x80000000
#define UINT_MAX 0xffffffff
#define LONG_MAX 0x7fffffffffffffffL
#define LONG_MIN -0x8000000000000000L
#define ULONG_MAX 0xffffffffffffffffL
#define USHRT_MAX ((uint16_t)(~0U))
#define SHRT_MAX ((int16_t)(USHRT_MAX>>1))
#define SHRT_MIN ((int16_t)(-SHRT_MAX - 1))
#define INT_MAX ((int)(~0U>>1))
#define INT_MIN (-INT_MAX - 1)
#define UINT_MAX (~0U)
#define LONG_MAX ((long)(~0UL>>1))
#define LONG_MIN (-LONG_MAX - 1)
#define ULONG_MAX (~0UL)
#define LLONG_MAX ((long long)(~0ULL>>1))
#define LLONG_MIN (-LLONG_MAX - 1)
#define ULLONG_MAX (~0ULL)
#define SIZE_MAX (~(size_t)0)
typedef uint64_t phys_addr_t;
#define PHYS_ADDR_MAX (~(phys_addr_t)0)
#define U8_MAX ((uint8_t)~0U)
#define S8_MAX ((int8_t)(U8_MAX>>1))
#define S8_MIN ((int8_t)(-S8_MAX - 1))
#define U16_MAX ((uint16_t)~0U)
#define S16_MAX ((int16_t)(U16_MAX>>1))
#define S16_MIN ((int16_t)(-S16_MAX - 1))
#define U32_MAX ((uint32_t)~0U)
#define S32_MAX ((int32_t)(U32_MAX>>1))
#define S32_MIN ((int32_t)(-S32_MAX - 1))
#define U64_MAX ((uint64_t)~0ULL)
#define S64_MAX ((int64_t)(U64_MAX>>1))
#define S64_MIN ((int64_t)(-S64_MAX - 1))
#define IOV_MAX 1024
#ifndef PATH_MAX

View File

@ -45,7 +45,7 @@ struct perf_event_attr;
((nr) << _IOC_NRSHIFT) | \
((size) << _IOC_SIZESHIFT))
#ifndef __KERNEL__
#ifndef _IOC_TYPECHECK
#define _IOC_TYPECHECK(t) (sizeof(t))
#endif

View File

@ -319,6 +319,9 @@ kprintf("\nzeroing done\n");
#ifdef IHK_RBTREE_ALLOCATOR
int zero_at_free = 1;
int deferred_zero_at_free = 1;
/*
* Simple red-black tree based physical memory management routines.
*
@ -356,6 +359,7 @@ static int __page_alloc_rbtree_free_range(struct rb_root *root,
/* Is ichunk contigous from the left? */
if (ichunk->addr + ichunk->size == addr) {
struct rb_node *right;
/* Extend it to the right */
ichunk->size += size;
dkprintf("%s: chunk extended to right: 0x%lx:%lu\n",
@ -370,6 +374,10 @@ static int __page_alloc_rbtree_free_range(struct rb_root *root,
if (ichunk->addr + ichunk->size == right_chunk->addr) {
ichunk->size += right_chunk->size;
rb_erase(right, root);
/* Clear old structure */
memset(right_chunk, 0, sizeof(*right_chunk));
dkprintf("%s: chunk merged to right: 0x%lx:%lu\n",
__FUNCTION__, ichunk->addr, ichunk->size);
}
@ -381,6 +389,7 @@ static int __page_alloc_rbtree_free_range(struct rb_root *root,
/* Is ichunk contigous from the right? */
if (addr + size == ichunk->addr) {
struct rb_node *left;
/* Extend it to the left */
ichunk->addr -= size;
ichunk->size += size;
@ -397,6 +406,10 @@ static int __page_alloc_rbtree_free_range(struct rb_root *root,
ichunk->addr -= left_chunk->size;
ichunk->size += left_chunk->size;
rb_erase(left, root);
/* Clear old structure */
memset(left_chunk, 0, sizeof(*left_chunk));
dkprintf("%s: chunk merged to left: 0x%lx:%lu\n",
__FUNCTION__, ichunk->addr, ichunk->size);
}
@ -406,6 +419,10 @@ static int __page_alloc_rbtree_free_range(struct rb_root *root,
new_chunk = (struct free_chunk *)phys_to_virt(ichunk->addr);
*new_chunk = *ichunk;
rb_replace_node(&ichunk->node, &new_chunk->node, root);
/* Clear old structure */
memset(ichunk, 0, sizeof(*ichunk));
dkprintf("%s: chunk moved to front: 0x%lx:%lu\n",
__FUNCTION__, new_chunk->addr, new_chunk->size);
@ -496,6 +513,30 @@ static int __page_alloc_rbtree_mark_range_allocated(struct rb_root *root,
*
* NOTE: locking must be managed by the caller.
*/
/* Parameters for chunk_fits(): requested size and alignment constraints. */
struct chunk_fits_arg {
	unsigned long size;
	unsigned long align_size;
	unsigned long align_mask;
};

/*
 * rb_preorder_dfs_search() predicate: does the free chunk behind @node
 * hold @arg->size bytes at @arg->align_size alignment?
 */
bool chunk_fits(struct rb_node *node, void *arg)
{
	struct chunk_fits_arg *cfa = (struct chunk_fits_arg *)arg;
	struct free_chunk *chunk = container_of(node, struct free_chunk, node);
	unsigned long first_aligned =
		(chunk->addr + (cfa->align_size - 1)) & cfa->align_mask;

	/* Suitable iff the aligned request still ends within the chunk. */
	return (first_aligned + cfa->size) <= (chunk->addr + chunk->size);
}
static unsigned long __page_alloc_rbtree_alloc_pages(struct rb_root *root,
int npages, int p2align)
{
@ -506,6 +547,19 @@ static unsigned long __page_alloc_rbtree_alloc_pages(struct rb_root *root,
unsigned long align_mask = ~(align_size - 1);
unsigned long aligned_addr = 0;
#if 0
struct chunk_fits_arg cfa = {
.size = size,
.align_size = align_size,
.align_mask = align_mask
};
/* Find first maching chunk */
node = rb_preorder_dfs_search(root, chunk_fits, &cfa);
chunk = container_of(node, struct free_chunk, node);
aligned_addr = (chunk->addr + (align_size - 1)) & align_mask;
#else
for (node = rb_first(root); node; node = rb_next(node)) {
chunk = container_of(node, struct free_chunk, node);
aligned_addr = (chunk->addr + (align_size - 1)) & align_mask;
@ -520,6 +574,7 @@ static unsigned long __page_alloc_rbtree_alloc_pages(struct rb_root *root,
if (!node) {
return 0;
}
#endif
dkprintf("%s: allocating: 0x%lx:%lu\n",
__FUNCTION__, aligned_addr, size);
@ -530,6 +585,11 @@ static unsigned long __page_alloc_rbtree_alloc_pages(struct rb_root *root,
return 0;
}
if (zero_at_free) {
memset(phys_to_virt(aligned_addr),
0, sizeof(struct free_chunk));
}
return aligned_addr;
}
@ -576,6 +636,17 @@ static unsigned long __page_alloc_rbtree_reserve_pages(struct rb_root *root,
return aligned_addr;
}
/*
 * Detach and return the chunk at the root of the free tree,
 * or NULL when the tree is empty.
 * NOTE: locking must be managed by the caller.
 */
static struct free_chunk *__page_alloc_rbtree_get_root_chunk(
	struct rb_root *root)
{
	struct rb_node *rb = root->rb_node;

	if (!rb)
		return NULL;

	rb_erase(rb, root);

	return container_of(rb, struct free_chunk, node);
}
/*
* External routines.
@ -583,9 +654,14 @@ static unsigned long __page_alloc_rbtree_reserve_pages(struct rb_root *root,
int ihk_numa_add_free_pages(struct ihk_mc_numa_node *node,
unsigned long addr, unsigned long size)
{
if (zero_at_free) {
/* Zero chunk */
memset(phys_to_virt(addr), 0, size);
}
if (__page_alloc_rbtree_free_range(&node->free_chunks, addr, size)) {
kprintf("%s: ERROR: adding 0x%lx:%lu\n",
__FUNCTION__, addr, size);
__FUNCTION__, addr, size);
return EINVAL;
}
@ -602,6 +678,99 @@ int ihk_numa_add_free_pages(struct ihk_mc_numa_node *node,
return 0;
}
#define IHK_NUMA_ALL_PAGES (0)

/*
 * Zero deferred-free chunks and move them to the zeroed list.
 *
 * @__node: zero chunks of this node only, or NULL to scan all NUMA
 *          nodes in order of distance from the current CPU.
 * @nr_pages: when non-zero, stop after zeroing one chunk large enough
 *            to hold nr_pages pages; IHK_NUMA_ALL_PAGES (0) zeroes
 *            every pending chunk.
 *
 * Returns the number of pages zeroed.  The free_chunk header at the
 * start of each chunk is preserved by the memset.
 */
int __ihk_numa_zero_free_pages(struct ihk_mc_numa_node *__node, int nr_pages)
{
	int i, max_i;
	int nr_zeroed_pages = 0;

	if (!zero_at_free)
		return 0;

	/* If explicitly specified, zero only in __node */
	max_i = __node ? 1 : ihk_mc_get_nr_numa_nodes();

	/* Look at NUMA nodes in the order of distance */
	for (i = 0; i < max_i; ++i) {
		struct ihk_mc_numa_node *node;
		struct llist_node *llnode;

		/* Unless explicitly specified.. */
		node = __node ? __node : ihk_mc_get_numa_node_by_distance(i);
		if (!node) {
			break;
		}

		/*
		 * If number of pages specified, look for a big enough chunk
		 */
		if (nr_pages) {
			struct llist_head tmp;

			init_llist_head(&tmp);

			/* Look for a suitable chunk */
			while ((llnode = llist_del_first(&node->to_zero_list))) {
				unsigned long addr;
				unsigned long size;
				struct free_chunk *chunk =
					container_of(llnode, struct free_chunk, list);

				addr = chunk->addr;
				size = chunk->size;

				if (size < (nr_pages << PAGE_SHIFT)) {
					llist_add(llnode, &tmp);
					continue;
				}

				/* Keep the free_chunk header intact */
				memset(phys_to_virt(addr) + sizeof(*chunk), 0,
					size - sizeof(*chunk));
				llist_add(&chunk->list, &node->zeroed_list);
				barrier();
				ihk_atomic_sub((int)(size >> PAGE_SHIFT),
					&node->nr_to_zero_pages);
				/*
				 * Fix: use the locally saved size; the chunk
				 * was published on zeroed_list above and may
				 * be consumed and reused by another CPU, so
				 * chunk->size must not be read any more.
				 */
				nr_zeroed_pages += (size >> PAGE_SHIFT);
				kprintf("%s: zeroed chunk 0x%lx:%lu in allocate path\n",
					__func__, addr, size);
				break;
			}

			/* Add back the ones that didn't match */
			while ((llnode = llist_del_first(&tmp))) {
				llist_add(llnode, &node->to_zero_list);
			}
		}
		/* Otherwise iterate all to_zero chunks */
		else {
			while ((llnode = llist_del_first(&node->to_zero_list))) {
				unsigned long addr;
				unsigned long size;
				struct free_chunk *chunk =
					container_of(llnode, struct free_chunk, list);

				addr = chunk->addr;
				size = chunk->size;

				memset(phys_to_virt(addr) + sizeof(*chunk), 0,
					size - sizeof(*chunk));
				llist_add(&chunk->list, &node->zeroed_list);
				barrier();
				ihk_atomic_sub((int)(size >> PAGE_SHIFT),
					&node->nr_to_zero_pages);
				/* Fix: do not read chunk->size after publishing */
				nr_zeroed_pages += (size >> PAGE_SHIFT);
			}
		}
	}

	return nr_zeroed_pages;
}
/* Zero every deferred-free chunk of @__node (all nodes when NULL). */
void ihk_numa_zero_free_pages(struct ihk_mc_numa_node *__node)
{
	(void)__ihk_numa_zero_free_pages(__node, IHK_NUMA_ALL_PAGES);
}
unsigned long ihk_numa_alloc_pages(struct ihk_mc_numa_node *node,
int npages, int p2align)
@ -628,7 +797,44 @@ unsigned long ihk_numa_alloc_pages(struct ihk_mc_numa_node *node,
#endif
mcs_lock_lock(&node->lock, &mcs_node);
retry:
if (zero_at_free) {
struct llist_node *llnode;
/*
* Process zeroed chunks that are not
* on the free tree yet.
*/
while ((llnode = llist_del_first(&node->zeroed_list))) {
unsigned long addr;
unsigned long size;
struct free_chunk *chunk =
container_of(llnode, struct free_chunk, list);
addr = chunk->addr;
size = chunk->size;
if (__page_alloc_rbtree_free_range(&node->free_chunks,
addr, size)) {
kprintf("%s: ERROR: freeing zeroed chunk 0x%lx:%lu\n",
__FUNCTION__, addr, npages << PAGE_SHIFT);
}
else {
node->nr_free_pages += (size >> PAGE_SHIFT);
dkprintf("%s: freed zeroed chunk 0x%lx:%lu\n",
__FUNCTION__, addr, size);
}
}
/* Not enough? Check if we can zero pages now */
if (node->nr_free_pages < npages) {
if (__ihk_numa_zero_free_pages(node, npages) >= npages) {
goto retry;
}
}
}
/* Not enough pages? Give up.. */
if (node->nr_free_pages < npages) {
goto unlock_out;
}
@ -639,6 +845,16 @@ unsigned long ihk_numa_alloc_pages(struct ihk_mc_numa_node *node,
/* Does not necessarily succeed due to alignment */
if (addr) {
node->nr_free_pages -= npages;
#if 0
{
size_t free_bytes = __count_free_bytes(&node->free_chunks);
if (free_bytes != node->nr_free_pages * PAGE_SIZE) {
kprintf("%s: inconsistent free count? node: %lu vs. cnt: %lu\n",
__func__, node->nr_free_pages * PAGE_SIZE, free_bytes);
panic("");
}
}
#endif
dkprintf("%s: allocated pages 0x%lx:%lu\n",
__FUNCTION__, addr, npages << PAGE_SHIFT);
}
@ -653,6 +869,7 @@ void ihk_numa_free_pages(struct ihk_mc_numa_node *node,
unsigned long addr, int npages)
{
mcs_lock_node_t mcs_node;
int defer_zero_at_free = deferred_zero_at_free;
#ifdef ENABLE_PER_CPU_ALLOC_CACHE
/* CPU local cache */
@ -684,18 +901,105 @@ void ihk_numa_free_pages(struct ihk_mc_numa_node *node,
return;
}
mcs_lock_lock(&node->lock, &mcs_node);
if (__page_alloc_rbtree_free_range(&node->free_chunks, addr,
npages << PAGE_SHIFT)) {
kprintf("%s: ERROR: freeing 0x%lx:%lu\n",
__FUNCTION__, addr, npages << PAGE_SHIFT);
#if 0
/* Do not defer zeroing when the number of free pages is low */
if (zero_at_free && defer_zero_at_free) {
mcs_lock_lock(&node->lock, &mcs_node);
if (node->nr_free_pages < (node->nr_pages * 3 / 100))
defer_zero_at_free = 0;
mcs_lock_unlock(&node->lock, &mcs_node);
}
#endif
/* Zero chunk right here if needed */
if (zero_at_free && !defer_zero_at_free) {
memset(phys_to_virt(addr), 0, npages << PAGE_SHIFT);
}
/*
* If we don't zero at free() or we zeroed the chunk
* already, simply add it to the free tree.
*/
if (!zero_at_free ||
(zero_at_free && !defer_zero_at_free)) {
mcs_lock_lock(&node->lock, &mcs_node);
if (__page_alloc_rbtree_free_range(&node->free_chunks, addr,
npages << PAGE_SHIFT)) {
kprintf("%s: ERROR: freeing 0x%lx:%lu\n",
__FUNCTION__, addr, npages << PAGE_SHIFT);
}
else {
node->nr_free_pages += npages;
#if 0
{
size_t free_bytes = __count_free_bytes(&node->free_chunks);
if (free_bytes != node->nr_free_pages * PAGE_SIZE) {
kprintf("%s: inconsistent free count? node: %lu vs. cnt: %lu\n",
__func__, node->nr_free_pages * PAGE_SIZE, free_bytes);
panic("");
}
}
#endif
dkprintf("%s: freed%s chunk 0x%lx:%lu\n",
__FUNCTION__,
zero_at_free ? " and zeroed" : "",
addr, npages << PAGE_SHIFT);
}
mcs_lock_unlock(&node->lock, &mcs_node);
}
/*
* Deferred zeroing.
* Put the chunk to the to_zero list.
*/
else {
node->nr_free_pages += npages;
dkprintf("%s: freed pages 0x%lx:%lu\n",
__FUNCTION__, addr, npages << PAGE_SHIFT);
struct free_chunk *chunk =
(struct free_chunk *)phys_to_virt(addr);
chunk->addr = addr;
chunk->size = npages << PAGE_SHIFT;
ihk_atomic_add(npages, &node->nr_to_zero_pages);
barrier();
llist_add(&chunk->list, &node->to_zero_list);
/* Ask Linux to clear memory */
if (cpu_local_var_initialized &&
cpu_local_var(current) &&
cpu_local_var(current) != &cpu_local_var(idle) &&
!cpu_local_var(current)->proc->nohost) {
struct ihk_ikc_channel_desc *syscall_channel =
cpu_local_var(ikc2linux);
struct ikc_scd_packet packet IHK_DMA_ALIGN;
if (ihk_atomic_read(&node->zeroing_workers) > 0) {
dkprintf("%s: skipping Linux zero request..\n", __func__);
return;
}
ihk_atomic_inc(&node->zeroing_workers);
memset(&packet, 0, sizeof(packet));
packet.req.number = __NR_move_pages;
packet.req.args[0] = (unsigned long)node;
barrier();
smp_store_release(&packet.req.valid, 1);
packet.msg = SCD_MSG_SYSCALL_ONESIDE;
packet.ref = ihk_mc_get_processor_id();
packet.pid = cpu_local_var(current)->proc->pid;
packet.resp_pa = 0;
if (ihk_ikc_send(syscall_channel, &packet, 0) < 0) {
kprintf("%s: WARNING: failed to send memory clear"
" send IKC req..\n", __func__);
}
else {
dkprintf("%s: clear mem req for NUMA %d sent in req"
" for addr: 0x%lx\n",
__func__, node->id, addr);
}
}
}
mcs_lock_unlock(&node->lock, &mcs_node);
}
#endif // IHK_RBTREE_ALLOCATOR

View File

@ -0,0 +1,123 @@
# Detect the CPU model from /proc/cpuinfo (first CPU only; each gawk
# program exits after its first match).
execute_process(COMMAND bash -c "gawk '/CPU implementer/ { print \$4; exit; }' /proc/cpuinfo"
  OUTPUT_VARIABLE CPU_IMPLEMENTER OUTPUT_STRIP_TRAILING_WHITESPACE)
execute_process(COMMAND bash -c "gawk '/CPU architecture/ { print \$3; exit; }' /proc/cpuinfo"
  OUTPUT_VARIABLE CPU_ARCH OUTPUT_STRIP_TRAILING_WHITESPACE)
execute_process(COMMAND bash -c "gawk '/CPU variant/ { print \$4; exit; }' /proc/cpuinfo"
  OUTPUT_VARIABLE CPU_VARIANT OUTPUT_STRIP_TRAILING_WHITESPACE)
execute_process(COMMAND bash -c "gawk '/CPU part/ { print \$4; exit; }' /proc/cpuinfo"
  OUTPUT_VARIABLE CPU_PART OUTPUT_STRIP_TRAILING_WHITESPACE)
# Fujitsu (implementer 0x46), ARMv8, variant 0x1, part 0x001 == A64FX
if(CPU_IMPLEMENTER STREQUAL "0x46" AND CPU_ARCH STREQUAL "8" AND
   CPU_VARIANT STREQUAL "0x1" AND CPU_PART STREQUAL "0x001")
  message("A64FX detected")
  set(CPU_MODEL "a64fx")
  add_definitions(-D__a64fx__)
endif()
# find first NUMA available to user (0 or 4 now): zoneinfo.awk extracts
# per-node free page counts, zoneinfo_filter.awk prints node ids with
# more than 2 GiB free; take the first.
execute_process(COMMAND bash -c "awk -v keyword=nr_free_pages -f ${CMAKE_CURRENT_SOURCE_DIR}/src/zoneinfo.awk /proc/zoneinfo | awk -v page_size=$(getconf PAGE_SIZE) -f ${CMAKE_CURRENT_SOURCE_DIR}/src/zoneinfo_filter.awk | head -n1" OUTPUT_VARIABLE FIRST_USER_NUMA OUTPUT_STRIP_TRAILING_WHITESPACE)
message("FIRST_USER_NUMA: ${FIRST_USER_NUMA}")
add_definitions(-DFIRST_USER_NUMA=${FIRST_USER_NUMA})
# NOTE(review): configure-time "sudo" side effect (widening the
# system.slice cpuset) is surprising in a CMake run — confirm this is
# intended here rather than in the test driver scripts.
if (FIRST_USER_NUMA STREQUAL "4")
  execute_process(COMMAND sudo bash -c "echo 0-7 > /sys/fs/cgroup/cpuset/system.slice/cpuset.mems")
endif()
# Fix: cmake_minimum_required() and project() must precede other
# configuration commands (ideally they are the first two lines of the
# file); CMP0005 is kept explicitly NEW so add_definitions() values are
# escaped properly.
cmake_minimum_required(VERSION 3.0)
project(issue1470 C)
cmake_policy(SET CMP0005 NEW)

# Options: -DWITH_MCK=<McKernel install directory>
add_definitions(-DWITH_MCK=${WITH_MCK})
# Options: -DWITH_MCK_SRC=<McKernel source directory>
add_definitions(-DWITH_MCK_SRC=${WITH_MCK_SRC})

# for autotest: default install location of the generated driver scripts
if(NOT DEFINED CMAKE_INSTALL_PREFIX_SCRIPTS)
  set(CMAKE_INSTALL_PREFIX_SCRIPTS ${CMAKE_INSTALL_PREFIX}/scripts)
endif()
# CPPFLAGS
set(UNAME_R ${CMAKE_SYSTEM_VERSION} CACHE STRING "Kernel version to build against")
set(KERNEL_DIR "/lib/modules/${UNAME_R}/build" CACHE STRING "kernel build directory")
# Read CONFIG_ARM64_64K_PAGES from the target kernel's .config
execute_process(COMMAND awk -F= "$1 == \"CONFIG_ARM64_64K_PAGES\" { print $2; exit; }" "${KERNEL_DIR}/.config"
  OUTPUT_VARIABLE CONFIG_ARM64_64K_PAGES OUTPUT_STRIP_TRAILING_WHITESPACE)
# Derive page size: 64K on arm64 with CONFIG_ARM64_64K_PAGES=y, else 4K.
# NOTE(review): PAGE_SIZE is only reported via message() here — confirm
# whether a consumer (e.g. configure_file) uses it elsewhere.
if(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
  if(CONFIG_ARM64_64K_PAGES STREQUAL "y")
    set(PAGE_SIZE "65536")
  else()
    set(PAGE_SIZE "4096")
  endif()
else()
  set(PAGE_SIZE "4096")
endif()
message("PAGE_SIZE: ${PAGE_SIZE}")
# McKernel build target / kernel-module postfix per architecture
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
  add_definitions(-DBUILD_TARGET=smp-x86)
  add_definitions(-DKMOD_POSTFIX=smp_x86)
elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
  add_definitions(-DBUILD_TARGET=smp-arm64)
  add_definitions(-DKMOD_POSTFIX=smp_arm64)
endif()
add_definitions(-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX})

# CFLAGS
# NOTE(review): directory-scoped commands (add_definitions,
# add_compile_options, link_directories) apply to every target defined
# below; modern CMake would attach these per target with target_*().
set(CFLAGS_WARNING "-Wall" "-Wextra" "-Wno-unused-parameter" "-Wno-sign-compare" "-Wno-unused-function" ${EXTRA_WARNINGS} CACHE STRING "Warning flags")
add_compile_options(
  -O2
  -g
  ${CFLAGS_WARNING}
)

# -L, this must be done before adding dependants
link_directories("${WITH_MCK}/lib64")

# -Wl,--rpath=, this must be done before adding dependants
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
set(CMAKE_INSTALL_RPATH "${WITH_MCK}/lib64")
# Generate (via @VAR@ substitution) and install the per-policy test
# driver scripts.
set(issue1470_driver_scripts
  OMPI_MCA_plm_ple_memory_allocation_policy01
  OMPI_MCA_plm_ple_memory_allocation_policy02
  OMPI_MCA_plm_ple_memory_allocation_policy03
  OMPI_MCA_plm_ple_memory_allocation_policy04
  OMPI_MCA_plm_ple_memory_allocation_policy05
  OMPI_MCA_plm_ple_memory_allocation_policy06
  OMPI_MCA_plm_ple_memory_allocation_policy07
)
foreach(target ${issue1470_driver_scripts})
  # Expand @WITH_MCK@ / @CMAKE_INSTALL_PREFIX@ placeholders only
  configure_file(src/${target}.sh.in ihklib-${target} @ONLY)
  # Install the generated script with execute permission
  install(PROGRAMS ${CMAKE_BINARY_DIR}/ihklib-${target} DESTINATION ${CMAKE_INSTALL_PREFIX_SCRIPTS})
endforeach()
# Programs that run on McKernel under mcexec.
foreach(target IN ITEMS check_mempolicy)
  # Build the C executable from its same-named source file
  add_executable(${target} src/${target}.c)
  # Private include path for okng.h
  target_include_directories(${target}
    PRIVATE "${PROJECT_SOURCE_DIR}/include"
  )
  # libnuma provides get_mempolicy()
  target_link_libraries(${target} PRIVATE numa)
  # Install into <prefix>/bin
  install(TARGETS ${target} DESTINATION bin)
endforeach()

32
test/issues/1470/README Normal file
View File

@ -0,0 +1,32 @@
============
What to test
============
Check if the mode and nodemask obtained by get_mempolicy() are set to the expected values, with different reserved cpus, cpumask and policy requests.
The following settings are used.
Memory reserved: "1G@4,1G@5,1G@6,1G@7"
CPUs reserved: "12-59", "24-59"
FLIB_AFFINITY_ON_PROCESS:
"12-23", "24-35", "36-47", "48-59",
"12-35", "24-47", "36-59",
"12-47", "24-59",
"12-59"
OMPI_MCA_plm_ple_memory_allocation_policy:
{interleave,bind,prefer}_{local,nonlocal},
{interleave,bind}_all,
localalloc
============
How to build
============
cd <mckernel>/test/issues
mkdir build
cd build
cmake ../1470/ -DCMAKE_INSTALL_PREFIX=<mckernel>/test/issues/install -DWITH_MCK=<mckernel-install> -DWITH_MCK_SRC=<mckernel>
===========
How to test
===========
for i in {1..7}; do <mckernel>/test/issues/install/scripts/ihklib-OMPI_MCA_plm_ple_memory_allocation_policy0$i; done
Check that no line containing "[ NG ]" appears in the output of any run.

View File

@ -0,0 +1,33 @@
#ifndef __OKNG_H_INCLUDED__
#define __OKNG_H_INCLUDED__

#include <stdio.h>

/*
 * _OKNG(verb, jump, cond, fmt, args...):
 * Print "[ OK ] <msg>" (only when verb is non-zero) if cond holds;
 * otherwise print "[ NG ] <msg>" and, when jump is non-zero, set ret
 * to 1 and jump to the "out" label.  The calling scope must declare an
 * int "ret" and provide an "out:" label.
 */
#define _OKNG(verb, jump, cond, fmt, args...) do { \
	if (cond) { \
		if (verb) \
			printf("[ OK ] " fmt, ##args); \
	} else { \
		printf("[ NG ] " fmt, ##args); \
		if (jump) { \
			ret = 1; \
			goto out; \
		} \
	} \
} while (0)

/* Verbose, jumping variant: OKNG(cond, fmt, ...) */
#define OKNG(args...) _OKNG(1, 1, ##args)

#define INFO(fmt, args...) printf("[ INFO ] " fmt, ##args)
#define START(fmt, args...) printf("[ START] " fmt, ##args)

/*
 * INTERR(cond, fmt, args...): internal-error check; when cond is true,
 * report file/line and the message, set ret to 1 and jump to "out".
 */
#define INTERR(cond, fmt, args...) do { \
	if (cond) { \
		char msg[4096]; \
		/* Fix: snprintf instead of sprintf to prevent overflow \
		 * of the fixed-size buffer */ \
		snprintf(msg, sizeof(msg), fmt, ##args); \
		printf("[INTERR] %s:%d %s", __FILE__, __LINE__, msg); \
		ret = 1; \
		goto out; \
	} \
} while (0)

/* Compile-visible check that an array has the expected element count */
#define ARRAY_SIZE_CHECK(array, size) INTERR(sizeof(array)/sizeof(array[0]) != size, "size of array \"%s\" isn't %d\n", #array, size)

#endif

View File

@ -0,0 +1,53 @@
#!/usr/bin/bash
# Issue #1470 driver: verify get_mempolicy() mode/nodemask for the
# *_local policies when CPUs 12-59 are reserved.

# define WORKDIR
SCRIPT_PATH=$(readlink -m "${BASH_SOURCE[0]}")
SCRIPT_NAME="${SCRIPT_PATH##*/}"
TEST_NAME="${SCRIPT_NAME%.sh}"
AUTOTEST_HOME="${SCRIPT_PATH%/*/*/*}"
if [ -f ${AUTOTEST_HOME}/bin/config.sh ]; then
	. ${AUTOTEST_HOME}/bin/config.sh
else
	WORKDIR=$(pwd)
fi

# Expected mode (3 = MPOL_INTERLEAVE, 2 = MPOL_BIND, 1 = MPOL_PREFERRED)
declare -A mode
mode[interleave_local]="3"
mode[bind_local]="2"
mode[prefer_local]="1"

# Expected nodemask (bitmap of McKernel NUMA nodes) per cpuset
declare -A nodemask
nodemask[$(seq -s, 12 23)]="1"
nodemask[$(seq -s, 24 35)]="2"
nodemask[$(seq -s, 36 47)]="4"
nodemask[$(seq -s, 48 59)]="8"
nodemask[$(seq -s, 12 35)]="3"
nodemask[$(seq -s, 24 47)]="6"
nodemask[$(seq -s, 36 59)]="12"
nodemask[$(seq -s, 12 47)]="7"
nodemask[$(seq -s, 24 59)]="14"
nodemask[$(seq -s, 12 59)]="15"

@WITH_MCK@/sbin/mcstop+release.sh
@WITH_MCK@/sbin/mcreboot.sh -c 12-59 -m 1G@4,1G@5,1G@6,1G@7

# Fix: initialize the one-shot flag explicitly rather than relying on an
# unset variable evaluating to 0 in arithmetic context (an exported
# environment variable "i" would otherwise skip the first check).
i=0
for policy in interleave_local bind_local prefer_local; do
	for cpuset in \
		$(seq -s, 12 23) $(seq -s, 24 35) $(seq -s, 36 47) $(seq -s, 48 59) \
		$(seq -s, 12 35) $(seq -s, 24 47) $(seq -s, 36 59) \
		$(seq -s, 12 47) $(seq -s, 24 59) \
		$(seq -s, 12 59); do
		# check if policy is not set when not specified
		if (( i++ == 0 )); then
			FLIB_NUM_PROCESS_ON_NODE=1 FLIB_AFFINITY_ON_PROCESS=$cpuset @WITH_MCK@/bin/mcexec @CMAKE_INSTALL_PREFIX@/bin/check_mempolicy -m 0 || exit $?
		fi
		FLIB_NUM_PROCESS_ON_NODE=1 FLIB_AFFINITY_ON_PROCESS=$cpuset OMPI_MCA_plm_ple_memory_allocation_policy=$policy @WITH_MCK@/bin/mcexec @CMAKE_INSTALL_PREFIX@/bin/check_mempolicy -m ${mode[$policy]} -n ${nodemask[$cpuset]} || exit $?
	done
done

@WITH_MCK@/sbin/mcstop+release.sh
exit 0

View File

@ -0,0 +1,48 @@
#!/usr/bin/bash
# Issue #1470 driver: verify get_mempolicy() mode/nodemask for the
# *_nonlocal policies when CPUs 12-59 are reserved.

# define WORKDIR
SCRIPT_PATH=$(readlink -m "${BASH_SOURCE[0]}")
SCRIPT_NAME="${SCRIPT_PATH##*/}"
TEST_NAME="${SCRIPT_NAME%.sh}"
AUTOTEST_HOME="${SCRIPT_PATH%/*/*/*}"
if [ -f ${AUTOTEST_HOME}/bin/config.sh ]; then
	. ${AUTOTEST_HOME}/bin/config.sh
else
	WORKDIR=$(pwd)
fi

# Expected mode (3 = MPOL_INTERLEAVE, 2 = MPOL_BIND, 1 = MPOL_PREFERRED)
declare -A mode
mode[interleave_nonlocal]="3"
mode[bind_nonlocal]="2"
mode[prefer_nonlocal]="1"

# Expected nodemask: complement of the local node bits for each cpuset
declare -A nodemask
nodemask[$(seq -s, 12 23)]="14"
nodemask[$(seq -s, 24 35)]="13"
nodemask[$(seq -s, 36 47)]="11"
nodemask[$(seq -s, 48 59)]="7"
nodemask[$(seq -s, 12 35)]="12"
nodemask[$(seq -s, 24 47)]="9"
nodemask[$(seq -s, 36 59)]="3"
nodemask[$(seq -s, 12 47)]="8"
nodemask[$(seq -s, 24 59)]="1"
nodemask[$(seq -s, 12 59)]="0"

@WITH_MCK@/sbin/mcstop+release.sh
@WITH_MCK@/sbin/mcreboot.sh -c 12-59 -m 1G@4,1G@5,1G@6,1G@7

for policy in interleave_nonlocal bind_nonlocal prefer_nonlocal; do
	for cpuset in \
		$(seq -s, 12 23) $(seq -s, 24 35) $(seq -s, 36 47) $(seq -s, 48 59) \
		$(seq -s, 12 35) $(seq -s, 24 47) $(seq -s, 36 59) \
		$(seq -s, 12 47) $(seq -s, 24 59) \
		$(seq -s, 12 59); do
		FLIB_NUM_PROCESS_ON_NODE=1 FLIB_AFFINITY_ON_PROCESS=$cpuset OMPI_MCA_plm_ple_memory_allocation_policy=$policy @WITH_MCK@/bin/mcexec @CMAKE_INSTALL_PREFIX@/bin/check_mempolicy -m ${mode[$policy]} -n ${nodemask[$cpuset]} || exit $?
	done
done

@WITH_MCK@/sbin/mcstop+release.sh
exit 0

View File

@ -0,0 +1,34 @@
#!/usr/bin/bash
# Issue #1470 driver: verify get_mempolicy() mode for the *_all policies;
# the nodemask is always the full mask (15) regardless of cpuset.

# define WORKDIR
SCRIPT_PATH=$(readlink -m "${BASH_SOURCE[0]}")
SCRIPT_NAME="${SCRIPT_PATH##*/}"
TEST_NAME="${SCRIPT_NAME%.sh}"
AUTOTEST_HOME="${SCRIPT_PATH%/*/*/*}"
if [ -f ${AUTOTEST_HOME}/bin/config.sh ]; then
	. ${AUTOTEST_HOME}/bin/config.sh
else
	WORKDIR=$(pwd)
fi

# Expected mode (3 = MPOL_INTERLEAVE, 2 = MPOL_BIND)
declare -A mode
mode[interleave_all]="3"
mode[bind_all]="2"

@WITH_MCK@/sbin/mcstop+release.sh
@WITH_MCK@/sbin/mcreboot.sh -c 12-59 -m 1G@4,1G@5,1G@6,1G@7

for policy in interleave_all bind_all; do
	for cpuset in \
		$(seq -s, 12 23) $(seq -s, 24 35) $(seq -s, 36 47) $(seq -s, 48 59) \
		$(seq -s, 12 35) $(seq -s, 24 47) $(seq -s, 36 59) \
		$(seq -s, 12 47) $(seq -s, 24 59) \
		$(seq -s, 12 59); do
		FLIB_NUM_PROCESS_ON_NODE=1 FLIB_AFFINITY_ON_PROCESS=$cpuset OMPI_MCA_plm_ple_memory_allocation_policy=$policy @WITH_MCK@/bin/mcexec @CMAKE_INSTALL_PREFIX@/bin/check_mempolicy -m ${mode[$policy]} -n 15 || exit $?
	done
done

@WITH_MCK@/sbin/mcstop+release.sh
exit 0

View File

@ -0,0 +1,30 @@
#!/usr/bin/bash
# Issue #1470 driver: verify that "localalloc" yields MPOL_DEFAULT
# (mode 0; nodemask is don't-care) for every cpuset.

# define WORKDIR
SCRIPT_PATH=$(readlink -m "${BASH_SOURCE[0]}")
SCRIPT_NAME="${SCRIPT_PATH##*/}"
TEST_NAME="${SCRIPT_NAME%.sh}"
AUTOTEST_HOME="${SCRIPT_PATH%/*/*/*}"
if [ -f ${AUTOTEST_HOME}/bin/config.sh ]; then
	. ${AUTOTEST_HOME}/bin/config.sh
else
	WORKDIR=$(pwd)
fi

@WITH_MCK@/sbin/mcstop+release.sh
@WITH_MCK@/sbin/mcreboot.sh -c 12-59 -m 1G@4,1G@5,1G@6,1G@7

for policy in localalloc; do
	for cpuset in \
		$(seq -s, 12 23) $(seq -s, 24 35) $(seq -s, 36 47) $(seq -s, 48 59) \
		$(seq -s, 12 35) $(seq -s, 24 47) $(seq -s, 36 59) \
		$(seq -s, 12 47) $(seq -s, 24 59) \
		$(seq -s, 12 59); do
		FLIB_NUM_PROCESS_ON_NODE=1 FLIB_AFFINITY_ON_PROCESS=$cpuset OMPI_MCA_plm_ple_memory_allocation_policy=$policy @WITH_MCK@/bin/mcexec @CMAKE_INSTALL_PREFIX@/bin/check_mempolicy -m 0 || exit $?
	done
done

@WITH_MCK@/sbin/mcstop+release.sh
exit 0

View File

@ -0,0 +1,44 @@
#!/usr/bin/bash
# Issue #1470 driver: verify *_local policies when only the last 36 CPUs
# (24-59) are reserved; CPU 12-23 cpusets map to shifted node bits.

# define WORKDIR
SCRIPT_PATH=$(readlink -m "${BASH_SOURCE[0]}")
SCRIPT_NAME="${SCRIPT_PATH##*/}"
TEST_NAME="${SCRIPT_NAME%.sh}"
AUTOTEST_HOME="${SCRIPT_PATH%/*/*/*}"
if [ -f ${AUTOTEST_HOME}/bin/config.sh ]; then
	. ${AUTOTEST_HOME}/bin/config.sh
else
	WORKDIR=$(pwd)
fi

# Expected mode (3 = MPOL_INTERLEAVE, 2 = MPOL_BIND, 1 = MPOL_PREFERRED)
declare -A mode
mode[interleave_local]="3"
mode[bind_local]="2"
mode[prefer_local]="1"

# Expected nodemask per cpuset with the 24-59 reservation
declare -A nodemask
nodemask[$(seq -s, 12 23)]="2"
nodemask[$(seq -s, 24 35)]="4"
nodemask[$(seq -s, 36 47)]="8"
nodemask[$(seq -s, 12 35)]="6"
nodemask[$(seq -s, 24 47)]="12"
nodemask[$(seq -s, 12 47)]="14"

# reserve the last 36 cpus
# NOTE(review): unlike policy01-04, mcstop+release.sh is not invoked
# before mcreboot.sh here — confirm whether that is intentional.
@WITH_MCK@/sbin/mcreboot.sh -c 24-59 -m 1G@4,1G@5,1G@6,1G@7

for policy in interleave_local bind_local prefer_local; do
	for cpuset in \
		$(seq -s, 12 23) $(seq -s, 24 35) $(seq -s, 36 47) \
		$(seq -s, 12 35) $(seq -s, 24 47) \
		$(seq -s, 12 47); do
		# check nodemask when last 36 cpus are reserved
		FLIB_NUM_PROCESS_ON_NODE=1 FLIB_AFFINITY_ON_PROCESS=$cpuset OMPI_MCA_plm_ple_memory_allocation_policy=$policy @WITH_MCK@/bin/mcexec @CMAKE_INSTALL_PREFIX@/bin/check_mempolicy -m ${mode[$policy]} -n ${nodemask[$cpuset]} || exit $?
	done
done

@WITH_MCK@/sbin/mcstop+release.sh
exit 0

View File

@ -0,0 +1,44 @@
#!/usr/bin/bash
# Issue #1470 driver: verify *_nonlocal policies when only the last 36
# CPUs (24-59) are reserved.

# define WORKDIR
SCRIPT_PATH=$(readlink -m "${BASH_SOURCE[0]}")
SCRIPT_NAME="${SCRIPT_PATH##*/}"
TEST_NAME="${SCRIPT_NAME%.sh}"
AUTOTEST_HOME="${SCRIPT_PATH%/*/*/*}"
if [ -f ${AUTOTEST_HOME}/bin/config.sh ]; then
	. ${AUTOTEST_HOME}/bin/config.sh
else
	WORKDIR=$(pwd)
fi

# Expected mode (3 = MPOL_INTERLEAVE, 2 = MPOL_BIND, 1 = MPOL_PREFERRED)
declare -A mode
mode[interleave_nonlocal]="3"
mode[bind_nonlocal]="2"
mode[prefer_nonlocal]="1"

# Expected nodemask: complement of local bits under the 24-59 reservation
declare -A nodemask
nodemask[$(seq -s, 12 23)]="13"
nodemask[$(seq -s, 24 35)]="11"
nodemask[$(seq -s, 36 47)]="7"
nodemask[$(seq -s, 12 35)]="9"
nodemask[$(seq -s, 24 47)]="3"
nodemask[$(seq -s, 12 47)]="1"

# reserve the last 36 cpus
# NOTE(review): mcstop+release.sh is not invoked before mcreboot.sh,
# unlike policy01-04 — confirm whether that is intentional.
@WITH_MCK@/sbin/mcreboot.sh -c 24-59 -m 1G@4,1G@5,1G@6,1G@7

for policy in interleave_nonlocal bind_nonlocal prefer_nonlocal; do
	for cpuset in \
		$(seq -s, 12 23) $(seq -s, 24 35) $(seq -s, 36 47) \
		$(seq -s, 12 35) $(seq -s, 24 47) \
		$(seq -s, 12 47); do
		# check nodemask when last 36 cpus are reserved
		FLIB_NUM_PROCESS_ON_NODE=1 FLIB_AFFINITY_ON_PROCESS=$cpuset OMPI_MCA_plm_ple_memory_allocation_policy=$policy @WITH_MCK@/bin/mcexec @CMAKE_INSTALL_PREFIX@/bin/check_mempolicy -m ${mode[$policy]} -n ${nodemask[$cpuset]} || exit $?
	done
done

@WITH_MCK@/sbin/mcstop+release.sh
exit 0

View File

@ -0,0 +1,34 @@
#!/usr/bin/bash
# Issue #1470 driver: verify *_all policies (full nodemask 15) when only
# the last 36 CPUs (24-59) are reserved.

# define WORKDIR
SCRIPT_PATH=$(readlink -m "${BASH_SOURCE[0]}")
SCRIPT_NAME="${SCRIPT_PATH##*/}"
TEST_NAME="${SCRIPT_NAME%.sh}"
AUTOTEST_HOME="${SCRIPT_PATH%/*/*/*}"
if [ -f ${AUTOTEST_HOME}/bin/config.sh ]; then
	. ${AUTOTEST_HOME}/bin/config.sh
else
	WORKDIR=$(pwd)
fi

# Expected mode (3 = MPOL_INTERLEAVE, 2 = MPOL_BIND)
declare -A mode
mode[interleave_all]="3"
mode[bind_all]="2"

# reserve the last 36 cpus
# NOTE(review): mcstop+release.sh is not invoked before mcreboot.sh,
# unlike policy01-04 — confirm whether that is intentional.
@WITH_MCK@/sbin/mcreboot.sh -c 24-59 -m 1G@4,1G@5,1G@6,1G@7

for policy in interleave_all bind_all; do
	for cpuset in \
		$(seq -s, 12 23) $(seq -s, 24 35) $(seq -s, 36 47) \
		$(seq -s, 12 35) $(seq -s, 24 47) \
		$(seq -s, 12 47); do
		# check nodemask when last 36 cpus are reserved
		FLIB_NUM_PROCESS_ON_NODE=1 FLIB_AFFINITY_ON_PROCESS=$cpuset OMPI_MCA_plm_ple_memory_allocation_policy=$policy @WITH_MCK@/bin/mcexec @CMAKE_INSTALL_PREFIX@/bin/check_mempolicy -m ${mode[$policy]} -n 15 || exit $?
	done
done

@WITH_MCK@/sbin/mcstop+release.sh
exit 0

View File

@ -0,0 +1,54 @@
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <errno.h>
#include <numaif.h>
#include <okng.h>
#define PLD_PROCESS_NUMA_MASK_BITS 256
/*
 * Compare the mode and nodemask reported by get_mempolicy() with the
 * expected values given on the command line.
 *
 * Options:
 *   -m <mode>      expected policy mode (required; 0 = MPOL_DEFAULT)
 *   -n <nodemask>  expected first nodemask word (ignored when -m 0)
 *
 * Returns 0 on success, 1 on mismatch or internal error.
 */
int main(int argc, char **argv)
{
	long ret;
	int mode;
	unsigned long nodemask[PLD_PROCESS_NUMA_MASK_BITS /
			       (sizeof(unsigned long) * 8)] = { 0 };
	int mode_expected = -1;
	unsigned long nodemask_expected[PLD_PROCESS_NUMA_MASK_BITS /
					(sizeof(unsigned long) * 8)] = { 0 };
	int opt;

	while ((opt = getopt(argc, argv, "m:n:")) != -1) {
		switch (opt) {
		case 'm':
			mode_expected = atol(optarg);
			break;
		case 'n':
			nodemask_expected[0] = atoi(optarg);
			break;
		default: /* '?' */
			INTERR(1, "unknown option %c\n", optopt);
		}
	}

	INTERR(mode_expected == -1, "specify -m <mode>\n");

	ret = get_mempolicy(&mode, nodemask, PLD_PROCESS_NUMA_MASK_BITS,
			    NULL, 0);
	/* Fix: get_mempolicy() returns -1 and sets errno on failure,
	 * so report errno instead of the raw return value */
	INTERR(ret, "get_mempolicy failed with errno: %d\n", errno);

	OKNG(mode == mode_expected, "mode: actual (%d), expected (%d)\n",
	     mode, mode_expected);

	/* nodemask is "don't care" when mode is MPOL_DEFAULT */
	if (mode_expected != 0) {
		/* Fix: %lu for unsigned long values */
		OKNG(nodemask[0] == nodemask_expected[0],
		     "nodemask: actual (%lu), expected (%lu)\n",
		     nodemask[0],
		     nodemask_expected[0]);
	}

	ret = 0;
out:
	return ret;
}

View File

@ -0,0 +1,9 @@
# Emit "id: <node>, nr_free_pages: <count>" for each NUMA node: remember
# the node id from a Normal/DMA32 zone header, then report the first
# subsequent line matching the external variable "keyword"
# (passed with -v keyword=...).
BEGIN { id = -1; }
# Zone header, e.g. "Node 4, zone Normal"; strip the trailing comma of $2
/Node .*, zone\s*(Normal|DMA32)/ { id = substr($2, 1, length($2) - 1); }
{
	# report only once per zone header (id reset to -1 afterwards)
	if ($0 ~ keyword && id != -1) {
		printf("id: %d, nr_free_pages: %ld\n", id, $2);
		id = -1;
	}
}

View File

@ -0,0 +1,13 @@
# Consume lines of the form "id: <node>, nr_free_pages: <pages>"
# (zoneinfo.awk output) and print the ids of nodes with more than
# 2 GiB of free memory; "page_size" is passed with -v page_size=...
{
	# $2 carries a trailing comma; accumulate pages per node id
	id = substr($2, 1, length($2) - 1);
	size = $4;
	sizes[id] += size;
}
END {
	# NOTE(review): iterates 0..last-seen id, assuming contiguous
	# node ids in the input — confirm against zoneinfo.awk output
	for (i = 0; i <= id; i++) {
		if (sizes[i] * page_size > 2 * 1024 * 1024 * 1024) {
			print i;
		}
	}
}

145
test/issues/929/C929.sh Executable file
View File

@ -0,0 +1,145 @@
#!/bin/bash
# Fix: the shebang was "#/bin/sh" (missing "!"), i.e. a plain comment.
# bash is required because the script uses "&>" redirection and "let".
USELTP=0
USEOSTEST=0

. ../../common.sh

issue="929"
tid=01

# T01: -ppn == -n — all 5 processes start and mpirun succeeds
tname=`printf "C${issue}T%02d" ${tid}`
echo "*** ${tname} start *******************************"
TEST_CMD="mpirun -f ./hostfile -ppn 5 ${MCEXEC} -n 5 ./test_prog.sh"
echo ${TEST_CMD}
${TEST_CMD} &> ${tname}.txt
mpi_ret=$?
cat ./${tname}.txt
started_num=`grep 'test_prog is started' ./${tname}.txt | wc -l`
if [ ${mpi_ret} -eq 0 -a ${started_num} -eq 5 ]; then
	echo "*** ${tname} PASSED ******************************"
else
	echo "*** ${tname} FAILED ******************************"
fi
let tid++
echo ""

# T02: -ppn > -n — only part of the processes start, mpirun fails
tname=`printf "C${issue}T%02d" ${tid}`
echo "*** ${tname} start *******************************"
TEST_CMD="mpirun -f ./hostfile -ppn 5 ${MCEXEC} -n 3 ./test_prog.sh"
echo ${TEST_CMD}
${TEST_CMD} &> ${tname}.txt
mpi_ret=$?
cat ./${tname}.txt
started_num=`grep 'test_prog is started' ./${tname}.txt | wc -l`
if [ ${mpi_ret} -ne 0 -a ${started_num} -eq 3 ]; then
	echo "*** ${tname} PASSED ******************************"
else
	echo "*** ${tname} FAILED ******************************"
fi
let tid++
echo ""

# T03: -ppn < -n — no process starts, mpirun fails
tname=`printf "C${issue}T%02d" ${tid}`
echo "*** ${tname} start *******************************"
TEST_CMD="mpirun -f ./hostfile -ppn 3 ${MCEXEC} -n 5 ./test_prog.sh"
echo ${TEST_CMD}
${TEST_CMD} &> ${tname}.txt
mpi_ret=$?
cat ./${tname}.txt
started_num=`grep 'test_prog is started' ./${tname}.txt | wc -l`
if [ ${mpi_ret} -ne 0 -a ${started_num} -eq 0 ]; then
	echo "*** ${tname} PASSED ******************************"
else
	echo "*** ${tname} FAILED ******************************"
fi
let tid++
echo ""

# T04: -ppn is an integer multiple of -n — partial start, mpirun fails
tname=`printf "C${issue}T%02d" ${tid}`
echo "*** ${tname} start *******************************"
TEST_CMD="mpirun -f ./hostfile -ppn 6 ${MCEXEC} -n 3 ./test_prog.sh"
echo ${TEST_CMD}
${TEST_CMD} &> ${tname}.txt
mpi_ret=$?
cat ./${tname}.txt
started_num=`grep 'test_prog is started' ./${tname}.txt | wc -l`
if [ ${mpi_ret} -ne 0 -a ${started_num} -eq 3 ]; then
	echo "*** ${tname} PASSED ******************************"
else
	echo "*** ${tname} FAILED ******************************"
fi
let tid++
echo ""

# T05: -ppn/-n exceed the number of McKernel CPUs — no start, mpirun fails
tname=`printf "C${issue}T%02d" ${tid}`
echo "*** ${tname} start *******************************"
TEST_CMD="mpirun -f ./hostfile -ppn 250 ${MCEXEC} -n 250 ./test_prog.sh"
echo ${TEST_CMD}
${TEST_CMD} &> ${tname}.txt
mpi_ret=$?
head -n 10 ./${tname}.txt
echo "..."
started_num=`grep 'test_prog is started' ./${tname}.txt | wc -l`
if [ ${mpi_ret} -ne 0 -a ${started_num} -eq 0 ]; then
	echo "*** ${tname} PASSED ******************************"
else
	echo "*** ${tname} FAILED ******************************"
fi
let tid++
echo ""

# T06: 20 consecutive successful runs; pe_list length must stay <= 5
tname=`printf "C${issue}T%02d" ${tid}`
echo "*** ${tname} start *******************************"
ng=0
TEST_CMD="mpirun -f ./hostfile -ppn 5 ${MCEXEC} -n 5 ./test_prog.sh"
echo "** reboot mcrernel for check pe_list_len"
mcreboot
echo "** enable debug message in mcexec_get_cpuset"
sudo sh -c "echo -n 'func mcexec_get_cpuset +p' > /sys/kernel/debug/dynamic_debug/control"
echo ${TEST_CMD}
for i in `seq 1 20`
do
	${TEST_CMD} &> ${tname}.txt
	mpi_ret=$?
	started_num=`grep 'test_prog is started' ./${tname}.txt | wc -l`
	if [ ${mpi_ret} -eq 0 -a ${started_num} -eq 5 ]; then
		echo "[OK] exec: $i"
	else
		echo "[NG] exec: $i"
		let ng++
	fi
done

echo "** check pe_list_len"
dmesg --notime | grep "mcexec_get_cpuset: pe_list" | tail -n 20 | cut -f 2-3 -d ':' > ./pe_list_len.txt
# Fix: read the file directly instead of piping "cat" into the loop.
# The pipeline ran the loop body in a subshell, so "let ng++" on a bad
# length was lost and the final ${ng} check could never fail.
while read line
do
	len=`echo ${line} | cut -f 2 -d ':'`
	if [ ${len} -ge 0 -a ${len} -le 5 ]; then
		echo "[OK] ${line}"
	else
		echo "[NG] ${line}"
		let ng++
	fi
done < ./pe_list_len.txt

echo "** disable debug message in mcexec_get_cpuset"
sudo sh -c "echo -n 'func mcexec_get_cpuset -p' > /sys/kernel/debug/dynamic_debug/control"

if [ ${ng} -eq 0 ]; then
	echo "*** ${tname} PASSED ******************************"
else
	echo "*** ${tname} FAILED ******************************"
fi
let tid++
echo ""

11
test/issues/929/Makefile Normal file
View File

@ -0,0 +1,11 @@
# Issue #929 test: no binaries to build; the test is driven by C929.sh.
CFLAGS=-g
LDFLAGS=
# intentionally empty — nothing to compile
TARGET=

all: $(TARGET)

test: all
	./C929.sh

clean:
	rm -f $(TARGET) *.o *.txt

36
test/issues/929/README Normal file
View File

@ -0,0 +1,36 @@
【Issue#929 動作確認】
□ テスト内容
1. mpirunで指定する-ppnと、mcexecで指定する-n の指定状況ごとに
想定どおりの動作となることを確認
C929T01:
-ppn == -n の場合に、プログラムが実行され、mpirunが成功する
C929T02:
-ppn > -n の場合に、プログラムの一部が実行され、mpirunが失敗する
C929T03:
-ppn < -n の場合に、プログラムが実行されず、mpirunが失敗する
C929T04:
-ppn が -n の整数倍である場合に、プログラムの一部が実行され、mpirunが失敗する
C929T05:
-ppn と -n がMcKernelに割り当てたCPU数よりも大きい場合に、
プログラムが実行されず、mpirunが失敗する
C929T06:
-ppn == -n での正常実行を20回連続で行った場合に、
プログラムが実行され、mpirunが成功する
また、mcctrlで管理しているpart_exec_list の要素数が5を超えない
□ 実行手順
$ make test
McKernelのインストール先や、OSTEST, LTPの配置場所は、
$HOME/.mck_test_config を参照している
.mck_test_config は、McKernelをビルドした際に生成されるmck_test_config.sample ファイルを
$HOMEにコピーし、適宜編集する
□ 実行結果
x86_64_result.log aarch64_result.log 参照。
すべての項目をPASSしていることを確認。

View File

@ -0,0 +1,99 @@
*** C929T01 start *******************************
mpirun -f ./hostfile -ppn 5 /home/satoken/ihk+mckernel/bin/mcexec -n 5 ./test_prog.sh
test_prog is started.
test_prog is started.
test_prog is started.
test_prog is started.
test_prog is started.
*** C929T01 PASSED ******************************
*** C929T02 start *******************************
mpirun -f ./hostfile -ppn 5 /home/satoken/ihk+mckernel/bin/mcexec -n 3 ./test_prog.sh
getting CPU set for partitioned execution: Invalid argument
getting CPU set for partitioned execution: Invalid argument
test_prog is started.
test_prog is started.
test_prog is started.
*** C929T02 PASSED ******************************
*** C929T03 start *******************************
mpirun -f ./hostfile -ppn 3 /home/satoken/ihk+mckernel/bin/mcexec -n 5 ./test_prog.sh
getting CPU set for partitioned execution: Connection timed out
getting CPU set for partitioned execution: Connection timed out
getting CPU set for partitioned execution: Connection timed out
*** C929T03 PASSED ******************************
*** C929T04 start *******************************
mpirun -f ./hostfile -ppn 6 /home/satoken/ihk+mckernel/bin/mcexec -n 3 ./test_prog.sh
getting CPU set for partitioned execution: Invalid argument
getting CPU set for partitioned execution: Invalid argument
getting CPU set for partitioned execution: Invalid argument
test_prog is started.
test_prog is started.
test_prog is started.
*** C929T04 PASSED ******************************
*** C929T05 start *******************************
mpirun -f ./hostfile -ppn 250 /home/satoken/ihk+mckernel/bin/mcexec -n 250 ./test_prog.sh
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
...
*** C929T05 PASSED ******************************
*** C929T06 start *******************************
** reboot mcrernel for check pe_list_len
mcreboot.sh -c 1-6,29-34 -m 50G@0,50G@1 -r 1-6:0+29-34:28 -O ... done
** enable debug message in mcexec_get_cpuset
mpirun -f ./hostfile -ppn 5 /home/satoken/ihk+mckernel/bin/mcexec -n 5 ./test_prog.sh
[OK] exec: 1
[OK] exec: 2
[OK] exec: 3
[OK] exec: 4
[OK] exec: 5
[OK] exec: 6
[OK] exec: 7
[OK] exec: 8
[OK] exec: 9
[OK] exec: 10
[OK] exec: 11
[OK] exec: 12
[OK] exec: 13
[OK] exec: 14
[OK] exec: 15
[OK] exec: 16
[OK] exec: 17
[OK] exec: 18
[OK] exec: 19
[OK] exec: 20
** check pe_list_len
[OK] pe_list_len:0
[OK] pe_list_len:1
[OK] pe_list_len:2
[OK] pe_list_len:3
[OK] pe_list_len:4
[OK] pe_list_len:5
[OK] pe_list_len:5
[OK] pe_list_len:5
[OK] pe_list_len:5
[OK] pe_list_len:5
[OK] pe_list_len:5
[OK] pe_list_len:5
[OK] pe_list_len:5
[OK] pe_list_len:5
[OK] pe_list_len:5
[OK] pe_list_len:5
[OK] pe_list_len:5
[OK] pe_list_len:5
[OK] pe_list_len:5
[OK] pe_list_len:5
** disable debug message in mcexec_get_cpuset
*** C929T06 PASSED ******************************

1
test/issues/929/hostfile Normal file
View File

@ -0,0 +1 @@
localhost

3
test/issues/929/test_prog.sh Executable file
View File

@ -0,0 +1,3 @@
#!/bin/sh
echo "test_prog is started."

View File

@ -0,0 +1,74 @@
*** C929T01 start *******************************
mpirun -f ./hostfile -ppn 5 /home/satoken/ihk+mckernel/bin/mcexec -n 5 ./test_prog.sh
test_prog is started.
test_prog is started.
test_prog is started.
test_prog is started.
test_prog is started.
*** C929T01 PASSED ******************************
*** C929T02 start *******************************
mpirun -f ./hostfile -ppn 5 /home/satoken/ihk+mckernel/bin/mcexec -n 3 ./test_prog.sh
getting CPU set for partitioned execution: Invalid argument
getting CPU set for partitioned execution: Invalid argument
test_prog is started.
test_prog is started.
test_prog is started.
*** C929T02 PASSED ******************************
*** C929T03 start *******************************
mpirun -f ./hostfile -ppn 3 /home/satoken/ihk+mckernel/bin/mcexec -n 5 ./test_prog.sh
getting CPU set for partitioned execution: Connection timed out
getting CPU set for partitioned execution: Connection timed out
getting CPU set for partitioned execution: Connection timed out
*** C929T03 PASSED ******************************
*** C929T04 start *******************************
mpirun -f ./hostfile -ppn 6 /home/satoken/ihk+mckernel/bin/mcexec -n 3 ./test_prog.sh
getting CPU set for partitioned execution: Invalid argument
getting CPU set for partitioned execution: Invalid argument
getting CPU set for partitioned execution: Invalid argument
test_prog is started.
test_prog is started.
test_prog is started.
*** C929T04 PASSED ******************************
*** C929T05 start *******************************
mpirun -f ./hostfile -ppn 250 /home/satoken/ihk+mckernel/bin/mcexec -n 250 ./test_prog.sh
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
...
*** C929T05 PASSED ******************************
*** C929T06 start *******************************
mpirun -f ./hostfile -ppn 5 /home/satoken/ihk+mckernel/bin/mcexec -n 5 ./test_prog.sh
[OK] exec: 1
[OK] exec: 2
[OK] exec: 3
[OK] exec: 4
[OK] exec: 5
[OK] exec: 6
[OK] exec: 7
[OK] exec: 8
[OK] exec: 9
[OK] exec: 10
[OK] exec: 11
[OK] exec: 12
[OK] exec: 13
[OK] exec: 14
[OK] exec: 15
[OK] exec: 16
[OK] exec: 17
[OK] exec: 18
[OK] exec: 19
[OK] exec: 20
*** C929T06 PASSED ******************************

View File

@ -1179,6 +1179,7 @@ pgshift_to_string(int pgshift)
case 21: return "2M";
case 29: return "512M";
case 30: return "1G";
case 34: return "16G";
case 39: return "512G";
case 42: return "4T";
case 55: return "32P";
@ -1645,9 +1646,8 @@ static void
cmd_mcinfo(void)
{
#ifdef x86
#endif
fprintf(fp, "LINUX_PAGE_OFFSET: 0x%lx\n", LINUX_PAGE_OFFSET);
#endif
#ifdef ARM64
fprintf(fp, "V2PHYS_OFFSET: 0x%lx\n", V2PHYS_OFFSET);

Binary file not shown.

View File

@ -0,0 +1,729 @@
/*
* Trivial dwarf parser to extract part of a struct from debug infos
*
* Author: Dominique Martinet <dominique.martinet@cea.fr>
* License: WTFPLv2
*
* Canonical source: http://cgit.notk.org/asmadeus/dwarf-extract-struct.git
*/
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdio.h>
#include <strings.h>
#include <errno.h>
#include "libdwarf/dwarf.h"
#include "libdwarf/libdwarf.h"
static void parse_dwarf(Dwarf_Debug dbg, const char *struct_name,
const char *field_names[], int field_count);
static void find_struct(Dwarf_Debug dbg, Dwarf_Die die, const char *struct_name,
const char *field_names[], int field_count, int level);
static void find_fields(Dwarf_Debug dbg, Dwarf_Die struct_die, Dwarf_Die die,
const char *struct_name, const char *field_names[],
int field_count, int level);
static void print_field(Dwarf_Debug dbg, Dwarf_Die die, const char *field_name,
int pad_num);
int debug = 0;
/* Print the command-line synopsis to stderr. */
void usage(const char *argv[]) {
	const char *prog = argv[0];

	fprintf(stderr, "%s debug_file struct_name [field [field...]]\n",
		prog);
}
int main(int argc, const char *argv[]) {
Dwarf_Debug dbg = 0;
int fd = -1;
const char *filepath;
const char *struct_name;
int res = DW_DLV_ERROR;
Dwarf_Error error;
Dwarf_Handler errhand = 0;
Dwarf_Ptr errarg = 0;
if(argc < 3) {
usage(argv);
exit(1);
}
filepath = argv[1];
struct_name = argv[2];
fd = open(filepath,O_RDONLY);
if(fd < 0) {
fprintf(stderr, "Failure attempting to open %s\n",filepath);
}
res = dwarf_init(fd, DW_DLC_READ, errhand, errarg, &dbg, &error);
if(res != DW_DLV_OK) {
fprintf(stderr, "Giving up, cannot do DWARF processing\n");
exit(1);
}
parse_dwarf(dbg, struct_name, argv + 3, argc - 3);
res = dwarf_finish(dbg,&error);
if(res != DW_DLV_OK) {
fprintf(stderr, "dwarf_finish failed!\n");
}
close(fd);
return 0;
}
/*
 * Walk every unit header in the debug information and scan each unit's
 * DIE tree for the requested struct. The outer loop runs is_info = 0
 * and 1 so both kinds of units handled by dwarf_next_cu_header_c() are
 * covered. On a full match the process exits from within find_struct()/
 * find_fields(); reaching the end of this function means the struct was
 * never found, so report that and exit(2).
 */
static void parse_dwarf(Dwarf_Debug dbg, const char *struct_name,
		const char *field_names[], int field_count) {
	Dwarf_Bool is_info = 1;
	Dwarf_Unsigned cu_length;
	Dwarf_Half cu_version;
	Dwarf_Off cu_abbrev_offset;
	Dwarf_Half cu_pointer_size;
	Dwarf_Half cu_offset_size;
	Dwarf_Half cu_extension_size;
	Dwarf_Sig8 type_signature;
	Dwarf_Unsigned type_offset;
	Dwarf_Unsigned cu_next_offset;
	Dwarf_Error err;
	int rc;

	for (is_info = 0; is_info < 2; ++is_info) {
		/* Advance unit by unit until libdwarf reports no more. */
		for (;;) {
			Dwarf_Die unit_die;

			rc = dwarf_next_cu_header_c(dbg, is_info, &cu_length,
				&cu_version, &cu_abbrev_offset,
				&cu_pointer_size, &cu_offset_size,
				&cu_extension_size, &type_signature,
				&type_offset, &cu_next_offset, &err);
			if (rc == DW_DLV_NO_ENTRY)
				break;
			if (rc != DW_DLV_OK) {
				fprintf(stderr, "error dwarf_next_cu_header_c: %d %s\n",
					rc, dwarf_errmsg(err));
				exit(1);
			}

			/* A NULL die argument yields the unit's root DIE. */
			rc = dwarf_siblingof(dbg, NULL, &unit_die, &err);
			if (rc != DW_DLV_OK) {
				fprintf(stderr, "first dwarf_siblingof failed: %d %s\n",
					rc, dwarf_errmsg(err));
				exit(1);
			}

			find_struct(dbg, unit_die, struct_name, field_names,
				field_count, 0);
		}
	}
	fprintf(stderr, "struct %s not found\n", struct_name);
	exit(2);
}
/*
 * Depth-limited scan of a DIE tree (level > 1 is pruned, so only unit
 * root DIEs and their direct children are visited) looking for a
 * DW_TAG_structure_type whose name equals struct_name (case-
 * insensitive). On a match, the struct's children are handed to
 * find_fields(), which exits(0) once every requested field has been
 * printed; if find_fields() returns, some fields were missing and we
 * list them and exit(3).
 */
static void find_struct(Dwarf_Debug dbg, Dwarf_Die die, const char *struct_name,
const char *field_names[], int field_count, int level) {
Dwarf_Die next;
Dwarf_Error err;
int rc;
/* Do not descend past the direct children of the unit root. */
if (level > 1)
return;
do {
char *name;
const char *tag_name;
Dwarf_Half tag;
/* A DIE name is optional (e.g. anonymous types). */
rc = dwarf_diename(die, &name, &err);
if (rc == DW_DLV_NO_ENTRY) {
name = NULL;
} else if (rc != DW_DLV_OK) {
fprintf(stderr, "dwarf_diename error: %d %s\n",
rc, dwarf_errmsg(err));
exit(1);
}
if (debug) {
printf("diename: %s\n", name);
}
rc = dwarf_tag(die, &tag, &err);
if (rc != DW_DLV_OK) {
fprintf(stderr, "dwarf_tag error: %d %s\n",
rc, dwarf_errmsg(err));
exit(1);
}
if (debug) {
rc = dwarf_get_TAG_name(tag, &tag_name);
if (rc != DW_DLV_OK) {
fprintf(stderr,
"dwarf_get_TAG_name error: %d\n", rc);
exit(1);
}
printf("<%d> %p <%d> %s: %s\n", level, die, tag,
tag_name, name ? name : "<no name>");
}
/* Fetch the first child, if any; only DIEs with children can
 * be a struct definition with members. */
rc = dwarf_child(die, &next, &err);
if (rc == DW_DLV_ERROR) {
fprintf(stderr, "dwarf_child error: %d %s\n",
rc, dwarf_errmsg(err));
exit(1);
}
if (rc == DW_DLV_OK) {
if (tag == DW_TAG_structure_type
&& name && strcasecmp(name, struct_name) == 0) {
find_fields(dbg, die, next, struct_name,
field_names, field_count,
level + 1);
/* find_fields() exits(0) when every field was
 * printed, so reaching this point means some
 * are still unmatched: list them and give up. */
fprintf(stderr,
"Found struct %s but it did not have all members given!\nMissing:\n",
struct_name);
for (rc = 0; rc < field_count; rc++) {
if (field_names[rc])
fprintf(stderr, "%s\n",
field_names[rc]);
}
exit(3);
}
/* Not our struct: recurse into the children. */
find_struct(dbg, next, struct_name, field_names,
field_count, level + 1);
dwarf_dealloc(dbg, next, DW_DLA_DIE);
}
/* Advance to the next sibling, releasing the current DIE and
 * its name string first. */
rc = dwarf_siblingof(dbg, die, &next, &err);
dwarf_dealloc(dbg, die, DW_DLA_DIE);
if (name)
dwarf_dealloc(dbg, name, DW_DLA_STRING);
if (rc != DW_DLV_OK)
break;
die = next;
} while (die);
}
/*
 * Read a struct member's byte offset (DW_AT_data_member_location) into
 * *poffset.
 *
 * Returns DW_DLV_OK on success, or the libdwarf return code when the
 * attribute is absent or unreadable; exits(5) on unsupported attribute
 * forms or a negative offset.
 */
static int dwarf_get_offset(Dwarf_Debug dbg, Dwarf_Die die,
		int *poffset, Dwarf_Error *perr) {
	Dwarf_Attribute attr;
	Dwarf_Unsigned offset;
	Dwarf_Half form;
	int rc;

	rc = dwarf_attr(die, DW_AT_data_member_location, &attr, perr);
	if (rc != DW_DLV_OK) {
		return rc;
	}

	rc = dwarf_whatform(attr, &form, perr);
	if (rc != DW_DLV_OK) {
		fprintf(stderr, "Error getting whatform: %s\n",
			dwarf_errmsg(*perr));
		exit(5);
	}

	/* BUG FIX: DW_FORM_data2 was tested twice in the original
	 * (copy-paste slip); the list now names each form once. */
	if (form == DW_FORM_data1 || form == DW_FORM_data2
	    || form == DW_FORM_data4 || form == DW_FORM_data8
	    || form == DW_FORM_udata) {
		/* Plain unsigned constant. */
		dwarf_formudata(attr, &offset, 0);
	} else if (form == DW_FORM_sdata) {
		Dwarf_Signed soffset;

		dwarf_formsdata(attr, &soffset, 0);
		if (soffset < 0) {
			fprintf(stderr,
				"unsupported negative offset\n");
			exit(5);
		}
		offset = (Dwarf_Unsigned) soffset;
	} else {
		/* Otherwise expect a one-op location expression of the
		 * form DW_OP_plus_uconst <offset>. */
		Dwarf_Locdesc **locdescs;
		Dwarf_Signed len;

		if (dwarf_loclist_n(attr, &locdescs, &len, perr)
				== DW_DLV_ERROR) {
			fprintf(stderr, "unsupported member offset\n");
			exit(5);
		}
		if (len != 1
		    || locdescs[0]->ld_cents != 1
		    || (locdescs[0]->ld_s[0]).lr_atom
				!= DW_OP_plus_uconst) {
			fprintf(stderr,
				"unsupported location expression\n");
			exit(5);
		}
		offset = (locdescs[0]->ld_s[0]).lr_number;
	}
	dwarf_dealloc(dbg, attr, DW_DLA_ATTR);
	*poffset = (int) offset;
	return DW_DLV_OK;
}
/*
 * Read a DIE's byte size (DW_AT_byte_size) into *psize.
 *
 * Returns DW_DLV_OK on success, or the libdwarf return code when the
 * attribute is absent or unreadable; exits(5) on unsupported attribute
 * forms or a negative size.
 */
static int dwarf_get_size(Dwarf_Debug dbg, Dwarf_Die die,
		int *psize, Dwarf_Error *perr) {
	Dwarf_Attribute attr;
	Dwarf_Unsigned size;
	Dwarf_Half form;
	int rc;

	rc = dwarf_attr(die, DW_AT_byte_size, &attr, perr);
	if (rc != DW_DLV_OK) {
		return rc;
	}

	rc = dwarf_whatform(attr, &form, perr);
	if (rc != DW_DLV_OK) {
		fprintf(stderr, "Error getting whatform: %s\n",
			dwarf_errmsg(*perr));
		exit(5);
	}

	/* BUG FIX: DW_FORM_data2 was tested twice in the original
	 * (copy-paste slip); the list now names each form once. */
	if (form == DW_FORM_data1 || form == DW_FORM_data2
	    || form == DW_FORM_data4 || form == DW_FORM_data8
	    || form == DW_FORM_udata) {
		/* Plain unsigned constant. */
		dwarf_formudata(attr, &size, 0);
	} else if (form == DW_FORM_sdata) {
		Dwarf_Signed ssize;

		dwarf_formsdata(attr, &ssize, 0);
		if (ssize < 0) {
			fprintf(stderr,
				"unsupported negative size\n");
			exit(5);
		}
		size = (Dwarf_Unsigned) ssize;
	} else {
		/* Otherwise expect a one-op location expression of the
		 * form DW_OP_plus_uconst <size>. */
		Dwarf_Locdesc **locdescs;
		Dwarf_Signed len;

		if (dwarf_loclist_n(attr, &locdescs, &len, perr)
				== DW_DLV_ERROR) {
			fprintf(stderr, "unsupported member size\n");
			exit(5);
		}
		if (len != 1
		    || locdescs[0]->ld_cents != 1
		    || (locdescs[0]->ld_s[0]).lr_atom
				!= DW_OP_plus_uconst) {
			fprintf(stderr,
				"unsupported location expression\n");
			exit(5);
		}
		size = (locdescs[0]->ld_s[0]).lr_number;
	}
	dwarf_dealloc(dbg, attr, DW_DLA_ATTR);
	*psize = (int) size;
	return DW_DLV_OK;
}
/*
 * Compute the element count of an array type DIE: read the subrange
 * child's DW_AT_lower_bound (assumed 0 when absent) and
 * DW_AT_upper_bound, and store upper - lower + 1 into *psize.
 *
 * Returns DW_DLV_OK on success or the libdwarf return code on missing
 * child/bound; exits(5) on unsupported attribute forms.
 */
static int dwarf_get_arraysize(Dwarf_Debug dbg, Dwarf_Die die,
		int *psize, Dwarf_Error *perr) {
	Dwarf_Attribute attr;
	Dwarf_Unsigned lower_bound, upper_bound;
	int rc;
	Dwarf_Die child;
	Dwarf_Half form;

	/* The bounds live on the array DIE's subrange child. */
	rc = dwarf_child(die, &child, perr);
	if (rc == DW_DLV_NO_ENTRY) {
		fprintf(stderr,
			"Could not deref child of array: no entry\n");
		return rc;
	}
	if (rc != DW_DLV_OK) {
		fprintf(stderr,
			"Could not get child entry of array: %s\n",
			dwarf_errmsg(*perr));
		return rc;
	}

	rc = dwarf_attr(child, DW_AT_lower_bound, &attr, perr);
	/* Not present? Assume zero */
	if (rc != DW_DLV_OK) {
		lower_bound = 0;
		goto upper;
	}
	rc = dwarf_whatform(attr, &form, perr);
	if (rc != DW_DLV_OK) {
		fprintf(stderr, "Error getting whatform: %s\n",
			dwarf_errmsg(*perr));
		exit(5);
	}
	/* BUG FIX: DW_FORM_data2 was tested twice in the original. */
	if (form == DW_FORM_data1 || form == DW_FORM_data2
	    || form == DW_FORM_data4 || form == DW_FORM_data8
	    || form == DW_FORM_udata) {
		dwarf_formudata(attr, &lower_bound, 0);
	} else if (form == DW_FORM_sdata) {
		Dwarf_Signed ssize;

		dwarf_formsdata(attr, &ssize, 0);
		if (ssize < 0) {
			fprintf(stderr,
				"unsupported negative size\n");
			exit(5);
		}
		lower_bound = (Dwarf_Unsigned) ssize;
	} else {
		Dwarf_Locdesc **locdescs;
		Dwarf_Signed len;

		if (dwarf_loclist_n(attr, &locdescs, &len, perr)
				== DW_DLV_ERROR) {
			fprintf(stderr, "unsupported member size\n");
			exit(5);
		}
		if (len != 1
		    || locdescs[0]->ld_cents != 1
		    || (locdescs[0]->ld_s[0]).lr_atom
				!= DW_OP_plus_uconst) {
			fprintf(stderr,
				"unsupported location expression\n");
			exit(5);
		}
		lower_bound = (locdescs[0]->ld_s[0]).lr_number;
	}
	dwarf_dealloc(dbg, attr, DW_DLA_ATTR);
upper:
	rc = dwarf_attr(child, DW_AT_upper_bound, &attr, perr);
	if (rc != DW_DLV_OK) {
		/* BUG FIX: the subrange child DIE leaked on this path. */
		dwarf_dealloc(dbg, child, DW_DLA_DIE);
		return rc;
	}
	rc = dwarf_whatform(attr, &form, perr);
	if (rc != DW_DLV_OK) {
		fprintf(stderr, "Error getting whatform: %s\n",
			dwarf_errmsg(*perr));
		exit(5);
	}
	/* BUG FIX: DW_FORM_data2 was tested twice in the original. */
	if (form == DW_FORM_data1 || form == DW_FORM_data2
	    || form == DW_FORM_data4 || form == DW_FORM_data8
	    || form == DW_FORM_udata) {
		dwarf_formudata(attr, &upper_bound, 0);
	} else if (form == DW_FORM_sdata) {
		Dwarf_Signed ssize;

		dwarf_formsdata(attr, &ssize, 0);
		if (ssize < 0) {
			fprintf(stderr,
				"unsupported negative size\n");
			exit(5);
		}
		upper_bound = (Dwarf_Unsigned) ssize;
	} else {
		Dwarf_Locdesc **locdescs;
		Dwarf_Signed len;

		if (dwarf_loclist_n(attr, &locdescs, &len, perr)
				== DW_DLV_ERROR) {
			fprintf(stderr, "unsupported member size\n");
			exit(5);
		}
		if (len != 1
		    || locdescs[0]->ld_cents != 1
		    || (locdescs[0]->ld_s[0]).lr_atom
				!= DW_OP_plus_uconst) {
			fprintf(stderr,
				"unsupported location expression\n");
			exit(5);
		}
		upper_bound = (locdescs[0]->ld_s[0]).lr_number;
	}
	dwarf_dealloc(dbg, attr, DW_DLA_ATTR);
	/* BUG FIX: release the subrange child DIE (previously leaked). */
	dwarf_dealloc(dbg, child, DW_DLA_DIE);
	*psize = ((int)upper_bound - (int)lower_bound + 1);
	return DW_DLV_OK;
}
/*
 * Follow a DIE's DW_AT_type reference and return the referenced type
 * DIE in *new_type_die; when ptype_tag is non-NULL also fetch that
 * DIE's tag into it.
 *
 * Returns a libdwarf return code; DW_DLV_NO_ENTRY means the DIE has no
 * DW_AT_type attribute (the caller treats this as void* in the pointer
 * chain).
 */
static int deref_type(Dwarf_Debug dbg, Dwarf_Die type_die,
		Dwarf_Die *new_type_die, Dwarf_Half *ptype_tag,
		Dwarf_Error *perr) {
	Dwarf_Attribute pointer_attr;
	Dwarf_Off pointer_off;
	int rc;

	rc = dwarf_attr(type_die, DW_AT_type, &pointer_attr, perr);
	if (rc != DW_DLV_OK)
		return rc;

	rc = dwarf_global_formref(pointer_attr, &pointer_off, perr);
	if (rc != DW_DLV_OK) {
		/* BUG FIX: the attribute leaked on this error path. */
		dwarf_dealloc(dbg, pointer_attr, DW_DLA_ATTR);
		return rc;
	}

	rc = dwarf_offdie_b(dbg, pointer_off, 1, new_type_die, perr);
	/* BUG FIX: dealloc unconditionally; the attribute leaked when
	 * dwarf_offdie_b() failed. */
	dwarf_dealloc(dbg, pointer_attr, DW_DLA_ATTR);
	if (rc != DW_DLV_OK)
		return rc;

	if (ptype_tag)
		rc = dwarf_tag(*new_type_die, ptype_tag, perr);
	return rc;
}
/*
 * Emit the C definition of the matched struct: a union of a
 * char whole_struct[<byte size>] and one anonymous padded struct per
 * requested field. Iterates the struct's member DIEs (starting at
 * 'die', the first child of 'struct_die'); each member whose name
 * matches an entry in field_names[] (case-insensitive) is printed via
 * print_field() and its slot is NULLed out. When every requested field
 * has been printed the closing braces are emitted and the process
 * exits(0). Returning from this function means fields were missing
 * (the caller reports them).
 */
static void find_fields(Dwarf_Debug dbg, Dwarf_Die struct_die, Dwarf_Die die,
const char *struct_name, const char *field_names[],
int field_count, int level) {
Dwarf_Die next;
Dwarf_Error err;
int rc, i, printed_count = 0;
int size;
printf("struct %s {\n\tunion {\n",
struct_name);
/* whole_struct keeps the emitted struct at its real size even
 * though only a subset of members is declared. */
rc = dwarf_get_size(dbg, struct_die, &size, &err);
if (rc != DW_DLV_OK) {
fprintf(stderr, "could not get size for struct %s: %s\n",
struct_name, dwarf_errmsg(err));
exit(1);
}
printf("\t\tchar whole_struct[%d];\n", size);
do {
char *name;
const char *tag_name;
Dwarf_Half tag;
/* Member name is optional (anonymous members). */
rc = dwarf_diename(die, &name, &err);
if (rc == DW_DLV_NO_ENTRY) {
name = NULL;
} else if (rc != DW_DLV_OK) {
fprintf(stderr, "dwarf_diename error: %d %s\n",
rc, dwarf_errmsg(err));
exit(1);
}
rc = dwarf_tag(die, &tag, &err);
if (rc != DW_DLV_OK) {
fprintf(stderr, "dwarf_tag error: %d %s\n",
rc, dwarf_errmsg(err));
exit(1);
}
if (debug) {
rc = dwarf_get_TAG_name(tag, &tag_name);
if (rc != DW_DLV_OK) {
fprintf(stderr,
"dwarf_get_TAG_name error: %d\n", rc);
exit(1);
}
printf("<%d> %p <%d> %s: %s\n", level, die, tag,
tag_name, name ? name : "<no name>");
}
if (tag == DW_TAG_member && name) {
for (i = 0; i < field_count; i++) {
if (!field_names[i])
continue;
if (strcasecmp(name, field_names[i]) == 0) {
print_field(dbg, die, field_names[i],
printed_count);
/* Mark this field as done. */
field_names[i] = NULL;
printed_count++;
break;
}
}
/* All requested fields printed: finish the output
 * and stop the whole program successfully. */
if (printed_count == field_count) {
printf("\t};\n};\n");
exit(0);
}
}
/* Advance to the next sibling, releasing the current DIE
 * and its name string first. */
rc = dwarf_siblingof(dbg, die, &next, &err);
dwarf_dealloc(dbg, die, DW_DLA_DIE);
if (name)
dwarf_dealloc(dbg, name, DW_DLA_STRING);
if (rc != DW_DLV_OK)
break;
die = next;
} while (die);
}
/*
 * Print one member of the emitted struct as an anonymous padded struct:
 *
 *     struct { char padding<N>[<offset>]; <type> <name>[<array>]; };
 *
 * The member's offset comes from dwarf_get_offset(); its type is
 * resolved by chasing DW_AT_type references: pointer DIEs accumulate
 * '*' characters, an array DIE contributes an [N] suffix via
 * dwarf_get_arraysize(), and the final DIE must be a struct, enum,
 * base type, typedef, or void pointer. Exits with 4/6/7 on the various
 * lookup failures.
 */
static void print_field(Dwarf_Debug dbg, Dwarf_Die die, const char *field_name,
int padnum) {
Dwarf_Attribute attr;
Dwarf_Error err;
int offset = 0;
char type_buf[1024];
char array_buf[128] = "";
char pointer_buf[128] = "";
int rc;
/* Member offset within the struct; absent offset is treated as 0. */
rc = dwarf_get_offset(dbg, die, &offset, &err);
if (rc == DW_DLV_NO_ENTRY) {
fprintf(stderr, "Found %s but no offset, assuming 0\n",
field_name);
} else if (rc != DW_DLV_OK) {
fprintf(stderr, "Error getting dwarf attr offset: %s\n",
dwarf_errmsg(err));
exit(4);
}
rc = dwarf_attr(die, DW_AT_type, &attr, &err);
if (rc == DW_DLV_NO_ENTRY) {
fprintf(stderr,
"Found %s but no type, can't assume that one out..\n",
field_name);
exit(6);
} else if (rc != DW_DLV_OK) {
fprintf(stderr, "Error getting dwarf attrlist: %s\n",
dwarf_errmsg(err));
exit(6);
} else {
Dwarf_Die type_die, next;
Dwarf_Off type_off;
Dwarf_Half type_tag;
char *type_name;
int pointer = 0;
int embeded_struct = 0;
/* Resolve the DW_AT_type reference into the type's DIE. */
rc = dwarf_global_formref(attr, &type_off, &err);
if (rc != DW_DLV_OK) {
fprintf(stderr,
"Error getting ref offset for type: %s\n",
dwarf_errmsg(err));
exit(7);
}
rc = dwarf_offdie_b(dbg, type_off, 1, &type_die, &err);
if (rc != DW_DLV_OK) {
fprintf(stderr,
"Error getting die from offset for type: %s\n",
dwarf_errmsg(err));
exit(7);
}
rc = dwarf_tag(type_die, &type_tag, &err);
if (rc != DW_DLV_OK) {
fprintf(stderr, "dwarf_tag error: %d %s\n",
rc, dwarf_errmsg(err));
exit(7);
}
/* Chase pointer chains, collecting one '*' per level.
 * NOTE(review): pointer_buf holds up to 128 levels and is not
 * bounds-checked here. */
while (type_tag == DW_TAG_pointer_type) {
pointer_buf[pointer++] = '*';
rc = deref_type(dbg, type_die, &next,
&type_tag, &err);
/* No entry here means void* */
if (rc == DW_DLV_NO_ENTRY)
break;
if (rc != DW_DLV_OK) {
fprintf(stderr,
"Could not deref type for %s: %s\n",
field_name, dwarf_errmsg(err));
exit(7);
}
dwarf_dealloc(dbg, type_die, DW_DLA_DIE);
type_die = next;
}
/* Arrays: element type is the deref'd DIE, the count goes
 * into array_buf as "[N]". */
if (type_tag == DW_TAG_array_type) {
int next_offset, size;
rc = deref_type(dbg, type_die, &next,
&type_tag, &err);
if (rc == DW_DLV_NO_ENTRY) {
fprintf(stderr,
"Could not deref array type for %s: no entry\n",
field_name);
exit(7);
}
if (rc != DW_DLV_OK) {
fprintf(stderr,
"Could not deref type for %s: %s\n",
field_name, dwarf_errmsg(err));
exit(7);
}
rc = dwarf_get_arraysize(dbg, type_die, &size, &err);
if (rc != DW_DLV_OK) {
fprintf(stderr,
"Could not get array size for %s: %s\n",
field_name, dwarf_errmsg(err));
exit(7);
}
type_die = next;
snprintf(array_buf, 128, "[%d]", size);
}
/* If it's still pointer at this point, it's void * */
if (type_tag != DW_TAG_pointer_type) {
rc = dwarf_diename(type_die, &type_name, &err);
if (rc != DW_DLV_OK) {
#if 0
fprintf(stderr, "dwarf_diename error: %s\n",
rc == DW_DLV_NO_ENTRY ?
"no name" : dwarf_errmsg(err));
const char *tag_name;
rc = dwarf_get_TAG_name(type_tag, &tag_name);
if (rc != DW_DLV_OK) {
fprintf(stderr,
"dwarf_get_TAG_name error: %d\n",
rc);
}
fprintf(stderr, "Bad tag %s (%d)?\n",
tag_name, type_tag);
exit(7);
#endif
/* Nameless type DIE: assume an anonymous/embedded
 * struct and let the user fill the name in. */
if (rc == DW_DLV_NO_ENTRY) {
embeded_struct = 1;
}
}
}
/* Render the declaration's type part. */
if (type_tag == DW_TAG_structure_type) {
snprintf(type_buf, 1024, "struct %s %s",
embeded_struct ? "FILL_IN_MANUALLY" : type_name, pointer_buf);
} else if (type_tag == DW_TAG_enumeration_type) {
snprintf(type_buf, 1024, "enum %s %s",
type_name, pointer_buf);
} else if (type_tag == DW_TAG_base_type
|| type_tag == DW_TAG_typedef) {
snprintf(type_buf, 1024, "%s %s", type_name,
pointer_buf);
} else if (type_tag == DW_TAG_pointer_type) {
snprintf(type_buf, 1024, "void %s", pointer_buf);
} else {
const char *tag_name;
rc = dwarf_get_TAG_name(type_tag, &tag_name);
if (rc != DW_DLV_OK) {
fprintf(stderr,
"dwarf_get_TAG_name error: %d\n", rc);
}
fprintf(stderr,
"Type tag %s (%d) is not implemented, please add it\n",
tag_name, type_tag);
exit(7);
}
if (type_tag != DW_TAG_pointer_type)
dwarf_dealloc(dbg, type_name, DW_DLA_STRING);
dwarf_dealloc(dbg, attr, DW_DLA_ATTR);
dwarf_dealloc(dbg, type_die, DW_DLA_DIE);
}
/* Pad up to the member's offset inside the enclosing union, then
 * declare the member itself. */
printf("\t\tstruct {\n\t\t\tchar padding%i[%u];\n\t\t\t%s%s%s;\n\t\t};\n",
padnum, (unsigned int) offset,
type_buf, field_name, array_buf);
}