Compare commits

...

161 Commits

Author SHA1 Message Date
8f117cc0dc configure.ac: Update version number to 1.5.1-knl+hfi
Change-Id: Icbd08c9c5f65b22d007ec479a34acd20062e0e90
2019-05-14 17:22:33 +09:00
0b9a657a01 HFI: support IFS 10.8-0
Change-Id: Iebc0e2b50faf464efcc5134cc40dc52e0bd6eea7
2019-04-15 11:26:39 +09:00
c2d6651cd2 mcreboot: remove MCDRAM offline/online
Change-Id: Ia30180b4890508d041fc64ca35e1a9c58d903ddf
2019-04-15 11:26:39 +09:00
d979444049 file_ops: add missing break statement (harmless)
Change-Id: I97982c96623b571d94348fd4a3df6bb0aeb515e9
2018-07-26 05:06:16 +00:00
faa357d5a6 Merge "configure.ac: Update version number to 1.5.0-knl+hfi" into development+unimap+hfi+OFP 2018-06-21 02:39:43 +00:00
653aba17a1 mcreboot: load kernel modules from under /tmp
Change-Id: I81a8c451b6dd556a00699a2c8d0c7df5a99e4ea2
2018-06-20 20:53:00 +09:00
7736e25ca4 mpimcexec: fix empty ${COMMAND} check
Change-Id: I9e37e952fb756a4aafb4b2e218844120fe59af7b
2018-06-20 20:50:33 +09:00
73d16a9d79 configure.ac: Update version number to 1.5.0-knl+hfi
Change-Id: I9d36bcfe4b64a772f6492e39a1466a2e73ddd682
2018-06-20 17:07:30 +09:00
922bd7e6eb mpimcexec: use PJM_PROC_BY_NODE if available
Change-Id: Id8991f78e4d3bdfbb20adf202b43762a0d915c47
2018-06-20 15:18:53 +09:00
0d99072109 mpimcexec: man page proof-reading
Change-Id: I58223dd86e17fa896fe3e258d2dc2e5b881a0072
2018-06-18 16:31:42 +09:00
3ced3f6080 mcexec: Options -m and -M are described in man page
Change-Id: Ie4a860c8753af654ee842b16aabb9620e68f71a1
2018-06-18 15:00:29 +09:00
d9ff940528 mpimcexec: Man page
Change-Id: I99ea2821500cc1cfadc912d93c88d308b92ed9cf
2018-06-18 14:59:40 +09:00
cd63ec877d mpimcexec: Error handling is added
Change-Id: Id4e94adad2afff324b154d0c8be270ecc7568bab
2018-06-18 14:59:18 +09:00
6c0bb9e576 HFI1: Range-check proc->fd_priv_table[]
sockioctl01.c in LTP calls ioctl(1025, ...) and causes a kernel page fault without
the range check.

Change-Id: I4117783e20107f274c0857b09745f12a5cc5ce2f
2018-06-13 00:31:44 +09:00
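A minimal sketch of the kind of bounds check this commit describes; the table size, struct layout, and helper name below are hypothetical, not McKernel's actual definitions.

#include <stddef.h>

#define FD_PRIV_TABLE_SIZE 1024              /* hypothetical table size */

struct proc {
        void *fd_priv_table[FD_PRIV_TABLE_SIZE]; /* per-fd driver private data */
};

/* Reject user-controlled fds (e.g. 1025 from sockioctl01) before indexing. */
static void *fd_priv_lookup(struct proc *proc, int fd)
{
        if (fd < 0 || fd >= FD_PRIV_TABLE_SIZE)
                return NULL;
        return proc->fd_priv_table[fd];
}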
ca9894108b OFP: mpimcexec: use MPI_LOCALNRANKS for ppn if available 2018-06-13 00:31:44 +09:00
3f26e44f85 mremap: Don't premap destination vm_range
mremap works in the following steps:
(1) Unmap the destination memory area
(2) Create a new vm_range with add_process_memory_range
(3) Move the PTEs of the source range to the destination range by using move_pte_range

The problem is that step (3) expects the destination to have no physical pages mapped,
but step (2) premaps the destination when the anonymous-map premapping optimization
is turned on.

Change-Id: Ieeebd799b7169b9a6f6f658c204c31f49817030f
2018-06-13 00:31:44 +09:00
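The functions above are McKernel internals, but the user-visible contract being fixed can be sketched with the plain Linux mremap(2) interface: when a mapping is moved onto a fixed destination, the destination must end up backed by the source's pages rather than by freshly premapped ones. A minimal user-space sketch:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 2 * 1024 * 1024;
        char *src = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        char *dst_hint = mmap(NULL, len, PROT_READ | PROT_WRITE,
                              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (src == MAP_FAILED || dst_hint == MAP_FAILED)
                return 1;
        memset(src, 'x', len);  /* populate the source pages */

        /* Steps (1)-(3) above all happen inside the kernel during this call. */
        char *dst = mremap(src, len, len, MREMAP_MAYMOVE | MREMAP_FIXED, dst_hint);
        if (dst == MAP_FAILED) { perror("mremap"); return 1; }

        printf("moved to %p, first byte: %c\n", (void *)dst, dst[0]);
        return 0;
}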
bacfb0c2b9 OFP: mpimcexec wrapper around mpirun for OFP users 2018-06-13 00:31:43 +09:00
09f63483cc OFP: temporary ANON mmap() rewrite 2018-06-13 00:31:43 +09:00
2f0c2aae9e OFP: avoid drop_caches in mcreboot 2018-06-13 00:31:43 +09:00
f7b277a623 HFI1: use ihk_mc_pt_lookup_fault_pte() in SDMA/exp receive 2018-06-13 00:31:43 +09:00
a3aa96af19 MM: introduction of ihk_mc_pt_lookup_fault_pte() 2018-06-13 00:31:43 +09:00
91d732308d HFI1: shorten lock held spin for SDMA status changes 2018-06-13 00:31:43 +09:00
166c6105ef queued_spin_lock: fix compatibility with Linux 2018-06-13 00:31:43 +09:00
5a2f8388a6 HFI1: handle Linux queued_spin_locks in the receive path as well 2018-06-13 00:31:42 +09:00
8164b63fc2 HFI1: port to IFS 10.7 rpv1 and support queued_spin_lock in Linux 3.10.0-693.11.6 2018-06-13 00:31:42 +09:00
af22ce62d2 HFI1: clean up and eliminate dead code in user SDMA 2018-06-13 00:31:42 +09:00
2eca75ead8 HFI1: clean up dead code in file ops 2018-06-13 00:31:42 +09:00
22992780cf HFI1: use kmalloc_cache_free() in clear_tid_node() for TID nodes 2018-06-13 00:31:42 +09:00
3043591e9a hfi1_user_exp_rcv_overlapping(): fix return value when overlapping 2018-06-13 00:31:42 +09:00
7e7c0f9ed3 init_process_vm(): remove vm_range_numa_policy_list (merge fix) 2018-06-13 00:31:42 +09:00
7193f165cc HFI1: fix page border iteration bug in hfi1_user_exp_rcv_setup() 2018-06-13 00:31:42 +09:00
c8c42576fd HFI1: increase lock timeout in sdma_send_txlist() 2018-06-13 00:31:42 +09:00
0412e1fcc6 HFI1: add generated user_sdma_request and user_sdma_txreq headers 2018-06-13 00:31:41 +09:00
238e346586 HFI1: use DWARF generated headers for user_sdma_request and user_sdma_txreq 2018-06-13 00:31:41 +09:00
0e57c715ad HFI1: look at DW_AT_upper_bound for resolving array size from DWARF info 2018-06-13 00:31:41 +09:00
3facd3dcca HFI1: release lock in sdma_send_txlist() when SDMA ring is full 2018-06-13 00:31:41 +09:00
ec5328de69 HFI1: refactor sdma_select_user_engine() 2018-06-13 00:31:41 +09:00
880dd6ddb2 page_fault_handler(): enable on-demand mapping of Linux ioremap area 2018-06-13 00:31:41 +09:00
898708b8b4 spinlock: rewrite spinlock to use Linux ticket head/tail format 2018-06-13 00:31:41 +09:00
b08331b21a ihk_hfi1_common.h: use IRQ restore unlock in spin_unlock 2018-06-13 00:31:41 +09:00
c196c996dd HFI: add dd to generated sdma_engine 2018-06-13 00:31:41 +09:00
20e179f6dc sdma_select_user_engine(): refactor selection code 2018-06-13 00:31:40 +09:00
32fbc015f5 HFI1: eliminate lots of dead code 2018-06-13 00:31:40 +09:00
558c250bb3 HFI1: generate headers for sdma_state and sdma_engine structures 2018-06-13 00:31:40 +09:00
96ea2d3658 dwarf-extract: support enumerations 2018-06-13 00:31:40 +09:00
9c91298ccf do_munmap(): hook to HFI1 deferred unmap 2018-06-13 00:31:40 +09:00
b08da83a51 hfi1_file_ioctl(): execute HFI1_IOCTL_TID_INVAL_READ locally 2018-06-13 00:31:40 +09:00
fcc8310454 HFI1: track receive TIDs in a tree 2018-06-13 00:31:40 +09:00
96b8b30516 MM: facility for deferred munmap()
Conflicts:
	kernel/process.c
2018-06-13 00:31:40 +09:00
521e0dc707 HFI1: add a bunch of fields to hfi1_devdata and hfi1_filedata for receive TID handling, do necessary mappings in hfi1_map_device_addresses() 2018-06-13 00:31:40 +09:00
e2e773d883 HFI: fix tidinfo and length calculation in program_rcvarray() 2018-06-13 00:31:39 +09:00
04d22d90a3 do_mmap(): debug message cosmetics 2018-06-13 00:31:39 +09:00
f6405081a6 page_fault_handler(): map Linux ioremap addresses on demand (disabled) 2018-06-13 00:31:39 +09:00
5bea237581 HFI1: make kmalloc caches per-CPU and pre-allocate at boot time 2018-06-13 00:31:39 +09:00
33ad55e72b kmalloc_cache_prealloc(): specify nr_elems as argument 2018-06-13 00:31:39 +09:00
6848c2ecf7 HFI1: move tid_rb_node to header 2018-06-13 00:31:39 +09:00
79f9a2d31a HFI1: don't print at open() time 2018-06-13 00:31:39 +09:00
2900ce20f7 HFI1: hfi1_unmap_device_addresses() at process terminate time 2018-06-13 00:31:39 +09:00
002b78372d open(): ignore /proc/sys/vm/overcommit_memory 2018-06-13 00:31:38 +09:00
5fce5e4e3c hfi1 generated headers: add missing filedata file 2018-06-13 00:31:38 +09:00
7a1ad31183 HFI: call hfi1_map_device_addresses() at initialization time
Conflicts:
	kernel/syscall.c
2018-06-13 00:31:38 +09:00
54bdb3419d hfi1 generated headers:
- split headers into one file per struct
 - add filedata
 - fix s/modprobe/modinfo/ for guessed .ko path
2018-06-13 00:31:38 +09:00
03fed4d1c8 automatically generate hfi structs from dwarf info 2018-06-13 00:31:38 +09:00
6279f69f5c compiler.h: take in recent linux updates for newer gcc support
Had to remove from original compiler-gcc:
 - things that deal with types, e.g. READ_ONCE macro and friends;
 - #define barrier(). This one would be better there at some point.

hfi1: remove ACCESS_ONCE from hfi1 header
2018-06-13 00:31:38 +09:00
6959d5ead4 HFI: port to SFI driver version 10.5.1.0.2 2018-06-13 00:31:38 +09:00
a5aa68744f hfi1: use kmalloc_cache for tid_rb_node allocations 2018-06-13 00:31:38 +09:00
89c5aaa9e9 hfi1_user_exp_rcv_setup(): rewrite main loop 2018-06-13 00:31:37 +09:00
15422d886f hif1_file_ioctl(): use dkprintf() 2018-06-13 00:31:37 +09:00
f139bef0cb mmap(): remove force large page extension (meant to be RESET) 2018-06-13 00:31:37 +09:00
de82cf8779 hfi1/user_exp_rcv/setup: keep track of position within page
ihk_mc_pt_lookup_pte + pte_get_phys will get us the physical address
for the start of the page we're looking at.
Re-offset it by position within buffer.
2018-06-13 00:31:37 +09:00
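A minimal sketch of the re-offsetting arithmetic described above (the helper name is illustrative):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* The PTE lookup yields the physical address of the page start; add back the
 * offset of vaddr within that page (pgsize is a power of two). */
static uint64_t phys_of_vaddr(uint64_t page_phys, uint64_t vaddr, uint64_t pgsize)
{
        return page_phys + (vaddr & (pgsize - 1));
}

int main(void)
{
        /* 4 KiB page starting at phys 0x10000000, vaddr sits 0x123 into it */
        printf("0x%" PRIx64 "\n",
               phys_of_vaddr(0x10000000, 0x7f0000000123ULL, 4096));
        return 0;
}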
662895c020 hfi1/user_exp_rcv: explicitly call hfi1_map_device_addresses
There were cases where nobody else did this mapping for us
2018-06-13 00:31:37 +09:00
d23939da8c process/vm: fix lookup_process_memory_range (again)
Optimistically going left was a more serious bug than just affecting the last
iteration: we could pass right by a match and continue down the tree if the
match was not a leaf.

Fix the actual algorithm issue

Conflicts:
	kernel/process.c
2018-06-13 00:31:37 +09:00
67529f21ff hfi1: replace true/false defines by stddef include 2018-06-13 00:31:37 +09:00
5c11ff0950 process/vm: fix lookup_process_memory_range with small start address
Cherry-picked from 6370520e

Conflicts:
	kernel/process.c
2018-06-13 00:31:37 +09:00
ce4eb0d409 hfi1/user_exp_rcv/setup: add access_ok check 2018-06-13 00:31:36 +09:00
04434320fc hfi1/user_exp_rcv/setup: do not skip over pages
If the vaddr we consider is not at the start of a page, we could skip
over (smaller, not contiguous) areas.

For example consider this segment of virtual memory:
[ 2MB | 4k | 4k | ... ]
Starting at 1MB offset, we would get a pgsize of 2MB so would skip
straight over 1MB worth of 4k pages.
2018-06-13 00:31:36 +09:00
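A hedged sketch of the iteration fix this example implies: advance only to the end of the page the current vaddr sits in, never by a full pgsize from an unaligned vaddr (names are illustrative, not the driver's actual code).

#include <stdint.h>
#include <stdio.h>

static uint64_t step_within_page(uint64_t vaddr, uint64_t pgsize, uint64_t remaining)
{
        uint64_t to_page_end = pgsize - (vaddr & (pgsize - 1));
        return remaining < to_page_end ? remaining : to_page_end;
}

int main(void)
{
        /* vaddr 1MB into a 2MB page: the step is 1MB, not 2MB, so the 4k
         * pages that follow the large page are not skipped over. */
        printf("%llu bytes\n", (unsigned long long)
               step_within_page(1ULL << 20, 2ULL << 20, 8ULL << 20));
        return 0;
}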
50fafa6d71 hfi1/user_exp_rcv/setup: use cache_alloc for tidlist 2018-06-13 00:31:36 +09:00
f5ced648ef hfi1/user_exp_rcv: rework main loop
The new loop now takes into account pages that are not physically contiguous.
Also some minor improvements, e.g. making the spin_lock usage more local,
reusing a group if we already had one, etc.
2018-06-13 00:31:36 +09:00
0f8f88ca46 hfi1/user_exp_rcv/invalid: Remove function
user_exp_rcv_invalid is only used together with the mmu cache
(its purpose is the delayed freeing of tids that were invalidated in cache)

Since we do not use that cache, the function can go
2018-06-13 00:31:36 +09:00
e99f19e812 hfi1/user_exp_rcv/setup: set length in tidinfo
This was dropped early on by mistake/excessive haste; it's actually
pretty useful.
2018-06-13 00:31:36 +09:00
9a36e5d213 hfi1/user_exp_rcv/setup: increment phys appropriately
The old code was always registering the same section with different sizes
instead of properly covering the requested mapping.
2018-06-13 00:31:36 +09:00
4816f27639 hfi1/user_exp_rcv/setup: split into multiple tids
Do not round up to the next power of two, but issue multiple requests
if necessary (e.g. 260k would be 256k + 4k in two registrations)
2018-06-13 00:31:36 +09:00
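An illustrative decomposition of a request into power-of-two registrations, as described above; this is not the driver's actual code and it assumes a 64-bit unsigned long for the GCC builtin.

#include <stdio.h>

int main(void)
{
        unsigned long remaining = 260 * 1024;   /* 260k request */
        while (remaining) {
                /* largest power of two not exceeding what is left */
                unsigned long chunk = 1UL << (63 - __builtin_clzl(remaining));
                printf("register %lu KiB\n", chunk / 1024);
                remaining -= chunk;
        }
        return 0;                               /* prints 256 KiB, then 4 KiB */
}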
9c0b8aa812 mcctrl/control.c: fix debug print types 2018-06-13 00:31:36 +09:00
23f178d718 hfi1/user_exp_rcv/clear: implement TID_FREE ioctl 2018-06-13 00:31:36 +09:00
159c18b98b hfi1/ioctl: only forward ioctl if hfi1_file_ioctl didn't handle it
Conflicts:
	kernel/syscall.c
2018-06-13 00:31:35 +09:00
1847a3ac11 hfi1/user_exp_rcv/setup: cleanup locks/groups usage 2018-06-13 00:31:35 +09:00
15b16ffbbb hfi1/user_exp_rcv/setup: map is noop, skip it
In the original driver's dma.c hfi1_dma_map_single just passes
the physical address back, so directly use that.
2018-06-13 00:31:35 +09:00
e64d89cd48 hfi: bases for user_exp_rcv
This implements a skeleton setup function and calls it on ioctl

Many missing points:
 - missing pci mapping to make setup work
 - no clear (passed to linux, so will likely bug out)
 - missing locks/safe-guards

Conflicts:
	kernel/Makefile.build.in
2018-06-13 00:31:35 +09:00
7366da4390 Fix other warnings
Most were harmless, but the change from a volatile cast to ACCESS_ONCE
is probably useful.
Expanding the macro, we basically went from:
    m = (volatile struct sdma_vl_map *)dd->sdma_map;
to
    m = *(volatile struct sdma_vl_map **)&(dd->sdma_map);
i.e. the explicit lookup is at a different level.
2018-06-13 00:31:35 +09:00
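For reference, this is the classic Linux definition of ACCESS_ONCE; applying it to dd->sdma_map yields exactly the second expansion shown above.

#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))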
2dc85ee417 user_sdma: fix use of uninitialized variable (vl)
This defines a single field in hfi1_pportdata, getting its offset
from the DWARF headers -- that needs to be computed at configure time
2018-06-13 00:31:35 +09:00
73cc07f98e ioctl() investigation - TO RESET 2018-06-13 00:31:35 +09:00
815e2244ca HFI1: minor change of declarations 2018-06-13 00:31:34 +09:00
163af73554 HFI1: properly iterate iovecs according to underlying page sizes 2018-06-13 00:31:34 +09:00
fd316f3ca3 HFI1: pass per-CPU txreq_cache to user_sdma_send_pkts() 2018-06-13 00:31:34 +09:00
122588bc4d mcexec: --enable-hfi1 to runtime enable/disable HFI1 driver
Conflicts:
	executer/user/mcexec.c
2018-06-13 00:31:34 +09:00
70238982c2 HFI1: use embedded kmalloc cache for req->tids (fixes AllReduce hang) 2018-06-13 00:31:34 +09:00
5b5191ef64 HFI1: move txreq kmalloc cache header into CPU local variable 2018-06-13 00:31:34 +09:00
a65faeaed4 kmalloc cache: embed cache pointer into kmalloc_header
Conflicts:
	kernel/mem.c
2018-06-13 00:31:34 +09:00
4dea1842e0 kmalloc cache: embed cache pointer into kmalloc_header
Conflicts:
	kernel/mem.c
2018-06-13 00:31:34 +09:00
5353b11f90 HFI1: disable kmalloc cache for req->tids (AllReduce fails otherwise) 2018-06-13 00:31:34 +09:00
abdbf96254 HFI1: use process rank for SDMA engine selection 2018-06-13 00:31:33 +09:00
bd170e63ba kmalloc cache refactor and pre-alloc in HFI1 open() 2018-06-13 00:31:33 +09:00
d35fa16417 HFI1: more detailed profiling (disabled by default) 2018-06-13 00:31:33 +09:00
6406a0df6b HFI1: compute SDMA pkt length taking large pages into account 2018-06-13 00:31:33 +09:00
52e8f03b4b HFI1: store base physical address in iovec if physically contiguous 2018-06-13 00:31:33 +09:00
b071a3f32c HFI1: use fast_memcpy() in header fillings
Conflicts:
	kernel/user_sdma.c
2018-06-13 00:31:33 +09:00
90258f00bd HFI1: use generic kmalloc cache for user_sdma_txreqs and req tids 2018-06-13 00:31:33 +09:00
28eb649056 Generic lock-free kmalloc cache implementation
Conflicts:
	kernel/mem.c
2018-06-13 00:31:33 +09:00
744ebacf65 HFI1: more pre-allocation in txreq cache 2018-06-13 00:31:33 +09:00
62e438a0aa HFI1: do device ioremap() mappings in per-process fashion 2018-06-13 00:31:32 +09:00
5ac582a678 user_sdma_send_pkts(): unlikely() around slow path condition 2018-06-13 00:31:32 +09:00
51bc28acca sdma_select_user_engine(): hash on CPU number 2018-06-13 00:31:32 +09:00
c43654d69b user_sdma_send_pkts(): handle page sizes correctly 2018-06-13 00:31:32 +09:00
c1d2db6a73 fixed sdma_vl_map, just in case it will be used in the future 2018-06-13 00:31:32 +09:00
aeef55d1b0 kmalloc(): try to get from remote_free list when regular is empty 2018-06-13 00:31:32 +09:00
6e289e8d9f HFI1: txreq cache and profiling 2018-06-13 00:31:32 +09:00
3b5363c533 HFI1: use original length calculation in sdma_send_pkts()
Conflicts:
	kernel/include/hfi1/sdma.h
2018-06-13 00:31:32 +09:00
60f6862db2 HFI1: use local write if private data is present; fix length alignment 2018-06-13 00:31:31 +09:00
39deff4e10 HFI1: working but a bit slow 2018-06-13 00:31:31 +09:00
7f03c18d4d Real run test version (update_tail, kregbase+offset crash) 2018-06-13 00:31:31 +09:00
640dba627f Added debugging output. Bugfixes in user_sdma_send_pkts() and sdma_send_txreq(). 2018-06-13 00:31:31 +09:00
ae368d97d4 Implemented a replacement for sdma_txadd_page()
Conflicts:
	kernel/user_sdma.c
2018-06-13 00:31:31 +09:00
99c216d91e HFI1: fix kregbase/piobase types to avoid warnings 2018-06-13 00:31:31 +09:00
3c357dc30a HFI1: fix completion mapping 2018-06-13 00:31:31 +09:00
37866e61ab HFI1: map completion queues 2018-06-13 00:31:31 +09:00
076e6b9b12 Enabled _sdma_txadd_daddr() 2018-06-13 00:31:30 +09:00
fa6db686b4 Corrected spin_lock_irqsave() spin_unlock_irqrestore() definitions
Conflicts:
	kernel/include/hfi1/ihk_hfi1_common.h
2018-06-13 00:31:30 +09:00
74a636a612 Updated structs to use completion{} and wait_queue_head_t{} and added struct size checks in hfi1_aio_write() 2018-06-13 00:31:30 +09:00
1c4a6568e6 Updated sdma.h (fixed struct sdma_engine size) 2018-06-13 00:31:30 +09:00
7d2e2f93b0 HFI1: map piobase and rcvarray_wc 2018-06-13 00:31:30 +09:00
7005110697 Updated and confirmed struct iowait{} and struct hfi1_user_sdma_pkt_q {}
Conflicts:
	kernel/include/hfi1/ihk_hfi1_common.h
2018-06-13 00:31:30 +09:00
c4ca4ae3ab Updated struct hfi1_devdata and confirmed its size 2018-06-13 00:31:30 +09:00
b024a486b9 Updated hfi1_filedata {} and confirmed its size against the original on Linux
Conflicts:
	kernel/include/hfi1/hfi.h
2018-06-13 00:31:30 +09:00
fe4c461f2f Updated kcalloc/kmalloc calls and enabled sdma_select_user_engine dependencies
Conflicts:
	kernel/include/hfi1/ihk_hfi1_common.h
2018-06-13 00:31:29 +09:00
b60a980088 hfi1_user_sdma_process_request(): map HFI1 kregbase 2018-06-13 00:31:29 +09:00
ec66229063 HFI1: adjust sdma_select_user_engine()
Conflicts:
	kernel/user_sdma.c
2018-06-13 00:31:29 +09:00
b875b5186f spinlock: make increment compatible with XPPSL Linux (v3.10) 2018-06-13 00:31:29 +09:00
5cf884ef41 Updated TODO tags and struct hfi1_user_sdma_pkt_q 2018-06-13 00:31:29 +09:00
64e2639adc * The relevant files have been modified in order to compile with McKernel.
Conflicts:
	kernel/Makefile.build.in
2018-06-13 00:31:29 +09:00
14b360e867 * Added the original files of the driver as a basis for comparison
Conflicts:
	kernel/include/hfi1/sdma.h
	kernel/sdma.c
	kernel/user_sdma.c
2018-06-13 00:31:29 +09:00
4a0e389953 HFI1: comments to keep in mind
Conflicts:
	kernel/include/hfi1/sdma.h
	kernel/sdma.c
	kernel/user_sdma.c
2018-06-13 00:31:28 +09:00
34363c2b68 close(): clear fd_priv_table 2018-06-13 00:31:28 +09:00
8a1d756cb1 Added private_data structure in process
Conflicts:
	executer/user/mcexec.c
	kernel/include/process.h
	kernel/process.c
2018-06-13 00:31:28 +09:00
e36abe57e7 open(): check on private_data for /dev/hfi 2018-06-13 00:31:28 +09:00
b2c8cc50dc open(): record private_data
Conflicts:
	kernel/syscall.c
2018-06-13 00:31:28 +09:00
b9b4a4fe36 search_free_space(): manage region->map_end internally
Cherry-pick of 87f72548a232a1626f2ca103da7f1ce62d139359

Conflicts:
	kernel/syscall.c
2018-06-13 00:31:28 +09:00
4b652c9353 atobytes(): restore postfix before return 2018-06-13 00:31:28 +09:00
60ac94cbb9 process/vm/access_ok: fix edge checks.
Add check for start/end being larger than the range we're checking.
Fix corner case where the access_check() was done on last vm range, and
we would be looking beyond last element (null deref)
2018-06-13 00:31:28 +09:00
42bbf5f2a4 process/vm: implement access_ok() 2018-06-13 00:31:27 +09:00
e29a40331d partitioned execution: pass process rank to LWK
Cherry-pick of d2d134d5e6a4b16a34d55d31b14614a2a91ecf47

Conflicts:
	kernel/include/process.h
2018-06-13 00:31:27 +09:00
655de2cd82 ihk_mc_get_linux_kernel_pgt(): add declaration
Cherry-pick of caff967a442907dd75f8cd878b9f2ea7608c77b2
2018-06-13 00:31:27 +09:00
205747594b Exclude areas not assigned to Mckernel from direct map of all phys. memory
It's enabled by adding -s to mcreboot.sh.

Cherry-pick of the following commit:

commit b5c13ce51a5a4926c2cf11c817cd0d369ac4402d
Author: Katsuya Horigome <katsuya.horigome.rj@ps.hitachi-solutions.com>
Date:   Mon Nov 20 09:40:41 2017 +0900

    Include measures to prevent memory destruction on Linux side (This is rebase commit for merging to development+hfi)
2018-06-13 00:31:27 +09:00
21f9a1ea33 eclair: fix MAP_KERNEL_START and apply Fujitsu's proposals
(1) Cherry-pick of 644afd8b45fc253ad7b90849e99aae354bac5b17
(2) Pass length to functions with arguments of variable length
    * POSTK_DEBUG_ARCH_DEP_38
(3) Separate architecture dependent functions/structures
    * POSTK_DEBUG_ARCH_DEP_34
(4) Fix include path
    * POSTK_DEBUG_ARCH_DEP_76
(5) Include config.h
    * POSTK_DEBUG_ARCH_DEP_33
2018-06-13 00:31:27 +09:00
aed099fbcb kmalloc_header: use signed integer for target CPU id
Cherry-pick of bdb2d4d8fa94f9c0268cdfdb21af1a2a5c2bcae5
2018-06-13 00:31:27 +09:00
48515970a0 ihk_mc_get_processor_id(): return -1 for non-McKernel CPUs
Cherry-pick of c45641e97add9fde467844d9272f2626cf4317de
2018-06-13 00:31:27 +09:00
b888f31b30 Map LWK TEXT to the end of Linux modules section (0xFFFFFFFFFE800000)
Cherry-pick of b9827b25883a9622058cb78006e705f09eaf9a84
2018-06-13 00:31:27 +09:00
7982008b5b virt_to_phys(): fix debug messages
Cherry-pick of 46eb3b73dac75b28ead62476f017ad0f29ec4b0a
2018-06-13 00:31:26 +09:00
f658173269 init_normal_area(): fix mapping start physical address
Cherry-pick of 2d3006818473af50c38a3d0e33595b4e74588004
2018-06-13 00:31:26 +09:00
ca7edf1df8 mem: make McKernel kernel heap virtual addresses Linux compatible
Cherry-pick of e5334c646d2dc6fb11d419918d8139a0de583fde
2018-06-13 00:31:26 +09:00
9a5f3ad4e6 mem: map Linux kernel virtual addresses properly
Cherry-pick of 5f37e846c3d70e5d5c0baea5b8eb8ceee3411c88
2018-06-13 00:31:26 +09:00
cfbab0ee82 move McKernel out of Linux kernel virtual
Cherry-pick of 88a8277f17da62d349b4340b66d37482344db649
2018-06-13 00:31:26 +09:00
92 changed files with 10832 additions and 279 deletions

.gitignore vendored
View File

@ -14,3 +14,4 @@ elfboot/elfboot_test
linux/executer/mcexec
linux/mod_test*
linux/target
kernel/script/dwarf-extract-struct

View File

@ -1,5 +1,6 @@
TARGET = @TARGET@
SBINDIR = @SBINDIR@
BINDIR = @BINDIR@
INCDIR = @INCDIR@
ETCDIR = @ETCDIR@
MANDIR = @MANDIR@
@ -47,6 +48,7 @@ install:
mkdir -p -m 755 $(SBINDIR); \
install -m 755 arch/x86_64/tools/mcreboot-smp-x86.sh $(SBINDIR)/mcreboot.sh; \
install -m 755 arch/x86_64/tools/mcstop+release-smp-x86.sh $(SBINDIR)/mcstop+release.sh; \
install -m 755 arch/x86_64/tools/mpimcexec $(BINDIR)/mpimcexec; \
install -m 755 arch/x86_64/tools/mcoverlay-destroy-smp-x86.sh $(SBINDIR)/mcoverlay-destroy.sh; \
install -m 755 arch/x86_64/tools/mcoverlay-create-smp-x86.sh $(SBINDIR)/mcoverlay-create.sh; \
install -m 755 arch/x86_64/tools/eclair-dump-backtrace.exp $(SBINDIR)/eclair-dump-backtrace.exp;\
@ -57,6 +59,7 @@ install:
install -m 644 kernel/include/swapfmt.h $(INCDIR); \
mkdir -p -m 755 $(MANDIR)/man1; \
install -m 644 arch/x86_64/tools/mcreboot.1 $(MANDIR)/man1/mcreboot.1; \
install -m 644 arch/x86_64/tools/mpimcexec.1 $(MANDIR)/man1/mpimcexec.1; \
;; \
*) \
echo "unknown target $(TARGET)" >&2 \

View File

@ -1225,6 +1225,13 @@ void cpu_pause(void)
asm volatile("pause" ::: "memory");
}
/* From: kernel-xppsl_1.5.2/arch/x86/include/asm/processor.h */
/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
void cpu_relax(void)
{
asm volatile("rep; nop" ::: "memory");
}
/*@
@ assigns \nothing;
@ ensures \interrupt_disabled > 0;

View File

@ -14,7 +14,17 @@
int __kprintf(const char *format, ...);
#endif
typedef int ihk_spinlock_t;
typedef unsigned short __ticket_t;
typedef unsigned int __ticketpair_t;
typedef struct ihk_spinlock {
union {
__ticketpair_t head_tail;
struct __raw_tickets {
__ticket_t head, tail;
} tickets;
};
} ihk_spinlock_t;
extern void preempt_enable(void);
extern void preempt_disable(void);
@ -23,9 +33,9 @@ extern void preempt_disable(void);
static void ihk_mc_spinlock_init(ihk_spinlock_t *lock)
{
*lock = 0;
lock->head_tail = 0;
}
#define SPIN_LOCK_UNLOCKED 0
#define SPIN_LOCK_UNLOCKED { .head_tail = 0 }
#ifdef DEBUG_SPINLOCK
#define ihk_mc_spinlock_lock_noirq(l) { \
@ -39,40 +49,24 @@ __kprintf("[%d] ret ihk_mc_spinlock_lock_noirq\n", ihk_mc_get_processor_id()); \
static void __ihk_mc_spinlock_lock_noirq(ihk_spinlock_t *lock)
{
int inc = 0x00010000;
int tmp;
#if 0
asm volatile("lock ; xaddl %0, %1\n"
"movzwl %w0, %2\n\t"
"shrl $16, %0\n\t"
"1:\t"
"cmpl %0, %2\n\t"
"je 2f\n\t"
"rep ; nop\n\t"
"movzwl %1, %2\n\t"
"jmp 1b\n"
"2:"
: "+Q" (inc), "+m" (*lock), "=r" (tmp) : : "memory", "cc");
#endif
register struct __raw_tickets inc = { .tail = 0x0002 };
preempt_disable();
asm volatile("lock; xaddl %0, %1\n"
"movzwl %w0, %2\n\t"
"shrl $16, %0\n\t"
"1:\t"
"cmpl %0, %2\n\t"
"je 2f\n\t"
"rep ; nop\n\t"
"movzwl %1, %2\n\t"
/* don't need lfence here, because loads are in-order */
"jmp 1b\n"
"2:"
: "+r" (inc), "+m" (*lock), "=&r" (tmp)
:
: "memory", "cc");
asm volatile ("lock xaddl %0, %1\n"
: "+r" (inc), "+m" (*(lock)) : : "memory", "cc");
if (inc.head == inc.tail)
goto out;
for (;;) {
if (*((volatile __ticket_t *)&lock->tickets.head) == inc.tail)
goto out;
cpu_pause();
}
out:
barrier(); /* make sure nothing creeps before the lock is taken */
}
#ifdef DEBUG_SPINLOCK
@ -106,8 +100,11 @@ __kprintf("[%d] ret ihk_mc_spinlock_unlock_noirq\n", ihk_mc_get_processor_id());
#endif
static void __ihk_mc_spinlock_unlock_noirq(ihk_spinlock_t *lock)
{
asm volatile ("lock incw %0" : "+m"(*lock) : : "memory", "cc");
__ticket_t inc = 0x0002;
asm volatile ("lock addw %1, %0\n"
: "+m" (lock->tickets.head) : "ri" (inc) : "memory", "cc");
preempt_enable();
}

View File

@ -40,18 +40,42 @@
#define LARGE_PAGE_MASK (~((unsigned long)LARGE_PAGE_SIZE - 1))
#define LARGE_PAGE_P2ALIGN (LARGE_PAGE_SHIFT - PAGE_SHIFT)
#define GB_PAGE_SHIFT 30
#define GB_PAGE_SIZE (1UL << GB_PAGE_SHIFT)
#define GB_PAGE_MASK (~((unsigned long)GB_PAGE_SIZE - 1))
#define GB_PAGE_P2ALIGN (GB_PAGE_SHIFT - PAGE_SHIFT)
#define USER_END 0x0000800000000000UL
#define TASK_UNMAPPED_BASE 0x00002AAAAAA00000UL
/*
* Canonical negative addresses (i.e., the smallest kernel virtual address)
* on x86 64 bit mode (in its most restricted 48 bit format) starts from
* 0xffff800000000000, but Linux starts mapping physical memory at 0xffff880000000000.
* The 0x80000000000 long gap (8TBs, i.e., 16 PGD level entries in the page tables)
* is used for Xen hyervisor (see arch/x86/include/asm/page.h) and that is
* what we utilize for McKernel.
* This gives us the benefit of being able to use Linux kernel virtual
* addresses identically as in Linux.
*
* NOTE: update these also in eclair.c when modified!
*/
#define MAP_ST_START 0xffff800000000000UL
#define MAP_VMAP_START 0xfffff00000000000UL
#define MAP_FIXED_START 0xffffffff70000000UL
#define MAP_KERNEL_START 0xffffffff80000000UL
#define MAP_VMAP_START 0xffff850000000000UL
#define MAP_FIXED_START 0xffff860000000000UL
#define LINUX_PAGE_OFFSET 0xffff880000000000UL
/*
* MAP_KERNEL_START is 8MB below MODULES_END in Linux.
* Placing the LWK image in the virtual address space at the end of
* the Linux modules section enables us to map the LWK TEXT in Linux
* as well, so that Linux can also call into LWK text.
*/
#define MAP_KERNEL_START 0xFFFFFFFFFE800000UL
#define STACK_TOP(region) ((region)->user_end)
#define MAP_VMAP_SIZE 0x0000000100000000UL
#define KERNEL_PHYS_OFFSET MAP_ST_START
#define PTL4_SHIFT 39
#define PTL4_SIZE (1UL << PTL4_SHIFT)
#define PTL3_SHIFT 30

View File

@ -133,7 +133,7 @@ static inline void ihk_atomic64_inc(ihk_atomic64_t *v)
* Note 2: xchg has side effect, so that attribute volatile is necessary,
* but generally the primitive is invalid, *ptr is output argument. --ANK
*/
#define __xg(x) ((volatile long *)(x))
#define __xg(x) ((volatile typeof(x))(x))
#define xchg4(ptr, x) \
({ \

View File

@ -39,7 +39,7 @@ SYSCALL_HANDLED(15, rt_sigreturn)
SYSCALL_HANDLED(16, ioctl)
SYSCALL_DELEGATED(17, pread64)
SYSCALL_DELEGATED(18, pwrite64)
SYSCALL_DELEGATED(20, writev)
SYSCALL_HANDLED(20, writev)
SYSCALL_DELEGATED(21, access)
SYSCALL_DELEGATED(23, select)
SYSCALL_HANDLED(24, sched_yield)

View File

@ -107,9 +107,17 @@ void init_boot_processor_local(void)
@ ensures \result == %gs;
@ assigns \nothing;
*/
extern int num_processors;
int ihk_mc_get_processor_id(void)
{
int id;
void *gs;
gs = (void *)rdmsr(MSR_GS_BASE);
if (gs < (void *)locals ||
gs > ((void *)locals + LOCALS_SPAN * num_processors)) {
return -1;
}
asm volatile("movl %%gs:0, %0" : "=r"(id));

View File

@ -41,6 +41,8 @@ extern char _head[], _end[];
extern unsigned long x86_kernel_phys_base;
int safe_kernel_map = 0;
/* Arch specific early allocation routine */
void *early_alloc_pages(int nr_pages)
{
@ -109,6 +111,7 @@ struct page_table {
};
static struct page_table *init_pt;
static int init_pt_loaded = 0;
static ihk_spinlock_t init_pt_lock;
static int use_1gb_page = 0;
@ -172,19 +175,23 @@ static void init_normal_area(struct page_table *pt)
unsigned long map_start, map_end, phys, pt_phys;
int ident_index, virt_index;
map_start = ihk_mc_get_memory_address(IHK_MC_GMA_MAP_START, 0);
/*
* This has to start from 0x00, see load_file() in IHK-SMP.
* For security reasons, we could skip holes in the LWK
* assigned physical memory, but Linux mappings already map
* those anyway.
*/
map_start = 0;
map_end = ihk_mc_get_memory_address(IHK_MC_GMA_MAP_END, 0);
kprintf("map_start = %lx, map_end = %lx\n", map_start, map_end);
ident_index = map_start >> PTL4_SHIFT;
virt_index = (MAP_ST_START >> PTL4_SHIFT) & (PT_ENTRIES - 1);
memset(pt, 0, sizeof(struct page_table));
for (phys = (map_start & ~(PTL4_SIZE - 1)); phys < map_end;
phys += PTL4_SIZE) {
pt_phys = setup_l3(ihk_mc_alloc_pages(1, IHK_MC_AP_CRITICAL), phys,
map_start, map_end);
for (phys = map_start; phys < map_end; phys += PTL4_SIZE) {
pt_phys = setup_l3(ihk_mc_alloc_pages(1, IHK_MC_AP_CRITICAL),
phys, map_start, map_end);
pt->entry[ident_index++] = pt_phys | PFL4_PDIR_ATTR;
pt->entry[virt_index++] = pt_phys | PFL4_PDIR_ATTR;
@ -724,6 +731,26 @@ static void destroy_page_table(int level, struct page_table *pt)
return;
}
void ihk_mc_pt_destroy_pgd_subtree(struct page_table *pt, void *virt)
{
int l4idx, l3idx, l2idx, l1idx;
unsigned long v = (unsigned long)virt;
struct page_table *lower;
GET_VIRT_INDICES(v, l4idx, l3idx, l2idx, l1idx);
if (!(pt->entry[l4idx] & PF_PRESENT))
return;
lower = (struct page_table *)
phys_to_virt(pt->entry[l4idx] & PT_PHYSMASK);
destroy_page_table(3, lower);
pt->entry[l4idx] = 0;
dkprintf("%s: virt: 0x%lx, l4idx: %d subtree destroyed\n",
__FUNCTION__, virt, l4idx);
}
void ihk_mc_pt_destroy(struct page_table *pt)
{
const int level = 4; /* PML4 */
@ -1960,6 +1987,28 @@ out:
return ptep;
}
pte_t *ihk_mc_pt_lookup_fault_pte(struct process_vm *vm, void *virt,
int pgshift, void **basep, size_t *sizep, int *p2alignp)
{
int faulted = 0;
pte_t *ptep;
retry:
ptep = ihk_mc_pt_lookup_pte(vm->address_space->page_table,
virt, pgshift, basep, sizep, p2alignp);
if (!faulted && (!ptep || !pte_is_present(ptep))) {
page_fault_process_vm(vm, virt, PF_POPULATE | PF_USER);
faulted = 1;
goto retry;
}
if (faulted && ptep && pte_is_present(ptep)) {
kprintf("%s: successfully faulted 0x%lx\n", __FUNCTION__, virt);
}
return ptep;
}
pte_t *ihk_mc_pt_lookup_pte(page_table_t pt, void *virt, int pgshift,
void **basep, size_t *sizep, int *p2alignp)
{
@ -2259,7 +2308,7 @@ out:
int ihk_mc_pt_set_range(page_table_t pt, struct process_vm *vm, void *start,
void *end, uintptr_t phys, enum ihk_mc_pt_attribute attr,
int pgshift, struct vm_range *range)
int pgshift, struct vm_range *range)
{
int error;
struct set_range_args args;
@ -2603,6 +2652,61 @@ void init_low_area(struct page_table *pt)
set_pt_large_page(pt, 0, 0, PTATTR_NO_EXECUTE|PTATTR_WRITABLE);
}
static void init_linux_kernel_mapping(struct page_table *pt)
{
unsigned long map_start, map_end, phys;
void *virt;
int nr_memory_chunks, chunk_id, numa_id;
/* In case of safe_kernel_map option (safe_kernel_map == 1),
processing to prevent destruction of the memory area on Linux side
is executed */
if (safe_kernel_map == 0) {
kprintf("Straight-map entire physical memory\n");
/* Map 2 TB for now */
map_start = 0;
map_end = 0x20000000000;
virt = (void *)LINUX_PAGE_OFFSET;
kprintf("Linux kernel virtual: 0x%lx - 0x%lx -> 0x%lx - 0x%lx\n",
LINUX_PAGE_OFFSET, LINUX_PAGE_OFFSET + map_end, 0, map_end);
for (phys = map_start; phys < map_end; phys += LARGE_PAGE_SIZE) {
if (set_pt_large_page(pt, virt, phys, PTATTR_WRITABLE) != 0) {
kprintf("%s: error setting mapping for 0x%lx\n", __FUNCTION__, virt);
}
virt += LARGE_PAGE_SIZE;
}
} else {
kprintf("Straight-map physical memory areas allocated to McKernel\n");
nr_memory_chunks = ihk_mc_get_nr_memory_chunks();
if (nr_memory_chunks == 0) {
kprintf("%s: ERROR: No memory chunk available.\n", __FUNCTION__);
return;
}
for (chunk_id = 0; chunk_id < nr_memory_chunks; chunk_id++) {
if (ihk_mc_get_memory_chunk(chunk_id, &map_start, &map_end, &numa_id)) {
kprintf("%s: ERROR: Memory chunk id (%d) out of range.\n", __FUNCTION__, chunk_id);
continue;
}
dkprintf("Linux kernel virtual: 0x%lx - 0x%lx -> 0x%lx - 0x%lx\n",
LINUX_PAGE_OFFSET + map_start, LINUX_PAGE_OFFSET + map_end, map_start, map_end);
virt = (void *)(LINUX_PAGE_OFFSET + map_start);
for (phys = map_start; phys < map_end; phys += LARGE_PAGE_SIZE, virt += LARGE_PAGE_SIZE) {
if (set_pt_large_page(pt, virt, phys, PTATTR_WRITABLE) != 0) {
kprintf("%s: set_pt_large_page() failed for 0x%lx\n", __FUNCTION__, virt);
}
}
}
}
}
static void init_vsyscall_area(struct page_table *pt)
{
extern char vsyscall_page[];
@ -2628,13 +2732,15 @@ void init_page_table(void)
/* Normal memory area */
init_normal_area(init_pt);
init_linux_kernel_mapping(init_pt);
init_fixed_area(init_pt);
init_low_area(init_pt);
init_text_area(init_pt);
init_vsyscall_area(init_pt);
load_page_table(init_pt);
kprintf("Page table is now at %p\n", init_pt);
init_pt_loaded = 1;
kprintf("Page table is now at 0x%lx\n", init_pt);
}
extern void __reserve_arch_pages(unsigned long, unsigned long,
@ -2662,17 +2768,33 @@ void ihk_mc_reserve_arch_pages(struct ihk_page_allocator_desc *pa_allocator,
unsigned long virt_to_phys(void *v)
{
unsigned long va = (unsigned long)v;
if (va >= MAP_KERNEL_START) {
dkprintf("%s: MAP_KERNEL_START <= 0x%lx <= LINUX_PAGE_OFFSET\n",
__FUNCTION__, va);
return va - MAP_KERNEL_START + x86_kernel_phys_base;
} else {
}
else if (va >= LINUX_PAGE_OFFSET) {
return va - LINUX_PAGE_OFFSET;
}
else if (va >= MAP_FIXED_START) {
return va - MAP_FIXED_START;
}
else {
dkprintf("%s: MAP_ST_START <= 0x%lx <= MAP_FIXED_START\n",
__FUNCTION__, va);
return va - MAP_ST_START;
}
}
void *phys_to_virt(unsigned long p)
{
return (void *)(p + MAP_ST_START);
/* Before loading our own PT use straight mapping */
if (!init_pt_loaded) {
return (void *)(p + MAP_ST_START);
}
return (void *)(p + LINUX_PAGE_OFFSET);
}
int copy_from_user(void *dst, const void *src, size_t siz)

View File

@ -44,11 +44,12 @@ fi
turbo=""
ihk_irq=""
safe_kernel_map=""
umask_old=`umask`
idle_halt=""
allow_oversubscribe=""
while getopts :tk:c:m:o:f:r:q:i:d:e:hO OPT
while getopts :stk:c:m:o:f:r:q:i:d:e:hO OPT
do
case ${OPT} in
f) facility=${OPTARG}
@ -61,6 +62,8 @@ do
;;
m) mem=${OPTARG}
;;
s) safe_kernel_map="safe_kernel_map"
;;
r) ikc_map=${OPTARG}
;;
q) ihk_irq=${OPTARG}
@ -82,6 +85,9 @@ do
esac
done
redirect_kmsg=0
turbo="turbo"
# Start ihkmond
pid=`pidof ihkmond`
if [ "${pid}" != "" ]; then
@ -299,16 +305,25 @@ if ! grep -E 'ihk\s' /proc/modules &>/dev/null; then
fi
fi
# Increase swappiness so that we have better chance to allocate memory for IHK
echo 100 > /proc/sys/vm/swappiness
# Copy modules under /tmp to avoid loading from shared FS
if mkdir -p /tmp/mcos-kmod; then
cp ${KMODDIR}/* /tmp/mcos-kmod/
KMODDIR="/tmp/mcos-kmod/"
fi
# Drop Linux caches to free memory
sync && echo 3 > /proc/sys/vm/drop_caches
# Fujitsu drops caches for us in between jobs so don't do it on OFP
if [ "`hostname | grep "c[0-9][0-9][0-9][0-9].ofp"`" == "" ]; then
# Increase swappiness so that we have better chance to allocate memory for IHK
echo 100 > /proc/sys/vm/swappiness
# Merge free memory areas into large, physically contigous ones
echo 1 > /proc/sys/vm/compact_memory 2>/dev/null
# Drop Linux caches to free memory
sync && echo 3 > /proc/sys/vm/drop_caches
sync
# Merge free memory areas into large, physically contigous ones
echo 1 > /proc/sys/vm/compact_memory 2>/dev/null
sync
fi
# Load IHK-SMP if not loaded and reserve CPUs and memory
if ! grep ihk_smp_@ARCH@ /proc/modules &>/dev/null; then
@ -329,41 +344,41 @@ if ! grep ihk_smp_@ARCH@ /proc/modules &>/dev/null; then
error_exit "ihk_loaded"
fi
# Offline-reonline RAM (special case for OFP SNC-4 flat mode)
if [ "`hostname | grep "c[0-9][0-9][0-9][0-9].ofp"`" != "" ] && [ "`cat /sys/devices/system/node/online`" == "0-7" ]; then
for i in 0 1 2 3; do
find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
echo 0 > $f 2>&1 > /dev/null;
done
find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
echo 1 > $f 2>&1 > /dev/null;
done
done
for i in 4 5 6 7; do
find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
echo 0 > $f 2>&1 > /dev/null;
done
done
for i in 4 5 6 7; do
find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
echo 1 > $f 2>&1 > /dev/null;
done
done
fi
# Offline-reonline RAM (special case for OFP Quadrant flat mode)
if [ "`hostname | grep "c[0-9][0-9][0-9][0-9].ofp"`" != "" ] && [ "`cat /sys/devices/system/node/online`" == "0-1" ]; then
for i in 1; do
find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
echo 0 > $f 2>&1 > /dev/null;
done
done
for i in 1; do
find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
echo 1 > $f 2>&1 > /dev/null;
done
done
fi
# # Offline-reonline RAM (special case for OFP SNC-4 flat mode)
# if [ "`hostname | grep "c[0-9][0-9][0-9][0-9].ofp"`" != "" ] && [ "`cat /sys/devices/system/node/online`" == "0-7" ]; then
# for i in 0 1 2 3; do
# find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
# echo 0 | tee $f 2>/dev/null 1>/dev/null
# done
# find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
# echo 1 | tee $f 2>/dev/null 1>/dev/null
# done
# done
# for i in 4 5 6 7; do
# find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
# echo 0 | tee $f 2>/dev/null 1>/dev/null
# done
# done
# for i in 4 5 6 7; do
# find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
# echo 1 | tee $f 2>/dev/null 1>/dev/null
# done
# done
# fi
#
# # Offline-reonline RAM (special case for OFP Quadrant flat mode)
# if [ "`hostname | grep "c[0-9][0-9][0-9][0-9].ofp"`" != "" ] && [ "`cat /sys/devices/system/node/online`" == "0-1" ]; then
# for i in 1; do
# find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
# echo 0 | tee $f 2>/dev/null 1>/dev/null
# done
# done
# for i in 1; do
# find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
# echo 1 | tee $f 2>/dev/null 1>/dev/null
# done
# done
# fi
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then
echo "error: reserving memory" >&2
@ -440,7 +455,7 @@ if ! ${SBINDIR}/ihkosctl 0 load ${KERNDIR}/mckernel.img; then
fi
# Set kernel arguments
if ! ${SBINDIR}/ihkosctl 0 kargs "hidos $turbo $idle_halt dump_level=${DUMP_LEVEL} $extra_kopts $allow_oversubscribe"; then
if ! ${SBINDIR}/ihkosctl 0 kargs "hidos $turbo $safe_kernel_map $idle_halt dump_level=${DUMP_LEVEL} $extra_kopts $allow_oversubscribe"; then
echo "error: setting kernel arguments" >&2
error_exit "os_created"
fi

View File

@ -0,0 +1,60 @@
.\" Man page for mpimcexec
.\"
.TH MPIMCEXEC 1 "@MCKERNEL_RELEASE_DATE@" "Version @MCKERNEL_VERSION@" MCKERNEL @MCKERNEL_VERSION@"
.SH NAME
mpimcexec \- run an MPI application on McKernel
.\"
.\" ---------------------------- SYNOPSIS ----------------------------
.SH SYNOPSIS
.B mpimcexec \fR [\fIoptions\fR] \fI<command>\fR
.\" ---------------------------- DESCRIPTION ----------------------------
.SH DESCRIPTION
mpimcexec is a wrapper script for running MPI applications on McKernel.
It internally calls mpiexec to spawn mcexec on compute nodes, which in
turn runs \fI<command>\fR on McKernel. mpimcexec specifies a number of
mcexec arguments that enable high performance execution.
.\" ---------------------------- OPTIONS ----------------------------
.SH OPTIONS
.TP
.B -ppn N, --ppn N, --ranks-per-node N
Specify the number of MPI ranks per node. This argument is required.
.TP
.B -n N, --n N, --ranks N
Specify the number of total MPI ranks.
e.g.,
$ mpimcexec -n 32 -ppn 4 ./a.out
.br
In the above example, 32 MPI processes are invoked
on eight compute nodes each of which has four processes.
.TP
.B --nodes N
Specify the number of compute nodes.
By default, all nodes, specified by "PJM --mpi proc" option, are used.
.TP
.B --env, -env
Pass an additional environment variable
.TP
.B -m N, --numa N
Specify preferred NUMA node.
.TP
.B -h <file name>, ---hostfile <file name>
Specify a host file for MPI.
.TP
.B --help
Show help message.
.PP
.\" ---------------------------- SEE ALSO ----------------------------
.SH SEE ALSO
\fBmcexec\fR (1), \fBmpiexec\fR (1)
.\" ---------------------------- AUTHORS ----------------------------
.SH AUTHORS
Copyright (C) 2018 McKernel Development Team, RIKEN, Japan

arch/x86_64/tools/mpimcexec.in Executable file
View File

@ -0,0 +1,147 @@
#!/bin/bash
#
# OFP McKernel MPI wrapper script
# author: Balazs Gerofi <bgerofi@riken.jp>
# Copyright (C) 2018 RIKEN R-CCS
#
prefix="@prefix@"
BINDIR="${prefix}/bin"
if [ "${BASH_VERSINFO[0]}" -lt 4 ]; then
echo "You need at least bash-4.0 to run this script." >&2
exit 1
fi
RANKS=""
NODES=""
PPN=""
MPI_ENV=""
COMMAND=""
NUMA=""
HOSTFILE=""
if [ ! -z "${PJM_PROC_BY_NODE}" ]; then
PPN=${PJM_PROC_BY_NODE}
elif [ ! -z "${MPI_LOCALNRANKS}" ]; then
PPN=${MPI_LOCALNRANKS}
fi
help_exit() {
echo ""
echo "Spawn an McKernel MPI job on Oakforest-PACS."
echo "usage: `basename $0` -ppn ranks_per_node [--nodes nodes] [-n ranks] [--env additional_environment]... command"
echo ""
echo " -ppn | --ppn | --ranks-per-node Number of MPI ranks per node (required)"
echo " -n | --n | --ranks Total number of MPI ranks in the job"
echo " --nodes Number of nodes to be used"
echo " --env | -env Pass an additional environment variable"
echo " -m | --numa Preferred NUMA node(s)"
echo " -h | --hostfile Host file for MPI"
echo " --help Show help message"
exit 1
}
# Parse options
while true; do
case $1 in
-ppn | --ppn | --ranks-per-node )
if [ $# -lt 2 ]; then
echo "error: needs an interger value for -ppn, --ppn, or --ranks-per-node option"
help_exit
fi
PPN=$2
shift 2
;;
-n | --n | --ranks )
if [ $# -lt 2 ]; then
echo "error: needs an interger value for -n, --n, or --ranks option"
help_exit
fi
RANKS=$2
shift 2
;;
-m | --numa )
if [ $# -lt 2 ]; then
echo "error: needs an interger value for -m or --numa option"
help_exit
fi
NUMA="-m $2"
shift 2
;;
--nodes )
if [ $# -lt 2 ]; then
echo "error: needs an interger value for --nodes option"
help_exit
fi
NODES=$2
shift 2
;;
--env | -env )
if [ $# -lt 2 ]; then
echo "error: needs an environment variable name for -env or --env option"
help_exit
fi
if [ -z "`echo $2 | grep I_MPI_PIN`" ]; then
MPI_ENV=`echo "${MPI_ENV} -env $2" | xargs`
fi
shift 2
;;
-h | --hostfile )
if [ $# -lt 2 ]; then
echo "error: needs a file name for -h or --hostfile option"
help_exit
fi
HOSTFILE="-hostfile $2"
shift 2
;;
--help )
help_exit
;;
* )
COMMAND=$@
break
;;
esac
done
if [ -z ${PPN} ]; then
echo "error: please specify the number of ranks per node"
help_exit
fi
# Unless explicitly specified, use Fujitsu inherited value
if [ -z ${NODES} ]; then
NODES=${PJM_VNODES}
fi
if [ -z ${RANKS} ] && [ -z ${NODES} ]; then
echo "error: please specify the total number of ranks or the number of nodes"
help_exit
fi
if [ "x${COMMAND}" = "x" ]; then
echo "error: please specify command"
help_exit
fi
# Calculate total job size if not specified
if [ -z ${RANKS} ]; then
let RANKS=(${PPN}*${NODES})
fi
# Support direct SSH when not executed from Fujitsu job system
if [ -z ${PJM_VNODES} ]; then
HOSTFILE="-launcher-exec ssh ${HOSTFILE}"
fi
export I_MPI_PIN=off
export PSM2_RCVTHREAD=0
export HFI_NO_CPUAFFINITY=1
export I_MPI_COLL_INTRANODE_SHM_THRESHOLD=4194304
export PSM2_MQ_RNDV_HFI_WINDOW=4194304
export PSM2_MQ_EAGER_SDMA_SZ=65536
export PSM2_MQ_RNDV_HFI_THRESH=200000
mpirun ${HOSTFILE} -n ${RANKS} -ppn ${PPN} ${MPI_ENV} ${BINDIR}/mcexec -n ${PPN} ${NUMA} --enable-hfi1 --mpol-threshold=1M --stack-premap=4M,4G --extend-heap-by=8M --disable-sched-yield --mpol-shm-premap ${COMMAND}

configure vendored
View File

@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for mckernel 1.5.0.
# Generated by GNU Autoconf 2.69 for mckernel 1.5.1-knl+hfi.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='mckernel'
PACKAGE_TARNAME='mckernel'
PACKAGE_VERSION='1.5.0'
PACKAGE_STRING='mckernel 1.5.0'
PACKAGE_VERSION='1.5.1-knl+hfi'
PACKAGE_STRING='mckernel 1.5.1-knl+hfi'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@ -1262,7 +1262,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures mckernel 1.5.0 to adapt to many kinds of systems.
\`configure' configures mckernel 1.5.1-knl+hfi to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@ -1323,7 +1323,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of mckernel 1.5.0:";;
short | recursive ) echo "Configuration of mckernel 1.5.1-knl+hfi:";;
esac
cat <<\_ACEOF
@ -1431,7 +1431,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
mckernel configure 1.5.0
mckernel configure 1.5.1-knl+hfi
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@ -1729,7 +1729,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by mckernel $as_me 1.5.0, which was
It was created by mckernel $as_me 1.5.1-knl+hfi, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@ -2082,11 +2082,11 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
IHK_VERSION=1.5.0
MCKERNEL_VERSION=1.5.0
IHK_VERSION=1.5.1-knl+hfi
MCKERNEL_VERSION=1.5.1-knl+hfi
DCFA_VERSION=DCFA_VERSION_m4
IHK_RELEASE_DATE=2018-04-05
MCKERNEL_RELEASE_DATE=2018-04-05
IHK_RELEASE_DATE=2019-05-14
MCKERNEL_RELEASE_DATE=2019-05-14
DCFA_RELEASE_DATE=DCFA_RELEASE_DATE_m4
@ -5060,7 +5060,7 @@ ac_config_headers="$ac_config_headers config.h"
# POSTK_DEBUG_ARCH_DEP_37
# AC_CONFIG_FILES arch dependfiles separate
ac_config_files="$ac_config_files Makefile executer/user/Makefile executer/user/mcexec.1:executer/user/mcexec.1in executer/user/vmcore2mckdump executer/user/arch/$ARCH/Makefile executer/user/arch/x86_64/Makefile executer/kernel/mcctrl/Makefile executer/kernel/mcctrl/arch/$ARCH/Makefile executer/kernel/mcoverlayfs/Makefile executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile executer/kernel/mcoverlayfs/linux-4.0.9/Makefile executer/kernel/mcoverlayfs/linux-4.6.7/Makefile executer/include/qlmpilib.h kernel/Makefile kernel/Makefile.build kernel/include/swapfmt.h arch/x86_64/tools/mcreboot-attached-mic.sh arch/x86_64/tools/mcshutdown-attached-mic.sh arch/x86_64/tools/mcreboot-builtin-x86.sh arch/x86_64/tools/mcreboot-smp-x86.sh arch/x86_64/tools/mcstop+release-smp-x86.sh arch/x86_64/tools/mcoverlay-destroy-smp-x86.sh arch/x86_64/tools/mcoverlay-create-smp-x86.sh arch/x86_64/tools/eclair-dump-backtrace.exp arch/x86_64/tools/mcshutdown-builtin-x86.sh arch/x86_64/tools/mcreboot.1:arch/x86_64/tools/mcreboot.1in arch/x86_64/tools/irqbalance_mck.service arch/x86_64/tools/irqbalance_mck.in tools/mcstat/Makefile"
ac_config_files="$ac_config_files Makefile executer/user/Makefile executer/user/mcexec.1:executer/user/mcexec.1in executer/user/vmcore2mckdump executer/user/arch/$ARCH/Makefile executer/user/arch/x86_64/Makefile executer/kernel/mcctrl/Makefile executer/kernel/mcctrl/arch/$ARCH/Makefile executer/kernel/mcoverlayfs/Makefile executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile executer/kernel/mcoverlayfs/linux-4.0.9/Makefile executer/kernel/mcoverlayfs/linux-4.6.7/Makefile executer/include/qlmpilib.h kernel/Makefile kernel/Makefile.build kernel/include/swapfmt.h arch/x86_64/tools/mcreboot-attached-mic.sh arch/x86_64/tools/mcshutdown-attached-mic.sh arch/x86_64/tools/mcreboot-builtin-x86.sh arch/x86_64/tools/mcreboot-smp-x86.sh arch/x86_64/tools/mcstop+release-smp-x86.sh arch/x86_64/tools/mcoverlay-destroy-smp-x86.sh arch/x86_64/tools/mcoverlay-create-smp-x86.sh arch/x86_64/tools/eclair-dump-backtrace.exp arch/x86_64/tools/mcshutdown-builtin-x86.sh arch/x86_64/tools/mcreboot.1:arch/x86_64/tools/mcreboot.1in arch/x86_64/tools/mpimcexec arch/x86_64/tools/mpimcexec.1:arch/x86_64/tools/mpimcexec.1in arch/x86_64/tools/irqbalance_mck.service arch/x86_64/tools/irqbalance_mck.in tools/mcstat/Makefile"
if test "$TARGET" = "smp-x86"; then
@ -5585,7 +5585,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by mckernel $as_me 1.5.0, which was
This file was extended by mckernel $as_me 1.5.1-knl+hfi, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@ -5647,7 +5647,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
mckernel config.status 1.5.0
mckernel config.status 1.5.1-knl+hfi
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"
@ -5795,6 +5795,8 @@ do
"arch/x86_64/tools/eclair-dump-backtrace.exp") CONFIG_FILES="$CONFIG_FILES arch/x86_64/tools/eclair-dump-backtrace.exp" ;;
"arch/x86_64/tools/mcshutdown-builtin-x86.sh") CONFIG_FILES="$CONFIG_FILES arch/x86_64/tools/mcshutdown-builtin-x86.sh" ;;
"arch/x86_64/tools/mcreboot.1") CONFIG_FILES="$CONFIG_FILES arch/x86_64/tools/mcreboot.1:arch/x86_64/tools/mcreboot.1in" ;;
"arch/x86_64/tools/mpimcexec") CONFIG_FILES="$CONFIG_FILES arch/x86_64/tools/mpimcexec" ;;
"arch/x86_64/tools/mpimcexec.1") CONFIG_FILES="$CONFIG_FILES arch/x86_64/tools/mpimcexec.1:arch/x86_64/tools/mpimcexec.1in" ;;
"arch/x86_64/tools/irqbalance_mck.service") CONFIG_FILES="$CONFIG_FILES arch/x86_64/tools/irqbalance_mck.service" ;;
"arch/x86_64/tools/irqbalance_mck.in") CONFIG_FILES="$CONFIG_FILES arch/x86_64/tools/irqbalance_mck.in" ;;
"tools/mcstat/Makefile") CONFIG_FILES="$CONFIG_FILES tools/mcstat/Makefile" ;;

View File

@ -1,9 +1,9 @@
# configure.ac COPYRIGHT FUJITSU LIMITED 2015-2016
AC_PREREQ(2.63)
m4_define([IHK_VERSION_m4],[1.5.0])dnl
m4_define([MCKERNEL_VERSION_m4],[1.5.0])dnl
m4_define([IHK_RELEASE_DATE_m4],[2018-04-05])dnl
m4_define([MCKERNEL_RELEASE_DATE_m4],[2018-04-05])dnl
m4_define([IHK_VERSION_m4],[1.5.1-knl+hfi])dnl
m4_define([MCKERNEL_VERSION_m4],[1.5.1-knl+hfi])dnl
m4_define([IHK_RELEASE_DATE_m4],[2019-05-14])dnl
m4_define([MCKERNEL_RELEASE_DATE_m4],[2019-05-14])dnl
AC_INIT([mckernel], MCKERNEL_VERSION_m4)
@ -568,6 +568,8 @@ AC_CONFIG_FILES([
arch/x86_64/tools/eclair-dump-backtrace.exp
arch/x86_64/tools/mcshutdown-builtin-x86.sh
arch/x86_64/tools/mcreboot.1:arch/x86_64/tools/mcreboot.1in
arch/x86_64/tools/mpimcexec
arch/x86_64/tools/mpimcexec.1:arch/x86_64/tools/mpimcexec.1in
arch/x86_64/tools/irqbalance_mck.service
arch/x86_64/tools/irqbalance_mck.in
tools/mcstat/Makefile

View File

@ -91,6 +91,7 @@ struct program_image_section {
struct get_cpu_set_arg {
int nr_processes;
int *process_rank;
void *cpu_set;
size_t cpu_set_size; // Size in bytes
int *target_core;
@ -109,6 +110,8 @@ typedef unsigned long __cpu_set_unit;
#define MPOL_NO_BSS 0x04
#define MPOL_SHM_PREMAP 0x08
#define MCEXEC_HFI1 0x01
struct program_load_desc {
int num_sections;
int status;
@ -137,12 +140,14 @@ struct program_load_desc {
unsigned long envs_len;
struct rlimit rlimit[MCK_RLIM_MAX];
unsigned long interp_align;
unsigned long mcexec_flags;
unsigned long mpol_flags;
unsigned long mpol_threshold;
unsigned long heap_extension;
long stack_premap;
unsigned long mpol_bind_mask;
int nr_processes;
int process_rank;
char shell_path[SHELL_PATH_MAX_LEN];
__cpu_set_unit cpu_set[PLD_CPU_SET_SIZE];
int profile;
@ -189,6 +194,7 @@ struct syscall_response {
long ret;
unsigned long fault_address;
unsigned long fault_reason;
void *private_data;
};
struct syscall_ret_desc {

View File

@ -692,6 +692,7 @@ static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
wake_up_interruptible(&pli_next->pli_wq);
/* Reset process counter */
pe->nr_processes_left = pe->nr_processes;
pe->process_rank = 0;
}
/* Wait for the rest if not the last or if the last but
@ -923,6 +924,15 @@ next_cpu:
goto put_and_unlock_out;
}
/* Copy rank */
if (copy_to_user(req.process_rank, &pe->process_rank,
sizeof(int))) {
printk("%s: error copying process rank to user\n",
__FUNCTION__);
ret = -EINVAL;
goto put_and_unlock_out;
}
/* mcexec NUMA to bind to */
mcexec_linux_numa = cpu_to_node(mckernel_cpu_2_linux_cpu(udp, cpu));
if (copy_to_user(req.mcexec_linux_numa, &mcexec_linux_numa,
@ -970,6 +980,7 @@ next_cpu:
}
/* Otherwise wake up next process in list */
else {
++pe->process_rank;
pli_next = list_first_entry(&pe->pli_list,
struct process_list_item, list);
list_del(&pli_next->list);
@ -1062,7 +1073,6 @@ out:
return ret;
}
/* NOTE: per-process data is refcounted.
* For every get call the user should call put. */
struct mcctrl_per_proc_data *mcctrl_get_per_proc_data(
@ -1192,7 +1202,7 @@ int mcexec_syscall(struct mcctrl_usrdata *ud, struct ikc_scd_packet *packet)
return -1;
}
dprintk("%s: (packet_handler) rtid: %d, ttid: %d, sys nr: %d\n",
dprintk("%s: (packet_handler) rtid: %d, ttid: %d, sys nr: %lu\n",
__FUNCTION__,
packet->req.rtid,
packet->req.ttid,
@ -1362,7 +1372,7 @@ retry_alloc:
}
packet->req.valid = 0; /* ack */
dprintk("%s: system call: %d, args[0]: %lu, args[1]: %lu, args[2]: %lu, "
dprintk("%s: system call: %lu, args[0]: %lu, args[1]: %lu, args[2]: %lu, "
"args[3]: %lu, args[4]: %lu, args[5]: %lu\n",
__FUNCTION__,
packet->req.number,
@ -1487,7 +1497,7 @@ long mcexec_load_syscall(ihk_os_t os, struct syscall_load_desc *__user arg)
rpm = ihk_device_map_virtual(ihk_os_to_dev(os), phys, desc.size, NULL, 0);
#endif
dprintk("mcexec_load_syscall: %s (desc.size: %d)\n", rpm, desc.size);
dprintk("mcexec_load_syscall: %p (desc.size: %lu)\n", rpm, desc.size);
if (copy_to_user((void *__user)desc.dest, rpm, desc.size)) {
return -EFAULT;

View File

@ -314,6 +314,7 @@ struct mcctrl_part_exec {
struct mutex lock;
int nr_processes;
int nr_processes_left;
int process_rank;
cpumask_t cpus_used;
struct list_head pli_list;
};

View File

@ -2065,6 +2065,17 @@ void __return_syscall(ihk_os_t os, struct ikc_scd_packet *packet,
/* Map response structure and notify offloading thread */
res->ret = ret;
res->stid = stid;
res->private_data = 0;
/* Special case for open() to return private_data */
if (packet->req.number == __NR_open && ret > 0) {
struct fd f;
f = fdget(ret);
if (f.file) {
res->private_data = f.file->private_data;
fdput(f);
}
}
if (__notify_syscall_requester(os, packet, res) < 0) {
printk("%s: WARNING: failed to notify PID %d\n",

View File

@ -42,8 +42,8 @@ ifeq ($(ARCH), arm64)
eclair: eclair.c arch/$(ARCH)/arch-eclair.c
$(CC) -I.. -I. -I./arch/$(ARCH)/include -I$(VPATH)/.. -I$(VPATH) -I$(VPATH)/arch/$(ARCH)/include $(CFLAGS) -o $@ $^ $(LIBS)
else
eclair: eclair.c
$(CC) $(CFLAGS) -I${IHKDIR} -o $@ $^ $(LIBS)
eclair: eclair.c arch/$(ARCH)/arch-eclair.c
$(CC) -I.. -I$(VPATH) -I$(VPATH)/arch/$(ARCH)/include $(CFLAGS) -o $@ $^ $(LIBS)
endif
ldump2mcdump.so: ldump2mcdump.c

View File

@ -2,8 +2,18 @@
#ifndef HEADER_USER_X86_ECLAIR_H
#define HEADER_USER_X86_ECLAIR_H
#define MAP_KERNEL 0xFFFFFFFF80000000
#define MAP_ST 0xFFFF800000000000
#ifndef POSTK_DEBUG_ARCH_DEP_34
#define MAP_ST_START 0xffff800000000000UL
#define MAP_VMAP_START 0xffff850000000000UL
#define MAP_FIXED_START 0xffff860000000000UL
#define LINUX_PAGE_OFFSET 0xffff880000000000UL
#define MAP_KERNEL_START 0xFFFFFFFFFE800000UL
#endif /* POSTK_DEBUG_ARCH_DEP_34 */
/* TODO: these should be updated when McKernel changes */
#define MCKERNEL_ELF_START "0xFFFFFFFFFE801000"
#define MCKERNEL_ELF_LEN "0x0000000000100000"
#define ARCH_CLV_SPAN "x86_cpu_local_variables_span"

View File

@ -8,9 +8,7 @@
* Copyright (C) 2015 RIKEN AICS
*/
#ifdef POSTK_DEBUG_ARCH_DEP_33
#include "../config.h"
#endif /* POSTK_DEBUG_ARCH_DEP_33 */
#include <bfd.h>
#include <fcntl.h>
#include <inttypes.h>
@ -22,10 +20,8 @@
#include <arpa/inet.h>
#include <sys/ioctl.h>
#include <ihk/ihk_host_user.h>
#ifdef POSTK_DEBUG_ARCH_DEP_34
#include <eclair.h>
#include <arch-eclair.h>
#endif /* POSTK_DEBUG_ARCH_DEP_34 */
#define CPU_TID_BASE 1000000
@ -85,11 +81,7 @@ static struct thread_info *curr_thread = NULL;
static uintptr_t ihk_mc_switch_context = -1;
#endif /* POSTK_DEBUG_ARCH_DEP_34 */
#ifdef POSTK_DEBUG_ARCH_DEP_34
uintptr_t lookup_symbol(char *name) {
#else /* POSTK_DEBUG_ARCH_DEP_34 */
static uintptr_t lookup_symbol(char *name) {
#endif /* POSTK_DEBUG_ARCH_DEP_34 */
int i;
for (i = 0; i < nsyms; ++i) {
@ -101,22 +93,22 @@ static uintptr_t lookup_symbol(char *name) {
return NOSYMBOL;
} /* lookup_symbol() */
#define NOPHYS ((uintptr_t)-1)
static uintptr_t virt_to_phys(uintptr_t va) {
#ifndef POSTK_DEBUG_ARCH_DEP_34
#define MAP_KERNEL 0xFFFFFFFF80000000
#endif /* POSTK_DEBUG_ARCH_DEP_34 */
if (va >= MAP_KERNEL) {
return (va - MAP_KERNEL + kernel_base);
if (va >= MAP_KERNEL_START) {
return va - MAP_KERNEL_START + kernel_base;
}
#ifndef POSTK_DEBUG_ARCH_DEP_34
#define MAP_ST 0xFFFF800000000000
#endif /* POSTK_DEBUG_ARCH_DEP_34 */
if (va >= MAP_ST) {
return (va - MAP_ST);
else if (va >= LINUX_PAGE_OFFSET) {
return va - LINUX_PAGE_OFFSET;
}
if (0) printf("virt_to_phys(%lx): -1\n", va);
#define NOPHYS ((uintptr_t)-1)
else if (va >= MAP_FIXED_START) {
return va - MAP_FIXED_START;
}
else if (va >= MAP_ST_START) {
return va - MAP_ST_START;
}
return NOPHYS;
} /* virt_to_phys() */
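For illustration, here is how virt_to_phys() resolves a few addresses under the new window constants from arch-eclair.h (a sketch; kernel_base is assumed to be 0 for simplicity, and the sample addresses are made up):
/* va = 0xffff880000001000: below MAP_KERNEL_START, above LINUX_PAGE_OFFSET
 *      -> pa = va - LINUX_PAGE_OFFSET = 0x1000
 * va = 0xFFFFFFFFFE802000: above MAP_KERNEL_START
 *      -> pa = va - MAP_KERNEL_START + kernel_base = 0x2000
 * va = 0x00007f0000000000: below every window
 *      -> NOPHYS is returned
 */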
@ -673,11 +665,7 @@ static int setup_dump(char *fname) {
return 0;
} /* setup_dump() */
#ifdef POSTK_DEBUG_ARCH_DEP_38
static ssize_t print_hex(char *buf, size_t buf_size, char *str) {
#else /* POSTK_DEBUG_ARCH_DEP_38 */
static ssize_t print_hex(char *buf, char *str) {
#endif /* POSTK_DEBUG_ARCH_DEP_38 */
char *p;
char *q;
@ -702,11 +690,7 @@ static ssize_t print_hex(char *buf, char *str) {
return (q - buf);
} /* print_hex() */
#if defined(POSTK_DEBUG_ARCH_DEP_34) && defined(POSTK_DEBUG_ARCH_DEP_38)
ssize_t print_bin(char *buf, size_t buf_size, void *data, size_t size) {
#else /* POSTK_DEBUG_ARCH_DEP_34 && POSTK_DEBUG_ARCH_DEP_38*/
static ssize_t print_bin(char *buf, void *data, size_t size) {
#endif /* POSTK_DEBUG_ARCH_DEP_34 && POSTK_DEBUG_ARCH_DEP_38*/
uint8_t *p;
char *q;
int i;
@ -733,13 +717,8 @@ static ssize_t print_bin(char *buf, void *data, size_t size) {
return (q - buf);
} /* print_bin() */
#ifdef POSTK_DEBUG_ARCH_DEP_38
static void command(const char *cmd, char *res, size_t res_size) {
const char *p;
#else /* POSTK_DEBUG_ARCH_DEP_38 */
static void command(char *cmd, char *res) {
char *p;
#endif /* POSTK_DEBUG_ARCH_DEP_38 */
char *rbp;
p = cmd;
@ -801,11 +780,7 @@ static void command(char *cmd, char *res) {
#endif /* POSTK_DEBUG_ARCH_DEP_34 */
rbp += sprintf(rbp, "l");
if (0)
#ifdef POSTK_DEBUG_ARCH_DEP_38
rbp += print_hex(rbp, res_size, str);
#else /* POSTK_DEBUG_ARCH_DEP_38 */
rbp += print_hex(rbp, str);
#endif /* POSTK_DEBUG_ARCH_DEP_38 */
rbp += sprintf(rbp, "%s", str);
}
else if (!strcmp(p, "D")) {
@ -814,20 +789,9 @@ static void command(char *cmd, char *res) {
}
else if (!strcmp(p, "g")) {
if (curr_thread->cpu < 0) {
#ifndef POSTK_DEBUG_ARCH_DEP_34
struct x86_kregs {
uintptr_t rsp, rbp, rbx, rsi;
uintptr_t rdi, r12, r13, r14;
uintptr_t r15, rflags, rsp0;
};
#endif /* POSTK_DEBUG_ARCH_DEP_34 */
int error;
#ifdef POSTK_DEBUG_ARCH_DEP_34
struct arch_kregs kregs;
#else /* POSTK_DEBUG_ARCH_DEP_34 */
struct x86_kregs kregs;
#endif /* POSTK_DEBUG_ARCH_DEP_34 */
error = read_mem(curr_thread->process+K(CTX_OFFSET),
&kregs, sizeof(kregs));
@ -836,36 +800,7 @@ static void command(char *cmd, char *res) {
break;
}
#ifdef POSTK_DEBUG_ARCH_DEP_34
print_kregs(rbp, res_size, &kregs);
#else /* POSTK_DEBUG_ARCH_DEP_34 */
rbp += sprintf(rbp, "xxxxxxxxxxxxxxxx"); /* rax */
rbp += print_bin(rbp, &kregs.rbx, sizeof(uint64_t));
rbp += sprintf(rbp, "xxxxxxxxxxxxxxxx"); /* rcx */
rbp += sprintf(rbp, "xxxxxxxxxxxxxxxx"); /* rdx */
rbp += print_bin(rbp, &kregs.rsi, sizeof(uint64_t));
rbp += print_bin(rbp, &kregs.rdi, sizeof(uint64_t));
rbp += print_bin(rbp, &kregs.rbp, sizeof(uint64_t));
rbp += print_bin(rbp, &kregs.rsp, sizeof(uint64_t));
rbp += sprintf(rbp, "xxxxxxxxxxxxxxxx"); /* r8 */
rbp += sprintf(rbp, "xxxxxxxxxxxxxxxx"); /* r9 */
rbp += sprintf(rbp, "xxxxxxxxxxxxxxxx"); /* r10 */
rbp += sprintf(rbp, "xxxxxxxxxxxxxxxx"); /* r11 */
rbp += print_bin(rbp, &kregs.r12, sizeof(uint64_t));
rbp += print_bin(rbp, &kregs.r13, sizeof(uint64_t));
rbp += print_bin(rbp, &kregs.r14, sizeof(uint64_t));
rbp += print_bin(rbp, &kregs.r15, sizeof(uint64_t));
rbp += print_bin(rbp, &ihk_mc_switch_context,
sizeof(uint64_t)); /* rip */
rbp += print_bin(rbp, &kregs.rflags, sizeof(uint32_t));
rbp += sprintf(rbp, "xxxxxxxx"); /* cs */
rbp += sprintf(rbp, "xxxxxxxx"); /* ss */
rbp += sprintf(rbp, "xxxxxxxx"); /* ds */
rbp += sprintf(rbp, "xxxxxxxx"); /* es */
rbp += sprintf(rbp, "xxxxxxxx"); /* fs */
rbp += sprintf(rbp, "xxxxxxxx"); /* gs */
#endif /* POSTK_DEBUG_ARCH_DEP_34 */
}
else {
int error;
@ -943,11 +878,7 @@ static void command(char *cmd, char *res) {
#endif /* POSTK_DEBUG_ARCH_DEP_34 */
rbp += sprintf(rbp, "l");
if (0)
#ifdef POSTK_DEBUG_ARCH_DEP_38
rbp += print_hex(rbp, res_size, str);
#else /* POSTK_DEBUG_ARCH_DEP_38 */
rbp += print_hex(rbp, str);
#endif /* POSTK_DEBUG_ARCH_DEP_38 */
rbp += sprintf(rbp, "%s", str);
}
else if (!strncmp(p, "T", 1)) {
@ -1039,11 +970,7 @@ static void command(char *cmd, char *res) {
else {
q += sprintf(q, "status=%#x", ti->status);
}
#ifdef POSTK_DEBUG_ARCH_DEP_38
rbp += print_hex(rbp, res_size, buf);
#else /* POSTK_DEBUG_ARCH_DEP_38 */
rbp += print_hex(rbp, buf);
#endif /* POSTK_DEBUG_ARCH_DEP_38 */
}
} while (0);
@ -1272,11 +1199,7 @@ int main(int argc, char *argv[]) {
}
mode = 0;
fputc('+', ofp);
#ifdef POSTK_DEBUG_ARCH_DEP_38
command(lbuf, rbuf, sizeof(rbuf));
#else /* POSTK_DEBUG_ARCH_DEP_38 */
command(lbuf, rbuf);
#endif /* POSTK_DEBUG_ARCH_DEP_38 */
sum = 0;
for (p = rbuf; *p != '\0'; ++p) {
sum += *p;

View File

@ -3,11 +3,7 @@
#ifndef HEADER_USER_COMMON_ECLAIR_H
#define HEADER_USER_COMMON_ECLAIR_H
#ifdef POSTK_DEBUG_ARCH_DEP_76 /* header path fix */
#include "../config.h"
#else /* POSTK_DEBUG_ARCH_DEP_76 */
#include <config.h>
#endif /* POSTK_DEBUG_ARCH_DEP_76 */
#include <stdio.h>
#include <inttypes.h>
#include <arch-eclair.h>

View File

@ -73,6 +73,13 @@ e.g.: 10k means 10Kibyte, 100M 100Mibyte, 1G 1Gibyte
Enable system call profiling. After execution, profiling
information can be obtained with the ihkosctl tool.
.TP
.B -m N
Specify the NUMA memory policy. In Quadrant&Flat mode, NUMA node 0 contains
the CPU cores and NUMA node 1 is MCDRAM. Thus, the option "-m 1"
means that the user's memory areas are allocated in MCDRAM.
.TP
.B --mpol-no-heap, --mpol-no-stack, --mpol-no-bss
Disregard NUMA memory policy in the heap/stack/BSS areas.
@ -93,7 +100,7 @@ This option eliminates potential kernel resource contention by
avoiding page faults in the shared memory region.
.TP
.B -m N, --mpol-threshold=N
.B -M N, --mpol-threshold=N
Specify the memory size threshold for respecting the memory
allocation policy on NUMA machines. If the size of a memory allocation
is smaller than the value specified with this option, the memory area is

View File

@ -221,6 +221,7 @@ static int mpol_no_stack = 0;
static int mpol_no_bss = 0;
static int mpol_shm_premap = 0;
static int no_bind_ikc_map = 0;
static int hfi1_enabled = 0;
static unsigned long mpol_threshold = 0;
static unsigned long heap_extension = (4*1024);
static int profile = 0;
@ -1653,6 +1654,8 @@ static void destroy_local_environ(char **local_env)
unsigned long atobytes(char *string)
{
unsigned long mult = 1;
unsigned long ret;
char orig_postfix = 0;
char *postfix;
errno = ERANGE;
@ -1664,19 +1667,26 @@ unsigned long atobytes(char *string)
if (*postfix == 'k' || *postfix == 'K') {
mult = 1024;
orig_postfix = *postfix;
*postfix = 0;
}
else if (*postfix == 'm' || *postfix == 'M') {
mult = 1024 * 1024;
orig_postfix = *postfix;
*postfix = 0;
}
else if (*postfix == 'g' || *postfix == 'G') {
mult = 1024 * 1024 * 1024;
orig_postfix = *postfix;
*postfix = 0;
}
ret = atol(string) * mult;
if (orig_postfix)
*postfix = orig_postfix;
errno = 0;
return atol(string) * mult;
return ret;
}
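For reference, a couple of sample results of the fixed atobytes() (a sketch based on the k/m/g multipliers above; not part of the patch):
/* atobytes("10k")  == 10  * 1024       == 10240
 * atobytes("100M") == 100 * 1048576    == 104857600
 * atobytes("1G")   == 1   * 1073741824 == 1073741824
 * The postfix character is restored after parsing, so the caller's string is
 * left intact, and errno is cleared before returning on success.
 */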
static struct option mcexec_options[] = {
@ -1744,6 +1754,12 @@ static struct option mcexec_options[] = {
.flag = &disable_sched_yield,
.val = 1,
},
{
.name = "enable-hfi1",
.has_arg = no_argument,
.flag = &hfi1_enabled,
.val = 1,
},
{
.name = "extend-heap-by",
.has_arg = required_argument,
@ -2416,6 +2432,7 @@ int main(int argc, char **argv)
struct get_cpu_set_arg cpu_set_arg;
int mcexec_linux_numa = 0;
int ikc_mapped = 0;
int process_rank = -1;
cpu_set_t mcexec_cpu_set;
CPU_ZERO(&mcexec_cpu_set);
@ -2424,6 +2441,7 @@ int main(int argc, char **argv)
cpu_set_arg.cpu_set_size = sizeof(desc->cpu_set);
cpu_set_arg.nr_processes = nr_processes;
cpu_set_arg.target_core = &target_core;
cpu_set_arg.process_rank = &process_rank;
cpu_set_arg.mcexec_linux_numa = &mcexec_linux_numa;
cpu_set_arg.mcexec_cpu_set = &mcexec_cpu_set;
cpu_set_arg.mcexec_cpu_set_size = sizeof(mcexec_cpu_set);
@ -2436,6 +2454,7 @@ int main(int argc, char **argv)
}
desc->cpu = target_core;
desc->process_rank = process_rank;
/* Bind to CPU cores where the LWK process' IKC target maps to */
if (ikc_mapped && !no_bind_ikc_map) {
@ -2523,6 +2542,11 @@ int main(int argc, char **argv)
}
}
desc->mcexec_flags = 0;
if (hfi1_enabled) {
desc->mcexec_flags |= MCEXEC_HFI1;
}
if (ioctl(fd, MCEXEC_UP_PREPARE_IMAGE, (unsigned long)desc) != 0) {
perror("prepare");
close(fd);
@ -3308,7 +3332,6 @@ int main_loop(struct thread_data_s *my_thread)
memset(&w, '\0', sizeof w);
w.cpu = cpu;
w.pid = getpid();
while (((ret = ioctl(fd, MCEXEC_UP_WAIT_SYSCALL, (unsigned long)&w)) == 0) || (ret == -1 && errno == EINTR)) {
if (ret) {
@ -3499,6 +3522,7 @@ int main_loop(struct thread_data_s *my_thread)
if (ioctl(fd, MCEXEC_UP_TRANSFER, &trans) != 0) {
fprintf(stderr, "__NR_gettid(): error transfering TIDs\n");
exit(1);
}
free(tids);
@ -4189,6 +4213,7 @@ return_execve2:
}
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case 801: {// swapout
#ifdef ENABLE_QLMPI
int rc;
@ -4378,6 +4403,11 @@ return_linux_spawn:
break;
}
case __NR_writev:
ret = do_generic_syscall(&w);
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
default:
if (archdep_syscall(&w, &ret)) {
ret = do_generic_syscall(&w);

View File

@ -8,6 +8,7 @@ OBJS += process.o copy.o waitq.o futex.o timer.o plist.o fileobj.o shmobj.o
OBJS += zeroobj.o procfs.o devobj.o sysfs.o xpmem.o profile.o freeze.o
OBJS += rbtree.o
OBJS += pager.o
OBJS += file_ops.o user_sdma.o sdma.o user_exp_rcv.o chip.o
# POSTK_DEBUG_ARCH_DEP_18 coredump arch separation.
DEPSRCS=$(wildcard $(SRC)/*.c)

View File

@ -68,6 +68,11 @@ static void ap_wait(void)
init_host_ikc2mckernel();
init_host_ikc2linux(ikc_cpu);
mcs_lock_unlock_noirq(&ap_syscall_semaphore, &mcs_node);
{
extern void hfi1_kmalloc_cache_prealloc(void);
hfi1_kmalloc_cache_prealloc();
}
}
/* one of them listens */

kernel/chip.c (new file, 126 lines)
View File

@ -0,0 +1,126 @@
/*
* Copyright(c) 2015, 2016 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* BSD LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* - Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
/*
* This file contains all of the code that is specific to the HFI chip,
* or the parts of it that we use.
*/
#include <hfi1/hfi.h>
#include <hfi1/chip_registers.h>
#include <hfi1/chip.h>
//#define DEBUG_PRINT_CHIP
#ifdef DEBUG_PRINT_CHIP
#define dkprintf(...) kprintf(__VA_ARGS__)
#else
#define dkprintf(...) do { if(0) kprintf(__VA_ARGS__); } while (0)
#endif
/*
* index is the index into the receive array
*/
void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
u32 type, unsigned long pa, u16 order)
{
u64 reg;
void __iomem *base = (dd->rcvarray_wc ? dd->rcvarray_wc :
(dd->kregbase1 + RCV_ARRAY));
if (!(dd->flags & HFI1_PRESENT))
goto done;
if (type == PT_INVALID) {
pa = 0;
} else if (type > PT_INVALID) {
kprintf("unexpected receive array type %u for index %u, not handled\n",
type, index);
goto done;
}
#ifdef TIDRDMA_DEBUG
hfi1_cdbg(TID, "type %s, index 0x%x, pa 0x%lx, bsize 0x%lx",
pt_name(type), index, pa, (unsigned long)order);
#endif
#define RT_ADDR_SHIFT 12 /* 4KB kernel address boundary */
reg = RCV_ARRAY_RT_WRITE_ENABLE_SMASK
| (u64)order << RCV_ARRAY_RT_BUF_SIZE_SHIFT
| ((pa >> RT_ADDR_SHIFT) & RCV_ARRAY_RT_ADDR_MASK)
<< RCV_ARRAY_RT_ADDR_SHIFT;
dkprintf("type %d, index 0x%x, pa 0x%lx, bsize 0x%lx, reg 0x%llx\n",
type, index, pa, (unsigned long)order, reg);
writeq(reg, base + (index * 8));
if (type == PT_EAGER)
/*
* Eager entries are written one-by-one so we have to push them
* after we write the entry.
*/
flush_wc();
done:
return;
}
void hfi1_clear_tids(struct hfi1_ctxtdata *rcd)
{
struct hfi1_devdata *dd = rcd->dd;
u32 i;
#if 0
/* this could be optimized */
for (i = rcd->eager_base; i < rcd->eager_base +
rcd->egrbufs.alloced; i++)
hfi1_put_tid(dd, i, PT_INVALID, 0, 0);
#endif
for (i = rcd->expected_base;
i < rcd->expected_base + rcd->expected_count; i++)
hfi1_put_tid(dd, i, PT_INVALID, 0, 0);
}
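A small standalone sketch of the RcvArray entry encoding that hfi1_put_tid() performs above, using the RCV_ARRAY_RT_* constants introduced in chip_registers.h in this change (the physical address and order below are made-up example values):
#include <stdint.h>
#include <stdio.h>

#define RCV_ARRAY_RT_ADDR_MASK          0xFFFFFFFFFull
#define RCV_ARRAY_RT_ADDR_SHIFT         0
#define RCV_ARRAY_RT_BUF_SIZE_SHIFT     36
#define RCV_ARRAY_RT_WRITE_ENABLE_SMASK 0x8000000000000000ull
#define RT_ADDR_SHIFT                   12 /* 4KB kernel address boundary */

int main(void)
{
	uint64_t pa = 0x12345000;	/* example physical address */
	uint16_t order = 3;		/* example buffer size encoding */
	uint64_t reg;

	/* Same composition as hfi1_put_tid(): write-enable bit, buffer size,
	 * and the 4KB-aligned physical address packed into one 64-bit word. */
	reg = RCV_ARRAY_RT_WRITE_ENABLE_SMASK
		| (uint64_t)order << RCV_ARRAY_RT_BUF_SIZE_SHIFT
		| ((pa >> RT_ADDR_SHIFT) & RCV_ARRAY_RT_ADDR_MASK)
			<< RCV_ARRAY_RT_ADDR_SHIFT;

	printf("reg = 0x%016llx\n", (unsigned long long)reg);
	/* prints: reg = 0x8000003000012345 */
	return 0;
}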

View File

@ -5,7 +5,7 @@ PHDRS
}
SECTIONS
{
. = 0xffffffff80001000;
. = 0xFFFFFFFFFE801000;
_head = .;
.text : {

kernel/file_ops.c (new file, 291 lines)
View File

@ -0,0 +1,291 @@
#include <hfi1/file_ops.h>
#include <hfi1/hfi.h>
#include <hfi1/user_sdma.h>
#include <hfi1/sdma.h>
#include <hfi1/ihk_hfi1_common.h>
#include <hfi1/user_exp_rcv.h>
#include <errno.h>
//#define DEBUG_PRINT_FOPS
#ifdef DEBUG_PRINT_FOPS
#define dkprintf(...) kprintf(__VA_ARGS__)
#define ekprintf(...) kprintf(__VA_ARGS__)
#else
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
#define ekprintf(...) kprintf(__VA_ARGS__)
#endif
long hfi1_file_ioctl(void *private_data, unsigned int cmd,
unsigned long arg, unsigned long t_s)
{
struct hfi1_filedata *fd = private_data;
struct hfi1_ctxtdata *uctxt = fd->uctxt;
struct hfi1_tid_info tinfo;
unsigned long addr;
int ret = -ENOTSUPP;
hfi1_cdbg(IOCTL, "IOCTL recv: 0x%x", cmd);
if (cmd != HFI1_IOCTL_ASSIGN_CTXT &&
cmd != HFI1_IOCTL_GET_VERS &&
!uctxt)
return -EINVAL;
switch (cmd) {
case HFI1_IOCTL_ASSIGN_CTXT:
#if 0
if (uctxt)
return -EINVAL;
if (copy_from_user(&uinfo,
(struct hfi1_user_info __user *)arg,
sizeof(uinfo)))
return -EFAULT;
ret = assign_ctxt(fp, &uinfo);
if (ret < 0)
return ret;
ret = setup_ctxt(fp);
if (ret)
return ret;
ret = user_init(fp);
#endif
dkprintf("%s: HFI1_IOCTL_ASSIGN_CTXT \n", __FUNCTION__);
break;
case HFI1_IOCTL_CTXT_INFO:
#if 0
ret = get_ctxt_info(fp, (void __user *)(unsigned long)arg,
sizeof(struct hfi1_ctxt_info));
#endif
dkprintf("%s: HFI1_IOCTL_CTXT_INFO \n", __FUNCTION__);
break;
case HFI1_IOCTL_USER_INFO:
#if 0
ret = get_base_info(fp, (void __user *)(unsigned long)arg,
sizeof(struct hfi1_base_info));
#endif
dkprintf("%s: HFI1_IOCTL_USER_INFO \n", __FUNCTION__);
break;
case HFI1_IOCTL_CREDIT_UPD:
#if 0
if (uctxt)
sc_return_credits(uctxt->sc);
#endif
dkprintf("%s: HFI1_IOCTL_CREDIT_UPD \n", __FUNCTION__);
break;
case HFI1_IOCTL_TID_UPDATE:
dkprintf("%s: HFI1_IOCTL_TID_UPDATE \n", __FUNCTION__);
if (copy_from_user(&tinfo,
(struct hfi11_tid_info __user *)arg,
sizeof(tinfo)))
return -EFAULT;
ret = hfi1_user_exp_rcv_setup(fd, &tinfo);
if (!ret) {
/*
* Copy the number of tidlist entries we used
* and the length of the buffer we registered.
* These fields are adjacent in the structure so
* we can copy them at the same time.
*/
addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
sizeof(tinfo.tidcnt) +
sizeof(tinfo.length)))
ret = -EFAULT;
}
break;
case HFI1_IOCTL_TID_FREE:
dkprintf("%s: HFI1_IOCTL_TID_FREE \n", __FUNCTION__);
if (copy_from_user(&tinfo,
(struct hfi11_tid_info __user *)arg,
sizeof(tinfo)))
return -EFAULT;
ret = hfi1_user_exp_rcv_clear(fd, &tinfo);
if (ret)
break;
addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
sizeof(tinfo.tidcnt)))
ret = -EFAULT;
break;
case HFI1_IOCTL_TID_INVAL_READ:
dkprintf("%s: HFI1_IOCTL_TID_INVAL_READ \n", __FUNCTION__);
if (copy_from_user(&tinfo,
(struct hfi11_tid_info __user *)arg,
sizeof(tinfo)))
return -EFAULT;
ret = hfi1_user_exp_rcv_invalid(fd, &tinfo);
if (ret)
break;
addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
sizeof(tinfo.tidcnt)))
ret = -EFAULT;
break;
case HFI1_IOCTL_RECV_CTRL:
#if 0
ret = get_user(uval, (int __user *)arg);
if (ret != 0)
return -EFAULT;
ret = manage_rcvq(uctxt, fd->subctxt, uval);
#endif
dkprintf("%s: HFI1_IOCTL_RECV_CTRL \n", __FUNCTION__);
break;
case HFI1_IOCTL_POLL_TYPE:
#if 0
ret = get_user(uval, (int __user *)arg);
if (ret != 0)
return -EFAULT;
uctxt->poll_type = (typeof(uctxt->poll_type))uval;
#endif
dkprintf("%s: HFI1_IOCTL_POLL_TYPE \n", __FUNCTION__);
break;
case HFI1_IOCTL_ACK_EVENT:
#if 0
ret = get_user(ul_uval, (unsigned long __user *)arg);
if (ret != 0)
return -EFAULT;
ret = user_event_ack(uctxt, fd->subctxt, ul_uval);
#endif
dkprintf("%s: HFI1_IOCTL_ACK_EVENT \n", __FUNCTION__);
break;
case HFI1_IOCTL_SET_PKEY:
#if 0
ret = get_user(uval16, (u16 __user *)arg);
if (ret != 0)
return -EFAULT;
if (HFI1_CAP_IS_USET(PKEY_CHECK))
ret = set_ctxt_pkey(uctxt, fd->subctxt, uval16);
else
return -EPERM;
#endif
ret = -ENODEV;
dkprintf("%s: HFI1_IOCTL_SET_PKEY \n", __FUNCTION__);
break;
case HFI1_IOCTL_CTXT_RESET: {
#if 0
struct send_context *sc;
struct hfi1_devdata *dd;
if (!uctxt || !uctxt->dd || !uctxt->sc)
return -EINVAL;
/*
* There is no protection here. User level has to
* guarantee that no one will be writing to the send
* context while it is being re-initialized.
* If user level breaks that guarantee, it will break
* its own context and no one else's.
*/
dd = uctxt->dd;
sc = uctxt->sc;
/*
* Wait until the interrupt handler has marked the
* context as halted or frozen. Report error if we time
* out.
*/
wait_event_interruptible_timeout(
sc->halt_wait, (sc->flags & SCF_HALTED),
msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
if (!(sc->flags & SCF_HALTED))
return -ENOLCK;
/*
* If the send context was halted due to a Freeze,
* wait until the device has been "unfrozen" before
* resetting the context.
*/
if (sc->flags & SCF_FROZEN) {
wait_event_interruptible_timeout(
dd->event_queue,
!(ACCESS_ONCE(dd->flags) & HFI1_FROZEN),
msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
if (dd->flags & HFI1_FROZEN)
return -ENOLCK;
if (dd->flags & HFI1_FORCED_FREEZE)
/*
* Don't allow context reset if we are into
* forced freeze
*/
return -ENODEV;
sc_disable(sc);
ret = sc_enable(sc);
hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB,
uctxt->ctxt);
} else {
ret = sc_restart(sc);
}
if (!ret)
sc_return_credits(sc);
break;
#endif
dkprintf("%s: HFI1_IOCTL_CTXT_RESET \n", __FUNCTION__);
break;
}
case HFI1_IOCTL_GET_VERS:
#if 0
uval = HFI1_USER_SWVERSION;
if (put_user(uval, (int __user *)arg))
return -EFAULT;
#endif
dkprintf("%s: HFI1_IOCTL_GET_VERS \n", __FUNCTION__);
break;
default:
return -ENOTSUPP;
}
return ret;
}
ssize_t hfi1_aio_write(void *private_data, const struct iovec *iovec, unsigned long dim)
{
struct hfi1_filedata *fd = private_data;
struct hfi1_user_sdma_pkt_q *pq = fd->pq;
struct hfi1_user_sdma_comp_q *cq = fd->cq;
int done = 0, reqs = 0;
if (!cq || !pq)
return -EIO;
if (!dim)
return -EINVAL;
hfi1_cdbg(SDMA, "SDMA request from %u:%u (%lu)",
fd->uctxt->ctxt, fd->subctxt, dim);
if (atomic_read(&pq->n_reqs) == pq->n_max_reqs)
return -ENOSPC;
while (dim) {
int ret;
unsigned long count = 0;
ret = hfi1_user_sdma_process_request(
private_data, (struct iovec *)(iovec + done),
dim, &count);
if (ret) {
reqs = ret;
break;
}
dim -= count;
done += count;
reqs++;
}
return reqs;
}

View File

@ -479,9 +479,11 @@ static int process_msg_prepare_process(unsigned long rphys)
proc->sgid = pn->cred[6];
proc->fsgid = pn->cred[7];
proc->termsig = SIGCHLD;
proc->mcexec_flags = pn->mcexec_flags;
proc->mpol_flags = pn->mpol_flags;
proc->mpol_threshold = pn->mpol_threshold;
proc->nr_processes = pn->nr_processes;
proc->process_rank = pn->process_rank;
proc->heap_extension = pn->heap_extension;
/* Update NUMA binding policy if requested */

View File

@ -19,10 +19,17 @@
* CPU Local Storage (cls)
*/
struct kmalloc_cache_header {
struct kmalloc_cache_header *next;
};
struct kmalloc_header {
unsigned int front_magic;
unsigned int cpu_id;
struct list_head list;
int cpu_id;
union {
struct list_head list;
struct kmalloc_cache_header *cache;
};
int size; /* The size of this chunk without the header */
unsigned int end_magic;
/* 32 bytes */
@ -99,6 +106,12 @@ struct cpu_local_var {
struct list_head smp_func_req_list;
struct process_vm *on_fork_vm;
/* HFI1 related per-core kmalloc caches */
struct kmalloc_cache_header txreq_cache;
struct kmalloc_cache_header tids_cache;
struct kmalloc_cache_header tidlist_cache;
struct kmalloc_cache_header tid_node_cache;
} __attribute__((aligned(64)));
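The per-core caches above are singly linked free lists headed by struct kmalloc_cache_header. A minimal, self-contained sketch of how such a free list is typically used (the cache_pop()/cache_push() names and the malloc() fallback are illustrative, not McKernel APIs):
#include <stddef.h>
#include <stdlib.h>

struct kmalloc_cache_header {
	struct kmalloc_cache_header *next;
};

/* Pop a cached chunk if one is available, otherwise fall back to a fresh
 * allocation (stand-in for the real allocator). */
static void *cache_pop(struct kmalloc_cache_header *cache, size_t size)
{
	struct kmalloc_cache_header *chunk = cache->next;

	if (chunk) {
		cache->next = chunk->next;
		return chunk;
	}
	return malloc(size);
}

/* Push a chunk back so the next request on this CPU can reuse it. */
static void cache_push(struct kmalloc_cache_header *cache, void *p)
{
	struct kmalloc_cache_header *chunk = p;

	chunk->next = cache->next;
	cache->next = chunk;
}
Keeping one list per CPU in cpu_local_var means a core can normally reuse chunks without taking a lock, which is presumably what hfi1_kmalloc_cache_prealloc() sets up for the HFI1 fast paths.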

View File

@ -0,0 +1,60 @@
#ifndef _CHIP_H
#define _CHIP_H
/*
* Copyright(c) 2015, 2016 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* BSD LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* - Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
/*
* This file contains all of the defines that are specific to the HFI chip
*/
#define MAX_EXPECTED_BUFFER (2048 * 1024)
void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
u32 type, unsigned long pa, u16 order);
void hfi1_clear_tids(struct hfi1_ctxtdata *rcd);
#endif /* _CHIP_H */

View File

@ -0,0 +1,64 @@
#ifndef DEF_CHIP_REG
#define DEF_CHIP_REG
/*
* Copyright(c) 2015, 2016 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* BSD LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* - Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#define CORE 0x000000000000
#define RXE (CORE + 0x000001000000)
#define RCV_ARRAY (RXE + 0x000000200000)
#define RCV_ARRAY_CNT (RXE + 0x000000000018)
#define RCV_ARRAY_RT_ADDR_MASK 0xFFFFFFFFFull
#define RCV_ARRAY_RT_ADDR_SHIFT 0
#define RCV_ARRAY_RT_BUF_SIZE_SHIFT 36
#define RCV_ARRAY_RT_WRITE_ENABLE_SMASK 0x8000000000000000ull
#endif /* DEF_CHIP_REG */

View File

@ -0,0 +1,411 @@
/*
* Copyright(c) 2015, 2016 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* BSD LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* - Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifndef _COMMON_H
#define _COMMON_H
#ifdef __HFI1_ORIG__
#include "update/hfi1_user.h"
#else
#include <hfi1/hfi1_user.h>
#endif /* __HFI1_ORIG__ */
/*
* This file contains defines, structures, etc. that are used
* to communicate between kernel and user code.
*/
/* version of protocol header (known to chip also). In the long run,
* we should be able to generate and accept a range of version numbers;
* for now we only accept one, and it's compiled in.
*/
#define IPS_PROTO_VERSION 2
/*
* These are compile time constants that you may want to enable or disable
* if you are trying to debug problems with code or performance.
* HFI1_VERBOSE_TRACING define as 1 if you want additional tracing in
* fast path code
* HFI1_TRACE_REGWRITES define as 1 if you want register writes to be
* traced in fast path code
* _HFI1_TRACING define as 0 if you want to remove all tracing in a
* compilation unit
*/
/*
* If a packet's QP[23:16] bits match this value, then it is
* a PSM packet and the hardware will expect a KDETH header
* following the BTH.
*/
#define DEFAULT_KDETH_QP 0x80
/* driver/hw feature set bitmask */
#define HFI1_CAP_USER_SHIFT 24
#define HFI1_CAP_MASK ((1UL << HFI1_CAP_USER_SHIFT) - 1)
/* locked flag - if set, only HFI1_CAP_WRITABLE_MASK bits can be set */
#define HFI1_CAP_LOCKED_SHIFT 63
#define HFI1_CAP_LOCKED_MASK 0x1ULL
#define HFI1_CAP_LOCKED_SMASK (HFI1_CAP_LOCKED_MASK << HFI1_CAP_LOCKED_SHIFT)
/* extra bits used between kernel and user processes */
#define HFI1_CAP_MISC_SHIFT (HFI1_CAP_USER_SHIFT * 2)
#define HFI1_CAP_MISC_MASK ((1ULL << (HFI1_CAP_LOCKED_SHIFT - \
HFI1_CAP_MISC_SHIFT)) - 1)
#define HFI1_CAP_KSET(cap) ({ hfi1_cap_mask |= HFI1_CAP_##cap; hfi1_cap_mask; })
#define HFI1_CAP_KCLEAR(cap) \
({ \
hfi1_cap_mask &= ~HFI1_CAP_##cap; \
hfi1_cap_mask; \
})
#define HFI1_CAP_USET(cap) \
({ \
hfi1_cap_mask |= (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \
hfi1_cap_mask; \
})
#define HFI1_CAP_UCLEAR(cap) \
({ \
hfi1_cap_mask &= ~(HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \
hfi1_cap_mask; \
})
#define HFI1_CAP_SET(cap) \
({ \
hfi1_cap_mask |= (HFI1_CAP_##cap | (HFI1_CAP_##cap << \
HFI1_CAP_USER_SHIFT)); \
hfi1_cap_mask; \
})
#define HFI1_CAP_CLEAR(cap) \
({ \
hfi1_cap_mask &= ~(HFI1_CAP_##cap | \
(HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT)); \
hfi1_cap_mask; \
})
#define HFI1_CAP_LOCK() \
({ hfi1_cap_mask |= HFI1_CAP_LOCKED_SMASK; hfi1_cap_mask; })
#define HFI1_CAP_LOCKED() (!!(hfi1_cap_mask & HFI1_CAP_LOCKED_SMASK))
/*
* The set of capability bits that can be changed after initial load
* This set is the same for kernel and user contexts. However, for
* user contexts, the set can be further filtered by using the
* HFI1_CAP_RESERVED_MASK bits.
*/
#define HFI1_CAP_WRITABLE_MASK (HFI1_CAP_SDMA_AHG | \
HFI1_CAP_HDRSUPP | \
HFI1_CAP_MULTI_PKT_EGR | \
HFI1_CAP_NODROP_RHQ_FULL | \
HFI1_CAP_NODROP_EGR_FULL | \
HFI1_CAP_ALLOW_PERM_JKEY | \
HFI1_CAP_STATIC_RATE_CTRL | \
HFI1_CAP_PRINT_UNIMPL | \
HFI1_CAP_TID_UNMAP | \
HFI1_CAP_OPFN | \
HFI1_CAP_TID_RDMA)
/*
* A set of capability bits that are "global" and are not allowed to be
* set in the user bitmask.
*/
#define HFI1_CAP_RESERVED_MASK ((HFI1_CAP_SDMA | \
HFI1_CAP_USE_SDMA_HEAD | \
HFI1_CAP_EXTENDED_PSN | \
HFI1_CAP_PRINT_UNIMPL | \
HFI1_CAP_NO_INTEGRITY | \
HFI1_CAP_PKEY_CHECK | \
HFI1_CAP_TID_RDMA | \
HFI1_CAP_OPFN) << \
HFI1_CAP_USER_SHIFT)
/*
* Set of capabilities that need to be enabled for kernel context in
* order to be allowed for user contexts, as well.
*/
#define HFI1_CAP_MUST_HAVE_KERN (HFI1_CAP_STATIC_RATE_CTRL)
/* Default enabled capabilities (both kernel and user) */
#define HFI1_CAP_MASK_DEFAULT (HFI1_CAP_HDRSUPP | \
HFI1_CAP_NODROP_RHQ_FULL | \
HFI1_CAP_NODROP_EGR_FULL | \
HFI1_CAP_SDMA | \
HFI1_CAP_PRINT_UNIMPL | \
HFI1_CAP_STATIC_RATE_CTRL | \
HFI1_CAP_PKEY_CHECK | \
HFI1_CAP_MULTI_PKT_EGR | \
HFI1_CAP_EXTENDED_PSN | \
((HFI1_CAP_HDRSUPP | \
HFI1_CAP_MULTI_PKT_EGR | \
HFI1_CAP_STATIC_RATE_CTRL | \
HFI1_CAP_PKEY_CHECK | \
HFI1_CAP_EARLY_CREDIT_RETURN) << \
HFI1_CAP_USER_SHIFT))
/*
* A bitmask of kernel/global capabilities that should be communicated
* to user level processes.
*/
#define HFI1_CAP_K2U (HFI1_CAP_SDMA | \
HFI1_CAP_EXTENDED_PSN | \
HFI1_CAP_PKEY_CHECK | \
HFI1_CAP_NO_INTEGRITY)
#define HFI1_USER_SWVERSION ((HFI1_USER_SWMAJOR << HFI1_SWMAJOR_SHIFT) | \
HFI1_USER_SWMINOR)
#ifndef HFI1_KERN_TYPE
#define HFI1_KERN_TYPE 0
#endif
/*
* Similarly, this is the kernel version going back to the user. It's
* slightly different, in that we want to tell if the driver was built as
* part of an Intel release, or from the driver from openfabrics.org,
* kernel.org, or a standard distribution, for support reasons.
* The high bit is 0 for non-Intel and 1 for Intel-built/supplied.
*
* It's returned by the driver to the user code during initialization in the
* spi_sw_version field of hfi1_base_info, so the user code can in turn
* check for compatibility with the kernel.
*/
#define HFI1_KERN_SWVERSION ((HFI1_KERN_TYPE << 31) | HFI1_USER_SWVERSION)
/*
* Define the driver version number. This is something that refers only
* to the driver itself, not the software interfaces it supports.
*/
#ifndef HFI1_DRIVER_VERSION_BASE
#define HFI1_DRIVER_VERSION_BASE "0.9-294"
#endif
/* create the final driver version string */
#ifdef HFI1_IDSTR
#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE " " HFI1_IDSTR
#else
#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE
#endif
/*
* Diagnostics can send a packet by writing the following
* struct to the diag packet special file.
*
* This allows a custom PBC qword, so that special modes and deliberate
* changes to CRCs can be used.
*/
#define _DIAG_PKT_VERS 1
struct diag_pkt {
__u16 version; /* structure version */
__u16 unit; /* which device */
__u16 sw_index; /* send sw index to use */
__u16 len; /* data length, in bytes */
__u16 port; /* port number */
__u16 unused;
__u32 flags; /* call flags */
__u64 data; /* user data pointer */
__u64 pbc; /* PBC for the packet */
};
/* diag_pkt flags */
#define F_DIAGPKT_WAIT 0x1 /* wait until packet is sent */
/*
* The next set of defines are for packet headers, and chip register
* and memory bits that are visible to and/or used by user-mode software.
*/
/*
* Receive Header Flags
*/
#define RHF_PKT_LEN_SHIFT 0
#define RHF_PKT_LEN_MASK 0xfffull
#define RHF_PKT_LEN_SMASK (RHF_PKT_LEN_MASK << RHF_PKT_LEN_SHIFT)
#define RHF_RCV_TYPE_SHIFT 12
#define RHF_RCV_TYPE_MASK 0x7ull
#define RHF_RCV_TYPE_SMASK (RHF_RCV_TYPE_MASK << RHF_RCV_TYPE_SHIFT)
#define RHF_USE_EGR_BFR_SHIFT 15
#define RHF_USE_EGR_BFR_MASK 0x1ull
#define RHF_USE_EGR_BFR_SMASK (RHF_USE_EGR_BFR_MASK << RHF_USE_EGR_BFR_SHIFT)
#define RHF_EGR_INDEX_SHIFT 16
#define RHF_EGR_INDEX_MASK 0x7ffull
#define RHF_EGR_INDEX_SMASK (RHF_EGR_INDEX_MASK << RHF_EGR_INDEX_SHIFT)
#define RHF_DC_INFO_SHIFT 27
#define RHF_DC_INFO_MASK 0x1ull
#define RHF_DC_INFO_SMASK (RHF_DC_INFO_MASK << RHF_DC_INFO_SHIFT)
#define RHF_RCV_SEQ_SHIFT 28
#define RHF_RCV_SEQ_MASK 0xfull
#define RHF_RCV_SEQ_SMASK (RHF_RCV_SEQ_MASK << RHF_RCV_SEQ_SHIFT)
#define RHF_EGR_OFFSET_SHIFT 32
#define RHF_EGR_OFFSET_MASK 0xfffull
#define RHF_EGR_OFFSET_SMASK (RHF_EGR_OFFSET_MASK << RHF_EGR_OFFSET_SHIFT)
#define RHF_HDRQ_OFFSET_SHIFT 44
#define RHF_HDRQ_OFFSET_MASK 0x1ffull
#define RHF_HDRQ_OFFSET_SMASK (RHF_HDRQ_OFFSET_MASK << RHF_HDRQ_OFFSET_SHIFT)
#define RHF_K_HDR_LEN_ERR (0x1ull << 53)
#define RHF_DC_UNC_ERR (0x1ull << 54)
#define RHF_DC_ERR (0x1ull << 55)
#define RHF_RCV_TYPE_ERR_SHIFT 56
#define RHF_RCV_TYPE_ERR_MASK 0x7ul
#define RHF_RCV_TYPE_ERR_SMASK (RHF_RCV_TYPE_ERR_MASK << RHF_RCV_TYPE_ERR_SHIFT)
#define RHF_TID_ERR (0x1ull << 59)
#define RHF_LEN_ERR (0x1ull << 60)
#define RHF_ECC_ERR (0x1ull << 61)
#define RHF_VCRC_ERR (0x1ull << 62)
#define RHF_ICRC_ERR (0x1ull << 63)
#define RHF_ERROR_SMASK 0xffe0000000000000ull /* bits 63:53 */
/* RHF receive types */
#define RHF_RCV_TYPE_EXPECTED 0
#define RHF_RCV_TYPE_EAGER 1
#define RHF_RCV_TYPE_IB 2 /* normal IB, IB Raw, or IPv6 */
#define RHF_RCV_TYPE_ERROR 3
#define RHF_RCV_TYPE_BYPASS 4
#define RHF_RCV_TYPE_INVALID5 5
#define RHF_RCV_TYPE_INVALID6 6
#define RHF_RCV_TYPE_INVALID7 7
/* RHF receive type error - expected packet errors */
#define RHF_RTE_EXPECTED_FLOW_SEQ_ERR 0x2
#define RHF_RTE_EXPECTED_FLOW_GEN_ERR 0x4
/* RHF receive type error - eager packet errors */
#define RHF_RTE_EAGER_NO_ERR 0x0
/* RHF receive type error - IB packet errors */
#define RHF_RTE_IB_NO_ERR 0x0
/* RHF receive type error - error packet errors */
#define RHF_RTE_ERROR_NO_ERR 0x0
#define RHF_RTE_ERROR_OP_CODE_ERR 0x1
#define RHF_RTE_ERROR_KHDR_MIN_LEN_ERR 0x2
#define RHF_RTE_ERROR_KHDR_HCRC_ERR 0x3
#define RHF_RTE_ERROR_KHDR_KVER_ERR 0x4
#define RHF_RTE_ERROR_CONTEXT_ERR 0x5
#define RHF_RTE_ERROR_KHDR_TID_ERR 0x6
/* RHF receive type error - bypass packet errors */
#define RHF_RTE_BYPASS_NO_ERR 0x0
/* IB - LRH header constants */
#define HFI1_LRH_GRH 0x0003 /* 1. word of IB LRH - next header: GRH */
#define HFI1_LRH_BTH 0x0002 /* 1. word of IB LRH - next header: BTH */
/* misc. */
#define SIZE_OF_CRC 1
#define LIM_MGMT_P_KEY 0x7FFF
#define FULL_MGMT_P_KEY 0xFFFF
#define DEFAULT_P_KEY LIM_MGMT_P_KEY
#define HFI1_FECN_SHIFT 31
#define HFI1_FECN_MASK 1
#define HFI1_FECN_SMASK BIT(HFI1_FECN_SHIFT)
#define HFI1_BECN_SHIFT 30
#define HFI1_BECN_MASK 1
#define HFI1_BECN_SMASK BIT(HFI1_BECN_SHIFT)
#define HFI1_PSM_IOC_BASE_SEQ 0x0
/* Number of BTH.PSN bits used for sequence number in expected rcvs */
#define HFI1_KDETH_BTH_SEQ_SHIFT 11
#define HFI1_KDETH_BTH_SEQ_MASK (BIT(HFI1_KDETH_BTH_SEQ_SHIFT) - 1)
static inline __u64 rhf_to_cpu(const __le32 *rbuf)
{
return __le64_to_cpu(*((__le64 *)rbuf));
}
static inline u64 rhf_err_flags(u64 rhf)
{
return rhf & RHF_ERROR_SMASK;
}
static inline u32 rhf_rcv_type(u64 rhf)
{
return (rhf >> RHF_RCV_TYPE_SHIFT) & RHF_RCV_TYPE_MASK;
}
static inline u32 rhf_rcv_type_err(u64 rhf)
{
return (rhf >> RHF_RCV_TYPE_ERR_SHIFT) & RHF_RCV_TYPE_ERR_MASK;
}
/* return size is in bytes, not DWORDs */
static inline u32 rhf_pkt_len(u64 rhf)
{
return ((rhf & RHF_PKT_LEN_SMASK) >> RHF_PKT_LEN_SHIFT) << 2;
}
static inline u32 rhf_egr_index(u64 rhf)
{
return (rhf >> RHF_EGR_INDEX_SHIFT) & RHF_EGR_INDEX_MASK;
}
static inline u32 rhf_rcv_seq(u64 rhf)
{
return (rhf >> RHF_RCV_SEQ_SHIFT) & RHF_RCV_SEQ_MASK;
}
/* returned offset is in DWORDS */
static inline u32 rhf_hdrq_offset(u64 rhf)
{
return (rhf >> RHF_HDRQ_OFFSET_SHIFT) & RHF_HDRQ_OFFSET_MASK;
}
static inline u64 rhf_use_egr_bfr(u64 rhf)
{
return rhf & RHF_USE_EGR_BFR_SMASK;
}
static inline u64 rhf_dc_info(u64 rhf)
{
return rhf & RHF_DC_INFO_SMASK;
}
static inline u32 rhf_egr_buf_offset(u64 rhf)
{
return (rhf >> RHF_EGR_OFFSET_SHIFT) & RHF_EGR_OFFSET_MASK;
}
#endif /* _COMMON_H */

View File

@ -0,0 +1,9 @@
#ifndef _HFI1_FILE_OPS_H_
#define _HFI1_FILE_OPS_H_
#include <ihk/types.h>
#include <uio.h>
ssize_t hfi1_aio_write(void *private_data, const struct iovec *iovec, unsigned long dim);
#endif

kernel/include/hfi1/hfi.h (new file, 1232 lines): diff suppressed because it is too large.

View File

@ -0,0 +1,41 @@
struct hfi1_ctxtdata {
union {
char whole_struct[1160];
struct {
char padding0[144];
u16 ctxt;
};
struct {
char padding1[168];
u32 rcv_array_groups;
};
struct {
char padding2[172];
u32 eager_base;
};
struct {
char padding3[176];
u32 expected_count;
};
struct {
char padding4[180];
u32 expected_base;
};
struct {
char padding5[184];
struct exp_tid_set tid_group_list;
};
struct {
char padding6[208];
struct exp_tid_set tid_used_list;
};
struct {
char padding7[232];
struct exp_tid_set tid_full_list;
};
struct {
char padding8[392];
struct hfi1_devdata *dd;
};
};
};
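This header (and the ones that follow) mirror the layout of the corresponding Linux hfi1 driver structures: each field McKernel needs is wrapped in an anonymous struct with a leading padding array so that it lands at its Linux byte offset, and all of these are overlaid in one union spanning the whole structure. A self-contained sketch of the technique, using the ctxt field at offset 144 from the definition above, together with a compile-time check of the offsets (the check itself is illustrative, not part of the patch):
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct shadow_ctxtdata {
	union {
		char whole_struct[1160];	/* size of the real Linux struct */
		struct {
			char padding0[144];
			uint16_t ctxt;		/* field known to sit at offset 144 */
		};
	};
};

static_assert(offsetof(struct shadow_ctxtdata, ctxt) == 144,
	      "ctxt must sit at offset 144");
static_assert(sizeof(struct shadow_ctxtdata) == 1160,
	      "shadow must span the whole original struct");
A pointer handed over from the Linux side can then be cast to the shadow type so fields are read at the correct offsets without pulling in the full driver headers; the trade-off is that the padding values must track the Linux struct layout.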

View File

@ -0,0 +1,65 @@
struct hfi1_devdata {
union {
char whole_struct[7808];
struct {
char padding0[3368];
u8 *kregbase1;
};
struct {
char padding1[3376];
resource_size_t physaddr;
};
struct {
char padding2[3704];
u64 default_desc1;
};
struct {
char padding3[3736];
dma_addr_t sdma_pad_phys;
};
struct {
char padding4[3760];
struct sdma_engine *per_sdma;
};
struct {
char padding5[3768];
struct sdma_vl_map *sdma_map;
};
struct {
char padding6[3816];
void *piobase;
};
struct {
char padding7[3824];
void *rcvarray_wc;
};
struct {
char padding8[4040];
long unsigned int *events;
};
struct {
char padding9[4076];
u32 chip_rcv_contexts;
};
struct {
char padding10[4080];
u32 chip_rcv_array_count;
};
struct {
char padding11[7264];
struct hfi1_pportdata *pport;
};
struct {
char padding12[7296];
u16 flags;
};
struct {
char padding13[7299];
u8 first_dyn_alloc_ctxt;
};
struct {
char padding14[7368];
u64 sc2vl[4];
};
};
};

View File

@ -0,0 +1,49 @@
struct hfi1_filedata {
union {
char whole_struct[104];
struct {
char padding0[0];
struct hfi1_devdata *dd;
};
struct {
char padding1[8];
struct hfi1_ctxtdata *uctxt;
};
struct {
char padding2[16];
struct hfi1_user_sdma_comp_q *cq;
};
struct {
char padding3[24];
struct hfi1_user_sdma_pkt_q *pq;
};
struct {
char padding4[32];
u16 subctxt;
};
struct {
char padding5[56];
struct tid_rb_node **entry_to_rb;
};
struct {
char padding6[64];
spinlock_t tid_lock;
};
struct {
char padding7[72];
u32 tid_used;
};
struct {
char padding8[80];
u32 *invalid_tids;
};
struct {
char padding9[88];
u32 invalid_tid_idx;
};
struct {
char padding10[92];
spinlock_t invalid_lock;
};
};
};

View File

@ -0,0 +1,29 @@
struct hfi1_user_sdma_pkt_q {
union {
char whole_struct[352];
struct {
char padding0[4];
u16 n_max_reqs;
};
struct {
char padding1[8];
atomic_t n_reqs;
};
struct {
char padding2[16];
struct hfi1_devdata *dd;
};
struct {
char padding3[32];
struct user_sdma_request *reqs;
};
struct {
char padding4[40];
long unsigned int *req_in_use;
};
struct {
char padding5[288];
enum pkt_q_sdma_state state;
};
};
};

View File

@ -0,0 +1,9 @@
struct hfi1_pportdata {
union {
char whole_struct[12992];
struct {
char padding0[2113];
u8 vls_operational;
};
};
};

View File

@ -0,0 +1,81 @@
struct sdma_engine {
union {
char whole_struct[1472];
struct {
char padding0[0];
struct hfi1_devdata *dd;
};
struct {
char padding1[16];
void *tail_csr;
};
struct {
char padding2[72];
struct hw_sdma_desc *descq;
};
struct {
char padding3[80];
unsigned int descq_full_count;
};
struct {
char padding4[88];
struct sdma_txreq **tx_ring;
};
struct {
char padding5[104];
u32 sdma_mask;
};
struct {
char padding6[112];
struct sdma_state state;
};
struct {
char padding7[180];
u8 sdma_shift;
};
struct {
char padding8[181];
u8 this_idx;
};
struct {
char padding9[256];
spinlock_t tail_lock;
};
struct {
char padding10[260];
u32 descq_tail;
};
struct {
char padding11[264];
long unsigned int ahg_bits;
};
struct {
char padding12[272];
u16 desc_avail;
};
struct {
char padding13[274];
u16 tx_tail;
};
struct {
char padding14[276];
u16 descq_cnt;
};
struct {
char padding15[320];
seqlock_t head_lock;
};
struct {
char padding16[328];
u32 descq_head;
};
struct {
char padding17[704];
spinlock_t flushlist_lock;
};
struct {
char padding18[712];
struct list_head flushlist;
};
};
};

View File

@ -0,0 +1,17 @@
struct sdma_state {
union {
char whole_struct[64];
struct {
char padding0[40];
enum sdma_states current_state;
};
struct {
char padding1[48];
unsigned int go_s99_running;
};
struct {
char padding2[52];
enum sdma_states previous_state;
};
};
};

View File

@ -0,0 +1,89 @@
struct user_sdma_request {
union {
char whole_struct[768];
struct {
char padding0[0];
struct hfi1_pkt_header hdr;
};
struct {
char padding1[64];
struct hfi1_user_sdma_pkt_q *pq;
};
struct {
char padding2[72];
struct hfi1_user_sdma_comp_q *cq;
};
struct {
char padding3[80];
struct sdma_engine *sde;
};
struct {
char padding4[88];
struct sdma_req_info info;
};
struct {
char padding5[96];
u32 *tids;
};
struct {
char padding6[104];
u32 data_len;
};
struct {
char padding7[108];
u16 n_tids;
};
struct {
char padding8[110];
u8 data_iovs;
};
struct {
char padding9[111];
s8 ahg_idx;
};
struct {
char padding10[128];
u64 seqcomp;
};
struct {
char padding11[136];
u64 seqsubmitted;
};
struct {
char padding12[192];
struct list_head txps;
};
struct {
char padding13[208];
u64 seqnum;
};
struct {
char padding14[216];
u32 tidoffset;
};
struct {
char padding15[220];
u32 koffset;
};
struct {
char padding16[224];
u32 sent;
};
struct {
char padding17[228];
u16 tididx;
};
struct {
char padding18[230];
u8 iov_idx;
};
struct {
char padding19[231];
u8 has_error;
};
struct {
char padding20[232];
struct user_sdma_iovec iovs[8];
};
};
};

View File

@ -0,0 +1,33 @@
struct user_sdma_txreq {
union {
char whole_struct[264];
struct {
char padding0[0];
struct hfi1_pkt_header hdr;
};
struct {
char padding1[64];
struct sdma_txreq txreq;
};
struct {
char padding2[224];
struct list_head list;
};
struct {
char padding3[240];
struct user_sdma_request *req;
};
struct {
char padding4[248];
u16 flags;
};
struct {
char padding5[252];
unsigned int busycount;
};
struct {
char padding6[256];
u64 seqnum;
};
};
};

View File

@ -0,0 +1,444 @@
/*
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* Copyright(c) 2015 Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* BSD LICENSE
*
* Copyright(c) 2015 Intel Corporation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* - Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
/*
* This file contains defines, structures, etc. that are used
* to communicate between kernel and user code.
*/
#ifndef _LINUX__HFI1_USER_H
#define _LINUX__HFI1_USER_H
#ifdef __HFI1_ORIG__
#include <linux/types.h>
#else
#include <mc_perf_event.h>
#endif /* __HFI1_ORIG__ */
/*
* This version number is given to the driver by the user code during
* initialization in the spu_userversion field of hfi1_user_info, so
* the driver can check for compatibility with user code.
*
* The major version changes when data structures change in an incompatible
* way. The driver must be the same for initialization to succeed.
*/
#define HFI1_USER_SWMAJOR 6
/*
* Minor version differences are always compatible
* within a major version; however, if the user software version is larger
* than the driver software version, some new features and/or structure fields
* may not be implemented; the user code must deal with this if it
* cares, or it must abort after initialization reports the difference.
*/
#define HFI1_USER_SWMINOR 3
/*
* We will encode the major/minor inside a single 32bit version number.
*/
#define HFI1_SWMAJOR_SHIFT 16
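Worked out with the values above and the HFI1_USER_SWVERSION definition from the common header earlier in this change:
/* HFI1_USER_SWVERSION = (HFI1_USER_SWMAJOR << HFI1_SWMAJOR_SHIFT) | HFI1_USER_SWMINOR
 *                     = (6 << 16) | 3
 *                     = 0x00060003
 */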
/*
* Set of HW and driver capability/feature bits.
* These bit values are used to configure enabled/disabled HW and
* driver features. The same set of bits are communicated to user
* space.
*/
#define HFI1_CAP_DMA_RTAIL (1UL << 0) /* Use DMA'ed RTail value */
#define HFI1_CAP_SDMA (1UL << 1) /* Enable SDMA support */
#define HFI1_CAP_SDMA_AHG (1UL << 2) /* Enable SDMA AHG support */
#define HFI1_CAP_EXTENDED_PSN (1UL << 3) /* Enable Extended PSN support */
#define HFI1_CAP_HDRSUPP (1UL << 4) /* Enable Header Suppression */
#define HFI1_CAP_TID_RDMA (1UL << 5) /* Enable TID RDMA operations */
#define HFI1_CAP_USE_SDMA_HEAD (1UL << 6) /* DMA Hdr Q tail vs. use CSR */
#define HFI1_CAP_MULTI_PKT_EGR (1UL << 7) /* Enable multi-packet Egr buffs*/
#define HFI1_CAP_NODROP_RHQ_FULL (1UL << 8) /* Don't drop on Hdr Q full */
#define HFI1_CAP_NODROP_EGR_FULL (1UL << 9) /* Don't drop on EGR buffs full */
#define HFI1_CAP_TID_UNMAP (1UL << 10) /* Disable Expected TID caching */
#define HFI1_CAP_PRINT_UNIMPL (1UL << 11) /* Show for unimplemented feats */
#define HFI1_CAP_ALLOW_PERM_JKEY (1UL << 12) /* Allow use of permissive JKEY */
#define HFI1_CAP_NO_INTEGRITY (1UL << 13) /* Enable ctxt integrity checks */
#define HFI1_CAP_PKEY_CHECK (1UL << 14) /* Enable ctxt PKey checking */
#define HFI1_CAP_STATIC_RATE_CTRL (1UL << 15) /* Allow PBC.StaticRateControl */
#define HFI1_CAP_OPFN (1UL << 16) /* Enable the OPFN protocol */
#define HFI1_CAP_SDMA_HEAD_CHECK (1UL << 17) /* SDMA head checking */
#define HFI1_CAP_EARLY_CREDIT_RETURN (1UL << 18) /* early credit return */
#define HFI1_RCVHDR_ENTSIZE_2 (1UL << 0)
#define HFI1_RCVHDR_ENTSIZE_16 (1UL << 1)
#define HFI1_RCVDHR_ENTSIZE_32 (1UL << 2)
/* User commands. */
#define HFI1_CMD_ASSIGN_CTXT 1 /* allocate HFI and context */
#define HFI1_CMD_CTXT_INFO 2 /* find out what resources we got */
#define HFI1_CMD_USER_INFO 3 /* set up userspace */
#define HFI1_CMD_TID_UPDATE 4 /* update expected TID entries */
#define HFI1_CMD_TID_FREE 5 /* free expected TID entries */
#define HFI1_CMD_CREDIT_UPD 6 /* force an update of PIO credit */
#define HFI1_CMD_RECV_CTRL 8 /* control receipt of packets */
#define HFI1_CMD_POLL_TYPE 9 /* set the kind of polling we want */
#define HFI1_CMD_ACK_EVENT 10 /* ack & clear user status bits */
#define HFI1_CMD_SET_PKEY 11 /* set context's pkey */
#define HFI1_CMD_CTXT_RESET 12 /* reset context's HW send context */
#define HFI1_CMD_TID_INVAL_READ 13 /* read TID cache invalidations */
#define HFI1_CMD_GET_VERS 14 /* get the version of the user cdev */
/*
* User IOCTLs cannot go above 128; if they do, see common.h and change the
* base for the snoop ioctl.
*/
#define IB_IOCTL_MAGIC 0x1b /* See Documentation/ioctl/ioctl-number.txt */
/*
* Make the ioctls occupy the last 0xf0-0xff portion of the IB range
*/
#define __NUM(cmd) (HFI1_CMD_##cmd + 0xe0)
struct hfi1_cmd;
#define HFI1_IOCTL_ASSIGN_CTXT \
_IOWR(IB_IOCTL_MAGIC, __NUM(ASSIGN_CTXT), struct hfi1_user_info)
#define HFI1_IOCTL_CTXT_INFO \
_IOW(IB_IOCTL_MAGIC, __NUM(CTXT_INFO), struct hfi1_ctxt_info)
#define HFI1_IOCTL_USER_INFO \
_IOW(IB_IOCTL_MAGIC, __NUM(USER_INFO), struct hfi1_base_info)
#define HFI1_IOCTL_TID_UPDATE \
_IOWR(IB_IOCTL_MAGIC, __NUM(TID_UPDATE), struct hfi1_tid_info)
#define HFI1_IOCTL_TID_FREE \
_IOWR(IB_IOCTL_MAGIC, __NUM(TID_FREE), struct hfi1_tid_info)
#define HFI1_IOCTL_CREDIT_UPD \
_IO(IB_IOCTL_MAGIC, __NUM(CREDIT_UPD))
#define HFI1_IOCTL_RECV_CTRL \
_IOW(IB_IOCTL_MAGIC, __NUM(RECV_CTRL), int)
#define HFI1_IOCTL_POLL_TYPE \
_IOW(IB_IOCTL_MAGIC, __NUM(POLL_TYPE), int)
#define HFI1_IOCTL_ACK_EVENT \
_IOW(IB_IOCTL_MAGIC, __NUM(ACK_EVENT), unsigned long)
#define HFI1_IOCTL_SET_PKEY \
_IOW(IB_IOCTL_MAGIC, __NUM(SET_PKEY), __u16)
#define HFI1_IOCTL_CTXT_RESET \
_IO(IB_IOCTL_MAGIC, __NUM(CTXT_RESET))
#define HFI1_IOCTL_TID_INVAL_READ \
_IOWR(IB_IOCTL_MAGIC, __NUM(TID_INVAL_READ), struct hfi1_tid_info)
#define HFI1_IOCTL_GET_VERS \
_IOR(IB_IOCTL_MAGIC, __NUM(GET_VERS), int)
#define _HFI1_EVENT_FROZEN_BIT 0
#define _HFI1_EVENT_LINKDOWN_BIT 1
#define _HFI1_EVENT_LID_CHANGE_BIT 2
#define _HFI1_EVENT_LMC_CHANGE_BIT 3
#define _HFI1_EVENT_SL2VL_CHANGE_BIT 4
#define _HFI1_EVENT_TID_MMU_NOTIFY_BIT 5
#define _HFI1_MAX_EVENT_BIT _HFI1_EVENT_TID_MMU_NOTIFY_BIT
#define HFI1_EVENT_FROZEN (1UL << _HFI1_EVENT_FROZEN_BIT)
#define HFI1_EVENT_LINKDOWN (1UL << _HFI1_EVENT_LINKDOWN_BIT)
#define HFI1_EVENT_LID_CHANGE (1UL << _HFI1_EVENT_LID_CHANGE_BIT)
#define HFI1_EVENT_LMC_CHANGE (1UL << _HFI1_EVENT_LMC_CHANGE_BIT)
#define HFI1_EVENT_SL2VL_CHANGE (1UL << _HFI1_EVENT_SL2VL_CHANGE_BIT)
#define HFI1_EVENT_TID_MMU_NOTIFY (1UL << _HFI1_EVENT_TID_MMU_NOTIFY_BIT)
/*
* These are the status bits readable (in ASCII form, 64bit value)
* from the "status" sysfs file. For binary compatibility, values
* must remain as is; removed states can be reused for different
* purposes.
*/
#define HFI1_STATUS_INITTED 0x1 /* basic initialization done */
/* Chip has been found and initialized */
#define HFI1_STATUS_CHIP_PRESENT 0x20
/* IB link is at ACTIVE, usable for data traffic */
#define HFI1_STATUS_IB_READY 0x40
/* link is configured, LID, MTU, etc. have been set */
#define HFI1_STATUS_IB_CONF 0x80
/* A Fatal hardware error has occurred. */
#define HFI1_STATUS_HWERROR 0x200
/*
* Number of supported shared contexts.
* This is the maximum number of software contexts that can share
* a hardware send/receive context.
*/
#define HFI1_MAX_SHARED_CTXTS 8
/*
* Poll types
*/
#define HFI1_POLL_TYPE_ANYRCV 0x0
#define HFI1_POLL_TYPE_URGENT 0x1
/*
* This structure is passed to the driver to tell it where
* user code buffers are, sizes, etc. The offsets and sizes of the
* fields must remain unchanged, for binary compatibility. It can
* be extended, if userversion is changed so user code can tell, if needed
*/
struct hfi1_user_info {
/*
* version of user software, to detect compatibility issues.
* Should be set to HFI1_USER_SWVERSION.
*/
__u32 userversion;
__u32 pad;
/*
* If two or more processes wish to share a context, each process
* must set the subcontext_cnt and subcontext_id to the same
* values. The only restriction on the subcontext_id is that
* it be unique for a given node.
*/
__u16 subctxt_cnt;
__u16 subctxt_id;
/* 128bit UUID passed in by PSM. */
__u8 uuid[16];
};
struct hfi1_ctxt_info {
__u64 runtime_flags; /* chip/drv runtime flags (HFI1_CAP_*) */
__u32 rcvegr_size; /* size of each eager buffer */
__u16 num_active; /* number of active units */
__u16 unit; /* unit (chip) assigned to caller */
__u16 ctxt; /* ctxt on unit assigned to caller */
__u16 subctxt; /* subctxt on unit assigned to caller */
__u16 rcvtids; /* number of Rcv TIDs for this context */
__u16 credits; /* number of PIO credits for this context */
__u16 numa_node; /* NUMA node of the assigned device */
__u16 rec_cpu; /* cpu # for affinity (0xffff if none) */
__u16 send_ctxt; /* send context in use by this user context */
__u16 egrtids; /* number of RcvArray entries for Eager Rcvs */
__u16 rcvhdrq_cnt; /* number of RcvHdrQ entries */
__u16 rcvhdrq_entsize; /* size (in bytes) for each RcvHdrQ entry */
__u16 sdma_ring_size; /* number of entries in SDMA request ring */
};
struct hfi1_tid_info {
/* virtual address of first page in transfer */
__u64 vaddr;
/* pointer to tid array. this array is big enough */
__u64 tidlist;
/* number of tids programmed by this request */
__u32 tidcnt;
/* length of transfer buffer programmed by this request */
__u32 length;
};
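A hedged sketch of how user space can drive the HFI1_IOCTL_TID_UPDATE path handled in kernel/file_ops.c above with this structure (the helper name, fd origin, and buffer handling are assumptions, not taken from the patch; the definitions in this header are assumed to be visible):
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>

/* Sketch only: fd is an already opened and initialized hfi1 context. */
static int register_tid_buffer(int fd, void *buf, uint32_t len,
			       uint32_t *tidlist, uint32_t *tidcnt)
{
	struct hfi1_tid_info tinfo;

	memset(&tinfo, 0, sizeof(tinfo));
	tinfo.vaddr = (uint64_t)(uintptr_t)buf;		/* first page of the transfer */
	tinfo.length = len;				/* length of the buffer to program */
	tinfo.tidlist = (uint64_t)(uintptr_t)tidlist;	/* array the driver fills with TIDs */

	if (ioctl(fd, HFI1_IOCTL_TID_UPDATE, &tinfo) < 0)
		return -1;

	/* On success the driver copies back tidcnt (and length), the two
	 * adjacent fields written in hfi1_file_ioctl() above. */
	*tidcnt = tinfo.tidcnt;
	return 0;
}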
enum hfi1_sdma_comp_state {
FREE = 0,
QUEUED,
COMPLETE,
ERROR
};
/*
* SDMA completion ring entry
*/
struct hfi1_sdma_comp_entry {
__u32 status;
__u32 errcode;
};
/*
* Device status and notifications from driver to user-space.
*/
struct hfi1_status {
__u64 dev; /* device/hw status bits */
__u64 port; /* port state and status bits */
char freezemsg[0];
};
/*
* This structure is returned by the driver immediately after
* open to get implementation-specific info, and info specific to this
* instance.
*
* This struct must have explicit pad fields where type sizes
* may result in different alignments between 32 and 64 bit
* programs, since the 64 bit kernel requires the user code
* to have matching offsets.
*/
struct hfi1_base_info {
/* version of hardware, for feature checking. */
__u32 hw_version;
/* version of software, for feature checking. */
__u32 sw_version;
/* Job key */
__u16 jkey;
__u16 padding1;
/*
* The special QP (queue pair) value that distinguishes PSM
* protocol packets from standard IB packets.
*/
__u32 bthqp;
/* PIO credit return address. */
__u64 sc_credits_addr;
/*
* Base address of write-only pio buffers for this process.
* Each buffer has sendpio_credits*64 bytes.
*/
__u64 pio_bufbase_sop;
/*
* Base address of write-only pio buffers for this process.
* Each buffer has sendpio_credits*64 bytes.
*/
__u64 pio_bufbase;
/* address where receive buffer queue is mapped into */
__u64 rcvhdr_bufbase;
/* base address of Eager receive buffers. */
__u64 rcvegr_bufbase;
/* base address of SDMA completion ring */
__u64 sdma_comp_bufbase;
/*
* User register base for init code, not to be used directly by
* protocol or applications. Always maps real chip register space.
* the register addresses are:
* ur_rcvhdrhead, ur_rcvhdrtail, ur_rcvegrhead, ur_rcvegrtail,
* ur_rcvtidflow
*/
__u64 user_regbase;
/* notification events */
__u64 events_bufbase;
/* status page */
__u64 status_bufbase;
/* rcvhdrtail update */
__u64 rcvhdrtail_base;
/*
* shared memory pages for subctxts if ctxt is shared; these cover
* all the processes in the group sharing a single context.
* all have enough space for the num_subcontexts value on this job.
*/
__u64 subctxt_uregbase;
__u64 subctxt_rcvegrbuf;
__u64 subctxt_rcvhdrbuf;
};
enum sdma_req_opcode {
EXPECTED = 0,
EAGER
};
#define HFI1_SDMA_REQ_VERSION_MASK 0xF
#define HFI1_SDMA_REQ_VERSION_SHIFT 0x0
#define HFI1_SDMA_REQ_OPCODE_MASK 0xF
#define HFI1_SDMA_REQ_OPCODE_SHIFT 0x4
#define HFI1_SDMA_REQ_IOVCNT_MASK 0xFF
#define HFI1_SDMA_REQ_IOVCNT_SHIFT 0x8
struct sdma_req_info {
/*
* bits 0-3 - version (currently unused)
* bits 4-7 - opcode (enum sdma_req_opcode)
* bits 8-15 - io vector count
*/
__u16 ctrl;
/*
* Number of fragments contained in this request.
* User-space has already computed how many
* fragment-sized packets the user buffer will be
* split into.
*/
__u16 npkts;
/*
* Size of each fragment the user buffer will be
* split into.
*/
__u16 fragsize;
/*
* Index of the slot in the SDMA completion ring
* this request should be using. User-space is
* in charge of managing its own ring.
*/
__u16 comp_idx;
} __attribute__((packed));
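/*
* Illustrative sketch (not part of the original header): packing the ctrl
* word from the masks/shifts above; iovcnt, npkts, fragsize and comp_idx are
* hypothetical caller-supplied values.
*
*     struct sdma_req_info info = { 0 };
*
*     info.ctrl = ((0 & HFI1_SDMA_REQ_VERSION_MASK) << HFI1_SDMA_REQ_VERSION_SHIFT) |
*                 ((EXPECTED & HFI1_SDMA_REQ_OPCODE_MASK) << HFI1_SDMA_REQ_OPCODE_SHIFT) |
*                 ((iovcnt & HFI1_SDMA_REQ_IOVCNT_MASK) << HFI1_SDMA_REQ_IOVCNT_SHIFT);
*     info.npkts    = npkts;
*     info.fragsize = fragsize;
*     info.comp_idx = comp_idx;
*/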
/*
* SW KDETH header.
* swdata is SW defined portion.
*/
struct hfi1_kdeth_header {
__le32 ver_tid_offset;
__le16 jkey;
__le16 hcrc;
__le32 swdata[7];
} __attribute__((packed));
/*
* Structure describing the headers that User space uses. The
* structure above is a subset of this one.
*/
struct hfi1_pkt_header {
__le16 pbc[4];
__be16 lrh[4];
__be32 bth[3];
struct hfi1_kdeth_header kdeth;
} __attribute__((packed));
#ifdef __HFI1_ORIG__
/*
* The list of usermode accessible registers.
*/
enum hfi1_ureg {
/* (RO) DMA RcvHdr to be used next. */
ur_rcvhdrtail = 0,
/* (RW) RcvHdr entry to be processed next by host. */
ur_rcvhdrhead = 1,
/* (RO) Index of next Eager index to use. */
ur_rcvegrindextail = 2,
/* (RW) Eager TID to be processed next */
ur_rcvegrindexhead = 3,
/* (RO) Receive Eager Offset Tail */
ur_rcvegroffsettail = 4,
/* For internal use only; max register number. */
ur_maxreg,
/* (RW) Receive TID flow table */
ur_rcvtidflowtable = 256
};
#endif /* __HFI1_ORIG__ */
#endif /* _LINIUX__HFI1_USER_H */


@ -0,0 +1,310 @@
#ifndef _IHK_HFI1_COMMON_H_
#define _IHK_HFI1_COMMON_H_
#include <ihk/atomic.h>
#include <ihk/types.h>
#include <ihk/cpu.h>
#include <kmalloc.h>
#include <lwk/compiler.h>
#include <arch-lock.h>
#include <page.h>
#include <string.h>
#include <lwk/stddef.h>
//#define VERBOSE_DEBUG
#define IF_VA_ARGS(...) , ##__VA_ARGS__
//#define TP(msg, ...) kprintf("%s(%d):" msg "\n", __FUNCTION__, __LINE__ IF_VA_ARGS(__VA_ARGS__))
#define TP(msg, ...) do {} while(0)
#ifdef VERBOSE_DEBUG
#define SDMA_DBG(req, fmt, ...) kprintf("%s(%d): DBG:" fmt "\n", __FUNCTION__, __LINE__ IF_VA_ARGS(__VA_ARGS__));
#define SDMA_Q_DBG(req, fmt, ...) kprintf("%s(%d): Q_DBG:" fmt "\n", __FUNCTION__, __LINE__ IF_VA_ARGS(__VA_ARGS__));
#define hfi1_cdbg(...) kprintf("%s(%d): hfi1_cdbg: %s \n", __FUNCTION__, __LINE__, #__VA_ARGS__);
#else
#define SDMA_DBG(req, fmt, ...) do {} while(0)
#define SDMA_Q_DBG(req, fmt, ...) do {} while(0)
#define hfi1_cdbg(...) do {} while(0)
#endif
/* From: kernel-xppsl_1.5.2/include/linux/compiler.h */
#define WARN_ON(condition) ({ \
int __ret_warn_on = !!(condition); \
if (unlikely(__ret_warn_on)) \
kprintf("%s(%d): WARN: %s\n", __FUNCTION__, __LINE__, #condition); \
unlikely(__ret_warn_on); \
})
#define WARN_ON_ONCE WARN_ON // use the local definition
#ifndef ARRAY_SIZE
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
#endif
/* From: mckernel/kernel/include/xpmem_private.h */
#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK)
#define min(x, y) ({ \
__typeof__(x) _min1 = (x); \
__typeof__(y) _min2 = (y); \
(void) (&_min1 == &_min2); \
_min1 < _min2 ? _min1 : _min2;})
#define BIT_ULL(nr) (1ULL << (nr))
/* Disable debug macros */
#define trace_hfi1_ahg_allocate(...) do {} while(0)
#define trace_hfi1_ahg_deallocate(...) do {} while(0)
/* Byte swapping */
#define be32_to_cpu(x) __builtin_bswap32(x)
#define be16_to_cpu(x) __builtin_bswap16(x)
#define le32_to_cpu(x) (x)
#define le16_to_cpu(x) (x)
#define cpu_to_le16(x) (x)
#define cpu_to_le32(x) (x)
#define cpu_to_le64(x) (x)
#define __cpu_to_le64(x) (x)
#define __le64_to_cpu(x) (x)
#define __le32_to_cpu(x) (x)
#define __le16_to_cpu(x) (x)
#define cpu_to_be16(x) __builtin_bswap16(x)
#define cpu_to_be32(x) __builtin_bswap32(x)
/* Compiler */
#ifndef likely
# define likely(x) __builtin_expect(!!(x), 1)
#endif
#ifndef unlikely
# define unlikely(x) __builtin_expect(!!(x), 0)
#endif
/* Atomic ops */
#define atomic_inc ihk_atomic_inc
#define atomic_dec ihk_atomic_dec
#define atomic_read ihk_atomic_read
#define atomic_add ihk_atomic_add
#define atomic_t ihk_atomic_t
typedef ihk_spinlock_t spinlock_t;
/*
* Linux queued_spin_lock compatible spin_lock, without the queue.
*/
#define _Q_LOCKED_OFFSET 0
#define _Q_LOCKED_VAL (1U << _Q_LOCKED_OFFSET)
#define linux_spin_lock(lock) \
do { \
while (!__sync_bool_compare_and_swap( \
(unsigned int *)lock, 0, \
_Q_LOCKED_VAL)) { \
cpu_pause(); \
} \
} while (0)
#define linux_spin_unlock(lock) \
do { \
ihk_atomic_sub(_Q_LOCKED_VAL, (ihk_atomic_t *)lock); \
} while (0)
#define linux_spin_lock_irqsave(lock, flags) \
do { \
flags = cpu_disable_interrupt_save(); \
linux_spin_lock(lock); \
} while (0)
#define linux_spin_unlock_irqrestore(lock, flags) \
do { \
linux_spin_unlock(lock); \
cpu_restore_interrupt(flags); \
} while (0)
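/*
* Illustrative sketch (not part of the original header): these macros are the
* McKernel-side counterpart for a Linux queued_spin_lock word shared with the
* host driver, and are used like the usual irqsave pair; shared_lock is a
* hypothetical lock word.
*
*     unsigned long flags;
*
*     linux_spin_lock_irqsave(&shared_lock, flags);
*     // ... touch state shared with the Linux hfi1 driver ...
*     linux_spin_unlock_irqrestore(&shared_lock, flags);
*/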
/*****************************************************/
#define ____cacheline_aligned_in_smp __attribute__((aligned(64)))
#define smp_wmb() barrier()
#define smp_rmb() barrier()
#define __iomem
#define __rcu
#define __percpu
#define send_routine void *
#define GFP_KERNEL 0
// TODO: double check GFP_ATOMIC
#define GFP_ATOMIC 0
/* hfi1 pio.h */
#define SC_MAX 4 /* count of send context types */
/* kernel-xppsl_1.5.2/include/linux/seqlock.h */
/***********************************************/
typedef struct seqcount {
unsigned sequence;
} seqcount_t;
typedef struct {
struct seqcount seqcount;
spinlock_t lock;
} seqlock_t;
static inline unsigned raw_seqcount_begin(const seqcount_t *s)
{
unsigned ret = ACCESS_ONCE(s->sequence);
smp_rmb();
return ret & ~1;
}
/***********************************************/
/* kernel-xppsl_1.5.2/include/linux/kref.h */
struct kref {
atomic_t refcount;
};
struct wait_queue_head_t {
spinlock_t lock;
struct list_head task_list;
};
typedef struct wait_queue_head_t wait_queue_head_t;
struct completion {
unsigned int done;
wait_queue_head_t wait;
};
/* kernel-xppsl_1.5.2/include/linux/interrupt.h */
struct tasklet_struct
{
struct tasklet_struct *next;
unsigned long state;
atomic_t count;
void (*func)(unsigned long);
unsigned long data;
};
/* Misc */
/* From: kernel-xppsl_1.5.2/include/linux/kernel.h */
#define min_t(type, x, y) ({ \
type __min1 = (x); \
type __min2 = (y); \
__min1 < __min2 ? __min1: __min2; })
#define SIZE_MAX (~(size_t)0)
#define MAX_TID_PAIR_ENTRIES 1024 /* max receive expected pairs */
#define PIO_BLOCK_SIZE 64 /* bytes */
/* From: chip.c/h */
#define TXE_NUM_SDMA_ENGINES 16
#define CCE_NUM_INT_CSRS 12
//num_vls = HFI1_MAX_VLS_SUPPORTED;
//num_vls = dd->chip_sdma_engines;
#define HFI1_MAX_VLS_SUPPORTED 8
/* integer typedefs */
typedef __signed__ char __s8;
typedef unsigned char __u8;
typedef __signed__ short __s16;
typedef unsigned short __u16;
typedef __signed__ int __s32;
typedef unsigned int __u32;
typedef __signed__ long long __s64;
typedef unsigned long long __u64;
typedef __u64 u64;
typedef __s64 s64;
typedef __u32 u32;
typedef __s32 s32;
typedef __u16 u16;
typedef __s16 s16;
typedef __u8 u8;
typedef __s8 s8;
typedef __u16 __le16;
typedef __u16 __be16;
typedef __u32 __le32;
typedef __u32 __be32;
typedef __u64 __le64;
typedef __u64 __be64;
typedef unsigned int uint;
/* TODO: There should be a header file that I can include */
typedef _Bool bool;
/* TODO: double check this typedef */
typedef u64 dma_addr_t;
/* From: kernel-xppsl_1.5.2/include/linux/types.h */
typedef unsigned gfp_t;
#define CONFIG_PHYS_ADDR_T_64BIT
#ifdef CONFIG_PHYS_ADDR_T_64BIT
typedef u64 phys_addr_t;
#else
typedef u32 phys_addr_t;
#endif
typedef phys_addr_t resource_size_t;
/* kernel-xppsl_1.5.2/include/asm-generic/io.h */
#ifndef __raw_writeq
static inline void __raw_writeq(u64 b, volatile void __iomem *addr)
{
*(volatile u64 __force *) addr = b;
}
#endif
#define writeq(b, addr) __raw_writeq(__cpu_to_le64(b), addr)
/* TODO: I'm not sure if this definition is correct */
#define LOCK_PREFIX "lock; "
/* From: kernel-xppsl_1.5.2/arch/x86/include/asm/bitops.h */
#define BITOP_ADDR(x) "+m" (*(volatile long *) (x))
#define LINUX_ADDR BITOP_ADDR(addr)
/* From: kernel-xppsl_1.5.2/arch/x86/include/asm/bitops.h */
static inline int test_and_set_bit(int nr, volatile unsigned long *addr)
{
int oldbit;
asm volatile(LOCK_PREFIX "bts %2,%1\n\t"
"sbb %0,%0" : "=r" (oldbit), LINUX_ADDR : "Ir" (nr) : "memory");
return oldbit;
}
/* From: kernel-xppsl_1.5.2/arch/x86/include/asm/atomic.h */
static inline int atomic_dec_and_test(atomic_t *v)
{
unsigned char c;
asm volatile(LOCK_PREFIX "decl %0; sete %1"
: "+m" (v->counter), "=qm" (c)
: : "memory");
return c != 0;
}
/* From: kernel-xppsl_1.5.2/include/linux/slab.h */
static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags)
{
if (size != 0 && n > SIZE_MAX / size)
return NULL;
return kmalloc(n * size, flags);
}
static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
{
	/* Reuse kmalloc_array() so the n * size overflow check applies here too */
	void *mem = kmalloc_array(n, size, flags);
	if (mem)
		memset(mem, 0, n * size);
	return mem;
}
#endif


@ -0,0 +1,446 @@
#ifndef _HFI1_IOWAIT_H
#define _HFI1_IOWAIT_H
/*
* Copyright(c) 2015 - 2017 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* BSD LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* - Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifdef __HFI1_ORIG__
#include <linux/list.h>
#include <linux/workqueue.h>
#include <linux/sched.h>
#include "sdma_txreq.h"
/*
* typedef (*restart_t)() - restart callback
* @work: pointer to work structure
*/
typedef void (*restart_t)(struct work_struct *work);
#endif /* __HFI1_ORIG__ */
#define IOWAIT_PENDING_IB 0x0
#define IOWAIT_PENDING_TID 0x1
/*
* A QP can have multiple Send Engines (SEs).
*
* The current use case is for supporting a TID RDMA
* packet build/xmit mechanism independent from verbs.
*/
#define IOWAIT_SES 2
#define IOWAIT_IB_SE 0
#define IOWAIT_TID_SE 1
struct sdma_txreq;
struct sdma_engine;
/**
* @iowork: the work struct
* @tx_head: list of prebuilt packets
* @iow: the parent iowait structure
*
* This structure is the work item (process) specific
* details associated with each of the two SEs of the
* QP.
*
* The workstruct and the queued TXs are unique to each
* SE.
*/
struct iowait;
struct iowait_work {
char iowork[32]; // struct work_struct iowork;
struct list_head tx_head;
struct iowait *iow;
};
/**
* @list: used to add/insert into QP/PQ wait lists
* @tx_head: overflow list of sdma_txreq's
* @sleep: no space callback
* @wakeup: space callback wakeup
* @sdma_drained: sdma count drained
* @lock: lock protected head of wait queue
* @iowork: workqueue overhead
* @wait_dma: wait for sdma_busy == 0
* @wait_pio: wait for pio_busy == 0
* @sdma_busy: # of packets in flight
* @count: total number of descriptors in tx_head'ed list
* @tx_limit: limit for overflow queuing
* @tx_count: number of tx entries in tx_head'ed list
* @flags: wait flags (one per QP)
* @wait: SE array
*
* This is to be embedded in user's state structure
* (QP or PQ).
*
* The sleep and wakeup members are a
* bit misnamed. They do not strictly
* speaking sleep or wake up, but they
* are callbacks for the ULP to implement
* whatever queuing/dequeuing of
* the embedded iowait and its containing struct
* is needed when a resource shortage such as SDMA ring space is seen.
*
* Both potentially have locks held,
* so sleeping is not allowed.
*
* The wait_dma member along with the iow
*
* The lock field is used by waiters to record
* the seqlock_t that guards the list head.
* Waiters explicitly know that, but the destroy
* code that unwaits QPs does not.
*/
/* The original size on Linux is 240 B */
struct iowait {
struct list_head list;
int (*sleep)(
struct sdma_engine *sde,
struct iowait_work *wait,
struct sdma_txreq *tx,
uint seq,
bool pkts_sent
);
void (*wakeup)(struct iowait *wait, int reason);
void (*sdma_drained)(struct iowait *wait);
seqlock_t *lock;
wait_queue_head_t wait_dma;
wait_queue_head_t wait_pio;
atomic_t sdma_busy;
atomic_t pio_busy;
u32 count;
u32 tx_limit;
u32 tx_count;
unsigned long flags;
struct iowait_work wait[IOWAIT_SES];
u8 starved_cnt;
};
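/*
* Illustrative sketch (not part of the original header): the iowait is meant
* to be embedded in the caller's state, e.g. a hypothetical packet queue:
*
*     struct my_pkt_q {              // hypothetical container
*         struct iowait busy;        // embedded iowait
*         // ... queue specific state ...
*     };
*
* The sleep/wakeup callbacks could then recover the container with
* container_of() when the SDMA layer reports a ring-space shortage or a
* drain completion.
*/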
#define SDMA_AVAIL_REASON 0
#ifdef __HFI1_ORIG__
void iowait_set_flag(struct iowait *wait, u32 flag);
bool iowait_flag_set(struct iowait *wait, u32 flag);
void iowait_clear_flag(struct iowait *wait, u32 flag);
void iowait_init(
struct iowait *wait,
u32 tx_limit,
void (*func)(struct work_struct *work),
void (*tidfunc)(struct work_struct *work),
int (*sleep)(
struct sdma_engine *sde,
struct iowait_work *wait,
struct sdma_txreq *tx,
uint seq,
bool pkts_sent),
void (*wakeup)(struct iowait *wait, int reason),
void (*sdma_drained)(struct iowait *wait));
/**
* iowait_schedule() - schedule the default send engine work
* @wait: wait struct to schedule
* @wq: workqueue for schedule
* @cpu: cpu
*/
static inline bool iowait_schedule(
struct iowait *wait,
struct workqueue_struct *wq,
int cpu)
{
hfi1_cdbg(AIOWRITE, ".");
return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_IB_SE].iowork);
}
/**
* iowait_tid_schedule - schedule the tid SE
* @wait: the iowait structure
* @wq: the work queue
* @cpu: the cpu
*/
static inline bool iowait_tid_schedule(
struct iowait *wait,
struct workqueue_struct *wq,
int cpu)
{
hfi1_cdbg(AIOWRITE, ".");
return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_TID_SE].iowork);
}
/**
* iowait_sdma_drain() - wait for DMAs to drain
* @wait: iowait structure
*
* This will delay until the iowait sdmas have
* completed.
*/
static inline void iowait_sdma_drain(struct iowait *wait)
{
hfi1_cdbg(AIOWRITE, ".");
wait_event(wait->wait_dma, !atomic_read(&wait->sdma_busy));
}
/**
* iowait_sdma_pending() - return sdma pending count
*
* @wait: iowait structure
*
*/
static inline int iowait_sdma_pending(struct iowait *wait)
{
hfi1_cdbg(AIOWRITE, ".");
return atomic_read(&wait->sdma_busy);
}
/**
* iowait_sdma_inc - note sdma io pending
* @wait: iowait structure
*/
static inline void iowait_sdma_inc(struct iowait *wait)
{
hfi1_cdbg(AIOWRITE, ".");
atomic_inc(&wait->sdma_busy);
}
#endif
/**
* iowait_sdma_add - add count to pending
* @wait: iowait structure
* @count: number to add to the sdma_busy count
*/
static inline void iowait_sdma_add(struct iowait *wait, int count)
{
hfi1_cdbg(AIOWRITE, ".");
atomic_add(count, &wait->sdma_busy);
}
#ifdef __HFI1_ORIG__
/**
* iowait_pio_drain() - wait for pios to drain
*
* @wait: iowait structure
*
* This will delay until the iowait pios have
* completed.
*/
static inline void iowait_pio_drain(struct iowait *wait)
{
hfi1_cdbg(AIOWRITE, ".");
wait_event_timeout(wait->wait_pio,
!atomic_read(&wait->pio_busy),
HZ);
}
/**
* iowait_pio_pending() - return pio pending count
*
* @wait: iowait structure
*
*/
static inline int iowait_pio_pending(struct iowait *w)
{
hfi1_cdbg(AIOWRITE, ".");
return atomic_read(&w->pio_busy);
}
/**
* iowait_drain_wakeup() - trigger iowait_drain() waiter
* @wait: iowait structure
*
* This will trigger any waiters.
*/
static inline void iowait_drain_wakeup(struct iowait *w)
{
hfi1_cdbg(AIOWRITE, ".");
wake_up(&w->wait_dma);
wake_up(&w->wait_pio);
if (w->sdma_drained)
w->sdma_drained(w);
}
/**
* iowait_pio_inc - note pio pending
* @wait: iowait structure
*/
static inline void iowait_pio_inc(struct iowait *wait)
{
hfi1_cdbg(AIOWRITE, ".");
atomic_inc(&wait->pio_busy);
}
/**
* iowait_pio_dec - note pio complete
* @wait: iowait structure
*/
static inline int iowait_pio_dec(struct iowait *wait)
{
hfi1_cdbg(AIOWRITE, ".");
if (!wait)
return 0;
return atomic_dec_and_test(&wait->pio_busy);
}
/**
* iowait_sdma_dec - note sdma complete
* @wait: iowait structure
*/
static inline int iowait_sdma_dec(struct iowait *wait)
{
hfi1_cdbg(AIOWRITE, ".");
if (!wait)
return 0;
return atomic_dec_and_test(&wait->sdma_busy);
}
/**
* iowait_get_txhead() - get packet off of iowait list
* @wait: wait structure
*/
static inline struct sdma_txreq *iowait_get_txhead(struct iowait_work *wait)
{
struct sdma_txreq *tx = NULL;
hfi1_cdbg(AIOWRITE, ".");
if (!list_empty(&wait->tx_head)) {
tx = list_first_entry(
&wait->tx_head,
struct sdma_txreq,
list);
list_del_init(&tx->list);
}
return tx;
}
static inline u16 iowait_get_desc(struct iowait_work *w)
{
u16 num_desc = 0;
struct sdma_txreq *tx = NULL;
hfi1_cdbg(AIOWRITE, ".");
if (!list_empty(&w->tx_head)) {
tx = list_first_entry(
&w->tx_head,
struct sdma_txreq,
list);
num_desc = tx->num_desc;
}
return num_desc;
}
static inline u32 iowait_get_all_desc(struct iowait *w)
{
u32 num_desc = 0;
hfi1_cdbg(AIOWRITE, ".");
num_desc = iowait_get_desc(&w->wait[IOWAIT_IB_SE]);
num_desc += iowait_get_desc(&w->wait[IOWAIT_TID_SE]);
return num_desc;
}
/**
* iowait_packet_queued() - determine if a packet is queued
* @wait: the wait structure
*/
static inline bool iowait_packet_queued(struct iowait_work *w)
{
hfi1_cdbg(AIOWRITE, ".");
return !list_empty(&w->tx_head);
}
#endif /* __HFI1_ORIG__ */
/**
* iowait_inc_wait_count - increment wait counts
* @w: the iowait_work struct
* @n: the count
*/
static inline void iowait_inc_wait_count(struct iowait_work *w, u16 n)
{
hfi1_cdbg(AIOWRITE, ".");
if (!w)
return;
w->iow->tx_count++;
w->iow->count += n;
}
#ifdef __HFI1_ORIG__
/**
* iowait_get_tid_work - return iowait_work for tid SE
* @w: the iowait struct
*/
static inline struct iowait_work *iowait_get_tid_work(struct iowait *w)
{
hfi1_cdbg(AIOWRITE, ".");
return &w->wait[IOWAIT_TID_SE];
}
#endif /* __HFI1_ORIG__ */
/**
* iowait_get_ib_work - return iowait_work for ib SE
* @w: the iowait struct
*/
static inline struct iowait_work *iowait_get_ib_work(struct iowait *w)
{
hfi1_cdbg(AIOWRITE, ".");
return &w->wait[IOWAIT_IB_SE];
}
/**
* iowait_ioww_to_iow - return iowait given iowait_work
* @w: the iowait_work struct
*/
static inline struct iowait *iowait_ioww_to_iow(struct iowait_work *w)
{
hfi1_cdbg(AIOWRITE, ".");
if (likely(w))
return w->iow;
return NULL;
}
#ifdef __HFI1_ORIG__
void iowait_cancel_work(struct iowait *w);
int iowait_set_work_flag(struct iowait_work *w);
#endif /* __HFI1_ORIG__ */
#endif

kernel/include/hfi1/sdma.h Normal file

@ -0,0 +1,983 @@
#ifndef _HFI1_SDMA_H
#define _HFI1_SDMA_H
/*
* Copyright(c) 2015, 2016 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* BSD LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* - Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <hfi1/hfi.h>
#include <hfi1/ihk_hfi1_common.h>
#include <hfi1/sdma_txreq.h>
#ifdef __HFI1_ORIG__
#include <linux/types.h>
#include <linux/list.h>
#include <asm/byteorder.h>
#include <linux/workqueue.h>
#include <linux/rculist.h>
#include "hfi.h"
#include "verbs.h"
#include "sdma_txreq.h"
#define hfi1_cdbg(which, fmt, ...) \
__hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__)
extern void __hfi1_trace_AIOWRITE(const char *func, char *fmt, ...);
#endif /* __HFI1_ORIG__ */
/* Hardware limit */
#define MAX_DESC 64
/* Hardware limit for SDMA packet size */
#define MAX_SDMA_PKT_SIZE ((16 * 1024) - 1)
#define SDMA_TXREQ_S_OK 0
#define SDMA_TXREQ_S_SENDERROR 1
#define SDMA_TXREQ_S_ABORTED 2
#define SDMA_TXREQ_S_SHUTDOWN 3
/* flags bits */
#define SDMA_TXREQ_F_URGENT 0x0001
#define SDMA_TXREQ_F_AHG_COPY 0x0002
#define SDMA_TXREQ_F_USE_AHG 0x0004
#define SDMA_MAP_NONE 0
#define SDMA_MAP_SINGLE 1
#define SDMA_MAP_PAGE 2
#define SDMA_AHG_VALUE_MASK 0xffff
#define SDMA_AHG_VALUE_SHIFT 0
#define SDMA_AHG_INDEX_MASK 0xf
#define SDMA_AHG_INDEX_SHIFT 16
#define SDMA_AHG_FIELD_LEN_MASK 0xf
#define SDMA_AHG_FIELD_LEN_SHIFT 20
#define SDMA_AHG_FIELD_START_MASK 0x1f
#define SDMA_AHG_FIELD_START_SHIFT 24
#define SDMA_AHG_UPDATE_ENABLE_MASK 0x1
#define SDMA_AHG_UPDATE_ENABLE_SHIFT 31
/* AHG modes */
/*
* Be aware that the ordering and values
* for SDMA_AHG_APPLY_UPDATE[123]
* are assumed in generating a skip
* count in submit_tx() in sdma.c
*/
#define SDMA_AHG_NO_AHG 0
#define SDMA_AHG_COPY 1
#define SDMA_AHG_APPLY_UPDATE1 2
#define SDMA_AHG_APPLY_UPDATE2 3
#define SDMA_AHG_APPLY_UPDATE3 4
/*
* Bits defined in the send DMA descriptor.
*/
#define SDMA_DESC0_FIRST_DESC_FLAG BIT_ULL(63)
#define SDMA_DESC0_LAST_DESC_FLAG BIT_ULL(62)
#define SDMA_DESC0_BYTE_COUNT_SHIFT 48
#define SDMA_DESC0_BYTE_COUNT_WIDTH 14
#define SDMA_DESC0_BYTE_COUNT_MASK \
((1ULL << SDMA_DESC0_BYTE_COUNT_WIDTH) - 1)
#define SDMA_DESC0_BYTE_COUNT_SMASK \
(SDMA_DESC0_BYTE_COUNT_MASK << SDMA_DESC0_BYTE_COUNT_SHIFT)
#define SDMA_DESC0_PHY_ADDR_SHIFT 0
#define SDMA_DESC0_PHY_ADDR_WIDTH 48
#define SDMA_DESC0_PHY_ADDR_MASK \
((1ULL << SDMA_DESC0_PHY_ADDR_WIDTH) - 1)
#define SDMA_DESC0_PHY_ADDR_SMASK \
(SDMA_DESC0_PHY_ADDR_MASK << SDMA_DESC0_PHY_ADDR_SHIFT)
#define SDMA_DESC1_HEADER_UPDATE1_SHIFT 32
#define SDMA_DESC1_HEADER_UPDATE1_WIDTH 32
#define SDMA_DESC1_HEADER_UPDATE1_MASK \
((1ULL << SDMA_DESC1_HEADER_UPDATE1_WIDTH) - 1)
#define SDMA_DESC1_HEADER_UPDATE1_SMASK \
(SDMA_DESC1_HEADER_UPDATE1_MASK << SDMA_DESC1_HEADER_UPDATE1_SHIFT)
#define SDMA_DESC1_HEADER_MODE_SHIFT 13
#define SDMA_DESC1_HEADER_MODE_WIDTH 3
#define SDMA_DESC1_HEADER_MODE_MASK \
((1ULL << SDMA_DESC1_HEADER_MODE_WIDTH) - 1)
#define SDMA_DESC1_HEADER_MODE_SMASK \
(SDMA_DESC1_HEADER_MODE_MASK << SDMA_DESC1_HEADER_MODE_SHIFT)
#define SDMA_DESC1_HEADER_INDEX_SHIFT 8
#define SDMA_DESC1_HEADER_INDEX_WIDTH 5
#define SDMA_DESC1_HEADER_INDEX_MASK \
((1ULL << SDMA_DESC1_HEADER_INDEX_WIDTH) - 1)
#define SDMA_DESC1_HEADER_INDEX_SMASK \
(SDMA_DESC1_HEADER_INDEX_MASK << SDMA_DESC1_HEADER_INDEX_SHIFT)
#define SDMA_DESC1_HEADER_DWS_SHIFT 4
#define SDMA_DESC1_HEADER_DWS_WIDTH 4
#define SDMA_DESC1_HEADER_DWS_MASK \
((1ULL << SDMA_DESC1_HEADER_DWS_WIDTH) - 1)
#define SDMA_DESC1_HEADER_DWS_SMASK \
(SDMA_DESC1_HEADER_DWS_MASK << SDMA_DESC1_HEADER_DWS_SHIFT)
#define SDMA_DESC1_GENERATION_SHIFT 2
#define SDMA_DESC1_GENERATION_WIDTH 2
#define SDMA_DESC1_GENERATION_MASK \
((1ULL << SDMA_DESC1_GENERATION_WIDTH) - 1)
#define SDMA_DESC1_GENERATION_SMASK \
(SDMA_DESC1_GENERATION_MASK << SDMA_DESC1_GENERATION_SHIFT)
#define SDMA_DESC1_INT_REQ_FLAG BIT_ULL(1)
#define SDMA_DESC1_HEAD_TO_HOST_FLAG BIT_ULL(0)
enum sdma_states {
sdma_state_s00_hw_down,
sdma_state_s10_hw_start_up_halt_wait,
sdma_state_s15_hw_start_up_clean_wait,
sdma_state_s20_idle,
sdma_state_s30_sw_clean_up_wait,
sdma_state_s40_hw_clean_up_wait,
sdma_state_s50_hw_halt_wait,
sdma_state_s60_idle_halt_wait,
sdma_state_s80_hw_freeze,
sdma_state_s82_freeze_sw_clean,
sdma_state_s99_running,
};
enum sdma_events {
sdma_event_e00_go_hw_down,
sdma_event_e10_go_hw_start,
sdma_event_e15_hw_halt_done,
sdma_event_e25_hw_clean_up_done,
sdma_event_e30_go_running,
sdma_event_e40_sw_cleaned,
sdma_event_e50_hw_cleaned,
sdma_event_e60_hw_halted,
sdma_event_e70_go_idle,
sdma_event_e80_hw_freeze,
sdma_event_e81_hw_frozen,
sdma_event_e82_hw_unfreeze,
sdma_event_e85_link_down,
sdma_event_e90_sw_halted,
};
struct sdma_set_state_action {
unsigned op_enable:1;
unsigned op_intenable:1;
unsigned op_halt:1;
unsigned op_cleanup:1;
unsigned go_s99_running_tofalse:1;
unsigned go_s99_running_totrue:1;
};
#include <hfi1/hfi1_generated_sdma_state.h>
/**
* DOC: sdma exported routines
*
* These sdma routines fit into three categories:
* - The SDMA API for building and submitting packets
* to the ring
*
* - Initialization and tear down routines to buildup
* and tear down SDMA
*
* - ISR entrances to handle interrupts, state changes
* and errors
*/
/**
* DOC: sdma PSM/verbs API
*
* The sdma API is designed to be used by both PSM
* and verbs to supply packets to the SDMA ring.
*
* The usage of the API is as follows:
*
* Embed a struct iowait in the QP or
* PQ. The iowait should be initialized with a
* call to iowait_init().
*
* The user of the API should create an allocation method
* for their version of the txreq. slabs, pre-allocated lists,
* and dma pools can be used. Once the user's overload of
* the sdma_txreq has been allocated, the sdma_txreq member
* must be initialized with sdma_txinit() or sdma_txinit_ahg().
*
* The txreq must be declared with the sdma_txreq first.
*
* The tx request, once initialized, is manipulated with calls to
* sdma_txadd_daddr(), sdma_txadd_page(), or sdma_txadd_kvaddr()
* for each disjoint memory location. It is the user's responsibility
* to understand the packet boundaries and page boundaries to do the
* appropriate number of sdma_txadd_* calls. The user
* must be prepared to deal with failures from these routines due to
* either memory allocation or dma_mapping failures.
*
* The mapping specifics for each memory location are recorded
* in the tx. Memory locations added with sdma_txadd_page()
* and sdma_txadd_kvaddr() are automatically mapped when added
* to the tx and unmapped as part of the progress processing in the
* SDMA interrupt handling.
*
* sdma_txadd_daddr() is used to add a dma_addr_t memory location to the
* tx. An example of a use case would be a pre-allocated
* set of headers allocated via dma_pool_alloc() or
* dma_alloc_coherent(). For these memory locations, it
* is the responsibility of the user to handle that unmapping.
* (This would usually be at an unload or job termination.)
*
* The routine sdma_send_txreq() is used to submit
* a tx to the ring after the appropriate number of
* sdma_txadd_* have been done.
*
* If it is desired to send a burst of sdma_txreqs, sdma_send_txlist()
* can be used to submit a list of packets.
*
* The user is free to use the link overhead in the struct sdma_txreq as
* long as the tx isn't in flight.
*
* The extreme degenerate case of the number of descriptors
* exceeding the ring size is automatically handled as
* memory locations are added. An overflow of the descriptor
* array that is part of the sdma_txreq is also automatically
* handled.
*
*/
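/*
* Illustrative sketch (not part of the original header): the build-and-submit
* sequence described above, with error handling elided; dd, sde, pq, hdr,
* hdr_dma, payload, len and cb are hypothetical caller-owned objects.
*
*     struct sdma_txreq tx;
*     int ret;
*
*     ret = sdma_txinit(&tx, 0, sizeof(hdr) + len, cb);
*     if (!ret)
*         ret = sdma_txadd_daddr(dd, &tx, hdr_dma, sizeof(hdr));
*     if (!ret)
*         ret = sdma_txadd_kvaddr(dd, &tx, payload, len);
*     if (!ret)
*         ret = sdma_send_txreq(sde, iowait_get_ib_work(&pq->busy), &tx);
*/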
/**
* DOC: Infrastructure calls
*
* sdma_init() is used to initialize data structures and
* CSRs for the desired number of SDMA engines.
*
* sdma_start() is used to kick the SDMA engines initialized
* with sdma_init(). Interrupts must be enabled at this
* point since aspects of the state machine are interrupt
* driven.
*
* sdma_engine_error() and sdma_engine_interrupt() are
* entrances for interrupts.
*
* sdma_map_init() is for the management of the mapping
* table when the number of vls is changed.
*
*/
/*
* struct hw_sdma_desc - raw 128 bit SDMA descriptor
*
* This is the raw descriptor in the SDMA ring
*/
struct hw_sdma_desc {
/* private: don't use directly */
__le64 qw[2];
};
/**
* struct sdma_engine - Data pertaining to each SDMA engine.
* @dd: a back-pointer to the device data
* @ppd: per port back-pointer
* @imask: mask for irq manipulation
* @idle_mask: mask for determining if an interrupt is due to sdma_idle
*
* This structure has the state for each sdma_engine.
*
* Accessing non-public fields is not supported
* since the private members are subject to change.
*/
/* The original size on Linux is 1472 B */
#include <hfi1/hfi1_generated_sdma_engine.h>
#ifdef __HFI1_ORIG__
int sdma_init(struct hfi1_devdata *dd, u8 port);
void sdma_start(struct hfi1_devdata *dd);
void sdma_exit(struct hfi1_devdata *dd);
void sdma_all_running(struct hfi1_devdata *dd);
void sdma_all_idle(struct hfi1_devdata *dd);
void sdma_freeze_notify(struct hfi1_devdata *dd, int go_idle);
void sdma_freeze(struct hfi1_devdata *dd);
void sdma_unfreeze(struct hfi1_devdata *dd);
void sdma_wait(struct hfi1_devdata *dd);
/**
* sdma_empty() - idle engine test
* @engine: sdma engine
*
* Currently used by verbs as a latency optimization.
*
* Return:
* 1 - empty, 0 - non-empty
*/
static inline int sdma_empty(struct sdma_engine *sde)
{
return sde->descq_tail == sde->descq_head;
}
#endif /* __HFI1_ORIG__ */
static inline u16 sdma_descq_freecnt(struct sdma_engine *sde)
{
return sde->descq_cnt -
(sde->descq_tail -
ACCESS_ONCE(sde->descq_head)) - 1;
}
static inline u16 sdma_descq_inprocess(struct sdma_engine *sde)
{
return sde->descq_cnt - sdma_descq_freecnt(sde);
}
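/*
* Worked example (illustrative, not part of the original header): with
* descq_cnt = 1024, descq_tail = 10 and descq_head = 5, five descriptors are
* posted, sdma_descq_freecnt() returns 1024 - 5 - 1 = 1018 (one slot is
* always kept free to distinguish full from empty), and
* sdma_descq_inprocess() returns 1024 - 1018 = 6.
*/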
/*
* Either head_lock or tail lock required to see
* a steady state.
*/
static inline int __sdma_running(struct sdma_engine *engine)
{
return engine->state.current_state == sdma_state_s99_running;
}
/**
* sdma_running() - state suitability test
* @engine: sdma engine
*
* sdma_running probes the internal state to determine if it is suitable
* for submitting packets.
*
* Return:
* 1 - ok to submit, 0 - not ok to submit
*
*/
static inline int sdma_running(struct sdma_engine *engine)
{
unsigned long flags;
int ret;
linux_spin_lock_irqsave(&engine->tail_lock, flags);
ret = __sdma_running(engine);
linux_spin_unlock_irqrestore(&engine->tail_lock, flags);
return ret;
}
void _sdma_txreq_ahgadd(
struct sdma_txreq *tx,
u8 num_ahg,
u8 ahg_entry,
u32 *ahg,
u8 ahg_hlen);
/**
* sdma_txinit_ahg() - initialize an sdma_txreq struct with AHG
* @tx: tx request to initialize
* @flags: flags to key last descriptor additions
* @tlen: total packet length (pbc + headers + data)
* @ahg_entry: ahg entry to use (0 - 31)
* @num_ahg: ahg descriptor for first descriptor (0 - 9)
* @ahg: array of AHG descriptors (up to 9 entries)
* @ahg_hlen: number of bytes from ASIC entry to use
* @cb: callback
*
* The allocation of the sdma_txreq and its enclosing structure is user
* dependent. This routine must be called to initialize the user independent
* fields.
*
* The currently supported flags are SDMA_TXREQ_F_URGENT,
* SDMA_TXREQ_F_AHG_COPY, and SDMA_TXREQ_F_USE_AHG.
*
* SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the
* completion is desired as soon as possible.
*
* SDMA_TXREQ_F_AHG_COPY causes the header in the first descriptor to be
* copied to chip entry. SDMA_TXREQ_F_USE_AHG causes the code to add in
* the AHG descriptors into the first 1 to 3 descriptors.
*
* Completions of submitted requests can be gotten on selected
* txreqs by giving a completion routine callback to sdma_txinit() or
* sdma_txinit_ahg(). The environment in which the callback runs
* can be from an ISR, a tasklet, or a thread, so no sleeping
* kernel routines can be used. Aspects of the sdma ring may
* be locked so care should be taken with locking.
*
* The callback pointer can be NULL to avoid any callback for the packet
* being submitted. The callback will be provided this tx, a status, and a flag.
*
* The status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR,
* SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN.
*
* The flag, if the iowait had been used, indicates that the iowait
* sdma_busy count has reached zero.
*
* The user data portion of tlen should be precise. The sdma_txadd_* entrances
* will pad with a descriptor that references 1 - 3 bytes when the number of bytes
* specified in tlen has been supplied to the sdma_txreq.
*
* ahg_hlen is used to determine the number of on-chip entry bytes to
* use as the header. This is for cases where the stored header is
* larger than the header to be used in a packet. This is typical
* for verbs where an RDMA_WRITE_FIRST is larger than the packet in
* an RDMA_WRITE_MIDDLE.
*
*/
static inline int sdma_txinit_ahg(
struct sdma_txreq *tx,
u16 flags,
u16 tlen,
u8 ahg_entry,
u8 num_ahg,
u32 *ahg,
u8 ahg_hlen,
void (*cb)(struct sdma_txreq *, int))
{
if (tlen == 0)
return -ENODATA;
if (tlen > MAX_SDMA_PKT_SIZE)
return -EMSGSIZE;
tx->desc_limit = ARRAY_SIZE(tx->descs);
tx->descp = &tx->descs[0];
INIT_LIST_HEAD(&tx->list);
tx->num_desc = 0;
tx->flags = flags;
tx->complete = cb;
tx->coalesce_buf = NULL;
tx->wait = NULL;
tx->packet_len = tlen;
tx->tlen = tx->packet_len;
tx->descs[0].qw[0] = SDMA_DESC0_FIRST_DESC_FLAG;
tx->descs[0].qw[1] = 0;
if (flags & SDMA_TXREQ_F_AHG_COPY)
tx->descs[0].qw[1] |=
(((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK)
<< SDMA_DESC1_HEADER_INDEX_SHIFT) |
(((u64)SDMA_AHG_COPY & SDMA_DESC1_HEADER_MODE_MASK)
<< SDMA_DESC1_HEADER_MODE_SHIFT);
else if (flags & SDMA_TXREQ_F_USE_AHG && num_ahg)
_sdma_txreq_ahgadd(tx, num_ahg, ahg_entry, ahg, ahg_hlen);
return 0;
}
/**
* sdma_txinit() - initialize an sdma_txreq struct (no AHG)
* @tx: tx request to initialize
* @flags: flags to key last descriptor additions
* @tlen: total packet length (pbc + headers + data)
* @cb: callback pointer
*
* The allocation of the sdma_txreq and its enclosing structure is user
* dependent. This routine must be called to initialize the user
* independent fields.
*
* The currently supported flag is SDMA_TXREQ_F_URGENT.
*
* SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the
* completion is desired as soon as possible.
*
* Completions of submitted requests can be gotten on selected
* txreqs by giving a completion routine callback to sdma_txinit() or
* sdma_txinit_ahg(). The environment in which the callback runs
* can be from an ISR, a tasklet, or a thread, so no sleeping
* kernel routines can be used. The head size of the sdma ring may
* be locked so care should be taken with locking.
*
* The callback pointer can be NULL to avoid any callback for the packet
* being submitted.
*
* The callback, if non-NULL, will be provided this tx and a status. The
* status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR,
* SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN.
*
*/
static inline int sdma_txinit(
struct sdma_txreq *tx,
u16 flags,
u16 tlen,
void (*cb)(struct sdma_txreq *, int))
{
return sdma_txinit_ahg(tx, flags, tlen, 0, 0, NULL, 0, cb);
}
#ifdef __HFI1_ORIG__
/* helpers - don't use */
static inline int sdma_mapping_type(struct sdma_desc *d)
{
return (d->qw[1] & SDMA_DESC1_GENERATION_SMASK)
>> SDMA_DESC1_GENERATION_SHIFT;
}
static inline size_t sdma_mapping_len(struct sdma_desc *d)
{
return (d->qw[0] & SDMA_DESC0_BYTE_COUNT_SMASK)
>> SDMA_DESC0_BYTE_COUNT_SHIFT;
}
static inline dma_addr_t sdma_mapping_addr(struct sdma_desc *d)
{
return (d->qw[0] & SDMA_DESC0_PHY_ADDR_SMASK)
>> SDMA_DESC0_PHY_ADDR_SHIFT;
}
#endif /* __HFI1_ORIG__ */
static inline void make_tx_sdma_desc(
struct sdma_txreq *tx,
int type,
dma_addr_t addr,
size_t len)
{
struct sdma_desc *desc = &tx->descp[tx->num_desc];
if (!tx->num_desc) {
/* qw[0] zero; qw[1] first, ahg mode already in from init */
desc->qw[1] |= ((u64)type & SDMA_DESC1_GENERATION_MASK)
<< SDMA_DESC1_GENERATION_SHIFT;
} else {
desc->qw[0] = 0;
desc->qw[1] = ((u64)type & SDMA_DESC1_GENERATION_MASK)
<< SDMA_DESC1_GENERATION_SHIFT;
}
desc->qw[0] |= (((u64)addr & SDMA_DESC0_PHY_ADDR_MASK)
<< SDMA_DESC0_PHY_ADDR_SHIFT) |
(((u64)len & SDMA_DESC0_BYTE_COUNT_MASK)
<< SDMA_DESC0_BYTE_COUNT_SHIFT);
}
/* helper to extend txreq */
int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx,
int type, void *kvaddr, struct page *page,
unsigned long offset, u16 len);
void __sdma_txclean(struct hfi1_devdata *, struct sdma_txreq *);
static inline void sdma_txclean(struct hfi1_devdata *dd, struct sdma_txreq *tx)
{
if (tx->num_desc)
__sdma_txclean(dd, tx);
}
int _pad_sdma_tx_descs(struct hfi1_devdata *, struct sdma_txreq *);
/* helpers used by public routines */
static inline void _sdma_close_tx(struct hfi1_devdata *dd,
struct sdma_txreq *tx)
{
tx->descp[tx->num_desc].qw[0] |=
SDMA_DESC0_LAST_DESC_FLAG;
tx->descp[tx->num_desc].qw[1] |=
dd->default_desc1;
if (tx->flags & SDMA_TXREQ_F_URGENT)
tx->descp[tx->num_desc].qw[1] |=
(SDMA_DESC1_HEAD_TO_HOST_FLAG |
SDMA_DESC1_INT_REQ_FLAG);
}
static inline int _sdma_txadd_daddr(
struct hfi1_devdata *dd,
int type,
struct sdma_txreq *tx,
dma_addr_t addr,
u16 len)
{
int rval = 0;
make_tx_sdma_desc(
tx,
type,
addr, len);
WARN_ON(len > tx->tlen);
tx->tlen -= len;
/* special cases for last */
if (!tx->tlen) {
if (tx->packet_len & (sizeof(u32) - 1)) {
rval = _pad_sdma_tx_descs(dd, tx);
if (rval)
return rval;
} else {
_sdma_close_tx(dd, tx);
}
}
tx->num_desc++;
return rval;
}
/**
* sdma_txadd_page() - add a page to the sdma_txreq
* @dd: the device to use for mapping
* @tx: tx request to which the page is added
* @paddr: physical (DMA) address of the page fragment to add
* @len: length in bytes
*
* This is used to add a page/offset/length descriptor.
*
* The mapping/unmapping of the page/offset/len is automatically handled.
*
* Return:
* 0 - success, -ENOSPC - mapping fail, -ENOMEM - couldn't
* extend/coalesce descriptor array
*/
static inline int sdma_txadd_page(
struct hfi1_devdata *dd,
struct sdma_txreq *tx,
dma_addr_t paddr,
u16 len)
{
return _sdma_txadd_daddr(
dd, SDMA_MAP_PAGE, tx, paddr, len);
}
/**
* sdma_txadd_daddr() - add a dma address to the sdma_txreq
* @dd: the device to use for mapping
* @tx: sdma_txreq to which the page is added
* @addr: dma address mapped by caller
* @len: length in bytes
*
* This is used to add a descriptor for memory that is already dma mapped.
*
* In this case, there is no unmapping as part of the progress processing for
* this memory location.
*
* Return:
* 0 - success, -ENOMEM - couldn't extend descriptor array
*/
static inline int sdma_txadd_daddr(
struct hfi1_devdata *dd,
struct sdma_txreq *tx,
dma_addr_t addr,
u16 len)
{
int rval;
if ((unlikely(tx->num_desc == tx->desc_limit))) {
rval = ext_coal_sdma_tx_descs(dd, tx, SDMA_MAP_NONE,
NULL, NULL, 0, 0);
if (rval <= 0)
return rval;
}
return _sdma_txadd_daddr(dd, SDMA_MAP_NONE, tx, addr, len);
}
/**
* sdma_txadd_kvaddr() - add a kernel virtual address to sdma_txreq
* @dd: the device to use for mapping
* @tx: sdma_txreq to which the page is added
* @kvaddr: the kernel virtual address
* @len: length in bytes
*
* This is used to add a descriptor referenced by the indicated kvaddr and
* len.
*
* The mapping/unmapping of the kvaddr and len is automatically handled.
*
* Return:
* 0 - success, -ENOSPC - mapping fail, -ENOMEM - couldn't extend/coalesce
* descriptor array
*/
static inline int sdma_txadd_kvaddr(
struct hfi1_devdata *dd,
struct sdma_txreq *tx,
void *kvaddr,
u16 len)
{
dma_addr_t addr;
int rval;
if ((unlikely(tx->num_desc == tx->desc_limit))) {
rval = ext_coal_sdma_tx_descs(dd, tx, SDMA_MAP_SINGLE,
kvaddr, NULL, 0, len);
if (rval <= 0)
return rval;
}
addr = virt_to_phys(kvaddr);
return _sdma_txadd_daddr(
dd, SDMA_MAP_SINGLE, tx, addr, len);
}
struct iowait_wait;
int sdma_send_txreq(struct sdma_engine *sde,
struct iowait_work *wait,
struct sdma_txreq *tx);
int sdma_send_txlist(struct sdma_engine *sde,
struct iowait_work *wait,
struct list_head *tx_list,
u32 *count);
int sdma_ahg_alloc(struct sdma_engine *sde);
void sdma_ahg_free(struct sdma_engine *sde, int ahg_index);
/**
* sdma_build_ahg_descriptor() - build ahg descriptor
* @data: value to place in the header field
* @dwindex: index of the header dword to update
* @startbit: first bit of the field within the dword
* @bits: width of the field in bits
*
* Build and return a 32 bit descriptor.
*/
static inline u32 sdma_build_ahg_descriptor(
u16 data,
u8 dwindex,
u8 startbit,
u8 bits)
{
return (u32)(1UL << SDMA_AHG_UPDATE_ENABLE_SHIFT |
((startbit & SDMA_AHG_FIELD_START_MASK) <<
SDMA_AHG_FIELD_START_SHIFT) |
((bits & SDMA_AHG_FIELD_LEN_MASK) <<
SDMA_AHG_FIELD_LEN_SHIFT) |
((dwindex & SDMA_AHG_INDEX_MASK) <<
SDMA_AHG_INDEX_SHIFT) |
((data & SDMA_AHG_VALUE_MASK) <<
SDMA_AHG_VALUE_SHIFT));
}
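/*
* Illustrative sketch (not part of the original header): building one AHG
* update word that replaces a 12-bit field starting at bit 0 of header
* dword 3 with new_val; the inputs are hypothetical.
*
*     u32 ahg[1];
*
*     ahg[0] = sdma_build_ahg_descriptor(new_val, 3, 0, 12);
*/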
#ifdef __HFI1_ORIG__
/**
* sdma_progress - use seq number to detect head progress
* @sde: sdma_engine to check
* @seq: base seq count
* @tx: txreq for which we need to check descriptor availability
*
* This is used in the appropriate spot in the sleep routine
* to check for potential ring progress. This routine gets the
* seqcount before queuing the iowait structure for progress.
*
* If the seqcount indicates that progress needs to be checked,
* re-submission is detected by checking whether the descriptor
* queue has enough descriptors for the txreq.
*/
static inline unsigned sdma_progress(struct sdma_engine *sde, unsigned seq,
struct sdma_txreq *tx)
{
if (read_seqretry(&sde->head_lock, seq)) {
sde->desc_avail = sdma_descq_freecnt(sde);
if (tx->num_desc > sde->desc_avail)
return 0;
return 1;
}
return 0;
}
/**
* sdma_iowait_schedule() - schedule progress for a wait structure
* @sde: sdma_engine to schedule
* @wait: wait struct to schedule
*
* This function schedules progress work for the iowait
* structure embedded in the QP or PQ.
*
*/
static inline void sdma_iowait_schedule(
struct sdma_engine *sde,
struct iowait *wait)
{
struct hfi1_pportdata *ppd = sde->dd->pport;
iowait_schedule(wait, ppd->hfi1_wq, sde->cpu);
}
/* for use by interrupt handling */
void sdma_engine_error(struct sdma_engine *sde, u64 status);
void sdma_engine_interrupt(struct sdma_engine *sde, u64 status);
/*
*
* The diagram below details the relationship of the mapping structures
*
* Since the mapping now allows for non-uniform engines per vl, the
* number of engines for a vl is either the vl_engines[vl] or
* a computation based on num_sdma/num_vls:
*
* For example:
* nactual = vl_engines ? vl_engines[vl] : num_sdma/num_vls
*
* n = roundup to next highest power of 2 using nactual
*
* In the case where num_sdma/num_vls doesn't divide
* evenly, the extras are added from the last vl downward.
*
* For the case where n > nactual, the engines are assigned
* in a round robin fashion wrapping back to the first engine
* for a particular vl.
*
* dd->sdma_map
* | sdma_map_elem[0]
* | +--------------------+
* v | mask |
* sdma_vl_map |--------------------|
* +--------------------------+ | sde[0] -> eng 1 |
* | list (RCU) | |--------------------|
* |--------------------------| ->| sde[1] -> eng 2 |
* | mask | --/ |--------------------|
* |--------------------------| -/ | * |
* | actual_vls (max 8) | -/ |--------------------|
* |--------------------------| --/ | sde[n] -> eng n |
* | vls (max 8) | -/ +--------------------+
* |--------------------------| --/
* | map[0] |-/
* |--------------------------| +--------------------+
* | map[1] |--- | mask |
* |--------------------------| \---- |--------------------|
* | * | \-- | sde[0] -> eng 1+n |
* | * | \---- |--------------------|
* | * | \->| sde[1] -> eng 2+n |
* |--------------------------| |--------------------|
* | map[vls - 1] |- | * |
* +--------------------------+ \- |--------------------|
* \- | sde[m] -> eng m+n |
* \ +--------------------+
* \-
* \
* \- +--------------------+
* \- | mask |
* \ |--------------------|
* \- | sde[0] -> eng 1+m+n|
* \- |--------------------|
* >| sde[1] -> eng 2+m+n|
* |--------------------|
* | * |
* |--------------------|
* | sde[o] -> eng o+m+n|
* +--------------------+
*
*/
#endif /* __HFI1_ORIG__ */
/**
* struct sdma_map_elem - mapping for a vl
* @mask - selector mask
* @sde - array of engines for this vl
*
* The mask is used to "mod" the selector
* to produce an index into the trailing
* array of sdes.
*/
struct sdma_map_elem {
u32 mask;
struct sdma_engine *sde[0];
};
/**
* struct sdma_map_el - mapping for a vl
* @engine_to_vl - map of an engine to a vl
* @list - rcu head for free callback
* @mask - vl mask to "mod" the vl to produce an index to map array
* @actual_vls - number of vls
* @vls - number of vls rounded to next power of 2
* @map - array of sdma_map_elem entries
*
* This is the parent mapping structure. The trailing
* members of the struct point to sdma_map_elem entries, which
* in turn point to an array of sde's for that vl.
*/
struct sdma_vl_map {
s8 engine_to_vl[TXE_NUM_SDMA_ENGINES];
char list[16]; // struct rcu_head list;
u32 mask;
u8 actual_vls;
u8 vls;
struct sdma_map_elem *map[0];
};
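/*
* Illustrative sketch (not part of the original header): conceptually, an
* engine lookup through these structures is a two-level mask-and-index walk;
* vl and selector are caller supplied and the map comes from sdma_map_init()
* (on Linux the map pointer is RCU-protected).
*
*     struct sdma_vl_map *m = dd->sdma_map;
*     struct sdma_map_elem *e = m->map[vl & m->mask];
*     struct sdma_engine *sde = e->sde[selector & e->mask];
*/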
#ifdef __HFI1_ORIG__
int sdma_map_init(
struct hfi1_devdata *dd,
u8 port,
u8 num_vls,
u8 *vl_engines);
/* slow path */
void _sdma_engine_progress_schedule(struct sdma_engine *sde);
/**
* sdma_engine_progress_schedule() - schedule progress on engine
* @sde: sdma_engine to schedule progress
*
* This is the fast path.
*
*/
static inline void sdma_engine_progress_schedule(
struct sdma_engine *sde)
{
if (!sde || sdma_descq_inprocess(sde) < (sde->descq_cnt / 8))
return;
_sdma_engine_progress_schedule(sde);
}
struct sdma_engine *sdma_select_engine_sc(
struct hfi1_devdata *dd,
u32 selector,
u8 sc5);
#endif /* __HFI1_ORIG__ */
struct sdma_engine *sdma_select_engine_vl(
struct hfi1_devdata *dd,
u32 selector,
u8 vl);
struct sdma_engine *sdma_select_user_engine(struct hfi1_devdata *dd,
u32 selector, u8 vl);
#ifdef __HFI1_ORIG__
ssize_t sdma_get_cpu_to_sde_map(struct sdma_engine *sde, char *buf);
ssize_t sdma_set_cpu_to_sde_map(struct sdma_engine *sde, const char *buf,
size_t count);
int sdma_engine_get_vl(struct sdma_engine *sde);
void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *);
void sdma_seqfile_dump_cpu_list(struct seq_file *s, struct hfi1_devdata *dd,
unsigned long cpuid);
#ifdef CONFIG_SDMA_VERBOSITY
void sdma_dumpstate(struct sdma_engine *);
#endif
static inline char *slashstrip(char *s)
{
char *r = s;
while (*s)
if (*s++ == '/')
r = s;
return r;
}
u16 sdma_get_descq_cnt(void);
extern uint mod_num_sdma;
void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid);
#endif /* __HFI1_ORIG__ */
#endif


@ -0,0 +1,137 @@
/*
* Copyright(c) 2016 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* BSD LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* - Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifndef HFI1_SDMA_TXREQ_H
#define HFI1_SDMA_TXREQ_H
#include <hfi1/iowait.h>
/* increased for AHG */
#define NUM_DESC 6
/*
* struct sdma_desc - canonical fragment descriptor
*
* This is the descriptor carried in the tx request
* corresponding to each fragment.
*
*/
struct sdma_desc {
/* private: don't use directly */
u64 qw[2];
};
/**
* struct sdma_txreq - the sdma_txreq structure (one per packet)
* @list: for use by user and by queuing for wait
*
* This is the representation of a packet which consists of some
* number of fragments. Storage is provided within the structure
* for all fragments.
*
* The storage for the descriptors is automatically extended as needed
* when the current allocation is exceeded.
*
* The user (Verbs or PSM) may overload this structure with fields
* specific to their use by putting this struct first in their struct.
* The method of allocation of the overloaded structure is user dependent
*
* The list is the only public field in the structure.
*
*/
#define SDMA_TXREQ_S_OK 0
#define SDMA_TXREQ_S_SENDERROR 1
#define SDMA_TXREQ_S_ABORTED 2
#define SDMA_TXREQ_S_SHUTDOWN 3
/* flags bits */
#define SDMA_TXREQ_F_URGENT 0x0001
#define SDMA_TXREQ_F_AHG_COPY 0x0002
#define SDMA_TXREQ_F_USE_AHG 0x0004
struct sdma_txreq;
typedef void (*callback_t)(struct sdma_txreq *, int);
struct iowait_wait;
struct sdma_txreq {
struct list_head list;
/* private: */
struct sdma_desc *descp;
/* private: */
void *coalesce_buf;
/* private: */
struct iowait *wait;
/* private: */
callback_t complete;
#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
u64 sn;
#endif
/* private: - used in coalesce/pad processing */
u16 packet_len;
/* private: - down-counted to trigger last */
u16 tlen;
/* private: */
u16 num_desc;
/* private: */
u16 desc_limit;
/* private: */
u16 next_descq_idx;
/* private: */
u16 coalesce_idx;
/* private: flags */
u16 flags;
/* private: */
struct sdma_desc descs[NUM_DESC];
};
static inline int sdma_txreq_built(struct sdma_txreq *tx)
{
return tx->num_desc;
}
#endif /* HFI1_SDMA_TXREQ_H */


@ -0,0 +1,175 @@
#ifndef _HFI1_USER_EXP_RCV_H
#define _HFI1_USER_EXP_RCV_H
/*
* Copyright(c) 2015, 2016 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* BSD LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* - Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#include "hfi1/hfi.h"
#define EXP_TID_TIDLEN_MASK 0x7FFULL
#define EXP_TID_TIDLEN_SHIFT 0
#define EXP_TID_TIDCTRL_MASK 0x3ULL
#define EXP_TID_TIDCTRL_SHIFT 20
#define EXP_TID_TIDIDX_MASK 0x3FFULL
#define EXP_TID_TIDIDX_SHIFT 22
#define EXP_TID_GET(tid, field) \
(((tid) >> EXP_TID_TID##field##_SHIFT) & EXP_TID_TID##field##_MASK)
#define EXP_TID_SET(field, value) \
(((value) & EXP_TID_TID##field##_MASK) << \
EXP_TID_TID##field##_SHIFT)
#define EXP_TID_CLEAR(tid, field) ({ \
(tid) &= ~(EXP_TID_TID##field##_MASK << \
EXP_TID_TID##field##_SHIFT); \
})
#define EXP_TID_RESET(tid, field, value) do { \
EXP_TID_CLEAR(tid, field); \
(tid) |= EXP_TID_SET(field, (value)); \
} while (0)
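/*
 * Illustrative sketch (not part of the original header): compose a TID
 * "info" word from its fields and read one back with the accessors above.
 * The function name and parameters are hypothetical.
 */
static inline u32 exp_tid_example_pack(u32 idx, u32 ctrl, u32 npages)
{
	u32 tidinfo = 0;

	tidinfo |= EXP_TID_SET(IDX, idx);	/* RcvArray group pair index */
	tidinfo |= EXP_TID_SET(CTRL, ctrl);	/* entry within the pair */
	tidinfo |= EXP_TID_SET(LEN, npages);	/* mapping length in pages */

	/* EXP_TID_GET() extracts a single field again */
	return EXP_TID_GET(tidinfo, LEN) == npages ? tidinfo : 0;
}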
struct tid_group {
struct list_head list;
unsigned base;
u8 size;
u8 used;
u8 map;
};
struct tid_rb_node {
uintptr_t phys;
u32 len;
u32 rcventry;
struct tid_group *grp;
bool freed;
struct rb_root *rb_root;
struct hfi1_filedata *fd;
unsigned long start;
unsigned long end;
struct rb_node rb_node;
struct deferred_unmap_range *range;
};
struct tid_pageset {
u16 idx;
u16 count;
};
/*
* Write an "empty" RcvArray entry.
* This function exists so the TID registration code can use it
* to write to unused/unneeded entries and still take advantage
* of the WC performance improvements. The HFI will ignore this
* write to the RcvArray entry.
*/
static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index)
{
/*
* Doing the WC fill writes only makes sense if the device is
* present and the RcvArray has been mapped as WC memory.
*/
if ((dd->flags & HFI1_PRESENT) && dd->rcvarray_wc)
writeq(0, dd->rcvarray_wc + (index * 8));
}
static inline u32 rcventry2tidinfo(u32 rcventry)
{
u32 pair = rcventry & ~0x1;
return EXP_TID_SET(IDX, pair >> 1) |
EXP_TID_SET(CTRL, 1 << (rcventry - pair));
}
static inline void exp_tid_group_init(struct exp_tid_set *set)
{
INIT_LIST_HEAD(&set->list);
set->count = 0;
}
static inline void tid_group_remove(struct tid_group *grp,
struct exp_tid_set *set)
{
list_del_init(&grp->list);
set->count--;
}
static inline void tid_group_add_tail(struct tid_group *grp,
struct exp_tid_set *set)
{
list_add_tail(&grp->list, &set->list);
set->count++;
}
static inline struct tid_group *tid_group_pop(struct exp_tid_set *set)
{
struct tid_group *grp =
list_first_entry(&set->list, struct tid_group, list);
list_del_init(&grp->list);
set->count--;
return grp;
}
static inline void tid_group_move(struct tid_group *group,
struct exp_tid_set *s1,
struct exp_tid_set *s2)
{
tid_group_remove(group, s1);
tid_group_add_tail(group, s2);
}
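/*
 * Illustrative sketch (not part of the original header): account for one
 * more entry handed out from a group and move the group to the "used" set
 * once it is exhausted.  The function name and set arguments are
 * hypothetical; the caller is assumed to hold the appropriate lock.
 */
static inline void example_tid_group_consume(struct tid_group *grp,
					     struct exp_tid_set *free_set,
					     struct exp_tid_set *used_set)
{
	grp->used++;
	if (grp->used == grp->size)
		tid_group_move(grp, free_set, used_set);
}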
#ifdef __HFI1_ORIG__
u32 find_phys_blocks(struct page **, unsigned, struct tid_pageset *);
int alloc_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd);
void free_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd);
int hfi1_user_exp_rcv_init(struct file *);
int hfi1_user_exp_rcv_free(struct hfi1_filedata *);
#endif /* __HFI1_ORIG__ */
int hfi1_user_exp_rcv_setup(struct hfi1_filedata *, struct hfi1_tid_info *);
int hfi1_user_exp_rcv_clear(struct hfi1_filedata *, struct hfi1_tid_info *);
int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *, struct hfi1_tid_info *);
#endif /* _HFI1_USER_EXP_RCV_H */


@ -0,0 +1,139 @@
#ifndef _HFI1_USER_SDMA_H
#define _HFI1_USER_SDMA_H
/*
* Copyright(c) 2015, 2016 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* BSD LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* - Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <hfi1/ihk_hfi1_common.h>
#include <hfi1/iowait.h>
#include <string.h>
#include <hfi1/hfi1_user.h>
#include <uio.h>
#ifdef __HFI1_ORIG__
#include <linux/device.h>
#include <linux/wait.h>
#include "common.h"
#include "iowait.h"
#include "user_exp_rcv.h"
extern uint extended_psn;
#endif /* __HFI1_ORIG__ */
/*
* Define fields in the KDETH header so we can update the header
* template.
*/
#define KDETH_OFFSET_SHIFT 0
#define KDETH_OFFSET_MASK 0x7fff
#define KDETH_OM_SHIFT 15
#define KDETH_OM_MASK 0x1
#define KDETH_TID_SHIFT 16
#define KDETH_TID_MASK 0x3ff
#define KDETH_TIDCTRL_SHIFT 26
#define KDETH_TIDCTRL_MASK 0x3
#define KDETH_INTR_SHIFT 28
#define KDETH_INTR_MASK 0x1
#define KDETH_SH_SHIFT 29
#define KDETH_SH_MASK 0x1
#define KDETH_KVER_SHIFT 30
#define KDETH_KVER_MASK 0x3
#define KDETH_JKEY_SHIFT 0x0
#define KDETH_JKEY_MASK 0xff
#define KDETH_HCRC_UPPER_SHIFT 16
#define KDETH_HCRC_UPPER_MASK 0xff
#define KDETH_HCRC_LOWER_SHIFT 24
#define KDETH_HCRC_LOWER_MASK 0xff
#define AHG_KDETH_INTR_SHIFT 12
#define AHG_KDETH_SH_SHIFT 13
#define AHG_KDETH_ARRAY_SIZE 9
#define KDETH_GET(val, field) \
(((le32_to_cpu((val))) >> KDETH_##field##_SHIFT) & KDETH_##field##_MASK)
#define KDETH_SET(dw, field, val) do { \
u32 dwval = le32_to_cpu(dw); \
dwval &= ~(KDETH_##field##_MASK << KDETH_##field##_SHIFT); \
dwval |= (((val) & KDETH_##field##_MASK) << \
KDETH_##field##_SHIFT); \
dw = cpu_to_le32(dwval); \
} while (0)
#define KDETH_RESET(dw, field, val) ({ dw = 0; KDETH_SET(dw, field, val); })
/* KDETH OM multipliers and switch over point */
#define KDETH_OM_SMALL 4
#define KDETH_OM_SMALL_SHIFT 2
#define KDETH_OM_LARGE 64
#define KDETH_OM_LARGE_SHIFT 6
#define KDETH_OM_MAX_SIZE (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1))
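/*
 * Illustrative sketch (not part of the original header): patch the TID and
 * OFFSET fields of one little-endian KDETH dword of the header template.
 * The function name and arguments are hypothetical.
 */
static inline u32 kdeth_example_update(u32 hdr_dword, u32 tid, u32 offset)
{
	KDETH_SET(hdr_dword, TID, tid);		/* 10-bit TID index */
	KDETH_SET(hdr_dword, OFFSET, offset);	/* 15-bit offset in OM units */
	return hdr_dword;
}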
enum pkt_q_sdma_state {
SDMA_PKT_Q_ACTIVE,
SDMA_PKT_Q_DEFERRED,
};
#include <hfi1/hfi1_generated_hfi1_user_sdma_pkt_q.h>
struct hfi1_user_sdma_comp_q {
u16 nentries;
struct hfi1_sdma_comp_entry *comps;
};
int hfi1_user_sdma_process_request(void *private_data, struct iovec *iovec,
unsigned long dim, unsigned long *count);
#ifdef __HFI1_ORIG__
int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *, struct file *);
int hfi1_user_sdma_free_queues(struct hfi1_filedata *);
int hfi1_user_sdma_process_request(struct file *, struct iovec *, unsigned long,
unsigned long *);
#endif /* __HFI1_ORIG__ */
#endif /* _HFI1_USER_SDMA_H */


@ -38,4 +38,104 @@ int memcheckall();
int freecheck(int runcount);
void kmalloc_consolidate_free_list(void);
#ifndef unlikely
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif
/*
* Generic lockless kmalloc cache.
*/
static inline void kmalloc_cache_free(void *elem)
{
struct kmalloc_cache_header *current = NULL;
struct kmalloc_cache_header *new =
(struct kmalloc_cache_header *)elem;
struct kmalloc_header *header;
register struct kmalloc_cache_header *cache;
if (unlikely(!elem))
return;
/* Get cache pointer from kmalloc header */
header = (struct kmalloc_header *)((void *)elem -
sizeof(struct kmalloc_header));
if (unlikely(!header->cache)) {
kprintf("%s: WARNING: no cache for 0x%lx\n",
__FUNCTION__, elem);
return;
}
cache = header->cache;
retry:
current = cache->next;
new->next = current;
if (!__sync_bool_compare_and_swap(&cache->next, current, new)) {
goto retry;
}
}
static inline void kmalloc_cache_prealloc(struct kmalloc_cache_header *cache,
size_t size, int nr_elem)
{
struct kmalloc_cache_header *elem;
int i;
if (unlikely(cache->next))
return;
for (i = 0; i < nr_elem; ++i) {
struct kmalloc_header *header;
elem = (struct kmalloc_cache_header *)
kmalloc(size, IHK_MC_AP_NOWAIT);
if (!elem) {
kprintf("%s: ERROR: allocating cache element\n", __FUNCTION__);
continue;
}
/* Store cache pointer in kmalloc_header */
header = (struct kmalloc_header *)((void *)elem -
sizeof(struct kmalloc_header));
header->cache = cache;
kmalloc_cache_free(elem);
}
}
static inline void *kmalloc_cache_alloc(struct kmalloc_cache_header *cache,
size_t size)
{
register struct kmalloc_cache_header *first, *next;
retry:
next = NULL;
first = cache->next;
if (first) {
next = first->next;
if (!__sync_bool_compare_and_swap(&cache->next,
first, next)) {
goto retry;
}
}
else {
//kprintf("%s: calling pre-alloc for 0x%lx...\n",
// __FUNCTION__, cache);
kprintf("%s: calling pre-alloc for 0x%lx (offs: %lu)...\n",
__FUNCTION__, cache,
((unsigned long)cache -
(unsigned long)&cpu_local_var(txreq_cache)) /
sizeof(struct kmalloc_cache_header));
kmalloc_cache_prealloc(cache, size, 512);
goto retry;
}
return (void *)first;
}
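/*
 * Illustrative usage sketch (not part of the original header): one lockless
 * cache of fixed-size elements.  The names example_cache/example_alloc/
 * example_free and the element size are hypothetical.
 */
static struct kmalloc_cache_header example_cache = { .next = NULL };

static inline void *example_alloc(void)
{
	/* First call pre-allocates elements, later calls pop the list */
	return kmalloc_cache_alloc(&example_cache, 256);
}

static inline void example_free(void *elem)
{
	/* Returns the element to its cache rather than to kmalloc() */
	kmalloc_cache_free(elem);
}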
#endif


@ -1,6 +1,8 @@
#ifndef __LWK_COMPILER_H
#define __LWK_COMPILER_H
#include <ihk/cpu.h>
#ifndef __ASSEMBLY__
#ifdef __CHECKER__
@ -175,11 +177,6 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
# define unlikely(x) __builtin_expect(!!(x), 0)
#endif
/* Optimization barrier */
#ifndef barrier
# define barrier() __memory_barrier()
#endif
#ifndef barrier_data
# define barrier_data(ptr) barrier()
#endif
@ -490,4 +487,66 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
(_________p1); \
})
extern void *memcpy(void *dest, const void *src, size_t n);
static __always_inline void __read_once_size(const volatile void *p, void *res, int size)
{
switch (size) {
case 1: *(unsigned char *)res = *(volatile unsigned char *)p; break;
case 2: *(unsigned short *)res = *(volatile unsigned short *)p; break;
case 4: *(unsigned int *)res = *(volatile unsigned int *)p; break;
case 8: *(unsigned long long *)res = *(volatile unsigned long long *)p; break;
default:
barrier();
memcpy((void *)res, (const void *)p, size);
barrier();
}
}
static __always_inline void __write_once_size(volatile void *p, void *res, int size)
{
switch (size) {
case 1: *(volatile unsigned char *)p = *(unsigned char *)res; break;
case 2: *(volatile unsigned short *)p = *(unsigned short *)res; break;
case 4: *(volatile unsigned int *)p = *(unsigned int *)res; break;
case 8: *(volatile unsigned long long *)p = *(unsigned long long *)res; break;
default:
barrier();
memcpy((void *)p, (const void *)res, size);
barrier();
}
}
/*
* Prevent the compiler from merging or refetching reads or writes. The
* compiler is also forbidden from reordering successive instances of
* READ_ONCE, WRITE_ONCE and ACCESS_ONCE (see below), but only when the
* compiler is aware of some particular ordering. One way to make the
* compiler aware of ordering is to put the two invocations of READ_ONCE,
* WRITE_ONCE or ACCESS_ONCE() in different C statements.
*
* In contrast to ACCESS_ONCE these two macros will also work on aggregate
* data types like structs or unions. If the size of the accessed data
* type exceeds the word size of the machine (e.g., 32 bits or 64 bits)
* READ_ONCE() and WRITE_ONCE() will fall back to memcpy and print a
* compile-time warning.
*
* Their two major use cases are: (1) Mediating communication between
* process-level code and irq/NMI handlers, all running on the same CPU,
* and (2) Ensuring that the compiler does not fold, spindle, or otherwise
* mutilate accesses that either do not require ordering or that interact
* with an explicit memory barrier or atomic instruction that provides the
* required ordering.
*/
#define READ_ONCE(x) \
({ union { typeof(x) __val; char __c[1]; } __u; __read_once_size(&(x), __u.__c, sizeof(x)); __u.__val; })
#define WRITE_ONCE(x, val) \
({ typeof(x) __val = (val); __write_once_size(&(x), &__val, sizeof(__val)); __val; })
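/*
 * Illustrative sketch (not part of the original header): a flag shared
 * between normal code and an interrupt handler, accessed without locks.
 * The function names are hypothetical.
 */
static inline void example_set_flag(int *flag)
{
	WRITE_ONCE(*flag, 1);		/* single store, never torn or elided */
}

static inline int example_test_flag(int *flag)
{
	return READ_ONCE(*flag);	/* reloaded on every call, never cached */
}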
#endif /* __LWK_COMPILER_H */


@ -32,6 +32,7 @@
#define VR_STACK 0x1
#define VR_RESERVED 0x2
#define VR_AP_USER 0x4
#define VR_PREALLOC 0x8
#define VR_IO_NOCACHE 0x100
#define VR_REMOTE 0x200
#define VR_WRITE_COMBINED 0x400
@ -387,6 +388,8 @@ struct vm_range {
int pgshift; /* page size. 0 means THP */
int padding;
void *private_data;
unsigned long lowest_accesed;
unsigned long faulted_size;
};
struct vm_range_numa_policy {
@ -552,6 +555,7 @@ struct process {
long maxrss;
long maxrss_children;
unsigned long mcexec_flags;
/* Memory policy flags and memory specific options */
unsigned long mpol_flags;
size_t mpol_threshold;
@ -572,6 +576,21 @@ struct process {
unsigned long profile_elapsed_ts;
#endif // PROFILE_ENABLE
int nr_processes; /* For partitioned execution */
int process_rank; /* Rank in partition */
#define MAX_FD_PRIV 256
void *fd_priv_table[MAX_FD_PRIV];
/* HFI1 specific */
void *hfi1_kregbase;
void *hfi1_piobase;
void *hfi1_rcvarray_wc;
size_t hfi1_rcvarray_wc_len;
void *hfi1_cq_comps;
void *hfi1_events;
size_t hfi1_cq_comps_len;
ihk_spinlock_t hfi1_lock;
struct rb_root hfi1_reg_tree;
struct rb_root hfi1_inv_tree;
};
/*
@ -699,9 +718,35 @@ struct thread {
#define VM_RANGE_CACHE_SIZE 4
struct deferred_unmap_range {
struct process_vm *vm;
void *addr;
size_t len;
struct list_head list;
/*
* List operations as well as the refcnt are protected
* by vm->vm_deferred_unmap_lock.
*/
int refcnt;
};
static void init_deferred_unmap_range(
struct deferred_unmap_range *range,
struct process_vm *vm,
void *addr, size_t len)
{
range->vm = vm;
range->addr = addr;
range->len = len;
INIT_LIST_HEAD(&range->list);
range->refcnt = 0;
}
struct process_vm {
struct address_space *address_space;
struct rb_root vm_range_tree;
struct list_head vm_deferred_unmap_range_list;
struct vm_regions region;
struct process *proc; /* process that reside on the same page */
void *opt;
@ -711,6 +756,7 @@ struct process_vm {
ihk_spinlock_t page_table_lock;
ihk_spinlock_t memory_range_lock;
ihk_spinlock_t vm_deferred_unmap_lock;
// to protect the followings:
// 1. addition of process "memory range" (extend_process_region, add_process_memory_range)
// 2. addition of process page table (allocate_pages, update_process_page_table)
@ -829,4 +875,8 @@ void proc_init();
void set_timer();
struct sig_pending *hassigpending(struct thread *thread);
#define VERIFY_READ 0
#define VERIFY_WRITE 1
int access_ok(struct process_vm *vm, int type, uintptr_t addr, size_t len);
#endif


@ -28,7 +28,7 @@ struct profile_event {
* [PROFILE_SYSCALL_MAX,PROFILE_OFFLOAD_MAX) - syscall offloads
* [PROFILE_OFFLOAD_MAX,PROFILE_EVENT_MAX) - general events
*
* XXX: Make sure to fill in prof_event_names in profile.c
* XXX: Make sure to fill in profile_event_names in profile.c
* for each added profiled event.
*/
enum profile_event_type {
@ -44,6 +44,12 @@ enum profile_event_type {
PROFILE_mmap_anon_no_contig_phys,
PROFILE_mmap_regular_file,
PROFILE_mmap_device_file,
PROFILE_sdma_0,
PROFILE_sdma_1,
PROFILE_sdma_2,
PROFILE_sdma_3,
PROFILE_sdma_4,
PROFILE_sdma_5,
PROFILE_EVENT_MAX /* Should be the last event type */
};


@ -166,6 +166,8 @@ typedef unsigned long __cpu_set_unit;
#define MPOL_NO_BSS 0x04
#define MPOL_SHM_PREMAP 0x08
#define MCEXEC_HFI1 0x01
struct program_load_desc {
int num_sections;
int status;
@ -194,12 +196,14 @@ struct program_load_desc {
unsigned long envs_len;
struct rlimit rlimit[MCK_RLIM_MAX];
unsigned long interp_align;
unsigned long mcexec_flags;
unsigned long mpol_flags;
unsigned long mpol_threshold;
unsigned long heap_extension;
long stack_premap;
unsigned long mpol_bind_mask;
int nr_processes;
int process_rank;
char shell_path[SHELL_PATH_MAX_LEN];
__cpu_set_unit cpu_set[PLD_CPU_SET_SIZE];
int profile;
@ -293,6 +297,7 @@ struct syscall_response {
long ret;
unsigned long fault_address;
unsigned long fault_reason;
void *private_data;
};
struct syscall_post {


@ -125,6 +125,8 @@ char *find_command_line(char *name)
return strstr(cmdline, name);
}
extern int safe_kernel_map;
static void parse_kargs(void)
{
char *ptr;
@ -145,6 +147,11 @@ static void parse_kargs(void)
}
ihk_mc_set_dump_level(dump_level);
ptr = find_command_line("safe_kernel_map");
if (ptr) {
safe_kernel_map = 1;
}
/* idle_halt option */
ptr = find_command_line("idle_halt");
if (ptr) {
@ -353,6 +360,11 @@ static void post_init(void)
}
init_host_ikc2mckernel();
init_host_ikc2linux(ikc_cpu);
{
extern void hfi1_kmalloc_cache_prealloc(void);
hfi1_kmalloc_cache_prealloc();
}
}
arch_setup_vdso();


@ -698,6 +698,22 @@ static void *mckernel_allocate_aligned_pages_node(int npages, int p2align,
break;
}
else {
dkprintf("%s: couldn't fulfill user policy for"
" %d contiguous pages from node %d "
#ifdef IHK_RBTREE_ALLOCATOR
"(free pages left: %d)"
#endif
"\n",
__FUNCTION__,
npages,
numa_id
#ifdef IHK_RBTREE_ALLOCATOR
, memory_nodes[numa_id].nr_free_pages
#endif
);
//return NULL;
}
}
if (pa) break;
@ -719,8 +735,8 @@ static void *mckernel_allocate_aligned_pages_node(int npages, int p2align,
#ifdef PROFILE_ENABLE
profile_event_add(PROFILE_mpol_alloc_missed, npages * 4096);
#endif
dkprintf("%s: couldn't fulfill user policy for %d pages\n",
__FUNCTION__, npages);
dkprintf("%s: couldn't fulfill user policy for %d pages from node %d\n",
__FUNCTION__, npages, i);
}
distance_based:
@ -926,6 +942,8 @@ static void query_free_mem_interrupt_handler(void *priv)
/* Iterate memory allocators */
for (i = 0; i < ihk_mc_get_nr_numa_nodes(); ++i) {
#ifdef IHK_RBTREE_ALLOCATOR
kprintf("McKernel free pages in NUMA node %d: %d\n",
i, memory_nodes[i].nr_free_pages);
pages += memory_nodes[i].nr_free_pages;
#else
struct ihk_page_allocator_desc *pa_allocator;
@ -981,6 +999,8 @@ void coredump(struct thread *thread, void *regs)
struct coretable *coretable;
int chunks;
return;
#ifdef POSTK_DEBUG_ARCH_DEP_67 /* use limit corefile size. (temporarily fix.) */
if (thread->proc->rlimit[MCK_RLIMIT_CORE].rlim_cur == 0) {
return;
@ -1168,6 +1188,59 @@ static void page_fault_handler(void *fault_addr, uint64_t reason, void *regs)
dkprintf("%s: addr: %p, reason: %lx, regs: %p\n",
__FUNCTION__, fault_addr, reason, regs);
/* Linux ioremap address? */
if ((unsigned long)fault_addr >= 0xFFFFC90000000000 &&
(unsigned long)fault_addr < 0xFFFFFFFF80000000) {
pte_t *lptep;
pte_t *ptep;
enum ihk_mc_pt_attribute attr =
PTATTR_UNCACHABLE | PTATTR_WRITABLE;
unsigned long phys;
void *virt = fault_addr;
struct process_vm *vm = cpu_local_var(current)->vm;
if (!vm) {
goto regular_handler;
}
/* Is this a valid address in Linux? */
lptep = ihk_mc_pt_lookup_pte(ihk_mc_get_linux_kernel_pgt(),
virt, 0, 0, 0, 0);
if (!lptep || !pte_is_present(lptep)) {
kprintf("%s: ERROR: no mapping in Linux for: 0x%lx?\n",
__FUNCTION__, virt);
terminate(0, SIGKILL);
goto regular_handler;
}
phys = pte_get_phys(lptep);
if (ihk_mc_pt_set_page(vm->address_space->page_table,
virt, phys, attr) < 0) {
/* Not necessarily an error.. */
kprintf("%s: WARNING: mapping: 0x%lx -> 0x%lx\n",
__FUNCTION__, virt, phys);
}
ptep = ihk_mc_pt_lookup_pte(vm->address_space->page_table,
virt, 0, 0, 0, 0);
if (!ptep) {
kprintf("%s: ERROR: no PTE in McKernel for: 0x%lx?\n",
__FUNCTION__, virt);
goto regular_handler;
}
*ptep = *lptep;
dkprintf("%s: Linux ioremap address 0x%lx -> 0x%lx "
"mapped on demand\n",
__FUNCTION__, virt, phys);
flush_tlb_single((unsigned long)virt);
error = 0;
goto out;
}
regular_handler:
preempt_disable();
cpu_enable_interrupt();
@ -2192,6 +2265,7 @@ static void *___kmalloc(int size, ihk_mc_ap_flag flag)
int npages;
unsigned long kmalloc_irq_flags = cpu_disable_interrupt_save();
retry_malloc:
/* KMALLOC_MIN_SIZE bytes aligned size. */
if (size & KMALLOC_MIN_MASK) {
size = ((size + KMALLOC_MIN_SIZE - 1) & ~(KMALLOC_MIN_MASK));
@ -2223,10 +2297,36 @@ split_and_return:
}
list_del(&chunk->list);
ZERO_LIST_HEAD(&chunk->list);
cpu_restore_interrupt(kmalloc_irq_flags);
return ((void *)chunk + sizeof(struct kmalloc_header));
}
/* See remote list before falling back to page_alloc */
else {
int retry = 0;
struct kmalloc_header *chunk, *tmp;
unsigned long irqflags =
ihk_mc_spinlock_lock(
&cpu_local_var(remote_free_list_lock));
/* Clean up remotely deallocated chunks */
list_for_each_entry_safe(chunk, tmp,
&cpu_local_var(remote_free_list), list) {
list_del(&chunk->list);
___kmalloc_insert_chunk(&cpu_local_var(free_list), chunk);
if (chunk->size >= size) {
retry = 1;
}
}
ihk_mc_spinlock_unlock(&cpu_local_var(remote_free_list_lock),
irqflags);
/* Found anything? */
if (retry) {
goto retry_malloc;
}
}
/* Allocate new memory and add it to free list */
npages = (size + sizeof(struct kmalloc_header) + (PAGE_SIZE - 1))
@ -2585,3 +2685,4 @@ int ihk_mc_get_mem_user_page(void *arg0, page_table_t pt, pte_t *ptep, void *pga
return 0;
}


@ -138,12 +138,17 @@ init_process(struct process *proc, struct process *parent)
INIT_LIST_HEAD(&proc->ptraced_siblings_list);
mcs_rwlock_init(&proc->update_lock);
#endif /* POSTK_DEBUG_ARCH_DEP_63 */
// Double check the inheritance from parent
memset(proc->fd_priv_table, 0, MAX_FD_PRIV * sizeof(void *));
INIT_LIST_HEAD(&proc->threads_list);
INIT_LIST_HEAD(&proc->children_list);
INIT_LIST_HEAD(&proc->ptraced_children_list);
mcs_rwlock_init(&proc->threads_lock);
mcs_rwlock_init(&proc->children_lock);
ihk_mc_spinlock_init(&proc->mckfd_lock);
ihk_mc_spinlock_init(&proc->hfi1_lock);
waitq_init(&proc->waitpid_q);
ihk_atomic_set(&proc->refcount, 2);
proc->monitoring_event = NULL;
@ -256,10 +261,12 @@ init_process_vm(struct process *owner, struct address_space *asp, struct process
int i;
ihk_mc_spinlock_init(&vm->memory_range_lock);
ihk_mc_spinlock_init(&vm->page_table_lock);
ihk_mc_spinlock_init(&vm->vm_deferred_unmap_lock);
ihk_atomic_set(&vm->refcount, 1);
vm->vm_range_tree = RB_ROOT;
vm->vm_range_numa_policy_tree = RB_ROOT;
INIT_LIST_HEAD(&vm->vm_deferred_unmap_range_list);
vm->address_space = asp;
vm->proc = owner;
vm->exiting = 0;
@ -1008,6 +1015,10 @@ int free_process_memory_range(struct process_vm *vm, struct vm_range *range)
if (vm->range_cache[i] == range)
vm->range_cache[i] = NULL;
}
if (range->flag & VR_STACK) {
kprintf("%s: VR_STACK faulted_size: %lu\n", __FUNCTION__, range->faulted_size);
}
kfree(range);
dkprintf("free_process_memory_range(%p,%lx-%lx): 0\n",
@ -1223,6 +1234,9 @@ int add_process_memory_range(struct process_vm *vm,
range->pgshift = pgshift;
range->private_data = NULL;
range->lowest_accesed = end;
range->faulted_size = 0;
rc = 0;
if (phys == NOPHYS) {
/* Nothing to map */
@ -1259,6 +1273,138 @@ int add_process_memory_range(struct process_vm *vm,
return rc;
}
/*
* Allocate and map physical memory,
* interpret NUMA policy.
* TODO: move out to a function..
*/
if (flag & VR_PREALLOC && phys == NOPHYS) {
#if 0
unsigned long addr = start;
enum ihk_mc_pt_attribute ptattr;
ptattr = arch_vrflag_to_ptattr(range->flag, PF_POPULATE, NULL);
unsigned long irqflags;
unsigned long len = 0;
void *frame = NULL;
int npages;
int p2align;
len = end - addr;
/* Figure out size */
if (len >= LARGE_PAGE_SIZE) {
p2align = LARGE_PAGE_P2ALIGN;
}
else {
p2align = PAGE_P2ALIGN;
}
npages = len >> PAGE_SHIFT;
frame = ihk_mc_alloc_aligned_pages_user(npages,
p2align,
IHK_MC_AP_NOWAIT | (range->flag & VR_AP_USER ? IHK_MC_AP_USER : 0),
-1);
if (!frame) {
kprintf("%s: error: out of memory\n", __FUNCTION__);
panic("panic");
return -ENOMEM;
}
irqflags = ihk_mc_spinlock_lock(&vm->page_table_lock);
rc = ihk_mc_pt_set_range(vm->address_space->page_table,
vm,
(void *)addr,
(void *)addr + len,
virt_to_phys(frame),
ptattr,
PAGE_SHIFT + p2align,
range);
if (rc) {
kprintf("%s: ERROR: mapping\n", __FUNCTION__);
ihk_mc_spinlock_unlock(&vm->page_table_lock, irqflags);
return -ENOMEM;
}
ihk_mc_spinlock_unlock(&vm->page_table_lock, irqflags);
memset(frame, 0, len);
addr += len;
#else
unsigned long addr = start;
enum ihk_mc_pt_attribute ptattr;
ptattr = arch_vrflag_to_ptattr(range->flag, PF_POPULATE, NULL);
while (addr < end) {
unsigned long irqflags;
unsigned long len = 0;
void *frame = NULL;
int npages;
int p2align;
len = end - addr;
/* Figure out size */
if (len >= LARGE_PAGE_SIZE) {
len = LARGE_PAGE_SIZE;
p2align = LARGE_PAGE_P2ALIGN;
}
else {
len = PAGE_SIZE;
p2align = PAGE_P2ALIGN;
}
npages = len >> PAGE_SHIFT;
#if 0
frame = ihk_mc_alloc_aligned_pages_node_user(npages,
p2align,
IHK_MC_AP_NOWAIT | (range->flag & VR_AP_USER ? IHK_MC_AP_USER : 0),
node, -1);
node = 1 - node;
#else
frame = ihk_mc_alloc_aligned_pages_user(npages,
p2align,
IHK_MC_AP_NOWAIT | (range->flag & VR_AP_USER ? IHK_MC_AP_USER : 0),
-1);
#endif
if (!frame) {
kprintf("%s: error: out of memory\n", __FUNCTION__);
return -ENOMEM;
}
irqflags = ihk_mc_spinlock_lock(&vm->page_table_lock);
rc = ihk_mc_pt_set_range(vm->address_space->page_table,
vm,
(void *)addr,
(void *)addr + len,
virt_to_phys(frame),
ptattr,
PAGE_SHIFT + p2align,
range);
if (rc) {
kprintf("%s: ERROR: mapping\n", __FUNCTION__);
ihk_mc_spinlock_unlock(&vm->page_table_lock, irqflags);
return -ENOMEM;
}
ihk_mc_spinlock_unlock(&vm->page_table_lock, irqflags);
memset(frame, 0, len);
addr += len;
}
#endif
dkprintf("%s: 0x%lx:%lu mapped\n",
__FUNCTION__,
start,
end - start);
}
/* Clear content! */
if (phys != NOPHYS && !(flag & (VR_REMOTE | VR_DEMAND_PAGING))
&& ((flag & VR_PROT_MASK) != VR_PROT_NONE)) {
@ -1777,6 +1923,22 @@ static int page_fault_process_memory_range(struct process_vm *vm, struct vm_rang
}
pgaddr = (void *)(fault_addr & ~(pgsize - 1));
}
if (pgsize > LARGE_PAGE_SIZE) {
dkprintf("%s: 0x%lx, pgsize: %lu\n",
__FUNCTION__, pgaddr, pgsize);
}
if (range->flag & VR_STACK) {
range->faulted_size += pgsize;
if (range->lowest_accesed > (unsigned long)pgaddr) {
dkprintf("%s: VR_STACK @ 0x%lx, pgsize: %lu, distance: %lu\n",
__FUNCTION__, pgaddr, pgsize, range->end - (unsigned long)pgaddr);
range->lowest_accesed = (unsigned long)pgaddr;
}
}
/*****/
dkprintf("%s: ptep=%lx,pte_is_null=%d,pte_is_fileoff=%d\n", __FUNCTION__, ptep, ptep ? pte_is_null(ptep) : -1, ptep ? pte_is_fileoff(ptep, pgsize) : -1);
if (!ptep || pte_is_null(ptep) || pte_is_fileoff(ptep, pgsize)) {
@ -2148,6 +2310,8 @@ int init_process_stack(struct thread *thread, struct program_load_desc *pn,
struct vm_range *range;
int stack_populated_size = 0;
int stack_align_padding = 0;
int p2align = LARGE_PAGE_P2ALIGN;
int pgshift = LARGE_PAGE_SHIFT;
/* Create stack range */
end = STACK_TOP(&thread->vm->region) & LARGE_PAGE_MASK;
@ -2170,18 +2334,27 @@ int init_process_stack(struct thread *thread, struct program_load_desc *pn,
else if (size < minsz) {
size = minsz;
}
#if 0
if (minsz >= GB_PAGE_SIZE) {
end = end & GB_PAGE_MASK;
p2align = GB_PAGE_P2ALIGN;
pgshift = GB_PAGE_SHIFT;
}
#endif
start = (end - size) & LARGE_PAGE_MASK;
/* Apply user allocation policy to stacks */
/* TODO: make threshold kernel or mcexec argument */
ap_flag = (size >= proc->mpol_threshold &&
!(proc->mpol_flags & MPOL_NO_STACK)) ? IHK_MC_AP_USER : 0;
dkprintf("%s: max size: %lu, mapped size: %lu %s\n",
__FUNCTION__, size, minsz,
kprintf("%s: stack: 0x%lx-0x%lx:%lu, mapped: %lu %s\n",
__FUNCTION__, start, end, size, minsz,
ap_flag ? "(IHK_MC_AP_USER)" : "");
stack = ihk_mc_alloc_aligned_pages_user(minsz >> PAGE_SHIFT,
LARGE_PAGE_P2ALIGN, IHK_MC_AP_NOWAIT | ap_flag, start);
p2align, IHK_MC_AP_NOWAIT | ap_flag, start);
if (!stack) {
kprintf("%s: error: couldn't allocate initial stack\n",
@ -2208,8 +2381,7 @@ int init_process_stack(struct thread *thread, struct program_load_desc *pn,
thread->vm, (void *)(end - minsz),
(void *)end, virt_to_phys(stack),
arch_vrflag_to_ptattr(vrflag, PF_POPULATE, NULL),
LARGE_PAGE_SHIFT, range
);
pgshift, range);
if (error) {
kprintf("init_process_stack:"
@ -3677,3 +3849,47 @@ debug_log(unsigned long arg)
break;
}
}
int access_ok(struct process_vm *vm, int type, uintptr_t addr, size_t len) {
struct vm_range *range, *next;
int first = true;
range = lookup_process_memory_range(vm, addr, addr + len);
if (!range || range->start > addr) {
kprintf("%s: No VM range at 0x%llx, refusing access\n",
__FUNCTION__, addr);
return -EFAULT;
}
do {
if (first) {
first = false;
} else {
next = next_process_memory_range(vm, range);
if (!next) {
kprintf("%s: No VM range after 0x%llx, but checking until 0x%llx. Refusing access\n",
__FUNCTION__, range->end, addr + len);
return -EFAULT;
}
if (range->end != next->start) {
kprintf("%s: 0x%llx - 0x%llx and 0x%llx - 0x%llx are not adjacent (request was %0x%llx-0x%llx %zu)\n",
__FUNCTION__, range->start, range->end,
next->start, next->end,
addr, addr+len, len);
return -EFAULT;
}
range = next;
}
if ((type == VERIFY_WRITE && !(range->flag & VR_PROT_WRITE)) ||
(type == VERIFY_READ && !(range->flag & VR_PROT_READ))) {
kprintf("%s: 0x%llx - 0x%llx does not have prot %s (request was %0x%llx-0x%llx %zu)\n",
__FUNCTION__, range->start, range->end,
type == VERIFY_WRITE ? "write" : "ready",
addr, addr+len, len);
return -EACCES;
}
} while (addr + len > range->end);
return 0;
}
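/*
 * Illustrative caller sketch (not part of the original source): validate a
 * user-supplied source buffer before copying from it.  The function name
 * and arguments are hypothetical.
 */
static inline int example_copy_in_checked(struct process_vm *vm, void *dst,
					  uintptr_t usrc, size_t len)
{
	int error;

	/* Reject unmapped or non-readable source ranges up front */
	error = access_ok(vm, VERIFY_READ, usrc, len);
	if (error)
		return error;

	memcpy(dst, (void *)usrc, len);
	return 0;
}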


@ -69,6 +69,12 @@ char *profile_event_names[] =
"mmap_anon_no_contig_phys",
"mmap_regular_file",
"mmap_device_file",
"sdma_0",
"sdma_1",
"sdma_2",
"sdma_3",
"sdma_4",
"sdma_5",
""
};


@ -0,0 +1,714 @@
/*
* Trivial dwarf parser to extract part of a struct from debug infos
*
* Author: Dominique Martinet <dominique.martinet@cea.fr>
* License: WTFPLv2
*
* Canonical source: http://cgit.notk.org/asmadeus/dwarf-extract-struct.git
*/
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdio.h>
#include <strings.h>
#include <errno.h>
#include "libdwarf/dwarf.h"
#include "libdwarf/libdwarf.h"
static void parse_dwarf(Dwarf_Debug dbg, const char *struct_name,
const char *field_names[], int field_count);
static void find_struct(Dwarf_Debug dbg, Dwarf_Die die, const char *struct_name,
const char *field_names[], int field_count, int level);
static void find_fields(Dwarf_Debug dbg, Dwarf_Die struct_die, Dwarf_Die die,
const char *struct_name, const char *field_names[],
int field_count, int level);
static void print_field(Dwarf_Debug dbg, Dwarf_Die die, const char *field_name,
int pad_num);
int debug = 0;
void usage(const char *argv[]) {
fprintf(stderr, "%s debug_file struct_name [field [field...]]\n",
argv[0]);
}
int main(int argc, const char *argv[]) {
Dwarf_Debug dbg = 0;
int fd = -1;
const char *filepath;
const char *struct_name;
int res = DW_DLV_ERROR;
Dwarf_Error error;
Dwarf_Handler errhand = 0;
Dwarf_Ptr errarg = 0;
if(argc < 3) {
usage(argv);
exit(1);
}
filepath = argv[1];
struct_name = argv[2];
fd = open(filepath,O_RDONLY);
if(fd < 0) {
fprintf(stderr, "Failure attempting to open %s\n",filepath);
}
res = dwarf_init(fd, DW_DLC_READ, errhand, errarg, &dbg, &error);
if(res != DW_DLV_OK) {
fprintf(stderr, "Giving up, cannot do DWARF processing\n");
exit(1);
}
parse_dwarf(dbg, struct_name, argv + 3, argc - 3);
res = dwarf_finish(dbg,&error);
if(res != DW_DLV_OK) {
fprintf(stderr, "dwarf_finish failed!\n");
}
close(fd);
return 0;
}
static void parse_dwarf(Dwarf_Debug dbg, const char *struct_name,
const char *field_names[], int field_count) {
Dwarf_Bool is_info = 1;
Dwarf_Unsigned cu_length;
Dwarf_Half cu_version;
Dwarf_Off cu_abbrev_offset;
Dwarf_Half cu_pointer_size;
Dwarf_Half cu_offset_size;
Dwarf_Half cu_extension_size;
Dwarf_Sig8 type_signature;
Dwarf_Unsigned type_offset;
Dwarf_Unsigned cu_next_offset;
Dwarf_Error err;
int rc;
while (1) {
Dwarf_Die die;
rc = dwarf_next_cu_header_c(dbg, is_info, &cu_length,
&cu_version, &cu_abbrev_offset, &cu_pointer_size,
&cu_offset_size, &cu_extension_size, &type_signature,
&type_offset, &cu_next_offset, &err);
if (rc == DW_DLV_NO_ENTRY)
break;
if (rc != DW_DLV_OK) {
fprintf(stderr, "error dwarf_next_cu_header_c: %d %s\n",
rc, dwarf_errmsg(err));
exit(1);
}
rc = dwarf_siblingof(dbg, NULL, &die, &err);
if (rc != DW_DLV_OK) {
fprintf(stderr, "first dwarf_siblingof failed: %d %s\n",
rc, dwarf_errmsg(err));
exit(1);
}
find_struct(dbg, die, struct_name, field_names, field_count, 0);
}
fprintf(stderr, "struct %s not found\n", struct_name);
exit(2);
}
static void find_struct(Dwarf_Debug dbg, Dwarf_Die die, const char *struct_name,
const char *field_names[], int field_count, int level) {
Dwarf_Die next;
Dwarf_Error err;
int rc;
if (level > 1)
return;
do {
char *name;
const char *tag_name;
Dwarf_Half tag;
rc = dwarf_diename(die, &name, &err);
if (rc == DW_DLV_NO_ENTRY) {
name = NULL;
} else if (rc != DW_DLV_OK) {
fprintf(stderr, "dwarf_diename error: %d %s\n",
rc, dwarf_errmsg(err));
exit(1);
}
rc = dwarf_tag(die, &tag, &err);
if (rc != DW_DLV_OK) {
fprintf(stderr, "dwarf_tag error: %d %s\n",
rc, dwarf_errmsg(err));
exit(1);
}
if (debug) {
rc = dwarf_get_TAG_name(tag, &tag_name);
if (rc != DW_DLV_OK) {
fprintf(stderr,
"dwarf_get_TAG_name error: %d\n", rc);
exit(1);
}
printf("<%d> %p <%d> %s: %s\n", level, die, tag,
tag_name, name ? name : "<no name>");
}
rc = dwarf_child(die, &next, &err);
if (rc == DW_DLV_ERROR) {
fprintf(stderr, "dwarf_child error: %d %s\n",
rc, dwarf_errmsg(err));
exit(1);
}
if (rc == DW_DLV_OK) {
if (tag == DW_TAG_structure_type
&& name && strcasecmp(name, struct_name) == 0) {
find_fields(dbg, die, next, struct_name,
field_names, field_count,
level + 1);
fprintf(stderr,
"Found struct %s but it did not have all members given!\nMissing:\n",
struct_name);
for (rc = 0; rc < field_count; rc++) {
if (field_names[rc])
fprintf(stderr, "%s\n",
field_names[rc]);
}
exit(3);
}
find_struct(dbg, next, struct_name, field_names,
field_count, level + 1);
dwarf_dealloc(dbg, next, DW_DLA_DIE);
}
rc = dwarf_siblingof(dbg, die, &next, &err);
dwarf_dealloc(dbg, die, DW_DLA_DIE);
if (name)
dwarf_dealloc(dbg, name, DW_DLA_STRING);
if (rc != DW_DLV_OK)
break;
die = next;
} while (die);
}
static int dwarf_get_offset(Dwarf_Debug dbg, Dwarf_Die die,
int *poffset, Dwarf_Error *perr) {
Dwarf_Attribute attr;
Dwarf_Unsigned offset;
int rc;
rc = dwarf_attr(die, DW_AT_data_member_location, &attr, perr);
if (rc != DW_DLV_OK) {
return rc;
}
Dwarf_Half form;
rc = dwarf_whatform(attr, &form, perr);
if (rc != DW_DLV_OK) {
fprintf(stderr, "Error getting whatform: %s\n",
dwarf_errmsg(*perr));
exit(5);
}
if (form == DW_FORM_data1 || form == DW_FORM_data2
|| form == DW_FORM_data2 || form == DW_FORM_data4
|| form == DW_FORM_data8 || form == DW_FORM_udata) {
dwarf_formudata(attr, &offset, 0);
} else if (form == DW_FORM_sdata) {
Dwarf_Signed soffset;
dwarf_formsdata(attr, &soffset, 0);
if (soffset < 0) {
fprintf(stderr,
"unsupported negative offset\n");
exit(5);
}
offset = (Dwarf_Unsigned) soffset;
} else {
Dwarf_Locdesc **locdescs;
Dwarf_Signed len;
if (dwarf_loclist_n(attr, &locdescs, &len, perr)
== DW_DLV_ERROR) {
fprintf(stderr, "unsupported member offset\n");
exit(5);
}
if (len != 1
|| locdescs[0]->ld_cents != 1
|| (locdescs[0]->ld_s[0]).lr_atom
!= DW_OP_plus_uconst) {
fprintf(stderr,
"unsupported location expression\n");
exit(5);
}
offset = (locdescs[0]->ld_s[0]).lr_number;
}
dwarf_dealloc(dbg, attr, DW_DLA_ATTR);
*poffset = (int) offset;
return DW_DLV_OK;
}
static int dwarf_get_size(Dwarf_Debug dbg, Dwarf_Die die,
int *psize, Dwarf_Error *perr) {
Dwarf_Attribute attr;
Dwarf_Unsigned size;
int rc;
rc = dwarf_attr(die, DW_AT_byte_size, &attr, perr);
if (rc != DW_DLV_OK) {
return rc;
}
Dwarf_Half form;
rc = dwarf_whatform(attr, &form, perr);
if (rc != DW_DLV_OK) {
fprintf(stderr, "Error getting whatform: %s\n",
dwarf_errmsg(*perr));
exit(5);
}
if (form == DW_FORM_data1 || form == DW_FORM_data2
|| form == DW_FORM_data2 || form == DW_FORM_data4
|| form == DW_FORM_data8 || form == DW_FORM_udata) {
dwarf_formudata(attr, &size, 0);
} else if (form == DW_FORM_sdata) {
Dwarf_Signed ssize;
dwarf_formsdata(attr, &ssize, 0);
if (ssize < 0) {
fprintf(stderr,
"unsupported negative size\n");
exit(5);
}
size = (Dwarf_Unsigned) ssize;
} else {
Dwarf_Locdesc **locdescs;
Dwarf_Signed len;
if (dwarf_loclist_n(attr, &locdescs, &len, perr)
== DW_DLV_ERROR) {
fprintf(stderr, "unsupported member size\n");
exit(5);
}
if (len != 1
|| locdescs[0]->ld_cents != 1
|| (locdescs[0]->ld_s[0]).lr_atom
!= DW_OP_plus_uconst) {
fprintf(stderr,
"unsupported location expression\n");
exit(5);
}
size = (locdescs[0]->ld_s[0]).lr_number;
}
dwarf_dealloc(dbg, attr, DW_DLA_ATTR);
*psize = (int) size;
return DW_DLV_OK;
}
static int dwarf_get_arraysize(Dwarf_Debug dbg, Dwarf_Die die,
int *psize, Dwarf_Error *perr) {
Dwarf_Attribute attr;
Dwarf_Unsigned lower_bound, upper_bound;
int rc;
Dwarf_Die child;
Dwarf_Half form;
rc = dwarf_child(die, &child, perr);
if (rc == DW_DLV_NO_ENTRY) {
fprintf(stderr,
"Could not deref child of array: no entry\n");
return rc;
}
if (rc != DW_DLV_OK) {
fprintf(stderr,
"Could not get child entry of array: %s\n",
dwarf_errmsg(*perr));
return rc;
}
rc = dwarf_attr(child, DW_AT_lower_bound, &attr, perr);
/* Not present? Assume zero */
if (rc != DW_DLV_OK) {
lower_bound = 0;
goto upper;
}
rc = dwarf_whatform(attr, &form, perr);
if (rc != DW_DLV_OK) {
fprintf(stderr, "Error getting whatform: %s\n",
dwarf_errmsg(*perr));
exit(5);
}
if (form == DW_FORM_data1 || form == DW_FORM_data2
|| form == DW_FORM_data2 || form == DW_FORM_data4
|| form == DW_FORM_data8 || form == DW_FORM_udata) {
dwarf_formudata(attr, &lower_bound, 0);
} else if (form == DW_FORM_sdata) {
Dwarf_Signed ssize;
dwarf_formsdata(attr, &ssize, 0);
if (ssize < 0) {
fprintf(stderr,
"unsupported negative size\n");
exit(5);
}
lower_bound = (Dwarf_Unsigned) ssize;
} else {
Dwarf_Locdesc **locdescs;
Dwarf_Signed len;
if (dwarf_loclist_n(attr, &locdescs, &len, perr)
== DW_DLV_ERROR) {
fprintf(stderr, "unsupported member size\n");
exit(5);
}
if (len != 1
|| locdescs[0]->ld_cents != 1
|| (locdescs[0]->ld_s[0]).lr_atom
!= DW_OP_plus_uconst) {
fprintf(stderr,
"unsupported location expression\n");
exit(5);
}
lower_bound = (locdescs[0]->ld_s[0]).lr_number;
}
dwarf_dealloc(dbg, attr, DW_DLA_ATTR);
upper:
rc = dwarf_attr(child, DW_AT_upper_bound, &attr, perr);
if (rc != DW_DLV_OK) {
return rc;
}
rc = dwarf_whatform(attr, &form, perr);
if (rc != DW_DLV_OK) {
fprintf(stderr, "Error getting whatform: %s\n",
dwarf_errmsg(*perr));
exit(5);
}
if (form == DW_FORM_data1 || form == DW_FORM_data2
|| form == DW_FORM_data2 || form == DW_FORM_data4
|| form == DW_FORM_data8 || form == DW_FORM_udata) {
dwarf_formudata(attr, &upper_bound, 0);
} else if (form == DW_FORM_sdata) {
Dwarf_Signed ssize;
dwarf_formsdata(attr, &ssize, 0);
if (ssize < 0) {
fprintf(stderr,
"unsupported negative size\n");
exit(5);
}
upper_bound = (Dwarf_Unsigned) ssize;
} else {
Dwarf_Locdesc **locdescs;
Dwarf_Signed len;
if (dwarf_loclist_n(attr, &locdescs, &len, perr)
== DW_DLV_ERROR) {
fprintf(stderr, "unsupported member size\n");
exit(5);
}
if (len != 1
|| locdescs[0]->ld_cents != 1
|| (locdescs[0]->ld_s[0]).lr_atom
!= DW_OP_plus_uconst) {
fprintf(stderr,
"unsupported location expression\n");
exit(5);
}
upper_bound = (locdescs[0]->ld_s[0]).lr_number;
}
dwarf_dealloc(dbg, attr, DW_DLA_ATTR);
*psize = ((int)upper_bound - (int)lower_bound + 1);
return DW_DLV_OK;
}
static int deref_type(Dwarf_Debug dbg, Dwarf_Die type_die,
Dwarf_Die *new_type_die, Dwarf_Half *ptype_tag,
Dwarf_Error *perr) {
Dwarf_Attribute pointer_attr;
Dwarf_Off pointer_off;
int rc;
rc = dwarf_attr(type_die, DW_AT_type, &pointer_attr,
perr);
if (rc != DW_DLV_OK)
return rc;
rc = dwarf_global_formref(pointer_attr, &pointer_off,
perr);
if (rc != DW_DLV_OK)
return rc;
rc = dwarf_offdie_b(dbg, pointer_off, 1, new_type_die,
perr);
if (rc != DW_DLV_OK)
return rc;
dwarf_dealloc(dbg, pointer_attr, DW_DLA_ATTR);
if (ptype_tag)
rc = dwarf_tag(*new_type_die, ptype_tag, perr);
return rc;
}
static void find_fields(Dwarf_Debug dbg, Dwarf_Die struct_die, Dwarf_Die die,
const char *struct_name, const char *field_names[],
int field_count, int level) {
Dwarf_Die next;
Dwarf_Error err;
int rc, i, printed_count = 0;
int size;
printf("struct %s {\n\tunion {\n",
struct_name);
rc = dwarf_get_size(dbg, struct_die, &size, &err);
if (rc != DW_DLV_OK) {
fprintf(stderr, "could not get size for struct %s: %s\n",
struct_name, dwarf_errmsg(err));
exit(1);
}
printf("\t\tchar whole_struct[%d];\n", size);
do {
char *name;
const char *tag_name;
Dwarf_Half tag;
rc = dwarf_diename(die, &name, &err);
if (rc == DW_DLV_NO_ENTRY) {
name = NULL;
} else if (rc != DW_DLV_OK) {
fprintf(stderr, "dwarf_diename error: %d %s\n",
rc, dwarf_errmsg(err));
exit(1);
}
rc = dwarf_tag(die, &tag, &err);
if (rc != DW_DLV_OK) {
fprintf(stderr, "dwarf_tag error: %d %s\n",
rc, dwarf_errmsg(err));
exit(1);
}
if (debug) {
rc = dwarf_get_TAG_name(tag, &tag_name);
if (rc != DW_DLV_OK) {
fprintf(stderr,
"dwarf_get_TAG_name error: %d\n", rc);
exit(1);
}
printf("<%d> %p <%d> %s: %s\n", level, die, tag,
tag_name, name ? name : "<no name>");
}
if (tag == DW_TAG_member && name) {
for (i = 0; i < field_count; i++) {
if (!field_names[i])
continue;
if (strcasecmp(name, field_names[i]) == 0) {
print_field(dbg, die, field_names[i],
printed_count);
field_names[i] = NULL;
printed_count++;
break;
}
}
if (printed_count == field_count) {
printf("\t};\n};\n");
exit(0);
}
}
rc = dwarf_siblingof(dbg, die, &next, &err);
dwarf_dealloc(dbg, die, DW_DLA_DIE);
if (name)
dwarf_dealloc(dbg, name, DW_DLA_STRING);
if (rc != DW_DLV_OK)
break;
die = next;
} while (die);
}
static void print_field(Dwarf_Debug dbg, Dwarf_Die die, const char *field_name,
int padnum) {
Dwarf_Attribute attr;
Dwarf_Error err;
int offset = 0;
char type_buf[1024];
char array_buf[128] = "";
char pointer_buf[128] = "";
int rc;
rc = dwarf_get_offset(dbg, die, &offset, &err);
if (rc == DW_DLV_NO_ENTRY) {
fprintf(stderr, "Found %s but no offset, assuming 0\n",
field_name);
} else if (rc != DW_DLV_OK) {
fprintf(stderr, "Error getting dwarf attr offset: %s\n",
dwarf_errmsg(err));
exit(4);
}
rc = dwarf_attr(die, DW_AT_type, &attr, &err);
if (rc == DW_DLV_NO_ENTRY) {
fprintf(stderr,
"Found %s but no type, can't assume that one out..\n",
field_name);
exit(6);
} else if (rc != DW_DLV_OK) {
fprintf(stderr, "Error getting dwarf attrlist: %s\n",
dwarf_errmsg(err));
exit(6);
} else {
Dwarf_Die type_die, next;
Dwarf_Off type_off;
Dwarf_Half type_tag;
char *type_name;
int pointer = 0;
rc = dwarf_global_formref(attr, &type_off, &err);
if (rc != DW_DLV_OK) {
fprintf(stderr,
"Error getting ref offset for type: %s\n",
dwarf_errmsg(err));
exit(7);
}
rc = dwarf_offdie_b(dbg, type_off, 1, &type_die, &err);
if (rc != DW_DLV_OK) {
fprintf(stderr,
"Error getting die from offset for type: %s\n",
dwarf_errmsg(err));
exit(7);
}
rc = dwarf_tag(type_die, &type_tag, &err);
if (rc != DW_DLV_OK) {
fprintf(stderr, "dwarf_tag error: %d %s\n",
rc, dwarf_errmsg(err));
exit(7);
}
while (type_tag == DW_TAG_pointer_type) {
pointer_buf[pointer++] = '*';
rc = deref_type(dbg, type_die, &next,
&type_tag, &err);
/* No entry here means void* */
if (rc == DW_DLV_NO_ENTRY)
break;
if (rc != DW_DLV_OK) {
fprintf(stderr,
"Could not deref type for %s: %s\n",
field_name, dwarf_errmsg(err));
exit(7);
}
dwarf_dealloc(dbg, type_die, DW_DLA_DIE);
type_die = next;
}
if (type_tag == DW_TAG_array_type) {
int next_offset, size;
rc = deref_type(dbg, type_die, &next,
&type_tag, &err);
if (rc == DW_DLV_NO_ENTRY) {
fprintf(stderr,
"Could not deref array type for %s: no entry\n",
field_name);
exit(7);
}
if (rc != DW_DLV_OK) {
fprintf(stderr,
"Could not deref type for %s: %s\n",
field_name, dwarf_errmsg(err));
exit(7);
}
rc = dwarf_get_arraysize(dbg, type_die, &size, &err);
if (rc != DW_DLV_OK) {
fprintf(stderr,
"Could not get array size for %s: %s\n",
field_name, dwarf_errmsg(err));
exit(7);
}
type_die = next;
snprintf(array_buf, 128, "[%d]", size);
}
/* If it's still pointer at this point, it's void * */
if (type_tag != DW_TAG_pointer_type) {
rc = dwarf_diename(type_die, &type_name, &err);
if (rc != DW_DLV_OK) {
fprintf(stderr, "dwarf_diename error: %s\n",
rc == DW_DLV_NO_ENTRY ?
"no name" : dwarf_errmsg(err));
const char *tag_name;
rc = dwarf_get_TAG_name(type_tag, &tag_name);
if (rc != DW_DLV_OK) {
fprintf(stderr,
"dwarf_get_TAG_name error: %d\n",
rc);
}
fprintf(stderr, "Bad tag %s (%d)?\n",
tag_name, type_tag);
exit(7);
}
}
if (type_tag == DW_TAG_structure_type) {
snprintf(type_buf, 1024, "struct %s %s",
type_name, pointer_buf);
} else if (type_tag == DW_TAG_enumeration_type) {
snprintf(type_buf, 1024, "enum %s %s",
type_name, pointer_buf);
} else if (type_tag == DW_TAG_base_type
|| type_tag == DW_TAG_typedef) {
snprintf(type_buf, 1024, "%s %s", type_name,
pointer_buf);
} else if (type_tag == DW_TAG_pointer_type) {
snprintf(type_buf, 1024, "void %s", pointer_buf);
} else {
const char *tag_name;
rc = dwarf_get_TAG_name(type_tag, &tag_name);
if (rc != DW_DLV_OK) {
fprintf(stderr,
"dwarf_get_TAG_name error: %d\n", rc);
}
fprintf(stderr,
"Type tag %s (%d) is not implemented, please add it\n",
tag_name, type_tag);
exit(7);
}
if (type_tag != DW_TAG_pointer_type)
dwarf_dealloc(dbg, type_name, DW_DLA_STRING);
dwarf_dealloc(dbg, attr, DW_DLA_ATTR);
dwarf_dealloc(dbg, type_die, DW_DLA_DIE);
}
printf("\t\tstruct {\n\t\t\tchar padding%i[%u];\n\t\t\t%s%s%s;\n\t\t};\n",
padnum, (unsigned int) offset,
type_buf, field_name, array_buf);
}
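/*
 * For illustration (not part of the original source): asking the tool for
 * two hypothetical members "foo" and "bar" of "struct example" emits a
 * header shaped roughly like the following, where the sizes and offsets
 * are made up:
 *
 *	struct example {
 *		union {
 *			char whole_struct[128];
 *			struct {
 *				char padding0[8];
 *				int foo;
 *			};
 *			struct {
 *				char padding1[24];
 *				void *bar;
 *			};
 *		};
 *	};
 *
 * Each requested member is overlaid at its original offset while the rest
 * of the structure stays opaque, so the generated headers track the layout
 * of the running hfi1.ko without copying its full definitions.
 */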


@ -0,0 +1,71 @@
#!/bin/bash
# usage:
# /path/to/regenerate_hfi1_header.sh [hfi1.ko]
SCRIPT_PATH="${BASH_SOURCE[0]}"
ROOTDIR=$(readlink -m "$SCRIPT_PATH")
ROOTDIR=$(dirname "$ROOTDIR")
set -e -u
# static configuration-ish
declare -r DES_BIN="${ROOTDIR}/dwarf-extract-struct"
declare -r DES_SRC="${DES_BIN}.c"
declare -r HDR_PREFIX="${ROOTDIR}/../include/hfi1/hfi1_generated_"
error() {
echo "$@" >&2
exit 1
}
HFI1_KO="${1-$(modinfo -n hfi1)}" || \
error "Could not find hfi1 module and no argument given. Usage: $0 [hfi1.ko]"
[[ "$DES_BIN" -nt "$DES_SRC" ]] || \
gcc -o "$DES_BIN" -g "$DES_SRC" -ldwarf || \
error "Could not compile, install libdwarf-devel ?"
"$DES_BIN" "$HFI1_KO" hfi1_pportdata \
vls_operational > "${HDR_PREFIX}pportdata.h"
"$DES_BIN" "$HFI1_KO" hfi1_ctxtdata \
ctxt rcv_array_groups eager_base expected_count expected_base \
tid_group_list tid_used_list tid_full_list dd \
> "${HDR_PREFIX}ctxtdata.h"
"$DES_BIN" "$HFI1_KO" hfi1_devdata \
per_sdma sdma_pad_phys sdma_map pport chip_rcv_array_count \
kregbase1 piobase physaddr rcvarray_wc default_desc1 flags \
sc2vl events first_dyn_alloc_ctxt chip_rcv_contexts \
> "${HDR_PREFIX}devdata.h"
"$DES_BIN" "$HFI1_KO" hfi1_filedata \
uctxt pq cq dd subctxt entry_to_rb tid_lock tid_used \
invalid_tids invalid_tid_idx invalid_lock \
> "${HDR_PREFIX}filedata.h"
"$DES_BIN" "$HFI1_KO" sdma_state \
current_state go_s99_running previous_state\
> "${HDR_PREFIX}sdma_state.h"
"$DES_BIN" "$HFI1_KO" sdma_engine \
dd tail_lock desc_avail tail_csr flushlist flushlist_lock \
descq_head descq_tail descq_cnt state sdma_shift sdma_mask\
descq tx_ring tx_tail head_lock descq_full_count ahg_bits\
this_idx \
> "${HDR_PREFIX}sdma_engine.h"
"$DES_BIN" "$HFI1_KO" user_sdma_request \
data_iovs pq cq txps info hdr tidoffset data_len \
iov_idx sent seqnum has_error koffset tididx \
tids n_tids sde ahg_idx iovs seqcomp seqsubmitted \
> "${HDR_PREFIX}user_sdma_request.h"
"$DES_BIN" "$HFI1_KO" user_sdma_txreq \
hdr txreq list req flags busycount seqnum \
> "${HDR_PREFIX}user_sdma_txreq.h"
"$DES_BIN" "$HFI1_KO" hfi1_user_sdma_pkt_q \
dd req_in_use reqs n_reqs state n_max_reqs \
> "${HDR_PREFIX}hfi1_user_sdma_pkt_q.h"

kernel/sdma.c (new file, 682 lines)

@ -0,0 +1,682 @@
/*
* Copyright(c) 2015, 2016 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* BSD LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* - Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <hfi1/ihk_hfi1_common.h>
#include <hfi1/user_sdma.h>
#include <hfi1/sdma.h>
#include <hfi1/common.h>
//#define DEBUG_PRINT_SDMA
#ifdef DEBUG_PRINT_SDMA
#define dkprintf(...) kprintf(__VA_ARGS__)
#define ekprintf(...) kprintf(__VA_ARGS__)
#else
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
#define ekprintf(...) kprintf(__VA_ARGS__)
#endif
unsigned long hfi1_cap_mask = HFI1_CAP_MASK_DEFAULT;
/* must be a power of 2 >= 64 <= 32768 */
#define SDMA_DESCQ_CNT 2048
#define SDMA_DESC_INTR 64
#define INVALID_TAIL 0xffff
#define SDMA_TAIL_UPDATE_THRESH 0x1F
/**
* sdma_select_engine_vl() - select sdma engine
* @dd: devdata
* @selector: a spreading factor
* @vl: this vl
*
*
* This function returns an engine based on the selector and a vl. The
* mapping fields are protected by RCU.
*/
struct sdma_engine *sdma_select_engine_vl(
struct hfi1_devdata *dd,
u32 selector,
u8 vl)
{
struct sdma_vl_map *m;
struct sdma_map_elem *e;
struct sdma_engine *rval;
/* NOTE This should only happen if SC->VL changed after the initial
* checks on the QP/AH
* Default will return engine 0 below
*/
if (vl >= HFI1_MAX_VLS_SUPPORTED) {
rval = NULL;
goto done;
}
m = ACCESS_ONCE(dd->sdma_map);
if (unlikely(!m)) {
return &dd->per_sdma[0];
}
e = m->map[vl & m->mask];
rval = e->sde[selector & e->mask];
done:
rval = !rval ? &dd->per_sdma[0] : rval;
// trace_hfi1_sdma_engine_select(dd, selector, vl, rval->this_idx);
hfi1_cdbg(AIOWRITE, "-");
return rval;
}
int sdma_select_user_engine_idx(void)
{
int idx = 0;
int idx_start = 0;
int idx_modulo = 16;
/* Hash on rank if MPI job */
if (cpu_local_var(current)->proc->nr_processes > 1) {
idx = idx_start +
(cpu_local_var(current)->proc->process_rank % idx_modulo);
}
/* Otherwise, CPU id */
else {
idx = ihk_mc_get_processor_id() % idx_modulo;
}
return idx;
}
/*
* sdma_select_user_engine() - select sdma engine based on user setup
* @dd: devdata
* @selector: a spreading factor
* @vl: this vl
*
* This function returns an sdma engine for a user sdma request.
* User defined sdma engine affinity setting is honored when applicable,
* otherwise system default sdma engine mapping is used. To ensure correct
* ordering, the mapping from <selector, vl> to sde must remain unchanged.
*/
struct sdma_engine *sdma_select_user_engine(struct hfi1_devdata *dd,
u32 selector, u8 vl)
{
return &dd->per_sdma[sdma_select_user_engine_idx()];
}
/*
* return the mode as indicated by the first
* descriptor in the tx.
*/
static inline u8 ahg_mode(struct sdma_txreq *tx)
{
return (tx->descp[0].qw[1] & SDMA_DESC1_HEADER_MODE_SMASK)
>> SDMA_DESC1_HEADER_MODE_SHIFT;
}
/**
* __sdma_txclean() - clean tx of mappings, descp *kmalloc's
* @dd: hfi1_devdata for unmapping
* @tx: tx request to clean
*
* This is used in the progress routine to clean the tx or
* by the ULP to toss an in-process tx build.
*
* The code can be called multiple times without issue.
*
*/
void __sdma_txclean(
struct hfi1_devdata *dd,
struct sdma_txreq *tx)
{
if (tx->num_desc) {
/* TODO: enable sdma_unmap_desc */
#if 0
u16 i;
u8 skip = 0, mode = ahg_mode(tx);
/* unmap first */
//sdma_unmap_desc(dd, &tx->descp[0]);
/* determine number of AHG descriptors to skip */
if (mode > SDMA_AHG_APPLY_UPDATE1)
skip = mode >> 1;
// for (i = 1 + skip; i < tx->num_desc; i++)
// sdma_unmap_desc(dd, &tx->descp[i]);
#endif
tx->num_desc = 0;
}
kfree(tx->coalesce_buf);
tx->coalesce_buf = NULL;
/* kmalloc'ed descp */
if (unlikely(tx->desc_limit > ARRAY_SIZE(tx->descs))) {
tx->desc_limit = ARRAY_SIZE(tx->descs);
kfree(tx->descp);
}
}
static inline void sdma_update_tail(struct sdma_engine *sde, u16 tail)
{
/* Commit writes to memory and advance the tail on the chip */
smp_wmb(); /* see get_txhead() */
writeq(tail, sde->tail_csr);
}
/*
* add the generation number into
* the qw1 and return
*/
static inline u64 add_gen(struct sdma_engine *sde, u64 qw1)
{
u8 generation = (sde->descq_tail >> sde->sdma_shift) & 3;
qw1 &= ~SDMA_DESC1_GENERATION_SMASK;
qw1 |= ((u64)generation & SDMA_DESC1_GENERATION_MASK)
<< SDMA_DESC1_GENERATION_SHIFT;
return qw1;
}
/*
* This routine submits the indicated tx
*
* Space has already been guaranteed and
* tail side of ring is locked.
*
* The hardware tail update is done
* in the caller and that is facilitated
* by returning the new tail.
*
* There is special case logic for ahg
* to not add the generation number for
* up to 2 descriptors that follow the
* first descriptor.
*
*/
static inline u16 submit_tx(struct sdma_engine *sde, struct sdma_txreq *tx)
{
int i;
u16 tail;
struct sdma_desc *descp = tx->descp;
u8 skip = 0, mode = ahg_mode(tx);
tail = sde->descq_tail & sde->sdma_mask;
sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
sde->descq[tail].qw[1] = cpu_to_le64(add_gen(sde, descp->qw[1]));
// trace_hfi1_sdma_descriptor(sde, descp->qw[0], descp->qw[1],
// tail, &sde->descq[tail]);
tail = ++sde->descq_tail & sde->sdma_mask;
descp++;
if (mode > SDMA_AHG_APPLY_UPDATE1)
skip = mode >> 1;
for (i = 1; i < tx->num_desc; i++, descp++) {
u64 qw1;
sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
if (skip) {
/* edits don't have generation */
qw1 = descp->qw[1];
skip--;
} else {
/* replace generation with real one for non-edits */
qw1 = add_gen(sde, descp->qw[1]);
}
sde->descq[tail].qw[1] = cpu_to_le64(qw1);
// trace_hfi1_sdma_descriptor(sde, descp->qw[0], qw1,
// tail, &sde->descq[tail]);
tail = ++sde->descq_tail & sde->sdma_mask;
}
tx->next_descq_idx = tail;
#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
tx->sn = sde->tail_sn++;
// trace_hfi1_sdma_in_sn(sde, tx->sn);
WARN_ON_ONCE(sde->tx_ring[sde->tx_tail & sde->sdma_mask]);
#endif
sde->tx_ring[sde->tx_tail++ & sde->sdma_mask] = tx;
sde->desc_avail -= tx->num_desc;
return tail;
}
/*
* Check for progress
*/
static int sdma_check_progress(
struct sdma_engine *sde,
struct iowait_work *wait,
struct sdma_txreq *tx,
bool pkts_sent)
{
int ret;
hfi1_cdbg(AIOWRITE, "+");
sde->desc_avail = sdma_descq_freecnt(sde);
if (tx->num_desc <= sde->desc_avail)
return -EAGAIN;
/* pulse the head_lock */
if (wait && iowait_ioww_to_iow(wait)->sleep) {
unsigned seq;
seq = raw_seqcount_begin(
(const seqcount_t *)&sde->head_lock.seqcount);
ret = wait->iow->sleep(sde, wait, tx, seq, pkts_sent);
if (ret == -EAGAIN)
sde->desc_avail = sdma_descq_freecnt(sde);
} else {
ret = -EBUSY;
}
hfi1_cdbg(AIOWRITE, "-");
return ret;
}
/**
* sdma_send_txlist() - submit a list of tx req to ring
* @sde: sdma engine to use
* @wait: SE wait structure to use when full (may be NULL)
* @tx_list: list of sdma_txreqs to submit
* @count: pointer to a u32 which, after return will contain the total number of
* sdma_txreqs removed from the tx_list. This will include sdma_txreqs
* whose SDMA descriptors are submitted to the ring and the sdma_txreqs
* which are added to SDMA engine flush list if the SDMA engine state is
* not running.
*
* The call submits the list into the ring.
*
* If the iowait structure is non-NULL and not equal to the iowait list
* the unprocessed part of the list will be appended to the list in wait.
*
* In all cases, the tx_list will be updated so the head of the tx_list is
* the list of descriptors that have yet to be transmitted.
*
* The intent of this call is to provide a more efficient
* way of submitting multiple packets to SDMA while holding the tail
* side locking.
*
* Return:
* 0 - Success,
* -EINVAL - sdma_txreq incomplete, -EBUSY - no space in ring (wait == NULL)
* -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state
*/
int sdma_send_txlist(struct sdma_engine *sde, struct iowait_work *wait,
struct list_head *tx_list, u32 *count_out)
{
struct sdma_txreq *tx, *tx_next;
int ret = 0;
unsigned long flags;
u16 tail = INVALID_TAIL;
u32 submit_count = 0, flush_count = 0, total_count;
retry_lock:
linux_spin_lock_irqsave(&sde->tail_lock, flags);
retry:
list_for_each_entry_safe(tx, tx_next, tx_list, list) {
tx->wait = iowait_ioww_to_iow(wait);
if (unlikely(!__sdma_running(sde))) {
kprintf("%s: !__sdma_running \n", __FUNCTION__);
goto unlock_noconn;
}
if (unlikely(tx->num_desc > sde->desc_avail)) {
goto nodesc;
}
if (unlikely(tx->tlen)) {
ret = -EINVAL;
goto update_tail;
}
list_del_init(&tx->list);
tail = submit_tx(sde, tx);
submit_count++;
if (tail != INVALID_TAIL &&
(submit_count & SDMA_TAIL_UPDATE_THRESH) == 0) {
sdma_update_tail(sde, tail);
tail = INVALID_TAIL;
}
}
update_tail:
total_count = submit_count + flush_count;
if (wait)
iowait_sdma_add(iowait_ioww_to_iow(wait), total_count);
if (tail != INVALID_TAIL)
sdma_update_tail(sde, tail);
linux_spin_unlock_irqrestore(&sde->tail_lock, flags);
*count_out = total_count;
return ret;
unlock_noconn:
nodesc:
{
/*
* Either way, we spin.
* We never sleep in McKernel so release the lock occasionally
* to give a chance to Linux.
*/
unsigned long ts = rdtsc();
while ((tx->num_desc > sde->desc_avail) &&
(rdtsc() - ts) < 5000000) {
sde->desc_avail = sdma_descq_freecnt(sde);
cpu_pause();
}
if (tx->num_desc <= sde->desc_avail) {
ret = 0;
goto retry;
}
dkprintf("%s: releasing lock and reiterating.. \n", __FUNCTION__);
linux_spin_unlock_irqrestore(&sde->tail_lock, flags);
cpu_pause();
ret = 0;
goto retry_lock;
}
}
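/*
 * Illustrative, self-contained sketch (compile separately; not driver code):
 * the batching pattern behind sdma_send_txlist() above.  Descriptors are
 * submitted one by one, but the hardware tail is only published every few
 * submissions and once more at the end, which is what the update_tail path
 * does with SDMA_TAIL_UPDATE_THRESH.  The threshold mask below is an
 * assumed value for the demo.
 */
#include <stdio.h>

#define DEMO_TAIL_UPDATE_THRESH 0x1f	/* assumed: publish every 32 submits */

static void publish_tail(unsigned tail)
{
	printf("hardware tail <- %u\n", tail);
}

int main(void)
{
	unsigned submit_count = 0, tail = 0;
	int pending = 0;
	unsigned i;

	for (i = 0; i < 100; i++) {
		tail++;				/* one descriptor submitted */
		submit_count++;
		pending = 1;
		if ((submit_count & DEMO_TAIL_UPDATE_THRESH) == 0) {
			publish_tail(tail);	/* periodic update */
			pending = 0;
		}
	}
	if (pending)
		publish_tail(tail);		/* final update, as in update_tail */
	return 0;
}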
/*
* _extend_sdma_tx_descs() - helper to extend txreq
*
* This is called once the initial nominal allocation
* of descriptors in the sdma_txreq is exhausted.
*
* The code will bump the allocation up to the max
* of MAX_DESC (64) descriptors. There doesn't seem to be
* much point in an interim step. The last descriptor
* is reserved for the coalesce buffer in order to support
* cases where an input packet has more than MAX_DESC iovecs.
*
*/
static int _extend_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
{
int i;
/* Handle last descriptor */
if (unlikely((tx->num_desc == (MAX_DESC - 1)))) {
/* if tlen is 0, it is for padding, release last descriptor */
if (!tx->tlen) {
tx->desc_limit = MAX_DESC;
} else if (!tx->coalesce_buf) {
/* allocate coalesce buffer with space for padding */
tx->coalesce_buf = kmalloc(tx->tlen + sizeof(u32),
GFP_ATOMIC);
if (!tx->coalesce_buf)
goto enomem;
tx->coalesce_idx = 0;
}
return 0;
}
if (unlikely(tx->num_desc == MAX_DESC))
goto enomem;
tx->descp = kmalloc_array(
MAX_DESC,
sizeof(struct sdma_desc),
GFP_ATOMIC);
if (!tx->descp)
goto enomem;
/* reserve last descriptor for coalescing */
tx->desc_limit = MAX_DESC - 1;
/* copy ones already built */
for (i = 0; i < tx->num_desc; i++)
tx->descp[i] = tx->descs[i];
return 0;
enomem:
__sdma_txclean(dd, tx);
return -ENOMEM;
}
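/*
 * Illustrative, self-contained sketch (compile separately; not driver code):
 * the growth policy of _extend_sdma_tx_descs() above.  A request starts with
 * a small inline descriptor array and, when that is exhausted, jumps straight
 * to a MAX_DESC-sized heap array with the last slot held back for a coalesce
 * buffer.  The array sizes and struct layout below are assumptions for the
 * demo.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define DEMO_MAX_DESC    64
#define DEMO_INLINE_DESC  6

struct demo_desc { unsigned long qw[2]; };

struct demo_tx {
	unsigned num_desc, desc_limit;
	struct demo_desc descs[DEMO_INLINE_DESC];	/* inline storage */
	struct demo_desc *descp;			/* inline or heap */
};

static int demo_extend(struct demo_tx *tx)
{
	struct demo_desc *p;

	if (tx->num_desc == DEMO_MAX_DESC)
		return -1;			/* truly out of descriptors */
	p = malloc(DEMO_MAX_DESC * sizeof(*p));
	if (!p)
		return -1;
	memcpy(p, tx->descs, tx->num_desc * sizeof(*p));	/* copy built ones */
	tx->descp = p;
	tx->desc_limit = DEMO_MAX_DESC - 1;	/* last entry reserved */
	return 0;
}

int main(void)
{
	struct demo_tx tx = { .num_desc = DEMO_INLINE_DESC,
			      .desc_limit = DEMO_INLINE_DESC };

	tx.descp = tx.descs;
	if (demo_extend(&tx) == 0)
		printf("num_desc=%u, desc_limit now %u (of %u max)\n",
		       tx.num_desc, tx.desc_limit, DEMO_MAX_DESC);
	if (tx.descp != tx.descs)
		free(tx.descp);
	return 0;
}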
/*
* ext_coal_sdma_tx_descs() - extend or coalesce sdma tx descriptors
*
* This is called once the initial nominal allocation of descriptors
* in the sdma_txreq is exhausted.
*
* This function calls _extend_sdma_tx_descs to extend or allocate the
* coalesce buffer. If a coalesce buffer has been allocated, it copies
* the input packet data into the coalesce buffer. It also adds the
* coalesce buffer descriptor once the whole packet has been received.
*
* Return:
* <0 - error
* 0 - coalescing, don't populate descriptor
* 1 - continue with populating descriptor
*/
int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx,
int type, void *kvaddr, struct page *page,
unsigned long offset, u16 len)
{
//TODO: ext_coal_sdma_tx_descs
#ifdef __HFI1_ORIG__
int pad_len, rval;
dma_addr_t addr;
rval = _extend_sdma_tx_descs(dd, tx);
if (rval) {
__sdma_txclean(dd, tx);
return rval;
}
/* If coalesce buffer is allocated, copy data into it */
if (tx->coalesce_buf) {
if (type == SDMA_MAP_NONE) {
__sdma_txclean(dd, tx);
return -EINVAL;
}
if (type == SDMA_MAP_PAGE) {
kvaddr = kmap(page);
kvaddr += offset;
} else if (WARN_ON(!kvaddr)) {
__sdma_txclean(dd, tx);
return -EINVAL;
}
memcpy(tx->coalesce_buf + tx->coalesce_idx, kvaddr, len);
tx->coalesce_idx += len;
if (type == SDMA_MAP_PAGE)
kunmap(page);
/* If there is more data, return */
if (tx->tlen - tx->coalesce_idx)
return 0;
/* Whole packet is received; add any padding */
pad_len = tx->packet_len & (sizeof(u32) - 1);
if (pad_len) {
pad_len = sizeof(u32) - pad_len;
memset(tx->coalesce_buf + tx->coalesce_idx, 0, pad_len);
/* padding is taken care of for coalescing case */
tx->packet_len += pad_len;
tx->tlen += pad_len;
}
/* dma map the coalesce buffer */
addr = dma_map_single(&dd->pcidev->dev,
tx->coalesce_buf,
tx->tlen,
DMA_TO_DEVICE);
if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
__sdma_txclean(dd, tx);
return -ENOSPC;
}
/* Add descriptor for coalesce buffer */
tx->desc_limit = MAX_DESC;
return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, tx,
addr, tx->tlen);
}
#endif /* __HFI1_ORIG__ */
return 1;
}
/* tx not dword sized - pad */
int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
{
int rval = 0;
tx->num_desc++;
if ((unlikely(tx->num_desc == tx->desc_limit))) {
rval = _extend_sdma_tx_descs(dd, tx);
if (rval) {
__sdma_txclean(dd, tx);
return rval;
}
}
/* finish the one just added */
make_tx_sdma_desc(
tx,
SDMA_MAP_NONE,
dd->sdma_pad_phys,
sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1)));
_sdma_close_tx(dd, tx);
return rval;
}
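/*
 * Illustrative, self-contained sketch (compile separately; not driver code):
 * the dword padding arithmetic used by _pad_sdma_tx_descs() and the coalesce
 * path above.  A packet whose length is not a multiple of sizeof(u32) is
 * padded up to the next 4-byte boundary.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned lengths[] = { 8, 9, 10, 11, 12 };
	unsigned i;

	for (i = 0; i < sizeof(lengths) / sizeof(lengths[0]); i++) {
		unsigned len = lengths[i];
		unsigned pad = len & (sizeof(uint32_t) - 1);

		if (pad)
			pad = sizeof(uint32_t) - pad;
		printf("packet_len=%2u -> pad=%u -> padded len=%u\n",
		       len, pad, len + pad);
	}
	return 0;
}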
/*
* Add ahg to the sdma_txreq
*
* The logic will consume up to 3
* descriptors at the beginning of
* sdma_txreq.
*/
void _sdma_txreq_ahgadd(
struct sdma_txreq *tx,
u8 num_ahg,
u8 ahg_entry,
u32 *ahg,
u8 ahg_hlen)
{
u32 i, shift = 0, desc = 0;
u8 mode;
WARN_ON_ONCE(num_ahg > 9 || (ahg_hlen & 3) || ahg_hlen == 4);
/* compute mode */
if (num_ahg == 1)
mode = SDMA_AHG_APPLY_UPDATE1;
else if (num_ahg <= 5)
mode = SDMA_AHG_APPLY_UPDATE2;
else
mode = SDMA_AHG_APPLY_UPDATE3;
tx->num_desc++;
/* initialize the consumed descriptors to zero */
switch (mode) {
case SDMA_AHG_APPLY_UPDATE3:
tx->num_desc++;
tx->descs[2].qw[0] = 0;
tx->descs[2].qw[1] = 0;
/* FALLTHROUGH */
case SDMA_AHG_APPLY_UPDATE2:
tx->num_desc++;
tx->descs[1].qw[0] = 0;
tx->descs[1].qw[1] = 0;
break;
}
ahg_hlen >>= 2;
tx->descs[0].qw[1] |=
(((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK)
<< SDMA_DESC1_HEADER_INDEX_SHIFT) |
(((u64)ahg_hlen & SDMA_DESC1_HEADER_DWS_MASK)
<< SDMA_DESC1_HEADER_DWS_SHIFT) |
(((u64)mode & SDMA_DESC1_HEADER_MODE_MASK)
<< SDMA_DESC1_HEADER_MODE_SHIFT) |
(((u64)ahg[0] & SDMA_DESC1_HEADER_UPDATE1_MASK)
<< SDMA_DESC1_HEADER_UPDATE1_SHIFT);
for (i = 0; i < (num_ahg - 1); i++) {
if (!shift && !(i & 2))
desc++;
tx->descs[desc].qw[!!(i & 2)] |=
(((u64)ahg[i + 1])
<< shift);
shift = (shift + 32) & 63;
}
}
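/*
 * Illustrative, self-contained sketch (compile separately; not driver code):
 * how the AHG mode chosen in _sdma_txreq_ahgadd() above follows from the
 * number of header update words: one update fits in descriptor 0, up to five
 * need one extra descriptor, and more than five need two.  The enum values
 * are demo stand-ins, not the real SDMA_AHG_APPLY_UPDATE* constants.
 */
#include <stdio.h>

enum demo_ahg_mode { DEMO_UPDATE1 = 1, DEMO_UPDATE2, DEMO_UPDATE3 };

int main(void)
{
	unsigned num_ahg;

	for (num_ahg = 1; num_ahg <= 9; num_ahg++) {
		enum demo_ahg_mode mode;
		unsigned descs;

		if (num_ahg == 1)
			mode = DEMO_UPDATE1;
		else if (num_ahg <= 5)
			mode = DEMO_UPDATE2;
		else
			mode = DEMO_UPDATE3;
		descs = (mode == DEMO_UPDATE1) ? 1 :
			(mode == DEMO_UPDATE2) ? 2 : 3;
		printf("num_ahg=%u -> mode=%d, descriptors consumed=%u\n",
		       num_ahg, (int)mode, descs);
	}
	return 0;
}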
/**
* sdma_ahg_alloc - allocate an AHG entry
* @sde: engine to allocate from
*
* Return:
* 0-31 when successful, -EOPNOTSUPP if AHG is not enabled,
* -ENOSPC if an entry is not available
*/
int sdma_ahg_alloc(struct sdma_engine *sde)
{
int nr;
int oldbit;
if (!sde) {
trace_hfi1_ahg_allocate(sde, -EINVAL);
return -EINVAL;
}
while (1) {
nr = ffz(ACCESS_ONCE(sde->ahg_bits));
if (nr > 31) {
trace_hfi1_ahg_allocate(sde, -ENOSPC);
return -ENOSPC;
}
oldbit = test_and_set_bit(nr, &sde->ahg_bits);
if (!oldbit)
break;
cpu_relax();
}
trace_hfi1_ahg_allocate(sde, nr);
return nr;
}
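/*
 * Illustrative, self-contained sketch (compile separately; not driver code):
 * the lock-free bitmap allocation pattern of sdma_ahg_alloc() above: find
 * the first zero bit, claim it with an atomic test-and-set, and retry if
 * another CPU won the race.  C11 atomics stand in for the kernel's ffz()
 * and test_and_set_bit().
 */
#include <stdio.h>
#include <stdatomic.h>

static _Atomic unsigned long demo_bits;

static int demo_ahg_alloc(void)
{
	for (;;) {
		unsigned long cur = atomic_load(&demo_bits);
		int nr = 0;

		while (nr < 32 && (cur & (1UL << nr)))	/* first zero bit */
			nr++;
		if (nr > 31)
			return -1;			/* all 32 entries in use */
		if (!(atomic_fetch_or(&demo_bits, 1UL << nr) & (1UL << nr)))
			return nr;			/* claimed it */
		/* lost the race, retry */
	}
}

int main(void)
{
	int i;

	for (i = 0; i < 4; i++)
		printf("allocated AHG-style entry %d\n", demo_ahg_alloc());
	return 0;
}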
/**
* sdma_ahg_free - free an AHG entry
* @sde: engine to return AHG entry
* @ahg_index: index to free
*
* This routine frees the indicated AHG entry.
*/
void sdma_ahg_free(struct sdma_engine *sde, int ahg_index)
{
if (!sde)
return;
trace_hfi1_ahg_deallocate(sde, ahg_index);
if (ahg_index < 0 || ahg_index > 31)
return;
clear_bit(ahg_index, &sde->ahg_bits);
}


@ -67,6 +67,8 @@
#include <lwk/stddef.h>
#include <futex.h>
#include <hfi1/file_ops.h>
#define SYSCALL_BY_IKC
//#define DEBUG_PRINT_SC
@ -268,6 +270,7 @@ long do_syscall(struct syscall_request *req, int cpu, int pid)
req->rtid = cpu_local_var(current)->tid;
req->ttid = 0;
res.req_thread_status = IHK_SCD_REQ_THREAD_SPINNING;
res.private_data = NULL;
#ifdef POSTK_DEBUG_TEMP_FIX_26 /* do_syscall arg pid is not targetpid */
send_syscall(req, cpu, target_pid, &res);
#else /* POSTK_DEBUG_TEMP_FIX_26 */
@ -477,6 +480,21 @@ long do_syscall(struct syscall_request *req, int cpu, int pid)
}
#endif // PROFILE_ENABLE
if (req->number == __NR_open && rc > 0) {
if ((cpu_local_var(current)->proc->mcexec_flags & MCEXEC_HFI1) &&
res.private_data &&
!strncmp((const char *)req->args[0], "/dev/hfi", 8)) {
if (rc >= 0 && rc < MAX_FD_PRIV) {
thread->proc->fd_priv_table[rc] = res.private_data;
}
dkprintf("%s: PID: %d, open fd: %d, filename: "
"%s, private_data: 0x%lx\n",
__FUNCTION__, thread->proc->pid,
rc, req->args[0], res.private_data);
}
}
monitor->status = mstatus;
monitor->counter++;
return rc;
@ -1060,6 +1078,12 @@ void terminate(int rc, int sig)
mcs_rwlock_writer_unlock(&proc->threads_lock, &lock);
vm = proc->vm;
{
extern int hfi1_unmap_device_addresses(struct process *proc);
hfi1_unmap_device_addresses(proc);
}
free_all_process_memory_range(vm);
if (proc->saved_cmdline) {
@ -1220,6 +1244,8 @@ interrupt_syscall(struct thread *thread, int sig)
SYSCALL_DECLARE(exit_group)
{
dkprintf("sys_exit_group,pid=%d\n", cpu_local_var(current)->proc->pid);
dkprintf("%s: PID: %d, TID: %d\n", __FUNCTION__,
cpu_local_var(current)->proc->pid, cpu_local_var(current)->tid);
terminate((int)ihk_mc_syscall_arg0(ctx), 0);
return 0;
@ -1268,6 +1294,19 @@ int do_munmap(void *addr, size_t len)
int error;
int ro_freed;
/*
* TODO: register a callback for address space changes.
*/
{
extern int hfi1_user_exp_rcv_overlapping(
unsigned long start, unsigned long end);
unsigned long start = (unsigned long)addr;
if (hfi1_user_exp_rcv_overlapping(start, start + len)) {
return 0;
}
}
begin_free_pages_pending();
error = remove_process_memory_range(cpu_local_var(current)->vm,
(intptr_t)addr, (intptr_t)addr+len, &ro_freed);
@ -1282,6 +1321,7 @@ int do_munmap(void *addr, size_t len)
}
}
finish_free_pages_pending();
dkprintf("%s: 0x%lx:%lu, error: %ld\n",
__FUNCTION__, addr, len, error);
return error;
@ -1289,7 +1329,7 @@ int do_munmap(void *addr, size_t len)
#ifdef POSTK_DEBUG_ARCH_DEP_27
#else
static int search_free_space(size_t len, intptr_t hint, int pgshift, intptr_t *addrp)
static int search_free_space(size_t len, int pgshift, intptr_t *addrp)
{
struct thread *thread = cpu_local_var(current);
struct vm_regions *region = &thread->vm->region;
@ -1298,17 +1338,17 @@ static int search_free_space(size_t len, intptr_t hint, int pgshift, intptr_t *a
struct vm_range *range;
size_t pgsize = (size_t)1 << pgshift;
dkprintf("search_free_space(%lx,%lx,%d,%p)\n", len, hint, pgshift, addrp);
dkprintf("%s: len: %lu, pgshift: %d\n",
__FUNCTION__, len, pgshift);
addr = hint;
addr = region->map_end;
for (;;) {
addr = (addr + pgsize - 1) & ~(pgsize - 1);
if ((region->user_end <= addr)
|| ((region->user_end - len) < addr)) {
ekprintf("search_free_space(%lx,%lx,%p):"
"no space. %lx %lx\n",
len, hint, addrp, addr,
region->user_end);
ekprintf("%s: error: addr 0x%lx is outside the user region\n",
__FUNCTION__, addr);
error = -ENOMEM;
goto out;
}
@ -1320,12 +1360,13 @@ static int search_free_space(size_t len, intptr_t hint, int pgshift, intptr_t *a
addr = range->end;
}
region->map_end = addr + len;
error = 0;
*addrp = addr;
out:
dkprintf("search_free_space(%lx,%lx,%d,%p): %d %lx\n",
len, hint, pgshift, addrp, error, addr);
dkprintf("%s: len: %lu, pgshift: %d, addr: 0x%lx\n",
__FUNCTION__, len, pgshift, addr);
return error;
}
#endif
@ -1420,20 +1461,18 @@ do_mmap(const intptr_t addr0, const size_t len0, const int prot,
}
}
else {
/* choose mapping address */
/* Obtain mapping address */
#ifdef POSTK_DEBUG_ARCH_DEP_27
error = search_free_space(cpu_local_var(current), len,
region->map_end, PAGE_SHIFT + p2align, &addr);
#else
error = search_free_space(len, region->map_end,
PAGE_SHIFT + p2align, &addr);
error = search_free_space(len, PAGE_SHIFT + p2align, &addr);
#endif /* POSTK_DEBUG_ARCH_DEP_27 */
if (error) {
ekprintf("do_mmap:search_free_space(%lx,%lx,%d) failed. %d\n",
len, region->map_end, p2align, error);
goto out;
}
region->map_end = addr + len;
}
/* do the map */
@ -1537,6 +1576,24 @@ do_mmap(const intptr_t addr0, const size_t len0, const int prot,
vrflags |= VR_AP_USER;
}
#if 1
if (len < (unsigned long)4*1024*1024*1024) {
phys = NOPHYS;
vrflags |= VR_PREALLOC;
}
else {
kprintf("%s: big ANON mapping!!: %lu\n", __FUNCTION__, len);
/* Give demand paging a chance */
vrflags |= VR_DEMAND_PAGING;
populated_mapping = 0;
error = zeroobj_create(&memobj);
if (error) {
ekprintf("%s: zeroobj_create failed, error: %d\n",
__FUNCTION__, error);
goto out;
}
}
#else
p = ihk_mc_alloc_aligned_pages_user(npages, p2align,
IHK_MC_AP_NOWAIT | ap_flag, addr0);
if (p == NULL) {
@ -1568,6 +1625,7 @@ do_mmap(const intptr_t addr0, const size_t len0, const int prot,
__FUNCTION__, addr, len, npages, p2align);
phys = virt_to_phys(p);
}
#endif
}
else if (flags & MAP_SHARED) {
dkprintf("%s: MAP_SHARED,flags=%x,len=%ld\n", __FUNCTION__, flags, len);
@ -1701,10 +1759,10 @@ out:
if (memobj) {
memobj_release(memobj);
}
dkprintf("%s: 0x%lx:%8lu, (req: 0x%lx:%lu), prot: %x, flags: %x, "
dkprintf("%s: 0x%lx:%8lu-0x%lx, (req: 0x%lx:%lu), prot: %x, flags: %x, "
"fd: %d, off: %lu, error: %ld, addr: 0x%lx\n",
__FUNCTION__,
addr, len, addr0, len0, prot, flags,
addr, len, addr+len, addr0, len0, prot, flags,
fd, off0, error, addr);
return (!error)? addr: error;
@ -3078,6 +3136,22 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
return 0;
}
SYSCALL_DECLARE(writev)
{
struct process *proc = cpu_local_var(current)->proc;
int fd = ihk_mc_syscall_arg0(ctx);
struct iovec *iovec = (struct iovec *)ihk_mc_syscall_arg1(ctx);
int iovcnt = ihk_mc_syscall_arg2(ctx);
void *private_data = (fd < 0 || fd >= MAX_FD_PRIV) ? NULL : proc->fd_priv_table[fd];
if (private_data) {
return hfi1_aio_write(private_data, iovec, iovcnt);
}
else {
return syscall_generic_forwarding(__NR_writev, ctx);
}
}
SYSCALL_DECLARE(read)
{
int fd = ihk_mc_syscall_arg0(ctx);
@ -3111,6 +3185,9 @@ SYSCALL_DECLARE(ioctl)
struct process *proc = thread->proc;
struct mckfd *fdp;
long irqstate;
void *private_data = (fd < 0 || fd >= MAX_FD_PRIV) ? NULL : proc->fd_priv_table[fd];
unsigned long t_s = rdtsc();
int sub_rc = 0;
irqstate = ihk_mc_spinlock_lock(&proc->mckfd_lock);
for(fdp = proc->mckfd; fdp; fdp = fdp->next)
@ -3118,13 +3195,44 @@ SYSCALL_DECLARE(ioctl)
break;
ihk_mc_spinlock_unlock(&proc->mckfd_lock, irqstate);
if(fdp && fdp->ioctl_cb){
//kprintf("ioctl: found system fd %d\n", fd);
if (private_data) {
extern long hfi1_file_ioctl(void *private_data,
unsigned int cmd,
unsigned long arg,
unsigned long t_s);
rc = hfi1_file_ioctl(private_data,
ihk_mc_syscall_arg1(ctx),
ihk_mc_syscall_arg2(ctx),
t_s);
/* Continue forwarding iff hfi1 didn't handle it */
// TODO: improve heuristics?
if (rc != -ENOTSUPP && rc != -ENODEV)
return rc;
if (rc == -ENODEV) {
sub_rc = rc;
}
}
if (fdp && fdp->ioctl_cb) {
rc = fdp->ioctl_cb(fdp, ctx);
}
else{
else {
rc = syscall_generic_forwarding(__NR_ioctl, ctx);
}
if (private_data && sub_rc == -ENODEV) {
extern int hfi1_map_device_addresses(void *fd);
if (hfi1_map_device_addresses(private_data) < 0) {
kprintf("%s: Could not map hfi1 device addresses\n",
__FUNCTION__);
return -EINVAL;
}
}
return rc;
}
@ -3145,6 +3253,11 @@ SYSCALL_DECLARE(open)
return -EFAULT;
}
dkprintf("open(): pathname=%s\n", xpmem_wk);
if (!strcmp(xpmem_wk, "/proc/sys/vm/overcommit_memory")) {
return -ENOENT;
}
rc = strcmp(xpmem_wk, XPMEM_DEV_PATH);
#ifdef POSTK_DEBUG_ARCH_DEP_62 /* Absorb the difference between open and openat args. */
if (!rc) {
@ -3231,6 +3344,11 @@ SYSCALL_DECLARE(close)
ihk_mc_spinlock_unlock(&proc->mckfd_lock, irqstate);
rc = syscall_generic_forwarding(__NR_close, ctx);
}
if (fd >= 0 && fd < MAX_FD_PRIV) {
thread->proc->fd_priv_table[fd] = NULL;
}
return rc;
}
@ -4908,7 +5026,6 @@ SYSCALL_DECLARE(shmat)
struct process_vm *vm = thread->vm;
size_t len;
int error;
struct vm_regions *region = &vm->region;
intptr_t addr;
int prot;
int vrflags;
@ -4977,7 +5094,7 @@ SYSCALL_DECLARE(shmat)
error = search_free_space(cpu_local_var(current), len,
region->map_end, obj->pgshift, &addr);
#else
error = search_free_space(len, region->map_end, obj->pgshift, &addr);
error = search_free_space(len, obj->pgshift, &addr);
#endif /* POSTK_DEBUG_ARCH_DEP_27 */
if (error) {
ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
@ -4985,7 +5102,6 @@ SYSCALL_DECLARE(shmat)
dkprintf("shmat(%#x,%p,%#x):search_free_space failed. %d\n", shmid, shmaddr, shmflg, error);
return error;
}
region->map_end = addr + len;
}
vrflags = VR_NONE;
@ -5438,6 +5554,7 @@ do_exit(int code)
int sig = code & 255;
dkprintf("sys_exit,pid=%d\n", proc->pid);
dkprintf("%s: PID: %d, TID: %d\n", __FUNCTION__, proc->pid, thread->tid);
mcs_rwlock_reader_lock(&proc->threads_lock, &lock);
nproc = 0;
@ -7721,6 +7838,10 @@ SYSCALL_DECLARE(mremap)
uintptr_t lckstart = -1;
uintptr_t lckend = -1;
/* Not for lammps for now.. */
if (!strcmp("./lammps", thread->proc->saved_cmdline))
return -ENOSYS;
dkprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx)\n",
oldaddr, oldsize0, newsize0, flags, newaddr);
ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock);
@ -7816,8 +7937,8 @@ SYSCALL_DECLARE(mremap)
vm->region.map_end,
range->pgshift, (intptr_t *)&newstart);
#else
error = search_free_space(newsize, vm->region.map_end,
range->pgshift, (intptr_t *)&newstart);
error = search_free_space(newsize, range->pgshift,
(intptr_t *)&newstart);
#endif /* POSTK_DEBUG_ARCH_DEP_27 */
if (error) {
ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):"
@ -7848,8 +7969,10 @@ SYSCALL_DECLARE(mremap)
if (range->memobj) {
memobj_ref(range->memobj);
}
/* Drop VR_PREALLOC to create vm_range without physical pages */
error = add_process_memory_range(thread->vm, newstart, newend, -1,
range->flag, range->memobj,
range->flag & ~VR_PREALLOC, range->memobj,
range->objoff + (oldstart - range->start),
range->pgshift, NULL);
if (error) {
@ -9443,6 +9566,10 @@ long syscall(int num, ihk_mc_user_context_t *ctx)
}
#endif // PROFILE_ENABLE
if (thread->proc->nohost) { // mcexec termination was detected
terminate(0, SIGKILL);
}
#if defined(POSTK_DEBUG_TEMP_FIX_60) && defined(POSTK_DEBUG_TEMP_FIX_56)
check_need_resched();
#elif defined(POSTK_DEBUG_TEMP_FIX_60) /* sched_yield called check_signal fix. */
@ -9470,9 +9597,6 @@ long syscall(int num, ihk_mc_user_context_t *ctx)
#endif // DISABLE_SCHED_YIELD
set_cputime(0);
if (thread->proc->nohost) { // mcexec termination was detected
terminate(0, SIGKILL);
}
//kprintf("syscall=%d returns %lx(%ld)\n", num, l, l);
return l;

kernel/user_exp_rcv.c Normal file (778 lines)

@ -0,0 +1,778 @@
/*
* Copyright(c) 2015, 2016 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* BSD LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* - Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <hfi1/ihk_hfi1_common.h>
#include <hfi1/common.h>
#include <hfi1/hfi.h>
#include <hfi1/chip.h>
#include <hfi1/user_exp_rcv.h>
#include <hfi1/user_sdma.h> // for hfi1_map_device_addresses
//#define DEBUG_PRINT_USER_EXP_RCV
#ifdef DEBUG_PRINT_USER_EXP_RCV
#define dkprintf(...) kprintf(__VA_ARGS__)
#else
#define dkprintf(...) do { if(0) kprintf(__VA_ARGS__); } while (0)
#endif
static int program_rcvarray(struct hfi1_filedata *, unsigned long, uintptr_t,
size_t, u32 *);
static int set_rcvarray_entry(struct hfi1_filedata *, unsigned long, uintptr_t,
u32, struct tid_group *, int, u32);
static int unprogram_rcvarray(struct hfi1_filedata *, u32, struct tid_group **);
static void clear_tid_node(struct hfi1_filedata *, struct tid_rb_node *);
static int tid_rb_invalidate(struct hfi1_filedata *fdata,
struct tid_rb_node *node);
static int hfi1_rb_tree_insert(struct rb_root *root,
struct tid_rb_node *new_node);
static void __hfi1_rb_tree_remove(struct tid_rb_node *tid_node);
static struct tid_rb_node *__hfi1_search_rb_overlapping_node(
struct rb_root *root,
unsigned long start,
unsigned long end);
/*
* RcvArray entry allocation for Expected Receives is done by the
* following algorithm:
*/
int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, struct hfi1_tid_info *tinfo)
{
int ret = -EFAULT;
struct hfi1_ctxtdata *uctxt = fd->uctxt;
uintptr_t vaddr, vaddr_end, base_vaddr = 0;
u32 *tidlist;
u16 tididx = 0;
struct process_vm *vm = cpu_local_var(current)->vm;
size_t base_pgsize, len = 0;
pte_t *ptep;
u64 phys;
if (!tinfo->length)
return -EINVAL;
if (tinfo->length / PAGE_SIZE > uctxt->expected_count) {
kprintf("Expected buffer too big\n");
return -EINVAL;
}
/* TODO: sizeof(*tidlist) * uctxt->expected_count); */
tidlist = kmalloc_cache_alloc(&cpu_local_var(tidlist_cache),
sizeof(*tidlist) * 2048);
if (!tidlist)
return -ENOMEM;
#if 0
/* Verify that access is OK for the user buffer */
if (access_ok(vm, VERIFY_WRITE, tinfo->vaddr, tinfo->length)) {
kprintf("%s: access_ok() failed for 0x%lx:%lu\n",
__FUNCTION__, tinfo->vaddr, tinfo->length);
return -EFAULT;
}
#endif
vaddr_end = tinfo->vaddr + tinfo->length;
dkprintf("%s: vaddr: 0x%llx, length: %zu (end: 0x%lx)\n",
__FUNCTION__, tinfo->vaddr, tinfo->length,
tinfo->vaddr + tinfo->length);
vaddr = tinfo->vaddr;
ptep = ihk_mc_pt_lookup_fault_pte(vm,
(void*)vaddr, 0,
(void**)&base_vaddr,
&base_pgsize, 0);
if (unlikely(!ptep || !pte_is_present(ptep))) {
kprintf("%s: ERROR: no valid PTE for 0x%lx\n",
__FUNCTION__, vaddr);
return -EFAULT;
}
while (vaddr < vaddr_end) {
phys = pte_get_phys(ptep) + (vaddr - base_vaddr);
len = (base_vaddr + base_pgsize - vaddr);
ret = 0;
/* Are we right at a page border? */
if (len == 0) {
ptep = ihk_mc_pt_lookup_fault_pte(vm,
(void*)vaddr, 0,
(void**)&base_vaddr,
&base_pgsize, 0);
if (unlikely(!ptep || !pte_is_present(ptep))) {
kprintf("%s: ERROR: no valid PTE for 0x%lx\n",
__FUNCTION__, vaddr);
return -EFAULT;
}
phys = pte_get_phys(ptep) + (vaddr - base_vaddr);
len = (base_vaddr + base_pgsize - vaddr);
}
/* Collect max physically contiguous chunk */
while (len < MAX_EXPECTED_BUFFER &&
vaddr + len < vaddr_end) {
uintptr_t __base_vaddr;
size_t __base_pgsize;
pte_t *__ptep;
int contiguous = 0;
/* Look up next page */
__ptep = ihk_mc_pt_lookup_fault_pte(vm,
(void*)vaddr + len, 0,
(void**)&__base_vaddr,
&__base_pgsize, 0);
if (unlikely(!__ptep || !pte_is_present(__ptep))) {
kprintf("%s: ERRROR: no valid PTE for 0x%lx\n",
__FUNCTION__, vaddr + len);
ret = -EFAULT;
break;
}
/* Contiguous? */
if (pte_get_phys(__ptep) == pte_get_phys(ptep) + base_pgsize) {
len += __base_pgsize;
contiguous = 1;
}
base_pgsize = __base_pgsize;
base_vaddr = __base_vaddr;
ptep = __ptep;
if (!contiguous)
break;
}
if (ret == -EFAULT)
break;
if (len > vaddr_end - vaddr) {
len = vaddr_end - vaddr;
}
if (len > MAX_EXPECTED_BUFFER) {
len = MAX_EXPECTED_BUFFER;
}
ret = program_rcvarray(fd, vaddr, phys, len, tidlist + tididx);
if (ret <= 0) {
kprintf("%s: failed to program RcvArray entries for len: %lu"
", vaddr: 0x%lx, vaddr_end: 0x%lx, ret: %d\n",
__FUNCTION__, len, vaddr, vaddr_end, ret);
panic("program_rcvarray() failed");
ret = -EFAULT;
}
dkprintf("%s: vaddr: 0x%lx -> phys: 0x%llx:%lu programmed\n",
__FUNCTION__, vaddr, phys, len);
tididx += ret;
vaddr += len;
}
if (ret > 0) {
linux_spin_lock(&fd->tid_lock);
fd->tid_used += tididx;
linux_spin_unlock(&fd->tid_lock);
tinfo->tidcnt = tididx;
if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist,
tidlist, sizeof(*tidlist)*tididx)) {
/*
* On failure to copy to the user level, we need to undo
* everything done so far so we don't leak resources.
*/
tinfo->tidlist = (unsigned long)&tidlist;
hfi1_user_exp_rcv_clear(fd, tinfo);
tinfo->tidlist = 0;
ret = -EFAULT;
}
dkprintf("%s: range: 0x%llx:%lu -> %d TIDs programmed\n",
__FUNCTION__, tinfo->vaddr, tinfo->length, tinfo->tidcnt);
}
kmalloc_cache_free(tidlist);
return ret > 0 ? 0 : ret;
}
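/*
 * Illustrative, self-contained sketch (compile separately; not driver code):
 * the chunk-collection idea in hfi1_user_exp_rcv_setup() above.  Walking a
 * virtually contiguous buffer, pages whose physical addresses are also
 * contiguous are merged into a single chunk so that each chunk can later be
 * programmed as one expected-receive buffer.  The physical-address array is
 * a made-up stand-in for the PTE lookups.
 */
#include <stdio.h>

#define DEMO_PAGE_SIZE 4096UL

int main(void)
{
	/* assumed physical address of each virtual page */
	unsigned long phys[] = { 0x10000, 0x11000, 0x12000, 0x40000, 0x41000 };
	unsigned npages = sizeof(phys) / sizeof(phys[0]);
	unsigned i = 0;

	while (i < npages) {
		unsigned long start = phys[i];
		unsigned long len = DEMO_PAGE_SIZE;

		while (i + 1 < npages &&
		       phys[i + 1] == phys[i] + DEMO_PAGE_SIZE) {
			len += DEMO_PAGE_SIZE;	/* physically contiguous */
			i++;
		}
		printf("chunk: phys 0x%lx, len %lu\n", start, len);
		i++;
	}
	return 0;
}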
int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd, struct hfi1_tid_info *tinfo)
{
int ret = 0;
u32 *tidinfo;
unsigned tididx;
tidinfo = kcalloc(tinfo->tidcnt, sizeof(*tidinfo), GFP_KERNEL);
if (!tidinfo)
return -ENOMEM;
if (copy_from_user(tidinfo, (void __user *)(unsigned long)
tinfo->tidlist, sizeof(tidinfo[0]) *
tinfo->tidcnt)) {
ret = -EFAULT;
goto done;
}
/* Technically this should never be needed (because it was mapped previously
* on update), but the call is a no-op if the addresses have been set
* previously.
if (hfi1_map_device_addresses(fd) < 0) {
kprintf("%s: Could not map hfi1 device addresses\n",
__FUNCTION__);
return -EINVAL;
}
*/
for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
ret = unprogram_rcvarray(fd, tidinfo[tididx], NULL);
if (ret) {
kprintf("Failed to unprogram rcv array %d\n",
ret);
break;
}
}
dkprintf("%s: 0x%llx:%lu -> %d TIDs unprogrammed\n",
__FUNCTION__, tinfo->vaddr, tinfo->length, tinfo->tidcnt);
linux_spin_lock(&fd->tid_lock);
fd->tid_used -= tididx;
linux_spin_unlock(&fd->tid_lock);
tinfo->tidcnt = tididx;
done:
kfree(tidinfo);
return ret;
}
/**
* program_rcvarray() - program an RcvArray group with receive buffers
*/
static int program_rcvarray(struct hfi1_filedata *fd,
unsigned long vaddr,
uintptr_t phys,
size_t len, u32 *ptid)
{
struct hfi1_ctxtdata *uctxt = fd->uctxt;
struct hfi1_devdata *dd = uctxt->dd;
u16 idx = 0;
u32 tidinfo = 0, rcventry;
int ret = -ENOMEM, count = 0;
struct tid_group *grp = NULL;
/* lock is taken at loop edges */
linux_spin_lock(&fd->tid_lock);
while (len > 0) {
size_t tid_len;
size_t tid_npages;
if (!grp) {
if (!uctxt->tid_used_list.count) {
if (!uctxt->tid_group_list.count) {
linux_spin_unlock(&fd->tid_lock);
/* return what we have so far */
kprintf("%s: ERROR: no grp?\n", __FUNCTION__);
return count ? count : -ENOMEM;
}
grp = tid_group_pop(&uctxt->tid_group_list);
} else {
grp = tid_group_pop(&uctxt->tid_used_list);
}
}
/* Find the first unused entry in the group */
for (; idx < grp->size; idx++) {
if (!(grp->map & (1 << idx))) {
break;
}
}
linux_spin_unlock(&fd->tid_lock);
tid_len = (len > MAX_EXPECTED_BUFFER) ? MAX_EXPECTED_BUFFER :
(1 << (fls(len) - 1));
tid_npages = (tid_len > PAGE_SIZE) ? tid_len >> PAGE_SHIFT : 1;
rcventry = grp->base + idx;
rcv_array_wc_fill(dd, rcventry);
tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
EXP_TID_SET(LEN, tid_npages);
ret = set_rcvarray_entry(fd, vaddr, phys, rcventry,
grp, tid_npages, tidinfo);
if (ret) {
kprintf("%s: set_rcvarray_entry() failed: %d\n",
__FUNCTION__, ret);
return ret;
}
ptid[count++] = tidinfo;
len -= tid_len;
vaddr += tid_len;
phys += tid_len;
linux_spin_lock(&fd->tid_lock);
grp->used++;
grp->map |= 1 << idx++;
/* optimization: keep same group if possible. */
if (grp->used < grp->size && len > 0)
continue;
if (grp->used == grp->size)
tid_group_add_tail(grp, &uctxt->tid_full_list);
else
tid_group_add_tail(grp, &uctxt->tid_used_list);
idx = 0;
grp = NULL;
}
linux_spin_unlock(&fd->tid_lock);
return count;
}
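/*
 * Illustrative, self-contained sketch (compile separately; not driver code):
 * the TID buffer sizing rule used by program_rcvarray() above.  Each
 * RcvArray entry covers either the maximum expected-buffer size or the
 * largest power of two not exceeding the remaining length.  The
 * MAX_EXPECTED_BUFFER and page size values below are assumptions for the
 * demo, and demo_fls() mimics the kernel's fls().
 */
#include <stdio.h>

#define DEMO_MAX_EXPECTED_BUFFER (1024UL * 1024UL)	/* assumed 1 MiB cap */
#define DEMO_PAGE_SIZE 4096UL

static unsigned demo_fls(unsigned long x)	/* 1-based index of highest set bit */
{
	unsigned n = 0;

	while (x) {
		n++;
		x >>= 1;
	}
	return n;
}

int main(void)
{
	unsigned long len = 3 * 1024 * 1024 + 12288;	/* 3 MiB + 3 pages */

	while (len > 0) {
		unsigned long tid_len = (len > DEMO_MAX_EXPECTED_BUFFER) ?
			DEMO_MAX_EXPECTED_BUFFER : (1UL << (demo_fls(len) - 1));
		unsigned long npages = (tid_len > DEMO_PAGE_SIZE) ?
			tid_len / DEMO_PAGE_SIZE : 1;

		printf("remaining=%8lu -> tid_len=%8lu (%lu pages)\n",
		       len, tid_len, npages);
		len -= tid_len;
	}
	return 0;
}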
static int set_rcvarray_entry(struct hfi1_filedata *fd,
unsigned long vaddr, uintptr_t phys,
u32 rcventry, struct tid_group *grp,
int npages, u32 tidinfo)
{
struct hfi1_ctxtdata *uctxt = fd->uctxt;
struct hfi1_devdata *dd = uctxt->dd;
struct tid_rb_node *node;
/*
* Allocate the node first so we can handle a potential
* failure before we've programmed anything.
*/
node = kmalloc_cache_alloc(&cpu_local_var(tid_node_cache),
sizeof(*node));
if (!node) {
kprintf("%s: ERROR: allocating node\n", __FUNCTION__);
return -ENOMEM;
}
dkprintf("Registering rcventry %d, phys 0x%p, len %u\n", rcventry,
phys, npages << PAGE_SHIFT);
node->phys = phys;
node->len = npages << PAGE_SHIFT;
node->rcventry = rcventry;
node->grp = grp;
node->freed = false;
node->fd = fd;
node->start = vaddr;
node->end = vaddr + node->len;
node->range = NULL;
// TODO: check that node->rcventry - uctxt->expected_base is within
// [0, uctxt->expected_count)
fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;
hfi1_rb_tree_insert(
&cpu_local_var(current)->proc->hfi1_reg_tree,
node);
dkprintf("%s: node (0x%lx:%lu) programmed, tidinfo: %d\n",
__FUNCTION__, vaddr, node->len, tidinfo);
hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, fls(npages));
#if 0
trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
node->mmu.addr, node->phys, phys);
#endif
return 0;
}
int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd, struct hfi1_tid_info *tinfo)
{
struct hfi1_ctxtdata *uctxt = fd->uctxt;
unsigned long *ev = uctxt->dd->events +
(((uctxt->ctxt - uctxt->dd->first_dyn_alloc_ctxt) *
HFI1_MAX_SHARED_CTXTS) + fd->subctxt);
int ret = 0;
if (!fd->invalid_tids)
return -EINVAL;
/*
* copy_to_user() can sleep, which will leave the invalid_lock
* locked and cause the MMU notifier to be blocked on the lock
* for a long time.
* Copy the data to a local buffer so we can release the lock.
*
* McKernel: copy to userspace directly.
*/
linux_spin_lock(&fd->invalid_lock);
if (fd->invalid_tid_idx) {
dkprintf("%s: fd->invalid_tid_idx: %d to be notified\n",
__FUNCTION__, fd->invalid_tid_idx);
if (copy_to_user((void __user *)tinfo->tidlist,
fd->invalid_tids,
sizeof(*(fd->invalid_tids)) *
fd->invalid_tid_idx)) {
ret = -EFAULT;
}
else {
tinfo->tidcnt = fd->invalid_tid_idx;
memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
fd->invalid_tid_idx);
/*
* Reset the user flag while still holding the lock.
* Otherwise, PSM can miss events.
*/
clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
dkprintf("%s: fd->invalid_tid_idx: %d notified\n",
__FUNCTION__, fd->invalid_tid_idx);
fd->invalid_tid_idx = 0;
}
}
else {
tinfo->tidcnt = 0;
}
linux_spin_unlock(&fd->invalid_lock);
return ret;
}
static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
struct tid_group **grp)
{
struct hfi1_ctxtdata *uctxt = fd->uctxt;
struct tid_rb_node *node;
u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;
if (tididx >= uctxt->expected_count) {
kprintf("Invalid RcvArray entry (%u) index for ctxt %u\n",
tididx, uctxt->ctxt);
return -EINVAL;
}
if (tidctrl == 0x3) {
kprintf("tidctrl = 3 for rcventry %d\n",
tididx + 2 + uctxt->expected_base);
return -EINVAL;
}
rcventry = tididx + (tidctrl - 1);
node = fd->entry_to_rb[rcventry];
dkprintf("%s: node (0x%lx:%lu), tidinfo: %d\n",
__FUNCTION__, node->start, node->end - node->start, tidinfo);
if (!node || node->rcventry != (uctxt->expected_base + rcventry)) {
kprintf("bad entry %d\n", rcventry);
return -EBADF;
}
if (node->range) {
struct process_vm *vm = cpu_local_var(current)->vm;
struct deferred_unmap_range *range = node->range;
//ihk_mc_spinlock_lock_noirq(&vm->vm_deferred_unmap_lock);
if (--range->refcnt == 0) {
list_del(&range->list);
}
else {
range = NULL;
}
//ihk_mc_spinlock_unlock_noirq(&vm->vm_deferred_unmap_lock);
if (range) {
dkprintf("%s: executing deferred unmap: 0x%lx:%lu-0x%lx\n",
__FUNCTION__, range->addr, range->len,
range->addr + range->len);
ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock);
do_munmap(range->addr, range->len);
ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
kfree(range);
}
}
if (grp)
*grp = node->grp;
dkprintf("Clearing rcventry %d, phys 0x%p, len %u\n", node->rcventry,
node->phys, node->len);
fd->entry_to_rb[rcventry] = NULL;
clear_tid_node(fd, node);
return 0;
}
static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
{
struct hfi1_ctxtdata *uctxt = fd->uctxt;
struct hfi1_devdata *dd = uctxt->dd;
hfi1_put_tid(dd, node->rcventry, PT_INVALID, 0, 0);
/*
* Make sure device has seen the write before we unpin the
* pages.
*/
flush_wc();
barrier();
__hfi1_rb_tree_remove(node);
linux_spin_lock(&fd->tid_lock);
node->grp->used--;
node->grp->map &= ~(1 << (node->rcventry - node->grp->base));
if (node->grp->used == node->grp->size - 1)
tid_group_move(node->grp, &uctxt->tid_full_list,
&uctxt->tid_used_list);
else if (!node->grp->used)
tid_group_move(node->grp, &uctxt->tid_used_list,
&uctxt->tid_group_list);
linux_spin_unlock(&fd->tid_lock);
kmalloc_cache_free(node);
}
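/*
 * Illustrative, self-contained sketch (compile separately; not driver code):
 * the three-list bookkeeping maintained by program_rcvarray() and
 * clear_tid_node() above.  A RcvArray group lives on tid_group_list while
 * empty, on tid_used_list while partially occupied, and on tid_full_list
 * once every entry is taken.  The lists are reduced to a single enum per
 * group here; all names are demo stand-ins.
 */
#include <stdio.h>

enum demo_list { DEMO_GROUP_LIST, DEMO_USED_LIST, DEMO_FULL_LIST };

struct demo_group {
	unsigned used, size;
	enum demo_list list;
};

static void demo_take_entry(struct demo_group *g)
{
	g->used++;
	g->list = (g->used == g->size) ? DEMO_FULL_LIST : DEMO_USED_LIST;
}

static void demo_release_entry(struct demo_group *g)
{
	g->used--;
	g->list = g->used ? DEMO_USED_LIST : DEMO_GROUP_LIST;
}

int main(void)
{
	struct demo_group g = { 0, 2, DEMO_GROUP_LIST };

	demo_take_entry(&g);	/* partially used  -> used list */
	demo_take_entry(&g);	/* completely used -> full list */
	demo_release_entry(&g);	/* one freed       -> used list */
	demo_release_entry(&g);	/* empty again     -> group (free) list */
	printf("group ends on list %d with %u/%u entries used\n",
	       (int)g.list, g.used, g.size);
	return 0;
}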
int hfi1_user_exp_rcv_overlapping(unsigned long start, unsigned long end)
{
int ret = 0;
struct process_vm *vm = cpu_local_var(current)->vm;
struct tid_rb_node *node;
struct deferred_unmap_range *range;
dkprintf("%s: 0x%lx:%lu\n", __FUNCTION__, start, end - start);
//ihk_mc_spinlock_lock_noirq(&vm->vm_deferred_unmap_lock);
node = __hfi1_search_rb_overlapping_node(
&cpu_local_var(current)->proc->hfi1_reg_tree,
start, end);
if (!node || node->freed) {
return 0;
}
range = kmalloc(sizeof(*range), IHK_MC_AP_NOWAIT);
if (!range) {
kprintf("%s: ERROR: allocating memory\n", __FUNCTION__);
return -ENOMEM;
}
init_deferred_unmap_range(range, vm, (void *)start, end - start);
while (node) {
struct hfi1_filedata *fd = node->fd;
struct hfi1_ctxtdata *uctxt = fd ? fd->uctxt : NULL;
/* Sanity check */
if (!uctxt ||
fd->entry_to_rb[node->rcventry - uctxt->expected_base] != node) {
kprintf("%s: ERROR: inconsistent TID node\n", __FUNCTION__);
ret = -EINVAL;
break;
}
dkprintf("%s: node (0x%lx:%lu) deferred and invalidated"
" in munmap for 0x%lx:%lu-0x%lx\n",
__FUNCTION__, node->start, node->len, start, end - start, end);
tid_rb_invalidate(fd, node);
if (node->range) {
kprintf("%s: WARNING: node->range is already set for 0x%lx:%lu\n",
__FUNCTION__, start, end);
}
else {
node->range = range;
}
++range->refcnt;
node = __hfi1_search_rb_overlapping_node(
&cpu_local_var(current)->proc->hfi1_reg_tree,
start, end);
}
if (range->refcnt == 0) {
kfree(range);
}
else {
list_add_tail(&range->list, &vm->vm_deferred_unmap_range_list);
ret = range->refcnt;
}
//ihk_mc_spinlock_unlock_noirq(&vm->vm_deferred_unmap_lock);
return ret;
}
static int hfi1_rb_tree_insert(struct rb_root *root,
struct tid_rb_node *new_node)
{
struct rb_node **new = &(root->rb_node), *parent = NULL;
struct tid_rb_node *tid_node;
while (*new) {
tid_node = rb_entry(*new, struct tid_rb_node, rb_node);
parent = *new;
if (new_node->end <= tid_node->start) {
new = &((*new)->rb_left);
}
else if (new_node->start >= tid_node->end) {
new = &((*new)->rb_right);
}
else {
kprintf("%s: ERROR: overlapping TID nodes, "
"node (0x%lx:%lu) <=> new (0x%lx:%lu)\n",
__FUNCTION__,
tid_node->start, tid_node->len,
new_node->start, new_node->len);
return -EINVAL;
}
}
rb_link_node(&new_node->rb_node, parent, new);
rb_insert_color(&new_node->rb_node, root);
new_node->rb_root = root;
return 0;
}
static void __hfi1_rb_tree_remove(struct tid_rb_node *tid_node)
{
if (!tid_node->rb_root) {
kprintf("%s: ERROR: node without rb_root??\n",
__FUNCTION__);
return;
}
rb_erase(&tid_node->rb_node, tid_node->rb_root);
tid_node->rb_root = NULL;
}
static struct tid_rb_node *__hfi1_search_rb_overlapping_node(
struct rb_root *root,
unsigned long start,
unsigned long end)
{
struct rb_node *node = root->rb_node;
struct tid_rb_node *tid_node = NULL;
while (node) {
tid_node = rb_entry(node, struct tid_rb_node, rb_node);
if (end <= tid_node->start) {
node = node->rb_left;
}
else if (start >= tid_node->end) {
node = node->rb_right;
}
else if (tid_node->freed) {
node = rb_next(node);
}
else {
break;
}
}
return node ? tid_node : NULL;
}
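/*
 * Illustrative, self-contained sketch (compile separately; not driver code):
 * the half-open interval overlap test that drives the red-black tree walk
 * in __hfi1_search_rb_overlapping_node() above.  Two ranges [s1, e1) and
 * [s2, e2) overlap exactly when neither lies entirely before the other.
 */
#include <stdio.h>

static int demo_overlaps(unsigned long s1, unsigned long e1,
			 unsigned long s2, unsigned long e2)
{
	return !(e1 <= s2 || s1 >= e2);
}

int main(void)
{
	printf("%d\n", demo_overlaps(0x1000, 0x2000, 0x1800, 0x3000)); /* 1 */
	printf("%d\n", demo_overlaps(0x1000, 0x2000, 0x2000, 0x3000)); /* 0: merely touching */
	return 0;
}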
/*
* Always return 0 from this function. A non-zero return indicates that the
* remove operation will be called and that memory should be unpinned.
* However, the driver cannot unpin out from under PSM. Instead, retain the
* memory (by returning 0) and inform PSM that the memory is going away. PSM
* will call back later when it has removed the memory from its list.
*
* XXX: in McKernel we attach tid nodes to memory ranges that are
* about to be unmapped. Once all of them have been cleared, the actual
* unmap is performed.
*/
static int tid_rb_invalidate(struct hfi1_filedata *fdata,
struct tid_rb_node *node)
{
struct hfi1_ctxtdata *uctxt = fdata->uctxt;
if (node->freed)
return 0;
node->freed = true;
__hfi1_rb_tree_remove(node);
hfi1_rb_tree_insert(
&cpu_local_var(current)->proc->hfi1_inv_tree,
node);
linux_spin_lock(&fdata->invalid_lock);
if (fdata->invalid_tid_idx < uctxt->expected_count) {
fdata->invalid_tids[fdata->invalid_tid_idx] =
rcventry2tidinfo(node->rcventry - uctxt->expected_base);
fdata->invalid_tids[fdata->invalid_tid_idx] |=
EXP_TID_SET(LEN, node->len >> PAGE_SHIFT);
if (!fdata->invalid_tid_idx) {
unsigned long *ev;
/*
* hfi1_set_uevent_bits() sets a user event flag
* for all processes. Because calling into the
* driver to process TID cache invalidations is
* expensive and TID cache invalidations are
* handled on a per-process basis, we can
* optimize this to set the flag only for the
* process in question.
*/
ev = uctxt->dd->events +
(((uctxt->ctxt - uctxt->dd->first_dyn_alloc_ctxt) *
HFI1_MAX_SHARED_CTXTS) + fdata->subctxt);
set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
}
fdata->invalid_tid_idx++;
}
linux_spin_unlock(&fdata->invalid_lock);
return 0;
}

kernel/user_sdma.c Normal file (1635 lines)

File diff suppressed because it is too large.


@ -19,7 +19,7 @@ void panic(const char *msg)
kprintf("%s\n", msg);
arch_print_stack();
//arch_print_stack();
while (1) {
cpu_halt();


@ -24,6 +24,7 @@ void cpu_halt(void);
void cpu_safe_halt(void);
void cpu_restore_interrupt(unsigned long);
void cpu_pause(void);
void cpu_relax(void);
#define barrier() arch_barrier()
@ -76,6 +77,7 @@ void ihk_mc_init_ap(void);
void ihk_mc_init_context(ihk_mc_kernel_context_t *new_ctx,
void *stack_pointer,
void (*next_function)(void));
void *ihk_mc_get_linux_kernel_pgt(void);
int ihk_mc_get_extra_reg_id(unsigned long hw_config, unsigned long hw_config_ext);
unsigned int ihk_mc_get_nr_extra_regs();


@ -176,7 +176,10 @@ int ihk_mc_pt_free_range(page_table_t pt, struct process_vm *vm,
int ihk_mc_pt_change_attr_range(page_table_t pt, void *start, void *end,
enum ihk_mc_pt_attribute clrattr,
enum ihk_mc_pt_attribute setattr);
pte_t *ihk_mc_pt_lookup_pte(page_table_t pt, void *virt, int pgshift, void **pgbasep, size_t *pgsizep, int *p2alignp);
pte_t *ihk_mc_pt_lookup_pte(page_table_t pt, void *virt, int pgshift,
void **pgbasep, size_t *pgsizep, int *p2alignp);
pte_t *ihk_mc_pt_lookup_fault_pte(struct process_vm *vm, void *virt,
int pgshift, void **basep, size_t *sizep, int *p2alignp);
int ihk_mc_pt_set_range(page_table_t pt, struct process_vm *vm, void *start,
void *end, uintptr_t phys, enum ihk_mc_pt_attribute attr,
int pgshift, struct vm_range *range);


@ -41,6 +41,12 @@ static inline void INIT_LIST_HEAD(struct list_head *list)
list->prev = list;
}
static inline void ZERO_LIST_HEAD(struct list_head *list)
{
list->next = 0;
list->prev = 0;
}
/*
* Insert a new entry between two known consecutive entries.
*


@ -46,9 +46,7 @@ struct perf_event_attr;
((nr) << _IOC_NRSHIFT) | \
((size) << _IOC_SIZESHIFT))
#ifndef __KERNEL__
#define _IOC_TYPECHECK(t) (sizeof(t))
#endif
/* used to create numbers */
#define _IO(type,nr) _IOC(_IOC_NONE,(type),(nr),0)


@ -16,19 +16,6 @@
#include <arch-memory.h>
#ifndef KERNEL_PHYS_OFFSET
#define KERNEL_PHYS_OFFSET 0
static unsigned long virt_to_phys(void *v)
{
return (unsigned long)v - KERNEL_PHYS_OFFSET;
}
static void *phys_to_virt(unsigned long p)
{
return (void *)(p + KERNEL_PHYS_OFFSET);
}
#endif
struct process_vm;
unsigned long virt_to_phys(void *v);


@ -0,0 +1,100 @@
brk01
clone01
clone03
clone04
clone06
clone07
close01
close02
dup01
dup02
dup03
dup04
dup05
dup06
dup07
fork01
fork02
fork03
fork04
fork07
fork08
fork09
fork10
fork11
mbind01
mem01
mem02
mem03
memcpy01
memfd_create02
memset01
mkdir01
mkdir08
mkdirat01
mknodat01
mmap001
mmap01
mmap02
mmap03
mmap04
mmap06
mmap07
mmap08
mmap09
mmap12
mmapstress02
mmapstress04
mmapstress05
mremap01
mremap05
open01
open03
open04
open06
open07
open09
open13
poll01
posix_fadvise01
read01
read02
read03
read04
sbrk01
sbrk02
sendfile02
sendfile03
sendfile04
sendfile05
sendfile06
sendfile07
sendfile08
sendfile09
semctl01
semctl03
semctl05
socket01
socket02
stream01
stream02
stream03
stream04
stream05
unlink05
unlink06
unlink07
unlink08
vfork01
vfork02
vma01
vmsplice01
vmsplice02
write01
write03
write04
write05
writetest
writev01
writev02
writev07

test/mem_dest_prev/README Normal file (25 lines)

@ -0,0 +1,25 @@
===================
Advance preparation
===================
1) Apply the test_memtest_destroy.patch patch
cd mckernel
patch -p0 < test_memtest_destroy.patch
make
make install
2) Build the test program invoked by mcexec
cd mckernel/test/mem_dest_prev/mcexec_test_proc/
make
3) Set LTP_DIR in the configuration file to the LTP installation path
vi config
ex) LTP_DIR=$HOME/test/mem_dest_prev/ltp/testcases/bin/
==========
How to run
==========
./go_test_McKernal.sh

test/mem_dest_prev/config Normal file (13 lines)

@ -0,0 +1,13 @@
MCMOD_DIR=$HOME/ppos
LTP_DIR=$HOME/test/mem_dest_prev/ltp/testcases/bin/
LTP_TESTCASE_FILE=LTP_testcase.txt
MCRBT_OPT_LTP="-m 3G@0,3G@1 -s"
USR_PROC="mcexec_test_proc/memtest_destroy"
OS_IDX=0
export MCMOD_DIR
export LTP_DIR
export LTP_TESTCASE_FILE
export MCRBT_OPT_LTP
export USR_PROC
export OS_IDX


@ -0,0 +1,101 @@
#!/bin/sh
# read config
source ./config
#logfile="./result/test_result.log"
# kill any leftover mcexec processes
./utils/kill_mcexec.sh &> /dev/null
for test_case in `ls -1 ./testcases/*.txt`
do
# read testcase param
source ${test_case}
case_name=`basename ${test_case} .txt`
echo "####################"
echo "Test No:${case_name}"
# Out-of-range address Test (before the fix)
echo ">>> Out-of-range address Test (before the fix) Start"
# stop mckernel
sudo ${MCMOD_DIR}/sbin/mcstop+release.sh
sleep 1
# boot mckernel
echo "${MCMOD_DIR}/sbin/mcreboot.sh ${MCRBT_OPT_BEFORE%,}"
sudo ${MCMOD_DIR}/sbin/mcreboot.sh ${MCRBT_OPT_BEFORE%,}
sleep 1
echo " ${MCMOD_DIR}/bin/mcexec ${USR_PROC}"
timeout -sKILL 5 ${MCMOD_DIR}/bin/mcexec ${USR_PROC}
STATUS=$?
echo "${MCMOD_DIR}/sbin/ihkosctl ${OS_IDX} kmsg"
sudo ${MCMOD_DIR}/sbin/ihkosctl ${OS_IDX} kmsg
if [ "$STATUS" -ne 21 ];
then
echo ">>> Out-of-range address Test End(Timeout!!!)"
else
echo ">>> Out-of-range address Test End"
fi
# Out-of-range address Test (after the fix)
echo ">>> Out-of-range address Test (after the fix) Start"
# stop mckernel
sudo ${MCMOD_DIR}/sbin/mcstop+release.sh
sleep 1
# boot mckernel
echo "${MCMOD_DIR}/sbin/mcreboot.sh ${MCRBT_OPT_AFTER%,}"
sudo ${MCMOD_DIR}/sbin/mcreboot.sh ${MCRBT_OPT_AFTER%,}
sleep 1
echo " ${MCMOD_DIR}/bin/mcexec ${USR_PROC}"
timeout -sKILL 5 ${MCMOD_DIR}/bin/mcexec ${USR_PROC}
STATUS=$?
echo "${MCMOD_DIR}/sbin/ihkosctl ${OS_IDX} kmsg"
sudo ${MCMOD_DIR}/sbin/ihkosctl ${OS_IDX} kmsg
if [ "$STATUS" -ne 21 ];
then
echo ">>> Out-of-range address Test End(Timeout!!!)"
else
echo ">>> Out-of-range address Test End"
fi
done
### LTP START ##################################################
# stop mckernel
sudo ${MCMOD_DIR}/sbin/mcstop+release.sh
sleep 1
# boot mckernel
echo "${MCMOD_DIR}/sbin/mcreboot.sh ${MCRBT_OPT_LTP%,}"
sudo ${MCMOD_DIR}/sbin/mcreboot.sh ${MCRBT_OPT_LTP%,}
sleep 1
if [ ! -e "/dev/mcos0" ]; then
echo "Error: failed to mcreboot"
exit 1
fi
TEST_NUM=`wc -l ${LTP_TESTCASE_FILE} | awk '{print $1}'`
echo ">>> LTP Test Start( $TEST_NUM counts )"
# exec mckernel test program
COUNT=0
while read line
do
((COUNT++))
echo "$COUNT:${MCMOD_DIR}/bin/mcexec ${LTP_DIR}$line"
# ${MCMOD_DIR}/bin/mcexec ${LTP_DIR}$line &>> ${logfile}
${MCMOD_DIR}/bin/mcexec ${LTP_DIR}$line
done < ${LTP_TESTCASE_FILE}
echo ">>> LTP Test End"
### LTP END ####################################################


@ -0,0 +1,7 @@
OBJS = memtest_destroy
all:$(OBJS)
clean:
rm $(OBJS)


@ -0,0 +1,13 @@
/* Invokes the McKernel-specific mem_destroy test syscall (950) added by
 * test_memtest_destroy.patch. */
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	int rst = 0;

	rst = syscall(950);
	printf("mem_destroy result:%d\n", rst);
	return 0;
}


@ -0,0 +1,50 @@
diff --git arch/x86/kernel/include/syscall_list.h arch/x86/kernel/include/syscall_list.h
index 42d1e2e..f5769b8 100644
--- arch/x86/kernel/include/syscall_list.h
+++ arch/x86/kernel/include/syscall_list.h
@@ -156,5 +156,7 @@ SYSCALL_HANDLED(__NR_profile, profile)
SYSCALL_HANDLED(730, util_migrate_inter_kernel)
SYSCALL_HANDLED(731, util_indicate_clone)
SYSCALL_HANDLED(732, get_system)
+/* McKernel Specific */
+SYSCALL_HANDLED(950, mem_destroy)
/**** End of File ****/
diff --git arch/x86/kernel/syscall.c arch/x86/kernel/syscall.c
index 2260b66..e96776a 100644
--- arch/x86/kernel/syscall.c
+++ arch/x86/kernel/syscall.c
@@ -1887,4 +1887,33 @@ save_uctx(void *uctx, struct x86_user_context *regs)
ctx->fregsize = 0;
}
+
+#define ADD_ADDR_VAL 0x400
+SYSCALL_DECLARE(mem_destroy)
+{
+ int rst = 0;
+ int mem_chunks_num, chunk_id, get_numa_id;
+ unsigned long get_start, get_end;
+ unsigned long *addr;
+
+ mem_chunks_num = ihk_mc_get_nr_memory_chunks();
+ kprintf("%s: memory chunk %d.\n", __FUNCTION__, mem_chunks_num);
+
+ for (chunk_id = 0; chunk_id < mem_chunks_num; chunk_id++) {
+ rst = ihk_mc_get_memory_chunk(chunk_id, &get_start, &get_end, &get_numa_id);
+ kprintf("%s: mem chunk[%d] numa ID(%d)\n"
+ ,__FUNCTION__ ,chunk_id ,get_numa_id);
+ kprintf(" phys(0x%lx - 0x%lx) virt(0x%lx - 0x%lx)\n"
+ ,get_start ,get_end ,phys_to_virt(get_start) ,phys_to_virt(get_end));
+ }
+
+ addr = phys_to_virt(get_end + ADD_ADDR_VAL);
+#if 1
+ *addr = 0x1;
+#endif
+ kprintf("%s: Address out of range 0x%lx(val:%d)\n",__FUNCTION__ ,addr ,*addr);
+
+ return rst;
+}
+
/*** End of File ***/


@ -0,0 +1,2 @@
MCRBT_OPT_BEFORE="-m `./utils/gen_mem_chunks.sh "0 1" 3G 1`"
MCRBT_OPT_AFTER="-m `./utils/gen_mem_chunks.sh "0 1" 3G 1` -s"


@ -0,0 +1,2 @@
MCRBT_OPT_BEFORE="-m `./utils/gen_mem_chunks.sh "0 1" 32M 2`"
MCRBT_OPT_AFTER="-m `./utils/gen_mem_chunks.sh "0 1" 32M 2` -s"


@ -0,0 +1,2 @@
MCRBT_OPT_BEFORE="-m `./utils/gen_mem_chunks.sh "0 1" 32M 4`"
MCRBT_OPT_AFTER="-m `./utils/gen_mem_chunks.sh "0 1" 32M 4` -s"


@ -0,0 +1,2 @@
MCRBT_OPT_BEFORE="-m `./utils/gen_mem_chunks.sh "0 1" 32M 8`"
MCRBT_OPT_AFTER="-m `./utils/gen_mem_chunks.sh "0 1" 32M 8` -s"


@ -0,0 +1,2 @@
MCRBT_OPT_BEFORE="-m `./utils/gen_mem_chunks.sh "0 1" 32M 16`"
MCRBT_OPT_AFTER="-m `./utils/gen_mem_chunks.sh "0 1" 32M 16` -s"


@ -0,0 +1,2 @@
MCRBT_OPT_BEFORE="-m `./utils/gen_mem_chunks.sh "0 1" 32M 32`"
MCRBT_OPT_AFTER="-m `./utils/gen_mem_chunks.sh "0 1" 32M 32` -s"


@ -0,0 +1,2 @@
MCRBT_OPT_BEFORE="-m `./utils/gen_mem_chunks.sh "0 1" 32M 48`"
MCRBT_OPT_AFTER="-m `./utils/gen_mem_chunks.sh "0 1" 32M 48` -s"


@ -0,0 +1,2 @@
MCRBT_OPT_BEFORE="-m `./utils/gen_mem_chunks.sh "0 1" 32M 64`"
MCRBT_OPT_AFTER="-m `./utils/gen_mem_chunks.sh "0 1" 32M 64` -s"


@ -0,0 +1,2 @@
MCRBT_OPT_BEFORE="-m `./utils/gen_mem_chunks.sh "0 1" 32M 96`"
MCRBT_OPT_AFTER="-m `./utils/gen_mem_chunks.sh "0 1" 32M 96` -s"


@ -0,0 +1,2 @@
MCRBT_OPT_BEFORE="-m `./utils/gen_mem_chunks.sh "0 1" 32M 128`"
MCRBT_OPT_AFTER="-m `./utils/gen_mem_chunks.sh "0 1" 32M 128` -s"


@ -0,0 +1,16 @@
#!/bin/sh
NUMAS=$1
MEM_SIZE=$2
REP=$3
CHUNKS=""
for numa in ${NUMAS}
do
for rep in `seq 1 ${REP}`
do
CHUNKS="${CHUNKS}${MEM_SIZE}@${numa},"
done
done
echo ${CHUNKS%,}


@ -0,0 +1,10 @@
#!/bin/sh
count=`pgrep -c -f 'mcexec '`
if [ ${count} -gt 0 ]
then
echo "kill process :" ${count}
pgrep -l -f 'mcexec '
pgrep -f 'mcexec ' | xargs kill -9
fi