From 56aaff1f8784a4ff34dbdfd12cb921643bb7a085 Mon Sep 17 00:00:00 2001 From: cdkersey Date: Tue, 9 Sep 2014 03:08:23 -0400 Subject: [PATCH] Fully-functioning spawn and join instructions. --- src/core.cpp | 7 +++++ src/include/core.h | 26 ++++++++++++++---- src/instruction.cpp | 37 +++++++++++++++++++++----- src/test/Makefile | 53 ++++++------------------------------- src/test/diverge.s | 64 +++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 131 insertions(+), 56 deletions(-) create mode 100644 src/test/diverge.s diff --git a/src/core.cpp b/src/core.cpp index a4fc57aa..5413d71a 100644 --- a/src/core.cpp +++ b/src/core.cpp @@ -46,6 +46,8 @@ Core::Core(const ArchDef &a, Decoder &d, MemoryUnit &mem, Word id) : for (Word i = 0; i < a.getNPRegs(); ++i) { pred[j].push_back(Reg<bool>(id, regNum++)); } + + tmask.push_back(true); } /* Set initial register contents. */ @@ -131,6 +133,11 @@ void Core::step() { D_RAW(" ("); for (unsigned i = 0; i < shadowPReg.size(); ++i) D_RAW(shadowPReg[i]); D_RAW(')' << endl); + + D(3, "Thread mask:"); + D_RAW(" "); + for (unsigned i = 0; i < tmask.size(); ++i) D_RAW(tmask[i] << ' '); + D_RAW(endl); } #endif diff --git a/src/include/core.h b/src/include/core.h index d447b7d3..adf57d46 100644 --- a/src/include/core.h +++ b/src/include/core.h @@ -27,7 +27,7 @@ namespace Harp { Reg &operator=(T r) { val = r; doWrite(); return *this; } - operator T() { doRead(); return val; } + operator T() const { doRead(); return val; } void trunc(Size s) { Word mask((~0ull >> (sizeof(Word)-s)*8)); @@ -40,16 +40,32 @@ namespace Harp { #ifdef EMU_INSTRUMENTATION /* Access size here is 8, representing the register size of 64-bit cores. 
*/ - void doWrite() { reg_doWrite(cpuId, regNum); } - void doRead() { reg_doRead(cpuId, regNum); } + void doWrite() const { reg_doWrite(cpuId, regNum); } + void doRead() const { reg_doRead(cpuId, regNum); } #else - void doWrite() {} - void doRead() {} + void doWrite() const {} + void doRead() const {} #endif }; // Entry in the IPDOM Stack struct DomStackEntry { + DomStackEntry( + unsigned p, const std::vector<std::vector<Reg<bool> > >& m, Word pc + ): pc(pc), fallThrough(false) + { + std::cout << "New DomStackEntry:"; + for (unsigned i = 0; i < m.size(); ++i) { + tmask.push_back(!bool(m[i][p])); + std::cout << ' ' << bool(m[i][p]); + } + std::cout << std::endl; + } + + DomStackEntry(const std::vector<bool> &tmask): + tmask(tmask), fallThrough(true) {} + + bool fallThrough; std::vector<bool> tmask; Word pc; }; diff --git a/src/instruction.cpp b/src/instruction.cpp index bf7f0c1a..cba87abf 100644 --- a/src/instruction.cpp +++ b/src/instruction.cpp @@ -121,12 +121,16 @@ void Instruction::executeOn(Core &c) { return; } - /* Also throw exceptions on divergent branches. */ - if (predicated && instTable[op].controlFlow) { - bool p0 = c.pred[0][pred]; - for (Size t = 1; t < c.activeThreads; t++) { - if (c.pred[t][pred] != p0) throw DivergentBranchException(); + /* Also throw exceptions on non-masked divergent branches. */ + if (instTable[op].controlFlow) { + Size t, count, active; + for (t = 0, count = 0, active = 0; t < c.activeThreads; ++t) { + if ((!predicated || c.pred[t][pred]) && c.tmask[t]) ++count; + if (c.tmask[t]) ++active; } + + if (count != 0 && count != active) + throw DivergentBranchException(); } Size nextActiveThreads = c.activeThreads; @@ -135,8 +139,12 @@ void Instruction::executeOn(Core &c) { for (Size t = 0; t < c.activeThreads; t++) { vector<Reg<Word> > &reg(c.reg[t]); vector<Reg<bool> > &pReg(c.pred[t]); + stack<DomStackEntry> &domStack(c.domStack); - if (predicated && !pReg[pred]) continue; + // If this thread is masked out, don't execute the instruction, unless it's + // a split or join. 
+ if (((predicated && !pReg[pred]) || !c.tmask[t]) && + op != SPLIT && op != JOIN) continue; Word memAddr; switch (op) { @@ -282,6 +290,23 @@ void Instruction::executeOn(Core &c) { case FDIV: reg[rdest] = Float(double(Float(reg[rsrc[0]], wordSz)) / double(Float(reg[rsrc[1]], wordSz)),wordSz); break; + case SPLIT:if (t == 0) { + // TODO: if mask becomes all-zero, fall through + DomStackEntry e(pred, c.pred, c.pc); + c.domStack.push(c.tmask); + c.domStack.push(e); + for (unsigned i = 0; i < e.tmask.size(); ++i) + c.tmask[i] = !e.tmask[i]; + } + break; + case JOIN: if (t == 0) { + // TODO: if mask becomes all-zero, fall through + if (!c.domStack.top().fallThrough) + c.pc = c.domStack.top().pc; + c.tmask = c.domStack.top().tmask; + c.domStack.pop(); + } + break; default: cout << "ERROR: Unsupported instruction: " << *this << "\n"; exit(1); diff --git a/src/test/Makefile b/src/test/Makefile index 3f7db684..7f2130bf 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -5,12 +5,14 @@ HARPDIS = ../harptool -D 4BARCH = 4b16/16/2 all: simple.bin sieve.bin 2thread.bin simple.4b.bin sieve.4b.bin 2thread.4b.bin bubble.bin bubble.4b.bin dotprod.bin dotprod.4b.bin matmul.bin matmul.4b.bin \ - matmul-mt.s + matmul-mt.bin diverge.bin run: simple.out sieve.out 2thread.out simple.4b.out sieve.4b.out 2thread.4b.out bubble.out bubble.4b.out dotprod.out dotprod.4b.out matmul.out matmul.4b.out\ - matmul-mt.out + matmul-mt.out diverge.out -disas: simple.d sieve.d 2thread.d simple.4b.d sieve.4b.d 2thread.4b.d bubble.d bubble.4b.d dotprod.d dotprod.4b.d matmul.d matmul.4b.d matmul-mt.d +disas: simple.d sieve.d 2thread.d simple.4b.d sieve.4b.d 2thread.4b.d bubble.d \ + bubble.4b.d dotprod.d dotprod.4b.d matmul.d matmul.4b.d matmul-mt.d \ + diverge.d diverge.4b.d %.4b.out : %.4b.bin $(HARPEM) -a $(4BARCH) -c $< > $@ @@ -18,50 +20,11 @@ disas: simple.d sieve.d 2thread.d simple.4b.d sieve.4b.d 2thread.4b.d bubble.d b %.out : %.bin $(HARPEM) -c $< > $@ -2thread.bin : boot.HOF lib.HOF 
2thread.HOF - $(HARPLD) -o 2thread.bin $^ - -2thread.4b.bin : boot.4b.HOF lib.4b.HOF 2thread.4b.HOF - $(HARPLD) --arch $(4BARCH) -o 2thread.4b.bin $^ - -bubble.bin : boot.HOF lib.HOF bubble.HOF - $(HARPLD) -o bubble.bin $^ - -bubble.4b.bin : boot.4b.HOF lib.4b.HOF bubble.4b.HOF - $(HARPLD) --arch $(4BARCH) -o bubble.4b.bin $^ - -simple.bin : boot.HOF lib.HOF simple.HOF - $(HARPLD) -o $@ $^ - -sieve.bin : boot.HOF lib.HOF sieve.HOF - $(HARPLD) -o $@ $^ - -dotprod.bin : boot.HOF lib.HOF dotprod.HOF - $(HARPLD) -o $@ $^ - -matmul.bin : boot.HOF lib.HOF matmul.HOF - $(HARPLD) -o $@ $^ - -matmul-mt.bin : boot.HOF lib.HOF matmul-mt.HOF - $(HARPLD) -o $@ $^ - -simple.4b.bin : boot.4b.HOF lib.4b.HOF simple.4b.HOF +%.4b.bin : boot.4b.HOF lib.4b.HOF %.4b.HOF $(HARPLD) --arch $(4BARCH) -o $@ $^ -sieve.4b.bin : boot.4b.HOF lib.4b.HOF sieve.4b.HOF - $(HARPLD) --arch $(4BARCH) -o $@ $^ - -dotprod.4b.bin : boot.4b.HOF lib.4b.HOF dotprod.4b.HOF - $(HARPLD) --arch $(4BARCH) -o $@ $^ - -matmul.4b.bin : boot.4b.HOF lib.4b.HOF matmul.4b.HOF - $(HARPLD) --arch $(4BARCH) -o $@ $^ - -%.4b.bin : %.4b.HOF - $(HARPLD) --arch $(4BARCH) -o $@ $< - -%.bin : %.HOF - $(HARPLD) -o $@ $< +%.bin : boot.HOF lib.HOF %.HOF + $(HARPLD) -o $@ $^ %.4b.HOF : %.s $(HARPAS) --arch $(4BARCH) -o $@ $< diff --git a/src/test/diverge.s b/src/test/diverge.s new file mode 100644 index 00000000..197c5461 --- /dev/null +++ b/src/test/diverge.s @@ -0,0 +1,64 @@ +/******************************************************************************* + Harptools by Chad D. Kersey, Summer 2011 +******************************************************************************** + + Sample HARP assembly program. + +*******************************************************************************/ +/* Divergent branch: test immediate postdominator branch divergence support. 
*/ +.def THREADS 8 + +.align 4096 +.perm x +.entry +.global +entry: + ldi %r0, #1 + ldi %r1, THREADS +sloop: clone %r0 + + addi %r0, %r0, #1 + sub %r2, %r1, %r0 + rtop @p0, %r2 + @p0 ? jmpi sloop + + ldi %r0, #0 + jalis %r5, %r1, dthread; + + ldi %r0, #0 + ldi %r1, (__WORD * THREADS) + +ploop: ld %r7, %r0, array + jali %r5, printdec + + addi %r0, %r0, __WORD + sub %r7, %r1, %r0 + rtop @p0, %r7 + @p0 ? jmpi ploop + + trap; + + +dthread: ldi %r1, #10 + ldi %r2, #0 + +loop: andi %r3, %r0, #1 + rtop @p1, %r3 + @p1 ? split + @p1 ? jmpi else + add %r2, %r2, %r0 + jmpi after +else: sub %r2, %r2, %r0 +after: join + + subi %r1, %r1, #1 + rtop @p0, %r1 + @p0 ? jmpi loop + + shli %r4, %r0, (`__WORD) + st %r2, %r4, array + + jmprt %r5; + +.align 4096 +array: .space 4096