From cfbbc665fd85aaa5ad6b66f332be114b087968df Mon Sep 17 00:00:00 2001
From: chad <chad.d.kersey@gmail.com>
Date: Sun, 13 Oct 2013 21:14:03 -0400
Subject: [PATCH 1/3] Register numbers are decimal. Don't spawn more threads
 than there are lanes.

---
 src/instruction.cpp | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/instruction.cpp b/src/instruction.cpp
index a2c70892..7f35f2f7 100644
--- a/src/instruction.cpp
+++ b/src/instruction.cpp
@@ -85,17 +85,17 @@ Instruction::InstTableEntry Instruction::instTable[] = {
 
 ostream &Harp::operator<<(ostream& os, Instruction &inst) {
   if (inst.predicated) {
-    os << "@p" << inst.pred << " ? ";
+    os << "@p" << dec << inst.pred << " ? ";
   }
 
   os << Instruction::instTable[inst.op].opString << ' ';
-  if (inst.rdestPresent) os << "%r" << inst.rdest << ' ';
+  if (inst.rdestPresent) os << "%r" << dec << inst.rdest << ' ';
   if (inst.pdestPresent) os << "@p" << inst.pdest << ' ';
   for (int i = 0; i < inst.nRsrc; i++) {
-    os << "%r" << inst.rsrc[i] << ' ';
+    os << "%r" << dec << inst.rsrc[i] << ' ';
   }
   for (int i = 0; i < inst.nPsrc; i++) {
-    os << "@p" << inst.psrc[i] << ' ';
+    os << "@p" << dec << inst.psrc[i] << ' ';
   }
   if (inst.immsrcPresent) {
     if (inst.refLiteral) os << inst.refLiteral->name;
@@ -288,4 +288,9 @@ void Instruction::executeOn(Core &c) {
   D(3, "End instruction execute.");
 
   c.activeThreads = nextActiveThreads;
+  if (nextActiveThreads > c.reg.size()) {
+    cerr << "Error: attempt to spawn " << nextActiveThreads << " threads. "
+         << c.reg.size() << " available.\n";
+    abort();
+  }
 }

From 23e3b9a32b247332d669c4a355d4553b5c9d4aa1 Mon Sep 17 00:00:00 2001
From: chad <chad.d.kersey@gmail.com>
Date: Sun, 13 Oct 2013 21:14:35 -0400
Subject: [PATCH 2/3] Register names are decimal.

---
 src/core.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/core.cpp b/src/core.cpp
index 3835ef50..a4fc57aa 100644
--- a/src/core.cpp
+++ b/src/core.cpp
@@ -117,7 +117,7 @@ void Core::step() {
   if (USE_DEBUG >= 3) {
     D(3, "Register state:");
     for (unsigned i = 0; i < reg[0].size(); ++i) {
-      D_RAW("  %r" << i << ':');
+      D_RAW("  %r" << dec << i << ':');
       for (unsigned j = 0; j < reg.size(); ++j) 
         D_RAW(' ' << hex << reg[j][i] << ' ');
       D_RAW('(' << shadowReg[i] << ')' << endl);

From c107a49ff0a6d540274a973be905ab32fa097884 Mon Sep 17 00:00:00 2001
From: chad <chad.d.kersey@gmail.com>
Date: Sun, 13 Oct 2013 21:15:39 -0400
Subject: [PATCH 3/3] A multithreaded version of the matrix multiply.

---
 src/test/matmul-mt.s | 57 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 47 insertions(+), 10 deletions(-)

diff --git a/src/test/matmul-mt.s b/src/test/matmul-mt.s
index 643942ce..0575b63c 100644
--- a/src/test/matmul-mt.s
+++ b/src/test/matmul-mt.s
@@ -12,7 +12,8 @@
 .perm x
 .entry
 .global
-entry: ldi %r0, matrix_a;
+entry:
+       ldi %r0, matrix_a;
        ldi %r1, #3;
        jali %r5, matgen;
 
@@ -59,34 +60,54 @@ mgloop: jali %r5, randf;
 /* Write the matrix product of square matrix at (%r0) and (%r1) to (%r2). The
    size of these matrices is 2^Nx2^N, where N = %r3 */
 
-matmul: ldi %r4, #1;
+matmul: ori %r22, %r5, #0
+        ldi %r4, #1;
         ldi %r10, (`__WORD); /* ` is the log base 2 operator */
         shl %r4, %r4, %r3;
         add %r10, %r10, %r3;
         ldi %r14, #1;
         shl %r14, %r14, %r10;
 
-        divi %r17, %r14, THREADS; /* Spawn threads */
-sloop:  
+        divi %r23, %r4, THREADS; /* Spawn threads */
+        ldi %r18, THREADS
+        ldi %r19, #0
+        mul %r20, %r14, %r23;
 
-        jmpr %r5;
+        ori %r30, %r2, #0; /* Save r0 and r2 for thread 0 */
+        ori %r31, %r0, #0;
+
+sloop:  add %r0, %r0, %r20
+        add %r2, %r2, %r20
+        addi %r19, %r19, #1;
+        subi %r18, %r18, #1;
+        iszero @p0, %r18;
+  @p0 ? jmpi sfin;
+        clone %r19;
+        jmpi sloop;
+
+sfin:   ori %r2, %r30, #0; /* restore r0 and r2 (saved in r31/r30 above) */
+        ori %r0, %r31, #0;
+        jalis %r5, %r19, matmulthd;
+
+        jmpr %r22; /* return via the link value copied from %r5 into %r22 at matmul entry */
 
 /* One thread of matrix multiplication. Expected register values at start:
  *   %r0 - matrix a pointer (plus offset)
  *   %r1 - matrix b pointer
  *   %r2 - destination matrix pointer (plus offset)
- *   %r17 - row count
+ *   %r23 - row count
  */
 matmulthd: ldi %r9, #0; /* result row: %r9 */
 rloop:     ldi %r6, #0; /* result col: %r6 */
+
 cloop:     shli %r16, %r6, (`__WORD);
            shl %r15, %r9, %r10;
 
            add %r11, %r15, %r0;
            add %r12, %r16, %r1;
-           ldi %r13, #0;
 
            ldi %r8, #0 /* dot prod position: %r8 */
+           ldi %r13, #0;
 iloop:     ld %r7, %r11, #0;
            ld %r17, %r12, #0;
            fmul %r7, %r7, %r17
@@ -109,7 +130,7 @@ iloop:     ld %r7, %r11, #0;
      @p0 ? jmpi cloop;
 
            addi %r9, %r9, #1;
-           sub %r7, %r9, %r17;
+           sub %r7, %r9, %r23;
            rtop @p0, %r7;
      @p0 ? jmpi rloop;
 
@@ -117,8 +138,24 @@ iloop:     ld %r7, %r11, #0;
 
 .align 4096
 .perm rw
-matrix_a: .space 64;
-matrix_b: .space 64;
 matrix_r: .space 64
 
+matrix_a: .word  1f  2f  3f  4f  5f  6f  7f  8f
+          .word  2f  3f  4f  5f  6f  7f  8f  9f
+          .word  3f  4f  5f  6f  7f  8f  9f 10f
+          .word  4f  5f  6f  7f  8f  9f 10f 11f
+          .word  5f  6f  7f  8f  9f 10f 11f 12f
+          .word  6f  7f  8f  9f 10f 11f 12f 13f
+          .word  7f  8f  9f 10f 11f 12f 13f 14f
+          .word  8f  9f 10f 11f 12f 13f 14f 15f
+
+matrix_b: .word  0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7
+          .word  1.0 1.1 1.2 1.3 1.4 1.5 1.6 1.7
+          .word  2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7
+          .word  3.0 3.1 3.2 3.3 3.4 3.5 3.6 3.7
+          .word  4.0 4.1 4.2 4.3 4.4 4.5 4.6 4.7
+          .word  5.0 5.1 5.2 5.3 5.4 5.5 5.6 5.7
+          .word  6.0 6.1 6.2 6.3 6.4 6.5 6.6 6.7
+          .word  7.0 7.1 7.2 7.3 7.4 7.5 7.6 7.7
+
 retaddr: .word 0