diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 92a6596..3d00c35 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -267,6 +267,7 @@ class TensorCoreDecoupled( val operandsValid = fullAQueue.io.deq.valid && respQueueB.valid val operandA = fullAQueue.io.deq.bits.data + val operandATag = fullAQueue.io.deq.bits.tag val operandB = respQueueB.bits.data val dpuReady = Wire(Bool()) val dpuFire = operandsValid && dpuReady @@ -314,8 +315,6 @@ class TensorCoreDecoupled( val operandADimensional = operandA.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq .grouped(4).toSeq - println(s"operandA: ${fullAQueue.io.deq.bits.data.widthOption.get} bits") - println(s"A: ${operandADimensional.length}, ${operandADimensional(0).length}") assert(operandADimensional.length == tilingParams.mc && operandADimensional(0).length == tilingParams.kc, "operand width doesn't agree with tiling parameter") @@ -323,7 +322,6 @@ class TensorCoreDecoupled( val operandBDimensional = operandB.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq .grouped(4).toSeq - println(s"B: ${operandBDimensional.length}, ${operandBDimensional(0).length}") val ncSubstep = tilingParams.nc / 2 assert(tilingParams.mc * ncSubstep == numLanes, "substep tile size doesn't match writeback throughput") @@ -369,18 +367,20 @@ class TensorCoreDecoupled( // These queues hold metadata needed for writeback in sync with the DPU. 
val queueDepth = 4 // needs to be at least the DPU latency - val rdQueue = Module(new Queue( - chiselTypeOf(io.writeback.bits.rd), queueDepth + val tagQueue = Module(new Queue( + chiselTypeOf(operandATag), queueDepth )) - rdQueue.io.enq.valid := dpuFire - rdQueue.io.enq.bits := rdGen(stepCompute, substepCompute) - rdQueue.io.deq.ready := io.writeback.fire - assert(rdQueue.io.enq.ready === true.B, - "rd queue full, throttling DPU operation") - assert(!dpuValid || rdQueue.io.deq.valid, - "rd queue and DPU went out of sync") + tagQueue.io.enq.valid := dpuFire + // A and B should have the same tags + tagQueue.io.enq.bits := operandATag + // @cleanup: awkward + tagQueue.io.enq.bits.substep := substepCompute + tagQueue.io.deq.ready := io.writeback.fire + assert(tagQueue.io.enq.ready === true.B, + "tag queue full, DPU operation might be throttled") + assert(!dpuValid || tagQueue.io.deq.valid, + "tag queue and DPU went out of sync") - // TODO: decouple wid from frontend // val widQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1)) // note rd is independent to sets @@ -390,11 +390,14 @@ class TensorCoreDecoupled( (step << 1/*2 substeps*/) + substep } + val setWriteback = tagQueue.io.deq.bits.set + val stepWriteback = tagQueue.io.deq.bits.step + val substepWriteback = tagQueue.io.deq.bits.substep io.writeback.valid := dpuValid + // TODO: decouple wid from frontend io.writeback.bits.wid := warpReg - io.writeback.bits.rd := rdQueue.io.deq.bits - // FIXME: look at set/step of dpu output not setExecute - io.writeback.bits.last := setDone(setExecute) && stepDone(stepExecute) + io.writeback.bits.rd := rdGen(stepWriteback, substepWriteback) + io.writeback.bits.last := setDone(setWriteback) && stepDone(stepWriteback) // State transition // ---------------- @@ -500,6 +503,10 @@ class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL) tensor.io.writeback.ready := true.B io.finished := tensor.io.writeback.valid && tensor.io.writeback.bits.last + when (io.finished) { + 
// might be too strong: assumes the last writeback always targets rd 31 + assert(tensor.io.writeback.bits.rd === 31.U) + } } // a minimal Diplomacy graph with a tensor core and a TLRAM