Merge branch 'main' of https://github.com/ucb-bar/radiance into main
This commit is contained in:
Submodule src/main/resources/vsrc/vortex updated: cde8da1f3b...78df981366
716
src/main/scala/radiance/core/TensorCoreDecoupled.scala
Normal file
716
src/main/scala/radiance/core/TensorCoreDecoupled.scala
Normal file
@@ -0,0 +1,716 @@
|
||||
// See LICENSE.SiFive for license details.
|
||||
// See LICENSE.Berkeley for license details.
|
||||
|
||||
package radiance.core
|
||||
|
||||
import chisel3._
|
||||
import chisel3.util._
|
||||
import chisel3.experimental.requireIsChiselType
|
||||
import org.chipsalliance.cde.config.Parameters
|
||||
import org.chipsalliance.diplomacy.lazymodule.{LazyModule, LazyModuleImp}
|
||||
import freechips.rocketchip.tilelink._
|
||||
import freechips.rocketchip.diplomacy.{IdRange, AddressSet}
|
||||
import freechips.rocketchip.unittest.{UnitTest, UnitTestModule}
|
||||
import radiance.memory.SourceGenerator
|
||||
|
||||
/** Tiling configuration of the tensor core.
  *
  * @param m  M dimension of the SMEM tile
  * @param n  N dimension of the SMEM tile
  * @param k  K dimension of the SMEM tile
  * @param mc M dimension of the compute tile
  * @param nc N dimension of the compute tile
  * @param kc K dimension of the compute tile
  *
  * The compute tile dimensions are determined by the number of MAC units.
  */
case class TensorTilingParams(
    // SMEM tile
    m: Int = 16,
    n: Int = 16,
    k: Int = 16,
    // compute tile
    mc: Int = 4,
    nc: Int = 4,
    kc: Int = 4
)
|
||||
|
||||
/** Tensor core organized as a decoupled access/execute pipeline.
  *
  * The access stage (frontend) walks the A and B operand tiles and issues one
  * memory request per "block" on `io.reqA` / `io.reqB`.  The execute stage
  * (backend) buffers the responses arriving on `io.respA` / `io.respB`,
  * assembles them into operand tiles, feeds an array of dot-product units and
  * emits the results on `io.writeback`.
  *
  * @param numWarps     number of warps; sizes the warp id fields
  * @param numLanes     number of words carried by one memory response and by
  *                     one writeback
  * @param numSourceIds number of source ids available per request port
  * @param tilingParams SMEM / compute tile dimensions
  * @param numFPRegs    number of FP registers; sizes the writeback `rd` field
  */
class TensorCoreDecoupled(
    val numWarps: Int,
    val numLanes: Int,
    val numSourceIds: Int,
    val tilingParams: TensorTilingParams,
    val numFPRegs: Int = 32
) extends Module {
  val numWarpBits = log2Ceil(numWarps)
  val wordSize = 4 // TODO FP16
  val wordSizeInBits = wordSize * 8 // TODO FP16
  val sourceWidth = log2Ceil(numSourceIds)
  val dataWidth = numLanes * wordSizeInBits // TODO FP16
  val numFPRegBits = log2Ceil(numFPRegs)

  val io = IO(new Bundle {
    // kicks off one tile operation for warp `wid`; only accepted while the
    // access FSM is idle
    val initiate = Flipped(Decoupled(new Bundle {
      val wid = UInt(numWarpBits.W)
    }))
    // one vector of DPU results per beat; `last` marks the final
    // set/step/substep of the operation
    val writeback = Decoupled(new Bundle {
      val last = Bool()
      val wid = UInt(numWarpBits.W)
      val rd = UInt(numFPRegBits.W)
      val data = Vec(numLanes, UInt((wordSizeInBits).W))
    })
    val respA = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth)))
    val respB = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth)))
    val reqA = Decoupled(new TensorMemReq(sourceWidth))
    val reqB = Decoupled(new TensorMemReq(sourceWidth))
  })
  dontTouch(io)

  // memory request: source id and byte address
  class TensorMemReq(
      sourceWidth: Int
  ) extends Bundle {
    val source = UInt(sourceWidth.W)
    val address = UInt(32.W)
  }
  // memory response: source id of the originating request and the data beat
  class TensorMemResp(
      sourceWidth: Int,
      dataWidth: Int
  ) extends Bundle {
    val source = UInt(sourceWidth.W)
    val data = UInt(dataWidth.W)
  }
  // metadata recorded per request: which warp/set/index it was issued for
  class TensorMemTag extends Bundle {
    val warp = UInt(numWarpBits.W)
    val set = UInt(setBits.W)
    val index = UInt(indexBits.W)
  }
  // mem response after translation from TL source to set/step tag
  class TensorMemRespWithTag(
      dataWidth: Int
  ) extends Bundle {
    val tag = new TensorMemTag
    val data = UInt(dataWidth.W)
  }

  // ===========================================================================
  // Access stage
  // ===========================================================================
  //
  // Frontend of the decoupled access/execute pipeline.

  // sets: k iteration
  val numSets = (tilingParams.k / tilingParams.kc)
  val setBits = log2Ceil(numSets)
  // steps: i-j iteration
  val numSteps = (tilingParams.m * tilingParams.n) / (tilingParams.mc * tilingParams.nc)
  val stepBits = log2Ceil(numSteps)
  val lastSet = ((1 << setBits) - 1)
  val lastStep = ((1 << stepBits) - 1)
  // lastSet/lastStep are derived from the counter bit widths and the counters
  // wrap by masking, so the iteration counts have to be powers of two.
  require(isPow2(numSets) && isPow2(numSteps),
    "number of sets and steps must be powers of two")
  def setDone(set: UInt) = (set === lastSet.U)
  def stepDone(step: UInt) = (step === lastStep.U)
  // 'index' is the index of a memory request among the sequence of requests
  // needed to read a full M-column of A or N-row of B. Its range is [0,m/2)
  // or [0,n/2), where 2 is the stride that can be read in a single request.
  require(tilingParams.m == tilingParams.n,
    "currently only supports square SMEM tile")
  val numIndices = tilingParams.m / 2/*FIXME:hardcoded?*/
  val indexBits = log2Ceil(numIndices)
  val lastIndex = (1 << indexBits) - 1
  // lastIndex is derived from the bit width as well
  require(isPow2(numIndices), "number of indices must be a power of two")

  object AccessorState extends ChiselEnum {
    val idle = Value(0.U)
    val access = Value(1.U)
    // All set/step sequencing is complete and the tensor core is holding the
    // result data until downstream writeback is ready.
    // FIXME: is this necessary if writeback is decoupled with queues?
    val finish = Value(2.U)
  }
  val state = RegInit(AccessorState.idle)
  val allReqsDone = WireInit(false.B)
  dontTouch(allReqsDone)

  // warp id latched at initiate; attached to every request tag
  val warpAccess = RegInit(0.U(numWarpBits.W))

  // per-operand request position: current set and index within the set
  class BlockState extends Bundle {
    val set = UInt(setBits.W)
    val index = UInt(indexBits.W)
  }
  val stateInit = Wire(new BlockState)
  stateInit.set := 0.U
  stateInit.index := 0.U
  val stateA = RegInit(stateInit)
  val stateB = RegInit(stateInit)
  dontTouch(stateA)
  dontTouch(stateA.index)
  dontTouch(stateB)
  dontTouch(stateB.index)

  io.initiate.ready := (state === AccessorState.idle)
  when (io.initiate.fire) {
    warpAccess := io.initiate.bits.wid
    assert(stateA.set === 0.U && stateA.index === 0.U &&
           stateB.set === 0.U && stateB.index === 0.U,
           "stateA and stateB not initialized to zero")
  }

  // idle -> access on initiate, access -> finish once every request has been
  // issued, finish -> idle unconditionally
  switch(state) {
    is(AccessorState.idle) {
      when(io.initiate.fire) {
        state := AccessorState.access
      }
    }
    is(AccessorState.access) {
      when (allReqsDone) {
        state := AccessorState.finish
      }
    }
    is(AccessorState.finish) {
      // FIXME: is finish state needed?
      state := AccessorState.idle
    }
  }

  // advance index on every issued request; advance set when index wraps
  when (io.reqA.fire) {
    when (stateA.index === lastIndex.U) {
      stateA.set := stateA.set + 1.U
    }
    stateA.index := stateA.index + 1.U
  }
  when (io.reqB.fire) {
    when (stateB.index === lastIndex.U) {
      stateB.set := stateB.set + 1.U
    }
    stateB.index := stateB.index + 1.U
  }

  // Address generation
  //
  // Returns base + ((set << indexBits) + index) * (numLanes * wordSize).
  def addressGen(base: UInt, set: UInt, index: UInt): UInt = {
    // note that both A and B are K-major to facilitate bank conflict-free SMEM
    // accesses, so that below code applies to both.
    //
    // a "block" is the 4*8 byte-sized contiguous memory that can be read in
    // one SMEM request.  The A and B matrix is assumed to be stored in
    // block-wise "index"-major order (M-major for A, N-major for B)
    val blockRow = set
    val blockCol = index
    val blockIndex = (blockRow << indexBits) + blockCol
    val blockSize = numLanes * wordSize
    val blockSizeBits = log2Ceil(blockSize)
    val byteOffset = blockIndex << blockSizeBits
    base + byteOffset

    // address generation for byte-wise K-major A and B layout
    // val elemRow = blockRow << 1
    // val elemCol = blockCol << log2Ceil(tilingParams.kc)
    // val rowStride = tilingParams.k * wordSize
    // val rowStrideBits = log2Ceil(rowStride)
    // val wordStrideBits = log2Ceil(wordSize)
    // val tileOffset = (elemRow << rowStrideBits) + (elemCol << wordStrideBits)
    // base + tileOffset
  }

  // FIXME: bogus base address
  val addressA = addressGen(0.U, stateA.set, stateA.index)
  // SMEM 256KB, 8 banks: 0x8000B(32KB) per bank
  val addressB = addressGen(0x8000.U, stateB.set, stateB.index)

  val lastReqA = (stateA.set === lastSet.U) && (stateA.index === lastIndex.U)
  val lastReqB = (stateB.set === lastSet.U) && (stateB.index === lastIndex.U)
  // sticky flags: set once the last request of the operand has been issued,
  // cleared in the finish state
  val doneReqA = RegInit(false.B)
  val doneReqB = RegInit(false.B)
  when (lastReqA && io.reqA.fire) { doneReqA := true.B }
  when (lastReqB && io.reqB.fire) { doneReqB := true.B }
  val genReqA = (state === AccessorState.access) && !doneReqA
  val genReqB = (state === AccessorState.access) && !doneReqB
  when (state === AccessorState.finish) {
    doneReqA := false.B
    doneReqB := false.B
    stateA.set := 0.U
    stateA.index := 0.U
    stateB.set := 0.U
    stateB.index := 0.U
  }

  allReqsDone := doneReqA && doneReqB

  // Request generation
  //
  val tagA = Wire(new TensorMemTag)
  tagA.warp := warpAccess
  tagA.set := stateA.set
  tagA.index := stateA.index
  val tagB = Wire(new TensorMemTag)
  tagB.warp := warpAccess
  tagB.set := stateB.set
  tagB.index := stateB.index

  val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
  val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
  Seq((io.reqA, (io.respA, respATagged)),
      (io.reqB, (io.respB, respBTagged))).zipWithIndex.foreach {
    case ((req, (resp, respTagged)), i) => {
      // one source-id allocator per port; the tag is stored as metadata
      // alongside the allocated id.
      // NOTE(review): presumably io.peek returns the metadata stored for the
      // id being reclaimed -- confirm against SourceGenerator.
      val sourceGen = Module(new SourceGenerator(
        log2Ceil(numSourceIds),
        metadata = Some(new TensorMemTag)
      ))

      sourceGen.io.gen := req.fire
      sourceGen.io.meta := (if (i == 0) tagA else tagB)
      req.valid := (if (i == 0) genReqA else genReqB)
      req.bits.address := (if (i == 0) addressA else addressB)
      req.bits.source := sourceGen.io.id.bits

      sourceGen.io.reclaim.valid := resp.fire
      sourceGen.io.reclaim.bits := resp.bits.source

      // translate source
      respTagged.valid := resp.valid
      respTagged.bits.tag := sourceGen.io.peek
      respTagged.bits.data := resp.bits.data
      resp.ready := respTagged.ready
    }
  }

  // ===========================================================================
  // Execute stage
  // ===========================================================================
  //
  // Backend of the decoupled access/execute pipeline.
  //
  val respQueueDepth = 4 // FIXME: parameterize
  require(respQueueDepth >= 4,
    "respQueueDepth must be at least 4.  This is because the B operand buffer " ++
    "is shallower than A's, so the B response queue has to be deep enough to " ++
    "hold younger requests until A operand buffer becomes valid and the first DPU " ++
    "fire can happen.  FIXME: make operand buffer report per-subtile valid so " ++
    "the first compute can happen earlier.")
  val respQueueA = Queue(respATagged, respQueueDepth)
  val respQueueB = Queue(respBTagged, respQueueDepth)

  require(respQueueA.bits.data.widthOption.get ==
          io.writeback.bits.data.widthOption.get,
    "response data width does not match the writeback data width")

  // FIXME: unnecessary
  val substepDeqA = RegInit(0.U(1.W))
  when (respQueueA.fire) {
    substepDeqA := substepDeqA + 1.U
  }
  dontTouch(substepDeqA)

  // Stage the operands in a pipeline so that we obtain the full 4x4 tiles
  // ready for compute.  Also send the set/step tag along the pipe for
  // alignment check.

  // @cleanup: dedup A and B below

  // collects all `numIndices` responses of one set of A
  val fullA = Module(new FillBuffer(
    chiselTypeOf(respQueueA.bits.data), numIndices
  ))
  fullA.io.enq.valid := respQueueA.valid
  fullA.io.enq.bits := respQueueA.bits.data
  respQueueA.ready := fullA.io.enq.ready
  // `pipe` combinationally couples enq-deq ready
  val fullATag = Module(new Queue(
    new TensorMemTag, entries = 1, pipe = true
  ))
  fullATag.io.enq.valid := respQueueA.valid
  fullATag.io.enq.bits := respQueueA.bits.tag

  // stage the full A tile once more so that FillBuffer can be filled up in the
  // background while the tile is being used for compute.  This does come with
  // capacity overhead.
  val fullABuf = Module(new Queue(
    new Bundle {
      val data = chiselTypeOf(fullA.io.deq.bits)
      val tag = new TensorMemTag
    }, entries = 1, pipe = true
  ))
  fullABuf.io.enq.valid := fullA.io.deq.valid
  fullABuf.io.enq.bits.data := fullA.io.deq.bits
  fullABuf.io.enq.bits.tag := fullATag.io.deq.bits
  fullA.io.deq.ready := fullABuf.io.enq.ready
  fullATag.io.deq.ready := fullABuf.io.enq.ready

  // serialize every two B responses into one full 4x4 B tile
  // FIXME: do the same for A
  val fullB = Module(new FillBuffer(
    chiselTypeOf(respQueueB.bits.data), 2/*substeps*/
  ))
  fullB.io.enq.valid := respQueueB.valid
  fullB.io.enq.bits := respQueueB.bits.data
  respQueueB.ready := fullB.io.enq.ready
  val fullBTag = Module(new Queue(
    new TensorMemTag, entries = 1, pipe = true
  ))
  fullBTag.io.enq.valid := respQueueB.valid
  fullBTag.io.enq.bits := respQueueB.bits.tag

  val fullBBuf = Module(new Queue(
    new Bundle {
      val data = chiselTypeOf(fullB.io.deq.bits)
      val tag = new TensorMemTag
    }, entries = 1, pipe = true
  ))
  fullBBuf.io.enq.valid := fullB.io.deq.valid
  fullBBuf.io.enq.bits.data := fullB.io.deq.bits
  fullBBuf.io.enq.bits.tag := fullBTag.io.deq.bits
  fullB.io.deq.ready := fullBBuf.io.enq.ready
  fullBTag.io.deq.ready := fullBBuf.io.enq.ready

  // the DPUs fire when both staged operands are valid and the DPUs are not
  // stalled
  val dpuReady = Wire(Bool())
  val operandsValid = fullABuf.io.deq.valid && fullBBuf.io.deq.valid
  val dpuFire = operandsValid && dpuReady

  val setCompute = RegInit(0.U(setBits.W))
  val stepCompute = RegInit(0.U(stepBits.W))
  val substepCompute = RegInit(0.U(1.W))
  // a step consists of two substeps; move on after the second one fires
  val nextStepCompute = dpuFire && (substepCompute === 1.U)
  dontTouch(setCompute)
  dontTouch(stepCompute)
  dontTouch(substepCompute)
  when (dpuFire) {
    substepCompute := substepCompute + 1.U
  }

  // Operand selection
  //
  // select the correct 4x4 tile from A operand buffer
  val numTilesM = tilingParams.m / tilingParams.mc
  val numTilesMBits = log2Ceil(numTilesM)
  // the M-tile position is extracted from the step counter with a bit mask
  require(isPow2(numTilesM), "number of M tiles must be a power of two")
  def selectOperandA(buf: Vec[UInt]): UInt = {
    require(buf.length == numIndices)
    val stepM = stepCompute & ((1 << numTilesMBits) - 1).U
    // two consecutive buffer entries make up one A tile
    Cat(buf((stepM << 1) + 1.U), buf(stepM << 1))
  }
  val operandA = selectOperandA(fullABuf.io.deq.bits.data)
  val operandATag = fullABuf.io.deq.bits.tag
  // select the correct 2x4 tile from B operand buffer
  val operandB = fullBBuf.io.deq.bits.data(substepCompute)
  val operandBTag = fullBBuf.io.deq.bits.tag
  dontTouch(operandATag)
  dontTouch(operandBTag)

  // Operand buffer logic
  //
  // hold A data until the entire set is done
  val shouldDequeueAMask = ((1 << stepBits) - 1).U
  val shouldDequeueA =
    ((stepCompute & shouldDequeueAMask) === shouldDequeueAMask) &&
    (substepCompute === 1.U)
  fullABuf.io.deq.ready := dpuFire && shouldDequeueA
  // hold B tile at respQueueB for multiple steps for reuse, only dequeue when
  // we fully iterated a column (M-dimension)
  val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U
  val shouldDequeueB =
    ((stepCompute & shouldDequeueBMask) === shouldDequeueBMask) &&
    (substepCompute === 1.U)
  fullBBuf.io.deq.ready := dpuFire && shouldDequeueB
  dontTouch(respQueueA)
  dontTouch(respQueueB)
  dontTouch(shouldDequeueA)
  dontTouch(shouldDequeueB)

  // Assert that the DPU is computing with operands of the same set/step.  Note
  // that the B resp will only have step values multiple of 4 due to reuse.
  //
  // This check assumes that memory responses come back in-order.  Might be too
  // strong of an assumption depending on the backing memory.
  def assertAligned = {
    when (dpuFire) {
      assert(operandATag.warp === operandBTag.warp &&
             operandATag.set === operandBTag.set,
        "A and B operands are pointing to different warps and sets. " ++
        "This might indicate memory response coming back out-of-order.")
      assert(operandATag.set === setCompute,
        "Operand arrived from memory is pointing at a different set than the FSM.")
    }
  }
  assertAligned

  // Dot-product unit
  //
  // 4x2 four-element DPUs summing up to 32 MACs in total
  //
  val ncSubstep = tilingParams.nc / 2
  require(tilingParams.mc * ncSubstep == numLanes,
          "substep tile size doesn't match writeback throughput")
  val dpus = Seq.fill(tilingParams.mc)(Seq.fill(ncSubstep)(
    Module(new TensorDotProductUnit(half = false))
  ))

  // reshape operands for easier routing to DPU: split into words, then group
  // every four words into one row
  def reshapeByFourWords(x: UInt): Seq[Seq[UInt]] = {
    x.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
      .grouped(4/*k-dim*/).toSeq
  }
  val operandADimensional = reshapeByFourWords(operandA)
  require(operandADimensional.length == tilingParams.mc &&
          operandADimensional(0).length == tilingParams.kc,
          "operand width doesn't agree with tiling parameter")
  val operandBDimensional = reshapeByFourWords(operandB)
  require(operandBDimensional.length == ncSubstep &&
          operandBDimensional(0).length == tilingParams.kc,
          "operand width doesn't agree with tiling parameter")

  for (m <- 0 until tilingParams.mc) {
    for (n <- 0 until ncSubstep) {
      dpus(m)(n).io.in.valid := dpuFire
      dpus(m)(n).io.in.bits.a := operandADimensional(m)
      dpus(m)(n).io.in.bits.b := operandBDimensional(n)
      dpus(m)(n).io.in.bits.c := 0.U // FIXME: bogus accum data
      // dpu ready couples with writeback backpressure
      dpus(m)(n).io.stall := !io.writeback.ready
    }
  }
  // all DPUs share the same stall, so sampling one is sufficient
  dpuReady := !dpus(0)(0).io.stall
  dontTouch(dpuFire)
  dontTouch(dpuReady)

  val dpuValids = dpus.flatMap(_.map(_.io.out.valid))
  val dpuValid = dpuValids.reduce(_ && _)
  // all DPUs are expected to stall and produce output in lockstep
  def assertDPU = {
    val dpuStalls = dpus.flatMap(_.map(_.io.stall))
    assert(dpuStalls.reduce(_ && _) === dpuStalls.reduce(_ || _),
           "stall signals of DPUs went out of sync")
    assert(dpuValids.reduce(_ && _) === dpuValids.reduce(_ || _),
           "valid signals of DPUs went out of sync")
  }
  assertDPU

  // flatten DPU output into 1D array in M-major order
  val flattenedDPUOut = (0 until ncSubstep).flatMap { n =>
    (0 until tilingParams.mc).map { m =>
      dpus(m)(n).io.out.bits.data
    }
  }
  io.writeback.bits.data := flattenedDPUOut

  // Writeback logic
  //
  // These queues hold metadata needed for writeback in sync with the DPU.

  class TensorComputeTag extends Bundle {
    val warp = UInt(numWarpBits.W)
    val set = UInt(setBits.W)
    val step = UInt(stepBits.W)
    val substep = UInt(1.W)
  }

  val queueDepth = 5 // needs to be at least the DPU latency
  val tagQueue = Module(new Queue(new TensorComputeTag, queueDepth))
  // one tag is pushed per DPU fire and popped per writeback
  tagQueue.io.enq.valid := dpuFire
  tagQueue.io.enq.bits.warp := operandATag.warp
  tagQueue.io.enq.bits.set := setCompute
  tagQueue.io.enq.bits.step := stepCompute
  tagQueue.io.enq.bits.substep := substepCompute
  tagQueue.io.deq.ready := io.writeback.fire
  assert(tagQueue.io.enq.ready === true.B,
         "tag queue full, DPU operation might be throttled")
  assert(!dpuValid || tagQueue.io.deq.valid,
         "tag queue and DPU went out of sync")

  // val widQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1))

  // note rd is independent to sets
  def rdGen(step: UInt, substep: UInt): UInt = {
    // each step produces 4x4 output tile, written by 8 threads with 2 regs per
    // thread
    (step << 1/*2 substeps*/) + substep
  }

  val warpWriteback = tagQueue.io.deq.bits.warp
  val setWriteback = tagQueue.io.deq.bits.set
  val stepWriteback = tagQueue.io.deq.bits.step
  val substepWriteback = tagQueue.io.deq.bits.substep
  io.writeback.valid := dpuValid
  io.writeback.bits.wid := warpWriteback
  io.writeback.bits.rd := rdGen(stepWriteback, substepWriteback)
  io.writeback.bits.last := setDone(setWriteback) && stepDone(stepWriteback) &&
                            (substepWriteback === 1.U)

  // State transition
  // ----------------
  //
  // set/step sequencing logic

  // On `nextStep`, increment `step` (wrapping at lastStep) and, when the step
  // wraps, increment `set` (wrapping at lastSet).
  def sequenceSetStep(set: UInt, step: UInt, nextStep: Bool) = {
    when (nextStep) {
      step := (step + 1.U) & lastStep.U
      when (stepDone(step)) {
        set := (set + 1.U) & lastSet.U
      }
    }
  }
  sequenceSetStep(setCompute, stepCompute, nextStepCompute)
}
|
||||
|
||||
/** A buffer that collects multiple entries of input data and exposes the
  * coalesced data as output.  Effectively acts as a width-widening
  * chisel.util.Pipe.
  *
  * `io.enq` accepts data until `entries` elements have been stored; from then
  * on `io.deq` presents all of them at once as a `Vec`.  Dequeuing empties the
  * buffer so that it can be filled again.
  *
  * @param gen     Chisel type of a single entry
  * @param entries number of entries collected before the output becomes valid
  */
class FillBuffer[T <: Data](
    gen: T,
    entries: Int
) extends Module {
  require(entries > 0, "FillBuffer must have a positive number of entries")
  requireIsChiselType(gen)

  val io = IO(new Bundle {
    val enq = Flipped(Decoupled(gen))
    val deq = Decoupled(Vec(entries, gen))
  })

  val data = Reg(Vec(entries, gen))
  // counts stored entries in [0, entries]; `entries` means full
  val ptr = Counter(entries + 1)
  dontTouch(ptr.value)
  val full = (ptr.value === entries.U)
  // enq and deq are mutually exclusive: enq is only ready while not full,
  // deq is only valid while full
  io.enq.ready := !full
  when (io.enq.fire) {
    data(ptr.value) := io.enq.bits
    ptr.inc()
  }
  io.deq.valid := full
  // connect the whole Vec directly instead of an element-wise lambda whose
  // binder shadowed the module's `io`
  io.deq.bits := data
  when (io.deq.fire) {
    assert(ptr.value === entries.U, "FillBuffer fired before buffer was full")
    ptr.reset()
  }
}
|
||||
|
||||
// synthesizable unit tests
|
||||
|
||||
/** Wraps TensorCoreDecoupled with a TileLink client node for use in a
  * Diplomacy graph.
  *
  * The node has two edges, one for the A matrix and one for the B matrix;
  * each of them owns the source id range [0, numSourceIds).
  */
class TensorCoreDecoupledTL(implicit p: Parameters) extends LazyModule {
  val numSourceIds = 16

  // builds the port parameters of a single-master edge with the given name
  private def clientPort(name: String) =
    TLMasterPortParameters.v2(
      Seq(TLMasterParameters.v2(
        name = name,
        sourceId = IdRange(0, numSourceIds)
      ))
    )

  // edge 0 carries matrix A traffic, edge 1 carries matrix B traffic
  val node = TLClientNode(Seq(
    clientPort("TensorCoreDecoupledMatrixANode"),
    clientPort("TensorCoreDecoupledMatrixBNode")
  ))

  lazy val module = new TensorCoreDecoupledTLImp(this)
}
|
||||
|
||||
/** Module implementation of [[TensorCoreDecoupledTL]].
  *
  * Instantiates a TensorCoreDecoupled, turns its A/B memory requests into
  * TileLink Gets on the two outward edges, and feeds the D-channel responses
  * back.  Doubles as a unit test module: `io.start` initiates an operation
  * and `io.finished` is raised on the last writeback.
  */
class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL)
    extends LazyModuleImp(outer) with UnitTestModule {
  require(outer.node.out.length == 2/*A and B*/)

  val tensor = Module(new TensorCoreDecoupled(
                      8, 8, outer.numSourceIds , TensorTilingParams()))
  val wordSize = tensor.wordSize
  // Each request reads one full block of numLanes words: the tensor core
  // strides its addresses by this amount and consumes the whole response
  // beat, so the Get has to ask for the whole block rather than one word.
  val reqSizeInBytes = tensor.numLanes * wordSize

  val zip = Seq((outer.node.out(0), tensor.io.reqA),
                (outer.node.out(1), tensor.io.reqB))
  zip.foreach { case ((tl, edge), req) =>
    tl.a.valid := req.valid
    val (legal, bits) = edge.Get(
      fromSource = req.bits.source,
      toAddress = req.bits.address,
      lgSize = log2Ceil(reqSizeInBytes).U
    )
    tl.a.bits := bits
    req.ready := tl.a.ready
    when(tl.a.fire) {
      assert(legal, "illegal TL req gen")
    }
  }

  // route D-channel responses back to the tensor core
  val (tlOutA, _) = outer.node.out(0)
  val (tlOutB, _) = outer.node.out(1)
  Seq((tlOutA, tensor.io.respA),
      (tlOutB, tensor.io.respB)).foreach { case (tl, resp) =>
    resp.valid := tl.d.valid
    resp.bits.data := tl.d.bits.data
    resp.bits.source := tl.d.bits.source
    tl.d.ready := resp.ready
  }

  tensor.io.initiate.valid := io.start
  tensor.io.initiate.bits.wid := 0.U // TODO
  // the test harness never backpressures writeback
  tensor.io.writeback.ready := true.B

  io.finished := tensor.io.writeback.valid && tensor.io.writeback.bits.last
  when (io.finished) {
    // might be too strong
    assert(tensor.io.writeback.bits.rd === 31.U)
  }
}
|
||||
|
||||
/** A minimal Diplomacy graph with a tensor core and a TLRAM.
  *
  * Both tensor core edges go through one crossbar into a single RAM.
  */
class TensorCoreDecoupledTLRAM(implicit p: Parameters) extends LazyModule {
  val tensor = LazyModule(new TensorCoreDecoupledTL)
  val xbar = LazyModule(new TLXbar)
  val ram = LazyModule(new TLRAM(
    address = AddressSet(0x0000, 0xffffff),
    beatBytes = 32 // @cleanup: hardcoded
  ))

  // RAM side is bound first, then the tensor core side
  ram.node :=* xbar.node
  xbar.node :=* tensor.node

  lazy val module = new Impl
  class Impl extends LazyModuleImp(this) with UnitTestModule {
    // forward the unit test handshake to the wrapped tensor core
    tensor.module.io.start := io.start
    io.finished := tensor.module.io.finished
  }
}
|
||||
|
||||
/** Two separate TLRAMs for A and B for full throughput.
  *
  * The path to `ramA` goes through an identity node (`stutter`) on which
  * stalls can be injected when `fuzz` is enabled.
  */
class TensorCoreDecoupledTwoTLRAM(implicit p: Parameters) extends LazyModule {
  val tensor = LazyModule(new TensorCoreDecoupledTL)
  val xbar = LazyModule(new TLXbar)
  // the two address sets differ only in address bit 10 (0x400)
  val ramA = LazyModule(new TLRAM(
    address = AddressSet(0x000, 0xfffbff),
    beatBytes = 32 // @cleanup: hardcoded
  ))
  val ramB = LazyModule(new TLRAM(
    address = AddressSet(0x400, 0xfffbff),
    beatBytes = 32 // @cleanup: hardcoded
  ))

  val stutter = new TLIdentityNode
  xbar.node :=* tensor.node
  ramA.node := stutter
  stutter := xbar.node
  ramB.node := xbar.node

  // set to true to inject stalls on the A-channel towards ramA
  val fuzz = false

  lazy val module = new Impl
  class Impl extends LazyModuleImp(this) with UnitTestModule {
    tensor.module.io.start := io.start
    io.finished := tensor.module.io.finished

    val (tlIn, _) = stutter.in(0)
    val (tlOut, _) = stutter.out(0)
    require(stutter.in.length == 1)
    require(stutter.out.length == 1)

    // inject stalls for fuzzing
    val incr = Wire(Bool())
    val (count, _) = Counter(incr, 0x1000)
    // stall whenever any of the low three counter bits is set
    def cond(x: UInt) = {
      val lowBits = ((1 << 3) - 1).U
      (x & lowBits) =/= 0.U
    }
    val stall = if (fuzz) cond(count) else false.B

    // pass-through connection; the stall override below relies on
    // last-connect semantics and therefore has to come afterwards
    tlOut.a <> tlIn.a
    tlIn.d <> tlOut.d
    incr := tlIn.a.fire || stall
    when (stall) {
      tlIn.a.ready := false.B
      tlOut.a.valid := false.B
    }
  }
}
|
||||
|
||||
/** Unit test harness.
  *
  * Elaborates the two-RAM test graph and forwards the start/finished
  * handshake of the unit test framework to it.
  *
  * @param timeout timeout passed on to `UnitTest`
  */
class TensorCoreDecoupledTest(timeout: Int = 500000)(implicit p: Parameters)
    extends UnitTest(timeout) {
  // single-RAM alternative:
  // val dut = Module(LazyModule(new TensorCoreDecoupledTLRAM).module)
  val dut = Module(LazyModule(new TensorCoreDecoupledTwoTLRAM).module)

  dut.io.start := io.start
  io.finished := dut.io.finished
}
|
||||
@@ -27,11 +27,13 @@ class TensorDotProductUnit(val half: Boolean) extends Module with tile.HasFPUPar
|
||||
val b = Vec(dotProductDim, Bits((inFLen).W))
|
||||
val c = Bits((outFLen).W) // note C has the out length for accumulation
|
||||
}))
|
||||
// 'stall' is effectively out.ready, combinationally coupled to in.ready
|
||||
val stall = Input(Bool())
|
||||
val out = Valid(new Bundle {
|
||||
val data = Bits((outFLen).W)
|
||||
})
|
||||
})
|
||||
dontTouch(io)
|
||||
|
||||
// [IEEE] -> recode() -> unbox() -> [Hardfloat] -> box() -> ieee() -> [IEEE]
|
||||
// make sure recoding/uncoding happens only at the edge, not at every
|
||||
@@ -52,7 +54,7 @@ class TensorDotProductUnit(val half: Boolean) extends Module with tile.HasFPUPar
|
||||
io.out.bits.data := ieee(box(dpu.io.out.bits.data, S))
|
||||
}
|
||||
|
||||
// Copied from chisel3.util.Pipe.
|
||||
// An implementation of chisel3.util.Pipe that supports stalls.
|
||||
class StallingPipe[T <: Data](val gen: T, val latency: Int = 1) extends Module {
|
||||
/** A non-ambiguous name of this `StallingPipe` for use in generated Verilog
|
||||
* names. Includes the latency cycle count in the name as well as the
|
||||
|
||||
@@ -372,7 +372,8 @@ class SourceGenerator[T <: Data](
|
||||
outstanding := outstanding + 1.U
|
||||
}
|
||||
}.elsewhen(io.reclaim.valid) {
|
||||
assert(outstanding > 0.U)
|
||||
assert(outstanding > 0.U,
|
||||
"Over-reclaim. Did some responses get dropped?")
|
||||
outstanding := outstanding - 1.U
|
||||
}
|
||||
dontTouch(outstanding)
|
||||
|
||||
@@ -379,6 +379,12 @@ class RadianceTile private (
|
||||
tlMasterXbar.node :=* AddressOrNode(base) :=* dcacheNode
|
||||
}
|
||||
|
||||
// Instantiate a fake TensorCoreDecoupled module to force unique-ification of
|
||||
// module names in the Chisel-generated Verilog. This should be disabled for
|
||||
// synthesis runs
|
||||
val tensor = LazyModule(new radiance.core.TensorCoreDecoupledTL)
|
||||
tlMasterXbar.node :=* tensor.node
|
||||
|
||||
/* below are copied from rocket */
|
||||
|
||||
val tile_master_blocker =
|
||||
@@ -733,7 +739,7 @@ class RadianceTileModuleImp(outer: RadianceTile)
|
||||
}
|
||||
}
|
||||
|
||||
def connectTc {
|
||||
def connectTensor = {
|
||||
val tcb0 = new {
|
||||
val addr = core.io.tc_a_bits_address(31, 0)
|
||||
val tag = core.io.tc_a_bits_tag(3, 0)
|
||||
@@ -752,16 +758,18 @@ class RadianceTileModuleImp(outer: RadianceTile)
|
||||
val adapter = Module(
|
||||
new VortexTLAdapter(
|
||||
outer.smemSourceWidth,
|
||||
new VortexBundleA(tagWidth = 1, dataWidth = 32 * 8),
|
||||
new VortexBundleD(tagWidth = 1, dataWidth = 32 * 8),
|
||||
new VortexBundleA(tagWidth = 4, dataWidth = 32 * 8),
|
||||
new VortexBundleD(tagWidth = 4, dataWidth = 32 * 8),
|
||||
client
|
||||
)
|
||||
)
|
||||
require(adapter.io.inReq.bits.source.widthOption.get == bundle.tag.widthOption.get)
|
||||
require(adapter.io.inReq.bits.address.widthOption.get == bundle.addr.widthOption.get)
|
||||
adapter.io.inReq.bits <> DontCare
|
||||
adapter.io.inReq.valid := bundle.aValid
|
||||
adapter.io.inReq.bits.address := bundle.addr
|
||||
adapter.io.inReq.bits.source := bundle.tag
|
||||
adapter.io.inReq.bits.size := 5.U
|
||||
adapter.io.inReq.bits.size := 5.U // 256 bits
|
||||
adapter.io.inReq.bits.opcode := TLMessages.Get
|
||||
adapter.io.inReq.bits.mask := x"ffffffff".U
|
||||
adapter.io.inResp.ready := bundle.dReady
|
||||
@@ -774,6 +782,8 @@ class RadianceTileModuleImp(outer: RadianceTile)
|
||||
core.io.tc_d_valid := Cat(adapters.last.io.inResp.valid, adapters.head.io.inResp.valid)
|
||||
core.io.tc_d_bits_data := Cat(adapters.last.io.inResp.bits.data, adapters.head.io.inResp.bits.data)
|
||||
core.io.tc_d_bits_tag := Cat(adapters.last.io.inResp.bits.source, adapters.head.io.inResp.bits.source)
|
||||
require(core.io.tc_d_bits_data.widthOption.get == adapters.head.io.inResp.bits.data.widthOption.get * 2)
|
||||
require(core.io.tc_d_bits_tag.widthOption.get == adapters.head.io.inResp.bits.source.widthOption.get * 2)
|
||||
}
|
||||
|
||||
def connectBarrier = {
|
||||
@@ -790,7 +800,7 @@ class RadianceTileModuleImp(outer: RadianceTile)
|
||||
outer.barrierMasterNode.out(0)._1.resp.ready := true.B
|
||||
}
|
||||
|
||||
def connectAccelerator: Unit = {
|
||||
def connectAccelerator = {
|
||||
outer.accMasterNode.out.head._1.cmd.bits := core.io.acc_write_out
|
||||
outer.accMasterNode.out.head._1.cmd.valid := core.io.acc_write_en
|
||||
core.io.acc_read_in := outer.accMasterNode.out.head._1.status
|
||||
@@ -831,7 +841,7 @@ class RadianceTileModuleImp(outer: RadianceTile)
|
||||
connectImem
|
||||
connectDmem
|
||||
connectSmem
|
||||
connectTc
|
||||
connectTensor
|
||||
connectBarrier
|
||||
connectAccelerator
|
||||
}
|
||||
@@ -839,6 +849,9 @@ class RadianceTileModuleImp(outer: RadianceTile)
|
||||
// TODO: generalize for useVxCache
|
||||
if (!outer.radianceParams.useVxCache) {}
|
||||
|
||||
// tie io.start of the fake TensorCoreDecoupled module low; io.finish is not connected here
|
||||
outer.tensor.module.io.start := false.B
|
||||
|
||||
// // RoCC
|
||||
// if (outer.roccs.size > 0) {
|
||||
// val (respArb, cmdRouter) = {
|
||||
|
||||
@@ -137,7 +137,7 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters)
|
||||
"NUM_THREADS" -> tile.numLsuLanes
|
||||
)
|
||||
)
|
||||
with HasBlackBoxResource {
|
||||
with HasBlackBoxResource with HasBlackBoxPath {
|
||||
// addResource("/vsrc/vortex/hw/unit_tests/generic_queue/testbench.v")
|
||||
// addResource("/vsrc/vortex/hw/unit_tests/VX_divide_tb.v")
|
||||
// addResource("/vsrc/vortex/hw/syn/synopsys/models/memory/cln28hpm/rf2_256x19_wm0/rf2_256x19_wm0_rtl.v")
|
||||
@@ -242,8 +242,6 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters)
|
||||
// addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_arb.sv")
|
||||
// addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_unit.sv")
|
||||
|
||||
addResource("/vsrc/vortex/hw/rtl/mem/VX_tc_bus_if.sv")
|
||||
|
||||
addResource("/vsrc/vortex/hw/rtl/libs/VX_allocator.sv")
|
||||
// addResource("/vsrc/vortex/hw/rtl/libs/VX_avs_adapter.sv")
|
||||
// addResource("/vsrc/vortex/hw/rtl/libs/VX_axi_adapter.sv")
|
||||
@@ -407,7 +405,37 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters)
|
||||
|
||||
// tensor core
|
||||
addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_core.sv")
|
||||
addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_hopper_core.sv")
|
||||
addResource("/vsrc/vortex/hw/rtl/mem/VX_tc_bus_if.sv")
|
||||
// addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_ucode.vh")
|
||||
/** Registers the generated SystemVerilog collateral files listed below
  * (TensorCoreDecoupled.sv and the other files from the same gen-collateral
  * directory) with this black box via `addPath`, one call per file, in the
  * listed order.
  *
  * NOTE(review): `collateralDir` is an absolute path into one user's build
  * tree; it is kept verbatim here but now lives in a single place so it can
  * be relocated with a one-line change.
  */
def addHopperTensorCore: Unit = {
  // Directory holding the generated collateral; every path below is
  // s"$collateralDir/<file>".
  val collateralDir =
    "/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral"

  // File names in the same order as the original addPath calls.
  val collateralFiles = Seq(
    "AddRawFN.sv",
    "AddRecFN.sv",
    "DotProductPipe.sv",
    "FillBuffer_1.sv",
    "FillBuffer.sv",
    "metadataTable_4x5.sv",
    "MulFullRawFN.sv",
    "occupancyTable_4x1.sv",
    "Queue1_TensorCoreDecoupled_Anon_1.sv",
    "Queue1_TensorCoreDecoupled_Anon.sv",
    "Queue1_TensorMemTag.sv",
    "Queue4_TensorMemRespWithTag.sv",
    "Queue5_TensorComputeTag.sv",
    "ram_4x261.sv",
    "ram_5x7.sv",
    "RoundAnyRawFNToRecFN_ie8_is26_oe8_os24.sv",
    "RoundAnyRawFNToRecFN_ie8_is47_oe8_os24.sv",
    "RoundRawFNToRecFN_e8_s24.sv",
    "SimpleTimer.sv",
    "SourceGenerator.sv",
    "StallingPipe_1.sv",
    "StallingPipe_2.sv",
    "StallingPipe.sv",
    "TensorCoreDecoupled.sv",
    "TensorDotProductUnit.sv"
  )

  collateralFiles.foreach(file => addPath(s"$collateralDir/$file"))
}
|
||||
// addHopperTensorCore
|
||||
addResource("/vsrc/vortex/hw/rtl/core/VX_uop_sequencer.sv")
|
||||
addResource("/vsrc/vortex/hw/rtl/core/VX_reduce_unit.sv")
|
||||
addResource("/vsrc/vortex/hw/rtl/fpu/VX_tensor_dpu.sv")
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
// See LICENSE.SiFive for license details.
|
||||
|
||||
package radiance.memory
|
||||
package radiance.unittest
|
||||
|
||||
import chisel3._
|
||||
import org.chipsalliance.cde.config._
|
||||
@@ -8,6 +8,8 @@ import freechips.rocketchip.subsystem.{BaseSubsystemConfig}
|
||||
import freechips.rocketchip.devices.tilelink._
|
||||
import freechips.rocketchip.tilelink._
|
||||
import freechips.rocketchip.util._
|
||||
import radiance.core.TensorCoreDecoupledTest
|
||||
import radiance.memory._
|
||||
import radiance.subsystem.WithSimtConfig
|
||||
import freechips.rocketchip.unittest._
|
||||
//import rocket.VortexFatBankTest
|
||||
@@ -17,6 +19,16 @@ case object TestDurationMultiplier extends Field[Int]
|
||||
class WithTestDuration(x: Int) extends Config((site, here, up) => {
|
||||
case TestDurationMultiplier => x
|
||||
})
|
||||
|
||||
/** Config fragment that sets [[UnitTests]] to a one-element list containing
  * a [[TensorCoreDecoupledTest]].
  *
  * The timeout handed to the test is 50000 multiplied by the site's
  * [[TestDurationMultiplier]].
  */
class WithTensorUnitTests extends Config((site, _, _) => {
  case UnitTests => (q: Parameters) => {
    // Implicit definitions carry an explicit type instead of relying on
    // inference.
    implicit val p: Parameters = q
    val timeout = 50000 * site(TestDurationMultiplier)
    Seq(
      Module(new TensorCoreDecoupledTest(timeout = timeout))
    )
  }
})
|
||||
|
||||
class WithCoalescingUnitTests extends Config((site, _, _) => {
|
||||
case UnitTests => (q: Parameters) => {
|
||||
implicit val p = q
|
||||
@@ -52,12 +64,34 @@ class WithCoalescingUnitSynthesisDummy(nLanes: Int) extends Config((site, _, _)
|
||||
) }
|
||||
})
|
||||
|
||||
class CoalescingUnitTestConfig extends Config(new WithCoalescingUnitTests ++ new WithTestDuration(10) ++ new WithSimtConfig(nMemLanes=4) ++ new BaseSubsystemConfig)
|
||||
/** Unit-test configuration that stacks [[WithTensorUnitTests]] and a test
  * duration multiplier of 10 on top of [[BaseSubsystemConfig]].
  */
class TensorUnitTestConfig
    extends Config(
      new WithTensorUnitTests
        ++ new WithTestDuration(10)
        ++ new BaseSubsystemConfig
    )
|
||||
|
||||
/** Unit-test configuration that stacks [[WithCoalescingUnitTests]], a test
  * duration multiplier of 10 and [[WithSimtConfig]] with `nMemLanes = 4` on
  * top of [[BaseSubsystemConfig]].
  */
class CoalescingUnitTestConfig
    extends Config(
      new WithCoalescingUnitTests
        ++ new WithTestDuration(10)
        ++ new WithSimtConfig(nMemLanes = 4)
        ++ new BaseSubsystemConfig
    )
|
||||
|
||||
//class VortexFatBankUnitTestConfig extends Config(new WithVortexFatBankUnitTests ++ new WithTestDuration(10) ++ new WithSimtConfig(nLanes=4) ++ new BaseSubsystemConfig)
|
||||
|
||||
// Dummy configs of various sizes for synthesis
|
||||
class CoalescingSynthesisDummyLane4Config extends Config(new WithCoalescingUnitSynthesisDummy(4) ++ new WithTestDuration(10) ++ new BaseSubsystemConfig)
|
||||
class CoalescingSynthesisDummyLane8Config extends Config(new WithCoalescingUnitSynthesisDummy(8) ++ new WithTestDuration(10) ++ new BaseSubsystemConfig)
|
||||
class CoalescingSynthesisDummyLane16Config extends Config(new WithCoalescingUnitSynthesisDummy(16) ++ new WithTestDuration(10) ++ new BaseSubsystemConfig)
|
||||
class CoalescingSynthesisDummyLane32Config extends Config(new WithCoalescingUnitSynthesisDummy(32) ++ new WithTestDuration(10) ++ new BaseSubsystemConfig)
|
||||
/** Synthesis dummy configuration: [[WithCoalescingUnitSynthesisDummy]] with
  * 4 lanes and a test duration multiplier of 10 on top of
  * [[BaseSubsystemConfig]].
  */
class CoalescingSynthesisDummyLane4Config
    extends Config(
      new WithCoalescingUnitSynthesisDummy(4)
        ++ new WithTestDuration(10)
        ++ new BaseSubsystemConfig
    )
|
||||
/** Synthesis dummy configuration: [[WithCoalescingUnitSynthesisDummy]] with
  * 8 lanes and a test duration multiplier of 10 on top of
  * [[BaseSubsystemConfig]].
  */
class CoalescingSynthesisDummyLane8Config
    extends Config(
      new WithCoalescingUnitSynthesisDummy(8)
        ++ new WithTestDuration(10)
        ++ new BaseSubsystemConfig
    )
|
||||
/** Synthesis dummy configuration: [[WithCoalescingUnitSynthesisDummy]] with
  * 16 lanes and a test duration multiplier of 10 on top of
  * [[BaseSubsystemConfig]].
  */
class CoalescingSynthesisDummyLane16Config
    extends Config(
      new WithCoalescingUnitSynthesisDummy(16)
        ++ new WithTestDuration(10)
        ++ new BaseSubsystemConfig
    )
|
||||
/** Synthesis dummy configuration: [[WithCoalescingUnitSynthesisDummy]] with
  * 32 lanes and a test duration multiplier of 10 on top of
  * [[BaseSubsystemConfig]].
  */
class CoalescingSynthesisDummyLane32Config
    extends Config(
      new WithCoalescingUnitSynthesisDummy(32)
        ++ new WithTestDuration(10)
        ++ new BaseSubsystemConfig
    )
|
||||
|
||||
26
src/test/scala/radiance/TensorCoreDecoupledTest.scala
Normal file
26
src/test/scala/radiance/TensorCoreDecoupledTest.scala
Normal file
@@ -0,0 +1,26 @@
|
||||
package radiance.core
|
||||
|
||||
import chisel3._
|
||||
import chisel3.util._
|
||||
import chiseltest._
|
||||
import org.scalatest.flatspec.AnyFlatSpec
|
||||
|
||||
/** ChiselTest spec for [[TensorCoreDecoupled]].
  *
  * Drives `initiate` valid for warp 0 with both response channels idle, steps
  * the clock once, and expects `writeback.valid` to be asserted.
  *
  * NOTE(review): the unit-test config imports
  * `radiance.core.TensorCoreDecoupledTest` and constructs it with a `timeout`
  * argument, which suggests another class with this fully-qualified name
  * exists in the main sources; confirm the two do not clash.
  */
class TensorCoreDecoupledTest extends AnyFlatSpec with ChiselScalatestTester {
  behavior of "TensorCoreDecoupled"

  it should "do the right thing" in {
    // TensorCoreDecoupled's constructor is (numWarps, numLanes, numSourceIds,
    // tilingParams, numFPRegs = 32); numSourceIds has no default, so it must
    // be passed explicitly. Named arguments keep the call unambiguous.
    // NOTE(review): numSourceIds = 4 is a test value chosen here; confirm it
    // matches the intended configuration.
    test(
      new TensorCoreDecoupled(
        numWarps = 8,
        numLanes = 8,
        numSourceIds = 4,
        tilingParams = TensorTilingParams()
      )
    ) { c =>
      // Request an operation for warp 0.
      c.io.initiate.valid.poke(true.B)
      c.io.initiate.bits.wid.poke(0.U)

      // Keep both response channels idle.
      c.io.respA.valid.poke(false.B)
      c.io.respA.bits.data.poke(0.U)
      c.io.respB.valid.poke(false.B)
      c.io.respB.bits.data.poke(0.U)

      c.clock.step()
      c.io.writeback.valid.expect(true.B)
    }
  }
}
|
||||
Reference in New Issue
Block a user