tensor: Parameterize result buffer depth
This commit is contained in:
@@ -298,7 +298,11 @@ endmodule
|
|||||||
|
|
||||||
module VX_tensor_octet #(
|
module VX_tensor_octet #(
|
||||||
parameter ISW,
|
parameter ISW,
|
||||||
parameter OCTET
|
parameter OCTET,
|
||||||
|
// RESULT_BUFFER_DEPTH = 2 gives good performance by absorbing commit
|
||||||
|
// backpressure (result_ready), although the value is arbitrary.
|
||||||
|
// RESULT_BUFFER_DEPTH = 0 eliminates result buffering.
|
||||||
|
parameter RESULT_BUFFER_DEPTH = 2
|
||||||
) (
|
) (
|
||||||
input clk,
|
input clk,
|
||||||
input reset,
|
input reset,
|
||||||
@@ -488,6 +492,7 @@ module VX_tensor_octet #(
|
|||||||
.D_wid(D_wid_dpu)
|
.D_wid(D_wid_dpu)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
if (RESULT_BUFFER_DEPTH > 0) begin
|
||||||
wire outbuf_empty;
|
wire outbuf_empty;
|
||||||
wire outbuf_full;
|
wire outbuf_full;
|
||||||
// backpressure from commit
|
// backpressure from commit
|
||||||
@@ -497,14 +502,14 @@ module VX_tensor_octet #(
|
|||||||
wire outbuf_enq = outbuf_ready_in && dpu_valid;
|
wire outbuf_enq = outbuf_ready_in && dpu_valid;
|
||||||
wire outbuf_deq = result_valid && result_ready;
|
wire outbuf_deq = result_valid && result_ready;
|
||||||
|
|
||||||
// result buffer to stage the D tile for 2 cycles until commit/writeback
|
// Result buffer that stages the D tile for 2 cycles until
|
||||||
// is complete. This decouples the irregular dpu output traffic from the
|
// commit/writeback is complete. This decouples the irregular dpu
|
||||||
// regular, every-2-cycle commit traffic to ensure the commit pipeline is
|
// output traffic from the regular, every-2-cycle commit traffic to
|
||||||
// used more efficiently.
|
// ensure the commit pipeline is used more efficiently.
|
||||||
// FIXME: unnecessary?
|
// FIXME: unnecessary?
|
||||||
VX_fifo_queue #(
|
VX_fifo_queue #(
|
||||||
.DATAW ($bits(D_wid) + $bits(D_out)),
|
.DATAW ($bits(D_wid) + $bits(D_out)),
|
||||||
.DEPTH (2 /* arbitrary */)
|
.DEPTH (RESULT_BUFFER_DEPTH) // 2 works well
|
||||||
) output_buffer (
|
) output_buffer (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
@@ -519,8 +524,18 @@ module VX_tensor_octet #(
|
|||||||
`UNUSED_PIN(size)
|
`UNUSED_PIN(size)
|
||||||
);
|
);
|
||||||
|
|
||||||
// FIXME: this shouldn't be necessary
|
// FIXME: overly strict; this assertion firing does not necessarily indicate a bug
|
||||||
`RUNTIME_ASSERT(reset || !outbuf_full, ("dpu result queue is full!"))
|
`RUNTIME_ASSERT(reset || !outbuf_full, ("dpu result queue is full!"))
|
||||||
|
end else begin
|
||||||
|
// XXX: this depends on the assumption that commit stage only asserts
|
||||||
|
// result_ready when result_valid is true
|
||||||
|
assign outbuf_ready_in = !result_valid || result_ready;
|
||||||
|
assign result_valid = dpu_valid;
|
||||||
|
|
||||||
|
// make direct connections
|
||||||
|
assign D_wid = D_wid_dpu;
|
||||||
|
assign D_out = D_tile;
|
||||||
|
end
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
logic [`PERF_CTR_BITS-1:0] perf_tensor_dpu_total;
|
logic [`PERF_CTR_BITS-1:0] perf_tensor_dpu_total;
|
||||||
|
|||||||
@@ -250,7 +250,7 @@ module VX_tensor_threadgroup #(
|
|||||||
for (genvar i = 0; i < 4; ++i) begin
|
for (genvar i = 0; i < 4; ++i) begin
|
||||||
localparam int d_row = i / 2;
|
localparam int d_row = i / 2;
|
||||||
localparam int d_col = (i % 2) * 2;
|
localparam int d_col = (i % 2) * 2;
|
||||||
// four-element dot product (FEDP) unit
|
// Dot product (FEDP) unit generated from Chisel
|
||||||
TensorDotProductUnit fedp (
|
TensorDotProductUnit fedp (
|
||||||
.clock (clk),
|
.clock (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
|
|||||||
Reference in New Issue
Block a user