tensor: Use chisel-generated dpu module
This commit is contained in:
@@ -42,17 +42,120 @@ module VX_tensor_dpu #(
|
||||
// ready as soon as valid_out
|
||||
assign ready_in = ready_reg || valid_out;
|
||||
|
||||
// fixed-latency model
|
||||
// fully pipelined; always ready
|
||||
// assign ready_in = 1'b1;
|
||||
|
||||
// wire dpu_valid;
|
||||
// wire [31:0] dpu_data;
|
||||
// TensorDotProductUnit dpu_pipe (
|
||||
// .clock (clk),
|
||||
// .reset (reset),
|
||||
// .io_in_valid (valid_in && ready_in),
|
||||
// .io_in_bits_a_0 (32'h40000000),
|
||||
// .io_in_bits_a_1 (32'h40000000),
|
||||
// .io_in_bits_a_2 (32'h40000000),
|
||||
// .io_in_bits_a_3 (32'h40000000),
|
||||
// .io_in_bits_b_0 (32'h40000000),
|
||||
// .io_in_bits_b_1 (32'h40000000),
|
||||
// .io_in_bits_b_2 (32'h40000000),
|
||||
// .io_in_bits_b_3 (32'h40000000),
|
||||
// .io_in_bits_c (32'h3f800000),
|
||||
// .io_out_valid (dpu_valid),
|
||||
// .io_out_bits_data (dpu_data)
|
||||
// );
|
||||
|
||||
logic [1:0] threadgroup_valids;
|
||||
// B_tile is shared across the two threadgroups; see Figure 13
|
||||
VX_tensor_threadgroup #(
|
||||
) threadgroup_0 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && ready_in),
|
||||
.stall (stall),
|
||||
.A_frag (A_tile[1:0]),
|
||||
.B_frag (B_tile),
|
||||
.C_frag (C_tile[1:0]),
|
||||
.valid_out (threadgroup_valids[0]),
|
||||
.D_frag (D_tile[1:0])
|
||||
);
|
||||
VX_tensor_threadgroup #(
|
||||
) threadgroup_1 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && ready_in),
|
||||
.stall (stall),
|
||||
.A_frag (A_tile[3:2]),
|
||||
.B_frag (B_tile),
|
||||
.C_frag (C_tile[3:2]),
|
||||
.valid_out (threadgroup_valids[1]),
|
||||
.D_frag (D_tile[3:2])
|
||||
);
|
||||
|
||||
// fixed-latency queue
|
||||
VX_shift_register #(
|
||||
.DATAW (1 + $bits(wid) + $bits(D_tile)),
|
||||
.DEPTH (`LATENCY_HMMA),
|
||||
.DATAW (1 + $bits(wid)/* + $bits(D_tile)*/),
|
||||
// .DEPTH (`LATENCY_HMMA),
|
||||
.DEPTH (2),
|
||||
.RESETW (1)
|
||||
) shift_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~stall),
|
||||
.data_in ({valid_in && ready_in, wid, result_hmma}),
|
||||
.data_out ({valid_out, D_wid, D_tile})
|
||||
.data_in ({valid_in && ready_in, wid /*, result_hmma*/}),
|
||||
.data_out ({valid_out, D_wid/*, D_tile */})
|
||||
);
|
||||
|
||||
// FIXME: breaks when stall is on!
|
||||
`RUNTIME_ASSERT(reset || (&(threadgroup_valids) == valid_out),
|
||||
("FEDP and metadata queue went out of sync!"))
|
||||
endmodule
|
||||
|
||||
// does (m,n,k) = (2,4,2) matmul compute over 2 cycles.
|
||||
// matches Figure 10(b) of the paper.
|
||||
module VX_tensor_threadgroup #(
|
||||
) (
|
||||
input clk,
|
||||
input reset,
|
||||
|
||||
input valid_in,
|
||||
input stall,
|
||||
input [1:0][1:0][31:0] A_frag,
|
||||
input [1:0][3:0][31:0] B_frag,
|
||||
input [1:0][3:0][31:0] C_frag,
|
||||
|
||||
output valid_out,
|
||||
output [1:0][3:0][31:0] D_frag
|
||||
|
||||
);
|
||||
// 4 FEDPs per threadgroup
|
||||
// FIXME: experimenting with 8 FEDPs first
|
||||
logic [1:0][3:0] valids;
|
||||
for (genvar D_row = 0; D_row < 2; ++D_row) begin
|
||||
for (genvar D_col = 0; D_col < 4; ++D_col) begin
|
||||
// four-element dot product (FEDP) unit
|
||||
TensorDotProductUnit fedp (
|
||||
.clock (clk),
|
||||
.reset (reset),
|
||||
.io_in_valid (valid_in),
|
||||
.io_in_bits_a_0 (A_frag[D_row][0]),
|
||||
.io_in_bits_a_1 (A_frag[D_row][1]),
|
||||
.io_in_bits_a_2 (32'h0),
|
||||
.io_in_bits_a_3 (32'h0),
|
||||
.io_in_bits_b_0 (B_frag[0][D_col]),
|
||||
.io_in_bits_b_1 (B_frag[1][D_col]),
|
||||
.io_in_bits_b_2 (32'h0),
|
||||
.io_in_bits_b_3 (32'h0),
|
||||
.io_in_bits_c (C_frag[D_row][D_col]),
|
||||
.io_stall (1'b0), // FIXME
|
||||
.io_out_valid (valids[D_row][D_col]),
|
||||
.io_out_bits_data (D_frag[D_row][D_col])
|
||||
);
|
||||
end
|
||||
end
|
||||
|
||||
assign valid_out = (&(valids[0])) && (&(valids[1]));
|
||||
|
||||
`RUNTIME_ASSERT(reset || !stall, ("stall not supported yet in tensor dpu!"))
|
||||
endmodule
|
||||
|
||||
`endif
|
||||
|
||||
Reference in New Issue
Block a user