tensor: more doc
This commit is contained in:
@@ -74,6 +74,11 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
|
||||
assign decode_if.ibuf_pop[i] = uop_sequencer_if[i].valid && uop_sequencer_if[i].ready;
|
||||
`endif
|
||||
|
||||
// tensor-core operation is controlled by a single macro-instruction at
|
||||
// the ISA level; internally, the uop_sequencer blitzes micro-ops (counterpart
|
||||
// to Volta SASS set/step instructions) into the ibuffer upon encountering
|
||||
// this macro-instruction. this becomes a pass-through for non-tensorcore
|
||||
// instructions.
|
||||
VX_uop_sequencer uop_sequencer (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
|
||||
@@ -27,6 +27,7 @@ module VX_reduce_ext #(
|
||||
input wire [`INST_RED_BITS-1:0] op_type,
|
||||
output wire [DATAW_OUT-1:0] data_out
|
||||
);
|
||||
// recursive binary reduction
|
||||
if (N == 1) begin
|
||||
`UNUSED_VAR(op_type)
|
||||
`UNUSED_VAR(mask)
|
||||
|
||||
@@ -211,12 +211,11 @@ module VX_tensor_octet #(
|
||||
// half the inputs are buffered, half are not (instead coming straight
|
||||
// from the operand bus), unlike the real tensor core.
|
||||
// the banks are only 32 bit rather than 64 bit (a pair of fp32 regs).
|
||||
// since A and B are supplied by 4 lanes each, we get 4 fp32's at a time
|
||||
// (8 for C).
|
||||
logic [3:0][31:0] A_half;
|
||||
logic [3:0][31:0] B_half;
|
||||
logic [7:0][31:0] C_half;
|
||||
always @(*) begin
|
||||
// note that not all lanes participate at every step
|
||||
case (step)
|
||||
2'b00: begin
|
||||
A_half = { A_in[5:4], A_in[1:0] };
|
||||
@@ -268,7 +267,6 @@ module VX_tensor_octet #(
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
wire stall = result_valid && ~result_ready;
|
||||
assign operands_ready = ~stall;
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
// uop metadata (sequencing, next state), execution metadata (EX_TYPE, OP_TYPE, OP_MOD), wb, use pc, use imm, pc, imm, rd, rs1, rs2, rs3
|
||||
HMMA_SET0_STEP0_0: begin
|
||||
uop = {NEXT, HMMA_SET0_STEP0_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(16), `FREG(0), `FREG(8), `FREG(16)};
|
||||
end
|
||||
|
||||
@@ -119,6 +119,7 @@ module VX_uop_sequencer import VX_gpu_pkg::*; (
|
||||
uop[UOP_TABLE_WIDTH-UBR_BITS-UPC_BITS-1:0]
|
||||
};
|
||||
|
||||
// passthrough when !use_uop
|
||||
assign ibuffer_if.valid = use_uop ? 1'b1 : uop_sequencer_if.valid;
|
||||
assign uop_sequencer_if.ready = use_uop ? (uop_fire && ubr == FINISH) : ibuffer_if.ready;
|
||||
assign ibuffer_if.data = use_uop ? ibuffer_output : uop_sequencer_if.data;
|
||||
|
||||
Reference in New Issue
Block a user