texture unit hardware optimizations

This commit is contained in:
Blaise Tine
2021-12-02 10:22:21 -08:00
parent 4477cbeed1
commit 38f166f090
9 changed files with 208 additions and 106 deletions

View File

@@ -44,6 +44,7 @@ module VX_lsu_unit #(
wire [`NUM_THREADS-1:0][`CACHE_ADDR_TYPE_BITS-1:0] lsu_addr_type, req_addr_type; wire [`NUM_THREADS-1:0][`CACHE_ADDR_TYPE_BITS-1:0] lsu_addr_type, req_addr_type;
// full address calculation
wire [`NUM_THREADS-1:0][31:0] full_addr; wire [`NUM_THREADS-1:0][31:0] full_addr;
for (genvar i = 0; i < `NUM_THREADS; i++) begin for (genvar i = 0; i < `NUM_THREADS; i++) begin
assign full_addr[i] = lsu_req_if.base_addr[i] + lsu_req_if.offset; assign full_addr[i] = lsu_req_if.base_addr[i] + lsu_req_if.offset;

View File

@@ -64,7 +64,8 @@ module VX_writeback #(
VX_stream_arbiter #( VX_stream_arbiter #(
.NUM_REQS (NUM_RSPS), .NUM_REQS (NUM_RSPS),
.DATAW (DATAW), .DATAW (DATAW),
.TYPE ("P") .BUFFERED (1),
.TYPE ("R")
) rsp_arb ( ) rsp_arb (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),

View File

@@ -17,6 +17,7 @@ module VX_tex_addr #(
input wire [`TEX_FILTER_BITS-1:0] req_filter, input wire [`TEX_FILTER_BITS-1:0] req_filter,
input wire [1:0][`TEX_WRAP_BITS-1:0] req_wraps, input wire [1:0][`TEX_WRAP_BITS-1:0] req_wraps,
input wire [`TEX_ADDR_BITS-1:0] req_baseaddr, input wire [`TEX_ADDR_BITS-1:0] req_baseaddr,
input wire [NUM_REQS-1:0][`TEX_LOD_BITS-1:0] mip_level,
input wire [NUM_REQS-1:0][`TEX_MIPOFF_BITS-1:0] req_mipoff, input wire [NUM_REQS-1:0][`TEX_MIPOFF_BITS-1:0] req_mipoff,
input wire [NUM_REQS-1:0][1:0][`TEX_LOD_BITS-1:0] req_logdims, input wire [NUM_REQS-1:0][1:0][`TEX_LOD_BITS-1:0] req_logdims,
input wire [REQ_INFOW-1:0] req_info, input wire [REQ_INFOW-1:0] req_info,
@@ -28,6 +29,7 @@ module VX_tex_addr #(
output wire [NUM_REQS-1:0] rsp_tmask, output wire [NUM_REQS-1:0] rsp_tmask,
output wire [`TEX_FILTER_BITS-1:0] rsp_filter, output wire [`TEX_FILTER_BITS-1:0] rsp_filter,
output wire [`TEX_LGSTRIDE_BITS-1:0] rsp_lgstride, output wire [`TEX_LGSTRIDE_BITS-1:0] rsp_lgstride,
output wire [NUM_REQS-1:0][31:0] rsp_baseaddr,
output wire [NUM_REQS-1:0][3:0][31:0] rsp_addr, output wire [NUM_REQS-1:0][3:0][31:0] rsp_addr,
output wire [NUM_REQS-1:0][1:0][`TEX_BLEND_FRAC-1:0] rsp_blends, output wire [NUM_REQS-1:0][1:0][`TEX_BLEND_FRAC-1:0] rsp_blends,
output wire [REQ_INFOW-1:0] rsp_info, output wire [REQ_INFOW-1:0] rsp_info,
@@ -38,6 +40,7 @@ module VX_tex_addr #(
localparam SHIFT_BITS = $clog2(`TEX_FXD_FRAC+1); localparam SHIFT_BITS = $clog2(`TEX_FXD_FRAC+1);
localparam PITCH_BITS = `MAX(`TEX_LOD_BITS, `TEX_LGSTRIDE_BITS) + 1; localparam PITCH_BITS = `MAX(`TEX_LOD_BITS, `TEX_LGSTRIDE_BITS) + 1;
localparam SCALED_DIM = `TEX_FXD_FRAC + `TEX_DIM_BITS;
localparam SCALED_X_W = `TEX_DIM_BITS + `TEX_BLEND_FRAC; localparam SCALED_X_W = `TEX_DIM_BITS + `TEX_BLEND_FRAC;
localparam OFFSET_U_W = `TEX_DIM_BITS + `TEX_LGSTRIDE_MAX; localparam OFFSET_U_W = `TEX_DIM_BITS + `TEX_LGSTRIDE_MAX;
localparam OFFSET_V_W = `TEX_DIM_BITS + `TEX_DIM_BITS + `TEX_LGSTRIDE_MAX; localparam OFFSET_V_W = `TEX_DIM_BITS + `TEX_DIM_BITS + `TEX_LGSTRIDE_MAX;
@@ -69,7 +72,7 @@ module VX_tex_addr #(
for (genvar i = 0; i < NUM_REQS; ++i) begin for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar j = 0; j < 2; ++j) begin for (genvar j = 0; j < 2; ++j) begin
wire [`TEX_FXD_FRAC-1:0] delta = (`TEX_FXD_HALF >> req_logdims[i][j]); wire [`TEX_FXD_FRAC-1:0] delta = `TEX_FXD_FRAC'((SCALED_DIM'(`TEX_FXD_HALF) << mip_level[i]) >> req_logdims[i][j]);
wire [`TEX_FXD_BITS-1:0] coord_lo = req_filter ? (req_coords[j][i] - `TEX_FXD_BITS'(delta)) : req_coords[j][i]; wire [`TEX_FXD_BITS-1:0] coord_lo = req_filter ? (req_coords[j][i] - `TEX_FXD_BITS'(delta)) : req_coords[j][i];
wire [`TEX_FXD_BITS-1:0] coord_hi = req_filter ? (req_coords[j][i] + `TEX_FXD_BITS'(delta)) : req_coords[j][i]; wire [`TEX_FXD_BITS-1:0] coord_hi = req_filter ? (req_coords[j][i] + `TEX_FXD_BITS'(delta)) : req_coords[j][i];
@@ -89,14 +92,14 @@ module VX_tex_addr #(
.coord_o (clamped_hi[i][j]) .coord_o (clamped_hi[i][j])
); );
assign dim_shift[i][j] = (`TEX_FXD_FRAC - `TEX_BLEND_FRAC - req_logdims[i][j]); assign dim_shift[i][j] = (`TEX_FXD_FRAC - `TEX_BLEND_FRAC - (req_logdims[i][j] - mip_level[i]));
end end
assign log_pitch[i] = PITCH_BITS'(req_logdims[i][0]) + PITCH_BITS'(log_stride); assign log_pitch[i] = PITCH_BITS'(req_logdims[i][0] - mip_level[i]) + PITCH_BITS'(log_stride);
assign mip_addr[i] = req_baseaddr + 32'(req_mipoff[i]); assign mip_addr[i] = req_baseaddr + `TEX_ADDR_BITS'(req_mipoff[i]);
end end
VX_pipe_register #( VX_pipe_register #(
.DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_LGSTRIDE_BITS + REQ_INFOW + NUM_REQS * (PITCH_BITS + 2 * SHIFT_BITS + 32 + 2 * 2 * `TEX_FXD_FRAC)), .DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_LGSTRIDE_BITS + REQ_INFOW + NUM_REQS * (PITCH_BITS + 2 * SHIFT_BITS + `TEX_ADDR_BITS + 2 * 2 * `TEX_FXD_FRAC)),
.RESETW (1) .RESETW (1)
) pipe_reg0 ( ) pipe_reg0 (
.clk (clk), .clk (clk),
@@ -114,8 +117,6 @@ module VX_tex_addr #(
wire [NUM_REQS-1:0][OFFSET_U_W-1:0] offset_u_hi; wire [NUM_REQS-1:0][OFFSET_U_W-1:0] offset_u_hi;
wire [NUM_REQS-1:0][OFFSET_V_W-1:0] offset_v_lo; wire [NUM_REQS-1:0][OFFSET_V_W-1:0] offset_v_lo;
wire [NUM_REQS-1:0][OFFSET_V_W-1:0] offset_v_hi; wire [NUM_REQS-1:0][OFFSET_V_W-1:0] offset_v_hi;
wire [NUM_REQS-1:0][31:0] base_addr_lo;
wire [NUM_REQS-1:0][31:0] base_addr_hi;
wire [NUM_REQS-1:0][1:0][`TEX_BLEND_FRAC-1:0] blends; wire [NUM_REQS-1:0][1:0][`TEX_BLEND_FRAC-1:0] blends;
wire [NUM_REQS-1:0][3:0][31:0] addr; wire [NUM_REQS-1:0][3:0][31:0] addr;
@@ -134,26 +135,23 @@ module VX_tex_addr #(
assign offset_v_lo[i] = OFFSET_V_W'(scaled_lo[i][1][`TEX_BLEND_FRAC +: `TEX_DIM_BITS]) << log_pitch_s0[i]; assign offset_v_lo[i] = OFFSET_V_W'(scaled_lo[i][1][`TEX_BLEND_FRAC +: `TEX_DIM_BITS]) << log_pitch_s0[i];
assign offset_v_hi[i] = OFFSET_V_W'(scaled_hi[i][1][`TEX_BLEND_FRAC +: `TEX_DIM_BITS]) << log_pitch_s0[i]; assign offset_v_hi[i] = OFFSET_V_W'(scaled_hi[i][1][`TEX_BLEND_FRAC +: `TEX_DIM_BITS]) << log_pitch_s0[i];
assign base_addr_lo[i] = mip_addr_s0[i] + 32'(offset_v_lo[i]); assign addr[i][0] = 32'(offset_v_lo[i]) + 32'(offset_u_lo[i]);
assign base_addr_hi[i] = mip_addr_s0[i] + 32'(offset_v_hi[i]); assign addr[i][1] = 32'(offset_v_lo[i]) + 32'(offset_u_hi[i]);
assign addr[i][2] = 32'(offset_v_hi[i]) + 32'(offset_u_lo[i]);
assign addr[i][0] = base_addr_lo[i] + 32'(offset_u_lo[i]); assign addr[i][3] = 32'(offset_v_hi[i]) + 32'(offset_u_hi[i]);
assign addr[i][1] = base_addr_lo[i] + 32'(offset_u_hi[i]);
assign addr[i][2] = base_addr_hi[i] + 32'(offset_u_lo[i]);
assign addr[i][3] = base_addr_hi[i] + 32'(offset_u_hi[i]);
end end
assign stall_out = rsp_valid && ~rsp_ready; assign stall_out = rsp_valid && ~rsp_ready;
VX_pipe_register #( VX_pipe_register #(
.DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_LGSTRIDE_BITS + (NUM_REQS * 4 * 32) + (2 * NUM_REQS * `TEX_BLEND_FRAC) + REQ_INFOW), .DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_LGSTRIDE_BITS + (NUM_REQS * 32) + (NUM_REQS * 4 * 32) + (2 * NUM_REQS * `TEX_BLEND_FRAC) + REQ_INFOW),
.RESETW (1) .RESETW (1)
) pipe_reg1 ( ) pipe_reg1 (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.enable (~stall_out), .enable (~stall_out),
.data_in ({valid_s0, tmask_s0, filter_s0, log_stride_s0, addr, blends, req_info_s0}), .data_in ({valid_s0, tmask_s0, filter_s0, log_stride_s0, mip_addr_s0, addr, blends, req_info_s0}),
.data_out ({rsp_valid, rsp_tmask, rsp_filter, rsp_lgstride, rsp_addr, rsp_blends, rsp_info}) .data_out ({rsp_valid, rsp_tmask, rsp_filter, rsp_lgstride, rsp_baseaddr, rsp_addr, rsp_blends, rsp_info})
); );
assign req_ready = ~stall_out; assign req_ready = ~stall_out;
@@ -176,6 +174,8 @@ module VX_tex_addr #(
`TRACE_ARRAY2D(clamped_lo, 2, NUM_REQS); `TRACE_ARRAY2D(clamped_lo, 2, NUM_REQS);
dpi_trace(", clamped_hi="); dpi_trace(", clamped_hi=");
`TRACE_ARRAY2D(clamped_hi, 2, NUM_REQS); `TRACE_ARRAY2D(clamped_hi, 2, NUM_REQS);
dpi_trace(", mip_addr=");
`TRACE_ARRAY1D(mip_addr, NUM_REQS);
dpi_trace("\n"); dpi_trace("\n");
end end
@@ -192,10 +192,6 @@ module VX_tex_addr #(
`TRACE_ARRAY1D(offset_v_lo, NUM_REQS); `TRACE_ARRAY1D(offset_v_lo, NUM_REQS);
dpi_trace(", offset_v_hi="); dpi_trace(", offset_v_hi=");
`TRACE_ARRAY1D(offset_v_hi, NUM_REQS); `TRACE_ARRAY1D(offset_v_hi, NUM_REQS);
dpi_trace(", base_addr_lo=");
`TRACE_ARRAY1D(base_addr_lo, NUM_REQS);
dpi_trace(", base_addr_hi=");
`TRACE_ARRAY1D(base_addr_hi, NUM_REQS);
dpi_trace("\n"); dpi_trace("\n");
end end

View File

@@ -3,12 +3,11 @@
module VX_tex_lerp ( module VX_tex_lerp (
input wire [3:0][7:0] in1, input wire [3:0][7:0] in1,
input wire [3:0][7:0] in2, input wire [3:0][7:0] in2,
input wire [8:0] alpha, input wire [7:0] frac,
input wire [7:0] beta,
output wire [3:0][7:0] out output wire [3:0][7:0] out
); );
for (genvar i = 0; i < 4; ++i) begin for (genvar i = 0; i < 4; ++i) begin
wire [16:0] sum = in1[i] * alpha + in2[i] * beta; wire [16:0] sum = in1[i] * 8'(8'hff - frac) + in2[i] * frac;
`UNUSED_VAR (sum) `UNUSED_VAR (sum)
assign out[i] = sum[15:8]; assign out[i] = sum[15:8];
end end

View File

@@ -16,6 +16,7 @@ module VX_tex_mem #(
input wire [NUM_REQS-1:0] req_tmask, input wire [NUM_REQS-1:0] req_tmask,
input wire [`TEX_FILTER_BITS-1:0] req_filter, input wire [`TEX_FILTER_BITS-1:0] req_filter,
input wire [`TEX_LGSTRIDE_BITS-1:0] req_lgstride, input wire [`TEX_LGSTRIDE_BITS-1:0] req_lgstride,
input wire [NUM_REQS-1:0][31:0] req_baseaddr,
input wire [NUM_REQS-1:0][3:0][31:0] req_addr, input wire [NUM_REQS-1:0][3:0][31:0] req_addr,
input wire [REQ_INFOW-1:0] req_info, input wire [REQ_INFOW-1:0] req_info,
output wire req_ready, output wire req_ready,
@@ -32,6 +33,14 @@ module VX_tex_mem #(
localparam RSP_CTR_W = $clog2(NUM_REQS * 4 + 1); localparam RSP_CTR_W = $clog2(NUM_REQS * 4 + 1);
// full address calculation
wire [NUM_REQS-1:0][3:0][31:0] full_addr;
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar j = 0; j < 4; ++j) begin
assign full_addr[i][j] = req_baseaddr[i] + req_addr[i][j];
end
end
wire [3:0] dup_reqs; wire [3:0] dup_reqs;
wire [3:0][NUM_REQS-1:0][29:0] req_addr_w; wire [3:0][NUM_REQS-1:0][29:0] req_addr_w;
wire [3:0][NUM_REQS-1:0][1:0] align_offs; wire [3:0][NUM_REQS-1:0][1:0] align_offs;
@@ -40,17 +49,17 @@ module VX_tex_mem #(
for (genvar i = 0; i < NUM_REQS; ++i) begin for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar j = 0; j < 4; ++j) begin for (genvar j = 0; j < 4; ++j) begin
assign req_addr_w[j][i] = req_addr[i][j][31:2]; assign req_addr_w[j][i] = full_addr[i][j][31:2];
assign align_offs[j][i] = req_addr[i][j][1:0]; assign align_offs[j][i] = full_addr[i][j][1:0];
end end
end end
// find duplicate addresses // detect duplicate addresses
for (genvar i = 0; i < 4; ++i) begin for (genvar i = 0; i < 4; ++i) begin
wire [NUM_REQS-1:0] addr_matches; wire [NUM_REQS-2:0] addr_matches;
for (genvar j = 0; j < NUM_REQS; j++) begin for (genvar j = 0; j < (NUM_REQS-1); ++j) begin
assign addr_matches[j] = (req_addr_w[i][0] == req_addr_w[i][j]) || ~req_tmask[j]; assign addr_matches[j] = (req_addr_w[i][j+1] == req_addr_w[i][0]) || ~req_tmask[j+1];
end end
assign dup_reqs[i] = req_tmask[0] && (& addr_matches); assign dup_reqs[i] = req_tmask[0] && (& addr_matches);
end end
@@ -172,6 +181,8 @@ module VX_tex_mem #(
reg [RSP_CTR_W-1:0] rsp_rem_ctr, rsp_rem_ctr_init; reg [RSP_CTR_W-1:0] rsp_rem_ctr, rsp_rem_ctr_init;
wire [RSP_CTR_W-1:0] rsp_rem_ctr_n; wire [RSP_CTR_W-1:0] rsp_rem_ctr_n;
wire [NUM_REQS-1:0][1:0] rsp_align_offs; wire [NUM_REQS-1:0][1:0] rsp_align_offs;
wire [$clog2(NUM_REQS+1)-1:0] q_req_size;
wire [$clog2(NUM_REQS+1)-1:0] dcache_rsp_size;
wire dcache_rsp_fire; wire dcache_rsp_fire;
wire [1:0] rsp_texel_idx; wire [1:0] rsp_texel_idx;
wire rsp_texel_dup; wire rsp_texel_dup;
@@ -218,16 +229,21 @@ module VX_tex_mem #(
end end
end end
`POP_COUNT(q_req_size, q_req_tmask);
always @(*) begin always @(*) begin
rsp_rem_ctr_init = RSP_CTR_W'($countones(q_dup_reqs[0] ? NUM_REQS'(1) : q_req_tmask)); rsp_rem_ctr_init = q_dup_reqs[0] ? RSP_CTR_W'(1) : RSP_CTR_W'(q_req_size);
if (q_req_filter) begin if (q_req_filter) begin
for (integer i = 1; i < 4; ++i) begin for (integer i = 1; i < 4; ++i) begin
rsp_rem_ctr_init += RSP_CTR_W'($countones(q_dup_reqs[i] ? NUM_REQS'(1) : q_req_tmask)); rsp_rem_ctr_init += q_dup_reqs[i] ? RSP_CTR_W'(1) : RSP_CTR_W'(q_req_size);
end end
end end
end end
assign rsp_rem_ctr_n = rsp_rem_ctr - RSP_CTR_W'($countones(dcache_rsp_if.tmask)); wire [NUM_REQS-1:0] dcache_rsp_tmask = dcache_rsp_if.tmask;
`POP_COUNT(dcache_rsp_size, dcache_rsp_tmask);
assign rsp_rem_ctr_n = rsp_rem_ctr - RSP_CTR_W'(dcache_rsp_size);
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
@@ -249,7 +265,7 @@ module VX_tex_mem #(
wire stall_out = rsp_valid && ~rsp_ready; wire stall_out = rsp_valid && ~rsp_ready;
wire is_last_rsp = (0 == rsp_rem_ctr_n); wire is_last_rsp = (rsp_rem_ctr == RSP_CTR_W'(dcache_rsp_size));
wire rsp_texels_done = dcache_rsp_fire && is_last_rsp; wire rsp_texels_done = dcache_rsp_fire && is_last_rsp;
@@ -290,8 +306,10 @@ module VX_tex_mem #(
dpi_trace("\n"); dpi_trace("\n");
end end
if (req_valid && req_ready) begin if (req_valid && req_ready) begin
dpi_trace("%d: core%0d-tex-mem-req: wid=%0d, PC=%0h, tmask=%b, filter=%0d, lgstride=%0d, addr=", dpi_trace("%d: core%0d-tex-mem-req: wid=%0d, PC=%0h, tmask=%b, filter=%0d, lgstride=%0d, baseaddr=",
$time, CORE_ID, req_wid, req_PC, req_tmask, req_filter, req_lgstride); $time, CORE_ID, req_wid, req_PC, req_tmask, req_filter, req_lgstride);
`TRACE_ARRAY1D(req_baseaddr, NUM_REQS);
dpi_trace(", addr=");
`TRACE_ARRAY2D(req_addr, 4, NUM_REQS); `TRACE_ARRAY2D(req_addr, 4, NUM_REQS);
dpi_trace("\n"); dpi_trace("\n");
end end

View File

@@ -27,75 +27,78 @@ module VX_tex_sampler #(
`UNUSED_PARAM (CORE_ID) `UNUSED_PARAM (CORE_ID)
wire valid_s0; wire valid_s0, valid_s1;
wire [NUM_REQS-1:0] tmask_s0; wire [NUM_REQS-1:0] req_tmask_s0, req_tmask_s1;
wire [REQ_INFOW-1:0] req_info_s0; wire [REQ_INFOW-1:0] req_info_s0, req_info_s1;
wire [NUM_REQS-1:0][31:0] texel_ul, texel_uh; wire [NUM_REQS-1:0][31:0] texel_ul, texel_uh;
wire [NUM_REQS-1:0][31:0] texel_ul_s0, texel_uh_s0; wire [NUM_REQS-1:0][31:0] texel_ul_s1, texel_uh_s1;
wire [NUM_REQS-1:0][`TEX_BLEND_FRAC-1:0] blend_v, blend_v_s0; wire [NUM_REQS-1:0][1:0][`TEX_BLEND_FRAC-1:0] req_blends_s0;
wire [NUM_REQS-1:0][`TEX_BLEND_FRAC-1:0] blend_v, blend_v_s1;
wire [NUM_REQS-1:0][31:0] texel_v; wire [NUM_REQS-1:0][31:0] texel_v;
wire [NUM_REQS-1:0][3:0][31:0] fmt_texels, fmt_texels_s0;
wire stall_out; wire stall_out;
for (genvar i = 0; i < NUM_REQS; ++i) begin for (genvar i = 0; i < NUM_REQS; ++i) begin
wire [3:0][31:0] fmt_texels;
for (genvar j = 0; j < 4; ++j) begin for (genvar j = 0; j < 4; ++j) begin
VX_tex_format #( VX_tex_format #(
.CORE_ID (CORE_ID) .CORE_ID (CORE_ID)
) tex_format ( ) tex_format (
.format (req_format), .format (req_format),
.texel_in (req_data[i][j]), .texel_in (req_data[i][j]),
.texel_out (fmt_texels[j]) .texel_out (fmt_texels[i][j])
); );
end end
wire [7:0] beta = req_blends[i][0];
wire [8:0] alpha = `TEX_BLEND_ONE - beta;
VX_tex_lerp #(
) tex_lerp_ul (
.in1 (fmt_texels[0]),
.in2 (fmt_texels[1]),
.alpha (alpha),
.beta (beta),
.out (texel_ul[i])
);
VX_tex_lerp #(
) tex_lerp_uh (
.in1 (fmt_texels[2]),
.in2 (fmt_texels[3]),
.alpha (alpha),
.beta (beta),
.out (texel_uh[i])
);
assign blend_v[i] = req_blends[i][1];
end end
VX_pipe_register #( VX_pipe_register #(
.DATAW (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * `TEX_BLEND_FRAC) + (2 * NUM_REQS * 32)), .DATAW (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * 2 * `TEX_BLEND_FRAC) + (NUM_REQS * 4 * 32)),
.RESETW (1) .RESETW (1)
) pipe_reg0 ( ) pipe_reg0 (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.enable (~stall_out), .enable (~stall_out),
.data_in ({req_valid, req_tmask, req_info, blend_v, texel_ul, texel_uh}), .data_in ({req_valid, req_tmask, req_info, req_blends, fmt_texels}),
.data_out ({valid_s0, tmask_s0, req_info_s0, blend_v_s0, texel_ul_s0, texel_uh_s0}) .data_out ({valid_s0, req_tmask_s0, req_info_s0, req_blends_s0, fmt_texels_s0})
);
for (genvar i = 0; i < NUM_REQS; ++i) begin
VX_tex_lerp #(
) tex_lerp_ul (
.in1 (fmt_texels_s0[i][0]),
.in2 (fmt_texels_s0[i][1]),
.frac (req_blends_s0[i][0]),
.out (texel_ul[i])
);
VX_tex_lerp #(
) tex_lerp_uh (
.in1 (fmt_texels_s0[i][2]),
.in2 (fmt_texels_s0[i][3]),
.frac (req_blends_s0[i][0]),
.out (texel_uh[i])
);
assign blend_v[i] = req_blends_s0[i][1];
end
VX_pipe_register #(
.DATAW (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * `TEX_BLEND_FRAC) + (2 * NUM_REQS * 32)),
.RESETW (1)
) pipe_reg1 (
.clk (clk),
.reset (reset),
.enable (~stall_out),
.data_in ({valid_s0, req_tmask_s0, req_info_s0, blend_v, texel_ul, texel_uh}),
.data_out ({valid_s1, req_tmask_s1, req_info_s1, blend_v_s1, texel_ul_s1, texel_uh_s1})
); );
for (genvar i = 0; i < NUM_REQS; i++) begin for (genvar i = 0; i < NUM_REQS; i++) begin
wire [7:0] beta = blend_v_s0[i];
wire [8:0] alpha = `TEX_BLEND_ONE - beta;
VX_tex_lerp #( VX_tex_lerp #(
) tex_lerp_v ( ) tex_lerp_v (
.in1 (texel_ul_s0[i]), .in1 (texel_ul_s1[i]),
.in2 (texel_uh_s0[i]), .in2 (texel_uh_s1[i]),
.alpha (alpha), .frac (blend_v_s1[i]),
.beta (beta),
.out (texel_v[i]) .out (texel_v[i])
); );
end end
@@ -105,12 +108,12 @@ module VX_tex_sampler #(
VX_pipe_register #( VX_pipe_register #(
.DATAW (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * 32)), .DATAW (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * 32)),
.RESETW (1) .RESETW (1)
) pipe_reg1 ( ) pipe_reg2 (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.enable (~stall_out), .enable (~stall_out),
.data_in ({valid_s0, tmask_s0, req_info_s0, texel_v}), .data_in ({valid_s1, req_tmask_s1, req_info_s1, texel_v}),
.data_out ({rsp_valid, rsp_tmask, rsp_info, rsp_data}) .data_out ({rsp_valid, rsp_tmask, rsp_info, rsp_data})
); );
// can accept new request? // can accept new request?

View File

@@ -23,9 +23,8 @@ module VX_tex_unit #(
VX_tex_rsp_if.master tex_rsp_if VX_tex_rsp_if.master tex_rsp_if
); );
localparam REQ_INFOW_S = 64 + `NR_BITS + 1 + `NW_BITS + 32; localparam REQ_INFO_W = 64 + `NR_BITS + 1 + `NW_BITS + 32;
localparam REQ_INFOW_A = `TEX_FORMAT_BITS + REQ_INFOW_S; localparam BLEND_FRAC_W = (2 * `NUM_THREADS * `TEX_BLEND_FRAC);
localparam REQ_INFOW_M = (2 * `NUM_THREADS * `TEX_BLEND_FRAC) + REQ_INFOW_A;
reg [$clog2(`NUM_TEX_UNITS)-1:0] csr_tex_unit; reg [$clog2(`NUM_TEX_UNITS)-1:0] csr_tex_unit;
reg [`TEX_MIPOFF_BITS-1:0] tex_mipoff [`NUM_TEX_UNITS-1:0][`TEX_LOD_MAX+1-1:0]; reg [`TEX_MIPOFF_BITS-1:0] tex_mipoff [`NUM_TEX_UNITS-1:0][`TEX_LOD_MAX+1-1:0];
@@ -95,15 +94,16 @@ module VX_tex_unit #(
// mipmap attributes // mipmap attributes
wire [`NUM_THREADS-1:0][`TEX_LOD_BITS-1:0] mip_level;
wire [`NUM_THREADS-1:0][`TEX_MIPOFF_BITS-1:0] sel_mipoff; wire [`NUM_THREADS-1:0][`TEX_MIPOFF_BITS-1:0] sel_mipoff;
wire [`NUM_THREADS-1:0][1:0][`TEX_LOD_BITS-1:0] sel_logdims; wire [`NUM_THREADS-1:0][1:0][`TEX_LOD_BITS-1:0] sel_logdims;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin for (genvar i = 0; i < `NUM_THREADS; ++i) begin
wire [`NTEX_BITS-1:0] unit = tex_req_if.unit[`NTEX_BITS-1:0]; wire [`NTEX_BITS-1:0] unit = tex_req_if.unit[`NTEX_BITS-1:0];
wire [`TEX_LOD_BITS-1:0] mip_level = tex_req_if.lod[i][`TEX_LOD_BITS-1:0]; assign mip_level[i] = tex_req_if.lod[i][`TEX_LOD_BITS-1:0];
assign sel_mipoff[i] = tex_mipoff[unit][mip_level]; assign sel_mipoff[i] = tex_mipoff[unit][mip_level[i]];
assign sel_logdims[i][0] = (tex_logdims[unit][0] - mip_level); assign sel_logdims[i][0] = tex_logdims[unit][0];
assign sel_logdims[i][1] = (tex_logdims[unit][1] - mip_level); assign sel_logdims[i][1] = tex_logdims[unit][1];
end end
// address generation // address generation
@@ -114,12 +114,13 @@ module VX_tex_unit #(
wire [`TEX_LGSTRIDE_BITS-1:0] mem_req_lgstride; wire [`TEX_LGSTRIDE_BITS-1:0] mem_req_lgstride;
wire [`NUM_THREADS-1:0][1:0][`TEX_BLEND_FRAC-1:0] mem_req_blends; wire [`NUM_THREADS-1:0][1:0][`TEX_BLEND_FRAC-1:0] mem_req_blends;
wire [`NUM_THREADS-1:0][3:0][31:0] mem_req_addr; wire [`NUM_THREADS-1:0][3:0][31:0] mem_req_addr;
wire [REQ_INFOW_A-1:0] mem_req_info; wire [`NUM_THREADS-1:0][31:0] mem_req_baseaddr;
wire [(`TEX_FORMAT_BITS + REQ_INFO_W)-1:0] mem_req_info;
wire mem_req_ready; wire mem_req_ready;
VX_tex_addr #( VX_tex_addr #(
.CORE_ID (CORE_ID), .CORE_ID (CORE_ID),
.REQ_INFOW (REQ_INFOW_A), .REQ_INFOW (`TEX_FORMAT_BITS + REQ_INFO_W),
.NUM_REQS (`NUM_THREADS) .NUM_REQS (`NUM_THREADS)
) tex_addr ( ) tex_addr (
.clk (clk), .clk (clk),
@@ -132,6 +133,7 @@ module VX_tex_unit #(
.req_filter (tex_filter[tex_req_if.unit]), .req_filter (tex_filter[tex_req_if.unit]),
.req_wraps (tex_wraps[tex_req_if.unit]), .req_wraps (tex_wraps[tex_req_if.unit]),
.req_baseaddr(tex_baddr[tex_req_if.unit]), .req_baseaddr(tex_baddr[tex_req_if.unit]),
.mip_level (mip_level),
.req_mipoff (sel_mipoff), .req_mipoff (sel_mipoff),
.req_logdims(sel_logdims), .req_logdims(sel_logdims),
.req_info ({tex_format[tex_req_if.unit], tex_req_if.uuid, tex_req_if.rd, tex_req_if.wb, tex_req_if.wid, tex_req_if.PC}), .req_info ({tex_format[tex_req_if.unit], tex_req_if.uuid, tex_req_if.rd, tex_req_if.wb, tex_req_if.wid, tex_req_if.PC}),
@@ -141,6 +143,7 @@ module VX_tex_unit #(
.rsp_tmask (mem_req_tmask), .rsp_tmask (mem_req_tmask),
.rsp_filter (mem_req_filter), .rsp_filter (mem_req_filter),
.rsp_lgstride(mem_req_lgstride), .rsp_lgstride(mem_req_lgstride),
.rsp_baseaddr(mem_req_baseaddr),
.rsp_addr (mem_req_addr), .rsp_addr (mem_req_addr),
.rsp_blends (mem_req_blends), .rsp_blends (mem_req_blends),
.rsp_info (mem_req_info), .rsp_info (mem_req_info),
@@ -152,12 +155,12 @@ module VX_tex_unit #(
wire mem_rsp_valid; wire mem_rsp_valid;
wire [`NUM_THREADS-1:0] mem_rsp_tmask; wire [`NUM_THREADS-1:0] mem_rsp_tmask;
wire [`NUM_THREADS-1:0][3:0][31:0] mem_rsp_data; wire [`NUM_THREADS-1:0][3:0][31:0] mem_rsp_data;
wire [REQ_INFOW_M-1:0] mem_rsp_info; wire [(BLEND_FRAC_W + `TEX_FORMAT_BITS + REQ_INFO_W)-1:0] mem_rsp_info;
wire mem_rsp_ready; wire mem_rsp_ready;
VX_tex_mem #( VX_tex_mem #(
.CORE_ID (CORE_ID), .CORE_ID (CORE_ID),
.REQ_INFOW (REQ_INFOW_M), .REQ_INFOW (BLEND_FRAC_W + `TEX_FORMAT_BITS + REQ_INFO_W),
.NUM_REQS (`NUM_THREADS) .NUM_REQS (`NUM_THREADS)
) tex_mem ( ) tex_mem (
.clk (clk), .clk (clk),
@@ -172,6 +175,7 @@ module VX_tex_unit #(
.req_tmask (mem_req_tmask), .req_tmask (mem_req_tmask),
.req_filter(mem_req_filter), .req_filter(mem_req_filter),
.req_lgstride(mem_req_lgstride), .req_lgstride(mem_req_lgstride),
.req_baseaddr(mem_req_baseaddr),
.req_addr (mem_req_addr), .req_addr (mem_req_addr),
.req_info ({mem_req_blends, mem_req_info}), .req_info ({mem_req_blends, mem_req_info}),
.req_ready (mem_req_ready), .req_ready (mem_req_ready),
@@ -186,15 +190,9 @@ module VX_tex_unit #(
// apply sampler // apply sampler
wire [`NUM_THREADS-1:0][1:0][`TEX_BLEND_FRAC-1:0] rsp_blends;
wire [`TEX_FORMAT_BITS-1:0] rsp_format;
wire [REQ_INFOW_S-1:0] rsp_info;
assign {rsp_blends, rsp_format, rsp_info} = mem_rsp_info;
VX_tex_sampler #( VX_tex_sampler #(
.CORE_ID (CORE_ID), .CORE_ID (CORE_ID),
.REQ_INFOW (REQ_INFOW_S), .REQ_INFOW (REQ_INFO_W),
.NUM_REQS (`NUM_THREADS) .NUM_REQS (`NUM_THREADS)
) tex_sampler ( ) tex_sampler (
.clk (clk), .clk (clk),
@@ -204,9 +202,9 @@ module VX_tex_unit #(
.req_valid (mem_rsp_valid), .req_valid (mem_rsp_valid),
.req_tmask (mem_rsp_tmask), .req_tmask (mem_rsp_tmask),
.req_data (mem_rsp_data), .req_data (mem_rsp_data),
.req_format (rsp_format), .req_blends (mem_rsp_info[(REQ_INFO_W+`TEX_FORMAT_BITS) +: BLEND_FRAC_W]),
.req_blends (rsp_blends), .req_format (mem_rsp_info[REQ_INFO_W +: `TEX_FORMAT_BITS]),
.req_info (rsp_info), .req_info (mem_rsp_info[0 +: REQ_INFO_W]),
.req_ready (mem_rsp_ready), .req_ready (mem_rsp_ready),
// outputs // outputs

View File

@@ -1,6 +1,6 @@
BUILD_DIR ?= build BUILD_DIR ?= build
.PHONY: dogfood unittest pipeline smem cache fpu_core core vortex top1 top2 top4 top8 top16 top32 top64 .PHONY: dogfood unittest pipeline smem cache fpu_core core vortex top1 top2 top4 top8 top16 top32 top64 texunit
dogfood: dogfood:
mkdir -p dogfood/$(BUILD_DIR) mkdir -p dogfood/$(BUILD_DIR)
@@ -75,4 +75,9 @@ top32:
top64: top64:
mkdir -p top64/$(BUILD_DIR) mkdir -p top64/$(BUILD_DIR)
cp top64/Makefile top64/$(BUILD_DIR) cp top64/Makefile top64/$(BUILD_DIR)
$(MAKE) -C top64/$(BUILD_DIR) clean && $(MAKE) -C top64/$(BUILD_DIR) > top64/$(BUILD_DIR)/build.log 2>&1 & $(MAKE) -C top64/$(BUILD_DIR) clean && $(MAKE) -C top64/$(BUILD_DIR) > top64/$(BUILD_DIR)/build.log 2>&1 &
# texunit: stage and launch the build that lives under texunit/.
#   1. create texunit/$(BUILD_DIR)
#   2. copy texunit/Makefile into that directory
#   3. run `clean` followed by the default goal there, sending stdout and
#      stderr to texunit/$(BUILD_DIR)/build.log
# The command line ends with `&`, so the sub-make runs in the background and
# this target returns before the build has finished; check build.log for the
# outcome.
texunit:
mkdir -p texunit/$(BUILD_DIR)
cp texunit/Makefile texunit/$(BUILD_DIR)
$(MAKE) -C texunit/$(BUILD_DIR) clean && $(MAKE) -C texunit/$(BUILD_DIR) > texunit/$(BUILD_DIR)/build.log 2>&1 &

View File

@@ -0,0 +1,81 @@
# Quartus command-line build flow for the VX_core top level, with
# EXT_TEX_ENABLE=1 handed to project.tcl when the project files are created
# (see the $(PROJECT_FILES) rule below).
#
# Flow: project.tcl -> quartus_syn -> quartus_fit -> quartus_asm / quartus_sta
# The *.chg files are stamp files: each stage touches the stamps of the
# stages that must re-run after it.

PROJECT = Core
TOP_LEVEL_ENTITY = VX_core
SRC_FILE = VX_core.v
RTL_DIR = ../../../../rtl
THIRD_PARTY_DIR = ../../../../../third_party

# Target device. Exactly one FAMILY/DEVICE/FPU_CORE_PATH triple is active.
FAMILY = "Arria 10"
DEVICE = 10AX115N3F40E2SG
FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10

#FAMILY = "Stratix 10"
#DEVICE = 1SX280HN2F43E2VG
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10

# Include search paths; entries are ';'-separated because the whole list is
# passed as a single quoted -inc argument to project.tcl.
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)

PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

# Extra prerequisites of the synthesis report. Nothing in this file populates
# the list, so it defaults to empty (synthesis is then re-triggered only via
# smart.log / syn.chg). Override on the command line or from the environment
# to make synthesis depend on the RTL sources.
SOURCE_FILES ?=

# Executable Configuration
SYN_ARGS = --parallel --read_settings_files=on
FIT_ARGS = --parallel --part=$(DEVICE) --read_settings_files=on
ASM_ARGS =
STA_ARGS = --parallel --do_report_timing

# These names are commands/aliases, not files; keep a stray file called
# e.g. "clean" or "all" from masking them.
.PHONY: all syn fit asm sta smart program clean

# Build targets
all: $(PROJECT).sta.rpt

syn: $(PROJECT).syn.rpt

fit: $(PROJECT).fit.rpt

asm: $(PROJECT).asm.rpt

sta: $(PROJECT).sta.rpt

smart: smart.log

# Target implementations

# Usage: $(STAMP) <file>  ->  echo done > <file>
STAMP = echo done >

# Synthesis; a fresh netlist invalidates the fitter result.
$(PROJECT).syn.rpt: smart.log syn.chg $(SOURCE_FILES)
	quartus_syn $(PROJECT) $(SYN_ARGS)
	$(STAMP) fit.chg

# Place and route; a fresh fit invalidates both assembler and timing results.
$(PROJECT).fit.rpt: smart.log fit.chg $(PROJECT).syn.rpt
	quartus_fit $(PROJECT) $(FIT_ARGS)
	$(STAMP) asm.chg
	$(STAMP) sta.chg

# Assembler (programming file generation).
$(PROJECT).asm.rpt: smart.log asm.chg $(PROJECT).fit.rpt
	quartus_asm $(PROJECT) $(ASM_ARGS)

# Static timing analysis.
$(PROJECT).sta.rpt: smart.log sta.chg $(PROJECT).fit.rpt
	quartus_sta $(PROJECT) $(STA_ARGS)

smart.log: $(PROJECT_FILES)
	quartus_sh --determine_smart_action $(PROJECT) > smart.log

# Project initialization
$(PROJECT_FILES):
	quartus_sh -t ../../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../../project.sdc -inc "$(RTL_INCLUDE)" -set "EXT_TEX_ENABLE=1"

# Stamp bootstrap: create each stamp the first time it is needed.
syn.chg:
	$(STAMP) syn.chg

fit.chg:
	$(STAMP) fit.chg

sta.chg:
	$(STAMP) sta.chg

asm.chg:
	$(STAMP) asm.chg

# The .sof is a by-product of the assembler step. Without this rule
# `make program` on a clean tree stops with "No rule to make target
# '$(PROJECT).sof'" instead of running the flow that produces it.
$(PROJECT).sof: $(PROJECT).asm.rpt

program: $(PROJECT).sof
	quartus_pgm --no_banner --mode=jtag -o "$(PROJECT).sof"

clean:
	rm -rf bin *.rpt *.chg *.qsf *.qpf *.qws *.log *.htm *.eqn *.pin *.sof *.pof qdb incremental_db tmp-clearbox