aco/gfx11: perform FS input loads in WQM

fossil-db (gfx1100):
Totals from 48184 (35.68% of 135032) affected shaders:
MaxWaves: 1131876 -> 1131960 (+0.01%); split: +0.05%, -0.04%
Instrs: 36755466 -> 36782290 (+0.07%); split: -0.04%, +0.11%
CodeSize: 200812068 -> 200915348 (+0.05%); split: -0.04%, +0.09%
VGPRs: 2163980 -> 2163828 (-0.01%); split: -0.15%, +0.14%
Latency: 484174459 -> 484341018 (+0.03%); split: -0.06%, +0.09%
InvThroughput: 87941284 -> 87944874 (+0.00%); split: -0.04%, +0.04%
VClause: 652984 -> 653085 (+0.02%); split: -0.09%, +0.10%
SClause: 1510995 -> 1528832 (+1.18%); split: -0.40%, +1.58%
Copies: 1997689 -> 2001857 (+0.21%); split: -0.49%, +0.69%
Branches: 676629 -> 676584 (-0.01%); split: -0.02%, +0.01%
PreSGPRs: 2033070 -> 2036725 (+0.18%)
PreVGPRs: 1903922 -> 1903897 (-0.00%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Fixes: 3730be9873 ("aco: mostly implement FS input loads on GFX11")
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19370>
This commit is contained in:
Rhys Perry 2022-10-26 21:13:15 +01:00 committed by Marge Bot
parent 3da4fe9c6d
commit 16d2c7ad55

View file

@@ -5319,14 +5319,17 @@ emit_interp_instr_gfx11(isel_context* ctx, unsigned idx, unsigned component, Tem
Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component); Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
Temp res;
if (dst.regClass() == v2b) { if (dst.regClass() == v2b) {
Temp p10 = Temp p10 =
bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, bld.def(v1), p, coord1, p); bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, bld.def(v1), p, coord1, p);
bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, Definition(dst), p, coord2, p10); res = bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, bld.def(v1), p, coord2, p10);
} else { } else {
Temp p10 = bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, bld.def(v1), p, coord1, p); Temp p10 = bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, bld.def(v1), p, coord1, p);
bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), p, coord2, p10); res = bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, bld.def(v1), p, coord2, p10);
} }
/* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
emit_wqm(bld, res, dst, true);
} }
void void
@@ -5385,7 +5388,10 @@ emit_interp_mov_instr(isel_context* ctx, unsigned idx, unsigned component, unsig
//TODO: this doesn't work in quad-divergent control flow and ignores vertex_id //TODO: this doesn't work in quad-divergent control flow and ignores vertex_id
Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component); Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
uint16_t dpp_ctrl = dpp_quad_perm(0, 0, 0, 0); uint16_t dpp_ctrl = dpp_quad_perm(0, 0, 0, 0);
bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), p, dpp_ctrl); Temp res = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p, dpp_ctrl);
/* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
emit_wqm(bld, res, dst, true);
} else { } else {
bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand::c32(vertex_id), bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand::c32(vertex_id),
bld.m0(prim_mask), idx, component); bld.m0(prim_mask), idx, component);