intel/fs: switch from SIMD1 to SIMD8 instructions for surface/sampler rematerialization

SIMD1 instructions are problematic because they are considered partial
writes. This increases the liveness of the destination register
written by those instructions. To work around this we use UNDEF
instructions to bound the liveness of the register. But this causes
other issues, as in this case:

  undef(1) vgrf2
  mov(1)   vgrf2, u4.0
  add(1)   vgrf3, vgrf2.0, 64UD

In this case the copy propagation pass is unable to see that vgrf2 in
the add() instruction can be replaced with the uniform u4.0.

To fix this problem, we switch to NoMask SIMD8 instructions that cover
the entire register. We can drop the UNDEF instructions and now copy
propagation can do its job.

Good results on 2 apps :

Cyberpunk 2077 :

  Totals from 7258 (68.80% of 10549) affected shaders:
  Instrs: 6332210 -> 6073833 (-4.08%); split: -4.11%, +0.03%
  Cycles: 130667501 -> 127351268 (-2.54%); split: -3.12%, +0.58%
  Subgroup size: 90320 -> 90400 (+0.09%)
  Spill count: 90 -> 68 (-24.44%)
  Fill count: 82 -> 64 (-21.95%)
  Scratch Memory Size: 8192 -> 6144 (-25.00%)
  Max live registers: 385464 -> 375152 (-2.68%)
  Max dispatch width: 64336 -> 64424 (+0.14%); split: +0.96%, -0.82%

  Gaining 60 SIMD16/SIMD32 shaders, losing 33

Strange Brigade :

  Totals from 2137 (53.12% of 4023) affected shaders:
  Instrs: 1544031 -> 1457544 (-5.60%); split: -5.60%, +0.00%
  Cycles: 22292564 -> 21868978 (-1.90%); split: -2.43%, +0.53%
  Subgroup size: 25328 -> 25344 (+0.06%)
  Max live registers: 113716 -> 111214 (-2.20%)
  Max dispatch width: 17232 -> 18608 (+7.99%); split: +8.36%, -0.37%

  Gaining 138 SIMD16/SIMD32 shaders, losing 4

One app slightly negatively affected:

Dota2 :

  Totals from 232 (14.73% of 1575) affected shaders:
  Instrs: 30029 -> 28194 (-6.11%)
  Cycles: 385155 -> 371422 (-3.57%); split: -3.59%, +0.02%
  Max live registers: 6792 -> 6780 (-0.18%)
  Max dispatch width: 2256 -> 2160 (-4.26%)

  Losing 6 SIMD32 shaders

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24554>
This commit is contained in:
Lionel Landwerlin 2023-08-07 14:03:57 +03:00 committed by Marge Bot
parent d28f42f85d
commit a25f96c00c

View file

@ -4003,7 +4003,7 @@ fs_reg
fs_visitor::try_rebuild_resource(const brw::fs_builder &bld, nir_def *resource_def)
{
/* Create a build at the location of the resource_intel intrinsic */
fs_builder ubld1 = bld.exec_all().group(1, 0);
fs_builder ubld8 = bld.exec_all().group(8, 0);
struct rebuild_resource resources = {};
resources.idx = 0;
@ -4041,10 +4041,9 @@ fs_visitor::try_rebuild_resource(const brw::fs_builder &bld, nir_def *resource_d
case nir_instr_type_load_const: {
nir_load_const_instr *load_const =
nir_instr_as_load_const(instr);
fs_reg dst = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
ubld1.UNDEF(dst);
fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
nir_resource_insts[def->index] =
ubld1.group(8, 0).MOV(dst, brw_imm_ud(load_const->value[0].i32));
ubld8.MOV(dst, brw_imm_ud(load_const->value[0].i32));
break;
}
@ -4067,52 +4066,47 @@ fs_visitor::try_rebuild_resource(const brw::fs_builder &bld, nir_def *resource_d
switch (alu->op) {
case nir_op_iadd: {
fs_reg dst = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
ubld1.UNDEF(dst);
fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
fs_reg src0 = nir_resource_insts[alu->src[0].src.ssa->index]->dst;
fs_reg src1 = nir_resource_insts[alu->src[1].src.ssa->index]->dst;
assert(src0.file != BAD_FILE && src1.file != BAD_FILE);
assert(src0.type == BRW_REGISTER_TYPE_UD);
nir_resource_insts[def->index] =
ubld1.ADD(dst,
ubld8.ADD(dst,
src0.file != IMM ? src0 : src1,
src0.file != IMM ? src1 : src0);
break;
}
case nir_op_iadd3: {
fs_reg dst = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
ubld1.UNDEF(dst);
fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
fs_reg src0 = nir_resource_insts[alu->src[0].src.ssa->index]->dst;
fs_reg src1 = nir_resource_insts[alu->src[1].src.ssa->index]->dst;
fs_reg src2 = nir_resource_insts[alu->src[2].src.ssa->index]->dst;
assert(src0.file != BAD_FILE && src1.file != BAD_FILE && src2.file != BAD_FILE);
assert(src0.type == BRW_REGISTER_TYPE_UD);
nir_resource_insts[def->index] =
ubld1.ADD3(dst,
ubld8.ADD3(dst,
src1.file == IMM ? src1 : src0,
src1.file == IMM ? src0 : src1,
src2);
break;
}
case nir_op_ushr: {
assert(ubld1.dispatch_width() == 1);
fs_reg dst = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
ubld1.UNDEF(dst);
fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
fs_reg src0 = nir_resource_insts[alu->src[0].src.ssa->index]->dst;
fs_reg src1 = nir_resource_insts[alu->src[1].src.ssa->index]->dst;
assert(src0.file != BAD_FILE && src1.file != BAD_FILE);
assert(src0.type == BRW_REGISTER_TYPE_UD);
nir_resource_insts[def->index] = ubld1.SHR(dst, src0, src1);
nir_resource_insts[def->index] = ubld8.SHR(dst, src0, src1);
break;
}
case nir_op_ishl: {
fs_reg dst = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
ubld1.UNDEF(dst);
fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
fs_reg src0 = nir_resource_insts[alu->src[0].src.ssa->index]->dst;
fs_reg src1 = nir_resource_insts[alu->src[1].src.ssa->index]->dst;
assert(src0.file != BAD_FILE && src1.file != BAD_FILE);
assert(src0.type == BRW_REGISTER_TYPE_UD);
nir_resource_insts[def->index] = ubld1.SHL(dst, src0, src1);
nir_resource_insts[def->index] = ubld8.SHL(dst, src0, src1);
break;
}
case nir_op_mov: {
@ -4138,11 +4132,10 @@ fs_visitor::try_rebuild_resource(const brw::fs_builder &bld, nir_def *resource_d
unsigned base_offset = nir_intrinsic_base(intrin);
unsigned load_offset = nir_src_as_uint(intrin->src[0]);
fs_reg dst = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
ubld1.UNDEF(dst);
fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
fs_reg src(UNIFORM, base_offset / 4, BRW_REGISTER_TYPE_UD);
src.offset = load_offset + base_offset % 4;
nir_resource_insts[def->index] = ubld1.MOV(dst, src);
nir_resource_insts[def->index] = ubld8.MOV(dst, src);
break;
}