brw: Don't emit redundant flags initialization for subgroup op lowering

No shader-db changes on any Intel platform.

fossil-db:

All Intel platforms had similar results. (Lunar Lake shown)
Totals:
Instrs: 233676039 -> 233675305 (-0.00%)
Cycle count: 32594097814 -> 32593658094 (-0.00%); split: -0.00%, +0.00%

Totals from 325 (0.04% of 789264) affected shaders:
Instrs: 104491 -> 103757 (-0.70%)
Cycle count: 1183870034 -> 1183430314 (-0.04%); split: -0.04%, +0.00%

Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35444>
This commit is contained in:
Ian Romanick 2025-06-09 15:21:05 -07:00 committed by Marge Bot
parent 4a238f461d
commit 3018849535

View file

@ -349,28 +349,59 @@ brw_lower_scan(brw_shader &s, brw_inst *inst)
return true;
}
/**
* Initialize flags bits for non-executing lanes to known value.
*
* If flags have already be initlized in this block, a redundant
* initialization will not be emitted.
*/
static brw_reg
brw_fill_flag(const brw_builder &bld, unsigned v)
brw_fill_flag(const brw_builder &bld, unsigned v, brw_inst *inst)
{
const intel_device_info *devinfo = bld.shader->devinfo;
const brw_builder ubld1 = bld.uniform();
brw_reg flag = brw_flag_reg(0, 0);
brw_reg value = bld.shader->dispatch_width == 32 ?
brw_imm_ud(v) : brw_imm_uw(v & 0xFFFF);
if (bld.shader->dispatch_width == 32) {
/* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
/* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
if (bld.shader->dispatch_width == 32)
flag = retype(flag, BRW_TYPE_UD);
ubld1.MOV(flag, brw_imm_ud(v));
} else {
ubld1.MOV(flag, brw_imm_uw(v & 0xFFFF));
foreach_inst_in_block_reverse_starting_from(brw_inst, scan_inst, inst) {
/* If an instruction is found that will initialize the flags to the expected
* values, no additional initialization is necessary.
*/
if (scan_inst->opcode == BRW_OPCODE_MOV &&
scan_inst->force_writemask_all &&
scan_inst->dst.equals(flag) &&
scan_inst->src[0].equals(value)) {
return flag;
}
/* If a flags write is encountered that might modify the bits that
* supposed to be initialized, stop the search.
*/
if ((scan_inst->force_writemask_all &&
scan_inst->flags_written(devinfo) != 0) ||
brw_reg_is_arf(scan_inst->dst, BRW_ARF_FLAG)) {
break;
}
}
ubld1.MOV(flag, value);
return flag;
}
static void
brw_lower_dispatch_width_vote(const brw_builder &bld, enum opcode opcode, brw_reg dst, brw_reg src)
brw_lower_dispatch_width_vote(const brw_builder &bld, brw_inst *inst)
{
const intel_device_info *devinfo = bld.shader->devinfo;
const unsigned dispatch_width = bld.shader->dispatch_width;
const enum opcode opcode = inst->opcode;
brw_reg dst = inst->dst;
brw_reg src = inst->src[0];
assert(opcode == SHADER_OPCODE_VOTE_ANY ||
opcode == SHADER_OPCODE_VOTE_ALL ||
@ -385,7 +416,7 @@ brw_lower_dispatch_width_vote(const brw_builder &bld, enum opcode opcode, brw_re
* dead channels from affecting the result, we initialize the flag with
* with the identity value for the logical operation.
*/
brw_fill_flag(bld, any ? 0 : 0xFFFFFFFF);
brw_fill_flag(bld, any ? 0 : 0xFFFFFFFF, inst);
bld.CMP(bld.null_reg_d(), src, ref, equal ? BRW_CONDITIONAL_Z
: BRW_CONDITIONAL_NZ);
@ -419,8 +450,12 @@ brw_lower_dispatch_width_vote(const brw_builder &bld, enum opcode opcode, brw_re
}
static void
brw_lower_quad_vote_gfx9(const brw_builder &bld, enum opcode opcode, brw_reg dst, brw_reg src)
brw_lower_quad_vote_gfx9(const brw_builder &bld, brw_inst *inst)
{
const enum opcode opcode = inst->opcode;
brw_reg dst = inst->dst;
brw_reg src = inst->src[0];
assert(opcode == SHADER_OPCODE_VOTE_ANY || opcode == SHADER_OPCODE_VOTE_ALL);
const bool any = opcode == SHADER_OPCODE_VOTE_ANY;
@ -428,7 +463,7 @@ brw_lower_quad_vote_gfx9(const brw_builder &bld, enum opcode opcode, brw_reg dst
* dead channels from affecting the result, we initialize the flag with
* with the identity value for the logical operation.
*/
brw_fill_flag(bld, any ? 0 : 0xFFFFFFFF);
brw_fill_flag(bld, any ? 0 : 0xFFFFFFFF, inst);
bld.CMP(bld.null_reg_ud(), src, brw_imm_ud(0u), BRW_CONDITIONAL_NZ);
bld.exec_all().MOV(retype(dst, BRW_TYPE_UD), brw_imm_ud(0));
@ -441,15 +476,19 @@ brw_lower_quad_vote_gfx9(const brw_builder &bld, enum opcode opcode, brw_reg dst
}
static void
brw_lower_quad_vote_gfx20(const brw_builder &bld, enum opcode opcode, brw_reg dst, brw_reg src)
brw_lower_quad_vote_gfx20(const brw_builder &bld, brw_inst *inst)
{
const enum opcode opcode = inst->opcode;
brw_reg dst = inst->dst;
brw_reg src = inst->src[0];
assert(opcode == SHADER_OPCODE_VOTE_ANY || opcode == SHADER_OPCODE_VOTE_ALL);
const bool any = opcode == SHADER_OPCODE_VOTE_ANY;
/* This code is going to manipulate the results of flag mask, so clear it to
* avoid any residual value from disabled channels.
*/
brw_reg flag = brw_fill_flag(bld, 0);
brw_reg flag = brw_fill_flag(bld, 0, inst);
/* Mask of invocations where condition is true, note that mask is
* replicated to each invocation.
@ -490,9 +529,6 @@ brw_lower_vote(brw_shader &s, brw_inst *inst)
{
const brw_builder bld(inst);
brw_reg dst = inst->dst;
brw_reg src = inst->src[0];
unsigned cluster_size;
if (inst->sources > 1) {
assert(inst->src[1].file == IMM);
@ -502,13 +538,13 @@ brw_lower_vote(brw_shader &s, brw_inst *inst)
}
if (cluster_size == s.dispatch_width) {
brw_lower_dispatch_width_vote(bld, inst->opcode, dst, src);
brw_lower_dispatch_width_vote(bld, inst);
} else {
assert(cluster_size == 4);
if (s.devinfo->ver < 20)
brw_lower_quad_vote_gfx9(bld, inst->opcode, dst, src);
brw_lower_quad_vote_gfx9(bld, inst);
else
brw_lower_quad_vote_gfx20(bld, inst->opcode, dst, src);
brw_lower_quad_vote_gfx20(bld, inst);
}
inst->remove();
@ -536,7 +572,7 @@ brw_lower_ballot(brw_shader &s, brw_inst *inst)
xbld.MOV(dst, zero);
}
} else {
brw_reg flag = brw_fill_flag(bld, 0);
brw_reg flag = brw_fill_flag(bld, 0, inst);
bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ);
xbld.MOV(dst, flag);
}