brw: Remove brw_inst::no_dd_check/no_dd_clear

These dependency hints were primarily useful for the vec4 backend, where
it was common to write subsets of a vec4's components across multiple
instructions.  In the scalar backend, we rarely used them.  They also no
longer exist on Tigerlake and later in favor of software scoreboarding.

Dropping this allows us to clean up the IR a bit.

We still use the hardware hints in the generator in a couple places:

   - Gfx9-12.0 scratch headers
   - Quad swizzles
   - Indirect MOV lowering

In theory we might want them back if we moved that lowering to the IR.
For scratch at least, I suspect it won't have a huge impact, as we're
already incurring the cost of spills/fills.  The others are fairly rare
as well, so it may not be worth keeping.

Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36730>
This commit is contained in:
Kenneth Graunke 2025-08-13 14:05:43 -07:00 committed by Marge Bot
parent b26c68e40a
commit 6281a12822
5 changed files with 45 additions and 64 deletions

View file

@ -936,26 +936,24 @@ namespace {
}
/* Stall on any write dependencies. */
if (!inst->no_dd_check) {
if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
for (unsigned j = 0; j < regs_written(inst); j++)
stall_on_dependency(
st, reg_dependency_id(devinfo, inst->dst, j));
}
if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
for (unsigned j = 0; j < regs_written(inst); j++)
stall_on_dependency(
st, reg_dependency_id(devinfo, inst->dst, j));
}
if (inst->writes_accumulator_implicitly(devinfo)) {
for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
j <= accum_reg_of_channel(devinfo, inst, info.tx,
inst->exec_size - 1); j++)
stall_on_dependency(
st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
}
if (inst->writes_accumulator_implicitly(devinfo)) {
for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
j <= accum_reg_of_channel(devinfo, inst, info.tx,
inst->exec_size - 1); j++)
stall_on_dependency(
st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
}
if (const unsigned mask = inst->flags_written(devinfo)) {
for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
if (mask & (1 << i))
stall_on_dependency(st, flag_dependency_id(i));
}
if (const unsigned mask = inst->flags_written(devinfo)) {
for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
if (mask & (1 << i))
stall_on_dependency(st, flag_dependency_id(i));
}
}

View file

@ -1354,19 +1354,15 @@ brw_generator::generate_code(const brw_shader &s,
if (multiple_instructions_emitted)
continue;
if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
if (inst->conditional_mod) {
assert(p->next_insn_offset == last_insn_offset + 16 ||
!"conditional_mod, no_dd_check, or no_dd_clear set for IR "
!"conditional_mod for IR "
"emitting more than 1 instruction");
brw_eu_inst *last = &p->store[last_insn_offset / 16];
if (inst->conditional_mod)
brw_eu_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
if (devinfo->ver < 12) {
brw_eu_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
brw_eu_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
}
}
/* When enabled, insert sync NOP after every instruction and make sure

View file

@ -194,8 +194,6 @@ public:
bool predicate_inverse:1;
bool writes_accumulator:1; /**< instruction implicitly writes accumulator */
bool force_writemask_all:1;
bool no_dd_clear:1;
bool no_dd_check:1;
bool saturate:1;
bool check_tdr:1; /**< Only valid for SEND; turns it into a SENDC */
bool send_has_side_effects:1; /**< Only valid for SHADER_OPCODE_SEND */

View file

@ -1272,41 +1272,39 @@ namespace {
add_dependency(ids, deps[ip],
dependency(TGL_SBID_SET, ip, exec_all));
if (!inst->no_dd_check) {
if (inst->dst.file != BAD_FILE && !inst->dst.is_null() &&
!inst->dst.is_accumulator()) {
for (unsigned j = 0; j < regs_written(inst); j++) {
add_dependency(ids, deps[ip], dependency_for_write(devinfo, inst,
sb.get(byte_offset(inst->dst, REG_SIZE * j))));
}
if (inst->dst.file != BAD_FILE && !inst->dst.is_null() &&
!inst->dst.is_accumulator()) {
for (unsigned j = 0; j < regs_written(inst); j++) {
add_dependency(ids, deps[ip], dependency_for_write(devinfo, inst,
sb.get(byte_offset(inst->dst, REG_SIZE * j))));
}
}
if (inst->writes_accumulator_implicitly(devinfo) ||
inst->dst.is_accumulator()) {
/* Wa_22012725308:
*
* "When the accumulator registers are used as source and/or
* destination, hardware does not ensure prevention of write
* after read hazard across execution pipes."
*/
const dependency dep = sb.get(brw_acc_reg(8));
if (inst->writes_accumulator_implicitly(devinfo) ||
inst->dst.is_accumulator()) {
/* Wa_22012725308:
*
* "When the accumulator registers are used as source and/or
* destination, hardware does not ensure prevention of write
* after read hazard across execution pipes."
*/
const dependency dep = sb.get(brw_acc_reg(8));
if (dep.ordered && !is_single_pipe(dep.jp, p))
add_dependency(ids, deps[ip], dep);
}
/* flags_written returns a bit set per byte of the flags register
* file that is writtten.
*/
unsigned flags = inst->flags_written(devinfo);
for (unsigned i = 0; flags != 0; i++) {
if ((flags & 0x0f) != 0) {
const dependency dep = sb.get(brw_flag_reg(i, 0));
if (dep.ordered && !is_single_pipe(dep.jp, p))
add_dependency(ids, deps[ip], dep);
}
/* flags_written returns a bit set per byte of the flags register
* file that is writtten.
*/
unsigned flags = inst->flags_written(devinfo);
for (unsigned i = 0; flags != 0; i++) {
if ((flags & 0x0f) != 0) {
const dependency dep = sb.get(brw_flag_reg(i, 0));
if (dep.ordered && !is_single_pipe(dep.jp, p))
add_dependency(ids, deps[ip], dep);
}
flags >>= 4;
}
flags >>= 4;
}
update_inst_scoreboard(shader, jps, inst, ip, sb);
@ -1439,7 +1437,6 @@ namespace {
/* Update the IR. */
inst->sched = swsb;
inst->no_dd_check = inst->no_dd_clear = false;
ip++;
}

View file

@ -1275,14 +1275,6 @@ brw_reg_alloc::spill_reg(unsigned spill_reg)
*/
inst->dst.offset %= REG_SIZE * reg_unit(devinfo);
/* If we're immediately spilling the register, we should not use
* destination dependency hints. Doing so will cause the GPU do
* try to read and write the register at the same time and may
* hang the GPU.
*/
inst->no_dd_clear = false;
inst->no_dd_check = false;
/* Calculate the execution width of the scratch messages (which work
* in terms of 32 bit components so we have a fixed number of eight
* channels per spilled register). We attempt to write one