From 6281a128227fdbd6e10fe33c3d8d540bc05d7ffa Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Wed, 13 Aug 2025 14:05:43 -0700 Subject: [PATCH] brw: Remove brw_inst::no_dd_check/no_dd_clear These dependency hints were primarily useful for the vec4 backend, where it was common to write subsets of a vec4's components across multiple instructions. In the scalar backend, we rarely used them. They also no longer exist on Tigerlake and later in favor of software scoreboarding. Dropping this allows us to clean up the IR a bit. We still use the hardware hints in the generator in a couple places: - Gfx9-12.0 scratch headers - Quad swizzles - Indirect MOV lowering In theory we might want them back if we moved that lowering to the IR. For scratch at least, I suspect it won't have a huge impact, as we're already incurring the cost of spills/fills. The others are fairly rare as well, so it may not be worth keeping. Reviewed-by: Caio Oliveira Part-of: --- .../compiler/brw_analysis_performance.cpp | 34 ++++++----- src/intel/compiler/brw_generator.cpp | 8 +-- src/intel/compiler/brw_inst.h | 2 - src/intel/compiler/brw_lower_scoreboard.cpp | 57 +++++++++---------- src/intel/compiler/brw_reg_allocate.cpp | 8 --- 5 files changed, 45 insertions(+), 64 deletions(-) diff --git a/src/intel/compiler/brw_analysis_performance.cpp b/src/intel/compiler/brw_analysis_performance.cpp index 27a1e424299..ac25cbd7162 100644 --- a/src/intel/compiler/brw_analysis_performance.cpp +++ b/src/intel/compiler/brw_analysis_performance.cpp @@ -936,26 +936,24 @@ namespace { } /* Stall on any write dependencies. */ - if (!inst->no_dd_check) { - if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) { - for (unsigned j = 0; j < regs_written(inst); j++) - stall_on_dependency( - st, reg_dependency_id(devinfo, inst->dst, j)); - } + if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) { + for (unsigned j = 0; j < regs_written(inst); j++) + stall_on_dependency( + st, reg_dependency_id(devinfo, inst->dst, j)); + } - if (inst->writes_accumulator_implicitly(devinfo)) { - for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); - j <= accum_reg_of_channel(devinfo, inst, info.tx, - inst->exec_size - 1); j++) - stall_on_dependency( - st, reg_dependency_id(devinfo, brw_acc_reg(8), j)); - } + if (inst->writes_accumulator_implicitly(devinfo)) { + for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); + j <= accum_reg_of_channel(devinfo, inst, info.tx, + inst->exec_size - 1); j++) + stall_on_dependency( + st, reg_dependency_id(devinfo, brw_acc_reg(8), j)); + } - if (const unsigned mask = inst->flags_written(devinfo)) { - for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) { - if (mask & (1 << i)) - stall_on_dependency(st, flag_dependency_id(i)); - } + if (const unsigned mask = inst->flags_written(devinfo)) { + for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) { + if (mask & (1 << i)) + stall_on_dependency(st, flag_dependency_id(i)); } } diff --git a/src/intel/compiler/brw_generator.cpp b/src/intel/compiler/brw_generator.cpp index 85a4b460830..44124399fb4 100644 --- a/src/intel/compiler/brw_generator.cpp +++ b/src/intel/compiler/brw_generator.cpp @@ -1354,19 +1354,15 @@ brw_generator::generate_code(const brw_shader &s, if (multiple_instructions_emitted) continue; - if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) { + if (inst->conditional_mod) { assert(p->next_insn_offset == last_insn_offset + 16 || - !"conditional_mod, no_dd_check, or no_dd_clear set for IR " + !"conditional_mod for IR " "emitting more than 1 instruction"); brw_eu_inst *last = &p->store[last_insn_offset / 16]; if (inst->conditional_mod) brw_eu_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod); - if (devinfo->ver < 12) { - brw_eu_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear); - brw_eu_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check); - } } /* When enabled, insert sync NOP after every instruction and make sure diff --git a/src/intel/compiler/brw_inst.h b/src/intel/compiler/brw_inst.h index 39001bbac3a..944db25ed2d 100644 --- a/src/intel/compiler/brw_inst.h +++ b/src/intel/compiler/brw_inst.h @@ -194,8 +194,6 @@ public: bool predicate_inverse:1; bool writes_accumulator:1; /**< instruction implicitly writes accumulator */ bool force_writemask_all:1; - bool no_dd_clear:1; - bool no_dd_check:1; bool saturate:1; bool check_tdr:1; /**< Only valid for SEND; turns it into a SENDC */ bool send_has_side_effects:1; /**< Only valid for SHADER_OPCODE_SEND */ diff --git a/src/intel/compiler/brw_lower_scoreboard.cpp b/src/intel/compiler/brw_lower_scoreboard.cpp index 042b6a4d75b..f5a933603d8 100644 --- a/src/intel/compiler/brw_lower_scoreboard.cpp +++ b/src/intel/compiler/brw_lower_scoreboard.cpp @@ -1272,41 +1272,39 @@ namespace { add_dependency(ids, deps[ip], dependency(TGL_SBID_SET, ip, exec_all)); - if (!inst->no_dd_check) { - if (inst->dst.file != BAD_FILE && !inst->dst.is_null() && - !inst->dst.is_accumulator()) { - for (unsigned j = 0; j < regs_written(inst); j++) { - add_dependency(ids, deps[ip], dependency_for_write(devinfo, inst, - sb.get(byte_offset(inst->dst, REG_SIZE * j)))); - } + if (inst->dst.file != BAD_FILE && !inst->dst.is_null() && + !inst->dst.is_accumulator()) { + for (unsigned j = 0; j < regs_written(inst); j++) { + add_dependency(ids, deps[ip], dependency_for_write(devinfo, inst, + sb.get(byte_offset(inst->dst, REG_SIZE * j)))); } + } - if (inst->writes_accumulator_implicitly(devinfo) || - inst->dst.is_accumulator()) { - /* Wa_22012725308: - * - * "When the accumulator registers are used as source and/or - * destination, hardware does not ensure prevention of write - * after read hazard across execution pipes." - */ - const dependency dep = sb.get(brw_acc_reg(8)); + if (inst->writes_accumulator_implicitly(devinfo) || + inst->dst.is_accumulator()) { + /* Wa_22012725308: + * + * "When the accumulator registers are used as source and/or + * destination, hardware does not ensure prevention of write + * after read hazard across execution pipes." + */ + const dependency dep = sb.get(brw_acc_reg(8)); + if (dep.ordered && !is_single_pipe(dep.jp, p)) + add_dependency(ids, deps[ip], dep); + } + + /* flags_written returns a bit set per byte of the flags register + * file that is writtten. + */ + unsigned flags = inst->flags_written(devinfo); + for (unsigned i = 0; flags != 0; i++) { + if ((flags & 0x0f) != 0) { + const dependency dep = sb.get(brw_flag_reg(i, 0)); if (dep.ordered && !is_single_pipe(dep.jp, p)) add_dependency(ids, deps[ip], dep); } - /* flags_written returns a bit set per byte of the flags register - * file that is writtten. - */ - unsigned flags = inst->flags_written(devinfo); - for (unsigned i = 0; flags != 0; i++) { - if ((flags & 0x0f) != 0) { - const dependency dep = sb.get(brw_flag_reg(i, 0)); - if (dep.ordered && !is_single_pipe(dep.jp, p)) - add_dependency(ids, deps[ip], dep); - } - - flags >>= 4; - } + flags >>= 4; } update_inst_scoreboard(shader, jps, inst, ip, sb); @@ -1439,7 +1437,6 @@ namespace { /* Update the IR. */ inst->sched = swsb; - inst->no_dd_check = inst->no_dd_clear = false; ip++; } diff --git a/src/intel/compiler/brw_reg_allocate.cpp b/src/intel/compiler/brw_reg_allocate.cpp index 9170d0f60c3..dba7cb5ef6b 100644 --- a/src/intel/compiler/brw_reg_allocate.cpp +++ b/src/intel/compiler/brw_reg_allocate.cpp @@ -1275,14 +1275,6 @@ brw_reg_alloc::spill_reg(unsigned spill_reg) */ inst->dst.offset %= REG_SIZE * reg_unit(devinfo); - /* If we're immediately spilling the register, we should not use - * destination dependency hints. Doing so will cause the GPU do - * try to read and write the register at the same time and may - * hang the GPU. - */ - inst->no_dd_clear = false; - inst->no_dd_check = false; - /* Calculate the execution width of the scratch messages (which work * in terms of 32 bit components so we have a fixed number of eight * channels per spilled register). We attempt to write one