From b9de19f917876d02a5100be74bd5cd5e72e91eaf Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Sat, 18 Jan 2025 00:48:10 -0800 Subject: [PATCH] brw: Eliminate the BTI source from MEMORY_FENCE/INTERLOCK opcodes Memory fences do not refer to an element of a binding table. Rather, the reason we had "BTI" in these opcodes was to distinguish what in modern terms are called UGM (untyped memory data cache) vs. SLM (cross-thread shared local memory) fences. Icelake and older platforms used the "data cache" SFID for both purposes, distinguishing them by having a special binding table index, 254, meaning "this is actually SLM access". This is where the notion that fences had BTIs came in. (In fact, prior to Icelake, separate SLM fences were not a thing, so BTI wasn't used there either.) To avoid confusion about BTI being involved, we choose a simpler lie: we have Icelake SLM fences target GFX12_SFID_SLM (like modern platforms would), even though it didn't really exist back then. Later lowering code sets it back to the correct Data Cache SFID with magic SLM binding table index. This eliminates BTI everywhere and an unnecessary source. Reviewed-by: Caio Oliveira Reviewed-by: Lionel Landwerlin Part-of: --- src/intel/compiler/brw_compile_mesh.cpp | 3 +-- src/intel/compiler/brw_eu.h | 3 +-- src/intel/compiler/brw_eu_defines.h | 4 ---- src/intel/compiler/brw_eu_emit.c | 21 ++++++++++++----- src/intel/compiler/brw_from_nir.cpp | 31 +++++++++++-------------- src/intel/compiler/brw_generator.cpp | 4 +--- src/intel/compiler/brw_validate.cpp | 2 ++ src/intel/compiler/brw_workaround.cpp | 3 +-- 8 files changed, 35 insertions(+), 36 deletions(-) diff --git a/src/intel/compiler/brw_compile_mesh.cpp b/src/intel/compiler/brw_compile_mesh.cpp index b2e81518a4c..5d08120f7cd 100644 --- a/src/intel/compiler/brw_compile_mesh.cpp +++ b/src/intel/compiler/brw_compile_mesh.cpp @@ -293,8 +293,7 @@ brw_emit_urb_fence(fs_visitor &s) brw_reg dst = bld1.vgrf(BRW_TYPE_UD); brw_inst *fence = bld1.emit(SHADER_OPCODE_MEMORY_FENCE, dst, brw_vec8_grf(0, 0), - brw_imm_ud(true), - brw_imm_ud(0)); + brw_imm_ud(true)); fence->size_written = REG_SIZE * reg_unit(s.devinfo); fence->sfid = BRW_SFID_URB; /* The logical thing here would likely be a THREADGROUP fence but that's diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h index 0040bb6377f..4832116df01 100644 --- a/src/intel/compiler/brw_eu.h +++ b/src/intel/compiler/brw_eu.h @@ -1523,8 +1523,7 @@ brw_memory_fence(struct brw_codegen *p, enum opcode send_op, enum brw_message_target sfid, uint32_t desc, - bool commit_enable, - unsigned bti); + bool commit_enable); void brw_broadcast(struct brw_codegen *p, diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h index 5c6a8e5fa06..9177cc98a94 100644 --- a/src/intel/compiler/brw_eu_defines.h +++ b/src/intel/compiler/brw_eu_defines.h @@ -337,10 +337,6 @@ enum opcode { * Source 0: Must be register g0, used as header. * Source 1: Immediate bool to indicate whether control is returned to the * thread only after the fence has been honored. - * Source 2: Immediate byte indicating which memory to fence. Zero means - * global memory; GFX7_BTI_SLM means SLM (for Gfx11+ only). - * - * Vec4 backend only uses Source 0. */ SHADER_OPCODE_MEMORY_FENCE, diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c index ed79303ea12..a27fd71d4d9 100644 --- a/src/intel/compiler/brw_eu_emit.c +++ b/src/intel/compiler/brw_eu_emit.c @@ -1692,14 +1692,25 @@ static void brw_set_memory_fence_message(struct brw_codegen *p, struct brw_eu_inst *insn, enum brw_message_target sfid, - bool commit_enable, - unsigned bti) + bool commit_enable) { const struct intel_device_info *devinfo = p->devinfo; brw_set_desc(p, insn, brw_message_desc( devinfo, 1, (commit_enable ? 1 : 0), true), false); + unsigned bti = 0; + if (sfid == GFX12_SFID_SLM) { + assert(devinfo->ver >= 11); + + /* This SFID doesn't exist on Gfx11-12.0, but we use it to represent + * SLM fences, and map back here to the way Gfx11-12 represented that: + * a special "SLM" binding table index and the data cache SFID. + */ + sfid = GFX7_SFID_DATAPORT_DATA_CACHE; + bti = GFX7_BTI_SLM; + } + brw_eu_inst_set_sfid(devinfo, insn, sfid); switch (sfid) { @@ -1716,7 +1727,6 @@ brw_set_memory_fence_message(struct brw_codegen *p, if (commit_enable) brw_eu_inst_set_dp_msg_control(devinfo, insn, 1 << 5); - assert(devinfo->ver >= 11 || bti == 0); brw_eu_inst_set_binding_table_index(devinfo, insn, bti); } @@ -1780,8 +1790,7 @@ brw_memory_fence(struct brw_codegen *p, enum opcode send_op, enum brw_message_target sfid, uint32_t desc, - bool commit_enable, - unsigned bti) + bool commit_enable) { const struct intel_device_info *devinfo = p->devinfo; @@ -1801,7 +1810,7 @@ brw_memory_fence(struct brw_codegen *p, if (devinfo->has_lsc) gfx12_set_memory_fence_message(p, insn, sfid, desc); else - brw_set_memory_fence_message(p, insn, sfid, commit_enable, bti); + brw_set_memory_fence_message(p, insn, sfid, commit_enable); } void diff --git a/src/intel/compiler/brw_from_nir.cpp b/src/intel/compiler/brw_from_nir.cpp index fb028d1c6d4..cc4bdf8365e 100644 --- a/src/intel/compiler/brw_from_nir.cpp +++ b/src/intel/compiler/brw_from_nir.cpp @@ -4967,7 +4967,7 @@ increment_a64_address(const brw_builder &_bld, brw_reg address, uint32_t v, bool static brw_reg emit_fence(const brw_builder &bld, enum opcode opcode, uint8_t sfid, uint32_t desc, - bool commit_enable, uint8_t bti) + bool commit_enable) { const struct intel_device_info *devinfo = bld.shader->devinfo; @@ -4976,8 +4976,7 @@ emit_fence(const brw_builder &bld, enum opcode opcode, brw_reg dst = bld.vgrf(BRW_TYPE_UD); brw_inst *fence = bld.emit(opcode, dst, brw_vec8_grf(0, 0), - brw_imm_ud(commit_enable), - brw_imm_ud(bti)); + brw_imm_ud(commit_enable)); fence->sfid = sfid; fence->desc = desc; fence->size_written = commit_enable ? REG_SIZE * reg_unit(devinfo) : 0; @@ -5993,15 +5992,13 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb, if (ugm_fence) { fence_regs[fence_regs_count++] = emit_fence(ubld1, opcode, GFX12_SFID_UGM, desc, - true /* commit_enable */, - 0 /* bti; ignored for LSC */); + true /* commit_enable */); } if (tgm_fence) { fence_regs[fence_regs_count++] = emit_fence(ubld1, opcode, GFX12_SFID_TGM, desc, - true /* commit_enable */, - 0 /* bti; ignored for LSC */); + true /* commit_enable */); } if (slm_fence) { @@ -6016,31 +6013,31 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb, } fence_regs[fence_regs_count++] = emit_fence(ubld1, opcode, GFX12_SFID_SLM, desc, - true /* commit_enable */, - 0 /* BTI; ignored for LSC */); + true /* commit_enable */); } if (urb_fence) { assert(opcode == SHADER_OPCODE_MEMORY_FENCE); fence_regs[fence_regs_count++] = emit_fence(ubld1, opcode, BRW_SFID_URB, desc, - true /* commit_enable */, - 0 /* BTI; ignored for LSC */); + true /* commit_enable */); } } else if (devinfo->ver >= 11) { if (tgm_fence || ugm_fence || urb_fence) { fence_regs[fence_regs_count++] = emit_fence(ubld1, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0, - true /* commit_enable HSD ES # 1404612949 */, - 0 /* BTI = 0 means data cache */); + true /* commit_enable HSD ES # 1404612949 */); } if (slm_fence) { assert(opcode == SHADER_OPCODE_MEMORY_FENCE); + /* We use the "SLM" SFID here even though it doesn't exist; + * the logical send lowering will replace it with the SLM + * special binding table index and the normal DATA_CACHE SFID. + */ fence_regs[fence_regs_count++] = - emit_fence(ubld1, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0, - true /* commit_enable HSD ES # 1404612949 */, - GFX7_BTI_SLM); + emit_fence(ubld1, opcode, GFX12_SFID_SLM, 0, + true /* commit_enable HSD ES # 1404612949 */); } } else { /* Simulation also complains on Gfx9 if we do not enable commit. @@ -6052,7 +6049,7 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb, if (tgm_fence || ugm_fence || slm_fence || urb_fence) { fence_regs[fence_regs_count++] = emit_fence(ubld1, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0, - commit_enable, 0 /* BTI */); + commit_enable); } } diff --git a/src/intel/compiler/brw_generator.cpp b/src/intel/compiler/brw_generator.cpp index c27cd968c4d..59669bef1c6 100644 --- a/src/intel/compiler/brw_generator.cpp +++ b/src/intel/compiler/brw_generator.cpp @@ -1196,7 +1196,6 @@ brw_generator::generate_code(const cfg_t *cfg, int dispatch_width, case SHADER_OPCODE_INTERLOCK: case SHADER_OPCODE_MEMORY_FENCE: { assert(src[1].file == IMM); - assert(src[2].file == IMM); const enum opcode send_op = inst->opcode == SHADER_OPCODE_INTERLOCK ? BRW_OPCODE_SENDC : BRW_OPCODE_SEND; @@ -1204,8 +1203,7 @@ brw_generator::generate_code(const cfg_t *cfg, int dispatch_width, brw_memory_fence(p, dst, src[0], send_op, brw_message_target(inst->sfid), inst->desc, - /* commit_enable */ src[1].ud, - /* bti */ src[2].ud); + /* commit_enable */ src[1].ud); send_count++; break; } diff --git a/src/intel/compiler/brw_validate.cpp b/src/intel/compiler/brw_validate.cpp index 12670dccb16..b6db795cf3e 100644 --- a/src/intel/compiler/brw_validate.cpp +++ b/src/intel/compiler/brw_validate.cpp @@ -318,6 +318,8 @@ brw_validate(const fs_visitor &s) case SHADER_OPCODE_INTERLOCK: fsv_assert(inst->exec_size == 1); fsv_assert(inst->force_writemask_all); + fsv_assert(inst->sources == 2); + fsv_assert(is_ud_imm(inst->src[1])); /* commit enable */ break; default: diff --git a/src/intel/compiler/brw_workaround.cpp b/src/intel/compiler/brw_workaround.cpp index c1f3e00b5a7..7b699610103 100644 --- a/src/intel/compiler/brw_workaround.cpp +++ b/src/intel/compiler/brw_workaround.cpp @@ -105,8 +105,7 @@ brw_workaround_memory_fence_before_eot(fs_visitor &s) brw_reg dst = ubld.vgrf(BRW_TYPE_UD); brw_inst *dummy_fence = ubld.emit(SHADER_OPCODE_MEMORY_FENCE, dst, brw_vec8_grf(0, 0), - /* commit enable */ brw_imm_ud(1), - /* bti */ brw_imm_ud(0)); + /* commit enable */ brw_imm_ud(1)); dummy_fence->sfid = GFX12_SFID_UGM; dummy_fence->desc = lsc_fence_msg_desc(s.devinfo, LSC_FENCE_TILE, LSC_FLUSH_TYPE_NONE_6, false);