From fd744b0c8a495be49240590156a3af5e22fc9cf8 Mon Sep 17 00:00:00 2001
From: Lionel Landwerlin
Date: Mon, 12 Jan 2026 10:49:24 +0200
Subject: [PATCH] brw: switch buffer/image size intrinsics lowering to NIR
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fossil-db DG2:

Totals from 127 (0.01% of 1799288) affected shaders:
Instrs: 60593 -> 60508 (-0.14%); split: -0.15%, +0.01%
Cycle count: 7099635 -> 7116148 (+0.23%); split: -0.12%, +0.35%
Spill count: 468 -> 466 (-0.43%)
Fill count: 224 -> 222 (-0.89%)
Max live registers: 6418 -> 6424 (+0.09%); split: -0.06%, +0.16%
Non SSA regs after NIR: 11228 -> 11220 (-0.07%); split: -0.20%, +0.12%

Fossil-db LNL:

Totals from 135 (0.01% of 1573226) affected shaders:
Instrs: 55173 -> 55143 (-0.05%); split: -0.07%, +0.01%
Cycle count: 9178338 -> 9157052 (-0.23%); split: -0.32%, +0.09%
Spill count: 454 -> 452 (-0.44%)
Fill count: 181 -> 179 (-1.10%)
Max live registers: 12915 -> 12919 (+0.03%); split: -0.06%, +0.09%
Non SSA regs after NIR: 10860 -> 10852 (-0.07%); split: -0.20%, +0.13%

shader-db LNL:

total instructions in shared programs: 16911578 -> 16911566 (<.01%)
instructions in affected programs: 1602 -> 1590 (-0.75%)
helped: 7
HURT: 0
helped stats (abs) min: 1.0 max: 2.0 x̄: 1.71 x̃: 2
helped stats (rel) min: 0.48% max: 1.10% x̄: 0.75% x̃: 0.74%
95% mean confidence interval for instructions value: -2.17 -1.26
95% mean confidence interval for instructions %-change: -0.96% -0.55%
Instructions are helped.

total loops in shared programs: 5168 -> 5168 (0.00%)
loops in affected programs: 0 -> 0
helped: 0
HURT: 0

total cycles in shared programs: 848964184 -> 848955094 (<.01%)
cycles in affected programs: 1528020 -> 1518930 (-0.59%)
helped: 9
HURT: 6
helped stats (abs) min: 2.0 max: 8484.0 x̄: 1212.89 x̃: 20
helped stats (rel) min: 0.02% max: 3.23% x̄: 0.57% x̃: 0.11%
HURT stats (abs) min: 2.0 max: 1608.0 x̄: 304.33 x̃: 15
HURT stats (rel) min: <.01% max: 0.59% x̄: 0.19% x̃: 0.07%
95% mean confidence interval for cycles value: -1875.18 663.18
95% mean confidence interval for cycles %-change: -0.75% 0.23%
Inconclusive result (value mean confidence interval includes 0).

total spills in shared programs: 3345 -> 3345 (0.00%)
spills in affected programs: 0 -> 0
helped: 0
HURT: 0

total fills in shared programs: 1777 -> 1777 (0.00%)
fills in affected programs: 0 -> 0
helped: 0
HURT: 0

total sends in shared programs: 869299 -> 869299 (0.00%)
sends in affected programs: 0 -> 0
helped: 0
HURT: 0

LOST:   0
GAINED: 0

Signed-off-by: Lionel Landwerlin
Reviewed-by: Alyssa Rosenzweig
Part-of: 
---
 src/intel/compiler/brw/brw_from_nir.cpp      | 102 ------------------
 .../compiler/brw/brw_nir_lower_texture.c     |  88 +++++++++++++--
 2 files changed, 81 insertions(+), 109 deletions(-)

diff --git a/src/intel/compiler/brw/brw_from_nir.cpp b/src/intel/compiler/brw/brw_from_nir.cpp
index c3a564142f7..d68a9398973 100644
--- a/src/intel/compiler/brw/brw_from_nir.cpp
+++ b/src/intel/compiler/brw/brw_from_nir.cpp
@@ -5076,47 +5076,6 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
       break;
    }
 
-   case nir_intrinsic_image_size:
-   case nir_intrinsic_bindless_image_size: {
-      /* Cube image sizes should have previously been lowered to a 2D array */
-      assert(nir_intrinsic_image_dim(instr) != GLSL_SAMPLER_DIM_CUBE);
-
-      /* Unlike the [un]typed load and store opcodes, the TXS that this turns
-       * into will handle the binding table index for us in the geneerator.
-       * Incidentally, this means that we can handle bindless with exactly the
-       * same code.
-       */
-      brw_reg image = retype(get_nir_src_imm(ntb, instr->src[0]), BRW_TYPE_UD);
-      image = bld.emit_uniformize(image);
-
-      assert(nir_src_as_uint(instr->src[1]) == 0);
-
-      brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
-      srcs[TEX_LOGICAL_SRC_SURFACE] = image;
-      srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0);
-      srcs[TEX_LOGICAL_SRC_PAYLOAD0] = brw_imm_d(0); /* LOD (required) */
-
-      /* Since the image size is always uniform, we can just emit a SIMD8
-       * query instruction and splat the result out.
-       */
-      const brw_builder ubld = bld.scalar_group();
-
-      brw_reg tmp = ubld.vgrf(BRW_TYPE_UD, 4);
-      brw_tex_inst *inst = ubld.emit(SHADER_OPCODE_SAMPLER,
-                                     tmp, srcs,
-                                     TEX_LOGICAL_SRC_PAYLOAD0 + 1)->as_tex();
-      inst->required_params = 0x1 /* LOD */;
-      inst->sampler_opcode = BRW_SAMPLER_OPCODE_RESINFO;
-      inst->surface_bindless = instr->intrinsic == nir_intrinsic_bindless_image_size;
-      inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
-
-      for (unsigned c = 0; c < instr->def.num_components; ++c) {
-         bld.MOV(offset(retype(dest, tmp.type), bld, c),
-                 component(offset(tmp, ubld, c), 0));
-      }
-      break;
-   }
-
    case nir_intrinsic_barrier:
    case nir_intrinsic_begin_invocation_interlock:
    case nir_intrinsic_end_invocation_interlock: {
@@ -5570,67 +5529,6 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
       break;
    }
 
-   case nir_intrinsic_get_ssbo_size: {
-      assert(nir_src_num_components(instr->src[0]) == 1);
-
-      /* A resinfo's sampler message is used to get the buffer size. The
-       * SIMD8's writeback message consists of four registers and SIMD16's
-       * writeback message consists of 8 destination registers (two per each
-       * component). Because we are only interested on the first channel of
-       * the first returned component, where resinfo returns the buffer size
-       * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
-       * the dispatch width.
-       */
-      const brw_builder ubld = bld.scalar_group();
-
-      brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
-      srcs[TEX_LOGICAL_SRC_SURFACE] = get_nir_buffer_intrinsic_index(ntb, bld, instr);
-      srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0);
-      srcs[TEX_LOGICAL_SRC_PAYLOAD0] = brw_imm_d(0); /* LOD (required) */
-
-      brw_reg tmp = ubld.vgrf(BRW_TYPE_UD, 4);
-      brw_tex_inst *inst = ubld.emit(SHADER_OPCODE_SAMPLER,
-                                     tmp, srcs,
-                                     TEX_LOGICAL_SRC_PAYLOAD0 + 1)->as_tex();
-      inst->required_params = 0x1 /* LOD */;
-      inst->sampler_opcode = BRW_SAMPLER_OPCODE_RESINFO;
-      inst->surface_bindless = get_nir_src_bindless(ntb, instr->src[0]);
-      inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
-      inst->fused_eu_disable =
-         (nir_intrinsic_access(instr) & ACCESS_FUSED_EU_DISABLE_INTEL) != 0;
-
-      for (unsigned c = 0; c < instr->def.num_components; ++c) {
-         bld.MOV(offset(retype(dest, tmp.type), bld, c),
-                 component(offset(tmp, ubld, c), 0));
-      }
-
-      /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
-       *
-       * "Out-of-bounds checking is always performed at a DWord granularity. If
-       * any part of the DWord is out-of-bounds then the whole DWord is
-       * considered out-of-bounds."
-       *
-       * This implies that types with size smaller than 4-bytes need to be
-       * padded if they don't complete the last dword of the buffer. But as we
-       * need to maintain the original size we need to reverse the padding
-       * calculation to return the correct size to know the number of elements
-       * of an unsized array. As we stored in the last two bits of the surface
-       * size the needed padding for the buffer, we calculate here the
-       * original buffer_size reversing the surface_size calculation:
-       *
-       * surface_size = isl_align(buffer_size, 4) +
-       *                (isl_align(buffer_size) - buffer_size)
-       *
-       * buffer_size = surface_size & ~3 - surface_size & 3
-       */
-      brw_reg size_padding = ubld.AND(tmp, brw_imm_ud(3));
-      brw_reg size_aligned4 = ubld.AND(tmp, brw_imm_ud(~3));
-      brw_reg buffer_size = ubld.ADD(size_aligned4, negate(size_padding));
-
-      bld.MOV(retype(dest, tmp.type), component(buffer_size, 0));
-      break;
-   }
-
    case nir_intrinsic_load_subgroup_size:
       /* This should only happen for fragment shaders because every other case
        * is lowered in NIR so we can optimize on it.
diff --git a/src/intel/compiler/brw/brw_nir_lower_texture.c b/src/intel/compiler/brw/brw_nir_lower_texture.c
index e6d17d3d416..5252244e940 100644
--- a/src/intel/compiler/brw/brw_nir_lower_texture.c
+++ b/src/intel/compiler/brw/brw_nir_lower_texture.c
@@ -36,9 +36,7 @@
  * to sample_po_c_l instead.
  */
 static bool
-pre_lower_texture_instr(nir_builder *b,
-                        nir_tex_instr *tex,
-                        void *data)
+pre_lower_tex_instr(nir_builder *b, nir_tex_instr *tex)
 {
    switch (tex->op) {
    case nir_texop_txb: {
@@ -87,13 +85,89 @@ pre_lower_texture_instr(nir_builder *b,
    }
 }
 
+/* Lower size intrinsic to use the sampler. */
+static bool
+pre_lower_intrinsic_instr(nir_builder *b, nir_intrinsic_instr *intrin)
+{
+   enum glsl_sampler_dim dim = GLSL_SAMPLER_DIM_BUF;
+   bool is_array = false;
+
+   switch (intrin->intrinsic) {
+   case nir_intrinsic_get_ssbo_size:
+      break;
+
+   case nir_intrinsic_bindless_image_size:
+   case nir_intrinsic_image_size:
+      dim = nir_intrinsic_image_dim(intrin);
+      is_array = nir_intrinsic_image_array(intrin);
+      break;
+
+   default:
+      return false;
+   }
+
+   b->cursor = nir_before_instr(&intrin->instr);
+
+   nir_src *surface = nir_get_io_index_src(intrin);
+   nir_intrinsic_instr *rsrc = nir_src_as_intrinsic(*surface);
+
+   bool bindless = rsrc && (nir_intrinsic_resource_access_intel(rsrc) &
+                            nir_resource_intel_bindless);
+
+   nir_def *txs = nir_txs(b, .lod = nir_imm_int(b, 0),
+                          .dim = dim, .is_array = is_array,
+                          .texture_offset = bindless ? NULL : surface->ssa,
+                          .texture_handle = bindless ? surface->ssa : NULL);
+
+   /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
+    *
+    * "Out-of-bounds checking is always performed at a DWord granularity. If
+    * any part of the DWord is out-of-bounds then the whole DWord is
+    * considered out-of-bounds."
+    *
+    * This implies that types with size smaller than 4-bytes need to be
+    * padded if they don't complete the last dword of the buffer. But as we
+    * need to maintain the original size we need to reverse the padding
+    * calculation to return the correct size to know the number of elements
+    * of an unsized array. As we stored in the last two bits of the surface
+    * size the needed padding for the buffer, we calculate here the
+    * original buffer_size reversing the surface_size calculation:
+    *
+    * surface_size = isl_align(buffer_size, 4) +
+    *                (isl_align(buffer_size) - buffer_size)
+    *
+    * buffer_size = surface_size & ~3 - surface_size & 3
+    */
+   if (intrin->intrinsic == nir_intrinsic_get_ssbo_size)
+      txs = nir_isub(b, txs, nir_imul_imm(b, nir_iand_imm(b, txs, 3), 2));
+
+   nir_def_replace(&intrin->def, txs);
+
+   return true;
+}
+
+static bool
+pre_lower_texture_instr(nir_builder *b, nir_instr *instr, void *data)
+{
+   switch (instr->type) {
+   case nir_instr_type_tex:
+      return pre_lower_tex_instr(b, nir_instr_as_tex(instr));
+
+   case nir_instr_type_intrinsic:
+      return pre_lower_intrinsic_instr(b, nir_instr_as_intrinsic(instr));
+
+   default:
+      return false;
+   }
+}
+
 bool
 brw_nir_pre_lower_texture(nir_shader *shader)
 {
-   return nir_shader_tex_pass(shader,
-                              pre_lower_texture_instr,
-                              nir_metadata_control_flow,
-                              NULL);
+   return nir_shader_instructions_pass(shader,
+                                       pre_lower_texture_instr,
+                                       nir_metadata_control_flow,
+                                       NULL);
 }
 
 /**