brw: switch buffer/image size intrinsics lowering to NIR

Fossil-db DG2: Totals from 127 (0.01% of 1799288) affected shaders: Instrs: 60593 -> 60508 (-0.14%); split: -0.15%, +0.01% Cycle count: 7099635 -> 7116148 (+0.23%); split: -0.12%, +0.35% Spill count: 468 -> 466 (-0.43%) Fill count: 224 -> 222 (-0.89%) Max live registers: 6418 -> 6424 (+0.09%); split: -0.06%, +0.16% Non SSA regs after NIR: 11228 -> 11220 (-0.07%); split: -0.20%, +0.12% Fossil-db LNL: Totals from 135 (0.01% of 1573226) affected shaders: Instrs: 55173 -> 55143 (-0.05%); split: -0.07%, +0.01% Cycle count: 9178338 -> 9157052 (-0.23%); split: -0.32%, +0.09% Spill count: 454 -> 452 (-0.44%) Fill count: 181 -> 179 (-1.10%) Max live registers: 12915 -> 12919 (+0.03%); split: -0.06%, +0.09% Non SSA regs after NIR: 10860 -> 10852 (-0.07%); split: -0.20%, +0.13% shader-db LNL: total instructions in shared programs: 16911578 -> 16911566 (<.01%) instructions in affected programs: 1602 -> 1590 (-0.75%) helped: 7 HURT: 0 helped stats (abs) min: 1.0 max: 2.0 x̄: 1.71 x̃: 2 helped stats (rel) min: 0.48% max: 1.10% x̄: 0.75% x̃: 0.74% 95% mean confidence interval for instructions value: -2.17 -1.26 95% mean confidence interval for instructions %-change: -0.96% -0.55% Instructions are helped. total loops in shared programs: 5168 -> 5168 (0.00%) loops in affected programs: 0 -> 0 helped: 0 HURT: 0 total cycles in shared programs: 848964184 -> 848955094 (<.01%) cycles in affected programs: 1528020 -> 1518930 (-0.59%) helped: 9 HURT: 6 helped stats (abs) min: 2.0 max: 8484.0 x̄: 1212.89 x̃: 20 helped stats (rel) min: 0.02% max: 3.23% x̄: 0.57% x̃: 0.11% HURT stats (abs) min: 2.0 max: 1608.0 x̄: 304.33 x̃: 15 HURT stats (rel) min: <.01% max: 0.59% x̄: 0.19% x̃: 0.07% 95% mean confidence interval for cycles value: -1875.18 663.18 95% mean confidence interval for cycles %-change: -0.75% 0.23% Inconclusive result (value mean confidence interval includes 0). 
total spills in shared programs: 3345 -> 3345 (0.00%) spills in affected programs: 0 -> 0 helped: 0 HURT: 0 total fills in shared programs: 1777 -> 1777 (0.00%) fills in affected programs: 0 -> 0 helped: 0 HURT: 0 total sends in shared programs: 869299 -> 869299 (0.00%) sends in affected programs: 0 -> 0 helped: 0 HURT: 0 LOST: 0 GAINED: 0 Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39258>
2026-01-27 01:10:25 +01:00 · 2026-01-12 10:49:24 +02:00 · 2026-01-12 10:49:24 +02:00 · fd744b0c8a
commit fd744b0c8a
parent 6293137d77
2 changed files with 81 additions and 109 deletions
--- a/src/intel/compiler/brw/brw_from_nir.cpp
+++ b/src/intel/compiler/brw/brw_from_nir.cpp
@ -5076,47 +5076,6 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
      break;
   }

-   case nir_intrinsic_image_size:
-   case nir_intrinsic_bindless_image_size: {
-      /* Cube image sizes should have previously been lowered to a 2D array */
-      assert(nir_intrinsic_image_dim(instr) != GLSL_SAMPLER_DIM_CUBE);
-
-      /* Unlike the [un]typed load and store opcodes, the TXS that this turns
-       * into will handle the binding table index for us in the geneerator.
-       * Incidentally, this means that we can handle bindless with exactly the
-       * same code.
-       */
-      brw_reg image = retype(get_nir_src_imm(ntb, instr->src[0]), BRW_TYPE_UD);
-      image = bld.emit_uniformize(image);
-
-      assert(nir_src_as_uint(instr->src[1]) == 0);
-
-      brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
-      srcs[TEX_LOGICAL_SRC_SURFACE] = image;
-      srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0);
-      srcs[TEX_LOGICAL_SRC_PAYLOAD0] = brw_imm_d(0); /* LOD (required) */
-
-      /* Since the image size is always uniform, we can just emit a SIMD8
-       * query instruction and splat the result out.
-       */
-      const brw_builder ubld = bld.scalar_group();
-
-      brw_reg tmp = ubld.vgrf(BRW_TYPE_UD, 4);
-      brw_tex_inst *inst = ubld.emit(SHADER_OPCODE_SAMPLER,
-                                     tmp, srcs,
-                                     TEX_LOGICAL_SRC_PAYLOAD0 + 1)->as_tex();
-      inst->required_params = 0x1 /* LOD */;
-      inst->sampler_opcode = BRW_SAMPLER_OPCODE_RESINFO;
-      inst->surface_bindless = instr->intrinsic == nir_intrinsic_bindless_image_size;
-      inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
-
-      for (unsigned c = 0; c < instr->def.num_components; ++c) {
-         bld.MOV(offset(retype(dest, tmp.type), bld, c),
-                 component(offset(tmp, ubld, c), 0));
-      }
-      break;
-   }
-
   case nir_intrinsic_barrier:
   case nir_intrinsic_begin_invocation_interlock:
   case nir_intrinsic_end_invocation_interlock: {
@ -5570,67 +5529,6 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
      break;
   }

-   case nir_intrinsic_get_ssbo_size: {
-      assert(nir_src_num_components(instr->src[0]) == 1);
-
-      /* A resinfo's sampler message is used to get the buffer size.  The
-       * SIMD8's writeback message consists of four registers and SIMD16's
-       * writeback message consists of 8 destination registers (two per each
-       * component).  Because we are only interested on the first channel of
-       * the first returned component, where resinfo returns the buffer size
-       * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
-       * the dispatch width.
-       */
-      const brw_builder ubld = bld.scalar_group();
-
-      brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
-      srcs[TEX_LOGICAL_SRC_SURFACE] = get_nir_buffer_intrinsic_index(ntb, bld, instr);
-      srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0);
-      srcs[TEX_LOGICAL_SRC_PAYLOAD0] = brw_imm_d(0); /* LOD (required) */
-
-      brw_reg tmp = ubld.vgrf(BRW_TYPE_UD, 4);
-      brw_tex_inst *inst = ubld.emit(SHADER_OPCODE_SAMPLER,
-                                     tmp, srcs,
-                                     TEX_LOGICAL_SRC_PAYLOAD0 + 1)->as_tex();
-      inst->required_params = 0x1 /* LOD */;
-      inst->sampler_opcode = BRW_SAMPLER_OPCODE_RESINFO;
-      inst->surface_bindless = get_nir_src_bindless(ntb, instr->src[0]);
-      inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
-      inst->fused_eu_disable =
-         (nir_intrinsic_access(instr) & ACCESS_FUSED_EU_DISABLE_INTEL) != 0;
-
-      for (unsigned c = 0; c < instr->def.num_components; ++c) {
-         bld.MOV(offset(retype(dest, tmp.type), bld, c),
-                 component(offset(tmp, ubld, c), 0));
-      }
-
-      /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
-       *
-       * "Out-of-bounds checking is always performed at a DWord granularity. If
-       * any part of the DWord is out-of-bounds then the whole DWord is
-       * considered out-of-bounds."
-       *
-       * This implies that types with size smaller than 4-bytes need to be
-       * padded if they don't complete the last dword of the buffer. But as we
-       * need to maintain the original size we need to reverse the padding
-       * calculation to return the correct size to know the number of elements
-       * of an unsized array. As we stored in the last two bits of the surface
-       * size the needed padding for the buffer, we calculate here the
-       * original buffer_size reversing the surface_size calculation:
-       *
-       * surface_size = isl_align(buffer_size, 4) +
-       *                (isl_align(buffer_size) - buffer_size)
-       *
-       * buffer_size = surface_size & ~3 - surface_size & 3
-       */
-      brw_reg size_padding  = ubld.AND(tmp, brw_imm_ud(3));
-      brw_reg size_aligned4 = ubld.AND(tmp, brw_imm_ud(~3));
-      brw_reg buffer_size   = ubld.ADD(size_aligned4, negate(size_padding));
-
-      bld.MOV(retype(dest, tmp.type), component(buffer_size, 0));
-      break;
-   }
-
   case nir_intrinsic_load_subgroup_size:
      /* This should only happen for fragment shaders because every other case
       * is lowered in NIR so we can optimize on it.
--- a/src/intel/compiler/brw/brw_nir_lower_texture.c
+++ b/src/intel/compiler/brw/brw_nir_lower_texture.c
@ -36,9 +36,7 @@
 *      to sample_po_c_l instead.
 */
 static bool
-pre_lower_texture_instr(nir_builder *b,
-                        nir_tex_instr *tex,
-                        void *data)
+pre_lower_tex_instr(nir_builder *b, nir_tex_instr *tex)
 {
   switch (tex->op) {
   case nir_texop_txb: {
@ -87,13 +85,89 @@ pre_lower_texture_instr(nir_builder *b,
   }
 }

+/* Lower buffer and image size queries to a txs texture query.
+ *
+ * Handles get_ssbo_size, image_size and bindless_image_size; every other
+ * intrinsic is left untouched. The new instructions are emitted right before
+ * the intrinsic, whose result is then replaced through nir_def_replace().
+ *
+ * Returns true when the intrinsic was replaced, false otherwise.
+ */
+static bool
+pre_lower_intrinsic_instr(nir_builder *b, nir_intrinsic_instr *intrin)
+{
+   /* get_ssbo_size is queried as a non-arrayed buffer; the image variants
+    * carry their own dimensionality.
+    */
+   enum glsl_sampler_dim dim = GLSL_SAMPLER_DIM_BUF;
+   bool is_array = false;
+
+   switch (intrin->intrinsic) {
+   case nir_intrinsic_get_ssbo_size:
+      break;
+
+   case nir_intrinsic_bindless_image_size:
+   case nir_intrinsic_image_size:
+      dim = nir_intrinsic_image_dim(intrin);
+      is_array = nir_intrinsic_image_array(intrin);
+      break;
+
+   default:
+      return false;
+   }
+
+   b->cursor = nir_before_instr(&intrin->instr);
+
+   nir_src *surface = nir_get_io_index_src(intrin);
+   nir_intrinsic_instr *rsrc = nir_src_as_intrinsic(*surface);
+
+   /* The surface source may be produced by any intrinsic, but only
+    * resource_intel has a resource_access_intel index, so check the opcode
+    * before reading it. A bindless_image_size source is a bindless handle
+    * regardless of what produced it.
+    */
+   const bool rsrc_bindless =
+      rsrc != NULL &&
+      rsrc->intrinsic == nir_intrinsic_resource_intel &&
+      (nir_intrinsic_resource_access_intel(rsrc) &
+       nir_resource_intel_bindless) != 0;
+   const bool bindless =
+      rsrc_bindless ||
+      intrin->intrinsic == nir_intrinsic_bindless_image_size;
+
+   /* The size is always queried at LOD 0. */
+   nir_def *txs = nir_txs(b, .lod = nir_imm_int(b, 0),
+                             .dim = dim, .is_array = is_array,
+                             .texture_offset = bindless ? NULL : surface->ssa,
+                             .texture_handle = bindless ? surface->ssa : NULL);
+
+   /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
+    *
+    * "Out-of-bounds checking is always performed at a DWord granularity. If
+    * any part of the DWord is out-of-bounds then the whole DWord is
+    * considered out-of-bounds."
+    *
+    * This implies that types with size smaller than 4-bytes need to be
+    * padded if they don't complete the last dword of the buffer. But as we
+    * need to maintain the original size we need to reverse the padding
+    * calculation to return the correct size to know the number of elements
+    * of an unsized array. As we stored in the last two bits of the surface
+    * size the needed padding for the buffer, we calculate here the
+    * original buffer_size reversing the surface_size calculation:
+    *
+    * surface_size = isl_align(buffer_size, 4) +
+    *                (isl_align(buffer_size, 4) - buffer_size)
+    *
+    * buffer_size = (surface_size & ~3) - (surface_size & 3)
+    *
+    * Since surface_size == (surface_size & ~3) + (surface_size & 3), this is
+    * the same as surface_size - 2 * (surface_size & 3), which is the form
+    * emitted below.
+    */
+   if (intrin->intrinsic == nir_intrinsic_get_ssbo_size)
+      txs = nir_isub(b, txs, nir_imul_imm(b, nir_iand_imm(b, txs, 3), 2));
+
+   nir_def_replace(&intrin->def, txs);
+
+   return true;
+}
+
+
+/* Per-instruction callback: forwards texture instructions and intrinsics to
+ * their dedicated pre-lowering helpers and reports their result. The data
+ * pointer is unused.
+ */
+static bool
+pre_lower_texture_instr(nir_builder *b, nir_instr *instr, void *data)
+{
+   if (instr->type == nir_instr_type_tex)
+      return pre_lower_tex_instr(b, nir_instr_as_tex(instr));
+
+   if (instr->type == nir_instr_type_intrinsic)
+      return pre_lower_intrinsic_instr(b, nir_instr_as_intrinsic(instr));
+
+   /* Any other instruction type is left untouched. */
+   return false;
+}
+
+/* Entry point of the pre-lowering pass: runs pre_lower_texture_instr over the
+ * shader's instructions (not only texture instructions), with
+ * nir_metadata_control_flow as the metadata argument and no callback data.
+ * Returns the value reported by nir_shader_instructions_pass (presumably
+ * whether any instruction was changed).
+ */
 bool
 brw_nir_pre_lower_texture(nir_shader *shader)
 {
-   return nir_shader_tex_pass(shader,
-                              pre_lower_texture_instr,
-                              nir_metadata_control_flow,
-                              NULL);
+   return nir_shader_instructions_pass(shader,
+                                       pre_lower_texture_instr,
+                                       nir_metadata_control_flow,
+                                       NULL);
 }

 /**