anv/brw: remove push constant load emulation from the backend compiler
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

Anv is responsible for much of how the data is accessed (where the
push constant base pointer is located, etc.), so move the memory load
there.

Fossildb on LNL:
Totals from 135931 (8.65% of 1572134) affected shaders:
Instrs: 68518228 -> 67142101 (-2.01%); split: -2.05%, +0.05%
CodeSize: 1123507040 -> 1092022560 (-2.80%); split: -2.88%, +0.08%
Subgroup size: 32 -> 16 (-50.00%)
Send messages: 4401584 -> 4402565 (+0.02%); split: -0.02%, +0.04%
Cycle count: 4626573038 -> 4619434858 (-0.15%); split: -0.89%, +0.74%
Spill count: 451759 -> 452407 (+0.14%); split: -0.43%, +0.57%
Fill count: 374513 -> 377440 (+0.78%); split: -0.76%, +1.54%
Max live registers: 15788042 -> 15791399 (+0.02%); split: -0.05%, +0.08%
Max dispatch width: 3349408 -> 3346192 (-0.10%); split: +0.09%, -0.19%
Non SSA regs after NIR: 9477038 -> 9498328 (+0.22%); split: -0.27%, +0.50%

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40174>
This commit is contained in:
Lionel Landwerlin 2026-02-20 12:14:44 +02:00 committed by Marge Bot
parent 38cc622d8b
commit 9f2215b480
3 changed files with 84 additions and 141 deletions

View file

@ -117,23 +117,6 @@ lower_rt_intrinsics_impl(nir_function_impl *impl,
nir_instr_remove(instr);
break;
case nir_intrinsic_load_push_data_intel:
/* We don't want to lower this in the launch trampoline.
*
* Also if the driver chooses to use an inline push address, we
* can do all the loading of the push constant in
* assign_curb_setup() (more efficient as we can do NoMask
* instructions for address calculations).
*/
if (stage == MESA_SHADER_COMPUTE || key->uses_inline_push_addr)
break;
sysval = brw_nir_load_global_const(b, intrin,
nir_load_btd_global_arg_addr_intel(b),
BRW_RT_PUSH_CONST_OFFSET);
break;
case nir_intrinsic_load_ray_launch_id:
sysval = nir_channels(b, hotzone, 0xe);
break;

View file

@ -365,124 +365,8 @@ brw_shader::assign_curb_setup()
this->push_data_size += prog_data->push_sizes[i];
}
uint64_t used = 0;
const bool pull_constants =
devinfo->verx10 >= 125 &&
(mesa_shader_stage_is_compute(stage) ||
mesa_shader_stage_is_mesh(stage)) &&
this->push_data_size > 0;
if (pull_constants) {
const bool pull_constants_a64 =
(mesa_shader_stage_is_rt(stage) &&
brw_bs_prog_data(prog_data)->uses_inline_push_addr) ||
((mesa_shader_stage_is_compute(stage) ||
mesa_shader_stage_is_mesh(stage)) &&
brw_cs_prog_data(prog_data)->uses_inline_push_addr);
assert(devinfo->has_lsc);
brw_builder ubld = brw_builder(this, 1).exec_all().at_start(cfg->first_block());
brw_reg base_addr;
if (pull_constants_a64) {
/* The address of the push constants is at offset 0 in the inline
* parameter.
*/
base_addr =
mesa_shader_stage_is_rt(stage) ?
retype(bs_payload().inline_parameter, BRW_TYPE_UQ) :
retype(cs_payload().inline_parameter, BRW_TYPE_UQ);
} else {
/* The base offset for our push data is passed in as R0.0[31:6]. We
* have to mask off the bottom 6 bits.
*/
base_addr = ubld.AND(retype(brw_vec1_grf(0, 0), BRW_TYPE_UD),
brw_imm_ud(INTEL_MASK(31, 6)));
}
brw_analysis_dependency_class dirty_bits = BRW_DEPENDENCY_INSTRUCTIONS;
/* On Gfx12-HP we load constants at the start of the program using A32
* stateless messages.
*/
const unsigned n_push_data_regs = reg_unit(devinfo) *
DIV_ROUND_UP(this->push_data_size, reg_unit(devinfo) * REG_SIZE);
for (unsigned i = 0; i < this->push_data_size / REG_SIZE;) {
/* Limit ourselves to LSC HW limit of 8 GRFs (256bytes D32V64). */
unsigned num_regs = MIN2(this->push_data_size / REG_SIZE - i, 8);
assert(num_regs > 0);
num_regs = 1 << util_logbase2(num_regs);
brw_reg addr;
if (i != 0 && brw_lsc_supports_base_offset(devinfo) == false) {
if (pull_constants_a64) {
dirty_bits |= BRW_DEPENDENCY_VARIABLES;
/* We need to do the carry manually as when this pass is run,
* we're not expecting any 64bit ALUs. Unfortunately all the
* 64bit lowering is done in NIR.
*/
addr = ubld.vgrf(BRW_TYPE_UQ);
brw_reg addr_ldw = subscript(addr, BRW_TYPE_UD, 0);
brw_reg addr_udw = subscript(addr, BRW_TYPE_UD, 1);
brw_reg base_addr_ldw = subscript(base_addr, BRW_TYPE_UD, 0);
brw_reg base_addr_udw = subscript(base_addr, BRW_TYPE_UD, 1);
ubld.ADD(addr_ldw, base_addr_ldw, brw_imm_ud(i * REG_SIZE));
ubld.CMP(ubld.null_reg_d(), addr_ldw, base_addr_ldw, BRW_CONDITIONAL_L);
set_predicate(BRW_PREDICATE_NORMAL,
ubld.ADD(addr_udw, base_addr_udw, brw_imm_ud(1)));
set_predicate_inv(BRW_PREDICATE_NORMAL, true,
ubld.MOV(addr_udw, base_addr_udw));
} else {
addr = ubld.ADD(base_addr, brw_imm_ud(i * REG_SIZE));
}
} else {
addr = base_addr;
}
brw_send_inst *send = ubld.SEND();
send->dst = retype(brw_vec8_grf(payload().num_regs + i, 0),
BRW_TYPE_UD);
send->src[SEND_SRC_DESC] = brw_imm_ud(0);
send->src[SEND_SRC_EX_DESC] = brw_lsc_supports_base_offset(devinfo) ?
brw_imm_ud(lsc_flat_ex_desc(devinfo,
i * REG_SIZE)) :
brw_imm_ud(0);
send->src[SEND_SRC_PAYLOAD1] = addr;
send->src[SEND_SRC_PAYLOAD2] = brw_reg();
send->sfid = BRW_SFID_UGM;
uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
LSC_ADDR_SURFTYPE_FLAT,
pull_constants_a64 ?
LSC_ADDR_SIZE_A64 : LSC_ADDR_SIZE_A32,
LSC_DATA_SIZE_D32,
num_regs * 8 /* num_channels */,
true /* transpose */,
LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
send->header_size = 0;
send->mlen = lsc_msg_addr_len(
devinfo, pull_constants_a64 ?
LSC_ADDR_SIZE_A64 : LSC_ADDR_SIZE_A32, 1);
send->size_written =
lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, num_regs * 8) * REG_SIZE;
assert((payload().num_regs + i + send->size_written / REG_SIZE) <=
(payload().num_regs + n_push_data_regs));
send->is_volatile = true;
send->src[SEND_SRC_DESC] =
brw_imm_ud(desc | brw_message_desc(devinfo,
send->mlen,
send->size_written / REG_SIZE,
send->header_size));
i += num_regs;
}
invalidate_analysis(dirty_bits);
}
/* Map the offsets in the UNIFORM file to fixed HW regs. */
uint64_t used = 0;
foreach_block_and_inst(block, brw_inst, inst, cfg) {
for (unsigned int i = 0; i < inst->sources; i++) {
if (inst->src[i].file != UNIFORM)

View file

@ -166,6 +166,7 @@ gather_push_data(nir_shader *nir,
}
struct lower_to_push_data_intel_state {
const struct intel_device_info *devinfo;
struct anv_pipeline_bind_map *bind_map;
const struct anv_pipeline_push_map *push_map;
@ -242,6 +243,9 @@ lower_ubo_to_push_data_intel(nir_builder *b,
if (push_range == NULL)
return lower_internal_ubo(b, intrin);
assert(!brw_shader_stage_is_bindless(b->shader->info.stage));
assert(!brw_shader_stage_has_inline_data(state->devinfo, b->shader->info.stage));
b->cursor = nir_before_instr(&intrin->instr);
nir_def *data = nir_load_push_data_intel(
b,
@ -257,6 +261,50 @@ lower_ubo_to_push_data_intel(nir_builder *b,
return true;
}
static nir_def *
load_push_data_from_ptr(nir_builder *b,
                        int base,
                        unsigned range,
                        unsigned num_components,
                        unsigned bit_size,
                        nir_src offset)
{
   /* Load push data through a base pointer rather than from push constant
    * registers: bindless (RT) stages read it relative to the BTD global
    * argument pointer, other stages relative to the inline data parameter.
    *
    * NOTE(review): `range` is currently unused here — presumably kept for
    * signature symmetry with the push-data lowering callers; confirm.
    *
    * If the offset is constant, put the load at the beginning of the shader
    * much like this was previously done in the backend. This gives the
    * vectorizer the opportunity to pack together constant loading.
    */
   if (nir_src_is_const(offset)) {
      nir_block *block = nir_cursor_current_block(b->cursor);
      nir_function_impl *impl = nir_cf_node_get_function(&block->cf_node);
      b->cursor = nir_before_impl(impl);
   }

   nir_def *base_addr =
      brw_shader_stage_is_bindless(b->shader->info.stage) ?
      nir_load_btd_global_arg_addr_intel(b) :
      nir_load_inline_data_intel(b, 1, 64, nir_imm_int(b, 0), .base = 0);

   /* For bindless stages the push constants live at a fixed offset from the
    * global argument pointer.
    */
   if (brw_shader_stage_is_bindless(b->shader->info.stage))
      base += BRW_RT_PUSH_CONST_OFFSET;

   if (nir_src_is_const(offset)) {
      /* Align everything to dwords to allow better vectorization. */
      int final_offset = base + nir_src_as_int(offset);
      /* Bits between the rounded-down load address and the real offset.
       * Including them in the load size guarantees the load covers the whole
       * [final_offset, final_offset + num_components * bit_size / 8) range;
       * with only DIV_ROUND_UP(num_components * bit_size, 32) dwords the
       * extract below could reach one dword past the loaded data whenever
       * final_offset % 4 != 0. No change when final_offset is dword-aligned.
       */
      const unsigned pad_bits = (final_offset % 4) * 8;
      nir_def *data =
         nir_load_global_constant(
            b, DIV_ROUND_UP(pad_bits + num_components * bit_size, 32), 32,
            nir_iadd_imm(b, base_addr, ROUND_DOWN_TO(final_offset, 4)));
      return nir_extract_bits(b, &data, 1,
                              pad_bits, num_components, bit_size);
   } else {
      /* Dynamic offset: emit the load in place at base_addr + base + offset. */
      return nir_load_global_constant(
         b, num_components, bit_size,
         nir_iadd(b,
                  nir_iadd(b, base_addr, nir_i2i64(b, offset.ssa)),
                  nir_imm_int64(b, base)));
   }
}
static bool
lower_to_inline_data_intel(nir_builder *b,
nir_intrinsic_instr *intrin,
@ -304,6 +352,8 @@ lower_to_push_data_intel(nir_builder *b,
switch (intrin->intrinsic) {
case nir_intrinsic_load_push_data_intel: {
b->cursor = nir_before_instr(&intrin->instr);
const unsigned base = nir_intrinsic_base(intrin);
if (_mesa_set_search(state->lowered_ubo_instrs, intrin)) {
/* For lowered UBOs to push constants, shrink the base by the amount
@ -328,6 +378,18 @@ lower_to_push_data_intel(nir_builder *b,
state->bind_map->binding_mask |= ANV_PIPELINE_BIND_MASK_UNALIGNED_INV_X;
}
nir_intrinsic_set_base(intrin, base - base_offset);
if (brw_shader_stage_is_bindless(b->shader->info.stage) ||
brw_shader_stage_has_inline_data(state->devinfo, b->shader->info.stage)) {
nir_def *data = load_push_data_from_ptr(
b,
nir_intrinsic_base(intrin),
nir_intrinsic_range(intrin),
intrin->def.num_components,
intrin->def.bit_size,
intrin->src[0]);
nir_def_replace(&intrin->def, data);
}
return true;
}
@ -336,13 +398,26 @@ lower_to_push_data_intel(nir_builder *b,
return true;
b->cursor = nir_before_instr(&intrin->instr);
nir_def *data = nir_load_push_data_intel(
b,
intrin->def.num_components,
intrin->def.bit_size,
intrin->src[0].ssa,
.base = nir_intrinsic_base(intrin) - base_offset,
.range = nir_intrinsic_range(intrin));
nir_def *data;
if (brw_shader_stage_is_bindless(b->shader->info.stage) ||
brw_shader_stage_has_inline_data(state->devinfo, b->shader->info.stage)) {
b->cursor = nir_before_instr(&intrin->instr);
data = load_push_data_from_ptr(
b,
nir_intrinsic_base(intrin),
nir_intrinsic_range(intrin),
intrin->def.num_components,
intrin->def.bit_size,
intrin->src[0]);
} else {
data = nir_load_push_data_intel(
b,
intrin->def.num_components,
intrin->def.bit_size,
intrin->src[0].ssa,
.base = nir_intrinsic_base(intrin) - base_offset,
.range = nir_intrinsic_range(intrin));
}
nir_def_replace(&intrin->def, data);
return true;
}
@ -561,6 +636,7 @@ anv_nir_compute_push_layout(nir_shader *nir,
assert(n_push_ranges <= 4);
struct lower_to_push_data_intel_state lower_state = {
.devinfo = devinfo,
.bind_map = map,
.push_map = push_map,
.lowered_ubo_instrs = _mesa_pointer_set_create(NULL),