brw: enable A64 pulling of push constants

This will be useful for pulling constants in device bound shaders. A64
allows us to put the constants anywhere.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32895>
This commit is contained in:
Lionel Landwerlin 2024-04-24 16:14:16 +03:00 committed by Marge Bot
parent 0808125914
commit 5c17299084
10 changed files with 111 additions and 29 deletions

View file

@@ -148,7 +148,8 @@ brw_compile_cs(const struct brw_compiler *compiler,
prog_data->base.total_shared = nir->info.shared_size;
prog_data->base.ray_queries = nir->info.ray_queries;
prog_data->base.total_scratch = 0;
prog_data->uses_inline_data = brw_nir_uses_inline_data(nir);
prog_data->uses_inline_data = brw_nir_uses_inline_data(nir) ||
key->base.uses_inline_push_addr;
assert(compiler->devinfo->verx10 >= 125 || !prog_data->uses_inline_data);
if (!nir->info.workgroup_size_variable) {

View file

@@ -386,7 +386,8 @@ brw_compile_task(const struct brw_compiler *compiler,
prog_data->uses_drawid =
BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
prog_data->base.uses_inline_data = brw_nir_uses_inline_data(nir);
prog_data->base.uses_inline_data = brw_nir_uses_inline_data(nir) ||
key->base.uses_inline_push_addr;
brw_simd_selection_state simd_state{
.devinfo = compiler->devinfo,
@@ -1696,7 +1697,8 @@ brw_compile_mesh(const struct brw_compiler *compiler,
prog_data->autostrip_enable = brw_mesh_autostrip_enable(compiler, nir, &prog_data->map);
prog_data->base.uses_inline_data = brw_nir_uses_inline_data(nir);
prog_data->base.uses_inline_data = brw_nir_uses_inline_data(nir) ||
key->base.uses_inline_push_addr;
brw_simd_selection_state simd_state{
.devinfo = compiler->devinfo,

View file

@@ -208,7 +208,9 @@ struct brw_base_prog_key {
enum brw_robustness_flags robust_flags:2;
unsigned padding:22;
bool uses_inline_push_addr:1;
unsigned padding:21;
/**
* Apply workarounds for SIN and COS input range problems.
@@ -891,6 +893,10 @@ struct brw_cs_prog_data {
bool uses_barrier;
bool uses_num_work_groups;
bool uses_inline_data;
/** Whether inline push data is used to provide a 64bit pointer to push
* constants
*/
bool uses_inline_push_addr;
bool uses_btd_stack_ids;
bool uses_systolic;
uint8_t generate_local_id;
@@ -920,6 +926,11 @@ brw_cs_prog_data_prog_offset(const struct brw_cs_prog_data *prog_data,
struct brw_bs_prog_data {
struct brw_stage_prog_data base;
/** Whether inline push data is used to provide a 64bit pointer to push
* constants
*/
bool uses_inline_push_addr;
/** SIMD size of the root shader */
uint8_t simd_size;

View file

@@ -34,6 +34,7 @@
#include "brw_fs_live_variables.h"
#include "brw_nir.h"
#include "brw_cfg.h"
#include "brw_rt.h"
#include "brw_private.h"
#include "intel_nir.h"
#include "shader_enums.h"
@@ -205,20 +206,39 @@ fs_visitor::assign_curb_setup()
prog_data->curb_read_length = MAX2(1, prog_data->curb_read_length);
uint64_t used = 0;
const bool shader_pulls_constants = devinfo->verx10 >= 125 &&
(gl_shader_stage_is_compute(stage) || gl_shader_stage_is_mesh(stage));
const bool pull_constants =
devinfo->verx10 >= 125 &&
(gl_shader_stage_is_compute(stage) ||
gl_shader_stage_is_mesh(stage)) &&
uniform_push_length;
if (shader_pulls_constants && uniform_push_length > 0) {
if (pull_constants) {
const bool pull_constants_a64 =
(gl_shader_stage_is_rt(stage) &&
brw_bs_prog_data(prog_data)->uses_inline_push_addr) ||
((gl_shader_stage_is_compute(stage) ||
gl_shader_stage_is_mesh(stage)) &&
brw_cs_prog_data(prog_data)->uses_inline_push_addr);
assert(devinfo->has_lsc);
brw_builder ubld = brw_builder(this, 1).exec_all().at(
cfg->first_block(), cfg->first_block()->start());
/* The base offset for our push data is passed in as R0.0[31:6]. We have
* to mask off the bottom 6 bits.
*/
brw_reg base_addr =
ubld.AND(retype(brw_vec1_grf(0, 0), BRW_TYPE_UD),
brw_imm_ud(INTEL_MASK(31, 6)));
brw_reg base_addr;
if (pull_constants_a64) {
/* The address of the push constants is at offset 0 in the inline
* parameter.
*/
base_addr =
gl_shader_stage_is_rt(stage) ?
retype(bs_payload().inline_parameter, BRW_TYPE_UQ) :
retype(cs_payload().inline_parameter, BRW_TYPE_UQ);
} else {
/* The base offset for our push data is passed in as R0.0[31:6]. We
* have to mask off the bottom 6 bits.
*/
base_addr = ubld.AND(retype(brw_vec1_grf(0, 0), BRW_TYPE_UD),
brw_imm_ud(INTEL_MASK(31, 6)));
}
/* On Gfx12-HP we load constants at the start of the program using A32
* stateless messages.
@@ -229,11 +249,31 @@ fs_visitor::assign_curb_setup()
assert(num_regs > 0);
num_regs = 1 << util_logbase2(num_regs);
/* This pass occurs after all of the optimization passes, so don't
* emit an 'ADD addr, base_addr, 0' instruction.
*/
brw_reg addr = i == 0 ? base_addr :
ubld.ADD(base_addr, brw_imm_ud(i * REG_SIZE));
brw_reg addr;
if (i != 0) {
if (pull_constants_a64) {
/* We need to do the carry manually as when this pass is run,
* we're not expecting any 64bit ALUs. Unfortunately all the
* 64bit lowering is done in NIR.
*/
addr = ubld.vgrf(BRW_TYPE_UQ);
brw_reg addr_ldw = subscript(addr, BRW_TYPE_UD, 0);
brw_reg addr_udw = subscript(addr, BRW_TYPE_UD, 1);
brw_reg base_addr_ldw = subscript(base_addr, BRW_TYPE_UD, 0);
brw_reg base_addr_udw = subscript(base_addr, BRW_TYPE_UD, 1);
ubld.ADD(addr_ldw, base_addr_ldw, brw_imm_ud(i * REG_SIZE));
ubld.CMP(ubld.null_reg_d(), addr_ldw, base_addr_ldw, BRW_CONDITIONAL_L);
set_predicate(BRW_PREDICATE_NORMAL,
ubld.ADD(addr_udw, base_addr_udw, brw_imm_ud(1)));
set_predicate_inv(BRW_PREDICATE_NORMAL, true,
ubld.MOV(addr_udw, base_addr_udw));
} else {
addr = ubld.ADD(base_addr, brw_imm_ud(i * REG_SIZE));
}
} else {
addr = base_addr;
}
brw_reg srcs[4] = {
brw_imm_ud(0), /* desc */
@@ -249,15 +289,20 @@ fs_visitor::assign_curb_setup()
send->sfid = GFX12_SFID_UGM;
uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
LSC_ADDR_SURFTYPE_FLAT,
LSC_ADDR_SIZE_A32,
pull_constants_a64 ?
LSC_ADDR_SIZE_A64 : LSC_ADDR_SIZE_A32,
LSC_DATA_SIZE_D32,
num_regs * 8 /* num_channels */,
true /* transpose */,
LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
send->header_size = 0;
send->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, 1);
send->mlen = lsc_msg_addr_len(
devinfo, pull_constants_a64 ?
LSC_ADDR_SIZE_A64 : LSC_ADDR_SIZE_A32, 1);
send->size_written =
lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, num_regs * 8) * REG_SIZE;
assert((payload().num_regs + i + send->size_written / REG_SIZE) <=
(payload().num_regs + prog_data->curb_read_length));
send->send_is_volatile = true;
send->src[0] = brw_imm_ud(desc |

View file

@@ -227,6 +227,8 @@ struct task_mesh_thread_payload : public cs_thread_payload {
struct bs_thread_payload : public thread_payload {
bs_thread_payload(const fs_visitor &v);
brw_reg inline_parameter;
brw_reg global_arg_ptr;
brw_reg local_arg_ptr;

View file

@@ -361,6 +361,8 @@ cs_thread_payload::cs_thread_payload(const fs_visitor &v)
unsigned r = reg_unit(v.devinfo);
prog_data->uses_inline_push_addr = v.key->uses_inline_push_addr;
/* See nir_setup_uniforms for subgroup_id in earlier versions. */
if (v.devinfo->verx10 >= 125) {
subgroup_id_ = brw_ud1_grf(0, 2);
@@ -380,10 +382,14 @@ cs_thread_payload::cs_thread_payload(const fs_visitor &v)
if (prog_data->uses_btd_stack_ids)
r += reg_unit(v.devinfo);
if (v.stage == MESA_SHADER_COMPUTE && prog_data->uses_inline_data) {
if (v.stage == MESA_SHADER_COMPUTE &&
(prog_data->uses_inline_data ||
prog_data->uses_inline_push_addr)) {
inline_parameter = brw_ud1_grf(r, 0);
r += reg_unit(v.devinfo);
}
} else {
assert(!prog_data->uses_inline_push_addr);
}
num_regs = r;
@@ -464,7 +470,7 @@ task_mesh_thread_payload::task_mesh_thread_payload(fs_visitor &v)
r += reg_unit(v.devinfo);
struct brw_cs_prog_data *prog_data = brw_cs_prog_data(v.prog_data);
if (prog_data->uses_inline_data) {
if (prog_data->uses_inline_data || prog_data->uses_inline_push_addr) {
inline_parameter = brw_ud1_grf(r, 0);
r += reg_unit(v.devinfo);
}
@@ -474,6 +480,8 @@ task_mesh_thread_payload::task_mesh_thread_payload(fs_visitor &v)
bs_thread_payload::bs_thread_payload(const fs_visitor &v)
{
struct brw_bs_prog_data *prog_data = brw_bs_prog_data(v.prog_data);
unsigned r = 0;
/* R0: Thread header. */
@@ -483,6 +491,8 @@ bs_thread_payload::bs_thread_payload(const fs_visitor &v)
r += reg_unit(v.devinfo);
/* R2: Inline Parameter. Used for argument addresses. */
prog_data->uses_inline_push_addr = v.key->uses_inline_push_addr;
inline_parameter = brw_ud1_grf(r, 0);
global_arg_ptr = brw_ud1_grf(r, 0);
local_arg_ptr = brw_ud1_grf(r, 2);
r += reg_unit(v.devinfo);

View file

@@ -64,6 +64,7 @@ build_leaf_is_procedural(nir_builder *b, struct brw_nir_rt_mem_hit_defs *hit)
static void
lower_rt_intrinsics_impl(nir_function_impl *impl,
const struct brw_base_prog_key *key,
const struct intel_device_info *devinfo)
{
bool progress = false;
@@ -153,8 +154,14 @@ lower_rt_intrinsics_impl(nir_function_impl *impl,
break;
case nir_intrinsic_load_uniform: {
/* We don't want to lower this in the launch trampoline. */
if (stage == MESA_SHADER_COMPUTE)
/* We don't want to lower this in the launch trampoline.
*
* Also if the driver chooses to use an inline push address, we
* can do all the loading of the push constant in
* assign_curb_setup() (more efficient as we can do NoMask
* instructions for address calculations).
*/
if (stage == MESA_SHADER_COMPUTE || key->uses_inline_push_addr)
break;
sysval = brw_nir_load_global_const(b, intrin,
@@ -409,9 +416,10 @@ lower_rt_intrinsics_impl(nir_function_impl *impl,
*/
void
brw_nir_lower_rt_intrinsics(nir_shader *nir,
const struct brw_base_prog_key *key,
const struct intel_device_info *devinfo)
{
nir_foreach_function_impl(impl, nir) {
lower_rt_intrinsics_impl(impl, devinfo);
lower_rt_intrinsics_impl(impl, key, devinfo);
}
}

View file

@@ -495,7 +495,8 @@ brw_nir_create_raygen_trampoline(const struct brw_compiler *compiler,
struct brw_nir_compiler_opts opts = {};
brw_preprocess_nir(compiler, nir, &opts);
NIR_PASS_V(nir, brw_nir_lower_rt_intrinsics, devinfo);
struct brw_cs_prog_key key = {};
NIR_PASS_V(nir, brw_nir_lower_rt_intrinsics, &key.base, devinfo);
b = nir_builder_create(nir_shader_get_entrypoint(b.shader));
/* brw_nir_lower_rt_intrinsics will leave us with a btd_global_arg_addr

View file

@@ -56,6 +56,7 @@ void brw_nir_lower_shader_returns(nir_shader *shader);
bool brw_nir_lower_shader_calls(nir_shader *shader, struct brw_bs_prog_key *key);
void brw_nir_lower_rt_intrinsics(nir_shader *shader,
const struct brw_base_prog_key *key,
const struct intel_device_info *devinfo);
void brw_nir_lower_intersection_shader(nir_shader *intersection,
const nir_shader *any_hit,

View file

@@ -3309,12 +3309,12 @@ compile_upload_rt_shader(struct anv_ray_tracing_pipeline *pipeline,
NIR_PASS(_, nir, nir_lower_shader_calls, &opts,
&resume_shaders, &num_resume_shaders, mem_ctx);
NIR_PASS(_, nir, brw_nir_lower_shader_calls, &stage->key.bs);
NIR_PASS_V(nir, brw_nir_lower_rt_intrinsics, devinfo);
NIR_PASS_V(nir, brw_nir_lower_rt_intrinsics, &stage->key.base, devinfo);
}
for (unsigned i = 0; i < num_resume_shaders; i++) {
NIR_PASS(_,resume_shaders[i], brw_nir_lower_shader_calls, &stage->key.bs);
NIR_PASS_V(resume_shaders[i], brw_nir_lower_rt_intrinsics, devinfo);
NIR_PASS_V(resume_shaders[i], brw_nir_lower_rt_intrinsics, &stage->key.base, devinfo);
}
struct brw_compile_bs_params params = {
@@ -3853,7 +3853,8 @@ anv_device_init_rt_shaders(struct anv_device *device)
nir_shader *trivial_return_nir =
brw_nir_create_trivial_return_shader(device->physical->compiler, tmp_ctx);
NIR_PASS_V(trivial_return_nir, brw_nir_lower_rt_intrinsics, device->info);
NIR_PASS_V(trivial_return_nir, brw_nir_lower_rt_intrinsics,
&return_key.key.base, device->info);
struct brw_bs_prog_data return_prog_data = { 0, };
struct brw_compile_bs_params params = {