anv/brw: prep work for SIMD32 ray queries

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: mesa-stable
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
(cherry picked from commit 6d19b898e7)

Conflicts:
	src/intel/compiler/brw/brw_lower_logical_sends.cpp

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39462>
Lionel Landwerlin 2026-01-06 16:57:46 +02:00 committed by Dylan Baker
parent 4bdf4f3e89
commit 5f119bb7af
6 changed files with 53 additions and 22 deletions


@@ -4144,7 +4144,7 @@
         "description": "anv/brw: prep work for SIMD32 ray queries",
         "nominated": true,
         "nomination_type": 1,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": null,
         "notes": null


@@ -2520,6 +2520,8 @@ system_value("leaf_procedural_intel", 1, bit_sizes=[1])
 # 2: Miss
 # 3: Intersection
 system_value("btd_shader_type_intel", 1)
+# 64-bit pointer to a pair of RT_DISPATCH_GLOBALS structures, each aligned to
+# 64B; the pointer itself must be 256B-aligned.
 system_value("ray_query_global_intel", 1, bit_sizes=[64])
 
 # Source 0: Accumulator matrix (type specified by DEST_TYPE)
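
For reference, the layout this new comment describes: one 256B-aligned base address covers two RT_DISPATCH_GLOBALS copies, each padded out to a 64B boundary, so the second copy sits at a fixed 128B offset. A minimal C sketch of that math (the macro and helper names below are illustrative, not part of the patch):

#include <stdint.h>

/* 80 bytes per RT_DISPATCH_GLOBALS (see gen_rt.xml), padded to the next
 * 64B boundary -> 128B stride between the two copies. */
#define RTDG_SIZE   80u
#define RTDG_STRIDE ((RTDG_SIZE + 63u) & ~63u)   /* == 128 */

/* Address of copy 'half' (0 or 1) given the 256B-aligned base pointer that
 * ray_query_global_intel provides. Hypothetical helper, for illustration. */
static inline uint64_t
rtdg_addr(uint64_t base, unsigned half)
{
   return base + half * RTDG_STRIDE;
}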


@@ -28,6 +28,7 @@
 #include "brw_eu.h"
 #include "brw_shader.h"
 #include "brw_builder.h"
+#include "brw_rt.h"
 #include "util/bitpack_helpers.h"
@@ -2353,11 +2354,16 @@ lower_trace_ray_logical_send(const brw_builder &bld, brw_inst *inst)
    brw_reg header = ubld.vgrf(BRW_TYPE_UD);
    ubld.MOV(header, brw_imm_ud(0));
 
+   const uint32_t second_group_offset =
+      align(BRW_RT_DISPATCH_GLOBALS_SIZE, 64);
+
    const brw_reg globals_addr = inst->src[RT_LOGICAL_SRC_GLOBALS];
    if (globals_addr.file != UNIFORM) {
       brw_reg addr_ud = retype(globals_addr, BRW_TYPE_UD);
       addr_ud.stride = 1;
       ubld.group(2, 0).MOV(header, addr_ud);
+      if (inst->group == 16)
+         ubld.group(1, 0).ADD(header, header, brw_imm_ud(second_group_offset));
    } else {
       /* If the globals address comes from a uniform, do not do the SIMD2
        * optimization. This occurs in many Vulkan CTS tests.
@@ -2367,8 +2373,14 @@ lower_trace_ray_logical_send(const brw_builder &bld, brw_inst *inst)
        * UNIFORM will be uniform (i.e., <0,1,0>). The clever SIMD2
        * optimization violates that assumption.
        */
-      ubld.group(1, 0).MOV(byte_offset(header, 0),
-                           subscript(globals_addr, BRW_TYPE_UD, 0));
+      if (inst->group == 16) {
+         ubld.group(1, 0).ADD(byte_offset(header, 0),
+                              subscript(globals_addr, BRW_TYPE_UD, 0),
+                              brw_imm_ud(second_group_offset));
+      } else {
+         ubld.group(1, 0).MOV(byte_offset(header, 0),
+                              subscript(globals_addr, BRW_TYPE_UD, 0));
+      }
       ubld.group(1, 0).MOV(byte_offset(header, 4),
                            subscript(globals_addr, BRW_TYPE_UD, 1));
    }
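
In scalar terms, the lowering above now builds the trace-ray message header like this: the 64-bit globals address is split into two dwords, and for the second SIMD16 half of a SIMD32 dispatch (inst->group == 16) the low dword is bumped by align(BRW_RT_DISPATCH_GLOBALS_SIZE, 64) = 128 so that half uses the second RT_DISPATCH_GLOBALS copy. A rough C sketch of that header math (purely illustrative, not compiler code):

#include <stdint.h>

static inline void
build_rt_header_lo_hi(uint32_t header[2], uint64_t globals_addr, unsigned group)
{
   const uint32_t second_group_offset = (80u + 63u) & ~63u;   /* 128 */

   header[0] = (uint32_t)globals_addr;          /* low dword  */
   header[1] = (uint32_t)(globals_addr >> 32);  /* high dword */

   /* Second SIMD16 half of a SIMD32 dispatch: point at the second copy.
    * Because the base is 256B-aligned, this add can never carry into the
    * high dword, so 32-bit math is enough. */
   if (group == 16)
      header[0] += second_group_offset;
}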


@@ -38,6 +38,13 @@ extern "C" {
 /** RT_DISPATCH_GLOBALS size (see gen_rt.xml) */
 #define BRW_RT_DISPATCH_GLOBALS_SIZE 80
 
+/** RT_DISPATCH_GLOBALS alignment
+ *
+ * Use 256B to make sure we can access the pair of RT_DISPATCH_GLOBALS without
+ * 64bit math.
+ */
+#define BRW_RT_DISPATCH_GLOBALS_ALIGN 256
+
 /** Offset after the RT dispatch globals at which "push" constants live */
 #define BRW_RT_PUSH_CONST_OFFSET 128
 
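
The "256B to avoid 64-bit math" argument in the comment above works out as follows: a 256B-aligned base has its low 8 bits clear, so adding the 128B offset of the second copy can never overflow the low dword, and the high dword never needs touching. A small check of that arithmetic (assuming the only offset ever added is align(80, 64) = 128 < 256):

#include <assert.h>
#include <stdint.h>

static void
check_no_carry(uint64_t base)
{
   const uint32_t align_b = 256;
   const uint32_t offset  = 128;   /* align(BRW_RT_DISPATCH_GLOBALS_SIZE, 64) */

   assert((base & (align_b - 1)) == 0);          /* base is 256B-aligned      */
   uint32_t lo = (uint32_t)base;                 /* low 8 bits are zero       */
   assert((uint64_t)lo + offset <= UINT32_MAX);  /* no carry out of low dword */
   assert((base + offset) >> 32 == base >> 32);  /* high dword unchanged      */
}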


@@ -4161,7 +4161,11 @@ struct anv_push_constants {
     */
    uint32_t surfaces_base_offset;
 
-   /** Ray query globals (RT_DISPATCH_GLOBALS) */
+   /** Ray query globals
+    *
+    * Pointer to a couple of RT_DISPATCH_GLOBALS structures (see
+    * genX(cmd_buffer_ray_query_globals))
+    */
    uint64_t ray_query_globals;
 
    union {


@@ -817,7 +817,8 @@ genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
    struct anv_state state =
       anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
-                                           BRW_RT_DISPATCH_GLOBALS_SIZE, 64);
+                                           2 * align(BRW_RT_DISPATCH_GLOBALS_SIZE, 64),
+                                           BRW_RT_DISPATCH_GLOBALS_ALIGN);
 
    struct brw_rt_scratch_layout layout;
    uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in
                                        * some cases?
@@ -827,23 +828,28 @@ genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
 
    uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
 
-   const struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
-      .MemBaseAddress = (struct anv_address) {
-         /* The ray query HW computes offsets from the top of the buffer, so
-          * let the address at the end of the buffer.
-          */
-         .bo = device->ray_query_bo[idx],
-         .offset = device->ray_query_bo[idx]->size
-      },
-      .AsyncRTStackSize = layout.ray_stack_stride / 64,
-      .NumDSSRTStacks = layout.stack_ids_per_dss,
-      .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
-      .Flags = RT_DEPTH_TEST_LESS_EQUAL,
-      .ResumeShaderTable = (struct anv_address) {
-         .bo = cmd_buffer->state.ray_query_shadow_bo,
-      },
-   };
-   GENX(RT_DISPATCH_GLOBALS_pack)(NULL, state.map, &rtdg);
+   for (uint32_t i = 0; i < 2; i++) {
+      const struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
+         .MemBaseAddress = (struct anv_address) {
+            /* The ray query HW computes offsets from the top of the buffer, so
+             * let the address at the end of the buffer.
+             */
+            .bo = device->ray_query_bo[idx],
+            .offset = (i + 1) * (device->ray_query_bo[idx]->size / 2),
+         },
+         .AsyncRTStackSize = layout.ray_stack_stride / 64,
+         .NumDSSRTStacks = layout.stack_ids_per_dss,
+         .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
+         .Flags = RT_DEPTH_TEST_LESS_EQUAL,
+         .ResumeShaderTable = (struct anv_address) {
+            .bo = cmd_buffer->state.ray_query_shadow_bo,
+         },
+      };
+      GENX(RT_DISPATCH_GLOBALS_pack)(
+         NULL,
+         state.map + i * align(4 * GENX(RT_DISPATCH_GLOBALS_length), 64),
+         &rtdg);
+   }
 
    return anv_cmd_buffer_temporary_state_address(cmd_buffer, state);
 #else
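
To spell out the new offset math: the temporary state now holds two packed RT_DISPATCH_GLOBALS, copy i at byte offset i * align(4 * RT_DISPATCH_GLOBALS_length, 64), i.e. 0 and 128, inside a 256B allocation aligned to BRW_RT_DISPATCH_GLOBALS_ALIGN. Each copy's MemBaseAddress points at the end of its half of the ray query BO, since the ray query HW computes offsets from the top of the buffer. A rough sketch of those two computations (assumption: RT_DISPATCH_GLOBALS is 80 bytes, i.e. 20 dwords; names below are illustrative):

#include <stdint.h>

#define RTDG_LENGTH_DW 20u                 /* assumed: 80 bytes / 4 */

/* CPU-side offset of copy i inside the temporary state allocation. */
static inline uint32_t
rtdg_pack_offset(uint32_t i)               /* i = 0 or 1 */
{
   uint32_t bytes  = 4u * RTDG_LENGTH_DW;  /* 80  */
   uint32_t stride = (bytes + 63u) & ~63u; /* 128 */
   return i * stride;                      /* 0 or 128 */
}

/* MemBaseAddress offset for copy i: the end of its half of the ray query BO,
 * because the HW works downward from the top of the buffer. */
static inline uint64_t
rtdg_mem_base_offset(uint64_t ray_query_bo_size, uint32_t i)
{
   return (i + 1) * (ray_query_bo_size / 2);
}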