From 5f119bb7afebcf997a0b286330ed0a8e5bb8a71e Mon Sep 17 00:00:00 2001
From: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Date: Tue, 6 Jan 2026 16:57:46 +0200
Subject: [PATCH] anv/brw: prep work for SIMD32 ray queries

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: mesa-stable
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
(cherry picked from commit 6d19b898e730b6d7abeb6b0c6cf217456d4b680f)

Conflicts:
	src/intel/compiler/brw/brw_lower_logical_sends.cpp

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39462>
---
 .pick_status.json                             |  2 +-
 src/compiler/nir/nir_intrinsics.py            |  2 +
 .../compiler/brw/brw_lower_logical_sends.cpp  | 16 ++++++-
 src/intel/compiler/brw/brw_rt.h               |  7 ++++
 src/intel/vulkan/anv_private.h                |  6 ++-
 src/intel/vulkan/genX_cmd_compute.c           | 42 +++++++++++--------
 6 files changed, 53 insertions(+), 22 deletions(-)

diff --git a/.pick_status.json b/.pick_status.json
index 1b7d98d67cb..fb359e9c3e9 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -4144,7 +4144,7 @@
         "description": "anv/brw: prep work for SIMD32 ray queries",
         "nominated": true,
         "nomination_type": 1,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": null,
         "notes": null
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 6cacbd2abbf..fdfd9d9a7de 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -2520,6 +2520,8 @@ system_value("leaf_procedural_intel", 1, bit_sizes=[1])
 #  2: Miss
 #  3: Intersection
 system_value("btd_shader_type_intel", 1)
+# 64bit pointer to a pair of RT_DISPATCH_GLOBALS structures, each aligned to
+# 64B; the pointer itself needs to be 256B aligned.
 system_value("ray_query_global_intel", 1, bit_sizes=[64])
 
 # Source 0: Accumulator matrix (type specified by DEST_TYPE)
diff --git a/src/intel/compiler/brw/brw_lower_logical_sends.cpp b/src/intel/compiler/brw/brw_lower_logical_sends.cpp
index 567ab45431a..74e228a5e45 100644
--- a/src/intel/compiler/brw/brw_lower_logical_sends.cpp
+++ b/src/intel/compiler/brw/brw_lower_logical_sends.cpp
@@ -28,6 +28,7 @@
 #include "brw_eu.h"
 #include "brw_shader.h"
 #include "brw_builder.h"
+#include "brw_rt.h"
 
 #include "util/bitpack_helpers.h"
 
@@ -2353,11 +2354,16 @@ lower_trace_ray_logical_send(const brw_builder &bld, brw_inst *inst)
    brw_reg header = ubld.vgrf(BRW_TYPE_UD);
    ubld.MOV(header, brw_imm_ud(0));
 
+   const uint32_t second_group_offset =
+      align(BRW_RT_DISPATCH_GLOBALS_SIZE, 64);
+
    const brw_reg globals_addr = inst->src[RT_LOGICAL_SRC_GLOBALS];
    if (globals_addr.file != UNIFORM) {
       brw_reg addr_ud = retype(globals_addr, BRW_TYPE_UD);
       addr_ud.stride = 1;
       ubld.group(2, 0).MOV(header, addr_ud);
+      if (inst->group == 16)
+         ubld.group(1, 0).ADD(header, header, brw_imm_ud(second_group_offset));
    } else {
       /* If the globals address comes from a uniform, do not do the SIMD2
        * optimization. This occurs in many Vulkan CTS tests.
@@ -2367,8 +2373,14 @@ lower_trace_ray_logical_send(const brw_builder &bld, brw_inst *inst)
        * UNIFORM will be uniform (i.e., <0,1,0>). The clever SIMD2
        * optimization violates that assumption.
        */
-      ubld.group(1, 0).MOV(byte_offset(header, 0),
-                           subscript(globals_addr, BRW_TYPE_UD, 0));
+      if (inst->group == 16) {
+         ubld.group(1, 0).ADD(byte_offset(header, 0),
+                              subscript(globals_addr, BRW_TYPE_UD, 0),
+                              brw_imm_ud(second_group_offset));
+      } else {
+         ubld.group(1, 0).MOV(byte_offset(header, 0),
+                              subscript(globals_addr, BRW_TYPE_UD, 0));
+      }
       ubld.group(1, 0).MOV(byte_offset(header, 4),
                            subscript(globals_addr, BRW_TYPE_UD, 1));
    }
diff --git a/src/intel/compiler/brw/brw_rt.h b/src/intel/compiler/brw/brw_rt.h
index f71273efb53..f74f6179039 100644
--- a/src/intel/compiler/brw/brw_rt.h
+++ b/src/intel/compiler/brw/brw_rt.h
@@ -38,6 +38,13 @@ extern "C" {
 /** RT_DISPATCH_GLOBALS size (see gen_rt.xml) */
 #define BRW_RT_DISPATCH_GLOBALS_SIZE 80
 
+/** RT_DISPATCH_GLOBALS alignment
+ *
+ * Use 256B to make sure we can access the pair of RT_DISPATCH_GLOBALS without
+ * 64bit math.
+ */
+#define BRW_RT_DISPATCH_GLOBALS_ALIGN 256
+
 /** Offset after the RT dispatch globals at which "push" constants live */
 #define BRW_RT_PUSH_CONST_OFFSET 128
 
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 20fed27e1ac..8d7d3d6d80d 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -4161,7 +4161,11 @@ struct anv_push_constants {
     */
    uint32_t surfaces_base_offset;
 
-   /** Ray query globals (RT_DISPATCH_GLOBALS) */
+   /** Ray query globals
+    *
+    * Pointer to a pair of RT_DISPATCH_GLOBALS structures (see
+    * genX(cmd_buffer_ray_query_globals))
+    */
    uint64_t ray_query_globals;
 
    union {
diff --git a/src/intel/vulkan/genX_cmd_compute.c b/src/intel/vulkan/genX_cmd_compute.c
index 03491fdb322..1019b1eb0dd 100644
--- a/src/intel/vulkan/genX_cmd_compute.c
+++ b/src/intel/vulkan/genX_cmd_compute.c
@@ -817,7 +817,8 @@ genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
 
    struct anv_state state =
       anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
-                                           BRW_RT_DISPATCH_GLOBALS_SIZE, 64);
+                                           2 * align(BRW_RT_DISPATCH_GLOBALS_SIZE, 64),
+                                           BRW_RT_DISPATCH_GLOBALS_ALIGN);
    struct brw_rt_scratch_layout layout;
    uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in
                                        * some cases?
@@ -827,23 +828,28 @@ genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
 
    uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
 
-   const struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
-      .MemBaseAddress = (struct anv_address) {
-         /* The ray query HW computes offsets from the top of the buffer, so
-          * let the address at the end of the buffer.
-          */
-         .bo = device->ray_query_bo[idx],
-         .offset = device->ray_query_bo[idx]->size
-      },
-      .AsyncRTStackSize = layout.ray_stack_stride / 64,
-      .NumDSSRTStacks = layout.stack_ids_per_dss,
-      .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
-      .Flags = RT_DEPTH_TEST_LESS_EQUAL,
-      .ResumeShaderTable = (struct anv_address) {
-         .bo = cmd_buffer->state.ray_query_shadow_bo,
-      },
-   };
-   GENX(RT_DISPATCH_GLOBALS_pack)(NULL, state.map, &rtdg);
+   for (uint32_t i = 0; i < 2; i++) {
+      const struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
+         .MemBaseAddress = (struct anv_address) {
+            /* The ray query HW computes offsets from the top of the buffer, so
+             * point each address at the end of its half of the buffer.
+             */
+            .bo = device->ray_query_bo[idx],
+            .offset = (i + 1) * (device->ray_query_bo[idx]->size / 2),
+         },
+         .AsyncRTStackSize = layout.ray_stack_stride / 64,
+         .NumDSSRTStacks = layout.stack_ids_per_dss,
+         .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
+         .Flags = RT_DEPTH_TEST_LESS_EQUAL,
+         .ResumeShaderTable = (struct anv_address) {
+            .bo = cmd_buffer->state.ray_query_shadow_bo,
+         },
+      };
+      GENX(RT_DISPATCH_GLOBALS_pack)(
+         NULL,
+         state.map + i * align(4 * GENX(RT_DISPATCH_GLOBALS_length), 64),
+         &rtdg);
+   }
 
    return anv_cmd_buffer_temporary_state_address(cmd_buffer, state);
 #else