anv,brw: Allow multiple ray queries without spilling to a shadow stack

Allows a shader to use multiple ray queries without spilling them to a shadow
stack. Instead, the driver provides the shader with an array of
RTDispatchGlobals structs (a pair per query) so that each query gets its own
dedicated stack.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38778>
Calder Young, 2025-12-02 20:26:49 -08:00 (committed by Marge Bot)
parent 0291aa3e71
commit 1f1de7ebd6
10 changed files with 182 additions and 281 deletions
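
A rough sketch of the addressing scheme this commit sets up (illustrative helpers, not code from the commit): the ray_query_globals push constant points at an array of RT_DISPATCH_GLOBALS pairs counting upward, while each query's stack area is found counting downward in units of the per-query stack stride. BRW_RT_DISPATCH_GLOBALS_ALIGN and brw_rt_ray_queries_stacks_stride() are the real constant/helper touched further down; the struct and function names here are hypothetical.

#include <stdint.h>

/* Hypothetical layout description, for illustration only. */
struct rq_layout {
   uint64_t ray_query_globals; /* push constant: base of the globals array */
   uint32_t globals_align;     /* BRW_RT_DISPATCH_GLOBALS_ALIGN */
   uint32_t stacks_stride;     /* brw_rt_ray_queries_stacks_stride(devinfo) */
};

/* RT_DISPATCH_GLOBALS pair for ray query `rq`, counting up from the base. */
static inline uint64_t
rq_globals_addr(const struct rq_layout *l, uint32_t rq)
{
   return l->ray_query_globals + (uint64_t)rq * l->globals_align;
}

/* Dedicated stack area for ray query `rq`, counting down from the base. */
static inline uint64_t
rq_stack_base_addr(const struct rq_layout *l, uint32_t rq)
{
   return l->ray_query_globals - (uint64_t)rq * l->stacks_stride;
}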


@ -38,8 +38,10 @@ struct lowering_state {
struct hash_table *queries;
uint32_t n_queries;
struct brw_nir_rt_globals_defs globals;
nir_def *rq_globals;
uint32_t num_dss_rt_stacks;
uint32_t sync_stacks_stride;
};
struct brw_ray_query {
@ -50,12 +52,6 @@ struct brw_ray_query {
#define SIZEOF_QUERY_STATE (sizeof(uint32_t))
static bool
need_spill_fill(struct lowering_state *state)
{
return state->n_queries > 1;
}
/**
* This pass converts opaque RayQuery structures from SPIRV into a vec3 where
* the first 2 elements store a global address for the query and the third
@ -98,10 +94,8 @@ create_internal_var(struct brw_ray_query *rq, struct lowering_state *state)
NULL);
}
static nir_def *
get_ray_query_shadow_addr(nir_builder *b,
get_ray_query_stack_index(nir_builder *b,
nir_deref_instr *deref,
struct lowering_state *state,
nir_deref_instr **out_state_deref)
@ -116,35 +110,17 @@ get_ray_query_shadow_addr(nir_builder *b,
struct brw_ray_query *rq = entry->data;
/* Base address in the shadow memory of the variable associated with this
* ray query variable.
*/
nir_def *base_addr =
nir_iadd_imm(b, state->globals.resume_sbt_addr,
brw_rt_ray_queries_shadow_stack_size(state->devinfo) * rq->id);
bool spill_fill = need_spill_fill(state);
nir_def *query_idx = nir_imm_int(b, rq->id);
*out_state_deref = nir_build_deref_var(b, rq->internal_var);
if (!spill_fill)
return NULL;
/* Just emit code and let constant-folding go to town */
nir_deref_instr **p = &path.path[1];
for (; *p; p++) {
if ((*p)->deref_type == nir_deref_type_array) {
nir_def *index = (*p)->arr.index.ssa;
/**/
*out_state_deref = nir_build_deref_array(b, *out_state_deref, index);
/**/
uint64_t size = MAX2(1, glsl_get_aoa_size((*p)->type)) *
brw_rt_ray_queries_shadow_stack_size(state->devinfo);
nir_def *mul = nir_amul_imm(b, nir_i2i64(b, index), size);
base_addr = nir_iadd(b, base_addr, mul);
index = nir_amul_imm(b, index, MAX2(1, glsl_get_aoa_size((*p)->type)));
query_idx = nir_iadd(b, query_idx, index);
} else {
UNREACHABLE("Unsupported deref type");
}
@ -152,28 +128,7 @@ get_ray_query_shadow_addr(nir_builder *b,
nir_deref_path_finish(&path);
/* Add the lane offset to the shadow memory address */
nir_def *lane_offset =
nir_imul_imm(
b,
nir_iadd(
b,
nir_imul(
b,
brw_load_btd_dss_id(b),
state->globals.num_dss_rt_stacks),
brw_nir_rt_sync_stack_id(b)),
BRW_RT_SIZEOF_SHADOW_RAY_QUERY);
/* Top/bottom 16 lanes each get their own stack area */
lane_offset = nir_bcsel(
b,
nir_ilt_imm(b, nir_load_subgroup_invocation(b), 16),
lane_offset,
nir_iadd_imm(b, lane_offset,
brw_rt_ray_queries_shadow_stack_size(state->devinfo) / 2));
return nir_iadd(b, base_addr, nir_i2i64(b, lane_offset));
return query_idx;
}
static void
@ -209,26 +164,6 @@ update_trace_ctrl_level(nir_builder *b,
}
}
static void
fill_query(nir_builder *b,
nir_def *hw_stack_addr,
nir_def *shadow_stack_addr,
nir_def *ctrl)
{
brw_nir_memcpy_global(b, hw_stack_addr, 64, shadow_stack_addr, 64,
BRW_RT_SIZEOF_RAY_QUERY);
}
static void
spill_query(nir_builder *b,
nir_def *hw_stack_addr,
nir_def *shadow_stack_addr)
{
brw_nir_memcpy_global(b, shadow_stack_addr, 64, hw_stack_addr, 64,
BRW_RT_SIZEOF_RAY_QUERY);
}
static void
lower_ray_query_intrinsic(nir_builder *b,
nir_intrinsic_instr *intrin,
@ -239,12 +174,20 @@ lower_ray_query_intrinsic(nir_builder *b,
b->cursor = nir_instr_remove(&intrin->instr);
nir_deref_instr *ctrl_level_deref;
nir_def *shadow_stack_addr =
get_ray_query_shadow_addr(b, deref, state, &ctrl_level_deref);
nir_def *hw_stack_addr =
brw_nir_rt_sync_stack_addr(b, state->globals.base_mem_addr,
state->globals.num_dss_rt_stacks);
nir_def *stack_addr = shadow_stack_addr ? shadow_stack_addr : hw_stack_addr;
nir_def *stack_index =
get_ray_query_stack_index(b, deref, state, &ctrl_level_deref);
nir_def *rq_globals_addr =
nir_iadd(b, state->rq_globals,
nir_i2i64(b, nir_amul_imm(b, stack_index,
BRW_RT_DISPATCH_GLOBALS_ALIGN)));
nir_def *stack_base_addr =
nir_isub(b, state->rq_globals,
nir_i2i64(b, nir_amul_imm(b, stack_index,
state->sync_stacks_stride)));
nir_def *stack_addr =
brw_nir_rt_sync_stack_addr(b, stack_base_addr,
state->num_dss_rt_stacks,
state->devinfo);
mesa_shader_stage stage = b->shader->info.stage;
switch (intrin->intrinsic) {
@ -313,22 +256,12 @@ lower_ray_query_intrinsic(nir_builder *b,
*/
brw_nir_rt_query_mark_done(b, stack_addr);
if (shadow_stack_addr)
fill_query(b, hw_stack_addr, shadow_stack_addr, ctrl);
/* Do not use state->rq_globals, we want a uniform value for the
* tracing call.
*/
nir_trace_ray_intel(b, nir_load_ray_query_global_intel(b),
level, ctrl, .synchronous = true);
nir_trace_ray_intel(b, rq_globals_addr, level, ctrl, .synchronous = true);
struct brw_nir_rt_mem_hit_defs hit_in = {};
brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, hw_stack_addr, false,
brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, stack_addr, false,
state->devinfo);
if (shadow_stack_addr)
spill_query(b, hw_stack_addr, shadow_stack_addr);
update_trace_ctrl_level(b, ctrl_level_deref,
NULL, NULL,
nir_imm_int(b, GEN_RT_TRACE_RAY_CONTINUE),
@ -547,21 +480,17 @@ lower_ray_query_impl(nir_function_impl *impl, struct lowering_state *state)
nir_builder _b, *b = &_b;
_b = nir_builder_at(nir_before_impl(impl));
nir_def *rq_globals_base = nir_load_ray_query_global_intel(b);
state->rq_globals = nir_load_ray_query_global_intel(b);
/* Use a different global for each 16lanes groups (only in SIMD32). */
state->rq_globals = nir_bcsel(
b,
nir_iand(b,
nir_ige_imm(b, nir_load_subgroup_invocation(b), 16),
nir_ieq_imm(b, nir_load_subgroup_size(b), 32)),
nir_iadd_imm(
b, rq_globals_base,
align(4 * RT_DISPATCH_GLOBALS_length(state->devinfo), 64)),
rq_globals_base);
/* ATSM PRMs Vol 9, "State Model for Ray Tracing - RTDispatchGlobals"
*
* "For Sync Ray tracing (i.e. using RayQueries), SW must allocate
* space assuming 2K StackIDs"
*/
state->num_dss_rt_stacks = 2048; /* TODO */
brw_nir_rt_load_globals_addr(b, &state->globals, state->rq_globals,
state->devinfo);
state->sync_stacks_stride =
brw_rt_ray_queries_stacks_stride(state->devinfo);
nir_foreach_block_safe(block, impl) {
nir_foreach_instr_safe(instr, block) {


@ -178,7 +178,8 @@ brw_nir_rt_sw_hotzone_addr(nir_builder *b,
static inline nir_def *
brw_nir_rt_sync_stack_addr(nir_builder *b,
nir_def *base_mem_addr,
nir_def *num_dss_rt_stacks)
uint32_t num_dss_rt_stacks,
const struct intel_device_info *devinfo)
{
/* Bspec 47547 (Xe) and 56936 (Xe2+) say:
* For Ray queries (Synchronous Ray Tracing), the formula is similar but
@ -195,12 +196,29 @@ brw_nir_rt_sync_stack_addr(nir_builder *b,
* NUM_SYNC_STACKID_PER_DSS instead.
*/
nir_def *offset32 =
nir_imul(b,
nir_iadd(b,
nir_imul(b, brw_load_btd_dss_id(b),
num_dss_rt_stacks),
nir_iadd_imm(b, brw_nir_rt_sync_stack_id(b), 1)),
nir_imm_int(b, BRW_RT_SIZEOF_RAY_QUERY));
nir_imul_imm(b,
nir_iadd(b,
nir_imul_imm(b, brw_load_btd_dss_id(b),
num_dss_rt_stacks),
nir_iadd_imm(b, brw_nir_rt_sync_stack_id(b), 1)),
BRW_RT_SIZEOF_RAY_QUERY);
/* StackID offset for the second group of 16 lanes in SIMD32; this must match
* the offset of the second base address provided by the driver through the
* pair of ray query RTDispatchGlobals.
*/
uint32_t simd32_stack_offset =
num_dss_rt_stacks * BRW_RT_SIZEOF_RAY_QUERY *
intel_device_info_dual_subslice_id_bound(devinfo);
offset32 =
nir_bcsel(b,
nir_iand(b,
nir_ige_imm(b, nir_load_subgroup_invocation(b), 16),
nir_ieq_imm(b, nir_load_subgroup_size(b), 32)),
nir_iadd_imm(b, offset32, simd32_stack_offset),
offset32);
return nir_isub(b, base_mem_addr, nir_u2u64(b, offset32));
}
@ -300,7 +318,6 @@ struct brw_nir_rt_globals_defs {
nir_def *launch_size;
nir_def *call_sbt_addr;
nir_def *call_sbt_stride;
nir_def *resume_sbt_addr;
};
static inline void
@ -368,8 +385,6 @@ brw_nir_rt_load_globals_addr(nir_builder *b,
defs->call_sbt_stride =
nir_iand_imm(b, nir_unpack_32_2x16_split_x(b, nir_channel(b, data, 2)),
0x1fff);
defs->resume_sbt_addr =
nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 3));
} else {
defs->call_sbt_addr =
nir_pack_64_2x32_split(b, nir_channel(b, data, 0),
@ -377,9 +392,6 @@ brw_nir_rt_load_globals_addr(nir_builder *b,
nir_imm_int(b, 0)));
defs->call_sbt_stride =
nir_unpack_32_2x16_split_y(b, nir_channel(b, data, 1));
defs->resume_sbt_addr =
nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 2));
}
}
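
The stack addressing that brw_nir_rt_sync_stack_addr() emits above can be summarized by this CPU-side sketch (illustrative only; the parameter names stand in for the NIR values and device constants used in the real code):

#include <stdbool.h>
#include <stdint.h>

/* Each DSS owns num_dss_rt_stacks stacks of BRW_RT_SIZEOF_RAY_QUERY bytes
 * growing downward from base_mem_addr. Lanes 16-31 of a SIMD32 subgroup use a
 * second block located dual_subslice_id_bound DSSes further down, which must
 * line up with the second RT_DISPATCH_GLOBALS base address programmed by the
 * driver. */
static uint64_t
sync_stack_addr_sketch(uint64_t base_mem_addr,
                       uint32_t dss_id,
                       uint32_t stack_id,
                       uint32_t num_dss_rt_stacks,
                       uint32_t sizeof_ray_query,       /* BRW_RT_SIZEOF_RAY_QUERY */
                       uint32_t dual_subslice_id_bound, /* intel_device_info_dual_subslice_id_bound() */
                       bool simd32_second_half)         /* subgroup invocation >= 16 at size 32 */
{
   uint32_t offset32 =
      (dss_id * num_dss_rt_stacks + stack_id + 1) * sizeof_ray_query;

   if (simd32_second_half)
      offset32 += num_dss_rt_stacks * sizeof_ray_query * dual_subslice_id_bound;

   return base_mem_addr - offset32;
}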


@ -36,7 +36,7 @@ extern "C" {
#define BRW_RT_SBT_HANDLE_SIZE 32
/** RT_DISPATCH_GLOBALS size (see gen_rt.xml) */
#define BRW_RT_DISPATCH_GLOBALS_SIZE 80
#define BRW_RT_DISPATCH_GLOBALS_SIZE 72
/** RT_DISPATCH_GLOBALS alignment
*
@ -191,10 +191,6 @@ struct brw_rt_raygen_trampoline_params {
(BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS + \
(BRW_RT_MAX_BVH_LEVELS % 2 ? 32 : 0))
#define BRW_RT_SIZEOF_SHADOW_RAY_QUERY \
(BRW_RT_SIZEOF_HIT_INFO * 2 + \
(BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS)
#define BRW_RT_SIZEOF_HW_STACK \
(BRW_RT_SIZEOF_HIT_INFO * 2 + \
BRW_RT_SIZEOF_RAY * BRW_RT_MAX_BVH_LEVELS + \
@ -270,25 +266,15 @@ brw_rt_ray_queries_hw_stacks_size(const struct intel_device_info *devinfo)
}
static inline uint32_t
brw_rt_ray_queries_shadow_stack_size(const struct intel_device_info *devinfo)
brw_rt_ray_queries_stacks_offset(uint32_t num_queries)
{
/* Maximum slice/subslice/EU ID can be computed from the max_scratch_ids
* which includes all the threads.
*/
uint32_t max_eu_id = devinfo->max_scratch_ids[MESA_SHADER_COMPUTE];
uint32_t max_simd_size = 32;
return max_eu_id * max_simd_size * BRW_RT_SIZEOF_SHADOW_RAY_QUERY;
return BRW_RT_DISPATCH_GLOBALS_ALIGN << util_logbase2_ceil(num_queries);
}
static inline uint32_t
brw_rt_ray_queries_shadow_stacks_size(const struct intel_device_info *devinfo,
uint32_t ray_queries)
brw_rt_ray_queries_stacks_stride(const struct intel_device_info *devinfo)
{
/* Don't bother a shadow stack if we only have a single query. We can
* directly write in the HW buffer.
*/
return (ray_queries > 1 ? ray_queries : 0) * brw_rt_ray_queries_shadow_stack_size(devinfo) +
ray_queries * 4; /* Ctrl + Level data */
return align(brw_rt_ray_queries_hw_stacks_size(devinfo), 4096);
}
#ifdef __cplusplus
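
A small usage sketch of the two helpers above (assuming this header plus util/u_math.h and assert.h are in scope; the function is illustrative, not part of the commit):

/* With 3 ray queries the allocation rounds up to the next power of two (4):
 * the globals array then needs 4 * BRW_RT_DISPATCH_GLOBALS_ALIGN bytes and
 * each query's stack region is one page-aligned HW stacks area. */
static inline void
ray_query_layout_example(const struct intel_device_info *devinfo)
{
   uint32_t num_queries = 1u << util_logbase2_ceil(3); /* 4 */

   assert(brw_rt_ray_queries_stacks_offset(num_queries) ==
          num_queries * BRW_RT_DISPATCH_GLOBALS_ALIGN);
   assert(brw_rt_ray_queries_stacks_stride(devinfo) % 4096 == 0);
}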


@ -28,7 +28,6 @@
<field name="Launch Height" dword="14" bits="31:0" type="uint" />
<field name="Launch Depth" dword="15" bits="31:0" type="uint" />
<field name="Callable Group Table" dword="16" bits="63:0" type="RT_SHADER_TABLE" />
<field name="Resume Shader Table" dword="18" bits="63:0" type="address" />
</struct>
<struct name="RT_GENERAL_SBT_HANDLE" length="8">
<field name="General" dword="0" bits="63:0" type="BINDLESS_SHADER_RECORD" />


@ -36,6 +36,5 @@
<field name="Launch Depth" dword="15" bits="31:0" type="uint" />
<field name="Callable Group Table" dword="16" bits="63:0" type="address" />
<field name="Callable Group Stride" dword="18" bits="12:0" type="uint" />
<field name="Resume Shader Table" dword="19" bits="63:0" type="address" />
</struct>
</genxml>


@ -442,58 +442,84 @@ anv_cmd_buffer_set_rt_query_buffer(struct anv_cmd_buffer *cmd_buffer,
uint32_t ray_queries,
VkShaderStageFlags stages)
{
struct anv_device *device = cmd_buffer->device;
uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
if (ray_queries > cmd_buffer->state.num_ray_query_globals) {
struct anv_device *device = cmd_buffer->device;
uint8_t wa_idx = anv_get_ray_query_bo_index(cmd_buffer);
uint64_t ray_shadow_size =
align64(brw_rt_ray_queries_shadow_stacks_size(device->info, ray_queries),
4096);
if (ray_shadow_size > 0 &&
(!cmd_buffer->state.ray_query_shadow_bo ||
cmd_buffer->state.ray_query_shadow_bo->size < ray_shadow_size)) {
unsigned shadow_size_log2 = MAX2(util_logbase2_ceil(ray_shadow_size), 16);
unsigned bucket = shadow_size_log2 - 16;
assert(bucket < ARRAY_SIZE(device->ray_query_shadow_bos[0]));
unsigned bucket = util_logbase2_ceil(ray_queries);
assert(bucket < ARRAY_SIZE(device->ray_query_bos[0]));
struct anv_bo *bo = p_atomic_read(&device->ray_query_shadow_bos[idx][bucket]);
uint64_t offset = brw_rt_ray_queries_stacks_offset(1 << bucket);
uint64_t stride = brw_rt_ray_queries_stacks_stride(device->info);
struct anv_bo *bo = p_atomic_read(&device->ray_query_bos[wa_idx][bucket]);
if (bo == NULL) {
struct anv_bo *new_bo;
VkResult result = anv_device_alloc_bo(device, "RT queries shadow",
1 << shadow_size_log2,
ANV_BO_ALLOC_INTERNAL, /* alloc_flags */
0, /* explicit_address */
&new_bo);
VkResult result =
anv_device_alloc_bo(device, "RT queries scratch",
offset + (stride << bucket), /* size */
ANV_BO_ALLOC_INTERNAL |
ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE, /* alloc_flags */
0, /* explicit_address */
&new_bo);
ANV_DMR_BO_ALLOC(&cmd_buffer->vk.base, new_bo, result);
if (result != VK_SUCCESS) {
anv_batch_set_error(&cmd_buffer->batch, result);
return;
}
bo = p_atomic_cmpxchg(&device->ray_query_shadow_bos[idx][bucket], NULL, new_bo);
/* Map the extra space we added at the end of the buffer; we will write the
* array of RT_DISPATCH_GLOBALS into it so that the shaders only need a
* single memory address for all stacks and globals.
*/
void *map;
result = anv_device_map_bo(device, new_bo, stride << bucket,
offset, NULL, &map);
if (result != VK_SUCCESS) {
ANV_DMR_BO_FREE(&cmd_buffer->vk.base, new_bo);
anv_device_release_bo(device, new_bo);
anv_batch_set_error(&cmd_buffer->batch, result);
return;
}
anv_genX(device->info, setup_ray_query_globals)(device,
new_bo,
stride << bucket,
map,
1 << bucket);
#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
if (device->physical->memory.need_flush)
util_flush_inval_range(map, offset);
#endif
anv_device_unmap_bo(device, new_bo, map, offset, false);
bo = p_atomic_cmpxchg(&device->ray_query_bos[wa_idx][bucket], NULL, new_bo);
if (bo != NULL) {
ANV_DMR_BO_FREE(&device->vk.base, new_bo);
ANV_DMR_BO_FREE(&cmd_buffer->vk.base, new_bo);
anv_device_release_bo(device, new_bo);
} else {
bo = new_bo;
}
}
cmd_buffer->state.ray_query_shadow_bo = bo;
/* Add the ray query buffers to the batch list. */
anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
cmd_buffer->state.ray_query_shadow_bo);
/* Add the HW buffer to the list of BO used. */
anv_reloc_list_add_bo(cmd_buffer->batch.relocs, bo);
cmd_buffer->state.ray_query_globals = (struct anv_address) {
.bo = bo,
.offset = (int64_t) (stride << bucket),
};
cmd_buffer->state.num_ray_query_globals = 1 << bucket;
}
/* Add the HW buffer to the list of BO used. */
assert(device->ray_query_bo[idx]);
anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
device->ray_query_bo[idx]);
/* Fill the push constants & mark them dirty. */
struct anv_address ray_query_globals_addr =
anv_genX(device->info, cmd_buffer_ray_query_globals)(cmd_buffer);
/* Update the push constants & mark them dirty. */
pipeline_state->push_constants.ray_query_globals =
anv_address_physical(ray_query_globals_addr);
anv_address_physical(cmd_buffer->state.ray_query_globals);
cmd_buffer->state.push_constants_dirty |= stages;
pipeline_state->push_constants_data_dirty = true;
}
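
To summarize the allocation above: one BO per bucket holds all the ray query stacks followed by the RT_DISPATCH_GLOBALS array, and the shader only receives the address of the globals array, from which everything else is derived. A minimal sizing sketch reusing the brw_rt.h helpers added earlier in this diff (the function name is illustrative):

/* Mirrors the sizing in anv_cmd_buffer_set_rt_query_buffer():
 *
 *   0                                   stride << bucket                end
 *   [ stacks, query (1<<bucket)-1 | ... | stacks, query 0 ][ globals array ]
 *
 * The ray_query_globals push constant points at `stride << bucket`, i.e. the
 * start of the globals array, just above query 0's stack region. */
static uint64_t
ray_query_bo_size_sketch(const struct intel_device_info *devinfo,
                         uint32_t ray_queries)
{
   unsigned bucket = util_logbase2_ceil(ray_queries);
   uint64_t stride = brw_rt_ray_queries_stacks_stride(devinfo);
   uint64_t globals_size = brw_rt_ray_queries_stacks_offset(1u << bucket);

   return (stride << bucket) + globals_size;
}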


@ -341,22 +341,16 @@ VkResult anv_CreateDevice(
ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
VkResult result;
struct anv_device *device;
bool device_has_compute_queue = false;
assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO);
/* Check requested queues and fail if we are requested to create any
* queues with flags we don't support.
*/
for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++)
if (pCreateInfo->pQueueCreateInfos[i].flags & ~VK_DEVICE_QUEUE_CREATE_PROTECTED_BIT)
return vk_error(physical_device, VK_ERROR_INITIALIZATION_FAILED);
const struct anv_queue_family *family =
&physical_device->queue.families[pCreateInfo->pQueueCreateInfos[i].queueFamilyIndex];
device_has_compute_queue |= family->engine_class == INTEL_ENGINE_CLASS_COMPUTE;
}
device = vk_zalloc2(&physical_device->instance->vk.alloc, pAllocator,
sizeof(*device), 8,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
@ -786,36 +780,9 @@ VkResult anv_CreateDevice(
device->workaround_bo->size,
INTEL_DEBUG_BLOCK_TYPE_FRAME);
if (device->vk.enabled_extensions.KHR_ray_query) {
uint32_t ray_queries_size =
align(brw_rt_ray_queries_hw_stacks_size(device->info), 4096);
result = anv_device_alloc_bo(device, "ray queries",
ray_queries_size,
ANV_BO_ALLOC_INTERNAL,
0 /* explicit_address */,
&device->ray_query_bo[0]);
ANV_DMR_BO_ALLOC(&device->vk.base, device->ray_query_bo[0], result);
if (result != VK_SUCCESS)
goto fail_alloc_device_bo;
/* We need a separate ray query bo for CCS engine with Wa_14022863161. */
if (intel_needs_workaround(device->isl_dev.info, 14022863161) &&
device_has_compute_queue) {
result = anv_device_alloc_bo(device, "ray queries",
ray_queries_size,
ANV_BO_ALLOC_INTERNAL,
0 /* explicit_address */,
&device->ray_query_bo[1]);
ANV_DMR_BO_ALLOC(&device->vk.base, device->ray_query_bo[1], result);
if (result != VK_SUCCESS)
goto fail_ray_query_bo;
}
}
result = anv_device_init_trivial_batch(device);
if (result != VK_SUCCESS)
goto fail_ray_query_bo;
goto fail_alloc_device_bo;
/* Emit the CPS states before running the initialization batch as those
* structures are referenced.
@ -1073,13 +1040,6 @@ VkResult anv_CreateDevice(
fail_trivial_batch:
ANV_DMR_BO_FREE(&device->vk.base, device->trivial_batch_bo);
anv_device_release_bo(device, device->trivial_batch_bo);
fail_ray_query_bo:
for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bo); i++) {
if (device->ray_query_bo[i]) {
ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bo[i]);
anv_device_release_bo(device, device->ray_query_bo[i]);
}
}
fail_alloc_device_bo:
if (device->mem_fence_bo) {
ANV_DMR_BO_FREE(&device->vk.base, device->mem_fence_bo);
@ -1231,17 +1191,13 @@ void anv_DestroyDevice(
anv_scratch_pool_finish(device, &device->protected_scratch_pool);
if (device->vk.enabled_extensions.KHR_ray_query) {
for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bo); i++) {
for (unsigned j = 0; j < ARRAY_SIZE(device->ray_query_shadow_bos[0]); j++) {
if (device->ray_query_shadow_bos[i][j] != NULL) {
ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_shadow_bos[i][j]);
anv_device_release_bo(device, device->ray_query_shadow_bos[i][j]);
for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bos); i++) {
for (unsigned j = 0; j < ARRAY_SIZE(device->ray_query_bos[0]); j++) {
if (device->ray_query_bos[i][j] != NULL) {
ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bos[i][j]);
anv_device_release_bo(device, device->ray_query_bos[i][j]);
}
}
if (device->ray_query_bo[i]) {
ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bo[i]);
anv_device_release_bo(device, device->ray_query_bo[i]);
}
}
}
ANV_DMR_BO_FREE(&device->vk.base, device->workaround_bo);


@ -226,7 +226,11 @@ void genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
void genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer);
struct anv_address genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer);
void genX(setup_ray_query_globals)(struct anv_device *device,
struct anv_bo* bo,
uint64_t offset,
void* map,
uint32_t num_queries);
void genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
uint32_t total_scratch);


@ -2625,22 +2625,11 @@ struct anv_device {
uint32_t protected_session_id;
/** Shadow ray query BO
*
* The ray_query_bo only holds the current ray being traced. When using
* more than 1 ray query per thread, we cannot fit all the queries in
* there, so we need a another buffer to hold query data that is not
* currently being used by the HW for tracing, similar to a scratch space.
*
* The size of the shadow buffer depends on the number of queries per
* shader.
/** Pool of ray query buffers used to communicate with the HW unit.
*
* We might need a buffer per queue family due to Wa_14022863161.
*/
struct anv_bo *ray_query_shadow_bos[2][16];
/** Ray query buffer used to communicated with HW unit.
*/
struct anv_bo *ray_query_bo[2];
struct anv_bo *ray_query_bos[2][16];
struct anv_shader_internal *rt_trampoline;
struct anv_shader_internal *rt_trivial_return;
@ -4247,10 +4236,19 @@ struct anv_push_constants {
*/
uint32_t surfaces_base_offset;
/** Ray query globals
/**
* Pointer to ray query stacks and their associated pairs of
* RT_DISPATCH_GLOBALS structures (see genX(setup_ray_query_globals))
*
* Pointer to a couple of RT_DISPATCH_GLOBALS structures (see
* genX(cmd_buffer_ray_query_globals))
* The pair of globals for each query object are stored counting up from
* this address in units of BRW_RT_DISPATCH_GLOBALS_ALIGN:
*
* rq_globals = ray_query_globals + (rq * BRW_RT_DISPATCH_GLOBALS_ALIGN)
*
* The raytracing scratch area for each ray query is stored counting down
* from this address in units of brw_rt_ray_queries_stacks_stride(devinfo):
*
* rq_stacks_addr = ray_query_globals - (rq * ray_queries_stacks_stride)
*/
uint64_t ray_query_globals;
@ -4753,9 +4751,14 @@ struct anv_cmd_state {
unsigned current_hash_scale;
/**
* A buffer used for spill/fill of ray queries.
* Number of ray query buffers allocated.
*/
struct anv_bo * ray_query_shadow_bo;
uint32_t num_ray_query_globals;
/**
* Current array of RT_DISPATCH_GLOBALS for ray queries.
*/
struct anv_address ray_query_globals;
/** Pointer to the last emitted COMPUTE_WALKER.
*


@ -37,6 +37,7 @@
#include "ds/intel_tracepoints.h"
#include "genX_mi_builder.h"
#include "nir_builder.h"
void
genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
@ -811,49 +812,35 @@ void genX(CmdDispatchIndirect)(
genX(cmd_buffer_dispatch_indirect)(cmd_buffer, addr, false);
}
struct anv_address
genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
void
genX(setup_ray_query_globals)(struct anv_device *device,
struct anv_bo* bo,
uint64_t offset,
void* map,
uint32_t num_queries)
{
#if GFX_VERx10 >= 125
struct anv_device *device = cmd_buffer->device;
struct anv_state state =
anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
2 * align(BRW_RT_DISPATCH_GLOBALS_SIZE, 64),
BRW_RT_DISPATCH_GLOBALS_ALIGN);
struct brw_rt_scratch_layout layout;
uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in
* some cases?
*/
brw_rt_compute_scratch_layout(&layout, device->info,
stack_ids_per_dss, 1 << 10);
uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
for (uint32_t i = 0; i < 2; i++) {
const struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
.MemBaseAddress = (struct anv_address) {
/* The ray query HW computes offsets from the top of the buffer, so
* let the address at the end of the buffer.
*/
.bo = device->ray_query_bo[idx],
.offset = (i + 1) * (device->ray_query_bo[idx]->size / 2),
},
.AsyncRTStackSize = layout.ray_stack_stride / 64,
.NumDSSRTStacks = layout.stack_ids_per_dss,
.MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
.Flags = RT_DEPTH_TEST_LESS_EQUAL,
.ResumeShaderTable = (struct anv_address) {
.bo = cmd_buffer->state.ray_query_shadow_bo,
},
};
GENX(RT_DISPATCH_GLOBALS_pack)(
NULL,
state.map + i * align(4 * GENX(RT_DISPATCH_GLOBALS_length), 64),
&rtdg);
}
return anv_cmd_buffer_temporary_state_address(cmd_buffer, state);
assert(num_queries > 0);
uint64_t stack_stride = brw_rt_ray_queries_stacks_stride(device->info);
for (uint32_t i = 0; i < num_queries; ++i)
for (uint32_t j = 0; j < 2; j++)
GENX(RT_DISPATCH_GLOBALS_pack)(NULL,
(char*) map +
i * BRW_RT_DISPATCH_GLOBALS_ALIGN +
j * align(4 * GENX(RT_DISPATCH_GLOBALS_length), 64),
&(struct GENX(RT_DISPATCH_GLOBALS)) {
.MemBaseAddress = (struct anv_address) {
/* The ray query HW computes offsets from the top of the
* buffer, so set the address at the end of the buffer.
*/
.bo = bo,
.offset = offset - i * stack_stride - j * stack_stride / 2,
},
.AsyncRTStackSize = BRW_RT_SIZEOF_RAY_QUERY / 64,
.NumDSSRTStacks = 2048, /* TODO */
.MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
.Flags = RT_DEPTH_TEST_LESS_EQUAL,
});
#else
UNREACHABLE("Not supported");
#endif
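
For completeness, the addresses associated with a single ray query in the per-bucket BO, as programmed by genX(setup_ray_query_globals) above and consumed by the NIR lowering, can be written out as a sketch (struct and parameter names are illustrative; `globals_offset` corresponds to the `offset` argument, i.e. stride << bucket):

#include <stdint.h>

struct rq_addrs {
   uint64_t globals[2];    /* RT_DISPATCH_GLOBALS pair for this query */
   uint64_t stack_base[2]; /* MemBaseAddress of each SIMD32 half */
};

static struct rq_addrs
ray_query_addrs(uint64_t bo_base, uint64_t globals_offset, uint64_t stride,
                uint64_t globals_align, /* BRW_RT_DISPATCH_GLOBALS_ALIGN */
                uint64_t rtdg_size,     /* align(4 * RT_DISPATCH_GLOBALS_length, 64) */
                uint32_t i)
{
   uint64_t globals_base = bo_base + globals_offset + i * globals_align;

   return (struct rq_addrs) {
      .globals    = { globals_base, globals_base + rtdg_size },
      .stack_base = { bo_base + globals_offset - i * stride,
                      bo_base + globals_offset - i * stride - stride / 2 },
   };
}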