Revert "anv,brw: Allow multiple ray queries without spilling to a shadow stack"

This optimization doesn't work when the ray query index isn't uniform across
the subgroup, which the spec allows. While there are some smart ways to fix
this and still avoid unnecessary spilling, it's not worth investing the time
until we find a real-time ray tracing workload that actually needs multiple
live ray queries.

Fixes: 1f1de7eb ("anv,brw: Allow multiple ray queries without spilling to a shadow stack")
Acked-by: Sagar Ghuge <sagar.ghuge@intel.com>
Acked-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39445>
Calder Young 2026-01-21 13:31:39 -08:00 committed by Marge Bot
parent 88ae2365b2
commit 895ff7fe92
10 changed files with 278 additions and 179 deletions
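
To make the failure mode concrete, here is a minimal C model of the reverted addressing scheme (all names and constants below are hypothetical stand-ins, not Mesa code): once each lane of a subgroup may select its own ray query index, the RT_DISPATCH_GLOBALS address derived from that index diverges, while the synchronous trace message wants a single uniform address per subgroup.

/* Toy model of the reverted scheme; names and constants are stand-ins. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SUBGROUP_SIZE  32
#define GLOBALS_STRIDE 256 /* stand-in for BRW_RT_DISPATCH_GLOBALS_ALIGN */

int main(void)
{
   const uint64_t ray_query_globals = 0x100000; /* push-constant base address */
   uint64_t lane_globals[SUBGROUP_SIZE];
   bool uniform = true;

   for (unsigned lane = 0; lane < SUBGROUP_SIZE; lane++) {
      /* The spec allows the query index to differ between invocations. */
      unsigned query_idx = lane & 1;
      lane_globals[lane] = ray_query_globals + (uint64_t)query_idx * GLOBALS_STRIDE;
      uniform &= (lane_globals[lane] == lane_globals[0]);
   }

   /* The reverted code handed this per-query address to the trace message,
    * which only works with one uniform address per subgroup. */
   printf("globals address uniform across the subgroup: %s\n",
          uniform ? "yes" : "no");
   return 0;
}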

View file

@ -38,10 +38,8 @@ struct lowering_state {
struct hash_table *queries;
uint32_t n_queries;
struct brw_nir_rt_globals_defs globals;
nir_def *rq_globals;
uint32_t num_dss_rt_stacks;
uint32_t sync_stacks_stride;
};
struct brw_ray_query {
@ -52,6 +50,12 @@ struct brw_ray_query {
#define SIZEOF_QUERY_STATE (sizeof(uint32_t))
static bool
need_spill_fill(struct lowering_state *state)
{
return state->n_queries > 1;
}
/**
* This pass converts opaque RayQuery structures from SPIRV into a vec3 where
* the first 2 elements store a global address for the query and the third
@ -94,8 +98,10 @@ create_internal_var(struct brw_ray_query *rq, struct lowering_state *state)
NULL);
}
static nir_def *
get_ray_query_stack_index(nir_builder *b,
get_ray_query_shadow_addr(nir_builder *b,
nir_deref_instr *deref,
struct lowering_state *state,
nir_deref_instr **out_state_deref)
@ -110,17 +116,35 @@ get_ray_query_stack_index(nir_builder *b,
struct brw_ray_query *rq = entry->data;
nir_def *query_idx = nir_imm_int(b, rq->id);
/* Base address in the shadow memory of the variable associated with this
* ray query variable.
*/
nir_def *base_addr =
nir_iadd_imm(b, state->globals.resume_sbt_addr,
brw_rt_ray_queries_shadow_stack_size(state->devinfo) * rq->id);
bool spill_fill = need_spill_fill(state);
*out_state_deref = nir_build_deref_var(b, rq->internal_var);
if (!spill_fill)
return NULL;
/* Just emit code and let constant-folding go to town */
nir_deref_instr **p = &path.path[1];
for (; *p; p++) {
if ((*p)->deref_type == nir_deref_type_array) {
nir_def *index = (*p)->arr.index.ssa;
/**/
*out_state_deref = nir_build_deref_array(b, *out_state_deref, index);
index = nir_amul_imm(b, index, MAX2(1, glsl_get_aoa_size((*p)->type)));
query_idx = nir_iadd(b, query_idx, index);
/**/
uint64_t size = MAX2(1, glsl_get_aoa_size((*p)->type)) *
brw_rt_ray_queries_shadow_stack_size(state->devinfo);
nir_def *mul = nir_amul_imm(b, nir_i2i64(b, index), size);
base_addr = nir_iadd(b, base_addr, mul);
} else {
UNREACHABLE("Unsupported deref type");
}
@ -128,7 +152,28 @@ get_ray_query_stack_index(nir_builder *b,
nir_deref_path_finish(&path);
return query_idx;
/* Add the lane offset to the shadow memory address */
nir_def *lane_offset =
nir_imul_imm(
b,
nir_iadd(
b,
nir_imul(
b,
brw_load_btd_dss_id(b),
state->globals.num_dss_rt_stacks),
brw_nir_rt_sync_stack_id(b)),
BRW_RT_SIZEOF_SHADOW_RAY_QUERY);
/* Top/bottom 16 lanes each get their own stack area */
lane_offset = nir_bcsel(
b,
nir_ilt_imm(b, nir_load_subgroup_invocation(b), 16),
lane_offset,
nir_iadd_imm(b, lane_offset,
brw_rt_ray_queries_shadow_stack_size(state->devinfo) / 2));
return nir_iadd(b, base_addr, nir_i2i64(b, lane_offset));
}
static void
@ -164,6 +209,26 @@ update_trace_ctrl_level(nir_builder *b,
}
}
static void
fill_query(nir_builder *b,
nir_def *hw_stack_addr,
nir_def *shadow_stack_addr,
nir_def *ctrl)
{
brw_nir_memcpy_global(b, hw_stack_addr, 64, shadow_stack_addr, 64,
BRW_RT_SIZEOF_RAY_QUERY);
}
static void
spill_query(nir_builder *b,
nir_def *hw_stack_addr,
nir_def *shadow_stack_addr)
{
brw_nir_memcpy_global(b, shadow_stack_addr, 64, hw_stack_addr, 64,
BRW_RT_SIZEOF_RAY_QUERY);
}
static void
lower_ray_query_intrinsic(nir_builder *b,
nir_intrinsic_instr *intrin,
@ -174,20 +239,12 @@ lower_ray_query_intrinsic(nir_builder *b,
b->cursor = nir_instr_remove(&intrin->instr);
nir_deref_instr *ctrl_level_deref;
nir_def *stack_index =
get_ray_query_stack_index(b, deref, state, &ctrl_level_deref);
nir_def *rq_globals_addr =
nir_iadd(b, state->rq_globals,
nir_i2i64(b, nir_amul_imm(b, stack_index,
BRW_RT_DISPATCH_GLOBALS_ALIGN)));
nir_def *stack_base_addr =
nir_isub(b, state->rq_globals,
nir_i2i64(b, nir_amul_imm(b, stack_index,
state->sync_stacks_stride)));
nir_def *stack_addr =
brw_nir_rt_sync_stack_addr(b, stack_base_addr,
state->num_dss_rt_stacks,
state->devinfo);
nir_def *shadow_stack_addr =
get_ray_query_shadow_addr(b, deref, state, &ctrl_level_deref);
nir_def *hw_stack_addr =
brw_nir_rt_sync_stack_addr(b, state->globals.base_mem_addr,
state->globals.num_dss_rt_stacks);
nir_def *stack_addr = shadow_stack_addr ? shadow_stack_addr : hw_stack_addr;
mesa_shader_stage stage = b->shader->info.stage;
switch (intrin->intrinsic) {
@ -256,12 +313,22 @@ lower_ray_query_intrinsic(nir_builder *b,
*/
brw_nir_rt_query_mark_done(b, stack_addr);
nir_trace_ray_intel(b, rq_globals_addr, level, ctrl, .synchronous = true);
if (shadow_stack_addr)
fill_query(b, hw_stack_addr, shadow_stack_addr, ctrl);
/* Do not use state->rq_globals, we want a uniform value for the
* tracing call.
*/
nir_trace_ray_intel(b, nir_load_ray_query_global_intel(b),
level, ctrl, .synchronous = true);
struct brw_nir_rt_mem_hit_defs hit_in = {};
brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, stack_addr, false,
brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, hw_stack_addr, false,
state->devinfo);
if (shadow_stack_addr)
spill_query(b, hw_stack_addr, shadow_stack_addr);
update_trace_ctrl_level(b, ctrl_level_deref,
NULL, NULL,
nir_imm_int(b, GEN_RT_TRACE_RAY_CONTINUE),
@ -480,12 +547,21 @@ lower_ray_query_impl(nir_function_impl *impl, struct lowering_state *state)
nir_builder _b, *b = &_b;
_b = nir_builder_at(nir_before_impl(impl));
state->rq_globals = nir_load_ray_query_global_intel(b);
nir_def *rq_globals_base = nir_load_ray_query_global_intel(b);
state->num_dss_rt_stacks =
brw_rt_ray_queries_stack_ids_per_dss(state->devinfo);
state->sync_stacks_stride =
brw_rt_ray_queries_stacks_stride(state->devinfo);
/* Use a different global for each group of 16 lanes (only in SIMD32). */
state->rq_globals = nir_bcsel(
b,
nir_iand(b,
nir_ige_imm(b, nir_load_subgroup_invocation(b), 16),
nir_ieq_imm(b, nir_load_subgroup_size(b), 32)),
nir_iadd_imm(
b, rq_globals_base,
align(4 * RT_DISPATCH_GLOBALS_length(state->devinfo), 64)),
rq_globals_base);
brw_nir_rt_load_globals_addr(b, &state->globals, state->rq_globals,
state->devinfo);
nir_foreach_block_safe(block, impl) {
nir_foreach_instr_safe(instr, block) {
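
For reference, a standalone C sketch of the shadow-stack addressing that the restored get_ray_query_shadow_addr() builds above. The real computation is emitted as NIR and runs on the GPU; the constants here are hypothetical stand-ins for the devinfo-derived values.

/* CPU-side sketch only; constants are stand-ins, not the real values. */
#include <stdint.h>

#define SIZEOF_SHADOW_RAY_QUERY 768  /* stand-in for BRW_RT_SIZEOF_SHADOW_RAY_QUERY */
#define NUM_DSS_RT_STACKS       2048 /* stand-in for the per-DSS sync stack-ID count */
#define MAX_SCRATCH_IDS         4096 /* stand-in for devinfo->max_scratch_ids[COMPUTE] */

/* Mirrors brw_rt_ray_queries_shadow_stack_size(): one slot per scratch ID and
 * per SIMD32 lane. */
static inline uint64_t
shadow_stack_size(void)
{
   return (uint64_t)MAX_SCRATCH_IDS * 32 * SIZEOF_SHADOW_RAY_QUERY;
}

/* Mirrors the address built above: a per-query base starting at the resume
 * shader table address from RT_DISPATCH_GLOBALS, plus a lane offset derived
 * from the DSS ID and sync stack ID, with the upper 16 lanes of a SIMD32
 * subgroup pushed into the second half of the per-query area. */
static inline uint64_t
shadow_addr(uint64_t resume_sbt_addr, uint32_t query_id, uint32_t dss_id,
            uint32_t stack_id, uint32_t subgroup_invocation)
{
   uint64_t base = resume_sbt_addr + shadow_stack_size() * query_id;
   uint64_t lane_offset =
      ((uint64_t)dss_id * NUM_DSS_RT_STACKS + stack_id) * SIZEOF_SHADOW_RAY_QUERY;
   if (subgroup_invocation >= 16)
      lane_offset += shadow_stack_size() / 2;
   return base + lane_offset;
}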

View file

@ -178,8 +178,7 @@ brw_nir_rt_sw_hotzone_addr(nir_builder *b,
static inline nir_def *
brw_nir_rt_sync_stack_addr(nir_builder *b,
nir_def *base_mem_addr,
uint32_t num_dss_rt_stacks,
const struct intel_device_info *devinfo)
nir_def *num_dss_rt_stacks)
{
/* Bspec 47547 (Xe) and 56936 (Xe2+) say:
* For Ray queries (Synchronous Ray Tracing), the formula is similar but
@ -196,29 +195,12 @@ brw_nir_rt_sync_stack_addr(nir_builder *b,
* NUM_SYNC_STACKID_PER_DSS instead.
*/
nir_def *offset32 =
nir_imul_imm(b,
nir_iadd(b,
nir_imul_imm(b, brw_load_btd_dss_id(b),
num_dss_rt_stacks),
nir_iadd_imm(b, brw_nir_rt_sync_stack_id(b), 1)),
BRW_RT_SIZEOF_RAY_QUERY);
/* StackID offset for the upper 16 lanes in SIMD32; this must match the
* offset of the second base address provided by the driver through the
* pair of ray query RTDispatchGlobals.
*/
uint32_t simd32_stack_offset =
num_dss_rt_stacks * BRW_RT_SIZEOF_RAY_QUERY *
intel_device_info_dual_subslice_id_bound(devinfo);
offset32 =
nir_bcsel(b,
nir_iand(b,
nir_ige_imm(b, nir_load_subgroup_invocation(b), 16),
nir_ieq_imm(b, nir_load_subgroup_size(b), 32)),
nir_iadd_imm(b, offset32, simd32_stack_offset),
offset32);
nir_imul(b,
nir_iadd(b,
nir_imul(b, brw_load_btd_dss_id(b),
num_dss_rt_stacks),
nir_iadd_imm(b, brw_nir_rt_sync_stack_id(b), 1)),
nir_imm_int(b, BRW_RT_SIZEOF_RAY_QUERY));
return nir_isub(b, base_mem_addr, nir_u2u64(b, offset32));
}
@ -318,6 +300,7 @@ struct brw_nir_rt_globals_defs {
nir_def *launch_size;
nir_def *call_sbt_addr;
nir_def *call_sbt_stride;
nir_def *resume_sbt_addr;
};
static inline void
@ -385,6 +368,8 @@ brw_nir_rt_load_globals_addr(nir_builder *b,
defs->call_sbt_stride =
nir_iand_imm(b, nir_unpack_32_2x16_split_x(b, nir_channel(b, data, 2)),
0x1fff);
defs->resume_sbt_addr =
nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 3));
} else {
defs->call_sbt_addr =
nir_pack_64_2x32_split(b, nir_channel(b, data, 0),
@ -392,6 +377,9 @@ brw_nir_rt_load_globals_addr(nir_builder *b,
nir_imm_int(b, 0)));
defs->call_sbt_stride =
nir_unpack_32_2x16_split_y(b, nir_channel(b, data, 1));
defs->resume_sbt_addr =
nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 2));
}
}

View file

@ -36,7 +36,7 @@ extern "C" {
#define BRW_RT_SBT_HANDLE_SIZE 32
/** RT_DISPATCH_GLOBALS size (see gen_rt.xml) */
#define BRW_RT_DISPATCH_GLOBALS_SIZE 72
#define BRW_RT_DISPATCH_GLOBALS_SIZE 80
/** RT_DISPATCH_GLOBALS alignment
*
@ -194,6 +194,10 @@ struct brw_rt_raygen_trampoline_params {
(BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS + \
(BRW_RT_MAX_BVH_LEVELS % 2 ? 32 : 0))
#define BRW_RT_SIZEOF_SHADOW_RAY_QUERY \
(BRW_RT_SIZEOF_HIT_INFO * 2 + \
(BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS)
#define BRW_RT_SIZEOF_HW_STACK \
(BRW_RT_SIZEOF_HIT_INFO * 2 + \
BRW_RT_SIZEOF_RAY * BRW_RT_MAX_BVH_LEVELS + \
@ -280,15 +284,25 @@ brw_rt_ray_queries_stack_ids_per_dss(const struct intel_device_info *devinfo)
}
static inline uint32_t
brw_rt_ray_queries_stacks_offset(uint32_t num_queries)
brw_rt_ray_queries_shadow_stack_size(const struct intel_device_info *devinfo)
{
return BRW_RT_DISPATCH_GLOBALS_ALIGN << util_logbase2_ceil(num_queries);
/* The maximum slice/subslice/EU ID can be computed from max_scratch_ids,
* which includes all the threads.
*/
uint32_t max_eu_id = devinfo->max_scratch_ids[MESA_SHADER_COMPUTE];
uint32_t max_simd_size = 32;
return max_eu_id * max_simd_size * BRW_RT_SIZEOF_SHADOW_RAY_QUERY;
}
static inline uint32_t
brw_rt_ray_queries_stacks_stride(const struct intel_device_info *devinfo)
brw_rt_ray_queries_shadow_stacks_size(const struct intel_device_info *devinfo,
uint32_t ray_queries)
{
return align(brw_rt_ray_queries_hw_stacks_size(devinfo), 4096);
/* Don't bother with a shadow stack if we only have a single query; we can
* write directly into the HW buffer.
*/
return (ray_queries > 1 ? ray_queries : 0) * brw_rt_ray_queries_shadow_stack_size(devinfo) +
ray_queries * 4; /* Ctrl + Level data */
}
#ifdef __cplusplus
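
A hedged worked example of the sizing rule above (the per-query size is illustrative; in the driver it comes from brw_rt_ray_queries_shadow_stack_size()): a single query spills nothing and only pays the 4 bytes of Ctrl + Level data, while two or more queries each get a full shadow stack.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
   const uint64_t per_query = 96ull << 20; /* pretend one shadow stack is 96 MiB */

   for (uint32_t n = 1; n <= 4; n *= 2) {
      /* Same formula as brw_rt_ray_queries_shadow_stacks_size(). */
      uint64_t size = (uint64_t)(n > 1 ? n : 0) * per_query + (uint64_t)n * 4;
      printf("%u ray queries -> %llu bytes of shadow memory\n",
             n, (unsigned long long)size);
   }
   return 0;
}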

View file

@ -28,6 +28,7 @@
<field name="Launch Height" dword="14" bits="31:0" type="uint" />
<field name="Launch Depth" dword="15" bits="31:0" type="uint" />
<field name="Callable Group Table" dword="16" bits="63:0" type="RT_SHADER_TABLE" />
<field name="Resume Shader Table" dword="18" bits="63:0" type="address" />
</struct>
<struct name="RT_GENERAL_SBT_HANDLE" length="8">
<field name="General" dword="0" bits="63:0" type="BINDLESS_SHADER_RECORD" />

View file

@ -36,5 +36,6 @@
<field name="Launch Depth" dword="15" bits="31:0" type="uint" />
<field name="Callable Group Table" dword="16" bits="63:0" type="address" />
<field name="Callable Group Stride" dword="18" bits="12:0" type="uint" />
<field name="Resume Shader Table" dword="19" bits="63:0" type="address" />
</struct>
</genxml>

View file

@ -442,84 +442,58 @@ anv_cmd_buffer_set_rt_query_buffer(struct anv_cmd_buffer *cmd_buffer,
uint32_t ray_queries,
VkShaderStageFlags stages)
{
if (ray_queries > cmd_buffer->state.num_ray_query_globals) {
struct anv_device *device = cmd_buffer->device;
uint8_t wa_idx = anv_get_ray_query_bo_index(cmd_buffer);
struct anv_device *device = cmd_buffer->device;
uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
unsigned bucket = util_logbase2_ceil(ray_queries);
assert(bucket < ARRAY_SIZE(device->ray_query_bos[0]));
uint64_t ray_shadow_size =
align64(brw_rt_ray_queries_shadow_stacks_size(device->info, ray_queries),
4096);
if (ray_shadow_size > 0 &&
(!cmd_buffer->state.ray_query_shadow_bo ||
cmd_buffer->state.ray_query_shadow_bo->size < ray_shadow_size)) {
unsigned shadow_size_log2 = MAX2(util_logbase2_ceil(ray_shadow_size), 16);
unsigned bucket = shadow_size_log2 - 16;
assert(bucket < ARRAY_SIZE(device->ray_query_shadow_bos[0]));
uint64_t offset = brw_rt_ray_queries_stacks_offset(1 << bucket);
uint64_t stride = brw_rt_ray_queries_stacks_stride(device->info);
struct anv_bo *bo = p_atomic_read(&device->ray_query_bos[wa_idx][bucket]);
struct anv_bo *bo = p_atomic_read(&device->ray_query_shadow_bos[idx][bucket]);
if (bo == NULL) {
struct anv_bo *new_bo;
VkResult result =
anv_device_alloc_bo(device, "RT queries scratch",
offset + (stride << bucket), /* size */
ANV_BO_ALLOC_INTERNAL |
ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE, /* alloc_flags */
0, /* explicit_address */
&new_bo);
VkResult result = anv_device_alloc_bo(device, "RT queries shadow",
1 << shadow_size_log2,
ANV_BO_ALLOC_INTERNAL, /* alloc_flags */
0, /* explicit_address */
&new_bo);
ANV_DMR_BO_ALLOC(&cmd_buffer->vk.base, new_bo, result);
if (result != VK_SUCCESS) {
anv_batch_set_error(&cmd_buffer->batch, result);
return;
}
/* Map the extra space we added at the end of the buffer; we will write the
* array of RT_DISPATCH_GLOBALS into it so that our shaders can use a single
* memory address for all stacks and globals.
*/
void *map;
result = anv_device_map_bo(device, new_bo, stride << bucket,
offset, NULL, &map);
if (result != VK_SUCCESS) {
ANV_DMR_BO_FREE(&cmd_buffer->vk.base, new_bo);
anv_device_release_bo(device, new_bo);
anv_batch_set_error(&cmd_buffer->batch, result);
return;
}
anv_genX(device->info, setup_ray_query_globals)(device,
new_bo,
stride << bucket,
map,
1 << bucket);
#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
if (device->physical->memory.need_flush)
util_flush_inval_range(map, offset);
#endif
anv_device_unmap_bo(device, new_bo, map, offset, false);
bo = p_atomic_cmpxchg(&device->ray_query_bos[wa_idx][bucket], NULL, new_bo);
bo = p_atomic_cmpxchg(&device->ray_query_shadow_bos[idx][bucket], NULL, new_bo);
if (bo != NULL) {
ANV_DMR_BO_FREE(&cmd_buffer->vk.base, new_bo);
ANV_DMR_BO_FREE(&device->vk.base, new_bo);
anv_device_release_bo(device, new_bo);
} else {
bo = new_bo;
}
}
cmd_buffer->state.ray_query_shadow_bo = bo;
/* Add the HW buffer to the list of BOs used. */
anv_reloc_list_add_bo(cmd_buffer->batch.relocs, bo);
cmd_buffer->state.ray_query_globals = (struct anv_address) {
.bo = bo,
.offset = (int64_t) (stride << bucket),
};
cmd_buffer->state.num_ray_query_globals = 1 << bucket;
/* Add the ray query buffers to the batch list. */
anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
cmd_buffer->state.ray_query_shadow_bo);
}
/* Update the push constants & mark them dirty. */
/* Add the HW buffer to the list of BOs used. */
assert(device->ray_query_bo[idx]);
anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
device->ray_query_bo[idx]);
/* Fill the push constants & mark them dirty. */
struct anv_address ray_query_globals_addr =
anv_genX(device->info, cmd_buffer_ray_query_globals)(cmd_buffer);
pipeline_state->push_constants.ray_query_globals =
anv_address_physical(cmd_buffer->state.ray_query_globals);
anv_address_physical(ray_query_globals_addr);
cmd_buffer->state.push_constants_dirty |= stages;
pipeline_state->push_constants_data_dirty = true;
}
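
A small sketch of the shadow-BO bucketing used here, assuming a hypothetical helper name: sizes are rounded up to a power of two with a 64KiB floor, and the device caches one BO per bucket so later command buffers can reuse it.

#include <stdint.h>

/* Mirrors the bucket computation above: shadow_size_log2 is the ceil-log2 of
 * the required size, clamped to at least 16 (64KiB), and the bucket index is
 * shadow_size_log2 - 16. */
static inline unsigned
shadow_bucket_for_size(uint64_t ray_shadow_size, uint64_t *bo_size)
{
   unsigned log2_size = 16; /* 64KiB minimum allocation */
   while ((1ull << log2_size) < ray_shadow_size)
      log2_size++;
   *bo_size = 1ull << log2_size;
   return log2_size - 16; /* bucket 0 = 64KiB, bucket 1 = 128KiB, ... */
}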

View file

@ -341,16 +341,22 @@ VkResult anv_CreateDevice(
ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
VkResult result;
struct anv_device *device;
bool device_has_compute_queue = false;
assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO);
/* Check requested queues and fail if we are requested to create any
* queues with flags we don't support.
*/
for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++)
for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
if (pCreateInfo->pQueueCreateInfos[i].flags & ~VK_DEVICE_QUEUE_CREATE_PROTECTED_BIT)
return vk_error(physical_device, VK_ERROR_INITIALIZATION_FAILED);
const struct anv_queue_family *family =
&physical_device->queue.families[pCreateInfo->pQueueCreateInfos[i].queueFamilyIndex];
device_has_compute_queue |= family->engine_class == INTEL_ENGINE_CLASS_COMPUTE;
}
device = vk_zalloc2(&physical_device->instance->vk.alloc, pAllocator,
sizeof(*device), 8,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
@ -780,9 +786,36 @@ VkResult anv_CreateDevice(
device->workaround_bo->size,
INTEL_DEBUG_BLOCK_TYPE_FRAME);
if (device->vk.enabled_extensions.KHR_ray_query) {
uint32_t ray_queries_size =
align(brw_rt_ray_queries_hw_stacks_size(device->info), 4096);
result = anv_device_alloc_bo(device, "ray queries",
ray_queries_size,
ANV_BO_ALLOC_INTERNAL,
0 /* explicit_address */,
&device->ray_query_bo[0]);
ANV_DMR_BO_ALLOC(&device->vk.base, device->ray_query_bo[0], result);
if (result != VK_SUCCESS)
goto fail_alloc_device_bo;
/* We need a separate ray query bo for CCS engine with Wa_14022863161. */
if (intel_needs_workaround(device->isl_dev.info, 14022863161) &&
device_has_compute_queue) {
result = anv_device_alloc_bo(device, "ray queries",
ray_queries_size,
ANV_BO_ALLOC_INTERNAL,
0 /* explicit_address */,
&device->ray_query_bo[1]);
ANV_DMR_BO_ALLOC(&device->vk.base, device->ray_query_bo[1], result);
if (result != VK_SUCCESS)
goto fail_ray_query_bo;
}
}
result = anv_device_init_trivial_batch(device);
if (result != VK_SUCCESS)
goto fail_alloc_device_bo;
goto fail_ray_query_bo;
/* Emit the CPS states before running the initialization batch as those
* structures are referenced.
@ -1040,6 +1073,13 @@ VkResult anv_CreateDevice(
fail_trivial_batch:
ANV_DMR_BO_FREE(&device->vk.base, device->trivial_batch_bo);
anv_device_release_bo(device, device->trivial_batch_bo);
fail_ray_query_bo:
for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bo); i++) {
if (device->ray_query_bo[i]) {
ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bo[i]);
anv_device_release_bo(device, device->ray_query_bo[i]);
}
}
fail_alloc_device_bo:
if (device->mem_fence_bo) {
ANV_DMR_BO_FREE(&device->vk.base, device->mem_fence_bo);
@ -1191,13 +1231,17 @@ void anv_DestroyDevice(
anv_scratch_pool_finish(device, &device->protected_scratch_pool);
if (device->vk.enabled_extensions.KHR_ray_query) {
for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bos); i++) {
for (unsigned j = 0; j < ARRAY_SIZE(device->ray_query_bos[0]); j++) {
if (device->ray_query_bos[i][j] != NULL) {
ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bos[i][j]);
anv_device_release_bo(device, device->ray_query_bos[i][j]);
for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bo); i++) {
for (unsigned j = 0; j < ARRAY_SIZE(device->ray_query_shadow_bos[0]); j++) {
if (device->ray_query_shadow_bos[i][j] != NULL) {
ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_shadow_bos[i][j]);
anv_device_release_bo(device, device->ray_query_shadow_bos[i][j]);
}
}
if (device->ray_query_bo[i]) {
ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bo[i]);
anv_device_release_bo(device, device->ray_query_bo[i]);
}
}
}
ANV_DMR_BO_FREE(&device->vk.base, device->workaround_bo);

View file

@ -226,11 +226,7 @@ void genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
void genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer);
void genX(setup_ray_query_globals)(struct anv_device *device,
struct anv_bo* bo,
uint64_t offset,
void* map,
uint32_t num_queries);
struct anv_address genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer);
void genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
uint32_t total_scratch);

View file

@ -2626,11 +2626,22 @@ struct anv_device {
uint32_t protected_session_id;
/** Pool of ray query buffers used to communicate with the HW unit.
/** Shadow ray query BO
*
* The ray_query_bo only holds the current ray being traced. When using
* more than 1 ray query per thread, we cannot fit all the queries in
there, so we need another buffer to hold query data that is not
* currently being used by the HW for tracing, similar to a scratch space.
*
* The size of the shadow buffer depends on the number of queries per
* shader.
*
* We might need a buffer per queue family due to Wa_14022863161.
*/
struct anv_bo *ray_query_bos[2][16];
struct anv_bo *ray_query_shadow_bos[2][16];
/** Ray query buffer used to communicate with the HW unit.
*/
struct anv_bo *ray_query_bo[2];
struct anv_shader_internal *rt_trampoline;
struct anv_shader_internal *rt_trivial_return;
@ -4237,19 +4248,10 @@ struct anv_push_constants {
*/
uint32_t surfaces_base_offset;
/**
* Pointer to ray query stacks and their associated pairs of
* RT_DISPATCH_GLOBALS structures (see genX(setup_ray_query_globals))
/** Ray query globals
*
* The pair of globals for each query object are stored counting up from
* this address in units of BRW_RT_DISPATCH_GLOBALS_ALIGN:
*
* rq_globals = ray_query_globals + (rq * BRW_RT_DISPATCH_GLOBALS_ALIGN)
*
* The raytracing scratch area for each ray query is stored counting down
* from this address in units of brw_rt_ray_queries_stacks_stride(devinfo):
*
* rq_stacks_addr = ray_query_globals - (rq * ray_queries_stacks_stride)
* Pointer to a couple of RT_DISPATCH_GLOBALS structures (see
* genX(cmd_buffer_ray_query_globals))
*/
uint64_t ray_query_globals;
@ -4752,14 +4754,9 @@ struct anv_cmd_state {
unsigned current_hash_scale;
/**
* Number of ray query buffers allocated.
* A buffer used for spill/fill of ray queries.
*/
uint32_t num_ray_query_globals;
/**
* Current array of RT_DISPATCH_GLOBALS for ray queries.
*/
struct anv_address ray_query_globals;
struct anv_bo * ray_query_shadow_bo;
/** Pointer to the last emitted COMPUTE_WALKER.
*

View file

@ -37,7 +37,6 @@
#include "ds/intel_tracepoints.h"
#include "genX_mi_builder.h"
#include "nir_builder.h"
void
genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
@ -813,36 +812,45 @@ void genX(CmdDispatchIndirect)(
genX(cmd_buffer_dispatch_indirect)(cmd_buffer, addr, false);
}
void
genX(setup_ray_query_globals)(struct anv_device *device,
struct anv_bo* bo,
uint64_t offset,
void* map,
uint32_t num_queries)
struct anv_address
genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
{
#if GFX_VERx10 >= 125
assert(num_queries > 0);
uint64_t stack_stride = brw_rt_ray_queries_stacks_stride(device->info);
uint32_t ids_per_dss = brw_rt_ray_queries_stack_ids_per_dss(device->info);
for (uint32_t i = 0; i < num_queries; ++i)
for (uint32_t j = 0; j < 2; j++)
GENX(RT_DISPATCH_GLOBALS_pack)(NULL,
(char*) map +
i * BRW_RT_DISPATCH_GLOBALS_ALIGN +
j * align(4 * GENX(RT_DISPATCH_GLOBALS_length), 64),
&(struct GENX(RT_DISPATCH_GLOBALS)) {
.MemBaseAddress = (struct anv_address) {
/* The ray query HW computes offsets from the top of the
* buffer, so set the address at the end of the buffer.
*/
.bo = bo,
.offset = offset - i * stack_stride - j * stack_stride / 2,
},
.AsyncRTStackSize = BRW_RT_SIZEOF_RAY_QUERY / 64,
.NumDSSRTStacks = ids_per_dss,
.MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
.Flags = RT_DEPTH_TEST_LESS_EQUAL,
});
struct anv_device *device = cmd_buffer->device;
struct anv_state state =
anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
2 * align(BRW_RT_DISPATCH_GLOBALS_SIZE, 64),
BRW_RT_DISPATCH_GLOBALS_ALIGN);
uint32_t stack_ids_per_dss =
brw_rt_ray_queries_stack_ids_per_dss(device->info);
uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
for (uint32_t i = 0; i < 2; i++) {
const struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
.MemBaseAddress = (struct anv_address) {
/* The ray query HW computes offsets from the top of the buffer, so
* set the address at the end of the buffer.
*/
.bo = device->ray_query_bo[idx],
.offset = (i + 1) * (device->ray_query_bo[idx]->size / 2),
},
.AsyncRTStackSize = BRW_RT_SIZEOF_RAY_QUERY / 64,
.NumDSSRTStacks = stack_ids_per_dss,
.MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
.Flags = RT_DEPTH_TEST_LESS_EQUAL,
.ResumeShaderTable = (struct anv_address) {
.bo = cmd_buffer->state.ray_query_shadow_bo,
},
};
GENX(RT_DISPATCH_GLOBALS_pack)(
NULL,
state.map + i * align(4 * GENX(RT_DISPATCH_GLOBALS_length), 64),
&rtdg);
}
return anv_cmd_buffer_temporary_state_address(cmd_buffer, state);
#else
UNREACHABLE("Not supported");
#endif