diff --git a/.pick_status.json b/.pick_status.json
index ff7b8093ff9..dccbe8fdfb0 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -2914,7 +2914,7 @@
         "description": "Revert \"anv,brw: Allow multiple ray queries without spilling to a shadow stack\"",
         "nominated": true,
         "nomination_type": 2,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": "1f1de7ebd63cbe55972e01a0e7f5509e9e232917",
         "notes": null
diff --git a/src/intel/compiler/brw/brw_nir_lower_ray_queries.c b/src/intel/compiler/brw/brw_nir_lower_ray_queries.c
index 8bbec94dd5b..9a9ddbd0364 100644
--- a/src/intel/compiler/brw/brw_nir_lower_ray_queries.c
+++ b/src/intel/compiler/brw/brw_nir_lower_ray_queries.c
@@ -38,10 +38,8 @@ struct lowering_state {
    struct hash_table *queries;
    uint32_t n_queries;
 
+   struct brw_nir_rt_globals_defs globals;
    nir_def *rq_globals;
-
-   uint32_t num_dss_rt_stacks;
-   uint32_t sync_stacks_stride;
 };
 
 struct brw_ray_query {
@@ -52,6 +50,12 @@ struct brw_ray_query {
 
 #define SIZEOF_QUERY_STATE (sizeof(uint32_t))
 
+static bool
+need_spill_fill(struct lowering_state *state)
+{
+   return state->n_queries > 1;
+}
+
 /**
  * This pass converts opaque RayQuery structures from SPIRV into a vec3 where
  * the first 2 elements store a global address for the query and the third
@@ -94,8 +98,10 @@ create_internal_var(struct brw_ray_query *rq, struct lowering_state *state)
                        NULL);
 }
 
+
+
 static nir_def *
-get_ray_query_stack_index(nir_builder *b,
+get_ray_query_shadow_addr(nir_builder *b,
                           nir_deref_instr *deref,
                           struct lowering_state *state,
                           nir_deref_instr **out_state_deref)
@@ -110,17 +116,35 @@ get_ray_query_shadow_addr(nir_builder *b,
 
    struct brw_ray_query *rq = entry->data;
 
-   nir_def *query_idx = nir_imm_int(b, rq->id);
+   /* Base address in the shadow memory for this ray query variable. */
+   nir_def *base_addr =
+      nir_iadd_imm(b, state->globals.resume_sbt_addr,
+                   brw_rt_ray_queries_shadow_stack_size(state->devinfo) * rq->id);
+
+   bool spill_fill = need_spill_fill(state);
 
    *out_state_deref = nir_build_deref_var(b, rq->internal_var);
 
+   if (!spill_fill)
+      return NULL;
+
    /* Just emit code and let constant-folding go to town */
    nir_deref_instr **p = &path.path[1];
    for (; *p; p++) {
      if ((*p)->deref_type == nir_deref_type_array) {
         nir_def *index = (*p)->arr.index.ssa;
+
+        /* Walk the same array deref on the internal state variable. */
         *out_state_deref = nir_build_deref_array(b, *out_state_deref, index);
-        index = nir_amul_imm(b, index, MAX2(1, glsl_get_aoa_size((*p)->type)));
-        query_idx = nir_iadd(b, query_idx, index);
+
+        /* Advance the shadow address by the array index, scaled by the
+         * amount of shadow memory one element of this array covers.
+         */
+        uint64_t size = MAX2(1, glsl_get_aoa_size((*p)->type)) *
+           brw_rt_ray_queries_shadow_stack_size(state->devinfo);
+
+        nir_def *mul = nir_amul_imm(b, nir_i2i64(b, index), size);
+
+        base_addr = nir_iadd(b, base_addr, mul);
      } else {
         UNREACHABLE("Unsupported deref type");
      }
@@ -128,7 +152,28 @@
 
    nir_deref_path_finish(&path);
 
-   return query_idx;
+   /* Add the lane offset to the shadow memory address */
+   nir_def *lane_offset =
+      nir_imul_imm(
+         b,
+         nir_iadd(
+            b,
+            nir_imul(
+               b,
+               brw_load_btd_dss_id(b),
+               state->globals.num_dss_rt_stacks),
+            brw_nir_rt_sync_stack_id(b)),
+         BRW_RT_SIZEOF_SHADOW_RAY_QUERY);
+
+   /* Top/bottom 16 lanes each get their own stack area */
+   lane_offset = nir_bcsel(
+      b,
+      nir_ilt_imm(b, nir_load_subgroup_invocation(b), 16),
+      lane_offset,
+      nir_iadd_imm(b, lane_offset,
+                   brw_rt_ray_queries_shadow_stack_size(state->devinfo) / 2));
+
+   return nir_iadd(b, base_addr, nir_i2i64(b, lane_offset));
 }
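The shadow addressing above is easier to follow flattened into scalar arithmetic. A minimal host-side sketch (illustrative values only; on the GPU every input comes from system values or the RT globals loaded by this pass):

```c
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Host-side mirror of the NIR emitted by get_ray_query_shadow_addr().
 * Every value below is an illustrative stand-in. */
static uint64_t
shadow_query_addr(uint64_t resume_sbt_addr,   /* shadow BO base address */
                  uint64_t shadow_stack_size, /* per-query shadow area */
                  uint32_t rq_id,             /* flattened query index */
                  uint32_t dss_id,
                  uint32_t num_dss_rt_stacks,
                  uint32_t sync_stack_id,
                  uint32_t subgroup_invocation,
                  uint32_t sizeof_shadow_ray_query)
{
   /* Base of this query's shadow area. */
   uint64_t base_addr = resume_sbt_addr + shadow_stack_size * rq_id;

   /* One slot per (DSS, stack id) pair within the area. */
   uint64_t lane_offset =
      (uint64_t)(dss_id * num_dss_rt_stacks + sync_stack_id) *
      sizeof_shadow_ray_query;

   /* Top/bottom 16 lanes of a SIMD32 subgroup use separate halves. */
   if (subgroup_invocation >= 16)
      lane_offset += shadow_stack_size / 2;

   return base_addr + lane_offset;
}

int main(void)
{
   printf("0x%" PRIx64 "\n",
          shadow_query_addr(0x100000000ull, 1ull << 20, 2,
                            3, 64, 5, 17, 1024));
   return 0;
}
```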
@@ -164,6 +209,26 @@ update_trace_ctrl_level(nir_builder *b,
    }
 }
 
+static void
+fill_query(nir_builder *b,
+           nir_def *hw_stack_addr,
+           nir_def *shadow_stack_addr,
+           nir_def *ctrl)
+{
+   brw_nir_memcpy_global(b, hw_stack_addr, 64, shadow_stack_addr, 64,
+                         BRW_RT_SIZEOF_RAY_QUERY);
+}
+
+static void
+spill_query(nir_builder *b,
+            nir_def *hw_stack_addr,
+            nir_def *shadow_stack_addr)
+{
+   brw_nir_memcpy_global(b, shadow_stack_addr, 64, hw_stack_addr, 64,
+                         BRW_RT_SIZEOF_RAY_QUERY);
+}
+
+
 static void
 lower_ray_query_intrinsic(nir_builder *b,
                           nir_intrinsic_instr *intrin,
@@ -174,20 +239,12 @@ lower_ray_query_intrinsic(nir_builder *b,
 
    b->cursor = nir_instr_remove(&intrin->instr);
 
    nir_deref_instr *ctrl_level_deref;
-   nir_def *stack_index =
-      get_ray_query_stack_index(b, deref, state, &ctrl_level_deref);
-   nir_def *rq_globals_addr =
-      nir_iadd(b, state->rq_globals,
-               nir_i2i64(b, nir_amul_imm(b, stack_index,
-                                         BRW_RT_DISPATCH_GLOBALS_ALIGN)));
-   nir_def *stack_base_addr =
-      nir_isub(b, state->rq_globals,
-               nir_i2i64(b, nir_amul_imm(b, stack_index,
-                                         state->sync_stacks_stride)));
-   nir_def *stack_addr =
-      brw_nir_rt_sync_stack_addr(b, stack_base_addr,
-                                 state->num_dss_rt_stacks,
-                                 state->devinfo);
+   nir_def *shadow_stack_addr =
+      get_ray_query_shadow_addr(b, deref, state, &ctrl_level_deref);
+   nir_def *hw_stack_addr =
+      brw_nir_rt_sync_stack_addr(b, state->globals.base_mem_addr,
+                                 state->globals.num_dss_rt_stacks);
+   nir_def *stack_addr = shadow_stack_addr ?
+      shadow_stack_addr : hw_stack_addr;
 
    mesa_shader_stage stage = b->shader->info.stage;
    switch (intrin->intrinsic) {
@@ -256,12 +313,22 @@
        */
       brw_nir_rt_query_mark_done(b, stack_addr);
 
-      nir_trace_ray_intel(b, rq_globals_addr, level, ctrl, .synchronous = true);
+      if (shadow_stack_addr)
+         fill_query(b, hw_stack_addr, shadow_stack_addr, ctrl);
+
+      /* Do not use state->rq_globals; we want a uniform value for the
+       * tracing call.
+       */
+      nir_trace_ray_intel(b, nir_load_ray_query_global_intel(b),
+                          level, ctrl, .synchronous = true);
 
       struct brw_nir_rt_mem_hit_defs hit_in = {};
-      brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, stack_addr, false,
+      brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, hw_stack_addr, false,
                                         state->devinfo);
 
+      if (shadow_stack_addr)
+         spill_query(b, hw_stack_addr, shadow_stack_addr);
+
       update_trace_ctrl_level(b, ctrl_level_deref,
                               NULL, NULL,
                               nir_imm_int(b, GEN_RT_TRACE_RAY_CONTINUE),
@@ -480,12 +547,21 @@ lower_ray_query_impl(nir_function_impl *impl, struct lowering_state *state)
    nir_builder _b, *b = &_b;
    _b = nir_builder_at(nir_before_impl(impl));
 
-   state->rq_globals = nir_load_ray_query_global_intel(b);
+   nir_def *rq_globals_base = nir_load_ray_query_global_intel(b);
 
-   state->num_dss_rt_stacks =
-      brw_rt_ray_queries_stack_ids_per_dss(state->devinfo);
-   state->sync_stacks_stride =
-      brw_rt_ray_queries_stacks_stride(state->devinfo);
+   /* Use a different global for each group of 16 lanes (SIMD32 only). */
+   state->rq_globals = nir_bcsel(
+      b,
+      nir_iand(b,
+               nir_ige_imm(b, nir_load_subgroup_invocation(b), 16),
+               nir_ieq_imm(b, nir_load_subgroup_size(b), 32)),
+      nir_iadd_imm(
+         b, rq_globals_base,
+         align(4 * RT_DISPATCH_GLOBALS_length(state->devinfo), 64)),
+      rq_globals_base);
+
+   brw_nir_rt_load_globals_addr(b, &state->globals, state->rq_globals,
+                                state->devinfo);
 
    nir_foreach_block_safe(block, impl) {
       nir_foreach_instr_safe(instr, block) {
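When more than one query is live, each rayQueryProceedEXT() above brackets the synchronous trace with a fill (shadow to HW stack) and a spill (HW stack back to shadow). A compact host-side sketch of that protocol, with a made-up stack size standing in for BRW_RT_SIZEOF_RAY_QUERY; the real copies are brw_nir_memcpy_global() calls emitted in NIR:

```c
#include <stdint.h>
#include <string.h>

#define SIZEOF_RAY_QUERY 256 /* hypothetical stand-in for BRW_RT_SIZEOF_RAY_QUERY */

/* The single HW stack slot the ray-tracing unit operates on. */
static uint8_t hw_stack[SIZEOF_RAY_QUERY];

/* shadow -> HW, before the trace */
static void fill_query(uint8_t *hw, const uint8_t *shadow)
{
   memcpy(hw, shadow, SIZEOF_RAY_QUERY);
}

/* HW -> shadow, after the trace */
static void spill_query(uint8_t *shadow, const uint8_t *hw)
{
   memcpy(shadow, hw, SIZEOF_RAY_QUERY);
}

/* Shape of the lowered rayQueryProceedEXT(): only the HW slot is
 * visible to the RT unit, so the trace is bracketed by a fill and a
 * spill, letting several queries share one HW stack. */
static void proceed(uint8_t *shadow_slot)
{
   fill_query(hw_stack, shadow_slot);
   /* nir_trace_ray_intel(..., .synchronous = true) runs here. */
   spill_query(shadow_slot, hw_stack);
}

int main(void)
{
   uint8_t shadow[SIZEOF_RAY_QUERY] = { 0 };
   proceed(shadow);
   return 0;
}
```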
diff --git a/src/intel/compiler/brw/brw_nir_rt_builder.h b/src/intel/compiler/brw/brw_nir_rt_builder.h
index 2062b24fc7a..d66fa897e4c 100644
--- a/src/intel/compiler/brw/brw_nir_rt_builder.h
+++ b/src/intel/compiler/brw/brw_nir_rt_builder.h
@@ -178,8 +178,7 @@ brw_nir_rt_sw_hotzone_addr(nir_builder *b,
 static inline nir_def *
 brw_nir_rt_sync_stack_addr(nir_builder *b,
                            nir_def *base_mem_addr,
-                           uint32_t num_dss_rt_stacks,
-                           const struct intel_device_info *devinfo)
+                           nir_def *num_dss_rt_stacks)
 {
    /* Bspec 47547 (Xe) and 56936 (Xe2+) say:
     * For Ray queries (Synchronous Ray Tracing), the formula is similar but
@@ -196,29 +195,12 @@
     * NUM_SYNC_STACKID_PER_DSS instead.
     */
    nir_def *offset32 =
-      nir_imul_imm(b,
-                   nir_iadd(b,
-                            nir_imul_imm(b, brw_load_btd_dss_id(b),
-                                         num_dss_rt_stacks),
-                            nir_iadd_imm(b, brw_nir_rt_sync_stack_id(b), 1)),
-                   BRW_RT_SIZEOF_RAY_QUERY);
-
-   /* StackID offset for the bottom 16 lanes in SIMD32, this must match the
-    * offset of the second base address provided by the driver through the
-    * pair of ray query RTDispatchGlobals
-    */
-   uint32_t simd32_stack_offset =
-      num_dss_rt_stacks * BRW_RT_SIZEOF_RAY_QUERY *
-      intel_device_info_dual_subslice_id_bound(devinfo);
-
-   offset32 =
-      nir_bcsel(b,
-                nir_iand(b,
-                         nir_ige_imm(b, nir_load_subgroup_invocation(b), 16),
-                         nir_ieq_imm(b, nir_load_subgroup_size(b), 32)),
-                nir_iadd_imm(b, offset32, simd32_stack_offset),
-                offset32);
-
+      nir_imul(b,
+               nir_iadd(b,
+                        nir_imul(b, brw_load_btd_dss_id(b),
+                                 num_dss_rt_stacks),
+                        nir_iadd_imm(b, brw_nir_rt_sync_stack_id(b), 1)),
+               nir_imm_int(b, BRW_RT_SIZEOF_RAY_QUERY));
    return nir_isub(b, base_mem_addr, nir_u2u64(b, offset32));
 }
 
@@ -318,6 +300,7 @@ struct brw_nir_rt_globals_defs {
    nir_def *launch_size;
    nir_def *call_sbt_addr;
    nir_def *call_sbt_stride;
+   nir_def *resume_sbt_addr;
 };
 
 static inline void
@@ -385,6 +368,8 @@ brw_nir_rt_load_globals_addr(nir_builder *b,
       defs->call_sbt_stride =
          nir_iand_imm(b, nir_unpack_32_2x16_split_x(b, nir_channel(b, data, 2)),
                       0x1fff);
+      defs->resume_sbt_addr =
+         nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 3));
    } else {
       defs->call_sbt_addr =
          nir_pack_64_2x32_split(b, nir_channel(b, data, 0),
                                 nir_extract_i16(b, nir_channel(b, data, 1),
                                                 nir_imm_int(b, 0)));
       defs->call_sbt_stride =
          nir_unpack_32_2x16_split_y(b, nir_channel(b, data, 1));
+
+      defs->resume_sbt_addr =
+         nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 2));
    }
 }
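With the SIMD32 bcsel gone from this helper, the per-lane address reduces to the Bspec formula: stacks grow downward from MemBaseAddress, one slot per (DSS, StackID) pair. A host-side sketch with illustrative numbers (the second 16-lane half is now handled by the second RT_DISPATCH_GLOBALS rather than in this function):

```c
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Host-side rendering of brw_nir_rt_sync_stack_addr() above. Stacks
 * grow downward from MemBaseAddress; all numbers are illustrative. */
static uint64_t
sync_stack_addr(uint64_t base_mem_addr, uint32_t dss_id,
                uint32_t num_dss_rt_stacks, uint32_t sync_stack_id,
                uint32_t sizeof_ray_query)
{
   uint32_t offset32 =
      (dss_id * num_dss_rt_stacks + (sync_stack_id + 1)) * sizeof_ray_query;
   return base_mem_addr - offset32;
}

int main(void)
{
   /* e.g. DSS 3, 64 stacks per DSS, stack id 5, 4KiB per query */
   printf("0x%" PRIx64 "\n", sync_stack_addr(0x200000000ull, 3, 64, 5, 4096));
   return 0;
}
```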
diff --git a/src/intel/compiler/brw/brw_rt.h b/src/intel/compiler/brw/brw_rt.h
index 4255db0ff2b..3b4fdf4ec92 100644
--- a/src/intel/compiler/brw/brw_rt.h
+++ b/src/intel/compiler/brw/brw_rt.h
@@ -36,7 +36,7 @@ extern "C" {
 #define BRW_RT_SBT_HANDLE_SIZE 32
 
 /** RT_DISPATCH_GLOBALS size (see gen_rt.xml) */
-#define BRW_RT_DISPATCH_GLOBALS_SIZE 72
+#define BRW_RT_DISPATCH_GLOBALS_SIZE 80
 
 /** RT_DISPATCH_GLOBALS alignment
  *
@@ -191,6 +191,10 @@ struct brw_rt_raygen_trampoline_params {
    (BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS + \
    (BRW_RT_MAX_BVH_LEVELS % 2 ? 32 : 0))
 
+#define BRW_RT_SIZEOF_SHADOW_RAY_QUERY \
+   (BRW_RT_SIZEOF_HIT_INFO * 2 + \
+    (BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS)
+
 #define BRW_RT_SIZEOF_HW_STACK \
    (BRW_RT_SIZEOF_HIT_INFO * 2 + \
    BRW_RT_SIZEOF_RAY * BRW_RT_MAX_BVH_LEVELS + \
@@ -277,15 +281,25 @@ brw_rt_ray_queries_stack_ids_per_dss(const struct intel_device_info *devinfo)
 }
 
 static inline uint32_t
-brw_rt_ray_queries_stacks_offset(uint32_t num_queries)
+brw_rt_ray_queries_shadow_stack_size(const struct intel_device_info *devinfo)
 {
-   return BRW_RT_DISPATCH_GLOBALS_ALIGN << util_logbase2_ceil(num_queries);
+   /* The maximum slice/subslice/EU ID can be computed from
+    * max_scratch_ids, which accounts for all the threads.
+    */
+   uint32_t max_eu_id = devinfo->max_scratch_ids[MESA_SHADER_COMPUTE];
+   uint32_t max_simd_size = 32;
+   return max_eu_id * max_simd_size * BRW_RT_SIZEOF_SHADOW_RAY_QUERY;
 }
 
 static inline uint32_t
-brw_rt_ray_queries_stacks_stride(const struct intel_device_info *devinfo)
+brw_rt_ray_queries_shadow_stacks_size(const struct intel_device_info *devinfo,
+                                      uint32_t ray_queries)
 {
-   return align(brw_rt_ray_queries_hw_stacks_size(devinfo), 4096);
+   /* Don't bother with a shadow stack if we only have a single query; we
+    * can write directly to the HW buffer.
+    */
+   return (ray_queries > 1 ? ray_queries : 0) *
+          brw_rt_ray_queries_shadow_stack_size(devinfo) +
+          ray_queries * 4; /* Ctrl + Level data */
 }
 
 #ifdef __cplusplus
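A worked example of the sizing above, with made-up device limits (real values come from devinfo). Note how a single query costs only the 4 bytes of Ctrl/Level state:

```c
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   /* Made-up stand-ins for the devinfo-derived values above. */
   const uint32_t sizeof_shadow_ray_query = 1536;
   const uint32_t max_eu_id = 1024; /* max_scratch_ids[MESA_SHADER_COMPUTE] */
   const uint32_t max_simd_size = 32;
   const uint64_t per_query =
      (uint64_t)max_eu_id * max_simd_size * sizeof_shadow_ray_query;

   for (uint32_t n = 1; n <= 4; n++) {
      /* A single query writes the HW stack directly: no shadow area,
       * just the 4 bytes of Ctrl + Level state per query. */
      uint64_t size = (n > 1 ? n : 0) * per_query + n * 4;
      printf("%" PRIu32 " queries -> %" PRIu64 " bytes\n", n, size);
   }
   return 0;
}
```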
diff --git a/src/intel/genxml/gen125_rt.xml b/src/intel/genxml/gen125_rt.xml
index 6c4298905ae..b23134ae038 100644
--- a/src/intel/genxml/gen125_rt.xml
+++ b/src/intel/genxml/gen125_rt.xml
@@ -28,6 +28,7 @@
+
diff --git a/src/intel/genxml/gen300_rt.xml b/src/intel/genxml/gen300_rt.xml
index 5861f3eacab..7b2bcff39cb 100644
--- a/src/intel/genxml/gen300_rt.xml
+++ b/src/intel/genxml/gen300_rt.xml
@@ -36,5 +36,6 @@
+
+
diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c
index 77a90224c4d..a7d817ee88b 100644
--- a/src/intel/vulkan/anv_cmd_buffer.c
+++ b/src/intel/vulkan/anv_cmd_buffer.c
@@ -442,84 +442,58 @@ anv_cmd_buffer_set_rt_query_buffer(struct anv_cmd_buffer *cmd_buffer,
                                    uint32_t ray_queries,
                                    VkShaderStageFlags stages)
 {
-   if (ray_queries > cmd_buffer->state.num_ray_query_globals) {
-      struct anv_device *device = cmd_buffer->device;
-      uint8_t wa_idx = anv_get_ray_query_bo_index(cmd_buffer);
+   struct anv_device *device = cmd_buffer->device;
+   uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
 
-      unsigned bucket = util_logbase2_ceil(ray_queries);
-      assert(bucket < ARRAY_SIZE(device->ray_query_bos[0]));
+   uint64_t ray_shadow_size =
+      align64(brw_rt_ray_queries_shadow_stacks_size(device->info, ray_queries),
+              4096);
+   if (ray_shadow_size > 0 &&
+       (!cmd_buffer->state.ray_query_shadow_bo ||
+        cmd_buffer->state.ray_query_shadow_bo->size < ray_shadow_size)) {
+      unsigned shadow_size_log2 = MAX2(util_logbase2_ceil(ray_shadow_size), 16);
+      unsigned bucket = shadow_size_log2 - 16;
+      assert(bucket < ARRAY_SIZE(device->ray_query_shadow_bos[0]));
 
-      uint64_t offset = brw_rt_ray_queries_stacks_offset(1 << bucket);
-      uint64_t stride = brw_rt_ray_queries_stacks_stride(device->info);
-
-      struct anv_bo *bo = p_atomic_read(&device->ray_query_bos[wa_idx][bucket]);
+      struct anv_bo *bo = p_atomic_read(&device->ray_query_shadow_bos[idx][bucket]);
       if (bo == NULL) {
          struct anv_bo *new_bo;
-         VkResult result =
-            anv_device_alloc_bo(device, "RT queries scratch",
-                                offset + (stride << bucket), /* size */
-                                ANV_BO_ALLOC_INTERNAL |
-                                ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE, /* alloc_flags */
-                                0, /* explicit_address */
-                                &new_bo);
-
+         VkResult result = anv_device_alloc_bo(device, "RT queries shadow",
+                                               1 << shadow_size_log2,
+                                               ANV_BO_ALLOC_INTERNAL, /* alloc_flags */
+                                               0, /* explicit_address */
+                                               &new_bo);
          ANV_DMR_BO_ALLOC(&cmd_buffer->vk.base, new_bo, result);
          if (result != VK_SUCCESS) {
            anv_batch_set_error(&cmd_buffer->batch, result);
            return;
         }
 
-         /* Map extra space we added at end of the buffer, we will write the
-          * array of RT_DISPATCH_GLOBALS into it so we can use only a single
-          * memory address in our shaders for all stacks and globals
-          */
-         void *map;
-         result = anv_device_map_bo(device, new_bo, stride << bucket,
-                                    offset, NULL, &map);
-
-         if (result != VK_SUCCESS) {
-            ANV_DMR_BO_FREE(&cmd_buffer->vk.base, new_bo);
-            anv_device_release_bo(device, new_bo);
-            anv_batch_set_error(&cmd_buffer->batch, result);
-            return;
-         }
-
-         anv_genX(device->info, setup_ray_query_globals)(device,
-                                                         new_bo,
-                                                         stride << bucket,
-                                                         map,
-                                                         1 << bucket);
-
-#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
-         if (device->physical->memory.need_flush)
-            util_flush_inval_range(map, offset);
-#endif
-
-         anv_device_unmap_bo(device, new_bo, map, offset, false);
-
-         bo = p_atomic_cmpxchg(&device->ray_query_bos[wa_idx][bucket], NULL, new_bo);
+         bo = p_atomic_cmpxchg(&device->ray_query_shadow_bos[idx][bucket], NULL, new_bo);
          if (bo != NULL) {
-            ANV_DMR_BO_FREE(&cmd_buffer->vk.base, new_bo);
+            ANV_DMR_BO_FREE(&device->vk.base, new_bo);
            anv_device_release_bo(device, new_bo);
         } else {
            bo = new_bo;
         }
      }
+      cmd_buffer->state.ray_query_shadow_bo = bo;
 
-      /* Add the HW buffer to the list of BO used. */
-      anv_reloc_list_add_bo(cmd_buffer->batch.relocs, bo);
-
-      cmd_buffer->state.ray_query_globals = (struct anv_address) {
-         .bo = bo,
-         .offset = (int64_t) (stride << bucket),
-      };
-
-      cmd_buffer->state.num_ray_query_globals = 1 << bucket;
+      /* Add the ray query buffers to the batch list. */
+      anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+                            cmd_buffer->state.ray_query_shadow_bo);
    }
 
-   /* Update the push constants & mark them dirty. */
+   /* Add the HW buffer to the list of BOs used. */
+   assert(device->ray_query_bo[idx]);
+   anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+                         device->ray_query_bo[idx]);
+
+   /* Fill the push constants & mark them dirty. */
+   struct anv_address ray_query_globals_addr =
+      anv_genX(device->info, cmd_buffer_ray_query_globals)(cmd_buffer);
    pipeline_state->push_constants.ray_query_globals =
-      anv_address_physical(cmd_buffer->state.ray_query_globals);
+      anv_address_physical(ray_query_globals_addr);
    cmd_buffer->state.push_constants_dirty |= stages;
    pipeline_state->push_constants_data_dirty = true;
 }
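The bucketing above keeps one shadow BO per power-of-two size class, starting at 64KiB. A small sketch of the same computation (util_logbase2_ceil() replaced by a local stand-in):

```c
#include <stdint.h>
#include <stdio.h>

/* Local stand-in for util_logbase2_ceil(). */
static unsigned log2_ceil(uint64_t v)
{
   unsigned l = 0;
   while ((1ull << l) < v)
      l++;
   return l;
}

/* Shadow BOs live in power-of-two buckets starting at 64KiB (1 << 16):
 * bucket 0 is 64KiB, bucket 1 is 128KiB, and so on. */
int main(void)
{
   const uint64_t sizes[] = { 4096, 100000, 5000000 };
   for (unsigned i = 0; i < 3; i++) {
      unsigned log2 = log2_ceil(sizes[i]);
      if (log2 < 16)
         log2 = 16; /* MAX2(util_logbase2_ceil(size), 16) */
      printf("size %llu -> bucket %u (BO of %llu bytes)\n",
             (unsigned long long)sizes[i], log2 - 16,
             (unsigned long long)(1ull << log2));
   }
   return 0;
}
```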
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 86bb309ec2f..6cb87a9ced8 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -341,16 +341,22 @@ VkResult anv_CreateDevice(
    ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
    VkResult result;
    struct anv_device *device;
+   bool device_has_compute_queue = false;
 
    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO);
 
    /* Check requested queues and fail if we are requested to create any
    * queues with flags we don't support.
    */
-   for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++)
+   for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
      if (pCreateInfo->pQueueCreateInfos[i].flags & ~VK_DEVICE_QUEUE_CREATE_PROTECTED_BIT)
         return vk_error(physical_device, VK_ERROR_INITIALIZATION_FAILED);
 
+      const struct anv_queue_family *family =
+         &physical_device->queue.families[pCreateInfo->pQueueCreateInfos[i].queueFamilyIndex];
+      device_has_compute_queue |= family->engine_class == INTEL_ENGINE_CLASS_COMPUTE;
+   }
+
    device = vk_zalloc2(&physical_device->instance->vk.alloc, pAllocator,
                        sizeof(*device), 8,
                        VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
@@ -780,9 +786,36 @@ VkResult anv_CreateDevice(
                        device->workaround_bo->size,
                        INTEL_DEBUG_BLOCK_TYPE_FRAME);
 
+   if (device->vk.enabled_extensions.KHR_ray_query) {
+      uint32_t ray_queries_size =
+         align(brw_rt_ray_queries_hw_stacks_size(device->info), 4096);
+
+      result = anv_device_alloc_bo(device, "ray queries",
+                                   ray_queries_size,
+                                   ANV_BO_ALLOC_INTERNAL,
+                                   0 /* explicit_address */,
+                                   &device->ray_query_bo[0]);
+      ANV_DMR_BO_ALLOC(&device->vk.base, device->ray_query_bo[0], result);
+      if (result != VK_SUCCESS)
+         goto fail_alloc_device_bo;
+
+      /* We need a separate ray query BO for the CCS engine with
+       * Wa_14022863161.
+       */
+      if (intel_needs_workaround(device->isl_dev.info, 14022863161) &&
+          device_has_compute_queue) {
+         result = anv_device_alloc_bo(device, "ray queries",
+                                      ray_queries_size,
+                                      ANV_BO_ALLOC_INTERNAL,
+                                      0 /* explicit_address */,
+                                      &device->ray_query_bo[1]);
+         ANV_DMR_BO_ALLOC(&device->vk.base, device->ray_query_bo[1], result);
+         if (result != VK_SUCCESS)
+            goto fail_ray_query_bo;
+      }
+   }
+
    result = anv_device_init_trivial_batch(device);
    if (result != VK_SUCCESS)
-      goto fail_alloc_device_bo;
+      goto fail_ray_query_bo;
 
    /* Emit the CPS states before running the initialization batch as those
    * structures are referenced.
    */
@@ -1040,6 +1073,13 @@ VkResult anv_CreateDevice(
 fail_trivial_batch:
    ANV_DMR_BO_FREE(&device->vk.base, device->trivial_batch_bo);
    anv_device_release_bo(device, device->trivial_batch_bo);
+ fail_ray_query_bo:
+   for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bo); i++) {
+      if (device->ray_query_bo[i]) {
+         ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bo[i]);
+         anv_device_release_bo(device, device->ray_query_bo[i]);
+      }
+   }
 fail_alloc_device_bo:
    if (device->mem_fence_bo) {
       ANV_DMR_BO_FREE(&device->vk.base, device->mem_fence_bo);
@@ -1191,13 +1231,17 @@ void anv_DestroyDevice(
    anv_scratch_pool_finish(device, &device->protected_scratch_pool);
 
    if (device->vk.enabled_extensions.KHR_ray_query) {
-      for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bos); i++) {
-         for (unsigned j = 0; j < ARRAY_SIZE(device->ray_query_bos[0]); j++) {
-            if (device->ray_query_bos[i][j] != NULL) {
-               ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bos[i][j]);
-               anv_device_release_bo(device, device->ray_query_bos[i][j]);
+      for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bo); i++) {
+         for (unsigned j = 0; j < ARRAY_SIZE(device->ray_query_shadow_bos[0]); j++) {
+            if (device->ray_query_shadow_bos[i][j] != NULL) {
+               ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_shadow_bos[i][j]);
+               anv_device_release_bo(device, device->ray_query_shadow_bos[i][j]);
            }
         }
+         if (device->ray_query_bo[i]) {
+            ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bo[i]);
+            anv_device_release_bo(device, device->ray_query_bo[i]);
+         }
      }
    }
    ANV_DMR_BO_FREE(&device->vk.base, device->workaround_bo);
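anv_get_ray_query_bo_index() is not part of this diff; presumably it picks between the two BOs along these lines (hypothetical sketch, not the driver's code):

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical sketch of the selection anv_get_ray_query_bo_index()
 * presumably makes (the helper itself is not in this diff): with
 * Wa_14022863161, work submitted on a CCS (compute) engine uses the
 * second ray query BO so it never shares HW stacks with RCS work. */
static uint8_t
ray_query_bo_index(bool engine_is_compute, bool needs_wa_14022863161)
{
   return (needs_wa_14022863161 && engine_is_compute) ? 1 : 0;
}

int main(void)
{
   printf("CCS -> %u, RCS -> %u\n",
          ray_query_bo_index(true, true), ray_query_bo_index(false, true));
   return 0;
}
```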
diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
index 1dd77b9c69e..5234823b94a 100644
--- a/src/intel/vulkan/anv_genX.h
+++ b/src/intel/vulkan/anv_genX.h
@@ -226,11 +226,7 @@ void genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
 
 void genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer);
 
-void genX(setup_ray_query_globals)(struct anv_device *device,
-                                   struct anv_bo* bo,
-                                   uint64_t offset,
-                                   void* map,
-                                   uint32_t num_queries);
+struct anv_address genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer);
 
 void genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
                                        uint32_t total_scratch);
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 3a8a79ea6d2..1f481ec0fa2 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -2625,11 +2625,22 @@ struct anv_device {
 
    uint32_t protected_session_id;
 
-   /** Pool of ray query buffers used to communicated with HW unit.
+   /** Shadow ray query BO
+    *
+    * The ray_query_bo only holds the ray currently being traced. When using
+    * more than 1 ray query per thread, we cannot fit all the queries in
+    * there, so we need another buffer to hold query data that is not
+    * currently being used by the HW for tracing, similar to a scratch space.
+    *
+    * The size of the shadow buffer depends on the number of queries per
+    * shader.
     *
     * We might need a buffer per queue family due to Wa_14022863161.
     */
-   struct anv_bo *ray_query_bos[2][16];
+   struct anv_bo *ray_query_shadow_bos[2][16];
+
+   /** Ray query buffer used to communicate with the HW unit. */
+   struct anv_bo *ray_query_bo[2];
 
    struct anv_shader_internal *rt_trampoline;
    struct anv_shader_internal *rt_trivial_return;
@@ -4236,19 +4247,10 @@ struct anv_push_constants {
     */
    uint32_t surfaces_base_offset;
 
-   /**
-    * Pointer to ray query stacks and their associated pairs of
-    * RT_DISPATCH_GLOBALS structures (see genX(setup_ray_query_globals))
+   /** Ray query globals
-    *
-    * The pair of globals for each query object are stored counting up from
-    * this address in units of BRW_RT_DISPATCH_GLOBALS_ALIGN:
-    *
-    *    rq_globals = ray_query_globals + (rq * BRW_RT_DISPATCH_GLOBALS_ALIGN)
-    *
-    * The raytracing scratch area for each ray query is stored counting down
-    * from this address in units of brw_rt_ray_queries_stacks_stride(devinfo):
-    *
-    *    rq_stacks_addr = ray_query_globals - (rq * ray_queries_stacks_stride)
+    *
+    * Pointer to a pair of RT_DISPATCH_GLOBALS structures (see
+    * genX(cmd_buffer_ray_query_globals))
     */
    uint64_t ray_query_globals;
 
@@ -4751,14 +4753,9 @@ struct anv_cmd_state {
    unsigned current_hash_scale;
 
    /**
-    * Number of ray query buffers allocated.
+    * A buffer used for spill/fill of ray queries.
     */
-   uint32_t num_ray_query_globals;
-
-   /**
-    * Current array of RT_DISPATCH_GLOBALS for ray queries.
-    */
-   struct anv_address ray_query_globals;
+   struct anv_bo *ray_query_shadow_bo;
 
    /** Pointer to the last emitted COMPUTE_WALKER.
    *
diff --git a/src/intel/vulkan/genX_cmd_compute.c b/src/intel/vulkan/genX_cmd_compute.c
index b4f50ef51a1..64ada047dc4 100644
--- a/src/intel/vulkan/genX_cmd_compute.c
+++ b/src/intel/vulkan/genX_cmd_compute.c
@@ -37,7 +37,6 @@
 
 #include "ds/intel_tracepoints.h"
 #include "genX_mi_builder.h"
-#include "nir_builder.h"
 
 void
 genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
@@ -812,36 +811,45 @@ void genX(CmdDispatchIndirect)(
    genX(cmd_buffer_dispatch_indirect)(cmd_buffer, addr, false);
 }
 
-void
-genX(setup_ray_query_globals)(struct anv_device *device,
-                              struct anv_bo* bo,
-                              uint64_t offset,
-                              void* map,
-                              uint32_t num_queries)
+struct anv_address
+genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
 {
 #if GFX_VERx10 >= 125
-   assert(num_queries > 0);
-   uint64_t stack_stride = brw_rt_ray_queries_stacks_stride(device->info);
-   uint32_t ids_per_dss = brw_rt_ray_queries_stack_ids_per_dss(device->info);
-   for (uint32_t i = 0; i < num_queries; ++i)
-      for (uint32_t j = 0; j < 2; j++)
-         GENX(RT_DISPATCH_GLOBALS_pack)(NULL,
-                                        (char*) map +
-                                        i * BRW_RT_DISPATCH_GLOBALS_ALIGN +
-                                        j * align(4 * GENX(RT_DISPATCH_GLOBALS_length), 64),
-                                        &(struct GENX(RT_DISPATCH_GLOBALS)) {
-            .MemBaseAddress = (struct anv_address) {
-               /* The ray query HW computes offsets from the top of the
-                * buffer, so set the address at the end of the buffer.
-                */
-               .bo = bo,
-               .offset = offset - i * stack_stride - j * stack_stride / 2,
-            },
-            .AsyncRTStackSize = BRW_RT_SIZEOF_RAY_QUERY / 64,
-            .NumDSSRTStacks = ids_per_dss,
-            .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
-            .Flags = RT_DEPTH_TEST_LESS_EQUAL,
-         });
+   struct anv_device *device = cmd_buffer->device;
+
+   struct anv_state state =
+      anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
+                                           2 * align(BRW_RT_DISPATCH_GLOBALS_SIZE, 64),
+                                           BRW_RT_DISPATCH_GLOBALS_ALIGN);
+   uint32_t stack_ids_per_dss =
+      brw_rt_ray_queries_stack_ids_per_dss(device->info);
+
+   uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
+
+   for (uint32_t i = 0; i < 2; i++) {
+      const struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
+         .MemBaseAddress = (struct anv_address) {
+            /* The ray query HW computes offsets from the top of the buffer,
+             * so set the address at the end of the buffer.
+             */
+            .bo = device->ray_query_bo[idx],
+            .offset = (i + 1) * (device->ray_query_bo[idx]->size / 2),
+         },
+         .AsyncRTStackSize = BRW_RT_SIZEOF_RAY_QUERY / 64,
+         .NumDSSRTStacks = stack_ids_per_dss,
+         .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
+         .Flags = RT_DEPTH_TEST_LESS_EQUAL,
+         .ResumeShaderTable = (struct anv_address) {
+            .bo = cmd_buffer->state.ray_query_shadow_bo,
+         },
+      };
+      GENX(RT_DISPATCH_GLOBALS_pack)(
+         NULL,
+         state.map + i * align(4 * GENX(RT_DISPATCH_GLOBALS_length), 64),
+         &rtdg);
+   }
+
+   return anv_cmd_buffer_temporary_state_address(cmd_buffer, state);
 #else
    UNREACHABLE("Not supported");
 #endif
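Putting the pieces together, each RT_DISPATCH_GLOBALS of the pair sits 64-byte aligned in the temporary state, and its MemBaseAddress points at the middle or the end of the ray query BO (stacks grow down, one half per 16-lane group). A sketch of the offsets with a hypothetical BO size:

```c
#include <stdint.h>
#include <stdio.h>

#define RT_DISPATCH_GLOBALS_SIZE 80 /* BRW_RT_DISPATCH_GLOBALS_SIZE above */

int main(void)
{
   const uint64_t ray_query_bo_size = 64ull << 20; /* hypothetical 64MiB BO */

   for (uint32_t i = 0; i < 2; i++) {
      /* Each RT_DISPATCH_GLOBALS is packed at a 64-byte aligned offset
       * in the temporary state allocation. */
      uint64_t globals_offset =
         i * ((RT_DISPATCH_GLOBALS_SIZE + 63) & ~63u);
      /* MemBaseAddress points at the middle (i == 0, bottom 16 lanes)
       * or the end (i == 1, top 16 lanes); stacks grow down from it. */
      uint64_t mem_base_offset = (i + 1) * (ray_query_bo_size / 2);
      printf("globals[%u] at +%llu, MemBaseAddress = BO + %llu\n",
             (unsigned)i, (unsigned long long)globals_offset,
             (unsigned long long)mem_base_offset);
   }
   return 0;
}
```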