diff --git a/src/intel/compiler/brw/brw_nir_lower_ray_queries.c b/src/intel/compiler/brw/brw_nir_lower_ray_queries.c
index 9a9ddbd0364..85bc0f24bed 100644
--- a/src/intel/compiler/brw/brw_nir_lower_ray_queries.c
+++ b/src/intel/compiler/brw/brw_nir_lower_ray_queries.c
@@ -38,8 +38,10 @@ struct lowering_state {
    struct hash_table *queries;
    uint32_t n_queries;
 
-   struct brw_nir_rt_globals_defs globals;
    nir_def *rq_globals;
+
+   uint32_t num_dss_rt_stacks;
+   uint32_t sync_stacks_stride;
 };
 
 struct brw_ray_query {
@@ -50,12 +52,6 @@ struct brw_ray_query {
 
 #define SIZEOF_QUERY_STATE (sizeof(uint32_t))
 
-static bool
-need_spill_fill(struct lowering_state *state)
-{
-   return state->n_queries > 1;
-}
-
 /**
  * This pass converts opaque RayQuery structures from SPIRV into a vec3 where
  * the first 2 elements store a global address for the query and the third
@@ -98,10 +94,8 @@ create_internal_var(struct brw_ray_query *rq, struct lowering_state *state)
                                NULL);
 }
 
-
-
 static nir_def *
-get_ray_query_shadow_addr(nir_builder *b,
+get_ray_query_stack_index(nir_builder *b,
                           nir_deref_instr *deref,
                           struct lowering_state *state,
                           nir_deref_instr **out_state_deref)
@@ -116,35 +110,17 @@ get_ray_query_shadow_addr(nir_builder *b,
    struct brw_ray_query *rq = entry->data;
 
-   /* Base address in the shadow memory of the variable associated with this
-    * ray query variable.
-    */
-   nir_def *base_addr =
-      nir_iadd_imm(b, state->globals.resume_sbt_addr,
-                   brw_rt_ray_queries_shadow_stack_size(state->devinfo) * rq->id);
-
-   bool spill_fill = need_spill_fill(state);
+   nir_def *query_idx = nir_imm_int(b, rq->id);
 
    *out_state_deref = nir_build_deref_var(b, rq->internal_var);
 
-   if (!spill_fill)
-      return NULL;
-
    /* Just emit code and let constant-folding go to town */
    nir_deref_instr **p = &path.path[1];
    for (; *p; p++) {
       if ((*p)->deref_type == nir_deref_type_array) {
          nir_def *index = (*p)->arr.index.ssa;
-
-         /**/
          *out_state_deref = nir_build_deref_array(b, *out_state_deref, index);
-
-         /**/
-         uint64_t size = MAX2(1, glsl_get_aoa_size((*p)->type)) *
-                         brw_rt_ray_queries_shadow_stack_size(state->devinfo);
-
-         nir_def *mul = nir_amul_imm(b, nir_i2i64(b, index), size);
-
-         base_addr = nir_iadd(b, base_addr, mul);
+         index = nir_amul_imm(b, index, MAX2(1, glsl_get_aoa_size((*p)->type)));
+         query_idx = nir_iadd(b, query_idx, index);
       } else {
          UNREACHABLE("Unsupported deref type");
       }
@@ -152,28 +128,7 @@ get_ray_query_shadow_addr(nir_builder *b,
 
    nir_deref_path_finish(&path);
 
-   /* Add the lane offset to the shadow memory address */
-   nir_def *lane_offset =
-      nir_imul_imm(
-         b,
-         nir_iadd(
-            b,
-            nir_imul(
-               b,
-               brw_load_btd_dss_id(b),
-               state->globals.num_dss_rt_stacks),
-            brw_nir_rt_sync_stack_id(b)),
-         BRW_RT_SIZEOF_SHADOW_RAY_QUERY);
-
-   /* Top/bottom 16 lanes each get their own stack area */
-   lane_offset = nir_bcsel(
-      b,
-      nir_ilt_imm(b, nir_load_subgroup_invocation(b), 16),
-      lane_offset,
-      nir_iadd_imm(b, lane_offset,
-                   brw_rt_ray_queries_shadow_stack_size(state->devinfo) / 2));
-
-   return nir_iadd(b, base_addr, nir_i2i64(b, lane_offset));
+   return query_idx;
 }
 
 static void
@@ -209,26 +164,6 @@ update_trace_ctrl_level(nir_builder *b,
    }
 }
 
-static void
-fill_query(nir_builder *b,
-           nir_def *hw_stack_addr,
-           nir_def *shadow_stack_addr,
-           nir_def *ctrl)
-{
-   brw_nir_memcpy_global(b, hw_stack_addr, 64, shadow_stack_addr, 64,
-                         BRW_RT_SIZEOF_RAY_QUERY);
-}
-
-static void
-spill_query(nir_builder *b,
-            nir_def *hw_stack_addr,
-            nir_def *shadow_stack_addr)
-{
-   brw_nir_memcpy_global(b, shadow_stack_addr, 64, hw_stack_addr, 64,
-                         BRW_RT_SIZEOF_RAY_QUERY);
-}
-
-
 static void
 lower_ray_query_intrinsic(nir_builder *b,
                           nir_intrinsic_instr *intrin,
@@ -239,12 +174,20 @@ lower_ray_query_intrinsic(nir_builder *b,
    b->cursor = nir_instr_remove(&intrin->instr);
 
    nir_deref_instr *ctrl_level_deref;
-   nir_def *shadow_stack_addr =
-      get_ray_query_shadow_addr(b, deref, state, &ctrl_level_deref);
-   nir_def *hw_stack_addr =
-      brw_nir_rt_sync_stack_addr(b, state->globals.base_mem_addr,
-                                 state->globals.num_dss_rt_stacks);
-   nir_def *stack_addr = shadow_stack_addr ? shadow_stack_addr : hw_stack_addr;
+   nir_def *stack_index =
+      get_ray_query_stack_index(b, deref, state, &ctrl_level_deref);
+   nir_def *rq_globals_addr =
+      nir_iadd(b, state->rq_globals,
+               nir_i2i64(b, nir_amul_imm(b, stack_index,
+                                         BRW_RT_DISPATCH_GLOBALS_ALIGN)));
+   nir_def *stack_base_addr =
+      nir_isub(b, state->rq_globals,
+               nir_i2i64(b, nir_amul_imm(b, stack_index,
+                                         state->sync_stacks_stride)));
+   nir_def *stack_addr =
+      brw_nir_rt_sync_stack_addr(b, stack_base_addr,
+                                 state->num_dss_rt_stacks,
+                                 state->devinfo);
 
    mesa_shader_stage stage = b->shader->info.stage;
    switch (intrin->intrinsic) {
@@ -313,22 +256,12 @@ lower_ray_query_intrinsic(nir_builder *b,
          */
         brw_nir_rt_query_mark_done(b, stack_addr);
 
-        if (shadow_stack_addr)
-           fill_query(b, hw_stack_addr, shadow_stack_addr, ctrl);
-
-        /* Do not use state->rq_globals, we want a uniform value for the
-         * tracing call.
-         */
-        nir_trace_ray_intel(b, nir_load_ray_query_global_intel(b),
-                            level, ctrl, .synchronous = true);
+        nir_trace_ray_intel(b, rq_globals_addr, level, ctrl, .synchronous = true);
 
         struct brw_nir_rt_mem_hit_defs hit_in = {};
-        brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, hw_stack_addr, false,
+        brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, stack_addr, false,
                                           state->devinfo);
 
-        if (shadow_stack_addr)
-           spill_query(b, hw_stack_addr, shadow_stack_addr);
-
         update_trace_ctrl_level(b, ctrl_level_deref,
                                 NULL, NULL,
                                 nir_imm_int(b, GEN_RT_TRACE_RAY_CONTINUE),
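
For reference, the per-query addressing the hunk above now emits as NIR can be written out in plain C as the sketch below. This is illustrative only: the helper names and the uint64_t view of the push constant are ours, not part of the patch; "globals_base" stands for the ray_query_globals push constant and "stack_index" for the flattened query index built by get_ray_query_stack_index().

static inline uint64_t
rq_globals_addr_example(uint64_t globals_base, uint32_t stack_index)
{
   /* RT_DISPATCH_GLOBALS pairs are laid out upwards from the base address. */
   return globals_base + (uint64_t)stack_index * BRW_RT_DISPATCH_GLOBALS_ALIGN;
}

static inline uint64_t
rq_stack_base_example(uint64_t globals_base, uint32_t stack_index,
                      uint32_t sync_stacks_stride)
{
   /* Per-query HW stack areas are laid out downwards from the same base;
    * brw_nir_rt_sync_stack_addr() then derives the per-lane slot below this.
    */
   return globals_base - (uint64_t)stack_index * sync_stacks_stride;
}
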
@@ -547,21 +480,17 @@ lower_ray_query_impl(nir_function_impl *impl, struct lowering_state *state)
    nir_builder _b, *b = &_b;
    _b = nir_builder_at(nir_before_impl(impl));
 
-   nir_def *rq_globals_base = nir_load_ray_query_global_intel(b);
+   state->rq_globals = nir_load_ray_query_global_intel(b);
 
-   /* Use a different global for each 16lanes groups (only in SIMD32). */
-   state->rq_globals = nir_bcsel(
-      b,
-      nir_iand(b,
-               nir_ige_imm(b, nir_load_subgroup_invocation(b), 16),
-               nir_ieq_imm(b, nir_load_subgroup_size(b), 32)),
-      nir_iadd_imm(
-         b, rq_globals_base,
-         align(4 * RT_DISPATCH_GLOBALS_length(state->devinfo), 64)),
-      rq_globals_base);
+   /* ATSM PRMs Vol 9, "State Model for Ray Tracing - RTDispatchGlobals"
+    *
+    *    "For Sync Ray tracing (i.e. using RayQueries), SW must allocate
+    *     space assuming 2K StackIDs"
+    */
+   state->num_dss_rt_stacks = 2048; /* TODO */
 
-   brw_nir_rt_load_globals_addr(b, &state->globals, state->rq_globals,
-                                state->devinfo);
+   state->sync_stacks_stride =
+      brw_rt_ray_queries_stacks_stride(state->devinfo);
 
    nir_foreach_block_safe(block, impl) {
       nir_foreach_instr_safe(instr, block) {
diff --git a/src/intel/compiler/brw/brw_nir_rt_builder.h b/src/intel/compiler/brw/brw_nir_rt_builder.h
index d66fa897e4c..2062b24fc7a 100644
--- a/src/intel/compiler/brw/brw_nir_rt_builder.h
+++ b/src/intel/compiler/brw/brw_nir_rt_builder.h
@@ -178,7 +178,8 @@ brw_nir_rt_sw_hotzone_addr(nir_builder *b,
 static inline nir_def *
 brw_nir_rt_sync_stack_addr(nir_builder *b,
                            nir_def *base_mem_addr,
-                           nir_def *num_dss_rt_stacks)
+                           uint32_t num_dss_rt_stacks,
+                           const struct intel_device_info *devinfo)
 {
    /* Bspec 47547 (Xe) and 56936 (Xe2+) say:
     * For Ray queries (Synchronous Ray Tracing), the formula is similar but
@@ -195,12 +196,29 @@ brw_nir_rt_sync_stack_addr(nir_builder *b,
     * NUM_SYNC_STACKID_PER_DSS instead.
     */
    nir_def *offset32 =
-      nir_imul(b,
-               nir_iadd(b,
-                        nir_imul(b, brw_load_btd_dss_id(b),
-                                 num_dss_rt_stacks),
-                        nir_iadd_imm(b, brw_nir_rt_sync_stack_id(b), 1)),
-               nir_imm_int(b, BRW_RT_SIZEOF_RAY_QUERY));
+      nir_imul_imm(b,
+                   nir_iadd(b,
+                            nir_imul_imm(b, brw_load_btd_dss_id(b),
+                                         num_dss_rt_stacks),
+                            nir_iadd_imm(b, brw_nir_rt_sync_stack_id(b), 1)),
+                   BRW_RT_SIZEOF_RAY_QUERY);
+
+   /* StackID offset for the bottom 16 lanes in SIMD32, this must match the
+    * offset of the second base address provided by the driver through the
+    * pair of ray query RTDispatchGlobals
+    */
+   uint32_t simd32_stack_offset =
+      num_dss_rt_stacks * BRW_RT_SIZEOF_RAY_QUERY *
+      intel_device_info_dual_subslice_id_bound(devinfo);
+
+   offset32 =
+      nir_bcsel(b,
+                nir_iand(b,
+                         nir_ige_imm(b, nir_load_subgroup_invocation(b), 16),
+                         nir_ieq_imm(b, nir_load_subgroup_size(b), 32)),
+                nir_iadd_imm(b, offset32, simd32_stack_offset),
+                offset32);
+
 
    return nir_isub(b, base_mem_addr, nir_u2u64(b, offset32));
 }
@@ -300,7 +318,6 @@ struct brw_nir_rt_globals_defs {
    nir_def *launch_size;
    nir_def *call_sbt_addr;
    nir_def *call_sbt_stride;
-   nir_def *resume_sbt_addr;
 };
 
 static inline void
@@ -368,8 +385,6 @@ brw_nir_rt_load_globals_addr(nir_builder *b,
       defs->call_sbt_stride =
         nir_iand_imm(b, nir_unpack_32_2x16_split_x(b, nir_channel(b, data, 2)),
                      0x1fff);
-      defs->resume_sbt_addr =
-         nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 3));
    } else {
       defs->call_sbt_addr =
         nir_pack_64_2x32_split(b, nir_channel(b, data, 0),
@@ -377,9 +392,6 @@ brw_nir_rt_load_globals_addr(nir_builder *b,
                                nir_imm_int(b, 0)));
       defs->call_sbt_stride =
         nir_unpack_32_2x16_split_y(b, nir_channel(b, data, 1));
-
-      defs->resume_sbt_addr =
-         nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 2));
    }
 }
 
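
The NIR built by brw_nir_rt_sync_stack_addr() above boils down to the following scalar computation for a single invocation. This is an illustrative restatement only: the helper name and parameters are ours, with the SIMD32 subgroup condition folded into a bool.

static inline uint64_t
sync_stack_addr_example(uint64_t base_mem_addr,
                        uint32_t dss_id, uint32_t sync_stack_id,
                        uint32_t num_dss_rt_stacks,
                        uint32_t dss_id_bound,  /* intel_device_info_dual_subslice_id_bound() */
                        bool upper_simd32_half) /* subgroup_invocation >= 16 && subgroup_size == 32 */
{
   uint32_t offset32 =
      (dss_id * num_dss_rt_stacks + sync_stack_id + 1) * BRW_RT_SIZEOF_RAY_QUERY;

   /* The upper half of a SIMD32 subgroup uses a second set of stacks, meant
    * to line up with the second RT_DISPATCH_GLOBALS base provided by the
    * driver (see the comment in the hunk above).
    */
   if (upper_simd32_half)
      offset32 += num_dss_rt_stacks * BRW_RT_SIZEOF_RAY_QUERY * dss_id_bound;

   /* Stacks grow downwards from the base address. */
   return base_mem_addr - offset32;
}
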
diff --git a/src/intel/compiler/brw/brw_rt.h b/src/intel/compiler/brw/brw_rt.h
index 4d5791c88e1..09a0b86af77 100644
--- a/src/intel/compiler/brw/brw_rt.h
+++ b/src/intel/compiler/brw/brw_rt.h
@@ -36,7 +36,7 @@ extern "C" {
 #define BRW_RT_SBT_HANDLE_SIZE 32
 
 /** RT_DISPATCH_GLOBALS size (see gen_rt.xml) */
-#define BRW_RT_DISPATCH_GLOBALS_SIZE 80
+#define BRW_RT_DISPATCH_GLOBALS_SIZE 72
 
 /** RT_DISPATCH_GLOBALS alignment
  *
@@ -191,10 +191,6 @@ struct brw_rt_raygen_trampoline_params {
    (BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS + \
    (BRW_RT_MAX_BVH_LEVELS % 2 ? 32 : 0))
 
-#define BRW_RT_SIZEOF_SHADOW_RAY_QUERY \
-   (BRW_RT_SIZEOF_HIT_INFO * 2 + \
-    (BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS)
-
 #define BRW_RT_SIZEOF_HW_STACK \
    (BRW_RT_SIZEOF_HIT_INFO * 2 + \
     BRW_RT_SIZEOF_RAY * BRW_RT_MAX_BVH_LEVELS + \
@@ -270,25 +266,15 @@ brw_rt_ray_queries_hw_stacks_size(const struct intel_device_info *devinfo)
 }
 
 static inline uint32_t
-brw_rt_ray_queries_shadow_stack_size(const struct intel_device_info *devinfo)
+brw_rt_ray_queries_stacks_offset(uint32_t num_queries)
 {
-   /* Maximum slice/subslice/EU ID can be computed from the max_scratch_ids
-    * which includes all the threads.
-    */
-   uint32_t max_eu_id = devinfo->max_scratch_ids[MESA_SHADER_COMPUTE];
-   uint32_t max_simd_size = 32;
-   return max_eu_id * max_simd_size * BRW_RT_SIZEOF_SHADOW_RAY_QUERY;
+   return BRW_RT_DISPATCH_GLOBALS_ALIGN << util_logbase2_ceil(num_queries);
 }
 
 static inline uint32_t
-brw_rt_ray_queries_shadow_stacks_size(const struct intel_device_info *devinfo,
-                                      uint32_t ray_queries)
+brw_rt_ray_queries_stacks_stride(const struct intel_device_info *devinfo)
 {
-   /* Don't bother a shadow stack if we only have a single query. We can
-    * directly write in the HW buffer.
-    */
-   return (ray_queries > 1 ? ray_queries : 0) * brw_rt_ray_queries_shadow_stack_size(devinfo) +
-          ray_queries * 4; /* Ctrl + Level data */
+   return align(brw_rt_ray_queries_hw_stacks_size(devinfo), 4096);
 }
 
 #ifdef __cplusplus
diff --git a/src/intel/genxml/gen125_rt.xml b/src/intel/genxml/gen125_rt.xml
index b23134ae038..6c4298905ae 100644
--- a/src/intel/genxml/gen125_rt.xml
+++ b/src/intel/genxml/gen125_rt.xml
@@ -28,7 +28,6 @@
-
diff --git a/src/intel/genxml/gen300_rt.xml b/src/intel/genxml/gen300_rt.xml
index 7b2bcff39cb..5861f3eacab 100644
--- a/src/intel/genxml/gen300_rt.xml
+++ b/src/intel/genxml/gen300_rt.xml
@@ -36,6 +36,5 @@
-
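
Putting the two helpers just added to brw_rt.h together, the size of the per-bucket ray-query BO allocated in the anv_cmd_buffer.c change below works out as in this sketch (the function name is illustrative, not part of the patch; util_logbase2_ceil() comes from util/u_math.h):

static inline uint64_t
ray_query_bo_size_example(const struct intel_device_info *devinfo,
                          uint32_t ray_queries)
{
   /* Power-of-two bucket, as in anv_cmd_buffer_set_rt_query_buffer(). */
   const uint32_t bucket = util_logbase2_ceil(ray_queries);

   /* Size of the RT_DISPATCH_GLOBALS array placed at the top of the BO:
    * one BRW_RT_DISPATCH_GLOBALS_ALIGN slot per query, rounded up to the
    * bucket's power-of-two query count.
    */
   const uint64_t globals_size = brw_rt_ray_queries_stacks_offset(1u << bucket);

   /* One page-aligned HW stack area per query below the globals array. */
   const uint64_t stack_stride = brw_rt_ray_queries_stacks_stride(devinfo);

   return globals_size + (stack_stride << bucket);
}
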
diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c
index 60789abcbcc..a1da91a8f4b 100644
--- a/src/intel/vulkan/anv_cmd_buffer.c
+++ b/src/intel/vulkan/anv_cmd_buffer.c
@@ -442,58 +442,84 @@ anv_cmd_buffer_set_rt_query_buffer(struct anv_cmd_buffer *cmd_buffer,
                                    uint32_t ray_queries,
                                    VkShaderStageFlags stages)
 {
-   struct anv_device *device = cmd_buffer->device;
-   uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
+   if (ray_queries > cmd_buffer->state.num_ray_query_globals) {
+      struct anv_device *device = cmd_buffer->device;
+      uint8_t wa_idx = anv_get_ray_query_bo_index(cmd_buffer);
 
-   uint64_t ray_shadow_size =
-      align64(brw_rt_ray_queries_shadow_stacks_size(device->info, ray_queries),
-              4096);
-   if (ray_shadow_size > 0 &&
-       (!cmd_buffer->state.ray_query_shadow_bo ||
-        cmd_buffer->state.ray_query_shadow_bo->size < ray_shadow_size)) {
-      unsigned shadow_size_log2 = MAX2(util_logbase2_ceil(ray_shadow_size), 16);
-      unsigned bucket = shadow_size_log2 - 16;
-      assert(bucket < ARRAY_SIZE(device->ray_query_shadow_bos[0]));
+      unsigned bucket = util_logbase2_ceil(ray_queries);
+      assert(bucket < ARRAY_SIZE(device->ray_query_bos[0]));
 
-      struct anv_bo *bo = p_atomic_read(&device->ray_query_shadow_bos[idx][bucket]);
+      uint64_t offset = brw_rt_ray_queries_stacks_offset(1 << bucket);
+      uint64_t stride = brw_rt_ray_queries_stacks_stride(device->info);
+
+      struct anv_bo *bo = p_atomic_read(&device->ray_query_bos[wa_idx][bucket]);
       if (bo == NULL) {
          struct anv_bo *new_bo;
-         VkResult result = anv_device_alloc_bo(device, "RT queries shadow",
-                                               1 << shadow_size_log2,
-                                               ANV_BO_ALLOC_INTERNAL, /* alloc_flags */
-                                               0, /* explicit_address */
-                                               &new_bo);
+         VkResult result =
+            anv_device_alloc_bo(device, "RT queries scratch",
+                                offset + (stride << bucket), /* size */
+                                ANV_BO_ALLOC_INTERNAL |
+                                ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE, /* alloc_flags */
+                                0, /* explicit_address */
+                                &new_bo);
+
         ANV_DMR_BO_ALLOC(&cmd_buffer->vk.base, new_bo, result);
         if (result != VK_SUCCESS) {
            anv_batch_set_error(&cmd_buffer->batch, result);
            return;
         }
 
-         bo = p_atomic_cmpxchg(&device->ray_query_shadow_bos[idx][bucket], NULL, new_bo);
+         /* Map extra space we added at end of the buffer, we will write the
+          * array of RT_DISPATCH_GLOBALS into it so we can use only a single
+          * memory address in our shaders for all stacks and globals
+          */
+         void *map;
+         result = anv_device_map_bo(device, new_bo, stride << bucket,
+                                    offset, NULL, &map);
+
+         if (result != VK_SUCCESS) {
+            ANV_DMR_BO_FREE(&cmd_buffer->vk.base, new_bo);
+            anv_device_release_bo(device, new_bo);
+            anv_batch_set_error(&cmd_buffer->batch, result);
+            return;
+         }
+
+         anv_genX(device->info, setup_ray_query_globals)(device,
+                                                         new_bo,
+                                                         stride << bucket,
+                                                         map,
+                                                         1 << bucket);
+
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
+         if (device->physical->memory.need_flush)
+            util_flush_inval_range(map, offset);
+#endif
+
+         anv_device_unmap_bo(device, new_bo, map, offset, false);
+
+         bo = p_atomic_cmpxchg(&device->ray_query_bos[wa_idx][bucket], NULL, new_bo);
         if (bo != NULL) {
-            ANV_DMR_BO_FREE(&device->vk.base, new_bo);
+            ANV_DMR_BO_FREE(&cmd_buffer->vk.base, new_bo);
            anv_device_release_bo(device, new_bo);
         } else {
            bo = new_bo;
         }
      }
 
-      cmd_buffer->state.ray_query_shadow_bo = bo;
-
-      /* Add the ray query buffers to the batch list. */
-      anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
-                            cmd_buffer->state.ray_query_shadow_bo);
+      /* Add the HW buffer to the list of BO used. */
+      anv_reloc_list_add_bo(cmd_buffer->batch.relocs, bo);
+
+      cmd_buffer->state.ray_query_globals = (struct anv_address) {
+         .bo = bo,
+         .offset = (int64_t) (stride << bucket),
+      };
+
+      cmd_buffer->state.num_ray_query_globals = 1 << bucket;
    }
 
-   /* Add the HW buffer to the list of BO used. */
-   assert(device->ray_query_bo[idx]);
-   anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
-                         device->ray_query_bo[idx]);
-
-   /* Fill the push constants & mark them dirty. */
-   struct anv_address ray_query_globals_addr =
-      anv_genX(device->info, cmd_buffer_ray_query_globals)(cmd_buffer);
+   /* Update the push constants & mark them dirty. */
    pipeline_state->push_constants.ray_query_globals =
-      anv_address_physical(ray_query_globals_addr);
+      anv_address_physical(cmd_buffer->state.ray_query_globals);
    cmd_buffer->state.push_constants_dirty |= stages;
    pipeline_state->push_constants_data_dirty = true;
 }
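
The resulting layout of one of these per-bucket BOs, as the code above sets it up, is sketched below (N and stride are shorthands for this illustration, not identifiers from the patch):

/*
 *   N      = 1 << bucket                  (number of queries provisioned)
 *   stride = brw_rt_ray_queries_stacks_stride(devinfo)
 *   offset = brw_rt_ray_queries_stacks_offset(N)
 *
 *   [bo + 0,          bo + N * stride)           HW stack areas, one per
 *                                                query; query N-1 lowest,
 *                                                query 0 just below the
 *                                                globals array
 *   [bo + N * stride, bo + N * stride + offset)  RT_DISPATCH_GLOBALS pairs,
 *                                                one BRW_RT_DISPATCH_GLOBALS_ALIGN
 *                                                slot per query, query 0 first
 *
 * ray_query_globals (the push constant) points at bo + N * stride, so shaders
 * index globals upwards and stacks downwards from a single address.
 */
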
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 6cb87a9ced8..86bb309ec2f 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -341,22 +341,16 @@ VkResult anv_CreateDevice(
    ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
    VkResult result;
    struct anv_device *device;
-   bool device_has_compute_queue = false;
 
    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO);
 
    /* Check requested queues and fail if we are requested to create any
    * queues with flags we don't support.
    */
-   for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
+   for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++)
       if (pCreateInfo->pQueueCreateInfos[i].flags & ~VK_DEVICE_QUEUE_CREATE_PROTECTED_BIT)
          return vk_error(physical_device, VK_ERROR_INITIALIZATION_FAILED);
 
-      const struct anv_queue_family *family =
-         &physical_device->queue.families[pCreateInfo->pQueueCreateInfos[i].queueFamilyIndex];
-      device_has_compute_queue |= family->engine_class == INTEL_ENGINE_CLASS_COMPUTE;
-   }
-
    device = vk_zalloc2(&physical_device->instance->vk.alloc, pAllocator,
                        sizeof(*device), 8,
                        VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
@@ -786,36 +780,9 @@ VkResult anv_CreateDevice(
                             device->workaround_bo->size,
                             INTEL_DEBUG_BLOCK_TYPE_FRAME);
 
-   if (device->vk.enabled_extensions.KHR_ray_query) {
-      uint32_t ray_queries_size =
-         align(brw_rt_ray_queries_hw_stacks_size(device->info), 4096);
-
-      result = anv_device_alloc_bo(device, "ray queries",
-                                   ray_queries_size,
-                                   ANV_BO_ALLOC_INTERNAL,
-                                   0 /* explicit_address */,
-                                   &device->ray_query_bo[0]);
-      ANV_DMR_BO_ALLOC(&device->vk.base, device->ray_query_bo[0], result);
-      if (result != VK_SUCCESS)
-         goto fail_alloc_device_bo;
-
-      /* We need a separate ray query bo for CCS engine with Wa_14022863161. */
-      if (intel_needs_workaround(device->isl_dev.info, 14022863161) &&
-          device_has_compute_queue) {
-         result = anv_device_alloc_bo(device, "ray queries",
-                                      ray_queries_size,
-                                      ANV_BO_ALLOC_INTERNAL,
-                                      0 /* explicit_address */,
-                                      &device->ray_query_bo[1]);
-         ANV_DMR_BO_ALLOC(&device->vk.base, device->ray_query_bo[1], result);
-         if (result != VK_SUCCESS)
-            goto fail_ray_query_bo;
-      }
-   }
-
    result = anv_device_init_trivial_batch(device);
    if (result != VK_SUCCESS)
-      goto fail_ray_query_bo;
+      goto fail_alloc_device_bo;
 
    /* Emit the CPS states before running the initialization batch as those
    * structures are referenced.
@@ -1073,13 +1040,6 @@ VkResult anv_CreateDevice(
 fail_trivial_batch:
    ANV_DMR_BO_FREE(&device->vk.base, device->trivial_batch_bo);
    anv_device_release_bo(device, device->trivial_batch_bo);
-fail_ray_query_bo:
-   for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bo); i++) {
-      if (device->ray_query_bo[i]) {
-         ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bo[i]);
-         anv_device_release_bo(device, device->ray_query_bo[i]);
-      }
-   }
 fail_alloc_device_bo:
    if (device->mem_fence_bo) {
       ANV_DMR_BO_FREE(&device->vk.base, device->mem_fence_bo);
@@ -1231,17 +1191,13 @@ void anv_DestroyDevice(
       anv_scratch_pool_finish(device, &device->protected_scratch_pool);
 
    if (device->vk.enabled_extensions.KHR_ray_query) {
-      for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bo); i++) {
-         for (unsigned j = 0; j < ARRAY_SIZE(device->ray_query_shadow_bos[0]); j++) {
-            if (device->ray_query_shadow_bos[i][j] != NULL) {
-               ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_shadow_bos[i][j]);
-               anv_device_release_bo(device, device->ray_query_shadow_bos[i][j]);
+      for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bos); i++) {
+         for (unsigned j = 0; j < ARRAY_SIZE(device->ray_query_bos[0]); j++) {
+            if (device->ray_query_bos[i][j] != NULL) {
+               ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bos[i][j]);
+               anv_device_release_bo(device, device->ray_query_bos[i][j]);
             }
          }
-         if (device->ray_query_bo[i]) {
-            ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bo[i]);
-            anv_device_release_bo(device, device->ray_query_bo[i]);
-         }
       }
    }
    ANV_DMR_BO_FREE(&device->vk.base, device->workaround_bo);
diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
index 5234823b94a..1dd77b9c69e 100644
--- a/src/intel/vulkan/anv_genX.h
+++ b/src/intel/vulkan/anv_genX.h
@@ -226,7 +226,11 @@ void genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
 
 void genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer);
 
-struct anv_address genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer);
+void genX(setup_ray_query_globals)(struct anv_device *device,
+                                   struct anv_bo* bo,
+                                   uint64_t offset,
+                                   void* map,
+                                   uint32_t num_queries);
 
 void genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
                                        uint32_t total_scratch);
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index a33f9e19365..741c7d41b32 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -2625,22 +2625,11 @@ struct anv_device {
 
    uint32_t protected_session_id;
 
-   /** Shadow ray query BO
-    *
-    * The ray_query_bo only holds the current ray being traced. When using
-    * more than 1 ray query per thread, we cannot fit all the queries in
-    * there, so we need a another buffer to hold query data that is not
-    * currently being used by the HW for tracing, similar to a scratch space.
-    *
-    * The size of the shadow buffer depends on the number of queries per
-    * shader.
+   /** Pool of ray query buffers used to communicate with HW unit.
     *
     * We might need a buffer per queue family due to Wa_14022863161.
     */
-   struct anv_bo *ray_query_shadow_bos[2][16];
-   /** Ray query buffer used to communicated with HW unit.
-    */
-   struct anv_bo *ray_query_bo[2];
+   struct anv_bo *ray_query_bos[2][16];
 
    struct anv_shader_internal *rt_trampoline;
    struct anv_shader_internal *rt_trivial_return;
@@ -4247,10 +4236,19 @@ struct anv_push_constants {
     */
    uint32_t surfaces_base_offset;
 
-   /** Ray query globals
+   /**
+    * Pointer to ray query stacks and their associated pairs of
+    * RT_DISPATCH_GLOBALS structures (see genX(setup_ray_query_globals))
     *
-    * Pointer to a couple of RT_DISPATCH_GLOBALS structures (see
-    * genX(cmd_buffer_ray_query_globals))
+    * The pair of globals for each query object is stored counting up from
+    * this address in units of BRW_RT_DISPATCH_GLOBALS_ALIGN:
+    *
+    *    rq_globals = ray_query_globals + (rq * BRW_RT_DISPATCH_GLOBALS_ALIGN)
+    *
+    * The raytracing scratch area for each ray query is stored counting down
+    * from this address in units of brw_rt_ray_queries_stacks_stride(devinfo):
+    *
+    *    rq_stacks_addr = ray_query_globals - (rq * ray_queries_stacks_stride)
     */
    uint64_t ray_query_globals;
 
@@ -4753,9 +4751,14 @@ struct anv_cmd_state {
    unsigned current_hash_scale;
 
    /**
-    * A buffer used for spill/fill of ray queries.
+    * Number of ray query buffers allocated.
     */
-   struct anv_bo * ray_query_shadow_bo;
+   uint32_t num_ray_query_globals;
+
+   /**
+    * Current array of RT_DISPATCH_GLOBALS for ray queries.
+    */
+   struct anv_address ray_query_globals;
 
    /** Pointer to the last emitted COMPUTE_WALKER.
     *
diff --git a/src/intel/vulkan/genX_cmd_compute.c b/src/intel/vulkan/genX_cmd_compute.c
index b7c84498bd2..ad93a8c718d 100644
--- a/src/intel/vulkan/genX_cmd_compute.c
+++ b/src/intel/vulkan/genX_cmd_compute.c
@@ -37,6 +37,7 @@
 #include "ds/intel_tracepoints.h"
 
 #include "genX_mi_builder.h"
+#include "nir_builder.h"
 
 void
 genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
@@ -811,49 +812,35 @@ void genX(CmdDispatchIndirect)(
    genX(cmd_buffer_dispatch_indirect)(cmd_buffer, addr, false);
 }
 
-struct anv_address
-genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
+void
+genX(setup_ray_query_globals)(struct anv_device *device,
+                              struct anv_bo* bo,
+                              uint64_t offset,
+                              void* map,
+                              uint32_t num_queries)
 {
 #if GFX_VERx10 >= 125
-   struct anv_device *device = cmd_buffer->device;
-
-   struct anv_state state =
-      anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
-                                           2 * align(BRW_RT_DISPATCH_GLOBALS_SIZE, 64),
-                                           BRW_RT_DISPATCH_GLOBALS_ALIGN);
-   struct brw_rt_scratch_layout layout;
-   uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in
-                                       * some cases?
-                                       */
-   brw_rt_compute_scratch_layout(&layout, device->info,
-                                 stack_ids_per_dss, 1 << 10);
-
-   uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
-
-   for (uint32_t i = 0; i < 2; i++) {
-      const struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
-         .MemBaseAddress = (struct anv_address) {
-            /* The ray query HW computes offsets from the top of the buffer, so
-             * let the address at the end of the buffer.
-             */
-            .bo = device->ray_query_bo[idx],
-            .offset = (i + 1) * (device->ray_query_bo[idx]->size / 2),
-         },
-         .AsyncRTStackSize = layout.ray_stack_stride / 64,
-         .NumDSSRTStacks = layout.stack_ids_per_dss,
-         .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
-         .Flags = RT_DEPTH_TEST_LESS_EQUAL,
-         .ResumeShaderTable = (struct anv_address) {
-            .bo = cmd_buffer->state.ray_query_shadow_bo,
-         },
-      };
-      GENX(RT_DISPATCH_GLOBALS_pack)(
-         NULL,
-         state.map + i * align(4 * GENX(RT_DISPATCH_GLOBALS_length), 64),
-         &rtdg);
-   }
-
-   return anv_cmd_buffer_temporary_state_address(cmd_buffer, state);
+   assert(num_queries > 0);
+   uint64_t stack_stride = brw_rt_ray_queries_stacks_stride(device->info);
+   for (uint32_t i = 0; i < num_queries; ++i)
+      for (uint32_t j = 0; j < 2; j++)
+         GENX(RT_DISPATCH_GLOBALS_pack)(NULL,
+                                        (char*) map +
+                                        i * BRW_RT_DISPATCH_GLOBALS_ALIGN +
+                                        j * align(4 * GENX(RT_DISPATCH_GLOBALS_length), 64),
+                                        &(struct GENX(RT_DISPATCH_GLOBALS)) {
+            .MemBaseAddress = (struct anv_address) {
+               /* The ray query HW computes offsets from the top of the
+                * buffer, so set the address at the end of the buffer.
+                */
+               .bo = bo,
+               .offset = offset - i * stack_stride - j * stack_stride / 2,
+            },
+            .AsyncRTStackSize = BRW_RT_SIZEOF_RAY_QUERY / 64,
+            .NumDSSRTStacks = 2048, /* TODO */
+            .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
+            .Flags = RT_DEPTH_TEST_LESS_EQUAL,
+         });
 #else
    UNREACHABLE("Not supported");
 #endif
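
To make the pairing explicit, the MemBaseAddress that each packed RT_DISPATCH_GLOBALS ends up with can be restated as the sketch below. The helper is illustrative, not part of the patch; "globals_top" stands for the offset argument passed to genX(setup_ray_query_globals), i.e. the start of the globals array within the BO.

static inline uint64_t
rtdg_mem_base_example(uint64_t globals_top, uint64_t stack_stride,
                      uint32_t query, uint32_t pair_idx /* 0 or 1 */)
{
   /* Each query owns one stack_stride-sized area directly below the globals
    * array; the second RT_DISPATCH_GLOBALS of the pair points half a stride
    * lower, which is meant to line up with the SIMD32 offset added in
    * brw_nir_rt_sync_stack_addr().
    */
   return globals_top - query * stack_stride - pair_idx * (stack_stride / 2);
}
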