diff --git a/.pick_status.json b/.pick_status.json
index ff7b8093ff9..dccbe8fdfb0 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -2914,7 +2914,7 @@
"description": "Revert \"anv,brw: Allow multiple ray queries without spilling to a shadow stack\"",
"nominated": true,
"nomination_type": 2,
- "resolution": 0,
+ "resolution": 1,
"main_sha": null,
"because_sha": "1f1de7ebd63cbe55972e01a0e7f5509e9e232917",
"notes": null
diff --git a/src/intel/compiler/brw/brw_nir_lower_ray_queries.c b/src/intel/compiler/brw/brw_nir_lower_ray_queries.c
index 8bbec94dd5b..9a9ddbd0364 100644
--- a/src/intel/compiler/brw/brw_nir_lower_ray_queries.c
+++ b/src/intel/compiler/brw/brw_nir_lower_ray_queries.c
@@ -38,10 +38,8 @@ struct lowering_state {
struct hash_table *queries;
uint32_t n_queries;
+ struct brw_nir_rt_globals_defs globals;
nir_def *rq_globals;
-
- uint32_t num_dss_rt_stacks;
- uint32_t sync_stacks_stride;
};
struct brw_ray_query {
@@ -52,6 +50,12 @@ struct brw_ray_query {
#define SIZEOF_QUERY_STATE (sizeof(uint32_t))
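+
+/* With a single query per shader the HW ray stack is used directly;
+ * spill/fill through shadow memory is only needed once a shader uses
+ * more than one query.
+ */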
+static bool
+need_spill_fill(struct lowering_state *state)
+{
+ return state->n_queries > 1;
+}
+
/**
* This pass converts opaque RayQuery structures from SPIRV into a vec3 where
* the first 2 elements store a global address for the query and the third
@@ -94,8 +98,10 @@ create_internal_var(struct brw_ray_query *rq, struct lowering_state *state)
NULL);
}
+
static nir_def *
-get_ray_query_stack_index(nir_builder *b,
+get_ray_query_shadow_addr(nir_builder *b,
nir_deref_instr *deref,
struct lowering_state *state,
nir_deref_instr **out_state_deref)
@@ -110,17 +116,35 @@ get_ray_query_stack_index(nir_builder *b,
struct brw_ray_query *rq = entry->data;
- nir_def *query_idx = nir_imm_int(b, rq->id);
+   /* Base address in the shadow memory of the variable associated with this
+    * ray query.
+ */
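+   /* The resume SBT address of RT_DISPATCH_GLOBALS is repurposed by the
+    * driver to carry the shadow buffer address.
+    */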
+ nir_def *base_addr =
+ nir_iadd_imm(b, state->globals.resume_sbt_addr,
+ brw_rt_ray_queries_shadow_stack_size(state->devinfo) * rq->id);
+
+ bool spill_fill = need_spill_fill(state);
*out_state_deref = nir_build_deref_var(b, rq->internal_var);
+ if (!spill_fill)
+ return NULL;
+
/* Just emit code and let constant-folding go to town */
nir_deref_instr **p = &path.path[1];
for (; *p; p++) {
if ((*p)->deref_type == nir_deref_type_array) {
nir_def *index = (*p)->arr.index.ssa;
+
+         /* Walk the same array deref chain on the internal state variable. */
*out_state_deref = nir_build_deref_array(b, *out_state_deref, index);
- index = nir_amul_imm(b, index, MAX2(1, glsl_get_aoa_size((*p)->type)));
- query_idx = nir_iadd(b, query_idx, index);
+
+         /* Offset the shadow address by the array index, with each element
+          * covering a full set of shadow stacks (including any inner array
+          * dimensions).
+          */
+ uint64_t size = MAX2(1, glsl_get_aoa_size((*p)->type)) *
+ brw_rt_ray_queries_shadow_stack_size(state->devinfo);
+
+ nir_def *mul = nir_amul_imm(b, nir_i2i64(b, index), size);
+
+ base_addr = nir_iadd(b, base_addr, mul);
} else {
UNREACHABLE("Unsupported deref type");
}
@@ -128,7 +152,28 @@ get_ray_query_stack_index(nir_builder *b,
nir_deref_path_finish(&path);
- return query_idx;
+   /* Add the lane offset to the shadow memory address:
+    *
+    *    lane_offset = (dss_id * num_dss_rt_stacks + sync_stack_id) *
+    *                  BRW_RT_SIZEOF_SHADOW_RAY_QUERY
+    */
+ nir_def *lane_offset =
+ nir_imul_imm(
+ b,
+ nir_iadd(
+ b,
+ nir_imul(
+ b,
+ brw_load_btd_dss_id(b),
+ state->globals.num_dss_rt_stacks),
+ brw_nir_rt_sync_stack_id(b)),
+ BRW_RT_SIZEOF_SHADOW_RAY_QUERY);
+
+   /* In SIMD32, the bottom and top 16 lanes each get their own half of the
+    * shadow stack area.
+    */
+ lane_offset = nir_bcsel(
+ b,
+ nir_ilt_imm(b, nir_load_subgroup_invocation(b), 16),
+ lane_offset,
+ nir_iadd_imm(b, lane_offset,
+ brw_rt_ray_queries_shadow_stack_size(state->devinfo) / 2));
+
+ return nir_iadd(b, base_addr, nir_i2i64(b, lane_offset));
}
static void
@@ -164,6 +209,26 @@ update_trace_ctrl_level(nir_builder *b,
}
}
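+/* Restore a ray query from its shadow memory slot into the HW ray stack
+ * before a synchronous trace.
+ */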
+static void
+fill_query(nir_builder *b,
+ nir_def *hw_stack_addr,
+           nir_def *shadow_stack_addr)
+{
+ brw_nir_memcpy_global(b, hw_stack_addr, 64, shadow_stack_addr, 64,
+ BRW_RT_SIZEOF_RAY_QUERY);
+}
+
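+/* Copy a ray query from the HW ray stack out to its shadow memory slot
+ * once tracing is done.
+ */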
+static void
+spill_query(nir_builder *b,
+ nir_def *hw_stack_addr,
+ nir_def *shadow_stack_addr)
+{
+ brw_nir_memcpy_global(b, shadow_stack_addr, 64, hw_stack_addr, 64,
+ BRW_RT_SIZEOF_RAY_QUERY);
+}
+
static void
lower_ray_query_intrinsic(nir_builder *b,
nir_intrinsic_instr *intrin,
@@ -174,20 +239,12 @@ lower_ray_query_intrinsic(nir_builder *b,
b->cursor = nir_instr_remove(&intrin->instr);
nir_deref_instr *ctrl_level_deref;
- nir_def *stack_index =
- get_ray_query_stack_index(b, deref, state, &ctrl_level_deref);
- nir_def *rq_globals_addr =
- nir_iadd(b, state->rq_globals,
- nir_i2i64(b, nir_amul_imm(b, stack_index,
- BRW_RT_DISPATCH_GLOBALS_ALIGN)));
- nir_def *stack_base_addr =
- nir_isub(b, state->rq_globals,
- nir_i2i64(b, nir_amul_imm(b, stack_index,
- state->sync_stacks_stride)));
- nir_def *stack_addr =
- brw_nir_rt_sync_stack_addr(b, stack_base_addr,
- state->num_dss_rt_stacks,
- state->devinfo);
+ nir_def *shadow_stack_addr =
+ get_ray_query_shadow_addr(b, deref, state, &ctrl_level_deref);
+ nir_def *hw_stack_addr =
+ brw_nir_rt_sync_stack_addr(b, state->globals.base_mem_addr,
+ state->globals.num_dss_rt_stacks);
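+   /* Without a shadow copy (single query), operate directly on the HW
+    * stack.
+    */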
+ nir_def *stack_addr = shadow_stack_addr ? shadow_stack_addr : hw_stack_addr;
mesa_shader_stage stage = b->shader->info.stage;
switch (intrin->intrinsic) {
@@ -256,12 +313,22 @@ lower_ray_query_intrinsic(nir_builder *b,
*/
brw_nir_rt_query_mark_done(b, stack_addr);
- nir_trace_ray_intel(b, rq_globals_addr, level, ctrl, .synchronous = true);
+ if (shadow_stack_addr)
+      fill_query(b, hw_stack_addr, shadow_stack_addr);
+
+   /* Do not use state->rq_globals here, as we want a uniform value for the
+    * tracing call.
+ */
+ nir_trace_ray_intel(b, nir_load_ray_query_global_intel(b),
+ level, ctrl, .synchronous = true);
struct brw_nir_rt_mem_hit_defs hit_in = {};
- brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, stack_addr, false,
+ brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, hw_stack_addr, false,
state->devinfo);
+ if (shadow_stack_addr)
+ spill_query(b, hw_stack_addr, shadow_stack_addr);
+
update_trace_ctrl_level(b, ctrl_level_deref,
NULL, NULL,
nir_imm_int(b, GEN_RT_TRACE_RAY_CONTINUE),
@@ -480,12 +547,21 @@ lower_ray_query_impl(nir_function_impl *impl, struct lowering_state *state)
nir_builder _b, *b = &_b;
_b = nir_builder_at(nir_before_impl(impl));
- state->rq_globals = nir_load_ray_query_global_intel(b);
+ nir_def *rq_globals_base = nir_load_ray_query_global_intel(b);
- state->num_dss_rt_stacks =
- brw_rt_ray_queries_stack_ids_per_dss(state->devinfo);
- state->sync_stacks_stride =
- brw_rt_ray_queries_stacks_stride(state->devinfo);
+   /* Use a different RT_DISPATCH_GLOBALS for each group of 16 lanes (only
+    * in SIMD32).
+    */
+ state->rq_globals = nir_bcsel(
+ b,
+ nir_iand(b,
+ nir_ige_imm(b, nir_load_subgroup_invocation(b), 16),
+ nir_ieq_imm(b, nir_load_subgroup_size(b), 32)),
+ nir_iadd_imm(
+ b, rq_globals_base,
+ align(4 * RT_DISPATCH_GLOBALS_length(state->devinfo), 64)),
+ rq_globals_base);
+
+ brw_nir_rt_load_globals_addr(b, &state->globals, state->rq_globals,
+ state->devinfo);
nir_foreach_block_safe(block, impl) {
nir_foreach_instr_safe(instr, block) {
diff --git a/src/intel/compiler/brw/brw_nir_rt_builder.h b/src/intel/compiler/brw/brw_nir_rt_builder.h
index 2062b24fc7a..d66fa897e4c 100644
--- a/src/intel/compiler/brw/brw_nir_rt_builder.h
+++ b/src/intel/compiler/brw/brw_nir_rt_builder.h
@@ -178,8 +178,7 @@ brw_nir_rt_sw_hotzone_addr(nir_builder *b,
static inline nir_def *
brw_nir_rt_sync_stack_addr(nir_builder *b,
nir_def *base_mem_addr,
- uint32_t num_dss_rt_stacks,
- const struct intel_device_info *devinfo)
+ nir_def *num_dss_rt_stacks)
{
/* Bspec 47547 (Xe) and 56936 (Xe2+) say:
* For Ray queries (Synchronous Ray Tracing), the formula is similar but
@@ -196,29 +195,12 @@ brw_nir_rt_sync_stack_addr(nir_builder *b,
* NUM_SYNC_STACKID_PER_DSS instead.
*/
nir_def *offset32 =
- nir_imul_imm(b,
- nir_iadd(b,
- nir_imul_imm(b, brw_load_btd_dss_id(b),
- num_dss_rt_stacks),
- nir_iadd_imm(b, brw_nir_rt_sync_stack_id(b), 1)),
- BRW_RT_SIZEOF_RAY_QUERY);
-
- /* StackID offset for the bottom 16 lanes in SIMD32, this must match the
- * offset of the second base address provided by the driver through the
- * pair of ray query RTDispatchGlobals
- */
- uint32_t simd32_stack_offset =
- num_dss_rt_stacks * BRW_RT_SIZEOF_RAY_QUERY *
- intel_device_info_dual_subslice_id_bound(devinfo);
-
- offset32 =
- nir_bcsel(b,
- nir_iand(b,
- nir_ige_imm(b, nir_load_subgroup_invocation(b), 16),
- nir_ieq_imm(b, nir_load_subgroup_size(b), 32)),
- nir_iadd_imm(b, offset32, simd32_stack_offset),
- offset32);
-
+ nir_imul(b,
+ nir_iadd(b,
+ nir_imul(b, brw_load_btd_dss_id(b),
+ num_dss_rt_stacks),
+ nir_iadd_imm(b, brw_nir_rt_sync_stack_id(b), 1)),
+ nir_imm_int(b, BRW_RT_SIZEOF_RAY_QUERY));
return nir_isub(b, base_mem_addr, nir_u2u64(b, offset32));
}
@@ -318,6 +300,7 @@ struct brw_nir_rt_globals_defs {
nir_def *launch_size;
nir_def *call_sbt_addr;
nir_def *call_sbt_stride;
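+   /* Resume SBT address; for ray queries the driver repurposes it to hold
+    * the shadow buffer address.
+    */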
+ nir_def *resume_sbt_addr;
};
static inline void
@@ -385,6 +368,8 @@ brw_nir_rt_load_globals_addr(nir_builder *b,
defs->call_sbt_stride =
nir_iand_imm(b, nir_unpack_32_2x16_split_x(b, nir_channel(b, data, 2)),
0x1fff);
+ defs->resume_sbt_addr =
+ nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 3));
} else {
defs->call_sbt_addr =
nir_pack_64_2x32_split(b, nir_channel(b, data, 0),
@@ -392,6 +377,9 @@ brw_nir_rt_load_globals_addr(nir_builder *b,
nir_imm_int(b, 0)));
defs->call_sbt_stride =
nir_unpack_32_2x16_split_y(b, nir_channel(b, data, 1));
+
+ defs->resume_sbt_addr =
+ nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 2));
}
}
diff --git a/src/intel/compiler/brw/brw_rt.h b/src/intel/compiler/brw/brw_rt.h
index 4255db0ff2b..3b4fdf4ec92 100644
--- a/src/intel/compiler/brw/brw_rt.h
+++ b/src/intel/compiler/brw/brw_rt.h
@@ -36,7 +36,7 @@ extern "C" {
#define BRW_RT_SBT_HANDLE_SIZE 32
/** RT_DISPATCH_GLOBALS size (see gen_rt.xml) */
-#define BRW_RT_DISPATCH_GLOBALS_SIZE 72
+#define BRW_RT_DISPATCH_GLOBALS_SIZE 80
/** RT_DISPATCH_GLOBALS alignment
*
@@ -191,6 +191,10 @@ struct brw_rt_raygen_trampoline_params {
(BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS + \
(BRW_RT_MAX_BVH_LEVELS % 2 ? 32 : 0))
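+/** Size of the shadow copy of a single ray query HW stack */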
+#define BRW_RT_SIZEOF_SHADOW_RAY_QUERY \
+ (BRW_RT_SIZEOF_HIT_INFO * 2 + \
+ (BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS)
+
#define BRW_RT_SIZEOF_HW_STACK \
(BRW_RT_SIZEOF_HIT_INFO * 2 + \
BRW_RT_SIZEOF_RAY * BRW_RT_MAX_BVH_LEVELS + \
@@ -277,15 +281,25 @@ brw_rt_ray_queries_stack_ids_per_dss(const struct intel_device_info *devinfo)
}
static inline uint32_t
-brw_rt_ray_queries_stacks_offset(uint32_t num_queries)
+brw_rt_ray_queries_shadow_stack_size(const struct intel_device_info *devinfo)
{
- return BRW_RT_DISPATCH_GLOBALS_ALIGN << util_logbase2_ceil(num_queries);
+   /* The maximum slice/subslice/EU ID can be computed from max_scratch_ids,
+    * which covers all the threads.
+ */
+ uint32_t max_eu_id = devinfo->max_scratch_ids[MESA_SHADER_COMPUTE];
+ uint32_t max_simd_size = 32;
+ return max_eu_id * max_simd_size * BRW_RT_SIZEOF_SHADOW_RAY_QUERY;
}
static inline uint32_t
-brw_rt_ray_queries_stacks_stride(const struct intel_device_info *devinfo)
+brw_rt_ray_queries_shadow_stacks_size(const struct intel_device_info *devinfo,
+ uint32_t ray_queries)
{
- return align(brw_rt_ray_queries_hw_stacks_size(devinfo), 4096);
+   /* Don't bother with a shadow stack if we only have a single query; we
+    * can write directly into the HW buffer.
+ */
+   return (ray_queries > 1 ? ray_queries : 0) *
+          brw_rt_ray_queries_shadow_stack_size(devinfo) +
+          ray_queries * 4; /* Ctrl + Level data */
}
#ifdef __cplusplus
diff --git a/src/intel/genxml/gen125_rt.xml b/src/intel/genxml/gen125_rt.xml
index 6c4298905ae..b23134ae038 100644
--- a/src/intel/genxml/gen125_rt.xml
+++ b/src/intel/genxml/gen125_rt.xml
@@ -28,6 +28,7 @@
+
diff --git a/src/intel/genxml/gen300_rt.xml b/src/intel/genxml/gen300_rt.xml
index 5861f3eacab..7b2bcff39cb 100644
--- a/src/intel/genxml/gen300_rt.xml
+++ b/src/intel/genxml/gen300_rt.xml
@@ -36,5 +36,6 @@
+
diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c
index 77a90224c4d..a7d817ee88b 100644
--- a/src/intel/vulkan/anv_cmd_buffer.c
+++ b/src/intel/vulkan/anv_cmd_buffer.c
@@ -442,84 +442,58 @@ anv_cmd_buffer_set_rt_query_buffer(struct anv_cmd_buffer *cmd_buffer,
uint32_t ray_queries,
VkShaderStageFlags stages)
{
- if (ray_queries > cmd_buffer->state.num_ray_query_globals) {
- struct anv_device *device = cmd_buffer->device;
- uint8_t wa_idx = anv_get_ray_query_bo_index(cmd_buffer);
+ struct anv_device *device = cmd_buffer->device;
+ uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
- unsigned bucket = util_logbase2_ceil(ray_queries);
- assert(bucket < ARRAY_SIZE(device->ray_query_bos[0]));
+ uint64_t ray_shadow_size =
+ align64(brw_rt_ray_queries_shadow_stacks_size(device->info, ray_queries),
+ 4096);
+ if (ray_shadow_size > 0 &&
+ (!cmd_buffer->state.ray_query_shadow_bo ||
+ cmd_buffer->state.ray_query_shadow_bo->size < ray_shadow_size)) {
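+      /* Shadow BOs are cached in power-of-two size buckets, starting at
+       * 64KB (1 << 16).
+       */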
+ unsigned shadow_size_log2 = MAX2(util_logbase2_ceil(ray_shadow_size), 16);
+ unsigned bucket = shadow_size_log2 - 16;
+ assert(bucket < ARRAY_SIZE(device->ray_query_shadow_bos[0]));
- uint64_t offset = brw_rt_ray_queries_stacks_offset(1 << bucket);
- uint64_t stride = brw_rt_ray_queries_stacks_stride(device->info);
-
- struct anv_bo *bo = p_atomic_read(&device->ray_query_bos[wa_idx][bucket]);
+ struct anv_bo *bo = p_atomic_read(&device->ray_query_shadow_bos[idx][bucket]);
if (bo == NULL) {
struct anv_bo *new_bo;
- VkResult result =
- anv_device_alloc_bo(device, "RT queries scratch",
- offset + (stride << bucket), /* size */
- ANV_BO_ALLOC_INTERNAL |
- ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE, /* alloc_flags */
- 0, /* explicit_address */
- &new_bo);
-
+ VkResult result = anv_device_alloc_bo(device, "RT queries shadow",
+ 1 << shadow_size_log2,
+ ANV_BO_ALLOC_INTERNAL, /* alloc_flags */
+ 0, /* explicit_address */
+ &new_bo);
ANV_DMR_BO_ALLOC(&cmd_buffer->vk.base, new_bo, result);
if (result != VK_SUCCESS) {
anv_batch_set_error(&cmd_buffer->batch, result);
return;
}
- /* Map extra space we added at end of the buffer, we will write the
- * array of RT_DISPATCH_GLOBALS into it so we can use only a single
- * memory address in our shaders for all stacks and globals
- */
- void *map;
- result = anv_device_map_bo(device, new_bo, stride << bucket,
- offset, NULL, &map);
-
- if (result != VK_SUCCESS) {
- ANV_DMR_BO_FREE(&cmd_buffer->vk.base, new_bo);
- anv_device_release_bo(device, new_bo);
- anv_batch_set_error(&cmd_buffer->batch, result);
- return;
- }
-
- anv_genX(device->info, setup_ray_query_globals)(device,
- new_bo,
- stride << bucket,
- map,
- 1 << bucket);
-
-#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
- if (device->physical->memory.need_flush)
- util_flush_inval_range(map, offset);
-#endif
-
- anv_device_unmap_bo(device, new_bo, map, offset, false);
-
- bo = p_atomic_cmpxchg(&device->ray_query_bos[wa_idx][bucket], NULL, new_bo);
+ bo = p_atomic_cmpxchg(&device->ray_query_shadow_bos[idx][bucket], NULL, new_bo);
if (bo != NULL) {
- ANV_DMR_BO_FREE(&cmd_buffer->vk.base, new_bo);
+ ANV_DMR_BO_FREE(&device->vk.base, new_bo);
anv_device_release_bo(device, new_bo);
} else {
bo = new_bo;
}
}
+ cmd_buffer->state.ray_query_shadow_bo = bo;
- /* Add the HW buffer to the list of BO used. */
- anv_reloc_list_add_bo(cmd_buffer->batch.relocs, bo);
-
- cmd_buffer->state.ray_query_globals = (struct anv_address) {
- .bo = bo,
- .offset = (int64_t) (stride << bucket),
- };
-
- cmd_buffer->state.num_ray_query_globals = 1 << bucket;
+      /* Add the shadow buffer to the batch BO list. */
+ anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+ cmd_buffer->state.ray_query_shadow_bo);
}
- /* Update the push constants & mark them dirty. */
+   /* Add the HW buffer to the list of BOs used. */
+ assert(device->ray_query_bo[idx]);
+ anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+ device->ray_query_bo[idx]);
+
+ /* Fill the push constants & mark them dirty. */
+ struct anv_address ray_query_globals_addr =
+ anv_genX(device->info, cmd_buffer_ray_query_globals)(cmd_buffer);
pipeline_state->push_constants.ray_query_globals =
- anv_address_physical(cmd_buffer->state.ray_query_globals);
+ anv_address_physical(ray_query_globals_addr);
cmd_buffer->state.push_constants_dirty |= stages;
pipeline_state->push_constants_data_dirty = true;
}
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 86bb309ec2f..6cb87a9ced8 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -341,16 +341,22 @@ VkResult anv_CreateDevice(
ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
VkResult result;
struct anv_device *device;
+ bool device_has_compute_queue = false;
assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO);
/* Check requested queues and fail if we are requested to create any
* queues with flags we don't support.
*/
- for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++)
+ for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
if (pCreateInfo->pQueueCreateInfos[i].flags & ~VK_DEVICE_QUEUE_CREATE_PROTECTED_BIT)
return vk_error(physical_device, VK_ERROR_INITIALIZATION_FAILED);
+ const struct anv_queue_family *family =
+ &physical_device->queue.families[pCreateInfo->pQueueCreateInfos[i].queueFamilyIndex];
+ device_has_compute_queue |= family->engine_class == INTEL_ENGINE_CLASS_COMPUTE;
+ }
+
device = vk_zalloc2(&physical_device->instance->vk.alloc, pAllocator,
sizeof(*device), 8,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
@@ -780,9 +786,36 @@ VkResult anv_CreateDevice(
device->workaround_bo->size,
INTEL_DEBUG_BLOCK_TYPE_FRAME);
+ if (device->vk.enabled_extensions.KHR_ray_query) {
+ uint32_t ray_queries_size =
+ align(brw_rt_ray_queries_hw_stacks_size(device->info), 4096);
+
+ result = anv_device_alloc_bo(device, "ray queries",
+ ray_queries_size,
+ ANV_BO_ALLOC_INTERNAL,
+ 0 /* explicit_address */,
+ &device->ray_query_bo[0]);
+ ANV_DMR_BO_ALLOC(&device->vk.base, device->ray_query_bo[0], result);
+ if (result != VK_SUCCESS)
+ goto fail_alloc_device_bo;
+
+      /* We need a separate ray query BO for the CCS engine with Wa_14022863161. */
+ if (intel_needs_workaround(device->isl_dev.info, 14022863161) &&
+ device_has_compute_queue) {
+ result = anv_device_alloc_bo(device, "ray queries",
+ ray_queries_size,
+ ANV_BO_ALLOC_INTERNAL,
+ 0 /* explicit_address */,
+ &device->ray_query_bo[1]);
+ ANV_DMR_BO_ALLOC(&device->vk.base, device->ray_query_bo[1], result);
+ if (result != VK_SUCCESS)
+ goto fail_ray_query_bo;
+ }
+ }
+
result = anv_device_init_trivial_batch(device);
if (result != VK_SUCCESS)
- goto fail_alloc_device_bo;
+ goto fail_ray_query_bo;
/* Emit the CPS states before running the initialization batch as those
* structures are referenced.
@@ -1040,6 +1073,13 @@ VkResult anv_CreateDevice(
fail_trivial_batch:
ANV_DMR_BO_FREE(&device->vk.base, device->trivial_batch_bo);
anv_device_release_bo(device, device->trivial_batch_bo);
+ fail_ray_query_bo:
+ for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bo); i++) {
+ if (device->ray_query_bo[i]) {
+ ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bo[i]);
+ anv_device_release_bo(device, device->ray_query_bo[i]);
+ }
+ }
fail_alloc_device_bo:
if (device->mem_fence_bo) {
ANV_DMR_BO_FREE(&device->vk.base, device->mem_fence_bo);
@@ -1191,13 +1231,17 @@ void anv_DestroyDevice(
anv_scratch_pool_finish(device, &device->protected_scratch_pool);
if (device->vk.enabled_extensions.KHR_ray_query) {
- for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bos); i++) {
- for (unsigned j = 0; j < ARRAY_SIZE(device->ray_query_bos[0]); j++) {
- if (device->ray_query_bos[i][j] != NULL) {
- ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bos[i][j]);
- anv_device_release_bo(device, device->ray_query_bos[i][j]);
+ for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bo); i++) {
+ for (unsigned j = 0; j < ARRAY_SIZE(device->ray_query_shadow_bos[0]); j++) {
+ if (device->ray_query_shadow_bos[i][j] != NULL) {
+ ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_shadow_bos[i][j]);
+ anv_device_release_bo(device, device->ray_query_shadow_bos[i][j]);
}
}
+ if (device->ray_query_bo[i]) {
+ ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bo[i]);
+ anv_device_release_bo(device, device->ray_query_bo[i]);
+ }
}
}
ANV_DMR_BO_FREE(&device->vk.base, device->workaround_bo);
diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
index 1dd77b9c69e..5234823b94a 100644
--- a/src/intel/vulkan/anv_genX.h
+++ b/src/intel/vulkan/anv_genX.h
@@ -226,11 +226,7 @@ void genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
void genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer);
-void genX(setup_ray_query_globals)(struct anv_device *device,
- struct anv_bo* bo,
- uint64_t offset,
- void* map,
- uint32_t num_queries);
+struct anv_address genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer);
void genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
uint32_t total_scratch);
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 3a8a79ea6d2..1f481ec0fa2 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -2625,11 +2625,22 @@ struct anv_device {
uint32_t protected_session_id;
- /** Pool of ray query buffers used to communicated with HW unit.
+ /** Shadow ray query BO
+ *
+ * The ray_query_bo only holds the current ray being traced. When using
+ * more than 1 ray query per thread, we cannot fit all the queries in
+    * there, so we need another buffer to hold query data that is not
+ * currently being used by the HW for tracing, similar to a scratch space.
+ *
+ * The size of the shadow buffer depends on the number of queries per
+ * shader.
*
* We might need a buffer per queue family due to Wa_14022863161.
*/
- struct anv_bo *ray_query_bos[2][16];
+ struct anv_bo *ray_query_shadow_bos[2][16];
+   /** Ray query buffer used to communicate with the HW unit. */
+ struct anv_bo *ray_query_bo[2];
struct anv_shader_internal *rt_trampoline;
struct anv_shader_internal *rt_trivial_return;
@@ -4236,19 +4247,10 @@ struct anv_push_constants {
*/
uint32_t surfaces_base_offset;
- /**
- * Pointer to ray query stacks and their associated pairs of
- * RT_DISPATCH_GLOBALS structures (see genX(setup_ray_query_globals))
+ /** Ray query globals
*
- * The pair of globals for each query object are stored counting up from
- * this address in units of BRW_RT_DISPATCH_GLOBALS_ALIGN:
- *
- * rq_globals = ray_query_globals + (rq * BRW_RT_DISPATCH_GLOBALS_ALIGN)
- *
- * The raytracing scratch area for each ray query is stored counting down
- * from this address in units of brw_rt_ray_queries_stacks_stride(devinfo):
- *
- * rq_stacks_addr = ray_query_globals - (rq * ray_queries_stacks_stride)
+    * Pointer to a pair of RT_DISPATCH_GLOBALS structures (see
+ * genX(cmd_buffer_ray_query_globals))
*/
uint64_t ray_query_globals;
@@ -4751,14 +4753,9 @@ struct anv_cmd_state {
unsigned current_hash_scale;
/**
- * Number of ray query buffers allocated.
+ * A buffer used for spill/fill of ray queries.
*/
- uint32_t num_ray_query_globals;
-
- /**
- * Current array of RT_DISPATCH_GLOBALS for ray queries.
- */
- struct anv_address ray_query_globals;
+   struct anv_bo *ray_query_shadow_bo;
/** Pointer to the last emitted COMPUTE_WALKER.
*
diff --git a/src/intel/vulkan/genX_cmd_compute.c b/src/intel/vulkan/genX_cmd_compute.c
index b4f50ef51a1..64ada047dc4 100644
--- a/src/intel/vulkan/genX_cmd_compute.c
+++ b/src/intel/vulkan/genX_cmd_compute.c
@@ -37,7 +37,6 @@
#include "ds/intel_tracepoints.h"
#include "genX_mi_builder.h"
-#include "nir_builder.h"
void
genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
@@ -812,36 +811,45 @@ void genX(CmdDispatchIndirect)(
genX(cmd_buffer_dispatch_indirect)(cmd_buffer, addr, false);
}
-void
-genX(setup_ray_query_globals)(struct anv_device *device,
- struct anv_bo* bo,
- uint64_t offset,
- void* map,
- uint32_t num_queries)
+struct anv_address
+genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
{
#if GFX_VERx10 >= 125
- assert(num_queries > 0);
- uint64_t stack_stride = brw_rt_ray_queries_stacks_stride(device->info);
- uint32_t ids_per_dss = brw_rt_ray_queries_stack_ids_per_dss(device->info);
- for (uint32_t i = 0; i < num_queries; ++i)
- for (uint32_t j = 0; j < 2; j++)
- GENX(RT_DISPATCH_GLOBALS_pack)(NULL,
- (char*) map +
- i * BRW_RT_DISPATCH_GLOBALS_ALIGN +
- j * align(4 * GENX(RT_DISPATCH_GLOBALS_length), 64),
- &(struct GENX(RT_DISPATCH_GLOBALS)) {
- .MemBaseAddress = (struct anv_address) {
- /* The ray query HW computes offsets from the top of the
- * buffer, so set the address at the end of the buffer.
- */
- .bo = bo,
- .offset = offset - i * stack_stride - j * stack_stride / 2,
- },
- .AsyncRTStackSize = BRW_RT_SIZEOF_RAY_QUERY / 64,
- .NumDSSRTStacks = ids_per_dss,
- .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
- .Flags = RT_DEPTH_TEST_LESS_EQUAL,
- });
+ struct anv_device *device = cmd_buffer->device;
+
+ struct anv_state state =
+ anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
+ 2 * align(BRW_RT_DISPATCH_GLOBALS_SIZE, 64),
+ BRW_RT_DISPATCH_GLOBALS_ALIGN);
+ uint32_t stack_ids_per_dss =
+ brw_rt_ray_queries_stack_ids_per_dss(device->info);
+
+ uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
+
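+   /* Two RT_DISPATCH_GLOBALS, one per group of 16 lanes in SIMD32, each
+    * pointing at its own half of the HW ray query buffer.
+    */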
+ for (uint32_t i = 0; i < 2; i++) {
+ const struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
+ .MemBaseAddress = (struct anv_address) {
+ /* The ray query HW computes offsets from the top of the buffer, so
+             * set the address at the end of the buffer.
+ */
+ .bo = device->ray_query_bo[idx],
+ .offset = (i + 1) * (device->ray_query_bo[idx]->size / 2),
+ },
+ .AsyncRTStackSize = BRW_RT_SIZEOF_RAY_QUERY / 64,
+ .NumDSSRTStacks = stack_ids_per_dss,
+ .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
+ .Flags = RT_DEPTH_TEST_LESS_EQUAL,
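+         /* Repurposed to pass the shadow buffer address to ray query
+          * shaders.
+          */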
+ .ResumeShaderTable = (struct anv_address) {
+ .bo = cmd_buffer->state.ray_query_shadow_bo,
+ },
+ };
+ GENX(RT_DISPATCH_GLOBALS_pack)(
+ NULL,
+ state.map + i * align(4 * GENX(RT_DISPATCH_GLOBALS_length), 64),
+ &rtdg);
+ }
+
+ return anv_cmd_buffer_temporary_state_address(cmd_buffer, state);
#else
UNREACHABLE("Not supported");
#endif