diff --git a/src/intel/compiler/brw/brw_nir_lower_ray_queries.c b/src/intel/compiler/brw/brw_nir_lower_ray_queries.c
index 9a9ddbd0364..85bc0f24bed 100644
--- a/src/intel/compiler/brw/brw_nir_lower_ray_queries.c
+++ b/src/intel/compiler/brw/brw_nir_lower_ray_queries.c
@@ -38,8 +38,10 @@ struct lowering_state {
struct hash_table *queries;
uint32_t n_queries;
- struct brw_nir_rt_globals_defs globals;
nir_def *rq_globals;
+
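+   /* Number of sync stack IDs per DSS programmed into the ray query
+    * RT_DISPATCH_GLOBALS (NumDSSRTStacks).
+    */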
+ uint32_t num_dss_rt_stacks;
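+   /* Stride between the per-query HW stack allocations,
+    * brw_rt_ray_queries_stacks_stride().
+    */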
+ uint32_t sync_stacks_stride;
};
struct brw_ray_query {
@@ -50,12 +52,6 @@ struct brw_ray_query {
#define SIZEOF_QUERY_STATE (sizeof(uint32_t))
-static bool
-need_spill_fill(struct lowering_state *state)
-{
- return state->n_queries > 1;
-}
-
/**
* This pass converts opaque RayQuery structures from SPIRV into a vec3 where
* the first 2 elements store a global address for the query and the third
@@ -98,10 +94,8 @@ create_internal_var(struct brw_ray_query *rq, struct lowering_state *state)
NULL);
}
-
-
static nir_def *
-get_ray_query_shadow_addr(nir_builder *b,
+get_ray_query_stack_index(nir_builder *b,
nir_deref_instr *deref,
struct lowering_state *state,
nir_deref_instr **out_state_deref)
@@ -116,35 +110,17 @@ get_ray_query_shadow_addr(nir_builder *b,
struct brw_ray_query *rq = entry->data;
- /* Base address in the shadow memory of the variable associated with this
- * ray query variable.
- */
- nir_def *base_addr =
- nir_iadd_imm(b, state->globals.resume_sbt_addr,
- brw_rt_ray_queries_shadow_stack_size(state->devinfo) * rq->id);
-
- bool spill_fill = need_spill_fill(state);
+ nir_def *query_idx = nir_imm_int(b, rq->id);
*out_state_deref = nir_build_deref_var(b, rq->internal_var);
- if (!spill_fill)
- return NULL;
-
/* Just emit code and let constant-folding go to town */
nir_deref_instr **p = &path.path[1];
for (; *p; p++) {
if ((*p)->deref_type == nir_deref_type_array) {
nir_def *index = (*p)->arr.index.ssa;
-
- /**/
*out_state_deref = nir_build_deref_array(b, *out_state_deref, index);
-
- /**/
- uint64_t size = MAX2(1, glsl_get_aoa_size((*p)->type)) *
- brw_rt_ray_queries_shadow_stack_size(state->devinfo);
-
- nir_def *mul = nir_amul_imm(b, nir_i2i64(b, index), size);
-
- base_addr = nir_iadd(b, base_addr, mul);
+ index = nir_amul_imm(b, index, MAX2(1, glsl_get_aoa_size((*p)->type)));
+ query_idx = nir_iadd(b, query_idx, index);
} else {
UNREACHABLE("Unsupported deref type");
}
@@ -152,28 +128,7 @@ get_ray_query_shadow_addr(nir_builder *b,
nir_deref_path_finish(&path);
- /* Add the lane offset to the shadow memory address */
- nir_def *lane_offset =
- nir_imul_imm(
- b,
- nir_iadd(
- b,
- nir_imul(
- b,
- brw_load_btd_dss_id(b),
- state->globals.num_dss_rt_stacks),
- brw_nir_rt_sync_stack_id(b)),
- BRW_RT_SIZEOF_SHADOW_RAY_QUERY);
-
- /* Top/bottom 16 lanes each get their own stack area */
- lane_offset = nir_bcsel(
- b,
- nir_ilt_imm(b, nir_load_subgroup_invocation(b), 16),
- lane_offset,
- nir_iadd_imm(b, lane_offset,
- brw_rt_ray_queries_shadow_stack_size(state->devinfo) / 2));
-
- return nir_iadd(b, base_addr, nir_i2i64(b, lane_offset));
+ return query_idx;
}
static void
@@ -209,26 +164,6 @@ update_trace_ctrl_level(nir_builder *b,
}
}
-static void
-fill_query(nir_builder *b,
- nir_def *hw_stack_addr,
- nir_def *shadow_stack_addr,
- nir_def *ctrl)
-{
- brw_nir_memcpy_global(b, hw_stack_addr, 64, shadow_stack_addr, 64,
- BRW_RT_SIZEOF_RAY_QUERY);
-}
-
-static void
-spill_query(nir_builder *b,
- nir_def *hw_stack_addr,
- nir_def *shadow_stack_addr)
-{
- brw_nir_memcpy_global(b, shadow_stack_addr, 64, hw_stack_addr, 64,
- BRW_RT_SIZEOF_RAY_QUERY);
-}
-
-
static void
lower_ray_query_intrinsic(nir_builder *b,
nir_intrinsic_instr *intrin,
@@ -239,12 +174,20 @@ lower_ray_query_intrinsic(nir_builder *b,
b->cursor = nir_instr_remove(&intrin->instr);
nir_deref_instr *ctrl_level_deref;
- nir_def *shadow_stack_addr =
- get_ray_query_shadow_addr(b, deref, state, &ctrl_level_deref);
- nir_def *hw_stack_addr =
- brw_nir_rt_sync_stack_addr(b, state->globals.base_mem_addr,
- state->globals.num_dss_rt_stacks);
- nir_def *stack_addr = shadow_stack_addr ? shadow_stack_addr : hw_stack_addr;
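+   /* Each query gets a pair of RT_DISPATCH_GLOBALS counting up from
+    * rq_globals in units of BRW_RT_DISPATCH_GLOBALS_ALIGN and a HW stack
+    * area counting down from it in units of sync_stacks_stride (see the
+    * anv_push_constants::ray_query_globals documentation).
+    */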
+ nir_def *stack_index =
+ get_ray_query_stack_index(b, deref, state, &ctrl_level_deref);
+ nir_def *rq_globals_addr =
+ nir_iadd(b, state->rq_globals,
+ nir_i2i64(b, nir_amul_imm(b, stack_index,
+ BRW_RT_DISPATCH_GLOBALS_ALIGN)));
+ nir_def *stack_base_addr =
+ nir_isub(b, state->rq_globals,
+ nir_i2i64(b, nir_amul_imm(b, stack_index,
+ state->sync_stacks_stride)));
+ nir_def *stack_addr =
+ brw_nir_rt_sync_stack_addr(b, stack_base_addr,
+ state->num_dss_rt_stacks,
+ state->devinfo);
mesa_shader_stage stage = b->shader->info.stage;
switch (intrin->intrinsic) {
@@ -313,22 +256,12 @@ lower_ray_query_intrinsic(nir_builder *b,
*/
brw_nir_rt_query_mark_done(b, stack_addr);
- if (shadow_stack_addr)
- fill_query(b, hw_stack_addr, shadow_stack_addr, ctrl);
-
- /* Do not use state->rq_globals, we want a uniform value for the
- * tracing call.
- */
- nir_trace_ray_intel(b, nir_load_ray_query_global_intel(b),
- level, ctrl, .synchronous = true);
+ nir_trace_ray_intel(b, rq_globals_addr, level, ctrl, .synchronous = true);
struct brw_nir_rt_mem_hit_defs hit_in = {};
- brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, hw_stack_addr, false,
+ brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, stack_addr, false,
state->devinfo);
- if (shadow_stack_addr)
- spill_query(b, hw_stack_addr, shadow_stack_addr);
-
update_trace_ctrl_level(b, ctrl_level_deref,
NULL, NULL,
nir_imm_int(b, GEN_RT_TRACE_RAY_CONTINUE),
@@ -547,21 +480,17 @@ lower_ray_query_impl(nir_function_impl *impl, struct lowering_state *state)
nir_builder _b, *b = &_b;
_b = nir_builder_at(nir_before_impl(impl));
- nir_def *rq_globals_base = nir_load_ray_query_global_intel(b);
+ state->rq_globals = nir_load_ray_query_global_intel(b);
- /* Use a different global for each 16lanes groups (only in SIMD32). */
- state->rq_globals = nir_bcsel(
- b,
- nir_iand(b,
- nir_ige_imm(b, nir_load_subgroup_invocation(b), 16),
- nir_ieq_imm(b, nir_load_subgroup_size(b), 32)),
- nir_iadd_imm(
- b, rq_globals_base,
- align(4 * RT_DISPATCH_GLOBALS_length(state->devinfo), 64)),
- rq_globals_base);
+ /* ATSM PRMs Vol 9, "State Model for Ray Tracing - RTDispatchGlobals"
+ *
+ * "For Sync Ray tracing (i.e. using RayQueries), SW must allocate
+ * space assuming 2K StackIDs"
+ */
+   /* TODO: can we use a lower value in some cases? */
+   state->num_dss_rt_stacks = 2048;
- brw_nir_rt_load_globals_addr(b, &state->globals, state->rq_globals,
- state->devinfo);
+ state->sync_stacks_stride =
+ brw_rt_ray_queries_stacks_stride(state->devinfo);
nir_foreach_block_safe(block, impl) {
nir_foreach_instr_safe(instr, block) {
diff --git a/src/intel/compiler/brw/brw_nir_rt_builder.h b/src/intel/compiler/brw/brw_nir_rt_builder.h
index d66fa897e4c..2062b24fc7a 100644
--- a/src/intel/compiler/brw/brw_nir_rt_builder.h
+++ b/src/intel/compiler/brw/brw_nir_rt_builder.h
@@ -178,7 +178,8 @@ brw_nir_rt_sw_hotzone_addr(nir_builder *b,
static inline nir_def *
brw_nir_rt_sync_stack_addr(nir_builder *b,
nir_def *base_mem_addr,
- nir_def *num_dss_rt_stacks)
+ uint32_t num_dss_rt_stacks,
+ const struct intel_device_info *devinfo)
{
/* Bspec 47547 (Xe) and 56936 (Xe2+) say:
* For Ray queries (Synchronous Ray Tracing), the formula is similar but
@@ -195,12 +196,29 @@ brw_nir_rt_sync_stack_addr(nir_builder *b,
* NUM_SYNC_STACKID_PER_DSS instead.
*/
nir_def *offset32 =
- nir_imul(b,
- nir_iadd(b,
- nir_imul(b, brw_load_btd_dss_id(b),
- num_dss_rt_stacks),
- nir_iadd_imm(b, brw_nir_rt_sync_stack_id(b), 1)),
- nir_imm_int(b, BRW_RT_SIZEOF_RAY_QUERY));
+ nir_imul_imm(b,
+ nir_iadd(b,
+ nir_imul_imm(b, brw_load_btd_dss_id(b),
+ num_dss_rt_stacks),
+ nir_iadd_imm(b, brw_nir_rt_sync_stack_id(b), 1)),
+ BRW_RT_SIZEOF_RAY_QUERY);
+
+   /* StackID offset for the upper 16 lanes (invocations 16-31) in SIMD32.
+    * This must match the offset of the second base address provided by the
+    * driver through the pair of ray query RTDispatchGlobals.
+    */
+ uint32_t simd32_stack_offset =
+ num_dss_rt_stacks * BRW_RT_SIZEOF_RAY_QUERY *
+ intel_device_info_dual_subslice_id_bound(devinfo);
+
+ offset32 =
+ nir_bcsel(b,
+ nir_iand(b,
+ nir_ige_imm(b, nir_load_subgroup_invocation(b), 16),
+ nir_ieq_imm(b, nir_load_subgroup_size(b), 32)),
+ nir_iadd_imm(b, offset32, simd32_stack_offset),
+ offset32);
+
return nir_isub(b, base_mem_addr, nir_u2u64(b, offset32));
}
@@ -300,7 +318,6 @@ struct brw_nir_rt_globals_defs {
nir_def *launch_size;
nir_def *call_sbt_addr;
nir_def *call_sbt_stride;
- nir_def *resume_sbt_addr;
};
static inline void
@@ -368,8 +385,6 @@ brw_nir_rt_load_globals_addr(nir_builder *b,
defs->call_sbt_stride =
nir_iand_imm(b, nir_unpack_32_2x16_split_x(b, nir_channel(b, data, 2)),
0x1fff);
- defs->resume_sbt_addr =
- nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 3));
} else {
defs->call_sbt_addr =
nir_pack_64_2x32_split(b, nir_channel(b, data, 0),
@@ -377,9 +392,6 @@ brw_nir_rt_load_globals_addr(nir_builder *b,
nir_imm_int(b, 0)));
defs->call_sbt_stride =
nir_unpack_32_2x16_split_y(b, nir_channel(b, data, 1));
-
- defs->resume_sbt_addr =
- nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 2));
}
}
diff --git a/src/intel/compiler/brw/brw_rt.h b/src/intel/compiler/brw/brw_rt.h
index 4d5791c88e1..09a0b86af77 100644
--- a/src/intel/compiler/brw/brw_rt.h
+++ b/src/intel/compiler/brw/brw_rt.h
@@ -36,7 +36,7 @@ extern "C" {
#define BRW_RT_SBT_HANDLE_SIZE 32
/** RT_DISPATCH_GLOBALS size (see gen_rt.xml) */
-#define BRW_RT_DISPATCH_GLOBALS_SIZE 80
+#define BRW_RT_DISPATCH_GLOBALS_SIZE 72
/** RT_DISPATCH_GLOBALS alignment
*
@@ -191,10 +191,6 @@ struct brw_rt_raygen_trampoline_params {
(BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS + \
(BRW_RT_MAX_BVH_LEVELS % 2 ? 32 : 0))
-#define BRW_RT_SIZEOF_SHADOW_RAY_QUERY \
- (BRW_RT_SIZEOF_HIT_INFO * 2 + \
- (BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS)
-
#define BRW_RT_SIZEOF_HW_STACK \
(BRW_RT_SIZEOF_HIT_INFO * 2 + \
BRW_RT_SIZEOF_RAY * BRW_RT_MAX_BVH_LEVELS + \
@@ -270,25 +266,15 @@ brw_rt_ray_queries_hw_stacks_size(const struct intel_device_info *devinfo)
}
static inline uint32_t
-brw_rt_ray_queries_shadow_stack_size(const struct intel_device_info *devinfo)
+brw_rt_ray_queries_stacks_offset(uint32_t num_queries)
{
- /* Maximum slice/subslice/EU ID can be computed from the max_scratch_ids
- * which includes all the threads.
- */
- uint32_t max_eu_id = devinfo->max_scratch_ids[MESA_SHADER_COMPUTE];
- uint32_t max_simd_size = 32;
- return max_eu_id * max_simd_size * BRW_RT_SIZEOF_SHADOW_RAY_QUERY;
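+   /* Space reserved after the HW stacks for the array of per-query
+    * RT_DISPATCH_GLOBALS pairs, rounded up to the next power-of-two query
+    * count to match the BO bucketing in anv_cmd_buffer_set_rt_query_buffer().
+    */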
+ return BRW_RT_DISPATCH_GLOBALS_ALIGN << util_logbase2_ceil(num_queries);
}
static inline uint32_t
-brw_rt_ray_queries_shadow_stacks_size(const struct intel_device_info *devinfo,
- uint32_t ray_queries)
+brw_rt_ray_queries_stacks_stride(const struct intel_device_info *devinfo)
{
- /* Don't bother a shadow stack if we only have a single query. We can
- * directly write in the HW buffer.
- */
- return (ray_queries > 1 ? ray_queries : 0) * brw_rt_ray_queries_shadow_stack_size(devinfo) +
- ray_queries * 4; /* Ctrl + Level data */
+ return align(brw_rt_ray_queries_hw_stacks_size(devinfo), 4096);
}
#ifdef __cplusplus
diff --git a/src/intel/genxml/gen125_rt.xml b/src/intel/genxml/gen125_rt.xml
index b23134ae038..6c4298905ae 100644
--- a/src/intel/genxml/gen125_rt.xml
+++ b/src/intel/genxml/gen125_rt.xml
@@ -28,7 +28,6 @@
-
diff --git a/src/intel/genxml/gen300_rt.xml b/src/intel/genxml/gen300_rt.xml
index 7b2bcff39cb..5861f3eacab 100644
--- a/src/intel/genxml/gen300_rt.xml
+++ b/src/intel/genxml/gen300_rt.xml
@@ -36,6 +36,5 @@
-
diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c
index 60789abcbcc..a1da91a8f4b 100644
--- a/src/intel/vulkan/anv_cmd_buffer.c
+++ b/src/intel/vulkan/anv_cmd_buffer.c
@@ -442,58 +442,84 @@ anv_cmd_buffer_set_rt_query_buffer(struct anv_cmd_buffer *cmd_buffer,
uint32_t ray_queries,
VkShaderStageFlags stages)
{
- struct anv_device *device = cmd_buffer->device;
- uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
+ if (ray_queries > cmd_buffer->state.num_ray_query_globals) {
+ struct anv_device *device = cmd_buffer->device;
+ uint8_t wa_idx = anv_get_ray_query_bo_index(cmd_buffer);
- uint64_t ray_shadow_size =
- align64(brw_rt_ray_queries_shadow_stacks_size(device->info, ray_queries),
- 4096);
- if (ray_shadow_size > 0 &&
- (!cmd_buffer->state.ray_query_shadow_bo ||
- cmd_buffer->state.ray_query_shadow_bo->size < ray_shadow_size)) {
- unsigned shadow_size_log2 = MAX2(util_logbase2_ceil(ray_shadow_size), 16);
- unsigned bucket = shadow_size_log2 - 16;
- assert(bucket < ARRAY_SIZE(device->ray_query_shadow_bos[0]));
+ unsigned bucket = util_logbase2_ceil(ray_queries);
+ assert(bucket < ARRAY_SIZE(device->ray_query_bos[0]));
- struct anv_bo *bo = p_atomic_read(&device->ray_query_shadow_bos[idx][bucket]);
+ uint64_t offset = brw_rt_ray_queries_stacks_offset(1 << bucket);
+ uint64_t stride = brw_rt_ray_queries_stacks_stride(device->info);
+
+ struct anv_bo *bo = p_atomic_read(&device->ray_query_bos[wa_idx][bucket]);
if (bo == NULL) {
struct anv_bo *new_bo;
- VkResult result = anv_device_alloc_bo(device, "RT queries shadow",
- 1 << shadow_size_log2,
- ANV_BO_ALLOC_INTERNAL, /* alloc_flags */
- 0, /* explicit_address */
- &new_bo);
+ VkResult result =
+ anv_device_alloc_bo(device, "RT queries scratch",
+ offset + (stride << bucket), /* size */
+ ANV_BO_ALLOC_INTERNAL |
+ ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE, /* alloc_flags */
+ 0, /* explicit_address */
+ &new_bo);
+
ANV_DMR_BO_ALLOC(&cmd_buffer->vk.base, new_bo, result);
if (result != VK_SUCCESS) {
anv_batch_set_error(&cmd_buffer->batch, result);
return;
}
- bo = p_atomic_cmpxchg(&device->ray_query_shadow_bos[idx][bucket], NULL, new_bo);
+         /* Map the extra space we added at the end of the buffer. We will
+          * write the array of RT_DISPATCH_GLOBALS into it so we can use a
+          * single memory address in our shaders for all stacks and globals.
+          */
+ void *map;
+ result = anv_device_map_bo(device, new_bo, stride << bucket,
+ offset, NULL, &map);
+
+ if (result != VK_SUCCESS) {
+ ANV_DMR_BO_FREE(&cmd_buffer->vk.base, new_bo);
+ anv_device_release_bo(device, new_bo);
+ anv_batch_set_error(&cmd_buffer->batch, result);
+ return;
+ }
+
+ anv_genX(device->info, setup_ray_query_globals)(device,
+ new_bo,
+ stride << bucket,
+ map,
+ 1 << bucket);
+
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
+ if (device->physical->memory.need_flush)
+ util_flush_inval_range(map, offset);
+#endif
+
+ anv_device_unmap_bo(device, new_bo, map, offset, false);
+
+ bo = p_atomic_cmpxchg(&device->ray_query_bos[wa_idx][bucket], NULL, new_bo);
if (bo != NULL) {
- ANV_DMR_BO_FREE(&device->vk.base, new_bo);
+ ANV_DMR_BO_FREE(&cmd_buffer->vk.base, new_bo);
anv_device_release_bo(device, new_bo);
} else {
bo = new_bo;
}
}
- cmd_buffer->state.ray_query_shadow_bo = bo;
- /* Add the ray query buffers to the batch list. */
- anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
- cmd_buffer->state.ray_query_shadow_bo);
+      /* Add the HW buffer to the list of BOs used. */
+ anv_reloc_list_add_bo(cmd_buffer->batch.relocs, bo);
+
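+      /* Shaders address both the per-query globals (counting up) and the HW
+       * stacks (counting down) from this single address, which points just
+       * past the (1 << bucket) stack allocations.
+       */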
+ cmd_buffer->state.ray_query_globals = (struct anv_address) {
+ .bo = bo,
+ .offset = (int64_t) (stride << bucket),
+ };
+
+ cmd_buffer->state.num_ray_query_globals = 1 << bucket;
}
- /* Add the HW buffer to the list of BO used. */
- assert(device->ray_query_bo[idx]);
- anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
- device->ray_query_bo[idx]);
-
- /* Fill the push constants & mark them dirty. */
- struct anv_address ray_query_globals_addr =
- anv_genX(device->info, cmd_buffer_ray_query_globals)(cmd_buffer);
+ /* Update the push constants & mark them dirty. */
pipeline_state->push_constants.ray_query_globals =
- anv_address_physical(ray_query_globals_addr);
+ anv_address_physical(cmd_buffer->state.ray_query_globals);
cmd_buffer->state.push_constants_dirty |= stages;
pipeline_state->push_constants_data_dirty = true;
}
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 6cb87a9ced8..86bb309ec2f 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -341,22 +341,16 @@ VkResult anv_CreateDevice(
ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
VkResult result;
struct anv_device *device;
- bool device_has_compute_queue = false;
assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO);
/* Check requested queues and fail if we are requested to create any
* queues with flags we don't support.
*/
- for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
+ for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++)
if (pCreateInfo->pQueueCreateInfos[i].flags & ~VK_DEVICE_QUEUE_CREATE_PROTECTED_BIT)
return vk_error(physical_device, VK_ERROR_INITIALIZATION_FAILED);
- const struct anv_queue_family *family =
- &physical_device->queue.families[pCreateInfo->pQueueCreateInfos[i].queueFamilyIndex];
- device_has_compute_queue |= family->engine_class == INTEL_ENGINE_CLASS_COMPUTE;
- }
-
device = vk_zalloc2(&physical_device->instance->vk.alloc, pAllocator,
sizeof(*device), 8,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
@@ -786,36 +780,9 @@ VkResult anv_CreateDevice(
device->workaround_bo->size,
INTEL_DEBUG_BLOCK_TYPE_FRAME);
- if (device->vk.enabled_extensions.KHR_ray_query) {
- uint32_t ray_queries_size =
- align(brw_rt_ray_queries_hw_stacks_size(device->info), 4096);
-
- result = anv_device_alloc_bo(device, "ray queries",
- ray_queries_size,
- ANV_BO_ALLOC_INTERNAL,
- 0 /* explicit_address */,
- &device->ray_query_bo[0]);
- ANV_DMR_BO_ALLOC(&device->vk.base, device->ray_query_bo[0], result);
- if (result != VK_SUCCESS)
- goto fail_alloc_device_bo;
-
- /* We need a separate ray query bo for CCS engine with Wa_14022863161. */
- if (intel_needs_workaround(device->isl_dev.info, 14022863161) &&
- device_has_compute_queue) {
- result = anv_device_alloc_bo(device, "ray queries",
- ray_queries_size,
- ANV_BO_ALLOC_INTERNAL,
- 0 /* explicit_address */,
- &device->ray_query_bo[1]);
- ANV_DMR_BO_ALLOC(&device->vk.base, device->ray_query_bo[1], result);
- if (result != VK_SUCCESS)
- goto fail_ray_query_bo;
- }
- }
-
result = anv_device_init_trivial_batch(device);
if (result != VK_SUCCESS)
- goto fail_ray_query_bo;
+ goto fail_alloc_device_bo;
/* Emit the CPS states before running the initialization batch as those
* structures are referenced.
@@ -1073,13 +1040,6 @@ VkResult anv_CreateDevice(
fail_trivial_batch:
ANV_DMR_BO_FREE(&device->vk.base, device->trivial_batch_bo);
anv_device_release_bo(device, device->trivial_batch_bo);
- fail_ray_query_bo:
- for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bo); i++) {
- if (device->ray_query_bo[i]) {
- ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bo[i]);
- anv_device_release_bo(device, device->ray_query_bo[i]);
- }
- }
fail_alloc_device_bo:
if (device->mem_fence_bo) {
ANV_DMR_BO_FREE(&device->vk.base, device->mem_fence_bo);
@@ -1231,17 +1191,13 @@ void anv_DestroyDevice(
anv_scratch_pool_finish(device, &device->protected_scratch_pool);
if (device->vk.enabled_extensions.KHR_ray_query) {
- for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bo); i++) {
- for (unsigned j = 0; j < ARRAY_SIZE(device->ray_query_shadow_bos[0]); j++) {
- if (device->ray_query_shadow_bos[i][j] != NULL) {
- ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_shadow_bos[i][j]);
- anv_device_release_bo(device, device->ray_query_shadow_bos[i][j]);
+ for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bos); i++) {
+ for (unsigned j = 0; j < ARRAY_SIZE(device->ray_query_bos[0]); j++) {
+ if (device->ray_query_bos[i][j] != NULL) {
+ ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bos[i][j]);
+ anv_device_release_bo(device, device->ray_query_bos[i][j]);
}
}
- if (device->ray_query_bo[i]) {
- ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bo[i]);
- anv_device_release_bo(device, device->ray_query_bo[i]);
- }
}
}
ANV_DMR_BO_FREE(&device->vk.base, device->workaround_bo);
diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
index 5234823b94a..1dd77b9c69e 100644
--- a/src/intel/vulkan/anv_genX.h
+++ b/src/intel/vulkan/anv_genX.h
@@ -226,7 +226,11 @@ void genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
void genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer);
-struct anv_address genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer);
+void genX(setup_ray_query_globals)(struct anv_device *device,
+ struct anv_bo* bo,
+ uint64_t offset,
+ void* map,
+ uint32_t num_queries);
void genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
uint32_t total_scratch);
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index a33f9e19365..741c7d41b32 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -2625,22 +2625,11 @@ struct anv_device {
uint32_t protected_session_id;
- /** Shadow ray query BO
- *
- * The ray_query_bo only holds the current ray being traced. When using
- * more than 1 ray query per thread, we cannot fit all the queries in
- * there, so we need a another buffer to hold query data that is not
- * currently being used by the HW for tracing, similar to a scratch space.
- *
- * The size of the shadow buffer depends on the number of queries per
- * shader.
+   /** Pool of ray query buffers used to communicate with the HW unit.
*
* We might need a buffer per queue family due to Wa_14022863161.
*/
- struct anv_bo *ray_query_shadow_bos[2][16];
- /** Ray query buffer used to communicated with HW unit.
- */
- struct anv_bo *ray_query_bo[2];
+ struct anv_bo *ray_query_bos[2][16];
struct anv_shader_internal *rt_trampoline;
struct anv_shader_internal *rt_trivial_return;
@@ -4247,10 +4236,19 @@ struct anv_push_constants {
*/
uint32_t surfaces_base_offset;
- /** Ray query globals
+ /**
+ * Pointer to ray query stacks and their associated pairs of
+ * RT_DISPATCH_GLOBALS structures (see genX(setup_ray_query_globals))
*
- * Pointer to a couple of RT_DISPATCH_GLOBALS structures (see
- * genX(cmd_buffer_ray_query_globals))
+    * The pair of globals for each query object is stored counting up from
+ * this address in units of BRW_RT_DISPATCH_GLOBALS_ALIGN:
+ *
+ * rq_globals = ray_query_globals + (rq * BRW_RT_DISPATCH_GLOBALS_ALIGN)
+ *
+ * The raytracing scratch area for each ray query is stored counting down
+ * from this address in units of brw_rt_ray_queries_stacks_stride(devinfo):
+ *
+ * rq_stacks_addr = ray_query_globals - (rq * ray_queries_stacks_stride)
*/
uint64_t ray_query_globals;
@@ -4753,9 +4751,14 @@ struct anv_cmd_state {
unsigned current_hash_scale;
/**
- * A buffer used for spill/fill of ray queries.
+    * Number of ray queries for which stacks and globals have been allocated
+    * in the current buffer.
*/
- struct anv_bo * ray_query_shadow_bo;
+ uint32_t num_ray_query_globals;
+
+ /**
+ * Current array of RT_DISPATCH_GLOBALS for ray queries.
+ */
+ struct anv_address ray_query_globals;
/** Pointer to the last emitted COMPUTE_WALKER.
*
diff --git a/src/intel/vulkan/genX_cmd_compute.c b/src/intel/vulkan/genX_cmd_compute.c
index b7c84498bd2..ad93a8c718d 100644
--- a/src/intel/vulkan/genX_cmd_compute.c
+++ b/src/intel/vulkan/genX_cmd_compute.c
@@ -37,6 +37,7 @@
#include "ds/intel_tracepoints.h"
#include "genX_mi_builder.h"
+#include "nir_builder.h"
void
genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
@@ -811,49 +812,35 @@ void genX(CmdDispatchIndirect)(
genX(cmd_buffer_dispatch_indirect)(cmd_buffer, addr, false);
}
-struct anv_address
-genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
+void
+genX(setup_ray_query_globals)(struct anv_device *device,
+ struct anv_bo* bo,
+ uint64_t offset,
+ void* map,
+ uint32_t num_queries)
{
#if GFX_VERx10 >= 125
- struct anv_device *device = cmd_buffer->device;
-
- struct anv_state state =
- anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
- 2 * align(BRW_RT_DISPATCH_GLOBALS_SIZE, 64),
- BRW_RT_DISPATCH_GLOBALS_ALIGN);
- struct brw_rt_scratch_layout layout;
- uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in
- * some cases?
- */
- brw_rt_compute_scratch_layout(&layout, device->info,
- stack_ids_per_dss, 1 << 10);
-
- uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
-
- for (uint32_t i = 0; i < 2; i++) {
- const struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
- .MemBaseAddress = (struct anv_address) {
- /* The ray query HW computes offsets from the top of the buffer, so
- * let the address at the end of the buffer.
- */
- .bo = device->ray_query_bo[idx],
- .offset = (i + 1) * (device->ray_query_bo[idx]->size / 2),
- },
- .AsyncRTStackSize = layout.ray_stack_stride / 64,
- .NumDSSRTStacks = layout.stack_ids_per_dss,
- .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
- .Flags = RT_DEPTH_TEST_LESS_EQUAL,
- .ResumeShaderTable = (struct anv_address) {
- .bo = cmd_buffer->state.ray_query_shadow_bo,
- },
- };
- GENX(RT_DISPATCH_GLOBALS_pack)(
- NULL,
- state.map + i * align(4 * GENX(RT_DISPATCH_GLOBALS_length), 64),
- &rtdg);
- }
-
- return anv_cmd_buffer_temporary_state_address(cmd_buffer, state);
+ assert(num_queries > 0);
+ uint64_t stack_stride = brw_rt_ray_queries_stacks_stride(device->info);
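+   /* Pack a pair of RT_DISPATCH_GLOBALS per query (one for each SIMD32
+    * half of a subgroup), each with MemBaseAddress pointing at the top of
+    * that query's stack area.
+    */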
+ for (uint32_t i = 0; i < num_queries; ++i)
+ for (uint32_t j = 0; j < 2; j++)
+ GENX(RT_DISPATCH_GLOBALS_pack)(NULL,
+ (char*) map +
+ i * BRW_RT_DISPATCH_GLOBALS_ALIGN +
+ j * align(4 * GENX(RT_DISPATCH_GLOBALS_length), 64),
+ &(struct GENX(RT_DISPATCH_GLOBALS)) {
+ .MemBaseAddress = (struct anv_address) {
+ /* The ray query HW computes offsets from the top of the
+ * buffer, so set the address at the end of the buffer.
+ */
+ .bo = bo,
+ .offset = offset - i * stack_stride - j * stack_stride / 2,
+ },
+ .AsyncRTStackSize = BRW_RT_SIZEOF_RAY_QUERY / 64,
+                    .NumDSSRTStacks = 2048, /* TODO: can we use a lower value? */
+ .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
+ .Flags = RT_DEPTH_TEST_LESS_EQUAL,
+ });
#else
UNREACHABLE("Not supported");
#endif