anv,brw: Allow multiple ray queries without spilling to a shadow stack

Allows a shader to use multiple ray queries without spilling them to a shadow
stack. Instead, the driver provides the shader with an array of
RTDispatchGlobals structs (a pair per query) so that each query gets its own
dedicated stack.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38778>
Calder Young, 2025-12-02 20:26:49 -08:00 (committed by Marge Bot)
parent 0291aa3e71
commit 1f1de7ebd6
10 changed files with 182 additions and 281 deletions
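
A rough sketch of the addressing scheme this commit sets up (illustrative helpers, not code from the commit): the ray_query_globals push constant points at an array of RT_DISPATCH_GLOBALS pairs counting upward, while each query's stack area is found counting downward in units of the per-query stack stride. BRW_RT_DISPATCH_GLOBALS_ALIGN and brw_rt_ray_queries_stacks_stride() are the real constant/helper touched further down; the struct and function names here are hypothetical.

#include <stdint.h>

/* Hypothetical layout description, for illustration only. */
struct rq_layout {
   uint64_t ray_query_globals; /* push constant: base of the globals array */
   uint32_t globals_align;     /* BRW_RT_DISPATCH_GLOBALS_ALIGN */
   uint32_t stacks_stride;     /* brw_rt_ray_queries_stacks_stride(devinfo) */
};

/* RT_DISPATCH_GLOBALS pair for ray query `rq`, counting up from the base. */
static inline uint64_t
rq_globals_addr(const struct rq_layout *l, uint32_t rq)
{
   return l->ray_query_globals + (uint64_t)rq * l->globals_align;
}

/* Dedicated stack area for ray query `rq`, counting down from the base. */
static inline uint64_t
rq_stack_base_addr(const struct rq_layout *l, uint32_t rq)
{
   return l->ray_query_globals - (uint64_t)rq * l->stacks_stride;
}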


@ -38,8 +38,10 @@ struct lowering_state {
struct hash_table *queries;
uint32_t n_queries;
struct brw_nir_rt_globals_defs globals;
nir_def *rq_globals;
uint32_t num_dss_rt_stacks;
uint32_t sync_stacks_stride;
};
struct brw_ray_query {
@ -50,12 +52,6 @@ struct brw_ray_query {
#define SIZEOF_QUERY_STATE (sizeof(uint32_t))
static bool
need_spill_fill(struct lowering_state *state)
{
return state->n_queries > 1;
}
/**
* This pass converts opaque RayQuery structures from SPIRV into a vec3 where
* the first 2 elements store a global address for the query and the third
@ -98,10 +94,8 @@ create_internal_var(struct brw_ray_query *rq, struct lowering_state *state)
NULL);
}
static nir_def *
get_ray_query_shadow_addr(nir_builder *b,
get_ray_query_stack_index(nir_builder *b,
nir_deref_instr *deref,
struct lowering_state *state,
nir_deref_instr **out_state_deref)
@ -116,35 +110,17 @@ get_ray_query_shadow_addr(nir_builder *b,
struct brw_ray_query *rq = entry->data;
/* Base address in the shadow memory of the variable associated with this
* ray query variable.
*/
nir_def *base_addr =
nir_iadd_imm(b, state->globals.resume_sbt_addr,
brw_rt_ray_queries_shadow_stack_size(state->devinfo) * rq->id);
bool spill_fill = need_spill_fill(state);
nir_def *query_idx = nir_imm_int(b, rq->id);
*out_state_deref = nir_build_deref_var(b, rq->internal_var);
if (!spill_fill)
return NULL;
/* Just emit code and let constant-folding go to town */
nir_deref_instr **p = &path.path[1];
for (; *p; p++) {
if ((*p)->deref_type == nir_deref_type_array) {
nir_def *index = (*p)->arr.index.ssa;
/**/
*out_state_deref = nir_build_deref_array(b, *out_state_deref, index);
/**/
uint64_t size = MAX2(1, glsl_get_aoa_size((*p)->type)) *
brw_rt_ray_queries_shadow_stack_size(state->devinfo);
nir_def *mul = nir_amul_imm(b, nir_i2i64(b, index), size);
base_addr = nir_iadd(b, base_addr, mul);
index = nir_amul_imm(b, index, MAX2(1, glsl_get_aoa_size((*p)->type)));
query_idx = nir_iadd(b, query_idx, index);
} else {
UNREACHABLE("Unsupported deref type");
}
@ -152,28 +128,7 @@ get_ray_query_shadow_addr(nir_builder *b,
nir_deref_path_finish(&path);
/* Add the lane offset to the shadow memory address */
nir_def *lane_offset =
nir_imul_imm(
b,
nir_iadd(
b,
nir_imul(
b,
brw_load_btd_dss_id(b),
state->globals.num_dss_rt_stacks),
brw_nir_rt_sync_stack_id(b)),
BRW_RT_SIZEOF_SHADOW_RAY_QUERY);
/* Top/bottom 16 lanes each get their own stack area */
lane_offset = nir_bcsel(
b,
nir_ilt_imm(b, nir_load_subgroup_invocation(b), 16),
lane_offset,
nir_iadd_imm(b, lane_offset,
brw_rt_ray_queries_shadow_stack_size(state->devinfo) / 2));
return nir_iadd(b, base_addr, nir_i2i64(b, lane_offset));
return query_idx;
}
static void
@ -209,26 +164,6 @@ update_trace_ctrl_level(nir_builder *b,
}
}
static void
fill_query(nir_builder *b,
nir_def *hw_stack_addr,
nir_def *shadow_stack_addr,
nir_def *ctrl)
{
brw_nir_memcpy_global(b, hw_stack_addr, 64, shadow_stack_addr, 64,
BRW_RT_SIZEOF_RAY_QUERY);
}
static void
spill_query(nir_builder *b,
nir_def *hw_stack_addr,
nir_def *shadow_stack_addr)
{
brw_nir_memcpy_global(b, shadow_stack_addr, 64, hw_stack_addr, 64,
BRW_RT_SIZEOF_RAY_QUERY);
}
static void
lower_ray_query_intrinsic(nir_builder *b,
nir_intrinsic_instr *intrin,
@ -239,12 +174,20 @@ lower_ray_query_intrinsic(nir_builder *b,
b->cursor = nir_instr_remove(&intrin->instr);
nir_deref_instr *ctrl_level_deref;
nir_def *shadow_stack_addr =
get_ray_query_shadow_addr(b, deref, state, &ctrl_level_deref);
nir_def *hw_stack_addr =
brw_nir_rt_sync_stack_addr(b, state->globals.base_mem_addr,
state->globals.num_dss_rt_stacks);
nir_def *stack_addr = shadow_stack_addr ? shadow_stack_addr : hw_stack_addr;
nir_def *stack_index =
get_ray_query_stack_index(b, deref, state, &ctrl_level_deref);
nir_def *rq_globals_addr =
nir_iadd(b, state->rq_globals,
nir_i2i64(b, nir_amul_imm(b, stack_index,
BRW_RT_DISPATCH_GLOBALS_ALIGN)));
nir_def *stack_base_addr =
nir_isub(b, state->rq_globals,
nir_i2i64(b, nir_amul_imm(b, stack_index,
state->sync_stacks_stride)));
nir_def *stack_addr =
brw_nir_rt_sync_stack_addr(b, stack_base_addr,
state->num_dss_rt_stacks,
state->devinfo);
mesa_shader_stage stage = b->shader->info.stage;
switch (intrin->intrinsic) {
@ -313,22 +256,12 @@ lower_ray_query_intrinsic(nir_builder *b,
*/
brw_nir_rt_query_mark_done(b, stack_addr);
if (shadow_stack_addr)
fill_query(b, hw_stack_addr, shadow_stack_addr, ctrl);
/* Do not use state->rq_globals, we want a uniform value for the
* tracing call.
*/
nir_trace_ray_intel(b, nir_load_ray_query_global_intel(b),
level, ctrl, .synchronous = true);
nir_trace_ray_intel(b, rq_globals_addr, level, ctrl, .synchronous = true);
struct brw_nir_rt_mem_hit_defs hit_in = {};
brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, hw_stack_addr, false,
brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, stack_addr, false,
state->devinfo);
if (shadow_stack_addr)
spill_query(b, hw_stack_addr, shadow_stack_addr);
update_trace_ctrl_level(b, ctrl_level_deref,
NULL, NULL,
nir_imm_int(b, GEN_RT_TRACE_RAY_CONTINUE),
@ -547,21 +480,17 @@ lower_ray_query_impl(nir_function_impl *impl, struct lowering_state *state)
nir_builder _b, *b = &_b;
_b = nir_builder_at(nir_before_impl(impl));
nir_def *rq_globals_base = nir_load_ray_query_global_intel(b);
state->rq_globals = nir_load_ray_query_global_intel(b);
/* Use a different global for each 16lanes groups (only in SIMD32). */
state->rq_globals = nir_bcsel(
b,
nir_iand(b,
nir_ige_imm(b, nir_load_subgroup_invocation(b), 16),
nir_ieq_imm(b, nir_load_subgroup_size(b), 32)),
nir_iadd_imm(
b, rq_globals_base,
align(4 * RT_DISPATCH_GLOBALS_length(state->devinfo), 64)),
rq_globals_base);
/* ATSM PRMs Vol 9, "State Model for Ray Tracing - RTDispatchGlobals"
*
* "For Sync Ray tracing (i.e. using RayQueries), SW must allocate
* space assuming 2K StackIDs"
*/
state->num_dss_rt_stacks = 2048; /* TODO */
brw_nir_rt_load_globals_addr(b, &state->globals, state->rq_globals,
state->devinfo);
state->sync_stacks_stride =
brw_rt_ray_queries_stacks_stride(state->devinfo);
nir_foreach_block_safe(block, impl) {
nir_foreach_instr_safe(instr, block) {


@ -178,7 +178,8 @@ brw_nir_rt_sw_hotzone_addr(nir_builder *b,
static inline nir_def *
brw_nir_rt_sync_stack_addr(nir_builder *b,
nir_def *base_mem_addr,
nir_def *num_dss_rt_stacks)
uint32_t num_dss_rt_stacks,
const struct intel_device_info *devinfo)
{
/* Bspec 47547 (Xe) and 56936 (Xe2+) say:
* For Ray queries (Synchronous Ray Tracing), the formula is similar but
@ -195,12 +196,29 @@ brw_nir_rt_sync_stack_addr(nir_builder *b,
* NUM_SYNC_STACKID_PER_DSS instead.
*/
nir_def *offset32 =
nir_imul(b,
nir_iadd(b,
nir_imul(b, brw_load_btd_dss_id(b),
num_dss_rt_stacks),
nir_iadd_imm(b, brw_nir_rt_sync_stack_id(b), 1)),
nir_imm_int(b, BRW_RT_SIZEOF_RAY_QUERY));
nir_imul_imm(b,
nir_iadd(b,
nir_imul_imm(b, brw_load_btd_dss_id(b),
num_dss_rt_stacks),
nir_iadd_imm(b, brw_nir_rt_sync_stack_id(b), 1)),
BRW_RT_SIZEOF_RAY_QUERY);
/* StackID offset for the second group of 16 lanes in SIMD32; this must match
* the offset of the second base address provided by the driver through the
* pair of ray query RTDispatchGlobals.
*/
uint32_t simd32_stack_offset =
num_dss_rt_stacks * BRW_RT_SIZEOF_RAY_QUERY *
intel_device_info_dual_subslice_id_bound(devinfo);
offset32 =
nir_bcsel(b,
nir_iand(b,
nir_ige_imm(b, nir_load_subgroup_invocation(b), 16),
nir_ieq_imm(b, nir_load_subgroup_size(b), 32)),
nir_iadd_imm(b, offset32, simd32_stack_offset),
offset32);
return nir_isub(b, base_mem_addr, nir_u2u64(b, offset32));
}
@ -300,7 +318,6 @@ struct brw_nir_rt_globals_defs {
nir_def *launch_size;
nir_def *call_sbt_addr;
nir_def *call_sbt_stride;
nir_def *resume_sbt_addr;
};
static inline void
@ -368,8 +385,6 @@ brw_nir_rt_load_globals_addr(nir_builder *b,
defs->call_sbt_stride =
nir_iand_imm(b, nir_unpack_32_2x16_split_x(b, nir_channel(b, data, 2)),
0x1fff);
defs->resume_sbt_addr =
nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 3));
} else {
defs->call_sbt_addr =
nir_pack_64_2x32_split(b, nir_channel(b, data, 0),
@ -377,9 +392,6 @@ brw_nir_rt_load_globals_addr(nir_builder *b,
nir_imm_int(b, 0)));
defs->call_sbt_stride =
nir_unpack_32_2x16_split_y(b, nir_channel(b, data, 1));
defs->resume_sbt_addr =
nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 2));
}
}
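
The stack addressing that brw_nir_rt_sync_stack_addr() emits above can be summarized by this CPU-side sketch (illustrative only; the parameter names stand in for the NIR values and device constants used in the real code):

#include <stdbool.h>
#include <stdint.h>

/* Each DSS owns num_dss_rt_stacks stacks of BRW_RT_SIZEOF_RAY_QUERY bytes
 * growing downward from base_mem_addr. Lanes 16-31 of a SIMD32 subgroup use a
 * second block located dual_subslice_id_bound DSSes further down, which must
 * line up with the second RT_DISPATCH_GLOBALS base address programmed by the
 * driver. */
static uint64_t
sync_stack_addr_sketch(uint64_t base_mem_addr,
                       uint32_t dss_id,
                       uint32_t stack_id,
                       uint32_t num_dss_rt_stacks,
                       uint32_t sizeof_ray_query,       /* BRW_RT_SIZEOF_RAY_QUERY */
                       uint32_t dual_subslice_id_bound, /* intel_device_info_dual_subslice_id_bound() */
                       bool simd32_second_half)         /* subgroup invocation >= 16 at size 32 */
{
   uint32_t offset32 =
      (dss_id * num_dss_rt_stacks + stack_id + 1) * sizeof_ray_query;

   if (simd32_second_half)
      offset32 += num_dss_rt_stacks * sizeof_ray_query * dual_subslice_id_bound;

   return base_mem_addr - offset32;
}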


@ -36,7 +36,7 @@ extern "C" {
#define BRW_RT_SBT_HANDLE_SIZE 32
/** RT_DISPATCH_GLOBALS size (see gen_rt.xml) */
#define BRW_RT_DISPATCH_GLOBALS_SIZE 80
#define BRW_RT_DISPATCH_GLOBALS_SIZE 72
/** RT_DISPATCH_GLOBALS alignment
*
@ -191,10 +191,6 @@ struct brw_rt_raygen_trampoline_params {
(BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS + \
(BRW_RT_MAX_BVH_LEVELS % 2 ? 32 : 0))
#define BRW_RT_SIZEOF_SHADOW_RAY_QUERY \
(BRW_RT_SIZEOF_HIT_INFO * 2 + \
(BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS)
#define BRW_RT_SIZEOF_HW_STACK \
(BRW_RT_SIZEOF_HIT_INFO * 2 + \
BRW_RT_SIZEOF_RAY * BRW_RT_MAX_BVH_LEVELS + \
@ -270,25 +266,15 @@ brw_rt_ray_queries_hw_stacks_size(const struct intel_device_info *devinfo)
}
static inline uint32_t
brw_rt_ray_queries_shadow_stack_size(const struct intel_device_info *devinfo)
brw_rt_ray_queries_stacks_offset(uint32_t num_queries)
{
/* Maximum slice/subslice/EU ID can be computed from the max_scratch_ids
* which includes all the threads.
*/
uint32_t max_eu_id = devinfo->max_scratch_ids[MESA_SHADER_COMPUTE];
uint32_t max_simd_size = 32;
return max_eu_id * max_simd_size * BRW_RT_SIZEOF_SHADOW_RAY_QUERY;
return BRW_RT_DISPATCH_GLOBALS_ALIGN << util_logbase2_ceil(num_queries);
}
static inline uint32_t
brw_rt_ray_queries_shadow_stacks_size(const struct intel_device_info *devinfo,
uint32_t ray_queries)
brw_rt_ray_queries_stacks_stride(const struct intel_device_info *devinfo)
{
/* Don't bother a shadow stack if we only have a single query. We can
* directly write in the HW buffer.
*/
return (ray_queries > 1 ? ray_queries : 0) * brw_rt_ray_queries_shadow_stack_size(devinfo) +
ray_queries * 4; /* Ctrl + Level data */
return align(brw_rt_ray_queries_hw_stacks_size(devinfo), 4096);
}
#ifdef __cplusplus
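
A small usage sketch of the two helpers above (assuming this header plus util/u_math.h and assert.h are in scope; the function is illustrative, not part of the commit):

/* With 3 ray queries the allocation rounds up to the next power of two (4):
 * the globals array then needs 4 * BRW_RT_DISPATCH_GLOBALS_ALIGN bytes and
 * each query's stack region is one page-aligned HW stacks area. */
static inline void
ray_query_layout_example(const struct intel_device_info *devinfo)
{
   uint32_t num_queries = 1u << util_logbase2_ceil(3); /* 4 */

   assert(brw_rt_ray_queries_stacks_offset(num_queries) ==
          num_queries * BRW_RT_DISPATCH_GLOBALS_ALIGN);
   assert(brw_rt_ray_queries_stacks_stride(devinfo) % 4096 == 0);
}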


@ -28,7 +28,6 @@
<field name="Launch Height" dword="14" bits="31:0" type="uint" />
<field name="Launch Depth" dword="15" bits="31:0" type="uint" />
<field name="Callable Group Table" dword="16" bits="63:0" type="RT_SHADER_TABLE" />
<field name="Resume Shader Table" dword="18" bits="63:0" type="address" />
</struct>
<struct name="RT_GENERAL_SBT_HANDLE" length="8">
<field name="General" dword="0" bits="63:0" type="BINDLESS_SHADER_RECORD" />


@ -36,6 +36,5 @@
<field name="Launch Depth" dword="15" bits="31:0" type="uint" />
<field name="Callable Group Table" dword="16" bits="63:0" type="address" />
<field name="Callable Group Stride" dword="18" bits="12:0" type="uint" />
<field name="Resume Shader Table" dword="19" bits="63:0" type="address" />
</struct>
</genxml>


@ -442,58 +442,84 @@ anv_cmd_buffer_set_rt_query_buffer(struct anv_cmd_buffer *cmd_buffer,
uint32_t ray_queries,
VkShaderStageFlags stages)
{
struct anv_device *device = cmd_buffer->device;
uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
if (ray_queries > cmd_buffer->state.num_ray_query_globals) {
struct anv_device *device = cmd_buffer->device;
uint8_t wa_idx = anv_get_ray_query_bo_index(cmd_buffer);
uint64_t ray_shadow_size =
align64(brw_rt_ray_queries_shadow_stacks_size(device->info, ray_queries),
4096);
if (ray_shadow_size > 0 &&
(!cmd_buffer->state.ray_query_shadow_bo ||
cmd_buffer->state.ray_query_shadow_bo->size < ray_shadow_size)) {
unsigned shadow_size_log2 = MAX2(util_logbase2_ceil(ray_shadow_size), 16);
unsigned bucket = shadow_size_log2 - 16;
assert(bucket < ARRAY_SIZE(device->ray_query_shadow_bos[0]));
unsigned bucket = util_logbase2_ceil(ray_queries);
assert(bucket < ARRAY_SIZE(device->ray_query_bos[0]));
struct anv_bo *bo = p_atomic_read(&device->ray_query_shadow_bos[idx][bucket]);
uint64_t offset = brw_rt_ray_queries_stacks_offset(1 << bucket);
uint64_t stride = brw_rt_ray_queries_stacks_stride(device->info);
struct anv_bo *bo = p_atomic_read(&device->ray_query_bos[wa_idx][bucket]);
if (bo == NULL) {
struct anv_bo *new_bo;
VkResult result = anv_device_alloc_bo(device, "RT queries shadow",
1 << shadow_size_log2,
ANV_BO_ALLOC_INTERNAL, /* alloc_flags */
0, /* explicit_address */
&new_bo);
VkResult result =
anv_device_alloc_bo(device, "RT queries scratch",
offset + (stride << bucket), /* size */
ANV_BO_ALLOC_INTERNAL |
ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE, /* alloc_flags */
0, /* explicit_address */
&new_bo);
ANV_DMR_BO_ALLOC(&cmd_buffer->vk.base, new_bo, result);
if (result != VK_SUCCESS) {
anv_batch_set_error(&cmd_buffer->batch, result);
return;
}
bo = p_atomic_cmpxchg(&device->ray_query_shadow_bos[idx][bucket], NULL, new_bo);
/* Map the extra space we added at the end of the buffer; we will write the
* array of RT_DISPATCH_GLOBALS into it so that the shaders only need a
* single memory address for all stacks and globals.
*/
void *map;
result = anv_device_map_bo(device, new_bo, stride << bucket,
offset, NULL, &map);
if (result != VK_SUCCESS) {
ANV_DMR_BO_FREE(&cmd_buffer->vk.base, new_bo);
anv_device_release_bo(device, new_bo);
anv_batch_set_error(&cmd_buffer->batch, result);
return;
}
anv_genX(device->info, setup_ray_query_globals)(device,
new_bo,
stride << bucket,
map,
1 << bucket);
#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
if (device->physical->memory.need_flush)
util_flush_inval_range(map, offset);
#endif
anv_device_unmap_bo(device, new_bo, map, offset, false);
bo = p_atomic_cmpxchg(&device->ray_query_bos[wa_idx][bucket], NULL, new_bo);
if (bo != NULL) {
ANV_DMR_BO_FREE(&device->vk.base, new_bo);
ANV_DMR_BO_FREE(&cmd_buffer->vk.base, new_bo);
anv_device_release_bo(device, new_bo);
} else {
bo = new_bo;
}
}
cmd_buffer->state.ray_query_shadow_bo = bo;
/* Add the ray query buffers to the batch list. */
anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
cmd_buffer->state.ray_query_shadow_bo);
/* Add the HW buffer to the list of BO used. */
anv_reloc_list_add_bo(cmd_buffer->batch.relocs, bo);
cmd_buffer->state.ray_query_globals = (struct anv_address) {
.bo = bo,
.offset = (int64_t) (stride << bucket),
};
cmd_buffer->state.num_ray_query_globals = 1 << bucket;
}
/* Add the HW buffer to the list of BO used. */
assert(device->ray_query_bo[idx]);
anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
device->ray_query_bo[idx]);
/* Fill the push constants & mark them dirty. */
struct anv_address ray_query_globals_addr =
anv_genX(device->info, cmd_buffer_ray_query_globals)(cmd_buffer);
/* Update the push constants & mark them dirty. */
pipeline_state->push_constants.ray_query_globals =
anv_address_physical(ray_query_globals_addr);
anv_address_physical(cmd_buffer->state.ray_query_globals);
cmd_buffer->state.push_constants_dirty |= stages;
pipeline_state->push_constants_data_dirty = true;
}
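
To summarize the allocation above: one BO per bucket holds all the ray query stacks followed by the RT_DISPATCH_GLOBALS array, and the shader only receives the address of the globals array, from which everything else is derived. A minimal sizing sketch reusing the brw_rt.h helpers added earlier in this diff (the function name is illustrative):

/* Mirrors the sizing in anv_cmd_buffer_set_rt_query_buffer():
 *
 *   0                                   stride << bucket                end
 *   [ stacks, query (1<<bucket)-1 | ... | stacks, query 0 ][ globals array ]
 *
 * The ray_query_globals push constant points at `stride << bucket`, i.e. the
 * start of the globals array, just above query 0's stack region. */
static uint64_t
ray_query_bo_size_sketch(const struct intel_device_info *devinfo,
                         uint32_t ray_queries)
{
   unsigned bucket = util_logbase2_ceil(ray_queries);
   uint64_t stride = brw_rt_ray_queries_stacks_stride(devinfo);
   uint64_t globals_size = brw_rt_ray_queries_stacks_offset(1u << bucket);

   return (stride << bucket) + globals_size;
}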


@ -341,22 +341,16 @@ VkResult anv_CreateDevice(
ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
VkResult result;
struct anv_device *device;
bool device_has_compute_queue = false;
assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO);
/* Check requested queues and fail if we are requested to create any
* queues with flags we don't support.
*/
for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++)
if (pCreateInfo->pQueueCreateInfos[i].flags & ~VK_DEVICE_QUEUE_CREATE_PROTECTED_BIT)
return vk_error(physical_device, VK_ERROR_INITIALIZATION_FAILED);
const struct anv_queue_family *family =
&physical_device->queue.families[pCreateInfo->pQueueCreateInfos[i].queueFamilyIndex];
device_has_compute_queue |= family->engine_class == INTEL_ENGINE_CLASS_COMPUTE;
}
device = vk_zalloc2(&physical_device->instance->vk.alloc, pAllocator,
sizeof(*device), 8,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
@ -786,36 +780,9 @@ VkResult anv_CreateDevice(
device->workaround_bo->size,
INTEL_DEBUG_BLOCK_TYPE_FRAME);
if (device->vk.enabled_extensions.KHR_ray_query) {
uint32_t ray_queries_size =
align(brw_rt_ray_queries_hw_stacks_size(device->info), 4096);
result = anv_device_alloc_bo(device, "ray queries",
ray_queries_size,
ANV_BO_ALLOC_INTERNAL,
0 /* explicit_address */,
&device->ray_query_bo[0]);
ANV_DMR_BO_ALLOC(&device->vk.base, device->ray_query_bo[0], result);
if (result != VK_SUCCESS)
goto fail_alloc_device_bo;
/* We need a separate ray query bo for CCS engine with Wa_14022863161. */
if (intel_needs_workaround(device->isl_dev.info, 14022863161) &&
device_has_compute_queue) {
result = anv_device_alloc_bo(device, "ray queries",
ray_queries_size,
ANV_BO_ALLOC_INTERNAL,
0 /* explicit_address */,
&device->ray_query_bo[1]);
ANV_DMR_BO_ALLOC(&device->vk.base, device->ray_query_bo[1], result);
if (result != VK_SUCCESS)
goto fail_ray_query_bo;
}
}
result = anv_device_init_trivial_batch(device);
if (result != VK_SUCCESS)
goto fail_ray_query_bo;
goto fail_alloc_device_bo;
/* Emit the CPS states before running the initialization batch as those
* structures are referenced.
@ -1073,13 +1040,6 @@ VkResult anv_CreateDevice(
fail_trivial_batch:
ANV_DMR_BO_FREE(&device->vk.base, device->trivial_batch_bo);
anv_device_release_bo(device, device->trivial_batch_bo);
fail_ray_query_bo:
for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bo); i++) {
if (device->ray_query_bo[i]) {
ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bo[i]);
anv_device_release_bo(device, device->ray_query_bo[i]);
}
}
fail_alloc_device_bo:
if (device->mem_fence_bo) {
ANV_DMR_BO_FREE(&device->vk.base, device->mem_fence_bo);
@ -1231,17 +1191,13 @@ void anv_DestroyDevice(
anv_scratch_pool_finish(device, &device->protected_scratch_pool);
if (device->vk.enabled_extensions.KHR_ray_query) {
for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bo); i++) {
for (unsigned j = 0; j < ARRAY_SIZE(device->ray_query_shadow_bos[0]); j++) {
if (device->ray_query_shadow_bos[i][j] != NULL) {
ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_shadow_bos[i][j]);
anv_device_release_bo(device, device->ray_query_shadow_bos[i][j]);
for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bos); i++) {
for (unsigned j = 0; j < ARRAY_SIZE(device->ray_query_bos[0]); j++) {
if (device->ray_query_bos[i][j] != NULL) {
ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bos[i][j]);
anv_device_release_bo(device, device->ray_query_bos[i][j]);
}
}
if (device->ray_query_bo[i]) {
ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bo[i]);
anv_device_release_bo(device, device->ray_query_bo[i]);
}
}
}
ANV_DMR_BO_FREE(&device->vk.base, device->workaround_bo);


@ -226,7 +226,11 @@ void genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
void genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer);
struct anv_address genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer);
void genX(setup_ray_query_globals)(struct anv_device *device,
struct anv_bo* bo,
uint64_t offset,
void* map,
uint32_t num_queries);
void genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
uint32_t total_scratch);


@ -2625,22 +2625,11 @@ struct anv_device {
uint32_t protected_session_id;
/** Shadow ray query BO
*
* The ray_query_bo only holds the current ray being traced. When using
* more than 1 ray query per thread, we cannot fit all the queries in
* there, so we need a another buffer to hold query data that is not
* currently being used by the HW for tracing, similar to a scratch space.
*
* The size of the shadow buffer depends on the number of queries per
* shader.
/** Pool of ray query buffers used to communicate with the HW unit.
*
* We might need a buffer per queue family due to Wa_14022863161.
*/
struct anv_bo *ray_query_shadow_bos[2][16];
/** Ray query buffer used to communicated with HW unit.
*/
struct anv_bo *ray_query_bo[2];
struct anv_bo *ray_query_bos[2][16];
struct anv_shader_internal *rt_trampoline;
struct anv_shader_internal *rt_trivial_return;
@ -4247,10 +4236,19 @@ struct anv_push_constants {
*/
uint32_t surfaces_base_offset;
/** Ray query globals
/**
* Pointer to ray query stacks and their associated pairs of
* RT_DISPATCH_GLOBALS structures (see genX(setup_ray_query_globals))
*
* Pointer to a couple of RT_DISPATCH_GLOBALS structures (see
* genX(cmd_buffer_ray_query_globals))
* The pair of globals for each query object are stored counting up from
* this address in units of BRW_RT_DISPATCH_GLOBALS_ALIGN:
*
* rq_globals = ray_query_globals + (rq * BRW_RT_DISPATCH_GLOBALS_ALIGN)
*
* The raytracing scratch area for each ray query is stored counting down
* from this address in units of brw_rt_ray_queries_stacks_stride(devinfo):
*
* rq_stacks_addr = ray_query_globals - (rq * ray_queries_stacks_stride)
*/
uint64_t ray_query_globals;
@ -4753,9 +4751,14 @@ struct anv_cmd_state {
unsigned current_hash_scale;
/**
* A buffer used for spill/fill of ray queries.
* Number of ray query buffers allocated.
*/
struct anv_bo * ray_query_shadow_bo;
uint32_t num_ray_query_globals;
/**
* Current array of RT_DISPATCH_GLOBALS for ray queries.
*/
struct anv_address ray_query_globals;
/** Pointer to the last emitted COMPUTE_WALKER.
*


@ -37,6 +37,7 @@
#include "ds/intel_tracepoints.h"
#include "genX_mi_builder.h"
#include "nir_builder.h"
void
genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
@ -811,49 +812,35 @@ void genX(CmdDispatchIndirect)(
genX(cmd_buffer_dispatch_indirect)(cmd_buffer, addr, false);
}
struct anv_address
genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
void
genX(setup_ray_query_globals)(struct anv_device *device,
struct anv_bo* bo,
uint64_t offset,
void* map,
uint32_t num_queries)
{
#if GFX_VERx10 >= 125
struct anv_device *device = cmd_buffer->device;
struct anv_state state =
anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
2 * align(BRW_RT_DISPATCH_GLOBALS_SIZE, 64),
BRW_RT_DISPATCH_GLOBALS_ALIGN);
struct brw_rt_scratch_layout layout;
uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in
* some cases?
*/
brw_rt_compute_scratch_layout(&layout, device->info,
stack_ids_per_dss, 1 << 10);
uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
for (uint32_t i = 0; i < 2; i++) {
const struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
.MemBaseAddress = (struct anv_address) {
/* The ray query HW computes offsets from the top of the buffer, so
* let the address at the end of the buffer.
*/
.bo = device->ray_query_bo[idx],
.offset = (i + 1) * (device->ray_query_bo[idx]->size / 2),
},
.AsyncRTStackSize = layout.ray_stack_stride / 64,
.NumDSSRTStacks = layout.stack_ids_per_dss,
.MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
.Flags = RT_DEPTH_TEST_LESS_EQUAL,
.ResumeShaderTable = (struct anv_address) {
.bo = cmd_buffer->state.ray_query_shadow_bo,
},
};
GENX(RT_DISPATCH_GLOBALS_pack)(
NULL,
state.map + i * align(4 * GENX(RT_DISPATCH_GLOBALS_length), 64),
&rtdg);
}
return anv_cmd_buffer_temporary_state_address(cmd_buffer, state);
assert(num_queries > 0);
uint64_t stack_stride = brw_rt_ray_queries_stacks_stride(device->info);
for (uint32_t i = 0; i < num_queries; ++i)
for (uint32_t j = 0; j < 2; j++)
GENX(RT_DISPATCH_GLOBALS_pack)(NULL,
(char*) map +
i * BRW_RT_DISPATCH_GLOBALS_ALIGN +
j * align(4 * GENX(RT_DISPATCH_GLOBALS_length), 64),
&(struct GENX(RT_DISPATCH_GLOBALS)) {
.MemBaseAddress = (struct anv_address) {
/* The ray query HW computes offsets from the top of the
* buffer, so set the address at the end of the buffer.
*/
.bo = bo,
.offset = offset - i * stack_stride - j * stack_stride / 2,
},
.AsyncRTStackSize = BRW_RT_SIZEOF_RAY_QUERY / 64,
.NumDSSRTStacks = 2048, /* TODO */
.MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
.Flags = RT_DEPTH_TEST_LESS_EQUAL,
});
#else
UNREACHABLE("Not supported");
#endif
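
For completeness, the addresses associated with a single ray query in the per-bucket BO, as programmed by genX(setup_ray_query_globals) above and consumed by the NIR lowering, can be written out as a sketch (struct and parameter names are illustrative; `globals_offset` corresponds to the `offset` argument, i.e. stride << bucket):

#include <stdint.h>

struct rq_addrs {
   uint64_t globals[2];    /* RT_DISPATCH_GLOBALS pair for this query */
   uint64_t stack_base[2]; /* MemBaseAddress of each SIMD32 half */
};

static struct rq_addrs
ray_query_addrs(uint64_t bo_base, uint64_t globals_offset, uint64_t stride,
                uint64_t globals_align, /* BRW_RT_DISPATCH_GLOBALS_ALIGN */
                uint64_t rtdg_size,     /* align(4 * RT_DISPATCH_GLOBALS_length, 64) */
                uint32_t i)
{
   uint64_t globals_base = bo_base + globals_offset + i * globals_align;

   return (struct rq_addrs) {
      .globals    = { globals_base, globals_base + rtdg_size },
      .stack_base = { bo_base + globals_offset - i * stride,
                      bo_base + globals_offset - i * stride - stride / 2 },
   };
}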