anv,brw: Allow multiple ray queries without spilling to a shadow stack
Allow a shader to use multiple ray queries without spilling them to a shadow stack. Instead, the driver provides the shader with an array of RTDispatchGlobals structs, giving each query its own dedicated stack.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38778>
parent 0291aa3e71
commit 1f1de7ebd6

10 changed files with 182 additions and 281 deletions
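The commit replaces the shadow-stack spill/fill scheme with a purely address-based one: the shader receives a single 64-bit base pointer, with the per-query RTDispatchGlobals pairs stored counting up from it and the per-query stack areas counting down from it. A minimal sketch of that arithmetic in C (helper names are hypothetical; this only restates the formulas documented in the anv_push_constants comment in the diff below):

/* Illustrative sketch only, not code from this commit. ray_query_globals
 * marks the boundary between the per-query stack areas (below it) and the
 * array of RTDispatchGlobals pairs (above it). */
static uint64_t
rq_globals_addr(uint64_t ray_query_globals, uint32_t rq)
{
   return ray_query_globals + rq * BRW_RT_DISPATCH_GLOBALS_ALIGN;
}

static uint64_t
rq_stack_base_addr(uint64_t ray_query_globals, uint32_t rq,
                   uint64_t stacks_stride)
{
   return ray_query_globals - rq * stacks_stride;
}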
@@ -38,8 +38,10 @@ struct lowering_state {
    struct hash_table *queries;
    uint32_t n_queries;
 
    struct brw_nir_rt_globals_defs globals;
    nir_def *rq_globals;
 
+   uint32_t num_dss_rt_stacks;
+   uint32_t sync_stacks_stride;
 };
 
 struct brw_ray_query {
@@ -50,12 +52,6 @@ struct brw_ray_query {
 
 #define SIZEOF_QUERY_STATE (sizeof(uint32_t))
 
-static bool
-need_spill_fill(struct lowering_state *state)
-{
-   return state->n_queries > 1;
-}
-
 /**
  * This pass converts opaque RayQuery structures from SPIRV into a vec3 where
  * the first 2 elements store a global address for the query and the third
@@ -98,10 +94,8 @@ create_internal_var(struct brw_ray_query *rq, struct lowering_state *state)
                                NULL);
 }
 
-
-
 static nir_def *
-get_ray_query_shadow_addr(nir_builder *b,
+get_ray_query_stack_index(nir_builder *b,
                           nir_deref_instr *deref,
                           struct lowering_state *state,
                           nir_deref_instr **out_state_deref)
@@ -116,35 +110,17 @@ get_ray_query_shadow_addr(nir_builder *b,
 
    struct brw_ray_query *rq = entry->data;
 
-   /* Base address in the shadow memory of the variable associated with this
-    * ray query variable.
-    */
-   nir_def *base_addr =
-      nir_iadd_imm(b, state->globals.resume_sbt_addr,
-                   brw_rt_ray_queries_shadow_stack_size(state->devinfo) * rq->id);
-
-   bool spill_fill = need_spill_fill(state);
+   nir_def *query_idx = nir_imm_int(b, rq->id);
    *out_state_deref = nir_build_deref_var(b, rq->internal_var);
 
-   if (!spill_fill)
-      return NULL;
-
    /* Just emit code and let constant-folding go to town */
    nir_deref_instr **p = &path.path[1];
    for (; *p; p++) {
       if ((*p)->deref_type == nir_deref_type_array) {
          nir_def *index = (*p)->arr.index.ssa;
 
          /**/
          *out_state_deref = nir_build_deref_array(b, *out_state_deref, index);
 
          /**/
-         uint64_t size = MAX2(1, glsl_get_aoa_size((*p)->type)) *
-                         brw_rt_ray_queries_shadow_stack_size(state->devinfo);
-
-         nir_def *mul = nir_amul_imm(b, nir_i2i64(b, index), size);
-
-         base_addr = nir_iadd(b, base_addr, mul);
+         index = nir_amul_imm(b, index, MAX2(1, glsl_get_aoa_size((*p)->type)));
+         query_idx = nir_iadd(b, query_idx, index);
       } else {
          UNREACHABLE("Unsupported deref type");
       }
@@ -152,28 +128,7 @@ get_ray_query_shadow_addr(nir_builder *b,
 
    nir_deref_path_finish(&path);
 
-   /* Add the lane offset to the shadow memory address */
-   nir_def *lane_offset =
-      nir_imul_imm(
-         b,
-         nir_iadd(
-            b,
-            nir_imul(
-               b,
-               brw_load_btd_dss_id(b),
-               state->globals.num_dss_rt_stacks),
-            brw_nir_rt_sync_stack_id(b)),
-         BRW_RT_SIZEOF_SHADOW_RAY_QUERY);
-
-   /* Top/bottom 16 lanes each get their own stack area */
-   lane_offset = nir_bcsel(
-      b,
-      nir_ilt_imm(b, nir_load_subgroup_invocation(b), 16),
-      lane_offset,
-      nir_iadd_imm(b, lane_offset,
-                   brw_rt_ray_queries_shadow_stack_size(state->devinfo) / 2));
-
-   return nir_iadd(b, base_addr, nir_i2i64(b, lane_offset));
+   return query_idx;
 }
 
 static void
@@ -209,26 +164,6 @@ update_trace_ctrl_level(nir_builder *b,
    }
 }
 
-static void
-fill_query(nir_builder *b,
-           nir_def *hw_stack_addr,
-           nir_def *shadow_stack_addr,
-           nir_def *ctrl)
-{
-   brw_nir_memcpy_global(b, hw_stack_addr, 64, shadow_stack_addr, 64,
-                         BRW_RT_SIZEOF_RAY_QUERY);
-}
-
-static void
-spill_query(nir_builder *b,
-            nir_def *hw_stack_addr,
-            nir_def *shadow_stack_addr)
-{
-   brw_nir_memcpy_global(b, shadow_stack_addr, 64, hw_stack_addr, 64,
-                         BRW_RT_SIZEOF_RAY_QUERY);
-}
-
-
 static void
 lower_ray_query_intrinsic(nir_builder *b,
                           nir_intrinsic_instr *intrin,
@@ -239,12 +174,20 @@ lower_ray_query_intrinsic(nir_builder *b,
    b->cursor = nir_instr_remove(&intrin->instr);
 
    nir_deref_instr *ctrl_level_deref;
-   nir_def *shadow_stack_addr =
-      get_ray_query_shadow_addr(b, deref, state, &ctrl_level_deref);
-   nir_def *hw_stack_addr =
-      brw_nir_rt_sync_stack_addr(b, state->globals.base_mem_addr,
-                                 state->globals.num_dss_rt_stacks);
-   nir_def *stack_addr = shadow_stack_addr ? shadow_stack_addr : hw_stack_addr;
+   nir_def *stack_index =
+      get_ray_query_stack_index(b, deref, state, &ctrl_level_deref);
+   nir_def *rq_globals_addr =
+      nir_iadd(b, state->rq_globals,
+               nir_i2i64(b, nir_amul_imm(b, stack_index,
+                                         BRW_RT_DISPATCH_GLOBALS_ALIGN)));
+   nir_def *stack_base_addr =
+      nir_isub(b, state->rq_globals,
+               nir_i2i64(b, nir_amul_imm(b, stack_index,
+                                         state->sync_stacks_stride)));
+   nir_def *stack_addr =
+      brw_nir_rt_sync_stack_addr(b, stack_base_addr,
+                                 state->num_dss_rt_stacks,
+                                 state->devinfo);
    mesa_shader_stage stage = b->shader->info.stage;
 
    switch (intrin->intrinsic) {
@@ -313,22 +256,12 @@ lower_ray_query_intrinsic(nir_builder *b,
        */
       brw_nir_rt_query_mark_done(b, stack_addr);
 
-      if (shadow_stack_addr)
-         fill_query(b, hw_stack_addr, shadow_stack_addr, ctrl);
-
-      /* Do not use state->rq_globals, we want a uniform value for the
-       * tracing call.
-       */
-      nir_trace_ray_intel(b, nir_load_ray_query_global_intel(b),
-                          level, ctrl, .synchronous = true);
+      nir_trace_ray_intel(b, rq_globals_addr, level, ctrl, .synchronous = true);
 
       struct brw_nir_rt_mem_hit_defs hit_in = {};
-      brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, hw_stack_addr, false,
+      brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, stack_addr, false,
                                         state->devinfo);
 
-      if (shadow_stack_addr)
-         spill_query(b, hw_stack_addr, shadow_stack_addr);
-
       update_trace_ctrl_level(b, ctrl_level_deref,
                               NULL, NULL,
                               nir_imm_int(b, GEN_RT_TRACE_RAY_CONTINUE),
@@ -547,21 +480,17 @@ lower_ray_query_impl(nir_function_impl *impl, struct lowering_state *state)
    nir_builder _b, *b = &_b;
    _b = nir_builder_at(nir_before_impl(impl));
 
-   nir_def *rq_globals_base = nir_load_ray_query_global_intel(b);
-
-   /* Use a different global for each 16lanes groups (only in SIMD32). */
-   state->rq_globals = nir_bcsel(
-      b,
-      nir_iand(b,
-               nir_ige_imm(b, nir_load_subgroup_invocation(b), 16),
-               nir_ieq_imm(b, nir_load_subgroup_size(b), 32)),
-      nir_iadd_imm(
-         b, rq_globals_base,
-         align(4 * RT_DISPATCH_GLOBALS_length(state->devinfo), 64)),
-      rq_globals_base);
+   state->rq_globals = nir_load_ray_query_global_intel(b);
+
+   /* ATSM PRMs Vol 9, "State Model for Ray Tracing - RTDispatchGlobals"
+    *
+    * "For Sync Ray tracing (i.e. using RayQueries), SW must allocate
+    *  space assuming 2K StackIDs"
+    */
+   state->num_dss_rt_stacks = 2048; /* TODO */
 
    brw_nir_rt_load_globals_addr(b, &state->globals, state->rq_globals,
                                 state->devinfo);
+   state->sync_stacks_stride =
+      brw_rt_ray_queries_stacks_stride(state->devinfo);
 
    nir_foreach_block_safe(block, impl) {
       nir_foreach_instr_safe(instr, block) {
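For completeness, the index flattening that get_ray_query_stack_index performs on arrays of ray queries can be modeled in plain C as below. This is a hypothetical standalone model, not driver code: each array deref contributes its index scaled by the number of queries contained in its element type, starting from the variable's base query id (rq->id).

/* Hypothetical model of the deref flattening in get_ray_query_stack_index():
 * indices[i] is the value of the i-th array deref, elem_query_counts[i] the
 * MAX2(1, glsl_get_aoa_size()) of its element type. */
static uint32_t
flatten_query_index(uint32_t base_id, const uint32_t *indices,
                    const uint32_t *elem_query_counts, unsigned depth)
{
   uint32_t query_idx = base_id;
   for (unsigned i = 0; i < depth; i++)
      query_idx += indices[i] * elem_query_counts[i];
   return query_idx;
}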
@@ -178,7 +178,8 @@ brw_nir_rt_sw_hotzone_addr(nir_builder *b,
 static inline nir_def *
 brw_nir_rt_sync_stack_addr(nir_builder *b,
                            nir_def *base_mem_addr,
-                           nir_def *num_dss_rt_stacks)
+                           uint32_t num_dss_rt_stacks,
+                           const struct intel_device_info *devinfo)
 {
    /* Bspec 47547 (Xe) and 56936 (Xe2+) say:
     * For Ray queries (Synchronous Ray Tracing), the formula is similar but
@@ -195,12 +196,29 @@ brw_nir_rt_sync_stack_addr(nir_builder *b,
     * NUM_SYNC_STACKID_PER_DSS instead.
     */
    nir_def *offset32 =
-      nir_imul(b,
-               nir_iadd(b,
-                        nir_imul(b, brw_load_btd_dss_id(b),
-                                 num_dss_rt_stacks),
-                        nir_iadd_imm(b, brw_nir_rt_sync_stack_id(b), 1)),
-               nir_imm_int(b, BRW_RT_SIZEOF_RAY_QUERY));
+      nir_imul_imm(b,
+                   nir_iadd(b,
+                            nir_imul_imm(b, brw_load_btd_dss_id(b),
+                                         num_dss_rt_stacks),
+                            nir_iadd_imm(b, brw_nir_rt_sync_stack_id(b), 1)),
+                   BRW_RT_SIZEOF_RAY_QUERY);
+
+   /* StackID offset for the bottom 16 lanes in SIMD32, this must match the
+    * offset of the second base address provided by the driver through the
+    * pair of ray query RTDispatchGlobals
+    */
+   uint32_t simd32_stack_offset =
+      num_dss_rt_stacks * BRW_RT_SIZEOF_RAY_QUERY *
+      intel_device_info_dual_subslice_id_bound(devinfo);
+
+   offset32 =
+      nir_bcsel(b,
+                nir_iand(b,
+                         nir_ige_imm(b, nir_load_subgroup_invocation(b), 16),
+                         nir_ieq_imm(b, nir_load_subgroup_size(b), 32)),
+                nir_iadd_imm(b, offset32, simd32_stack_offset),
+                offset32);
 
    return nir_isub(b, base_mem_addr, nir_u2u64(b, offset32));
 }
@@ -300,7 +318,6 @@ struct brw_nir_rt_globals_defs {
    nir_def *launch_size;
    nir_def *call_sbt_addr;
    nir_def *call_sbt_stride;
-   nir_def *resume_sbt_addr;
 };
 
 static inline void
@@ -368,8 +385,6 @@ brw_nir_rt_load_globals_addr(nir_builder *b,
       defs->call_sbt_stride =
          nir_iand_imm(b, nir_unpack_32_2x16_split_x(b, nir_channel(b, data, 2)),
                       0x1fff);
-      defs->resume_sbt_addr =
-         nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 3));
    } else {
       defs->call_sbt_addr =
         nir_pack_64_2x32_split(b, nir_channel(b, data, 0),
@@ -377,9 +392,6 @@ brw_nir_rt_load_globals_addr(nir_builder *b,
                                                    nir_imm_int(b, 0)));
       defs->call_sbt_stride =
          nir_unpack_32_2x16_split_y(b, nir_channel(b, data, 1));
-
-      defs->resume_sbt_addr =
-         nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 2));
    }
 }
 
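The NIR built by brw_nir_rt_sync_stack_addr above is easier to read as scalar arithmetic. A sketch, not driver code: the base address points at the end of the stack area, so offsets are subtracted, and in SIMD32 the top 16 lanes are pushed into a second copy of the per-DSS stack range, matching the second base address the driver provides through the pair of RTDispatchGlobals.

/* Scalar restatement of brw_nir_rt_sync_stack_addr(); illustrative only. */
static uint64_t
sync_stack_addr(uint64_t base_mem_addr, uint32_t dss_id, uint32_t stack_id,
                uint32_t num_dss_rt_stacks, uint32_t dss_id_bound,
                uint32_t lane, uint32_t subgroup_size)
{
   uint32_t offset = (dss_id * num_dss_rt_stacks + stack_id + 1) *
                     BRW_RT_SIZEOF_RAY_QUERY;
   /* Bottom/top 16 lanes of a SIMD32 subgroup use disjoint stack ranges. */
   if (subgroup_size == 32 && lane >= 16)
      offset += num_dss_rt_stacks * BRW_RT_SIZEOF_RAY_QUERY * dss_id_bound;
   return base_mem_addr - offset;
}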
@@ -36,7 +36,7 @@ extern "C" {
 #define BRW_RT_SBT_HANDLE_SIZE 32
 
 /** RT_DISPATCH_GLOBALS size (see gen_rt.xml) */
-#define BRW_RT_DISPATCH_GLOBALS_SIZE 80
+#define BRW_RT_DISPATCH_GLOBALS_SIZE 72
 
 /** RT_DISPATCH_GLOBALS alignment
  *
@@ -191,10 +191,6 @@ struct brw_rt_raygen_trampoline_params {
    (BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS + \
    (BRW_RT_MAX_BVH_LEVELS % 2 ? 32 : 0))
 
-#define BRW_RT_SIZEOF_SHADOW_RAY_QUERY \
-   (BRW_RT_SIZEOF_HIT_INFO * 2 + \
-    (BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS)
-
 #define BRW_RT_SIZEOF_HW_STACK \
    (BRW_RT_SIZEOF_HIT_INFO * 2 + \
     BRW_RT_SIZEOF_RAY * BRW_RT_MAX_BVH_LEVELS + \
@@ -270,25 +266,15 @@ brw_rt_ray_queries_hw_stacks_size(const struct intel_device_info *devinfo)
 }
 
 static inline uint32_t
-brw_rt_ray_queries_shadow_stack_size(const struct intel_device_info *devinfo)
+brw_rt_ray_queries_stacks_offset(uint32_t num_queries)
 {
-   /* Maximum slice/subslice/EU ID can be computed from the max_scratch_ids
-    * which includes all the threads.
-    */
-   uint32_t max_eu_id = devinfo->max_scratch_ids[MESA_SHADER_COMPUTE];
-   uint32_t max_simd_size = 32;
-   return max_eu_id * max_simd_size * BRW_RT_SIZEOF_SHADOW_RAY_QUERY;
+   return BRW_RT_DISPATCH_GLOBALS_ALIGN << util_logbase2_ceil(num_queries);
 }
 
 static inline uint32_t
-brw_rt_ray_queries_shadow_stacks_size(const struct intel_device_info *devinfo,
-                                      uint32_t ray_queries)
+brw_rt_ray_queries_stacks_stride(const struct intel_device_info *devinfo)
 {
-   /* Don't bother with a shadow stack if we only have a single query. We can
-    * directly write in the HW buffer.
-    */
-   return (ray_queries > 1 ? ray_queries : 0) * brw_rt_ray_queries_shadow_stack_size(devinfo) +
-          ray_queries * 4; /* Ctrl + Level data */
+   return align(brw_rt_ray_queries_hw_stacks_size(devinfo), 4096);
 }
 
 #ifdef __cplusplus
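Taken together, the two new helpers size the per-bucket ray query BO that anv allocates further down: queries are bucketed to the next power of two, the stacks occupy the low stride-sized slots, and the RTDispatchGlobals array occupies the trailing bytes. A sketch of the sizing, mirroring the allocation code in anv_cmd_buffer_set_rt_query_buffer below rather than defining it:

/* Illustrative sizing of the ray query BO for a shader using num_queries
 * ray queries (mirrors anv_cmd_buffer_set_rt_query_buffer below). */
static uint64_t
ray_query_bo_size(const struct intel_device_info *devinfo,
                  uint32_t num_queries)
{
   const uint32_t bucket = util_logbase2_ceil(num_queries);
   const uint64_t offset = brw_rt_ray_queries_stacks_offset(1u << bucket);
   const uint64_t stride = brw_rt_ray_queries_stacks_stride(devinfo);
   /* Stacks occupy [0, stride << bucket); the array of RT_DISPATCH_GLOBALS
    * pairs lives in the trailing `offset` bytes. */
   return offset + (stride << bucket);
}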
@@ -28,7 +28,6 @@
   <field name="Launch Height" dword="14" bits="31:0" type="uint" />
   <field name="Launch Depth" dword="15" bits="31:0" type="uint" />
   <field name="Callable Group Table" dword="16" bits="63:0" type="RT_SHADER_TABLE" />
-  <field name="Resume Shader Table" dword="18" bits="63:0" type="address" />
 </struct>
 <struct name="RT_GENERAL_SBT_HANDLE" length="8">
   <field name="General" dword="0" bits="63:0" type="BINDLESS_SHADER_RECORD" />
@@ -36,6 +36,5 @@
   <field name="Launch Depth" dword="15" bits="31:0" type="uint" />
   <field name="Callable Group Table" dword="16" bits="63:0" type="address" />
   <field name="Callable Group Stride" dword="18" bits="12:0" type="uint" />
-  <field name="Resume Shader Table" dword="19" bits="63:0" type="address" />
 </struct>
 </genxml>
@@ -442,58 +442,84 @@ anv_cmd_buffer_set_rt_query_buffer(struct anv_cmd_buffer *cmd_buffer,
                                    uint32_t ray_queries,
                                    VkShaderStageFlags stages)
 {
-   struct anv_device *device = cmd_buffer->device;
-   uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
+   if (ray_queries > cmd_buffer->state.num_ray_query_globals) {
+      struct anv_device *device = cmd_buffer->device;
+      uint8_t wa_idx = anv_get_ray_query_bo_index(cmd_buffer);
 
-   uint64_t ray_shadow_size =
-      align64(brw_rt_ray_queries_shadow_stacks_size(device->info, ray_queries),
-              4096);
-   if (ray_shadow_size > 0 &&
-       (!cmd_buffer->state.ray_query_shadow_bo ||
-        cmd_buffer->state.ray_query_shadow_bo->size < ray_shadow_size)) {
-      unsigned shadow_size_log2 = MAX2(util_logbase2_ceil(ray_shadow_size), 16);
-      unsigned bucket = shadow_size_log2 - 16;
-      assert(bucket < ARRAY_SIZE(device->ray_query_shadow_bos[0]));
+      unsigned bucket = util_logbase2_ceil(ray_queries);
+      assert(bucket < ARRAY_SIZE(device->ray_query_bos[0]));
 
-      struct anv_bo *bo = p_atomic_read(&device->ray_query_shadow_bos[idx][bucket]);
+      uint64_t offset = brw_rt_ray_queries_stacks_offset(1 << bucket);
+      uint64_t stride = brw_rt_ray_queries_stacks_stride(device->info);
+
+      struct anv_bo *bo = p_atomic_read(&device->ray_query_bos[wa_idx][bucket]);
       if (bo == NULL) {
         struct anv_bo *new_bo;
-         VkResult result = anv_device_alloc_bo(device, "RT queries shadow",
-                                               1 << shadow_size_log2,
-                                               ANV_BO_ALLOC_INTERNAL, /* alloc_flags */
-                                               0, /* explicit_address */
-                                               &new_bo);
+         VkResult result =
+            anv_device_alloc_bo(device, "RT queries scratch",
+                                offset + (stride << bucket), /* size */
+                                ANV_BO_ALLOC_INTERNAL |
+                                ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE, /* alloc_flags */
+                                0, /* explicit_address */
+                                &new_bo);
 
         ANV_DMR_BO_ALLOC(&cmd_buffer->vk.base, new_bo, result);
         if (result != VK_SUCCESS) {
            anv_batch_set_error(&cmd_buffer->batch, result);
            return;
         }
 
-         bo = p_atomic_cmpxchg(&device->ray_query_shadow_bos[idx][bucket], NULL, new_bo);
+         /* Map extra space we added at the end of the buffer; we will write
+          * the array of RT_DISPATCH_GLOBALS into it so we can use only a
+          * single memory address in our shaders for all stacks and globals.
+          */
+         void *map;
+         result = anv_device_map_bo(device, new_bo, stride << bucket,
+                                    offset, NULL, &map);
+
+         if (result != VK_SUCCESS) {
+            ANV_DMR_BO_FREE(&cmd_buffer->vk.base, new_bo);
+            anv_device_release_bo(device, new_bo);
+            anv_batch_set_error(&cmd_buffer->batch, result);
+            return;
+         }
+
+         anv_genX(device->info, setup_ray_query_globals)(device,
+                                                         new_bo,
+                                                         stride << bucket,
+                                                         map,
+                                                         1 << bucket);
+
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
+         if (device->physical->memory.need_flush)
+            util_flush_inval_range(map, offset);
+#endif
+
+         anv_device_unmap_bo(device, new_bo, map, offset, false);
+
+         bo = p_atomic_cmpxchg(&device->ray_query_bos[wa_idx][bucket], NULL, new_bo);
         if (bo != NULL) {
-            ANV_DMR_BO_FREE(&device->vk.base, new_bo);
+            ANV_DMR_BO_FREE(&cmd_buffer->vk.base, new_bo);
            anv_device_release_bo(device, new_bo);
         } else {
            bo = new_bo;
         }
      }
-      cmd_buffer->state.ray_query_shadow_bo = bo;
 
-      /* Add the ray query buffers to the batch list. */
-      anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
-                            cmd_buffer->state.ray_query_shadow_bo);
+      /* Add the HW buffer to the list of BOs used. */
+      anv_reloc_list_add_bo(cmd_buffer->batch.relocs, bo);
+
+      cmd_buffer->state.ray_query_globals = (struct anv_address) {
+         .bo = bo,
+         .offset = (int64_t) (stride << bucket),
+      };
+
+      cmd_buffer->state.num_ray_query_globals = 1 << bucket;
   }
 
-   /* Add the HW buffer to the list of BOs used. */
-   assert(device->ray_query_bo[idx]);
-   anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
-                         device->ray_query_bo[idx]);
-
-   /* Fill the push constants & mark them dirty. */
-   struct anv_address ray_query_globals_addr =
-      anv_genX(device->info, cmd_buffer_ray_query_globals)(cmd_buffer);
+   /* Update the push constants & mark them dirty. */
    pipeline_state->push_constants.ray_query_globals =
-      anv_address_physical(ray_query_globals_addr);
+      anv_address_physical(cmd_buffer->state.ray_query_globals);
    cmd_buffer->state.push_constants_dirty |= stages;
    pipeline_state->push_constants_data_dirty = true;
 }
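The per-bucket cache above follows the usual lock-free publish pattern: allocate and fully initialize the BO (including writing the RTDispatchGlobals array through the mapping), then publish it with a compare-and-swap and release the copy that lost the race. A generic sketch of the pattern, with a hypothetical create_and_init_bo() helper:

/* Generic sketch of the p_atomic_cmpxchg publish pattern used for
 * device->ray_query_bos[wa_idx][bucket]; create_and_init_bo() is
 * hypothetical. */
static struct anv_bo *
get_or_create_cached_bo(struct anv_device *device, struct anv_bo **slot)
{
   struct anv_bo *bo = p_atomic_read(slot);
   if (bo != NULL)
      return bo;

   struct anv_bo *new_bo = create_and_init_bo(device);
   bo = p_atomic_cmpxchg(slot, NULL, new_bo);
   if (bo != NULL) {
      /* Another thread published first; drop ours and use theirs. */
      anv_device_release_bo(device, new_bo);
      return bo;
   }
   return new_bo;
}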
@@ -341,22 +341,16 @@ VkResult anv_CreateDevice(
    ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
    VkResult result;
    struct anv_device *device;
-   bool device_has_compute_queue = false;
 
    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO);
 
    /* Check requested queues and fail if we are requested to create any
     * queues with flags we don't support.
     */
-   for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
+   for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++)
       if (pCreateInfo->pQueueCreateInfos[i].flags & ~VK_DEVICE_QUEUE_CREATE_PROTECTED_BIT)
          return vk_error(physical_device, VK_ERROR_INITIALIZATION_FAILED);
 
-      const struct anv_queue_family *family =
-         &physical_device->queue.families[pCreateInfo->pQueueCreateInfos[i].queueFamilyIndex];
-      device_has_compute_queue |= family->engine_class == INTEL_ENGINE_CLASS_COMPUTE;
-   }
-
    device = vk_zalloc2(&physical_device->instance->vk.alloc, pAllocator,
                        sizeof(*device), 8,
                        VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
@@ -786,36 +780,9 @@ VkResult anv_CreateDevice(
                          device->workaround_bo->size,
                          INTEL_DEBUG_BLOCK_TYPE_FRAME);
 
-   if (device->vk.enabled_extensions.KHR_ray_query) {
-      uint32_t ray_queries_size =
-         align(brw_rt_ray_queries_hw_stacks_size(device->info), 4096);
-
-      result = anv_device_alloc_bo(device, "ray queries",
-                                   ray_queries_size,
-                                   ANV_BO_ALLOC_INTERNAL,
-                                   0 /* explicit_address */,
-                                   &device->ray_query_bo[0]);
-      ANV_DMR_BO_ALLOC(&device->vk.base, device->ray_query_bo[0], result);
-      if (result != VK_SUCCESS)
-         goto fail_alloc_device_bo;
-
-      /* We need a separate ray query bo for CCS engine with Wa_14022863161. */
-      if (intel_needs_workaround(device->isl_dev.info, 14022863161) &&
-          device_has_compute_queue) {
-         result = anv_device_alloc_bo(device, "ray queries",
-                                      ray_queries_size,
-                                      ANV_BO_ALLOC_INTERNAL,
-                                      0 /* explicit_address */,
-                                      &device->ray_query_bo[1]);
-         ANV_DMR_BO_ALLOC(&device->vk.base, device->ray_query_bo[1], result);
-         if (result != VK_SUCCESS)
-            goto fail_ray_query_bo;
-      }
-   }
-
    result = anv_device_init_trivial_batch(device);
    if (result != VK_SUCCESS)
-      goto fail_ray_query_bo;
+      goto fail_alloc_device_bo;
 
    /* Emit the CPS states before running the initialization batch as those
     * structures are referenced.
@@ -1073,13 +1040,6 @@ VkResult anv_CreateDevice(
 fail_trivial_batch:
    ANV_DMR_BO_FREE(&device->vk.base, device->trivial_batch_bo);
    anv_device_release_bo(device, device->trivial_batch_bo);
-fail_ray_query_bo:
-   for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bo); i++) {
-      if (device->ray_query_bo[i]) {
-         ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bo[i]);
-         anv_device_release_bo(device, device->ray_query_bo[i]);
-      }
-   }
 fail_alloc_device_bo:
    if (device->mem_fence_bo) {
       ANV_DMR_BO_FREE(&device->vk.base, device->mem_fence_bo);
@@ -1231,17 +1191,13 @@ void anv_DestroyDevice(
    anv_scratch_pool_finish(device, &device->protected_scratch_pool);
 
    if (device->vk.enabled_extensions.KHR_ray_query) {
-      for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bo); i++) {
-         for (unsigned j = 0; j < ARRAY_SIZE(device->ray_query_shadow_bos[0]); j++) {
-            if (device->ray_query_shadow_bos[i][j] != NULL) {
-               ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_shadow_bos[i][j]);
-               anv_device_release_bo(device, device->ray_query_shadow_bos[i][j]);
+      for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_bos); i++) {
+         for (unsigned j = 0; j < ARRAY_SIZE(device->ray_query_bos[0]); j++) {
+            if (device->ray_query_bos[i][j] != NULL) {
+               ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bos[i][j]);
+               anv_device_release_bo(device, device->ray_query_bos[i][j]);
             }
         }
-         if (device->ray_query_bo[i]) {
-            ANV_DMR_BO_FREE(&device->vk.base, device->ray_query_bo[i]);
-            anv_device_release_bo(device, device->ray_query_bo[i]);
-         }
      }
   }
    ANV_DMR_BO_FREE(&device->vk.base, device->workaround_bo);
@@ -226,7 +226,11 @@ void genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
 
 void genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer);
 
-struct anv_address genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer);
+void genX(setup_ray_query_globals)(struct anv_device *device,
+                                   struct anv_bo* bo,
+                                   uint64_t offset,
+                                   void* map,
+                                   uint32_t num_queries);
 
 void genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
                                        uint32_t total_scratch);
@@ -2625,22 +2625,11 @@ struct anv_device {
 
    uint32_t protected_session_id;
 
-   /** Shadow ray query BO
-    *
-    * The ray_query_bo only holds the current ray being traced. When using
-    * more than 1 ray query per thread, we cannot fit all the queries in
-    * there, so we need another buffer to hold query data that is not
-    * currently being used by the HW for tracing, similar to a scratch space.
-    *
-    * The size of the shadow buffer depends on the number of queries per
-    * shader.
+   /** Pool of ray query buffers used to communicate with the HW unit.
+    *
+    * We might need a buffer per queue family due to Wa_14022863161.
     */
-   struct anv_bo *ray_query_shadow_bos[2][16];
-   /** Ray query buffer used to communicate with the HW unit.
-    */
-   struct anv_bo *ray_query_bo[2];
+   struct anv_bo *ray_query_bos[2][16];
 
    struct anv_shader_internal *rt_trampoline;
    struct anv_shader_internal *rt_trivial_return;
@@ -4247,10 +4236,19 @@ struct anv_push_constants {
     */
    uint32_t surfaces_base_offset;
 
-   /** Ray query globals
+   /**
+    * Pointer to ray query stacks and their associated pairs of
+    * RT_DISPATCH_GLOBALS structures (see genX(setup_ray_query_globals))
     *
-    * Pointer to a couple of RT_DISPATCH_GLOBALS structures (see
-    * genX(cmd_buffer_ray_query_globals))
+    * The pair of globals for each query object are stored counting up from
+    * this address in units of BRW_RT_DISPATCH_GLOBALS_ALIGN:
+    *
+    *    rq_globals = ray_query_globals + (rq * BRW_RT_DISPATCH_GLOBALS_ALIGN)
+    *
+    * The raytracing scratch area for each ray query is stored counting down
+    * from this address in units of brw_rt_ray_queries_stacks_stride(devinfo):
+    *
+    *    rq_stacks_addr = ray_query_globals - (rq * ray_queries_stacks_stride)
     */
    uint64_t ray_query_globals;
 
@@ -4753,9 +4751,14 @@ struct anv_cmd_state {
    unsigned current_hash_scale;
 
    /**
-    * A buffer used for spill/fill of ray queries.
+    * Number of ray query buffers allocated.
     */
-   struct anv_bo * ray_query_shadow_bo;
+   uint32_t num_ray_query_globals;
+
+   /**
+    * Current array of RT_DISPATCH_GLOBALS for ray queries.
+    */
+   struct anv_address ray_query_globals;
 
    /** Pointer to the last emitted COMPUTE_WALKER.
     *
@@ -37,6 +37,7 @@
 #include "ds/intel_tracepoints.h"
 
 #include "genX_mi_builder.h"
+#include "nir_builder.h"
 
 void
 genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
@@ -811,49 +812,35 @@ void genX(CmdDispatchIndirect)(
    genX(cmd_buffer_dispatch_indirect)(cmd_buffer, addr, false);
 }
 
-struct anv_address
-genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
+void
+genX(setup_ray_query_globals)(struct anv_device *device,
+                              struct anv_bo* bo,
+                              uint64_t offset,
+                              void* map,
+                              uint32_t num_queries)
 {
 #if GFX_VERx10 >= 125
-   struct anv_device *device = cmd_buffer->device;
-
-   struct anv_state state =
-      anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
-                                           2 * align(BRW_RT_DISPATCH_GLOBALS_SIZE, 64),
-                                           BRW_RT_DISPATCH_GLOBALS_ALIGN);
-   struct brw_rt_scratch_layout layout;
-   uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in
-                                       * some cases?
-                                       */
-   brw_rt_compute_scratch_layout(&layout, device->info,
-                                 stack_ids_per_dss, 1 << 10);
-
-   uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
-
-   for (uint32_t i = 0; i < 2; i++) {
-      const struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
-         .MemBaseAddress = (struct anv_address) {
-            /* The ray query HW computes offsets from the top of the buffer, so
-             * set the address at the end of the buffer.
-             */
-            .bo = device->ray_query_bo[idx],
-            .offset = (i + 1) * (device->ray_query_bo[idx]->size / 2),
-         },
-         .AsyncRTStackSize = layout.ray_stack_stride / 64,
-         .NumDSSRTStacks = layout.stack_ids_per_dss,
-         .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
-         .Flags = RT_DEPTH_TEST_LESS_EQUAL,
-         .ResumeShaderTable = (struct anv_address) {
-            .bo = cmd_buffer->state.ray_query_shadow_bo,
-         },
-      };
-      GENX(RT_DISPATCH_GLOBALS_pack)(
-         NULL,
-         state.map + i * align(4 * GENX(RT_DISPATCH_GLOBALS_length), 64),
-         &rtdg);
-   }
-
-   return anv_cmd_buffer_temporary_state_address(cmd_buffer, state);
+   assert(num_queries > 0);
+   uint64_t stack_stride = brw_rt_ray_queries_stacks_stride(device->info);
+   for (uint32_t i = 0; i < num_queries; ++i)
+      for (uint32_t j = 0; j < 2; j++)
+         GENX(RT_DISPATCH_GLOBALS_pack)(NULL,
+                                        (char*) map +
+                                        i * BRW_RT_DISPATCH_GLOBALS_ALIGN +
+                                        j * align(4 * GENX(RT_DISPATCH_GLOBALS_length), 64),
+                                        &(struct GENX(RT_DISPATCH_GLOBALS)) {
+            .MemBaseAddress = (struct anv_address) {
+               /* The ray query HW computes offsets from the top of the
+                * buffer, so set the address at the end of the buffer.
+                */
+               .bo = bo,
+               .offset = offset - i * stack_stride - j * stack_stride / 2,
+            },
+            .AsyncRTStackSize = BRW_RT_SIZEOF_RAY_QUERY / 64,
+            .NumDSSRTStacks = 2048, /* TODO */
+            .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
+            .Flags = RT_DEPTH_TEST_LESS_EQUAL,
+         });
 #else
    UNREACHABLE("Not supported");
 #endif
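To close the loop, the pack loop above gives each query a pair of RT_DISPATCH_GLOBALS whose MemBaseAddress steps down by one full stack stride per query and by half a stride between the two SIMD32 lane halves, matching the bcsel added to brw_nir_rt_sync_stack_addr. An illustrative restatement of that offset computation (not code from the commit):

/* Illustrative restatement of the MemBaseAddress values packed by
 * genX(setup_ray_query_globals): `boundary` is the stacks/globals boundary
 * (stride << bucket), `half` selects lanes 0-15 (0) or 16-31 (1). */
static uint64_t
rq_mem_base_offset(uint64_t boundary, uint32_t query, uint32_t half,
                   uint64_t stack_stride)
{
   return boundary - query * stack_stride - half * (stack_stride / 2);
}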