radv: Refactor RT lowering decisions and add RADV_PERFTEST CPS override

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29580>
This commit was authored by Natalie Vock on 2025-12-09 16:33:15 +01:00 and committed by Marge Bot.
parent 63c019f5be
commit d3cb8b4046
6 changed files with 46 additions and 18 deletions

View file

@ -100,6 +100,7 @@ enum {
RADV_PERFTEST_NO_GTT_SPILL = 1u << 16,
RADV_PERFTEST_HIC = 1u << 17,
RADV_PERFTEST_SPARSE = 1u << 18,
RADV_PERFTEST_RT_CPS = 1u << 19,
};
enum {

View file

@ -125,6 +125,7 @@ static const struct debug_control radv_perftest_options[] = {
{"nogttspill", RADV_PERFTEST_NO_GTT_SPILL},
{"hic", RADV_PERFTEST_HIC},
{"sparse", RADV_PERFTEST_SPARSE},
{"rtcps", RADV_PERFTEST_RT_CPS},
{NULL, 0},
};

View file

@ -305,6 +305,7 @@ radv_physical_device_init_cache_key(struct radv_physical_device *pdev)
key->use_ngg_culling = pdev->use_ngg_culling;
key->no_implicit_varying_subgroup_size = instance->drirc.debug.no_implicit_varying_subgroup_size;
key->mitigate_smem_oob = pdev->info.cu_info.has_smem_oob_access_bug && !(instance->debug_flags & RADV_DEBUG_NO_SMEM_MITIGATION);
key->rt_cps = !!(instance->perftest_flags & RADV_PERFTEST_RT_CPS);
}
static int

View file

@ -66,8 +66,9 @@ struct radv_physical_device_cache_key {
uint32_t use_ngg_culling : 1;
uint32_t no_implicit_varying_subgroup_size : 1;
uint32_t mitigate_smem_oob : 1;
uint32_t rt_cps : 1;
uint32_t reserved : 8;
uint32_t reserved : 7;
};
enum radv_video_enc_hw_ver {

View file

@ -363,8 +363,9 @@ move_rt_instructions(nir_shader *shader)
static VkResult
radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache,
struct radv_ray_tracing_pipeline *pipeline, bool monolithic, struct radv_shader_stage *stage,
uint32_t *stack_size, struct radv_ray_tracing_stage_info *stage_info,
struct radv_ray_tracing_pipeline *pipeline, enum radv_rt_lowering_mode mode,
struct radv_shader_stage *stage, uint32_t *stack_size,
struct radv_ray_tracing_stage_info *stage_info,
const struct radv_ray_tracing_stage_info *traversal_stage_info,
struct radv_serialized_shader_arena_block *replay_block, bool skip_shaders_cache,
bool has_position_fetch, struct radv_shader **out_shader)
@ -376,10 +377,14 @@ radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache,
bool keep_executable_info = radv_pipeline_capture_shaders(device, pipeline->base.base.create_flags);
bool keep_statistic_info = radv_pipeline_capture_shader_stats(device, pipeline->base.base.create_flags);
if (monolithic)
switch (mode) {
case RADV_RT_LOWERING_MODE_MONOLITHIC:
radv_nir_lower_rt_io_monolithic(stage->nir);
else
break;
case RADV_RT_LOWERING_MODE_CPS:
radv_nir_lower_rt_io_cps(stage->nir);
break;
}
/* Gather shader info. */
nir_shader_gather_info(stage->nir, nir_shader_get_entrypoint(stage->nir));
@ -402,7 +407,7 @@ radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache,
nir_shader **resume_shaders = NULL;
void *mem_ctx = ralloc_context(NULL);
if (stage->stage != MESA_SHADER_INTERSECTION && !monolithic) {
if (stage->stage != MESA_SHADER_INTERSECTION && mode == RADV_RT_LOWERING_MODE_CPS) {
nir_builder b = nir_builder_at(nir_after_impl(nir_shader_get_entrypoint(stage->nir)));
nir_rt_return_amd(&b);
@ -432,12 +437,15 @@ radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache,
for (uint32_t i = 0; i < num_shaders; i++) {
struct radv_shader_stage temp_stage = *stage;
temp_stage.nir = shaders[i];
if (monolithic) {
switch (mode) {
case RADV_RT_LOWERING_MODE_MONOLITHIC:
assert(num_shaders == 1);
radv_nir_lower_rt_abi_monolithic(temp_stage.nir, &temp_stage.args, stack_size, device, pipeline);
} else {
break;
case RADV_RT_LOWERING_MODE_CPS:
radv_nir_lower_rt_abi_cps(temp_stage.nir, &temp_stage.args, &stage->info, stack_size, i > 0, device, pipeline,
has_position_fetch, traversal_stage_info);
break;
}
/* Info might be out-of-date after inlining in radv_nir_lower_rt_abi(). */
@ -612,6 +620,7 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca
const struct radv_shader_stage_key *stage_keys, struct radv_ray_tracing_pipeline *pipeline,
struct radv_serialized_shader_arena_block *capture_replay_handles, bool skip_shaders_cache)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
VK_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, pCreateInfo->layout);
if (pipeline->base.base.create_flags & VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT)
@ -630,7 +639,8 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca
* potentially even reducing runtime performance because of instruction cache coherency issues in the
* traversal loop.
*/
bool monolithic = !library && pipeline->stage_count < 50;
bool can_use_monolithic = !library && pipeline->stage_count < 50;
for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
if (rt_stages[i].shader || rt_stages[i].nir)
continue;
@ -660,12 +670,21 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca
for (uint32_t i = 0; i < pipeline->stage_count; i++) {
has_callable |= rt_stages[i].stage == MESA_SHADER_CALLABLE;
has_position_fetch |= rt_stages[i].info.has_position_fetch;
monolithic &= rt_stages[i].info.can_inline;
can_use_monolithic &= rt_stages[i].info.can_inline;
if (i >= pCreateInfo->stageCount)
raygen_imported |= rt_stages[i].stage == MESA_SHADER_RAYGEN;
}
enum radv_rt_lowering_mode recursive_lowering_mode =
pdev->cache_key.rt_cps ? RADV_RT_LOWERING_MODE_CPS : RADV_RT_LOWERING_MODE_FUNCTION_CALLS;
enum radv_rt_lowering_mode raygen_lowering_mode;
if (can_use_monolithic)
raygen_lowering_mode = RADV_RT_LOWERING_MODE_MONOLITHIC;
else
raygen_lowering_mode = recursive_lowering_mode;
for (uint32_t idx = 0; idx < pCreateInfo->stageCount; idx++) {
if (rt_stages[idx].shader || rt_stages[idx].nir)
continue;
@ -681,6 +700,7 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca
* - non-recursive: Non-recursive shaders are inlined into the traversal shader.
* - monolithic: Callable shaders (chit/miss) are inlined into the raygen shader.
*/
bool monolithic = raygen_lowering_mode == RADV_RT_LOWERING_MODE_MONOLITHIC;
bool always_inlined = radv_ray_tracing_stage_is_always_inlined(&rt_stages[idx]);
bool nir_needed =
(library && !has_callable) || always_inlined || (monolithic && rt_stages[idx].stage != MESA_SHADER_RAYGEN);
@ -706,18 +726,18 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca
*/
bool shader_needed = !radv_ray_tracing_stage_is_always_inlined(&rt_stages[idx]) && !rt_stages[idx].shader;
if (rt_stages[idx].stage == MESA_SHADER_CLOSEST_HIT || rt_stages[idx].stage == MESA_SHADER_MISS)
shader_needed &= !monolithic || raygen_imported;
shader_needed &= raygen_lowering_mode != RADV_RT_LOWERING_MODE_MONOLITHIC || raygen_imported;
if (shader_needed) {
uint32_t stack_size = 0;
struct radv_serialized_shader_arena_block *replay_block =
capture_replay_handles[idx].arena_va ? &capture_replay_handles[idx] : NULL;
bool monolithic_raygen = monolithic && stage->stage == MESA_SHADER_RAYGEN;
enum radv_rt_lowering_mode mode =
stage->stage == MESA_SHADER_RAYGEN ? raygen_lowering_mode : recursive_lowering_mode;
result =
radv_rt_nir_to_asm(device, cache, pipeline, monolithic_raygen, stage, &stack_size, &rt_stages[idx].info, NULL, replay_block, skip_shaders_cache, has_position_fetch,
&rt_stages[idx].shader);
result = radv_rt_nir_to_asm(device, cache, pipeline, mode, stage, &stack_size, &rt_stages[idx].info, NULL,
replay_block, skip_shaders_cache, has_position_fetch, &rt_stages[idx].shader);
if (result != VK_SUCCESS)
goto cleanup;
@ -735,7 +755,7 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca
/* Monolithic raygen shaders do not need a traversal shader. Skip compiling one if there are only monolithic raygen
* shaders.
*/
bool traversal_needed = !library && (!monolithic || raygen_imported);
bool traversal_needed = !library && (raygen_lowering_mode != RADV_RT_LOWERING_MODE_MONOLITHIC || raygen_imported);
if (!traversal_needed) {
result = VK_SUCCESS;
goto cleanup;
@ -775,8 +795,8 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca
.key = stage_keys[MESA_SHADER_INTERSECTION],
};
radv_shader_layout_init(pipeline_layout, MESA_SHADER_INTERSECTION, &traversal_stage.layout);
result = radv_rt_nir_to_asm(device, cache, pipeline, false, &traversal_stage, NULL, NULL, &traversal_info, NULL,
skip_shaders_cache, has_position_fetch,
result = radv_rt_nir_to_asm(device, cache, pipeline, recursive_lowering_mode, &traversal_stage, NULL, NULL,
&traversal_info, NULL, skip_shaders_cache, has_position_fetch,
&pipeline->base.base.shaders[MESA_SHADER_INTERSECTION]);
ralloc_free(traversal_nir);

View file

@ -651,6 +651,10 @@ bool radv_consider_culling(const struct radv_physical_device *pdev, struct nir_s
void radv_get_nir_options(struct radv_physical_device *pdev);
/* How a ray-tracing stage is lowered before compilation.
 *
 * Selected in radv_rt_compile_shaders(): MONOLITHIC when the pipeline is not
 * a library and every stage can be inlined into the raygen shader; otherwise
 * recursive stages use FUNCTION_CALLS by default, or CPS when the
 * RADV_PERFTEST_RT_CPS ("rtcps") perftest flag is set.
 */
enum radv_rt_lowering_mode {
   /* Callable stages (chit/miss/etc.) are inlined into the raygen shader;
    * no traversal shader is needed. */
   RADV_RT_LOWERING_MODE_MONOLITHIC,
   /* Continuation-passing-style lowering (opt-in via RADV_PERFTEST_RT_CPS). */
   RADV_RT_LOWERING_MODE_CPS,
   /* Recursive stages are lowered to function calls; the default mode for
    * non-monolithic pipelines.
    * NOTE(review): this enumerator is referenced by radv_rt_compile_shaders()
    * but was missing from the enum as shown; added last so the existing
    * values keep their positions — confirm the intended ordering and that the
    * switch statements in radv_rt_nir_to_asm() handle this mode. */
   RADV_RT_LOWERING_MODE_FUNCTION_CALLS,
};
enum radv_rt_priority {
radv_rt_priority_raygen = 0,