radv: upload and emit dynamic descriptors separately from push constants

Dynamic descriptors are rarely used, and splitting them out will allow more
optimizations for push constants, such as gathering the size from the shaders
themselves instead of taking it from the pipeline layout.

fossils-db (GFX1201):
Totals from 21740 (27.30% of 79646) affected shaders:
Instrs: 11186407 -> 11192061 (+0.05%); split: -0.05%, +0.10%
CodeSize: 59842068 -> 59864412 (+0.04%); split: -0.04%, +0.08%
Latency: 56333136 -> 56325208 (-0.01%); split: -0.03%, +0.02%
InvThroughput: 8576452 -> 8576516 (+0.00%); split: -0.00%, +0.00%
SClause: 279186 -> 279713 (+0.19%); split: -0.06%, +0.25%
Copies: 577854 -> 581735 (+0.67%); split: -0.28%, +0.95%
PreSGPRs: 867163 -> 866409 (-0.09%)
SALU: 1391187 -> 1395055 (+0.28%); split: -0.12%, +0.39%

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37768>
This commit is contained in:
Samuel Pitoiset 2025-09-02 11:51:58 +02:00 committed by Marge Bot
parent bc32286e5b
commit a47952d495
4 changed files with 81 additions and 17 deletions

View file

@@ -69,8 +69,8 @@ visit_vulkan_resource_index(nir_builder *b, apply_layout_state *state, nir_intri
nir_def *set_ptr; nir_def *set_ptr;
if (vk_descriptor_type_is_dynamic(layout->binding[binding].type)) { if (vk_descriptor_type_is_dynamic(layout->binding[binding].type)) {
unsigned idx = state->layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset; unsigned idx = state->layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset;
set_ptr = get_scalar_arg(b, 1, state->args->ac.push_constants); set_ptr = get_scalar_arg(b, 1, state->args->ac.dynamic_descriptors);
offset = state->layout->push_constant_size + idx * 16; offset = idx * 16;
stride = 16; stride = 16;
} else { } else {
set_ptr = load_desc_ptr(b, state, desc_set); set_ptr = load_desc_ptr(b, state, desc_set);

View file

@@ -1285,6 +1285,7 @@ radv_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer, UNUSED VkCommandB
for (unsigned i = 0; i < MAX_BIND_POINTS; i++) { for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
cmd_buffer->descriptors[i].dirty = 0; cmd_buffer->descriptors[i].dirty = 0;
cmd_buffer->descriptors[i].valid = 0; cmd_buffer->descriptors[i].valid = 0;
cmd_buffer->descriptors[i].dirty_dynamic = false;
} }
radv_cmd_buffer_reset_rendering(cmd_buffer); radv_cmd_buffer_reset_rendering(cmd_buffer);
@@ -6285,7 +6286,7 @@ radv_must_flush_constants(const struct radv_cmd_buffer *cmd_buffer, VkShaderStag
{ {
const struct radv_push_constant_state *push_constants = radv_get_push_constants_state(cmd_buffer, bind_point); const struct radv_push_constant_state *push_constants = radv_get_push_constants_state(cmd_buffer, bind_point);
if (push_constants->size || push_constants->dynamic_offset_count) if (push_constants->size)
return stages & cmd_buffer->push_constant_stages; return stages & cmd_buffer->push_constant_stages;
return 0; return 0;
@@ -6331,16 +6332,15 @@ radv_emit_push_constants_per_stage(const struct radv_device *device, struct radv
static void static void
radv_upload_push_constants(struct radv_cmd_buffer *cmd_buffer, const struct radv_push_constant_state *pc_state, radv_upload_push_constants(struct radv_cmd_buffer *cmd_buffer, const struct radv_push_constant_state *pc_state,
const struct radv_descriptor_state *descriptors_state, uint64_t *va) uint64_t *va)
{ {
unsigned offset; unsigned offset;
void *ptr; void *ptr;
if (!radv_cmd_buffer_upload_alloc(cmd_buffer, pc_state->size + 16 * pc_state->dynamic_offset_count, &offset, &ptr)) if (!radv_cmd_buffer_upload_alloc(cmd_buffer, pc_state->size, &offset, &ptr))
return; return;
memcpy(ptr, cmd_buffer->push_constants, pc_state->size); memcpy(ptr, cmd_buffer->push_constants, pc_state->size);
memcpy((char *)ptr + pc_state->size, descriptors_state->dynamic_buffers, 16 * pc_state->dynamic_offset_count);
*va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset; *va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
} }
@@ -6350,7 +6350,6 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stag
{ {
struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
struct radv_cmd_stream *cs = cmd_buffer->cs; struct radv_cmd_stream *cs = cmd_buffer->cs;
struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
const struct radv_push_constant_state *push_constants = radv_get_push_constants_state(cmd_buffer, bind_point); const struct radv_push_constant_state *push_constants = radv_get_push_constants_state(cmd_buffer, bind_point);
uint64_t va = 0; uint64_t va = 0;
uint32_t internal_stages = stages; uint32_t internal_stages = stages;
@@ -6368,7 +6367,7 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stag
} }
if (push_constants->need_upload) { if (push_constants->need_upload) {
radv_upload_push_constants(cmd_buffer, push_constants, descriptors_state, &va); radv_upload_push_constants(cmd_buffer, push_constants, &va);
} }
if (internal_stages & VK_SHADER_STAGE_COMPUTE_BIT) { if (internal_stages & VK_SHADER_STAGE_COMPUTE_BIT) {
@@ -6400,6 +6399,58 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stag
cmd_buffer->push_constant_stages &= ~stages; cmd_buffer->push_constant_stages &= ~stages;
} }
static void
radv_upload_dynamic_descriptors(struct radv_cmd_buffer *cmd_buffer,
const struct radv_descriptor_state *descriptors_state, uint64_t *va)
{
const uint32_t size = descriptors_state->dynamic_offset_count * 16;
unsigned offset;
void *ptr;
if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &offset, &ptr))
return;
memcpy(ptr, descriptors_state->dynamic_buffers, size);
*va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
}
static void
radv_flush_dynamic_descriptors(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages,
VkPipelineBindPoint bind_point)
{
const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
struct radv_cmd_stream *cs = cmd_buffer->cs;
uint64_t va = 0;
radv_upload_dynamic_descriptors(cmd_buffer, descriptors_state, &va);
ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cs->b, MESA_VULKAN_SHADER_STAGES * 4);
if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
const struct radv_shader *compute_shader = bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
? cmd_buffer->state.shaders[MESA_SHADER_COMPUTE]
: cmd_buffer->state.rt_prolog;
radv_emit_userdata_address(device, cs, compute_shader, AC_UD_DYNAMIC_DESCRIPTORS, va);
} else {
radv_foreach_stage (stage, stages & ~VK_SHADER_STAGE_TASK_BIT_EXT) {
if (!cmd_buffer->state.shaders[stage])
continue;
radv_emit_userdata_address(device, cs, cmd_buffer->state.shaders[stage], AC_UD_DYNAMIC_DESCRIPTORS, va);
}
if (stages & VK_SHADER_STAGE_TASK_BIT_EXT) {
radv_emit_userdata_address(device, cmd_buffer->gang.cs, cmd_buffer->state.shaders[MESA_SHADER_TASK],
AC_UD_DYNAMIC_DESCRIPTORS, va);
}
}
assert(cs->b->cdw <= cdw_max);
}
ALWAYS_INLINE void ALWAYS_INLINE void
radv_get_vbo_info(const struct radv_cmd_buffer *cmd_buffer, uint32_t idx, struct radv_vbo_info *vbo_info) radv_get_vbo_info(const struct radv_cmd_buffer *cmd_buffer, uint32_t idx, struct radv_vbo_info *vbo_info)
{ {
@@ -6717,6 +6768,11 @@ radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer)
descriptors_state->dirty = 0; descriptors_state->dirty = 0;
} }
if (descriptors_state->dirty_dynamic && descriptors_state->dynamic_offset_count) {
radv_flush_dynamic_descriptors(cmd_buffer, stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
descriptors_state->dirty_dynamic = false;
}
const VkShaderStageFlags pc_stages = radv_must_flush_constants(cmd_buffer, stages, VK_PIPELINE_BIND_POINT_GRAPHICS); const VkShaderStageFlags pc_stages = radv_must_flush_constants(cmd_buffer, stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
if (pc_stages) if (pc_stages)
radv_flush_constants(cmd_buffer, pc_stages, VK_PIPELINE_BIND_POINT_GRAPHICS); radv_flush_constants(cmd_buffer, pc_stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
@@ -7773,7 +7829,7 @@ radv_bind_descriptor_sets(struct radv_cmd_buffer *cmd_buffer, const VkBindDescri
ac_build_raw_buffer_descriptor(pdev->info.gfx_level, va, size, dst); ac_build_raw_buffer_descriptor(pdev->info.gfx_level, va, size, dst);
} }
cmd_buffer->push_constant_stages |= set->header.layout->dynamic_shader_stages; descriptors_state->dirty_dynamic = true;
} }
} }
} }
@@ -8667,8 +8723,7 @@ radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline
cmd_buffer->push_constant_state[vk_to_bind_point(pipelineBindPoint)].size = pipeline->push_constant_size; cmd_buffer->push_constant_state[vk_to_bind_point(pipelineBindPoint)].size = pipeline->push_constant_size;
cmd_buffer->push_constant_state[vk_to_bind_point(pipelineBindPoint)].need_upload = cmd_buffer->push_constant_state[vk_to_bind_point(pipelineBindPoint)].need_upload =
pipeline->need_push_constants_upload; pipeline->need_push_constants_upload;
cmd_buffer->push_constant_state[vk_to_bind_point(pipelineBindPoint)].dynamic_offset_count = cmd_buffer->descriptors[vk_to_bind_point(pipelineBindPoint)].dynamic_offset_count = pipeline->dynamic_offset_count;
pipeline->dynamic_offset_count;
cmd_buffer->descriptors[vk_to_bind_point(pipelineBindPoint)].need_indirect_descriptors = cmd_buffer->descriptors[vk_to_bind_point(pipelineBindPoint)].need_indirect_descriptors =
pipeline->need_indirect_descriptors; pipeline->need_indirect_descriptors;
} }
@@ -12356,9 +12411,9 @@ radv_bind_graphics_shaders(struct radv_cmd_buffer *cmd_buffer)
struct radv_push_constant_state *pc_state = &cmd_buffer->push_constant_state[VK_PIPELINE_BIND_POINT_GRAPHICS]; struct radv_push_constant_state *pc_state = &cmd_buffer->push_constant_state[VK_PIPELINE_BIND_POINT_GRAPHICS];
descriptors_state->need_indirect_descriptors = need_indirect_descriptors; descriptors_state->need_indirect_descriptors = need_indirect_descriptors;
descriptors_state->dynamic_offset_count = dynamic_offset_count;
pc_state->need_upload = need_push_constants_upload; pc_state->need_upload = need_push_constants_upload;
pc_state->size = push_constant_size; pc_state->size = push_constant_size;
pc_state->dynamic_offset_count = dynamic_offset_count;
if (pdev->info.gfx_level <= GFX9) { if (pdev->info.gfx_level <= GFX9) {
cmd_buffer->state.ia_multi_vgt_param = radv_compute_ia_multi_vgt_param(device, cmd_buffer->state.shaders); cmd_buffer->state.ia_multi_vgt_param = radv_compute_ia_multi_vgt_param(device, cmd_buffer->state.shaders);
@@ -12491,6 +12546,11 @@ radv_before_taskmesh_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_
descriptors_state->dirty = 0; descriptors_state->dirty = 0;
} }
if (descriptors_state->dirty_dynamic && descriptors_state->dynamic_offset_count) {
radv_flush_dynamic_descriptors(cmd_buffer, stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
descriptors_state->dirty_dynamic = false;
}
const VkShaderStageFlags pc_stages = radv_must_flush_constants(cmd_buffer, stages, VK_PIPELINE_BIND_POINT_GRAPHICS); const VkShaderStageFlags pc_stages = radv_must_flush_constants(cmd_buffer, stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
if (pc_stages) if (pc_stages)
radv_flush_constants(cmd_buffer, pc_stages, VK_PIPELINE_BIND_POINT_GRAPHICS); radv_flush_constants(cmd_buffer, pc_stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
@@ -13343,6 +13403,11 @@ radv_upload_compute_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPip
descriptors_state->dirty = 0; descriptors_state->dirty = 0;
} }
if (descriptors_state->dirty_dynamic && descriptors_state->dynamic_offset_count) {
radv_flush_dynamic_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT, bind_point);
descriptors_state->dirty_dynamic = false;
}
const VkShaderStageFlags stages = const VkShaderStageFlags stages =
bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR ? RADV_RT_STAGE_BITS : VK_SHADER_STAGE_COMPUTE_BIT; bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR ? RADV_RT_STAGE_BITS : VK_SHADER_STAGE_COMPUTE_BIT;
const VkShaderStageFlags pc_stages = radv_must_flush_constants(cmd_buffer, stages, bind_point); const VkShaderStageFlags pc_stages = radv_must_flush_constants(cmd_buffer, stages, bind_point);
@@ -15329,9 +15394,9 @@ radv_bind_compute_shader(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_
struct radv_push_constant_state *pc_state = &cmd_buffer->push_constant_state[VK_PIPELINE_BIND_POINT_COMPUTE]; struct radv_push_constant_state *pc_state = &cmd_buffer->push_constant_state[VK_PIPELINE_BIND_POINT_COMPUTE];
descriptors_state->need_indirect_descriptors = radv_shader_need_indirect_descriptors(shader); descriptors_state->need_indirect_descriptors = radv_shader_need_indirect_descriptors(shader);
descriptors_state->dynamic_offset_count = shader_obj->dynamic_offset_count;
pc_state->need_upload = radv_shader_need_push_constants_upload(shader); pc_state->need_upload = radv_shader_need_push_constants_upload(shader);
pc_state->size = shader_obj->push_constant_size; pc_state->size = shader_obj->push_constant_size;
pc_state->dynamic_offset_count = shader_obj->dynamic_offset_count;
assert(cs->b->cdw <= cdw_max); assert(cs->b->cdw <= cdw_max);
} }

View file

@@ -244,6 +244,8 @@ struct radv_descriptor_state {
uint32_t valid; uint32_t valid;
struct radv_push_descriptor_set push_set; struct radv_push_descriptor_set push_set;
uint32_t dynamic_buffers[4 * MAX_DYNAMIC_BUFFERS]; uint32_t dynamic_buffers[4 * MAX_DYNAMIC_BUFFERS];
uint32_t dynamic_offset_count;
bool dirty_dynamic;
uint64_t descriptor_buffers[MAX_SETS]; uint64_t descriptor_buffers[MAX_SETS];
bool need_indirect_descriptors; bool need_indirect_descriptors;
uint64_t indirect_descriptor_sets_va; uint64_t indirect_descriptor_sets_va;
@@ -251,7 +253,6 @@ struct radv_descriptor_state {
struct radv_push_constant_state { struct radv_push_constant_state {
uint32_t size; uint32_t size;
uint32_t dynamic_offset_count;
bool need_upload; bool need_upload;
}; };

View file

@@ -1034,10 +1034,8 @@ radv_nir_shader_info_pass(struct radv_device *device, const struct nir_shader *n
const struct radv_physical_device *pdev = radv_device_physical(device); const struct radv_physical_device *pdev = radv_device_physical(device);
struct nir_function *func = (struct nir_function *)exec_list_get_head_const(&nir->functions); struct nir_function *func = (struct nir_function *)exec_list_get_head_const(&nir->functions);
if (layout->use_dynamic_descriptors) { if (layout->use_dynamic_descriptors)
info->loads_push_constants = true;
info->loads_dynamic_offsets = true; info->loads_dynamic_offsets = true;
}
nir_foreach_block (block, func->impl) { nir_foreach_block (block, func->impl) {
gather_info_block(nir, block, info, gfx_state, stage_key, consider_force_vrs); gather_info_block(nir, block, info, gfx_state, stage_key, consider_force_vrs);