radv: Decouple radv_before_taskmesh_draw from radv_before_draw.

radv_before_taskmesh_draw will no longer call radv_before_draw and
instead implement the necessary functionality on its own.

radv_before_draw will no longer have to emit mesh shader descriptors.

As a result, both functions should have a lower CPU overhead now.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18829>
This commit is contained in:
Timur Kristóf 2022-12-21 17:32:57 +01:00
parent 050c39c92f
commit d496f0b97d

View file

@ -4890,7 +4890,7 @@ radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer)
radv_flush_streamout_descriptors(cmd_buffer);
VkShaderStageFlags stages = VK_SHADER_STAGE_ALL_GRAPHICS | VK_SHADER_STAGE_MESH_BIT_EXT;
VkShaderStageFlags stages = VK_SHADER_STAGE_ALL_GRAPHICS;
radv_flush_descriptors(cmd_buffer, stages, &pipeline->base, VK_PIPELINE_BIND_POINT_GRAPHICS);
radv_flush_constants(cmd_buffer, stages, &pipeline->base, VK_PIPELINE_BIND_POINT_GRAPHICS);
radv_flush_ngg_query_state(cmd_buffer);
@ -8626,61 +8626,66 @@ ALWAYS_INLINE static bool
radv_before_taskmesh_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
uint32_t drawCount)
{
/* Runs ahead of a task/mesh shader draw (per the function name) and reports
 * through its bool result whether the caller should go on with the draw.
 *
 * NOTE(review): this listing is a diff hunk whose +/- markers were lost, so
 * lines from the pre-change and post-change versions are interleaved:
 * `pipeline`, `task_shader`, `pdevice`, `ace_cs`, `need_task_semaphore` and
 * `ace_cdw_max` are each declared twice, and the braces do not balance.
 * It is not compilable as shown; check the real file before relying on the
 * statement order seen here.
 */
struct radv_descriptor_state *descriptors_state =
radv_get_descriptors_state(cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS);
/* For direct draws, this makes sure we don't draw anything.
* For indirect draws, this is necessary to prevent a GPU hang (on MEC version < 100).
*/
if (unlikely(!info->count))
return false;
struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;
struct radv_shader *task_shader = radv_get_shader(&pipeline->base, MESA_SHADER_TASK);
/* A task shader requires the ACE command stream to exist. */
assert(!task_shader || ace_cs);
/* Stages passed to the descriptor/constant flush further down: mesh and
 * fragment, plus task only when the pipeline has a task shader.
 */
const VkShaderStageFlags stages = VK_SHADER_STAGE_MESH_BIT_EXT | VK_SHADER_STAGE_FRAGMENT_BIT | (task_shader ? VK_SHADER_STAGE_TASK_BIT_EXT : 0);
/* True when the pipeline dirty bit is set and the current graphics pipeline
 * differs from the last emitted one.
 */
const bool pipeline_is_dirty =
cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE &&
cmd_buffer->state.graphics_pipeline != cmd_buffer->state.emitted_graphics_pipeline;
/* Snapshot of the descriptor dirty flags; they are written back further down. */
const bool push_dirty = descriptors_state->push_dirty;
const uint32_t desc_dirty = descriptors_state->dirty;
/* The semaphore flush is only attempted when there is a task shader. */
const bool need_task_semaphore = task_shader && radv_flush_gfx2ace_semaphore(cmd_buffer);
/* NOTE(review): the commit message says radv_before_draw is no longer called
 * from this function, so this line is presumably from the pre-change side of
 * the diff -- confirm against the real file.
 */
const bool gfx_result = radv_before_draw(cmd_buffer, info, drawCount);
struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
struct radv_shader *task_shader = radv_get_shader(&pipeline->base, MESA_SHADER_TASK);
/* cdw_max and ace_cdw_max hold the radeon_check_space() results and are
 * compared against the command streams' cdw counters in the asserts at the
 * end. The ACE value is 0 when there is no ACE command stream.
 */
ASSERTED const unsigned cdw_max =
radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096 + 128 * (drawCount - 1));
ASSERTED const unsigned ace_cdw_max = !ace_cs ? 0 :
radeon_check_space(cmd_buffer->device->ws, ace_cs, 4096 + 128 * (drawCount - 1));
/* If there is no task shader, no need to do anything special. */
if (!task_shader)
return gfx_result;
/* Only done when the framebuffer dirty bit is set. */
if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
radv_emit_fb_mip_change_flush(cmd_buffer);
/* Need to check the count even for indirect draws to work around
* an issue with DISPATCH_TASKMESH_INDIRECT_MULTI_ACE.
*/
if (!info->count || !gfx_result)
return false;
const bool need_task_semaphore = radv_flush_gfx2ace_semaphore(cmd_buffer);
struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;
struct radeon_winsys *ws = cmd_buffer->device->ws;
assert(ace_cs);
ASSERTED const unsigned ace_cdw_max =
radeon_check_space(ws, ace_cs, 4096 + 128 * (drawCount - 1));
if (need_task_semaphore)
radv_wait_gfx2ace_semaphore(cmd_buffer);
/* NOTE(review): two `if (...) {` openers follow but only one closing brace;
 * presumably one opener is the removed line and the other its replacement.
 */
if (pipeline_is_dirty) {
radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty);
if (task_shader && pipeline_is_dirty) {
/* Both helpers are handed ace_cs and the task shader. */
radv_pipeline_emit_hw_cs(pdevice, ace_cs, task_shader);
radv_pipeline_emit_compute_state(pdevice, ace_cs, task_shader);
}
radv_ace_internal_cache_flush(cmd_buffer);
si_emit_cache_flush(cmd_buffer);
/* Restore dirty state of descriptors
* They were marked non-dirty in radv_before_draw,
* but they need to be re-emitted now to the ACE cmdbuf.
*/
descriptors_state->push_dirty = push_dirty;
descriptors_state->dirty = desc_dirty;
if (task_shader) {
radv_ace_internal_cache_flush(cmd_buffer);
/* Flush descriptors and push constants for task shaders. */
radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_TASK_BIT_EXT, &pipeline->base,
VK_PIPELINE_BIND_POINT_GRAPHICS);
radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_TASK_BIT_EXT, &pipeline->base,
VK_PIPELINE_BIND_POINT_GRAPHICS);
if (need_task_semaphore) {
radv_wait_gfx2ace_semaphore(cmd_buffer);
}
}
/* Flush descriptors and constants for every stage collected in `stages`. */
radv_flush_descriptors(cmd_buffer, stages, &pipeline->base, VK_PIPELINE_BIND_POINT_GRAPHICS);
radv_flush_constants(cmd_buffer, stages, &pipeline->base, VK_PIPELINE_BIND_POINT_GRAPHICS);
radv_describe_draw(cmd_buffer);
/* Direct (non-indirect) draws: emit NUM_INSTANCES = 1 unless the tracked
 * last_num_instances is already 1, then record the new value.
 */
if (likely(!info->indirect)) {
struct radv_cmd_state *state = &cmd_buffer->state;
if (unlikely(state->last_num_instances != 1)) {
struct radeon_cmdbuf *cs = cmd_buffer->cs;
radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false));
radeon_emit(cs, 1);
state->last_num_instances = 1;
}
}
/* Assert that neither command stream grew past the limit obtained above. */
assert(cmd_buffer->cs->cdw <= cdw_max);
assert(!ace_cs || ace_cs->cdw <= ace_cdw_max);
/* NOTE(review): same check as the line above but without the NULL guard on
 * ace_cs; presumably the other side of the diff -- confirm which one is live.
 */
assert(ace_cs->cdw <= ace_cdw_max);
return true;
}