turnip: Lazily call tu6_emit_descriptor_sets() at draw time.

This lets us batch up the state changes from multiple
vkCmdBindDescriptorSets() calls, which ANGLE and zink will both issue for
a single draw.

Improves ANGLE (sysmem) driver_overhead perf by 5.18806% +/- 1.03444% (n=5).
Improves ANGLE aztec_ruins_high perf by ~0.3%. (The result is clear in the
graph, but the screen went to sleep midway through the run, so the variance
was high.)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20084>
This commit is contained in:
Emma Anholt 2022-11-30 11:31:22 -08:00 committed by Marge Bot
parent 73db82c816
commit c1968deec2
2 changed files with 35 additions and 14 deletions

View file

@ -2085,7 +2085,6 @@ tu6_emit_descriptor_sets(struct tu_cmd_buffer *cmd,
tu_cs_draw_state(&cmd->sub_cs, &state_cs,
4 + 4 * descriptors_state->max_sets_bound +
(descriptors_state->dynamic_bound ? 6 : 0));
cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS_LOAD;
cs = &state_cs;
} else {
assert(bind_point == VK_PIPELINE_BIND_POINT_COMPUTE);
@ -2094,7 +2093,6 @@ tu6_emit_descriptor_sets(struct tu_cmd_buffer *cmd,
hlsq_bindless_base_reg = REG_A6XX_HLSQ_CS_BINDLESS_BASE(0);
hlsq_invalidate_value = A6XX_HLSQ_INVALIDATE_CMD_CS_BINDLESS(0x1f);
cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD;
cs = &cmd->cs;
}
@ -2125,6 +2123,22 @@ tu6_emit_descriptor_sets(struct tu_cmd_buffer *cmd,
}
}
/* Flag the descriptor sets of the given bind point as dirty instead of
 * emitting their state right away.
 *
 * The descriptor set state is emitted lazily, once the matching dirty bit is
 * seen at draw/dispatch time, so that several tu_CmdBindDescriptorSets() calls
 * can be batched together.  ANGLE and zink will often emit multiple bind
 * calls in a draw.
 *
 * pipelineBindPoint must be either VK_PIPELINE_BIND_POINT_COMPUTE or
 * VK_PIPELINE_BIND_POINT_GRAPHICS; anything else trips the assert.
 */
static void
tu_dirty_desc_sets(struct tu_cmd_buffer *cmd,
                   VkPipelineBindPoint pipelineBindPoint)
{
   if (pipelineBindPoint != VK_PIPELINE_BIND_POINT_COMPUTE) {
      /* Only graphics and compute bind points are handled here. */
      assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS);
      cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS;
      return;
   }

   cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_DESC_SETS;
}
VKAPI_ATTR void VKAPI_CALL
tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
VkPipelineBindPoint pipelineBindPoint,
@ -2239,7 +2253,7 @@ tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
descriptors_state->dynamic_bound = true;
}
tu6_emit_descriptor_sets(cmd, pipelineBindPoint);
tu_dirty_desc_sets(cmd, pipelineBindPoint);
}
VKAPI_ATTR void VKAPI_CALL
@ -2284,7 +2298,7 @@ tu_CmdSetDescriptorBufferOffsetsEXT(
cmd->state.dirty |= TU_CMD_DIRTY_SHADER_CONSTS;
}
tu6_emit_descriptor_sets(cmd, pipelineBindPoint);
tu_dirty_desc_sets(cmd, pipelineBindPoint);
}
VKAPI_ATTR void VKAPI_CALL
@ -2307,7 +2321,7 @@ tu_CmdBindDescriptorBufferEmbeddedSamplersEXT(
descriptors_state->set_iova[set] = set_layout->embedded_samplers->iova | 3;
tu6_emit_descriptor_sets(cmd, pipelineBindPoint);
tu_dirty_desc_sets(cmd, pipelineBindPoint);
}
static enum VkResult
@ -2645,7 +2659,7 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS);
cmd->state.pipeline = pipeline;
cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS_LOAD | TU_CMD_DIRTY_SHADER_CONSTS |
cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS | TU_CMD_DIRTY_SHADER_CONSTS |
TU_CMD_DIRTY_LRZ | TU_CMD_DIRTY_VS_PARAMS;
if (pipeline->output.feedback_loop_may_involve_textures &&
@ -4924,7 +4938,7 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
/* Early exit if there is nothing to emit, saves CPU cycles */
uint32_t dirty = cmd->state.dirty;
if (!(dirty & ~TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD))
if (!(dirty & ~TU_CMD_DIRTY_COMPUTE_DESC_SETS))
return VK_SUCCESS;
bool dirty_lrz =
@ -5021,6 +5035,9 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
cmd->state.patch_control_points);
}
if (dirty & TU_CMD_DIRTY_DESC_SETS)
tu6_emit_descriptor_sets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS);
/* for the first draw in a renderpass, re-emit all the draw states
*
* and if a draw-state disabling path (CmdClearAttachments 3D fallback) was
@ -5061,7 +5078,7 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
emit_patch_control_points = false;
uint32_t draw_state_count =
((dirty & TU_CMD_DIRTY_SHADER_CONSTS) ? 1 : 0) +
((dirty & TU_CMD_DIRTY_DESC_SETS_LOAD) ? 1 : 0) +
((dirty & TU_CMD_DIRTY_DESC_SETS) ? 1 : 0) +
((dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) ? 1 : 0) +
((dirty & TU_CMD_DIRTY_VS_PARAMS) ? 1 : 0) +
(dirty_lrz ? 1 : 0);
@ -5090,8 +5107,10 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
if (dirty & TU_CMD_DIRTY_SHADER_CONSTS)
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_CONST, cmd->state.shader_const);
if (dirty & TU_CMD_DIRTY_DESC_SETS_LOAD)
if (dirty & TU_CMD_DIRTY_DESC_SETS) {
/* tu6_emit_descriptor_sets emitted the cmd->state.desc_sets draw state. */
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS_LOAD, pipeline->load_state);
}
if (dirty & TU_CMD_DIRTY_VERTEX_BUFFERS)
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers);
if (emit_binding_stride) {
@ -5120,7 +5139,7 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
* bits to preserve instead. The only things not emitted here are
* compute-related state.
*/
cmd->state.dirty &= TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD;
cmd->state.dirty &= TU_CMD_DIRTY_COMPUTE_DESC_SETS;
return VK_SUCCESS;
}
@ -5730,10 +5749,12 @@ tu_dispatch(struct tu_cmd_buffer *cmd,
tu_emit_compute_driver_params(cmd, cs, pipeline, info);
if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD)
if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_DESC_SETS) {
tu6_emit_descriptor_sets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE);
tu_cs_emit_state_ib(cs, pipeline->load_state);
}
cmd->state.dirty &= ~TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD;
cmd->state.dirty &= ~TU_CMD_DIRTY_COMPUTE_DESC_SETS;
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE));

View file

@ -58,8 +58,8 @@ enum tu_cmd_dirty_bits
TU_CMD_DIRTY_RAST = BIT(2),
TU_CMD_DIRTY_RB_DEPTH_CNTL = BIT(3),
TU_CMD_DIRTY_RB_STENCIL_CNTL = BIT(4),
TU_CMD_DIRTY_DESC_SETS_LOAD = BIT(5),
TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD = BIT(6),
TU_CMD_DIRTY_DESC_SETS = BIT(5),
TU_CMD_DIRTY_COMPUTE_DESC_SETS = BIT(6),
TU_CMD_DIRTY_SHADER_CONSTS = BIT(7),
TU_CMD_DIRTY_LRZ = BIT(8),
TU_CMD_DIRTY_VS_PARAMS = BIT(9),