turnip: Only emit descriptor loads for active stages in the pipeline.

zink has a push descriptor template layout that has every possible stage,
which gets used regardless of what stages are in the pipeline.  By
skipping over the unused stages, we cut the CP overhead.

Improves TU_DEBUG=sysmem gfxbench gl_driver2 on zink by 6.57% +/-
0.331143% (n=5).

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18562>
commit d862a2ebcb
parent f51bbcc8ae
Author: Emma Anholt
Date:   2022-09-12 21:11:14 -07:00
Committed by: Marge Bot
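As a sketch of the idea before the diff (a standalone restatement, not the
driver code; the helper name is mine), a binding now only produces
descriptor loads for the intersection of its declared stages and the
pipeline's actual stages:

   #include <vulkan/vulkan.h>
   #include <stdbool.h>

   /* Sketch: emit loads for a binding only if some stage is both declared
    * on the binding and present in the pipeline. */
   static bool
   binding_has_active_stages(VkShaderStageFlags pipeline_active_stages,
                             VkShaderStageFlags binding_shader_stages)
   {
      return (pipeline_active_stages & binding_shader_stages) != 0;
   }

Previously the graphics path masked with VK_SHADER_STAGE_ALL_GRAPHICS, so a
binding declared for a stage the pipeline doesn't even contain still got a
load emitted.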

@@ -53,7 +53,7 @@ emit_load_state(struct tu_cs *cs, unsigned opcode, enum a6xx_state_type st,
 
 static unsigned
 tu6_load_state_size(struct tu_pipeline *pipeline,
-                    struct tu_pipeline_layout *layout, bool compute)
+                    struct tu_pipeline_layout *layout)
 {
    const unsigned load_state_size = 4;
    unsigned size = 0;
@@ -65,13 +65,8 @@ tu6_load_state_size(struct tu_pipeline *pipeline,
       for (unsigned j = 0; j < set_layout->binding_count; j++) {
          struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
          unsigned count = 0;
-         /* Note: some users, like amber for example, pass in
-          * VK_SHADER_STAGE_ALL which includes a bunch of extra bits, so
-          * filter these out by using VK_SHADER_STAGE_ALL_GRAPHICS explicitly.
-          */
-         VkShaderStageFlags stages = compute ?
-            binding->shader_stages & VK_SHADER_STAGE_COMPUTE_BIT :
-            binding->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS;
+         /* See comment in tu6_emit_load_state(). */
+         VkShaderStageFlags stages = pipeline->active_stages & binding->shader_stages;
          unsigned stage_count = util_bitcount(stages);
 
          if (!binding->array_size)
@@ -83,9 +78,7 @@ tu6_load_state_size(struct tu_pipeline *pipeline,
          case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
          case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
             /* IBO-backed resources only need one packet for all graphics stages */
-            if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT)
-               count += 1;
-            if (stages & VK_SHADER_STAGE_COMPUTE_BIT)
+            if (stage_count)
                count += 1;
             break;
          case VK_DESCRIPTOR_TYPE_SAMPLER:
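To make the counting concrete (an illustrative snippet with made-up values;
util_bitcount() is Mesa's population-count helper from util/bitscan.h): a
binding declared for VK_SHADER_STAGE_ALL_GRAPHICS in a pipeline with only
VS and FS active now contributes two per-stage packets instead of five, and
a binding whose stages mask to zero contributes none.

   #include <vulkan/vulkan.h>
   #include "util/bitscan.h"

   static unsigned
   example_stage_count(void)
   {
      VkShaderStageFlags active = VK_SHADER_STAGE_VERTEX_BIT |
                                  VK_SHADER_STAGE_FRAGMENT_BIT;
      /* ALL_GRAPHICS covers five stages, but only the two active ones
       * survive the mask. */
      return util_bitcount(active & VK_SHADER_STAGE_ALL_GRAPHICS); /* == 2 */
   }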
@@ -116,9 +109,9 @@ tu6_load_state_size(struct tu_pipeline *pipeline,
 
 static void
 tu6_emit_load_state(struct tu_pipeline *pipeline,
-                    struct tu_pipeline_layout *layout, bool compute)
+                    struct tu_pipeline_layout *layout)
 {
-   unsigned size = tu6_load_state_size(pipeline, layout, compute);
+   unsigned size = tu6_load_state_size(pipeline, layout);
    if (size == 0)
       return;
 
@@ -150,13 +143,12 @@ tu6_emit_load_state(struct tu_pipeline *pipeline,
          struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
          unsigned base = i;
          unsigned offset = binding->offset / 4;
-         /* Note: some users, like amber for example, pass in
-          * VK_SHADER_STAGE_ALL which includes a bunch of extra bits, so
-          * filter these out by using VK_SHADER_STAGE_ALL_GRAPHICS explicitly.
-          */
-         VkShaderStageFlags stages = compute ?
-            binding->shader_stages & VK_SHADER_STAGE_COMPUTE_BIT :
-            binding->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS;
+         /* Note: amber sets VK_SHADER_STAGE_ALL for its descriptor layout, and
+          * zink has descriptors for each stage in the push layout even if some
+          * stages aren't present in a used pipeline.  We don't want to emit
+          * loads for unused descriptors.
+          */
+         VkShaderStageFlags stages = pipeline->active_stages & binding->shader_stages;
          unsigned count = binding->array_size;
          if (count == 0 || stages == 0)
             continue;
@@ -2395,7 +2387,7 @@ tu_pipeline_allocate_cs(struct tu_device *dev,
                         struct tu_pipeline_builder *builder,
                         struct ir3_shader_variant *compute)
 {
-   uint32_t size = 1024 + tu6_load_state_size(pipeline, layout, compute);
+   uint32_t size = 1024 + tu6_load_state_size(pipeline, layout);
 
    /* graphics case: */
    if (builder) {
@@ -2861,6 +2853,8 @@ tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder,
       gl_shader_stage stage =
          vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage);
       stage_infos[stage] = &builder->create_info->pStages[i];
+
+      pipeline->active_stages |= builder->create_info->pStages[i].stage;
    }
 
    if (tu6_shared_constants_enable(builder->layout, builder->device->compiler)) {
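With active_stages now accumulated as the shader stages are gathered here,
the separate loop later in tu_pipeline_builder_parse_shader_stages becomes
redundant and is deleted in the next hunk. The accumulation is just an OR
over the create-info stages; schematically (a standalone restatement, not
the driver code):

   #include <vulkan/vulkan.h>

   /* Sketch: OR together the stage bit of each pStages entry. */
   static VkShaderStageFlags
   collect_active_stages(const VkGraphicsPipelineCreateInfo *info)
   {
      VkShaderStageFlags active = 0;
      for (uint32_t i = 0; i < info->stageCount; i++)
         active |= info->pStages[i].stage;
      return active;
   }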
@@ -3303,12 +3297,6 @@ tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder,
    tu6_emit_program(&prog_cs, builder, true, pipeline);
    pipeline->program.binning_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);
 
-   VkShaderStageFlags stages = 0;
-   for (unsigned i = 0; i < builder->create_info->stageCount; i++) {
-      stages |= builder->create_info->pStages[i].stage;
-   }
-   pipeline->active_stages = stages;
-
    for (unsigned i = 0; i < ARRAY_SIZE(builder->shaders->variants); i++) {
       if (!builder->shaders->variants[i])
          continue;
@@ -3960,7 +3948,7 @@ tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
    tu_pipeline_builder_parse_depth_stencil(builder, *pipeline);
    tu_pipeline_builder_parse_multisample_and_color_blend(builder, *pipeline);
    tu_pipeline_builder_parse_rasterization_order(builder, *pipeline);
-   tu6_emit_load_state(*pipeline, builder->layout, false);
+   tu6_emit_load_state(*pipeline, builder->layout);
 
    return VK_SUCCESS;
 }
@@ -4218,6 +4206,7 @@ tu_compute_pipeline_create(VkDevice device,
    pipeline->executables_mem_ctx = ralloc_context(NULL);
    util_dynarray_init(&pipeline->executables, pipeline->executables_mem_ctx);
+   pipeline->active_stages = VK_SHADER_STAGE_COMPUTE_BIT;
 
    struct tu_shader_key key = { };
    tu_shader_key_init(&key, stage_info, dev);
 
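Worth flagging (my note, not part of the commit): active_stages lives in the
VkShaderStageFlagBits flag space, the same one as binding->shader_stages, so
the compute path must use VK_SHADER_STAGE_COMPUTE_BIT here. The similarly
named VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT comes from VkPipelineStageFlagBits,
a different flag space, and does not contain the compute shader-stage bit, so
using it would leave compute descriptors unmatched. A compile-time sanity
check:

   #include <vulkan/vulkan.h>

   /* VK_SHADER_STAGE_COMPUTE_BIT (0x20) is what binding->shader_stages
    * holds; VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT (0x800) is a barrier
    * stage flag and shares no bits with it. */
   _Static_assert((VK_SHADER_STAGE_COMPUTE_BIT &
                   VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT) == 0,
                  "shader-stage and pipeline-stage flags are distinct");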
@@ -4336,7 +4325,7 @@ tu_compute_pipeline_create(VkDevice device,
    tu6_emit_cs_config(&prog_cs, v, &pvtmem, shader_iova);
    pipeline->program.state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);
 
-   tu6_emit_load_state(pipeline, layout, true);
+   tu6_emit_load_state(pipeline, layout);
 
    tu_append_executable(pipeline, v, nir_initial_disasm);