From 24b1aa6c28d7a22376bffcdf05028b5f4b2210e0 Mon Sep 17 00:00:00 2001
From: Boris Brezillon
Date: Fri, 7 Mar 2025 15:48:12 +0100
Subject: [PATCH] panvk/csf: Optimize read-only tile buffer access

When the color/input attachment map is known at compile time, we can
determine the set of read-only render targets and replace .wait by
.wait_resource flows, in order to avoid read-after-read serialization.

Signed-off-by: Boris Brezillon
Reviewed-by: Lars-Ivar Hesselberg Simonsen
Part-of: 
---
 src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c   |  34 ++++--
 src/panfrost/vulkan/panvk_shader.h            |   2 +-
 src/panfrost/vulkan/panvk_vX_cmd_fb_preload.c |   9 ++
 .../vulkan/panvk_vX_nir_lower_descriptors.c   | 102 ++++++++++++++++--
 src/panfrost/vulkan/panvk_vX_shader.c         |   2 +-
 5 files changed, 126 insertions(+), 23 deletions(-)

diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
index 2960faa76e8..5c2cc302b57 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
@@ -1465,6 +1465,10 @@ prepare_dcd(struct panvk_cmd_buffer *cmdbuf,
    struct cs_builder *b =
       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
    const struct panvk_shader *fs = get_fs(cmdbuf);
+   bool dcd2_dirty =
+      fs_user_dirty(cmdbuf) ||
+      dyn_gfx_state_dirty(cmdbuf, INPUT_ATTACHMENT_MAP) ||
+      dyn_gfx_state_dirty(cmdbuf, COLOR_ATTACHMENT_MAP);
    bool dcd0_dirty =
       dyn_gfx_state_dirty(cmdbuf, RS_RASTERIZER_DISCARD_ENABLE) ||
       dyn_gfx_state_dirty(cmdbuf, RS_CULL_MODE) ||
@@ -1486,7 +1490,7 @@ prepare_dcd(struct panvk_cmd_buffer *cmdbuf,
       dyn_gfx_state_dirty(cmdbuf, IA_PRIMITIVE_TOPOLOGY) ||
       dyn_gfx_state_dirty(cmdbuf, INPUT_ATTACHMENT_MAP) ||
       fs_user_dirty(cmdbuf) || gfx_state_dirty(cmdbuf, RENDER_STATE) ||
-      gfx_state_dirty(cmdbuf, OQ);
+      gfx_state_dirty(cmdbuf, OQ) || dcd2_dirty;
    bool dcd1_dirty = dyn_gfx_state_dirty(cmdbuf, MS_RASTERIZATION_SAMPLES) ||
                      dyn_gfx_state_dirty(cmdbuf, MS_SAMPLE_MASK) ||
                      /* line mode needs primitive topology */
@@ -1505,6 +1509,14 @@ prepare_dcd(struct panvk_cmd_buffer *cmdbuf,
    bool alpha_to_coverage = dyns->ms.alpha_to_coverage_enable;
    bool writes_z = writes_depth(cmdbuf);
    bool writes_s = writes_stencil(cmdbuf);
+   uint8_t rt_mask = cmdbuf->state.gfx.render.bound_attachments &
+                     MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS;
+   uint8_t rt_written = 0, rt_read = 0;
+
+   if (fs) {
+      rt_written = color_attachment_written_mask(fs, &dyns->cal);
+      rt_read = color_attachment_read_mask(fs, &dyns->ial, rt_mask);
+   }
    bool msaa = dyns->ms.rasterization_samples > 1;
 
    if ((ia->primitive_topology == VK_PRIMITIVE_TOPOLOGY_LINE_LIST ||
@@ -1524,12 +1536,6 @@ prepare_dcd(struct panvk_cmd_buffer *cmdbuf,
    struct mali_dcd_flags_0_packed dcd0;
    pan_pack(&dcd0, DCD_FLAGS_0, cfg) {
       if (fs) {
-         uint8_t rt_mask = cmdbuf->state.gfx.render.bound_attachments &
-                           MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS;
-         uint8_t rt_written = color_attachment_written_mask(
-            fs, &cmdbuf->vk.dynamic_graphics_state.cal);
-         uint8_t rt_read =
-            color_attachment_read_mask(fs, &dyns->ial, rt_mask);
          bool zs_read = zs_attachment_read(fs, &dyns->ial);
 
          cfg.allow_forward_pixel_to_kill =
@@ -1594,6 +1600,17 @@ prepare_dcd(struct panvk_cmd_buffer *cmdbuf,
       cs_update_vt_ctx(b)
          cs_move32_to(b, cs_sr_reg32(b, IDVS, DCD1), dcd1.opaque[0]);
    }
+
+   if (dcd2_dirty) {
+      struct mali_dcd_flags_2_packed dcd2;
+      pan_pack(&dcd2, DCD_FLAGS_2, cfg) {
+         cfg.read_mask = rt_read;
+         cfg.write_mask = rt_written;
+      }
+
+      cs_update_vt_ctx(b)
+         cs_move32_to(b, cs_sr_reg32(b, IDVS, DCD2), dcd2.opaque[0]);
+   }
 }
 
 static void
@@ -1750,9 +1767,6 @@ prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
    uint32_t varying_size = get_varying_slots(cmdbuf) * 16;
 
    cs_update_vt_ctx(b) {
-      /* We don't use the resource dep system yet. */
-      cs_move32_to(b, cs_sr_reg32(b, IDVS, DCD2), 0);
-
       prepare_index_buffer(cmdbuf, draw);
 
       set_tiler_idvs_flags(b, cmdbuf, draw);
diff --git a/src/panfrost/vulkan/panvk_shader.h b/src/panfrost/vulkan/panvk_shader.h
index 06169ac8083..395c58d5fc7 100644
--- a/src/panfrost/vulkan/panvk_shader.h
+++ b/src/panfrost/vulkan/panvk_shader.h
@@ -364,7 +364,7 @@ void panvk_per_arch(nir_lower_descriptors)(
    nir_shader *nir, struct panvk_device *dev,
    const struct vk_pipeline_robustness_state *rs, uint32_t set_layout_count,
    struct vk_descriptor_set_layout *const *set_layouts,
-   struct panvk_shader *shader);
+   const struct vk_graphics_pipeline_state *state, struct panvk_shader *shader);
 
 /* This a stripped-down version of panvk_shader for internal shaders that
  * are managed by vk_meta (blend and preload shaders). Those don't need the
diff --git a/src/panfrost/vulkan/panvk_vX_cmd_fb_preload.c b/src/panfrost/vulkan/panvk_vX_cmd_fb_preload.c
index 0ea891c303c..e4065d60c64 100644
--- a/src/panfrost/vulkan/panvk_vX_cmd_fb_preload.c
+++ b/src/panfrost/vulkan/panvk_vX_cmd_fb_preload.c
@@ -561,6 +561,14 @@ cmd_emit_dcd(struct panvk_cmd_buffer *cmdbuf, struct pan_fb_info *fbinfo,
    fill_textures(cmdbuf, fbinfo, key, descs.cpu + PANVK_DESCRIPTOR_SIZE);
 
+   uint32_t rt_written = 0;
+   if (key->aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
+      for (unsigned i = 0; i < fbinfo->rt_count; i++) {
+         if (fbinfo->rts[i].preload)
+            rt_written |= BITFIELD_BIT(i);
+      }
+   }
+
    if (key->aspects == VK_IMAGE_ASPECT_COLOR_BIT)
       fill_bds(fbinfo, key, bds.cpu);
 
@@ -646,6 +654,7 @@ cmd_emit_dcd(struct panvk_cmd_buffer *cmdbuf, struct pan_fb_info *fbinfo,
       cfg.shader.resources = res_table.gpu | 1;
       cfg.shader.shader = panvk_priv_mem_dev_addr(shader->spd);
       cfg.shader.thread_storage = cmdbuf->state.gfx.tsd;
+      cfg.flags_2.write_mask = rt_written;
    }
 
    if (key->aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
diff --git a/src/panfrost/vulkan/panvk_vX_nir_lower_descriptors.c b/src/panfrost/vulkan/panvk_vX_nir_lower_descriptors.c
index aada6522ac0..afbf440677f 100644
--- a/src/panfrost/vulkan/panvk_vX_nir_lower_descriptors.c
+++ b/src/panfrost/vulkan/panvk_vX_nir_lower_descriptors.c
@@ -829,6 +829,11 @@ get_img_index(nir_builder *b, nir_deref_instr *deref,
    }
 }
 
+struct panvk_lower_input_attachment_load_ctx {
+   uint32_t ro_color_mask;
+   struct panvk_shader *shader;
+};
+
 static bool
 lower_input_attachment_load(nir_builder *b, nir_intrinsic_instr *intr,
                             void *data)
@@ -843,7 +848,8 @@ lower_input_attachment_load(nir_builder *b, nir_intrinsic_instr *intr,
        image_dim != GLSL_SAMPLER_DIM_SUBPASS_MS)
       return false;
 
-   struct panvk_shader *shader = data;
+   const struct panvk_lower_input_attachment_load_ctx *ctx = data;
+   struct panvk_shader *shader = ctx->shader;
    nir_variable *var = nir_deref_instr_get_variable(deref);
 
    assert(var);
@@ -869,12 +875,28 @@ lower_input_attachment_load(nir_builder *b, nir_intrinsic_instr *intr,
    {
       nir_def *conversion =
         nir_load_input_attachment_conv_pan(b, nir_imm_int(b, iam_idx));
+      nir_def *is_read_only =
+         nir_i2b(b, nir_iand_imm(b, nir_ishl(b, nir_imm_int(b, 1), target),
+                                 ctx->ro_color_mask));
+      nir_def *load_ro_color, *load_rw_color;
 
       iosem.location = FRAG_RESULT_DATA0;
-      load_color = nir_load_converted_output_pan(
-         b, intr->def.num_components, intr->def.bit_size, target,
-         intr->src[2].ssa, conversion, .dest_type = dest_type,
-         .io_semantics = iosem);
+      nir_push_if(b, is_read_only);
+      {
+         load_ro_color = nir_load_readonly_output_pan(
+            b, intr->def.num_components, intr->def.bit_size, target,
+            intr->src[2].ssa, conversion, .dest_type = dest_type,
+            .io_semantics = iosem);
+      }
+      nir_push_else(b, NULL);
+      {
+         load_rw_color = nir_load_converted_output_pan(
+            b, intr->def.num_components, intr->def.bit_size, target,
+            intr->src[2].ssa, conversion, .dest_type = dest_type,
+            .io_semantics = iosem);
+      }
+      nir_pop_if(b, NULL);
+      load_color = nir_if_phi(b, load_ro_color, load_rw_color);
    }
    nir_push_else(b, NULL);
    {
@@ -942,12 +964,71 @@ lower_input_attachment_load(nir_builder *b, nir_intrinsic_instr *intr,
 }
 
 static bool
-lower_input_attachment_loads(nir_shader *nir, struct panvk_shader *shader)
+collect_frag_writes(nir_builder *b, nir_intrinsic_instr *intr, void *data)
+{
+   if (intr->intrinsic != nir_intrinsic_store_deref)
+      return false;
+
+   nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
+
+   if (deref->modes != nir_var_shader_out)
+      return false;
+
+   nir_variable *var = nir_deref_instr_get_variable(deref);
+   assert(var);
+
+   if (var->data.location < FRAG_RESULT_DATA0 ||
+       var->data.location > FRAG_RESULT_DATA7)
+      return false;
+
+   uint32_t *written_mask = data;
+
+   *written_mask |= BITFIELD_BIT(var->data.location - FRAG_RESULT_DATA0);
+   return true;
+}
+
+static uint32_t
+readonly_color_mask(nir_shader *nir,
+                    const struct vk_graphics_pipeline_state *state)
+{
+   if (!state || !state->ial || !state->cal)
+      return 0;
+
+   uint32_t in_mask = 0, out_mask = 0;
+
+   for (uint32_t i = 0; i < ARRAY_SIZE(state->ial->color_map); i++) {
+      if (i >= state->ial->color_attachment_count)
+         break;
+
+      if (state->ial->color_map[i] != MESA_VK_ATTACHMENT_UNUSED)
+         in_mask |= BITFIELD_BIT(i);
+   }
+
+   NIR_PASS(_, nir, nir_shader_intrinsics_pass, collect_frag_writes,
+            nir_metadata_all, &out_mask);
+
+   for (uint32_t i = 0; i < ARRAY_SIZE(state->cal->color_map); i++) {
+      if (state->cal->color_map[i] == MESA_VK_ATTACHMENT_UNUSED)
+         out_mask &= ~BITFIELD_BIT(i);
+   }
+
+   return in_mask & ~out_mask;
+}
+
+static bool
+lower_input_attachment_loads(nir_shader *nir,
+                             const struct vk_graphics_pipeline_state *state,
+                             struct panvk_shader *shader)
 {
    bool progress = false;
+   struct panvk_lower_input_attachment_load_ctx ia_load_ctx = {
+      .ro_color_mask = readonly_color_mask(nir, state),
+      .shader = shader,
+   };
 
    NIR_PASS(progress, nir, nir_shader_intrinsics_pass,
-            lower_input_attachment_load, nir_metadata_control_flow, shader);
+            lower_input_attachment_load, nir_metadata_control_flow,
+            &ia_load_ctx);
 
    /* Lower the remaining input attachment loads. */
    struct nir_input_attachment_options lower_input_attach_opts = {
@@ -1364,10 +1445,9 @@ upload_shader_desc_info(struct panvk_device *dev, struct panvk_shader *shader,
 void panvk_per_arch(nir_lower_descriptors)(
    nir_shader *nir, struct panvk_device *dev,
-   const struct vk_pipeline_robustness_state *rs,
-   uint32_t set_layout_count,
+   const struct vk_pipeline_robustness_state *rs, uint32_t set_layout_count,
    struct vk_descriptor_set_layout *const *set_layouts,
-   struct panvk_shader *shader)
+   const struct vk_graphics_pipeline_state *state, struct panvk_shader *shader)
 {
    struct lower_desc_ctx ctx = {
       .shader = shader,
@@ -1415,7 +1495,7 @@ panvk_per_arch(nir_lower_descriptors)(
    upload_shader_desc_info(dev, shader, &ctx.desc_info);
 
    if (nir->info.stage == MESA_SHADER_FRAGMENT)
-      NIR_PASS(progress, nir, lower_input_attachment_loads, shader);
+      NIR_PASS(progress, nir, lower_input_attachment_loads, state, shader);
 
    NIR_PASS(progress, nir, nir_shader_instructions_pass,
             lower_descriptors_instr, nir_metadata_control_flow, &ctx);
diff --git a/src/panfrost/vulkan/panvk_vX_shader.c b/src/panfrost/vulkan/panvk_vX_shader.c
index 209c0f00e93..411dec4244a 100644
--- a/src/panfrost/vulkan/panvk_vX_shader.c
+++ b/src/panfrost/vulkan/panvk_vX_shader.c
@@ -769,7 +769,7 @@ panvk_lower_nir(struct panvk_device *dev, nir_shader *nir,
 #endif
 
    panvk_per_arch(nir_lower_descriptors)(nir, dev, rs, set_layout_count,
-                                         set_layouts, shader);
+                                         set_layouts, state, shader);
 
    NIR_PASS(_, nir, nir_split_var_copies);
    NIR_PASS(_, nir, nir_lower_var_copies);
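
Note (not part of the patch): a minimal standalone sketch of the bit
arithmetic that readonly_color_mask() boils down to. The helper name and
the plain bitmask parameters are hypothetical stand-ins for the
vk_graphics_pipeline_state lookups: a render target is treated as
read-only when it is mapped as an input attachment and the fragment
shader never writes it through the color attachment map.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for readonly_color_mask(): in_mask has a bit set
 * for every render target mapped as an input attachment, out_mask for
 * every render target the fragment shader actually writes. */
static uint32_t
ro_color_mask(uint32_t in_mask, uint32_t out_mask)
{
   /* Read-only = readable as an input attachment and never written. */
   return in_mask & ~out_mask;
}

int
main(void)
{
   /* RT0 and RT1 are bound as input attachments, only RT0 is written:
    * RT1 ends up read-only, so its tile buffer load can take the
    * read-only flow and skip read-after-read serialization. */
   printf("ro mask: 0x%x\n", ro_color_mask(0x3, 0x1)); /* prints 0x2 (RT1) */
   return 0;
}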