panvk/csf: Optimize read-only tile buffer access

When the color/input attachment map is known at compile time, we can
determine the set of read-only render targets and replace .wait with
.wait_resource flows to avoid read-after-read serialization.

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32540>
Boris Brezillon 2025-03-07 15:48:12 +01:00 committed by Marge Bot
parent 4f4ac56145
commit 24b1aa6c28
5 changed files with 126 additions and 23 deletions
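Editor's note: the core idea is small. A render target that a draw reads (as an
input attachment) but never writes is effectively read-only, so consecutive
readers need no read-after-read serialization. A minimal standalone sketch of
that mask computation, with illustrative names (this is not the panvk code):

#include <stdint.h>
#include <stdio.h>

/* Illustrative sketch, not the panvk code: a render target that is
 * read but never written during the draw is read-only, so consecutive
 * readers need no read-after-read serialization. */
static uint8_t
readonly_rt_mask(uint8_t rt_read, uint8_t rt_written)
{
   return rt_read & ~rt_written;
}

int
main(void)
{
   /* RT0 and RT2 are read; RT2 is also written, so only RT0 is RO. */
   printf("read-only RTs: 0x%02x\n", readonly_rt_mask(0x05, 0x04));
   return 0;
}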

View file

@@ -1465,6 +1465,10 @@ prepare_dcd(struct panvk_cmd_buffer *cmdbuf,
struct cs_builder *b =
panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
const struct panvk_shader *fs = get_fs(cmdbuf);
bool dcd2_dirty =
fs_user_dirty(cmdbuf) ||
dyn_gfx_state_dirty(cmdbuf, INPUT_ATTACHMENT_MAP) ||
dyn_gfx_state_dirty(cmdbuf, COLOR_ATTACHMENT_MAP);
bool dcd0_dirty =
dyn_gfx_state_dirty(cmdbuf, RS_RASTERIZER_DISCARD_ENABLE) ||
dyn_gfx_state_dirty(cmdbuf, RS_CULL_MODE) ||
@@ -1486,7 +1490,7 @@ prepare_dcd(struct panvk_cmd_buffer *cmdbuf,
dyn_gfx_state_dirty(cmdbuf, IA_PRIMITIVE_TOPOLOGY) ||
dyn_gfx_state_dirty(cmdbuf, INPUT_ATTACHMENT_MAP) ||
fs_user_dirty(cmdbuf) || gfx_state_dirty(cmdbuf, RENDER_STATE) ||
gfx_state_dirty(cmdbuf, OQ);
gfx_state_dirty(cmdbuf, OQ) || dcd2_dirty;
bool dcd1_dirty = dyn_gfx_state_dirty(cmdbuf, MS_RASTERIZATION_SAMPLES) ||
dyn_gfx_state_dirty(cmdbuf, MS_SAMPLE_MASK) ||
/* line mode needs primitive topology */
@@ -1505,6 +1509,14 @@ prepare_dcd(struct panvk_cmd_buffer *cmdbuf,
bool alpha_to_coverage = dyns->ms.alpha_to_coverage_enable;
bool writes_z = writes_depth(cmdbuf);
bool writes_s = writes_stencil(cmdbuf);
uint8_t rt_mask = cmdbuf->state.gfx.render.bound_attachments &
MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS;
uint8_t rt_written = 0, rt_read = 0;
if (fs) {
rt_written = color_attachment_written_mask(fs, &dyns->cal);
rt_read = color_attachment_read_mask(fs, &dyns->ial, rt_mask);
}
bool msaa = dyns->ms.rasterization_samples > 1;
if ((ia->primitive_topology == VK_PRIMITIVE_TOPOLOGY_LINE_LIST ||
@@ -1524,12 +1536,6 @@ prepare_dcd(struct panvk_cmd_buffer *cmdbuf,
struct mali_dcd_flags_0_packed dcd0;
pan_pack(&dcd0, DCD_FLAGS_0, cfg) {
if (fs) {
uint8_t rt_mask = cmdbuf->state.gfx.render.bound_attachments &
MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS;
uint8_t rt_written = color_attachment_written_mask(
fs, &cmdbuf->vk.dynamic_graphics_state.cal);
uint8_t rt_read =
color_attachment_read_mask(fs, &dyns->ial, rt_mask);
bool zs_read = zs_attachment_read(fs, &dyns->ial);
cfg.allow_forward_pixel_to_kill =
@@ -1594,6 +1600,17 @@ prepare_dcd(struct panvk_cmd_buffer *cmdbuf,
cs_update_vt_ctx(b)
cs_move32_to(b, cs_sr_reg32(b, IDVS, DCD1), dcd1.opaque[0]);
}
if (dcd2_dirty) {
struct mali_dcd_flags_2_packed dcd2;
pan_pack(&dcd2, DCD_FLAGS_2, cfg) {
cfg.read_mask = rt_read;
cfg.write_mask = rt_written;
}
cs_update_vt_ctx(b)
cs_move32_to(b, cs_sr_reg32(b, IDVS, DCD2), dcd2.opaque[0]);
}
}
static void
@@ -1750,9 +1767,6 @@ prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
uint32_t varying_size = get_varying_slots(cmdbuf) * 16;
cs_update_vt_ctx(b) {
/* We don't use the resource dep system yet. */
cs_move32_to(b, cs_sr_reg32(b, IDVS, DCD2), 0);
prepare_index_buffer(cmdbuf, draw);
set_tiler_idvs_flags(b, cmdbuf, draw);
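Editor's note: the hunks above stop zeroing DCD2 unconditionally (the removed
"We don't use the resource dep system yet." path) and instead pack per-RT
read/write masks, re-emitting them only when the fragment shader or one of the
attachment maps changes. A hedged sketch of how such masks might be derived per
draw; the struct and fields below are invented for illustration, while the real
code queries the shader and the dynamic color/input attachment maps:

#include <stdint.h>

/* Invented types for illustration only. */
struct fs_masks {
   uint8_t outputs_written; /* FRAG_RESULT_DATAn bits, after remapping */
   uint8_t inputs_read;     /* color RTs read via input attachments */
};

static void
compute_rt_masks(const struct fs_masks *fs, uint8_t bound_rts,
                 uint8_t *rt_written, uint8_t *rt_read)
{
   /* No fragment shader: nothing is read or written. */
   *rt_written = fs ? (uint8_t)(fs->outputs_written & bound_rts) : 0;
   *rt_read = fs ? (uint8_t)(fs->inputs_read & bound_rts) : 0;
}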

View file

@@ -364,7 +364,7 @@ void panvk_per_arch(nir_lower_descriptors)(
nir_shader *nir, struct panvk_device *dev,
const struct vk_pipeline_robustness_state *rs, uint32_t set_layout_count,
struct vk_descriptor_set_layout *const *set_layouts,
struct panvk_shader *shader);
const struct vk_graphics_pipeline_state *state, struct panvk_shader *shader);
/* This is a stripped-down version of panvk_shader for internal shaders that
* are managed by vk_meta (blend and preload shaders). Those don't need the

View file

@@ -561,6 +561,14 @@ cmd_emit_dcd(struct panvk_cmd_buffer *cmdbuf, struct pan_fb_info *fbinfo,
fill_textures(cmdbuf, fbinfo, key, descs.cpu + PANVK_DESCRIPTOR_SIZE);
uint32_t rt_written = 0;
if (key->aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
for (unsigned i = 0; i < fbinfo->rt_count; i++) {
if (fbinfo->rts[i].preload)
rt_written |= BITFIELD_BIT(i);
}
}
if (key->aspects == VK_IMAGE_ASPECT_COLOR_BIT)
fill_bds(fbinfo, key, bds.cpu);
@@ -646,6 +654,7 @@ cmd_emit_dcd(struct panvk_cmd_buffer *cmdbuf, struct pan_fb_info *fbinfo,
cfg.shader.resources = res_table.gpu | 1;
cfg.shader.shader = panvk_priv_mem_dev_addr(shader->spd);
cfg.shader.thread_storage = cmdbuf->state.gfx.tsd;
cfg.flags_2.write_mask = rt_written;
}
if (key->aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
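Editor's note: in the meta path above, the write mask is derived from which
render targets the preload shader fills. A condensed, self-contained version of
that loop; the struct field is an illustrative stand-in for
fbinfo->rts[i].preload:

#include <stdbool.h>
#include <stdint.h>

#define BITFIELD_BIT(i) (1u << (i))

struct rt_info {
   bool preload; /* stand-in for fbinfo->rts[i].preload */
};

static uint32_t
preload_write_mask(const struct rt_info *rts, unsigned rt_count)
{
   uint32_t rt_written = 0;

   for (unsigned i = 0; i < rt_count; i++) {
      if (rts[i].preload)
         rt_written |= BITFIELD_BIT(i);
   }

   return rt_written;
}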

View file

@@ -829,6 +829,11 @@ get_img_index(nir_builder *b, nir_deref_instr *deref,
}
}
struct panvk_lower_input_attachment_load_ctx {
uint32_t ro_color_mask;
struct panvk_shader *shader;
};
static bool
lower_input_attachment_load(nir_builder *b, nir_intrinsic_instr *intr,
void *data)
@@ -843,7 +848,8 @@ lower_input_attachment_load(nir_builder *b, nir_intrinsic_instr *intr,
image_dim != GLSL_SAMPLER_DIM_SUBPASS_MS)
return false;
struct panvk_shader *shader = data;
const struct panvk_lower_input_attachment_load_ctx *ctx = data;
struct panvk_shader *shader = ctx->shader;
nir_variable *var = nir_deref_instr_get_variable(deref);
assert(var);
@@ -869,12 +875,28 @@ lower_input_attachment_load(nir_builder *b, nir_intrinsic_instr *intr,
{
nir_def *conversion =
nir_load_input_attachment_conv_pan(b, nir_imm_int(b, iam_idx));
nir_def *is_read_only =
nir_i2b(b, nir_iand_imm(b, nir_ishl(b, nir_imm_int(b, 1), target),
ctx->ro_color_mask));
nir_def *load_ro_color, *load_rw_color;
iosem.location = FRAG_RESULT_DATA0;
load_color = nir_load_converted_output_pan(
b, intr->def.num_components, intr->def.bit_size, target,
intr->src[2].ssa, conversion, .dest_type = dest_type,
.io_semantics = iosem);
nir_push_if(b, is_read_only);
{
load_ro_color = nir_load_readonly_output_pan(
b, intr->def.num_components, intr->def.bit_size, target,
intr->src[2].ssa, conversion, .dest_type = dest_type,
.io_semantics = iosem);
}
nir_push_else(b, NULL);
{
load_rw_color = nir_load_converted_output_pan(
b, intr->def.num_components, intr->def.bit_size, target,
intr->src[2].ssa, conversion, .dest_type = dest_type,
.io_semantics = iosem);
}
nir_pop_if(b, NULL);
load_color = nir_if_phi(b, load_ro_color, load_rw_color);
}
nir_push_else(b, NULL);
{
@@ -942,12 +964,71 @@ lower_input_attachment_load(nir_builder *b, nir_intrinsic_instr *intr,
}
static bool
lower_input_attachment_loads(nir_shader *nir, struct panvk_shader *shader)
collect_frag_writes(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
if (intr->intrinsic != nir_intrinsic_store_deref)
return false;
nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
if (deref->modes != nir_var_shader_out)
return false;
nir_variable *var = nir_deref_instr_get_variable(deref);
assert(var);
if (var->data.location < FRAG_RESULT_DATA0 ||
var->data.location > FRAG_RESULT_DATA7)
return false;
uint32_t *written_mask = data;
*written_mask |= BITFIELD_BIT(var->data.location - FRAG_RESULT_DATA0);
return true;
}
static uint32_t
readonly_color_mask(nir_shader *nir,
const struct vk_graphics_pipeline_state *state)
{
if (!state || !state->ial || !state->cal)
return 0;
uint32_t in_mask = 0, out_mask = 0;
for (uint32_t i = 0; i < ARRAY_SIZE(state->ial->color_map); i++) {
if (i >= state->ial->color_attachment_count)
break;
if (state->ial->color_map[i] != MESA_VK_ATTACHMENT_UNUSED)
in_mask |= BITFIELD_BIT(i);
}
NIR_PASS(_, nir, nir_shader_intrinsics_pass, collect_frag_writes,
nir_metadata_all, &out_mask);
for (uint32_t i = 0; i < ARRAY_SIZE(state->cal->color_map); i++) {
if (state->cal->color_map[i] == MESA_VK_ATTACHMENT_UNUSED)
out_mask &= ~BITFIELD_BIT(i);
}
return in_mask & ~out_mask;
}
static bool
lower_input_attachment_loads(nir_shader *nir,
const struct vk_graphics_pipeline_state *state,
struct panvk_shader *shader)
{
bool progress = false;
struct panvk_lower_input_attachment_load_ctx ia_load_ctx = {
.ro_color_mask = readonly_color_mask(nir, state),
.shader = shader,
};
NIR_PASS(progress, nir, nir_shader_intrinsics_pass,
lower_input_attachment_load, nir_metadata_control_flow, shader);
lower_input_attachment_load, nir_metadata_control_flow,
&ia_load_ctx);
/* Lower the remaining input attachment loads. */
struct nir_input_attachment_options lower_input_attach_opts = {
@@ -1364,10 +1445,9 @@ upload_shader_desc_info(struct panvk_device *dev, struct panvk_shader *shader,
void
panvk_per_arch(nir_lower_descriptors)(
nir_shader *nir, struct panvk_device *dev,
const struct vk_pipeline_robustness_state *rs,
uint32_t set_layout_count,
const struct vk_pipeline_robustness_state *rs, uint32_t set_layout_count,
struct vk_descriptor_set_layout *const *set_layouts,
struct panvk_shader *shader)
const struct vk_graphics_pipeline_state *state, struct panvk_shader *shader)
{
struct lower_desc_ctx ctx = {
.shader = shader,
@@ -1415,7 +1495,7 @@ panvk_per_arch(nir_lower_descriptors)(
upload_shader_desc_info(dev, shader, &ctx.desc_info);
if (nir->info.stage == MESA_SHADER_FRAGMENT)
NIR_PASS(progress, nir, lower_input_attachment_loads, shader);
NIR_PASS(progress, nir, lower_input_attachment_loads, state, shader);
NIR_PASS(progress, nir, nir_shader_instructions_pass,
lower_descriptors_instr, nir_metadata_control_flow, &ctx);
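Editor's note: on the NIR side, readonly_color_mask() intersects the
attachments reachable through the input attachment map with the complement of
what the fragment shader stores, and lower_input_attachment_load() then
branches at runtime between a read-only load and the converted (read-write)
load. A condensed model of the mask algebra, assuming identity-mapped
attachment locations; names and the UNUSED sentinel are illustrative:

#include <stdint.h>

#define ATTACHMENT_UNUSED 0xffu
#define BIT(i) (1u << (i))

/* Condensed model of readonly_color_mask(), assuming color attachment i
 * maps to fragment output location i. input_map[i] says whether color
 * attachment i is visible as an input attachment. */
static uint32_t
readonly_mask(const uint8_t *input_map, unsigned attachment_count,
              uint32_t fs_written_mask)
{
   uint32_t in_mask = 0;

   for (unsigned i = 0; i < attachment_count; i++) {
      if (input_map[i] != ATTACHMENT_UNUSED)
         in_mask |= BIT(i);
   }

   /* Read-only: readable as an input attachment, never written. */
   return in_mask & ~fs_written_mask;
}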

View file

@@ -769,7 +769,7 @@ panvk_lower_nir(struct panvk_device *dev, nir_shader *nir,
#endif
panvk_per_arch(nir_lower_descriptors)(nir, dev, rs, set_layout_count,
set_layouts, shader);
set_layouts, state, shader);
NIR_PASS(_, nir, nir_split_var_copies);
NIR_PASS(_, nir, nir_lower_var_copies);