From aee14878cc5efc9f1d7a2cd489e917522cd92b24 Mon Sep 17 00:00:00 2001 From: Faith Ekstrand Date: Sat, 10 Jan 2026 00:17:02 -0500 Subject: [PATCH] panvk: Push our own blend descriptors This fixes attachment remapping for KHR_dynamic_rendering_local_read. See the long comment in panvk_vX_blend.c for more details. Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/14484 Reviewed-by: Lars-Ivar Hesselberg Simonsen Part-of: --- src/panfrost/vulkan/panvk_cmd_draw.h | 2 +- src/panfrost/vulkan/panvk_shader.h | 5 + src/panfrost/vulkan/panvk_vX_blend.c | 118 ++++++++++++++++-------- src/panfrost/vulkan/panvk_vX_cmd_draw.c | 5 + src/panfrost/vulkan/panvk_vX_shader.c | 13 +++ 5 files changed, 105 insertions(+), 38 deletions(-) diff --git a/src/panfrost/vulkan/panvk_cmd_draw.h b/src/panfrost/vulkan/panvk_cmd_draw.h index 304df7584a5..33c299335f5 100644 --- a/src/panfrost/vulkan/panvk_cmd_draw.h +++ b/src/panfrost/vulkan/panvk_cmd_draw.h @@ -26,7 +26,6 @@ #include "pan_props.h" #define MAX_VBS 16 -#define MAX_RTS 8 struct panvk_cmd_buffer; @@ -139,6 +138,7 @@ struct panvk_cmd_graphics_state { struct { const struct panvk_shader *shader; struct panvk_shader_desc_state desc; + uint64_t blend_descs[MAX_RTS]; uint64_t push_uniforms; bool required; #if PAN_ARCH < 9 diff --git a/src/panfrost/vulkan/panvk_shader.h b/src/panfrost/vulkan/panvk_shader.h index d4951df3ff8..aac73645b8b 100644 --- a/src/panfrost/vulkan/panvk_shader.h +++ b/src/panfrost/vulkan/panvk_shader.h @@ -26,6 +26,7 @@ extern const struct vk_device_shader_ops panvk_per_arch(device_shader_ops); +#define MAX_RTS 8 #define MAX_VS_ATTRIBS 16 #if PAN_ARCH < 9 @@ -155,6 +156,10 @@ struct panvk_graphics_sysvals { uint32_t noperspective_varyings; } vs; + struct { + aligned_u64 blend_descs[MAX_RTS]; + } fs; + struct panvk_input_attachment_info iam[INPUT_ATTACHMENT_MAP_SIZE]; #if PAN_ARCH < 9 diff --git a/src/panfrost/vulkan/panvk_vX_blend.c b/src/panfrost/vulkan/panvk_vX_blend.c index 099468be0d2..505a7213f1c 100644 --- a/src/panfrost/vulkan/panvk_vX_blend.c +++ b/src/panfrost/vulkan/panvk_vX_blend.c @@ -114,16 +114,15 @@ out: } static void -emit_blend_desc(const struct pan_shader_info *fs_info, uint64_t fs_code, - const struct pan_blend_state *state, unsigned blend_idx, - unsigned rt_idx, uint64_t blend_shader, uint16_t constant, +emit_blend_desc(const struct pan_blend_state *state, uint8_t rt_idx, + const struct pan_shader_info *fs_info, uint8_t loc, + uint64_t fs_code, uint64_t blend_shader, uint16_t constant, struct mali_blend_packed *bd) { - const struct pan_blend_rt_state *rt = - rt_idx != MESA_VK_ATTACHMENT_UNUSED ? &state->rts[rt_idx] : NULL; + const struct pan_blend_rt_state *rt = &state->rts[rt_idx]; pan_pack(bd, BLEND, cfg) { - if (!state->rt_count || !rt || !rt->equation.color_mask) { + if (loc == MESA_VK_ATTACHMENT_UNUSED || !rt->equation.color_mask) { cfg.enable = false; cfg.internal.mode = MALI_BLEND_MODE_OFF; continue; @@ -146,7 +145,7 @@ emit_blend_desc(const struct pan_shader_info *fs_info, uint64_t fs_code, cfg.internal.shader.pc = (uint32_t)blend_shader; #if PAN_ARCH < 9 - uint32_t ret_offset = fs_info->bifrost.blend[blend_idx].return_offset; + uint32_t ret_offset = fs_info->bifrost.blend[loc].return_offset; /* If ret_offset is zero, we assume the BLEND is a terminal * instruction and set return_value to zero, to let the @@ -186,7 +185,7 @@ emit_blend_desc(const struct pan_shader_info *fs_info, uint64_t fs_code, cfg.internal.fixed_function.rt = rt_idx; #if PAN_ARCH < 9 - nir_alu_type type = fs_info->bifrost.blend[blend_idx].type; + nir_alu_type type = fs_info->bifrost.blend[loc].type; if (fs_info->fs.untyped_color_outputs) { cfg.internal.fixed_function.conversion.register_format = GENX(pan_fixup_blend_type)(type, rt->format); @@ -285,7 +284,7 @@ panvk_per_arch(blend_emit_descs)(struct panvk_cmd_buffer *cmdbuf, .alpha_to_one = dyns->ms.alpha_to_one_enable, .logicop_enable = cb->logic_op_enable, .logicop_func = vk_logic_op_to_pipe(cb->logic_op), - .rt_count = cb->attachment_count, + .rt_count = cmdbuf->state.gfx.render.fb.info.rt_count, .constants = { cb->blend_constants[0], @@ -297,31 +296,28 @@ panvk_per_arch(blend_emit_descs)(struct panvk_cmd_buffer *cmdbuf, uint64_t blend_shaders[8] = {}; /* All bits set to one encodes unused fixed-function blend constant. */ unsigned ff_blend_constant = ~0; - uint8_t remap_catts[MAX_RTS] = { - MESA_VK_ATTACHMENT_UNUSED, MESA_VK_ATTACHMENT_UNUSED, - MESA_VK_ATTACHMENT_UNUSED, MESA_VK_ATTACHMENT_UNUSED, - MESA_VK_ATTACHMENT_UNUSED, MESA_VK_ATTACHMENT_UNUSED, - MESA_VK_ATTACHMENT_UNUSED, MESA_VK_ATTACHMENT_UNUSED, - }; uint32_t blend_count = MAX2(cmdbuf->state.gfx.render.fb.info.rt_count, 1); - static_assert(ARRAY_SIZE(remap_catts) <= ARRAY_SIZE(cal->color_map), - "vk_color_attachment_location_state::color_map is too small"); - - for (uint32_t i = 0; i < ARRAY_SIZE(remap_catts); i++) { - if (cal->color_map[i] != MESA_VK_ATTACHMENT_UNUSED) { - assert(cal->color_map[i] < MAX_RTS); - remap_catts[cal->color_map[i]] = i; - } - } + uint8_t loc_rt[MAX_RTS], rt_loc[MAX_RTS]; + memset(loc_rt, MESA_VK_ATTACHMENT_UNUSED, sizeof(loc_rt)); + memset(rt_loc, MESA_VK_ATTACHMENT_UNUSED, sizeof(rt_loc)); memset(blend_info, 0, sizeof(*blend_info)); - for (uint8_t i = 0; i < cb->attachment_count; i++) { + for (uint8_t i = 0; i < bs.rt_count; i++) { struct pan_blend_rt_state *rt = &bs.rts[i]; - if (cal->color_map[i] == MESA_VK_ATTACHMENT_UNUSED) + uint8_t loc = cal->color_map[i]; + if (loc == MESA_VK_ATTACHMENT_UNUSED) continue; + if (!(fs_info->outputs_written & BITFIELD_BIT(FRAG_RESULT_DATA0 + loc))) + continue; + + /* At this point, we know it's mapped to a shader location. */ + assert(loc < MAX_RTS && loc_rt[loc] == MESA_VK_ATTACHMENT_UNUSED); + rt_loc[i] = loc; + loc_rt[loc] = i; + if (!(cb->color_write_enables & BITFIELD_BIT(i))) continue; @@ -365,7 +361,7 @@ panvk_per_arch(blend_emit_descs)(struct panvk_cmd_buffer *cmdbuf, blend_info->any_dest_read |= pan_blend_reads_dest(rt->equation); if (blend_needs_shader(&bs, i, &ff_blend_constant)) { - nir_alu_type src0_type = fs_info->bifrost.blend[i].type; + nir_alu_type src0_type = fs_info->bifrost.blend[loc].type; nir_alu_type src1_type = fs_info->bifrost.blend_src1_type; VkResult result = get_blend_shader(dev, &bs, src0_type, src1_type, @@ -383,18 +379,66 @@ panvk_per_arch(blend_emit_descs)(struct panvk_cmd_buffer *cmdbuf, if (ff_blend_constant == ~0) ff_blend_constant = 0; - /* Now that we've collected all the information, we can emit. */ - for (uint8_t i = 0; i < blend_count; i++) { - uint32_t catt_idx = remap_catts[i]; - uint64_t blend_shader = - catt_idx != MESA_VK_ATTACHMENT_UNUSED ? blend_shaders[catt_idx] : 0; - - emit_blend_desc(fs_info, fs_code, &bs, i, catt_idx, - blend_shader, ff_blend_constant, &bds[i]); + struct mali_blend_packed packed[MAX_RTS]; + for (uint8_t rt = 0; rt < blend_count; rt++) { + emit_blend_desc(&bs, rt, fs_info, rt_loc[rt], fs_code, + blend_shaders[rt], ff_blend_constant, &packed[rt]); } - if (blend_info->shader_loads_blend_const) - gfx_state_set_dirty(cmdbuf, FS_PUSH_UNIFORMS); + /* Copy into the GPU descriptor array */ + typed_memcpy(bds, packed, blend_count); + + /* Re-order blend descriptors for the shader + * + * Blending on Bifrost+ is really annoying. In theory, we can order the + * blend descriptors any way we want and, in theory, they all have an RT + * index. However, that's not really the way blending works. If every + * blend descriptor is either disabled or has the RT index that's the same + * as the blend descriptor index, everything is fine. If not, we're in for + * a bit of trouble. + * + * The RT index is only really consumed by the BLEND instruction to tell it + * what RT to target. The FF blend hardware, however, assumes that blend + * RTs map 1:1 to framebuffer RTs. If the BLEND instruction kicks off a + * blend shader, everything is good because the blend shader has the + * correct RT index baked into it. But if the BLEND fires off a message to + * the blend unit, it then looks up the blend descriptor by RT index, + * ignoring which blend descriptor FAU we read from the shader. Say, for + * instance bds[3].rt == 5. If we BLEND using blend_descriptor_3, the + * BLEND instruction will see an RT of 5 and kick off a blend message. The + * blend unit will then fetch bds[5] and use that to do the actual blend. + * + * We could try to do something crazy where we divide blend descriptors + * into a shader half and a HW half and only re-arrange the shader half so + * that the hardware's little daisy-chain game works. However, there are a + * few fields that are used by both, notably the conversion, so this is + * dead in the water. + * + * The solution, then, is to push our own blend descriptors. Or at least + * the shader half (everything in .internal). This lets us leave the HW + * blend descriptors with a 1:1 mapping to framebuffer RTs and re-order + * things from the shader PoV. The BLEND instructions in the shader will + * use the ones we pushed manually and then, when a blend message gets sent + * to the hardware, it'll pick up an idential blend descriptor in the RT + * index location in the bds[] array. + */ + for (uint8_t loc = 0; loc < MAX_RTS; loc++) { + uint8_t rt = loc_rt[loc]; + if (rt == MESA_VK_ATTACHMENT_UNUSED) { + struct mali_blend_packed disabled; + pan_pack(&disabled, BLEND, cfg) { + cfg.enable = false; + cfg.internal.mode = MALI_BLEND_MODE_OFF; + } + cmdbuf->state.gfx.fs.blend_descs[loc] = + disabled.opaque[2] | (uint64_t)disabled.opaque[3] << 32; + } else { + cmdbuf->state.gfx.fs.blend_descs[loc] = + packed[rt].opaque[2] | (uint64_t)packed[rt].opaque[3] << 32; + } + } + + gfx_state_set_dirty(cmdbuf, FS_PUSH_UNIFORMS); return VK_SUCCESS; } diff --git a/src/panfrost/vulkan/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/panvk_vX_cmd_draw.c index 70fbe90f06e..aed4d5d615c 100644 --- a/src/panfrost/vulkan/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/panvk_vX_cmd_draw.c @@ -733,6 +733,11 @@ panvk_per_arch(cmd_prepare_draw_sysvals)(struct panvk_cmd_buffer *cmdbuf, } } + for (unsigned i = 0; i < MAX_RTS; i++) { + set_gfx_sysval(cmdbuf, dirty_sysvals, fs.blend_descs[i], + cmdbuf->state.gfx.fs.blend_descs[i]); + } + if (dyn_gfx_state_dirty(cmdbuf, VP_VIEWPORTS) || dyn_gfx_state_dirty(cmdbuf, VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE) || dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLIP_ENABLE) || diff --git a/src/panfrost/vulkan/panvk_vX_shader.c b/src/panfrost/vulkan/panvk_vX_shader.c index b8b4522e4e5..380a374336c 100644 --- a/src/panfrost/vulkan/panvk_vX_shader.c +++ b/src/panfrost/vulkan/panvk_vX_shader.c @@ -146,6 +146,12 @@ panvk_lower_sysvals(nir_builder *b, nir_instr *instr, void *data) val = load_sysval(b, common, bit_size, printf_buffer_address); break; + case nir_intrinsic_load_blend_descriptor_pan: { + uint32_t loc = nir_intrinsic_base(intr); + val = load_sysval(b, graphics, bit_size, fs.blend_descs[loc]); + break; + } + case nir_intrinsic_load_input_attachment_target_pan: { const struct vk_input_attachment_location_state *ial = ctx->state ? ctx->state->ial : NULL; @@ -1400,6 +1406,13 @@ panvk_compile_shader(struct panvk_device *dev, nir_assign_io_var_locations(nir, nir_var_shader_out); panvk_lower_nir_io(nir); + /* Lower FS outputs now so that we can lower load_blend_descriptor_pan + * to a driver-provided FAU instead of using the blend descriptors + * uploaded by the hardware. See panvk_vX_blend.c for details. + */ + NIR_PASS(_, nir, nir_opt_constant_folding); + NIR_PASS(_, nir, pan_nir_lower_fs_outputs, false); + variant->own_bin = true; result = panvk_compile_nir(dev, nir, info->flags, &inputs, state,