panvk: Push our own blend descriptors

This fixes attachment remapping for KHR_dynamic_rendering_local_read.
See the long comment in panvk_vX_blend.c for more details.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/14484
Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39274>
This commit is contained in:
Faith Ekstrand 2026-01-10 00:17:02 -05:00 committed by Marge Bot
parent f66b48d790
commit aee14878cc
5 changed files with 105 additions and 38 deletions

View file

@ -26,7 +26,6 @@
#include "pan_props.h"
#define MAX_VBS 16
#define MAX_RTS 8
struct panvk_cmd_buffer;
@ -139,6 +138,7 @@ struct panvk_cmd_graphics_state {
struct {
const struct panvk_shader *shader;
struct panvk_shader_desc_state desc;
uint64_t blend_descs[MAX_RTS];
uint64_t push_uniforms;
bool required;
#if PAN_ARCH < 9

View file

@ -26,6 +26,7 @@
extern const struct vk_device_shader_ops panvk_per_arch(device_shader_ops);
#define MAX_RTS 8
#define MAX_VS_ATTRIBS 16
#if PAN_ARCH < 9
@ -155,6 +156,10 @@ struct panvk_graphics_sysvals {
uint32_t noperspective_varyings;
} vs;
struct {
aligned_u64 blend_descs[MAX_RTS];
} fs;
struct panvk_input_attachment_info iam[INPUT_ATTACHMENT_MAP_SIZE];
#if PAN_ARCH < 9

View file

@ -114,16 +114,15 @@ out:
}
static void
emit_blend_desc(const struct pan_shader_info *fs_info, uint64_t fs_code,
const struct pan_blend_state *state, unsigned blend_idx,
unsigned rt_idx, uint64_t blend_shader, uint16_t constant,
emit_blend_desc(const struct pan_blend_state *state, uint8_t rt_idx,
const struct pan_shader_info *fs_info, uint8_t loc,
uint64_t fs_code, uint64_t blend_shader, uint16_t constant,
struct mali_blend_packed *bd)
{
const struct pan_blend_rt_state *rt =
rt_idx != MESA_VK_ATTACHMENT_UNUSED ? &state->rts[rt_idx] : NULL;
const struct pan_blend_rt_state *rt = &state->rts[rt_idx];
pan_pack(bd, BLEND, cfg) {
if (!state->rt_count || !rt || !rt->equation.color_mask) {
if (loc == MESA_VK_ATTACHMENT_UNUSED || !rt->equation.color_mask) {
cfg.enable = false;
cfg.internal.mode = MALI_BLEND_MODE_OFF;
continue;
@ -146,7 +145,7 @@ emit_blend_desc(const struct pan_shader_info *fs_info, uint64_t fs_code,
cfg.internal.shader.pc = (uint32_t)blend_shader;
#if PAN_ARCH < 9
uint32_t ret_offset = fs_info->bifrost.blend[blend_idx].return_offset;
uint32_t ret_offset = fs_info->bifrost.blend[loc].return_offset;
/* If ret_offset is zero, we assume the BLEND is a terminal
* instruction and set return_value to zero, to let the
@ -186,7 +185,7 @@ emit_blend_desc(const struct pan_shader_info *fs_info, uint64_t fs_code,
cfg.internal.fixed_function.rt = rt_idx;
#if PAN_ARCH < 9
nir_alu_type type = fs_info->bifrost.blend[blend_idx].type;
nir_alu_type type = fs_info->bifrost.blend[loc].type;
if (fs_info->fs.untyped_color_outputs) {
cfg.internal.fixed_function.conversion.register_format =
GENX(pan_fixup_blend_type)(type, rt->format);
@ -285,7 +284,7 @@ panvk_per_arch(blend_emit_descs)(struct panvk_cmd_buffer *cmdbuf,
.alpha_to_one = dyns->ms.alpha_to_one_enable,
.logicop_enable = cb->logic_op_enable,
.logicop_func = vk_logic_op_to_pipe(cb->logic_op),
.rt_count = cb->attachment_count,
.rt_count = cmdbuf->state.gfx.render.fb.info.rt_count,
.constants =
{
cb->blend_constants[0],
@ -297,31 +296,28 @@ panvk_per_arch(blend_emit_descs)(struct panvk_cmd_buffer *cmdbuf,
uint64_t blend_shaders[8] = {};
/* All bits set to one encodes unused fixed-function blend constant. */
unsigned ff_blend_constant = ~0;
uint8_t remap_catts[MAX_RTS] = {
MESA_VK_ATTACHMENT_UNUSED, MESA_VK_ATTACHMENT_UNUSED,
MESA_VK_ATTACHMENT_UNUSED, MESA_VK_ATTACHMENT_UNUSED,
MESA_VK_ATTACHMENT_UNUSED, MESA_VK_ATTACHMENT_UNUSED,
MESA_VK_ATTACHMENT_UNUSED, MESA_VK_ATTACHMENT_UNUSED,
};
uint32_t blend_count = MAX2(cmdbuf->state.gfx.render.fb.info.rt_count, 1);
static_assert(ARRAY_SIZE(remap_catts) <= ARRAY_SIZE(cal->color_map),
"vk_color_attachment_location_state::color_map is too small");
for (uint32_t i = 0; i < ARRAY_SIZE(remap_catts); i++) {
if (cal->color_map[i] != MESA_VK_ATTACHMENT_UNUSED) {
assert(cal->color_map[i] < MAX_RTS);
remap_catts[cal->color_map[i]] = i;
}
}
uint8_t loc_rt[MAX_RTS], rt_loc[MAX_RTS];
memset(loc_rt, MESA_VK_ATTACHMENT_UNUSED, sizeof(loc_rt));
memset(rt_loc, MESA_VK_ATTACHMENT_UNUSED, sizeof(rt_loc));
memset(blend_info, 0, sizeof(*blend_info));
for (uint8_t i = 0; i < cb->attachment_count; i++) {
for (uint8_t i = 0; i < bs.rt_count; i++) {
struct pan_blend_rt_state *rt = &bs.rts[i];
if (cal->color_map[i] == MESA_VK_ATTACHMENT_UNUSED)
uint8_t loc = cal->color_map[i];
if (loc == MESA_VK_ATTACHMENT_UNUSED)
continue;
if (!(fs_info->outputs_written & BITFIELD_BIT(FRAG_RESULT_DATA0 + loc)))
continue;
/* At this point, we know it's mapped to a shader location. */
assert(loc < MAX_RTS && loc_rt[loc] == MESA_VK_ATTACHMENT_UNUSED);
rt_loc[i] = loc;
loc_rt[loc] = i;
if (!(cb->color_write_enables & BITFIELD_BIT(i)))
continue;
@ -365,7 +361,7 @@ panvk_per_arch(blend_emit_descs)(struct panvk_cmd_buffer *cmdbuf,
blend_info->any_dest_read |= pan_blend_reads_dest(rt->equation);
if (blend_needs_shader(&bs, i, &ff_blend_constant)) {
nir_alu_type src0_type = fs_info->bifrost.blend[i].type;
nir_alu_type src0_type = fs_info->bifrost.blend[loc].type;
nir_alu_type src1_type = fs_info->bifrost.blend_src1_type;
VkResult result = get_blend_shader(dev, &bs, src0_type, src1_type,
@ -383,18 +379,66 @@ panvk_per_arch(blend_emit_descs)(struct panvk_cmd_buffer *cmdbuf,
if (ff_blend_constant == ~0)
ff_blend_constant = 0;
/* Now that we've collected all the information, we can emit. */
for (uint8_t i = 0; i < blend_count; i++) {
uint32_t catt_idx = remap_catts[i];
uint64_t blend_shader =
catt_idx != MESA_VK_ATTACHMENT_UNUSED ? blend_shaders[catt_idx] : 0;
emit_blend_desc(fs_info, fs_code, &bs, i, catt_idx,
blend_shader, ff_blend_constant, &bds[i]);
struct mali_blend_packed packed[MAX_RTS];
for (uint8_t rt = 0; rt < blend_count; rt++) {
emit_blend_desc(&bs, rt, fs_info, rt_loc[rt], fs_code,
blend_shaders[rt], ff_blend_constant, &packed[rt]);
}
if (blend_info->shader_loads_blend_const)
gfx_state_set_dirty(cmdbuf, FS_PUSH_UNIFORMS);
/* Copy into the GPU descriptor array */
typed_memcpy(bds, packed, blend_count);
/* Re-order blend descriptors for the shader
*
* Blending on Bifrost+ is really annoying. In theory, we can order the
* blend descriptors any way we want and, in theory, they all have an RT
* index. However, that's not really the way blending works. If every
* blend descriptor is either disabled or has the RT index that's the same
* as the blend descriptor index, everything is fine. If not, we're in for
* a bit of trouble.
*
* The RT index is only really consumed by the BLEND instruction to tell it
* what RT to target. The FF blend hardware, however, assumes that blend
* RTs map 1:1 to framebuffer RTs. If the BLEND instruction kicks off a
* blend shader, everything is good because the blend shader has the
* correct RT index baked into it. But if the BLEND fires off a message to
* the blend unit, it then looks up the blend descriptor by RT index,
* ignoring which blend descriptor FAU we read from the shader. Say, for
* instance bds[3].rt == 5. If we BLEND using blend_descriptor_3, the
* BLEND instruction will see an RT of 5 and kick off a blend message. The
* blend unit will then fetch bds[5] and use that to do the actual blend.
*
* We could try to do something crazy where we divide blend descriptors
* into a shader half and a HW half and only re-arrange the shader half so
* that the hardware's little daisy-chain game works. However, there are a
* few fields that are used by both, notably the conversion, so this is
* dead in the water.
*
* The solution, then, is to push our own blend descriptors. Or at least
* the shader half (everything in .internal). This lets us leave the HW
* blend descriptors with a 1:1 mapping to framebuffer RTs and re-order
* things from the shader PoV. The BLEND instructions in the shader will
* use the ones we pushed manually and then, when a blend message gets sent
* to the hardware, it'll pick up an idential blend descriptor in the RT
* index location in the bds[] array.
*/
for (uint8_t loc = 0; loc < MAX_RTS; loc++) {
uint8_t rt = loc_rt[loc];
if (rt == MESA_VK_ATTACHMENT_UNUSED) {
struct mali_blend_packed disabled;
pan_pack(&disabled, BLEND, cfg) {
cfg.enable = false;
cfg.internal.mode = MALI_BLEND_MODE_OFF;
}
cmdbuf->state.gfx.fs.blend_descs[loc] =
disabled.opaque[2] | (uint64_t)disabled.opaque[3] << 32;
} else {
cmdbuf->state.gfx.fs.blend_descs[loc] =
packed[rt].opaque[2] | (uint64_t)packed[rt].opaque[3] << 32;
}
}
gfx_state_set_dirty(cmdbuf, FS_PUSH_UNIFORMS);
return VK_SUCCESS;
}

View file

@ -733,6 +733,11 @@ panvk_per_arch(cmd_prepare_draw_sysvals)(struct panvk_cmd_buffer *cmdbuf,
}
}
for (unsigned i = 0; i < MAX_RTS; i++) {
set_gfx_sysval(cmdbuf, dirty_sysvals, fs.blend_descs[i],
cmdbuf->state.gfx.fs.blend_descs[i]);
}
if (dyn_gfx_state_dirty(cmdbuf, VP_VIEWPORTS) ||
dyn_gfx_state_dirty(cmdbuf, VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE) ||
dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLIP_ENABLE) ||

View file

@ -146,6 +146,12 @@ panvk_lower_sysvals(nir_builder *b, nir_instr *instr, void *data)
val = load_sysval(b, common, bit_size, printf_buffer_address);
break;
case nir_intrinsic_load_blend_descriptor_pan: {
uint32_t loc = nir_intrinsic_base(intr);
val = load_sysval(b, graphics, bit_size, fs.blend_descs[loc]);
break;
}
case nir_intrinsic_load_input_attachment_target_pan: {
const struct vk_input_attachment_location_state *ial =
ctx->state ? ctx->state->ial : NULL;
@ -1400,6 +1406,13 @@ panvk_compile_shader(struct panvk_device *dev,
nir_assign_io_var_locations(nir, nir_var_shader_out);
panvk_lower_nir_io(nir);
/* Lower FS outputs now so that we can lower load_blend_descriptor_pan
* to a driver-provided FAU instead of using the blend descriptors
* uploaded by the hardware. See panvk_vX_blend.c for details.
*/
NIR_PASS(_, nir, nir_opt_constant_folding);
NIR_PASS(_, nir, pan_nir_lower_fs_outputs, false);
variant->own_bin = true;
result = panvk_compile_nir(dev, nir, info->flags, &inputs, state,