panfrost: Enable more than 16 varyings on v9+

This change removes the limit of 16 varyings caused by the 8-bit offset
value used in LD_VAR_BUF[_IMM]. LD_VAR[_IMM] is used instead, and the
necessary attribute descriptors (ADs) are emitted at draw time.

Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com>
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34074>
This commit is contained in:
Christoph Pillmayer 2025-03-13 15:58:38 +00:00 committed by Marge Bot
parent 85b6bd989e
commit cd2ca0ac22
9 changed files with 88 additions and 32 deletions

View file

@ -1,4 +1,5 @@
/*
* Copyright (C) 2025 Arm Ltd.
* Copyright (C) 2023 Amazon.com, Inc. or its affiliates.
* Copyright (C) 2018 Alyssa Rosenzweig
* Copyright (C) 2020 Collabora Ltd.
@ -2875,6 +2876,57 @@ panfrost_update_streamout_offsets(struct panfrost_context *ctx)
(PAN_DIRTY_ZS | PAN_DIRTY_BLEND | PAN_DIRTY_MSAA | PAN_DIRTY_RASTERIZER | \
PAN_DIRTY_OQ)
#if PAN_ARCH >= 9
/*
 * Emit the attribute descriptors (ADs) through which the fragment shader
 * loads its varyings.
 *
 * One ATTRIBUTE descriptor is allocated from the batch pool for every
 * fragment input slot (generic inputs plus the special varyings recorded in
 * the fragment shader's fixed_varyings mask). The number of descriptors is
 * stored in batch->nr_varying_attribs[PIPE_SHADER_FRAGMENT].
 *
 * Each descriptor's byte offset is derived from the position of the varying
 * in the *vertex* shader's output layout: special varyings (location below
 * VARYING_SLOT_VAR0) come first, ordered by their bit position in the vertex
 * shader's fixed_varyings mask, followed by the generic varyings indexed
 * from VARYING_SLOT_VAR0. Every slot is 16 bytes wide.
 *
 * Returns the GPU address of the descriptor array, or 0 if the descriptor
 * allocation failed (in which case the recorded descriptor count is 0).
 */
static uint64_t
panfrost_emit_varying_descriptors(struct panfrost_batch *batch)
{
   struct panfrost_compiled_shader *vs =
      batch->ctx->prog[PIPE_SHADER_VERTEX];
   struct panfrost_compiled_shader *fs =
      batch->ctx->prog[PIPE_SHADER_FRAGMENT];

   const uint32_t vs_out_mask = vs->info.varyings.fixed_varyings;
   const uint32_t fs_in_mask = fs->info.varyings.fixed_varyings;
   const uint32_t fs_in_slots =
      fs->info.varyings.input_count + util_bitcount(fs_in_mask);

   struct panfrost_ptr bufs =
      pan_pool_alloc_desc_array(&batch->pool.base, fs_in_slots, ATTRIBUTE);
   struct mali_attribute_packed *descs = bufs.cpu;

   /* Do not pack into a NULL mapping if the pool allocation failed; publish
    * an empty table instead so nothing claims descriptors that do not exist.
    */
   if (!descs) {
      batch->nr_varying_attribs[PIPE_SHADER_FRAGMENT] = 0;
      return 0;
   }

   batch->nr_varying_attribs[PIPE_SHADER_FRAGMENT] = fs_in_slots;

   const uint32_t varying_size = panfrost_vertex_attribute_stride(vs, fs);

   /* Number of special varyings written by the vertex shader; generic
    * varyings are laid out right after them. Loop-invariant, so compute it
    * once.
    */
   const unsigned nr_special = util_bitcount(vs_out_mask);

   for (uint32_t i = 0; i < fs_in_slots; i++) {
      const struct pan_shader_varying *var = &fs->info.varyings.input[i];

      uint32_t index = 0;
      if (var->location >= VARYING_SLOT_VAR0) {
         unsigned general_index = (var->location - VARYING_SLOT_VAR0);
         index = nr_special + general_index;
      } else {
         /* Rank of this special varying among the ones the VS writes. */
         index = util_bitcount(vs_out_mask & BITFIELD_MASK(var->location));
      }

      pan_pack(&descs[i], ATTRIBUTE, cfg) {
         cfg.attribute_type = MALI_ATTRIBUTE_TYPE_VERTEX_PACKET;
         cfg.offset_enable = false;
         cfg.format = GENX(panfrost_format_from_pipe_format)(var->format)->hw;
         /* NOTE(review): table 61 and the 1024-byte base offset are
          * hardware/ABI constants whose meaning is not visible from this
          * function -- confirm against the Valhall descriptor layout.
          */
         cfg.table = 61;
         cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX;
         cfg.offset = 1024 + (index * 16);
         cfg.buffer_index = 0;
         cfg.attribute_stride = varying_size;
         cfg.packet_stride = varying_size + 16;
      }
   }

   return bufs.gpu;
}
#endif
static inline void
panfrost_update_shader_state(struct panfrost_batch *batch,
enum pipe_shader_type st)
@ -2904,6 +2956,9 @@ panfrost_update_shader_state(struct panfrost_batch *batch,
}
#if PAN_ARCH >= 9
if ((dirty & PAN_DIRTY_STAGE_SHADER) && frag)
batch->attribs[st] = panfrost_emit_varying_descriptors(batch);
if (dirty & PAN_DIRTY_STAGE_IMAGE) {
batch->images[st] =
ctx->image_mask[st] ? panfrost_emit_images(batch, st) : 0;

View file

@ -273,7 +273,7 @@ panfrost_vertex_attribute_stride(struct panfrost_compiled_shader *vs,
unsigned v = vs->info.varyings.output_count;
unsigned f = fs->info.varyings.input_count;
unsigned slots = MAX2(v, f);
slots += util_bitcount(fs->key.fs.fixed_varying_mask);
slots += util_bitcount(vs->info.varyings.fixed_varyings);
/* Assumes 16 byte slots. We could do better. */
return slots * 16;
@ -310,7 +310,11 @@ panfrost_emit_resources(struct panfrost_batch *batch,
panfrost_make_resource_table(T, PAN_TABLE_IMAGE, batch->images[stage],
util_last_bit(ctx->image_mask[stage]));
if (stage == PIPE_SHADER_VERTEX) {
if (stage == PIPE_SHADER_FRAGMENT) {
panfrost_make_resource_table(T, PAN_TABLE_ATTRIBUTE,
batch->attribs[stage],
batch->nr_varying_attribs[PIPE_SHADER_FRAGMENT]);
} else if (stage == PIPE_SHADER_VERTEX) {
panfrost_make_resource_table(T, PAN_TABLE_ATTRIBUTE,
batch->attribs[stage],
ctx->vertex->num_elements);

View file

@ -346,9 +346,6 @@ struct panfrost_fs_key {
/* Number of colour buffers if gl_FragColor is written */
unsigned nr_cbufs_for_fragcolor;
/* On Valhall, fixed_varying_mask of the linked vertex shader */
uint32_t fixed_varying_mask;
/* Midgard shaders that read the tilebuffer must be keyed for
* non-blendable formats
*/

View file

@ -133,6 +133,7 @@ struct panfrost_batch {
unsigned nr_push_uniforms[PIPE_SHADER_TYPES];
unsigned nr_uniform_buffers[PIPE_SHADER_TYPES];
unsigned nr_varying_attribs[PIPE_SHADER_TYPES];
/* Varying related pointers */
struct {

View file

@ -77,15 +77,12 @@ static bool
lower_input_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
const struct panfrost_compile_inputs *inputs)
{
/* We always use heap-based varying allocation when IDVS is used on Valhall. */
bool malloc_idvs = !inputs->no_idvs;
/* All vertex attributes come from the attribute table.
* Fragment inputs come from the attribute table too, unless they've
* been allocated on the heap.
*/
if (b->shader->info.stage == MESA_SHADER_VERTEX ||
(b->shader->info.stage == MESA_SHADER_FRAGMENT && !malloc_idvs)) {
b->shader->info.stage == MESA_SHADER_FRAGMENT) {
nir_intrinsic_set_base(
intrin,
pan_res_handle(PAN_TABLE_ATTRIBUTE, nir_intrinsic_base(intrin)));
@ -131,6 +128,7 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin,
case nir_intrinsic_image_texel_address:
return lower_image_intrin(b, intrin);
case nir_intrinsic_load_input:
case nir_intrinsic_load_interpolated_input:
return lower_input_intrin(b, intrin, inputs);
case nir_intrinsic_load_ubo:
return lower_load_ubo_intrin(b, intrin);

View file

@ -364,7 +364,7 @@ panfrost_init_shader_caps(struct panfrost_screen *screen)
caps->max_tex_indirections = 16384; /* arbitrary */
caps->max_control_flow_depth = 1024; /* arbitrary */
/* Used as ABI on Midgard */
caps->max_inputs = 16;
caps->max_inputs = dev->arch >= 9 ? 32 : 16;
caps->max_outputs = i == PIPE_SHADER_FRAGMENT ? 8 : PIPE_MAX_ATTRIBS;
caps->max_temps = 256; /* arbitrary */
caps->max_const_buffer0_size = 16 * 1024 * sizeof(float);
@ -638,7 +638,7 @@ panfrost_init_screen_caps(struct panfrost_screen *screen)
caps->shader_buffer_offset_alignment = 4;
caps->max_varyings = dev->arch >= 9 ? 16 : 32;
caps->max_varyings = 32;
/* Removed in v6 (Bifrost) */
caps->gl_clamp =

View file

@ -136,19 +136,21 @@ panfrost_shader_compile(struct panfrost_screen *screen, const nir_shader *ir,
.push_uniforms = true,
};
if (dev->arch >= 9)
/* Use LD_VAR_BUF for varying lookups. */
inputs.valhall.use_ld_var_buf = true;
/* Lower this early so the backends don't have to worry about it */
if (s->info.stage == MESA_SHADER_FRAGMENT) {
inputs.fixed_varying_mask = key->fs.fixed_varying_mask;
} else if (s->info.stage == MESA_SHADER_VERTEX) {
inputs.fixed_varying_mask = fixed_varying_mask;
unsigned fixed_varying_mask =
(ir->info.inputs_read & BITFIELD_MASK(VARYING_SLOT_VAR0)) &
~VARYING_BIT_POS & ~VARYING_BIT_PSIZ;
inputs.fixed_varying_mask = fixed_varying_mask;
} else if (s->info.stage == MESA_SHADER_VERTEX) {
/* No IDVS for internal XFB shaders */
inputs.no_idvs = s->info.has_transform_feedback_varyings;
inputs.fixed_varying_mask =
(ir->info.outputs_written & BITFIELD_MASK(VARYING_SLOT_VAR0)) &
~VARYING_BIT_POS & ~VARYING_BIT_PSIZ;
if (s->info.has_transform_feedback_varyings) {
NIR_PASS(_, s, nir_io_add_const_offset_to_base,
nir_var_shader_in | nir_var_shader_out);
@ -293,7 +295,6 @@ panfrost_build_fs_key(struct panfrost_context *ctx,
struct panfrost_device *dev = pan_device(ctx->base.screen);
struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
struct pipe_rasterizer_state *rast = (void *)ctx->rasterizer;
struct panfrost_uncompiled_shader *vs = ctx->uncompiled[MESA_SHADER_VERTEX];
/* gl_FragColor lowering needs the number of colour buffers */
if (uncompiled->fragcolor_lowered) {
@ -326,12 +327,6 @@ panfrost_build_fs_key(struct panfrost_context *ctx,
key->rt_formats[i] = fmt;
}
}
/* Funny desktop GL varying lowering on Valhall */
if (dev->arch >= 9) {
assert(vs != NULL && "too early");
key->fixed_varying_mask = vs->fixed_varying_mask;
}
}
static void
@ -471,13 +466,6 @@ panfrost_create_shader_state(struct pipe_context *pctx,
so->stream_output = cso->stream_output;
so->nir = nir;
/* Fix linkage early */
if (so->nir->info.stage == MESA_SHADER_VERTEX) {
so->fixed_varying_mask =
(so->nir->info.outputs_written & BITFIELD_MASK(VARYING_SLOT_VAR0)) &
~VARYING_BIT_POS & ~VARYING_BIT_PSIZ;
}
/* gl_FragColor needs to be lowered before lowering I/O, do that now */
if (nir->info.stage == MESA_SHADER_FRAGMENT &&
nir->info.outputs_written & BITFIELD_BIT(FRAG_RESULT_COLOR)) {

View file

@ -146,6 +146,11 @@ GENX(pan_shader_compile)(nir_shader *s, struct panfrost_compile_inputs *inputs,
#if PAN_ARCH >= 9
info->varyings.output_count =
util_last_bit(s->info.outputs_written >> VARYING_SLOT_VAR0);
/* Store the mask of special varyings, in case we need to emit ADs later. */
info->varyings.fixed_varyings =
(s->info.outputs_written & BITFIELD_MASK(VARYING_SLOT_VAR0)) &
~VARYING_BIT_POS & ~VARYING_BIT_PSIZ;
#endif
break;
case MESA_SHADER_FRAGMENT:
@ -195,6 +200,11 @@ GENX(pan_shader_compile)(nir_shader *s, struct panfrost_compile_inputs *inputs,
#if PAN_ARCH >= 9
info->varyings.input_count =
util_last_bit(s->info.inputs_read >> VARYING_SLOT_VAR0);
/* Store the mask of special varyings, in case we need to emit ADs later. */
info->varyings.fixed_varyings =
(s->info.inputs_read & BITFIELD_MASK(VARYING_SLOT_VAR0)) &
~VARYING_BIT_POS & ~VARYING_BIT_PSIZ;
#endif
break;
default:

View file

@ -294,6 +294,9 @@ struct pan_shader_info {
/* Bitfield of noperspective varyings, starting at VARYING_SLOT_VAR0 */
uint32_t noperspective;
/* Bitfield of special varyings. */
uint32_t fixed_varyings;
} varyings;
/* UBOs to push to Register Mapped Uniforms (Midgard) or Fast Access