From cd2ca0ac222f50fad989229906170e4d1acdc1fc Mon Sep 17 00:00:00 2001 From: Christoph Pillmayer Date: Thu, 13 Mar 2025 15:58:38 +0000 Subject: [PATCH] panfrost: Enable more than 16 varyings on v9+ This change removes the limit of 16 varyings caused by the 8-bit offset value used in LD_VAR_BUF[_IMM]. LD_VAR[_IMM] is used instead and the necessary ADs are emitted at draw time. Reviewed-by: Lars-Ivar Hesselberg Simonsen Reviewed-by: Boris Brezillon Part-of: --- src/gallium/drivers/panfrost/pan_cmdstream.c | 55 +++++++++++++++++++ src/gallium/drivers/panfrost/pan_cmdstream.h | 8 ++- src/gallium/drivers/panfrost/pan_context.h | 3 - src/gallium/drivers/panfrost/pan_job.h | 1 + .../panfrost/pan_nir_lower_res_indices.c | 6 +- src/gallium/drivers/panfrost/pan_screen.c | 4 +- src/gallium/drivers/panfrost/pan_shader.c | 30 +++------- src/panfrost/lib/pan_shader.c | 10 ++++ src/panfrost/util/pan_ir.h | 3 + 9 files changed, 88 insertions(+), 32 deletions(-) diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c index a9b89f1e99c..bdde1da6397 100644 --- a/src/gallium/drivers/panfrost/pan_cmdstream.c +++ b/src/gallium/drivers/panfrost/pan_cmdstream.c @@ -1,4 +1,5 @@ /* + * Copyright (C) 2025 Arm Ltd. * Copyright (C) 2023 Amazon.com, Inc. or its affiliates. * Copyright (C) 2018 Alyssa Rosenzweig * Copyright (C) 2020 Collabora Ltd. 
@@ -2875,6 +2876,57 @@ panfrost_update_streamout_offsets(struct panfrost_context *ctx)
    (PAN_DIRTY_ZS | PAN_DIRTY_BLEND | PAN_DIRTY_MSAA | PAN_DIRTY_RASTERIZER | \
     PAN_DIRTY_OQ)
 
+#if PAN_ARCH >= 9
+static uint64_t /* returns bufs.gpu, the GPU address of the descriptor array */
+panfrost_emit_varying_descriptors(struct panfrost_batch *batch)
+{ /* Packs one ATTRIBUTE descriptor per fragment-shader varying input slot. */
+   struct panfrost_compiled_shader *vs =
+      batch->ctx->prog[PIPE_SHADER_VERTEX];
+   struct panfrost_compiled_shader *fs =
+      batch->ctx->prog[PIPE_SHADER_FRAGMENT];
+   /* Per-shader masks of special varyings (slots below VARYING_SLOT_VAR0). */
+   const uint32_t vs_out_mask = vs->info.varyings.fixed_varyings;
+   const uint32_t fs_in_mask = fs->info.varyings.fixed_varyings;
+   const uint32_t fs_in_slots = fs->info.varyings.input_count +
+                                util_bitcount(fs_in_mask);
+   /* NOTE(review): bufs.cpu is used unchecked - confirm this cannot fail. */
+   struct panfrost_ptr bufs =
+      pan_pool_alloc_desc_array(&batch->pool.base, fs_in_slots, ATTRIBUTE);
+   struct mali_attribute_packed *descs = bufs.cpu;
+   /* Descriptor count, stored on the batch next to the table pointer. */
+   batch->nr_varying_attribs[PIPE_SHADER_FRAGMENT] = fs_in_slots;
+   /* Same stride value is written into every descriptor below. */
+   const uint32_t varying_size = panfrost_vertex_attribute_stride(vs, fs);
+   /* input[i] supplies the location and format of descriptor i. */
+   for (uint32_t i = 0; i < fs_in_slots; i++) {
+      const struct pan_shader_varying *var = &fs->info.varyings.input[i];
+      /* index: special varyings ranked by VS mask bits; general ones follow. */
+      uint32_t index = 0;
+      if (var->location >= VARYING_SLOT_VAR0) {
+         unsigned nr_special = util_bitcount(vs_out_mask);
+         unsigned general_index = (var->location - VARYING_SLOT_VAR0);
+         index = nr_special + general_index;
+      } else {
+         index = util_bitcount(vs_out_mask & BITFIELD_MASK(var->location));
+      }
+      /* NOTE(review): 61, 1024 and 16 are unexplained here - confirm and name. */
+      pan_pack(&descs[i], ATTRIBUTE, cfg) {
+         cfg.attribute_type = MALI_ATTRIBUTE_TYPE_VERTEX_PACKET;
+         cfg.offset_enable = false;
+         cfg.format = GENX(panfrost_format_from_pipe_format)(var->format)->hw;
+         cfg.table = 61;
+         cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX;
+         cfg.offset = 1024 + (index * 16);
+         cfg.buffer_index = 0;
+         cfg.attribute_stride = varying_size;
+         cfg.packet_stride = varying_size + 16;
+      }
+   }
+   /* Hand the GPU-side address of the packed descriptors to the caller. */
+   return bufs.gpu;
+}
+#endif /* PAN_ARCH >= 9 */
+
 static inline void
 panfrost_update_shader_state(struct panfrost_batch *batch,
                              enum pipe_shader_type st)
@@ -2904,6 +2956,9 @@
panfrost_update_shader_state(struct panfrost_batch *batch, } #if PAN_ARCH >= 9 + if ((dirty & PAN_DIRTY_STAGE_SHADER) && frag) + batch->attribs[st] = panfrost_emit_varying_descriptors(batch); + if (dirty & PAN_DIRTY_STAGE_IMAGE) { batch->images[st] = ctx->image_mask[st] ? panfrost_emit_images(batch, st) : 0; diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.h b/src/gallium/drivers/panfrost/pan_cmdstream.h index 3623595c0e7..51d655610c7 100644 --- a/src/gallium/drivers/panfrost/pan_cmdstream.h +++ b/src/gallium/drivers/panfrost/pan_cmdstream.h @@ -273,7 +273,7 @@ panfrost_vertex_attribute_stride(struct panfrost_compiled_shader *vs, unsigned v = vs->info.varyings.output_count; unsigned f = fs->info.varyings.input_count; unsigned slots = MAX2(v, f); - slots += util_bitcount(fs->key.fs.fixed_varying_mask); + slots += util_bitcount(vs->info.varyings.fixed_varyings); /* Assumes 16 byte slots. We could do better. */ return slots * 16; @@ -310,7 +310,11 @@ panfrost_emit_resources(struct panfrost_batch *batch, panfrost_make_resource_table(T, PAN_TABLE_IMAGE, batch->images[stage], util_last_bit(ctx->image_mask[stage])); - if (stage == PIPE_SHADER_VERTEX) { + if (stage == PIPE_SHADER_FRAGMENT) { + panfrost_make_resource_table(T, PAN_TABLE_ATTRIBUTE, + batch->attribs[stage], + batch->nr_varying_attribs[PIPE_SHADER_FRAGMENT]); + } else if (stage == PIPE_SHADER_VERTEX) { panfrost_make_resource_table(T, PAN_TABLE_ATTRIBUTE, batch->attribs[stage], ctx->vertex->num_elements); diff --git a/src/gallium/drivers/panfrost/pan_context.h b/src/gallium/drivers/panfrost/pan_context.h index 6835d39de49..84e2c350ad2 100644 --- a/src/gallium/drivers/panfrost/pan_context.h +++ b/src/gallium/drivers/panfrost/pan_context.h @@ -346,9 +346,6 @@ struct panfrost_fs_key { /* Number of colour buffers if gl_FragColor is written */ unsigned nr_cbufs_for_fragcolor; - /* On Valhall, fixed_varying_mask of the linked vertex shader */ - uint32_t fixed_varying_mask; - /* Midgard shaders that read the 
tilebuffer must be keyed for * non-blendable formats */ diff --git a/src/gallium/drivers/panfrost/pan_job.h b/src/gallium/drivers/panfrost/pan_job.h index 8ab9bca2147..e089ad85800 100644 --- a/src/gallium/drivers/panfrost/pan_job.h +++ b/src/gallium/drivers/panfrost/pan_job.h @@ -133,6 +133,7 @@ struct panfrost_batch { unsigned nr_push_uniforms[PIPE_SHADER_TYPES]; unsigned nr_uniform_buffers[PIPE_SHADER_TYPES]; + unsigned nr_varying_attribs[PIPE_SHADER_TYPES]; /* Varying related pointers */ struct { diff --git a/src/gallium/drivers/panfrost/pan_nir_lower_res_indices.c b/src/gallium/drivers/panfrost/pan_nir_lower_res_indices.c index 1d7ccf83565..52dc3efc64a 100644 --- a/src/gallium/drivers/panfrost/pan_nir_lower_res_indices.c +++ b/src/gallium/drivers/panfrost/pan_nir_lower_res_indices.c @@ -77,15 +77,12 @@ static bool lower_input_intrin(nir_builder *b, nir_intrinsic_instr *intrin, const struct panfrost_compile_inputs *inputs) { - /* We always use heap-based varying allocation when IDVS is used on Valhall. */ - bool malloc_idvs = !inputs->no_idvs; - /* All vertex attributes come from the attribute table. * Fragment inputs come from the attribute table too, unless they've * been allocated on the heap. 
*/ if (b->shader->info.stage == MESA_SHADER_VERTEX || - (b->shader->info.stage == MESA_SHADER_FRAGMENT && !malloc_idvs)) { + b->shader->info.stage == MESA_SHADER_FRAGMENT) { nir_intrinsic_set_base( intrin, pan_res_handle(PAN_TABLE_ATTRIBUTE, nir_intrinsic_base(intrin))); @@ -131,6 +128,7 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin, case nir_intrinsic_image_texel_address: return lower_image_intrin(b, intrin); case nir_intrinsic_load_input: + case nir_intrinsic_load_interpolated_input: return lower_input_intrin(b, intrin, inputs); case nir_intrinsic_load_ubo: return lower_load_ubo_intrin(b, intrin); diff --git a/src/gallium/drivers/panfrost/pan_screen.c b/src/gallium/drivers/panfrost/pan_screen.c index dcdcfa6a3af..ed9c01b8ad0 100644 --- a/src/gallium/drivers/panfrost/pan_screen.c +++ b/src/gallium/drivers/panfrost/pan_screen.c @@ -364,7 +364,7 @@ panfrost_init_shader_caps(struct panfrost_screen *screen) caps->max_tex_indirections = 16384; /* arbitrary */ caps->max_control_flow_depth = 1024; /* arbitrary */ /* Used as ABI on Midgard */ - caps->max_inputs = 16; + caps->max_inputs = dev->arch >= 9 ? 32 : 16; caps->max_outputs = i == PIPE_SHADER_FRAGMENT ? 8 : PIPE_MAX_ATTRIBS; caps->max_temps = 256; /* arbitrary */ caps->max_const_buffer0_size = 16 * 1024 * sizeof(float); @@ -638,7 +638,7 @@ panfrost_init_screen_caps(struct panfrost_screen *screen) caps->shader_buffer_offset_alignment = 4; - caps->max_varyings = dev->arch >= 9 ? 16 : 32; + caps->max_varyings = 32; /* Removed in v6 (Bifrost) */ caps->gl_clamp = diff --git a/src/gallium/drivers/panfrost/pan_shader.c b/src/gallium/drivers/panfrost/pan_shader.c index 5654969785e..0f1f6253435 100644 --- a/src/gallium/drivers/panfrost/pan_shader.c +++ b/src/gallium/drivers/panfrost/pan_shader.c @@ -136,19 +136,21 @@ panfrost_shader_compile(struct panfrost_screen *screen, const nir_shader *ir, .push_uniforms = true, }; - if (dev->arch >= 9) - /* Use LD_VAR_BUF for varying lookups. 
*/ - inputs.valhall.use_ld_var_buf = true; - /* Lower this early so the backends don't have to worry about it */ if (s->info.stage == MESA_SHADER_FRAGMENT) { - inputs.fixed_varying_mask = key->fs.fixed_varying_mask; - } else if (s->info.stage == MESA_SHADER_VERTEX) { - inputs.fixed_varying_mask = fixed_varying_mask; + unsigned fixed_varying_mask = + (ir->info.inputs_read & BITFIELD_MASK(VARYING_SLOT_VAR0)) & + ~VARYING_BIT_POS & ~VARYING_BIT_PSIZ; + inputs.fixed_varying_mask = fixed_varying_mask; + } else if (s->info.stage == MESA_SHADER_VERTEX) { /* No IDVS for internal XFB shaders */ inputs.no_idvs = s->info.has_transform_feedback_varyings; + inputs.fixed_varying_mask = + (ir->info.outputs_written & BITFIELD_MASK(VARYING_SLOT_VAR0)) & + ~VARYING_BIT_POS & ~VARYING_BIT_PSIZ; + if (s->info.has_transform_feedback_varyings) { NIR_PASS(_, s, nir_io_add_const_offset_to_base, nir_var_shader_in | nir_var_shader_out); @@ -293,7 +295,6 @@ panfrost_build_fs_key(struct panfrost_context *ctx, struct panfrost_device *dev = pan_device(ctx->base.screen); struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer; struct pipe_rasterizer_state *rast = (void *)ctx->rasterizer; - struct panfrost_uncompiled_shader *vs = ctx->uncompiled[MESA_SHADER_VERTEX]; /* gl_FragColor lowering needs the number of colour buffers */ if (uncompiled->fragcolor_lowered) { @@ -326,12 +327,6 @@ panfrost_build_fs_key(struct panfrost_context *ctx, key->rt_formats[i] = fmt; } } - - /* Funny desktop GL varying lowering on Valhall */ - if (dev->arch >= 9) { - assert(vs != NULL && "too early"); - key->fixed_varying_mask = vs->fixed_varying_mask; - } } static void @@ -471,13 +466,6 @@ panfrost_create_shader_state(struct pipe_context *pctx, so->stream_output = cso->stream_output; so->nir = nir; - /* Fix linkage early */ - if (so->nir->info.stage == MESA_SHADER_VERTEX) { - so->fixed_varying_mask = - (so->nir->info.outputs_written & BITFIELD_MASK(VARYING_SLOT_VAR0)) & - ~VARYING_BIT_POS & ~VARYING_BIT_PSIZ; - } - 
/* gl_FragColor needs to be lowered before lowering I/O, do that now */ if (nir->info.stage == MESA_SHADER_FRAGMENT && nir->info.outputs_written & BITFIELD_BIT(FRAG_RESULT_COLOR)) { diff --git a/src/panfrost/lib/pan_shader.c b/src/panfrost/lib/pan_shader.c index 24801086bdb..1393a6063e0 100644 --- a/src/panfrost/lib/pan_shader.c +++ b/src/panfrost/lib/pan_shader.c @@ -146,6 +146,11 @@ GENX(pan_shader_compile)(nir_shader *s, struct panfrost_compile_inputs *inputs, #if PAN_ARCH >= 9 info->varyings.output_count = util_last_bit(s->info.outputs_written >> VARYING_SLOT_VAR0); + + /* Store the mask of special varyings, in case we need to emit ADs later. */ + info->varyings.fixed_varyings = + (s->info.outputs_written & BITFIELD_MASK(VARYING_SLOT_VAR0)) & + ~VARYING_BIT_POS & ~VARYING_BIT_PSIZ; #endif break; case MESA_SHADER_FRAGMENT: @@ -195,6 +200,11 @@ GENX(pan_shader_compile)(nir_shader *s, struct panfrost_compile_inputs *inputs, #if PAN_ARCH >= 9 info->varyings.input_count = util_last_bit(s->info.inputs_read >> VARYING_SLOT_VAR0); + + /* Store the mask of special varyings, in case we need to emit ADs later. */ + info->varyings.fixed_varyings = + (s->info.inputs_read & BITFIELD_MASK(VARYING_SLOT_VAR0)) & + ~VARYING_BIT_POS & ~VARYING_BIT_PSIZ; #endif break; default: diff --git a/src/panfrost/util/pan_ir.h b/src/panfrost/util/pan_ir.h index 62539472e07..fb44bb2a0c8 100644 --- a/src/panfrost/util/pan_ir.h +++ b/src/panfrost/util/pan_ir.h @@ -294,6 +294,9 @@ struct pan_shader_info { /* Bitfield of noperspective varyings, starting at VARYING_SLOT_VAR0 */ uint32_t noperspective; + + /* Bitfield of special varyings. */ + uint32_t fixed_varyings; } varyings; /* UBOs to push to Register Mapped Uniforms (Midgard) or Fast Access