panfrost: Move sysvals to dedicated UBO

This makes UBO 0 less special, allowing us to generalize uniform
optimization. Note this disables RMU (register-mapped uniforms) on
Midgard, as we're about to rewrite the RMU mechanism.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8973>
Alyssa Rosenzweig 2021-01-29 18:06:02 -05:00 committed by Marge Bot
parent 0dc539a872
commit db7e2dce1c
5 changed files with 19 additions and 60 deletions
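For orientation: before this change, sysvals were prepended to UBO 0 alongside the GL uniforms, so every consumer of UBO 0 had to rebase its offsets past the sysvals. After it, sysvals live in their own UBO placed one slot past the last API-visible UBO, at index num_ubos. A minimal sketch of the new convention, assuming one vec4 (16 bytes) per sysval as in the hunks below; the helper names are illustrative, not from the patch:

    #include <assert.h>

    /* Illustrative helpers, not from the patch: with sysvals moved to a
     * dedicated UBO, the sysval buffer sits one slot past the last real
     * UBO, and each sysval occupies one vec4 (16 bytes). */
    static unsigned sysval_ubo_index(unsigned num_ubos)
    {
            return num_ubos;
    }

    static unsigned sysval_ubo_size(unsigned sysval_count)
    {
            return sysval_count * 16;
    }

    int main(void)
    {
            /* A shader with two API-visible UBOs and three sysvals keeps
             * the real UBOs at indices 0-1 and reads sysvals from UBO 2. */
            assert(sysval_ubo_index(2) == 2);
            assert(sysval_ubo_size(3) == 48);
            return 0;
    }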


@@ -399,12 +399,8 @@ panfrost_shader_compile(struct panfrost_context *ctx,
         state->attribute_count = attribute_count;
         state->varying_count = varying_count;
 
-        /* Uniforms have been lowered to UBOs using nir_lower_uniforms_to_ubo()
-         * which already increments s->info.num_ubos. We do have to account for
-         * the "no uniform, no UBO" case though, otherwise sysval passed
-         * through uniforms won't work correctly.
-         */
-        state->ubo_count = MAX2(s->info.num_ubos, 1);
+        /* Sysvals have dedicated UBO */
+        state->ubo_count = s->info.num_ubos + (state->sysval_count ? 1 : 0);
 
         /* Prepare the descriptors at compile-time */
         state->shader.shader = shader;
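A worked example of the new count, as a sketch (assuming s->info.num_ubos counts only the API-visible UBOs, which is why the old code needed MAX2 to keep a slot for sysvals smuggled through UBO 0):

    #include <assert.h>

    /* Sketch of the ubo_count expression above: sysvals claim one extra
     * descriptor slot only when the shader actually uses any. */
    static unsigned pan_ubo_count(unsigned num_ubos, unsigned sysval_count)
    {
            return num_ubos + (sysval_count ? 1 : 0);
    }

    int main(void)
    {
            assert(pan_ubo_count(2, 3) == 3); /* 2 real UBOs + 1 sysval UBO */
            assert(pan_ubo_count(2, 0) == 2); /* no sysvals, no extra slot */
            return 0;
    }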


@@ -989,55 +989,39 @@ panfrost_emit_const_buf(struct panfrost_batch *batch,
                 return 0;
 
         struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
         struct panfrost_shader_state *ss = &all->variants[all->active_variant];
 
-        /* Uniforms are implicitly UBO #0 */
-        bool has_uniforms = buf->enabled_mask & (1 << 0);
-
         /* Allocate room for the sysval and the uniforms */
         size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
-        size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
-        size_t size = sys_size + uniform_size;
         struct panfrost_ptr transfer =
-                panfrost_pool_alloc_aligned(&batch->pool, size, 16);
+                panfrost_pool_alloc_aligned(&batch->pool, sys_size, 16);
 
         /* Upload sysvals requested by the shader */
         panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
 
-        /* Upload uniforms */
-        if (has_uniforms && uniform_size) {
-                const void *cpu = panfrost_map_constant_buffer_cpu(ctx, buf, 0);
-                memcpy(transfer.cpu + sys_size, cpu, uniform_size);
-        }
-
-        /* Next up, attach UBOs. UBO #0 is the uniforms we just
-         * uploaded, so it's always included. The count is the highest UBO
-         * addressable -- gaps are included. */
-        unsigned ubo_count = 32 - __builtin_clz(buf->enabled_mask | 1);
+        /* Next up, attach UBOs. UBO count includes gaps but no sysval UBO */
+        struct panfrost_shader_state *shader = panfrost_get_shader_state(ctx, stage);
+        unsigned ubo_count = shader->ubo_count - (sys_size ? 1 : 0);
 
-        size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
+        size_t sz = MALI_UNIFORM_BUFFER_LENGTH * (ubo_count + 1);
         struct panfrost_ptr ubos =
                 panfrost_pool_alloc_aligned(&batch->pool, sz,
                                             MALI_UNIFORM_BUFFER_LENGTH);
         uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
 
-        /* Upload uniforms as a UBO */
+        /* Upload sysval as a final UBO */
 
-        if (size) {
-                pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
-                        cfg.entries = DIV_ROUND_UP(size, 16);
+        if (sys_size) {
+                pan_pack(ubo_ptr + ubo_count, UNIFORM_BUFFER, cfg) {
+                        cfg.entries = DIV_ROUND_UP(sys_size, 16);
                         cfg.pointer = transfer.gpu;
                 }
-        } else {
-                *ubo_ptr = 0;
         }
 
         /* The rest are honest-to-goodness UBOs */
-        for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
+        for (unsigned ubo = 0; ubo < ubo_count; ++ubo) {
                 size_t usz = buf->cb[ubo].buffer_size;
                 bool enabled = buf->enabled_mask & (1 << ubo);
                 bool empty = usz == 0;
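The resulting descriptor table puts the real UBOs in slots [0, ubo_count) and the sysval buffer, when present, in the final slot. A sketch of that layout decision; struct fake_ubo_desc is a stand-in, since the real descriptor is encoded by pan_pack(ubo_ptr + ubo_count, UNIFORM_BUFFER, ...):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Stand-in for a packed MALI_UNIFORM_BUFFER descriptor; only the
     * layout decision above is modeled, not the hardware encoding. */
    struct fake_ubo_desc {
            uint64_t pointer; /* GPU address */
            uint32_t entries; /* number of 16-byte entries */
    };

    int main(void)
    {
            unsigned ubo_count = 2;       /* real UBOs, gaps included */
            unsigned sys_size = 3 * 16;   /* three vec4 sysvals */
            uint64_t sysval_gpu = 0x1000; /* stands in for transfer.gpu */

            /* ubo_count + 1 slots, matching sz above */
            struct fake_ubo_desc table[ubo_count + 1];
            memset(table, 0, sizeof(table));

            /* Slots 0..ubo_count-1 would be filled by the loop over
             * buf->cb; the sysval UBO lands in the final slot. */
            table[ubo_count].pointer = sysval_gpu;
            table[ubo_count].entries = (sys_size + 15) / 16; /* DIV_ROUND_UP */

            printf("sysval UBO: slot %u, %u entries\n",
                   ubo_count, (unsigned) table[ubo_count].entries);
            return 0;
    }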


@@ -489,28 +489,9 @@ bi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr)
         bool offset_is_const = nir_src_is_const(*offset);
         bi_index dyn_offset = bi_src_index(offset);
-        uint32_t const_offset = 0;
+        uint32_t const_offset = offset_is_const ? nir_src_as_uint(*offset) : 0;
         bool kernel_input = (instr->intrinsic == nir_intrinsic_load_kernel_input);
 
-        /* We may need to offset UBO loads by however many sysvals we have */
-        unsigned sysval_offset = 16 * b->shader->sysvals.sysval_count;
-
-        if (nir_src_is_const(*offset))
-                const_offset = nir_src_as_uint(*offset);
-
-        if ((kernel_input ||
-             (nir_src_is_const(instr->src[0]) &&
-              nir_src_as_uint(instr->src[0]) == 0)) &&
-            b->shader->sysvals.sysval_count) {
-                if (offset_is_const) {
-                        const_offset += sysval_offset;
-                } else {
-                        dyn_offset = bi_iadd_u32(b, dyn_offset,
-                                        bi_imm_u32(sysval_offset), false);
-                }
-        }
-
         bi_load_to(b, instr->num_components * 32,
                    bi_dest_index(&instr->dest), offset_is_const ?
                    bi_imm_u32(const_offset) : dyn_offset,
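The block deleted above is the Bifrost half of what made UBO 0 special: loads from UBO 0 (or kernel inputs) had to be rebased past the sysvals, on both the constant- and dynamic-offset paths. A sketch of the removed rule, with a hypothetical helper name:

    #include <assert.h>
    #include <stdbool.h>

    /* Sketch of the removed logic: loads from UBO 0 (or kernel inputs)
     * were shifted by 16 bytes per sysval prepended to that buffer. With
     * sysvals in their own UBO, this rebasing disappears entirely. */
    static unsigned old_ubo0_offset(unsigned offset, bool ubo0_or_kernel,
                                    unsigned sysval_count)
    {
            return offset + (ubo0_or_kernel ? 16 * sysval_count : 0);
    }

    int main(void)
    {
            /* Two sysvals used to push UBO 0's real contents down 32
             * bytes; the same load now uses its NIR offset unchanged. */
            assert(old_ubo0_offset(8, true, 2) == 40);
            assert(old_ubo0_offset(8, false, 2) == 8);
            return 0;
    }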
@@ -635,7 +616,8 @@ bi_load_sysval(bi_builder *b, nir_instr *instr,
         unsigned idx = (uniform * 16) + offset;
 
         bi_load_to(b, nr_components * 32, bi_dest_index(&nir_dest),
-                   bi_imm_u32(idx), bi_zero(), BI_SEG_UBO);
+                   bi_imm_u32(idx),
+                   bi_imm_u32(b->shader->nir->info.num_ubos), BI_SEG_UBO);
 }
 
 /* gl_FragCoord.xy = u16_to_f32(R59.xy) + 0.5
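With sysvals in their own buffer, bi_load_sysval addresses them directly: the UBO index is nir->info.num_ubos and the byte offset stays (uniform * 16) + offset. A small worked example, helper name illustrative:

    #include <assert.h>

    /* Sketch of the addressing above: sysval slot `uniform`, byte
     * `offset` within its vec4. */
    static unsigned sysval_byte_offset(unsigned uniform, unsigned offset)
    {
            return (uniform * 16) + offset;
    }

    int main(void)
    {
            assert(sysval_byte_offset(0, 0) == 0);  /* first sysval, .x */
            assert(sysval_byte_offset(2, 8) == 40); /* third sysval, .z */
            return 0;
    }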


@@ -1451,7 +1451,8 @@ emit_sysval_read(compiler_context *ctx, nir_instr *instr,
         /* Emit the read itself -- this is never indirect */
         midgard_instruction *ins =
-                emit_ubo_read(ctx, instr, dest, (uniform * 16) + offset, NULL, 0, 0);
+                emit_ubo_read(ctx, instr, dest, (uniform * 16) + offset, NULL, 0,
+                              ctx->nir->info.num_ubos);
 
         ins->mask = mask_of(nr_components);
 }
@@ -1708,7 +1709,7 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
         reg = nir_dest_index(&instr->dest);
 
         if (is_kernel) {
-                emit_ubo_read(ctx, &instr->instr, reg, (ctx->sysvals.sysval_count * 16) + offset, indirect_offset, 0, 0);
+                emit_ubo_read(ctx, &instr->instr, reg, offset, indirect_offset, 0, 0);
         } else if (is_ubo) {
                 nir_src index = instr->src[0];
@@ -1716,10 +1717,6 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
                 assert(nir_src_is_const(index));
                 uint32_t uindex = nir_src_as_uint(index);
 
-                if (uindex == 0)
-                        offset += ctx->sysvals.sysval_count * 16;
-
                 emit_ubo_read(ctx, &instr->instr, reg, offset, indirect_offset, 0, uindex);
         } else if (is_global || is_shared || is_scratch) {
                 unsigned seg = is_global ? LDST_GLOBAL : (is_shared ? LDST_SHARED : LDST_SCRATCH);
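Midgard follows the same convention: emit_ubo_read's final argument is the UBO index, so plain UBO loads pass uindex through unchanged while sysval reads (see emit_sysval_read above) target the slot one past the last real UBO. A sketch, with a hypothetical helper standing in for that argument choice:

    #include <assert.h>
    #include <stdbool.h>

    /* Hypothetical mirror of emit_ubo_read's last parameter after this
     * patch: regular UBO loads keep their NIR buffer index, and sysval
     * reads target the dedicated UBO at index num_ubos. */
    static unsigned midgard_ubo_index(bool is_sysval, unsigned uindex,
                                      unsigned num_ubos)
    {
            return is_sysval ? num_ubos : uindex;
    }

    int main(void)
    {
            assert(midgard_ubo_index(false, 1, 2) == 1); /* plain UBO load */
            assert(midgard_ubo_index(true, 0, 2) == 2);  /* sysval read */
            return 0;
    }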


@@ -1447,7 +1447,7 @@ schedule_block(compiler_context *ctx, midgard_block *block)
 void
 midgard_schedule_program(compiler_context *ctx)
 {
-        midgard_promote_uniforms(ctx);
+        // midgard_promote_uniforms(ctx);
 
         /* Must be lowered right before scheduling */
         mir_squeeze_index(ctx);