panfrost: Push uniforms required by the program

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8973>
2026-01-05 11:10:10 +01:00 · 2021-02-07 10:09:21 -05:00 · 2021-02-07 10:09:21 -05:00 · ce8188ccf2
commit ce8188ccf2
parent d4dccea0ba
3 changed files with 25 additions and 2 deletions
--- a/src/gallium/drivers/panfrost/pan_assemble.c
+++ b/src/gallium/drivers/panfrost/pan_assemble.c
@@ -299,6 +299,7 @@ panfrost_shader_compile(struct panfrost_context *ctx,

        state->sysval_count = program->sysval_count;
        memcpy(state->sysval, program->sysvals, sizeof(state->sysval[0]) * state->sysval_count);
+        memcpy(&state->push, &program->push, sizeof(program->push));

        bool vertex_id = BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_VERTEX_ID);
        bool instance_id = BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
--- a/src/gallium/drivers/panfrost/pan_cmdstream.c
+++ b/src/gallium/drivers/panfrost/pan_cmdstream.c
@@ -1002,6 +1002,7 @@ panfrost_emit_const_buf(struct panfrost_batch *batch,
        /* Next up, attach UBOs. UBO count includes gaps but no sysval UBO */
        struct panfrost_shader_state *shader = panfrost_get_shader_state(ctx, stage);
        unsigned ubo_count = shader->ubo_count - (sys_size ? 1 : 0);
+        unsigned sysval_ubo = sys_size ? ubo_count : ~0;

        size_t sz = MALI_UNIFORM_BUFFER_LENGTH * (ubo_count + 1);
        struct panfrost_ptr ubos =
@@ -1042,8 +1043,27 @@ panfrost_emit_const_buf(struct panfrost_batch *batch,
                }
        }

-        if (ss->uniform_count)
-                *push_constants = transfer.gpu;
+        /* Copy push constants required by the shader */
+        struct panfrost_ptr push_transfer =
+                panfrost_pool_alloc_aligned(&batch->pool, ss->push.count * 4, 16);
+
+        uint32_t *push_cpu = (uint32_t *) push_transfer.cpu;
+        *push_constants = push_transfer.gpu;
+
+        for (unsigned i = 0; i < ss->push.count; ++i) {
+                struct panfrost_ubo_word src = ss->push.words[i];
+
+                /* Map the UBO; this should be cheap. However, this reads
+                 * from write-combine memory, which is _very_ slow. It might
+                 * pay off to upload sysvals to a staging buffer on the CPU, on
+                 * the assumption that sysvals will get pushed (TODO) */
+
+                const void *mapped_ubo = (src.ubo == sysval_ubo) ? transfer.cpu :
+                        panfrost_map_constant_buffer_cpu(ctx, buf, src.ubo);
+
+                /* TODO: Is there any benefit to combining ranges? */
+                memcpy(push_cpu + i, (uint8_t *) mapped_ubo + src.offset, 4);
+        }

        buf->dirty_mask = 0;
        return ubos.gpu;
--- a/src/gallium/drivers/panfrost/pan_context.h
+++ b/src/gallium/drivers/panfrost/pan_context.h
@@ -244,6 +244,8 @@ struct panfrost_shader_state {
        unsigned sysval_count;
        unsigned sysval[MAX_SYSVAL_COUNT];

+        struct panfrost_ubo_push push;
+
        /* Should we enable helper invocations */
        bool helper_invocations;