diff --git a/src/gallium/drivers/panfrost/pan_assemble.c b/src/gallium/drivers/panfrost/pan_assemble.c index 38614d2c62d..32254ebe93e 100644 --- a/src/gallium/drivers/panfrost/pan_assemble.c +++ b/src/gallium/drivers/panfrost/pan_assemble.c @@ -299,6 +299,7 @@ panfrost_shader_compile(struct panfrost_context *ctx, state->sysval_count = program->sysval_count; memcpy(state->sysval, program->sysvals, sizeof(state->sysval[0]) * state->sysval_count); + memcpy(&state->push, &program->push, sizeof(program->push)); bool vertex_id = BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_VERTEX_ID); bool instance_id = BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID); diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c index 40bfec8568d..ccdaf7ec371 100644 --- a/src/gallium/drivers/panfrost/pan_cmdstream.c +++ b/src/gallium/drivers/panfrost/pan_cmdstream.c @@ -1002,6 +1002,7 @@ panfrost_emit_const_buf(struct panfrost_batch *batch, /* Next up, attach UBOs. UBO count includes gaps but no sysval UBO */ struct panfrost_shader_state *shader = panfrost_get_shader_state(ctx, stage); unsigned ubo_count = shader->ubo_count - (sys_size ? 1 : 0); + unsigned sysval_ubo = sys_size ? ubo_count : ~0; size_t sz = MALI_UNIFORM_BUFFER_LENGTH * (ubo_count + 1); struct panfrost_ptr ubos = @@ -1042,8 +1043,27 @@ panfrost_emit_const_buf(struct panfrost_batch *batch, } } - if (ss->uniform_count) - *push_constants = transfer.gpu; + /* Copy push constants required by the shader */ + struct panfrost_ptr push_transfer = + panfrost_pool_alloc_aligned(&batch->pool, ss->push.count * 4, 16); + + uint32_t *push_cpu = (uint32_t *) push_transfer.cpu; + *push_constants = push_transfer.gpu; + + for (unsigned i = 0; i < ss->push.count; ++i) { + struct panfrost_ubo_word src = ss->push.words[i]; + + /* Map the UBO, this should be cheap. However this is reading + * from write-combine memory which is _very_ slow. It might pay + * off to upload sysvals to a staging buffer on the CPU on the + * assumption sysvals will get pushed (TODO) */ + + const void *mapped_ubo = (src.ubo == sysval_ubo) ? transfer.cpu : + panfrost_map_constant_buffer_cpu(ctx, buf, src.ubo); + + /* TODO: Is there any benefit to combining ranges */ + memcpy(push_cpu + i, (uint8_t *) mapped_ubo + src.offset, 4); + } buf->dirty_mask = 0; return ubos.gpu; diff --git a/src/gallium/drivers/panfrost/pan_context.h b/src/gallium/drivers/panfrost/pan_context.h index d1512fa579b..4e41a3e4f43 100644 --- a/src/gallium/drivers/panfrost/pan_context.h +++ b/src/gallium/drivers/panfrost/pan_context.h @@ -244,6 +244,8 @@ struct panfrost_shader_state { unsigned sysval_count; unsigned sysval[MAX_SYSVAL_COUNT]; + struct panfrost_ubo_push push; + /* Should we enable helper invocations */ bool helper_invocations;