panfrost: Move sysvals to dedicated UBO

This makes UBO 0 less special, allowing us to generalize uniform
optimization. Note this disables RMU (register-mapped uniforms) on
Midgard, as we're about to rewrite the RMU mechanism.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8973>
Alyssa Rosenzweig 2021-01-29 18:06:02 -05:00 committed by Marge Bot
parent 0dc539a872
commit db7e2dce1c
5 changed files with 19 additions and 60 deletions
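For orientation: before this change, sysvals were prepended to UBO 0 alongside the GL uniforms, so every consumer of UBO 0 had to rebase its offsets past the sysvals. After it, sysvals live in their own UBO placed one slot past the last API-visible UBO, at index num_ubos. A minimal sketch of the new convention, assuming one vec4 (16 bytes) per sysval as in the hunks below; the helper names are illustrative, not from the patch:

    #include <assert.h>

    /* Illustrative helpers, not from the patch: with sysvals moved to a
     * dedicated UBO, the sysval buffer sits one slot past the last real
     * UBO, and each sysval occupies one vec4 (16 bytes). */
    static unsigned sysval_ubo_index(unsigned num_ubos)
    {
            return num_ubos;
    }

    static unsigned sysval_ubo_size(unsigned sysval_count)
    {
            return sysval_count * 16;
    }

    int main(void)
    {
            /* A shader with two API-visible UBOs and three sysvals keeps
             * the real UBOs at indices 0-1 and reads sysvals from UBO 2. */
            assert(sysval_ubo_index(2) == 2);
            assert(sysval_ubo_size(3) == 48);
            return 0;
    }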


@@ -399,12 +399,8 @@ panfrost_shader_compile(struct panfrost_context *ctx,
         state->attribute_count = attribute_count;
         state->varying_count = varying_count;
 
-        /* Uniforms have been lowered to UBOs using nir_lower_uniforms_to_ubo()
-         * which already increments s->info.num_ubos. We do have to account for
-         * the "no uniform, no UBO" case though, otherwise sysval passed
-         * through uniforms won't work correctly.
-         */
-        state->ubo_count = MAX2(s->info.num_ubos, 1);
+        /* Sysvals have dedicated UBO */
+        state->ubo_count = s->info.num_ubos + (state->sysval_count ? 1 : 0);
 
         /* Prepare the descriptors at compile-time */
         state->shader.shader = shader;
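A worked example of the new count, as a sketch (assuming s->info.num_ubos counts only the API-visible UBOs, which is why the old code needed MAX2 to keep a slot for sysvals smuggled through UBO 0):

    #include <assert.h>

    /* Sketch of the ubo_count expression above: sysvals claim one extra
     * descriptor slot only when the shader actually uses any. */
    static unsigned pan_ubo_count(unsigned num_ubos, unsigned sysval_count)
    {
            return num_ubos + (sysval_count ? 1 : 0);
    }

    int main(void)
    {
            assert(pan_ubo_count(2, 3) == 3); /* 2 real UBOs + 1 sysval UBO */
            assert(pan_ubo_count(2, 0) == 2); /* no sysvals, no extra slot */
            return 0;
    }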


@@ -989,55 +989,39 @@ panfrost_emit_const_buf(struct panfrost_batch *batch,
                 return 0;
 
         struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
         struct panfrost_shader_state *ss = &all->variants[all->active_variant];
 
-        /* Uniforms are implicitly UBO #0 */
-        bool has_uniforms = buf->enabled_mask & (1 << 0);
-
         /* Allocate room for the sysval and the uniforms */
         size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
-        size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
-        size_t size = sys_size + uniform_size;
         struct panfrost_ptr transfer =
-                panfrost_pool_alloc_aligned(&batch->pool, size, 16);
+                panfrost_pool_alloc_aligned(&batch->pool, sys_size, 16);
 
         /* Upload sysvals requested by the shader */
         panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
 
-        /* Upload uniforms */
-        if (has_uniforms && uniform_size) {
-                const void *cpu = panfrost_map_constant_buffer_cpu(ctx, buf, 0);
-                memcpy(transfer.cpu + sys_size, cpu, uniform_size);
-        }
-
-        /* Next up, attach UBOs. UBO #0 is the uniforms we just
-         * uploaded, so it's always included. The count is the highest UBO
-         * addressable -- gaps are included. */
-        unsigned ubo_count = 32 - __builtin_clz(buf->enabled_mask | 1);
+        /* Next up, attach UBOs. UBO count includes gaps but no sysval UBO */
+        struct panfrost_shader_state *shader = panfrost_get_shader_state(ctx, stage);
+        unsigned ubo_count = shader->ubo_count - (sys_size ? 1 : 0);
 
-        size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
+        size_t sz = MALI_UNIFORM_BUFFER_LENGTH * (ubo_count + 1);
         struct panfrost_ptr ubos =
                 panfrost_pool_alloc_aligned(&batch->pool, sz,
                                             MALI_UNIFORM_BUFFER_LENGTH);
         uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
 
-        /* Upload uniforms as a UBO */
+        /* Upload sysval as a final UBO */
 
-        if (size) {
-                pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
-                        cfg.entries = DIV_ROUND_UP(size, 16);
+        if (sys_size) {
+                pan_pack(ubo_ptr + ubo_count, UNIFORM_BUFFER, cfg) {
+                        cfg.entries = DIV_ROUND_UP(sys_size, 16);
                         cfg.pointer = transfer.gpu;
                 }
-        } else {
-                *ubo_ptr = 0;
         }
 
         /* The rest are honest-to-goodness UBOs */
-        for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
+        for (unsigned ubo = 0; ubo < ubo_count; ++ubo) {
                 size_t usz = buf->cb[ubo].buffer_size;
                 bool enabled = buf->enabled_mask & (1 << ubo);
                 bool empty = usz == 0;
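The resulting descriptor table puts the real UBOs in slots [0, ubo_count) and the sysval buffer, when present, in the final slot. A sketch of that layout decision; struct fake_ubo_desc is a stand-in, since the real descriptor is encoded by pan_pack(ubo_ptr + ubo_count, UNIFORM_BUFFER, ...):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Stand-in for a packed MALI_UNIFORM_BUFFER descriptor; only the
     * layout decision above is modeled, not the hardware encoding. */
    struct fake_ubo_desc {
            uint64_t pointer; /* GPU address */
            uint32_t entries; /* number of 16-byte entries */
    };

    int main(void)
    {
            unsigned ubo_count = 2;       /* real UBOs, gaps included */
            unsigned sys_size = 3 * 16;   /* three vec4 sysvals */
            uint64_t sysval_gpu = 0x1000; /* stands in for transfer.gpu */

            /* ubo_count + 1 slots, matching sz above */
            struct fake_ubo_desc table[ubo_count + 1];
            memset(table, 0, sizeof(table));

            /* Slots 0..ubo_count-1 would be filled by the loop over
             * buf->cb; the sysval UBO lands in the final slot. */
            table[ubo_count].pointer = sysval_gpu;
            table[ubo_count].entries = (sys_size + 15) / 16; /* DIV_ROUND_UP */

            printf("sysval UBO: slot %u, %u entries\n",
                   ubo_count, (unsigned) table[ubo_count].entries);
            return 0;
    }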


@@ -489,28 +489,9 @@ bi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr)
         bool offset_is_const = nir_src_is_const(*offset);
         bi_index dyn_offset = bi_src_index(offset);
-        uint32_t const_offset = 0;
+        uint32_t const_offset = offset_is_const ? nir_src_as_uint(*offset) : 0;
         bool kernel_input = (instr->intrinsic == nir_intrinsic_load_kernel_input);
 
-        /* We may need to offset UBO loads by however many sysvals we have */
-        unsigned sysval_offset = 16 * b->shader->sysvals.sysval_count;
-
-        if (nir_src_is_const(*offset))
-                const_offset = nir_src_as_uint(*offset);
-
-        if ((kernel_input ||
-             (nir_src_is_const(instr->src[0]) &&
-              nir_src_as_uint(instr->src[0]) == 0)) &&
-            b->shader->sysvals.sysval_count) {
-                if (offset_is_const) {
-                        const_offset += sysval_offset;
-                } else {
-                        dyn_offset = bi_iadd_u32(b, dyn_offset,
-                                        bi_imm_u32(sysval_offset), false);
-                }
-        }
-
         bi_load_to(b, instr->num_components * 32,
                    bi_dest_index(&instr->dest), offset_is_const ?
                    bi_imm_u32(const_offset) : dyn_offset,
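The block deleted above is the Bifrost half of what made UBO 0 special: loads from UBO 0 (or kernel inputs) had to be rebased past the sysvals, on both the constant- and dynamic-offset paths. A sketch of the removed rule, with a hypothetical helper name:

    #include <assert.h>
    #include <stdbool.h>

    /* Sketch of the removed logic: loads from UBO 0 (or kernel inputs)
     * were shifted by 16 bytes per sysval prepended to that buffer. With
     * sysvals in their own UBO, this rebasing disappears entirely. */
    static unsigned old_ubo0_offset(unsigned offset, bool ubo0_or_kernel,
                                    unsigned sysval_count)
    {
            return offset + (ubo0_or_kernel ? 16 * sysval_count : 0);
    }

    int main(void)
    {
            /* Two sysvals used to push UBO 0's real contents down 32
             * bytes; the same load now uses its NIR offset unchanged. */
            assert(old_ubo0_offset(8, true, 2) == 40);
            assert(old_ubo0_offset(8, false, 2) == 8);
            return 0;
    }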
@@ -635,7 +616,8 @@ bi_load_sysval(bi_builder *b, nir_instr *instr,
         unsigned idx = (uniform * 16) + offset;
 
         bi_load_to(b, nr_components * 32, bi_dest_index(&nir_dest),
-                   bi_imm_u32(idx), bi_zero(), BI_SEG_UBO);
+                   bi_imm_u32(idx),
+                   bi_imm_u32(b->shader->nir->info.num_ubos), BI_SEG_UBO);
 }
 
 /* gl_FragCoord.xy = u16_to_f32(R59.xy) + 0.5
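With sysvals in their own buffer, bi_load_sysval addresses them directly: the UBO index is nir->info.num_ubos and the byte offset stays (uniform * 16) + offset. A small worked example, helper name illustrative:

    #include <assert.h>

    /* Sketch of the addressing above: sysval slot `uniform`, byte
     * `offset` within its vec4. */
    static unsigned sysval_byte_offset(unsigned uniform, unsigned offset)
    {
            return (uniform * 16) + offset;
    }

    int main(void)
    {
            assert(sysval_byte_offset(0, 0) == 0);  /* first sysval, .x */
            assert(sysval_byte_offset(2, 8) == 40); /* third sysval, .z */
            return 0;
    }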


@@ -1451,7 +1451,8 @@ emit_sysval_read(compiler_context *ctx, nir_instr *instr,
         /* Emit the read itself -- this is never indirect */
         midgard_instruction *ins =
-                emit_ubo_read(ctx, instr, dest, (uniform * 16) + offset, NULL, 0, 0);
+                emit_ubo_read(ctx, instr, dest, (uniform * 16) + offset, NULL, 0,
+                              ctx->nir->info.num_ubos);
 
         ins->mask = mask_of(nr_components);
 }
@@ -1708,7 +1709,7 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
         reg = nir_dest_index(&instr->dest);
 
         if (is_kernel) {
-                emit_ubo_read(ctx, &instr->instr, reg, (ctx->sysvals.sysval_count * 16) + offset, indirect_offset, 0, 0);
+                emit_ubo_read(ctx, &instr->instr, reg, offset, indirect_offset, 0, 0);
         } else if (is_ubo) {
                 nir_src index = instr->src[0];
@@ -1716,10 +1717,6 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
                 assert(nir_src_is_const(index));
                 uint32_t uindex = nir_src_as_uint(index);
 
-                if (uindex == 0)
-                        offset += ctx->sysvals.sysval_count * 16;
-
                 emit_ubo_read(ctx, &instr->instr, reg, offset, indirect_offset, 0, uindex);
         } else if (is_global || is_shared || is_scratch) {
                 unsigned seg = is_global ? LDST_GLOBAL : (is_shared ? LDST_SHARED : LDST_SCRATCH);
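Midgard follows the same convention: emit_ubo_read's final argument is the UBO index, so plain UBO loads pass uindex through unchanged while sysval reads (see emit_sysval_read above) target the slot one past the last real UBO. A sketch, with a hypothetical helper standing in for that argument choice:

    #include <assert.h>
    #include <stdbool.h>

    /* Hypothetical mirror of emit_ubo_read's last parameter after this
     * patch: regular UBO loads keep their NIR buffer index, and sysval
     * reads target the dedicated UBO at index num_ubos. */
    static unsigned midgard_ubo_index(bool is_sysval, unsigned uindex,
                                      unsigned num_ubos)
    {
            return is_sysval ? num_ubos : uindex;
    }

    int main(void)
    {
            assert(midgard_ubo_index(false, 1, 2) == 1); /* plain UBO load */
            assert(midgard_ubo_index(true, 0, 2) == 2);  /* sysval read */
            return 0;
    }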


@@ -1447,7 +1447,7 @@ schedule_block(compiler_context *ctx, midgard_block *block)
 void
 midgard_schedule_program(compiler_context *ctx)
 {
-        midgard_promote_uniforms(ctx);
+        // midgard_promote_uniforms(ctx);
 
         /* Must be lowered right before scheduling */
         mir_squeeze_index(ctx);