panfrost: pass blend constants to blend shaders dynamically

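This is similar to the approach in panvk, where we pass blend constants
to the blend shader in fixed FAU slots instead of specializing the
shader on blend constants. In the gallium driver, the blend constants
are reserved as the first sysval of every fragment shader, so the blend
shader can load them from offset 0 of the sysval UBO. The Bifrost and
Midgard compilers are both taught to skip that location when choosing
UBO ranges to push in fragment-stage shaders.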

This eliminates the blend shader variant cache, which performed very
badly when the working set of blend constants in an application was >32
(the maximum number of variants stored). Just increasing the cache size
like we did in f1f39fa645 ("panfrost: Increase the limit for blend
shader variants") would help for applications with a larger static set,
but we would still have cache thrashing on applications which change the
blend constants dynamically.

For gfxbench gl_driver, which uses 386 blend constant values, this
improves FPS on a G610 from 6.06 to 40.48. Most applications are
unaffected, because they don't use enough constant values to cause
thrashing.

Signed-off-by: Olivia Lee <olivia.lee@collabora.com>
Reviewed-by: Mary Guillemard <mary.guillemard@collabora.com>
Acked-by: Ryan Mckeever <ryan.mckeever@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34666>
This commit is contained in:
Olivia Lee 2025-04-21 21:27:34 -07:00 committed by Marge Bot
parent def9af0255
commit 781d3162e4
12 changed files with 77 additions and 67 deletions

View file

@ -10,6 +10,7 @@
#include "compiler/nir/nir_builder.h"
#include "pan_shader.h"
#include "panfrost/util/pan_lower_framebuffer.h"
#include "pan_context.h"
#ifndef PAN_ARCH
@ -34,26 +35,24 @@ pan_blend_shader_cache_cleanup(struct pan_blend_shader_cache *cache)
#else /* PAN_ARCH */
static bool
pan_inline_blend_constants(nir_builder *b, nir_intrinsic_instr *intr,
pan_lower_blend_constants(nir_builder *b, nir_intrinsic_instr *intr,
void *data)
{
if (intr->intrinsic != nir_intrinsic_load_blend_const_color_rgba)
return false;
float *floats = data;
const nir_const_value constants[4] = {
nir_const_value_for_float(floats[0], 32),
nir_const_value_for_float(floats[1], 32),
nir_const_value_for_float(floats[2], 32),
nir_const_value_for_float(floats[3], 32)};
/* panfrost_nir_lower_sysvals always maps blend constants to slot 0 */
unsigned offset = 0;
b->cursor = nir_after_instr(&intr->instr);
nir_def *constant = nir_build_imm(b, 4, 32, constants);
b->cursor = nir_before_instr(&intr->instr);
nir_def *constant = nir_load_ubo(
b, 4, 32, nir_imm_int(b, PAN_UBO_SYSVALS), nir_imm_int(b, offset),
.align_mul = 4, .align_offset = 0, .range_base = 0, .range = 4);
nir_def_replace(&intr->def, constant);
return true;
}
struct pan_blend_shader_variant *
struct pan_blend_shader *
GENX(pan_blend_get_shader_locked)(struct pan_blend_shader_cache *cache,
const struct pan_blend_state *state,
nir_alu_type src0_type,
@ -64,7 +63,6 @@ GENX(pan_blend_get_shader_locked)(struct pan_blend_shader_cache *cache,
.src0_type = src0_type,
.src1_type = src1_type,
.rt = rt,
.has_constants = pan_blend_constant_mask(state->rts[rt].equation) != 0,
.logicop_enable = state->logicop_enable,
.logicop_func = state->logicop_func,
.nr_samples = state->rts[rt].nr_samples,
@ -79,51 +77,26 @@ GENX(pan_blend_get_shader_locked)(struct pan_blend_shader_cache *cache,
struct hash_entry *he =
_mesa_hash_table_search(cache->shaders, &key);
struct pan_blend_shader *shader = he ? he->data : NULL;
if (shader)
return shader;
if (!shader) {
shader = rzalloc(cache->shaders, struct pan_blend_shader);
shader->key = key;
list_inithead(&shader->variants);
_mesa_hash_table_insert(cache->shaders, &shader->key, shader);
}
list_for_each_entry(struct pan_blend_shader_variant, iter, &shader->variants,
node) {
if (!key.has_constants ||
!memcmp(iter->constants, state->constants, sizeof(iter->constants))) {
return iter;
}
}
struct pan_blend_shader_variant *variant = NULL;
if (shader->nvariants < PAN_BLEND_SHADER_MAX_VARIANTS) {
variant = rzalloc(shader, struct pan_blend_shader_variant);
util_dynarray_init(&variant->binary, variant);
list_add(&variant->node, &shader->variants);
shader->nvariants++;
} else {
variant = list_last_entry(&shader->variants,
struct pan_blend_shader_variant, node);
list_del(&variant->node);
list_add(&variant->node, &shader->variants);
util_dynarray_clear(&variant->binary);
}
memcpy(variant->constants, state->constants, sizeof(variant->constants));
shader = rzalloc(cache->shaders, struct pan_blend_shader);
shader->key = key;
_mesa_hash_table_insert(cache->shaders, &shader->key, shader);
nir_shader *nir =
GENX(pan_blend_create_shader)(state, src0_type, src1_type, rt);
nir_shader_intrinsics_pass(nir, pan_inline_blend_constants,
nir_metadata_control_flow,
(void *)state->constants);
nir->info.num_ubos = PAN_UBO_SYSVALS + 1;
nir_shader_intrinsics_pass(nir, pan_lower_blend_constants,
nir_metadata_control_flow, NULL);
/* Compile the NIR shader */
struct panfrost_compile_inputs inputs = {
.gpu_id = cache->gpu_id,
.is_blend = true,
.blend.nr_samples = key.nr_samples,
.pushable_ubos = BITFIELD_BIT(PAN_UBO_SYSVALS),
};
enum pipe_format rt_formats[8] = {0};
@ -145,17 +118,17 @@ GENX(pan_blend_get_shader_locked)(struct pan_blend_shader_cache *cache,
cache->gpu_id < 0x700);
#endif
GENX(pan_shader_compile)(nir, &inputs, &variant->binary, &info);
GENX(pan_shader_compile)(nir, &inputs, &shader->binary, &info);
variant->work_reg_count = info.work_reg_count;
shader->work_reg_count = info.work_reg_count;
#if PAN_ARCH <= 5
variant->first_tag = info.midgard.first_tag;
shader->first_tag = info.midgard.first_tag;
#endif
ralloc_free(nir);
return variant;
return shader;
}
#endif /* PAN_ARCH */

View file

@ -65,22 +65,14 @@ struct pan_blend_shader_cache {
pthread_mutex_t lock;
};
struct pan_blend_shader_variant {
struct list_head node;
float constants[4];
struct pan_blend_shader {
struct pan_blend_shader_key key;
struct util_dynarray binary;
unsigned first_tag;
unsigned work_reg_count;
};
#define PAN_BLEND_SHADER_MAX_VARIANTS 32
struct pan_blend_shader {
struct pan_blend_shader_key key;
unsigned nvariants;
struct list_head variants;
};
uint64_t panfrost_get_blend(struct panfrost_batch *batch, unsigned rt,
struct panfrost_bo **bo, unsigned *shader_offset);
@ -94,7 +86,7 @@ void pan_blend_shader_cache_cleanup(struct pan_blend_shader_cache *cache);
/* Take blend_shaders.lock before calling this function and release it when
* you're done with the returned shader object.
*/
struct pan_blend_shader_variant *GENX(pan_blend_get_shader_locked)(
struct pan_blend_shader *GENX(pan_blend_get_shader_locked)(
struct pan_blend_shader_cache *cache, const struct pan_blend_state *state,
nir_alu_type src0_type, nir_alu_type src1_type, unsigned rt);

View file

@ -1254,6 +1254,15 @@ panfrost_upload_multisampled_sysval(struct panfrost_batch *batch,
uniform->u[0] = (samples > 1) ? ~0 : 0;
}
/* Fill a sysval uniform with the four components of the context's current
 * blend color (ctx->blend_color.color), written as floats. Called from
 * panfrost_upload_sysvals for PAN_SYSVAL_BLEND_CONSTANTS. */
static void
panfrost_upload_blend_constants_sysval(struct panfrost_batch *batch,
struct sysval_uniform *uniform)
{
struct panfrost_context *ctx = batch->ctx;
for (unsigned i = 0; i < 4; i++)
uniform->f[i] = ctx->blend_color.color[i];
}
#if PAN_ARCH >= 6
static void
panfrost_upload_rt_conversion_sysval(struct panfrost_batch *batch,
@ -1367,6 +1376,9 @@ panfrost_upload_sysvals(struct panfrost_batch *batch, void *ptr_cpu,
case PAN_SYSVAL_MULTISAMPLED:
panfrost_upload_multisampled_sysval(batch, &uniforms[i]);
break;
case PAN_SYSVAL_BLEND_CONSTANTS:
panfrost_upload_blend_constants_sysval(batch, &uniforms[i]);
break;
#if PAN_ARCH >= 6
case PAN_SYSVAL_RT_CONVERSION:
panfrost_upload_rt_conversion_sysval(batch, PAN_SYSVAL_ID(sysval),

View file

@ -235,7 +235,7 @@ panfrost_get_blend(struct panfrost_batch *batch, unsigned rti,
}
pthread_mutex_lock(&dev->blend_shaders.lock);
struct pan_blend_shader_variant *shader =
struct pan_blend_shader *shader =
pan_screen(ctx->base.screen)
->vtbl.get_blend_shader(&dev->blend_shaders, &pan_blend, col0_type,
col1_type, rti);

View file

@ -367,7 +367,7 @@ pan_preload_get_blend_shaders(struct pan_fb_preload_cache *cache,
};
pthread_mutex_lock(&cache->blend_shader_cache->lock);
struct pan_blend_shader_variant *b = GENX(pan_blend_get_shader_locked)(
struct pan_blend_shader *b = GENX(pan_blend_get_shader_locked)(
cache->blend_shader_cache, &blend_state,
preload_shader->blend_types[i], nir_type_float32, /* unused */
i);

View file

@ -70,11 +70,16 @@ panfrost_analyze_sysvals(struct panfrost_compiled_shader *ss)
dirty |= PAN_DIRTY_DRAWID;
break;
case PAN_SYSVAL_BLEND_CONSTANTS:
dirty |= PAN_DIRTY_BLEND;
break;
case PAN_SYSVAL_SAMPLE_POSITIONS:
case PAN_SYSVAL_MULTISAMPLED:
case PAN_SYSVAL_RT_CONVERSION:
/* Nothing beyond the batch itself */
break;
default:
unreachable("Invalid sysval");
}

View file

@ -118,6 +118,9 @@ sysval_for_intrinsic(unsigned arch, nir_intrinsic_instr *intr, unsigned *offset)
case nir_intrinsic_load_printf_buffer_address:
return PAN_SYSVAL_PRINTF_BUFFER;
case nir_intrinsic_load_blend_const_color_rgba:
return PAN_SYSVAL_BLEND_CONSTANTS;
case nir_intrinsic_load_rt_conversion_pan: {
unsigned size = nir_alu_type_get_type_size(nir_intrinsic_src_type(intr));
unsigned rt = nir_intrinsic_base(intr);
@ -141,6 +144,10 @@ sysval_for_intrinsic(unsigned arch, nir_intrinsic_instr *intr, unsigned *offset)
static bool
uses_sysvals(unsigned arch, nir_shader *shader)
{
/* Fragment shaders always use the blend constant sysval */
if (shader->info.stage == MESA_SHADER_FRAGMENT)
return true;
nir_foreach_function_impl(impl, shader) {
nir_foreach_block(block, impl) {
nir_foreach_instr(instr, block) {
@ -259,6 +266,11 @@ panfrost_nir_lower_sysvals(nir_shader *shader, unsigned arch,
* sysval at UBO1 */
shader->info.num_ubos = MAX2(PAN_UBO_SYSVALS, shader->info.num_ubos) + 1;
/* Reserve the first slot for blend constants, so that they can be accessed
* from a fixed offset in the blend shader */
if (shader->info.stage == MESA_SHADER_FRAGMENT)
lookup_sysval(ctx.sysval_to_id, ctx.sysvals, PAN_SYSVAL_BLEND_CONSTANTS);
nir_shader_instructions_pass(
shader, lower, nir_metadata_control_flow, &ctx);

View file

@ -82,7 +82,7 @@ struct panfrost_vtable {
int (*submit_batch)(struct panfrost_batch *batch, struct pan_fb_info *fb);
/* Get blend shader */
struct pan_blend_shader_variant *(*get_blend_shader)(
struct pan_blend_shader *(*get_blend_shader)(
struct pan_blend_shader_cache *cache, const struct pan_blend_state *,
nir_alu_type, nir_alu_type, unsigned rt);

View file

@ -84,6 +84,15 @@ bi_analyze_ranges(bi_context *ctx)
assert(ubo < res.nr_blocks);
assert(channels > 0 && channels <= 4);
/* Blend constants are always loaded from the sysval UBO in blend shaders,
* do not push them. */
if (ctx->stage == MESA_SHADER_FRAGMENT) {
/* PAN_UBO_SYSVALS from the gallium driver */
unsigned sysval_ubo = 1;
if(ubo == sysval_ubo && word == 0)
continue;
}
if (word >= MAX_UBO_WORDS)
continue;
@ -136,6 +145,7 @@ void
bi_opt_push_ubo(bi_context *ctx)
{
struct bi_ubo_analysis analysis = bi_analyze_ranges(ctx);
bi_pick_ubo(ctx->info.push, &analysis);
ctx->ubo_mask = 0;

View file

@ -70,7 +70,6 @@ struct pan_blend_shader_key {
enum pipe_format format;
nir_alu_type src0_type, src1_type;
uint32_t rt : 3;
uint32_t has_constants : 1;
uint32_t logicop_enable : 1;
uint32_t logicop_func : 4;
uint32_t nr_samples : 5;

View file

@ -88,6 +88,15 @@ mir_analyze_ranges(compiler_context *ctx)
assert(ubo < res.nr_blocks);
/* Blend constants are always loaded from the sysval UBO in blend shaders,
* do not push them. */
if (ctx->stage == MESA_SHADER_FRAGMENT) {
/* PAN_UBO_SYSVALS from the gallium driver */
unsigned sysval_ubo = 1;
if(ubo == sysval_ubo && offset == 0)
continue;
}
if (offset < MAX_UBO_QWORDS)
BITSET_SET(res.blocks[ubo].uses, offset);
}

View file

@ -60,8 +60,6 @@ get_blend_shader(struct panvk_device *dev,
.src0_type = src0_type,
.src1_type = src1_type,
.rt = rt,
.has_constants =
pan_blend_constant_mask(state->rts[rt].equation) != 0,
.logicop_enable = state->logicop_enable,
.logicop_func = state->logicop_func,
.nr_samples = state->rts[rt].nr_samples,