mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-05 07:28:11 +02:00
agx: promote constants to uniforms
Add an optimization pass to promote constants loaded in the shader to dedicated uniform registers preloaded before the shader. This is beneficial for two reasons: * fewer mov_imm instructions * less GPR pressure (uniforms have dedicated registers) The latter can significantly improve occupancy since we don't remat constants for occupancy. We do remat to avoid spilling so it won't affect spilling, although it can still be a win by reducing remat when a shader would otherwise spill. The problem is that we have limited uniform registers so can't promote everything that we would want to. We model this as a 0-1 knapsack problem and use the well-known heuristic to prioritize frequently used constants. This is not optimal but works quite well in practice. This gives a nice fps win in some complex shaders, including: * Dolphin ubers from 10.25fps to 10.85fps at 4K in MMG. * "Wall and chimney" shadertoy from 24.8fps to 29.5fps at 1188x658. shader-db results are excellent as well. total instructions in shared programs: 2088290 -> 2039709 (-2.33%) instructions in affected programs: 1478061 -> 1429480 (-3.29%) helped: 8246 HURT: 85 Instructions are helped. total bytes in shared programs: 14321004 -> 14111800 (-1.46%) bytes in affected programs: 10108742 -> 9899538 (-2.07%) helped: 7999 HURT: 1416 Bytes are helped. total regs in shared programs: 602415 -> 590371 (-2.00%) regs in affected programs: 92177 -> 80133 (-13.07%) helped: 1887 HURT: 209 Regs are helped. total uniforms in shared programs: 1457531 -> 1533232 (5.19%) uniforms in affected programs: 835522 -> 911223 (9.06%) helped: 0 HURT: 11042 Uniforms are HURT. total threads in shared programs: 20325824 -> 20329216 (0.02%) threads in affected programs: 29632 -> 33024 (11.45%) helped: 41 HURT: 0 Threads are helped. Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28483>
This commit is contained in:
parent
2a97657792
commit
61b74894a9
7 changed files with 222 additions and 2 deletions
|
|
@ -37,6 +37,7 @@ static const struct debug_named_value agx_debug_options[] = {
|
|||
{"demand", AGX_DBG_DEMAND, "Bound tightly to register demand"},
|
||||
{"nosched", AGX_DBG_NOSCHED, "Do not schedule the shader"},
|
||||
{"spill", AGX_DBG_SPILL, "Spill (almost) everything"},
|
||||
{"nopromote", AGX_DBG_NOPROMOTE, "Do not promote constants to uniforms"},
|
||||
DEBUG_NAMED_VALUE_END
|
||||
};
|
||||
/* clang-format on */
|
||||
|
|
@ -2821,6 +2822,11 @@ agx_compile_function_nir(nir_shader *nir, nir_function_impl *impl,
|
|||
|
||||
/* After DCE, use counts are right so we can run the optimizer. */
|
||||
agx_optimizer(ctx);
|
||||
|
||||
/* After inlining constants, promote what's left */
|
||||
if (key->promote_constants && !(agx_compiler_debug & AGX_DBG_NOPROMOTE)) {
|
||||
agx_opt_promote_constants(ctx);
|
||||
}
|
||||
}
|
||||
|
||||
/* For correctness, lower uniform sources after copyprop (for correctness,
|
||||
|
|
|
|||
|
|
@ -166,6 +166,11 @@ struct agx_shader_info {
|
|||
|
||||
/* Output mask set during driver lowering */
|
||||
uint64_t outputs;
|
||||
|
||||
/* Immediate data that must be uploaded and mapped as uniform registers */
|
||||
unsigned immediate_base_uniform;
|
||||
unsigned immediate_size_16;
|
||||
uint16_t immediates[512];
|
||||
};
|
||||
|
||||
#define AGX_MAX_RTS (8)
|
||||
|
|
@ -233,6 +238,11 @@ struct agx_shader_key {
|
|||
*/
|
||||
bool is_helper;
|
||||
|
||||
/* Whether the driver supports uploading constants for this shader. If
|
||||
* false, constants will not be promoted to uniforms.
|
||||
*/
|
||||
bool promote_constants;
|
||||
|
||||
union {
|
||||
struct agx_vs_shader_key vs;
|
||||
struct agx_fs_shader_key fs;
|
||||
|
|
|
|||
|
|
@ -939,6 +939,7 @@ void agx_lower_pseudo(agx_context *ctx);
|
|||
void agx_lower_spill(agx_context *ctx);
|
||||
void agx_lower_uniform_sources(agx_context *ctx);
|
||||
void agx_opt_cse(agx_context *ctx);
|
||||
void agx_opt_promote_constants(agx_context *ctx);
|
||||
void agx_dce(agx_context *ctx, bool partial);
|
||||
void agx_pressure_schedule(agx_context *ctx);
|
||||
void agx_spill(agx_context *ctx, unsigned k);
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@ enum agx_compiler_dbg {
|
|||
AGX_DBG_DEMAND = BITFIELD_BIT(9),
|
||||
AGX_DBG_NOSCHED = BITFIELD_BIT(10),
|
||||
AGX_DBG_SPILL = BITFIELD_BIT(11),
|
||||
AGX_DBG_NOPROMOTE = BITFIELD_BIT(12),
|
||||
};
|
||||
/* clang-format on */
|
||||
|
||||
|
|
|
|||
184
src/asahi/compiler/agx_opt_promote_constants.c
Normal file
184
src/asahi/compiler/agx_opt_promote_constants.c
Normal file
|
|
@ -0,0 +1,184 @@
|
|||
/*
|
||||
* Copyright 2023 Valve Corporation
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include "util/bitset.h"
|
||||
#include "util/hash_table.h"
|
||||
#include "util/ralloc.h"
|
||||
#include "agx_compiler.h"
|
||||
#include "agx_opcodes.h"
|
||||
|
||||
/*
 * Information about a constant, indexed by its 64-bit value. This describes the
 * value, not the move that generated it. If there are multiple moves in the
 * shader with the same immediate value, they resolve to the same constant.
 */
struct constant_info {
   /* The immediate value itself, as produced by mov_imm */
   uint64_t value;

   /* Number of uses of the constant that could be promoted */
   unsigned nr_promotable_uses;

   /* If we push, the uniform used */
   uint16_t uniform;

   /* Alignment in 16-bit units needed for the constant */
   uint8_t align_16;

   /* True if the constant was promoted to a uniform */
   bool promoted;
};
|
||||
|
||||
/*
|
||||
* Choosing constants to promote is similar to the 0-1 knapsack problem. We use
|
||||
* a well-known heuristic: sort by benefit divided by size. We approximate
|
||||
* benefit by use count.
|
||||
*/
|
||||
static int
|
||||
constant_priority(const struct constant_info *const info)
|
||||
{
|
||||
int size = info->align_16;
|
||||
assert(size == 1 || size == 2 || size == 4);
|
||||
int inverse_size = (size == 1) ? 4 : (size == 2) ? 2 : 1;
|
||||
|
||||
return info->nr_promotable_uses * inverse_size;
|
||||
}
|
||||
|
||||
/*
 * qsort comparator sorting constants by descending priority. We compare the
 * two priorities three-way instead of subtracting them: a subtraction-based
 * comparator can overflow int for extreme inputs (undefined behaviour),
 * whereas (b > a) - (b < a) always yields -1/0/1 with the same ordering.
 */
static int
priority_compare(const void *A_, const void *B_)
{
   const struct constant_info *const *A = A_;
   const struct constant_info *const *B = B_;

   /* B before A: we want descending order and qsort sorts ascending */
   int a = constant_priority(*A);
   int b = constant_priority(*B);

   return (b > a) - (b < a);
}
|
||||
|
||||
/*
 * Record one promotable use of the immediate `imm`. On the first use, a
 * zero-initialized constant_info is allocated on memctx and inserted keyed by
 * the 64-bit value, so all moves of the same immediate share one entry. The
 * required alignment is the maximum over all uses' source sizes, in 16-bit
 * units.
 */
static void
record_use(void *memctx, struct hash_table_u64 *constants, uint64_t imm,
           enum agx_size size)
{
   struct constant_info *info = _mesa_hash_table_u64_search(constants, imm);

   if (!info) {
      info = rzalloc(memctx, struct constant_info);
      info->value = imm;
      _mesa_hash_table_u64_insert(constants, imm, info);
   }

   info->nr_promotable_uses++;
   info->align_16 = MAX2(info->align_16, agx_size_align_16(size));
}
|
||||
|
||||
/*
 * Gather all promotable immediates in the shader, pick the best subset that
 * fits in uniform registers (knapsack heuristic, see constant_priority), and
 * rewrite promoted uses to uniform sources. All scratch allocations go on
 * memctx, freed by the caller.
 */
static void
pass(agx_context *ctx, void *memctx)
{
   /* Map from 64-bit immediate values to struct constant_info. Keyed by
    * value, not SSA index: record_use inserts with the immediate as key, so
    * multiple mov_imm of the same value share one entry.
    */
   struct hash_table_u64 *constants = _mesa_hash_table_u64_create(memctx);

   /* Map from SSA indices to immediate values */
   uint64_t *values = rzalloc_array(memctx, uint64_t, ctx->alloc);

   /* Set of SSA indices that map to immediate values */
   BITSET_WORD *is_immediate =
      rzalloc_array(memctx, BITSET_WORD, BITSET_WORDS(ctx->alloc));

   /* Gather constant definitions and use */
   agx_foreach_instr_global(ctx, I) {
      if (I->op == AGX_OPCODE_MOV_IMM) {
         assert(I->dest[0].type == AGX_INDEX_NORMAL);
         BITSET_SET(is_immediate, I->dest[0].value);
         values[I->dest[0].value] = I->imm;
      } else {
         agx_foreach_ssa_src(I, s) {
            /* The final uniform index isn't known yet; push_count is its
             * minimum possible value, so this check is a necessary condition
             * for promotability. The real index is re-checked below —
             * NOTE(review): presumably agx_instr_accepts_uniform is monotone
             * in the uniform index; confirm.
             */
            if (BITSET_TEST(is_immediate, I->src[s].value) &&
                agx_instr_accepts_uniform(I->op, s, ctx->out->push_count,
                                          I->src[s].size)) {

               record_use(memctx, constants, values[I->src[s].value],
                          I->src[s].size);
            }
         }
      }
   }

   /* Early exit if there were no constants */
   unsigned nr_nodes = _mesa_hash_table_u64_num_entries(constants);
   if (nr_nodes == 0)
      return;

   /* Collect nodes that are promotable */
   struct constant_info **flat =
      rzalloc_array(memctx, struct constant_info *, nr_nodes);

   unsigned flat_count = 0;
   hash_table_u64_foreach(constants, entry) {
      flat[flat_count++] = entry.data;
   }

   /* Select constants. Even when we can promote everything, sorting keeps hot
    * constants in lower uniforms, required by some instructions.
    */
   qsort(flat, flat_count, sizeof(*flat), priority_compare);

   ctx->out->immediate_base_uniform = ctx->out->push_count;

   /* Promote as many constants as we can. Uniforms and immediates[] are
    * addressed in 16-bit units, hence the *2 conversion to bytes below.
    */
   for (unsigned i = 0; i < flat_count; ++i) {
      struct constant_info *info = flat[i];
      assert(info->nr_promotable_uses > 0);

      /* Try to assign a uniform; stop at the first constant that doesn't fit,
       * later (lower-priority) ones could only be smaller wins.
       */
      unsigned uniform = ALIGN_POT(ctx->out->push_count, info->align_16);
      unsigned new_count = uniform + info->align_16;
      if (new_count > AGX_NUM_UNIFORMS)
         break;

      info->uniform = uniform;
      info->promoted = true;
      ctx->out->push_count = new_count;

      /* Store the value at its offset (in 16-bit units) from the base */
      unsigned size_B = info->align_16 * 2;
      memcpy(&ctx->out->immediates[uniform - ctx->out->immediate_base_uniform],
             &info->value, size_B);

      ctx->out->immediate_size_16 =
         new_count - ctx->out->immediate_base_uniform;
   }

   /* Promote in the IR: replace each eligible use of a promoted immediate
    * with a uniform source, re-checking against the actual uniform index.
    * The now-unused mov_imm instructions are left for DCE to clean up.
    */
   agx_foreach_instr_global(ctx, I) {
      agx_foreach_ssa_src(I, s) {
         if (!BITSET_TEST(is_immediate, I->src[s].value))
            continue;

         struct constant_info *info =
            _mesa_hash_table_u64_search(constants, values[I->src[s].value]);

         if (info && info->promoted &&
             agx_instr_accepts_uniform(I->op, s, info->uniform,
                                       I->src[s].size)) {

            agx_replace_src(I, s, agx_uniform(info->uniform, I->src[s].size));
         }
      }
   }
}
|
||||
|
||||
/*
 * Entry point: promote immediates loaded in the shader to preloaded uniform
 * registers. Gathering, selection, and rewriting happen in pass(); this
 * wrapper only manages the scratch ralloc context.
 */
void
agx_opt_promote_constants(agx_context *ctx)
{
   /* We do not promote constants in preambles since it's pointless and wastes
    * uniform slots.
    */
   if (ctx->is_preamble)
      return;

   void *memctx = ralloc_context(NULL);
   pass(ctx, memctx);
   ralloc_free(memctx);
}
|
||||
|
|
@ -33,6 +33,7 @@ libasahi_agx_files = files(
|
|||
'agx_opt_break_if.c',
|
||||
'agx_opt_empty_else.c',
|
||||
'agx_opt_jmp_none.c',
|
||||
'agx_opt_promote_constants.c',
|
||||
'agx_optimizer.c',
|
||||
'agx_repair_ssa.c',
|
||||
'agx_reindex_ssa.c',
|
||||
|
|
|
|||
|
|
@ -1771,6 +1771,7 @@ agx_compile_nir(struct agx_device *dev, nir_shader *nir,
|
|||
dev->params.num_dies > 1;
|
||||
key.libagx = dev->libagx;
|
||||
key.has_scratch = true;
|
||||
key.promote_constants = true;
|
||||
|
||||
NIR_PASS(_, nir, agx_nir_lower_sysvals, stage, true);
|
||||
NIR_PASS(_, nir, agx_nir_layout_uniforms, compiled, &key.reserved_preamble);
|
||||
|
|
@ -3095,8 +3096,9 @@ agx_build_pipeline(struct agx_batch *batch, struct agx_compiled_shader *cs,
|
|||
unsigned variable_shared_mem, size_t max_subgroups)
|
||||
{
|
||||
struct agx_context *ctx = batch->ctx;
|
||||
struct agx_usc_builder b =
|
||||
agx_alloc_usc_control(&batch->pipeline_pool, cs->push_range_count + 2);
|
||||
unsigned constant_push_ranges = DIV_ROUND_UP(cs->info.immediate_size_16, 64);
|
||||
struct agx_usc_builder b = agx_alloc_usc_control(
|
||||
&batch->pipeline_pool, constant_push_ranges + cs->push_range_count + 2);
|
||||
|
||||
enum pipe_shader_type stage = cs->stage;
|
||||
|
||||
|
|
@ -3123,6 +3125,21 @@ agx_build_pipeline(struct agx_batch *batch, struct agx_compiled_shader *cs,
|
|||
batch->uniforms.tables[cs->push[i].table] + cs->push[i].offset);
|
||||
}
|
||||
|
||||
if (cs->info.immediate_size_16) {
|
||||
/* XXX: do ahead of time */
|
||||
uint64_t ptr = agx_pool_upload_aligned(
|
||||
&batch->pool, cs->info.immediates, cs->info.immediate_size_16 * 2, 64);
|
||||
|
||||
for (unsigned range = 0; range < constant_push_ranges; ++range) {
|
||||
unsigned offset = 64 * range;
|
||||
assert(offset < cs->info.immediate_size_16);
|
||||
|
||||
agx_usc_uniform(&b, cs->info.immediate_base_uniform + offset,
|
||||
MIN2(64, cs->info.immediate_size_16 - offset),
|
||||
ptr + (offset * 2));
|
||||
}
|
||||
}
|
||||
|
||||
if (stage == PIPE_SHADER_FRAGMENT) {
|
||||
agx_usc_tilebuffer(&b, &batch->tilebuffer_layout);
|
||||
} else if (stage == PIPE_SHADER_COMPUTE || stage == PIPE_SHADER_TESS_CTRL) {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue