hk,agx: promote bindless samplers

Promote bindless samplers into sampler state registers via the
bindless_sampler_agx intrinsic.

Totals from 29771 (55.11% of 54019) affected shaders:
MaxWaves: 28934080 -> 28938304 (+0.01%); split: +0.02%, -0.00%
Instrs: 16623874 -> 16369120 (-1.53%); split: -1.54%, +0.01%
CodeSize: 117532138 -> 115994992 (-1.31%); split: -1.32%, +0.01%
Spills: 12721 -> 12652 (-0.54%); split: -0.72%, +0.17%
Fills: 6733 -> 6636 (-1.44%); split: -1.96%, +0.52%
Scratch: 132994 -> 132712 (-0.21%); split: -0.22%, +0.01%
ALU: 13054253 -> 12803059 (-1.92%); split: -1.93%, +0.01%
FSCIB: 13054138 -> 12802912 (-1.92%); split: -1.94%, +0.01%
IC: 3916012 -> 3915588 (-0.01%); split: -0.01%, +0.00%
GPRs: 2290907 -> 2289519 (-0.06%); split: -0.07%, +0.01%
Uniforms: 6794773 -> 6696943 (-1.44%); split: -1.44%, +0.00%
Preamble instrs: 6953594 -> 7024455 (+1.02%); split: -0.37%, +1.39%

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36127>
This commit is contained in:
Alyssa Rosenzweig 2025-07-14 19:13:31 -04:00 committed by Marge Bot
parent 49f042c5e8
commit 642c6c6f62
8 changed files with 130 additions and 37 deletions

View file

@ -792,7 +792,8 @@ static agx_index
agx_translate_bindless_handle(agx_builder *b, nir_src *handle, agx_index *base)
{
nir_intrinsic_instr *intr = nir_src_as_intrinsic(*handle);
assert(intr->intrinsic == nir_intrinsic_bindless_image_agx);
assert(intr->intrinsic == nir_intrinsic_bindless_image_agx ||
intr->intrinsic == nir_intrinsic_bindless_sampler_agx);
*base = agx_uniform(nir_intrinsic_desc_set(intr), AGX_SIZE_64);
return agx_src_index(&intr->src[0]);
@ -822,10 +823,17 @@ agx_emit_store_preamble(agx_builder *b, nir_intrinsic_instr *instr)
nir_preamble_class cls = nir_intrinsic_preamble_class(instr);
unsigned base = nir_intrinsic_base(instr);
if (cls == nir_preamble_class_image) {
if (cls != nir_preamble_class_general) {
agx_index heap, offset;
offset = agx_translate_bindless_handle(b, &instr->src[0], &heap);
return agx_tex_state_store(b, heap, offset, base / 2);
/* base is 32-bit units for images but 16-bit for samplers, hence the
* division difference to convert into texture/sampler state units.
*/
if (cls == nir_preamble_class_image)
return agx_tex_state_store(b, heap, offset, base / 2);
else
return agx_sampler_state_store(b, heap, offset, base);
}
agx_index vec = agx_src_index(&instr->src[0]);
@ -1740,6 +1748,7 @@ agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)
return agx_emit_export(b, nir_intrinsic_base(instr), instr->src[0]);
case nir_intrinsic_bindless_image_agx:
case nir_intrinsic_bindless_sampler_agx:
/* These must always be chased */
return NULL;
@ -2217,8 +2226,14 @@ agx_emit_tex(agx_builder *b, nir_tex_instr *instr)
{
agx_index coords = agx_null(), bindless = agx_immediate(0),
texture = agx_immediate(instr->texture_index),
sampler = agx_immediate(0), lod = agx_immediate(0),
compare = agx_null(), packed_offset = agx_null();
sampler = agx_immediate(instr->sampler_index),
lod = agx_immediate(0), compare = agx_null(),
packed_offset = agx_null();
/* Default to the txf sampler at ss0 */
if (!nir_tex_instr_need_sampler(instr)) {
sampler = agx_immediate(0);
}
bool lod_is_zero = true;
@ -3047,7 +3062,7 @@ optimize_bounds(nir_builder *b, nir_intrinsic_instr *intr, void *data)
static void
agx_optimize_nir(nir_shader *nir, bool soft_fault, uint16_t *preamble_size,
uint8_t *ts_count)
uint8_t *ts_count, uint8_t *ss_count)
{
/* This runs only once up front since other optimizations don't affect it */
NIR_PASS(_, nir, nir_opt_shrink_stores, true);
@ -3151,13 +3166,26 @@ agx_optimize_nir(nir_shader *nir, bool soft_fault, uint16_t *preamble_size,
NIR_PASS(_, nir, agx_nir_lower_fminmax);
if (preamble_size && (!(agx_compiler_debug & AGX_DBG_NOPREAMBLE))) {
unsigned temp = *preamble_size;
unsigned temp_ts_count = ts_count ? *ts_count : 1000 /* large finite */;
NIR_PASS(_, nir, agx_nir_opt_preamble, &temp, &temp_ts_count);
*preamble_size = temp;
unsigned sizes[] = {
*preamble_size,
ts_count ? *ts_count : 1000 /* large finite */,
ss_count ? *ss_count : 1000 /* large finite */,
};
/* Don't clobber txf sampler */
if (sizes[2] == 0)
sizes[2]++;
NIR_PASS(_, nir, agx_nir_opt_preamble, sizes);
*preamble_size = sizes[0];
if (ts_count)
*ts_count = temp_ts_count;
*ts_count = sizes[1];
/* if something other than the txf sampler is written... */
if (ss_count && sizes[2] > 1)
*ss_count = sizes[2];
}
/* Forming preambles may dramatically reduce the instruction count
@ -3907,7 +3935,9 @@ agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key,
agx_optimize_nir(
nir, key->dev.soft_fault, key->secondary ? NULL : &info->push_count,
(key->secondary || !key->promote_textures) ? NULL
: &info->texture_state_count);
: &info->texture_state_count,
(key->secondary || !key->promote_textures) ? NULL
: &info->sampler_state_count);
if (nir->info.stage == MESA_SHADER_FRAGMENT) {
info->varyings.fs.nr_cf = key->fs.cf_base;

View file

@ -287,9 +287,10 @@ struct agx_shader_key {
*/
bool promote_constants;
/* Similarly whether the driver supports promoting bindless textures.
* Currently this works only if non-bindless textures are not used, but
* none of our drivers mix bindless / non-bindless usage.
/* Similarly whether the driver supports promoting bindless
* textures/samplers. Currently this works only if non-bindless
* textures/samplers are not used, but none of our drivers mix bindless /
* non-bindless usage.
*/
bool promote_textures;

View file

@ -1080,8 +1080,7 @@ void agx_emit_parallel_copies(agx_builder *b, struct agx_copy *copies,
void agx_compute_liveness(agx_context *ctx);
void agx_liveness_ins_update(BITSET_WORD *live, agx_instr *I);
bool agx_nir_opt_preamble(nir_shader *s, unsigned *preamble_size,
unsigned *ts_count);
bool agx_nir_opt_preamble(nir_shader *s, unsigned *sizes);
bool agx_nir_lower_load_mask(nir_shader *shader);
bool agx_nir_lower_ubo(nir_shader *shader);
bool agx_nir_lower_shared_bitsize(nir_shader *shader);

View file

@ -8,18 +8,28 @@
#include "util/macros.h"
#include "agx_compiler.h"
#include "nir.h"
#include "nir_builder_opcodes.h"
#include "nir_intrinsics.h"
#include "nir_intrinsics_indices.h"
#include "nir_opcodes.h"
static bool
is_promotable_texture_handle(nir_def *def)
static nir_preamble_class
preamble_class(nir_def *def)
{
nir_instr *instr = def->parent_instr;
if (instr->type != nir_instr_type_intrinsic)
return false;
return nir_preamble_class_general;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
return intr->intrinsic == nir_intrinsic_bindless_image_agx &&
nir_intrinsic_desc_set(intr) < 32 /* encoding restriction */;
if (nir_intrinsic_has_desc_set(intr) && nir_intrinsic_desc_set(intr) >= 32)
return nir_preamble_class_general /* encoding restriction */;
if (intr->intrinsic == nir_intrinsic_bindless_image_agx)
return nir_preamble_class_image;
else if (intr->intrinsic == nir_intrinsic_bindless_sampler_agx)
return nir_preamble_class_sampler;
else
return nir_preamble_class_general;
}
static void
@ -30,8 +40,7 @@ def_size(nir_def *def, unsigned *size, unsigned *align,
*size = (bit_size * def->num_components) / 16;
*align = bit_size / 16;
*class = is_promotable_texture_handle(def) ? nir_preamble_class_image
: nir_preamble_class_general;
*class = preamble_class(def);
}
static bool
@ -238,6 +247,7 @@ instr_cost(nir_instr *instr, const void *data)
case nir_intrinsic_ddy_coarse:
return 1.0;
case nir_intrinsic_bindless_image_agx:
case nir_intrinsic_bindless_sampler_agx:
/* It's worth promoting even with a constant source, but it doesn't
* turn into instructions so should be less than any other normal
* instruction... But just enough to get over the image rewrite_cost.
@ -319,6 +329,9 @@ static const nir_opt_preamble_options preamble_options = {
/* We have at least 32 texture state registers. TODO: check for more? */
.preamble_storage_size[nir_preamble_class_image] = 32,
/* We have at least 16 sampler state registers. TODO: check for more? */
.preamble_storage_size[nir_preamble_class_sampler] = 16,
};
/*
@ -345,6 +358,31 @@ lower_store_preamble(nir_builder *b, nir_intrinsic_instr *intr, void *data)
static bool
lower_preamble(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
if (intr->intrinsic == nir_intrinsic_bindless_sampler_agx) {
/* Rematerialize bindless_sampler_agx before store_preamble with only the
* byte offset (first source), not the sampler index.
*/
nir_foreach_use_safe(use, &intr->def) {
nir_instr *parent = nir_src_parent_instr(use);
if (parent->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *pintr = nir_instr_as_intrinsic(parent);
if (pintr->intrinsic != nir_intrinsic_store_preamble ||
nir_intrinsic_preamble_class(pintr) != nir_preamble_class_sampler)
continue;
b->cursor = nir_before_src(use);
nir_def *repl =
nir_bindless_sampler_agx(b, intr->src[0].ssa, nir_undef(b, 1, 16),
.desc_set = nir_intrinsic_desc_set(intr));
nir_src_rewrite(use, repl);
}
/* Replace other uses with just the sampler index. */
nir_def_replace(&intr->def, intr->src[1].ssa);
return true;
}
if (intr->intrinsic != nir_intrinsic_load_preamble)
return false;
@ -354,6 +392,7 @@ lower_preamble(nir_builder *b, nir_intrinsic_instr *intr, void *data)
unsigned base = nir_intrinsic_base(intr);
nir_def *new_ = NULL;
bool ts = nir_intrinsic_preamble_class(intr) == nir_preamble_class_image;
bool ss = nir_intrinsic_preamble_class(intr) == nir_preamble_class_sampler;
if (!ts && heaps[base] >= 0) {
new_ = nir_bindless_image_agx(b, &intr->def, .desc_set = heaps[base]);
}
@ -374,13 +413,14 @@ lower_preamble(nir_builder *b, nir_intrinsic_instr *intr, void *data)
} else if (parent->type == nir_instr_type_tex) {
nir_tex_instr *tex = nir_instr_as_tex(parent);
nir_tex_src *src = (nir_tex_src *)use;
if (src->src_type != nir_tex_src_texture_handle)
continue;
if (ts) {
if (src->src_type == nir_tex_src_sampler_handle && ss) {
nir_steal_tex_src(tex, nir_tex_src_sampler_handle);
tex->sampler_index = base;
} else if (src->src_type == nir_tex_src_texture_handle && ts) {
nir_steal_tex_src(tex, nir_tex_src_texture_handle);
tex->texture_index = base / 2;
} else {
} else if (src->src_type == nir_tex_src_texture_handle) {
assert(new_ != NULL);
nir_src_rewrite(use, new_);
}
@ -391,15 +431,10 @@ lower_preamble(nir_builder *b, nir_intrinsic_instr *intr, void *data)
}
bool
agx_nir_opt_preamble(nir_shader *nir, unsigned *preamble_size,
unsigned *ts_count)
agx_nir_opt_preamble(nir_shader *nir, unsigned *sizes)
{
bool progress = false;
unsigned sizes[] = {*preamble_size, *ts_count};
NIR_PASS(progress, nir, nir_opt_preamble, &preamble_options, sizes);
*preamble_size = sizes[0];
*ts_count = sizes[1];
if (progress) {
int16_t heap[512];

View file

@ -114,6 +114,7 @@ write_sampled_image_view_desc(struct hk_descriptor_set *set,
*/
desc[plane].sampler_index =
sampler->planes[sampler_plane].hw->index + 28;
desc[plane].sampler = sampler->planes[sampler_plane].hw->key;
desc[plane].lod_bias_fp16 = sampler->lod_bias_fp16;
desc[plane].clamp_0_sampler_index_or_negative = -1;
}

View file

@ -23,6 +23,7 @@ struct hk_descriptor_set_layout;
struct hk_sampled_image_descriptor {
struct agx_texture_packed tex;
struct agx_sampler_packed sampler;
uint16_t sampler_index;
/* Negative if there is no border colour, else the clamp=0 sampler index used
@ -42,7 +43,7 @@ struct hk_sampled_image_descriptor {
uint16_t pad;
/* TODO: This should probably be a heap! */
uint32_t border[4];
uint8_t pad2[12];
uint8_t pad2[4];
};
static_assert(sizeof(struct hk_sampled_image_descriptor) == 64,
"hk_sampled_image_descriptor has no holes");

View file

@ -671,8 +671,9 @@ lower_tex(nir_builder *b, nir_tex_instr *tex,
if (sampler != NULL) {
unsigned offs =
offsetof(struct hk_sampled_image_descriptor, sampler_index);
bool clamp_to_0 = tex->backend_flags & AGX_TEXTURE_FLAG_CLAMP_TO_0;
if (tex->backend_flags & AGX_TEXTURE_FLAG_CLAMP_TO_0) {
if (clamp_to_0) {
offs = offsetof(struct hk_sampled_image_descriptor,
clamp_0_sampler_index_or_negative);
}
@ -681,6 +682,30 @@ lower_tex(nir_builder *b, nir_tex_instr *tex,
b, 1, 16, nir_src_as_deref(nir_src_for_ssa(sampler)),
plane_offset_B + offs, ctx);
if (!clamp_to_0) {
uint32_t set, binding;
nir_def *idx;
get_resource_deref_binding(b,
nir_src_as_deref(nir_src_for_ssa(sampler)),
&set, &binding, &idx);
const struct hk_descriptor_set_binding_layout *binding_layout =
get_binding_layout(set, binding, ctx);
if (ctx->clamp_desc_array_bounds)
idx =
nir_umin(b, idx, nir_imm_int(b, binding_layout->array_size - 1));
assert(binding_layout->stride > 0);
nir_def *desc_offs_B = nir_iadd_imm(
b, nir_imul_imm(b, idx, binding_layout->stride),
binding_layout->offset + plane_offset_B +
offsetof(struct hk_sampled_image_descriptor, sampler));
index =
nir_bindless_sampler_agx(b, desc_offs_B, index, .desc_set = set);
}
nir_tex_instr_add_src(tex, nir_tex_src_sampler_handle, index);
}

View file

@ -952,7 +952,8 @@ static bool
lower_uniforms(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
struct fixed_uniforms *ctx = data;
if (intr->intrinsic == nir_intrinsic_bindless_image_agx) {
if (intr->intrinsic == nir_intrinsic_bindless_image_agx ||
intr->intrinsic == nir_intrinsic_bindless_sampler_agx) {
/* Change of units from sets to uniforms */
nir_intrinsic_set_desc_set(
intr, ctx->sets + (nir_intrinsic_desc_set(intr) * 4));