ir3,tu,freedreno: Allow more tex coord interpolations for prefetch
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

FS tex prefetch reads tex coords from r0.x, and it doesn't care
what interpolation they have. Thus we can allow every interpolation
that HLSQ_CONTROL_3_REG/HLSQ_CONTROL_4_REG support, namely
(pixel, centroid, sample) x (nopersp, persp). In other words, all
interpolation modes except FLAT are supported.

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34422>
This commit is contained in:
Danylo Piliaiev 2025-04-08 14:22:13 +02:00 committed by Marge Bot
parent c4c7482a90
commit cc7aa31b30
8 changed files with 91 additions and 29 deletions

View file

@ -51,10 +51,10 @@ traces:
freedreno-a630:
checksum: e0e18dcc50ab2e23cead650d64469178
zink-a618:
checksum: b589b5d9ddd3026cbde08f0abe840ea7
checksum: b7e0cdb0db74ea9a31fb7a75ae0d76fc
zink-a630:
label: [skip, flakes]
checksum: b589b5d9ddd3026cbde08f0abe840ea7
checksum: b7e0cdb0db74ea9a31fb7a75ae0d76fc
text: seems to trigger oomkilling recently
valve/counterstrike-source-v2.trace:

View file

@ -3854,7 +3854,7 @@ emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
ir3_builder_at(ir3_before_terminator(ctx->in_block));
sam = ir3_SAM(&build, opc, type, MASK(ncomp), 0, NULL,
get_barycentric(ctx, IJ_PERSP_PIXEL), 0);
sam->prefetch.input_offset = ir3_nir_coord_offset(tex->src[idx].src.ssa);
sam->prefetch.input_offset = ir3_nir_coord_offset(tex->src[idx].src.ssa, NULL);
/* make sure not to add irrelevant flags like S2EN */
sam->flags = flags | (info.flags & IR3_INSTR_B);
sam->prefetch.tex = info.tex_idx;
@ -5810,7 +5810,8 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
int idx = 0;
foreach_input (instr, ir) {
if (instr->input.sysval != SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL)
if (instr->input.sysval !=
(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + so->prefetch_bary_type))
continue;
assert(idx < 2);

View file

@ -109,8 +109,9 @@ ir3_context_init(struct ir3_compiler *compiler, struct ir3_shader *shader,
/* Enable the texture pre-fetch feature only a4xx onwards. But
* only enable it on generations that have been tested:
*/
if ((so->type == MESA_SHADER_FRAGMENT) && compiler->has_fs_tex_prefetch)
NIR_PASS(_, ctx->s, ir3_nir_lower_tex_prefetch);
if ((so->type == MESA_SHADER_FRAGMENT) && compiler->has_fs_tex_prefetch) {
NIR_PASS(_, ctx->s, ir3_nir_lower_tex_prefetch, &so->prefetch_bary_type);
}
bool vectorized = false;
NIR_PASS(vectorized, ctx->s, nir_opt_vectorize, ir3_nir_vectorize_filter,

View file

@ -27,8 +27,9 @@ bool ir3_nir_lower_push_consts_to_preamble(nir_shader *nir,
bool ir3_nir_lower_driver_params_to_ubo(nir_shader *nir,
struct ir3_shader_variant *v);
bool ir3_nir_move_varying_inputs(nir_shader *shader);
int ir3_nir_coord_offset(nir_def *ssa);
bool ir3_nir_lower_tex_prefetch(nir_shader *shader);
int ir3_nir_coord_offset(nir_def *ssa, gl_system_value *bary_type);
bool ir3_nir_lower_tex_prefetch(nir_shader *shader,
enum ir3_bary *prefetch_bary_type);
bool ir3_nir_lower_layer_id(nir_shader *shader);
bool ir3_nir_lower_frag_shading_rate(nir_shader *shader);
bool ir3_nir_lower_primitive_shading_rate(nir_shader *shader);

View file

@ -5,13 +5,25 @@
#include "ir3_nir.h"
#include "util/u_vector.h"
/**
* A pass which detects tex instructions which are candidate to be executed
* prior to FS shader start, and change them to nir_texop_tex_prefetch.
*/
/* A texture instruction whose coordinate source passed the prefetch
 * eligibility check, recorded together with the barycentric used for that
 * coordinate. The instruction is only rewritten to nir_texop_tex_prefetch
 * later, once a single barycentric has been chosen for the whole shader.
 */
typedef struct {
/* The candidate texture instruction. */
nir_tex_instr *tex;
/* Barycentric of the coordinate, stored as an offset from
 * SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL.
 */
enum ir3_bary bary;
} tex_prefetch_candidate;
/* State accumulated while scanning a shader for prefetchable texture
 * instructions.
 */
typedef struct {
/* Growable list of tex_prefetch_candidate entries, one per eligible tex
 * instruction found during the scan.
 */
struct u_vector candidates;
/* Number of candidates seen for each barycentric, indexed by enum ir3_bary.
 * Used to pick the barycentric that lets the most tex ops be prefetched.
 */
uint32_t per_bary_candidates[IJ_COUNT];
} ir3_prefetch_state;
static int
coord_offset(nir_def *ssa)
coord_offset(nir_def *ssa, gl_system_value *bary_type)
{
nir_instr *parent_instr = ssa->parent_instr;
@ -27,7 +39,7 @@ coord_offset(nir_def *ssa)
if (alu->op != nir_op_vec2)
return -1;
int base_src_offset = coord_offset(alu->src[0].src.ssa);
int base_src_offset = coord_offset(alu->src[0].src.ssa, bary_type);
if (base_src_offset < 0)
return -1;
@ -35,7 +47,7 @@ coord_offset(nir_def *ssa)
/* NOTE it might be possible to support more than 2D? */
for (int i = 1; i < 2; i++) {
int nth_src_offset = coord_offset(alu->src[i].src.ssa);
int nth_src_offset = coord_offset(alu->src[i].src.ssa, bary_type);
if (nth_src_offset < 0)
return -1;
int nth_offset = nth_src_offset + alu->src[i].swizzle[0];
@ -62,20 +74,26 @@ coord_offset(nir_def *ssa)
nir_intrinsic_instr *interp =
nir_instr_as_intrinsic(input->src[0].ssa->parent_instr);
if (interp->intrinsic != nir_intrinsic_load_barycentric_pixel)
if (interp->intrinsic != nir_intrinsic_load_barycentric_pixel &&
interp->intrinsic != nir_intrinsic_load_barycentric_sample &&
interp->intrinsic != nir_intrinsic_load_barycentric_centroid)
return -1;
/* interpolation modes such as noperspective aren't covered by the other
/* interpolation modes such as flat aren't covered by the other
* test, we need to explicitly check for them here.
*/
unsigned interp_mode = nir_intrinsic_interp_mode(interp);
if (interp_mode != INTERP_MODE_NONE && interp_mode != INTERP_MODE_SMOOTH)
if (interp_mode != INTERP_MODE_NONE && interp_mode != INTERP_MODE_SMOOTH &&
interp_mode != INTERP_MODE_NOPERSPECTIVE)
return -1;
/* we also need a const input offset: */
if (!nir_src_is_const(input->src[1]))
return -1;
if (bary_type)
*bary_type = ir3_nir_intrinsic_barycentric_sysval(interp);
unsigned base = nir_src_as_uint(input->src[1]) + nir_intrinsic_base(input);
unsigned comp = nir_intrinsic_component(input);
@ -83,11 +101,13 @@ coord_offset(nir_def *ssa)
}
int
ir3_nir_coord_offset(nir_def *ssa)
ir3_nir_coord_offset(nir_def *ssa, gl_system_value *bary_type)
{
assert(ssa->num_components == 2);
return coord_offset(ssa);
if (bary_type)
*bary_type = SYSTEM_VALUE_MAX;
return coord_offset(ssa, bary_type);
}
static bool
@ -136,7 +156,7 @@ ok_tex_samp(nir_tex_instr *tex)
}
static bool
lower_tex_prefetch_block(nir_block *block)
lower_tex_prefetch_block(nir_block *block, ir3_prefetch_state *state)
{
bool progress = false;
@ -168,8 +188,14 @@ lower_tex_prefetch_block(nir_block *block)
/* First source should be the sampling coordinate. */
nir_tex_src *coord = &tex->src[idx];
if (ir3_nir_coord_offset(coord->src.ssa) >= 0) {
tex->op = nir_texop_tex_prefetch;
gl_system_value bary_type;
if (ir3_nir_coord_offset(coord->src.ssa, &bary_type) >= 0) {
enum ir3_bary bary = bary_type - SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
state->per_bary_candidates[bary]++;
tex_prefetch_candidate *candidate = u_vector_add(&state->candidates);
candidate->tex = tex;
candidate->bary = bary;
progress |= true;
}
@ -179,7 +205,7 @@ lower_tex_prefetch_block(nir_block *block)
}
static bool
lower_tex_prefetch_func(nir_function_impl *impl)
lower_tex_prefetch_func(nir_function_impl *impl, ir3_prefetch_state *state)
{
/* Only instructions in the outer-most block are considered eligible for
* pre-dispatch, because they need to be move-able to the beginning of the
@ -201,18 +227,22 @@ lower_tex_prefetch_func(nir_function_impl *impl)
}
}
bool progress = lower_tex_prefetch_block(block);
bool progress = lower_tex_prefetch_block(block, state);
return nir_progress(progress, impl, nir_metadata_control_flow);
}
bool
ir3_nir_lower_tex_prefetch(nir_shader *shader)
ir3_nir_lower_tex_prefetch(nir_shader *shader,
enum ir3_bary *prefetch_bary_type)
{
bool progress = false;
assert(shader->info.stage == MESA_SHADER_FRAGMENT);
ir3_prefetch_state state = {};
u_vector_init(&state.candidates, 4, sizeof(tex_prefetch_candidate));
nir_foreach_function (function, shader) {
/* Only texture sampling instructions inside the main function
* are eligible for pre-dispatch.
@ -220,8 +250,36 @@ ir3_nir_lower_tex_prefetch(nir_shader *shader)
if (!function->impl || !function->is_entrypoint)
continue;
progress |= lower_tex_prefetch_func(function->impl);
progress |= lower_tex_prefetch_func(function->impl, &state);
}
if (progress) {
/* We cannot prefetch tex ops that use different interpolation modes,
* so we have to choose a single mode to prefetch. We select the
* interpolation mode that would allow us to prefetch the most tex ops.
*/
uint32_t max_tex_with_bary = 0;
uint32_t chosen_bary = 0;
for (int i = 0; i < IJ_COUNT; i++) {
if (state.per_bary_candidates[i] > max_tex_with_bary) {
max_tex_with_bary = state.per_bary_candidates[i];
chosen_bary = i;
}
}
tex_prefetch_candidate *candidate;
u_vector_foreach(candidate, &state.candidates) {
if (candidate->bary == chosen_bary) {
candidate->tex->op = nir_texop_tex_prefetch;
}
}
*prefetch_bary_type = chosen_bary;
} else {
*prefetch_bary_type = IJ_COUNT;
}
u_vector_finish(&state.candidates);
return progress;
}

View file

@ -879,6 +879,7 @@ struct ir3_shader_variant {
/* texture sampler pre-dispatches */
uint32_t num_sampler_prefetch;
struct ir3_sampler_prefetch sampler_prefetch[IR3_MAX_SAMPLER_PREFETCH];
enum ir3_bary prefetch_bary_type;
/* If true, the last use of helper invocations is the texture prefetch and
* they should be disabled for the actual shader. Equivalent to adding

View file

@ -1685,9 +1685,9 @@ tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs)
ij_regid[i] = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i);
if (fs->num_sampler_prefetch > 0) {
/* It seems like ij_pix is *required* to be r0.x */
assert(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]) ||
ij_regid[IJ_PERSP_PIXEL] == regid(0, 0));
/* FS prefetch reads coordinates from r0.x */
assert(!VALIDREG(ij_regid[fs->prefetch_bary_type]) ||
ij_regid[fs->prefetch_bary_type] == regid(0, 0));
}
tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch);

View file

@ -939,9 +939,9 @@ emit_fs_inputs(struct fd_ringbuffer *ring, const struct program_builder *b)
ij_regid[i] = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i);
if (fs->num_sampler_prefetch > 0) {
/* It seems like ij_pix is *required* to be r0.x */
assert(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]) ||
ij_regid[IJ_PERSP_PIXEL] == regid(0, 0));
/* FS prefetch reads coordinates from r0.x */
assert(!VALIDREG(ij_regid[fs->prefetch_bary_type]) ||
ij_regid[fs->prefetch_bary_type] == regid(0, 0));
}
OUT_PKT4(ring, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch);