ac/nir: optimize txd(coord, ddx/ddy(coord))

This is done in ac_nir_lower_tex so that we can optimize derivative
calculations with a different exec mask than the texture sample by using
the nir_strict_wqm_coord_amd path.

It's also more aware of divergence than nir_lower_tex is.

fossil-db (gfx1201):
Totals from 103 (0.13% of 79839) affected shaders:
MaxWaves: 2610 -> 2620 (+0.38%)
Instrs: 347283 -> 345912 (-0.39%); split: -0.40%, +0.00%
CodeSize: 1892380 -> 1883824 (-0.45%); split: -0.46%, +0.00%
VGPRs: 8028 -> 7824 (-2.54%)
Latency: 3942575 -> 3939623 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 867147 -> 865281 (-0.22%); split: -0.24%, +0.02%
VClause: 6230 -> 6221 (-0.14%); split: -0.19%, +0.05%
SClause: 3910 -> 3914 (+0.10%); split: -0.26%, +0.36%
Copies: 16091 -> 15721 (-2.30%); split: -2.74%, +0.44%
PreSGPRs: 4651 -> 4658 (+0.15%)
PreVGPRs: 6389 -> 6320 (-1.08%); split: -1.17%, +0.09%
VALU: 228715 -> 227490 (-0.54%); split: -0.54%, +0.01%
SALU: 32763 -> 32767 (+0.01%); split: -0.06%, +0.07%
VMEM: 9027 -> 9024 (-0.03%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37561>
This commit is contained in:
Rhys Perry 2025-09-24 16:16:43 +01:00 committed by Marge Bot
parent 309ac1f0c0
commit 7d552d71e9

View file

@ -221,12 +221,20 @@ typedef struct {
nir_intrinsic_instr *load;
} coord_info;
static bool
can_move_coord(nir_scalar scalar, coord_info *info)
static bool can_move_coord(nir_scalar scalar, coord_info *info, nir_block *toplevel_block, bool txd)
{
if (scalar.def->bit_size != 32)
return false;
/* Allow any def that is reachable from the nir_strict_wqm_coord_amd when
* optimizing nir_texop_txd. Otherwise, we only use nir_strict_wqm_coord_amd
* for cases that D3D11 requires.
*/
if (txd && nir_block_dominates(scalar.def->parent_instr->block, toplevel_block)) {
info->load = NULL;
return true;
}
if (nir_scalar_is_const(scalar))
return true;
@ -273,7 +281,8 @@ struct move_tex_coords_state {
struct loop_if_state {
bool inside_loop;
bool divergent_discard;
unsigned prev_terminate;
unsigned prev_break_continue;
};
static nir_def *
@ -284,6 +293,9 @@ build_coordinate(struct move_tex_coords_state *state, nir_scalar scalar, coord_i
if (nir_scalar_is_const(scalar))
return nir_imm_intN_t(b, nir_scalar_as_uint(scalar), scalar.def->bit_size);
if (!info.load)
return nir_mov_scalar(b, scalar);
ASSERTED nir_src offset = *nir_get_io_offset_src(info.load);
assert(nir_src_is_const(offset) && !nir_src_as_uint(offset));
@ -304,11 +316,48 @@ build_coordinate(struct move_tex_coords_state *state, nir_scalar scalar, coord_i
return res;
}
static bool can_optimize_txd(nir_shader *shader, struct loop_if_state *loop_if, nir_tex_instr *tex,
bool *need_strict_wqm_coord)
{
nir_instr *ddxy_instrs[NIR_MAX_VEC_COMPONENTS * 2];
unsigned size = nir_tex_parse_txd_coords(shader, tex, ddxy_instrs);
if (!size)
return false;
bool incomplete_quad =
tex->instr.block->divergent || loop_if->prev_terminate || loop_if->inside_loop;
*need_strict_wqm_coord = false;
if (incomplete_quad) {
for (unsigned i = 0; i < size; i++) {
nir_instr *instr = ddxy_instrs[i];
*need_strict_wqm_coord |=
instr->block->cf_node.parent != tex->instr.block->cf_node.parent ||
loop_if->prev_terminate > instr->index || loop_if->prev_break_continue > instr->index;
}
}
return true;
}
static bool optimize_txd(nir_tex_instr *tex)
{
if (tex->op == nir_texop_txd) {
tex->op = nir_texop_tex;
nir_tex_instr_remove_src(tex, nir_tex_instr_src_index(tex, nir_tex_src_ddx));
nir_tex_instr_remove_src(tex, nir_tex_instr_src_index(tex, nir_tex_src_ddy));
return true;
}
return false;
}
static bool
move_tex_coords(struct move_tex_coords_state *state, nir_function_impl *impl, nir_instr *instr)
{
nir_tex_instr *tex = nir_instr_as_tex(instr);
if (tex->op != nir_texop_tex && tex->op != nir_texop_txb && tex->op != nir_texop_lod)
if (tex->op != nir_texop_tex && tex->op != nir_texop_txb && tex->op != nir_texop_lod &&
tex->op != nir_texop_txd)
return false;
switch (tex->sampler_dim) {
@ -333,9 +382,11 @@ move_tex_coords(struct move_tex_coords_state *state, nir_function_impl *impl, ni
nir_scalar components[NIR_MAX_VEC_COMPONENTS];
coord_info infos[NIR_MAX_VEC_COMPONENTS];
bool can_move_all = true;
nir_block *toplevel_block = nir_cursor_current_block(state->toplevel_b.cursor);
for (unsigned i = 0; i < tex->coord_components; i++) {
components[i] = nir_scalar_resolved(src->src.ssa, i);
can_move_all &= can_move_coord(components[i], &infos[i]);
can_move_all &=
can_move_coord(components[i], &infos[i], toplevel_block, tex->op == nir_texop_txd);
}
if (!can_move_all)
return false;
@ -377,6 +428,8 @@ move_tex_coords(struct move_tex_coords_state *state, nir_function_impl *impl, ni
if (offset_src >= 0) /* Workaround requirement in nir_tex_instr_src_size(). */
tex->src[offset_src].src_type = nir_tex_src_backend2;
optimize_txd(tex);
state->num_wqm_vgprs += linear_vgpr_size;
return true;
@ -391,7 +444,7 @@ move_ddxy(struct move_tex_coords_state *state, nir_function_impl *impl, nir_intr
bool can_move_all = true;
for (unsigned i = 0; i < num_components; i++) {
components[i] = nir_scalar_resolved(instr->src[0].ssa, i);
can_move_all &= can_move_coord(components[i], &infos[i]);
can_move_all &= can_move_coord(components[i], &infos[i], NULL, false);
}
if (!can_move_all || state->num_wqm_vgprs + num_components > state->options->max_wqm_vgprs)
return false;
@ -415,6 +468,7 @@ static bool move_coords_from_divergent_cf(struct move_tex_coords_state *state,
struct loop_if_state *loop_if, struct exec_list *cf_list)
{
nir_function_impl *impl = state->toplevel_b.impl;
nir_shader *shader = impl->function->shader;
bool progress = false;
foreach_list_typed (nir_cf_node, cf_node, node, cf_list) {
@ -425,27 +479,38 @@ static bool move_coords_from_divergent_cf(struct move_tex_coords_state *state,
bool top_level = cf_list == &impl->body;
nir_foreach_instr (instr, block) {
if (top_level && !loop_if->divergent_discard)
if (top_level && !loop_if->prev_terminate)
state->toplevel_b.cursor = nir_before_instr(instr);
/* Assume quads might be incomplete when inside loops in case of a
* divergent terminate from a previous iteration.
*/
bool incomplete_quad =
block->divergent || loop_if->divergent_discard || loop_if->inside_loop;
block->divergent || loop_if->prev_terminate || loop_if->inside_loop;
if (instr->type == nir_instr_type_tex && incomplete_quad) {
progress |= move_tex_coords(state, impl, instr);
if (instr->type == nir_instr_type_tex) {
nir_tex_instr *tex = nir_instr_as_tex(instr);
if (tex->op == nir_texop_txd) {
bool txd_need_strict_wqm_coord = false;
if (!can_optimize_txd(shader, loop_if, tex, &txd_need_strict_wqm_coord))
continue;
if (!txd_need_strict_wqm_coord)
progress |= optimize_txd(tex);
}
if (state->options->fix_derivs_in_divergent_cf && incomplete_quad)
progress |= move_tex_coords(state, impl, instr);
} else if (instr->type == nir_instr_type_intrinsic) {
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_terminate:
if (block->divergent)
loop_if->divergent_discard = true;
loop_if->prev_terminate = instr->index;
break;
case nir_intrinsic_terminate_if:
if (block->divergent || nir_src_is_divergent(&intrin->src[0]))
loop_if->divergent_discard = true;
loop_if->prev_terminate = instr->index;
break;
case nir_intrinsic_ddx:
case nir_intrinsic_ddy:
@ -459,10 +524,12 @@ static bool move_coords_from_divergent_cf(struct move_tex_coords_state *state,
default:
break;
}
} else if (instr->type == nir_instr_type_jump && block->divergent) {
loop_if->prev_break_continue = instr->index;
}
}
if (top_level && !loop_if->divergent_discard)
if (top_level && !loop_if->prev_terminate)
state->toplevel_b.cursor = nir_after_block_before_jump(block);
break;
}
@ -472,7 +539,9 @@ static bool move_coords_from_divergent_cf(struct move_tex_coords_state *state,
struct loop_if_state inner_else = *loop_if;
progress |= move_coords_from_divergent_cf(state, &inner_then, &nif->then_list);
progress |= move_coords_from_divergent_cf(state, &inner_else, &nif->else_list);
loop_if->divergent_discard |= inner_then.divergent_discard || inner_else.divergent_discard;
loop_if->prev_terminate = MAX2(inner_then.prev_terminate, inner_else.prev_terminate);
loop_if->prev_break_continue =
MAX2(inner_then.prev_break_continue, inner_else.prev_break_continue);
break;
}
case nir_cf_node_loop: {
@ -481,7 +550,7 @@ static bool move_coords_from_divergent_cf(struct move_tex_coords_state *state,
struct loop_if_state inner = *loop_if;
inner.inside_loop = true;
progress |= move_coords_from_divergent_cf(state, &inner, &loop->body);
loop_if->divergent_discard |= inner.divergent_discard;
loop_if->prev_terminate = inner.prev_terminate;
break;
}
case nir_cf_node_function:
@ -496,9 +565,10 @@ bool
ac_nir_lower_tex(nir_shader *nir, const ac_nir_lower_tex_options *options)
{
bool progress = false;
if (options->fix_derivs_in_divergent_cf) {
if (nir->info.stage == MESA_SHADER_FRAGMENT) {
nir_function_impl *impl = nir_shader_get_entrypoint(nir);
nir_metadata_require(impl, nir_metadata_divergence);
nir_metadata_require(
impl, nir_metadata_divergence | nir_metadata_dominance | nir_metadata_instr_index);
struct move_tex_coords_state state;
state.toplevel_b = nir_builder_create(impl);
@ -507,7 +577,8 @@ ac_nir_lower_tex(nir_shader *nir, const ac_nir_lower_tex_options *options)
struct loop_if_state loop_if;
loop_if.inside_loop = false;
loop_if.divergent_discard = false;
loop_if.prev_terminate = 0;
loop_if.prev_break_continue = 0;
bool impl_progress = move_coords_from_divergent_cf(&state, &loop_if, &impl->body);
progress |= nir_progress(impl_progress, impl, nir_metadata_control_flow);
}