nir/opt_varyings: back propagate signed zero information to outputs

Foz-DB Navi48:
Totals from 809 (0.67% of 120695) affected shaders:
MaxWaves: 21804 -> 21808 (+0.02%)
Instrs: 863131 -> 861310 (-0.21%); split: -0.22%, +0.01%
CodeSize: 4535500 -> 4523232 (-0.27%); split: -0.30%, +0.03%
VGPRs: 47304 -> 47280 (-0.05%)
SpillSGPRs: 170 -> 82 (-51.76%)
Latency: 6791484 -> 6786880 (-0.07%); split: -0.07%, +0.00%
InvThroughput: 906281 -> 905301 (-0.11%); split: -0.11%, +0.00%
VClause: 16910 -> 16917 (+0.04%); split: -0.01%, +0.05%
SClause: 21856 -> 21827 (-0.13%); split: -0.14%, +0.01%
Copies: 61890 -> 61436 (-0.73%); split: -0.80%, +0.06%
Branches: 19725 -> 19640 (-0.43%)
PreSGPRs: 38011 -> 37851 (-0.42%)
PreVGPRs: 36482 -> 36454 (-0.08%)
VALU: 465316 -> 464323 (-0.21%); split: -0.22%, +0.00%
SALU: 143757 -> 143395 (-0.25%); split: -0.33%, +0.08%
VMEM: 36827 -> 36806 (-0.06%)
SMEM: 37769 -> 37768 (-0.00%)

Reviewed-by: Marek Olšák <maraeo@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41292>
This commit is contained in:
Georg Lehmann 2026-04-27 09:14:00 +02:00 committed by Marge Bot
parent b2bc57551a
commit fac4edbcba
8 changed files with 91 additions and 10 deletions

View file

@ -162,7 +162,13 @@
* * Eliminated output stores get the "no_varying" flag if they are also
* xfb stores or write sysval outputs.
*
* 5. Backward inter-shader code motion
* 5. Link signed zero information
*
* If no loads care about the sign of zero, and if there is no xfb or
* sign-aware sysval usage, set the no_signed_zero flag for output stores.
* This can then be further back propagated by `nir_opt_fp_math_ctrl`.
*
* 6. Backward inter-shader code motion
*
* "Backward" refers to moving code in the opposite direction that shaders
* are executed, i.e. moving code from the consumer to the producer.
@ -329,7 +335,7 @@
* the above case (which is temp0 and temp1 to replace all 3 inputs), let
* us know.
*
* 6. Forward inter-shader code motion
* 7. Forward inter-shader code motion
*
* TODO: Not implemented. The text below is a draft of the description.
*
@ -365,7 +371,7 @@
* we don't increase the GPU overhead measurably by moving code across
* pipeline stages that amplify GPU work.
*
* 7. Compaction to vec4 slots (AKA packing)
* 8. Compaction to vec4 slots (AKA packing)
*
* First, varyings are divided into these groups, and components from each
* group are assigned locations in this order (effectively forcing
@ -764,6 +770,11 @@ struct linkage_info {
*/
BITSET_DECLARE(convergent32_mask, NUM_SCALAR_SLOTS);
BITSET_DECLARE(convergent16_mask, NUM_SCALAR_SLOTS);
/* Mask of components that have an input load, xfb, or sysval usage that
* cares about the sign of zero.
*/
BITSET_DECLARE(signed_zero_mask, NUM_SCALAR_SLOTS);
};
/******************************************************************
@ -1060,6 +1071,32 @@ is_active_sysval_output(struct linkage_info *linkage, unsigned slot,
!nir_intrinsic_io_semantics(intr).no_sysval_output;
}
/* Return whether this store writes an active sysval output whose consumer
 * may distinguish -0.0 from +0.0, i.e. whether the sign of zero must be
 * preserved for it. (NOTE(review): position/clip/cull/point-size sysvals are
 * treated as sign-of-zero insensitive — presumably because fixed-function
 * consumption doesn't observe the sign; confirm against the pass docs.)
 */
static bool
is_sz_sysval(struct linkage_info *linkage, unsigned slot,
             nir_intrinsic_instr *intr)
{
   /* Stores that aren't live sysval outputs never constrain signed zeros. */
   if (!is_active_sysval_output(linkage, slot, intr))
      return false;

   const unsigned vslot = vec4_slot(slot);

   /* Sign-of-zero insensitive sysvals. */
   if (vslot == VARYING_SLOT_POS ||
       vslot == VARYING_SLOT_CLIP_VERTEX ||
       vslot == VARYING_SLOT_PSIZ ||
       vslot == VARYING_SLOT_CLIP_DIST0 ||
       vslot == VARYING_SLOT_CLIP_DIST1 ||
       vslot == VARYING_SLOT_CULL_DIST0 ||
       vslot == VARYING_SLOT_CULL_DIST1)
      return false;

   if (vslot == VARYING_SLOT_TESS_LEVEL_OUTER ||
       vslot == VARYING_SLOT_TESS_LEVEL_INNER ||
       vslot == VARYING_SLOT_BOUNDING_BOX0 ||
       vslot == VARYING_SLOT_BOUNDING_BOX1) {
      /* These enums are aliased with integer mesh outputs. */
      return linkage->producer_stage != MESA_SHADER_TESS_CTRL;
   }

   /* Everything else is assumed to care about the sign of zero. */
   return true;
}
/**
* This function acts like a filter. The pass won't touch varyings that
* return false here, and the return value is saved in the linkage bitmasks,
@ -1276,6 +1313,12 @@ gather_inputs(struct nir_builder *builder, nir_intrinsic_instr *intr, void *cb_d
list_addtail(&node->head, &in->consumer.loads);
in->num_slots = MAX2(in->num_slots, sem.num_slots);
if (!sem.no_signed_zero && intr->intrinsic != nir_intrinsic_load_interpolated_input) {
unsigned nsz_count = nir_src_is_const(offset) ? 1 : sem.num_slots;
for (unsigned i = 0; i < nsz_count; i++)
BITSET_SET(linkage->signed_zero_mask, slot + i * 8);
}
BITSET_SET(linkage->removable_mask, slot);
enum fs_vec4_type fs_vec4_type = FS_VEC4_TYPE_NONE;
@ -1555,6 +1598,12 @@ gather_outputs(struct nir_builder *builder, nir_intrinsic_instr *intr, void *cb_
list_addtail(&node->head, &out->producer.loads);
}
if (is_store ? (is_sz_sysval(linkage, slot, intr) || has_xfb(intr)) : !sem.no_signed_zero) {
unsigned nsz_count = nir_src_is_const(offset) ? 1 : sem.num_slots;
for (unsigned i = 0; i < nsz_count; i++)
BITSET_SET(linkage->signed_zero_mask, slot + i * 8);
}
BITSET_SET(linkage->removable_mask, slot);
/* Indirect indexing. */
@ -4181,6 +4230,35 @@ backward_inter_shader_code_motion(struct linkage_info *linkage,
return false;
}
/******************************************************************
* SIGNED ZERO LINKING
******************************************************************/
/* For every output store in the producer, recompute the no_signed_zero flag
 * from the linked consumer's usage: the flag is set iff none of the scalar
 * slots the store can reach were marked in signed_zero_mask (i.e. no input
 * load, xfb, or sysval use cares about the sign of zero). Reports producer
 * progress whenever a store's semantics actually change.
 */
static void
link_no_signed_zero(struct linkage_info *linkage,
                    nir_opt_varyings_progress *progress)
{
   for (unsigned slot = 0; slot < NUM_SCALAR_SLOTS; slot++) {
      struct scalar_slot *sslot = &linkage->slot[slot];

      list_for_each_entry(struct list_node, iter, &sslot->producer.stores, head) {
         nir_intrinsic_instr *store = iter->instr;
         nir_io_semantics sem = nir_intrinsic_io_semantics(store);

         /* A store with a non-constant offset may land in any of its
          * sem.num_slots vec4 slots; a direct store only touches this one.
          */
         const bool is_direct =
            nir_src_is_const(*nir_get_io_offset_src(store));
         const unsigned num_checked = is_direct ? 1 : sem.num_slots;

         bool nsz = true;
         for (unsigned i = 0; i < num_checked && nsz; i++) {
            /* Scalar slots of consecutive vec4 slots are 8 apart. */
            if (BITSET_TEST(linkage->signed_zero_mask, slot + i * 8))
               nsz = false;
         }

         if (sem.no_signed_zero != nsz) {
            sem.no_signed_zero = nsz;
            nir_intrinsic_set_io_semantics(store, sem);
            *progress |= nir_progress_producer;
         }
      }
   }
}
/******************************************************************
* COMPACTION
******************************************************************/
@ -5396,6 +5474,9 @@ nir_opt_varyings(nir_shader *producer, nir_shader *consumer, bool spirv,
init_linkage(producer, consumer, spirv, max_uniform_components,
max_ubos_per_stage, linkage, &progress);
link_no_signed_zero(linkage, &progress);
/* This must be done after deduplication and before inter-shader code
* motion.
*/

View file

@ -82,7 +82,7 @@ TEST_F(nir_opt_varyings_test_bicm_binary_alu, \
ASSERT_TRUE(!shader_contains_def(b2, load[0])); \
ASSERT_TRUE(!shader_contains_def(b2, load[1])); \
} else { \
ASSERT_EQ(opt_varyings(), 0); \
ASSERT_EQ(opt_varyings() & nir_progress_consumer, 0); \
ASSERT_TRUE(!shader_contains_alu_op(b1, nir_op_##alu, bitsize)); \
ASSERT_TRUE(shader_contains_alu_op(b2, nir_op_##alu, bitsize)); \
ASSERT_TRUE(shader_contains_instr(b1, &store[0]->instr)); \

View file

@ -34,7 +34,7 @@ TEST_F(nir_opt_varyings_test_dead_output, \
store_output(b1, VARYING_SLOT_##slot, 0, nir_type_float##bitsize, \
nir_imm_floatN_t(b1, 0, bitsize), 0); \
\
ASSERT_TRUE(opt_varyings() == 0); \
opt_varyings(); \
ASSERT_TRUE(b1->shader->info.outputs_written == VARYING_BIT_##slot); \
ASSERT_TRUE(shader_contains_instr(b1, &intr->instr)); \
ASSERT_TRUE(nir_intrinsic_io_semantics(intr).no_varying == \

View file

@ -62,7 +62,7 @@ TEST_F(nir_opt_varyings_test_dedup, \
ASSERT_TRUE(!shader_contains_def(b2, load[1][v])); \
} \
} else { \
ASSERT_EQ(opt_varyings(), 0); \
ASSERT_EQ(opt_varyings() & nir_progress_consumer, 0); \
for (unsigned v = 0; v < (is_per_vertex(b1, (gl_varying_slot)pslot[1], false) ? 3 : 1); v++) { \
ASSERT_TRUE(shader_contains_instr(b1, &store[0][v]->instr)); \
ASSERT_TRUE(shader_contains_instr(b1, &store[1][v]->instr)); \

View file

@ -59,7 +59,7 @@ TEST_F(nir_opt_varyings_test_prop_const, \
SHADER_CONST_OUTPUT(producer_stage, consumer_stage, slot, comp, type, bitsize, value, value) \
\
if (nir_slot_is_sysval_output((gl_varying_slot)pindex, MESA_SHADER_##consumer_stage)) { \
ASSERT_TRUE(opt_varyings() == nir_progress_consumer); \
ASSERT_TRUE(opt_varyings() & nir_progress_consumer); \
ASSERT_TRUE(b1->shader->info.outputs_written == BITFIELD64_BIT(pindex)); \
ASSERT_TRUE(nir_intrinsic_io_semantics(store).no_varying); \
} else { \

View file

@ -59,7 +59,7 @@ TEST_F(nir_opt_varyings_test_prop_ubo, \
SHADER_UBO_OUTPUT(producer_stage, consumer_stage, slot, comp, type, bitsize, 1, 1) \
\
if (nir_slot_is_sysval_output((gl_varying_slot)pindex, MESA_SHADER_##consumer_stage)) { \
ASSERT_TRUE(opt_varyings() == nir_progress_consumer); \
ASSERT_TRUE(opt_varyings() & nir_progress_consumer); \
ASSERT_TRUE(b1->shader->info.outputs_written == BITFIELD64_BIT(pindex)); \
ASSERT_TRUE(nir_intrinsic_io_semantics(store).no_varying); \
} else { \

View file

@ -59,7 +59,7 @@ TEST_F(nir_opt_varyings_test_prop_uniform, \
SHADER_UNIFORM_OUTPUT(producer_stage, consumer_stage, slot, comp, type, bitsize, 1, 1) \
\
if (nir_slot_is_sysval_output((gl_varying_slot)pindex, MESA_SHADER_##consumer_stage)) { \
ASSERT_TRUE(opt_varyings() == nir_progress_consumer); \
ASSERT_TRUE(opt_varyings() & nir_progress_consumer); \
ASSERT_TRUE(b1->shader->info.outputs_written == BITFIELD64_BIT(pindex)); \
ASSERT_TRUE(nir_intrinsic_io_semantics(store).no_varying); \
} else { \

View file

@ -59,7 +59,7 @@ TEST_F(nir_opt_varyings_test_prop_uniform_expr, \
SHADER_UNI_EXPR_OUTPUT(producer_stage, consumer_stage, slot, comp, type, bitsize, 1, 1) \
\
if (nir_slot_is_sysval_output((gl_varying_slot)pindex, MESA_SHADER_##consumer_stage)) { \
ASSERT_TRUE(opt_varyings() == nir_progress_consumer); \
ASSERT_TRUE(opt_varyings() & nir_progress_consumer); \
ASSERT_TRUE(b1->shader->info.outputs_written == BITFIELD64_BIT(pindex)); \
ASSERT_TRUE(nir_intrinsic_io_semantics(store).no_varying); \
} else { \