mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-06 11:38:05 +02:00
nir/opt_varyings: back propagate signed zero information to outputs
Foz-DB Navi48:
Totals from 809 (0.67% of 120695) affected shaders:
MaxWaves: 21804 -> 21808 (+0.02%)
Instrs: 863131 -> 861310 (-0.21%); split: -0.22%, +0.01%
CodeSize: 4535500 -> 4523232 (-0.27%); split: -0.30%, +0.03%
VGPRs: 47304 -> 47280 (-0.05%)
SpillSGPRs: 170 -> 82 (-51.76%)
Latency: 6791484 -> 6786880 (-0.07%); split: -0.07%, +0.00%
InvThroughput: 906281 -> 905301 (-0.11%); split: -0.11%, +0.00%
VClause: 16910 -> 16917 (+0.04%); split: -0.01%, +0.05%
SClause: 21856 -> 21827 (-0.13%); split: -0.14%, +0.01%
Copies: 61890 -> 61436 (-0.73%); split: -0.80%, +0.06%
Branches: 19725 -> 19640 (-0.43%)
PreSGPRs: 38011 -> 37851 (-0.42%)
PreVGPRs: 36482 -> 36454 (-0.08%)
VALU: 465316 -> 464323 (-0.21%); split: -0.22%, +0.00%
SALU: 143757 -> 143395 (-0.25%); split: -0.33%, +0.08%
VMEM: 36827 -> 36806 (-0.06%)
SMEM: 37769 -> 37768 (-0.00%)

Reviewed-by: Marek Olšák <maraeo@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41292>
This commit is contained in:
parent
b2bc57551a
commit
fac4edbcba
8 changed files with 91 additions and 10 deletions
|
|
@ -162,7 +162,13 @@
|
|||
* * Eliminated output stores get the "no_varying" flag if they are also
|
||||
* xfb stores or write sysval outputs.
|
||||
*
|
||||
* 5. Backward inter-shader code motion
|
||||
* 5. Link signed zero information
|
||||
*
|
||||
* If no loads care about the sign of zero, and if there is no xfb or
|
||||
* sign aware sysval usage, set the no_signed_zero flag for output stores.
|
||||
* This can then be further back propagated by `nir_opt_fp_math_ctrl`.
|
||||
*
|
||||
* 6. Backward inter-shader code motion
|
||||
*
|
||||
* "Backward" refers to moving code in the opposite direction that shaders
|
||||
* are executed, i.e. moving code from the consumer to the producer.
|
||||
|
|
@ -329,7 +335,7 @@
|
|||
* the above case (which is temp0 and temp1 to replace all 3 inputs), let
|
||||
* us know.
|
||||
*
|
||||
* 6. Forward inter-shader code motion
|
||||
* 7. Forward inter-shader code motion
|
||||
*
|
||||
* TODO: Not implemented. The text below is a draft of the description.
|
||||
*
|
||||
|
|
@ -365,7 +371,7 @@
|
|||
* we don't increase the GPU overhead measurably by moving code across
|
||||
* pipeline stages that amplify GPU work.
|
||||
*
|
||||
* 7. Compaction to vec4 slots (AKA packing)
|
||||
* 8. Compaction to vec4 slots (AKA packing)
|
||||
*
|
||||
* First, varyings are divided into these groups, and components from each
|
||||
* group are assigned locations in this order (effectively forcing
|
||||
|
|
@ -764,6 +770,11 @@ struct linkage_info {
|
|||
*/
|
||||
BITSET_DECLARE(convergent32_mask, NUM_SCALAR_SLOTS);
|
||||
BITSET_DECLARE(convergent16_mask, NUM_SCALAR_SLOTS);
|
||||
|
||||
/* Mask of components that have an input load, xfb, or sysval usage that
|
||||
* cares about the sign of zero.
|
||||
*/
|
||||
BITSET_DECLARE(signed_zero_mask, NUM_SCALAR_SLOTS);
|
||||
};
|
||||
|
||||
/******************************************************************
|
||||
|
|
@ -1060,6 +1071,32 @@ is_active_sysval_output(struct linkage_info *linkage, unsigned slot,
|
|||
!nir_intrinsic_io_semantics(intr).no_sysval_output;
|
||||
}
|
||||
|
||||
static bool
|
||||
is_sz_sysval(struct linkage_info *linkage, unsigned slot,
|
||||
nir_intrinsic_instr *intr)
|
||||
{
|
||||
if (!is_active_sysval_output(linkage, slot, intr))
|
||||
return false;
|
||||
|
||||
switch (vec4_slot(slot)) {
|
||||
case VARYING_SLOT_POS:
|
||||
case VARYING_SLOT_CLIP_VERTEX:
|
||||
case VARYING_SLOT_PSIZ:
|
||||
case VARYING_SLOT_CLIP_DIST0:
|
||||
case VARYING_SLOT_CLIP_DIST1:
|
||||
case VARYING_SLOT_CULL_DIST0:
|
||||
case VARYING_SLOT_CULL_DIST1:
|
||||
return false;
|
||||
case VARYING_SLOT_TESS_LEVEL_OUTER:
|
||||
case VARYING_SLOT_TESS_LEVEL_INNER:
|
||||
case VARYING_SLOT_BOUNDING_BOX0:
|
||||
case VARYING_SLOT_BOUNDING_BOX1:
|
||||
/* These enums are aliased with integer mesh outputs. */
|
||||
return linkage->producer_stage != MESA_SHADER_TESS_CTRL;
|
||||
default: return true;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This function acts like a filter. The pass won't touch varyings that
|
||||
* return false here, and the return value is saved in the linkage bitmasks,
|
||||
|
|
@ -1276,6 +1313,12 @@ gather_inputs(struct nir_builder *builder, nir_intrinsic_instr *intr, void *cb_d
|
|||
list_addtail(&node->head, &in->consumer.loads);
|
||||
in->num_slots = MAX2(in->num_slots, sem.num_slots);
|
||||
|
||||
if (!sem.no_signed_zero && intr->intrinsic != nir_intrinsic_load_interpolated_input) {
|
||||
unsigned nsz_count = nir_src_is_const(offset) ? 1 : sem.num_slots;
|
||||
for (unsigned i = 0; i < nsz_count; i++)
|
||||
BITSET_SET(linkage->signed_zero_mask, slot + i * 8);
|
||||
}
|
||||
|
||||
BITSET_SET(linkage->removable_mask, slot);
|
||||
|
||||
enum fs_vec4_type fs_vec4_type = FS_VEC4_TYPE_NONE;
|
||||
|
|
@ -1555,6 +1598,12 @@ gather_outputs(struct nir_builder *builder, nir_intrinsic_instr *intr, void *cb_
|
|||
list_addtail(&node->head, &out->producer.loads);
|
||||
}
|
||||
|
||||
if (is_store ? (is_sz_sysval(linkage, slot, intr) || has_xfb(intr)) : !sem.no_signed_zero) {
|
||||
unsigned nsz_count = nir_src_is_const(offset) ? 1 : sem.num_slots;
|
||||
for (unsigned i = 0; i < nsz_count; i++)
|
||||
BITSET_SET(linkage->signed_zero_mask, slot + i * 8);
|
||||
}
|
||||
|
||||
BITSET_SET(linkage->removable_mask, slot);
|
||||
|
||||
/* Indirect indexing. */
|
||||
|
|
@ -4181,6 +4230,35 @@ backward_inter_shader_code_motion(struct linkage_info *linkage,
|
|||
return false;
|
||||
}
|
||||
|
||||
/******************************************************************
|
||||
* SIGNED ZERO LINKING
|
||||
******************************************************************/
|
||||
|
||||
static void
|
||||
link_no_signed_zero(struct linkage_info *linkage,
|
||||
nir_opt_varyings_progress *progress)
|
||||
{
|
||||
for (unsigned slot = 0; slot < NUM_SCALAR_SLOTS; slot++) {
|
||||
struct scalar_slot *scalar_slot = &linkage->slot[slot];
|
||||
|
||||
list_for_each_entry(struct list_node, iter, &scalar_slot->producer.stores, head) {
|
||||
nir_io_semantics sem = nir_intrinsic_io_semantics(iter->instr);
|
||||
|
||||
bool no_signed_zero = true;
|
||||
unsigned nsz_count = nir_src_is_const(*nir_get_io_offset_src(iter->instr)) ? 1 : sem.num_slots;
|
||||
for (unsigned i = 0; i < nsz_count; i++)
|
||||
no_signed_zero &= !BITSET_TEST(linkage->signed_zero_mask, slot + i * 8);
|
||||
|
||||
if (sem.no_signed_zero != no_signed_zero) {
|
||||
*progress |= nir_progress_producer;
|
||||
sem.no_signed_zero = no_signed_zero;
|
||||
nir_intrinsic_set_io_semantics(iter->instr, sem);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/******************************************************************
|
||||
* COMPACTION
|
||||
******************************************************************/
|
||||
|
|
@ -5396,6 +5474,9 @@ nir_opt_varyings(nir_shader *producer, nir_shader *consumer, bool spirv,
|
|||
init_linkage(producer, consumer, spirv, max_uniform_components,
|
||||
max_ubos_per_stage, linkage, &progress);
|
||||
|
||||
|
||||
link_no_signed_zero(linkage, &progress);
|
||||
|
||||
/* This must be done after deduplication and before inter-shader code
|
||||
* motion.
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -82,7 +82,7 @@ TEST_F(nir_opt_varyings_test_bicm_binary_alu, \
|
|||
ASSERT_TRUE(!shader_contains_def(b2, load[0])); \
|
||||
ASSERT_TRUE(!shader_contains_def(b2, load[1])); \
|
||||
} else { \
|
||||
ASSERT_EQ(opt_varyings(), 0); \
|
||||
ASSERT_EQ(opt_varyings() & nir_progress_consumer, 0); \
|
||||
ASSERT_TRUE(!shader_contains_alu_op(b1, nir_op_##alu, bitsize)); \
|
||||
ASSERT_TRUE(shader_contains_alu_op(b2, nir_op_##alu, bitsize)); \
|
||||
ASSERT_TRUE(shader_contains_instr(b1, &store[0]->instr)); \
|
||||
|
|
|
|||
|
|
@ -34,7 +34,7 @@ TEST_F(nir_opt_varyings_test_dead_output, \
|
|||
store_output(b1, VARYING_SLOT_##slot, 0, nir_type_float##bitsize, \
|
||||
nir_imm_floatN_t(b1, 0, bitsize), 0); \
|
||||
\
|
||||
ASSERT_TRUE(opt_varyings() == 0); \
|
||||
opt_varyings(); \
|
||||
ASSERT_TRUE(b1->shader->info.outputs_written == VARYING_BIT_##slot); \
|
||||
ASSERT_TRUE(shader_contains_instr(b1, &intr->instr)); \
|
||||
ASSERT_TRUE(nir_intrinsic_io_semantics(intr).no_varying == \
|
||||
|
|
|
|||
|
|
@ -62,7 +62,7 @@ TEST_F(nir_opt_varyings_test_dedup, \
|
|||
ASSERT_TRUE(!shader_contains_def(b2, load[1][v])); \
|
||||
} \
|
||||
} else { \
|
||||
ASSERT_EQ(opt_varyings(), 0); \
|
||||
ASSERT_EQ(opt_varyings() & nir_progress_consumer, 0); \
|
||||
for (unsigned v = 0; v < (is_per_vertex(b1, (gl_varying_slot)pslot[1], false) ? 3 : 1); v++) { \
|
||||
ASSERT_TRUE(shader_contains_instr(b1, &store[0][v]->instr)); \
|
||||
ASSERT_TRUE(shader_contains_instr(b1, &store[1][v]->instr)); \
|
||||
|
|
|
|||
|
|
@ -59,7 +59,7 @@ TEST_F(nir_opt_varyings_test_prop_const, \
|
|||
SHADER_CONST_OUTPUT(producer_stage, consumer_stage, slot, comp, type, bitsize, value, value) \
|
||||
\
|
||||
if (nir_slot_is_sysval_output((gl_varying_slot)pindex, MESA_SHADER_##consumer_stage)) { \
|
||||
ASSERT_TRUE(opt_varyings() == nir_progress_consumer); \
|
||||
ASSERT_TRUE(opt_varyings() & nir_progress_consumer); \
|
||||
ASSERT_TRUE(b1->shader->info.outputs_written == BITFIELD64_BIT(pindex)); \
|
||||
ASSERT_TRUE(nir_intrinsic_io_semantics(store).no_varying); \
|
||||
} else { \
|
||||
|
|
|
|||
|
|
@ -59,7 +59,7 @@ TEST_F(nir_opt_varyings_test_prop_ubo, \
|
|||
SHADER_UBO_OUTPUT(producer_stage, consumer_stage, slot, comp, type, bitsize, 1, 1) \
|
||||
\
|
||||
if (nir_slot_is_sysval_output((gl_varying_slot)pindex, MESA_SHADER_##consumer_stage)) { \
|
||||
ASSERT_TRUE(opt_varyings() == nir_progress_consumer); \
|
||||
ASSERT_TRUE(opt_varyings() & nir_progress_consumer); \
|
||||
ASSERT_TRUE(b1->shader->info.outputs_written == BITFIELD64_BIT(pindex)); \
|
||||
ASSERT_TRUE(nir_intrinsic_io_semantics(store).no_varying); \
|
||||
} else { \
|
||||
|
|
|
|||
|
|
@ -59,7 +59,7 @@ TEST_F(nir_opt_varyings_test_prop_uniform, \
|
|||
SHADER_UNIFORM_OUTPUT(producer_stage, consumer_stage, slot, comp, type, bitsize, 1, 1) \
|
||||
\
|
||||
if (nir_slot_is_sysval_output((gl_varying_slot)pindex, MESA_SHADER_##consumer_stage)) { \
|
||||
ASSERT_TRUE(opt_varyings() == nir_progress_consumer); \
|
||||
ASSERT_TRUE(opt_varyings() & nir_progress_consumer); \
|
||||
ASSERT_TRUE(b1->shader->info.outputs_written == BITFIELD64_BIT(pindex)); \
|
||||
ASSERT_TRUE(nir_intrinsic_io_semantics(store).no_varying); \
|
||||
} else { \
|
||||
|
|
|
|||
|
|
@ -59,7 +59,7 @@ TEST_F(nir_opt_varyings_test_prop_uniform_expr, \
|
|||
SHADER_UNI_EXPR_OUTPUT(producer_stage, consumer_stage, slot, comp, type, bitsize, 1, 1) \
|
||||
\
|
||||
if (nir_slot_is_sysval_output((gl_varying_slot)pindex, MESA_SHADER_##consumer_stage)) { \
|
||||
ASSERT_TRUE(opt_varyings() == nir_progress_consumer); \
|
||||
ASSERT_TRUE(opt_varyings() & nir_progress_consumer); \
|
||||
ASSERT_TRUE(b1->shader->info.outputs_written == BITFIELD64_BIT(pindex)); \
|
||||
ASSERT_TRUE(nir_intrinsic_io_semantics(store).no_varying); \
|
||||
} else { \
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue