mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 09:08:10 +02:00
nir/opt_varyings: rewrite elimination of duplicated outputs
to strengthen it and fix a rare case of incorrect behavior.
This is the incorrect behavior that's fixed:
if (invocation_id == 0)
patch_output[0] = invocation_id;
else if (invocation_id == 1)
patch_output[1] = invocation_id;
else
patch_output[2] = invocation_id;
The stored SSA defs are the same, but each invocation writes a different
value to different outputs, so they are not duplicated, but the code
incorrectly treated them as duplicated and removed [1] and [2].
The requirement for correctness is that if an output is duplicated,
it must have stores in the same blocks as the output it duplicates.
To fix the incorrect behavior, awareness of blocks is added to the algorithm,
which leads to the following new cases that are optimized:
1. Different blocks store different SSA defs, but equal in each block:
if (..) {
output[0] = a;
output[1] = a; // eliminated
} else {
output[0] = b;
output[1] = b; // eliminated
}
2. Different GS emit sections store different SSA defs, but equal in each
section:
output[0] = a;
output[1] = a; // eliminated
emit_vertex;
output[0] = b;
output[1] = b; // eliminated
emit_vertex;
3. A weird case that could be duplicated but is left alone due to
the counterexample above:
if (..)
output[0] = a;
else
output[1] = a;
This commit is contained in:
parent
eb8dbc2bbf
commit
3458eae4bc
1 changed files with 203 additions and 43 deletions
|
|
@ -149,18 +149,21 @@
|
|||
*
|
||||
* 4. Remove duplicated output components
|
||||
*
|
||||
* * By comparing SSA defs.
|
||||
* * If there are multiple stores to the same output, all such stores
|
||||
* should store the same SSA as all stores of another output for
|
||||
* the output to be considered duplicated. If an output has multiple
|
||||
* vertices, all vertices should store the same SSA.
|
||||
* * The value equality between outputs is determined by comparing
|
||||
* the stored SSA defs.
|
||||
* * 2 outputs are duplicated if both have stores in the same blocks and
|
||||
* storing the same SSA defs in those blocks.
|
||||
* * If an output has multiple vertices, all vertices should store the same
|
||||
* SSA def for the output to be considered duplicated. This constraint
|
||||
* allows the vertex index to be non-constant. (only possible in TCS
|
||||
* and MS)
|
||||
* * Deduplication can only be done between outputs of the same category.
|
||||
* Those are: interpolated, patch, flat, interpolated color, flat color,
|
||||
* and conditionally interpolated color based on the flat
|
||||
* shade state
|
||||
* * Everything is deduplicated except TEXn due to the coord replace state.
|
||||
* * Eliminated output stores get the "no_varying" flag if they are also
|
||||
* xfb stores or write sysval outputs.
|
||||
* * Duplicated output stores that can't be removed because they are also
|
||||
* xfb stores or write sysval outputs get the "no_varying" flag.
|
||||
*
|
||||
* 5. Link signed zero information
|
||||
*
|
||||
|
|
@ -507,6 +510,9 @@
|
|||
#include "nir_builder.h"
|
||||
#include "nir_xfb_info.h"
|
||||
|
||||
#define XXH_INLINE_ALL
|
||||
#include "util/xxhash.h"
|
||||
|
||||
/* nir_opt_varyings works at scalar 16-bit granularity across all varyings.
|
||||
*
|
||||
* Slots (i % 8 == 0,2,4,6) are 32-bit channels or low bits of 16-bit channels.
|
||||
|
|
@ -616,14 +622,22 @@ vec4_slot(unsigned scalar_slot)
|
|||
|
||||
struct list_node {
|
||||
struct list_head head;
|
||||
nir_intrinsic_instr *instr;
|
||||
nir_intrinsic_instr *instr; /* load or store */
|
||||
|
||||
/* The number of emit_vertex instructions that precede this store
|
||||
* in the current block. It's the index of the following emit_vertex,
|
||||
* effectively identifying which emit_vertex section this store belongs
|
||||
* to within a block.
|
||||
*/
|
||||
unsigned gs_emit_index;
|
||||
};
|
||||
|
||||
/* Information about 1 scalar varying slot for both shader stages. */
|
||||
struct scalar_slot {
|
||||
struct {
|
||||
/* Linked list of all store instructions writing into the scalar slot
|
||||
* in the producer.
|
||||
* in the producer. The stores are inserted in the order in which they
|
||||
* occur in the shader.
|
||||
*/
|
||||
struct list_head stores;
|
||||
|
||||
|
|
@ -751,7 +765,6 @@ struct linkage_info {
|
|||
* the same value. If the output has multiple vertices, all vertices store
|
||||
* the same value. This is a useful property for:
|
||||
* - constant and uniform propagation to the next shader
|
||||
* - deduplicating outputs
|
||||
*/
|
||||
BITSET_DECLARE(output_equal_mask, NUM_SCALAR_SLOTS);
|
||||
|
||||
|
|
@ -775,6 +788,11 @@ struct linkage_info {
|
|||
* cares about the sign of zero.
|
||||
*/
|
||||
BITSET_DECLARE(signed_zero_mask, NUM_SCALAR_SLOTS);
|
||||
|
||||
struct {
|
||||
nir_block *last_gs_emit_block;
|
||||
unsigned gs_emit_index;
|
||||
} gather_outputs_state;
|
||||
};
|
||||
|
||||
/******************************************************************
|
||||
|
|
@ -1513,6 +1531,22 @@ gather_outputs(struct nir_builder *builder, nir_intrinsic_instr *intr, void *cb_
|
|||
{
|
||||
struct linkage_info *linkage = (struct linkage_info *)cb_data;
|
||||
|
||||
/* Track the index of emit_vertex in the same block, so that we can identify
|
||||
* which GS output stores belong to which emit_vertex section in the same
|
||||
* block.
|
||||
*/
|
||||
if (intr->intrinsic == nir_intrinsic_emit_vertex ||
|
||||
intr->intrinsic == nir_intrinsic_emit_vertex_with_counter) {
|
||||
if (linkage->gather_outputs_state.last_gs_emit_block ==
|
||||
intr->instr.block) {
|
||||
linkage->gather_outputs_state.gs_emit_index++;
|
||||
} else {
|
||||
linkage->gather_outputs_state.last_gs_emit_block = intr->instr.block;
|
||||
linkage->gather_outputs_state.gs_emit_index = 0;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
if (intr->intrinsic != nir_intrinsic_store_output &&
|
||||
intr->intrinsic != nir_intrinsic_load_output &&
|
||||
intr->intrinsic != nir_intrinsic_store_per_vertex_output &&
|
||||
|
|
@ -1573,6 +1607,7 @@ gather_outputs(struct nir_builder *builder, nir_intrinsic_instr *intr, void *cb_
|
|||
struct list_node *node = linear_alloc_child(linkage->linear_mem_ctx,
|
||||
sizeof(struct list_node));
|
||||
node->instr = intr;
|
||||
node->gs_emit_index = linkage->gather_outputs_state.gs_emit_index;
|
||||
out->num_slots = MAX2(out->num_slots, sem.num_slots);
|
||||
|
||||
list_addtail(&node->head, is_store ? &out->producer.stores : &out->producer.loads);
|
||||
|
|
@ -2746,22 +2781,93 @@ get_input_qualifier(struct linkage_info *linkage, unsigned i)
|
|||
return qual + pixel_location;
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
nir_ht_scalar_hash(const void *key)
|
||||
{
|
||||
nir_scalar s;
|
||||
static_assert(offsetof(nir_scalar, def) == 0, "known layout");
|
||||
static_assert(offsetof(nir_scalar, comp) == sizeof(s.def), "no padding");
|
||||
static_assert(sizeof(s.comp) == sizeof(unsigned), "known layout");
|
||||
/* Duplicated outputs are those outputs that have stores in the same blocks and
|
||||
* store equal values in each such block.
|
||||
*
|
||||
* It's implemented by representing each output store as <block, stored_value,
|
||||
* gs_emit_index, is_back_color> entries. 1 output slot is represented as
|
||||
* a set of such entries. 2 outputs store identical values if their sets are
|
||||
* equal.
|
||||
*
|
||||
* dedup_entry is the entry. Entries are in "struct set". The sets are keys in
|
||||
* the hash table storing all outputs. An output slot is a duplicate of
|
||||
* another if its set of dedup_entries is equal to the set of dedup_entries
|
||||
* of another slot that's already in the hash table.
|
||||
*/
|
||||
typedef struct {
|
||||
/* The block and the value stored in that block. */
|
||||
nir_block *block;
|
||||
nir_scalar value;
|
||||
|
||||
/* Don't include structure padding of nir_scalar. */
|
||||
return _mesa_hash_data(key, offsetof(nir_scalar, comp) + sizeof(unsigned));
|
||||
/* If a block has multiple emits, this is the index of the next emit.
|
||||
* There is one dedup_entry per emit.
|
||||
*/
|
||||
unsigned gs_emit_index;
|
||||
|
||||
/* Treat back colors as different outputs from front colors because both
|
||||
* front and back colors happen to be in the same output slot and set.
|
||||
*/
|
||||
bool is_back_color;
|
||||
} dedup_entry;
|
||||
|
||||
#define DEBUG_PRINT_DEDUP 0
|
||||
|
||||
static void
|
||||
print_dedup_entry(const char *place, struct set *set, dedup_entry *entry)
|
||||
{
|
||||
printf("%s> set=0x%lx, entry=0x%lx, block=0x%lx, def=0x%lx, comp=%u, emit=%u\n",
|
||||
place, (uintptr_t)set, (uintptr_t)entry, (uintptr_t)entry->block,
|
||||
(uintptr_t)entry->value.def, entry->value.comp,
|
||||
entry->gs_emit_index);
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
dedup_entry_hash_accum(const void *_key, uint32_t hash)
|
||||
{
|
||||
return XXH32(_key, sizeof(dedup_entry), hash);
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
dedup_entry_set_hash(const void *key)
|
||||
{
|
||||
return dedup_entry_hash_accum(key, 0);
|
||||
}
|
||||
|
||||
static bool
|
||||
nir_ht_scalar_equal(const void *a, const void *b)
|
||||
dedup_entry_set_equal(const void *_a, const void *_b)
|
||||
{
|
||||
return nir_scalar_equal(*(nir_scalar*)a, *(nir_scalar*)b);
|
||||
dedup_entry *a = (dedup_entry *)_a;
|
||||
dedup_entry *b = (dedup_entry *)_b;
|
||||
|
||||
return a->block == b->block && a->value.def == b->value.def &&
|
||||
a->value.comp == b->value.comp;
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
dedup_ht_key_hash(const void *_key)
|
||||
{
|
||||
struct set *set = (struct set *)_key;
|
||||
uint32_t hash = 0;
|
||||
|
||||
set_foreach(set, entry) {
|
||||
if (DEBUG_PRINT_DEDUP)
|
||||
print_dedup_entry("ht_key_hash", set, (dedup_entry *)entry->key);
|
||||
|
||||
hash = dedup_entry_hash_accum((dedup_entry *)entry->key, hash);
|
||||
}
|
||||
|
||||
if (DEBUG_PRINT_DEDUP)
|
||||
printf("ht_key_hash key=0x%lu hash=%u\n", (uintptr_t)_key, hash);
|
||||
return hash;
|
||||
}
|
||||
|
||||
static bool
|
||||
dedup_ht_key_equal(const void *_a, const void *_b)
|
||||
{
|
||||
struct set *a = (struct set *)_a;
|
||||
struct set *b = (struct set *)_b;
|
||||
|
||||
return _mesa_set_equal(a, b);
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
@ -2769,24 +2875,31 @@ deduplicate_outputs(struct linkage_info *linkage,
|
|||
nir_opt_varyings_progress *progress,
|
||||
bool *consumer_progress)
|
||||
{
|
||||
struct hash_table *tables[NUM_DEDUP_QUALIFIERS] = { NULL };
|
||||
struct hash_table tables[NUM_DEDUP_QUALIFIERS] = { 0 };
|
||||
unsigned i;
|
||||
void *mem_ctx = ralloc_context(NULL);
|
||||
|
||||
/* Find duplicated outputs. If there are multiple stores, they should all
|
||||
* store the same value as all stores of some other output. That's
|
||||
* guaranteed by output_equal_mask.
|
||||
*/
|
||||
BITSET_FOREACH_SET(i, linkage->output_equal_mask, NUM_SCALAR_SLOTS) {
|
||||
/* Find duplicated outputs. */
|
||||
BITSET_FOREACH_SET(i, linkage->removable_mask, NUM_SCALAR_SLOTS) {
|
||||
if (!can_optimize_varying(linkage, vec4_slot(i)).deduplicate)
|
||||
continue;
|
||||
|
||||
/* Skip indirect indexing. */
|
||||
if (BITSET_TEST(linkage->indirect_mask, i))
|
||||
continue;
|
||||
|
||||
struct scalar_slot *slot = &linkage->slot[i];
|
||||
assert(!list_is_empty(&slot->producer.stores));
|
||||
|
||||
enum var_qualifier qualifier;
|
||||
gl_varying_slot var_slot = vec4_slot(i);
|
||||
|
||||
/* Determine which qualifier this slot has. */
|
||||
if ((var_slot >= VARYING_SLOT_PATCH0 &&
|
||||
var_slot <= VARYING_SLOT_PATCH31) ||
|
||||
if (list_is_empty(&slot->consumer.loads)) {
|
||||
/* XFB or TCS outputs not consumed by the next stage */
|
||||
qualifier = QUAL_VAR_FLAT;
|
||||
} else if ((var_slot >= VARYING_SLOT_PATCH0 &&
|
||||
var_slot <= VARYING_SLOT_PATCH31) ||
|
||||
var_slot == VARYING_SLOT_TESS_LEVEL_INNER ||
|
||||
var_slot == VARYING_SLOT_TESS_LEVEL_OUTER)
|
||||
qualifier = QUAL_PATCH;
|
||||
|
|
@ -2798,21 +2911,69 @@ deduplicate_outputs(struct linkage_info *linkage,
|
|||
if (qualifier == QUAL_SKIP)
|
||||
continue;
|
||||
|
||||
struct hash_table **table = &tables[qualifier];
|
||||
if (!*table)
|
||||
*table = _mesa_hash_table_create(NULL, nir_ht_scalar_hash,
|
||||
nir_ht_scalar_equal);
|
||||
|
||||
nir_scalar value = slot->producer.value;
|
||||
|
||||
struct hash_entry *entry = _mesa_hash_table_search(*table, &value);
|
||||
if (!entry) {
|
||||
_mesa_hash_table_insert(*table, &value, (void *)(uintptr_t)i);
|
||||
continue;
|
||||
struct hash_table *table = &tables[qualifier];
|
||||
if (!table->table) {
|
||||
_mesa_hash_table_init(table, mem_ctx, dedup_ht_key_hash,
|
||||
dedup_ht_key_equal);
|
||||
}
|
||||
|
||||
/* Create the hash table key. */
|
||||
struct set *key = _mesa_set_create(mem_ctx, dedup_entry_set_hash,
|
||||
dedup_entry_set_equal);
|
||||
|
||||
/* Only looking at SSA def equality is insufficient.
|
||||
*
|
||||
* TCS proof:
|
||||
*
|
||||
* if (invocation_id == 0)
|
||||
* patch_output[0] = invocation_id;
|
||||
* else
|
||||
* patch_output[1] = invocation_id; // not duplicated
|
||||
*
|
||||
* VS proof:
|
||||
*
|
||||
* if (vertex_id == 0)
|
||||
* output[0] = vertex_id;
|
||||
* else
|
||||
* output[1] = vertex_id; // can't remove because output[0] is
|
||||
* // uninitialized for vertex_id > 0
|
||||
*/
|
||||
|
||||
/* Add all stores to the set of <block, value, gs_emit_index,
|
||||
* is_back_color> entries.
|
||||
*/
|
||||
list_for_each_entry(struct list_node, iter, &slot->producer.stores,
|
||||
head) {
|
||||
unsigned location = nir_intrinsic_io_semantics(iter->instr).location;
|
||||
dedup_entry *entry = rzalloc(mem_ctx, dedup_entry);
|
||||
entry->block = iter->instr->instr.block;
|
||||
entry->value =
|
||||
nir_scalar_resolved(nir_get_io_data_src(iter->instr)->ssa, 0);
|
||||
entry->gs_emit_index = iter->gs_emit_index;
|
||||
entry->is_back_color = location == VARYING_SLOT_BFC0 ||
|
||||
location == VARYING_SLOT_BFC1;
|
||||
|
||||
if (DEBUG_PRINT_DEDUP)
|
||||
print_dedup_entry("add/block", key, entry);
|
||||
|
||||
_mesa_set_add(key, entry);
|
||||
}
|
||||
|
||||
/* Search in the table for an identical output. */
|
||||
struct hash_entry *entry = _mesa_hash_table_search(table, key);
|
||||
if (!entry) {
|
||||
if (DEBUG_PRINT_DEDUP)
|
||||
printf("ht miss slot=%u\n", i);
|
||||
_mesa_hash_table_insert(table, key, slot);
|
||||
if (DEBUG_PRINT_DEDUP)
|
||||
printf("ht inserted slot=%u\n", i);
|
||||
continue;
|
||||
}
|
||||
if (DEBUG_PRINT_DEDUP)
|
||||
printf("ht hit slot=%u\n", i);
|
||||
|
||||
/* We've found a duplicate. Redirect loads and remove stores. */
|
||||
struct scalar_slot *found_slot = &linkage->slot[(uintptr_t)entry->data];
|
||||
struct scalar_slot *found_slot = (struct scalar_slot *)entry->data;
|
||||
nir_intrinsic_instr *store =
|
||||
list_first_entry(&found_slot->producer.stores,
|
||||
struct list_node, head)
|
||||
|
|
@ -2863,8 +3024,7 @@ deduplicate_outputs(struct linkage_info *linkage,
|
|||
remove_all_stores_and_clear_slot(linkage, i, progress);
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(tables); i++)
|
||||
_mesa_hash_table_destroy(tables[i], NULL);
|
||||
ralloc_free(mem_ctx);
|
||||
}
|
||||
|
||||
/******************************************************************
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue