diff --git a/src/compiler/nir/nir_opt_varyings.c b/src/compiler/nir/nir_opt_varyings.c
index db6c3400c0e..d21c502cc1a 100644
--- a/src/compiler/nir/nir_opt_varyings.c
+++ b/src/compiler/nir/nir_opt_varyings.c
@@ -149,18 +149,21 @@
  *
  * 4. Remove duplicated output components
  *
- *    * By comparing SSA defs.
- *    * If there are multiple stores to the same output, all such stores
- *      should store the same SSA as all stores of another output for
- *      the output to be considered duplicated. If an output has multiple
- *      vertices, all vertices should store the same SSA.
+ *    * The value equality between outputs is determined by comparing
+ *      the stored SSA defs.
+ *    * 2 outputs are duplicated if both have stores in the same blocks and
+ *      store the same SSA defs in those blocks.
+ *    * If an output has multiple vertices, all vertices should store the same
+ *      SSA def for the output to be considered duplicated. This constraint
+ *      allows the vertex index to be non-constant (only possible in TCS
+ *      and MS).
  *    * Deduplication can only be done between outputs of the same category.
  *      Those are: interpolated, patch, flat, interpolated color, flat color,
  *                 and conditionally interpolated color based on the flat
  *                 shade state
  *    * Everything is deduplicated except TEXn due to the coord replace state.
- *    * Eliminated output stores get the "no_varying" flag if they are also
- *      xfb stores or write sysval outputs.
+ *    * Duplicated output stores that can't be removed because they are also
+ *      xfb stores or write sysval outputs get the "no_varying" flag.
  *
  * 5. Link signed zero information
  *
@@ -507,6 +510,9 @@
 #include "nir_builder.h"
 #include "nir_xfb_info.h"
 
+#define XXH_INLINE_ALL
+#include "util/xxhash.h"
+
 /* nir_opt_varyings works at scalar 16-bit granularity across all varyings.
  *
  * Slots (i % 8 == 0,2,4,6) are 32-bit channels or low bits of 16-bit channels.
@@ -616,14 +622,22 @@ vec4_slot(unsigned scalar_slot)
 
 struct list_node {
    struct list_head head;
-   nir_intrinsic_instr *instr;
+   nir_intrinsic_instr *instr; /* load or store */
+
+   /* The number of emit_vertex instructions that precede this store
+    * in the current block. It's the index of the following emit_vertex,
+    * effectively identifying which emit_vertex section this store belongs
+    * to within a block.
+    */
+   unsigned gs_emit_index;
 };
 
 /* Information about 1 scalar varying slot for both shader stages. */
 struct scalar_slot {
    struct {
       /* Linked list of all store instructions writing into the scalar slot
-       * in the producer.
+       * in the producer. The stores are inserted in the order in which they
+       * occur in the shader.
        */
       struct list_head stores;
 
@@ -751,7 +765,6 @@ struct linkage_info {
     * the same value. If the output has multiple vertices, all vertices store
     * the same value. This is a useful property for:
     * - constant and uniform propagation to the next shader
-    * - deduplicating outputs
     */
    BITSET_DECLARE(output_equal_mask, NUM_SCALAR_SLOTS);
 
@@ -775,6 +788,11 @@ struct linkage_info {
     * cares about the sign of zero.
     */
    BITSET_DECLARE(signed_zero_mask, NUM_SCALAR_SLOTS);
+
+   struct {
+      nir_block *last_gs_emit_block;
+      unsigned gs_emit_index;
+   } gather_outputs_state;
 };
 
 /******************************************************************
@@ -1513,6 +1531,22 @@ gather_outputs(struct nir_builder *builder, nir_intrinsic_instr *intr, void *cb_
 {
    struct linkage_info *linkage = (struct linkage_info *)cb_data;
 
+   /* Count the emit_vertex instructions seen so far in the current block, so
+    * that we can identify which GS output stores belong to which emit_vertex
+    * section in the same block.
+    */
+   if (intr->intrinsic == nir_intrinsic_emit_vertex ||
+       intr->intrinsic == nir_intrinsic_emit_vertex_with_counter) {
+      if (linkage->gather_outputs_state.last_gs_emit_block !=
+          intr->instr.block) {
+         linkage->gather_outputs_state.last_gs_emit_block = intr->instr.block;
+         linkage->gather_outputs_state.gs_emit_index = 0;
+      }
+      /* Stores after this emit are preceded by one more emit_vertex. */
+      linkage->gather_outputs_state.gs_emit_index++;
+      return false;
+   }
+
    if (intr->intrinsic != nir_intrinsic_store_output &&
        intr->intrinsic != nir_intrinsic_load_output &&
        intr->intrinsic != nir_intrinsic_store_per_vertex_output &&
@@ -1573,6 +1607,12 @@ gather_outputs(struct nir_builder *builder, nir_intrinsic_instr *intr, void *cb_
       struct list_node *node =
          linear_alloc_child(linkage->linear_mem_ctx, sizeof(struct list_node));
       node->instr = intr;
+      /* The emit counter is only valid in the block of the last seen emit.
+       * An instruction in any other block has no emit preceding it there.
+       */
+      node->gs_emit_index =
+         linkage->gather_outputs_state.last_gs_emit_block == intr->instr.block ?
+            linkage->gather_outputs_state.gs_emit_index : 0;
       out->num_slots = MAX2(out->num_slots, sem.num_slots);
       list_addtail(&node->head, is_store ? &out->producer.stores
                                          : &out->producer.loads);
@@ -2746,22 +2786,109 @@ get_input_qualifier(struct linkage_info *linkage, unsigned i)
    return qual + pixel_location;
 }
 
-static uint32_t
-nir_ht_scalar_hash(const void *key)
-{
-   nir_scalar s;
-   static_assert(offsetof(nir_scalar, def) == 0, "known layout");
-   static_assert(offsetof(nir_scalar, comp) == sizeof(s.def), "no padding");
-   static_assert(sizeof(s.comp) == sizeof(unsigned), "known layout");
+/* Duplicated outputs are those outputs that have stores in the same blocks and
+ * store equal values in each such block.
+ *
+ * It's implemented by representing each output store as an entry. 1 output
+ * slot is represented as a set of such entries. 2 outputs store identical
+ * values if their sets are equal.
+ *
+ * dedup_entry is the entry. Entries are in "struct set". The sets are keys in
+ * the hash table storing all outputs. An output slot is a duplicate of
+ * another if its set of dedup_entries is equal to the set of dedup_entries
+ * of another slot that's already in the hash table.
+ */
+typedef struct {
+   /* The block and the value stored in that block. */
+   nir_block *block;
+   nir_scalar value;
 
-   /* Don't include structure padding of nir_scalar. */
-   return _mesa_hash_data(key, offsetof(nir_scalar, comp) + sizeof(unsigned));
+   /* If a block has multiple emits, this is the index of the next emit.
+    * There is one dedup_entry per emit.
+    */
+   unsigned gs_emit_index;
+
+   /* Treat back colors as different outputs from front colors because both
+    * front and back colors happen to be in the same output slot and set.
+    */
+   bool is_back_color;
+} dedup_entry;
+
+#define DEBUG_PRINT_DEDUP 0
+
+static void
+print_dedup_entry(const char *place, struct set *set, dedup_entry *entry)
+{
+   printf("%s> set=%p, entry=%p, block=%p, def=%p, comp=%u, emit=%u\n",
+          place, (void *)set, (void *)entry, (void *)entry->block,
+          (void *)entry->value.def, entry->value.comp,
+          entry->gs_emit_index);
+}
+
+/* Hash the members one by one instead of the whole structure, so that
+ * padding bytes never affect the hash. Padding can hold unspecified values
+ * after a structure assignment (e.g. of nir_scalar).
+ */
+static uint32_t
+dedup_entry_hash_accum(const void *_key, uint32_t hash)
+{
+   const dedup_entry *entry = (const dedup_entry *)_key;
+
+   hash = XXH32(&entry->block, sizeof(entry->block), hash);
+   hash = XXH32(&entry->value.def, sizeof(entry->value.def), hash);
+   hash = XXH32(&entry->value.comp, sizeof(entry->value.comp), hash);
+   hash = XXH32(&entry->gs_emit_index, sizeof(entry->gs_emit_index), hash);
+   return XXH32(&entry->is_back_color, sizeof(entry->is_back_color), hash);
+}
+
+static uint32_t
+dedup_entry_set_hash(const void *key)
+{
+   return dedup_entry_hash_accum(key, 0);
 }
 
 static bool
-nir_ht_scalar_equal(const void *a, const void *b)
+dedup_entry_set_equal(const void *_a, const void *_b)
 {
-   return nir_scalar_equal(*(nir_scalar*)a, *(nir_scalar*)b);
+   const dedup_entry *a = (const dedup_entry *)_a;
+   const dedup_entry *b = (const dedup_entry *)_b;
+
+   /* Compare everything that's hashed by dedup_entry_hash_accum. */
+   return a->block == b->block && a->value.def == b->value.def &&
+          a->value.comp == b->value.comp &&
+          a->gs_emit_index == b->gs_emit_index &&
+          a->is_back_color == b->is_back_color;
+}
+
+/* The hash of a set must not depend on the iteration order of its entries
+ * because equal sets can store them in a different order, so combine the
+ * entry hashes with a commutative operation.
+ */
+static uint32_t
+dedup_ht_key_hash(const void *_key)
+{
+   struct set *set = (struct set *)_key;
+   uint32_t hash = 0;
+
+   set_foreach(set, entry) {
+      if (DEBUG_PRINT_DEDUP)
+         print_dedup_entry("ht_key_hash", set, (dedup_entry *)entry->key);
+
+      hash += dedup_entry_set_hash(entry->key);
+   }
+
+   if (DEBUG_PRINT_DEDUP)
+      printf("ht_key_hash key=%p hash=%u\n", _key, hash);
+   return hash;
+}
+
+static bool
+dedup_ht_key_equal(const void *_a, const void *_b)
+{
+   struct set *a = (struct set *)_a;
+   struct set *b = (struct set *)_b;
+
+   return _mesa_set_equal(a, b);
 }
 
 static void
@@ -2769,24 +2896,31 @@ deduplicate_outputs(struct linkage_info *linkage,
                     nir_opt_varyings_progress *progress,
                     bool *consumer_progress)
 {
-   struct hash_table *tables[NUM_DEDUP_QUALIFIERS] = { NULL };
+   struct hash_table tables[NUM_DEDUP_QUALIFIERS] = { 0 };
    unsigned i;
+   void *mem_ctx = ralloc_context(NULL);
 
-   /* Find duplicated outputs. If there are multiple stores, they should all
-    * store the same value as all stores of some other output. That's
-    * guaranteed by output_equal_mask.
-    */
-   BITSET_FOREACH_SET(i, linkage->output_equal_mask, NUM_SCALAR_SLOTS) {
+   /* Find duplicated outputs. */
+   BITSET_FOREACH_SET(i, linkage->removable_mask, NUM_SCALAR_SLOTS) {
       if (!can_optimize_varying(linkage, vec4_slot(i)).deduplicate)
          continue;
 
+      /* Skip indirect indexing. */
+      if (BITSET_TEST(linkage->indirect_mask, i))
+         continue;
+
       struct scalar_slot *slot = &linkage->slot[i];
+      assert(!list_is_empty(&slot->producer.stores));
+
       enum var_qualifier qualifier;
       gl_varying_slot var_slot = vec4_slot(i);
 
       /* Determine which qualifier this slot has. */
-      if ((var_slot >= VARYING_SLOT_PATCH0 &&
-           var_slot <= VARYING_SLOT_PATCH31) ||
+      if (list_is_empty(&slot->consumer.loads)) {
+         /* XFB or TCS outputs not consumed by the next stage */
+         qualifier = QUAL_VAR_FLAT;
+      } else if ((var_slot >= VARYING_SLOT_PATCH0 &&
+                  var_slot <= VARYING_SLOT_PATCH31) ||
           var_slot == VARYING_SLOT_TESS_LEVEL_INNER ||
           var_slot == VARYING_SLOT_TESS_LEVEL_OUTER)
          qualifier = QUAL_PATCH;
@@ -2798,21 +2932,73 @@ deduplicate_outputs(struct linkage_info *linkage,
       if (qualifier == QUAL_SKIP)
          continue;
 
-      struct hash_table **table = &tables[qualifier];
-      if (!*table)
-         *table = _mesa_hash_table_create(NULL, nir_ht_scalar_hash,
-                                          nir_ht_scalar_equal);
-
-      nir_scalar value = slot->producer.value;
-
-      struct hash_entry *entry = _mesa_hash_table_search(*table, &value);
-      if (!entry) {
-         _mesa_hash_table_insert(*table, &value, (void *)(uintptr_t)i);
-         continue;
+      struct hash_table *table = &tables[qualifier];
+      if (!table->table) {
+         _mesa_hash_table_init(table, mem_ctx, dedup_ht_key_hash,
+                               dedup_ht_key_equal);
       }
 
+      /* Create the hash table key. */
+      struct set *key = _mesa_set_create(mem_ctx, dedup_entry_set_hash,
+                                         dedup_entry_set_equal);
+
+      /* Only looking at SSA def equality is insufficient.
+       *
+       * TCS proof:
+       *
+       *    if (invocation_id == 0)
+       *       patch_output[0] = invocation_id;
+       *    else
+       *       patch_output[1] = invocation_id; // not duplicated
+       *
+       * VS proof:
+       *
+       *    if (vertex_id == 0)
+       *       output[0] = vertex_id;
+       *    else
+       *       output[1] = vertex_id; // can't remove because output[0] is
+       *                              // uninitialized for vertex_id > 0
+       */
+
+      /* Add all stores to the set of entries.
+       *
+       * NOTE(review): the set records neither the order of stores within
+       * a block nor the vertex index of per-vertex stores. Confirm that
+       * outputs storing the same values in a different order (or to different
+       * vertices) in the same block can't reach this point.
+       */
+      list_for_each_entry(struct list_node, iter, &slot->producer.stores,
+                          head) {
+         unsigned location = nir_intrinsic_io_semantics(iter->instr).location;
+         dedup_entry *entry = rzalloc(mem_ctx, dedup_entry);
+         entry->block = iter->instr->instr.block;
+         entry->value =
+            nir_scalar_resolved(nir_get_io_data_src(iter->instr)->ssa, 0);
+         entry->gs_emit_index = iter->gs_emit_index;
+         entry->is_back_color = location == VARYING_SLOT_BFC0 ||
+                                location == VARYING_SLOT_BFC1;
+
+         if (DEBUG_PRINT_DEDUP)
+            print_dedup_entry("add/block", key, entry);
+
+         _mesa_set_add(key, entry);
+      }
+
+      /* Search in the table for an identical output. */
+      struct hash_entry *entry = _mesa_hash_table_search(table, key);
+      if (!entry) {
+         if (DEBUG_PRINT_DEDUP)
+            printf("ht miss slot=%u\n", i);
+         _mesa_hash_table_insert(table, key, slot);
+         if (DEBUG_PRINT_DEDUP)
+            printf("ht inserted slot=%u\n", i);
+         continue;
+      }
+      if (DEBUG_PRINT_DEDUP)
+         printf("ht hit slot=%u\n", i);
+
       /* We've found a duplicate. Redirect loads and remove stores. */
-      struct scalar_slot *found_slot = &linkage->slot[(uintptr_t)entry->data];
+      struct scalar_slot *found_slot = (struct scalar_slot *)entry->data;
       nir_intrinsic_instr *store =
          list_first_entry(&found_slot->producer.stores,
                           struct list_node, head)
@@ -2863,8 +3049,7 @@ deduplicate_outputs(struct linkage_info *linkage,
       remove_all_stores_and_clear_slot(linkage, i, progress);
    }
 
-   for (unsigned i = 0; i < ARRAY_SIZE(tables); i++)
-      _mesa_hash_table_destroy(tables[i], NULL);
+   ralloc_free(mem_ctx);
 }
 
 /******************************************************************