diff --git a/src/compiler/nir/nir_lower_io.c b/src/compiler/nir/nir_lower_io.c index e626f219285..2474e49f7ea 100644 --- a/src/compiler/nir/nir_lower_io.c +++ b/src/compiler/nir/nir_lower_io.c @@ -1176,6 +1176,7 @@ nir_get_io_data_src_number(const nir_intrinsic_instr *intr) case nir_intrinsic_store_pixel_local: case nir_intrinsic_store_per_vertex_output: case nir_intrinsic_store_per_primitive_output: + case nir_intrinsic_store_per_view_output: case nir_intrinsic_store_ssbo: case nir_intrinsic_store_ssbo_block_intel: case nir_intrinsic_store_ssbo_intel: diff --git a/src/compiler/nir/nir_opt_varyings.c b/src/compiler/nir/nir_opt_varyings.c index f52efccf87c..a3d0d984e4e 100644 --- a/src/compiler/nir/nir_opt_varyings.c +++ b/src/compiler/nir/nir_opt_varyings.c @@ -149,18 +149,31 @@ * * 4. Remove duplicated output components * - * * By comparing SSA defs. - * * If there are multiple stores to the same output, all such stores - * should store the same SSA as all stores of another output for - * the output to be considered duplicated. If an output has multiple - * vertices, all vertices should store the same SSA. + * * 2 output components are duplicated if: + * * They store the same values (SSA defs). + * * They must have stores in the same blocks, and they can store + * different values in each such block, but the values must be equal + * within each block between the 2 output components. + * * Blocks are divided into GS emit sections for GS if a block contains + * more than one emit_vertex, and the previous rule is applied to GS + * emit sections as if they were blocks. + * * If the outputs are arrayed (TCS and MS), the same values must be + * stored in the same vertex/primitive elements (which are identified + * by the index SSA def), but the values can be different between + * those elements. + * * Front colors and back colors are deduplicated in sync. For example, + * if the front color is (a,b,c,a) and the back color is (d,e,f,d), then + * the last component is removed and sourced from the first component. + * This works the same as the previous rules for blocks, GS emit + * sections, and arrayed outputs, but here the key is front/back instead + * of block 0/1, GS emit section 0/1, or index 0/1. * * Deduplication can only be done between outputs of the same category. * Those are: interpolated, patch, flat, interpolated color, flat color, - * and conditionally interpolated color based on the flat - * shade state + * and conditionally interpolated color based on the dynamic + * shade model state * * Everything is deduplicated except TEXn due to the coord replace state. - * * Eliminated output stores get the "no_varying" flag if they are also - * xfb stores or write sysval outputs. + * * Duplicated output stores that can't be removed because they are also + * xfb stores or write sysval outputs get the "no_varying" flag. * * 5. Link signed zero information * @@ -507,6 +520,9 @@ #include "nir_builder.h" #include "nir_xfb_info.h" +#define XXH_INLINE_ALL +#include "util/xxhash.h" + /* nir_opt_varyings works at scalar 16-bit granularity across all varyings. * * Slots (i % 8 == 0,2,4,6) are 32-bit channels or low bits of 16-bit channels. @@ -616,14 +632,22 @@ vec4_slot(unsigned scalar_slot) struct list_node { struct list_head head; - nir_intrinsic_instr *instr; + nir_intrinsic_instr *instr; /* load or store */ + + /* The number of emit_vertex instructions that precede this store + * in the current block. It's the index of the following emit_vertex, + * effectively identifying which emit_vertex section this store belongs + * to within a block. + */ + unsigned gs_emit_index; }; /* Information about 1 scalar varying slot for both shader stages. */ struct scalar_slot { struct { /* Linked list of all store instructions writing into the scalar slot - * in the producer. + * in the producer. The stores are inserted in the order in which they + * occur in the shader. */ struct list_head stores; @@ -751,7 +775,6 @@ struct linkage_info { * the same value. If the output has multiple vertices, all vertices store * the same value. This is a useful property for: * - constant and uniform propagation to the next shader - * - deduplicating outputs */ BITSET_DECLARE(output_equal_mask, NUM_SCALAR_SLOTS); @@ -775,6 +798,11 @@ struct linkage_info { * cares about the sign of zero. */ BITSET_DECLARE(signed_zero_mask, NUM_SCALAR_SLOTS); + + struct { + nir_block *last_gs_emit_block; + unsigned gs_emit_index; + } gather_outputs_state; }; /****************************************************************** @@ -1513,6 +1541,22 @@ gather_outputs(struct nir_builder *builder, nir_intrinsic_instr *intr, void *cb_ { struct linkage_info *linkage = (struct linkage_info *)cb_data; + /* Track the index of emit_vertex in the same block, so that we can identify + * which GS output stores belong to which emit_vertex section in the same + * block. + */ + if (intr->intrinsic == nir_intrinsic_emit_vertex || + intr->intrinsic == nir_intrinsic_emit_vertex_with_counter) { + if (linkage->gather_outputs_state.last_gs_emit_block == + intr->instr.block) { + linkage->gather_outputs_state.gs_emit_index++; + } else { + linkage->gather_outputs_state.last_gs_emit_block = intr->instr.block; + linkage->gather_outputs_state.gs_emit_index = 0; + } + return false; + } + if (intr->intrinsic != nir_intrinsic_store_output && intr->intrinsic != nir_intrinsic_load_output && intr->intrinsic != nir_intrinsic_store_per_vertex_output && @@ -1573,6 +1617,7 @@ gather_outputs(struct nir_builder *builder, nir_intrinsic_instr *intr, void *cb_ struct list_node *node = linear_alloc_child(linkage->linear_mem_ctx, sizeof(struct list_node)); node->instr = intr; + node->gs_emit_index = linkage->gather_outputs_state.gs_emit_index; out->num_slots = MAX2(out->num_slots, sem.num_slots); list_addtail(&node->head, is_store ? &out->producer.stores : &out->producer.loads); @@ -2742,22 +2787,93 @@ get_input_qualifier(struct linkage_info *linkage, unsigned i) return qual + pixel_location; } -static uint32_t -nir_ht_scalar_hash(const void *key) -{ - nir_scalar s; - static_assert(offsetof(nir_scalar, def) == 0, "known layout"); - static_assert(offsetof(nir_scalar, comp) == sizeof(s.def), "no padding"); - static_assert(sizeof(s.comp) == sizeof(unsigned), "known layout"); +/* Duplicated outputs are those outputs that have stores in the same blocks and + * store equal values in each such block. + * + * It's implemented by representing each output store as entries. 1 output slot is + * represented as a set of such entries. 2 outputs store identical values if + * their sets are equal. + * + * dedup_entry is the entry. Entries are in "struct set". The sets are keys in + * the hash table storing all outputs. An output slot is a duplicate of + * another if its set of dedup_entries is equal to the set of dedup_entries + * of another slot that's already in the hash table. + */ +typedef struct { + /* The block and the value stored in that block. */ + nir_block *block; + nir_scalar value; - /* Don't include structure padding of nir_scalar. */ - return _mesa_hash_data(key, offsetof(nir_scalar, comp) + sizeof(unsigned)); + /* The output vertex or primitive index for TCS and MS. + * If this is non-NULL, it means the output is arrayed. + */ + nir_scalar index; + + /* If a block has multiple emits, this is the index of the next emit. + * There is one dedup_entry per emit. + */ + unsigned gs_emit_index; + + /* Treat back colors as different outputs from front colors because both + * front and back colors happen to be in the same output slot and set. + */ + bool is_back_color; +} dedup_entry; + +#define DEBUG_PRINT_DEDUP 0 + +static void +print_dedup_entry(const char *place, struct set *set, dedup_entry *entry) +{ + printf("%s> set=0x%" SCNxPTR ", entry=0x%" SCNxPTR ", block=0x%" SCNxPTR ", " + "value=0x%" SCNxPTR ":%u, index=0x%" SCNxPTR ":%u, emit=%u\n", + place, (uintptr_t)set, (uintptr_t)entry, (uintptr_t)entry->block, + (uintptr_t)entry->value.def, entry->value.comp, + (uintptr_t)entry->index.def, entry->index.comp, + entry->gs_emit_index); +} + +static uint32_t +dedup_entry_hash_accum(const void *key, uint32_t hash) +{ + return XXH32(key, sizeof(dedup_entry), hash); +} + +static uint32_t +dedup_entry_set_hash(const void *key) +{ + return dedup_entry_hash_accum(key, 0); } static bool -nir_ht_scalar_equal(const void *a, const void *b) +dedup_entry_set_equal(const void *a, const void *b) { - return nir_scalar_equal(*(nir_scalar*)a, *(nir_scalar*)b); + return !memcmp(a, b, sizeof(dedup_entry)); +} + +static uint32_t +dedup_ht_key_hash(const void *_key) +{ + struct set *set = (struct set *)_key; + uint32_t hash = 0; + + set_foreach(set, entry) { + if (DEBUG_PRINT_DEDUP) + print_dedup_entry("ht_key_hash", set, (dedup_entry *)entry->key); + + hash = dedup_entry_hash_accum((dedup_entry *)entry->key, hash); + } + + if (DEBUG_PRINT_DEDUP) + printf("ht_key_hash key=0x%" SCNuPTR " hash=%u\n", (uintptr_t)_key, hash); + return hash; +} + +static bool +dedup_ht_key_equal(const void *a, const void *b) +{ + return _mesa_set_equal((struct set *)a, (struct set *)b); } static void @@ -2765,24 +2881,41 @@ deduplicate_outputs(struct linkage_info *linkage, nir_opt_varyings_progress *progress, bool *consumer_progress) { - struct hash_table *tables[NUM_DEDUP_QUALIFIERS] = { NULL }; + struct hash_table tables[NUM_DEDUP_QUALIFIERS] = { 0 }; unsigned i; + void *mem_ctx = ralloc_context(NULL); - /* Find duplicated outputs. If there are multiple stores, they should all - * store the same value as all stores of some other output. That's - * guaranteed by output_equal_mask. - */ - BITSET_FOREACH_SET(i, linkage->output_equal_mask, NUM_SCALAR_SLOTS) { + /* Find duplicated outputs. */ + BITSET_FOREACH_SET(i, linkage->removable_mask, NUM_SCALAR_SLOTS) { if (!can_optimize_varying(linkage, vec4_slot(i)).deduplicate) continue; + /* Skip indirect indexing. */ + if (BITSET_TEST(linkage->indirect_mask, i)) + continue; + struct scalar_slot *slot = &linkage->slot[i]; + assert(!list_is_empty(&slot->producer.stores)); + + /* Skip non-zero GS streams because they should be just XFB outputs. */ + if (linkage->producer_stage == MESA_SHADER_GEOMETRY) { + nir_intrinsic_instr *store = + list_first_entry(&slot->producer.stores, struct list_node, + head)->instr; + + if (nir_intrinsic_io_semantics(store).gs_streams) + continue; + } + enum var_qualifier qualifier; gl_varying_slot var_slot = vec4_slot(i); /* Determine which qualifier this slot has. */ - if ((var_slot >= VARYING_SLOT_PATCH0 && - var_slot <= VARYING_SLOT_PATCH31) || + if (list_is_empty(&slot->consumer.loads)) { + /* XFB or TCS outputs not consumed by the next stage */ + qualifier = QUAL_VAR_FLAT; + } else if ((var_slot >= VARYING_SLOT_PATCH0 && + var_slot <= VARYING_SLOT_PATCH31) || var_slot == VARYING_SLOT_TESS_LEVEL_INNER || var_slot == VARYING_SLOT_TESS_LEVEL_OUTER) qualifier = QUAL_PATCH; @@ -2794,21 +2927,74 @@ deduplicate_outputs(struct linkage_info *linkage, if (qualifier == QUAL_SKIP) continue; - struct hash_table **table = &tables[qualifier]; - if (!*table) - *table = _mesa_hash_table_create(NULL, nir_ht_scalar_hash, - nir_ht_scalar_equal); - - nir_scalar value = slot->producer.value; - - struct hash_entry *entry = _mesa_hash_table_search(*table, &value); - if (!entry) { - _mesa_hash_table_insert(*table, &value, (void *)(uintptr_t)i); - continue; + struct hash_table *table = &tables[qualifier]; + if (!table->table) { + _mesa_hash_table_init(table, mem_ctx, dedup_ht_key_hash, + dedup_ht_key_equal); } + /* Create the hash table key. */ + struct set *key = _mesa_set_create(mem_ctx, dedup_entry_set_hash, + dedup_entry_set_equal); + + /* Only looking at SSA def equality is insufficient. + * + * TCS proof: + * + * if (invocation_id == 0) + * patch_output[0] = invocation_id; + * else + * patch_output[1] = invocation_id; // not duplicated + * + * VS proof: + * + * if (vertex_id == 0) + * output[0] = vertex_id; + * else + * output[1] = vertex_id; // can't remove because output[0] is + * // uninitialized for vertex_id > 0 + */ + + /* Add all stores to the set of entries. + */ + list_for_each_entry(struct list_node, iter, &slot->producer.stores, + head) { + unsigned location = nir_intrinsic_io_semantics(iter->instr).location; + dedup_entry *entry = rzalloc(mem_ctx, dedup_entry); + entry->block = iter->instr->instr.block; + entry->value = + nir_scalar_resolved(nir_get_io_data_src(iter->instr)->ssa, 0); + entry->gs_emit_index = iter->gs_emit_index; + entry->is_back_color = location == VARYING_SLOT_BFC0 || + location == VARYING_SLOT_BFC1; + /* TODO: per-view outputs might need something here */ + + nir_src *index = nir_get_io_arrayed_index_src(iter->instr); + if (index) + entry->index = nir_scalar_resolved(index->ssa, 0); + + if (DEBUG_PRINT_DEDUP) + print_dedup_entry("add/block", key, entry); + + _mesa_set_add(key, entry); + } + + /* Search in the table for an identical output. */ + struct hash_entry *entry = _mesa_hash_table_search(table, key); + if (!entry) { + if (DEBUG_PRINT_DEDUP) + printf("ht miss slot=%u\n", i); + _mesa_hash_table_insert(table, key, slot); + if (DEBUG_PRINT_DEDUP) + printf("ht inserted slot=%u\n", i); + continue; + } + if (DEBUG_PRINT_DEDUP) + printf("ht hit slot=%u\n", i); + /* We've found a duplicate. Redirect loads and remove stores. */ - struct scalar_slot *found_slot = &linkage->slot[(uintptr_t)entry->data]; + struct scalar_slot *found_slot = (struct scalar_slot *)entry->data; nir_intrinsic_instr *store = list_first_entry(&found_slot->producer.stores, struct list_node, head) @@ -2818,27 +3004,42 @@ deduplicate_outputs(struct linkage_info *linkage, /* Redirect loads. */ for (unsigned list_index = 0; list_index < 2; list_index++) { - struct list_head *src_loads = list_index ? &slot->producer.loads : &slot->consumer.loads; - struct list_head *dst_loads = list_index ? &found_slot->producer.loads : &found_slot->consumer.loads; - bool has_progress = !list_is_empty(src_loads); + struct list_head *dupl_slot_loads = + list_index ? &slot->producer.loads : &slot->consumer.loads; + struct list_head *slot_loads = + list_index ? &found_slot->producer.loads : + &found_slot->consumer.loads; + bool has_progress = !list_is_empty(dupl_slot_loads); - list_for_each_entry(struct list_node, iter, src_loads, head) { + list_for_each_entry(struct list_node, iter, dupl_slot_loads, head) { nir_intrinsic_instr *loadi = iter->instr; + nir_io_semantics load_sem = nir_intrinsic_io_semantics(loadi); - nir_intrinsic_set_io_semantics(loadi, sem); + /* Only redirect the location. */ + load_sem.location = sem.location; + nir_intrinsic_set_io_semantics(loadi, load_sem); nir_intrinsic_set_component(loadi, component); /* We also need to set the base to match the duplicate load, so * that CSE can eliminate it. */ if (list_index == 0) { - /* Outputs that aren't loaded by the consumer should be already deleted. */ - assert(!list_is_empty(dst_loads)); - struct list_node *first = - list_first_entry(dst_loads, struct list_node, head); - nir_intrinsic_set_base(loadi, nir_intrinsic_base(first->instr)); + if (!list_is_empty(slot_loads)) { + struct list_node *first = + list_first_entry(slot_loads, struct list_node, head); + nir_intrinsic_set_base(loadi, nir_intrinsic_base(first->instr)); + } else { + /* Keep the same base because the found slot has no consumer + * loads. Drivers should call nir_recompute_io_bases after + * this. If they don't and expect bases to be sensible, + * the behavior can be undefined. This pass arguably + * shouldn't be trying to preserve bases anyway. + */ + } } else { - /* The duplicate output may not have any loads, use the base of the found store. */ + /* The duplicated output might not have any loads, use the base + * of the found store. + */ nir_intrinsic_set_base(loadi, nir_intrinsic_base(store)); } } @@ -2847,8 +3048,8 @@ deduplicate_outputs(struct linkage_info *linkage, /* Move the redirected loads to the found slot, so that compaction * can find them. */ - list_splicetail(src_loads, dst_loads); - list_inithead(src_loads); + list_splicetail(dupl_slot_loads, slot_loads); + list_inithead(dupl_slot_loads); *progress |= list_index ? nir_progress_producer : nir_progress_consumer; *consumer_progress |= list_index == 0; /* 0 means consumer loads */ @@ -2859,8 +3060,7 @@ deduplicate_outputs(struct linkage_info *linkage, remove_all_stores_and_clear_slot(linkage, i, progress); } - for (unsigned i = 0; i < ARRAY_SIZE(tables); i++) - _mesa_hash_table_destroy(tables[i], NULL); + ralloc_free(mem_ctx); } /****************************************************************** diff --git a/src/gallium/drivers/virgl/ci/virgl-gl-fails.txt b/src/gallium/drivers/virgl/ci/virgl-gl-fails.txt index f4f74de2f12..27e2d8feb7a 100644 --- a/src/gallium/drivers/virgl/ci/virgl-gl-fails.txt +++ b/src/gallium/drivers/virgl/ci/virgl-gl-fails.txt @@ -202,3 +202,25 @@ dEQP-GLES31.functional.layout_binding.ssbo.fragment_binding_max,Fail dEQP-GLES31.functional.primitive_bounding_box.triangles.global_state.vertex_tessellation_fragment.fbo_bbox_smaller,Fail dEQP-GLES31.functional.primitive_bounding_box.triangles.global_state.vertex_tessellation_geometry_fragment.fbo_bbox_equal,Fail KHR-GL32.framebuffer_blit.multisampled_to_singlesampled_blit_depth_config_test,Fail + +# !41329 +dEQP-GLES3.functional.transform_feedback.random.interleaved.triangles.7,Crash +dEQP-GLES3.functional.transform_feedback.random.separate.points.4,Crash +dEQP-GLES3.functional.transform_feedback.random.separate.points.5,Crash +dEQP-GLES3.functional.transform_feedback.random_full_array_capture.interleaved.triangles.1,Crash +dEQP-GLES3.functional.transform_feedback.random_full_array_capture.separate.points.4,Crash +dEQP-GLES3.functional.transform_feedback.random_full_array_capture.separate.triangles.4,Crash +dEQP-GLES3.functional.transform_feedback.random_full_array_capture.separate.triangles.7,Crash +dEQP-GLES3.functional.transform_feedback.random.separate.lines.6,Crash +dEQP-GLES3.functional.transform_feedback.random.separate.points.6,Crash +dEQP-GLES3.functional.transform_feedback.random.separate.triangles.5,Crash +dEQP-GLES3.functional.transform_feedback.random_full_array_capture.interleaved.points.5,Crash +dEQP-GLES3.functional.transform_feedback.random_full_array_capture.separate.lines.6,Crash +dEQP-GLES3.functional.transform_feedback.random_full_array_capture.separate.points.5,Crash +dEQP-GLES3.functional.transform_feedback.random.separate.lines.2,Crash +dEQP-GLES3.functional.transform_feedback.random.separate.lines.8,Crash +dEQP-GLES3.functional.transform_feedback.random.separate.points.1,Crash +dEQP-GLES3.functional.transform_feedback.random.separate.triangles.1,Crash +dEQP-GLES3.functional.transform_feedback.random.separate.triangles.4,Crash +dEQP-GLES3.functional.transform_feedback.random.separate.triangles.6,Crash +dEQP-GLES3.functional.transform_feedback.random_full_array_capture.separate.triangles.1,Crash diff --git a/src/gallium/drivers/virgl/ci/virgl-gles-fails.txt b/src/gallium/drivers/virgl/ci/virgl-gles-fails.txt index d2d124e148e..5826d97ca65 100644 --- a/src/gallium/drivers/virgl/ci/virgl-gles-fails.txt +++ b/src/gallium/drivers/virgl/ci/virgl-gles-fails.txt @@ -198,3 +198,25 @@ KHR-GL32.framebuffer_blit.multisampled_to_singlesampled_blit_depth_config_test,F # !40444 KHR-GL32.packed_depth_stencil.blit.depth24_stencil8,Fail KHR-GL32.packed_depth_stencil.blit.depth32f_stencil8,Fail + +# !41329 +dEQP-GLES3.functional.transform_feedback.random.interleaved.triangles.7,Crash +dEQP-GLES3.functional.transform_feedback.random.separate.points.4,Crash +dEQP-GLES3.functional.transform_feedback.random.separate.points.5,Crash +dEQP-GLES3.functional.transform_feedback.random_full_array_capture.interleaved.triangles.1,Crash +dEQP-GLES3.functional.transform_feedback.random_full_array_capture.separate.points.4,Crash +dEQP-GLES3.functional.transform_feedback.random_full_array_capture.separate.triangles.4,Crash +dEQP-GLES3.functional.transform_feedback.random_full_array_capture.separate.triangles.7,Crash +dEQP-GLES3.functional.transform_feedback.random.separate.lines.6,Crash +dEQP-GLES3.functional.transform_feedback.random.separate.points.6,Crash +dEQP-GLES3.functional.transform_feedback.random.separate.triangles.5,Crash +dEQP-GLES3.functional.transform_feedback.random_full_array_capture.interleaved.points.5,Crash +dEQP-GLES3.functional.transform_feedback.random_full_array_capture.separate.lines.6,Crash +dEQP-GLES3.functional.transform_feedback.random_full_array_capture.separate.points.5,Crash +dEQP-GLES3.functional.transform_feedback.random.separate.lines.2,Crash +dEQP-GLES3.functional.transform_feedback.random.separate.lines.8,Crash +dEQP-GLES3.functional.transform_feedback.random.separate.points.1,Crash +dEQP-GLES3.functional.transform_feedback.random.separate.triangles.1,Crash +dEQP-GLES3.functional.transform_feedback.random.separate.triangles.4,Crash +dEQP-GLES3.functional.transform_feedback.random.separate.triangles.6,Crash +dEQP-GLES3.functional.transform_feedback.random_full_array_capture.separate.triangles.1,Crash diff --git a/src/gallium/drivers/virgl/ci/virpipe-gl-fails.txt b/src/gallium/drivers/virgl/ci/virpipe-gl-fails.txt index dc41c55576c..edb4519a4f6 100644 --- a/src/gallium/drivers/virgl/ci/virpipe-gl-fails.txt +++ b/src/gallium/drivers/virgl/ci/virpipe-gl-fails.txt @@ -592,3 +592,12 @@ KHR-Single-GL43.arrays_of_arrays_gl.SubroutineFunctionCalls2_var_type_index_14,F KHR-GL43.cull_distance.functional_test_item_5_primitive_mode_lines_max_culldist_0,Fail KHR-GL43.framebuffer_blit.multisampled_to_singlesampled_blit_depth_config_test,Fail KHR-Single-GL43.arrays_of_arrays_gl.SubroutineArgumentAliasing2_var_type_index_14,Fail + +# !41329 +dEQP-GLES3.functional.transform_feedback.random.separate.lines.6,Crash +dEQP-GLES3.functional.transform_feedback.random.separate.triangles.4,Crash +dEQP-GLES3.functional.transform_feedback.random.separate.triangles.5,Crash +dEQP-GLES3.functional.transform_feedback.random_full_array_capture.interleaved.points.5,Crash +dEQP-GLES3.functional.transform_feedback.random_full_array_capture.interleaved.triangles.1,Crash +dEQP-GLES3.functional.transform_feedback.random_full_array_capture.separate.lines.6,Crash +dEQP-GLES3.functional.transform_feedback.random_full_array_capture.separate.triangles.4,Crash