diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index da3285874c2..959ba95e039 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -2649,6 +2649,35 @@ radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer)
    cmd_buffer->state.context_roll_without_scissor_emitted = true;
 }
 
+unsigned
+radv_instance_rate_prolog_index(unsigned num_attributes, uint32_t instance_rate_inputs)
+{
+   /* instance_rate_vs_prologs is a flattened array of arrays of arrays of different sizes, or a
+    * single array sorted in ascending order using, from most to least significant:
+    * - total number of attributes (num_attributes, used as a 1-based index into a 16-entry LUT)
+    * - number of instanced attributes (set bits in instance_rate_inputs, which must be non-zero)
+    * - index of first instanced attribute (lowest set bit of instance_rate_inputs)
+    */
+
+   /* From total number of attributes to offset (count of prologs with fewer total attributes). */
+   static const uint16_t total_to_offset[16] = {0, 1, 4, 10, 20, 35, 56, 84,
+                                                120, 165, 220, 286, 364, 455, 560, 680};
+   unsigned start_index = total_to_offset[num_attributes - 1];
+
+   /* From number of instanced attributes to offset. This would require a different LUT depending on
+    * the total number of attributes, but we can exploit a pattern to use just the LUT for 16 total
+    * attributes: every smaller count has (16 - num_attributes) fewer entries than in that LUT.
+    */
+   static const uint8_t count_to_offset_total16[16] = {0, 16, 31, 45, 58, 70, 81, 91,
+                                                       100, 108, 115, 121, 126, 130, 133, 135};
+   unsigned count = util_bitcount(instance_rate_inputs);
+   unsigned offset_from_start_index =
+      count_to_offset_total16[count - 1] - ((16 - num_attributes) * (count - 1));
+
+   unsigned first = ffs(instance_rate_inputs) - 1;
+   return start_index + offset_from_start_index + first;
+}
+
 union vs_prolog_key_header {
    struct {
       uint32_t key_size : 8;
@@ -2734,6 +2763,25 @@ lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant
    else if (pipeline->shaders[MESA_SHADER_GEOMETRY] == vs_shader)
       key.next_stage = MESA_SHADER_GEOMETRY;
 
+   /* try to use a pre-compiled prolog first */
+   struct radv_shader_prolog *prolog = NULL;
+   if (!key.as_ls && key.next_stage == MESA_SHADER_VERTEX &&
+       key.is_ngg == device->physical_device->use_ngg && !misaligned_mask &&
+       !state->alpha_adjust_lo && !state->alpha_adjust_hi &&
+       vs_shader->info.wave_size == device->physical_device->ge_wave_size) {
+      if (!instance_rate_inputs) {
+         prolog = device->simple_vs_prologs[num_attributes - 1];
+      } else if (num_attributes <= 16 && !*nontrivial_divisors &&
+                 util_bitcount(instance_rate_inputs) ==
+                    (util_last_bit(instance_rate_inputs) - ffs(instance_rate_inputs) + 1)) {
+         unsigned index = radv_instance_rate_prolog_index(num_attributes, instance_rate_inputs);
+         prolog = device->instance_rate_vs_prologs[index];
+      }
+   }
+   if (prolog)
+      return prolog;
+
+   /* if we couldn't use a pre-compiled prolog, find one in the cache or create one */
    uint32_t key_words[16];
    unsigned key_size = 1;
 
@@ -2801,7 +2849,7 @@ lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant
       return prolog_entry->data;
    }
 
-   struct radv_shader_prolog *prolog = radv_create_vs_prolog(device, &key);
+   prolog = radv_create_vs_prolog(device, &key);
    uint32_t *key2 = malloc(key_size * 4);
    if (!prolog || !key2) {
       free(key2);
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index a866812f9fd..3246ffeefe3 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -2675,6 +2675,61 @@ radv_device_init_vs_prologs(struct radv_device *device)
    if (!device->vs_prologs)
       return vk_error(device->physical_device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
+   /* don't pre-compile prologs if we want to print them */
+   if (device->instance->debug_flags & RADV_DEBUG_DUMP_PROLOGS)
+      return VK_SUCCESS;
+
+   struct radv_vs_input_state state;
+   state.nontrivial_divisors = 0;
+   memset(state.offsets, 0, sizeof(state.offsets));
+   state.alpha_adjust_lo = 0;
+   state.alpha_adjust_hi = 0;
+   memset(state.formats, 0, sizeof(state.formats));
+
+   struct radv_vs_prolog_key key;
+   key.state = &state;
+   key.misaligned_mask = 0;
+   key.as_ls = false;
+   key.is_ngg = device->physical_device->use_ngg;
+   key.next_stage = MESA_SHADER_VERTEX;
+   key.wave32 = device->physical_device->ge_wave_size == 32;
+
+   for (unsigned i = 1; i <= MAX_VERTEX_ATTRIBS; i++) {
+      state.attribute_mask = BITFIELD_MASK(i);
+      state.instance_rate_inputs = 0;
+
+      key.num_attributes = i;
+
+      device->simple_vs_prologs[i - 1] = radv_create_vs_prolog(device, &key);
+      if (!device->simple_vs_prologs[i - 1])
+         return vk_error(device->physical_device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+   }
+
+   unsigned idx = 0;
+   for (unsigned num_attributes = 1; num_attributes <= 16; num_attributes++) {
+      state.attribute_mask = BITFIELD_MASK(num_attributes);
+
+      for (unsigned i = 0; i < num_attributes; i++)
+         state.divisors[i] = 1;
+
+      for (unsigned count = 1; count <= num_attributes; count++) {
+         for (unsigned start = 0; start <= (num_attributes - count); start++) {
+            state.instance_rate_inputs = u_bit_consecutive(start, count);
+
+            key.num_attributes = num_attributes;
+
+            struct radv_shader_prolog *prolog = radv_create_vs_prolog(device, &key);
+            if (!prolog)
+               return vk_error(device->physical_device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+
+            assert(idx ==
+                   radv_instance_rate_prolog_index(num_attributes, state.instance_rate_inputs));
+            device->instance_rate_vs_prologs[idx++] = prolog;
+         }
+      }
+   }
+   assert(idx == ARRAY_SIZE(device->instance_rate_vs_prologs));
+
    return VK_SUCCESS;
 }
 
@@ -2689,6 +2744,12 @@ radv_device_finish_vs_prologs(struct radv_device *device)
       }
       _mesa_hash_table_destroy(device->vs_prologs, NULL);
    }
+
+   for (unsigned i = 0; i < ARRAY_SIZE(device->simple_vs_prologs); i++)
+      radv_prolog_destroy(device, device->simple_vs_prologs[i]);
+
+   for (unsigned i = 0; i < ARRAY_SIZE(device->instance_rate_vs_prologs); i++)
+      radv_prolog_destroy(device, device->instance_rate_vs_prologs[i]);
 }
 
 VkResult
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 60ea3b3c2aa..f4cf110df05 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -835,6 +835,9 @@ struct radv_device {
 
    struct u_rwlock vs_prologs_lock;
    struct hash_table *vs_prologs;
+
+   struct radv_shader_prolog *simple_vs_prologs[MAX_VERTEX_ATTRIBS];
+   struct radv_shader_prolog *instance_rate_vs_prologs[816];
 };
 
 VkResult _radv_device_set_lost(struct radv_device *device, const char *file, int line,
@@ -1543,6 +1546,7 @@ void si_cp_dma_wait_for_idle(struct radv_cmd_buffer *cmd_buffer);
 
 void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer);
 
+unsigned radv_instance_rate_prolog_index(unsigned num_attributes, uint32_t instance_rate_inputs);
 uint32_t radv_hash_vs_prolog(const void *key_);
 bool radv_cmp_vs_prolog(const void *a_, const void *b_);