mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 04:48:08 +02:00
radv: add pre-compiled vertex shader prologs for common states
This lets us pre-compile a prolog and avoid a hash table lookup during command buffer recording, most of the time.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11717>
This commit is contained in:
parent
80841196b2
commit
f6f6f18e55
3 changed files with 114 additions and 1 deletions
|
|
@ -2649,6 +2649,35 @@ radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer)
|
|||
cmd_buffer->state.context_roll_without_scissor_emitted = true;
|
||||
}
|
||||
|
||||
/**
 * Map a (total attribute count, instanced attribute mask) pair to its slot in
 * the flattened instance_rate_vs_prologs array.
 *
 * The array is laid out in ascending order of, from most to least significant:
 *   1. the total number of attributes,
 *   2. the number of instanced attributes,
 *   3. the index of the first instanced attribute.
 *
 * \param num_attributes        total number of attributes; used as
 *                              `num_attributes - 1` to index a 16-entry table,
 *                              so it must be in [1, 16].
 * \param instance_rate_inputs  bitmask of instanced attributes; must be
 *                              non-zero because its bit count minus one
 *                              indexes a 16-entry table.
 * \return index into the flattened prolog array.
 */
unsigned
radv_instance_rate_prolog_index(unsigned num_attributes, uint32_t instance_rate_inputs)
{
   /* Index of the first entry belonging to a given total attribute count. */
   static const uint16_t base_for_total[16] = {0,   1,   4,   10,  20,  35,  56,  84,
                                               120, 165, 220, 286, 364, 455, 560, 680};

   /* Offset, within the 16-total-attributes group, of the first entry with a
    * given number of instanced attributes. Smaller totals would each need their
    * own table, but every preceding sub-group there is exactly
    * (16 - num_attributes) entries shorter, so the 16-attribute table can be
    * reused with a linear correction.
    */
   static const uint8_t base_for_count16[16] = {0,   16,  31,  45,  58,  70,  81,  91,
                                                100, 108, 115, 121, 126, 130, 133, 135};

   const unsigned instanced = util_bitcount(instance_rate_inputs);
   const unsigned lowest = ffs(instance_rate_inputs) - 1;

   unsigned index = base_for_total[num_attributes - 1];
   index += base_for_count16[instanced - 1] - ((16 - num_attributes) * (instanced - 1));
   index += lowest;

   return index;
}
|
||||
|
||||
union vs_prolog_key_header {
|
||||
struct {
|
||||
uint32_t key_size : 8;
|
||||
|
|
@ -2734,6 +2763,25 @@ lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant
|
|||
else if (pipeline->shaders[MESA_SHADER_GEOMETRY] == vs_shader)
|
||||
key.next_stage = MESA_SHADER_GEOMETRY;
|
||||
|
||||
/* try to use a pre-compiled prolog first */
|
||||
struct radv_shader_prolog *prolog = NULL;
|
||||
if (!key.as_ls && key.next_stage == MESA_SHADER_VERTEX &&
|
||||
key.is_ngg == device->physical_device->use_ngg && !misaligned_mask &&
|
||||
!state->alpha_adjust_lo && !state->alpha_adjust_hi &&
|
||||
vs_shader->info.wave_size == device->physical_device->ge_wave_size) {
|
||||
if (!instance_rate_inputs) {
|
||||
prolog = device->simple_vs_prologs[num_attributes - 1];
|
||||
} else if (num_attributes <= 16 && !*nontrivial_divisors &&
|
||||
util_bitcount(instance_rate_inputs) ==
|
||||
(util_last_bit(instance_rate_inputs) - ffs(instance_rate_inputs) + 1)) {
|
||||
unsigned index = radv_instance_rate_prolog_index(num_attributes, instance_rate_inputs);
|
||||
prolog = device->instance_rate_vs_prologs[index];
|
||||
}
|
||||
}
|
||||
if (prolog)
|
||||
return prolog;
|
||||
|
||||
/* if we couldn't use a pre-compiled prolog, find one in the cache or create one */
|
||||
uint32_t key_words[16];
|
||||
unsigned key_size = 1;
|
||||
|
||||
|
|
@ -2801,7 +2849,7 @@ lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant
|
|||
return prolog_entry->data;
|
||||
}
|
||||
|
||||
struct radv_shader_prolog *prolog = radv_create_vs_prolog(device, &key);
|
||||
prolog = radv_create_vs_prolog(device, &key);
|
||||
uint32_t *key2 = malloc(key_size * 4);
|
||||
if (!prolog || !key2) {
|
||||
free(key2);
|
||||
|
|
|
|||
|
|
@ -2675,6 +2675,61 @@ radv_device_init_vs_prologs(struct radv_device *device)
|
|||
if (!device->vs_prologs)
|
||||
return vk_error(device->physical_device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
|
||||
|
||||
/* don't pre-compile prologs if we want to print them */
|
||||
if (device->instance->debug_flags & RADV_DEBUG_DUMP_PROLOGS)
|
||||
return VK_SUCCESS;
|
||||
|
||||
struct radv_vs_input_state state;
|
||||
state.nontrivial_divisors = 0;
|
||||
memset(state.offsets, 0, sizeof(state.offsets));
|
||||
state.alpha_adjust_lo = 0;
|
||||
state.alpha_adjust_hi = 0;
|
||||
memset(state.formats, 0, sizeof(state.formats));
|
||||
|
||||
struct radv_vs_prolog_key key;
|
||||
key.state = &state;
|
||||
key.misaligned_mask = 0;
|
||||
key.as_ls = false;
|
||||
key.is_ngg = device->physical_device->use_ngg;
|
||||
key.next_stage = MESA_SHADER_VERTEX;
|
||||
key.wave32 = device->physical_device->ge_wave_size == 32;
|
||||
|
||||
for (unsigned i = 1; i <= MAX_VERTEX_ATTRIBS; i++) {
|
||||
state.attribute_mask = BITFIELD_MASK(i);
|
||||
state.instance_rate_inputs = 0;
|
||||
|
||||
key.num_attributes = i;
|
||||
|
||||
device->simple_vs_prologs[i - 1] = radv_create_vs_prolog(device, &key);
|
||||
if (!device->simple_vs_prologs[i - 1])
|
||||
return vk_error(device->physical_device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
|
||||
}
|
||||
|
||||
unsigned idx = 0;
|
||||
for (unsigned num_attributes = 1; num_attributes <= 16; num_attributes++) {
|
||||
state.attribute_mask = BITFIELD_MASK(num_attributes);
|
||||
|
||||
for (unsigned i = 0; i < num_attributes; i++)
|
||||
state.divisors[i] = 1;
|
||||
|
||||
for (unsigned count = 1; count <= num_attributes; count++) {
|
||||
for (unsigned start = 0; start <= (num_attributes - count); start++) {
|
||||
state.instance_rate_inputs = u_bit_consecutive(start, count);
|
||||
|
||||
key.num_attributes = num_attributes;
|
||||
|
||||
struct radv_shader_prolog *prolog = radv_create_vs_prolog(device, &key);
|
||||
if (!prolog)
|
||||
return vk_error(device->physical_device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
|
||||
|
||||
assert(idx ==
|
||||
radv_instance_rate_prolog_index(num_attributes, state.instance_rate_inputs));
|
||||
device->instance_rate_vs_prologs[idx++] = prolog;
|
||||
}
|
||||
}
|
||||
}
|
||||
assert(idx == ARRAY_SIZE(device->instance_rate_vs_prologs));
|
||||
|
||||
return VK_SUCCESS;
|
||||
}
|
||||
|
||||
|
|
@ -2689,6 +2744,12 @@ radv_device_finish_vs_prologs(struct radv_device *device)
|
|||
}
|
||||
_mesa_hash_table_destroy(device->vs_prologs, NULL);
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(device->simple_vs_prologs); i++)
|
||||
radv_prolog_destroy(device, device->simple_vs_prologs[i]);
|
||||
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(device->instance_rate_vs_prologs); i++)
|
||||
radv_prolog_destroy(device, device->instance_rate_vs_prologs[i]);
|
||||
}
|
||||
|
||||
VkResult
|
||||
|
|
|
|||
|
|
@ -835,6 +835,9 @@ struct radv_device {
|
|||
|
||||
struct u_rwlock vs_prologs_lock;
|
||||
struct hash_table *vs_prologs;
|
||||
|
||||
struct radv_shader_prolog *simple_vs_prologs[MAX_VERTEX_ATTRIBS];
|
||||
struct radv_shader_prolog *instance_rate_vs_prologs[816];
|
||||
};
|
||||
|
||||
VkResult _radv_device_set_lost(struct radv_device *device, const char *file, int line,
|
||||
|
|
@ -1543,6 +1546,7 @@ void si_cp_dma_wait_for_idle(struct radv_cmd_buffer *cmd_buffer);
|
|||
|
||||
void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer);
|
||||
|
||||
unsigned radv_instance_rate_prolog_index(unsigned num_attributes, uint32_t instance_rate_inputs);
|
||||
uint32_t radv_hash_vs_prolog(const void *key_);
|
||||
bool radv_cmp_vs_prolog(const void *a_, const void *b_);
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue