mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-01-06 15:20:17 +01:00
radv, aco: Rework VS prolog key handling.
The main change is to use struct radv_vs_prolog_key directly instead of the compressed representation to simplify an upcoming rework in prolog / epilog caching. In doing so, the state struct pointer was replaced with an inline struct. Care was also taken to pre-mask all the states with the active attribute mask and other masks when it makes sense; this ensures that we don't accidentally use information not hashed into the key during compilation.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26023>
This commit is contained in:
parent
5cc7f54f15
commit
3fc3a94bce
6 changed files with 45 additions and 120 deletions
|
|
@ -12739,7 +12739,7 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_sh
|
|||
bld.sopp(aco_opcode::s_setprio, -1u, 0x3u);
|
||||
|
||||
uint32_t attrib_mask = BITFIELD_MASK(pinfo->num_attributes);
|
||||
bool has_nontrivial_divisors = pinfo->state.nontrivial_divisors & attrib_mask;
|
||||
bool has_nontrivial_divisors = pinfo->state.nontrivial_divisors;
|
||||
|
||||
wait_imm lgkm_imm;
|
||||
lgkm_imm.lgkm = 0;
|
||||
|
|
@ -12800,10 +12800,9 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_sh
|
|||
}
|
||||
|
||||
bool needs_instance_index =
|
||||
pinfo->state.instance_rate_inputs & attrib_mask &
|
||||
pinfo->state.instance_rate_inputs &
|
||||
~(pinfo->state.zero_divisors | pinfo->state.nontrivial_divisors); /* divisor is 1 */
|
||||
bool needs_start_instance =
|
||||
pinfo->state.instance_rate_inputs & attrib_mask & pinfo->state.zero_divisors;
|
||||
bool needs_start_instance = pinfo->state.instance_rate_inputs & pinfo->state.zero_divisors;
|
||||
bool needs_vertex_index = ~pinfo->state.instance_rate_inputs & attrib_mask;
|
||||
if (needs_vertex_index)
|
||||
bld.vadd32(Definition(vertex_index, v1), get_arg_fixed(args, args->base_vertex),
|
||||
|
|
|
|||
|
|
@ -72,8 +72,8 @@ radv_aco_convert_shader_info(struct aco_shader_info *aco_info, const struct radv
|
|||
aco_info->next_stage_pc = radv_args->next_stage_pc;
|
||||
}
|
||||
|
||||
#define ASSIGN_VS_STATE_FIELD(x) aco_info->state.x = radv->state->x
|
||||
#define ASSIGN_VS_STATE_FIELD_CP(x) memcpy(&aco_info->state.x, &radv->state->x, sizeof(radv->state->x))
|
||||
#define ASSIGN_VS_STATE_FIELD(x) aco_info->state.x = radv->state.x
|
||||
#define ASSIGN_VS_STATE_FIELD_CP(x) memcpy(&aco_info->state.x, &radv->state.x, sizeof(radv->state.x))
|
||||
static inline void
|
||||
radv_aco_convert_vs_prolog_key(struct aco_vs_prolog_info *aco_info, const struct radv_vs_prolog_key *radv,
|
||||
const struct radv_shader_args *radv_args)
|
||||
|
|
|
|||
|
|
@ -3691,53 +3691,25 @@ radv_instance_rate_prolog_index(unsigned num_attributes, uint32_t instance_rate_
|
|||
return start_index + offset_from_start_index + first;
|
||||
}
|
||||
|
||||
union vs_prolog_key_header {
|
||||
struct {
|
||||
uint32_t key_size : 8;
|
||||
uint32_t num_attributes : 6;
|
||||
uint32_t as_ls : 1;
|
||||
uint32_t is_ngg : 1;
|
||||
uint32_t wave32 : 1;
|
||||
uint32_t next_stage : 3;
|
||||
uint32_t instance_rate_inputs : 1;
|
||||
uint32_t alpha_adjust_lo : 1;
|
||||
uint32_t alpha_adjust_hi : 1;
|
||||
uint32_t misaligned_mask : 1;
|
||||
uint32_t post_shuffle : 1;
|
||||
uint32_t nontrivial_divisors : 1;
|
||||
uint32_t zero_divisors : 1;
|
||||
/* We need this to ensure the padding is zero. It's useful even if it's unused. */
|
||||
uint32_t padding0 : 5;
|
||||
};
|
||||
uint32_t v;
|
||||
};
|
||||
|
||||
uint32_t
|
||||
radv_hash_vs_prolog(const void *key_)
|
||||
{
|
||||
const uint32_t *key = key_;
|
||||
union vs_prolog_key_header header;
|
||||
header.v = key[0];
|
||||
return _mesa_hash_data(key, header.key_size);
|
||||
const struct radv_vs_prolog_key *key = key_;
|
||||
return _mesa_hash_data(key, sizeof(*key));
|
||||
}
|
||||
|
||||
bool
|
||||
radv_cmp_vs_prolog(const void *a_, const void *b_)
|
||||
{
|
||||
const uint32_t *a = a_;
|
||||
const uint32_t *b = b_;
|
||||
if (a[0] != b[0])
|
||||
return false;
|
||||
const struct radv_vs_prolog_key *a = a_;
|
||||
const struct radv_vs_prolog_key *b = b_;
|
||||
|
||||
union vs_prolog_key_header header;
|
||||
header.v = a[0];
|
||||
return memcmp(a, b, header.key_size) == 0;
|
||||
return memcmp(a, b, sizeof(*a)) == 0;
|
||||
}
|
||||
|
||||
static struct radv_shader_part *
|
||||
lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *vs_shader, uint32_t *nontrivial_divisors)
|
||||
{
|
||||
STATIC_ASSERT(sizeof(union vs_prolog_key_header) == 4);
|
||||
assert(vs_shader->info.vs.dynamic_inputs);
|
||||
|
||||
const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
|
||||
|
|
@ -3800,12 +3772,17 @@ lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *v
|
|||
if (prolog)
|
||||
return prolog;
|
||||
|
||||
/* if we couldn't use a pre-compiled prolog, find one in the cache or create one */
|
||||
uint32_t key_words[17];
|
||||
unsigned key_size = 1;
|
||||
|
||||
struct radv_vs_prolog_key key;
|
||||
key.state = state;
|
||||
memset(&key, 0, sizeof(key));
|
||||
key.state.instance_rate_inputs = instance_rate_inputs;
|
||||
key.state.nontrivial_divisors = *nontrivial_divisors;
|
||||
key.state.zero_divisors = zero_divisors;
|
||||
/* If the attribute is aligned, post shuffle is implemented using DST_SEL instead. */
|
||||
key.state.post_shuffle = state->post_shuffle & attribute_mask & misaligned_mask;
|
||||
key.state.alpha_adjust_hi = state->alpha_adjust_hi & attribute_mask;
|
||||
key.state.alpha_adjust_lo = state->alpha_adjust_lo & attribute_mask;
|
||||
u_foreach_bit (index, misaligned_mask)
|
||||
key.state.formats[index] = state->formats[index];
|
||||
key.num_attributes = num_attributes;
|
||||
key.misaligned_mask = misaligned_mask;
|
||||
/* The instance ID input VGPR is placed differently when as_ls=true. */
|
||||
|
|
@ -3820,78 +3797,29 @@ lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *v
|
|||
key.next_stage = vs_shader->info.stage;
|
||||
}
|
||||
|
||||
union vs_prolog_key_header header;
|
||||
header.v = 0;
|
||||
header.num_attributes = num_attributes;
|
||||
header.as_ls = key.as_ls;
|
||||
header.is_ngg = key.is_ngg;
|
||||
header.wave32 = key.wave32;
|
||||
header.next_stage = key.next_stage;
|
||||
|
||||
if (instance_rate_inputs & ~*nontrivial_divisors) {
|
||||
header.instance_rate_inputs = true;
|
||||
key_words[key_size++] = instance_rate_inputs;
|
||||
}
|
||||
if (*nontrivial_divisors) {
|
||||
header.nontrivial_divisors = true;
|
||||
key_words[key_size++] = *nontrivial_divisors;
|
||||
}
|
||||
if (zero_divisors) {
|
||||
header.zero_divisors = true;
|
||||
key_words[key_size++] = zero_divisors;
|
||||
}
|
||||
if (misaligned_mask) {
|
||||
header.misaligned_mask = true;
|
||||
key_words[key_size++] = misaligned_mask;
|
||||
|
||||
uint8_t *formats = (uint8_t *)&key_words[key_size];
|
||||
unsigned num_formats = 0;
|
||||
u_foreach_bit (index, misaligned_mask)
|
||||
formats[num_formats++] = state->formats[index];
|
||||
while (num_formats & 0x3)
|
||||
formats[num_formats++] = 0;
|
||||
key_size += num_formats / 4u;
|
||||
|
||||
if (state->post_shuffle & attribute_mask) {
|
||||
header.post_shuffle = true;
|
||||
key_words[key_size++] = state->post_shuffle & attribute_mask;
|
||||
}
|
||||
}
|
||||
if (state->alpha_adjust_lo & attribute_mask) {
|
||||
header.alpha_adjust_lo = true;
|
||||
key_words[key_size++] = state->alpha_adjust_lo & attribute_mask;
|
||||
}
|
||||
if (state->alpha_adjust_hi & attribute_mask) {
|
||||
header.alpha_adjust_hi = true;
|
||||
key_words[key_size++] = state->alpha_adjust_hi & attribute_mask;
|
||||
}
|
||||
|
||||
header.key_size = key_size * sizeof(key_words[0]);
|
||||
key_words[0] = header.v;
|
||||
|
||||
uint32_t hash = radv_hash_vs_prolog(key_words);
|
||||
uint32_t hash = radv_hash_vs_prolog(&key);
|
||||
|
||||
u_rwlock_rdlock(&device->vs_prologs_lock);
|
||||
struct hash_entry *prolog_entry = _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words);
|
||||
struct hash_entry *prolog_entry = _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, &key);
|
||||
u_rwlock_rdunlock(&device->vs_prologs_lock);
|
||||
|
||||
if (!prolog_entry) {
|
||||
u_rwlock_wrlock(&device->vs_prologs_lock);
|
||||
prolog_entry = _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words);
|
||||
prolog_entry = _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, &key);
|
||||
if (prolog_entry) {
|
||||
u_rwlock_wrunlock(&device->vs_prologs_lock);
|
||||
return prolog_entry->data;
|
||||
}
|
||||
|
||||
prolog = radv_create_vs_prolog(device, &key);
|
||||
uint32_t *key2 = malloc(key_size * 4);
|
||||
struct radv_vs_prolog_key *key2 = malloc(sizeof(key));
|
||||
if (!prolog || !key2) {
|
||||
radv_shader_part_unref(device, prolog);
|
||||
free(key2);
|
||||
u_rwlock_wrunlock(&device->vs_prologs_lock);
|
||||
return NULL;
|
||||
}
|
||||
memcpy(key2, key_words, key_size * 4);
|
||||
memcpy(key2, &key, sizeof(key));
|
||||
_mesa_hash_table_insert_pre_hashed(device->vs_prologs, hash, key2, prolog);
|
||||
|
||||
u_rwlock_wrunlock(&device->vs_prologs_lock);
|
||||
|
|
|
|||
|
|
@ -168,25 +168,15 @@ radv_device_init_vs_prologs(struct radv_device *device)
|
|||
if (device->instance->debug_flags & RADV_DEBUG_DUMP_PROLOGS)
|
||||
return VK_SUCCESS;
|
||||
|
||||
struct radv_vs_input_state state;
|
||||
state.nontrivial_divisors = 0;
|
||||
memset(state.offsets, 0, sizeof(state.offsets));
|
||||
state.alpha_adjust_lo = 0;
|
||||
state.alpha_adjust_hi = 0;
|
||||
memset(state.formats, 0, sizeof(state.formats));
|
||||
|
||||
struct radv_vs_prolog_key key;
|
||||
key.state = &state;
|
||||
key.misaligned_mask = 0;
|
||||
memset(&key, 0, sizeof(key));
|
||||
key.as_ls = false;
|
||||
key.is_ngg = device->physical_device->use_ngg;
|
||||
key.next_stage = MESA_SHADER_VERTEX;
|
||||
key.wave32 = device->physical_device->ge_wave_size == 32;
|
||||
|
||||
for (unsigned i = 1; i <= MAX_VERTEX_ATTRIBS; i++) {
|
||||
state.attribute_mask = BITFIELD_MASK(i);
|
||||
state.instance_rate_inputs = 0;
|
||||
|
||||
key.state.instance_rate_inputs = 0;
|
||||
key.num_attributes = i;
|
||||
|
||||
device->simple_vs_prologs[i - 1] = radv_create_vs_prolog(device, &key);
|
||||
|
|
@ -196,22 +186,16 @@ radv_device_init_vs_prologs(struct radv_device *device)
|
|||
|
||||
unsigned idx = 0;
|
||||
for (unsigned num_attributes = 1; num_attributes <= 16; num_attributes++) {
|
||||
state.attribute_mask = BITFIELD_MASK(num_attributes);
|
||||
|
||||
for (unsigned i = 0; i < num_attributes; i++)
|
||||
state.divisors[i] = 1;
|
||||
|
||||
for (unsigned count = 1; count <= num_attributes; count++) {
|
||||
for (unsigned start = 0; start <= (num_attributes - count); start++) {
|
||||
state.instance_rate_inputs = u_bit_consecutive(start, count);
|
||||
|
||||
key.state.instance_rate_inputs = u_bit_consecutive(start, count);
|
||||
key.num_attributes = num_attributes;
|
||||
|
||||
struct radv_shader_part *prolog = radv_create_vs_prolog(device, &key);
|
||||
if (!prolog)
|
||||
return vk_error(device->physical_device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
|
||||
|
||||
assert(idx == radv_instance_rate_prolog_index(num_attributes, state.instance_rate_inputs));
|
||||
assert(idx == radv_instance_rate_prolog_index(num_attributes, key.state.instance_rate_inputs));
|
||||
device->instance_rate_vs_prologs[idx++] = prolog;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2581,7 +2581,7 @@ radv_create_vs_prolog(struct radv_device *device, const struct radv_vs_prolog_ke
|
|||
if (!prolog)
|
||||
goto fail;
|
||||
|
||||
prolog->nontrivial_divisors = key->state->nontrivial_divisors;
|
||||
prolog->nontrivial_divisors = key->state.nontrivial_divisors;
|
||||
|
||||
if (options.dump_shader) {
|
||||
fprintf(stderr, "Vertex prolog");
|
||||
|
|
|
|||
|
|
@ -485,7 +485,21 @@ struct radv_vs_input_state {
|
|||
};
|
||||
|
||||
struct radv_vs_prolog_key {
|
||||
const struct radv_vs_input_state *state;
|
||||
/* All the fields are pre-masked with BITFIELD_MASK(num_attributes).
|
||||
* Some of the fields are pre-masked by other conditions. See lookup_vs_prolog.
|
||||
*/
|
||||
struct {
|
||||
uint32_t instance_rate_inputs;
|
||||
uint32_t nontrivial_divisors;
|
||||
uint32_t zero_divisors;
|
||||
uint32_t post_shuffle;
|
||||
/* Having two separate fields instead of a single uint64_t makes it easier to remove attributes
|
||||
* using bitwise arithmetic.
|
||||
*/
|
||||
uint32_t alpha_adjust_lo;
|
||||
uint32_t alpha_adjust_hi;
|
||||
uint8_t formats[MAX_VERTEX_ATTRIBS];
|
||||
} state;
|
||||
unsigned num_attributes;
|
||||
uint32_t misaligned_mask;
|
||||
bool as_ls;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue