radv, aco: Rework VS prolog key handling.

The main change is to use struct radv_vs_prolog_key directly instead of
the compressed representation to simplify an upcoming rework in prolog /
epilog caching. In doing so the state struct pointer was replaced with
an inline struct.

Care was also taken to pre-mask all the states with the active attribute
mask and other masks when it makes sense; this ensures that we don't
accidentally use information not hashed into the key during compilation.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26023>
This commit is contained in:
Tatsuyuki Ishi 2023-11-03 19:07:51 +09:00 committed by Marge Bot
parent 5cc7f54f15
commit 3fc3a94bce
6 changed files with 45 additions and 120 deletions

View file

@ -12739,7 +12739,7 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_sh
bld.sopp(aco_opcode::s_setprio, -1u, 0x3u);
uint32_t attrib_mask = BITFIELD_MASK(pinfo->num_attributes);
bool has_nontrivial_divisors = pinfo->state.nontrivial_divisors & attrib_mask;
bool has_nontrivial_divisors = pinfo->state.nontrivial_divisors;
wait_imm lgkm_imm;
lgkm_imm.lgkm = 0;
@ -12800,10 +12800,9 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_sh
}
bool needs_instance_index =
pinfo->state.instance_rate_inputs & attrib_mask &
pinfo->state.instance_rate_inputs &
~(pinfo->state.zero_divisors | pinfo->state.nontrivial_divisors); /* divisor is 1 */
bool needs_start_instance =
pinfo->state.instance_rate_inputs & attrib_mask & pinfo->state.zero_divisors;
bool needs_start_instance = pinfo->state.instance_rate_inputs & pinfo->state.zero_divisors;
bool needs_vertex_index = ~pinfo->state.instance_rate_inputs & attrib_mask;
if (needs_vertex_index)
bld.vadd32(Definition(vertex_index, v1), get_arg_fixed(args, args->base_vertex),

View file

@ -72,8 +72,8 @@ radv_aco_convert_shader_info(struct aco_shader_info *aco_info, const struct radv
aco_info->next_stage_pc = radv_args->next_stage_pc;
}
/* Field-copy helpers for converting a radv VS prolog key into the ACO-side info.
 * ASSIGN_VS_STATE_FIELD copies state member x by plain assignment;
 * ASSIGN_VS_STATE_FIELD_CP copies it with memcpy, sized by the source member.
 * Both expand to the names `aco_info` and `radv`, which must exist at the use site.
 *
 * NOTE(review): each macro is listed twice because this span interleaves two
 * revisions. The first pair reads the state through a pointer (radv->state->x),
 * the second pair reads an inline struct (radv->state.x). Only one pair can be
 * live in a real translation unit.
 */
#define ASSIGN_VS_STATE_FIELD(x) aco_info->state.x = radv->state->x
#define ASSIGN_VS_STATE_FIELD_CP(x) memcpy(&aco_info->state.x, &radv->state->x, sizeof(radv->state->x))
#define ASSIGN_VS_STATE_FIELD(x) aco_info->state.x = radv->state.x
#define ASSIGN_VS_STATE_FIELD_CP(x) memcpy(&aco_info->state.x, &radv->state.x, sizeof(radv->state.x))
static inline void
radv_aco_convert_vs_prolog_key(struct aco_vs_prolog_info *aco_info, const struct radv_vs_prolog_key *radv,
const struct radv_shader_args *radv_args)

View file

@ -3691,53 +3691,25 @@ radv_instance_rate_prolog_index(unsigned num_attributes, uint32_t instance_rate_
return start_index + offset_from_start_index + first;
}
/* First 32-bit word of the compressed (variable-length) VS prolog key.
 * The bitfield widths add up to 32, and lookup_vs_prolog statically asserts
 * that the union is 4 bytes. `v` gives access to the same word as a plain
 * uint32_t so it can be stored in / loaded from key word 0.
 */
union vs_prolog_key_header {
struct {
/* Total key length in bytes; used as the length for hashing and comparing. */
uint32_t key_size : 8;
uint32_t num_attributes : 6;
uint32_t as_ls : 1;
uint32_t is_ngg : 1;
uint32_t wave32 : 1;
uint32_t next_stage : 3;
/* The one-bit fields below are presence flags: lookup_vs_prolog sets each one
 * when it appends the corresponding mask word to the key.
 */
uint32_t instance_rate_inputs : 1;
uint32_t alpha_adjust_lo : 1;
uint32_t alpha_adjust_hi : 1;
uint32_t misaligned_mask : 1;
uint32_t post_shuffle : 1;
uint32_t nontrivial_divisors : 1;
uint32_t zero_divisors : 1;
/* We need this to ensure the padding is zero. It's useful even if it's unused. */
uint32_t padding0 : 5;
};
uint32_t v;
};
/* Hash callback for VS prolog keys: feeds the raw key bytes to _mesa_hash_data.
 *
 * NOTE(review): this span interleaves two revisions of the body.
 *  - The uint32_t / header lines treat the key as a word array whose first word
 *    is a vs_prolog_key_header and hash header.key_size bytes.
 *  - The radv_vs_prolog_key lines hash the whole fixed-size struct
 *    (sizeof(*key)), so every byte of the struct, including any padding, takes
 *    part in the hash. lookup_vs_prolog zeroes the key with memset before
 *    filling it; other callers should do the same — TODO confirm.
 */
uint32_t
radv_hash_vs_prolog(const void *key_)
{
const uint32_t *key = key_;
union vs_prolog_key_header header;
header.v = key[0];
return _mesa_hash_data(key, header.key_size);
const struct radv_vs_prolog_key *key = key_;
return _mesa_hash_data(key, sizeof(*key));
}
/* Equality callback for VS prolog keys: returns true when the two keys are
 * byte-for-byte identical.
 *
 * NOTE(review): this span interleaves two revisions of the body.
 *  - The uint32_t / header lines first compare word 0 (the header, which
 *    carries key_size) and then memcmp header.key_size bytes.
 *  - The radv_vs_prolog_key lines memcmp the whole fixed-size struct, so any
 *    padding bytes are compared too and must have been zeroed by whoever
 *    built the keys.
 */
bool
radv_cmp_vs_prolog(const void *a_, const void *b_)
{
const uint32_t *a = a_;
const uint32_t *b = b_;
if (a[0] != b[0])
return false;
const struct radv_vs_prolog_key *a = a_;
const struct radv_vs_prolog_key *b = b_;
union vs_prolog_key_header header;
header.v = a[0];
return memcmp(a, b, header.key_size) == 0;
return memcmp(a, b, sizeof(*a)) == 0;
}
static struct radv_shader_part *
lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *vs_shader, uint32_t *nontrivial_divisors)
{
STATIC_ASSERT(sizeof(union vs_prolog_key_header) == 4);
assert(vs_shader->info.vs.dynamic_inputs);
const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
@ -3800,12 +3772,17 @@ lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *v
if (prolog)
return prolog;
/* if we couldn't use a pre-compiled prolog, find one in the cache or create one */
uint32_t key_words[17];
unsigned key_size = 1;
struct radv_vs_prolog_key key;
key.state = state;
memset(&key, 0, sizeof(key));
key.state.instance_rate_inputs = instance_rate_inputs;
key.state.nontrivial_divisors = *nontrivial_divisors;
key.state.zero_divisors = zero_divisors;
/* If the attribute is aligned, post shuffle is implemented using DST_SEL instead. */
key.state.post_shuffle = state->post_shuffle & attribute_mask & misaligned_mask;
key.state.alpha_adjust_hi = state->alpha_adjust_hi & attribute_mask;
key.state.alpha_adjust_lo = state->alpha_adjust_lo & attribute_mask;
u_foreach_bit (index, misaligned_mask)
key.state.formats[index] = state->formats[index];
key.num_attributes = num_attributes;
key.misaligned_mask = misaligned_mask;
/* The instance ID input VGPR is placed differently when as_ls=true. */
@ -3820,78 +3797,29 @@ lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *v
key.next_stage = vs_shader->info.stage;
}
union vs_prolog_key_header header;
header.v = 0;
header.num_attributes = num_attributes;
header.as_ls = key.as_ls;
header.is_ngg = key.is_ngg;
header.wave32 = key.wave32;
header.next_stage = key.next_stage;
if (instance_rate_inputs & ~*nontrivial_divisors) {
header.instance_rate_inputs = true;
key_words[key_size++] = instance_rate_inputs;
}
if (*nontrivial_divisors) {
header.nontrivial_divisors = true;
key_words[key_size++] = *nontrivial_divisors;
}
if (zero_divisors) {
header.zero_divisors = true;
key_words[key_size++] = zero_divisors;
}
if (misaligned_mask) {
header.misaligned_mask = true;
key_words[key_size++] = misaligned_mask;
uint8_t *formats = (uint8_t *)&key_words[key_size];
unsigned num_formats = 0;
u_foreach_bit (index, misaligned_mask)
formats[num_formats++] = state->formats[index];
while (num_formats & 0x3)
formats[num_formats++] = 0;
key_size += num_formats / 4u;
if (state->post_shuffle & attribute_mask) {
header.post_shuffle = true;
key_words[key_size++] = state->post_shuffle & attribute_mask;
}
}
if (state->alpha_adjust_lo & attribute_mask) {
header.alpha_adjust_lo = true;
key_words[key_size++] = state->alpha_adjust_lo & attribute_mask;
}
if (state->alpha_adjust_hi & attribute_mask) {
header.alpha_adjust_hi = true;
key_words[key_size++] = state->alpha_adjust_hi & attribute_mask;
}
header.key_size = key_size * sizeof(key_words[0]);
key_words[0] = header.v;
uint32_t hash = radv_hash_vs_prolog(key_words);
uint32_t hash = radv_hash_vs_prolog(&key);
u_rwlock_rdlock(&device->vs_prologs_lock);
struct hash_entry *prolog_entry = _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words);
struct hash_entry *prolog_entry = _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, &key);
u_rwlock_rdunlock(&device->vs_prologs_lock);
if (!prolog_entry) {
u_rwlock_wrlock(&device->vs_prologs_lock);
prolog_entry = _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words);
prolog_entry = _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, &key);
if (prolog_entry) {
u_rwlock_wrunlock(&device->vs_prologs_lock);
return prolog_entry->data;
}
prolog = radv_create_vs_prolog(device, &key);
uint32_t *key2 = malloc(key_size * 4);
struct radv_vs_prolog_key *key2 = malloc(sizeof(key));
if (!prolog || !key2) {
radv_shader_part_unref(device, prolog);
free(key2);
u_rwlock_wrunlock(&device->vs_prologs_lock);
return NULL;
}
memcpy(key2, key_words, key_size * 4);
memcpy(key2, &key, sizeof(key));
_mesa_hash_table_insert_pre_hashed(device->vs_prologs, hash, key2, prolog);
u_rwlock_wrunlock(&device->vs_prologs_lock);

View file

@ -168,25 +168,15 @@ radv_device_init_vs_prologs(struct radv_device *device)
if (device->instance->debug_flags & RADV_DEBUG_DUMP_PROLOGS)
return VK_SUCCESS;
struct radv_vs_input_state state;
state.nontrivial_divisors = 0;
memset(state.offsets, 0, sizeof(state.offsets));
state.alpha_adjust_lo = 0;
state.alpha_adjust_hi = 0;
memset(state.formats, 0, sizeof(state.formats));
struct radv_vs_prolog_key key;
key.state = &state;
key.misaligned_mask = 0;
memset(&key, 0, sizeof(key));
key.as_ls = false;
key.is_ngg = device->physical_device->use_ngg;
key.next_stage = MESA_SHADER_VERTEX;
key.wave32 = device->physical_device->ge_wave_size == 32;
for (unsigned i = 1; i <= MAX_VERTEX_ATTRIBS; i++) {
state.attribute_mask = BITFIELD_MASK(i);
state.instance_rate_inputs = 0;
key.state.instance_rate_inputs = 0;
key.num_attributes = i;
device->simple_vs_prologs[i - 1] = radv_create_vs_prolog(device, &key);
@ -196,22 +186,16 @@ radv_device_init_vs_prologs(struct radv_device *device)
unsigned idx = 0;
for (unsigned num_attributes = 1; num_attributes <= 16; num_attributes++) {
state.attribute_mask = BITFIELD_MASK(num_attributes);
for (unsigned i = 0; i < num_attributes; i++)
state.divisors[i] = 1;
for (unsigned count = 1; count <= num_attributes; count++) {
for (unsigned start = 0; start <= (num_attributes - count); start++) {
state.instance_rate_inputs = u_bit_consecutive(start, count);
key.state.instance_rate_inputs = u_bit_consecutive(start, count);
key.num_attributes = num_attributes;
struct radv_shader_part *prolog = radv_create_vs_prolog(device, &key);
if (!prolog)
return vk_error(device->physical_device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
assert(idx == radv_instance_rate_prolog_index(num_attributes, state.instance_rate_inputs));
assert(idx == radv_instance_rate_prolog_index(num_attributes, key.state.instance_rate_inputs));
device->instance_rate_vs_prologs[idx++] = prolog;
}
}

View file

@ -2581,7 +2581,7 @@ radv_create_vs_prolog(struct radv_device *device, const struct radv_vs_prolog_ke
if (!prolog)
goto fail;
prolog->nontrivial_divisors = key->state->nontrivial_divisors;
prolog->nontrivial_divisors = key->state.nontrivial_divisors;
if (options.dump_shader) {
fprintf(stderr, "Vertex prolog");

View file

@ -485,7 +485,21 @@ struct radv_vs_input_state {
};
struct radv_vs_prolog_key {
const struct radv_vs_input_state *state;
/* All the fields are pre-masked with BITFIELD_MASK(num_attributes).
* Some of the fields are pre-masked by other conditions. See lookup_vs_prolog.
*/
struct {
uint32_t instance_rate_inputs;
uint32_t nontrivial_divisors;
uint32_t zero_divisors;
uint32_t post_shuffle;
/* Having two separate fields instead of a single uint64_t makes it easier to remove attributes
* using bitwise arithmetic.
*/
uint32_t alpha_adjust_lo;
uint32_t alpha_adjust_hi;
uint8_t formats[MAX_VERTEX_ATTRIBS];
} state;
unsigned num_attributes;
uint32_t misaligned_mask;
bool as_ls;