mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 02:38:04 +02:00
ir3: optimize subgroup shuffles using shfl
One quirk of the shfl instruction is that it only works with dynamically uniform indices. This commit adds a pass to lower shuffles to the ir3-specific ones using a loop that iterates all distinct indices one by one. This is based on the blob's sequence. Signed-off-by: Job Noorman <jnoorman@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31501>
This commit is contained in:
parent
0919d0f694
commit
dd6ac7055f
3 changed files with 135 additions and 2 deletions
|
|
@ -694,3 +694,134 @@ ir3_nir_lower_64b_subgroups(nir_shader *nir)
|
|||
return nir_shader_lower_instructions(nir, filter_64b_scan_reduce,
|
||||
lower_64b_scan_reduce, NULL);
|
||||
}
|
||||
|
||||
static bool
|
||||
filter_shuffle(const nir_instr *instr, const void *data)
|
||||
{
|
||||
if (instr->type != nir_instr_type_intrinsic) {
|
||||
return false;
|
||||
}
|
||||
|
||||
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
||||
|
||||
switch (intrin->intrinsic) {
|
||||
case nir_intrinsic_shuffle:
|
||||
case nir_intrinsic_shuffle_up:
|
||||
case nir_intrinsic_shuffle_down:
|
||||
case nir_intrinsic_shuffle_xor:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static nir_def *
|
||||
shuffle_to_uniform(nir_builder *b, nir_intrinsic_op op, struct nir_def *val,
|
||||
struct nir_def *id)
|
||||
{
|
||||
switch (op) {
|
||||
case nir_intrinsic_shuffle:
|
||||
return nir_rotate(b, val, id);
|
||||
case nir_intrinsic_shuffle_up:
|
||||
return nir_shuffle_up_uniform_ir3(b, val, id);
|
||||
case nir_intrinsic_shuffle_down:
|
||||
return nir_shuffle_down_uniform_ir3(b, val, id);
|
||||
case nir_intrinsic_shuffle_xor:
|
||||
return nir_shuffle_xor_uniform_ir3(b, val, id);
|
||||
default:
|
||||
unreachable("filtered intrinsic");
|
||||
}
|
||||
}
|
||||
|
||||
/* Transforms a shuffle operation into a loop that only uses shuffles with
|
||||
* (dynamically) uniform indices. This is based on the blob's sequence and
|
||||
* carefully makes sure that the least amount of iterations are performed (i.e.,
|
||||
* one iteration per distinct index) while keeping all invocations active during
|
||||
* each shfl operation. This is necessary since shfl does not update its dst
|
||||
* when its src is inactive.
|
||||
*
|
||||
* done = false;
|
||||
* while (true) {
|
||||
* next_index = read_invocation_cond_ir3(index, !done);
|
||||
* shuffled = op_uniform(val, next_index);
|
||||
*
|
||||
* if (index == next_index) {
|
||||
* result = shuffled;
|
||||
* done = true;
|
||||
* }
|
||||
*
|
||||
* if (subgroupAll(done)) {
|
||||
* break;
|
||||
* }
|
||||
* }
|
||||
*/
|
||||
static nir_def *
make_shuffle_uniform(nir_builder *b, nir_def *val, nir_def *index,
                     nir_intrinsic_op op)
{
   /* Per-invocation flag: set once this invocation's result has been
    * produced by an iteration whose uniform index matched its own.
    */
   nir_variable *done =
      nir_local_variable_create(b->impl, glsl_bool_type(), "done");
   nir_store_var(b, done, nir_imm_false(b), 1);
   /* Holds the shuffled value for each invocation once it is done. */
   nir_variable *result =
      nir_local_variable_create(b->impl, glsl_type_for_def(val), "result");

   nir_loop *loop = nir_push_loop(b);
   {
      /* Broadcast the index of some invocation that is not done yet; every
       * invocation in the loop sees the same value, so it can legally be
       * fed to the uniform-index shfl-based intrinsics.
       */
      nir_def *next_index = nir_read_invocation_cond_ir3(
         b, index->bit_size, index, nir_inot(b, nir_load_var(b, done)));
      /* The broadcast result is dynamically uniform by construction; record
       * that so later passes treat it as such.
       */
      next_index->divergent = false;
      /* All invocations take part in the shuffle (shfl does not update its
       * dst for inactive sources — see the comment above this function).
       */
      nir_def *shuffled = shuffle_to_uniform(b, op, val, next_index);

      /* Only invocations whose own index was picked this iteration latch
       * the result and retire.
       */
      nir_if *nif = nir_push_if(b, nir_ieq(b, index, next_index));
      {
         nir_store_var(b, result, shuffled, 1);
         nir_store_var(b, done, nir_imm_true(b), 1);
      }
      nir_pop_if(b, nif);

      /* One iteration per distinct index: exit once every invocation in the
       * subgroup has its result.
       */
      nir_break_if(b, nir_vote_all(b, 1, nir_load_var(b, done)));
   }
   nir_pop_loop(b, loop);

   return nir_load_var(b, result);
}
|
||||
|
||||
static nir_def *
|
||||
lower_shuffle(nir_builder *b, nir_instr *instr, void *data)
|
||||
{
|
||||
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
||||
nir_def *val = intrin->src[0].ssa;
|
||||
nir_def *index = intrin->src[1].ssa;
|
||||
|
||||
if (intrin->intrinsic == nir_intrinsic_shuffle) {
|
||||
/* The hw only does relative shuffles/rotates so transform shuffle(val, x)
|
||||
* into rotate(val, x - gl_SubgroupInvocationID) which is valid since we
|
||||
* make sure to only use it with uniform indices.
|
||||
*/
|
||||
index = nir_isub(b, index, nir_load_subgroup_invocation(b));
|
||||
}
|
||||
|
||||
if (!index->divergent) {
|
||||
return shuffle_to_uniform(b, intrin->intrinsic, val, index);
|
||||
}
|
||||
|
||||
return make_shuffle_uniform(b, val, index, intrin->intrinsic);
|
||||
}
|
||||
|
||||
/* Lower (relative) shuffles to be able to use the shfl instruction. One quirk
|
||||
* of shfl is that its index has to be dynamically uniform, so we transform the
|
||||
* standard NIR intrinsics into ir3-specific ones which require their index to
|
||||
* be uniform.
|
||||
*/
|
||||
bool
|
||||
ir3_nir_lower_shuffle(nir_shader *nir, struct ir3_shader *shader)
|
||||
{
|
||||
if (!shader->compiler->has_shfl) {
|
||||
return false;
|
||||
}
|
||||
|
||||
nir_convert_to_lcssa(nir, true, true);
|
||||
nir_divergence_analysis(nir);
|
||||
return nir_shader_lower_instructions(nir, filter_shuffle, lower_shuffle,
|
||||
NULL);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -786,8 +786,8 @@ ir3_nir_post_finalize(struct ir3_shader *shader)
|
|||
.lower_vote_bool_eq = true,
|
||||
.lower_subgroup_masks = true,
|
||||
.lower_read_invocation_to_cond = true,
|
||||
.lower_shuffle = true,
|
||||
.lower_relative_shuffle = true,
|
||||
.lower_shuffle = !compiler->has_shfl,
|
||||
.lower_relative_shuffle = !compiler->has_shfl,
|
||||
.lower_rotate_to_shuffle = !compiler->has_shfl,
|
||||
.lower_inverse_ballot = true,
|
||||
};
|
||||
|
|
@ -800,6 +800,7 @@ ir3_nir_post_finalize(struct ir3_shader *shader)
|
|||
}
|
||||
|
||||
OPT(s, nir_lower_subgroups, &options);
|
||||
OPT(s, ir3_nir_lower_shuffle, shader);
|
||||
|
||||
/* We want to run the 64b lowering after nir_lower_subgroups so that the
|
||||
* operations have been scalarized. However, the 64b lowering will insert
|
||||
|
|
|
|||
|
|
@ -82,6 +82,7 @@ nir_def *ir3_nir_try_propagate_bit_shift(nir_builder *b,
|
|||
int32_t shift);
|
||||
|
||||
bool ir3_nir_lower_64b_subgroups(nir_shader *nir);
|
||||
bool ir3_nir_lower_shuffle(nir_shader *nir, struct ir3_shader *shader);
|
||||
bool ir3_nir_opt_subgroups(nir_shader *nir, struct ir3_shader_variant *v);
|
||||
|
||||
nir_def *ir3_get_shared_driver_ubo(nir_builder *b,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue