From dd6ac7055f6d580e8858e30386361b3faf261df8 Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Wed, 16 Oct 2024 20:25:32 +0200 Subject: [PATCH] ir3: optimize subgroup shuffles using shfl One quirk of the shfl instruction is that it only works with dynamically uniform indices. This commit adds a pass to lower shuffles to the ir3-specific ones using a loop that iterates all distinct indices one by one. This is based on the blob's sequence. Signed-off-by: Job Noorman Part-of: --- src/freedreno/ir3/ir3_lower_subgroups.c | 131 ++++++++++++++++++++++++ src/freedreno/ir3/ir3_nir.c | 5 +- src/freedreno/ir3/ir3_nir.h | 1 + 3 files changed, 135 insertions(+), 2 deletions(-) diff --git a/src/freedreno/ir3/ir3_lower_subgroups.c b/src/freedreno/ir3/ir3_lower_subgroups.c index 2c0b57803bc..ffed827d5f5 100644 --- a/src/freedreno/ir3/ir3_lower_subgroups.c +++ b/src/freedreno/ir3/ir3_lower_subgroups.c @@ -694,3 +694,134 @@ ir3_nir_lower_64b_subgroups(nir_shader *nir) return nir_shader_lower_instructions(nir, filter_64b_scan_reduce, lower_64b_scan_reduce, NULL); } + +static bool +filter_shuffle(const nir_instr *instr, const void *data) +{ + if (instr->type != nir_instr_type_intrinsic) { + return false; + } + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + switch (intrin->intrinsic) { + case nir_intrinsic_shuffle: + case nir_intrinsic_shuffle_up: + case nir_intrinsic_shuffle_down: + case nir_intrinsic_shuffle_xor: + return true; + default: + return false; + } +} + +static nir_def * +shuffle_to_uniform(nir_builder *b, nir_intrinsic_op op, struct nir_def *val, + struct nir_def *id) +{ + switch (op) { + case nir_intrinsic_shuffle: + return nir_rotate(b, val, id); + case nir_intrinsic_shuffle_up: + return nir_shuffle_up_uniform_ir3(b, val, id); + case nir_intrinsic_shuffle_down: + return nir_shuffle_down_uniform_ir3(b, val, id); + case nir_intrinsic_shuffle_xor: + return nir_shuffle_xor_uniform_ir3(b, val, id); + default: + unreachable("filtered intrinsic"); + } +} + 
+/* Transforms a shuffle operation into a loop that only uses shuffles with + * (dynamically) uniform indices. This is based on the blob's sequence and + * carefully makes sure that the fewest iterations are performed (i.e., + * one iteration per distinct index) while keeping all invocations active during + * each shfl operation. This is necessary since shfl does not update its dst + * when its src is inactive. + * + * done = false; + * while (true) { + * next_index = read_invocation_cond_ir3(index, !done); + * shuffled = op_uniform(val, next_index); + * + * if (index == next_index) { + * result = shuffled; + * done = true; + * } + * + * if (subgroupAll(done)) { + * break; + * } + * } + */ +static nir_def * +make_shuffle_uniform(nir_builder *b, nir_def *val, nir_def *index, + nir_intrinsic_op op) +{ + nir_variable *done = + nir_local_variable_create(b->impl, glsl_bool_type(), "done"); + nir_store_var(b, done, nir_imm_false(b), 1); + nir_variable *result = + nir_local_variable_create(b->impl, glsl_type_for_def(val), "result"); + + nir_loop *loop = nir_push_loop(b); + { + nir_def *next_index = nir_read_invocation_cond_ir3( + b, index->bit_size, index, nir_inot(b, nir_load_var(b, done))); + next_index->divergent = false; + nir_def *shuffled = shuffle_to_uniform(b, op, val, next_index); + + nir_if *nif = nir_push_if(b, nir_ieq(b, index, next_index)); + { + nir_store_var(b, result, shuffled, 1); + nir_store_var(b, done, nir_imm_true(b), 1); + } + nir_pop_if(b, nif); + + nir_break_if(b, nir_vote_all(b, 1, nir_load_var(b, done))); + } + nir_pop_loop(b, loop); + + return nir_load_var(b, result); +} + +static nir_def * +lower_shuffle(nir_builder *b, nir_instr *instr, void *data) +{ + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + nir_def *val = intrin->src[0].ssa; + nir_def *index = intrin->src[1].ssa; + + if (intrin->intrinsic == nir_intrinsic_shuffle) { + /* The hw only does relative shuffles/rotates so transform shuffle(val, x) + * into 
rotate(val, x - gl_SubgroupInvocationID) which is valid since we + * make sure to only use it with uniform indices. + */ + index = nir_isub(b, index, nir_load_subgroup_invocation(b)); + } + + if (!index->divergent) { + return shuffle_to_uniform(b, intrin->intrinsic, val, index); + } + + return make_shuffle_uniform(b, val, index, intrin->intrinsic); +} + +/* Lower (relative) shuffles to be able to use the shfl instruction. One quirk + * of shfl is that its index has to be dynamically uniform, so we transform the + * standard NIR intrinsics into ir3-specific ones which require their index to + * be uniform. + */ +bool +ir3_nir_lower_shuffle(nir_shader *nir, struct ir3_shader *shader) +{ + if (!shader->compiler->has_shfl) { + return false; + } + + nir_convert_to_lcssa(nir, true, true); + nir_divergence_analysis(nir); + return nir_shader_lower_instructions(nir, filter_shuffle, lower_shuffle, + NULL); +} diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c index ff34b3aa8df..6fa9d884b17 100644 --- a/src/freedreno/ir3/ir3_nir.c +++ b/src/freedreno/ir3/ir3_nir.c @@ -786,8 +786,8 @@ ir3_nir_post_finalize(struct ir3_shader *shader) .lower_vote_bool_eq = true, .lower_subgroup_masks = true, .lower_read_invocation_to_cond = true, - .lower_shuffle = true, - .lower_relative_shuffle = true, + .lower_shuffle = !compiler->has_shfl, + .lower_relative_shuffle = !compiler->has_shfl, .lower_rotate_to_shuffle = !compiler->has_shfl, .lower_inverse_ballot = true, }; @@ -800,6 +800,7 @@ ir3_nir_post_finalize(struct ir3_shader *shader) } OPT(s, nir_lower_subgroups, &options); + OPT(s, ir3_nir_lower_shuffle, shader); /* We want to run the 64b lowering after nir_lower_subgroups so that the * operations have been scalarized. 
However, the 64b lowering will insert diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h index 983a8cf7d3d..cf69d0e4c7f 100644 --- a/src/freedreno/ir3/ir3_nir.h +++ b/src/freedreno/ir3/ir3_nir.h @@ -82,6 +82,7 @@ nir_def *ir3_nir_try_propagate_bit_shift(nir_builder *b, int32_t shift); bool ir3_nir_lower_64b_subgroups(nir_shader *nir); +bool ir3_nir_lower_shuffle(nir_shader *nir, struct ir3_shader *shader); bool ir3_nir_opt_subgroups(nir_shader *nir, struct ir3_shader_variant *v); nir_def *ir3_get_shared_driver_ubo(nir_builder *b,