radeonsi: lower subgroup ops after wave size is known

We sometimes use wave32, so we should not use a static subgroup
size of 64.

Signed-off-by: Qiang Yu <yuq825@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30610>
This commit is contained in:
Qiang Yu 2024-07-22 17:21:17 +08:00
parent 31dfb04fd3
commit 0f937426cc
6 changed files with 31 additions and 37 deletions

View file

@ -17,12 +17,6 @@
#include "vl/vl_video_buffer.h"
#include <sys/utsname.h>
#if LLVM_AVAILABLE
#include <llvm/Config/llvm-config.h> /* for LLVM_VERSION_MAJOR */
#else
#define LLVM_VERSION_MAJOR 0
#endif
/* The capabilities reported by the kernel have priority
over the existing logic in si_get_video_param */
#define QUERYABLE_KERNEL (sscreen->info.is_amdgpu && \
@ -1690,23 +1684,4 @@ void si_init_screen_get_functions(struct si_screen *sscreen)
options->support_indirect_outputs = BITFIELD_BIT(MESA_SHADER_TESS_CTRL);
options->varying_expression_max_cost = si_varying_expression_max_cost;
options->varying_estimate_instr_cost = si_varying_estimate_instr_cost;
nir_lower_subgroups_options *lower_subgroups_options = sscreen->nir_lower_subgroups_options;
lower_subgroups_options->subgroup_size = 64;
lower_subgroups_options->ballot_bit_size = 64;
lower_subgroups_options->ballot_components = 1;
lower_subgroups_options->lower_to_scalar = true;
lower_subgroups_options->lower_subgroup_masks = true;
lower_subgroups_options->lower_relative_shuffle = true;
lower_subgroups_options->lower_rotate_to_shuffle = !sscreen->use_aco;
lower_subgroups_options->lower_shuffle_to_32bit = true;
lower_subgroups_options->lower_vote_eq = true;
lower_subgroups_options->lower_vote_bool_eq = true;
lower_subgroups_options->lower_quad_broadcast_dynamic = true;
lower_subgroups_options->lower_quad_broadcast_dynamic_to_const = sscreen->info.gfx_level <= GFX7;
lower_subgroups_options->lower_shuffle_to_swizzle_amd = true;
lower_subgroups_options->lower_ballot_bit_count_to_mbcnt_amd = true;
lower_subgroups_options->lower_inverse_ballot = !sscreen->use_aco && LLVM_VERSION_MAJOR < 17;
lower_subgroups_options->lower_boolean_reduce = true;
lower_subgroups_options->lower_boolean_shuffle = true;
}

View file

@ -1066,7 +1066,6 @@ static void si_destroy_screen(struct pipe_screen *pscreen)
sscreen->ws->destroy(sscreen->ws);
FREE(sscreen->nir_options);
FREE(sscreen->nir_lower_subgroups_options);
FREE(sscreen);
}
@ -1235,7 +1234,6 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
sscreen->b.finalize_nir = si_finalize_nir;
sscreen->nir_options = CALLOC_STRUCT(nir_shader_compiler_options);
sscreen->nir_lower_subgroups_options = CALLOC_STRUCT(nir_lower_subgroups_options);
si_init_screen_get_functions(sscreen);
si_init_screen_buffer_functions(sscreen);
@ -1272,7 +1270,6 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
si_init_gs_info(sscreen);
if (!si_init_shader_cache(sscreen)) {
FREE(sscreen->nir_options);
FREE(sscreen->nir_lower_subgroups_options);
FREE(sscreen);
return NULL;
}
@ -1329,7 +1326,6 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL)) {
si_destroy_shader_cache(sscreen);
FREE(sscreen->nir_options);
FREE(sscreen->nir_lower_subgroups_options);
FREE(sscreen);
glsl_type_singleton_decref();
return NULL;
@ -1341,7 +1337,6 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL)) {
si_destroy_shader_cache(sscreen);
FREE(sscreen->nir_options);
FREE(sscreen->nir_lower_subgroups_options);
FREE(sscreen);
glsl_type_singleton_decref();
return NULL;

View file

@ -527,7 +527,6 @@ struct si_screen {
struct radeon_info info;
struct nir_shader_compiler_options *nir_options;
struct nir_lower_subgroups_options *nir_lower_subgroups_options;
uint64_t debug_flags;
char renderer_string[183];

View file

@ -20,6 +20,12 @@
#include "util/ralloc.h"
#include "util/u_upload_mgr.h"
#if LLVM_AVAILABLE
#include <llvm/Config/llvm-config.h> /* for LLVM_VERSION_MAJOR */
#else
#define LLVM_VERSION_MAJOR 0
#endif
static const char scratch_rsrc_dword0_symbol[] = "SCRATCH_RSRC_DWORD0";
static const char scratch_rsrc_dword1_symbol[] = "SCRATCH_RSRC_DWORD1";
@ -1957,9 +1963,6 @@ static void si_lower_ngg(struct si_shader *shader, nir_shader *nir)
NIR_PASS_V(nir, ac_nir_lower_ngg_gs, &options);
}
/* may generate some subgroup op like ballot */
NIR_PASS_V(nir, nir_lower_subgroups, sel->screen->nir_lower_subgroups_options);
/* may generate some vector output store */
NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
}
@ -2480,6 +2483,30 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader,
progress = true;
}
assert(shader->wave_size == 32 || shader->wave_size == 64);
NIR_PASS(progress, nir, nir_lower_subgroups,
&(struct nir_lower_subgroups_options) {
.subgroup_size = shader->wave_size,
.ballot_bit_size = shader->wave_size,
.ballot_components = 1,
.lower_to_scalar = true,
.lower_subgroup_masks = true,
.lower_relative_shuffle = true,
.lower_rotate_to_shuffle = !sel->info.base.use_aco_amd,
.lower_shuffle_to_32bit = true,
.lower_vote_eq = true,
.lower_vote_bool_eq = true,
.lower_quad_broadcast_dynamic = true,
.lower_quad_broadcast_dynamic_to_const = sel->screen->info.gfx_level <= GFX7,
.lower_shuffle_to_swizzle_amd = true,
.lower_ballot_bit_count_to_mbcnt_amd = true,
.lower_inverse_ballot = !sel->info.base.use_aco_amd && LLVM_VERSION_MAJOR < 17,
.lower_boolean_reduce = true,
.lower_boolean_shuffle = true,
});
NIR_PASS(progress, nir, nir_lower_pack);
NIR_PASS(progress, nir, nir_lower_int64);
NIR_PASS(progress, nir, nir_opt_idiv_const, 8);
NIR_PASS(progress, nir, nir_lower_idiv,
@ -3595,6 +3622,7 @@ nir_shader *si_get_prev_stage_nir_shader(struct si_shader *shader,
*/
prev_shader->key.ge.opt.kill_outputs = 0;
prev_shader->is_monolithic = true;
prev_shader->wave_size = shader->wave_size;
si_init_shader_args(prev_shader, args);

View file

@ -125,7 +125,6 @@ extern "C" {
struct nir_shader;
struct nir_instr;
struct nir_lower_subgroups_options;
#define SI_NUM_INTERP 32
#define SI_MAX_ATTRIBS 16

View file

@ -307,8 +307,6 @@ static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir)
NIR_PASS_V(nir, ac_nir_lower_sin_cos);
NIR_PASS_V(nir, nir_lower_subgroups, sscreen->nir_lower_subgroups_options);
/* Lower load constants to scalar and then clean up the mess */
NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
NIR_PASS_V(nir, nir_lower_var_copies);