intel: switch to new subgroup size info

Reviewed-by: Iván Briano <ivan.briano@intel.com>
Acked-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37258>
Georg Lehmann 2025-09-09 18:24:08 +02:00 committed by Marge Bot
parent 04d3b3bde5
commit 79d02047b8
9 changed files with 44 additions and 108 deletions
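
For orientation, a minimal self-contained sketch (not Mesa code; the struct below only mirrors the shader_info fields this commit switches to) of how the old single subgroup_size enum maps onto the new numeric fields: a required size pins api/min/max to one value, while a fully varying size leaves the range open.

#include <stdio.h>

/* Illustrative stand-in for the relevant shader_info fields (assumption:
 * the real declarations live elsewhere in the tree and carry more state). */
struct demo_subgroup_info {
   unsigned api_subgroup_size;              /* size promised to the API, 0 = none */
   unsigned min_subgroup_size;              /* smallest size the backend may pick */
   unsigned max_subgroup_size;              /* largest size the backend may pick */
   unsigned api_subgroup_size_draw_uniform; /* uniform per draw, but not fixed */
};

int
main(void)
{
   /* Old SUBGROUP_SIZE_REQUIRE_16 (e.g. the blorp clear shaders below):
    * the allowed range collapses to a single value. */
   struct demo_subgroup_info require_16 = {
      .api_subgroup_size = 16,
      .min_subgroup_size = 16,
      .max_subgroup_size = 16,
   };

   /* Old SUBGROUP_SIZE_VARYING: no API size, backend picks within the range. */
   struct demo_subgroup_info varying = {
      .api_subgroup_size = 0,
      .min_subgroup_size = 8,
      .max_subgroup_size = 32,
   };

   printf("require_16 fixed: %d, varying fixed: %d\n",
          require_16.min_subgroup_size == require_16.max_subgroup_size,
          varying.min_subgroup_size == varying.max_subgroup_size);
   return 0;
}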

View file

@@ -36,8 +36,11 @@ blorp_compile_fs_brw(struct blorp_context *blorp, void *mem_ctx,
    brw_preprocess_nir(compiler, nir, &opts);
    nir_remove_dead_variables(nir, nir_var_shader_in, NULL);
    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
-   if (is_fast_clear || use_repclear)
-      nir->info.subgroup_size = SUBGROUP_SIZE_REQUIRE_16;
+   if (is_fast_clear || use_repclear) {
+      nir->info.api_subgroup_size = 16;
+      nir->info.max_subgroup_size = 16;
+      nir->info.min_subgroup_size = 16;
+   }

    struct brw_wm_prog_key wm_key;
    memset(&wm_key, 0, sizeof(wm_key));

View file

@@ -1576,8 +1576,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
     * data clear shaders.
     */
    const unsigned reqd_dispatch_width = brw_required_dispatch_width(&nir->info);
-   assert(reqd_dispatch_width == SUBGROUP_SIZE_VARYING ||
-          reqd_dispatch_width == SUBGROUP_SIZE_REQUIRE_16);
+   assert(reqd_dispatch_width == 0 || reqd_dispatch_width == 16);

    /* Limit identified when first variant is compiled, see
     * brw_shader::limit_dispatch_width().
@@ -1750,7 +1749,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
    } else {
       if ((!has_spilled && dispatch_width_limit >= 16 && INTEL_SIMD(FS, 16)) ||
-          reqd_dispatch_width == SUBGROUP_SIZE_REQUIRE_16) {
+          reqd_dispatch_width == 16) {
          /* Try a SIMD16 compile */
          brw_shader_params shader_params = base_shader_params;
          shader_params.dispatch_width = 16;
@@ -1783,7 +1782,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
       /* Currently, the compiler only supports SIMD32 on SNB+ */
       if (!has_spilled &&
           dispatch_width_limit >= 32 &&
-          reqd_dispatch_width == SUBGROUP_SIZE_VARYING &&
+          reqd_dispatch_width == 0 &&
           !simd16_failed && INTEL_SIMD(FS, 32) &&
           !prog_data->base.ray_queries) {
          /* Try a SIMD32 compile */
@@ -1818,7 +1817,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
    if (devinfo->ver >= 12 && !has_spilled &&
        max_polygons >= 2 && !key->coarse_pixel &&
-       reqd_dispatch_width == SUBGROUP_SIZE_VARYING) {
+       reqd_dispatch_width == 0) {
       if (devinfo->ver >= 20 && max_polygons >= 4 &&
           dispatch_width_limit >= 32 &&
@@ -1890,7 +1889,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
    /* When the caller compiles a repclear or fast clear shader, they
     * want SIMD16-only.
     */
-   if (reqd_dispatch_width == SUBGROUP_SIZE_REQUIRE_16)
+   if (reqd_dispatch_width == 16)
       v8.reset();

    brw_generator g(compiler, &params->base, &prog_data->base,

View file

@@ -2426,12 +2426,11 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
 static unsigned
 get_subgroup_size(const struct shader_info *info, unsigned max_subgroup_size)
 {
-   switch (info->subgroup_size) {
-   case SUBGROUP_SIZE_API_CONSTANT:
-      /* We have to use the global constant size. */
-      return BRW_SUBGROUP_SIZE;
-
-   case SUBGROUP_SIZE_UNIFORM:
+   if (info->api_subgroup_size) {
+      /* We have to use the global/required constant size. */
+      assert(info->api_subgroup_size >= 8 && info->api_subgroup_size <= 32);
+      return info->api_subgroup_size;
+   } else if (info->api_subgroup_size_draw_uniform) {
       /* It has to be uniform across all invocations but can vary per stage
        * if we want. This gives us a bit more freedom.
        *
@@ -2441,8 +2440,7 @@ get_subgroup_size(const struct shader_info *info, unsigned max_subgroup_size)
        * to be uniform across invocations.
        */
       return max_subgroup_size;
-
-   case SUBGROUP_SIZE_VARYING:
+   } else {
       /* The subgroup size is allowed to be fully varying. For geometry
        * stages, we know it's always 8 which is max_subgroup_size so we can
        * return that. For compute, brw_nir_apply_key is called once per
@@ -2454,25 +2452,7 @@ get_subgroup_size(const struct shader_info *info, unsigned max_subgroup_size)
        * size.
        */
       return info->stage == MESA_SHADER_FRAGMENT ? 0 : max_subgroup_size;
-
-   case SUBGROUP_SIZE_REQUIRE_4:
-      UNREACHABLE("Unsupported subgroup size type");
-
-   case SUBGROUP_SIZE_REQUIRE_8:
-   case SUBGROUP_SIZE_REQUIRE_16:
-   case SUBGROUP_SIZE_REQUIRE_32:
-      /* These enum values are expressly chosen to be equal to the subgroup
-       * size that they require.
-       */
-      return info->subgroup_size;
-
-   case SUBGROUP_SIZE_FULL_SUBGROUPS:
-   case SUBGROUP_SIZE_REQUIRE_64:
-   case SUBGROUP_SIZE_REQUIRE_128:
-      break;
-   }
-
-   UNREACHABLE("Invalid subgroup size type");
+   }
 }

 unsigned

View file

@@ -30,11 +30,8 @@
 unsigned
 brw_required_dispatch_width(const struct shader_info *info)
 {
-   if ((int)info->subgroup_size >= (int)SUBGROUP_SIZE_REQUIRE_8) {
-      /* These enum values are expressly chosen to be equal to the subgroup
-       * size that they require.
-       */
-      return (unsigned)info->subgroup_size;
+   if (info->min_subgroup_size == info->max_subgroup_size) {
+      return info->max_subgroup_size;
    } else {
       return 0;
    }

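A hedged, standalone illustration of the simplified rule above: with the new fields, a dispatch width is required exactly when the allowed range collapses to a single value. The demo_ names and the example sizes are assumptions made for the sketch, not Mesa API.

#include <assert.h>

/* Same shape as the new brw_required_dispatch_width() above, written against
 * a plain struct so it compiles standalone (field names as in this commit). */
struct demo_info {
   unsigned min_subgroup_size;
   unsigned max_subgroup_size;
};

static unsigned
demo_required_dispatch_width(const struct demo_info *info)
{
   if (info->min_subgroup_size == info->max_subgroup_size)
      return info->max_subgroup_size;   /* e.g. a SIMD16-only clear shader */
   else
      return 0;                         /* backend is free to choose */
}

int
main(void)
{
   struct demo_info clear   = { .min_subgroup_size = 16, .max_subgroup_size = 16 };
   struct demo_info varying = { .min_subgroup_size = 8,  .max_subgroup_size = 32 };

   assert(demo_required_dispatch_width(&clear) == 16);
   assert(demo_required_dispatch_width(&varying) == 0);
   return 0;
}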
View file

@@ -1683,12 +1683,11 @@ elk_nir_apply_sampler_key(nir_shader *nir,
 static unsigned
 get_subgroup_size(const struct shader_info *info, unsigned max_subgroup_size)
 {
-   switch (info->subgroup_size) {
-   case SUBGROUP_SIZE_API_CONSTANT:
-      /* We have to use the global constant size. */
-      return ELK_SUBGROUP_SIZE;
-
-   case SUBGROUP_SIZE_UNIFORM:
+   if (info->api_subgroup_size) {
+      /* We have to use the global/required constant size. */
+      assert(info->api_subgroup_size >= 8 && info->api_subgroup_size <= 32);
+      return info->api_subgroup_size;
+   } else if (info->api_subgroup_size_draw_uniform) {
       /* It has to be uniform across all invocations but can vary per stage
        * if we want. This gives us a bit more freedom.
        *
@@ -1698,8 +1697,7 @@ get_subgroup_size(const struct shader_info *info, unsigned max_subgroup_size)
        * to be uniform across invocations.
        */
       return max_subgroup_size;
-
-   case SUBGROUP_SIZE_VARYING:
+   } else {
       /* The subgroup size is allowed to be fully varying. For geometry
        * stages, we know it's always 8 which is max_subgroup_size so we can
        * return that. For compute, elk_nir_apply_key is called once per
@@ -1711,27 +1709,7 @@ get_subgroup_size(const struct shader_info *info, unsigned max_subgroup_size)
        * size.
        */
       return info->stage == MESA_SHADER_FRAGMENT ? 0 : max_subgroup_size;
-
-   case SUBGROUP_SIZE_REQUIRE_4:
-      UNREACHABLE("Unsupported subgroup size type");
-
-   case SUBGROUP_SIZE_REQUIRE_8:
-   case SUBGROUP_SIZE_REQUIRE_16:
-   case SUBGROUP_SIZE_REQUIRE_32:
-      assert(mesa_shader_stage_uses_workgroup(info->stage) ||
-             (info->stage >= MESA_SHADER_RAYGEN && info->stage <= MESA_SHADER_CALLABLE));
-      /* These enum values are expressly chosen to be equal to the subgroup
-       * size that they require.
-       */
-      return info->subgroup_size;
-
-   case SUBGROUP_SIZE_FULL_SUBGROUPS:
-   case SUBGROUP_SIZE_REQUIRE_64:
-   case SUBGROUP_SIZE_REQUIRE_128:
-      break;
-   }
-
-   UNREACHABLE("Invalid subgroup size type");
+   }
 }

 unsigned

View file

@@ -30,12 +30,8 @@
 unsigned
 elk_required_dispatch_width(const struct shader_info *info)
 {
-   if ((int)info->subgroup_size >= (int)SUBGROUP_SIZE_REQUIRE_8) {
-      assert(mesa_shader_stage_uses_workgroup(info->stage));
-      /* These enum values are expressly chosen to be equal to the subgroup
-       * size that they require.
-       */
-      return (unsigned)info->subgroup_size;
+   if (info->min_subgroup_size == info->max_subgroup_size) {
+      return info->max_subgroup_size;
    } else {
       return 0;
    }

View file

@@ -652,29 +652,21 @@ anv_fixup_subgroup_size(struct anv_instance *instance, struct shader_info *info)
     */
    if (instance->assume_full_subgroups &&
        info->uses_wide_subgroup_intrinsics &&
-       info->subgroup_size == SUBGROUP_SIZE_API_CONSTANT &&
+       info->api_subgroup_size == BRW_SUBGROUP_SIZE &&
        local_size &&
-       local_size % BRW_SUBGROUP_SIZE == 0)
-      info->subgroup_size = SUBGROUP_SIZE_FULL_SUBGROUPS;
-
-   /* If the client requests that we dispatch full subgroups but doesn't
-    * allow us to pick a subgroup size, we have to smash it to the API
-    * value of 32. Performance will likely be terrible in this case but
-    * there's nothing we can do about that. The client should have chosen
-    * a size.
-    */
-   if (info->subgroup_size == SUBGROUP_SIZE_FULL_SUBGROUPS)
-      info->subgroup_size =
-         instance->assume_full_subgroups != 0 ?
-         instance->assume_full_subgroups : BRW_SUBGROUP_SIZE;
+       local_size % BRW_SUBGROUP_SIZE == 0) {
+      info->max_subgroup_size = BRW_SUBGROUP_SIZE;
+      info->min_subgroup_size = BRW_SUBGROUP_SIZE;
+   }

    /* Cooperative matrix extension requires that all invocations in a subgroup
     * be active. As a result, when the application does not request a specific
     * subgroup size, we must use SIMD32.
     */
    if (info->stage == MESA_SHADER_COMPUTE && info->cs.has_cooperative_matrix &&
-       info->subgroup_size < SUBGROUP_SIZE_REQUIRE_8) {
-      info->subgroup_size = BRW_SUBGROUP_SIZE;
+       info->max_subgroup_size > info->min_subgroup_size) {
+      info->api_subgroup_size = info->max_subgroup_size;
+      info->min_subgroup_size = info->max_subgroup_size;
    }
 }
@@ -1244,7 +1236,7 @@ anv_shader_lower_nir(struct anv_device *device,
    if (nir->info.stage == MESA_SHADER_COMPUTE &&
        nir->info.cs.has_cooperative_matrix) {
       anv_fixup_subgroup_size(pdevice->instance, &nir->info);
-      NIR_PASS(_, nir, brw_nir_lower_cmat, nir->info.subgroup_size);
+      NIR_PASS(_, nir, brw_nir_lower_cmat, nir->info.api_subgroup_size);
       NIR_PASS(_, nir, nir_lower_indirect_derefs, nir_var_function_temp, 16);
    }

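A condensed, hypothetical sketch of the two fixups above under the new representation. The demo_ names are stand-ins, the value 32 comes from the removed comment about the API subgroup size, and the real anv_fixup_subgroup_size() also checks wide-subgroup intrinsics, workgroup-size divisibility, and that the stage is compute.

#include <stdbool.h>
#include <stdio.h>

#define DEMO_BRW_SUBGROUP_SIZE 32  /* API subgroup size per the removed comment */

struct demo_shader_info {
   unsigned api_subgroup_size;
   unsigned min_subgroup_size;
   unsigned max_subgroup_size;
};

/* Mirrors only the range updates from the fixups above, with the surrounding
 * driver state reduced to two booleans for illustration. */
static void
demo_fixup_subgroup_size(struct demo_shader_info *info,
                         bool assume_full_subgroups,
                         bool has_cooperative_matrix)
{
   if (assume_full_subgroups &&
       info->api_subgroup_size == DEMO_BRW_SUBGROUP_SIZE) {
      info->max_subgroup_size = DEMO_BRW_SUBGROUP_SIZE;
      info->min_subgroup_size = DEMO_BRW_SUBGROUP_SIZE;
   }

   /* Cooperative matrices need every invocation active: if the range is
    * still open, collapse it to the largest size. */
   if (has_cooperative_matrix &&
       info->max_subgroup_size > info->min_subgroup_size) {
      info->api_subgroup_size = info->max_subgroup_size;
      info->min_subgroup_size = info->max_subgroup_size;
   }
}

int
main(void)
{
   struct demo_shader_info info = { .api_subgroup_size = 0,
                                    .min_subgroup_size = 8,
                                    .max_subgroup_size = 32 };
   demo_fixup_subgroup_size(&info, false, true);
   printf("%u %u %u\n", info.api_subgroup_size,
          info.min_subgroup_size, info.max_subgroup_size);  /* prints: 32 32 32 */
   return 0;
}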
View file

@@ -360,10 +360,10 @@ anv_device_init_rt_shaders(struct anv_device *device)
    nir_shader *trampoline_nir =
       brw_nir_create_raygen_trampoline(device->physical->compiler, tmp_ctx);

-   if (device->info->ver >= 20)
-      trampoline_nir->info.subgroup_size = SUBGROUP_SIZE_REQUIRE_16;
-   else
-      trampoline_nir->info.subgroup_size = SUBGROUP_SIZE_REQUIRE_8;
+   unsigned require_size = device->info->ver >= 20 ? 16 : 8;
+   trampoline_nir->info.api_subgroup_size = require_size;
+   trampoline_nir->info.max_subgroup_size = require_size;
+   trampoline_nir->info.min_subgroup_size = require_size;

    struct brw_cs_prog_data trampoline_prog_data = {
       .uses_btd_stack_ids = true,

View file

@@ -1516,21 +1516,12 @@ anv_pipeline_compile_cs(struct anv_compute_pipeline *pipeline,
     */
    if (device->physical->instance->assume_full_subgroups &&
        stage.nir->info.uses_wide_subgroup_intrinsics &&
-       stage.nir->info.subgroup_size == SUBGROUP_SIZE_API_CONSTANT &&
+       stage.nir->info.api_subgroup_size == ELK_SUBGROUP_SIZE &&
        local_size &&
-       local_size % ELK_SUBGROUP_SIZE == 0)
-      stage.nir->info.subgroup_size = SUBGROUP_SIZE_FULL_SUBGROUPS;
-
-   /* If the client requests that we dispatch full subgroups but doesn't
-    * allow us to pick a subgroup size, we have to smash it to the API
-    * value of 32. Performance will likely be terrible in this case but
-    * there's nothing we can do about that. The client should have chosen
-    * a size.
-    */
-   if (stage.nir->info.subgroup_size == SUBGROUP_SIZE_FULL_SUBGROUPS)
-      stage.nir->info.subgroup_size =
-         device->physical->instance->assume_full_subgroups != 0 ?
-         device->physical->instance->assume_full_subgroups : ELK_SUBGROUP_SIZE;
+       local_size % ELK_SUBGROUP_SIZE == 0) {
+      stage.nir->info.max_subgroup_size = ELK_SUBGROUP_SIZE;
+      stage.nir->info.min_subgroup_size = ELK_SUBGROUP_SIZE;
+   }

    stage.num_stats = 1;