diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c index ad67e695701..7d7a07eaa4b 100644 --- a/src/freedreno/ir3/ir3.c +++ b/src/freedreno/ir3/ir3.c @@ -115,6 +115,12 @@ ir3_should_double_threadsize(struct ir3_shader_variant *v, unsigned regs_count) { const struct ir3_compiler *compiler = v->shader->compiler; + /* If the user forced a particular wavesize respect that. */ + if (v->shader->real_wavesize == IR3_SINGLE_ONLY) + return false; + if (v->shader->real_wavesize == IR3_DOUBLE_ONLY) + return true; + /* We can't support more than compiler->branchstack_size diverging threads * in a wave. Thus, doubling the threadsize is only possible if we don't * exceed the branchstack size limit. diff --git a/src/freedreno/ir3/ir3_disk_cache.c b/src/freedreno/ir3/ir3_disk_cache.c index 65d40d7a460..a629e51e5e5 100644 --- a/src/freedreno/ir3/ir3_disk_cache.c +++ b/src/freedreno/ir3/ir3_disk_cache.c @@ -90,6 +90,11 @@ ir3_disk_cache_init_shader_key(struct ir3_compiler *compiler, _mesa_sha1_update(&ctx, blob.data, blob.size); blob_finish(&blob); + _mesa_sha1_update(&ctx, &shader->api_wavesize, + sizeof(shader->api_wavesize)); + _mesa_sha1_update(&ctx, &shader->real_wavesize, + sizeof(shader->real_wavesize)); + /* Note that on some gens stream-out is lowered in ir3 to stg. For later * gens we maybe don't need to include stream-out in the cache key. */ diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c index a6c1b3a377e..15b6b3f9698 100644 --- a/src/freedreno/ir3/ir3_nir.c +++ b/src/freedreno/ir3/ir3_nir.c @@ -533,11 +533,39 @@ ir3_nir_post_finalize(struct ir3_shader *shader) if ((s->info.stage == MESA_SHADER_COMPUTE) || (s->info.stage == MESA_SHADER_KERNEL) || compiler->has_getfiberid) { + /* If the API-facing subgroup size is forced to a particular value, lower + * it here. Beyond this point nir_intrinsic_load_subgroup_size will return + * the "real" subgroup size. 
+ */ + unsigned subgroup_size = 0, max_subgroup_size = 0; + switch (shader->api_wavesize) { + case IR3_SINGLE_ONLY: + subgroup_size = max_subgroup_size = compiler->threadsize_base; + break; + case IR3_DOUBLE_ONLY: + subgroup_size = max_subgroup_size = compiler->threadsize_base * 2; + break; + case IR3_SINGLE_OR_DOUBLE: + /* For vertex stages, we know the wavesize will never be doubled. + * Lower subgroup_size here, to avoid having to deal with it when + * translating from NIR. Otherwise use the "real" wavesize obtained as + * a driver param. + */ + if (s->info.stage != MESA_SHADER_COMPUTE && + s->info.stage != MESA_SHADER_FRAGMENT) { + subgroup_size = max_subgroup_size = compiler->threadsize_base; + } else { + subgroup_size = 0; + max_subgroup_size = compiler->threadsize_base * 2; + } + break; + } + OPT(s, nir_lower_subgroups, &(nir_lower_subgroups_options){ - .subgroup_size = 128, + .subgroup_size = subgroup_size, .ballot_bit_size = 32, - .ballot_components = 4, + .ballot_components = max_subgroup_size / 32, .lower_to_scalar = true, .lower_vote_eq = true, .lower_subgroup_masks = true, diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c index 0fd31eacae0..2897163b2f9 100644 --- a/src/freedreno/ir3/ir3_ra.c +++ b/src/freedreno/ir3/ir3_ra.c @@ -2295,6 +2295,15 @@ ir3_ra(struct ir3_shader_variant *v) calc_limit_pressure_for_cs_with_barrier(v, &limit_pressure); } + /* If the user forces a doubled threadsize, we may have to lower the limit + * because on some gens the register file is not big enough to hold a + * double-size wave with all 48 registers in use. + */ + if (v->shader->real_wavesize == IR3_DOUBLE_ONLY) { + limit_pressure.full = + MIN2(limit_pressure.full, ctx->compiler->reg_size_vec4 / 2 * 16); + } + /* If requested, lower the limit so that spilling happens more often.
*/ if (ir3_shader_debug & IR3_DBG_SPILLALL) calc_min_limit_pressure(v, live, &limit_pressure); diff --git a/src/freedreno/ir3/ir3_shader.c b/src/freedreno/ir3/ir3_shader.c index c22cae603ba..dbdc6178688 100644 --- a/src/freedreno/ir3/ir3_shader.c +++ b/src/freedreno/ir3/ir3_shader.c @@ -596,6 +596,8 @@ ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir, memcpy(&shader->stream_output, stream_output, sizeof(shader->stream_output)); shader->num_reserved_user_consts = options->reserved_user_consts; + shader->api_wavesize = options->api_wavesize; + shader->real_wavesize = options->real_wavesize; shader->nir = nir; ir3_disk_cache_init_shader_key(compiler, shader); diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h index 4f35f007bef..f8ecd69604f 100644 --- a/src/freedreno/ir3/ir3_shader.h +++ b/src/freedreno/ir3/ir3_shader.h @@ -92,6 +92,13 @@ enum ir3_bary { IJ_COUNT, }; +/* Description of what wavesizes are allowed. */ +enum ir3_wavesize_option { + IR3_SINGLE_ONLY, + IR3_SINGLE_OR_DOUBLE, + IR3_DOUBLE_ONLY, +}; + /** * Description of a lowered UBO. */ @@ -757,6 +764,17 @@ struct ir3_shader { unsigned num_reserved_user_consts; + /* What API-visible wavesizes are allowed. Even if only double wavesize is + * allowed, we may still use the smaller wavesize "under the hood" and the + * application simply sees the upper half as always disabled. + */ + enum ir3_wavesize_option api_wavesize; + + /* What wavesizes we're allowed to actually use. If the API wavesize is + * single-only, then this must be single-only too. 
+ */ + enum ir3_wavesize_option real_wavesize; + bool nir_finalized; struct nir_shader *nir; struct ir3_stream_output_info stream_output; @@ -822,6 +840,7 @@ ir3_shader_get_variant(struct ir3_shader *shader, struct ir3_shader_options { unsigned reserved_user_consts; + enum ir3_wavesize_option api_wavesize, real_wavesize; }; struct ir3_shader * diff --git a/src/freedreno/vulkan/tu_clear_blit.c b/src/freedreno/vulkan/tu_clear_blit.c index fece4886c08..988c7d11a74 100644 --- a/src/freedreno/vulkan/tu_clear_blit.c +++ b/src/freedreno/vulkan/tu_clear_blit.c @@ -549,6 +549,8 @@ compile_shader(struct tu_device *dev, struct nir_shader *nir, struct ir3_shader *sh = ir3_shader_from_nir(dev->compiler, nir, &(struct ir3_shader_options) { + .api_wavesize = IR3_SINGLE_OR_DOUBLE, + .real_wavesize = IR3_SINGLE_OR_DOUBLE, .reserved_user_consts = align(consts, 4), }, NULL); diff --git a/src/freedreno/vulkan/tu_shader.c b/src/freedreno/vulkan/tu_shader.c index ef8d732b56b..366e18e562a 100644 --- a/src/freedreno/vulkan/tu_shader.c +++ b/src/freedreno/vulkan/tu_shader.c @@ -787,6 +787,8 @@ tu_shader_create(struct tu_device *dev, shader->ir3_shader = ir3_shader_from_nir(dev->compiler, nir, &(struct ir3_shader_options) { .reserved_user_consts = align(shader->push_consts.count, 4), + .api_wavesize = IR3_DOUBLE_ONLY, + .real_wavesize = IR3_SINGLE_OR_DOUBLE, }, &so_info); return shader; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c index 4ee1e056b55..4ac1b67aee5 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c @@ -308,7 +308,13 @@ ir3_shader_compute_state_create(struct pipe_context *pctx, } struct ir3_shader *shader = - ir3_shader_from_nir(compiler, nir, &(struct ir3_shader_options){}, NULL); + ir3_shader_from_nir(compiler, nir, &(struct ir3_shader_options){ + /* TODO: force to single on a6xx with legacy + * ballot extension that uses 64-bit masks + */ + 
.api_wavesize = IR3_SINGLE_OR_DOUBLE, + .real_wavesize = IR3_SINGLE_OR_DOUBLE, + }, NULL); shader->cs.req_input_mem = align(cso->req_input_mem, 4) / 4; /* byte->dword */ shader->cs.req_local_mem = cso->req_local_mem; @@ -369,7 +375,13 @@ ir3_shader_state_create(struct pipe_context *pctx, copy_stream_out(&stream_output, &cso->stream_output); hwcso->shader = - ir3_shader_from_nir(compiler, nir, &(struct ir3_shader_options){}, + ir3_shader_from_nir(compiler, nir, &(struct ir3_shader_options){ + /* TODO: force to single on a6xx with legacy + * ballot extension that uses 64-bit masks + */ + .api_wavesize = IR3_SINGLE_OR_DOUBLE, + .real_wavesize = IR3_SINGLE_OR_DOUBLE, + }, &stream_output); /*