diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c
index ad67e695701..7d7a07eaa4b 100644
--- a/src/freedreno/ir3/ir3.c
+++ b/src/freedreno/ir3/ir3.c
@@ -115,6 +115,12 @@ ir3_should_double_threadsize(struct ir3_shader_variant *v, unsigned regs_count)
 {
    const struct ir3_compiler *compiler = v->shader->compiler;
 
+   /* If the user forced a particular wavesize, respect that. */
+   if (v->shader->real_wavesize == IR3_SINGLE_ONLY)
+      return false;
+   if (v->shader->real_wavesize == IR3_DOUBLE_ONLY)
+      return true;
+
    /* We can't support more than compiler->branchstack_size diverging threads
     * in a wave. Thus, doubling the threadsize is only possible if we don't
     * exceed the branchstack size limit.
diff --git a/src/freedreno/ir3/ir3_disk_cache.c b/src/freedreno/ir3/ir3_disk_cache.c
index 65d40d7a460..a629e51e5e5 100644
--- a/src/freedreno/ir3/ir3_disk_cache.c
+++ b/src/freedreno/ir3/ir3_disk_cache.c
@@ -90,6 +90,11 @@ ir3_disk_cache_init_shader_key(struct ir3_compiler *compiler,
    _mesa_sha1_update(&ctx, blob.data, blob.size);
    blob_finish(&blob);
 
+   _mesa_sha1_update(&ctx, &shader->api_wavesize,
+                     sizeof(shader->api_wavesize));
+   _mesa_sha1_update(&ctx, &shader->real_wavesize,
+                     sizeof(shader->real_wavesize));
+
    /* Note that on some gens stream-out is lowered in ir3 to stg.  For later
     * gens we maybe don't need to include stream-out in the cache key.
     */
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index a6c1b3a377e..15b6b3f9698 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -533,11 +533,39 @@ ir3_nir_post_finalize(struct ir3_shader *shader)
    if ((s->info.stage == MESA_SHADER_COMPUTE) ||
        (s->info.stage == MESA_SHADER_KERNEL) ||
        compiler->has_getfiberid) {
+      /* If the API-facing subgroup size is forced to a particular value, lower
+       * it here. Beyond this point nir_intrinsic_load_subgroup_size will return
+       * the "real" subgroup size.
+       */
+      unsigned subgroup_size = 0, max_subgroup_size = 0;
+      switch (shader->api_wavesize) {
+      case IR3_SINGLE_ONLY:
+         subgroup_size = max_subgroup_size = compiler->threadsize_base;
+         break;
+      case IR3_DOUBLE_ONLY:
+         subgroup_size = max_subgroup_size = compiler->threadsize_base * 2;
+         break;
+      case IR3_SINGLE_OR_DOUBLE:
+         /* For vertex stages, we know the wavesize will never be doubled.
+          * Lower subgroup_size here, to avoid having to deal with it when
+          * translating from NIR. Otherwise use the "real" wavesize obtained as
+          * a driver param.
+          */
+         if (s->info.stage != MESA_SHADER_COMPUTE &&
+             s->info.stage != MESA_SHADER_FRAGMENT) {
+            subgroup_size = max_subgroup_size = compiler->threadsize_base;
+         } else {
+            subgroup_size = 0;
+            max_subgroup_size = compiler->threadsize_base * 2;
+         }
+         break;
+      }
+
       OPT(s, nir_lower_subgroups,
           &(nir_lower_subgroups_options){
-             .subgroup_size = 128,
+             .subgroup_size = subgroup_size,
              .ballot_bit_size = 32,
-             .ballot_components = 4,
+             .ballot_components = max_subgroup_size / 32,
              .lower_to_scalar = true,
              .lower_vote_eq = true,
              .lower_subgroup_masks = true,
diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c
index 0fd31eacae0..2897163b2f9 100644
--- a/src/freedreno/ir3/ir3_ra.c
+++ b/src/freedreno/ir3/ir3_ra.c
@@ -2295,6 +2295,15 @@ ir3_ra(struct ir3_shader_variant *v)
       calc_limit_pressure_for_cs_with_barrier(v, &limit_pressure);
    }
 
+   /* If the user forces a doubled threadsize, clamp (never raise) the limit:
+    * on some gens the register file is not big enough to hold a double-size
+    * wave with all 48 registers in use.
+    */
+   if (v->shader->real_wavesize == IR3_DOUBLE_ONLY) {
+      limit_pressure.full =
+         MIN2(limit_pressure.full, ctx->compiler->reg_size_vec4 / 2 * 16);
+   }
+
    /* If requested, lower the limit so that spilling happens more often. */
    if (ir3_shader_debug & IR3_DBG_SPILLALL)
       calc_min_limit_pressure(v, live, &limit_pressure);
diff --git a/src/freedreno/ir3/ir3_shader.c b/src/freedreno/ir3/ir3_shader.c
index c22cae603ba..dbdc6178688 100644
--- a/src/freedreno/ir3/ir3_shader.c
+++ b/src/freedreno/ir3/ir3_shader.c
@@ -596,6 +596,8 @@ ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir,
       memcpy(&shader->stream_output, stream_output,
              sizeof(shader->stream_output));
    shader->num_reserved_user_consts = options->reserved_user_consts;
+   shader->api_wavesize = options->api_wavesize;
+   shader->real_wavesize = options->real_wavesize;
    shader->nir = nir;
 
    ir3_disk_cache_init_shader_key(compiler, shader);
diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h
index 4f35f007bef..f8ecd69604f 100644
--- a/src/freedreno/ir3/ir3_shader.h
+++ b/src/freedreno/ir3/ir3_shader.h
@@ -92,6 +92,13 @@ enum ir3_bary {
    IJ_COUNT,
 };
 
+/* Description of what wavesizes are allowed. */
+enum ir3_wavesize_option {
+   IR3_SINGLE_ONLY,      /* single wavesize only (this is the zero value) */
+   IR3_SINGLE_OR_DOUBLE, /* either the single or the doubled wavesize */
+   IR3_DOUBLE_ONLY,      /* doubled wavesize only */
+};
+
 /**
  * Description of a lowered UBO.
  */
@@ -757,6 +764,17 @@ struct ir3_shader {
 
    unsigned num_reserved_user_consts;
 
+   /* What API-visible wavesizes are allowed. Even if only double wavesize is
+    * allowed, we may still use the smaller wavesize "under the hood" and the
+    * application simply sees the upper half as always disabled.
+    */
+   enum ir3_wavesize_option api_wavesize;
+
+   /* What wavesizes we're allowed to actually use. If the API wavesize is
+    * single-only, then this must be single-only too.
+    */
+   enum ir3_wavesize_option real_wavesize;
+
    bool nir_finalized;
    struct nir_shader *nir;
    struct ir3_stream_output_info stream_output;
@@ -822,6 +840,7 @@ ir3_shader_get_variant(struct ir3_shader *shader,
 
 struct ir3_shader_options {
    unsigned reserved_user_consts;
+   enum ir3_wavesize_option api_wavesize, real_wavesize; /* see ir3_shader */
 };
 
 struct ir3_shader *
diff --git a/src/freedreno/vulkan/tu_clear_blit.c b/src/freedreno/vulkan/tu_clear_blit.c
index fece4886c08..988c7d11a74 100644
--- a/src/freedreno/vulkan/tu_clear_blit.c
+++ b/src/freedreno/vulkan/tu_clear_blit.c
@@ -549,6 +549,8 @@ compile_shader(struct tu_device *dev, struct nir_shader *nir,
 
    struct ir3_shader *sh =
       ir3_shader_from_nir(dev->compiler, nir, &(struct ir3_shader_options) {
+                              .api_wavesize = IR3_SINGLE_OR_DOUBLE,
+                              .real_wavesize = IR3_SINGLE_OR_DOUBLE,
                               .reserved_user_consts = align(consts, 4),
                           }, NULL);
 
diff --git a/src/freedreno/vulkan/tu_shader.c b/src/freedreno/vulkan/tu_shader.c
index ef8d732b56b..366e18e562a 100644
--- a/src/freedreno/vulkan/tu_shader.c
+++ b/src/freedreno/vulkan/tu_shader.c
@@ -787,6 +787,8 @@ tu_shader_create(struct tu_device *dev,
    shader->ir3_shader =
       ir3_shader_from_nir(dev->compiler, nir, &(struct ir3_shader_options) {
                            .reserved_user_consts = align(shader->push_consts.count, 4),
+                           .api_wavesize = IR3_DOUBLE_ONLY,
+                           .real_wavesize = IR3_SINGLE_OR_DOUBLE,
                           }, &so_info);
 
    return shader;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
index 4ee1e056b55..4ac1b67aee5 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
@@ -308,7 +308,13 @@ ir3_shader_compute_state_create(struct pipe_context *pctx,
    }
 
    struct ir3_shader *shader =
-      ir3_shader_from_nir(compiler, nir, &(struct ir3_shader_options){}, NULL);
+      ir3_shader_from_nir(compiler, nir, &(struct ir3_shader_options){
+                              /* TODO: force to single on a6xx with legacy
+                               * ballot extension that uses 64-bit masks
+                               */
+                              .api_wavesize = IR3_SINGLE_OR_DOUBLE,
+                              .real_wavesize = IR3_SINGLE_OR_DOUBLE,
+                          }, NULL);
    shader->cs.req_input_mem = align(cso->req_input_mem, 4) / 4;     /* byte->dword */
    shader->cs.req_local_mem = cso->req_local_mem;
 
@@ -369,7 +375,13 @@ ir3_shader_state_create(struct pipe_context *pctx,
    copy_stream_out(&stream_output, &cso->stream_output);
 
    hwcso->shader =
-      ir3_shader_from_nir(compiler, nir, &(struct ir3_shader_options){},
+      ir3_shader_from_nir(compiler, nir, &(struct ir3_shader_options){
+                              /* TODO: force to single on a6xx with legacy
+                               * ballot extension that uses 64-bit masks
+                               */
+                              .api_wavesize = IR3_SINGLE_OR_DOUBLE,
+                              .real_wavesize = IR3_SINGLE_OR_DOUBLE,
+                          },
                           &stream_output);
 
    /*