radeonsi: Use one more bit for number of patches in TCS offchip layout.

There was one spare bit left in the TCS offchip layout (bit 16, previously reserved for future use), so use it to widen the "number of patches" field from 6 to 7 bits. In the future, this may allow increasing the maximum number of patches per workgroup.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28425>
2026-05-05 00:58:05 +02:00 · 2024-03-30 02:01:03 +01:00 · 2024-03-30 02:01:03 +01:00 · b34e99d021
commit b34e99d021
parent 04dea4aef2
3 changed files with 11 additions and 19 deletions
--- a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c
+++ b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c
@ -338,9 +338,9 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s
   }
   case nir_intrinsic_load_patch_vertices_in:
      if (stage == MESA_SHADER_TESS_CTRL)
-         replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 11, 5);
+         replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 12, 5);
      else if (stage == MESA_SHADER_TESS_EVAL) {
-         replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 6, 5);
+         replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 7, 5);
      } else
         unreachable("no nir_load_patch_vertices_in");
      replacement = nir_iadd_imm(b, replacement, 1);
@ -372,7 +372,7 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s
      }
      break;
   case nir_intrinsic_load_tcs_num_patches_amd: {
-      nir_def *tmp = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 6);
+      nir_def *tmp = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 7);
      replacement = nir_iadd_imm(b, tmp, 1);
      break;
   }
@ -387,12 +387,12 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s
      } else {
         nir_def *num_hs_out = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 23, 6);
         nir_def *out_vtx_size = nir_ishl_imm(b, num_hs_out, 4);
-         nir_def *o = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 6, 5);
+         nir_def *o = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 7, 5);
         nir_def *out_vtx_per_patch = nir_iadd_imm_nuw(b, o, 1);
         per_vtx_out_patch_size = nir_imul(b, out_vtx_per_patch, out_vtx_size);
      }

-      nir_def *p = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 6);
+      nir_def *p = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 7);
      nir_def *num_patches = nir_iadd_imm_nuw(b, p, 1);
      replacement = nir_imul(b, per_vtx_out_patch_size, num_patches);
      break;
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@ -51,14 +51,12 @@ struct si_shader_args {

   /* API TCS & TES */
   /* Layout of TCS outputs in the offchip buffer
-    * # 6 bits
-    *   [0:5] = the number of patches per threadgroup - 1, max = 63
+    * # 7 bits
+    *   [0:6] = the number of patches per threadgroup - 1, max = 127
    * # 5 bits
-    *   [6:10] = the number of output vertices per patch - 1, max = 31
+    *   [7:11] = the number of output vertices per patch - 1, max = 31
    * # 5 bits
-    *   [11:15] = the number of input vertices per patch - 1, max = 31 (TCS only)
-    * # 1 bit
-    *   [16] = reserved for future use
+    *   [12:16] = the number of input vertices per patch - 1, max = 31 (TCS only)
    * # 6 bits
    *   [17:22] = the number of LS outputs, max = 63
    * # 6 bits
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@ -4587,18 +4587,12 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
      si_mark_atom_dirty(sctx, &sctx->atoms.s.vgt_pipeline_state);
   }

-   unsigned output_patch0_offset = input_patch_size * num_patches;
-   unsigned perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
-
   /* Compute userdata SGPRs. */
-   assert(((input_vertex_size / 4) & ~0xff) == 0);
-   assert(((perpatch_output_offset / 4) & ~0xffff) == 0);
   assert(num_tcs_input_cp <= 32);
   assert(num_tcs_output_cp <= 32);
-   assert(num_patches <= 64);
+   assert(num_patches <= 128);
   assert(num_vs_outputs <= 63);
   assert(num_tcs_outputs <= 63);
-   assert(((pervertex_output_patch_size * num_patches) & ~0xffff) == 0);

   uint64_t ring_va =
      sctx->ws->cs_is_secure(&sctx->gfx_cs) ?
@ -4609,7 +4603,7 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
   sctx->tes_offchip_ring_va_sgpr = ring_va;
   sctx->tcs_offchip_layout &= 0xe0000000;
   sctx->tcs_offchip_layout |=
-      (num_patches - 1) | ((num_tcs_output_cp - 1) << 6) | ((num_tcs_input_cp - 1) << 11) |
+      (num_patches - 1) | ((num_tcs_output_cp - 1) << 7) | ((num_tcs_input_cp - 1) << 12) |
      (num_vs_outputs << 17) | (num_tcs_outputs << 23);

   /* Compute the LDS size. */