brw/nir_lower_cs_intrinsics: do some math at 16-bit

There are fewer than 2^16 lanes within a threadgroup, so it is safe to do
all of this math at 16-bit. That lets us use 16-bit integer division, whose
lowering is much cheaper than the 32-bit one (sketched in standalone C below).

In a "hello world" kernel with variable wg size, simd32 goes 72 inst -> 57
inst on jay and 82 -> 67 inst on brw.

OTOH it's a loss for non-variable wg sizes, since the divisions optimize
away there, so only do this in the variable case to avoid unwelcome stats
regressions on Vulkan.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41084>
Alyssa Rosenzweig 2026-04-21 13:54:25 -04:00 committed by Marge Bot
parent e4c9d57ddf
commit bccaeb28bb
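
For intuition, here's a standalone C sketch of the narrowing idea. It is not
part of the patch: the helper name and the scalar formulation are invented,
and it ignores the 2x2 quad tiling the pass also performs. Because a
threadgroup has fewer than 2^16 invocations, the linear index, the workgroup
dimensions, and every intermediate fit in uint16_t, so the div/mod chain that
recovers the local ID can run entirely at 16-bit:

#include <stdint.h>

/* Hypothetical scalar model of what the pass emits for a variable
 * workgroup size: recover the local invocation ID from a linear lane
 * index using only 16-bit div/mod. Quad tiling omitted for clarity. */
static void
local_id_from_linear(uint16_t linear, const uint16_t size[3], uint16_t id[3])
{
   /* size[0] * size[1] * size[2] < 2^16, so this cannot overflow. */
   uint16_t size_xy = (uint16_t)(size[0] * size[1]);

   id[2] = (uint16_t)(linear / size_xy);   /* 16-bit udiv lowers cheaply */
   uint16_t rem = (uint16_t)(linear % size_xy);
   id[1] = (uint16_t)(rem / size[0]);
   id[0] = (uint16_t)(rem % size[0]);
}

The win comes from the division lowering: without native integer division,
udiv/umod expand into instruction sequences whose cost grows with operand
width, so halving the width shrinks the expansion, matching the instruction
counts quoted above.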


@@ -77,15 +77,25 @@ compute_local_index_id(struct lower_intrinsics_state *state, nir_intrinsic_instr
       linear = nir_iadd(b, channel, thread_local_id);
    }
 
+   /* There are less than 2^16 lanes within a threadgroup, so it is safe to do
+    * all math at 16-bit. This allows us to use 16-bit integer division which is
+    * much faster than 32-bit integer division (in terms of the lowerings).
+    *
+    * However, if the workgroup size is known at compile-time, the divisions
+    * will optimize away and downconverting ends up as a loss.
+    */
+   unsigned bit_size = nir->info.workgroup_size_variable ? 16 : 32;
+   linear = nir_u2uN(b, linear, bit_size);
+
    nir_def *size_x;
    nir_def *size_y;
    if (nir->info.workgroup_size_variable) {
-      nir_def *size_xyz = nir_load_workgroup_size(b);
+      nir_def *size_xyz = nir_u2uN(b, nir_load_workgroup_size(b), bit_size);
       size_x = nir_channel(b, size_xyz, 0);
       size_y = nir_channel(b, size_xyz, 1);
    } else {
-      size_x = nir_imm_int(b, nir->info.workgroup_size[0]);
-      size_y = nir_imm_int(b, nir->info.workgroup_size[1]);
+      size_x = nir_imm_intN_t(b, nir->info.workgroup_size[0], bit_size);
+      size_y = nir_imm_intN_t(b, nir->info.workgroup_size[1], bit_size);
    }
 
    nir_def *size_xy = nir_imul(b, size_x, size_y);
@@ -171,23 +181,17 @@ compute_local_index_id(struct lower_intrinsics_state *state, nir_intrinsic_instr
        * Then map that into local invocation ID (trivial) and local
        * invocation index. Skipping Z simplify index calculation.
        */
-      nir_def *one = nir_imm_int(b, 1);
-      nir_def *double_size_x = nir_ishl(b, size_x, one);
+      nir_def *double_size_x = nir_ishl_imm(b, size_x, 1);
 
       /* ID within a pair of rows, where each group of 4 is 2x2 quad. */
       nir_def *row_pair_id = nir_umod(b, linear, double_size_x);
       nir_def *y_row_pairs = nir_udiv(b, linear, double_size_x);
 
-      nir_def *x =
-         nir_ior(b,
-                 nir_iand(b, row_pair_id, one),
-                 nir_iand(b, nir_ishr(b, row_pair_id, one),
-                          nir_imm_int(b, 0xfffffffe)));
-      nir_def *y =
-         nir_ior(b,
-                 nir_ishl(b, y_row_pairs, one),
-                 nir_iand(b, nir_ishr(b, row_pair_id, one), one));
+      nir_def *row_pair_id_shr_1 = nir_ishr_imm(b, row_pair_id, 1);
+
+      nir_def *x = nir_ior(b, nir_iand_imm(b, row_pair_id, 1),
+                           nir_iand_imm(b, row_pair_id_shr_1, 0xfffffffe));
+      nir_def *y = nir_ior(b, nir_ishl_imm(b, y_row_pairs, 1),
+                           nir_iand_imm(b, row_pair_id_shr_1, 1));
 
       state->local_id = nir_vec3(b, x,
                                  nir_umod(b, y, size_y),
@@ -270,9 +274,7 @@ lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
          continue;
       }
 
-      if (intrinsic->def.bit_size == 64)
-         sysval = nir_u2u64(b, sysval);
+      sysval = nir_u2uN(b, sysval, intrinsic->def.bit_size);
 
       nir_def_replace(&intrinsic->def, sysval);
       state->progress = true;
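
As a sanity check on the second hunk's bit manipulation, here's a throwaway
C program that re-derives the 2x2 quad layout. It is a scalar mirror of the
NIR chain, with size_x = 8 picked arbitrarily; note the patch's 0xfffffffe
mask truncates to 0xfffe at 16-bit:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
   const uint16_t size_x = 8;               /* arbitrary example width */
   const uint16_t double_size_x = size_x << 1;

   for (uint16_t linear = 0; linear < 2u * double_size_x; linear++) {
      /* ID within a pair of rows, where each group of 4 is a 2x2 quad. */
      uint16_t row_pair_id = linear % double_size_x;
      uint16_t y_row_pairs = linear / double_size_x;
      uint16_t shr1 = row_pair_id >> 1;

      /* Bit 0 of x is bit 0 of the lane; the remaining bits of shr1
       * select which quad column we are in. */
      uint16_t x = (row_pair_id & 1) | (shr1 & 0xfffe);
      /* Bit 0 of y is bit 1 of the lane; the upper bits count row pairs. */
      uint16_t y = (y_row_pairs << 1) | (shr1 & 1);

      printf("lane %2u -> (%u, %u)\n", (unsigned)linear, (unsigned)x,
             (unsigned)y);
   }

   return 0;
}

Running it shows lanes 0-3 landing on one 2x2 quad, lanes 4-7 on the next,
and so on, which is exactly the layout the comment in the pass describes.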