intel: Improve N-way pixel hashing computation to handle pixel pipes with asymmetric processing power.

This reworks the intel_compute_pixel_hash_table_nway() pixel pipe
hashing table computation helper to handle cases where some pixel
pipes have processing power different from the others, this is helpful
for Gfx12.7+ platforms where there are pixel pipes with 1 DSS as well
as pixel pipes with 2 DSSes, which currently can lead to a serious
performance bottleneck in the pixel pipes with lower processing power.

In order to avoid such a load imbalance the
intel_compute_pixel_hash_table_nway() function will now take two pixel
pipe bitsets instead of one: Pixel pipes enabled on both bitsets will
appear with twice the frequency on the table as pixel pipes which only
appear on one bitset.  See the comments below for more details on the
algorithm used to construct a pixel hashing table with the desired
properties.

With this change rendering performance improves by about 25% on a
fused MTL platform -- the list of specific configs this is expected to
show an improvement on is not included here since the list is rather
long and some of the configs may still be embargoed or may never be
productized, but in order to find out whether your Gfx12.7+ device
could be affected by this you can check the output of the
intel_dev_info tool from the Mesa tree and see if there are multiple
"pixel pipe" entries with different DSS count.  That isn't expected to
occur on any DG2 configuration, only on MTL+ platforms, so this change
should have no effect at all on DG2 (it's easy to convince oneself
that it won't since for DG2 mask1 should equal mask2 so mask2 will be
set to zero at the beginning of intel_compute_pixel_hash_table_nway()
and the new swzx[] permutation will be set to the identity).

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26266>
This commit is contained in:
Francisco Jerez 2023-11-17 20:40:36 -08:00 committed by Marge Bot
parent 9033df070e
commit 6a810b0ba8
3 changed files with 142 additions and 32 deletions

View file

@ -990,12 +990,19 @@ upload_pixel_hashing_tables(struct iris_batch *batch)
0, size, PIPE_MAP_WRITE,
&transfer);
uint32_t ppipe_mask = 0;
/* Calculate the set of present pixel pipes, and another set of
* present pixel pipes with 2 dual subslices enabled, the latter
* will appear on the hashing table with twice the frequency of
* pixel pipes with a single dual subslice present.
*/
uint32_t ppipe_mask1 = 0, ppipe_mask2 = 0;
for (unsigned p = 0; p < ARRAY_SIZE(devinfo->ppipe_subslices); p++) {
if (devinfo->ppipe_subslices[p])
ppipe_mask |= (1u << p);
ppipe_mask1 |= (1u << p);
if (devinfo->ppipe_subslices[p] > 1)
ppipe_mask2 |= (1u << p);
}
assert(ppipe_mask);
assert(ppipe_mask1);
struct GENX(SLICE_HASH_TABLE) table;
@ -1008,7 +1015,8 @@ upload_pixel_hashing_tables(struct iris_batch *batch)
* initialized to the same value.
*/
for (unsigned i = 0; i < 7; i++)
intel_compute_pixel_hash_table_nway(16, 16, ppipe_mask, table.Entry[i][0]);
intel_compute_pixel_hash_table_nway(16, 16, ppipe_mask1, ppipe_mask2,
table.Entry[i][0]);
GENX(SLICE_HASH_TABLE_pack)(NULL, map, &table);
@ -1026,7 +1034,7 @@ upload_pixel_hashing_tables(struct iris_batch *batch)
iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), mode) {
mode.SliceHashingTableEnable = true;
mode.SliceHashingTableEnableMask = true;
mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask) > 1 ?
mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask1) > 1 ?
hashing32x32 : NormalMode);
mode.CrossSliceHashingModeMask = -1;
}

View file

@ -66,22 +66,37 @@ intel_compute_pixel_hash_table_3way(unsigned n, unsigned m,
* Compute an \p n x \p m pixel hashing table usable as slice,
* subslice or pixel pipe hashing table. This generalizes the
* previous 3-way hash table function to an arbitrary number of ways
* given by the number of bits set in the \p mask argument, but
* doesn't allow the specification of different frequencies for
* different table indices.
* given by the number of bits set in the expression "mask1 | mask2".
* If a way is only set in one of the two mask arguments it will
* appear on the table with half the frequency as a way set on both
* masks.
*/
UNUSED static void
intel_compute_pixel_hash_table_nway(unsigned n, unsigned m, uint32_t mask,
intel_compute_pixel_hash_table_nway(unsigned n, unsigned m,
uint32_t mask1, uint32_t mask2,
uint32_t *p)
{
/* Construct a table mapping consecutive indices to the physical
* indices given by the bits set on the mask argument.
/* If both masks are equal all ways are expected to show up with
* the same frequency on the final table, so we can zero out one of
* the masks in order to halve the number of IDs we need to handle.
*/
unsigned phys_ids[sizeof(mask) * CHAR_BIT];
if (mask1 == mask2)
mask2 = 0;
/* Construct a table mapping consecutive indices to the physical
* indices given by the bits set on the mask arguments. Ways
* enabled on both masks will appear twice on the mapping, so
* they'll show up with twice the frequency on the final table.
*/
unsigned phys_ids[(sizeof(mask1) + sizeof(mask2)) * CHAR_BIT];
unsigned num_ids = 0;
u_foreach_bit(i, mask)
phys_ids[num_ids++] = i;
for (unsigned i = 0; i < sizeof(mask1) * CHAR_BIT; i++) {
if (mask1 & (1u << i))
phys_ids[num_ids++] = i;
if (mask2 & (1u << i))
phys_ids[num_ids++] = i;
}
assert(num_ids > 0);
@ -95,10 +110,11 @@ intel_compute_pixel_hash_table_nway(unsigned n, unsigned m, uint32_t mask,
* bit l of its input will lead to a change in its result of the
* order of num_ids/2^(l+1) (see variable t below). The
* bijectivity of this permutation can be verified easily by
* induction.
* induction. This permutation is applied cyclically to the
* vertical indices of the hashing table constructed below.
*/
const unsigned bits = util_logbase2_ceil(num_ids);
unsigned swz[ARRAY_SIZE(phys_ids)];
unsigned swzy[ARRAY_SIZE(phys_ids)];
for (unsigned k = 0; k < num_ids; k++) {
unsigned t = num_ids;
@ -113,25 +129,103 @@ intel_compute_pixel_hash_table_nway(unsigned n, unsigned m, uint32_t mask,
}
}
swz[k] = s;
swzy[k] = s;
}
/* Compute a second permutation applied cyclically to the
* horizontal indices of the hashing table. In cases where a
* single mask is present (which means that all ways are expected
* to have the same frequency) this permutation will be the
* identity and will have no effect.
*
* In cases where some ways have twice the frequency of the others,
* use a similar iterative halving of the range of the permutation
 * as in the swzy[] permutation defined above, but instead of
* scanning the bits of its argument (the "k" variable above) in
* the opposite order (from LSB to MSB), proceed by halving the
 * domain of the permutation in the same order as its range, which
 * would yield an identity permutation were it not for the fact that
 * the LSB of its range is adjusted as early as possible instead of
 * at the last iteration.
*
* The reason for the special casing of the LSB is that we want to
* avoid assigning adjacent IDs to adjacent elements of the table,
* since ways that appear duplicated in the phys_ids mapping above
* would then appear duplicated in adjacent positions of the final
* table, which would lead to poor utilization for small primitives
* that only cover a small contiguous portion of the hashing table
* and would have twice as much work as necessary submitted to the
* same way instead of spreading its processing over a larger
* number of ways.
*/
unsigned swzx[ARRAY_SIZE(phys_ids)];
if (mask1 && mask2) {
for (unsigned k = 0; k < num_ids; k++) {
unsigned l = k;
unsigned t = num_ids;
unsigned s = 0;
bool in_range = false;
while (t > 1) {
const bool first_in_range = t <= m && !in_range;
in_range |= first_in_range;
if (l >= (t + 1) >> 1) {
/* Apply the s++ increment (which could only occur in
* the last t == 2 iteration if we were constructing an
* identity permutation) as soon as the domain of the
* permutation has been decomposed into a chunk smaller
* than the width of the hashing table \p m (which
* causes in_range to be first set to true), since
* doing it earlier would prevent any alternation
* between even and odd indices in the first \p m
* elements of swzx[], which are the only ones actually
* used.
*
* Subsequent (in_range == true) increments of s need
* to be doubled since they are selecting between
* indices of the same parity.
*/
if (!in_range)
s += (t + 1) >> 1;
else if (first_in_range)
s++;
else
s += (t + 1) >> 1 << 1;
l -= (t + 1) >> 1;
t >>= 1;
} else {
t = (t + 1) >> 1;
}
}
swzx[k] = s;
}
} else {
for (unsigned k = 0; k < num_ids; k++)
swzx[k] = k;
}
/* Initialize the table with the cyclic repetition of a
* num_ids-periodic pattern.
*
* Note that the swz permutation only affects the ordering of rows.
* This is intentional in order to minimize the size of the
* contiguous area that needs to be rendered in parallel in order
* to utilize the whole GPU: A rendering rectangle of width W will
* need to be at least H blocks high, where H is bounded by
* 2^ceil(log2(num_ids/W)) thanks to the above definition of the swz
* permutation.
* Note that the horizontal and vertical permutations (swzx and
* swzy respectively) are different, and the former is either an
* identity permutation or close to the identity. This asymmetry
* is intentional in order to minimize the size of the contiguous
* area that needs to be rendered in parallel in order to utilize
* the whole GPU: In cases where swzx is the identity a rendering
* rectangle of width W will need to be at least H blocks high,
* where H is bounded by 2^ceil(log2(num_ids/W)) thanks to the
* above definition of the swzy permutation.
*/
for (unsigned i = 0; i < n; i++) {
const unsigned k = i % num_ids;
assert(swz[k] < num_ids);
for (unsigned j = 0; j < m; j++) {
p[j + m * i] = phys_ids[(j + swz[k]) % num_ids];
const unsigned l = j % num_ids;
p[j + m * i] = phys_ids[(swzx[l] + swzy[k]) % num_ids];
}
}
}

View file

@ -116,12 +116,19 @@ genX(emit_slice_hashing_state)(struct anv_device *device,
p.SubsliceHashingTableEnableMask = true;
}
#elif GFX_VERx10 == 125
uint32_t ppipe_mask = 0;
/* Calculate the set of present pixel pipes, and another set of
* present pixel pipes with 2 dual subslices enabled, the latter
* will appear on the hashing table with twice the frequency of
* pixel pipes with a single dual subslice present.
*/
uint32_t ppipe_mask1 = 0, ppipe_mask2 = 0;
for (unsigned p = 0; p < ARRAY_SIZE(device->info->ppipe_subslices); p++) {
if (device->info->ppipe_subslices[p])
ppipe_mask |= (1u << p);
if (device->info->ppipe_subslices[p] > 0)
ppipe_mask1 |= (1u << p);
if (device->info->ppipe_subslices[p] > 1)
ppipe_mask2 |= (1u << p);
}
assert(ppipe_mask);
assert(ppipe_mask1);
if (!device->slice_hash.alloc_size) {
unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
@ -139,7 +146,8 @@ genX(emit_slice_hashing_state)(struct anv_device *device,
* need to be initialized to the same value.
*/
for (unsigned i = 0; i < 7; i++)
intel_compute_pixel_hash_table_nway(16, 16, ppipe_mask, table.Entry[i][0]);
intel_compute_pixel_hash_table_nway(16, 16, ppipe_mask1, ppipe_mask2,
table.Entry[i][0]);
GENX(SLICE_HASH_TABLE_pack)(NULL, device->slice_hash.map, &table);
}
@ -160,7 +168,7 @@ genX(emit_slice_hashing_state)(struct anv_device *device,
anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) {
mode.SliceHashingTableEnable = true;
mode.SliceHashingTableEnableMask = true;
mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask) > 1 ?
mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask1) > 1 ?
hashing32x32 : NormalMode);
mode.CrossSliceHashingModeMask = -1;
mode.FastClearOptimizationEnable = !device->physical->disable_fcv;