intel/xehp: Switch to coarser cross-slice pixel hashing with table permutation.

The coarser 32x32 cross-slice hashing mode seems to lead to better L1 and L2 utilization due to the improved execution locality. However, it can also lead to a bottleneck in a single slice, especially in workloads that concentrate heavy rendering in small areas of the screen (e.g. SynMark2 OglGeomPoint, OglTerrain*). This effect is mitigated here by performing a permutation of the pixel pipe hashing tables that ensures that adjacent rows map to pixel pipes that are as far apart as possible in the caching hierarchy. Reviewed-by: Caio Oliveira <caio.oliveira@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13569>
2025-12-25 15:10:10 +01:00 · 2021-10-12 23:57:53 -07:00 · 2021-10-12 23:57:53 -07:00 · 074bde9989
commit 074bde9989
parent ef675e6857
4 changed files with 51 additions and 3 deletions
--- a/src/gallium/drivers/iris/iris_state.c
+++ b/src/gallium/drivers/iris/iris_state.c
@ -939,6 +939,9 @@ upload_pixel_hashing_tables(struct iris_batch *batch)
   iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), mode) {
      mode.SliceHashingTableEnable = true;
      mode.SliceHashingTableEnableMask = true;
+      mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask) > 1 ?
+                                    hashing32x32 : NormalMode);
+      mode.CrossSliceHashingModeMask = -1;
   }
 #endif
 }
--- a/src/intel/common/intel_pixel_hash.h
+++ b/src/intel/common/intel_pixel_hash.h
@ -85,12 +85,54 @@ intel_compute_pixel_hash_table_nway(unsigned n, unsigned m, uint32_t mask,

   assert(num_ids > 0);

+   /* Compute a permutation of the above indices that assigns indices
+    * as far apart as possible to adjacent entries.  This permutation is
+    * designed to be equivalent to the bit reversal of each index in
+    * cases where num_ids is a power of two, but doesn't actually
+    * require it to be a power of two in order to satisfy the required
+    * properties (which is necessary to handle configurations with
+    * arbitrary non-power of two fusing).  By construction, flipping
+    * bit l of its input will lead to a change in its result of the
+    * order of num_ids/2^(l+1) (see variable t below).  The
+    * bijectivity of this permutation can be verified easily by
+    * induction.
+    */
+   const unsigned bits = util_logbase2_ceil(num_ids);
+   unsigned swz[ARRAY_SIZE(phys_ids)];
+
+   for (unsigned k = 0; k < num_ids; k++) {
+      unsigned t = num_ids;
+      unsigned s = 0;
+
+      for (unsigned l = 0; l < bits; l++) {
+         if (k & (1u << l)) {
+            s += (t + 1) >> 1;
+            t >>= 1;
+         } else {
+            t = (t + 1) >> 1;
+         }
+      }
+
+      swz[k] = s;
+   }
+
   /* Initialize the table with the cyclic repetition of a
    * num_ids-periodic pattern.
+    *
+    * Note that the swz permutation only affects the ordering of rows.
+    * This is intentional in order to minimize the size of the
+    * contiguous area that needs to be rendered in parallel in order
+    * to utilize the whole GPU: A rendering rectangle of width W will
+    * need to be at least H blocks high, where H is bounded by
+    * 2^ceil(log2(num_ids/W)) thanks to the above definition of the swz
+    * permutation.
    */
   for (unsigned i = 0; i < n; i++) {
-      for (unsigned j = 0; j < m; j++)
-         p[j + m * i] = phys_ids[(j + i) % num_ids];
+      const unsigned k = i % num_ids;
+      assert(swz[k] < num_ids);
+      for (unsigned j = 0; j < m; j++) {
+         p[j + m * i] = phys_ids[(j + swz[k]) % num_ids];
+      }
   }
 }

--- a/src/intel/genxml/gen125.xml
+++ b/src/intel/genxml/gen125.xml
@ -1372,7 +1372,7 @@
    <field name="3D Scoreboard Hashing Mode" start="36" end="36" type="bool"/>
    <field name="Subslice Hashing Table Enable" start="37" end="37" type="bool"/>
    <field name="Slice Hashing Table Enable" start="38" end="38" type="bool"/>
-    <field name="Cross Slice Hashing Mode Mask" start="48" end="49" type="uint"/>
+    <field name="Cross Slice Hashing Mode Mask" start="48" end="49" type="int"/>
    <field name="3D Scoreboard Hashing Mode Mask" start="52" end="52" type="bool"/>
    <field name="Subslice Hashing Table Enable Mask" start="53" end="53" type="bool"/>
    <field name="Slice Hashing Table Enable Mask" start="54" end="54" type="bool"/>
--- a/src/intel/vulkan/genX_state.c
+++ b/src/intel/vulkan/genX_state.c
@ -151,6 +151,9 @@ genX(emit_slice_hashing_state)(struct anv_device *device,
   anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) {
      mode.SliceHashingTableEnable = true;
      mode.SliceHashingTableEnableMask = true;
+      mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask) > 1 ?
+				    hashing32x32 : NormalMode);
+      mode.CrossSliceHashingModeMask = -1;
   }
 #endif
 }