diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c
index 658fb6e5fd8..b2739986b0a 100644
--- a/src/gallium/drivers/iris/iris_state.c
+++ b/src/gallium/drivers/iris/iris_state.c
@@ -939,6 +939,9 @@ upload_pixel_hashing_tables(struct iris_batch *batch)
    iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), mode) {
       mode.SliceHashingTableEnable = true;
       mode.SliceHashingTableEnableMask = true;
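+      /* Distribute pixels across slices in 32x32 blocks whenever
+       * more than one pixel pipe is enabled, and fall back to
+       * NormalMode on single-pipe configurations.
+       */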
+      mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask) > 1 ?
+                                    hashing32x32 : NormalMode);
+      mode.CrossSliceHashingModeMask = -1;
    }
 #endif
 }
diff --git a/src/intel/common/intel_pixel_hash.h b/src/intel/common/intel_pixel_hash.h
index 5fbd90145b0..f24528ddc53 100644
--- a/src/intel/common/intel_pixel_hash.h
+++ b/src/intel/common/intel_pixel_hash.h
@@ -85,12 +85,54 @@ intel_compute_pixel_hash_table_nway(unsigned n, unsigned m, uint32_t mask,
    assert(num_ids > 0);
 
+   /* Compute a permutation of the above indices that assigns
+    * indices which are as far apart as possible to adjacent table
+    * entries.  This permutation is designed to be equivalent to the
+    * bit reversal of each index in cases where num_ids is a power
+    * of two, but it doesn't actually require num_ids to be a power
+    * of two in order to satisfy the required properties (which is
+    * necessary to handle configurations with arbitrary
+    * non-power-of-two fusing).  By construction, flipping bit l of
+    * the input will change the result by roughly num_ids/2^(l+1)
+    * (see variable t below).  The bijectivity of this permutation
+    * can easily be verified by induction.
+    */
+   const unsigned bits = util_logbase2_ceil(num_ids);
+   unsigned swz[ARRAY_SIZE(phys_ids)];
+
+   for (unsigned k = 0; k < num_ids; k++) {
+      unsigned t = num_ids;
+      unsigned s = 0;
+
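+      /* Descend a binary subdivision of the range [0, num_ids)
+       * guided by the bits of k:  [s, s + t) is the current
+       * subinterval, and bit l of k selects its upper half when
+       * set and its lower half when clear.
+       */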
+      for (unsigned l = 0; l < bits; l++) {
+         if (k & (1u << l)) {
+            s += (t + 1) >> 1;
+            t >>= 1;
+         } else {
+            t = (t + 1) >> 1;
+         }
+      }
+
+      swz[k] = s;
+   }
+
    /* Initialize the table with the cyclic repetition of a
     * num_ids-periodic pattern.
+    *
+    * Note that the swz permutation only affects the ordering of the
+    * table rows.  This is intentional in order to minimize the size
+    * of the contiguous area that needs to be rendered in parallel
+    * in order to utilize the whole GPU:  A rendering rectangle of
+    * width W blocks will need to be at least H blocks high, where H
+    * is bounded by 2^ceil(log2(num_ids/W)), thanks to the above
+    * definition of the swz permutation.
     */
    for (unsigned i = 0; i < n; i++) {
-      for (unsigned j = 0; j < m; j++)
-         p[j + m * i] = phys_ids[(j + i) % num_ids];
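+      /* Use the pattern rotated by swz[i % num_ids] for the i-th
+       * row, so consecutive rows start on distant pixel pipes.
+       */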
+      const unsigned k = i % num_ids;
+      assert(swz[k] < num_ids);
+      for (unsigned j = 0; j < m; j++) {
+         p[j + m * i] = phys_ids[(j + swz[k]) % num_ids];
+      }
    }
 }
diff --git a/src/intel/genxml/gen125.xml b/src/intel/genxml/gen125.xml
index bc716687d2c..7af0d5ecd75 100644
--- a/src/intel/genxml/gen125.xml
+++ b/src/intel/genxml/gen125.xml
@@ -1372,7 +1372,7 @@
-
+
diff --git a/src/intel/vulkan/genX_state.c b/src/intel/vulkan/genX_state.c
index 7d6fd065f7b..104d0a8efd8 100644
--- a/src/intel/vulkan/genX_state.c
+++ b/src/intel/vulkan/genX_state.c
@@ -151,6 +151,9 @@ genX(emit_slice_hashing_state)(struct anv_device *device,
    anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) {
       mode.SliceHashingTableEnable = true;
       mode.SliceHashingTableEnableMask = true;
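+      /* Distribute pixels across slices in 32x32 blocks whenever
+       * more than one pixel pipe is enabled, and fall back to
+       * NormalMode on single-pipe configurations.
+       */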
+      mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask) > 1 ?
+                                    hashing32x32 : NormalMode);
+      mode.CrossSliceHashingModeMask = -1;
    }
 #endif
 }