diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c
index 18900f2b5e0..c0a4c9130bc 100644
--- a/src/gallium/drivers/iris/iris_state.c
+++ b/src/gallium/drivers/iris/iris_state.c
@@ -990,12 +990,19 @@ upload_pixel_hashing_tables(struct iris_batch *batch)
                                           0, size, PIPE_MAP_WRITE,
                                           &transfer);
 
-   uint32_t ppipe_mask = 0;
+   /* Calculate the set of present pixel pipes, and another set of
+    * present pixel pipes with 2 dual subslices enabled; the latter
+    * will appear on the hashing table with twice the frequency of
+    * pixel pipes with a single dual subslice present.
+    */
+   uint32_t ppipe_mask1 = 0, ppipe_mask2 = 0;
    for (unsigned p = 0; p < ARRAY_SIZE(devinfo->ppipe_subslices); p++) {
       if (devinfo->ppipe_subslices[p])
-         ppipe_mask |= (1u << p);
+         ppipe_mask1 |= (1u << p);
+      if (devinfo->ppipe_subslices[p] > 1)
+         ppipe_mask2 |= (1u << p);
    }
-   assert(ppipe_mask);
+   assert(ppipe_mask1);
 
    struct GENX(SLICE_HASH_TABLE) table;
 
@@ -1008,7 +1015,8 @@ upload_pixel_hashing_tables(struct iris_batch *batch)
     * initialized to the same value.
     */
    for (unsigned i = 0; i < 7; i++)
-      intel_compute_pixel_hash_table_nway(16, 16, ppipe_mask, table.Entry[i][0]);
+      intel_compute_pixel_hash_table_nway(16, 16, ppipe_mask1, ppipe_mask2,
+                                          table.Entry[i][0]);
 
    GENX(SLICE_HASH_TABLE_pack)(NULL, map, &table);
 
@@ -1026,7 +1034,7 @@ upload_pixel_hashing_tables(struct iris_batch *batch)
    iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), mode) {
       mode.SliceHashingTableEnable = true;
      mode.SliceHashingTableEnableMask = true;
-      mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask) > 1 ?
+      mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask1) > 1 ?
                                     hashing32x32 : NormalMode);
       mode.CrossSliceHashingModeMask = -1;
    }
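
For illustration, the two-mask derivation above can be reproduced in a standalone sketch. The ppipe_subslices values below are made up for the example and don't correspond to any real fused part:

/* Illustration only: the two-mask derivation from the hunk above,
 * pulled out standalone with an invented pixel pipe configuration. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static const unsigned ppipe_subslices[4] = { 2, 1, 2, 0 };

int main(void)
{
   uint32_t ppipe_mask1 = 0, ppipe_mask2 = 0;

   for (unsigned p = 0; p < 4; p++) {
      if (ppipe_subslices[p] > 0)
         ppipe_mask1 |= (1u << p);   /* pipe present at all */
      if (ppipe_subslices[p] > 1)
         ppipe_mask2 |= (1u << p);   /* pipe hashed at twice the rate */
   }
   assert(ppipe_mask1);

   /* Prints ppipe_mask1 = 0x7, ppipe_mask2 = 0x5: pipes 0 and 2 will
    * cover about twice as many table entries as pipe 1. */
   printf("ppipe_mask1 = 0x%x, ppipe_mask2 = 0x%x\n",
          (unsigned)ppipe_mask1, (unsigned)ppipe_mask2);
   return 0;
}
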
diff --git a/src/intel/common/intel_pixel_hash.h b/src/intel/common/intel_pixel_hash.h
index f24528ddc53..1bc32e5f6d9 100644
--- a/src/intel/common/intel_pixel_hash.h
+++ b/src/intel/common/intel_pixel_hash.h
@@ -66,22 +66,37 @@ intel_compute_pixel_hash_table_3way(unsigned n, unsigned m,
  * Compute an \p n x \p m pixel hashing table usable as slice,
  * subslice or pixel pipe hashing table.  This generalizes the
  * previous 3-way hash table function to an arbitrary number of ways
- * given by the number of bits set in the \p mask argument, but
- * doesn't allow the specification of different frequencies for
- * different table indices.
+ * given by the number of bits set in the expression "mask1 | mask2".
+ * If a way is only set in one of the two mask arguments it will
+ * appear on the table with half the frequency of a way set on both
+ * masks.
  */
 UNUSED static void
-intel_compute_pixel_hash_table_nway(unsigned n, unsigned m, uint32_t mask,
+intel_compute_pixel_hash_table_nway(unsigned n, unsigned m,
+                                    uint32_t mask1, uint32_t mask2,
                                     uint32_t *p)
 {
-   /* Construct a table mapping consecutive indices to the physical
-    * indices given by the bits set on the mask argument.
+   /* If both masks are equal all ways are expected to show up with
+    * the same frequency on the final table, so we can zero out one of
+    * the masks in order to halve the number of IDs we need to handle.
     */
-   unsigned phys_ids[sizeof(mask) * CHAR_BIT];
+   if (mask1 == mask2)
+      mask2 = 0;
+
+   /* Construct a table mapping consecutive indices to the physical
+    * indices given by the bits set on the mask arguments.  Ways
+    * enabled on both masks will appear twice on the mapping, so
+    * they'll show up with twice the frequency on the final table.
+    */
+   unsigned phys_ids[(sizeof(mask1) + sizeof(mask2)) * CHAR_BIT];
    unsigned num_ids = 0;
-   u_foreach_bit(i, mask)
-      phys_ids[num_ids++] = i;
+
+   for (unsigned i = 0; i < sizeof(mask1) * CHAR_BIT; i++) {
+      if (mask1 & (1u << i))
+         phys_ids[num_ids++] = i;
+      if (mask2 & (1u << i))
+         phys_ids[num_ids++] = i;
+   }
 
    assert(num_ids > 0);
 
@@ -95,10 +110,11 @@ intel_compute_pixel_hash_table_nway(unsigned n, unsigned m, uint32_t mask,
     * bit l of its input will lead to a change in its result of the
     * order of num_ids/2^(l+1) (see variable t below).  The
     * bijectivity of this permutation can be verified easily by
-    * induction.
+    * induction.  This permutation is applied cyclically to the
+    * vertical indices of the hashing table constructed below.
     */
    const unsigned bits = util_logbase2_ceil(num_ids);
-   unsigned swz[ARRAY_SIZE(phys_ids)];
+   unsigned swzy[ARRAY_SIZE(phys_ids)];
 
    for (unsigned k = 0; k < num_ids; k++) {
       unsigned t = num_ids;
@@ -113,25 +129,103 @@ intel_compute_pixel_hash_table_nway(unsigned n, unsigned m, uint32_t mask,
          }
       }
 
-      swz[k] = s;
+      swzy[k] = s;
+   }
+
+   /* Compute a second permutation applied cyclically to the
+    * horizontal indices of the hashing table.  In cases where a
+    * single mask is present (which means that all ways are expected
+    * to have the same frequency) this permutation will be the
+    * identity and will have no effect.
+    *
+    * In cases where some ways have twice the frequency of the others,
+    * use a similar iterative halving of the range of the permutation
+    * as in the swzy[] permutation defined above, but instead of
+    * scanning the bits of its argument (the "k" variable above) in
+    * the opposite order (from LSB to MSB), proceed by halving the
+    * domain of the permutation in the same order as its range, which
+    * would lead to an identity permutation if it weren't for the
+    * LSB of its range being adjusted as early as possible instead
+    * of at the last iteration.
+    *
+    * The reason for the special casing of the LSB is that we want to
+    * avoid assigning adjacent IDs to adjacent elements of the table,
+    * since ways that appear duplicated in the phys_ids mapping above
+    * would then appear duplicated in adjacent positions of the final
+    * table, which would lead to poor utilization for small primitives
+    * that only cover a small contiguous portion of the hashing table
+    * and would have twice as much work as necessary submitted to the
+    * same way instead of spreading its processing over a larger
+    * number of ways.
+    */
+   unsigned swzx[ARRAY_SIZE(phys_ids)];
+
+   if (mask1 && mask2) {
+      for (unsigned k = 0; k < num_ids; k++) {
+         unsigned l = k;
+         unsigned t = num_ids;
+         unsigned s = 0;
+         bool in_range = false;
+
+         while (t > 1) {
+            const bool first_in_range = t <= m && !in_range;
+            in_range |= first_in_range;
+
+            if (l >= (t + 1) >> 1) {
+               /* Apply the s++ increment (which could only occur in
+                * the last t == 2 iteration if we were constructing an
+                * identity permutation) as soon as the domain of the
+                * permutation has been decomposed into a chunk smaller
+                * than the width of the hashing table \p m (which
+                * causes in_range to be first set to true), since
+                * doing it earlier would prevent any alternation
+                * between even and odd indices in the first \p m
+                * elements of swzx[], which are the only ones actually
+                * used.
+                *
+                * Subsequent (in_range == true) increments of s need
+                * to be doubled since they are selecting between
+                * indices of the same parity.
+                */
+               if (!in_range)
+                  s += (t + 1) >> 1;
+               else if (first_in_range)
+                  s++;
+               else
+                  s += (t + 1) >> 1 << 1;
+
+               l -= (t + 1) >> 1;
+               t >>= 1;
+            } else {
+               t = (t + 1) >> 1;
+            }
+         }
+
+         swzx[k] = s;
+      }
+   } else {
+      for (unsigned k = 0; k < num_ids; k++)
+         swzx[k] = k;
    }
 
    /* Initialize the table with the cyclic repetition of a
     * num_ids-periodic pattern.
     *
-    * Note that the swz permutation only affects the ordering of rows.
-    * This is intentional in order to minimize the size of the
-    * contiguous area that needs to be rendered in parallel in order
-    * to utilize the whole GPU: A rendering rectangle of width W will
-    * need to be at least H blocks high, where H is bounded by
-    * 2^ceil(log2(num_ids/W)) thanks to the above definition of the swz
-    * permutation.
+    * Note that the horizontal and vertical permutations (swzx and
+    * swzy respectively) are different, and the former is either an
+    * identity permutation or close to the identity.  This asymmetry
+    * is intentional in order to minimize the size of the contiguous
+    * area that needs to be rendered in parallel in order to utilize
+    * the whole GPU: In cases where swzx is the identity a rendering
+    * rectangle of width W will need to be at least H blocks high,
+    * where H is bounded by 2^ceil(log2(num_ids/W)) thanks to the
+    * above definition of the swzy permutation.
     */
    for (unsigned i = 0; i < n; i++) {
       const unsigned k = i % num_ids;
-      assert(swz[k] < num_ids);
       for (unsigned j = 0; j < m; j++) {
-         p[j + m * i] = phys_ids[(j + swz[k]) % num_ids];
+         const unsigned l = j % num_ids;
+         p[j + m * i] = phys_ids[(swzx[l] + swzy[k]) % num_ids];
       }
    }
 }
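
The frequency property is easy to sanity-check end to end by tallying how often each way appears in a 16x16 table produced by the new function. The following throwaway test is a sketch, assuming it is compiled inside the Mesa tree; the include list is a best guess at the util headers intel_pixel_hash.h depends on:

/* Hypothetical unit test for the two-frequency nway table. */
#include <assert.h>
#include <limits.h>
#include <stdio.h>

#include "util/macros.h"
#include "util/u_math.h"
#include "intel_pixel_hash.h"

int main(void)
{
   const uint32_t mask1 = 0x7, mask2 = 0x5;
   uint32_t table[16 * 16];
   unsigned counts[3] = { 0, 0, 0 };

   intel_compute_pixel_hash_table_nway(16, 16, mask1, mask2, table);

   /* Ways 0 and 2 are set in both masks, way 1 only in mask1, so the
    * tally should come out at roughly 102/51/102 of 256 entries. */
   for (unsigned i = 0; i < 16 * 16; i++)
      counts[table[i]]++;

   for (unsigned w = 0; w < 3; w++)
      printf("way %u: %u/256 entries\n", w, counts[w]);
   return 0;
}
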
diff --git a/src/intel/vulkan/genX_init_state.c b/src/intel/vulkan/genX_init_state.c
index 1586686fa99..bb1878032eb 100644
--- a/src/intel/vulkan/genX_init_state.c
+++ b/src/intel/vulkan/genX_init_state.c
@@ -116,12 +116,19 @@ genX(emit_slice_hashing_state)(struct anv_device *device,
       p.SubsliceHashingTableEnableMask = true;
    }
 #elif GFX_VERx10 == 125
-   uint32_t ppipe_mask = 0;
+   /* Calculate the set of present pixel pipes, and another set of
+    * present pixel pipes with 2 dual subslices enabled; the latter
+    * will appear on the hashing table with twice the frequency of
+    * pixel pipes with a single dual subslice present.
+    */
+   uint32_t ppipe_mask1 = 0, ppipe_mask2 = 0;
    for (unsigned p = 0; p < ARRAY_SIZE(device->info->ppipe_subslices); p++) {
-      if (device->info->ppipe_subslices[p])
-         ppipe_mask |= (1u << p);
+      if (device->info->ppipe_subslices[p] > 0)
+         ppipe_mask1 |= (1u << p);
+      if (device->info->ppipe_subslices[p] > 1)
+         ppipe_mask2 |= (1u << p);
    }
-   assert(ppipe_mask);
+   assert(ppipe_mask1);
 
    if (!device->slice_hash.alloc_size) {
       unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
@@ -139,7 +146,8 @@ genX(emit_slice_hashing_state)(struct anv_device *device,
        * need to be initialized to the same value.
        */
       for (unsigned i = 0; i < 7; i++)
-         intel_compute_pixel_hash_table_nway(16, 16, ppipe_mask, table.Entry[i][0]);
+         intel_compute_pixel_hash_table_nway(16, 16, ppipe_mask1, ppipe_mask2,
+                                             table.Entry[i][0]);
 
       GENX(SLICE_HASH_TABLE_pack)(NULL, device->slice_hash.map, &table);
    }
@@ -160,7 +168,7 @@ genX(emit_slice_hashing_state)(struct anv_device *device,
    anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) {
      mode.SliceHashingTableEnable = true;
       mode.SliceHashingTableEnableMask = true;
-      mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask) > 1 ?
+      mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask1) > 1 ?
                                     hashing32x32 : NormalMode);
       mode.CrossSliceHashingModeMask = -1;
       mode.FastClearOptimizationEnable = !device->physical->disable_fcv;
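
The swzx[] construction is easiest to follow on a worked example. The sketch below transcribes just that loop for num_ids == 5 (the mask1 = 0x7, mask2 = 0x5 case from the earlier sketches) and table width m == 16; NUM_IDS and WIDTH are names local to this sketch, not Mesa identifiers. It prints swzx = {0, 2, 4, 1, 3}, i.e. even table columns are filled before odd ones, and the assertion checks that the duplicated phys_ids entries of the 2x-frequency ways never land in adjacent columns, whatever per-row rotation the swzy[k] term applies:

/* Illustration only: a transcription of the swzx[] loop from the
 * intel_pixel_hash.h hunk above, instantiated for num_ids == 5. */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define NUM_IDS 5
#define WIDTH   16

int main(void)
{
   /* phys_ids as the patch builds it for mask1 = 0x7, mask2 = 0x5:
    * ways 0 and 2 are set in both masks and appear twice. */
   const unsigned phys_ids[NUM_IDS] = { 0, 0, 1, 2, 2 };
   unsigned swzx[NUM_IDS];

   for (unsigned k = 0; k < NUM_IDS; k++) {
      unsigned l = k;
      unsigned t = NUM_IDS;
      unsigned s = 0;
      bool in_range = false;

      while (t > 1) {
         const bool first_in_range = t <= WIDTH && !in_range;
         in_range |= first_in_range;

         if (l >= (t + 1) >> 1) {
            if (!in_range)
               s += (t + 1) >> 1;
            else if (first_in_range)
               s++;
            else
               s += (t + 1) >> 1 << 1;

            l -= (t + 1) >> 1;
            t >>= 1;
         } else {
            t = (t + 1) >> 1;
         }
      }

      swzx[k] = s;
   }

   /* No two adjacent columns of a row may map to the same way, for
    * any rotation c = swzy[k] a row might apply. */
   for (unsigned c = 0; c < NUM_IDS; c++) {
      for (unsigned j = 0; j + 1 < WIDTH; j++) {
         assert(phys_ids[(swzx[j % NUM_IDS] + c) % NUM_IDS] !=
                phys_ids[(swzx[(j + 1) % NUM_IDS] + c) % NUM_IDS]);
      }
   }

   for (unsigned k = 0; k < NUM_IDS; k++)
      printf("swzx[%u] = %u\n", k, swzx[k]);

   return 0;
}
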