diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c
index 658fb6e5fd8..b2739986b0a 100644
--- a/src/gallium/drivers/iris/iris_state.c
+++ b/src/gallium/drivers/iris/iris_state.c
@@ -939,6 +939,9 @@ upload_pixel_hashing_tables(struct iris_batch *batch)
    iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), mode) {
       mode.SliceHashingTableEnable = true;
       mode.SliceHashingTableEnableMask = true;
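+      /* Distribute pixels across slices in 32x32 blocks whenever
+       * more than one pixel pipe is enabled, and fall back to
+       * NormalMode on single-pipe configurations.
+       */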
+      mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask) > 1 ?
+                                    hashing32x32 : NormalMode);
+      mode.CrossSliceHashingModeMask = -1;
    }
 #endif
 }
diff --git a/src/intel/common/intel_pixel_hash.h b/src/intel/common/intel_pixel_hash.h
index 5fbd90145b0..f24528ddc53 100644
--- a/src/intel/common/intel_pixel_hash.h
+++ b/src/intel/common/intel_pixel_hash.h
@@ -85,12 +85,54 @@ intel_compute_pixel_hash_table_nway(unsigned n, unsigned m, uint32_t mask,
    assert(num_ids > 0);
 
+   /* Compute a permutation of the above indices that assigns
+    * indices which are as far apart as possible to adjacent table
+    * entries.  This permutation is designed to be equivalent to the
+    * bit reversal of each index in cases where num_ids is a power
+    * of two, but it doesn't actually require num_ids to be a power
+    * of two in order to satisfy the required properties (which is
+    * necessary to handle configurations with arbitrary
+    * non-power-of-two fusing).  By construction, flipping bit l of
+    * the input will change the result by roughly num_ids/2^(l+1)
+    * (see variable t below).  The bijectivity of this permutation
+    * can easily be verified by induction.
+    */
+   const unsigned bits = util_logbase2_ceil(num_ids);
+   unsigned swz[ARRAY_SIZE(phys_ids)];
+
+   for (unsigned k = 0; k < num_ids; k++) {
+      unsigned t = num_ids;
+      unsigned s = 0;
+
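+      /* Descend a binary subdivision of the range [0, num_ids)
+       * guided by the bits of k:  [s, s + t) is the current
+       * subinterval, and bit l of k selects its upper half when
+       * set and its lower half when clear.
+       */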
+      for (unsigned l = 0; l < bits; l++) {
+         if (k & (1u << l)) {
+            s += (t + 1) >> 1;
+            t >>= 1;
+         } else {
+            t = (t + 1) >> 1;
+         }
+      }
+
+      swz[k] = s;
+   }
+
    /* Initialize the table with the cyclic repetition of a
     * num_ids-periodic pattern.
+    *
+    * Note that the swz permutation only affects the ordering of the
+    * table rows.  This is intentional in order to minimize the size
+    * of the contiguous area that needs to be rendered in parallel
+    * in order to utilize the whole GPU:  A rendering rectangle of
+    * width W blocks will need to be at least H blocks high, where H
+    * is bounded by 2^ceil(log2(num_ids/W)), thanks to the above
+    * definition of the swz permutation.
     */
    for (unsigned i = 0; i < n; i++) {
-      for (unsigned j = 0; j < m; j++)
-         p[j + m * i] = phys_ids[(j + i) % num_ids];
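+      /* Use the pattern rotated by swz[i % num_ids] for the i-th
+       * row, so consecutive rows start on distant pixel pipes.
+       */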
+      const unsigned k = i % num_ids;
+      assert(swz[k] < num_ids);
+      for (unsigned j = 0; j < m; j++) {
+         p[j + m * i] = phys_ids[(j + swz[k]) % num_ids];
+      }
    }
 }
diff --git a/src/intel/genxml/gen125.xml b/src/intel/genxml/gen125.xml
index bc716687d2c..7af0d5ecd75 100644
--- a/src/intel/genxml/gen125.xml
+++ b/src/intel/genxml/gen125.xml
@@ -1372,7 +1372,7 @@
-
+
diff --git a/src/intel/vulkan/genX_state.c b/src/intel/vulkan/genX_state.c
index 7d6fd065f7b..104d0a8efd8 100644
--- a/src/intel/vulkan/genX_state.c
+++ b/src/intel/vulkan/genX_state.c
@@ -151,6 +151,9 @@ genX(emit_slice_hashing_state)(struct anv_device *device,
    anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) {
       mode.SliceHashingTableEnable = true;
       mode.SliceHashingTableEnableMask = true;
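+      /* Distribute pixels across slices in 32x32 blocks whenever
+       * more than one pixel pipe is enabled, and fall back to
+       * NormalMode on single-pipe configurations.
+       */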
+      mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask) > 1 ?
+                                    hashing32x32 : NormalMode);
+      mode.CrossSliceHashingModeMask = -1;
    }
 #endif
 }