diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c
index 658fb6e5fd8..b2739986b0a 100644
--- a/src/gallium/drivers/iris/iris_state.c
+++ b/src/gallium/drivers/iris/iris_state.c
@@ -939,6 +939,9 @@ upload_pixel_hashing_tables(struct iris_batch *batch)
    iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), mode) {
       mode.SliceHashingTableEnable = true;
       mode.SliceHashingTableEnableMask = true;
+      mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask) > 1 ?
+                                    hashing32x32 : NormalMode);
+      mode.CrossSliceHashingModeMask = -1;
    }
 #endif
 }
diff --git a/src/intel/common/intel_pixel_hash.h b/src/intel/common/intel_pixel_hash.h
index 5fbd90145b0..f24528ddc53 100644
--- a/src/intel/common/intel_pixel_hash.h
+++ b/src/intel/common/intel_pixel_hash.h
@@ -85,12 +85,54 @@ intel_compute_pixel_hash_table_nway(unsigned n, unsigned m, uint32_t mask,
    assert(num_ids > 0);
 
+   /* Compute a permutation of the above indices that assigns indices
+    * as far as possible to adjacent entries.  This permutation is
+    * designed to be equivalent to the bit reversal of each index in
+    * cases where num_ids is a power of two, but doesn't actually
+    * require it to be a power of two in order to satisfy the required
+    * properties (which is necessary to handle configurations with
+    * arbitrary non-power of two fusing).  By construction, flipping
+    * bit l of its input will lead to a change in its result of the
+    * order of num_ids/2^(l+1) (see variable t below).  The
+    * bijectivity of this permutation can be verified easily by
+    * induction.
+    */
+   const unsigned bits = util_logbase2_ceil(num_ids);
+   unsigned swz[ARRAY_SIZE(phys_ids)];
+
+   for (unsigned k = 0; k < num_ids; k++) {
+      unsigned t = num_ids;
+      unsigned s = 0;
+
+      for (unsigned l = 0; l < bits; l++) {
+         if (k & (1u << l)) {
+            s += (t + 1) >> 1;
+            t >>= 1;
+         } else {
+            t = (t + 1) >> 1;
+         }
+      }
+
+      swz[k] = s;
+   }
+
    /* Initialize the table with the cyclic repetition of a
     * num_ids-periodic pattern.
+    *
+    * Note that the swz permutation only affects the ordering of rows.
+    * This is intentional in order to minimize the size of the
+    * contiguous area that needs to be rendered in parallel in order
+    * to utilize the whole GPU: A rendering rectangle of width W will
+    * need to be at least H blocks high, where H is bounded by
+    * 2^ceil(log2(num_ids/W)) thanks to the above definition of the swz
+    * permutation.
     */
    for (unsigned i = 0; i < n; i++) {
-      for (unsigned j = 0; j < m; j++)
-         p[j + m * i] = phys_ids[(j + i) % num_ids];
+      const unsigned k = i % num_ids;
+      assert(swz[k] < num_ids);
+      for (unsigned j = 0; j < m; j++) {
+         p[j + m * i] = phys_ids[(j + swz[k]) % num_ids];
+      }
    }
 }
diff --git a/src/intel/genxml/gen125.xml b/src/intel/genxml/gen125.xml
index bc716687d2c..7af0d5ecd75 100644
--- a/src/intel/genxml/gen125.xml
+++ b/src/intel/genxml/gen125.xml
@@ -1372,7 +1372,7 @@
-
+
diff --git a/src/intel/vulkan/genX_state.c b/src/intel/vulkan/genX_state.c
index 7d6fd065f7b..104d0a8efd8 100644
--- a/src/intel/vulkan/genX_state.c
+++ b/src/intel/vulkan/genX_state.c
@@ -151,6 +151,9 @@ genX(emit_slice_hashing_state)(struct anv_device *device,
    anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) {
       mode.SliceHashingTableEnable = true;
       mode.SliceHashingTableEnableMask = true;
+      mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask) > 1 ?
+                                    hashing32x32 : NormalMode);
+      mode.CrossSliceHashingModeMask = -1;
    }
 #endif
 }
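
For reviewers who want to poke at the driver-side selection without a Gfx12.5 part at hand, here is a minimal standalone sketch of the decision made in the iris and anv hunks above. The enum values are placeholders (the real NormalMode/hashing32x32 definitions are generated from gen125.xml), and bitcount() is a local stand-in for Mesa's util_bitcount():

#include <stdint.h>
#include <stdio.h>

/* Stand-ins for the genxml-generated 3DSTATE_3D_MODE enum values; the
 * real definitions come from gen125.xml, so treat the numeric values
 * here as placeholders. */
enum { NormalMode = 0, hashing32x32 = 1 };

/* Local popcount standing in for Mesa's util_bitcount(). */
static unsigned
bitcount(uint32_t x)
{
   unsigned n = 0;
   for (; x; x &= x - 1)
      n++;
   return n;
}

int
main(void)
{
   const uint32_t masks[] = { 0x1, 0x2, 0x3, 0xf };

   for (unsigned i = 0; i < sizeof(masks) / sizeof(masks[0]); i++) {
      /* Same selection as the iris and anv hunks: cross-slice hashing
       * only pays off when more than one pixel pipe is enabled; a
       * single-pipe configuration keeps the hardware default. */
      const unsigned mode = bitcount(masks[i]) > 1 ? hashing32x32
                                                   : NormalMode;

      printf("ppipe_mask 0x%x -> %s\n", (unsigned)masks[i],
             mode == hashing32x32 ? "hashing32x32" : "NormalMode");
   }

   return 0;
}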
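
The new permutation in intel_pixel_hash.h is self-contained enough to check in isolation. The sketch below (with a local logbase2_ceil() standing in for Mesa's util_logbase2_ceil()) recomputes swz for every table size up to 16, asserts that it is a bijection, and checks the comment's claim that it degenerates to plain bit reversal whenever num_ids is a power of two:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Local stand-in for Mesa's util_logbase2_ceil(). */
static unsigned
logbase2_ceil(unsigned x)
{
   unsigned bits = 0;
   while ((1u << bits) < x)
      bits++;
   return bits;
}

/* Verbatim copy of the swz computation added above. */
static void
compute_swz(unsigned num_ids, unsigned *swz)
{
   const unsigned bits = logbase2_ceil(num_ids);

   for (unsigned k = 0; k < num_ids; k++) {
      unsigned t = num_ids;
      unsigned s = 0;

      for (unsigned l = 0; l < bits; l++) {
         if (k & (1u << l)) {
            s += (t + 1) >> 1;
            t >>= 1;
         } else {
            t = (t + 1) >> 1;
         }
      }

      swz[k] = s;
   }
}

int
main(void)
{
   for (unsigned num_ids = 1; num_ids <= 16; num_ids++) {
      unsigned swz[16];
      bool seen[16] = { false };

      compute_swz(num_ids, swz);

      printf("num_ids=%2u:", num_ids);
      for (unsigned k = 0; k < num_ids; k++)
         printf(" %u", swz[k]);
      printf("\n");

      /* Bijectivity: every value in [0, num_ids) shows up exactly once. */
      for (unsigned k = 0; k < num_ids; k++) {
         assert(swz[k] < num_ids && !seen[swz[k]]);
         seen[swz[k]] = true;
      }

      /* For power-of-two sizes the permutation is exactly the bit
       * reversal of each index, as the comment claims. */
      if ((num_ids & (num_ids - 1)) == 0) {
         const unsigned bits = logbase2_ceil(num_ids);

         for (unsigned k = 0; k < num_ids; k++) {
            unsigned rev = 0;
            for (unsigned l = 0; l < bits; l++)
               rev |= ((k >> l) & 1) << (bits - 1 - l);
            assert(swz[k] == rev);
         }
      }
   }

   return 0;
}

For num_ids = 8 this prints 0 4 2 6 1 5 3 7 (the 3-bit reversal), and for a fused non-power-of-two configuration like num_ids = 6 it prints 0 3 2 5 1 4, still a bijection with the same "spread adjacent indices far apart" shape.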
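
The height bound quoted in the second comment can also be sanity-checked empirically. The sketch below drops into the same file as the previous snippet in place of its main(), reusing compute_swz() and logbase2_ceil(). It takes a hypothetical fused configuration of 6 pixel pipes and asserts that every W-wide band of H = 2^ceil(log2(num_ids/W)) rows touches all pipes; restricting the check to bands aligned to a multiple of H is an assumption of this sketch, since an arbitrarily placed band can straddle two such groups:

/* Drop-in replacement for main() in the previous snippet. */
int
main(void)
{
   /* Hypothetical fused part: 6 pixel pipes enabled, so num_ids is
    * not a power of two. */
   const unsigned num_ids = 6;
   unsigned swz[16];

   compute_swz(num_ids, swz);

   for (unsigned w = 1; w <= num_ids; w++) {
      /* Height bound quoted in the comment: 2^ceil(log2(num_ids/W)). */
      const unsigned h = 1u << logbase2_ceil((num_ids + w - 1) / w);

      /* Walk every distinct H-aligned band of rows; the table rows
       * repeat with period num_ids, so num_ids bands see them all. */
      for (unsigned b = 0; b < num_ids; b++) {
         bool covered[16] = { false };

         for (unsigned t = 0; t < h; t++) {
            const unsigned k = (b * h + t) % num_ids;

            /* Row k holds the num_ids-periodic pattern rotated by
             * swz[k]; a W-wide window picks up W consecutive pipe IDs
             * from that rotation, regardless of starting column. */
            for (unsigned c = 0; c < w; c++)
               covered[(c + swz[k]) % num_ids] = true;
         }

         for (unsigned id = 0; id < num_ids; id++)
            assert(covered[id]);
      }

      printf("W=%u: every aligned band of H=%u rows hits all %u pipes\n",
             w, h, num_ids);
   }

   return 0;
}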