intel/xehp: Switch to coarser cross-slice pixel hashing with table permutation.

The coarser 32x32 cross-slice hashing mode seems to lead to better L1
and L2 utilization due to the improved execution locality; however, it
can also lead to a bottleneck in a single slice, especially in
workloads that concentrate heavy rendering in small areas of the
screen (e.g. SynMark2 OglGeomPoint, OglTerrain*). This effect is
mitigated here by performing a permutation of the pixel pipe hashing
tables that ensures that adjacent rows map to pixel pipes as far away
from each other as possible in the caching hierarchy.

Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13569>
This commit is contained in:
Francisco Jerez 2021-10-12 23:57:53 -07:00
parent ef675e6857
commit 074bde9989
4 changed files with 51 additions and 3 deletions

View file

@ -939,6 +939,9 @@ upload_pixel_hashing_tables(struct iris_batch *batch)
iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), mode) {
mode.SliceHashingTableEnable = true;
mode.SliceHashingTableEnableMask = true;
mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask) > 1 ?
hashing32x32 : NormalMode);
mode.CrossSliceHashingModeMask = -1;
}
#endif
}

View file

@ -85,12 +85,54 @@ intel_compute_pixel_hash_table_nway(unsigned n, unsigned m, uint32_t mask,
assert(num_ids > 0);
/* Compute a permutation of the above indices that assigns indices
* as far apart as possible to adjacent entries. This permutation is
* designed to be equivalent to the bit reversal of each index in
* cases where num_ids is a power of two, but doesn't actually
* require it to be a power of two in order to satisfy the required
* properties (which is necessary to handle configurations with
* arbitrary non-power of two fusing). By construction, flipping
* bit l of its input will lead to a change in its result of the
* order of num_ids/2^(l+1) (see variable t below). The
* bijectivity of this permutation can be verified easily by
* induction.
*/
const unsigned bits = util_logbase2_ceil(num_ids);
unsigned swz[ARRAY_SIZE(phys_ids)];
for (unsigned k = 0; k < num_ids; k++) {
unsigned t = num_ids;
unsigned s = 0;
for (unsigned l = 0; l < bits; l++) {
if (k & (1u << l)) {
s += (t + 1) >> 1;
t >>= 1;
} else {
t = (t + 1) >> 1;
}
}
swz[k] = s;
}
/* Initialize the table with the cyclic repetition of a
* num_ids-periodic pattern.
*
* Note that the swz permutation only affects the ordering of rows.
* This is intentional in order to minimize the size of the
* contiguous area that needs to be rendered in parallel in order
* to utilize the whole GPU: A rendering rectangle of width W will
* need to be at least H blocks high, where H is bounded by
* 2^ceil(log2(num_ids/W)) thanks to the above definition of the swz
* permutation.
*/
for (unsigned i = 0; i < n; i++) {
for (unsigned j = 0; j < m; j++)
p[j + m * i] = phys_ids[(j + i) % num_ids];
const unsigned k = i % num_ids;
assert(swz[k] < num_ids);
for (unsigned j = 0; j < m; j++) {
p[j + m * i] = phys_ids[(j + swz[k]) % num_ids];
}
}
}

View file

@ -1372,7 +1372,7 @@
<field name="3D Scoreboard Hashing Mode" start="36" end="36" type="bool"/>
<field name="Subslice Hashing Table Enable" start="37" end="37" type="bool"/>
<field name="Slice Hashing Table Enable" start="38" end="38" type="bool"/>
<field name="Cross Slice Hashing Mode Mask" start="48" end="49" type="uint"/>
<field name="Cross Slice Hashing Mode Mask" start="48" end="49" type="int"/>
<field name="3D Scoreboard Hashing Mode Mask" start="52" end="52" type="bool"/>
<field name="Subslice Hashing Table Enable Mask" start="53" end="53" type="bool"/>
<field name="Slice Hashing Table Enable Mask" start="54" end="54" type="bool"/>

View file

@ -151,6 +151,9 @@ genX(emit_slice_hashing_state)(struct anv_device *device,
anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) {
mode.SliceHashingTableEnable = true;
mode.SliceHashingTableEnableMask = true;
mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask) > 1 ?
hashing32x32 : NormalMode);
mode.CrossSliceHashingModeMask = -1;
}
#endif
}