diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c
index 18900f2b5e0..c0a4c9130bc 100644
--- a/src/gallium/drivers/iris/iris_state.c
+++ b/src/gallium/drivers/iris/iris_state.c
@@ -990,12 +990,19 @@ upload_pixel_hashing_tables(struct iris_batch *batch)
                                           0, size, PIPE_MAP_WRITE,
                                           &transfer);
 
-   uint32_t ppipe_mask = 0;
+   /* Calculate the set of present pixel pipes, and another set of
+    * present pixel pipes with 2 dual subslices enabled; the latter
+    * will appear on the hashing table with twice the frequency of
+    * pixel pipes with a single dual subslice present.
+    */
+   uint32_t ppipe_mask1 = 0, ppipe_mask2 = 0;
    for (unsigned p = 0; p < ARRAY_SIZE(devinfo->ppipe_subslices); p++) {
       if (devinfo->ppipe_subslices[p])
-         ppipe_mask |= (1u << p);
+         ppipe_mask1 |= (1u << p);
+      if (devinfo->ppipe_subslices[p] > 1)
+         ppipe_mask2 |= (1u << p);
    }
-   assert(ppipe_mask);
+   assert(ppipe_mask1);
 
    struct GENX(SLICE_HASH_TABLE) table;
 
@@ -1008,7 +1015,8 @@ upload_pixel_hashing_tables(struct iris_batch *batch)
     * initialized to the same value.
     */
    for (unsigned i = 0; i < 7; i++)
-      intel_compute_pixel_hash_table_nway(16, 16, ppipe_mask, table.Entry[i][0]);
+      intel_compute_pixel_hash_table_nway(16, 16, ppipe_mask1, ppipe_mask2,
+                                          table.Entry[i][0]);
 
    GENX(SLICE_HASH_TABLE_pack)(NULL, map, &table);
 
@@ -1026,7 +1034,7 @@ upload_pixel_hashing_tables(struct iris_batch *batch)
    iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), mode) {
       mode.SliceHashingTableEnable = true;
      mode.SliceHashingTableEnableMask = true;
-      mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask) > 1 ?
+      mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask1) > 1 ?
                                     hashing32x32 : NormalMode);
       mode.CrossSliceHashingModeMask = -1;
    }
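
For illustration, the two-mask derivation above can be reproduced in a standalone sketch. The ppipe_subslices values below are made up for the example and don't correspond to any real fused part:

/* Illustration only: the two-mask derivation from the hunk above,
 * pulled out standalone with an invented pixel pipe configuration. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static const unsigned ppipe_subslices[4] = { 2, 1, 2, 0 };

int main(void)
{
   uint32_t ppipe_mask1 = 0, ppipe_mask2 = 0;

   for (unsigned p = 0; p < 4; p++) {
      if (ppipe_subslices[p] > 0)
         ppipe_mask1 |= (1u << p);   /* pipe present at all */
      if (ppipe_subslices[p] > 1)
         ppipe_mask2 |= (1u << p);   /* pipe hashed at twice the rate */
   }
   assert(ppipe_mask1);

   /* Prints ppipe_mask1 = 0x7, ppipe_mask2 = 0x5: pipes 0 and 2 will
    * cover about twice as many table entries as pipe 1. */
   printf("ppipe_mask1 = 0x%x, ppipe_mask2 = 0x%x\n",
          (unsigned)ppipe_mask1, (unsigned)ppipe_mask2);
   return 0;
}
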
diff --git a/src/intel/common/intel_pixel_hash.h b/src/intel/common/intel_pixel_hash.h
index f24528ddc53..1bc32e5f6d9 100644
--- a/src/intel/common/intel_pixel_hash.h
+++ b/src/intel/common/intel_pixel_hash.h
@@ -66,22 +66,37 @@ intel_compute_pixel_hash_table_3way(unsigned n, unsigned m,
  * Compute an \p n x \p m pixel hashing table usable as slice,
  * subslice or pixel pipe hashing table.  This generalizes the
  * previous 3-way hash table function to an arbitrary number of ways
- * given by the number of bits set in the \p mask argument, but
- * doesn't allow the specification of different frequencies for
- * different table indices.
+ * given by the number of bits set in the expression "mask1 | mask2".
+ * If a way is only set in one of the two mask arguments it will
+ * appear on the table with half the frequency of a way set on both
+ * masks.
  */
 UNUSED static void
-intel_compute_pixel_hash_table_nway(unsigned n, unsigned m, uint32_t mask,
+intel_compute_pixel_hash_table_nway(unsigned n, unsigned m,
+                                    uint32_t mask1, uint32_t mask2,
                                     uint32_t *p)
 {
-   /* Construct a table mapping consecutive indices to the physical
-    * indices given by the bits set on the mask argument.
+   /* If both masks are equal all ways are expected to show up with
+    * the same frequency on the final table, so we can zero out one of
+    * the masks in order to halve the number of IDs we need to handle.
     */
-   unsigned phys_ids[sizeof(mask) * CHAR_BIT];
+   if (mask1 == mask2)
+      mask2 = 0;
+
+   /* Construct a table mapping consecutive indices to the physical
+    * indices given by the bits set on the mask arguments.  Ways
+    * enabled on both masks will appear twice on the mapping, so
+    * they'll show up with twice the frequency on the final table.
+    */
+   unsigned phys_ids[(sizeof(mask1) + sizeof(mask2)) * CHAR_BIT];
    unsigned num_ids = 0;
-   u_foreach_bit(i, mask)
-      phys_ids[num_ids++] = i;
+
+   for (unsigned i = 0; i < sizeof(mask1) * CHAR_BIT; i++) {
+      if (mask1 & (1u << i))
+         phys_ids[num_ids++] = i;
+      if (mask2 & (1u << i))
+         phys_ids[num_ids++] = i;
+   }
 
    assert(num_ids > 0);
 
@@ -95,10 +110,11 @@ intel_compute_pixel_hash_table_nway(unsigned n, unsigned m, uint32_t mask,
     * bit l of its input will lead to a change in its result of the
     * order of num_ids/2^(l+1) (see variable t below).  The
     * bijectivity of this permutation can be verified easily by
-    * induction.
+    * induction.  This permutation is applied cyclically to the
+    * vertical indices of the hashing table constructed below.
     */
    const unsigned bits = util_logbase2_ceil(num_ids);
-   unsigned swz[ARRAY_SIZE(phys_ids)];
+   unsigned swzy[ARRAY_SIZE(phys_ids)];
 
    for (unsigned k = 0; k < num_ids; k++) {
       unsigned t = num_ids;
@@ -113,25 +129,103 @@ intel_compute_pixel_hash_table_nway(unsigned n, unsigned m, uint32_t mask,
          }
       }
 
-      swz[k] = s;
+      swzy[k] = s;
+   }
+
+   /* Compute a second permutation applied cyclically to the
+    * horizontal indices of the hashing table.  In cases where a
+    * single mask is present (which means that all ways are expected
+    * to have the same frequency) this permutation will be the
+    * identity and will have no effect.
+    *
+    * In cases where some ways have twice the frequency of the others,
+    * use a similar iterative halving of the range of the permutation
+    * as in the swzy[] permutation defined above, but instead of
+    * scanning the bits of its argument (the "k" variable above) in
+    * the opposite order (from LSB to MSB), proceed by halving the
+    * domain of the permutation in the same order as its range, which
+    * would lead to an identity permutation if it weren't for the
+    * LSB of its range being adjusted as early as possible instead
+    * of at the last iteration.
+    *
+    * The reason for the special casing of the LSB is that we want to
+    * avoid assigning adjacent IDs to adjacent elements of the table,
+    * since ways that appear duplicated in the phys_ids mapping above
+    * would then appear duplicated in adjacent positions of the final
+    * table, which would lead to poor utilization for small primitives
+    * that only cover a small contiguous portion of the hashing table
+    * and would have twice as much work as necessary submitted to the
+    * same way instead of spreading its processing over a larger
+    * number of ways.
+    */
+   unsigned swzx[ARRAY_SIZE(phys_ids)];
+
+   if (mask1 && mask2) {
+      for (unsigned k = 0; k < num_ids; k++) {
+         unsigned l = k;
+         unsigned t = num_ids;
+         unsigned s = 0;
+         bool in_range = false;
+
+         while (t > 1) {
+            const bool first_in_range = t <= m && !in_range;
+            in_range |= first_in_range;
+
+            if (l >= (t + 1) >> 1) {
+               /* Apply the s++ increment (which could only occur in
+                * the last t == 2 iteration if we were constructing an
+                * identity permutation) as soon as the domain of the
+                * permutation has been decomposed into a chunk smaller
+                * than the width of the hashing table \p m (which
+                * causes in_range to be first set to true), since
+                * doing it earlier would prevent any alternation
+                * between even and odd indices in the first \p m
+                * elements of swzx[], which are the only ones actually
+                * used.
+                *
+                * Subsequent (in_range == true) increments of s need
+                * to be doubled since they are selecting between
+                * indices of the same parity.
+                */
+               if (!in_range)
+                  s += (t + 1) >> 1;
+               else if (first_in_range)
+                  s++;
+               else
+                  s += (t + 1) >> 1 << 1;
+
+               l -= (t + 1) >> 1;
+               t >>= 1;
+            } else {
+               t = (t + 1) >> 1;
+            }
+         }
+
+         swzx[k] = s;
+      }
+   } else {
+      for (unsigned k = 0; k < num_ids; k++)
+         swzx[k] = k;
    }
 
    /* Initialize the table with the cyclic repetition of a
     * num_ids-periodic pattern.
     *
-    * Note that the swz permutation only affects the ordering of rows.
-    * This is intentional in order to minimize the size of the
-    * contiguous area that needs to be rendered in parallel in order
-    * to utilize the whole GPU: A rendering rectangle of width W will
-    * need to be at least H blocks high, where H is bounded by
-    * 2^ceil(log2(num_ids/W)) thanks to the above definition of the swz
-    * permutation.
+    * Note that the horizontal and vertical permutations (swzx and
+    * swzy respectively) are different, and the former is either an
+    * identity permutation or close to the identity.  This asymmetry
+    * is intentional in order to minimize the size of the contiguous
+    * area that needs to be rendered in parallel in order to utilize
+    * the whole GPU: In cases where swzx is the identity a rendering
+    * rectangle of width W will need to be at least H blocks high,
+    * where H is bounded by 2^ceil(log2(num_ids/W)) thanks to the
+    * above definition of the swzy permutation.
     */
    for (unsigned i = 0; i < n; i++) {
       const unsigned k = i % num_ids;
-      assert(swz[k] < num_ids);
       for (unsigned j = 0; j < m; j++) {
-         p[j + m * i] = phys_ids[(j + swz[k]) % num_ids];
+         const unsigned l = j % num_ids;
+         p[j + m * i] = phys_ids[(swzx[l] + swzy[k]) % num_ids];
       }
    }
 }
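
The frequency property is easy to sanity-check end to end by tallying how often each way appears in a 16x16 table produced by the new function. The following throwaway test is a sketch, assuming it is compiled inside the Mesa tree; the include list is a best guess at the util headers intel_pixel_hash.h depends on:

/* Hypothetical unit test for the two-frequency nway table. */
#include <assert.h>
#include <limits.h>
#include <stdio.h>

#include "util/macros.h"
#include "util/u_math.h"
#include "intel_pixel_hash.h"

int main(void)
{
   const uint32_t mask1 = 0x7, mask2 = 0x5;
   uint32_t table[16 * 16];
   unsigned counts[3] = { 0, 0, 0 };

   intel_compute_pixel_hash_table_nway(16, 16, mask1, mask2, table);

   /* Ways 0 and 2 are set in both masks, way 1 only in mask1, so the
    * tally should come out at roughly 102/51/102 of 256 entries. */
   for (unsigned i = 0; i < 16 * 16; i++)
      counts[table[i]]++;

   for (unsigned w = 0; w < 3; w++)
      printf("way %u: %u/256 entries\n", w, counts[w]);
   return 0;
}
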
diff --git a/src/intel/vulkan/genX_init_state.c b/src/intel/vulkan/genX_init_state.c
index 1586686fa99..bb1878032eb 100644
--- a/src/intel/vulkan/genX_init_state.c
+++ b/src/intel/vulkan/genX_init_state.c
@@ -116,12 +116,19 @@ genX(emit_slice_hashing_state)(struct anv_device *device,
       p.SubsliceHashingTableEnableMask = true;
    }
 #elif GFX_VERx10 == 125
-   uint32_t ppipe_mask = 0;
+   /* Calculate the set of present pixel pipes, and another set of
+    * present pixel pipes with 2 dual subslices enabled; the latter
+    * will appear on the hashing table with twice the frequency of
+    * pixel pipes with a single dual subslice present.
+    */
+   uint32_t ppipe_mask1 = 0, ppipe_mask2 = 0;
    for (unsigned p = 0; p < ARRAY_SIZE(device->info->ppipe_subslices); p++) {
-      if (device->info->ppipe_subslices[p])
-         ppipe_mask |= (1u << p);
+      if (device->info->ppipe_subslices[p] > 0)
+         ppipe_mask1 |= (1u << p);
+      if (device->info->ppipe_subslices[p] > 1)
+         ppipe_mask2 |= (1u << p);
    }
-   assert(ppipe_mask);
+   assert(ppipe_mask1);
 
    if (!device->slice_hash.alloc_size) {
       unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
@@ -139,7 +146,8 @@ genX(emit_slice_hashing_state)(struct anv_device *device,
        * need to be initialized to the same value.
        */
       for (unsigned i = 0; i < 7; i++)
-         intel_compute_pixel_hash_table_nway(16, 16, ppipe_mask, table.Entry[i][0]);
+         intel_compute_pixel_hash_table_nway(16, 16, ppipe_mask1, ppipe_mask2,
+                                             table.Entry[i][0]);
 
       GENX(SLICE_HASH_TABLE_pack)(NULL, device->slice_hash.map, &table);
    }
@@ -160,7 +168,7 @@ genX(emit_slice_hashing_state)(struct anv_device *device,
    anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) {
      mode.SliceHashingTableEnable = true;
       mode.SliceHashingTableEnableMask = true;
-      mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask) > 1 ?
+      mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask1) > 1 ?
                                     hashing32x32 : NormalMode);
       mode.CrossSliceHashingModeMask = -1;
       mode.FastClearOptimizationEnable = !device->physical->disable_fcv;
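
The swzx[] construction is easiest to follow on a worked example. The sketch below transcribes just that loop for num_ids == 5 (the mask1 = 0x7, mask2 = 0x5 case from the earlier sketches) and table width m == 16; NUM_IDS and WIDTH are names local to this sketch, not Mesa identifiers. It prints swzx = {0, 2, 4, 1, 3}, i.e. even table columns are filled before odd ones, and the assertion checks that the duplicated phys_ids entries of the 2x-frequency ways never land in adjacent columns, whatever per-row rotation the swzy[k] term applies:

/* Illustration only: a transcription of the swzx[] loop from the
 * intel_pixel_hash.h hunk above, instantiated for num_ids == 5. */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define NUM_IDS 5
#define WIDTH   16

int main(void)
{
   /* phys_ids as the patch builds it for mask1 = 0x7, mask2 = 0x5:
    * ways 0 and 2 are set in both masks and appear twice. */
   const unsigned phys_ids[NUM_IDS] = { 0, 0, 1, 2, 2 };
   unsigned swzx[NUM_IDS];

   for (unsigned k = 0; k < NUM_IDS; k++) {
      unsigned l = k;
      unsigned t = NUM_IDS;
      unsigned s = 0;
      bool in_range = false;

      while (t > 1) {
         const bool first_in_range = t <= WIDTH && !in_range;
         in_range |= first_in_range;

         if (l >= (t + 1) >> 1) {
            if (!in_range)
               s += (t + 1) >> 1;
            else if (first_in_range)
               s++;
            else
               s += (t + 1) >> 1 << 1;

            l -= (t + 1) >> 1;
            t >>= 1;
         } else {
            t = (t + 1) >> 1;
         }
      }

      swzx[k] = s;
   }

   /* No two adjacent columns of a row may map to the same way, for
    * any rotation c = swzy[k] a row might apply. */
   for (unsigned c = 0; c < NUM_IDS; c++) {
      for (unsigned j = 0; j + 1 < WIDTH; j++) {
         assert(phys_ids[(swzx[j % NUM_IDS] + c) % NUM_IDS] !=
                phys_ids[(swzx[(j + 1) % NUM_IDS] + c) % NUM_IDS]);
      }
   }

   for (unsigned k = 0; k < NUM_IDS; k++)
      printf("swzx[%u] = %u\n", k, swzx[k]);

   return 0;
}
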