mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-20 06:58:16 +02:00
Up to now the preferred SLM size was being set to the maximum preferred SLM size for GFX 12.5 platforms and to the workgroup SLM size for Xe2, but neither of those values is optimal. The optimal value is: <number of workgroups that can run per subslice> * <workgroup SLM size> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Signed-off-by: José Roberto de Souza <jose.souza@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28910>
184 lines
5.9 KiB
C
184 lines
5.9 KiB
C
/*
|
|
* Copyright 2024 Intel Corporation
|
|
* SPDX-License-Identifier: MIT
|
|
*/
|
|
|
|
#include "intel_compute_slm.h"
|
|
|
|
#include <assert.h>
|
|
|
|
#include "util/macros.h"
|
|
#include "util/u_math.h"
|
|
|
|
/* One row of an SLM size table: a hardware encoding and the size it selects. */
struct slm_encode {
   /* Encoded value handed back by the *_encode_size() helpers. */
   uint32_t encode;
   /* Allocation size, in kB, that this encoding stands for. */
   uint32_t size_in_kb;
};
|
|
|
|
static inline struct slm_encode *
|
|
slm_encode_lookup(struct slm_encode *table, unsigned int table_len, uint32_t bytes)
|
|
{
|
|
const uint32_t kbytes = DIV_ROUND_UP(bytes, 1024);
|
|
unsigned int i;
|
|
|
|
assert(kbytes <= table[table_len - 1].size_in_kb);
|
|
for (i = 0; i < table_len; i++) {
|
|
if (table[i].size_in_kb >= kbytes)
|
|
return &table[i];
|
|
}
|
|
|
|
return &table[table_len - 1];
|
|
}
|
|
|
|
/*
 * SLM allocation sizes and their encodings used for gen >= 20.
 *
 * Rows are ordered by ascending size, not by encoding value, because
 * slm_encode_lookup() returns the first row that is large enough.
 */
static struct slm_encode xe2_slm_allocation_size_table[] = {
   { .size_in_kb = 0,   .encode = 0x0, },
   { .size_in_kb = 1,   .encode = 0x1, },
   { .size_in_kb = 2,   .encode = 0x2, },
   { .size_in_kb = 4,   .encode = 0x3, },
   { .size_in_kb = 8,   .encode = 0x4, },
   { .size_in_kb = 16,  .encode = 0x5, },
   { .size_in_kb = 24,  .encode = 0x8, },
   { .size_in_kb = 32,  .encode = 0x6, },
   { .size_in_kb = 48,  .encode = 0x9, },
   { .size_in_kb = 64,  .encode = 0x7, },
   { .size_in_kb = 96,  .encode = 0xA, },
   { .size_in_kb = 128, .encode = 0xB, },
   { .size_in_kb = 192, .encode = 0xC, },
   { .size_in_kb = 256, .encode = 0xD, },
   { .size_in_kb = 384, .encode = 0xE, },
};
|
|
|
|
/* Shared Local Memory Size is specified as powers of two,
|
|
* and also have a Gen-dependent minimum value if not zero.
|
|
*/
|
|
uint32_t
|
|
intel_compute_slm_calculate_size(unsigned gen, uint32_t bytes)
|
|
{
|
|
if (gen >= 20) {
|
|
struct slm_encode *slm_encode;
|
|
|
|
slm_encode = slm_encode_lookup(xe2_slm_allocation_size_table,
|
|
ARRAY_SIZE(xe2_slm_allocation_size_table),
|
|
bytes);
|
|
return slm_encode->size_in_kb * 1024;
|
|
}
|
|
|
|
assert(bytes <= 64 * 1024);
|
|
if (bytes > 0)
|
|
return MAX2(util_next_power_of_two(bytes), gen >= 9 ? 1024 : 4096);
|
|
else
|
|
return 0;
|
|
}
|
|
|
|
uint32_t
|
|
intel_compute_slm_encode_size(unsigned gen, uint32_t bytes)
|
|
{
|
|
uint32_t slm_size;
|
|
|
|
if (bytes == 0)
|
|
return 0;
|
|
|
|
if (gen >= 20) {
|
|
struct slm_encode *slm_encode;
|
|
|
|
slm_encode = slm_encode_lookup(xe2_slm_allocation_size_table,
|
|
ARRAY_SIZE(xe2_slm_allocation_size_table),
|
|
bytes);
|
|
return slm_encode->encode;
|
|
}
|
|
|
|
/* Shared Local Memory is specified as powers of two, and encoded in
|
|
* INTERFACE_DESCRIPTOR_DATA with the following representations:
|
|
*
|
|
* Size | 0 kB | 1 kB | 2 kB | 4 kB | 8 kB | 16 kB | 32 kB | 64 kB |
|
|
* -------------------------------------------------------------------
|
|
* Gfx7-8 | 0 | none | none | 1 | 2 | 4 | 8 | 16 |
|
|
* -------------------------------------------------------------------
|
|
* Gfx9+ | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
|
|
*/
|
|
|
|
slm_size = intel_compute_slm_calculate_size(gen, bytes);
|
|
assert(util_is_power_of_two_nonzero(slm_size));
|
|
|
|
if (gen >= 9) {
|
|
/* Turn an exponent of 10 (1024 kB) into 1. */
|
|
assert(slm_size >= 1024);
|
|
slm_size = ffs(slm_size) - 10;
|
|
} else {
|
|
assert(slm_size >= 4096);
|
|
/* Convert to the pre-Gfx9 representation. */
|
|
slm_size = slm_size / 4096;
|
|
}
|
|
|
|
return slm_size;
|
|
}
|
|
|
|
/* encode = 0 sets to largest SLM size supported in subslice */
|
|
static struct slm_encode preferred_slm_allocation_size_table[] = {
|
|
{ .encode = 0x8, .size_in_kb = 0, },
|
|
{ .encode = 0x9, .size_in_kb = 16, },
|
|
{ .encode = 0xa, .size_in_kb = 32, },
|
|
{ .encode = 0xb, .size_in_kb = 64, },
|
|
{ .encode = 0xc, .size_in_kb = 96, },
|
|
{ .encode = 0xd, .size_in_kb = 128, },
|
|
};
|
|
|
|
/*
 * Preferred SLM allocation sizes and their encodings used for gen >= 20.
 *
 * Rows must stay ordered by ascending size: slm_encode_lookup() returns the
 * first row that is large enough.
 */
static struct slm_encode xe2_preferred_slm_allocation_size_table[] = {
   { .size_in_kb = 0,   .encode = 0x0, },
   { .size_in_kb = 16,  .encode = 0x1, },
   { .size_in_kb = 32,  .encode = 0x2, },
   { .size_in_kb = 64,  .encode = 0x3, },
   { .size_in_kb = 96,  .encode = 0x4, },
   { .size_in_kb = 128, .encode = 0x5, },
   { .size_in_kb = 160, .encode = 0x6, },
   { .size_in_kb = 192, .encode = 0x7, },
   { .size_in_kb = 224, .encode = 0x8, },
   { .size_in_kb = 256, .encode = 0x9, },
   { .size_in_kb = 384, .encode = 0xA, },
};
|
|
|
|
static uint32_t
|
|
intel_compute_preferred_slm_encode_size(unsigned gen, uint32_t bytes)
|
|
{
|
|
struct slm_encode *table;
|
|
unsigned int table_len;
|
|
|
|
if (gen >= 20) {
|
|
table = xe2_preferred_slm_allocation_size_table;
|
|
table_len = ARRAY_SIZE(xe2_preferred_slm_allocation_size_table);
|
|
} else {
|
|
table = preferred_slm_allocation_size_table;
|
|
table_len = ARRAY_SIZE(preferred_slm_allocation_size_table);
|
|
}
|
|
|
|
return slm_encode_lookup(table, table_len, bytes)->encode;
|
|
}
|
|
|
|
/**
|
|
* Compute a shared local memory size to be allocated for each sub-slice.
|
|
* It estimate how many workgroups will run concurrently per sub-slice and
|
|
* multiply that per each workgroup SLM size.
|
|
*/
|
|
uint32_t
|
|
intel_compute_preferred_slm_calc_encode_size(const struct intel_device_info *devinfo,
|
|
const uint32_t slm_size_per_workgroup,
|
|
const uint32_t invocations_per_workgroup,
|
|
const uint8_t cs_simd)
|
|
{
|
|
const uint32_t max_preferred_slm_size = intel_device_info_get_max_preferred_slm_size(devinfo);
|
|
const uint32_t invocations_per_ss = intel_device_info_get_eu_count_first_subslice(devinfo) *
|
|
devinfo->num_thread_per_eu * cs_simd;
|
|
uint32_t preferred_slm_size;
|
|
|
|
if (slm_size_per_workgroup) {
|
|
uint32_t workgroups_per_ss = invocations_per_ss / invocations_per_workgroup;
|
|
|
|
preferred_slm_size = workgroups_per_ss * slm_size_per_workgroup;
|
|
preferred_slm_size = MIN2(preferred_slm_size, max_preferred_slm_size);
|
|
} else {
|
|
preferred_slm_size = 0;
|
|
}
|
|
|
|
assert(preferred_slm_size >= slm_size_per_workgroup);
|
|
return intel_compute_preferred_slm_encode_size(devinfo->ver, preferred_slm_size);
|
|
}
|