util: completely rewrite and do AMD Zen L3 cache pinning correctly
This queries the CPU cache topology correctly.

Acked-by: Jose Fonseca <jfonseca@vmware.com>
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7054>
parent 4f2c2307f9
commit d8ea509965
7 changed files with 112 additions and 44 deletions
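In practice, the change replaces the index-based helper util_pin_thread_to_L3() with precomputed per-L3 affinity masks. Below is a minimal sketch of the new calling convention, assuming only names visible in the diff (util_cpu_caps, util_get_current_cpu, util_set_thread_affinity, UTIL_MAX_CPUS) and that u_cpu_detect.h and u_thread.h are included; the helper name pin_to_L3_of_current_cpu is hypothetical.

#include <stdbool.h>

/* Pin a thread to whichever L3 cache the calling CPU belongs to. */
static bool
pin_to_L3_of_current_cpu(thrd_t thread)
{
   int cpu = util_get_current_cpu();

   /* L3_affinity_mask is only allocated when the Zen topology path ran. */
   if (cpu < 0 || !util_cpu_caps.L3_affinity_mask)
      return false;

   unsigned L3_index = util_cpu_caps.cpu_to_L3[cpu];

   /* The mask encodes exactly which CPUs share this L3, even when the
    * kernel's CPU numbering interleaves SMT siblings across caches. */
   return util_set_thread_affinity(thread,
                                   util_cpu_caps.L3_affinity_mask[L3_index],
                                   NULL, UTIL_MAX_CPUS);
}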
@@ -2002,8 +2002,9 @@ tc_set_context_param(struct pipe_context *_pipe,
    if (param == PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE) {
       /* Pin the gallium thread as requested. */
-      util_pin_thread_to_L3(tc->queue.threads[0], value,
-                            util_cpu_caps.cores_per_L3);
+      util_set_thread_affinity(tc->queue.threads[0],
+                               util_cpu_caps.L3_affinity_mask[value],
+                               NULL, UTIL_MAX_CPUS);

       /* Execute this immediately (without enqueuing).
        * It's required to be thread-safe.
@@ -311,8 +311,9 @@ static void amdgpu_pin_threads_to_L3_cache(struct radeon_winsys *rws,
 {
    struct amdgpu_winsys *ws = amdgpu_winsys(rws);

-   util_pin_thread_to_L3(ws->cs_queue.threads[0], cache,
-                         util_cpu_caps.cores_per_L3);
+   util_set_thread_affinity(ws->cs_queue.threads[0],
+                            util_cpu_caps.L3_affinity_mask[cache],
+                            NULL, UTIL_MAX_CPUS);
 }

 static uint32_t kms_handle_hash(const void *key)
@@ -798,8 +798,9 @@ static void radeon_pin_threads_to_L3_cache(struct radeon_winsys *ws,
    struct radeon_drm_winsys *rws = (struct radeon_drm_winsys*)ws;

    if (util_queue_is_initialized(&rws->cs_queue)) {
-      util_pin_thread_to_L3(rws->cs_queue.threads[0], cache,
-                            util_cpu_caps.cores_per_L3);
+      util_set_thread_affinity(rws->cs_queue.threads[0],
+                               util_cpu_caps.L3_affinity_mask[cache],
+                               NULL, UTIL_MAX_CPUS);
    }
 }
@@ -139,7 +139,7 @@ prepare_draw(struct st_context *st, struct gl_context *ctx)
        ++st->pin_thread_counter % 512 == 0)) {
       int cpu = util_get_current_cpu();
       if (cpu >= 0) {
-         unsigned L3_cache = cpu / util_cpu_caps.cores_per_L3;
+         unsigned L3_cache = util_cpu_caps.cpu_to_L3[cpu];

          pipe->set_context_param(pipe,
                                  PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE,
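Why `cpu / cores_per_L3` was wrong: the kernel's CPU numbering does not have to keep cache-sharing CPUs contiguous. Linux commonly enumerates the second SMT thread of every core after all first threads (the topology comment in the hunk below gives CPU 0 and CPU 12 as SMT siblings). A hypothetical 16-thread, 2-CCX layout, for illustration only:

/* 8 logical CPUs per L3: CPUs 0-3 and their SMT siblings 8-11 share
 * L3 #0; CPUs 4-7 and 12-15 share L3 #1. */
static const uint16_t example_cpu_to_L3[16] = {
   0, 0, 0, 0,  1, 1, 1, 1,   /* first SMT thread of each core  */
   0, 0, 0, 0,  1, 1, 1, 1    /* second SMT thread of each core */
};

/* The old formula (cpu / cores_per_L3 == cpu / 8) would put CPU 8 on
 * L3 #1, but it actually shares L3 #0 with CPU 0. */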
@@ -37,8 +37,12 @@
 #include "util/u_debug.h"
 #include "u_cpu_detect.h"
+#include "u_math.h"
 #include "c11/threads.h"

+#include <stdio.h>
+#include <inttypes.h>
+
 #if defined(PIPE_ARCH_PPC)
 #if defined(PIPE_OS_APPLE)
 #include <sys/sysctl.h>
@@ -83,9 +87,7 @@
 #endif

-#ifdef DEBUG
 DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", false)
-#endif

 struct util_cpu_caps util_cpu_caps;
@@ -432,21 +434,104 @@ check_os_arm_support(void)
 static void
 get_cpu_topology(void)
 {
-   /* Default. This is correct if L3 is not present or there is only one. */
+   /* Default. This is OK if L3 is not present or there is only one. */
    util_cpu_caps.cores_per_L3 = util_cpu_caps.nr_cpus;
+   util_cpu_caps.num_L3_caches = 1;

 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    /* AMD Zen */
    if (util_cpu_caps.x86_cpu_type == 0x17) {
       uint32_t regs[4];

-      /* Query the L3 cache topology information. */
+      /* Query the L3 cache count. */
       cpuid_count(0x8000001D, 3, regs);
       unsigned cache_level = (regs[0] >> 5) & 0x7;
-      unsigned cores_per_cache = ((regs[0] >> 14) & 0xfff) + 1;
+      unsigned cores_per_L3 = ((regs[0] >> 14) & 0xfff) + 1;

-      if (cache_level == 3)
-         util_cpu_caps.cores_per_L3 = cores_per_cache;
+      if (cache_level != 3 || cores_per_L3 == util_cpu_caps.nr_cpus)
+         return;
+
+      uint32_t saved_mask[UTIL_MAX_CPUS / 32] = {0};
+      uint32_t mask[UTIL_MAX_CPUS / 32] = {0};
+      uint32_t allowed_mask[UTIL_MAX_CPUS / 32] = {0};
+      uint32_t apic_id[UTIL_MAX_CPUS];
+      bool saved = false;
+
+      /* Query APIC IDs from each CPU core.
+       *
+       * An APIC ID is a logical ID of the CPU with respect to the cache
+       * hierarchy, meaning that consecutive APIC IDs are neighbours in
+       * the hierarchy, e.g. sharing the same cache.
+       *
+       * For example, CPU 0 can have APIC ID 0 and CPU 12 can have APIC ID 1,
+       * which means that CPUs 0 and 12 are next to each other
+       * (e.g. they are 2 threads belonging to 1 SMT2 core).
+       *
+       * We need to find out which CPUs share the same L3 cache, and they can
+       * be all over the place.
+       *
+       * Querying the APIC ID can only be done by pinning the current thread
+       * to each core. The original affinity mask is saved.
+       */
+      for (unsigned i = 0; i < util_cpu_caps.nr_cpus && i < UTIL_MAX_CPUS;
+           i++) {
+         uint32_t cpu_bit = 1u << (i % 32);
+
+         mask[i / 32] = cpu_bit;
+
+         if (util_set_current_thread_affinity(mask,
+                                              !saved ? saved_mask : NULL,
+                                              UTIL_MAX_CPUS)) {
+            saved = true;
+            allowed_mask[i / 32] |= cpu_bit;
+
+            /* Query the APIC ID of the current core. */
+            cpuid(0x00000001, regs);
+            apic_id[i] = regs[1] >> 24;
+         }
+         mask[i / 32] = 0;
+      }
+
+      if (saved) {
+         /* We succeeded in using at least one CPU. */
+         util_cpu_caps.num_L3_caches = util_cpu_caps.nr_cpus / cores_per_L3;
+         util_cpu_caps.cores_per_L3 = cores_per_L3;
+         util_cpu_caps.L3_affinity_mask = calloc(sizeof(util_affinity_mask),
+                                                 util_cpu_caps.num_L3_caches);
+
+         for (unsigned i = 0; i < util_cpu_caps.nr_cpus && i < UTIL_MAX_CPUS;
+              i++) {
+            uint32_t cpu_bit = 1u << (i % 32);
+
+            if (allowed_mask[i / 32] & cpu_bit) {
+               /* Each APIC ID bit represents a topology level, so we need
+                * to round up to the next power of two.
+                */
+               unsigned L3_index = apic_id[i] /
+                                   util_next_power_of_two(cores_per_L3);
+
+               util_cpu_caps.L3_affinity_mask[L3_index][i / 32] |= cpu_bit;
+               util_cpu_caps.cpu_to_L3[i] = L3_index;
+            }
+         }
+
+         if (debug_get_option_dump_cpu()) {
+            fprintf(stderr, "CPU <-> L3 cache mapping:\n");
+            for (unsigned i = 0; i < util_cpu_caps.num_L3_caches; i++) {
+               fprintf(stderr, " - L3 %u mask = ", i);
+               for (int j = util_cpu_caps.nr_cpus - 1; j >= 0; j -= 32)
+                  fprintf(stderr, "%08x ", util_cpu_caps.L3_affinity_mask[i][j / 32]);
+               fprintf(stderr, "\n");
+            }
+         }
+
+         /* Restore the original affinity mask. */
+         util_set_current_thread_affinity(saved_mask, NULL, UTIL_MAX_CPUS);
+      } else {
+         if (debug_get_option_dump_cpu())
+            fprintf(stderr, "Cannot set thread affinity for any thread.\n");
+      }
    }
 #endif
 }
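The division by util_next_power_of_two() deserves a note: APIC IDs pack each topology level into a power-of-two-sized bit field, so when an L3 is shared by a non-power-of-two number of logical CPUs, the IDs within one L3 still occupy a power-of-two-wide slot. A sketch with hypothetical numbers (6 logical CPUs per L3, e.g. 3 cores x SMT2 per CCX); the helper name is not part of the patch:

/* With cores_per_L3 = 6, the per-L3 APIC ID stride is
 * util_next_power_of_two(6) = 8, so APIC IDs 0..5 belong to L3 #0,
 * IDs 8..13 to L3 #1, and IDs 6..7 are never assigned. */
static unsigned
L3_index_from_apic_id(unsigned apic_id, unsigned cores_per_L3)
{
   return apic_id / util_next_power_of_two(cores_per_L3);
}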
@@ -606,7 +691,6 @@ util_cpu_detect_once(void)
    get_cpu_topology();

-#ifdef DEBUG
    if (debug_get_option_dump_cpu()) {
       debug_printf("util_cpu_caps.nr_cpus = %u\n", util_cpu_caps.nr_cpus);
@@ -643,7 +727,6 @@ util_cpu_detect_once(void)
       debug_printf("util_cpu_caps.has_avx512vl = %u\n", util_cpu_caps.has_avx512vl);
       debug_printf("util_cpu_caps.has_avx512vbmi = %u\n", util_cpu_caps.has_avx512vbmi);
    }
-#endif
 }

 static once_flag cpu_once_flag = ONCE_FLAG_INIT;
|
@ -37,12 +37,14 @@
|
||||||
|
|
||||||
|
|
||||||
#include "pipe/p_config.h"
|
#include "pipe/p_config.h"
|
||||||
|
#include "util/u_thread.h"
|
||||||
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
typedef uint32_t util_affinity_mask[UTIL_MAX_CPUS / 32];
|
||||||
|
|
||||||
struct util_cpu_caps {
|
struct util_cpu_caps {
|
||||||
int nr_cpus;
|
int nr_cpus;
|
||||||
|
|
@@ -50,7 +52,6 @@ struct util_cpu_caps {
    /* Feature flags */
    int x86_cpu_type;
    unsigned cacheline;
-   unsigned cores_per_L3;

    unsigned has_intel:1;
    unsigned has_tsc:1;
@@ -84,6 +85,13 @@ struct util_cpu_caps {
    unsigned has_avx512bw:1;
    unsigned has_avx512vl:1;
    unsigned has_avx512vbmi:1;
+
+   unsigned num_L3_caches;
+   unsigned cores_per_L3;
+
+   uint16_t cpu_to_L3[UTIL_MAX_CPUS];
+   /* Affinity masks for each L3 cache. */
+   util_affinity_mask *L3_affinity_mask;
 };

 extern struct util_cpu_caps
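With the new fields, testing whether a CPU belongs to a given L3 is a plain bit lookup. A sketch (the helper name cpu_in_L3 is hypothetical, not part of the patch):

static inline bool
cpu_in_L3(unsigned L3_index, unsigned cpu)
{
   /* Bit layout matches the masks built in get_cpu_topology(). */
   return util_cpu_caps.L3_affinity_mask[L3_index][cpu / 32] &
          (1u << (cpu % 32));
}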
@@ -62,6 +62,7 @@
 /* For util_set_thread_affinity to size the mask. */
 #define UTIL_MAX_CPUS 1024 /* this should be enough */
+#define UTIL_MAX_L3_CACHES UTIL_MAX_CPUS

 static inline int
 util_get_current_cpu(void)
@@ -198,33 +199,6 @@ util_set_current_thread_affinity(const uint32_t *mask,
 #endif
 }

-/**
- * An AMD Zen CPU consists of multiple modules where each module has its own L3
- * cache. Inter-thread communication such as locks and atomics between modules
- * is very expensive. It's desirable to pin a group of closely cooperating
- * threads to one group of cores sharing L3.
- *
- * \param thread        thread
- * \param L3_index      index of the L3 cache
- * \param cores_per_L3  number of CPU cores shared by one L3
- */
-static inline bool
-util_pin_thread_to_L3(thrd_t thread, unsigned L3_index, unsigned cores_per_L3)
-{
-   unsigned num_mask_bits = DIV_ROUND_UP((L3_index + 1) * cores_per_L3, 32);
-   uint32_t mask[UTIL_MAX_CPUS / 32];
-
-   assert((L3_index + 1) * cores_per_L3 <= UTIL_MAX_CPUS);
-
-   for (unsigned i = 0; i < cores_per_L3; i++) {
-      unsigned core = L3_index * cores_per_L3 + i;
-
-      mask[core / 32] |= 1u << (core % 32);
-   }
-
-   return util_set_thread_affinity(thread, mask, NULL, num_mask_bits);
-}
-
 /*
  * Thread statistics.
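Beyond assuming that CPUs sharing an L3 are numbered consecutively (false on Zen when SMT siblings are interleaved), the removed helper also ORed bits into a stack array it never zeroed, as the quoted lines show, so the resulting mask could carry garbage bits. Even a consecutive-numbering variant would at minimum have needed:

uint32_t mask[UTIL_MAX_CPUS / 32] = {0};   /* zero-init before OR-ing bits in */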