diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c
index f9d22c3bcb9..d8dcad57793 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.c
+++ b/src/gallium/auxiliary/util/u_threaded_context.c
@@ -2002,8 +2002,9 @@ tc_set_context_param(struct pipe_context *_pipe,
 
    if (param == PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE) {
       /* Pin the gallium thread as requested. */
-      util_pin_thread_to_L3(tc->queue.threads[0], value,
-                            util_cpu_caps.cores_per_L3);
+      util_set_thread_affinity(tc->queue.threads[0],
+                               util_cpu_caps.L3_affinity_mask[value],
+                               NULL, UTIL_MAX_CPUS);
 
       /* Execute this immediately (without enqueuing).
        * It's required to be thread-safe.
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
index d554ea5a67c..8a0aedfed64 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -311,8 +311,9 @@ static void amdgpu_pin_threads_to_L3_cache(struct radeon_winsys *rws,
 {
    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
 
-   util_pin_thread_to_L3(ws->cs_queue.threads[0], cache,
-                         util_cpu_caps.cores_per_L3);
+   util_set_thread_affinity(ws->cs_queue.threads[0],
+                            util_cpu_caps.L3_affinity_mask[cache],
+                            NULL, UTIL_MAX_CPUS);
 }
 
 static uint32_t kms_handle_hash(const void *key)
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
index b9a092d9ae4..569d273a1f7 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -798,8 +798,9 @@ static void radeon_pin_threads_to_L3_cache(struct radeon_winsys *ws,
    struct radeon_drm_winsys *rws = (struct radeon_drm_winsys*)ws;
 
    if (util_queue_is_initialized(&rws->cs_queue)) {
-      util_pin_thread_to_L3(rws->cs_queue.threads[0], cache,
-                            util_cpu_caps.cores_per_L3);
+      util_set_thread_affinity(rws->cs_queue.threads[0],
+                               util_cpu_caps.L3_affinity_mask[cache],
+                               NULL, UTIL_MAX_CPUS);
    }
 }
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index 0e22d0c5a0d..676db28df96 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -139,7 +139,7 @@ prepare_draw(struct st_context *st, struct gl_context *ctx)
                 ++st->pin_thread_counter % 512 == 0)) {
       int cpu = util_get_current_cpu();
       if (cpu >= 0) {
-         unsigned L3_cache = cpu / util_cpu_caps.cores_per_L3;
+         unsigned L3_cache = util_cpu_caps.cpu_to_L3[cpu];
 
          pipe->set_context_param(pipe,
                                  PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE,
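All four call sites above share one pattern: rather than deriving a contiguous core range from an L3 index, they pass util_set_thread_affinity() a per-L3 mask that u_cpu_detect precomputes (see the u_cpu_detect.c change below). A minimal sketch of a new caller following the same pattern, using only the fields this patch introduces; pin_queue_thread_to_current_L3() is a hypothetical name, not part of the change:

   #include "util/u_cpu_detect.h"
   #include "util/u_thread.h"

   /* Hypothetical helper: pin a queue thread to the L3 cache that the
    * calling thread is currently running on.
    */
   static bool
   pin_queue_thread_to_current_L3(thrd_t thread)
   {
      int cpu = util_get_current_cpu();

      /* L3_affinity_mask is only allocated when the Zen topology query
       * succeeds; do nothing otherwise.
       */
      if (cpu < 0 || cpu >= UTIL_MAX_CPUS || !util_cpu_caps.L3_affinity_mask)
         return false;

      unsigned L3_index = util_cpu_caps.cpu_to_L3[cpu];

      /* The precomputed mask contains exactly the cores sharing that L3. */
      return util_set_thread_affinity(thread,
                                      util_cpu_caps.L3_affinity_mask[L3_index],
                                      NULL, UTIL_MAX_CPUS);
   }

The NULL/UTIL_MAX_CPUS arguments mirror the call sites above: no previous mask is requested back, and the full mask width is passed.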
diff --git a/src/util/u_cpu_detect.c b/src/util/u_cpu_detect.c
index ab064957382..af3663a8bd6 100644
--- a/src/util/u_cpu_detect.c
+++ b/src/util/u_cpu_detect.c
@@ -37,8 +37,12 @@
 
 #include "util/u_debug.h"
 #include "u_cpu_detect.h"
+#include "u_math.h"
 #include "c11/threads.h"
 
+#include <stdio.h>
+#include <inttypes.h>
+
 #if defined(PIPE_ARCH_PPC)
 #if defined(PIPE_OS_APPLE)
 #include <sys/sysctl.h>
@@ -83,9 +87,7 @@
 #endif
 
 
-#ifdef DEBUG
 DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", false)
-#endif
 
 
 struct util_cpu_caps util_cpu_caps;
@@ -432,21 +434,104 @@
 static void
 get_cpu_topology(void)
 {
-   /* Default. This is correct if L3 is not present or there is only one. */
+   /* Default. This is OK if L3 is not present or there is only one. */
    util_cpu_caps.cores_per_L3 = util_cpu_caps.nr_cpus;
+   util_cpu_caps.num_L3_caches = 1;
 
 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    /* AMD Zen */
    if (util_cpu_caps.x86_cpu_type == 0x17) {
      uint32_t regs[4];
 
-      /* Query the L3 cache topology information. */
+      /* Query the L3 cache count. */
       cpuid_count(0x8000001D, 3, regs);
       unsigned cache_level = (regs[0] >> 5) & 0x7;
-      unsigned cores_per_cache = ((regs[0] >> 14) & 0xfff) + 1;
+      unsigned cores_per_L3 = ((regs[0] >> 14) & 0xfff) + 1;
 
-      if (cache_level == 3)
-         util_cpu_caps.cores_per_L3 = cores_per_cache;
+      if (cache_level != 3 || cores_per_L3 == util_cpu_caps.nr_cpus)
+         return;
+
+      uint32_t saved_mask[UTIL_MAX_CPUS / 32] = {0};
+      uint32_t mask[UTIL_MAX_CPUS / 32] = {0};
+      uint32_t allowed_mask[UTIL_MAX_CPUS / 32] = {0};
+      uint32_t apic_id[UTIL_MAX_CPUS];
+      bool saved = false;
+
+      /* Query APIC IDs from each CPU core.
+       *
+       * An APIC ID is a logical ID of the CPU with respect to the cache
+       * hierarchy, meaning that consecutive APIC IDs are neighbours in
+       * the hierarchy, e.g. sharing the same cache.
+       *
+       * For example, CPU 0 can have APIC ID 0 and CPU 12 can have APIC ID 1,
+       * which means that both CPU 0 and 12 are next to each other.
+       * (e.g. they are 2 threads belonging to 1 SMT2 core)
+       *
+       * We need to find out which CPUs share the same L3 cache, and they can
+       * be all over the place.
+       *
+       * Querying the APIC ID can only be done by pinning the current thread
+       * to each core. The original affinity mask is saved.
+       */
+      for (unsigned i = 0; i < util_cpu_caps.nr_cpus && i < UTIL_MAX_CPUS;
+           i++) {
+         uint32_t cpu_bit = 1u << (i % 32);
+
+         mask[i / 32] = cpu_bit;
+
+         if (util_set_current_thread_affinity(mask,
+                                              !saved ? saved_mask : NULL,
+                                              UTIL_MAX_CPUS)) {
+            saved = true;
+            allowed_mask[i / 32] |= cpu_bit;
+
+            /* Query the APIC ID of the current core. */
+            cpuid(0x00000001, regs);
+            apic_id[i] = regs[1] >> 24;
+         }
+         mask[i / 32] = 0;
+      }
+
+      if (saved) {
+         /* We succeeded in using at least one CPU. */
+         util_cpu_caps.num_L3_caches = util_cpu_caps.nr_cpus / cores_per_L3;
+         util_cpu_caps.cores_per_L3 = cores_per_L3;
+         util_cpu_caps.L3_affinity_mask = calloc(sizeof(util_affinity_mask),
+                                                 util_cpu_caps.num_L3_caches);
+
+         for (unsigned i = 0; i < util_cpu_caps.nr_cpus && i < UTIL_MAX_CPUS;
+              i++) {
+            uint32_t cpu_bit = 1u << (i % 32);
+
+            if (allowed_mask[i / 32] & cpu_bit) {
+               /* Each APIC ID bit represents a topology level, so we need
+                * to round up to the next power of two.
+                */
+               unsigned L3_index = apic_id[i] /
+                                   util_next_power_of_two(cores_per_L3);
+
+               util_cpu_caps.L3_affinity_mask[L3_index][i / 32] |= cpu_bit;
+               util_cpu_caps.cpu_to_L3[i] = L3_index;
+            }
+         }
+
+         if (debug_get_option_dump_cpu()) {
+            fprintf(stderr, "CPU <-> L3 cache mapping:\n");
+            for (unsigned i = 0; i < util_cpu_caps.num_L3_caches; i++) {
+               fprintf(stderr, "  - L3 %u mask = ", i);
+               for (int j = util_cpu_caps.nr_cpus - 1; j >= 0; j -= 32)
+                  fprintf(stderr, "%08x ", util_cpu_caps.L3_affinity_mask[i][j / 32]);
+               fprintf(stderr, "\n");
+            }
+         }
+
+         /* Restore the original affinity mask. */
+         util_set_current_thread_affinity(saved_mask, NULL, UTIL_MAX_CPUS);
+      } else {
+         if (debug_get_option_dump_cpu())
+            fprintf(stderr, "Cannot set thread affinity for any thread.\n");
+      }
    }
 #endif
 }
@@ -606,7 +691,6 @@ util_cpu_detect_once(void)
 
    get_cpu_topology();
 
-#ifdef DEBUG
    if (debug_get_option_dump_cpu()) {
       debug_printf("util_cpu_caps.nr_cpus = %u\n", util_cpu_caps.nr_cpus);
 
@@ -643,7 +727,6 @@ util_cpu_detect_once(void)
       debug_printf("util_cpu_caps.has_avx512vl = %u\n", util_cpu_caps.has_avx512vl);
       debug_printf("util_cpu_caps.has_avx512vbmi = %u\n", util_cpu_caps.has_avx512vbmi);
    }
-#endif
 }
 
 static once_flag cpu_once_flag = ONCE_FLAG_INIT;
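The division by util_next_power_of_two(cores_per_L3) is the subtle step in get_cpu_topology(): as the comment notes, each APIC ID bit represents a topology level, so IDs are allocated to each L3 in power-of-two blocks even when fewer cores are populated. A self-contained sketch of that arithmetic; the 6-threads-per-L3 figure is an assumed example (e.g. a 12-thread Zen part with one core disabled per CCX), and next_power_of_two() is a local stand-in for util_next_power_of_two() from u_math.h:

   #include <stdio.h>

   /* Local stand-in for util_next_power_of_two(). */
   static unsigned
   next_power_of_two(unsigned x)
   {
      unsigned p = 1;
      while (p < x)
         p <<= 1;
      return p;
   }

   int
   main(void)
   {
      unsigned cores_per_L3 = 6;                        /* assumed CPUID result */
      unsigned block = next_power_of_two(cores_per_L3); /* = 8 */

      /* APIC IDs 0-5 map to L3 0 and 8-13 map to L3 1; IDs 6-7 and 14-15
       * are holes left by the power-of-two allocation and never occur.
       */
      for (unsigned apic_id = 0; apic_id < 16; apic_id++)
         printf("APIC ID %2u -> L3 %u\n", apic_id, apic_id / block);

      return 0;
   }

Dividing a raw APIC ID by cores_per_L3 (6) instead of the block size (8) would assign the second CCX's higher IDs to a nonexistent L3 #2, which is exactly what the rounding protects against.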
diff --git a/src/util/u_cpu_detect.h b/src/util/u_cpu_detect.h
index a09aca8fbac..2e47ee69af4 100644
--- a/src/util/u_cpu_detect.h
+++ b/src/util/u_cpu_detect.h
@@ -37,12 +37,14 @@
 
 #include "pipe/p_config.h"
+#include "util/u_thread.h"
 
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 
+typedef uint32_t util_affinity_mask[UTIL_MAX_CPUS / 32];
 
 struct util_cpu_caps {
    int nr_cpus;
@@ -50,7 +52,6 @@ struct util_cpu_caps {
    /* Feature flags */
    int x86_cpu_type;
    unsigned cacheline;
-   unsigned cores_per_L3;
 
    unsigned has_intel:1;
    unsigned has_tsc:1;
@@ -84,6 +85,13 @@ struct util_cpu_caps {
    unsigned has_avx512bw:1;
    unsigned has_avx512vl:1;
    unsigned has_avx512vbmi:1;
+
+   unsigned num_L3_caches;
+   unsigned cores_per_L3;
+
+   uint16_t cpu_to_L3[UTIL_MAX_CPUS];
+   /* Affinity masks for each L3 cache. */
+   util_affinity_mask *L3_affinity_mask;
 };
 
 extern struct util_cpu_caps
diff --git a/src/util/u_thread.h b/src/util/u_thread.h
index 93d8b0f92dc..bdfb05e158c 100644
--- a/src/util/u_thread.h
+++ b/src/util/u_thread.h
@@ -62,6 +62,7 @@
 
 /* For util_set_thread_affinity to size the mask. */
 #define UTIL_MAX_CPUS               1024  /* this should be enough */
+#define UTIL_MAX_L3_CACHES          UTIL_MAX_CPUS
 
 static inline int
 util_get_current_cpu(void)
@@ -198,33 +199,6 @@ util_set_current_thread_affinity(const uint32_t *mask,
 #endif
 }
 
-/**
- * An AMD Zen CPU consists of multiple modules where each module has its own L3
- * cache. Inter-thread communication such as locks and atomics between modules
- * is very expensive. It's desirable to pin a group of closely cooperating
- * threads to one group of cores sharing L3.
- *
- * \param thread thread
- * \param L3_index index of the L3 cache
- * \param cores_per_L3 number of CPU cores shared by one L3
- */
-static inline bool
-util_pin_thread_to_L3(thrd_t thread, unsigned L3_index, unsigned cores_per_L3)
-{
-   unsigned num_mask_bits = DIV_ROUND_UP((L3_index + 1) * cores_per_L3, 32);
-   uint32_t mask[UTIL_MAX_CPUS / 32];
-
-   assert((L3_index + 1) * cores_per_L3 <= UTIL_MAX_CPUS);
-
-   for (unsigned i = 0; i < cores_per_L3; i++) {
-      unsigned core = L3_index * cores_per_L3 + i;
-
-      mask[core / 32] |= 1u << (core % 32);
-   }
-
-   return util_set_thread_affinity(thread, mask, NULL, num_mask_bits);
-}
-
 
 /*
  * Thread statistics.
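The helper removed above hard-coded the assumption that the cores sharing an L3 occupy a contiguous CPU range (core = L3_index * cores_per_L3 + i); the APIC ID comment in get_cpu_topology() explains why that does not hold in general. With the precomputed tables, the intended invariant is easy to state: every CPU whose affinity could be queried appears in exactly one L3 mask, the one cpu_to_L3[] names. A hypothetical self-check along those lines, not part of this patch:

   #include <assert.h>
   #include <stdbool.h>
   #include <stdint.h>

   #include "util/u_cpu_detect.h"

   /* Hypothetical debug check: verify the CPU <-> L3 tables are consistent. */
   static void
   check_L3_tables(void)
   {
      /* NULL when the topology query was skipped or failed. */
      if (!util_cpu_caps.L3_affinity_mask)
         return;

      for (int cpu = 0; cpu < util_cpu_caps.nr_cpus && cpu < UTIL_MAX_CPUS;
           cpu++) {
         uint32_t cpu_bit = 1u << (cpu % 32);

         for (unsigned i = 0; i < util_cpu_caps.num_L3_caches; i++) {
            bool member =
               (util_cpu_caps.L3_affinity_mask[i][cpu / 32] & cpu_bit) != 0;

            /* CPUs that could not be pinned are members of no mask at all. */
            assert(!member || i == util_cpu_caps.cpu_to_L3[cpu]);
         }
      }
   }

Note that u_cpu_detect.h now pulls in u_thread.h (for UTIL_MAX_CPUS), so the check needs no other Mesa headers.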