From 11d2db17c522e5a123e781f001d7f75e9abe2bcd Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Thu, 25 Mar 2021 16:59:50 +1000 Subject: [PATCH] util: rework AMD cpu L3 cache affinity code. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This changes how the L3 cache affinity code works out the affinity masks. It works better with multi-CPU systems and should also be capable of handling big/little type situations if they appear in the future. It now iterates over all CPU cores, gets the core count for each CPU, and works out the L3_ID from the physical CPU ID, and the current core's L3 cache. It then tracks how many L3 caches it has seen and reallocates the affinity masks for each one. Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/4496 Fixes: d8ea5099658 ("util: completely rewrite and do AMD Zen L3 cache pinning correctly") Reviewed-by: Marek Olšák Part-of: --- src/util/u_cpu_detect.c | 83 ++++++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 35 deletions(-) diff --git a/src/util/u_cpu_detect.c b/src/util/u_cpu_detect.c index bedb94f22ec..4a4b06e1bc6 100644 --- a/src/util/u_cpu_detect.c +++ b/src/util/u_cpu_detect.c @@ -446,20 +446,14 @@ get_cpu_topology(void) util_cpu_caps.family < CPU_AMD_LAST) { uint32_t regs[4]; - /* Query the L3 cache count. */ - cpuid_count(0x8000001D, 3, regs); - unsigned cache_level = (regs[0] >> 5) & 0x7; - unsigned cores_per_L3 = ((regs[0] >> 14) & 0xfff) + 1; - - if (cache_level != 3 || cores_per_L3 == util_cpu_caps.nr_cpus) - return; - uint32_t saved_mask[UTIL_MAX_CPUS / 32] = {0}; uint32_t mask[UTIL_MAX_CPUS / 32] = {0}; - uint32_t allowed_mask[UTIL_MAX_CPUS / 32] = {0}; - uint32_t apic_id[UTIL_MAX_CPUS]; bool saved = false; + uint32_t L3_found[UTIL_MAX_CPUS] = {0}; + uint32_t num_L3_caches = 0; + util_affinity_mask *L3_affinity_masks = NULL; + /* Query APIC IDs from each CPU core. 
* * An APIC ID is a logical ID of the CPU with respect to the cache @@ -486,39 +480,58 @@ get_cpu_topology(void) !saved ? saved_mask : NULL, util_cpu_caps.num_cpu_mask_bits)) { saved = true; - allowed_mask[i / 32] |= cpu_bit; /* Query the APIC ID of the current core. */ cpuid(0x00000001, regs); - apic_id[i] = regs[1] >> 24; + unsigned apic_id = regs[1] >> 24; + + /* Query the total core count for the CPU */ + uint32_t core_count = 1; + if (regs[3] & (1 << 28)) + core_count = (regs[1] >> 16) & 0xff; + + core_count = util_next_power_of_two(core_count); + + /* Query the L3 cache count. */ + cpuid_count(0x8000001D, 3, regs); + unsigned cache_level = (regs[0] >> 5) & 0x7; + unsigned cores_per_L3 = ((regs[0] >> 14) & 0xfff) + 1; + + if (cache_level != 3) + continue; + + unsigned local_core_id = apic_id & (core_count - 1); + unsigned phys_id = (apic_id & ~(core_count - 1)) >> util_logbase2(core_count); + unsigned local_l3_cache_index = local_core_id / util_next_power_of_two(cores_per_L3); +#define L3_ID(p, i) (p << 16 | i << 1 | 1); + + unsigned l3_id = L3_ID(phys_id, local_l3_cache_index); + int idx = -1; + for (unsigned c = 0; c < num_L3_caches; c++) { + if (L3_found[c] == l3_id) { + idx = c; + break; + } + } + if (idx == -1) { + idx = num_L3_caches; + L3_found[num_L3_caches++] = l3_id; + L3_affinity_masks = realloc(L3_affinity_masks, sizeof(util_affinity_mask) * num_L3_caches); + if (!L3_affinity_masks) + return; + memset(&L3_affinity_masks[num_L3_caches - 1], 0, sizeof(util_affinity_mask)); + } + util_cpu_caps.cpu_to_L3[i] = idx; + L3_affinity_masks[idx][i / 32] |= cpu_bit; + } mask[i / 32] = 0; } + util_cpu_caps.num_L3_caches = num_L3_caches; + util_cpu_caps.L3_affinity_mask = L3_affinity_masks; + if (saved) { - - /* We succeeded in using at least one CPU. 
*/ - util_cpu_caps.num_L3_caches = util_cpu_caps.nr_cpus / cores_per_L3; - util_cpu_caps.cores_per_L3 = cores_per_L3; - util_cpu_caps.L3_affinity_mask = calloc(sizeof(util_affinity_mask), - util_cpu_caps.num_L3_caches); - - for (unsigned i = 0; i < util_cpu_caps.nr_cpus && i < UTIL_MAX_CPUS; - i++) { - uint32_t cpu_bit = 1u << (i % 32); - - if (allowed_mask[i / 32] & cpu_bit) { - /* Each APIC ID bit represents a topology level, so we need - * to round up to the next power of two. - */ - unsigned L3_index = apic_id[i] / - util_next_power_of_two(cores_per_L3); - - util_cpu_caps.L3_affinity_mask[L3_index][i / 32] |= cpu_bit; - util_cpu_caps.cpu_to_L3[i] = L3_index; - } - } - if (debug_get_option_dump_cpu()) { fprintf(stderr, "CPU <-> L3 cache mapping:\n"); for (unsigned i = 0; i < util_cpu_caps.num_L3_caches; i++) {