diff --git a/src/util/u_cpu_detect.c b/src/util/u_cpu_detect.c
index bedb94f22ec..4a4b06e1bc6 100644
--- a/src/util/u_cpu_detect.c
+++ b/src/util/u_cpu_detect.c
@@ -446,20 +446,14 @@ get_cpu_topology(void)
        util_cpu_caps.family < CPU_AMD_LAST) {
       uint32_t regs[4];
 
-      /* Query the L3 cache count. */
-      cpuid_count(0x8000001D, 3, regs);
-      unsigned cache_level = (regs[0] >> 5) & 0x7;
-      unsigned cores_per_L3 = ((regs[0] >> 14) & 0xfff) + 1;
-
-      if (cache_level != 3 || cores_per_L3 == util_cpu_caps.nr_cpus)
-         return;
-
       uint32_t saved_mask[UTIL_MAX_CPUS / 32] = {0};
       uint32_t mask[UTIL_MAX_CPUS / 32] = {0};
-      uint32_t allowed_mask[UTIL_MAX_CPUS / 32] = {0};
-      uint32_t apic_id[UTIL_MAX_CPUS];
       bool saved = false;
 
+      uint32_t L3_found[UTIL_MAX_CPUS] = {0};
+      uint32_t num_L3_caches = 0;
+      util_affinity_mask *L3_affinity_masks = NULL;
+
       /* Query APIC IDs from each CPU core.
        *
        * An APIC ID is a logical ID of the CPU with respect to the cache
@@ -486,39 +480,58 @@ get_cpu_topology(void)
                                               !saved ? saved_mask : NULL,
                                               util_cpu_caps.num_cpu_mask_bits)) {
             saved = true;
-            allowed_mask[i / 32] |= cpu_bit;
 
             /* Query the APIC ID of the current core. */
             cpuid(0x00000001, regs);
-            apic_id[i] = regs[1] >> 24;
+            unsigned apic_id = regs[1] >> 24;
+
+            /* Query the total core count for the CPU */
+            uint32_t core_count = 1;
+            if (regs[3] & (1 << 28))
+               core_count = (regs[1] >> 16) & 0xff;
+
+            core_count = util_next_power_of_two(core_count);
+
+            /* Query the properties of cache index 3 (expected to be the L3; checked below). */
+            cpuid_count(0x8000001D, 3, regs);
+            unsigned cache_level = (regs[0] >> 5) & 0x7;
+            unsigned cores_per_L3 = ((regs[0] >> 14) & 0xfff) + 1;
+
+            if (cache_level != 3)
+               continue;
+
+            unsigned local_core_id = apic_id & (core_count - 1);
+            unsigned phys_id = (apic_id & ~(core_count - 1)) >> util_logbase2(core_count);
+            unsigned local_l3_cache_index = local_core_id / util_next_power_of_two(cores_per_L3);
+#define L3_ID(p, i) ((p) << 16 | (i) << 1 | 1) /* pack package id and local L3 index into one key; the low bit keeps the key non-zero */
+
+            unsigned l3_id = L3_ID(phys_id, local_l3_cache_index);
+            int idx = -1;
+            for (unsigned c = 0; c < num_L3_caches; c++) {
+               if (L3_found[c] == l3_id) {
+                  idx = c;
+                  break;
+               }
+            }
+            if (idx == -1) {
+               idx = num_L3_caches;
+               L3_found[num_L3_caches++] = l3_id;
+               L3_affinity_masks = realloc(L3_affinity_masks, sizeof(util_affinity_mask) * num_L3_caches);
+               if (!L3_affinity_masks)
+                  return;
+               memset(&L3_affinity_masks[num_L3_caches - 1], 0, sizeof(util_affinity_mask));
+            }
+            util_cpu_caps.cpu_to_L3[i] = idx;
+            L3_affinity_masks[idx][i / 32] |= cpu_bit;
+
          }
          mask[i / 32] = 0;
       }
 
+      util_cpu_caps.num_L3_caches = num_L3_caches;
+      util_cpu_caps.L3_affinity_mask = L3_affinity_masks;
+
       if (saved) {
-
-         /* We succeeded in using at least one CPU. */
-         util_cpu_caps.num_L3_caches = util_cpu_caps.nr_cpus / cores_per_L3;
-         util_cpu_caps.cores_per_L3 = cores_per_L3;
-         util_cpu_caps.L3_affinity_mask = calloc(sizeof(util_affinity_mask),
-                                                 util_cpu_caps.num_L3_caches);
-
-         for (unsigned i = 0; i < util_cpu_caps.nr_cpus && i < UTIL_MAX_CPUS;
-              i++) {
-            uint32_t cpu_bit = 1u << (i % 32);
-
-            if (allowed_mask[i / 32] & cpu_bit) {
-               /* Each APIC ID bit represents a topology level, so we need
-                * to round up to the next power of two.
-                */
-               unsigned L3_index = apic_id[i] /
-                                   util_next_power_of_two(cores_per_L3);
-
-               util_cpu_caps.L3_affinity_mask[L3_index][i / 32] |= cpu_bit;
-               util_cpu_caps.cpu_to_L3[i] = L3_index;
-            }
-         }
-
          if (debug_get_option_dump_cpu()) {
             fprintf(stderr, "CPU <-> L3 cache mapping:\n");
             for (unsigned i = 0; i < util_cpu_caps.num_L3_caches; i++) {