mesa/src/util/u_cpu_detect.c
Dave Airlie 11d2db17c5 util: rework AMD cpu L3 cache affinity code.
This changes how the L3 cache affinity code works out the affinity
masks. It works better with multi-CPU systems and should also be
capable of handling big/little type situations if they appear in
the future.

It now iterates over all CPU cores, gets the core count for each
CPU, and works out the L3_ID from the physical CPU ID and the
current core's L3 cache index. It then tracks how many L3 caches
it has seen and reallocates the affinity masks for each new one.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/4496
Fixes: d8ea509965 ("util: completely rewrite and do AMD Zen L3 cache pinning correctly")
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9782>
2021-03-29 08:31:09 +00:00


/**************************************************************************
*
* Copyright 2008 Dennis Smit
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* on the rights to use, copy, modify, merge, publish, distribute, sub
* license, and/or sell copies of the Software, and to permit persons to whom
* the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
/**
* @file
* CPU feature detection.
*
* @author Dennis Smit
* @author Based on the work of Eric Anholt <anholt@FreeBSD.org>
*/
#include "pipe/p_config.h"
#include "pipe/p_compiler.h"
#include "util/u_debug.h"
#include "u_cpu_detect.h"
#include "u_math.h"
#include "c11/threads.h"
#include <stdio.h>
#include <inttypes.h>
#if defined(PIPE_ARCH_PPC)
#if defined(PIPE_OS_APPLE)
#include <sys/sysctl.h>
#else
#include <signal.h>
#include <setjmp.h>
#endif
#endif
#if defined(PIPE_OS_BSD)
#include <sys/param.h>
#include <sys/sysctl.h>
#include <machine/cpu.h>
#endif
#if defined(PIPE_OS_FREEBSD)
#if __has_include(<sys/auxv.h>)
#include <sys/auxv.h>
#define HAVE_ELF_AUX_INFO
#endif
#endif
#if defined(PIPE_OS_LINUX)
#include <signal.h>
#include <fcntl.h>
#include <elf.h>
#endif
#ifdef PIPE_OS_UNIX
#include <unistd.h>
#endif
#if defined(HAS_ANDROID_CPUFEATURES)
#include <cpu-features.h>
#endif
#if defined(PIPE_OS_WINDOWS)
#include <windows.h>
#if defined(PIPE_CC_MSVC)
#include <intrin.h>
#endif
#endif
DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", false)
struct util_cpu_caps_t util_cpu_caps;
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
static int has_cpuid(void);
#endif
#if defined(PIPE_ARCH_PPC) && !defined(PIPE_OS_APPLE) && !defined(PIPE_OS_BSD) && !defined(PIPE_OS_LINUX)
static jmp_buf __lv_powerpc_jmpbuf;
static volatile sig_atomic_t __lv_powerpc_canjump = 0;
static void
sigill_handler(int sig)
{
if (!__lv_powerpc_canjump) {
signal (sig, SIG_DFL);
raise (sig);
}
__lv_powerpc_canjump = 0;
longjmp(__lv_powerpc_jmpbuf, 1);
}
#endif
#if defined(PIPE_ARCH_PPC)
static void
check_os_altivec_support(void)
{
#if defined(__ALTIVEC__)
util_cpu_caps.has_altivec = 1;
#endif
#if defined(__VSX__)
util_cpu_caps.has_vsx = 1;
#endif
#if defined(__ALTIVEC__) && defined(__VSX__)
/* Do nothing */
#elif defined(PIPE_OS_APPLE) || defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD)
#ifdef HW_VECTORUNIT
int sels[2] = {CTL_HW, HW_VECTORUNIT};
#else
int sels[2] = {CTL_MACHDEP, CPU_ALTIVEC};
#endif
int has_vu = 0;
size_t len = sizeof(has_vu);
int err;
err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
if (err == 0) {
if (has_vu != 0) {
util_cpu_caps.has_altivec = 1;
}
}
#elif defined(PIPE_OS_FREEBSD) /* !PIPE_OS_APPLE && !PIPE_OS_NETBSD && !PIPE_OS_OPENBSD */
unsigned long hwcap = 0;
#ifdef HAVE_ELF_AUX_INFO
elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
#else
size_t len = sizeof(hwcap);
sysctlbyname("hw.cpu_features", &hwcap, &len, NULL, 0);
#endif
if (hwcap & PPC_FEATURE_HAS_ALTIVEC)
util_cpu_caps.has_altivec = 1;
if (hwcap & PPC_FEATURE_HAS_VSX)
util_cpu_caps.has_vsx = 1;
#elif defined(PIPE_OS_LINUX) /* !PIPE_OS_FREEBSD */
#if defined(PIPE_ARCH_PPC_64)
Elf64_auxv_t aux;
#else
Elf32_auxv_t aux;
#endif
int fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);
if (fd >= 0) {
while (read(fd, &aux, sizeof(aux)) == sizeof(aux)) {
if (aux.a_type == AT_HWCAP) {
char *env_vsx = getenv("GALLIVM_VSX");
uint64_t hwcap = aux.a_un.a_val;
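/* Bit 28 of AT_HWCAP is PPC_FEATURE_HAS_ALTIVEC and bit 7 is
* PPC_FEATURE_HAS_VSX.
*/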
util_cpu_caps.has_altivec = (hwcap >> 28) & 1;
if (!env_vsx || env_vsx[0] != '0') {
util_cpu_caps.has_vsx = (hwcap >> 7) & 1;
}
break;
}
}
close(fd);
}
#else /* !PIPE_OS_APPLE && !PIPE_OS_BSD && !PIPE_OS_LINUX */
/* not on Apple/Darwin, BSD, or Linux; do it the brute-force way */
/* this is borrowed from the libmpeg2 library */
signal(SIGILL, sigill_handler);
if (setjmp(__lv_powerpc_jmpbuf)) {
signal(SIGILL, SIG_DFL);
} else {
boolean enable_altivec = TRUE; /* Default: enable if available, and if not overridden */
boolean enable_vsx = TRUE;
#ifdef DEBUG
/* Disabling Altivec code generation is not the same as disabling VSX code generation,
* which can be done simply by passing -mattr=-vsx to the LLVM compiler; cf.
* lp_build_create_jit_compiler_for_module().
* If you want to disable Altivec code generation, the best place to do it is here.
*/
char *env_control = getenv("GALLIVM_ALTIVEC"); /* 1=enable (default); 0=disable */
if (env_control && env_control[0] == '0') {
enable_altivec = FALSE;
}
#endif
/* VSX instructions can be explicitly enabled/disabled via GALLIVM_VSX=1 or 0 */
char *env_vsx = getenv("GALLIVM_VSX");
if (env_vsx && env_vsx[0] == '0') {
enable_vsx = FALSE;
}
if (enable_altivec) {
__lv_powerpc_canjump = 1;
__asm __volatile
("mtspr 256, %0\n\t"
"vand %%v0, %%v0, %%v0"
:
: "r" (-1));
util_cpu_caps.has_altivec = 1;
if (enable_vsx) {
__asm __volatile("xxland %vs0, %vs0, %vs0");
util_cpu_caps.has_vsx = 1;
}
signal(SIGILL, SIG_DFL);
} else {
util_cpu_caps.has_altivec = 0;
}
}
#endif /* !PIPE_OS_APPLE && !PIPE_OS_BSD && !PIPE_OS_LINUX */
}
#endif /* PIPE_ARCH_PPC */
#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64)
static int has_cpuid(void)
{
#if defined(PIPE_ARCH_X86)
#if defined(PIPE_CC_GCC)
int a, c;
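/* Toggle bit 21 (the ID flag) of EFLAGS; if the change sticks, the
* CPUID instruction is supported.
*/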
__asm __volatile
("pushf\n"
"popl %0\n"
"movl %0, %1\n"
"xorl $0x200000, %0\n"
"push %0\n"
"popf\n"
"pushf\n"
"popl %0\n"
: "=a" (a), "=c" (c)
:
: "cc");
return a != c;
#else
/* FIXME */
return 1;
#endif
#elif defined(PIPE_ARCH_X86_64)
return 1;
#else
return 0;
#endif
}
/**
* @sa cpuid.h included in gcc-4.3 onwards.
* @sa http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
*/
static inline void
cpuid(uint32_t ax, uint32_t *p)
{
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86)
__asm __volatile (
"xchgl %%ebx, %1\n\t"
"cpuid\n\t"
"xchgl %%ebx, %1"
: "=a" (p[0]),
"=S" (p[1]),
"=c" (p[2]),
"=d" (p[3])
: "0" (ax)
);
#elif defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86_64)
__asm __volatile (
"cpuid\n\t"
: "=a" (p[0]),
"=b" (p[1]),
"=c" (p[2]),
"=d" (p[3])
: "0" (ax)
);
#elif defined(PIPE_CC_MSVC)
__cpuid(p, ax);
#else
p[0] = 0;
p[1] = 0;
p[2] = 0;
p[3] = 0;
#endif
}
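/* Example usage (illustrative): leaf 0 returns the vendor string in
* EBX, EDX, ECX:
*
*    uint32_t regs[4];
*    cpuid(0x00000000, regs);
*    // "GenuineIntel" => regs[1] == 0x756e6547 ("Genu"),
*    //                   regs[3] == 0x49656e69 ("ineI"),
*    //                   regs[2] == 0x6c65746e ("ntel")
*/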
/**
* @sa cpuid.h included in gcc-4.4 onwards.
* @sa http://msdn.microsoft.com/en-us/library/hskdteyh%28v=vs.90%29.aspx
*/
static inline void
cpuid_count(uint32_t ax, uint32_t cx, uint32_t *p)
{
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86)
__asm __volatile (
"xchgl %%ebx, %1\n\t"
"cpuid\n\t"
"xchgl %%ebx, %1"
: "=a" (p[0]),
"=S" (p[1]),
"=c" (p[2]),
"=d" (p[3])
: "0" (ax), "2" (cx)
);
#elif defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86_64)
__asm __volatile (
"cpuid\n\t"
: "=a" (p[0]),
"=b" (p[1]),
"=c" (p[2]),
"=d" (p[3])
: "0" (ax), "2" (cx)
);
#elif defined(PIPE_CC_MSVC)
__cpuidex(p, ax, cx);
#else
p[0] = 0;
p[1] = 0;
p[2] = 0;
p[3] = 0;
#endif
}
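/* Example usage (illustrative): get_cpu_topology() below queries the
* properties of cache index 3 this way:
*
*    uint32_t regs[4];
*    cpuid_count(0x8000001D, 3, regs);
*    unsigned cache_level = (regs[0] >> 5) & 0x7;
*/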
static inline uint64_t xgetbv(void)
{
#if defined(PIPE_CC_GCC)
uint32_t eax, edx;
__asm __volatile (
".byte 0x0f, 0x01, 0xd0" // xgetbv isn't supported on gcc < 4.4
: "=a"(eax),
"=d"(edx)
: "c"(0)
);
return ((uint64_t)edx << 32) | eax;
#elif defined(PIPE_CC_MSVC) && defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK)
return _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
#else
return 0;
#endif
}
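/* XCR0 bit 1 covers XMM state and bit 2 covers YMM state, so the
* (xgetbv() & 6) == 6 checks below verify that the OS saves and
* restores both, a prerequisite for safely using AVX.
*/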
#if defined(PIPE_ARCH_X86)
PIPE_ALIGN_STACK static inline boolean sse2_has_daz(void)
{
struct {
uint32_t pad1[7];
uint32_t mxcsr_mask;
uint32_t pad2[128-8];
} PIPE_ALIGN_VAR(16) fxarea;
fxarea.mxcsr_mask = 0;
#if defined(PIPE_CC_GCC)
__asm __volatile ("fxsave %0" : "+m" (fxarea));
#elif defined(PIPE_CC_MSVC) || defined(PIPE_CC_ICL)
_fxsave(&fxarea);
#else
fxarea.mxcsr_mask = 0;
#endif
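/* Bit 6 of MXCSR is the DAZ flag; if it is set in the saved MXCSR_MASK,
* the CPU supports denormals-are-zero mode.
*/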
return !!(fxarea.mxcsr_mask & (1 << 6));
}
#endif
#endif /* X86 or X86_64 */
#if defined(PIPE_ARCH_ARM)
static void
check_os_arm_support(void)
{
/*
* On Android, the cpufeatures library is the preferred way of checking
* CPU capabilities. However, it is not available for standalone Mesa
* builds, i.e. when the Android build system (Android.mk-based) is not
* used. Because of this we cannot use PIPE_OS_ANDROID here, but rather
* have a separate macro that only gets enabled from the respective
* Android.mk.
*/
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
util_cpu_caps.has_neon = 1;
#elif defined(PIPE_OS_FREEBSD) && defined(HAVE_ELF_AUX_INFO)
unsigned long hwcap = 0;
elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
if (hwcap & HWCAP_NEON)
util_cpu_caps.has_neon = 1;
#elif defined(HAS_ANDROID_CPUFEATURES)
AndroidCpuFamily cpu_family = android_getCpuFamily();
uint64_t cpu_features = android_getCpuFeatures();
if (cpu_family == ANDROID_CPU_FAMILY_ARM) {
if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON)
util_cpu_caps.has_neon = 1;
}
#elif defined(PIPE_OS_LINUX)
Elf32_auxv_t aux;
int fd;
fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);
if (fd >= 0) {
while (read(fd, &aux, sizeof(Elf32_auxv_t)) == sizeof(Elf32_auxv_t)) {
if (aux.a_type == AT_HWCAP) {
uint32_t hwcap = aux.a_un.a_val;
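/* Bit 12 of AT_HWCAP is HWCAP_NEON on 32-bit ARM. */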
util_cpu_caps.has_neon = (hwcap >> 12) & 1;
break;
}
}
close (fd);
}
#endif /* PIPE_OS_LINUX */
}
#elif defined(PIPE_ARCH_AARCH64)
static void
check_os_arm_support(void)
{
util_cpu_caps.has_neon = true;
}
#endif /* PIPE_ARCH_ARM || PIPE_ARCH_AARCH64 */
static void
get_cpu_topology(void)
{
/* Default. This is OK if L3 is not present or there is only one. */
util_cpu_caps.cores_per_L3 = util_cpu_caps.nr_cpus;
util_cpu_caps.num_L3_caches = 1;
memset(util_cpu_caps.cpu_to_L3, 0xff, sizeof(util_cpu_caps.cpu_to_L3));
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
/* AMD Zen */
if (util_cpu_caps.family >= CPU_AMD_ZEN1_ZEN2 &&
util_cpu_caps.family < CPU_AMD_LAST) {
uint32_t regs[4];
uint32_t saved_mask[UTIL_MAX_CPUS / 32] = {0};
uint32_t mask[UTIL_MAX_CPUS / 32] = {0};
bool saved = false;
uint32_t L3_found[UTIL_MAX_CPUS] = {0};
uint32_t num_L3_caches = 0;
util_affinity_mask *L3_affinity_masks = NULL;
/* Query APIC IDs from each CPU core.
*
* An APIC ID is a logical ID of the CPU with respect to the cache
* hierarchy, meaning that consecutive APIC IDs are neighbours in
* the hierarchy, e.g. sharing the same cache.
*
* For example, CPU 0 can have APIC ID 0 and CPU 12 can have APIC ID 1,
* which means that CPUs 0 and 12 are next to each other
* (e.g. they are 2 threads belonging to 1 SMT2 core).
*
* We need to find out which CPUs share the same L3 cache, and they can
* be scattered all over the place.
*
* Querying the APIC ID can only be done by pinning the current thread
* to each core. The original affinity mask is saved.
*/
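/* Worked example with hypothetical numbers: suppose leaf 1 reports 16
* logical processors (so core_count rounds to 16) and leaf 0x8000001D
* reports 8 logical processors per L3. For apic_id 0x23:
* local_core_id = 0x23 & 15 = 3, phys_id = 0x23 >> 4 = 2,
* local_l3_cache_index = 3 / 8 = 0, so
* l3_id = 2 << 16 | 0 << 1 | 1 = 0x20001.
*/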
for (unsigned i = 0; i < util_cpu_caps.nr_cpus && i < UTIL_MAX_CPUS;
i++) {
uint32_t cpu_bit = 1u << (i % 32);
mask[i / 32] = cpu_bit;
if (util_set_current_thread_affinity(mask,
!saved ? saved_mask : NULL,
util_cpu_caps.num_cpu_mask_bits)) {
saved = true;
/* Query the APIC ID of the current core. */
cpuid(0x00000001, regs);
unsigned apic_id = regs[1] >> 24;
/* Query the total core count for the CPU */
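/* Leaf 1: if EDX bit 28 (HTT) is set, EBX[23:16] holds the logical
* processor count; otherwise assume a single core.
*/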
uint32_t core_count = 1;
if (regs[3] & (1 << 28))
core_count = (regs[1] >> 16) & 0xff;
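/* APIC IDs are handed out in power-of-two blocks, so round up. */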
core_count = util_next_power_of_two(core_count);
/* Query the properties of cache index 3. */
cpuid_count(0x8000001D, 3, regs);
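/* Leaf 0x8000001D: EAX[7:5] is the cache level and EAX[25:14] is the
* number of logical processors sharing the cache, minus one.
*/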
unsigned cache_level = (regs[0] >> 5) & 0x7;
unsigned cores_per_L3 = ((regs[0] >> 14) & 0xfff) + 1;
if (cache_level != 3)
continue;
unsigned local_core_id = apic_id & (core_count - 1);
unsigned phys_id = (apic_id & ~(core_count - 1)) >> util_logbase2(core_count);
unsigned local_l3_cache_index = local_core_id / util_next_power_of_two(cores_per_L3);
#define L3_ID(p, i) ((p) << 16 | (i) << 1 | 1)
unsigned l3_id = L3_ID(phys_id, local_l3_cache_index);
int idx = -1;
for (unsigned c = 0; c < num_L3_caches; c++) {
if (L3_found[c] == l3_id) {
idx = c;
break;
}
}
if (idx == -1) {
idx = num_L3_caches;
L3_found[num_L3_caches++] = l3_id;
L3_affinity_masks = realloc(L3_affinity_masks, sizeof(util_affinity_mask) * num_L3_caches);
if (!L3_affinity_masks)
return;
memset(&L3_affinity_masks[num_L3_caches - 1], 0, sizeof(util_affinity_mask));
}
util_cpu_caps.cpu_to_L3[i] = idx;
L3_affinity_masks[idx][i / 32] |= cpu_bit;
}
mask[i / 32] = 0;
}
util_cpu_caps.num_L3_caches = num_L3_caches;
util_cpu_caps.L3_affinity_mask = L3_affinity_masks;
if (saved) {
if (debug_get_option_dump_cpu()) {
fprintf(stderr, "CPU <-> L3 cache mapping:\n");
for (unsigned i = 0; i < util_cpu_caps.num_L3_caches; i++) {
fprintf(stderr, " - L3 %u mask = ", i);
for (int j = util_cpu_caps.nr_cpus - 1; j >= 0; j -= 32)
fprintf(stderr, "%08x ", util_cpu_caps.L3_affinity_mask[i][j / 32]);
fprintf(stderr, "\n");
}
}
/* Restore the original affinity mask. */
util_set_current_thread_affinity(saved_mask, NULL,
util_cpu_caps.num_cpu_mask_bits);
} else {
if (debug_get_option_dump_cpu())
fprintf(stderr, "Cannot set thread affinity for any thread.\n");
}
}
#endif
}
static void
util_cpu_detect_once(void)
{
memset(&util_cpu_caps, 0, sizeof util_cpu_caps);
/* Count the number of CPUs in system */
#if defined(PIPE_OS_WINDOWS)
{
SYSTEM_INFO system_info;
GetSystemInfo(&system_info);
util_cpu_caps.nr_cpus = MAX2(1, system_info.dwNumberOfProcessors);
}
#elif defined(PIPE_OS_UNIX) && defined(_SC_NPROCESSORS_ONLN)
util_cpu_caps.nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
if (util_cpu_caps.nr_cpus == ~0)
util_cpu_caps.nr_cpus = 1;
#elif defined(PIPE_OS_BSD)
{
int mib[2], ncpu;
size_t len;
mib[0] = CTL_HW;
mib[1] = HW_NCPU;
len = sizeof (ncpu);
sysctl(mib, 2, &ncpu, &len, NULL, 0);
util_cpu_caps.nr_cpus = ncpu;
}
#else
util_cpu_caps.nr_cpus = 1;
#endif
util_cpu_caps.num_cpu_mask_bits = align(util_cpu_caps.nr_cpus, 32);
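/* Affinity masks are arrays of uint32_t, so round the bit count up to a
* multiple of 32.
*/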
/* Make the fallback cacheline size nonzero so that it can be
* safely passed to align().
*/
util_cpu_caps.cacheline = sizeof(void *);
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
if (has_cpuid()) {
uint32_t regs[4];
uint32_t regs2[4];
util_cpu_caps.cacheline = 32;
/* Get max cpuid level */
cpuid(0x00000000, regs);
if (regs[0] >= 0x00000001) {
unsigned int cacheline;
cpuid (0x00000001, regs2);
util_cpu_caps.x86_cpu_type = (regs2[0] >> 8) & 0xf;
/* Add "extended family". */
if (util_cpu_caps.x86_cpu_type == 0xf)
util_cpu_caps.x86_cpu_type += ((regs2[0] >> 20) & 0xff);
switch (util_cpu_caps.x86_cpu_type) {
case 0x17:
util_cpu_caps.family = CPU_AMD_ZEN1_ZEN2;
break;
case 0x18:
util_cpu_caps.family = CPU_AMD_ZEN_HYGON;
break;
case 0x19:
util_cpu_caps.family = CPU_AMD_ZEN3;
break;
}
/* general feature flags */
util_cpu_caps.has_tsc = (regs2[3] >> 4) & 1; /* 0x0000010 */
util_cpu_caps.has_mmx = (regs2[3] >> 23) & 1; /* 0x0800000 */
util_cpu_caps.has_sse = (regs2[3] >> 25) & 1; /* 0x2000000 */
util_cpu_caps.has_sse2 = (regs2[3] >> 26) & 1; /* 0x4000000 */
util_cpu_caps.has_sse3 = (regs2[2] >> 0) & 1; /* 0x0000001 */
util_cpu_caps.has_ssse3 = (regs2[2] >> 9) & 1; /* 0x0000200 */
util_cpu_caps.has_sse4_1 = (regs2[2] >> 19) & 1;
util_cpu_caps.has_sse4_2 = (regs2[2] >> 20) & 1;
util_cpu_caps.has_popcnt = (regs2[2] >> 23) & 1;
util_cpu_caps.has_avx = ((regs2[2] >> 28) & 1) && // AVX
((regs2[2] >> 27) & 1) && // OSXSAVE
((xgetbv() & 6) == 6); // XMM & YMM
util_cpu_caps.has_f16c = ((regs2[2] >> 29) & 1) && util_cpu_caps.has_avx;
util_cpu_caps.has_fma = ((regs2[2] >> 12) & 1) && util_cpu_caps.has_avx;
util_cpu_caps.has_mmx2 = util_cpu_caps.has_sse; /* SSE CPUs support mmxext too */
#if defined(PIPE_ARCH_X86_64)
util_cpu_caps.has_daz = 1;
#else
util_cpu_caps.has_daz = util_cpu_caps.has_sse3 ||
(util_cpu_caps.has_sse2 && sse2_has_daz());
#endif
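/* EBX[15:8] of leaf 1 is the CLFLUSH line size in 8-byte quanta. */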
cacheline = ((regs2[1] >> 8) & 0xFF) * 8;
if (cacheline > 0)
util_cpu_caps.cacheline = cacheline;
}
if (util_cpu_caps.has_avx && regs[0] >= 0x00000007) {
uint32_t regs7[4];
cpuid_count(0x00000007, 0x00000000, regs7);
util_cpu_caps.has_avx2 = (regs7[1] >> 5) & 1;
}
// check for AVX-512
if (((regs2[2] >> 27) & 1) && // OSXSAVE
(xgetbv() & (0x7 << 5)) && // XCR0 bits 5..7: opmask and ZMM state enabled by OS
((xgetbv() & 6) == 6)) { // XMM/YMM enabled by OS
uint32_t regs3[4];
cpuid_count(0x00000007, 0x00000000, regs3);
util_cpu_caps.has_avx512f = (regs3[1] >> 16) & 1;
util_cpu_caps.has_avx512dq = (regs3[1] >> 17) & 1;
util_cpu_caps.has_avx512ifma = (regs3[1] >> 21) & 1;
util_cpu_caps.has_avx512pf = (regs3[1] >> 26) & 1;
util_cpu_caps.has_avx512er = (regs3[1] >> 27) & 1;
util_cpu_caps.has_avx512cd = (regs3[1] >> 28) & 1;
util_cpu_caps.has_avx512bw = (regs3[1] >> 30) & 1;
util_cpu_caps.has_avx512vl = (regs3[1] >> 31) & 1;
util_cpu_caps.has_avx512vbmi = (regs3[2] >> 1) & 1;
}
if (regs[1] == 0x756e6547 && regs[2] == 0x6c65746e && regs[3] == 0x49656e69) {
/* GenuineIntel */
util_cpu_caps.has_intel = 1;
}
cpuid(0x80000000, regs);
if (regs[0] >= 0x80000001) {
cpuid(0x80000001, regs2);
util_cpu_caps.has_mmx |= (regs2[3] >> 23) & 1;
util_cpu_caps.has_mmx2 |= (regs2[3] >> 22) & 1;
util_cpu_caps.has_3dnow = (regs2[3] >> 31) & 1;
util_cpu_caps.has_3dnow_ext = (regs2[3] >> 30) & 1;
util_cpu_caps.has_xop = util_cpu_caps.has_avx &&
((regs2[2] >> 11) & 1);
}
if (regs[0] >= 0x80000006) {
/* should we really do this if the clflush size above worked? */
unsigned int cacheline;
cpuid(0x80000006, regs2);
cacheline = regs2[2] & 0xFF;
if (cacheline > 0)
util_cpu_caps.cacheline = cacheline;
}
if (!util_cpu_caps.has_sse) {
util_cpu_caps.has_sse2 = 0;
util_cpu_caps.has_sse3 = 0;
util_cpu_caps.has_ssse3 = 0;
util_cpu_caps.has_sse4_1 = 0;
}
}
#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */
#if defined(PIPE_ARCH_ARM) || defined(PIPE_ARCH_AARCH64)
check_os_arm_support();
#endif
#if defined(PIPE_ARCH_PPC)
check_os_altivec_support();
#endif /* PIPE_ARCH_PPC */
get_cpu_topology();
if (debug_get_option_dump_cpu()) {
debug_printf("util_cpu_caps.nr_cpus = %u\n", util_cpu_caps.nr_cpus);
debug_printf("util_cpu_caps.x86_cpu_type = %u\n", util_cpu_caps.x86_cpu_type);
debug_printf("util_cpu_caps.cacheline = %u\n", util_cpu_caps.cacheline);
debug_printf("util_cpu_caps.has_tsc = %u\n", util_cpu_caps.has_tsc);
debug_printf("util_cpu_caps.has_mmx = %u\n", util_cpu_caps.has_mmx);
debug_printf("util_cpu_caps.has_mmx2 = %u\n", util_cpu_caps.has_mmx2);
debug_printf("util_cpu_caps.has_sse = %u\n", util_cpu_caps.has_sse);
debug_printf("util_cpu_caps.has_sse2 = %u\n", util_cpu_caps.has_sse2);
debug_printf("util_cpu_caps.has_sse3 = %u\n", util_cpu_caps.has_sse3);
debug_printf("util_cpu_caps.has_ssse3 = %u\n", util_cpu_caps.has_ssse3);
debug_printf("util_cpu_caps.has_sse4_1 = %u\n", util_cpu_caps.has_sse4_1);
debug_printf("util_cpu_caps.has_sse4_2 = %u\n", util_cpu_caps.has_sse4_2);
debug_printf("util_cpu_caps.has_avx = %u\n", util_cpu_caps.has_avx);
debug_printf("util_cpu_caps.has_avx2 = %u\n", util_cpu_caps.has_avx2);
debug_printf("util_cpu_caps.has_f16c = %u\n", util_cpu_caps.has_f16c);
debug_printf("util_cpu_caps.has_popcnt = %u\n", util_cpu_caps.has_popcnt);
debug_printf("util_cpu_caps.has_3dnow = %u\n", util_cpu_caps.has_3dnow);
debug_printf("util_cpu_caps.has_3dnow_ext = %u\n", util_cpu_caps.has_3dnow_ext);
debug_printf("util_cpu_caps.has_xop = %u\n", util_cpu_caps.has_xop);
debug_printf("util_cpu_caps.has_altivec = %u\n", util_cpu_caps.has_altivec);
debug_printf("util_cpu_caps.has_vsx = %u\n", util_cpu_caps.has_vsx);
debug_printf("util_cpu_caps.has_neon = %u\n", util_cpu_caps.has_neon);
debug_printf("util_cpu_caps.has_daz = %u\n", util_cpu_caps.has_daz);
debug_printf("util_cpu_caps.has_avx512f = %u\n", util_cpu_caps.has_avx512f);
debug_printf("util_cpu_caps.has_avx512dq = %u\n", util_cpu_caps.has_avx512dq);
debug_printf("util_cpu_caps.has_avx512ifma = %u\n", util_cpu_caps.has_avx512ifma);
debug_printf("util_cpu_caps.has_avx512pf = %u\n", util_cpu_caps.has_avx512pf);
debug_printf("util_cpu_caps.has_avx512er = %u\n", util_cpu_caps.has_avx512er);
debug_printf("util_cpu_caps.has_avx512cd = %u\n", util_cpu_caps.has_avx512cd);
debug_printf("util_cpu_caps.has_avx512bw = %u\n", util_cpu_caps.has_avx512bw);
debug_printf("util_cpu_caps.has_avx512vl = %u\n", util_cpu_caps.has_avx512vl);
debug_printf("util_cpu_caps.has_avx512vbmi = %u\n", util_cpu_caps.has_avx512vbmi);
}
}
static once_flag cpu_once_flag = ONCE_FLAG_INIT;
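/* Illustrative usage (hypothetical caller): run detection once, then
* read the global caps:
*
*    util_cpu_detect();
*    if (util_cpu_caps.has_avx2) {
*       // take the AVX2 path
*    }
*/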
void
util_cpu_detect(void)
{
call_once(&cpu_once_flag, util_cpu_detect_once);
}