gallivm,llvmpipe: Use 4-wide vectors on AMD Bulldozer.

8-wide vectors are slower. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2026-05-05 00:58:05 +02:00 · 2012-08-31 17:01:50 +01:00 · 2012-08-31 17:01:50 +01:00 · 7eb5040197
commit 7eb5040197
parent 9a31e090ef
3 changed files with 15 additions and 1 deletions
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@ -434,8 +434,16 @@ lp_build_init(void)

   util_cpu_detect();

+   /* AMD Bulldozer's AVX throughput is the same as its SSE2 throughput, and
+    * because 8-wide vectors need more floating-point ops than 4-wide ones (due
+    * to padding), it is actually more efficient to use 4-wide vectors there.
+    *
+    * See also:
+    * - http://www.anandtech.com/show/4955/the-bulldozer-review-amd-fx8150-tested/2
+    */
   if (HAVE_AVX &&
-       util_cpu_caps.has_avx) {
+       util_cpu_caps.has_avx &&
+       util_cpu_caps.has_intel) {
      lp_native_vector_width = 256;
   } else {
      /* Leave it at 128, even when no SIMD extensions are available.
--- a/src/gallium/auxiliary/util/u_cpu_detect.c
+++ b/src/gallium/auxiliary/util/u_cpu_detect.c
@ -286,6 +286,11 @@ util_cpu_detect(void)
            util_cpu_caps.cacheline = cacheline;
      }

+      if (regs[1] == 0x756e6547 && regs[2] == 0x6c65746e && regs[3] == 0x49656e69) {
+         /* GenuineIntel */
+         util_cpu_caps.has_intel = 1;
+      }
+
      cpuid(0x80000000, regs);

      if (regs[0] >= 0x80000001) {
--- a/src/gallium/auxiliary/util/u_cpu_detect.h
+++ b/src/gallium/auxiliary/util/u_cpu_detect.h
@ -52,6 +52,7 @@ struct util_cpu_caps {
   int x86_cpu_type;
   unsigned cacheline;

+   unsigned has_intel:1;
   unsigned has_tsc:1;
   unsigned has_mmx:1;
   unsigned has_mmx2:1;