glthread: pin driver threads to the same L3 as the main thread regularly

This improves performance on my Ryzen 3900X, which has 4 L3 caches and 6 threads per L3. The best improvement is 33% if the kernel CPU scheduler doesn't move the main thread too often.

v2: pin only once in 128 batch flushes

Acked-by: Jose Fonseca <jfonseca@vmware.com>
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7054>
2026-05-05 05:18:08 +02:00 · 2020-10-07 07:41:41 -04:00 · 2020-10-07 07:41:41 -04:00 · 5957b0c162
commit 5957b0c162
parent d8ea509965
4 changed files with 38 additions and 0 deletions
--- a/src/mesa/main/dd.h
+++ b/src/mesa/main/dd.h
@ -1326,6 +1326,8 @@ struct dd_function_table {
   void (*SetMaxShaderCompilerThreads)(struct gl_context *ctx, unsigned count);
   bool (*GetShaderProgramCompletionStatus)(struct gl_context *ctx,
                                            struct gl_shader_program *shprog);
+
+   void (*PinDriverToL3Cache)(struct gl_context *ctx, unsigned L3_cache);
 };


--- a/src/mesa/main/glthread.c
+++ b/src/mesa/main/glthread.c
@ -38,6 +38,7 @@
 #include "main/hash.h"
 #include "util/u_atomic.h"
 #include "util/u_thread.h"
+#include "util/u_cpu_detect.h"


 static void
@ -195,6 +196,25 @@ _mesa_glthread_flush_batch(struct gl_context *ctx)
   if (!next->used)
      return;

+   /* Pin threads regularly to the same Zen CCX that the main thread is
+    * running on. The main thread can move between CCXs.
+    */
+   if (util_cpu_caps.nr_cpus != util_cpu_caps.cores_per_L3 &&
+       /* driver support */
+       ctx->Driver.PinDriverToL3Cache &&
+       ++glthread->pin_thread_counter % 128 == 0) {
+      int cpu = util_get_current_cpu();
+
+      if (cpu >= 0) {
+         unsigned L3_cache = util_cpu_caps.cpu_to_L3[cpu];
+
+         util_set_thread_affinity(glthread->queue.threads[0],
+                                  util_cpu_caps.L3_affinity_mask[L3_cache],
+                                  NULL, UTIL_MAX_CPUS);
+         ctx->Driver.PinDriverToL3Cache(ctx, L3_cache);
+      }
+   }
+
   /* Debug: execute the batch immediately from this thread.
    *
    * Note that glthread_unmarshal_batch() changes the dispatch table so we'll
--- a/src/mesa/main/glthread.h
+++ b/src/mesa/main/glthread.h
@ -134,6 +134,9 @@ struct glthread_state
   /** Whether GLThread is inside a display list generation. */
   bool inside_dlist;

+   /** For L3 cache pinning. */
+   unsigned pin_thread_counter;
+
   /** The ring of batches in memory. */
   struct glthread_batch batches[MARSHAL_MAX_BATCHES];

--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@ -908,6 +908,16 @@ st_get_driver_uuid(struct gl_context *ctx, char *uuid)
 }


+static void
+st_pin_driver_to_l3_cache(struct gl_context *ctx, unsigned L3_cache)
+{
+   struct pipe_context *pipe = st_context(ctx)->pipe;
+   /* PinDriverToL3Cache hook: hand the L3 cache index chosen by the caller
+    * to the pipe context as PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE.
+    * set_context_param is called unchecked here because st_create_context
+    * installs this hook only when pipe->set_context_param is non-NULL.
+    */
+   pipe->set_context_param(pipe, PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE,
+                           L3_cache);
+}
+
+
 static void
 st_init_driver_functions(struct pipe_screen *screen,
                         struct dd_function_table *functions)
@ -999,6 +1009,9 @@ st_create_context(gl_api api, struct pipe_context *pipe,
   memset(&funcs, 0, sizeof(funcs));
   st_init_driver_functions(pipe->screen, &funcs);

+   if (pipe->set_context_param)
+      funcs.PinDriverToL3Cache = st_pin_driver_to_l3_cache;
+
   ctx = calloc(1, sizeof(struct gl_context));
   if (!ctx)
      return NULL;