glthread: pin driver threads to the same L3 as the main thread regularly

This improves performance on my Ryzen 3900X, which has 4 L3 caches and
6 threads per L3.

The improvement is up to 33%, as long as the kernel CPU scheduler doesn't
move the main thread too often.

v2: pin only once every 128 batch flushes

Acked-by: Jose Fonseca <jfonseca@vmware.com>
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7054>
This commit is contained in:
Marek Olšák 2020-10-07 07:41:41 -04:00 committed by Marge Bot
parent d8ea509965
commit 5957b0c162
4 changed files with 38 additions and 0 deletions

View file

@ -1326,6 +1326,8 @@ struct dd_function_table {
void (*SetMaxShaderCompilerThreads)(struct gl_context *ctx, unsigned count);
bool (*GetShaderProgramCompletionStatus)(struct gl_context *ctx,
struct gl_shader_program *shprog);
void (*PinDriverToL3Cache)(struct gl_context *ctx, unsigned L3_cache);
};

View file

@ -38,6 +38,7 @@
#include "main/hash.h"
#include "util/u_atomic.h"
#include "util/u_thread.h"
#include "util/u_cpu_detect.h"
static void
@ -195,6 +196,25 @@ _mesa_glthread_flush_batch(struct gl_context *ctx)
if (!next->used)
return;
/* Pin threads regularly to the same Zen CCX that the main thread is
* running on. The main thread can move between CCXs.
*/
if (util_cpu_caps.nr_cpus != util_cpu_caps.cores_per_L3 &&
/* driver support */
ctx->Driver.PinDriverToL3Cache &&
++glthread->pin_thread_counter % 128 == 0) {
int cpu = util_get_current_cpu();
if (cpu >= 0) {
unsigned L3_cache = util_cpu_caps.cpu_to_L3[cpu];
util_set_thread_affinity(glthread->queue.threads[0],
util_cpu_caps.L3_affinity_mask[L3_cache],
NULL, UTIL_MAX_CPUS);
ctx->Driver.PinDriverToL3Cache(ctx, L3_cache);
}
}
/* Debug: execute the batch immediately from this thread.
*
* Note that glthread_unmarshal_batch() changes the dispatch table so we'll

View file

@ -134,6 +134,9 @@ struct glthread_state
/** Whether GLThread is inside a display list generation. */
bool inside_dlist;
/** Batch-flush counter used to throttle how often threads are re-pinned to an L3 cache. */
unsigned pin_thread_counter;
/** The ring of batches in memory. */
struct glthread_batch batches[MARSHAL_MAX_BATCHES];

View file

@ -908,6 +908,16 @@ st_get_driver_uuid(struct gl_context *ctx, char *uuid)
}
/**
 * dd_function_table::PinDriverToL3Cache implementation for the state tracker.
 *
 * Forwards the requested L3 cache index to the gallium pipe context through
 * set_context_param() using PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE.
 *
 * No NULL check is done on set_context_param here; st_create_context() only
 * installs this hook when the pipe context provides that callback.
 *
 * \param ctx       GL context whose pipe context receives the request.
 * \param L3_cache  Index of the L3 cache passed through to the driver.
 */
static void
st_pin_driver_to_l3_cache(struct gl_context *ctx, unsigned L3_cache)
{
   struct pipe_context *pctx = st_context(ctx)->pipe;

   pctx->set_context_param(pctx,
                           PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE,
                           L3_cache);
}
static void
st_init_driver_functions(struct pipe_screen *screen,
struct dd_function_table *functions)
@ -999,6 +1009,9 @@ st_create_context(gl_api api, struct pipe_context *pipe,
memset(&funcs, 0, sizeof(funcs));
st_init_driver_functions(pipe->screen, &funcs);
if (pipe->set_context_param)
funcs.PinDriverToL3Cache = st_pin_driver_to_l3_cache;
ctx = calloc(1, sizeof(struct gl_context));
if (!ctx)
return NULL;