mesa/st: add specialized pbo download shaders

the goal of the pbo ubershaders was to have a very small number of shaders that could handle lots of different readback operations this is great for cases where an app is doing lots of different types of readback, but it's not as optimal for apps which do only 1-2 types of readback over and over to handle this, track the usage for every readback which reaches compute pbo and after the same readback is used N times, inline the constant data and create a specialized version of the shader which will be more optimal Acked-by: Marek Olšák <marek.olsak@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18198>
2026-01-17 02:40:37 +01:00 · 2022-08-17 10:04:43 -04:00 · 2022-08-17 10:04:43 -04:00 · 5dab7673e1
commit 5dab7673e1
parent c79fc40de5
1 changed files with 100 additions and 4 deletions
--- a/src/mesa/state_tracker/st_pbo_compute.c
+++ b/src/mesa/state_tracker/st_pbo_compute.c
@ -40,13 +40,26 @@
 #include "util/u_sampler.h"
 #include "util/streaming-load-memcpy.h"

+#define SPEC_USES_THRESHOLD 5
+
+struct pbo_spec_async_data {
+   uint32_t data[4]; //must be first
+   bool created;
+   unsigned uses;
+   struct util_queue_fence fence;
+   nir_shader *nir;
+   struct pipe_shader_state *cs;
+};
+
 struct pbo_async_data {
   struct st_context *st;
   enum pipe_texture_target target;
   unsigned num_components;
   struct util_queue_fence fence;
   nir_shader *nir;
+   nir_shader *copy; //immutable
   struct pipe_shader_state *cs;
+   struct set specialized;
 };

 #define BGR_FORMAT(NAME) \
@ -792,6 +805,53 @@ create_conversion_shader_async(void *data, void *gdata, int thread_index)
 {
   struct pbo_async_data *async = data;
   async->nir = create_conversion_shader(async->st, async->target, async->num_components);
+   /* this is hefty, but specialized shaders need a base to work from */
+   async->copy = nir_shader_clone(NULL, async->nir);
+}
+
+static void
+create_spec_shader_async(void *data, void *gdata, int thread_index)
+{
+   struct pbo_spec_async_data *spec = data;
+   /* this is still the immutable clone: create our own copy */
+   spec->nir = nir_shader_clone(NULL, spec->nir);
+   /* do not inline geometry */
+   uint16_t offsets[2] = {2, 3};
+   nir_inline_uniforms(spec->nir, ARRAY_SIZE(offsets), &spec->data[2], offsets);
+   spec->created = true;
+}
+
+static uint32_t
+hash_pbo_data(const void *data)
+{
+   const struct pbo_data *p = data;
+   return _mesa_hash_data(&p->vec[2], sizeof(uint32_t) * 2);
+}
+
+static bool
+equals_pbo_data(const void *a, const void *b)
+{
+   const struct pbo_data *pa = a, *pb = b;
+   return !memcmp(&pa->vec[2], &pb->vec[2], sizeof(uint32_t) * 2);
+}
+
+static struct pbo_spec_async_data *
+add_spec_data(struct pbo_async_data *async, struct pbo_data *pd)
+{
+   bool found = false;
+   struct pbo_spec_async_data *spec;
+   struct set_entry *entry = _mesa_set_search_or_add(&async->specialized, pd, &found);
+   if (!found) {
+      spec = calloc(1, sizeof(struct pbo_async_data));
+      util_queue_fence_init(&spec->fence);
+      memcpy(spec->data, pd, sizeof(struct pbo_data));
+      entry->key = spec;
+   }
+   spec = (void*)entry->key;
+   if (!spec->nir && !spec->created)
+      spec->nir = async->copy;
+   spec->uses++;
+   return spec;
 }

 static struct pipe_resource *
@ -845,6 +905,7 @@ download_texture_compute(struct st_context *st,
   if (he) {
      if (screen->driver_thread_add_job) {
         struct pbo_async_data *async = he->data;
+         struct pbo_spec_async_data *spec = add_spec_data(async, &pd);
         if (!util_queue_fence_is_signalled(&async->fence))
            return NULL;
         /* nir is definitely done */
@ -863,18 +924,40 @@ download_texture_compute(struct st_context *st,
             !screen->is_parallel_shader_compilation_finished(screen, async->cs, MESA_SHADER_COMPUTE))
            return NULL;
         cs = async->cs;
+         if (spec->uses > SPEC_USES_THRESHOLD && util_queue_fence_is_signalled(&spec->fence)) {
+            if (spec->created) {
+               if (!spec->cs) {
+                  struct pipe_compute_state state = {0};
+                  state.ir_type = PIPE_SHADER_IR_NIR;
+                  state.req_local_mem = spec->nir->info.shared_size;
+                  state.prog = spec->nir;
+                  spec->nir = NULL;
+                  spec->cs = pipe->create_compute_state(pipe, &state);
+               }
+               if (screen->is_parallel_shader_compilation_finished &&
+                   screen->is_parallel_shader_compilation_finished(screen, spec->cs, MESA_SHADER_COMPUTE)) {
+                  cs = spec->cs;
+                  cb.buffer_size = 2 * sizeof(uint32_t);
+               }
+            } else {
+               screen->driver_thread_add_job(screen, spec, &spec->fence, create_spec_shader_async, NULL, 0);
+            }
+         }
+      } else {
+         cs = he->data;
      }
   } else {
      if (screen->driver_thread_add_job) {
-         struct pbo_async_data *async = malloc(sizeof(struct pbo_async_data));
+         struct pbo_async_data *async = calloc(1, sizeof(struct pbo_async_data));
         async->st = st;
         async->target = view_target;
         async->num_components = num_components;
-         async->nir = NULL;
-         async->cs = NULL;
         util_queue_fence_init(&async->fence);
         screen->driver_thread_add_job(screen, async, &async->fence, create_conversion_shader_async, NULL, 0);
         _mesa_hash_table_insert(st->pbo.shaders, (void*)(uintptr_t)hash_key, async);
+
+         _mesa_set_init(&async->specialized, NULL, hash_pbo_data, equals_pbo_data);
+         add_spec_data(async, &pd);
         return NULL;
      } else {
         nir_shader *nir = create_conversion_shader(st, view_target, num_components);
@ -1225,8 +1308,21 @@ st_pbo_compute_deinit(struct st_context *st)
      if (screen->driver_thread_add_job) {
         struct pbo_async_data *async = entry->data;
         util_queue_fence_wait(&async->fence);
-         st->pipe->delete_compute_state(st->pipe, async->cs);
+         if (async->cs)
+            st->pipe->delete_compute_state(st->pipe, async->cs);
         util_queue_fence_destroy(&async->fence);
+         ralloc_free(async->copy);
+         set_foreach_remove(&async->specialized, se) {
+            struct pbo_spec_async_data *spec = (void*)se->key;
+            util_queue_fence_wait(&spec->fence);
+            util_queue_fence_destroy(&spec->fence);
+            if (spec->created) {
+               ralloc_free(spec->nir);
+               st->pipe->delete_compute_state(st->pipe, spec->cs);
+            }
+            free(spec);
+         }
+         ralloc_free(async->specialized.table);
         free(async);
      } else {
         st->pipe->delete_compute_state(st->pipe, entry->data);