nvc0/vbo: wrap draw_vbo for multidraw performance

This patch avoids the high overhead of kicking the pushbuffer for every
single draw during multidraw.
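
In rough shape, the change splits the gallium draw_vbo entry point into a
per-draw helper plus a loop that kicks the pushbuffer only once at the end.
A minimal sketch of that pattern follows; the names (draw_t, single_draw(),
kick()) are illustrative stand-ins, not the real nvc0_draw_single_vbo() or
PUSH_KICK() driver code:

  /* Sketch of the wrapping pattern only; every name here is a stand-in. */
  struct draw_t { unsigned start, count; };

  static void single_draw(const struct draw_t *d) { /* emit one draw */ }
  static void kick(void)                          { /* submit the pushbuf */ }

  static void draw_vbo(const struct draw_t *draws, unsigned num_draws)
  {
     /* validate state once, up front */
     for (unsigned i = 0; i < num_draws; i++)
        if (draws[i].count)
           single_draw(&draws[i]);   /* no kick per draw */
     kick();                         /* one kick for the whole multidraw */
  }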

glMultiDrawArrays performance profiling:

342.5 thousand draws/second -> 40 million draws/second
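
(For context, a draws/second figure like this can be obtained by timing a
loop around glMultiDrawArrays. The sketch below only illustrates such a
microbenchmark; it is not the profiling code used here, and it assumes a
current GL context with a VAO and program already bound.)

  #include <GL/gl.h>
  #include <time.h>

  /* Illustrative only; drawcount must be <= 1024 in this sketch. */
  static double multidraw_rate(GLsizei drawcount, int iters)
  {
     static GLint   first[1024];       /* all zeros: reuse the same vertices */
     static GLsizei count[1024];
     for (int i = 0; i < drawcount; i++)
        count[i] = 3;                  /* one tiny triangle per draw */

     struct timespec t0, t1;
     clock_gettime(CLOCK_MONOTONIC, &t0);
     for (int it = 0; it < iters; it++)
        glMultiDrawArrays(GL_TRIANGLES, first, count, drawcount);
     glFinish();                       /* wait for the GPU before stopping the clock */
     clock_gettime(CLOCK_MONOTONIC, &t1);

     double secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
     return (double)iters * drawcount / secs;   /* draws per second */
  }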

Special thanks to Arthur Huillet for helping get this profiled on IRC.

Signed-off-by: Yusuf Khan <yusisamerican@gmail.com>
Reviewed-by: Karol Herbst <kherbst@redhat.com>
---
v2: fix typos pointed out by Arthur
v3: nvc0_draw_vbo -> nvc0_draw_single_vbo, initialize count
v4: remove num_draws from wrapped function and add mutex assert

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28641>

@@ -925,6 +925,126 @@ nvc0_update_prim_restart(struct nvc0_context *nvc0, bool en, uint32_t index)
   }
}

static void
nvc0_draw_single_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info,
                     unsigned drawid_offset,
                     const struct pipe_draw_indirect_info *indirect,
                     const struct pipe_draw_start_count_bias *draws)
{
   struct nvc0_context *nvc0 = nvc0_context(pipe);
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_screen *screen = nvc0->screen;
   int s;

   simple_mtx_assert_locked(&nvc0->screen->state_lock);

   if (nvc0->vertprog->vp.need_draw_parameters && (!indirect || indirect->count_from_stream_output)) {
      PUSH_SPACE(push, 9);
      BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
      PUSH_DATA (push, NVC0_CB_AUX_SIZE);
      PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0));
      PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0));
      BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 3);
      PUSH_DATA (push, NVC0_CB_AUX_DRAW_INFO);
      PUSH_DATA (push, info->index_size ? draws->index_bias : 0);
      PUSH_DATA (push, info->start_instance);
      PUSH_DATA (push, drawid_offset);
   }

   if (nvc0->screen->base.class_3d < NVE4_3D_CLASS &&
       nvc0->seamless_cube_map != nvc0->state.seamless_cube_map) {
      nvc0->state.seamless_cube_map = nvc0->seamless_cube_map;
      PUSH_SPACE(push, 1);
      IMMED_NVC0(push, NVC0_3D(TEX_MISC),
                 nvc0->seamless_cube_map ? NVC0_3D_TEX_MISC_SEAMLESS_CUBE_MAP : 0);
   }

   nvc0->base.kick_notify = nvc0_draw_vbo_kick_notify;

   for (s = 0; s < 5 && !nvc0->cb_dirty; ++s) {
      if (nvc0->constbuf_coherent[s])
         nvc0->cb_dirty = true;
   }

   if (nvc0->cb_dirty) {
      PUSH_SPACE(push, 1);
      IMMED_NVC0(push, NVC0_3D(MEM_BARRIER), 0x1011);
      nvc0->cb_dirty = false;
   }

   for (s = 0; s < 5; ++s) {
      if (!nvc0->textures_coherent[s])
         continue;

      PUSH_SPACE(push, nvc0->num_textures[s] * 2);

      for (int i = 0; i < nvc0->num_textures[s]; ++i) {
         struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]);
         if (!(nvc0->textures_coherent[s] & (1 << i)))
            continue;

         BEGIN_NVC0(push, NVC0_3D(TEX_CACHE_CTL), 1);
         PUSH_DATA (push, (tic->id << 4) | 1);
         NOUVEAU_DRV_STAT(&nvc0->screen->base, tex_cache_flush_count, 1);
      }
   }

   if (nvc0->state.vbo_mode) {
      if (indirect && indirect->buffer)
         nvc0_push_vbo_indirect(nvc0, info, drawid_offset, indirect, &draws[0]);
      else
         nvc0_push_vbo(nvc0, info, indirect, &draws[0]);
      return;
   }

   /* space for base instance, flush, and prim restart */
   PUSH_SPACE(push, 8);

   if (nvc0->state.instance_base != info->start_instance) {
      nvc0->state.instance_base = info->start_instance;
      /* NOTE: this does not affect the shader input, should it ? */
      BEGIN_NVC0(push, NVC0_3D(VB_INSTANCE_BASE), 1);
      PUSH_DATA (push, info->start_instance);
   }

   nvc0->base.vbo_dirty |= !!nvc0->vtxbufs_coherent;

   if (!nvc0->base.vbo_dirty && info->index_size && !info->has_user_indices &&
       info->index.resource->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
      nvc0->base.vbo_dirty = true;

   nvc0_update_prim_restart(nvc0, info->primitive_restart, info->restart_index);

   if (nvc0->base.vbo_dirty) {
      if (nvc0->screen->eng3d->oclass < GM107_3D_CLASS)
         IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FLUSH), 0);
      nvc0->base.vbo_dirty = false;
   }

   if (unlikely(indirect && indirect->buffer)) {
      nvc0_draw_indirect(nvc0, info, drawid_offset, indirect);
   } else
   if (unlikely(indirect && indirect->count_from_stream_output)) {
      nvc0_draw_stream_output(nvc0, info, indirect);
   } else
   if (info->index_size) {
      bool shorten = info->index_bounds_valid && info->max_index <= 65535;
      if (info->primitive_restart && info->restart_index > 65535)
         shorten = false;
      nvc0_draw_elements(nvc0, shorten, info,
                         info->mode, draws[0].start, draws[0].count,
                         info->instance_count, draws->index_bias, info->index_size);
   } else {
      nvc0_draw_arrays(nvc0,
                       info->mode, draws[0].start, draws[0].count,
                       info->instance_count);
   }
}

/* Thin wrapper to avoid kicking the pushbuffer for every single draw
 * during multidraw */
void
nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info,
              unsigned drawid_offset,
@@ -932,19 +1052,23 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info,
              const struct pipe_draw_start_count_bias *draws,
              unsigned num_draws)
{
   if (num_draws > 1) {
      util_draw_multi(pipe, info, drawid_offset, indirect, draws, num_draws);
      return;
   }

   if (!indirect && (!draws[0].count || !info->instance_count))
      return;

   struct nvc0_context *nvc0 = nvc0_context(pipe);
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_screen *screen = nvc0->screen;
   unsigned vram_domain = NV_VRAM_DOMAIN(&screen->base);
   int s;
   unsigned count_total = 0;
   /* The rest is copied straight from util_draw_multi()
    *
    * XXX: Properly rewrite vbo handling in nvc0/nv50
    */
   unsigned drawid = drawid_offset;

   /* hold the screen state lock across all draws; nvc0_draw_single_vbo() asserts it */
   simple_mtx_lock(&nvc0->screen->state_lock);

   /* NOTE: caller must ensure that (min_index + index_bias) is >= 0 */
   if (info->index_bounds_valid) {
@@ -957,12 +1081,17 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info,
   nvc0->instance_off = info->start_instance;
   nvc0->instance_max = info->instance_count - 1;

   /* Get total amount of draw counts to determine whether to push or upload vertices */
   for (unsigned i = 0; i < num_draws; i++) {
      count_total += draws[i].count;
   }

   /* For picking only a few vertices from a large user buffer, push is better,
    * if index count is larger and we expect repeated vertices, suggest upload.
    */
   nvc0->vbo_push_hint =
      (!indirect || indirect->count_from_stream_output) && info->index_size &&
      (nvc0->vb_elt_limit >= (draws[0].count * 2));
      (nvc0->vb_elt_limit >= (count_total * 2));

   if (nvc0->dirty_3d & (NVC0_NEW_3D_ARRAYS | NVC0_NEW_3D_VERTEX))
      nvc0->constant_vbos = nvc0->vertex->constant_vbos & nvc0->vbo_user;
@@ -1028,115 +1157,15 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info,
   BCTX_REFN_bo(nvc0->bufctx_3d, 3D_TEXT, vram_domain | NOUVEAU_BO_RD,
                screen->text);

   simple_mtx_lock(&nvc0->screen->state_lock);
   nvc0_state_validate_3d(nvc0, ~0);

   if (nvc0->vertprog->vp.need_draw_parameters && (!indirect || indirect->count_from_stream_output)) {
      PUSH_SPACE(push, 9);
      BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
      PUSH_DATA (push, NVC0_CB_AUX_SIZE);
      PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0));
      PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0));
      BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 3);
      PUSH_DATA (push, NVC0_CB_AUX_DRAW_INFO);
      PUSH_DATA (push, info->index_size ? draws->index_bias : 0);
      PUSH_DATA (push, info->start_instance);
      PUSH_DATA (push, drawid_offset);
   for (unsigned i = 0; i < num_draws; i++) {
      if (indirect || (draws[i].count && info->instance_count))
         nvc0_draw_single_vbo(pipe, info, drawid, indirect, &draws[i]);

      if (info->increment_draw_id)
         drawid++;
   }

   if (nvc0->screen->base.class_3d < NVE4_3D_CLASS &&
       nvc0->seamless_cube_map != nvc0->state.seamless_cube_map) {
      nvc0->state.seamless_cube_map = nvc0->seamless_cube_map;
      PUSH_SPACE(push, 1);
      IMMED_NVC0(push, NVC0_3D(TEX_MISC),
                 nvc0->seamless_cube_map ? NVC0_3D_TEX_MISC_SEAMLESS_CUBE_MAP : 0);
   }

   nvc0->base.kick_notify = nvc0_draw_vbo_kick_notify;

   for (s = 0; s < 5 && !nvc0->cb_dirty; ++s) {
      if (nvc0->constbuf_coherent[s])
         nvc0->cb_dirty = true;
   }

   if (nvc0->cb_dirty) {
      PUSH_SPACE(push, 1);
      IMMED_NVC0(push, NVC0_3D(MEM_BARRIER), 0x1011);
      nvc0->cb_dirty = false;
   }

   for (s = 0; s < 5; ++s) {
      if (!nvc0->textures_coherent[s])
         continue;

      PUSH_SPACE(push, nvc0->num_textures[s] * 2);

      for (int i = 0; i < nvc0->num_textures[s]; ++i) {
         struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]);
         if (!(nvc0->textures_coherent[s] & (1 << i)))
            continue;

         BEGIN_NVC0(push, NVC0_3D(TEX_CACHE_CTL), 1);
         PUSH_DATA (push, (tic->id << 4) | 1);
         NOUVEAU_DRV_STAT(&nvc0->screen->base, tex_cache_flush_count, 1);
      }
   }

   if (nvc0->state.vbo_mode) {
      if (indirect && indirect->buffer)
         nvc0_push_vbo_indirect(nvc0, info, drawid_offset, indirect, &draws[0]);
      else
         nvc0_push_vbo(nvc0, info, indirect, &draws[0]);
      goto cleanup;
   }

   /* space for base instance, flush, and prim restart */
   PUSH_SPACE(push, 8);

   if (nvc0->state.instance_base != info->start_instance) {
      nvc0->state.instance_base = info->start_instance;
      /* NOTE: this does not affect the shader input, should it ? */
      BEGIN_NVC0(push, NVC0_3D(VB_INSTANCE_BASE), 1);
      PUSH_DATA (push, info->start_instance);
   }

   nvc0->base.vbo_dirty |= !!nvc0->vtxbufs_coherent;

   if (!nvc0->base.vbo_dirty && info->index_size && !info->has_user_indices &&
       info->index.resource->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
      nvc0->base.vbo_dirty = true;

   nvc0_update_prim_restart(nvc0, info->primitive_restart, info->restart_index);

   if (nvc0->base.vbo_dirty) {
      if (nvc0->screen->eng3d->oclass < GM107_3D_CLASS)
         IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FLUSH), 0);
      nvc0->base.vbo_dirty = false;
   }

   if (unlikely(indirect && indirect->buffer)) {
      nvc0_draw_indirect(nvc0, info, drawid_offset, indirect);
   } else
   if (unlikely(indirect && indirect->count_from_stream_output)) {
      nvc0_draw_stream_output(nvc0, info, indirect);
   } else
   if (info->index_size) {
      bool shorten = info->index_bounds_valid && info->max_index <= 65535;
      if (info->primitive_restart && info->restart_index > 65535)
         shorten = false;
      nvc0_draw_elements(nvc0, shorten, info,
                         info->mode, draws[0].start, draws[0].count,
                         info->instance_count, draws->index_bias, info->index_size);
   } else {
      nvc0_draw_arrays(nvc0,
                       info->mode, draws[0].start, draws[0].count,
                       info->instance_count);
   }

cleanup:
   PUSH_KICK(push);

   simple_mtx_unlock(&nvc0->screen->state_lock);