From 11dbdedf46575cf114244eedd283dc63fd9e99f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Thu, 4 Jan 2024 12:44:12 -0500 Subject: [PATCH] st/mesa: optimize st_update_arrays using lots of C++ template variants This adds the following template options: - add an option to fill TC set_vertex_buffers from st_update_array directly (always true without u_vbuf, so always used with radeonsi) - add an option saying that there are no zero-stride attribs - add an option saying that there are no user buffers (always true with glthread, so always used with radeonsi) - add an option saying that there is an identity mapping between vertex buffers and vertex attribs I have specifically chosen those options because they improve performance. I also had other options that didn't, like unrolling the setup_arrays loop. This adds a total of 42 variants of st_update_array_templ for various cases. Usually only a few of them are used in practice. Overhead of st_prepare_draw in VP2020/Catia: Before: 8.5% of CPU used After: 6.13% of CPU used That's 2.37% improvement. Since there are 4 threads using the CPU and the percentage includes all threads in the system, the improvement for the GL thread is about 9.5% (roughly 2.37% * 4; each thread at 25% of global utilization means 100% utilization in 4 cores). 
Reviewed-By: Mike Blumenkrantz Part-of: --- src/mesa/state_tracker/st_atom_array.cpp | 315 ++++++++++++++++++++--- src/util/bitscan.h | 1 + 2 files changed, 278 insertions(+), 38 deletions(-) diff --git a/src/mesa/state_tracker/st_atom_array.cpp b/src/mesa/state_tracker/st_atom_array.cpp index e899b993238..faa2d626de9 100644 --- a/src/mesa/state_tracker/st_atom_array.cpp +++ b/src/mesa/state_tracker/st_atom_array.cpp @@ -44,16 +44,38 @@ #include "util/u_cpu_detect.h" #include "util/u_math.h" #include "util/u_upload_mgr.h" +#include "util/u_threaded_context.h" #include "main/bufferobj.h" #include "main/glformats.h" #include "main/varray.h" #include "main/arrayobj.h" +enum st_fill_tc_set_vb { + FILL_TC_SET_VB_OFF, /* always works */ + FILL_TC_SET_VB_ON, /* specialized version (faster) */ +}; + enum st_use_vao_fast_path { VAO_FAST_PATH_OFF, /* more complicated version (slower) */ VAO_FAST_PATH_ON, /* always works (faster) */ }; +enum st_allow_zero_stride_attribs { + ZERO_STRIDE_ATTRIBS_OFF, /* specialized version (faster) */ + ZERO_STRIDE_ATTRIBS_ON, /* always works */ +}; + +/* Whether vertex attrib indices are equal to their vertex buffer indices. */ +enum st_identity_attrib_mapping { + IDENTITY_ATTRIB_MAPPING_OFF, /* always works */ + IDENTITY_ATTRIB_MAPPING_ON, /* specialized version (faster) */ +}; + +enum st_allow_user_buffers { + USER_BUFFERS_OFF, /* specialized version (faster) */ + USER_BUFFERS_ON, /* always works */ +}; + enum st_update_velems { UPDATE_VELEMS_OFF, /* specialized version (faster) */ UPDATE_VELEMS_ON, /* always works */ @@ -82,7 +104,11 @@ init_velement(struct pipe_vertex_element *velements, * on the stack. */ template void ALWAYS_INLINE setup_arrays(struct gl_context *ctx, const struct gl_vertex_array_object *vao, @@ -95,38 +121,73 @@ setup_arrays(struct gl_context *ctx, /* Set up enabled vertex arrays. 
*/ if (USE_VAO_FAST_PATH) { const GLubyte *attribute_map = - _mesa_vao_attribute_map[vao->_AttributeMapMode]; + !HAS_IDENTITY_ATTRIB_MAPPING ? + _mesa_vao_attribute_map[vao->_AttributeMapMode] : NULL; + struct pipe_context *pipe = ctx->pipe; + struct tc_buffer_list *next_buffer_list = NULL; + if (FILL_TC_SET_VB) + next_buffer_list = tc_get_next_buffer_list(pipe); + + /* Note: I did try to unroll this loop by passing the number of + * iterations as a template parameter, but it resulted in more overhead. + */ while (mask) { const gl_vert_attrib attr = (gl_vert_attrib)u_bit_scan(&mask); - const struct gl_array_attributes *const attrib = - &vao->VertexAttrib[attribute_map[attr]]; - const struct gl_vertex_buffer_binding *const binding = - &vao->BufferBinding[attrib->BufferBindingIndex]; + const struct gl_array_attributes *attrib; + const struct gl_vertex_buffer_binding *binding; + + if (HAS_IDENTITY_ATTRIB_MAPPING) { + attrib = &vao->VertexAttrib[attr]; + binding = &vao->BufferBinding[attr]; + } else { + attrib = &vao->VertexAttrib[attribute_map[attr]]; + binding = &vao->BufferBinding[attrib->BufferBindingIndex]; + } const unsigned bufidx = (*num_vbuffers)++; /* Set the vertex buffer. 
*/ - if (binding->BufferObj) { - vbuffer[bufidx].buffer.resource = + if (!ALLOW_USER_BUFFERS || binding->BufferObj) { + assert(binding->BufferObj); + struct pipe_resource *buf = _mesa_get_bufferobj_reference(ctx, binding->BufferObj); + vbuffer[bufidx].buffer.resource = buf; vbuffer[bufidx].is_user_buffer = false; vbuffer[bufidx].buffer_offset = binding->Offset + attrib->RelativeOffset; + if (FILL_TC_SET_VB) + tc_track_vertex_buffer(pipe, bufidx, buf, next_buffer_list); } else { vbuffer[bufidx].buffer.user = attrib->Ptr; vbuffer[bufidx].is_user_buffer = true; vbuffer[bufidx].buffer_offset = 0; + assert(!FILL_TC_SET_VB); } if (!UPDATE_VELEMS) continue; + /* Determine the vertex element index without popcnt + * if !ALLOW_ZERO_STRIDE_ATTRIBS, which means that we don't need + * to leave any holes for zero-stride attribs, thus the mapping from + * vertex elements to vertex buffers is identity. + */ + unsigned index; + + if (ALLOW_ZERO_STRIDE_ATTRIBS) { + assert(POPCNT != POPCNT_INVALID); + index = util_bitcount_fast(inputs_read & + BITFIELD_MASK(attr)); + } else { + index = bufidx; + assert(index == util_bitcount(inputs_read & + BITFIELD_MASK(attr))); + } + /* Set the vertex element. */ - init_velement(velements->velems, &attrib->Format, 0, - binding->Stride, + init_velement(velements->velems, &attrib->Format, 0, binding->Stride, binding->InstanceDivisor, bufidx, - dual_slot_inputs & BITFIELD_BIT(attr), - util_bitcount_fast(inputs_read & BITFIELD_MASK(attr))); + dual_slot_inputs & BITFIELD_BIT(attr), index); } return; } @@ -136,6 +197,15 @@ setup_arrays(struct gl_context *ctx, */ assert(!ctx->Const.UseVAOFastPath || vao->SharedAndImmutable); + /* Require these because we don't use them here and we don't want to + * generate identical template variants. 
+ */ + assert(!FILL_TC_SET_VB); + assert(ALLOW_ZERO_STRIDE_ATTRIBS); + assert(!HAS_IDENTITY_ATTRIB_MAPPING); + assert(ALLOW_USER_BUFFERS); + assert(UPDATE_VELEMS); + while (mask) { /* The attribute index to start pulling a binding */ const gl_vert_attrib i = (gl_vert_attrib)(ffs(mask) - 1); @@ -164,8 +234,6 @@ setup_arrays(struct gl_context *ctx, /* We can assume that we have array for the binding */ assert(attrmask); - if (!UPDATE_VELEMS) - continue; /* Walk attributes belonging to the binding */ do { @@ -173,10 +241,13 @@ setup_arrays(struct gl_context *ctx, const struct gl_array_attributes *const attrib = _mesa_draw_array_attrib(vao, attr); const GLuint off = _mesa_draw_attributes_relative_offset(attrib); + assert(POPCNT != POPCNT_INVALID); + init_velement(velements->velems, &attrib->Format, off, binding->Stride, binding->InstanceDivisor, bufidx, dual_slot_inputs & BITFIELD_BIT(attr), - util_bitcount_fast(inputs_read & BITFIELD_MASK(attr))); + util_bitcount_fast(inputs_read & + BITFIELD_MASK(attr))); } while (attrmask); } } @@ -192,7 +263,9 @@ st_setup_arrays(struct st_context *st, struct gl_context *ctx = st->ctx; GLbitfield enabled_arrays = _mesa_get_enabled_vertex_arrays(ctx); - setup_arrays + setup_arrays (ctx, ctx->Array._DrawVAO, vp->Base.DualSlotInputs, vp_variant->vert_attrib_mask, vp_variant->vert_attrib_mask & enabled_arrays, @@ -205,7 +278,9 @@ st_setup_arrays(struct st_context *st, * Return the index of the vertex buffer where current attribs have been * uploaded. 
*/ -template void ALWAYS_INLINE +template void ALWAYS_INLINE st_setup_current(struct st_context *st, const GLbitfield dual_slot_inputs, const GLbitfield inputs_read, @@ -216,6 +291,7 @@ st_setup_current(struct st_context *st, /* Process values that should have better been uniforms in the application */ if (curmask) { struct gl_context *ctx = st->ctx; + assert(POPCNT != POPCNT_INVALID); unsigned num_attribs = util_bitcount_fast(curmask); unsigned num_dual_attribs = util_bitcount_fast(curmask & dual_slot_inputs); @@ -245,6 +321,12 @@ st_setup_current(struct st_context *st, &vbuffer[bufidx].buffer.resource, (void**)&ptr); uint8_t *cursor = ptr; + if (FILL_TC_SET_VB) { + struct pipe_context *pipe = ctx->pipe; + tc_track_vertex_buffer(pipe, bufidx, vbuffer[bufidx].buffer.resource, + tc_get_next_buffer_list(pipe)); + } + do { const gl_vert_attrib attr = (gl_vert_attrib)u_bit_scan(&curmask); const struct gl_array_attributes *const attrib @@ -264,7 +346,8 @@ st_setup_current(struct st_context *st, if (UPDATE_VELEMS) { init_velement(velements->velems, &attrib->Format, cursor - ptr, 0, 0, bufidx, dual_slot_inputs & BITFIELD_BIT(attr), - util_bitcount_fast(inputs_read & BITFIELD_MASK(attr))); + util_bitcount_fast(inputs_read & + BITFIELD_MASK(attr))); } cursor += size; @@ -308,7 +391,11 @@ st_setup_current_user(struct st_context *st, } template void ALWAYS_INLINE st_update_array_templ(struct st_context *st, const GLbitfield enabled_arrays, @@ -324,47 +411,174 @@ st_update_array_templ(struct st_context *st, const struct st_common_variant *vp_variant = st->vp_variant; const GLbitfield inputs_read = vp_variant->vert_attrib_mask; const GLbitfield dual_slot_inputs = vp->Base.DualSlotInputs; - const GLbitfield userbuf_arrays = inputs_read & enabled_user_arrays; + const GLbitfield userbuf_arrays = + ALLOW_USER_BUFFERS ? 
inputs_read & enabled_user_arrays : 0; bool uses_user_vertex_buffers = userbuf_arrays != 0; st->draw_needs_minmax_index = (userbuf_arrays & ~nonzero_divisor_arrays) != 0; - struct pipe_vertex_buffer vbuffer[PIPE_MAX_ATTRIBS]; - unsigned num_vbuffers = 0; + struct pipe_vertex_buffer vbuffer_local[PIPE_MAX_ATTRIBS]; + struct pipe_vertex_buffer *vbuffer; + unsigned num_vbuffers = 0, num_vbuffers_tc; struct cso_velems_state velements; + if (FILL_TC_SET_VB) { + assert(!uses_user_vertex_buffers); + assert(POPCNT != POPCNT_INVALID); + num_vbuffers_tc = util_bitcount_fast(inputs_read & + enabled_arrays); + + /* Add up to 1 vertex buffer for zero-stride vertex attribs. */ + num_vbuffers_tc += ALLOW_ZERO_STRIDE_ATTRIBS && + inputs_read & ~enabled_arrays; + vbuffer = tc_add_set_vertex_buffers_call(st->pipe, num_vbuffers_tc); + } else { + vbuffer = vbuffer_local; + } + /* ST_NEW_VERTEX_ARRAYS */ /* Setup arrays */ - setup_arrays + setup_arrays (ctx, ctx->Array._DrawVAO, dual_slot_inputs, inputs_read, inputs_read & enabled_arrays, &velements, vbuffer, &num_vbuffers); /* _NEW_CURRENT_ATTRIB */ /* Setup zero-stride attribs. */ - st_setup_current - (st, dual_slot_inputs, inputs_read, inputs_read & ~enabled_arrays, - &velements, vbuffer, &num_vbuffers); + if (ALLOW_ZERO_STRIDE_ATTRIBS) { + st_setup_current + (st, dual_slot_inputs, inputs_read, inputs_read & ~enabled_arrays, + &velements, vbuffer, &num_vbuffers); + } else { + assert(!(inputs_read & ~enabled_arrays)); + } - struct cso_context *cso = st->cso_context; + if (FILL_TC_SET_VB) + assert(num_vbuffers == num_vbuffers_tc); if (UPDATE_VELEMS) { + struct cso_context *cso = st->cso_context; velements.count = vp->num_inputs + vp_variant->key.passthrough_edgeflags; /* Set vertex buffers and elements. 
*/ - cso_set_vertex_buffers_and_elements(cso, &velements, num_vbuffers, - uses_user_vertex_buffers, vbuffer); + if (FILL_TC_SET_VB) { + cso_set_vertex_elements(cso, &velements); + } else { + cso_set_vertex_buffers_and_elements(cso, &velements, num_vbuffers, + uses_user_vertex_buffers, vbuffer); + } /* The driver should clear this after it has processed the update. */ ctx->Array.NewVertexElements = false; st->uses_user_vertex_buffers = uses_user_vertex_buffers; } else { /* Only vertex buffers. */ - cso_set_vertex_buffers(cso, num_vbuffers, true, vbuffer); + if (!FILL_TC_SET_VB) + cso_set_vertex_buffers(st->cso_context, num_vbuffers, true, vbuffer); + /* This can change only when we update vertex elements. */ assert(st->uses_user_vertex_buffers == uses_user_vertex_buffers); } } +typedef void (*update_array_func)(struct st_context *st, + const GLbitfield enabled_arrays, + const GLbitfield enabled_user_attribs, + const GLbitfield nonzero_divisor_attribs); + +/* This just initializes the table of all st_update_array variants. */ +struct st_update_array_table { + update_array_func funcs[2][2][2][2][2][2]; + + template + void init_one() + { + /* These conditions reduce the number of compiled variants. */ + /* The TC path is only valid without user buffers. + */ + constexpr st_fill_tc_set_vb fill_tc_set_vb = + !ALLOW_USER_BUFFERS ? FILL_TC_SET_VB : FILL_TC_SET_VB_OFF; + + /* POPCNT is unused without zero-stride attribs and without TC. */ + constexpr util_popcnt popcnt = + !ALLOW_ZERO_STRIDE_ATTRIBS && !fill_tc_set_vb ? + POPCNT_INVALID : POPCNT; + + funcs[POPCNT][FILL_TC_SET_VB][ALLOW_ZERO_STRIDE_ATTRIBS] + [HAS_IDENTITY_ATTRIB_MAPPING][ALLOW_USER_BUFFERS][UPDATE_VELEMS] = + st_update_array_templ< + popcnt, + fill_tc_set_vb, + VAO_FAST_PATH_ON, + ALLOW_ZERO_STRIDE_ATTRIBS, + HAS_IDENTITY_ATTRIB_MAPPING, + ALLOW_USER_BUFFERS, + UPDATE_VELEMS>; + } + + /* We have to do this in stages because of the combinatorial explosion of + * variants. 
+ */ + template + void init_last_3_args() + { + init_one(); + init_one(); + init_one(); + init_one(); + init_one(); + init_one(); + init_one(); + init_one(); + } + + st_update_array_table() + { + init_last_3_args(); + init_last_3_args(); + init_last_3_args(); + init_last_3_args(); + init_last_3_args(); + init_last_3_args(); + init_last_3_args(); + init_last_3_args(); + } +}; + +static st_update_array_table update_array_table; + template void ALWAYS_INLINE st_update_array_impl(struct st_context *st) @@ -384,19 +598,42 @@ st_update_array_impl(struct st_context *st) _mesa_get_derived_vao_masks(ctx, enabled_arrays, &enabled_user_arrays, &nonzero_divisor_arrays); + /* Execute the slow path without using multiple C++ template variants. */ + if (!USE_VAO_FAST_PATH) { + st_update_array_templ + (st, enabled_arrays, enabled_user_arrays, nonzero_divisor_arrays); + return; + } + + /* The fast path that selects from multiple C++ template variants. */ + const GLbitfield inputs_read = st->vp_variant->vert_attrib_mask; + const GLbitfield enabled_arrays_read = inputs_read & enabled_arrays; + + /* Check cso_context whether it goes directly to TC. */ + bool fill_tc_set_vbs = st->cso_context->draw_vbo == tc_draw_vbo; + bool has_zero_stride_attribs = inputs_read & ~enabled_arrays; + uint32_t non_identity_attrib_mapping = + vao->_AttributeMapMode == ATTRIBUTE_MAP_MODE_IDENTITY ? 0 : + vao->_AttributeMapMode == ATTRIBUTE_MAP_MODE_POSITION ? VERT_BIT_GENERIC0 + : VERT_BIT_POS; + bool has_identity_mapping = !(enabled_arrays_read & + (vao->NonIdentityBufferAttribMapping | + non_identity_attrib_mapping)); + /* has_user_buffers is always false with glthread. */ + bool has_user_buffers = inputs_read & enabled_user_arrays; /* Changing from user to non-user buffers and vice versa can switch between * cso and u_vbuf, which means that we need to update vertex elements even * when they have not changed. 
*/ - if (ctx->Array.NewVertexElements || - st->uses_user_vertex_buffers != - !!(st->vp_variant->vert_attrib_mask & enabled_user_arrays)) { - st_update_array_templ - (st, enabled_arrays, enabled_user_arrays, nonzero_divisor_arrays); - } else { - st_update_array_templ - (st, enabled_arrays, enabled_user_arrays, nonzero_divisor_arrays); - } + bool update_velems = ctx->Array.NewVertexElements || + st->uses_user_vertex_buffers != has_user_buffers; + + update_array_table.funcs[POPCNT][fill_tc_set_vbs][has_zero_stride_attribs] + [has_identity_mapping][has_user_buffers] + [update_velems] + (st, enabled_arrays, enabled_user_arrays, nonzero_divisor_arrays); } /* The default callback that must be present before st_init_update_array @@ -442,7 +679,9 @@ st_create_gallium_vertex_state(struct gl_context *ctx, /* This should use the slow path because there is only 1 interleaved * vertex buffers. */ - setup_arrays + setup_arrays (ctx, vao, dual_slot_inputs, inputs_read, inputs_read, &velements, vbuffer, &num_vbuffers); diff --git a/src/util/bitscan.h b/src/util/bitscan.h index ae0e721e61c..01f6d80b1c3 100644 --- a/src/util/bitscan.h +++ b/src/util/bitscan.h @@ -399,6 +399,7 @@ util_widen_mask(uint32_t mask, unsigned multiplier) enum util_popcnt { POPCNT_NO, POPCNT_YES, + POPCNT_INVALID, }; /* Convenient function to select popcnt through a C++ template argument.