From 11dbdedf46575cf114244eedd283dc63fd9e99f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Thu, 4 Jan 2024 12:44:12 -0500 Subject: [PATCH] st/mesa: optimize st_update_arrays using lots of C++ template variants This adds the following template options: - add an option to fill TC set_vertex_buffers from st_update_array directly (always true without u_vbuf, so always used with radeonsi) - add an option saying that there are no zero-stride attribs - add an option saying that there are no user buffers (always true with glthread, so always used with radeonsi) - add an option saying that there is an identity mapping between vertex buffers and vertex attribs I have specifically chosen those options because they improve performance. I also had other options that didn't, like unrolling the setup_arrays loop. This adds a total of 42 variants of st_update_array_templ for various cases. Usually only a few of them are used in practice. Overhead of st_prepare_draw in VP2020/Catia: Before: 8.5% of CPU used After: 6.13% of CPU used That's 2.37% improvement. Since there are 4 threads using the CPU and the percentage includes all threads in the system, the improvement for the GL thread is about 9.5% (roughly 2.37% * 4; each thread at 25% of global utilization means 100% utilization in 4 cores). 
Reviewed-By: Mike Blumenkrantz Part-of: --- src/mesa/state_tracker/st_atom_array.cpp | 315 ++++++++++++++++++++--- src/util/bitscan.h | 1 + 2 files changed, 278 insertions(+), 38 deletions(-) diff --git a/src/mesa/state_tracker/st_atom_array.cpp b/src/mesa/state_tracker/st_atom_array.cpp index e899b993238..faa2d626de9 100644 --- a/src/mesa/state_tracker/st_atom_array.cpp +++ b/src/mesa/state_tracker/st_atom_array.cpp @@ -44,16 +44,38 @@ #include "util/u_cpu_detect.h" #include "util/u_math.h" #include "util/u_upload_mgr.h" +#include "util/u_threaded_context.h" #include "main/bufferobj.h" #include "main/glformats.h" #include "main/varray.h" #include "main/arrayobj.h" +enum st_fill_tc_set_vb { + FILL_TC_SET_VB_OFF, /* always works */ + FILL_TC_SET_VB_ON, /* specialized version (faster) */ +}; + enum st_use_vao_fast_path { VAO_FAST_PATH_OFF, /* more complicated version (slower) */ VAO_FAST_PATH_ON, /* always works (faster) */ }; +enum st_allow_zero_stride_attribs { + ZERO_STRIDE_ATTRIBS_OFF, /* specialized version (faster) */ + ZERO_STRIDE_ATTRIBS_ON, /* always works */ +}; + +/* Whether vertex attrib indices are equal to their vertex buffer indices. */ +enum st_identity_attrib_mapping { + IDENTITY_ATTRIB_MAPPING_OFF, /* always works */ + IDENTITY_ATTRIB_MAPPING_ON, /* specialized version (faster) */ +}; + +enum st_allow_user_buffers { + USER_BUFFERS_OFF, /* specialized version (faster) */ + USER_BUFFERS_ON, /* always works */ +}; + enum st_update_velems { UPDATE_VELEMS_OFF, /* specialized version (faster) */ UPDATE_VELEMS_ON, /* always works */ @@ -82,7 +104,11 @@ init_velement(struct pipe_vertex_element *velements, * on the stack. */ template void ALWAYS_INLINE setup_arrays(struct gl_context *ctx, const struct gl_vertex_array_object *vao, @@ -95,38 +121,73 @@ setup_arrays(struct gl_context *ctx, /* Set up enabled vertex arrays. 
*/ if (USE_VAO_FAST_PATH) { const GLubyte *attribute_map = - _mesa_vao_attribute_map[vao->_AttributeMapMode]; + !HAS_IDENTITY_ATTRIB_MAPPING ? + _mesa_vao_attribute_map[vao->_AttributeMapMode] : NULL; + struct pipe_context *pipe = ctx->pipe; + struct tc_buffer_list *next_buffer_list = NULL; + if (FILL_TC_SET_VB) + next_buffer_list = tc_get_next_buffer_list(pipe); + + /* Note: I did try to unroll this loop by passing the number of + * iterations as a template parameter, but it resulted in more overhead. + */ while (mask) { const gl_vert_attrib attr = (gl_vert_attrib)u_bit_scan(&mask); - const struct gl_array_attributes *const attrib = - &vao->VertexAttrib[attribute_map[attr]]; - const struct gl_vertex_buffer_binding *const binding = - &vao->BufferBinding[attrib->BufferBindingIndex]; + const struct gl_array_attributes *attrib; + const struct gl_vertex_buffer_binding *binding; + + if (HAS_IDENTITY_ATTRIB_MAPPING) { + attrib = &vao->VertexAttrib[attr]; + binding = &vao->BufferBinding[attr]; + } else { + attrib = &vao->VertexAttrib[attribute_map[attr]]; + binding = &vao->BufferBinding[attrib->BufferBindingIndex]; + } const unsigned bufidx = (*num_vbuffers)++; /* Set the vertex buffer. 
*/ - if (binding->BufferObj) { - vbuffer[bufidx].buffer.resource = + if (!ALLOW_USER_BUFFERS || binding->BufferObj) { + assert(binding->BufferObj); + struct pipe_resource *buf = _mesa_get_bufferobj_reference(ctx, binding->BufferObj); + vbuffer[bufidx].buffer.resource = buf; vbuffer[bufidx].is_user_buffer = false; vbuffer[bufidx].buffer_offset = binding->Offset + attrib->RelativeOffset; + if (FILL_TC_SET_VB) + tc_track_vertex_buffer(pipe, bufidx, buf, next_buffer_list); } else { vbuffer[bufidx].buffer.user = attrib->Ptr; vbuffer[bufidx].is_user_buffer = true; vbuffer[bufidx].buffer_offset = 0; + assert(!FILL_TC_SET_VB); } if (!UPDATE_VELEMS) continue; + /* Determine the vertex element index without popcnt + * if !ALLOW_ZERO_STRIDE_ATTRIBS, which means that we don't need + * to leave any holes for zero-stride attribs, thus the mapping from + * vertex elements to vertex buffers is identity. + */ + unsigned index; + + if (ALLOW_ZERO_STRIDE_ATTRIBS) { + assert(POPCNT != POPCNT_INVALID); + index = util_bitcount_fast(inputs_read & + BITFIELD_MASK(attr)); + } else { + index = bufidx; + assert(index == util_bitcount(inputs_read & + BITFIELD_MASK(attr))); + } + /* Set the vertex element. */ - init_velement(velements->velems, &attrib->Format, 0, - binding->Stride, + init_velement(velements->velems, &attrib->Format, 0, binding->Stride, binding->InstanceDivisor, bufidx, - dual_slot_inputs & BITFIELD_BIT(attr), - util_bitcount_fast(inputs_read & BITFIELD_MASK(attr))); + dual_slot_inputs & BITFIELD_BIT(attr), index); } return; } @@ -136,6 +197,15 @@ setup_arrays(struct gl_context *ctx, */ assert(!ctx->Const.UseVAOFastPath || vao->SharedAndImmutable); + /* Require these because we don't use them here and we don't want to + * generate identical template variants. 
+ */ + assert(!FILL_TC_SET_VB); + assert(ALLOW_ZERO_STRIDE_ATTRIBS); + assert(!HAS_IDENTITY_ATTRIB_MAPPING); + assert(ALLOW_USER_BUFFERS); + assert(UPDATE_VELEMS); + while (mask) { /* The attribute index to start pulling a binding */ const gl_vert_attrib i = (gl_vert_attrib)(ffs(mask) - 1); @@ -164,8 +234,6 @@ setup_arrays(struct gl_context *ctx, /* We can assume that we have array for the binding */ assert(attrmask); - if (!UPDATE_VELEMS) - continue; /* Walk attributes belonging to the binding */ do { @@ -173,10 +241,13 @@ setup_arrays(struct gl_context *ctx, const struct gl_array_attributes *const attrib = _mesa_draw_array_attrib(vao, attr); const GLuint off = _mesa_draw_attributes_relative_offset(attrib); + assert(POPCNT != POPCNT_INVALID); + init_velement(velements->velems, &attrib->Format, off, binding->Stride, binding->InstanceDivisor, bufidx, dual_slot_inputs & BITFIELD_BIT(attr), - util_bitcount_fast(inputs_read & BITFIELD_MASK(attr))); + util_bitcount_fast(inputs_read & + BITFIELD_MASK(attr))); } while (attrmask); } } @@ -192,7 +263,9 @@ st_setup_arrays(struct st_context *st, struct gl_context *ctx = st->ctx; GLbitfield enabled_arrays = _mesa_get_enabled_vertex_arrays(ctx); - setup_arrays + setup_arrays (ctx, ctx->Array._DrawVAO, vp->Base.DualSlotInputs, vp_variant->vert_attrib_mask, vp_variant->vert_attrib_mask & enabled_arrays, @@ -205,7 +278,9 @@ st_setup_arrays(struct st_context *st, * Return the index of the vertex buffer where current attribs have been * uploaded. 
*/ -template void ALWAYS_INLINE +template void ALWAYS_INLINE st_setup_current(struct st_context *st, const GLbitfield dual_slot_inputs, const GLbitfield inputs_read, @@ -216,6 +291,7 @@ st_setup_current(struct st_context *st, /* Process values that should have better been uniforms in the application */ if (curmask) { struct gl_context *ctx = st->ctx; + assert(POPCNT != POPCNT_INVALID); unsigned num_attribs = util_bitcount_fast(curmask); unsigned num_dual_attribs = util_bitcount_fast(curmask & dual_slot_inputs); @@ -245,6 +321,12 @@ st_setup_current(struct st_context *st, &vbuffer[bufidx].buffer.resource, (void**)&ptr); uint8_t *cursor = ptr; + if (FILL_TC_SET_VB) { + struct pipe_context *pipe = ctx->pipe; + tc_track_vertex_buffer(pipe, bufidx, vbuffer[bufidx].buffer.resource, + tc_get_next_buffer_list(pipe)); + } + do { const gl_vert_attrib attr = (gl_vert_attrib)u_bit_scan(&curmask); const struct gl_array_attributes *const attrib @@ -264,7 +346,8 @@ st_setup_current(struct st_context *st, if (UPDATE_VELEMS) { init_velement(velements->velems, &attrib->Format, cursor - ptr, 0, 0, bufidx, dual_slot_inputs & BITFIELD_BIT(attr), - util_bitcount_fast(inputs_read & BITFIELD_MASK(attr))); + util_bitcount_fast(inputs_read & + BITFIELD_MASK(attr))); } cursor += size; @@ -308,7 +391,11 @@ st_setup_current_user(struct st_context *st, } template void ALWAYS_INLINE st_update_array_templ(struct st_context *st, const GLbitfield enabled_arrays, @@ -324,47 +411,174 @@ st_update_array_templ(struct st_context *st, const struct st_common_variant *vp_variant = st->vp_variant; const GLbitfield inputs_read = vp_variant->vert_attrib_mask; const GLbitfield dual_slot_inputs = vp->Base.DualSlotInputs; - const GLbitfield userbuf_arrays = inputs_read & enabled_user_arrays; + const GLbitfield userbuf_arrays = + ALLOW_USER_BUFFERS ? 
inputs_read & enabled_user_arrays : 0; bool uses_user_vertex_buffers = userbuf_arrays != 0; st->draw_needs_minmax_index = (userbuf_arrays & ~nonzero_divisor_arrays) != 0; - struct pipe_vertex_buffer vbuffer[PIPE_MAX_ATTRIBS]; - unsigned num_vbuffers = 0; + struct pipe_vertex_buffer vbuffer_local[PIPE_MAX_ATTRIBS]; + struct pipe_vertex_buffer *vbuffer; + unsigned num_vbuffers = 0, num_vbuffers_tc; struct cso_velems_state velements; + if (FILL_TC_SET_VB) { + assert(!uses_user_vertex_buffers); + assert(POPCNT != POPCNT_INVALID); + num_vbuffers_tc = util_bitcount_fast(inputs_read & + enabled_arrays); + + /* Add up to 1 vertex buffer for zero-stride vertex attribs. */ + num_vbuffers_tc += ALLOW_ZERO_STRIDE_ATTRIBS && + inputs_read & ~enabled_arrays; + vbuffer = tc_add_set_vertex_buffers_call(st->pipe, num_vbuffers_tc); + } else { + vbuffer = vbuffer_local; + } + /* ST_NEW_VERTEX_ARRAYS */ /* Setup arrays */ - setup_arrays + setup_arrays (ctx, ctx->Array._DrawVAO, dual_slot_inputs, inputs_read, inputs_read & enabled_arrays, &velements, vbuffer, &num_vbuffers); /* _NEW_CURRENT_ATTRIB */ /* Setup zero-stride attribs. */ - st_setup_current - (st, dual_slot_inputs, inputs_read, inputs_read & ~enabled_arrays, - &velements, vbuffer, &num_vbuffers); + if (ALLOW_ZERO_STRIDE_ATTRIBS) { + st_setup_current + (st, dual_slot_inputs, inputs_read, inputs_read & ~enabled_arrays, + &velements, vbuffer, &num_vbuffers); + } else { + assert(!(inputs_read & ~enabled_arrays)); + } - struct cso_context *cso = st->cso_context; + if (FILL_TC_SET_VB) + assert(num_vbuffers == num_vbuffers_tc); if (UPDATE_VELEMS) { + struct cso_context *cso = st->cso_context; velements.count = vp->num_inputs + vp_variant->key.passthrough_edgeflags; /* Set vertex buffers and elements. 
*/ - cso_set_vertex_buffers_and_elements(cso, &velements, num_vbuffers, - uses_user_vertex_buffers, vbuffer); + if (FILL_TC_SET_VB) { + cso_set_vertex_elements(cso, &velements); + } else { + cso_set_vertex_buffers_and_elements(cso, &velements, num_vbuffers, + uses_user_vertex_buffers, vbuffer); + } /* The driver should clear this after it has processed the update. */ ctx->Array.NewVertexElements = false; st->uses_user_vertex_buffers = uses_user_vertex_buffers; } else { /* Only vertex buffers. */ - cso_set_vertex_buffers(cso, num_vbuffers, true, vbuffer); + if (!FILL_TC_SET_VB) + cso_set_vertex_buffers(st->cso_context, num_vbuffers, true, vbuffer); + /* This can change only when we update vertex elements. */ assert(st->uses_user_vertex_buffers == uses_user_vertex_buffers); } } +typedef void (*update_array_func)(struct st_context *st, + const GLbitfield enabled_arrays, + const GLbitfield enabled_user_attribs, + const GLbitfield nonzero_divisor_attribs); + +/* This just initializes the table of all st_update_array variants. */ +struct st_update_array_table { + update_array_func funcs[2][2][2][2][2][2]; + + template + void init_one() + { + /* These conditions reduce the number of compiled variants. */ + /* The TC path is only valid without user buffers. + */ + constexpr st_fill_tc_set_vb fill_tc_set_vb = + !ALLOW_USER_BUFFERS ? FILL_TC_SET_VB : FILL_TC_SET_VB_OFF; + + /* POPCNT is unused without zero-stride attribs and without TC. */ + constexpr util_popcnt popcnt = + !ALLOW_ZERO_STRIDE_ATTRIBS && !fill_tc_set_vb ? + POPCNT_INVALID : POPCNT; + + funcs[POPCNT][FILL_TC_SET_VB][ALLOW_ZERO_STRIDE_ATTRIBS] + [HAS_IDENTITY_ATTRIB_MAPPING][ALLOW_USER_BUFFERS][UPDATE_VELEMS] = + st_update_array_templ< + popcnt, + fill_tc_set_vb, + VAO_FAST_PATH_ON, + ALLOW_ZERO_STRIDE_ATTRIBS, + HAS_IDENTITY_ATTRIB_MAPPING, + ALLOW_USER_BUFFERS, + UPDATE_VELEMS>; + } + + /* We have to do this in stages because of the combinatorial explosion of + * variants. 
+ */ + template + void init_last_3_args() + { + init_one(); + init_one(); + init_one(); + init_one(); + init_one(); + init_one(); + init_one(); + init_one(); + } + + st_update_array_table() + { + init_last_3_args(); + init_last_3_args(); + init_last_3_args(); + init_last_3_args(); + init_last_3_args(); + init_last_3_args(); + init_last_3_args(); + init_last_3_args(); + } +}; + +static st_update_array_table update_array_table; + template void ALWAYS_INLINE st_update_array_impl(struct st_context *st) @@ -384,19 +598,42 @@ st_update_array_impl(struct st_context *st) _mesa_get_derived_vao_masks(ctx, enabled_arrays, &enabled_user_arrays, &nonzero_divisor_arrays); + /* Execute the slow path without using multiple C++ template variants. */ + if (!USE_VAO_FAST_PATH) { + st_update_array_templ + (st, enabled_arrays, enabled_user_arrays, nonzero_divisor_arrays); + return; + } + + /* The fast path that selects from multiple C++ template variants. */ + const GLbitfield inputs_read = st->vp_variant->vert_attrib_mask; + const GLbitfield enabled_arrays_read = inputs_read & enabled_arrays; + + /* Check cso_context whether it goes directly to TC. */ + bool fill_tc_set_vbs = st->cso_context->draw_vbo == tc_draw_vbo; + bool has_zero_stride_attribs = inputs_read & ~enabled_arrays; + uint32_t non_identity_attrib_mapping = + vao->_AttributeMapMode == ATTRIBUTE_MAP_MODE_IDENTITY ? 0 : + vao->_AttributeMapMode == ATTRIBUTE_MAP_MODE_POSITION ? VERT_BIT_GENERIC0 + : VERT_BIT_POS; + bool has_identity_mapping = !(enabled_arrays_read & + (vao->NonIdentityBufferAttribMapping | + non_identity_attrib_mapping)); + /* has_user_buffers is always false with glthread. */ + bool has_user_buffers = inputs_read & enabled_user_arrays; /* Changing from user to non-user buffers and vice versa can switch between * cso and u_vbuf, which means that we need to update vertex elements even * when they have not changed. 
*/ - if (ctx->Array.NewVertexElements || - st->uses_user_vertex_buffers != - !!(st->vp_variant->vert_attrib_mask & enabled_user_arrays)) { - st_update_array_templ - (st, enabled_arrays, enabled_user_arrays, nonzero_divisor_arrays); - } else { - st_update_array_templ - (st, enabled_arrays, enabled_user_arrays, nonzero_divisor_arrays); - } + bool update_velems = ctx->Array.NewVertexElements || + st->uses_user_vertex_buffers != has_user_buffers; + + update_array_table.funcs[POPCNT][fill_tc_set_vbs][has_zero_stride_attribs] + [has_identity_mapping][has_user_buffers] + [update_velems] + (st, enabled_arrays, enabled_user_arrays, nonzero_divisor_arrays); } /* The default callback that must be present before st_init_update_array @@ -442,7 +679,9 @@ st_create_gallium_vertex_state(struct gl_context *ctx, /* This should use the slow path because there is only 1 interleaved * vertex buffers. */ - setup_arrays + setup_arrays (ctx, vao, dual_slot_inputs, inputs_read, inputs_read, &velements, vbuffer, &num_vbuffers); diff --git a/src/util/bitscan.h b/src/util/bitscan.h index ae0e721e61c..01f6d80b1c3 100644 --- a/src/util/bitscan.h +++ b/src/util/bitscan.h @@ -399,6 +399,7 @@ util_widen_mask(uint32_t mask, unsigned multiplier) enum util_popcnt { POPCNT_NO, POPCNT_YES, + POPCNT_INVALID, }; /* Convenient function to select popcnt through a C++ template argument.