nvc0/vbo: wrap draw_vbo for multidraw performance

This patch avoids the high overhead of kicking the pushbuffer for every
single draw during multidraw.
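
In rough shape, the change splits the gallium draw_vbo entry point into a
per-draw helper plus a loop that kicks the pushbuffer only once at the end.
A minimal sketch of that pattern follows; the names (draw_t, single_draw(),
kick()) are illustrative stand-ins, not the real nvc0_draw_single_vbo() or
PUSH_KICK() driver code:

  /* Sketch of the wrapping pattern only; every name here is a stand-in. */
  struct draw_t { unsigned start, count; };

  static void single_draw(const struct draw_t *d) { /* emit one draw */ }
  static void kick(void)                          { /* submit the pushbuf */ }

  static void draw_vbo(const struct draw_t *draws, unsigned num_draws)
  {
     /* validate state once, up front */
     for (unsigned i = 0; i < num_draws; i++)
        if (draws[i].count)
           single_draw(&draws[i]);   /* no kick per draw */
     kick();                         /* one kick for the whole multidraw */
  }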

glMultiDrawArrays performance profiling:

342.5 thousand draws/second -> 40 million draws/second
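
(For context, a draws/second figure like this can be obtained by timing a
loop around glMultiDrawArrays. The sketch below only illustrates such a
microbenchmark; it is not the profiling code used here, and it assumes a
current GL context with a VAO and program already bound.)

  #include <GL/gl.h>
  #include <time.h>

  /* Illustrative only; drawcount must be <= 1024 in this sketch. */
  static double multidraw_rate(GLsizei drawcount, int iters)
  {
     static GLint   first[1024];       /* all zeros: reuse the same vertices */
     static GLsizei count[1024];
     for (int i = 0; i < drawcount; i++)
        count[i] = 3;                  /* one tiny triangle per draw */

     struct timespec t0, t1;
     clock_gettime(CLOCK_MONOTONIC, &t0);
     for (int it = 0; it < iters; it++)
        glMultiDrawArrays(GL_TRIANGLES, first, count, drawcount);
     glFinish();                       /* wait for the GPU before stopping the clock */
     clock_gettime(CLOCK_MONOTONIC, &t1);

     double secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
     return (double)iters * drawcount / secs;   /* draws per second */
  }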

Special thanks to Arthur Huillet for helping get this profiled on IRC.

Signed-off-by: Yusuf Khan <yusisamerican@gmail.com>
Reviewed-by: Karol Herbst <kherbst@redhat.com>
---
v2: fix typos pointed out by Arthur
v3: nvc0_draw_vbo -> nvc0_draw_single_vbo, initialize count
v4: remove num_draws from wrapped function and add mutex assert

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28641>

@@ -925,6 +925,126 @@ nvc0_update_prim_restart(struct nvc0_context *nvc0, bool en, uint32_t index)
   }
}

static void
nvc0_draw_single_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info,
                     unsigned drawid_offset,
                     const struct pipe_draw_indirect_info *indirect,
                     const struct pipe_draw_start_count_bias *draws)
{
   struct nvc0_context *nvc0 = nvc0_context(pipe);
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_screen *screen = nvc0->screen;
   int s;

   simple_mtx_assert_locked(&nvc0->screen->state_lock);

   if (nvc0->vertprog->vp.need_draw_parameters && (!indirect || indirect->count_from_stream_output)) {
      PUSH_SPACE(push, 9);
      BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
      PUSH_DATA (push, NVC0_CB_AUX_SIZE);
      PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0));
      PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0));
      BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 3);
      PUSH_DATA (push, NVC0_CB_AUX_DRAW_INFO);
      PUSH_DATA (push, info->index_size ? draws->index_bias : 0);
      PUSH_DATA (push, info->start_instance);
      PUSH_DATA (push, drawid_offset);
   }

   if (nvc0->screen->base.class_3d < NVE4_3D_CLASS &&
       nvc0->seamless_cube_map != nvc0->state.seamless_cube_map) {
      nvc0->state.seamless_cube_map = nvc0->seamless_cube_map;
      PUSH_SPACE(push, 1);
      IMMED_NVC0(push, NVC0_3D(TEX_MISC),
                 nvc0->seamless_cube_map ? NVC0_3D_TEX_MISC_SEAMLESS_CUBE_MAP : 0);
   }

   nvc0->base.kick_notify = nvc0_draw_vbo_kick_notify;

   for (s = 0; s < 5 && !nvc0->cb_dirty; ++s) {
      if (nvc0->constbuf_coherent[s])
         nvc0->cb_dirty = true;
   }

   if (nvc0->cb_dirty) {
      PUSH_SPACE(push, 1);
      IMMED_NVC0(push, NVC0_3D(MEM_BARRIER), 0x1011);
      nvc0->cb_dirty = false;
   }

   for (s = 0; s < 5; ++s) {
      if (!nvc0->textures_coherent[s])
         continue;

      PUSH_SPACE(push, nvc0->num_textures[s] * 2);

      for (int i = 0; i < nvc0->num_textures[s]; ++i) {
         struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]);
         if (!(nvc0->textures_coherent[s] & (1 << i)))
            continue;

         BEGIN_NVC0(push, NVC0_3D(TEX_CACHE_CTL), 1);
         PUSH_DATA (push, (tic->id << 4) | 1);
         NOUVEAU_DRV_STAT(&nvc0->screen->base, tex_cache_flush_count, 1);
      }
   }

   if (nvc0->state.vbo_mode) {
      if (indirect && indirect->buffer)
         nvc0_push_vbo_indirect(nvc0, info, drawid_offset, indirect, &draws[0]);
      else
         nvc0_push_vbo(nvc0, info, indirect, &draws[0]);
      return;
   }

   /* space for base instance, flush, and prim restart */
   PUSH_SPACE(push, 8);

   if (nvc0->state.instance_base != info->start_instance) {
      nvc0->state.instance_base = info->start_instance;
      /* NOTE: this does not affect the shader input, should it ? */
      BEGIN_NVC0(push, NVC0_3D(VB_INSTANCE_BASE), 1);
      PUSH_DATA (push, info->start_instance);
   }

   nvc0->base.vbo_dirty |= !!nvc0->vtxbufs_coherent;

   if (!nvc0->base.vbo_dirty && info->index_size && !info->has_user_indices &&
       info->index.resource->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
      nvc0->base.vbo_dirty = true;

   nvc0_update_prim_restart(nvc0, info->primitive_restart, info->restart_index);

   if (nvc0->base.vbo_dirty) {
      if (nvc0->screen->eng3d->oclass < GM107_3D_CLASS)
         IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FLUSH), 0);
      nvc0->base.vbo_dirty = false;
   }

   if (unlikely(indirect && indirect->buffer)) {
      nvc0_draw_indirect(nvc0, info, drawid_offset, indirect);
   } else
   if (unlikely(indirect && indirect->count_from_stream_output)) {
      nvc0_draw_stream_output(nvc0, info, indirect);
   } else
   if (info->index_size) {
      bool shorten = info->index_bounds_valid && info->max_index <= 65535;
      if (info->primitive_restart && info->restart_index > 65535)
         shorten = false;
      nvc0_draw_elements(nvc0, shorten, info,
                         info->mode, draws[0].start, draws[0].count,
                         info->instance_count, draws->index_bias, info->index_size);
   } else {
      nvc0_draw_arrays(nvc0,
                       info->mode, draws[0].start, draws[0].count,
                       info->instance_count);
   }
}

/* Thin wrapper to avoid kicking the pushbuffer for every single draw
 * during multidraw */
void
nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info,
              unsigned drawid_offset,
@@ -932,19 +1052,23 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info,
              const struct pipe_draw_start_count_bias *draws,
              unsigned num_draws)
{
   if (num_draws > 1) {
      util_draw_multi(pipe, info, drawid_offset, indirect, draws, num_draws);
      return;
   }

   if (!indirect && (!draws[0].count || !info->instance_count))
      return;

   struct nvc0_context *nvc0 = nvc0_context(pipe);
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_screen *screen = nvc0->screen;
   unsigned vram_domain = NV_VRAM_DOMAIN(&screen->base);
   int s;
   unsigned count_total = 0;
   /* The rest is copied straight from util_draw_multi()
    *
    * XXX: Properly rewrite vbo handling in nvc0/nv50
    */
   unsigned drawid = drawid_offset;

   /* hold the screen state lock across all draws; nvc0_draw_single_vbo() asserts it */
   simple_mtx_lock(&nvc0->screen->state_lock);

   /* NOTE: caller must ensure that (min_index + index_bias) is >= 0 */
   if (info->index_bounds_valid) {
@@ -957,12 +1081,17 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info,
   nvc0->instance_off = info->start_instance;
   nvc0->instance_max = info->instance_count - 1;

   /* Get total amount of draw counts to determine whether to push or upload vertices */
   for (unsigned i = 0; i < num_draws; i++) {
      count_total += draws[i].count;
   }

   /* For picking only a few vertices from a large user buffer, push is better,
    * if index count is larger and we expect repeated vertices, suggest upload.
    */
   nvc0->vbo_push_hint =
      (!indirect || indirect->count_from_stream_output) && info->index_size &&
      (nvc0->vb_elt_limit >= (draws[0].count * 2));
      (nvc0->vb_elt_limit >= (count_total * 2));

   if (nvc0->dirty_3d & (NVC0_NEW_3D_ARRAYS | NVC0_NEW_3D_VERTEX))
      nvc0->constant_vbos = nvc0->vertex->constant_vbos & nvc0->vbo_user;
@@ -1028,115 +1157,15 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info,
   BCTX_REFN_bo(nvc0->bufctx_3d, 3D_TEXT, vram_domain | NOUVEAU_BO_RD,
                screen->text);

   simple_mtx_lock(&nvc0->screen->state_lock);
   nvc0_state_validate_3d(nvc0, ~0);

   if (nvc0->vertprog->vp.need_draw_parameters && (!indirect || indirect->count_from_stream_output)) {
      PUSH_SPACE(push, 9);
      BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
      PUSH_DATA (push, NVC0_CB_AUX_SIZE);
      PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0));
      PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0));
      BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 3);
      PUSH_DATA (push, NVC0_CB_AUX_DRAW_INFO);
      PUSH_DATA (push, info->index_size ? draws->index_bias : 0);
      PUSH_DATA (push, info->start_instance);
      PUSH_DATA (push, drawid_offset);
   for (unsigned i = 0; i < num_draws; i++) {
      if (indirect || (draws[i].count && info->instance_count))
         nvc0_draw_single_vbo(pipe, info, drawid, indirect, &draws[i]);

      if (info->increment_draw_id)
         drawid++;
   }

   if (nvc0->screen->base.class_3d < NVE4_3D_CLASS &&
       nvc0->seamless_cube_map != nvc0->state.seamless_cube_map) {
      nvc0->state.seamless_cube_map = nvc0->seamless_cube_map;
      PUSH_SPACE(push, 1);
      IMMED_NVC0(push, NVC0_3D(TEX_MISC),
                 nvc0->seamless_cube_map ? NVC0_3D_TEX_MISC_SEAMLESS_CUBE_MAP : 0);
   }

   nvc0->base.kick_notify = nvc0_draw_vbo_kick_notify;

   for (s = 0; s < 5 && !nvc0->cb_dirty; ++s) {
      if (nvc0->constbuf_coherent[s])
         nvc0->cb_dirty = true;
   }

   if (nvc0->cb_dirty) {
      PUSH_SPACE(push, 1);
      IMMED_NVC0(push, NVC0_3D(MEM_BARRIER), 0x1011);
      nvc0->cb_dirty = false;
   }

   for (s = 0; s < 5; ++s) {
      if (!nvc0->textures_coherent[s])
         continue;

      PUSH_SPACE(push, nvc0->num_textures[s] * 2);

      for (int i = 0; i < nvc0->num_textures[s]; ++i) {
         struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]);
         if (!(nvc0->textures_coherent[s] & (1 << i)))
            continue;

         BEGIN_NVC0(push, NVC0_3D(TEX_CACHE_CTL), 1);
         PUSH_DATA (push, (tic->id << 4) | 1);
         NOUVEAU_DRV_STAT(&nvc0->screen->base, tex_cache_flush_count, 1);
      }
   }

   if (nvc0->state.vbo_mode) {
      if (indirect && indirect->buffer)
         nvc0_push_vbo_indirect(nvc0, info, drawid_offset, indirect, &draws[0]);
      else
         nvc0_push_vbo(nvc0, info, indirect, &draws[0]);
      goto cleanup;
   }

   /* space for base instance, flush, and prim restart */
   PUSH_SPACE(push, 8);

   if (nvc0->state.instance_base != info->start_instance) {
      nvc0->state.instance_base = info->start_instance;
      /* NOTE: this does not affect the shader input, should it ? */
      BEGIN_NVC0(push, NVC0_3D(VB_INSTANCE_BASE), 1);
      PUSH_DATA (push, info->start_instance);
   }

   nvc0->base.vbo_dirty |= !!nvc0->vtxbufs_coherent;

   if (!nvc0->base.vbo_dirty && info->index_size && !info->has_user_indices &&
       info->index.resource->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
      nvc0->base.vbo_dirty = true;

   nvc0_update_prim_restart(nvc0, info->primitive_restart, info->restart_index);

   if (nvc0->base.vbo_dirty) {
      if (nvc0->screen->eng3d->oclass < GM107_3D_CLASS)
         IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FLUSH), 0);
      nvc0->base.vbo_dirty = false;
   }

   if (unlikely(indirect && indirect->buffer)) {
      nvc0_draw_indirect(nvc0, info, drawid_offset, indirect);
   } else
   if (unlikely(indirect && indirect->count_from_stream_output)) {
      nvc0_draw_stream_output(nvc0, info, indirect);
   } else
   if (info->index_size) {
      bool shorten = info->index_bounds_valid && info->max_index <= 65535;
      if (info->primitive_restart && info->restart_index > 65535)
         shorten = false;
      nvc0_draw_elements(nvc0, shorten, info,
                         info->mode, draws[0].start, draws[0].count,
                         info->instance_count, draws->index_bias, info->index_size);
   } else {
      nvc0_draw_arrays(nvc0,
                       info->mode, draws[0].start, draws[0].count,
                       info->instance_count);
   }

cleanup:
   PUSH_KICK(push);

   simple_mtx_unlock(&nvc0->screen->state_lock);