diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c index 166102db5f5..4a4fdc91b65 100644 --- a/src/gallium/drivers/radeonsi/si_get.c +++ b/src/gallium/drivers/radeonsi/si_get.c @@ -164,6 +164,9 @@ static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TGSI_ATOMINC_WRAP: return 1; + case PIPE_CAP_DRAW_VERTEX_STATE: + return !(sscreen->debug_flags & DBG(NO_FAST_DISPLAY_LIST)); + case PIPE_CAP_GLSL_ZERO_INIT: return 2; diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index f44f8e1eb65..9cb7cd0f813 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -298,20 +298,34 @@ void si_set_tracked_regs_to_clear_state(struct si_context *ctx) ctx->last_gs_out_prim = 0; /* cleared by CLEAR_STATE */ } -void si_install_draw_wrapper(struct si_context *sctx, pipe_draw_vbo_func wrapper) +void si_install_draw_wrapper(struct si_context *sctx, pipe_draw_vbo_func wrapper, + pipe_draw_vertex_state_func vstate_wrapper) { if (wrapper) { if (wrapper != sctx->b.draw_vbo) { - assert (!sctx->real_draw_vbo); + assert(!sctx->real_draw_vbo); + assert(!sctx->real_draw_vertex_state); sctx->real_draw_vbo = sctx->b.draw_vbo; + sctx->real_draw_vertex_state = sctx->b.draw_vertex_state; sctx->b.draw_vbo = wrapper; + sctx->b.draw_vertex_state = vstate_wrapper; } } else if (sctx->real_draw_vbo) { sctx->real_draw_vbo = NULL; + sctx->real_draw_vertex_state = NULL; si_select_draw_vbo(sctx); } } +static void si_tmz_preamble(struct si_context *sctx) +{ + bool secure = si_gfx_resources_check_encrypted(sctx); + if (secure != sctx->ws->cs_is_secure(&sctx->gfx_cs)) { + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW | + RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION, NULL); + } +} + static void si_draw_vbo_tmz_preamble(struct pipe_context *ctx, const struct pipe_draw_info *info, unsigned drawid_offset, @@ -320,15 +334,22 @@ static void si_draw_vbo_tmz_preamble(struct pipe_context *ctx, unsigned num_draws) { struct si_context *sctx = (struct si_context *)ctx; - bool secure = si_gfx_resources_check_encrypted(sctx); - if (secure != sctx->ws->cs_is_secure(&sctx->gfx_cs)) { - si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW | - RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION, NULL); - } - + si_tmz_preamble(sctx); sctx->real_draw_vbo(ctx, info, drawid_offset, indirect, draws, num_draws); } +static void si_draw_vstate_tmz_preamble(struct pipe_context *ctx, + struct pipe_vertex_state *state, + uint32_t partial_velem_mask, + struct pipe_draw_vertex_state_info info, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws) { + struct si_context *sctx = (struct si_context *)ctx; + + si_tmz_preamble(sctx); + sctx->real_draw_vertex_state(ctx, state, partial_velem_mask, info, draws, num_draws); +} + void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs) { bool is_secure = false; @@ -336,7 +357,8 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs) if (unlikely(radeon_uses_secure_bos(ctx->ws))) { is_secure = ctx->ws->cs_is_secure(&ctx->gfx_cs); - si_install_draw_wrapper(ctx, si_draw_vbo_tmz_preamble); + si_install_draw_wrapper(ctx, si_draw_vbo_tmz_preamble, + si_draw_vstate_tmz_preamble); } if (ctx->is_debug) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 2b3400dc800..1883a1f0d55 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -88,6 +88,7 @@ static const struct debug_named_value radeonsi_debug_options[] = { {"check_vm", DBG(CHECK_VM), "Check VM faults and dump debug info."}, {"reserve_vmid", DBG(RESERVE_VMID), "Force VMID reservation per context."}, {"shadowregs", DBG(SHADOW_REGS), "Enable CP register shadowing."}, + {"nofastdlist", DBG(NO_FAST_DISPLAY_LIST), "Disable fast display lists"}, /* 3D engine options: */ {"nogfx", DBG(NO_GFX), "Disable graphics. Only multimedia compute paths can be used."}, @@ -916,6 +917,7 @@ static void si_destroy_screen(struct pipe_screen *pscreen) disk_cache_destroy(sscreen->disk_shader_cache); util_live_shader_cache_deinit(&sscreen->live_shader_cache); util_idalloc_mt_fini(&sscreen->buffer_ids); + util_vertex_state_cache_deinit(&sscreen->vertex_state_cache); sscreen->ws->destroy(sscreen->ws); FREE(sscreen); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 36aaa5fee27..5c115f33b73 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -31,6 +31,7 @@ #include "util/u_idalloc.h" #include "util/u_suballoc.h" #include "util/u_threaded_context.h" +#include "util/u_vertex_state_cache.h" #include "ac_sqtt.h" #ifdef __cplusplus @@ -210,6 +211,7 @@ enum DBG_CHECK_VM, DBG_RESERVE_VMID, DBG_SHADOW_REGS, + DBG_NO_FAST_DISPLAY_LIST, /* 3D engine options: */ DBG_NO_GFX, @@ -659,6 +661,7 @@ struct si_screen { unsigned ngg_subgroup_size; struct util_idalloc_mt buffer_ids; + struct util_vertex_state_cache vertex_state_cache; }; struct si_sampler_view { @@ -867,12 +870,24 @@ struct si_small_prim_cull_info { float small_prim_precision; }; +struct si_vertex_state { + struct pipe_vertex_state b; + struct si_vertex_elements velems; + uint32_t descriptors[4 * SI_MAX_ATTRIBS]; +}; + typedef void (*pipe_draw_vbo_func)(struct pipe_context *pipe, const struct pipe_draw_info *info, unsigned drawid_offset, const struct pipe_draw_indirect_info *indirect, const struct pipe_draw_start_count_bias *draws, unsigned num_draws); +typedef void (*pipe_draw_vertex_state_func)(struct pipe_context *ctx, + struct pipe_vertex_state *vstate, + uint32_t partial_velem_mask, + struct pipe_draw_vertex_state_info info, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws); struct si_context { struct pipe_context b; /* base class */ @@ -1011,6 +1026,8 @@ struct si_context { struct si_vertex_elements *vertex_elements; unsigned num_vertex_elements; unsigned cs_max_waves_per_sh; + bool uses_nontrivial_vs_prolog; + bool force_trivial_vs_prolog; bool do_update_shaders; bool compute_shaderbuf_sgprs_dirty; bool compute_image_sgprs_dirty; @@ -1219,8 +1236,10 @@ struct si_context { struct hash_table *dirty_implicit_resources; pipe_draw_vbo_func draw_vbo[2][2][2]; + pipe_draw_vertex_state_func draw_vertex_state[2][2][2]; /* When b.draw_vbo is a wrapper, real_draw_vbo is the real draw_vbo function */ pipe_draw_vbo_func real_draw_vbo; + pipe_draw_vertex_state_func real_draw_vertex_state; void (*emit_spi_map[33])(struct si_context *sctx); /* SQTT */ @@ -1422,7 +1441,8 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs); /* Replace the sctx->b.draw_vbo function with a wrapper. This can be use to implement * optimizations without affecting the normal draw_vbo functions perf. */ -void si_install_draw_wrapper(struct si_context *sctx, pipe_draw_vbo_func wrapper); +void si_install_draw_wrapper(struct si_context *sctx, pipe_draw_vbo_func wrapper, + pipe_draw_vertex_state_func vstate_wrapper); /* si_gpu_load.c */ void si_gpu_load_kill_thread(struct si_screen *sscreen); @@ -1954,11 +1974,22 @@ static inline void si_select_draw_vbo(struct si_context *sctx) pipe_draw_vbo_func draw_vbo = sctx->draw_vbo[!!sctx->shader.tes.cso] [!!sctx->shader.gs.cso] [sctx->ngg]; + pipe_draw_vertex_state_func draw_vertex_state = + sctx->draw_vertex_state[!!sctx->shader.tes.cso] + [!!sctx->shader.gs.cso] + [sctx->ngg]; assert(draw_vbo); - if (unlikely(sctx->real_draw_vbo)) + assert(draw_vertex_state); + + if (unlikely(sctx->real_draw_vbo)) { + assert(sctx->real_draw_vertex_state); sctx->real_draw_vbo = draw_vbo; - else + sctx->real_draw_vertex_state = draw_vertex_state; + } else { + assert(!sctx->real_draw_vertex_state); sctx->b.draw_vbo = draw_vbo; + sctx->b.draw_vertex_state = draw_vertex_state; + } } /* Return the number of samples that the rasterizer uses. */ diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 2f179e9195f..8b02e79437c 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -30,6 +30,7 @@ #include "util/format/u_format.h" #include "util/format/u_format_s3tc.h" #include "util/u_dual_blend.h" +#include "util/u_helpers.h" #include "util/u_memory.h" #include "util/u_resource.h" #include "util/u_upload_mgr.h" @@ -636,14 +637,8 @@ static void *si_create_blend_state(struct pipe_context *ctx, const struct pipe_b return si_create_blend_state_mode(ctx, state, V_028808_CB_NORMAL); } -static void si_draw_blend_dst_sampler_noop(struct pipe_context *ctx, - const struct pipe_draw_info *info, - unsigned drawid_offset, - const struct pipe_draw_indirect_info *indirect, - const struct pipe_draw_start_count_bias *draws, - unsigned num_draws) { - struct si_context *sctx = (struct si_context *)ctx; - +static bool si_check_blend_dst_sampler_noop(struct si_context *sctx) +{ if (sctx->framebuffer.state.nr_cbufs == 1) { struct si_shader_selector *sel = sctx->shader.ps.cso; bool free_nir; @@ -677,16 +672,44 @@ static void si_draw_blend_dst_sampler_noop(struct pipe_context *ctx, if (tex->is_depth && tex->depth_cleared_level_mask & BITFIELD_BIT(samp->views[unit]->u.tex.first_level) && tex->depth_clear_value[0] == 1) { - return; + return false; } /* TODO: handle color textures */ } } } + return true; +} + +static void si_draw_blend_dst_sampler_noop(struct pipe_context *ctx, + const struct pipe_draw_info *info, + unsigned drawid_offset, + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws) { + struct si_context *sctx = (struct si_context *)ctx; + + if (!si_check_blend_dst_sampler_noop(sctx)) + return; + sctx->real_draw_vbo(ctx, info, drawid_offset, indirect, draws, num_draws); } +static void si_draw_vstate_blend_dst_sampler_noop(struct pipe_context *ctx, + struct pipe_vertex_state *state, + uint32_t partial_velem_mask, + struct pipe_draw_vertex_state_info info, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws) { + struct si_context *sctx = (struct si_context *)ctx; + + if (!si_check_blend_dst_sampler_noop(sctx)) + return; + + sctx->real_draw_vertex_state(ctx, state, partial_velem_mask, info, draws, num_draws); +} + static void si_bind_blend_state(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; @@ -731,9 +754,10 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state) if (likely(!radeon_uses_secure_bos(sctx->ws))) { if (unlikely(blend->allows_noop_optimization)) { - si_install_draw_wrapper(sctx, si_draw_blend_dst_sampler_noop); + si_install_draw_wrapper(sctx, si_draw_blend_dst_sampler_noop, + si_draw_vstate_blend_dst_sampler_noop); } else { - si_install_draw_wrapper(sctx, NULL); + si_install_draw_wrapper(sctx, NULL, NULL); } } } @@ -5011,6 +5035,78 @@ static void si_set_vertex_buffers(struct pipe_context *ctx, unsigned start_slot, } } +static struct pipe_vertex_state * +si_create_vertex_state(struct pipe_screen *screen, + struct pipe_vertex_buffer *buffer, + const struct pipe_vertex_element *elements, + unsigned num_elements, + struct pipe_resource *indexbuf, + uint32_t full_velem_mask) +{ + struct si_screen *sscreen = (struct si_screen *)screen; + struct si_vertex_state *state = CALLOC_STRUCT(si_vertex_state); + + util_init_pipe_vertex_state(screen, buffer, elements, num_elements, indexbuf, full_velem_mask, + &state->b); + + /* Initialize the vertex element state in state->element. + * Do it by creating a vertex element state object and copying it there. + */ + struct pipe_context ctx = {}; + ctx.screen = screen; + struct si_vertex_elements *velems = si_create_vertex_elements(&ctx, num_elements, elements); + state->velems = *velems; + si_delete_vertex_element(&ctx, velems); + + assert(!state->velems.instance_divisor_is_one); + assert(!state->velems.instance_divisor_is_fetched); + assert(!state->velems.fix_fetch_always); + assert(buffer->stride % 4 == 0); + assert(buffer->buffer_offset % 4 == 0); + assert(!buffer->is_user_buffer); + for (unsigned i = 0; i < num_elements; i++) { + assert(elements[i].src_offset % 4 == 0); + assert(!elements[i].dual_slot); + } + + for (unsigned i = 0; i < num_elements; i++) { + si_set_vertex_buffer_descriptor(sscreen, &state->velems, &state->b.input.vbuffer, i, + &state->descriptors[i * 4]); + } + + return &state->b; +} + +static void si_vertex_state_destroy(struct pipe_screen *screen, + struct pipe_vertex_state *state) +{ + pipe_vertex_buffer_unreference(&state->input.vbuffer); + pipe_resource_reference(&state->input.indexbuf, NULL); + FREE(state); +} + +static struct pipe_vertex_state * +si_pipe_create_vertex_state(struct pipe_screen *screen, + struct pipe_vertex_buffer *buffer, + const struct pipe_vertex_element *elements, + unsigned num_elements, + struct pipe_resource *indexbuf, + uint32_t full_velem_mask) +{ + struct si_screen *sscreen = (struct si_screen *)screen; + + return util_vertex_state_cache_get(screen, buffer, elements, num_elements, indexbuf, + full_velem_mask, &sscreen->vertex_state_cache); +} + +static void si_pipe_vertex_state_destroy(struct pipe_screen *screen, + struct pipe_vertex_state *state) +{ + struct si_screen *sscreen = (struct si_screen *)screen; + + util_vertex_state_destroy(screen, &sscreen->vertex_state_cache, state); +} + /* * Misc */ @@ -5177,12 +5273,17 @@ void si_init_state_functions(struct si_context *sctx) void si_init_screen_state_functions(struct si_screen *sscreen) { sscreen->b.is_format_supported = si_is_format_supported; + sscreen->b.create_vertex_state = si_pipe_create_vertex_state; + sscreen->b.vertex_state_destroy = si_pipe_vertex_state_destroy; if (sscreen->info.chip_class >= GFX10) { sscreen->make_texture_descriptor = gfx10_make_texture_descriptor; } else { sscreen->make_texture_descriptor = si_make_texture_descriptor; } + + util_vertex_state_cache_init(&sscreen->vertex_state_cache, + si_create_vertex_state, si_vertex_state_destroy); } static void si_set_grbm_gfx_index(struct si_context *sctx, struct si_pm4_state *pm4, unsigned value) diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 6382c34a598..3999fb4eac2 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -25,6 +25,7 @@ #include "ac_exp_param.h" #include "ac_sqtt.h" #include "si_build_pm4.h" +#include "util/u_cpu_detect.h" #include "util/u_index_modify.h" #include "util/u_prim.h" #include "util/u_upload_mgr.h" @@ -944,6 +945,12 @@ static bool si_is_line_stipple_enabled(struct si_context *sctx) (rs->polygon_mode_is_lines || util_prim_is_lines(sctx->current_rast_prim)); } +enum si_is_draw_vertex_state { + DRAW_VERTEX_STATE_OFF, + DRAW_VERTEX_STATE_ON, +}; + +template ALWAYS_INLINE static bool num_instanced_prims_less_than(const struct pipe_draw_indirect_info *indirect, enum pipe_prim_type prim, unsigned min_vertex_count, @@ -951,6 +958,9 @@ static bool num_instanced_prims_less_than(const struct pipe_draw_indirect_info * unsigned num_prims, ubyte vertices_per_patch) { + if (IS_DRAW_VERTEX_STATE) + return 0; + if (indirect) { return indirect->buffer || (instance_count > 1 && indirect->count_from_stream_output); @@ -960,7 +970,8 @@ static bool num_instanced_prims_less_than(const struct pipe_draw_indirect_info * } } -template ALWAYS_INLINE +template ALWAYS_INLINE static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx, const struct pipe_draw_indirect_info *indirect, enum pipe_prim_type prim, unsigned num_patches, @@ -980,12 +991,15 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx, } key.u.prim = prim; - key.u.uses_instancing = (indirect && indirect->buffer) || instance_count > 1; + key.u.uses_instancing = !IS_DRAW_VERTEX_STATE && + ((indirect && indirect->buffer) || instance_count > 1); key.u.multi_instances_smaller_than_primgroup = - num_instanced_prims_less_than(indirect, prim, min_vertex_count, instance_count, - primgroup_size, sctx->patch_vertices); - key.u.primitive_restart = primitive_restart; - key.u.count_from_stream_output = indirect && indirect->count_from_stream_output; + num_instanced_prims_less_than(indirect, prim, min_vertex_count, + instance_count, primgroup_size, + sctx->patch_vertices); + key.u.primitive_restart = !IS_DRAW_VERTEX_STATE && primitive_restart; + key.u.count_from_stream_output = !IS_DRAW_VERTEX_STATE && indirect && + indirect->count_from_stream_output; key.u.line_stipple_enabled = si_is_line_stipple_enabled(sctx); ia_multi_vgt_param = @@ -1003,8 +1017,8 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx, */ if (GFX_VERSION == GFX7 && sctx->family == CHIP_HAWAII && G_028AA8_SWITCH_ON_EOI(ia_multi_vgt_param) && - num_instanced_prims_less_than(indirect, prim, min_vertex_count, instance_count, 2, - sctx->patch_vertices)) + num_instanced_prims_less_than(indirect, prim, min_vertex_count, + instance_count, 2, sctx->patch_vertices)) sctx->flags |= SI_CONTEXT_VGT_FLUSH; } @@ -1089,11 +1103,11 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx) } } -template -ALWAYS_INLINE +template ALWAYS_INLINE static void si_emit_vs_state(struct si_context *sctx, unsigned index_size) { - if (sctx->num_vs_blit_sgprs) { + if (!IS_DRAW_VERTEX_STATE && sctx->num_vs_blit_sgprs) { /* Re-emit the state after we leave u_blitter. */ sctx->last_vs_state = ~0; return; @@ -1143,7 +1157,8 @@ static bool si_prim_restart_index_changed(struct si_context *sctx, bool primitiv sctx->last_restart_index == SI_RESTART_INDEX_UNKNOWN); } -template ALWAYS_INLINE +template ALWAYS_INLINE static void si_emit_ia_multi_vgt_param(struct si_context *sctx, const struct pipe_draw_indirect_info *indirect, enum pipe_prim_type prim, unsigned num_patches, @@ -1154,7 +1169,7 @@ static void si_emit_ia_multi_vgt_param(struct si_context *sctx, unsigned ia_multi_vgt_param; ia_multi_vgt_param = - si_get_ia_multi_vgt_param + si_get_ia_multi_vgt_param (sctx, indirect, prim, num_patches, instance_count, primitive_restart, min_vertex_count); @@ -1225,7 +1240,8 @@ static void gfx10_emit_ge_cntl(struct si_context *sctx, unsigned num_patches) } } -template ALWAYS_INLINE +template ALWAYS_INLINE static void si_emit_draw_registers(struct si_context *sctx, const struct pipe_draw_indirect_info *indirect, enum pipe_prim_type prim, unsigned num_patches, @@ -1234,10 +1250,13 @@ static void si_emit_draw_registers(struct si_context *sctx, { struct radeon_cmdbuf *cs = &sctx->gfx_cs; + if (IS_DRAW_VERTEX_STATE) + primitive_restart = false; + if (GFX_VERSION >= GFX10) gfx10_emit_ge_cntl(sctx, num_patches); else - si_emit_ia_multi_vgt_param + si_emit_ia_multi_vgt_param (sctx, indirect, prim, num_patches, instance_count, primitive_restart, min_vertex_count); @@ -1284,7 +1303,7 @@ static void si_emit_draw_registers(struct si_context *sctx, } \ } while (0) -template +template ALWAYS_INLINE static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw_info *info, unsigned drawid_base, @@ -1304,7 +1323,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw uint32_t use_opaque = 0; - if (indirect && indirect->count_from_stream_output) { + if (!IS_DRAW_VERTEX_STATE && indirect && indirect->count_from_stream_output) { struct si_streamout_target *t = (struct si_streamout_target *)indirect->count_from_stream_output; radeon_begin(cs); @@ -1379,7 +1398,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw unsigned sh_base_reg = sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX]; bool render_cond_bit = sctx->render_cond_enabled; - if (indirect) { + if (!IS_DRAW_VERTEX_STATE && indirect) { assert(num_draws == 1); uint64_t indirect_va = si_resource(indirect->buffer)->gpu_address; @@ -1454,10 +1473,10 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw /* Base vertex and start instance. */ int base_vertex = original_index_size ? draws[0].index_bias : draws[0].start; - bool set_draw_id = sctx->vs_uses_draw_id; + bool set_draw_id = !IS_DRAW_VERTEX_STATE && sctx->vs_uses_draw_id; bool set_base_instance = sctx->vs_uses_base_instance; - if (sctx->num_vs_blit_sgprs) { + if (!IS_DRAW_VERTEX_STATE && sctx->num_vs_blit_sgprs) { /* Re-emit draw constants after we leave u_blitter. */ si_invalidate_draw_sh_constants(sctx); @@ -1496,7 +1515,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw } /* Don't update draw_id in the following code if it doesn't increment. */ - bool increment_draw_id = num_draws > 1 && set_draw_id && info->increment_draw_id; + bool increment_draw_id = !IS_DRAW_VERTEX_STATE && num_draws > 1 && + set_draw_id && info->increment_draw_id; if (index_size) { /* NOT_EOP allows merging multiple draws into 1 wave, but only user VGPRs @@ -1514,7 +1534,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw * else for (all draws); * */ - bool index_bias_varies = num_draws > 1 && info->index_bias_varies; + bool index_bias_varies = !IS_DRAW_VERTEX_STATE && num_draws > 1 && + info->index_bias_varies; if (increment_draw_id) { if (index_bias_varies) { @@ -1655,7 +1676,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw radeon_emit(draws[i].count); radeon_emit(V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque); } - if (num_draws > 1 && !sctx->num_vs_blit_sgprs) + if (num_draws > 1 && (IS_DRAW_VERTEX_STATE || !sctx->num_vs_blit_sgprs)) sctx->last_base_vertex = draws[num_draws - 1].start; } } @@ -1743,20 +1764,56 @@ void si_set_vertex_buffer_descriptor(struct si_screen *sscreen, struct si_vertex #endif -template ALWAYS_INLINE -static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx) +/* util_bitcount has large measurable overhead (~2% difference in viewperf), so we use + * the POPCNT x86 instruction via inline assembly if the CPU supports it. + */ +enum si_has_popcnt { + POPCNT_NO, + POPCNT_YES, +}; + +template +unsigned bitcount_asm(unsigned n) { - unsigned count = sctx->num_vertex_elements; + if (POPCNT == POPCNT_YES) + return util_popcnt_inline_asm(n); + else + return util_bitcount(n); +} + +template +static ALWAYS_INLINE unsigned get_next_vertex_state_elem(struct pipe_vertex_state *state, + uint32_t *partial_velem_mask) +{ + unsigned semantic_index = u_bit_scan(partial_velem_mask); + assert(state->input.full_velem_mask & BITFIELD_BIT(semantic_index)); + /* A prefix mask of the full mask gives us the index in pipe_vertex_state. */ + return bitcount_asm(state->input.full_velem_mask & BITFIELD_MASK(semantic_index)); +} + +template ALWAYS_INLINE +static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx, + struct pipe_vertex_state *state, + uint32_t partial_velem_mask) +{ + struct si_vertex_state *vstate = (struct si_vertex_state *)state; + unsigned count = IS_DRAW_VERTEX_STATE ? bitcount_asm(partial_velem_mask) : + sctx->num_vertex_elements; + unsigned sh_base = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, NGG, + PIPE_SHADER_VERTEX); unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs_inline(GFX_VERSION); bool pointer_dirty, user_sgprs_dirty; assert(count <= SI_MAX_ATTRIBS); - if (sctx->vertex_buffers_dirty) { + if (sctx->vertex_buffers_dirty || IS_DRAW_VERTEX_STATE) { assert(count); struct si_vertex_elements *velems = sctx->vertex_elements; - unsigned alloc_size = velems->vb_desc_list_alloc_size; + unsigned alloc_size = IS_DRAW_VERTEX_STATE ? + vstate->velems.vb_desc_list_alloc_size : + velems->vb_desc_list_alloc_size; uint32_t *ptr; if (alloc_size) { @@ -1783,27 +1840,64 @@ static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx) si_resource_reference(&sctx->vb_descriptors_buffer, NULL); } - unsigned first_vb_use_mask = velems->first_vb_use_mask; + if (IS_DRAW_VERTEX_STATE) { + unsigned partial_count = bitcount_asm(partial_velem_mask); + unsigned i = 0; - for (unsigned i = 0; i < count; i++) { - unsigned vbo_index = velems->vertex_buffer_index[i]; - struct pipe_vertex_buffer *vb = &sctx->vertex_buffer[vbo_index]; - uint32_t *desc = i < num_vbos_in_user_sgprs ? &sctx->vb_descriptor_user_sgprs[i * 4] - : &ptr[(i - num_vbos_in_user_sgprs) * 4]; + if (num_vbos_in_user_sgprs) { + unsigned num_vb_sgprs = MIN2(partial_count, num_vbos_in_user_sgprs) * 4; - if (!si_set_vb_descriptor(velems, vb, i, desc)) - continue; + radeon_begin(&sctx->gfx_cs); + radeon_set_sh_reg_seq(sh_base + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4, num_vb_sgprs); - if (first_vb_use_mask & (1 << i)) { - radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(vb->buffer.resource), + for (; partial_velem_mask && i < num_vbos_in_user_sgprs; i++) { + unsigned velem_index = get_next_vertex_state_elem(state, &partial_velem_mask); + + radeon_emit_array(&vstate->descriptors[velem_index * 4], 4); + } + radeon_end(); + } + + for (; partial_velem_mask; i++) { + unsigned velem_index = get_next_vertex_state_elem(state, &partial_velem_mask); + uint32_t *desc = &ptr[(i - num_vbos_in_user_sgprs) * 4]; + + memcpy(desc, &vstate->descriptors[velem_index * 4], 16); + } + + if (vstate->b.input.vbuffer.buffer.resource != vstate->b.input.indexbuf) { + radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, + si_resource(vstate->b.input.vbuffer.buffer.resource), RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER); } + + /* The next draw_vbo should recompute and rebind vertex buffer descriptors. */ + sctx->vertex_buffers_dirty = sctx->num_vertex_elements > 0; + + user_sgprs_dirty = false; /* We just set them above. */ + pointer_dirty = count > num_vbos_in_user_sgprs; + } else { + unsigned first_vb_use_mask = velems->first_vb_use_mask; + + for (unsigned i = 0; i < count; i++) { + unsigned vbo_index = velems->vertex_buffer_index[i]; + struct pipe_vertex_buffer *vb = &sctx->vertex_buffer[vbo_index]; + uint32_t *desc = i < num_vbos_in_user_sgprs ? &sctx->vb_descriptor_user_sgprs[i * 4] + : &ptr[(i - num_vbos_in_user_sgprs) * 4]; + + if (!si_set_vb_descriptor(velems, vb, i, desc)) + continue; + + if (first_vb_use_mask & (1 << i)) { + radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(vb->buffer.resource), + RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER); + } + } + + sctx->vertex_buffers_dirty = false; + user_sgprs_dirty = num_vbos_in_user_sgprs > 0; + pointer_dirty = alloc_size != 0; } - - sctx->vertex_buffers_dirty = false; - - pointer_dirty = alloc_size != 0; - user_sgprs_dirty = num_vbos_in_user_sgprs > 0; } else { pointer_dirty = sctx->vertex_buffer_pointer_dirty; user_sgprs_dirty = sctx->vertex_buffer_user_sgprs_dirty; @@ -1811,8 +1905,6 @@ static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx) if (pointer_dirty || user_sgprs_dirty) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; - unsigned sh_base = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, NGG, - PIPE_SHADER_VERTEX); assert(count); radeon_begin(cs); @@ -1922,8 +2014,8 @@ static void si_get_draw_start_count(struct si_context *sctx, const struct pipe_d } } -template -ALWAYS_INLINE +template ALWAYS_INLINE static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info, const struct pipe_draw_indirect_info *indirect, enum pipe_prim_type prim, unsigned instance_count, @@ -1964,8 +2056,8 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i } /* Emit draw states. */ - si_emit_vs_state(sctx, info->index_size); - si_emit_draw_registers + si_emit_vs_state(sctx, info->index_size); + si_emit_draw_registers (sctx, indirect, prim, num_patches, instance_count, primitive_restart, info->restart_index, min_vertex_count); } @@ -1975,13 +2067,16 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i pipe_resource_reference(&indexbuf, NULL); \ } while (0) -template -static void si_draw_vbo(struct pipe_context *ctx, - const struct pipe_draw_info *info, - unsigned drawid_offset, - const struct pipe_draw_indirect_info *indirect, - const struct pipe_draw_start_count_bias *draws, - unsigned num_draws) +template ALWAYS_INLINE +static void si_draw(struct pipe_context *ctx, + const struct pipe_draw_info *info, + unsigned drawid_offset, + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws, + struct pipe_vertex_state *state, + uint32_t partial_velem_mask) { /* Keep code that uses the least number of local variables as close to the beginning * of this function as possible to minimize register pressure. @@ -2052,11 +2147,14 @@ static void si_draw_vbo(struct pipe_context *ctx, * 'instance_count == 0' seems to be problematic on Renoir chips (#4866), * so simplify the condition and drop these draws for all <= GFX9 chips. */ - if (GFX_VERSION <= GFX9 && unlikely(!indirect && !instance_count)) + if (GFX_VERSION <= GFX9 && unlikely(!IS_DRAW_VERTEX_STATE && !indirect && !instance_count)) return; struct si_shader_selector *vs = sctx->shader.vs.cso; - if (unlikely(!vs || sctx->num_vertex_elements < vs->num_vs_inputs || + struct si_vertex_state *vstate = (struct si_vertex_state *)state; + if (unlikely(!vs || + (!IS_DRAW_VERTEX_STATE && sctx->num_vertex_elements < vs->num_vs_inputs) || + (IS_DRAW_VERTEX_STATE && vstate->velems.count < vs->num_vs_inputs) || !sctx->shader.ps.cso || (HAS_TESS != (prim == PIPE_PRIM_PATCHES)))) { assert(0); return; @@ -2084,7 +2182,7 @@ static void si_draw_vbo(struct pipe_context *ctx, if (index_size) { /* Translate or upload, if needed. */ /* 8-bit indices are supported on GFX8. */ - if (GFX_VERSION <= GFX7 && index_size == 1) { + if (!IS_DRAW_VERTEX_STATE && GFX_VERSION <= GFX7 && index_size == 1) { unsigned start, count, start_offset, size, offset; void *ptr; @@ -2103,7 +2201,7 @@ static void si_draw_vbo(struct pipe_context *ctx, /* info->start will be added by the drawing code */ index_offset = offset - start_offset; index_size = 2; - } else if (info->has_user_indices) { + } else if (!IS_DRAW_VERTEX_STATE && info->has_user_indices) { unsigned start_offset; assert(!indirect); @@ -2130,7 +2228,7 @@ static void si_draw_vbo(struct pipe_context *ctx, unsigned min_direct_count = 0; unsigned total_direct_count = 0; - if (indirect) { + if (!IS_DRAW_VERTEX_STATE && indirect) { /* Add the buffer size for memory checking in need_cs_space. */ if (indirect->buffer) si_context_add_resource_size(sctx, indirect->buffer); @@ -2192,6 +2290,32 @@ static void si_draw_vbo(struct pipe_context *ctx, } } + if (IS_DRAW_VERTEX_STATE) { + /* draw_vertex_state doesn't use the current vertex buffers and vertex elements, + * so disable any non-trivial VS prolog that is based on them, such as vertex + * format lowering. + */ + if (!sctx->force_trivial_vs_prolog) { + sctx->force_trivial_vs_prolog = true; + + /* Update shaders to disable the non-trivial VS prolog. */ + if (sctx->uses_nontrivial_vs_prolog) { + si_vs_key_update_inputs(sctx); + sctx->do_update_shaders = true; + } + } + } else { + if (sctx->force_trivial_vs_prolog) { + sctx->force_trivial_vs_prolog = false; + + /* Update shaders to enable the non-trivial VS prolog. */ + if (sctx->uses_nontrivial_vs_prolog) { + si_vs_key_update_inputs(sctx); + sctx->do_update_shaders = true; + } + } + } + /* Update NGG culling settings. */ uint8_t old_ngg_culling = sctx->ngg_culling; if (GFX_VERSION >= GFX10) { @@ -2314,7 +2438,7 @@ static void si_draw_vbo(struct pipe_context *ctx, masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.scissors); gfx9_scissor_bug = true; - if ((indirect && indirect->count_from_stream_output) || + if ((!IS_DRAW_VERTEX_STATE && indirect && indirect->count_from_stream_output) || sctx->dirty_atoms & si_atoms_that_always_roll_context() || sctx->dirty_states & si_states_that_always_roll_context()) sctx->context_roll = true; @@ -2333,7 +2457,7 @@ static void si_draw_vbo(struct pipe_context *ctx, masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond); /* Emit all states except possibly render condition. */ - si_emit_all_states + si_emit_all_states (sctx, info, indirect, prim, instance_count, min_direct_count, primitive_restart, masked_atoms); sctx->emit_cache_flush(sctx, &sctx->gfx_cs); @@ -2342,7 +2466,9 @@ static void si_draw_vbo(struct pipe_context *ctx, /* This uploads VBO descriptors, sets user SGPRs, and executes the L2 prefetch. * It should done after cache flushing. */ - if (unlikely((!si_upload_and_prefetch_VB_descriptors(sctx)))) { + if (unlikely((!si_upload_and_prefetch_VB_descriptors + + (sctx, state, partial_velem_mask)))) { DRAW_CLEANUP; return; } @@ -2359,7 +2485,7 @@ static void si_draw_vbo(struct pipe_context *ctx, } assert(sctx->dirty_atoms == 0); - si_emit_draw_packets + si_emit_draw_packets (sctx, info, drawid_offset, indirect, draws, num_draws, total_direct_count, indexbuf, index_size, index_offset, instance_count, original_index_size); /* <-- CUs are busy here. */ @@ -2381,12 +2507,14 @@ static void si_draw_vbo(struct pipe_context *ctx, /* This uploads VBO descriptors, sets user SGPRs, and executes the L2 prefetch. * It should done after cache flushing and after the VS prefetch. */ - if (unlikely((!si_upload_and_prefetch_VB_descriptors(sctx)))) { + if (unlikely((!si_upload_and_prefetch_VB_descriptors + + (sctx, state, partial_velem_mask)))) { DRAW_CLEANUP; return; } - si_emit_all_states + si_emit_all_states (sctx, info, indirect, prim, instance_count, min_direct_count, primitive_restart, masked_atoms); @@ -2397,7 +2525,7 @@ static void si_draw_vbo(struct pipe_context *ctx, } assert(sctx->dirty_atoms == 0); - si_emit_draw_packets + si_emit_draw_packets (sctx, info, drawid_offset, indirect, draws, num_draws, total_direct_count, indexbuf, index_size, index_offset, instance_count, original_index_size); @@ -2428,9 +2556,9 @@ static void si_draw_vbo(struct pipe_context *ctx, if (unlikely(sctx->decompression_enabled)) { sctx->num_decompress_calls++; } else { - sctx->num_draw_calls++; + sctx->num_draw_calls += num_draws; if (primitive_restart) - sctx->num_prim_restart_calls++; + sctx->num_prim_restart_calls += num_draws; } if (!sctx->blitter_running && sctx->framebuffer.state.zsbuf) { @@ -2441,6 +2569,39 @@ static void si_draw_vbo(struct pipe_context *ctx, DRAW_CLEANUP; } +template +static void si_draw_vbo(struct pipe_context *ctx, + const struct pipe_draw_info *info, + unsigned drawid_offset, + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws) +{ + si_draw + (ctx, info, drawid_offset, indirect, draws, num_draws, NULL, 0); +} + +template +static void si_draw_vertex_state(struct pipe_context *ctx, + struct pipe_vertex_state *vstate, + uint32_t partial_velem_mask, + struct pipe_draw_vertex_state_info info, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws) +{ + struct si_vertex_state *state = (struct si_vertex_state *)vstate; + struct pipe_draw_info dinfo = {}; + + dinfo.mode = info.mode; + dinfo.index_size = 4; + dinfo.instance_count = 1; + dinfo.index.resource = state->b.input.indexbuf; + + si_draw + (ctx, &dinfo, 0, NULL, draws, num_draws, vstate, partial_velem_mask); +} + static void si_draw_rectangle(struct blitter_context *blitter, void *vertex_elements_cso, blitter_get_vs_func get_vs, int x1, int y1, int x2, int y2, float depth, unsigned num_instances, enum blitter_attrib_type type, @@ -2492,6 +2653,14 @@ static void si_init_draw_vbo(struct si_context *sctx) sctx->draw_vbo[HAS_TESS][HAS_GS][NGG] = si_draw_vbo; + + if (util_get_cpu_caps()->has_popcnt) { + sctx->draw_vertex_state[HAS_TESS][HAS_GS][NGG] = + si_draw_vertex_state; + } else { + sctx->draw_vertex_state[HAS_TESS][HAS_GS][NGG] = + si_draw_vertex_state; + } } template @@ -2517,6 +2686,16 @@ static void si_invalid_draw_vbo(struct pipe_context *pipe, unreachable("vertex shader not bound"); } +static void si_invalid_draw_vertex_state(struct pipe_context *ctx, + struct pipe_vertex_state *vstate, + uint32_t partial_velem_mask, + struct pipe_draw_vertex_state_info info, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws) +{ + unreachable("vertex shader not bound"); +} + extern "C" void GFX(si_init_draw_functions_)(struct si_context *sctx) { @@ -2528,6 +2707,7 @@ void GFX(si_init_draw_functions_)(struct si_context *sctx) * initialization of callbacks in upper layers (such as u_threaded_context). */ sctx->b.draw_vbo = si_invalid_draw_vbo; + sctx->b.draw_vertex_state = si_invalid_draw_vertex_state; sctx->blitter->draw_rectangle = si_draw_rectangle; si_init_ia_multi_vgt_param_table(sctx); diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 2414d52a7fd..b0cf1d1b4eb 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1816,9 +1816,15 @@ void si_vs_key_update_inputs(struct si_context *sctx) if (vs->info.base.vs.blit_sgprs_amd) { si_clear_vs_key_inputs(sctx, key, &key->part.vs.prolog); key->opt.prefer_mono = 0; + sctx->uses_nontrivial_vs_prolog = false; return; } + bool uses_nontrivial_vs_prolog = false; + + if (elts->instance_divisor_is_one || elts->instance_divisor_is_fetched) + uses_nontrivial_vs_prolog = true; + key->part.vs.prolog.instance_divisor_is_one = elts->instance_divisor_is_one; key->part.vs.prolog.instance_divisor_is_fetched = elts->instance_divisor_is_fetched; key->opt.prefer_mono = elts->instance_divisor_is_fetched; @@ -1846,9 +1852,29 @@ void si_vs_key_update_inputs(struct si_context *sctx) while (fix) { unsigned i = u_bit_scan(&fix); - key->mono.vs_fix_fetch[i].bits = elts->fix_fetch[i]; + uint8_t fix_fetch = elts->fix_fetch[i]; + + key->mono.vs_fix_fetch[i].bits = fix_fetch; + if (fix_fetch) + uses_nontrivial_vs_prolog = true; } key->mono.vs_fetch_opencode = opencode; + if (opencode) + uses_nontrivial_vs_prolog = true; + + sctx->uses_nontrivial_vs_prolog = uses_nontrivial_vs_prolog; + + /* draw_vertex_state (display lists) requires a trivial VS prolog that ignores + * the current vertex buffers and vertex elements. + * + * We just computed the prolog key because we needed to set uses_nontrivial_vs_prolog, + * so that we know whether the VS prolog should be updated when we switch from + * draw_vertex_state to draw_vbo. Now clear the VS prolog for draw_vertex_state. + * This should happen rarely because the VS prolog should be trivial in most + * cases. + */ + if (uses_nontrivial_vs_prolog && sctx->force_trivial_vs_prolog) + si_clear_vs_key_inputs(sctx, key, &key->part.vs.prolog); } void si_get_vs_key_inputs(struct si_context *sctx, struct si_shader_key *key,