From 844f66bf3887cd91273cc0f3dcb0d605d97e1ed2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 9 Oct 2021 22:13:41 -0400 Subject: [PATCH] radeonsi: remove GS fast launch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It regresses the first snx test because it adds CPU overhead, and there is no way to work around it. The average effect on viewperf is 0, meaning that a few cases improve, while a few others regress. Acked-by: Timur Kristóf Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- .../drivers/radeonsi/gfx10_shader_ngg.c | 44 +------- src/gallium/drivers/radeonsi/si_pipe.c | 1 - src/gallium/drivers/radeonsi/si_pipe.h | 9 +- src/gallium/drivers/radeonsi/si_shader.c | 25 +---- src/gallium/drivers/radeonsi/si_shader.h | 16 +-- .../drivers/radeonsi/si_shader_llvm_gs.c | 2 +- .../drivers/radeonsi/si_shader_llvm_vs.c | 105 ------------------ .../drivers/radeonsi/si_state_draw.cpp | 88 +-------------- .../drivers/radeonsi/si_state_shaders.c | 54 ++++----- 9 files changed, 39 insertions(+), 305 deletions(-) diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index bcba01f910f..8ee9720e171 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -909,16 +909,9 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi) */ LLVMValueRef vtxindex[3]; - if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) { - /* For the GS fast launch, the VS prolog simply puts the Vertex IDs - * into these VGPRs. - */ - for (unsigned i = 0; i < num_vertices; ++i) - vtxindex[i] = ac_get_arg(&ctx->ac, ctx->args.gs_vtx_offset[i]); - } else { - for (unsigned i = 0; i < num_vertices; ++i) - vtxindex[i] = si_unpack_param(ctx, ctx->args.gs_vtx_offset[i / 2], (i & 1) * 16, 16); - }; + for (unsigned i = 0; i < num_vertices; ++i) + vtxindex[i] = si_unpack_param(ctx, ctx->args.gs_vtx_offset[i / 2], (i & 1) * 16, 16); + LLVMValueRef gs_vtxptr[3]; for (unsigned i = 0; i < num_vertices; i++) gs_vtxptr[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]); @@ -1005,7 +998,6 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi) assert(!(shader->key.opt.ngg_culling & SI_NGG_CULL_BACK_FACE)); assert(!(shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE)); - assert(!(shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)); } else { options.num_vertices = 3; options.cull_front = shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE; @@ -2028,14 +2020,6 @@ bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader) unsigned max_gsprims_base = gs_sel->screen->ngg_subgroup_size; /* default prim group size clamp */ unsigned max_esverts_base = gs_sel->screen->ngg_subgroup_size; - if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) { - /* All lanes are filled in wave32. */ - max_gsprims_base = ROUND_DOWN_TO(max_gsprims_base / 3, 32); - max_esverts_base = max_gsprims_base * 3; - } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) { - max_gsprims_base = max_esverts_base - 2; - } - if (gs_stage == MESA_SHADER_GEOMETRY) { bool force_multi_cycling = false; unsigned max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out * gs_num_invocations; @@ -2165,28 +2149,6 @@ retry_select_mode: prim_amp_factor = gs_sel->info.base.gs.vertices_out; } - /* Fix up the thread counts for fast launch. */ - if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) { - /* The vertex count must be a multiple of 3. */ - max_esverts -= max_esverts % 3; - /* We can only decrease the size, not increase it. */ - if (max_gsprims * 3 < max_esverts) { - max_esverts = max_gsprims * 3; - } else { - max_gsprims = max_esverts / 3; - } - } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) { - /* The primitive count must be even to get correct winding for triangle strips. */ - max_gsprims &= ~1; - if (max_gsprims - 2 < max_esverts) { - max_esverts = max_gsprims + 2; - } else { - max_gsprims = max_esverts - 2; - max_gsprims &= ~1; - max_esverts = max_gsprims + 2; - } - } - shader->ngg.hw_max_esverts = max_esverts; shader->ngg.max_gsprims = max_gsprims; shader->ngg.max_out_verts = max_out_vertices; diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 77f4c29e1b5..b812f170c59 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -93,7 +93,6 @@ static const struct debug_named_value radeonsi_debug_options[] = { /* 3D engine options: */ {"nogfx", DBG(NO_GFX), "Disable graphics. Only multimedia compute paths can be used."}, {"nongg", DBG(NO_NGG), "Disable NGG and use the legacy pipeline."}, - {"nofastlaunch", DBG(NO_FAST_LAUNCH), "Disable NGG GS fast launch."}, {"nggc", DBG(ALWAYS_NGG_CULLING_ALL), "Always use NGG culling even when it can hurt."}, {"nggctess", DBG(ALWAYS_NGG_CULLING_TESS), "Always use NGG culling for tessellation."}, {"nonggc", DBG(NO_NGG_CULLING), "Disable NGG culling."}, diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 43baeea73c4..1cd347ab751 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -219,7 +219,6 @@ enum DBG_ALWAYS_NGG_CULLING_ALL, DBG_ALWAYS_NGG_CULLING_TESS, DBG_NO_NGG_CULLING, - DBG_NO_FAST_LAUNCH, DBG_SWITCH_ON_EOP, DBG_NO_OUT_OF_ORDER, DBG_NO_DPBB, @@ -1953,15 +1952,12 @@ static inline void radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sc } static inline unsigned si_get_wave_size(struct si_screen *sscreen, - gl_shader_stage stage, bool ngg, bool es, - bool gs_fast_launch) + gl_shader_stage stage, bool ngg, bool es) { if (stage == MESA_SHADER_COMPUTE) return sscreen->compute_wave_size; else if (stage == MESA_SHADER_FRAGMENT) return sscreen->ps_wave_size; - else if (gs_fast_launch) - return 32; /* GS fast launch hangs with Wave64, so always use Wave32. */ else if ((stage == MESA_SHADER_VERTEX && es && !ngg) || (stage == MESA_SHADER_TESS_EVAL && es && !ngg) || (stage == MESA_SHADER_GEOMETRY && !ngg)) /* legacy GS only supports Wave64 */ @@ -1974,8 +1970,7 @@ static inline unsigned si_get_shader_wave_size(struct si_shader *shader) { return si_get_wave_size(shader->selector->screen, shader->selector->info.stage, shader->key.as_ngg, - shader->key.as_es, - shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL); + shader->key.as_es); } static inline void si_select_draw_vbo(struct si_context *sctx) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 4bc70ce9a22..546f9da1120 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -1276,9 +1276,7 @@ bool si_vs_needs_prolog(const struct si_shader_selector *sel, * VS prolog. */ return sel->vs_needs_prolog || prolog_key->ls_vgpr_fix || /* The 2nd VS prolog loads input VGPRs from LDS */ - (key->opt.ngg_culling && !ngg_cull_shader) || - /* The 1st VS prolog generates input VGPRs for fast launch. */ - (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL); + (key->opt.ngg_culling && !ngg_cull_shader); } /** @@ -1304,16 +1302,8 @@ void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_ key->vs_prolog.as_es = shader_out->key.as_es; key->vs_prolog.as_ngg = shader_out->key.as_ngg; - if (ngg_cull_shader) { - key->vs_prolog.gs_fast_launch_tri_list = - !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST); - key->vs_prolog.gs_fast_launch_tri_strip = - !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP); - key->vs_prolog.gs_fast_launch_index_size_packed = - SI_GET_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(shader_out->key.opt.ngg_culling); - } else if (shader_out->key.opt.ngg_culling) { + if (!ngg_cull_shader && shader_out->key.opt.ngg_culling) key->vs_prolog.load_vgprs_after_culling = 1; - } if (shader_out->selector->info.stage == MESA_SHADER_TESS_CTRL) { key->vs_prolog.as_ls = 1; @@ -1576,10 +1566,6 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list, shader.key.as_ls = key->vs_prolog.as_ls; shader.key.as_es = key->vs_prolog.as_es; shader.key.as_ngg = key->vs_prolog.as_ngg; - shader.key.opt.ngg_culling = - (key->vs_prolog.gs_fast_launch_tri_list ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST : 0) | - (key->vs_prolog.gs_fast_launch_tri_strip ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP : 0) | - SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(key->vs_prolog.gs_fast_launch_index_size_packed); break; case MESA_SHADER_TESS_CTRL: assert(!prolog); @@ -1602,8 +1588,7 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list, struct si_shader_context ctx; si_llvm_context_init(&ctx, sscreen, compiler, si_get_wave_size(sscreen, stage, - shader.key.as_ngg, shader.key.as_es, - shader.key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)); + shader.key.as_ngg, shader.key.as_es)); ctx.shader = &shader; ctx.stage = stage; @@ -2130,9 +2115,7 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler util_rast_prim_is_triangles(sel->info.base.gs.output_primitive)) || (sel->info.stage == MESA_SHADER_VERTEX && /* Used to export PrimitiveID from the correct vertex. */ - (shader->key.mono.u.vs_export_prim_id || - /* Used to generate triangle strip vertex IDs for all threads. */ - shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP))); + shader->key.mono.u.vs_export_prim_id)); shader->uses_vs_state_outprim = sscreen->use_ngg && /* Only used by streamout in vertex shaders. */ diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index ff32672658f..4072a6c028d 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -282,12 +282,7 @@ enum #define SI_NGG_CULL_ENABLED (1 << 0) /* this implies W, view.xy, and small prim culling */ #define SI_NGG_CULL_BACK_FACE (1 << 1) /* back faces */ #define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */ -#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST (1 << 3) /* GS fast launch: triangles */ -#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP (1 << 4) /* GS fast launch: triangle strip */ -#define SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(x) (((x) & 0x3) << 5) /* 0->0, 1->1, 2->2, 3->4 */ -#define SI_GET_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(x) (((x) >> 5) & 0x3) -#define SI_NGG_CULL_GS_FAST_LAUNCH_ALL (0xf << 3) /* GS fast launch (both prim types) */ -#define SI_NGG_CULL_LINES (1 << 7) /* the primitive type is lines */ +#define SI_NGG_CULL_LINES (1 << 3) /* the primitive type is lines */ /** * For VS shader keys, describe any fixups required for vertex fetch. @@ -590,9 +585,6 @@ union si_shader_part_key { unsigned as_ls : 1; unsigned as_es : 1; unsigned as_ngg : 1; - unsigned gs_fast_launch_tri_list : 1; /* for NGG culling */ - unsigned gs_fast_launch_tri_strip : 1; /* for NGG culling */ - unsigned gs_fast_launch_index_size_packed : 2; unsigned load_vgprs_after_culling : 1; /* Prologs for monolithic shaders shouldn't set EXEC. */ unsigned is_monolithic : 1; @@ -686,7 +678,7 @@ struct si_shader_key { unsigned kill_pointsize : 1; /* For NGG VS and TES. */ - unsigned ngg_culling : 8; /* SI_NGG_CULL_* */ + unsigned ngg_culling : 4; /* SI_NGG_CULL_* */ /* For shaders where monolithic variants have better code. * @@ -744,7 +736,7 @@ struct gfx9_gs_info { unsigned esgs_ring_size; /* in bytes */ }; -#define SI_NUM_VGT_STAGES_KEY_BITS 6 +#define SI_NUM_VGT_STAGES_KEY_BITS 5 #define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS) /* The VGT_SHADER_STAGES key used to index the table of precomputed values. @@ -755,7 +747,6 @@ union si_vgt_stages_key { #if UTIL_ARCH_LITTLE_ENDIAN uint8_t tess : 1; uint8_t gs : 1; - uint8_t ngg_gs_fast_launch : 1; uint8_t ngg_passthrough : 1; uint8_t ngg : 1; /* gfx10+ */ uint8_t streamout : 1; /* only used with NGG */ @@ -765,7 +756,6 @@ union si_vgt_stages_key { uint8_t streamout : 1; uint8_t ngg : 1; uint8_t ngg_passthrough : 1; - uint8_t ngg_gs_fast_launch : 1; uint8_t gs : 1; uint8_t tess : 1; #endif diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c index a9ab0c549f3..c22e826ff01 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c @@ -431,7 +431,7 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen, si_llvm_context_init(&ctx, sscreen, compiler, si_get_wave_size(sscreen, MESA_SHADER_VERTEX, - false, false, false)); + false, false)); ctx.shader = shader; ctx.stage = MESA_SHADER_VERTEX; diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c index d35c296c219..ecdcf48403d 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c @@ -839,8 +839,6 @@ void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part returns[num_returns++] = ctx->ac.i32; } - struct ac_arg merged_wave_info = input_sgpr_param[3]; - /* Preloaded VGPRs (outputs must be floats) */ for (i = 0; i < num_input_vgprs; i++) { ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]); @@ -892,109 +890,6 @@ void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part } } - if (key->vs_prolog.gs_fast_launch_tri_list || key->vs_prolog.gs_fast_launch_tri_strip) { - LLVMValueRef wave_id, thread_id_in_tg; - - wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4); - thread_id_in_tg = - ac_build_imad(&ctx->ac, wave_id, LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), - ac_get_thread_id(&ctx->ac)); - - /* The GS fast launch initializes all VGPRs to the value of - * the first thread, so we have to add the thread ID. - * - * Only these are initialized by the hw: - * VGPR2: Base Primitive ID - * VGPR5: Base Vertex ID - * VGPR6: Instance ID - */ - - /* Put the vertex thread IDs into VGPRs as-is instead of packing them. - * The NGG cull shader will read them from there. - */ - if (key->vs_prolog.gs_fast_launch_tri_list) { - input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx01_offset */ - LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 0 */ - LLVMConstInt(ctx->ac.i32, 0, 0)); - input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx23_offset */ - LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 1 */ - LLVMConstInt(ctx->ac.i32, 1, 0)); - input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx45_offset */ - LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 2 */ - LLVMConstInt(ctx->ac.i32, 2, 0)); - } else { - assert(key->vs_prolog.gs_fast_launch_tri_strip); - LLVMBuilderRef builder = ctx->ac.builder; - /* Triangle indices: */ - LLVMValueRef index[3] = { - thread_id_in_tg, - LLVMBuildAdd(builder, thread_id_in_tg, LLVMConstInt(ctx->ac.i32, 1, 0), ""), - LLVMBuildAdd(builder, thread_id_in_tg, LLVMConstInt(ctx->ac.i32, 2, 0), ""), - }; - LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder, thread_id_in_tg, ctx->ac.i1, ""); - LLVMValueRef flatshade_first = LLVMBuildICmp( - builder, LLVMIntEQ, - si_unpack_param(ctx, input_sgpr_param[8 + SI_SGPR_VS_STATE_BITS], 4, 2), - ctx->ac.i32_0, ""); - - ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, flatshade_first, index); - input_vgprs[0] = index[0]; - input_vgprs[1] = index[1]; - input_vgprs[4] = index[2]; - } - - /* Triangles always have all edge flags set initially. */ - input_vgprs[3] = LLVMConstInt(ctx->ac.i32, 0x7 << 8, 0); - - input_vgprs[2] = - LLVMBuildAdd(ctx->ac.builder, input_vgprs[2], thread_id_in_tg, ""); /* PrimID */ - input_vgprs[5] = - LLVMBuildAdd(ctx->ac.builder, input_vgprs[5], thread_id_in_tg, ""); /* VertexID */ - input_vgprs[8] = input_vgprs[6]; /* InstanceID */ - - if (key->vs_prolog.gs_fast_launch_index_size_packed) { - LLVMTypeRef index_type = ctx->ac.voidt; - - switch (key->vs_prolog.gs_fast_launch_index_size_packed) { - case 1: - index_type = ctx->ac.i8; - break; - case 2: - index_type = ctx->ac.i16; - break; - case 3: - index_type = ctx->ac.i32; - break; - default: - unreachable("invalid gs_fast_launch_index_size_packed"); - } - - LLVMValueRef sgprs[2] = { - ac_get_arg(&ctx->ac, input_sgpr_param[0]), - ac_get_arg(&ctx->ac, input_sgpr_param[1]), - }; - LLVMValueRef indices = ac_build_gather_values(&ctx->ac, sgprs, 2); - indices = LLVMBuildBitCast(ctx->ac.builder, indices, ctx->ac.i64, ""); - indices = LLVMBuildIntToPtr(ctx->ac.builder, indices, - LLVMPointerType(index_type, AC_ADDR_SPACE_CONST), ""); - - LLVMValueRef vertex_id = ac_build_alloca_init(&ctx->ac, input_vgprs[5], ""); - - /* if (is ES thread...) */ - ac_build_ifcc(&ctx->ac, - LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), - si_unpack_param(ctx, merged_wave_info, 0, 8), ""), 0); - /* VertexID = indexBufferLoad(VertexID); */ - LLVMValueRef index = LLVMBuildGEP(ctx->ac.builder, indices, &input_vgprs[5], 1, ""); - index = LLVMBuildLoad(ctx->ac.builder, index, ""); - index = LLVMBuildZExt(ctx->ac.builder, index, ctx->ac.i32, ""); - LLVMBuildStore(ctx->ac.builder, index, vertex_id); - ac_build_endif(&ctx->ac, 0); - - input_vgprs[5] = LLVMBuildLoad(ctx->ac.builder, vertex_id, ""); - } - } - unsigned vertex_id_vgpr = first_vs_vgpr; unsigned instance_id_vgpr = ctx->screen->info.chip_class >= GFX10 ? first_vs_vgpr + 3 diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 2b07c897645..cc824c6b891 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -1621,42 +1621,6 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw } } } else { - /* Set the index buffer for fast launch. The VS prolog will load the indices. */ - if (GFX_VERSION >= GFX10_3 && NGG && - sctx->ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0)) { - index_max_size = (indexbuf->width0 - index_offset) >> util_logbase2(original_index_size); - - radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(indexbuf), - RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER); - uint64_t base_index_va = si_resource(indexbuf)->gpu_address + index_offset; - - for (unsigned i = 0; i < num_draws; i++) { - uint64_t index_va = base_index_va + draws[i].start * original_index_size; - - radeon_set_sh_reg_seq(R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS, 2); - radeon_emit(index_va); - radeon_emit(index_va >> 32); - - if (i > 0) { - if (increment_draw_id) { - unsigned draw_id = drawid_base + i; - - radeon_set_sh_reg(sh_base_reg + SI_SGPR_DRAWID * 4, draw_id); - sctx->last_drawid = draw_id; - } - } - - /* TODO: Do index buffer bounds checking? We don't do it in this case. */ - radeon_emit(PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit)); - radeon_emit(draws[i].count); - radeon_emit(V_0287F0_DI_SRC_SEL_AUTO_INDEX); - } - radeon_end(); - - EMIT_SQTT_END_DRAW; - return; - } - for (unsigned i = 0; i < num_draws; i++) { if (i > 0) { if (increment_draw_id) { @@ -2340,31 +2304,6 @@ static void si_draw(struct pipe_context *ctx, ngg_culling = SI_NGG_CULL_ENABLED | SI_NGG_CULL_LINES; } - /* Use NGG fast launch for certain primitive types. - * A draw must have at least 1 full primitive. - * The fast launch doesn't work with tessellation. - * - * Fast launch is disabled on Navi1x because enabling it requires VGT_FLUSH, - * which decreases performance by up to 10%. Only use fast launch on gfx10.3 and newer. - * - * Since NGG fast launch is enabled by VGT_SHADER_STAGES_EN, which causes a context roll, - * which decreases performance, decrease the frequency of switching it on/off using - * a high vertex count threshold. - */ - if (GFX_VERSION >= GFX10_3 && !HAS_TESS && total_direct_count >= 8000 && - !(sctx->screen->debug_flags & DBG(NO_FAST_LAUNCH))) { - if (prim == PIPE_PRIM_TRIANGLES && !index_size) { - ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST; - } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) { - if (!index_size) { - ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP; - } else if (!primitive_restart) { - ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP | - SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(MIN2(index_size, 3)); - } - } - } - if (ngg_culling != old_ngg_culling) { /* If shader compilation is not ready, this setting will be rejected. */ sctx->ngg_culling = ngg_culling; @@ -2382,32 +2321,13 @@ static void si_draw(struct pipe_context *ctx, return; } - /* si_update_shaders can clear the ngg_culling settings if the shader compilation hasn't - * finished. + /* si_update_shaders can clear the ngg_culling in the shader key if the shader compilation + * hasn't finished. Set it to the correct value in si_context. */ - if (GFX_VERSION >= GFX10 && NGG) { - uint8_t ngg_culling = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->key.opt.ngg_culling; - - if (GFX_VERSION >= GFX10_3 && - old_ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0) && - !(ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0))) { - /* Need to re-set these, because we have bound an index buffer there. */ - sctx->shader_pointers_dirty |= - (1u << si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_GEOMETRY)) | - (1u << si_sampler_and_image_descriptors_idx(PIPE_SHADER_GEOMETRY)); - si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); - } - - /* Set this to the correct value determined by si_update_shaders. */ - sctx->ngg_culling = ngg_culling; - } + if (GFX_VERSION >= GFX10 && NGG) + sctx->ngg_culling = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->key.opt.ngg_culling; } - /* ngg_culling can be changed after si_update_shaders above, so determine index_size here. */ - if (GFX_VERSION >= GFX10_3 && NGG && - sctx->ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0)) - index_size = 0; /* The index buffer will be emulated. */ - /* Since we've called si_context_add_resource_size for vertex buffers, * this must be called after si_need_cs_space, because we must let * need_cs_space flush before we add buffers to the buffer list. diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index b0cf1d1b4eb..3589a0ca1d0 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -70,7 +70,7 @@ void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es, shader_variant_flags |= 1 << 0; if (sel->nir) shader_variant_flags |= 1 << 1; - if (si_get_wave_size(sel->screen, sel->info.stage, ngg, es, false) == 32) + if (si_get_wave_size(sel->screen, sel->info.stage, ngg, es) == 32) shader_variant_flags |= 1 << 2; if (sel->info.stage == MESA_SHADER_FRAGMENT && /* Derivatives imply helper invocations so check for needs_quad_helper_invocations. */ @@ -1306,33 +1306,27 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(oversub_pc_lines > 0) | S_030980_NUM_PC_LINES(oversub_pc_lines - 1); - if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST || - shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) { - shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | - S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts); - } else { - shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | - S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts) | - S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi); + shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | + S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts) | + S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi); - /* On gfx10, the GE only checks against the maximum number of ES verts after - * allocating a full GS primitive. So we need to ensure that whenever - * this check passes, there is enough space for a full primitive without - * vertex reuse. VERT_GRP_SIZE=256 doesn't need this. We should always get 256 - * if we have enough LDS. - * - * Tessellation is unaffected because it always sets GE_CNTL.VERT_GRP_SIZE = 0. + /* On gfx10, the GE only checks against the maximum number of ES verts after + * allocating a full GS primitive. So we need to ensure that whenever + * this check passes, there is enough space for a full primitive without + * vertex reuse. VERT_GRP_SIZE=256 doesn't need this. We should always get 256 + * if we have enough LDS. + * + * Tessellation is unaffected because it always sets GE_CNTL.VERT_GRP_SIZE = 0. + */ + if ((sscreen->info.chip_class == GFX10) && + (es_stage == MESA_SHADER_VERTEX || gs_stage == MESA_SHADER_VERTEX) && /* = no tess */ + shader->ngg.hw_max_esverts != 256 && + shader->ngg.hw_max_esverts > 5) { + /* This could be based on the input primitive type. 5 is the worst case + * for primitive types with adjacency. */ - if ((sscreen->info.chip_class == GFX10) && - (es_stage == MESA_SHADER_VERTEX || gs_stage == MESA_SHADER_VERTEX) && /* = no tess */ - shader->ngg.hw_max_esverts != 256 && - shader->ngg.hw_max_esverts > 5) { - /* This could be based on the input primitive type. 5 is the worst case - * for primitive types with adjacency. - */ - shader->ge_cntl &= C_03096C_VERT_GRP_SIZE; - shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5); - } + shader->ge_cntl &= C_03096C_VERT_GRP_SIZE; + shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5); } if (window_space) { @@ -1347,8 +1341,6 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader shader->ctx_reg.ngg.vgt_stages.u.ngg = 1; shader->ctx_reg.ngg.vgt_stages.u.streamout = gs_sel->so.num_outputs; shader->ctx_reg.ngg.vgt_stages.u.ngg_passthrough = gfx10_is_ngg_passthrough(shader); - shader->ctx_reg.ngg.vgt_stages.u.ngg_gs_fast_launch = - !!(shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL); } static void si_emit_shader_vs(struct si_context *sctx) @@ -4025,7 +4017,7 @@ struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, union } if (key.u.ngg) { - stages |= S_028B54_PRIMGEN_EN(1) | S_028B54_GS_FAST_LAUNCH(key.u.ngg_gs_fast_launch) | + stages |= S_028B54_PRIMGEN_EN(1) | S_028B54_NGG_WAVE_ID_EN(key.u.streamout) | S_028B54_PRIMGEN_PASSTHRU_EN(key.u.ngg_passthrough) | S_028B54_PRIMGEN_PASSTHRU_NO_MSG(key.u.ngg_passthrough && @@ -4036,9 +4028,7 @@ struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, union if (screen->info.chip_class >= GFX9) stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2); - if (screen->info.chip_class >= GFX10 && - /* GS fast launch hangs with Wave64, so always use Wave32. */ - (screen->ge_wave_size == 32 || (key.u.ngg && key.u.ngg_gs_fast_launch))) { + if (screen->info.chip_class >= GFX10 && screen->ge_wave_size == 32) { stages |= S_028B54_HS_W32_EN(1) | S_028B54_GS_W32_EN(key.u.ngg) | /* legacy GS only supports Wave64 */ S_028B54_VS_W32_EN(1);