From 844f66bf3887cd91273cc0f3dcb0d605d97e1ed2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 9 Oct 2021 22:13:41 -0400
Subject: [PATCH] radeonsi: remove GS fast launch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It regresses the first snx test because it adds CPU overhead, and there is
no way to work around it. The average effect on viewperf is 0, meaning that
a few cases improve, while a few others regress.

Acked-by: Timur Kristóf <timur.kristof@gmail.com>
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13279>
---
 .../drivers/radeonsi/gfx10_shader_ngg.c       |  44 +-------
 src/gallium/drivers/radeonsi/si_pipe.c        |   1 -
 src/gallium/drivers/radeonsi/si_pipe.h        |   9 +-
 src/gallium/drivers/radeonsi/si_shader.c      |  25 +----
 src/gallium/drivers/radeonsi/si_shader.h      |  16 +--
 .../drivers/radeonsi/si_shader_llvm_gs.c      |   2 +-
 .../drivers/radeonsi/si_shader_llvm_vs.c      | 105 ------------------
 .../drivers/radeonsi/si_state_draw.cpp        |  88 +--------------
 .../drivers/radeonsi/si_state_shaders.c       |  54 ++++-----
 9 files changed, 39 insertions(+), 305 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
index bcba01f910f..8ee9720e171 100644
--- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
+++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
@@ -909,16 +909,9 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
     */
 
    LLVMValueRef vtxindex[3];
-   if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) {
-      /* For the GS fast launch, the VS prolog simply puts the Vertex IDs
-       * into these VGPRs.
-       */
-      for (unsigned i = 0; i < num_vertices; ++i)
-         vtxindex[i] = ac_get_arg(&ctx->ac, ctx->args.gs_vtx_offset[i]);
-   } else {
-      for (unsigned i = 0; i < num_vertices; ++i)
-         vtxindex[i] = si_unpack_param(ctx, ctx->args.gs_vtx_offset[i / 2], (i & 1) * 16, 16);
-   };
+   for (unsigned i = 0; i < num_vertices; ++i)
+      vtxindex[i] = si_unpack_param(ctx, ctx->args.gs_vtx_offset[i / 2], (i & 1) * 16, 16);
+
    LLVMValueRef gs_vtxptr[3];
    for (unsigned i = 0; i < num_vertices; i++)
       gs_vtxptr[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);
@@ -1005,7 +998,6 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
 
          assert(!(shader->key.opt.ngg_culling & SI_NGG_CULL_BACK_FACE));
          assert(!(shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE));
-         assert(!(shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL));
       } else {
          options.num_vertices = 3;
          options.cull_front = shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE;
@@ -2028,14 +2020,6 @@ bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader)
    unsigned max_gsprims_base = gs_sel->screen->ngg_subgroup_size; /* default prim group size clamp */
    unsigned max_esverts_base = gs_sel->screen->ngg_subgroup_size;
 
-   if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
-      /* All lanes are filled in wave32. */
-      max_gsprims_base = ROUND_DOWN_TO(max_gsprims_base / 3, 32);
-      max_esverts_base = max_gsprims_base * 3;
-   } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
-      max_gsprims_base = max_esverts_base - 2;
-   }
-
    if (gs_stage == MESA_SHADER_GEOMETRY) {
       bool force_multi_cycling = false;
       unsigned max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out * gs_num_invocations;
@@ -2165,28 +2149,6 @@ retry_select_mode:
       prim_amp_factor = gs_sel->info.base.gs.vertices_out;
    }
 
-   /* Fix up the thread counts for fast launch. */
-   if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
-      /* The vertex count must be a multiple of 3. */
-      max_esverts -= max_esverts % 3;
-      /* We can only decrease the size, not increase it. */
-      if (max_gsprims * 3 < max_esverts) {
-         max_esverts = max_gsprims * 3;
-      } else {
-         max_gsprims = max_esverts / 3;
-      }
-   } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
-      /* The primitive count must be even to get correct winding for triangle strips. */
-      max_gsprims &= ~1;
-      if (max_gsprims - 2 < max_esverts) {
-         max_esverts = max_gsprims + 2;
-      } else {
-         max_gsprims = max_esverts - 2;
-         max_gsprims &= ~1;
-         max_esverts = max_gsprims + 2;
-      }
-   }
-
    shader->ngg.hw_max_esverts = max_esverts;
    shader->ngg.max_gsprims = max_gsprims;
    shader->ngg.max_out_verts = max_out_vertices;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 77f4c29e1b5..b812f170c59 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -93,7 +93,6 @@ static const struct debug_named_value radeonsi_debug_options[] = {
    /* 3D engine options: */
    {"nogfx", DBG(NO_GFX), "Disable graphics. Only multimedia compute paths can be used."},
    {"nongg", DBG(NO_NGG), "Disable NGG and use the legacy pipeline."},
-   {"nofastlaunch", DBG(NO_FAST_LAUNCH), "Disable NGG GS fast launch."},
    {"nggc", DBG(ALWAYS_NGG_CULLING_ALL), "Always use NGG culling even when it can hurt."},
    {"nggctess", DBG(ALWAYS_NGG_CULLING_TESS), "Always use NGG culling for tessellation."},
    {"nonggc", DBG(NO_NGG_CULLING), "Disable NGG culling."},
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 43baeea73c4..1cd347ab751 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -219,7 +219,6 @@ enum
    DBG_ALWAYS_NGG_CULLING_ALL,
    DBG_ALWAYS_NGG_CULLING_TESS,
    DBG_NO_NGG_CULLING,
-   DBG_NO_FAST_LAUNCH,
    DBG_SWITCH_ON_EOP,
    DBG_NO_OUT_OF_ORDER,
    DBG_NO_DPBB,
@@ -1953,15 +1952,12 @@ static inline void radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sc
 }
 
 static inline unsigned si_get_wave_size(struct si_screen *sscreen,
-                                        gl_shader_stage stage, bool ngg, bool es,
-                                        bool gs_fast_launch)
+                                        gl_shader_stage stage, bool ngg, bool es)
 {
    if (stage == MESA_SHADER_COMPUTE)
       return sscreen->compute_wave_size;
    else if (stage == MESA_SHADER_FRAGMENT)
       return sscreen->ps_wave_size;
-   else if (gs_fast_launch)
-      return 32; /* GS fast launch hangs with Wave64, so always use Wave32. */
    else if ((stage == MESA_SHADER_VERTEX && es && !ngg) ||
             (stage == MESA_SHADER_TESS_EVAL && es && !ngg) ||
             (stage == MESA_SHADER_GEOMETRY && !ngg)) /* legacy GS only supports Wave64 */
@@ -1974,8 +1970,7 @@ static inline unsigned si_get_shader_wave_size(struct si_shader *shader)
 {
    return si_get_wave_size(shader->selector->screen, shader->selector->info.stage,
                            shader->key.as_ngg,
-                           shader->key.as_es,
-                           shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
+                           shader->key.as_es);
 }
 
 static inline void si_select_draw_vbo(struct si_context *sctx)
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 4bc70ce9a22..546f9da1120 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1276,9 +1276,7 @@ bool si_vs_needs_prolog(const struct si_shader_selector *sel,
     * VS prolog. */
    return sel->vs_needs_prolog || prolog_key->ls_vgpr_fix ||
           /* The 2nd VS prolog loads input VGPRs from LDS */
-          (key->opt.ngg_culling && !ngg_cull_shader) ||
-          /* The 1st VS prolog generates input VGPRs for fast launch. */
-          (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
+          (key->opt.ngg_culling && !ngg_cull_shader);
 }
 
 /**
@@ -1304,16 +1302,8 @@ void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_
    key->vs_prolog.as_es = shader_out->key.as_es;
    key->vs_prolog.as_ngg = shader_out->key.as_ngg;
 
-   if (ngg_cull_shader) {
-      key->vs_prolog.gs_fast_launch_tri_list =
-         !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST);
-      key->vs_prolog.gs_fast_launch_tri_strip =
-         !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP);
-      key->vs_prolog.gs_fast_launch_index_size_packed =
-         SI_GET_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(shader_out->key.opt.ngg_culling);
-   } else if (shader_out->key.opt.ngg_culling) {
+   if (!ngg_cull_shader && shader_out->key.opt.ngg_culling)
       key->vs_prolog.load_vgprs_after_culling = 1;
-   }
 
    if (shader_out->selector->info.stage == MESA_SHADER_TESS_CTRL) {
       key->vs_prolog.as_ls = 1;
@@ -1576,10 +1566,6 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
       shader.key.as_ls = key->vs_prolog.as_ls;
       shader.key.as_es = key->vs_prolog.as_es;
       shader.key.as_ngg = key->vs_prolog.as_ngg;
-      shader.key.opt.ngg_culling =
-         (key->vs_prolog.gs_fast_launch_tri_list ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST : 0) |
-         (key->vs_prolog.gs_fast_launch_tri_strip ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP : 0) |
-         SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(key->vs_prolog.gs_fast_launch_index_size_packed);
       break;
    case MESA_SHADER_TESS_CTRL:
       assert(!prolog);
@@ -1602,8 +1588,7 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
    struct si_shader_context ctx;
    si_llvm_context_init(&ctx, sscreen, compiler,
                         si_get_wave_size(sscreen, stage,
-                                         shader.key.as_ngg, shader.key.as_es,
-                                         shader.key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL));
+                                         shader.key.as_ngg, shader.key.as_es));
    ctx.shader = &shader;
    ctx.stage = stage;
 
@@ -2130,9 +2115,7 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler
         util_rast_prim_is_triangles(sel->info.base.gs.output_primitive)) ||
        (sel->info.stage == MESA_SHADER_VERTEX &&
         /* Used to export PrimitiveID from the correct vertex. */
-        (shader->key.mono.u.vs_export_prim_id ||
-         /* Used to generate triangle strip vertex IDs for all threads. */
-         shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP)));
+        shader->key.mono.u.vs_export_prim_id));
 
    shader->uses_vs_state_outprim = sscreen->use_ngg &&
                                    /* Only used by streamout in vertex shaders. */
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index ff32672658f..4072a6c028d 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -282,12 +282,7 @@ enum
 #define SI_NGG_CULL_ENABLED                  (1 << 0)   /* this implies W, view.xy, and small prim culling */
 #define SI_NGG_CULL_BACK_FACE                (1 << 1)   /* back faces */
 #define SI_NGG_CULL_FRONT_FACE               (1 << 2)   /* front faces */
-#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST  (1 << 3)   /* GS fast launch: triangles */
-#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP (1 << 4)   /* GS fast launch: triangle strip */
-#define SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(x)     (((x) & 0x3) << 5) /* 0->0, 1->1, 2->2, 3->4 */
-#define SI_GET_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(x) (((x) >> 5) & 0x3)
-#define SI_NGG_CULL_GS_FAST_LAUNCH_ALL       (0xf << 3) /* GS fast launch (both prim types) */
-#define SI_NGG_CULL_LINES                    (1 << 7)   /* the primitive type is lines */
+#define SI_NGG_CULL_LINES                    (1 << 3)   /* the primitive type is lines */
 
 /**
  * For VS shader keys, describe any fixups required for vertex fetch.
@@ -590,9 +585,6 @@ union si_shader_part_key {
       unsigned as_ls : 1;
       unsigned as_es : 1;
       unsigned as_ngg : 1;
-      unsigned gs_fast_launch_tri_list : 1;  /* for NGG culling */
-      unsigned gs_fast_launch_tri_strip : 1; /* for NGG culling */
-      unsigned gs_fast_launch_index_size_packed : 2;
       unsigned load_vgprs_after_culling : 1;
       /* Prologs for monolithic shaders shouldn't set EXEC. */
       unsigned is_monolithic : 1;
@@ -686,7 +678,7 @@ struct si_shader_key {
       unsigned kill_pointsize : 1;
 
       /* For NGG VS and TES. */
-      unsigned ngg_culling : 8; /* SI_NGG_CULL_* */
+      unsigned ngg_culling : 4; /* SI_NGG_CULL_* */
 
       /* For shaders where monolithic variants have better code.
        *
@@ -744,7 +736,7 @@ struct gfx9_gs_info {
    unsigned esgs_ring_size; /* in bytes */
 };
 
-#define SI_NUM_VGT_STAGES_KEY_BITS 6
+#define SI_NUM_VGT_STAGES_KEY_BITS 5
 #define SI_NUM_VGT_STAGES_STATES   (1 << SI_NUM_VGT_STAGES_KEY_BITS)
 
 /* The VGT_SHADER_STAGES key used to index the table of precomputed values.
@@ -755,7 +747,6 @@ union si_vgt_stages_key {
 #if UTIL_ARCH_LITTLE_ENDIAN
       uint8_t tess : 1;
       uint8_t gs : 1;
-      uint8_t ngg_gs_fast_launch : 1;
       uint8_t ngg_passthrough : 1;
       uint8_t ngg : 1;       /* gfx10+ */
       uint8_t streamout : 1; /* only used with NGG */
@@ -765,7 +756,6 @@ union si_vgt_stages_key {
       uint8_t streamout : 1;
       uint8_t ngg : 1;
       uint8_t ngg_passthrough : 1;
-      uint8_t ngg_gs_fast_launch : 1;
       uint8_t gs : 1;
       uint8_t tess : 1;
 #endif
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
index a9ab0c549f3..c22e826ff01 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
@@ -431,7 +431,7 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
 
    si_llvm_context_init(&ctx, sscreen, compiler,
                         si_get_wave_size(sscreen, MESA_SHADER_VERTEX,
-                                         false, false, false));
+                                         false, false));
    ctx.shader = shader;
    ctx.stage = MESA_SHADER_VERTEX;
 
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
index d35c296c219..ecdcf48403d 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
@@ -839,8 +839,6 @@ void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part
       returns[num_returns++] = ctx->ac.i32;
    }
 
-   struct ac_arg merged_wave_info = input_sgpr_param[3];
-
    /* Preloaded VGPRs (outputs must be floats) */
    for (i = 0; i < num_input_vgprs; i++) {
       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]);
@@ -892,109 +890,6 @@ void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part
       }
    }
 
-   if (key->vs_prolog.gs_fast_launch_tri_list || key->vs_prolog.gs_fast_launch_tri_strip) {
-      LLVMValueRef wave_id, thread_id_in_tg;
-
-      wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4);
-      thread_id_in_tg =
-         ac_build_imad(&ctx->ac, wave_id, LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false),
-                       ac_get_thread_id(&ctx->ac));
-
-      /* The GS fast launch initializes all VGPRs to the value of
-       * the first thread, so we have to add the thread ID.
-       *
-       * Only these are initialized by the hw:
-       *   VGPR2: Base Primitive ID
-       *   VGPR5: Base Vertex ID
-       *   VGPR6: Instance ID
-       */
-
-      /* Put the vertex thread IDs into VGPRs as-is instead of packing them.
-       * The NGG cull shader will read them from there.
-       */
-      if (key->vs_prolog.gs_fast_launch_tri_list) {
-         input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg,       /* gs_vtx01_offset */
-                                        LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 0 */
-                                        LLVMConstInt(ctx->ac.i32, 0, 0));
-         input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg,       /* gs_vtx23_offset */
-                                        LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 1 */
-                                        LLVMConstInt(ctx->ac.i32, 1, 0));
-         input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg,       /* gs_vtx45_offset */
-                                        LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 2 */
-                                        LLVMConstInt(ctx->ac.i32, 2, 0));
-      } else {
-         assert(key->vs_prolog.gs_fast_launch_tri_strip);
-         LLVMBuilderRef builder = ctx->ac.builder;
-         /* Triangle indices: */
-         LLVMValueRef index[3] = {
-            thread_id_in_tg,
-            LLVMBuildAdd(builder, thread_id_in_tg, LLVMConstInt(ctx->ac.i32, 1, 0), ""),
-            LLVMBuildAdd(builder, thread_id_in_tg, LLVMConstInt(ctx->ac.i32, 2, 0), ""),
-         };
-         LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder, thread_id_in_tg, ctx->ac.i1, "");
-         LLVMValueRef flatshade_first = LLVMBuildICmp(
-            builder, LLVMIntEQ,
-            si_unpack_param(ctx, input_sgpr_param[8 + SI_SGPR_VS_STATE_BITS], 4, 2),
-            ctx->ac.i32_0, "");
-
-         ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, flatshade_first, index);
-         input_vgprs[0] = index[0];
-         input_vgprs[1] = index[1];
-         input_vgprs[4] = index[2];
-      }
-
-      /* Triangles always have all edge flags set initially. */
-      input_vgprs[3] = LLVMConstInt(ctx->ac.i32, 0x7 << 8, 0);
-
-      input_vgprs[2] =
-         LLVMBuildAdd(ctx->ac.builder, input_vgprs[2], thread_id_in_tg, ""); /* PrimID */
-      input_vgprs[5] =
-         LLVMBuildAdd(ctx->ac.builder, input_vgprs[5], thread_id_in_tg, ""); /* VertexID */
-      input_vgprs[8] = input_vgprs[6];                                       /* InstanceID */
-
-      if (key->vs_prolog.gs_fast_launch_index_size_packed) {
-         LLVMTypeRef index_type = ctx->ac.voidt;
-
-         switch (key->vs_prolog.gs_fast_launch_index_size_packed) {
-         case 1:
-            index_type = ctx->ac.i8;
-            break;
-         case 2:
-            index_type = ctx->ac.i16;
-            break;
-         case 3:
-            index_type = ctx->ac.i32;
-            break;
-         default:
-            unreachable("invalid gs_fast_launch_index_size_packed");
-         }
-
-         LLVMValueRef sgprs[2] = {
-            ac_get_arg(&ctx->ac, input_sgpr_param[0]),
-            ac_get_arg(&ctx->ac, input_sgpr_param[1]),
-         };
-         LLVMValueRef indices = ac_build_gather_values(&ctx->ac, sgprs, 2);
-         indices = LLVMBuildBitCast(ctx->ac.builder, indices, ctx->ac.i64, "");
-         indices = LLVMBuildIntToPtr(ctx->ac.builder, indices,
-                                     LLVMPointerType(index_type, AC_ADDR_SPACE_CONST), "");
-
-         LLVMValueRef vertex_id = ac_build_alloca_init(&ctx->ac, input_vgprs[5], "");
-
-         /* if (is ES thread...) */
-         ac_build_ifcc(&ctx->ac,
-                       LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac),
-                                     si_unpack_param(ctx, merged_wave_info, 0, 8), ""), 0);
-         /* VertexID = indexBufferLoad(VertexID); */
-         LLVMValueRef index = LLVMBuildGEP(ctx->ac.builder, indices, &input_vgprs[5], 1, "");
-         index = LLVMBuildLoad(ctx->ac.builder, index, "");
-         index = LLVMBuildZExt(ctx->ac.builder, index, ctx->ac.i32, "");
-         LLVMBuildStore(ctx->ac.builder, index, vertex_id);
-         ac_build_endif(&ctx->ac, 0);
-
-         input_vgprs[5] = LLVMBuildLoad(ctx->ac.builder, vertex_id, "");
-      }
-   }
-
    unsigned vertex_id_vgpr = first_vs_vgpr;
    unsigned instance_id_vgpr = ctx->screen->info.chip_class >= GFX10
                                   ? first_vs_vgpr + 3
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index 2b07c897645..cc824c6b891 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -1621,42 +1621,6 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
             }
          }
       } else {
-         /* Set the index buffer for fast launch. The VS prolog will load the indices. */
-         if (GFX_VERSION >= GFX10_3 && NGG &&
-             sctx->ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0)) {
-            index_max_size = (indexbuf->width0 - index_offset) >> util_logbase2(original_index_size);
-
-            radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(indexbuf),
-                                      RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER);
-            uint64_t base_index_va = si_resource(indexbuf)->gpu_address + index_offset;
-
-            for (unsigned i = 0; i < num_draws; i++) {
-               uint64_t index_va = base_index_va + draws[i].start * original_index_size;
-
-               radeon_set_sh_reg_seq(R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS, 2);
-               radeon_emit(index_va);
-               radeon_emit(index_va >> 32);
-
-               if (i > 0) {
-                  if (increment_draw_id) {
-                     unsigned draw_id = drawid_base + i;
-
-                     radeon_set_sh_reg(sh_base_reg + SI_SGPR_DRAWID * 4, draw_id);
-                     sctx->last_drawid = draw_id;
-                  }
-               }
-
-               /* TODO: Do index buffer bounds checking? We don't do it in this case. */
-               radeon_emit(PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit));
-               radeon_emit(draws[i].count);
-               radeon_emit(V_0287F0_DI_SRC_SEL_AUTO_INDEX);
-            }
-            radeon_end();
-
-            EMIT_SQTT_END_DRAW;
-            return;
-         }
-
          for (unsigned i = 0; i < num_draws; i++) {
             if (i > 0) {
                if (increment_draw_id) {
@@ -2340,31 +2304,6 @@ static void si_draw(struct pipe_context *ctx,
             ngg_culling = SI_NGG_CULL_ENABLED | SI_NGG_CULL_LINES;
          }
 
-         /* Use NGG fast launch for certain primitive types.
-          * A draw must have at least 1 full primitive.
-          * The fast launch doesn't work with tessellation.
-          *
-          * Fast launch is disabled on Navi1x because enabling it requires VGT_FLUSH,
-          * which decreases performance by up to 10%. Only use fast launch on gfx10.3 and newer.
-          *
-          * Since NGG fast launch is enabled by VGT_SHADER_STAGES_EN, which causes a context roll,
-          * which decreases performance, decrease the frequency of switching it on/off using
-          * a high vertex count threshold.
-          */
-         if (GFX_VERSION >= GFX10_3 && !HAS_TESS && total_direct_count >= 8000 &&
-             !(sctx->screen->debug_flags & DBG(NO_FAST_LAUNCH))) {
-            if (prim == PIPE_PRIM_TRIANGLES && !index_size) {
-               ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST;
-            } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) {
-               if (!index_size) {
-                  ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP;
-               } else if (!primitive_restart) {
-                  ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP |
-                                 SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(MIN2(index_size, 3));
-               }
-            }
-         }
-
          if (ngg_culling != old_ngg_culling) {
             /* If shader compilation is not ready, this setting will be rejected. */
             sctx->ngg_culling = ngg_culling;
@@ -2382,32 +2321,13 @@ static void si_draw(struct pipe_context *ctx,
          return;
       }
 
-      /* si_update_shaders can clear the ngg_culling settings if the shader compilation hasn't
-       * finished.
+      /* si_update_shaders can clear the ngg_culling in the shader key if the shader compilation
+       * hasn't finished. Set it to the correct value in si_context.
        */
-      if (GFX_VERSION >= GFX10 && NGG) {
-         uint8_t ngg_culling = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->key.opt.ngg_culling;
-
-         if (GFX_VERSION >= GFX10_3 &&
-             old_ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0) &&
-             !(ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0))) {
-            /* Need to re-set these, because we have bound an index buffer there. */
-            sctx->shader_pointers_dirty |=
-               (1u << si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_GEOMETRY)) |
-               (1u << si_sampler_and_image_descriptors_idx(PIPE_SHADER_GEOMETRY));
-            si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
-         }
-
-         /* Set this to the correct value determined by si_update_shaders. */
-         sctx->ngg_culling = ngg_culling;
-      }
+      if (GFX_VERSION >= GFX10 && NGG)
+         sctx->ngg_culling = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->key.opt.ngg_culling;
    }
 
-   /* ngg_culling can be changed after si_update_shaders above, so determine index_size here. */
-   if (GFX_VERSION >= GFX10_3 && NGG &&
-       sctx->ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0))
-      index_size = 0; /* The index buffer will be emulated. */
-
    /* Since we've called si_context_add_resource_size for vertex buffers,
     * this must be called after si_need_cs_space, because we must let
     * need_cs_space flush before we add buffers to the buffer list.
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index b0cf1d1b4eb..3589a0ca1d0 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -70,7 +70,7 @@ void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es,
       shader_variant_flags |= 1 << 0;
    if (sel->nir)
       shader_variant_flags |= 1 << 1;
-   if (si_get_wave_size(sel->screen, sel->info.stage, ngg, es, false) == 32)
+   if (si_get_wave_size(sel->screen, sel->info.stage, ngg, es) == 32)
       shader_variant_flags |= 1 << 2;
    if (sel->info.stage == MESA_SHADER_FRAGMENT &&
        /* Derivatives imply helper invocations so check for needs_quad_helper_invocations. */
@@ -1306,33 +1306,27 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
    shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(oversub_pc_lines > 0) |
                                      S_030980_NUM_PC_LINES(oversub_pc_lines - 1);
 
-   if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST ||
-       shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
-      shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
-                        S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts);
-   } else {
-      shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
-                        S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts) |
-                        S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
+   shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
+                     S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts) |
+                     S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
 
-      /* On gfx10, the GE only checks against the maximum number of ES verts after
-       * allocating a full GS primitive. So we need to ensure that whenever
-       * this check passes, there is enough space for a full primitive without
-       * vertex reuse. VERT_GRP_SIZE=256 doesn't need this. We should always get 256
-       * if we have enough LDS.
-       *
-       * Tessellation is unaffected because it always sets GE_CNTL.VERT_GRP_SIZE = 0.
+   /* On gfx10, the GE only checks against the maximum number of ES verts after
+    * allocating a full GS primitive. So we need to ensure that whenever
+    * this check passes, there is enough space for a full primitive without
+    * vertex reuse. VERT_GRP_SIZE=256 doesn't need this. We should always get 256
+    * if we have enough LDS.
+    *
+    * Tessellation is unaffected because it always sets GE_CNTL.VERT_GRP_SIZE = 0.
+    */
+   if ((sscreen->info.chip_class == GFX10) &&
+       (es_stage == MESA_SHADER_VERTEX || gs_stage == MESA_SHADER_VERTEX) && /* = no tess */
+       shader->ngg.hw_max_esverts != 256 &&
+       shader->ngg.hw_max_esverts > 5) {
+      /* This could be based on the input primitive type. 5 is the worst case
+       * for primitive types with adjacency.
        */
-      if ((sscreen->info.chip_class == GFX10) &&
-          (es_stage == MESA_SHADER_VERTEX || gs_stage == MESA_SHADER_VERTEX) && /* = no tess */
-          shader->ngg.hw_max_esverts != 256 &&
-          shader->ngg.hw_max_esverts > 5) {
-         /* This could be based on the input primitive type. 5 is the worst case
-          * for primitive types with adjacency.
-          */
-         shader->ge_cntl &= C_03096C_VERT_GRP_SIZE;
-         shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
-      }
+      shader->ge_cntl &= C_03096C_VERT_GRP_SIZE;
+      shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
    }
 
    if (window_space) {
@@ -1347,8 +1341,6 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
    shader->ctx_reg.ngg.vgt_stages.u.ngg = 1;
    shader->ctx_reg.ngg.vgt_stages.u.streamout = gs_sel->so.num_outputs;
    shader->ctx_reg.ngg.vgt_stages.u.ngg_passthrough = gfx10_is_ngg_passthrough(shader);
-   shader->ctx_reg.ngg.vgt_stages.u.ngg_gs_fast_launch =
-      !!(shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
 }
 
 static void si_emit_shader_vs(struct si_context *sctx)
@@ -4025,7 +4017,7 @@ struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, union
    }
 
    if (key.u.ngg) {
-      stages |= S_028B54_PRIMGEN_EN(1) | S_028B54_GS_FAST_LAUNCH(key.u.ngg_gs_fast_launch) |
+      stages |= S_028B54_PRIMGEN_EN(1) |
                 S_028B54_NGG_WAVE_ID_EN(key.u.streamout) |
                 S_028B54_PRIMGEN_PASSTHRU_EN(key.u.ngg_passthrough) |
                 S_028B54_PRIMGEN_PASSTHRU_NO_MSG(key.u.ngg_passthrough &&
@@ -4036,9 +4028,7 @@ struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, union
    if (screen->info.chip_class >= GFX9)
       stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2);
 
-   if (screen->info.chip_class >= GFX10 &&
-       /* GS fast launch hangs with Wave64, so always use Wave32. */
-       (screen->ge_wave_size == 32 || (key.u.ngg && key.u.ngg_gs_fast_launch))) {
+   if (screen->info.chip_class >= GFX10 && screen->ge_wave_size == 32) {
       stages |= S_028B54_HS_W32_EN(1) |
                 S_028B54_GS_W32_EN(key.u.ngg) | /* legacy GS only supports Wave64 */
                 S_028B54_VS_W32_EN(1);