diff --git a/docs/envvars.rst b/docs/envvars.rst index a61d694aa47..16a83d21cfd 100644 --- a/docs/envvars.rst +++ b/docs/envvars.rst @@ -773,12 +773,6 @@ radeonsi driver environment variables Always use NGG culling even when it can hurt. ``nonggc`` Disable NGG culling. - ``alwayspd`` - Always enable the primitive discard compute shader. - ``pd`` - Enable the primitive discard compute shader for large draw calls. - ``nopd`` - Disable the primitive discard compute shader. ``switch_on_eop`` Program WD/IA to switch on end-of-packet. ``nooutoforder`` diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h index 6b72bfb8003..7b2ef80f5a2 100644 --- a/src/gallium/drivers/radeon/radeon_winsys.h +++ b/src/gallium/drivers/radeon/radeon_winsys.h @@ -80,9 +80,6 @@ enum radeon_bo_flag enum radeon_dependency_flag { - /* Add the dependency to the parallel compute IB only. */ - RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY = 1 << 0, - /* Instead of waiting for a job to finish execution, the dependency will * be signaled when the job starts execution. */ @@ -512,26 +509,6 @@ struct radeon_winsys { struct pipe_fence_handle **fence), void *flush_ctx, bool stop_exec_on_failure); - /** - * Add a parallel compute IB to a gfx IB. It will share the buffer list - * and fence dependencies with the gfx IB. The gfx flush call will submit - * both IBs at the same time. - * - * The compute IB doesn't have an output fence, so the primary IB has - * to use a wait packet for synchronization. - * - * The returned IB is only a stream for writing packets to the new - * IB. The only function that can be used on the compute cs is cs_check_space. - * - * \param compute_cs The returned structure of the command stream. - * \param gfx_cs Gfx IB - * - * \return true on success - */ - bool (*cs_add_parallel_compute_ib)(struct radeon_cmdbuf *compute_cs, - struct radeon_cmdbuf *gfx_cs, - bool uses_gds_ordered_append); - /** * Set up and enable mid command buffer preemption for the command stream. * diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build index 4b734d2b1ef..79af306e29c 100644 --- a/src/gallium/drivers/radeonsi/meson.build +++ b/src/gallium/drivers/radeonsi/meson.build @@ -27,7 +27,6 @@ files_libradeonsi = files( 'si_build_pm4.h', 'si_clear.c', 'si_compute.c', - 'si_compute_prim_discard.c', 'si_compute.h', 'si_compute_blit.c', 'si_cp_dma.c', diff --git a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c deleted file mode 100644 index 67e42801a3f..00000000000 --- a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c +++ /dev/null @@ -1,1072 +0,0 @@ -/* - * Copyright 2019 Advanced Micro Devices, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * on the rights to use, copy, modify, merge, publish, distribute, sub - * license, and/or sell copies of the Software, and to permit persons to whom - * the Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - */ - -#include "ac_llvm_cull.h" -#include "si_build_pm4.h" -#include "si_pipe.h" -#include "si_shader_internal.h" -#include "sid.h" -#include "util/u_upload_mgr.h" - -/* Based on: - * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf - */ - -/* This file implements primitive culling using asynchronous compute. - * - * It takes a monolithic VS in LLVM IR returning gl_Position and invokes it - * in a compute shader. The shader processes 1 primitive/thread by invoking - * the VS for each vertex to get the positions, decomposes strips - * into triangles (if needed), eliminates primitive restart (if needed), - * does (W<0) culling, face culling, view XY culling, zero-area and - * small-primitive culling, and generates a new index buffer that doesn't - * contain culled primitives. - * - * There is no primitive ordering. The generated index buffer will contain - * primitives in a random order. - * - * IB = a GPU command buffer - * - * Both the compute and gfx IBs run in parallel sort of like CE and DE. - * The gfx IB has a CP barrier (REWIND packet) before a draw packet. REWIND - * doesn't continue if its word isn't 0x80000000. The vertex count is being - * atomically incremented within the draw packet. A CS_DONE event will signal - * the REWIND packet to continue. It's really a direct draw with command - * buffer patching from the compute queue. - * - * The compute IB doesn't have to start when its corresponding gfx IB starts, - * but can start sooner. The compute IB is signaled to start after the last - * execution barrier in the *previous* gfx IB. This is handled as follows. - * The kernel GPU scheduler starts the compute IB after the previous gfx IB has - * started. The compute IB then waits (WAIT_REG_MEM) for a mid-IB fence that - * represents the barrier in the previous gfx IB. - * - * Features: - * - Triangle strips are decomposed into an indexed triangle list. - * The decomposition differs based on the provoking vertex state. - * - W<0 culling (W<0 is behind the viewer, sort of like near Z culling). - * - Back face culling, incl. culling zero-area / degenerate primitives. - * - View XY culling. - * - Small primitive culling for all MSAA modes and all quant modes. - * - * The following are not implemented: - * - ClipVertex/ClipDistance/CullDistance-based culling. - * - Scissor culling. - * - HiZ culling. - * - * Limitations (and unimplemented features that may be possible to implement): - * - Only triangles and triangle strips are supported. - * - Primitive restart is not supported. - * - Instancing is unsupported. - * - Multidraws where the vertex shader reads gl_DrawID are unsupported. - * - No support for tessellation and geometry shaders. - * (patch elimination where tess factors are 0 would be possible to implement) - * - The vertex shader must not contain memory stores. - * - All VS resources must not have a write usage in the command buffer. - * - Bindless textures and images must not occur in the vertex shader. 
- * - * User data SGPR layout: - * VERTEX_COUNTER: address of "count" in the draw packet incremented atomically by the shader. - * START_OUT_INDEX: output index buffer offset / 12 - * START_IN_INDEX: input index buffer offset / index_size - * VS.BASE_VERTEX: same value as VS - * INDEX_BUFFERS: pointer to constants - * 0..3: input index buffer - typed buffer view - * 4..7: output index buffer - typed buffer view - * 8..11: viewport state - scale.xy, translate.xy - * VS.VERTEX_BUFFERS: same value as VS - * VS.CONST_AND_SHADER_BUFFERS: same value as VS - * VS.SAMPLERS_AND_IMAGES: same value as VS - * VS.START_INSTANCE: same value as VS - * SMALL_PRIM_CULLING_PRECISION: Scale the primitive bounding box by this number. - * - * How to test primitive restart (the most complicated part because it needs - * to get the primitive orientation right): - * Set THREADGROUP_SIZE to 2 to exercise both intra-wave and inter-wave - * primitive orientation flips with small draw calls, which is what most tests use. - * You can also enable draw call splitting into draw calls with just 2 primitives. - */ - -/* At least 256 is needed for the fastest wave launch rate from compute queues - * due to hw constraints. Nothing in the code needs more than 1 wave/threadgroup. */ -#define THREADGROUP_SIZE 256 /* high numbers limit available VGPRs */ -#define THREADGROUPS_PER_CU 1 /* TGs to launch on 1 CU before going onto the next, max 8 */ -#define MAX_WAVES_PER_SH 0 /* no limit */ -#define INDEX_STORES_USE_SLC 1 /* don't cache indices if L2 is full */ - -/* Grouping compute dispatches for small draw calls: How many primitives from multiple - * draw calls to process by compute before signaling the gfx IB. This reduces the number - * of EOP events + REWIND packets, because they decrease performance. - * This also determines the granularity of draw-level and packet-level splitting. - */ -#define PRIMS_PER_IB (1024 * 1024) /* size per gfx IB */ -#define PRIMS_PER_BATCH (128 * 1024) /* size between REWIND packets */ - -/* Derived values. */ -#define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64) - -#define REWIND_SIGNAL_BIT 0x80000000 - -static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr); - -void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context, - unsigned *prim_discard_vertex_count_threshold, - unsigned *index_ring_size_per_ib) -{ - *prim_discard_vertex_count_threshold = UINT_MAX; /* disable */ - - if (sscreen->info.chip_class <= GFX7 || /* SI-CI support is not implemented */ - sscreen->debug_flags & DBG(NO_PD) || is_aux_context) - return; - - /* TODO: enable this */ - bool enable_by_default = false; - - if (sscreen->debug_flags & DBG(ALWAYS_PD) || sscreen->debug_flags & DBG(PD) || - (enable_by_default && sscreen->allow_draw_out_of_order && - sscreen->info.num_se >= 2)) { - *prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */ - - if (sscreen->debug_flags & DBG(ALWAYS_PD)) - *prim_discard_vertex_count_threshold = 0; /* always enable */ - - /* The total size is double this per context. Greater numbers allow bigger gfx IBs. */ - *index_ring_size_per_ib = PRIMS_PER_IB * 12; /* 3 32-bit indices per primitive. 
*/ - } -} - -static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr) -{ - uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32; - ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->ac.i64, ""); - ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->ac.i64, hi, 0), ""); - return LLVMBuildIntToPtr(ctx->ac.builder, ptr, - LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GLOBAL), ""); -} - -struct si_thread0_section { - struct si_shader_context *ctx; - LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */ - LLVMValueRef saved_exec; -}; - -/* Enter a section that only executes on thread 0. */ -static void si_enter_thread0_section(struct si_shader_context *ctx, - struct si_thread0_section *section, LLVMValueRef thread_id, - LLVMValueRef check_nonzero) -{ - section->ctx = ctx; - section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "result0"); - - /* This IF has 4 instructions: - * v_and_b32_e32 v, 63, v ; get the thread ID - * v_cmp_eq_u32_e32 vcc, 0, v ; thread ID == 0 - * s_and_saveexec_b64 s, vcc - * s_cbranch_execz BB0_4 - * - * It could just be s_and_saveexec_b64 s, 1. - */ - LLVMValueRef cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id, ctx->ac.i32_0, ""); - if (check_nonzero) { - cond = LLVMBuildAnd(ctx->ac.builder, cond, - LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, check_nonzero, - ctx->ac.i32_0, ""), ""); - } - ac_build_ifcc(&ctx->ac, cond, 12601); -} - -/* Exit a section that only executes on thread 0 and broadcast the result - * to all threads. */ -static void si_exit_thread0_section(struct si_thread0_section *section, LLVMValueRef *result) -{ - struct si_shader_context *ctx = section->ctx; - - LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result); - - ac_build_endif(&ctx->ac, 12601); - - /* Broadcast the result from thread 0 to all threads. */ - *result = - ac_build_readlane(&ctx->ac, LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL); -} - -static void si_build_primitive_accepted(struct ac_llvm_context *ac, LLVMValueRef accepted, - void *data); - -void si_build_prim_discard_compute_shader(struct si_shader_context *ctx) -{ - struct si_shader_key *key = &ctx->shader->key; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef vs = ctx->main_fn; - - /* Always inline the VS function. 
*/ - ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE); - LLVMSetLinkage(vs, LLVMPrivateLinkage); - - enum ac_arg_type const_desc_type; - if (ctx->shader->selector->info.base.num_ubos == 1 && - ctx->shader->selector->info.base.num_ssbos == 0) - const_desc_type = AC_ARG_CONST_FLOAT_PTR; - else - const_desc_type = AC_ARG_CONST_DESC_PTR; - - memset(&ctx->args, 0, sizeof(ctx->args)); - - struct ac_arg param_index_buffers_and_constants, param_vertex_counter; - struct ac_arg param_vb_desc, param_const_desc, param_start_out_index; - struct ac_arg param_base_vertex, param_start_instance, param_start_in_index; - struct ac_arg param_block_id, param_local_id, param_smallprim_precision; - struct ac_arg param_sampler_desc; - - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_vertex_counter); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_start_out_index); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_start_in_index); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_base_vertex); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, ¶m_index_buffers_and_constants); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, ¶m_vb_desc); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_desc_type, ¶m_const_desc); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, ¶m_sampler_desc); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_start_instance); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, ¶m_smallprim_precision); - - /* Block ID and thread ID inputs. */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_block_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, ¶m_local_id); - - /* Create the compute shader function. */ - gl_shader_stage old_stage = ctx->stage; - ctx->stage = MESA_SHADER_COMPUTE; - si_llvm_create_func(ctx, "prim_discard_cs", NULL, 0, THREADGROUP_SIZE); - ctx->stage = old_stage; - - /* Assemble parameters for VS. */ - LLVMValueRef vs_params[16]; - unsigned num_vs_params = 0; - unsigned param_vertex_id; - - vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* INTERNAL RESOURCES */ - vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */ - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_const_desc); - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_sampler_desc); - vs_params[num_vs_params++] = - LLVMConstInt(ctx->ac.i32, S_VS_STATE_INDEXED(key->opt.cs_indexed), 0); - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_base_vertex); - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_start_instance); - vs_params[num_vs_params++] = ctx->ac.i32_0; /* DrawID */ - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_vb_desc); - - vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */ - vs_params[num_vs_params++] = ctx->ac.i32_0; /* InstanceID */ - vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused (PrimID) */ - vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused */ - - assert(num_vs_params <= ARRAY_SIZE(vs_params)); - assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs)))); - - /* Load descriptors. 
(load 8 dwords at once) */ - LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8]; - - LLVMValueRef index_buffers_and_constants = - ac_get_arg(&ctx->ac, param_index_buffers_and_constants); - tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants, - ac_array_in_const32_addr_space(ctx->ac.v8i32), ""); - tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->ac.i32_0); - - for (unsigned i = 0; i < 8; i++) - desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i); - - input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4); - output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4); - - /* Compute PrimID. */ - LLVMValueRef global_thread_id = ac_build_imad(&ctx->ac, ac_get_arg(&ctx->ac, param_block_id), - LLVMConstInt(ctx->ac.i32, THREADGROUP_SIZE, 0), - ac_get_arg(&ctx->ac, param_local_id)); - LLVMValueRef prim_id = global_thread_id; - - /* Generate indices (like a non-indexed draw call). */ - LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->ac.i32)}; - unsigned vertices_per_prim = 3; - - switch (key->opt.cs_prim_type) { - case PIPE_PRIM_TRIANGLES: - for (unsigned i = 0; i < 3; i++) { - index[i] = ac_build_imad(&ctx->ac, prim_id, LLVMConstInt(ctx->ac.i32, 3, 0), - LLVMConstInt(ctx->ac.i32, i, 0)); - } - break; - case PIPE_PRIM_TRIANGLE_STRIP: - for (unsigned i = 0; i < 3; i++) { - index[i] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, i, 0), ""); - } - break; - default: - unreachable("unexpected primitive type"); - } - - /* Fetch indices. */ - if (key->opt.cs_indexed) { - for (unsigned i = 0; i < 3; i++) { - index[i] = LLVMBuildAdd(builder, index[i], ac_get_arg(&ctx->ac, param_start_in_index), ""); - index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf, index[i], ctx->ac.i32_0, - 1, 0, true, false, false); - index[i] = ac_to_integer(&ctx->ac, index[i]); - } - } - - LLVMValueRef thread_id = LLVMBuildAnd(builder, ac_get_arg(&ctx->ac, param_local_id), - LLVMConstInt(ctx->ac.i32, 63, 0), ""); - - /* Every other triangle in a strip has a reversed vertex order, so we - * need to swap vertices of odd primitives to get the correct primitive - * orientation when converting triangle strips to triangles. Primitive - * restart complicates it, because a strip can start anywhere. - */ - LLVMValueRef prim_restart_accepted = ctx->ac.i1true; - LLVMValueRef vertex_counter = ac_get_arg(&ctx->ac, param_vertex_counter); - - if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) { - /* Without primitive restart, odd primitives have reversed orientation. - * Only primitive restart can flip it with respect to the first vertex - * of the draw call. - */ - /* prim_is_odd = current_is_odd % 2. */ - LLVMValueRef prim_is_odd = LLVMBuildXor( - builder, ctx->ac.i1false, LLVMBuildTrunc(builder, thread_id, ctx->ac.i1, ""), ""); - - /* Convert triangle strip indices to triangle indices. */ - ac_build_triangle_strip_indices_to_triangle( - &ctx->ac, prim_is_odd, LLVMConstInt(ctx->ac.i1, key->opt.cs_provoking_vertex_first, 0), - index); - } - - /* Execute the vertex shader for each vertex to get vertex positions. */ - LLVMValueRef pos[3][4]; - for (unsigned i = 0; i < vertices_per_prim; i++) { - vs_params[param_vertex_id] = index[i]; - - LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params); - for (unsigned chan = 0; chan < 4; chan++) - pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, ""); - } - - /* Divide XYZ by W. 
*/ - for (unsigned i = 0; i < vertices_per_prim; i++) { - for (unsigned chan = 0; chan < 3; chan++) - pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]); - } - - /* Load the viewport state. */ - LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants, - LLVMConstInt(ctx->ac.i32, 2, 0)); - vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, ""); - LLVMValueRef vp_scale[2], vp_translate[2]; - vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0); - vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1); - vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2); - vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3); - - /* Do culling. */ - struct ac_cull_options options = {}; - options.cull_front = key->opt.cs_cull_front; - options.cull_back = key->opt.cs_cull_back; - options.cull_view_xy = true; - options.cull_small_prims = true; - options.cull_zero_area = true; - options.cull_w = true; - - LLVMValueRef params[] = { - vertex_counter, - output_indexbuf, - (void*)index, - ac_get_arg(&ctx->ac, param_start_out_index), - }; - - ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted, vp_scale, vp_translate, - ac_get_arg(&ctx->ac, param_smallprim_precision), &options, - si_build_primitive_accepted, params); - LLVMBuildRetVoid(builder); -} - -static void si_build_primitive_accepted(struct ac_llvm_context *ac, LLVMValueRef accepted, - void *userdata) -{ - struct si_shader_context *ctx = container_of(ac, struct si_shader_context, ac); - LLVMBuilderRef builder = ctx->ac.builder; - unsigned vertices_per_prim = 3; - LLVMValueRef *params = (LLVMValueRef *)userdata; - LLVMValueRef vertex_counter = params[0]; - LLVMValueRef output_indexbuf = params[1]; - LLVMValueRef *index = (LLVMValueRef *)params[2]; - LLVMValueRef start_out_index = params[3]; - - LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted); - - ac_build_ifcc(&ctx->ac, accepted, 16607); - - /* Count the number of active threads by doing bitcount(accepted). */ - LLVMValueRef num_prims_accepted = ac_build_bit_count(&ctx->ac, accepted_threadmask); - num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->ac.i32, ""); - - /* Get the number of bits set before the index of this thread. */ - LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask); - LLVMValueRef start; - - /* Execute atomic_add on the vertex count. */ - struct si_thread0_section section; - si_enter_thread0_section(ctx, §ion, prim_index, num_prims_accepted); - { - LLVMValueRef num_indices = LLVMBuildMul( - builder, num_prims_accepted, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); - vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter); - start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, vertex_counter, num_indices, - LLVMAtomicOrderingMonotonic, false); - } - si_exit_thread0_section(§ion, &start); - - /* Convert it into the primitive index. */ - start = LLVMBuildUDiv(builder, start, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); - - /* Now we need to store the indices of accepted primitives into - * the output index buffer. - */ - - /* Write indices for accepted primitives. 
*/ - LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, ""); - vindex = LLVMBuildAdd(builder, vindex, start_out_index, ""); - LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3); - - if (!ac_has_vec3_support(ctx->ac.chip_class, true)) - vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3); - - ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata, vindex, ctx->ac.i32_0, - ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0)); - ac_build_endif(&ctx->ac, 16607); -} - -/* Return false if the shader isn't ready. */ -static bool si_shader_select_prim_discard_cs(struct si_context *sctx, - const struct pipe_draw_info *info) -{ - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - struct si_shader_key key; - - memset(&key, 0, sizeof(key)); - si_shader_selector_key_vs(sctx, sctx->shader.vs.cso, &key, &key.part.vs.prolog); - assert(!key.part.vs.prolog.instance_divisor_is_fetched); - - key.opt.vs_as_prim_discard_cs = 1; - key.opt.cs_prim_type = info->mode; - key.opt.cs_indexed = info->index_size != 0; - key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first; - - if (rs->rasterizer_discard) { - /* Just for performance testing and analysis of trivial bottlenecks. - * This should result in a very short compute shader. */ - key.opt.cs_cull_front = 1; - key.opt.cs_cull_back = 1; - } else { - key.opt.cs_cull_front = sctx->viewport0_y_inverted ? rs->cull_back : rs->cull_front; - key.opt.cs_cull_back = sctx->viewport0_y_inverted ? rs->cull_front : rs->cull_back; - } - - sctx->cs_prim_discard_state.cso = sctx->shader.vs.cso; - sctx->cs_prim_discard_state.current = NULL; - - if (!sctx->compiler.passes) - si_init_compiler(sctx->screen, &sctx->compiler); - - struct si_compiler_ctx_state compiler_state; - compiler_state.compiler = &sctx->compiler; - compiler_state.debug = sctx->debug; - compiler_state.is_debug_context = sctx->is_debug; - - return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state, &compiler_state, - &key, -1, true) == 0 && - /* Disallow compute shaders using the scratch buffer. */ - sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0; -} - -static bool si_initialize_prim_discard_cmdbuf(struct si_context *sctx) -{ - if (sctx->index_ring) - return true; - - if (!sctx->prim_discard_compute_cs.priv) { - struct radeon_winsys *ws = sctx->ws; - - if (!ws->cs_add_parallel_compute_ib(&sctx->prim_discard_compute_cs, - &sctx->gfx_cs, false)) - return false; - } - - if (!sctx->index_ring) { - sctx->index_ring = si_aligned_buffer_create( - sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL, - PIPE_USAGE_DEFAULT, - sctx->index_ring_size_per_ib * 2, sctx->screen->info.pte_fragment_size); - if (!sctx->index_ring) - return false; - } - return true; -} - -static bool si_check_ring_space(struct si_context *sctx, unsigned out_indexbuf_size) -{ - return sctx->index_ring_offset + - align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <= - sctx->index_ring_size_per_ib; -} - -#define COMPUTE_PREAMBLE_SIZE (8 + 39 + 11 + 7) - -enum si_prim_discard_outcome -si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info, - unsigned drawid_offset, - const struct pipe_draw_start_count_bias *draws, - unsigned num_draws, unsigned total_count) -{ - /* If the compute shader compilation isn't finished, this returns false. 
*/ - if (!si_shader_select_prim_discard_cs(sctx, info)) - return SI_PRIM_DISCARD_DISABLED; - - if (!si_initialize_prim_discard_cmdbuf(sctx)) - return SI_PRIM_DISCARD_DISABLED; - - struct radeon_cmdbuf *gfx_cs = &sctx->gfx_cs; - unsigned prim = info->mode; - - unsigned num_prims; - if (prim == PIPE_PRIM_TRIANGLES) - num_prims = total_count / 3; - else if (prim == PIPE_PRIM_TRIANGLE_STRIP) - num_prims = total_count - 2; /* approximation ignoring multi draws */ - else - unreachable("shouldn't get here"); - - unsigned out_indexbuf_size = num_prims * 12; - bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size); - - /* Split draws at the draw call level if the ring is full. This makes - * better use of the ring space. - */ - if (ring_full && num_prims > PRIMS_PER_BATCH) { - unsigned vert_count_per_subdraw = 0; - - if (prim == PIPE_PRIM_TRIANGLES) - vert_count_per_subdraw = PRIMS_PER_BATCH * 3; - else if (prim == PIPE_PRIM_TRIANGLE_STRIP) - vert_count_per_subdraw = PRIMS_PER_BATCH; - - /* Split multi draws first. */ - if (num_draws > 1) { - unsigned count = 0; - unsigned first_draw = 0; - unsigned num_draws_split = 0; - - for (unsigned i = 0; i < num_draws; i++) { - if (count && count + draws[i].count > vert_count_per_subdraw) { - /* Submit previous draws. */ - sctx->b.draw_vbo(&sctx->b, info, drawid_offset, NULL, draws + first_draw, num_draws_split); - count = 0; - first_draw = i; - num_draws_split = 0; - } - - if (draws[i].count > vert_count_per_subdraw) { - /* Submit just 1 draw. It will be split. */ - sctx->b.draw_vbo(&sctx->b, info, drawid_offset, NULL, draws + i, 1); - assert(count == 0); - assert(first_draw == i); - assert(num_draws_split == 0); - first_draw = i + 1; - continue; - } - - count += draws[i].count; - num_draws_split++; - } - - if (count) { - /* Submit the remaining draws. */ - assert(num_draws_split > 0); - sctx->b.draw_vbo(&sctx->b, info, drawid_offset, NULL, draws + first_draw, num_draws_split); - } - return SI_PRIM_DISCARD_MULTI_DRAW_SPLIT; - } - - /* Split single draws if splitting multi draws isn't enough. */ - struct pipe_draw_info split_draw = *info; - struct pipe_draw_start_count_bias split_draw_range = draws[0]; - unsigned base_start = split_draw_range.start; - unsigned count = draws[0].count; - - if (prim == PIPE_PRIM_TRIANGLES) { - assert(vert_count_per_subdraw < count); - - for (unsigned start = 0; start < count; start += vert_count_per_subdraw) { - split_draw_range.start = base_start + start; - split_draw_range.count = MIN2(count - start, vert_count_per_subdraw); - - sctx->b.draw_vbo(&sctx->b, &split_draw, drawid_offset, NULL, &split_draw_range, 1); - } - } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) { - /* No primitive pair can be split, because strips reverse orientation - * for odd primitives. */ - STATIC_ASSERT(PRIMS_PER_BATCH % 2 == 0); - - for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) { - split_draw_range.start = base_start + start; - split_draw_range.count = MIN2(count - start, vert_count_per_subdraw + 2); - - sctx->b.draw_vbo(&sctx->b, &split_draw, drawid_offset, NULL, &split_draw_range, 1); - } - } - - return SI_PRIM_DISCARD_DRAW_SPLIT; - } - - /* Just quit if the draw call doesn't fit into the ring and can't be split. */ - if (out_indexbuf_size > sctx->index_ring_size_per_ib) { - if (SI_PRIM_DISCARD_DEBUG) - puts("PD failed: draw call too big, can't be split"); - return SI_PRIM_DISCARD_DISABLED; - } - - /* Compute how many CS dwords we need to reserve. 
*/ - unsigned need_compute_dw = COMPUTE_PREAMBLE_SIZE + - 11 /* shader */ + - 30; /* leave some space at the end */ - unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx, 0); - - for (unsigned i = 0; i < num_draws; i++) { - unsigned num_subdraws = DIV_ROUND_UP(draws[i].count, PRIMS_PER_BATCH); - - need_compute_dw += 8 * num_subdraws + /* signal REWIND */ - 14 /* user SGPRs */ + - 4 * (num_subdraws - 1) + /* user SGPRs after the first subdraw */ - 11 * num_subdraws; - need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */ - } - - if (ring_full || - !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) { - /* If the current IB is empty but the size is too small, add a NOP - * packet to force a flush and get a bigger IB. - */ - if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) && - gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) { - radeon_begin(gfx_cs); - radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(gfx_cs, 0); - radeon_end(); - } - - si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - } - - /* The compute IB is always chained, but we need to call cs_check_space to add more space. */ - struct radeon_cmdbuf *cs = &sctx->prim_discard_compute_cs; - ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false); - assert(compute_has_space); - assert(si_check_ring_space(sctx, out_indexbuf_size)); - assert(cs->current.cdw + need_compute_dw <= cs->current.max_dw); - return SI_PRIM_DISCARD_ENABLED; -} - -void si_compute_signal_gfx(struct si_context *sctx) -{ - struct radeon_cmdbuf *cs = &sctx->prim_discard_compute_cs; - unsigned writeback_L2_flags = 0; - - /* GFX8 needs to flush L2 for CP to see the updated vertex count. */ - if (sctx->chip_class == GFX8) - writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA; - - if (!sctx->compute_num_prims_in_batch) - return; - - assert(sctx->compute_rewind_va); - - /* After the queued dispatches are done and vertex counts are written to - * the gfx IB, signal the gfx IB to continue. CP doesn't wait for - * the dispatches to finish, it only adds the CS_DONE event into the event - * queue. - */ - si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags, - sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2, - writeback_L2_flags ? EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM : EOP_INT_SEL_NONE, - EOP_DATA_SEL_VALUE_32BIT, NULL, - sctx->compute_rewind_va | ((uint64_t)sctx->screen->info.address32_hi << 32), - REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */ - SI_NOT_QUERY); - - sctx->compute_rewind_va = 0; - sctx->compute_num_prims_in_batch = 0; -} - -/* Dispatch a primitive discard compute shader. 
*/ -void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, - const struct pipe_draw_info *info, - const struct pipe_draw_start_count_bias *draws, - unsigned num_draws, unsigned index_size, - unsigned total_count, uint64_t input_indexbuf_va, - unsigned index_max_size) -{ - struct radeon_cmdbuf *gfx_cs = &sctx->gfx_cs; - struct radeon_cmdbuf *cs = &sctx->prim_discard_compute_cs; - unsigned num_total_prims; - unsigned vertices_per_prim, output_indexbuf_format, gfx10_output_indexbuf_format; - - if (!info->instance_count) - return; - - switch (info->mode) { - case PIPE_PRIM_TRIANGLES: - case PIPE_PRIM_TRIANGLE_STRIP: - if (info->mode == PIPE_PRIM_TRIANGLES) - num_total_prims = total_count / 3; - else if (total_count >= 2) - num_total_prims = total_count - 2; /* tri strip approximation ignoring multi draws */ - else - num_total_prims = 0; - - vertices_per_prim = 3; - output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32; - gfx10_output_indexbuf_format = V_008F0C_GFX10_FORMAT_32_32_32_UINT; - break; - default: - unreachable("unsupported primitive type"); - return; - } - - if (!num_total_prims) - return; - - unsigned out_indexbuf_offset; - uint64_t output_indexbuf_size = num_total_prims * vertices_per_prim * 4; - - /* Initialize the compute IB if it's empty. */ - if (!sctx->prim_discard_compute_ib_initialized) { - /* 1) State initialization. */ - sctx->compute_ib_last_shader = NULL; - - if (sctx->last_ib_barrier_fence) { - assert(!sctx->last_ib_barrier_buf); - sctx->ws->cs_add_fence_dependency(gfx_cs, sctx->last_ib_barrier_fence, - RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY); - } - - /* 2) IB initialization. */ - - /* This needs to be done at the beginning of IBs due to possible - * TTM buffer moves in the kernel. - */ - if (sctx->chip_class >= GFX10) { /* 8 DW */ - radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0)); - radeon_emit(cs, 0); /* CP_COHER_CNTL */ - radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ - radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */ - radeon_emit(cs, 0); /* CP_COHER_BASE */ - radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ - radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ - radeon_emit(cs, /* GCR_CNTL */ - S_586_GLI_INV(V_586_GLI_ALL) | S_586_GLK_INV(1) | S_586_GLV_INV(1) | - S_586_GL1_INV(1) | S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) | - S_586_GLM_WB(1) | S_586_SEQ(V_586_SEQ_FORWARD)); - radeon_end(); - } else { - si_emit_surface_sync(sctx, cs, - S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) | - S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) | - S_0085F0_SH_ICACHE_ACTION_ENA(1) | - S_0085F0_SH_KCACHE_ACTION_ENA(1)); - } - - si_emit_initial_compute_regs(sctx, cs); /* 39 DW */ - - radeon_begin(cs); /* 11 DW */ - radeon_set_sh_reg( - cs, R_00B860_COMPUTE_TMPRING_SIZE, - S_00B860_WAVES(sctx->scratch_waves) | S_00B860_WAVESIZE(0)); /* no scratch */ - - /* Only 1D grids are launched. 
*/ - radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2); - radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) | S_00B820_NUM_THREAD_PARTIAL(1)); - radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) | S_00B824_NUM_THREAD_PARTIAL(1)); - - radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2); - radeon_emit(cs, 0); - radeon_emit(cs, 0); - radeon_end(); - - if (sctx->last_ib_barrier_buf) { - assert(!sctx->last_ib_barrier_fence); - radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf, RADEON_USAGE_READ, - RADEON_PRIO_FENCE); - si_cp_wait_mem(sctx, cs, /* 7 DW */ - sctx->last_ib_barrier_buf->gpu_address + sctx->last_ib_barrier_buf_offset, - 1, 1, WAIT_REG_MEM_EQUAL); - } - - sctx->prim_discard_compute_ib_initialized = true; - assert(cs->current.cdw <= COMPUTE_PREAMBLE_SIZE); - } - - /* Allocate the output index buffer. */ - output_indexbuf_size = align(output_indexbuf_size, sctx->screen->info.tcc_cache_line_size); - assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib); - out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset; - sctx->index_ring_offset += output_indexbuf_size; - - radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE, - RADEON_PRIO_SHADER_RW_BUFFER); - uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset; - - /* Prepare index buffer descriptors. */ - struct si_resource *indexbuf_desc = NULL; - unsigned indexbuf_desc_offset; - unsigned desc_size = 12 * 4; - uint32_t *desc; - - u_upload_alloc(sctx->b.const_uploader, 0, desc_size, si_optimal_tcc_alignment(sctx, desc_size), - &indexbuf_desc_offset, (struct pipe_resource **)&indexbuf_desc, (void **)&desc); - radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ, - RADEON_PRIO_DESCRIPTORS); - - /* Input index buffer. */ - desc[0] = input_indexbuf_va; - desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) | S_008F04_STRIDE(index_size); - desc[2] = index_max_size * (sctx->chip_class == GFX8 ? index_size : 1); - - if (sctx->chip_class >= GFX10) { - desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_FORMAT(index_size == 1 ? V_008F0C_GFX10_FORMAT_8_UINT - : index_size == 2 ? V_008F0C_GFX10_FORMAT_16_UINT - : V_008F0C_GFX10_FORMAT_32_UINT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - desc[3] = - S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | - S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8 - : index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16 - : V_008F0C_BUF_DATA_FORMAT_32); - } - - /* Output index buffer. */ - desc[4] = out_indexbuf_va; - desc[5] = - S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) | S_008F04_STRIDE(vertices_per_prim * 4); - desc[6] = num_total_prims * (sctx->chip_class == GFX8 ? 
vertices_per_prim * 4 : 1); - - if (sctx->chip_class >= GFX10) { - desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) | - S_008F0C_FORMAT(gfx10_output_indexbuf_format) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) | - S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | - S_008F0C_DATA_FORMAT(output_indexbuf_format); - } - - /* Viewport state. */ - struct si_small_prim_cull_info cull_info; - si_get_small_prim_cull_info(sctx, &cull_info); - - desc[8] = fui(cull_info.scale[0]); - desc[9] = fui(cull_info.scale[1]); - desc[10] = fui(cull_info.translate[0]); - desc[11] = fui(cull_info.translate[1]); - - /* Set user data SGPRs. */ - /* This can't be >= 16 if we want the fastest launch rate. */ - unsigned user_sgprs = 10; - - uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset; - unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX); - unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX); - uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address; - uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address; - uint64_t vb_desc_va = sctx->vb_descriptors_buffer - ? sctx->vb_descriptors_buffer->gpu_address + sctx->vb_descriptors_offset - : 0; - si_resource_reference(&indexbuf_desc, NULL); - - /* Set the compute shader. */ - struct si_shader *shader = sctx->cs_prim_discard_state.current; - - if (shader != sctx->compute_ib_last_shader) { - radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ, - RADEON_PRIO_SHADER_BINARY); - uint64_t shader_va = shader->bo->gpu_address; - - assert(shader->config.scratch_bytes_per_wave == 0); - assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4); - - radeon_begin(cs); - radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2); - radeon_emit(cs, shader_va >> 8); - radeon_emit(cs, S_00B834_DATA(shader_va >> 40)); - - radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2); - radeon_emit( - cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) | - S_00B848_SGPRS(sctx->chip_class <= GFX9 ? (shader->config.num_sgprs - 1) / 8 : 0) | - S_00B848_FLOAT_MODE(shader->config.float_mode) | S_00B848_DX10_CLAMP(1) | - S_00B848_MEM_ORDERED(sctx->chip_class >= GFX10) | - S_00B848_WGP_MODE(sctx->chip_class >= GFX10)); - radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) | S_00B84C_USER_SGPR(user_sgprs) | - S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) | - S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) | - S_00B84C_LDS_SIZE(shader->config.lds_size)); - - radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, - ac_get_compute_resource_limits(&sctx->screen->info, WAVES_PER_TG, - MAX_WAVES_PER_SH, THREADGROUPS_PER_CU)); - radeon_end(); - sctx->compute_ib_last_shader = shader; - } - - STATIC_ASSERT(PRIMS_PER_BATCH % THREADGROUP_SIZE == 0); - - for (unsigned i = 0; i < num_draws; i++) { - unsigned count = draws[i].count; - unsigned num_prims; - - /* Determine the number of primitives per draw. 
*/ - if (info->mode == PIPE_PRIM_TRIANGLES) - num_prims = count / 3; - else if (count >= 2) - num_prims = count - 2; - else - num_prims = 0; - - if (!num_prims) - continue; - - /* Big draw calls are split into smaller dispatches and draw packets. */ - for (unsigned start_prim = 0; start_prim < num_prims; start_prim += PRIMS_PER_BATCH) { - unsigned num_subdraw_prims; - - if (start_prim + PRIMS_PER_BATCH < num_prims) { - num_subdraw_prims = PRIMS_PER_BATCH; - } else { - num_subdraw_prims = num_prims - start_prim; - } - - /* Small dispatches are executed back to back until a specific primitive - * count is reached. Then, a CS_DONE is inserted to signal the gfx IB - * to start drawing the batch. This batching adds latency to the gfx IB, - * but CS_DONE and REWIND are too slow. - */ - if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH) - si_compute_signal_gfx(sctx); - - if (sctx->compute_num_prims_in_batch == 0) { - assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi); - sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4; - - radeon_begin(gfx_cs); - radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0)); - radeon_emit(gfx_cs, 0); - radeon_end(); - } - - sctx->compute_num_prims_in_batch += num_subdraw_prims; - - uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4; - uint64_t index_va = out_indexbuf_va + start_prim * 12; - - /* Emit the draw packet into the gfx IB. */ - radeon_begin(gfx_cs); - radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0)); - radeon_emit(gfx_cs, num_subdraw_prims * vertices_per_prim); - radeon_emit(gfx_cs, index_va); - radeon_emit(gfx_cs, index_va >> 32); - radeon_emit(gfx_cs, 0); - radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA); - radeon_end(); - - radeon_begin_again(cs); - - /* Continue with the compute IB. */ - if (start_prim == 0) { - if (i == 0) { - /* First draw. */ - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs); - radeon_emit(cs, count_va); - radeon_emit(cs, start_prim); - radeon_emit(cs, draws[i].start); - radeon_emit(cs, index_size ? draws[i].index_bias : draws[i].start); - radeon_emit(cs, index_buffers_va); - radeon_emit(cs, vb_desc_va); - radeon_emit(cs, vs_const_desc_va); - radeon_emit(cs, vs_sampler_desc_va); - radeon_emit(cs, info->start_instance); - /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */ - radeon_emit(cs, fui(cull_info.small_prim_precision)); - } else { - /* Subsequent draws. */ - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 4); - radeon_emit(cs, count_va); - radeon_emit(cs, 0); - radeon_emit(cs, draws[i].start); - radeon_emit(cs, index_size ? draws[i].index_bias : draws[i].start); - } - } else { - /* Draw split. Only update the SGPRs that changed. */ - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2); - radeon_emit(cs, count_va); - radeon_emit(cs, start_prim); - } - - /* Set grid dimensions. 
*/ - unsigned start_block = start_prim / THREADGROUP_SIZE; - unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE; - unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE; - - radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block); - radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X, - S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) | - S_00B81C_NUM_THREAD_PARTIAL(partial_block_size)); - - radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | PKT3_SHADER_TYPE_S(1)); - radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size); - radeon_emit(cs, 1); - radeon_emit(cs, 1); - radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_PARTIAL_TG_EN(!!partial_block_size) | - S_00B800_ORDER_MODE(0 /* launch in order */)); - radeon_end(); - - assert(cs->current.cdw <= cs->current.max_dw); - assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw); - } - } -} diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index b7aece56463..e0e0a669341 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -230,10 +230,8 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs, sdst->TC_L2_dirty = true; /* If it's not a framebuffer fast clear... */ - if (coher == SI_COHERENCY_SHADER) { + if (coher == SI_COHERENCY_SHADER) sctx->num_cp_dma_calls++; - si_prim_discard_signal_next_compute_ib_start(sctx); - } } /** @@ -387,10 +385,8 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, si_resource(dst)->TC_L2_dirty = true; /* If it's not a prefetch or GDS copy... */ - if (dst && src && (dst != src || dst_offset != src_offset)) { + if (dst && src && (dst != src || dst_offset != src_offset)) sctx->num_cp_dma_calls++; - si_prim_discard_signal_next_compute_ib_start(sctx); - } } void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf, diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c index f79f49b54dd..540206c1520 100644 --- a/src/gallium/drivers/radeonsi/si_debug.c +++ b/src/gallium/drivers/radeonsi/si_debug.c @@ -344,7 +344,6 @@ struct si_log_chunk_cs { struct si_saved_cs *cs; bool dump_bo_list; unsigned gfx_begin, gfx_end; - unsigned compute_begin, compute_end; }; static void si_log_chunk_type_cs_destroy(void *data) @@ -402,7 +401,6 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f) struct si_context *ctx = chunk->ctx; struct si_saved_cs *scs = chunk->cs; int last_trace_id = -1; - int last_compute_trace_id = -1; /* We are expecting that the ddebug pipe has already * waited for the context, so this buffer should be idle. @@ -410,10 +408,8 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f) */ uint32_t *map = ctx->ws->buffer_map(ctx->ws, scs->trace_buf->buf, NULL, PIPE_MAP_UNSYNCHRONIZED | PIPE_MAP_READ); - if (map) { + if (map) last_trace_id = map[0]; - last_compute_trace_id = map[1]; - } if (chunk->gfx_end != chunk->gfx_begin) { if (chunk->gfx_begin == 0) { @@ -435,20 +431,6 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f) } } - if (chunk->compute_end != chunk->compute_begin) { - assert(ctx->prim_discard_compute_cs.priv); - - if (scs->flushed) { - ac_parse_ib(f, scs->compute.ib + chunk->compute_begin, - chunk->compute_end - chunk->compute_begin, &last_compute_trace_id, map ? 
1 : 0, - "Compute IB", ctx->chip_class, NULL, NULL); - } else { - si_parse_current_ib(f, &ctx->prim_discard_compute_cs, chunk->compute_begin, - chunk->compute_end, &last_compute_trace_id, map ? 1 : 0, "Compute IB", - ctx->chip_class); - } - } - if (chunk->dump_bo_list) { fprintf(f, "Flushing. Time: "); util_dump_ns(f, scs->time_flush); @@ -468,13 +450,8 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log, bool du struct si_saved_cs *scs = ctx->current_saved_cs; unsigned gfx_cur = ctx->gfx_cs.prev_dw + ctx->gfx_cs.current.cdw; - unsigned compute_cur = 0; - if (ctx->prim_discard_compute_cs.priv) - compute_cur = - ctx->prim_discard_compute_cs.prev_dw + ctx->prim_discard_compute_cs.current.cdw; - - if (!dump_bo_list && gfx_cur == scs->gfx_last_dw && compute_cur == scs->compute_last_dw) + if (!dump_bo_list && gfx_cur == scs->gfx_last_dw) return; struct si_log_chunk_cs *chunk = calloc(1, sizeof(*chunk)); @@ -487,10 +464,6 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log, bool du chunk->gfx_end = gfx_cur; scs->gfx_last_dw = gfx_cur; - chunk->compute_begin = scs->compute_last_dw; - chunk->compute_end = compute_cur; - scs->compute_last_dw = compute_cur; - u_log_chunk(log, &si_log_chunk_type_cs, chunk); } diff --git a/src/gallium/drivers/radeonsi/si_fence.c b/src/gallium/drivers/radeonsi/si_fence.c index 7b82aa3abd3..d389a758d0f 100644 --- a/src/gallium/drivers/radeonsi/si_fence.c +++ b/src/gallium/drivers/radeonsi/si_fence.c @@ -73,7 +73,7 @@ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, unsigne EVENT_INDEX(event == V_028A90_CS_DONE || event == V_028A90_PS_DONE ? 6 : 5) | event_flags; unsigned sel = EOP_DST_SEL(dst_sel) | EOP_INT_SEL(int_sel) | EOP_DATA_SEL(data_sel); - bool compute_ib = !ctx->has_graphics || cs == &ctx->prim_discard_compute_cs; + bool compute_ib = !ctx->has_graphics; radeon_begin(cs); diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index b9305e77115..02421e3970f 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -92,9 +92,6 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h ctx->gfx_flush_in_progress = true; - if (radeon_emitted(&ctx->prim_discard_compute_cs, 0)) - si_compute_signal_gfx(ctx); - if (ctx->has_graphics) { if (!list_is_empty(&ctx->active_queries)) si_suspend_queries(ctx); @@ -136,29 +133,6 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h si_log_hw_flush(ctx); } - if (si_compute_prim_discard_enabled(ctx)) { - /* The compute IB can start after the previous gfx IB starts. */ - if (radeon_emitted(&ctx->prim_discard_compute_cs, 0) && ctx->last_gfx_fence) { - ctx->ws->cs_add_fence_dependency( - &ctx->gfx_cs, ctx->last_gfx_fence, - RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY | RADEON_DEPENDENCY_START_FENCE); - } - - /* Remember the last execution barrier. It's in the IB. - * It will signal the start of the next compute IB. 
- */ - if (flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW && ctx->last_pkt3_write_data) { - *ctx->last_pkt3_write_data = PKT3(PKT3_WRITE_DATA, 3, 0); - ctx->last_pkt3_write_data = NULL; - - si_resource_reference(&ctx->last_ib_barrier_buf, ctx->barrier_buf); - ctx->last_ib_barrier_buf_offset = ctx->barrier_buf_offset; - si_resource_reference(&ctx->barrier_buf, NULL); - - ws->fence_reference(&ctx->last_ib_barrier_fence, NULL); - } - } - if (ctx->is_noop) flags |= RADEON_FLUSH_NOOP; @@ -171,17 +145,6 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h ctx->num_gfx_cs_flushes++; - if (si_compute_prim_discard_enabled(ctx)) { - /* Remember the last execution barrier, which is the last fence - * in this case. - */ - if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) { - ctx->last_pkt3_write_data = NULL; - si_resource_reference(&ctx->last_ib_barrier_buf, NULL); - ws->fence_reference(&ctx->last_ib_barrier_fence, ctx->last_gfx_fence); - } - } - /* Check VM faults if needed. */ if (sscreen->debug_flags & DBG(CHECK_VM)) { /* Use conservative timeout 800ms, after which we won't wait any @@ -216,7 +179,7 @@ static void si_begin_gfx_cs_debug(struct si_context *ctx) pipe_reference_init(&ctx->current_saved_cs->reference, 1); ctx->current_saved_cs->trace_buf = - si_resource(pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 8)); + si_resource(pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 4)); if (!ctx->current_saved_cs->trace_buf) { free(ctx->current_saved_cs); ctx->current_saved_cs = NULL; @@ -368,11 +331,6 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs) bool is_secure = false; if (unlikely(radeon_uses_secure_bos(ctx->ws))) { - /* Disable features that don't work with TMZ: - * - primitive discard - */ - ctx->prim_discard_vertex_count_threshold = UINT_MAX; - is_secure = ctx->ws->cs_is_secure(&ctx->gfx_cs); si_install_draw_wrapper(ctx, si_draw_vbo_tmz_preamble); @@ -549,18 +507,6 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs) assert(!ctx->gfx_cs.prev_dw); ctx->initial_gfx_cs_size = ctx->gfx_cs.current.cdw; - ctx->prim_discard_compute_ib_initialized = false; - - /* Compute-based primitive discard: - * The index ring is divided into 2 halves. Switch between the halves - * in the same fashion as doublebuffering. - */ - if (ctx->index_ring_base) - ctx->index_ring_base = 0; - else - ctx->index_ring_base = ctx->index_ring_size_per_ib; - - ctx->index_ring_offset = 0; /* All buffer references are removed on a flush, so si_check_needs_implicit_sync * cannot determine if si_make_CB_shader_coherent() needs to be called. @@ -586,34 +532,9 @@ void si_trace_emit(struct si_context *sctx) u_log_flush(sctx->log); } -void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx) -{ - if (!si_compute_prim_discard_enabled(sctx)) - return; - - if (!sctx->barrier_buf) { - u_suballocator_alloc(&sctx->allocator_zeroed_memory, 4, 4, &sctx->barrier_buf_offset, - (struct pipe_resource **)&sctx->barrier_buf); - } - - /* Emit a placeholder to signal the next compute IB to start. - * See si_compute_prim_discard.c for explanation. - */ - uint32_t signal = 1; - si_cp_write_data(sctx, sctx->barrier_buf, sctx->barrier_buf_offset, 4, V_370_MEM, V_370_ME, - &signal); - - sctx->last_pkt3_write_data = &sctx->gfx_cs.current.buf[sctx->gfx_cs.current.cdw - 5]; - - /* Only the last occurrence of WRITE_DATA will be executed. - * The packet will be enabled in si_flush_gfx_cs. 
- */ - *sctx->last_pkt3_write_data = PKT3(PKT3_NOP, 3, 0); -} - void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned cp_coher_cntl) { - bool compute_ib = !sctx->has_graphics || cs == &sctx->prim_discard_compute_cs; + bool compute_ib = !sctx->has_graphics; assert(sctx->chip_class <= GFX9); @@ -857,14 +778,6 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs) uint32_t cp_coher_cntl = 0; const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB); - const bool is_barrier = - flush_cb_db || - /* INV_ICACHE == beginning of gfx IB. Checking - * INV_ICACHE fixes corruption for DeusExMD with - * compute-based culling, but I don't know why. - */ - flags & (SI_CONTEXT_INV_ICACHE | SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_VS_PARTIAL_FLUSH) || - (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy); assert(sctx->chip_class <= GFX9); @@ -1077,9 +990,6 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs) radeon_end(); } - if (is_barrier) - si_prim_discard_signal_next_compute_ib_start(sctx); - if (flags & SI_CONTEXT_START_PIPELINE_STATS && sctx->pipeline_stats_enabled != 1) { radeon_begin(cs); radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index dee46810203..c52de041304 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -95,9 +95,6 @@ static const struct debug_named_value radeonsi_debug_options[] = { {"nggc", DBG(ALWAYS_NGG_CULLING_ALL), "Always use NGG culling even when it can hurt."}, {"nggctess", DBG(ALWAYS_NGG_CULLING_TESS), "Always use NGG culling for tessellation."}, {"nonggc", DBG(NO_NGG_CULLING), "Disable NGG culling."}, - {"alwayspd", DBG(ALWAYS_PD), "Always enable the primitive discard compute shader."}, - {"pd", DBG(PD), "Enable the primitive discard compute shader for large draw calls."}, - {"nopd", DBG(NO_PD), "Disable the primitive discard compute shader."}, {"switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet."}, {"nooutoforder", DBG(NO_OUT_OF_ORDER), "Disable out-of-order rasterization"}, {"nodpbb", DBG(NO_DPBB), "Disable DPBB."}, @@ -309,12 +306,8 @@ static void si_destroy_context(struct pipe_context *context) u_suballocator_destroy(&sctx->allocator_zeroed_memory); sctx->ws->fence_reference(&sctx->last_gfx_fence, NULL); - sctx->ws->fence_reference(&sctx->last_ib_barrier_fence, NULL); si_resource_reference(&sctx->eop_bug_scratch, NULL); si_resource_reference(&sctx->eop_bug_scratch_tmz, NULL); - si_resource_reference(&sctx->index_ring, NULL); - si_resource_reference(&sctx->barrier_buf, NULL); - si_resource_reference(&sctx->last_ib_barrier_buf, NULL); si_resource_reference(&sctx->shadowed_regs, NULL); radeon_bo_reference(sctx->screen->ws, &sctx->gds, NULL); radeon_bo_reference(sctx->screen->ws, &sctx->gds_oa, NULL); @@ -618,12 +611,6 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign default: unreachable("unhandled chip class"); } - - si_initialize_prim_discard_tunables(sscreen, flags & SI_CONTEXT_FLAG_AUX, - &sctx->prim_discard_vertex_count_threshold, - &sctx->index_ring_size_per_ib); - } else { - sctx->prim_discard_vertex_count_threshold = UINT_MAX; } sctx->sample_mask = 0xffff; @@ -641,7 +628,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign sctx->b.create_video_buffer = vl_video_buffer_create; } - if (sctx->chip_class >= 
GFX9 || si_compute_prim_discard_enabled(sctx)) { + if (sctx->chip_class >= GFX9) { sctx->wait_mem_scratch = si_aligned_buffer_create(screen, SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL, @@ -1167,15 +1154,10 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, sscreen->max_memory_usage_kb = sscreen->info.vram_size_kb + sscreen->info.gart_size_kb / 4 * 3; - unsigned prim_discard_vertex_count_threshold, tmp; - si_initialize_prim_discard_tunables(sscreen, false, &prim_discard_vertex_count_threshold, &tmp); - /* Compute-shader-based culling doesn't support VBOs in user SGPRs. */ - if (prim_discard_vertex_count_threshold == UINT_MAX) { - /* This decreases CPU overhead if all descriptors are in user SGPRs because we don't - * have to allocate and count references for the upload buffer. - */ - sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1; - } + /* This decreases CPU overhead if all descriptors are in user SGPRs because we don't + * have to allocate and count references for the upload buffer. + */ + sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1; /* Determine tessellation ring info. */ bool double_offchip_buffers = sscreen->info.chip_class >= GFX7 && diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 062fa6e34f7..44f160450f7 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -44,7 +44,6 @@ extern "C" { #endif #define ATI_VENDOR_ID 0x1002 -#define SI_PRIM_DISCARD_DEBUG 0 #define SI_NOT_QUERY 0xffffffff /* The base vertex and primitive restart can be any number, but we must pick @@ -155,11 +154,6 @@ enum si_has_ngg { NGG_ON, }; -enum si_has_prim_discard_cs { - PRIM_DISCARD_CS_OFF, - PRIM_DISCARD_CS_ON, -}; - enum si_clear_code { DCC_CLEAR_COLOR_0000 = 0x00000000, @@ -223,9 +217,6 @@ enum DBG_ALWAYS_NGG_CULLING_TESS, DBG_NO_NGG_CULLING, DBG_NO_FAST_LAUNCH, - DBG_ALWAYS_PD, - DBG_PD, - DBG_NO_PD, DBG_SWITCH_ON_EOP, DBG_NO_OUT_OF_ORDER, DBG_NO_DPBB, @@ -896,7 +887,6 @@ struct si_saved_cs { unsigned trace_id; unsigned gfx_last_dw; - unsigned compute_last_dw; bool flushed; int64_t time_flush; }; @@ -995,26 +985,6 @@ struct si_context { /* NGG streamout. */ struct pb_buffer *gds; struct pb_buffer *gds_oa; - /* Compute-based primitive discard. */ - unsigned prim_discard_vertex_count_threshold; - struct radeon_cmdbuf prim_discard_compute_cs; - struct si_shader *compute_ib_last_shader; - uint32_t compute_rewind_va; - unsigned compute_num_prims_in_batch; - /* index_ring is divided into 2 halves for doublebuffering. */ - struct si_resource *index_ring; - unsigned index_ring_base; /* offset of a per-IB portion */ - unsigned index_ring_offset; /* offset within a per-IB portion */ - unsigned index_ring_size_per_ib; /* max available size per IB */ - bool prim_discard_compute_ib_initialized; - /* For tracking the last execution barrier - it can be either - * a WRITE_DATA packet or a fence. */ - uint32_t *last_pkt3_write_data; - struct si_resource *barrier_buf; - unsigned barrier_buf_offset; - struct pipe_fence_handle *last_ib_barrier_fence; - struct si_resource *last_ib_barrier_buf; - unsigned last_ib_barrier_buf_offset; /* Atoms (direct states). 
*/ union si_state_atoms atoms; @@ -1063,7 +1033,6 @@ struct si_context { /* indexed access using pipe_shader_type (not by MESA_SHADER_*) */ struct si_shader_ctx_state shaders[SI_NUM_GRAPHICS_SHADERS]; }; - struct si_shader_ctx_state cs_prim_discard_state; struct si_cs_shader_state cs_shader_state; /* shader information */ @@ -1254,9 +1223,6 @@ struct si_context { unsigned num_resident_handles; uint64_t num_alloc_tex_transfer_bytes; unsigned last_tex_ps_draw_ratio; /* for query */ - unsigned compute_num_verts_accepted; - unsigned compute_num_verts_rejected; - unsigned compute_num_verts_ineligible; /* due to low vertex count */ unsigned context_roll; /* Queries. */ @@ -1287,7 +1253,7 @@ struct si_context { */ struct hash_table *dirty_implicit_resources; - pipe_draw_vbo_func draw_vbo[2][2][2][2]; + pipe_draw_vbo_func draw_vbo[2][2][2]; /* When b.draw_vbo is a wrapper, real_draw_vbo is the real draw_vbo function */ pipe_draw_vbo_func real_draw_vbo; @@ -1483,7 +1449,6 @@ void si_allocate_gds(struct si_context *ctx); void si_set_tracked_regs_to_clear_state(struct si_context *ctx); void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs); void si_trace_emit(struct si_context *sctx); -void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx); void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned cp_coher_cntl); void gfx10_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs); @@ -1502,32 +1467,6 @@ unsigned si_end_counter(struct si_screen *sscreen, unsigned type, uint64_t begin void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs); void si_init_compute_functions(struct si_context *sctx); -/* si_compute_prim_discard.c */ -enum si_prim_discard_outcome -{ - SI_PRIM_DISCARD_ENABLED, - SI_PRIM_DISCARD_DISABLED, - SI_PRIM_DISCARD_DRAW_SPLIT, - SI_PRIM_DISCARD_MULTI_DRAW_SPLIT, -}; - -void si_build_prim_discard_compute_shader(struct si_shader_context *ctx); -enum si_prim_discard_outcome -si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info, - unsigned drawid_offset, - const struct pipe_draw_start_count_bias *draws, - unsigned num_draws, unsigned total_count); -void si_compute_signal_gfx(struct si_context *sctx); -void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, - const struct pipe_draw_info *info, - const struct pipe_draw_start_count_bias *draws, - unsigned num_draws, unsigned index_size, - unsigned total_count, uint64_t input_indexbuf_va, - unsigned index_max_size); -void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context, - unsigned *prim_discard_vertex_count_threshold, - unsigned *index_ring_size_per_ib); - /* si_pipe.c */ void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compiler); @@ -1996,14 +1935,9 @@ static inline void radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sc radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, bo, usage, priority); } -static inline bool si_compute_prim_discard_enabled(struct si_context *sctx) -{ - return sctx->prim_discard_vertex_count_threshold != UINT_MAX; -} - static inline unsigned si_get_wave_size(struct si_screen *sscreen, gl_shader_stage stage, bool ngg, bool es, - bool gs_fast_launch, bool prim_discard_cs) + bool gs_fast_launch) { if (stage == MESA_SHADER_COMPUTE) return sscreen->compute_wave_size; @@ -2011,8 +1945,7 @@ static inline unsigned si_get_wave_size(struct si_screen *sscreen, return sscreen->ps_wave_size; else if (gs_fast_launch) 
return 32; /* GS fast launch hangs with Wave64, so always use Wave32. */ - else if ((stage == MESA_SHADER_VERTEX && prim_discard_cs) || /* only Wave64 implemented */ - (stage == MESA_SHADER_VERTEX && es && !ngg) || + else if ((stage == MESA_SHADER_VERTEX && es && !ngg) || (stage == MESA_SHADER_TESS_EVAL && es && !ngg) || (stage == MESA_SHADER_GEOMETRY && !ngg)) /* legacy GS only supports Wave64 */ return 64; @@ -2025,18 +1958,14 @@ static inline unsigned si_get_shader_wave_size(struct si_shader *shader) return si_get_wave_size(shader->selector->screen, shader->selector->info.stage, shader->key.as_ngg, shader->key.as_es, - shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL, - shader->key.opt.vs_as_prim_discard_cs); + shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL); } static inline void si_select_draw_vbo(struct si_context *sctx) { - bool has_prim_discard_cs = si_compute_prim_discard_enabled(sctx) && - !sctx->shader.tes.cso && !sctx->shader.gs.cso; pipe_draw_vbo_func draw_vbo = sctx->draw_vbo[!!sctx->shader.tes.cso] [!!sctx->shader.gs.cso] - [sctx->ngg] - [has_prim_discard_cs]; + [sctx->ngg]; assert(draw_vbo); if (unlikely(sctx->real_draw_vbo)) sctx->real_draw_vbo = draw_vbo; diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c index eba6af47a99..8908a56554e 100644 --- a/src/gallium/drivers/radeonsi/si_query.c +++ b/src/gallium/drivers/radeonsi/si_query.c @@ -260,15 +260,6 @@ static bool si_query_sw_begin(struct si_context *sctx, struct si_query *squery) case SI_QUERY_DISK_SHADER_CACHE_MISSES: query->begin_result = sctx->screen->num_disk_shader_cache_misses; break; - case SI_QUERY_PD_NUM_PRIMS_ACCEPTED: - query->begin_result = sctx->compute_num_verts_accepted; - break; - case SI_QUERY_PD_NUM_PRIMS_REJECTED: - query->begin_result = sctx->compute_num_verts_rejected; - break; - case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE: - query->begin_result = sctx->compute_num_verts_ineligible; - break; case SI_QUERY_GPIN_ASIC_ID: case SI_QUERY_GPIN_NUM_SIMD: case SI_QUERY_GPIN_NUM_RB: @@ -429,15 +420,6 @@ static bool si_query_sw_end(struct si_context *sctx, struct si_query *squery) case SI_QUERY_DISK_SHADER_CACHE_MISSES: query->end_result = sctx->screen->num_disk_shader_cache_misses; break; - case SI_QUERY_PD_NUM_PRIMS_ACCEPTED: - query->end_result = sctx->compute_num_verts_accepted; - break; - case SI_QUERY_PD_NUM_PRIMS_REJECTED: - query->end_result = sctx->compute_num_verts_rejected; - break; - case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE: - query->end_result = sctx->compute_num_verts_ineligible; - break; case SI_QUERY_GPIN_ASIC_ID: case SI_QUERY_GPIN_NUM_SIMD: case SI_QUERY_GPIN_NUM_RB: @@ -479,11 +461,6 @@ static bool si_query_sw_get_result(struct si_context *sctx, struct si_query *squ result->u64 = (query->end_result - query->begin_result) * 100 / (query->end_time - query->begin_time); return true; - case SI_QUERY_PD_NUM_PRIMS_ACCEPTED: - case SI_QUERY_PD_NUM_PRIMS_REJECTED: - case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE: - result->u64 = ((unsigned)query->end_result - (unsigned)query->begin_result) / 3; - return true; case SI_QUERY_GPIN_ASIC_ID: result->u32 = 0; return true; @@ -1758,10 +1735,6 @@ static struct pipe_driver_query_info si_driver_query_list[] = { X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE), X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE), X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE), - - X("pd-num-prims-accepted", PD_NUM_PRIMS_ACCEPTED, UINT64, AVERAGE), - X("pd-num-prims-rejected", 
PD_NUM_PRIMS_REJECTED, UINT64, AVERAGE), - X("pd-num-prims-ineligible", PD_NUM_PRIMS_INELIGIBLE, UINT64, AVERAGE), }; #undef X diff --git a/src/gallium/drivers/radeonsi/si_query.h b/src/gallium/drivers/radeonsi/si_query.h index b1654106b13..b0e11373852 100644 --- a/src/gallium/drivers/radeonsi/si_query.h +++ b/src/gallium/drivers/radeonsi/si_query.h @@ -111,9 +111,6 @@ enum SI_QUERY_GPIN_NUM_RB, SI_QUERY_GPIN_NUM_SPI, SI_QUERY_GPIN_NUM_SE, - SI_QUERY_PD_NUM_PRIMS_ACCEPTED, - SI_QUERY_PD_NUM_PRIMS_REJECTED, - SI_QUERY_PD_NUM_PRIMS_INELIGIBLE, SI_QUERY_LIVE_SHADER_CACHE_HITS, SI_QUERY_LIVE_SHADER_CACHE_MISSES, SI_QUERY_MEMORY_SHADER_CACHE_HITS, diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index ee4ed59b096..92b3cb98fcc 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -419,12 +419,6 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader) /* VGPRs */ declare_vs_input_vgprs(ctx, &num_prolog_vgprs); - - /* Return values */ - if (shader->key.opt.vs_as_prim_discard_cs) { - for (i = 0; i < 4; i++) - ac_add_return(&ctx->args, AC_ARG_VGPR); - } break; case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */ @@ -1070,8 +1064,6 @@ const char *si_get_shader_name(const struct si_shader *shader) return "Vertex Shader as ES"; else if (shader->key.as_ls) return "Vertex Shader as LS"; - else if (shader->key.opt.vs_as_prim_discard_cs) - return "Vertex Shader as Primitive Discard CS"; else if (shader->key.as_ngg) return "Vertex Shader as ESGS"; else @@ -1183,12 +1175,6 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f) fprintf(f, " as_ls = %u\n", key->as_ls); fprintf(f, " as_ngg = %u\n", key->as_ngg); fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id); - fprintf(f, " opt.vs_as_prim_discard_cs = %u\n", key->opt.vs_as_prim_discard_cs); - fprintf(f, " opt.cs_prim_type = %s\n", tgsi_primitive_names[key->opt.cs_prim_type]); - fprintf(f, " opt.cs_indexed = %u\n", key->opt.cs_indexed); - fprintf(f, " opt.cs_provoking_vertex_first = %u\n", key->opt.cs_provoking_vertex_first); - fprintf(f, " opt.cs_cull_front = %u\n", key->opt.cs_cull_front); - fprintf(f, " opt.cs_cull_back = %u\n", key->opt.cs_cull_back); break; case MESA_SHADER_TESS_CTRL: @@ -1317,7 +1303,6 @@ void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_ key->vs_prolog.as_ls = shader_out->key.as_ls; key->vs_prolog.as_es = shader_out->key.as_es; key->vs_prolog.as_ngg = shader_out->key.as_ngg; - key->vs_prolog.as_prim_discard_cs = shader_out->key.opt.vs_as_prim_discard_cs; if (ngg_cull_shader) { key->vs_prolog.gs_fast_launch_tri_list = @@ -1342,8 +1327,7 @@ void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_ /* Only one of these combinations can be set. as_ngg can be set with as_es. */ assert(key->vs_prolog.as_ls + key->vs_prolog.as_ngg + - (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) + key->vs_prolog.as_prim_discard_cs <= - 1); + (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) <= 1); /* Enable loading the InstanceID VGPR. */ uint16_t input_mask = u_bit_consecutive(0, info->num_inputs); @@ -1557,7 +1541,6 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list, (key->vs_prolog.gs_fast_launch_tri_list ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST : 0) | (key->vs_prolog.gs_fast_launch_tri_strip ? 
SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP : 0) | SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(key->vs_prolog.gs_fast_launch_index_size_packed); - shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs; break; case MESA_SHADER_TESS_CTRL: assert(!prolog); @@ -1581,8 +1564,7 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list, si_llvm_context_init(&ctx, sscreen, compiler, si_get_wave_size(sscreen, stage, shader.key.as_ngg, shader.key.as_es, - shader.key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL, - shader.key.opt.vs_as_prim_discard_cs)); + shader.key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)); ctx.shader = &shader; ctx.stage = stage; diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 8b787185464..8ddaeaab7de 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -446,7 +446,6 @@ struct si_shader_selector { ubyte const_and_shader_buf_descriptors_index; ubyte sampler_and_images_descriptors_index; bool vs_needs_prolog; - bool prim_discard_cs_allowed; ubyte cs_shaderbufs_sgpr_index; ubyte cs_num_shaderbufs_in_user_sgprs; ubyte cs_images_sgpr_index; @@ -577,7 +576,6 @@ union si_shader_part_key { unsigned as_ls : 1; unsigned as_es : 1; unsigned as_ngg : 1; - unsigned as_prim_discard_cs : 1; unsigned gs_fast_launch_tri_list : 1; /* for NGG culling */ unsigned gs_fast_launch_tri_strip : 1; /* for NGG culling */ unsigned gs_fast_launch_index_size_packed : 2; @@ -684,14 +682,6 @@ struct si_shader_key { */ unsigned prefer_mono : 1; - /* Primitive discard compute shader. */ - unsigned vs_as_prim_discard_cs : 1; - unsigned cs_prim_type : 4; - unsigned cs_indexed : 1; - unsigned cs_provoking_vertex_first : 1; - unsigned cs_cull_front : 1; - unsigned cs_cull_back : 1; - /* VS and TCS have the same number of patch vertices. */ unsigned same_patch_vertices:1; diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index 8854584e059..c975581fe4f 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -804,9 +804,6 @@ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *part !same_thread_count && si_is_multi_part_shader(ctx->shader)) ac_build_endif(&ctx->ac, 6507); - /* Return the value from the last part. It's non-void only for the prim - * discard compute shader. - */ if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) LLVMBuildRetVoid(builder); else @@ -1116,9 +1113,6 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * parts[num_parts++] = main_fn; si_build_wrapper_function(&ctx, parts, num_parts, first_is_prolog ? 1 : 0, 0, false); - - if (ctx.shader->key.opt.vs_as_prim_discard_cs) - si_build_prim_discard_compute_shader(&ctx); } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_EVAL && ngg_cull_main_fn) { LLVMValueRef parts[3], prolog, main_fn = ctx.main_fn; @@ -1289,8 +1283,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * } /* Make sure the input is a pointer and not integer followed by inttoptr. */ - if (!shader->key.opt.vs_as_prim_discard_cs) - assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind); + assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind); /* Compile to bytecode. 
*/ if (!si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, &ctx.ac, debug, diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c index 0cfd441488a..b4a3b8a8aad 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c @@ -431,7 +431,7 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen, si_llvm_context_init(&ctx, sscreen, compiler, si_get_wave_size(sscreen, MESA_SHADER_VERTEX, - false, false, false, false)); + false, false, false)); ctx.shader = shader; ctx.stage = MESA_SHADER_VERTEX; diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c index 73dff3f2203..b6bfa6fe09d 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c @@ -793,32 +793,6 @@ void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi) FREE(outputs); } -static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader_info *info = &ctx->shader->selector->info; - LLVMValueRef *addrs = abi->outputs; - LLVMValueRef pos[4] = {}; - - assert(info->num_outputs <= AC_LLVM_MAX_OUTPUTS); - - for (unsigned i = 0; i < info->num_outputs; i++) { - if (info->output_semantic[i] != VARYING_SLOT_POS) - continue; - - for (unsigned chan = 0; chan < 4; chan++) - pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); - break; - } - assert(pos[0] != NULL); - - /* Return the position output. */ - LLVMValueRef ret = ctx->return_value; - for (unsigned chan = 0; chan < 4; chan++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, ""); - ctx->return_value = ret; -} - /** * Build the vertex shader prolog function. * @@ -1121,8 +1095,6 @@ void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shad ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue; else if (shader->key.as_es) ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; - else if (shader->key.opt.vs_as_prim_discard_cs) - ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue; else if (ngg_cull_shader) ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue; else if (shader->key.as_ngg) diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 9646fc36195..f4a6ef77b25 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -971,7 +971,7 @@ static void si_emit_draw_registers(struct si_context *sctx, } \ } while (0) -template +template ALWAYS_INLINE static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw_info *info, unsigned drawid_base, @@ -980,7 +980,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw unsigned num_draws, unsigned total_count, struct pipe_resource *indexbuf, unsigned index_size, unsigned index_offset, unsigned instance_count, - bool dispatch_prim_discard_cs, unsigned original_index_size) + unsigned original_index_size) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; @@ -1042,22 +1042,19 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw sctx->last_index_size = index_size; } - /* If !ALLOW_PRIM_DISCARD_CS, index_size == original_index_size. 
*/ - if (!ALLOW_PRIM_DISCARD_CS || original_index_size) { - index_max_size = (indexbuf->width0 - index_offset) >> util_logbase2(original_index_size); - /* Skip draw calls with 0-sized index buffers. - * They cause a hang on some chips, like Navi10-14. - */ - if (!index_max_size) { - radeon_end(); - return; - } - - index_va = si_resource(indexbuf)->gpu_address + index_offset; - - radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(indexbuf), RADEON_USAGE_READ, - RADEON_PRIO_INDEX_BUFFER); + index_max_size = (indexbuf->width0 - index_offset) >> util_logbase2(index_size); + /* Skip draw calls with 0-sized index buffers. + * They cause a hang on some chips, like Navi10-14. + */ + if (!index_max_size) { + radeon_end(); + return; } + + index_va = si_resource(indexbuf)->gpu_address + index_offset; + + radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(indexbuf), RADEON_USAGE_READ, + RADEON_PRIO_INDEX_BUFFER); } else { /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE, * so the state must be re-emitted before the next indexed draw. @@ -1190,16 +1187,6 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw bool increment_draw_id = num_draws > 1 && set_draw_id && info->increment_draw_id; if (index_size) { - if (ALLOW_PRIM_DISCARD_CS && dispatch_prim_discard_cs) { - radeon_end(); - - si_dispatch_prim_discard_cs_and_draw(sctx, info, draws, num_draws, - original_index_size, total_count, index_va, - index_max_size); - EMIT_SQTT_END_DRAW; - return; - } - /* NOT_EOP allows merging multiple draws into 1 wave, but only user VGPRs * can be changed between draws, and GS fast launch must be disabled. * NOT_EOP doesn't work on gfx9 and older. @@ -1629,100 +1616,12 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i info->restart_index, min_vertex_count); } -static bool si_all_vs_resources_read_only(struct si_context *sctx, struct pipe_resource *indexbuf) -{ - struct radeon_winsys *ws = sctx->ws; - struct radeon_cmdbuf *cs = &sctx->gfx_cs; - struct si_descriptors *buffers = - &sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX)]; - struct si_shader_selector *vs = sctx->shader.vs.cso; - struct si_vertex_elements *velems = sctx->vertex_elements; - unsigned num_velems = velems->count; - unsigned num_images = vs->info.base.num_images; - - /* Index buffer. */ - if (indexbuf && ws->cs_is_buffer_referenced(cs, si_resource(indexbuf)->buf, RADEON_USAGE_WRITE)) - goto has_write_reference; - - /* Vertex buffers. */ - for (unsigned i = 0; i < num_velems; i++) { - if (!((1 << i) & velems->first_vb_use_mask)) - continue; - - unsigned vb_index = velems->vertex_buffer_index[i]; - struct pipe_resource *res = sctx->vertex_buffer[vb_index].buffer.resource; - if (!res) - continue; - - if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE)) - goto has_write_reference; - } - - /* Constant and shader buffers. */ - for (unsigned i = 0; i < buffers->num_active_slots; i++) { - unsigned index = buffers->first_active_slot + i; - struct pipe_resource *res = sctx->const_and_shader_buffers[PIPE_SHADER_VERTEX].buffers[index]; - if (!res) - continue; - - if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE)) - goto has_write_reference; - } - - /* Samplers. 
*/ - if (vs->info.base.textures_used[0]) { - unsigned num_samplers = BITSET_LAST_BIT(vs->info.base.textures_used); - - for (unsigned i = 0; i < num_samplers; i++) { - struct pipe_sampler_view *view = sctx->samplers[PIPE_SHADER_VERTEX].views[i]; - if (!view) - continue; - - if (ws->cs_is_buffer_referenced(cs, si_resource(view->texture)->buf, RADEON_USAGE_WRITE)) - goto has_write_reference; - } - } - - /* Images. */ - if (num_images) { - for (unsigned i = 0; i < num_images; i++) { - struct pipe_resource *res = sctx->images[PIPE_SHADER_VERTEX].views[i].resource; - if (!res) - continue; - - if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE)) - goto has_write_reference; - } - } - - return true; - -has_write_reference: - /* If the current gfx IB has enough packets, flush it to remove write - * references to buffers. - */ - if (cs->prev_dw + cs->current.cdw > 2048) { - si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - assert(si_all_vs_resources_read_only(sctx, indexbuf)); - return true; - } - return false; -} - -static ALWAYS_INLINE bool pd_msg(const char *s) -{ - if (SI_PRIM_DISCARD_DEBUG) - printf("PD failed: %s\n", s); - return false; -} - #define DRAW_CLEANUP do { \ if (index_size && indexbuf != info->index.resource) \ pipe_resource_reference(&indexbuf, NULL); \ } while (0) -template +template static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info, unsigned drawid_offset, @@ -1910,70 +1809,8 @@ static void si_draw_vbo(struct pipe_context *ctx, info->primitive_restart && (!sctx->screen->options.prim_restart_tri_strips_only || (prim != PIPE_PRIM_TRIANGLE_STRIP && prim != PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY)); - bool dispatch_prim_discard_cs = false; unsigned original_index_size = index_size; - /* Determine if we can use the primitive discard compute shader. */ - /* TODO: this requires that primitives can be drawn out of order, so check depth/stencil/blend states. */ - if (ALLOW_PRIM_DISCARD_CS && - (total_direct_count > sctx->prim_discard_vertex_count_threshold - ? (sctx->compute_num_verts_rejected += total_direct_count, true) - : /* Add, then return true. */ - (sctx->compute_num_verts_ineligible += total_direct_count, - false)) && /* Add, then return false. */ - (!primitive_restart || pd_msg("primitive restart")) && - /* Supported prim types. */ - (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP)) && - (instance_count == 1 || pd_msg("instancing")) && - ((drawid_offset == 0 && (num_draws == 1 || !info->increment_draw_id)) || - !sctx->shader.vs.cso->info.uses_drawid || pd_msg("draw_id > 0")) && - (!sctx->render_cond || pd_msg("render condition")) && - /* Forced enablement ignores pipeline statistics queries. 
*/ - (sctx->screen->debug_flags & (DBG(PD) | DBG(ALWAYS_PD)) || - (!sctx->num_pipeline_stat_queries && !sctx->streamout.prims_gen_query_enabled) || - pd_msg("pipestat or primgen query")) && - (!sctx->vertex_elements->instance_divisor_is_fetched || pd_msg("loads instance divisors")) && - (!sctx->shader.ps.cso->info.uses_primid || pd_msg("PS uses PrimID")) && - !rs->polygon_mode_enabled && -#if SI_PRIM_DISCARD_DEBUG /* same as cso->prim_discard_cs_allowed */ - (!sctx->shader.vs.cso->info.uses_bindless_images || pd_msg("uses bindless images")) && - (!sctx->shader.vs.cso->info.uses_bindless_samplers || pd_msg("uses bindless samplers")) && - (!sctx->shader.vs.cso->info.base.writes_memory || pd_msg("writes memory")) && - (!sctx->shader.vs.cso->info.writes_viewport_index || pd_msg("writes viewport index")) && - !sctx->shader.vs.cso->info.base.vs.window_space_position && - !sctx->shader.vs.cso->so.num_outputs && -#else - (sctx->shader.vs.cso->prim_discard_cs_allowed || - pd_msg("VS shader uses unsupported features")) && -#endif - /* Check that all buffers are used for read only, because compute - * dispatches can run ahead. */ - (si_all_vs_resources_read_only(sctx, index_size ? indexbuf : NULL) || - pd_msg("write reference"))) { - switch (si_prepare_prim_discard_or_split_draw(sctx, info, drawid_offset, draws, num_draws, - total_direct_count)) { - case SI_PRIM_DISCARD_ENABLED: - original_index_size = index_size; - dispatch_prim_discard_cs = true; - - /* The compute shader changes/lowers the following: */ - prim = PIPE_PRIM_TRIANGLES; - index_size = 4; - instance_count = 1; - sctx->compute_num_verts_rejected -= total_direct_count; - sctx->compute_num_verts_accepted += total_direct_count; - break; - case SI_PRIM_DISCARD_DISABLED: - break; - case SI_PRIM_DISCARD_DRAW_SPLIT: - case SI_PRIM_DISCARD_MULTI_DRAW_SPLIT: - sctx->compute_num_verts_rejected -= total_direct_count; - /* The multi draw was split into multiple ones and executed. Return. */ - DRAW_CLEANUP; - return; - } - } - /* Set the rasterization primitive type. * * This must be done after si_decompress_textures, which can call @@ -2005,7 +1842,7 @@ static void si_draw_vbo(struct pipe_context *ctx, if (GFX_VERSION >= GFX10) { struct si_shader_selector *hw_vs = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->cso; - if (NGG && !HAS_GS && !dispatch_prim_discard_cs && + if (NGG && !HAS_GS && /* Tessellation sets ngg_cull_vert_threshold to UINT_MAX if the prim type * is not triangles, so this check is only needed without tessellation. */ (HAS_TESS || sctx->current_rast_prim == PIPE_PRIM_TRIANGLES) && @@ -2154,10 +1991,9 @@ static void si_draw_vbo(struct pipe_context *ctx, } assert(sctx->dirty_atoms == 0); - si_emit_draw_packets + si_emit_draw_packets (sctx, info, drawid_offset, indirect, draws, num_draws, total_direct_count, indexbuf, - index_size, index_offset, instance_count, dispatch_prim_discard_cs, - original_index_size); + index_size, index_offset, instance_count, original_index_size); /* <-- CUs are busy here. */ /* Start prefetches after the draw has been started. Both will run @@ -2193,10 +2029,9 @@ static void si_draw_vbo(struct pipe_context *ctx, } assert(sctx->dirty_atoms == 0); - si_emit_draw_packets + si_emit_draw_packets (sctx, info, drawid_offset, indirect, draws, num_draws, total_direct_count, indexbuf, - index_size, index_offset, instance_count, dispatch_prim_discard_cs, - original_index_size); + index_size, index_offset, instance_count, original_index_size); /* Prefetch the remaining shaders after the draw has been * started. 
*/ @@ -2281,40 +2116,27 @@ static void si_draw_rectangle(struct blitter_context *blitter, void *vertex_elem pipe->draw_vbo(pipe, &info, 0, NULL, &draw, 1); } -template +template static void si_init_draw_vbo(struct si_context *sctx) { - /* Prim discard CS is only useful on gfx7+ because gfx6 doesn't have async compute. */ - if (ALLOW_PRIM_DISCARD_CS && GFX_VERSION < GFX8) - return; - - if (ALLOW_PRIM_DISCARD_CS && (HAS_TESS || HAS_GS)) - return; - if (NGG && GFX_VERSION < GFX10) return; - sctx->draw_vbo[HAS_TESS][HAS_GS][NGG][ALLOW_PRIM_DISCARD_CS] = - si_draw_vbo; -} - -template -static void si_init_draw_vbo_all_internal_options(struct si_context *sctx) -{ - si_init_draw_vbo(sctx); - si_init_draw_vbo(sctx); - si_init_draw_vbo(sctx); - si_init_draw_vbo(sctx); + sctx->draw_vbo[HAS_TESS][HAS_GS][NGG] = + si_draw_vbo; } template static void si_init_draw_vbo_all_pipeline_options(struct si_context *sctx) { - si_init_draw_vbo_all_internal_options(sctx); - si_init_draw_vbo_all_internal_options(sctx); - si_init_draw_vbo_all_internal_options(sctx); - si_init_draw_vbo_all_internal_options(sctx); + si_init_draw_vbo(sctx); + si_init_draw_vbo(sctx); + si_init_draw_vbo(sctx); + si_init_draw_vbo(sctx); + si_init_draw_vbo(sctx); + si_init_draw_vbo(sctx); + si_init_draw_vbo(sctx); + si_init_draw_vbo(sctx); } static void si_invalid_draw_vbo(struct pipe_context *pipe, diff --git a/src/gallium/drivers/radeonsi/si_state_msaa.c b/src/gallium/drivers/radeonsi/si_state_msaa.c index 5412a87f0a1..8ffe2901970 100644 --- a/src/gallium/drivers/radeonsi/si_state_msaa.c +++ b/src/gallium/drivers/radeonsi/si_state_msaa.c @@ -81,8 +81,8 @@ * Right half: {1,3,5,7,9,11,13,15} */ -/* Important note: We have to use the standard DX positions, because - * the primitive discard compute shader relies on them. +/* Important note: We have to use the standard DX positions because shader-based culling + * relies on them. */ /* 1x MSAA */ diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 00352ddeae3..dc1d5f795a6 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -70,7 +70,7 @@ void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es, shader_variant_flags |= 1 << 0; if (sel->nir) shader_variant_flags |= 1 << 1; - if (si_get_wave_size(sel->screen, sel->info.stage, ngg, es, false, false) == 32) + if (si_get_wave_size(sel->screen, sel->info.stage, ngg, es, false) == 32) shader_variant_flags |= 1 << 2; if (sel->info.stage == MESA_SHADER_FRAGMENT && /* Derivatives imply helper invocations so check for needs_quad_helper_invocations. */ @@ -78,11 +78,9 @@ void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es, sel->info.base.fs.uses_discard && sel->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL)) shader_variant_flags |= 1 << 3; - if (sel->info.stage == MESA_SHADER_VERTEX) { - /* This varies depending on whether compute-based culling is enabled. */ - assert(sel->screen->num_vbos_in_user_sgprs <= 7); - shader_variant_flags |= MIN2(sel->screen->num_vbos_in_user_sgprs, 7) << 4; - } + + /* bit gap */ + if (sel->screen->options.no_infinite_interp) shader_variant_flags |= 1 << 7; if (sel->screen->options.clamp_div_by_zero) @@ -2291,10 +2289,8 @@ current_not_ready: /* Compile the main shader part if it doesn't exist. This can happen * if the initial guess was wrong. - * - * The prim discard CS doesn't need the main shader part. 
*/ - if (!is_pure_monolithic && !key->opt.vs_as_prim_discard_cs) { + if (!is_pure_monolithic) { bool ok = true; /* Make sure the main shader part is present. This is needed @@ -2348,8 +2344,7 @@ current_not_ready: shader->is_monolithic = is_pure_monolithic || memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0; - /* The prim discard CS is always optimized. */ - shader->is_optimized = (!is_pure_monolithic || key->opt.vs_as_prim_discard_cs) && + shader->is_optimized = !is_pure_monolithic && memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0; /* If it's an optimized shader, compile it asynchronously. */ @@ -2706,12 +2701,6 @@ static void *si_create_shader_selector(struct pipe_context *ctx, sel->vs_needs_prolog = sel->info.stage == MESA_SHADER_VERTEX && sel->info.num_inputs && !sel->info.base.vs.blit_sgprs_amd; - sel->prim_discard_cs_allowed = - sel->info.stage == MESA_SHADER_VERTEX && !sel->info.uses_bindless_images && - !sel->info.uses_bindless_samplers && !sel->info.base.writes_memory && - !sel->info.writes_viewport_index && - !sel->info.base.vs.window_space_position && !sel->so.num_outputs; - if (sel->info.stage == MESA_SHADER_VERTEX || sel->info.stage == MESA_SHADER_TESS_CTRL || sel->info.stage == MESA_SHADER_TESS_EVAL || diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c index 0792055ccca..747fe281d27 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c @@ -771,9 +771,6 @@ static unsigned amdgpu_ib_max_submit_dwords(enum ib_type ib_type) * http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1 */ return 20 * 1024; - case IB_PARALLEL_COMPUTE: - /* Always chain this IB. */ - return UINT_MAX; default: unreachable("bad ib_type"); } @@ -908,9 +905,6 @@ static bool amdgpu_init_cs_context(struct amdgpu_winsys *ws, assert(0); } - cs->ib[IB_PARALLEL_COMPUTE].ip_type = AMDGPU_HW_IP_COMPUTE; - cs->ib[IB_PARALLEL_COMPUTE].flags = AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE; - cs->last_added_bo = NULL; return true; } @@ -938,8 +932,6 @@ static void amdgpu_cs_context_cleanup(struct amdgpu_winsys *ws, struct amdgpu_cs cleanup_fence_list(&cs->fence_dependencies); cleanup_fence_list(&cs->syncobj_dependencies); cleanup_fence_list(&cs->syncobj_to_signal); - cleanup_fence_list(&cs->compute_fence_dependencies); - cleanup_fence_list(&cs->compute_start_fence_dependencies); cs->num_real_buffers = 0; cs->num_slab_buffers = 0; @@ -957,8 +949,6 @@ static void amdgpu_destroy_cs_context(struct amdgpu_winsys *ws, struct amdgpu_cs FREE(cs->fence_dependencies.list); FREE(cs->syncobj_dependencies.list); FREE(cs->syncobj_to_signal.list); - FREE(cs->compute_fence_dependencies.list); - FREE(cs->compute_start_fence_dependencies.list); } @@ -997,7 +987,6 @@ amdgpu_cs_create(struct radeon_cmdbuf *rcs, amdgpu_cs_chunk_fence_info_to_data(&fence_info, (void*)&cs->fence_chunk); cs->main.ib_type = IB_MAIN; - cs->compute_ib.ib_type = IB_PARALLEL_COMPUTE; if (!amdgpu_init_cs_context(ctx->ws, &cs->csc1, ring_type)) { FREE(cs); @@ -1035,37 +1024,6 @@ amdgpu_cs_create(struct radeon_cmdbuf *rcs, return true; } -static bool -amdgpu_cs_add_parallel_compute_ib(struct radeon_cmdbuf *compute_cs, - struct radeon_cmdbuf *gfx_cs, - bool uses_gds_ordered_append) -{ - struct amdgpu_cs *cs = amdgpu_cs(gfx_cs); - struct amdgpu_winsys *ws = cs->ws; - - if (cs->ring_type != RING_GFX) - return false; - - /* only one secondary IB can be added */ - if (cs->compute_ib.ib_mapped) - return false; - - /* Allocate the compute IB. 
*/ - if (!amdgpu_get_new_ib(ws, compute_cs, &cs->compute_ib, cs)) - return false; - - if (uses_gds_ordered_append) { - cs->csc1.ib[IB_PARALLEL_COMPUTE].flags |= - AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID; - cs->csc2.ib[IB_PARALLEL_COMPUTE].flags |= - AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID; - } - - cs->compute_ib.rcs = compute_cs; - compute_cs->priv = cs; - return true; -} - static bool amdgpu_cs_setup_preemption(struct radeon_cmdbuf *rcs, const uint32_t *preamble_ib, unsigned preamble_num_dw) @@ -1128,7 +1086,7 @@ static bool amdgpu_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw, bool force_chaining) { struct amdgpu_cs *cs = amdgpu_cs(rcs); - struct amdgpu_ib *ib = rcs == cs->main.rcs ? &cs->main : &cs->compute_ib; + struct amdgpu_ib *ib = &cs->main; unsigned cs_epilog_dw = amdgpu_cs_epilog_dws(cs); unsigned need_byte_size = (dw + cs_epilog_dw) * 4; @@ -1286,18 +1244,6 @@ static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rws, util_queue_fence_wait(&fence->submitted); - if (dependency_flags & RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY) { - /* Syncobjs are not needed here. */ - assert(!amdgpu_fence_is_syncobj(fence)); - - if (acs->ws->info.has_scheduled_fence_dependency && - dependency_flags & RADEON_DEPENDENCY_START_FENCE) - add_fence_to_list(&cs->compute_start_fence_dependencies, fence); - else - add_fence_to_list(&cs->compute_fence_dependencies, fence); - return; - } - /* Start fences are not needed here. */ assert(!(dependency_flags & RADEON_DEPENDENCY_START_FENCE)); @@ -1589,66 +1535,6 @@ static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index) num_chunks++; } - /* Submit the parallel compute IB first. */ - if (cs->ib[IB_PARALLEL_COMPUTE].ib_bytes > 0) { - unsigned old_num_chunks = num_chunks; - - /* Add compute fence dependencies. */ - unsigned num_dependencies = cs->compute_fence_dependencies.num; - if (num_dependencies) { - struct drm_amdgpu_cs_chunk_dep *dep_chunk = - alloca(num_dependencies * sizeof(*dep_chunk)); - - for (unsigned i = 0; i < num_dependencies; i++) { - struct amdgpu_fence *fence = - (struct amdgpu_fence*)cs->compute_fence_dependencies.list[i]; - - assert(util_queue_fence_is_signalled(&fence->submitted)); - amdgpu_cs_chunk_fence_to_dep(&fence->fence, &dep_chunk[i]); - } - - chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_DEPENDENCIES; - chunks[num_chunks].length_dw = sizeof(dep_chunk[0]) / 4 * num_dependencies; - chunks[num_chunks].chunk_data = (uintptr_t)dep_chunk; - num_chunks++; - } - - /* Add compute start fence dependencies. */ - unsigned num_start_dependencies = cs->compute_start_fence_dependencies.num; - if (num_start_dependencies) { - struct drm_amdgpu_cs_chunk_dep *dep_chunk = - alloca(num_start_dependencies * sizeof(*dep_chunk)); - - for (unsigned i = 0; i < num_start_dependencies; i++) { - struct amdgpu_fence *fence = - (struct amdgpu_fence*)cs->compute_start_fence_dependencies.list[i]; - - assert(util_queue_fence_is_signalled(&fence->submitted)); - amdgpu_cs_chunk_fence_to_dep(&fence->fence, &dep_chunk[i]); - } - - chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES; - chunks[num_chunks].length_dw = sizeof(dep_chunk[0]) / 4 * num_start_dependencies; - chunks[num_chunks].chunk_data = (uintptr_t)dep_chunk; - num_chunks++; - } - - /* Convert from dwords to bytes. 
*/ - cs->ib[IB_PARALLEL_COMPUTE].ib_bytes *= 4; - chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB; - chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4; - chunks[num_chunks].chunk_data = (uintptr_t)&cs->ib[IB_PARALLEL_COMPUTE]; - num_chunks++; - - r = acs->noop ? 0 : amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, bo_list, - num_chunks, chunks, NULL); - if (r) - goto finalize; - - /* Back off the compute chunks. */ - num_chunks = old_num_chunks; - } - /* Syncobj signals. */ unsigned num_syncobj_to_signal = cs->syncobj_to_signal.num; if (num_syncobj_to_signal) { @@ -1706,7 +1592,7 @@ static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index) r = acs->noop ? 0 : amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, bo_list, num_chunks, chunks, &seq_no); } -finalize: + if (r) { if (r == -ENOMEM) fprintf(stderr, "amdgpu: Not enough memory for command submission.\n"); @@ -1798,12 +1684,6 @@ static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs, } if (cs->ring_type == RING_GFX) ws->gfx_ib_size_counter += (rcs->prev_dw + rcs->current.cdw) * 4; - - /* Also pad secondary IBs. */ - if (cs->compute_ib.ib_mapped) { - while (cs->compute_ib.rcs->current.cdw & ib_pad_dw_mask) - radeon_emit(cs->compute_ib.rcs, PKT3_NOP_PAD); - } break; case RING_UVD: case RING_UVD_ENC: @@ -1839,9 +1719,6 @@ static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs, /* Set IB sizes. */ amdgpu_ib_finalize(ws, rcs, &cs->main); - if (cs->compute_ib.ib_mapped) - amdgpu_ib_finalize(ws, cs->compute_ib.rcs, &cs->compute_ib); - /* Create a fence. */ amdgpu_fence_reference(&cur->fence, NULL); if (cs->next_fence) { @@ -1897,8 +1774,6 @@ static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs, memset(cs->csc->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist)); amdgpu_get_new_ib(ws, rcs, &cs->main, cs); - if (cs->compute_ib.ib_mapped) - amdgpu_get_new_ib(ws, cs->compute_ib.rcs, &cs->compute_ib, cs); if (cs->preamble_ib_bo) { amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo, RADEON_USAGE_READ, 0, @@ -1929,9 +1804,6 @@ static void amdgpu_cs_destroy(struct radeon_cmdbuf *rcs) radeon_bo_reference(&cs->ws->dummy_ws.base, &cs->preamble_ib_bo, NULL); radeon_bo_reference(&cs->ws->dummy_ws.base, &cs->main.big_ib_buffer, NULL); FREE(rcs->prev); - radeon_bo_reference(&cs->ws->dummy_ws.base, &cs->compute_ib.big_ib_buffer, NULL); - if (cs->compute_ib.rcs) - FREE(cs->compute_ib.rcs->prev); amdgpu_destroy_cs_context(cs->ws, &cs->csc1); amdgpu_destroy_cs_context(cs->ws, &cs->csc2); amdgpu_fence_reference(&cs->next_fence, NULL); @@ -1954,7 +1826,6 @@ void amdgpu_cs_init_functions(struct amdgpu_screen_winsys *ws) ws->base.ctx_destroy = amdgpu_ctx_destroy; ws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status; ws->base.cs_create = amdgpu_cs_create; - ws->base.cs_add_parallel_compute_ib = amdgpu_cs_add_parallel_compute_ib; ws->base.cs_setup_preemption = amdgpu_cs_setup_preemption; ws->base.cs_destroy = amdgpu_cs_destroy; ws->base.cs_add_buffer = amdgpu_cs_add_buffer; diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h index 77bde4a070b..4568a6e9b1f 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h @@ -58,7 +58,6 @@ struct amdgpu_cs_buffer { enum ib_type { IB_PREAMBLE, IB_MAIN, - IB_PARALLEL_COMPUTE, IB_NUM, }; @@ -115,10 +114,6 @@ struct amdgpu_cs_context { struct amdgpu_fence_list syncobj_dependencies; struct amdgpu_fence_list syncobj_to_signal; - /* The compute IB uses the dependencies above + these: */ - struct 
amdgpu_fence_list compute_fence_dependencies; - struct amdgpu_fence_list compute_start_fence_dependencies; - struct pipe_fence_handle *fence; /* the error returned from cs_flush for non-async submissions */ @@ -132,7 +127,6 @@ struct amdgpu_cs_context { struct amdgpu_cs { struct amdgpu_ib main; /* must be first because this is inherited */ - struct amdgpu_ib compute_ib; /* optional parallel compute IB */ struct amdgpu_winsys *ws; struct amdgpu_ctx *ctx; enum ring_type ring_type;
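/* Editor's note (not part of the patch): the si_pipe.h and si_state_draw.cpp hunks
 * above shrink the draw_vbo dispatch table from draw_vbo[2][2][2][2] to
 * draw_vbo[2][2][2] once the prim-discard axis is dropped. The sketch below is a
 * minimal, self-contained illustration of that kind of boolean-indexed function
 * table; all names and types here are simplified stand-ins, not radeonsi's
 * actual API. */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

typedef void (*draw_vbo_func)(void);

/* One slot per (has_tess, has_gs, ngg) combination, mirroring the reduced
 * three-dimensional table in the patch. */
static draw_vbo_func draw_vbo[2][2][2];

static void draw_legacy(void) { puts("legacy VS draw"); }
static void draw_tess(void)   { puts("tessellation draw"); }
static void draw_gs(void)     { puts("geometry-shader draw"); }
static void draw_ngg(void)    { puts("NGG draw"); }

static void init_draw_vbo_table(void)
{
   /* The driver fills every slot with a template instantiation; a few
    * representative entries are enough for this illustration. */
   draw_vbo[0][0][0] = draw_legacy;
   draw_vbo[1][0][0] = draw_tess;
   draw_vbo[0][1][0] = draw_gs;
   draw_vbo[0][0][1] = draw_ngg;
}

/* Analogous in spirit to si_select_draw_vbo(): index by current pipeline state. */
static draw_vbo_func select_draw_vbo(bool has_tess, bool has_gs, bool ngg)
{
   draw_vbo_func fn = draw_vbo[has_tess][has_gs][ngg];
   assert(fn && "combination not initialized in this sketch");
   return fn;
}

int main(void)
{
   init_draw_vbo_table();
   select_draw_vbo(false, false, true)();  /* prints "NGG draw" */
   return 0;
}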