radeonsi: remove the primitive discard compute shader

It doesn't always work, it's only useful on gfx9 and older, and it's too
complicated.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/4011

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12812>
Authored by Marek Olšák on 2021-08-11 13:31:19 -04:00; committed by Marge Bot
parent 9e994560ff
commit 576f8394db
22 changed files with 62 additions and 1791 deletions


@ -773,12 +773,6 @@ radeonsi driver environment variables
Always use NGG culling even when it can hurt.
``nonggc``
Disable NGG culling.
``alwayspd``
Always enable the primitive discard compute shader.
``pd``
Enable the primitive discard compute shader for large draw calls.
``nopd``
Disable the primitive discard compute shader.
``switch_on_eop``
Program WD/IA to switch on end-of-packet.
``nooutoforder``
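
For context, the option names documented above map onto bit flags in radeonsi's debug-flag table (the actual table appears in the si_pipe.c hunk further down and is parsed by Mesa's debug-option helpers). Below is a minimal, self-contained sketch of that pattern; the enum values, option subset, and parser are simplified stand-ins, not the driver's real implementation.

#include <cstdint>
#include <cstdio>
#include <cstring>

enum { DBG_NO_NGG_CULLING, DBG_SWITCH_ON_EOP, DBG_NO_OUT_OF_ORDER };
#define DBG(name) (1ull << DBG_##name)   /* same shape as the driver's DBG() macro */

struct debug_named_value { const char *name; uint64_t flag; };

static const debug_named_value options[] = {
   { "nonggc",        DBG(NO_NGG_CULLING) },
   { "switch_on_eop", DBG(SWITCH_ON_EOP) },
   { "nooutoforder",  DBG(NO_OUT_OF_ORDER) },
};

/* Turn a comma-separated option string (e.g. the value of AMD_DEBUG) into a bit mask. */
static uint64_t parse_debug_flags(const char *str)
{
   char buf[256];
   uint64_t flags = 0;
   snprintf(buf, sizeof(buf), "%s", str ? str : "");
   for (char *tok = strtok(buf, ","); tok; tok = strtok(nullptr, ","))
      for (const debug_named_value &opt : options)
         if (!strcmp(tok, opt.name))
            flags |= opt.flag;
   return flags;
}

int main()
{
   uint64_t debug_flags = parse_debug_flags("nonggc,switch_on_eop");
   if (debug_flags & DBG(NO_NGG_CULLING))   /* mirrors checks such as
                                               sscreen->debug_flags & DBG(...) */
      puts("NGG culling disabled");
   return 0;
}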


@ -80,9 +80,6 @@ enum radeon_bo_flag
enum radeon_dependency_flag
{
/* Add the dependency to the parallel compute IB only. */
RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY = 1 << 0,
/* Instead of waiting for a job to finish execution, the dependency will
* be signaled when the job starts execution.
*/
@ -512,26 +509,6 @@ struct radeon_winsys {
struct pipe_fence_handle **fence),
void *flush_ctx, bool stop_exec_on_failure);
/**
* Add a parallel compute IB to a gfx IB. It will share the buffer list
* and fence dependencies with the gfx IB. The gfx flush call will submit
* both IBs at the same time.
*
* The compute IB doesn't have an output fence, so the primary IB has
* to use a wait packet for synchronization.
*
* The returned IB is only a stream for writing packets to the new
* IB. The only function that can be used on the compute cs is cs_check_space.
*
* \param compute_cs The returned structure of the command stream.
* \param gfx_cs Gfx IB
*
* \return true on success
*/
bool (*cs_add_parallel_compute_ib)(struct radeon_cmdbuf *compute_cs,
struct radeon_cmdbuf *gfx_cs,
bool uses_gds_ordered_append);
/**
* Set up and enable mid command buffer preemption for the command stream.
*

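To illustrate the contract described in the removed documentation above, here is a minimal sketch of how a driver could have driven this hook. Everything below is a stand-in invented for the example (stub structs and stub implementations); only the two function-pointer signatures follow the removed header code.

#include <cstdio>

struct radeon_cmdbuf { unsigned cdw; };   /* stub; the real struct lives in radeon_winsys.h */

struct radeon_winsys {                    /* stub containing only the two hooks used here */
   bool (*cs_add_parallel_compute_ib)(radeon_cmdbuf *compute_cs,
                                      radeon_cmdbuf *gfx_cs,
                                      bool uses_gds_ordered_append);
   bool (*cs_check_space)(radeon_cmdbuf *cs, unsigned dw, bool force_chaining);
};

static bool stub_add_ib(radeon_cmdbuf *, radeon_cmdbuf *, bool) { return true; }
static bool stub_check_space(radeon_cmdbuf *, unsigned, bool) { return true; }

int main()
{
   radeon_winsys ws = { stub_add_ib, stub_check_space };
   radeon_cmdbuf gfx_cs = {0}, compute_cs = {0};

   /* Attach the parallel compute IB. It shares the buffer list and fence
    * dependencies with the gfx IB, and the next gfx flush submits both. */
   if (!ws.cs_add_parallel_compute_ib(&compute_cs, &gfx_cs,
                                      /* uses_gds_ordered_append = */ true))
      return 1;

   /* Per the removed comment, cs_check_space is the only winsys entry point
    * usable on the returned compute stream; packets are then written directly. */
   ws.cs_check_space(&compute_cs, 128, false);

   /* The compute IB has no output fence, so ordering against it relies on a
    * wait packet emitted into the primary (gfx) IB. */
   puts("compute IB attached; the gfx flush submits both IBs together");
   return 0;
}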

@ -27,7 +27,6 @@ files_libradeonsi = files(
'si_build_pm4.h',
'si_clear.c',
'si_compute.c',
'si_compute_prim_discard.c',
'si_compute.h',
'si_compute_blit.c',
'si_cp_dma.c',

File diff suppressed because it is too large.


@ -230,10 +230,8 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
sdst->TC_L2_dirty = true;
/* If it's not a framebuffer fast clear... */
if (coher == SI_COHERENCY_SHADER) {
if (coher == SI_COHERENCY_SHADER)
sctx->num_cp_dma_calls++;
si_prim_discard_signal_next_compute_ib_start(sctx);
}
}
/**
@ -387,10 +385,8 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
si_resource(dst)->TC_L2_dirty = true;
/* If it's not a prefetch or GDS copy... */
if (dst && src && (dst != src || dst_offset != src_offset)) {
if (dst && src && (dst != src || dst_offset != src_offset))
sctx->num_cp_dma_calls++;
si_prim_discard_signal_next_compute_ib_start(sctx);
}
}
void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf,


@ -344,7 +344,6 @@ struct si_log_chunk_cs {
struct si_saved_cs *cs;
bool dump_bo_list;
unsigned gfx_begin, gfx_end;
unsigned compute_begin, compute_end;
};
static void si_log_chunk_type_cs_destroy(void *data)
@ -402,7 +401,6 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f)
struct si_context *ctx = chunk->ctx;
struct si_saved_cs *scs = chunk->cs;
int last_trace_id = -1;
int last_compute_trace_id = -1;
/* We are expecting that the ddebug pipe has already
* waited for the context, so this buffer should be idle.
@ -410,10 +408,8 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f)
*/
uint32_t *map = ctx->ws->buffer_map(ctx->ws, scs->trace_buf->buf, NULL,
PIPE_MAP_UNSYNCHRONIZED | PIPE_MAP_READ);
if (map) {
if (map)
last_trace_id = map[0];
last_compute_trace_id = map[1];
}
if (chunk->gfx_end != chunk->gfx_begin) {
if (chunk->gfx_begin == 0) {
@ -435,20 +431,6 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f)
}
}
if (chunk->compute_end != chunk->compute_begin) {
assert(ctx->prim_discard_compute_cs.priv);
if (scs->flushed) {
ac_parse_ib(f, scs->compute.ib + chunk->compute_begin,
chunk->compute_end - chunk->compute_begin, &last_compute_trace_id, map ? 1 : 0,
"Compute IB", ctx->chip_class, NULL, NULL);
} else {
si_parse_current_ib(f, &ctx->prim_discard_compute_cs, chunk->compute_begin,
chunk->compute_end, &last_compute_trace_id, map ? 1 : 0, "Compute IB",
ctx->chip_class);
}
}
if (chunk->dump_bo_list) {
fprintf(f, "Flushing. Time: ");
util_dump_ns(f, scs->time_flush);
@ -468,13 +450,8 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log, bool du
struct si_saved_cs *scs = ctx->current_saved_cs;
unsigned gfx_cur = ctx->gfx_cs.prev_dw + ctx->gfx_cs.current.cdw;
unsigned compute_cur = 0;
if (ctx->prim_discard_compute_cs.priv)
compute_cur =
ctx->prim_discard_compute_cs.prev_dw + ctx->prim_discard_compute_cs.current.cdw;
if (!dump_bo_list && gfx_cur == scs->gfx_last_dw && compute_cur == scs->compute_last_dw)
if (!dump_bo_list && gfx_cur == scs->gfx_last_dw)
return;
struct si_log_chunk_cs *chunk = calloc(1, sizeof(*chunk));
@ -487,10 +464,6 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log, bool du
chunk->gfx_end = gfx_cur;
scs->gfx_last_dw = gfx_cur;
chunk->compute_begin = scs->compute_last_dw;
chunk->compute_end = compute_cur;
scs->compute_last_dw = compute_cur;
u_log_chunk(log, &si_log_chunk_type_cs, chunk);
}


@ -73,7 +73,7 @@ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, unsigne
EVENT_INDEX(event == V_028A90_CS_DONE || event == V_028A90_PS_DONE ? 6 : 5) |
event_flags;
unsigned sel = EOP_DST_SEL(dst_sel) | EOP_INT_SEL(int_sel) | EOP_DATA_SEL(data_sel);
bool compute_ib = !ctx->has_graphics || cs == &ctx->prim_discard_compute_cs;
bool compute_ib = !ctx->has_graphics;
radeon_begin(cs);


@ -92,9 +92,6 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h
ctx->gfx_flush_in_progress = true;
if (radeon_emitted(&ctx->prim_discard_compute_cs, 0))
si_compute_signal_gfx(ctx);
if (ctx->has_graphics) {
if (!list_is_empty(&ctx->active_queries))
si_suspend_queries(ctx);
@ -136,29 +133,6 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h
si_log_hw_flush(ctx);
}
if (si_compute_prim_discard_enabled(ctx)) {
/* The compute IB can start after the previous gfx IB starts. */
if (radeon_emitted(&ctx->prim_discard_compute_cs, 0) && ctx->last_gfx_fence) {
ctx->ws->cs_add_fence_dependency(
&ctx->gfx_cs, ctx->last_gfx_fence,
RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY | RADEON_DEPENDENCY_START_FENCE);
}
/* Remember the last execution barrier. It's in the IB.
* It will signal the start of the next compute IB.
*/
if (flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW && ctx->last_pkt3_write_data) {
*ctx->last_pkt3_write_data = PKT3(PKT3_WRITE_DATA, 3, 0);
ctx->last_pkt3_write_data = NULL;
si_resource_reference(&ctx->last_ib_barrier_buf, ctx->barrier_buf);
ctx->last_ib_barrier_buf_offset = ctx->barrier_buf_offset;
si_resource_reference(&ctx->barrier_buf, NULL);
ws->fence_reference(&ctx->last_ib_barrier_fence, NULL);
}
}
if (ctx->is_noop)
flags |= RADEON_FLUSH_NOOP;
@ -171,17 +145,6 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h
ctx->num_gfx_cs_flushes++;
if (si_compute_prim_discard_enabled(ctx)) {
/* Remember the last execution barrier, which is the last fence
* in this case.
*/
if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) {
ctx->last_pkt3_write_data = NULL;
si_resource_reference(&ctx->last_ib_barrier_buf, NULL);
ws->fence_reference(&ctx->last_ib_barrier_fence, ctx->last_gfx_fence);
}
}
/* Check VM faults if needed. */
if (sscreen->debug_flags & DBG(CHECK_VM)) {
/* Use conservative timeout 800ms, after which we won't wait any
@ -216,7 +179,7 @@ static void si_begin_gfx_cs_debug(struct si_context *ctx)
pipe_reference_init(&ctx->current_saved_cs->reference, 1);
ctx->current_saved_cs->trace_buf =
si_resource(pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 8));
si_resource(pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 4));
if (!ctx->current_saved_cs->trace_buf) {
free(ctx->current_saved_cs);
ctx->current_saved_cs = NULL;
@ -368,11 +331,6 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
bool is_secure = false;
if (unlikely(radeon_uses_secure_bos(ctx->ws))) {
/* Disable features that don't work with TMZ:
* - primitive discard
*/
ctx->prim_discard_vertex_count_threshold = UINT_MAX;
is_secure = ctx->ws->cs_is_secure(&ctx->gfx_cs);
si_install_draw_wrapper(ctx, si_draw_vbo_tmz_preamble);
@ -549,18 +507,6 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
assert(!ctx->gfx_cs.prev_dw);
ctx->initial_gfx_cs_size = ctx->gfx_cs.current.cdw;
ctx->prim_discard_compute_ib_initialized = false;
/* Compute-based primitive discard:
* The index ring is divided into 2 halves. Switch between the halves
* in the same fashion as doublebuffering.
*/
if (ctx->index_ring_base)
ctx->index_ring_base = 0;
else
ctx->index_ring_base = ctx->index_ring_size_per_ib;
ctx->index_ring_offset = 0;
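In isolation, the doublebuffering removed above amounts to the small amount of bookkeeping sketched below; the field names mirror the si_context members removed later in this diff, and the ring size is arbitrary.

#include <cstdio>

struct index_ring_state {
   unsigned base;         /* offset of the per-IB half currently in use */
   unsigned offset;       /* allocation cursor within that half */
   unsigned size_per_ib;  /* size of one half */
};

/* Same flip as the removed code in si_begin_new_gfx_cs: every new gfx IB
 * switches to the other half of the index ring and resets its cursor. */
static void begin_new_gfx_cs(index_ring_state *s)
{
   s->base = s->base ? 0 : s->size_per_ib;
   s->offset = 0;
}

int main()
{
   index_ring_state ring = { 0, 0, 1024 * 1024 };   /* arbitrary 1 MiB halves */
   for (int ib = 0; ib < 4; ib++) {
      begin_new_gfx_cs(&ring);
      printf("gfx IB %d allocates indices from ring offset %u\n", ib, ring.base);
   }
   return 0;
}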
/* All buffer references are removed on a flush, so si_check_needs_implicit_sync
* cannot determine if si_make_CB_shader_coherent() needs to be called.
@ -586,34 +532,9 @@ void si_trace_emit(struct si_context *sctx)
u_log_flush(sctx->log);
}
void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx)
{
if (!si_compute_prim_discard_enabled(sctx))
return;
if (!sctx->barrier_buf) {
u_suballocator_alloc(&sctx->allocator_zeroed_memory, 4, 4, &sctx->barrier_buf_offset,
(struct pipe_resource **)&sctx->barrier_buf);
}
/* Emit a placeholder to signal the next compute IB to start.
* See si_compute_prim_discard.c for explanation.
*/
uint32_t signal = 1;
si_cp_write_data(sctx, sctx->barrier_buf, sctx->barrier_buf_offset, 4, V_370_MEM, V_370_ME,
&signal);
sctx->last_pkt3_write_data = &sctx->gfx_cs.current.buf[sctx->gfx_cs.current.cdw - 5];
/* Only the last occurrence of WRITE_DATA will be executed.
* The packet will be enabled in si_flush_gfx_cs.
*/
*sctx->last_pkt3_write_data = PKT3(PKT3_NOP, 3, 0);
}
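The helper removed above worked together with the flush-side code removed earlier in this file: a WRITE_DATA packet is emitted but its header is immediately rewritten to a NOP, and only the last such header gets patched back to WRITE_DATA in si_flush_gfx_cs, so exactly one signal executes per IB. A compact, self-contained model of that trick follows; the PKT3 encoding is simplified, and the real code tracked a raw pointer into the IB rather than an index.

#include <cstdint>
#include <cstdio>
#include <vector>

/* Simplified stand-in for the PKT3() header macro (no predicate bit). */
static uint32_t PKT3(uint32_t op, uint32_t count) { return (3u << 30) | (count << 16) | (op << 8); }
enum { PKT3_NOP = 0x10, PKT3_WRITE_DATA = 0x37 };

struct cmdbuf {
   std::vector<uint32_t> buf;
   long last_placeholder = -1;   /* index of the disabled WRITE_DATA header, or -1 */
};

/* Emit a 5-dword WRITE_DATA packet, then disable it by turning its header into
 * a NOP that skips the 4 payload dwords, remembering where the header lives. */
static void emit_signal_placeholder(cmdbuf &cs, uint32_t signal_value)
{
   cs.buf.insert(cs.buf.end(),
                 { PKT3(PKT3_WRITE_DATA, 3),
                   0 /* dst sel */, 0 /* addr lo */, 0 /* addr hi */, signal_value });
   cs.last_placeholder = (long)cs.buf.size() - 5;
   cs.buf[cs.last_placeholder] = PKT3(PKT3_NOP, 3);
}

/* At flush time, only the most recent placeholder is re-enabled. */
static void enable_last_placeholder(cmdbuf &cs)
{
   if (cs.last_placeholder >= 0) {
      cs.buf[cs.last_placeholder] = PKT3(PKT3_WRITE_DATA, 3);
      cs.last_placeholder = -1;
   }
}

int main()
{
   cmdbuf cs;
   emit_signal_placeholder(cs, 1);   /* stays a NOP */
   emit_signal_placeholder(cs, 1);   /* re-enabled at flush */
   enable_last_placeholder(cs);
   printf("first header 0x%08x, last header 0x%08x\n",
          (unsigned)cs.buf[0], (unsigned)cs.buf[5]);
   return 0;
}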
void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned cp_coher_cntl)
{
bool compute_ib = !sctx->has_graphics || cs == &sctx->prim_discard_compute_cs;
bool compute_ib = !sctx->has_graphics;
assert(sctx->chip_class <= GFX9);
@ -857,14 +778,6 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
uint32_t cp_coher_cntl = 0;
const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB);
const bool is_barrier =
flush_cb_db ||
/* INV_ICACHE == beginning of gfx IB. Checking
* INV_ICACHE fixes corruption for DeusExMD with
* compute-based culling, but I don't know why.
*/
flags & (SI_CONTEXT_INV_ICACHE | SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_VS_PARTIAL_FLUSH) ||
(flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy);
assert(sctx->chip_class <= GFX9);
@ -1077,9 +990,6 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
radeon_end();
}
if (is_barrier)
si_prim_discard_signal_next_compute_ib_start(sctx);
if (flags & SI_CONTEXT_START_PIPELINE_STATS && sctx->pipeline_stats_enabled != 1) {
radeon_begin(cs);
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));


@ -95,9 +95,6 @@ static const struct debug_named_value radeonsi_debug_options[] = {
{"nggc", DBG(ALWAYS_NGG_CULLING_ALL), "Always use NGG culling even when it can hurt."},
{"nggctess", DBG(ALWAYS_NGG_CULLING_TESS), "Always use NGG culling for tessellation."},
{"nonggc", DBG(NO_NGG_CULLING), "Disable NGG culling."},
{"alwayspd", DBG(ALWAYS_PD), "Always enable the primitive discard compute shader."},
{"pd", DBG(PD), "Enable the primitive discard compute shader for large draw calls."},
{"nopd", DBG(NO_PD), "Disable the primitive discard compute shader."},
{"switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet."},
{"nooutoforder", DBG(NO_OUT_OF_ORDER), "Disable out-of-order rasterization"},
{"nodpbb", DBG(NO_DPBB), "Disable DPBB."},
@ -309,12 +306,8 @@ static void si_destroy_context(struct pipe_context *context)
u_suballocator_destroy(&sctx->allocator_zeroed_memory);
sctx->ws->fence_reference(&sctx->last_gfx_fence, NULL);
sctx->ws->fence_reference(&sctx->last_ib_barrier_fence, NULL);
si_resource_reference(&sctx->eop_bug_scratch, NULL);
si_resource_reference(&sctx->eop_bug_scratch_tmz, NULL);
si_resource_reference(&sctx->index_ring, NULL);
si_resource_reference(&sctx->barrier_buf, NULL);
si_resource_reference(&sctx->last_ib_barrier_buf, NULL);
si_resource_reference(&sctx->shadowed_regs, NULL);
radeon_bo_reference(sctx->screen->ws, &sctx->gds, NULL);
radeon_bo_reference(sctx->screen->ws, &sctx->gds_oa, NULL);
@ -618,12 +611,6 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
default:
unreachable("unhandled chip class");
}
si_initialize_prim_discard_tunables(sscreen, flags & SI_CONTEXT_FLAG_AUX,
&sctx->prim_discard_vertex_count_threshold,
&sctx->index_ring_size_per_ib);
} else {
sctx->prim_discard_vertex_count_threshold = UINT_MAX;
}
sctx->sample_mask = 0xffff;
@ -641,7 +628,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
sctx->b.create_video_buffer = vl_video_buffer_create;
}
if (sctx->chip_class >= GFX9 || si_compute_prim_discard_enabled(sctx)) {
if (sctx->chip_class >= GFX9) {
sctx->wait_mem_scratch =
si_aligned_buffer_create(screen,
SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL,
@ -1167,15 +1154,10 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
sscreen->max_memory_usage_kb = sscreen->info.vram_size_kb + sscreen->info.gart_size_kb / 4 * 3;
unsigned prim_discard_vertex_count_threshold, tmp;
si_initialize_prim_discard_tunables(sscreen, false, &prim_discard_vertex_count_threshold, &tmp);
/* Compute-shader-based culling doesn't support VBOs in user SGPRs. */
if (prim_discard_vertex_count_threshold == UINT_MAX) {
/* This decreases CPU overhead if all descriptors are in user SGPRs because we don't
* have to allocate and count references for the upload buffer.
*/
sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1;
}
/* This decreases CPU overhead if all descriptors are in user SGPRs because we don't
* have to allocate and count references for the upload buffer.
*/
sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1;
/* Determine tessellation ring info. */
bool double_offchip_buffers = sscreen->info.chip_class >= GFX7 &&


@ -44,7 +44,6 @@ extern "C" {
#endif
#define ATI_VENDOR_ID 0x1002
#define SI_PRIM_DISCARD_DEBUG 0
#define SI_NOT_QUERY 0xffffffff
/* The base vertex and primitive restart can be any number, but we must pick
@ -155,11 +154,6 @@ enum si_has_ngg {
NGG_ON,
};
enum si_has_prim_discard_cs {
PRIM_DISCARD_CS_OFF,
PRIM_DISCARD_CS_ON,
};
enum si_clear_code
{
DCC_CLEAR_COLOR_0000 = 0x00000000,
@ -223,9 +217,6 @@ enum
DBG_ALWAYS_NGG_CULLING_TESS,
DBG_NO_NGG_CULLING,
DBG_NO_FAST_LAUNCH,
DBG_ALWAYS_PD,
DBG_PD,
DBG_NO_PD,
DBG_SWITCH_ON_EOP,
DBG_NO_OUT_OF_ORDER,
DBG_NO_DPBB,
@ -896,7 +887,6 @@ struct si_saved_cs {
unsigned trace_id;
unsigned gfx_last_dw;
unsigned compute_last_dw;
bool flushed;
int64_t time_flush;
};
@ -995,26 +985,6 @@ struct si_context {
/* NGG streamout. */
struct pb_buffer *gds;
struct pb_buffer *gds_oa;
/* Compute-based primitive discard. */
unsigned prim_discard_vertex_count_threshold;
struct radeon_cmdbuf prim_discard_compute_cs;
struct si_shader *compute_ib_last_shader;
uint32_t compute_rewind_va;
unsigned compute_num_prims_in_batch;
/* index_ring is divided into 2 halves for doublebuffering. */
struct si_resource *index_ring;
unsigned index_ring_base; /* offset of a per-IB portion */
unsigned index_ring_offset; /* offset within a per-IB portion */
unsigned index_ring_size_per_ib; /* max available size per IB */
bool prim_discard_compute_ib_initialized;
/* For tracking the last execution barrier - it can be either
* a WRITE_DATA packet or a fence. */
uint32_t *last_pkt3_write_data;
struct si_resource *barrier_buf;
unsigned barrier_buf_offset;
struct pipe_fence_handle *last_ib_barrier_fence;
struct si_resource *last_ib_barrier_buf;
unsigned last_ib_barrier_buf_offset;
/* Atoms (direct states). */
union si_state_atoms atoms;
@ -1063,7 +1033,6 @@ struct si_context {
/* indexed access using pipe_shader_type (not by MESA_SHADER_*) */
struct si_shader_ctx_state shaders[SI_NUM_GRAPHICS_SHADERS];
};
struct si_shader_ctx_state cs_prim_discard_state;
struct si_cs_shader_state cs_shader_state;
/* shader information */
@ -1254,9 +1223,6 @@ struct si_context {
unsigned num_resident_handles;
uint64_t num_alloc_tex_transfer_bytes;
unsigned last_tex_ps_draw_ratio; /* for query */
unsigned compute_num_verts_accepted;
unsigned compute_num_verts_rejected;
unsigned compute_num_verts_ineligible; /* due to low vertex count */
unsigned context_roll;
/* Queries. */
@ -1287,7 +1253,7 @@ struct si_context {
*/
struct hash_table *dirty_implicit_resources;
pipe_draw_vbo_func draw_vbo[2][2][2][2];
pipe_draw_vbo_func draw_vbo[2][2][2];
/* When b.draw_vbo is a wrapper, real_draw_vbo is the real draw_vbo function */
pipe_draw_vbo_func real_draw_vbo;
@ -1483,7 +1449,6 @@ void si_allocate_gds(struct si_context *ctx);
void si_set_tracked_regs_to_clear_state(struct si_context *ctx);
void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs);
void si_trace_emit(struct si_context *sctx);
void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx);
void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs,
unsigned cp_coher_cntl);
void gfx10_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);
@ -1502,32 +1467,6 @@ unsigned si_end_counter(struct si_screen *sscreen, unsigned type, uint64_t begin
void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs);
void si_init_compute_functions(struct si_context *sctx);
/* si_compute_prim_discard.c */
enum si_prim_discard_outcome
{
SI_PRIM_DISCARD_ENABLED,
SI_PRIM_DISCARD_DISABLED,
SI_PRIM_DISCARD_DRAW_SPLIT,
SI_PRIM_DISCARD_MULTI_DRAW_SPLIT,
};
void si_build_prim_discard_compute_shader(struct si_shader_context *ctx);
enum si_prim_discard_outcome
si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info,
unsigned drawid_offset,
const struct pipe_draw_start_count_bias *draws,
unsigned num_draws, unsigned total_count);
void si_compute_signal_gfx(struct si_context *sctx);
void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
const struct pipe_draw_info *info,
const struct pipe_draw_start_count_bias *draws,
unsigned num_draws, unsigned index_size,
unsigned total_count, uint64_t input_indexbuf_va,
unsigned index_max_size);
void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context,
unsigned *prim_discard_vertex_count_threshold,
unsigned *index_ring_size_per_ib);
/* si_pipe.c */
void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compiler);
@ -1996,14 +1935,9 @@ static inline void radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sc
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, bo, usage, priority);
}
static inline bool si_compute_prim_discard_enabled(struct si_context *sctx)
{
return sctx->prim_discard_vertex_count_threshold != UINT_MAX;
}
static inline unsigned si_get_wave_size(struct si_screen *sscreen,
gl_shader_stage stage, bool ngg, bool es,
bool gs_fast_launch, bool prim_discard_cs)
bool gs_fast_launch)
{
if (stage == MESA_SHADER_COMPUTE)
return sscreen->compute_wave_size;
@ -2011,8 +1945,7 @@ static inline unsigned si_get_wave_size(struct si_screen *sscreen,
return sscreen->ps_wave_size;
else if (gs_fast_launch)
return 32; /* GS fast launch hangs with Wave64, so always use Wave32. */
else if ((stage == MESA_SHADER_VERTEX && prim_discard_cs) || /* only Wave64 implemented */
(stage == MESA_SHADER_VERTEX && es && !ngg) ||
else if ((stage == MESA_SHADER_VERTEX && es && !ngg) ||
(stage == MESA_SHADER_TESS_EVAL && es && !ngg) ||
(stage == MESA_SHADER_GEOMETRY && !ngg)) /* legacy GS only supports Wave64 */
return 64;
@ -2025,18 +1958,14 @@ static inline unsigned si_get_shader_wave_size(struct si_shader *shader)
return si_get_wave_size(shader->selector->screen, shader->selector->info.stage,
shader->key.as_ngg,
shader->key.as_es,
shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL,
shader->key.opt.vs_as_prim_discard_cs);
shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
}
static inline void si_select_draw_vbo(struct si_context *sctx)
{
bool has_prim_discard_cs = si_compute_prim_discard_enabled(sctx) &&
!sctx->shader.tes.cso && !sctx->shader.gs.cso;
pipe_draw_vbo_func draw_vbo = sctx->draw_vbo[!!sctx->shader.tes.cso]
[!!sctx->shader.gs.cso]
[sctx->ngg]
[has_prim_discard_cs];
[sctx->ngg];
assert(draw_vbo);
if (unlikely(sctx->real_draw_vbo))
sctx->real_draw_vbo = draw_vbo;


@ -260,15 +260,6 @@ static bool si_query_sw_begin(struct si_context *sctx, struct si_query *squery)
case SI_QUERY_DISK_SHADER_CACHE_MISSES:
query->begin_result = sctx->screen->num_disk_shader_cache_misses;
break;
case SI_QUERY_PD_NUM_PRIMS_ACCEPTED:
query->begin_result = sctx->compute_num_verts_accepted;
break;
case SI_QUERY_PD_NUM_PRIMS_REJECTED:
query->begin_result = sctx->compute_num_verts_rejected;
break;
case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE:
query->begin_result = sctx->compute_num_verts_ineligible;
break;
case SI_QUERY_GPIN_ASIC_ID:
case SI_QUERY_GPIN_NUM_SIMD:
case SI_QUERY_GPIN_NUM_RB:
@ -429,15 +420,6 @@ static bool si_query_sw_end(struct si_context *sctx, struct si_query *squery)
case SI_QUERY_DISK_SHADER_CACHE_MISSES:
query->end_result = sctx->screen->num_disk_shader_cache_misses;
break;
case SI_QUERY_PD_NUM_PRIMS_ACCEPTED:
query->end_result = sctx->compute_num_verts_accepted;
break;
case SI_QUERY_PD_NUM_PRIMS_REJECTED:
query->end_result = sctx->compute_num_verts_rejected;
break;
case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE:
query->end_result = sctx->compute_num_verts_ineligible;
break;
case SI_QUERY_GPIN_ASIC_ID:
case SI_QUERY_GPIN_NUM_SIMD:
case SI_QUERY_GPIN_NUM_RB:
@ -479,11 +461,6 @@ static bool si_query_sw_get_result(struct si_context *sctx, struct si_query *squ
result->u64 =
(query->end_result - query->begin_result) * 100 / (query->end_time - query->begin_time);
return true;
case SI_QUERY_PD_NUM_PRIMS_ACCEPTED:
case SI_QUERY_PD_NUM_PRIMS_REJECTED:
case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE:
result->u64 = ((unsigned)query->end_result - (unsigned)query->begin_result) / 3;
return true;
case SI_QUERY_GPIN_ASIC_ID:
result->u32 = 0;
return true;
@ -1758,10 +1735,6 @@ static struct pipe_driver_query_info si_driver_query_list[] = {
X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE),
X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE),
X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE),
X("pd-num-prims-accepted", PD_NUM_PRIMS_ACCEPTED, UINT64, AVERAGE),
X("pd-num-prims-rejected", PD_NUM_PRIMS_REJECTED, UINT64, AVERAGE),
X("pd-num-prims-ineligible", PD_NUM_PRIMS_INELIGIBLE, UINT64, AVERAGE),
};
#undef X


@ -111,9 +111,6 @@ enum
SI_QUERY_GPIN_NUM_RB,
SI_QUERY_GPIN_NUM_SPI,
SI_QUERY_GPIN_NUM_SE,
SI_QUERY_PD_NUM_PRIMS_ACCEPTED,
SI_QUERY_PD_NUM_PRIMS_REJECTED,
SI_QUERY_PD_NUM_PRIMS_INELIGIBLE,
SI_QUERY_LIVE_SHADER_CACHE_HITS,
SI_QUERY_LIVE_SHADER_CACHE_MISSES,
SI_QUERY_MEMORY_SHADER_CACHE_HITS,


@ -419,12 +419,6 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader)
/* VGPRs */
declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
/* Return values */
if (shader->key.opt.vs_as_prim_discard_cs) {
for (i = 0; i < 4; i++)
ac_add_return(&ctx->args, AC_ARG_VGPR);
}
break;
case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */
@ -1070,8 +1064,6 @@ const char *si_get_shader_name(const struct si_shader *shader)
return "Vertex Shader as ES";
else if (shader->key.as_ls)
return "Vertex Shader as LS";
else if (shader->key.opt.vs_as_prim_discard_cs)
return "Vertex Shader as Primitive Discard CS";
else if (shader->key.as_ngg)
return "Vertex Shader as ESGS";
else
@ -1183,12 +1175,6 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
fprintf(f, " as_ls = %u\n", key->as_ls);
fprintf(f, " as_ngg = %u\n", key->as_ngg);
fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id);
fprintf(f, " opt.vs_as_prim_discard_cs = %u\n", key->opt.vs_as_prim_discard_cs);
fprintf(f, " opt.cs_prim_type = %s\n", tgsi_primitive_names[key->opt.cs_prim_type]);
fprintf(f, " opt.cs_indexed = %u\n", key->opt.cs_indexed);
fprintf(f, " opt.cs_provoking_vertex_first = %u\n", key->opt.cs_provoking_vertex_first);
fprintf(f, " opt.cs_cull_front = %u\n", key->opt.cs_cull_front);
fprintf(f, " opt.cs_cull_back = %u\n", key->opt.cs_cull_back);
break;
case MESA_SHADER_TESS_CTRL:
@ -1317,7 +1303,6 @@ void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_
key->vs_prolog.as_ls = shader_out->key.as_ls;
key->vs_prolog.as_es = shader_out->key.as_es;
key->vs_prolog.as_ngg = shader_out->key.as_ngg;
key->vs_prolog.as_prim_discard_cs = shader_out->key.opt.vs_as_prim_discard_cs;
if (ngg_cull_shader) {
key->vs_prolog.gs_fast_launch_tri_list =
@ -1342,8 +1327,7 @@ void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_
/* Only one of these combinations can be set. as_ngg can be set with as_es. */
assert(key->vs_prolog.as_ls + key->vs_prolog.as_ngg +
(key->vs_prolog.as_es && !key->vs_prolog.as_ngg) + key->vs_prolog.as_prim_discard_cs <=
1);
(key->vs_prolog.as_es && !key->vs_prolog.as_ngg) <= 1);
/* Enable loading the InstanceID VGPR. */
uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
@ -1557,7 +1541,6 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
(key->vs_prolog.gs_fast_launch_tri_list ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST : 0) |
(key->vs_prolog.gs_fast_launch_tri_strip ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP : 0) |
SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(key->vs_prolog.gs_fast_launch_index_size_packed);
shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs;
break;
case MESA_SHADER_TESS_CTRL:
assert(!prolog);
@ -1581,8 +1564,7 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
si_llvm_context_init(&ctx, sscreen, compiler,
si_get_wave_size(sscreen, stage,
shader.key.as_ngg, shader.key.as_es,
shader.key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL,
shader.key.opt.vs_as_prim_discard_cs));
shader.key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL));
ctx.shader = &shader;
ctx.stage = stage;


@ -446,7 +446,6 @@ struct si_shader_selector {
ubyte const_and_shader_buf_descriptors_index;
ubyte sampler_and_images_descriptors_index;
bool vs_needs_prolog;
bool prim_discard_cs_allowed;
ubyte cs_shaderbufs_sgpr_index;
ubyte cs_num_shaderbufs_in_user_sgprs;
ubyte cs_images_sgpr_index;
@ -577,7 +576,6 @@ union si_shader_part_key {
unsigned as_ls : 1;
unsigned as_es : 1;
unsigned as_ngg : 1;
unsigned as_prim_discard_cs : 1;
unsigned gs_fast_launch_tri_list : 1; /* for NGG culling */
unsigned gs_fast_launch_tri_strip : 1; /* for NGG culling */
unsigned gs_fast_launch_index_size_packed : 2;
@ -684,14 +682,6 @@ struct si_shader_key {
*/
unsigned prefer_mono : 1;
/* Primitive discard compute shader. */
unsigned vs_as_prim_discard_cs : 1;
unsigned cs_prim_type : 4;
unsigned cs_indexed : 1;
unsigned cs_provoking_vertex_first : 1;
unsigned cs_cull_front : 1;
unsigned cs_cull_back : 1;
/* VS and TCS have the same number of patch vertices. */
unsigned same_patch_vertices:1;


@ -804,9 +804,6 @@ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *part
!same_thread_count && si_is_multi_part_shader(ctx->shader))
ac_build_endif(&ctx->ac, 6507);
/* Return the value from the last part. It's non-void only for the prim
* discard compute shader.
*/
if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
LLVMBuildRetVoid(builder);
else
@ -1116,9 +1113,6 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
parts[num_parts++] = main_fn;
si_build_wrapper_function(&ctx, parts, num_parts, first_is_prolog ? 1 : 0, 0, false);
if (ctx.shader->key.opt.vs_as_prim_discard_cs)
si_build_prim_discard_compute_shader(&ctx);
} else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_EVAL && ngg_cull_main_fn) {
LLVMValueRef parts[3], prolog, main_fn = ctx.main_fn;
@ -1289,8 +1283,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
}
/* Make sure the input is a pointer and not integer followed by inttoptr. */
if (!shader->key.opt.vs_as_prim_discard_cs)
assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind);
assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind);
/* Compile to bytecode. */
if (!si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, &ctx.ac, debug,


@ -431,7 +431,7 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
si_llvm_context_init(&ctx, sscreen, compiler,
si_get_wave_size(sscreen, MESA_SHADER_VERTEX,
false, false, false, false));
false, false, false));
ctx.shader = shader;
ctx.stage = MESA_SHADER_VERTEX;


@ -793,32 +793,6 @@ void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi)
FREE(outputs);
}
static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi)
{
struct si_shader_context *ctx = si_shader_context_from_abi(abi);
struct si_shader_info *info = &ctx->shader->selector->info;
LLVMValueRef *addrs = abi->outputs;
LLVMValueRef pos[4] = {};
assert(info->num_outputs <= AC_LLVM_MAX_OUTPUTS);
for (unsigned i = 0; i < info->num_outputs; i++) {
if (info->output_semantic[i] != VARYING_SLOT_POS)
continue;
for (unsigned chan = 0; chan < 4; chan++)
pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
break;
}
assert(pos[0] != NULL);
/* Return the position output. */
LLVMValueRef ret = ctx->return_value;
for (unsigned chan = 0; chan < 4; chan++)
ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, "");
ctx->return_value = ret;
}
/**
* Build the vertex shader prolog function.
*
@ -1121,8 +1095,6 @@ void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shad
ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
else if (shader->key.as_es)
ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
else if (shader->key.opt.vs_as_prim_discard_cs)
ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
else if (ngg_cull_shader)
ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue;
else if (shader->key.as_ngg)


@ -971,7 +971,7 @@ static void si_emit_draw_registers(struct si_context *sctx,
} \
} while (0)
template <chip_class GFX_VERSION, si_has_ngg NGG, si_has_prim_discard_cs ALLOW_PRIM_DISCARD_CS>
template <chip_class GFX_VERSION, si_has_ngg NGG>
ALWAYS_INLINE
static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw_info *info,
unsigned drawid_base,
@ -980,7 +980,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
unsigned num_draws, unsigned total_count,
struct pipe_resource *indexbuf, unsigned index_size,
unsigned index_offset, unsigned instance_count,
bool dispatch_prim_discard_cs, unsigned original_index_size)
unsigned original_index_size)
{
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
@ -1042,22 +1042,19 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
sctx->last_index_size = index_size;
}
/* If !ALLOW_PRIM_DISCARD_CS, index_size == original_index_size. */
if (!ALLOW_PRIM_DISCARD_CS || original_index_size) {
index_max_size = (indexbuf->width0 - index_offset) >> util_logbase2(original_index_size);
/* Skip draw calls with 0-sized index buffers.
* They cause a hang on some chips, like Navi10-14.
*/
if (!index_max_size) {
radeon_end();
return;
}
index_va = si_resource(indexbuf)->gpu_address + index_offset;
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(indexbuf), RADEON_USAGE_READ,
RADEON_PRIO_INDEX_BUFFER);
index_max_size = (indexbuf->width0 - index_offset) >> util_logbase2(index_size);
/* Skip draw calls with 0-sized index buffers.
* They cause a hang on some chips, like Navi10-14.
*/
if (!index_max_size) {
radeon_end();
return;
}
index_va = si_resource(indexbuf)->gpu_address + index_offset;
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(indexbuf), RADEON_USAGE_READ,
RADEON_PRIO_INDEX_BUFFER);
} else {
/* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
* so the state must be re-emitted before the next indexed draw.
@ -1190,16 +1187,6 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
bool increment_draw_id = num_draws > 1 && set_draw_id && info->increment_draw_id;
if (index_size) {
if (ALLOW_PRIM_DISCARD_CS && dispatch_prim_discard_cs) {
radeon_end();
si_dispatch_prim_discard_cs_and_draw(sctx, info, draws, num_draws,
original_index_size, total_count, index_va,
index_max_size);
EMIT_SQTT_END_DRAW;
return;
}
/* NOT_EOP allows merging multiple draws into 1 wave, but only user VGPRs
* can be changed between draws, and GS fast launch must be disabled.
* NOT_EOP doesn't work on gfx9 and older.
@ -1629,100 +1616,12 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i
info->restart_index, min_vertex_count);
}
static bool si_all_vs_resources_read_only(struct si_context *sctx, struct pipe_resource *indexbuf)
{
struct radeon_winsys *ws = sctx->ws;
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
struct si_descriptors *buffers =
&sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX)];
struct si_shader_selector *vs = sctx->shader.vs.cso;
struct si_vertex_elements *velems = sctx->vertex_elements;
unsigned num_velems = velems->count;
unsigned num_images = vs->info.base.num_images;
/* Index buffer. */
if (indexbuf && ws->cs_is_buffer_referenced(cs, si_resource(indexbuf)->buf, RADEON_USAGE_WRITE))
goto has_write_reference;
/* Vertex buffers. */
for (unsigned i = 0; i < num_velems; i++) {
if (!((1 << i) & velems->first_vb_use_mask))
continue;
unsigned vb_index = velems->vertex_buffer_index[i];
struct pipe_resource *res = sctx->vertex_buffer[vb_index].buffer.resource;
if (!res)
continue;
if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE))
goto has_write_reference;
}
/* Constant and shader buffers. */
for (unsigned i = 0; i < buffers->num_active_slots; i++) {
unsigned index = buffers->first_active_slot + i;
struct pipe_resource *res = sctx->const_and_shader_buffers[PIPE_SHADER_VERTEX].buffers[index];
if (!res)
continue;
if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE))
goto has_write_reference;
}
/* Samplers. */
if (vs->info.base.textures_used[0]) {
unsigned num_samplers = BITSET_LAST_BIT(vs->info.base.textures_used);
for (unsigned i = 0; i < num_samplers; i++) {
struct pipe_sampler_view *view = sctx->samplers[PIPE_SHADER_VERTEX].views[i];
if (!view)
continue;
if (ws->cs_is_buffer_referenced(cs, si_resource(view->texture)->buf, RADEON_USAGE_WRITE))
goto has_write_reference;
}
}
/* Images. */
if (num_images) {
for (unsigned i = 0; i < num_images; i++) {
struct pipe_resource *res = sctx->images[PIPE_SHADER_VERTEX].views[i].resource;
if (!res)
continue;
if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE))
goto has_write_reference;
}
}
return true;
has_write_reference:
/* If the current gfx IB has enough packets, flush it to remove write
* references to buffers.
*/
if (cs->prev_dw + cs->current.cdw > 2048) {
si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
assert(si_all_vs_resources_read_only(sctx, indexbuf));
return true;
}
return false;
}
static ALWAYS_INLINE bool pd_msg(const char *s)
{
if (SI_PRIM_DISCARD_DEBUG)
printf("PD failed: %s\n", s);
return false;
}
#define DRAW_CLEANUP do { \
if (index_size && indexbuf != info->index.resource) \
pipe_resource_reference(&indexbuf, NULL); \
} while (0)
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
si_has_prim_discard_cs ALLOW_PRIM_DISCARD_CS>
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG>
static void si_draw_vbo(struct pipe_context *ctx,
const struct pipe_draw_info *info,
unsigned drawid_offset,
@ -1910,70 +1809,8 @@ static void si_draw_vbo(struct pipe_context *ctx,
info->primitive_restart &&
(!sctx->screen->options.prim_restart_tri_strips_only ||
(prim != PIPE_PRIM_TRIANGLE_STRIP && prim != PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY));
bool dispatch_prim_discard_cs = false;
unsigned original_index_size = index_size;
/* Determine if we can use the primitive discard compute shader. */
/* TODO: this requires that primitives can be drawn out of order, so check depth/stencil/blend states. */
if (ALLOW_PRIM_DISCARD_CS &&
(total_direct_count > sctx->prim_discard_vertex_count_threshold
? (sctx->compute_num_verts_rejected += total_direct_count, true)
: /* Add, then return true. */
(sctx->compute_num_verts_ineligible += total_direct_count,
false)) && /* Add, then return false. */
(!primitive_restart || pd_msg("primitive restart")) &&
/* Supported prim types. */
(1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP)) &&
(instance_count == 1 || pd_msg("instancing")) &&
((drawid_offset == 0 && (num_draws == 1 || !info->increment_draw_id)) ||
!sctx->shader.vs.cso->info.uses_drawid || pd_msg("draw_id > 0")) &&
(!sctx->render_cond || pd_msg("render condition")) &&
/* Forced enablement ignores pipeline statistics queries. */
(sctx->screen->debug_flags & (DBG(PD) | DBG(ALWAYS_PD)) ||
(!sctx->num_pipeline_stat_queries && !sctx->streamout.prims_gen_query_enabled) ||
pd_msg("pipestat or primgen query")) &&
(!sctx->vertex_elements->instance_divisor_is_fetched || pd_msg("loads instance divisors")) &&
(!sctx->shader.ps.cso->info.uses_primid || pd_msg("PS uses PrimID")) &&
!rs->polygon_mode_enabled &&
#if SI_PRIM_DISCARD_DEBUG /* same as cso->prim_discard_cs_allowed */
(!sctx->shader.vs.cso->info.uses_bindless_images || pd_msg("uses bindless images")) &&
(!sctx->shader.vs.cso->info.uses_bindless_samplers || pd_msg("uses bindless samplers")) &&
(!sctx->shader.vs.cso->info.base.writes_memory || pd_msg("writes memory")) &&
(!sctx->shader.vs.cso->info.writes_viewport_index || pd_msg("writes viewport index")) &&
!sctx->shader.vs.cso->info.base.vs.window_space_position &&
!sctx->shader.vs.cso->so.num_outputs &&
#else
(sctx->shader.vs.cso->prim_discard_cs_allowed ||
pd_msg("VS shader uses unsupported features")) &&
#endif
/* Check that all buffers are used for read only, because compute
* dispatches can run ahead. */
(si_all_vs_resources_read_only(sctx, index_size ? indexbuf : NULL) ||
pd_msg("write reference"))) {
switch (si_prepare_prim_discard_or_split_draw(sctx, info, drawid_offset, draws, num_draws,
total_direct_count)) {
case SI_PRIM_DISCARD_ENABLED:
original_index_size = index_size;
dispatch_prim_discard_cs = true;
/* The compute shader changes/lowers the following: */
prim = PIPE_PRIM_TRIANGLES;
index_size = 4;
instance_count = 1;
sctx->compute_num_verts_rejected -= total_direct_count;
sctx->compute_num_verts_accepted += total_direct_count;
break;
case SI_PRIM_DISCARD_DISABLED:
break;
case SI_PRIM_DISCARD_DRAW_SPLIT:
case SI_PRIM_DISCARD_MULTI_DRAW_SPLIT:
sctx->compute_num_verts_rejected -= total_direct_count;
/* The multi draw was split into multiple ones and executed. Return. */
DRAW_CLEANUP;
return;
}
}
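A note on the shape of the condition removed above: every requirement is written as (condition || pd_msg("reason")), so the first failing requirement both short-circuits the chain and, when SI_PRIM_DISCARD_DEBUG is enabled, logs why the compute path was skipped. A self-contained illustration of the idiom follows; the stand-in macro is set to 1 so the example prints, whereas the removed code defaults to 0.

#include <cstdio>

#define PRIM_DISCARD_DEBUG 1   /* stand-in for SI_PRIM_DISCARD_DEBUG */

/* Log the reason a requirement failed, then make the whole chain false. */
static inline bool pd_msg(const char *s)
{
   if (PRIM_DISCARD_DEBUG)
      printf("PD failed: %s\n", s);
   return false;
}

int main()
{
   bool primitive_restart = true;   /* example state that disqualifies the fast path */
   int instance_count = 1;

   bool eligible =
      (!primitive_restart || pd_msg("primitive restart")) &&
      (instance_count == 1 || pd_msg("instancing"));

   printf("eligible: %d\n", eligible);   /* prints "PD failed: primitive restart", then 0 */
   return 0;
}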
/* Set the rasterization primitive type.
*
* This must be done after si_decompress_textures, which can call
@ -2005,7 +1842,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
if (GFX_VERSION >= GFX10) {
struct si_shader_selector *hw_vs = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->cso;
if (NGG && !HAS_GS && !dispatch_prim_discard_cs &&
if (NGG && !HAS_GS &&
/* Tessellation sets ngg_cull_vert_threshold to UINT_MAX if the prim type
* is not triangles, so this check is only needed without tessellation. */
(HAS_TESS || sctx->current_rast_prim == PIPE_PRIM_TRIANGLES) &&
@ -2154,10 +1991,9 @@ static void si_draw_vbo(struct pipe_context *ctx,
}
assert(sctx->dirty_atoms == 0);
si_emit_draw_packets<GFX_VERSION, NGG, ALLOW_PRIM_DISCARD_CS>
si_emit_draw_packets<GFX_VERSION, NGG>
(sctx, info, drawid_offset, indirect, draws, num_draws, total_direct_count, indexbuf,
index_size, index_offset, instance_count, dispatch_prim_discard_cs,
original_index_size);
index_size, index_offset, instance_count, original_index_size);
/* <-- CUs are busy here. */
/* Start prefetches after the draw has been started. Both will run
@ -2193,10 +2029,9 @@ static void si_draw_vbo(struct pipe_context *ctx,
}
assert(sctx->dirty_atoms == 0);
si_emit_draw_packets<GFX_VERSION, NGG, ALLOW_PRIM_DISCARD_CS>
si_emit_draw_packets<GFX_VERSION, NGG>
(sctx, info, drawid_offset, indirect, draws, num_draws, total_direct_count, indexbuf,
index_size, index_offset, instance_count, dispatch_prim_discard_cs,
original_index_size);
index_size, index_offset, instance_count, original_index_size);
/* Prefetch the remaining shaders after the draw has been
* started. */
@ -2281,40 +2116,27 @@ static void si_draw_rectangle(struct blitter_context *blitter, void *vertex_elem
pipe->draw_vbo(pipe, &info, 0, NULL, &draw, 1);
}
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS,
si_has_ngg NGG, si_has_prim_discard_cs ALLOW_PRIM_DISCARD_CS>
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG>
static void si_init_draw_vbo(struct si_context *sctx)
{
/* Prim discard CS is only useful on gfx7+ because gfx6 doesn't have async compute. */
if (ALLOW_PRIM_DISCARD_CS && GFX_VERSION < GFX8)
return;
if (ALLOW_PRIM_DISCARD_CS && (HAS_TESS || HAS_GS))
return;
if (NGG && GFX_VERSION < GFX10)
return;
sctx->draw_vbo[HAS_TESS][HAS_GS][NGG][ALLOW_PRIM_DISCARD_CS] =
si_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG, ALLOW_PRIM_DISCARD_CS>;
}
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS>
static void si_init_draw_vbo_all_internal_options(struct si_context *sctx)
{
si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_OFF, PRIM_DISCARD_CS_OFF>(sctx);
si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_OFF, PRIM_DISCARD_CS_ON>(sctx);
si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_ON, PRIM_DISCARD_CS_OFF>(sctx);
si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_ON, PRIM_DISCARD_CS_ON>(sctx);
sctx->draw_vbo[HAS_TESS][HAS_GS][NGG] =
si_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG>;
}
template <chip_class GFX_VERSION>
static void si_init_draw_vbo_all_pipeline_options(struct si_context *sctx)
{
si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_OFF, GS_OFF>(sctx);
si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_OFF, GS_ON>(sctx);
si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_ON, GS_OFF>(sctx);
si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_ON, GS_ON>(sctx);
si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_OFF, NGG_OFF>(sctx);
si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_ON, NGG_OFF>(sctx);
si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_OFF, NGG_OFF>(sctx);
si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_ON, NGG_OFF>(sctx);
si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_OFF, NGG_ON>(sctx);
si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_ON, NGG_ON>(sctx);
si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_OFF, NGG_ON>(sctx);
si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_ON, NGG_ON>(sctx);
}
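For readers unfamiliar with this dispatch scheme, which the commit shrinks from a [2][2][2][2] table to [2][2][2]: one si_draw_vbo instantiation exists per (tess, gs, ngg) combination and is selected at draw time by indexing the table. A stripped-down sketch of the pattern follows; the names and the body of the draw function are stand-ins, and the real code also skips invalid combinations such as NGG before GFX10.

#include <cstdio>

enum si_has_tess { TESS_OFF, TESS_ON };
enum si_has_gs   { GS_OFF,   GS_ON   };
enum si_has_ngg  { NGG_OFF,  NGG_ON  };

using draw_vbo_func = void (*)();

/* One compile-time specialization per pipeline combination. */
template <si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG>
static void draw_vbo()
{
   printf("draw_vbo<tess=%d, gs=%d, ngg=%d>\n", HAS_TESS, HAS_GS, NGG);
}

static draw_vbo_func draw_table[2][2][2];

template <si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG>
static void init_draw_vbo()
{
   draw_table[HAS_TESS][HAS_GS][NGG] = draw_vbo<HAS_TESS, HAS_GS, NGG>;
}

int main()
{
   init_draw_vbo<TESS_OFF, GS_OFF, NGG_OFF>();
   init_draw_vbo<TESS_OFF, GS_OFF, NGG_ON>();
   /* the remaining six combinations are registered the same way */

   bool has_tess = false, has_gs = false, ngg = true;
   draw_table[has_tess][has_gs][ngg]();   /* mirrors si_select_draw_vbo */
   return 0;
}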
static void si_invalid_draw_vbo(struct pipe_context *pipe,


@ -81,8 +81,8 @@
* Right half: {1,3,5,7,9,11,13,15}
*/
/* Important note: We have to use the standard DX positions, because
* the primitive discard compute shader relies on them.
/* Important note: We have to use the standard DX positions because shader-based culling
* relies on them.
*/
/* 1x MSAA */


@ -70,7 +70,7 @@ void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es,
shader_variant_flags |= 1 << 0;
if (sel->nir)
shader_variant_flags |= 1 << 1;
if (si_get_wave_size(sel->screen, sel->info.stage, ngg, es, false, false) == 32)
if (si_get_wave_size(sel->screen, sel->info.stage, ngg, es, false) == 32)
shader_variant_flags |= 1 << 2;
if (sel->info.stage == MESA_SHADER_FRAGMENT &&
/* Derivatives imply helper invocations so check for needs_quad_helper_invocations. */
@ -78,11 +78,9 @@ void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es,
sel->info.base.fs.uses_discard &&
sel->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL))
shader_variant_flags |= 1 << 3;
if (sel->info.stage == MESA_SHADER_VERTEX) {
/* This varies depending on whether compute-based culling is enabled. */
assert(sel->screen->num_vbos_in_user_sgprs <= 7);
shader_variant_flags |= MIN2(sel->screen->num_vbos_in_user_sgprs, 7) << 4;
}
/* bit gap */
if (sel->screen->options.no_infinite_interp)
shader_variant_flags |= 1 << 7;
if (sel->screen->options.clamp_div_by_zero)
@ -2291,10 +2289,8 @@ current_not_ready:
/* Compile the main shader part if it doesn't exist. This can happen
* if the initial guess was wrong.
*
* The prim discard CS doesn't need the main shader part.
*/
if (!is_pure_monolithic && !key->opt.vs_as_prim_discard_cs) {
if (!is_pure_monolithic) {
bool ok = true;
/* Make sure the main shader part is present. This is needed
@ -2348,8 +2344,7 @@ current_not_ready:
shader->is_monolithic =
is_pure_monolithic || memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
/* The prim discard CS is always optimized. */
shader->is_optimized = (!is_pure_monolithic || key->opt.vs_as_prim_discard_cs) &&
shader->is_optimized = !is_pure_monolithic &&
memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
/* If it's an optimized shader, compile it asynchronously. */
@ -2706,12 +2701,6 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
sel->vs_needs_prolog = sel->info.stage == MESA_SHADER_VERTEX && sel->info.num_inputs &&
!sel->info.base.vs.blit_sgprs_amd;
sel->prim_discard_cs_allowed =
sel->info.stage == MESA_SHADER_VERTEX && !sel->info.uses_bindless_images &&
!sel->info.uses_bindless_samplers && !sel->info.base.writes_memory &&
!sel->info.writes_viewport_index &&
!sel->info.base.vs.window_space_position && !sel->so.num_outputs;
if (sel->info.stage == MESA_SHADER_VERTEX ||
sel->info.stage == MESA_SHADER_TESS_CTRL ||
sel->info.stage == MESA_SHADER_TESS_EVAL ||


@ -771,9 +771,6 @@ static unsigned amdgpu_ib_max_submit_dwords(enum ib_type ib_type)
* http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1
*/
return 20 * 1024;
case IB_PARALLEL_COMPUTE:
/* Always chain this IB. */
return UINT_MAX;
default:
unreachable("bad ib_type");
}
@ -908,9 +905,6 @@ static bool amdgpu_init_cs_context(struct amdgpu_winsys *ws,
assert(0);
}
cs->ib[IB_PARALLEL_COMPUTE].ip_type = AMDGPU_HW_IP_COMPUTE;
cs->ib[IB_PARALLEL_COMPUTE].flags = AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE;
cs->last_added_bo = NULL;
return true;
}
@ -938,8 +932,6 @@ static void amdgpu_cs_context_cleanup(struct amdgpu_winsys *ws, struct amdgpu_cs
cleanup_fence_list(&cs->fence_dependencies);
cleanup_fence_list(&cs->syncobj_dependencies);
cleanup_fence_list(&cs->syncobj_to_signal);
cleanup_fence_list(&cs->compute_fence_dependencies);
cleanup_fence_list(&cs->compute_start_fence_dependencies);
cs->num_real_buffers = 0;
cs->num_slab_buffers = 0;
@ -957,8 +949,6 @@ static void amdgpu_destroy_cs_context(struct amdgpu_winsys *ws, struct amdgpu_cs
FREE(cs->fence_dependencies.list);
FREE(cs->syncobj_dependencies.list);
FREE(cs->syncobj_to_signal.list);
FREE(cs->compute_fence_dependencies.list);
FREE(cs->compute_start_fence_dependencies.list);
}
@ -997,7 +987,6 @@ amdgpu_cs_create(struct radeon_cmdbuf *rcs,
amdgpu_cs_chunk_fence_info_to_data(&fence_info, (void*)&cs->fence_chunk);
cs->main.ib_type = IB_MAIN;
cs->compute_ib.ib_type = IB_PARALLEL_COMPUTE;
if (!amdgpu_init_cs_context(ctx->ws, &cs->csc1, ring_type)) {
FREE(cs);
@ -1035,37 +1024,6 @@ amdgpu_cs_create(struct radeon_cmdbuf *rcs,
return true;
}
static bool
amdgpu_cs_add_parallel_compute_ib(struct radeon_cmdbuf *compute_cs,
struct radeon_cmdbuf *gfx_cs,
bool uses_gds_ordered_append)
{
struct amdgpu_cs *cs = amdgpu_cs(gfx_cs);
struct amdgpu_winsys *ws = cs->ws;
if (cs->ring_type != RING_GFX)
return false;
/* only one secondary IB can be added */
if (cs->compute_ib.ib_mapped)
return false;
/* Allocate the compute IB. */
if (!amdgpu_get_new_ib(ws, compute_cs, &cs->compute_ib, cs))
return false;
if (uses_gds_ordered_append) {
cs->csc1.ib[IB_PARALLEL_COMPUTE].flags |=
AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID;
cs->csc2.ib[IB_PARALLEL_COMPUTE].flags |=
AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID;
}
cs->compute_ib.rcs = compute_cs;
compute_cs->priv = cs;
return true;
}
static bool
amdgpu_cs_setup_preemption(struct radeon_cmdbuf *rcs, const uint32_t *preamble_ib,
unsigned preamble_num_dw)
@ -1128,7 +1086,7 @@ static bool amdgpu_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw,
bool force_chaining)
{
struct amdgpu_cs *cs = amdgpu_cs(rcs);
struct amdgpu_ib *ib = rcs == cs->main.rcs ? &cs->main : &cs->compute_ib;
struct amdgpu_ib *ib = &cs->main;
unsigned cs_epilog_dw = amdgpu_cs_epilog_dws(cs);
unsigned need_byte_size = (dw + cs_epilog_dw) * 4;
@ -1286,18 +1244,6 @@ static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rws,
util_queue_fence_wait(&fence->submitted);
if (dependency_flags & RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY) {
/* Syncobjs are not needed here. */
assert(!amdgpu_fence_is_syncobj(fence));
if (acs->ws->info.has_scheduled_fence_dependency &&
dependency_flags & RADEON_DEPENDENCY_START_FENCE)
add_fence_to_list(&cs->compute_start_fence_dependencies, fence);
else
add_fence_to_list(&cs->compute_fence_dependencies, fence);
return;
}
/* Start fences are not needed here. */
assert(!(dependency_flags & RADEON_DEPENDENCY_START_FENCE));
@ -1589,66 +1535,6 @@ static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
num_chunks++;
}
/* Submit the parallel compute IB first. */
if (cs->ib[IB_PARALLEL_COMPUTE].ib_bytes > 0) {
unsigned old_num_chunks = num_chunks;
/* Add compute fence dependencies. */
unsigned num_dependencies = cs->compute_fence_dependencies.num;
if (num_dependencies) {
struct drm_amdgpu_cs_chunk_dep *dep_chunk =
alloca(num_dependencies * sizeof(*dep_chunk));
for (unsigned i = 0; i < num_dependencies; i++) {
struct amdgpu_fence *fence =
(struct amdgpu_fence*)cs->compute_fence_dependencies.list[i];
assert(util_queue_fence_is_signalled(&fence->submitted));
amdgpu_cs_chunk_fence_to_dep(&fence->fence, &dep_chunk[i]);
}
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_DEPENDENCIES;
chunks[num_chunks].length_dw = sizeof(dep_chunk[0]) / 4 * num_dependencies;
chunks[num_chunks].chunk_data = (uintptr_t)dep_chunk;
num_chunks++;
}
/* Add compute start fence dependencies. */
unsigned num_start_dependencies = cs->compute_start_fence_dependencies.num;
if (num_start_dependencies) {
struct drm_amdgpu_cs_chunk_dep *dep_chunk =
alloca(num_start_dependencies * sizeof(*dep_chunk));
for (unsigned i = 0; i < num_start_dependencies; i++) {
struct amdgpu_fence *fence =
(struct amdgpu_fence*)cs->compute_start_fence_dependencies.list[i];
assert(util_queue_fence_is_signalled(&fence->submitted));
amdgpu_cs_chunk_fence_to_dep(&fence->fence, &dep_chunk[i]);
}
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES;
chunks[num_chunks].length_dw = sizeof(dep_chunk[0]) / 4 * num_start_dependencies;
chunks[num_chunks].chunk_data = (uintptr_t)dep_chunk;
num_chunks++;
}
/* Convert from dwords to bytes. */
cs->ib[IB_PARALLEL_COMPUTE].ib_bytes *= 4;
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
chunks[num_chunks].chunk_data = (uintptr_t)&cs->ib[IB_PARALLEL_COMPUTE];
num_chunks++;
r = acs->noop ? 0 : amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, bo_list,
num_chunks, chunks, NULL);
if (r)
goto finalize;
/* Back off the compute chunks. */
num_chunks = old_num_chunks;
}
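The removed path above appends compute-only chunks to the shared chunk array, submits them first, and then rewinds num_chunks so the gfx submission reuses only the common prefix. A toy model of that save/append/submit/rewind pattern follows; the chunk contents are descriptive stand-ins for drm_amdgpu_cs_chunk entries.

#include <cstdio>

struct chunk { const char *what; };   /* stand-in for drm_amdgpu_cs_chunk */

static void submit(const char *which, const chunk *chunks, unsigned num_chunks)
{
   printf("%s submission, %u chunks:", which, num_chunks);
   for (unsigned i = 0; i < num_chunks; i++)
      printf(" [%s]", chunks[i].what);
   printf("\n");
}

int main()
{
   chunk chunks[8];
   unsigned num_chunks = 0;

   chunks[num_chunks++].what = "fence / buffer-list chunk";   /* shared prefix */

   /* Append the compute-only chunks and submit the parallel compute IB first. */
   unsigned old_num_chunks = num_chunks;
   chunks[num_chunks++].what = "compute fence dependencies";
   chunks[num_chunks++].what = "parallel compute IB";
   submit("compute", chunks, num_chunks);

   /* Back off the compute chunks, then finish building the gfx submission. */
   num_chunks = old_num_chunks;
   chunks[num_chunks++].what = "gfx IB";
   submit("gfx", chunks, num_chunks);
   return 0;
}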
/* Syncobj signals. */
unsigned num_syncobj_to_signal = cs->syncobj_to_signal.num;
if (num_syncobj_to_signal) {
@ -1706,7 +1592,7 @@ static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
r = acs->noop ? 0 : amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, bo_list,
num_chunks, chunks, &seq_no);
}
finalize:
if (r) {
if (r == -ENOMEM)
fprintf(stderr, "amdgpu: Not enough memory for command submission.\n");
@ -1798,12 +1684,6 @@ static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
}
if (cs->ring_type == RING_GFX)
ws->gfx_ib_size_counter += (rcs->prev_dw + rcs->current.cdw) * 4;
/* Also pad secondary IBs. */
if (cs->compute_ib.ib_mapped) {
while (cs->compute_ib.rcs->current.cdw & ib_pad_dw_mask)
radeon_emit(cs->compute_ib.rcs, PKT3_NOP_PAD);
}
break;
case RING_UVD:
case RING_UVD_ENC:
@ -1839,9 +1719,6 @@ static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
/* Set IB sizes. */
amdgpu_ib_finalize(ws, rcs, &cs->main);
if (cs->compute_ib.ib_mapped)
amdgpu_ib_finalize(ws, cs->compute_ib.rcs, &cs->compute_ib);
/* Create a fence. */
amdgpu_fence_reference(&cur->fence, NULL);
if (cs->next_fence) {
@ -1897,8 +1774,6 @@ static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
memset(cs->csc->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));
amdgpu_get_new_ib(ws, rcs, &cs->main, cs);
if (cs->compute_ib.ib_mapped)
amdgpu_get_new_ib(ws, cs->compute_ib.rcs, &cs->compute_ib, cs);
if (cs->preamble_ib_bo) {
amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo, RADEON_USAGE_READ, 0,
@ -1929,9 +1804,6 @@ static void amdgpu_cs_destroy(struct radeon_cmdbuf *rcs)
radeon_bo_reference(&cs->ws->dummy_ws.base, &cs->preamble_ib_bo, NULL);
radeon_bo_reference(&cs->ws->dummy_ws.base, &cs->main.big_ib_buffer, NULL);
FREE(rcs->prev);
radeon_bo_reference(&cs->ws->dummy_ws.base, &cs->compute_ib.big_ib_buffer, NULL);
if (cs->compute_ib.rcs)
FREE(cs->compute_ib.rcs->prev);
amdgpu_destroy_cs_context(cs->ws, &cs->csc1);
amdgpu_destroy_cs_context(cs->ws, &cs->csc2);
amdgpu_fence_reference(&cs->next_fence, NULL);
@ -1954,7 +1826,6 @@ void amdgpu_cs_init_functions(struct amdgpu_screen_winsys *ws)
ws->base.ctx_destroy = amdgpu_ctx_destroy;
ws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status;
ws->base.cs_create = amdgpu_cs_create;
ws->base.cs_add_parallel_compute_ib = amdgpu_cs_add_parallel_compute_ib;
ws->base.cs_setup_preemption = amdgpu_cs_setup_preemption;
ws->base.cs_destroy = amdgpu_cs_destroy;
ws->base.cs_add_buffer = amdgpu_cs_add_buffer;


@ -58,7 +58,6 @@ struct amdgpu_cs_buffer {
enum ib_type {
IB_PREAMBLE,
IB_MAIN,
IB_PARALLEL_COMPUTE,
IB_NUM,
};
@ -115,10 +114,6 @@ struct amdgpu_cs_context {
struct amdgpu_fence_list syncobj_dependencies;
struct amdgpu_fence_list syncobj_to_signal;
/* The compute IB uses the dependencies above + these: */
struct amdgpu_fence_list compute_fence_dependencies;
struct amdgpu_fence_list compute_start_fence_dependencies;
struct pipe_fence_handle *fence;
/* the error returned from cs_flush for non-async submissions */
@ -132,7 +127,6 @@ struct amdgpu_cs_context {
struct amdgpu_cs {
struct amdgpu_ib main; /* must be first because this is inherited */
struct amdgpu_ib compute_ib; /* optional parallel compute IB */
struct amdgpu_winsys *ws;
struct amdgpu_ctx *ctx;
enum ring_type ring_type;