radeonsi: remove the primitive discard compute shader

It doesn't always work, it's only useful on gfx9 and older, and it's too
complicated.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/4011

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12812>
Authored by Marek Olšák on 2021-08-11 13:31:19 -04:00; committed by Marge Bot
parent 9e994560ff
commit 576f8394db
22 changed files with 62 additions and 1791 deletions


@ -773,12 +773,6 @@ radeonsi driver environment variables
Always use NGG culling even when it can hurt.
``nonggc``
Disable NGG culling.
``alwayspd``
Always enable the primitive discard compute shader.
``pd``
Enable the primitive discard compute shader for large draw calls.
``nopd``
Disable the primitive discard compute shader.
``switch_on_eop``
Program WD/IA to switch on end-of-packet.
``nooutoforder``
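
For context, the option names documented above map onto bit flags in radeonsi's debug-flag table (the actual table appears in the si_pipe.c hunk further down and is parsed by Mesa's debug-option helpers). Below is a minimal, self-contained sketch of that pattern; the enum values, option subset, and parser are simplified stand-ins, not the driver's real implementation.

#include <cstdint>
#include <cstdio>
#include <cstring>

enum { DBG_NO_NGG_CULLING, DBG_SWITCH_ON_EOP, DBG_NO_OUT_OF_ORDER };
#define DBG(name) (1ull << DBG_##name)   /* same shape as the driver's DBG() macro */

struct debug_named_value { const char *name; uint64_t flag; };

static const debug_named_value options[] = {
   { "nonggc",        DBG(NO_NGG_CULLING) },
   { "switch_on_eop", DBG(SWITCH_ON_EOP) },
   { "nooutoforder",  DBG(NO_OUT_OF_ORDER) },
};

/* Turn a comma-separated option string (e.g. the value of AMD_DEBUG) into a bit mask. */
static uint64_t parse_debug_flags(const char *str)
{
   char buf[256];
   uint64_t flags = 0;
   snprintf(buf, sizeof(buf), "%s", str ? str : "");
   for (char *tok = strtok(buf, ","); tok; tok = strtok(nullptr, ","))
      for (const debug_named_value &opt : options)
         if (!strcmp(tok, opt.name))
            flags |= opt.flag;
   return flags;
}

int main()
{
   uint64_t debug_flags = parse_debug_flags("nonggc,switch_on_eop");
   if (debug_flags & DBG(NO_NGG_CULLING))   /* mirrors checks such as
                                               sscreen->debug_flags & DBG(...) */
      puts("NGG culling disabled");
   return 0;
}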


@ -80,9 +80,6 @@ enum radeon_bo_flag
enum radeon_dependency_flag
{
/* Add the dependency to the parallel compute IB only. */
RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY = 1 << 0,
/* Instead of waiting for a job to finish execution, the dependency will
* be signaled when the job starts execution.
*/
@ -512,26 +509,6 @@ struct radeon_winsys {
struct pipe_fence_handle **fence),
void *flush_ctx, bool stop_exec_on_failure);
/**
* Add a parallel compute IB to a gfx IB. It will share the buffer list
* and fence dependencies with the gfx IB. The gfx flush call will submit
* both IBs at the same time.
*
* The compute IB doesn't have an output fence, so the primary IB has
* to use a wait packet for synchronization.
*
* The returned IB is only a stream for writing packets to the new
* IB. The only function that can be used on the compute cs is cs_check_space.
*
* \param compute_cs The returned structure of the command stream.
* \param gfx_cs Gfx IB
*
* \return true on success
*/
bool (*cs_add_parallel_compute_ib)(struct radeon_cmdbuf *compute_cs,
struct radeon_cmdbuf *gfx_cs,
bool uses_gds_ordered_append);
/**
* Set up and enable mid command buffer preemption for the command stream.
*

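To illustrate the contract described in the removed documentation above, here is a minimal sketch of how a driver could have driven this hook. Everything below is a stand-in invented for the example (stub structs and stub implementations); only the two function-pointer signatures follow the removed header code.

#include <cstdio>

struct radeon_cmdbuf { unsigned cdw; };   /* stub; the real struct lives in radeon_winsys.h */

struct radeon_winsys {                    /* stub containing only the two hooks used here */
   bool (*cs_add_parallel_compute_ib)(radeon_cmdbuf *compute_cs,
                                      radeon_cmdbuf *gfx_cs,
                                      bool uses_gds_ordered_append);
   bool (*cs_check_space)(radeon_cmdbuf *cs, unsigned dw, bool force_chaining);
};

static bool stub_add_ib(radeon_cmdbuf *, radeon_cmdbuf *, bool) { return true; }
static bool stub_check_space(radeon_cmdbuf *, unsigned, bool) { return true; }

int main()
{
   radeon_winsys ws = { stub_add_ib, stub_check_space };
   radeon_cmdbuf gfx_cs = {0}, compute_cs = {0};

   /* Attach the parallel compute IB. It shares the buffer list and fence
    * dependencies with the gfx IB, and the next gfx flush submits both. */
   if (!ws.cs_add_parallel_compute_ib(&compute_cs, &gfx_cs,
                                      /* uses_gds_ordered_append = */ true))
      return 1;

   /* Per the removed comment, cs_check_space is the only winsys entry point
    * usable on the returned compute stream; packets are then written directly. */
   ws.cs_check_space(&compute_cs, 128, false);

   /* The compute IB has no output fence, so ordering against it relies on a
    * wait packet emitted into the primary (gfx) IB. */
   puts("compute IB attached; the gfx flush submits both IBs together");
   return 0;
}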

@ -27,7 +27,6 @@ files_libradeonsi = files(
'si_build_pm4.h',
'si_clear.c',
'si_compute.c',
'si_compute_prim_discard.c',
'si_compute.h',
'si_compute_blit.c',
'si_cp_dma.c',

File diff suppressed because it is too large.


@ -230,10 +230,8 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
sdst->TC_L2_dirty = true;
/* If it's not a framebuffer fast clear... */
if (coher == SI_COHERENCY_SHADER) {
if (coher == SI_COHERENCY_SHADER)
sctx->num_cp_dma_calls++;
si_prim_discard_signal_next_compute_ib_start(sctx);
}
}
/**
@ -387,10 +385,8 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
si_resource(dst)->TC_L2_dirty = true;
/* If it's not a prefetch or GDS copy... */
if (dst && src && (dst != src || dst_offset != src_offset)) {
if (dst && src && (dst != src || dst_offset != src_offset))
sctx->num_cp_dma_calls++;
si_prim_discard_signal_next_compute_ib_start(sctx);
}
}
void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf,


@ -344,7 +344,6 @@ struct si_log_chunk_cs {
struct si_saved_cs *cs;
bool dump_bo_list;
unsigned gfx_begin, gfx_end;
unsigned compute_begin, compute_end;
};
static void si_log_chunk_type_cs_destroy(void *data)
@ -402,7 +401,6 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f)
struct si_context *ctx = chunk->ctx;
struct si_saved_cs *scs = chunk->cs;
int last_trace_id = -1;
int last_compute_trace_id = -1;
/* We are expecting that the ddebug pipe has already
* waited for the context, so this buffer should be idle.
@ -410,10 +408,8 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f)
*/
uint32_t *map = ctx->ws->buffer_map(ctx->ws, scs->trace_buf->buf, NULL,
PIPE_MAP_UNSYNCHRONIZED | PIPE_MAP_READ);
if (map) {
if (map)
last_trace_id = map[0];
last_compute_trace_id = map[1];
}
if (chunk->gfx_end != chunk->gfx_begin) {
if (chunk->gfx_begin == 0) {
@ -435,20 +431,6 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f)
}
}
if (chunk->compute_end != chunk->compute_begin) {
assert(ctx->prim_discard_compute_cs.priv);
if (scs->flushed) {
ac_parse_ib(f, scs->compute.ib + chunk->compute_begin,
chunk->compute_end - chunk->compute_begin, &last_compute_trace_id, map ? 1 : 0,
"Compute IB", ctx->chip_class, NULL, NULL);
} else {
si_parse_current_ib(f, &ctx->prim_discard_compute_cs, chunk->compute_begin,
chunk->compute_end, &last_compute_trace_id, map ? 1 : 0, "Compute IB",
ctx->chip_class);
}
}
if (chunk->dump_bo_list) {
fprintf(f, "Flushing. Time: ");
util_dump_ns(f, scs->time_flush);
@ -468,13 +450,8 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log, bool du
struct si_saved_cs *scs = ctx->current_saved_cs;
unsigned gfx_cur = ctx->gfx_cs.prev_dw + ctx->gfx_cs.current.cdw;
unsigned compute_cur = 0;
if (ctx->prim_discard_compute_cs.priv)
compute_cur =
ctx->prim_discard_compute_cs.prev_dw + ctx->prim_discard_compute_cs.current.cdw;
if (!dump_bo_list && gfx_cur == scs->gfx_last_dw && compute_cur == scs->compute_last_dw)
if (!dump_bo_list && gfx_cur == scs->gfx_last_dw)
return;
struct si_log_chunk_cs *chunk = calloc(1, sizeof(*chunk));
@ -487,10 +464,6 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log, bool du
chunk->gfx_end = gfx_cur;
scs->gfx_last_dw = gfx_cur;
chunk->compute_begin = scs->compute_last_dw;
chunk->compute_end = compute_cur;
scs->compute_last_dw = compute_cur;
u_log_chunk(log, &si_log_chunk_type_cs, chunk);
}


@ -73,7 +73,7 @@ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, unsigne
EVENT_INDEX(event == V_028A90_CS_DONE || event == V_028A90_PS_DONE ? 6 : 5) |
event_flags;
unsigned sel = EOP_DST_SEL(dst_sel) | EOP_INT_SEL(int_sel) | EOP_DATA_SEL(data_sel);
bool compute_ib = !ctx->has_graphics || cs == &ctx->prim_discard_compute_cs;
bool compute_ib = !ctx->has_graphics;
radeon_begin(cs);


@ -92,9 +92,6 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h
ctx->gfx_flush_in_progress = true;
if (radeon_emitted(&ctx->prim_discard_compute_cs, 0))
si_compute_signal_gfx(ctx);
if (ctx->has_graphics) {
if (!list_is_empty(&ctx->active_queries))
si_suspend_queries(ctx);
@ -136,29 +133,6 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h
si_log_hw_flush(ctx);
}
if (si_compute_prim_discard_enabled(ctx)) {
/* The compute IB can start after the previous gfx IB starts. */
if (radeon_emitted(&ctx->prim_discard_compute_cs, 0) && ctx->last_gfx_fence) {
ctx->ws->cs_add_fence_dependency(
&ctx->gfx_cs, ctx->last_gfx_fence,
RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY | RADEON_DEPENDENCY_START_FENCE);
}
/* Remember the last execution barrier. It's in the IB.
* It will signal the start of the next compute IB.
*/
if (flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW && ctx->last_pkt3_write_data) {
*ctx->last_pkt3_write_data = PKT3(PKT3_WRITE_DATA, 3, 0);
ctx->last_pkt3_write_data = NULL;
si_resource_reference(&ctx->last_ib_barrier_buf, ctx->barrier_buf);
ctx->last_ib_barrier_buf_offset = ctx->barrier_buf_offset;
si_resource_reference(&ctx->barrier_buf, NULL);
ws->fence_reference(&ctx->last_ib_barrier_fence, NULL);
}
}
if (ctx->is_noop)
flags |= RADEON_FLUSH_NOOP;
@ -171,17 +145,6 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h
ctx->num_gfx_cs_flushes++;
if (si_compute_prim_discard_enabled(ctx)) {
/* Remember the last execution barrier, which is the last fence
* in this case.
*/
if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) {
ctx->last_pkt3_write_data = NULL;
si_resource_reference(&ctx->last_ib_barrier_buf, NULL);
ws->fence_reference(&ctx->last_ib_barrier_fence, ctx->last_gfx_fence);
}
}
/* Check VM faults if needed. */
if (sscreen->debug_flags & DBG(CHECK_VM)) {
/* Use conservative timeout 800ms, after which we won't wait any
@ -216,7 +179,7 @@ static void si_begin_gfx_cs_debug(struct si_context *ctx)
pipe_reference_init(&ctx->current_saved_cs->reference, 1);
ctx->current_saved_cs->trace_buf =
si_resource(pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 8));
si_resource(pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 4));
if (!ctx->current_saved_cs->trace_buf) {
free(ctx->current_saved_cs);
ctx->current_saved_cs = NULL;
@ -368,11 +331,6 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
bool is_secure = false;
if (unlikely(radeon_uses_secure_bos(ctx->ws))) {
/* Disable features that don't work with TMZ:
* - primitive discard
*/
ctx->prim_discard_vertex_count_threshold = UINT_MAX;
is_secure = ctx->ws->cs_is_secure(&ctx->gfx_cs);
si_install_draw_wrapper(ctx, si_draw_vbo_tmz_preamble);
@ -549,18 +507,6 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
assert(!ctx->gfx_cs.prev_dw);
ctx->initial_gfx_cs_size = ctx->gfx_cs.current.cdw;
ctx->prim_discard_compute_ib_initialized = false;
/* Compute-based primitive discard:
* The index ring is divided into 2 halves. Switch between the halves
* in the same fashion as doublebuffering.
*/
if (ctx->index_ring_base)
ctx->index_ring_base = 0;
else
ctx->index_ring_base = ctx->index_ring_size_per_ib;
ctx->index_ring_offset = 0;
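In isolation, the doublebuffering removed above amounts to the small amount of bookkeeping sketched below; the field names mirror the si_context members removed later in this diff, and the ring size is arbitrary.

#include <cstdio>

struct index_ring_state {
   unsigned base;         /* offset of the per-IB half currently in use */
   unsigned offset;       /* allocation cursor within that half */
   unsigned size_per_ib;  /* size of one half */
};

/* Same flip as the removed code in si_begin_new_gfx_cs: every new gfx IB
 * switches to the other half of the index ring and resets its cursor. */
static void begin_new_gfx_cs(index_ring_state *s)
{
   s->base = s->base ? 0 : s->size_per_ib;
   s->offset = 0;
}

int main()
{
   index_ring_state ring = { 0, 0, 1024 * 1024 };   /* arbitrary 1 MiB halves */
   for (int ib = 0; ib < 4; ib++) {
      begin_new_gfx_cs(&ring);
      printf("gfx IB %d allocates indices from ring offset %u\n", ib, ring.base);
   }
   return 0;
}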
/* All buffer references are removed on a flush, so si_check_needs_implicit_sync
* cannot determine if si_make_CB_shader_coherent() needs to be called.
@ -586,34 +532,9 @@ void si_trace_emit(struct si_context *sctx)
u_log_flush(sctx->log);
}
void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx)
{
if (!si_compute_prim_discard_enabled(sctx))
return;
if (!sctx->barrier_buf) {
u_suballocator_alloc(&sctx->allocator_zeroed_memory, 4, 4, &sctx->barrier_buf_offset,
(struct pipe_resource **)&sctx->barrier_buf);
}
/* Emit a placeholder to signal the next compute IB to start.
* See si_compute_prim_discard.c for explanation.
*/
uint32_t signal = 1;
si_cp_write_data(sctx, sctx->barrier_buf, sctx->barrier_buf_offset, 4, V_370_MEM, V_370_ME,
&signal);
sctx->last_pkt3_write_data = &sctx->gfx_cs.current.buf[sctx->gfx_cs.current.cdw - 5];
/* Only the last occurrence of WRITE_DATA will be executed.
* The packet will be enabled in si_flush_gfx_cs.
*/
*sctx->last_pkt3_write_data = PKT3(PKT3_NOP, 3, 0);
}
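The helper removed above worked together with the flush-side code removed earlier in this file: a WRITE_DATA packet is emitted but its header is immediately rewritten to a NOP, and only the last such header gets patched back to WRITE_DATA in si_flush_gfx_cs, so exactly one signal executes per IB. A compact, self-contained model of that trick follows; the PKT3 encoding is simplified, and the real code tracked a raw pointer into the IB rather than an index.

#include <cstdint>
#include <cstdio>
#include <vector>

/* Simplified stand-in for the PKT3() header macro (no predicate bit). */
static uint32_t PKT3(uint32_t op, uint32_t count) { return (3u << 30) | (count << 16) | (op << 8); }
enum { PKT3_NOP = 0x10, PKT3_WRITE_DATA = 0x37 };

struct cmdbuf {
   std::vector<uint32_t> buf;
   long last_placeholder = -1;   /* index of the disabled WRITE_DATA header, or -1 */
};

/* Emit a 5-dword WRITE_DATA packet, then disable it by turning its header into
 * a NOP that skips the 4 payload dwords, remembering where the header lives. */
static void emit_signal_placeholder(cmdbuf &cs, uint32_t signal_value)
{
   cs.buf.insert(cs.buf.end(),
                 { PKT3(PKT3_WRITE_DATA, 3),
                   0 /* dst sel */, 0 /* addr lo */, 0 /* addr hi */, signal_value });
   cs.last_placeholder = (long)cs.buf.size() - 5;
   cs.buf[cs.last_placeholder] = PKT3(PKT3_NOP, 3);
}

/* At flush time, only the most recent placeholder is re-enabled. */
static void enable_last_placeholder(cmdbuf &cs)
{
   if (cs.last_placeholder >= 0) {
      cs.buf[cs.last_placeholder] = PKT3(PKT3_WRITE_DATA, 3);
      cs.last_placeholder = -1;
   }
}

int main()
{
   cmdbuf cs;
   emit_signal_placeholder(cs, 1);   /* stays a NOP */
   emit_signal_placeholder(cs, 1);   /* re-enabled at flush */
   enable_last_placeholder(cs);
   printf("first header 0x%08x, last header 0x%08x\n",
          (unsigned)cs.buf[0], (unsigned)cs.buf[5]);
   return 0;
}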
void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned cp_coher_cntl)
{
bool compute_ib = !sctx->has_graphics || cs == &sctx->prim_discard_compute_cs;
bool compute_ib = !sctx->has_graphics;
assert(sctx->chip_class <= GFX9);
@ -857,14 +778,6 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
uint32_t cp_coher_cntl = 0;
const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB);
const bool is_barrier =
flush_cb_db ||
/* INV_ICACHE == beginning of gfx IB. Checking
* INV_ICACHE fixes corruption for DeusExMD with
* compute-based culling, but I don't know why.
*/
flags & (SI_CONTEXT_INV_ICACHE | SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_VS_PARTIAL_FLUSH) ||
(flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy);
assert(sctx->chip_class <= GFX9);
@ -1077,9 +990,6 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
radeon_end();
}
if (is_barrier)
si_prim_discard_signal_next_compute_ib_start(sctx);
if (flags & SI_CONTEXT_START_PIPELINE_STATS && sctx->pipeline_stats_enabled != 1) {
radeon_begin(cs);
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));


@ -95,9 +95,6 @@ static const struct debug_named_value radeonsi_debug_options[] = {
{"nggc", DBG(ALWAYS_NGG_CULLING_ALL), "Always use NGG culling even when it can hurt."},
{"nggctess", DBG(ALWAYS_NGG_CULLING_TESS), "Always use NGG culling for tessellation."},
{"nonggc", DBG(NO_NGG_CULLING), "Disable NGG culling."},
{"alwayspd", DBG(ALWAYS_PD), "Always enable the primitive discard compute shader."},
{"pd", DBG(PD), "Enable the primitive discard compute shader for large draw calls."},
{"nopd", DBG(NO_PD), "Disable the primitive discard compute shader."},
{"switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet."},
{"nooutoforder", DBG(NO_OUT_OF_ORDER), "Disable out-of-order rasterization"},
{"nodpbb", DBG(NO_DPBB), "Disable DPBB."},
@ -309,12 +306,8 @@ static void si_destroy_context(struct pipe_context *context)
u_suballocator_destroy(&sctx->allocator_zeroed_memory);
sctx->ws->fence_reference(&sctx->last_gfx_fence, NULL);
sctx->ws->fence_reference(&sctx->last_ib_barrier_fence, NULL);
si_resource_reference(&sctx->eop_bug_scratch, NULL);
si_resource_reference(&sctx->eop_bug_scratch_tmz, NULL);
si_resource_reference(&sctx->index_ring, NULL);
si_resource_reference(&sctx->barrier_buf, NULL);
si_resource_reference(&sctx->last_ib_barrier_buf, NULL);
si_resource_reference(&sctx->shadowed_regs, NULL);
radeon_bo_reference(sctx->screen->ws, &sctx->gds, NULL);
radeon_bo_reference(sctx->screen->ws, &sctx->gds_oa, NULL);
@ -618,12 +611,6 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
default:
unreachable("unhandled chip class");
}
si_initialize_prim_discard_tunables(sscreen, flags & SI_CONTEXT_FLAG_AUX,
&sctx->prim_discard_vertex_count_threshold,
&sctx->index_ring_size_per_ib);
} else {
sctx->prim_discard_vertex_count_threshold = UINT_MAX;
}
sctx->sample_mask = 0xffff;
@ -641,7 +628,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
sctx->b.create_video_buffer = vl_video_buffer_create;
}
if (sctx->chip_class >= GFX9 || si_compute_prim_discard_enabled(sctx)) {
if (sctx->chip_class >= GFX9) {
sctx->wait_mem_scratch =
si_aligned_buffer_create(screen,
SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL,
@ -1167,15 +1154,10 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
sscreen->max_memory_usage_kb = sscreen->info.vram_size_kb + sscreen->info.gart_size_kb / 4 * 3;
unsigned prim_discard_vertex_count_threshold, tmp;
si_initialize_prim_discard_tunables(sscreen, false, &prim_discard_vertex_count_threshold, &tmp);
/* Compute-shader-based culling doesn't support VBOs in user SGPRs. */
if (prim_discard_vertex_count_threshold == UINT_MAX) {
/* This decreases CPU overhead if all descriptors are in user SGPRs because we don't
* have to allocate and count references for the upload buffer.
*/
sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1;
}
/* This decreases CPU overhead if all descriptors are in user SGPRs because we don't
* have to allocate and count references for the upload buffer.
*/
sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1;
/* Determine tessellation ring info. */
bool double_offchip_buffers = sscreen->info.chip_class >= GFX7 &&


@ -44,7 +44,6 @@ extern "C" {
#endif
#define ATI_VENDOR_ID 0x1002
#define SI_PRIM_DISCARD_DEBUG 0
#define SI_NOT_QUERY 0xffffffff
/* The base vertex and primitive restart can be any number, but we must pick
@ -155,11 +154,6 @@ enum si_has_ngg {
NGG_ON,
};
enum si_has_prim_discard_cs {
PRIM_DISCARD_CS_OFF,
PRIM_DISCARD_CS_ON,
};
enum si_clear_code
{
DCC_CLEAR_COLOR_0000 = 0x00000000,
@ -223,9 +217,6 @@ enum
DBG_ALWAYS_NGG_CULLING_TESS,
DBG_NO_NGG_CULLING,
DBG_NO_FAST_LAUNCH,
DBG_ALWAYS_PD,
DBG_PD,
DBG_NO_PD,
DBG_SWITCH_ON_EOP,
DBG_NO_OUT_OF_ORDER,
DBG_NO_DPBB,
@ -896,7 +887,6 @@ struct si_saved_cs {
unsigned trace_id;
unsigned gfx_last_dw;
unsigned compute_last_dw;
bool flushed;
int64_t time_flush;
};
@ -995,26 +985,6 @@ struct si_context {
/* NGG streamout. */
struct pb_buffer *gds;
struct pb_buffer *gds_oa;
/* Compute-based primitive discard. */
unsigned prim_discard_vertex_count_threshold;
struct radeon_cmdbuf prim_discard_compute_cs;
struct si_shader *compute_ib_last_shader;
uint32_t compute_rewind_va;
unsigned compute_num_prims_in_batch;
/* index_ring is divided into 2 halves for doublebuffering. */
struct si_resource *index_ring;
unsigned index_ring_base; /* offset of a per-IB portion */
unsigned index_ring_offset; /* offset within a per-IB portion */
unsigned index_ring_size_per_ib; /* max available size per IB */
bool prim_discard_compute_ib_initialized;
/* For tracking the last execution barrier - it can be either
* a WRITE_DATA packet or a fence. */
uint32_t *last_pkt3_write_data;
struct si_resource *barrier_buf;
unsigned barrier_buf_offset;
struct pipe_fence_handle *last_ib_barrier_fence;
struct si_resource *last_ib_barrier_buf;
unsigned last_ib_barrier_buf_offset;
/* Atoms (direct states). */
union si_state_atoms atoms;
@ -1063,7 +1033,6 @@ struct si_context {
/* indexed access using pipe_shader_type (not by MESA_SHADER_*) */
struct si_shader_ctx_state shaders[SI_NUM_GRAPHICS_SHADERS];
};
struct si_shader_ctx_state cs_prim_discard_state;
struct si_cs_shader_state cs_shader_state;
/* shader information */
@ -1254,9 +1223,6 @@ struct si_context {
unsigned num_resident_handles;
uint64_t num_alloc_tex_transfer_bytes;
unsigned last_tex_ps_draw_ratio; /* for query */
unsigned compute_num_verts_accepted;
unsigned compute_num_verts_rejected;
unsigned compute_num_verts_ineligible; /* due to low vertex count */
unsigned context_roll;
/* Queries. */
@ -1287,7 +1253,7 @@ struct si_context {
*/
struct hash_table *dirty_implicit_resources;
pipe_draw_vbo_func draw_vbo[2][2][2][2];
pipe_draw_vbo_func draw_vbo[2][2][2];
/* When b.draw_vbo is a wrapper, real_draw_vbo is the real draw_vbo function */
pipe_draw_vbo_func real_draw_vbo;
@ -1483,7 +1449,6 @@ void si_allocate_gds(struct si_context *ctx);
void si_set_tracked_regs_to_clear_state(struct si_context *ctx);
void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs);
void si_trace_emit(struct si_context *sctx);
void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx);
void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs,
unsigned cp_coher_cntl);
void gfx10_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);
@ -1502,32 +1467,6 @@ unsigned si_end_counter(struct si_screen *sscreen, unsigned type, uint64_t begin
void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs);
void si_init_compute_functions(struct si_context *sctx);
/* si_compute_prim_discard.c */
enum si_prim_discard_outcome
{
SI_PRIM_DISCARD_ENABLED,
SI_PRIM_DISCARD_DISABLED,
SI_PRIM_DISCARD_DRAW_SPLIT,
SI_PRIM_DISCARD_MULTI_DRAW_SPLIT,
};
void si_build_prim_discard_compute_shader(struct si_shader_context *ctx);
enum si_prim_discard_outcome
si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info,
unsigned drawid_offset,
const struct pipe_draw_start_count_bias *draws,
unsigned num_draws, unsigned total_count);
void si_compute_signal_gfx(struct si_context *sctx);
void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
const struct pipe_draw_info *info,
const struct pipe_draw_start_count_bias *draws,
unsigned num_draws, unsigned index_size,
unsigned total_count, uint64_t input_indexbuf_va,
unsigned index_max_size);
void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context,
unsigned *prim_discard_vertex_count_threshold,
unsigned *index_ring_size_per_ib);
/* si_pipe.c */
void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compiler);
@ -1996,14 +1935,9 @@ static inline void radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sc
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, bo, usage, priority);
}
static inline bool si_compute_prim_discard_enabled(struct si_context *sctx)
{
return sctx->prim_discard_vertex_count_threshold != UINT_MAX;
}
static inline unsigned si_get_wave_size(struct si_screen *sscreen,
gl_shader_stage stage, bool ngg, bool es,
bool gs_fast_launch, bool prim_discard_cs)
bool gs_fast_launch)
{
if (stage == MESA_SHADER_COMPUTE)
return sscreen->compute_wave_size;
@ -2011,8 +1945,7 @@ static inline unsigned si_get_wave_size(struct si_screen *sscreen,
return sscreen->ps_wave_size;
else if (gs_fast_launch)
return 32; /* GS fast launch hangs with Wave64, so always use Wave32. */
else if ((stage == MESA_SHADER_VERTEX && prim_discard_cs) || /* only Wave64 implemented */
(stage == MESA_SHADER_VERTEX && es && !ngg) ||
else if ((stage == MESA_SHADER_VERTEX && es && !ngg) ||
(stage == MESA_SHADER_TESS_EVAL && es && !ngg) ||
(stage == MESA_SHADER_GEOMETRY && !ngg)) /* legacy GS only supports Wave64 */
return 64;
@ -2025,18 +1958,14 @@ static inline unsigned si_get_shader_wave_size(struct si_shader *shader)
return si_get_wave_size(shader->selector->screen, shader->selector->info.stage,
shader->key.as_ngg,
shader->key.as_es,
shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL,
shader->key.opt.vs_as_prim_discard_cs);
shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
}
static inline void si_select_draw_vbo(struct si_context *sctx)
{
bool has_prim_discard_cs = si_compute_prim_discard_enabled(sctx) &&
!sctx->shader.tes.cso && !sctx->shader.gs.cso;
pipe_draw_vbo_func draw_vbo = sctx->draw_vbo[!!sctx->shader.tes.cso]
[!!sctx->shader.gs.cso]
[sctx->ngg]
[has_prim_discard_cs];
[sctx->ngg];
assert(draw_vbo);
if (unlikely(sctx->real_draw_vbo))
sctx->real_draw_vbo = draw_vbo;


@ -260,15 +260,6 @@ static bool si_query_sw_begin(struct si_context *sctx, struct si_query *squery)
case SI_QUERY_DISK_SHADER_CACHE_MISSES:
query->begin_result = sctx->screen->num_disk_shader_cache_misses;
break;
case SI_QUERY_PD_NUM_PRIMS_ACCEPTED:
query->begin_result = sctx->compute_num_verts_accepted;
break;
case SI_QUERY_PD_NUM_PRIMS_REJECTED:
query->begin_result = sctx->compute_num_verts_rejected;
break;
case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE:
query->begin_result = sctx->compute_num_verts_ineligible;
break;
case SI_QUERY_GPIN_ASIC_ID:
case SI_QUERY_GPIN_NUM_SIMD:
case SI_QUERY_GPIN_NUM_RB:
@ -429,15 +420,6 @@ static bool si_query_sw_end(struct si_context *sctx, struct si_query *squery)
case SI_QUERY_DISK_SHADER_CACHE_MISSES:
query->end_result = sctx->screen->num_disk_shader_cache_misses;
break;
case SI_QUERY_PD_NUM_PRIMS_ACCEPTED:
query->end_result = sctx->compute_num_verts_accepted;
break;
case SI_QUERY_PD_NUM_PRIMS_REJECTED:
query->end_result = sctx->compute_num_verts_rejected;
break;
case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE:
query->end_result = sctx->compute_num_verts_ineligible;
break;
case SI_QUERY_GPIN_ASIC_ID:
case SI_QUERY_GPIN_NUM_SIMD:
case SI_QUERY_GPIN_NUM_RB:
@ -479,11 +461,6 @@ static bool si_query_sw_get_result(struct si_context *sctx, struct si_query *squ
result->u64 =
(query->end_result - query->begin_result) * 100 / (query->end_time - query->begin_time);
return true;
case SI_QUERY_PD_NUM_PRIMS_ACCEPTED:
case SI_QUERY_PD_NUM_PRIMS_REJECTED:
case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE:
result->u64 = ((unsigned)query->end_result - (unsigned)query->begin_result) / 3;
return true;
case SI_QUERY_GPIN_ASIC_ID:
result->u32 = 0;
return true;
@ -1758,10 +1735,6 @@ static struct pipe_driver_query_info si_driver_query_list[] = {
X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE),
X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE),
X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE),
X("pd-num-prims-accepted", PD_NUM_PRIMS_ACCEPTED, UINT64, AVERAGE),
X("pd-num-prims-rejected", PD_NUM_PRIMS_REJECTED, UINT64, AVERAGE),
X("pd-num-prims-ineligible", PD_NUM_PRIMS_INELIGIBLE, UINT64, AVERAGE),
};
#undef X


@ -111,9 +111,6 @@ enum
SI_QUERY_GPIN_NUM_RB,
SI_QUERY_GPIN_NUM_SPI,
SI_QUERY_GPIN_NUM_SE,
SI_QUERY_PD_NUM_PRIMS_ACCEPTED,
SI_QUERY_PD_NUM_PRIMS_REJECTED,
SI_QUERY_PD_NUM_PRIMS_INELIGIBLE,
SI_QUERY_LIVE_SHADER_CACHE_HITS,
SI_QUERY_LIVE_SHADER_CACHE_MISSES,
SI_QUERY_MEMORY_SHADER_CACHE_HITS,


@ -419,12 +419,6 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader)
/* VGPRs */
declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
/* Return values */
if (shader->key.opt.vs_as_prim_discard_cs) {
for (i = 0; i < 4; i++)
ac_add_return(&ctx->args, AC_ARG_VGPR);
}
break;
case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */
@ -1070,8 +1064,6 @@ const char *si_get_shader_name(const struct si_shader *shader)
return "Vertex Shader as ES";
else if (shader->key.as_ls)
return "Vertex Shader as LS";
else if (shader->key.opt.vs_as_prim_discard_cs)
return "Vertex Shader as Primitive Discard CS";
else if (shader->key.as_ngg)
return "Vertex Shader as ESGS";
else
@ -1183,12 +1175,6 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
fprintf(f, " as_ls = %u\n", key->as_ls);
fprintf(f, " as_ngg = %u\n", key->as_ngg);
fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id);
fprintf(f, " opt.vs_as_prim_discard_cs = %u\n", key->opt.vs_as_prim_discard_cs);
fprintf(f, " opt.cs_prim_type = %s\n", tgsi_primitive_names[key->opt.cs_prim_type]);
fprintf(f, " opt.cs_indexed = %u\n", key->opt.cs_indexed);
fprintf(f, " opt.cs_provoking_vertex_first = %u\n", key->opt.cs_provoking_vertex_first);
fprintf(f, " opt.cs_cull_front = %u\n", key->opt.cs_cull_front);
fprintf(f, " opt.cs_cull_back = %u\n", key->opt.cs_cull_back);
break;
case MESA_SHADER_TESS_CTRL:
@ -1317,7 +1303,6 @@ void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_
key->vs_prolog.as_ls = shader_out->key.as_ls;
key->vs_prolog.as_es = shader_out->key.as_es;
key->vs_prolog.as_ngg = shader_out->key.as_ngg;
key->vs_prolog.as_prim_discard_cs = shader_out->key.opt.vs_as_prim_discard_cs;
if (ngg_cull_shader) {
key->vs_prolog.gs_fast_launch_tri_list =
@ -1342,8 +1327,7 @@ void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_
/* Only one of these combinations can be set. as_ngg can be set with as_es. */
assert(key->vs_prolog.as_ls + key->vs_prolog.as_ngg +
(key->vs_prolog.as_es && !key->vs_prolog.as_ngg) + key->vs_prolog.as_prim_discard_cs <=
1);
(key->vs_prolog.as_es && !key->vs_prolog.as_ngg) <= 1);
/* Enable loading the InstanceID VGPR. */
uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
@ -1557,7 +1541,6 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
(key->vs_prolog.gs_fast_launch_tri_list ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST : 0) |
(key->vs_prolog.gs_fast_launch_tri_strip ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP : 0) |
SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(key->vs_prolog.gs_fast_launch_index_size_packed);
shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs;
break;
case MESA_SHADER_TESS_CTRL:
assert(!prolog);
@ -1581,8 +1564,7 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
si_llvm_context_init(&ctx, sscreen, compiler,
si_get_wave_size(sscreen, stage,
shader.key.as_ngg, shader.key.as_es,
shader.key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL,
shader.key.opt.vs_as_prim_discard_cs));
shader.key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL));
ctx.shader = &shader;
ctx.stage = stage;


@ -446,7 +446,6 @@ struct si_shader_selector {
ubyte const_and_shader_buf_descriptors_index;
ubyte sampler_and_images_descriptors_index;
bool vs_needs_prolog;
bool prim_discard_cs_allowed;
ubyte cs_shaderbufs_sgpr_index;
ubyte cs_num_shaderbufs_in_user_sgprs;
ubyte cs_images_sgpr_index;
@ -577,7 +576,6 @@ union si_shader_part_key {
unsigned as_ls : 1;
unsigned as_es : 1;
unsigned as_ngg : 1;
unsigned as_prim_discard_cs : 1;
unsigned gs_fast_launch_tri_list : 1; /* for NGG culling */
unsigned gs_fast_launch_tri_strip : 1; /* for NGG culling */
unsigned gs_fast_launch_index_size_packed : 2;
@ -684,14 +682,6 @@ struct si_shader_key {
*/
unsigned prefer_mono : 1;
/* Primitive discard compute shader. */
unsigned vs_as_prim_discard_cs : 1;
unsigned cs_prim_type : 4;
unsigned cs_indexed : 1;
unsigned cs_provoking_vertex_first : 1;
unsigned cs_cull_front : 1;
unsigned cs_cull_back : 1;
/* VS and TCS have the same number of patch vertices. */
unsigned same_patch_vertices:1;


@ -804,9 +804,6 @@ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *part
!same_thread_count && si_is_multi_part_shader(ctx->shader))
ac_build_endif(&ctx->ac, 6507);
/* Return the value from the last part. It's non-void only for the prim
* discard compute shader.
*/
if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
LLVMBuildRetVoid(builder);
else
@ -1116,9 +1113,6 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
parts[num_parts++] = main_fn;
si_build_wrapper_function(&ctx, parts, num_parts, first_is_prolog ? 1 : 0, 0, false);
if (ctx.shader->key.opt.vs_as_prim_discard_cs)
si_build_prim_discard_compute_shader(&ctx);
} else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_EVAL && ngg_cull_main_fn) {
LLVMValueRef parts[3], prolog, main_fn = ctx.main_fn;
@ -1289,8 +1283,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
}
/* Make sure the input is a pointer and not integer followed by inttoptr. */
if (!shader->key.opt.vs_as_prim_discard_cs)
assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind);
assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind);
/* Compile to bytecode. */
if (!si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, &ctx.ac, debug,


@ -431,7 +431,7 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
si_llvm_context_init(&ctx, sscreen, compiler,
si_get_wave_size(sscreen, MESA_SHADER_VERTEX,
false, false, false, false));
false, false, false));
ctx.shader = shader;
ctx.stage = MESA_SHADER_VERTEX;


@ -793,32 +793,6 @@ void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi)
FREE(outputs);
}
static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi)
{
struct si_shader_context *ctx = si_shader_context_from_abi(abi);
struct si_shader_info *info = &ctx->shader->selector->info;
LLVMValueRef *addrs = abi->outputs;
LLVMValueRef pos[4] = {};
assert(info->num_outputs <= AC_LLVM_MAX_OUTPUTS);
for (unsigned i = 0; i < info->num_outputs; i++) {
if (info->output_semantic[i] != VARYING_SLOT_POS)
continue;
for (unsigned chan = 0; chan < 4; chan++)
pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
break;
}
assert(pos[0] != NULL);
/* Return the position output. */
LLVMValueRef ret = ctx->return_value;
for (unsigned chan = 0; chan < 4; chan++)
ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, "");
ctx->return_value = ret;
}
/**
* Build the vertex shader prolog function.
*
@ -1121,8 +1095,6 @@ void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shad
ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
else if (shader->key.as_es)
ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
else if (shader->key.opt.vs_as_prim_discard_cs)
ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
else if (ngg_cull_shader)
ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue;
else if (shader->key.as_ngg)


@ -971,7 +971,7 @@ static void si_emit_draw_registers(struct si_context *sctx,
} \
} while (0)
template <chip_class GFX_VERSION, si_has_ngg NGG, si_has_prim_discard_cs ALLOW_PRIM_DISCARD_CS>
template <chip_class GFX_VERSION, si_has_ngg NGG>
ALWAYS_INLINE
static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw_info *info,
unsigned drawid_base,
@ -980,7 +980,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
unsigned num_draws, unsigned total_count,
struct pipe_resource *indexbuf, unsigned index_size,
unsigned index_offset, unsigned instance_count,
bool dispatch_prim_discard_cs, unsigned original_index_size)
unsigned original_index_size)
{
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
@ -1042,22 +1042,19 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
sctx->last_index_size = index_size;
}
/* If !ALLOW_PRIM_DISCARD_CS, index_size == original_index_size. */
if (!ALLOW_PRIM_DISCARD_CS || original_index_size) {
index_max_size = (indexbuf->width0 - index_offset) >> util_logbase2(original_index_size);
/* Skip draw calls with 0-sized index buffers.
* They cause a hang on some chips, like Navi10-14.
*/
if (!index_max_size) {
radeon_end();
return;
}
index_va = si_resource(indexbuf)->gpu_address + index_offset;
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(indexbuf), RADEON_USAGE_READ,
RADEON_PRIO_INDEX_BUFFER);
index_max_size = (indexbuf->width0 - index_offset) >> util_logbase2(index_size);
/* Skip draw calls with 0-sized index buffers.
* They cause a hang on some chips, like Navi10-14.
*/
if (!index_max_size) {
radeon_end();
return;
}
index_va = si_resource(indexbuf)->gpu_address + index_offset;
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(indexbuf), RADEON_USAGE_READ,
RADEON_PRIO_INDEX_BUFFER);
} else {
/* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
* so the state must be re-emitted before the next indexed draw.
@ -1190,16 +1187,6 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
bool increment_draw_id = num_draws > 1 && set_draw_id && info->increment_draw_id;
if (index_size) {
if (ALLOW_PRIM_DISCARD_CS && dispatch_prim_discard_cs) {
radeon_end();
si_dispatch_prim_discard_cs_and_draw(sctx, info, draws, num_draws,
original_index_size, total_count, index_va,
index_max_size);
EMIT_SQTT_END_DRAW;
return;
}
/* NOT_EOP allows merging multiple draws into 1 wave, but only user VGPRs
* can be changed between draws, and GS fast launch must be disabled.
* NOT_EOP doesn't work on gfx9 and older.
@ -1629,100 +1616,12 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i
info->restart_index, min_vertex_count);
}
static bool si_all_vs_resources_read_only(struct si_context *sctx, struct pipe_resource *indexbuf)
{
struct radeon_winsys *ws = sctx->ws;
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
struct si_descriptors *buffers =
&sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX)];
struct si_shader_selector *vs = sctx->shader.vs.cso;
struct si_vertex_elements *velems = sctx->vertex_elements;
unsigned num_velems = velems->count;
unsigned num_images = vs->info.base.num_images;
/* Index buffer. */
if (indexbuf && ws->cs_is_buffer_referenced(cs, si_resource(indexbuf)->buf, RADEON_USAGE_WRITE))
goto has_write_reference;
/* Vertex buffers. */
for (unsigned i = 0; i < num_velems; i++) {
if (!((1 << i) & velems->first_vb_use_mask))
continue;
unsigned vb_index = velems->vertex_buffer_index[i];
struct pipe_resource *res = sctx->vertex_buffer[vb_index].buffer.resource;
if (!res)
continue;
if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE))
goto has_write_reference;
}
/* Constant and shader buffers. */
for (unsigned i = 0; i < buffers->num_active_slots; i++) {
unsigned index = buffers->first_active_slot + i;
struct pipe_resource *res = sctx->const_and_shader_buffers[PIPE_SHADER_VERTEX].buffers[index];
if (!res)
continue;
if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE))
goto has_write_reference;
}
/* Samplers. */
if (vs->info.base.textures_used[0]) {
unsigned num_samplers = BITSET_LAST_BIT(vs->info.base.textures_used);
for (unsigned i = 0; i < num_samplers; i++) {
struct pipe_sampler_view *view = sctx->samplers[PIPE_SHADER_VERTEX].views[i];
if (!view)
continue;
if (ws->cs_is_buffer_referenced(cs, si_resource(view->texture)->buf, RADEON_USAGE_WRITE))
goto has_write_reference;
}
}
/* Images. */
if (num_images) {
for (unsigned i = 0; i < num_images; i++) {
struct pipe_resource *res = sctx->images[PIPE_SHADER_VERTEX].views[i].resource;
if (!res)
continue;
if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE))
goto has_write_reference;
}
}
return true;
has_write_reference:
/* If the current gfx IB has enough packets, flush it to remove write
* references to buffers.
*/
if (cs->prev_dw + cs->current.cdw > 2048) {
si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
assert(si_all_vs_resources_read_only(sctx, indexbuf));
return true;
}
return false;
}
static ALWAYS_INLINE bool pd_msg(const char *s)
{
if (SI_PRIM_DISCARD_DEBUG)
printf("PD failed: %s\n", s);
return false;
}
#define DRAW_CLEANUP do { \
if (index_size && indexbuf != info->index.resource) \
pipe_resource_reference(&indexbuf, NULL); \
} while (0)
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
si_has_prim_discard_cs ALLOW_PRIM_DISCARD_CS>
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG>
static void si_draw_vbo(struct pipe_context *ctx,
const struct pipe_draw_info *info,
unsigned drawid_offset,
@ -1910,70 +1809,8 @@ static void si_draw_vbo(struct pipe_context *ctx,
info->primitive_restart &&
(!sctx->screen->options.prim_restart_tri_strips_only ||
(prim != PIPE_PRIM_TRIANGLE_STRIP && prim != PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY));
bool dispatch_prim_discard_cs = false;
unsigned original_index_size = index_size;
/* Determine if we can use the primitive discard compute shader. */
/* TODO: this requires that primitives can be drawn out of order, so check depth/stencil/blend states. */
if (ALLOW_PRIM_DISCARD_CS &&
(total_direct_count > sctx->prim_discard_vertex_count_threshold
? (sctx->compute_num_verts_rejected += total_direct_count, true)
: /* Add, then return true. */
(sctx->compute_num_verts_ineligible += total_direct_count,
false)) && /* Add, then return false. */
(!primitive_restart || pd_msg("primitive restart")) &&
/* Supported prim types. */
(1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP)) &&
(instance_count == 1 || pd_msg("instancing")) &&
((drawid_offset == 0 && (num_draws == 1 || !info->increment_draw_id)) ||
!sctx->shader.vs.cso->info.uses_drawid || pd_msg("draw_id > 0")) &&
(!sctx->render_cond || pd_msg("render condition")) &&
/* Forced enablement ignores pipeline statistics queries. */
(sctx->screen->debug_flags & (DBG(PD) | DBG(ALWAYS_PD)) ||
(!sctx->num_pipeline_stat_queries && !sctx->streamout.prims_gen_query_enabled) ||
pd_msg("pipestat or primgen query")) &&
(!sctx->vertex_elements->instance_divisor_is_fetched || pd_msg("loads instance divisors")) &&
(!sctx->shader.ps.cso->info.uses_primid || pd_msg("PS uses PrimID")) &&
!rs->polygon_mode_enabled &&
#if SI_PRIM_DISCARD_DEBUG /* same as cso->prim_discard_cs_allowed */
(!sctx->shader.vs.cso->info.uses_bindless_images || pd_msg("uses bindless images")) &&
(!sctx->shader.vs.cso->info.uses_bindless_samplers || pd_msg("uses bindless samplers")) &&
(!sctx->shader.vs.cso->info.base.writes_memory || pd_msg("writes memory")) &&
(!sctx->shader.vs.cso->info.writes_viewport_index || pd_msg("writes viewport index")) &&
!sctx->shader.vs.cso->info.base.vs.window_space_position &&
!sctx->shader.vs.cso->so.num_outputs &&
#else
(sctx->shader.vs.cso->prim_discard_cs_allowed ||
pd_msg("VS shader uses unsupported features")) &&
#endif
/* Check that all buffers are used for read only, because compute
* dispatches can run ahead. */
(si_all_vs_resources_read_only(sctx, index_size ? indexbuf : NULL) ||
pd_msg("write reference"))) {
switch (si_prepare_prim_discard_or_split_draw(sctx, info, drawid_offset, draws, num_draws,
total_direct_count)) {
case SI_PRIM_DISCARD_ENABLED:
original_index_size = index_size;
dispatch_prim_discard_cs = true;
/* The compute shader changes/lowers the following: */
prim = PIPE_PRIM_TRIANGLES;
index_size = 4;
instance_count = 1;
sctx->compute_num_verts_rejected -= total_direct_count;
sctx->compute_num_verts_accepted += total_direct_count;
break;
case SI_PRIM_DISCARD_DISABLED:
break;
case SI_PRIM_DISCARD_DRAW_SPLIT:
case SI_PRIM_DISCARD_MULTI_DRAW_SPLIT:
sctx->compute_num_verts_rejected -= total_direct_count;
/* The multi draw was split into multiple ones and executed. Return. */
DRAW_CLEANUP;
return;
}
}
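A note on the shape of the condition removed above: every requirement is written as (condition || pd_msg("reason")), so the first failing requirement both short-circuits the chain and, when SI_PRIM_DISCARD_DEBUG is enabled, logs why the compute path was skipped. A self-contained illustration of the idiom follows; the stand-in macro is set to 1 so the example prints, whereas the removed code defaults to 0.

#include <cstdio>

#define PRIM_DISCARD_DEBUG 1   /* stand-in for SI_PRIM_DISCARD_DEBUG */

/* Log the reason a requirement failed, then make the whole chain false. */
static inline bool pd_msg(const char *s)
{
   if (PRIM_DISCARD_DEBUG)
      printf("PD failed: %s\n", s);
   return false;
}

int main()
{
   bool primitive_restart = true;   /* example state that disqualifies the fast path */
   int instance_count = 1;

   bool eligible =
      (!primitive_restart || pd_msg("primitive restart")) &&
      (instance_count == 1 || pd_msg("instancing"));

   printf("eligible: %d\n", eligible);   /* prints "PD failed: primitive restart", then 0 */
   return 0;
}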
/* Set the rasterization primitive type.
*
* This must be done after si_decompress_textures, which can call
@ -2005,7 +1842,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
if (GFX_VERSION >= GFX10) {
struct si_shader_selector *hw_vs = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->cso;
if (NGG && !HAS_GS && !dispatch_prim_discard_cs &&
if (NGG && !HAS_GS &&
/* Tessellation sets ngg_cull_vert_threshold to UINT_MAX if the prim type
* is not triangles, so this check is only needed without tessellation. */
(HAS_TESS || sctx->current_rast_prim == PIPE_PRIM_TRIANGLES) &&
@ -2154,10 +1991,9 @@ static void si_draw_vbo(struct pipe_context *ctx,
}
assert(sctx->dirty_atoms == 0);
si_emit_draw_packets<GFX_VERSION, NGG, ALLOW_PRIM_DISCARD_CS>
si_emit_draw_packets<GFX_VERSION, NGG>
(sctx, info, drawid_offset, indirect, draws, num_draws, total_direct_count, indexbuf,
index_size, index_offset, instance_count, dispatch_prim_discard_cs,
original_index_size);
index_size, index_offset, instance_count, original_index_size);
/* <-- CUs are busy here. */
/* Start prefetches after the draw has been started. Both will run
@ -2193,10 +2029,9 @@ static void si_draw_vbo(struct pipe_context *ctx,
}
assert(sctx->dirty_atoms == 0);
si_emit_draw_packets<GFX_VERSION, NGG, ALLOW_PRIM_DISCARD_CS>
si_emit_draw_packets<GFX_VERSION, NGG>
(sctx, info, drawid_offset, indirect, draws, num_draws, total_direct_count, indexbuf,
index_size, index_offset, instance_count, dispatch_prim_discard_cs,
original_index_size);
index_size, index_offset, instance_count, original_index_size);
/* Prefetch the remaining shaders after the draw has been
* started. */
@ -2281,40 +2116,27 @@ static void si_draw_rectangle(struct blitter_context *blitter, void *vertex_elem
pipe->draw_vbo(pipe, &info, 0, NULL, &draw, 1);
}
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS,
si_has_ngg NGG, si_has_prim_discard_cs ALLOW_PRIM_DISCARD_CS>
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG>
static void si_init_draw_vbo(struct si_context *sctx)
{
/* Prim discard CS is only useful on gfx7+ because gfx6 doesn't have async compute. */
if (ALLOW_PRIM_DISCARD_CS && GFX_VERSION < GFX8)
return;
if (ALLOW_PRIM_DISCARD_CS && (HAS_TESS || HAS_GS))
return;
if (NGG && GFX_VERSION < GFX10)
return;
sctx->draw_vbo[HAS_TESS][HAS_GS][NGG][ALLOW_PRIM_DISCARD_CS] =
si_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG, ALLOW_PRIM_DISCARD_CS>;
}
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS>
static void si_init_draw_vbo_all_internal_options(struct si_context *sctx)
{
si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_OFF, PRIM_DISCARD_CS_OFF>(sctx);
si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_OFF, PRIM_DISCARD_CS_ON>(sctx);
si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_ON, PRIM_DISCARD_CS_OFF>(sctx);
si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_ON, PRIM_DISCARD_CS_ON>(sctx);
sctx->draw_vbo[HAS_TESS][HAS_GS][NGG] =
si_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG>;
}
template <chip_class GFX_VERSION>
static void si_init_draw_vbo_all_pipeline_options(struct si_context *sctx)
{
si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_OFF, GS_OFF>(sctx);
si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_OFF, GS_ON>(sctx);
si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_ON, GS_OFF>(sctx);
si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_ON, GS_ON>(sctx);
si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_OFF, NGG_OFF>(sctx);
si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_ON, NGG_OFF>(sctx);
si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_OFF, NGG_OFF>(sctx);
si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_ON, NGG_OFF>(sctx);
si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_OFF, NGG_ON>(sctx);
si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_ON, NGG_ON>(sctx);
si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_OFF, NGG_ON>(sctx);
si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_ON, NGG_ON>(sctx);
}
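For readers unfamiliar with this dispatch scheme, which the commit shrinks from a [2][2][2][2] table to [2][2][2]: one si_draw_vbo instantiation exists per (tess, gs, ngg) combination and is selected at draw time by indexing the table. A stripped-down sketch of the pattern follows; the names and the body of the draw function are stand-ins, and the real code also skips invalid combinations such as NGG before GFX10.

#include <cstdio>

enum si_has_tess { TESS_OFF, TESS_ON };
enum si_has_gs   { GS_OFF,   GS_ON   };
enum si_has_ngg  { NGG_OFF,  NGG_ON  };

using draw_vbo_func = void (*)();

/* One compile-time specialization per pipeline combination. */
template <si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG>
static void draw_vbo()
{
   printf("draw_vbo<tess=%d, gs=%d, ngg=%d>\n", HAS_TESS, HAS_GS, NGG);
}

static draw_vbo_func draw_table[2][2][2];

template <si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG>
static void init_draw_vbo()
{
   draw_table[HAS_TESS][HAS_GS][NGG] = draw_vbo<HAS_TESS, HAS_GS, NGG>;
}

int main()
{
   init_draw_vbo<TESS_OFF, GS_OFF, NGG_OFF>();
   init_draw_vbo<TESS_OFF, GS_OFF, NGG_ON>();
   /* the remaining six combinations are registered the same way */

   bool has_tess = false, has_gs = false, ngg = true;
   draw_table[has_tess][has_gs][ngg]();   /* mirrors si_select_draw_vbo */
   return 0;
}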
static void si_invalid_draw_vbo(struct pipe_context *pipe,


@ -81,8 +81,8 @@
* Right half: {1,3,5,7,9,11,13,15}
*/
/* Important note: We have to use the standard DX positions, because
* the primitive discard compute shader relies on them.
/* Important note: We have to use the standard DX positions because shader-based culling
* relies on them.
*/
/* 1x MSAA */


@ -70,7 +70,7 @@ void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es,
shader_variant_flags |= 1 << 0;
if (sel->nir)
shader_variant_flags |= 1 << 1;
if (si_get_wave_size(sel->screen, sel->info.stage, ngg, es, false, false) == 32)
if (si_get_wave_size(sel->screen, sel->info.stage, ngg, es, false) == 32)
shader_variant_flags |= 1 << 2;
if (sel->info.stage == MESA_SHADER_FRAGMENT &&
/* Derivatives imply helper invocations so check for needs_quad_helper_invocations. */
@ -78,11 +78,9 @@ void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es,
sel->info.base.fs.uses_discard &&
sel->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL))
shader_variant_flags |= 1 << 3;
if (sel->info.stage == MESA_SHADER_VERTEX) {
/* This varies depending on whether compute-based culling is enabled. */
assert(sel->screen->num_vbos_in_user_sgprs <= 7);
shader_variant_flags |= MIN2(sel->screen->num_vbos_in_user_sgprs, 7) << 4;
}
/* bit gap */
if (sel->screen->options.no_infinite_interp)
shader_variant_flags |= 1 << 7;
if (sel->screen->options.clamp_div_by_zero)
@ -2291,10 +2289,8 @@ current_not_ready:
/* Compile the main shader part if it doesn't exist. This can happen
* if the initial guess was wrong.
*
* The prim discard CS doesn't need the main shader part.
*/
if (!is_pure_monolithic && !key->opt.vs_as_prim_discard_cs) {
if (!is_pure_monolithic) {
bool ok = true;
/* Make sure the main shader part is present. This is needed
@ -2348,8 +2344,7 @@ current_not_ready:
shader->is_monolithic =
is_pure_monolithic || memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
/* The prim discard CS is always optimized. */
shader->is_optimized = (!is_pure_monolithic || key->opt.vs_as_prim_discard_cs) &&
shader->is_optimized = !is_pure_monolithic &&
memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
/* If it's an optimized shader, compile it asynchronously. */
@ -2706,12 +2701,6 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
sel->vs_needs_prolog = sel->info.stage == MESA_SHADER_VERTEX && sel->info.num_inputs &&
!sel->info.base.vs.blit_sgprs_amd;
sel->prim_discard_cs_allowed =
sel->info.stage == MESA_SHADER_VERTEX && !sel->info.uses_bindless_images &&
!sel->info.uses_bindless_samplers && !sel->info.base.writes_memory &&
!sel->info.writes_viewport_index &&
!sel->info.base.vs.window_space_position && !sel->so.num_outputs;
if (sel->info.stage == MESA_SHADER_VERTEX ||
sel->info.stage == MESA_SHADER_TESS_CTRL ||
sel->info.stage == MESA_SHADER_TESS_EVAL ||


@ -771,9 +771,6 @@ static unsigned amdgpu_ib_max_submit_dwords(enum ib_type ib_type)
* http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1
*/
return 20 * 1024;
case IB_PARALLEL_COMPUTE:
/* Always chain this IB. */
return UINT_MAX;
default:
unreachable("bad ib_type");
}
@ -908,9 +905,6 @@ static bool amdgpu_init_cs_context(struct amdgpu_winsys *ws,
assert(0);
}
cs->ib[IB_PARALLEL_COMPUTE].ip_type = AMDGPU_HW_IP_COMPUTE;
cs->ib[IB_PARALLEL_COMPUTE].flags = AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE;
cs->last_added_bo = NULL;
return true;
}
@ -938,8 +932,6 @@ static void amdgpu_cs_context_cleanup(struct amdgpu_winsys *ws, struct amdgpu_cs
cleanup_fence_list(&cs->fence_dependencies);
cleanup_fence_list(&cs->syncobj_dependencies);
cleanup_fence_list(&cs->syncobj_to_signal);
cleanup_fence_list(&cs->compute_fence_dependencies);
cleanup_fence_list(&cs->compute_start_fence_dependencies);
cs->num_real_buffers = 0;
cs->num_slab_buffers = 0;
@ -957,8 +949,6 @@ static void amdgpu_destroy_cs_context(struct amdgpu_winsys *ws, struct amdgpu_cs
FREE(cs->fence_dependencies.list);
FREE(cs->syncobj_dependencies.list);
FREE(cs->syncobj_to_signal.list);
FREE(cs->compute_fence_dependencies.list);
FREE(cs->compute_start_fence_dependencies.list);
}
@ -997,7 +987,6 @@ amdgpu_cs_create(struct radeon_cmdbuf *rcs,
amdgpu_cs_chunk_fence_info_to_data(&fence_info, (void*)&cs->fence_chunk);
cs->main.ib_type = IB_MAIN;
cs->compute_ib.ib_type = IB_PARALLEL_COMPUTE;
if (!amdgpu_init_cs_context(ctx->ws, &cs->csc1, ring_type)) {
FREE(cs);
@ -1035,37 +1024,6 @@ amdgpu_cs_create(struct radeon_cmdbuf *rcs,
return true;
}
static bool
amdgpu_cs_add_parallel_compute_ib(struct radeon_cmdbuf *compute_cs,
struct radeon_cmdbuf *gfx_cs,
bool uses_gds_ordered_append)
{
struct amdgpu_cs *cs = amdgpu_cs(gfx_cs);
struct amdgpu_winsys *ws = cs->ws;
if (cs->ring_type != RING_GFX)
return false;
/* only one secondary IB can be added */
if (cs->compute_ib.ib_mapped)
return false;
/* Allocate the compute IB. */
if (!amdgpu_get_new_ib(ws, compute_cs, &cs->compute_ib, cs))
return false;
if (uses_gds_ordered_append) {
cs->csc1.ib[IB_PARALLEL_COMPUTE].flags |=
AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID;
cs->csc2.ib[IB_PARALLEL_COMPUTE].flags |=
AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID;
}
cs->compute_ib.rcs = compute_cs;
compute_cs->priv = cs;
return true;
}
static bool
amdgpu_cs_setup_preemption(struct radeon_cmdbuf *rcs, const uint32_t *preamble_ib,
unsigned preamble_num_dw)
@ -1128,7 +1086,7 @@ static bool amdgpu_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw,
bool force_chaining)
{
struct amdgpu_cs *cs = amdgpu_cs(rcs);
struct amdgpu_ib *ib = rcs == cs->main.rcs ? &cs->main : &cs->compute_ib;
struct amdgpu_ib *ib = &cs->main;
unsigned cs_epilog_dw = amdgpu_cs_epilog_dws(cs);
unsigned need_byte_size = (dw + cs_epilog_dw) * 4;
@ -1286,18 +1244,6 @@ static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rws,
util_queue_fence_wait(&fence->submitted);
if (dependency_flags & RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY) {
/* Syncobjs are not needed here. */
assert(!amdgpu_fence_is_syncobj(fence));
if (acs->ws->info.has_scheduled_fence_dependency &&
dependency_flags & RADEON_DEPENDENCY_START_FENCE)
add_fence_to_list(&cs->compute_start_fence_dependencies, fence);
else
add_fence_to_list(&cs->compute_fence_dependencies, fence);
return;
}
/* Start fences are not needed here. */
assert(!(dependency_flags & RADEON_DEPENDENCY_START_FENCE));
@ -1589,66 +1535,6 @@ static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
num_chunks++;
}
/* Submit the parallel compute IB first. */
if (cs->ib[IB_PARALLEL_COMPUTE].ib_bytes > 0) {
unsigned old_num_chunks = num_chunks;
/* Add compute fence dependencies. */
unsigned num_dependencies = cs->compute_fence_dependencies.num;
if (num_dependencies) {
struct drm_amdgpu_cs_chunk_dep *dep_chunk =
alloca(num_dependencies * sizeof(*dep_chunk));
for (unsigned i = 0; i < num_dependencies; i++) {
struct amdgpu_fence *fence =
(struct amdgpu_fence*)cs->compute_fence_dependencies.list[i];
assert(util_queue_fence_is_signalled(&fence->submitted));
amdgpu_cs_chunk_fence_to_dep(&fence->fence, &dep_chunk[i]);
}
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_DEPENDENCIES;
chunks[num_chunks].length_dw = sizeof(dep_chunk[0]) / 4 * num_dependencies;
chunks[num_chunks].chunk_data = (uintptr_t)dep_chunk;
num_chunks++;
}
/* Add compute start fence dependencies. */
unsigned num_start_dependencies = cs->compute_start_fence_dependencies.num;
if (num_start_dependencies) {
struct drm_amdgpu_cs_chunk_dep *dep_chunk =
alloca(num_start_dependencies * sizeof(*dep_chunk));
for (unsigned i = 0; i < num_start_dependencies; i++) {
struct amdgpu_fence *fence =
(struct amdgpu_fence*)cs->compute_start_fence_dependencies.list[i];
assert(util_queue_fence_is_signalled(&fence->submitted));
amdgpu_cs_chunk_fence_to_dep(&fence->fence, &dep_chunk[i]);
}
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES;
chunks[num_chunks].length_dw = sizeof(dep_chunk[0]) / 4 * num_start_dependencies;
chunks[num_chunks].chunk_data = (uintptr_t)dep_chunk;
num_chunks++;
}
/* Convert from dwords to bytes. */
cs->ib[IB_PARALLEL_COMPUTE].ib_bytes *= 4;
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
chunks[num_chunks].chunk_data = (uintptr_t)&cs->ib[IB_PARALLEL_COMPUTE];
num_chunks++;
r = acs->noop ? 0 : amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, bo_list,
num_chunks, chunks, NULL);
if (r)
goto finalize;
/* Back off the compute chunks. */
num_chunks = old_num_chunks;
}
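The removed path above appends compute-only chunks to the shared chunk array, submits them first, and then rewinds num_chunks so the gfx submission reuses only the common prefix. A toy model of that save/append/submit/rewind pattern follows; the chunk contents are descriptive stand-ins for drm_amdgpu_cs_chunk entries.

#include <cstdio>

struct chunk { const char *what; };   /* stand-in for drm_amdgpu_cs_chunk */

static void submit(const char *which, const chunk *chunks, unsigned num_chunks)
{
   printf("%s submission, %u chunks:", which, num_chunks);
   for (unsigned i = 0; i < num_chunks; i++)
      printf(" [%s]", chunks[i].what);
   printf("\n");
}

int main()
{
   chunk chunks[8];
   unsigned num_chunks = 0;

   chunks[num_chunks++].what = "fence / buffer-list chunk";   /* shared prefix */

   /* Append the compute-only chunks and submit the parallel compute IB first. */
   unsigned old_num_chunks = num_chunks;
   chunks[num_chunks++].what = "compute fence dependencies";
   chunks[num_chunks++].what = "parallel compute IB";
   submit("compute", chunks, num_chunks);

   /* Back off the compute chunks, then finish building the gfx submission. */
   num_chunks = old_num_chunks;
   chunks[num_chunks++].what = "gfx IB";
   submit("gfx", chunks, num_chunks);
   return 0;
}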
/* Syncobj signals. */
unsigned num_syncobj_to_signal = cs->syncobj_to_signal.num;
if (num_syncobj_to_signal) {
@ -1706,7 +1592,7 @@ static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
r = acs->noop ? 0 : amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, bo_list,
num_chunks, chunks, &seq_no);
}
finalize:
if (r) {
if (r == -ENOMEM)
fprintf(stderr, "amdgpu: Not enough memory for command submission.\n");
@ -1798,12 +1684,6 @@ static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
}
if (cs->ring_type == RING_GFX)
ws->gfx_ib_size_counter += (rcs->prev_dw + rcs->current.cdw) * 4;
/* Also pad secondary IBs. */
if (cs->compute_ib.ib_mapped) {
while (cs->compute_ib.rcs->current.cdw & ib_pad_dw_mask)
radeon_emit(cs->compute_ib.rcs, PKT3_NOP_PAD);
}
break;
case RING_UVD:
case RING_UVD_ENC:
@ -1839,9 +1719,6 @@ static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
/* Set IB sizes. */
amdgpu_ib_finalize(ws, rcs, &cs->main);
if (cs->compute_ib.ib_mapped)
amdgpu_ib_finalize(ws, cs->compute_ib.rcs, &cs->compute_ib);
/* Create a fence. */
amdgpu_fence_reference(&cur->fence, NULL);
if (cs->next_fence) {
@ -1897,8 +1774,6 @@ static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
memset(cs->csc->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));
amdgpu_get_new_ib(ws, rcs, &cs->main, cs);
if (cs->compute_ib.ib_mapped)
amdgpu_get_new_ib(ws, cs->compute_ib.rcs, &cs->compute_ib, cs);
if (cs->preamble_ib_bo) {
amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo, RADEON_USAGE_READ, 0,
@ -1929,9 +1804,6 @@ static void amdgpu_cs_destroy(struct radeon_cmdbuf *rcs)
radeon_bo_reference(&cs->ws->dummy_ws.base, &cs->preamble_ib_bo, NULL);
radeon_bo_reference(&cs->ws->dummy_ws.base, &cs->main.big_ib_buffer, NULL);
FREE(rcs->prev);
radeon_bo_reference(&cs->ws->dummy_ws.base, &cs->compute_ib.big_ib_buffer, NULL);
if (cs->compute_ib.rcs)
FREE(cs->compute_ib.rcs->prev);
amdgpu_destroy_cs_context(cs->ws, &cs->csc1);
amdgpu_destroy_cs_context(cs->ws, &cs->csc2);
amdgpu_fence_reference(&cs->next_fence, NULL);
@ -1954,7 +1826,6 @@ void amdgpu_cs_init_functions(struct amdgpu_screen_winsys *ws)
ws->base.ctx_destroy = amdgpu_ctx_destroy;
ws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status;
ws->base.cs_create = amdgpu_cs_create;
ws->base.cs_add_parallel_compute_ib = amdgpu_cs_add_parallel_compute_ib;
ws->base.cs_setup_preemption = amdgpu_cs_setup_preemption;
ws->base.cs_destroy = amdgpu_cs_destroy;
ws->base.cs_add_buffer = amdgpu_cs_add_buffer;


@ -58,7 +58,6 @@ struct amdgpu_cs_buffer {
enum ib_type {
IB_PREAMBLE,
IB_MAIN,
IB_PARALLEL_COMPUTE,
IB_NUM,
};
@ -115,10 +114,6 @@ struct amdgpu_cs_context {
struct amdgpu_fence_list syncobj_dependencies;
struct amdgpu_fence_list syncobj_to_signal;
/* The compute IB uses the dependencies above + these: */
struct amdgpu_fence_list compute_fence_dependencies;
struct amdgpu_fence_list compute_start_fence_dependencies;
struct pipe_fence_handle *fence;
/* the error returned from cs_flush for non-async submissions */
@ -132,7 +127,6 @@ struct amdgpu_cs_context {
struct amdgpu_cs {
struct amdgpu_ib main; /* must be first because this is inherited */
struct amdgpu_ib compute_ib; /* optional parallel compute IB */
struct amdgpu_winsys *ws;
struct amdgpu_ctx *ctx;
enum ring_type ring_type;