radeonsi: remove the primitive discard compute shader

It doesn't always work, it's only useful on gfx9 and older, and it's too complicated.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/4011
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12812>

This commit is contained in: parent 9e994560ff, commit 576f8394db
22 changed files with 62 additions and 1791 deletions

@@ -773,12 +773,6 @@ radeonsi driver environment variables
   Always use NGG culling even when it can hurt.
``nonggc``
   Disable NGG culling.
``alwayspd``
   Always enable the primitive discard compute shader.
``pd``
   Enable the primitive discard compute shader for large draw calls.
``nopd``
   Disable the primitive discard compute shader.
``switch_on_eop``
   Program WD/IA to switch on end-of-packet.
``nooutoforder``

@@ -80,9 +80,6 @@ enum radeon_bo_flag
enum radeon_dependency_flag
{
   /* Add the dependency to the parallel compute IB only. */
   RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY = 1 << 0,

   /* Instead of waiting for a job to finish execution, the dependency will
    * be signaled when the job starts execution.
    */

@@ -512,26 +509,6 @@ struct radeon_winsys {
                                    struct pipe_fence_handle **fence),
                     void *flush_ctx, bool stop_exec_on_failure);

   /**
    * Add a parallel compute IB to a gfx IB. It will share the buffer list
    * and fence dependencies with the gfx IB. The gfx flush call will submit
    * both IBs at the same time.
    *
    * The compute IB doesn't have an output fence, so the primary IB has
    * to use a wait packet for synchronization.
    *
    * The returned IB is only a stream for writing packets to the new
    * IB. The only function that can be used on the compute cs is cs_check_space.
    *
    * \param compute_cs   The returned structure of the command stream.
    * \param gfx_cs       Gfx IB
    *
    * \return true on success
    */
   bool (*cs_add_parallel_compute_ib)(struct radeon_cmdbuf *compute_cs,
                                      struct radeon_cmdbuf *gfx_cs,
                                      bool uses_gds_ordered_append);

   /**
    * Set up and enable mid command buffer preemption for the command stream.
    *

@@ -27,7 +27,6 @@ files_libradeonsi = files(
  'si_build_pm4.h',
  'si_clear.c',
  'si_compute.c',
  'si_compute_prim_discard.c',
  'si_compute.h',
  'si_compute_blit.c',
  'si_cp_dma.c',

(File diff suppressed because it is too large.)

@@ -230,10 +230,8 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
      sdst->TC_L2_dirty = true;

   /* If it's not a framebuffer fast clear... */
   if (coher == SI_COHERENCY_SHADER) {
   if (coher == SI_COHERENCY_SHADER)
      sctx->num_cp_dma_calls++;
      si_prim_discard_signal_next_compute_ib_start(sctx);
   }
}

/**

@@ -387,10 +385,8 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
      si_resource(dst)->TC_L2_dirty = true;

   /* If it's not a prefetch or GDS copy... */
   if (dst && src && (dst != src || dst_offset != src_offset)) {
   if (dst && src && (dst != src || dst_offset != src_offset))
      sctx->num_cp_dma_calls++;
      si_prim_discard_signal_next_compute_ib_start(sctx);
   }
}

void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf,

@@ -344,7 +344,6 @@ struct si_log_chunk_cs {
   struct si_saved_cs *cs;
   bool dump_bo_list;
   unsigned gfx_begin, gfx_end;
   unsigned compute_begin, compute_end;
};

static void si_log_chunk_type_cs_destroy(void *data)

@@ -402,7 +401,6 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f)
   struct si_context *ctx = chunk->ctx;
   struct si_saved_cs *scs = chunk->cs;
   int last_trace_id = -1;
   int last_compute_trace_id = -1;

   /* We are expecting that the ddebug pipe has already
    * waited for the context, so this buffer should be idle.

@@ -410,10 +408,8 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f)
    */
   uint32_t *map = ctx->ws->buffer_map(ctx->ws, scs->trace_buf->buf, NULL,
                                       PIPE_MAP_UNSYNCHRONIZED | PIPE_MAP_READ);
   if (map) {
   if (map)
      last_trace_id = map[0];
      last_compute_trace_id = map[1];
   }

   if (chunk->gfx_end != chunk->gfx_begin) {
      if (chunk->gfx_begin == 0) {

@@ -435,20 +431,6 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f)
      }
   }

   if (chunk->compute_end != chunk->compute_begin) {
      assert(ctx->prim_discard_compute_cs.priv);

      if (scs->flushed) {
         ac_parse_ib(f, scs->compute.ib + chunk->compute_begin,
                     chunk->compute_end - chunk->compute_begin, &last_compute_trace_id, map ? 1 : 0,
                     "Compute IB", ctx->chip_class, NULL, NULL);
      } else {
         si_parse_current_ib(f, &ctx->prim_discard_compute_cs, chunk->compute_begin,
                             chunk->compute_end, &last_compute_trace_id, map ? 1 : 0, "Compute IB",
                             ctx->chip_class);
      }
   }

   if (chunk->dump_bo_list) {
      fprintf(f, "Flushing. Time: ");
      util_dump_ns(f, scs->time_flush);

@@ -468,13 +450,8 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log, bool du

   struct si_saved_cs *scs = ctx->current_saved_cs;
   unsigned gfx_cur = ctx->gfx_cs.prev_dw + ctx->gfx_cs.current.cdw;
   unsigned compute_cur = 0;

   if (ctx->prim_discard_compute_cs.priv)
      compute_cur =
         ctx->prim_discard_compute_cs.prev_dw + ctx->prim_discard_compute_cs.current.cdw;

   if (!dump_bo_list && gfx_cur == scs->gfx_last_dw && compute_cur == scs->compute_last_dw)
   if (!dump_bo_list && gfx_cur == scs->gfx_last_dw)
      return;

   struct si_log_chunk_cs *chunk = calloc(1, sizeof(*chunk));

@@ -487,10 +464,6 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log, bool du
   chunk->gfx_end = gfx_cur;
   scs->gfx_last_dw = gfx_cur;

   chunk->compute_begin = scs->compute_last_dw;
   chunk->compute_end = compute_cur;
   scs->compute_last_dw = compute_cur;

   u_log_chunk(log, &si_log_chunk_type_cs, chunk);
}

@@ -73,7 +73,7 @@ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, unsigne
                         EVENT_INDEX(event == V_028A90_CS_DONE || event == V_028A90_PS_DONE ? 6 : 5) |
                         event_flags;
   unsigned sel = EOP_DST_SEL(dst_sel) | EOP_INT_SEL(int_sel) | EOP_DATA_SEL(data_sel);
   bool compute_ib = !ctx->has_graphics || cs == &ctx->prim_discard_compute_cs;
   bool compute_ib = !ctx->has_graphics;

   radeon_begin(cs);

@@ -92,9 +92,6 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h

   ctx->gfx_flush_in_progress = true;

   if (radeon_emitted(&ctx->prim_discard_compute_cs, 0))
      si_compute_signal_gfx(ctx);

   if (ctx->has_graphics) {
      if (!list_is_empty(&ctx->active_queries))
         si_suspend_queries(ctx);

@@ -136,29 +133,6 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h
      si_log_hw_flush(ctx);
   }

   if (si_compute_prim_discard_enabled(ctx)) {
      /* The compute IB can start after the previous gfx IB starts. */
      if (radeon_emitted(&ctx->prim_discard_compute_cs, 0) && ctx->last_gfx_fence) {
         ctx->ws->cs_add_fence_dependency(
            &ctx->gfx_cs, ctx->last_gfx_fence,
            RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY | RADEON_DEPENDENCY_START_FENCE);
      }

      /* Remember the last execution barrier. It's in the IB.
       * It will signal the start of the next compute IB.
       */
      if (flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW && ctx->last_pkt3_write_data) {
         *ctx->last_pkt3_write_data = PKT3(PKT3_WRITE_DATA, 3, 0);
         ctx->last_pkt3_write_data = NULL;

         si_resource_reference(&ctx->last_ib_barrier_buf, ctx->barrier_buf);
         ctx->last_ib_barrier_buf_offset = ctx->barrier_buf_offset;
         si_resource_reference(&ctx->barrier_buf, NULL);

         ws->fence_reference(&ctx->last_ib_barrier_fence, NULL);
      }
   }

   if (ctx->is_noop)
      flags |= RADEON_FLUSH_NOOP;

@@ -171,17 +145,6 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h

   ctx->num_gfx_cs_flushes++;

   if (si_compute_prim_discard_enabled(ctx)) {
      /* Remember the last execution barrier, which is the last fence
       * in this case.
       */
      if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) {
         ctx->last_pkt3_write_data = NULL;
         si_resource_reference(&ctx->last_ib_barrier_buf, NULL);
         ws->fence_reference(&ctx->last_ib_barrier_fence, ctx->last_gfx_fence);
      }
   }

   /* Check VM faults if needed. */
   if (sscreen->debug_flags & DBG(CHECK_VM)) {
      /* Use conservative timeout 800ms, after which we won't wait any

@@ -216,7 +179,7 @@ static void si_begin_gfx_cs_debug(struct si_context *ctx)
   pipe_reference_init(&ctx->current_saved_cs->reference, 1);

   ctx->current_saved_cs->trace_buf =
      si_resource(pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 8));
      si_resource(pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 4));
   if (!ctx->current_saved_cs->trace_buf) {
      free(ctx->current_saved_cs);
      ctx->current_saved_cs = NULL;

@@ -368,11 +331,6 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
   bool is_secure = false;

   if (unlikely(radeon_uses_secure_bos(ctx->ws))) {
      /* Disable features that don't work with TMZ:
       * - primitive discard
       */
      ctx->prim_discard_vertex_count_threshold = UINT_MAX;

      is_secure = ctx->ws->cs_is_secure(&ctx->gfx_cs);

      si_install_draw_wrapper(ctx, si_draw_vbo_tmz_preamble);

@@ -549,18 +507,6 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)

   assert(!ctx->gfx_cs.prev_dw);
   ctx->initial_gfx_cs_size = ctx->gfx_cs.current.cdw;
   ctx->prim_discard_compute_ib_initialized = false;

   /* Compute-based primitive discard:
    * The index ring is divided into 2 halves. Switch between the halves
    * in the same fashion as doublebuffering.
    */
   if (ctx->index_ring_base)
      ctx->index_ring_base = 0;
   else
      ctx->index_ring_base = ctx->index_ring_size_per_ib;

   ctx->index_ring_offset = 0;

   /* All buffer references are removed on a flush, so si_check_needs_implicit_sync
    * cannot determine if si_make_CB_shader_coherent() needs to be called.

@@ -586,34 +532,9 @@ void si_trace_emit(struct si_context *sctx)
      u_log_flush(sctx->log);
}

void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx)
{
   if (!si_compute_prim_discard_enabled(sctx))
      return;

   if (!sctx->barrier_buf) {
      u_suballocator_alloc(&sctx->allocator_zeroed_memory, 4, 4, &sctx->barrier_buf_offset,
                           (struct pipe_resource **)&sctx->barrier_buf);
   }

   /* Emit a placeholder to signal the next compute IB to start.
    * See si_compute_prim_discard.c for explanation.
    */
   uint32_t signal = 1;
   si_cp_write_data(sctx, sctx->barrier_buf, sctx->barrier_buf_offset, 4, V_370_MEM, V_370_ME,
                    &signal);

   sctx->last_pkt3_write_data = &sctx->gfx_cs.current.buf[sctx->gfx_cs.current.cdw - 5];

   /* Only the last occurrence of WRITE_DATA will be executed.
    * The packet will be enabled in si_flush_gfx_cs.
    */
   *sctx->last_pkt3_write_data = PKT3(PKT3_NOP, 3, 0);
}

void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned cp_coher_cntl)
{
   bool compute_ib = !sctx->has_graphics || cs == &sctx->prim_discard_compute_cs;
   bool compute_ib = !sctx->has_graphics;

   assert(sctx->chip_class <= GFX9);

@@ -857,14 +778,6 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)

   uint32_t cp_coher_cntl = 0;
   const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB);
   const bool is_barrier =
      flush_cb_db ||
      /* INV_ICACHE == beginning of gfx IB. Checking
       * INV_ICACHE fixes corruption for DeusExMD with
       * compute-based culling, but I don't know why.
       */
      flags & (SI_CONTEXT_INV_ICACHE | SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_VS_PARTIAL_FLUSH) ||
      (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy);

   assert(sctx->chip_class <= GFX9);

@@ -1077,9 +990,6 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
      radeon_end();
   }

   if (is_barrier)
      si_prim_discard_signal_next_compute_ib_start(sctx);

   if (flags & SI_CONTEXT_START_PIPELINE_STATS && sctx->pipeline_stats_enabled != 1) {
      radeon_begin(cs);
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));

@@ -95,9 +95,6 @@ static const struct debug_named_value radeonsi_debug_options[] = {
   {"nggc", DBG(ALWAYS_NGG_CULLING_ALL), "Always use NGG culling even when it can hurt."},
   {"nggctess", DBG(ALWAYS_NGG_CULLING_TESS), "Always use NGG culling for tessellation."},
   {"nonggc", DBG(NO_NGG_CULLING), "Disable NGG culling."},
   {"alwayspd", DBG(ALWAYS_PD), "Always enable the primitive discard compute shader."},
   {"pd", DBG(PD), "Enable the primitive discard compute shader for large draw calls."},
   {"nopd", DBG(NO_PD), "Disable the primitive discard compute shader."},
   {"switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet."},
   {"nooutoforder", DBG(NO_OUT_OF_ORDER), "Disable out-of-order rasterization"},
   {"nodpbb", DBG(NO_DPBB), "Disable DPBB."},

@@ -309,12 +306,8 @@ static void si_destroy_context(struct pipe_context *context)
   u_suballocator_destroy(&sctx->allocator_zeroed_memory);

   sctx->ws->fence_reference(&sctx->last_gfx_fence, NULL);
   sctx->ws->fence_reference(&sctx->last_ib_barrier_fence, NULL);
   si_resource_reference(&sctx->eop_bug_scratch, NULL);
   si_resource_reference(&sctx->eop_bug_scratch_tmz, NULL);
   si_resource_reference(&sctx->index_ring, NULL);
   si_resource_reference(&sctx->barrier_buf, NULL);
   si_resource_reference(&sctx->last_ib_barrier_buf, NULL);
   si_resource_reference(&sctx->shadowed_regs, NULL);
   radeon_bo_reference(sctx->screen->ws, &sctx->gds, NULL);
   radeon_bo_reference(sctx->screen->ws, &sctx->gds_oa, NULL);

@@ -618,12 +611,6 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
      default:
         unreachable("unhandled chip class");
      }

      si_initialize_prim_discard_tunables(sscreen, flags & SI_CONTEXT_FLAG_AUX,
                                          &sctx->prim_discard_vertex_count_threshold,
                                          &sctx->index_ring_size_per_ib);
   } else {
      sctx->prim_discard_vertex_count_threshold = UINT_MAX;
   }

   sctx->sample_mask = 0xffff;

@@ -641,7 +628,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
      sctx->b.create_video_buffer = vl_video_buffer_create;
   }

   if (sctx->chip_class >= GFX9 || si_compute_prim_discard_enabled(sctx)) {
   if (sctx->chip_class >= GFX9) {
      sctx->wait_mem_scratch =
         si_aligned_buffer_create(screen,
                                  SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL,

@@ -1167,15 +1154,10 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,

   sscreen->max_memory_usage_kb = sscreen->info.vram_size_kb + sscreen->info.gart_size_kb / 4 * 3;

   unsigned prim_discard_vertex_count_threshold, tmp;
   si_initialize_prim_discard_tunables(sscreen, false, &prim_discard_vertex_count_threshold, &tmp);
   /* Compute-shader-based culling doesn't support VBOs in user SGPRs. */
   if (prim_discard_vertex_count_threshold == UINT_MAX) {
      /* This decreases CPU overhead if all descriptors are in user SGPRs because we don't
       * have to allocate and count references for the upload buffer.
       */
      sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1;
   }
   /* This decreases CPU overhead if all descriptors are in user SGPRs because we don't
    * have to allocate and count references for the upload buffer.
    */
   sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1;

   /* Determine tessellation ring info. */
   bool double_offchip_buffers = sscreen->info.chip_class >= GFX7 &&

@@ -44,7 +44,6 @@ extern "C" {
#endif

#define ATI_VENDOR_ID 0x1002
#define SI_PRIM_DISCARD_DEBUG 0
#define SI_NOT_QUERY 0xffffffff

/* The base vertex and primitive restart can be any number, but we must pick

@@ -155,11 +154,6 @@ enum si_has_ngg {
   NGG_ON,
};

enum si_has_prim_discard_cs {
   PRIM_DISCARD_CS_OFF,
   PRIM_DISCARD_CS_ON,
};

enum si_clear_code
{
   DCC_CLEAR_COLOR_0000 = 0x00000000,

@@ -223,9 +217,6 @@ enum
   DBG_ALWAYS_NGG_CULLING_TESS,
   DBG_NO_NGG_CULLING,
   DBG_NO_FAST_LAUNCH,
   DBG_ALWAYS_PD,
   DBG_PD,
   DBG_NO_PD,
   DBG_SWITCH_ON_EOP,
   DBG_NO_OUT_OF_ORDER,
   DBG_NO_DPBB,

@@ -896,7 +887,6 @@ struct si_saved_cs {
   unsigned trace_id;

   unsigned gfx_last_dw;
   unsigned compute_last_dw;
   bool flushed;
   int64_t time_flush;
};

@@ -995,26 +985,6 @@ struct si_context {
   /* NGG streamout. */
   struct pb_buffer *gds;
   struct pb_buffer *gds_oa;
   /* Compute-based primitive discard. */
   unsigned prim_discard_vertex_count_threshold;
   struct radeon_cmdbuf prim_discard_compute_cs;
   struct si_shader *compute_ib_last_shader;
   uint32_t compute_rewind_va;
   unsigned compute_num_prims_in_batch;
   /* index_ring is divided into 2 halves for doublebuffering. */
   struct si_resource *index_ring;
   unsigned index_ring_base;        /* offset of a per-IB portion */
   unsigned index_ring_offset;      /* offset within a per-IB portion */
   unsigned index_ring_size_per_ib; /* max available size per IB */
   bool prim_discard_compute_ib_initialized;
   /* For tracking the last execution barrier - it can be either
    * a WRITE_DATA packet or a fence. */
   uint32_t *last_pkt3_write_data;
   struct si_resource *barrier_buf;
   unsigned barrier_buf_offset;
   struct pipe_fence_handle *last_ib_barrier_fence;
   struct si_resource *last_ib_barrier_buf;
   unsigned last_ib_barrier_buf_offset;

   /* Atoms (direct states). */
   union si_state_atoms atoms;

@@ -1063,7 +1033,6 @@ struct si_context {
      /* indexed access using pipe_shader_type (not by MESA_SHADER_*) */
      struct si_shader_ctx_state shaders[SI_NUM_GRAPHICS_SHADERS];
   };
   struct si_shader_ctx_state cs_prim_discard_state;
   struct si_cs_shader_state cs_shader_state;

   /* shader information */

@@ -1254,9 +1223,6 @@ struct si_context {
   unsigned num_resident_handles;
   uint64_t num_alloc_tex_transfer_bytes;
   unsigned last_tex_ps_draw_ratio; /* for query */
   unsigned compute_num_verts_accepted;
   unsigned compute_num_verts_rejected;
   unsigned compute_num_verts_ineligible; /* due to low vertex count */
   unsigned context_roll;

   /* Queries. */

@@ -1287,7 +1253,7 @@ struct si_context {
    */
   struct hash_table *dirty_implicit_resources;

   pipe_draw_vbo_func draw_vbo[2][2][2][2];
   pipe_draw_vbo_func draw_vbo[2][2][2];
   /* When b.draw_vbo is a wrapper, real_draw_vbo is the real draw_vbo function */
   pipe_draw_vbo_func real_draw_vbo;

@@ -1483,7 +1449,6 @@ void si_allocate_gds(struct si_context *ctx);
void si_set_tracked_regs_to_clear_state(struct si_context *ctx);
void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs);
void si_trace_emit(struct si_context *sctx);
void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx);
void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs,
                          unsigned cp_coher_cntl);
void gfx10_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);

@@ -1502,32 +1467,6 @@ unsigned si_end_counter(struct si_screen *sscreen, unsigned type, uint64_t begin
void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs);
void si_init_compute_functions(struct si_context *sctx);

/* si_compute_prim_discard.c */
enum si_prim_discard_outcome
{
   SI_PRIM_DISCARD_ENABLED,
   SI_PRIM_DISCARD_DISABLED,
   SI_PRIM_DISCARD_DRAW_SPLIT,
   SI_PRIM_DISCARD_MULTI_DRAW_SPLIT,
};

void si_build_prim_discard_compute_shader(struct si_shader_context *ctx);
enum si_prim_discard_outcome
si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info,
                                      unsigned drawid_offset,
                                      const struct pipe_draw_start_count_bias *draws,
                                      unsigned num_draws, unsigned total_count);
void si_compute_signal_gfx(struct si_context *sctx);
void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
                                          const struct pipe_draw_info *info,
                                          const struct pipe_draw_start_count_bias *draws,
                                          unsigned num_draws, unsigned index_size,
                                          unsigned total_count, uint64_t input_indexbuf_va,
                                          unsigned index_max_size);
void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context,
                                         unsigned *prim_discard_vertex_count_threshold,
                                         unsigned *index_ring_size_per_ib);

/* si_pipe.c */
void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compiler);

@@ -1996,14 +1935,9 @@ static inline void radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sc
   radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, bo, usage, priority);
}

static inline bool si_compute_prim_discard_enabled(struct si_context *sctx)
{
   return sctx->prim_discard_vertex_count_threshold != UINT_MAX;
}

static inline unsigned si_get_wave_size(struct si_screen *sscreen,
                                        gl_shader_stage stage, bool ngg, bool es,
                                        bool gs_fast_launch, bool prim_discard_cs)
                                        bool gs_fast_launch)
{
   if (stage == MESA_SHADER_COMPUTE)
      return sscreen->compute_wave_size;

@@ -2011,8 +1945,7 @@ static inline unsigned si_get_wave_size(struct si_screen *sscreen,
      return sscreen->ps_wave_size;
   else if (gs_fast_launch)
      return 32; /* GS fast launch hangs with Wave64, so always use Wave32. */
   else if ((stage == MESA_SHADER_VERTEX && prim_discard_cs) || /* only Wave64 implemented */
            (stage == MESA_SHADER_VERTEX && es && !ngg) ||
   else if ((stage == MESA_SHADER_VERTEX && es && !ngg) ||
            (stage == MESA_SHADER_TESS_EVAL && es && !ngg) ||
            (stage == MESA_SHADER_GEOMETRY && !ngg)) /* legacy GS only supports Wave64 */
      return 64;

@@ -2025,18 +1958,14 @@ static inline unsigned si_get_shader_wave_size(struct si_shader *shader)
   return si_get_wave_size(shader->selector->screen, shader->selector->info.stage,
                           shader->key.as_ngg,
                           shader->key.as_es,
                           shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL,
                           shader->key.opt.vs_as_prim_discard_cs);
                           shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
}

static inline void si_select_draw_vbo(struct si_context *sctx)
{
   bool has_prim_discard_cs = si_compute_prim_discard_enabled(sctx) &&
                              !sctx->shader.tes.cso && !sctx->shader.gs.cso;
   pipe_draw_vbo_func draw_vbo = sctx->draw_vbo[!!sctx->shader.tes.cso]
                                               [!!sctx->shader.gs.cso]
                                               [sctx->ngg]
                                               [has_prim_discard_cs];
                                               [sctx->ngg];
   assert(draw_vbo);
   if (unlikely(sctx->real_draw_vbo))
      sctx->real_draw_vbo = draw_vbo;

@ -260,15 +260,6 @@ static bool si_query_sw_begin(struct si_context *sctx, struct si_query *squery)
|
|||
case SI_QUERY_DISK_SHADER_CACHE_MISSES:
|
||||
query->begin_result = sctx->screen->num_disk_shader_cache_misses;
|
||||
break;
|
||||
case SI_QUERY_PD_NUM_PRIMS_ACCEPTED:
|
||||
query->begin_result = sctx->compute_num_verts_accepted;
|
||||
break;
|
||||
case SI_QUERY_PD_NUM_PRIMS_REJECTED:
|
||||
query->begin_result = sctx->compute_num_verts_rejected;
|
||||
break;
|
||||
case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE:
|
||||
query->begin_result = sctx->compute_num_verts_ineligible;
|
||||
break;
|
||||
case SI_QUERY_GPIN_ASIC_ID:
|
||||
case SI_QUERY_GPIN_NUM_SIMD:
|
||||
case SI_QUERY_GPIN_NUM_RB:
|
||||
|
|
@ -429,15 +420,6 @@ static bool si_query_sw_end(struct si_context *sctx, struct si_query *squery)
|
|||
case SI_QUERY_DISK_SHADER_CACHE_MISSES:
|
||||
query->end_result = sctx->screen->num_disk_shader_cache_misses;
|
||||
break;
|
||||
case SI_QUERY_PD_NUM_PRIMS_ACCEPTED:
|
||||
query->end_result = sctx->compute_num_verts_accepted;
|
||||
break;
|
||||
case SI_QUERY_PD_NUM_PRIMS_REJECTED:
|
||||
query->end_result = sctx->compute_num_verts_rejected;
|
||||
break;
|
||||
case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE:
|
||||
query->end_result = sctx->compute_num_verts_ineligible;
|
||||
break;
|
||||
case SI_QUERY_GPIN_ASIC_ID:
|
||||
case SI_QUERY_GPIN_NUM_SIMD:
|
||||
case SI_QUERY_GPIN_NUM_RB:
|
||||
|
|
@ -479,11 +461,6 @@ static bool si_query_sw_get_result(struct si_context *sctx, struct si_query *squ
|
|||
result->u64 =
|
||||
(query->end_result - query->begin_result) * 100 / (query->end_time - query->begin_time);
|
||||
return true;
|
||||
case SI_QUERY_PD_NUM_PRIMS_ACCEPTED:
|
||||
case SI_QUERY_PD_NUM_PRIMS_REJECTED:
|
||||
case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE:
|
||||
result->u64 = ((unsigned)query->end_result - (unsigned)query->begin_result) / 3;
|
||||
return true;
|
||||
case SI_QUERY_GPIN_ASIC_ID:
|
||||
result->u32 = 0;
|
||||
return true;
|
||||
|
|
@ -1758,10 +1735,6 @@ static struct pipe_driver_query_info si_driver_query_list[] = {
|
|||
X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE),
|
||||
X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE),
|
||||
X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE),
|
||||
|
||||
X("pd-num-prims-accepted", PD_NUM_PRIMS_ACCEPTED, UINT64, AVERAGE),
|
||||
X("pd-num-prims-rejected", PD_NUM_PRIMS_REJECTED, UINT64, AVERAGE),
|
||||
X("pd-num-prims-ineligible", PD_NUM_PRIMS_INELIGIBLE, UINT64, AVERAGE),
|
||||
};
|
||||
|
||||
#undef X
|
||||
|
|
|
|||
|
|
@ -111,9 +111,6 @@ enum
|
|||
SI_QUERY_GPIN_NUM_RB,
|
||||
SI_QUERY_GPIN_NUM_SPI,
|
||||
SI_QUERY_GPIN_NUM_SE,
|
||||
SI_QUERY_PD_NUM_PRIMS_ACCEPTED,
|
||||
SI_QUERY_PD_NUM_PRIMS_REJECTED,
|
||||
SI_QUERY_PD_NUM_PRIMS_INELIGIBLE,
|
||||
SI_QUERY_LIVE_SHADER_CACHE_HITS,
|
||||
SI_QUERY_LIVE_SHADER_CACHE_MISSES,
|
||||
SI_QUERY_MEMORY_SHADER_CACHE_HITS,
|
||||
|
|
|
|||
|
|
@ -419,12 +419,6 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader)
|
|||
|
||||
/* VGPRs */
|
||||
declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
|
||||
|
||||
/* Return values */
|
||||
if (shader->key.opt.vs_as_prim_discard_cs) {
|
||||
for (i = 0; i < 4; i++)
|
||||
ac_add_return(&ctx->args, AC_ARG_VGPR);
|
||||
}
|
||||
break;
|
||||
|
||||
case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */
|
||||
|
|
@ -1070,8 +1064,6 @@ const char *si_get_shader_name(const struct si_shader *shader)
|
|||
return "Vertex Shader as ES";
|
||||
else if (shader->key.as_ls)
|
||||
return "Vertex Shader as LS";
|
||||
else if (shader->key.opt.vs_as_prim_discard_cs)
|
||||
return "Vertex Shader as Primitive Discard CS";
|
||||
else if (shader->key.as_ngg)
|
||||
return "Vertex Shader as ESGS";
|
||||
else
|
||||
|
|
@ -1183,12 +1175,6 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
|
|||
fprintf(f, " as_ls = %u\n", key->as_ls);
|
||||
fprintf(f, " as_ngg = %u\n", key->as_ngg);
|
||||
fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id);
|
||||
fprintf(f, " opt.vs_as_prim_discard_cs = %u\n", key->opt.vs_as_prim_discard_cs);
|
||||
fprintf(f, " opt.cs_prim_type = %s\n", tgsi_primitive_names[key->opt.cs_prim_type]);
|
||||
fprintf(f, " opt.cs_indexed = %u\n", key->opt.cs_indexed);
|
||||
fprintf(f, " opt.cs_provoking_vertex_first = %u\n", key->opt.cs_provoking_vertex_first);
|
||||
fprintf(f, " opt.cs_cull_front = %u\n", key->opt.cs_cull_front);
|
||||
fprintf(f, " opt.cs_cull_back = %u\n", key->opt.cs_cull_back);
|
||||
break;
|
||||
|
||||
case MESA_SHADER_TESS_CTRL:
|
||||
|
|
@ -1317,7 +1303,6 @@ void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_
|
|||
key->vs_prolog.as_ls = shader_out->key.as_ls;
|
||||
key->vs_prolog.as_es = shader_out->key.as_es;
|
||||
key->vs_prolog.as_ngg = shader_out->key.as_ngg;
|
||||
key->vs_prolog.as_prim_discard_cs = shader_out->key.opt.vs_as_prim_discard_cs;
|
||||
|
||||
if (ngg_cull_shader) {
|
||||
key->vs_prolog.gs_fast_launch_tri_list =
|
||||
|
|
@ -1342,8 +1327,7 @@ void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_
|
|||
|
||||
/* Only one of these combinations can be set. as_ngg can be set with as_es. */
|
||||
assert(key->vs_prolog.as_ls + key->vs_prolog.as_ngg +
|
||||
(key->vs_prolog.as_es && !key->vs_prolog.as_ngg) + key->vs_prolog.as_prim_discard_cs <=
|
||||
1);
|
||||
(key->vs_prolog.as_es && !key->vs_prolog.as_ngg) <= 1);
|
||||
|
||||
/* Enable loading the InstanceID VGPR. */
|
||||
uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
|
||||
|
|
@ -1557,7 +1541,6 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
|
|||
(key->vs_prolog.gs_fast_launch_tri_list ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST : 0) |
|
||||
(key->vs_prolog.gs_fast_launch_tri_strip ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP : 0) |
|
||||
SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(key->vs_prolog.gs_fast_launch_index_size_packed);
|
||||
shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs;
|
||||
break;
|
||||
case MESA_SHADER_TESS_CTRL:
|
||||
assert(!prolog);
|
||||
|
|
@ -1581,8 +1564,7 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
|
|||
si_llvm_context_init(&ctx, sscreen, compiler,
|
||||
si_get_wave_size(sscreen, stage,
|
||||
shader.key.as_ngg, shader.key.as_es,
|
||||
shader.key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL,
|
||||
shader.key.opt.vs_as_prim_discard_cs));
|
||||
shader.key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL));
|
||||
ctx.shader = &shader;
|
||||
ctx.stage = stage;
|
||||
|
||||
|
|
|
|||
|
|
@ -446,7 +446,6 @@ struct si_shader_selector {
|
|||
ubyte const_and_shader_buf_descriptors_index;
|
||||
ubyte sampler_and_images_descriptors_index;
|
||||
bool vs_needs_prolog;
|
||||
bool prim_discard_cs_allowed;
|
||||
ubyte cs_shaderbufs_sgpr_index;
|
||||
ubyte cs_num_shaderbufs_in_user_sgprs;
|
||||
ubyte cs_images_sgpr_index;
|
||||
|
|
@ -577,7 +576,6 @@ union si_shader_part_key {
|
|||
unsigned as_ls : 1;
|
||||
unsigned as_es : 1;
|
||||
unsigned as_ngg : 1;
|
||||
unsigned as_prim_discard_cs : 1;
|
||||
unsigned gs_fast_launch_tri_list : 1; /* for NGG culling */
|
||||
unsigned gs_fast_launch_tri_strip : 1; /* for NGG culling */
|
||||
unsigned gs_fast_launch_index_size_packed : 2;
|
||||
|
|
@ -684,14 +682,6 @@ struct si_shader_key {
|
|||
*/
|
||||
unsigned prefer_mono : 1;
|
||||
|
||||
/* Primitive discard compute shader. */
|
||||
unsigned vs_as_prim_discard_cs : 1;
|
||||
unsigned cs_prim_type : 4;
|
||||
unsigned cs_indexed : 1;
|
||||
unsigned cs_provoking_vertex_first : 1;
|
||||
unsigned cs_cull_front : 1;
|
||||
unsigned cs_cull_back : 1;
|
||||
|
||||
/* VS and TCS have the same number of patch vertices. */
|
||||
unsigned same_patch_vertices:1;
|
||||
|
||||
|
|
|
|||
|
|
@ -804,9 +804,6 @@ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *part
|
|||
!same_thread_count && si_is_multi_part_shader(ctx->shader))
|
||||
ac_build_endif(&ctx->ac, 6507);
|
||||
|
||||
/* Return the value from the last part. It's non-void only for the prim
|
||||
* discard compute shader.
|
||||
*/
|
||||
if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
|
||||
LLVMBuildRetVoid(builder);
|
||||
else
|
||||
|
|
@ -1116,9 +1113,6 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
|
|||
parts[num_parts++] = main_fn;
|
||||
|
||||
si_build_wrapper_function(&ctx, parts, num_parts, first_is_prolog ? 1 : 0, 0, false);
|
||||
|
||||
if (ctx.shader->key.opt.vs_as_prim_discard_cs)
|
||||
si_build_prim_discard_compute_shader(&ctx);
|
||||
} else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_EVAL && ngg_cull_main_fn) {
|
||||
LLVMValueRef parts[3], prolog, main_fn = ctx.main_fn;
|
||||
|
||||
|
|
@ -1289,8 +1283,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
|
|||
}
|
||||
|
||||
/* Make sure the input is a pointer and not integer followed by inttoptr. */
|
||||
if (!shader->key.opt.vs_as_prim_discard_cs)
|
||||
assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind);
|
||||
assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind);
|
||||
|
||||
/* Compile to bytecode. */
|
||||
if (!si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, &ctx.ac, debug,
|
||||
|
|
|
|||
|
|
@ -431,7 +431,7 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
|
|||
|
||||
si_llvm_context_init(&ctx, sscreen, compiler,
|
||||
si_get_wave_size(sscreen, MESA_SHADER_VERTEX,
|
||||
false, false, false, false));
|
||||
false, false, false));
|
||||
ctx.shader = shader;
|
||||
ctx.stage = MESA_SHADER_VERTEX;
|
||||
|
||||
|
|
|
|||
|
|
@ -793,32 +793,6 @@ void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi)
|
|||
FREE(outputs);
|
||||
}
|
||||
|
||||
static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi)
|
||||
{
|
||||
struct si_shader_context *ctx = si_shader_context_from_abi(abi);
|
||||
struct si_shader_info *info = &ctx->shader->selector->info;
|
||||
LLVMValueRef *addrs = abi->outputs;
|
||||
LLVMValueRef pos[4] = {};
|
||||
|
||||
assert(info->num_outputs <= AC_LLVM_MAX_OUTPUTS);
|
||||
|
||||
for (unsigned i = 0; i < info->num_outputs; i++) {
|
||||
if (info->output_semantic[i] != VARYING_SLOT_POS)
|
||||
continue;
|
||||
|
||||
for (unsigned chan = 0; chan < 4; chan++)
|
||||
pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
|
||||
break;
|
||||
}
|
||||
assert(pos[0] != NULL);
|
||||
|
||||
/* Return the position output. */
|
||||
LLVMValueRef ret = ctx->return_value;
|
||||
for (unsigned chan = 0; chan < 4; chan++)
|
||||
ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, "");
|
||||
ctx->return_value = ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* Build the vertex shader prolog function.
|
||||
*
|
||||
|
|
@ -1121,8 +1095,6 @@ void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shad
|
|||
ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
|
||||
else if (shader->key.as_es)
|
||||
ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
|
||||
else if (shader->key.opt.vs_as_prim_discard_cs)
|
||||
ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
|
||||
else if (ngg_cull_shader)
|
||||
ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue;
|
||||
else if (shader->key.as_ngg)
|
||||
|
|
|
|||
|
|
@ -971,7 +971,7 @@ static void si_emit_draw_registers(struct si_context *sctx,
|
|||
} \
|
||||
} while (0)
|
||||
|
||||
template <chip_class GFX_VERSION, si_has_ngg NGG, si_has_prim_discard_cs ALLOW_PRIM_DISCARD_CS>
|
||||
template <chip_class GFX_VERSION, si_has_ngg NGG>
|
||||
ALWAYS_INLINE
|
||||
static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw_info *info,
|
||||
unsigned drawid_base,
|
||||
|
|
@ -980,7 +980,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
|
|||
unsigned num_draws, unsigned total_count,
|
||||
struct pipe_resource *indexbuf, unsigned index_size,
|
||||
unsigned index_offset, unsigned instance_count,
|
||||
bool dispatch_prim_discard_cs, unsigned original_index_size)
|
||||
unsigned original_index_size)
|
||||
{
|
||||
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
|
||||
|
||||
|
|
@ -1042,22 +1042,19 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
|
|||
sctx->last_index_size = index_size;
|
||||
}
|
||||
|
||||
/* If !ALLOW_PRIM_DISCARD_CS, index_size == original_index_size. */
|
||||
if (!ALLOW_PRIM_DISCARD_CS || original_index_size) {
|
||||
index_max_size = (indexbuf->width0 - index_offset) >> util_logbase2(original_index_size);
|
||||
/* Skip draw calls with 0-sized index buffers.
|
||||
* They cause a hang on some chips, like Navi10-14.
|
||||
*/
|
||||
if (!index_max_size) {
|
||||
radeon_end();
|
||||
return;
|
||||
}
|
||||
|
||||
index_va = si_resource(indexbuf)->gpu_address + index_offset;
|
||||
|
||||
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(indexbuf), RADEON_USAGE_READ,
|
||||
RADEON_PRIO_INDEX_BUFFER);
|
||||
index_max_size = (indexbuf->width0 - index_offset) >> util_logbase2(index_size);
|
||||
/* Skip draw calls with 0-sized index buffers.
|
||||
* They cause a hang on some chips, like Navi10-14.
|
||||
*/
|
||||
if (!index_max_size) {
|
||||
radeon_end();
|
||||
return;
|
||||
}
|
||||
|
||||
index_va = si_resource(indexbuf)->gpu_address + index_offset;
|
||||
|
||||
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(indexbuf), RADEON_USAGE_READ,
|
||||
RADEON_PRIO_INDEX_BUFFER);
|
||||
} else {
|
||||
/* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
|
||||
* so the state must be re-emitted before the next indexed draw.
|
||||
|
|
@ -1190,16 +1187,6 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
|
|||
bool increment_draw_id = num_draws > 1 && set_draw_id && info->increment_draw_id;
|
||||
|
||||
if (index_size) {
|
||||
if (ALLOW_PRIM_DISCARD_CS && dispatch_prim_discard_cs) {
|
||||
radeon_end();
|
||||
|
||||
si_dispatch_prim_discard_cs_and_draw(sctx, info, draws, num_draws,
|
||||
original_index_size, total_count, index_va,
|
||||
index_max_size);
|
||||
EMIT_SQTT_END_DRAW;
|
||||
return;
|
||||
}
|
||||
|
||||
/* NOT_EOP allows merging multiple draws into 1 wave, but only user VGPRs
|
||||
* can be changed between draws, and GS fast launch must be disabled.
|
||||
* NOT_EOP doesn't work on gfx9 and older.
|
||||
|
|
@ -1629,100 +1616,12 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i
|
|||
info->restart_index, min_vertex_count);
|
||||
}
|
||||
|
||||
static bool si_all_vs_resources_read_only(struct si_context *sctx, struct pipe_resource *indexbuf)
|
||||
{
|
||||
struct radeon_winsys *ws = sctx->ws;
|
||||
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
|
||||
struct si_descriptors *buffers =
|
||||
&sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX)];
|
||||
struct si_shader_selector *vs = sctx->shader.vs.cso;
|
||||
struct si_vertex_elements *velems = sctx->vertex_elements;
|
||||
unsigned num_velems = velems->count;
|
||||
unsigned num_images = vs->info.base.num_images;
|
||||
|
||||
/* Index buffer. */
|
||||
if (indexbuf && ws->cs_is_buffer_referenced(cs, si_resource(indexbuf)->buf, RADEON_USAGE_WRITE))
|
||||
goto has_write_reference;
|
||||
|
||||
/* Vertex buffers. */
|
||||
for (unsigned i = 0; i < num_velems; i++) {
|
||||
if (!((1 << i) & velems->first_vb_use_mask))
|
||||
continue;
|
||||
|
||||
unsigned vb_index = velems->vertex_buffer_index[i];
|
||||
struct pipe_resource *res = sctx->vertex_buffer[vb_index].buffer.resource;
|
||||
if (!res)
|
||||
continue;
|
||||
|
||||
if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE))
|
||||
goto has_write_reference;
|
||||
}
|
||||
|
||||
/* Constant and shader buffers. */
|
||||
for (unsigned i = 0; i < buffers->num_active_slots; i++) {
|
||||
unsigned index = buffers->first_active_slot + i;
|
||||
struct pipe_resource *res = sctx->const_and_shader_buffers[PIPE_SHADER_VERTEX].buffers[index];
|
||||
if (!res)
|
||||
continue;
|
||||
|
||||
if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE))
|
||||
goto has_write_reference;
|
||||
}
|
||||
|
||||
/* Samplers. */
|
||||
if (vs->info.base.textures_used[0]) {
|
||||
unsigned num_samplers = BITSET_LAST_BIT(vs->info.base.textures_used);
|
||||
|
||||
for (unsigned i = 0; i < num_samplers; i++) {
|
||||
struct pipe_sampler_view *view = sctx->samplers[PIPE_SHADER_VERTEX].views[i];
|
||||
if (!view)
|
||||
continue;
|
||||
|
||||
if (ws->cs_is_buffer_referenced(cs, si_resource(view->texture)->buf, RADEON_USAGE_WRITE))
|
||||
goto has_write_reference;
|
||||
}
|
||||
}
|
||||
|
||||
/* Images. */
|
||||
if (num_images) {
|
||||
for (unsigned i = 0; i < num_images; i++) {
|
||||
struct pipe_resource *res = sctx->images[PIPE_SHADER_VERTEX].views[i].resource;
|
||||
if (!res)
|
||||
continue;
|
||||
|
||||
if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE))
|
||||
goto has_write_reference;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
|
||||
has_write_reference:
|
||||
/* If the current gfx IB has enough packets, flush it to remove write
|
||||
* references to buffers.
|
||||
*/
|
||||
if (cs->prev_dw + cs->current.cdw > 2048) {
|
||||
si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
|
||||
assert(si_all_vs_resources_read_only(sctx, indexbuf));
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE bool pd_msg(const char *s)
|
||||
{
|
||||
if (SI_PRIM_DISCARD_DEBUG)
|
||||
printf("PD failed: %s\n", s);
|
||||
return false;
|
||||
}
|
||||
|
||||
#define DRAW_CLEANUP do { \
|
||||
if (index_size && indexbuf != info->index.resource) \
|
||||
pipe_resource_reference(&indexbuf, NULL); \
|
||||
} while (0)
|
||||
|
||||
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
|
||||
si_has_prim_discard_cs ALLOW_PRIM_DISCARD_CS>
|
||||
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG>
|
||||
static void si_draw_vbo(struct pipe_context *ctx,
|
||||
const struct pipe_draw_info *info,
|
||||
unsigned drawid_offset,
|
||||
|
|
@ -1910,70 +1809,8 @@ static void si_draw_vbo(struct pipe_context *ctx,
|
|||
info->primitive_restart &&
|
||||
(!sctx->screen->options.prim_restart_tri_strips_only ||
|
||||
(prim != PIPE_PRIM_TRIANGLE_STRIP && prim != PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY));
|
||||
bool dispatch_prim_discard_cs = false;
|
||||
unsigned original_index_size = index_size;
|
||||
|
||||
/* Determine if we can use the primitive discard compute shader. */
|
||||
/* TODO: this requires that primitives can be drawn out of order, so check depth/stencil/blend states. */
|
||||
if (ALLOW_PRIM_DISCARD_CS &&
|
||||
(total_direct_count > sctx->prim_discard_vertex_count_threshold
|
||||
? (sctx->compute_num_verts_rejected += total_direct_count, true)
|
||||
: /* Add, then return true. */
|
||||
(sctx->compute_num_verts_ineligible += total_direct_count,
|
||||
false)) && /* Add, then return false. */
|
||||
(!primitive_restart || pd_msg("primitive restart")) &&
|
||||
/* Supported prim types. */
|
||||
(1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP)) &&
|
||||
(instance_count == 1 || pd_msg("instancing")) &&
|
||||
((drawid_offset == 0 && (num_draws == 1 || !info->increment_draw_id)) ||
|
||||
!sctx->shader.vs.cso->info.uses_drawid || pd_msg("draw_id > 0")) &&
|
||||
(!sctx->render_cond || pd_msg("render condition")) &&
|
||||
/* Forced enablement ignores pipeline statistics queries. */
|
||||
(sctx->screen->debug_flags & (DBG(PD) | DBG(ALWAYS_PD)) ||
|
||||
(!sctx->num_pipeline_stat_queries && !sctx->streamout.prims_gen_query_enabled) ||
|
||||
pd_msg("pipestat or primgen query")) &&
|
||||
(!sctx->vertex_elements->instance_divisor_is_fetched || pd_msg("loads instance divisors")) &&
|
||||
(!sctx->shader.ps.cso->info.uses_primid || pd_msg("PS uses PrimID")) &&
|
||||
!rs->polygon_mode_enabled &&
|
||||
#if SI_PRIM_DISCARD_DEBUG /* same as cso->prim_discard_cs_allowed */
|
||||
(!sctx->shader.vs.cso->info.uses_bindless_images || pd_msg("uses bindless images")) &&
|
||||
(!sctx->shader.vs.cso->info.uses_bindless_samplers || pd_msg("uses bindless samplers")) &&
|
||||
(!sctx->shader.vs.cso->info.base.writes_memory || pd_msg("writes memory")) &&
|
||||
(!sctx->shader.vs.cso->info.writes_viewport_index || pd_msg("writes viewport index")) &&
|
||||
!sctx->shader.vs.cso->info.base.vs.window_space_position &&
|
||||
!sctx->shader.vs.cso->so.num_outputs &&
|
||||
#else
|
||||
(sctx->shader.vs.cso->prim_discard_cs_allowed ||
|
||||
pd_msg("VS shader uses unsupported features")) &&
|
||||
#endif
|
||||
/* Check that all buffers are used for read only, because compute
|
||||
* dispatches can run ahead. */
|
||||
(si_all_vs_resources_read_only(sctx, index_size ? indexbuf : NULL) ||
|
||||
pd_msg("write reference"))) {
|
||||
switch (si_prepare_prim_discard_or_split_draw(sctx, info, drawid_offset, draws, num_draws,
|
||||
total_direct_count)) {
|
||||
case SI_PRIM_DISCARD_ENABLED:
|
||||
original_index_size = index_size;
|
||||
dispatch_prim_discard_cs = true;
|
||||
|
||||
/* The compute shader changes/lowers the following: */
|
||||
prim = PIPE_PRIM_TRIANGLES;
|
||||
index_size = 4;
|
||||
instance_count = 1;
|
||||
sctx->compute_num_verts_rejected -= total_direct_count;
|
||||
sctx->compute_num_verts_accepted += total_direct_count;
|
||||
break;
|
||||
case SI_PRIM_DISCARD_DISABLED:
|
||||
break;
|
||||
case SI_PRIM_DISCARD_DRAW_SPLIT:
|
||||
case SI_PRIM_DISCARD_MULTI_DRAW_SPLIT:
|
||||
sctx->compute_num_verts_rejected -= total_direct_count;
|
||||
/* The multi draw was split into multiple ones and executed. Return. */
|
||||
DRAW_CLEANUP;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* Set the rasterization primitive type.
|
||||
*
|
||||
* This must be done after si_decompress_textures, which can call
|
||||
|
|
@ -2005,7 +1842,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
|
|||
if (GFX_VERSION >= GFX10) {
|
||||
struct si_shader_selector *hw_vs = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->cso;
|
||||
|
||||
if (NGG && !HAS_GS && !dispatch_prim_discard_cs &&
|
||||
if (NGG && !HAS_GS &&
|
||||
/* Tessellation sets ngg_cull_vert_threshold to UINT_MAX if the prim type
|
||||
* is not triangles, so this check is only needed without tessellation. */
|
||||
(HAS_TESS || sctx->current_rast_prim == PIPE_PRIM_TRIANGLES) &&
|
||||
|
|
@ -2154,10 +1991,9 @@ static void si_draw_vbo(struct pipe_context *ctx,
|
|||
}
|
||||
assert(sctx->dirty_atoms == 0);
|
||||
|
||||
si_emit_draw_packets<GFX_VERSION, NGG, ALLOW_PRIM_DISCARD_CS>
|
||||
si_emit_draw_packets<GFX_VERSION, NGG>
|
||||
(sctx, info, drawid_offset, indirect, draws, num_draws, total_direct_count, indexbuf,
|
||||
index_size, index_offset, instance_count, dispatch_prim_discard_cs,
|
||||
original_index_size);
|
||||
index_size, index_offset, instance_count, original_index_size);
|
||||
/* <-- CUs are busy here. */
|
||||
|
||||
/* Start prefetches after the draw has been started. Both will run
|
||||
|
|
@ -2193,10 +2029,9 @@ static void si_draw_vbo(struct pipe_context *ctx,
|
|||
}
|
||||
assert(sctx->dirty_atoms == 0);
|
||||
|
||||
si_emit_draw_packets<GFX_VERSION, NGG, ALLOW_PRIM_DISCARD_CS>
|
||||
si_emit_draw_packets<GFX_VERSION, NGG>
|
||||
(sctx, info, drawid_offset, indirect, draws, num_draws, total_direct_count, indexbuf,
|
||||
index_size, index_offset, instance_count, dispatch_prim_discard_cs,
|
||||
original_index_size);
|
||||
index_size, index_offset, instance_count, original_index_size);
|
||||
|
||||
/* Prefetch the remaining shaders after the draw has been
|
||||
* started. */
|
||||
|
|
@ -2281,40 +2116,27 @@ static void si_draw_rectangle(struct blitter_context *blitter, void *vertex_elem
|
|||
pipe->draw_vbo(pipe, &info, 0, NULL, &draw, 1);
|
||||
}
|
||||
|
||||
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS,
|
||||
si_has_ngg NGG, si_has_prim_discard_cs ALLOW_PRIM_DISCARD_CS>
|
||||
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG>
|
||||
static void si_init_draw_vbo(struct si_context *sctx)
|
||||
{
|
||||
/* Prim discard CS is only useful on gfx7+ because gfx6 doesn't have async compute. */
|
||||
if (ALLOW_PRIM_DISCARD_CS && GFX_VERSION < GFX8)
|
||||
return;
|
||||
|
||||
if (ALLOW_PRIM_DISCARD_CS && (HAS_TESS || HAS_GS))
|
||||
return;
|
||||
|
||||
if (NGG && GFX_VERSION < GFX10)
|
||||
return;
|
||||
|
||||
sctx->draw_vbo[HAS_TESS][HAS_GS][NGG][ALLOW_PRIM_DISCARD_CS] =
|
||||
si_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG, ALLOW_PRIM_DISCARD_CS>;
|
||||
}
|
||||
|
||||
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS>
|
||||
static void si_init_draw_vbo_all_internal_options(struct si_context *sctx)
|
||||
{
|
||||
si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_OFF, PRIM_DISCARD_CS_OFF>(sctx);
|
||||
si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_OFF, PRIM_DISCARD_CS_ON>(sctx);
|
||||
si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_ON, PRIM_DISCARD_CS_OFF>(sctx);
|
||||
si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_ON, PRIM_DISCARD_CS_ON>(sctx);
|
||||
sctx->draw_vbo[HAS_TESS][HAS_GS][NGG] =
|
||||
si_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG>;
|
||||
}
|
||||
|
||||
template <chip_class GFX_VERSION>
|
||||
static void si_init_draw_vbo_all_pipeline_options(struct si_context *sctx)
|
||||
{
|
||||
si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_OFF, GS_OFF>(sctx);
|
||||
si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_OFF, GS_ON>(sctx);
|
||||
si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_ON, GS_OFF>(sctx);
|
||||
si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_ON, GS_ON>(sctx);
|
||||
si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_OFF, NGG_OFF>(sctx);
|
||||
si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_ON, NGG_OFF>(sctx);
|
||||
si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_OFF, NGG_OFF>(sctx);
|
||||
si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_ON, NGG_OFF>(sctx);
|
||||
si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_OFF, NGG_ON>(sctx);
|
||||
si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_ON, NGG_ON>(sctx);
|
||||
si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_OFF, NGG_ON>(sctx);
|
||||
si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_ON, NGG_ON>(sctx);
|
||||
}
|
||||
|
||||
static void si_invalid_draw_vbo(struct pipe_context *pipe,
|
||||
|
|
|
|||
|
|
@ -81,8 +81,8 @@
|
|||
* Right half: {1,3,5,7,9,11,13,15}
|
||||
*/
|
||||
|
||||
/* Important note: We have to use the standard DX positions, because
|
||||
* the primitive discard compute shader relies on them.
|
||||
/* Important note: We have to use the standard DX positions because shader-based culling
|
||||
* relies on them.
|
||||
*/
|
||||
|
||||
/* 1x MSAA */
|
||||
|
|
|
|||
|
|
@ -70,7 +70,7 @@ void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es,
|
|||
shader_variant_flags |= 1 << 0;
|
||||
if (sel->nir)
|
||||
shader_variant_flags |= 1 << 1;
|
||||
if (si_get_wave_size(sel->screen, sel->info.stage, ngg, es, false, false) == 32)
|
||||
if (si_get_wave_size(sel->screen, sel->info.stage, ngg, es, false) == 32)
|
||||
shader_variant_flags |= 1 << 2;
|
||||
if (sel->info.stage == MESA_SHADER_FRAGMENT &&
|
||||
/* Derivatives imply helper invocations so check for needs_quad_helper_invocations. */
|
||||
|
|
@ -78,11 +78,9 @@ void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es,
|
|||
sel->info.base.fs.uses_discard &&
|
||||
sel->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL))
|
||||
shader_variant_flags |= 1 << 3;
|
||||
if (sel->info.stage == MESA_SHADER_VERTEX) {
|
||||
/* This varies depending on whether compute-based culling is enabled. */
|
||||
assert(sel->screen->num_vbos_in_user_sgprs <= 7);
|
||||
shader_variant_flags |= MIN2(sel->screen->num_vbos_in_user_sgprs, 7) << 4;
|
||||
}
|
||||
|
||||
/* bit gap */
|
||||
|
||||
if (sel->screen->options.no_infinite_interp)
|
||||
shader_variant_flags |= 1 << 7;
|
||||
if (sel->screen->options.clamp_div_by_zero)
|
||||
|
|
@ -2291,10 +2289,8 @@ current_not_ready:
|
|||
|
||||
/* Compile the main shader part if it doesn't exist. This can happen
|
||||
* if the initial guess was wrong.
|
||||
*
|
||||
* The prim discard CS doesn't need the main shader part.
|
||||
*/
|
||||
if (!is_pure_monolithic && !key->opt.vs_as_prim_discard_cs) {
|
||||
if (!is_pure_monolithic) {
|
||||
bool ok = true;
|
||||
|
||||
/* Make sure the main shader part is present. This is needed
|
||||
|
|
@ -2348,8 +2344,7 @@ current_not_ready:
|
|||
shader->is_monolithic =
|
||||
is_pure_monolithic || memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
|
||||
|
||||
/* The prim discard CS is always optimized. */
|
||||
shader->is_optimized = (!is_pure_monolithic || key->opt.vs_as_prim_discard_cs) &&
|
||||
shader->is_optimized = !is_pure_monolithic &&
|
||||
memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
|
||||
|
||||
/* If it's an optimized shader, compile it asynchronously. */
|
||||
|
|
@ -2706,12 +2701,6 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
|
|||
sel->vs_needs_prolog = sel->info.stage == MESA_SHADER_VERTEX && sel->info.num_inputs &&
|
||||
!sel->info.base.vs.blit_sgprs_amd;
|
||||
|
||||
sel->prim_discard_cs_allowed =
|
||||
sel->info.stage == MESA_SHADER_VERTEX && !sel->info.uses_bindless_images &&
|
||||
!sel->info.uses_bindless_samplers && !sel->info.base.writes_memory &&
|
||||
!sel->info.writes_viewport_index &&
|
||||
!sel->info.base.vs.window_space_position && !sel->so.num_outputs;
|
||||
|
||||
if (sel->info.stage == MESA_SHADER_VERTEX ||
|
||||
sel->info.stage == MESA_SHADER_TESS_CTRL ||
|
||||
sel->info.stage == MESA_SHADER_TESS_EVAL ||
|
||||
|
|
|
|||
|
|
@ -771,9 +771,6 @@ static unsigned amdgpu_ib_max_submit_dwords(enum ib_type ib_type)
|
|||
* http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1
|
||||
*/
|
||||
return 20 * 1024;
|
||||
case IB_PARALLEL_COMPUTE:
|
||||
/* Always chain this IB. */
|
||||
return UINT_MAX;
|
||||
default:
|
||||
unreachable("bad ib_type");
|
||||
}
|
||||
|
|
@ -908,9 +905,6 @@ static bool amdgpu_init_cs_context(struct amdgpu_winsys *ws,
|
|||
assert(0);
|
||||
}
|
||||
|
||||
cs->ib[IB_PARALLEL_COMPUTE].ip_type = AMDGPU_HW_IP_COMPUTE;
|
||||
cs->ib[IB_PARALLEL_COMPUTE].flags = AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE;
|
||||
|
||||
cs->last_added_bo = NULL;
|
||||
return true;
|
||||
}
|
||||
|
|
@ -938,8 +932,6 @@ static void amdgpu_cs_context_cleanup(struct amdgpu_winsys *ws, struct amdgpu_cs
|
|||
cleanup_fence_list(&cs->fence_dependencies);
|
||||
cleanup_fence_list(&cs->syncobj_dependencies);
|
||||
cleanup_fence_list(&cs->syncobj_to_signal);
|
||||
cleanup_fence_list(&cs->compute_fence_dependencies);
|
||||
cleanup_fence_list(&cs->compute_start_fence_dependencies);
|
||||
|
||||
cs->num_real_buffers = 0;
|
||||
cs->num_slab_buffers = 0;
|
||||
|
|
@ -957,8 +949,6 @@ static void amdgpu_destroy_cs_context(struct amdgpu_winsys *ws, struct amdgpu_cs
|
|||
FREE(cs->fence_dependencies.list);
|
||||
FREE(cs->syncobj_dependencies.list);
|
||||
FREE(cs->syncobj_to_signal.list);
|
||||
FREE(cs->compute_fence_dependencies.list);
|
||||
FREE(cs->compute_start_fence_dependencies.list);
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -997,7 +987,6 @@ amdgpu_cs_create(struct radeon_cmdbuf *rcs,
amdgpu_cs_chunk_fence_info_to_data(&fence_info, (void*)&cs->fence_chunk);

cs->main.ib_type = IB_MAIN;
cs->compute_ib.ib_type = IB_PARALLEL_COMPUTE;

if (!amdgpu_init_cs_context(ctx->ws, &cs->csc1, ring_type)) {
FREE(cs);
@ -1035,37 +1024,6 @@ amdgpu_cs_create(struct radeon_cmdbuf *rcs,
return true;
}

static bool
amdgpu_cs_add_parallel_compute_ib(struct radeon_cmdbuf *compute_cs,
struct radeon_cmdbuf *gfx_cs,
bool uses_gds_ordered_append)
{
struct amdgpu_cs *cs = amdgpu_cs(gfx_cs);
struct amdgpu_winsys *ws = cs->ws;

if (cs->ring_type != RING_GFX)
return false;

/* only one secondary IB can be added */
if (cs->compute_ib.ib_mapped)
return false;

/* Allocate the compute IB. */
if (!amdgpu_get_new_ib(ws, compute_cs, &cs->compute_ib, cs))
return false;

if (uses_gds_ordered_append) {
cs->csc1.ib[IB_PARALLEL_COMPUTE].flags |=
AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID;
cs->csc2.ib[IB_PARALLEL_COMPUTE].flags |=
AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID;
}

cs->compute_ib.rcs = compute_cs;
compute_cs->priv = cs;
return true;
}

static bool
amdgpu_cs_setup_preemption(struct radeon_cmdbuf *rcs, const uint32_t *preamble_ib,
unsigned preamble_num_dw)
@ -1128,7 +1086,7 @@ static bool amdgpu_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw,
bool force_chaining)
{
struct amdgpu_cs *cs = amdgpu_cs(rcs);
struct amdgpu_ib *ib = rcs == cs->main.rcs ? &cs->main : &cs->compute_ib;
struct amdgpu_ib *ib = &cs->main;
unsigned cs_epilog_dw = amdgpu_cs_epilog_dws(cs);
unsigned need_byte_size = (dw + cs_epilog_dw) * 4;
@ -1286,18 +1244,6 @@ static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rws,

util_queue_fence_wait(&fence->submitted);

if (dependency_flags & RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY) {
/* Syncobjs are not needed here. */
assert(!amdgpu_fence_is_syncobj(fence));

if (acs->ws->info.has_scheduled_fence_dependency &&
dependency_flags & RADEON_DEPENDENCY_START_FENCE)
add_fence_to_list(&cs->compute_start_fence_dependencies, fence);
else
add_fence_to_list(&cs->compute_fence_dependencies, fence);
return;
}

/* Start fences are not needed here. */
assert(!(dependency_flags & RADEON_DEPENDENCY_START_FENCE));
@ -1589,66 +1535,6 @@ static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
num_chunks++;
}

/* Submit the parallel compute IB first. */
if (cs->ib[IB_PARALLEL_COMPUTE].ib_bytes > 0) {
unsigned old_num_chunks = num_chunks;

/* Add compute fence dependencies. */
unsigned num_dependencies = cs->compute_fence_dependencies.num;
if (num_dependencies) {
struct drm_amdgpu_cs_chunk_dep *dep_chunk =
alloca(num_dependencies * sizeof(*dep_chunk));

for (unsigned i = 0; i < num_dependencies; i++) {
struct amdgpu_fence *fence =
(struct amdgpu_fence*)cs->compute_fence_dependencies.list[i];

assert(util_queue_fence_is_signalled(&fence->submitted));
amdgpu_cs_chunk_fence_to_dep(&fence->fence, &dep_chunk[i]);
}

chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_DEPENDENCIES;
chunks[num_chunks].length_dw = sizeof(dep_chunk[0]) / 4 * num_dependencies;
chunks[num_chunks].chunk_data = (uintptr_t)dep_chunk;
num_chunks++;
}

/* Add compute start fence dependencies. */
unsigned num_start_dependencies = cs->compute_start_fence_dependencies.num;
if (num_start_dependencies) {
struct drm_amdgpu_cs_chunk_dep *dep_chunk =
alloca(num_start_dependencies * sizeof(*dep_chunk));

for (unsigned i = 0; i < num_start_dependencies; i++) {
struct amdgpu_fence *fence =
(struct amdgpu_fence*)cs->compute_start_fence_dependencies.list[i];

assert(util_queue_fence_is_signalled(&fence->submitted));
amdgpu_cs_chunk_fence_to_dep(&fence->fence, &dep_chunk[i]);
}

chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES;
chunks[num_chunks].length_dw = sizeof(dep_chunk[0]) / 4 * num_start_dependencies;
chunks[num_chunks].chunk_data = (uintptr_t)dep_chunk;
num_chunks++;
}

/* Convert from dwords to bytes. */
cs->ib[IB_PARALLEL_COMPUTE].ib_bytes *= 4;
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
chunks[num_chunks].chunk_data = (uintptr_t)&cs->ib[IB_PARALLEL_COMPUTE];
num_chunks++;

r = acs->noop ? 0 : amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, bo_list,
num_chunks, chunks, NULL);
if (r)
goto finalize;

/* Back off the compute chunks. */
num_chunks = old_num_chunks;
}

/* Syncobj signals. */
unsigned num_syncobj_to_signal = cs->syncobj_to_signal.num;
if (num_syncobj_to_signal) {
@ -1706,7 +1592,7 @@ static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
r = acs->noop ? 0 : amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, bo_list,
num_chunks, chunks, &seq_no);
}
finalize:

if (r) {
if (r == -ENOMEM)
fprintf(stderr, "amdgpu: Not enough memory for command submission.\n");
@ -1798,12 +1684,6 @@ static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
}
if (cs->ring_type == RING_GFX)
ws->gfx_ib_size_counter += (rcs->prev_dw + rcs->current.cdw) * 4;

/* Also pad secondary IBs. */
if (cs->compute_ib.ib_mapped) {
while (cs->compute_ib.rcs->current.cdw & ib_pad_dw_mask)
radeon_emit(cs->compute_ib.rcs, PKT3_NOP_PAD);
}
break;
case RING_UVD:
case RING_UVD_ENC:
@ -1839,9 +1719,6 @@ static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
/* Set IB sizes. */
amdgpu_ib_finalize(ws, rcs, &cs->main);

if (cs->compute_ib.ib_mapped)
amdgpu_ib_finalize(ws, cs->compute_ib.rcs, &cs->compute_ib);

/* Create a fence. */
amdgpu_fence_reference(&cur->fence, NULL);
if (cs->next_fence) {
@ -1897,8 +1774,6 @@ static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
memset(cs->csc->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));

amdgpu_get_new_ib(ws, rcs, &cs->main, cs);
if (cs->compute_ib.ib_mapped)
amdgpu_get_new_ib(ws, cs->compute_ib.rcs, &cs->compute_ib, cs);

if (cs->preamble_ib_bo) {
amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo, RADEON_USAGE_READ, 0,
@ -1929,9 +1804,6 @@ static void amdgpu_cs_destroy(struct radeon_cmdbuf *rcs)
radeon_bo_reference(&cs->ws->dummy_ws.base, &cs->preamble_ib_bo, NULL);
radeon_bo_reference(&cs->ws->dummy_ws.base, &cs->main.big_ib_buffer, NULL);
FREE(rcs->prev);
radeon_bo_reference(&cs->ws->dummy_ws.base, &cs->compute_ib.big_ib_buffer, NULL);
if (cs->compute_ib.rcs)
FREE(cs->compute_ib.rcs->prev);
amdgpu_destroy_cs_context(cs->ws, &cs->csc1);
amdgpu_destroy_cs_context(cs->ws, &cs->csc2);
amdgpu_fence_reference(&cs->next_fence, NULL);
@ -1954,7 +1826,6 @@ void amdgpu_cs_init_functions(struct amdgpu_screen_winsys *ws)
ws->base.ctx_destroy = amdgpu_ctx_destroy;
ws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status;
ws->base.cs_create = amdgpu_cs_create;
ws->base.cs_add_parallel_compute_ib = amdgpu_cs_add_parallel_compute_ib;
ws->base.cs_setup_preemption = amdgpu_cs_setup_preemption;
ws->base.cs_destroy = amdgpu_cs_destroy;
ws->base.cs_add_buffer = amdgpu_cs_add_buffer;

@ -58,7 +58,6 @@ struct amdgpu_cs_buffer {
enum ib_type {
IB_PREAMBLE,
IB_MAIN,
IB_PARALLEL_COMPUTE,
IB_NUM,
};
@ -115,10 +114,6 @@ struct amdgpu_cs_context {
struct amdgpu_fence_list syncobj_dependencies;
struct amdgpu_fence_list syncobj_to_signal;

/* The compute IB uses the dependencies above + these: */
struct amdgpu_fence_list compute_fence_dependencies;
struct amdgpu_fence_list compute_start_fence_dependencies;

struct pipe_fence_handle *fence;

/* the error returned from cs_flush for non-async submissions */
@ -132,7 +127,6 @@ struct amdgpu_cs_context {

struct amdgpu_cs {
struct amdgpu_ib main; /* must be first because this is inherited */
struct amdgpu_ib compute_ib; /* optional parallel compute IB */
struct amdgpu_winsys *ws;
struct amdgpu_ctx *ctx;
enum ring_type ring_type;