From 845ed015dd359f40c74a2466efecc815500f07b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 7 Jun 2023 15:48:46 -0400 Subject: [PATCH] radeonsi: remove gfx10 NGG streamout Unused and unstable. Keep it only for gfx11. Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- .../radeonsi/{gfx10_query.c => gfx11_query.c} | 118 +++++++++--------- src/gallium/drivers/radeonsi/meson.build | 2 +- src/gallium/drivers/radeonsi/si_gfx_cs.c | 25 +--- .../drivers/radeonsi/si_nir_lower_abi.c | 4 +- src/gallium/drivers/radeonsi/si_pipe.c | 7 +- src/gallium/drivers/radeonsi/si_pipe.h | 10 +- src/gallium/drivers/radeonsi/si_query.c | 18 +-- src/gallium/drivers/radeonsi/si_query.h | 12 +- src/gallium/drivers/radeonsi/si_shader.c | 7 +- .../drivers/radeonsi/si_shaderlib_tgsi.c | 2 +- .../drivers/radeonsi/si_state_shaders.cpp | 8 +- .../drivers/radeonsi/si_state_streamout.c | 51 ++------ 12 files changed, 107 insertions(+), 157 deletions(-) rename src/gallium/drivers/radeonsi/{gfx10_query.c => gfx11_query.c} (78%) diff --git a/src/gallium/drivers/radeonsi/gfx10_query.c b/src/gallium/drivers/radeonsi/gfx11_query.c similarity index 78% rename from src/gallium/drivers/radeonsi/gfx10_query.c rename to src/gallium/drivers/radeonsi/gfx11_query.c index 8040b67dcd8..bfcd8e25110 100644 --- a/src/gallium/drivers/radeonsi/gfx10_query.c +++ b/src/gallium/drivers/radeonsi/gfx11_query.c @@ -16,19 +16,19 @@ static void emit_shader_query(struct si_context *sctx) { assert(!list_is_empty(&sctx->shader_query_buffers)); - struct gfx10_sh_query_buffer *qbuf = - list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list); - qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem); + struct gfx11_sh_query_buffer *qbuf = + list_last_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list); + qbuf->head += sizeof(struct gfx11_sh_query_buffer_mem); } -static void gfx10_release_query_buffers(struct si_context *sctx, - struct gfx10_sh_query_buffer *first, - struct gfx10_sh_query_buffer *last) +static void gfx11_release_query_buffers(struct si_context *sctx, + struct gfx11_sh_query_buffer *first, + struct gfx11_sh_query_buffer *last) { while (first) { - struct gfx10_sh_query_buffer *qbuf = first; + struct gfx11_sh_query_buffer *qbuf = first; if (first != last) - first = list_entry(qbuf->list.next, struct gfx10_sh_query_buffer, list); + first = list_entry(qbuf->list.next, struct gfx11_sh_query_buffer, list); else first = NULL; @@ -47,19 +47,19 @@ static void gfx10_release_query_buffers(struct si_context *sctx, } } -static bool gfx10_alloc_query_buffer(struct si_context *sctx) +static bool gfx11_alloc_query_buffer(struct si_context *sctx) { if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query)) return true; - struct gfx10_sh_query_buffer *qbuf = NULL; + struct gfx11_sh_query_buffer *qbuf = NULL; if (!list_is_empty(&sctx->shader_query_buffers)) { - qbuf = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list); - if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0) + qbuf = list_last_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list); + if (qbuf->head + sizeof(struct gfx11_sh_query_buffer_mem) <= qbuf->buf->b.b.width0) goto success; - qbuf = list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list); + qbuf = list_first_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list); if (!qbuf->refcount && !si_cs_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) && sctx->ws->buffer_wait(sctx->ws, qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) { @@ -71,13 +71,13 @@ static bool gfx10_alloc_query_buffer(struct si_context *sctx) } if (!qbuf) { - qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer); + qbuf = CALLOC_STRUCT(gfx11_sh_query_buffer); if (unlikely(!qbuf)) return false; struct si_screen *screen = sctx->screen; unsigned buf_size = - MAX2(sizeof(struct gfx10_sh_query_buffer_mem), screen->info.min_alloc_size); + MAX2(sizeof(struct gfx11_sh_query_buffer_mem), screen->info.min_alloc_size); qbuf->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size)); if (unlikely(!qbuf->buf)) { FREE(qbuf); @@ -94,7 +94,7 @@ static bool gfx10_alloc_query_buffer(struct si_context *sctx) PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED); assert(results); - for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem); i < e; + for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx11_sh_query_buffer_mem); i < e; ++i) { for (unsigned j = 0; j < 16; ++j) results[32 * i + j] = (uint64_t)1 << 63; @@ -109,7 +109,7 @@ success:; struct pipe_shader_buffer sbuf; sbuf.buffer = &qbuf->buf->b.b; sbuf.buffer_offset = qbuf->head; - sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem); + sbuf.buffer_size = sizeof(struct gfx11_sh_query_buffer_mem); si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, &sbuf); SET_FIELD(sctx->current_gs_state, GS_STATE_STREAMOUT_QUERY_ENABLED, 1); @@ -117,24 +117,24 @@ success:; return true; } -static void gfx10_sh_query_destroy(struct si_context *sctx, struct si_query *rquery) +static void gfx11_sh_query_destroy(struct si_context *sctx, struct si_query *rquery) { - struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; - gfx10_release_query_buffers(sctx, query->first, query->last); + struct gfx11_sh_query *query = (struct gfx11_sh_query *)rquery; + gfx11_release_query_buffers(sctx, query->first, query->last); FREE(query); } -static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquery) +static bool gfx11_sh_query_begin(struct si_context *sctx, struct si_query *rquery) { - struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; + struct gfx11_sh_query *query = (struct gfx11_sh_query *)rquery; - gfx10_release_query_buffers(sctx, query->first, query->last); + gfx11_release_query_buffers(sctx, query->first, query->last); query->first = query->last = NULL; - if (unlikely(!gfx10_alloc_query_buffer(sctx))) + if (unlikely(!gfx11_alloc_query_buffer(sctx))) return false; - query->first = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list); + query->first = list_last_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list); query->first_begin = query->first->head; sctx->num_active_shader_queries++; @@ -143,21 +143,21 @@ static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquer return true; } -static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery) +static bool gfx11_sh_query_end(struct si_context *sctx, struct si_query *rquery) { - struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; + struct gfx11_sh_query *query = (struct gfx11_sh_query *)rquery; if (unlikely(!query->first)) return false; /* earlier out of memory error */ - query->last = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list); + query->last = list_last_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list); query->last_end = query->last->head; /* Signal the fence of the previous chunk */ if (query->last_end != 0) { uint64_t fence_va = query->last->buf->gpu_address; - fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem); - fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence); + fence_va += query->last_end - sizeof(struct gfx11_sh_query_buffer_mem); + fence_va += offsetof(struct gfx11_sh_query_buffer_mem, fence); si_cp_release_mem(sctx, &sctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, EOP_DATA_SEL_VALUE_32BIT, query->last->buf, fence_va, 0xffffffff, PIPE_QUERY_GPU_FINISHED); @@ -178,8 +178,8 @@ static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery) return true; } -static void gfx10_sh_query_add_result(struct gfx10_sh_query *query, - struct gfx10_sh_query_buffer_mem *qmem, +static void gfx11_sh_query_add_result(struct gfx11_sh_query *query, + struct gfx11_sh_query_buffer_mem *qmem, union pipe_query_result *result) { static const uint64_t mask = ((uint64_t)1 << 63) - 1; @@ -212,10 +212,10 @@ static void gfx10_sh_query_add_result(struct gfx10_sh_query *query, } } -static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery, bool wait, +static bool gfx11_sh_query_get_result(struct si_context *sctx, struct si_query *rquery, bool wait, union pipe_query_result *result) { - struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; + struct gfx11_sh_query *query = (struct gfx11_sh_query *)rquery; util_query_clear_result(result, query->b.type); @@ -223,8 +223,8 @@ static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query * return false; /* earlier out of memory error */ assert(query->last); - for (struct gfx10_sh_query_buffer *qbuf = query->last;; - qbuf = list_entry(qbuf->list.prev, struct gfx10_sh_query_buffer, list)) { + for (struct gfx11_sh_query_buffer *qbuf = query->last;; + qbuf = list_entry(qbuf->list.prev, struct gfx11_sh_query_buffer, list)) { unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK); void *map; @@ -244,10 +244,10 @@ static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query * results_end = query->last_end; while (results_begin != results_end) { - struct gfx10_sh_query_buffer_mem *qmem = map + results_begin; + struct gfx11_sh_query_buffer_mem *qmem = map + results_begin; results_begin += sizeof(*qmem); - gfx10_sh_query_add_result(query, qmem, result); + gfx11_sh_query_add_result(query, qmem, result); } if (qbuf == query->first) @@ -257,19 +257,19 @@ static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query * return true; } -static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct si_query *rquery, +static void gfx11_sh_query_get_result_resource(struct si_context *sctx, struct si_query *rquery, enum pipe_query_flags flags, enum pipe_query_value_type result_type, int index, struct pipe_resource *resource, unsigned offset) { - struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; + struct gfx11_sh_query *query = (struct gfx11_sh_query *)rquery; struct si_qbo_state saved_state = {}; struct pipe_resource *tmp_buffer = NULL; unsigned tmp_buffer_offset = 0; if (!sctx->sh_query_result_shader) { - sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx); + sctx->sh_query_result_shader = gfx11_create_sh_query_result_cs(sctx); if (!sctx->sh_query_result_shader) return; } @@ -345,7 +345,7 @@ static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct s grid.grid[1] = 1; grid.grid[2] = 1; - struct gfx10_sh_query_buffer *qbuf = query->first; + struct gfx11_sh_query_buffer *qbuf = query->first; for (;;) { unsigned begin = qbuf == query->first ? query->first_begin : 0; unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0; @@ -356,7 +356,7 @@ static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct s ssbo[0].buffer_offset = begin; ssbo[0].buffer_size = end - begin; - consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem); + consts.result_count = (end - begin) / sizeof(struct gfx11_sh_query_buffer_mem); consts.chain = 0; if (qbuf != query->first) consts.chain |= 1; @@ -379,8 +379,8 @@ static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct s * serialized in the CP. */ va = qbuf->buf->gpu_address; - va += end - sizeof(struct gfx10_sh_query_buffer_mem); - va += offsetof(struct gfx10_sh_query_buffer_mem, fence); + va += end - sizeof(struct gfx11_sh_query_buffer_mem); + va += offsetof(struct gfx11_sh_query_buffer_mem, fence); si_cp_wait_mem(sctx, &sctx->gfx_cs, va, 0x00000001, 0x00000001, 0); } @@ -393,49 +393,49 @@ static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct s if (qbuf == query->last) break; - qbuf = list_entry(qbuf->list.next, struct gfx10_sh_query_buffer, list); + qbuf = list_entry(qbuf->list.next, struct gfx11_sh_query_buffer, list); } si_restore_qbo_state(sctx, &saved_state); pipe_resource_reference(&tmp_buffer, NULL); } -static const struct si_query_ops gfx10_sh_query_ops = { - .destroy = gfx10_sh_query_destroy, - .begin = gfx10_sh_query_begin, - .end = gfx10_sh_query_end, - .get_result = gfx10_sh_query_get_result, - .get_result_resource = gfx10_sh_query_get_result_resource, +static const struct si_query_ops gfx11_sh_query_ops = { + .destroy = gfx11_sh_query_destroy, + .begin = gfx11_sh_query_begin, + .end = gfx11_sh_query_end, + .get_result = gfx11_sh_query_get_result, + .get_result_resource = gfx11_sh_query_get_result_resource, }; -struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type, +struct pipe_query *gfx11_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type, unsigned index) { - struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query); + struct gfx11_sh_query *query = CALLOC_STRUCT(gfx11_sh_query); if (unlikely(!query)) return NULL; - query->b.ops = &gfx10_sh_query_ops; + query->b.ops = &gfx11_sh_query_ops; query->b.type = query_type; query->stream = index; return (struct pipe_query *)query; } -void gfx10_init_query(struct si_context *sctx) +void gfx11_init_query(struct si_context *sctx) { list_inithead(&sctx->shader_query_buffers); sctx->atoms.s.shader_query.emit = emit_shader_query; } -void gfx10_destroy_query(struct si_context *sctx) +void gfx11_destroy_query(struct si_context *sctx) { if (!sctx->shader_query_buffers.next) return; while (!list_is_empty(&sctx->shader_query_buffers)) { - struct gfx10_sh_query_buffer *qbuf = - list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list); + struct gfx11_sh_query_buffer *qbuf = + list_first_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list); list_del(&qbuf->list); assert(!qbuf->refcount); diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build index a0570276de5..1d80ee3185d 100644 --- a/src/gallium/drivers/radeonsi/meson.build +++ b/src/gallium/drivers/radeonsi/meson.build @@ -20,8 +20,8 @@ files_libradeonsi = files( 'driinfo_radeonsi.h', - 'gfx10_query.c', 'gfx10_shader_ngg.c', + 'gfx11_query.c', 'si_blit.c', 'si_buffer.c', 'si_build_pm4.h', diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index 75db3581283..8aa969d0c66 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -85,12 +85,8 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h * idle when we leave the IB, otherwise another process * might overwrite it while our shaders are busy. */ - if (sscreen->use_ngg_streamout) { - if (ctx->gfx_level >= GFX11) - wait_flags |= SI_CONTEXT_VS_PARTIAL_FLUSH; - else - wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; - } + if (ctx->gfx_level >= GFX11) + wait_flags |= SI_CONTEXT_VS_PARTIAL_FLUSH; } } @@ -194,8 +190,6 @@ static void si_begin_gfx_cs_debug(struct si_context *ctx) static void si_add_gds_to_buffer_list(struct si_context *sctx) { - if (sctx->screen->gds) - sctx->ws->cs_add_buffer(&sctx->gfx_cs, sctx->screen->gds, RADEON_USAGE_READWRITE, 0); if (sctx->screen->gds_oa) sctx->ws->cs_add_buffer(&sctx->gfx_cs, sctx->screen->gds_oa, RADEON_USAGE_READWRITE, 0); } @@ -204,25 +198,16 @@ void si_allocate_gds(struct si_context *sctx) { struct radeon_winsys *ws = sctx->ws; - assert(sctx->screen->use_ngg_streamout); + assert(sctx->gfx_level >= GFX11); if (sctx->screen->gds_oa) return; - assert(!sctx->screen->gds && !sctx->screen->gds_oa); - - /* Gfx11 only uses GDS OA, not GDS memory. - * Gfx10 needs 256B (64 dw) of GDS, otherwise streamout hangs. - */ + /* Gfx11 only uses GDS OA, not GDS memory. */ simple_mtx_lock(&sctx->screen->gds_mutex); if (!sctx->screen->gds_oa) { sctx->screen->gds_oa = ws->buffer_create(ws, 1, 1, RADEON_DOMAIN_OA, RADEON_FLAG_DRIVER_INTERNAL); assert(sctx->screen->gds_oa); - - if (sctx->gfx_level < GFX11) { - sctx->screen->gds = ws->buffer_create(ws, 256, 4, RADEON_DOMAIN_GDS, RADEON_FLAG_DRIVER_INTERNAL); - assert(sctx->screen->gds); - } } simple_mtx_unlock(&sctx->screen->gds_mutex); @@ -514,7 +499,7 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs) si_mark_atom_dirty(ctx, &ctx->atoms.s.dpbb_state); si_mark_atom_dirty(ctx, &ctx->atoms.s.stencil_ref); si_mark_atom_dirty(ctx, &ctx->atoms.s.spi_map); - if (!ctx->screen->use_ngg_streamout) + if (ctx->gfx_level < GFX11) si_mark_atom_dirty(ctx, &ctx->atoms.s.streamout_enable); /* CLEAR_STATE disables all window rectangles. */ if (!has_clear_state || ctx->num_window_rectangles > 0) diff --git a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c index 4841a379e26..342b4cb4c39 100644 --- a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c +++ b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c @@ -497,8 +497,8 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s unsigned stream = nir_intrinsic_stream_id(intrin); unsigned offset = intrin->intrinsic == nir_intrinsic_atomic_add_gen_prim_count_amd ? - offsetof(struct gfx10_sh_query_buffer_mem, stream[stream].generated_primitives) : - offsetof(struct gfx10_sh_query_buffer_mem, stream[stream].emitted_primitives); + offsetof(struct gfx11_sh_query_buffer_mem, stream[stream].generated_primitives) : + offsetof(struct gfx11_sh_query_buffer_mem, stream[stream].emitted_primitives); nir_ssa_def *prim_count = intrin->src[0].ssa; nir_ssbo_atomic(b, 32, buf, nir_imm_int(b, offset), prim_count, diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 1580a5ee86b..80695b9bad5 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -192,7 +192,7 @@ static void si_destroy_context(struct pipe_context *context) si_release_all_descriptors(sctx); if (sctx->gfx_level >= GFX10 && sctx->has_graphics) - gfx10_destroy_query(sctx); + gfx11_destroy_query(sctx); if (sctx->sqtt) { struct si_screen *sscreen = sctx->screen; @@ -634,7 +634,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign /* Initialize graphics-only context functions. */ if (sctx->has_graphics) { if (sctx->gfx_level >= GFX10) - gfx10_init_query(sctx); + gfx11_init_query(sctx); si_init_msaa_functions(sctx); si_init_shader_functions(sctx); si_init_state_functions(sctx); @@ -993,7 +993,6 @@ static void si_destroy_screen(struct pipe_screen *pscreen) simple_mtx_destroy(&sscreen->gpu_load_mutex); simple_mtx_destroy(&sscreen->gds_mutex); - radeon_bo_reference(sscreen->ws, &sscreen->gds, NULL); radeon_bo_reference(sscreen->ws, &sscreen->gds_oa, NULL); slab_destroy_parent(&sscreen->pool_transfers); @@ -1308,7 +1307,6 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, if (sscreen->info.gfx_level >= GFX11) { sscreen->use_ngg = true; - sscreen->use_ngg_streamout = true; /* TODO: Disable for now. Investigate if it helps. */ sscreen->use_ngg_culling = (sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_ALL)) && !(sscreen->debug_flags & DBG(NO_NGG_CULLING)); @@ -1317,7 +1315,6 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, sscreen->info.gfx_level >= GFX10 && (sscreen->info.family != CHIP_NAVI14 || sscreen->info.is_pro_graphics); - sscreen->use_ngg_streamout = false; sscreen->use_ngg_culling = sscreen->use_ngg && sscreen->info.max_render_backends >= 2 && !(sscreen->debug_flags & DBG(NO_NGG_CULLING)); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index f2d0b21b52b..34902f93a24 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -564,7 +564,6 @@ struct si_screen { bool dpbb_allowed; bool use_ngg; bool use_ngg_culling; - bool use_ngg_streamout; bool allow_dcc_msaa_clear_to_reg_for_bpp[5]; /* indexed by log2(Bpp) */ bool always_allow_dcc_stores; @@ -690,7 +689,6 @@ struct si_screen { /* NGG streamout. */ simple_mtx_t gds_mutex; - struct pb_buffer *gds; struct pb_buffer *gds_oa; }; @@ -1597,11 +1595,11 @@ void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx); void *si_clear_12bytes_buffer_shader(struct pipe_context *ctx); void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples, bool is_array); void *si_create_query_result_cs(struct si_context *sctx); -void *gfx10_create_sh_query_result_cs(struct si_context *sctx); +void *gfx11_create_sh_query_result_cs(struct si_context *sctx); -/* gfx10_query.c */ -void gfx10_init_query(struct si_context *sctx); -void gfx10_destroy_query(struct si_context *sctx); +/* gfx11_query.c */ +void gfx11_init_query(struct si_context *sctx); +void gfx11_destroy_query(struct si_context *sctx); /* si_test_image_copy_region.c */ void si_test_image_copy_region(struct si_screen *sscreen); diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c index 9893d69497b..70339c793b2 100644 --- a/src/gallium/drivers/radeonsi/si_query.c +++ b/src/gallium/drivers/radeonsi/si_query.c @@ -1050,10 +1050,10 @@ static void si_emit_query_predication(struct si_context *ctx) flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT || ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT; - if (ctx->screen->use_ngg_streamout && (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || - query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) { - struct gfx10_sh_query *gfx10_query = (struct gfx10_sh_query *)query; - struct gfx10_sh_query_buffer *qbuf, *first, *last; + if (ctx->gfx_level >= GFX11 && (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || + query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) { + struct gfx11_sh_query *gfx10_query = (struct gfx11_sh_query *)query; + struct gfx11_sh_query_buffer *qbuf, *first, *last; op = PRED_OP(PREDICATION_OP_PRIMCOUNT); @@ -1071,7 +1071,7 @@ static void si_emit_query_predication(struct si_context *ctx) while (first) { qbuf = first; if (first != last) - first = list_entry(qbuf->list.next, struct gfx10_sh_query_buffer, list); + first = list_entry(qbuf->list.next, struct gfx11_sh_query_buffer, list); else first = NULL; @@ -1082,7 +1082,7 @@ static void si_emit_query_predication(struct si_context *ctx) unsigned begin = qbuf == gfx10_query->first ? gfx10_query->first_begin : 0; unsigned end = qbuf == gfx10_query->last ? gfx10_query->last_end : qbuf->buf->b.b.width0; - unsigned count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem); + unsigned count = (end - begin) / sizeof(struct gfx11_sh_query_buffer_mem); do { if (gfx10_query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) { for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) { @@ -1096,7 +1096,7 @@ static void si_emit_query_predication(struct si_context *ctx) op |= PREDICATION_CONTINUE; } - results_base += sizeof(struct gfx10_sh_query_buffer_mem); + results_base += sizeof(struct gfx11_sh_query_buffer_mem); } while (count--); } } else { @@ -1178,12 +1178,12 @@ static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned que (query_type >= PIPE_QUERY_DRIVER_SPECIFIC)) return si_query_sw_create(query_type); - if (sscreen->use_ngg_streamout && + if (sscreen->info.gfx_level >= GFX11 && (query_type == PIPE_QUERY_PRIMITIVES_EMITTED || query_type == PIPE_QUERY_PRIMITIVES_GENERATED || query_type == PIPE_QUERY_SO_STATISTICS || query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) - return gfx10_sh_query_create(sscreen, query_type, index); + return gfx11_sh_query_create(sscreen, query_type, index); return si_query_hw_create(sscreen, query_type, index); } diff --git a/src/gallium/drivers/radeonsi/si_query.h b/src/gallium/drivers/radeonsi/si_query.h index 2e1e175dd4e..e51bf4ad47c 100644 --- a/src/gallium/drivers/radeonsi/si_query.h +++ b/src/gallium/drivers/radeonsi/si_query.h @@ -221,7 +221,7 @@ void si_query_hw_resume(struct si_context *sctx, struct si_query *query); * point into the ring, allowing an arbitrary number of queries to be active * without additional GPU cost. */ -struct gfx10_sh_query_buffer { +struct gfx11_sh_query_buffer { struct list_head list; struct si_resource *buf; unsigned refcount; @@ -237,7 +237,7 @@ struct gfx10_sh_query_buffer { * SET_PREDICATION packet, which also means that we're setting the high bit * of all those values unconditionally. */ -struct gfx10_sh_query_buffer_mem { +struct gfx11_sh_query_buffer_mem { struct { uint64_t generated_primitives_start_dummy; uint64_t emitted_primitives_start_dummy; @@ -248,18 +248,18 @@ struct gfx10_sh_query_buffer_mem { uint32_t pad[31]; }; -struct gfx10_sh_query { +struct gfx11_sh_query { struct si_query b; - struct gfx10_sh_query_buffer *first; - struct gfx10_sh_query_buffer *last; + struct gfx11_sh_query_buffer *first; + struct gfx11_sh_query_buffer *last; unsigned first_begin; unsigned last_end; unsigned stream; }; -struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type, +struct pipe_query *gfx11_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type, unsigned index); /* Performance counters */ diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 18b79a2dc57..38d29da9185 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -142,7 +142,8 @@ static void declare_streamout_params(struct si_shader_args *args, struct si_shad { struct si_shader_selector *sel = shader->selector; - if (sel->screen->use_ngg_streamout) { + if (shader->selector->screen->info.gfx_level >= GFX11) { + /* NGG streamout. */ if (sel->stage == MESA_SHADER_TESS_EVAL) ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); return; @@ -1761,7 +1762,7 @@ static void si_lower_ngg(struct si_shader *shader, nir_shader *nir) options.passthrough = gfx10_is_ngg_passthrough(shader); options.use_edgeflags = gfx10_edgeflags_have_effect(shader); options.has_gen_prim_query = options.has_xfb_prim_query = - sel->screen->use_ngg_streamout && !sel->info.base.vs.blit_sgprs_amd; + sel->screen->info.gfx_level >= GFX11 && !sel->info.base.vs.blit_sgprs_amd; options.export_primitive_id = key->ge.mono.u.vs_export_prim_id; options.instance_rate_inputs = instance_rate_inputs; options.user_clip_plane_enable_mask = clip_plane_enable; @@ -1772,7 +1773,7 @@ static void si_lower_ngg(struct si_shader *shader, nir_shader *nir) options.gs_out_vtx_bytes = sel->info.gsvs_vertex_size; options.has_gen_prim_query = options.has_xfb_prim_query = - sel->screen->use_ngg_streamout; + sel->screen->info.gfx_level >= GFX11; NIR_PASS_V(nir, ac_nir_lower_ngg_gs, &options); } diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c index 42dc364daf6..b01fdcfe415 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c @@ -581,7 +581,7 @@ void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples, * 2: write next summary buffer * 0.w = result_count */ -void *gfx10_create_sh_query_result_cs(struct si_context *sctx) +void *gfx11_create_sh_query_result_cs(struct si_context *sctx) { /* TEMP[0].x = accumulated result so far * TEMP[0].y = result missing diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 4d7e5ba6187..777aa16deaf 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -1681,7 +1681,7 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader, else if (sscreen->info.gfx_level == GFX9) rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5); - if (!sscreen->use_ngg_streamout && si_shader_uses_streamout(shader)) { + if (si_shader_uses_streamout(shader)) { rsrc2 |= S_00B12C_SO_BASE0_EN(!!shader->selector->info.base.xfb_stride[0]) | S_00B12C_SO_BASE1_EN(!!shader->selector->info.base.xfb_stride[1]) | S_00B12C_SO_BASE2_EN(!!shader->selector->info.base.xfb_stride[2]) | @@ -2963,7 +2963,7 @@ static void si_init_shader_selector_async(void *job, void *gdata, int thread_ind if (sel->stage <= MESA_SHADER_GEOMETRY && sscreen->use_ngg && (!sel->info.enabled_streamout_buffer_mask || - sscreen->use_ngg_streamout) && + sscreen->info.gfx_level >= GFX11) && ((sel->stage == MESA_SHADER_VERTEX && !shader->key.ge.as_ls) || sel->stage == MESA_SHADER_TESS_EVAL || sel->stage == MESA_SHADER_GEOMETRY)) shader->key.ge.as_ngg = 1; @@ -3258,7 +3258,7 @@ static void si_update_streamout_state(struct si_context *sctx) sctx->streamout.stride_in_dw = shader_with_so->info.base.xfb_stride; /* GDS must be allocated when any GDS instructions are used, otherwise it hangs. */ - if (sctx->screen->use_ngg_streamout && shader_with_so->info.enabled_streamout_buffer_mask) + if (sctx->gfx_level >= GFX11 && shader_with_so->info.enabled_streamout_buffer_mask) si_allocate_gds(sctx); } @@ -3384,7 +3384,7 @@ bool si_update_ngg(struct si_context *sctx) if (sctx->shader.gs.cso && sctx->shader.tes.cso && sctx->shader.gs.cso->tess_turns_off_ngg) { new_ngg = false; - } else if (!sctx->screen->use_ngg_streamout) { + } else if (sctx->gfx_level < GFX11) { struct si_shader_selector *last = si_get_vs(sctx)->cso; if ((last && last->info.enabled_streamout_buffer_mask) || diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c index d6fe2cc48d0..fefabf875d7 100644 --- a/src/gallium/drivers/radeonsi/si_state_streamout.c +++ b/src/gallium/drivers/radeonsi/si_state_streamout.c @@ -92,13 +92,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ */ sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE; - if (sctx->screen->use_ngg_streamout) { - if (sctx->gfx_level >= GFX11) { - sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME; - } else { - /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */ - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME; - } + if (sctx->gfx_level >= GFX11) { + sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME; /* Wait now. This is needed to make sure that GDS is not * busy at the end of IBs. @@ -149,7 +144,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ /* Allocate space for the filled buffer size. */ struct si_streamout_target *t = sctx->streamout.targets[i]; if (!t->buf_filled_size) { - unsigned buf_filled_size_size = sctx->screen->use_ngg_streamout ? 8 : 4; + unsigned buf_filled_size_size = sctx->gfx_level >= GFX11 ? 8 : 4; u_suballocator_alloc(&sctx->allocator_zeroed_memory, buf_filled_size_size, 4, &t->buf_filled_size_offset, (struct pipe_resource **)&t->buf_filled_size); @@ -181,7 +176,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ struct pipe_shader_buffer sbuf; sbuf.buffer = targets[i]->buffer; - if (sctx->screen->use_ngg_streamout) { + if (sctx->gfx_level >= GFX11) { sbuf.buffer_offset = targets[i]->buffer_offset; sbuf.buffer_size = targets[i]->buffer_size; } else { @@ -243,7 +238,7 @@ static void si_emit_streamout_begin(struct si_context *sctx) struct radeon_cmdbuf *cs = &sctx->gfx_cs; struct si_streamout_target **t = sctx->streamout.targets; - if (!sctx->screen->use_ngg_streamout) + if (sctx->gfx_level < GFX11) si_flush_vgt_streamout(sctx); for (unsigned i = 0; i < sctx->streamout.num_targets; i++) { @@ -265,27 +260,6 @@ static void si_emit_streamout_begin(struct si_context *sctx) radeon_set_uconfig_reg(R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 + i * 4, 0); radeon_end(); } - } else if (sctx->screen->use_ngg_streamout) { - bool append = sctx->streamout.append_bitmask & (1 << i); - uint64_t va = 0; - - if (append) { - radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size, - RADEON_USAGE_READ | RADEON_PRIO_SO_FILLED_SIZE); - - va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; - } - - radeon_begin(cs); - radeon_emit(PKT3(PKT3_DMA_DATA, 5, 0)); - radeon_emit(S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) | - S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(1)); - radeon_emit(va); - radeon_emit(va >> 32); - radeon_emit(4 * i); /* destination in GDS */ - radeon_emit(0); - radeon_emit(S_415_BYTE_COUNT_GFX9(4)); - radeon_end(); } else { /* Legacy streamout. * @@ -337,7 +311,7 @@ void si_emit_streamout_end(struct si_context *sctx) /* Wait for streamout to finish before reading GDS_STRMOUT registers. */ sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH; sctx->emit_cache_flush(sctx, &sctx->gfx_cs); - } else if (!sctx->screen->use_ngg_streamout) { + } else { si_flush_vgt_streamout(sctx); } @@ -353,11 +327,6 @@ void si_emit_streamout_end(struct si_context *sctx) COPY_DATA_REG, NULL, (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i); sctx->flags |= SI_CONTEXT_PFP_SYNC_ME; - } else if (sctx->screen->use_ngg_streamout) { - /* TODO: PS_DONE doesn't ensure completion of VS if there are no PS waves. */ - si_cp_release_mem(sctx, &sctx->gfx_cs, V_028A90_PS_DONE, 0, EOP_DST_SEL_TC_L2, - EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_GDS, - t[i]->buf_filled_size, va, EOP_DATA_GDS(i, 1), 0); } else { radeon_begin(cs); radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); @@ -394,7 +363,7 @@ void si_emit_streamout_end(struct si_context *sctx) static void si_emit_streamout_enable(struct si_context *sctx) { - assert(!sctx->screen->use_ngg_streamout); + assert(sctx->gfx_level < GFX11); radeon_begin(&sctx->gfx_cs); radeon_set_context_reg_seq(R_028B94_VGT_STRMOUT_CONFIG, 2); @@ -418,7 +387,7 @@ static void si_set_streamout_enable(struct si_context *sctx, bool enable) sctx->streamout.enabled_mask | (sctx->streamout.enabled_mask << 4) | (sctx->streamout.enabled_mask << 8) | (sctx->streamout.enabled_mask << 12); - if (!sctx->screen->use_ngg_streamout && + if (sctx->gfx_level < GFX11 && ((old_strmout_en != si_get_strmout_en(sctx)) || (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask))) si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable); @@ -426,7 +395,7 @@ static void si_set_streamout_enable(struct si_context *sctx, bool enable) void si_update_prims_generated_query_state(struct si_context *sctx, unsigned type, int diff) { - if (!sctx->screen->use_ngg_streamout && type == PIPE_QUERY_PRIMITIVES_GENERATED) { + if (sctx->gfx_level < GFX11 && type == PIPE_QUERY_PRIMITIVES_GENERATED) { bool old_strmout_en = si_get_strmout_en(sctx); sctx->streamout.num_prims_gen_queries += diff; @@ -451,6 +420,6 @@ void si_init_streamout_functions(struct si_context *sctx) sctx->b.set_stream_output_targets = si_set_streamout_targets; sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin; - if (!sctx->screen->use_ngg_streamout) + if (sctx->gfx_level < GFX11) sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable; }