diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources
index 83cca397716..f25309736c9 100644
--- a/src/gallium/drivers/radeonsi/Makefile.sources
+++ b/src/gallium/drivers/radeonsi/Makefile.sources
@@ -6,6 +6,7 @@ C_SOURCES := \
 	$(GENERATED_SOURCES) \
 	cik_sdma.c \
 	driinfo_radeonsi.h \
+	gfx10_query.c \
 	gfx10_shader_ngg.c \
 	si_blit.c \
 	si_buffer.c \
diff --git a/src/gallium/drivers/radeonsi/gfx10_query.c b/src/gallium/drivers/radeonsi/gfx10_query.c
new file mode 100644
index 00000000000..8584b2af505
--- /dev/null
+++ b/src/gallium/drivers/radeonsi/gfx10_query.c
@@ -0,0 +1,604 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stddef.h>
+
+#include "si_pipe.h"
+#include "si_query.h"
+#include "util/u_memory.h"
+#include "util/u_suballoc.h"
+#include "sid.h"
+
+/**
+ * The query buffer is written to by ESGS NGG shaders with statistics about
+ * generated and (streamout-)emitted primitives.
+ *
+ * The context maintains a ring of these query buffers, and queries simply
+ * point into the ring, allowing an arbitrary number of queries to be active
+ * without additional GPU cost.
+ */
+struct gfx10_sh_query_buffer {
+	struct list_head list;
+	struct si_resource *buf;
+	unsigned refcount;
+
+	/* Offset into the buffer in bytes; points at the first un-emitted entry. */
+	unsigned head;
+};
+
+/* Memory layout of the query buffer. Must be kept in sync with shaders
+ * (including QBO shaders) and should be aligned to cachelines.
+ *
+ * The somewhat awkward memory layout is for compatibility with the
+ * SET_PREDICATION packet, which also means that we're setting the high bit
+ * of all those values unconditionally.
+ */
+struct gfx10_sh_query_buffer_mem {
+	struct {
+		uint64_t generated_primitives_start_dummy;
+		uint64_t emitted_primitives_start_dummy;
+		uint64_t generated_primitives;
+		uint64_t emitted_primitives;
+	} stream[4];
+	uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */
+	uint32_t pad[31];
+};
+
+/* Shader-based queries. */
+struct gfx10_sh_query {
+	struct si_query b;
+
+	/* Range of ring buffers covered by the query, and the byte offsets at
+	 * which its entries start in "first" and stop in "last".
+	 */
+	struct gfx10_sh_query_buffer *first;
+	struct gfx10_sh_query_buffer *last;
+	unsigned first_begin;
+	unsigned last_end;
+
+	unsigned stream;
+};
+
+/* Byte offset of the record of "stream" within gfx10_sh_query_buffer_mem. */
+static unsigned gfx10_sh_query_stream_offset(unsigned stream)
+{
+	return stream * sizeof(((struct gfx10_sh_query_buffer_mem *)NULL)->stream[0]);
+}
+
+/* Emit callback of the shader_query atom: advance the head of the most
+ * recent query buffer by one entry.
+ */
+static void emit_shader_query(struct si_context *sctx)
+{
+	assert(!LIST_IS_EMPTY(&sctx->shader_query_buffers));
+
+	struct gfx10_sh_query_buffer *qbuf = list_last_entry(&sctx->shader_query_buffers,
+							     struct gfx10_sh_query_buffer, list);
+	qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem);
+}
+
+/* Drop one reference from every buffer in the list range [first, last].
+ * Buffers that become unreferenced are freed, except for the list's first
+ * and last entries, which are kept.
+ */
+static void gfx10_release_query_buffers(struct si_context *sctx,
+					struct gfx10_sh_query_buffer *first,
+					struct gfx10_sh_query_buffer *last)
+{
+	while (first) {
+		struct gfx10_sh_query_buffer *qbuf = first;
+		if (first != last)
+			first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
+		else
+			first = NULL;
+
+		qbuf->refcount--;
+		if (qbuf->refcount)
+			continue;
+
+		if (qbuf->list.next == &sctx->shader_query_buffers)
+			continue; /* keep the most recent buffer; it may not be full yet */
+		if (qbuf->list.prev == &sctx->shader_query_buffers)
+			continue; /* keep the oldest buffer for recycling */
+
+		LIST_DEL(&qbuf->list);
+		si_resource_reference(&qbuf->buf, NULL);
+		FREE(qbuf);
+	}
+}
+
+/* Make sure the most recent query buffer has room for one more entry, bind
+ * that entry as the GFX10_GS_QUERY_BUF shader buffer and mark the
+ * shader_query atom dirty.
+ *
+ * When the most recent buffer is full, the oldest buffer is recycled if it is
+ * unreferenced and idle; otherwise a new buffer is allocated.
+ *
+ * Returns false if a buffer could not be allocated or mapped.
+ */
+static bool gfx10_alloc_query_buffer(struct si_context *sctx)
+{
+	/* A dirty atom means an entry is already bound and still unconsumed. */
+	if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
+		return true;
+
+	struct gfx10_sh_query_buffer *qbuf = NULL;
+
+	if (!LIST_IS_EMPTY(&sctx->shader_query_buffers)) {
+		qbuf = list_last_entry(&sctx->shader_query_buffers,
+				       struct gfx10_sh_query_buffer, list);
+		if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
+			goto success;
+
+		qbuf = list_first_entry(&sctx->shader_query_buffers,
+					struct gfx10_sh_query_buffer, list);
+		if (!qbuf->refcount &&
+		    !si_rings_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
+		    sctx->ws->buffer_wait(qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
+			/* Can immediately re-use the oldest buffer */
+			LIST_DEL(&qbuf->list);
+		} else {
+			qbuf = NULL;
+		}
+	}
+
+	if (!qbuf) {
+		qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer);
+		if (unlikely(!qbuf))
+			return false;
+
+		struct si_screen *screen = sctx->screen;
+		unsigned buf_size = MAX2(sizeof(struct gfx10_sh_query_buffer_mem),
+					 screen->info.min_alloc_size);
+		qbuf->buf = si_resource(
+			pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
+		if (unlikely(!qbuf->buf)) {
+			FREE(qbuf);
+			return false;
+		}
+	}
+
+	/* The buffer is currently unused by the GPU. Initialize it.
+	 *
+	 * We need to set the high bit of all the primitive counters for
+	 * compatibility with the SET_PREDICATION packet.
+	 */
+	uint64_t *results = sctx->ws->buffer_map(qbuf->buf->buf, NULL,
+						 PIPE_TRANSFER_WRITE |
+						 PIPE_TRANSFER_UNSYNCHRONIZED);
+	if (unlikely(!results)) {
+		/* qbuf is not on the list here (new or just unlinked), so it
+		 * has to be destroyed rather than leaked.
+		 */
+		si_resource_reference(&qbuf->buf, NULL);
+		FREE(qbuf);
+		return false;
+	}
+
+	/* Each entry is 32 qwords: 16 counters, then the qword holding the fence. */
+	for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem);
+	     i < e; ++i) {
+		for (unsigned j = 0; j < 16; ++j)
+			results[32 * i + j] = (uint64_t)1 << 63;
+		results[32 * i + 16] = 0;
+	}
+
+	LIST_ADDTAIL(&qbuf->list, &sctx->shader_query_buffers);
+	qbuf->head = 0;
+	qbuf->refcount = sctx->num_active_shader_queries;
+
+success:;
+	struct pipe_shader_buffer sbuf;
+	sbuf.buffer = &qbuf->buf->b.b;
+	sbuf.buffer_offset = qbuf->head;
+	sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem);
+	si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, &sbuf);
+
+	si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
+	return true;
+}
+
+/* Release the query's buffer references and free the query object. */
+static void gfx10_sh_query_destroy(struct si_context *sctx, struct si_query *rquery)
+{
+	struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+	gfx10_release_query_buffers(sctx, query->first, query->last);
+	FREE(query);
+}
+
+/* Start the query at the current head of the most recent query buffer.
+ * Returns false (leaving query->first NULL) if no buffer could be set up.
+ */
+static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquery)
+{
+	struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+
+	gfx10_release_query_buffers(sctx, query->first, query->last);
+	query->first = query->last = NULL;
+
+	if (unlikely(!gfx10_alloc_query_buffer(sctx)))
+		return false;
+
+	query->first = list_last_entry(&sctx->shader_query_buffers,
+				       struct gfx10_sh_query_buffer, list);
+	query->first_begin = query->first->head;
+
+	sctx->num_active_shader_queries++;
+	query->first->refcount++;
+
+	return true;
+}
+
+/* Stop the query at the current head of the most recent query buffer and
+ * emit a bottom-of-pipe write of the fence of the last entry it covers.
+ */
+static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery)
+{
+	struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+
+	if (unlikely(!query->first))
+		return false; /* earlier out of memory error */
+
+	query->last = list_last_entry(&sctx->shader_query_buffers,
+				      struct gfx10_sh_query_buffer, list);
+	query->last_end = query->last->head;
+
+	/* Signal the fence of the previous chunk */
+	if (query->last_end != 0) {
+		uint64_t fence_va = query->last->buf->gpu_address;
+		fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem);
+		fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
+		si_cp_release_mem(sctx, sctx->gfx_cs,
+				  V_028A90_BOTTOM_OF_PIPE_TS, 0,
+				  EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
+				  EOP_DATA_SEL_VALUE_32BIT,
+				  query->last->buf, fence_va, 0xffffffff,
+				  PIPE_QUERY_GPU_FINISHED);
+	}
+
+	sctx->num_active_shader_queries--;
+
+	if (sctx->num_active_shader_queries > 0) {
+		gfx10_alloc_query_buffer(sctx);
+	} else {
+		si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, NULL);
+
+		/* If a query_begin is followed by a query_end without a draw
+		 * in-between, we need to clear the atom to ensure that the
+		 * next query_begin will re-initialize the shader buffer.
+		 */
+		si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
+	}
+
+	return true;
+}
+
+/* Accumulate one query buffer entry into "result" according to the query
+ * type. Summed counters are masked to drop the always-set high bit.
+ */
+static void gfx10_sh_query_add_result(struct gfx10_sh_query *query,
+				      struct gfx10_sh_query_buffer_mem *qmem,
+				      union pipe_query_result *result)
+{
+	static const uint64_t mask = ((uint64_t)1 << 63) - 1;
+
+	switch (query->b.type) {
+	case PIPE_QUERY_PRIMITIVES_EMITTED:
+		result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
+		break;
+	case PIPE_QUERY_PRIMITIVES_GENERATED:
+		result->u64 += qmem->stream[query->stream].generated_primitives & mask;
+		break;
+	case PIPE_QUERY_SO_STATISTICS:
+		result->so_statistics.num_primitives_written +=
+			qmem->stream[query->stream].emitted_primitives & mask;
+		result->so_statistics.primitives_storage_needed +=
+			qmem->stream[query->stream].generated_primitives & mask;
+		break;
+	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+		result->b |= qmem->stream[query->stream].emitted_primitives !=
+			     qmem->stream[query->stream].generated_primitives;
+		break;
+	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+		/* Any stream overflowing counts, so check each one. */
+		for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
+			result->b |= qmem->stream[stream].emitted_primitives !=
+				     qmem->stream[stream].generated_primitives;
+		}
+		break;
+	default:
+		assert(0);
+	}
+}
+
+/* Read the query result back on the CPU by accumulating every entry the
+ * query covers. Returns false if the query never started or if a buffer
+ * could not be mapped (PIPE_TRANSFER_DONTBLOCK is used unless "wait" is set).
+ */
+static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery,
+				      bool wait, union pipe_query_result *result)
+{
+	struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+
+	util_query_clear_result(result, query->b.type);
+
+	if (unlikely(!query->first))
+		return false; /* earlier out of memory error */
+	assert(query->last);
+
+	/* Walk the buffers backwards, from the last one to the first one. */
+	for (struct gfx10_sh_query_buffer *qbuf = query->last;;
+	     qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list)) {
+		unsigned usage = PIPE_TRANSFER_READ |
+				 (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
+		void *map;
+
+		if (rquery->b.flushed)
+			map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
+		else
+			map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);
+
+		if (!map)
+			return false;
+
+		unsigned results_begin = 0;
+		unsigned results_end = qbuf->head;
+		if (qbuf == query->first)
+			results_begin = query->first_begin;
+		if (qbuf == query->last)
+			results_end = query->last_end;
+
+		while (results_begin != results_end) {
+			/* results_begin is a byte offset into the mapping. */
+			struct gfx10_sh_query_buffer_mem *qmem =
+				(struct gfx10_sh_query_buffer_mem *)((char *)map + results_begin);
+			results_begin += sizeof(*qmem);
+
+			gfx10_sh_query_add_result(query, qmem, result);
+		}
+
+		if (qbuf == query->first)
+			break;
+	}
+
+	return true;
+}
+
+/* Compute the query result on the GPU and write it to "resource" at
+ * "offset", by launching the sh_query_result compute shader once for every
+ * query buffer the query covers. When the query spans several buffers,
+ * partial results are chained through a temporary summary buffer.
+ *
+ * A negative "index" requests result availability instead of a value.
+ */
+static void gfx10_sh_query_get_result_resource(struct si_context *sctx,
+					       struct si_query *rquery,
+					       bool wait,
+					       enum pipe_query_value_type result_type,
+					       int index,
+					       struct pipe_resource *resource,
+					       unsigned offset)
+{
+	struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
+	struct si_qbo_state saved_state = {};
+	struct pipe_resource *tmp_buffer = NULL;
+	unsigned tmp_buffer_offset = 0;
+
+	if (!sctx->sh_query_result_shader) {
+		sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx);
+		if (!sctx->sh_query_result_shader)
+			return;
+	}
+
+	if (query->first != query->last) {
+		u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16,
+				     &tmp_buffer_offset, &tmp_buffer);
+		if (!tmp_buffer)
+			return;
+	}
+
+	si_save_qbo_state(sctx, &saved_state);
+
+	/* Pre-fill the constants configuring the shader behavior. */
+	struct {
+		uint32_t config;
+		uint32_t offset;
+		uint32_t chain;
+		uint32_t result_count;
+	} consts;
+	struct pipe_constant_buffer constant_buffer = {};
+
+	/* Byte offsets of the two counters within one stream record. The
+	 * offsets passed to the shader must follow the layout of
+	 * struct gfx10_sh_query_buffer_mem.
+	 */
+	const unsigned stream_offset = gfx10_sh_query_stream_offset(query->stream);
+	const unsigned generated_offset =
+		offsetof(struct gfx10_sh_query_buffer_mem, stream[0].generated_primitives);
+	const unsigned emitted_offset =
+		offsetof(struct gfx10_sh_query_buffer_mem, stream[0].emitted_primitives);
+
+	if (index >= 0) {
+		switch (query->b.type) {
+		case PIPE_QUERY_PRIMITIVES_GENERATED:
+			consts.offset = stream_offset + generated_offset;
+			consts.config = 0;
+			break;
+		case PIPE_QUERY_PRIMITIVES_EMITTED:
+			consts.offset = stream_offset + emitted_offset;
+			consts.config = 0;
+			break;
+		case PIPE_QUERY_SO_STATISTICS:
+			/* NOTE(review): assumes "index" follows the member order
+			 * of struct pipe_query_data_so_statistics, i.e. index 0
+			 * is num_primitives_written -- confirm.
+			 */
+			consts.offset = stream_offset +
+					(index == 0 ? emitted_offset : generated_offset);
+			consts.config = 0;
+			break;
+		case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+			/* The overflow modes take the offset of the stream record. */
+			consts.offset = stream_offset;
+			consts.config = 2;
+			break;
+		case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+			consts.offset = 0;
+			consts.config = 3;
+			break;
+		default: unreachable("bad query type");
+		}
+	} else {
+		/* Check result availability. */
+		consts.offset = 0;
+		consts.config = 1;
+	}
+
+	if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64)
+		consts.config |= 8;
+
+	constant_buffer.buffer_size = sizeof(consts);
+	constant_buffer.user_buffer = &consts;
+
+	/* Pre-fill the SSBOs and grid. */
+	struct pipe_shader_buffer ssbo[3];
+	struct pipe_grid_info grid = {};
+
+	ssbo[1].buffer = tmp_buffer;
+	ssbo[1].buffer_offset = tmp_buffer_offset;
+	ssbo[1].buffer_size = 16;
+
+	ssbo[2] = ssbo[1];
+
+	sctx->b.bind_compute_state(&sctx->b, sctx->sh_query_result_shader);
+
+	grid.block[0] = 1;
+	grid.block[1] = 1;
+	grid.block[2] = 1;
+	grid.grid[0] = 1;
+	grid.grid[1] = 1;
+	grid.grid[2] = 1;
+
+	struct gfx10_sh_query_buffer *qbuf = query->first;
+	for (;;) {
+		unsigned begin = qbuf == query->first ? query->first_begin : 0;
+		unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
+
+		/* An empty range (end == begin, possibly with end == 0) is
+		 * still dispatched, with result_count == 0, so that the
+		 * chained summary reaches the final write to "resource" and
+		 * the loop always advances to the next buffer.
+		 */
+		ssbo[0].buffer = &qbuf->buf->b.b;
+		ssbo[0].buffer_offset = begin;
+		ssbo[0].buffer_size = end - begin;
+
+		consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
+		consts.chain = 0;
+		if (qbuf != query->first)
+			consts.chain |= 1;
+		if (qbuf != query->last)
+			consts.chain |= 2;
+
+		if (qbuf == query->last) {
+			ssbo[2].buffer = resource;
+			ssbo[2].buffer_offset = offset;
+			ssbo[2].buffer_size = 8;
+		}
+
+		sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
+		sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 0x6);
+
+		/* With end == 0 there is no entry in this buffer to wait for. */
+		if (wait && end) {
+			uint64_t va;
+
+			/* Wait for result availability. Wait only for readiness
+			 * of the last entry, since the fence writes should be
+			 * serialized in the CP.
+			 */
+			va = qbuf->buf->gpu_address;
+			va += end - sizeof(struct gfx10_sh_query_buffer_mem);
+			va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
+
+			si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
+		}
+
+		sctx->b.launch_grid(&sctx->b, &grid);
+		sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+
+		if (qbuf == query->last)
+			break;
+		qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
+	}
+
+	si_restore_qbo_state(sctx, &saved_state);
+	pipe_resource_reference(&tmp_buffer, NULL);
+}
+
+static const struct si_query_ops gfx10_sh_query_ops = {
+	.destroy = gfx10_sh_query_destroy,
+	.begin = gfx10_sh_query_begin,
+	.end = gfx10_sh_query_end,
+	.get_result = gfx10_sh_query_get_result,
+	.get_result_resource = gfx10_sh_query_get_result_resource,
+};
+
+/* Create a shader-based query of the given type; "index" selects the
+ * vertex stream. Returns NULL on allocation failure.
+ */
+struct pipe_query *gfx10_sh_query_create(struct si_screen *screen,
+					 enum pipe_query_type query_type,
+					 unsigned index)
+{
+	struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query);
+	if (unlikely(!query))
+		return NULL;
+
+	query->b.ops = &gfx10_sh_query_ops;
+	query->b.type = query_type;
+	query->stream = index;
+
+	return (struct pipe_query *)query;
+}
+
+void gfx10_init_query(struct si_context *sctx)
+{
+	LIST_INITHEAD(&sctx->shader_query_buffers); /* the buffer ring starts out empty */
+	sctx->atoms.s.shader_query.emit = emit_shader_query;
+}
+
+void gfx10_destroy_query(struct si_context *sctx)
+{
+	while (!LIST_IS_EMPTY(&sctx->shader_query_buffers)) {
+		struct gfx10_sh_query_buffer *qbuf =
+			list_first_entry(&sctx->shader_query_buffers,
+					 struct gfx10_sh_query_buffer, list);
+		LIST_DEL(&qbuf->list);
+
+		assert(!qbuf->refcount); /* no query may still reference the buffer */
+		si_resource_reference(&qbuf->buf, NULL);
+		FREE(qbuf);
+	}
+}
diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
index 87ca56b1fdf..c97d9009164 100644
--- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
+++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
@@ -64,6 +64,15 @@ static LLVMValueRef ngg_get_prim_cnt(struct si_shader_context *ctx)
 			      false);
 }
 
+static LLVMValueRef ngg_get_query_buf(struct si_shader_context *ctx)
+{
+	LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
+					    ctx->param_rw_buffers);
+
+	return ac_build_load_to_sgpr(&ctx->ac, buf_ptr,
+			LLVMConstInt(ctx->i32, GFX10_GS_QUERY_BUF, false));
+}
+
 /* Send GS Alloc Req message from the first wave of the group to SPI.
  * Message payload is:
  * - bits 0..10: vertices in group
@@ -209,6 +218,27 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi,
 
 	build_sendmsg_gs_alloc_req(ctx, ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx));
 
+	/* Update query buffer */
+	tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
+	ac_build_ifcc(&ctx->ac, tmp, 5030);
+	tmp = LLVMBuildICmp(builder, LLVMIntULE, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, "");
+	ac_build_ifcc(&ctx->ac, tmp, 5031);
+	{
+		LLVMValueRef args[] = {
+			ngg_get_prim_cnt(ctx),
+			ngg_get_query_buf(ctx),
+			LLVMConstInt(ctx->i32, 16, false), /* offset of stream[0].generated_primitives */
+			ctx->i32_0, /* soffset */
+			ctx->i32_0, /* cachepolicy */
+		};
+
+		/* TODO: should this be 64-bit atomics? */
+		ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32",
+				   ctx->i32, args, 5, 0);
+	}
+	ac_build_endif(&ctx->ac, 5031);
+	ac_build_endif(&ctx->ac, 5030);
+
 	/* Export primitive data to the index buffer. Format is:
 	 *  - bits 0..8: index 0
 	 *  - bit 9: edge flag 0
@@ -431,9 +461,34 @@ void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx,
 	tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, "");
 	LLVMBuildStore(builder, tmp, primflagptr);
 
+	tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
+	tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");
+	LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);
+
 	lp_build_endif(&if_state);
 }
 
+void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx)
+{
+	/* Zero out the part of LDS scratch that is used to accumulate the
+	 * per-stream generated primitive count.
+	 */
+	LLVMBuilderRef builder = ctx->ac.builder;
+	LLVMValueRef scratchptr = ctx->gs_ngg_scratch;
+	LLVMValueRef tid = get_thread_id_in_tg(ctx);
+	LLVMValueRef tmp;
+
+	tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->i32, 4, false), "");
+	ac_build_ifcc(&ctx->ac, tmp, 5090);
+	{
+		LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid);
+		LLVMBuildStore(builder, ctx->i32_0, ptr);
+	}
+	ac_build_endif(&ctx->ac, 5090);
+
+	ac_build_s_barrier(&ctx->ac);
+}
+
 void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
 {
 	const struct si_shader_selector *sel = ctx->shader->selector;
@@ -481,6 +536,26 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
 		ac_build_endloop(&ctx->ac, 5100);
 	}
 
+	/* Accumulate generated primitives counts across the entire threadgroup. */
+	for (unsigned stream = 0; stream < 4; ++stream) {
+		if (!info->num_stream_output_components[stream])
+			continue;
+
+		LLVMValueRef numprims =
+			LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
+		numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, 64);
+
+		tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->i32_0, "");
+		ac_build_ifcc(&ctx->ac, tmp, 5105);
+		{
+			LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd,
+					   ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch,
+							 LLVMConstInt(ctx->i32, stream, false)),
+					   numprims, LLVMAtomicOrderingMonotonic, false);
+		}
+		ac_build_endif(&ctx->ac, 5105);
+	}
+
 	lp_build_endif(&ctx->merged_wrap_if_state);
 
 	ac_build_s_barrier(&ctx->ac);
@@ -490,6 +565,33 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
 
 	/* TODO: streamout */
 
+	tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->i32, 4, false), "");
+	ac_build_ifcc(&ctx->ac, tmp, 5110);
+	{
+		LLVMValueRef offset;
+		tmp = tid;
+		if (sel->so.num_outputs)
+			tmp = LLVMBuildAnd(builder, tmp, LLVMConstInt(ctx->i32, 3, false), "");
+		offset = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->i32, 32, false), "");
+		if (sel->so.num_outputs) {
+			tmp = LLVMBuildLShr(builder, tid, LLVMConstInt(ctx->i32, 2, false), "");
+			tmp = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->i32, 8, false), "");
+			offset = LLVMBuildAdd(builder, offset, tmp, "");
+		}
+
+		tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), "");
+		LLVMValueRef args[] = {
+			tmp,
+			ngg_get_query_buf(ctx),
+			offset,
+			LLVMConstInt(ctx->i32, 16, false), /* soffset */
+			ctx->i32_0, /* cachepolicy */
+		};
+		ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32",
+				   ctx->i32, args, 5, 0);
+	}
+	ac_build_endif(&ctx->ac, 5110);
+
 	/* TODO: culling */
 
 	/* Determine vertex liveness. */
diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build
index 0ca065f34e0..a362c207776 100644
--- a/src/gallium/drivers/radeonsi/meson.build
+++ b/src/gallium/drivers/radeonsi/meson.build
@@ -21,6 +21,7 @@
 files_libradeonsi = files(
   'cik_sdma.c',
   'driinfo_radeonsi.h',
+  'gfx10_query.c',
   'gfx10_shader_ngg.c',
   'si_blit.c',
   'si_buffer.c',
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index c2cee024982..91b474d4d8f 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -168,6 +168,9 @@ static void si_destroy_context(struct pipe_context *context)
 
 	si_release_all_descriptors(sctx);
 
+	if (sctx->chip_class >= GFX10 && sctx->has_graphics)
+		gfx10_destroy_query(sctx); /* the list is only initialized for graphics contexts */
+
 	pipe_resource_reference(&sctx->esgs_ring, NULL);
 	pipe_resource_reference(&sctx->gsvs_ring, NULL);
 	pipe_resource_reference(&sctx->tess_rings, NULL);
@@ -239,6 +242,8 @@ static void si_destroy_context(struct pipe_context *context)
 
 	if (sctx->query_result_shader)
 		sctx->b.delete_compute_state(&sctx->b, sctx->query_result_shader);
+	if (sctx->sh_query_result_shader)
+		sctx->b.delete_compute_state(&sctx->b, sctx->sh_query_result_shader);
 
 	if (sctx->gfx_cs)
 		sctx->ws->cs_destroy(sctx->gfx_cs);
@@ -516,6 +521,8 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 	/* Initialize graphics-only context functions. */
 	if (sctx->has_graphics) {
 		si_init_context_texture_functions(sctx);
+		if (sctx->chip_class >= GFX10)
+			gfx10_init_query(sctx);
 		si_init_msaa_functions(sctx);
 		si_init_shader_functions(sctx);
 		si_init_state_functions(sctx);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index a351e5004b1..874b1bf4cd0 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -868,6 +868,7 @@ struct si_context {
 	struct pipe_device_reset_callback device_reset_callback;
 	struct u_log_context		*log;
 	void				*query_result_shader;
+	void				*sh_query_result_shader;
 
 	void (*emit_cache_flush)(struct si_context *ctx);
 
@@ -1178,6 +1179,10 @@ struct si_context {
 	unsigned			num_sdma_uploads;
 	unsigned			max_sdma_uploads;
 
+	/* Shader-based queries. */
+	struct list_head		shader_query_buffers;
+	unsigned			num_active_shader_queries;
+
 	/* Statistics gathering for the DCC enablement heuristic. It can't be
 	 * in si_texture because si_texture can be shared by multiple
 	 * contexts. This is for back buffers only. We shouldn't get too many
@@ -1439,6 +1444,11 @@ void *si_clear_render_target_shader(struct pipe_context *ctx);
 void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx);
 void *si_create_dcc_retile_cs(struct pipe_context *ctx);
 void *si_create_query_result_cs(struct si_context *sctx);
+void *gfx10_create_sh_query_result_cs(struct si_context *sctx);
+
+/* gfx10_query.c */
+void gfx10_init_query(struct si_context *sctx);
+void gfx10_destroy_query(struct si_context *sctx);
 
 /* si_test_dma.c */
 void si_test_dma(struct si_screen *sscreen);
diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c
index ae6498e1895..394bf7ff124 100644
--- a/src/gallium/drivers/radeonsi/si_query.c
+++ b/src/gallium/drivers/radeonsi/si_query.c
@@ -32,8 +32,6 @@
 #include "util/u_suballoc.h"
 #include "amd/common/sid.h"
 
-#define SI_MAX_STREAMS 4
-
 static const struct si_query_ops query_hw_ops;
 
 struct si_hw_query_params {
@@ -1015,6 +1013,12 @@ static void si_emit_query_predication(struct si_context *ctx)
 	if (!query)
 		return;
 
+	if (ctx->chip_class >= GFX10 &&
+	    (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+	     query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) {
+		assert(!"not implemented");
+	}
+
 	invert = ctx->render_cond_invert;
 	flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
 		    ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
@@ -1096,6 +1100,14 @@ static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned que
 	     query_type != SI_QUERY_TIME_ELAPSED_SDMA))
 		return si_query_sw_create(query_type);
 
+	if (sscreen->info.chip_class >= GFX10 &&
+	    (query_type == PIPE_QUERY_PRIMITIVES_EMITTED ||
+	     query_type == PIPE_QUERY_PRIMITIVES_GENERATED ||
+	     query_type == PIPE_QUERY_SO_STATISTICS ||
+	     query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+	     query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE))
+		return gfx10_sh_query_create(sscreen, query_type, index);
+
 	return si_query_hw_create(sscreen, query_type, index);
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_query.h b/src/gallium/drivers/radeonsi/si_query.h
index 82e5e25ed00..dc219f8551c 100644
--- a/src/gallium/drivers/radeonsi/si_query.h
+++ b/src/gallium/drivers/radeonsi/si_query.h
@@ -38,6 +38,8 @@ struct si_query_buffer;
 struct si_query_hw;
 struct si_resource;
 
+#define SI_MAX_STREAMS 4
+
 enum {
 	SI_QUERY_DRAW_CALLS = PIPE_QUERY_DRIVER_SPECIFIC,
 	SI_QUERY_DECOMPRESS_CALLS,
@@ -228,6 +230,12 @@ void si_query_hw_suspend(struct si_context *sctx, struct si_query *query);
 void si_query_hw_resume(struct si_context *sctx, struct si_query *query);
 
 
+/* Shader-based queries */
+struct pipe_query *gfx10_sh_query_create(struct si_screen *screen,
+					 enum pipe_query_type query_type,
+					 unsigned index);
+
+
 /* Performance counters */
 struct si_perfcounters {
 	unsigned num_groups;
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 9abecdf1003..68506b7a92c 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -6082,6 +6082,8 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx)
 			for (unsigned i = 0; i < 4; ++i) {
 				ctx->gs_curprim_verts[i] =
 					lp_build_alloca(&ctx->gallivm, ctx->ac.i32, "");
+				ctx->gs_generated_prims[i] =
+					lp_build_alloca(&ctx->gallivm, ctx->ac.i32, "");
 			}
 
 			LLVMTypeRef a8i32 = LLVMArrayType(ctx->i32, 8);
@@ -6135,9 +6137,15 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx)
 
 		if (ctx->type == PIPE_SHADER_TESS_CTRL ||
 		    ctx->type == PIPE_SHADER_GEOMETRY) {
+			if (ctx->type == PIPE_SHADER_GEOMETRY && shader->key.as_ngg) {
+				gfx10_ngg_gs_emit_prologue(ctx);
+				nested_barrier = false;
+			} else {
+				nested_barrier = true;
+			}
+
 			/* Number of patches / primitives */
 			num_threads = si_unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
-			nested_barrier = true;
 		} else {
 			/* Number of vertices */
 			num_threads = si_unpack_param(ctx, ctx->param_merged_wave_info, 0, 8);
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h
index 09efc91b9f5..7832f75ef65 100644
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -214,6 +214,7 @@ struct si_shader_context {
 	LLVMValueRef invoc0_tess_factors[6]; /* outer[4], inner[2] */
 	LLVMValueRef gs_next_vertex[4];
 	LLVMValueRef gs_curprim_verts[4];
+	LLVMValueRef gs_generated_prims[4];
 	LLVMValueRef gs_ngg_emit;
 	LLVMValueRef gs_ngg_scratch;
 	LLVMValueRef postponed_kill;
@@ -388,6 +389,7 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi,
 void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx,
 			      unsigned stream,
 			      LLVMValueRef *addrs);
+void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx);
 void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx);
 void gfx10_ngg_calculate_subgroup_info(struct si_shader *shader);
 
diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
index b68fd2ff236..9f2f9d30216 100644
--- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
@@ -660,3 +660,233 @@ void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx)
 
 	return ctx->create_compute_state(ctx, &state);
 }
+
+/* Create the compute shader that is used to collect the results of gfx10+
+ * shader queries.
+ *
+ * One compute grid with a single thread is launched for every query result
+ * buffer. The thread (optionally) reads a previous summary buffer, then
+ * accumulates data from the query result buffer, and writes the result either
+ * to a summary buffer to be consumed by the next grid invocation or to the
+ * user-supplied buffer.
+ *
+ * The byte offsets hard-coded in the shader text follow
+ * struct gfx10_sh_query_buffer_mem: a stream record is 32 bytes with
+ * generated_primitives at +16 and emitted_primitives at +24, the fence is at
+ * +128 and an entry is 256 bytes. Only the low dword of each counter is read.
+ *
+ * Data layout:
+ *
+ * BUFFER[0] = query result buffer (layout is defined by gfx10_sh_query_buffer_mem)
+ * BUFFER[1] = previous summary buffer
+ * BUFFER[2] = next summary buffer or user-supplied buffer
+ *
+ * CONST
+ *  0.x = config; the low 3 bits indicate the mode:
+ *          0: sum up counts
+ *          1: determine result availability and write it as a boolean
+ *          2: SO_OVERFLOW
+ *          3: SO_ANY_OVERFLOW
+ *        the remaining bits form a bitfield:
+ *          8: write result as a 64-bit value
+ *  0.y = offset in bytes to counts or stream for SO_OVERFLOW mode
+ *  0.z = chain bit field:
+ *          1: have previous summary buffer
+ *          2: write next summary buffer
+ *  0.w = result_count
+ */
+void *gfx10_create_sh_query_result_cs(struct si_context *sctx)
+{
+	/* TEMP[0].x = accumulated result so far
+	 * TEMP[0].y = result missing
+	 * TEMP[0].z = whether we're in overflow mode
+	 */
+	static const char text_tmpl[] =
+		"COMP\n"
+		"PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
+		"PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
+		"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+		"DCL BUFFER[0]\n"
+		"DCL BUFFER[1]\n"
+		"DCL BUFFER[2]\n"
+		"DCL CONST[0][0..0]\n"
+		"DCL TEMP[0..5]\n"
+		"IMM[0] UINT32 {0, 7, 256, 4294967295}\n"
+		"IMM[1] UINT32 {1, 2, 4, 8}\n"
+		"IMM[2] UINT32 {16, 24, 32, 128}\n"
+
+		/*
+		acc_result = 0;
+		acc_missing = 0;
+		if (chain & 1) {
+			acc_result = buffer[1][0];
+			acc_missing = buffer[1][1];
+		}
+		*/
+		"MOV TEMP[0].xy, IMM[0].xxxx\n"
+		"AND TEMP[5], CONST[0][0].zzzz, IMM[1].xxxx\n"
+		"UIF TEMP[5]\n"
+		"LOAD TEMP[0].xy, BUFFER[1], IMM[0].xxxx\n"
+		"ENDIF\n"
+
+		/*
+		is_overflow (TEMP[0].z) = (config & 7) >= 2;
+		result_remaining (TEMP[1].x) = (is_overflow && acc_result) ? 0 : result_count;
+		base_offset (TEMP[1].y) = 0;
+		for (;;) {
+			if (!result_remaining)
+				break;
+			result_remaining--;
+		*/
+		"AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
+		"USGE TEMP[0].z, TEMP[5].xxxx, IMM[1].yyyy\n"
+
+		"AND TEMP[5].x, TEMP[0].zzzz, TEMP[0].xxxx\n"
+		"UCMP TEMP[1].x, TEMP[5].xxxx, IMM[0].xxxx, CONST[0][0].wwww\n"
+		"MOV TEMP[1].y, IMM[0].xxxx\n"
+
+		"BGNLOOP\n"
+			"USEQ TEMP[5], TEMP[1].xxxx, IMM[0].xxxx\n"
+			"UIF TEMP[5]\n"
+				"BRK\n"
+			"ENDIF\n"
+			"UADD TEMP[1].x, TEMP[1].xxxx, IMM[0].wwww\n"
+
+			/*
+			fence = buffer[0]@(base_offset + 128);
+			if (!fence) {
+				acc_missing = ~0u;
+				break;
+			}
+			*/
+			"UADD TEMP[5].x, TEMP[1].yyyy, IMM[2].wwww\n"
+			"LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
+			"USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n"
+			"UIF TEMP[5]\n"
+				"MOV TEMP[0].y, TEMP[5].xxxx\n"
+				"BRK\n"
+			"ENDIF\n"
+
+			/*
+			stream_offset (TEMP[2].x) = base_offset + offset;
+
+			if (!(config & 7)) {
+				acc_result += buffer[0]@stream_offset;
+			}
+			*/
+			"UADD TEMP[2].x, TEMP[1].yyyy, CONST[0][0].yyyy\n"
+
+			"AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
+			"USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n"
+			"UIF TEMP[5]\n"
+				"LOAD TEMP[5].x, BUFFER[0], TEMP[2].xxxx\n"
+				"UADD TEMP[0].x, TEMP[0].xxxx, TEMP[5].xxxx\n"
+			"ENDIF\n"
+
+			/*
+			if ((config & 7) >= 2) {
+				count (TEMP[2].y) = (config & 1) ? 4 : 1;
+			*/
+			"AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
+			"USGE TEMP[5], TEMP[5].xxxx, IMM[1].yyyy\n"
+			"UIF TEMP[5]\n"
+				"AND TEMP[5].x, CONST[0][0].xxxx, IMM[1].xxxx\n"
+				"UCMP TEMP[2].y, TEMP[5].xxxx, IMM[1].zzzz, IMM[1].xxxx\n"
+
+				/*
+				do {
+					generated = buffer[0]@(stream_offset + 16);
+					emitted = buffer[0]@(stream_offset + 24);
+					if (generated != emitted) {
+						acc_result = 1;
+						result_remaining = 0;
+						break;
+					}
+
+					stream_offset += 32;
+				} while (--count);
+				*/
+				"BGNLOOP\n"
+					"UADD TEMP[5].xy, TEMP[2].xxxx, IMM[2].xyxy\n"
+					"LOAD TEMP[4].x, BUFFER[0], TEMP[5].xxxx\n"
+					"LOAD TEMP[4].y, BUFFER[0], TEMP[5].yyyy\n"
+					"USNE TEMP[5], TEMP[4].xxxx, TEMP[4].yyyy\n"
+					"UIF TEMP[5]\n"
+						"MOV TEMP[0].x, IMM[1].xxxx\n"
+						"MOV TEMP[1].x, IMM[0].xxxx\n"
+						"BRK\n"
+					"ENDIF\n"
+
+					"UADD TEMP[2].y, TEMP[2].yyyy, IMM[0].wwww\n"
+					"USEQ TEMP[5], TEMP[2].yyyy, IMM[0].xxxx\n"
+					"UIF TEMP[5]\n"
+						"BRK\n"
+					"ENDIF\n"
+					"UADD TEMP[2].x, TEMP[2].xxxx, IMM[2].zzzz\n"
+				"ENDLOOP\n"
+			"ENDIF\n"
+
+			/*
+				base_offset += 256;
+			} // end outer loop
+			*/
+			"UADD TEMP[1].y, TEMP[1].yyyy, IMM[0].zzzz\n"
+		"ENDLOOP\n"
+
+		/*
+		if (chain & 2) {
+			buffer[2][0] = acc_result;
+			buffer[2][1] = acc_missing;
+		} else {
+		*/
+		"AND TEMP[5], CONST[0][0].zzzz, IMM[1].yyyy\n"
+		"UIF TEMP[5]\n"
+			"STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0]\n"
+		"ELSE\n"
+
+			/*
+			if ((config & 7) == 1) {
+				acc_result = acc_missing ? 0 : 1;
+				acc_missing = 0;
+			}
+			*/
+			"AND TEMP[5], CONST[0][0].xxxx, IMM[0].yyyy\n"
+			"USEQ TEMP[5], TEMP[5].xxxx, IMM[1].xxxx\n"
+			"UIF TEMP[5]\n"
+				"UCMP TEMP[0].x, TEMP[0].yyyy, IMM[0].xxxx, IMM[1].xxxx\n"
+				"MOV TEMP[0].y, IMM[0].xxxx\n"
+			"ENDIF\n"
+
+			/*
+			if (!acc_missing) {
+				buffer[2][0] = acc_result;
+				if (config & 8)
+					buffer[2][1] = 0;
+			}
+			*/
+			"USEQ TEMP[5], TEMP[0].yyyy, IMM[0].xxxx\n"
+			"UIF TEMP[5]\n"
+				"STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
+
+				"AND TEMP[5], CONST[0][0].xxxx, IMM[1].wwww\n"
+				"UIF TEMP[5]\n"
+					"STORE BUFFER[2].x, IMM[1].zzzz, TEMP[0].yyyy\n"
+				"ENDIF\n"
+			"ENDIF\n"
+		"ENDIF\n"
+
+		"END\n";
+
+	struct tgsi_token tokens[1024];
+	struct pipe_compute_state state = {};
+
+	if (!tgsi_text_translate(text_tmpl, tokens, ARRAY_SIZE(tokens))) {
+		assert(false);
+		return NULL;
+	}
+
+	state.ir_type = PIPE_SHADER_IR_TGSI;
+	state.prog = tokens;
+
+	return sctx->b.create_compute_state(&sctx->b, &state);
+}
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 678f87cd73d..757dd1bf5cd 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -228,6 +228,7 @@ union si_state_atoms {
 		struct si_atom spi_map;
 		struct si_atom scratch_state;
 		struct si_atom window_rectangles;
+		struct si_atom shader_query;
 	} s;
 	struct si_atom array[0];
 };
@@ -370,6 +371,8 @@ enum {
 	SI_PS_IMAGE_COLORBUF0_FMASK,
 	SI_PS_IMAGE_COLORBUF0_FMASK_HI,
 
+	GFX10_GS_QUERY_BUF,
+
 	SI_NUM_RW_BUFFERS,
 };
 