radv/gfx10: implement NGG GS queries

The number of generated primitives is only counted by the hardware
if GS uses the legacy path. For NGG GS, we need to accumulate that
value in the NGG GS shader itself. To achieve that, we use a plain GDS
atomic operation.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3380>
Samuel Pitoiset 2020-01-13 18:30:50 +01:00
parent 3c1f657f35
commit e4752dafed
7 changed files with 155 additions and 6 deletions
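
At a high level, the mechanism works like this: the NGG GS epilogue atomically adds the number of primitives it generated into a GDS counter, vkCmdBeginQuery and vkCmdEndQuery each snapshot that counter into the query pool buffer with an EOP event, and the reported statistic is the difference between the two snapshots, as with the other pipeline statistics. A minimal sketch of that last step (hypothetical helper, not an actual radv symbol):

#include <stdint.h>

/* Illustration only: reconstruct the GS-primitives statistic from the two
 * GDS snapshots taken at vkCmdBeginQuery and vkCmdEndQuery. */
static uint64_t
ngg_gs_prims_generated(uint64_t begin_snapshot, uint64_t end_snapshot)
{
   /* The GDS counter only ever grows while the query is active, so the
    * number of primitives generated in between is the delta. */
   return end_snapshot - begin_snapshot;
}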


@@ -2574,6 +2574,35 @@ radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer)
cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER;
}
static void
radv_flush_ngg_gs_state(struct radv_cmd_buffer *cmd_buffer)
{
struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
struct radv_userdata_info *loc;
uint32_t ngg_gs_state = 0;
uint32_t base_reg;
if (!radv_pipeline_has_gs(pipeline) ||
!radv_pipeline_has_ngg(pipeline))
return;
/* By default NGG GS queries are disabled but they are enabled if the
* command buffer has active GDS queries or if it's a secondary command
* buffer that inherits the number of generated primitives.
*/
if (cmd_buffer->state.active_pipeline_gds_queries ||
(cmd_buffer->state.inherited_pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT))
ngg_gs_state = 1;
loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_GEOMETRY,
AC_UD_NGG_GS_STATE);
base_reg = pipeline->user_data_0[MESA_SHADER_GEOMETRY];
assert(loc->sgpr_idx != -1);
radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
ngg_gs_state);
}
static void
radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
{
@@ -2581,6 +2610,7 @@ radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool
radv_flush_streamout_descriptors(cmd_buffer);
radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
radv_flush_ngg_gs_state(cmd_buffer);
}
struct radv_draw_info {
@@ -3349,6 +3379,9 @@ VkResult radv_BeginCommandBuffer(
return result;
}
cmd_buffer->state.inherited_pipeline_statistics =
pBeginInfo->pInheritanceInfo->pipelineStatistics;
radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
}
@@ -4089,6 +4122,8 @@ void radv_CmdExecuteCommands(
primary->tess_rings_needed = true;
if (secondary->sample_positions_needed)
primary->sample_positions_needed = true;
if (secondary->gds_needed)
primary->gds_needed = true;
if (!secondary->state.framebuffer &&
(primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) {

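For reference, the secondary-command-buffer path that radv_flush_ngg_gs_state checks is reached when an application records a secondary that inherits the GS-primitives statistic; the inherited value is what radv_BeginCommandBuffer stores into inherited_pipeline_statistics above. A rough sketch of that application-side setup follows (the Vulkan API calls are real, the function name is hypothetical; error handling omitted and the pipelineStatisticsQuery feature is assumed to be enabled):

#include <vulkan/vulkan.h>

/* Begin a secondary command buffer that may execute inside a
 * pipeline-statistics query counting GS primitives. */
static void
begin_secondary_inside_gs_prims_query(VkCommandBuffer secondary,
                                      VkRenderPass render_pass,
                                      VkFramebuffer framebuffer)
{
   VkCommandBufferInheritanceInfo inheritance = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO,
      .renderPass = render_pass,
      .subpass = 0,
      .framebuffer = framebuffer,
      .pipelineStatistics =
         VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT,
   };
   VkCommandBufferBeginInfo begin_info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
      .flags = VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT,
      .pInheritanceInfo = &inheritance,
   };
   vkBeginCommandBuffer(secondary, &begin_info);
   /* ... record draws that use a geometry shader ... */
}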

@@ -3170,6 +3170,33 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx)
build_streamout(ctx, &nggso);
}
/* Write shader query data. */
tmp = ac_get_arg(&ctx->ac, ctx->args->ngg_gs_state);
tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
ac_build_ifcc(&ctx->ac, tmp, 5109);
tmp = LLVMBuildICmp(builder, LLVMIntULT, tid,
LLVMConstInt(ctx->ac.i32, 4, false), "");
ac_build_ifcc(&ctx->ac, tmp, 5110);
{
tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), "");
ac_llvm_add_target_dep_function_attr(ctx->main_function,
"amdgpu-gds-size", 256);
LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, "");
const char *sync_scope = LLVM_VERSION_MAJOR >= 9 ? "workgroup-one-as" : "workgroup";
/* Use a plain GDS atomic to accumulate the number of generated
* primitives.
*/
ac_build_atomic_rmw(&ctx->ac, LLVMAtomicRMWBinOpAdd, gdsbase,
tmp, sync_scope);
}
ac_build_endif(&ctx->ac, 5110);
ac_build_endif(&ctx->ac, 5109);
/* TODO: culling */
/* Determine vertex liveness. */

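The IR built above is roughly equivalent to the following CPU-side model (a sketch for illustration; that the first four scratch slots hold per-stream primitive totals is an assumption based on the tid < 4 guard and the streamout code that fills gs_ngg_scratch):

#include <stdint.h>

/* Model of the emitted epilogue: when the ngg_gs_state SGPR is set, the
 * first four lanes each add one per-stream primitive total into the GDS
 * dword at offset 0, which the query code later copies with
 * EOP_DATA_SEL_GDS. */
static void
model_ngg_gs_query_accumulation(uint32_t ngg_gs_state,
                                const uint32_t scratch_prims[4],
                                uint32_t *gds_counter)
{
   if (!(ngg_gs_state & 1))
      return;

   for (uint32_t tid = 0; tid < 4; tid++) {
      /* One GDS atomic add per lane in the real shader. */
      *gds_counter += scratch_prims[tid];
   }
}

Accumulating in GDS rather than LDS is what makes the count visible across all workgroups and lets the command processor copy it straight to memory with EOP_DATA_SEL_GDS.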

@@ -1260,6 +1260,7 @@ struct radv_cmd_state {
unsigned active_occlusion_queries;
bool perfect_occlusion_queries_enabled;
unsigned active_pipeline_queries;
unsigned active_pipeline_gds_queries;
float offset_scale;
uint32_t trace_id;
uint32_t last_ia_multi_vgt_param;
@@ -1275,6 +1276,9 @@ struct radv_cmd_state {
int predication_type; /* -1: disabled, 0: normal, 1: inverted */
uint64_t predication_va;
/* Inheritance info. */
VkQueryPipelineStatisticFlags inherited_pipeline_statistics;
bool context_roll_without_scissor_emitted;
};
@@ -1333,7 +1337,7 @@ struct radv_cmd_buffer {
uint32_t esgs_ring_size_needed;
uint32_t gsvs_ring_size_needed;
bool tess_rings_needed;
bool gds_needed; /* for GFX10 streamout */
bool gds_needed; /* for GFX10 streamout and NGG GS queries */
bool gds_oa_needed; /* for GFX10 streamout */
bool sample_positions_needed;


@@ -40,6 +40,14 @@
static const int pipelinestat_block_size = 11 * 8;
static const unsigned pipeline_statistics_indices[] = {7, 6, 3, 4, 5, 2, 1, 0, 8, 9, 10};
static unsigned
radv_get_pipeline_statistics_index(const VkQueryPipelineStatisticFlagBits flag)
{
int offset = ffs(flag) - 1;
assert(offset < ARRAY_SIZE(pipeline_statistics_indices));
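/* For example, VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT
 * is 0x10: ffs(0x10) - 1 == 4 and pipeline_statistics_indices[4] == 5, so
 * that counter lives at byte offset 8 * 5 = 40 within a sampled block. */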
return pipeline_statistics_indices[offset];
}
static nir_ssa_def *nir_test_flag(nir_builder *b, nir_ssa_def *flags, uint32_t flag)
{
return nir_i2b(b, nir_iand(b, flags, nir_imm_int(b, flag)));
@@ -1261,6 +1269,22 @@ static void radv_query_shader(struct radv_cmd_buffer *cmd_buffer,
radv_meta_restore(&saved_state, cmd_buffer);
}
static bool
radv_query_pool_needs_gds(struct radv_device *device,
struct radv_query_pool *pool)
{
/* The number of primitives generated by geometry shader invocations is
* only counted by the hardware if GS uses the legacy path. When NGG GS
* is used, the hardware can't know the number of generated primitives
and we have to increment it manually inside the shader. To achieve that, the
* driver does a plain GDS atomic to accumulate that value.
* TODO: fix use of NGG GS and non-NGG GS inside the same begin/end
* query.
*/
return device->physical_device->use_ngg &&
(pool->pipeline_stats_mask & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
}
VkResult radv_CreateQueryPool(
VkDevice _device,
const VkQueryPoolCreateInfo* pCreateInfo,
@@ -1725,6 +1749,7 @@ static unsigned event_type_for_stream(unsigned stream)
}
static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer,
struct radv_query_pool *pool,
uint64_t va,
VkQueryType query_type,
VkQueryControlFlags flags,
@@ -1776,6 +1801,30 @@ static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer,
radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
radeon_emit(cs, va);
radeon_emit(cs, va >> 32);
if (radv_query_pool_needs_gds(cmd_buffer->device, pool)) {
int idx = radv_get_pipeline_statistics_index(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
/* Make sure GDS is idle before copying the value. */
cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
RADV_CMD_FLAG_INV_L2;
si_emit_cache_flush(cmd_buffer);
va += 8 * idx;
si_cs_emit_write_event_eop(cs,
cmd_buffer->device->physical_device->rad_info.chip_class,
radv_cmd_buffer_uses_mec(cmd_buffer),
V_028A90_PS_DONE, 0,
EOP_DST_SEL_TC_L2,
EOP_DATA_SEL_GDS,
va, EOP_DATA_GDS(0, 1), 0);
/* Record that the command buffer needs GDS. */
cmd_buffer->gds_needed = true;
cmd_buffer->state.active_pipeline_gds_queries++;
}
break;
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
radeon_check_space(cmd_buffer->device->ws, cs, 4);
@@ -1794,6 +1843,7 @@ static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer,
}
static void emit_end_query(struct radv_cmd_buffer *cmd_buffer,
struct radv_query_pool *pool,
uint64_t va, uint64_t avail_va,
VkQueryType query_type, uint32_t index)
{
@@ -1841,6 +1891,27 @@ static void emit_end_query(struct radv_cmd_buffer *cmd_buffer,
EOP_DATA_SEL_VALUE_32BIT,
avail_va, 1,
cmd_buffer->gfx9_eop_bug_va);
if (radv_query_pool_needs_gds(cmd_buffer->device, pool)) {
int idx = radv_get_pipeline_statistics_index(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
/* Make sure GDS is idle before copying the value. */
cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
RADV_CMD_FLAG_INV_L2;
si_emit_cache_flush(cmd_buffer);
va += 8 * idx;
si_cs_emit_write_event_eop(cs,
cmd_buffer->device->physical_device->rad_info.chip_class,
radv_cmd_buffer_uses_mec(cmd_buffer),
V_028A90_PS_DONE, 0,
EOP_DST_SEL_TC_L2,
EOP_DATA_SEL_GDS,
va, EOP_DATA_GDS(0, 1), 0);
cmd_buffer->state.active_pipeline_gds_queries--;
}
break;
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
radeon_check_space(cmd_buffer->device->ws, cs, 4);
@@ -1884,7 +1955,7 @@ void radv_CmdBeginQueryIndexedEXT(
va += pool->stride * query;
emit_begin_query(cmd_buffer, va, pool->type, flags, index);
emit_begin_query(cmd_buffer, pool, va, pool->type, flags, index);
}
void radv_CmdBeginQuery(
@@ -1911,7 +1982,7 @@ void radv_CmdEndQueryIndexedEXT(
/* Do not need to add the pool BO to the list because the query must
* currently be active, which means the BO is already in the list.
*/
emit_end_query(cmd_buffer, va, avail_va, pool->type, index);
emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, index);
/*
* For multiview we have to emit a query for each bit in the mask,
@@ -1928,8 +1999,8 @@ void radv_CmdEndQueryIndexedEXT(
for (unsigned i = 1; i < util_bitcount(cmd_buffer->state.subpass->view_mask); i++) {
va += pool->stride;
avail_va += 4;
emit_begin_query(cmd_buffer, va, pool->type, 0, 0);
emit_end_query(cmd_buffer, va, avail_va, pool->type, 0);
emit_begin_query(cmd_buffer, pool, va, pool->type, 0, 0);
emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, 0);
}
}
}

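On the API side, the GDS path in this file is reached with an ordinary pipeline-statistics query pool that requests the GS-primitives counter; on NGG hardware radv_query_pool_needs_gds then returns true and the begin/end paths above emit the GDS snapshots. A minimal usage sketch (error handling omitted; the pipelineStatisticsQuery feature must be enabled):

#include <stdint.h>
#include <vulkan/vulkan.h>

/* Create a one-entry pipeline-statistics pool that only counts primitives
 * generated by the geometry shader. */
static VkQueryPool
create_gs_prims_pool(VkDevice device)
{
   VkQueryPoolCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
      .queryType = VK_QUERY_TYPE_PIPELINE_STATISTICS,
      .queryCount = 1,
      .pipelineStatistics =
         VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT,
   };
   VkQueryPool pool = VK_NULL_HANDLE;
   vkCreateQueryPool(device, &info, NULL, &pool);
   return pool;
}

/* Record the query around a draw; this hits emit_begin_query and
 * emit_end_query above. */
static void
record_gs_prims_query(VkCommandBuffer cmd_buf, VkQueryPool pool)
{
   vkCmdResetQueryPool(cmd_buf, pool, 0, 1);
   vkCmdBeginQuery(cmd_buf, pool, 0, 0);
   /* ... begin a render pass, bind a pipeline with a geometry shader,
    * draw, end the render pass ... */
   vkCmdEndQuery(cmd_buf, pool, 0);
}

/* Read the counter back once the command buffer has been submitted. */
static uint64_t
read_gs_prims_result(VkDevice device, VkQueryPool pool)
{
   uint64_t gs_prims = 0;
   vkGetQueryPoolResults(device, pool, 0, 1, sizeof(gs_prims), &gs_prims,
                         sizeof(gs_prims),
                         VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
   return gs_prims;
}

If more bits are set in pipelineStatistics, the results for one query are packed in the order of the set bits, so the data size and stride passed to vkGetQueryPoolResults change accordingly.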

@@ -148,7 +148,8 @@ enum radv_ud_index {
AC_UD_INDIRECT_DESCRIPTOR_SETS = 3,
AC_UD_VIEW_INDEX = 4,
AC_UD_STREAMOUT_BUFFERS = 5,
AC_UD_SHADER_START = 6,
AC_UD_NGG_GS_STATE = 6,
AC_UD_SHADER_START = 7,
AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START,
AC_UD_VS_BASE_VERTEX_START_INSTANCE,
AC_UD_VS_MAX_UD,


@@ -615,6 +615,11 @@ radv_declare_shader_args(struct radv_shader_args *args,
&args->ac.view_index);
}
if (args->options->key.vs_common_out.as_ngg) {
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT,
&args->ngg_gs_state);
}
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
&args->gs_vtx_offset[0]);
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
@@ -742,6 +747,9 @@ radv_declare_shader_args(struct radv_shader_args *args,
}
if (args->ac.view_index.used)
set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
if (args->ngg_gs_state.used)
set_loc_shader(args, AC_UD_NGG_GS_STATE, &user_sgpr_idx, 1);
break;
case MESA_SHADER_FRAGMENT:
break;


@@ -65,6 +65,9 @@ struct radv_shader_args {
struct ac_arg streamout_config;
struct ac_arg streamout_offset[4];
/* NGG GS */
struct ac_arg ngg_gs_state;
bool is_gs_copy_shader;
};