diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index f81e5fa91dc..21ef5caa8e5 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -2574,6 +2574,35 @@ radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer)
 	cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER;
 }
 
+static void
+radv_flush_ngg_gs_state(struct radv_cmd_buffer *cmd_buffer)
+{
+	struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
+	struct radv_userdata_info *loc;
+	uint32_t ngg_gs_state = 0;
+	uint32_t base_reg;
+
+	if (!radv_pipeline_has_gs(pipeline) ||
+	    !radv_pipeline_has_ngg(pipeline))
+		return;
+
+	/* By default NGG GS queries are disabled but they are enabled if the
+	 * command buffer has active GDS queries or if it's a secondary command
+	 * buffer that inherits the number of generated primitives.
+	 */
+	if (cmd_buffer->state.active_pipeline_gds_queries ||
+	    (cmd_buffer->state.inherited_pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT))
+		ngg_gs_state = 1;
+
+	loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_GEOMETRY,
+				    AC_UD_NGG_GS_STATE);
+	base_reg = pipeline->user_data_0[MESA_SHADER_GEOMETRY];
+	assert(loc->sgpr_idx != -1);
+
+	radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
+			  ngg_gs_state);
+}
+
 static void
 radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
 {
@@ -2581,6 +2610,7 @@ radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool
 	radv_flush_streamout_descriptors(cmd_buffer);
 	radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
 	radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
+	radv_flush_ngg_gs_state(cmd_buffer);
 }
 
 struct radv_draw_info {
@@ -3349,6 +3379,9 @@ VkResult radv_BeginCommandBuffer(
 				return result;
 		}
 
+		cmd_buffer->state.inherited_pipeline_statistics =
+			pBeginInfo->pInheritanceInfo->pipelineStatistics;
+
 		radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
 	}
 
@@ -4089,6 +4122,8 @@ void radv_CmdExecuteCommands(
 			primary->tess_rings_needed = true;
 		if (secondary->sample_positions_needed)
 			primary->sample_positions_needed = true;
+		if (secondary->gds_needed)
+			primary->gds_needed = true;
 
 		if (!secondary->state.framebuffer &&
 		    (primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) {
diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c
index 58b679a35ae..422ffa17699 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -3170,6 +3170,33 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx)
 		build_streamout(ctx, &nggso);
 	}
 
+	/* Write shader query data. */
+	tmp = ac_get_arg(&ctx->ac, ctx->args->ngg_gs_state);
+	tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
+	ac_build_ifcc(&ctx->ac, tmp, 5109);
+	tmp = LLVMBuildICmp(builder, LLVMIntULT, tid,
+			    LLVMConstInt(ctx->ac.i32, 4, false), "");
+	ac_build_ifcc(&ctx->ac, tmp, 5110);
+	{
+		tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), "");
+
+		ac_llvm_add_target_dep_function_attr(ctx->main_function,
+						     "amdgpu-gds-size", 256);
+
+		LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
+		LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, "");
+
+		const char *sync_scope = LLVM_VERSION_MAJOR >= 9 ? "workgroup-one-as" : "workgroup";
+
+		/* Use a plain GDS atomic to accumulate the number of generated
+		 * primitives.
+		 */
+		ac_build_atomic_rmw(&ctx->ac, LLVMAtomicRMWBinOpAdd, gdsbase,
+				    tmp, sync_scope);
+	}
+	ac_build_endif(&ctx->ac, 5110);
+	ac_build_endif(&ctx->ac, 5109);
+
 	/* TODO: culling */
 
 	/* Determine vertex liveness.
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 4494d595074..ca7d9a084f5 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -1260,6 +1260,7 @@ struct radv_cmd_state {
 	unsigned active_occlusion_queries;
 	bool perfect_occlusion_queries_enabled;
 	unsigned active_pipeline_queries;
+	unsigned active_pipeline_gds_queries;
 	float offset_scale;
 	uint32_t trace_id;
 	uint32_t last_ia_multi_vgt_param;
@@ -1275,6 +1276,9 @@ struct radv_cmd_state {
 	int predication_type; /* -1: disabled, 0: normal, 1: inverted */
 	uint64_t predication_va;
 
+	/* Inheritance info. */
+	VkQueryPipelineStatisticFlags inherited_pipeline_statistics;
+
 	bool context_roll_without_scissor_emitted;
 };
 
@@ -1333,7 +1337,7 @@ struct radv_cmd_buffer {
 	uint32_t esgs_ring_size_needed;
 	uint32_t gsvs_ring_size_needed;
 	bool tess_rings_needed;
-	bool gds_needed; /* for GFX10 streamout */
+	bool gds_needed; /* for GFX10 streamout and NGG GS queries */
 	bool gds_oa_needed; /* for GFX10 streamout */
 	bool sample_positions_needed;
 
diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c
index f59e435e018..6f660c109e6 100644
--- a/src/amd/vulkan/radv_query.c
+++ b/src/amd/vulkan/radv_query.c
@@ -40,6 +40,14 @@ static const int pipelinestat_block_size = 11 * 8;
 static const unsigned pipeline_statistics_indices[] = {7, 6, 3, 4, 5, 2, 1, 0, 8, 9, 10};
 
+static unsigned
+radv_get_pipeline_statistics_index(const VkQueryPipelineStatisticFlagBits flag)
+{
+	int offset = ffs(flag) - 1;
+	assert(offset < ARRAY_SIZE(pipeline_statistics_indices));
+	return pipeline_statistics_indices[offset];
+}
+
 static nir_ssa_def *nir_test_flag(nir_builder *b, nir_ssa_def *flags, uint32_t flag)
 {
 	return nir_i2b(b, nir_iand(b, flags, nir_imm_int(b, flag)));
 }
@@ -1261,6 +1269,22 @@ static void radv_query_shader(struct radv_cmd_buffer *cmd_buffer,
 	radv_meta_restore(&saved_state, cmd_buffer);
 }
 
+static bool
+radv_query_pool_needs_gds(struct radv_device *device,
+			  struct radv_query_pool *pool)
+{
+	/* The number of primitives generated by geometry shader invocations is
+	 * only counted by the hardware if GS uses the legacy path. When NGG GS
+	 * is used, the hardware can't know the number of generated primitives
+	 * and we have to count it manually inside the shader. To achieve that,
+	 * the driver does a plain GDS atomic to accumulate that value.
+	 * TODO: fix use of NGG GS and non-NGG GS inside the same begin/end
+	 * query.
+	 */
+	return device->physical_device->use_ngg &&
+	       (pool->pipeline_stats_mask & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
+}
+
 VkResult radv_CreateQueryPool(
 	VkDevice _device,
 	const VkQueryPoolCreateInfo* pCreateInfo,
@@ -1725,6 +1749,7 @@ static unsigned event_type_for_stream(unsigned stream)
 }
 
 static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer,
+			     struct radv_query_pool *pool,
 			     uint64_t va,
 			     VkQueryType query_type,
 			     VkQueryControlFlags flags,
@@ -1776,6 +1801,30 @@ static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer,
 		radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
 		radeon_emit(cs, va);
 		radeon_emit(cs, va >> 32);
+
+		if (radv_query_pool_needs_gds(cmd_buffer->device, pool)) {
+			int idx = radv_get_pipeline_statistics_index(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
+
+			/* Make sure GDS is idle before copying the value. */
+			cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
+							RADV_CMD_FLAG_INV_L2;
+			si_emit_cache_flush(cmd_buffer);
+
+			va += 8 * idx;
+
+			si_cs_emit_write_event_eop(cs,
+						   cmd_buffer->device->physical_device->rad_info.chip_class,
+						   radv_cmd_buffer_uses_mec(cmd_buffer),
+						   V_028A90_PS_DONE, 0,
+						   EOP_DST_SEL_TC_L2,
+						   EOP_DATA_SEL_GDS,
+						   va, EOP_DATA_GDS(0, 1), 0);
+
+			/* Record that the command buffer needs GDS. */
+			cmd_buffer->gds_needed = true;
+
+			cmd_buffer->state.active_pipeline_gds_queries++;
+		}
 		break;
 	case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
 		radeon_check_space(cmd_buffer->device->ws, cs, 4);
@@ -1794,6 +1843,7 @@ static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer,
 }
 
 static void emit_end_query(struct radv_cmd_buffer *cmd_buffer,
+			   struct radv_query_pool *pool,
 			   uint64_t va, uint64_t avail_va,
 			   VkQueryType query_type, uint32_t index)
 {
@@ -1841,6 +1891,27 @@ static void emit_end_query(struct radv_cmd_buffer *cmd_buffer,
 					   EOP_DATA_SEL_VALUE_32BIT,
 					   avail_va, 1,
 					   cmd_buffer->gfx9_eop_bug_va);
+
+		if (radv_query_pool_needs_gds(cmd_buffer->device, pool)) {
+			int idx = radv_get_pipeline_statistics_index(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
+
+			/* Make sure GDS is idle before copying the value. */
+			cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
+							RADV_CMD_FLAG_INV_L2;
+			si_emit_cache_flush(cmd_buffer);
+
+			va += 8 * idx;
+
+			si_cs_emit_write_event_eop(cs,
+						   cmd_buffer->device->physical_device->rad_info.chip_class,
+						   radv_cmd_buffer_uses_mec(cmd_buffer),
+						   V_028A90_PS_DONE, 0,
+						   EOP_DST_SEL_TC_L2,
+						   EOP_DATA_SEL_GDS,
+						   va, EOP_DATA_GDS(0, 1), 0);
+
+			cmd_buffer->state.active_pipeline_gds_queries--;
+		}
 		break;
 	case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
 		radeon_check_space(cmd_buffer->device->ws, cs, 4);
@@ -1884,7 +1955,7 @@ void radv_CmdBeginQueryIndexedEXT(
 
 	va += pool->stride * query;
 
-	emit_begin_query(cmd_buffer, va, pool->type, flags, index);
+	emit_begin_query(cmd_buffer, pool, va, pool->type, flags, index);
 }
 
 void radv_CmdBeginQuery(
@@ -1911,7 +1982,7 @@ void radv_CmdEndQueryIndexedEXT(
 	/* Do not need to add the pool BO to the list because the query must
 	 * currently be active, which means the BO is already in the list.
 	 */
-	emit_end_query(cmd_buffer, va, avail_va, pool->type, index);
+	emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, index);
 
 	/*
 	 * For multiview we have to emit a query for each bit in the mask,
@@ -1928,8 +1999,8 @@
 		for (unsigned i = 1; i < util_bitcount(cmd_buffer->state.subpass->view_mask); i++) {
 			va += pool->stride;
 			avail_va += 4;
-			emit_begin_query(cmd_buffer, va, pool->type, 0, 0);
-			emit_end_query(cmd_buffer, va, avail_va, pool->type, 0);
+			emit_begin_query(cmd_buffer, pool, va, pool->type, 0, 0);
+			emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, 0);
 		}
 	}
 }
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index 131774bd886..b38710e6fcf 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -148,7 +148,8 @@ enum radv_ud_index {
 	AC_UD_INDIRECT_DESCRIPTOR_SETS = 3,
 	AC_UD_VIEW_INDEX = 4,
 	AC_UD_STREAMOUT_BUFFERS = 5,
-	AC_UD_SHADER_START = 6,
+	AC_UD_NGG_GS_STATE = 6,
+	AC_UD_SHADER_START = 7,
 	AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START,
 	AC_UD_VS_BASE_VERTEX_START_INSTANCE,
 	AC_UD_VS_MAX_UD,
diff --git a/src/amd/vulkan/radv_shader_args.c b/src/amd/vulkan/radv_shader_args.c
index 6f40808d825..1b57d402d5c 100644
--- a/src/amd/vulkan/radv_shader_args.c
+++ b/src/amd/vulkan/radv_shader_args.c
@@ -615,6 +615,11 @@ radv_declare_shader_args(struct radv_shader_args *args,
 				   &args->ac.view_index);
 		}
 
+		if (args->options->key.vs_common_out.as_ngg) {
+			ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT,
+				   &args->ngg_gs_state);
+		}
+
 		ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
 			   &args->gs_vtx_offset[0]);
 		ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
@@ -742,6 +747,9 @@ radv_declare_shader_args(struct radv_shader_args *args,
 		}
 		if (args->ac.view_index.used)
 			set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
+
+		if (args->ngg_gs_state.used)
+			set_loc_shader(args, AC_UD_NGG_GS_STATE, &user_sgpr_idx, 1);
 		break;
 	case MESA_SHADER_FRAGMENT:
 		break;
diff --git a/src/amd/vulkan/radv_shader_args.h b/src/amd/vulkan/radv_shader_args.h
index 3c7aceb6385..451077a9ede 100644
--- a/src/amd/vulkan/radv_shader_args.h
+++ b/src/amd/vulkan/radv_shader_args.h
@@ -65,6 +65,9 @@ struct radv_shader_args {
 	struct ac_arg streamout_config;
 	struct ac_arg streamout_offset[4];
 
+	/* NGG GS */
+	struct ac_arg ngg_gs_state;
+
 	bool is_gs_copy_shader;
 };
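
Usage note (not part of the patch): this change makes
VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT return correct
results when RADV compiles the GS as an NGG shader. Below is a minimal
application-side sketch of the path being exercised; render pass setup,
submission and synchronization are elided, and "device", "cmd_buf" and
"gs_pipeline" are hypothetical handles assumed to exist (the
pipelineStatisticsQuery device feature must also be enabled).

/* A minimal sketch: count GS-generated primitives with a pipeline
 * statistics query. Assumes the draw is recorded inside a render pass
 * and that cmd_buf is submitted and waited on before results are read.
 */
#include <vulkan/vulkan.h>

static uint64_t
count_gs_primitives(VkDevice device, VkCommandBuffer cmd_buf,
                    VkPipeline gs_pipeline)
{
	const VkQueryPoolCreateInfo pool_info = {
		.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
		.queryType = VK_QUERY_TYPE_PIPELINE_STATISTICS,
		.queryCount = 1,
		/* The statistic that takes the GDS path with NGG GS. */
		.pipelineStatistics =
			VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT,
	};
	VkQueryPool pool;
	vkCreateQueryPool(device, &pool_info, NULL, &pool);

	/* Queries must be reset before first use (outside a render pass). */
	vkCmdResetQueryPool(cmd_buf, pool, 0, 1);

	vkCmdBeginQuery(cmd_buf, pool, 0, 0);
	vkCmdBindPipeline(cmd_buf, VK_PIPELINE_BIND_POINT_GRAPHICS, gs_pipeline);
	vkCmdDraw(cmd_buf, 3, 1, 0, 0); /* any draw through the GS pipeline */
	vkCmdEndQuery(cmd_buf, pool, 0);

	/* ...submit cmd_buf and wait, then read back the count. */
	uint64_t result = 0;
	vkGetQueryPoolResults(device, pool, 0, 1, sizeof(result), &result,
			      sizeof(result),
			      VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);

	vkDestroyQueryPool(device, pool, NULL);
	return result;
}

With a legacy GS the begin/end SAMPLE_PIPELINESTAT events alone provide the
counter; with NGG GS the EOP packets added above copy the GDS counter that
the shader epilogue accumulates into the GS-primitives slot of both
statistics blocks, so the usual end-minus-begin resolve still yields the
right value.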