diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index be9ccaa14da..84ce6131446 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -1327,6 +1327,19 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw); + if (pipeline->graphics.has_ngg_culling && + pipeline->graphics.last_vgt_api_stage != MESA_SHADER_GEOMETRY && + !cmd_buffer->state.last_nggc_settings) { + /* The already emitted RSRC2 contains the LDS required for NGG culling. + * Culling is currently disabled, so re-emit RSRC2 to reduce LDS usage. + * API GS always needs LDS, so this isn't useful there. + */ + struct radv_shader_variant *v = pipeline->shaders[pipeline->graphics.last_vgt_api_stage]; + radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, + (v->config.rsrc2 & C_00B22C_LDS_SIZE) | + S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling)); + } + if (!cmd_buffer->state.emitted_pipeline || cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != pipeline->ctx_cs.cdw || cmd_buffer->state.emitted_pipeline->ctx_cs_hash != pipeline->ctx_cs_hash || @@ -3839,6 +3852,8 @@ radv_BeginCommandBuffer(VkCommandBuffer commandBuffer, const VkCommandBufferBegi cmd_buffer->state.last_sx_ps_downconvert = -1; cmd_buffer->state.last_sx_blend_opt_epsilon = -1; cmd_buffer->state.last_sx_blend_opt_control = -1; + cmd_buffer->state.last_nggc_settings = -1; + cmd_buffer->state.last_nggc_settings_sgpr_idx = -1; cmd_buffer->usage_flags = pBeginInfo->flags; if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY && @@ -4961,6 +4976,10 @@ radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCou if (secondary->state.last_index_type != -1) { primary->state.last_index_type = secondary->state.last_index_type; } + + primary->state.last_nggc_settings = secondary->state.last_nggc_settings; + primary->state.last_nggc_settings_sgpr_idx = 
secondary->state.last_nggc_settings_sgpr_idx; + primary->state.last_nggc_skip = secondary->state.last_nggc_skip; } /* After executing commands from secondary buffers we have to dirty @@ -5635,6 +5654,209 @@ radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer, return false; } +enum { /* Bit flags OR'ed together to form the NGG culling settings value. */ + ngg_cull_none = 0, + ngg_cull_front_face = 1, + ngg_cull_back_face = 2, + ngg_cull_face_is_ccw = 4, + ngg_cull_small_primitives = 8, +}; + +ALWAYS_INLINE static bool +radv_skip_ngg_culling(bool has_tess, const unsigned vtx_cnt, + bool indirect, unsigned num_viewports) +{ /* Returns true when NGG culling should be disabled for the current draw. */ + /* If we have to draw only a few vertices, we get better latency if + * we disable NGG culling. + * + * When tessellation is used, what matters is the number of tessellated + * vertices, so let's always assume it's not a small draw. + * + * The caller only uploads the transform of viewport 0 for culling, so culling is also skipped whenever more than one viewport is in use. TODO: Figure out how to do culling with multiple viewports efficiently. + */ + return (!has_tess && !indirect && vtx_cnt < 512) || num_viewports > 1; +} + +ALWAYS_INLINE static uint32_t +radv_get_ngg_culling_settings(struct radv_cmd_buffer *cmd_buffer, bool vp_y_inverted) +{ + const struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; + const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic; + + /* Cull every triangle when rasterizer discard is enabled. */ + if (d->rasterizer_discard_enable || + G_028810_DX_RASTERIZATION_KILL(cmd_buffer->state.pipeline->graphics.pa_cl_clip_cntl)) + return ngg_cull_front_face | ngg_cull_back_face; + + uint32_t pa_su_sc_mode_cntl = cmd_buffer->state.pipeline->graphics.pa_su_sc_mode_cntl; + uint32_t nggc_settings = ngg_cull_none; + + /* The culling code needs to know whether face is CW or CCW. */ + bool ccw = (pipeline->graphics.needed_dynamic_state & RADV_DYNAMIC_FRONT_FACE) + ? d->front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE + : G_028814_FACE(pa_su_sc_mode_cntl) == 0; + + /* Take inverted viewport into account. 
*/ + ccw ^= vp_y_inverted; + + if (ccw) + nggc_settings |= ngg_cull_face_is_ccw; + + /* Face culling settings. */ + if ((pipeline->graphics.needed_dynamic_state & RADV_DYNAMIC_CULL_MODE) + ? (d->cull_mode & VK_CULL_MODE_FRONT_BIT) + : G_028814_CULL_FRONT(pa_su_sc_mode_cntl)) + nggc_settings |= ngg_cull_front_face; + if ((pipeline->graphics.needed_dynamic_state & RADV_DYNAMIC_CULL_MODE) + ? (d->cull_mode & VK_CULL_MODE_BACK_BIT) + : G_028814_CULL_BACK(pa_su_sc_mode_cntl)) + nggc_settings |= ngg_cull_back_face; + + /* Small primitive culling is only valid when conservative overestimation is not used. */ + if (!pipeline->graphics.uses_conservative_overestimate) { + nggc_settings |= ngg_cull_small_primitives; + + /* small_prim_precision = num_samples / 2^subpixel_bits + * num_samples is also always a power of two, so the small prim precision can only be + * a power of two between 2^-2 and 2^-6, therefore it's enough to remember the exponent. NOTE(review): the code below computes util_logbase2(num_samples) - util_logbase2(256), and subpixel_bits holds 256 rather than a bit count; confirm the exponent range quoted above. + */ + unsigned subpixel_bits = 256; + int32_t small_prim_precision_log2 = util_logbase2(pipeline->graphics.ms.num_samples) - util_logbase2(subpixel_bits); + nggc_settings |= ((uint32_t) small_prim_precision_log2 << 24u); + } + + return nggc_settings; +} + +static void +radv_emit_ngg_culling_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info) +{ /* Per-draw update of the NGG culling user SGPRs (settings and viewport transform), the GS RSRC2 LDS size and GE_PC_ALLOC; state remembered in cmd_buffer->state.last_nggc_* is used to avoid redundant writes. */ + struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; + const unsigned stage = pipeline->graphics.last_vgt_api_stage; + const bool nggc_supported = pipeline->graphics.has_ngg_culling; + + if (!nggc_supported && !cmd_buffer->state.last_nggc_settings) { + /* Current shader doesn't support culling and culling was already disabled: + * No further steps needed, just remember the SGPR's location is not set. + */ + cmd_buffer->state.last_nggc_settings_sgpr_idx = -1; + return; + } + + /* Check dirty flags: + * - Dirty pipeline: SGPR index may have changed (we have to re-emit if changed). + * - Dirty dynamic flags: culling settings may have changed. 
+ */ + const bool dirty = + cmd_buffer->state.dirty & + (RADV_CMD_DIRTY_PIPELINE | + RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE | + RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT); + + /* Check small draw status: + * For small draw calls, we disable culling by setting the SGPR to 0. + */ + const bool skip = + radv_skip_ngg_culling( + stage == MESA_SHADER_TESS_EVAL, draw_info->count, draw_info->indirect, + cmd_buffer->state.dynamic.viewport.count); + + /* See if anything changed. */ + if (!dirty && skip == cmd_buffer->state.last_nggc_skip) + return; + + /* Remember small draw state. */ + cmd_buffer->state.last_nggc_skip = skip; + const struct radv_shader_variant *v = pipeline->shaders[stage]; + assert(v->info.has_ngg_culling == nggc_supported); + + /* Find the user SGPR. */ + const uint32_t base_reg = pipeline->user_data_0[stage]; + const int8_t nggc_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_CULLING_SETTINGS].sgpr_idx; + assert(!nggc_supported || nggc_sgpr_idx != -1); + + /* Get viewport transform. Only viewports[0] is read here. */ + float vp_scale[3], vp_translate[3]; + radv_get_viewport_xform(&cmd_buffer->state.dynamic.viewport.viewports[0], vp_scale, vp_translate); + bool vp_y_inverted = (-vp_scale[1] + vp_translate[1]) > (vp_scale[1] + vp_translate[1]); /* Effectively: the Y scale is negative. */ + + /* Get current culling settings. */ + uint32_t nggc_settings = nggc_supported && !skip + ? radv_get_ngg_culling_settings(cmd_buffer, vp_y_inverted) + : ngg_cull_none; + + bool emit_viewport = nggc_settings && + (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_VIEWPORT || + cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx || + !cmd_buffer->state.last_nggc_settings); + + if (emit_viewport) { + /* Correction for inverted Y */ + if (vp_y_inverted) { + vp_scale[1] = -vp_scale[1]; + vp_translate[1] = -vp_translate[1]; + } + + /* Correction for number of samples per pixel. 
*/ + for (unsigned i = 0; i < 2; ++i) { + vp_scale[i] *= (float) pipeline->graphics.ms.num_samples; + vp_translate[i] *= (float) pipeline->graphics.ms.num_samples; + } + + uint32_t vp_reg_values[4] = {fui(vp_scale[0]), fui(vp_scale[1]), fui(vp_translate[0]), fui(vp_translate[1])}; + const int8_t vp_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_VIEWPORT].sgpr_idx; + assert(vp_sgpr_idx != -1); + radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + vp_sgpr_idx * 4, 4); + radeon_emit_array(cmd_buffer->cs, vp_reg_values, 4); /* Order: scale.x, scale.y, translate.x, translate.y. */ + } + + bool emit_settings = nggc_supported && + (cmd_buffer->state.last_nggc_settings != nggc_settings || + cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx); + + /* This needs to be emitted when culling is turned on + * and when it's already on but some settings change. + */ + if (emit_settings) { + assert(nggc_sgpr_idx >= 0); + radeon_set_sh_reg(cmd_buffer->cs, base_reg + nggc_sgpr_idx * 4, nggc_settings); + } + + /* These only need to be emitted when culling is turned on or off, + * but not when it stays on and just some settings change. + */ + if (!!cmd_buffer->state.last_nggc_settings != !!nggc_settings) { + const struct radv_physical_device *physical_device = cmd_buffer->device->physical_device; + uint32_t rsrc2 = v->config.rsrc2; + uint32_t oversub_pc_lines = physical_device->rad_info.pc_lines / 4; + + if (nggc_settings) { + /* Tweak the parameter cache oversubscription. + * This allows the HW to launch more NGG workgroups than the pre-allocated parameter + * cache would normally allow, yielding better perf when culling is on. + */ + oversub_pc_lines = physical_device->rad_info.pc_lines * 3 / 4; + } else { + /* Allocate less LDS when culling is disabled. (But GS always needs it.) */ + if (stage != MESA_SHADER_GEOMETRY) + rsrc2 = (rsrc2 & C_00B22C_LDS_SIZE) | S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling); + } + + /* When the pipeline is dirty, radv_emit_graphics_pipeline will write this register. 
*/ + if (!(cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE)) { + radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2); + } + + /* Update parameter cache oversubscription setting. */ + radeon_set_uconfig_reg(cmd_buffer->cs, R_030980_GE_PC_ALLOC, + S_030980_OVERSUB_EN(physical_device->rad_info.use_late_alloc) | + S_030980_NUM_PC_LINES(oversub_pc_lines - 1)); + } + + cmd_buffer->state.last_nggc_settings = nggc_settings; /* Remembered so the next draw can tell what changed. */ + cmd_buffer->state.last_nggc_settings_sgpr_idx = nggc_sgpr_idx; +} + static void radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info) { @@ -5644,6 +5866,10 @@ radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct r cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline) radv_emit_rbplus_state(cmd_buffer); + if ((cmd_buffer->device->instance->perftest_flags & RADV_PERFTEST_NGGC) && + cmd_buffer->state.pipeline->graphics.is_ngg) + radv_emit_ngg_culling_state(cmd_buffer, info); + if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) radv_emit_graphics_pipeline(cmd_buffer); diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 0d579d8c256..9b08c1ba1cf 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -1790,6 +1790,10 @@ radv_pipeline_init_raster_state(struct radv_pipeline *pipeline, S_028810_ZCLIP_FAR_DISABLE(depth_clip_disable ? 1 : 0) | S_028810_DX_RASTERIZATION_KILL(raster_info->rasterizerDiscardEnable ? 
1 : 0) | S_028810_DX_LINEAR_ATTR_CLIP_ENA(1); + + pipeline->graphics.uses_conservative_overestimate = + radv_get_conservative_raster_mode(pCreateInfo->pRasterizationState) == + VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT; } static void @@ -5441,6 +5445,9 @@ radv_pipeline_init(struct radv_pipeline *pipeline, struct radv_device *device, pipeline->streamout_shader = radv_pipeline_get_streamout_shader(pipeline); pipeline->graphics.is_ngg = radv_pipeline_has_ngg(pipeline); + pipeline->graphics.has_ngg_culling = + pipeline->graphics.is_ngg && + pipeline->shaders[pipeline->graphics.last_vgt_api_stage]->info.has_ngg_culling; radv_pipeline_generate_pm4(pipeline, pCreateInfo, extra, &blend); diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 06ef5928dc4..b99bea00dd1 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -1415,6 +1415,11 @@ struct radv_cmd_state { bool pending_sqtt_barrier_end; enum rgp_flush_bits sqtt_flush_bits; + /* NGG culling state. */ + uint32_t last_nggc_settings; + int8_t last_nggc_settings_sgpr_idx; + bool last_nggc_skip; + uint8_t cb_mip[MAX_RTS]; /* Whether DRAW_{INDEX}_INDIRECT_MULTI is emitted. */ @@ -1762,6 +1767,7 @@ struct radv_pipeline { unsigned pa_cl_clip_cntl; unsigned cb_color_control; bool uses_dynamic_stride; + bool uses_conservative_overestimate; /* Used for rbplus */ uint32_t col_format; @@ -1769,6 +1775,7 @@ struct radv_pipeline { /* Whether the pipeline uses NGG (GFX10+). 
*/ bool is_ngg; + bool has_ngg_culling; /* Last pre-PS API stage */ gl_shader_stage last_vgt_api_stage; diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index f1cb00f27ca..5a59e7f251a 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -969,6 +969,8 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir, key->vs_common_out.export_prim_id, key->vs.provoking_vtx_last); + info->has_ngg_culling = out_conf.can_cull; + info->num_lds_blocks_when_not_culling = DIV_ROUND_UP(out_conf.lds_bytes_if_culling_off, device->physical_device->rad_info.lds_encode_granularity); info->is_ngg_passthrough = out_conf.passthrough; key->vs_common_out.as_ngg_passthrough = out_conf.passthrough; } else if (nir->info.stage == MESA_SHADER_GEOMETRY) { diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index b755c59094d..1ad54b93276 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -162,7 +162,9 @@ enum radv_ud_index { AC_UD_VIEW_INDEX = 4, AC_UD_STREAMOUT_BUFFERS = 5, AC_UD_NGG_GS_STATE = 6, - AC_UD_SHADER_START = 7, + AC_UD_NGG_CULLING_SETTINGS = 7, + AC_UD_NGG_VIEWPORT = 8, + AC_UD_SHADER_START = 9, AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START, AC_UD_VS_BASE_VERTEX_START_INSTANCE, AC_UD_VS_MAX_UD, @@ -261,6 +263,8 @@ struct radv_shader_info { bool need_indirect_descriptor_sets; bool is_ngg; bool is_ngg_passthrough; + bool has_ngg_culling; + uint32_t num_lds_blocks_when_not_culling; uint32_t num_tess_patches; struct { uint8_t input_usage_mask[RADV_VERT_ATTRIB_MAX]; diff --git a/src/amd/vulkan/radv_shader_args.c b/src/amd/vulkan/radv_shader_args.c index d0fde542d45..773a2364565 100644 --- a/src/amd/vulkan/radv_shader_args.c +++ b/src/amd/vulkan/radv_shader_args.c @@ -117,6 +117,19 @@ count_vs_user_sgprs(struct radv_shader_args *args) return count; } +static unsigned +count_ngg_sgprs(struct radv_shader_args *args, gl_shader_stage stage) +{ /* Returns how many user SGPRs the NGG arguments take: 1 for the geometry stage, plus 5 when the shader has NGG culling. */ + unsigned count = 0; + + if (stage == 
MESA_SHADER_GEOMETRY) + count += 1; /* ngg_gs_state */ + if (args->shader_info->has_ngg_culling) + count += 5; /* ngg_culling_settings + 4x ngg_viewport_* */ + + return count; +} + static void allocate_inline_push_consts(struct radv_shader_args *args, struct user_sgpr_info *user_sgpr_info) { @@ -184,6 +197,8 @@ allocate_user_sgprs(struct radv_shader_args *args, gl_shader_stage stage, bool h case MESA_SHADER_VERTEX: if (!args->is_gs_copy_shader) user_sgpr_count += count_vs_user_sgprs(args); + if (args->options->key.vs_common_out.as_ngg) + user_sgpr_count += count_ngg_sgprs(args, stage); break; case MESA_SHADER_TESS_CTRL: if (has_previous_stage) { @@ -192,11 +207,13 @@ allocate_user_sgprs(struct radv_shader_args *args, gl_shader_stage stage, bool h } break; case MESA_SHADER_TESS_EVAL: + if (args->options->key.vs_common_out.as_ngg) + user_sgpr_count += count_ngg_sgprs(args, stage); break; case MESA_SHADER_GEOMETRY: if (has_previous_stage) { if (args->options->key.vs_common_out.as_ngg) - user_sgpr_count++; /* NGG GS state */ + user_sgpr_count += count_ngg_sgprs(args, stage); if (previous_stage == MESA_SHADER_VERTEX) { user_sgpr_count += count_vs_user_sgprs(args); @@ -356,6 +373,22 @@ declare_tes_input_vgprs(struct radv_shader_args *args) ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tes_patch_id); } +static void +declare_ngg_sgprs(struct radv_shader_args *args, gl_shader_stage stage) +{ /* Declares the NGG SGPR arguments: the GS state for the geometry stage, then the culling settings and four viewport values when the shader has NGG culling. This mirrors the 1 and 5 counted by count_ngg_sgprs(). */ + if (stage == MESA_SHADER_GEOMETRY) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ngg_gs_state); + } + + if (args->shader_info->has_ngg_culling) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ngg_culling_settings); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ngg_viewport_scale[0]); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ngg_viewport_scale[1]); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ngg_viewport_translate[0]); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, 
&args->ngg_viewport_translate[1]); + } +} + static void set_global_input_locs(struct radv_shader_args *args, const struct user_sgpr_info *user_sgpr_info, uint8_t *user_sgpr_idx) @@ -405,6 +438,24 @@ set_vs_specific_input_locs(struct radv_shader_args *args, gl_shader_stage stage, } } +static void +set_ngg_sgprs_locs(struct radv_shader_args *args, gl_shader_stage stage, uint8_t *user_sgpr_idx) +{ /* Assigns user-data locations for the NGG arguments in the order they are declared by declare_ngg_sgprs(): GS state, culling settings, then the viewport values (count of 4). */ + if (stage == MESA_SHADER_GEOMETRY) { + assert(args->ngg_gs_state.used); + set_loc_shader(args, AC_UD_NGG_GS_STATE, user_sgpr_idx, 1); + } + + if (args->shader_info->has_ngg_culling) { + assert(args->ngg_culling_settings.used && + args->ngg_viewport_scale[0].used && args->ngg_viewport_scale[1].used && + args->ngg_viewport_translate[0].used && args->ngg_viewport_translate[1].used); + + set_loc_shader(args, AC_UD_NGG_CULLING_SETTINGS, user_sgpr_idx, 1); + set_loc_shader(args, AC_UD_NGG_VIEWPORT, user_sgpr_idx, 4); + } +} + /* Returns whether the stage is a stage that can be directly before the GS */ static bool is_pre_gs_stage(gl_shader_stage stage) @@ -488,6 +539,9 @@ radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage, if (args->options->explicit_scratch_args) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset); } + if (args->options->key.vs_common_out.as_ngg) { + declare_ngg_sgprs(args, stage); + } declare_vs_input_vgprs(args); break; @@ -547,6 +601,9 @@ radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage, if (args->options->explicit_scratch_args) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset); } + if (args->options->key.vs_common_out.as_ngg) { + declare_ngg_sgprs(args, stage); + } declare_tes_input_vgprs(args); break; case MESA_SHADER_GEOMETRY: @@ -576,7 +633,7 @@ radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage, } if (args->options->key.vs_common_out.as_ngg) { - ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ngg_gs_state); + 
declare_ngg_sgprs(args, stage); } ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]); @@ -669,6 +726,8 @@ radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage, set_vs_specific_input_locs(args, stage, has_previous_stage, previous_stage, &user_sgpr_idx); if (args->ac.view_index.used) set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); + if (args->options->key.vs_common_out.as_ngg) + set_ngg_sgprs_locs(args, stage, &user_sgpr_idx); break; case MESA_SHADER_TESS_CTRL: set_vs_specific_input_locs(args, stage, has_previous_stage, previous_stage, &user_sgpr_idx); @@ -678,6 +737,8 @@ radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage, case MESA_SHADER_TESS_EVAL: if (args->ac.view_index.used) set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); + if (args->options->key.vs_common_out.as_ngg) + set_ngg_sgprs_locs(args, stage, &user_sgpr_idx); break; case MESA_SHADER_GEOMETRY: if (has_previous_stage) { @@ -688,8 +749,8 @@ radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage, if (args->ac.view_index.used) set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); - if (args->ngg_gs_state.used) - set_loc_shader(args, AC_UD_NGG_GS_STATE, &user_sgpr_idx, 1); + if (args->options->key.vs_common_out.as_ngg) + set_ngg_sgprs_locs(args, stage, &user_sgpr_idx); break; case MESA_SHADER_FRAGMENT: break; diff --git a/src/amd/vulkan/radv_shader_args.h b/src/amd/vulkan/radv_shader_args.h index a6828cdf309..a7c13152fcb 100644 --- a/src/amd/vulkan/radv_shader_args.h +++ b/src/amd/vulkan/radv_shader_args.h @@ -41,6 +41,9 @@ struct radv_shader_args { /* NGG GS */ struct ac_arg ngg_gs_state; + struct ac_arg ngg_culling_settings; + struct ac_arg ngg_viewport_scale[2]; + struct ac_arg ngg_viewport_translate[2]; bool is_gs_copy_shader; bool is_trap_handler_shader;