diff --git a/src/amd/common/ac_nir.h b/src/amd/common/ac_nir.h index 4e54dfc53e5..b0ed0d9b347 100644 --- a/src/amd/common/ac_nir.h +++ b/src/amd/common/ac_nir.h @@ -81,15 +81,16 @@ ac_nir_lower_ls_outputs_to_mem(nir_shader *ls, ac_nir_map_io_driver_location map, enum amd_gfx_level gfx_level, bool tcs_in_out_eq, - uint64_t tcs_inputs_read, - uint64_t tcs_temp_only_inputs); + uint64_t tcs_inputs_via_temp, + uint64_t tcs_inputs_via_lds); void ac_nir_lower_hs_inputs_to_mem(nir_shader *shader, ac_nir_map_io_driver_location map, enum amd_gfx_level gfx_level, bool tcs_in_out_eq, - uint64_t tcs_temp_only_inputs); + uint64_t tcs_inputs_via_temp, + uint64_t tcs_inputs_via_lds); void ac_nir_lower_hs_outputs_to_mem(nir_shader *shader, diff --git a/src/amd/common/ac_nir_lower_tess_io_to_mem.c b/src/amd/common/ac_nir_lower_tess_io_to_mem.c index dd4417d79a9..dfe7c9b14c2 100644 --- a/src/amd/common/ac_nir_lower_tess_io_to_mem.c +++ b/src/amd/common/ac_nir_lower_tess_io_to_mem.c @@ -1168,19 +1168,24 @@ ac_nir_lower_ls_outputs_to_mem(nir_shader *shader, ac_nir_map_io_driver_location map, enum amd_gfx_level gfx_level, bool tcs_in_out_eq, - uint64_t tcs_inputs_read, - uint64_t tcs_temp_only_inputs) + uint64_t tcs_inputs_via_temp, + uint64_t tcs_inputs_via_lds) { assert(shader->info.stage == MESA_SHADER_VERTEX); assert(gfx_level >= GFX9 || !tcs_in_out_eq); lower_tess_io_state state = { .gfx_level = gfx_level, - .tcs_inputs_via_temp = tcs_in_out_eq ? tcs_temp_only_inputs : 0, - .tcs_inputs_via_lds = tcs_inputs_read & (tcs_in_out_eq ? ~tcs_temp_only_inputs : ~0ull), .map_io = map, }; + if (tcs_in_out_eq) { + state.tcs_inputs_via_temp = tcs_inputs_via_temp; + state.tcs_inputs_via_lds = tcs_inputs_via_lds; + } else { + state.tcs_inputs_via_lds = tcs_inputs_via_lds | tcs_inputs_via_temp; + } + nir_shader_intrinsics_pass(shader, lower_ls_output_store, nir_metadata_control_flow, &state); @@ -1191,18 +1196,24 @@ ac_nir_lower_hs_inputs_to_mem(nir_shader *shader, ac_nir_map_io_driver_location map, enum amd_gfx_level gfx_level, bool tcs_in_out_eq, - uint64_t tcs_temp_only_inputs) + uint64_t tcs_inputs_via_temp, + uint64_t tcs_inputs_via_lds) { assert(shader->info.stage == MESA_SHADER_TESS_CTRL); assert(gfx_level >= GFX9 || !tcs_in_out_eq); lower_tess_io_state state = { .gfx_level = gfx_level, - .tcs_inputs_via_temp = tcs_in_out_eq ? tcs_temp_only_inputs : 0, - .tcs_inputs_via_lds = shader->info.inputs_read & (tcs_in_out_eq ? ~tcs_temp_only_inputs : ~0ull), .map_io = map, }; + if (tcs_in_out_eq) { + state.tcs_inputs_via_temp = tcs_inputs_via_temp; + state.tcs_inputs_via_lds = tcs_inputs_via_lds; + } else { + state.tcs_inputs_via_lds = shader->info.inputs_read; + } + nir_shader_lower_instructions(shader, filter_load_tcs_per_vertex_input, lower_hs_per_vertex_input_load, diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 90f6bac6742..55e6a13625c 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -11930,7 +11930,7 @@ select_program_merged(isel_context& ctx, const unsigned shader_count, nir_shader /* Skip s_barrier from TCS when VS outputs are not stored in the LDS. */ const bool tcs_skip_barrier = - ctx.stage == vertex_tess_control_hs && ctx.tcs_temp_only_inputs == nir->info.inputs_read; + ctx.stage == vertex_tess_control_hs && !ctx.any_tcs_inputs_via_lds; /* A barrier is usually needed at the beginning of the second shader, with exceptions. */ const bool need_barrier = i != 0 && !ngg_gs && !tcs_skip_barrier; diff --git a/src/amd/compiler/aco_instruction_selection.h b/src/amd/compiler/aco_instruction_selection.h index 3a0bff49678..dd965328ab9 100644 --- a/src/amd/compiler/aco_instruction_selection.h +++ b/src/amd/compiler/aco_instruction_selection.h @@ -115,7 +115,7 @@ struct isel_context { Temp ttmp8; /* tessellation information */ - uint64_t tcs_temp_only_inputs; + bool any_tcs_inputs_via_lds = false; bool tcs_in_out_eq = false; /* Fragment color output information */ diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index 0c264c38ac4..25cc5f7aadf 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -228,7 +228,7 @@ void setup_tcs_info(isel_context* ctx) { ctx->tcs_in_out_eq = ctx->program->info.vs.tcs_in_out_eq; - ctx->tcs_temp_only_inputs = ctx->program->info.vs.tcs_temp_only_input_mask; + ctx->any_tcs_inputs_via_lds = ctx->program->info.vs.any_tcs_inputs_via_lds; } void diff --git a/src/amd/compiler/aco_shader_info.h b/src/amd/compiler/aco_shader_info.h index afce9c06a8a..d7a8f46745b 100644 --- a/src/amd/compiler/aco_shader_info.h +++ b/src/amd/compiler/aco_shader_info.h @@ -107,7 +107,7 @@ struct aco_shader_info { struct ac_arg epilog_pc; /* Vulkan only */ struct { bool tcs_in_out_eq; - uint64_t tcs_temp_only_input_mask; + bool any_tcs_inputs_via_lds; bool has_prolog; } vs; struct { diff --git a/src/amd/vulkan/nir/radv_nir_lower_io.c b/src/amd/vulkan/nir/radv_nir_lower_io.c index 6005b6a8ebf..5b9ea828515 100644 --- a/src/amd/vulkan/nir/radv_nir_lower_io.c +++ b/src/amd/vulkan/nir/radv_nir_lower_io.c @@ -220,7 +220,7 @@ radv_nir_lower_io_to_mem(struct radv_device *device, struct radv_shader_stage *s if (nir->info.stage == MESA_SHADER_VERTEX) { if (info->vs.as_ls) { NIR_PASS_V(nir, ac_nir_lower_ls_outputs_to_mem, map_output, pdev->info.gfx_level, info->vs.tcs_in_out_eq, - info->vs.hs_inputs_read, info->vs.tcs_temp_only_input_mask); + info->vs.tcs_inputs_via_temp, info->vs.tcs_inputs_via_lds); return true; } else if (info->vs.as_es) { NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, map_output, pdev->info.gfx_level, info->esgs_itemsize, info->gs_inputs_read); @@ -228,7 +228,7 @@ radv_nir_lower_io_to_mem(struct radv_device *device, struct radv_shader_stage *s } } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) { NIR_PASS_V(nir, ac_nir_lower_hs_inputs_to_mem, map_input, pdev->info.gfx_level, info->vs.tcs_in_out_eq, - info->vs.tcs_temp_only_input_mask); + info->vs.tcs_inputs_via_temp, info->vs.tcs_inputs_via_lds); NIR_PASS_V(nir, ac_nir_lower_hs_outputs_to_mem, map_output, pdev->info.gfx_level, info->tcs.tes_inputs_read, info->tcs.tes_patch_inputs_read, info->wave_size); diff --git a/src/amd/vulkan/radv_aco_shader_info.h b/src/amd/vulkan/radv_aco_shader_info.h index 3ca8b735061..b6c63dc193c 100644 --- a/src/amd/vulkan/radv_aco_shader_info.h +++ b/src/amd/vulkan/radv_aco_shader_info.h @@ -34,11 +34,11 @@ radv_aco_convert_shader_info(struct aco_shader_info *aco_info, const struct radv ASSIGN_FIELD(ps.has_epilog); ASSIGN_FIELD(merged_shader_compiled_separately); ASSIGN_FIELD(vs.tcs_in_out_eq); - ASSIGN_FIELD(vs.tcs_temp_only_input_mask); ASSIGN_FIELD(vs.has_prolog); ASSIGN_FIELD(tcs.num_lds_blocks); ASSIGN_FIELD(ps.num_inputs); ASSIGN_FIELD(cs.uses_full_subgroups); + aco_info->vs.any_tcs_inputs_via_lds = radv->vs.tcs_inputs_via_lds != 0; aco_info->ps.spi_ps_input_ena = radv->ps.spi_ps_input_ena; aco_info->ps.spi_ps_input_addr = radv->ps.spi_ps_input_addr; aco_info->ps.has_prolog = false; diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c index 0dc5a8e06f5..4a9ed3ba113 100644 --- a/src/amd/vulkan/radv_shader_info.c +++ b/src/amd/vulkan/radv_shader_info.c @@ -588,7 +588,7 @@ gather_shader_info_vs(struct radv_device *device, const nir_shader *nir, } info->gs_inputs_read = ~0ULL; - info->vs.hs_inputs_read = ~0ULL; + info->vs.tcs_inputs_via_lds = ~0ULL; /* Use per-attribute vertex descriptors to prevent faults and for correct bounds checking. */ info->vs.use_per_attribute_vb_descs = radv_use_per_attribute_vb_descs(nir, gfx_state, stage_key); @@ -1782,7 +1782,7 @@ radv_link_shaders_info(struct radv_device *device, struct radv_shader_stage *pro struct radv_shader_stage *vs_stage = producer; struct radv_shader_stage *tcs_stage = consumer; - vs_stage->info.vs.hs_inputs_read = tcs_stage->nir->info.inputs_read; + vs_stage->info.vs.tcs_inputs_via_lds = tcs_stage->nir->info.inputs_read; if (gfx_state->ts.patch_control_points) { vs_stage->info.workgroup_size = @@ -1804,11 +1804,16 @@ radv_link_shaders_info(struct radv_device *device, struct radv_shader_stage *pro gfx_state->ts.patch_control_points == tcs_stage->info.tcs.tcs_vertices_out && vs_stage->nir->info.float_controls_execution_mode == tcs_stage->nir->info.float_controls_execution_mode; - if (vs_stage->info.vs.tcs_in_out_eq) - vs_stage->info.vs.tcs_temp_only_input_mask = - tcs_stage->nir->info.inputs_read & vs_stage->nir->info.outputs_written & - ~tcs_stage->nir->info.tess.tcs_cross_invocation_inputs_read & - ~tcs_stage->nir->info.inputs_read_indirectly & ~vs_stage->nir->info.outputs_accessed_indirectly; + if (vs_stage->info.vs.tcs_in_out_eq) { + vs_stage->info.vs.tcs_inputs_via_temp = vs_stage->nir->info.outputs_written & + ~vs_stage->nir->info.outputs_accessed_indirectly & + tcs_stage->nir->info.tess.tcs_same_invocation_inputs_read; + vs_stage->info.vs.tcs_inputs_via_lds = tcs_stage->nir->info.tess.tcs_cross_invocation_inputs_read | + (tcs_stage->nir->info.tess.tcs_same_invocation_inputs_read & + tcs_stage->nir->info.inputs_read_indirectly) | + (tcs_stage->nir->info.tess.tcs_same_invocation_inputs_read & + vs_stage->nir->info.outputs_accessed_indirectly); + } } } } diff --git a/src/amd/vulkan/radv_shader_info.h b/src/amd/vulkan/radv_shader_info.h index 7c66e2dc3d4..b35968c217f 100644 --- a/src/amd/vulkan/radv_shader_info.h +++ b/src/amd/vulkan/radv_shader_info.h @@ -118,7 +118,8 @@ struct radv_shader_info { bool as_es; bool as_ls; bool tcs_in_out_eq; - uint64_t tcs_temp_only_input_mask; + uint64_t tcs_inputs_via_temp; + uint64_t tcs_inputs_via_lds; uint8_t num_linked_outputs; bool needs_base_instance; bool use_per_attribute_vb_descs; @@ -128,7 +129,6 @@ struct radv_shader_info { bool dynamic_inputs; bool dynamic_num_verts_per_prim; uint32_t num_outputs; /* For NGG streamout only */ - uint64_t hs_inputs_read; /* Mask of HS inputs read (only used by linked LS) */ } vs; struct { uint8_t output_usage_mask[VARYING_SLOT_VAR31 + 1]; diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index c6f02cd5f61..4873967eb45 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -1857,8 +1857,8 @@ static bool si_lower_io_to_mem(struct si_shader *shader, nir_shader *nir, is_gfx9_mono_tcs ? NULL : si_map_io_driver_location, sel->screen->info.gfx_level, key->ge.opt.same_patch_vertices, - is_gfx9_mono_tcs ? next_sel->info.base.inputs_read : ~0ull, - tcs_vgpr_only_inputs); + is_gfx9_mono_tcs ? next_sel->info.tcs_inputs_via_temp : 0, + is_gfx9_mono_tcs ? next_sel->info.tcs_inputs_via_lds : ~0ull); return true; } else if (key->ge.as_es) { NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location, @@ -1868,8 +1868,8 @@ static bool si_lower_io_to_mem(struct si_shader *shader, nir_shader *nir, } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) { NIR_PASS_V(nir, ac_nir_lower_hs_inputs_to_mem, is_gfx9_mono_tcs ? NULL : si_map_io_driver_location, - sel->screen->info.gfx_level, - key->ge.opt.same_patch_vertices, sel->info.tcs_vgpr_only_inputs); + sel->screen->info.gfx_level, key->ge.opt.same_patch_vertices, + sel->info.tcs_inputs_via_temp, sel->info.tcs_inputs_via_lds); /* Used by hs_emit_write_tess_factors() when monolithic shader. */ if (nir->info.tess._primitive_mode == TESS_PRIMITIVE_UNSPECIFIED) @@ -3644,8 +3644,7 @@ nir_shader *si_get_prev_stage_nir_shader(struct si_shader *shader, si_init_shader_args(prev_shader, args); - nir_shader *nir = si_get_nir_shader(prev_shader, args, free_nir, - sel->info.tcs_vgpr_only_inputs, NULL); + nir_shader *nir = si_get_nir_shader(prev_shader, args, free_nir, 0, NULL); si_update_shader_binary_info(shader, nir); diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index adfd495890c..02d93eb08df 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -482,7 +482,8 @@ struct si_shader_info { uint16_t enabled_streamout_buffer_mask; uint64_t inputs_read; /* "get_unique_index" bits */ - uint64_t tcs_vgpr_only_inputs; /* TCS inputs that are only in VGPRs, not LDS. */ + uint64_t tcs_inputs_via_temp; + uint64_t tcs_inputs_via_lds; /* For VS before {TCS, TES, GS} and TES before GS. */ uint64_t ls_es_outputs_written; /* "get_unique_index" bits */ diff --git a/src/gallium/drivers/radeonsi/si_shader_aco.c b/src/gallium/drivers/radeonsi/si_shader_aco.c index e87130eb661..2bf55012fba 100644 --- a/src/gallium/drivers/radeonsi/si_shader_aco.c +++ b/src/gallium/drivers/radeonsi/si_shader_aco.c @@ -87,7 +87,9 @@ si_fill_aco_shader_info(struct si_shader *shader, struct aco_shader_info *info, switch (stage) { case MESA_SHADER_TESS_CTRL: info->vs.tcs_in_out_eq = key->ge.opt.same_patch_vertices; - info->vs.tcs_temp_only_input_mask = sel->info.tcs_vgpr_only_inputs; + info->vs.any_tcs_inputs_via_lds = sel->info.tcs_inputs_via_lds || + (!shader->key.ge.opt.same_patch_vertices && + sel->info.tcs_inputs_via_temp); info->tcs.tcs_offchip_layout = args->tcs_offchip_layout; break; case MESA_SHADER_FRAGMENT: diff --git a/src/gallium/drivers/radeonsi/si_shader_info.c b/src/gallium/drivers/radeonsi/si_shader_info.c index 1d2d2495b07..8faca45b4f4 100644 --- a/src/gallium/drivers/radeonsi/si_shader_info.c +++ b/src/gallium/drivers/radeonsi/si_shader_info.c @@ -702,9 +702,10 @@ void si_nir_scan_shader(struct si_screen *sscreen, const struct nir_shader *nir, assert(((info->esgs_vertex_stride / 4) & C_028AAC_ITEMSIZE) == 0); } - info->tcs_vgpr_only_inputs = ~info->base.tess.tcs_cross_invocation_inputs_read & - ~info->base.inputs_read_indirectly & - info->base.inputs_read; + info->tcs_inputs_via_temp = info->base.tess.tcs_same_invocation_inputs_read; + info->tcs_inputs_via_lds = info->base.tess.tcs_cross_invocation_inputs_read | + (info->base.tess.tcs_same_invocation_inputs_read & + info->base.inputs_read_indirectly); } if (nir->info.stage == MESA_SHADER_GEOMETRY) { diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index dec1b87dd8f..fb8987adce9 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -702,8 +702,7 @@ static bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shade if (ctx->stage == MESA_SHADER_TESS_CTRL) { /* We need the barrier only if TCS inputs are read from LDS. */ if (!shader->key.ge.opt.same_patch_vertices || - shader->selector->info.base.inputs_read & - ~shader->selector->info.tcs_vgpr_only_inputs) { + shader->selector->info.tcs_inputs_via_lds) { ac_build_waitcnt(&ctx->ac, AC_WAIT_DS); /* If both input and output patches are wholly in one wave, we don't need a barrier. diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 48454588d8a..ac6d859d2a2 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -4661,11 +4661,11 @@ unsigned si_shader_lshs_vertex_stride(struct si_shader *ls) assert(tcs->selector->screen->info.gfx_level >= GFX9); if (tcs->is_monolithic) { - uint64_t lds_inputs_read = tcs->selector->info.base.inputs_read; + uint64_t lds_inputs_read = tcs->selector->info.tcs_inputs_via_lds; - /* Don't allocate LDS for inputs passed via VGPRs. */ - if (tcs->key.ge.opt.same_patch_vertices) - lds_inputs_read &= ~tcs->selector->info.tcs_vgpr_only_inputs; + /* If the TCS in/out number of vertices is different, all inputs are passed via LDS. */ + if (!tcs->key.ge.opt.same_patch_vertices) + lds_inputs_read |= tcs->selector->info.tcs_inputs_via_temp; /* NIR lowering passes pack LS outputs/HS inputs if the usage masks of both are known. */ num_slots = util_bitcount64(lds_inputs_read);