From c1237256cb48f107d4d612e9e66954edbcbe476a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sun, 20 Apr 2025 08:01:55 -0400 Subject: [PATCH] ac/nir/tess: execute the tess level workgroup vote on all chips MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It will be used to skip stores for discarded patches. Reviewed-by: Timur Kristóf Part-of: --- src/amd/common/ac_shader_util.c | 3 +- src/amd/common/ac_shader_util.h | 4 +- .../common/nir/ac_nir_lower_tess_io_to_mem.c | 89 ++++++++++--------- 3 files changed, 48 insertions(+), 48 deletions(-) diff --git a/src/amd/common/ac_shader_util.c b/src/amd/common/ac_shader_util.c index 0ef0ed2da77..69a7e58e435 100644 --- a/src/amd/common/ac_shader_util.c +++ b/src/amd/common/ac_shader_util.c @@ -998,8 +998,7 @@ uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t nu if (lds_per_patch) { const unsigned max_lds_size = (info->gfx_level >= GFX9 ? 64 * 1024 : 32 * 1024); /* hw limit */ /* Target at least 2 workgroups per CU. */ - const unsigned target_lds_size = max_lds_size / 2 - - (info->gfx_level >= GFX11 ? AC_HS_MSG_VOTE_LDS_BYTES : 0); + const unsigned target_lds_size = max_lds_size / 2 - AC_TESS_LEVEL_VOTE_LDS_BYTES; num_patches = MIN2(num_patches, target_lds_size / lds_per_patch); assert(num_patches * lds_per_patch <= max_lds_size); } diff --git a/src/amd/common/ac_shader_util.h b/src/amd/common/ac_shader_util.h index 3ede1af6bb4..e4dac45f70f 100644 --- a/src/amd/common/ac_shader_util.h +++ b/src/amd/common/ac_shader_util.h @@ -30,8 +30,8 @@ extern "C" { #define AC_SENDMSG_GS_OP_EMIT (2 << 4) #define AC_SENDMSG_GS_OP_EMIT_CUT (3 << 4) -/* Reserve this size at the beginning of LDS for the tf0/1 shader message group vote. */ -#define AC_HS_MSG_VOTE_LDS_BYTES 16 +/* Reserve this size at the beginning of LDS for the tess level group vote. */ +#define AC_TESS_LEVEL_VOTE_LDS_BYTES 16 /* An extension of gl_access_qualifier describing other aspects of memory operations * for code generation. diff --git a/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c b/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c index 6d49a34a6a2..0ed88c1c8a9 100644 --- a/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c +++ b/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c @@ -301,9 +301,8 @@ lower_ls_output_store(nir_builder *b, nir_def *off = nir_iadd_nuw(b, base_off_var, io_off); - /* The first vec4 is reserved for the tf0/1 shader message group vote. */ - if (st->gfx_level >= GFX11) - off = nir_iadd_imm_nuw(b, off, AC_HS_MSG_VOTE_LDS_BYTES); + /* The beginning of LDS is reserved for the tess level group vote. */ + off = nir_iadd_imm_nuw(b, off, AC_TESS_LEVEL_VOTE_LDS_BYTES); AC_NIR_STORE_IO(b, intrin->src[0].ssa, 0, write_mask, io_sem.high_16bits, nir_store_shared, off, .write_mask = store_write_mask, .base = store_const_offset); @@ -369,8 +368,8 @@ hs_per_vertex_input_lds_offset(nir_builder *b, nir_imm_int(b, 16u), 4u, mapped); nir_def *lds_offset = nir_iadd_nuw(b, nir_iadd_nuw(b, tcs_in_current_patch_offset, vertex_index_off), io_offset); - /* The first LDS vec4 is reserved for the tf0/1 shader message group vote. */ - return st->gfx_level >= GFX11 ? nir_iadd_imm_nuw(b, lds_offset, AC_HS_MSG_VOTE_LDS_BYTES) : lds_offset; + /* The beginning of LDS is reserved for the tess level group vote. */ + return nir_iadd_imm_nuw(b, lds_offset, AC_TESS_LEVEL_VOTE_LDS_BYTES); } static unsigned @@ -442,8 +441,8 @@ hs_output_lds_offset(nir_builder *b, lower_tess_io_state *st, unsigned location, lds_offset = nir_iadd_nuw(b, off, output_patch_offset); } - /* The first LDS vec4 is reserved for the tf0/1 shader message group vote. */ - return st->gfx_level >= GFX11 ? nir_iadd_imm_nuw(b, lds_offset, AC_HS_MSG_VOTE_LDS_BYTES) : lds_offset; + /* The beginning of LDS is reserved for the tess level group vote. */ + return nir_iadd_imm_nuw(b, lds_offset, AC_TESS_LEVEL_VOTE_LDS_BYTES); } static unsigned @@ -854,10 +853,14 @@ tess_level_has_effect(nir_builder *b, nir_def *prim_mode, unsigned comp, bool ou unreachable("invalid comp"); } -/* Return true if memory should be used. If false is returned, the shader message has been used. */ +#define VOTE_RESULT_NORMAL 0 /* execute output stores and tess factor stores */ +#define VOTE_RESULT_ALL_TF_ZERO 1 /* skip output stores, skip tess factor stores on GFX11+ */ +#define VOTE_RESULT_ALL_TF_ONE 2 /* execute output stores, skip tess factor stores on GFX11+ */ + +/* Return VOTE_RESULT_*. This also sends the HS_TESSFACTOR shader message on GFX11+. */ static nir_def * -hs_msg_group_vote_use_memory(nir_builder *b, lower_tess_io_state *st, - tess_levels *tessfactors, nir_def *prim_mode) +hs_tess_level_group_vote(nir_builder *b, lower_tess_io_state *st, + tess_levels *tessfactors, nir_def *prim_mode) { /* Don't do the group vote and send the message directly if tess level values were determined * by nir_gather_tcs_info at compile time. @@ -867,16 +870,20 @@ hs_msg_group_vote_use_memory(nir_builder *b, lower_tess_io_state *st, if (debug_get_bool_option("AMD_FAST_HS_MSG", true) && (st->tcs_info.all_tess_levels_are_effectively_zero || st->tcs_info.all_tess_levels_are_effectively_one)) { - nir_if *if_subgroup0 = nir_push_if(b, nir_ieq_imm(b, nir_load_subgroup_id(b), 0)); - { - /* m0[0] == 0 means all TF are 0 in the workgroup. - * m0[0] == 1 means all TF are 1 in the workgroup. - */ - nir_def *m0 = nir_imm_int(b, st->tcs_info.all_tess_levels_are_effectively_zero ? 0 : 1); - nir_sendmsg_amd(b, m0, .base = AC_SENDMSG_HS_TESSFACTOR); + if (st->gfx_level >= GFX11) { + nir_if *if_subgroup0 = nir_push_if(b, nir_ieq_imm(b, nir_load_subgroup_id(b), 0)); + { + /* m0[0] == 0 means all TF are 0 in the workgroup. + * m0[0] == 1 means all TF are 1 in the workgroup. + */ + nir_def *m0 = nir_imm_int(b, st->tcs_info.all_tess_levels_are_effectively_zero ? 0 : 1); + nir_sendmsg_amd(b, m0, .base = AC_SENDMSG_HS_TESSFACTOR); + } + nir_pop_if(b, if_subgroup0); } - nir_pop_if(b, if_subgroup0); - return nir_imm_false(b); + + return nir_imm_int(b, st->tcs_info.all_tess_levels_are_effectively_zero ? + VOTE_RESULT_ALL_TF_ZERO : VOTE_RESULT_ALL_TF_ONE); } /* Initialize the first LDS dword for the tf0/1 group vote at the beginning of TCS. */ @@ -1014,21 +1021,23 @@ hs_msg_group_vote_use_memory(nir_builder *b, lower_tess_io_state *st, lds_result = nir_if_phi(b, lds_result, nir_undef(b, 1, 32)); lds_result = nir_read_invocation(b, lds_result, nir_imm_int(b, 0)); - /* Determine the vote value and send the message. */ - nir_def *use_memory = nir_ieq_imm(b, lds_result, 0); + /* Send the message. */ + if (st->gfx_level >= GFX11) { + nir_def *use_memory = nir_ieq_imm(b, lds_result, 0); - nir_if *if_subgroup0_sendmsg = nir_push_if(b, nir_iand(b, nir_inot(b, use_memory), - nir_ieq_imm(b, nir_load_subgroup_id(b), 0))); - { - /* m0[0] == 0 means all TF are 0 in the workgroup. - * m0[0] == 1 means all TF are 1 in the workgroup. - */ - nir_def *m0 = nir_iadd_imm(b, lds_result, -1); - nir_sendmsg_amd(b, m0, .base = AC_SENDMSG_HS_TESSFACTOR); + nir_if *if_subgroup0_sendmsg = nir_push_if(b, nir_iand(b, nir_inot(b, use_memory), + nir_ieq_imm(b, nir_load_subgroup_id(b), 0))); + { + /* m0[0] == 0 means all TF are 0 in the workgroup. + * m0[0] == 1 means all TF are 1 in the workgroup. + */ + nir_def *m0 = nir_iadd_imm(b, lds_result, -1); + nir_sendmsg_amd(b, m0, .base = AC_SENDMSG_HS_TESSFACTOR); + } + nir_pop_if(b, if_subgroup0_sendmsg); } - nir_pop_if(b, if_subgroup0_sendmsg); - return use_memory; + return lds_result; } static void @@ -1156,12 +1165,8 @@ hs_finale(nir_shader *shader, lower_tess_io_state *st) } nir_def *prim_mode = nir_load_tcs_primitive_mode_amd(b); - nir_def *use_memory = NULL; tess_levels tessfactors = {0}; - - /* This also loads tess levels for patch invocation 0. */ - if (st->gfx_level >= GFX11) - use_memory = hs_msg_group_vote_use_memory(b, st, &tessfactors, prim_mode); + nir_def *vote_result = hs_tess_level_group_vote(b, st, &tessfactors, prim_mode); /* Only the 1st invocation of each patch needs to access VRAM and/or LDS. */ nir_if *if_invocation_id_zero = hs_if_invocation_id_zero(b); @@ -1170,8 +1175,8 @@ hs_finale(nir_shader *shader, lower_tess_io_state *st) tessfactors = hs_load_tess_levels(b, st); nir_if *if_use_memory = NULL; - if (use_memory != NULL) - if_use_memory = nir_push_if(b, use_memory); + if (st->gfx_level >= GFX11) + if_use_memory = nir_push_if(b, nir_ieq_imm(b, vote_result, VOTE_RESULT_NORMAL)); if (st->gfx_level <= GFX8) hs_store_dynamic_control_word_gfx6(b); @@ -1194,7 +1199,7 @@ hs_finale(nir_shader *shader, lower_tess_io_state *st) } nir_pop_if(b, if_triangles); - if (use_memory != NULL) + if (if_use_memory != NULL) nir_pop_if(b, if_use_memory); nir_if *if_tes_reads_tf = nir_push_if(b, nir_load_tcs_tess_levels_to_tes_amd(b)); @@ -1496,11 +1501,7 @@ ac_nir_compute_tess_wg_info(const struct radeon_info *info, uint64_t outputs_rea unsigned num_patches = ac_compute_num_tess_patches(info, num_tcs_input_cp, num_tcs_output_cp, num_mem_tcs_outputs, num_mem_tcs_patch_outputs, lds_per_patch, wave_size, tess_uses_primid); - unsigned lds_size = lds_per_patch * num_patches; - - /* The first vec4 is reserved for the tf0/1 shader message group vote. */ - if (info->gfx_level >= GFX11) - lds_size += AC_HS_MSG_VOTE_LDS_BYTES; + unsigned lds_size = lds_per_patch * num_patches + AC_TESS_LEVEL_VOTE_LDS_BYTES; /* SPI_SHADER_PGM_RSRC2_HS.LDS_SIZE specifies the allocation size only for LDS. The HS offchip * ring buffer always uses a fixed allocation size per workgroup determined by