radv: use CU mode when LDS is used

This improves performance of llama.cpp.

fossil-db (navi21):
Totals from 1598 (2.00% of 79825) affected shaders:
MaxWaves: 30182 -> 29278 (-3.00%); split: +0.04%, -3.03%
Instrs: 1013136 -> 1013065 (-0.01%); split: -0.07%, +0.07%
CodeSize: 5275876 -> 5274948 (-0.02%); split: -0.06%, +0.04%
VGPRs: 86176 -> 88016 (+2.14%); split: -0.22%, +2.36%
SpillVGPRs: 0 -> 11 (+inf%)
Scratch: 0 -> 4096 (+inf%)
Latency: 7954289 -> 7824742 (-1.63%); split: -1.64%, +0.01%
InvThroughput: 1511429 -> 1510912 (-0.03%); split: -0.89%, +0.86%
VClause: 26503 -> 26460 (-0.16%); split: -0.23%, +0.07%
SClause: 19032 -> 19039 (+0.04%); split: -0.01%, +0.05%
Copies: 74577 -> 74329 (-0.33%); split: -0.79%, +0.46%
Branches: 20278 -> 20279 (+0.00%)
VALU: 665079 -> 664831 (-0.04%); split: -0.09%, +0.05%
SALU: 124899 -> 124818 (-0.06%); split: -0.08%, +0.01%
VMEM: 46141 -> 46163 (+0.05%)

fossil-db (navi31):
Totals from 1609 (2.02% of 79825) affected shaders:
MaxWaves: 39724 -> 38880 (-2.12%)
Instrs: 1147767 -> 1147595 (-0.01%); split: -0.04%, +0.03%
CodeSize: 5777072 -> 5776376 (-0.01%); split: -0.03%, +0.02%
VGPRs: 91752 -> 93132 (+1.50%); split: -0.03%, +1.53%
Latency: 7526930 -> 7396201 (-1.74%); split: -1.74%, +0.00%
InvThroughput: 1083131 -> 1088328 (+0.48%); split: -0.45%, +0.93%
VClause: 25864 -> 25789 (-0.29%); split: -0.33%, +0.04%
SClause: 19136 -> 19135 (-0.01%); split: -0.02%, +0.01%
Copies: 80797 -> 80501 (-0.37%); split: -0.42%, +0.05%
VALU: 674455 -> 674160 (-0.04%); split: -0.05%, +0.01%
SALU: 123849 -> 123806 (-0.03%)

fossil-db (gfx1201):
Totals from 1614 (2.02% of 79839) affected shaders:
MaxWaves: 40140 -> 39296 (-2.10%)
Instrs: 1183227 -> 1183102 (-0.01%); split: -0.04%, +0.03%
CodeSize: 6091060 -> 6090636 (-0.01%); split: -0.03%, +0.03%
VGPRs: 90708 -> 92040 (+1.47%); split: -0.01%, +1.48%
Latency: 7588683 -> 7425866 (-2.15%); split: -2.15%, +0.00%
InvThroughput: 1070469 -> 1075700 (+0.49%); split: -0.50%, +0.99%
VClause: 25691 -> 25597 (-0.37%); split: -0.37%, +0.00%
SClause: 19095 -> 19086 (-0.05%); split: -0.05%, +0.01%
Copies: 80753 -> 80452 (-0.37%); split: -0.42%, +0.05%
VALU: 665218 -> 664922 (-0.04%); split: -0.05%, +0.01%
SALU: 144059 -> 144011 (-0.03%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37791>
This commit is contained in:
Rhys Perry 2025-10-14 20:18:11 +01:00
parent 3e9921f52e
commit dd2f34c777
2 changed files with 19 additions and 16 deletions

View file

@ -57,7 +57,7 @@ create_llvm_function(struct ac_llvm_context *ctx, LLVMModuleRef module, LLVMBuil
}
ac_llvm_set_workgroup_size(main_function.value, max_workgroup_size);
ac_llvm_set_target_features(main_function.value, ctx, true);
ac_llvm_set_target_features(main_function.value, ctx, options->wgp_mode);
return main_function;
}

View file

@ -1494,18 +1494,25 @@ radv_should_use_wgp_mode(const struct radv_device *device, mesa_shader_stage sta
{
const struct radv_physical_device *pdev = radv_device_physical(device);
enum amd_gfx_level chip = pdev->info.gfx_level;
switch (stage) {
case MESA_SHADER_COMPUTE:
case MESA_SHADER_TESS_CTRL:
return chip >= GFX10;
case MESA_SHADER_GEOMETRY:
return chip == GFX10 || (chip >= GFX10_3 && !info->is_ngg);
case MESA_SHADER_VERTEX:
case MESA_SHADER_TESS_EVAL:
return chip == GFX10 && info->is_ngg;
default:
if (chip < GFX10)
return false;
}
/* Disable the WGP mode on gfx10.3 because it can hang. (it
* happened on VanGogh) Let's disable it on all chips that
* disable exactly 1 CU per SA for GS.
*/
if (chip > GFX10 && info->is_ngg)
return false;
if (stage == MESA_SHADER_MESH || stage == MESA_SHADER_TASK || stage == MESA_SHADER_FRAGMENT)
return false;
/* VS+TCS programs might have an unknown LDS size if the input patch size is dynamic. */
bool uses_lds = radv_calculate_lds_size(info, chip) > 0 ||
(stage == MESA_SHADER_TESS_CTRL && !info->num_tess_patches);
/* LDS is faster with CU mode. */
return !uses_lds;
}
#if defined(USE_LIBELF)
@ -2344,10 +2351,6 @@ radv_postprocess_binary_config(struct radv_device *device, struct radv_shader_bi
}
}
/* Disable the WGP mode on gfx10.3 because it can hang. (it
* happened on VanGogh) Let's disable it on all chips that
* disable exactly 1 CU per SA for GS.
*/
config->rsrc1 |= S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt) | S_00B228_WGP_MODE(config->wgp_mode);
config->rsrc2 |= S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) | S_00B22C_LDS_SIZE(lds_alloc) |
S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL);