From 8a5178755e3566435eb80cf4aac69a0d1389eb38 Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Tue, 26 Aug 2025 10:02:16 +0200 Subject: [PATCH] radv: reorder cmat properties according to performance On GFX12, int8 is twice as fast as fp16/bf16. On GFX11, they have the same throughput, but int8 at least still uses fewer registers. Also reorder 16-bit accumulators before 32-bit, because they use fewer registers on GFX12. Reviewed-by: Rhys Perry Part-of: --- src/amd/vulkan/radv_physical_device.c | 31 ++++++++++++++++----------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/src/amd/vulkan/radv_physical_device.c b/src/amd/vulkan/radv_physical_device.c index b2ce5985374..ba1f2dd69b4 100644 --- a/src/amd/vulkan/radv_physical_device.c +++ b/src/amd/vulkan/radv_physical_device.c @@ -3054,6 +3054,11 @@ static void fill_array_sizes_structs(const struct radv_physical_device *pdev, struct __vk_outarray *base, void (*array_size_cb)(struct __vk_outarray *base, struct matrix_prop *prop)) { + /* The Vulkan spec says: + * If some types are preferred over other types (e.g. for performance), + * they should appear earlier in the list enumerated by + * vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR. + */ struct matrix_prop prop; if (pdev->info.gfx_level >= GFX12) { @@ -3068,19 +3073,6 @@ static void fill_array_sizes_structs(const struct radv_physical_device *pdev, } } - for (unsigned bfloat = 0; bfloat < 2; bfloat++) { - for (unsigned fp32 = 0; fp32 < 2; fp32++) { - prop.saturate = false; - prop.a_type = prop.b_type = bfloat ? VK_COMPONENT_TYPE_BFLOAT16_KHR : VK_COMPONENT_TYPE_FLOAT16_KHR; - prop.c_type = prop.r_type = fp32 ? VK_COMPONENT_TYPE_FLOAT32_KHR : prop.a_type; - - if (pdev->info.gfx_level < GFX12 && bfloat) - continue; /* BF16 isn't working precisely on GFX11. 
*/ - - (*array_size_cb)(base, &prop); - } - } - for (unsigned asigned = 0; asigned < 2; asigned++) { for (unsigned bsigned = 0; bsigned < 2; bsigned++) { for (unsigned csigned = 0; csigned < 2; csigned++) { @@ -3098,6 +3090,19 @@ static void fill_array_sizes_structs(const struct radv_physical_device *pdev, } } } + + for (unsigned fp32 = 0; fp32 < 2; fp32++) { + for (unsigned bfloat = 0; bfloat < 2; bfloat++) { + prop.saturate = false; + prop.a_type = prop.b_type = bfloat ? VK_COMPONENT_TYPE_BFLOAT16_KHR : VK_COMPONENT_TYPE_FLOAT16_KHR; + prop.c_type = prop.r_type = fp32 ? VK_COMPONENT_TYPE_FLOAT32_KHR : prop.a_type; + + if (pdev->info.gfx_level < GFX12 && bfloat) + continue; /* BF16 isn't working precisely on GFX11. */ + + (*array_size_cb)(base, &prop); + } + } } VKAPI_ATTR VkResult VKAPI_CALL