From 9a247643ebd0c8f9389c1c38870d4da94c29fdef Mon Sep 17 00:00:00 2001 From: Danylo Piliaiev Date: Tue, 12 May 2026 17:38:59 +0200 Subject: [PATCH] tu: Don't disable UBWC for D24S8+USAGE_SAMPLED+customBorderColorWithoutFormat Apparently, disabling UBWC in this case is a major footgun, since it is not uncommon for apps to enable all the features exposed by a driver. Having UBWC disabled for D24S8 can result in a major performance loss, and the reason can be hard for devs to spot. This footgun is already known to have happened a few times. Furthermore, disabling UBWC depending on a Vulkan feature being requested broke D24S8 sharing via external memory when only one device was created with customBorderColorWithoutFormat. Fortunately, there is the depthStencilSwizzleOneSupport property, which was added after the underlying hardware deficiency was found and, when false, forbids the problematic state combination. To prevent the footgun described above, we now set depthStencilSwizzleOneSupport to false by default. This allows UBWC to be enabled for D24S8 in all cases while remaining conformant. We also have the tu_enable_d24s8_border_color_workaround driconf option, which enables the previous workaround for apps that don't know about depthStencilSwizzleOneSupport (currently only the ANGLE translation layer). One caveat is that we cannot use the fast border color HW feature for D24S8+USAGE_SAMPLED+VK_FORMAT_UNDEFINED, so a new driconf toggle, tu_enable_fast_border_color_for_undefined_formats, is added. It is set for DXVK and vkd3d-proton since they are known not to use border colors with D24S8. Lacking fast border colors is a much smaller penalty than not having UBWC for D24S8. For some context, see also: https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/4346 This partially reverts 36916949. 
Signed-off-by: Danylo Piliaiev Part-of: --- src/freedreno/vulkan/tu_cmd_buffer.cc | 4 ++-- src/freedreno/vulkan/tu_device.cc | 21 ++++++++++++++++----- src/freedreno/vulkan/tu_device.h | 26 ++++++++++++++++++++------ src/freedreno/vulkan/tu_sampler.cc | 7 +++++-- src/util/00-mesa-defaults.conf | 13 +++++++++---- src/util/driconf.h | 10 +++++++--- 6 files changed, 59 insertions(+), 22 deletions(-) diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index 1905e41aac9..4e536057969 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -2267,8 +2267,8 @@ tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs) if (CHIP >= A8XX) tu_cs_emit_regs(cs, SP_ALPHA_TEST_CNTL(CHIP)); - tu_cs_emit_regs(cs, A6XX_TPL1_GFX_BORDER_COLOR_BASE(.qword = dev->global_bo->iova + gb_offset(bcolor))); - tu_cs_emit_regs(cs, A6XX_TPL1_CS_BORDER_COLOR_BASE(.qword = dev->global_bo->iova + gb_offset(bcolor))); + tu_cs_emit_regs(cs, A6XX_TPL1_GFX_BORDER_COLOR_BASE(.qword = dev->global_bo->iova + gb_offset(bcolor_builtin))); + tu_cs_emit_regs(cs, A6XX_TPL1_CS_BORDER_COLOR_BASE(.qword = dev->global_bo->iova + gb_offset(bcolor_builtin))); /* BR-only registers */ /* non-ctx regs programmed by KMD (and blocked from UMD) on gen8+ */ diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index 213d9587d68..afe8a26f001 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -25,6 +25,7 @@ #include "vk_debug_utils.h" #include "vk_shader_module.h" #include "vk_util.h" +#include "vk_sampler.h" #include "common/freedreno_uuid.h" #include "fdl/freedreno_layout.h" @@ -1435,7 +1436,8 @@ tu_get_properties(struct tu_physical_device *pdevice, /* VK_KHR_maintenance5 */ props->earlyFragmentMultisampleCoverageAfterSampleCounting = true; props->earlyFragmentSampleMaskTestBeforeSampleCounting = true; - props->depthStencilSwizzleOneSupport = true; + 
props->depthStencilSwizzleOneSupport = + pdevice->info->props.has_z24uint_s8uint && pdevice->instance->enable_d24s8_border_color_workaround; props->polygonModePointSize = true; props->nonStrictWideLinesUseParallelogram = false; props->nonStrictSinglePixelWideLinesUseParallelogram = false; @@ -1852,7 +1854,8 @@ static const driOptionDescription tu_dri_options[] = { DRI_CONF_DISABLE_CONSERVATIVE_LRZ(false) DRI_CONF_TU_DONT_RESERVE_DESCRIPTOR_SET(false) DRI_CONF_TU_ALLOW_OOB_INDIRECT_UBO_LOADS(false) - DRI_CONF_TU_DISABLE_D24S8_BORDER_COLOR_WORKAROUND(false) + DRI_CONF_TU_ENABLE_D24S8_BORDER_COLOR_WORKAROUND(false) + DRI_CONF_TU_ENABLE_FAST_BORDER_COLOR_FOR_UNDEFINED_FORMATS(false) DRI_CONF_TU_USE_TEX_COORD_ROUND_NEAREST_EVEN_MODE(false) DRI_CONF_TU_IGNORE_FRAG_DEPTH_DIRECTION(false) DRI_CONF_TU_ENABLE_SOFTFLOAT32(false) @@ -1881,8 +1884,10 @@ tu_init_dri_options(struct tu_instance *instance) !driQueryOptionb(&instance->dri_options, "tu_dont_reserve_descriptor_set"); instance->allow_oob_indirect_ubo_loads = driQueryOptionb(&instance->dri_options, "tu_allow_oob_indirect_ubo_loads"); - instance->disable_d24s8_border_color_workaround = - driQueryOptionb(&instance->dri_options, "tu_disable_d24s8_border_color_workaround"); + instance->enable_d24s8_border_color_workaround = + driQueryOptionb(&instance->dri_options, "tu_enable_d24s8_border_color_workaround"); + instance->enable_fast_border_color_for_undefined_formats = + driQueryOptionb(&instance->dri_options, "tu_enable_fast_border_color_for_undefined_formats"); instance->use_tex_coord_round_nearest_even_mode = driQueryOptionb(&instance->dri_options, "tu_use_tex_coord_round_nearest_even_mode"); instance->ignore_frag_depth_direction = @@ -3042,6 +3047,12 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, global->zero_64b = 0; + for (int i = 0; i < TU_BORDER_COLOR_BUILTIN; i++) { + VkClearColorValue border_color = vk_border_color_value((VkBorderColor) i); + tu6_pack_border_color(&global->bcolor_builtin[i], &border_color, + 
vk_border_color_is_int((VkBorderColor) i)); + } + /* initialize to ones so ffs can be used to find unused slots */ BITSET_ONES(device->custom_border_color); @@ -3139,7 +3150,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, device->use_z24uint_s8uint = physical_device->info->props.has_z24uint_s8uint && (!border_color_without_format || - physical_device->instance->disable_d24s8_border_color_workaround); + !physical_device->instance->enable_d24s8_border_color_workaround); device->use_lrz = !TU_DEBUG_START(NOLRZ); tu_gpu_tracepoint_config_variable(); diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index e5e58c99e2b..7083561ac31 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -35,6 +35,7 @@ #define TU_MAX_QUEUE_FAMILIES 2 #define TU_BORDER_COLOR_COUNT 4096 +#define TU_BORDER_COLOR_BUILTIN 6 #define TU_BLIT_SHADER_SIZE 4096 @@ -217,13 +218,25 @@ struct tu_instance */ bool allow_oob_indirect_ubo_loads; - /* DXVK and VKD3D-Proton use customBorderColorWithoutFormat - * and have most of D24S8 images with USAGE_SAMPLED, in such case we - * disable UBWC for correctness. However, games don't use border color for - * depth-stencil images. So we elect to ignore this edge case and force - * UBWC to be enabled. + /* The hardware doesn't support Vulkan's stencil swizzling rules for + * custom border colors. Vulkan requires stencil to be sampled as the red + * component, but hardware samples it as the green component. Without + * customBorderColorWithoutFormat we can work around this issue without + * perf loss, but with customBorderColorWithoutFormat we have to disable + * UBWC for D24S8 images with USAGE_SAMPLED set. + * However, VkPhysicalDeviceMaintenance5Properties.depthStencilSwizzleOneSupport + * forbids this state combination when false. It was added after the HW + * deficiency was discovered, and we want to work around apps that aren't + * aware of this. 
*/ - bool disable_d24s8_border_color_workaround; + bool enable_d24s8_border_color_workaround; + + /* When D24S8 is used without enable_d24s8_border_color_workaround, the + * fast border color HW feature results in an incorrect color being used. + * However, we want to enable fast border colors for apps that are known + * not to use border colors with D24S8, such as DXVK and vkd3d-proton. + */ + bool enable_fast_border_color_for_undefined_formats; /* D3D emulation requires texture coordinates to be rounded to nearest even value. */ bool use_tex_coord_round_nearest_even_mode; @@ -329,6 +342,7 @@ struct tu6_global uint64_t preemption_latency_cmp_scratch; uint64_t zero_64b; + struct bcolor_entry bcolor_builtin[TU_BORDER_COLOR_BUILTIN]; struct bcolor_entry bcolor[]; }; #define gb_offset(member) offsetof(struct tu6_global, member) diff --git a/src/freedreno/vulkan/tu_sampler.cc b/src/freedreno/vulkan/tu_sampler.cc index b6138206c6a..7a7710ce9e6 100644 --- a/src/freedreno/vulkan/tu_sampler.cc +++ b/src/freedreno/vulkan/tu_sampler.cc @@ -58,7 +58,9 @@ tu_CreateSampler(VkDevice _device, tu6_pack_border_color( &device->global_bo_map->bcolor[border_color], &color, pCreateInfo->borderColor == VK_BORDER_COLOR_INT_CUSTOM_EXT); - } else { + border_color += TU_BORDER_COLOR_BUILTIN; + } else if (sampler->vk.format != VK_FORMAT_UNDEFINED || + device->instance->enable_fast_border_color_for_undefined_formats) { fast_border_color_enable = true; switch (pCreateInfo->borderColor) { case VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK: @@ -192,7 +194,8 @@ tu_DestroySampler(VkDevice _device, pkt_field_get(A6XX_TEX_SAMP_2_BCOLOR, sampler->descriptor[2]); } - if (!fast_border_color) { + if (!fast_border_color && border_color >= TU_BORDER_COLOR_BUILTIN) { + border_color -= TU_BORDER_COLOR_BUILTIN; /* if the sampler had a custom border color, free it. 
TODO: no lock */ mtx_lock(&device->mutex); assert(!BITSET_TEST(device->custom_border_color, border_color)); diff --git a/src/util/00-mesa-defaults.conf b/src/util/00-mesa-defaults.conf index c5eca519f84..53a0e46f2f3 100644 --- a/src/util/00-mesa-defaults.conf +++ b/src/util/00-mesa-defaults.conf @@ -1402,11 +1402,9 @@ TODO: document the other workarounds. - diff --git a/src/util/driconf.h b/src/util/driconf.h index ca447061760..20de0497ccb 100644 --- a/src/util/driconf.h +++ b/src/util/driconf.h @@ -648,9 +648,13 @@ DRI_CONF_OPT_B(tu_allow_oob_indirect_ubo_loads, def, \ "Some D3D11 games rely on out-of-bounds indirect UBO loads to return real values from underlying bound descriptor, this prevents us from lowering indirectly accessed UBOs to consts") -#define DRI_CONF_TU_DISABLE_D24S8_BORDER_COLOR_WORKAROUND(def) \ - DRI_CONF_OPT_B(tu_disable_d24s8_border_color_workaround, def, \ - "Use UBWC for D24S8 images with VK_IMAGE_USAGE_SAMPLED_BIT when customBorderColorWithoutFormat is enabled") +#define DRI_CONF_TU_ENABLE_D24S8_BORDER_COLOR_WORKAROUND(def) \ + DRI_CONF_OPT_B(tu_enable_d24s8_border_color_workaround, def, \ + "Disable UBWC for D24S8 images with VK_IMAGE_USAGE_SAMPLED_BIT when customBorderColorWithoutFormat is enabled") + +#define DRI_CONF_TU_ENABLE_FAST_BORDER_COLOR_FOR_UNDEFINED_FORMATS(def) \ + DRI_CONF_OPT_B(tu_enable_fast_border_color_for_undefined_formats, def, \ + "Enables fast border color HW feature for VK_FORMAT_UNDEFINED sampler formats.") #define DRI_CONF_TU_USE_TEX_COORD_ROUND_NEAREST_EVEN_MODE(def) \ DRI_CONF_OPT_B(tu_use_tex_coord_round_nearest_even_mode, def, \