tu: Don't disable UBWC for D24S8+USAGE_SAMPLED+customBorderColorWithoutFormat

Disabling UBWC for sampled D24S8 images whenever
customBorderColorWithoutFormat is enabled turned out to be a major
footgun, since it is not uncommon for apps to enable all the features
exposed by a driver. Having UBWC disabled for D24S8 can result in a major
performance loss, and the reason can be hard for devs to spot. This
footgun is already known to have happened a few times. Furthermore,
making UBWC depend on whether a Vulkan feature was requested broke D24S8
sharing via external memory when only one of the devices was created with
customBorderColorWithoutFormat.

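The underlying hardware deficiency is that, for custom border colors,
the hardware samples stencil as the green component, while Vulkan
requires it to be sampled as the red component. Fortunately, there is the
VkPhysicalDeviceMaintenance5Properties.depthStencilSwizzleOneSupport
property, which was added after this deficiency was found and, when
false, forbids the problematic state combination.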

To prevent the footgun described above, we now set
depthStencilSwizzleOneSupport to false by default. This allows UBWC to be
enabled for D24S8 in all cases while remaining conformant. For apps that
don't check depthStencilSwizzleOneSupport (currently only the ANGLE
translation layer), the tu_enable_d24s8_border_color_workaround driconf
option restores the previous workaround. It replaces the old
tu_disable_d24s8_border_color_workaround option.

One caveat is that we cannot use the fast border color HW feature for
D24S8+USAGE_SAMPLED+VK_FORMAT_UNDEFINED, so fast border colors are now
disabled for samplers with a VK_FORMAT_UNDEFINED format unless the new
tu_enable_fast_border_color_for_undefined_formats driconf toggle is set.
It is set for DXVK and vkd3d-proton since they are known not to use
border colors with D24S8. Lacking fast border colors is a much smaller
penalty than not having UBWC for D24S8.

For some context also see: https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/4346

This partially reverts 36916949.

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41514>
This commit is contained in:
Danylo Piliaiev 2026-05-12 17:38:59 +02:00 committed by Marge Bot
parent ea8de0742b
commit 9a247643eb
6 changed files with 59 additions and 22 deletions

View file

@ -2267,8 +2267,8 @@ tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs)
if (CHIP >= A8XX)
tu_cs_emit_regs(cs, SP_ALPHA_TEST_CNTL(CHIP));
tu_cs_emit_regs(cs, A6XX_TPL1_GFX_BORDER_COLOR_BASE(.qword = dev->global_bo->iova + gb_offset(bcolor)));
tu_cs_emit_regs(cs, A6XX_TPL1_CS_BORDER_COLOR_BASE(.qword = dev->global_bo->iova + gb_offset(bcolor)));
tu_cs_emit_regs(cs, A6XX_TPL1_GFX_BORDER_COLOR_BASE(.qword = dev->global_bo->iova + gb_offset(bcolor_builtin)));
tu_cs_emit_regs(cs, A6XX_TPL1_CS_BORDER_COLOR_BASE(.qword = dev->global_bo->iova + gb_offset(bcolor_builtin)));
/* BR-only registers */
/* non-ctx regs programmed by KMD (and blocked from UMD) on gen8+ */

View file

@ -25,6 +25,7 @@
#include "vk_debug_utils.h"
#include "vk_shader_module.h"
#include "vk_util.h"
#include "vk_sampler.h"
#include "common/freedreno_uuid.h"
#include "fdl/freedreno_layout.h"
@ -1435,7 +1436,8 @@ tu_get_properties(struct tu_physical_device *pdevice,
/* VK_KHR_maintenance5 */
props->earlyFragmentMultisampleCoverageAfterSampleCounting = true;
props->earlyFragmentSampleMaskTestBeforeSampleCounting = true;
props->depthStencilSwizzleOneSupport = true;
props->depthStencilSwizzleOneSupport =
pdevice->info->props.has_z24uint_s8uint && pdevice->instance->enable_d24s8_border_color_workaround;
props->polygonModePointSize = true;
props->nonStrictWideLinesUseParallelogram = false;
props->nonStrictSinglePixelWideLinesUseParallelogram = false;
@ -1852,7 +1854,8 @@ static const driOptionDescription tu_dri_options[] = {
DRI_CONF_DISABLE_CONSERVATIVE_LRZ(false)
DRI_CONF_TU_DONT_RESERVE_DESCRIPTOR_SET(false)
DRI_CONF_TU_ALLOW_OOB_INDIRECT_UBO_LOADS(false)
DRI_CONF_TU_DISABLE_D24S8_BORDER_COLOR_WORKAROUND(false)
DRI_CONF_TU_ENABLE_D24S8_BORDER_COLOR_WORKAROUND(false)
DRI_CONF_TU_ENABLE_FAST_BORDER_COLOR_FOR_UNDEFINED_FORMATS(false)
DRI_CONF_TU_USE_TEX_COORD_ROUND_NEAREST_EVEN_MODE(false)
DRI_CONF_TU_IGNORE_FRAG_DEPTH_DIRECTION(false)
DRI_CONF_TU_ENABLE_SOFTFLOAT32(false)
@ -1881,8 +1884,10 @@ tu_init_dri_options(struct tu_instance *instance)
!driQueryOptionb(&instance->dri_options, "tu_dont_reserve_descriptor_set");
instance->allow_oob_indirect_ubo_loads =
driQueryOptionb(&instance->dri_options, "tu_allow_oob_indirect_ubo_loads");
instance->disable_d24s8_border_color_workaround =
driQueryOptionb(&instance->dri_options, "tu_disable_d24s8_border_color_workaround");
instance->enable_d24s8_border_color_workaround =
driQueryOptionb(&instance->dri_options, "tu_enable_d24s8_border_color_workaround");
instance->enable_fast_border_color_for_undefined_formats =
driQueryOptionb(&instance->dri_options, "tu_enable_fast_border_color_for_undefined_formats");
instance->use_tex_coord_round_nearest_even_mode =
driQueryOptionb(&instance->dri_options, "tu_use_tex_coord_round_nearest_even_mode");
instance->ignore_frag_depth_direction =
@ -3042,6 +3047,12 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
global->zero_64b = 0;
for (int i = 0; i < TU_BORDER_COLOR_BUILTIN; i++) {
VkClearColorValue border_color = vk_border_color_value((VkBorderColor) i);
tu6_pack_border_color(&global->bcolor_builtin[i], &border_color,
vk_border_color_is_int((VkBorderColor) i));
}
/* initialize to ones so ffs can be used to find unused slots */
BITSET_ONES(device->custom_border_color);
@ -3139,7 +3150,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
device->use_z24uint_s8uint =
physical_device->info->props.has_z24uint_s8uint &&
(!border_color_without_format ||
physical_device->instance->disable_d24s8_border_color_workaround);
!physical_device->instance->enable_d24s8_border_color_workaround);
device->use_lrz = !TU_DEBUG_START(NOLRZ);
tu_gpu_tracepoint_config_variable();

View file

@ -35,6 +35,7 @@
#define TU_MAX_QUEUE_FAMILIES 2
#define TU_BORDER_COLOR_COUNT 4096
#define TU_BORDER_COLOR_BUILTIN 6
#define TU_BLIT_SHADER_SIZE 4096
@ -217,13 +218,25 @@ struct tu_instance
*/
bool allow_oob_indirect_ubo_loads;
/* DXVK and VKD3D-Proton use customBorderColorWithoutFormat
* and have most of D24S8 images with USAGE_SAMPLED, in such case we
* disable UBWC for correctness. However, games don't use border color for
* depth-stencil images. So we elect to ignore this edge case and force
* UBWC to be enabled.
/* The hardware doesn't support Vulkan's stencil swizzling rules for
* custom border colors. Vulkan requires stencil to be sampled as the red
* component, but hardware samples it as the green component. Without
* customBorderColorWithoutFormat we can work around this issue without
* perf loss, but with customBorderColorWithoutFormat we have to disable
* UBWC for D24S8 images with USAGE_SAMPLED set.
* However, VkPhysicalDeviceMaintenance5Properties.depthStencilSwizzleOneSupport
* forbids this state combination when false. It was added after the HW
* deficiency was discovered, and we want to work around apps that aren't
* aware of this.
*/
bool disable_d24s8_border_color_workaround;
bool enable_d24s8_border_color_workaround;
/* When D24S8 is used without enable_d24s8_border_color_workaround, the
* fast border color HW feature results in an incorrect color being used.
* However, we want to enable fast border colors for apps that are known
* not to use border colors with D24S8, such as DXVK and vkd3d-proton.
*/
bool enable_fast_border_color_for_undefined_formats;
/* D3D emulation requires texture coordinates to be rounded to nearest even value. */
bool use_tex_coord_round_nearest_even_mode;
@ -329,6 +342,7 @@ struct tu6_global
uint64_t preemption_latency_cmp_scratch;
uint64_t zero_64b;
struct bcolor_entry bcolor_builtin[TU_BORDER_COLOR_BUILTIN];
struct bcolor_entry bcolor[];
};
#define gb_offset(member) offsetof(struct tu6_global, member)

View file

@ -58,7 +58,9 @@ tu_CreateSampler(VkDevice _device,
tu6_pack_border_color(
&device->global_bo_map->bcolor[border_color], &color,
pCreateInfo->borderColor == VK_BORDER_COLOR_INT_CUSTOM_EXT);
} else {
border_color += TU_BORDER_COLOR_BUILTIN;
} else if (sampler->vk.format != VK_FORMAT_UNDEFINED ||
device->instance->enable_fast_border_color_for_undefined_formats) {
fast_border_color_enable = true;
switch (pCreateInfo->borderColor) {
case VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK:
@ -192,7 +194,8 @@ tu_DestroySampler(VkDevice _device,
pkt_field_get(A6XX_TEX_SAMP_2_BCOLOR, sampler->descriptor[2]);
}
if (!fast_border_color) {
if (!fast_border_color && border_color >= TU_BORDER_COLOR_BUILTIN) {
border_color -= TU_BORDER_COLOR_BUILTIN;
/* if the sampler had a custom border color, free it. TODO: no lock */
mtx_lock(&device->mutex);
assert(!BITSET_TEST(device->custom_border_color, border_color));

View file

@ -1402,11 +1402,9 @@ TODO: document the other workarounds.
<engine engine_name_match="DXVK|vkd3d">
<!--
DXVK and VKD3D-Proton use customBorderColorWithoutFormat and have most of
D24S8 images with USAGE_SAMPLED, in such case we disable UBWC for correctness.
However, games don't use border color for depth-stencil images. So we elect
to ignore this edge case and force UBWC to be enabled.
D24S8 images with USAGE_SAMPLED, but never sample them with a border color.
-->
<option name="tu_disable_d24s8_border_color_workaround" value="true" />
<option name="tu_enable_fast_border_color_for_undefined_formats" value="true" />
<!--
For sampling, Vulkan requires texture coordinates to be truncated, whereas
D3D requires them to be rounded to nearest even value. The former is used by
@ -1433,6 +1431,13 @@ TODO: document the other workarounds.
<engine engine_name_match="vkd3d">
<option name="tu_enable_softfloat32" value="true" />
</engine>
<engine engine_name_match="ANGLE">
<!--
ANGLE uses customBorderColorWithoutFormat on D24S8 and doesn't check
depthStencilSwizzleOneSupport.
-->
<option name="tu_enable_d24s8_border_color_workaround" value="true" />
</engine>
<application name="Sons Of The Forest" executable="SonsOfTheForest.exe">
<option name="tu_ignore_frag_depth_direction" value="true" />
</application>

View file

@ -648,9 +648,13 @@
DRI_CONF_OPT_B(tu_allow_oob_indirect_ubo_loads, def, \
"Some D3D11 games rely on out-of-bounds indirect UBO loads to return real values from underlying bound descriptor, this prevents us from lowering indirectly accessed UBOs to consts")
#define DRI_CONF_TU_DISABLE_D24S8_BORDER_COLOR_WORKAROUND(def) \
DRI_CONF_OPT_B(tu_disable_d24s8_border_color_workaround, def, \
"Use UBWC for D24S8 images with VK_IMAGE_USAGE_SAMPLED_BIT when customBorderColorWithoutFormat is enabled")
#define DRI_CONF_TU_ENABLE_D24S8_BORDER_COLOR_WORKAROUND(def) \
DRI_CONF_OPT_B(tu_enable_d24s8_border_color_workaround, def, \
"Disable UBWC for D24S8 images with VK_IMAGE_USAGE_SAMPLED_BIT when customBorderColorWithoutFormat is enabled")
#define DRI_CONF_TU_ENABLE_FAST_BORDER_COLOR_FOR_UNDEFINED_FORMATS(def) \
DRI_CONF_OPT_B(tu_enable_fast_border_color_for_undefined_formats, def, \
"Enables fast border color HW feature for VK_FORMAT_UNDEFINED sampler formats.")
#define DRI_CONF_TU_USE_TEX_COORD_ROUND_NEAREST_EVEN_MODE(def) \
DRI_CONF_OPT_B(tu_use_tex_coord_round_nearest_even_mode, def, \