From 5529f15f31b0cd9c13f611148fcf9b272efd6b56 Mon Sep 17 00:00:00 2001 From: Danylo Piliaiev Date: Wed, 6 May 2026 16:23:02 +0200 Subject: [PATCH] tu: Disable concurrent binning by default due to perf regressions Unfortunately we have to disable concurrent binning by default because it hurts performance in a number of desktop games, and we know of no case where it helps. There are fewer vertex fetch resources available in BV compared to BR, so when binning runs in BV, there are many vertices, and the vertices are attribute-heavy, BV has much worse performance than BR, sometimes more than 50% worse. Even with worse performance it wouldn't be bad if concurrent binning actually overlapped with other workloads in those cases, but in the case of desktop games there is almost never a chance for overlap. However, it's impossible to statically find out if binning on BV would be much slower than on BR, and we also cannot statically predict if there is enough overlap (if any) to cover for the performance penalty. Given the above, I don't see a way out but to make concurrent binning opt-in via the `tu_allow_concurrent_binning` driconf toggle. Still allow concurrent binning in CI to catch issues early. 
Signed-off-by: Danylo Piliaiev Part-of: --- src/freedreno/ci/deqp-freedreno-a750-vk.toml | 12 ++++++++++++ src/freedreno/vulkan/tu_cmd_buffer.cc | 6 +++--- src/freedreno/vulkan/tu_device.cc | 4 ++++ src/freedreno/vulkan/tu_device.h | 2 ++ src/freedreno/vulkan/tu_queue.cc | 2 +- src/util/driconf.h | 5 +++++ 6 files changed, 27 insertions(+), 4 deletions(-) diff --git a/src/freedreno/ci/deqp-freedreno-a750-vk.toml b/src/freedreno/ci/deqp-freedreno-a750-vk.toml index 5bd9483b38a..7a5118dfab5 100644 --- a/src/freedreno/ci/deqp-freedreno-a750-vk.toml +++ b/src/freedreno/ci/deqp-freedreno-a750-vk.toml @@ -31,6 +31,18 @@ tests_per_group = 10000 [deqp.env] TU_DEBUG = "gmem,unaligned_store" +# force-gmem with concurrent binning allowed to test concurrent binning +[[deqp]] +deqp = "/deqp-vk/external/vulkancts/modules/vulkan/deqp-vk" +caselists = ["/deqp-vk/mustpass/vk-main.txt"] +include = ["dEQP-VK.renderpass2.*"] +prefix = "gmem-cb-" +fraction = 20 +tests_per_group = 10000 +[deqp.env] +TU_DEBUG = "gmem" +tu_allow_concurrent_binning = "true" + # force-sysmem testing [[deqp]] deqp = "/deqp-vk/external/vulkancts/modules/vulkan/deqp-vk" diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index 4e536057969..9d503edee9b 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -505,7 +505,7 @@ tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer) if ((flushes & TU_CMD_FLAG_WAIT_FOR_BR) && CHIP >= A7XX && !(cmd_buffer->state.pass && cmd_buffer->state.renderpass_cb_disabled) && - !TU_DEBUG(NO_CONCURRENT_BINNING)) { + cmd_buffer->device->instance->allow_concurrent_binning) { trace_start_concurrent_binning_barrier(&cmd_buffer->trace, cs, cmd_buffer); /* Wait-for-BR when repeated a lot of times per frame can add up @@ -3125,8 +3125,8 @@ tu7_emit_concurrent_binning_start(struct tu_cmd_buffer *cmd, tu7_cb_disable_reason( (!cmd->state.lrz.fast_clear && cmd->state.lrz.image_view), cmd, "LRZ fast clear 
disabled") || - tu7_cb_disable_reason(TU_DEBUG(NO_CONCURRENT_BINNING), cmd, - "TU_DEBUG(NO_CONCURRENT_BINNING)")) { + tu7_cb_disable_reason(!cmd->device->instance->allow_concurrent_binning, cmd, + "globally disabled")) { tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1); tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) | CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE); diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index afe8a26f001..013500c3a2b 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -1840,6 +1840,7 @@ static const driOptionDescription tu_dri_options[] = { DRI_CONF_VK_X11_STRICT_IMAGE_COUNT(false) DRI_CONF_VK_X11_ENSURE_MIN_IMAGE_COUNT(false) DRI_CONF_VK_XWAYLAND_WAIT_READY(false) + DRI_CONF_TU_ALLOW_CONCURRENT_BINNING(false) DRI_CONF_SECTION_END DRI_CONF_SECTION_DEBUG @@ -1900,6 +1901,9 @@ tu_init_dri_options(struct tu_instance *instance) driQueryOptionstr(&instance->dri_options, "tu_autotune_algorithm"); instance->override_uncached_as_cache_coherent = driQueryOptionb(&instance->dri_options, "tu_override_uncached_as_cache_coherent"); + instance->allow_concurrent_binning = + (driQueryOptionb(&instance->dri_options, "tu_allow_concurrent_binning") && !TU_DEBUG(NO_CONCURRENT_BINNING)) || + TU_DEBUG(FORCE_CONCURRENT_BINNING); } static uint32_t instance_count = 0; diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index 7083561ac31..db22f014fbc 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -262,6 +262,8 @@ struct tu_instance * with cached+coherent+host_visible when the hardware supports it. 
*/ bool override_uncached_as_cache_coherent; + + bool allow_concurrent_binning; }; VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance, VK_OBJECT_TYPE_INSTANCE) diff --git a/src/freedreno/vulkan/tu_queue.cc b/src/freedreno/vulkan/tu_queue.cc index 828af5d4e98..2099c91f7c4 100644 --- a/src/freedreno/vulkan/tu_queue.cc +++ b/src/freedreno/vulkan/tu_queue.cc @@ -182,7 +182,7 @@ resolve_vis_stream_patchpoints(struct tu_queue *queue, * streams and therefore should be avoided. */ uint32_t min_vis_stream_count = - (TU_DEBUG(NO_CONCURRENT_BINNING) || dev->physical_device->info->chip < 7) ? + (!dev->instance->allow_concurrent_binning || dev->physical_device->info->chip < 7) ? 1 : MIN2(MAX2(rp_count, 1), TU_MAX_VIS_STREAMS); uint32_t vis_stream_count; diff --git a/src/util/driconf.h b/src/util/driconf.h index 20de0497ccb..b1d6eafd826 100644 --- a/src/util/driconf.h +++ b/src/util/driconf.h @@ -679,6 +679,11 @@ #define DRI_CONF_TU_OVERRIDE_UNCACHED_AS_CACHE_COHERENT(def) \ DRI_CONF_OPT_B(tu_override_uncached_as_cache_coherent, def, \ "Replaces uncached-host allocations with cached-coherent-host when possible. Only useful under x86 emulation where memory accesses tend to be atomic") + +#define DRI_CONF_TU_ALLOW_CONCURRENT_BINNING(def) \ + DRI_CONF_OPT_B(tu_allow_concurrent_binning, def, \ + "Allow concurrent binning on A7XX+, the CB is disabled by default because it regresses performance on desktop games") + /** * \brief Honeykrisp specific configuration options */