From 6e080e4adf4d9b96351eb7f1284c2b887edb1874 Mon Sep 17 00:00:00 2001 From: Danylo Piliaiev Date: Wed, 6 May 2026 16:23:02 +0200 Subject: [PATCH] tu: Disable concurrent binning by default due to perf regressions Unfortunately, we have to disable concurrent binning by default because it hurts performance in a number of desktop games, and we know of no case where it helps. There are fewer vertex fetch resources available in BV than in BR, so when binning runs in BV on a workload with many attribute-heavy vertices, BV performs much worse than BR, sometimes more than 50% worse. Even that worse performance would not be a problem if concurrent binning actually overlapped with other work in those cases, but in desktop games there is almost never a chance for overlap. However, it's impossible to statically determine whether binning on BV would be much slower than on BR, and we also cannot statically predict whether there is enough overlap (if any) to cover the performance penalty. Given the above, I don't see a way out but to make concurrent binning opt-in via the `tu_allow_concurrent_binning` driconf toggle. Still allow concurrent binning in CI to catch issues early. 
Signed-off-by: Danylo Piliaiev --- src/freedreno/ci/deqp-freedreno-a750-vk.toml | 12 ++++++++++++ src/freedreno/vulkan/tu_cmd_buffer.cc | 6 +++--- src/freedreno/vulkan/tu_device.cc | 4 ++++ src/freedreno/vulkan/tu_device.h | 2 ++ src/freedreno/vulkan/tu_queue.cc | 2 +- src/util/driconf.h | 4 ++++ 6 files changed, 26 insertions(+), 4 deletions(-) diff --git a/src/freedreno/ci/deqp-freedreno-a750-vk.toml b/src/freedreno/ci/deqp-freedreno-a750-vk.toml index 5bd9483b38a..7a5118dfab5 100644 --- a/src/freedreno/ci/deqp-freedreno-a750-vk.toml +++ b/src/freedreno/ci/deqp-freedreno-a750-vk.toml @@ -31,6 +31,18 @@ tests_per_group = 10000 [deqp.env] TU_DEBUG = "gmem,unaligned_store" +# force-gmem with concurrent binning allowed to test concurrent binning +[[deqp]] +deqp = "/deqp-vk/external/vulkancts/modules/vulkan/deqp-vk" +caselists = ["/deqp-vk/mustpass/vk-main.txt"] +include = ["dEQP-VK.renderpass2.*"] +prefix = "gmem-cb-" +fraction = 20 +tests_per_group = 10000 +[deqp.env] +TU_DEBUG = "gmem" +tu_allow_concurrent_binning = "true" + # force-sysmem testing [[deqp]] deqp = "/deqp-vk/external/vulkancts/modules/vulkan/deqp-vk" diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index 9e00175d80f..a729e651b62 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -489,7 +489,7 @@ tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer) if ((flushes & TU_CMD_FLAG_WAIT_FOR_BR) && CHIP >= A7XX && !(cmd_buffer->state.pass && cmd_buffer->state.renderpass_cb_disabled) && - !TU_DEBUG(NO_CONCURRENT_BINNING)) { + cmd_buffer->device->instance->allow_concurrent_binning) { trace_start_concurrent_binning_barrier(&cmd_buffer->trace, cs, cmd_buffer); /* Wait-for-BR when repeated a lot of times per frame can add up @@ -3109,8 +3109,8 @@ tu7_emit_concurrent_binning_start(struct tu_cmd_buffer *cmd, tu7_cb_disable_reason( (!cmd->state.lrz.fast_clear && cmd->state.lrz.image_view), cmd, "LRZ fast clear disabled") || - 
tu7_cb_disable_reason(TU_DEBUG(NO_CONCURRENT_BINNING), cmd, - "TU_DEBUG(NO_CONCURRENT_BINNING)")) { + tu7_cb_disable_reason(!cmd->device->instance->allow_concurrent_binning, cmd, + "globally disabled")) { tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1); tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) | CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE); diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index 2e755551dca..b8c0c7851c9 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -1830,6 +1830,7 @@ static const driOptionDescription tu_dri_options[] = { DRI_CONF_VK_X11_STRICT_IMAGE_COUNT(false) DRI_CONF_VK_X11_ENSURE_MIN_IMAGE_COUNT(false) DRI_CONF_VK_XWAYLAND_WAIT_READY(false) + DRI_CONF_TU_ALLOW_CONCURRENT_BINNING(false) DRI_CONF_SECTION_END DRI_CONF_SECTION_DEBUG @@ -1884,6 +1885,9 @@ tu_init_dri_options(struct tu_instance *instance) driQueryOptionb(&instance->dri_options, "tu_emulate_alpha_to_coverage"); instance->autotune_algo = driQueryOptionstr(&instance->dri_options, "tu_autotune_algorithm"); + instance->allow_concurrent_binning = + (driQueryOptionb(&instance->dri_options, "tu_allow_concurrent_binning") && !TU_DEBUG(NO_CONCURRENT_BINNING)) || + TU_DEBUG(FORCE_CONCURRENT_BINNING); } static uint32_t instance_count = 0; diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index c9f521fcc15..e2168b91e08 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -240,6 +240,8 @@ struct tu_instance /* Configuration option to use a specific autotune algorithm by default. 
*/ const char *autotune_algo; + + bool allow_concurrent_binning; }; VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance, VK_OBJECT_TYPE_INSTANCE) diff --git a/src/freedreno/vulkan/tu_queue.cc b/src/freedreno/vulkan/tu_queue.cc index 828af5d4e98..2099c91f7c4 100644 --- a/src/freedreno/vulkan/tu_queue.cc +++ b/src/freedreno/vulkan/tu_queue.cc @@ -182,7 +182,7 @@ resolve_vis_stream_patchpoints(struct tu_queue *queue, * streams and therefore should be avoided. */ uint32_t min_vis_stream_count = - (TU_DEBUG(NO_CONCURRENT_BINNING) || dev->physical_device->info->chip < 7) ? + (!dev->instance->allow_concurrent_binning || dev->physical_device->info->chip < 7) ? 1 : MIN2(MAX2(rp_count, 1), TU_MAX_VIS_STREAMS); uint32_t vis_stream_count; diff --git a/src/util/driconf.h b/src/util/driconf.h index 98e53529ae7..fc395fdeb72 100644 --- a/src/util/driconf.h +++ b/src/util/driconf.h @@ -668,6 +668,10 @@ DRI_CONF_OPT_S_NODEF(tu_autotune_algorithm, \ "Set the preferred autotune algorithm") +#define DRI_CONF_TU_ALLOW_CONCURRENT_BINNING(def) \ + DRI_CONF_OPT_B(tu_allow_concurrent_binning, def, \ + "Allow concurrent binning on A7XX+, the CB is disabled by default because it regresses performance on desktop games") + /** * \brief Honeykrisp specific configuration options */