From 36a9593f01c792e21d4e7c9c70c6d5d7d4817551 Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Wed, 15 Apr 2026 17:12:57 +0200 Subject: [PATCH] tu: Add option to raise the maximum SSBO size Emulates SSBOs via 3D image access; the real SSBO size and start offset (due to image alignment requirements) are stored in the descriptor and accessed via resbase. This also disables storage_16bit and storage_8bit to simplify SSBO emulation, since no known D3D12 games that need SSBO emulation require those capabilities. The proprietary driver has a more complex solution which involves uniform branching on buffer size and using ldg for over-the-limit buffers. --- src/freedreno/vulkan/tu_descriptor_set.cc | 19 ++++++ src/freedreno/vulkan/tu_device.cc | 20 +++++- src/freedreno/vulkan/tu_device.h | 4 +- src/freedreno/vulkan/tu_shader.cc | 77 +++++++++++++++++++++++ src/util/driconf.h | 4 ++ 5 files changed, 120 insertions(+), 4 deletions(-) diff --git a/src/freedreno/vulkan/tu_descriptor_set.cc b/src/freedreno/vulkan/tu_descriptor_set.cc index 636a97440cc..ff3f3d2a0a3 100644 --- a/src/freedreno/vulkan/tu_descriptor_set.cc +++ b/src/freedreno/vulkan/tu_descriptor_set.cc @@ -59,6 +59,13 @@ descriptor_size(struct tu_device *dev, return FDL6_TEX_CONST_DWORDS * 4 * (subsampled ? 3 : 2); case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + if (dev->physical_device->enable_ssbo_emulation) { + /* With SSBO emulation, use a single R32_UINT emulated 2D + * descriptor instead of multiple format-specific buffer + * descriptors. + */ + return FDL6_TEX_CONST_DWORDS * 4; + } /* isam.v allows using a single 16-bit descriptor for both 16-bit and * 32-bit loads. If not available but 16-bit storage is still supported, * two separate descriptors are required. 
@@ -1108,6 +1115,18 @@ write_buffer_descriptor_addr(const struct tu_device *device, const VkDescriptorAddressInfoEXT *buffer_info) { const struct fd_dev_info *info = device->physical_device->info; + + if (device->physical_device->enable_ssbo_emulation) { + memset(dst, 0, FDL6_TEX_CONST_DWORDS * sizeof(uint32_t)); + if (!buffer_info || buffer_info->address == 0) + return; + uint32_t blocksize_B = util_format_get_blocksize(PIPE_FORMAT_R32_UINT); + uint32_t elements = blocksize_B ? (buffer_info->range / blocksize_B) : 0; + write_emulated_texel_buffer_descriptor_common( + dst, PIPE_FORMAT_R32_UINT, buffer_info->address, elements); + return; + } + /* This prevents any misconfiguration, but 16-bit descriptor capable of both * 16-bit and 32-bit access through isam.v will of course only be functional * when 16-bit storage is supported. */ diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index 70ccc2c7eeb..657fe5e7fff 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -75,6 +75,8 @@ tu_device_get_cache_uuid(struct tu_physical_device *device, void *uuid) sizeof(device->instance->allow_oob_indirect_ubo_loads)); _mesa_blake3_update(&ctx, &device->enable_texel_buffer_emulation, sizeof(device->enable_texel_buffer_emulation)); + _mesa_blake3_update(&ctx, &device->enable_ssbo_emulation, + sizeof(device->enable_ssbo_emulation)); memcpy(uuid, blake3, VK_UUID_SIZE); return 0; @@ -168,8 +170,10 @@ get_device_extensions(const struct tu_physical_device *device, (!device->info->props.has_sw_fuse || device->has_raytracing); *ext = (struct vk_device_extension_table) { .table = { - .KHR_8bit_storage = device->info->props.storage_8bit, - .KHR_16bit_storage = device->info->props.storage_16bit, + .KHR_8bit_storage = device->info->props.storage_8bit && + !device->enable_ssbo_emulation, + .KHR_16bit_storage = device->info->props.storage_16bit && + !device->enable_ssbo_emulation, .KHR_acceleration_structure = has_raytracing, 
.KHR_bind_memory2 = true, .KHR_buffer_device_address = true, @@ -1115,7 +1119,10 @@ tu_get_properties(struct tu_physical_device *pdevice, props->maxTexelBufferElements = pdevice->enable_texel_buffer_emulation ? TU_MAX_EMULATED_TEXEL_ELEMENTS : MAX_TEXEL_ELEMENTS; props->maxUniformBufferRange = MAX_UNIFORM_BUFFER_RANGE; - props->maxStorageBufferRange = MAX_STORAGE_BUFFER_RANGE; + props->maxStorageBufferRange = + pdevice->enable_ssbo_emulation + ? TU_MAX_EMULATED_TEXEL_ELEMENTS * 4 + : MAX_STORAGE_BUFFER_RANGE; props->maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE; props->maxMemoryAllocationCount = UINT32_MAX; props->maxSamplerAllocationCount = 64 * 1024; @@ -1418,6 +1425,9 @@ tu_get_properties(struct tu_physical_device *pdevice, props->storageBufferDescriptorSize = FDL6_TEX_CONST_DWORDS * 4 * (1 + COND(pdevice->info->props.storage_16bit && !pdevice->info->props.has_isam_v, 1) + COND(pdevice->info->props.storage_8bit, 1)); + if (pdevice->enable_ssbo_emulation) { + props->storageBufferDescriptorSize = FDL6_TEX_CONST_DWORDS * 4; + } props->robustStorageBufferDescriptorSize = props->storageBufferDescriptorSize; props->accelerationStructureDescriptorSize = 4 * FDL6_TEX_CONST_DWORDS; @@ -1685,6 +1695,7 @@ tu_physical_device_init(struct tu_physical_device *device, if (fd_dev_gen(&device->dev_id) >= 7) { device->enable_texel_buffer_emulation = instance->enable_texel_buffer_emulation; + device->enable_ssbo_emulation = instance->enable_ssbo_emulation; } device->memory.type_count = 1; @@ -1851,6 +1862,7 @@ static const driOptionDescription tu_dri_options[] = { DRI_CONF_TU_EMULATE_ALPHA_TO_COVERAGE(false) DRI_CONF_TU_AUTOTUNE_ALGORITHM() DRI_CONF_TU_ENABLE_TEXEL_BUFFER_EMULATION(false) + DRI_CONF_TU_ENABLE_SSBO_EMULATION(false) DRI_CONF_SECTION_END }; @@ -1887,6 +1899,8 @@ tu_init_dri_options(struct tu_instance *instance) driQueryOptionstr(&instance->dri_options, "tu_autotune_algorithm"); instance->enable_texel_buffer_emulation = driQueryOptionb(&instance->dri_options, 
"tu_enable_texel_buffer_emulation"); + instance->enable_ssbo_emulation = + driQueryOptionb(&instance->dri_options, "tu_enable_ssbo_emulation"); } static uint32_t instance_count = 0; diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index c7b46397e4d..8d064c58f94 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -144,6 +144,7 @@ struct tu_physical_device bool is_perf_cntr_selectable; bool enable_texel_buffer_emulation; + bool enable_ssbo_emulation; struct { uint32_t non_lazy_type_count; @@ -243,11 +244,12 @@ struct tu_instance /* Configuration option to use a specific autotune algorithm by default. */ const char *autotune_algo; - /* D3D12 doesn't have documented limit for texel buffer size, in practice + /* D3D12 doesn't have documented limit for texel buffer or SSBO size, in practice * some games expect up to (1 << 29) elements, which is higher than A6XX or * A7XX hardware can support. */ bool enable_texel_buffer_emulation; + bool enable_ssbo_emulation; }; VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance, VK_OBJECT_TYPE_INSTANCE) diff --git a/src/freedreno/vulkan/tu_shader.cc b/src/freedreno/vulkan/tu_shader.cc index 094c1963013..98950941840 100644 --- a/src/freedreno/vulkan/tu_shader.cc +++ b/src/freedreno/vulkan/tu_shader.cc @@ -669,6 +669,77 @@ lower_texel_buffers_to_image(nir_builder *b, } } +static bool +lower_ssbo_to_image(nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + if (intr->intrinsic == nir_intrinsic_load_ssbo) { + b->cursor = nir_after_instr(&intr->instr); + nir_def *bindless = nir_get_io_index_src(intr)->ssa; + nir_def *coord3d = build_texel_buffer_as_image_coords( + b, nir_get_io_offset_src(intr)->ssa, bindless); + nir_def *load = nir_bindless_image_load( + b, intr->def.num_components, intr->def.bit_size, bindless, + nir_pad_vec4(b, coord3d), nir_imm_zero(b, 1, 32) /* sample index */, + nir_imm_zero(b, 1, 32) /* lod */, .image_dim = GLSL_SAMPLER_DIM_3D, + .format 
= PIPE_FORMAT_R32_UINT, .access = nir_intrinsic_access(intr)); + nir_def_replace(&intr->def, load); + return true; + } else if (intr->intrinsic == nir_intrinsic_store_ssbo) { + b->cursor = nir_after_instr(&intr->instr); + nir_def *bindless = nir_get_io_index_src(intr)->ssa; + nir_def *coord3d = build_texel_buffer_as_image_coords( + b, nir_get_io_offset_src(intr)->ssa, bindless); + nir_bindless_image_store( + b, bindless, nir_pad_vec4(b, coord3d), + nir_imm_zero(b, 1, 32) /* sample index */, + nir_get_io_data_src(intr)->ssa, nir_imm_zero(b, 1, 32) /* lod */, + .image_dim = GLSL_SAMPLER_DIM_3D, .format = PIPE_FORMAT_R32_UINT, + .access = nir_intrinsic_access(intr), .src_type = nir_type_uint32); + nir_instr_remove(&intr->instr); + return true; + } else if (intr->intrinsic == nir_intrinsic_get_ssbo_size) { + b->cursor = nir_after_instr(&intr->instr); + nir_def *bindless = nir_get_io_index_src(intr)->ssa; + nir_def *num_elements = build_texel_buffer_size(b, bindless, NULL); + nir_def *size = nir_amul_imm(b, num_elements, sizeof(uint32_t)); + nir_def_replace(&intr->def, size); + return true; + } else if (intr->intrinsic == nir_intrinsic_ssbo_atomic) { + b->cursor = nir_after_instr(&intr->instr); + nir_def *bindless = nir_get_io_index_src(intr)->ssa; + nir_def *coord3d = build_texel_buffer_as_image_coords( + b, nir_get_io_offset_src(intr)->ssa, bindless); + enum pipe_format format = + intr->def.bit_size == 64 ? 
PIPE_FORMAT_R64_UINT : PIPE_FORMAT_R32_UINT; + nir_def *atomic = nir_bindless_image_atomic( + b, intr->def.bit_size, bindless, nir_pad_vec4(b, coord3d), + nir_imm_zero(b, 1, 32) /* sample index */, + nir_get_io_data_src(intr)->ssa, .image_dim = GLSL_SAMPLER_DIM_3D, + .format = format, .access = nir_intrinsic_access(intr), + .atomic_op = nir_intrinsic_atomic_op(intr)); + nir_def_replace(&intr->def, atomic); + return true; + } else if (intr->intrinsic == nir_intrinsic_ssbo_atomic_swap) { + b->cursor = nir_after_instr(&intr->instr); + nir_def *bindless = nir_get_io_index_src(intr)->ssa; + nir_def *coord3d = build_texel_buffer_as_image_coords( + b, nir_get_io_offset_src(intr)->ssa, bindless); + enum pipe_format format = + intr->def.bit_size == 64 ? PIPE_FORMAT_R64_UINT : PIPE_FORMAT_R32_UINT; + nir_def *atomic_swap = nir_bindless_image_atomic_swap( + b, intr->def.bit_size, bindless, nir_pad_vec4(b, coord3d), + nir_imm_zero(b, 1, 32) /* sample index */, + nir_get_io_data_src(intr)->ssa, intr->src[3].ssa, + .image_dim = GLSL_SAMPLER_DIM_3D, .format = format, + .access = nir_intrinsic_access(intr), + .atomic_op = nir_intrinsic_atomic_op(intr)); + nir_def_replace(&intr->def, atomic_swap); + return true; + } + + return false; +} + static void lower_image_deref(struct tu_device *dev, nir_builder *b, nir_intrinsic_instr *instr, struct tu_shader *shader, @@ -1383,6 +1454,12 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev, nir_metadata_none, &params); + if (dev->physical_device->enable_ssbo_emulation) { + progress |= nir_lower_io_to_scalar(shader, nir_var_mem_ssbo, NULL, NULL); + progress |= nir_shader_intrinsics_pass(shader, lower_ssbo_to_image, + nir_metadata_control_flow, NULL); + } + /* Remove now-unused variables so that when we gather the shader info later * they won't be counted. 
*/ diff --git a/src/util/driconf.h b/src/util/driconf.h index 56b69a2aadb..0c2b9249525 100644 --- a/src/util/driconf.h +++ b/src/util/driconf.h @@ -700,6 +700,10 @@ DRI_CONF_OPT_B(tu_enable_texel_buffer_emulation, def, \ "Emulate texel buffer to allow higher limit for elements that is in line with what some D3D12 games expect") +#define DRI_CONF_TU_ENABLE_SSBO_EMULATION(def) \ +DRI_CONF_OPT_B(tu_enable_ssbo_emulation, def, \ + "Emulate SSBO to allow higher limit for elements that is in line with what some D3D12 games expect") + /** * \brief Honeykrisp specific configuration options */