diff --git a/src/freedreno/vulkan/tu_descriptor_set.cc b/src/freedreno/vulkan/tu_descriptor_set.cc index 636a97440cc..ff3f3d2a0a3 100644 --- a/src/freedreno/vulkan/tu_descriptor_set.cc +++ b/src/freedreno/vulkan/tu_descriptor_set.cc @@ -59,6 +59,13 @@ descriptor_size(struct tu_device *dev, return FDL6_TEX_CONST_DWORDS * 4 * (subsampled ? 3 : 2); case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + if (dev->physical_device->enable_ssbo_emulation) { + /* With SSBO emulation, use a single R32_UINT emulated 2D + * descriptor instead of multiple format-specific buffer + * descriptors. + */ + return FDL6_TEX_CONST_DWORDS * 4; + } /* isam.v allows using a single 16-bit descriptor for both 16-bit and * 32-bit loads. If not available but 16-bit storage is still supported, * two separate descriptors are required. @@ -1108,6 +1115,18 @@ write_buffer_descriptor_addr(const struct tu_device *device, const VkDescriptorAddressInfoEXT *buffer_info) { const struct fd_dev_info *info = device->physical_device->info; + + if (device->physical_device->enable_ssbo_emulation) { + memset(dst, 0, FDL6_TEX_CONST_DWORDS * sizeof(uint32_t)); + if (!buffer_info || buffer_info->address == 0) + return; + uint32_t blocksize_B = util_format_get_blocksize(PIPE_FORMAT_R32_UINT); + uint32_t elements = blocksize_B ? (buffer_info->range / blocksize_B) : 0; + write_emulated_texel_buffer_descriptor_common( + dst, PIPE_FORMAT_R32_UINT, buffer_info->address, elements); + return; + } + /* This prevents any misconfiguration, but 16-bit descriptor capable of both * 16-bit and 32-bit access through isam.v will of course only be functional * when 16-bit storage is supported. 
*/ diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index 70ccc2c7eeb..657fe5e7fff 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -75,6 +75,8 @@ tu_device_get_cache_uuid(struct tu_physical_device *device, void *uuid) sizeof(device->instance->allow_oob_indirect_ubo_loads)); _mesa_blake3_update(&ctx, &device->enable_texel_buffer_emulation, sizeof(device->enable_texel_buffer_emulation)); + _mesa_blake3_update(&ctx, &device->enable_ssbo_emulation, + sizeof(device->enable_ssbo_emulation)); memcpy(uuid, blake3, VK_UUID_SIZE); return 0; @@ -168,8 +170,10 @@ get_device_extensions(const struct tu_physical_device *device, (!device->info->props.has_sw_fuse || device->has_raytracing); *ext = (struct vk_device_extension_table) { .table = { - .KHR_8bit_storage = device->info->props.storage_8bit, - .KHR_16bit_storage = device->info->props.storage_16bit, + .KHR_8bit_storage = device->info->props.storage_8bit && + !device->enable_ssbo_emulation, + .KHR_16bit_storage = device->info->props.storage_16bit && + !device->enable_ssbo_emulation, .KHR_acceleration_structure = has_raytracing, .KHR_bind_memory2 = true, .KHR_buffer_device_address = true, @@ -1115,7 +1119,10 @@ tu_get_properties(struct tu_physical_device *pdevice, props->maxTexelBufferElements = pdevice->enable_texel_buffer_emulation ? TU_MAX_EMULATED_TEXEL_ELEMENTS : MAX_TEXEL_ELEMENTS; props->maxUniformBufferRange = MAX_UNIFORM_BUFFER_RANGE; - props->maxStorageBufferRange = MAX_STORAGE_BUFFER_RANGE; + props->maxStorageBufferRange = + pdevice->enable_ssbo_emulation + ? 
TU_MAX_EMULATED_TEXEL_ELEMENTS * 4 + : MAX_STORAGE_BUFFER_RANGE; props->maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE; props->maxMemoryAllocationCount = UINT32_MAX; props->maxSamplerAllocationCount = 64 * 1024; @@ -1418,6 +1425,9 @@ tu_get_properties(struct tu_physical_device *pdevice, props->storageBufferDescriptorSize = FDL6_TEX_CONST_DWORDS * 4 * (1 + COND(pdevice->info->props.storage_16bit && !pdevice->info->props.has_isam_v, 1) + COND(pdevice->info->props.storage_8bit, 1)); + if (pdevice->enable_ssbo_emulation) { + props->storageBufferDescriptorSize = FDL6_TEX_CONST_DWORDS * 4; + } props->robustStorageBufferDescriptorSize = props->storageBufferDescriptorSize; props->accelerationStructureDescriptorSize = 4 * FDL6_TEX_CONST_DWORDS; @@ -1685,6 +1695,7 @@ tu_physical_device_init(struct tu_physical_device *device, if (fd_dev_gen(&device->dev_id) >= 7) { device->enable_texel_buffer_emulation = instance->enable_texel_buffer_emulation; + device->enable_ssbo_emulation = instance->enable_ssbo_emulation; } device->memory.type_count = 1; @@ -1851,6 +1862,7 @@ static const driOptionDescription tu_dri_options[] = { DRI_CONF_TU_EMULATE_ALPHA_TO_COVERAGE(false) DRI_CONF_TU_AUTOTUNE_ALGORITHM() DRI_CONF_TU_ENABLE_TEXEL_BUFFER_EMULATION(false) + DRI_CONF_TU_ENABLE_SSBO_EMULATION(false) DRI_CONF_SECTION_END }; @@ -1887,6 +1899,8 @@ tu_init_dri_options(struct tu_instance *instance) driQueryOptionstr(&instance->dri_options, "tu_autotune_algorithm"); instance->enable_texel_buffer_emulation = driQueryOptionb(&instance->dri_options, "tu_enable_texel_buffer_emulation"); + instance->enable_ssbo_emulation = + driQueryOptionb(&instance->dri_options, "tu_enable_ssbo_emulation"); } static uint32_t instance_count = 0; diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index c7b46397e4d..8d064c58f94 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -144,6 +144,7 @@ struct tu_physical_device bool is_perf_cntr_selectable; 
bool enable_texel_buffer_emulation; + bool enable_ssbo_emulation; struct { uint32_t non_lazy_type_count; @@ -243,11 +244,12 @@ struct tu_instance /* Configuration option to use a specific autotune algorithm by default. */ const char *autotune_algo; - /* D3D12 doesn't have documented limit for texel buffer size, in practice + /* D3D12 doesn't have documented limit for texel buffer or SSBO size, in practice * some games expect up to (1 << 29) elements, which is higher than A6XX or * A7XX hardware can support. */ bool enable_texel_buffer_emulation; + bool enable_ssbo_emulation; }; VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance, VK_OBJECT_TYPE_INSTANCE) diff --git a/src/freedreno/vulkan/tu_shader.cc b/src/freedreno/vulkan/tu_shader.cc index 094c1963013..98950941840 100644 --- a/src/freedreno/vulkan/tu_shader.cc +++ b/src/freedreno/vulkan/tu_shader.cc @@ -669,6 +669,77 @@ lower_texel_buffers_to_image(nir_builder *b, } } +static bool +lower_ssbo_to_image(nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + if (intr->intrinsic == nir_intrinsic_load_ssbo) { + b->cursor = nir_after_instr(&intr->instr); + nir_def *bindless = nir_get_io_index_src(intr)->ssa; + nir_def *coord3d = build_texel_buffer_as_image_coords( + b, nir_get_io_offset_src(intr)->ssa, bindless); + nir_def *load = nir_bindless_image_load( + b, intr->def.num_components, intr->def.bit_size, bindless, + nir_pad_vec4(b, coord3d), nir_imm_zero(b, 1, 32) /* sample index */, + nir_imm_zero(b, 1, 32) /* lod */, .image_dim = GLSL_SAMPLER_DIM_3D, + .format = PIPE_FORMAT_R32_UINT, .access = nir_intrinsic_access(intr)); + nir_def_replace(&intr->def, load); + return true; + } else if (intr->intrinsic == nir_intrinsic_store_ssbo) { + b->cursor = nir_after_instr(&intr->instr); + nir_def *bindless = nir_get_io_index_src(intr)->ssa; + nir_def *coord3d = build_texel_buffer_as_image_coords( + b, nir_get_io_offset_src(intr)->ssa, bindless); + nir_bindless_image_store( + b, bindless, nir_pad_vec4(b, coord3d), + 
nir_imm_zero(b, 1, 32) /* sample index */, + nir_get_io_data_src(intr)->ssa, nir_imm_zero(b, 1, 32) /* lod */, + .image_dim = GLSL_SAMPLER_DIM_3D, .format = PIPE_FORMAT_R32_UINT, + .access = nir_intrinsic_access(intr), .src_type = nir_type_uint32); + nir_instr_remove(&intr->instr); + return true; + } else if (intr->intrinsic == nir_intrinsic_get_ssbo_size) { + b->cursor = nir_after_instr(&intr->instr); + nir_def *bindless = nir_get_io_index_src(intr)->ssa; + nir_def *num_elements = build_texel_buffer_size(b, bindless, NULL); + nir_def *size = nir_amul_imm(b, num_elements, sizeof(uint32_t)); + nir_def_replace(&intr->def, size); + return true; + } else if (intr->intrinsic == nir_intrinsic_ssbo_atomic) { + b->cursor = nir_after_instr(&intr->instr); + nir_def *bindless = nir_get_io_index_src(intr)->ssa; + nir_def *coord3d = build_texel_buffer_as_image_coords( + b, nir_get_io_offset_src(intr)->ssa, bindless); + enum pipe_format format = + intr->def.bit_size == 64 ? PIPE_FORMAT_R64_UINT : PIPE_FORMAT_R32_UINT; + nir_def *atomic = nir_bindless_image_atomic( + b, intr->def.bit_size, bindless, nir_pad_vec4(b, coord3d), + nir_imm_zero(b, 1, 32) /* sample index */, + nir_get_io_data_src(intr)->ssa, .image_dim = GLSL_SAMPLER_DIM_3D, + .format = format, .access = nir_intrinsic_access(intr), + .atomic_op = nir_intrinsic_atomic_op(intr)); + nir_def_replace(&intr->def, atomic); + return true; + } else if (intr->intrinsic == nir_intrinsic_ssbo_atomic_swap) { + b->cursor = nir_after_instr(&intr->instr); + nir_def *bindless = nir_get_io_index_src(intr)->ssa; + nir_def *coord3d = build_texel_buffer_as_image_coords( + b, nir_get_io_offset_src(intr)->ssa, bindless); + enum pipe_format format = + intr->def.bit_size == 64 ? 
PIPE_FORMAT_R64_UINT : PIPE_FORMAT_R32_UINT; + nir_def *atomic_swap = nir_bindless_image_atomic_swap( + b, intr->def.bit_size, bindless, nir_pad_vec4(b, coord3d), + nir_imm_zero(b, 1, 32) /* sample index */, + nir_get_io_data_src(intr)->ssa, intr->src[3].ssa, + .image_dim = GLSL_SAMPLER_DIM_3D, .format = format, + .access = nir_intrinsic_access(intr), + .atomic_op = nir_intrinsic_atomic_op(intr)); + nir_def_replace(&intr->def, atomic_swap); + return true; + } + + return false; +} + static void lower_image_deref(struct tu_device *dev, nir_builder *b, nir_intrinsic_instr *instr, struct tu_shader *shader, @@ -1383,6 +1454,12 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev, nir_metadata_none, &params); + if (dev->physical_device->enable_ssbo_emulation) { + progress |= nir_lower_io_to_scalar(shader, nir_var_mem_ssbo, NULL, NULL); + progress |= nir_shader_intrinsics_pass(shader, lower_ssbo_to_image, + nir_metadata_control_flow, NULL); + } + /* Remove now-unused variables so that when we gather the shader info later * they won't be counted. */ diff --git a/src/util/driconf.h b/src/util/driconf.h index 56b69a2aadb..0c2b9249525 100644 --- a/src/util/driconf.h +++ b/src/util/driconf.h @@ -700,6 +700,10 @@ DRI_CONF_OPT_B(tu_enable_texel_buffer_emulation, def, \ "Emulate texel buffer to allow higher limit for elements that is in line with what some D3D12 games expect") +#define DRI_CONF_TU_ENABLE_SSBO_EMULATION(def) \ +DRI_CONF_OPT_B(tu_enable_ssbo_emulation, def, \ + "Emulate SSBO to allow higher limit for elements that is in line with what some D3D12 games expect") + /** * \brief Honeykrisp specific configuration options */