From 80e194d7f136c05012f6d5ea3652bcf01895fd99 Mon Sep 17 00:00:00 2001 From: Danylo Piliaiev Date: Fri, 23 Jan 2026 22:39:54 +0100 Subject: [PATCH] tu: Add option to raise the maximum texel buffer size Emulates texel buffers via 3D image access; the real texel buffer size and start offset (due to image alignment requirements) are stored in the descriptor and accessed via resbase. - Read-only access: isam.a.1d to read as 3d image. - RW access: stib.b.typed.3d/ldib.b.typed.3d to access as 3d image. Verified that the proprietary D3D12 driver uses the same workaround; the only difference is that the proprietary driver uses an arrayed 2d load for read-only access instead of a 3d load, but the benefits are not verified. Signed-off-by: Danylo Piliaiev --- src/freedreno/vulkan/tu_common.h | 3 + src/freedreno/vulkan/tu_descriptor_set.cc | 111 +++++++++++++++++++++- src/freedreno/vulkan/tu_device.cc | 12 ++- src/freedreno/vulkan/tu_device.h | 8 ++ src/freedreno/vulkan/tu_shader.cc | 106 +++++++++++++++++++++ src/util/driconf.h | 4 + 6 files changed, 238 insertions(+), 6 deletions(-) diff --git a/src/freedreno/vulkan/tu_common.h b/src/freedreno/vulkan/tu_common.h index d3564eba3b1..9ad9ce2b52b 100644 --- a/src/freedreno/vulkan/tu_common.h +++ b/src/freedreno/vulkan/tu_common.h @@ -110,6 +110,9 @@ /* match the latest Qualcomm driver which is also a hw limit on later gens */ #define MAX_STORAGE_BUFFER_RANGE (1u << 27) #define MAX_TEXEL_ELEMENTS (1u << 27) +#define TU_MAX_EMULATED_TEXEL_ELEMENTS ((1u << 30) - 1) +#define TU_TEXEL_BUFFER_WIDTH (1u << 14) +#define TU_TEXEL_BUFFER_MAX_HEIGHT (1u << 14) /* We use ldc for uniform buffer loads, just like the Qualcomm driver, so * expose the same maximum range. 
* TODO: The SIZE bitfield is 15 bits, and in 4-dword units, so the actual diff --git a/src/freedreno/vulkan/tu_descriptor_set.cc b/src/freedreno/vulkan/tu_descriptor_set.cc index 6dab563bea7..636a97440cc 100644 --- a/src/freedreno/vulkan/tu_descriptor_set.cc +++ b/src/freedreno/vulkan/tu_descriptor_set.cc @@ -20,6 +20,7 @@ #include "tu_descriptor_set.h" #include "util/mesa-blake3.h" +#include "util/format/u_format.h" #include "vk_acceleration_structure.h" #include "vk_descriptors.h" #include "vk_util.h" @@ -32,6 +33,7 @@ #include "tu_rmv.h" #include "tu_sampler.h" #include "tu_subsampled_image.h" +#include "fdl/fd6_format_table.h" static inline uint8_t * pool_base(struct tu_descriptor_pool *pool) @@ -989,6 +991,88 @@ write_texel_buffer_descriptor_addr(uint32_t *dst, } } +/* Writes a linear 3D image descriptor that aliases a texel buffer of + * `elements` texels of `format` starting at `addr`. + * + * The image base is `addr` rounded down to 64 bytes; the distance from that + * base to `addr` (in texels) and the element count are packed into dwords 7 + * and 8 so that shaders can fetch them through resbase. + */ +template +static void +write_emulated_texel_buffer_descriptor_common(uint32_t *dst, + enum pipe_format format, + uint64_t addr, uint32_t elements) +{ + uint32_t blocksize_B = util_format_get_blocksize(format); + + const uint32_t alignment = 64; + uint64_t aligned_addr = addr & ~(uint64_t) (alignment - 1); + uint32_t offset_texels = uint32_t(addr - aligned_addr) / blocksize_B; + uint32_t elements_with_offset = elements + offset_texels; + + uint32_t width = MIN2(elements_with_offset, TU_TEXEL_BUFFER_WIDTH); + /* width is 0 for an empty buffer, so it must not be used as a divisor. */ + uint32_t height = elements_with_offset + ? MIN2(DIV_ROUND_UP(elements_with_offset, width), + TU_TEXEL_BUFFER_MAX_HEIGHT) + : 0; + uint32_t depth = elements_with_offset + ? 
DIV_ROUND_UP(elements_with_offset, width * height) + : 0; + uint32_t layer_size = width * height * blocksize_B; + enum a6xx_tile_mode tile_mode = TILE6_LINEAR; + enum a6xx_format texture_format = + fd6_texture_format(format, tile_mode, false); + enum a3xx_color_swap swap = fd6_texture_swap(format, tile_mode, false); + + memset(dst, 0, FDL6_TEX_CONST_DWORDS * sizeof(uint32_t)); + + dst[0] = A6XX_TEX_MEMOBJ_0_TILE_MODE(tile_mode) | + COND(util_format_is_srgb(format), A6XX_TEX_MEMOBJ_0_SRGB) | + A6XX_TEX_MEMOBJ_0_FMT(texture_format) | + A6XX_TEX_MEMOBJ_0_SWAP(swap); + dst[1] = A6XX_TEX_MEMOBJ_1_WIDTH(width) | A6XX_TEX_MEMOBJ_1_HEIGHT(height); + dst[2] = A6XX_TEX_MEMOBJ_2_PITCH(width * blocksize_B) | + A6XX_TEX_MEMOBJ_2_TYPE(A6XX_TEX_3D); + dst[3] = A6XX_TEX_MEMOBJ_3_ARRAY_PITCH(depth > 1 ? layer_size : 0); + dst[4] = aligned_addr; + dst[5] = (aligned_addr >> 32) | A6XX_TEX_MEMOBJ_5_DEPTH(depth); + dst[6] = A6XX_TEX_MEMOBJ_6_MIN_LOD_CLAMP(0); + /* Would be read by resbase to provide robustness guarantees. + * Before the final shift by 6, bits [29:0] hold the element count and + * bits [35:30] hold the start offset in texels. + */ + uint64_t encoded = MIN2(elements, TU_MAX_EMULATED_TEXEL_ELEMENTS); + encoded |= uint64_t(offset_texels & (alignment - 1)) << 30llu; + encoded <<= 6; + dst[7] = A6XX_TEX_MEMOBJ_7_FLAG_LO(encoded & 0x7FFFFFF); + dst[8] = A6XX_TEX_MEMOBJ_8_FLAG_HI(encoded >> 26); + + tu_desc_set_swiz(dst, tu_swiz(X, Y, Z, W)); +} + +/* Descriptor-buffer variant: derives the element count from + * `buffer_info->range` and the format block size. A NULL `buffer_info` or a + * zero address yields an all-zero descriptor. + */ +template +static void +write_emulated_texel_buffer_descriptor_addr( + uint32_t *dst, const VkDescriptorAddressInfoEXT *buffer_info) +{ + if (!buffer_info || buffer_info->address == 0) { + memset(dst, 0, FDL6_TEX_CONST_DWORDS * sizeof(uint32_t)); + return; + } + + enum pipe_format format = vk_format_to_pipe_format(buffer_info->format); + uint32_t blocksize_B = util_format_get_blocksize(format); + uint32_t elements = blocksize_B ? 
(buffer_info->range / blocksize_B) : 0; + write_emulated_texel_buffer_descriptor_common( + dst, format, buffer_info->address, elements); +} + static void write_texel_buffer_descriptor(uint32_t *dst, const VkBufferView buffer_view) { @@ -1001,6 +1068,28 @@ write_texel_buffer_descriptor(uint32_t *dst, const VkBufferView buffer_view) } } +/* Buffer-view variant: takes the format and element count from the view. + * VK_NULL_HANDLE yields an all-zero descriptor. + */ +template +static void +write_emulated_texel_buffer_descriptor(uint32_t *dst, + const VkBufferView buffer_view) +{ + if (buffer_view == VK_NULL_HANDLE) { + memset(dst, 0, FDL6_TEX_CONST_DWORDS * sizeof(uint32_t)); + return; + } + + VK_FROM_HANDLE(tu_buffer_view, view, buffer_view); + + enum pipe_format format = vk_format_to_pipe_format(view->vk.format); + uint32_t elements = view->vk.elements; + write_emulated_texel_buffer_descriptor_common( + dst, format, vk_buffer_address(view->vk.buffer, view->vk.offset), + elements); +} + static VkDescriptorAddressInfoEXT buffer_info_to_address(const VkDescriptorBufferInfo *buffer_info) { @@ -1199,10 +1285,18 @@ tu_GetDescriptorEXT( write_buffer_descriptor_addr(device, dest, pDescriptorInfo->data.pStorageBuffer); break; case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: - write_texel_buffer_descriptor_addr(dest, pDescriptorInfo->data.pUniformTexelBuffer); - break; case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - write_texel_buffer_descriptor_addr(dest, pDescriptorInfo->data.pStorageTexelBuffer); + /* Both texel-buffer types are handled here and read + * pUniformTexelBuffer; this relies on pStorageTexelBuffer being a + * same-typed member of the VkDescriptorDataEXT union. + */ + if (device->physical_device->enable_texel_buffer_emulation) { + write_emulated_texel_buffer_descriptor_addr( + dest, pDescriptorInfo->data.pUniformTexelBuffer); + } else { + write_texel_buffer_descriptor_addr( + dest, pDescriptorInfo->data.pUniformTexelBuffer); + } break; case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: write_image_descriptor(dest, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, @@ -1331,7 +1421,13 @@ tu_update_descriptor_sets(const struct tu_device *device, break; case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - write_texel_buffer_descriptor(ptr, 
writeset->pTexelBufferView[j]); + if (device->physical_device->enable_texel_buffer_emulation) { + write_emulated_texel_buffer_descriptor( + ptr, writeset->pTexelBufferView[j]); + } else { + write_texel_buffer_descriptor(ptr, + writeset->pTexelBufferView[j]); + } break; case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: @@ -1681,7 +1777,12 @@ tu_update_descriptor_set_with_template( break; case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - write_texel_buffer_descriptor(ptr, *(VkBufferView *) src); + if (device->physical_device->enable_texel_buffer_emulation) { + write_emulated_texel_buffer_descriptor( + ptr, *(VkBufferView *) src); + } else { + write_texel_buffer_descriptor(ptr, *(VkBufferView *) src); + } break; case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index ff5a167836d..70ccc2c7eeb 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -73,6 +73,8 @@ tu_device_get_cache_uuid(struct tu_physical_device *device, void *uuid) _mesa_blake3_final(&ctx, blake3); _mesa_blake3_update(&ctx, &device->instance->allow_oob_indirect_ubo_loads, sizeof(device->instance->allow_oob_indirect_ubo_loads)); + _mesa_blake3_update(&ctx, &device->enable_texel_buffer_emulation, + sizeof(device->enable_texel_buffer_emulation)); memcpy(uuid, blake3, VK_UUID_SIZE); return 0; @@ -1110,7 +1112,8 @@ tu_get_properties(struct tu_physical_device *pdevice, props->maxImageDimension3D = (1 << 11); props->maxImageDimensionCube = (1 << 14); props->maxImageArrayLayers = (1 << (pdevice->info->props.is_a702 ? 8 : 11)); - props->maxTexelBufferElements = MAX_TEXEL_ELEMENTS; + props->maxTexelBufferElements = + pdevice->enable_texel_buffer_emulation ? 
TU_MAX_EMULATED_TEXEL_ELEMENTS : MAX_TEXEL_ELEMENTS; props->maxUniformBufferRange = MAX_UNIFORM_BUFFER_RANGE; props->maxStorageBufferRange = MAX_STORAGE_BUFFER_RANGE; props->maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE; @@ -1680,6 +1683,10 @@ tu_physical_device_init(struct tu_physical_device *device, device->has_cached_non_coherent_memory = device->level1_dcache_size > 0 && !DETECT_ARCH_ARM; + if (fd_dev_gen(&device->dev_id) >= 7) { + device->enable_texel_buffer_emulation = instance->enable_texel_buffer_emulation; + } + device->memory.type_count = 1; device->memory.types[0] = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | @@ -1843,6 +1850,7 @@ static const driOptionDescription tu_dri_options[] = { DRI_CONF_TU_ENABLE_SOFTFLOAT32(false) DRI_CONF_TU_EMULATE_ALPHA_TO_COVERAGE(false) DRI_CONF_TU_AUTOTUNE_ALGORITHM() + DRI_CONF_TU_ENABLE_TEXEL_BUFFER_EMULATION(false) DRI_CONF_SECTION_END }; @@ -1877,6 +1885,8 @@ tu_init_dri_options(struct tu_instance *instance) driQueryOptionb(&instance->dri_options, "tu_emulate_alpha_to_coverage"); instance->autotune_algo = driQueryOptionstr(&instance->dri_options, "tu_autotune_algorithm"); + instance->enable_texel_buffer_emulation = + driQueryOptionb(&instance->dri_options, "tu_enable_texel_buffer_emulation"); } static uint32_t instance_count = 0; diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index c9f521fcc15..c7b46397e4d 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -143,6 +143,8 @@ struct tu_physical_device /* Whether performance counter selector registers can be written by userspace CSes. */ bool is_perf_cntr_selectable; + bool enable_texel_buffer_emulation; + struct { uint32_t non_lazy_type_count; uint32_t type_count; @@ -240,6 +242,12 @@ struct tu_instance /* Configuration option to use a specific autotune algorithm by default. 
*/ const char *autotune_algo; + + /* D3D12 doesn't have documented limit for texel buffer size, in practice + * some games expect up to (1 << 29) elements, which is higher than A6XX or + * A7XX hardware can support. + */ + bool enable_texel_buffer_emulation; }; VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance, VK_OBJECT_TYPE_INSTANCE) diff --git a/src/freedreno/vulkan/tu_shader.cc b/src/freedreno/vulkan/tu_shader.cc index 0347db46acd..094c1963013 100644 --- a/src/freedreno/vulkan/tu_shader.cc +++ b/src/freedreno/vulkan/tu_shader.cc @@ -598,6 +598,92 @@ build_bindless(struct tu_device *dev, nir_builder *b, return nir_bindless_resource_ir3(b, 32, desc_offset, .desc_set = set); } +/* Decodes the values packed into an emulated texel buffer descriptor, read + * through resbase. Returns the element count; if `offset_out` is non-NULL it + * receives the texel offset of the buffer start within the backing image. + */ +static nir_def * +build_texel_buffer_size(nir_builder *b, nir_def *desc, nir_def **offset_out) +{ + assert(nir_def_is_intrinsic(desc)); + nir_def *encoded_data = nir_resbase_ir3(b, 32, desc); + nir_def *encoded_data_lo = nir_channel(b, encoded_data, 0); + nir_def *encoded_data_hi = nir_channel(b, encoded_data, 1); + + nir_def *size_lo = nir_ishr_imm(b, encoded_data_lo, 6); + nir_def *size_hi = nir_ishl_imm(b, encoded_data_hi, 20); + nir_def *size = nir_iand_imm(b, nir_ior(b, size_lo, size_hi), + TU_MAX_EMULATED_TEXEL_ELEMENTS); + + if (offset_out) + *offset_out = nir_ishr_imm(b, encoded_data_hi, 10); + + return size; +} + +/* Converts a texel index into the (x, y, z) coordinate of the 3D image that + * backs an emulated texel buffer. Indices at or past the element count get + * z = 0xff, presumably so that the access lands outside the image. + */ +static nir_def * +build_texel_buffer_as_image_coords(nir_builder *b, + nir_def *offset, + nir_def *desc) +{ + nir_def *base_offset = nullptr; + nir_def *real_size = build_texel_buffer_size(b, desc, &base_offset); + /* Texel indices are unsigned: a signed compare would let an index with the + * top bit set pass the bounds check and wrap when the base offset is added. + */ + nir_def *oob = nir_uge(b, offset, real_size); + + offset = nir_iadd(b, offset, base_offset); + + nir_def *x = nir_umod_imm(b, offset, TU_TEXEL_BUFFER_WIDTH); + nir_def *tmp = nir_udiv_imm(b, offset, TU_TEXEL_BUFFER_WIDTH); + nir_def *y = nir_umod_imm(b, tmp, TU_TEXEL_BUFFER_MAX_HEIGHT); + nir_def *z = nir_udiv_imm(b, tmp, TU_TEXEL_BUFFER_MAX_HEIGHT); + z = nir_bcsel(b, oob, nir_imm_int(b, 0xff), z); + + nir_def *coord3d = nir_vec3(b, x, y, z); + return 
coord3d; +} + +/* Rewrites a bindless image intrinsic on a buffer image: accesses get 3D + * coordinates and GLSL_SAMPLER_DIM_3D, size queries are replaced with the + * element count decoded from the descriptor. + */ +static void +lower_texel_buffers_to_image(nir_builder *b, + nir_intrinsic_instr *instr, + nir_def *bindless) +{ + switch (instr->intrinsic) { + case nir_intrinsic_bindless_image_load: + case nir_intrinsic_bindless_image_store: + case nir_intrinsic_bindless_image_atomic: + case nir_intrinsic_bindless_image_atomic_swap: { + b->cursor = nir_before_instr(&instr->instr); + + nir_def *coord = instr->src[1].ssa; + if (coord->num_components > 1) + coord = nir_channel(b, coord, 0); + nir_def *coord3d = + build_texel_buffer_as_image_coords(b, coord, bindless); + nir_src_rewrite(&instr->src[1], nir_pad_vector(b, coord3d, 4)); + nir_intrinsic_set_image_dim(instr, GLSL_SAMPLER_DIM_3D); + break; + } + case nir_intrinsic_bindless_image_size: { + nir_def_replace(&instr->def, + build_texel_buffer_size(b, bindless, nullptr)); + break; + } + default: + break; + } +} + static void lower_image_deref(struct tu_device *dev, nir_builder *b, nir_intrinsic_instr *instr, struct tu_shader *shader, @@ -607,6 +678,11 @@ lower_image_deref(struct tu_device *dev, nir_builder *b, nir_def *bindless = build_bindless(dev, b, deref, 0, shader, layout, 0, false); nir_rewrite_image_intrinsic(instr, bindless, nir_image_intrinsic_type_bindless); + + if (dev->physical_device->enable_texel_buffer_emulation && + nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_BUF) { + lower_texel_buffers_to_image(b, instr, bindless); + } } static bool @@ -871,6 +947,34 @@ lower_tex_immutable(struct tu_device *dev, } } +/* Texture-instruction counterpart of the image lowering: txf gets 3D + * coordinates, txs is replaced with the decoded element count. + */ +static void +lower_tex_texel_buffer_to_image(nir_builder *b, + nir_tex_instr *tex, + uint32_t tex_bindless_idx) +{ + if (tex->op == nir_texop_txf) { + int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord); + if (coord_idx >= 0) { + nir_def *coord = tex->src[coord_idx].src.ssa; + if (coord->num_components > 1) + coord = nir_channel(b, coord, 0); + nir_def *coord3d = build_texel_buffer_as_image_coords( + b, coord, tex->src[tex_bindless_idx].src.ssa); + 
nir_src_rewrite(&tex->src[coord_idx].src, coord3d); + + tex->sampler_dim = GLSL_SAMPLER_DIM_3D; + tex->coord_components = 3; + } + } else if (tex->op == nir_texop_txs) { + nir_def_replace( + &tex->def, + build_texel_buffer_size(b, tex->src[tex_bindless_idx].src.ssa, nullptr)); + } +} + static bool lower_tex_impl(nir_builder *b, nir_tex_instr *tex, struct tu_device *dev, struct tu_shader *shader, const struct tu_pipeline_layout *layout, @@ -901,6 +1002,11 @@ lower_tex_impl(nir_builder *b, nir_tex_instr *tex, struct tu_device *dev, tex->src[tex_src_idx].src_type = nir_tex_src_texture_offset; } + if (dev->physical_device->enable_texel_buffer_emulation && + tex->sampler_dim == GLSL_SAMPLER_DIM_BUF) { + lower_tex_texel_buffer_to_image(b, tex, tex_src_idx); + } + return true; } diff --git a/src/util/driconf.h b/src/util/driconf.h index efd756fe765..56b69a2aadb 100644 --- a/src/util/driconf.h +++ b/src/util/driconf.h @@ -696,6 +696,10 @@ DRI_CONF_OPT_S_NODEF(tu_autotune_algorithm, \ "Set the preferred autotune algorithm") +#define DRI_CONF_TU_ENABLE_TEXEL_BUFFER_EMULATION(def) \ +DRI_CONF_OPT_B(tu_enable_texel_buffer_emulation, def, \ + "Emulate texel buffer to allow higher limit for elements that is in line with what some D3D12 games expect") + /** * \brief Honeykrisp specific configuration options */