From d934b5293fbc21c43affdcca4ad9754d5d24bb72 Mon Sep 17 00:00:00 2001 From: Danylo Piliaiev Date: Fri, 23 Jan 2026 22:28:27 +0100 Subject: [PATCH 1/5] ir3: Add resbase_ir3 intrinsic Signed-off-by: Danylo Piliaiev --- src/compiler/nir/nir_divergence_analysis.c | 1 + src/compiler/nir/nir_intrinsics.py | 2 ++ src/freedreno/ir3/ir3.c | 1 + src/freedreno/ir3/ir3.h | 1 + src/freedreno/ir3/ir3_compiler_nir.c | 14 ++++++++++++++ src/freedreno/ir3/ir3_legalize.c | 2 +- src/freedreno/ir3/ir3_validate.c | 1 + src/freedreno/isa/ir3-cat6.xml | 3 ++- 8 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c index 9d8ea263c6b..7b6b58bb160 100644 --- a/src/compiler/nir/nir_divergence_analysis.c +++ b/src/compiler/nir/nir_divergence_analysis.c @@ -802,6 +802,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state) case nir_intrinsic_load_gmem_frag_offset_ir3: case nir_intrinsic_bindless_resource_ir3: case nir_intrinsic_ray_intersection_ir3: + case nir_intrinsic_resbase_ir3: case nir_intrinsic_load_attribute_payload_intel: case nir_intrinsic_load_urb_vec4_intel: case nir_intrinsic_load_urb_lsc_intel: diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 8426c0c0140..2ffd0270b7a 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -1675,6 +1675,8 @@ intrinsic("prefetch_sam_ir3", [1, 1], flags=[CAN_REORDER]) intrinsic("prefetch_tex_ir3", [1], flags=[CAN_REORDER]) intrinsic("prefetch_ubo_ir3", [1], flags=[CAN_REORDER]) +intrinsic("resbase_ir3", src_comp=[1], dest_comp=2, flags=[CAN_ELIMINATE, CAN_REORDER]) + # Panfrost-specific intrinsic for loading vertex attributes. Takes explicit # vertex and instance IDs which we need in order to implement vertex attribute # divisor with non-zero base instance on v9+. 
diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c index bc493a9dc76..dfdfb38da0f 100644 --- a/src/freedreno/ir3/ir3.c +++ b/src/freedreno/ir3/ir3.c @@ -1907,6 +1907,7 @@ ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags) return false; break; case OPC_RESINFO: + case OPC_RESBASE: if (n != 0) return false; break; diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h index fafa51f1842..24c196e4992 100644 --- a/src/freedreno/ir3/ir3.h +++ b/src/freedreno/ir3/ir3.h @@ -3119,6 +3119,7 @@ INSTR3NODST(STLW) INSTR3NODST(STP) INSTR1(RESINFO) INSTR1(RESFMT) +INSTR1(RESBASE) INSTR2(ATOMIC_ADD) INSTR2(ATOMIC_SUB) INSTR2(ATOMIC_XCHG) diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index 38d9db43782..5a936b680dc 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -3522,6 +3522,20 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) array_insert(ctx->block, ctx->block->keeps, ldc); break; } + case nir_intrinsic_resbase_ir3: { + struct ir3_instruction *ibo = ir3_ssbo_to_ibo(ctx, intr->src[0]); + struct ir3_instruction *resbase = ir3_RESBASE(b, ibo, 0); + resbase->cat6.iim_val = 1; + resbase->cat6.d = 1; + resbase->cat6.type = TYPE_U32; + resbase->cat6.typed = false; + /* resbase has no writemask and always writes out 2 components */ + resbase->dsts[0]->wrmask = MASK(2); + ir3_handle_bindless_cat6(resbase, intr->src[0]); + ir3_handle_nonuniform(resbase, intr); + ir3_split_dest(b, dst, resbase, 0, 2); + break; + } case nir_intrinsic_rotate: case nir_intrinsic_shuffle_up_uniform_ir3: case nir_intrinsic_shuffle_down_uniform_ir3: diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c index 635af257b1a..02c3812264a 100644 --- a/src/freedreno/ir3/ir3_legalize.c +++ b/src/freedreno/ir3/ir3_legalize.c @@ -254,7 +254,7 @@ sync_update(struct ir3_legalize_state *state, struct ir3_compiler *compiler, if 
(is_tex_or_prefetch(n) && !has_dummy_dst(n)) { regmask_set(&state->needs_sy, n->dsts[0]); - } else if (n->opc == OPC_RESINFO && !has_dummy_dst(n)) { + } else if ((n->opc == OPC_RESINFO || n->opc == OPC_RESBASE) && !has_dummy_dst(n)) { regmask_set(&state->needs_ss, n->dsts[0]); } else if (is_load(n)) { if (is_local_mem_load(n)) diff --git a/src/freedreno/ir3/ir3_validate.c b/src/freedreno/ir3/ir3_validate.c index f047c8bdb69..8a29d767b60 100644 --- a/src/freedreno/ir3/ir3_validate.c +++ b/src/freedreno/ir3/ir3_validate.c @@ -464,6 +464,7 @@ validate_instr(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr) switch (instr->opc) { case OPC_RESINFO: case OPC_RESFMT: + case OPC_RESBASE: if (instr->dsts_count > 0) validate_reg_size(ctx, instr->dsts[0], instr->cat6.type); validate_reg_size(ctx, instr->srcs[0], instr->cat6.type); diff --git a/src/freedreno/isa/ir3-cat6.xml b/src/freedreno/isa/ir3-cat6.xml index 300633a6647..de97f571209 100644 --- a/src/freedreno/isa/ir3-cat6.xml +++ b/src/freedreno/isa/ir3-cat6.xml @@ -1311,7 +1311,8 @@ TODO rename UAV src to "UAV" so disasm_field_cb can find it easily? - RESourceBASE - returns the address of the bindless descriptor + RESourceBASE - returns the value encoded into TEX_CONST_7_FLAG_LO/HI + of the given descriptor. 
001100 From 3709ebded81695b7f83b45eacf942e80b4bb1ccd Mon Sep 17 00:00:00 2001 From: Danylo Piliaiev Date: Fri, 23 Jan 2026 22:38:23 +0100 Subject: [PATCH 2/5] tu: Add allow_oob_indirect_ubo_loads to device cache uuid Fixes: f4c40fc89c7 ("tu: Add workaround for D3D11 games accessing UBO out of bounds") Signed-off-by: Danylo Piliaiev --- src/freedreno/vulkan/tu_device.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index 9b602c66769..ff5a167836d 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -71,6 +71,8 @@ tu_device_get_cache_uuid(struct tu_physical_device *device, void *uuid) _mesa_blake3_update(&ctx, &driver_flags, sizeof(driver_flags)); _mesa_blake3_update(&ctx, &device->uche_trap_base, sizeof(device->uche_trap_base)); _mesa_blake3_final(&ctx, blake3); + _mesa_blake3_update(&ctx, &device->instance->allow_oob_indirect_ubo_loads, + sizeof(device->instance->allow_oob_indirect_ubo_loads)); memcpy(uuid, blake3, VK_UUID_SIZE); return 0; From 80e194d7f136c05012f6d5ea3652bcf01895fd99 Mon Sep 17 00:00:00 2001 From: Danylo Piliaiev Date: Fri, 23 Jan 2026 22:39:54 +0100 Subject: [PATCH 3/5] tu: Add option to raise the maximum texel buffer size Emulates texel buffers via 3D image access, real texel buffer size and start offset (due to image alignment requirements) are stored in the descriptor and accessed via resbase. - Read-only access: isam.a.1d to read as 3d image. - RW access: stib.b.typed.3d/ldib.b.typed.3d to read as 3d image. Verified that proprietary D3D12 driver uses the same workaround, the only difference is that proprietary driver uses arrayed 2d load for read-only access instead of 3d load, but benefits are not verified. 
Signed-off-by: Danylo Piliaiev --- src/freedreno/vulkan/tu_common.h | 3 + src/freedreno/vulkan/tu_descriptor_set.cc | 111 +++++++++++++++++++++- src/freedreno/vulkan/tu_device.cc | 12 ++- src/freedreno/vulkan/tu_device.h | 8 ++ src/freedreno/vulkan/tu_shader.cc | 106 +++++++++++++++++++++ src/util/driconf.h | 4 + 6 files changed, 238 insertions(+), 6 deletions(-) diff --git a/src/freedreno/vulkan/tu_common.h b/src/freedreno/vulkan/tu_common.h index d3564eba3b1..9ad9ce2b52b 100644 --- a/src/freedreno/vulkan/tu_common.h +++ b/src/freedreno/vulkan/tu_common.h @@ -110,6 +110,9 @@ /* match the latest Qualcomm driver which is also a hw limit on later gens */ #define MAX_STORAGE_BUFFER_RANGE (1u << 27) #define MAX_TEXEL_ELEMENTS (1u << 27) +#define TU_MAX_EMULATED_TEXEL_ELEMENTS ((1u << 30) - 1) +#define TU_TEXEL_BUFFER_WIDTH (1u << 14) +#define TU_TEXEL_BUFFER_MAX_HEIGHT (1u << 14) /* We use ldc for uniform buffer loads, just like the Qualcomm driver, so * expose the same maximum range. * TODO: The SIZE bitfield is 15 bits, and in 4-dword units, so the actual diff --git a/src/freedreno/vulkan/tu_descriptor_set.cc b/src/freedreno/vulkan/tu_descriptor_set.cc index 6dab563bea7..636a97440cc 100644 --- a/src/freedreno/vulkan/tu_descriptor_set.cc +++ b/src/freedreno/vulkan/tu_descriptor_set.cc @@ -20,6 +20,7 @@ #include "tu_descriptor_set.h" #include "util/mesa-blake3.h" +#include "util/format/u_format.h" #include "vk_acceleration_structure.h" #include "vk_descriptors.h" #include "vk_util.h" @@ -32,6 +33,7 @@ #include "tu_rmv.h" #include "tu_sampler.h" #include "tu_subsampled_image.h" +#include "fdl/fd6_format_table.h" static inline uint8_t * pool_base(struct tu_descriptor_pool *pool) @@ -989,6 +991,71 @@ write_texel_buffer_descriptor_addr(uint32_t *dst, } } +template +static void +write_emulated_texel_buffer_descriptor_common(uint32_t *dst, + enum pipe_format format, + uint64_t addr, uint32_t elements) +{ + uint32_t blocksize_B = util_format_get_blocksize(format); + + const 
uint32_t aligment = 64; + uint64_t aligned_addr = addr & ~(uint64_t) (aligment - 1); + uint32_t offset_texels = uint32_t(addr - aligned_addr) / blocksize_B; + uint32_t elements_with_offset = elements + offset_texels; + + uint32_t width = MIN2(elements_with_offset, TU_TEXEL_BUFFER_WIDTH); + uint32_t height = MIN2(DIV_ROUND_UP(elements_with_offset, width), + TU_TEXEL_BUFFER_MAX_HEIGHT); + uint32_t depth = elements_with_offset + ? DIV_ROUND_UP(elements_with_offset, width * height) + : 0; + uint32_t layer_size = width * height * blocksize_B; + enum a6xx_tile_mode tile_mode = TILE6_LINEAR; + enum a6xx_format texture_format = + fd6_texture_format(format, tile_mode, false); + enum a3xx_color_swap swap = fd6_texture_swap(format, tile_mode, false); + + memset(dst, 0, FDL6_TEX_CONST_DWORDS * sizeof(uint32_t)); + + dst[0] = A6XX_TEX_MEMOBJ_0_TILE_MODE(tile_mode) | + COND(util_format_is_srgb(format), A6XX_TEX_MEMOBJ_0_SRGB) | + A6XX_TEX_MEMOBJ_0_FMT(texture_format) | + A6XX_TEX_MEMOBJ_0_SWAP(swap); + dst[1] = A6XX_TEX_MEMOBJ_1_WIDTH(width) | A6XX_TEX_MEMOBJ_1_HEIGHT(height); + dst[2] = A6XX_TEX_MEMOBJ_2_PITCH(width * blocksize_B) | + A6XX_TEX_MEMOBJ_2_TYPE(A6XX_TEX_3D); + dst[3] = A6XX_TEX_MEMOBJ_3_ARRAY_PITCH(depth > 1 ? 
layer_size : 0); + dst[4] = aligned_addr; + dst[5] = (aligned_addr >> 32) | A6XX_TEX_MEMOBJ_5_DEPTH(depth); + dst[6] = A6XX_TEX_MEMOBJ_6_MIN_LOD_CLAMP(0); + /* Would be read by resbase to provide robustness guarantees */ + uint64_t encoded = MIN2(elements, TU_MAX_EMULATED_TEXEL_ELEMENTS); + encoded |= uint64_t(offset_texels & (aligment - 1)) << 30llu; + encoded <<= 6; + dst[7] = A6XX_TEX_MEMOBJ_7_FLAG_LO(encoded & 0x7FFFFFF); + dst[8] = A6XX_TEX_MEMOBJ_8_FLAG_HI(encoded >> 26); + + tu_desc_set_swiz(dst, tu_swiz(X, Y, Z, W)); +} + +template +static void +write_emulated_texel_buffer_descriptor_addr( + uint32_t *dst, const VkDescriptorAddressInfoEXT *buffer_info) +{ + if (!buffer_info || buffer_info->address == 0) { + memset(dst, 0, FDL6_TEX_CONST_DWORDS * sizeof(uint32_t)); + return; + } + + enum pipe_format format = vk_format_to_pipe_format(buffer_info->format); + uint32_t blocksize_B = util_format_get_blocksize(format); + uint32_t elements = blocksize_B ? (buffer_info->range / blocksize_B) : 0; + write_emulated_texel_buffer_descriptor_common( + dst, format, buffer_info->address, elements); +} + static void write_texel_buffer_descriptor(uint32_t *dst, const VkBufferView buffer_view) { @@ -1001,6 +1068,25 @@ write_texel_buffer_descriptor(uint32_t *dst, const VkBufferView buffer_view) } } +template +static void +write_emulated_texel_buffer_descriptor(uint32_t *dst, + const VkBufferView buffer_view) +{ + if (buffer_view == VK_NULL_HANDLE) { + memset(dst, 0, FDL6_TEX_CONST_DWORDS * sizeof(uint32_t)); + return; + } + + VK_FROM_HANDLE(tu_buffer_view, view, buffer_view); + + enum pipe_format format = vk_format_to_pipe_format(view->vk.format); + uint32_t elements = view->vk.elements; + write_emulated_texel_buffer_descriptor_common( + dst, format, vk_buffer_address(view->vk.buffer, view->vk.offset), + elements); +} + static VkDescriptorAddressInfoEXT buffer_info_to_address(const VkDescriptorBufferInfo *buffer_info) { @@ -1199,10 +1285,14 @@ tu_GetDescriptorEXT( 
write_buffer_descriptor_addr(device, dest, pDescriptorInfo->data.pStorageBuffer); break; case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: - write_texel_buffer_descriptor_addr(dest, pDescriptorInfo->data.pUniformTexelBuffer); - break; case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - write_texel_buffer_descriptor_addr(dest, pDescriptorInfo->data.pStorageTexelBuffer); + if (device->physical_device->enable_texel_buffer_emulation) { + write_emulated_texel_buffer_descriptor_addr( + dest, pDescriptorInfo->data.pUniformTexelBuffer); + } else { + write_texel_buffer_descriptor_addr( + dest, pDescriptorInfo->data.pUniformTexelBuffer); + } break; case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: write_image_descriptor(dest, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, @@ -1331,7 +1421,13 @@ tu_update_descriptor_sets(const struct tu_device *device, break; case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - write_texel_buffer_descriptor(ptr, writeset->pTexelBufferView[j]); + if (device->physical_device->enable_texel_buffer_emulation) { + write_emulated_texel_buffer_descriptor( + ptr, writeset->pTexelBufferView[j]); + } else { + write_texel_buffer_descriptor(ptr, + writeset->pTexelBufferView[j]); + } break; case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: @@ -1681,7 +1777,12 @@ tu_update_descriptor_set_with_template( break; case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - write_texel_buffer_descriptor(ptr, *(VkBufferView *) src); + if (device->physical_device->enable_texel_buffer_emulation) { + write_emulated_texel_buffer_descriptor( + ptr, *(VkBufferView *) src); + } else { + write_texel_buffer_descriptor(ptr, *(VkBufferView *) src); + } break; case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index ff5a167836d..70ccc2c7eeb 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ 
b/src/freedreno/vulkan/tu_device.cc @@ -73,6 +73,8 @@ tu_device_get_cache_uuid(struct tu_physical_device *device, void *uuid) _mesa_blake3_final(&ctx, blake3); _mesa_blake3_update(&ctx, &device->instance->allow_oob_indirect_ubo_loads, sizeof(device->instance->allow_oob_indirect_ubo_loads)); + _mesa_blake3_update(&ctx, &device->enable_texel_buffer_emulation, + sizeof(device->enable_texel_buffer_emulation)); memcpy(uuid, blake3, VK_UUID_SIZE); return 0; @@ -1110,7 +1112,8 @@ tu_get_properties(struct tu_physical_device *pdevice, props->maxImageDimension3D = (1 << 11); props->maxImageDimensionCube = (1 << 14); props->maxImageArrayLayers = (1 << (pdevice->info->props.is_a702 ? 8 : 11)); - props->maxTexelBufferElements = MAX_TEXEL_ELEMENTS; + props->maxTexelBufferElements = + pdevice->enable_texel_buffer_emulation ? TU_MAX_EMULATED_TEXEL_ELEMENTS : MAX_TEXEL_ELEMENTS; props->maxUniformBufferRange = MAX_UNIFORM_BUFFER_RANGE; props->maxStorageBufferRange = MAX_STORAGE_BUFFER_RANGE; props->maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE; @@ -1680,6 +1683,10 @@ tu_physical_device_init(struct tu_physical_device *device, device->has_cached_non_coherent_memory = device->level1_dcache_size > 0 && !DETECT_ARCH_ARM; + if (fd_dev_gen(&device->dev_id) >= 7) { + device->enable_texel_buffer_emulation = instance->enable_texel_buffer_emulation; + } + device->memory.type_count = 1; device->memory.types[0] = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | @@ -1843,6 +1850,7 @@ static const driOptionDescription tu_dri_options[] = { DRI_CONF_TU_ENABLE_SOFTFLOAT32(false) DRI_CONF_TU_EMULATE_ALPHA_TO_COVERAGE(false) DRI_CONF_TU_AUTOTUNE_ALGORITHM() + DRI_CONF_TU_ENABLE_TEXEL_BUFFER_EMULATION(false) DRI_CONF_SECTION_END }; @@ -1877,6 +1885,8 @@ tu_init_dri_options(struct tu_instance *instance) driQueryOptionb(&instance->dri_options, "tu_emulate_alpha_to_coverage"); instance->autotune_algo = driQueryOptionstr(&instance->dri_options, "tu_autotune_algorithm"); + instance->enable_texel_buffer_emulation = + 
driQueryOptionb(&instance->dri_options, "tu_enable_texel_buffer_emulation"); } static uint32_t instance_count = 0; diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index c9f521fcc15..c7b46397e4d 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -143,6 +143,8 @@ struct tu_physical_device /* Whether performance counter selector registers can be written by userspace CSes. */ bool is_perf_cntr_selectable; + bool enable_texel_buffer_emulation; + struct { uint32_t non_lazy_type_count; uint32_t type_count; @@ -240,6 +242,12 @@ struct tu_instance /* Configuration option to use a specific autotune algorithm by default. */ const char *autotune_algo; + + /* D3D12 doesn't have documented limit for texel buffer size, in practice + * some games expect up to (1 << 29) elements, which is higher than A6XX or + * A7XX hardware can support. + */ + bool enable_texel_buffer_emulation; }; VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance, VK_OBJECT_TYPE_INSTANCE) diff --git a/src/freedreno/vulkan/tu_shader.cc b/src/freedreno/vulkan/tu_shader.cc index 0347db46acd..094c1963013 100644 --- a/src/freedreno/vulkan/tu_shader.cc +++ b/src/freedreno/vulkan/tu_shader.cc @@ -598,6 +598,77 @@ build_bindless(struct tu_device *dev, nir_builder *b, return nir_bindless_resource_ir3(b, 32, desc_offset, .desc_set = set); } +static nir_def * +build_texel_buffer_size(nir_builder *b, nir_def *desc, nir_def **offset_out) +{ + assert(nir_def_is_intrinsic(desc)); + nir_def *encoded_data = nir_resbase_ir3(b, 32, desc); + nir_def *encoded_data_lo = nir_channel(b, encoded_data, 0); + nir_def *encoded_data_hi = nir_channel(b, encoded_data, 1); + + nir_def *size_lo = nir_ishr_imm(b, encoded_data_lo, 6); + nir_def *size_hi = nir_ishl_imm(b, encoded_data_hi, 20); + nir_def *size = nir_iand_imm(b, nir_ior(b, size_lo, size_hi), + TU_MAX_EMULATED_TEXEL_ELEMENTS); + + if (offset_out) + *offset_out = nir_ishr_imm(b, encoded_data_hi, 10); + + 
return size; +} + +static nir_def * +build_texel_buffer_as_image_coords(nir_builder *b, + nir_def *offset, + nir_def *desc) +{ + nir_def *base_offset = nullptr; + nir_def *real_size = build_texel_buffer_size(b, desc, &base_offset); + nir_def *oob = nir_ige(b, offset, real_size); + + offset = nir_iadd(b, offset, base_offset); + + nir_def *x = nir_umod_imm(b, offset, TU_TEXEL_BUFFER_WIDTH); + nir_def *tmp = nir_udiv_imm(b, offset, TU_TEXEL_BUFFER_WIDTH); + nir_def *y = nir_umod_imm(b, tmp, TU_TEXEL_BUFFER_MAX_HEIGHT); + nir_def *z = nir_udiv_imm(b, tmp, TU_TEXEL_BUFFER_MAX_HEIGHT); + z = nir_bcsel(b, oob, nir_imm_int(b, 0xff), z); + + nir_def *coord3d = nir_vec3(b, x, y, z); + return coord3d; +} + +static void +lower_texel_buffers_to_image(nir_builder *b, + nir_intrinsic_instr *instr, + nir_def *bindless) +{ + switch (instr->intrinsic) { + case nir_intrinsic_bindless_image_load: + case nir_intrinsic_bindless_image_store: + case nir_intrinsic_bindless_image_atomic: + case nir_intrinsic_bindless_image_atomic_swap: { + b->cursor = nir_before_instr(&instr->instr); + + nir_def *coord = instr->src[1].ssa; + if (coord->num_components > 1) + coord = nir_channel(b, coord, 0); + nir_def *coord3d = + build_texel_buffer_as_image_coords(b, coord, bindless); + nir_src_rewrite(&instr->src[1], nir_pad_vector(b, coord3d, 4)); + nir_intrinsic_set_image_dim(instr, GLSL_SAMPLER_DIM_3D); + break; + } + case nir_intrinsic_bindless_image_size: { + nir_def_replace(&instr->def, + build_texel_buffer_size(b, bindless, nullptr)); + break; + } + default: + break; + } +} + static void lower_image_deref(struct tu_device *dev, nir_builder *b, nir_intrinsic_instr *instr, struct tu_shader *shader, @@ -607,6 +678,11 @@ lower_image_deref(struct tu_device *dev, nir_builder *b, nir_def *bindless = build_bindless(dev, b, deref, 0, shader, layout, 0, false); nir_rewrite_image_intrinsic(instr, bindless, nir_image_intrinsic_type_bindless); + + if (dev->physical_device->enable_texel_buffer_emulation && + 
nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_BUF) { + lower_texel_buffers_to_image(b, instr, bindless); + } } static bool @@ -871,6 +947,31 @@ lower_tex_immutable(struct tu_device *dev, } } +static void +lower_tex_texel_buffer_to_image(nir_builder *b, + nir_tex_instr *tex, + uint32_t tex_bindless_idx) +{ + if (tex->op == nir_texop_txf) { + int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord); + if (coord_idx >= 0) { + nir_def *coord = tex->src[coord_idx].src.ssa; + if (coord->num_components > 1) + coord = nir_channel(b, coord, 0); + nir_def *coord3d = build_texel_buffer_as_image_coords( + b, coord, tex->src[tex_bindless_idx].src.ssa); + nir_src_rewrite(&tex->src[coord_idx].src, coord3d); + + tex->sampler_dim = GLSL_SAMPLER_DIM_3D; + tex->coord_components = 3; + } + } else if (tex->op == nir_texop_txs) { + nir_def_replace( + &tex->def, + build_texel_buffer_size(b, tex->src[tex_bindless_idx].src.ssa, nullptr)); + } +} + static bool lower_tex_impl(nir_builder *b, nir_tex_instr *tex, struct tu_device *dev, struct tu_shader *shader, const struct tu_pipeline_layout *layout, @@ -901,6 +1002,11 @@ lower_tex_impl(nir_builder *b, nir_tex_instr *tex, struct tu_device *dev, tex->src[tex_src_idx].src_type = nir_tex_src_texture_offset; } + if (dev->physical_device->enable_texel_buffer_emulation && + tex->sampler_dim == GLSL_SAMPLER_DIM_BUF) { + lower_tex_texel_buffer_to_image(b, tex, tex_src_idx); + } + return true; } diff --git a/src/util/driconf.h b/src/util/driconf.h index efd756fe765..56b69a2aadb 100644 --- a/src/util/driconf.h +++ b/src/util/driconf.h @@ -696,6 +696,10 @@ DRI_CONF_OPT_S_NODEF(tu_autotune_algorithm, \ "Set the preferred autotune algorithm") +#define DRI_CONF_TU_ENABLE_TEXEL_BUFFER_EMULATION(def) \ +DRI_CONF_OPT_B(tu_enable_texel_buffer_emulation, def, \ + "Emulate texel buffer to allow higher limit for elements that is in line with what some D3D12 games expect") + /** * \brief Honeykrisp specific configuration options */ From 
36a9593f01c792e21d4e7c9c70c6d5d7d4817551 Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Wed, 15 Apr 2026 17:12:57 +0200 Subject: [PATCH 4/5] tu: Add option to raise the maximum SSBO size Emulates SSBOs via 3D image access, real SSBO size and start offset (due to image alignment requirements) are stored in the descriptor and accessed via resbase. This also disables storage_16bit and storage_8bit to simplify SSBO emulation since no known D3D12 games, that need SSBO emulated, require those capabilities. Proprietary driver has a more complex solution which involves uniform branching on buffer size and using ldg for over-the-limit buffers. --- src/freedreno/vulkan/tu_descriptor_set.cc | 19 ++++++ src/freedreno/vulkan/tu_device.cc | 20 +++++- src/freedreno/vulkan/tu_device.h | 4 +- src/freedreno/vulkan/tu_shader.cc | 77 +++++++++++++++++++++++ src/util/driconf.h | 4 ++ 5 files changed, 120 insertions(+), 4 deletions(-) diff --git a/src/freedreno/vulkan/tu_descriptor_set.cc b/src/freedreno/vulkan/tu_descriptor_set.cc index 636a97440cc..ff3f3d2a0a3 100644 --- a/src/freedreno/vulkan/tu_descriptor_set.cc +++ b/src/freedreno/vulkan/tu_descriptor_set.cc @@ -59,6 +59,13 @@ descriptor_size(struct tu_device *dev, return FDL6_TEX_CONST_DWORDS * 4 * (subsampled ? 3 : 2); case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + if (dev->physical_device->enable_ssbo_emulation) { + /* With SSBO emulation, use a single R32_UINT emulated 3D + * descriptor instead of multiple format-specific buffer + * descriptors. + */ + return FDL6_TEX_CONST_DWORDS * 4; + } /* isam.v allows using a single 16-bit descriptor for both 16-bit and * 32-bit loads. If not available but 16-bit storage is still supported, * two separate descriptors are required. 
@@ -1108,6 +1115,18 @@ write_buffer_descriptor_addr(const struct tu_device *device, const VkDescriptorAddressInfoEXT *buffer_info) { const struct fd_dev_info *info = device->physical_device->info; + + if (device->physical_device->enable_ssbo_emulation) { + memset(dst, 0, FDL6_TEX_CONST_DWORDS * sizeof(uint32_t)); + if (!buffer_info || buffer_info->address == 0) + return; + uint32_t blocksize_B = util_format_get_blocksize(PIPE_FORMAT_R32_UINT); + uint32_t elements = blocksize_B ? (buffer_info->range / blocksize_B) : 0; + write_emulated_texel_buffer_descriptor_common( + dst, PIPE_FORMAT_R32_UINT, buffer_info->address, elements); + return; + } + /* This prevents any misconfiguration, but 16-bit descriptor capable of both * 16-bit and 32-bit access through isam.v will of course only be functional * when 16-bit storage is supported. */ diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index 70ccc2c7eeb..657fe5e7fff 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -75,6 +75,8 @@ tu_device_get_cache_uuid(struct tu_physical_device *device, void *uuid) sizeof(device->instance->allow_oob_indirect_ubo_loads)); _mesa_blake3_update(&ctx, &device->enable_texel_buffer_emulation, sizeof(device->enable_texel_buffer_emulation)); + _mesa_blake3_update(&ctx, &device->enable_ssbo_emulation, + sizeof(device->enable_ssbo_emulation)); memcpy(uuid, blake3, VK_UUID_SIZE); return 0; @@ -168,8 +170,10 @@ get_device_extensions(const struct tu_physical_device *device, (!device->info->props.has_sw_fuse || device->has_raytracing); *ext = (struct vk_device_extension_table) { .table = { - .KHR_8bit_storage = device->info->props.storage_8bit, - .KHR_16bit_storage = device->info->props.storage_16bit, + .KHR_8bit_storage = device->info->props.storage_8bit && + !device->enable_ssbo_emulation, + .KHR_16bit_storage = device->info->props.storage_16bit && + !device->enable_ssbo_emulation, .KHR_acceleration_structure = has_raytracing, 
.KHR_bind_memory2 = true, .KHR_buffer_device_address = true, @@ -1115,7 +1119,10 @@ tu_get_properties(struct tu_physical_device *pdevice, props->maxTexelBufferElements = pdevice->enable_texel_buffer_emulation ? TU_MAX_EMULATED_TEXEL_ELEMENTS : MAX_TEXEL_ELEMENTS; props->maxUniformBufferRange = MAX_UNIFORM_BUFFER_RANGE; - props->maxStorageBufferRange = MAX_STORAGE_BUFFER_RANGE; + props->maxStorageBufferRange = + pdevice->enable_ssbo_emulation + ? TU_MAX_EMULATED_TEXEL_ELEMENTS * 4 + : MAX_STORAGE_BUFFER_RANGE; props->maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE; props->maxMemoryAllocationCount = UINT32_MAX; props->maxSamplerAllocationCount = 64 * 1024; @@ -1418,6 +1425,9 @@ tu_get_properties(struct tu_physical_device *pdevice, props->storageBufferDescriptorSize = FDL6_TEX_CONST_DWORDS * 4 * (1 + COND(pdevice->info->props.storage_16bit && !pdevice->info->props.has_isam_v, 1) + COND(pdevice->info->props.storage_8bit, 1)); + if (pdevice->enable_ssbo_emulation) { + props->storageBufferDescriptorSize = FDL6_TEX_CONST_DWORDS * 4; + } props->robustStorageBufferDescriptorSize = props->storageBufferDescriptorSize; props->accelerationStructureDescriptorSize = 4 * FDL6_TEX_CONST_DWORDS; @@ -1685,6 +1695,7 @@ tu_physical_device_init(struct tu_physical_device *device, if (fd_dev_gen(&device->dev_id) >= 7) { device->enable_texel_buffer_emulation = instance->enable_texel_buffer_emulation; + device->enable_ssbo_emulation = instance->enable_ssbo_emulation; } device->memory.type_count = 1; @@ -1851,6 +1862,7 @@ static const driOptionDescription tu_dri_options[] = { DRI_CONF_TU_EMULATE_ALPHA_TO_COVERAGE(false) DRI_CONF_TU_AUTOTUNE_ALGORITHM() DRI_CONF_TU_ENABLE_TEXEL_BUFFER_EMULATION(false) + DRI_CONF_TU_ENABLE_SSBO_EMULATION(false) DRI_CONF_SECTION_END }; @@ -1887,6 +1899,8 @@ tu_init_dri_options(struct tu_instance *instance) driQueryOptionstr(&instance->dri_options, "tu_autotune_algorithm"); instance->enable_texel_buffer_emulation = driQueryOptionb(&instance->dri_options, 
"tu_enable_texel_buffer_emulation"); + instance->enable_ssbo_emulation = + driQueryOptionb(&instance->dri_options, "tu_enable_ssbo_emulation"); } static uint32_t instance_count = 0; diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index c7b46397e4d..8d064c58f94 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -144,6 +144,7 @@ struct tu_physical_device bool is_perf_cntr_selectable; bool enable_texel_buffer_emulation; + bool enable_ssbo_emulation; struct { uint32_t non_lazy_type_count; @@ -243,11 +244,12 @@ struct tu_instance /* Configuration option to use a specific autotune algorithm by default. */ const char *autotune_algo; - /* D3D12 doesn't have documented limit for texel buffer size, in practice + /* D3D12 doesn't have documented limit for texel buffer or SSBO size, in practice * some games expect up to (1 << 29) elements, which is higher than A6XX or * A7XX hardware can support. */ bool enable_texel_buffer_emulation; + bool enable_ssbo_emulation; }; VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance, VK_OBJECT_TYPE_INSTANCE) diff --git a/src/freedreno/vulkan/tu_shader.cc b/src/freedreno/vulkan/tu_shader.cc index 094c1963013..98950941840 100644 --- a/src/freedreno/vulkan/tu_shader.cc +++ b/src/freedreno/vulkan/tu_shader.cc @@ -669,6 +669,77 @@ lower_texel_buffers_to_image(nir_builder *b, } } +static bool +lower_ssbo_to_image(nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + if (intr->intrinsic == nir_intrinsic_load_ssbo) { + b->cursor = nir_after_instr(&intr->instr); + nir_def *bindless = nir_get_io_index_src(intr)->ssa; + nir_def *coord3d = build_texel_buffer_as_image_coords( + b, nir_get_io_offset_src(intr)->ssa, bindless); + nir_def *load = nir_bindless_image_load( + b, intr->def.num_components, intr->def.bit_size, bindless, + nir_pad_vec4(b, coord3d), nir_imm_zero(b, 1, 32) /* sample index */, + nir_imm_zero(b, 1, 32) /* lod */, .image_dim = GLSL_SAMPLER_DIM_3D, + .format 
= PIPE_FORMAT_R32_UINT, .access = nir_intrinsic_access(intr)); + nir_def_replace(&intr->def, load); + return true; + } else if (intr->intrinsic == nir_intrinsic_store_ssbo) { + b->cursor = nir_after_instr(&intr->instr); + nir_def *bindless = nir_get_io_index_src(intr)->ssa; + nir_def *coord3d = build_texel_buffer_as_image_coords( + b, nir_get_io_offset_src(intr)->ssa, bindless); + nir_bindless_image_store( + b, bindless, nir_pad_vec4(b, coord3d), + nir_imm_zero(b, 1, 32) /* sample index */, + nir_get_io_data_src(intr)->ssa, nir_imm_zero(b, 1, 32) /* lod */, + .image_dim = GLSL_SAMPLER_DIM_3D, .format = PIPE_FORMAT_R32_UINT, + .access = nir_intrinsic_access(intr), .src_type = nir_type_uint32); + nir_instr_remove(&intr->instr); + return true; + } else if (intr->intrinsic == nir_intrinsic_get_ssbo_size) { + b->cursor = nir_after_instr(&intr->instr); + nir_def *bindless = nir_get_io_index_src(intr)->ssa; + nir_def *num_elements = build_texel_buffer_size(b, bindless, NULL); + nir_def *size = nir_amul_imm(b, num_elements, sizeof(uint32_t)); + nir_def_replace(&intr->def, size); + return true; + } else if (intr->intrinsic == nir_intrinsic_ssbo_atomic) { + b->cursor = nir_after_instr(&intr->instr); + nir_def *bindless = nir_get_io_index_src(intr)->ssa; + nir_def *coord3d = build_texel_buffer_as_image_coords( + b, nir_get_io_offset_src(intr)->ssa, bindless); + enum pipe_format format = + intr->def.bit_size == 64 ? 
PIPE_FORMAT_R64_UINT : PIPE_FORMAT_R32_UINT; + nir_def *atomic = nir_bindless_image_atomic( + b, intr->def.bit_size, bindless, nir_pad_vec4(b, coord3d), + nir_imm_zero(b, 1, 32) /* sample index */, + nir_get_io_data_src(intr)->ssa, .image_dim = GLSL_SAMPLER_DIM_3D, + .format = format, .access = nir_intrinsic_access(intr), + .atomic_op = nir_intrinsic_atomic_op(intr)); + nir_def_replace(&intr->def, atomic); + return true; + } else if (intr->intrinsic == nir_intrinsic_ssbo_atomic_swap) { + b->cursor = nir_after_instr(&intr->instr); + nir_def *bindless = nir_get_io_index_src(intr)->ssa; + nir_def *coord3d = build_texel_buffer_as_image_coords( + b, nir_get_io_offset_src(intr)->ssa, bindless); + enum pipe_format format = + intr->def.bit_size == 64 ? PIPE_FORMAT_R64_UINT : PIPE_FORMAT_R32_UINT; + nir_def *atomic_swap = nir_bindless_image_atomic_swap( + b, intr->def.bit_size, bindless, nir_pad_vec4(b, coord3d), + nir_imm_zero(b, 1, 32) /* sample index */, + nir_get_io_data_src(intr)->ssa, intr->src[3].ssa, + .image_dim = GLSL_SAMPLER_DIM_3D, .format = format, + .access = nir_intrinsic_access(intr), + .atomic_op = nir_intrinsic_atomic_op(intr)); + nir_def_replace(&intr->def, atomic_swap); + return true; + } + + return false; +} + static void lower_image_deref(struct tu_device *dev, nir_builder *b, nir_intrinsic_instr *instr, struct tu_shader *shader, @@ -1383,6 +1454,12 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev, nir_metadata_none, ¶ms); + if (dev->physical_device->enable_ssbo_emulation) { + progress |= nir_lower_io_to_scalar(shader, nir_var_mem_ssbo, NULL, NULL); + progress |= nir_shader_intrinsics_pass(shader, lower_ssbo_to_image, + nir_metadata_control_flow, NULL); + } + /* Remove now-unused variables so that when we gather the shader info later * they won't be counted. 
*/ diff --git a/src/util/driconf.h b/src/util/driconf.h index 56b69a2aadb..0c2b9249525 100644 --- a/src/util/driconf.h +++ b/src/util/driconf.h @@ -700,6 +700,10 @@ DRI_CONF_OPT_B(tu_enable_texel_buffer_emulation, def, \ "Emulate texel buffer to allow higher limit for elements that is in line with what some D3D12 games expect") +#define DRI_CONF_TU_ENABLE_SSBO_EMULATION(def) \ +DRI_CONF_OPT_B(tu_enable_ssbo_emulation, def, \ + "Emulate SSBO to allow higher limit for elements that is in line with what some D3D12 games expect") + /** * \brief Honeykrisp specific configuration options */ From 02be9e786cc85949351ffe380a64f1150d5216de Mon Sep 17 00:00:00 2001 From: Danylo Piliaiev Date: Tue, 21 Apr 2026 16:21:06 +0200 Subject: [PATCH 5/5] tu: Enable texel buffer / SSBO emulation for known problematic games Signed-off-by: Danylo Piliaiev --- src/util/00-mesa-defaults.conf | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/util/00-mesa-defaults.conf b/src/util/00-mesa-defaults.conf index 9b5dcfe8728..6aaa934223b 100644 --- a/src/util/00-mesa-defaults.conf +++ b/src/util/00-mesa-defaults.conf @@ -1438,6 +1438,17 @@ TODO: document the other workarounds. + + + + + +