diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c index 93e6ebfb1ab..9345e2cf92f 100644 --- a/src/compiler/nir/nir_divergence_analysis.c +++ b/src/compiler/nir/nir_divergence_analysis.c @@ -807,6 +807,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state) case nir_intrinsic_load_gmem_frag_offset_ir3: case nir_intrinsic_bindless_resource_ir3: case nir_intrinsic_ray_intersection_ir3: + case nir_intrinsic_resbase_ir3: case nir_intrinsic_load_attribute_payload_intel: case nir_intrinsic_load_urb_vec4_intel: case nir_intrinsic_load_urb_lsc_intel: diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index f79008d76c4..40578c2b27b 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -1683,6 +1683,8 @@ intrinsic("prefetch_sam_ir3", [1, 1], flags=[CAN_REORDER]) intrinsic("prefetch_tex_ir3", [1], flags=[CAN_REORDER]) intrinsic("prefetch_ubo_ir3", [1], flags=[CAN_REORDER]) +intrinsic("resbase_ir3", src_comp=[1], dest_comp=2, flags=[CAN_ELIMINATE, CAN_REORDER]) + # Panfrost-specific intrinsic for loading vertex attributes. Takes explicit # vertex and instance IDs which we need in order to implement vertex attribute # divisor with non-zero base instance on v9+. 
diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c index 7dc042c9a55..fda831921ef 100644 --- a/src/freedreno/ir3/ir3.c +++ b/src/freedreno/ir3/ir3.c @@ -1913,6 +1913,7 @@ ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags) return false; break; case OPC_RESINFO: + case OPC_RESBASE: if (n != 0) return false; break; diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h index 83517828918..9337916556a 100644 --- a/src/freedreno/ir3/ir3.h +++ b/src/freedreno/ir3/ir3.h @@ -3122,6 +3122,7 @@ INSTR3NODST(STLW) INSTR3NODST(STP) INSTR1(RESINFO) INSTR1(RESFMT) +INSTR1(RESBASE) INSTR2(ATOMIC_ADD) INSTR2(ATOMIC_SUB) INSTR2(ATOMIC_XCHG) diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index fb451354163..2bc51771c55 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -3522,6 +3522,20 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) array_insert(ctx->block, ctx->block->keeps, ldc); break; } + case nir_intrinsic_resbase_ir3: { + struct ir3_instruction *ibo = ir3_ssbo_to_ibo(ctx, intr->src[0]); + struct ir3_instruction *resbase = ir3_RESBASE(b, ibo, 0); + resbase->cat6.iim_val = 1; + resbase->cat6.d = 1; + resbase->cat6.type = TYPE_U32; + resbase->cat6.typed = false; + /* resbase has no writemask and always writes out 2 components */ + resbase->dsts[0]->wrmask = MASK(2); + ir3_handle_bindless_cat6(resbase, intr->src[0]); + ir3_handle_nonuniform(resbase, intr); + ir3_split_dest(b, dst, resbase, 0, 2); + break; + } case nir_intrinsic_rotate: case nir_intrinsic_shuffle_up_uniform_ir3: case nir_intrinsic_shuffle_down_uniform_ir3: diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c index 635af257b1a..02c3812264a 100644 --- a/src/freedreno/ir3/ir3_legalize.c +++ b/src/freedreno/ir3/ir3_legalize.c @@ -254,7 +254,7 @@ sync_update(struct ir3_legalize_state *state, struct ir3_compiler *compiler, if 
(is_tex_or_prefetch(n) && !has_dummy_dst(n)) { regmask_set(&state->needs_sy, n->dsts[0]); - } else if (n->opc == OPC_RESINFO && !has_dummy_dst(n)) { + } else if ((n->opc == OPC_RESINFO || n->opc == OPC_RESBASE) && !has_dummy_dst(n)) { regmask_set(&state->needs_ss, n->dsts[0]); } else if (is_load(n)) { if (is_local_mem_load(n)) diff --git a/src/freedreno/ir3/ir3_validate.c b/src/freedreno/ir3/ir3_validate.c index f047c8bdb69..8a29d767b60 100644 --- a/src/freedreno/ir3/ir3_validate.c +++ b/src/freedreno/ir3/ir3_validate.c @@ -464,6 +464,7 @@ validate_instr(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr) switch (instr->opc) { case OPC_RESINFO: case OPC_RESFMT: + case OPC_RESBASE: if (instr->dsts_count > 0) validate_reg_size(ctx, instr->dsts[0], instr->cat6.type); validate_reg_size(ctx, instr->srcs[0], instr->cat6.type); diff --git a/src/freedreno/isa/ir3-cat6.xml b/src/freedreno/isa/ir3-cat6.xml index a211bd9eb34..4736c645ab6 100644 --- a/src/freedreno/isa/ir3-cat6.xml +++ b/src/freedreno/isa/ir3-cat6.xml @@ -1311,7 +1311,8 @@ TODO rename UAV src to "UAV" so disasm_field_cb can find it easily? - RESourceBASE - returns the address of the bindless descriptor + RESourceBASE - returns the value encoded into TEX_CONST_7_FLAG_LO/HI + of the given descriptor. 001100 diff --git a/src/freedreno/vulkan/tu_common.h b/src/freedreno/vulkan/tu_common.h index 18af24c00cf..80845ff7c48 100644 --- a/src/freedreno/vulkan/tu_common.h +++ b/src/freedreno/vulkan/tu_common.h @@ -110,6 +110,9 @@ /* match the latest Qualcomm driver which is also a hw limit on later gens */ #define MAX_STORAGE_BUFFER_RANGE (1u << 27) #define MAX_TEXEL_ELEMENTS (1u << 27) +#define TU_MAX_EMULATED_TEXEL_ELEMENTS ((1u << 30) - 1) +#define TU_TEXEL_BUFFER_WIDTH (1u << 14) +#define TU_TEXEL_BUFFER_MAX_HEIGHT (1u << 14) /* We use ldc for uniform buffer loads, just like the Qualcomm driver, so * expose the same maximum range. 
* TODO: The SIZE bitfield is 15 bits, and in 4-dword units, so the actual diff --git a/src/freedreno/vulkan/tu_descriptor_set.cc b/src/freedreno/vulkan/tu_descriptor_set.cc index 6dab563bea7..ff3f3d2a0a3 100644 --- a/src/freedreno/vulkan/tu_descriptor_set.cc +++ b/src/freedreno/vulkan/tu_descriptor_set.cc @@ -20,6 +20,7 @@ #include "tu_descriptor_set.h" #include "util/mesa-blake3.h" +#include "util/format/u_format.h" #include "vk_acceleration_structure.h" #include "vk_descriptors.h" #include "vk_util.h" @@ -32,6 +33,7 @@ #include "tu_rmv.h" #include "tu_sampler.h" #include "tu_subsampled_image.h" +#include "fdl/fd6_format_table.h" static inline uint8_t * pool_base(struct tu_descriptor_pool *pool) @@ -57,6 +59,13 @@ descriptor_size(struct tu_device *dev, return FDL6_TEX_CONST_DWORDS * 4 * (subsampled ? 3 : 2); case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + if (dev->physical_device->enable_ssbo_emulation) { + /* With SSBO emulation, use a single R32_UINT emulated 2D + * descriptor instead of multiple format-specific buffer + * descriptors. + */ + return FDL6_TEX_CONST_DWORDS * 4; + } /* isam.v allows using a single 16-bit descriptor for both 16-bit and * 32-bit loads. If not available but 16-bit storage is still supported, * two separate descriptors are required. 
@@ -989,6 +998,71 @@ write_texel_buffer_descriptor_addr(uint32_t *dst, } } +template +static void +write_emulated_texel_buffer_descriptor_common(uint32_t *dst, + enum pipe_format format, + uint64_t addr, uint32_t elements) +{ + uint32_t blocksize_B = util_format_get_blocksize(format); + + const uint32_t alignment = 64; + uint64_t aligned_addr = addr & ~(uint64_t) (alignment - 1); + uint32_t offset_texels = uint32_t(addr - aligned_addr) / blocksize_B; + uint32_t elements_with_offset = elements + offset_texels; + + uint32_t width = MIN2(elements_with_offset, TU_TEXEL_BUFFER_WIDTH); + uint32_t height = MIN2(DIV_ROUND_UP(elements_with_offset, width), + TU_TEXEL_BUFFER_MAX_HEIGHT); + uint32_t depth = elements_with_offset + ? DIV_ROUND_UP(elements_with_offset, width * height) + : 0; + uint32_t layer_size = width * height * blocksize_B; + enum a6xx_tile_mode tile_mode = TILE6_LINEAR; + enum a6xx_format texture_format = + fd6_texture_format(format, tile_mode, false); + enum a3xx_color_swap swap = fd6_texture_swap(format, tile_mode, false); + + memset(dst, 0, FDL6_TEX_CONST_DWORDS * sizeof(uint32_t)); + + dst[0] = A6XX_TEX_MEMOBJ_0_TILE_MODE(tile_mode) | + COND(util_format_is_srgb(format), A6XX_TEX_MEMOBJ_0_SRGB) | + A6XX_TEX_MEMOBJ_0_FMT(texture_format) | + A6XX_TEX_MEMOBJ_0_SWAP(swap); + dst[1] = A6XX_TEX_MEMOBJ_1_WIDTH(width) | A6XX_TEX_MEMOBJ_1_HEIGHT(height); + dst[2] = A6XX_TEX_MEMOBJ_2_PITCH(width * blocksize_B) | + A6XX_TEX_MEMOBJ_2_TYPE(A6XX_TEX_3D); + dst[3] = A6XX_TEX_MEMOBJ_3_ARRAY_PITCH(depth > 1 ? 
layer_size : 0); + dst[4] = aligned_addr; + dst[5] = (aligned_addr >> 32) | A6XX_TEX_MEMOBJ_5_DEPTH(depth); + dst[6] = A6XX_TEX_MEMOBJ_6_MIN_LOD_CLAMP(0); + /* Would be read by resbase to provide robustness guarantees */ + uint64_t encoded = MIN2(elements, TU_MAX_EMULATED_TEXEL_ELEMENTS); + encoded |= uint64_t(offset_texels & (alignment - 1)) << 30llu; + encoded <<= 6; + dst[7] = A6XX_TEX_MEMOBJ_7_FLAG_LO(encoded & 0x7FFFFFF); + dst[8] = A6XX_TEX_MEMOBJ_8_FLAG_HI(encoded >> 26); + + tu_desc_set_swiz(dst, tu_swiz(X, Y, Z, W)); +} + +template +static void +write_emulated_texel_buffer_descriptor_addr( + uint32_t *dst, const VkDescriptorAddressInfoEXT *buffer_info) +{ + if (!buffer_info || buffer_info->address == 0) { + memset(dst, 0, FDL6_TEX_CONST_DWORDS * sizeof(uint32_t)); + return; + } + + enum pipe_format format = vk_format_to_pipe_format(buffer_info->format); + uint32_t blocksize_B = util_format_get_blocksize(format); + uint32_t elements = blocksize_B ? (buffer_info->range / blocksize_B) : 0; + write_emulated_texel_buffer_descriptor_common( + dst, format, buffer_info->address, elements); +} + static void write_texel_buffer_descriptor(uint32_t *dst, const VkBufferView buffer_view) { @@ -1001,6 +1075,25 @@ write_texel_buffer_descriptor(uint32_t *dst, const VkBufferView buffer_view) } } +template +static void +write_emulated_texel_buffer_descriptor(uint32_t *dst, + const VkBufferView buffer_view) +{ + if (buffer_view == VK_NULL_HANDLE) { + memset(dst, 0, FDL6_TEX_CONST_DWORDS * sizeof(uint32_t)); + return; + } + + VK_FROM_HANDLE(tu_buffer_view, view, buffer_view); + + enum pipe_format format = vk_format_to_pipe_format(view->vk.format); + uint32_t elements = view->vk.elements; + write_emulated_texel_buffer_descriptor_common( + dst, format, vk_buffer_address(view->vk.buffer, view->vk.offset), + elements); +} + static VkDescriptorAddressInfoEXT buffer_info_to_address(const VkDescriptorBufferInfo *buffer_info) { @@ -1022,6 +1115,18 @@ write_buffer_descriptor_addr(const 
struct tu_device *device, const VkDescriptorAddressInfoEXT *buffer_info) { const struct fd_dev_info *info = device->physical_device->info; + + if (device->physical_device->enable_ssbo_emulation) { + memset(dst, 0, FDL6_TEX_CONST_DWORDS * sizeof(uint32_t)); + if (!buffer_info || buffer_info->address == 0) + return; + uint32_t blocksize_B = util_format_get_blocksize(PIPE_FORMAT_R32_UINT); + uint32_t elements = blocksize_B ? (buffer_info->range / blocksize_B) : 0; + write_emulated_texel_buffer_descriptor_common( + dst, PIPE_FORMAT_R32_UINT, buffer_info->address, elements); + return; + } + /* This prevents any misconfiguration, but 16-bit descriptor capable of both * 16-bit and 32-bit access through isam.v will of course only be functional * when 16-bit storage is supported. */ @@ -1199,10 +1304,14 @@ tu_GetDescriptorEXT( write_buffer_descriptor_addr(device, dest, pDescriptorInfo->data.pStorageBuffer); break; case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: - write_texel_buffer_descriptor_addr(dest, pDescriptorInfo->data.pUniformTexelBuffer); - break; case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - write_texel_buffer_descriptor_addr(dest, pDescriptorInfo->data.pStorageTexelBuffer); + if (device->physical_device->enable_texel_buffer_emulation) { + write_emulated_texel_buffer_descriptor_addr( + dest, pDescriptorInfo->data.pUniformTexelBuffer); + } else { + write_texel_buffer_descriptor_addr( + dest, pDescriptorInfo->data.pUniformTexelBuffer); + } break; case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: write_image_descriptor(dest, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, @@ -1331,7 +1440,13 @@ tu_update_descriptor_sets(const struct tu_device *device, break; case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - write_texel_buffer_descriptor(ptr, writeset->pTexelBufferView[j]); + if (device->physical_device->enable_texel_buffer_emulation) { + write_emulated_texel_buffer_descriptor( + ptr, writeset->pTexelBufferView[j]); + } else { + 
write_texel_buffer_descriptor(ptr, + writeset->pTexelBufferView[j]); + } break; case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: @@ -1681,7 +1796,12 @@ tu_update_descriptor_set_with_template( break; case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - write_texel_buffer_descriptor(ptr, *(VkBufferView *) src); + if (device->physical_device->enable_texel_buffer_emulation) { + write_emulated_texel_buffer_descriptor( + ptr, *(VkBufferView *) src); + } else { + write_texel_buffer_descriptor(ptr, *(VkBufferView *) src); + } break; case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index 2e755551dca..bd8d3c35de2 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -71,6 +71,12 @@ tu_device_get_cache_uuid(struct tu_physical_device *device, void *uuid) _mesa_blake3_update(&ctx, &driver_flags, sizeof(driver_flags)); _mesa_blake3_update(&ctx, &device->uche_trap_base, sizeof(device->uche_trap_base)); + _mesa_blake3_update(&ctx, &device->instance->allow_oob_indirect_ubo_loads, + sizeof(device->instance->allow_oob_indirect_ubo_loads)); + _mesa_blake3_update(&ctx, &device->enable_texel_buffer_emulation, + sizeof(device->enable_texel_buffer_emulation)); + _mesa_blake3_update(&ctx, &device->enable_ssbo_emulation, + sizeof(device->enable_ssbo_emulation)); _mesa_blake3_final(&ctx, blake3); memcpy(uuid, blake3, VK_UUID_SIZE); return 0; @@ -165,8 +171,10 @@ get_device_extensions(const struct tu_physical_device *device, (!device->info->props.has_sw_fuse || device->has_raytracing); *ext = (struct vk_device_extension_table) { .table = { - .KHR_8bit_storage = device->info->props.storage_8bit, - .KHR_16bit_storage = device->info->props.storage_16bit, + .KHR_8bit_storage = device->info->props.storage_8bit && + !device->enable_ssbo_emulation, + .KHR_16bit_storage = 
device->info->props.storage_16bit && + !device->enable_ssbo_emulation, .KHR_acceleration_structure = has_raytracing, .KHR_bind_memory2 = true, .KHR_buffer_device_address = true, @@ -1108,9 +1116,13 @@ tu_get_properties(struct tu_physical_device *pdevice, props->maxImageDimension3D = (1 << 11); props->maxImageDimensionCube = (1 << 14); props->maxImageArrayLayers = (1 << (pdevice->info->props.is_a702 ? 8 : 11)); - props->maxTexelBufferElements = MAX_TEXEL_ELEMENTS; + props->maxTexelBufferElements = + pdevice->enable_texel_buffer_emulation ? TU_MAX_EMULATED_TEXEL_ELEMENTS : MAX_TEXEL_ELEMENTS; props->maxUniformBufferRange = MAX_UNIFORM_BUFFER_RANGE; - props->maxStorageBufferRange = MAX_STORAGE_BUFFER_RANGE; + props->maxStorageBufferRange = + pdevice->enable_ssbo_emulation + ? TU_MAX_EMULATED_TEXEL_ELEMENTS * 4 + : MAX_STORAGE_BUFFER_RANGE; props->maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE; props->maxMemoryAllocationCount = UINT32_MAX; props->maxSamplerAllocationCount = 64 * 1024; @@ -1413,6 +1425,9 @@ tu_get_properties(struct tu_physical_device *pdevice, props->storageBufferDescriptorSize = FDL6_TEX_CONST_DWORDS * 4 * (1 + COND(pdevice->info->props.storage_16bit && !pdevice->info->props.has_isam_v, 1) + COND(pdevice->info->props.storage_8bit, 1)); + if (pdevice->enable_ssbo_emulation) { + props->storageBufferDescriptorSize = FDL6_TEX_CONST_DWORDS * 4; + } props->robustStorageBufferDescriptorSize = props->storageBufferDescriptorSize; props->accelerationStructureDescriptorSize = 4 * FDL6_TEX_CONST_DWORDS; @@ -1687,6 +1702,11 @@ tu_physical_device_init(struct tu_physical_device *device, device->has_cached_non_coherent_memory = device->level1_dcache_size > 0 && !DETECT_ARCH_ARM; + if (fd_dev_gen(&device->dev_id) >= 7) { + device->enable_texel_buffer_emulation = instance->enable_texel_buffer_emulation; + device->enable_ssbo_emulation = instance->enable_ssbo_emulation; + } + device->memory.type_count = 1; device->memory.types[0] = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | 
@@ -1850,6 +1870,8 @@ static const driOptionDescription tu_dri_options[] = { DRI_CONF_TU_ENABLE_SOFTFLOAT32(false) DRI_CONF_TU_EMULATE_ALPHA_TO_COVERAGE(false) DRI_CONF_TU_AUTOTUNE_ALGORITHM() + DRI_CONF_TU_ENABLE_TEXEL_BUFFER_EMULATION(false) + DRI_CONF_TU_ENABLE_SSBO_EMULATION(false) DRI_CONF_SECTION_END }; @@ -1884,6 +1906,10 @@ tu_init_dri_options(struct tu_instance *instance) driQueryOptionb(&instance->dri_options, "tu_emulate_alpha_to_coverage"); instance->autotune_algo = driQueryOptionstr(&instance->dri_options, "tu_autotune_algorithm"); + instance->enable_texel_buffer_emulation = + driQueryOptionb(&instance->dri_options, "tu_enable_texel_buffer_emulation"); + instance->enable_ssbo_emulation = + driQueryOptionb(&instance->dri_options, "tu_enable_ssbo_emulation"); } static uint32_t instance_count = 0; diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index c9f521fcc15..8d064c58f94 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -143,6 +143,9 @@ struct tu_physical_device /* Whether performance counter selector registers can be written by userspace CSes. */ bool is_perf_cntr_selectable; + bool enable_texel_buffer_emulation; + bool enable_ssbo_emulation; + struct { uint32_t non_lazy_type_count; uint32_t type_count; @@ -240,6 +243,13 @@ struct tu_instance /* Configuration option to use a specific autotune algorithm by default. */ const char *autotune_algo; + + /* D3D12 doesn't have documented limit for texel buffer or SSBO size, in practice + * some games expect up to (1 << 29) elements, which is higher than A6XX or + * A7XX hardware can support. 
+ */ + bool enable_texel_buffer_emulation; + bool enable_ssbo_emulation; }; VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance, VK_OBJECT_TYPE_INSTANCE) diff --git a/src/freedreno/vulkan/tu_shader.cc b/src/freedreno/vulkan/tu_shader.cc index 7a37e60aab5..53f92144391 100644 --- a/src/freedreno/vulkan/tu_shader.cc +++ b/src/freedreno/vulkan/tu_shader.cc @@ -596,6 +596,148 @@ build_bindless(struct tu_device *dev, nir_builder *b, return nir_bindless_resource_ir3(b, 32, desc_offset, .desc_set = set); } +static nir_def * +build_texel_buffer_size(nir_builder *b, nir_def *desc, nir_def **offset_out) +{ + assert(nir_def_is_intrinsic(desc)); + nir_def *encoded_data = nir_resbase_ir3(b, 32, desc); + nir_def *encoded_data_lo = nir_channel(b, encoded_data, 0); + nir_def *encoded_data_hi = nir_channel(b, encoded_data, 1); + + nir_def *size_lo = nir_ishr_imm(b, encoded_data_lo, 6); + nir_def *size_hi = nir_ishl_imm(b, encoded_data_hi, 20); + nir_def *size = nir_iand_imm(b, nir_ior(b, size_lo, size_hi), + TU_MAX_EMULATED_TEXEL_ELEMENTS); + + if (offset_out) + *offset_out = nir_ishr_imm(b, encoded_data_hi, 10); + + return size; +} + +static nir_def * +build_texel_buffer_as_image_coords(nir_builder *b, + nir_def *offset, + nir_def *desc) +{ + nir_def *base_offset = nullptr; + nir_def *real_size = build_texel_buffer_size(b, desc, &base_offset); + nir_def *oob = nir_ige(b, offset, real_size); + + offset = nir_iadd(b, offset, base_offset); + + nir_def *x = nir_umod_imm(b, offset, TU_TEXEL_BUFFER_WIDTH); + nir_def *tmp = nir_udiv_imm(b, offset, TU_TEXEL_BUFFER_WIDTH); + nir_def *y = nir_umod_imm(b, tmp, TU_TEXEL_BUFFER_MAX_HEIGHT); + nir_def *z = nir_udiv_imm(b, tmp, TU_TEXEL_BUFFER_MAX_HEIGHT); + z = nir_bcsel(b, oob, nir_imm_int(b, 0xff), z); + + nir_def *coord3d = nir_vec3(b, x, y, z); + return coord3d; +} + +static void +lower_texel_buffers_to_image(nir_builder *b, + nir_intrinsic_instr *instr, + nir_def *bindless) +{ + switch (instr->intrinsic) { + case 
nir_intrinsic_bindless_image_load: + case nir_intrinsic_bindless_image_store: + case nir_intrinsic_bindless_image_atomic: + case nir_intrinsic_bindless_image_atomic_swap: { + b->cursor = nir_before_instr(&instr->instr); + + nir_def *coord = instr->src[1].ssa; + if (coord->num_components > 1) + coord = nir_channel(b, coord, 0); + nir_def *coord3d = + build_texel_buffer_as_image_coords(b, coord, bindless); + nir_src_rewrite(&instr->src[1], nir_pad_vector(b, coord3d, 4)); + nir_intrinsic_set_image_dim(instr, GLSL_SAMPLER_DIM_3D); + break; + } + case nir_intrinsic_bindless_image_size: { + nir_def_replace(&instr->def, + build_texel_buffer_size(b, bindless, nullptr)); + break; + } + default: + break; + } +} + +static bool +lower_ssbo_to_image(nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + if (intr->intrinsic == nir_intrinsic_load_ssbo) { + b->cursor = nir_after_instr(&intr->instr); + nir_def *bindless = nir_get_io_index_src(intr)->ssa; + nir_def *coord3d = build_texel_buffer_as_image_coords( + b, nir_get_io_offset_src(intr)->ssa, bindless); + nir_def *load = nir_bindless_image_load( + b, intr->def.num_components, intr->def.bit_size, bindless, + nir_pad_vec4(b, coord3d), nir_imm_zero(b, 1, 32) /* sample index */, + nir_imm_zero(b, 1, 32) /* lod */, .image_dim = GLSL_SAMPLER_DIM_3D, + .format = PIPE_FORMAT_R32_UINT, .access = nir_intrinsic_access(intr)); + nir_def_replace(&intr->def, load); + return true; + } else if (intr->intrinsic == nir_intrinsic_store_ssbo) { + b->cursor = nir_after_instr(&intr->instr); + nir_def *bindless = nir_get_io_index_src(intr)->ssa; + nir_def *coord3d = build_texel_buffer_as_image_coords( + b, nir_get_io_offset_src(intr)->ssa, bindless); + nir_bindless_image_store( + b, bindless, nir_pad_vec4(b, coord3d), + nir_imm_zero(b, 1, 32) /* sample index */, + nir_get_io_data_src(intr)->ssa, nir_imm_zero(b, 1, 32) /* lod */, + .image_dim = GLSL_SAMPLER_DIM_3D, .format = PIPE_FORMAT_R32_UINT, + .access = nir_intrinsic_access(intr), 
.src_type = nir_type_uint32); + nir_instr_remove(&intr->instr); + return true; + } else if (intr->intrinsic == nir_intrinsic_get_ssbo_size) { + b->cursor = nir_after_instr(&intr->instr); + nir_def *bindless = nir_get_io_index_src(intr)->ssa; + nir_def *num_elements = build_texel_buffer_size(b, bindless, NULL); + nir_def *size = nir_amul_imm(b, num_elements, sizeof(uint32_t)); + nir_def_replace(&intr->def, size); + return true; + } else if (intr->intrinsic == nir_intrinsic_ssbo_atomic) { + b->cursor = nir_after_instr(&intr->instr); + nir_def *bindless = nir_get_io_index_src(intr)->ssa; + nir_def *coord3d = build_texel_buffer_as_image_coords( + b, nir_get_io_offset_src(intr)->ssa, bindless); + enum pipe_format format = + intr->def.bit_size == 64 ? PIPE_FORMAT_R64_UINT : PIPE_FORMAT_R32_UINT; + nir_def *atomic = nir_bindless_image_atomic( + b, intr->def.bit_size, bindless, nir_pad_vec4(b, coord3d), + nir_imm_zero(b, 1, 32) /* sample index */, + nir_get_io_data_src(intr)->ssa, .image_dim = GLSL_SAMPLER_DIM_3D, + .format = format, .access = nir_intrinsic_access(intr), + .atomic_op = nir_intrinsic_atomic_op(intr)); + nir_def_replace(&intr->def, atomic); + return true; + } else if (intr->intrinsic == nir_intrinsic_ssbo_atomic_swap) { + b->cursor = nir_after_instr(&intr->instr); + nir_def *bindless = nir_get_io_index_src(intr)->ssa; + nir_def *coord3d = build_texel_buffer_as_image_coords( + b, nir_get_io_offset_src(intr)->ssa, bindless); + enum pipe_format format = + intr->def.bit_size == 64 ? 
PIPE_FORMAT_R64_UINT : PIPE_FORMAT_R32_UINT; + nir_def *atomic_swap = nir_bindless_image_atomic_swap( + b, intr->def.bit_size, bindless, nir_pad_vec4(b, coord3d), + nir_imm_zero(b, 1, 32) /* sample index */, + nir_get_io_data_src(intr)->ssa, intr->src[3].ssa, + .image_dim = GLSL_SAMPLER_DIM_3D, .format = format, + .access = nir_intrinsic_access(intr), + .atomic_op = nir_intrinsic_atomic_op(intr)); + nir_def_replace(&intr->def, atomic_swap); + return true; + } + + return false; +} + static void lower_image_deref(struct tu_device *dev, nir_builder *b, nir_intrinsic_instr *instr, struct tu_shader *shader, @@ -605,6 +747,11 @@ lower_image_deref(struct tu_device *dev, nir_builder *b, nir_def *bindless = build_bindless(dev, b, deref, 0, shader, layout, 0, false); nir_rewrite_image_intrinsic(instr, bindless, nir_image_intrinsic_type_bindless); + + if (dev->physical_device->enable_texel_buffer_emulation && + nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_BUF) { + lower_texel_buffers_to_image(b, instr, bindless); + } } static bool @@ -869,6 +1016,31 @@ lower_tex_immutable(struct tu_device *dev, } } +static void +lower_tex_texel_buffer_to_image(nir_builder *b, + nir_tex_instr *tex, + uint32_t tex_bindless_idx) +{ + if (tex->op == nir_texop_txf) { + int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord); + if (coord_idx >= 0) { + nir_def *coord = tex->src[coord_idx].src.ssa; + if (coord->num_components > 1) + coord = nir_channel(b, coord, 0); + nir_def *coord3d = build_texel_buffer_as_image_coords( + b, coord, tex->src[tex_bindless_idx].src.ssa); + nir_src_rewrite(&tex->src[coord_idx].src, coord3d); + + tex->sampler_dim = GLSL_SAMPLER_DIM_3D; + tex->coord_components = 3; + } + } else if (tex->op == nir_texop_txs) { + nir_def_replace( + &tex->def, + build_texel_buffer_size(b, tex->src[tex_bindless_idx].src.ssa, nullptr)); + } +} + static bool lower_tex_impl(nir_builder *b, nir_tex_instr *tex, struct tu_device *dev, struct tu_shader *shader, const struct 
tu_pipeline_layout *layout, @@ -899,6 +1071,11 @@ lower_tex_impl(nir_builder *b, nir_tex_instr *tex, struct tu_device *dev, tex->src[tex_src_idx].src_type = nir_tex_src_texture_offset; } + if (dev->physical_device->enable_texel_buffer_emulation && + tex->sampler_dim == GLSL_SAMPLER_DIM_BUF) { + lower_tex_texel_buffer_to_image(b, tex, tex_src_idx); + } + return true; } @@ -1275,6 +1452,12 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev, nir_metadata_none, ¶ms); + if (dev->physical_device->enable_ssbo_emulation) { + progress |= nir_lower_io_to_scalar(shader, nir_var_mem_ssbo, NULL, NULL); + progress |= nir_shader_intrinsics_pass(shader, lower_ssbo_to_image, + nir_metadata_control_flow, NULL); + } + /* Remove now-unused variables so that when we gather the shader info later * they won't be counted. */ diff --git a/src/util/00-mesa-defaults.conf b/src/util/00-mesa-defaults.conf index 614968d57b1..8e4547d7388 100644 --- a/src/util/00-mesa-defaults.conf +++ b/src/util/00-mesa-defaults.conf @@ -1449,6 +1449,17 @@ TODO: document the other workarounds. + + + + + + diff --git a/src/util/driconf.h b/src/util/driconf.h index e618123d817..6713b16eb1b 100644 --- a/src/util/driconf.h +++ b/src/util/driconf.h @@ -672,6 +672,14 @@ DRI_CONF_OPT_S_NODEF(tu_autotune_algorithm, \ "Set the preferred autotune algorithm") +#define DRI_CONF_TU_ENABLE_TEXEL_BUFFER_EMULATION(def) \ +DRI_CONF_OPT_B(tu_enable_texel_buffer_emulation, def, \ + "Emulate texel buffer to allow higher limit for elements that is in line with what some D3D12 games expect") + +#define DRI_CONF_TU_ENABLE_SSBO_EMULATION(def) \ +DRI_CONF_OPT_B(tu_enable_ssbo_emulation, def, \ + "Emulate SSBO to allow higher limit for elements that is in line with what some D3D12 games expect") + /** * \brief Honeykrisp specific configuration options */