From 9b322347264ee8dedd3b666259172aec06bc1986 Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Wed, 10 Jun 2026 08:16:12 +0200 Subject: [PATCH] tu: Add option to raise the maximum SSBO size Emulates SSBOs via global memory; the real SSBO size and global base address are stored in the descriptor. The size can be accessed using resbase; the base address is parsed manually from the descriptor by passing the bindless base address into the shader via a driver UBO or const file. nir_lower_ssbo is used to lower SSBO accesses to global memory when the buffer size exceeds the limit. We also use it to insert bounds checks on global memory. The final code for SSBO accesses looks like this: if (@get_ssbo_size >= max_storage_buffer_range_bytes) { if (offset < @get_ssbo_size) { // global memory access using base (read from the descriptor) + offset } else { // do nothing (stores) or return 0 (loads) } } else { // original SSBO access } A new pass is added to lower @load_ssbo_address generated by nir_lower_ssbo. We set native_offset=true for nir_lower_ssbo to make sure it doesn't generate 64-bit address math. The new pass then transforms @load/store_global into @load/store_global_ir3 passing the 32-bit offset from @load_ssbo_address. 
Signed-off-by: Job Noorman Co-authored-by: Danylo Piliaiev Reviewed-by: Emma Anholt Part-of: --- src/freedreno/fdl/fd6_view.cc | 16 +- src/freedreno/fdl/freedreno_layout.h | 9 +- src/freedreno/ir3/ir3_shader.c | 2 + src/freedreno/ir3/ir3_shader.h | 4 +- src/freedreno/isa/ir3-cat6.xml | 2 +- src/freedreno/vulkan/tu_cmd_buffer.cc | 134 ++++++++++++++++- src/freedreno/vulkan/tu_common.h | 1 + src/freedreno/vulkan/tu_descriptor_set.cc | 9 +- src/freedreno/vulkan/tu_device.cc | 11 +- src/freedreno/vulkan/tu_device.h | 1 + src/freedreno/vulkan/tu_drirc_gen.py | 3 + src/freedreno/vulkan/tu_shader.cc | 170 ++++++++++++++++++++++ src/freedreno/vulkan/tu_shader.h | 3 + 13 files changed, 354 insertions(+), 11 deletions(-) diff --git a/src/freedreno/fdl/fd6_view.cc b/src/freedreno/fdl/fd6_view.cc index a4a89336371..67f2095f8da 100644 --- a/src/freedreno/fdl/fd6_view.cc +++ b/src/freedreno/fdl/fd6_view.cc @@ -574,7 +574,8 @@ template void fdl6_buffer_view_init(uint32_t *descriptor, enum pipe_format format, const uint8_t (&swiz)[4], uint64_t iova, uint32_t size, - uint32_t struct_size_texels) + uint32_t struct_size_texels, + enum fdl_ssbo_emulation_mode ssbo_emulation) { unsigned elem_size = util_format_get_blocksize(format); unsigned elements = size / elem_size; @@ -609,6 +610,19 @@ fdl6_buffer_view_init(uint32_t *descriptor, enum pipe_format format, A6XX_TEX_MEMOBJ_2_TYPE(A6XX_TEX_BUFFER); descriptor[4] = base_iova; descriptor[5] = base_iova >> 32; + + if (ssbo_emulation == FDL_SSBO_EMULATION_ENABLED) { + /* resbase returns 0 if size is 0 */ + if (descriptor[1] == 0) { + descriptor[1] = A6XX_TEX_MEMOBJ_1_WIDTH(1); + } + + uint64_t encoded_size = (uint64_t) size << 6ull; + descriptor[7] = A6XX_TEX_MEMOBJ_7_FLAG_LO(encoded_size & 0x7FFFFFF); + descriptor[8] = A6XX_TEX_MEMOBJ_8_FLAG_HI(encoded_size >> 26); + descriptor[11] = iova; + descriptor[12] = iova >> 32; + } } else if (CHIP >= A8XX) { descriptor[0] = A8XX_TEX_MEMOBJ_0_INSTANCE_DESC_BASE_LO(iova); descriptor[1] = 
A8XX_TEX_MEMOBJ_1_BASE_HI(iova >> 32) | diff --git a/src/freedreno/fdl/freedreno_layout.h b/src/freedreno/fdl/freedreno_layout.h index 10b24f1227e..d0e482d6e92 100644 --- a/src/freedreno/fdl/freedreno_layout.h +++ b/src/freedreno/fdl/freedreno_layout.h @@ -481,11 +481,18 @@ template void fdl6_view_init(struct fdl6_view *view, const struct fdl_layout **layouts, const struct fdl_view_args *args, bool has_z24uint_s8uint); + +enum fdl_ssbo_emulation_mode { + FDL_SSBO_EMULATION_DISABLED, + FDL_SSBO_EMULATION_ENABLED, +}; + template void fdl6_buffer_view_init(uint32_t *descriptor, enum pipe_format format, const uint8_t (&swiz)[4], uint64_t iova, uint32_t size, - uint32_t struct_size_texels = 1); + uint32_t struct_size_texels = 1, + enum fdl_ssbo_emulation_mode ssbo_emulation = FDL_SSBO_EMULATION_DISABLED); #endif #endif /* FREEDRENO_LAYOUT_H_ */ diff --git a/src/freedreno/ir3/ir3_shader.c b/src/freedreno/ir3/ir3_shader.c index 5652cd75b37..6b8419bb278 100644 --- a/src/freedreno/ir3/ir3_shader.c +++ b/src/freedreno/ir3/ir3_shader.c @@ -1014,6 +1014,8 @@ ir3_const_alloc_type_to_string(enum ir3_const_alloc_type type) return "dyn_descriptor_offset"; case IR3_CONST_ALLOC_INLINE_UNIFORM_ADDRS: return "inline_uniform_addresses"; + case IR3_CONST_ALLOC_BINDLESS_BASE_ADDRS: + return "bindless_base_addresses"; case IR3_CONST_ALLOC_DRIVER_PARAMS: return "driver_params"; case IR3_CONST_ALLOC_UBO_RANGES: diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h index ad1390292fc..89fc8f84a6a 100644 --- a/src/freedreno/ir3/ir3_shader.h +++ b/src/freedreno/ir3/ir3_shader.h @@ -226,7 +226,9 @@ enum ir3_const_alloc_type { IR3_CONST_ALLOC_PRIMITIVE_PARAM = 10, /* Common, mapping from varying location to offset. 
*/ IR3_CONST_ALLOC_PRIMITIVE_MAP = 11, - IR3_CONST_ALLOC_MAX = 12, + /* For SSBO emulation */ + IR3_CONST_ALLOC_BINDLESS_BASE_ADDRS = 12, + IR3_CONST_ALLOC_MAX = 13, }; struct ir3_const_allocation { diff --git a/src/freedreno/isa/ir3-cat6.xml b/src/freedreno/isa/ir3-cat6.xml index 4736c645ab6..46b01a065e4 100644 --- a/src/freedreno/isa/ir3-cat6.xml +++ b/src/freedreno/isa/ir3-cat6.xml @@ -1312,7 +1312,7 @@ TODO rename UAV src to "UAV" so disasm_field_cb can find it easily? RESourceBASE - returns the value encoded into TEX_CONST_7_FLAG_LO/HI - of the given descriptor. + of the given descriptor. Returns 0 if NUM_ELEMENTS == 0. 001100 diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index 15e4562a416..60ad3a8ef90 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -4740,6 +4740,9 @@ tu_bind_descriptor_sets(struct tu_cmd_buffer *cmd, descriptors_state->set_iova[idx] = set ? (set->va | BINDLESS_DESCRIPTOR_64B) : 0; + if (cmd->device->physical_device->enable_ssbo_emulation) + cmd->state.dirty |= TU_CMD_DIRTY_SHADER_CONSTS; + if (!set) continue; @@ -4805,6 +4808,12 @@ tu_bind_descriptor_sets(struct tu_cmd_buffer *cmd, va += desc_offset << offset_shift; va += offset; + + if (cmd->device->physical_device->enable_ssbo_emulation) { + dst_desc[11] = va; + dst_desc[12] = va >> 32; + } + unsigned new_offset = (va & 0x3f) >> offset_shift; va &= ~0x3full; dst_desc[2] = @@ -4907,7 +4916,8 @@ tu_set_descriptor_buffer_offsets( info->pOffsets[i]) | BINDLESS_DESCRIPTOR_64B; - if (set_layout->has_inline_uniforms) + if (cmd->device->physical_device->enable_ssbo_emulation || + set_layout->has_inline_uniforms) cmd->state.dirty |= TU_CMD_DIRTY_SHADER_CONSTS; } @@ -7420,6 +7430,7 @@ TU_GENX(tu_CmdBeginCustomResolveEXT); static uint32_t tu6_user_consts_size(const struct tu_const_state *const_state, bool ldgk, + bool load_shader_consts_via_preamble, mesa_shader_stage type) { uint32_t dwords = 0; @@ -7436,6 
+7447,15 @@ tu6_user_consts_size(const struct tu_const_state *const_state, dwords += 8 * const_state->num_inline_ubos; } + if (const_state->num_bindless_base_addresses > 0) { + if (load_shader_consts_via_preamble) { + if (const_state->bindless_base_addrs_ubo.idx != -1) + dwords += 6 + (2 * const_state->num_bindless_base_addresses + 4); + } else { + dwords += 4 + align(const_state->num_bindless_base_addresses * 2, 4); + } + } + return dwords; } @@ -7616,17 +7636,116 @@ tu6_const_size(struct tu_cmd_buffer *cmd, } bool ldgk = cmd->device->physical_device->info->props.load_inline_uniforms_via_preamble_ldgk; + bool load_shader_consts_via_preamble = + cmd->device->physical_device->info->props.load_shader_consts_via_preamble; if (compute) { dwords += - tu6_user_consts_size(&cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state, ldgk, MESA_SHADER_COMPUTE); + tu6_user_consts_size(&cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state, + ldgk, + load_shader_consts_via_preamble, + MESA_SHADER_COMPUTE); } else { for (uint32_t type = MESA_SHADER_VERTEX; type <= MESA_SHADER_FRAGMENT; type++) - dwords += tu6_user_consts_size(&cmd->state.shaders[type]->const_state, ldgk, (mesa_shader_stage) type); + dwords += tu6_user_consts_size(&cmd->state.shaders[type]->const_state, + ldgk, + load_shader_consts_via_preamble, + (mesa_shader_stage) type); } return dwords; } +static void +fill_bindless_base_addresses(const struct tu_const_state *const_state, + struct tu_descriptor_state *descriptors, + uint64_t *addresses) +{ + assert(const_state->num_bindless_base_addresses <= MAX_SETS); + + for (unsigned i = 0; i < const_state->num_bindless_base_addresses; i++) + addresses[i] = descriptors->set_iova[i] & ~0x3f; +} + +static void +tu6_emit_bindless_base_addresses(struct tu_cs *cs, + const struct tu_const_state *const_state, + const struct ir3_const_state *ir_const_state, + mesa_shader_stage type, + struct tu_descriptor_state *descriptors) +{ + uint64_t addresses[MAX_SETS] = {0}; + 
fill_bindless_base_addresses(const_state, descriptors, addresses); + + uint32_t offset = + ir_const_state->allocs.consts[IR3_CONST_ALLOC_BINDLESS_BASE_ADDRS] + .offset_vec4; + unsigned num_dwords = align(const_state->num_bindless_base_addresses * 2, 4); + + tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_dwords); + tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) | + CP_LOAD_STATE6_0_NUM_UNIT(num_dwords / 4)); + tu_cs_emit(cs, 0); + tu_cs_emit(cs, 0); + + for (unsigned i = 0; i < const_state->num_bindless_base_addresses; i++) + tu_cs_emit_qw(cs, addresses[i]); + + for (unsigned i = const_state->num_bindless_base_addresses * 2; + i < num_dwords; i++) + tu_cs_emit(cs, 0); +} + +static void +tu7_emit_bindless_base_addresses(struct tu_cs *cs, + const struct tu_const_state *const_state, + mesa_shader_stage type, + struct tu_descriptor_state *descriptors) +{ + uint64_t addresses[MAX_SETS] = {0}; + int ubo_offset = const_state->bindless_base_addrs_ubo.idx; + if (ubo_offset < 0) + return; + + fill_bindless_base_addresses(const_state, descriptors, addresses); + + /* A7XX TODO: Emit data via sub_cs instead of NOP */ + uint64_t iova = tu_cs_emit_data_nop( + cs, (uint32_t *) addresses, const_state->num_bindless_base_addresses * 2, 4); + unsigned size_vec4s = + DIV_ROUND_UP(const_state->num_bindless_base_addresses * 2, 4); + + tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 5); + tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(ubo_offset) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) | + CP_LOAD_STATE6_0_NUM_UNIT(1)); + tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); + tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); + tu_cs_emit_qw(cs, iova | ((uint64_t) A6XX_UBO_1_SIZE(size_vec4s) << 32)); +} + +static void 
+tu_emit_bindless_base_addresses(struct tu_cs *cs, + const struct tu_const_state *const_state, + const struct ir3_const_state *ir_const_state, + mesa_shader_stage type, + struct tu_descriptor_state *descriptors) +{ + if (!const_state->num_bindless_base_addresses) + return; + + if (cs->device->physical_device->info->props.load_shader_consts_via_preamble) { + tu7_emit_bindless_base_addresses(cs, const_state, type, descriptors); + } else { + tu6_emit_bindless_base_addresses(cs, const_state, ir_const_state, type, + descriptors); + } +} + template static struct tu_draw_state tu_emit_consts(struct tu_cmd_buffer *cmd, bool compute) @@ -7661,6 +7780,11 @@ tu_emit_consts(struct tu_cmd_buffer *cmd, bool compute) cmd->state.shaders[MESA_SHADER_COMPUTE]->variant->constlen, MESA_SHADER_COMPUTE, tu_get_descriptors_state(cmd, VK_PIPELINE_BIND_POINT_COMPUTE)); + tu_emit_bindless_base_addresses( + &cs, &cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state, + cmd->state.shaders[MESA_SHADER_COMPUTE]->variant->const_state, + MESA_SHADER_COMPUTE, + tu_get_descriptors_state(cmd, VK_PIPELINE_BIND_POINT_COMPUTE)); } else { struct tu_descriptor_state *descriptors = tu_get_descriptors_state(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS); @@ -7674,6 +7798,10 @@ tu_emit_consts(struct tu_cmd_buffer *cmd, bool compute) tu_emit_inline_ubo(&cs, &link->tu_const_state, &link->const_state, link->constlen, (mesa_shader_stage) type, descriptors); + tu_emit_bindless_base_addresses(&cs, &link->tu_const_state, + &link->const_state, + (mesa_shader_stage) type, + descriptors); } } diff --git a/src/freedreno/vulkan/tu_common.h b/src/freedreno/vulkan/tu_common.h index 141685b1a27..bf5c503dc55 100644 --- a/src/freedreno/vulkan/tu_common.h +++ b/src/freedreno/vulkan/tu_common.h @@ -110,6 +110,7 @@ #define TU_D3D12_MAX_TEXEL_BUFFER_ELEMENTS ((1u << 29u) - 1u) #define TU_TEXEL_BUFFER_MAX_WIDTH (1u << 14) #define TU_TEXEL_BUFFER_MAX_HEIGHT (1u << 14) +#define TU_D3D12_MAX_STORAGE_BUFFER_RANGE_BYTES ((1u << 31u) - 1u) /* 
We use ldc for uniform buffer loads, just like the Qualcomm driver, so * expose the same maximum range. * TODO: The SIZE bitfield is 15 bits, and in 4-dword units, so the actual diff --git a/src/freedreno/vulkan/tu_descriptor_set.cc b/src/freedreno/vulkan/tu_descriptor_set.cc index cb7b2e4e4d7..f4bee33428c 100644 --- a/src/freedreno/vulkan/tu_descriptor_set.cc +++ b/src/freedreno/vulkan/tu_descriptor_set.cc @@ -1131,11 +1131,14 @@ write_buffer_descriptor_addr(const struct tu_device *device, if (!buffer_info || buffer_info->address == 0) return; + enum fdl_ssbo_emulation_mode ssbo_emulation_mode = + device->physical_device->enable_ssbo_emulation ? FDL_SSBO_EMULATION_ENABLED : FDL_SSBO_EMULATION_DISABLED; + uint64_t va = buffer_info->address; uint32_t range = buffer_info->range; if (info->props.storage_16bit) { - fdl6_buffer_view_init(dst, PIPE_FORMAT_R16_UINT, tu_swiz(X, Y, Z, W), va, range); + fdl6_buffer_view_init(dst, PIPE_FORMAT_R16_UINT, tu_swiz(X, Y, Z, W), va, range, 1, ssbo_emulation_mode); dst += FDL6_TEX_CONST_DWORDS; } @@ -1143,12 +1146,12 @@ write_buffer_descriptor_addr(const struct tu_device *device, * 16-bit descriptor cannot be used for 32-bit loads through isam.v. 
*/ if (!info->props.storage_16bit || !info->props.has_isam_v) { - fdl6_buffer_view_init(dst, PIPE_FORMAT_R32_UINT, tu_swiz(X, Y, Z, W), va, range); + fdl6_buffer_view_init(dst, PIPE_FORMAT_R32_UINT, tu_swiz(X, Y, Z, W), va, range, 1, ssbo_emulation_mode); dst += FDL6_TEX_CONST_DWORDS; } if (info->props.storage_8bit) { - fdl6_buffer_view_init(dst, PIPE_FORMAT_R8_UINT, tu_swiz(X, Y, Z, W), va, range); + fdl6_buffer_view_init(dst, PIPE_FORMAT_R8_UINT, tu_swiz(X, Y, Z, W), va, range, 1, ssbo_emulation_mode); dst += FDL6_TEX_CONST_DWORDS; } } diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index f718dac1613..82efb07af90 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -105,6 +105,8 @@ tu_device_get_cache_uuid(struct tu_physical_device *device, void *uuid) sizeof(device->instance->drirc.misc.allow_oob_indirect_ubo_loads)); _mesa_blake3_update(&ctx, &device->enable_texel_buffer_emulation, sizeof(device->enable_texel_buffer_emulation)); + _mesa_blake3_update(&ctx, &device->enable_ssbo_emulation, + sizeof(device->enable_ssbo_emulation)); _mesa_blake3_final(&ctx, blake3); memcpy(uuid, blake3, VK_UUID_SIZE); @@ -1155,7 +1157,10 @@ tu_get_properties(struct tu_physical_device *pdevice, ? TU_D3D12_MAX_TEXEL_BUFFER_ELEMENTS : pdevice->info->props.max_texel_buffer_range_elements; props->maxUniformBufferRange = MAX_UNIFORM_BUFFER_RANGE; - props->maxStorageBufferRange = pdevice->info->props.max_storage_buffer_range_bytes; + props->maxStorageBufferRange = + pdevice->enable_ssbo_emulation + ? 
TU_D3D12_MAX_STORAGE_BUFFER_RANGE_BYTES + : pdevice->info->props.max_storage_buffer_range_bytes; props->maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE; props->maxMemoryAllocationCount = UINT32_MAX; props->maxSamplerAllocationCount = 64 * 1024; @@ -1732,6 +1737,10 @@ tu_physical_device_init(struct tu_physical_device *device, assert(fd_dev_gen(&device->dev_id) == 7); device->enable_texel_buffer_emulation = instance->drirc.misc.enable_texel_buffer_emulation; } + if (device->info->props.max_storage_buffer_range_bytes < TU_D3D12_MAX_STORAGE_BUFFER_RANGE_BYTES) { + assert(fd_dev_gen(&device->dev_id) == 7); + device->enable_ssbo_emulation = instance->drirc.misc.enable_ssbo_emulation; + } } if (tu_device_get_cache_uuid(device, device->cache_uuid)) { diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index 3bce030edd8..d32a7d80f6d 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -153,6 +153,7 @@ struct tu_physical_device bool is_perf_cntr_selectable; bool enable_texel_buffer_emulation; + bool enable_ssbo_emulation; struct { uint32_t non_lazy_type_count; diff --git a/src/freedreno/vulkan/tu_drirc_gen.py b/src/freedreno/vulkan/tu_drirc_gen.py index dfe322254fa..95164eee1b2 100644 --- a/src/freedreno/vulkan/tu_drirc_gen.py +++ b/src/freedreno/vulkan/tu_drirc_gen.py @@ -112,6 +112,9 @@ def declare_options(): B("tu_enable_texel_buffer_emulation", False, "Emulate texel buffers to allow a higher limit for elements that is in line with what some D3D12 games expect", c_name="enable_texel_buffer_emulation"), + B("tu_enable_ssbo_emulation", False, + "Emulate SSBOs to allow a higher limit for elements that is in line with what some D3D12 games expect", + c_name="enable_ssbo_emulation"), ] features_options = [] diff --git a/src/freedreno/vulkan/tu_shader.cc b/src/freedreno/vulkan/tu_shader.cc index 5f8d49092c0..8c228e9ff66 100644 --- a/src/freedreno/vulkan/tu_shader.cc +++ b/src/freedreno/vulkan/tu_shader.cc @@ -9,6 
+9,7 @@ #include "nir/nir_xfb_info.h" #include "spirv/nir_spirv.h" +#include "util/macros.h" #include "util/mesa-blake3.h" #include "vk_nir.h" #include "vk_nir_convert_ycbcr.h" @@ -1367,6 +1368,21 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev, ir3_const_alloc(const_allocs, IR3_CONST_ALLOC_INLINE_UNIFORM_ADDRS, ldgk_consts, 1); + if (dev->physical_device->enable_ssbo_emulation) { + const_state->num_bindless_base_addresses = layout->num_sets; + const_state->bindless_base_const_offset_vec4 = const_allocs->max_const_offset_vec4; + + if (dev->physical_device->reserved_set_idx >= 0) { + const_state->num_bindless_base_addresses = + MAX2(layout->num_sets, (unsigned) dev->physical_device->reserved_set_idx + 1); + } + + if (!dev->compiler->info->props.load_shader_consts_via_preamble) { + ir3_const_alloc(const_allocs, IR3_CONST_ALLOC_BINDLESS_BASE_ADDRS, + DIV_ROUND_UP(const_state->num_bindless_base_addresses * 2, 4), 1); + } + } + struct lower_instr_params params = { .dev = dev, .shader = tu_shader, @@ -1557,6 +1573,141 @@ tu_nir_lower_ssbo_descriptor(nir_shader *shader, (void *)dev); } +static nir_def * +build_ssbo_size_from_resbase(nir_builder *b, nir_def *desc) +{ + assert(nir_def_is_intrinsic(desc)); + nir_def *encoded_data = nir_resbase_ir3(b, 32, desc); + nir_def *encoded_data_lo = nir_channel(b, encoded_data, 0); + nir_def *encoded_data_hi = nir_channel(b, encoded_data, 1); + + nir_def *size_lo = nir_ishr_imm(b, encoded_data_lo, 6); + nir_def *size_hi = nir_ishl_imm(b, encoded_data_hi, 20); + + return nir_ior(b, size_lo, size_hi); +} + +static nir_intrinsic_instr * +get_ssbo_bindless(nir_intrinsic_instr *intr) +{ + nir_def *buffer = nir_get_io_index_src(intr)->ssa; + assert(nir_def_is_intrinsic(buffer)); + + nir_intrinsic_instr *bindless = nir_def_as_intrinsic(buffer); + assert(bindless->intrinsic == nir_intrinsic_bindless_resource_ir3); + + return bindless; +} + +static nir_def * +build_ssbo_global_addr(nir_builder *b, + nir_intrinsic_instr *bindless, + 
struct tu_shader *shader, + bool load_shader_consts_via_preamble, + const struct ir3_const_allocations *const_allocs) +{ + nir_def *set_base; + + if (load_shader_consts_via_preamble) { + set_base = + ir3_load_driver_ubo(b, 2, &shader->const_state.bindless_base_addrs_ubo, + nir_intrinsic_desc_set(bindless) * 2); + } else { + const unsigned dword_base = + const_allocs->consts[IR3_CONST_ALLOC_BINDLESS_BASE_ADDRS].offset_vec4 * + 4 + + nir_intrinsic_desc_set(bindless) * 2; + + set_base = + nir_load_const_ir3(b, 2, 32, nir_imm_int(b, 0), .base = dword_base); + } + + nir_def *descriptor_offset = nir_iadd_imm( + b, nir_imul_imm(b, bindless->src[0].ssa, FDL6_TEX_CONST_DWORDS * 4), + 11 * 4); + nir_def *descriptor_words = nir_load_global_ir3( + b, 2, 32, nir_pack_64_2x32(b, set_base), descriptor_offset, + .access = (enum gl_access_qualifier)( + ACCESS_NON_WRITEABLE | ACCESS_CAN_REORDER | ACCESS_CAN_SPECULATE), + .align_mul = 4, .align_offset = 0); + + return nir_pack_64_2x32(b, descriptor_words); +} + +struct lower_ssbo_address_size_state { + struct tu_shader *shader; + const struct ir3_const_allocations *const_allocs; + bool load_shader_consts_via_preamble; +}; + +static bool +lower_ssbo_address_size(nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + switch (intr->intrinsic) { + case nir_intrinsic_load_ssbo_address: + case nir_intrinsic_get_ssbo_size: + break; + default: + return false; + } + + auto state = static_cast(data); + b->cursor = nir_before_instr(&intr->instr); + + if (intr->intrinsic == nir_intrinsic_load_ssbo_address) { + nir_def *base = build_ssbo_global_addr( + b, get_ssbo_bindless(intr), state->shader, + state->load_shader_consts_via_preamble, state->const_allocs); + nir_def *offset = intr->src[1].ssa; + + nir_foreach_use_safe (use, &intr->def) { + nir_instr *use_instr = nir_src_use_instr(use); + b->cursor = nir_before_instr(use_instr); + + nir_intrinsic_instr *use_intr = nir_instr_as_intrinsic(use_instr); + + switch (use_intr->intrinsic) { + case 
nir_intrinsic_global_atomic: + case nir_intrinsic_global_atomic_swap: { + nir_def *addr = nir_iadd(b, base, nir_u2u64(b, offset)); + nir_src_rewrite(nir_get_io_offset_src(use_intr), addr); + break; + } + case nir_intrinsic_load_global: { + nir_def *load = nir_load_global_ir3( + b, use_intr->def.num_components, use_intr->def.bit_size, base, + offset, .access = nir_intrinsic_access(use_intr)); + nir_def_replace(&use_intr->def, load); + break; + } + case nir_intrinsic_store_global: { + nir_store_global_ir3(b, nir_get_io_data_src(use_intr)->ssa, base, + offset, + .access = nir_intrinsic_access(use_intr)); + nir_instr_remove(use_instr); + break; + } + default: + UNREACHABLE("unexpected use of @load_ssbo_address"); + } + } + } else { + nir_def *ssbo_size = + build_ssbo_size_from_resbase(b, nir_get_io_index_src(intr)->ssa); + nir_def_replace(&intr->def, ssbo_size); + } + + return true; +} + +static bool +tu_nir_lower_ssbo_address_size( + nir_shader *shader, const struct lower_ssbo_address_size_state *state) +{ + return nir_shader_intrinsics_pass(shader, lower_ssbo_address_size, + nir_metadata_control_flow, (void *) state); +} + struct lower_fdm_state { nir_variable *layer_var; nir_variable *viewport_var; @@ -2958,6 +3109,7 @@ tu_shader_init(struct tu_device *dev, const void *key_data, size_t key_size) shader->const_state.fdm_ubo.idx = -1; shader->const_state.dynamic_offsets_ubo.idx = -1; shader->const_state.inline_uniforms_ubo.idx = -1; + shader->const_state.bindless_base_addrs_ubo.idx = -1; return shader; } @@ -3251,6 +3403,24 @@ tu_shader_create(struct tu_device *dev, */ NIR_PASS(_, nir, tu_nir_lower_ssbo_descriptor, dev); + if (dev->physical_device->enable_ssbo_emulation) { + nir_lower_ssbo_options options = { + .native_offset = true, + .min_ssbo_size = dev->compiler->info->props.max_storage_buffer_range_bytes, + .bounds_check = true, + }; + + NIR_PASS(_, nir, nir_lower_ssbo, &options); + + struct lower_ssbo_address_size_state lower_ssbo_state = { + .shader = shader, + 
.const_allocs = &const_allocs, + .load_shader_consts_via_preamble = + dev->compiler->info->props.load_shader_consts_via_preamble, + }; + NIR_PASS(_, nir, tu_nir_lower_ssbo_address_size, &lower_ssbo_state); + } + const struct ir3_shader_options options = { .api_wavesize = key->api_wavesize, .real_wavesize = key->real_wavesize, diff --git a/src/freedreno/vulkan/tu_shader.h b/src/freedreno/vulkan/tu_shader.h index 5817546a1c8..1cd0e890272 100644 --- a/src/freedreno/vulkan/tu_shader.h +++ b/src/freedreno/vulkan/tu_shader.h @@ -50,10 +50,13 @@ struct tu_const_state uint32_t dynamic_offset_loc; unsigned num_inline_ubos; struct tu_inline_ubo ubos[MAX_INLINE_UBOS]; + uint32_t num_bindless_base_addresses; + uint32_t bindless_base_const_offset_vec4; struct ir3_driver_ubo fdm_ubo; struct ir3_driver_ubo dynamic_offsets_ubo; struct ir3_driver_ubo inline_uniforms_ubo; + struct ir3_driver_ubo bindless_base_addrs_ubo; }; struct tu_shader