mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 06:58:05 +02:00
tu: Add option to raise the maximum SSBO size
Emulates SSBOs via 3D image access; the real SSBO size and start offset (needed due to image alignment requirements) are stored in the descriptor and accessed via resbase. This also disables storage_16bit and storage_8bit to simplify SSBO emulation, since no known D3D12 games that need SSBO emulation require those capabilities. The proprietary driver has a more complex solution, which involves uniform branching on buffer size and using ldg for over-the-limit buffers.
This commit is contained in:
parent
80e194d7f1
commit
36a9593f01
5 changed files with 120 additions and 4 deletions
|
|
@ -59,6 +59,13 @@ descriptor_size(struct tu_device *dev,
|
|||
return FDL6_TEX_CONST_DWORDS * 4 * (subsampled ? 3 : 2);
|
||||
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
|
||||
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
|
||||
if (dev->physical_device->enable_ssbo_emulation) {
|
||||
/* With SSBO emulation, use a single R32_UINT emulated 2D
|
||||
* descriptor instead of multiple format-specific buffer
|
||||
* descriptors.
|
||||
*/
|
||||
return FDL6_TEX_CONST_DWORDS * 4;
|
||||
}
|
||||
/* isam.v allows using a single 16-bit descriptor for both 16-bit and
|
||||
* 32-bit loads. If not available but 16-bit storage is still supported,
|
||||
* two separate descriptors are required.
|
||||
|
|
@ -1108,6 +1115,18 @@ write_buffer_descriptor_addr(const struct tu_device *device,
|
|||
const VkDescriptorAddressInfoEXT *buffer_info)
|
||||
{
|
||||
const struct fd_dev_info *info = device->physical_device->info;
|
||||
|
||||
if (device->physical_device->enable_ssbo_emulation) {
|
||||
memset(dst, 0, FDL6_TEX_CONST_DWORDS * sizeof(uint32_t));
|
||||
if (!buffer_info || buffer_info->address == 0)
|
||||
return;
|
||||
uint32_t blocksize_B = util_format_get_blocksize(PIPE_FORMAT_R32_UINT);
|
||||
uint32_t elements = blocksize_B ? (buffer_info->range / blocksize_B) : 0;
|
||||
write_emulated_texel_buffer_descriptor_common<CHIP>(
|
||||
dst, PIPE_FORMAT_R32_UINT, buffer_info->address, elements);
|
||||
return;
|
||||
}
|
||||
|
||||
/* This prevents any misconfiguration, but 16-bit descriptor capable of both
|
||||
* 16-bit and 32-bit access through isam.v will of course only be functional
|
||||
* when 16-bit storage is supported. */
|
||||
|
|
|
|||
|
|
@ -75,6 +75,8 @@ tu_device_get_cache_uuid(struct tu_physical_device *device, void *uuid)
|
|||
sizeof(device->instance->allow_oob_indirect_ubo_loads));
|
||||
_mesa_blake3_update(&ctx, &device->enable_texel_buffer_emulation,
|
||||
sizeof(device->enable_texel_buffer_emulation));
|
||||
_mesa_blake3_update(&ctx, &device->enable_ssbo_emulation,
|
||||
sizeof(device->enable_ssbo_emulation));
|
||||
|
||||
memcpy(uuid, blake3, VK_UUID_SIZE);
|
||||
return 0;
|
||||
|
|
@ -168,8 +170,10 @@ get_device_extensions(const struct tu_physical_device *device,
|
|||
(!device->info->props.has_sw_fuse || device->has_raytracing);
|
||||
|
||||
*ext = (struct vk_device_extension_table) { .table = {
|
||||
.KHR_8bit_storage = device->info->props.storage_8bit,
|
||||
.KHR_16bit_storage = device->info->props.storage_16bit,
|
||||
.KHR_8bit_storage = device->info->props.storage_8bit &&
|
||||
!device->enable_ssbo_emulation,
|
||||
.KHR_16bit_storage = device->info->props.storage_16bit &&
|
||||
!device->enable_ssbo_emulation,
|
||||
.KHR_acceleration_structure = has_raytracing,
|
||||
.KHR_bind_memory2 = true,
|
||||
.KHR_buffer_device_address = true,
|
||||
|
|
@ -1115,7 +1119,10 @@ tu_get_properties(struct tu_physical_device *pdevice,
|
|||
props->maxTexelBufferElements =
|
||||
pdevice->enable_texel_buffer_emulation ? TU_MAX_EMULATED_TEXEL_ELEMENTS : MAX_TEXEL_ELEMENTS;
|
||||
props->maxUniformBufferRange = MAX_UNIFORM_BUFFER_RANGE;
|
||||
props->maxStorageBufferRange = MAX_STORAGE_BUFFER_RANGE;
|
||||
props->maxStorageBufferRange =
|
||||
pdevice->enable_ssbo_emulation
|
||||
? TU_MAX_EMULATED_TEXEL_ELEMENTS * 4
|
||||
: MAX_STORAGE_BUFFER_RANGE;
|
||||
props->maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE;
|
||||
props->maxMemoryAllocationCount = UINT32_MAX;
|
||||
props->maxSamplerAllocationCount = 64 * 1024;
|
||||
|
|
@ -1418,6 +1425,9 @@ tu_get_properties(struct tu_physical_device *pdevice,
|
|||
props->storageBufferDescriptorSize = FDL6_TEX_CONST_DWORDS * 4 * (1 +
|
||||
COND(pdevice->info->props.storage_16bit && !pdevice->info->props.has_isam_v, 1) +
|
||||
COND(pdevice->info->props.storage_8bit, 1));
|
||||
if (pdevice->enable_ssbo_emulation) {
|
||||
props->storageBufferDescriptorSize = FDL6_TEX_CONST_DWORDS * 4;
|
||||
}
|
||||
props->robustStorageBufferDescriptorSize =
|
||||
props->storageBufferDescriptorSize;
|
||||
props->accelerationStructureDescriptorSize = 4 * FDL6_TEX_CONST_DWORDS;
|
||||
|
|
@ -1685,6 +1695,7 @@ tu_physical_device_init(struct tu_physical_device *device,
|
|||
|
||||
if (fd_dev_gen(&device->dev_id) >= 7) {
|
||||
device->enable_texel_buffer_emulation = instance->enable_texel_buffer_emulation;
|
||||
device->enable_ssbo_emulation = instance->enable_ssbo_emulation;
|
||||
}
|
||||
|
||||
device->memory.type_count = 1;
|
||||
|
|
@ -1851,6 +1862,7 @@ static const driOptionDescription tu_dri_options[] = {
|
|||
DRI_CONF_TU_EMULATE_ALPHA_TO_COVERAGE(false)
|
||||
DRI_CONF_TU_AUTOTUNE_ALGORITHM()
|
||||
DRI_CONF_TU_ENABLE_TEXEL_BUFFER_EMULATION(false)
|
||||
DRI_CONF_TU_ENABLE_SSBO_EMULATION(false)
|
||||
DRI_CONF_SECTION_END
|
||||
};
|
||||
|
||||
|
|
@ -1887,6 +1899,8 @@ tu_init_dri_options(struct tu_instance *instance)
|
|||
driQueryOptionstr(&instance->dri_options, "tu_autotune_algorithm");
|
||||
instance->enable_texel_buffer_emulation =
|
||||
driQueryOptionb(&instance->dri_options, "tu_enable_texel_buffer_emulation");
|
||||
instance->enable_ssbo_emulation =
|
||||
driQueryOptionb(&instance->dri_options, "tu_enable_ssbo_emulation");
|
||||
}
|
||||
|
||||
static uint32_t instance_count = 0;
|
||||
|
|
|
|||
|
|
@ -144,6 +144,7 @@ struct tu_physical_device
|
|||
bool is_perf_cntr_selectable;
|
||||
|
||||
bool enable_texel_buffer_emulation;
|
||||
bool enable_ssbo_emulation;
|
||||
|
||||
struct {
|
||||
uint32_t non_lazy_type_count;
|
||||
|
|
@ -243,11 +244,12 @@ struct tu_instance
|
|||
/* Configuration option to use a specific autotune algorithm by default. */
|
||||
const char *autotune_algo;
|
||||
|
||||
/* D3D12 doesn't have documented limit for texel buffer size, in practice
|
||||
/* D3D12 doesn't have documented limit for texel buffer or SSBO size, in practice
|
||||
* some games expect up to (1 << 29) elements, which is higher than A6XX or
|
||||
* A7XX hardware can support.
|
||||
*/
|
||||
bool enable_texel_buffer_emulation;
|
||||
bool enable_ssbo_emulation;
|
||||
};
|
||||
VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance,
|
||||
VK_OBJECT_TYPE_INSTANCE)
|
||||
|
|
|
|||
|
|
@ -669,6 +669,77 @@ lower_texel_buffers_to_image(nir_builder *b,
|
|||
}
|
||||
}
|
||||
|
||||
/* Per-intrinsic lowering callback (driven by nir_shader_intrinsics_pass) that
 * rewrites SSBO intrinsics into bindless 3D image intrinsics.
 *
 * Handled intrinsics: load_ssbo, store_ssbo, get_ssbo_size, ssbo_atomic and
 * ssbo_atomic_swap. In every case the SSBO index source is reused as the
 * bindless image handle and the replacement is emitted right after the
 * original instruction.
 *
 * \param b     NIR builder used to emit the replacement instructions.
 * \param intr  Intrinsic instruction being visited.
 * \param data  Unused.
 * \return true if \p intr was one of the handled SSBO intrinsics and was
 *         rewritten, false otherwise.
 */
static bool
|
||||
lower_ssbo_to_image(nir_builder *b, nir_intrinsic_instr *intr, void *data)
|
||||
{
|
||||
if (intr->intrinsic == nir_intrinsic_load_ssbo) {
|
||||
b->cursor = nir_after_instr(&intr->instr);
|
||||
nir_def *bindless = nir_get_io_index_src(intr)->ssa;
|
||||
/* NOTE(review): the SSBO offset source is passed straight to the texel
 * buffer coordinate helper; presumably the helper accounts for the unit
 * of the offset and the start offset stored in the descriptor -- confirm.
 */
nir_def *coord3d = build_texel_buffer_as_image_coords(
|
||||
b, nir_get_io_offset_src(intr)->ssa, bindless);
|
||||
/* The load keeps the component count and bit size of the original def. */
nir_def *load = nir_bindless_image_load(
|
||||
b, intr->def.num_components, intr->def.bit_size, bindless,
|
||||
nir_pad_vec4(b, coord3d), nir_imm_zero(b, 1, 32) /* sample index */,
|
||||
nir_imm_zero(b, 1, 32) /* lod */, .image_dim = GLSL_SAMPLER_DIM_3D,
|
||||
.format = PIPE_FORMAT_R32_UINT, .access = nir_intrinsic_access(intr));
|
||||
nir_def_replace(&intr->def, load);
|
||||
return true;
|
||||
} else if (intr->intrinsic == nir_intrinsic_store_ssbo) {
|
||||
b->cursor = nir_after_instr(&intr->instr);
|
||||
nir_def *bindless = nir_get_io_index_src(intr)->ssa;
|
||||
nir_def *coord3d = build_texel_buffer_as_image_coords(
|
||||
b, nir_get_io_offset_src(intr)->ssa, bindless);
|
||||
nir_bindless_image_store(
|
||||
b, bindless, nir_pad_vec4(b, coord3d),
|
||||
nir_imm_zero(b, 1, 32) /* sample index */,
|
||||
nir_get_io_data_src(intr)->ssa, nir_imm_zero(b, 1, 32) /* lod */,
|
||||
.image_dim = GLSL_SAMPLER_DIM_3D, .format = PIPE_FORMAT_R32_UINT,
|
||||
.access = nir_intrinsic_access(intr), .src_type = nir_type_uint32);
|
||||
/* A store has no def to replace, so the original instruction is removed. */
nir_instr_remove(&intr->instr);
|
||||
return true;
|
||||
} else if (intr->intrinsic == nir_intrinsic_get_ssbo_size) {
|
||||
b->cursor = nir_after_instr(&intr->instr);
|
||||
nir_def *bindless = nir_get_io_index_src(intr)->ssa;
|
||||
/* The helper yields an element count; scale it by the 32-bit element size
 * to produce the value that replaces the original size query.
 */
nir_def *num_elements = build_texel_buffer_size(b, bindless, NULL);
|
||||
nir_def *size = nir_amul_imm(b, num_elements, sizeof(uint32_t));
|
||||
nir_def_replace(&intr->def, size);
|
||||
return true;
|
||||
} else if (intr->intrinsic == nir_intrinsic_ssbo_atomic) {
|
||||
b->cursor = nir_after_instr(&intr->instr);
|
||||
nir_def *bindless = nir_get_io_index_src(intr)->ssa;
|
||||
nir_def *coord3d = build_texel_buffer_as_image_coords(
|
||||
b, nir_get_io_offset_src(intr)->ssa, bindless);
|
||||
/* The image format follows the atomic's result width: R64_UINT for 64-bit
 * results, R32_UINT otherwise.
 */
enum pipe_format format =
|
||||
intr->def.bit_size == 64 ? PIPE_FORMAT_R64_UINT : PIPE_FORMAT_R32_UINT;
|
||||
nir_def *atomic = nir_bindless_image_atomic(
|
||||
b, intr->def.bit_size, bindless, nir_pad_vec4(b, coord3d),
|
||||
nir_imm_zero(b, 1, 32) /* sample index */,
|
||||
nir_get_io_data_src(intr)->ssa, .image_dim = GLSL_SAMPLER_DIM_3D,
|
||||
.format = format, .access = nir_intrinsic_access(intr),
|
||||
.atomic_op = nir_intrinsic_atomic_op(intr));
|
||||
nir_def_replace(&intr->def, atomic);
|
||||
return true;
|
||||
} else if (intr->intrinsic == nir_intrinsic_ssbo_atomic_swap) {
|
||||
b->cursor = nir_after_instr(&intr->instr);
|
||||
nir_def *bindless = nir_get_io_index_src(intr)->ssa;
|
||||
nir_def *coord3d = build_texel_buffer_as_image_coords(
|
||||
b, nir_get_io_offset_src(intr)->ssa, bindless);
|
||||
enum pipe_format format =
|
||||
intr->def.bit_size == 64 ? PIPE_FORMAT_R64_UINT : PIPE_FORMAT_R32_UINT;
|
||||
/* Same as the plain atomic, with src[3] of the original intrinsic forwarded
 * as the additional data operand of the image atomic swap.
 */
nir_def *atomic_swap = nir_bindless_image_atomic_swap(
|
||||
b, intr->def.bit_size, bindless, nir_pad_vec4(b, coord3d),
|
||||
nir_imm_zero(b, 1, 32) /* sample index */,
|
||||
nir_get_io_data_src(intr)->ssa, intr->src[3].ssa,
|
||||
.image_dim = GLSL_SAMPLER_DIM_3D, .format = format,
|
||||
.access = nir_intrinsic_access(intr),
|
||||
.atomic_op = nir_intrinsic_atomic_op(intr));
|
||||
nir_def_replace(&intr->def, atomic_swap);
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Any other intrinsic is left untouched. */
return false;
|
||||
}
|
||||
|
||||
static void
|
||||
lower_image_deref(struct tu_device *dev, nir_builder *b,
|
||||
nir_intrinsic_instr *instr, struct tu_shader *shader,
|
||||
|
|
@ -1383,6 +1454,12 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev,
|
|||
nir_metadata_none,
|
||||
&params);
|
||||
|
||||
if (dev->physical_device->enable_ssbo_emulation) {
|
||||
progress |= nir_lower_io_to_scalar(shader, nir_var_mem_ssbo, NULL, NULL);
|
||||
progress |= nir_shader_intrinsics_pass(shader, lower_ssbo_to_image,
|
||||
nir_metadata_control_flow, NULL);
|
||||
}
|
||||
|
||||
/* Remove now-unused variables so that when we gather the shader info later
|
||||
* they won't be counted.
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -700,6 +700,10 @@
|
|||
DRI_CONF_OPT_B(tu_enable_texel_buffer_emulation, def, \
|
||||
"Emulate texel buffer to allow higher limit for elements that is in line with what some D3D12 games expect")
|
||||
|
||||
/* Declares the driconf option "tu_enable_ssbo_emulation" through
 * DRI_CONF_OPT_B; `def` is forwarded unchanged (presumably the option's
 * default value -- the driver's option table passes `false`).
 */
#define DRI_CONF_TU_ENABLE_SSBO_EMULATION(def) \
|
||||
DRI_CONF_OPT_B(tu_enable_ssbo_emulation, def, \
|
||||
"Emulate SSBO to allow higher limit for elements that is in line with what some D3D12 games expect")
|
||||
|
||||
/**
|
||||
* \brief Honeykrisp specific configuration options
|
||||
*/
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue