mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 02:38:04 +02:00
Merge branch 'turnip/feature/texel-buffer-emulation' into 'main'
tu: Add option to raise the maximum texel buffer size See merge request mesa/mesa!39499
This commit is contained in:
commit
2136291543
15 changed files with 393 additions and 11 deletions
|
|
@ -807,6 +807,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
|
|||
case nir_intrinsic_load_gmem_frag_offset_ir3:
|
||||
case nir_intrinsic_bindless_resource_ir3:
|
||||
case nir_intrinsic_ray_intersection_ir3:
|
||||
case nir_intrinsic_resbase_ir3:
|
||||
case nir_intrinsic_load_attribute_payload_intel:
|
||||
case nir_intrinsic_load_urb_vec4_intel:
|
||||
case nir_intrinsic_load_urb_lsc_intel:
|
||||
|
|
|
|||
|
|
@ -1683,6 +1683,8 @@ intrinsic("prefetch_sam_ir3", [1, 1], flags=[CAN_REORDER])
|
|||
intrinsic("prefetch_tex_ir3", [1], flags=[CAN_REORDER])
|
||||
intrinsic("prefetch_ubo_ir3", [1], flags=[CAN_REORDER])
|
||||
|
||||
intrinsic("resbase_ir3", src_comp=[1], dest_comp=2, flags=[CAN_ELIMINATE, CAN_REORDER])
|
||||
|
||||
# Panfrost-specific intrinsic for loading vertex attributes. Takes explicit
|
||||
# vertex and instance IDs which we need in order to implement vertex attribute
|
||||
# divisor with non-zero base instance on v9+.
|
||||
|
|
|
|||
|
|
@ -1913,6 +1913,7 @@ ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags)
|
|||
return false;
|
||||
break;
|
||||
case OPC_RESINFO:
|
||||
case OPC_RESBASE:
|
||||
if (n != 0)
|
||||
return false;
|
||||
break;
|
||||
|
|
|
|||
|
|
@ -3122,6 +3122,7 @@ INSTR3NODST(STLW)
|
|||
INSTR3NODST(STP)
|
||||
INSTR1(RESINFO)
|
||||
INSTR1(RESFMT)
|
||||
INSTR1(RESBASE)
|
||||
INSTR2(ATOMIC_ADD)
|
||||
INSTR2(ATOMIC_SUB)
|
||||
INSTR2(ATOMIC_XCHG)
|
||||
|
|
|
|||
|
|
@ -3522,6 +3522,20 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
|||
array_insert(ctx->block, ctx->block->keeps, ldc);
|
||||
break;
|
||||
}
|
||||
case nir_intrinsic_resbase_ir3: {
|
||||
struct ir3_instruction *ibo = ir3_ssbo_to_ibo(ctx, intr->src[0]);
|
||||
struct ir3_instruction *resbase = ir3_RESBASE(b, ibo, 0);
|
||||
resbase->cat6.iim_val = 1;
|
||||
resbase->cat6.d = 1;
|
||||
resbase->cat6.type = TYPE_U32;
|
||||
resbase->cat6.typed = false;
|
||||
/* resbase has no writemask and always writes out 2 components */
|
||||
resbase->dsts[0]->wrmask = MASK(2);
|
||||
ir3_handle_bindless_cat6(resbase, intr->src[0]);
|
||||
ir3_handle_nonuniform(resbase, intr);
|
||||
ir3_split_dest(b, dst, resbase, 0, 2);
|
||||
break;
|
||||
}
|
||||
case nir_intrinsic_rotate:
|
||||
case nir_intrinsic_shuffle_up_uniform_ir3:
|
||||
case nir_intrinsic_shuffle_down_uniform_ir3:
|
||||
|
|
|
|||
|
|
@ -254,7 +254,7 @@ sync_update(struct ir3_legalize_state *state, struct ir3_compiler *compiler,
|
|||
|
||||
if (is_tex_or_prefetch(n) && !has_dummy_dst(n)) {
|
||||
regmask_set(&state->needs_sy, n->dsts[0]);
|
||||
} else if (n->opc == OPC_RESINFO && !has_dummy_dst(n)) {
|
||||
} else if ((n->opc == OPC_RESINFO || n->opc == OPC_RESBASE) && !has_dummy_dst(n)) {
|
||||
regmask_set(&state->needs_ss, n->dsts[0]);
|
||||
} else if (is_load(n)) {
|
||||
if (is_local_mem_load(n))
|
||||
|
|
|
|||
|
|
@ -464,6 +464,7 @@ validate_instr(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr)
|
|||
switch (instr->opc) {
|
||||
case OPC_RESINFO:
|
||||
case OPC_RESFMT:
|
||||
case OPC_RESBASE:
|
||||
if (instr->dsts_count > 0)
|
||||
validate_reg_size(ctx, instr->dsts[0], instr->cat6.type);
|
||||
validate_reg_size(ctx, instr->srcs[0], instr->cat6.type);
|
||||
|
|
|
|||
|
|
@ -1311,7 +1311,8 @@ TODO rename UAV src to "UAV" so disasm_field_cb can find it easily?
|
|||
|
||||
<bitset name="resbase" extends="#instruction-cat6-a6xx-ibo-1src">
|
||||
<doc>
|
||||
RESourceBASE - returns the address of the bindless descriptor
|
||||
RESourceBASE - returns the value encoded into TEX_CONST_7_FLAG_LO/HI
|
||||
of the given descriptor.
|
||||
</doc>
|
||||
<pattern low="14" high="19">001100</pattern> <!-- OPC -->
|
||||
</bitset>
|
||||
|
|
|
|||
|
|
@ -110,6 +110,9 @@
|
|||
/* match the latest Qualcomm driver which is also a hw limit on later gens */
|
||||
#define MAX_STORAGE_BUFFER_RANGE (1u << 27)
|
||||
#define MAX_TEXEL_ELEMENTS (1u << 27)
|
||||
#define TU_MAX_EMULATED_TEXEL_ELEMENTS ((1u << 30) - 1)
|
||||
#define TU_TEXEL_BUFFER_WIDTH (1u << 14)
|
||||
#define TU_TEXEL_BUFFER_MAX_HEIGHT (1u << 14)
|
||||
/* We use ldc for uniform buffer loads, just like the Qualcomm driver, so
|
||||
* expose the same maximum range.
|
||||
* TODO: The SIZE bitfield is 15 bits, and in 4-dword units, so the actual
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@
|
|||
#include "tu_descriptor_set.h"
|
||||
|
||||
#include "util/mesa-blake3.h"
|
||||
#include "util/format/u_format.h"
|
||||
#include "vk_acceleration_structure.h"
|
||||
#include "vk_descriptors.h"
|
||||
#include "vk_util.h"
|
||||
|
|
@ -32,6 +33,7 @@
|
|||
#include "tu_rmv.h"
|
||||
#include "tu_sampler.h"
|
||||
#include "tu_subsampled_image.h"
|
||||
#include "fdl/fd6_format_table.h"
|
||||
|
||||
static inline uint8_t *
|
||||
pool_base(struct tu_descriptor_pool *pool)
|
||||
|
|
@ -57,6 +59,13 @@ descriptor_size(struct tu_device *dev,
|
|||
return FDL6_TEX_CONST_DWORDS * 4 * (subsampled ? 3 : 2);
|
||||
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
|
||||
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
|
||||
if (dev->physical_device->enable_ssbo_emulation) {
|
||||
/* With SSBO emulation, use a single R32_UINT emulated 2D
|
||||
* descriptor instead of multiple format-specific buffer
|
||||
* descriptors.
|
||||
*/
|
||||
return FDL6_TEX_CONST_DWORDS * 4;
|
||||
}
|
||||
/* isam.v allows using a single 16-bit descriptor for both 16-bit and
|
||||
* 32-bit loads. If not available but 16-bit storage is still supported,
|
||||
* two separate descriptors are required.
|
||||
|
|
@ -989,6 +998,71 @@ write_texel_buffer_descriptor_addr(uint32_t *dst,
|
|||
}
|
||||
}
|
||||
|
||||
template <chip CHIP>
|
||||
static void
|
||||
write_emulated_texel_buffer_descriptor_common(uint32_t *dst,
|
||||
enum pipe_format format,
|
||||
uint64_t addr, uint32_t elements)
|
||||
{
|
||||
uint32_t blocksize_B = util_format_get_blocksize(format);
|
||||
|
||||
const uint32_t aligment = 64;
|
||||
uint64_t aligned_addr = addr & ~(uint64_t) (aligment - 1);
|
||||
uint32_t offset_texels = uint32_t(addr - aligned_addr) / blocksize_B;
|
||||
uint32_t elements_with_offset = elements + offset_texels;
|
||||
|
||||
uint32_t width = MIN2(elements_with_offset, TU_TEXEL_BUFFER_WIDTH);
|
||||
uint32_t height = MIN2(DIV_ROUND_UP(elements_with_offset, width),
|
||||
TU_TEXEL_BUFFER_MAX_HEIGHT);
|
||||
uint32_t depth = elements_with_offset
|
||||
? DIV_ROUND_UP(elements_with_offset, width * height)
|
||||
: 0;
|
||||
uint32_t layer_size = width * height * blocksize_B;
|
||||
enum a6xx_tile_mode tile_mode = TILE6_LINEAR;
|
||||
enum a6xx_format texture_format =
|
||||
fd6_texture_format(format, tile_mode, false);
|
||||
enum a3xx_color_swap swap = fd6_texture_swap(format, tile_mode, false);
|
||||
|
||||
memset(dst, 0, FDL6_TEX_CONST_DWORDS * sizeof(uint32_t));
|
||||
|
||||
dst[0] = A6XX_TEX_MEMOBJ_0_TILE_MODE(tile_mode) |
|
||||
COND(util_format_is_srgb(format), A6XX_TEX_MEMOBJ_0_SRGB) |
|
||||
A6XX_TEX_MEMOBJ_0_FMT(texture_format) |
|
||||
A6XX_TEX_MEMOBJ_0_SWAP(swap);
|
||||
dst[1] = A6XX_TEX_MEMOBJ_1_WIDTH(width) | A6XX_TEX_MEMOBJ_1_HEIGHT(height);
|
||||
dst[2] = A6XX_TEX_MEMOBJ_2_PITCH(width * blocksize_B) |
|
||||
A6XX_TEX_MEMOBJ_2_TYPE(A6XX_TEX_3D);
|
||||
dst[3] = A6XX_TEX_MEMOBJ_3_ARRAY_PITCH(depth > 1 ? layer_size : 0);
|
||||
dst[4] = aligned_addr;
|
||||
dst[5] = (aligned_addr >> 32) | A6XX_TEX_MEMOBJ_5_DEPTH(depth);
|
||||
dst[6] = A6XX_TEX_MEMOBJ_6_MIN_LOD_CLAMP(0);
|
||||
/* Would be read by resbase to provide robustness guarantees */
|
||||
uint64_t encoded = MIN2(elements, TU_MAX_EMULATED_TEXEL_ELEMENTS);
|
||||
encoded |= uint64_t(offset_texels & (aligment - 1)) << 30llu;
|
||||
encoded <<= 6;
|
||||
dst[7] = A6XX_TEX_MEMOBJ_7_FLAG_LO(encoded & 0x7FFFFFF);
|
||||
dst[8] = A6XX_TEX_MEMOBJ_8_FLAG_HI(encoded >> 26);
|
||||
|
||||
tu_desc_set_swiz<CHIP>(dst, tu_swiz(X, Y, Z, W));
|
||||
}
|
||||
|
||||
template <chip CHIP>
|
||||
static void
|
||||
write_emulated_texel_buffer_descriptor_addr(
|
||||
uint32_t *dst, const VkDescriptorAddressInfoEXT *buffer_info)
|
||||
{
|
||||
if (!buffer_info || buffer_info->address == 0) {
|
||||
memset(dst, 0, FDL6_TEX_CONST_DWORDS * sizeof(uint32_t));
|
||||
return;
|
||||
}
|
||||
|
||||
enum pipe_format format = vk_format_to_pipe_format(buffer_info->format);
|
||||
uint32_t blocksize_B = util_format_get_blocksize(format);
|
||||
uint32_t elements = blocksize_B ? (buffer_info->range / blocksize_B) : 0;
|
||||
write_emulated_texel_buffer_descriptor_common<CHIP>(
|
||||
dst, format, buffer_info->address, elements);
|
||||
}
|
||||
|
||||
static void
|
||||
write_texel_buffer_descriptor(uint32_t *dst, const VkBufferView buffer_view)
|
||||
{
|
||||
|
|
@ -1001,6 +1075,25 @@ write_texel_buffer_descriptor(uint32_t *dst, const VkBufferView buffer_view)
|
|||
}
|
||||
}
|
||||
|
||||
template <chip CHIP>
|
||||
static void
|
||||
write_emulated_texel_buffer_descriptor(uint32_t *dst,
|
||||
const VkBufferView buffer_view)
|
||||
{
|
||||
if (buffer_view == VK_NULL_HANDLE) {
|
||||
memset(dst, 0, FDL6_TEX_CONST_DWORDS * sizeof(uint32_t));
|
||||
return;
|
||||
}
|
||||
|
||||
VK_FROM_HANDLE(tu_buffer_view, view, buffer_view);
|
||||
|
||||
enum pipe_format format = vk_format_to_pipe_format(view->vk.format);
|
||||
uint32_t elements = view->vk.elements;
|
||||
write_emulated_texel_buffer_descriptor_common<CHIP>(
|
||||
dst, format, vk_buffer_address(view->vk.buffer, view->vk.offset),
|
||||
elements);
|
||||
}
|
||||
|
||||
static VkDescriptorAddressInfoEXT
|
||||
buffer_info_to_address(const VkDescriptorBufferInfo *buffer_info)
|
||||
{
|
||||
|
|
@ -1022,6 +1115,18 @@ write_buffer_descriptor_addr(const struct tu_device *device,
|
|||
const VkDescriptorAddressInfoEXT *buffer_info)
|
||||
{
|
||||
const struct fd_dev_info *info = device->physical_device->info;
|
||||
|
||||
if (device->physical_device->enable_ssbo_emulation) {
|
||||
memset(dst, 0, FDL6_TEX_CONST_DWORDS * sizeof(uint32_t));
|
||||
if (!buffer_info || buffer_info->address == 0)
|
||||
return;
|
||||
uint32_t blocksize_B = util_format_get_blocksize(PIPE_FORMAT_R32_UINT);
|
||||
uint32_t elements = blocksize_B ? (buffer_info->range / blocksize_B) : 0;
|
||||
write_emulated_texel_buffer_descriptor_common<CHIP>(
|
||||
dst, PIPE_FORMAT_R32_UINT, buffer_info->address, elements);
|
||||
return;
|
||||
}
|
||||
|
||||
/* This prevents any misconfiguration, but 16-bit descriptor capable of both
|
||||
* 16-bit and 32-bit access through isam.v will of course only be functional
|
||||
* when 16-bit storage is supported. */
|
||||
|
|
@ -1199,10 +1304,14 @@ tu_GetDescriptorEXT(
|
|||
write_buffer_descriptor_addr<CHIP>(device, dest, pDescriptorInfo->data.pStorageBuffer);
|
||||
break;
|
||||
case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
|
||||
write_texel_buffer_descriptor_addr<CHIP>(dest, pDescriptorInfo->data.pUniformTexelBuffer);
|
||||
break;
|
||||
case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
|
||||
write_texel_buffer_descriptor_addr<CHIP>(dest, pDescriptorInfo->data.pStorageTexelBuffer);
|
||||
if (device->physical_device->enable_texel_buffer_emulation) {
|
||||
write_emulated_texel_buffer_descriptor_addr<CHIP>(
|
||||
dest, pDescriptorInfo->data.pUniformTexelBuffer);
|
||||
} else {
|
||||
write_texel_buffer_descriptor_addr<CHIP>(
|
||||
dest, pDescriptorInfo->data.pUniformTexelBuffer);
|
||||
}
|
||||
break;
|
||||
case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
|
||||
write_image_descriptor(dest, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
|
||||
|
|
@ -1331,7 +1440,13 @@ tu_update_descriptor_sets(const struct tu_device *device,
|
|||
break;
|
||||
case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
|
||||
case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
|
||||
write_texel_buffer_descriptor(ptr, writeset->pTexelBufferView[j]);
|
||||
if (device->physical_device->enable_texel_buffer_emulation) {
|
||||
write_emulated_texel_buffer_descriptor<CHIP>(
|
||||
ptr, writeset->pTexelBufferView[j]);
|
||||
} else {
|
||||
write_texel_buffer_descriptor(ptr,
|
||||
writeset->pTexelBufferView[j]);
|
||||
}
|
||||
break;
|
||||
case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
|
||||
case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
|
||||
|
|
@ -1681,7 +1796,12 @@ tu_update_descriptor_set_with_template(
|
|||
break;
|
||||
case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
|
||||
case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
|
||||
write_texel_buffer_descriptor(ptr, *(VkBufferView *) src);
|
||||
if (device->physical_device->enable_texel_buffer_emulation) {
|
||||
write_emulated_texel_buffer_descriptor<CHIP>(
|
||||
ptr, *(VkBufferView *) src);
|
||||
} else {
|
||||
write_texel_buffer_descriptor(ptr, *(VkBufferView *) src);
|
||||
}
|
||||
break;
|
||||
case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
|
||||
case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
|
||||
|
|
|
|||
|
|
@ -71,6 +71,12 @@ tu_device_get_cache_uuid(struct tu_physical_device *device, void *uuid)
|
|||
_mesa_blake3_update(&ctx, &driver_flags, sizeof(driver_flags));
|
||||
_mesa_blake3_update(&ctx, &device->uche_trap_base, sizeof(device->uche_trap_base));
|
||||
_mesa_blake3_final(&ctx, blake3);
|
||||
_mesa_blake3_update(&ctx, &device->instance->allow_oob_indirect_ubo_loads,
|
||||
sizeof(device->instance->allow_oob_indirect_ubo_loads));
|
||||
_mesa_blake3_update(&ctx, &device->enable_texel_buffer_emulation,
|
||||
sizeof(device->enable_texel_buffer_emulation));
|
||||
_mesa_blake3_update(&ctx, &device->enable_ssbo_emulation,
|
||||
sizeof(device->enable_ssbo_emulation));
|
||||
|
||||
memcpy(uuid, blake3, VK_UUID_SIZE);
|
||||
return 0;
|
||||
|
|
@ -165,8 +171,10 @@ get_device_extensions(const struct tu_physical_device *device,
|
|||
(!device->info->props.has_sw_fuse || device->has_raytracing);
|
||||
|
||||
*ext = (struct vk_device_extension_table) { .table = {
|
||||
.KHR_8bit_storage = device->info->props.storage_8bit,
|
||||
.KHR_16bit_storage = device->info->props.storage_16bit,
|
||||
.KHR_8bit_storage = device->info->props.storage_8bit &&
|
||||
!device->enable_ssbo_emulation,
|
||||
.KHR_16bit_storage = device->info->props.storage_16bit &&
|
||||
!device->enable_ssbo_emulation,
|
||||
.KHR_acceleration_structure = has_raytracing,
|
||||
.KHR_bind_memory2 = true,
|
||||
.KHR_buffer_device_address = true,
|
||||
|
|
@ -1108,9 +1116,13 @@ tu_get_properties(struct tu_physical_device *pdevice,
|
|||
props->maxImageDimension3D = (1 << 11);
|
||||
props->maxImageDimensionCube = (1 << 14);
|
||||
props->maxImageArrayLayers = (1 << (pdevice->info->props.is_a702 ? 8 : 11));
|
||||
props->maxTexelBufferElements = MAX_TEXEL_ELEMENTS;
|
||||
props->maxTexelBufferElements =
|
||||
pdevice->enable_texel_buffer_emulation ? TU_MAX_EMULATED_TEXEL_ELEMENTS : MAX_TEXEL_ELEMENTS;
|
||||
props->maxUniformBufferRange = MAX_UNIFORM_BUFFER_RANGE;
|
||||
props->maxStorageBufferRange = MAX_STORAGE_BUFFER_RANGE;
|
||||
props->maxStorageBufferRange =
|
||||
pdevice->enable_ssbo_emulation
|
||||
? TU_MAX_EMULATED_TEXEL_ELEMENTS * 4
|
||||
: MAX_STORAGE_BUFFER_RANGE;
|
||||
props->maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE;
|
||||
props->maxMemoryAllocationCount = UINT32_MAX;
|
||||
props->maxSamplerAllocationCount = 64 * 1024;
|
||||
|
|
@ -1413,6 +1425,9 @@ tu_get_properties(struct tu_physical_device *pdevice,
|
|||
props->storageBufferDescriptorSize = FDL6_TEX_CONST_DWORDS * 4 * (1 +
|
||||
COND(pdevice->info->props.storage_16bit && !pdevice->info->props.has_isam_v, 1) +
|
||||
COND(pdevice->info->props.storage_8bit, 1));
|
||||
if (pdevice->enable_ssbo_emulation) {
|
||||
props->storageBufferDescriptorSize = FDL6_TEX_CONST_DWORDS * 4;
|
||||
}
|
||||
props->robustStorageBufferDescriptorSize =
|
||||
props->storageBufferDescriptorSize;
|
||||
props->accelerationStructureDescriptorSize = 4 * FDL6_TEX_CONST_DWORDS;
|
||||
|
|
@ -1687,6 +1702,11 @@ tu_physical_device_init(struct tu_physical_device *device,
|
|||
device->has_cached_non_coherent_memory =
|
||||
device->level1_dcache_size > 0 && !DETECT_ARCH_ARM;
|
||||
|
||||
if (fd_dev_gen(&device->dev_id) >= 7) {
|
||||
device->enable_texel_buffer_emulation = instance->enable_texel_buffer_emulation;
|
||||
device->enable_ssbo_emulation = instance->enable_ssbo_emulation;
|
||||
}
|
||||
|
||||
device->memory.type_count = 1;
|
||||
device->memory.types[0] =
|
||||
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
|
||||
|
|
@ -1850,6 +1870,8 @@ static const driOptionDescription tu_dri_options[] = {
|
|||
DRI_CONF_TU_ENABLE_SOFTFLOAT32(false)
|
||||
DRI_CONF_TU_EMULATE_ALPHA_TO_COVERAGE(false)
|
||||
DRI_CONF_TU_AUTOTUNE_ALGORITHM()
|
||||
DRI_CONF_TU_ENABLE_TEXEL_BUFFER_EMULATION(false)
|
||||
DRI_CONF_TU_ENABLE_SSBO_EMULATION(false)
|
||||
DRI_CONF_SECTION_END
|
||||
};
|
||||
|
||||
|
|
@ -1884,6 +1906,10 @@ tu_init_dri_options(struct tu_instance *instance)
|
|||
driQueryOptionb(&instance->dri_options, "tu_emulate_alpha_to_coverage");
|
||||
instance->autotune_algo =
|
||||
driQueryOptionstr(&instance->dri_options, "tu_autotune_algorithm");
|
||||
instance->enable_texel_buffer_emulation =
|
||||
driQueryOptionb(&instance->dri_options, "tu_enable_texel_buffer_emulation");
|
||||
instance->enable_ssbo_emulation =
|
||||
driQueryOptionb(&instance->dri_options, "tu_enable_ssbo_emulation");
|
||||
}
|
||||
|
||||
static uint32_t instance_count = 0;
|
||||
|
|
|
|||
|
|
@ -143,6 +143,9 @@ struct tu_physical_device
|
|||
/* Whether performance counter selector registers can be written by userspace CSes. */
|
||||
bool is_perf_cntr_selectable;
|
||||
|
||||
bool enable_texel_buffer_emulation;
|
||||
bool enable_ssbo_emulation;
|
||||
|
||||
struct {
|
||||
uint32_t non_lazy_type_count;
|
||||
uint32_t type_count;
|
||||
|
|
@ -240,6 +243,13 @@ struct tu_instance
|
|||
|
||||
/* Configuration option to use a specific autotune algorithm by default. */
|
||||
const char *autotune_algo;
|
||||
|
||||
/* D3D12 doesn't have a documented limit for texel buffer or SSBO size; in practice
|
||||
* some games expect up to (1 << 29) elements, which is higher than A6XX or
|
||||
* A7XX hardware can support.
|
||||
*/
|
||||
bool enable_texel_buffer_emulation;
|
||||
bool enable_ssbo_emulation;
|
||||
};
|
||||
VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance,
|
||||
VK_OBJECT_TYPE_INSTANCE)
|
||||
|
|
|
|||
|
|
@ -596,6 +596,148 @@ build_bindless(struct tu_device *dev, nir_builder *b,
|
|||
return nir_bindless_resource_ir3(b, 32, desc_offset, .desc_set = set);
|
||||
}
|
||||
|
||||
/* Emit NIR that reads the two-dword resbase_ir3 value of `desc` and unpacks
 * the element count from it.
 *
 * Returns the element count. When offset_out is non-null it additionally
 * receives the value of the high dword shifted right by 10.
 */
static nir_def *
build_texel_buffer_size(nir_builder *b, nir_def *desc, nir_def **offset_out)
{
   assert(nir_def_is_intrinsic(desc));

   nir_def *packed = nir_resbase_ir3(b, 32, desc);
   nir_def *packed_lo = nir_channel(b, packed, 0);
   nir_def *packed_hi = nir_channel(b, packed, 1);

   /* The count straddles both dwords: drop the 6 low bits of the first dword
    * and splice the second dword in at bit 20, then keep the bits covered by
    * TU_MAX_EMULATED_TEXEL_ELEMENTS.
    *
    * NOTE(review): nir_ishr is an arithmetic shift, so this presumably relies
    * on the top bit of the low dword being clear - confirm.
    */
   nir_def *count_lo = nir_ishr_imm(b, packed_lo, 6);
   nir_def *count_hi = nir_ishl_imm(b, packed_hi, 20);
   nir_def *count_bits = nir_ior(b, count_lo, count_hi);
   nir_def *count =
      nir_iand_imm(b, count_bits, TU_MAX_EMULATED_TEXEL_ELEMENTS);

   if (offset_out != nullptr)
      *offset_out = nir_ishr_imm(b, packed_hi, 10);

   return count;
}
|
||||
|
||||
/* Turn a linear texel index into 3D coordinates for an emulated texel
 * buffer.
 *
 * offset - linear texel index requested by the shader
 * desc   - bindless descriptor handle, passed to build_texel_buffer_size()
 *
 * Returns a vec3 (x, y, z). x wraps at TU_TEXEL_BUFFER_WIDTH and y at
 * TU_TEXEL_BUFFER_MAX_HEIGHT. For an out-of-bounds index z is forced to 0xff.
 */
static nir_def *
build_texel_buffer_as_image_coords(nir_builder *b,
                                   nir_def *offset,
                                   nir_def *desc)
{
   nir_def *base_offset = nullptr;
   nir_def *real_size = build_texel_buffer_size(b, desc, &base_offset);
   /* Compare as unsigned: an index with the top bit set must count as out of
    * bounds. A signed compare would treat it as negative (and so "in range"),
    * and adding base_offset below could then wrap it to a small coordinate.
    */
   nir_def *oob = nir_uge(b, offset, real_size);

   /* Bounds are checked against the index as requested; the base offset is
    * only applied afterwards, for addressing.
    */
   offset = nir_iadd(b, offset, base_offset);

   nir_def *x = nir_umod_imm(b, offset, TU_TEXEL_BUFFER_WIDTH);
   nir_def *tmp = nir_udiv_imm(b, offset, TU_TEXEL_BUFFER_WIDTH);
   nir_def *y = nir_umod_imm(b, tmp, TU_TEXEL_BUFFER_MAX_HEIGHT);
   nir_def *z = nir_udiv_imm(b, tmp, TU_TEXEL_BUFFER_MAX_HEIGHT);
   /* NOTE(review): 0xff is presumably beyond any depth the descriptor can
    * describe, so the access itself ends up out of range - confirm.
    */
   z = nir_bcsel(b, oob, nir_imm_int(b, 0xff), z);

   nir_def *coord3d = nir_vec3(b, x, y, z);
   return coord3d;
}
|
||||
|
||||
static void
|
||||
lower_texel_buffers_to_image(nir_builder *b,
|
||||
nir_intrinsic_instr *instr,
|
||||
nir_def *bindless)
|
||||
{
|
||||
switch (instr->intrinsic) {
|
||||
case nir_intrinsic_bindless_image_load:
|
||||
case nir_intrinsic_bindless_image_store:
|
||||
case nir_intrinsic_bindless_image_atomic:
|
||||
case nir_intrinsic_bindless_image_atomic_swap: {
|
||||
b->cursor = nir_before_instr(&instr->instr);
|
||||
|
||||
nir_def *coord = instr->src[1].ssa;
|
||||
if (coord->num_components > 1)
|
||||
coord = nir_channel(b, coord, 0);
|
||||
nir_def *coord3d =
|
||||
build_texel_buffer_as_image_coords(b, coord, bindless);
|
||||
nir_src_rewrite(&instr->src[1], nir_pad_vector(b, coord3d, 4));
|
||||
nir_intrinsic_set_image_dim(instr, GLSL_SAMPLER_DIM_3D);
|
||||
break;
|
||||
}
|
||||
case nir_intrinsic_bindless_image_size: {
|
||||
nir_def_replace(&instr->def,
|
||||
build_texel_buffer_size(b, bindless, nullptr));
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_ssbo_to_image(nir_builder *b, nir_intrinsic_instr *intr, void *data)
|
||||
{
|
||||
if (intr->intrinsic == nir_intrinsic_load_ssbo) {
|
||||
b->cursor = nir_after_instr(&intr->instr);
|
||||
nir_def *bindless = nir_get_io_index_src(intr)->ssa;
|
||||
nir_def *coord3d = build_texel_buffer_as_image_coords(
|
||||
b, nir_get_io_offset_src(intr)->ssa, bindless);
|
||||
nir_def *load = nir_bindless_image_load(
|
||||
b, intr->def.num_components, intr->def.bit_size, bindless,
|
||||
nir_pad_vec4(b, coord3d), nir_imm_zero(b, 1, 32) /* sample index */,
|
||||
nir_imm_zero(b, 1, 32) /* lod */, .image_dim = GLSL_SAMPLER_DIM_3D,
|
||||
.format = PIPE_FORMAT_R32_UINT, .access = nir_intrinsic_access(intr));
|
||||
nir_def_replace(&intr->def, load);
|
||||
return true;
|
||||
} else if (intr->intrinsic == nir_intrinsic_store_ssbo) {
|
||||
b->cursor = nir_after_instr(&intr->instr);
|
||||
nir_def *bindless = nir_get_io_index_src(intr)->ssa;
|
||||
nir_def *coord3d = build_texel_buffer_as_image_coords(
|
||||
b, nir_get_io_offset_src(intr)->ssa, bindless);
|
||||
nir_bindless_image_store(
|
||||
b, bindless, nir_pad_vec4(b, coord3d),
|
||||
nir_imm_zero(b, 1, 32) /* sample index */,
|
||||
nir_get_io_data_src(intr)->ssa, nir_imm_zero(b, 1, 32) /* lod */,
|
||||
.image_dim = GLSL_SAMPLER_DIM_3D, .format = PIPE_FORMAT_R32_UINT,
|
||||
.access = nir_intrinsic_access(intr), .src_type = nir_type_uint32);
|
||||
nir_instr_remove(&intr->instr);
|
||||
return true;
|
||||
} else if (intr->intrinsic == nir_intrinsic_get_ssbo_size) {
|
||||
b->cursor = nir_after_instr(&intr->instr);
|
||||
nir_def *bindless = nir_get_io_index_src(intr)->ssa;
|
||||
nir_def *num_elements = build_texel_buffer_size(b, bindless, NULL);
|
||||
nir_def *size = nir_amul_imm(b, num_elements, sizeof(uint32_t));
|
||||
nir_def_replace(&intr->def, size);
|
||||
return true;
|
||||
} else if (intr->intrinsic == nir_intrinsic_ssbo_atomic) {
|
||||
b->cursor = nir_after_instr(&intr->instr);
|
||||
nir_def *bindless = nir_get_io_index_src(intr)->ssa;
|
||||
nir_def *coord3d = build_texel_buffer_as_image_coords(
|
||||
b, nir_get_io_offset_src(intr)->ssa, bindless);
|
||||
enum pipe_format format =
|
||||
intr->def.bit_size == 64 ? PIPE_FORMAT_R64_UINT : PIPE_FORMAT_R32_UINT;
|
||||
nir_def *atomic = nir_bindless_image_atomic(
|
||||
b, intr->def.bit_size, bindless, nir_pad_vec4(b, coord3d),
|
||||
nir_imm_zero(b, 1, 32) /* sample index */,
|
||||
nir_get_io_data_src(intr)->ssa, .image_dim = GLSL_SAMPLER_DIM_3D,
|
||||
.format = format, .access = nir_intrinsic_access(intr),
|
||||
.atomic_op = nir_intrinsic_atomic_op(intr));
|
||||
nir_def_replace(&intr->def, atomic);
|
||||
return true;
|
||||
} else if (intr->intrinsic == nir_intrinsic_ssbo_atomic_swap) {
|
||||
b->cursor = nir_after_instr(&intr->instr);
|
||||
nir_def *bindless = nir_get_io_index_src(intr)->ssa;
|
||||
nir_def *coord3d = build_texel_buffer_as_image_coords(
|
||||
b, nir_get_io_offset_src(intr)->ssa, bindless);
|
||||
enum pipe_format format =
|
||||
intr->def.bit_size == 64 ? PIPE_FORMAT_R64_UINT : PIPE_FORMAT_R32_UINT;
|
||||
nir_def *atomic_swap = nir_bindless_image_atomic_swap(
|
||||
b, intr->def.bit_size, bindless, nir_pad_vec4(b, coord3d),
|
||||
nir_imm_zero(b, 1, 32) /* sample index */,
|
||||
nir_get_io_data_src(intr)->ssa, intr->src[3].ssa,
|
||||
.image_dim = GLSL_SAMPLER_DIM_3D, .format = format,
|
||||
.access = nir_intrinsic_access(intr),
|
||||
.atomic_op = nir_intrinsic_atomic_op(intr));
|
||||
nir_def_replace(&intr->def, atomic_swap);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static void
|
||||
lower_image_deref(struct tu_device *dev, nir_builder *b,
|
||||
nir_intrinsic_instr *instr, struct tu_shader *shader,
|
||||
|
|
@ -605,6 +747,11 @@ lower_image_deref(struct tu_device *dev, nir_builder *b,
|
|||
nir_def *bindless = build_bindless(dev, b, deref, 0, shader, layout, 0, false);
|
||||
nir_rewrite_image_intrinsic(instr, bindless,
|
||||
nir_image_intrinsic_type_bindless);
|
||||
|
||||
if (dev->physical_device->enable_texel_buffer_emulation &&
|
||||
nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_BUF) {
|
||||
lower_texel_buffers_to_image(b, instr, bindless);
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
|
|
@ -869,6 +1016,31 @@ lower_tex_immutable(struct tu_device *dev,
|
|||
}
|
||||
}
|
||||
|
||||
static void
|
||||
lower_tex_texel_buffer_to_image(nir_builder *b,
|
||||
nir_tex_instr *tex,
|
||||
uint32_t tex_bindless_idx)
|
||||
{
|
||||
if (tex->op == nir_texop_txf) {
|
||||
int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
|
||||
if (coord_idx >= 0) {
|
||||
nir_def *coord = tex->src[coord_idx].src.ssa;
|
||||
if (coord->num_components > 1)
|
||||
coord = nir_channel(b, coord, 0);
|
||||
nir_def *coord3d = build_texel_buffer_as_image_coords(
|
||||
b, coord, tex->src[tex_bindless_idx].src.ssa);
|
||||
nir_src_rewrite(&tex->src[coord_idx].src, coord3d);
|
||||
|
||||
tex->sampler_dim = GLSL_SAMPLER_DIM_3D;
|
||||
tex->coord_components = 3;
|
||||
}
|
||||
} else if (tex->op == nir_texop_txs) {
|
||||
nir_def_replace(
|
||||
&tex->def,
|
||||
build_texel_buffer_size(b, tex->src[tex_bindless_idx].src.ssa, nullptr));
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_tex_impl(nir_builder *b, nir_tex_instr *tex, struct tu_device *dev,
|
||||
struct tu_shader *shader, const struct tu_pipeline_layout *layout,
|
||||
|
|
@ -899,6 +1071,11 @@ lower_tex_impl(nir_builder *b, nir_tex_instr *tex, struct tu_device *dev,
|
|||
tex->src[tex_src_idx].src_type = nir_tex_src_texture_offset;
|
||||
}
|
||||
|
||||
if (dev->physical_device->enable_texel_buffer_emulation &&
|
||||
tex->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
|
||||
lower_tex_texel_buffer_to_image(b, tex, tex_src_idx);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
@ -1275,6 +1452,12 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev,
|
|||
nir_metadata_none,
|
||||
¶ms);
|
||||
|
||||
if (dev->physical_device->enable_ssbo_emulation) {
|
||||
progress |= nir_lower_io_to_scalar(shader, nir_var_mem_ssbo, NULL, NULL);
|
||||
progress |= nir_shader_intrinsics_pass(shader, lower_ssbo_to_image,
|
||||
nir_metadata_control_flow, NULL);
|
||||
}
|
||||
|
||||
/* Remove now-unused variables so that when we gather the shader info later
|
||||
* they won't be counted.
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -1449,6 +1449,17 @@ TODO: document the other workarounds.
|
|||
<application name="Half-Life: Alyx" application_name_match="hlvr">
|
||||
<option name="tu_emulate_alpha_to_coverage" value="true" />
|
||||
</application>
|
||||
<application name="Marvel's Spider-Man Remastered" executable="Spider-Man.exe">
|
||||
<!-- Bad texturing without tu_enable_texel_buffer_emulation -->
|
||||
<option name="tu_enable_texel_buffer_emulation" value="true" />
|
||||
<!-- From brief testing doesn't require tu_enable_ssbo_emulation -->
|
||||
</application>
|
||||
<application name="Ratchet and Clank: Rift Apart" executable="RiftApart.exe">
|
||||
<!-- Many static objects disappear without tu_enable_texel_buffer_emulation -->
|
||||
<option name="tu_enable_texel_buffer_emulation" value="true" />
|
||||
<!-- All moving objects disappear without tu_enable_ssbo_emulation -->
|
||||
<option name="tu_enable_ssbo_emulation" value="true" />
|
||||
</application>
|
||||
</device>
|
||||
|
||||
<device driver="asahi">
|
||||
|
|
|
|||
|
|
@ -672,6 +672,14 @@
|
|||
DRI_CONF_OPT_S_NODEF(tu_autotune_algorithm, \
|
||||
"Set the preferred autotune algorithm")
|
||||
|
||||
#define DRI_CONF_TU_ENABLE_TEXEL_BUFFER_EMULATION(def) \
|
||||
DRI_CONF_OPT_B(tu_enable_texel_buffer_emulation, def, \
|
||||
"Emulate texel buffer to allow higher limit for elements that is in line with what some D3D12 games expect")
|
||||
|
||||
#define DRI_CONF_TU_ENABLE_SSBO_EMULATION(def) \
|
||||
DRI_CONF_OPT_B(tu_enable_ssbo_emulation, def, \
|
||||
"Emulate SSBO to allow higher limit for elements that is in line with what some D3D12 games expect")
|
||||
|
||||
/**
|
||||
* \brief Honeykrisp specific configuration options
|
||||
*/
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue