mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 09:08:10 +02:00
tu: Add option to raise the maximum texel buffer size
Emulates texel buffers via 3D image access. The real texel buffer size and start offset (needed due to image alignment requirements) are stored in the descriptor and accessed via resbase. - Read-only access: isam.a.1d to read as a 3D image. - RW access: stib.b.typed.3d/ldib.b.typed.3d to access as a 3D image. Verified that the proprietary D3D12 driver uses the same workaround; the only difference is that the proprietary driver uses an arrayed 2D load for read-only access instead of a 3D load, but the benefits of that are not verified. Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
This commit is contained in:
parent
3709ebded8
commit
80e194d7f1
6 changed files with 238 additions and 6 deletions
|
|
@ -110,6 +110,9 @@
|
|||
/* match the latest Qualcomm driver which is also a hw limit on later gens */
|
||||
#define MAX_STORAGE_BUFFER_RANGE (1u << 27)
|
||||
#define MAX_TEXEL_ELEMENTS (1u << 27)
|
||||
/* Largest element count exposed when texel buffers are emulated through 3D
 * images; the element count is carried in a 30-bit field of the descriptor
 * and this value doubles as the mask used to extract it in the shader.
 */
#define TU_MAX_EMULATED_TEXEL_ELEMENTS ((1u << 30) - 1)
|
||||
/* Row length, in texels, used to fold a linear texel index into the x
 * coordinate of the emulating 3D image.
 */
#define TU_TEXEL_BUFFER_WIDTH (1u << 14)
|
||||
/* Maximum number of rows per slice of the emulating 3D image; indices past
 * WIDTH * MAX_HEIGHT spill into the z coordinate.
 */
#define TU_TEXEL_BUFFER_MAX_HEIGHT (1u << 14)
|
||||
/* We use ldc for uniform buffer loads, just like the Qualcomm driver, so
|
||||
* expose the same maximum range.
|
||||
* TODO: The SIZE bitfield is 15 bits, and in 4-dword units, so the actual
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@
|
|||
#include "tu_descriptor_set.h"
|
||||
|
||||
#include "util/mesa-blake3.h"
|
||||
#include "util/format/u_format.h"
|
||||
#include "vk_acceleration_structure.h"
|
||||
#include "vk_descriptors.h"
|
||||
#include "vk_util.h"
|
||||
|
|
@ -32,6 +33,7 @@
|
|||
#include "tu_rmv.h"
|
||||
#include "tu_sampler.h"
|
||||
#include "tu_subsampled_image.h"
|
||||
#include "fdl/fd6_format_table.h"
|
||||
|
||||
static inline uint8_t *
|
||||
pool_base(struct tu_descriptor_pool *pool)
|
||||
|
|
@ -989,6 +991,71 @@ write_texel_buffer_descriptor_addr(uint32_t *dst,
|
|||
}
|
||||
}
|
||||
|
||||
template <chip CHIP>
|
||||
static void
|
||||
write_emulated_texel_buffer_descriptor_common(uint32_t *dst,
|
||||
enum pipe_format format,
|
||||
uint64_t addr, uint32_t elements)
|
||||
{
|
||||
uint32_t blocksize_B = util_format_get_blocksize(format);
|
||||
|
||||
const uint32_t aligment = 64;
|
||||
uint64_t aligned_addr = addr & ~(uint64_t) (aligment - 1);
|
||||
uint32_t offset_texels = uint32_t(addr - aligned_addr) / blocksize_B;
|
||||
uint32_t elements_with_offset = elements + offset_texels;
|
||||
|
||||
uint32_t width = MIN2(elements_with_offset, TU_TEXEL_BUFFER_WIDTH);
|
||||
uint32_t height = MIN2(DIV_ROUND_UP(elements_with_offset, width),
|
||||
TU_TEXEL_BUFFER_MAX_HEIGHT);
|
||||
uint32_t depth = elements_with_offset
|
||||
? DIV_ROUND_UP(elements_with_offset, width * height)
|
||||
: 0;
|
||||
uint32_t layer_size = width * height * blocksize_B;
|
||||
enum a6xx_tile_mode tile_mode = TILE6_LINEAR;
|
||||
enum a6xx_format texture_format =
|
||||
fd6_texture_format(format, tile_mode, false);
|
||||
enum a3xx_color_swap swap = fd6_texture_swap(format, tile_mode, false);
|
||||
|
||||
memset(dst, 0, FDL6_TEX_CONST_DWORDS * sizeof(uint32_t));
|
||||
|
||||
dst[0] = A6XX_TEX_MEMOBJ_0_TILE_MODE(tile_mode) |
|
||||
COND(util_format_is_srgb(format), A6XX_TEX_MEMOBJ_0_SRGB) |
|
||||
A6XX_TEX_MEMOBJ_0_FMT(texture_format) |
|
||||
A6XX_TEX_MEMOBJ_0_SWAP(swap);
|
||||
dst[1] = A6XX_TEX_MEMOBJ_1_WIDTH(width) | A6XX_TEX_MEMOBJ_1_HEIGHT(height);
|
||||
dst[2] = A6XX_TEX_MEMOBJ_2_PITCH(width * blocksize_B) |
|
||||
A6XX_TEX_MEMOBJ_2_TYPE(A6XX_TEX_3D);
|
||||
dst[3] = A6XX_TEX_MEMOBJ_3_ARRAY_PITCH(depth > 1 ? layer_size : 0);
|
||||
dst[4] = aligned_addr;
|
||||
dst[5] = (aligned_addr >> 32) | A6XX_TEX_MEMOBJ_5_DEPTH(depth);
|
||||
dst[6] = A6XX_TEX_MEMOBJ_6_MIN_LOD_CLAMP(0);
|
||||
/* Would be read by resbase to provide robustness guarantees */
|
||||
uint64_t encoded = MIN2(elements, TU_MAX_EMULATED_TEXEL_ELEMENTS);
|
||||
encoded |= uint64_t(offset_texels & (aligment - 1)) << 30llu;
|
||||
encoded <<= 6;
|
||||
dst[7] = A6XX_TEX_MEMOBJ_7_FLAG_LO(encoded & 0x7FFFFFF);
|
||||
dst[8] = A6XX_TEX_MEMOBJ_8_FLAG_HI(encoded >> 26);
|
||||
|
||||
tu_desc_set_swiz<CHIP>(dst, tu_swiz(X, Y, Z, W));
|
||||
}
|
||||
|
||||
template <chip CHIP>
|
||||
static void
|
||||
write_emulated_texel_buffer_descriptor_addr(
|
||||
uint32_t *dst, const VkDescriptorAddressInfoEXT *buffer_info)
|
||||
{
|
||||
if (!buffer_info || buffer_info->address == 0) {
|
||||
memset(dst, 0, FDL6_TEX_CONST_DWORDS * sizeof(uint32_t));
|
||||
return;
|
||||
}
|
||||
|
||||
enum pipe_format format = vk_format_to_pipe_format(buffer_info->format);
|
||||
uint32_t blocksize_B = util_format_get_blocksize(format);
|
||||
uint32_t elements = blocksize_B ? (buffer_info->range / blocksize_B) : 0;
|
||||
write_emulated_texel_buffer_descriptor_common<CHIP>(
|
||||
dst, format, buffer_info->address, elements);
|
||||
}
|
||||
|
||||
static void
|
||||
write_texel_buffer_descriptor(uint32_t *dst, const VkBufferView buffer_view)
|
||||
{
|
||||
|
|
@ -1001,6 +1068,25 @@ write_texel_buffer_descriptor(uint32_t *dst, const VkBufferView buffer_view)
|
|||
}
|
||||
}
|
||||
|
||||
template <chip CHIP>
|
||||
static void
|
||||
write_emulated_texel_buffer_descriptor(uint32_t *dst,
|
||||
const VkBufferView buffer_view)
|
||||
{
|
||||
if (buffer_view == VK_NULL_HANDLE) {
|
||||
memset(dst, 0, FDL6_TEX_CONST_DWORDS * sizeof(uint32_t));
|
||||
return;
|
||||
}
|
||||
|
||||
VK_FROM_HANDLE(tu_buffer_view, view, buffer_view);
|
||||
|
||||
enum pipe_format format = vk_format_to_pipe_format(view->vk.format);
|
||||
uint32_t elements = view->vk.elements;
|
||||
write_emulated_texel_buffer_descriptor_common<CHIP>(
|
||||
dst, format, vk_buffer_address(view->vk.buffer, view->vk.offset),
|
||||
elements);
|
||||
}
|
||||
|
||||
static VkDescriptorAddressInfoEXT
|
||||
buffer_info_to_address(const VkDescriptorBufferInfo *buffer_info)
|
||||
{
|
||||
|
|
@ -1199,10 +1285,14 @@ tu_GetDescriptorEXT(
|
|||
write_buffer_descriptor_addr<CHIP>(device, dest, pDescriptorInfo->data.pStorageBuffer);
|
||||
break;
|
||||
case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
|
||||
write_texel_buffer_descriptor_addr<CHIP>(dest, pDescriptorInfo->data.pUniformTexelBuffer);
|
||||
break;
|
||||
case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
|
||||
write_texel_buffer_descriptor_addr<CHIP>(dest, pDescriptorInfo->data.pStorageTexelBuffer);
|
||||
if (device->physical_device->enable_texel_buffer_emulation) {
|
||||
write_emulated_texel_buffer_descriptor_addr<CHIP>(
|
||||
dest, pDescriptorInfo->data.pUniformTexelBuffer);
|
||||
} else {
|
||||
write_texel_buffer_descriptor_addr<CHIP>(
|
||||
dest, pDescriptorInfo->data.pUniformTexelBuffer);
|
||||
}
|
||||
break;
|
||||
case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
|
||||
write_image_descriptor(dest, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
|
||||
|
|
@ -1331,7 +1421,13 @@ tu_update_descriptor_sets(const struct tu_device *device,
|
|||
break;
|
||||
case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
|
||||
case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
|
||||
write_texel_buffer_descriptor(ptr, writeset->pTexelBufferView[j]);
|
||||
if (device->physical_device->enable_texel_buffer_emulation) {
|
||||
write_emulated_texel_buffer_descriptor<CHIP>(
|
||||
ptr, writeset->pTexelBufferView[j]);
|
||||
} else {
|
||||
write_texel_buffer_descriptor(ptr,
|
||||
writeset->pTexelBufferView[j]);
|
||||
}
|
||||
break;
|
||||
case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
|
||||
case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
|
||||
|
|
@ -1681,7 +1777,12 @@ tu_update_descriptor_set_with_template(
|
|||
break;
|
||||
case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
|
||||
case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
|
||||
write_texel_buffer_descriptor(ptr, *(VkBufferView *) src);
|
||||
if (device->physical_device->enable_texel_buffer_emulation) {
|
||||
write_emulated_texel_buffer_descriptor<CHIP>(
|
||||
ptr, *(VkBufferView *) src);
|
||||
} else {
|
||||
write_texel_buffer_descriptor(ptr, *(VkBufferView *) src);
|
||||
}
|
||||
break;
|
||||
case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
|
||||
case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
|
||||
|
|
|
|||
|
|
@ -73,6 +73,8 @@ tu_device_get_cache_uuid(struct tu_physical_device *device, void *uuid)
|
|||
_mesa_blake3_final(&ctx, blake3);
|
||||
_mesa_blake3_update(&ctx, &device->instance->allow_oob_indirect_ubo_loads,
|
||||
sizeof(device->instance->allow_oob_indirect_ubo_loads));
|
||||
_mesa_blake3_update(&ctx, &device->enable_texel_buffer_emulation,
|
||||
sizeof(device->enable_texel_buffer_emulation));
|
||||
|
||||
memcpy(uuid, blake3, VK_UUID_SIZE);
|
||||
return 0;
|
||||
|
|
@ -1110,7 +1112,8 @@ tu_get_properties(struct tu_physical_device *pdevice,
|
|||
props->maxImageDimension3D = (1 << 11);
|
||||
props->maxImageDimensionCube = (1 << 14);
|
||||
props->maxImageArrayLayers = (1 << (pdevice->info->props.is_a702 ? 8 : 11));
|
||||
props->maxTexelBufferElements = MAX_TEXEL_ELEMENTS;
|
||||
props->maxTexelBufferElements =
|
||||
pdevice->enable_texel_buffer_emulation ? TU_MAX_EMULATED_TEXEL_ELEMENTS : MAX_TEXEL_ELEMENTS;
|
||||
props->maxUniformBufferRange = MAX_UNIFORM_BUFFER_RANGE;
|
||||
props->maxStorageBufferRange = MAX_STORAGE_BUFFER_RANGE;
|
||||
props->maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE;
|
||||
|
|
@ -1680,6 +1683,10 @@ tu_physical_device_init(struct tu_physical_device *device,
|
|||
device->has_cached_non_coherent_memory =
|
||||
device->level1_dcache_size > 0 && !DETECT_ARCH_ARM;
|
||||
|
||||
if (fd_dev_gen(&device->dev_id) >= 7) {
|
||||
device->enable_texel_buffer_emulation = instance->enable_texel_buffer_emulation;
|
||||
}
|
||||
|
||||
device->memory.type_count = 1;
|
||||
device->memory.types[0] =
|
||||
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
|
||||
|
|
@ -1843,6 +1850,7 @@ static const driOptionDescription tu_dri_options[] = {
|
|||
DRI_CONF_TU_ENABLE_SOFTFLOAT32(false)
|
||||
DRI_CONF_TU_EMULATE_ALPHA_TO_COVERAGE(false)
|
||||
DRI_CONF_TU_AUTOTUNE_ALGORITHM()
|
||||
DRI_CONF_TU_ENABLE_TEXEL_BUFFER_EMULATION(false)
|
||||
DRI_CONF_SECTION_END
|
||||
};
|
||||
|
||||
|
|
@ -1877,6 +1885,8 @@ tu_init_dri_options(struct tu_instance *instance)
|
|||
driQueryOptionb(&instance->dri_options, "tu_emulate_alpha_to_coverage");
|
||||
instance->autotune_algo =
|
||||
driQueryOptionstr(&instance->dri_options, "tu_autotune_algorithm");
|
||||
instance->enable_texel_buffer_emulation =
|
||||
driQueryOptionb(&instance->dri_options, "tu_enable_texel_buffer_emulation");
|
||||
}
|
||||
|
||||
static uint32_t instance_count = 0;
|
||||
|
|
|
|||
|
|
@ -143,6 +143,8 @@ struct tu_physical_device
|
|||
/* Whether performance counter selector registers can be written by userspace CSes. */
|
||||
bool is_perf_cntr_selectable;
|
||||
|
||||
bool enable_texel_buffer_emulation;
|
||||
|
||||
struct {
|
||||
uint32_t non_lazy_type_count;
|
||||
uint32_t type_count;
|
||||
|
|
@ -240,6 +242,12 @@ struct tu_instance
|
|||
|
||||
/* Configuration option to use a specific autotune algorithm by default. */
|
||||
const char *autotune_algo;
|
||||
|
||||
/* D3D12 doesn't have a documented limit for texel buffer size; in practice
|
||||
* some games expect up to (1 << 29) elements, which is higher than A6XX or
|
||||
* A7XX hardware can support.
|
||||
*/
|
||||
bool enable_texel_buffer_emulation;
|
||||
};
|
||||
VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance,
|
||||
VK_OBJECT_TYPE_INSTANCE)
|
||||
|
|
|
|||
|
|
@ -598,6 +598,77 @@ build_bindless(struct tu_device *dev, nir_builder *b,
|
|||
return nir_bindless_resource_ir3(b, 32, desc_offset, .desc_set = set);
|
||||
}
|
||||
|
||||
/* Emits NIR that recovers the real element count of an emulated texel
 * buffer from the two dwords returned by resbase for `desc`.
 *
 * If `offset_out` is non-null it receives the start offset, in texels,
 * extracted from the high dword.
 */
static nir_def *
build_texel_buffer_size(nir_builder *b, nir_def *desc, nir_def **offset_out)
{
   assert(nir_def_is_intrinsic(desc));

   nir_def *packed = nir_resbase_ir3(b, 32, desc);
   nir_def *packed_lo = nir_channel(b, packed, 0);
   nir_def *packed_hi = nir_channel(b, packed, 1);

   /* The element count straddles both dwords: drop the 6 low bits of the
    * first one and append the second one starting at bit 20.
    */
   nir_def *count_lo = nir_ishr_imm(b, packed_lo, 6);
   nir_def *count_hi = nir_ishl_imm(b, packed_hi, 20);
   nir_def *count_raw = nir_ior(b, count_lo, count_hi);
   nir_def *count =
      nir_iand_imm(b, count_raw, TU_MAX_EMULATED_TEXEL_ELEMENTS);

   if (offset_out != nullptr) {
      nir_def *start_texel = nir_ishr_imm(b, packed_hi, 10);
      *offset_out = start_texel;
   }

   return count;
}
|
||||
|
||||
/* Converts a linear texel index into the 3D coordinate used to address the
 * image that emulates the texel buffer described by `desc`.
 *
 * offset - scalar texel index supplied by the shader
 * desc   - bindless descriptor handle of the emulated texel buffer
 *
 * Returns a vec3 (x, y, z). For an index outside the real buffer size z is
 * forced to 0xff.
 */
static nir_def *
build_texel_buffer_as_image_coords(nir_builder *b,
                                   nir_def *offset,
                                   nir_def *desc)
{
   nir_def *base_offset = nullptr;
   nir_def *real_size = build_texel_buffer_size(b, desc, &base_offset);
   /* Compare as unsigned: an index with the top bit set must be treated as
    * out of bounds. With a signed compare it would pass the check, and an
    * index close to 2^32 would wrap to a small in-bounds coordinate once
    * base_offset is added below.
    */
   nir_def *oob = nir_uge(b, offset, real_size);

   /* The image starts at the aligned-down address, so skip the texels that
    * precede the real start of the buffer.
    */
   offset = nir_iadd(b, offset, base_offset);

   nir_def *x = nir_umod_imm(b, offset, TU_TEXEL_BUFFER_WIDTH);
   nir_def *tmp = nir_udiv_imm(b, offset, TU_TEXEL_BUFFER_WIDTH);
   nir_def *y = nir_umod_imm(b, tmp, TU_TEXEL_BUFFER_MAX_HEIGHT);
   nir_def *z = nir_udiv_imm(b, tmp, TU_TEXEL_BUFFER_MAX_HEIGHT);
   z = nir_bcsel(b, oob, nir_imm_int(b, 0xff), z);

   nir_def *coord3d = nir_vec3(b, x, y, z);
   return coord3d;
}
|
||||
|
||||
static void
|
||||
lower_texel_buffers_to_image(nir_builder *b,
|
||||
nir_intrinsic_instr *instr,
|
||||
nir_def *bindless)
|
||||
{
|
||||
switch (instr->intrinsic) {
|
||||
case nir_intrinsic_bindless_image_load:
|
||||
case nir_intrinsic_bindless_image_store:
|
||||
case nir_intrinsic_bindless_image_atomic:
|
||||
case nir_intrinsic_bindless_image_atomic_swap: {
|
||||
b->cursor = nir_before_instr(&instr->instr);
|
||||
|
||||
nir_def *coord = instr->src[1].ssa;
|
||||
if (coord->num_components > 1)
|
||||
coord = nir_channel(b, coord, 0);
|
||||
nir_def *coord3d =
|
||||
build_texel_buffer_as_image_coords(b, coord, bindless);
|
||||
nir_src_rewrite(&instr->src[1], nir_pad_vector(b, coord3d, 4));
|
||||
nir_intrinsic_set_image_dim(instr, GLSL_SAMPLER_DIM_3D);
|
||||
break;
|
||||
}
|
||||
case nir_intrinsic_bindless_image_size: {
|
||||
nir_def_replace(&instr->def,
|
||||
build_texel_buffer_size(b, bindless, nullptr));
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
lower_image_deref(struct tu_device *dev, nir_builder *b,
|
||||
nir_intrinsic_instr *instr, struct tu_shader *shader,
|
||||
|
|
@ -607,6 +678,11 @@ lower_image_deref(struct tu_device *dev, nir_builder *b,
|
|||
nir_def *bindless = build_bindless(dev, b, deref, 0, shader, layout, 0, false);
|
||||
nir_rewrite_image_intrinsic(instr, bindless,
|
||||
nir_image_intrinsic_type_bindless);
|
||||
|
||||
if (dev->physical_device->enable_texel_buffer_emulation &&
|
||||
nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_BUF) {
|
||||
lower_texel_buffers_to_image(b, instr, bindless);
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
|
|
@ -871,6 +947,31 @@ lower_tex_immutable(struct tu_device *dev,
|
|||
}
|
||||
}
|
||||
|
||||
static void
|
||||
lower_tex_texel_buffer_to_image(nir_builder *b,
|
||||
nir_tex_instr *tex,
|
||||
uint32_t tex_bindless_idx)
|
||||
{
|
||||
if (tex->op == nir_texop_txf) {
|
||||
int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
|
||||
if (coord_idx >= 0) {
|
||||
nir_def *coord = tex->src[coord_idx].src.ssa;
|
||||
if (coord->num_components > 1)
|
||||
coord = nir_channel(b, coord, 0);
|
||||
nir_def *coord3d = build_texel_buffer_as_image_coords(
|
||||
b, coord, tex->src[tex_bindless_idx].src.ssa);
|
||||
nir_src_rewrite(&tex->src[coord_idx].src, coord3d);
|
||||
|
||||
tex->sampler_dim = GLSL_SAMPLER_DIM_3D;
|
||||
tex->coord_components = 3;
|
||||
}
|
||||
} else if (tex->op == nir_texop_txs) {
|
||||
nir_def_replace(
|
||||
&tex->def,
|
||||
build_texel_buffer_size(b, tex->src[tex_bindless_idx].src.ssa, nullptr));
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_tex_impl(nir_builder *b, nir_tex_instr *tex, struct tu_device *dev,
|
||||
struct tu_shader *shader, const struct tu_pipeline_layout *layout,
|
||||
|
|
@ -901,6 +1002,11 @@ lower_tex_impl(nir_builder *b, nir_tex_instr *tex, struct tu_device *dev,
|
|||
tex->src[tex_src_idx].src_type = nir_tex_src_texture_offset;
|
||||
}
|
||||
|
||||
if (dev->physical_device->enable_texel_buffer_emulation &&
|
||||
tex->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
|
||||
lower_tex_texel_buffer_to_image(b, tex, tex_src_idx);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -696,6 +696,10 @@
|
|||
DRI_CONF_OPT_S_NODEF(tu_autotune_algorithm, \
|
||||
"Set the preferred autotune algorithm")
|
||||
|
||||
#define DRI_CONF_TU_ENABLE_TEXEL_BUFFER_EMULATION(def) \
|
||||
DRI_CONF_OPT_B(tu_enable_texel_buffer_emulation, def, \
|
||||
"Emulate texel buffer to allow higher limit for elements that is in line with what some D3D12 games expect")
|
||||
|
||||
/**
|
||||
* \brief Honeykrisp specific configuration options
|
||||
*/
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue