nvk: Use bindless cbufs on Turing+

These are much faster than ld.global.constant.  This takes The Witness
from 103 FPS to 130 FPS on my 4060 laptop GPU when run with
NVK_DEBUG=no_cbuf.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29591>
Faith Ekstrand 2024-05-23 12:21:58 -05:00 committed by Marge Bot
parent 248b22d158
commit 8b5835af31
8 changed files with 197 additions and 59 deletions
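
For orientation before the diff: the bindless cbuf descriptor introduced here is a single 64-bit word, with base_addr >> 4 in the low 45 bits and size >> 4 in the top 19 bits; both values are 16-byte aligned, so the shifts lose nothing. A minimal self-contained C sketch of the packing and unpacking, mirroring struct nvk_bindless_cbuf and nvk_ubo_descriptor_addr() from the diff below:

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Pack a 16-byte-aligned (addr, size) pair into the 64-bit bindless cbuf
 * descriptor: bits [0, 45) hold addr >> 4, bits [45, 64) hold size >> 4.
 */
static uint64_t
pack_bindless_cbuf(uint64_t base_addr, uint32_t size)
{
   assert((base_addr & 0xf) == 0 && (size & 0xf) == 0);
   assert(base_addr < (1ull << 49)); /* base_addr >> 4 must fit in 45 bits */
   return (base_addr >> 4) | ((uint64_t)(size >> 4) << 45);
}

static void
unpack_bindless_cbuf(uint64_t desc, uint64_t *base_addr, uint32_t *size)
{
   *base_addr = (desc & ((1ull << 45) - 1)) << 4;
   *size = (uint32_t)(desc >> 45) << 4;
}

int
main(void)
{
   uint64_t addr;
   uint32_t size;
   uint64_t desc = pack_bindless_cbuf(0x123456789a0ull, 0x100);
   unpack_bindless_cbuf(desc, &addr, &size);
   assert(addr == 0x123456789a0ull && size == 0x100);
   printf("desc=0x%016" PRIx64 "\n", desc);
   return 0;
}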

View file

@@ -596,11 +596,13 @@ nvk_cmd_bind_shaders(struct vk_command_buffer *vk_cmd,
}
static void
nvk_bind_descriptor_sets(UNUSED struct nvk_cmd_buffer *cmd,
nvk_bind_descriptor_sets(struct nvk_cmd_buffer *cmd,
struct nvk_descriptor_state *desc,
const VkBindDescriptorSetsInfoKHR *info)
{
VK_FROM_HANDLE(vk_pipeline_layout, pipeline_layout, info->layout);
struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
struct nvk_physical_device *pdev = nvk_device_physical(dev);
/* From the Vulkan 1.3.275 spec:
*
@@ -644,9 +646,16 @@ nvk_bind_descriptor_sets(UNUSED struct nvk_cmd_buffer *cmd,
if (set != NULL && set_layout->dynamic_buffer_count > 0) {
for (uint32_t j = 0; j < set_layout->dynamic_buffer_count; j++) {
struct nvk_buffer_address addr = set->dynamic_buffers[j];
addr.base_addr += info->pDynamicOffsets[next_dyn_offset + j];
desc->root.dynamic_buffers[dyn_buffer_start + j] = addr;
union nvk_buffer_descriptor db = set->dynamic_buffers[j];
uint32_t offset = info->pDynamicOffsets[next_dyn_offset + j];
if (BITSET_TEST(set_layout->dynamic_ubos, j) &&
nvk_use_bindless_cbuf(&pdev->info)) {
assert((offset & 0xf) == 0);
db.cbuf.base_addr_shift_4 += offset >> 4;
} else {
db.addr.base_addr += offset;
}
desc->root.dynamic_buffers[dyn_buffer_start + j] = db;
}
next_dyn_offset += set->layout->dynamic_buffer_count;
}
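
The fast path above relies on the dynamic offset being 16-byte aligned (hence the assert), which makes adding offset >> 4 to base_addr_shift_4 exactly equivalent to adding offset to the unpacked base address. A quick standalone check of that equivalence:

#include <assert.h>
#include <stdint.h>

int
main(void)
{
   uint64_t base = 0x10000, offset = 0x40;  /* both 16-byte aligned */
   uint64_t shift4 = base >> 4;             /* base_addr_shift_4 */
   shift4 += offset >> 4;                   /* the bindless cbuf fast path */
   assert((shift4 << 4) == base + offset);  /* same as addr.base_addr += offset */
   return 0;
}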
@@ -804,6 +813,9 @@ nvk_cmd_buffer_get_cbuf_addr(struct nvk_cmd_buffer *cmd,
const struct nvk_cbuf *cbuf,
struct nvk_buffer_address *addr_out)
{
struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
struct nvk_physical_device *pdev = nvk_device_physical(dev);
switch (cbuf->type) {
case NVK_CBUF_TYPE_INVALID:
*addr_out = (struct nvk_buffer_address) { .size = 0 };
@@ -827,7 +839,8 @@ nvk_cmd_buffer_get_cbuf_addr(struct nvk_cmd_buffer *cmd,
case NVK_CBUF_TYPE_DYNAMIC_UBO: {
const uint32_t dyn_start =
desc->root.set_dynamic_buffer_start[cbuf->desc_set];
*addr_out = desc->root.dynamic_buffers[dyn_start + cbuf->dynamic_idx];
*addr_out = nvk_ubo_descriptor_addr(pdev,
desc->root.dynamic_buffers[dyn_start + cbuf->dynamic_idx]);
return true;
}
@@ -840,8 +853,9 @@ nvk_cmd_buffer_get_cbuf_addr(struct nvk_cmd_buffer *cmd,
return false;
assert(cbuf->desc_offset < NVK_PUSH_DESCRIPTOR_SET_SIZE);
void *desc = &push->data[cbuf->desc_offset];
*addr_out = *(struct nvk_buffer_address *)desc;
union nvk_buffer_descriptor desc;
memcpy(&desc, &push->data[cbuf->desc_offset], sizeof(desc));
*addr_out = nvk_ubo_descriptor_addr(pdev, desc);
return true;
}

View file

@@ -65,7 +65,7 @@ struct nvk_root_descriptor_table {
uint8_t set_dynamic_buffer_start[NVK_MAX_SETS];
/* Dynamic buffer bindings */
struct nvk_buffer_address dynamic_buffers[NVK_MAX_DYNAMIC_BUFFERS];
union nvk_buffer_descriptor dynamic_buffers[NVK_MAX_DYNAMIC_BUFFERS];
/* enforce alignment to 0x100 as needed pre-Pascal */
uint8_t __padding[0x40];

View file

@@ -2255,15 +2255,38 @@ nvk_mme_bind_cbuf_desc(struct mme_builder *b)
/* First 4 bits are group, later bits are slot */
struct mme_value group_slot = mme_load(b);
if (b->devinfo->cls_eng3d >= TURING_A) {
struct mme_value64 addr = mme_load_addr64(b);
mme_tu104_read_fifoed(b, addr, mme_imm(3));
}
struct mme_value addr_lo, addr_hi, size;
if (nvk_use_bindless_cbuf(b->devinfo)) {
if (b->devinfo->cls_eng3d >= TURING_A) {
struct mme_value64 addr = mme_load_addr64(b);
mme_tu104_read_fifoed(b, addr, mme_imm(2));
}
/* Load the descriptor */
struct mme_value addr_lo = mme_load(b);
struct mme_value addr_hi = mme_load(b);
struct mme_value size = mme_load(b);
/* Load the descriptor */
struct mme_value desc_lo = mme_load(b);
struct mme_value desc_hi = mme_load(b);
/* The bottom 45 bits are addr >> 4 */
addr_lo = mme_merge(b, mme_zero(), desc_lo, 4, 28, 0);
addr_hi = mme_merge(b, mme_zero(), desc_lo, 0, 4, 28);
mme_merge_to(b, addr_hi, addr_hi, desc_hi, 4, 13, 0);
/* The top 19 bits are size >> 4 */
size = mme_merge(b, mme_zero(), desc_hi, 4, 19, 13);
mme_free_reg(b, desc_hi);
mme_free_reg(b, desc_lo);
} else {
if (b->devinfo->cls_eng3d >= TURING_A) {
struct mme_value64 addr = mme_load_addr64(b);
mme_tu104_read_fifoed(b, addr, mme_imm(3));
}
/* Load the descriptor */
addr_lo = mme_load(b);
addr_hi = mme_load(b);
size = mme_load(b);
}
struct mme_value cb = mme_alloc_reg(b);
mme_if(b, ieq, size, mme_zero()) {
@@ -2285,7 +2308,7 @@ nvk_mme_bind_cbuf_desc(struct mme_builder *b)
mme_emit(b, addr_hi);
mme_emit(b, addr_lo);
/* Bottim bit is the valid bit, 8:4 are shader slot */
/* Bottom bit is the valid bit, 8:4 are shader slot */
mme_merge_to(b, cb, mme_imm(1), group_slot, 4, 5, 4);
}
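
For readers unfamiliar with the MME builder, here is a plain-C model of what the three merges above compute, assuming mme_merge(b, x, y, dst_pos, bits, src_pos) inserts bits [src_pos, src_pos + bits) of y into x at bit dst_pos:

#include <assert.h>
#include <stdint.h>

/* Model of mme_merge(b, x, y, dst_pos, bits, src_pos). */
static uint32_t
merge(uint32_t x, uint32_t y, unsigned dst_pos, unsigned bits, unsigned src_pos)
{
   uint32_t mask = (bits == 32) ? ~0u : (1u << bits) - 1;
   return (x & ~(mask << dst_pos)) | (((y >> src_pos) & mask) << dst_pos);
}

int
main(void)
{
   uint64_t base = 0x123456789a0ull, size = 0x1230;
   uint64_t desc = (base >> 4) | ((size >> 4) << 45);
   uint32_t desc_lo = (uint32_t)desc, desc_hi = (uint32_t)(desc >> 32);

   /* The three merges from nvk_mme_bind_cbuf_desc() above */
   uint32_t addr_lo = merge(0, desc_lo, 4, 28, 0);  /* desc bits  0..27 -> addr bits  4..31 */
   uint32_t addr_hi = merge(0, desc_lo, 0, 4, 28);  /* desc bits 28..31 -> addr bits 32..35 */
   addr_hi = merge(addr_hi, desc_hi, 4, 13, 0);     /* desc bits 32..44 -> addr bits 36..48 */
   uint32_t sz = merge(0, desc_hi, 4, 19, 13);      /* desc bits 45..63 -> size bits  4..22 */

   assert((((uint64_t)addr_hi << 32) | addr_lo) == base);
   assert(sz == size);
   return 0;
}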

View file

@@ -124,7 +124,7 @@ write_storage_image_view_desc(struct nvk_descriptor_set *set,
write_desc(set, binding, elem, &desc, sizeof(desc));
}
static struct nvk_buffer_address
static union nvk_buffer_descriptor
ubo_desc(struct nvk_physical_device *pdev,
const VkDescriptorBufferInfo *const info,
uint32_t binding, uint32_t elem)
@@ -140,10 +140,17 @@ ubo_desc(struct nvk_physical_device *pdev,
addr_range.addr = align64(addr_range.addr, min_cbuf_alignment);
addr_range.range = align(addr_range.range, min_cbuf_alignment);
return (struct nvk_buffer_address) {
.base_addr = align64(addr_range.addr, min_cbuf_alignment),
.size = align(addr_range.range, min_cbuf_alignment),
};
if (nvk_use_bindless_cbuf(&pdev->info)) {
return (union nvk_buffer_descriptor) { .cbuf = {
.base_addr_shift_4 = addr_range.addr >> 4,
.size_shift_4 = addr_range.range >> 4,
}};
} else {
return (union nvk_buffer_descriptor) { .addr = {
.base_addr = addr_range.addr,
.size = addr_range.range,
}};
}
}
static void
@@ -152,7 +159,7 @@ write_ubo_desc(struct nvk_physical_device *pdev,
const VkDescriptorBufferInfo *const info,
uint32_t binding, uint32_t elem)
{
const struct nvk_buffer_address desc = ubo_desc(pdev, info, binding, elem);
const union nvk_buffer_descriptor desc = ubo_desc(pdev, info, binding, elem);
write_desc(set, binding, elem, &desc, sizeof(desc));
}
@@ -168,7 +175,7 @@ write_dynamic_ubo_desc(struct nvk_physical_device *pdev,
ubo_desc(pdev, info, binding, elem);
}
static struct nvk_buffer_address
static union nvk_buffer_descriptor
ssbo_desc(const VkDescriptorBufferInfo *const info,
uint32_t binding, uint32_t elem)
{
@@ -182,10 +189,10 @@ ssbo_desc(const VkDescriptorBufferInfo *const info,
addr_range.addr = align64(addr_range.addr, NVK_MIN_SSBO_ALIGNMENT);
addr_range.range = align(addr_range.range, NVK_SSBO_BOUNDS_CHECK_ALIGNMENT);
return (struct nvk_buffer_address) {
.base_addr = align64(addr_range.addr, NVK_MIN_SSBO_ALIGNMENT),
.size = align(addr_range.range, NVK_SSBO_BOUNDS_CHECK_ALIGNMENT),
};
return (union nvk_buffer_descriptor) { .addr = {
.base_addr = addr_range.addr,
.size = addr_range.range,
}};
}
@@ -194,7 +201,7 @@ write_ssbo_desc(struct nvk_descriptor_set *set,
const VkDescriptorBufferInfo *const info,
uint32_t binding, uint32_t elem)
{
const struct nvk_buffer_address desc = ssbo_desc(info, binding, elem);
const union nvk_buffer_descriptor desc = ssbo_desc(info, binding, elem);
write_desc(set, binding, elem, &desc, sizeof(desc));
}
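
Note the asymmetry above: only the UBO path ever produces the packed .cbuf form; ssbo_desc() always fills the .addr arm of the union. The >> 4 in the UBO path is lossless because addr and range were just aligned to min_cbuf_alignment, which is a multiple of 16. A tiny check of that assumption (the alignment value here is hypothetical; the real one is per-device):

#include <assert.h>
#include <stdint.h>

#define MIN_CBUF_ALIGNMENT 64 /* hypothetical; the real value comes from the device */

static uint64_t
align64(uint64_t v, uint64_t a)
{
   return (v + a - 1) & ~(a - 1);
}

int
main(void)
{
   uint64_t addr = align64(0x12345, MIN_CBUF_ALIGNMENT);
   /* Any alignment that is a multiple of 16 survives the >> 4 round trip */
   assert(((addr >> 4) << 4) == addr);
   return 0;
}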

View file

@@ -9,6 +9,7 @@
#include "nouveau_bo.h"
#include "nvk_device.h"
#include "nvk_physical_device.h"
#include "vk_object.h"
#include "vk_descriptor_update_template.h"
@@ -52,6 +53,16 @@ PRAGMA_DIAGNOSTIC_POP
static_assert(sizeof(struct nvk_buffer_view_descriptor) == 4,
"nvk_buffer_view_descriptor has no holes");
PRAGMA_DIAGNOSTIC_PUSH
PRAGMA_DIAGNOSTIC_ERROR(-Wpadded)
struct nvk_bindless_cbuf {
uint64_t base_addr_shift_4:45;
uint64_t size_shift_4:19;
};
PRAGMA_DIAGNOSTIC_POP
static_assert(sizeof(struct nvk_bindless_cbuf) == 8,
"nvk_bindless_cbuf has no holes");
/* This has to match nir_address_format_64bit_bounded_global */
PRAGMA_DIAGNOSTIC_PUSH
PRAGMA_DIAGNOSTIC_ERROR(-Wpadded)
@@ -64,6 +75,31 @@ PRAGMA_DIAGNOSTIC_POP
static_assert(sizeof(struct nvk_buffer_address) == 16,
"nvk_buffer_address has no holes");
union nvk_buffer_descriptor {
struct nvk_buffer_address addr;
struct nvk_bindless_cbuf cbuf;
};
static inline bool
nvk_use_bindless_cbuf(const struct nv_device_info *info)
{
return info->cls_eng3d >= 0xC597 /* TURING_A */;
}
static inline struct nvk_buffer_address
nvk_ubo_descriptor_addr(const struct nvk_physical_device *pdev,
union nvk_buffer_descriptor desc)
{
if (nvk_use_bindless_cbuf(&pdev->info)) {
return (struct nvk_buffer_address) {
.base_addr = desc.cbuf.base_addr_shift_4 << 4,
.size = desc.cbuf.size_shift_4 << 4,
};
} else {
return desc.addr;
}
}
#define NVK_BUFFER_ADDRESS_NULL ((struct nvk_buffer_address) { .size = 0 })
struct nvk_descriptor_pool {
@@ -90,7 +126,7 @@ struct nvk_descriptor_set {
uint64_t addr;
uint32_t size;
struct nvk_buffer_address dynamic_buffers[];
union nvk_buffer_descriptor dynamic_buffers[];
};
VK_DEFINE_NONDISP_HANDLE_CASTS(nvk_descriptor_set, base, VkDescriptorSet,

View file

@@ -51,7 +51,7 @@ nvk_descriptor_stride_align_for_type(const struct nvk_physical_device *pdev,
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
*stride = *alignment = sizeof(struct nvk_buffer_address);
*stride = *alignment = sizeof(union nvk_buffer_descriptor);
break;
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:

View file

@@ -50,6 +50,7 @@ compar_cbufs(const void *_a, const void *_b)
struct lower_descriptors_ctx {
const struct nvk_descriptor_set_layout *set_layouts[NVK_MAX_SETS];
bool use_bindless_cbuf;
bool clamp_desc_array_bounds;
nir_address_format ubo_addr_format;
nir_address_format ssbo_addr_format;
@@ -610,13 +611,14 @@ load_descriptor(nir_builder *b, unsigned num_components, unsigned bit_size,
index = nir_iadd(b, index,
nir_iadd_imm(b, dynamic_buffer_start,
binding_layout->dynamic_buffer_index));
uint32_t desc_size = sizeof(union nvk_buffer_descriptor);
nir_def *root_desc_offset =
nir_iadd_imm(b, nir_imul_imm(b, index, sizeof(struct nvk_buffer_address)),
nir_iadd_imm(b, nir_imul_imm(b, index, desc_size),
nvk_root_descriptor_offset(dynamic_buffers));
assert(num_components == 4 && bit_size == 32);
return nir_ldc_nv(b, 4, 32, nir_imm_int(b, 0), root_desc_offset,
assert(num_components * bit_size <= desc_size * 8);
return nir_ldc_nv(b, num_components, bit_size,
nir_imm_int(b, 0), root_desc_offset,
.align_mul = 16, .align_offset = 0);
}
@@ -628,12 +630,19 @@ load_descriptor(nir_builder *b, unsigned num_components, unsigned bit_size,
assert(binding_layout->stride == 1);
const uint32_t binding_size = binding_layout->array_size;
/* Convert it to nir_address_format_64bit_bounded_global */
assert(num_components == 4 && bit_size == 32);
return nir_vec4(b, nir_unpack_64_2x32_split_x(b, base_addr),
nir_unpack_64_2x32_split_y(b, base_addr),
nir_imm_int(b, binding_size),
nir_imm_int(b, 0));
if (ctx->use_bindless_cbuf) {
assert(num_components == 1 && bit_size == 64);
const uint32_t size = align(binding_size, 16);
return nir_ior_imm(b, nir_ishr_imm(b, base_addr, 4),
((uint64_t)size >> 4) << 45);
} else {
/* Convert it to nir_address_format_64bit_bounded_global */
assert(num_components == 4 && bit_size == 32);
return nir_vec4(b, nir_unpack_64_2x32_split_x(b, base_addr),
nir_unpack_64_2x32_split_y(b, base_addr),
nir_imm_int(b, binding_size),
nir_imm_int(b, 0));
}
}
default: {
@@ -685,6 +694,29 @@ is_idx_intrin(nir_intrinsic_instr *intrin)
return intrin->intrinsic == nir_intrinsic_vulkan_resource_index;
}
static nir_def *
buffer_address_to_ldcx_handle(nir_builder *b, nir_def *addr)
{
nir_def *base_addr = nir_pack_64_2x32(b, nir_channels(b, addr, 0x3));
nir_def *size = nir_channel(b, addr, 2);
nir_def *offset = nir_channel(b, addr, 3);
nir_def *addr16 = nir_ushr_imm(b, base_addr, 4);
nir_def *addr16_lo = nir_unpack_64_2x32_split_x(b, addr16);
nir_def *addr16_hi = nir_unpack_64_2x32_split_y(b, addr16);
/* If we assume the top bits of the address are 0, as well as the bottom
* four bits of the size (we can trust this since it's a descriptor), then
*
* ((size >> 4) << 13) | addr
*
* is just an imad.
*/
nir_def *handle_hi = nir_imad(b, size, nir_imm_int(b, 1 << 9), addr16_hi);
return nir_vec3(b, addr16_lo, handle_hi, offset);
}
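
Concretely: addr16_hi can only have bits 0..12 set (the shifted address fits in 45 bits) and size * (1 << 9) can only have bits 13..31 set (the size is 16-byte aligned), so the OR degenerates to an addition and the whole expression is one imad. A standalone check of that identity:

#include <assert.h>
#include <stdint.h>

int
main(void)
{
   uint64_t base = 0x123456789a0ull;  /* 16-byte aligned, fits in 45 bits after >> 4 */
   uint32_t size = 0x1230;            /* 16-byte aligned */
   uint64_t addr16 = base >> 4;
   uint32_t addr16_hi = (uint32_t)(addr16 >> 32);

   /* Intended handle_hi: size field at bits 13..31, address bits below */
   uint32_t want = ((size >> 4) << 13) | addr16_hi;

   /* What the imad computes: a single multiply-add */
   uint32_t got = size * (1u << 9) + addr16_hi;

   assert(got == want);
   return 0;
}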
static nir_def *
load_descriptor_for_idx_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
const struct lower_descriptors_ctx *ctx)
@@ -701,13 +733,23 @@ load_descriptor_for_idx_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
uint32_t binding = nir_intrinsic_binding(intrin);
index = nir_iadd(b, index, intrin->src[0].ssa);
nir_def *desc = load_descriptor(b, 4, 32, set, binding, index, 0, ctx);
const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin);
if (descriptor_type_is_ubo(desc_type) && ctx->use_bindless_cbuf) {
nir_def *desc = load_descriptor(b, 1, 64, set, binding, index, 0, ctx);
/* We know a priori that the .w component (offset) is zero */
return nir_vec4(b, nir_channel(b, desc, 0),
nir_channel(b, desc, 1),
nir_channel(b, desc, 2),
nir_imm_int(b, 0));
/* The descriptor is just the handle. NIR also needs an offset. */
return nir_vec3(b, nir_unpack_64_2x32_split_x(b, desc),
nir_unpack_64_2x32_split_y(b, desc),
nir_imm_int(b, 0));
} else {
nir_def *desc = load_descriptor(b, 4, 32, set, binding, index, 0, ctx);
/* We know a priori that the .w component (offset) is zero */
return nir_vec4(b, nir_channel(b, desc, 0),
nir_channel(b, desc, 1),
nir_channel(b, desc, 2),
nir_imm_int(b, 0));
}
}
static bool
@@ -1253,6 +1295,7 @@ nvk_nir_lower_descriptors(nir_shader *nir,
struct nvk_cbuf_map *cbuf_map_out)
{
struct lower_descriptors_ctx ctx = {
.use_bindless_cbuf = nvk_use_bindless_cbuf(&pdev->info),
.clamp_desc_array_bounds =
rs->storage_buffers != VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT ||
rs->uniform_buffers != VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT ||

View file

@@ -116,14 +116,18 @@ nir_address_format
nvk_ubo_addr_format(const struct nvk_physical_device *pdev,
VkPipelineRobustnessBufferBehaviorEXT robustness)
{
switch (robustness) {
case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT:
return nir_address_format_64bit_global_32bit_offset;
case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT:
case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT:
return nir_address_format_64bit_bounded_global;
default:
unreachable("Invalid robust buffer access behavior");
if (nvk_use_bindless_cbuf(&pdev->info)) {
return nir_address_format_vec2_index_32bit_offset;
} else {
switch (robustness) {
case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT:
return nir_address_format_64bit_global_32bit_offset;
case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT:
case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT:
return nir_address_format_64bit_bounded_global;
default:
unreachable("Invalid robust buffer access behavior");
}
}
}
@@ -258,10 +262,21 @@ lower_load_intrinsic(nir_builder *b, nir_intrinsic_instr *load,
const uint32_t align_mul = nir_intrinsic_align_mul(load);
const uint32_t align_offset = nir_intrinsic_align_offset(load);
nir_def *val = nir_ldc_nv(b, load->num_components, load->def.bit_size,
index, offset, .access = access,
.align_mul = align_mul,
.align_offset = align_offset);
nir_def *val;
if (load->src[0].ssa->num_components == 1) {
val = nir_ldc_nv(b, load->num_components, load->def.bit_size,
index, offset, .access = access,
.align_mul = align_mul,
.align_offset = align_offset);
} else if (load->src[0].ssa->num_components == 2) {
nir_def *handle = nir_pack_64_2x32(b, load->src[0].ssa);
val = nir_ldcx_nv(b, load->num_components, load->def.bit_size,
handle, offset, .access = access,
.align_mul = align_mul,
.align_offset = align_offset);
} else {
unreachable("Invalid UBO index");
}
nir_def_rewrite_uses(&load->def, val);
return true;
}
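
So a one-component index is the classic bound-cbuf path (ldc with a cbuf index), while a two-component index is the bindless descriptor, repacked into a 64-bit handle for ldcx. A plain-C model of the bindless load, under the assumption that out-of-bounds reads return zero the way bound-cbuf reads do (an assumption of this sketch, not a statement of the hardware behavior):

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Model of a bindless cbuf load: handle is the packed 45/19-bit descriptor
 * and offset is in bytes.  For the model, the unpacked address indexes into
 * mem rather than a GPU VA space.
 */
static uint32_t
model_ldcx(const uint8_t *mem, uint64_t handle, uint32_t offset)
{
   uint64_t addr = (handle & ((1ull << 45) - 1)) << 4;
   uint32_t size = (uint32_t)(handle >> 45) << 4;
   if ((uint64_t)offset + 4 > size)
      return 0; /* out-of-bounds reads as zero (assumed) */
   uint32_t val;
   memcpy(&val, mem + addr + offset, sizeof(val));
   return val;
}

int
main(void)
{
   uint8_t mem[64] = {0};
   memset(mem + 16, 0xab, 4); /* first dword of the cbuf */
   uint64_t handle = (16ull >> 4) | ((32ull >> 4) << 45); /* addr 16, size 32 */
   assert(model_ldcx(mem, handle, 0) == 0xabababab);
   assert(model_ldcx(mem, handle, 32) == 0); /* past the 32-byte bound */
   return 0;
}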