anv: update shader descriptor resource limits

Some limits got stuck to the old binding table limits. Those don't
apply anymore since EXT_descriptor_indexing was implemented.

Fixes: 6e230d7607 ("anv: Implement VK_EXT_descriptor_indexing")
Fixes: 96c33fb027 ("anv: enable direct descriptors on platforms with extended bindless offset")
Reviewed-by: Ivan Briano <ivan.briano@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31999>
(cherry picked from commit d6acb56f11)
This commit is contained in:
Lionel Landwerlin 2024-11-06 09:15:54 +02:00 committed by Eric Engestrom
parent c2c3d6ab61
commit cc372f4165
2 changed files with 98 additions and 50 deletions

View file

@ -374,7 +374,7 @@
"description": "anv: update shader descriptor resource limits",
"nominated": true,
"nomination_type": 1,
"resolution": 0,
"resolution": 1,
"main_sha": null,
"because_sha": "6e230d7607f9b3e082d00859bd7725c4dc87e5cf",
"notes": null

View file

@ -165,6 +165,72 @@ compiler_perf_log(UNUSED void *data, UNUSED unsigned *id, const char *fmt, ...)
va_end(args);
}
struct anv_descriptor_limits {
uint32_t max_ubos;
uint32_t max_ssbos;
uint32_t max_samplers;
uint32_t max_images;
uint32_t max_resources;
};
static void
get_device_descriptor_limits(const struct anv_physical_device *device,
struct anv_descriptor_limits *limits)
{
memset(limits, 0, sizeof(*limits));
/* It's a bit hard to exactly map our implementation to the limits
* described by Vulkan. The bindless surface handle in the extended message
* descriptors is 20 bits on <= Gfx12.0, 26 bits on >= Gfx12.5 and it's an
* index into the table of RENDER_SURFACE_STATE structs that starts at
* bindless surface base address. On <= Gfx12.0, this means that we can
* have at must 1M surface states allocated at any given time. Since most
* image views take two descriptors, this means we have a limit of about
* 500K image views. On >= Gfx12.5, we do not need 2 surfaces per
* descriptors and we have 33M+ descriptors (we have a 2GB limit, due to
* overlapping heaps for workarounds, but HW can do 4GB).
*
* However, on <= Gfx12.0, since we allocate surface states at
* vkCreateImageView time, this means our limit is actually something on
* the order of 500K image views allocated at any time. The actual limit
* describe by Vulkan, on the other hand, is a limit of how many you can
* have in a descriptor set. Assuming anyone using 1M descriptors will be
* using the same image view twice a bunch of times (or a bunch of null
* descriptors), we can safely advertise a larger limit here.
*
* Here we use the size of the heap in which the descriptors are stored and
* divide by the size of the descriptor to get a limit value.
*/
const uint64_t descriptor_heap_size =
device->indirect_descriptors ?
device->va.indirect_descriptor_pool.size :
device->va.bindless_surface_state_pool.size;;
const uint32_t buffer_descriptor_size =
device->indirect_descriptors ?
sizeof(struct anv_address_range_descriptor) :
ANV_SURFACE_STATE_SIZE;
const uint32_t image_descriptor_size =
device->indirect_descriptors ?
sizeof(struct anv_address_range_descriptor) :
ANV_SURFACE_STATE_SIZE;
const uint32_t sampler_descriptor_size =
device->indirect_descriptors ?
sizeof(struct anv_sampled_image_descriptor) :
ANV_SAMPLER_STATE_SIZE;
limits->max_ubos = descriptor_heap_size / buffer_descriptor_size;
limits->max_ssbos = descriptor_heap_size / buffer_descriptor_size;
limits->max_images = descriptor_heap_size / image_descriptor_size;
limits->max_samplers = descriptor_heap_size / sampler_descriptor_size;
limits->max_resources = UINT32_MAX;
limits->max_resources = MIN2(limits->max_resources, limits->max_ubos);
limits->max_resources = MIN2(limits->max_resources, limits->max_ssbos);
limits->max_resources = MIN2(limits->max_resources, limits->max_images);
limits->max_resources = MIN2(limits->max_resources, limits->max_samplers);
}
#if defined(VK_USE_PLATFORM_WAYLAND_KHR) || \
defined(VK_USE_PLATFORM_XCB_KHR) || \
defined(VK_USE_PLATFORM_XLIB_KHR) || \
@ -1111,25 +1177,10 @@ get_properties_1_2(const struct anv_physical_device *pdevice,
p->shaderRoundingModeRTZFloat64 = true;
p->shaderSignedZeroInfNanPreserveFloat64 = true;
/* It's a bit hard to exactly map our implementation to the limits
* described by Vulkan. The bindless surface handle in the extended
* message descriptors is 20 bits and it's an index into the table of
* RENDER_SURFACE_STATE structs that starts at bindless surface base
* address. This means that we can have at must 1M surface states
* allocated at any given time. Since most image views take two
* descriptors, this means we have a limit of about 500K image views.
*
* However, since we allocate surface states at vkCreateImageView time,
* this means our limit is actually something on the order of 500K image
* views allocated at any time. The actual limit describe by Vulkan, on
* the other hand, is a limit of how many you can have in a descriptor set.
* Assuming anyone using 1M descriptors will be using the same image view
* twice a bunch of times (or a bunch of null descriptors), we can safely
* advertise a larger limit here.
*/
const unsigned max_bindless_views =
anv_physical_device_bindless_heap_size(pdevice, false) / ANV_SURFACE_STATE_SIZE;
p->maxUpdateAfterBindDescriptorsInAllPools = max_bindless_views;
struct anv_descriptor_limits desc_limits;
get_device_descriptor_limits(pdevice, &desc_limits);
p->maxUpdateAfterBindDescriptorsInAllPools = desc_limits.max_resources;
p->shaderUniformBufferArrayNonUniformIndexingNative = false;
p->shaderSampledImageArrayNonUniformIndexingNative = false;
p->shaderStorageBufferArrayNonUniformIndexingNative = true;
@ -1137,20 +1188,20 @@ get_properties_1_2(const struct anv_physical_device *pdevice,
p->shaderInputAttachmentArrayNonUniformIndexingNative = false;
p->robustBufferAccessUpdateAfterBind = true;
p->quadDivergentImplicitLod = false;
p->maxPerStageDescriptorUpdateAfterBindSamplers = max_bindless_views;
p->maxPerStageDescriptorUpdateAfterBindUniformBuffers = MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS;
p->maxPerStageDescriptorUpdateAfterBindStorageBuffers = UINT32_MAX;
p->maxPerStageDescriptorUpdateAfterBindSampledImages = max_bindless_views;
p->maxPerStageDescriptorUpdateAfterBindStorageImages = max_bindless_views;
p->maxPerStageDescriptorUpdateAfterBindSamplers = desc_limits.max_samplers;
p->maxPerStageDescriptorUpdateAfterBindUniformBuffers = desc_limits.max_ubos;
p->maxPerStageDescriptorUpdateAfterBindStorageBuffers = desc_limits.max_ssbos;
p->maxPerStageDescriptorUpdateAfterBindSampledImages = desc_limits.max_images;
p->maxPerStageDescriptorUpdateAfterBindStorageImages = desc_limits.max_images;
p->maxPerStageDescriptorUpdateAfterBindInputAttachments = MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS;
p->maxPerStageUpdateAfterBindResources = UINT32_MAX;
p->maxDescriptorSetUpdateAfterBindSamplers = max_bindless_views;
p->maxDescriptorSetUpdateAfterBindUniformBuffers = 6 * MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS;
p->maxPerStageUpdateAfterBindResources = desc_limits.max_resources;
p->maxDescriptorSetUpdateAfterBindSamplers = desc_limits.max_samplers;
p->maxDescriptorSetUpdateAfterBindUniformBuffers = desc_limits.max_ubos;
p->maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2;
p->maxDescriptorSetUpdateAfterBindStorageBuffers = UINT32_MAX;
p->maxDescriptorSetUpdateAfterBindStorageBuffers = desc_limits.max_ssbos;
p->maxDescriptorSetUpdateAfterBindStorageBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2;
p->maxDescriptorSetUpdateAfterBindSampledImages = max_bindless_views;
p->maxDescriptorSetUpdateAfterBindStorageImages = max_bindless_views;
p->maxDescriptorSetUpdateAfterBindSampledImages = desc_limits.max_images;
p->maxDescriptorSetUpdateAfterBindStorageImages = desc_limits.max_images;
p->maxDescriptorSetUpdateAfterBindInputAttachments = MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS;
/* We support all of the depth resolve modes */
@ -1264,15 +1315,8 @@ get_properties(const struct anv_physical_device *pdevice,
const struct intel_device_info *devinfo = &pdevice->info;
const uint32_t max_ssbos = UINT16_MAX;
const uint32_t max_textures = UINT16_MAX;
const uint32_t max_samplers = UINT16_MAX;
const uint32_t max_images = UINT16_MAX;
const VkDeviceSize max_heap_size = anx_get_physical_device_max_heap_size(pdevice);
/* Claim a high per-stage limit since we have bindless. */
const uint32_t max_per_stage = UINT32_MAX;
const uint32_t max_workgroup_size =
MIN2(1024, 32 * devinfo->max_cs_workgroup_threads);
@ -1297,6 +1341,9 @@ get_properties(const struct anv_physical_device *pdevice,
}
#endif /* DETECT_OS_ANDROID */
struct anv_descriptor_limits desc_limits;
get_device_descriptor_limits(pdevice, &desc_limits);
*props = (struct vk_properties) {
.apiVersion = ANV_API_VERSION,
.driverVersion = vk_get_driver_version(),
@ -1322,20 +1369,20 @@ get_properties(const struct anv_physical_device *pdevice,
.bufferImageGranularity = 1,
.sparseAddressSpaceSize = sparse_addr_space_size,
.maxBoundDescriptorSets = MAX_SETS,
.maxPerStageDescriptorSamplers = max_samplers,
.maxPerStageDescriptorUniformBuffers = MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS,
.maxPerStageDescriptorStorageBuffers = max_ssbos,
.maxPerStageDescriptorSampledImages = max_textures,
.maxPerStageDescriptorStorageImages = max_images,
.maxPerStageDescriptorSamplers = desc_limits.max_samplers,
.maxPerStageDescriptorUniformBuffers = desc_limits.max_ubos,
.maxPerStageDescriptorStorageBuffers = desc_limits.max_ssbos,
.maxPerStageDescriptorSampledImages = desc_limits.max_images,
.maxPerStageDescriptorStorageImages = desc_limits.max_images,
.maxPerStageDescriptorInputAttachments = MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS,
.maxPerStageResources = max_per_stage,
.maxDescriptorSetSamplers = 6 * max_samplers, /* number of stages * maxPerStageDescriptorSamplers */
.maxDescriptorSetUniformBuffers = 6 * MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS, /* number of stages * maxPerStageDescriptorUniformBuffers */
.maxPerStageResources = desc_limits.max_resources,
.maxDescriptorSetSamplers = desc_limits.max_samplers,
.maxDescriptorSetUniformBuffers = desc_limits.max_ubos,
.maxDescriptorSetUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2,
.maxDescriptorSetStorageBuffers = 6 * max_ssbos, /* number of stages * maxPerStageDescriptorStorageBuffers */
.maxDescriptorSetStorageBuffers = desc_limits.max_ssbos,
.maxDescriptorSetStorageBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2,
.maxDescriptorSetSampledImages = 6 * max_textures, /* number of stages * maxPerStageDescriptorSampledImages */
.maxDescriptorSetStorageImages = 6 * max_images, /* number of stages * maxPerStageDescriptorStorageImages */
.maxDescriptorSetSampledImages = desc_limits.max_images,
.maxDescriptorSetStorageImages = desc_limits.max_images,
.maxDescriptorSetInputAttachments = MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS,
.maxVertexInputAttributes = MAX_VES,
.maxVertexInputBindings = MAX_VBS,
@ -1366,7 +1413,8 @@ get_properties(const struct anv_physical_device *pdevice,
.maxFragmentInputComponents = 116, /* 128 components - (PSIZ, CLIP_DIST0, CLIP_DIST1) */
.maxFragmentOutputAttachments = 8,
.maxFragmentDualSrcAttachments = 1,
.maxFragmentCombinedOutputResources = MAX_RTS + max_ssbos + max_images,
.maxFragmentCombinedOutputResources = MAX_RTS + desc_limits.max_ssbos +
desc_limits.max_images,
.maxComputeSharedMemorySize = intel_device_info_get_max_slm_size(&pdevice->info),
.maxComputeWorkGroupCount = { 65535, 65535, 65535 },
.maxComputeWorkGroupInvocations = max_workgroup_size,