From 7c76125db25db9bdf5521baf7c848ccd4bd37b5f Mon Sep 17 00:00:00 2001
From: Lionel Landwerlin
Date: Fri, 20 Oct 2023 17:33:21 +0300
Subject: [PATCH] anv: use 2 different buffers for surfaces/samplers in
 descriptor sets

We made the unfortunate discovery on a recent platform that the
bindless sampler heap is not functioning as expected. Nowhere in the
documentation is the size of that heap written down, so most people
assumed it covers the maximum range we can program (4GB). In reality
it is only 64MB.

Though it appears to work correctly over the whole 4GB range for most
apps, that is only because the HW bounds checking is broken: instead
of clamping anything beyond 64MB, it only clamps the last 4KB of each
64MB region. So this heap is useless for building a 4GB region of both
sampler & surface states...

This change essentially turns off the bindless sampler heap on DG2+.
The only location where we can put SAMPLER_STATE elements is then the
dynamic state heap. Unfortunately we cannot align the dynamic state
heap with the bindless surface state heap, so the solution is to
allocate sampler & surface states separately, each from its own heap
in the descriptor pool. We now have to provide 2 sets of offsets, one
for surfaces & one for samplers.

Signed-off-by: Lionel Landwerlin
Reviewed-by: Rohan Garg
Part-of:
---
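An illustrative summary, not part of the patch itself: a descriptor set
now carries two backing allocations whose GPU offsets are reported to
shaders separately, each relative to its own heap. The sketch below is
hypothetical glue code; the fields and helpers it uses
(anv_address_physical(), desc_surface_addr, desc_sampler_addr, the va
ranges) are the ones appearing in the diff, but the packing of the
dynamic-offset index into the low bits of desc_surface_offsets is
omitted for brevity.

   static void
   example_fill_desc_offsets(struct anv_cmd_buffer *cmd_buffer,
                             struct anv_descriptor_set *set,
                             struct anv_push_constants *push,
                             uint32_t set_index)
   {
      const struct anv_physical_device *pdevice =
         cmd_buffer->device->physical;

      /* Surface states live in the descriptor pool's surface buffer,
       * allocated out of the internal surface state heap.
       */
      push->desc_surface_offsets[set_index] =
         anv_address_physical(set->desc_surface_addr) -
         pdevice->va.internal_surface_state_pool.addr;

      /* Sampler states live in a separate buffer allocated out of the
       * dynamic state heap, the only heap that can hold SAMPLER_STATE
       * on DG2+.
       */
      push->desc_sampler_offsets[set_index] =
         anv_address_physical(set->desc_sampler_addr) -
         pdevice->va.dynamic_state_pool.addr;
   }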
 src/intel/vulkan/anv_cmd_buffer.c             |  41 +-
 src/intel/vulkan/anv_descriptor_set.c         | 772 ++++++++++++------
 src/intel/vulkan/anv_device.c                 |  13 +-
 .../vulkan/anv_nir_apply_pipeline_layout.c    |  96 ++-
 .../vulkan/anv_nir_compute_push_layout.c      |  18 +-
 .../vulkan/anv_nir_lower_resource_intel.c     |   4 +-
 src/intel/vulkan/anv_pipeline_cache.c         |   6 +-
 src/intel/vulkan/anv_private.h                | 105 ++-
 src/intel/vulkan/anv_va.c                     |  27 +-
 src/intel/vulkan/genX_cmd_buffer.c            |  44 +-
 src/intel/vulkan/genX_init_state.c            |  33 +-
 11 files changed, 779 insertions(+), 380 deletions(-)

diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c
index 86350dd6258..300de3dc906 100644
--- a/src/intel/vulkan/anv_cmd_buffer.c
+++ b/src/intel/vulkan/anv_cmd_buffer.c
@@ -673,10 +673,11 @@ void anv_CmdBindPipeline(
          assert(layout->set[s].dynamic_offset_start < MAX_DYNAMIC_BUFFERS);
          if (layout->set[s].layout->dynamic_offset_count > 0 &&
-             (push->desc_offsets[s] & ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK) != layout->set[s].dynamic_offset_start) {
-            push->desc_offsets[s] &= ~ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK;
-            push->desc_offsets[s] |= (layout->set[s].dynamic_offset_start &
-                                      ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK);
+             (push->desc_surface_offsets[s] & ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK) !=
+             layout->set[s].dynamic_offset_start) {
+            push->desc_surface_offsets[s] &= ~ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK;
+            push->desc_surface_offsets[s] |= (layout->set[s].dynamic_offset_start &
+                                              ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK);
             modified = true;
          }
       }
@@ -788,16 +789,16 @@ anv_cmd_buffer_bind_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
 
    /* When using indirect descriptors, stages that have access to the HW
     * binding tables, never need to access the
-    * anv_push_constants::desc_offsets fields, because any data they need
-    * from the descriptor buffer is accessible through a binding table
-    * entry. For stages that are "bindless" (Mesh/Task/RT), we need to
-    * provide anv_push_constants::desc_offsets matching the bound
+    * anv_push_constants::desc_surface_offsets fields, because any data
+    * they need from the descriptor buffer is accessible through a binding
+    * table entry. For stages that are "bindless" (Mesh/Task/RT), we need
+    * to provide anv_push_constants::desc_surface_offsets matching the bound
     * descriptor so that shaders can access the descriptor buffer through
     * A64 messages.
     *
     * With direct descriptors, the shaders can use the
-    * anv_push_constants::desc_offsets to build bindless offsets. So it's
-    * we always need to update the push constant data.
+    * anv_push_constants::desc_surface_offsets to build bindless offsets.
+    * So we always need to update the push constant data.
     */
    bool update_desc_sets =
       !cmd_buffer->device->physical->indirect_descriptors ||
@@ -813,18 +814,20 @@ anv_cmd_buffer_bind_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
 
    if (update_desc_sets) {
       struct anv_push_constants *push = &pipe_state->push_constants;
 
-      struct anv_address set_addr = anv_descriptor_set_address(set);
       uint64_t offset =
-         anv_address_physical(set_addr) -
-         cmd_buffer->device->physical->va.binding_table_pool.addr;
+         anv_address_physical(set->desc_surface_addr) -
+         cmd_buffer->device->physical->va.internal_surface_state_pool.addr;
       assert((offset & ~ANV_DESCRIPTOR_SET_OFFSET_MASK) == 0);
-      push->desc_offsets[set_index] &= ~ANV_DESCRIPTOR_SET_OFFSET_MASK;
-      push->desc_offsets[set_index] |= offset;
+      push->desc_surface_offsets[set_index] &= ~ANV_DESCRIPTOR_SET_OFFSET_MASK;
+      push->desc_surface_offsets[set_index] |= offset;
+      push->desc_sampler_offsets[set_index] |=
+         anv_address_physical(set->desc_sampler_addr) -
+         cmd_buffer->device->physical->va.dynamic_state_pool.addr;
 
-      if (set_addr.bo) {
-         anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
-                               set_addr.bo);
-      }
+      anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+                            set->desc_surface_addr.bo);
+      anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+                            set->desc_sampler_addr.bo);
    }
 
    dirty_stages |= stages;
diff --git a/src/intel/vulkan/anv_descriptor_set.c b/src/intel/vulkan/anv_descriptor_set.c
index 902989d7273..017bc306c94 100644
--- a/src/intel/vulkan/anv_descriptor_set.c
+++ b/src/intel/vulkan/anv_descriptor_set.c
@@ -36,27 +36,40 @@
  * Descriptor set layouts.
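 *
 * A rough, illustrative picture of the bookkeeping change in this file
 * (not literal code): every binding now tracks a surface/sampler pair
 * of sizes, strides and offsets instead of a single one. The helper
 * below exists in this patch; the *_total / *_align accumulators are
 * made up for the example:
 *
 *    uint16_t surf_size, samp_size;
 *    anv_descriptor_data_size(binding->data, layout->type,
 *                             &surf_size, &samp_size);
 *    surface_total = align(surface_total, surf_align) +
 *                    surf_size * binding->array_size;
 *    sampler_total = align(sampler_total, samp_align) +
 *                    samp_size * binding->array_size;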
*/ -static unsigned -anv_descriptor_data_alignment(enum anv_descriptor_data data) +static void +anv_descriptor_data_alignment(enum anv_descriptor_data data, + enum anv_descriptor_set_layout_type layout_type, + unsigned *out_surface_align, + unsigned *out_sampler_align) { - unsigned alignment = 1; + unsigned surface_align = 1, sampler_align = 1; if (data & (ANV_DESCRIPTOR_INDIRECT_SAMPLED_IMAGE | ANV_DESCRIPTOR_INDIRECT_STORAGE_IMAGE | ANV_DESCRIPTOR_INDIRECT_ADDRESS_RANGE)) - alignment = MAX2(alignment, 8); + surface_align = MAX2(surface_align, 8); - if (data & (ANV_DESCRIPTOR_SURFACE | - ANV_DESCRIPTOR_SURFACE_SAMPLER)) - alignment = MAX2(alignment, ANV_SURFACE_STATE_SIZE); + if (data & ANV_DESCRIPTOR_SURFACE) + surface_align = MAX2(surface_align, ANV_SURFACE_STATE_SIZE); - if (data & ANV_DESCRIPTOR_SAMPLER) - alignment = MAX2(alignment, ANV_SAMPLER_STATE_SIZE); + if (data & ANV_DESCRIPTOR_SURFACE_SAMPLER) { + surface_align = MAX2(surface_align, ANV_SURFACE_STATE_SIZE); + if (layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT) + sampler_align = MAX2(sampler_align, ANV_SAMPLER_STATE_SIZE); + } + + if (data & ANV_DESCRIPTOR_SAMPLER) { + if (layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT) + sampler_align = MAX2(sampler_align, ANV_SAMPLER_STATE_SIZE); + else + surface_align = MAX2(surface_align, ANV_SAMPLER_STATE_SIZE); + } if (data & ANV_DESCRIPTOR_INLINE_UNIFORM) - alignment = MAX2(alignment, ANV_UBO_ALIGNMENT); + surface_align = MAX2(surface_align, ANV_UBO_ALIGNMENT); - return alignment; + *out_surface_align = surface_align; + *out_sampler_align = sampler_align; } static enum anv_descriptor_data @@ -125,7 +138,8 @@ anv_indirect_descriptor_data_for_type(VkDescriptorType type) } static enum anv_descriptor_data -anv_direct_descriptor_data_for_type(VkDescriptorType type) +anv_direct_descriptor_data_for_type(enum anv_descriptor_set_layout_type layout_type, + VkDescriptorType type) { enum anv_descriptor_data data = 0; @@ -136,9 +150,16 @@ anv_direct_descriptor_data_for_type(VkDescriptorType type) break; case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: - data = ANV_DESCRIPTOR_BTI_SURFACE_STATE | - ANV_DESCRIPTOR_BTI_SAMPLER_STATE | - ANV_DESCRIPTOR_SURFACE_SAMPLER; + if (layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT) { + data = ANV_DESCRIPTOR_BTI_SURFACE_STATE | + ANV_DESCRIPTOR_BTI_SAMPLER_STATE | + ANV_DESCRIPTOR_SURFACE | + ANV_DESCRIPTOR_SAMPLER; + } else { + data = ANV_DESCRIPTOR_BTI_SURFACE_STATE | + ANV_DESCRIPTOR_BTI_SAMPLER_STATE | + ANV_DESCRIPTOR_SURFACE_SAMPLER; + } break; case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: @@ -171,16 +192,18 @@ anv_direct_descriptor_data_for_type(VkDescriptorType type) static enum anv_descriptor_data anv_descriptor_data_for_type(const struct anv_physical_device *device, + enum anv_descriptor_set_layout_type layout_type, VkDescriptorType type) { if (device->indirect_descriptors) return anv_indirect_descriptor_data_for_type(type); else - return anv_direct_descriptor_data_for_type(type); + return anv_direct_descriptor_data_for_type(layout_type, type); } static enum anv_descriptor_data anv_descriptor_data_for_mutable_type(const struct anv_physical_device *device, + enum anv_descriptor_set_layout_type layout_type, const VkMutableDescriptorTypeCreateInfoEXT *mutable_info, int binding) { @@ -193,11 +216,11 @@ anv_descriptor_data_for_mutable_type(const struct anv_physical_device *device, i == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) continue; - desc_data |= anv_descriptor_data_for_type(device, i); + desc_data |= 
anv_descriptor_data_for_type(device, layout_type, i); } desc_data |= anv_descriptor_data_for_type( - device, VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR); + device, layout_type, VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR); return desc_data; } @@ -206,79 +229,102 @@ anv_descriptor_data_for_mutable_type(const struct anv_physical_device *device, &mutable_info->pMutableDescriptorTypeLists[binding]; for (uint32_t i = 0; i < type_list->descriptorTypeCount; i++) { desc_data |= - anv_descriptor_data_for_type(device, type_list->pDescriptorTypes[i]); + anv_descriptor_data_for_type(device, layout_type, + type_list->pDescriptorTypes[i]); } return desc_data; } -static unsigned -anv_descriptor_data_size(enum anv_descriptor_data data) +static void +anv_descriptor_data_size(enum anv_descriptor_data data, + enum anv_descriptor_set_layout_type layout_type, + uint16_t *out_surface_size, + uint16_t *out_sampler_size) { - unsigned size = 0; + unsigned surface_size = 0; + unsigned sampler_size = 0; if (data & ANV_DESCRIPTOR_INDIRECT_SAMPLED_IMAGE) - size += sizeof(struct anv_sampled_image_descriptor); + surface_size += sizeof(struct anv_sampled_image_descriptor); if (data & ANV_DESCRIPTOR_INDIRECT_STORAGE_IMAGE) - size += sizeof(struct anv_storage_image_descriptor); + surface_size += sizeof(struct anv_storage_image_descriptor); if (data & ANV_DESCRIPTOR_INDIRECT_ADDRESS_RANGE) - size += sizeof(struct anv_address_range_descriptor); + surface_size += sizeof(struct anv_address_range_descriptor); if (data & ANV_DESCRIPTOR_SURFACE) - size += ANV_SURFACE_STATE_SIZE; + surface_size += ANV_SURFACE_STATE_SIZE; - if (data & ANV_DESCRIPTOR_SAMPLER) - size += ANV_SAMPLER_STATE_SIZE; + /* Direct descriptors have sampler states stored separately */ + if (layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT) { + if (data & ANV_DESCRIPTOR_SAMPLER) + sampler_size += ANV_SAMPLER_STATE_SIZE; - if (data & ANV_DESCRIPTOR_SURFACE_SAMPLER) { - size += ALIGN(ANV_SURFACE_STATE_SIZE + ANV_SAMPLER_STATE_SIZE, - ANV_SURFACE_STATE_SIZE); + if (data & ANV_DESCRIPTOR_SURFACE_SAMPLER) { + surface_size += ANV_SURFACE_STATE_SIZE; + sampler_size += ANV_SAMPLER_STATE_SIZE; + } + } else { + if (data & ANV_DESCRIPTOR_SAMPLER) + surface_size += ANV_SAMPLER_STATE_SIZE; + + if (data & ANV_DESCRIPTOR_SURFACE_SAMPLER) { + surface_size += ALIGN(ANV_SURFACE_STATE_SIZE + ANV_SAMPLER_STATE_SIZE, + ANV_SURFACE_STATE_SIZE); + } } - return size; + *out_surface_size = surface_size; + *out_sampler_size = sampler_size; } static bool anv_needs_descriptor_buffer(VkDescriptorType desc_type, + enum anv_descriptor_set_layout_type layout_type, enum anv_descriptor_data desc_data) { - if (desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK || - anv_descriptor_data_size(desc_data) > 0) + if (desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) return true; - return false; + + uint16_t surface_size, sampler_size; + anv_descriptor_data_size(desc_data, layout_type, + &surface_size, &sampler_size); + return surface_size > 0 || sampler_size > 0; } /** Returns the size in bytes of each descriptor with the given layout */ -static unsigned -anv_descriptor_size(const struct anv_descriptor_set_binding_layout *layout) +static void +anv_descriptor_size(const struct anv_descriptor_set_binding_layout *layout, + enum anv_descriptor_set_layout_type layout_type, + uint16_t *out_surface_stride, + uint16_t *out_sampler_stride) { if (layout->data & ANV_DESCRIPTOR_INLINE_UNIFORM) { assert(layout->data == ANV_DESCRIPTOR_INLINE_UNIFORM); - return layout->array_size; + 
assert(layout->array_size <= UINT16_MAX); + *out_surface_stride = layout->array_size; + *out_sampler_stride = 0; + return; } - unsigned size = anv_descriptor_data_size(layout->data); - - /* For multi-planar bindings, we make every descriptor consume the maximum - * number of planes so we don't have to bother with walking arrays and - * adding things up every time. Fortunately, YCbCr samplers aren't all - * that common and likely won't be in the middle of big arrays. - */ - if (layout->max_plane_count > 1) - size *= layout->max_plane_count; - - return size; + anv_descriptor_data_size(layout->data, layout_type, + out_surface_stride, + out_sampler_stride); } /** Returns size in bytes of the biggest descriptor in the given layout */ -static unsigned +static void anv_descriptor_size_for_mutable_type(const struct anv_physical_device *device, + enum anv_descriptor_set_layout_type layout_type, const VkMutableDescriptorTypeCreateInfoEXT *mutable_info, - int binding) + int binding, + uint16_t *out_surface_stride, + uint16_t *out_sampler_stride) { - unsigned size = 0; + *out_surface_stride = 0; + *out_sampler_stride = 0; if (!mutable_info || mutable_info->mutableDescriptorTypeListCount == 0 || @@ -291,26 +337,41 @@ anv_descriptor_size_for_mutable_type(const struct anv_physical_device *device, continue; enum anv_descriptor_data desc_data = - anv_descriptor_data_for_type(device, i); - size = MAX2(size, anv_descriptor_data_size(desc_data)); + anv_descriptor_data_for_type(device, layout_type, i); + uint16_t surface_stride, sampler_stride; + anv_descriptor_data_size(desc_data, layout_type, + &surface_stride, &sampler_stride); + + *out_surface_stride = MAX2(*out_surface_stride, surface_stride); + *out_sampler_stride = MAX2(*out_sampler_stride, sampler_stride); } enum anv_descriptor_data desc_data = anv_descriptor_data_for_type( - device, VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR); - size = MAX2(size, anv_descriptor_data_size(desc_data)); + device, layout_type, VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR); + uint16_t surface_stride, sampler_stride; + anv_descriptor_data_size(desc_data, layout_type, + &surface_stride, &sampler_stride); - return size; + *out_surface_stride = MAX2(*out_surface_stride, surface_stride); + *out_sampler_stride = MAX2(*out_sampler_stride, sampler_stride); + + return; } const VkMutableDescriptorTypeListEXT *type_list = &mutable_info->pMutableDescriptorTypeLists[binding]; for (uint32_t i = 0; i < type_list->descriptorTypeCount; i++) { enum anv_descriptor_data desc_data = - anv_descriptor_data_for_type(device, type_list->pDescriptorTypes[i]); - size = MAX2(size, anv_descriptor_data_size(desc_data)); - } + anv_descriptor_data_for_type(device, layout_type, + type_list->pDescriptorTypes[i]); - return size; + uint16_t surface_stride, sampler_stride; + anv_descriptor_data_size(desc_data, layout_type, + &surface_stride, &sampler_stride); + + *out_surface_stride = MAX2(*out_surface_stride, surface_stride); + *out_sampler_stride = MAX2(*out_sampler_stride, sampler_stride); + } } static bool @@ -353,6 +414,18 @@ anv_descriptor_requires_bindless(const struct anv_physical_device *pdevice, return (binding->flags & flags_requiring_bindless) != 0; } +static enum anv_descriptor_set_layout_type +anv_descriptor_set_layout_type_for_flags(const struct anv_physical_device *device, + const VkDescriptorSetLayoutCreateInfo *pCreateInfo) +{ + if (pCreateInfo->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT) + return ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER; + else if 
(device->indirect_descriptors) + return ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT; + else + return ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT; +} + void anv_GetDescriptorSetLayoutSupport( VkDevice _device, const VkDescriptorSetLayoutCreateInfo* pCreateInfo, @@ -372,6 +445,9 @@ void anv_GetDescriptorSetLayoutSupport( vk_find_struct_const(pCreateInfo->pNext, MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_EXT); + enum anv_descriptor_set_layout_type layout_type = + anv_descriptor_set_layout_type_for_flags(pdevice, pCreateInfo); + for (uint32_t b = 0; b < pCreateInfo->bindingCount; b++) { const VkDescriptorSetLayoutBinding *binding = &pCreateInfo->pBindings[b]; @@ -383,10 +459,11 @@ void anv_GetDescriptorSetLayoutSupport( enum anv_descriptor_data desc_data = binding->descriptorType == VK_DESCRIPTOR_TYPE_MUTABLE_EXT ? - anv_descriptor_data_for_mutable_type(pdevice, mutable_info, b) : - anv_descriptor_data_for_type(pdevice, binding->descriptorType); + anv_descriptor_data_for_mutable_type(pdevice, layout_type, mutable_info, b) : + anv_descriptor_data_for_type(pdevice, layout_type, binding->descriptorType); - if (anv_needs_descriptor_buffer(binding->descriptorType, desc_data)) + if (anv_needs_descriptor_buffer(binding->descriptorType, + layout_type, desc_data)) needs_descriptor_buffer = true; if (flags & VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT) @@ -509,13 +586,8 @@ VkResult anv_CreateDescriptorSetLayout( set_layout->ref_cnt = 1; set_layout->binding_count = num_bindings; set_layout->flags = pCreateInfo->flags; - - if (pCreateInfo->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT) - set_layout->type = ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER; - else if (device->physical->indirect_descriptors) - set_layout->type = ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT; - else - set_layout->type = ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT; + set_layout->type = anv_descriptor_set_layout_type_for_flags(device->physical, + pCreateInfo); for (uint32_t b = 0; b < num_bindings; b++) { /* Initialize all binding_layout entries to -1 */ @@ -533,7 +605,8 @@ VkResult anv_CreateDescriptorSetLayout( uint32_t buffer_view_count = 0; uint32_t dynamic_offset_count = 0; - uint32_t descriptor_buffer_size = 0; + uint32_t descriptor_buffer_surface_size = 0; + uint32_t descriptor_buffer_sampler_size = 0; for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) { const VkDescriptorSetLayoutBinding *binding = &pCreateInfo->pBindings[j]; @@ -597,8 +670,11 @@ VkResult anv_CreateDescriptorSetLayout( set_layout->binding[b].data = binding->descriptorType == VK_DESCRIPTOR_TYPE_MUTABLE_EXT ? - anv_descriptor_data_for_mutable_type(device->physical, mutable_info, b) : - anv_descriptor_data_for_type(device->physical, binding->descriptorType); + anv_descriptor_data_for_mutable_type(device->physical, + set_layout->type, + mutable_info, b) : + anv_descriptor_data_for_type(device->physical, set_layout->type, + binding->descriptorType); set_layout->binding[b].array_size = binding->descriptorCount; set_layout->binding[b].descriptor_index = set_layout->descriptor_count; @@ -650,32 +726,64 @@ VkResult anv_CreateDescriptorSetLayout( break; } - set_layout->binding[b].descriptor_data_size = - anv_descriptor_data_size(set_layout->binding[b].data); - set_layout->binding[b].descriptor_stride = - binding->descriptorType == VK_DESCRIPTOR_TYPE_MUTABLE_EXT ? 
- anv_descriptor_size_for_mutable_type(device->physical, mutable_info, b) : - anv_descriptor_size(&set_layout->binding[b]); + if (binding->descriptorType == VK_DESCRIPTOR_TYPE_MUTABLE_EXT) { + anv_descriptor_size_for_mutable_type( + device->physical, set_layout->type, mutable_info, b, + &set_layout->binding[b].descriptor_data_surface_size, + &set_layout->binding[b].descriptor_data_sampler_size); + } else { + anv_descriptor_size(&set_layout->binding[b], + set_layout->type, + &set_layout->binding[b].descriptor_data_surface_size, + &set_layout->binding[b].descriptor_data_sampler_size); + } - descriptor_buffer_size = - align(descriptor_buffer_size, - anv_descriptor_data_alignment(set_layout->binding[b].data)); + /* For multi-planar bindings, we make every descriptor consume the maximum + * number of planes so we don't have to bother with walking arrays and + * adding things up every time. Fortunately, YCbCr samplers aren't all + * that common and likely won't be in the middle of big arrays. + */ + set_layout->binding[b].descriptor_surface_stride = + MAX2(set_layout->binding[b].max_plane_count, 1) * + set_layout->binding[b].descriptor_data_surface_size; + set_layout->binding[b].descriptor_sampler_stride = + MAX2(set_layout->binding[b].max_plane_count, 1) * + set_layout->binding[b].descriptor_data_sampler_size; + + unsigned surface_align, sampler_align; + anv_descriptor_data_alignment(set_layout->binding[b].data, + set_layout->type, + &surface_align, + &sampler_align); + descriptor_buffer_surface_size = + align(descriptor_buffer_surface_size, surface_align); + descriptor_buffer_sampler_size = + align(descriptor_buffer_sampler_size, sampler_align); if (binding->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { - set_layout->binding[b].descriptor_offset = descriptor_buffer_size; - descriptor_buffer_size += binding->descriptorCount; + set_layout->binding[b].descriptor_surface_offset = descriptor_buffer_surface_size; + descriptor_buffer_surface_size += binding->descriptorCount; } else { - set_layout->binding[b].descriptor_offset = descriptor_buffer_size; - descriptor_buffer_size += - set_layout->binding[b].descriptor_stride * binding->descriptorCount; + set_layout->binding[b].descriptor_surface_offset = descriptor_buffer_surface_size; + descriptor_buffer_surface_size += + set_layout->binding[b].descriptor_surface_stride * binding->descriptorCount; } + set_layout->binding[b].descriptor_sampler_offset = descriptor_buffer_sampler_size; + descriptor_buffer_sampler_size += + set_layout->binding[b].descriptor_sampler_stride * binding->descriptorCount; + set_layout->shader_stages |= binding->stageFlags; } + /* Sanity checks */ + assert(descriptor_buffer_sampler_size == 0 || + set_layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT); + set_layout->buffer_view_count = buffer_view_count; set_layout->dynamic_offset_count = dynamic_offset_count; - set_layout->descriptor_buffer_size = descriptor_buffer_size; + set_layout->descriptor_buffer_surface_size = descriptor_buffer_surface_size; + set_layout->descriptor_buffer_sampler_size = descriptor_buffer_sampler_size; *pSetLayout = anv_descriptor_set_layout_to_handle(set_layout); @@ -746,30 +854,44 @@ anv_descriptor_set_layout_empty(const struct anv_descriptor_set_layout *set_layo return set_layout->binding_count == 0; } -static uint32_t +static void anv_descriptor_set_layout_descriptor_buffer_size(const struct anv_descriptor_set_layout *set_layout, - uint32_t var_desc_count) + uint32_t var_desc_count, + uint32_t *out_surface_size, + uint32_t 
*out_sampler_size) { const struct anv_descriptor_set_binding_layout *dynamic_binding = set_layout_dynamic_binding(set_layout); - if (dynamic_binding == NULL) - return ALIGN(set_layout->descriptor_buffer_size, ANV_UBO_ALIGNMENT); + if (dynamic_binding == NULL) { + *out_surface_size = ALIGN(set_layout->descriptor_buffer_surface_size, + ANV_UBO_ALIGNMENT); + *out_sampler_size = set_layout->descriptor_buffer_sampler_size; + return; + } assert(var_desc_count <= dynamic_binding->array_size); uint32_t shrink = dynamic_binding->array_size - var_desc_count; - uint32_t set_size; + uint32_t set_surface_size, set_sampler_size; if (dynamic_binding->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { /* Inline uniform blocks are specified to use the descriptor array * size as the size in bytes of the block. */ - set_size = set_layout->descriptor_buffer_size - shrink; + set_surface_size = set_layout->descriptor_buffer_surface_size - shrink; + set_sampler_size = 0; } else { - set_size = set_layout->descriptor_buffer_size - - shrink * dynamic_binding->descriptor_stride; + set_surface_size = + set_layout->descriptor_buffer_surface_size > 0 ? + (set_layout->descriptor_buffer_surface_size - + shrink * dynamic_binding->descriptor_surface_stride) : 0; + set_sampler_size = + set_layout->descriptor_buffer_sampler_size > 0 ? + (set_layout->descriptor_buffer_sampler_size - + shrink * dynamic_binding->descriptor_sampler_stride) : 0; } - return ALIGN(set_size, ANV_UBO_ALIGNMENT); + *out_surface_size = ALIGN(set_surface_size, ANV_UBO_ALIGNMENT); + *out_sampler_size = set_sampler_size; } void anv_DestroyDescriptorSetLayout( @@ -791,11 +913,15 @@ anv_descriptor_set_layout_print(const struct anv_descriptor_set_layout *layout) { fprintf(stderr, "set layout:\n"); for (uint32_t b = 0; b < layout->binding_count; b++) { - fprintf(stderr, " binding%03u: offset=0x%08x stride=%03u size=%03u count=%03u\n", + fprintf(stderr, " binding%03u: offsets=0x%08x/0x%08x sizes=%04u/%04u strides=%03u/%03u planes=%hhu count=%03u\n", b, - layout->binding[b].descriptor_offset, - layout->binding[b].descriptor_data_size, - layout->binding[b].descriptor_stride, + layout->binding[b].descriptor_surface_offset, + layout->binding[b].descriptor_sampler_offset, + layout->binding[b].descriptor_data_surface_size, + layout->binding[b].descriptor_data_sampler_size, + layout->binding[b].descriptor_surface_stride, + layout->binding[b].descriptor_sampler_stride, + layout->binding[b].max_plane_count, layout->binding[b].array_size); } } @@ -824,7 +950,8 @@ sha1_update_descriptor_set_binding_layout(struct mesa_sha1 *ctx, SHA1_UPDATE_VALUE(ctx, layout->descriptor_index); SHA1_UPDATE_VALUE(ctx, layout->dynamic_offset_index); SHA1_UPDATE_VALUE(ctx, layout->buffer_view_index); - SHA1_UPDATE_VALUE(ctx, layout->descriptor_offset); + SHA1_UPDATE_VALUE(ctx, layout->descriptor_surface_offset); + SHA1_UPDATE_VALUE(ctx, layout->descriptor_sampler_offset); if (layout->immutable_samplers) { for (uint16_t i = 0; i < layout->array_size; i++) @@ -842,7 +969,8 @@ sha1_update_descriptor_set_layout(struct mesa_sha1 *ctx, SHA1_UPDATE_VALUE(ctx, layout->shader_stages); SHA1_UPDATE_VALUE(ctx, layout->buffer_view_count); SHA1_UPDATE_VALUE(ctx, layout->dynamic_offset_count); - SHA1_UPDATE_VALUE(ctx, layout->descriptor_buffer_size); + SHA1_UPDATE_VALUE(ctx, layout->descriptor_buffer_surface_size); + SHA1_UPDATE_VALUE(ctx, layout->descriptor_buffer_sampler_size); for (uint16_t i = 0; i < layout->binding_count; i++) sha1_update_descriptor_set_binding_layout(ctx, &layout->binding[i]); @@ 
-1022,6 +1150,108 @@ void anv_DestroyPipelineLayout( #define EMPTY 1 +static VkResult +anv_descriptor_pool_heap_init(struct anv_device *device, + struct anv_descriptor_pool_heap *heap, + uint32_t size, + bool host_only, + bool samplers) +{ + if (size == 0) + return VK_SUCCESS; + + if (host_only) { + heap->size = size; + heap->host_mem = vk_zalloc(&device->vk.alloc, size, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (heap->host_mem == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + } else { + const char *bo_name = + device->physical->indirect_descriptors ? "indirect descriptors" : + samplers ? "direct sampler" : "direct surfaces"; + + heap->size = align(size, 4096); + + VkResult result = anv_device_alloc_bo(device, + bo_name, heap->size, + ANV_BO_ALLOC_CAPTURE | + ANV_BO_ALLOC_MAPPED | + ANV_BO_ALLOC_HOST_CACHED_COHERENT | + (samplers ? + ANV_BO_ALLOC_SAMPLER_POOL : + ANV_BO_ALLOC_DESCRIPTOR_POOL), + 0 /* explicit_address */, + &heap->bo); + if (result != VK_SUCCESS) + return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); + } + + util_vma_heap_init(&heap->heap, POOL_HEAP_OFFSET, heap->size); + + return VK_SUCCESS; +} + +static void +anv_descriptor_pool_heap_fini(struct anv_device *device, + struct anv_descriptor_pool_heap *heap) +{ + if (heap->size == 0) + return; + + util_vma_heap_finish(&heap->heap); + + if (heap->bo) + anv_device_release_bo(device, heap->bo); + + if (heap->host_mem) + vk_free(&device->vk.alloc, heap->host_mem); +} + +static void +anv_descriptor_pool_heap_reset(struct anv_device *device, + struct anv_descriptor_pool_heap *heap) +{ + if (heap->size == 0) + return; + + util_vma_heap_finish(&heap->heap); + util_vma_heap_init(&heap->heap, POOL_HEAP_OFFSET, heap->size); +} + +static VkResult +anv_descriptor_pool_heap_alloc(struct anv_descriptor_pool *pool, + struct anv_descriptor_pool_heap *heap, + uint32_t size, uint32_t alignment, + struct anv_state *state) +{ + uint64_t pool_vma_offset = + util_vma_heap_alloc(&heap->heap, size, alignment); + if (pool_vma_offset == 0) + return vk_error(pool, VK_ERROR_FRAGMENTED_POOL); + + assert(pool_vma_offset >= POOL_HEAP_OFFSET && + pool_vma_offset - POOL_HEAP_OFFSET <= INT32_MAX); + + state->offset = pool_vma_offset - POOL_HEAP_OFFSET; + state->alloc_size = size; + if (heap->host_mem) + state->map = heap->host_mem + state->offset; + else + state->map = heap->bo->map + state->offset; + + return VK_SUCCESS; +} + +static void +anv_descriptor_pool_heap_free(struct anv_descriptor_pool_heap *heap, + struct anv_state state) +{ + util_vma_heap_free(&heap->heap, + (uint64_t)state.offset + POOL_HEAP_OFFSET, + state.alloc_size); +} + VkResult anv_CreateDescriptorPool( VkDevice _device, const VkDescriptorPoolCreateInfo* pCreateInfo, @@ -1040,30 +1270,47 @@ VkResult anv_CreateDescriptorPool( uint32_t descriptor_count = 0; uint32_t buffer_view_count = 0; - uint32_t descriptor_bo_size = 0; + uint32_t descriptor_bo_surface_size = 0; + uint32_t descriptor_bo_sampler_size = 0; + + const enum anv_descriptor_set_layout_type layout_type = + device->physical->indirect_descriptors ? + ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT : + ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT; for (uint32_t i = 0; i < pCreateInfo->poolSizeCount; i++) { enum anv_descriptor_data desc_data = pCreateInfo->pPoolSizes[i].type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT ? 
- anv_descriptor_data_for_mutable_type(device->physical, mutable_info, i) : - anv_descriptor_data_for_type(device->physical, pCreateInfo->pPoolSizes[i].type); + anv_descriptor_data_for_mutable_type(device->physical, layout_type, + mutable_info, i) : + anv_descriptor_data_for_type(device->physical, layout_type, + pCreateInfo->pPoolSizes[i].type); if (desc_data & ANV_DESCRIPTOR_BUFFER_VIEW) buffer_view_count += pCreateInfo->pPoolSizes[i].descriptorCount; - unsigned desc_data_size = - pCreateInfo->pPoolSizes[i].type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT ? - anv_descriptor_size_for_mutable_type(device->physical, mutable_info, i) : - anv_descriptor_data_size(desc_data); + uint16_t desc_surface_size, desc_sampler_size; + if (pCreateInfo->pPoolSizes[i].type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT) { + anv_descriptor_size_for_mutable_type(device->physical, layout_type, mutable_info, i, + &desc_surface_size, &desc_sampler_size); + } else { + anv_descriptor_data_size(desc_data, layout_type, + &desc_surface_size, &desc_sampler_size); + } - desc_data_size *= pCreateInfo->pPoolSizes[i].descriptorCount; + uint32_t desc_data_surface_size = + desc_surface_size * pCreateInfo->pPoolSizes[i].descriptorCount; + uint32_t desc_data_sampler_size = + desc_sampler_size * pCreateInfo->pPoolSizes[i].descriptorCount; /* Combined image sampler descriptors can take up to 3 slots if they * hold a YCbCr image. */ if (pCreateInfo->pPoolSizes[i].type == - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) - desc_data_size *= 3; + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) { + desc_data_surface_size *= 3; + desc_data_sampler_size *= 3; + } if (pCreateInfo->pPoolSizes[i].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { @@ -1071,10 +1318,11 @@ VkResult anv_CreateDescriptorPool( * size as the size in bytes of the block. */ assert(inline_info); - desc_data_size += pCreateInfo->pPoolSizes[i].descriptorCount; + desc_data_surface_size += pCreateInfo->pPoolSizes[i].descriptorCount; } - descriptor_bo_size += desc_data_size; + descriptor_bo_surface_size += desc_data_surface_size; + descriptor_bo_sampler_size += desc_data_sampler_size; descriptor_count += pCreateInfo->pPoolSizes[i].descriptorCount; } @@ -1086,13 +1334,12 @@ VkResult anv_CreateDescriptorPool( * extra space that we can chop it into maxSets pieces and align each one * of them to 32B. */ - descriptor_bo_size += ANV_UBO_ALIGNMENT * pCreateInfo->maxSets; + descriptor_bo_surface_size += ANV_UBO_ALIGNMENT * pCreateInfo->maxSets; /* We align inline uniform blocks to ANV_UBO_ALIGNMENT */ if (inline_info) { - descriptor_bo_size += + descriptor_bo_surface_size += ANV_UBO_ALIGNMENT * inline_info->maxInlineUniformBlockBindings; } - descriptor_bo_size = ALIGN(descriptor_bo_size, 4096); const bool host_only = pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_HOST_ONLY_BIT_EXT; @@ -1113,38 +1360,30 @@ VkResult anv_CreateDescriptorPool( if (!pool) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - pool->bo_mem_size = descriptor_bo_size; pool->host_mem_size = host_mem_size; util_vma_heap_init(&pool->host_heap, POOL_HEAP_OFFSET, host_mem_size); pool->host_only = host_only; - if (pool->bo_mem_size > 0) { - if (pool->host_only) { - pool->host_bo = vk_zalloc(&device->vk.alloc, pool->bo_mem_size, 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - if (pool->host_bo == NULL) { - vk_object_free(&device->vk, pAllocator, pool); - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - } - } else { - VkResult result = anv_device_alloc_bo(device, - device->physical->indirect_descriptors ? 
- "indirect descriptors" : - "direct descriptors", - descriptor_bo_size, - ANV_BO_ALLOC_CAPTURE | - ANV_BO_ALLOC_MAPPED | - ANV_BO_ALLOC_HOST_CACHED_COHERENT | - ANV_BO_ALLOC_DESCRIPTOR_POOL, - 0 /* explicit_address */, - &pool->bo); - if (result != VK_SUCCESS) { - vk_object_free(&device->vk, pAllocator, pool); - return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); - } - } - util_vma_heap_init(&pool->bo_heap, POOL_HEAP_OFFSET, pool->bo_mem_size); + VkResult result = anv_descriptor_pool_heap_init(device, + &pool->surfaces, + descriptor_bo_surface_size, + pool->host_only, + false /* samplers */); + if (result != VK_SUCCESS) { + vk_object_free(&device->vk, pAllocator, pool); + return result; + } + + result = anv_descriptor_pool_heap_init(device, + &pool->samplers, + descriptor_bo_sampler_size, + pool->host_only, + true /* samplers */); + if (result != VK_SUCCESS) { + anv_descriptor_pool_heap_fini(device, &pool->surfaces); + vk_object_free(&device->vk, pAllocator, pool); + return result; } /* All the surface states allocated by the descriptor pool are internal. We @@ -1180,15 +1419,11 @@ void anv_DestroyDescriptorPool( util_vma_heap_finish(&pool->host_heap); - if (pool->bo_mem_size) { - if (pool->host_bo) - vk_free(&device->vk.alloc, pool->host_bo); - if (pool->bo) - anv_device_release_bo(device, pool->bo); - util_vma_heap_finish(&pool->bo_heap); - } anv_state_stream_finish(&pool->surface_state_stream); + anv_descriptor_pool_heap_fini(device, &pool->surfaces); + anv_descriptor_pool_heap_fini(device, &pool->samplers); + vk_object_free(&device->vk, pAllocator, pool); } @@ -1209,10 +1444,8 @@ VkResult anv_ResetDescriptorPool( util_vma_heap_finish(&pool->host_heap); util_vma_heap_init(&pool->host_heap, POOL_HEAP_OFFSET, pool->host_mem_size); - if (pool->bo_mem_size) { - util_vma_heap_finish(&pool->bo_heap); - util_vma_heap_init(&pool->bo_heap, POOL_HEAP_OFFSET, pool->bo_mem_size); - } + anv_descriptor_pool_heap_reset(device, &pool->surfaces); + anv_descriptor_pool_heap_reset(device, &pool->samplers); anv_state_stream_finish(&pool->surface_state_stream); anv_state_stream_init(&pool->surface_state_stream, @@ -1324,35 +1557,29 @@ anv_descriptor_set_create(struct anv_device *device, if (result != VK_SUCCESS) return result; - uint32_t descriptor_buffer_size = - anv_descriptor_set_layout_descriptor_buffer_size(layout, var_desc_count); + uint32_t descriptor_buffer_surface_size, descriptor_buffer_sampler_size; + anv_descriptor_set_layout_descriptor_buffer_size(layout, var_desc_count, + &descriptor_buffer_surface_size, + &descriptor_buffer_sampler_size); set->desc_surface_state = ANV_STATE_NULL; set->is_push = false; - if (descriptor_buffer_size) { - uint64_t pool_vma_offset = - util_vma_heap_alloc(&pool->bo_heap, descriptor_buffer_size, - ANV_UBO_ALIGNMENT); - if (pool_vma_offset == 0) { + if (descriptor_buffer_surface_size) { + result = anv_descriptor_pool_heap_alloc(pool, &pool->surfaces, + descriptor_buffer_surface_size, + ANV_UBO_ALIGNMENT, + &set->desc_surface_mem); + if (result != VK_SUCCESS) { anv_descriptor_pool_free_set(pool, set); - return vk_error(pool, VK_ERROR_FRAGMENTED_POOL); + return result; } - assert(pool_vma_offset >= POOL_HEAP_OFFSET && - pool_vma_offset - POOL_HEAP_OFFSET <= INT32_MAX); - set->desc_mem.offset = pool_vma_offset - POOL_HEAP_OFFSET; - set->desc_mem.alloc_size = descriptor_buffer_size; - if (pool->host_only) - set->desc_mem.map = pool->host_bo + set->desc_mem.offset; - else - set->desc_mem.map = pool->bo->map + set->desc_mem.offset; - - set->desc_addr = (struct 
anv_address) { - .bo = pool->bo, - .offset = set->desc_mem.offset, + set->desc_surface_addr = (struct anv_address) { + .bo = pool->surfaces.bo, + .offset = set->desc_surface_mem.offset, }; - set->desc_offset = anv_address_physical(set->desc_addr) - + set->desc_offset = anv_address_physical(set->desc_surface_addr) - device->physical->va.internal_surface_state_pool.addr; enum isl_format format = @@ -1369,12 +1596,31 @@ anv_descriptor_set_create(struct anv_device *device, anv_fill_buffer_surface_state(device, set->desc_surface_state.map, format, ISL_SWIZZLE_IDENTITY, ISL_SURF_USAGE_CONSTANT_BUFFER_BIT, - set->desc_addr, - descriptor_buffer_size, 1); + set->desc_surface_addr, + descriptor_buffer_surface_size, 1); } } else { - set->desc_mem = ANV_STATE_NULL; - set->desc_addr = (struct anv_address) { .bo = NULL, .offset = 0 }; + set->desc_surface_mem = ANV_STATE_NULL; + set->desc_surface_addr = ANV_NULL_ADDRESS; + } + + if (descriptor_buffer_sampler_size) { + result = anv_descriptor_pool_heap_alloc(pool, &pool->samplers, + descriptor_buffer_sampler_size, + ANV_SAMPLER_STATE_SIZE, + &set->desc_sampler_mem); + if (result != VK_SUCCESS) { + anv_descriptor_pool_free_set(pool, set); + return result; + } + + set->desc_sampler_addr = (struct anv_address) { + .bo = pool->samplers.bo, + .offset = set->desc_sampler_mem.offset, + }; + } else { + set->desc_sampler_mem = ANV_STATE_NULL; + set->desc_sampler_addr = ANV_NULL_ADDRESS; } vk_object_base_init(&device->vk, &set->base, @@ -1456,14 +1702,15 @@ anv_descriptor_set_destroy(struct anv_device *device, { anv_descriptor_set_layout_unref(device, set->layout); - if (set->desc_mem.alloc_size) { - util_vma_heap_free(&pool->bo_heap, - (uint64_t)set->desc_mem.offset + POOL_HEAP_OFFSET, - set->desc_mem.alloc_size); + if (set->desc_surface_mem.alloc_size) { + anv_descriptor_pool_heap_free(&pool->surfaces, set->desc_surface_mem); if (set->desc_surface_state.alloc_size) anv_descriptor_pool_free_state(pool, set->desc_surface_state); } + if (set->desc_sampler_mem.alloc_size) + anv_descriptor_pool_heap_free(&pool->samplers, set->desc_sampler_mem); + if (device->physical->indirect_descriptors) { if (!pool->host_only) { for (uint32_t b = 0; b < set->buffer_view_count; b++) { @@ -1582,9 +1829,9 @@ anv_push_descriptor_set_init(struct anv_cmd_buffer *cmd_buffer, set->buffer_view_count = layout->buffer_view_count; set->descriptor_count = layout->descriptor_count; - if (layout->descriptor_buffer_size && + if (layout->descriptor_buffer_surface_size && (push_set->set_used_on_gpu || - set->desc_mem.alloc_size < layout->descriptor_buffer_size)) { + set->desc_surface_mem.alloc_size < layout->descriptor_buffer_surface_size)) { struct anv_physical_device *pdevice = cmd_buffer->device->physical; struct anv_state_stream *push_stream = pdevice->indirect_descriptors ? @@ -1594,30 +1841,67 @@ anv_push_descriptor_set_init(struct anv_cmd_buffer *cmd_buffer, pdevice->va.indirect_push_descriptor_pool.addr : pdevice->va.internal_surface_state_pool.addr; + uint32_t surface_size, sampler_size; + anv_descriptor_set_layout_descriptor_buffer_size(layout, 0, + &surface_size, + &sampler_size); + /* The previous buffer is either actively used by some GPU command (so * we can't modify it) or is too small. Allocate a new one. 
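 *
 * (An illustrative reduction of this grow-and-copy pattern, assuming a
 * state-stream allocation cannot be resized in place; the names below
 * are the ones used in this function:
 *
 *    struct anv_state new_mem =
 *       anv_state_stream_alloc(push_stream, needed_size, alignment);
 *    if (set->desc_surface_mem.alloc_size) {
 *       memcpy(new_mem.map, set->desc_surface_mem.map,
 *              MIN2(new_mem.alloc_size, set->desc_surface_mem.alloc_size));
 *    }
 *    set->desc_surface_mem = new_mem;
 * )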
*/ - struct anv_state desc_mem = - anv_state_stream_alloc(push_stream, - anv_descriptor_set_layout_descriptor_buffer_size(layout, 0), - ANV_UBO_ALIGNMENT); - if (desc_mem.map == NULL) + struct anv_state desc_surface_mem = + anv_state_stream_alloc(push_stream, surface_size, ANV_UBO_ALIGNMENT); + if (desc_surface_mem.map == NULL) return false; - if (set->desc_mem.alloc_size) { + if (set->desc_surface_mem.alloc_size) { /* TODO: Do we really need to copy all the time? */ - memcpy(desc_mem.map, set->desc_mem.map, - MIN2(desc_mem.alloc_size, set->desc_mem.alloc_size)); + memcpy(desc_surface_mem.map, set->desc_surface_mem.map, + MIN2(desc_surface_mem.alloc_size, + set->desc_surface_mem.alloc_size)); } - set->desc_mem = desc_mem; + set->desc_surface_mem = desc_surface_mem; - set->desc_addr = anv_state_pool_state_address( + set->desc_surface_addr = anv_state_pool_state_address( push_stream->state_pool, - set->desc_mem); - set->desc_offset = anv_address_physical(set->desc_addr) - + set->desc_surface_mem); + set->desc_offset = anv_address_physical(set->desc_surface_addr) - push_base_address; } + if (layout->descriptor_buffer_sampler_size && + (push_set->set_used_on_gpu || + set->desc_sampler_mem.alloc_size < layout->descriptor_buffer_sampler_size)) { + struct anv_physical_device *pdevice = cmd_buffer->device->physical; + assert(!pdevice->indirect_descriptors); + struct anv_state_stream *push_stream = &cmd_buffer->dynamic_state_stream; + + uint32_t surface_size, sampler_size; + anv_descriptor_set_layout_descriptor_buffer_size(layout, 0, + &surface_size, + &sampler_size); + + /* The previous buffer is either actively used by some GPU command (so + * we can't modify it) or is too small. Allocate a new one. + */ + struct anv_state desc_sampler_mem = + anv_state_stream_alloc(push_stream, sampler_size, ANV_SAMPLER_STATE_SIZE); + if (desc_sampler_mem.map == NULL) + return false; + + if (set->desc_sampler_mem.alloc_size) { + /* TODO: Do we really need to copy all the time? */ + memcpy(desc_sampler_mem.map, set->desc_sampler_mem.map, + MIN2(desc_sampler_mem.alloc_size, + set->desc_sampler_mem.alloc_size)); + } + set->desc_sampler_mem = desc_sampler_mem; + + set->desc_sampler_addr = anv_state_pool_state_address( + push_stream->state_pool, + set->desc_sampler_mem); + } + return true; } @@ -1723,12 +2007,16 @@ anv_descriptor_set_write_image_view(struct anv_device *device, .sampler = sampler, }; - void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset + - element * bind_layout->descriptor_stride; + void *desc_surface_map = set->desc_surface_mem.map + + bind_layout->descriptor_surface_offset + + element * bind_layout->descriptor_surface_stride; + void *desc_sampler_map = set->desc_sampler_mem.map + + bind_layout->descriptor_sampler_offset + + element * bind_layout->descriptor_sampler_stride; enum anv_descriptor_data data = bind_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT ? - anv_descriptor_data_for_type(device->physical, type) : + anv_descriptor_data_for_type(device->physical, set->layout->type, type) : bind_layout->data; if (data & ANV_DESCRIPTOR_INDIRECT_SAMPLED_IMAGE) { @@ -1755,7 +2043,7 @@ anv_descriptor_set_write_image_view(struct anv_device *device, * can be no more than the size of our array of handles. 
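 *
 * (Illustrative: with the split heaps, every write in this function
 * targets two maps, each derived from its own per-heap offset/stride
 * pair computed earlier in this function:
 *
 *    void *surf = set->desc_surface_mem.map +
 *                 bind_layout->descriptor_surface_offset +
 *                 element * bind_layout->descriptor_surface_stride;
 *    void *samp = set->desc_sampler_mem.map +
 *                 bind_layout->descriptor_sampler_offset +
 *                 element * bind_layout->descriptor_sampler_stride;
 * )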
*/ assert(bind_layout->max_plane_count <= ARRAY_SIZE(desc_data)); - memcpy(desc_map, desc_data, + memcpy(desc_surface_map, desc_data, MAX2(1, bind_layout->max_plane_count) * sizeof(desc_data[0])); } @@ -1768,20 +2056,20 @@ anv_descriptor_set_write_image_view(struct anv_device *device, image_view->planes[0].storage.state), .image_depth = image_view->vk.storage.z_slice_count, }; - memcpy(desc_map, &desc_data, sizeof(desc_data)); + memcpy(desc_surface_map, &desc_data, sizeof(desc_data)); } else { - memset(desc_map, 0, bind_layout->descriptor_stride); + memset(desc_surface_map, 0, bind_layout->descriptor_surface_stride); } } if (data & ANV_DESCRIPTOR_SAMPLER) { if (sampler) { for (unsigned p = 0; p < sampler->n_planes; p++) { - memcpy(desc_map + p * ANV_SAMPLER_STATE_SIZE, + memcpy(desc_sampler_map + p * ANV_SAMPLER_STATE_SIZE, sampler->state[p], ANV_SAMPLER_STATE_SIZE); } } else { - memset(desc_map, 0, bind_layout->descriptor_stride); + memset(desc_sampler_map, 0, bind_layout->descriptor_sampler_stride); } } @@ -1789,7 +2077,7 @@ anv_descriptor_set_write_image_view(struct anv_device *device, unsigned max_plane_count = image_view ? image_view->n_planes : 1; for (unsigned p = 0; p < max_plane_count; p++) { - void *plane_map = desc_map + p * ANV_SURFACE_STATE_SIZE; + void *plane_map = desc_surface_map + p * ANV_SURFACE_STATE_SIZE; if (image_view) { memcpy(plane_map, @@ -1808,7 +2096,7 @@ anv_descriptor_set_write_image_view(struct anv_device *device, sampler ? sampler->n_planes : 1); for (unsigned p = 0; p < max_plane_count; p++) { - void *plane_map = desc_map + p * 2 * ANV_SURFACE_STATE_SIZE; + void *plane_map = desc_surface_map + p * 2 * ANV_SURFACE_STATE_SIZE; if (image_view) { memcpy(plane_map, @@ -1866,17 +2154,18 @@ anv_descriptor_set_write_buffer_view(struct anv_device *device, enum anv_descriptor_data data = bind_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT ? - anv_descriptor_data_for_type(device->physical, type) : + anv_descriptor_data_for_type(device->physical, set->layout->type, type) : bind_layout->data; - void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset + - element * bind_layout->descriptor_stride; + void *desc_map = set->desc_surface_mem.map + + bind_layout->descriptor_surface_offset + + element * bind_layout->descriptor_surface_stride; if (buffer_view == NULL) { if (data & ANV_DESCRIPTOR_SURFACE) memcpy(desc_map, device->null_surface_state.map, ANV_SURFACE_STATE_SIZE); else - memset(desc_map, 0, bind_layout->descriptor_stride); + memset(desc_map, 0, bind_layout->descriptor_surface_stride); return; } @@ -1954,17 +2243,18 @@ anv_descriptor_set_write_buffer(struct anv_device *device, enum anv_descriptor_data data = bind_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT ? 
- anv_descriptor_data_for_type(device->physical, type) : + anv_descriptor_data_for_type(device->physical, set->layout->type, type) : bind_layout->data; - void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset + - element * bind_layout->descriptor_stride; + void *desc_map = set->desc_surface_mem.map + + bind_layout->descriptor_surface_offset + + element * bind_layout->descriptor_surface_stride; if (buffer == NULL) { if (data & ANV_DESCRIPTOR_SURFACE) memcpy(desc_map, device->null_surface_state.map, ANV_SURFACE_STATE_SIZE); else - memset(desc_map, 0, bind_layout->descriptor_stride); + memset(desc_map, 0, bind_layout->descriptor_surface_stride); return; } @@ -2038,7 +2328,8 @@ anv_descriptor_set_write_inline_uniform_data(struct anv_device *device, assert(bind_layout->data & ANV_DESCRIPTOR_INLINE_UNIFORM); - void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset; + void *desc_map = set->desc_surface_mem.map + + bind_layout->descriptor_surface_offset; memcpy(desc_map + offset, data, size); } @@ -2066,10 +2357,11 @@ anv_descriptor_set_write_acceleration_structure(struct anv_device *device, desc_data.address = vk_acceleration_structure_get_va(accel); desc_data.range = accel->size; } - assert(sizeof(desc_data) <= bind_layout->descriptor_stride); + assert(sizeof(desc_data) <= bind_layout->descriptor_surface_stride); - void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset + - element * bind_layout->descriptor_stride; + void *desc_map = set->desc_surface_mem.map + + bind_layout->descriptor_surface_offset + + element * bind_layout->descriptor_surface_stride; memcpy(desc_map, &desc_data, sizeof(desc_data)); } @@ -2190,14 +2482,19 @@ void anv_UpdateDescriptorSets( if (src_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { anv_descriptor_set_write_inline_uniform_data(device, dst, copy->dstBinding, - src->desc_mem.map + src_layout->descriptor_offset + copy->srcArrayElement, + src->desc_surface_mem.map + + src_layout->descriptor_surface_offset + copy->srcArrayElement, copy->dstArrayElement, copy->descriptorCount); continue; } - uint32_t copy_element_size = MIN2(src_layout->descriptor_stride, - dst_layout->descriptor_stride); + uint32_t copy_surface_element_size = + MIN2(src_layout->descriptor_surface_stride, + dst_layout->descriptor_surface_stride); + uint32_t copy_sampler_element_size = + MIN2(src_layout->descriptor_sampler_stride, + dst_layout->descriptor_sampler_stride); for (uint32_t j = 0; j < copy->descriptorCount; j++) { struct anv_descriptor *src_desc = &src->descriptors[src_layout->descriptor_index + @@ -2214,13 +2511,20 @@ void anv_UpdateDescriptorSets( * - RENDER_SURFACE_STATE * - SAMPLER_STATE */ - memcpy(dst->desc_mem.map + - dst_layout->descriptor_offset + - (copy->dstArrayElement + j) * dst_layout->descriptor_stride, - src->desc_mem.map + - src_layout->descriptor_offset + - (copy->srcArrayElement + j) * src_layout->descriptor_stride, - copy_element_size); + memcpy(dst->desc_surface_mem.map + + dst_layout->descriptor_surface_offset + + (copy->dstArrayElement + j) * dst_layout->descriptor_surface_stride, + src->desc_surface_mem.map + + src_layout->descriptor_surface_offset + + (copy->srcArrayElement + j) * src_layout->descriptor_surface_stride, + copy_surface_element_size); + memcpy(dst->desc_sampler_mem.map + + dst_layout->descriptor_sampler_offset + + (copy->dstArrayElement + j) * dst_layout->descriptor_sampler_stride, + src->desc_sampler_mem.map + + src_layout->descriptor_sampler_offset + + (copy->srcArrayElement + j) * 
src_layout->descriptor_sampler_stride, + copy_sampler_element_size); /* Copy the CPU side data anv_descriptor */ *dst_desc = *src_desc; @@ -2230,7 +2534,9 @@ void anv_UpdateDescriptorSets( */ const enum anv_descriptor_data data = src_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT ? - anv_descriptor_data_for_type(device->physical, src_desc->type) : + anv_descriptor_data_for_type(device->physical, + src->layout->type, + src_desc->type) : src_layout->data; if (data & ANV_DESCRIPTOR_BUFFER_VIEW) { struct anv_buffer_view *src_bview = diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index 358c283f1a6..c878d76c45e 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -1479,7 +1479,7 @@ anv_physical_device_try_create(struct vk_instance *vk_instance, device->compiler->supports_shader_constants = true; device->compiler->indirect_ubos_use_sampler = device->info.ver < 12; device->compiler->extended_bindless_surface_offset = device->uses_ex_bso; - device->compiler->use_bindless_sampler_offset = !device->indirect_descriptors; + device->compiler->use_bindless_sampler_offset = false; device->compiler->spilling_rate = driQueryOptioni(&instance->dri_options, "shader_spilling_rate"); @@ -3324,6 +3324,9 @@ VkResult anv_CreateDevice( device->physical->va.bindless_surface_state_pool.size); } + util_vma_heap_init(&device->vma_samplers, + device->physical->va.sampler_state_pool.addr, + device->physical->va.sampler_state_pool.size); util_vma_heap_init(&device->vma_trtt, device->physical->va.trtt.addr, device->physical->va.trtt.size); @@ -3789,6 +3792,8 @@ VkResult anv_CreateDevice( pthread_mutex_destroy(&device->mutex); fail_vmas: util_vma_heap_finish(&device->vma_trtt); + if (!device->physical->indirect_descriptors) + util_vma_heap_finish(&device->vma_samplers); util_vma_heap_finish(&device->vma_desc); util_vma_heap_finish(&device->vma_hi); util_vma_heap_finish(&device->vma_lo); @@ -3903,6 +3908,8 @@ void anv_DestroyDevice( anv_bo_cache_finish(&device->bo_cache); util_vma_heap_finish(&device->vma_trtt); + if (!device->physical->indirect_descriptors) + util_vma_heap_finish(&device->vma_samplers); util_vma_heap_finish(&device->vma_desc); util_vma_heap_finish(&device->vma_hi); util_vma_heap_finish(&device->vma_lo); @@ -3970,6 +3977,9 @@ anv_vma_heap_for_flags(struct anv_device *device, if (alloc_flags & ANV_BO_ALLOC_DESCRIPTOR_POOL) return &device->vma_desc; + if (alloc_flags & ANV_BO_ALLOC_SAMPLER_POOL) + return &device->vma_samplers; + return &device->vma_hi; } @@ -4022,6 +4032,7 @@ anv_vma_free(struct anv_device *device, assert(vma_heap == &device->vma_lo || vma_heap == &device->vma_hi || vma_heap == &device->vma_desc || + vma_heap == &device->vma_samplers || vma_heap == &device->vma_trtt); const uint64_t addr_48b = intel_48b_address(address); diff --git a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c index 6aad6fe2fe5..e55e4e51391 100644 --- a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c +++ b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c @@ -136,7 +136,7 @@ add_binding(struct apply_pipeline_layout_state *state, * this binding. This lets us be lazy and call this function constantly * without worrying about unnecessarily enabling the buffer. 
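 *
 * (Illustrative: only bindings that actually occupy descriptor buffer
 * space report a non-zero surface stride, so the check below is what
 * keeps unused descriptor buffers disabled:
 *
 *    if (bind_layout->descriptor_surface_stride)
 *       state->set[set].desc_buffer_used = true;
 * )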
*/ - if (bind_layout->descriptor_stride) + if (bind_layout->descriptor_surface_stride) state->set[set].desc_buffer_used = true; if (bind_layout->dynamic_offset_index >= 0) @@ -556,7 +556,8 @@ build_res_index(nir_builder *b, case nir_address_format_64bit_global_32bit_offset: /* Descriptor set buffer accesses will go through A64 messages, so the * index to get the descriptor set buffer address is located in the - * anv_push_constants::desc_offsets and it's indexed by the set number. + * anv_push_constants::desc_surface_offsets and it's indexed by the set + * number. */ set_idx = set; break; @@ -593,8 +594,8 @@ build_res_index(nir_builder *b, } const uint32_t desc_bti = state->set[set].binding[binding].surface_offset; - assert(bind_layout->descriptor_stride % 8 == 0); - const uint32_t desc_stride = bind_layout->descriptor_stride / 8; + assert(bind_layout->descriptor_surface_stride % 8 == 0); + const uint32_t desc_stride = bind_layout->descriptor_surface_stride / 8; nir_def *packed = nir_ior_imm(b, @@ -605,7 +606,7 @@ build_res_index(nir_builder *b, return nir_vec4(b, packed, - nir_imm_int(b, bind_layout->descriptor_offset), + nir_imm_int(b, bind_layout->descriptor_surface_offset), nir_imm_int(b, array_size - 1), array_index); } @@ -748,8 +749,8 @@ build_desc_addr_for_binding(nir_builder *b, nir_iadd_imm(b, nir_imul_imm(b, array_index, - bind_layout->descriptor_stride), - bind_layout->descriptor_offset); + bind_layout->descriptor_surface_stride), + bind_layout->descriptor_surface_offset); return nir_vec4(b, nir_unpack_64_2x32_split_x(b, set_addr), nir_unpack_64_2x32_split_y(b, set_addr), @@ -763,14 +764,38 @@ build_desc_addr_for_binding(nir_builder *b, nir_iadd_imm(b, nir_imul_imm(b, array_index, - bind_layout->descriptor_stride), - bind_layout->descriptor_offset)); + bind_layout->descriptor_surface_stride), + bind_layout->descriptor_surface_offset)); default: unreachable("Unhandled address format"); } } +static unsigned +binding_descriptor_offset(const struct apply_pipeline_layout_state *state, + const struct anv_descriptor_set_binding_layout *bind_layout, + bool sampler) +{ + if (sampler && + state->layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT) + return bind_layout->descriptor_sampler_offset; + + return bind_layout->descriptor_surface_offset; +} + +static unsigned +binding_descriptor_stride(const struct apply_pipeline_layout_state *state, + const struct anv_descriptor_set_binding_layout *bind_layout, + bool sampler) +{ + if (sampler && + state->layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT) + return bind_layout->descriptor_sampler_stride; + + return bind_layout->descriptor_surface_stride; +} + static nir_def * build_surface_index_for_binding(nir_builder *b, unsigned set, unsigned binding, @@ -781,6 +806,10 @@ build_surface_index_for_binding(nir_builder *b, { const struct anv_descriptor_set_binding_layout *bind_layout = &state->layout->set[set].layout->binding[binding]; + const unsigned descriptor_offset = + binding_descriptor_offset(state, bind_layout, false /* sampler */); + const unsigned descriptor_stride = + binding_descriptor_stride(state, bind_layout, false /* sampler */); const bool is_bindless = is_binding_bindless(set, binding, false /* sampler */, state); @@ -797,23 +826,25 @@ build_surface_index_for_binding(nir_builder *b, } else { set_offset = nir_load_push_constant(b, 1, 32, nir_imm_int(b, 0), - .base = offsetof(struct anv_push_constants, desc_offsets[set]), - .range = sizeof_field(struct anv_push_constants, desc_offsets[set])); + .base = 
offsetof(struct anv_push_constants, + desc_surface_offsets[set]), + .range = sizeof_field(struct anv_push_constants, + desc_surface_offsets[set])); /* With bindless indexes are offsets in the descriptor buffer */ surface_index = nir_iadd_imm(b, - nir_imul_imm(b, array_index, bind_layout->descriptor_stride), - bind_layout->descriptor_offset); + nir_imul_imm(b, array_index, descriptor_stride), + descriptor_offset); if (plane != 0) { assert(plane < bind_layout->max_plane_count); surface_index = nir_iadd_imm(b, surface_index, - plane * (bind_layout->descriptor_stride / + plane * (descriptor_stride / bind_layout->max_plane_count)); } - assert(bind_layout->descriptor_offset % 64 == 0); - assert(bind_layout->descriptor_stride % 64 == 0); + assert(descriptor_offset % 64 == 0); + assert(descriptor_stride % 64 == 0); } } else { /* Unused */ @@ -854,14 +885,17 @@ build_sampler_handle_for_binding(nir_builder *b, bool non_uniform, const struct apply_pipeline_layout_state *state) { + const struct anv_descriptor_set_binding_layout *bind_layout = + &state->layout->set[set].layout->binding[binding]; + const unsigned descriptor_offset = + binding_descriptor_offset(state, bind_layout, true /* sampler */); + const unsigned descriptor_stride = + binding_descriptor_stride(state, bind_layout, true /* sampler */); const bool is_bindless = is_binding_bindless(set, binding, true /* sampler */, state); nir_def *set_offset, *sampler_index; if (is_bindless) { - const struct anv_descriptor_set_binding_layout *bind_layout = - &state->layout->set[set].layout->binding[binding]; - if (state->layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT) { set_offset = nir_imm_int(b, 0xdeaddead); @@ -878,10 +912,12 @@ build_sampler_handle_for_binding(nir_builder *b, } else { set_offset = nir_load_push_constant(b, 1, 32, nir_imm_int(b, 0), - .base = offsetof(struct anv_push_constants, desc_offsets[set]), - .range = sizeof_field(struct anv_push_constants, desc_offsets[set])); + .base = offsetof(struct anv_push_constants, + desc_sampler_offsets[set]), + .range = sizeof_field(struct anv_push_constants, + desc_sampler_offsets[set])); - uint32_t base_offset = bind_layout->descriptor_offset; + uint32_t base_offset = descriptor_offset; /* The SAMPLER_STATE can only be located at a 64 byte in the combined * image/sampler case. 
@@ -892,13 +928,13 @@ build_sampler_handle_for_binding,
 
          if (plane != 0) {
             assert(plane < bind_layout->max_plane_count);
-            base_offset += plane * (bind_layout->descriptor_stride /
+            base_offset += plane * (descriptor_stride /
                                     bind_layout->max_plane_count);
          }
 
          sampler_index =
             nir_iadd_imm(b,
-                         nir_imul_imm(b, array_index, bind_layout->descriptor_stride),
+                         nir_imul_imm(b, array_index, descriptor_stride),
                          base_offset);
       }
    } else {
@@ -1095,7 +1131,7 @@ build_buffer_addr_for_binding(nir_builder *b,
          &state->layout->set[set].layout->binding[binding];
       return nir_vec2(b,
                       nir_imm_int(b, state->set[set].desc_offset),
-                      nir_imm_int(b, bind_layout->descriptor_offset));
+                      nir_imm_int(b, bind_layout->descriptor_surface_offset));
    }
 
    struct res_index_defs res = unpack_res_index(b, res_index);
@@ -1875,9 +1911,9 @@ add_bti_entry(struct anv_pipeline_bind_map *map,
       .set = set,
       .binding = binding,
       .index = bind_layout->descriptor_index + element,
-      .set_offset = bind_layout->descriptor_offset +
-                    element * bind_layout->descriptor_stride +
-                    plane * bind_layout->descriptor_data_size,
+      .set_offset = bind_layout->descriptor_surface_offset +
+                    element * bind_layout->descriptor_surface_stride +
+                    plane * bind_layout->descriptor_data_surface_size,
       .plane = plane,
    };
    assert(map->surface_count <= MAX_BINDING_TABLE_SIZE);
@@ -1896,8 +1932,8 @@ add_dynamic_bti_entry(struct anv_pipeline_bind_map *map,
       .set = set,
       .binding = binding,
       .index = bind_layout->descriptor_index + element,
-      .set_offset = bind_layout->descriptor_offset +
-                    element * bind_layout->descriptor_stride,
+      .set_offset = bind_layout->descriptor_surface_offset +
+                    element * bind_layout->descriptor_surface_stride,
       .dynamic_offset_index = bind_layout->dynamic_offset_index + element,
    };
    assert(map->surface_count <= MAX_BINDING_TABLE_SIZE);
diff --git a/src/intel/vulkan/anv_nir_compute_push_layout.c b/src/intel/vulkan/anv_nir_compute_push_layout.c
index ef9f7634a36..4035ba2f9cf 100644
--- a/src/intel/vulkan/anv_nir_compute_push_layout.c
+++ b/src/intel/vulkan/anv_nir_compute_push_layout.c
@@ -68,10 +68,12 @@ anv_nir_compute_push_layout(nir_shader *nir,
 
       case nir_intrinsic_load_desc_set_address_intel:
       case nir_intrinsic_load_desc_set_dynamic_index_intel: {
-         unsigned base = offsetof(struct anv_push_constants, desc_offsets);
+         unsigned base = offsetof(struct anv_push_constants,
+                                  desc_surface_offsets);
          push_start = MIN2(push_start, base);
          push_end = MAX2(push_end, base +
-            sizeof_field(struct anv_push_constants, desc_offsets));
+                         sizeof_field(struct anv_push_constants,
+                                      desc_surface_offsets));
          break;
       }
 
@@ -175,8 +177,10 @@ anv_nir_compute_push_layout(nir_shader *nir,
             b->cursor = nir_before_instr(&intrin->instr);
             nir_def *pc_load = nir_load_uniform(b, 1, 32,
                nir_imul_imm(b, intrin->src[0].ssa, sizeof(uint32_t)),
-               .base = offsetof(struct anv_push_constants, desc_offsets),
-               .range = sizeof_field(struct anv_push_constants, desc_offsets),
+               .base = offsetof(struct anv_push_constants,
+                                desc_surface_offsets),
+               .range = sizeof_field(struct anv_push_constants,
+                                     desc_surface_offsets),
                .dest_type = nir_type_uint32);
             pc_load = nir_iand_imm(b, pc_load, ANV_DESCRIPTOR_SET_OFFSET_MASK);
             nir_def *desc_addr =
@@ -192,8 +196,10 @@ anv_nir_compute_push_layout(nir_shader *nir,
             b->cursor = nir_before_instr(&intrin->instr);
             nir_def *pc_load = nir_load_uniform(b, 1, 32,
                nir_imul_imm(b, intrin->src[0].ssa, sizeof(uint32_t)),
-               .base = offsetof(struct anv_push_constants, desc_offsets),
-               .range = sizeof_field(struct anv_push_constants, desc_offsets),
+               .base = offsetof(struct anv_push_constants,
+                                desc_surface_offsets),
+               .range = sizeof_field(struct anv_push_constants,
+                                     desc_surface_offsets),
                .dest_type = nir_type_uint32);
             pc_load = nir_iand_imm(
                b, pc_load, ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK);
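Both of the lowered loads above read the same 32-bit desc_surface_offsets[] word and then mask off the half they need: the high bits hold the set's byte offset in the heap, the low bits hold the set's index into dynamic_offsets[]. A self-contained illustration of the packing, assuming ANV_UBO_ALIGNMENT is 64 as defined in anv_private.h:

    #include <stdint.h>
    #include <stdio.h>

    #define UBO_ALIGNMENT      64u                            /* ANV_UBO_ALIGNMENT */
    #define DYNAMIC_INDEX_MASK (UBO_ALIGNMENT - 1)            /* bits [0:5]  */
    #define OFFSET_MASK        (~(uint32_t)(UBO_ALIGNMENT - 1)) /* bits [6:31] */

    int main(void)
    {
       uint32_t packed = 0;
       packed |= 3u & DYNAMIC_INDEX_MASK;   /* dynamic_offset_start = 3  */
       packed |= 0x4000u & OFFSET_MASK;     /* set offset = 16KB in heap */

       printf("offset=0x%x dynamic_index=%u\n",
              packed & OFFSET_MASK, packed & DYNAMIC_INDEX_MASK);
       return 0;
    }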
diff --git a/src/intel/vulkan/anv_nir_lower_resource_intel.c b/src/intel/vulkan/anv_nir_lower_resource_intel.c
index fffe4a89611..603831b8b11 100644
--- a/src/intel/vulkan/anv_nir_lower_resource_intel.c
+++ b/src/intel/vulkan/anv_nir_lower_resource_intel.c
@@ -135,9 +135,7 @@ lower_resource_intel(nir_builder *b, nir_intrinsic_instr *intrin, void *data)
        * set, resource_intel::src[0] has to be shifted right by 6 (bringing
        * it back in bytes).
        */
-      if (is_sampler)
-         set_offset = nir_ushr_imm(b, set_offset, 6);
-      else
+      if (!is_sampler)
          binding_offset = nir_ishl_imm(b, binding_offset, 6);
    }
 
diff --git a/src/intel/vulkan/anv_pipeline_cache.c b/src/intel/vulkan/anv_pipeline_cache.c
index 880b297858b..2ead8ecd3ba 100644
--- a/src/intel/vulkan/anv_pipeline_cache.c
+++ b/src/intel/vulkan/anv_pipeline_cache.c
@@ -123,13 +123,15 @@ anv_shader_bin_create(struct anv_device *device,
    int rv_count = 0;
    struct brw_shader_reloc_value reloc_values[6];
 
-   assert((device->physical->va.instruction_state_pool.addr & 0xffffffff) == 0);
+   assert((device->physical->va.indirect_descriptor_pool.addr & 0xffffffff) == 0);
+   assert((device->physical->va.internal_surface_state_pool.addr & 0xffffffff) == 0);
    reloc_values[rv_count++] = (struct brw_shader_reloc_value) {
       .id = BRW_SHADER_RELOC_DESCRIPTORS_ADDR_HIGH,
       .value = device->physical->indirect_descriptors ?
               (device->physical->va.indirect_descriptor_pool.addr >> 32) :
-              (device->physical->va.binding_table_pool.addr >> 32),
+              (device->physical->va.internal_surface_state_pool.addr >> 32),
    };
+   assert((device->physical->va.instruction_state_pool.addr & 0xffffffff) == 0);
    reloc_values[rv_count++] = (struct brw_shader_reloc_value) {
       .id = BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW,
       .value = shader_data_addr,
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 9c108e78308..b1c558c7713 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -432,6 +432,9 @@ enum anv_bo_alloc_flags {
     * set it will allocate a coherent BO.
    **/
   ANV_BO_ALLOC_HOST_CACHED = (1 << 16),
+
+   /** For sampler pools */
+   ANV_BO_ALLOC_SAMPLER_POOL = (1 << 17),
 };
 
 struct anv_bo {
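The new flag only marks descriptor-pool sampler BOs so the allocator can carve them out of the sampler VA range (the vma_samplers heap added below); the routing code itself is not part of this patch, so the following is purely illustrative:

    #include <stdbool.h>
    #include <stdint.h>

    #define BO_ALLOC_SAMPLER_POOL (1u << 17)   /* ANV_BO_ALLOC_SAMPLER_POOL */

    /* Illustrative predicate: should this allocation come from the
     * sampler VA range rather than the generic heaps? */
    static bool
    uses_sampler_vma(uint32_t alloc_flags)
    {
       return (alloc_flags & BO_ALLOC_SAMPLER_POOL) != 0;
    }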
@@ -1634,6 +1637,7 @@ struct anv_device {
    struct util_vma_heap vma_lo;
    struct util_vma_heap vma_hi;
    struct util_vma_heap vma_desc;
+   struct util_vma_heap vma_samplers;
    struct util_vma_heap vma_trtt;
 
    /** List of all anv_device_memory objects */
@@ -2370,18 +2374,30 @@ struct anv_descriptor_set_binding_layout {
    */
   int16_t dynamic_offset_index;
 
-   /* Computed size from data */
-   uint16_t descriptor_data_size;
+   /* Computed surface size from data (for one plane) */
+   uint16_t descriptor_data_surface_size;
+
+   /* Computed sampler size from data (for one plane) */
+   uint16_t descriptor_data_sampler_size;
 
    /* Index into the descriptor set buffer views */
    int32_t buffer_view_index;
 
-   /* Offset into the descriptor buffer where this descriptor lives */
-   uint32_t descriptor_offset;
+   /* Offset into the descriptor buffer where the surface descriptor lives */
+   uint32_t descriptor_surface_offset;
 
-   /* Pre computed stride (with multiplane descriptor, the descriptor includes
-    * all the planes) */
-   unsigned descriptor_stride;
+   /* Offset into the descriptor buffer where the sampler descriptor lives */
+   uint16_t descriptor_sampler_offset;
+
+   /* Pre computed surface stride (with multiplane descriptor, the descriptor
+    * includes all the planes)
+    */
+   uint16_t descriptor_surface_stride;
+
+   /* Pre computed sampler stride (with multiplane descriptor, the descriptor
+    * includes all the planes)
+    */
+   uint16_t descriptor_sampler_stride;
 
    /* Immutable samplers (or NULL if no immutable samplers) */
    struct anv_sampler **immutable_samplers;
@@ -2433,8 +2449,15 @@ struct anv_descriptor_set_layout {
    */
   VkShaderStageFlags dynamic_offset_stages[MAX_DYNAMIC_BUFFERS];
 
-   /* Size of the descriptor buffer for this descriptor set */
-   uint32_t descriptor_buffer_size;
+   /* Size of the descriptor buffer dedicated to surface states for this
+    * descriptor set
+    */
+   uint32_t descriptor_buffer_surface_size;
+
+   /* Size of the descriptor buffer dedicated to sampler states for this
+    * descriptor set
+    */
+   uint32_t descriptor_buffer_sampler_size;
 
    /* Bindings in this descriptor set */
    struct anv_descriptor_set_binding_layout binding[0];
@@ -2506,13 +2529,20 @@ struct anv_descriptor_set {
    */
   uint32_t generate_surface_states;
 
-   /* State relative to anv_descriptor_pool::bo */
-   struct anv_state desc_mem;
+   /* State relative to anv_descriptor_pool::surface_bo */
+   struct anv_state desc_surface_mem;
+   /* State relative to anv_descriptor_pool::sampler_bo */
+   struct anv_state desc_sampler_mem;
 
    /* Surface state for the descriptor buffer */
    struct anv_state desc_surface_state;
 
-   /* Descriptor set address. */
-   struct anv_address desc_addr;
+   /* Descriptor set address pointing to desc_surface_mem (we don't need one
+    * for samplers because they're never accessed other than by the HW through
+    * the shader sampler handle).
+    */
+   struct anv_address desc_surface_addr;
+
+   struct anv_address desc_sampler_addr;
 
    /* Descriptor offset from the
    * device->va.internal_surface_state_pool.addr
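With the split fields above, one binding contributes independently to the two descriptor buffers: its surface states go to one, its SAMPLER_STATEs to the other. An illustrative size computation (the real one lives in anv_descriptor_set.c, which this hunk does not show):

    #include <stdint.h>

    struct binding_sizes {
       uint32_t array_size;      /* number of descriptors in the binding */
       uint16_t surface_stride;  /* descriptor_surface_stride, all planes */
       uint16_t sampler_stride;  /* descriptor_sampler_stride, all planes */
    };

    static void
    accumulate_buffer_sizes(const struct binding_sizes *b,
                            uint32_t *surface_size, uint32_t *sampler_size)
    {
       /* Strides already cover every plane of a multi-plane descriptor,
        * so the per-binding contribution is just array_size * stride. */
       *surface_size += b->array_size * b->surface_stride;
       *sampler_size += b->array_size * b->sampler_stride;
    }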
@@ -2592,15 +2622,28 @@ anv_descriptor_set_address(struct anv_descriptor_set *set)
       push_set->set_used_on_gpu = true;
    }
 
-   return set->desc_addr;
+   return set->desc_surface_addr;
 }
 
+struct anv_descriptor_pool_heap {
+   /* BO allocated to back the pool (unused for host pools) */
+   struct anv_bo *bo;
+
+   /* Host memory allocated to back a host pool */
+   void *host_mem;
+
+   /* Heap tracking allocations in bo/host_mem */
+   struct util_vma_heap heap;
+
+   /* Size of the heap */
+   uint32_t size;
+};
+
 struct anv_descriptor_pool {
    struct vk_object_base base;
 
-   struct anv_bo *bo;
-   void *host_bo;
-   struct util_vma_heap bo_heap;
+   struct anv_descriptor_pool_heap surfaces;
+   struct anv_descriptor_pool_heap samplers;
 
    struct anv_state_stream surface_state_stream;
    void *surface_state_free_list;
@@ -2614,9 +2657,6 @@ struct anv_descriptor_pool {
    /** Allocated size of host_mem */
    uint32_t host_mem_size;
 
-   /** Allocated size of descriptor bo (should be equal to bo->size) */
-   uint32_t bo_mem_size;
-
    /**
     * VK_DESCRIPTOR_POOL_CREATE_HOST_ONLY_BIT_EXT. If set, then
     * surface_state_stream is unused.
@@ -3265,15 +3305,6 @@ struct anv_push_constants {
    /** Push constant data provided by the client through vkPushConstants */
    uint8_t client_data[MAX_PUSH_CONSTANTS_SIZE];
 
-   /** Dynamic offsets for dynamic UBOs and SSBOs */
-   uint32_t dynamic_offsets[MAX_DYNAMIC_BUFFERS];
-
-   /* Robust access pushed registers. */
-   uint64_t push_reg_mask[MESA_SHADER_STAGES];
-
-   /** Ray query globals (RT_DISPATCH_GLOBALS) */
-   uint64_t ray_query_globals;
-
 #define ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK ((uint32_t)ANV_UBO_ALIGNMENT - 1)
 #define ANV_DESCRIPTOR_SET_OFFSET_MASK        (~(uint32_t)(ANV_UBO_ALIGNMENT - 1))
 
@@ -3285,7 +3316,15 @@ struct anv_push_constants {
    *
    * In bits [6:63] : descriptor set address
    */
-   uint32_t desc_offsets[MAX_SETS];
+   uint32_t desc_surface_offsets[MAX_SETS];
+
+   /**
+    * Base offsets for descriptor sets from the dynamic state pool
+    */
+   uint32_t desc_sampler_offsets[MAX_SETS];
+
+   /** Dynamic offsets for dynamic UBOs and SSBOs */
+   uint32_t dynamic_offsets[MAX_DYNAMIC_BUFFERS];
 
    union {
       struct {
@@ -3311,6 +3350,12 @@ struct anv_push_constants {
          uint32_t subgroup_id;
       } cs;
    };
+
+   /* Robust access pushed registers. */
+   uint64_t push_reg_mask[MESA_SHADER_STAGES];
+
+   /** Ray query globals (RT_DISPATCH_GLOBALS) */
+   uint64_t ray_query_globals;
 };
 
 struct anv_surface_state {
diff --git a/src/intel/vulkan/anv_va.c b/src/intel/vulkan/anv_va.c
index b3bea2ae120..f7caa822c69 100644
--- a/src/intel/vulkan/anv_va.c
+++ b/src/intel/vulkan/anv_va.c
@@ -102,30 +102,29 @@ anv_physical_device_init_va_ranges(struct anv_physical_device *device)
                     _1Gb - address);
    address = va_add(&device->va.low_heap, address, _1Gb);
 
-   /* PRMs & simulation disagrees on the actual size of this heap. Take the
-    * smallest (simulation) so that it works everywhere.
-    */
-   address = va_add(&device->va.dynamic_state_pool, address, _1Gb);
-   address = va_add(&device->va.sampler_state_pool, address, 2 * _1Gb);
-
-   /* The following addresses have to be located in a 4Gb range so that the
-    * binding tables can address internal surface states & bindless surface
-    * states.
+   /* The binding table pool has to be located directly in front of the
+    * surface states.
     */
-   address = align64(address, _4Gb);
+   address += _1Gb;
    address = va_add(&device->va.binding_table_pool, address, _1Gb);
    address = va_add(&device->va.internal_surface_state_pool, address, 1 * _1Gb);
+   assert(device->va.internal_surface_state_pool.addr ==
+          align64(device->va.internal_surface_state_pool.addr, 2 * _1Gb));
    /* Scratch surface state overlaps with the internal surface state */
    va_at(&device->va.scratch_surface_state_pool,
         device->va.internal_surface_state_pool.addr,
         8 * _1Mb);
-
-   /* The bindless surface state heap has be in the same 4Gb range from the
-    * binding table pool start so they can be addressed from binding table
-    * entries.
-    */
    address = va_add(&device->va.bindless_surface_state_pool, address, 2 * _1Gb);
+
+   /* PRMs & simulation disagrees on the actual size of this heap. Take the
+    * smallest (simulation) so that it works everywhere.
+    */
+   address = align64(address, _4Gb);
+   address = va_add(&device->va.dynamic_state_pool, address, _1Gb);
+   address = va_add(&device->va.sampler_state_pool, address, 2 * _1Gb);
+
    if (device->indirect_descriptors) {
       /* With indirect descriptors, descriptor buffers can go anywhere, they
        * just need to be in a 4Gb aligned range, so all shader accesses can
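The reordered map reads roughly as follows: the binding table pool sits directly below the two surface state heaps it must address, while the dynamic state pool and the sampler state pool move up to their own 4Gb-aligned block, which is what lets SAMPLER_STATE allocations live in the dynamic state heap range. A compilable sketch of the same arithmetic with a placeholder start address:

    #include <stdint.h>
    #include <stdio.h>

    #define GB (1ull << 30)

    static uint64_t align64(uint64_t v, uint64_t a) { return (v + a - 1) & ~(a - 1); }

    int main(void)
    {
       uint64_t address = 2 * GB;             /* placeholder, not the real base */
       address += 1 * GB;                     /* gap kept before binding tables */
       uint64_t binding_table = address;      address += 1 * GB;
       uint64_t internal_surfaces = address;  address += 1 * GB;
       uint64_t bindless_surfaces = address;  address += 2 * GB;
       address = align64(address, 4 * GB);    /* new: own 4Gb-aligned block */
       uint64_t dynamic_state = address;      address += 1 * GB;
       uint64_t sampler_state = address;      address += 2 * GB;

       printf("bt=%llx surf=%llx bindless=%llx dyn=%llx samp=%llx\n",
              (unsigned long long)binding_table,
              (unsigned long long)internal_surfaces,
              (unsigned long long)bindless_surfaces,
              (unsigned long long)dynamic_state,
              (unsigned long long)sampler_state);
       return 0;
    }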
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index fd8bfb2e7a8..1266a7c80ec 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -171,33 +171,34 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
       sba.GeneralStateBufferSize = 0xfffff;
       sba.IndirectObjectBufferSize = 0xfffff;
-      sba.DynamicStateBufferSize = device->physical->va.dynamic_state_pool.size / 4096;
+      sba.DynamicStateBufferSize = (device->physical->va.dynamic_state_pool.size +
+                                    device->physical->va.sampler_state_pool.size) / 4096;
       sba.InstructionBufferSize = device->physical->va.instruction_state_pool.size / 4096;
       sba.GeneralStateBufferSizeModifyEnable = true;
       sba.IndirectObjectBufferSizeModifyEnable = true;
       sba.DynamicStateBufferSizeModifyEnable = true;
       sba.InstructionBuffersizeModifyEnable = true;
+#if GFX_VER >= 11
+      sba.BindlessSamplerStateBaseAddress = ANV_NULL_ADDRESS;
+      sba.BindlessSamplerStateBufferSize = 0;
+      sba.BindlessSamplerStateMOCS = mocs;
+      sba.BindlessSamplerStateBaseAddressModifyEnable = true;
+#endif
+
       if (!device->physical->indirect_descriptors) {
 #if GFX_VERx10 >= 125
          /* Bindless Surface State & Bindless Sampler State are aligned to the
          * same heap
          */
          sba.BindlessSurfaceStateBaseAddress =
-         sba.BindlessSamplerStateBaseAddress =
            (struct anv_address) { .offset =
               device->physical->va.binding_table_pool.addr, };
         sba.BindlessSurfaceStateSize =
-           (device->physical->va.binding_table_pool.size +
-            device->physical->va.internal_surface_state_pool.size +
-            device->physical->va.descriptor_pool.size) - 1;
-        sba.BindlessSamplerStateBufferSize =
-           (device->physical->va.binding_table_pool.size +
-            device->physical->va.internal_surface_state_pool.size +
-            device->physical->va.descriptor_pool.size) / 4096 - 1;
-        sba.BindlessSurfaceStateMOCS = sba.BindlessSamplerStateMOCS = mocs;
-        sba.BindlessSurfaceStateBaseAddressModifyEnable =
-           sba.BindlessSamplerStateBaseAddressModifyEnable = true;
+           (device->physical->va.internal_surface_state_pool.size +
+            device->physical->va.bindless_surface_state_pool.size) - 1;
+        sba.BindlessSurfaceStateMOCS = mocs;
+        sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
 #else
         unreachable("Direct descriptor not supported");
 #endif
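DynamicStateBufferSize is expressed in 4KB pages and, with sampler states now allocated above the dynamic state pool, must cover both ranges (presumably so that sampler offsets relative to DynamicStateBaseAddress stay inside the programmed bound). The computation used here, and again in genX_init_state.c below, as a tiny sketch:

    #include <stdint.h>

    /* Size of the dynamic state window in 4KB pages, covering the dynamic
     * state pool plus the sampler state pool stacked on top of it. */
    static uint32_t
    dynamic_state_size_in_pages(uint64_t dynamic_pool_size,
                                uint64_t sampler_pool_size)
    {
       return (uint32_t)((dynamic_pool_size + sampler_pool_size) / 4096);
    }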
@@ -210,12 +211,6 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
             anv_physical_device_bindless_heap_size(device->physical) / ANV_SURFACE_STATE_SIZE - 1;
          sba.BindlessSurfaceStateMOCS = mocs;
          sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
-#if GFX_VER >= 11
-         sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 };
-         sba.BindlessSamplerStateMOCS = mocs;
-         sba.BindlessSamplerStateBaseAddressModifyEnable = true;
-         sba.BindlessSamplerStateBufferSize = 0;
-#endif
       }
 
 #if GFX_VERx10 >= 125
@@ -2115,7 +2110,7 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
          /* This is a descriptor set buffer so the set index is actually
          * given by binding->binding. (Yes, that's confusing.)
          */
-         assert(set->desc_mem.alloc_size);
+         assert(set->desc_surface_mem.alloc_size);
          assert(set->desc_surface_state.alloc_size);
          bt_map[s] = set->desc_surface_state.offset + state_offset;
          add_surface_reloc(cmd_buffer, anv_descriptor_set_address(set));
@@ -2349,8 +2344,8 @@ flush_push_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
                            set->desc_surface_state.map,
                            format, ISL_SWIZZLE_IDENTITY,
                            ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
-                           set->desc_addr,
-                           layout->descriptor_buffer_size, 1);
+                           set->desc_surface_addr,
+                           layout->descriptor_buffer_surface_size, 1);
    }
 
    state->push_descriptor.set_used_on_gpu = true;
@@ -2480,9 +2475,10 @@ get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
    case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
       struct anv_descriptor_set *set =
          gfx_state->base.descriptors[range->index];
-      assert(range->start * 32 < set->desc_mem.alloc_size);
-      assert((range->start + range->length) * 32 <= set->desc_mem.alloc_size);
-      return set->desc_mem.alloc_size;
+      struct anv_state state = set->desc_surface_mem;
+      assert(range->start * 32 < state.alloc_size);
+      assert((range->start + range->length) * 32 <= state.alloc_size);
+      return state.alloc_size;
    }
 
    case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS:
diff --git a/src/intel/vulkan/genX_init_state.c b/src/intel/vulkan/genX_init_state.c
index 431e43ba868..87d8c94b442 100644
--- a/src/intel/vulkan/genX_init_state.c
+++ b/src/intel/vulkan/genX_init_state.c
@@ -250,7 +250,8 @@ init_common_queue_state(struct anv_queue *queue, struct anv_batch *batch)
         (struct anv_address) {
         .offset = device->physical->va.dynamic_state_pool.addr,
      };
-      sba.DynamicStateBufferSize = device->physical->va.dynamic_state_pool.size / 4096;
+      sba.DynamicStateBufferSize = (device->physical->va.dynamic_state_pool.size +
+                                    device->physical->va.sampler_state_pool.size) / 4096;
      sba.DynamicStateMOCS = mocs;
      sba.DynamicStateBaseAddressModifyEnable = true;
      sba.DynamicStateBufferSizeModifyEnable = true;
@@ -270,6 +271,13 @@ init_common_queue_state(struct anv_queue *queue, struct anv_batch *batch)
      sba.InstructionBaseAddressModifyEnable = true;
      sba.InstructionBuffersizeModifyEnable = true;
 
+#if GFX_VER >= 11
+      sba.BindlessSamplerStateBaseAddress = ANV_NULL_ADDRESS;
+      sba.BindlessSamplerStateBufferSize = 0;
+      sba.BindlessSamplerStateMOCS = mocs;
+      sba.BindlessSamplerStateBaseAddressModifyEnable = true;
+#endif
+
      if (device->physical->indirect_descriptors) {
         sba.BindlessSurfaceStateBaseAddress = (struct anv_address) {
            .offset =
@@ -279,29 +287,18 @@ init_common_queue_state(struct anv_queue *queue, struct anv_batch *batch)
            anv_physical_device_bindless_heap_size(device->physical) / ANV_SURFACE_STATE_SIZE - 1;
        sba.BindlessSurfaceStateMOCS = mocs;
        sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
-
-        sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 };
-        sba.BindlessSamplerStateMOCS = mocs;
-        sba.BindlessSamplerStateBaseAddressModifyEnable = true;
-        sba.BindlessSamplerStateBufferSize = 0;
      } else {
         /* Bindless Surface State & Bindless Sampler State are aligned to the
         * same heap
         */
-        sba.BindlessSurfaceStateBaseAddress =
-           sba.BindlessSamplerStateBaseAddress =
-           (struct anv_address) { .offset = device->physical->va.binding_table_pool.addr, };
+        sba.BindlessSurfaceStateBaseAddress = (struct anv_address) {
+           .offset = device->physical->va.internal_surface_state_pool.addr,
+        };
        sba.BindlessSurfaceStateSize =
-           (device->physical->va.binding_table_pool.size +
-            device->physical->va.internal_surface_state_pool.size +
+           (device->physical->va.internal_surface_state_pool.size +
            device->physical->va.bindless_surface_state_pool.size) - 1;
-        sba.BindlessSamplerStateBufferSize =
-           (device->physical->va.binding_table_pool.size +
-            device->physical->va.internal_surface_state_pool.size +
-            device->physical->va.bindless_surface_state_pool.size) / 4096 - 1;
-        sba.BindlessSurfaceStateMOCS = sba.BindlessSamplerStateMOCS = mocs;
-        sba.BindlessSurfaceStateBaseAddressModifyEnable =
-           sba.BindlessSamplerStateBaseAddressModifyEnable = true;
+        sba.BindlessSurfaceStateMOCS = mocs;
+        sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
      }
 
 #if GFX_VERx10 >= 125