diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c
index adb8b3ddffe..f265a8bfc89 100644
--- a/src/intel/vulkan/anv_cmd_buffer.c
+++ b/src/intel/vulkan/anv_cmd_buffer.c
@@ -1001,6 +1001,37 @@ anv_cmd_buffer_push_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
    set->buffer_view_count = layout->buffer_view_count;
    set->buffer_views = (*push_set)->buffer_views;

+   if (layout->descriptor_buffer_size &&
+       ((*push_set)->set_used_on_gpu ||
+        set->desc_mem.alloc_size < layout->descriptor_buffer_size)) {
+      /* The previous buffer is either actively used by some GPU command (so
+       * we can't modify it) or is too small.  Allocate a new one.
+       */
+      struct anv_state desc_mem =
+         anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream,
+                                layout->descriptor_buffer_size, 32);
+      if (set->desc_mem.alloc_size) {
+         /* TODO: Do we really need to copy all the time? */
+         memcpy(desc_mem.map, set->desc_mem.map,
+                MIN2(desc_mem.alloc_size, set->desc_mem.alloc_size));
+      }
+      set->desc_mem = desc_mem;
+
+      struct anv_address addr = {
+         .bo = cmd_buffer->dynamic_state_stream.state_pool->block_pool.bo,
+         .offset = set->desc_mem.offset,
+      };
+
+      const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
+      set->desc_surface_state =
+         anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
+                                isl_dev->ss.size, isl_dev->ss.align);
+      anv_fill_buffer_surface_state(cmd_buffer->device,
+                                    set->desc_surface_state,
+                                    ISL_FORMAT_R32G32B32A32_FLOAT,
+                                    addr, layout->descriptor_buffer_size, 1);
+   }
+
    return set;
 }

diff --git a/src/intel/vulkan/anv_descriptor_set.c b/src/intel/vulkan/anv_descriptor_set.c
index 964180c5f96..3f0e3235500 100644
--- a/src/intel/vulkan/anv_descriptor_set.c
+++ b/src/intel/vulkan/anv_descriptor_set.c
@@ -82,6 +82,33 @@ anv_descriptor_data_for_type(const struct anv_physical_device *device,
    return data;
 }

+static unsigned
+anv_descriptor_data_size(enum anv_descriptor_data data)
+{
+   return 0;
+}
+
+/** Returns the size in bytes of each descriptor with the given layout */
+unsigned
+anv_descriptor_size(const struct anv_descriptor_set_binding_layout *layout)
+{
+   return anv_descriptor_data_size(layout->data);
+}
+
+/** Returns the size in bytes of each descriptor of the given type
+ *
+ * This version of the function does not have access to the entire layout so
+ * it may only work on certain descriptor types where the descriptor size is
+ * entirely determined by the descriptor type.  Whenever possible, code should
+ * use anv_descriptor_size() instead.
+ */
+unsigned
+anv_descriptor_type_size(const struct anv_physical_device *pdevice,
+                         VkDescriptorType type)
+{
+   return anv_descriptor_data_size(anv_descriptor_data_for_type(pdevice, type));
+}
+
 void anv_GetDescriptorSetLayoutSupport(
     VkDevice                                    device,
     const VkDescriptorSetLayoutCreateInfo*      pCreateInfo,
@@ -198,6 +225,7 @@ VkResult anv_CreateDescriptorSetLayout(

    uint32_t buffer_view_count = 0;
    uint32_t dynamic_offset_count = 0;
+   uint32_t descriptor_buffer_size = 0;

    for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) {
       const VkDescriptorSetLayoutBinding *binding = &pCreateInfo->pBindings[j];
@@ -267,11 +295,16 @@ VkResult anv_CreateDescriptorSetLayout(
          break;
       }

+      set_layout->binding[b].descriptor_offset = descriptor_buffer_size;
+      descriptor_buffer_size += anv_descriptor_size(&set_layout->binding[b]) *
+                                binding->descriptorCount;
+
       set_layout->shader_stages |= binding->stageFlags;
    }

    set_layout->buffer_view_count = buffer_view_count;
    set_layout->dynamic_offset_count = dynamic_offset_count;
+   set_layout->descriptor_buffer_size = descriptor_buffer_size;

    *pSetLayout = anv_descriptor_set_layout_to_handle(set_layout);

@@ -315,6 +348,7 @@ sha1_update_descriptor_set_binding_layout(struct mesa_sha1 *ctx,
    SHA1_UPDATE_VALUE(ctx, layout->descriptor_index);
    SHA1_UPDATE_VALUE(ctx, layout->dynamic_offset_index);
    SHA1_UPDATE_VALUE(ctx, layout->buffer_view_index);
+   SHA1_UPDATE_VALUE(ctx, layout->descriptor_offset);

    if (layout->immutable_samplers) {
       for (uint16_t i = 0; i < layout->array_size; i++)
@@ -331,6 +365,7 @@ sha1_update_descriptor_set_layout(struct mesa_sha1 *ctx,
    SHA1_UPDATE_VALUE(ctx, layout->shader_stages);
    SHA1_UPDATE_VALUE(ctx, layout->buffer_view_count);
    SHA1_UPDATE_VALUE(ctx, layout->dynamic_offset_count);
+   SHA1_UPDATE_VALUE(ctx, layout->descriptor_buffer_size);

    for (uint16_t i = 0; i < layout->binding_count; i++)
       sha1_update_descriptor_set_binding_layout(ctx, &layout->binding[i]);
@@ -420,6 +455,12 @@ void anv_DestroyPipelineLayout(
  * and the free lists lets us recycle blocks for case 2).
  */

+/* The vma heap reserves 0 to mean NULL; we have to offset by some amount to
+ * ensure we can allocate the entire BO without hitting zero.  The actual
+ * amount doesn't matter.
+ */
+#define POOL_HEAP_OFFSET 64
+
 #define EMPTY 1

 VkResult anv_CreateDescriptorPool(
@@ -433,6 +474,7 @@ VkResult anv_CreateDescriptorPool(

    uint32_t descriptor_count = 0;
    uint32_t buffer_view_count = 0;
+   uint32_t descriptor_bo_size = 0;
    for (uint32_t i = 0; i < pCreateInfo->poolSizeCount; i++) {
       enum anv_descriptor_data desc_data =
          anv_descriptor_data_for_type(&device->instance->physicalDevice,
@@ -441,8 +483,22 @@ VkResult anv_CreateDescriptorPool(
       if (desc_data & ANV_DESCRIPTOR_BUFFER_VIEW)
          buffer_view_count += pCreateInfo->pPoolSizes[i].descriptorCount;

+      unsigned desc_data_size = anv_descriptor_data_size(desc_data) *
+                                pCreateInfo->pPoolSizes[i].descriptorCount;
+      descriptor_bo_size += desc_data_size;
+
       descriptor_count += pCreateInfo->pPoolSizes[i].descriptorCount;
    }
+   /* We have to align descriptor buffer allocations to 32B so that we can
+    * push descriptor buffers.  This means that each descriptor buffer
+    * allocated may burn up to 32B of extra space to get the right alignment.
+    * (Technically, it's at most 28B because we're always going to start at
+    * least 4B aligned but we're being conservative here.)  Allocate enough
+    * extra space that we can chop it into maxSets pieces and align each one
+    * of them to 32B.
+    */
+   descriptor_bo_size += 32 * pCreateInfo->maxSets;
+   descriptor_bo_size = ALIGN(descriptor_bo_size, 4096);

    const size_t pool_size =
       pCreateInfo->maxSets * sizeof(struct anv_descriptor_set) +
@@ -459,6 +515,33 @@ VkResult anv_CreateDescriptorPool(
    pool->next = 0;
    pool->free_list = EMPTY;

+   if (descriptor_bo_size > 0) {
+      VkResult result = anv_bo_init_new(&pool->bo, device, descriptor_bo_size);
+      if (result != VK_SUCCESS) {
+         vk_free2(&device->alloc, pAllocator, pool);
+         return result;
+      }
+
+      anv_gem_set_caching(device, pool->bo.gem_handle, I915_CACHING_CACHED);
+
+      pool->bo.map = anv_gem_mmap(device, pool->bo.gem_handle, 0,
+                                  descriptor_bo_size, 0);
+      if (pool->bo.map == NULL) {
+         anv_gem_close(device, pool->bo.gem_handle);
+         vk_free2(&device->alloc, pAllocator, pool);
+         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+      }
+
+      if (device->instance->physicalDevice.use_softpin) {
+         pool->bo.flags |= EXEC_OBJECT_PINNED;
+         anv_vma_alloc(device, &pool->bo);
+      }
+
+      util_vma_heap_init(&pool->bo_heap, POOL_HEAP_OFFSET, descriptor_bo_size);
+   } else {
+      pool->bo.size = 0;
+   }
+
    anv_state_stream_init(&pool->surface_state_stream,
                          &device->surface_state_pool, 4096);
    pool->surface_state_free_list = NULL;
@@ -479,6 +562,11 @@ void anv_DestroyDescriptorPool(
    if (!pool)
       return;

+   if (pool->bo.size) {
+      anv_gem_munmap(pool->bo.map, pool->bo.size);
+      anv_vma_free(device, &pool->bo);
+      anv_gem_close(device, pool->bo.gem_handle);
+   }
    anv_state_stream_finish(&pool->surface_state_stream);
    vk_free2(&device->alloc, pAllocator, pool);
 }
@@ -493,6 +581,12 @@ VkResult anv_ResetDescriptorPool(

    pool->next = 0;
    pool->free_list = EMPTY;
+
+   if (pool->bo.size) {
+      util_vma_heap_finish(&pool->bo_heap);
+      util_vma_heap_init(&pool->bo_heap, POOL_HEAP_OFFSET, pool->bo.size);
+   }
+
    anv_state_stream_finish(&pool->surface_state_stream);
    anv_state_stream_init(&pool->surface_state_stream,
                          &device->surface_state_pool, 4096);
@@ -606,6 +700,37 @@ anv_descriptor_set_create(struct anv_device *device,
    if (result != VK_SUCCESS)
       return result;

+   if (layout->descriptor_buffer_size) {
+      /* Align the size to 32 so that alignment gaps don't cause extra holes
+       * in the heap which can lead to bad performance.
+       */
+      uint64_t pool_vma_offset =
+         util_vma_heap_alloc(&pool->bo_heap,
+                             ALIGN(layout->descriptor_buffer_size, 32), 32);
+      if (pool_vma_offset == 0) {
+         anv_descriptor_pool_free_set(pool, set);
+         return vk_error(VK_ERROR_FRAGMENTED_POOL);
+      }
+      assert(pool_vma_offset >= POOL_HEAP_OFFSET &&
+             pool_vma_offset - POOL_HEAP_OFFSET <= INT32_MAX);
+      set->desc_mem.offset = pool_vma_offset - POOL_HEAP_OFFSET;
+      set->desc_mem.alloc_size = layout->descriptor_buffer_size;
+      set->desc_mem.map = pool->bo.map + set->desc_mem.offset;
+
+      set->desc_surface_state = anv_descriptor_pool_alloc_state(pool);
+      anv_fill_buffer_surface_state(device, set->desc_surface_state,
+                                    ISL_FORMAT_R32G32B32A32_FLOAT,
+                                    (struct anv_address) {
+                                       .bo = &pool->bo,
+                                       .offset = set->desc_mem.offset,
+                                    },
+                                    layout->descriptor_buffer_size, 1);
+   } else {
+      set->desc_mem = ANV_STATE_NULL;
+      set->desc_surface_state = ANV_STATE_NULL;
+   }
+
+   set->pool = pool;
    set->layout = layout;
    anv_descriptor_set_layout_ref(layout);

@@ -656,6 +781,13 @@ anv_descriptor_set_destroy(struct anv_device *device,
 {
    anv_descriptor_set_layout_unref(device, set->layout);

+   if (set->desc_mem.alloc_size) {
+      util_vma_heap_free(&pool->bo_heap,
+                         (uint64_t)set->desc_mem.offset + POOL_HEAP_OFFSET,
+                         set->desc_mem.alloc_size);
+      anv_descriptor_pool_free_state(pool, set->desc_surface_state);
+   }
+
    for (uint32_t b = 0; b < set->buffer_view_count; b++)
       anv_descriptor_pool_free_state(pool, set->buffer_views[b].surface_state);

@@ -925,6 +1057,16 @@ void anv_UpdateDescriptorSets(

          for (uint32_t j = 0; j < copy->descriptorCount; j++)
             dst_desc[j] = src_desc[j];
+
+         unsigned desc_size = anv_descriptor_size(src_layout);
+         if (desc_size > 0) {
+            assert(desc_size == anv_descriptor_size(dst_layout));
+            memcpy(dst->desc_mem.map + dst_layout->descriptor_offset +
+                                       copy->dstArrayElement * desc_size,
+                   src->desc_mem.map + src_layout->descriptor_offset +
+                                       copy->srcArrayElement * desc_size,
+                   copy->descriptorCount * desc_size);
+         }
       }
    }

diff --git a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
index 89f4bb7899c..1cb3ef51b30 100644
--- a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
+++ b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
@@ -27,6 +27,8 @@
 #include "compiler/brw_nir.h"

 struct apply_pipeline_layout_state {
+   const struct anv_physical_device *pdevice;
+
    nir_shader *shader;
    nir_builder builder;

@@ -38,6 +40,9 @@ struct apply_pipeline_layout_state {
    bool uses_constants;
    uint8_t constants_offset;
    struct {
+      bool desc_buffer_used;
+      uint8_t desc_offset;
+
       BITSET_WORD *used;
       uint8_t *surface_offsets;
       uint8_t *sampler_offsets;
@@ -49,7 +54,17 @@ static void
 add_binding(struct apply_pipeline_layout_state *state,
             uint32_t set, uint32_t binding)
 {
+   const struct anv_descriptor_set_binding_layout *bind_layout =
+      &state->layout->set[set].layout->binding[binding];
+
    BITSET_SET(state->set[set].used, binding);
+
+   /* Only flag the descriptor buffer as used if there's actually data for
+    * this binding.  This lets us be lazy and call this function constantly
+    * without worrying about unnecessarily enabling the buffer.
+    */
+   if (anv_descriptor_size(bind_layout))
+      state->set[set].desc_buffer_used = true;
 }

 static void
@@ -440,6 +455,7 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
                               struct anv_pipeline_bind_map *map)
 {
    struct apply_pipeline_layout_state state = {
+      .pdevice = pdevice,
       .shader = shader,
       .layout = layout,
       .add_bounds_checks = robust_buffer_access,
@@ -464,6 +480,18 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
       get_used_bindings_block(block, &state);
    }

+   for (unsigned s = 0; s < layout->num_sets; s++) {
+      if (state.set[s].desc_buffer_used) {
+         map->surface_to_descriptor[map->surface_count] =
+            (struct anv_pipeline_binding) {
+               .set = ANV_DESCRIPTOR_SET_DESCRIPTORS,
+               .binding = s,
+            };
+         state.set[s].desc_offset = map->surface_count;
+         map->surface_count++;
+      }
+   }
+
    if (state.uses_constants) {
       state.constants_offset = map->surface_count;
       map->surface_to_descriptor[map->surface_count].set =
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 0573b99bab6..cd8414ac01f 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -1530,10 +1530,18 @@ struct anv_descriptor_set_binding_layout {
    /* Index into the descriptor set buffer views */
    int16_t buffer_view_index;

+   /* Offset into the descriptor buffer where this descriptor lives */
+   uint32_t descriptor_offset;
+
    /* Immutable samplers (or NULL if no immutable samplers) */
    struct anv_sampler **immutable_samplers;
 };

+unsigned anv_descriptor_size(const struct anv_descriptor_set_binding_layout *layout);
+
+unsigned anv_descriptor_type_size(const struct anv_physical_device *pdevice,
+                                  VkDescriptorType type);
+
 struct anv_descriptor_set_layout {
    /* Descriptor set layouts can be destroyed at almost any time */
    uint32_t ref_cnt;
@@ -1553,6 +1561,9 @@ struct anv_descriptor_set_layout {
    /* Number of dynamic offsets used by this descriptor set */
    uint16_t dynamic_offset_count;

+   /* Size of the descriptor buffer for this descriptor set */
+   uint32_t descriptor_buffer_size;
+
    /* Bindings in this descriptor set */
    struct anv_descriptor_set_binding_layout binding[0];
 };
@@ -1594,8 +1605,15 @@ struct anv_descriptor {
 };

 struct anv_descriptor_set {
+   struct anv_descriptor_pool *pool;
    struct anv_descriptor_set_layout *layout;
    uint32_t size;
+
+   /* State relative to anv_descriptor_pool::bo */
+   struct anv_state desc_mem;
+   /* Surface state for the descriptor buffer */
+   struct anv_state desc_surface_state;
+
    uint32_t buffer_view_count;
    struct anv_buffer_view *buffer_views;
    struct anv_descriptor descriptors[0];
@@ -1620,6 +1638,12 @@ struct anv_push_descriptor_set {
    /* Put this field right behind anv_descriptor_set so it fills up the
     * descriptors[0] field. */
    struct anv_descriptor descriptors[MAX_PUSH_DESCRIPTORS];
+
+   /** True if the descriptor set buffer has been referenced by a draw or
+    * dispatch command.
+    */
+   bool set_used_on_gpu;
+
    struct anv_buffer_view buffer_views[MAX_PUSH_DESCRIPTORS];
 };

@@ -1628,6 +1652,9 @@ struct anv_descriptor_pool {
    uint32_t next;
    uint32_t free_list;

+   struct anv_bo bo;
+   struct util_vma_heap bo_heap;
+
    struct anv_state_stream surface_state_stream;
    void *surface_state_free_list;

@@ -1724,6 +1751,7 @@ anv_descriptor_set_destroy(struct anv_device *device,
                            struct anv_descriptor_pool *pool,
                            struct anv_descriptor_set *set);

+#define ANV_DESCRIPTOR_SET_DESCRIPTORS      (UINT8_MAX - 3)
 #define ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS  (UINT8_MAX - 2)
 #define ANV_DESCRIPTOR_SET_SHADER_CONSTANTS (UINT8_MAX - 1)
 #define ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS UINT8_MAX
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index b5fc8be9475..7687507e6b7 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -2029,6 +2029,31 @@ dynamic_offset_for_binding(const struct anv_cmd_pipeline_state *pipe_state,
    return pipe_state->dynamic_offsets[dynamic_offset_idx];
 }

+static struct anv_address
+anv_descriptor_set_address(struct anv_cmd_buffer *cmd_buffer,
+                           struct anv_descriptor_set *set)
+{
+   if (set->pool) {
+      /* This is a normal descriptor set */
+      return (struct anv_address) {
+         .bo = &set->pool->bo,
+         .offset = set->desc_mem.offset,
+      };
+   } else {
+      /* This is a push descriptor set.  We have to flag it as used on the
+       * GPU so that the next time we push descriptors, we allocate new
+       * memory.
+       */
+      struct anv_push_descriptor_set *push_set =
+         (struct anv_push_descriptor_set *)set;
+      push_set->set_used_on_gpu = true;
+
+      return (struct anv_address) {
+         .bo = cmd_buffer->dynamic_state_stream.state_pool->block_pool.bo,
+         .offset = set->desc_mem.offset,
+      };
+   }
+}
+
 static VkResult
 emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
                    gl_shader_stage stage,
@@ -2149,6 +2174,18 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
          add_surface_reloc(cmd_buffer, surface_state,
                            cmd_buffer->state.compute.num_workgroups);
          continue;
+      } else if (binding->set == ANV_DESCRIPTOR_SET_DESCRIPTORS) {
+         /* This is a descriptor set buffer so the set index is actually
+          * given by binding->binding.  (Yes, that's confusing.)
+          */
+         struct anv_descriptor_set *set =
+            pipe_state->descriptors[binding->binding];
+         assert(set->desc_mem.alloc_size);
+         assert(set->desc_surface_state.alloc_size);
+         bt_map[s] = set->desc_surface_state.offset + state_offset;
+         add_surface_reloc(cmd_buffer, set->desc_surface_state,
+                           anv_descriptor_set_address(cmd_buffer, set));
+         continue;
       }

       const struct anv_descriptor *desc =
@@ -2518,6 +2555,21 @@ cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer,
                                DIV_ROUND_UP(constant_data_size, 32) - range->start);
                read_addr = anv_address_add(constant_data,
                                            range->start * 32);
+            } else if (binding->set == ANV_DESCRIPTOR_SET_DESCRIPTORS) {
+               /* This is a descriptor set buffer so the set index is
+                * actually given by binding->binding.  (Yes, that's
+                * confusing.)
+                */
+               struct anv_descriptor_set *set =
+                  gfx_state->base.descriptors[binding->binding];
+               struct anv_address desc_buffer_addr =
+                  anv_descriptor_set_address(cmd_buffer, set);
+               const unsigned desc_buffer_size = set->desc_mem.alloc_size;
+
+               read_len = MIN2(range->length,
+                               DIV_ROUND_UP(desc_buffer_size, 32) - range->start);
+               read_addr = anv_address_add(desc_buffer_addr,
+                                           range->start * 32);
             } else {
               const struct anv_descriptor *desc =
                  anv_descriptor_for_binding(&gfx_state->base, binding);
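For reference, the addressing that the anv_UpdateDescriptorSets() copy path above relies on can be written out as a small helper. This is an illustrative sketch, not part of the patch; the helper name anv_descriptor_set_desc_ptr() is hypothetical, but the fields it uses (desc_mem, descriptor_offset) and anv_descriptor_size() are the ones introduced by this change.

/* Sketch only: where a binding's descriptor data lands once a set has been
 * carved out of the pool BO.  Not part of the patch; helper name is made up.
 */
static void *
anv_descriptor_set_desc_ptr(struct anv_descriptor_set *set,
                            uint32_t binding, uint32_t array_element)
{
   const struct anv_descriptor_set_binding_layout *bind_layout =
      &set->layout->binding[binding];

   /* For pool-allocated sets, desc_mem.map already points at
    * pool->bo.map + desc_mem.offset, so only the binding's offset within the
    * set and the array element index remain to be applied.
    */
   return set->desc_mem.map + bind_layout->descriptor_offset +
          array_element * anv_descriptor_size(bind_layout);
}

At this point in the series anv_descriptor_data_size() still returns 0 for every descriptor class, so the buffer stays empty; the arithmetic only starts producing distinct pointers once later patches give individual descriptor types a nonzero size.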