From cb3872f2cdce8476dbb8f361a7f95f005c657c3d Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Mon, 19 Sep 2022 16:59:53 +0200 Subject: [PATCH] tu: Implement VK_EXT_descriptor_buffer Part-of: --- docs/features.txt | 2 +- src/freedreno/vulkan/tu_cmd_buffer.c | 194 ++++++++++++----- src/freedreno/vulkan/tu_cmd_buffer.h | 14 +- src/freedreno/vulkan/tu_descriptor_set.c | 253 +++++++++++++++++++++-- src/freedreno/vulkan/tu_descriptor_set.h | 2 + src/freedreno/vulkan/tu_device.c | 62 ++++++ src/freedreno/vulkan/tu_drm.c | 8 + src/freedreno/vulkan/tu_drm.h | 2 + src/freedreno/vulkan/tu_kgsl.c | 5 + 9 files changed, 465 insertions(+), 77 deletions(-) diff --git a/docs/features.txt b/docs/features.txt index d5233eb5010..277ca841a84 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -548,7 +548,7 @@ Khronos extensions that are not part of any Vulkan version: VK_EXT_depth_clip_control DONE (anv, lvp, radv, tu, v3dv, vn) VK_EXT_depth_clip_enable DONE (anv, lvp, radv, tu, vn) VK_EXT_depth_range_unrestricted DONE (radv, lvp) - VK_EXT_descriptor_buffer DONE (radv) + VK_EXT_descriptor_buffer DONE (radv, tu) VK_EXT_discard_rectangles DONE (radv) VK_EXT_display_control DONE (anv, radv, tu) VK_EXT_extended_dynamic_state3 DONE (lvp, radv, tu) diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index 7962f5ed740..458beb2e278 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -165,6 +165,12 @@ tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer, tu6_emit_event_write(cmd_buffer, cs, CACHE_FLUSH_TS); if (flushes & TU_CMD_FLAG_CACHE_INVALIDATE) tu6_emit_event_write(cmd_buffer, cs, CACHE_INVALIDATE); + if (flushes & TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE) { + tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD( + .gfx_bindless = 0x1f, + .cs_bindless = 0x1f, + )); + } if (flushes & TU_CMD_FLAG_WAIT_MEM_WRITES) tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); if ((flushes & TU_CMD_FLAG_WAIT_FOR_IDLE) || @@ -2061,6 +2067,64 @@ tu_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, cmd->state.index_size = index_size; } +static void +tu6_emit_descriptor_sets(struct tu_cmd_buffer *cmd, + VkPipelineBindPoint bind_point) +{ + struct tu_descriptor_state *descriptors_state = + tu_get_descriptors_state(cmd, bind_point); + uint32_t sp_bindless_base_reg, hlsq_bindless_base_reg, hlsq_invalidate_value; + struct tu_cs *cs, state_cs; + + if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) { + sp_bindless_base_reg = REG_A6XX_SP_BINDLESS_BASE(0); + hlsq_bindless_base_reg = REG_A6XX_HLSQ_BINDLESS_BASE(0); + hlsq_invalidate_value = A6XX_HLSQ_INVALIDATE_CMD_GFX_BINDLESS(0x1f); + + cmd->state.desc_sets = + tu_cs_draw_state(&cmd->sub_cs, &state_cs, + 4 + 4 * descriptors_state->max_sets_bound + + (descriptors_state->dynamic_bound ? 
6 : 0)); + cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS_LOAD; + cs = &state_cs; + } else { + assert(bind_point == VK_PIPELINE_BIND_POINT_COMPUTE); + + sp_bindless_base_reg = REG_A6XX_SP_CS_BINDLESS_BASE(0); + hlsq_bindless_base_reg = REG_A6XX_HLSQ_CS_BINDLESS_BASE(0); + hlsq_invalidate_value = A6XX_HLSQ_INVALIDATE_CMD_CS_BINDLESS(0x1f); + + cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD; + cs = &cmd->cs; + } + + tu_cs_emit_pkt4(cs, sp_bindless_base_reg, 2 * descriptors_state->max_sets_bound); + tu_cs_emit_array(cs, (const uint32_t*)descriptors_state->set_iova, 2 * descriptors_state->max_sets_bound); + tu_cs_emit_pkt4(cs, hlsq_bindless_base_reg, 2 * descriptors_state->max_sets_bound); + tu_cs_emit_array(cs, (const uint32_t*)descriptors_state->set_iova, 2 * descriptors_state->max_sets_bound); + + /* Dynamic descriptors get the last descriptor set. */ + if (descriptors_state->dynamic_bound) { + tu_cs_emit_pkt4(cs, sp_bindless_base_reg + 4 * 2, 2); + tu_cs_emit_qw(cs, descriptors_state->set_iova[MAX_SETS]); + tu_cs_emit_pkt4(cs, hlsq_bindless_base_reg + 4 * 2, 2); + tu_cs_emit_qw(cs, descriptors_state->set_iova[MAX_SETS]); + } + + tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(.dword = hlsq_invalidate_value)); + + if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) { + assert(cs->cur == cs->end); /* validate draw state size */ + /* note: this also avoids emitting draw states before renderpass clears, + * which may use the 3D clear path (for MSAA cases) + */ + if (!(cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) { + tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3); + tu_cs_emit_draw_state(&cmd->draw_cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets); + } + } +} + VKAPI_ATTR void VKAPI_CALL tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, @@ -2086,6 +2150,7 @@ tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, TU_FROM_HANDLE(tu_descriptor_set, set, pDescriptorSets[i]); descriptors_state->sets[idx] = set; + descriptors_state->set_iova[idx] = set->va | 3; if (!set) continue; @@ -2138,17 +2203,6 @@ tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, } assert(dyn_idx == dynamicOffsetCount); - uint32_t sp_bindless_base_reg, hlsq_bindless_base_reg, hlsq_invalidate_value; - uint64_t addr[MAX_SETS] = {}; - uint64_t dynamic_addr = 0; - struct tu_cs *cs, state_cs; - - for (uint32_t i = 0; i < descriptors_state->max_sets_bound; i++) { - struct tu_descriptor_set *set = descriptors_state->sets[i]; - if (set) - addr[i] = set->va | 3; - } - if (layout->dynamic_offset_size) { /* allocate and fill out dynamic descriptor set */ struct tu_cs_memory dynamic_desc_set; @@ -2162,57 +2216,79 @@ tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, memcpy(dynamic_desc_set.map, descriptors_state->dynamic_descriptors, layout->dynamic_offset_size); - dynamic_addr = dynamic_desc_set.iova | 3; + descriptors_state->set_iova[MAX_SETS] = dynamic_desc_set.iova | 3; descriptors_state->dynamic_bound = true; } - if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) { - sp_bindless_base_reg = REG_A6XX_SP_BINDLESS_BASE(0); - hlsq_bindless_base_reg = REG_A6XX_HLSQ_BINDLESS_BASE(0); - hlsq_invalidate_value = A6XX_HLSQ_INVALIDATE_CMD_GFX_BINDLESS(0x1f); + tu6_emit_descriptor_sets(cmd, pipelineBindPoint); +} - cmd->state.desc_sets = - tu_cs_draw_state(&cmd->sub_cs, &state_cs, - 4 + 4 * descriptors_state->max_sets_bound + - (descriptors_state->dynamic_bound ? 
6 : 0)); - cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS_LOAD; - cs = &state_cs; - } else { - assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE); +VKAPI_ATTR void VKAPI_CALL +tu_CmdBindDescriptorBuffersEXT( + VkCommandBuffer commandBuffer, + uint32_t bufferCount, + const VkDescriptorBufferBindingInfoEXT *pBindingInfos) +{ + TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - sp_bindless_base_reg = REG_A6XX_SP_CS_BINDLESS_BASE(0); - hlsq_bindless_base_reg = REG_A6XX_HLSQ_CS_BINDLESS_BASE(0); - hlsq_invalidate_value = A6XX_HLSQ_INVALIDATE_CMD_CS_BINDLESS(0x1f); + for (unsigned i = 0; i < bufferCount; i++) + cmd->state.descriptor_buffer_iova[i] = pBindingInfos[i].address; +} - cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD; - cs = &cmd->cs; +VKAPI_ATTR void VKAPI_CALL +tu_CmdSetDescriptorBufferOffsetsEXT( + VkCommandBuffer commandBuffer, + VkPipelineBindPoint pipelineBindPoint, + VkPipelineLayout _layout, + uint32_t firstSet, + uint32_t setCount, + const uint32_t *pBufferIndices, + const VkDeviceSize *pOffsets) +{ + TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + TU_FROM_HANDLE(tu_pipeline_layout, layout, _layout); + + struct tu_descriptor_state *descriptors_state = + tu_get_descriptors_state(cmd, pipelineBindPoint); + + descriptors_state->max_sets_bound = + MAX2(descriptors_state->max_sets_bound, firstSet + setCount); + + for (unsigned i = 0; i < setCount; ++i) { + unsigned idx = i + firstSet; + struct tu_descriptor_set_layout *set_layout = layout->set[idx].layout; + + descriptors_state->set_iova[idx] = + (cmd->state.descriptor_buffer_iova[pBufferIndices[i]] + pOffsets[i]) | 3; + + if (set_layout->has_inline_uniforms) + cmd->state.dirty |= TU_CMD_DIRTY_SHADER_CONSTS; } - tu_cs_emit_pkt4(cs, sp_bindless_base_reg, 2 * descriptors_state->max_sets_bound); - tu_cs_emit_array(cs, (const uint32_t*) addr, 2 * descriptors_state->max_sets_bound); - tu_cs_emit_pkt4(cs, hlsq_bindless_base_reg, 2 * descriptors_state->max_sets_bound); - tu_cs_emit_array(cs, (const uint32_t*) addr, 2 * descriptors_state->max_sets_bound); + tu6_emit_descriptor_sets(cmd, pipelineBindPoint); +} - /* Dynamic descriptors get the last descriptor set. 
*/ - if (descriptors_state->dynamic_bound) { - tu_cs_emit_pkt4(cs, sp_bindless_base_reg + 4 * 2, 2); - tu_cs_emit_qw(cs, dynamic_addr); - tu_cs_emit_pkt4(cs, hlsq_bindless_base_reg + 4 * 2, 2); - tu_cs_emit_qw(cs, dynamic_addr); - } +VKAPI_ATTR void VKAPI_CALL +tu_CmdBindDescriptorBufferEmbeddedSamplersEXT( + VkCommandBuffer commandBuffer, + VkPipelineBindPoint pipelineBindPoint, + VkPipelineLayout _layout, + uint32_t set) +{ + TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + TU_FROM_HANDLE(tu_pipeline_layout, layout, _layout); - tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(.dword = hlsq_invalidate_value)); + struct tu_descriptor_set_layout *set_layout = layout->set[set].layout; - if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) { - assert(cs->cur == cs->end); /* validate draw state size */ - /* note: this also avoids emitting draw states before renderpass clears, - * which may use the 3D clear path (for MSAA cases) - */ - if (!(cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) { - tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3); - tu_cs_emit_draw_state(&cmd->draw_cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets); - } - } + struct tu_descriptor_state *descriptors_state = + tu_get_descriptors_state(cmd, pipelineBindPoint); + + descriptors_state->max_sets_bound = + MAX2(descriptors_state->max_sets_bound, set + 1); + + descriptors_state->set_iova[set] = set_layout->embedded_samplers->iova | 3; + + tu6_emit_descriptor_sets(cmd, pipelineBindPoint); } static enum VkResult @@ -3489,6 +3565,10 @@ tu_flush_for_access(struct tu_cache_state *cache, DST_INCOHERENT_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR) DST_INCOHERENT_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH) + if (dst_mask & TU_ACCESS_BINDLESS_DESCRIPTOR_READ) { + flush_bits |= TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE; + } + #undef DST_INCOHERENT_FLUSH cache->flush_bits |= flush_bits; @@ -3592,6 +3672,12 @@ vk2tu_access(VkAccessFlags2 flags, VkPipelineStageFlags2 stages, bool image_only SHADER_STAGES)) mask |= TU_ACCESS_UCHE_READ; + if (gfx_read_access(flags, stages, + VK_ACCESS_2_DESCRIPTOR_BUFFER_READ_BIT_EXT, + SHADER_STAGES)) { + mask |= TU_ACCESS_UCHE_READ | TU_ACCESS_BINDLESS_DESCRIPTOR_READ; + } + if (gfx_write_access(flags, stages, VK_ACCESS_2_SHADER_WRITE_BIT | VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT, @@ -4492,6 +4578,8 @@ tu6_emit_user_consts(struct tu_cs *cs, for (unsigned i = 0; i < link->tu_const_state.num_inline_ubos; i++) { const struct tu_inline_ubo *ubo = &link->tu_const_state.ubos[i]; + uint64_t va = descriptors->set_iova[ubo->base] & ~0x3f; + tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), ubo->push_address ? 
7 : 3); tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(ubo->const_offset_vec4) | CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | @@ -4501,11 +4589,11 @@ tu6_emit_user_consts(struct tu_cs *cs, if (ubo->push_address) { tu_cs_emit(cs, 0); tu_cs_emit(cs, 0); - tu_cs_emit_qw(cs, descriptors->sets[ubo->base]->va + ubo->offset); + tu_cs_emit_qw(cs, va + ubo->offset); tu_cs_emit(cs, 0); tu_cs_emit(cs, 0); } else { - tu_cs_emit_qw(cs, descriptors->sets[ubo->base]->va + ubo->offset); + tu_cs_emit_qw(cs, va + ubo->offset); } } } diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h index 225da158fbc..e01d032e376 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.h +++ b/src/freedreno/vulkan/tu_cmd_buffer.h @@ -46,6 +46,7 @@ struct tu_descriptor_state struct tu_descriptor_set *sets[MAX_SETS]; struct tu_descriptor_set push_set; uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE]; + uint64_t set_iova[MAX_SETS + 1]; uint32_t max_sets_bound; bool dynamic_bound; }; @@ -120,13 +121,20 @@ enum tu_cmd_access_mask { */ TU_ACCESS_CP_WRITE = 1 << 12, + /* Descriptors are read through UCHE but are also prefetched via + * CP_LOAD_STATE6 and the prefetched descriptors need to be invalidated + * when they change. + */ + TU_ACCESS_BINDLESS_DESCRIPTOR_READ = 1 << 13, + TU_ACCESS_READ = TU_ACCESS_UCHE_READ | TU_ACCESS_CCU_COLOR_READ | TU_ACCESS_CCU_DEPTH_READ | TU_ACCESS_CCU_COLOR_INCOHERENT_READ | TU_ACCESS_CCU_DEPTH_INCOHERENT_READ | - TU_ACCESS_SYSMEM_READ, + TU_ACCESS_SYSMEM_READ | + TU_ACCESS_BINDLESS_DESCRIPTOR_READ, TU_ACCESS_WRITE = TU_ACCESS_UCHE_WRITE | @@ -203,6 +211,7 @@ enum tu_cmd_flush_bits { TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6, TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7, TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8, + TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE = 1 << 9, TU_CMD_FLAG_ALL_FLUSH = TU_CMD_FLAG_CCU_FLUSH_DEPTH | @@ -217,6 +226,7 @@ enum tu_cmd_flush_bits { TU_CMD_FLAG_CCU_INVALIDATE_DEPTH | TU_CMD_FLAG_CCU_INVALIDATE_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE | + TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE | /* Treat CP_WAIT_FOR_ME as a "cache" that needs to be invalidated when a * a command that needs CP_WAIT_FOR_ME is executed. 
This means we may * insert an extra WAIT_FOR_ME before an indirect command requiring it @@ -527,6 +537,8 @@ struct tu_cmd_state struct tu_vs_params last_vs_params; struct tu_primitive_params last_prim_params; + + uint64_t descriptor_buffer_iova[MAX_SETS]; }; struct tu_cmd_buffer diff --git a/src/freedreno/vulkan/tu_descriptor_set.c b/src/freedreno/vulkan/tu_descriptor_set.c index 6e32127878c..80ca1abdd0e 100644 --- a/src/freedreno/vulkan/tu_descriptor_set.c +++ b/src/freedreno/vulkan/tu_descriptor_set.c @@ -27,6 +27,7 @@ #include "tu_device.h" #include "tu_image.h" +#include "tu_formats.h" static inline uint8_t * pool_base(struct tu_descriptor_pool *pool) @@ -93,6 +94,19 @@ mutable_descriptor_size(struct tu_device *dev, return max_size; } +static void +tu_descriptor_set_layout_destroy(struct vk_device *vk_dev, + struct vk_descriptor_set_layout *vk_layout) +{ + struct tu_device *dev = container_of(vk_dev, struct tu_device, vk); + struct tu_descriptor_set_layout *layout = + container_of(vk_layout, struct tu_descriptor_set_layout, vk); + + if (layout->embedded_samplers) + tu_bo_finish(dev, layout->embedded_samplers); + vk_descriptor_set_layout_destroy(vk_dev, vk_layout); +} + VKAPI_ATTR VkResult VKAPI_CALL tu_CreateDescriptorSetLayout( VkDevice _device, @@ -149,6 +163,7 @@ tu_CreateDescriptorSetLayout( return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); set_layout->flags = pCreateInfo->flags; + set_layout->vk.destroy = tu_descriptor_set_layout_destroy; /* We just allocate all the immutable samplers at the end of the struct */ struct tu_sampler *samplers = (void*) &set_layout->binding[num_bindings]; @@ -256,6 +271,38 @@ tu_CreateDescriptorSetLayout( set_layout->dynamic_offset_size = dynamic_offset_size; + if (pCreateInfo->flags & + VK_DESCRIPTOR_SET_LAYOUT_CREATE_EMBEDDED_IMMUTABLE_SAMPLERS_BIT_EXT) { + result = tu_bo_init_new(device, &set_layout->embedded_samplers, + set_layout->size, TU_BO_ALLOC_ALLOW_DUMP, + "embedded samplers"); + if (result != VK_SUCCESS) { + vk_object_free(&device->vk, pAllocator, set_layout); + return vk_error(device, result); + } + + result = tu_bo_map(device, set_layout->embedded_samplers); + if (result != VK_SUCCESS) { + tu_bo_finish(device, set_layout->embedded_samplers); + vk_object_free(&device->vk, pAllocator, set_layout); + return vk_error(device, result); + } + + char *map = set_layout->embedded_samplers->map; + for (unsigned i = 0; i < set_layout->binding_count; i++) { + if (!set_layout->binding[i].immutable_samplers_offset) + continue; + + unsigned offset = set_layout->binding[i].offset; + const struct tu_sampler *sampler = + (const struct tu_sampler *)((const char *)set_layout + + set_layout->binding[i].immutable_samplers_offset); + assert(set_layout->binding[i].array_size == 1); + memcpy(map + offset, sampler->descriptor, + sizeof(sampler->descriptor)); + } + } + *pSetLayout = tu_descriptor_set_layout_to_handle(set_layout); return VK_SUCCESS; @@ -360,6 +407,30 @@ out: pSupport->supported = supported; } +VKAPI_ATTR void VKAPI_CALL +tu_GetDescriptorSetLayoutSizeEXT( + VkDevice _device, + VkDescriptorSetLayout _layout, + VkDeviceSize *pLayoutSizeInBytes) +{ + TU_FROM_HANDLE(tu_descriptor_set_layout, layout, _layout); + + *pLayoutSizeInBytes = layout->size; +} + +VKAPI_ATTR void VKAPI_CALL +tu_GetDescriptorSetLayoutBindingOffsetEXT( + VkDevice _device, + VkDescriptorSetLayout _layout, + uint32_t binding, + VkDeviceSize *pOffset) +{ + TU_FROM_HANDLE(tu_descriptor_set_layout, layout, _layout); + + assert(binding < layout->binding_count); + *pOffset = 
layout->binding[binding].offset; +} + /* Note: we must hash any values used in tu_lower_io(). */ #define SHA1_UPDATE_VALUE(ctx, x) _mesa_sha1_update(ctx, &(x), sizeof(x)); @@ -905,6 +976,21 @@ tu_FreeDescriptorSets(VkDevice _device, return VK_SUCCESS; } +static void +write_texel_buffer_descriptor_addr(uint32_t *dst, + const VkDescriptorAddressInfoEXT *buffer_info) +{ + if (!buffer_info || buffer_info->address == 0) { + memset(dst, 0, A6XX_TEX_CONST_DWORDS * sizeof(uint32_t)); + } else { + uint8_t swiz[4] = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, + PIPE_SWIZZLE_W }; + fdl6_buffer_view_init(dst, + tu_vk_format_to_pipe_format(buffer_info->format), + swiz, buffer_info->address, buffer_info->range); + } +} + static void write_texel_buffer_descriptor(uint32_t *dst, const VkBufferView buffer_view) { @@ -917,10 +1003,24 @@ write_texel_buffer_descriptor(uint32_t *dst, const VkBufferView buffer_view) } } +static VkDescriptorAddressInfoEXT +buffer_info_to_address(const VkDescriptorBufferInfo *buffer_info) +{ + TU_FROM_HANDLE(tu_buffer, buffer, buffer_info->buffer); + + uint32_t range = buffer ? vk_buffer_range(&buffer->vk, buffer_info->offset, buffer_info->range) : 0; + uint64_t va = buffer ? buffer->iova + buffer_info->offset : 0; + + return (VkDescriptorAddressInfoEXT) { + .address = va, + .range = range, + }; +} + static void -write_buffer_descriptor(const struct tu_device *device, - uint32_t *dst, - const VkDescriptorBufferInfo *buffer_info) +write_buffer_descriptor_addr(const struct tu_device *device, + uint32_t *dst, + const VkDescriptorAddressInfoEXT *buffer_info) { bool storage_16bit = device->physical_device->info->a6xx.storage_16bit; /* newer a6xx allows using 16-bit descriptor for both 16-bit and 32-bit @@ -928,16 +1028,14 @@ write_buffer_descriptor(const struct tu_device *device, * isam. */ unsigned descriptors = storage_16bit ? 2 : 1; - if (buffer_info->buffer == VK_NULL_HANDLE) { + + if (!buffer_info || buffer_info->address == 0) { memset(dst, 0, descriptors * A6XX_TEX_CONST_DWORDS * sizeof(uint32_t)); return; } - TU_FROM_HANDLE(tu_buffer, buffer, buffer_info->buffer); - - assert((buffer_info->offset & 63) == 0); /* minStorageBufferOffsetAlignment */ - uint64_t va = buffer->iova + buffer_info->offset; - uint32_t range = vk_buffer_range(&buffer->vk, buffer_info->offset, buffer_info->range); + uint64_t va = buffer_info->address; + uint32_t range = buffer_info->range; for (unsigned i = 0; i < descriptors; i++) { if (storage_16bit && i == 0) { @@ -959,30 +1057,43 @@ write_buffer_descriptor(const struct tu_device *device, } static void -write_ubo_descriptor(uint32_t *dst, const VkDescriptorBufferInfo *buffer_info) +write_buffer_descriptor(const struct tu_device *device, + uint32_t *dst, + const VkDescriptorBufferInfo *buffer_info) { - if (buffer_info->buffer == VK_NULL_HANDLE) { + VkDescriptorAddressInfoEXT addr = buffer_info_to_address(buffer_info); + write_buffer_descriptor_addr(device, dst, &addr); +} + +static void +write_ubo_descriptor_addr(uint32_t *dst, + const VkDescriptorAddressInfoEXT *buffer_info) +{ + if (!buffer_info) { dst[0] = dst[1] = 0; return; } - TU_FROM_HANDLE(tu_buffer, buffer, buffer_info->buffer); - - uint32_t range = vk_buffer_range(&buffer->vk, buffer_info->offset, buffer_info->range); + uint64_t va = buffer_info->address; /* The HW range is in vec4 units */ - range = ALIGN_POT(range, 16) / 16; - uint64_t va = buffer->iova + buffer_info->offset; - + uint32_t range = va ? 
DIV_ROUND_UP(buffer_info->range, 16) : 0; dst[0] = A6XX_UBO_0_BASE_LO(va); dst[1] = A6XX_UBO_1_BASE_HI(va >> 32) | A6XX_UBO_1_SIZE(range); } +static void +write_ubo_descriptor(uint32_t *dst, const VkDescriptorBufferInfo *buffer_info) +{ + VkDescriptorAddressInfoEXT addr = buffer_info_to_address(buffer_info); + write_ubo_descriptor_addr(dst, &addr); +} + static void write_image_descriptor(uint32_t *dst, VkDescriptorType descriptor_type, const VkDescriptorImageInfo *image_info) { - if (image_info->imageView == VK_NULL_HANDLE) { + if (!image_info || image_info->imageView == VK_NULL_HANDLE) { memset(dst, 0, A6XX_TEX_CONST_DWORDS * sizeof(uint32_t)); return; } @@ -1006,14 +1117,15 @@ write_combined_image_sampler_descriptor(uint32_t *dst, /* copy over sampler state */ if (has_sampler) { TU_FROM_HANDLE(tu_sampler, sampler, image_info->sampler); + memcpy(dst + A6XX_TEX_CONST_DWORDS, sampler->descriptor, sizeof(sampler->descriptor)); } } static void -write_sampler_descriptor(uint32_t *dst, const VkDescriptorImageInfo *image_info) +write_sampler_descriptor(uint32_t *dst, VkSampler _sampler) { - TU_FROM_HANDLE(tu_sampler, sampler, image_info->sampler); + TU_FROM_HANDLE(tu_sampler, sampler, _sampler); memcpy(dst, sampler->descriptor, sizeof(sampler->descriptor)); } @@ -1025,6 +1137,103 @@ write_sampler_push(uint32_t *dst, const struct tu_sampler *sampler) memcpy(dst, sampler->descriptor, sizeof(sampler->descriptor)); } +VKAPI_ATTR void VKAPI_CALL +tu_GetDescriptorEXT( + VkDevice _device, + const VkDescriptorGetInfoEXT *pDescriptorInfo, + size_t dataSize, + void *pDescriptor) +{ + TU_FROM_HANDLE(tu_device, device, _device); + + switch (pDescriptorInfo->type) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + write_ubo_descriptor_addr(pDescriptor, pDescriptorInfo->data.pUniformBuffer); + break; + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + write_buffer_descriptor_addr(device, pDescriptor, pDescriptorInfo->data.pStorageBuffer); + break; + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + write_texel_buffer_descriptor_addr(pDescriptor, pDescriptorInfo->data.pUniformTexelBuffer); + break; + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + write_texel_buffer_descriptor_addr(pDescriptor, pDescriptorInfo->data.pStorageTexelBuffer); + break; + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + write_image_descriptor(pDescriptor, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, + pDescriptorInfo->data.pSampledImage); + break; + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + write_image_descriptor(pDescriptor, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + pDescriptorInfo->data.pStorageImage); + break; + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + write_combined_image_sampler_descriptor(pDescriptor, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + pDescriptorInfo->data.pCombinedImageSampler, + true); + break; + case VK_DESCRIPTOR_TYPE_SAMPLER: + write_sampler_descriptor(pDescriptor, *pDescriptorInfo->data.pSampler); + break; + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + /* nothing in descriptor set - framebuffer state is used instead */ + if (unlikely(device->instance->debug_flags & TU_DEBUG_DYNAMIC)) { + write_image_descriptor(pDescriptor, VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT, + pDescriptorInfo->data.pInputAttachmentImage); + } + break; + default: + unreachable("unimplemented descriptor type"); + break; + } +} + +/* We don't have any mutable state in buffers, images, image views, or + * samplers, so we shouldn't need to save/restore anything to get the same + * descriptor back as long as the user uses the same iova. 
+ */ + +VKAPI_ATTR VkResult VKAPI_CALL +tu_GetBufferOpaqueCaptureDescriptorDataEXT(VkDevice device, + const VkBufferCaptureDescriptorDataInfoEXT *pInfo, + void *pData) +{ + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +tu_GetImageOpaqueCaptureDescriptorDataEXT(VkDevice device, + const VkImageCaptureDescriptorDataInfoEXT *pInfo, + void *pData) +{ + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +tu_GetImageViewOpaqueCaptureDescriptorDataEXT(VkDevice device, + const VkImageViewCaptureDescriptorDataInfoEXT *pInfo, + void *pData) +{ + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +tu_GetSamplerOpaqueCaptureDescriptorDataEXT(VkDevice _device, + const VkSamplerCaptureDescriptorDataInfoEXT *pInfo, + void *pData) +{ + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +tu_GetAccelerationStructureOpaqueCaptureDescriptorDataEXT(VkDevice device, + const VkAccelerationStructureCaptureDescriptorDataInfoEXT *pInfo, + void *pData) +{ + return VK_SUCCESS; +} + void tu_update_descriptor_sets(const struct tu_device *device, VkDescriptorSet dstSetOverride, @@ -1124,7 +1333,7 @@ tu_update_descriptor_sets(const struct tu_device *device, break; case VK_DESCRIPTOR_TYPE_SAMPLER: if (!binding_layout->immutable_samplers_offset) - write_sampler_descriptor(ptr, writeset->pImageInfo + j); + write_sampler_descriptor(ptr, writeset->pImageInfo[j].sampler); else if (copy_immutable_samplers) write_sampler_push(ptr, &samplers[writeset->dstArrayElement + j]); break; @@ -1453,7 +1662,7 @@ tu_update_descriptor_set_with_template( break; case VK_DESCRIPTOR_TYPE_SAMPLER: if (templ->entry[i].has_sampler) - write_sampler_descriptor(ptr, src); + write_sampler_descriptor(ptr, ((const VkDescriptorImageInfo *)src)->sampler); else if (samplers) write_sampler_push(ptr, &samplers[j]); break; diff --git a/src/freedreno/vulkan/tu_descriptor_set.h b/src/freedreno/vulkan/tu_descriptor_set.h index 270afcfdc30..23c75dfef80 100644 --- a/src/freedreno/vulkan/tu_descriptor_set.h +++ b/src/freedreno/vulkan/tu_descriptor_set.h @@ -75,6 +75,8 @@ struct tu_descriptor_set_layout bool has_variable_descriptors; bool has_inline_uniforms; + struct tu_bo *embedded_samplers; + /* Bindings in this descriptor set */ struct tu_descriptor_set_binding_layout binding[0]; }; diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c index 15e4528e590..c603b991d68 100644 --- a/src/freedreno/vulkan/tu_device.c +++ b/src/freedreno/vulkan/tu_device.c @@ -254,6 +254,7 @@ get_device_extensions(const struct tu_physical_device *device, .KHR_pipeline_library = true, .EXT_graphics_pipeline_library = true, .EXT_post_depth_coverage = true, + .EXT_descriptor_buffer = true, }; } @@ -980,6 +981,15 @@ tu_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, features->presentWait = pdevice->vk.supported_extensions.KHR_present_wait; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_BUFFER_FEATURES_EXT: { + VkPhysicalDeviceDescriptorBufferFeaturesEXT *features = + (VkPhysicalDeviceDescriptorBufferFeaturesEXT *)ext; + features->descriptorBuffer = true; + features->descriptorBufferCaptureReplay = pdevice->has_set_iova; + features->descriptorBufferImageLayoutIgnored = true; + features->descriptorBufferPushDescriptors = true; + break; + } default: break; @@ -1451,6 +1461,52 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, properties->dynamicPrimitiveTopologyUnrestricted = true; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_BUFFER_PROPERTIES_EXT: { + 
VkPhysicalDeviceDescriptorBufferPropertiesEXT *properties = + (VkPhysicalDeviceDescriptorBufferPropertiesEXT *)ext; + properties->combinedImageSamplerDescriptorSingleArray = true; + properties->bufferlessPushDescriptors = true; + properties->allowSamplerImageViewPostSubmitCreation = true; + properties->descriptorBufferOffsetAlignment = A6XX_TEX_CONST_DWORDS * 4; + properties->maxDescriptorBufferBindings = MAX_SETS; + properties->maxResourceDescriptorBufferBindings = MAX_SETS; + properties->maxSamplerDescriptorBufferBindings = MAX_SETS; + properties->maxEmbeddedImmutableSamplerBindings = MAX_SETS; + properties->maxEmbeddedImmutableSamplers = max_descriptor_set_size; + properties->bufferCaptureReplayDescriptorDataSize = 0; + properties->imageCaptureReplayDescriptorDataSize = 0; + properties->imageViewCaptureReplayDescriptorDataSize = 0; + properties->samplerCaptureReplayDescriptorDataSize = 0; + properties->accelerationStructureCaptureReplayDescriptorDataSize = 0; + + /* Note: these sizes must match descriptor_size() */ + properties->samplerDescriptorSize = A6XX_TEX_CONST_DWORDS * 4; + properties->combinedImageSamplerDescriptorSize = 2 * A6XX_TEX_CONST_DWORDS * 4; + properties->sampledImageDescriptorSize = A6XX_TEX_CONST_DWORDS * 4; + properties->storageImageDescriptorSize = A6XX_TEX_CONST_DWORDS * 4; + properties->uniformTexelBufferDescriptorSize = A6XX_TEX_CONST_DWORDS * 4; + properties->robustUniformTexelBufferDescriptorSize = A6XX_TEX_CONST_DWORDS * 4; + properties->storageTexelBufferDescriptorSize = A6XX_TEX_CONST_DWORDS * 4; + properties->robustStorageTexelBufferDescriptorSize = A6XX_TEX_CONST_DWORDS * 4; + properties->uniformBufferDescriptorSize = A6XX_TEX_CONST_DWORDS * 4; + properties->robustUniformBufferDescriptorSize = A6XX_TEX_CONST_DWORDS * 4; + properties->storageBufferDescriptorSize = + pdevice->info->a6xx.storage_16bit ? + 2 * A6XX_TEX_CONST_DWORDS * 4 : + A6XX_TEX_CONST_DWORDS * 4; + properties->robustStorageBufferDescriptorSize = + properties->storageBufferDescriptorSize; + properties->inputAttachmentDescriptorSize = + (pdevice->instance->debug_flags & TU_DEBUG_DYNAMIC) ? 
+ A6XX_TEX_CONST_DWORDS * 4 : 0; + + properties->maxSamplerDescriptorBufferRange = ~0ull; + properties->maxResourceDescriptorBufferRange = ~0ull; + properties->samplerDescriptorBufferAddressSpaceSize = ~0ull; + properties->resourceDescriptorBufferAddressSpaceSize = ~0ull; + properties->descriptorBufferAddressSpaceSize = ~0ull; + break; + } default: break; } @@ -2691,6 +2747,8 @@ tu_BindBufferMemory2(VkDevice device, uint32_t bindInfoCount, const VkBindBufferMemoryInfo *pBindInfos) { + TU_FROM_HANDLE(tu_device, dev, device); + for (uint32_t i = 0; i < bindInfoCount; ++i) { TU_FROM_HANDLE(tu_device_memory, mem, pBindInfos[i].memory); TU_FROM_HANDLE(tu_buffer, buffer, pBindInfos[i].buffer); @@ -2698,6 +2756,10 @@ tu_BindBufferMemory2(VkDevice device, if (mem) { buffer->bo = mem->bo; buffer->iova = mem->bo->iova + pBindInfos[i].memoryOffset; + if (buffer->vk.usage & + (VK_BUFFER_USAGE_SAMPLER_DESCRIPTOR_BUFFER_BIT_EXT | + VK_BUFFER_USAGE_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT)) + tu_bo_allow_dump(dev, mem->bo); } else { buffer->bo = NULL; } diff --git a/src/freedreno/vulkan/tu_drm.c b/src/freedreno/vulkan/tu_drm.c index 872df710a82..250ff53a954 100644 --- a/src/freedreno/vulkan/tu_drm.c +++ b/src/freedreno/vulkan/tu_drm.c @@ -514,6 +514,14 @@ tu_bo_map(struct tu_device *dev, struct tu_bo *bo) return VK_SUCCESS; } +void +tu_bo_allow_dump(struct tu_device *dev, struct tu_bo *bo) +{ + mtx_lock(&dev->bo_mutex); + dev->bo_list[bo->bo_list_idx].flags |= MSM_SUBMIT_BO_DUMP; + mtx_unlock(&dev->bo_mutex); +} + void tu_bo_finish(struct tu_device *dev, struct tu_bo *bo) { diff --git a/src/freedreno/vulkan/tu_drm.h b/src/freedreno/vulkan/tu_drm.h index c376595354e..e65e1425b5b 100644 --- a/src/freedreno/vulkan/tu_drm.h +++ b/src/freedreno/vulkan/tu_drm.h @@ -97,6 +97,8 @@ tu_bo_finish(struct tu_device *dev, struct tu_bo *bo); VkResult tu_bo_map(struct tu_device *dev, struct tu_bo *bo); +void tu_bo_allow_dump(struct tu_device *dev, struct tu_bo *bo); + static inline struct tu_bo * tu_bo_get_ref(struct tu_bo *bo) { diff --git a/src/freedreno/vulkan/tu_kgsl.c b/src/freedreno/vulkan/tu_kgsl.c index ec1ae05b6b4..08f82f69bea 100644 --- a/src/freedreno/vulkan/tu_kgsl.c +++ b/src/freedreno/vulkan/tu_kgsl.c @@ -184,6 +184,11 @@ tu_bo_map(struct tu_device *dev, struct tu_bo *bo) return VK_SUCCESS; } +void +tu_bo_allow_dump(struct tu_device *dev, struct tu_bo *bo) +{ +} + void tu_bo_finish(struct tu_device *dev, struct tu_bo *bo) {
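
For reference, a minimal, hypothetical application-side sketch (not part of the patch) of how the new entry points get exercised: query the layout size/binding offset, write one uniform-buffer descriptor into a host-mapped descriptor buffer with vkGetDescriptorEXT, then bind the buffer and point set 0 at it. All handles, the mapped pointer, and the device addresses are assumed to already exist (the buffer created with VK_BUFFER_USAGE_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT and VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, memory allocated with the device-address flag), and the EXT entry points are assumed to be resolved via vkGetDeviceProcAddr or a loader such as volk so they can be called directly.

#include <vulkan/vulkan.h>

static void
bind_one_ubo_via_descriptor_buffer(VkPhysicalDevice phys_dev,
                                   VkDevice device,
                                   VkCommandBuffer cmd_buf,
                                   VkDescriptorSetLayout set_layout,
                                   VkPipelineLayout pipeline_layout,
                                   void *desc_buf_map,            /* host mapping of the descriptor buffer */
                                   VkDeviceAddress desc_buf_addr, /* its device address */
                                   VkDeviceAddress ubo_addr,
                                   VkDeviceSize ubo_range)
{
   VkPhysicalDeviceDescriptorBufferPropertiesEXT db_props = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_BUFFER_PROPERTIES_EXT,
   };
   VkPhysicalDeviceProperties2 props2 = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2,
      .pNext = &db_props,
   };
   vkGetPhysicalDeviceProperties2(phys_dev, &props2);

   /* These calls land in tu_GetDescriptorSetLayoutSizeEXT /
    * tu_GetDescriptorSetLayoutBindingOffsetEXT above. */
   VkDeviceSize layout_size, binding0_offset;
   vkGetDescriptorSetLayoutSizeEXT(device, set_layout, &layout_size);
   vkGetDescriptorSetLayoutBindingOffsetEXT(device, set_layout, 0, &binding0_offset);
   (void)layout_size; /* would normally be used to suballocate the buffer */

   /* Write one UBO descriptor straight into the mapped descriptor buffer;
    * on turnip this goes through tu_GetDescriptorEXT ->
    * write_ubo_descriptor_addr(). */
   VkDescriptorAddressInfoEXT ubo_info = {
      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_ADDRESS_INFO_EXT,
      .address = ubo_addr,
      .range = ubo_range,
   };
   VkDescriptorGetInfoEXT get_info = {
      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_GET_INFO_EXT,
      .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
      .data.pUniformBuffer = &ubo_info,
   };
   vkGetDescriptorEXT(device, &get_info, db_props.uniformBufferDescriptorSize,
                      (char *)desc_buf_map + binding0_offset);

   /* Bind the descriptor buffer, then point set 0 at offset 0 inside it.
    * Offsets must respect descriptorBufferOffsetAlignment (64 bytes here);
    * the driver stores the resulting iova in set_iova[] and re-emits the
    * bindless base registers in tu6_emit_descriptor_sets(). */
   VkDescriptorBufferBindingInfoEXT binding_info = {
      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_BUFFER_BINDING_INFO_EXT,
      .address = desc_buf_addr,
      .usage = VK_BUFFER_USAGE_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT,
   };
   vkCmdBindDescriptorBuffersEXT(cmd_buf, 1, &binding_info);

   uint32_t buffer_index = 0;
   VkDeviceSize offset = 0;
   vkCmdSetDescriptorBufferOffsetsEXT(cmd_buf, VK_PIPELINE_BIND_POINT_GRAPHICS,
                                      pipeline_layout, 0, 1,
                                      &buffer_index, &offset);
}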
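
The invalidation side can be exercised with an ordinary synchronization2 barrier. A hedged sketch, assuming only a valid command buffer: VK_ACCESS_2_DESCRIPTOR_BUFFER_READ_BIT_EXT is what vk2tu_access() maps to TU_ACCESS_UCHE_READ | TU_ACCESS_BINDLESS_DESCRIPTOR_READ, which tu_flush_for_access() turns into TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE, i.e. the HLSQ_INVALIDATE_CMD write added to tu6_emit_flushes().

#include <vulkan/vulkan.h>

/* Make CPU (or earlier GPU) descriptor writes visible to the bindless
 * descriptor prefetch before subsequent draws/dispatches read them. */
static void
flush_descriptor_writes(VkCommandBuffer cmd_buf)
{
   VkMemoryBarrier2 barrier = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
      .srcStageMask = VK_PIPELINE_STAGE_2_HOST_BIT,
      .srcAccessMask = VK_ACCESS_2_HOST_WRITE_BIT,
      .dstStageMask = VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT,
      .dstAccessMask = VK_ACCESS_2_DESCRIPTOR_BUFFER_READ_BIT_EXT,
   };
   VkDependencyInfo dep = {
      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
      .memoryBarrierCount = 1,
      .pMemoryBarriers = &barrier,
   };
   vkCmdPipelineBarrier2(cmd_buf, &dep);
}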
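
Finally, a hypothetical sketch of the embedded-immutable-samplers path: a layout created with the embedded-samplers flag gets its sampler descriptors baked into the BO allocated in tu_CreateDescriptorSetLayout(), and at record time the set is bound without any descriptor buffer. The sampler handle and a pipeline layout built with this set layout as set 1 are assumed to exist; the binding uses descriptorCount 1 to match the array_size == 1 assertion in the patch.

#include <vulkan/vulkan.h>

static VkDescriptorSetLayout
create_embedded_sampler_layout(VkDevice device, VkSampler sampler)
{
   VkDescriptorSetLayoutBinding binding = {
      .binding = 0,
      .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLER,
      .descriptorCount = 1,
      .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
      .pImmutableSamplers = &sampler,
   };
   VkDescriptorSetLayoutCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
      .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT |
               VK_DESCRIPTOR_SET_LAYOUT_CREATE_EMBEDDED_IMMUTABLE_SAMPLERS_BIT_EXT,
      .bindingCount = 1,
      .pBindings = &binding,
   };
   VkDescriptorSetLayout layout;
   vkCreateDescriptorSetLayout(device, &info, NULL, &layout);
   return layout;
}

static void
bind_embedded_samplers(VkCommandBuffer cmd_buf, VkPipelineLayout pipeline_layout)
{
   /* Reaches tu_CmdBindDescriptorBufferEmbeddedSamplersEXT, which points
    * set 1's bindless base at the BO baked at layout-creation time. */
   vkCmdBindDescriptorBufferEmbeddedSamplersEXT(cmd_buf,
                                                VK_PIPELINE_BIND_POINT_GRAPHICS,
                                                pipeline_layout, 1);
}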