diff --git a/src/asahi/vulkan/hk_buffer.c b/src/asahi/vulkan/hk_buffer.c
index 1c4621def8d..c372035f7c5 100644
--- a/src/asahi/vulkan/hk_buffer.c
+++ b/src/asahi/vulkan/hk_buffer.c
@@ -77,6 +77,23 @@ hk_get_bda_replay_addr(const VkBufferCreateInfo *pCreateInfo)
    return addr;
 }
 
+VkResult
+hk_bind_scratch(struct hk_device *dev, struct agx_va *va, uint64_t offset_B,
+                size_t size_B)
+{
+   VkResult result = VK_SUCCESS;
+
+   for (size_t i = 0; i < size_B; i += AIL_PAGESIZE) {
+      result = dev->dev.ops.bo_bind(&dev->dev, dev->sparse.write,
+                                    va->addr + offset_B + i, AIL_PAGESIZE, 0,
+                                    ASAHI_BIND_READ | ASAHI_BIND_WRITE, false);
+      if (result != VK_SUCCESS)
+         return result;
+   }
+
+   return result;
+}
+
 VKAPI_ATTR VkResult VKAPI_CALL
 hk_CreateBuffer(VkDevice device, const VkBufferCreateInfo *pCreateInfo,
                 const VkAllocationCallbacks *pAllocator, VkBuffer *pBuffer)
diff --git a/src/asahi/vulkan/hk_buffer.h b/src/asahi/vulkan/hk_buffer.h
index 4aef3baf043..e6dce3c6c4e 100644
--- a/src/asahi/vulkan/hk_buffer.h
+++ b/src/asahi/vulkan/hk_buffer.h
@@ -47,3 +47,6 @@ hk_buffer_addr_range(const struct hk_buffer *buffer, uint64_t offset,
       .range = vk_buffer_range(&buffer->vk, offset, range),
    };
 }
+
+VkResult hk_bind_scratch(struct hk_device *dev, struct agx_va *va,
+                         uint64_t offset_B, size_t size_B);
diff --git a/src/asahi/vulkan/hk_device.c b/src/asahi/vulkan/hk_device.c
index 3f99f766629..bae1f8f19e2 100644
--- a/src/asahi/vulkan/hk_device.c
+++ b/src/asahi/vulkan/hk_device.c
@@ -25,6 +25,7 @@
 #include "util/simple_mtx.h"
 #include "vulkan/vulkan_core.h"
 #include "vulkan/wsi/wsi_common.h"
+#include "layout.h"
 #include "vk_cmd_enqueue_entrypoints.h"
 #include "vk_common_entrypoints.h"
 #include "vk_debug_utils.h"
@@ -57,7 +58,10 @@ hk_upload_rodata(struct hk_device *dev)
    dev->rodata.bo =
       agx_bo_create(&dev->dev, AGX_SAMPLER_LENGTH, 0, 0, "Read only data");
 
-   if (!dev->rodata.bo)
+   dev->sparse.write =
+      agx_bo_create(&dev->dev, AIL_PAGESIZE, 0, 0, "Sparse write page");
+
+   if (!dev->rodata.bo || !dev->sparse.write)
       return VK_ERROR_OUT_OF_HOST_MEMORY;
 
    uint8_t *map = agx_bo_map(dev->rodata.bo);
@@ -481,6 +485,7 @@ fail_queue:
    hk_queue_finish(dev, &dev->queue);
 fail_rodata:
    agx_bo_unreference(&dev->dev, dev->rodata.bo);
+   agx_bo_unreference(&dev->dev, dev->sparse.write);
 fail_bg_eot:
    agx_bg_eot_cleanup(&dev->bg_eot);
 fail_internal_shaders_2:
@@ -533,6 +538,7 @@ hk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
    hk_descriptor_table_finish(dev, &dev->images);
    hk_descriptor_table_finish(dev, &dev->occlusion_queries);
    agx_bo_unreference(&dev->dev, dev->rodata.bo);
+   agx_bo_unreference(&dev->dev, dev->sparse.write);
    agx_bo_unreference(&dev->dev, dev->heap);
    agx_bg_eot_cleanup(&dev->bg_eot);
    agx_close_device(&dev->dev);
diff --git a/src/asahi/vulkan/hk_device.h b/src/asahi/vulkan/hk_device.h
index 80f10f817cc..6e980ade4da 100644
--- a/src/asahi/vulkan/hk_device.h
+++ b/src/asahi/vulkan/hk_device.h
@@ -88,6 +88,14 @@ struct hk_device {
       uint64_t geometry_state;
    } rodata;
 
+   /* Pages for backing sparse resources */
+   struct {
+      /* Undefined content, should not be read (except for atomics, where the
+       * result is already undefined).
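+       *
+       * hk_bind_scratch aliases every page of an unbound sparse range to
+       * this single page, so stray writes are discarded without faulting.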
+       */
+      struct agx_bo *write;
+   } sparse;
+
    struct hk_internal_shaders prolog_epilog;
    struct hk_internal_shaders kernels;
    struct hk_api_shader *write_shader;
diff --git a/src/asahi/vulkan/hk_image.c b/src/asahi/vulkan/hk_image.c
index 4079f4ec5a9..601b329736d 100644
--- a/src/asahi/vulkan/hk_image.c
+++ b/src/asahi/vulkan/hk_image.c
@@ -14,6 +14,8 @@
 #include "util/u_math.h"
 #include "vulkan/vulkan_core.h"
 
+#include "agx_bo.h"
+#include "hk_buffer.h"
 #include "hk_device.h"
 #include "hk_device_memory.h"
 #include "hk_entrypoints.h"
@@ -27,6 +29,11 @@
  */
 #define HK_PLANE_ALIGN_B 128
 
+/* However, exposing the standard sparse block sizes requires using the
+ * standard 64 KiB (65536 B) alignment.
+ */
+#define HK_SPARSE_ALIGN_B 65536
+
 static VkFormatFeatureFlags2
 hk_get_image_plane_format_features(struct hk_physical_device *pdev,
                                    VkFormat vk_format, VkImageTiling tiling)
@@ -241,6 +248,16 @@ hk_can_compress(const struct agx_device *dev, VkFormat format, unsigned plane,
    if (dev->debug & AGX_DBG_NOCOMPRESS)
      return false;
 
+   /* TODO: Handle compression with sparse. This should be doable but it's a
+    * bit subtle. Correctness first.
+    */
+   if (flags & (VK_IMAGE_CREATE_SPARSE_ALIASED_BIT |
+                VK_IMAGE_CREATE_SPARSE_BINDING_BIT |
+                VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT)) {
+      perf_debug_dev(dev, "No compression: sparse");
+      return false;
+   }
+
    /* Image compression is not (yet?) supported with host image copies,
     * although the vendor driver does support something similar if I recall.
     * Compression is not supported in hardware for storage images or mutable
@@ -404,11 +421,19 @@ hk_GetPhysicalDeviceImageFormatProperties2(
                                   VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT)))
       return VK_ERROR_FORMAT_NOT_SUPPORTED;
 
-   /* We don't yet support sparse, but it shouldn't be too hard */
-   if (pImageFormatInfo->flags & (VK_IMAGE_CREATE_SPARSE_ALIASED_BIT |
-                                  VK_IMAGE_CREATE_SPARSE_BINDING_BIT |
-                                  VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT))
+   /* Multiplane formats are not supported with sparse residency. This has no
+    * known use cases and is forbidden in other APIs.
+    *
+    * Neither is depth/stencil: this is a hardware limitation on G13. Hardware
+    * support is added with G14, but that's not implemented yet. We could
+    * emulate on G13, but it'd be fiddly. Fortunately, vkd3d-proton doesn't
+    * need sparse depth, as RADV has the same limitation!
+    */
+   if ((ycbcr_info ||
+        vk_format_is_depth_or_stencil(pImageFormatInfo->format)) &&
+       (pImageFormatInfo->flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT)) {
      return VK_ERROR_FORMAT_NOT_SUPPORTED;
+   }
 
    const uint32_t max_dim = 16384;
    VkExtent3D maxExtent;
@@ -610,18 +635,28 @@
 }
 
 static VkSparseImageFormatProperties
-hk_fill_sparse_image_fmt_props(VkImageAspectFlags aspects)
+hk_fill_sparse_image_fmt_props(enum pipe_format format, unsigned samples,
+                               VkImageAspectFlags aspects)
 {
-   /* TODO */
+   /* Apple tile sizes are exactly 16 KiB. The Vulkan standard block sizes
+    * are exactly 64 KiB. Fortunately, they correspond directly to the Apple
+    * sizes (except for MSAA 2x), just doubled in each dimension. Our sparse
+    * binding code gangs together 4 hardware tiles into an API tile. We just
+    * need to derive the correct size here.
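+    *
+    * For example, a 32-bit format has a 4 B block, so the largest hardware
+    * tile is 64x64 el (64 * 64 * 4 B = 16 KiB = one page). Doubling gives a
+    * 128x128 granularity, which is exactly the Vulkan standard 64 KiB block
+    * shape for 32-bit formats.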
+    */
+   unsigned blocksize_B = util_format_get_blocksize(format) * samples;
+   struct ail_tile ail_size = ail_get_max_tile_size(blocksize_B);
+
+   VkExtent3D granularity = {
+      ail_size.width_el * 2 * util_format_get_blockwidth(format),
+      ail_size.height_el * 2 * util_format_get_blockheight(format),
+      1,
+   };
+
    return (VkSparseImageFormatProperties){
       .aspectMask = aspects,
       .flags = VK_SPARSE_IMAGE_FORMAT_SINGLE_MIPTAIL_BIT,
-      .imageGranularity =
-         {
-            .width = 1,
-            .height = 1,
-            .depth = 1,
-         },
+      .imageGranularity = granularity,
    };
 }
 
@@ -672,7 +707,9 @@ hk_GetPhysicalDeviceSparseImageFormatProperties2(
 
      vk_outarray_append_typed(VkSparseImageFormatProperties2, &out, props) {
-        props->properties = hk_fill_sparse_image_fmt_props(aspects);
+        props->properties = hk_fill_sparse_image_fmt_props(
+           vk_format_to_pipe_format(pFormatInfo->format),
+           pFormatInfo->samples, aspects);
      }
    }
 
@@ -881,16 +918,35 @@ hk_image_plane_alloc_vma(struct hk_device *dev, struct hk_image_plane *plane,
    assert(sparse_bound || !sparse_resident);
 
    if (sparse_bound) {
-      plane->vma_size_B = plane->layout.size_B;
-#if 0
-      plane->addr = nouveau_ws_alloc_vma(dev->ws_dev, 0, plane->vma_size_B,
-                                         plane->layout.align_B,
-                                         false, sparse_resident);
-#endif
+      plane->va = agx_va_alloc(&dev->dev,
+                               align(plane->layout.size_B, HK_SPARSE_ALIGN_B),
+                               AIL_PAGESIZE, 0, 0);
+      if (!plane->va) {
+         return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
+                          "Sparse VMA allocation failed");
+      }
+
+      plane->addr = plane->va->addr;
+
+      /* Bind scratch pages to discard writes, including from lowered software
+       * texture atomics. Reads will use the hardware texture unit sparse
+       * handling to properly handle residency queries.
+       *
+       * In the future, we could optimize this out using the PBE sparse
+       * support, but that needs more reverse-engineering.
+       */
+      VkResult result =
+         hk_bind_scratch(dev, plane->va, 0, plane->layout.size_B);
+      if (result != VK_SUCCESS)
+         return result;
+   }
+
+   if (sparse_resident) {
+      plane->sparse_map =
+         agx_bo_create(&dev->dev, plane->layout.sparse_table_size_B,
+                       AIL_PAGESIZE, 0, "Sparse map");
+      if (!plane->sparse_map)
+         return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+      /* Zero-initialize the sparse map. This ensures all tiles are disabled,
+       * which provides correct behaviour for unmapped tiles.
+       */
+      memset(agx_bo_map(plane->sparse_map), 0,
+             plane->layout.sparse_table_size_B);
    }
 
    return VK_SUCCESS;
@@ -901,16 +957,11 @@ hk_image_plane_finish(struct hk_device *dev, struct hk_image_plane *plane,
                      VkImageCreateFlags create_flags,
                      const VkAllocationCallbacks *pAllocator)
 {
-   if (plane->vma_size_B) {
-#if 0
-      const bool sparse_resident =
-         create_flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT;
-
-      agx_bo_unbind_vma(dev->ws_dev, plane->addr, plane->vma_size_B);
-      nouveau_ws_free_vma(dev->ws_dev, plane->addr, plane->vma_size_B,
-                          false, sparse_resident);
-#endif
+   if (plane->va) {
+      agx_va_free(&dev->dev, plane->va, true);
    }
+
+   agx_bo_unreference(&dev->dev, plane->sparse_map);
 }
 
 static void
@@ -988,14 +1039,15 @@ hk_DestroyImage(VkDevice device, VkImage _image,
 }
 
 static void
-hk_image_plane_add_req(struct hk_image_plane *plane, uint64_t *size_B,
-                       uint32_t *align_B)
+hk_image_plane_add_req(struct hk_image_plane *plane, bool sparse,
+                       uint64_t *size_B, uint32_t *align_B)
 {
+   unsigned plane_align_B =
+      sparse ? HK_SPARSE_ALIGN_B : HK_PLANE_ALIGN_B;
    assert(util_is_power_of_two_or_zero64(*align_B));
-   assert(util_is_power_of_two_or_zero64(HK_PLANE_ALIGN_B));
+   assert(util_is_power_of_two_or_zero64(plane_align_B));
 
-   *align_B = MAX2(*align_B, HK_PLANE_ALIGN_B);
-   *size_B = align64(*size_B, HK_PLANE_ALIGN_B);
+   *align_B = MAX2(*align_B, plane_align_B);
+   *size_B = align64(*size_B, plane_align_B);
 
    *size_B += plane->layout.size_B;
 }
@@ -1006,17 +1058,26 @@ hk_get_image_memory_requirements(struct hk_device *dev, struct hk_image *image,
 {
    struct hk_physical_device *pdev = hk_device_physical(dev);
    uint32_t memory_types = (1 << pdev->mem_type_count) - 1;
-
-   // TODO hope for the best?
+   bool sparse =
+      image->vk.create_flags & (VK_IMAGE_CREATE_SPARSE_BINDING_BIT |
+                                VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT);
 
    uint64_t size_B = 0;
    uint32_t align_B = 0;
    if (image->disjoint) {
       uint8_t plane = hk_image_aspects_to_plane(image, aspects);
-      hk_image_plane_add_req(&image->planes[plane], &size_B, &align_B);
+      hk_image_plane_add_req(&image->planes[plane], sparse, &size_B, &align_B);
    } else {
      for (unsigned plane = 0; plane < image->plane_count; plane++)
-        hk_image_plane_add_req(&image->planes[plane], &size_B, &align_B);
+        hk_image_plane_add_req(&image->planes[plane], sparse, &size_B,
+                               &align_B);
+   }
+
+   /* For sparse binding, we need to pad to the standard alignment so that
+    * binding memory cannot clobber adjacent resources.
+    */
+   if (sparse) {
+      size_B = align64(size_B, align_B);
    }
 
    pMemoryRequirements->memoryRequirements.memoryTypeBits = memory_types;
@@ -1079,17 +1140,38 @@ hk_fill_sparse_image_memory_reqs(const struct ail_layout *layout,
                                  VkImageAspectFlags aspects)
 {
    VkSparseImageFormatProperties sparse_format_props =
-      hk_fill_sparse_image_fmt_props(aspects);
+      hk_fill_sparse_image_fmt_props(layout->format, layout->sample_count_sa,
+                                     aspects);
 
-   // assert(layout->mip_tail_first_lod <= layout->num_levels);
+   unsigned tail_level = layout->mip_tail_first_lod;
+   assert(tail_level <= layout->levels);
 
    VkSparseImageMemoryRequirements sparse_memory_reqs = {
      .formatProperties = sparse_format_props,
-     .imageMipTailFirstLod = 0, // layout->mip_tail_first_lod,
+     .imageMipTailFirstLod = tail_level,
      .imageMipTailStride = 0,
    };
 
-   sparse_memory_reqs.imageMipTailSize = layout->size_B;
-   sparse_memory_reqs.imageMipTailOffset = 0;
+   /* imageMipTailSize must be aligned to the sparse block size (64 KiB).
+    * This requires us to manage the miptail manually, because 16 KiB is the
+    * actual hardware alignment here, so we need to give the illusion of
+    * extra padding. Annoying!
+    */
+   if (tail_level == 0) {
+      sparse_memory_reqs.imageMipTailSize =
+         align(layout->size_B, HK_SPARSE_ALIGN_B);
+
+      sparse_memory_reqs.imageMipTailOffset = 0;
+   } else if (tail_level < layout->levels) {
+      sparse_memory_reqs.imageMipTailSize =
+         align(layout->mip_tail_stride * layout->depth_px, HK_SPARSE_ALIGN_B);
+
+      /* TODO: sparse metadata */
+      sparse_memory_reqs.imageMipTailOffset = HK_MIP_TAIL_START_OFFSET;
+   } else {
+      sparse_memory_reqs.imageMipTailSize = 0;
+      sparse_memory_reqs.imageMipTailOffset = HK_MIP_TAIL_START_OFFSET;
+   }
+
    return sparse_memory_reqs;
 }
 
@@ -1176,8 +1258,10 @@ hk_get_image_subresource_layout(UNUSED struct hk_device *dev,
    uint64_t offset_B = 0;
    if (!image->disjoint) {
      uint32_t align_B = 0;
+     /* TODO: sparse?
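+      * (A disjoint sparse image would presumably need HK_SPARSE_ALIGN_B for
+      * its plane offsets, matching hk_get_image_memory_requirements.)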
+      */
      for (unsigned plane = 0; plane < p; plane++)
-        hk_image_plane_add_req(&image->planes[plane], &offset_B, &align_B);
+        hk_image_plane_add_req(&image->planes[plane], false, &offset_B,
+                               &align_B);
    }
    offset_B +=
       ail_get_layer_level_B(&plane->layout, isr->arrayLayer, isr->mipLevel);
@@ -1245,12 +1329,12 @@ hk_image_plane_bind(struct hk_device *dev, struct hk_image_plane *plane,
 {
    *offset_B = align64(*offset_B, HK_PLANE_ALIGN_B);
 
-   if (plane->vma_size_B) {
+   if (plane->va) {
 #if 0
      agx_bo_bind_vma(dev->ws_dev, mem->bo, plane->addr,
-                     plane->vma_size_B,
+                     plane->va,
                      *offset_B, plane->nil.pte_kind);
 #endif
diff --git a/src/asahi/vulkan/hk_image.h b/src/asahi/vulkan/hk_image.h
index 480b68e6b59..b8ac4d1434f 100644
--- a/src/asahi/vulkan/hk_image.h
+++ b/src/asahi/vulkan/hk_image.h
@@ -52,13 +52,16 @@ hk_get_image_format_features(struct hk_physical_device *pdevice,
 struct hk_image_plane {
    struct ail_layout layout;
    uint64_t addr;
-
-   /** Size of the reserved VMA range for sparse images, zero otherwise. */
-   uint64_t vma_size_B;
+   struct agx_va *va;
 
    /* For host image copy */
    void *map;
    uint32_t rem;
+
+   /* If the image has sparse residency, its residency is tracked in this
+    * secondary page table. Otherwise, this map is NULL.
+    */
+   struct agx_bo *sparse_map;
 };
 
 struct hk_image {
diff --git a/src/asahi/vulkan/hk_image_view.c b/src/asahi/vulkan/hk_image_view.c
index 5fe68cad490..bc026d2093e 100644
--- a/src/asahi/vulkan/hk_image_view.c
+++ b/src/asahi/vulkan/hk_image_view.c
@@ -198,7 +198,8 @@ pack_texture(struct hk_image_view *view, unsigned view_plane,
 {
    struct hk_image *image = container_of(view->vk.image, struct hk_image, vk);
    const uint8_t image_plane = view->planes[view_plane].image_plane;
-   struct ail_layout *layout = &image->planes[image_plane].layout;
+   struct hk_image_plane *plane = &image->planes[image_plane];
+   struct ail_layout *layout = &plane->layout;
    uint64_t base_addr = hk_image_base_address(image, image_plane);
 
    bool cubes_to_2d = usage != HK_DESC_USAGE_SAMPLED;
@@ -282,6 +283,42 @@ pack_texture(struct hk_image_view *view, unsigned view_plane,
         cfg.last_level = level + view->vk.level_count - 1;
      }
 
+     /* To implement sparse resident textures, the hardware texture
+      * descriptor can instead point to a secondary page table controlled in
+      * userspace. This allows remapping pages and, crucially, making
+      * unmapped pages read zero and report non-resident to shader residency
+      * queries. When we have a sparse map, we need to point to it here.
+      *
+      * However, there's a wrinkle: when handling uncompressed views of
+      * compressed images in the above code, we need to offset the image
+      * address to point to the specific mip level rather than use the
+      * hardware "first level" field. This ensures the layouts are consistent
+      * despite us munging the image dimensions. In that case, we need to
+      * also offset the sparse page table accordingly. Of course, the sparse
+      * page table is in terms of pages, so this trick only works when the
+      * mip level is page-aligned.
+      *
+      * However, if the mip level is NOT page-aligned, it is in the mip tail
+      * by definition. As the mip tail is always resident, there is no need
+      * for a sparse page table. So either:
+      *
+      * 1. We are in the mip tail and don't need a sparse map, or
+      * 2. We are not, but the level is page-aligned in the sparse map.
+      *
+      * Either way, we're okay.
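+      *
+      * Concretely: when the code above rebases the descriptor address by
+      * level_offsets_B[level], we advance the sparse map below by
+      * ail_bytes_to_pages(level_offsets_B[level]) elements, keeping both
+      * views of the layout in lockstep.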
+ */ + if (plane->sparse_map && level < layout->mip_tail_first_lod) { + unsigned page = 0; + if (denom.x > 1) { + page = ail_bytes_to_pages(layout->level_offsets_B[level]); + } + + cfg.mode = AGX_IMAGE_MODE_SPARSE; + cfg.address = plane->sparse_map->va->addr + + ail_page_to_sparse_index_el(layout, layer, page) * + AIL_SPARSE_ELSIZE_B; + } + cfg.srgb = (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB); cfg.unk_mipmapped = layout->levels > 1; cfg.srgb_2_channel = cfg.srgb && util_format_colormask(desc) == 0x3; diff --git a/src/asahi/vulkan/hk_nir_lower_descriptors.c b/src/asahi/vulkan/hk_nir_lower_descriptors.c index dd11b01ed4c..4f336e30ba7 100644 --- a/src/asahi/vulkan/hk_nir_lower_descriptors.c +++ b/src/asahi/vulkan/hk_nir_lower_descriptors.c @@ -337,6 +337,7 @@ lower_image_intrin(nir_builder *b, nir_intrinsic_instr *intr, /* Reads and queries use the texture descriptor; writes and atomics PBE. */ unsigned offs; if (intr->intrinsic != nir_intrinsic_image_deref_load && + intr->intrinsic != nir_intrinsic_image_deref_sparse_load && intr->intrinsic != nir_intrinsic_image_deref_size && intr->intrinsic != nir_intrinsic_image_deref_samples) { diff --git a/src/asahi/vulkan/hk_physical_device.c b/src/asahi/vulkan/hk_physical_device.c index 9a8a6d96558..bc6a4a7a105 100644 --- a/src/asahi/vulkan/hk_physical_device.c +++ b/src/asahi/vulkan/hk_physical_device.c @@ -212,6 +212,7 @@ hk_get_device_extensions(const struct hk_instance *instance, static void hk_get_device_features( + const struct agx_device *dev, const struct vk_device_extension_table *supported_extensions, struct vk_features *features) { @@ -260,15 +261,28 @@ hk_get_device_features( .shaderFloat64 = false, .shaderInt64 = true, .shaderInt16 = true, - .shaderResourceResidency = false, + .shaderResourceResidency = true, .shaderResourceMinLod = true, - .sparseBinding = false, + .sparseBinding = true, + + /* We probably could advertise multisampled sparse but we don't have a use + * case yet and it isn't trivial. + */ .sparseResidency2Samples = false, .sparseResidency4Samples = false, .sparseResidency8Samples = false, - .sparseResidencyAliased = false, - .sparseResidencyBuffer = false, - .sparseResidencyImage2D = false, + .sparseResidencyAliased = true, + .sparseResidencyImage2D = true, + + /* We depend on soft fault to implement sparse residency on buffers with + * the appropriate semantics. Lifting this requirement would be possible + * but challenging, given the requirements imposed by + * sparseResidencyNonResidentStrict. + */ + .sparseResidencyBuffer = + (dev->params.feat_compat & DRM_ASAHI_FEAT_SOFT_FAULTS), + + /* This needs investigation. */ .sparseResidencyImage3D = false, .variableMultisampleRate = false, .inheritedQueries = true, @@ -736,10 +750,18 @@ hk_get_device_properties(const struct agx_device *dev, .nonCoherentAtomSize = 64, /* Vulkan 1.0 sparse properties */ - .sparseResidencyNonResidentStrict = false, + .sparseResidencyNonResidentStrict = true, .sparseResidencyAlignedMipSize = false, - .sparseResidencyStandard2DBlockShape = false, + .sparseResidencyStandard2DBlockShape = true, + + /* We can implement the standard block size for MSAA 4x but maybe not MSAA + * 2x? + */ .sparseResidencyStandard2DMultisampleBlockShape = false, + + /* As far as I can tell, there is no way to implement this on G13. This + * is a shame because D3D12 requires it for FL12.2. 
+      */
      .sparseResidencyStandard3DBlockShape = false,
 
      /* Vulkan 1.1 properties */
@@ -1166,7 +1188,8 @@ hk_create_drm_physical_device(struct vk_instance *_instance,
    hk_get_device_extensions(instance, &supported_extensions);
 
    struct vk_features supported_features;
-   hk_get_device_features(&supported_extensions, &supported_features);
+   hk_get_device_features(&pdev->dev, &supported_extensions,
+                          &supported_features);
 
    struct vk_properties properties;
    hk_get_device_properties(&pdev->dev, instance, &properties);
@@ -1216,10 +1239,9 @@ hk_create_drm_physical_device(struct vk_instance *_instance,
    assert(pdev->mem_heap_count <= ARRAY_SIZE(pdev->mem_heaps));
    assert(pdev->mem_type_count <= ARRAY_SIZE(pdev->mem_types));
 
-   /* TODO: VK_QUEUE_SPARSE_BINDING_BIT*/
    pdev->queue_families[pdev->queue_family_count++] = (struct hk_queue_family){
-      .queue_flags =
-         VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT,
+      .queue_flags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT |
+                     VK_QUEUE_TRANSFER_BIT | VK_QUEUE_SPARSE_BINDING_BIT,
      .queue_count = 1,
    };
diff --git a/src/asahi/vulkan/hk_queue.c b/src/asahi/vulkan/hk_queue.c
index 5cefc6c4204..24ee22ed7a6 100644
--- a/src/asahi/vulkan/hk_queue.c
+++ b/src/asahi/vulkan/hk_queue.c
@@ -5,9 +5,15 @@
 * Copyright 2024 Valve Corporation
 * Copyright 2024 Alyssa Rosenzweig
 * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc.
+ * Copyright © 2016 Red Hat.
+ * Copyright © 2016 Bas Nieuwenhuizen
+ *
+ * based in part on anv driver which is:
+ * Copyright © 2015 Intel Corporation
+ *
 * SPDX-License-Identifier: MIT
 */
 #include "hk_queue.h"
+#include "hk_buffer.h"
 
 #include "agx_bg_eot.h"
 #include "agx_bo.h"
@@ -16,13 +22,17 @@
 #include "decode.h"
 #include "hk_cmd_buffer.h"
 #include "hk_device.h"
+#include "hk_image.h"
 #include "hk_physical_device.h"
 
 #include <xf86drm.h>
 #include "asahi/lib/unstable_asahi_drm.h"
 #include "util/list.h"
+#include "util/macros.h"
 #include "vulkan/vulkan_core.h"
 
+#include "hk_private.h"
+#include "layout.h"
 #include "vk_drm_syncobj.h"
 #include "vk_sync.h"
@@ -426,10 +436,328 @@ queue_submit_looped(struct hk_device *dev, struct drm_asahi_submit *submit)
    return VK_SUCCESS;
 }
 
+struct hk_bind_builder {
+   /* Initialized */
+   struct hk_device *dev;
+   struct vk_object_base *obj_base;
+   struct agx_va *va;
+   struct hk_image *image;
+
+   /* State */
+   struct hk_device_memory *mem;
+   VkDeviceSize resourceOffset;
+   VkDeviceSize size;
+   VkDeviceSize memoryOffset;
+   VkResult result;
+};
+
+static inline struct hk_bind_builder
+hk_bind_builder(struct hk_device *dev, struct vk_object_base *obj_base,
+                struct agx_va *va, struct hk_image *image)
+{
+   return (struct hk_bind_builder){
+      .dev = dev,
+      .obj_base = obj_base,
+      .va = va,
+      .image = image,
+   };
+}
+
+static VkResult
+hk_flush_bind(struct hk_bind_builder *b)
+{
+   if (b->result != VK_SUCCESS || b->size == 0) {
+      return b->result;
+   }
+
+   uint64_t va_addr = b->va->addr + b->resourceOffset;
+
+   /* If we have an image with sparse residency, we have a userspace-managed
+    * sparse page table map, which we need to keep in sync with the real
+    * kernel-managed page table. This ensures textures get strict residency
+    * semantics, using the hardware sparse support.
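+    *
+    * For example, binding a single page at byte offset R updates the
+    * SPARSE_BLOCK element for layer R / layer_stride_B at page
+    * (R % layer_stride_B) / AIL_PAGESIZE within that layer, mirroring the
+    * bo_bind below.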
+ */ + if (b->image && b->image->planes[0].sparse_map != NULL) { + assert(b->image->plane_count == 1 && "multiplane sparse not supported"); + + uint32_t *map = agx_bo_map(b->image->planes[0].sparse_map); + uint64_t size_page = ail_bytes_to_pages(b->size); + + struct ail_layout *layout = &b->image->planes[0].layout; + uint64_t layer_stride_page = ail_bytes_to_pages(layout->layer_stride_B); + + for (unsigned offs_page = 0; offs_page < size_page; offs_page++) { + /* Determine the target page to bind */ + uint64_t target_page = + ail_bytes_to_pages(b->resourceOffset) + offs_page; + + /* The page table is per-layer. Fortunately, layers are page-aligned, + * so we can divide to find the layer & the page relative to the start + * of the layer, which give us the index into the sparse map. + * + * Note that we can end up out-of-bounds since the hardware page size + * (16k) is smaller than the Vulkan standard sparse block size (65k). + * Just clamp out-of-bounds maps - there is sufficient VA space for + * them but not sufficient sparse map space for them. + */ + uint64_t z = target_page / layer_stride_page; + if (z >= layout->depth_px) + break; + + uint64_t page_in_layer = target_page % layer_stride_page; + unsigned idx = ail_page_to_sparse_index_el(layout, z, page_in_layer); + + agx_pack(map + idx, SPARSE_BLOCK, cfg) { + cfg.enabled = b->mem != NULL; + cfg.unknown = cfg.enabled; + + if (cfg.enabled) { + cfg.address = va_addr + (offs_page * AIL_PAGESIZE); + } + } + } + } + + /* When the app wants to unbind, replace the bound pages with scratch pages + * so we don't leave a gap. + */ + if (!b->mem) { + return hk_bind_scratch(b->dev, b->va, b->resourceOffset, b->size); + } else { + return b->dev->dev.ops.bo_bind(&b->dev->dev, b->mem->bo, va_addr, b->size, + b->memoryOffset, + ASAHI_BIND_READ | ASAHI_BIND_WRITE, false); + } +} + +static void +hk_add_bind(struct hk_bind_builder *b, struct hk_device_memory *mem, + VkDeviceSize resourceOffset, VkDeviceSize size, + VkDeviceSize memoryOffset) +{ + /* Discard trivial binds to simplify the below logic. 
+    */
+   if (size == 0)
+      return;
+
+   /* Try to merge with the previous bind */
+   if (b->size && b->mem == mem &&
+       resourceOffset == b->resourceOffset + b->size &&
+       (!mem || memoryOffset == b->memoryOffset + b->size)) {
+
+      b->size += size;
+      return;
+   }
+
+   /* Otherwise, flush the previous bind and replace with the new one */
+   b->result = hk_flush_bind(b);
+   b->mem = mem;
+   b->resourceOffset = resourceOffset;
+   b->size = size;
+   b->memoryOffset = memoryOffset;
+}
+
+static VkResult
+hk_sparse_buffer_bind_memory(struct hk_device *device,
+                             const VkSparseBufferMemoryBindInfo *bind)
+{
+   VK_FROM_HANDLE(hk_buffer, buffer, bind->buffer);
+
+   struct hk_bind_builder b =
+      hk_bind_builder(device, &buffer->vk.base, buffer->va, NULL);
+
+   for (uint32_t i = 0; i < bind->bindCount; ++i) {
+      struct hk_device_memory *cur_mem = NULL;
+
+      if (bind->pBinds[i].memory != VK_NULL_HANDLE)
+         cur_mem = hk_device_memory_from_handle(bind->pBinds[i].memory);
+
+      hk_add_bind(&b, cur_mem, bind->pBinds[i].resourceOffset,
+                  bind->pBinds[i].size, bind->pBinds[i].memoryOffset);
+   }
+
+   return hk_flush_bind(&b);
+}
+
+static VkResult
+hk_sparse_image_opaque_bind_memory(
+   struct hk_device *device, const VkSparseImageOpaqueMemoryBindInfo *bind)
+{
+   VK_FROM_HANDLE(hk_image, image, bind->image);
+
+   struct hk_bind_builder b =
+      hk_bind_builder(device, &image->vk.base, image->planes[0].va, image);
+
+   for (uint32_t i = 0; i < bind->bindCount; ++i) {
+      struct hk_device_memory *mem = NULL;
+      if (bind->pBinds[i].memory != VK_NULL_HANDLE)
+         mem = hk_device_memory_from_handle(bind->pBinds[i].memory);
+
+      VkDeviceSize resourceOffset = bind->pBinds[i].resourceOffset;
+
+      /* Conceptually, the miptail is a single region at the end of the image,
+       * possibly layered. However, due to alignment requirements we need to
+       * use a non-layered miptail and internally fan out to each of the
+       * layers. This is facilitated by the HK_MIP_TAIL_START_OFFSET magic
+       * offset; see the comment where that is defined for more detail.
+       */
+      if (resourceOffset >= HK_MIP_TAIL_START_OFFSET) {
+         assert(resourceOffset == HK_MIP_TAIL_START_OFFSET &&
+                "must bind whole miptail... maybe...");
+
+         const struct ail_layout *layout = &image->planes[0].layout;
+         uint64_t tail_offset_B =
+            layout->level_offsets_B[layout->mip_tail_first_lod];
+
+         for (unsigned z = 0; z < layout->depth_px; ++z) {
+            uint64_t image_offs = tail_offset_B + (z * layout->layer_stride_B);
+            uint64_t mem_offs =
+               bind->pBinds[i].memoryOffset + (z * layout->mip_tail_stride);
+
+            hk_add_bind(&b, mem, image_offs, layout->mip_tail_stride,
+                        mem_offs);
+         }
+      } else {
+         hk_add_bind(&b, mem, bind->pBinds[i].resourceOffset,
+                     bind->pBinds[i].size, bind->pBinds[i].memoryOffset);
+      }
+   }
+
+   return hk_flush_bind(&b);
+}
+
+static void
+bind_hw_tile(struct hk_bind_builder *b, struct hk_device_memory *mem,
+             struct ail_layout *layout, unsigned layer, unsigned level,
+             VkOffset3D offset, VkExtent3D extent, struct ail_tile std_size_el,
+             unsigned mem_offset, unsigned x, unsigned y, unsigned z)
+{
+   uint64_t bo_offset_B = ail_get_twiddled_block_B(
+      layout, level, offset.x + x, offset.y + y, layer + offset.z + z);
+
+   /* Consider the standard tiles in the bound memory to be in raster order,
+    * and address accordingly in standard tiles.
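+    *
+    * For example, in a bind region 4 standard tiles wide, the standard tile
+    * at (x_stl, y_stl) = (1, 2) of slice 0 lands (2 * 4 + 1) = 9 standard
+    * tiles, i.e. 9 * 64 KiB, past memoryOffset.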
+ */ + unsigned mem_x_stl = x / std_size_el.width_el; + unsigned mem_y_stl = y / std_size_el.height_el; + unsigned extent_w_stl = DIV_ROUND_UP(extent.width, std_size_el.width_el); + unsigned extent_y_stl = DIV_ROUND_UP(extent.height, std_size_el.height_el); + unsigned mem_offs_stl = (extent_y_stl * extent_w_stl * z) + + (extent_w_stl * mem_y_stl) + mem_x_stl; + + /* There are 4 hardware tiles per standard tile, so offset + * accordingly for each hardware tile. + */ + unsigned mem_offset_B = mem_offset + (mem_offs_stl * 4 * AIL_PAGESIZE); + + if (x % std_size_el.width_el) + mem_offset_B += AIL_PAGESIZE; + + if (y % std_size_el.height_el) + mem_offset_B += (2 * AIL_PAGESIZE); + + hk_add_bind(b, mem, bo_offset_B, AIL_PAGESIZE, mem_offset_B); +} + +static VkResult +hk_sparse_image_bind_memory(struct hk_device *device, + const VkSparseImageMemoryBindInfo *bind) +{ + VK_FROM_HANDLE(hk_image, image, bind->image); + struct ail_layout *layout = &image->planes[0].layout; + + struct hk_bind_builder b = + hk_bind_builder(device, &image->vk.base, image->planes[0].va, image); + + for (uint32_t i = 0; i < bind->bindCount; ++i) { + struct hk_device_memory *mem = NULL; + if (bind->pBinds[i].memory != VK_NULL_HANDLE) + mem = hk_device_memory_from_handle(bind->pBinds[i].memory); + + uint64_t mem_offset = bind->pBinds[i].memoryOffset; + const uint32_t layer = bind->pBinds[i].subresource.arrayLayer; + const uint32_t level = bind->pBinds[i].subresource.mipLevel; + + VkExtent3D bind_extent = bind->pBinds[i].extent; + bind_extent.width = DIV_ROUND_UP( + bind_extent.width, vk_format_get_blockwidth(image->vk.format)); + bind_extent.height = DIV_ROUND_UP( + bind_extent.height, vk_format_get_blockheight(image->vk.format)); + + VkOffset3D bind_offset = bind->pBinds[i].offset; + bind_offset.x /= vk_format_get_blockwidth(image->vk.format); + bind_offset.y /= vk_format_get_blockheight(image->vk.format); + + /* Hardware tiles are exactly one page (16K) */ + struct ail_tile tilesize_el = layout->tilesize_el[level]; + unsigned size_B = tilesize_el.width_el * tilesize_el.height_el * + ail_get_blocksize_B(layout); + + assert(size_B == AIL_PAGESIZE && "fundamental to AGX"); + + /* Standard tiles are exactly 4 pages (65K), consisting of a 2x2 grid of + * hardware tiles. 
+ */ + struct ail_tile std_size_el = tilesize_el; + std_size_el.width_el *= 2; + std_size_el.height_el *= 2; + + for (unsigned z = 0; z < bind_extent.depth; z += 1) { + for (unsigned y = 0; y < bind_extent.height; + y += tilesize_el.height_el) { + for (unsigned x = 0; x < bind_extent.width; + x += tilesize_el.width_el) { + bind_hw_tile(&b, mem, layout, layer, level, bind_offset, + bind_extent, std_size_el, mem_offset, x, y, z); + } + } + } + } + + return hk_flush_bind(&b); +} + +static VkResult +hk_queue_submit_bind_sparse_memory(struct hk_device *device, + struct vk_queue_submit *submission) +{ + assert(submission->command_buffer_count == 0); + + for (uint32_t i = 0; i < submission->buffer_bind_count; ++i) { + VkResult result = + hk_sparse_buffer_bind_memory(device, submission->buffer_binds + i); + if (result != VK_SUCCESS) + return result; + } + + for (uint32_t i = 0; i < submission->image_opaque_bind_count; ++i) { + VkResult result = hk_sparse_image_opaque_bind_memory( + device, submission->image_opaque_binds + i); + if (result != VK_SUCCESS) + return result; + } + + for (uint32_t i = 0; i < submission->image_bind_count; ++i) { + VkResult result = + hk_sparse_image_bind_memory(device, submission->image_binds + i); + if (result != VK_SUCCESS) + return result; + } + + return VK_SUCCESS; +} + static VkResult queue_submit(struct hk_device *dev, struct hk_queue *queue, struct vk_queue_submit *submit) { + /* TODO: Support asynchronous sparse queue? */ + if (submit->buffer_bind_count || submit->image_bind_count || + submit->image_opaque_bind_count) { + + VkResult result = hk_queue_submit_bind_sparse_memory(dev, submit); + if (result != VK_SUCCESS) + return result; + } + unsigned command_count = 0; /* Gather the number of individual commands to submit up front */